mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2025-01-31 20:51:52 +01:00
[AMDGPU] Optimized indirect multi-VGPR addressing
SelectMOVRELOffset prevents peeling of a constant from an index if the final base could be negative. isBaseWithConstantOffset() succeeds if a value is an "add" or an "or" operator. In the case of "or", it must be an add-like "or", which never changes the sign of the sum given a non-negative offset. I.e., we can safely allow peeling if the operator is an "or". Differential Revision: https://reviews.llvm.org/D79898
This commit is contained in:
parent
2e59c57eca
commit
475fa9072b
@ -1902,7 +1902,9 @@ bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index,
|
||||
// (add n0, c0)
|
||||
// Don't peel off the offset (c0) if doing so could possibly lead
|
||||
// the base (n0) to be negative.
|
||||
if (C1->getSExtValue() <= 0 || CurDAG->SignBitIsZero(N0)) {
|
||||
// (or n0, |c0|) can never change a sign given isBaseWithConstantOffset.
|
||||
if (C1->getSExtValue() <= 0 || CurDAG->SignBitIsZero(N0) ||
|
||||
(Index->getOpcode() == ISD::OR && C1->getSExtValue() >= 0)) {
|
||||
Base = N0;
|
||||
Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
|
||||
return true;
|
||||
|
@ -160,16 +160,14 @@ entry:
|
||||
ret void
|
||||
}
|
||||
|
||||
; TODO: Should be able to copy to m0 only once and increment base instead.
|
||||
|
||||
; GCN-LABEL: {{^}}double8_extelt:
|
||||
; GCN-NOT: buffer_
|
||||
; GCN-NOT: s_or_b32
|
||||
; GCN-DAG: s_mov_b32 [[ZERO:s[0-9]+]], 0
|
||||
; GCN-DAG: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ZERO]]
|
||||
; GCN-DAG: s_mov_b32 m0, [[IND0:s[0-9]+]]
|
||||
; GCN-DAG: s_or_b32 [[IND1:s[0-9]+]], [[IND0]], 1
|
||||
; GCN-DAG: v_movrels_b32_e32 v[[RES_LO:[0-9]+]], [[BASE]]
|
||||
; GCN: s_mov_b32 m0, [[IND1:s[0-9]+]]
|
||||
; GCN: v_movrels_b32_e32 v[[RES_HI:[0-9]+]], [[BASE]]
|
||||
; GCN-DAG: v_mov_b32_e32 v[[#BASE:]], [[ZERO]]
|
||||
; GCN-DAG: s_mov_b32 m0, [[IND:s[0-9]+]]
|
||||
; GCN-DAG: v_movrels_b32_e32 v[[RES_LO:[0-9]+]], v[[#BASE]]
|
||||
; GCN-DAG: v_movrels_b32_e32 v[[RES_HI:[0-9]+]], v[[#BASE+1]]
|
||||
; GCN: store_dwordx2 v[{{[0-9:]+}}], v{{\[}}[[RES_LO]]:[[RES_HI]]]
|
||||
define amdgpu_kernel void @double8_extelt(double addrspace(1)* %out, i32 %sel) {
|
||||
entry:
|
||||
@ -179,13 +177,13 @@ entry:
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}double7_extelt:
|
||||
; GCN-NOT: buffer_
|
||||
; GCN-NOT: s_or_b32
|
||||
; GCN-DAG: s_mov_b32 [[ZERO:s[0-9]+]], 0
|
||||
; GCN-DAG: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ZERO]]
|
||||
; GCN-DAG: s_mov_b32 m0, [[IND0:s[0-9]+]]
|
||||
; GCN-DAG: s_or_b32 [[IND1:s[0-9]+]], [[IND0]], 1
|
||||
; GCN-DAG: v_movrels_b32_e32 v[[RES_LO:[0-9]+]], [[BASE]]
|
||||
; GCN: s_mov_b32 m0, [[IND1:s[0-9]+]]
|
||||
; GCN: v_movrels_b32_e32 v[[RES_HI:[0-9]+]], [[BASE]]
|
||||
; GCN-DAG: v_mov_b32_e32 v[[#BASE:]], [[ZERO]]
|
||||
; GCN-DAG: s_mov_b32 m0, [[IND:s[0-9]+]]
|
||||
; GCN-DAG: v_movrels_b32_e32 v[[RES_LO:[0-9]+]], v[[#BASE]]
|
||||
; GCN-DAG: v_movrels_b32_e32 v[[RES_HI:[0-9]+]], v[[#BASE+1]]
|
||||
; GCN: store_dwordx2 v[{{[0-9:]+}}], v{{\[}}[[RES_LO]]:[[RES_HI]]]
|
||||
define amdgpu_kernel void @double7_extelt(double addrspace(1)* %out, i32 %sel) {
|
||||
entry:
|
||||
|
@ -590,12 +590,11 @@ entry:
|
||||
; GCN-LABEL: {{^}}extractelement_v16i32_or_index:
|
||||
; GCN: s_load_dword [[IDX_IN:s[0-9]+]]
|
||||
; GCN: s_lshl_b32 [[IDX_SHL:s[0-9]+]], [[IDX_IN]]
|
||||
; GCN: s_or_b32 [[IDX_FIN:s[0-9]+]], [[IDX_SHL]], 1
|
||||
|
||||
; MOVREL: s_mov_b32 m0, [[IDX_FIN]]
|
||||
; MOVREL: s_mov_b32 m0, [[IDX_SHL]]
|
||||
; MOVREL: v_movrels_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
|
||||
|
||||
; IDXMODE: s_set_gpr_idx_on [[IDX_FIN]], gpr_idx(SRC0)
|
||||
; IDXMODE: s_set_gpr_idx_on [[IDX_SHL]], gpr_idx(SRC0)
|
||||
; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; IDXMODE: s_set_gpr_idx_off
|
||||
define amdgpu_kernel void @extractelement_v16i32_or_index(i32 addrspace(1)* %out, <16 x i32> addrspace(1)* %in, i32 %idx.in) {
|
||||
@ -611,12 +610,11 @@ entry:
|
||||
; GCN-LABEL: {{^}}insertelement_v16f32_or_index:
|
||||
; GCN: s_load_dword [[IDX_IN:s[0-9]+]]
|
||||
; GCN: s_lshl_b32 [[IDX_SHL:s[0-9]+]], [[IDX_IN]]
|
||||
; GCN: s_or_b32 [[IDX_FIN:s[0-9]+]], [[IDX_SHL]], 1
|
||||
|
||||
; MOVREL: s_mov_b32 m0, [[IDX_FIN]]
|
||||
; MOVREL: s_mov_b32 m0, [[IDX_SHL]]
|
||||
; MOVREL: v_movreld_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
|
||||
|
||||
; IDXMODE: s_set_gpr_idx_on [[IDX_FIN]], gpr_idx(DST)
|
||||
; IDXMODE: s_set_gpr_idx_on [[IDX_SHL]], gpr_idx(DST)
|
||||
; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; IDXMODE: s_set_gpr_idx_off
|
||||
define amdgpu_kernel void @insertelement_v16f32_or_index(<16 x float> addrspace(1)* %out, <16 x float> %a, i32 %idx.in) nounwind {
|
||||
|
@ -268,16 +268,14 @@ entry:
|
||||
ret void
|
||||
}
|
||||
|
||||
; TODO: We should be able not to write to m0 twice and just increment base.
|
||||
|
||||
; GCN-LABEL: {{^}}double8_inselt:
|
||||
; GCN-NOT: v_cndmask
|
||||
; GCN-NOT: buffer_
|
||||
; GCN-DAG: s_or_b32 [[IND1:s[0-9]+]], [[IND0:s[0-9]+]], 1
|
||||
; GCN-DAG: s_mov_b32 m0, [[IND0]]
|
||||
; GCN-DAG: v_movreld_b32_e32 [[BASE:v[0-9]+]],
|
||||
; GCN: s_mov_b32 m0, [[IND1]]
|
||||
; GCN: v_movreld_b32_e32 [[BASE]]
|
||||
; GCN-NOT: s_or_b32
|
||||
; GCN-DAG: s_mov_b32 m0, [[IND:s[0-9]+]]
|
||||
; GCN-DAG: v_movreld_b32_e32 v[[#BASE:]], 0
|
||||
; GCN-NOT: s_mov_b32 m0
|
||||
; GCN: v_movreld_b32_e32 v[[#BASE+1]],
|
||||
define amdgpu_kernel void @double8_inselt(<8 x double> addrspace(1)* %out, <8 x double> %vec, i32 %sel) {
|
||||
entry:
|
||||
%v = insertelement <8 x double> %vec, double 1.000000e+00, i32 %sel
|
||||
@ -288,11 +286,11 @@ entry:
|
||||
; GCN-LABEL: {{^}}double7_inselt:
|
||||
; GCN-NOT: v_cndmask
|
||||
; GCN-NOT: buffer_
|
||||
; GCN-DAG: s_or_b32 [[IND1:s[0-9]+]], [[IND0:s[0-9]+]], 1
|
||||
; GCN-DAG: s_mov_b32 m0, [[IND0]]
|
||||
; GCN-DAG: v_movreld_b32_e32 [[BASE:v[0-9]+]],
|
||||
; GCN: s_mov_b32 m0, [[IND1]]
|
||||
; GCN: v_movreld_b32_e32 [[BASE]]
|
||||
; GCN-NOT: s_or_b32
|
||||
; GCN-DAG: s_mov_b32 m0, [[IND:s[0-9]+]]
|
||||
; GCN-DAG: v_movreld_b32_e32 v[[#BASE]], 0
|
||||
; GCN-NOT: s_mov_b32 m0
|
||||
; GCN: v_movreld_b32_e32 v[[#BASE+1]],
|
||||
define amdgpu_kernel void @double7_inselt(<7 x double> addrspace(1)* %out, <7 x double> %vec, i32 %sel) {
|
||||
entry:
|
||||
%v = insertelement <7 x double> %vec, double 1.000000e+00, i32 %sel
|
||||
|
@ -1643,7 +1643,6 @@ define amdgpu_kernel void @dynamic_insertelement_v8f64(<8 x double> addrspace(1)
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s8
|
||||
; SI-NEXT: s_lshl_b32 s4, s4, 1
|
||||
; SI-NEXT: s_mov_b32 m0, s4
|
||||
; SI-NEXT: v_mov_b32_e32 v1, s9
|
||||
; SI-NEXT: v_mov_b32_e32 v2, s10
|
||||
; SI-NEXT: v_mov_b32_e32 v3, s11
|
||||
@ -1659,10 +1658,9 @@ define amdgpu_kernel void @dynamic_insertelement_v8f64(<8 x double> addrspace(1)
|
||||
; SI-NEXT: v_mov_b32_e32 v13, s21
|
||||
; SI-NEXT: v_mov_b32_e32 v14, s22
|
||||
; SI-NEXT: v_mov_b32_e32 v15, s23
|
||||
; SI-NEXT: s_or_b32 s4, s4, 1
|
||||
; SI-NEXT: v_movreld_b32_e32 v0, 0
|
||||
; SI-NEXT: s_mov_b32 m0, s4
|
||||
; SI-NEXT: v_movreld_b32_e32 v0, v16
|
||||
; SI-NEXT: v_movreld_b32_e32 v0, 0
|
||||
; SI-NEXT: v_movreld_b32_e32 v1, v16
|
||||
; SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
|
||||
; SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
|
||||
; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
|
||||
@ -1680,7 +1678,6 @@ define amdgpu_kernel void @dynamic_insertelement_v8f64(<8 x double> addrspace(1)
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s8
|
||||
; VI-NEXT: s_lshl_b32 s4, s4, 1
|
||||
; VI-NEXT: s_mov_b32 m0, s4
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s9
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s10
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s11
|
||||
@ -1696,10 +1693,9 @@ define amdgpu_kernel void @dynamic_insertelement_v8f64(<8 x double> addrspace(1)
|
||||
; VI-NEXT: v_mov_b32_e32 v13, s21
|
||||
; VI-NEXT: v_mov_b32_e32 v14, s22
|
||||
; VI-NEXT: v_mov_b32_e32 v15, s23
|
||||
; VI-NEXT: s_or_b32 s4, s4, 1
|
||||
; VI-NEXT: v_movreld_b32_e32 v0, 0
|
||||
; VI-NEXT: s_mov_b32 m0, s4
|
||||
; VI-NEXT: v_movreld_b32_e32 v0, v16
|
||||
; VI-NEXT: v_movreld_b32_e32 v0, 0
|
||||
; VI-NEXT: v_movreld_b32_e32 v1, v16
|
||||
; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
|
||||
; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
|
||||
; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
|
||||
|
Loading…
x
Reference in New Issue
Block a user