1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2025-01-31 20:51:52 +01:00

[AMDGPU] Optimized indirect multi-VGPR addressing

SelectMOVRELOffset prevents peeling of a constant from an index
if the final base could be negative. isBaseWithConstantOffset() succeeds
if a value is an "add" or "or" operator. In the case of "or", it must
be an add-like "or", which never changes the sign of the sum given a
non-negative offset. I.e., we can safely allow peeling if the operator
is an "or".

Differential Revision: https://reviews.llvm.org/D79898
This commit is contained in:
Stanislav Mekhanoshin 2020-05-13 11:46:28 -07:00
parent 2e59c57eca
commit 475fa9072b
5 changed files with 33 additions and 41 deletions

View File

@ -1902,7 +1902,9 @@ bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index,
// (add n0, c0)
// Don't peel off the offset (c0) if doing so could possibly lead
// the base (n0) to be negative.
if (C1->getSExtValue() <= 0 || CurDAG->SignBitIsZero(N0)) {
// (or n0, |c0|) can never change a sign given isBaseWithConstantOffset.
if (C1->getSExtValue() <= 0 || CurDAG->SignBitIsZero(N0) ||
(Index->getOpcode() == ISD::OR && C1->getSExtValue() >= 0)) {
Base = N0;
Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
return true;

View File

@ -160,16 +160,14 @@ entry:
ret void
}
; TODO: Should be able to copy to m0 only once and increment base instead.
; GCN-LABEL: {{^}}double8_extelt:
; GCN-NOT: buffer_
; GCN-NOT: s_or_b32
; GCN-DAG: s_mov_b32 [[ZERO:s[0-9]+]], 0
; GCN-DAG: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ZERO]]
; GCN-DAG: s_mov_b32 m0, [[IND0:s[0-9]+]]
; GCN-DAG: s_or_b32 [[IND1:s[0-9]+]], [[IND0]], 1
; GCN-DAG: v_movrels_b32_e32 v[[RES_LO:[0-9]+]], [[BASE]]
; GCN: s_mov_b32 m0, [[IND1:s[0-9]+]]
; GCN: v_movrels_b32_e32 v[[RES_HI:[0-9]+]], [[BASE]]
; GCN-DAG: v_mov_b32_e32 v[[#BASE:]], [[ZERO]]
; GCN-DAG: s_mov_b32 m0, [[IND:s[0-9]+]]
; GCN-DAG: v_movrels_b32_e32 v[[RES_LO:[0-9]+]], v[[#BASE]]
; GCN-DAG: v_movrels_b32_e32 v[[RES_HI:[0-9]+]], v[[#BASE+1]]
; GCN: store_dwordx2 v[{{[0-9:]+}}], v{{\[}}[[RES_LO]]:[[RES_HI]]]
define amdgpu_kernel void @double8_extelt(double addrspace(1)* %out, i32 %sel) {
entry:
@ -179,13 +177,13 @@ entry:
}
; GCN-LABEL: {{^}}double7_extelt:
; GCN-NOT: buffer_
; GCN-NOT: s_or_b32
; GCN-DAG: s_mov_b32 [[ZERO:s[0-9]+]], 0
; GCN-DAG: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ZERO]]
; GCN-DAG: s_mov_b32 m0, [[IND0:s[0-9]+]]
; GCN-DAG: s_or_b32 [[IND1:s[0-9]+]], [[IND0]], 1
; GCN-DAG: v_movrels_b32_e32 v[[RES_LO:[0-9]+]], [[BASE]]
; GCN: s_mov_b32 m0, [[IND1:s[0-9]+]]
; GCN: v_movrels_b32_e32 v[[RES_HI:[0-9]+]], [[BASE]]
; GCN-DAG: v_mov_b32_e32 v[[#BASE:]], [[ZERO]]
; GCN-DAG: s_mov_b32 m0, [[IND:s[0-9]+]]
; GCN-DAG: v_movrels_b32_e32 v[[RES_LO:[0-9]+]], v[[#BASE]]
; GCN-DAG: v_movrels_b32_e32 v[[RES_HI:[0-9]+]], v[[#BASE+1]]
; GCN: store_dwordx2 v[{{[0-9:]+}}], v{{\[}}[[RES_LO]]:[[RES_HI]]]
define amdgpu_kernel void @double7_extelt(double addrspace(1)* %out, i32 %sel) {
entry:

View File

@ -590,12 +590,11 @@ entry:
; GCN-LABEL: {{^}}extractelement_v16i32_or_index:
; GCN: s_load_dword [[IDX_IN:s[0-9]+]]
; GCN: s_lshl_b32 [[IDX_SHL:s[0-9]+]], [[IDX_IN]]
; GCN: s_or_b32 [[IDX_FIN:s[0-9]+]], [[IDX_SHL]], 1
; MOVREL: s_mov_b32 m0, [[IDX_FIN]]
; MOVREL: s_mov_b32 m0, [[IDX_SHL]]
; MOVREL: v_movrels_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
; IDXMODE: s_set_gpr_idx_on [[IDX_FIN]], gpr_idx(SRC0)
; IDXMODE: s_set_gpr_idx_on [[IDX_SHL]], gpr_idx(SRC0)
; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
; IDXMODE: s_set_gpr_idx_off
define amdgpu_kernel void @extractelement_v16i32_or_index(i32 addrspace(1)* %out, <16 x i32> addrspace(1)* %in, i32 %idx.in) {
@ -611,12 +610,11 @@ entry:
; GCN-LABEL: {{^}}insertelement_v16f32_or_index:
; GCN: s_load_dword [[IDX_IN:s[0-9]+]]
; GCN: s_lshl_b32 [[IDX_SHL:s[0-9]+]], [[IDX_IN]]
; GCN: s_or_b32 [[IDX_FIN:s[0-9]+]], [[IDX_SHL]], 1
; MOVREL: s_mov_b32 m0, [[IDX_FIN]]
; MOVREL: s_mov_b32 m0, [[IDX_SHL]]
; MOVREL: v_movreld_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
; IDXMODE: s_set_gpr_idx_on [[IDX_FIN]], gpr_idx(DST)
; IDXMODE: s_set_gpr_idx_on [[IDX_SHL]], gpr_idx(DST)
; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
; IDXMODE: s_set_gpr_idx_off
define amdgpu_kernel void @insertelement_v16f32_or_index(<16 x float> addrspace(1)* %out, <16 x float> %a, i32 %idx.in) nounwind {

View File

@ -268,16 +268,14 @@ entry:
ret void
}
; TODO: We should be able not to write to m0 twice and just increment base.
; GCN-LABEL: {{^}}double8_inselt:
; GCN-NOT: v_cndmask
; GCN-NOT: buffer_
; GCN-DAG: s_or_b32 [[IND1:s[0-9]+]], [[IND0:s[0-9]+]], 1
; GCN-DAG: s_mov_b32 m0, [[IND0]]
; GCN-DAG: v_movreld_b32_e32 [[BASE:v[0-9]+]],
; GCN: s_mov_b32 m0, [[IND1]]
; GCN: v_movreld_b32_e32 [[BASE]]
; GCN-NOT: s_or_b32
; GCN-DAG: s_mov_b32 m0, [[IND:s[0-9]+]]
; GCN-DAG: v_movreld_b32_e32 v[[#BASE:]], 0
; GCN-NOT: s_mov_b32 m0
; GCN: v_movreld_b32_e32 v[[#BASE+1]],
define amdgpu_kernel void @double8_inselt(<8 x double> addrspace(1)* %out, <8 x double> %vec, i32 %sel) {
entry:
%v = insertelement <8 x double> %vec, double 1.000000e+00, i32 %sel
@ -288,11 +286,11 @@ entry:
; GCN-LABEL: {{^}}double7_inselt:
; GCN-NOT: v_cndmask
; GCN-NOT: buffer_
; GCN-DAG: s_or_b32 [[IND1:s[0-9]+]], [[IND0:s[0-9]+]], 1
; GCN-DAG: s_mov_b32 m0, [[IND0]]
; GCN-DAG: v_movreld_b32_e32 [[BASE:v[0-9]+]],
; GCN: s_mov_b32 m0, [[IND1]]
; GCN: v_movreld_b32_e32 [[BASE]]
; GCN-NOT: s_or_b32
; GCN-DAG: s_mov_b32 m0, [[IND:s[0-9]+]]
; GCN-DAG: v_movreld_b32_e32 v[[#BASE]], 0
; GCN-NOT: s_mov_b32 m0
; GCN: v_movreld_b32_e32 v[[#BASE+1]],
define amdgpu_kernel void @double7_inselt(<7 x double> addrspace(1)* %out, <7 x double> %vec, i32 %sel) {
entry:
%v = insertelement <7 x double> %vec, double 1.000000e+00, i32 %sel

View File

@ -1643,7 +1643,6 @@ define amdgpu_kernel void @dynamic_insertelement_v8f64(<8 x double> addrspace(1)
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s8
; SI-NEXT: s_lshl_b32 s4, s4, 1
; SI-NEXT: s_mov_b32 m0, s4
; SI-NEXT: v_mov_b32_e32 v1, s9
; SI-NEXT: v_mov_b32_e32 v2, s10
; SI-NEXT: v_mov_b32_e32 v3, s11
@ -1659,10 +1658,9 @@ define amdgpu_kernel void @dynamic_insertelement_v8f64(<8 x double> addrspace(1)
; SI-NEXT: v_mov_b32_e32 v13, s21
; SI-NEXT: v_mov_b32_e32 v14, s22
; SI-NEXT: v_mov_b32_e32 v15, s23
; SI-NEXT: s_or_b32 s4, s4, 1
; SI-NEXT: v_movreld_b32_e32 v0, 0
; SI-NEXT: s_mov_b32 m0, s4
; SI-NEXT: v_movreld_b32_e32 v0, v16
; SI-NEXT: v_movreld_b32_e32 v0, 0
; SI-NEXT: v_movreld_b32_e32 v1, v16
; SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
; SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
@ -1680,7 +1678,6 @@ define amdgpu_kernel void @dynamic_insertelement_v8f64(<8 x double> addrspace(1)
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s8
; VI-NEXT: s_lshl_b32 s4, s4, 1
; VI-NEXT: s_mov_b32 m0, s4
; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_mov_b32_e32 v2, s10
; VI-NEXT: v_mov_b32_e32 v3, s11
@ -1696,10 +1693,9 @@ define amdgpu_kernel void @dynamic_insertelement_v8f64(<8 x double> addrspace(1)
; VI-NEXT: v_mov_b32_e32 v13, s21
; VI-NEXT: v_mov_b32_e32 v14, s22
; VI-NEXT: v_mov_b32_e32 v15, s23
; VI-NEXT: s_or_b32 s4, s4, 1
; VI-NEXT: v_movreld_b32_e32 v0, 0
; VI-NEXT: s_mov_b32 m0, s4
; VI-NEXT: v_movreld_b32_e32 v0, v16
; VI-NEXT: v_movreld_b32_e32 v0, 0
; VI-NEXT: v_movreld_b32_e32 v1, v16
; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16