1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-11-22 18:54:02 +01:00

AMDGPU: Don't peel off the offset if the resulting base could possibly be negative in indirect addressing.

Summary:
  Don't peel off the offset if the resulting base could possibly be negative in indirect addressing.
This is because the M0 field is unsigned.

This patch achieves a similar goal to https://reviews.llvm.org/D55241, but keeps the optimization
if the base is known to be unsigned.

Reviewers:
  arsenm

Differential Revision:
  https://reviews.llvm.org/D55568

llvm-svn: 349951
This commit is contained in:
Changpeng Fang 2018-12-21 20:57:34 +00:00
parent bf3176552a
commit b0f778424c
4 changed files with 97 additions and 39 deletions

View File

@ -1454,9 +1454,13 @@ bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index,
ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
// (add n0, c0)
Base = N0;
Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
return true;
// Don't peel off the offset (c0) if doing so could possibly lead
// the base (n0) to be negative.
if (C1->getSExtValue() <= 0 || CurDAG->SignBitIsZero(N0)) {
Base = N0;
Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
return true;
}
}
if (isa<ConstantSDNode>(Index))

View File

@ -14,8 +14,9 @@
; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT15:[0-9]+]], s[[S_ELT15]]
; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT0:[0-9]+]], s[[S_ELT0]]
; GCN-DAG: v_add_u32_e32 [[IDX1:v[0-9]+]], 1, [[IDX0]]
; GCN: [[LOOP0:BB[0-9]+_[0-9]+]]:
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_readfirstlane_b32 [[READLANE:s[0-9]+]], [[IDX0]]
; GCN: v_cmp_eq_u32_e32 vcc, [[READLANE]], [[IDX0]]
; GCN: s_and_saveexec_b64 vcc, vcc
@ -36,8 +37,8 @@
; GCN: s_mov_b64 [[MASK]], exec
; GCN: [[LOOP1:BB[0-9]+_[0-9]+]]:
; GCN-NEXT: v_readfirstlane_b32 [[READLANE:s[0-9]+]], [[IDX0]]
; GCN: v_cmp_eq_u32_e32 vcc, [[READLANE]], [[IDX0]]
; GCN-NEXT: v_readfirstlane_b32 [[READLANE:s[0-9]+]], [[IDX1]]
; GCN: v_cmp_eq_u32_e32 vcc, [[READLANE]], [[IDX1]]
; GCN: s_and_saveexec_b64 vcc, vcc
; MOVREL: s_mov_b32 m0, [[READLANE]]

View File

@ -17,8 +17,9 @@
; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT15:[0-9]+]], s[[S_ELT15]]
; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT0:[0-9]+]], s[[S_ELT0]]
; GCN-DAG: v_add_{{i32|u32}}_e32 [[IDX1:v[0-9]+]], vcc, 1, [[IDX0]]
; GCN: [[LOOP0:BB[0-9]+_[0-9]+]]:
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_readfirstlane_b32 [[READLANE:s[0-9]+]], [[IDX0]]
; GCN: v_cmp_eq_u32_e32 vcc, [[READLANE]], [[IDX0]]
; GCN: s_and_saveexec_b64 vcc, vcc
@ -39,8 +40,8 @@
; GCN: s_mov_b64 [[MASK]], exec
; GCN: [[LOOP1:BB[0-9]+_[0-9]+]]:
; GCN-NEXT: v_readfirstlane_b32 [[READLANE:s[0-9]+]], [[IDX0]]
; GCN: v_cmp_eq_u32_e32 vcc, [[READLANE]], [[IDX0]]
; GCN-NEXT: v_readfirstlane_b32 [[READLANE:s[0-9]+]], [[IDX1]]
; GCN: v_cmp_eq_u32_e32 vcc, [[READLANE]], [[IDX1]]
; GCN: s_and_saveexec_b64 vcc, vcc
; MOVREL: s_mov_b32 m0, [[READLANE]]

View File

@ -7,11 +7,12 @@
; indexing of vectors.
; GCN-LABEL: {{^}}extract_w_offset:
; GCN-DAG: s_load_dword [[IN:s[0-9]+]]
; GCN-DAG: s_load_dword [[IN0:s[0-9]+]]
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 4.0
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x40400000
; GCN-DAG: v_mov_b32_e32 [[BASEREG:v[0-9]+]], 2.0
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 1.0
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 2.0
; GCN-DAG: v_mov_b32_e32 [[BASEREG:v[0-9]+]], 1.0
; GCN-DAG: s_add_i32 [[IN:s[0-9]+]], [[IN0]], 1
; MOVREL-DAG: s_mov_b32 m0, [[IN]]
; MOVREL: v_movrels_b32_e32 v{{[0-9]+}}, [[BASEREG]]
@ -29,16 +30,17 @@ entry:
; XXX: Could do v_or_b32 directly
; GCN-LABEL: {{^}}extract_w_offset_salu_use_vector:
; GCN-DAG: s_or_b32
; GCN-DAG: s_or_b32
; GCN-DAG: s_or_b32
; GCN-DAG: s_or_b32
; MOVREL: s_mov_b32 m0
; GCN-DAG: s_or_b32
; GCN-DAG: s_or_b32
; GCN-DAG: s_or_b32
; GCN-DAG: s_or_b32
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
; MOVREL: v_movrels_b32_e32
; IDXMODE: s_set_gpr_idx_on s{{[0-9]+}}, src0{{$}}
@ -176,7 +178,8 @@ entry:
}
; GCN-LABEL: {{^}}insert_w_offset:
; GCN-DAG: s_load_dword [[IN:s[0-9]+]]
; GCN-DAG: s_load_dword [[IN0:s[0-9]+]]
; MOVREL-DAG: s_add_i32 [[IN:s[0-9]+]], [[IN0]], 1
; MOVREL-DAG: s_mov_b32 m0, [[IN]]
; GCN-DAG: v_mov_b32_e32 v[[ELT0:[0-9]+]], 1.0
; GCN-DAG: v_mov_b32_e32 v[[ELT1:[0-9]+]], 2.0
@ -185,7 +188,7 @@ entry:
; GCN-DAG: v_mov_b32_e32 v[[ELT15:[0-9]+]], 0x41800000
; GCN-DAG: v_mov_b32_e32 v[[INS:[0-9]+]], 0x41880000
; MOVREL: v_movreld_b32_e32 v[[ELT1]], v[[INS]]
; MOVREL: v_movreld_b32_e32 v[[ELT0]], v[[INS]]
; MOVREL: buffer_store_dwordx4 v{{\[}}[[ELT0]]:[[ELT3]]{{\]}}
define amdgpu_kernel void @insert_w_offset(<16 x float> addrspace(1)* %out, i32 %in) {
entry:
@ -195,6 +198,51 @@ entry:
ret void
}
; GCN-LABEL: {{^}}insert_unsigned_base_plus_offset:
; GCN-DAG: s_load_dword [[IN:s[0-9]+]]
; GCN-DAG: v_mov_b32_e32 [[ELT0:v[0-9]+]], 1.0
; GCN-DAG: v_mov_b32_e32 [[ELT1:v[0-9]+]], 2.0
; GCN-DAG: s_and_b32 [[BASE:s[0-9]+]], [[IN]], 0xffff
; MOVREL: s_mov_b32 m0, [[BASE]]
; MOVREL: v_movreld_b32_e32 [[ELT1]], v{{[0-9]+}}
; IDXMODE: s_set_gpr_idx_on [[BASE]], dst
; IDXMODE-NEXT: v_mov_b32_e32 [[ELT1]], v{{[0-9]+}}
; IDXMODE-NEXT: s_set_gpr_idx_off
define amdgpu_kernel void @insert_unsigned_base_plus_offset(<16 x float> addrspace(1)* %out, i16 %in) {
entry:
; Positive case for peeling the constant offset: zext from i16 leaves the
; upper bits of %base zero, so the base is known to be non-negative.
%base = zext i16 %in to i32
; Dynamic insert index = base + 1. The MOVREL and IDXMODE check lines for this
; function expect the index register to hold only the base, with the +1
; folded into the destination register (the second vector element).
%add = add i32 %base, 1
%ins = insertelement <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, float 17.0, i32 %add
store <16 x float> %ins, <16 x float> addrspace(1)* %out
ret void
}
; GCN-LABEL: {{^}}insert_signed_base_plus_offset:
; GCN-DAG: s_load_dword [[IN:s[0-9]+]]
; GCN-DAG: v_mov_b32_e32 [[ELT0:v[0-9]+]], 1.0
; GCN-DAG: v_mov_b32_e32 [[ELT1:v[0-9]+]], 2.0
; GCN-DAG: s_sext_i32_i16 [[BASE:s[0-9]+]], [[IN]]
; GCN-DAG: s_add_i32 [[BASE_PLUS_OFFSET:s[0-9]+]], [[BASE]], 1
; MOVREL: s_mov_b32 m0, [[BASE_PLUS_OFFSET]]
; MOVREL: v_movreld_b32_e32 [[ELT0]], v{{[0-9]+}}
; IDXMODE: s_set_gpr_idx_on [[BASE_PLUS_OFFSET]], dst
; IDXMODE-NEXT: v_mov_b32_e32 [[ELT0]], v{{[0-9]+}}
; IDXMODE-NEXT: s_set_gpr_idx_off
define amdgpu_kernel void @insert_signed_base_plus_offset(<16 x float> addrspace(1)* %out, i16 %in) {
entry:
; Negative case for peeling the constant offset: sext from i16 can produce a
; negative %base, so the base is not known to be non-negative.
%base = sext i16 %in to i32
; Dynamic insert index = base + 1. The MOVREL and IDXMODE check lines for this
; function expect the full sum (an s_add_i32 result) in the index register and
; the first vector element as the destination, i.e. the +1 is not peeled off.
%add = add i32 %base, 1
%ins = insertelement <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, float 17.0, i32 %add
store <16 x float> %ins, <16 x float> addrspace(1)* %out
ret void
}
; GCN-LABEL: {{^}}insert_wo_offset:
; GCN: s_load_dword [[IN:s[0-9]+]]
@ -354,8 +402,12 @@ entry:
; GCN: s_mov_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], exec
; GCN: s_waitcnt vmcnt(0)
; PREGFX9: v_add_{{i32|u32}}_e32 [[IDX1:v[0-9]+]], vcc, 1, [[IDX0]]
; GFX9: v_add_{{i32|u32}}_e32 [[IDX1:v[0-9]+]], 1, [[IDX0]]
; GCN: [[LOOP0:BB[0-9]+_[0-9]+]]:
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_readfirstlane_b32 [[READLANE:s[0-9]+]], [[IDX0]]
; GCN: v_cmp_eq_u32_e32 vcc, [[READLANE]], [[IDX0]]
; GCN: s_and_saveexec_b64 vcc, vcc
@ -373,20 +425,20 @@ entry:
; FIXME: Redundant copy
; GCN: s_mov_b64 exec, [[MASK]]
; GCN: v_mov_b32_e32 [[VEC_ELT1_2:v[0-9]+]], [[S_ELT1]]
; GCN: v_mov_b32_e32 [[VEC_ELT0_2:v[0-9]+]], [[S_ELT0]]
; GCN: s_mov_b64 [[MASK2:s\[[0-9]+:[0-9]+\]]], exec
; GCN: [[LOOP1:BB[0-9]+_[0-9]+]]:
; GCN-NEXT: v_readfirstlane_b32 [[READLANE:s[0-9]+]], [[IDX0]]
; GCN: v_cmp_eq_u32_e32 vcc, [[READLANE]], [[IDX0]]
; GCN-NEXT: v_readfirstlane_b32 [[READLANE:s[0-9]+]], [[IDX1]]
; GCN: v_cmp_eq_u32_e32 vcc, [[READLANE]], [[IDX1]]
; GCN: s_and_saveexec_b64 vcc, vcc
; MOVREL: s_mov_b32 m0, [[READLANE]]
; MOVREL-NEXT: v_movrels_b32_e32 [[MOVREL1:v[0-9]+]], [[VEC_ELT1_2]]
; MOVREL-NEXT: v_movrels_b32_e32 [[MOVREL1:v[0-9]+]], [[VEC_ELT0_2]]
; IDXMODE: s_set_gpr_idx_on [[READLANE]], src0
; IDXMODE-NEXT: v_mov_b32_e32 [[MOVREL1:v[0-9]+]], [[VEC_ELT1_2]]
; IDXMODE-NEXT: v_mov_b32_e32 [[MOVREL1:v[0-9]+]], [[VEC_ELT0_2]]
; IDXMODE: s_set_gpr_idx_off
; GCN-NEXT: s_xor_b64 exec, exec, vcc
@ -492,13 +544,15 @@ bb:
; offset puts outside of superregister boundaries, so clamp to 1st element.
; GCN-LABEL: {{^}}extract_largest_inbounds_offset:
; GCN-DAG: buffer_load_dwordx4 v{{\[}}[[LO_ELT:[0-9]+]]:[[HI_ELT:[0-9]+]]{{\].* offset:48}}
; GCN-DAG: s_load_dword [[IDX:s[0-9]+]]
; GCN-DAG: buffer_load_dwordx4 v{{\[}}[[LO_ELT:[0-9]+]]:[[HI_ELT:[0-9]+]]
; GCN-DAG: s_load_dword [[IDX0:s[0-9]+]]
; GCN-DAG: s_add_i32 [[IDX:s[0-9]+]], [[IDX0]], 15
; MOVREL: s_mov_b32 m0, [[IDX]]
; MOVREL: v_movrels_b32_e32 [[EXTRACT:v[0-9]+]], v[[HI_ELT]]
; MOVREL: v_movrels_b32_e32 [[EXTRACT:v[0-9]+]], v[[LO_ELT]]
; IDXMODE: s_set_gpr_idx_on [[IDX]], src0
; IDXMODE: v_mov_b32_e32 [[EXTRACT:v[0-9]+]], v[[HI_ELT]]
; IDXMODE: v_mov_b32_e32 [[EXTRACT:v[0-9]+]], v[[LO_ELT]]
; IDXMODE: s_set_gpr_idx_off
; GCN: buffer_store_dword [[EXTRACT]]
@ -514,10 +568,11 @@ entry:
; GCN-LABEL: {{^}}extract_out_of_bounds_offset:
; GCN-DAG: buffer_load_dwordx4 v{{\[}}[[LO_ELT:[0-9]+]]:[[HI_ELT:[0-9]+]]{{\]}}
; GCN-DAG: s_load_dword [[IDX:s[0-9]+]]
; MOVREL: s_add_i32 m0, [[IDX]], 16
; GCN: s_add_i32 [[ADD_IDX:s[0-9]+]], [[IDX]], 16
; MOVREL: s_mov_b32 m0, [[ADD_IDX]]
; MOVREL: v_movrels_b32_e32 [[EXTRACT:v[0-9]+]], v[[LO_ELT]]
; IDXMODE: s_add_i32 [[ADD_IDX:s[0-9]+]], [[IDX]], 16
; IDXMODE: s_set_gpr_idx_on [[ADD_IDX]], src0
; IDXMODE: v_mov_b32_e32 [[EXTRACT:v[0-9]+]], v[[LO_ELT]]
; IDXMODE: s_set_gpr_idx_off
@ -532,18 +587,15 @@ entry:
ret void
}
; Test that the or is folded into the base address register instead of
; added to m0
; GCN-LABEL: {{^}}extractelement_v16i32_or_index:
; GCN: s_load_dword [[IDX_IN:s[0-9]+]]
; GCN: s_lshl_b32 [[IDX_SHL:s[0-9]+]], [[IDX_IN]]
; GCN-NOT: [[IDX_SHL]]
; GCN: s_or_b32 [[IDX_FIN:s[0-9]+]], [[IDX_SHL]], 1
; MOVREL: s_mov_b32 m0, [[IDX_SHL]]
; MOVREL: s_mov_b32 m0, [[IDX_FIN]]
; MOVREL: v_movrels_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
; IDXMODE: s_set_gpr_idx_on [[IDX_SHL]], src0
; IDXMODE: s_set_gpr_idx_on [[IDX_FIN]], src0
; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
; IDXMODE: s_set_gpr_idx_off
define amdgpu_kernel void @extractelement_v16i32_or_index(i32 addrspace(1)* %out, <16 x i32> addrspace(1)* %in, i32 %idx.in) {
@ -559,12 +611,12 @@ entry:
; GCN-LABEL: {{^}}insertelement_v16f32_or_index:
; GCN: s_load_dword [[IDX_IN:s[0-9]+]]
; GCN: s_lshl_b32 [[IDX_SHL:s[0-9]+]], [[IDX_IN]]
; GCN-NOT: [[IDX_SHL]]
; GCN: s_or_b32 [[IDX_FIN:s[0-9]+]], [[IDX_SHL]], 1
; MOVREL: s_mov_b32 m0, [[IDX_SHL]]
; MOVREL: s_mov_b32 m0, [[IDX_FIN]]
; MOVREL: v_movreld_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
; IDXMODE: s_set_gpr_idx_on [[IDX_SHL]], dst
; IDXMODE: s_set_gpr_idx_on [[IDX_FIN]], dst
; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
; IDXMODE: s_set_gpr_idx_off
define amdgpu_kernel void @insertelement_v16f32_or_index(<16 x float> addrspace(1)* %out, <16 x float> %a, i32 %idx.in) nounwind {