commit 23ab920e40
parent be3f0f958e

AMDGPU: If a store defines (alias) a load, it clobbers the load.

Summary: If a store defines (must alias) a load, it clobbers the load.

Fixes: SWDEV-258915

Reviewers: arsenm

Differential Revision: https://reviews.llvm.org/D92951
@@ -110,7 +110,9 @@ bool AMDGPUAnnotateUniformValues::isClobberedInFunction(LoadInst * Load) {
       BasicBlock::iterator(Load) : BB->end();
     auto Q = MDR->getPointerDependencyFrom(
         MemoryLocation::getBeforeOrAfter(Ptr), true, StartIt, BB, Load);
-    if (Q.isClobber() || Q.isUnknown())
+    if (Q.isClobber() || Q.isUnknown() ||
+        // Store defines the load and thus clobbers it.
+        (Q.isDef() && Q.getInst()->mayWriteToMemory()))
       return true;
   }
   return false;
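For context: this pass marks loads whose pointers are uniform and provably unclobbered with amdgpu.noclobber, which later allows them to be selected as scalar loads. MemoryDependenceResults::getPointerDependencyFrom can classify the dependency as a Clobber, as Unknown, or as a Def; previously only the first two made isClobberedInFunction return true, so a store that must-aliases the load (reported as a Def) incorrectly left the load treated as unclobbered. A minimal sketch of the corrected predicate, with a hypothetical helper name that is not part of the patch:

#include "llvm/Analysis/MemoryDependenceAnalysis.h"

using namespace llvm;

// Restates the corrected condition from the hunk above (the helper name
// defClobbersLoad is illustrative only). A Def result means Q.getInst()
// exactly defines the queried location; if that instruction also writes
// memory (a store or an atomicrmw), it overwrites the loaded value and
// therefore clobbers the load, just like a Clobber or Unknown result.
static bool defClobbersLoad(const MemDepResult &Q) {
  return Q.isClobber() || Q.isUnknown() ||
         (Q.isDef() && Q.getInst()->mayWriteToMemory());
}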
new file: test/CodeGen/AMDGPU/store-clobbers-load.ll
@@ -0,0 +1,43 @@
+; RUN: opt -S --amdgpu-annotate-uniform < %s | FileCheck -check-prefix=OPT %s
+target datalayout = "A5"
+
+; "load vaddr" depends on the store, so we should not mark vaddr as amdgpu.noclobber.
+
+; OPT-LABEL: @store_clobbers_load(
+; OPT: %vaddr = bitcast [4 x i32] addrspace(5)* %alloca to <4 x i32> addrspace(5)*, !amdgpu.uniform !0
+; OPT-NEXT: %zero = load <4 x i32>, <4 x i32> addrspace(5)* %vaddr, align 16
+define amdgpu_kernel void @store_clobbers_load(i32 addrspace(1)* %out, i32 %index) {
+entry:
+  %alloca = alloca [4 x i32], addrspace(5)
+  %addr0 = bitcast [4 x i32] addrspace(5)* %alloca to i32 addrspace(5)*
+  store i32 0, i32 addrspace(5)* %addr0
+  %vaddr = bitcast [4 x i32] addrspace(5)* %alloca to <4 x i32> addrspace(5)*
+  %zero = load <4 x i32>, <4 x i32> addrspace(5)* %vaddr, align 16
+  %one = insertelement <4 x i32> %zero, i32 1, i32 1
+  %two = insertelement <4 x i32> %one, i32 2, i32 2
+  %three = insertelement <4 x i32> %two, i32 3, i32 3
+  store <4 x i32> %three, <4 x i32> addrspace(5)* %vaddr, align 16
+  %rslt = extractelement <4 x i32> %three, i32 %index
+  store i32 %rslt, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x()
+@lds0 = addrspace(3) global [512 x i32] undef, align 4
+
+; To check that %arrayidx0 is not marked as amdgpu.noclobber.
+
+; OPT-LABEL: @atomicrmw_clobbers_load(
+; OPT: %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds0, i32 0, i32 %idx.0, !amdgpu.uniform !0
+; OPT-NEXT: %val = atomicrmw xchg i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
+
+define amdgpu_kernel void @atomicrmw_clobbers_load(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1) {
+  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
+  %idx.0 = add nsw i32 %tid.x, 2
+  %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds0, i32 0, i32 %idx.0
+  %val = atomicrmw xchg i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
+  %load = load i32, i32 addrspace(3)* %arrayidx0, align 4
+  store i32 %val, i32 addrspace(1)* %out0, align 4
+  store i32 %load, i32 addrspace(1)* %out1, align 4
+  ret void
+}
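Both new tests exercise the added Def path: a plain store that must-aliases the following vector load, and an atomicrmw xchg, which also writes memory. In each case getPointerDependencyFrom returns a Def whose instruction writes memory, so the load keeps !amdgpu.uniform on its pointer but is no longer marked amdgpu.noclobber (the OPT-NEXT lines check that the load and the atomicrmw carry no extra metadata). The remaining hunks update an existing wave32/wave64 codegen test (the GFX1032/GFX1064 prefixes) whose checks depended on the old annotation.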
@@ -231,9 +231,9 @@ bb13:
 ; GCN: ; %bb.{{[0-9]+}}: ; %.preheader
 ; GCN: BB{{.*}}:
 
+; GCN: global_store_dword
 ; GFX1032: s_or_b32 [[MASK0:s[0-9]+]], [[MASK0]], vcc_lo
 ; GFX1064: s_or_b64 [[MASK0:s\[[0-9:]+\]]], [[MASK0]], vcc
-; GCN: global_store_dword
 ; GFX1032: s_andn2_b32 [[MASK1:s[0-9]+]], [[MASK1]], exec_lo
 ; GFX1064: s_andn2_b64 [[MASK1:s\[[0-9:]+\]]], [[MASK1]], exec
 ; GFX1032: s_and_b32 [[MASK0]], [[MASK0]], exec_lo
@@ -249,10 +249,12 @@ bb13:
 ; GFX1064: s_andn2_b64 exec, exec, [[ACC]]
 ; GCN: s_cbranch_execz
 ; GCN: BB{{.*}}:
-; GCN: s_load_dword [[LOAD:s[0-9]+]]
 ; GFX1032: s_or_b32 [[MASK1]], [[MASK1]], exec_lo
 ; GFX1064: s_or_b64 [[MASK1]], [[MASK1]], exec
-; GCN: s_cmp_lt_i32 [[LOAD]], 11
+; GCN: global_load_dword [[LOAD:v[0-9]+]]
+; GFX1032: v_cmp_gt_i32_e32 vcc_lo, 11, [[LOAD]]
+; GFX1064: v_cmp_gt_i32_e32 vcc, 11, [[LOAD]]
+
 define amdgpu_kernel void @test_loop_with_if_else_break(i32 addrspace(1)* %arg) #0 {
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
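These check updates reflect the downstream codegen effect of the fix: once the load in @test_loop_with_if_else_break is no longer annotated amdgpu.noclobber, it can no longer be selected as a scalar s_load_dword feeding s_cmp_lt_i32; it is instead emitted as a global_load_dword into a VGPR and compared with the VALU v_cmp_gt_i32_e32 into vcc/vcc_lo (the operand order flips, so the less-than scalar compare becomes a greater-than vector compare against 11). The relocated global_store_dword check in the previous hunk follows the same change in instruction order.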