[AMDGPU] Move atomic expand past infer address spaces

There are cases where infer address spaces pass cannot yet infer an address space in the opt pipeline and then in the llc pipeline it runs too late for atomic expand pass to benefit from a specific address space. Move atomic expand pass past the infer address spaces. Fixes: SWDEV-293410 Differential Revision: https://reviews.llvm.org/D105511
2024-11-22 10:42:39 +01:00 · 2021-07-06 14:34:28 -07:00 · 2021-07-06 14:34:28 -07:00 · 9e3245ac4b
commit 9e3245ac4b
parent fdcf893dae
3 changed files with 85 additions and 9 deletions
--- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@ -888,9 +888,6 @@ void AMDGPUPassConfig::addIRPasses() {
  // A call to propagate attributes pass in the backend in case opt was not run.
  addPass(createAMDGPUPropagateAttributesEarlyPass(&TM));

-  addPass(createAtomicExpandPass());
-
-
  addPass(createAMDGPULowerIntrinsicsPass());

  // Function calls are not supported, so make sure we inline everything.
@ -921,8 +918,12 @@ void AMDGPUPassConfig::addIRPasses() {
    addPass(createAMDGPULowerModuleLDSPass());
  }

-  if (TM.getOptLevel() > CodeGenOpt::None) {
+  if (TM.getOptLevel() > CodeGenOpt::None)
    addPass(createInferAddressSpacesPass());
+
+  addPass(createAtomicExpandPass());
+
+  if (TM.getOptLevel() > CodeGenOpt::None) {
    addPass(createAMDGPUPromoteAlloca());

    if (EnableSROA)
--- a/test/CodeGen/AMDGPU/global-atomics-fp.ll
+++ b/test/CodeGen/AMDGPU/global-atomics-fp.ll
@ -724,6 +724,81 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32_safe(float addrspace(1)*
  ret void
 }

+define amdgpu_kernel void @infer_as_before_atomic(float* addrspace(4)* %arg) #0 {
+; GFX900-LABEL: infer_as_before_atomic:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX900-NEXT:    s_mov_b64 s[2:3], 0
+; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX900-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX900-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX900-NEXT:    v_mov_b32_e32 v1, s4
+; GFX900-NEXT:  BB9_1: ; %atomicrmw.start
+; GFX900-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX900-NEXT:    v_add_f32_e32 v0, 1.0, v1
+; GFX900-NEXT:    v_mov_b32_e32 v2, 0
+; GFX900-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX900-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX900-NEXT:    v_mov_b32_e32 v1, v0
+; GFX900-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX900-NEXT:    s_cbranch_execnz BB9_1
+; GFX900-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX900-NEXT:    s_endpgm
+;
+; GFX908-LABEL: infer_as_before_atomic:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX908-NEXT:    v_mov_b32_e32 v0, 0
+; GFX908-NEXT:    v_mov_b32_e32 v1, 1.0
+; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX908-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX908-NEXT:    global_atomic_add_f32 v0, v1, s[0:1]
+; GFX908-NEXT:    s_endpgm
+;
+; GFX90A-LABEL: infer_as_before_atomic:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v1, 1.0
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    global_atomic_add_f32 v0, v1, s[0:1]
+; GFX90A-NEXT:    s_endpgm
+;
+; GFX10-LABEL: infer_as_before_atomic:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v1, s2
+; GFX10-NEXT:    s_mov_b32 s2, 0
+; GFX10-NEXT:  BB9_1: ; %atomicrmw.start
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    v_add_f32_e32 v0, 1.0, v1
+; GFX10-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    s_or_b32 s2, vcc_lo, s2
+; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
+; GFX10-NEXT:    s_cbranch_execnz BB9_1
+; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT:    s_endpgm
+  %load = load float*, float* addrspace(4)* %arg
+  %v = atomicrmw fadd float* %load, float 1.0 syncscope("agent-one-as") monotonic, align 4
+  ret void
+}
+
 attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" }
 attributes #1 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" "target-cpu"="gfx803" "target-features"="+atomic-fadd-insts" "amdgpu-unsafe-fp-atomics"="true" }
 attributes #2 = { "amdgpu-unsafe-fp-atomics"="true" }
--- a/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/test/CodeGen/AMDGPU/llc-pipeline.ll
@ -34,7 +34,6 @@
 ; GCN-O0-NEXT:     Fix function bitcasts for AMDGPU
 ; GCN-O0-NEXT:     FunctionPass Manager
 ; GCN-O0-NEXT:       Early propagate attributes from kernels to functions
-; GCN-O0-NEXT:       Expand Atomic instructions
 ; GCN-O0-NEXT:     AMDGPU Lower Intrinsics
 ; GCN-O0-NEXT:     AMDGPU Inline All Functions
 ; GCN-O0-NEXT:     CallGraph Construction
@ -44,6 +43,7 @@
 ; GCN-O0-NEXT:     Lower OpenCL enqueued blocks
 ; GCN-O0-NEXT:     Lower uses of LDS variables from non-kernel functions
 ; GCN-O0-NEXT:     FunctionPass Manager
+; GCN-O0-NEXT:     Expand Atomic instructions
 ; GCN-O0-NEXT:       Lower Garbage Collection Instructions
 ; GCN-O0-NEXT:       Shadow Stack GC Lowering
 ; GCN-O0-NEXT:       Lower constant intrinsics
@ -177,7 +177,6 @@
 ; GCN-O1-NEXT:     Fix function bitcasts for AMDGPU
 ; GCN-O1-NEXT:     FunctionPass Manager
 ; GCN-O1-NEXT:       Early propagate attributes from kernels to functions
-; GCN-O1-NEXT:       Expand Atomic instructions
 ; GCN-O1-NEXT:     AMDGPU Lower Intrinsics
 ; GCN-O1-NEXT:     AMDGPU Inline All Functions
 ; GCN-O1-NEXT:     CallGraph Construction
@ -188,6 +187,7 @@
 ; GCN-O1-NEXT:     Lower uses of LDS variables from non-kernel functions
 ; GCN-O1-NEXT:     FunctionPass Manager
 ; GCN-O1-NEXT:       Infer address spaces
+; GCN-O1-NEXT:       Expand Atomic instructions
 ; GCN-O1-NEXT:       AMDGPU Promote Alloca
 ; GCN-O1-NEXT:       Dominator Tree Construction
 ; GCN-O1-NEXT:       SROA
@ -425,7 +425,6 @@
 ; GCN-O1-OPTS-NEXT:     Fix function bitcasts for AMDGPU
 ; GCN-O1-OPTS-NEXT:     FunctionPass Manager
 ; GCN-O1-OPTS-NEXT:       Early propagate attributes from kernels to functions
-; GCN-O1-OPTS-NEXT:       Expand Atomic instructions
 ; GCN-O1-OPTS-NEXT:     AMDGPU Lower Intrinsics
 ; GCN-O1-OPTS-NEXT:     AMDGPU Inline All Functions
 ; GCN-O1-OPTS-NEXT:     CallGraph Construction
@ -436,6 +435,7 @@
 ; GCN-O1-OPTS-NEXT:     Lower uses of LDS variables from non-kernel functions
 ; GCN-O1-OPTS-NEXT:     FunctionPass Manager
 ; GCN-O1-OPTS-NEXT:       Infer address spaces
+; GCN-O1-OPTS-NEXT:       Expand Atomic instructions
 ; GCN-O1-OPTS-NEXT:       AMDGPU Promote Alloca
 ; GCN-O1-OPTS-NEXT:       Dominator Tree Construction
 ; GCN-O1-OPTS-NEXT:       SROA
@ -706,7 +706,6 @@
 ; GCN-O2-NEXT:     Fix function bitcasts for AMDGPU
 ; GCN-O2-NEXT:     FunctionPass Manager
 ; GCN-O2-NEXT:       Early propagate attributes from kernels to functions
-; GCN-O2-NEXT:       Expand Atomic instructions
 ; GCN-O2-NEXT:     AMDGPU Lower Intrinsics
 ; GCN-O2-NEXT:     AMDGPU Inline All Functions
 ; GCN-O2-NEXT:     CallGraph Construction
@ -717,6 +716,7 @@
 ; GCN-O2-NEXT:     Lower uses of LDS variables from non-kernel functions
 ; GCN-O2-NEXT:     FunctionPass Manager
 ; GCN-O2-NEXT:       Infer address spaces
+; GCN-O2-NEXT:       Expand Atomic instructions
 ; GCN-O2-NEXT:       AMDGPU Promote Alloca
 ; GCN-O2-NEXT:       Dominator Tree Construction
 ; GCN-O2-NEXT:       SROA
@ -988,7 +988,6 @@
 ; GCN-O3-NEXT:     Fix function bitcasts for AMDGPU
 ; GCN-O3-NEXT:     FunctionPass Manager
 ; GCN-O3-NEXT:       Early propagate attributes from kernels to functions
-; GCN-O3-NEXT:       Expand Atomic instructions
 ; GCN-O3-NEXT:     AMDGPU Lower Intrinsics
 ; GCN-O3-NEXT:     AMDGPU Inline All Functions
 ; GCN-O3-NEXT:     CallGraph Construction
@ -999,6 +998,7 @@
 ; GCN-O3-NEXT:     Lower uses of LDS variables from non-kernel functions
 ; GCN-O3-NEXT:     FunctionPass Manager
 ; GCN-O3-NEXT:       Infer address spaces
+; GCN-O3-NEXT:       Expand Atomic instructions
 ; GCN-O3-NEXT:       AMDGPU Promote Alloca
 ; GCN-O3-NEXT:       Dominator Tree Construction
 ; GCN-O3-NEXT:       SROA