mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-11-23 11:13:28 +01:00
AMDGPU/GlobalISel: Fix not using global atomics on gfx9+
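For some reason, the flat and global atomic patterns end up in the generated GlobalISel matcher table in a different order than they do for SelectionDAG, so the flat atomics were being selected for global address space accesses. Give the flat atomic pseudos a lower AddedComplexity (-1) so that the global atomics are checked first.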
This commit is contained in:
parent
533c97e3ea
commit
77c2d662f8
@ -279,6 +279,7 @@ multiclass FLAT_Atomic_Pseudo<
|
||||
AtomicNoRet <opName, 0> {
|
||||
let PseudoInstr = NAME;
|
||||
let FPAtomic = isFP;
|
||||
let AddedComplexity = -1; // Prefer global atomics if available
|
||||
}
|
||||
|
||||
def _RTN : FLAT_AtomicRet_Pseudo <opName,
|
||||
@ -290,6 +291,7 @@ multiclass FLAT_Atomic_Pseudo<
|
||||
GlobalSaddrTable<0, opName#"_rtn">,
|
||||
AtomicNoRet <opName, 1>{
|
||||
let FPAtomic = isFP;
|
||||
let AddedComplexity = -1; // Prefer global atomics if available
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -30,15 +30,15 @@ body: |
|
||||
; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
|
||||
; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
|
||||
; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
|
||||
; GFX9: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1)
|
||||
; GFX9: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]]
|
||||
; GFX9: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (load store seq_cst 4, addrspace 1)
|
||||
; GFX9: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_RTN]]
|
||||
; GFX10-LABEL: name: global_atomicrmw_add_s32
|
||||
; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
|
||||
; GFX10: $vcc_hi = IMPLICIT_DEF
|
||||
; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
|
||||
; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
|
||||
; GFX10: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1)
|
||||
; GFX10: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]]
|
||||
; GFX10: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (load store seq_cst 4, addrspace 1)
|
||||
; GFX10: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_RTN]]
|
||||
%0:vgpr(p1) = COPY $vgpr0_vgpr1
|
||||
%1:vgpr(s32) = COPY $vgpr2
|
||||
%2:vgpr(s32) = G_ATOMICRMW_ADD %0, %1 :: (load store seq_cst 4, addrspace 1)
|
||||
@ -69,13 +69,13 @@ body: |
|
||||
; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
|
||||
; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
|
||||
; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
|
||||
; GFX9: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1)
|
||||
; GFX9: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (load store seq_cst 4, addrspace 1)
|
||||
; GFX10-LABEL: name: global_atomicrmw_add_s32_nortn
|
||||
; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
|
||||
; GFX10: $vcc_hi = IMPLICIT_DEF
|
||||
; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
|
||||
; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
|
||||
; GFX10: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1)
|
||||
; GFX10: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (load store seq_cst 4, addrspace 1)
|
||||
%0:vgpr(p1) = COPY $vgpr0_vgpr1
|
||||
%1:vgpr(s32) = COPY $vgpr2
|
||||
%2:vgpr(s32) = G_ATOMICRMW_ADD %0, %1 :: (load store seq_cst 4, addrspace 1)
|
||||
@ -119,15 +119,15 @@ body: |
|
||||
; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
|
||||
; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
|
||||
; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
|
||||
; GFX9: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 2047, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1)
|
||||
; GFX9: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]]
|
||||
; GFX9: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 2047, 0, implicit $exec :: (load store seq_cst 4, addrspace 1)
|
||||
; GFX9: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_RTN]]
|
||||
; GFX10-LABEL: name: global_atomicrmw_add_s32_offset2047
|
||||
; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
|
||||
; GFX10: $vcc_hi = IMPLICIT_DEF
|
||||
; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
|
||||
; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
|
||||
; GFX10: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 2047, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1)
|
||||
; GFX10: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]]
|
||||
; GFX10: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 2047, 0, implicit $exec :: (load store seq_cst 4, addrspace 1)
|
||||
; GFX10: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_RTN]]
|
||||
%0:vgpr(p1) = COPY $vgpr0_vgpr1
|
||||
%1:vgpr(s32) = COPY $vgpr2
|
||||
%2:vgpr(s64) = G_CONSTANT i64 2047
|
||||
@ -172,13 +172,13 @@ body: |
|
||||
; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
|
||||
; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
|
||||
; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
|
||||
; GFX9: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 2047, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1)
|
||||
; GFX9: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 2047, 0, implicit $exec :: (load store seq_cst 4, addrspace 1)
|
||||
; GFX10-LABEL: name: global_atomicrmw_add_s32_offset2047_nortn
|
||||
; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
|
||||
; GFX10: $vcc_hi = IMPLICIT_DEF
|
||||
; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
|
||||
; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
|
||||
; GFX10: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 2047, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1)
|
||||
; GFX10: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 2047, 0, implicit $exec :: (load store seq_cst 4, addrspace 1)
|
||||
%0:vgpr(p1) = COPY $vgpr0_vgpr1
|
||||
%1:vgpr(s32) = COPY $vgpr2
|
||||
%2:vgpr(s64) = G_CONSTANT i64 2047
|
||||
@ -224,8 +224,8 @@ body: |
|
||||
; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
|
||||
; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
|
||||
; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
|
||||
; GFX9: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 2048, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1)
|
||||
; GFX9: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]]
|
||||
; GFX9: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 2048, 0, implicit $exec :: (load store seq_cst 4, addrspace 1)
|
||||
; GFX9: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_RTN]]
|
||||
; GFX10-LABEL: name: global_atomicrmw_add_s32_offset2048
|
||||
; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
|
||||
; GFX10: $vcc_hi = IMPLICIT_DEF
|
||||
@ -241,8 +241,8 @@ body: |
|
||||
; GFX10: [[V_ADD_I32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_I32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_I32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
|
||||
; GFX10: %10:vgpr_32, dead %12:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_I32_e64_1]], 0, implicit $exec
|
||||
; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_I32_e64_]], %subreg.sub0, %10, %subreg.sub1
|
||||
; GFX10: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1)
|
||||
; GFX10: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]]
|
||||
; GFX10: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 0, implicit $exec :: (load store seq_cst 4, addrspace 1)
|
||||
; GFX10: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_RTN]]
|
||||
%0:vgpr(p1) = COPY $vgpr0_vgpr1
|
||||
%1:vgpr(s32) = COPY $vgpr2
|
||||
%2:vgpr(s64) = G_CONSTANT i64 2048
|
||||
@ -287,7 +287,7 @@ body: |
|
||||
; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
|
||||
; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
|
||||
; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
|
||||
; GFX9: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 2048, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1)
|
||||
; GFX9: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 2048, 0, implicit $exec :: (load store seq_cst 4, addrspace 1)
|
||||
; GFX10-LABEL: name: global_atomicrmw_add_s32_offset2048_nortn
|
||||
; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
|
||||
; GFX10: $vcc_hi = IMPLICIT_DEF
|
||||
@ -303,7 +303,7 @@ body: |
|
||||
; GFX10: [[V_ADD_I32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_I32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_I32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
|
||||
; GFX10: %10:vgpr_32, dead %12:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_I32_e64_1]], 0, implicit $exec
|
||||
; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_I32_e64_]], %subreg.sub0, %10, %subreg.sub1
|
||||
; GFX10: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1)
|
||||
; GFX10: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 0, implicit $exec :: (load store seq_cst 4, addrspace 1)
|
||||
%0:vgpr(p1) = COPY $vgpr0_vgpr1
|
||||
%1:vgpr(s32) = COPY $vgpr2
|
||||
%2:vgpr(s64) = G_CONSTANT i64 2048
|
||||
@ -349,8 +349,8 @@ body: |
|
||||
; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
|
||||
; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
|
||||
; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
|
||||
; GFX9: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 4095, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1)
|
||||
; GFX9: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]]
|
||||
; GFX9: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 4095, 0, implicit $exec :: (load store seq_cst 4, addrspace 1)
|
||||
; GFX9: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_RTN]]
|
||||
; GFX10-LABEL: name: global_atomicrmw_add_s32_offset4095
|
||||
; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
|
||||
; GFX10: $vcc_hi = IMPLICIT_DEF
|
||||
@ -366,8 +366,8 @@ body: |
|
||||
; GFX10: [[V_ADD_I32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_I32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_I32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
|
||||
; GFX10: %10:vgpr_32, dead %12:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_I32_e64_1]], 0, implicit $exec
|
||||
; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_I32_e64_]], %subreg.sub0, %10, %subreg.sub1
|
||||
; GFX10: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1)
|
||||
; GFX10: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]]
|
||||
; GFX10: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 0, implicit $exec :: (load store seq_cst 4, addrspace 1)
|
||||
; GFX10: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_RTN]]
|
||||
%0:vgpr(p1) = COPY $vgpr0_vgpr1
|
||||
%1:vgpr(s32) = COPY $vgpr2
|
||||
%2:vgpr(s64) = G_CONSTANT i64 4095
|
||||
@ -412,7 +412,7 @@ body: |
|
||||
; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
|
||||
; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
|
||||
; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
|
||||
; GFX9: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 4095, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1)
|
||||
; GFX9: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 4095, 0, implicit $exec :: (load store seq_cst 4, addrspace 1)
|
||||
; GFX10-LABEL: name: global_atomicrmw_add_s32_offset4095_nortn
|
||||
; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
|
||||
; GFX10: $vcc_hi = IMPLICIT_DEF
|
||||
@ -428,7 +428,7 @@ body: |
|
||||
; GFX10: [[V_ADD_I32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_I32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_I32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
|
||||
; GFX10: %10:vgpr_32, dead %12:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_I32_e64_1]], 0, implicit $exec
|
||||
; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_I32_e64_]], %subreg.sub0, %10, %subreg.sub1
|
||||
; GFX10: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1)
|
||||
; GFX10: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 0, implicit $exec :: (load store seq_cst 4, addrspace 1)
|
||||
%0:vgpr(p1) = COPY $vgpr0_vgpr1
|
||||
%1:vgpr(s32) = COPY $vgpr2
|
||||
%2:vgpr(s64) = G_CONSTANT i64 4095
|
||||
@ -484,8 +484,8 @@ body: |
|
||||
; GFX9: [[V_ADD_I32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_I32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_I32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
|
||||
; GFX9: %10:vgpr_32, dead %12:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_I32_e64_1]], 0, implicit $exec
|
||||
; GFX9: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_I32_e64_]], %subreg.sub0, %10, %subreg.sub1
|
||||
; GFX9: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1)
|
||||
; GFX9: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]]
|
||||
; GFX9: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 0, implicit $exec :: (load store seq_cst 4, addrspace 1)
|
||||
; GFX9: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_RTN]]
|
||||
; GFX10-LABEL: name: global_atomicrmw_add_s32_offset4097
|
||||
; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
|
||||
; GFX10: $vcc_hi = IMPLICIT_DEF
|
||||
@ -501,8 +501,8 @@ body: |
|
||||
; GFX10: [[V_ADD_I32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_I32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_I32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
|
||||
; GFX10: %10:vgpr_32, dead %12:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_I32_e64_1]], 0, implicit $exec
|
||||
; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_I32_e64_]], %subreg.sub0, %10, %subreg.sub1
|
||||
; GFX10: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1)
|
||||
; GFX10: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]]
|
||||
; GFX10: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 0, implicit $exec :: (load store seq_cst 4, addrspace 1)
|
||||
; GFX10: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_RTN]]
|
||||
%0:vgpr(p1) = COPY $vgpr0_vgpr1
|
||||
%1:vgpr(s32) = COPY $vgpr2
|
||||
%2:vgpr(s64) = G_CONSTANT i64 4097
|
||||
@ -557,7 +557,7 @@ body: |
|
||||
; GFX9: [[V_ADD_I32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_I32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_I32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
|
||||
; GFX9: %10:vgpr_32, dead %12:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_I32_e64_1]], 0, implicit $exec
|
||||
; GFX9: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_I32_e64_]], %subreg.sub0, %10, %subreg.sub1
|
||||
; GFX9: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1)
|
||||
; GFX9: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 0, implicit $exec :: (load store seq_cst 4, addrspace 1)
|
||||
; GFX10-LABEL: name: global_atomicrmw_add_s32_offset4097_nortn
|
||||
; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
|
||||
; GFX10: $vcc_hi = IMPLICIT_DEF
|
||||
@ -573,7 +573,7 @@ body: |
|
||||
; GFX10: [[V_ADD_I32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_I32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_I32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
|
||||
; GFX10: %10:vgpr_32, dead %12:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_I32_e64_1]], 0, implicit $exec
|
||||
; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_I32_e64_]], %subreg.sub0, %10, %subreg.sub1
|
||||
; GFX10: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1)
|
||||
; GFX10: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 0, implicit $exec :: (load store seq_cst 4, addrspace 1)
|
||||
%0:vgpr(p1) = COPY $vgpr0_vgpr1
|
||||
%1:vgpr(s32) = COPY $vgpr2
|
||||
%2:vgpr(s64) = G_CONSTANT i64 4097
|
||||
@ -607,15 +607,15 @@ body: |
|
||||
; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
|
||||
; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
|
||||
; GFX9: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
|
||||
; GFX9: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8, addrspace 1)
|
||||
; GFX9: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_ADD_X2_RTN]]
|
||||
; GFX9: [[GLOBAL_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = GLOBAL_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (load store seq_cst 8, addrspace 1)
|
||||
; GFX9: $vgpr0_vgpr1 = COPY [[GLOBAL_ATOMIC_ADD_X2_RTN]]
|
||||
; GFX10-LABEL: name: global_atomicrmw_add_s64
|
||||
; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
|
||||
; GFX10: $vcc_hi = IMPLICIT_DEF
|
||||
; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
|
||||
; GFX10: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
|
||||
; GFX10: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8, addrspace 1)
|
||||
; GFX10: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_ADD_X2_RTN]]
|
||||
; GFX10: [[GLOBAL_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = GLOBAL_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (load store seq_cst 8, addrspace 1)
|
||||
; GFX10: $vgpr0_vgpr1 = COPY [[GLOBAL_ATOMIC_ADD_X2_RTN]]
|
||||
%0:vgpr(p1) = COPY $vgpr0_vgpr1
|
||||
%1:vgpr(s64) = COPY $vgpr2_vgpr3
|
||||
%2:vgpr(s64) = G_ATOMICRMW_ADD %0, %1 :: (load store seq_cst 8, addrspace 1)
|
||||
@ -646,13 +646,13 @@ body: |
|
||||
; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
|
||||
; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
|
||||
; GFX9: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
|
||||
; GFX9: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8, addrspace 1)
|
||||
; GFX9: [[GLOBAL_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = GLOBAL_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (load store seq_cst 8, addrspace 1)
|
||||
; GFX10-LABEL: name: global_atomicrmw_add_s64_nortn
|
||||
; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
|
||||
; GFX10: $vcc_hi = IMPLICIT_DEF
|
||||
; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
|
||||
; GFX10: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
|
||||
; GFX10: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8, addrspace 1)
|
||||
; GFX10: [[GLOBAL_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = GLOBAL_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (load store seq_cst 8, addrspace 1)
|
||||
%0:vgpr(p1) = COPY $vgpr0_vgpr1
|
||||
%1:vgpr(s64) = COPY $vgpr2_vgpr3
|
||||
%2:vgpr(s64) = G_ATOMICRMW_ADD %0, %1 :: (load store seq_cst 8, addrspace 1)
|
||||
@ -696,8 +696,8 @@ body: |
|
||||
; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
|
||||
; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
|
||||
; GFX9: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
|
||||
; GFX9: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 4095, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8, addrspace 1)
|
||||
; GFX9: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_ADD_X2_RTN]]
|
||||
; GFX9: [[GLOBAL_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = GLOBAL_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 4095, 0, implicit $exec :: (load store seq_cst 8, addrspace 1)
|
||||
; GFX9: $vgpr0_vgpr1 = COPY [[GLOBAL_ATOMIC_ADD_X2_RTN]]
|
||||
; GFX10-LABEL: name: global_atomicrmw_add_s64_offset4095
|
||||
; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
|
||||
; GFX10: $vcc_hi = IMPLICIT_DEF
|
||||
@ -713,8 +713,8 @@ body: |
|
||||
; GFX10: [[V_ADD_I32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_I32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_I32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
|
||||
; GFX10: %10:vgpr_32, dead %12:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_I32_e64_1]], 0, implicit $exec
|
||||
; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_I32_e64_]], %subreg.sub0, %10, %subreg.sub1
|
||||
; GFX10: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8, addrspace 1)
|
||||
; GFX10: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_ADD_X2_RTN]]
|
||||
; GFX10: [[GLOBAL_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = GLOBAL_ATOMIC_ADD_X2_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 0, implicit $exec :: (load store seq_cst 8, addrspace 1)
|
||||
; GFX10: $vgpr0_vgpr1 = COPY [[GLOBAL_ATOMIC_ADD_X2_RTN]]
|
||||
%0:vgpr(p1) = COPY $vgpr0_vgpr1
|
||||
%1:vgpr(s64) = COPY $vgpr2_vgpr3
|
||||
%2:vgpr(s64) = G_CONSTANT i64 4095
|
||||
@ -759,7 +759,7 @@ body: |
|
||||
; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
|
||||
; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
|
||||
; GFX9: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
|
||||
; GFX9: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 4095, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8, addrspace 1)
|
||||
; GFX9: [[GLOBAL_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = GLOBAL_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 4095, 0, implicit $exec :: (load store seq_cst 8, addrspace 1)
|
||||
; GFX10-LABEL: name: global_atomicrmw_add_s64_offset4095_nortn
|
||||
; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
|
||||
; GFX10: $vcc_hi = IMPLICIT_DEF
|
||||
@ -775,7 +775,7 @@ body: |
|
||||
; GFX10: [[V_ADD_I32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_I32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_I32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
|
||||
; GFX10: %10:vgpr_32, dead %12:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_I32_e64_1]], 0, implicit $exec
|
||||
; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_I32_e64_]], %subreg.sub0, %10, %subreg.sub1
|
||||
; GFX10: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8, addrspace 1)
|
||||
; GFX10: [[GLOBAL_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = GLOBAL_ATOMIC_ADD_X2_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 0, implicit $exec :: (load store seq_cst 8, addrspace 1)
|
||||
%0:vgpr(p1) = COPY $vgpr0_vgpr1
|
||||
%1:vgpr(s64) = COPY $vgpr2_vgpr3
|
||||
%2:vgpr(s64) = G_CONSTANT i64 4095
|
||||
|
@ -227,10 +227,10 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32(i32 addrspace(1)* %out, i32
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX9-NEXT: flat_atomic_inc v2, v[0:1], v2 glc
|
||||
; GFX9-NEXT: global_atomic_inc v2, v[0:1], v2, off glc
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: global_store_dword v[0:1], v2, off
|
||||
; GFX9-NEXT: s_endpgm
|
||||
%result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %ptr, i32 42, i32 0, i32 0, i1 false)
|
||||
@ -280,10 +280,10 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset(i32 addrspace(1)* %o
|
||||
; GFX9-NEXT: s_addc_u32 s3, s3, 0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX9-NEXT: flat_atomic_inc v2, v[0:1], v2 glc
|
||||
; GFX9-NEXT: global_atomic_inc v2, v[0:1], v2, off glc
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: global_store_dword v[0:1], v2, off
|
||||
; GFX9-NEXT: s_endpgm
|
||||
%gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
|
||||
@ -293,31 +293,75 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset(i32 addrspace(1)* %o
|
||||
}
|
||||
|
||||
define amdgpu_kernel void @global_atomic_inc_noret_i32(i32 addrspace(1)* %ptr) nounwind {
|
||||
; GCN-LABEL: global_atomic_inc_noret_i32:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
||||
; GCN-NEXT: v_mov_b32_e32 v2, 42
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GCN-NEXT: flat_atomic_inc v0, v[0:1], v2 glc
|
||||
; GCN-NEXT: s_endpgm
|
||||
; CI-LABEL: global_atomic_inc_noret_i32:
|
||||
; CI: ; %bb.0:
|
||||
; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
||||
; CI-NEXT: v_mov_b32_e32 v2, 42
|
||||
; CI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; CI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; CI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; CI-NEXT: flat_atomic_inc v0, v[0:1], v2 glc
|
||||
; CI-NEXT: s_endpgm
|
||||
;
|
||||
; VI-LABEL: global_atomic_inc_noret_i32:
|
||||
; VI: ; %bb.0:
|
||||
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
||||
; VI-NEXT: v_mov_b32_e32 v2, 42
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-NEXT: flat_atomic_inc v0, v[0:1], v2 glc
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: global_atomic_inc_noret_i32:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, 42
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX9-NEXT: global_atomic_inc v0, v[0:1], v2, off glc
|
||||
; GFX9-NEXT: s_endpgm
|
||||
%result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %ptr, i32 42, i32 0, i32 0, i1 false)
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_kernel void @global_atomic_inc_noret_i32_offset(i32 addrspace(1)* %ptr) nounwind {
|
||||
; GCN-LABEL: global_atomic_inc_noret_i32_offset:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
||||
; GCN-NEXT: v_mov_b32_e32 v2, 42
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: s_add_u32 s0, s0, 16
|
||||
; GCN-NEXT: s_addc_u32 s1, s1, 0
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GCN-NEXT: flat_atomic_inc v0, v[0:1], v2 glc
|
||||
; GCN-NEXT: s_endpgm
|
||||
; CI-LABEL: global_atomic_inc_noret_i32_offset:
|
||||
; CI: ; %bb.0:
|
||||
; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
||||
; CI-NEXT: v_mov_b32_e32 v2, 42
|
||||
; CI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; CI-NEXT: s_add_u32 s0, s0, 16
|
||||
; CI-NEXT: s_addc_u32 s1, s1, 0
|
||||
; CI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; CI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; CI-NEXT: flat_atomic_inc v0, v[0:1], v2 glc
|
||||
; CI-NEXT: s_endpgm
|
||||
;
|
||||
; VI-LABEL: global_atomic_inc_noret_i32_offset:
|
||||
; VI: ; %bb.0:
|
||||
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
||||
; VI-NEXT: v_mov_b32_e32 v2, 42
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: s_add_u32 s0, s0, 16
|
||||
; VI-NEXT: s_addc_u32 s1, s1, 0
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-NEXT: flat_atomic_inc v0, v[0:1], v2 glc
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: global_atomic_inc_noret_i32_offset:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, 42
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: s_add_u32 s0, s0, 16
|
||||
; GFX9-NEXT: s_addc_u32 s1, s1, 0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX9-NEXT: global_atomic_inc v0, v[0:1], v2, off glc
|
||||
; GFX9-NEXT: s_endpgm
|
||||
%gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
|
||||
%result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %gep, i32 42, i32 0, i32 0, i1 false)
|
||||
ret void
|
||||
@ -390,11 +434,11 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(i32 addrspace
|
||||
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v0, v2, vcc
|
||||
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 20, v1
|
||||
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
|
||||
; GFX9-NEXT: flat_atomic_inc v3, v[0:1], v5 glc
|
||||
; GFX9-NEXT: global_atomic_inc v3, v[0:1], v5, off glc
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v4
|
||||
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: global_store_dword v[0:1], v3, off
|
||||
; GFX9-NEXT: s_endpgm
|
||||
%id = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
@ -463,7 +507,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(i32 addrspa
|
||||
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc
|
||||
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 20, v0
|
||||
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; GFX9-NEXT: flat_atomic_inc v0, v[0:1], v5 glc
|
||||
; GFX9-NEXT: global_atomic_inc v0, v[0:1], v5, off glc
|
||||
; GFX9-NEXT: s_endpgm
|
||||
%id = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%gep.tid = getelementptr i32, i32 addrspace(1)* %ptr, i32 %id
|
||||
@ -762,10 +806,10 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64(i64 addrspace(1)* %out, i64
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
|
||||
; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v[0:1], v[2:3], off glc
|
||||
; GFX9-NEXT: v_mov_b32_e32 v3, s1
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, s0
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
|
||||
; GFX9-NEXT: s_endpgm
|
||||
%result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %ptr, i64 42, i32 0, i32 0, i1 false)
|
||||
@ -818,10 +862,10 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(i64 addrspace(1)* %o
|
||||
; GFX9-NEXT: s_addc_u32 s3, s3, 0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
|
||||
; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v[0:1], v[2:3], off glc
|
||||
; GFX9-NEXT: v_mov_b32_e32 v3, s1
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, s0
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
|
||||
; GFX9-NEXT: s_endpgm
|
||||
%gep = getelementptr i64, i64 addrspace(1)* %ptr, i32 4
|
||||
@ -831,33 +875,81 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(i64 addrspace(1)* %o
|
||||
}
|
||||
|
||||
define amdgpu_kernel void @global_atomic_inc_noret_i64(i64 addrspace(1)* %ptr) nounwind {
|
||||
; GCN-LABEL: global_atomic_inc_noret_i64:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
||||
; GCN-NEXT: v_mov_b32_e32 v2, 42
|
||||
; GCN-NEXT: v_mov_b32_e32 v3, 0
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GCN-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
|
||||
; GCN-NEXT: s_endpgm
|
||||
; CI-LABEL: global_atomic_inc_noret_i64:
|
||||
; CI: ; %bb.0:
|
||||
; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
||||
; CI-NEXT: v_mov_b32_e32 v2, 42
|
||||
; CI-NEXT: v_mov_b32_e32 v3, 0
|
||||
; CI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; CI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; CI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
|
||||
; CI-NEXT: s_endpgm
|
||||
;
|
||||
; VI-LABEL: global_atomic_inc_noret_i64:
|
||||
; VI: ; %bb.0:
|
||||
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
||||
; VI-NEXT: v_mov_b32_e32 v2, 42
|
||||
; VI-NEXT: v_mov_b32_e32 v3, 0
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: global_atomic_inc_noret_i64:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, 42
|
||||
; GFX9-NEXT: v_mov_b32_e32 v3, 0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v[0:1], v[2:3], off glc
|
||||
; GFX9-NEXT: s_endpgm
|
||||
%result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %ptr, i64 42, i32 0, i32 0, i1 false)
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(i64 addrspace(1)* %ptr) nounwind {
|
||||
; GCN-LABEL: global_atomic_inc_noret_i64_offset:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
||||
; GCN-NEXT: v_mov_b32_e32 v2, 42
|
||||
; GCN-NEXT: v_mov_b32_e32 v3, 0
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: s_add_u32 s0, s0, 32
|
||||
; GCN-NEXT: s_addc_u32 s1, s1, 0
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GCN-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
|
||||
; GCN-NEXT: s_endpgm
|
||||
; CI-LABEL: global_atomic_inc_noret_i64_offset:
|
||||
; CI: ; %bb.0:
|
||||
; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
||||
; CI-NEXT: v_mov_b32_e32 v2, 42
|
||||
; CI-NEXT: v_mov_b32_e32 v3, 0
|
||||
; CI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; CI-NEXT: s_add_u32 s0, s0, 32
|
||||
; CI-NEXT: s_addc_u32 s1, s1, 0
|
||||
; CI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; CI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
|
||||
; CI-NEXT: s_endpgm
|
||||
;
|
||||
; VI-LABEL: global_atomic_inc_noret_i64_offset:
|
||||
; VI: ; %bb.0:
|
||||
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
||||
; VI-NEXT: v_mov_b32_e32 v2, 42
|
||||
; VI-NEXT: v_mov_b32_e32 v3, 0
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: s_add_u32 s0, s0, 32
|
||||
; VI-NEXT: s_addc_u32 s1, s1, 0
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: global_atomic_inc_noret_i64_offset:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, 42
|
||||
; GFX9-NEXT: v_mov_b32_e32 v3, 0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: s_add_u32 s0, s0, 32
|
||||
; GFX9-NEXT: s_addc_u32 s1, s1, 0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v[0:1], v[2:3], off glc
|
||||
; GFX9-NEXT: s_endpgm
|
||||
%gep = getelementptr i64, i64 addrspace(1)* %ptr, i32 4
|
||||
%result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %gep, i64 42, i32 0, i32 0, i1 false)
|
||||
ret void
|
||||
@ -933,11 +1025,11 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(i64 addrspace
|
||||
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 40, v2
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
|
||||
; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
|
||||
; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v[2:3], v[0:1], off glc
|
||||
; GFX9-NEXT: v_mov_b32_e32 v3, s1
|
||||
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v5
|
||||
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v4, vcc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
|
||||
; GFX9-NEXT: s_endpgm
|
||||
%id = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
@ -1009,7 +1101,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(i64 addrspa
|
||||
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 40, v3
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
|
||||
; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
|
||||
; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v[2:3], v[0:1], off glc
|
||||
; GFX9-NEXT: s_endpgm
|
||||
%id = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%gep.tid = getelementptr i64, i64 addrspace(1)* %ptr, i32 %id
|
||||
|
Loading…
Reference in New Issue
Block a user