mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-11-26 12:43:36 +01:00
[AMDGPU] Always expand system scope fp atomics on gfx90a
FP atomics in system scope (including the "one-as" system scope) cannot be used on gfx90a and must always be expanded into a CAS (compare-and-swap) loop.
Differential Revision: https://reviews.llvm.org/D98085
This commit is contained in:
parent
d0b7eeb58b
commit
2ed90deb94
@ -11949,9 +11949,15 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
|
||||
.getValueAsString() != "true")
|
||||
return AtomicExpansionKind::CmpXChg;
|
||||
|
||||
if (Subtarget->hasGFX90AInsts())
|
||||
if (Subtarget->hasGFX90AInsts()) {
|
||||
auto SSID = RMW->getSyncScopeID();
|
||||
if (SSID == SyncScope::System ||
|
||||
SSID == RMW->getContext().getOrInsertSyncScopeID("one-as"))
|
||||
return AtomicExpansionKind::CmpXChg;
|
||||
|
||||
return (Ty->isFloatTy() && AS == AMDGPUAS::FLAT_ADDRESS) ?
|
||||
AtomicExpansionKind::CmpXChg : AtomicExpansionKind::None;
|
||||
}
|
||||
|
||||
if (!Subtarget->hasGFX90AInsts() && AS != AMDGPUAS::GLOBAL_ADDRESS)
|
||||
return AtomicExpansionKind::CmpXChg;
|
||||
|
@ -415,32 +415,12 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(double addrspace(1)*
|
||||
; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat:
|
||||
; GFX90A: ; %bb.0: ; %main_body
|
||||
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v1, 0x40100000
|
||||
; GFX90A-NEXT: buffer_wbl2
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: buffer_invl2
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: buffer_wbinvl1_vol
|
||||
; GFX90A-NEXT: s_endpgm
|
||||
main_body:
|
||||
%ret = atomicrmw fadd double addrspace(1)* %ptr, double 4.0 seq_cst
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(double addrspace(1)* %ptr) #0 {
|
||||
; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_flush:
|
||||
; GFX90A: ; %bb.0: ; %main_body
|
||||
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX90A-NEXT: s_mov_b64 s[2:3], 0
|
||||
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
|
||||
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
|
||||
; GFX90A-NEXT: BB25_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: BB24_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
|
||||
; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0
|
||||
@ -455,7 +435,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(double addrspa
|
||||
; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
||||
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
||||
; GFX90A-NEXT: s_cbranch_execnz BB25_1
|
||||
; GFX90A-NEXT: s_cbranch_execnz BB24_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX90A-NEXT: s_endpgm
|
||||
main_body:
|
||||
@ -463,6 +443,87 @@ main_body:
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(double addrspace(1)* %ptr) #1 {
|
||||
; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_agent:
|
||||
; GFX90A: ; %bb.0: ; %main_body
|
||||
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v1, 0x40100000
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: buffer_wbinvl1_vol
|
||||
; GFX90A-NEXT: s_endpgm
|
||||
main_body:
|
||||
%ret = atomicrmw fadd double addrspace(1)* %ptr, double 4.0 syncscope("agent") seq_cst
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(double addrspace(1)* %ptr) #1 {
|
||||
; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_system:
|
||||
; GFX90A: ; %bb.0: ; %main_body
|
||||
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX90A-NEXT: s_mov_b64 s[2:3], 0
|
||||
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
|
||||
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
|
||||
; GFX90A-NEXT: BB26_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
|
||||
; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0
|
||||
; GFX90A-NEXT: buffer_wbl2
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc scc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: buffer_invl2
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: buffer_wbinvl1_vol
|
||||
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
||||
; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
||||
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
||||
; GFX90A-NEXT: s_cbranch_execnz BB26_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX90A-NEXT: s_endpgm
|
||||
main_body:
|
||||
%ret = atomicrmw fadd double addrspace(1)* %ptr, double 4.0 syncscope("one-as") seq_cst
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(double addrspace(1)* %ptr) #0 {
|
||||
; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_flush:
|
||||
; GFX90A: ; %bb.0: ; %main_body
|
||||
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX90A-NEXT: s_mov_b64 s[2:3], 0
|
||||
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
|
||||
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
|
||||
; GFX90A-NEXT: BB27_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
|
||||
; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0
|
||||
; GFX90A-NEXT: buffer_wbl2
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc scc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: buffer_invl2
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: buffer_wbinvl1_vol
|
||||
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
||||
; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
||||
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
||||
; GFX90A-NEXT: s_cbranch_execnz BB27_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX90A-NEXT: s_endpgm
|
||||
main_body:
|
||||
%ret = atomicrmw fadd double addrspace(1)* %ptr, double 4.0 syncscope("agent") seq_cst
|
||||
ret void
|
||||
}
|
||||
|
||||
define double @global_atomic_fadd_f64_rtn(double addrspace(1)* %ptr, double %data) {
|
||||
; GFX90A-LABEL: global_atomic_fadd_f64_rtn:
|
||||
; GFX90A: ; %bb.0: ; %main_body
|
||||
@ -479,21 +540,82 @@ define double @global_atomic_fadd_f64_rtn_pat(double addrspace(1)* %ptr, double
|
||||
; GFX90A-LABEL: global_atomic_fadd_f64_rtn_pat:
|
||||
; GFX90A: ; %bb.0: ; %main_body
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v3, 0x40100000
|
||||
; GFX90A-NEXT: global_load_dwordx2 v[2:3], v[0:1], off
|
||||
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GFX90A-NEXT: BB29_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
|
||||
; GFX90A-NEXT: v_add_f64 v[2:3], v[4:5], 4.0
|
||||
; GFX90A-NEXT: buffer_wbl2
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off glc scc
|
||||
; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc scc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: buffer_invl2
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: buffer_wbinvl1_vol
|
||||
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
|
||||
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: s_cbranch_execnz BB29_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v1, v3
|
||||
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
||||
main_body:
|
||||
%ret = atomicrmw fadd double addrspace(1)* %ptr, double 4.0 seq_cst
|
||||
ret double %ret
|
||||
}
|
||||
|
||||
define double @global_atomic_fadd_f64_rtn_pat_agent(double addrspace(1)* %ptr, double %data) #1 {
|
||||
; GFX90A-LABEL: global_atomic_fadd_f64_rtn_pat_agent:
|
||||
; GFX90A: ; %bb.0: ; %main_body
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v3, 0x40100000
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off glc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: buffer_wbinvl1_vol
|
||||
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
||||
main_body:
|
||||
%ret = atomicrmw fadd double addrspace(1)* %ptr, double 4.0 syncscope("agent") seq_cst
|
||||
ret double %ret
|
||||
}
|
||||
|
||||
define double @global_atomic_fadd_f64_rtn_pat_system(double addrspace(1)* %ptr, double %data) #1 {
|
||||
; GFX90A-LABEL: global_atomic_fadd_f64_rtn_pat_system:
|
||||
; GFX90A: ; %bb.0: ; %main_body
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: global_load_dwordx2 v[2:3], v[0:1], off
|
||||
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GFX90A-NEXT: BB31_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
|
||||
; GFX90A-NEXT: v_add_f64 v[2:3], v[4:5], 4.0
|
||||
; GFX90A-NEXT: buffer_wbl2
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc scc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: buffer_invl2
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: buffer_wbinvl1_vol
|
||||
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
|
||||
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: s_cbranch_execnz BB31_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v1, v3
|
||||
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
||||
main_body:
|
||||
%ret = atomicrmw fadd double addrspace(1)* %ptr, double 4.0 syncscope("one-as") seq_cst
|
||||
ret double %ret
|
||||
}
|
||||
|
||||
define double @global_atomic_fmax_f64_rtn(double addrspace(1)* %ptr, double %data) {
|
||||
; GFX90A-LABEL: global_atomic_fmax_f64_rtn:
|
||||
; GFX90A: ; %bb.0: ; %main_body
|
||||
@ -522,42 +644,165 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(double* %ptr) #1 {
|
||||
; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat:
|
||||
; GFX90A: ; %bb.0: ; %main_body
|
||||
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v3, 0x40100000
|
||||
; GFX90A-NEXT: s_mov_b64 s[2:3], 0
|
||||
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
||||
; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
|
||||
; GFX90A-NEXT: BB34_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0
|
||||
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[0:1], s[0:1] op_sel:[0,1]
|
||||
; GFX90A-NEXT: buffer_wbl2
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] scc
|
||||
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc scc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: buffer_invl2
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: buffer_wbinvl1_vol
|
||||
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
||||
; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
||||
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
||||
; GFX90A-NEXT: s_cbranch_execnz BB34_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX90A-NEXT: s_endpgm
|
||||
main_body:
|
||||
%ret = atomicrmw fadd double* %ptr, double 4.0 seq_cst
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent(double* %ptr) #1 {
|
||||
; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat_agent:
|
||||
; GFX90A: ; %bb.0: ; %main_body
|
||||
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v3, 0x40100000
|
||||
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[2:3]
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: buffer_wbinvl1_vol
|
||||
; GFX90A-NEXT: s_endpgm
|
||||
main_body:
|
||||
%ret = atomicrmw fadd double* %ptr, double 4.0 syncscope("agent") seq_cst
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_system(double* %ptr) #1 {
|
||||
; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat_system:
|
||||
; GFX90A: ; %bb.0: ; %main_body
|
||||
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX90A-NEXT: s_mov_b64 s[2:3], 0
|
||||
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
||||
; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
|
||||
; GFX90A-NEXT: BB36_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0
|
||||
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[0:1], s[0:1] op_sel:[0,1]
|
||||
; GFX90A-NEXT: buffer_wbl2
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc scc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: buffer_invl2
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: buffer_wbinvl1_vol
|
||||
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
||||
; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
||||
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
||||
; GFX90A-NEXT: s_cbranch_execnz BB36_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX90A-NEXT: s_endpgm
|
||||
main_body:
|
||||
%ret = atomicrmw fadd double* %ptr, double 4.0 syncscope("one-as") seq_cst
|
||||
ret void
|
||||
}
|
||||
|
||||
define double @flat_atomic_fadd_f64_rtn_pat(double* %ptr) #1 {
|
||||
; GFX90A-LABEL: flat_atomic_fadd_f64_rtn_pat:
|
||||
; GFX90A: ; %bb.0: ; %main_body
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v3, 0x40100000
|
||||
; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
|
||||
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GFX90A-NEXT: BB37_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
|
||||
; GFX90A-NEXT: v_add_f64 v[2:3], v[4:5], 4.0
|
||||
; GFX90A-NEXT: buffer_wbl2
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] glc scc
|
||||
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc scc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: buffer_invl2
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: buffer_wbinvl1_vol
|
||||
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
|
||||
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: s_cbranch_execnz BB37_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v1, v3
|
||||
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
||||
main_body:
|
||||
%ret = atomicrmw fadd double* %ptr, double 4.0 seq_cst
|
||||
ret double %ret
|
||||
}
|
||||
|
||||
define double @flat_atomic_fadd_f64_rtn_pat_agent(double* %ptr) #1 {
|
||||
; GFX90A-LABEL: flat_atomic_fadd_f64_rtn_pat_agent:
|
||||
; GFX90A: ; %bb.0: ; %main_body
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v3, 0x40100000
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] glc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: buffer_wbinvl1_vol
|
||||
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
||||
main_body:
|
||||
%ret = atomicrmw fadd double* %ptr, double 4.0 syncscope("agent") seq_cst
|
||||
ret double %ret
|
||||
}
|
||||
|
||||
define double @flat_atomic_fadd_f64_rtn_pat_system(double* %ptr) #1 {
|
||||
; GFX90A-LABEL: flat_atomic_fadd_f64_rtn_pat_system:
|
||||
; GFX90A: ; %bb.0: ; %main_body
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
|
||||
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GFX90A-NEXT: BB39_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
|
||||
; GFX90A-NEXT: v_add_f64 v[2:3], v[4:5], 4.0
|
||||
; GFX90A-NEXT: buffer_wbl2
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc scc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: buffer_invl2
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: buffer_wbinvl1_vol
|
||||
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
|
||||
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: s_cbranch_execnz BB39_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v1, v3
|
||||
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
||||
main_body:
|
||||
%ret = atomicrmw fadd double* %ptr, double 4.0 syncscope("one-as") seq_cst
|
||||
ret double %ret
|
||||
}
|
||||
|
||||
define amdgpu_kernel void @flat_atomic_fadd_f64_noret(double* %ptr, double %data) {
|
||||
; GFX90A-LABEL: flat_atomic_fadd_f64_noret:
|
||||
; GFX90A: ; %bb.0: ; %main_body
|
||||
@ -696,7 +941,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush(double addrspac
|
||||
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX90A-NEXT: ds_read_b64 v[0:1], v0
|
||||
; GFX90A-NEXT: BB41_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: BB49_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NEXT: v_add_f64 v[2:3], v[0:1], 4.0
|
||||
@ -708,7 +953,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush(double addrspac
|
||||
; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
||||
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[0,1]
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
||||
; GFX90A-NEXT: s_cbranch_execnz BB41_1
|
||||
; GFX90A-NEXT: s_cbranch_execnz BB49_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX90A-NEXT: s_endpgm
|
||||
main_body:
|
||||
|
@ -59,15 +59,29 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32(float addrspace(1)* %ptr)
|
||||
; GFX90A-LABEL: global_atomic_fadd_ret_f32:
|
||||
; GFX90A: ; %bb.0:
|
||||
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v1, 4.0
|
||||
; GFX90A-NEXT: s_mov_b64 s[2:3], 0
|
||||
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x0
|
||||
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX90A-NEXT: BB0_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX90A-NEXT: v_add_f32_e32 v0, 4.0, v1
|
||||
; GFX90A-NEXT: buffer_wbl2
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: global_atomic_add_f32 v0, v0, v1, s[0:1] glc scc
|
||||
; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc scc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: buffer_invl2
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: buffer_wbinvl1_vol
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
||||
; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
||||
; GFX90A-NEXT: s_cbranch_execnz BB0_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX90A-NEXT: s_or_b64 exec, exec, s[2:3]
|
||||
; GFX90A-NEXT: global_store_dword v[0:1], v0, off
|
||||
; GFX90A-NEXT: s_endpgm
|
||||
%result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 seq_cst
|
||||
@ -156,7 +170,7 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_ieee(float addrspace(1)* %
|
||||
; GFX90A-NEXT: s_or_b64 exec, exec, s[2:3]
|
||||
; GFX90A-NEXT: global_store_dword v[0:1], v0, off
|
||||
; GFX90A-NEXT: s_endpgm
|
||||
%result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 seq_cst
|
||||
%result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 syncscope("agent") seq_cst
|
||||
store float %result, float addrspace(1)* undef
|
||||
ret void
|
||||
}
|
||||
@ -202,15 +216,12 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32(float addrspace(1)* %ptr
|
||||
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v1, 4.0
|
||||
; GFX90A-NEXT: buffer_wbl2
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: global_atomic_add_f32 v0, v1, s[0:1] scc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: buffer_invl2
|
||||
; GFX90A-NEXT: global_atomic_add_f32 v0, v1, s[0:1]
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: buffer_wbinvl1_vol
|
||||
; GFX90A-NEXT: s_endpgm
|
||||
%result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 seq_cst
|
||||
%result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 syncscope("agent") seq_cst
|
||||
ret void
|
||||
}
|
||||
|
||||
@ -289,7 +300,162 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32_ieee(float addrspace(1)*
|
||||
; GFX90A-NEXT: s_cbranch_execnz BB3_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX90A-NEXT: s_endpgm
|
||||
%result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 seq_cst
|
||||
%result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 syncscope("agent") seq_cst
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_kernel void @global_atomic_fadd_ret_f32_agent(float addrspace(1)* %ptr) #0 {
|
||||
; GFX900-LABEL: global_atomic_fadd_ret_f32_agent:
|
||||
; GFX900: ; %bb.0:
|
||||
; GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX900-NEXT: s_mov_b64 s[2:3], 0
|
||||
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX900-NEXT: s_load_dword s4, s[0:1], 0x0
|
||||
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX900-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX900-NEXT: BB4_1: ; %atomicrmw.start
|
||||
; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX900-NEXT: v_mov_b32_e32 v1, v0
|
||||
; GFX900-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX900-NEXT: v_add_f32_e32 v0, 4.0, v1
|
||||
; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX900-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
|
||||
; GFX900-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-NEXT: buffer_wbinvl1_vol
|
||||
; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
||||
; GFX900-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
||||
; GFX900-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
||||
; GFX900-NEXT: s_cbranch_execnz BB4_1
|
||||
; GFX900-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX900-NEXT: s_or_b64 exec, exec, s[2:3]
|
||||
; GFX900-NEXT: global_store_dword v[0:1], v0, off
|
||||
; GFX900-NEXT: s_endpgm
|
||||
;
|
||||
; GFX908-LABEL: global_atomic_fadd_ret_f32_agent:
|
||||
; GFX908: ; %bb.0:
|
||||
; GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX908-NEXT: s_mov_b64 s[2:3], 0
|
||||
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX908-NEXT: s_load_dword s4, s[0:1], 0x0
|
||||
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX908-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX908-NEXT: BB4_1: ; %atomicrmw.start
|
||||
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX908-NEXT: v_mov_b32_e32 v1, v0
|
||||
; GFX908-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX908-NEXT: v_add_f32_e32 v0, 4.0, v1
|
||||
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX908-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
|
||||
; GFX908-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX908-NEXT: buffer_wbinvl1_vol
|
||||
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
||||
; GFX908-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
||||
; GFX908-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
||||
; GFX908-NEXT: s_cbranch_execnz BB4_1
|
||||
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX908-NEXT: s_or_b64 exec, exec, s[2:3]
|
||||
; GFX908-NEXT: global_store_dword v[0:1], v0, off
|
||||
; GFX908-NEXT: s_endpgm
|
||||
;
|
||||
; GFX90A-LABEL: global_atomic_fadd_ret_f32_agent:
|
||||
; GFX90A: ; %bb.0:
|
||||
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v1, 4.0
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: global_atomic_add_f32 v0, v0, v1, s[0:1] glc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: buffer_wbinvl1_vol
|
||||
; GFX90A-NEXT: global_store_dword v[0:1], v0, off
|
||||
; GFX90A-NEXT: s_endpgm
|
||||
%result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 syncscope("agent") seq_cst
|
||||
store float %result, float addrspace(1)* undef
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_kernel void @global_atomic_fadd_ret_f32_system(float addrspace(1)* %ptr) #0 {
|
||||
; GFX900-LABEL: global_atomic_fadd_ret_f32_system:
|
||||
; GFX900: ; %bb.0:
|
||||
; GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX900-NEXT: s_mov_b64 s[2:3], 0
|
||||
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX900-NEXT: s_load_dword s4, s[0:1], 0x0
|
||||
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX900-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX900-NEXT: BB5_1: ; %atomicrmw.start
|
||||
; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX900-NEXT: v_mov_b32_e32 v1, v0
|
||||
; GFX900-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX900-NEXT: v_add_f32_e32 v0, 4.0, v1
|
||||
; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX900-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
|
||||
; GFX900-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-NEXT: buffer_wbinvl1_vol
|
||||
; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
||||
; GFX900-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
||||
; GFX900-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
||||
; GFX900-NEXT: s_cbranch_execnz BB5_1
|
||||
; GFX900-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX900-NEXT: s_or_b64 exec, exec, s[2:3]
|
||||
; GFX900-NEXT: global_store_dword v[0:1], v0, off
|
||||
; GFX900-NEXT: s_endpgm
|
||||
;
|
||||
; GFX908-LABEL: global_atomic_fadd_ret_f32_system:
|
||||
; GFX908: ; %bb.0:
|
||||
; GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX908-NEXT: s_mov_b64 s[2:3], 0
|
||||
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX908-NEXT: s_load_dword s4, s[0:1], 0x0
|
||||
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX908-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX908-NEXT: BB5_1: ; %atomicrmw.start
|
||||
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX908-NEXT: v_mov_b32_e32 v1, v0
|
||||
; GFX908-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX908-NEXT: v_add_f32_e32 v0, 4.0, v1
|
||||
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX908-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
|
||||
; GFX908-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX908-NEXT: buffer_wbinvl1_vol
|
||||
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
||||
; GFX908-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
||||
; GFX908-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
||||
; GFX908-NEXT: s_cbranch_execnz BB5_1
|
||||
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX908-NEXT: s_or_b64 exec, exec, s[2:3]
|
||||
; GFX908-NEXT: global_store_dword v[0:1], v0, off
|
||||
; GFX908-NEXT: s_endpgm
|
||||
;
|
||||
; GFX90A-LABEL: global_atomic_fadd_ret_f32_system:
|
||||
; GFX90A: ; %bb.0:
|
||||
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX90A-NEXT: s_mov_b64 s[2:3], 0
|
||||
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x0
|
||||
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX90A-NEXT: BB5_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX90A-NEXT: v_add_f32_e32 v0, 4.0, v1
|
||||
; GFX90A-NEXT: buffer_wbl2
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc scc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: buffer_invl2
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: buffer_wbinvl1_vol
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
||||
; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
||||
; GFX90A-NEXT: s_cbranch_execnz BB5_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX90A-NEXT: s_or_b64 exec, exec, s[2:3]
|
||||
; GFX90A-NEXT: global_store_dword v[0:1], v0, off
|
||||
; GFX90A-NEXT: s_endpgm
|
||||
%result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 syncscope("one-as") seq_cst
|
||||
store float %result, float addrspace(1)* undef
|
||||
ret void
|
||||
}
|
||||
|
||||
@ -302,7 +468,7 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_wrong_subtarget(float addr
|
||||
; GCN-NEXT: s_load_dword s4, s[0:1], 0x0
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GCN-NEXT: BB4_1: ; %atomicrmw.start
|
||||
; GCN-NEXT: BB6_1: ; %atomicrmw.start
|
||||
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, v0
|
||||
; GCN-NEXT: v_mov_b32_e32 v2, 0
|
||||
@ -314,12 +480,12 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_wrong_subtarget(float addr
|
||||
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
||||
; GCN-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
||||
; GCN-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
||||
; GCN-NEXT: s_cbranch_execnz BB4_1
|
||||
; GCN-NEXT: s_cbranch_execnz BB6_1
|
||||
; GCN-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GCN-NEXT: s_or_b64 exec, exec, s[2:3]
|
||||
; GCN-NEXT: global_store_dword v[0:1], v0, off
|
||||
; GCN-NEXT: s_endpgm
|
||||
%result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 seq_cst
|
||||
%result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 syncscope("agent") seq_cst
|
||||
store float %result, float addrspace(1)* undef
|
||||
ret void
|
||||
}
|
||||
@ -335,7 +501,7 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32_wrong_subtarget(float ad
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: buffer_wbinvl1_vol
|
||||
; GCN-NEXT: s_endpgm
|
||||
%result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 seq_cst
|
||||
%result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 syncscope("agent") seq_cst
|
||||
ret void
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user