1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-11-25 12:12:47 +01:00
llvm-mirror/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll
Austin Kerbow f238c8d80a [AMDGPU] Revise handling of preexisting waitcnt
Preexisting waitcnt may not update the scoreboard if the instruction
being examined needed to wait on fewer counters than what was encoded in
the old waitcnt instruction. Fixing this results in the elimination of
some redudnat waitcnt.

These changes also enable combining consecutive waitcnt into a single
S_WAITCNT or S_WAITCNT_VSCNT instruction.

Reviewed By: rampitec

Differential Revision: https://reviews.llvm.org/D100281
2021-05-05 17:21:33 -07:00

6357 lines
274 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX7 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
define amdgpu_kernel void @flat_workgroup_unordered_load(
; GFX7-LABEL: flat_workgroup_unordered_load:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dword v0, v[0:1]
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[2:3], v0
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_workgroup_unordered_load:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1]
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_workgroup_unordered_load:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: flat_load_dword v2, v[0:1]
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_unordered_load:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1]
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_unordered_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1]
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_unordered_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1]
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0
; GFX90A-TGSPLIT-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load atomic i32, i32* %in syncscope("workgroup") unordered, align 4
store i32 %val, i32* %out
ret void
}
define amdgpu_kernel void @flat_workgroup_monotonic_load(
; GFX7-LABEL: flat_workgroup_monotonic_load:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dword v0, v[0:1]
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[2:3], v0
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_workgroup_monotonic_load:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_workgroup_monotonic_load:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: flat_load_dword v2, v[0:1]
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_monotonic_load:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1]
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1]
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0
; GFX90A-TGSPLIT-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load atomic i32, i32* %in syncscope("workgroup") monotonic, align 4
store i32 %val, i32* %out
ret void
}
define amdgpu_kernel void @flat_workgroup_acquire_load(
; GFX7-LABEL: flat_workgroup_acquire_load:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dword v0, v[0:1]
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: flat_store_dword v[2:3], v0
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_workgroup_acquire_load:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_workgroup_acquire_load:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: flat_load_dword v2, v[0:1]
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_acquire_load:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1]
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1]
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0
; GFX90A-TGSPLIT-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load atomic i32, i32* %in syncscope("workgroup") acquire, align 4
store i32 %val, i32* %out
ret void
}
define amdgpu_kernel void @flat_workgroup_seq_cst_load(
; GFX7-LABEL: flat_workgroup_seq_cst_load:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: flat_load_dword v0, v[0:1]
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: flat_store_dword v[2:3], v0
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_workgroup_seq_cst_load:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_workgroup_seq_cst_load:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_load_dword v2, v[0:1]
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_load:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1]
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1]
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0
; GFX90A-TGSPLIT-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load atomic i32, i32* %in syncscope("workgroup") seq_cst, align 4
store i32 %val, i32* %out
ret void
}
define amdgpu_kernel void @flat_workgroup_unordered_store(
; GFX7-LABEL: flat_workgroup_unordered_store:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_workgroup_unordered_store:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_clause 0x1
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_workgroup_unordered_store:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_clause 0x1
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_unordered_store:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_unordered_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_unordered_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("workgroup") unordered, align 4
ret void
}
define amdgpu_kernel void @flat_workgroup_monotonic_store(
; GFX7-LABEL: flat_workgroup_monotonic_store:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_workgroup_monotonic_store:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_clause 0x1
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_workgroup_monotonic_store:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_clause 0x1
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_monotonic_store:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("workgroup") monotonic, align 4
ret void
}
define amdgpu_kernel void @flat_workgroup_release_store(
; GFX7-LABEL: flat_workgroup_release_store:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_workgroup_release_store:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_clause 0x1
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_workgroup_release_store:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_clause 0x1
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_release_store:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("workgroup") release, align 4
ret void
}
define amdgpu_kernel void @flat_workgroup_seq_cst_store(
; GFX7-LABEL: flat_workgroup_seq_cst_store:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_workgroup_seq_cst_store:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_clause 0x1
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_workgroup_seq_cst_store:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_clause 0x1
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_store:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("workgroup") seq_cst, align 4
ret void
}
define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw(
; GFX7-LABEL: flat_workgroup_monotonic_atomicrmw:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: flat_atomic_swap v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_workgroup_monotonic_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_clause 0x1
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_workgroup_monotonic_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_clause 0x1
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_monotonic_atomicrmw:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup") monotonic
ret void
}
define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw(
; GFX7-LABEL: flat_workgroup_acquire_atomicrmw:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: flat_atomic_swap v[0:1], v2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_workgroup_acquire_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_clause 0x1
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_workgroup_acquire_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_clause 0x1
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_acquire_atomicrmw:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup") acquire
ret void
}
define amdgpu_kernel void @flat_workgroup_release_atomicrmw(
; GFX7-LABEL: flat_workgroup_release_atomicrmw:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: flat_atomic_swap v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_workgroup_release_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_clause 0x1
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_workgroup_release_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_clause 0x1
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_release_atomicrmw:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup") release
ret void
}
define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw(
; GFX7-LABEL: flat_workgroup_acq_rel_atomicrmw:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: flat_atomic_swap v[0:1], v2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_workgroup_acq_rel_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_clause 0x1
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_workgroup_acq_rel_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_clause 0x1
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_acq_rel_atomicrmw:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup") acq_rel
ret void
}
define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw(
; GFX7-LABEL: flat_workgroup_seq_cst_atomicrmw:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: flat_atomic_swap v[0:1], v2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_workgroup_seq_cst_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_clause 0x1
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_workgroup_seq_cst_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_clause 0x1
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_atomicrmw:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup") seq_cst
ret void
}
define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw(
; GFX7-LABEL: flat_workgroup_acquire_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_workgroup_acquire_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_clause 0x1
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_workgroup_acquire_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_clause 0x1
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_acquire_ret_atomicrmw:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup") acquire
store i32 %val, i32* %out, align 4
ret void
}
define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw(
; GFX7-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_clause 0x1
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_clause 0x1
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup") acq_rel
store i32 %val, i32* %out, align 4
ret void
}
define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw(
; GFX7-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_clause 0x1
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_clause 0x1
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup") seq_cst
store i32 %val, i32* %out, align 4
ret void
}
define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg(
; GFX7-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s0, 16
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_clause 0x1
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16
; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_clause 0x1
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_add_u32 s0, s0, 16
; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16
; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
%val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") monotonic monotonic
ret void
}
define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg(
; GFX7-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s0, 16
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_clause 0x1
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16
; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_clause 0x1
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_add_u32 s0, s0, 16
; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16
; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
%val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") acquire monotonic
ret void
}
define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg(
; GFX7-LABEL: flat_workgroup_release_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s0, 16
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_workgroup_release_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_clause 0x1
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16
; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_workgroup_release_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_clause 0x1
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_add_u32 s0, s0, 16
; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_release_monotonic_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16
; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
%val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") release monotonic
ret void
}
define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg(
; GFX7-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s0, 16
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_clause 0x1
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16
; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_clause 0x1
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_add_u32 s0, s0, 16
; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16
; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
%val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel monotonic
ret void
}
define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg(
; GFX7-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s0, 16
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_clause 0x1
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16
; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_clause 0x1
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_add_u32 s0, s0, 16
; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16
; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
%val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst monotonic
ret void
}
define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg(
; GFX7-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s0, 16
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_clause 0x1
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16
; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_clause 0x1
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_add_u32 s0, s0, 16
; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16
; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
%val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") acquire acquire
ret void
}
define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg(
; GFX7-LABEL: flat_workgroup_release_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s0, 16
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_workgroup_release_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_clause 0x1
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16
; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_workgroup_release_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_clause 0x1
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_add_u32 s0, s0, 16
; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_release_acquire_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16
; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
%val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") release acquire
ret void
}
define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg(
; GFX7-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s0, 16
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_clause 0x1
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16
; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_clause 0x1
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_add_u32 s0, s0, 16
; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16
; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
%val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel acquire
ret void
}
define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg(
; GFX7-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s0, 16
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_clause 0x1
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16
; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_clause 0x1
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_add_u32 s0, s0, 16
; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16
; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
%val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst acquire
ret void
}
define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg(
; GFX7-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s0, 16
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_clause 0x1
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16
; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_clause 0x1
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_add_u32 s0, s0, 16
; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16
; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
%val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst seq_cst
ret void
}
define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s4, s0, 16
; GFX7-NEXT: s_addc_u32 s5, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_clause 0x1
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16
; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_clause 0x1
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_add_u32 s4, s0, 16
; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16
; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
%val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") acquire monotonic
%val0 = extractvalue { i32, i1 } %val, 0
store i32 %val0, i32* %out, align 4
ret void
}
define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s4, s0, 16
; GFX7-NEXT: s_addc_u32 s5, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_clause 0x1
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16
; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_clause 0x1
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_add_u32 s4, s0, 16
; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16
; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
%val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel monotonic
%val0 = extractvalue { i32, i1 } %val, 0
store i32 %val0, i32* %out, align 4
ret void
}
define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s4, s0, 16
; GFX7-NEXT: s_addc_u32 s5, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_clause 0x1
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16
; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_clause 0x1
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_add_u32 s4, s0, 16
; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16
; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
%val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst monotonic
%val0 = extractvalue { i32, i1 } %val, 0
store i32 %val0, i32* %out, align 4
ret void
}
define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s4, s0, 16
; GFX7-NEXT: s_addc_u32 s5, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_clause 0x1
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16
; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_clause 0x1
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_add_u32 s4, s0, 16
; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16
; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
%val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") acquire acquire
%val0 = extractvalue { i32, i1 } %val, 0
store i32 %val0, i32* %out, align 4
ret void
}
define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s4, s0, 16
; GFX7-NEXT: s_addc_u32 s5, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_clause 0x1
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16
; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_clause 0x1
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_add_u32 s4, s0, 16
; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16
; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
%val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") release acquire
%val0 = extractvalue { i32, i1 } %val, 0
store i32 %val0, i32* %out, align 4
ret void
}
define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s4, s0, 16
; GFX7-NEXT: s_addc_u32 s5, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_clause 0x1
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16
; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_clause 0x1
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_add_u32 s4, s0, 16
; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16
; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
%val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel acquire
%val0 = extractvalue { i32, i1 } %val, 0
store i32 %val0, i32* %out, align 4
ret void
}
define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s4, s0, 16
; GFX7-NEXT: s_addc_u32 s5, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_clause 0x1
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16
; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_clause 0x1
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_add_u32 s4, s0, 16
; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16
; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
%val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst acquire
%val0 = extractvalue { i32, i1 } %val, 0
store i32 %val0, i32* %out, align 4
ret void
}
define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s4, s0, 16
; GFX7-NEXT: s_addc_u32 s5, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_clause 0x1
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16
; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_clause 0x1
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_add_u32 s4, s0, 16
; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16
; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
%val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst seq_cst
%val0 = extractvalue { i32, i1 } %val, 0
store i32 %val0, i32* %out, align 4
ret void
}
define amdgpu_kernel void @flat_workgroup_one_as_unordered_load(
; GFX7-LABEL: flat_workgroup_one_as_unordered_load:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dword v0, v[0:1]
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[2:3], v0
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_unordered_load:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1]
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_workgroup_one_as_unordered_load:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: flat_load_dword v2, v[0:1]
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_unordered_load:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1]
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_unordered_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1]
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_unordered_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1]
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0
; GFX90A-TGSPLIT-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load atomic i32, i32* %in syncscope("workgroup-one-as") unordered, align 4
store i32 %val, i32* %out
ret void
}
define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load(
; GFX7-LABEL: flat_workgroup_one_as_monotonic_load:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dword v0, v[0:1]
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[2:3], v0
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_load:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_load:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: flat_load_dword v2, v[0:1]
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_monotonic_load:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1]
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1]
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0
; GFX90A-TGSPLIT-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load atomic i32, i32* %in syncscope("workgroup-one-as") monotonic, align 4
store i32 %val, i32* %out
ret void
}
define amdgpu_kernel void @flat_workgroup_one_as_acquire_load(
; GFX7-LABEL: flat_workgroup_one_as_acquire_load:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dword v0, v[0:1]
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[2:3], v0
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_load:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_load:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: flat_load_dword v2, v[0:1]
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acquire_load:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1]
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1]
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0
; GFX90A-TGSPLIT-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load atomic i32, i32* %in syncscope("workgroup-one-as") acquire, align 4
store i32 %val, i32* %out
ret void
}
define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load(
; GFX7-LABEL: flat_workgroup_one_as_seq_cst_load:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dword v0, v[0:1]
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[2:3], v0
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_load:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_load:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: flat_load_dword v2, v[0:1]
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_load:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1]
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1]
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0
; GFX90A-TGSPLIT-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load atomic i32, i32* %in syncscope("workgroup-one-as") seq_cst, align 4
store i32 %val, i32* %out
ret void
}
define amdgpu_kernel void @flat_workgroup_one_as_unordered_store(
; GFX7-LABEL: flat_workgroup_one_as_unordered_store:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_unordered_store:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_clause 0x1
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_workgroup_one_as_unordered_store:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_clause 0x1
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_unordered_store:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_unordered_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_unordered_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("workgroup-one-as") unordered, align 4
ret void
}
define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store(
; GFX7-LABEL: flat_workgroup_one_as_monotonic_store:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_store:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_clause 0x1
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_store:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_clause 0x1
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_monotonic_store:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("workgroup-one-as") monotonic, align 4
ret void
}
define amdgpu_kernel void @flat_workgroup_one_as_release_store(
; GFX7-LABEL: flat_workgroup_one_as_release_store:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_release_store:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_clause 0x1
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_workgroup_one_as_release_store:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_clause 0x1
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_release_store:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("workgroup-one-as") release, align 4
ret void
}
define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store(
; GFX7-LABEL: flat_workgroup_one_as_seq_cst_store:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_store:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_clause 0x1
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_store:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_clause 0x1
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_store:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("workgroup-one-as") seq_cst, align 4
ret void
}
define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw(
; GFX7-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: flat_atomic_swap v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_clause 0x1
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_clause 0x1
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup-one-as") monotonic
ret void
}
define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw(
; GFX7-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: flat_atomic_swap v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_clause 0x1
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_clause 0x1
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup-one-as") acquire
ret void
}
define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw(
; GFX7-LABEL: flat_workgroup_one_as_release_atomicrmw:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: flat_atomic_swap v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_release_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_clause 0x1
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_workgroup_one_as_release_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_clause 0x1
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_release_atomicrmw:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup-one-as") release
ret void
}
define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw(
; GFX7-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: flat_atomic_swap v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_clause 0x1
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_clause 0x1
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup-one-as") acq_rel
ret void
}
define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw(
; GFX7-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: flat_atomic_swap v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_clause 0x1
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_clause 0x1
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup-one-as") seq_cst
ret void
}
define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw(
; GFX7-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_clause 0x1
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_clause 0x1
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup-one-as") acquire
store i32 %val, i32* %out, align 4
ret void
}
define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw(
; GFX7-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_clause 0x1
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_clause 0x1
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup-one-as") acq_rel
store i32 %val, i32* %out, align 4
ret void
}
define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw(
; GFX7-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_clause 0x1
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_clause 0x1
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup-one-as") seq_cst
store i32 %val, i32* %out, align 4
ret void
}
define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s0, 16
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_clause 0x1
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16
; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_clause 0x1
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_add_u32 s0, s0, 16
; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16
; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
%val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") monotonic monotonic
ret void
}
define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s0, 16
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_clause 0x1
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16
; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_clause 0x1
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_add_u32 s0, s0, 16
; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16
; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
%val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire monotonic
ret void
}
define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s0, 16
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_clause 0x1
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16
; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_clause 0x1
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_add_u32 s0, s0, 16
; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16
; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
%val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release monotonic
ret void
}
define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s0, 16
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_clause 0x1
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16
; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_clause 0x1
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_add_u32 s0, s0, 16
; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16
; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
%val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel monotonic
ret void
}
define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s0, 16
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_clause 0x1
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16
; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_clause 0x1
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_add_u32 s0, s0, 16
; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16
; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
%val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst monotonic
ret void
}
define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s0, 16
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_clause 0x1
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16
; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_clause 0x1
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_add_u32 s0, s0, 16
; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16
; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
%val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire acquire
ret void
}
define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s0, 16
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_clause 0x1
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16
; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_clause 0x1
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_add_u32 s0, s0, 16
; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16
; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
%val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release acquire
ret void
}
define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s0, 16
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_clause 0x1
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16
; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_clause 0x1
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_add_u32 s0, s0, 16
; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16
; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
%val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel acquire
ret void
}
define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s0, 16
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_clause 0x1
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16
; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_clause 0x1
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_add_u32 s0, s0, 16
; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16
; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
%val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst acquire
ret void
}
define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s0, 16
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_clause 0x1
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16
; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_clause 0x1
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_add_u32 s0, s0, 16
; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16
; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
%val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst seq_cst
ret void
}
define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s4, s0, 16
; GFX7-NEXT: s_addc_u32 s5, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_clause 0x1
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16
; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_clause 0x1
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_add_u32 s4, s0, 16
; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16
; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
%val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire monotonic
%val0 = extractvalue { i32, i1 } %val, 0
store i32 %val0, i32* %out, align 4
ret void
}
define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s4, s0, 16
; GFX7-NEXT: s_addc_u32 s5, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_clause 0x1
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16
; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_clause 0x1
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_add_u32 s4, s0, 16
; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16
; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
%val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel monotonic
%val0 = extractvalue { i32, i1 } %val, 0
store i32 %val0, i32* %out, align 4
ret void
}
define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s4, s0, 16
; GFX7-NEXT: s_addc_u32 s5, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_clause 0x1
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16
; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_clause 0x1
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_add_u32 s4, s0, 16
; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16
; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
%val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst monotonic
%val0 = extractvalue { i32, i1 } %val, 0
store i32 %val0, i32* %out, align 4
ret void
}
define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s4, s0, 16
; GFX7-NEXT: s_addc_u32 s5, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_clause 0x1
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16
; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_clause 0x1
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_add_u32 s4, s0, 16
; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16
; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
%val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire acquire
%val0 = extractvalue { i32, i1 } %val, 0
store i32 %val0, i32* %out, align 4
ret void
}
define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s4, s0, 16
; GFX7-NEXT: s_addc_u32 s5, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_clause 0x1
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16
; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_clause 0x1
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_add_u32 s4, s0, 16
; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16
; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
%val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release acquire
%val0 = extractvalue { i32, i1 } %val, 0
store i32 %val0, i32* %out, align 4
ret void
}
define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s4, s0, 16
; GFX7-NEXT: s_addc_u32 s5, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_clause 0x1
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16
; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_clause 0x1
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_add_u32 s4, s0, 16
; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16
; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
%val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel acquire
%val0 = extractvalue { i32, i1 } %val, 0
store i32 %val0, i32* %out, align 4
ret void
}
define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s4, s0, 16
; GFX7-NEXT: s_addc_u32 s5, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_clause 0x1
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16
; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_clause 0x1
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_add_u32 s4, s0, 16
; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16
; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
%val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst acquire
%val0 = extractvalue { i32, i1 } %val, 0
store i32 %val0, i32* %out, align 4
ret void
}
define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s4, s0, 16
; GFX7-NEXT: s_addc_u32 s5, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_clause 0x1
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16
; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_clause 0x1
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_add_u32 s4, s0, 16
; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16
; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
%val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst seq_cst
%val0 = extractvalue { i32, i1 } %val, 0
store i32 %val0, i32* %out, align 4
ret void
}