1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2025-01-31 20:51:52 +01:00

[AMDGPU] Extend and reorganize memory legalizer tests

* Rename some tests to try to make a convention (where all components
  are optional) of:

    <addrspace>_<syncscope>_<memory-orders>_<operation>

* Split up at a level of granularity appropriate for the different RUN
  lines (i.e. split on addrspace so GFX6 can avoid FLAT) and that makes
  running a specific test reasonable in terms of wall time taken. This
  also means when run as part of the test suite the testing is not one
  serial bottleneck.

* Auto-generate check lines with `update_llc_test_checks.py` to make
  future maintenance more tractable.

Reviewed By: rampitec, t-tye

Differential Revision: https://reviews.llvm.org/D91545
This commit is contained in:
Scott Linder 2020-11-30 16:59:09 +00:00
parent 6d987ac858
commit d8132da4e7
27 changed files with 74831 additions and 8120 deletions

View File

@ -1,526 +0,0 @@
; Test configurations for the amdpal fence tests below.
; All runs share the FUNC and GCN prefixes. The gfx600, gfx700, gfx803 and
; gfx900 runs use GCN9 and the gfx1010 runs use GCN10. Runs without
; -amdgcn-skip-cache-invalidations add CACHE_INV (GCN9 runs) or CACHE_INV10
; (GCN10 run); runs that pass the flag add SKIP_CACHE_INV instead.
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN,GCN9,CACHE_INV %s
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN,GCN9,CACHE_INV %s
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN,GCN9,CACHE_INV %s
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN,GCN9,CACHE_INV %s
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN,GCN10,CACHE_INV10 %s
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN,GCN9,SKIP_CACHE_INV %s
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1010 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN,GCN10,SKIP_CACHE_INV %s
; Fences with no explicit syncscope (the system_* tests), one kernel per
; ordering. Every run expects `s_waitcnt vmcnt(0) lgkmcnt(0)`, and the GCN10
; runs also expect `s_waitcnt_vscnt null, 0x0`. The acquire, acq_rel and
; seq_cst tests additionally expect a cache invalidate (buffer_wbinvl1 under
; CACHE_INV, buffer_gl0_inv then buffer_gl1_inv under CACHE_INV10) and expect
; no invalidate under SKIP_CACHE_INV. The release test has no invalidate checks.
; FUNC-LABEL: {{^}}system_acquire:
; GCN: %bb.0
; GCN-NOT: ATOMIC_FENCE
; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GCN10: s_waitcnt_vscnt null, 0x0
; CACHE_INV: buffer_wbinvl1{{$}}
; CACHE_INV10: buffer_gl0_inv
; CACHE_INV10: buffer_gl1_inv
; SKIP_CACHE_INV-NOT: buffer_wbinvl1{{$}}
; SKIP_CACHE_INV-NOT: buffer_gl
; GCN: s_endpgm
define amdgpu_kernel void @system_acquire() {
entry:
fence acquire
ret void
}
; FUNC-LABEL: {{^}}system_release:
; GCN: %bb.0
; GCN-NOT: ATOMIC_FENCE
; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GCN10: s_waitcnt_vscnt null, 0x0
; GCN: s_endpgm
define amdgpu_kernel void @system_release() {
entry:
fence release
ret void
}
; FUNC-LABEL: {{^}}system_acq_rel:
; GCN: %bb.0
; GCN-NOT: ATOMIC_FENCE
; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GCN10: s_waitcnt_vscnt null, 0x0
; CACHE_INV: buffer_wbinvl1{{$}}
; CACHE_INV10: buffer_gl0_inv
; CACHE_INV10: buffer_gl1_inv
; SKIP_CACHE_INV-NOT: buffer_wbinvl1{{$}}
; SKIP_CACHE_INV-NOT: buffer_gl
; GCN: s_endpgm
define amdgpu_kernel void @system_acq_rel() {
entry:
fence acq_rel
ret void
}
; FUNC-LABEL: {{^}}system_seq_cst:
; GCN: %bb.0
; GCN-NOT: ATOMIC_FENCE
; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GCN10: s_waitcnt_vscnt null, 0x0
; CACHE_INV: buffer_wbinvl1{{$}}
; CACHE_INV10: buffer_gl0_inv
; CACHE_INV10: buffer_gl1_inv
; SKIP_CACHE_INV-NOT: buffer_wbinvl1{{$}}
; SKIP_CACHE_INV-NOT: buffer_gl
; GCN: s_endpgm
define amdgpu_kernel void @system_seq_cst() {
entry:
fence seq_cst
ret void
}
; Fences with syncscope("one-as"), one kernel per ordering. These mirror the
; system_* tests except that the expected wait is the single-counter form
; `s_waitcnt vmcnt(0)` (end-anchored, so no lgkmcnt operand may follow).
; The GCN10 runs also expect `s_waitcnt_vscnt null, 0x0`. Invalidate checks
; are present for acquire, acq_rel and seq_cst and absent for release.
; FUNC-LABEL: {{^}}system_one_as_acquire:
; GCN: %bb.0
; GCN-NOT: ATOMIC_FENCE
; GCN: s_waitcnt vmcnt(0){{$}}
; GCN10: s_waitcnt_vscnt null, 0x0
; CACHE_INV: buffer_wbinvl1{{$}}
; CACHE_INV10: buffer_gl0_inv
; CACHE_INV10: buffer_gl1_inv
; SKIP_CACHE_INV-NOT: buffer_wbinvl1{{$}}
; SKIP_CACHE_INV-NOT: buffer_gl
; GCN: s_endpgm
define amdgpu_kernel void @system_one_as_acquire() {
entry:
fence syncscope("one-as") acquire
ret void
}
; FUNC-LABEL: {{^}}system_one_as_release:
; GCN: %bb.0
; GCN-NOT: ATOMIC_FENCE
; GCN: s_waitcnt vmcnt(0){{$}}
; GCN10: s_waitcnt_vscnt null, 0x0
; GCN: s_endpgm
define amdgpu_kernel void @system_one_as_release() {
entry:
fence syncscope("one-as") release
ret void
}
; FUNC-LABEL: {{^}}system_one_as_acq_rel:
; GCN: %bb.0
; GCN-NOT: ATOMIC_FENCE
; GCN: s_waitcnt vmcnt(0){{$}}
; GCN10: s_waitcnt_vscnt null, 0x0
; CACHE_INV: buffer_wbinvl1{{$}}
; CACHE_INV10: buffer_gl0_inv
; CACHE_INV10: buffer_gl1_inv
; SKIP_CACHE_INV-NOT: buffer_wbinvl1{{$}}
; SKIP_CACHE_INV-NOT: buffer_gl
; GCN: s_endpgm
define amdgpu_kernel void @system_one_as_acq_rel() {
entry:
fence syncscope("one-as") acq_rel
ret void
}
; FUNC-LABEL: {{^}}system_one_as_seq_cst:
; GCN: %bb.0
; GCN-NOT: ATOMIC_FENCE
; GCN: s_waitcnt vmcnt(0){{$}}
; GCN10: s_waitcnt_vscnt null, 0x0
; CACHE_INV: buffer_wbinvl1{{$}}
; CACHE_INV10: buffer_gl0_inv
; CACHE_INV10: buffer_gl1_inv
; SKIP_CACHE_INV-NOT: buffer_wbinvl1{{$}}
; SKIP_CACHE_INV-NOT: buffer_gl
; GCN: s_endpgm
define amdgpu_kernel void @system_one_as_seq_cst() {
entry:
fence syncscope("one-as") seq_cst
ret void
}
; Fences with syncscope("singlethread"), one kernel per ordering. The only
; expectation in every run is that no ATOMIC_FENCE text appears between
; %bb.0 and s_endpgm; no wait or cache-invalidate instruction is checked for.
; FUNC-LABEL: {{^}}singlethread_acquire:
; GCN: %bb.0
; GCN-NOT: ATOMIC_FENCE
; GCN: s_endpgm
define amdgpu_kernel void @singlethread_acquire() {
entry:
fence syncscope("singlethread") acquire
ret void
}
; FUNC-LABEL: {{^}}singlethread_release:
; GCN: %bb.0
; GCN-NOT: ATOMIC_FENCE
; GCN: s_endpgm
define amdgpu_kernel void @singlethread_release() {
entry:
fence syncscope("singlethread") release
ret void
}
; FUNC-LABEL: {{^}}singlethread_acq_rel:
; GCN: %bb.0
; GCN-NOT: ATOMIC_FENCE
; GCN: s_endpgm
define amdgpu_kernel void @singlethread_acq_rel() {
entry:
fence syncscope("singlethread") acq_rel
ret void
}
; FUNC-LABEL: {{^}}singlethread_seq_cst:
; GCN: %bb.0
; GCN-NOT: ATOMIC_FENCE
; GCN: s_endpgm
define amdgpu_kernel void @singlethread_seq_cst() {
entry:
fence syncscope("singlethread") seq_cst
ret void
}
; Fences with syncscope("singlethread-one-as"), one kernel per ordering.
; Same expectations as the singlethread_* tests: no ATOMIC_FENCE text between
; %bb.0 and s_endpgm, and nothing else is checked.
; FUNC-LABEL: {{^}}singlethread_one_as_acquire:
; GCN: %bb.0
; GCN-NOT: ATOMIC_FENCE
; GCN: s_endpgm
define amdgpu_kernel void @singlethread_one_as_acquire() {
entry:
fence syncscope("singlethread-one-as") acquire
ret void
}
; FUNC-LABEL: {{^}}singlethread_one_as_release:
; GCN: %bb.0
; GCN-NOT: ATOMIC_FENCE
; GCN: s_endpgm
define amdgpu_kernel void @singlethread_one_as_release() {
entry:
fence syncscope("singlethread-one-as") release
ret void
}
; FUNC-LABEL: {{^}}singlethread_one_as_acq_rel:
; GCN: %bb.0
; GCN-NOT: ATOMIC_FENCE
; GCN: s_endpgm
define amdgpu_kernel void @singlethread_one_as_acq_rel() {
entry:
fence syncscope("singlethread-one-as") acq_rel
ret void
}
; FUNC-LABEL: {{^}}singlethread_one_as_seq_cst:
; GCN: %bb.0
; GCN-NOT: ATOMIC_FENCE
; GCN: s_endpgm
define amdgpu_kernel void @singlethread_one_as_seq_cst() {
entry:
fence syncscope("singlethread-one-as") seq_cst
ret void
}
; Fences with syncscope("agent"), one kernel per ordering. The check lines
; are the same as for the system_* tests: `s_waitcnt vmcnt(0) lgkmcnt(0)` in
; every run, `s_waitcnt_vscnt null, 0x0` in the GCN10 runs, and cache
; invalidate checks for acquire, acq_rel and seq_cst but not for release.
; FUNC-LABEL: {{^}}agent_acquire:
; GCN: %bb.0
; GCN-NOT: ATOMIC_FENCE
; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GCN10: s_waitcnt_vscnt null, 0x0
; CACHE_INV: buffer_wbinvl1{{$}}
; CACHE_INV10: buffer_gl0_inv
; CACHE_INV10: buffer_gl1_inv
; SKIP_CACHE_INV-NOT: buffer_wbinvl1{{$}}
; SKIP_CACHE_INV-NOT: buffer_gl
; GCN: s_endpgm
define amdgpu_kernel void @agent_acquire() {
entry:
fence syncscope("agent") acquire
ret void
}
; FUNC-LABEL: {{^}}agent_release:
; GCN: %bb.0
; GCN-NOT: ATOMIC_FENCE
; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GCN10: s_waitcnt_vscnt null, 0x0
; GCN: s_endpgm
define amdgpu_kernel void @agent_release() {
entry:
fence syncscope("agent") release
ret void
}
; FUNC-LABEL: {{^}}agent_acq_rel:
; GCN: %bb.0
; GCN-NOT: ATOMIC_FENCE
; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GCN10: s_waitcnt_vscnt null, 0x0
; CACHE_INV: buffer_wbinvl1{{$}}
; CACHE_INV10: buffer_gl0_inv
; CACHE_INV10: buffer_gl1_inv
; SKIP_CACHE_INV-NOT: buffer_wbinvl1{{$}}
; SKIP_CACHE_INV-NOT: buffer_gl
; GCN: s_endpgm
define amdgpu_kernel void @agent_acq_rel() {
entry:
fence syncscope("agent") acq_rel
ret void
}
; FUNC-LABEL: {{^}}agent_seq_cst:
; GCN: %bb.0
; GCN-NOT: ATOMIC_FENCE
; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GCN10: s_waitcnt_vscnt null, 0x0
; CACHE_INV: buffer_wbinvl1{{$}}
; CACHE_INV10: buffer_gl0_inv
; CACHE_INV10: buffer_gl1_inv
; SKIP_CACHE_INV-NOT: buffer_wbinvl1{{$}}
; SKIP_CACHE_INV-NOT: buffer_gl
; GCN: s_endpgm
define amdgpu_kernel void @agent_seq_cst() {
entry:
fence syncscope("agent") seq_cst
ret void
}
; Fences with syncscope("agent-one-as"), one kernel per ordering. The check
; lines are the same as for the system_one_as_* tests: end-anchored
; `s_waitcnt vmcnt(0)` in every run, `s_waitcnt_vscnt null, 0x0` in the GCN10
; runs, and cache invalidate checks for acquire, acq_rel and seq_cst only.
; FUNC-LABEL: {{^}}agent_one_as_acquire:
; GCN: %bb.0
; GCN-NOT: ATOMIC_FENCE
; GCN: s_waitcnt vmcnt(0){{$}}
; GCN10: s_waitcnt_vscnt null, 0x0
; CACHE_INV: buffer_wbinvl1{{$}}
; CACHE_INV10: buffer_gl0_inv
; CACHE_INV10: buffer_gl1_inv
; SKIP_CACHE_INV-NOT: buffer_wbinvl1{{$}}
; SKIP_CACHE_INV-NOT: buffer_gl
; GCN: s_endpgm
define amdgpu_kernel void @agent_one_as_acquire() {
entry:
fence syncscope("agent-one-as") acquire
ret void
}
; FUNC-LABEL: {{^}}agent_one_as_release:
; GCN: %bb.0
; GCN-NOT: ATOMIC_FENCE
; GCN: s_waitcnt vmcnt(0){{$}}
; GCN10: s_waitcnt_vscnt null, 0x0
; GCN: s_endpgm
define amdgpu_kernel void @agent_one_as_release() {
entry:
fence syncscope("agent-one-as") release
ret void
}
; FUNC-LABEL: {{^}}agent_one_as_acq_rel:
; GCN: %bb.0
; GCN-NOT: ATOMIC_FENCE
; GCN: s_waitcnt vmcnt(0){{$}}
; GCN10: s_waitcnt_vscnt null, 0x0
; CACHE_INV: buffer_wbinvl1{{$}}
; CACHE_INV10: buffer_gl0_inv
; CACHE_INV10: buffer_gl1_inv
; SKIP_CACHE_INV-NOT: buffer_wbinvl1{{$}}
; SKIP_CACHE_INV-NOT: buffer_gl
; GCN: s_endpgm
define amdgpu_kernel void @agent_one_as_acq_rel() {
entry:
fence syncscope("agent-one-as") acq_rel
ret void
}
; FUNC-LABEL: {{^}}agent_one_as_seq_cst:
; GCN: %bb.0
; GCN-NOT: ATOMIC_FENCE
; GCN: s_waitcnt vmcnt(0){{$}}
; GCN10: s_waitcnt_vscnt null, 0x0
; CACHE_INV: buffer_wbinvl1{{$}}
; CACHE_INV10: buffer_gl0_inv
; CACHE_INV10: buffer_gl1_inv
; SKIP_CACHE_INV-NOT: buffer_wbinvl1{{$}}
; SKIP_CACHE_INV-NOT: buffer_gl
; GCN: s_endpgm
define amdgpu_kernel void @agent_one_as_seq_cst() {
entry:
fence syncscope("agent-one-as") seq_cst
ret void
}
; Fences with syncscope("workgroup") for the acquire, release and acq_rel
; orderings. The GCN9 runs expect that `s_waitcnt vmcnt(0) lgkmcnt(0)` is
; absent, while the GCN10 runs expect that same wait followed by
; `s_waitcnt_vscnt null, 0x0`. No cache invalidate is checked for at this scope.
; FUNC-LABEL: {{^}}workgroup_acquire:
; GCN: %bb.0
; GCN9-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GCN10: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GCN10: s_waitcnt_vscnt null, 0x0
; GCN-NOT: ATOMIC_FENCE
; GCN: s_endpgm
define amdgpu_kernel void @workgroup_acquire() {
entry:
fence syncscope("workgroup") acquire
ret void
}
; FUNC-LABEL: {{^}}workgroup_release:
; GCN: %bb.0
; GCN9-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GCN10: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GCN10: s_waitcnt_vscnt null, 0x0
; GCN-NOT: ATOMIC_FENCE
; GCN: s_endpgm
define amdgpu_kernel void @workgroup_release() {
entry:
fence syncscope("workgroup") release
ret void
}
; FUNC-LABEL: {{^}}workgroup_acq_rel:
; GCN: %bb.0
; GCN9-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GCN10: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GCN10: s_waitcnt_vscnt null, 0x0
; GCN-NOT: ATOMIC_FENCE
; GCN: s_endpgm
define amdgpu_kernel void @workgroup_acq_rel() {
entry:
fence syncscope("workgroup") acq_rel
ret void
}
; Seq_cst fence with syncscope("workgroup"). The expectations are the same as
; for the other workgroup_* fence tests: the GCN9 runs must not contain
; `s_waitcnt vmcnt(0) lgkmcnt(0)`, while the GCN10 runs must contain it,
; followed by `s_waitcnt_vscnt null, 0x0`. The negative wait check is scoped to
; GCN9 only, because an unconditional one would forbid the very instruction
; the GCN10 runs require.
; FUNC-LABEL: {{^}}workgroup_seq_cst:
; GCN: %bb.0
; GCN9-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GCN10: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GCN10: s_waitcnt_vscnt null, 0x0
; GCN-NOT: ATOMIC_FENCE
; GCN: s_endpgm
define amdgpu_kernel void @workgroup_seq_cst() {
entry:
fence syncscope("workgroup") seq_cst
ret void
}
; Fences with syncscope("workgroup-one-as"), one kernel per ordering. The
; GCN10 runs expect `s_waitcnt vmcnt(0)` (not end-anchored) followed by
; `s_waitcnt_vscnt null, 0x0`. The GCN9 runs only forbid the two-counter
; form `s_waitcnt vmcnt(0) lgkmcnt(0)`.
; NOTE(review): because the GCN9 negative pattern is the two-counter form, a
; bare `s_waitcnt vmcnt(0)` would not be rejected here - confirm this is the
; intended strength of the check.
; FUNC-LABEL: {{^}}workgroup_one_as_acquire:
; GCN: %bb.0
; GCN9-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GCN10: s_waitcnt vmcnt(0)
; GCN10: s_waitcnt_vscnt null, 0x0
; GCN-NOT: ATOMIC_FENCE
; GCN: s_endpgm
define amdgpu_kernel void @workgroup_one_as_acquire() {
entry:
fence syncscope("workgroup-one-as") acquire
ret void
}
; FUNC-LABEL: {{^}}workgroup_one_as_release:
; GCN: %bb.0
; GCN9-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GCN10: s_waitcnt vmcnt(0)
; GCN10: s_waitcnt_vscnt null, 0x0
; GCN-NOT: ATOMIC_FENCE
; GCN: s_endpgm
define amdgpu_kernel void @workgroup_one_as_release() {
entry:
fence syncscope("workgroup-one-as") release
ret void
}
; FUNC-LABEL: {{^}}workgroup_one_as_acq_rel:
; GCN: %bb.0
; GCN9-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GCN10: s_waitcnt vmcnt(0)
; GCN10: s_waitcnt_vscnt null, 0x0
; GCN-NOT: ATOMIC_FENCE
; GCN: s_endpgm
define amdgpu_kernel void @workgroup_one_as_acq_rel() {
entry:
fence syncscope("workgroup-one-as") acq_rel
ret void
}
; FUNC-LABEL: {{^}}workgroup_one_as_seq_cst:
; GCN: %bb.0
; GCN9-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GCN10: s_waitcnt vmcnt(0)
; GCN10: s_waitcnt_vscnt null, 0x0
; GCN-NOT: ATOMIC_FENCE
; GCN: s_endpgm
define amdgpu_kernel void @workgroup_one_as_seq_cst() {
entry:
fence syncscope("workgroup-one-as") seq_cst
ret void
}
; Fences with syncscope("wavefront"), one kernel per ordering. The only
; expectation in every run is that no ATOMIC_FENCE text appears between
; %bb.0 and s_endpgm; no wait or cache-invalidate instruction is checked for.
; FUNC-LABEL: {{^}}wavefront_acquire:
; GCN: %bb.0
; GCN-NOT: ATOMIC_FENCE
; GCN: s_endpgm
define amdgpu_kernel void @wavefront_acquire() {
entry:
fence syncscope("wavefront") acquire
ret void
}
; FUNC-LABEL: {{^}}wavefront_release:
; GCN: %bb.0
; GCN-NOT: ATOMIC_FENCE
; GCN: s_endpgm
define amdgpu_kernel void @wavefront_release() {
entry:
fence syncscope("wavefront") release
ret void
}
; FUNC-LABEL: {{^}}wavefront_acq_rel:
; GCN: %bb.0
; GCN-NOT: ATOMIC_FENCE
; GCN: s_endpgm
define amdgpu_kernel void @wavefront_acq_rel() {
entry:
fence syncscope("wavefront") acq_rel
ret void
}
; FUNC-LABEL: {{^}}wavefront_seq_cst:
; GCN: %bb.0
; GCN-NOT: ATOMIC_FENCE
; GCN: s_endpgm
define amdgpu_kernel void @wavefront_seq_cst() {
entry:
fence syncscope("wavefront") seq_cst
ret void
}
; Fences with syncscope("wavefront-one-as"), one kernel per ordering.
; Same expectations as the wavefront_* tests: no ATOMIC_FENCE text between
; %bb.0 and s_endpgm, and nothing else is checked.
; FUNC-LABEL: {{^}}wavefront_one_as_acquire:
; GCN: %bb.0
; GCN-NOT: ATOMIC_FENCE
; GCN: s_endpgm
define amdgpu_kernel void @wavefront_one_as_acquire() {
entry:
fence syncscope("wavefront-one-as") acquire
ret void
}
; FUNC-LABEL: {{^}}wavefront_one_as_release:
; GCN: %bb.0
; GCN-NOT: ATOMIC_FENCE
; GCN: s_endpgm
define amdgpu_kernel void @wavefront_one_as_release() {
entry:
fence syncscope("wavefront-one-as") release
ret void
}
; FUNC-LABEL: {{^}}wavefront_one_as_acq_rel:
; GCN: %bb.0
; GCN-NOT: ATOMIC_FENCE
; GCN: s_endpgm
define amdgpu_kernel void @wavefront_one_as_acq_rel() {
entry:
fence syncscope("wavefront-one-as") acq_rel
ret void
}
; FUNC-LABEL: {{^}}wavefront_one_as_seq_cst:
; GCN: %bb.0
; GCN-NOT: ATOMIC_FENCE
; GCN: s_endpgm
define amdgpu_kernel void @wavefront_one_as_seq_cst() {
entry:
fence syncscope("wavefront-one-as") seq_cst
ret void
}

File diff suppressed because it is too large Load Diff

View File

@ -1,719 +0,0 @@
; Test configurations for the fence tests below.
; All runs share the FUNC and GCN prefixes. The gfx600 run adds GFX6 and
; GFX68; both gfx803 runs (with and without the amdhsa OS in the triple) add
; GFX8 and GFX68; the gfx1010 runs add GFX10 plus GFX10WGP, or GFX10CU when
; -mattr=+cumode is passed.
; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN,GFX6,GFX68 %s
; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN,GFX8,GFX68 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN,GFX8,GFX68 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN,GFX10,GFX10WGP %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN,GFX10,GFX10CU %s
; Fences with syncscope("one-as"), one kernel per ordering. Every run expects
; an end-anchored `s_waitcnt vmcnt(0)`, and the GFX10 runs expect
; `s_waitcnt_vscnt null, 0x0` on the next line. The acquire, acq_rel and
; seq_cst tests also expect an invalidate: buffer_wbinvl1 (GFX6),
; buffer_wbinvl1_vol (GFX8), or buffer_gl0_inv then buffer_gl1_inv (GFX10).
; After s_endpgm each test checks the GFX10 .amdhsa_kernel block:
; `.amdhsa_workgroup_processor_mode 0` must be present under GFX10CU and
; absent under GFX10WGP, and `.amdhsa_memory_ordered 0` must be absent.
; FUNC-LABEL: {{^}}system_one_as_acquire:
; GCN: %bb.0
; GCN-NOT: ATOMIC_FENCE
; GFX6: s_waitcnt vmcnt(0){{$}}
; GFX6-NEXT: buffer_wbinvl1{{$}}
; GFX8: s_waitcnt vmcnt(0){{$}}
; GFX8-NEXT: buffer_wbinvl1_vol{{$}}
; GFX10: s_waitcnt vmcnt(0){{$}}
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
; GFX10-NEXT: buffer_gl0_inv{{$}}
; GFX10-NEXT: buffer_gl1_inv{{$}}
; GCN: s_endpgm
; GFX10: .amdhsa_kernel system_one_as_acquire
; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
; GFX10CU: .amdhsa_workgroup_processor_mode 0
; GFX10-NOT: .amdhsa_memory_ordered 0
define amdgpu_kernel void @system_one_as_acquire() {
entry:
fence syncscope("one-as") acquire
ret void
}
; FUNC-LABEL: {{^}}system_one_as_release:
; GCN: %bb.0
; GCN-NOT: ATOMIC_FENCE
; GCN: s_waitcnt vmcnt(0){{$}}
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
; GCN: s_endpgm
; GFX10: .amdhsa_kernel system_one_as_release
; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
; GFX10CU: .amdhsa_workgroup_processor_mode 0
; GFX10-NOT: .amdhsa_memory_ordered 0
define amdgpu_kernel void @system_one_as_release() {
entry:
fence syncscope("one-as") release
ret void
}
; FUNC-LABEL: {{^}}system_one_as_acq_rel:
; GCN: %bb.0
; GCN-NOT: ATOMIC_FENCE
; GCN: s_waitcnt vmcnt(0){{$}}
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
; GFX6: buffer_wbinvl1{{$}}
; GFX8: buffer_wbinvl1_vol{{$}}
; GFX10-NEXT: buffer_gl0_inv{{$}}
; GFX10-NEXT: buffer_gl1_inv{{$}}
; GCN: s_endpgm
; GFX10: .amdhsa_kernel system_one_as_acq_rel
; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
; GFX10CU: .amdhsa_workgroup_processor_mode 0
; GFX10-NOT: .amdhsa_memory_ordered 0
define amdgpu_kernel void @system_one_as_acq_rel() {
entry:
fence syncscope("one-as") acq_rel
ret void
}
; FUNC-LABEL: {{^}}system_one_as_seq_cst:
; GCN: %bb.0
; GCN-NOT: ATOMIC_FENCE
; GCN: s_waitcnt vmcnt(0){{$}}
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
; GFX6: buffer_wbinvl1{{$}}
; GFX8: buffer_wbinvl1_vol{{$}}
; GFX10-NEXT: buffer_gl0_inv{{$}}
; GFX10-NEXT: buffer_gl1_inv{{$}}
; GCN: s_endpgm
; GFX10: .amdhsa_kernel system_one_as_seq_cst
; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
; GFX10CU: .amdhsa_workgroup_processor_mode 0
; GFX10-NOT: .amdhsa_memory_ordered 0
define amdgpu_kernel void @system_one_as_seq_cst() {
entry:
fence syncscope("one-as") seq_cst
ret void
}
; Fences with syncscope("singlethread-one-as"), one kernel per ordering. The
; only code expectation is that no ATOMIC_FENCE text appears between %bb.0
; and s_endpgm. The GFX10 .amdhsa_kernel checks that follow are the same in
; every test: `.amdhsa_workgroup_processor_mode 0` present only under
; GFX10CU, and `.amdhsa_memory_ordered 0` absent.
; FUNC-LABEL: {{^}}singlethread_one_as_acquire:
; GCN: %bb.0
; GCN-NOT: ATOMIC_FENCE
; GCN: s_endpgm
; GFX10: .amdhsa_kernel singlethread_one_as_acquire
; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
; GFX10CU: .amdhsa_workgroup_processor_mode 0
; GFX10-NOT: .amdhsa_memory_ordered 0
define amdgpu_kernel void @singlethread_one_as_acquire() {
entry:
fence syncscope("singlethread-one-as") acquire
ret void
}
; FUNC-LABEL: {{^}}singlethread_one_as_release:
; GCN: %bb.0
; GCN-NOT: ATOMIC_FENCE
; GCN: s_endpgm
; GFX10: .amdhsa_kernel singlethread_one_as_release
; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
; GFX10CU: .amdhsa_workgroup_processor_mode 0
; GFX10-NOT: .amdhsa_memory_ordered 0
define amdgpu_kernel void @singlethread_one_as_release() {
entry:
fence syncscope("singlethread-one-as") release
ret void
}
; FUNC-LABEL: {{^}}singlethread_one_as_acq_rel:
; GCN: %bb.0
; GCN-NOT: ATOMIC_FENCE
; GCN: s_endpgm
; GFX10: .amdhsa_kernel singlethread_one_as_acq_rel
; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
; GFX10CU: .amdhsa_workgroup_processor_mode 0
; GFX10-NOT: .amdhsa_memory_ordered 0
define amdgpu_kernel void @singlethread_one_as_acq_rel() {
entry:
fence syncscope("singlethread-one-as") acq_rel
ret void
}
; FUNC-LABEL: {{^}}singlethread_one_as_seq_cst:
; GCN: %bb.0
; GCN-NOT: ATOMIC_FENCE
; GCN: s_endpgm
; GFX10: .amdhsa_kernel singlethread_one_as_seq_cst
; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
; GFX10CU: .amdhsa_workgroup_processor_mode 0
; GFX10-NOT: .amdhsa_memory_ordered 0
define amdgpu_kernel void @singlethread_one_as_seq_cst() {
entry:
fence syncscope("singlethread-one-as") seq_cst
ret void
}
; Fences with syncscope("agent-one-as"), one kernel per ordering. The check
; lines are the same as for the "one-as" tests: end-anchored
; `s_waitcnt vmcnt(0)` in every run, `s_waitcnt_vscnt null, 0x0` next under
; GFX10, and an invalidate for acquire, acq_rel and seq_cst (buffer_wbinvl1
; on GFX6, buffer_wbinvl1_vol on GFX8, buffer_gl0_inv then buffer_gl1_inv on
; GFX10). The release test has no invalidate checks. The GFX10 .amdhsa_kernel
; checks follow each test.
; FUNC-LABEL: {{^}}agent_one_as_acquire:
; GCN: %bb.0
; GCN-NOT: ATOMIC_FENCE
; GFX6: s_waitcnt vmcnt(0){{$}}
; GFX6-NEXT: buffer_wbinvl1{{$}}
; GFX8: s_waitcnt vmcnt(0){{$}}
; GFX8-NEXT: buffer_wbinvl1_vol{{$}}
; GFX10: s_waitcnt vmcnt(0){{$}}
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
; GFX10-NEXT: buffer_gl0_inv{{$}}
; GFX10-NEXT: buffer_gl1_inv{{$}}
; GCN: s_endpgm
; GFX10: .amdhsa_kernel agent_one_as_acquire
; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
; GFX10CU: .amdhsa_workgroup_processor_mode 0
; GFX10-NOT: .amdhsa_memory_ordered 0
define amdgpu_kernel void @agent_one_as_acquire() {
entry:
fence syncscope("agent-one-as") acquire
ret void
}
; FUNC-LABEL: {{^}}agent_one_as_release:
; GCN: %bb.0
; GCN-NOT: ATOMIC_FENCE
; GCN: s_waitcnt vmcnt(0){{$}}
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
; GCN: s_endpgm
; GFX10: .amdhsa_kernel agent_one_as_release
; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
; GFX10CU: .amdhsa_workgroup_processor_mode 0
; GFX10-NOT: .amdhsa_memory_ordered 0
define amdgpu_kernel void @agent_one_as_release() {
entry:
fence syncscope("agent-one-as") release
ret void
}
; FUNC-LABEL: {{^}}agent_one_as_acq_rel:
; GCN: %bb.0
; GCN-NOT: ATOMIC_FENCE
; GCN: s_waitcnt vmcnt(0){{$}}
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
; GFX6: buffer_wbinvl1{{$}}
; GFX8: buffer_wbinvl1_vol{{$}}
; GFX10-NEXT: buffer_gl0_inv{{$}}
; GFX10-NEXT: buffer_gl1_inv{{$}}
; GCN: s_endpgm
; GFX10: .amdhsa_kernel agent_one_as_acq_rel
; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
; GFX10CU: .amdhsa_workgroup_processor_mode 0
; GFX10-NOT: .amdhsa_memory_ordered 0
define amdgpu_kernel void @agent_one_as_acq_rel() {
entry:
fence syncscope("agent-one-as") acq_rel
ret void
}
; FUNC-LABEL: {{^}}agent_one_as_seq_cst:
; GCN: %bb.0
; GCN-NOT: ATOMIC_FENCE
; GCN: s_waitcnt vmcnt(0){{$}}
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
; GFX6: buffer_wbinvl1{{$}}
; GFX8: buffer_wbinvl1_vol{{$}}
; GFX10-NEXT: buffer_gl0_inv{{$}}
; GFX10-NEXT: buffer_gl1_inv{{$}}
; GCN: s_endpgm
; GFX10: .amdhsa_kernel agent_one_as_seq_cst
; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
; GFX10CU: .amdhsa_workgroup_processor_mode 0
; GFX10-NOT: .amdhsa_memory_ordered 0
define amdgpu_kernel void @agent_one_as_seq_cst() {
entry:
fence syncscope("agent-one-as") seq_cst
ret void
}
; Fences with syncscope("workgroup-one-as"), one kernel per ordering. The
; expectations differ between the two gfx1010 configurations:
;  - GFX10WGP expects `s_waitcnt vmcnt(0)` then `s_waitcnt_vscnt null, 0x0`,
;    and for acquire, acq_rel and seq_cst a following buffer_gl0_inv.
;  - GFX10CU (the +cumode run) expects none of those three instructions.
; The release test expects no buffer_gl0_inv under either GFX10 prefix, and
; the GFX68 runs expect no end-anchored `s_waitcnt vmcnt(0)`. The GFX10
; .amdhsa_kernel checks follow each test.
; FUNC-LABEL: {{^}}workgroup_one_as_acquire:
; GCN: %bb.0
; GFX68-NOT: s_waitcnt vmcnt(0){{$}}
; GFX10WGP: s_waitcnt vmcnt(0){{$}}
; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}}
; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}}
; GFX10WGP-NEXT: buffer_gl0_inv{{$}}
; GFX10CU-NOT: buffer_gl0_inv{{$}}
; GCN-NOT: ATOMIC_FENCE
; GCN: s_endpgm
; GFX10: .amdhsa_kernel workgroup_one_as_acquire
; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
; GFX10CU: .amdhsa_workgroup_processor_mode 0
; GFX10-NOT: .amdhsa_memory_ordered 0
define amdgpu_kernel void @workgroup_one_as_acquire() {
entry:
fence syncscope("workgroup-one-as") acquire
ret void
}
; FUNC-LABEL: {{^}}workgroup_one_as_release:
; GCN: %bb.0
; GFX68-NOT: s_waitcnt vmcnt(0){{$}}
; GFX10WGP: s_waitcnt vmcnt(0){{$}}
; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}}
; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}}
; GFX10-NOT: buffer_gl0_inv
; GCN-NOT: ATOMIC_FENCE
; GCN: s_endpgm
; GFX10: .amdhsa_kernel workgroup_one_as_release
; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
; GFX10CU: .amdhsa_workgroup_processor_mode 0
; GFX10-NOT: .amdhsa_memory_ordered 0
define amdgpu_kernel void @workgroup_one_as_release() {
entry:
fence syncscope("workgroup-one-as") release
ret void
}
; FUNC-LABEL: {{^}}workgroup_one_as_acq_rel:
; GCN: %bb.0
; GFX68-NOT: s_waitcnt vmcnt(0){{$}}
; GFX10WGP: s_waitcnt vmcnt(0){{$}}
; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
; GFX10WGP-NEXT: buffer_gl0_inv{{$}}
; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}}
; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}}
; GFX10CU-NOT: buffer_gl0_inv{{$}}
; GCN-NOT: ATOMIC_FENCE
; GCN: s_endpgm
; GFX10: .amdhsa_kernel workgroup_one_as_acq_rel
; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
; GFX10CU: .amdhsa_workgroup_processor_mode 0
; GFX10-NOT: .amdhsa_memory_ordered 0
define amdgpu_kernel void @workgroup_one_as_acq_rel() {
entry:
fence syncscope("workgroup-one-as") acq_rel
ret void
}
; FUNC-LABEL: {{^}}workgroup_one_as_seq_cst:
; GCN: %bb.0
; GFX68-NOT: s_waitcnt vmcnt(0){{$}}
; GFX10WGP: s_waitcnt vmcnt(0){{$}}
; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
; GFX10WGP-NEXT: buffer_gl0_inv{{$}}
; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}}
; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}}
; GFX10CU-NOT: buffer_gl0_inv{{$}}
; GCN-NOT: ATOMIC_FENCE
; GCN: s_endpgm
; GFX10: .amdhsa_kernel workgroup_one_as_seq_cst
; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
; GFX10CU: .amdhsa_workgroup_processor_mode 0
; GFX10-NOT: .amdhsa_memory_ordered 0
define amdgpu_kernel void @workgroup_one_as_seq_cst() {
entry:
fence syncscope("workgroup-one-as") seq_cst
ret void
}
; Fences with syncscope("wavefront-one-as"), one kernel per ordering. The
; only code expectation is that no ATOMIC_FENCE text appears between %bb.0
; and s_endpgm. The GFX10 .amdhsa_kernel checks that follow are the same in
; every test: `.amdhsa_workgroup_processor_mode 0` present only under
; GFX10CU, and `.amdhsa_memory_ordered 0` absent.
; FUNC-LABEL: {{^}}wavefront_one_as_acquire:
; GCN: %bb.0
; GCN-NOT: ATOMIC_FENCE
; GCN: s_endpgm
; GFX10: .amdhsa_kernel wavefront_one_as_acquire
; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
; GFX10CU: .amdhsa_workgroup_processor_mode 0
; GFX10-NOT: .amdhsa_memory_ordered 0
define amdgpu_kernel void @wavefront_one_as_acquire() {
entry:
fence syncscope("wavefront-one-as") acquire
ret void
}
; FUNC-LABEL: {{^}}wavefront_one_as_release:
; GCN: %bb.0
; GCN-NOT: ATOMIC_FENCE
; GCN: s_endpgm
; GFX10: .amdhsa_kernel wavefront_one_as_release
; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
; GFX10CU: .amdhsa_workgroup_processor_mode 0
; GFX10-NOT: .amdhsa_memory_ordered 0
define amdgpu_kernel void @wavefront_one_as_release() {
entry:
fence syncscope("wavefront-one-as") release
ret void
}
; FUNC-LABEL: {{^}}wavefront_one_as_acq_rel:
; GCN: %bb.0
; GCN-NOT: ATOMIC_FENCE
; GCN: s_endpgm
; GFX10: .amdhsa_kernel wavefront_one_as_acq_rel
; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
; GFX10CU: .amdhsa_workgroup_processor_mode 0
; GFX10-NOT: .amdhsa_memory_ordered 0
define amdgpu_kernel void @wavefront_one_as_acq_rel() {
entry:
fence syncscope("wavefront-one-as") acq_rel
ret void
}
; FUNC-LABEL: {{^}}wavefront_one_as_seq_cst:
; GCN: %bb.0
; GCN-NOT: ATOMIC_FENCE
; GCN: s_endpgm
; GFX10: .amdhsa_kernel wavefront_one_as_seq_cst
; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
; GFX10CU: .amdhsa_workgroup_processor_mode 0
; GFX10-NOT: .amdhsa_memory_ordered 0
define amdgpu_kernel void @wavefront_one_as_seq_cst() {
entry:
fence syncscope("wavefront-one-as") seq_cst
ret void
}
; Fences with no explicit syncscope (the system_* tests), one kernel per
; ordering. GFX6, GFX8 and GFX10 each expect `s_waitcnt vmcnt(0) lgkmcnt(0)`,
; and GFX10 expects `s_waitcnt_vscnt null, 0x0` on the next line. The
; acquire, acq_rel and seq_cst tests also expect an invalidate:
; buffer_wbinvl1 (GFX6), buffer_wbinvl1_vol (GFX8), or buffer_gl0_inv then
; buffer_gl1_inv (GFX10). The release test has no invalidate checks. The
; GFX10 .amdhsa_kernel checks follow each test.
; FUNC-LABEL: {{^}}system_acquire:
; GCN: %bb.0
; GCN-NOT: ATOMIC_FENCE
; GFX6: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GFX6-NEXT: buffer_wbinvl1{{$}}
; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GFX8-NEXT: buffer_wbinvl1_vol{{$}}
; GFX10: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
; GFX10-NEXT: buffer_gl0_inv{{$}}
; GFX10-NEXT: buffer_gl1_inv{{$}}
; GCN: s_endpgm
; GFX10: .amdhsa_kernel system_acquire
; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
; GFX10CU: .amdhsa_workgroup_processor_mode 0
; GFX10-NOT: .amdhsa_memory_ordered 0
define amdgpu_kernel void @system_acquire() {
entry:
fence acquire
ret void
}
; FUNC-LABEL: {{^}}system_release:
; GCN: %bb.0
; GCN-NOT: ATOMIC_FENCE
; GFX6: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GFX10: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
; GCN: s_endpgm
; GFX10: .amdhsa_kernel system_release
; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
; GFX10CU: .amdhsa_workgroup_processor_mode 0
; GFX10-NOT: .amdhsa_memory_ordered 0
define amdgpu_kernel void @system_release() {
entry:
fence release
ret void
}
; FUNC-LABEL: {{^}}system_acq_rel:
; GCN: %bb.0
; GCN-NOT: ATOMIC_FENCE
; GFX6: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GFX10: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
; GFX6: buffer_wbinvl1{{$}}
; GFX8: buffer_wbinvl1_vol{{$}}
; GFX10-NEXT: buffer_gl0_inv{{$}}
; GFX10-NEXT: buffer_gl1_inv{{$}}
; GCN: s_endpgm
; GFX10: .amdhsa_kernel system_acq_rel
; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
; GFX10CU: .amdhsa_workgroup_processor_mode 0
; GFX10-NOT: .amdhsa_memory_ordered 0
define amdgpu_kernel void @system_acq_rel() {
entry:
fence acq_rel
ret void
}
; FUNC-LABEL: {{^}}system_seq_cst:
; GCN: %bb.0
; GCN-NOT: ATOMIC_FENCE
; GFX6: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GFX10: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
; GFX6: buffer_wbinvl1{{$}}
; GFX8: buffer_wbinvl1_vol{{$}}
; GFX10-NEXT: buffer_gl0_inv{{$}}
; GFX10-NEXT: buffer_gl1_inv{{$}}
; GCN: s_endpgm
; GFX10: .amdhsa_kernel system_seq_cst
; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
; GFX10CU: .amdhsa_workgroup_processor_mode 0
; GFX10-NOT: .amdhsa_memory_ordered 0
define amdgpu_kernel void @system_seq_cst() {
entry:
fence seq_cst
ret void
}
; Fences with syncscope("singlethread"), one kernel per ordering. The only
; code expectation is that no ATOMIC_FENCE text appears between %bb.0 and
; s_endpgm. The GFX10 .amdhsa_kernel checks that follow are the same in every
; test: `.amdhsa_workgroup_processor_mode 0` present only under GFX10CU, and
; `.amdhsa_memory_ordered 0` absent.
; FUNC-LABEL: {{^}}singlethread_acquire:
; GCN: %bb.0
; GCN-NOT: ATOMIC_FENCE
; GCN: s_endpgm
; GFX10: .amdhsa_kernel singlethread_acquire
; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
; GFX10CU: .amdhsa_workgroup_processor_mode 0
; GFX10-NOT: .amdhsa_memory_ordered 0
define amdgpu_kernel void @singlethread_acquire() {
entry:
fence syncscope("singlethread") acquire
ret void
}
; FUNC-LABEL: {{^}}singlethread_release:
; GCN: %bb.0
; GCN-NOT: ATOMIC_FENCE
; GCN: s_endpgm
; GFX10: .amdhsa_kernel singlethread_release
; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
; GFX10CU: .amdhsa_workgroup_processor_mode 0
; GFX10-NOT: .amdhsa_memory_ordered 0
define amdgpu_kernel void @singlethread_release() {
entry:
fence syncscope("singlethread") release
ret void
}
; FUNC-LABEL: {{^}}singlethread_acq_rel:
; GCN: %bb.0
; GCN-NOT: ATOMIC_FENCE
; GCN: s_endpgm
; GFX10: .amdhsa_kernel singlethread_acq_rel
; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
; GFX10CU: .amdhsa_workgroup_processor_mode 0
; GFX10-NOT: .amdhsa_memory_ordered 0
define amdgpu_kernel void @singlethread_acq_rel() {
entry:
fence syncscope("singlethread") acq_rel
ret void
}
; FUNC-LABEL: {{^}}singlethread_seq_cst:
; GCN: %bb.0
; GCN-NOT: ATOMIC_FENCE
; GCN: s_endpgm
; GFX10: .amdhsa_kernel singlethread_seq_cst
; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
; GFX10CU: .amdhsa_workgroup_processor_mode 0
; GFX10-NOT: .amdhsa_memory_ordered 0
define amdgpu_kernel void @singlethread_seq_cst() {
entry:
fence syncscope("singlethread") seq_cst
ret void
}
; Fences with syncscope("agent"), one kernel per ordering. The check lines
; are the same as for the system_* tests in this file:
; `s_waitcnt vmcnt(0) lgkmcnt(0)` under GFX6, GFX8 and GFX10;
; `s_waitcnt_vscnt null, 0x0` next under GFX10; and an invalidate for
; acquire, acq_rel and seq_cst (buffer_wbinvl1 on GFX6, buffer_wbinvl1_vol on
; GFX8, buffer_gl0_inv then buffer_gl1_inv on GFX10). The release test has no
; invalidate checks. The GFX10 .amdhsa_kernel checks follow each test.
; FUNC-LABEL: {{^}}agent_acquire:
; GCN: %bb.0
; GCN-NOT: ATOMIC_FENCE
; GFX6: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GFX6-NEXT: buffer_wbinvl1{{$}}
; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GFX8-NEXT: buffer_wbinvl1_vol{{$}}
; GFX10: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
; GFX10-NEXT: buffer_gl0_inv{{$}}
; GFX10-NEXT: buffer_gl1_inv{{$}}
; GCN: s_endpgm
; GFX10: .amdhsa_kernel agent_acquire
; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
; GFX10CU: .amdhsa_workgroup_processor_mode 0
; GFX10-NOT: .amdhsa_memory_ordered 0
define amdgpu_kernel void @agent_acquire() {
entry:
fence syncscope("agent") acquire
ret void
}
; FUNC-LABEL: {{^}}agent_release:
; GCN: %bb.0
; GCN-NOT: ATOMIC_FENCE
; GFX6: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GFX10: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
; GCN: s_endpgm
; GFX10: .amdhsa_kernel agent_release
; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
; GFX10CU: .amdhsa_workgroup_processor_mode 0
; GFX10-NOT: .amdhsa_memory_ordered 0
define amdgpu_kernel void @agent_release() {
entry:
fence syncscope("agent") release
ret void
}
; FUNC-LABEL: {{^}}agent_acq_rel:
; GCN: %bb.0
; GCN-NOT: ATOMIC_FENCE
; GFX6: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GFX10: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
; GFX6: buffer_wbinvl1{{$}}
; GFX8: buffer_wbinvl1_vol{{$}}
; GFX10-NEXT: buffer_gl0_inv{{$}}
; GFX10-NEXT: buffer_gl1_inv{{$}}
; GCN: s_endpgm
; GFX10: .amdhsa_kernel agent_acq_rel
; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
; GFX10CU: .amdhsa_workgroup_processor_mode 0
; GFX10-NOT: .amdhsa_memory_ordered 0
define amdgpu_kernel void @agent_acq_rel() {
entry:
fence syncscope("agent") acq_rel
ret void
}
; FUNC-LABEL: {{^}}agent_seq_cst:
; GCN: %bb.0
; GCN-NOT: ATOMIC_FENCE
; GFX6: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GFX10: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
; GFX6: buffer_wbinvl1{{$}}
; GFX8: buffer_wbinvl1_vol{{$}}
; GFX10-NEXT: buffer_gl0_inv{{$}}
; GFX10-NEXT: buffer_gl1_inv{{$}}
; GCN: s_endpgm
; GFX10: .amdhsa_kernel agent_seq_cst
; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
; GFX10CU: .amdhsa_workgroup_processor_mode 0
; GFX10-NOT: .amdhsa_memory_ordered 0
define amdgpu_kernel void @agent_seq_cst() {
entry:
fence syncscope("agent") seq_cst
ret void
}
; Test case - fence with syncscope("workgroup") and acquire ordering.
; Here the two GFX10 sub-prefixes diverge -
;   * GFX10WGP expects "s_waitcnt vmcnt(0) lgkmcnt(0)", then
;     "s_waitcnt_vscnt null, 0x0", then buffer_gl0_inv on consecutive lines;
;   * GFX10CU carries only negative checks for the waits and for buffer_gl0_inv;
;   * GFX68 carries a negative check for a line ending in "s_waitcnt vmcnt(0)";
;   * no ATOMIC_FENCE text may appear before s_endpgm;
;   * after s_endpgm, the GFX10 family inspects the .amdhsa_kernel block.
; Prefix definitions live in this file's run lines, outside this excerpt.
; FUNC-LABEL: {{^}}workgroup_acquire:
; GCN: %bb.0
; GFX68-NOT: s_waitcnt vmcnt(0){{$}}
; GFX10WGP: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}}
; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}}
; GFX10WGP-NEXT: buffer_gl0_inv{{$}}
; GFX10CU-NOT: buffer_gl0_inv{{$}}
; GCN-NOT: ATOMIC_FENCE
; GCN: s_endpgm
; GFX10: .amdhsa_kernel workgroup_acquire
; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
; GFX10CU: .amdhsa_workgroup_processor_mode 0
; GFX10-NOT: .amdhsa_memory_ordered 0
define amdgpu_kernel void @workgroup_acquire() {
entry:
; The fence is the only operation in the kernel, so everything checked above
; comes from lowering it.
fence syncscope("workgroup") acquire
ret void
}
; Test case - fence with syncscope("workgroup") and release ordering.
;   * GFX10WGP expects "s_waitcnt vmcnt(0) lgkmcnt(0)" followed directly by
;     "s_waitcnt_vscnt null, 0x0";
;   * GFX10CU carries only negative checks for those waits;
;   * GFX68 carries a negative check for a line ending in "s_waitcnt vmcnt(0)";
;   * the whole GFX10 family must not emit buffer_gl0_inv (this negative pattern
;     is written without the end-of-line anchor the sibling tests use);
;   * no ATOMIC_FENCE text may appear before s_endpgm;
;   * after s_endpgm, the GFX10 family inspects the .amdhsa_kernel block.
; Prefix definitions live in this file's run lines, outside this excerpt.
; FUNC-LABEL: {{^}}workgroup_release:
; GCN: %bb.0
; GFX68-NOT: s_waitcnt vmcnt(0){{$}}
; GFX10WGP: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}}
; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}}
; GFX10-NOT: buffer_gl0_inv
; GCN-NOT: ATOMIC_FENCE
; GCN: s_endpgm
; GFX10: .amdhsa_kernel workgroup_release
; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
; GFX10CU: .amdhsa_workgroup_processor_mode 0
; GFX10-NOT: .amdhsa_memory_ordered 0
define amdgpu_kernel void @workgroup_release() {
entry:
; The fence is the only operation in the kernel, so everything checked above
; comes from lowering it.
fence syncscope("workgroup") release
ret void
}
; Test case - fence with syncscope("workgroup") and acq_rel ordering.
;   * GFX10WGP expects "s_waitcnt vmcnt(0) lgkmcnt(0)", then
;     "s_waitcnt_vscnt null, 0x0", then buffer_gl0_inv on consecutive lines;
;   * GFX10CU carries only negative checks for the waits and for buffer_gl0_inv;
;   * GFX68 carries a negative check for a line ending in "s_waitcnt vmcnt(0)";
;   * no ATOMIC_FENCE text may appear before s_endpgm;
;   * after s_endpgm, the GFX10 family inspects the .amdhsa_kernel block.
; Prefix definitions live in this file's run lines, outside this excerpt.
; FUNC-LABEL: {{^}}workgroup_acq_rel:
; GCN: %bb.0
; GFX68-NOT: s_waitcnt vmcnt(0){{$}}
; GFX10WGP: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
; GFX10WGP-NEXT: buffer_gl0_inv{{$}}
; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}}
; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}}
; GFX10CU-NOT: buffer_gl0_inv{{$}}
; GCN-NOT: ATOMIC_FENCE
; GCN: s_endpgm
; GFX10: .amdhsa_kernel workgroup_acq_rel
; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
; GFX10CU: .amdhsa_workgroup_processor_mode 0
; GFX10-NOT: .amdhsa_memory_ordered 0
define amdgpu_kernel void @workgroup_acq_rel() {
entry:
; The fence is the only operation in the kernel, so everything checked above
; comes from lowering it.
fence syncscope("workgroup") acq_rel
ret void
}
; Test case - fence with syncscope("workgroup") and seq_cst ordering.
; The assertions have the same shape as those written for workgroup_acq_rel -
;   * GFX10WGP expects "s_waitcnt vmcnt(0) lgkmcnt(0)", then
;     "s_waitcnt_vscnt null, 0x0", then buffer_gl0_inv on consecutive lines;
;   * GFX10CU carries only negative checks for the waits and for buffer_gl0_inv;
;   * GFX68 carries a negative check for a line ending in "s_waitcnt vmcnt(0)";
;   * no ATOMIC_FENCE text may appear before s_endpgm;
;   * after s_endpgm, the GFX10 family inspects the .amdhsa_kernel block.
; Prefix definitions live in this file's run lines, outside this excerpt.
; FUNC-LABEL: {{^}}workgroup_seq_cst:
; GCN: %bb.0
; GFX68-NOT: s_waitcnt vmcnt(0){{$}}
; GFX10WGP: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
; GFX10WGP-NEXT: buffer_gl0_inv{{$}}
; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}}
; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}}
; GFX10CU-NOT: buffer_gl0_inv{{$}}
; GCN-NOT: ATOMIC_FENCE
; GCN: s_endpgm
; GFX10: .amdhsa_kernel workgroup_seq_cst
; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
; GFX10CU: .amdhsa_workgroup_processor_mode 0
; GFX10-NOT: .amdhsa_memory_ordered 0
define amdgpu_kernel void @workgroup_seq_cst() {
entry:
; The fence is the only operation in the kernel, so everything checked above
; comes from lowering it.
fence syncscope("workgroup") seq_cst
ret void
}
; Test case - fence with syncscope("wavefront") and acquire ordering.
; No wait or cache-invalidate instruction is asserted for this scope. The checks
; only require that no ATOMIC_FENCE text appears between the entry block and
; s_endpgm, and then inspect the .amdhsa_kernel block for the GFX10 family.
; Prefix definitions live in this file's run lines, outside this excerpt.
; FUNC-LABEL: {{^}}wavefront_acquire:
; GCN: %bb.0
; GCN-NOT: ATOMIC_FENCE
; GCN: s_endpgm
; GFX10: .amdhsa_kernel wavefront_acquire
; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
; GFX10CU: .amdhsa_workgroup_processor_mode 0
; GFX10-NOT: .amdhsa_memory_ordered 0
define amdgpu_kernel void @wavefront_acquire() {
entry:
; The fence is the only operation in the kernel.
fence syncscope("wavefront") acquire
ret void
}
; Test case - fence with syncscope("wavefront") and release ordering.
; No wait or cache-invalidate instruction is asserted for this scope. The checks
; only require that no ATOMIC_FENCE text appears between the entry block and
; s_endpgm, and then inspect the .amdhsa_kernel block for the GFX10 family.
; Prefix definitions live in this file's run lines, outside this excerpt.
; FUNC-LABEL: {{^}}wavefront_release:
; GCN: %bb.0
; GCN-NOT: ATOMIC_FENCE
; GCN: s_endpgm
; GFX10: .amdhsa_kernel wavefront_release
; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
; GFX10CU: .amdhsa_workgroup_processor_mode 0
; GFX10-NOT: .amdhsa_memory_ordered 0
define amdgpu_kernel void @wavefront_release() {
entry:
; The fence is the only operation in the kernel.
fence syncscope("wavefront") release
ret void
}
; Test case - fence with syncscope("wavefront") and acq_rel ordering.
; No wait or cache-invalidate instruction is asserted for this scope. The checks
; only require that no ATOMIC_FENCE text appears between the entry block and
; s_endpgm, and then inspect the .amdhsa_kernel block for the GFX10 family.
; Prefix definitions live in this file's run lines, outside this excerpt.
; FUNC-LABEL: {{^}}wavefront_acq_rel:
; GCN: %bb.0
; GCN-NOT: ATOMIC_FENCE
; GCN: s_endpgm
; GFX10: .amdhsa_kernel wavefront_acq_rel
; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
; GFX10CU: .amdhsa_workgroup_processor_mode 0
; GFX10-NOT: .amdhsa_memory_ordered 0
define amdgpu_kernel void @wavefront_acq_rel() {
entry:
; The fence is the only operation in the kernel.
fence syncscope("wavefront") acq_rel
ret void
}
; Test case - fence with syncscope("wavefront") and seq_cst ordering.
; No wait or cache-invalidate instruction is asserted for this scope. The checks
; only require that no ATOMIC_FENCE text appears between the entry block and
; s_endpgm, and then inspect the .amdhsa_kernel block for the GFX10 family.
; Prefix definitions live in this file's run lines, outside this excerpt.
; FUNC-LABEL: {{^}}wavefront_seq_cst:
; GCN: %bb.0
; GCN-NOT: ATOMIC_FENCE
; GCN: s_endpgm
; GFX10: .amdhsa_kernel wavefront_seq_cst
; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
; GFX10CU: .amdhsa_workgroup_processor_mode 0
; GFX10-NOT: .amdhsa_memory_ordered 0
define amdgpu_kernel void @wavefront_seq_cst() {
entry:
; The fence is the only operation in the kernel.
fence syncscope("wavefront") seq_cst
ret void
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,260 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX7 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
; Kernel under test - an i32 load tagged !nontemporal through an unqualified
; (address space 0) pointer, followed by an ordinary store of the loaded value.
; Per the autogenerated assertions, the load is expected as flat_load_dword with
; "glc slc" under the GFX7 and SKIP-CACHE-INV prefixes and with "slc" alone under
; both GFX10 prefixes; the flat_store_dword carries no cache-policy modifier.
; The assertion lines sit between the opening parenthesis and the parameter
; list because that is where the update script places them.
define amdgpu_kernel void @flat_nontemporal_load_0(
; GFX7-LABEL: flat_nontemporal_load_0:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dword v0, v[0:1] glc slc
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[2:3], v0
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_nontemporal_load_0:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] slc
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_nontemporal_load_0:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] slc
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_nontemporal_load_0:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] glc slc
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0
; SKIP-CACHE-INV-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
; Operation under test - the load carries the !nontemporal attachment.
%val = load i32, i32* %in, align 4, !nontemporal !0
; Ordinary store, present so the loaded value is used.
store i32 %val, i32* %out
ret void
}
; Kernel under test - same as the _0 load variant, except the !nontemporal load
; address is %in indexed by the result of llvm.amdgcn.workitem.id.x, so the
; expected output computes the address with vector adds.
; Per the autogenerated assertions, the load is expected as flat_load_dword with
; "glc slc" under the GFX7 and SKIP-CACHE-INV prefixes and with "slc" alone under
; both GFX10 prefixes; the flat_store_dword carries no cache-policy modifier.
define amdgpu_kernel void @flat_nontemporal_load_1(
; GFX7-LABEL: flat_nontemporal_load_1:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_add_i32_e32 v2, vcc, s0, v2
; GFX7-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX7-NEXT: flat_load_dword v2, v[2:3] glc slc
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_nontemporal_load_1:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-WGP-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_add_co_u32_e64 v0, s0, s0, v0
; GFX10-WGP-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0
; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] slc
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_nontemporal_load_1:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-CU-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_add_co_u32_e64 v0, s0, s0, v0
; GFX10-CU-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0
; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] slc
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_nontemporal_load_1:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
; SKIP-CACHE-INV-NEXT: v_add_i32_e32 v2, vcc, s0, v2
; SKIP-CACHE-INV-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[2:3] glc slc
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
; The intrinsic result is used as the element index into %in.
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%val.gep = getelementptr inbounds i32, i32* %in, i32 %tid
; Operation under test - the load carries the !nontemporal attachment.
%val = load i32, i32* %val.gep, align 4, !nontemporal !0
; Ordinary store, present so the loaded value is used.
store i32 %val, i32* %out
ret void
}
; Kernel under test - an ordinary i32 load through an unqualified (address
; space 0) pointer, followed by a store tagged !nontemporal.
; Per the autogenerated assertions, the store is expected as flat_store_dword
; with "glc slc" under the GFX7 and SKIP-CACHE-INV prefixes and with "slc" alone
; under both GFX10 prefixes; the flat_load_dword carries no cache-policy modifier.
define amdgpu_kernel void @flat_nontemporal_store_0(
; GFX7-LABEL: flat_nontemporal_store_0:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dword v0, v[0:1]
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[2:3], v0 glc slc
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_nontemporal_store_0:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1]
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 slc
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_nontemporal_store_0:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: flat_load_dword v2, v[0:1]
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 slc
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_nontemporal_store_0:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1]
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 glc slc
; SKIP-CACHE-INV-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
; Ordinary load that supplies the value to be stored.
%val = load i32, i32* %in, align 4
; Operation under test - the store carries the !nontemporal attachment.
store i32 %val, i32* %out, !nontemporal !0
ret void
}
; Kernel under test - same as the _0 store variant, except the !nontemporal
; store address is %out indexed by the result of llvm.amdgcn.workitem.id.x.
; Per the autogenerated assertions, the store is expected as flat_store_dword
; with "glc slc" under the GFX7 and SKIP-CACHE-INV prefixes and with "slc" alone
; under both GFX10 prefixes; the flat_load_dword carries no cache-policy modifier.
define amdgpu_kernel void @flat_nontemporal_store_1(
; GFX7-LABEL: flat_nontemporal_store_1:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v1, s0
; GFX7-NEXT: v_mov_b32_e32 v2, s1
; GFX7-NEXT: flat_load_dword v2, v[1:2]
; GFX7-NEXT: v_mov_b32_e32 v1, s3
; GFX7-NEXT: v_add_i32_e32 v0, vcc, s2, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2 glc slc
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_nontemporal_store_1:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-WGP-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1
; GFX10-WGP-NEXT: v_add_co_u32_e64 v0, s0, s2, v0
; GFX10-WGP-NEXT: flat_load_dword v2, v[1:2]
; GFX10-WGP-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 slc
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_nontemporal_store_1:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-CU-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1
; GFX10-CU-NEXT: v_add_co_u32_e64 v0, s0, s2, v0
; GFX10-CU-NEXT: flat_load_dword v2, v[1:2]
; GFX10-CU-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 slc
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_nontemporal_store_1:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[1:2]
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: v_add_i32_e32 v0, vcc, s2, v0
; SKIP-CACHE-INV-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 glc slc
; SKIP-CACHE-INV-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
; The intrinsic result is used as the element index into %out.
%tid = call i32 @llvm.amdgcn.workitem.id.x()
; Ordinary load that supplies the value to be stored.
%val = load i32, i32* %in, align 4
%out.gep = getelementptr inbounds i32, i32* %out, i32 %tid
; Operation under test - the store carries the !nontemporal attachment.
store i32 %val, i32* %out.gep, !nontemporal !0
ret void
}
; Metadata node referenced by every !nontemporal attachment in this file; the
; LLVM language reference requires that node to hold a single i32 of value 1.
!0 = !{i32 1}
; Intrinsic whose result the *_1 kernels use as a getelementptr index.
declare i32 @llvm.amdgcn.workitem.id.x()

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,302 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX6 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX7 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
; Kernel under test - an i32 load tagged !nontemporal from an addrspace(1)
; pointer taken directly from the kernel arguments, followed by an ordinary
; store.
; NOTE(review) - under every prefix the assertions expect this load as a scalar
; s_load_dword with no glc/slc modifier, so the nontemporal hint is not visible
; anywhere in the expected output of this kernel. The indexed _1 variant is the
; one whose expected load carries cache-policy bits. Confirm that recording the
; scalar form here is intentional.
define amdgpu_kernel void @global_nontemporal_load_0(
; GFX6-LABEL: global_nontemporal_load_0:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s6, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX6-NEXT: s_mov_b32 s4, s2
; GFX6-NEXT: s_mov_b32 s5, s3
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v0, s0
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: global_nontemporal_load_0:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: global_nontemporal_load_0:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0
; GFX10-WGP-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_nontemporal_load_0:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX10-CU-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_nontemporal_load_0:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x0
; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s2
; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s3
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
; Operation under test - the load carries the !nontemporal attachment.
%val = load i32, i32 addrspace(1)* %in, align 4, !nontemporal !0
; Ordinary store, present so the loaded value is used.
store i32 %val, i32 addrspace(1)* %out
ret void
}
; Kernel under test - an i32 load tagged !nontemporal from an addrspace(1)
; pointer indexed by the result of llvm.amdgcn.workitem.id.x, followed by an
; ordinary store.
; Per the autogenerated assertions, the load is expected as
;   * buffer_load_dword ... addr64 glc slc  under the GFX6 and SKIP-CACHE-INV prefixes,
;   * flat_load_dword ... glc slc           under the GFX7 prefix,
;   * global_load_dword ... slc             under both GFX10 prefixes,
; and the store carries no cache-policy modifier under any prefix.
define amdgpu_kernel void @global_nontemporal_load_1(
; GFX6-LABEL: global_nontemporal_load_1:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX6-NEXT: v_mov_b32_e32 v1, 0
; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s0, s6
; GFX6-NEXT: s_mov_b32 s1, s7
; GFX6-NEXT: s_mov_b32 s6, 0
; GFX6-NEXT: s_mov_b32 s7, s3
; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 glc slc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: global_nontemporal_load_1:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_add_i32_e32 v2, vcc, s0, v2
; GFX7-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX7-NEXT: flat_load_dword v2, v[2:3] glc slc
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: global_nontemporal_load_1:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-WGP-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, 0
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: global_load_dword v0, v0, s[0:1] slc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_nontemporal_load_1:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-CU-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, 0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: global_load_dword v0, v0, s[0:1] slc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_nontemporal_load_1:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000
; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, 0
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s6
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s7
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3
; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 glc slc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
; The intrinsic result is used as the element index into %in.
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%val.gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %tid
; Operation under test - the load carries the !nontemporal attachment.
%val = load i32, i32 addrspace(1)* %val.gep, align 4, !nontemporal !0
; Ordinary store, present so the loaded value is used.
store i32 %val, i32 addrspace(1)* %out
ret void
}
; Kernel under test - an ordinary i32 load from an addrspace(1) pointer,
; followed by a store tagged !nontemporal to an addrspace(1) pointer.
; Per the autogenerated assertions, the value is fetched with a scalar
; s_load_dword and the store is expected as
;   * buffer_store_dword ... glc slc  under the GFX6 and SKIP-CACHE-INV prefixes,
;   * flat_store_dword ... glc slc    under the GFX7 prefix,
;   * global_store_dword ... slc      under both GFX10 prefixes.
define amdgpu_kernel void @global_nontemporal_store_0(
; GFX6-LABEL: global_nontemporal_store_0:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s6, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX6-NEXT: s_mov_b32 s4, s2
; GFX6-NEXT: s_mov_b32 s5, s3
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v0, s0
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 glc slc
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: global_nontemporal_store_0:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: flat_store_dword v[0:1], v2 glc slc
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: global_nontemporal_store_0:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0
; GFX10-WGP-NEXT: global_store_dword v0, v1, s[2:3] slc
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_nontemporal_store_0:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX10-CU-NEXT: global_store_dword v0, v1, s[2:3] slc
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_nontemporal_store_0:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x0
; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s2
; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s3
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 glc slc
; SKIP-CACHE-INV-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
; Ordinary load that supplies the value to be stored.
%val = load i32, i32 addrspace(1)* %in, align 4
; Operation under test - the store carries the !nontemporal attachment.
store i32 %val, i32 addrspace(1)* %out, !nontemporal !0
ret void
}
; Kernel under test - same as the _0 store variant, except the !nontemporal
; store address is %out indexed by the result of llvm.amdgcn.workitem.id.x.
; Per the autogenerated assertions, the value is fetched with a scalar
; s_load_dword and the store is expected as
;   * buffer_store_dword ... addr64 glc slc  under the GFX6 and SKIP-CACHE-INV prefixes,
;   * flat_store_dword ... glc slc           under the GFX7 prefix,
;   * global_store_dword ... slc             under both GFX10 prefixes.
define amdgpu_kernel void @global_nontemporal_store_1(
; GFX6-LABEL: global_nontemporal_store_1:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s6, 0
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX6-NEXT: v_mov_b32_e32 v1, 0
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v2, s0
; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 glc slc
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: global_nontemporal_store_1:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX7-NEXT: v_mov_b32_e32 v1, s3
; GFX7-NEXT: v_add_i32_e32 v0, vcc, s2, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: flat_store_dword v[0:1], v2 glc slc
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: global_nontemporal_store_1:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-WGP-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0
; GFX10-WGP-NEXT: global_store_dword v0, v1, s[2:3] slc
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_nontemporal_store_1:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-CU-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX10-CU-NEXT: global_store_dword v0, v1, s[2:3] slc
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_nontemporal_store_1:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0
; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, 0
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x0
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3]
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 glc slc
; SKIP-CACHE-INV-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
; The intrinsic result is used as the element index into %out.
%tid = call i32 @llvm.amdgcn.workitem.id.x()
; Ordinary load that supplies the value to be stored.
%val = load i32, i32 addrspace(1)* %in, align 4
%out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid
; Operation under test - the store carries the !nontemporal attachment.
store i32 %val, i32 addrspace(1)* %out.gep, !nontemporal !0
ret void
}
; Metadata node referenced by every !nontemporal attachment in this file; the
; LLVM language reference requires that node to hold a single i32 of value 1.
!0 = !{i32 1}
; Intrinsic whose result the *_1 kernels use as a getelementptr index.
declare i32 @llvm.amdgcn.workitem.id.x()

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,313 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX6 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX7 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
; Nontemporal load of an i32 through the addrspace(3) pointer %in; the value is
; then written through the addrspace(1) pointer %out with an ordinary store.
; In all five run configurations the expected load is a bare `ds_read_b32`
; with no cache-policy modifier on it.
define amdgpu_kernel void @local_nontemporal_load_0(
; GFX6-LABEL: local_nontemporal_load_0:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; GFX6-NEXT: s_mov_b32 m0, -1
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v0, s4
; GFX6-NEXT: ds_read_b32 v0, v0
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: local_nontemporal_load_0:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: ds_read_b32 v2, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: local_nontemporal_load_0:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_clause 0x1
; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, 0
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX10-WGP-NEXT: ds_read_b32 v0, v0
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: local_nontemporal_load_0:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_clause 0x1
; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX10-CU-NEXT: v_mov_b32_e32 v1, 0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX10-CU-NEXT: ds_read_b32 v0, v0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: local_nontemporal_load_0:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT: ds_read_b32 v0, v0
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(1)* %out) {
entry:
; Access under test: the load carries the !nontemporal attachment.
%val = load i32, i32 addrspace(3)* %in, align 4, !nontemporal !0
; Ordinary store of the loaded value (no !nontemporal here).
store i32 %val, i32 addrspace(1)* %out
ret void
}
; Nontemporal load of an i32 from addrspace(3), where the address is %in
; indexed by the workitem id (the expected code scales the id by 4 via a
; left shift of 2). The value is stored through the addrspace(1) pointer %out
; with an ordinary store. As in every run configuration below, the expected
; load is a bare `ds_read_b32` with no cache-policy modifier.
define amdgpu_kernel void @local_nontemporal_load_1(
; GFX6-LABEL: local_nontemporal_load_1:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX6-NEXT: s_mov_b32 m0, -1
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v0
; GFX6-NEXT: ds_read_b32 v0, v0
; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: local_nontemporal_load_1:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v0, vcc, s2, v0
; GFX7-NEXT: ds_read_b32 v2, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: local_nontemporal_load_1:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_clause 0x1
; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, 0
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s2
; GFX10-WGP-NEXT: ds_read_b32 v0, v0
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: local_nontemporal_load_1:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_clause 0x1
; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX10-CU-NEXT: v_mov_b32_e32 v1, 0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s2
; GFX10-CU-NEXT: ds_read_b32 v0, v0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: local_nontemporal_load_1:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_add_i32_e32 v0, vcc, s4, v0
; SKIP-CACHE-INV-NEXT: ds_read_b32 v0, v0
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(1)* %out) {
entry:
; Per-workitem element address: %in[%tid].
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%val.gep = getelementptr inbounds i32, i32 addrspace(3)* %in, i32 %tid
; Access under test: the load carries the !nontemporal attachment.
%val = load i32, i32 addrspace(3)* %val.gep, align 4, !nontemporal !0
; Ordinary store of the loaded value (no !nontemporal here).
store i32 %val, i32 addrspace(1)* %out
ret void
}
; Ordinary load of an i32 through the addrspace(1) pointer %in, followed by a
; nontemporal store of that value through the addrspace(3) pointer %out.
; In all five run configurations the expected store is a bare `ds_write_b32`
; with no cache-policy modifier on it.
define amdgpu_kernel void @local_nontemporal_store_0(
; GFX6-LABEL: local_nontemporal_store_0:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb
; GFX6-NEXT: s_mov_b32 m0, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_load_dword s1, s[2:3], 0x0
; GFX6-NEXT: v_mov_b32_e32 v0, s0
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: ds_write_b32 v0, v1
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: local_nontemporal_store_0:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v1, s0
; GFX7-NEXT: ds_write_b32 v0, v1
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: local_nontemporal_store_0:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_clause 0x1
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0
; GFX10-WGP-NEXT: ds_write_b32 v0, v1
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: local_nontemporal_store_0:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_clause 0x1
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX10-CU-NEXT: ds_write_b32 v0, v1
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: local_nontemporal_store_0:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb
; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1
; SKIP-CACHE-INV-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(3)* %out) {
entry:
; Ordinary load that supplies the value to be stored (no !nontemporal here).
%val = load i32, i32 addrspace(1)* %in, align 4
; Access under test: the store carries the !nontemporal attachment.
store i32 %val, i32 addrspace(3)* %out, !nontemporal !0
ret void
}
; Ordinary load of an i32 through the addrspace(1) pointer %in, followed by a
; nontemporal store to addrspace(3) at %out indexed by the workitem id (the
; expected code scales the id by 4 via a left shift of 2). In all five run
; configurations the expected store is a bare `ds_write_b32` with no
; cache-policy modifier on it.
define amdgpu_kernel void @local_nontemporal_store_1(
; GFX6-LABEL: local_nontemporal_store_1:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX6-NEXT: s_mov_b32 m0, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_load_dword s1, s[2:3], 0x0
; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: ds_write_b32 v0, v1
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: local_nontemporal_store_1:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX7-NEXT: v_add_i32_e32 v0, vcc, s2, v0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v1, s0
; GFX7-NEXT: ds_write_b32 v0, v1
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: local_nontemporal_store_1:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_clause 0x1
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX10-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s2
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0
; GFX10-WGP-NEXT: ds_write_b32 v0, v1
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: local_nontemporal_store_1:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_clause 0x1
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX10-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s2
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX10-CU-NEXT: ds_write_b32 v0, v1
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: local_nontemporal_store_1:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb
; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: v_add_i32_e32 v0, vcc, s0, v0
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1
; SKIP-CACHE-INV-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(3)* %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
; Ordinary load that supplies the value to be stored (no !nontemporal here).
%val = load i32, i32 addrspace(1)* %in, align 4
; Per-workitem destination address: %out[%tid].
%out.gep = getelementptr inbounds i32, i32 addrspace(3)* %out, i32 %tid
; Access under test: the store carries the !nontemporal attachment.
store i32 %val, i32 addrspace(3)* %out.gep, !nontemporal !0
ret void
}
; Metadata node referenced by the `!nontemporal !0` attachments in the tests above.
!0 = !{i32 1}
; Declaration of the intrinsic called (as %tid) by the indexed tests above.
declare i32 @llvm.amdgcn.workitem.id.x()

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1,440 +0,0 @@
; RUN: llc -mtriple=amdgcn--mesa3d -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN %s
; RUN: llc -mtriple=amdgcn--mesa3d -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN %s
; RUN: llc -mtriple=amdgcn--mesa3d -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN %s
; RUN: llc -mtriple=amdgcn--mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN %s
; FUNC-LABEL: {{^}}system_one_as_acquire:
; GCN: %bb.0
; GCN-NOT: ATOMIC_FENCE
; GCN: s_waitcnt vmcnt(0){{$}}
; GCN-NEXT: buffer_wbinvl1{{$}}
; GCN: s_endpgm
; Kernel whose only instruction besides the return is an acquire fence at
; syncscope("one-as"). The checks above require no ATOMIC_FENCE text before an
; `s_waitcnt vmcnt(0)` line, with `buffer_wbinvl1` on the very next line.
define amdgpu_kernel void @system_one_as_acquire() {
entry:
fence syncscope("one-as") acquire
ret void
}
; FUNC-LABEL: {{^}}system_one_as_release:
; GCN: %bb.0
; GCN-NOT: ATOMIC_FENCE
; GCN: s_waitcnt vmcnt(0){{$}}
; GCN: s_endpgm
; Kernel whose only instruction besides the return is a release fence at
; syncscope("one-as"). The checks above require no ATOMIC_FENCE text before an
; `s_waitcnt vmcnt(0)` line; no `buffer_wbinvl1` is checked for.
define amdgpu_kernel void @system_one_as_release() {
entry:
fence syncscope("one-as") release
ret void
}
; FUNC-LABEL: {{^}}system_one_as_acq_rel:
; GCN: %bb.0
; GCN-NOT: ATOMIC_FENCE
; GCN: s_waitcnt vmcnt(0){{$}}
; GCN: buffer_wbinvl1{{$}}
; GCN: s_endpgm
; Kernel whose only instruction besides the return is an acq_rel fence at
; syncscope("one-as"). The checks above require no ATOMIC_FENCE text before an
; `s_waitcnt vmcnt(0)` line, and a `buffer_wbinvl1` somewhere after it (not
; necessarily adjacent).
define amdgpu_kernel void @system_one_as_acq_rel() {
entry:
fence syncscope("one-as") acq_rel
ret void
}
; FUNC-LABEL: {{^}}system_one_as_seq_cst:
; GCN: %bb.0
; GCN-NOT: ATOMIC_FENCE
; GCN: s_waitcnt vmcnt(0){{$}}
; GCN: buffer_wbinvl1{{$}}
; GCN: s_endpgm
; Kernel whose only instruction besides the return is a seq_cst fence at
; syncscope("one-as"). The checks above require no ATOMIC_FENCE text before an
; `s_waitcnt vmcnt(0)` line, and a `buffer_wbinvl1` somewhere after it (not
; necessarily adjacent).
define amdgpu_kernel void @system_one_as_seq_cst() {
entry:
fence syncscope("one-as") seq_cst
ret void
}
; FUNC-LABEL: {{^}}singlethread_one_as_acquire:
; GCN: %bb.0
; GCN-NOT: ATOMIC_FENCE
; GCN: s_endpgm
; Kernel whose only instruction besides the return is an acquire fence at
; syncscope("singlethread-one-as"). The checks above only require that no
; ATOMIC_FENCE text appears between the %bb.0 marker and s_endpgm.
define amdgpu_kernel void @singlethread_one_as_acquire() {
entry:
fence syncscope("singlethread-one-as") acquire
ret void
}
; FUNC-LABEL: {{^}}singlethread_one_as_release:
; GCN: %bb.0
; GCN-NOT: ATOMIC_FENCE
; GCN: s_endpgm
; Kernel whose only instruction besides the return is a release fence at
; syncscope("singlethread-one-as"). The checks above only require that no
; ATOMIC_FENCE text appears between the %bb.0 marker and s_endpgm.
define amdgpu_kernel void @singlethread_one_as_release() {
entry:
fence syncscope("singlethread-one-as") release
ret void
}
; FUNC-LABEL: {{^}}singlethread_one_as_acq_rel:
; GCN: %bb.0
; GCN-NOT: ATOMIC_FENCE
; GCN: s_endpgm
; Kernel whose only instruction besides the return is an acq_rel fence at
; syncscope("singlethread-one-as"). The checks above only require that no
; ATOMIC_FENCE text appears between the %bb.0 marker and s_endpgm.
define amdgpu_kernel void @singlethread_one_as_acq_rel() {
entry:
fence syncscope("singlethread-one-as") acq_rel
ret void
}
; FUNC-LABEL: {{^}}singlethread_one_as_seq_cst:
; GCN: %bb.0
; GCN-NOT: ATOMIC_FENCE
; GCN: s_endpgm
; Kernel whose only instruction besides the return is a seq_cst fence at
; syncscope("singlethread-one-as"). The checks above only require that no
; ATOMIC_FENCE text appears between the %bb.0 marker and s_endpgm.
define amdgpu_kernel void @singlethread_one_as_seq_cst() {
entry:
fence syncscope("singlethread-one-as") seq_cst
ret void
}
; FUNC-LABEL: {{^}}agent_one_as_acquire:
; GCN: %bb.0
; GCN-NOT: ATOMIC_FENCE
; GCN: s_waitcnt vmcnt(0){{$}}
; GCN-NEXT: buffer_wbinvl1{{$}}
; GCN: s_endpgm
; Kernel whose only instruction besides the return is an acquire fence at
; syncscope("agent-one-as"). The checks above require no ATOMIC_FENCE text
; before an `s_waitcnt vmcnt(0)` line, with `buffer_wbinvl1` on the very next
; line.
define amdgpu_kernel void @agent_one_as_acquire() {
entry:
fence syncscope("agent-one-as") acquire
ret void
}
; FUNC-LABEL: {{^}}agent_one_as_release:
; GCN: %bb.0
; GCN-NOT: ATOMIC_FENCE
; GCN: s_waitcnt vmcnt(0){{$}}
; GCN: s_endpgm
; Kernel whose only instruction besides the return is a release fence at
; syncscope("agent-one-as"). The checks above require no ATOMIC_FENCE text
; before an `s_waitcnt vmcnt(0)` line; no `buffer_wbinvl1` is checked for.
define amdgpu_kernel void @agent_one_as_release() {
entry:
fence syncscope("agent-one-as") release
ret void
}
; FUNC-LABEL: {{^}}agent_one_as_acq_rel:
; GCN: %bb.0
; GCN-NOT: ATOMIC_FENCE
; GCN: s_waitcnt vmcnt(0){{$}}
; GCN: buffer_wbinvl1{{$}}
; GCN: s_endpgm
; Kernel whose only instruction besides the return is an acq_rel fence at
; syncscope("agent-one-as"). The checks above require no ATOMIC_FENCE text
; before an `s_waitcnt vmcnt(0)` line, and a `buffer_wbinvl1` somewhere after
; it (not necessarily adjacent).
define amdgpu_kernel void @agent_one_as_acq_rel() {
entry:
fence syncscope("agent-one-as") acq_rel
ret void
}
; FUNC-LABEL: {{^}}agent_one_as_seq_cst:
; GCN: %bb.0
; GCN-NOT: ATOMIC_FENCE
; GCN: s_waitcnt vmcnt(0){{$}}
; GCN: buffer_wbinvl1{{$}}
; GCN: s_endpgm
; Kernel whose only instruction besides the return is a seq_cst fence at
; syncscope("agent-one-as"). The checks above require no ATOMIC_FENCE text
; before an `s_waitcnt vmcnt(0)` line, and a `buffer_wbinvl1` somewhere after
; it (not necessarily adjacent).
define amdgpu_kernel void @agent_one_as_seq_cst() {
entry:
fence syncscope("agent-one-as") seq_cst
ret void
}
; FUNC-LABEL: {{^}}workgroup_one_as_acquire:
; GCN: %bb.0
; GCN-NOT: s_waitcnt vmcnt(0){{$}}
; GCN-NOT: ATOMIC_FENCE
; GCN: s_endpgm
; Kernel whose only instruction besides the return is an acquire fence at
; syncscope("workgroup-one-as"). The checks above forbid both an
; `s_waitcnt vmcnt(0)` line and any ATOMIC_FENCE text between the %bb.0 marker
; and s_endpgm.
define amdgpu_kernel void @workgroup_one_as_acquire() {
entry:
fence syncscope("workgroup-one-as") acquire
ret void
}
; FUNC-LABEL: {{^}}workgroup_one_as_release:
; GCN: %bb.0
; GCN-NOT: s_waitcnt vmcnt(0){{$}}
; GCN-NOT: ATOMIC_FENCE
; GCN: s_endpgm
; Kernel whose only instruction besides the return is a release fence at
; syncscope("workgroup-one-as"). The checks above forbid both an
; `s_waitcnt vmcnt(0)` line and any ATOMIC_FENCE text between the %bb.0 marker
; and s_endpgm.
define amdgpu_kernel void @workgroup_one_as_release() {
entry:
fence syncscope("workgroup-one-as") release
ret void
}
; FUNC-LABEL: {{^}}workgroup_one_as_acq_rel:
; GCN: %bb.0
; GCN-NOT: s_waitcnt vmcnt(0){{$}}
; GCN-NOT: ATOMIC_FENCE
; GCN: s_endpgm
; Kernel whose only instruction besides the return is an acq_rel fence at
; syncscope("workgroup-one-as"). The checks above forbid both an
; `s_waitcnt vmcnt(0)` line and any ATOMIC_FENCE text between the %bb.0 marker
; and s_endpgm.
define amdgpu_kernel void @workgroup_one_as_acq_rel() {
entry:
fence syncscope("workgroup-one-as") acq_rel
ret void
}
; FUNC-LABEL: {{^}}workgroup_one_as_seq_cst:
; GCN: %bb.0
; GCN-NOT: s_waitcnt vmcnt(0){{$}}
; GCN-NOT: ATOMIC_FENCE
; GCN: s_endpgm
; Kernel whose only instruction besides the return is a seq_cst fence at
; syncscope("workgroup-one-as"). The checks above forbid both an
; `s_waitcnt vmcnt(0)` line and any ATOMIC_FENCE text between the %bb.0 marker
; and s_endpgm.
define amdgpu_kernel void @workgroup_one_as_seq_cst() {
entry:
fence syncscope("workgroup-one-as") seq_cst
ret void
}
; FUNC-LABEL: {{^}}wavefront_one_as_acquire:
; GCN: %bb.0
; GCN-NOT: ATOMIC_FENCE
; GCN: s_endpgm
; Kernel whose only instruction besides the return is an acquire fence at
; syncscope("wavefront-one-as"). The checks above only require that no
; ATOMIC_FENCE text appears between the %bb.0 marker and s_endpgm.
define amdgpu_kernel void @wavefront_one_as_acquire() {
entry:
fence syncscope("wavefront-one-as") acquire
ret void
}
; FUNC-LABEL: {{^}}wavefront_one_as_release:
; GCN: %bb.0
; GCN-NOT: ATOMIC_FENCE
; GCN: s_endpgm
; Kernel whose only instruction besides the return is a release fence at
; syncscope("wavefront-one-as"). The checks above only require that no
; ATOMIC_FENCE text appears between the %bb.0 marker and s_endpgm.
define amdgpu_kernel void @wavefront_one_as_release() {
entry:
fence syncscope("wavefront-one-as") release
ret void
}
; FUNC-LABEL: {{^}}wavefront_one_as_acq_rel:
; GCN: %bb.0
; GCN-NOT: ATOMIC_FENCE
; GCN: s_endpgm
; Kernel whose only instruction besides the return is an acq_rel fence at
; syncscope("wavefront-one-as"). The checks above only require that no
; ATOMIC_FENCE text appears between the %bb.0 marker and s_endpgm.
define amdgpu_kernel void @wavefront_one_as_acq_rel() {
entry:
fence syncscope("wavefront-one-as") acq_rel
ret void
}
; FUNC-LABEL: {{^}}wavefront_one_as_seq_cst:
; GCN: %bb.0
; GCN-NOT: ATOMIC_FENCE
; GCN: s_endpgm
; Kernel whose only instruction besides the return is a seq_cst fence at
; syncscope("wavefront-one-as"). The checks above only require that no
; ATOMIC_FENCE text appears between the %bb.0 marker and s_endpgm.
define amdgpu_kernel void @wavefront_one_as_seq_cst() {
entry:
fence syncscope("wavefront-one-as") seq_cst
ret void
}
; FUNC-LABEL: {{^}}system_acquire:
; GCN: %bb.0
; GCN-NOT: ATOMIC_FENCE
; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GCN-NEXT: buffer_wbinvl1{{$}}
; GCN: s_endpgm
; Kernel whose only instruction besides the return is an acquire fence with no
; explicit syncscope (the test name labels this the system case). The checks
; above require no ATOMIC_FENCE text before an `s_waitcnt vmcnt(0) lgkmcnt(0)`
; line, with `buffer_wbinvl1` on the very next line.
define amdgpu_kernel void @system_acquire() {
entry:
fence acquire
ret void
}
; FUNC-LABEL: {{^}}system_release:
; GCN: %bb.0
; GCN-NOT: ATOMIC_FENCE
; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GCN: s_endpgm
; Kernel whose only instruction besides the return is a release fence with no
; explicit syncscope (the test name labels this the system case). The checks
; above require no ATOMIC_FENCE text before an `s_waitcnt vmcnt(0) lgkmcnt(0)`
; line; no `buffer_wbinvl1` is checked for.
define amdgpu_kernel void @system_release() {
entry:
fence release
ret void
}
; FUNC-LABEL: {{^}}system_acq_rel:
; GCN: %bb.0
; GCN-NOT: ATOMIC_FENCE
; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GCN: buffer_wbinvl1{{$}}
; GCN: s_endpgm
; Kernel whose only instruction besides the return is an acq_rel fence with no
; explicit syncscope (the test name labels this the system case). The checks
; above require no ATOMIC_FENCE text before an `s_waitcnt vmcnt(0) lgkmcnt(0)`
; line, and a `buffer_wbinvl1` somewhere after it (not necessarily adjacent).
define amdgpu_kernel void @system_acq_rel() {
entry:
fence acq_rel
ret void
}
; FUNC-LABEL: {{^}}system_seq_cst:
; GCN: %bb.0
; GCN-NOT: ATOMIC_FENCE
; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GCN: buffer_wbinvl1{{$}}
; GCN: s_endpgm
; Kernel whose only instruction besides the return is a seq_cst fence with no
; explicit syncscope (the test name labels this the system case). The checks
; above require no ATOMIC_FENCE text before an `s_waitcnt vmcnt(0) lgkmcnt(0)`
; line, and a `buffer_wbinvl1` somewhere after it (not necessarily adjacent).
define amdgpu_kernel void @system_seq_cst() {
entry:
fence seq_cst
ret void
}
; FUNC-LABEL: {{^}}singlethread_acquire:
; GCN: %bb.0
; GCN-NOT: ATOMIC_FENCE
; GCN: s_endpgm
; Kernel whose only instruction besides the return is an acquire fence at
; syncscope("singlethread"). The checks above only require that no
; ATOMIC_FENCE text appears between the %bb.0 marker and s_endpgm.
define amdgpu_kernel void @singlethread_acquire() {
entry:
fence syncscope("singlethread") acquire
ret void
}
; FUNC-LABEL: {{^}}singlethread_release:
; GCN: %bb.0
; GCN-NOT: ATOMIC_FENCE
; GCN: s_endpgm
; Kernel whose only instruction besides the return is a release fence at
; syncscope("singlethread"). The checks above only require that no
; ATOMIC_FENCE text appears between the %bb.0 marker and s_endpgm.
define amdgpu_kernel void @singlethread_release() {
entry:
fence syncscope("singlethread") release
ret void
}
; FUNC-LABEL: {{^}}singlethread_acq_rel:
; GCN: %bb.0
; GCN-NOT: ATOMIC_FENCE
; GCN: s_endpgm
; Kernel whose only instruction besides the return is an acq_rel fence at
; syncscope("singlethread"). The checks above only require that no
; ATOMIC_FENCE text appears between the %bb.0 marker and s_endpgm.
define amdgpu_kernel void @singlethread_acq_rel() {
entry:
fence syncscope("singlethread") acq_rel
ret void
}
; FUNC-LABEL: {{^}}singlethread_seq_cst:
; GCN: %bb.0
; GCN-NOT: ATOMIC_FENCE
; GCN: s_endpgm
; Kernel whose only instruction besides the return is a seq_cst fence at
; syncscope("singlethread"). The checks above only require that no
; ATOMIC_FENCE text appears between the %bb.0 marker and s_endpgm.
define amdgpu_kernel void @singlethread_seq_cst() {
entry:
fence syncscope("singlethread") seq_cst
ret void
}
; FUNC-LABEL: {{^}}agent_acquire:
; GCN: %bb.0
; GCN-NOT: ATOMIC_FENCE
; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GCN-NEXT: buffer_wbinvl1{{$}}
; GCN: s_endpgm
; Kernel whose only instruction besides the return is an acquire fence at
; syncscope("agent"). The checks above require no ATOMIC_FENCE text before an
; `s_waitcnt vmcnt(0) lgkmcnt(0)` line, with `buffer_wbinvl1` on the very next
; line.
define amdgpu_kernel void @agent_acquire() {
entry:
fence syncscope("agent") acquire
ret void
}
; FUNC-LABEL: {{^}}agent_release:
; GCN: %bb.0
; GCN-NOT: ATOMIC_FENCE
; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GCN: s_endpgm
; Kernel whose only instruction besides the return is a release fence at
; syncscope("agent"). The checks above require no ATOMIC_FENCE text before an
; `s_waitcnt vmcnt(0) lgkmcnt(0)` line; no `buffer_wbinvl1` is checked for.
define amdgpu_kernel void @agent_release() {
entry:
fence syncscope("agent") release
ret void
}
; FUNC-LABEL: {{^}}agent_acq_rel:
; GCN: %bb.0
; GCN-NOT: ATOMIC_FENCE
; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GCN: buffer_wbinvl1{{$}}
; GCN: s_endpgm
; Kernel whose only instruction besides the return is an acq_rel fence at
; syncscope("agent"). The checks above require no ATOMIC_FENCE text before an
; `s_waitcnt vmcnt(0) lgkmcnt(0)` line, and a `buffer_wbinvl1` somewhere after
; it (not necessarily adjacent).
define amdgpu_kernel void @agent_acq_rel() {
entry:
fence syncscope("agent") acq_rel
ret void
}
; FUNC-LABEL: {{^}}agent_seq_cst:
; GCN: %bb.0
; GCN-NOT: ATOMIC_FENCE
; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GCN: buffer_wbinvl1{{$}}
; GCN: s_endpgm
; Kernel whose only instruction besides the return is a seq_cst fence at
; syncscope("agent"). The checks above require no ATOMIC_FENCE text before an
; `s_waitcnt vmcnt(0) lgkmcnt(0)` line, and a `buffer_wbinvl1` somewhere after
; it (not necessarily adjacent).
define amdgpu_kernel void @agent_seq_cst() {
entry:
fence syncscope("agent") seq_cst
ret void
}
; FUNC-LABEL: {{^}}workgroup_acquire:
; GCN: %bb.0
; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GCN-NOT: ATOMIC_FENCE
; GCN: s_endpgm
; Kernel whose only instruction besides the return is an acquire fence at
; syncscope("workgroup"). The checks above forbid both an
; `s_waitcnt vmcnt(0) lgkmcnt(0)` line and any ATOMIC_FENCE text between the
; %bb.0 marker and s_endpgm.
define amdgpu_kernel void @workgroup_acquire() {
entry:
fence syncscope("workgroup") acquire
ret void
}
; FUNC-LABEL: {{^}}workgroup_release:
; GCN: %bb.0
; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GCN-NOT: ATOMIC_FENCE
; GCN: s_endpgm
; Kernel whose only instruction besides the return is a release fence at
; syncscope("workgroup"). The checks above forbid both an
; `s_waitcnt vmcnt(0) lgkmcnt(0)` line and any ATOMIC_FENCE text between the
; %bb.0 marker and s_endpgm.
define amdgpu_kernel void @workgroup_release() {
entry:
fence syncscope("workgroup") release
ret void
}
; FUNC-LABEL: {{^}}workgroup_acq_rel:
; GCN: %bb.0
; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GCN-NOT: ATOMIC_FENCE
; GCN: s_endpgm
; Kernel whose only instruction besides the return is an acq_rel fence at
; syncscope("workgroup"). The checks above forbid both an
; `s_waitcnt vmcnt(0) lgkmcnt(0)` line and any ATOMIC_FENCE text between the
; %bb.0 marker and s_endpgm.
define amdgpu_kernel void @workgroup_acq_rel() {
entry:
fence syncscope("workgroup") acq_rel
ret void
}
; FUNC-LABEL: {{^}}workgroup_seq_cst:
; GCN: %bb.0
; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GCN-NOT: ATOMIC_FENCE
; GCN: s_endpgm
; Kernel whose only instruction besides the return is a seq_cst fence at
; syncscope("workgroup"). The checks above forbid both an
; `s_waitcnt vmcnt(0) lgkmcnt(0)` line and any ATOMIC_FENCE text between the
; %bb.0 marker and s_endpgm.
define amdgpu_kernel void @workgroup_seq_cst() {
entry:
fence syncscope("workgroup") seq_cst
ret void
}
; FUNC-LABEL: {{^}}wavefront_acquire:
; GCN: %bb.0
; GCN-NOT: ATOMIC_FENCE
; GCN: s_endpgm
; Kernel whose only instruction besides the return is an acquire fence at
; syncscope("wavefront"). The checks above only require that no ATOMIC_FENCE
; text appears between the %bb.0 marker and s_endpgm.
define amdgpu_kernel void @wavefront_acquire() {
entry:
fence syncscope("wavefront") acquire
ret void
}
; FUNC-LABEL: {{^}}wavefront_release:
; GCN: %bb.0
; GCN-NOT: ATOMIC_FENCE
; GCN: s_endpgm
; Kernel whose only instruction besides the return is a release fence at
; syncscope("wavefront"). The checks above only require that no ATOMIC_FENCE
; text appears between the %bb.0 marker and s_endpgm.
define amdgpu_kernel void @wavefront_release() {
entry:
fence syncscope("wavefront") release
ret void
}
; FUNC-LABEL: {{^}}wavefront_acq_rel:
; GCN: %bb.0
; GCN-NOT: ATOMIC_FENCE
; GCN: s_endpgm
; Kernel whose only instruction besides the return is an acq_rel fence at
; syncscope("wavefront"). The checks above only require that no ATOMIC_FENCE
; text appears between the %bb.0 marker and s_endpgm.
define amdgpu_kernel void @wavefront_acq_rel() {
entry:
fence syncscope("wavefront") acq_rel
ret void
}
; FUNC-LABEL: {{^}}wavefront_seq_cst:
; GCN: %bb.0
; GCN-NOT: ATOMIC_FENCE
; GCN: s_endpgm
; Kernel whose only instruction besides the return is a seq_cst fence at
; syncscope("wavefront"). The checks above only require that no ATOMIC_FENCE
; text appears between the %bb.0 marker and s_endpgm.
define amdgpu_kernel void @wavefront_seq_cst() {
entry:
fence syncscope("wavefront") seq_cst
ret void
}

View File

@ -0,0 +1,395 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX6 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX7 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
; Nontemporal load of an i32 through the addrspace(5) pointer %in; the value is
; then written through the addrspace(1) pointer %out with an ordinary store.
; The expected load is a `buffer_load_dword ... offen` carrying `glc slc` in
; the gfx600 and gfx700 runs (including the amdpal one) and only `slc` in the
; two gfx1010 runs.
define amdgpu_kernel void @private_nontemporal_load_0(
; GFX6-LABEL: private_nontemporal_load_0:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX6-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; GFX6-NEXT: s_mov_b32 s10, -1
; GFX6-NEXT: s_mov_b32 s11, 0xe8f000
; GFX6-NEXT: s_add_u32 s8, s8, s3
; GFX6-NEXT: s_addc_u32 s9, s9, 0
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v0, s4
; GFX6-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen glc slc
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: private_nontemporal_load_0:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b64 s[10:11], s[2:3]
; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
; GFX7-NEXT: s_add_u32 s8, s8, s7
; GFX7-NEXT: s_addc_u32 s9, s9, 0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: buffer_load_dword v2, v0, s[8:11], 0 offen glc slc
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: private_nontemporal_load_0:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[10:11], s[2:3]
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[0:1]
; GFX10-WGP-NEXT: s_clause 0x1
; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX10-WGP-NEXT: s_add_u32 s8, s8, s7
; GFX10-WGP-NEXT: s_addc_u32 s9, s9, 0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, 0
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen slc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: private_nontemporal_load_0:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[10:11], s[2:3]
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[0:1]
; GFX10-CU-NEXT: s_clause 0x1
; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX10-CU-NEXT: s_add_u32 s8, s8, s7
; GFX10-CU-NEXT: s_addc_u32 s9, s9, 0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, 0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen slc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: private_nontemporal_load_0:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[8:9]
; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s0
; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_add_u32 s8, s8, s3
; SKIP-CACHE-INV-NEXT: s_addc_u32 s9, s9, 0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen glc slc
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
i32 addrspace(5)* %in, i32 addrspace(1)* %out) {
entry:
; Access under test: the load carries the !nontemporal attachment.
%val = load i32, i32 addrspace(5)* %in, align 4, !nontemporal !0
; Ordinary store of the loaded value (no !nontemporal here).
store i32 %val, i32 addrspace(1)* %out
ret void
}
; Nontemporal load of an i32 from addrspace(5), where the address is %in
; indexed by the workitem id (the expected code scales the id by 4 via a left
; shift of 2). The value is stored through the addrspace(1) pointer %out with
; an ordinary store. The expected load is a `buffer_load_dword ... offen`
; carrying `glc slc` in the gfx600 and gfx700 runs (including the amdpal one)
; and only `slc` in the two gfx1010 runs.
define amdgpu_kernel void @private_nontemporal_load_1(
; GFX6-LABEL: private_nontemporal_load_1:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX6-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; GFX6-NEXT: s_mov_b32 s10, -1
; GFX6-NEXT: s_mov_b32 s11, 0xe8f000
; GFX6-NEXT: s_add_u32 s8, s8, s3
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX6-NEXT: s_addc_u32 s9, s9, 0
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v0
; GFX6-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen glc slc
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: private_nontemporal_load_1:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b64 s[10:11], s[2:3]
; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
; GFX7-NEXT: s_add_u32 s8, s8, s7
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: s_addc_u32 s9, s9, 0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v0, vcc, s2, v0
; GFX7-NEXT: buffer_load_dword v2, v0, s[8:11], 0 offen glc slc
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: private_nontemporal_load_1:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[10:11], s[2:3]
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[0:1]
; GFX10-WGP-NEXT: s_clause 0x1
; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX10-WGP-NEXT: s_add_u32 s8, s8, s7
; GFX10-WGP-NEXT: s_addc_u32 s9, s9, 0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, 0
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s2
; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen slc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: private_nontemporal_load_1:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[10:11], s[2:3]
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[0:1]
; GFX10-CU-NEXT: s_clause 0x1
; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX10-CU-NEXT: s_add_u32 s8, s8, s7
; GFX10-CU-NEXT: s_addc_u32 s9, s9, 0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, 0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s2
; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen slc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: private_nontemporal_load_1:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[8:9]
; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s0
; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_add_u32 s8, s8, s3
; SKIP-CACHE-INV-NEXT: s_addc_u32 s9, s9, 0
; SKIP-CACHE-INV-NEXT: v_add_i32_e32 v0, vcc, s4, v0
; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen glc slc
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
i32 addrspace(5)* %in, i32 addrspace(1)* %out) {
entry:
; Per-workitem element address: %in[%tid].
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%val.gep = getelementptr inbounds i32, i32 addrspace(5)* %in, i32 %tid
; Access under test: the load carries the !nontemporal attachment.
%val = load i32, i32 addrspace(5)* %val.gep, align 4, !nontemporal !0
; Ordinary store of the loaded value (no !nontemporal here).
store i32 %val, i32 addrspace(1)* %out
ret void
}
; Ordinary load of an i32 through the addrspace(1) pointer %in, followed by a
; nontemporal store of that value through the addrspace(5) pointer %out.
; The expected store is a `buffer_store_dword ... offen` carrying `glc slc` in
; the gfx600 and gfx700 runs (including the amdpal one) and only `slc` in the
; two gfx1010 runs.
define amdgpu_kernel void @private_nontemporal_store_0(
; GFX6-LABEL: private_nontemporal_store_0:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0
; GFX6-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1
; GFX6-NEXT: s_mov_b32 s6, -1
; GFX6-NEXT: s_mov_b32 s7, 0xe8f000
; GFX6-NEXT: s_add_u32 s4, s4, s3
; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb
; GFX6-NEXT: s_addc_u32 s5, s5, 0
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_load_dword s1, s[2:3], 0x0
; GFX6-NEXT: v_mov_b32_e32 v1, s0
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v0, s1
; GFX6-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen glc slc
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: private_nontemporal_store_0:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b64 s[10:11], s[2:3]
; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2
; GFX7-NEXT: s_add_u32 s8, s8, s7
; GFX7-NEXT: s_addc_u32 s9, s9, 0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX7-NEXT: v_mov_b32_e32 v1, s2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen glc slc
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: private_nontemporal_store_0:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[10:11], s[2:3]
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[0:1]
; GFX10-WGP-NEXT: s_clause 0x1
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX10-WGP-NEXT: s_add_u32 s8, s8, s7
; GFX10-WGP-NEXT: s_addc_u32 s9, s9, 0
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen slc
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: private_nontemporal_store_0:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[10:11], s[2:3]
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[0:1]
; GFX10-CU-NEXT: s_clause 0x1
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX10-CU-NEXT: s_add_u32 s8, s8, s7
; GFX10-CU-NEXT: s_addc_u32 s9, s9, 0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen slc
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: private_nontemporal_store_0:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[4:5]
; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0
; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s4, s3
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb
; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s5, 0
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen glc slc
; SKIP-CACHE-INV-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(5)* %out) {
entry:
; Ordinary load that supplies the value to be stored (no !nontemporal here).
%val = load i32, i32 addrspace(1)* %in, align 4
; Access under test: the store carries the !nontemporal attachment.
store i32 %val, i32 addrspace(5)* %out, !nontemporal !0
ret void
}
; Loads an i32 from %in (addrspace 1) and stores it, with !nontemporal
; metadata, to the element of %out (addrspace 5) indexed by the workitem id x.
; The expected sequences compute the address from v0 (shift left by 2 plus the
; %out base) and end in a buffer_store_dword ... offen carrying "glc slc" for
; GFX6, GFX7 and SKIP-CACHE-INV and only "slc" for the two GFX10 prefixes.
define amdgpu_kernel void @private_nontemporal_store_1(
; GFX6-LABEL: private_nontemporal_store_1:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0
; GFX6-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1
; GFX6-NEXT: s_mov_b32 s6, -1
; GFX6-NEXT: s_mov_b32 s7, 0xe8f000
; GFX6-NEXT: s_add_u32 s4, s4, s3
; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX6-NEXT: s_addc_u32 s5, s5, 0
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_load_dword s1, s[2:3], 0x0
; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: buffer_store_dword v1, v0, s[4:7], 0 offen glc slc
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: private_nontemporal_store_1:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b64 s[10:11], s[2:3]
; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2
; GFX7-NEXT: s_add_u32 s8, s8, s7
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: s_addc_u32 s9, s9, 0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX7-NEXT: v_add_i32_e32 v0, vcc, s2, v0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v1, s0
; GFX7-NEXT: buffer_store_dword v1, v0, s[8:11], 0 offen glc slc
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: private_nontemporal_store_1:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[10:11], s[2:3]
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[0:1]
; GFX10-WGP-NEXT: s_clause 0x1
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX10-WGP-NEXT: s_add_u32 s8, s8, s7
; GFX10-WGP-NEXT: s_addc_u32 s9, s9, 0
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX10-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s2
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0
; GFX10-WGP-NEXT: buffer_store_dword v1, v0, s[8:11], 0 offen slc
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: private_nontemporal_store_1:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[10:11], s[2:3]
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[0:1]
; GFX10-CU-NEXT: s_clause 0x1
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX10-CU-NEXT: s_add_u32 s8, s8, s7
; GFX10-CU-NEXT: s_addc_u32 s9, s9, 0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX10-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s2
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX10-CU-NEXT: buffer_store_dword v1, v0, s[8:11], 0 offen slc
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: private_nontemporal_store_1:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[4:5]
; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0
; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0
; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s4, s3
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb
; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s5, 0
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: v_add_i32_e32 v0, vcc, s0, v0
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v0, s[4:7], 0 offen glc slc
; SKIP-CACHE-INV-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(5)* %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%val = load i32, i32 addrspace(1)* %in, align 4
%out.gep = getelementptr inbounds i32, i32 addrspace(5)* %out, i32 %tid
store i32 %val, i32 addrspace(5)* %out.gep, !nontemporal !0
ret void
}
; Metadata node attached as `!nontemporal !0` to the loads/stores in this file.
!0 = !{i32 1}
; Intrinsic the kernels call to obtain the workitem id used as a GEP index.
declare i32 @llvm.amdgcn.workitem.id.x()

View File

@ -1,754 +0,0 @@
; Test invocations: every kernel below is compiled for gfx803 and gfx900 (each
; with the amdgcn-amd- and the amdgcn-amd-amdhsa triple) and for gfx1010 on
; amdhsa, once in its default mode and once with -mattr=+cumode. The prefixes
; passed via --check-prefixes select which check lines apply to each invocation.
; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX8,GFX89 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX8,GFX89 %s
; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9,GFX89 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9,GFX89 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX10,GFX10WGP %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX10,GFX10CU %s
; Intrinsic used by the *_1 nontemporal kernels to index the output pointer.
declare i32 @llvm.amdgcn.workitem.id.x()
; Atomic unordered store to a flat (i32*) pointer at syncscope("one-as").
; Expects a bare flat_store_dword with no `s_waitcnt vmcnt(0)` (nor, on GFX10,
; a zero-count s_waitcnt_vmcnt/s_waitcnt_vscnt) between the label and the store.
; The GFX10 runs additionally check the .amdhsa_* directives of the kernel.
; GCN-LABEL: {{^}}system_one_as_unordered:
; GCN-NOT: s_waitcnt vmcnt(0){{$}}
; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
; GFX10: .amdhsa_kernel system_one_as_unordered
; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
; GFX10CU: .amdhsa_workgroup_processor_mode 0
; GFX10-NOT: .amdhsa_memory_ordered 0
define amdgpu_kernel void @system_one_as_unordered(
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("one-as") unordered, align 4
ret void
}
; Atomic monotonic store to a flat (i32*) pointer at syncscope("one-as").
; Expects a bare flat_store_dword with no `s_waitcnt vmcnt(0)` (nor, on GFX10,
; a zero-count s_waitcnt_vmcnt/s_waitcnt_vscnt) between the label and the store.
; The GFX10 runs additionally check the .amdhsa_* directives of the kernel.
; GCN-LABEL: {{^}}system_one_as_monotonic:
; GCN-NOT: s_waitcnt vmcnt(0){{$}}
; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
; GFX10: .amdhsa_kernel system_one_as_monotonic
; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
; GFX10CU: .amdhsa_workgroup_processor_mode 0
; GFX10-NOT: .amdhsa_memory_ordered 0
define amdgpu_kernel void @system_one_as_monotonic(
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("one-as") monotonic, align 4
ret void
}
; Atomic release store to a flat (i32*) pointer at syncscope("one-as").
; Expects `s_waitcnt vmcnt(0)` (then `s_waitcnt_vscnt null, 0x0` on GFX10)
; on the line(s) directly before the bare flat_store_dword.
; The GFX10 runs additionally check the .amdhsa_* directives of the kernel.
; GCN-LABEL: {{^}}system_one_as_release:
; GCN: s_waitcnt vmcnt(0){{$}}
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
; GCN-NEXT: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
; GFX10: .amdhsa_kernel system_one_as_release
; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
; GFX10CU: .amdhsa_workgroup_processor_mode 0
; GFX10-NOT: .amdhsa_memory_ordered 0
define amdgpu_kernel void @system_one_as_release(
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("one-as") release, align 4
ret void
}
; Atomic seq_cst store to a flat (i32*) pointer at syncscope("one-as").
; Expects `s_waitcnt vmcnt(0)` (then `s_waitcnt_vscnt null, 0x0` on GFX10)
; on the line(s) directly before the bare flat_store_dword.
; The GFX10 runs additionally check the .amdhsa_* directives of the kernel.
; GCN-LABEL: {{^}}system_one_as_seq_cst:
; GCN: s_waitcnt vmcnt(0){{$}}
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
; GCN-NEXT: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
; GFX10: .amdhsa_kernel system_one_as_seq_cst
; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
; GFX10CU: .amdhsa_workgroup_processor_mode 0
; GFX10-NOT: .amdhsa_memory_ordered 0
define amdgpu_kernel void @system_one_as_seq_cst(
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("one-as") seq_cst, align 4
ret void
}
; Atomic unordered store to a flat (i32*) pointer at
; syncscope("singlethread-one-as"). Expects a bare flat_store_dword with no
; `s_waitcnt vmcnt(0)` (nor, on GFX10, a zero-count vmcnt/vscnt wait) before it.
; The GFX10 runs additionally check the .amdhsa_* directives of the kernel.
; GCN-LABEL: {{^}}singlethread_one_as_unordered:
; GCN-NOT: s_waitcnt vmcnt(0){{$}}
; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
; GFX10: .amdhsa_kernel singlethread_one_as_unordered
; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
; GFX10CU: .amdhsa_workgroup_processor_mode 0
; GFX10-NOT: .amdhsa_memory_ordered 0
define amdgpu_kernel void @singlethread_one_as_unordered(
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("singlethread-one-as") unordered, align 4
ret void
}
; Atomic monotonic store to a flat (i32*) pointer at
; syncscope("singlethread-one-as"). Expects a bare flat_store_dword with no
; `s_waitcnt vmcnt(0)` (nor, on GFX10, a zero-count vmcnt/vscnt wait) before it.
; The GFX10 runs additionally check the .amdhsa_* directives of the kernel.
; GCN-LABEL: {{^}}singlethread_one_as_monotonic:
; GCN-NOT: s_waitcnt vmcnt(0){{$}}
; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
; GFX10: .amdhsa_kernel singlethread_one_as_monotonic
; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
; GFX10CU: .amdhsa_workgroup_processor_mode 0
; GFX10-NOT: .amdhsa_memory_ordered 0
define amdgpu_kernel void @singlethread_one_as_monotonic(
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("singlethread-one-as") monotonic, align 4
ret void
}
; Atomic release store to a flat (i32*) pointer at
; syncscope("singlethread-one-as"). Even with release ordering the checks
; expect no `s_waitcnt vmcnt(0)` (nor a zero-count vmcnt/vscnt wait on GFX10)
; before the bare flat_store_dword.
; GCN-LABEL: {{^}}singlethread_one_as_release:
; GCN-NOT: s_waitcnt vmcnt(0){{$}}
; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
; GFX10: .amdhsa_kernel singlethread_one_as_release
; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
; GFX10CU: .amdhsa_workgroup_processor_mode 0
; GFX10-NOT: .amdhsa_memory_ordered 0
define amdgpu_kernel void @singlethread_one_as_release(
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("singlethread-one-as") release, align 4
ret void
}
; Atomic seq_cst store to a flat (i32*) pointer at
; syncscope("singlethread-one-as"). Even with seq_cst ordering the checks
; expect no `s_waitcnt vmcnt(0)` (nor a zero-count vmcnt/vscnt wait on GFX10)
; before the bare flat_store_dword.
; GCN-LABEL: {{^}}singlethread_one_as_seq_cst:
; GCN-NOT: s_waitcnt vmcnt(0){{$}}
; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
; GFX10: .amdhsa_kernel singlethread_one_as_seq_cst
; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
; GFX10CU: .amdhsa_workgroup_processor_mode 0
; GFX10-NOT: .amdhsa_memory_ordered 0
define amdgpu_kernel void @singlethread_one_as_seq_cst(
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("singlethread-one-as") seq_cst, align 4
ret void
}
; Atomic unordered store to a flat (i32*) pointer at syncscope("agent-one-as").
; Expects a bare flat_store_dword with no `s_waitcnt vmcnt(0)` (nor, on GFX10,
; a zero-count s_waitcnt_vmcnt/s_waitcnt_vscnt) between the label and the store.
; The GFX10 runs additionally check the .amdhsa_* directives of the kernel.
; GCN-LABEL: {{^}}agent_one_as_unordered:
; GCN-NOT: s_waitcnt vmcnt(0){{$}}
; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
; GFX10: .amdhsa_kernel agent_one_as_unordered
; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
; GFX10CU: .amdhsa_workgroup_processor_mode 0
; GFX10-NOT: .amdhsa_memory_ordered 0
define amdgpu_kernel void @agent_one_as_unordered(
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("agent-one-as") unordered, align 4
ret void
}
; Atomic monotonic store to a flat (i32*) pointer at syncscope("agent-one-as").
; Expects a bare flat_store_dword with no `s_waitcnt vmcnt(0)` (nor, on GFX10,
; a zero-count s_waitcnt_vmcnt/s_waitcnt_vscnt) between the label and the store.
; The GFX10 runs additionally check the .amdhsa_* directives of the kernel.
; GCN-LABEL: {{^}}agent_one_as_monotonic:
; GCN-NOT: s_waitcnt vmcnt(0){{$}}
; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
; GFX10: .amdhsa_kernel agent_one_as_monotonic
; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
; GFX10CU: .amdhsa_workgroup_processor_mode 0
; GFX10-NOT: .amdhsa_memory_ordered 0
define amdgpu_kernel void @agent_one_as_monotonic(
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("agent-one-as") monotonic, align 4
ret void
}
; Atomic release store to a flat (i32*) pointer at syncscope("agent-one-as").
; Expects `s_waitcnt vmcnt(0)` (then `s_waitcnt_vscnt null, 0x0` on GFX10)
; on the line(s) directly before the bare flat_store_dword.
; The GFX10 runs additionally check the .amdhsa_* directives of the kernel.
; GCN-LABEL: {{^}}agent_one_as_release:
; GCN: s_waitcnt vmcnt(0){{$}}
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
; GCN-NEXT: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
; GFX10: .amdhsa_kernel agent_one_as_release
; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
; GFX10CU: .amdhsa_workgroup_processor_mode 0
; GFX10-NOT: .amdhsa_memory_ordered 0
define amdgpu_kernel void @agent_one_as_release(
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("agent-one-as") release, align 4
ret void
}
; Atomic seq_cst store to a flat (i32*) pointer at syncscope("agent-one-as").
; Expects `s_waitcnt vmcnt(0)` (then `s_waitcnt_vscnt null, 0x0` on GFX10)
; on the line(s) directly before the bare flat_store_dword.
; The GFX10 runs additionally check the .amdhsa_* directives of the kernel.
; GCN-LABEL: {{^}}agent_one_as_seq_cst:
; GCN: s_waitcnt vmcnt(0){{$}}
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
; GCN-NEXT: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
; GFX10: .amdhsa_kernel agent_one_as_seq_cst
; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
; GFX10CU: .amdhsa_workgroup_processor_mode 0
; GFX10-NOT: .amdhsa_memory_ordered 0
define amdgpu_kernel void @agent_one_as_seq_cst(
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("agent-one-as") seq_cst, align 4
ret void
}
; Atomic unordered store to a flat (i32*) pointer at
; syncscope("workgroup-one-as"). Expects a bare flat_store_dword with no
; `s_waitcnt vmcnt(0)` (nor, on GFX10, a zero-count vmcnt/vscnt wait) before it.
; The GFX10 runs additionally check the .amdhsa_* directives of the kernel.
; GCN-LABEL: {{^}}workgroup_one_as_unordered:
; GCN-NOT: s_waitcnt vmcnt(0){{$}}
; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
; GFX10: .amdhsa_kernel workgroup_one_as_unordered
; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
; GFX10CU: .amdhsa_workgroup_processor_mode 0
; GFX10-NOT: .amdhsa_memory_ordered 0
define amdgpu_kernel void @workgroup_one_as_unordered(
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("workgroup-one-as") unordered, align 4
ret void
}
; Atomic monotonic store to a flat (i32*) pointer at
; syncscope("workgroup-one-as"). Expects a bare flat_store_dword with no
; `s_waitcnt vmcnt(0)` (nor, on GFX10, a zero-count vmcnt/vscnt wait) before it.
; The GFX10 runs additionally check the .amdhsa_* directives of the kernel.
; GCN-LABEL: {{^}}workgroup_one_as_monotonic:
; GCN-NOT: s_waitcnt vmcnt(0){{$}}
; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
; GFX10: .amdhsa_kernel workgroup_one_as_monotonic
; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
; GFX10CU: .amdhsa_workgroup_processor_mode 0
; GFX10-NOT: .amdhsa_memory_ordered 0
define amdgpu_kernel void @workgroup_one_as_monotonic(
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("workgroup-one-as") monotonic, align 4
ret void
}
; Atomic release store to a flat (i32*) pointer at
; syncscope("workgroup-one-as"). The gfx803/gfx900 runs and the gfx1010
; +cumode run expect no `s_waitcnt vmcnt(0)` before the store; the default
; gfx1010 run expects `s_waitcnt vmcnt(0)` directly followed by
; `s_waitcnt_vscnt null, 0x0`. All runs then expect a bare flat_store_dword.
; GCN-LABEL: {{^}}workgroup_one_as_release:
; GFX89-NOT: s_waitcnt vmcnt(0){{$}}
; GFX10WGP: s_waitcnt vmcnt(0){{$}}
; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}}
; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}}
; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
; GFX10: .amdhsa_kernel workgroup_one_as_release
; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
; GFX10CU: .amdhsa_workgroup_processor_mode 0
; GFX10-NOT: .amdhsa_memory_ordered 0
define amdgpu_kernel void @workgroup_one_as_release(
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("workgroup-one-as") release, align 4
ret void
}
; Atomic seq_cst store to a flat (i32*) pointer at
; syncscope("workgroup-one-as"). The gfx803/gfx900 runs and the gfx1010
; +cumode run expect no `s_waitcnt vmcnt(0)` before the store; the default
; gfx1010 run expects `s_waitcnt vmcnt(0)` directly followed by
; `s_waitcnt_vscnt null, 0x0`. All runs then expect a bare flat_store_dword.
; GCN-LABEL: {{^}}workgroup_one_as_seq_cst:
; GFX89-NOT: s_waitcnt vmcnt(0){{$}}
; GFX10WGP: s_waitcnt vmcnt(0){{$}}
; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}}
; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}}
; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
; GFX10: .amdhsa_kernel workgroup_one_as_seq_cst
; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
; GFX10CU: .amdhsa_workgroup_processor_mode 0
; GFX10-NOT: .amdhsa_memory_ordered 0
define amdgpu_kernel void @workgroup_one_as_seq_cst(
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("workgroup-one-as") seq_cst, align 4
ret void
}
; Atomic unordered store to a flat (i32*) pointer at
; syncscope("wavefront-one-as"). Expects a bare flat_store_dword with no
; `s_waitcnt vmcnt(0)` (nor, on GFX10, a zero-count vmcnt/vscnt wait) before it.
; The GFX10 runs additionally check the .amdhsa_* directives of the kernel.
; GCN-LABEL: {{^}}wavefront_one_as_unordered:
; GCN-NOT: s_waitcnt vmcnt(0){{$}}
; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
; GFX10: .amdhsa_kernel wavefront_one_as_unordered
; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
; GFX10CU: .amdhsa_workgroup_processor_mode 0
; GFX10-NOT: .amdhsa_memory_ordered 0
define amdgpu_kernel void @wavefront_one_as_unordered(
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("wavefront-one-as") unordered, align 4
ret void
}
; Atomic monotonic store to a flat (i32*) pointer at
; syncscope("wavefront-one-as"). Expects a bare flat_store_dword with no
; `s_waitcnt vmcnt(0)` (nor, on GFX10, a zero-count vmcnt/vscnt wait) before it.
; The GFX10 runs additionally check the .amdhsa_* directives of the kernel.
; GCN-LABEL: {{^}}wavefront_one_as_monotonic:
; GCN-NOT: s_waitcnt vmcnt(0){{$}}
; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
; GFX10: .amdhsa_kernel wavefront_one_as_monotonic
; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
; GFX10CU: .amdhsa_workgroup_processor_mode 0
; GFX10-NOT: .amdhsa_memory_ordered 0
define amdgpu_kernel void @wavefront_one_as_monotonic(
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("wavefront-one-as") monotonic, align 4
ret void
}
; Atomic release store to a flat (i32*) pointer at
; syncscope("wavefront-one-as"). Even with release ordering the checks expect
; no `s_waitcnt vmcnt(0)` (nor a zero-count vmcnt/vscnt wait on GFX10) before
; the bare flat_store_dword.
; GCN-LABEL: {{^}}wavefront_one_as_release:
; GCN-NOT: s_waitcnt vmcnt(0){{$}}
; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
; GFX10: .amdhsa_kernel wavefront_one_as_release
; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
; GFX10CU: .amdhsa_workgroup_processor_mode 0
; GFX10-NOT: .amdhsa_memory_ordered 0
define amdgpu_kernel void @wavefront_one_as_release(
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("wavefront-one-as") release, align 4
ret void
}
; Atomic seq_cst store to a flat (i32*) pointer at
; syncscope("wavefront-one-as"). Even with seq_cst ordering the checks expect
; no `s_waitcnt vmcnt(0)` (nor a zero-count vmcnt/vscnt wait on GFX10) before
; the bare flat_store_dword.
; GCN-LABEL: {{^}}wavefront_one_as_seq_cst:
; GCN-NOT: s_waitcnt vmcnt(0){{$}}
; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
; GFX10: .amdhsa_kernel wavefront_one_as_seq_cst
; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
; GFX10CU: .amdhsa_workgroup_processor_mode 0
; GFX10-NOT: .amdhsa_memory_ordered 0
define amdgpu_kernel void @wavefront_one_as_seq_cst(
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("wavefront-one-as") seq_cst, align 4
ret void
}
; Nontemporal store of a value loaded from addrspace(1) to an addrspace(5)
; pointer. Expects buffer_store_dword ... offen ending in "glc slc" on the
; gfx803/gfx900 runs and in just "slc" on the gfx1010 runs.
; The GFX10 runs additionally check the .amdhsa_* directives of the kernel.
; GCN-LABEL: {{^}}nontemporal_private_0:
; GFX89: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offen glc slc{{$}}
; GFX10: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offen slc{{$}}
; GFX10: .amdhsa_kernel nontemporal_private_0
; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
; GFX10CU: .amdhsa_workgroup_processor_mode 0
; GFX10-NOT: .amdhsa_memory_ordered 0
define amdgpu_kernel void @nontemporal_private_0(
i32 addrspace(1)* %in, i32 addrspace(5)* %out) {
entry:
%val = load i32, i32 addrspace(1)* %in, align 4
store i32 %val, i32 addrspace(5)* %out, !nontemporal !0
ret void
}
; Nontemporal store of a value loaded from addrspace(1) to the element of an
; addrspace(5) pointer indexed by the workitem id x. Expects
; buffer_store_dword ... offen ending in "glc slc" on the gfx803/gfx900 runs
; and in just "slc" on the gfx1010 runs.
; GCN-LABEL: {{^}}nontemporal_private_1:
; GFX89: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offen glc slc{{$}}
; GFX10: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offen slc{{$}}
; GFX10: .amdhsa_kernel nontemporal_private_1
; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
; GFX10CU: .amdhsa_workgroup_processor_mode 0
; GFX10-NOT: .amdhsa_memory_ordered 0
define amdgpu_kernel void @nontemporal_private_1(
i32 addrspace(1)* %in, i32 addrspace(5)* %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%val = load i32, i32 addrspace(1)* %in, align 4
%out.gep = getelementptr inbounds i32, i32 addrspace(5)* %out, i32 %tid
store i32 %val, i32 addrspace(5)* %out.gep, !nontemporal !0
ret void
}
; Nontemporal store of a value loaded from addrspace(1) to another
; addrspace(1) pointer. Expects flat_store_dword with "glc slc" on gfx803,
; global_store_dword with "glc slc" on gfx900, and global_store_dword with
; just "slc" on the gfx1010 runs.
; GCN-LABEL: {{^}}nontemporal_global_0:
; GFX8: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc slc{{$}}
; GFX9: global_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} glc slc{{$}}
; GFX10: global_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} slc{{$}}
; GFX10: .amdhsa_kernel nontemporal_global_0
; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
; GFX10CU: .amdhsa_workgroup_processor_mode 0
; GFX10-NOT: .amdhsa_memory_ordered 0
define amdgpu_kernel void @nontemporal_global_0(
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load i32, i32 addrspace(1)* %in, align 4
store i32 %val, i32 addrspace(1)* %out, !nontemporal !0
ret void
}
; Nontemporal store of a value loaded from addrspace(1) to the element of an
; addrspace(1) pointer indexed by the workitem id x. Expects flat_store_dword
; with "glc slc" on gfx803, global_store_dword with "glc slc" on gfx900, and
; global_store_dword with just "slc" on the gfx1010 runs.
; GCN-LABEL: {{^}}nontemporal_global_1:
; GFX8: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc slc{{$}}
; GFX9: global_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] glc slc{{$}}
; GFX10: global_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] slc{{$}}
; GFX10: .amdhsa_kernel nontemporal_global_1
; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
; GFX10CU: .amdhsa_workgroup_processor_mode 0
; GFX10-NOT: .amdhsa_memory_ordered 0
define amdgpu_kernel void @nontemporal_global_1(
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%val = load i32, i32 addrspace(1)* %in, align 4
%out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid
store i32 %val, i32 addrspace(1)* %out.gep, !nontemporal !0
ret void
}
; Nontemporal store of a value loaded from addrspace(1) to an addrspace(3)
; pointer. All runs expect a ds_write_b32 with nothing after its two register
; operands, i.e. the !nontemporal hint adds no modifier here.
; The GFX10 runs additionally check the .amdhsa_* directives of the kernel.
; GCN-LABEL: {{^}}nontemporal_local_0:
; GCN: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}}{{$}}
; GFX10: .amdhsa_kernel nontemporal_local_0
; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
; GFX10CU: .amdhsa_workgroup_processor_mode 0
; GFX10-NOT: .amdhsa_memory_ordered 0
define amdgpu_kernel void @nontemporal_local_0(
i32 addrspace(1)* %in, i32 addrspace(3)* %out) {
entry:
%val = load i32, i32 addrspace(1)* %in, align 4
store i32 %val, i32 addrspace(3)* %out, !nontemporal !0
ret void
}
; Nontemporal store of a value loaded from addrspace(1) to the element of an
; addrspace(3) pointer indexed by the workitem id x. All runs expect a
; ds_write_b32 with nothing after its two register operands.
; The GFX10 runs additionally check the .amdhsa_* directives of the kernel.
; GCN-LABEL: {{^}}nontemporal_local_1:
; GCN: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}}{{$}}
; GFX10: .amdhsa_kernel nontemporal_local_1
; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
; GFX10CU: .amdhsa_workgroup_processor_mode 0
; GFX10-NOT: .amdhsa_memory_ordered 0
define amdgpu_kernel void @nontemporal_local_1(
i32 addrspace(1)* %in, i32 addrspace(3)* %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%val = load i32, i32 addrspace(1)* %in, align 4
%out.gep = getelementptr inbounds i32, i32 addrspace(3)* %out, i32 %tid
store i32 %val, i32 addrspace(3)* %out.gep, !nontemporal !0
ret void
}
; Nontemporal store of a value loaded through a flat (i32*) pointer to another
; flat pointer. Expects flat_store_dword ending in "glc slc" on the
; gfx803/gfx900 runs and in just "slc" on the gfx1010 runs.
; The GFX10 runs additionally check the .amdhsa_* directives of the kernel.
; GCN-LABEL: {{^}}nontemporal_flat_0:
; GFX89: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc slc{{$}}
; GFX10: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} slc{{$}}
; GFX10: .amdhsa_kernel nontemporal_flat_0
; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
; GFX10CU: .amdhsa_workgroup_processor_mode 0
; GFX10-NOT: .amdhsa_memory_ordered 0
define amdgpu_kernel void @nontemporal_flat_0(
i32* %in, i32* %out) {
entry:
%val = load i32, i32* %in, align 4
store i32 %val, i32* %out, !nontemporal !0
ret void
}
; Nontemporal store of a value loaded through a flat (i32*) pointer to the
; element of a flat pointer indexed by the workitem id x. Expects
; flat_store_dword ending in "glc slc" on the gfx803/gfx900 runs and in just
; "slc" on the gfx1010 runs.
; GCN-LABEL: {{^}}nontemporal_flat_1:
; GFX89: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc slc{{$}}
; GFX10: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} slc{{$}}
; GFX10: .amdhsa_kernel nontemporal_flat_1
; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
; GFX10CU: .amdhsa_workgroup_processor_mode 0
; GFX10-NOT: .amdhsa_memory_ordered 0
define amdgpu_kernel void @nontemporal_flat_1(
i32* %in, i32* %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%val = load i32, i32* %in, align 4
%out.gep = getelementptr inbounds i32, i32* %out, i32 %tid
store i32 %val, i32* %out.gep, !nontemporal !0
ret void
}
; Atomic unordered store to a flat (i32*) pointer with no explicit syncscope
; (the scope this test names "system"). Expects a bare flat_store_dword with no
; `s_waitcnt vmcnt(0)` (nor, on GFX10, a zero-count vmcnt/vscnt wait) before it.
; The GFX10 runs additionally check the .amdhsa_* directives of the kernel.
; GCN-LABEL: {{^}}system_unordered:
; GCN-NOT: s_waitcnt vmcnt(0){{$}}
; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
; GFX10: .amdhsa_kernel system_unordered
; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
; GFX10CU: .amdhsa_workgroup_processor_mode 0
; GFX10-NOT: .amdhsa_memory_ordered 0
define amdgpu_kernel void @system_unordered(
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out unordered, align 4
ret void
}
; Atomic monotonic store to a flat (i32*) pointer with no explicit syncscope
; (the scope this test names "system"). Expects a bare flat_store_dword with no
; `s_waitcnt vmcnt(0)` (nor, on GFX10, a zero-count vmcnt/vscnt wait) before it.
; The GFX10 runs additionally check the .amdhsa_* directives of the kernel.
; GCN-LABEL: {{^}}system_monotonic:
; GCN-NOT: s_waitcnt vmcnt(0){{$}}
; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
; GFX10: .amdhsa_kernel system_monotonic
; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
; GFX10CU: .amdhsa_workgroup_processor_mode 0
; GFX10-NOT: .amdhsa_memory_ordered 0
define amdgpu_kernel void @system_monotonic(
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out monotonic, align 4
ret void
}
; Atomic release store to a flat (i32*) pointer with no explicit syncscope
; (the scope this test names "system"). Expects `s_waitcnt vmcnt(0) lgkmcnt(0)`
; on the gfx803/gfx900 runs, or `s_waitcnt lgkmcnt(0)` and then
; `s_waitcnt_vscnt null, 0x0` on the gfx1010 runs, with the bare
; flat_store_dword on the line right after.
; GCN-LABEL: {{^}}system_release:
; GFX89: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GFX10: s_waitcnt lgkmcnt(0){{$}}
; GFX10: s_waitcnt_vscnt null, 0x0{{$}}
; GCN-NEXT: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
; GFX10: .amdhsa_kernel system_release
; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
; GFX10CU: .amdhsa_workgroup_processor_mode 0
; GFX10-NOT: .amdhsa_memory_ordered 0
define amdgpu_kernel void @system_release(
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out release, align 4
ret void
}
; Atomic seq_cst store to a flat (i32*) pointer with no explicit syncscope
; (the scope this test names "system"). Expects `s_waitcnt vmcnt(0) lgkmcnt(0)`
; on the gfx803/gfx900 runs, or `s_waitcnt lgkmcnt(0)` and then
; `s_waitcnt_vscnt null, 0x0` on the gfx1010 runs, with the bare
; flat_store_dword on the line right after.
; GCN-LABEL: {{^}}system_seq_cst:
; GFX89: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GFX10: s_waitcnt lgkmcnt(0){{$}}
; GFX10: s_waitcnt_vscnt null, 0x0{{$}}
; GCN-NEXT: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
; GFX10: .amdhsa_kernel system_seq_cst
; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
; GFX10CU: .amdhsa_workgroup_processor_mode 0
; GFX10-NOT: .amdhsa_memory_ordered 0
define amdgpu_kernel void @system_seq_cst(
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out seq_cst, align 4
ret void
}
; Atomic unordered store to a flat (i32*) pointer at syncscope("singlethread").
; Expects a bare flat_store_dword with no `s_waitcnt vmcnt(0)` (nor, on GFX10,
; a zero-count s_waitcnt_vmcnt/s_waitcnt_vscnt) between the label and the store.
; The GFX10 runs additionally check the .amdhsa_* directives of the kernel.
; GCN-LABEL: {{^}}singlethread_unordered:
; GCN-NOT: s_waitcnt vmcnt(0){{$}}
; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
; GFX10: .amdhsa_kernel singlethread_unordered
; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
; GFX10CU: .amdhsa_workgroup_processor_mode 0
; GFX10-NOT: .amdhsa_memory_ordered 0
define amdgpu_kernel void @singlethread_unordered(
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("singlethread") unordered, align 4
ret void
}
; Atomic monotonic store to a flat (i32*) pointer at syncscope("singlethread").
; Expects a bare flat_store_dword with no `s_waitcnt vmcnt(0)` (nor, on GFX10,
; a zero-count s_waitcnt_vmcnt/s_waitcnt_vscnt) between the label and the store.
; The GFX10 runs additionally check the .amdhsa_* directives of the kernel.
; GCN-LABEL: {{^}}singlethread_monotonic:
; GCN-NOT: s_waitcnt vmcnt(0){{$}}
; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
; GFX10: .amdhsa_kernel singlethread_monotonic
; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
; GFX10CU: .amdhsa_workgroup_processor_mode 0
; GFX10-NOT: .amdhsa_memory_ordered 0
define amdgpu_kernel void @singlethread_monotonic(
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("singlethread") monotonic, align 4
ret void
}
; Atomic release store to a flat (i32*) pointer at syncscope("singlethread").
; Even with release ordering the checks expect no `s_waitcnt vmcnt(0)` (nor a
; zero-count vmcnt/vscnt wait on GFX10) before the bare flat_store_dword.
; The GFX10 runs additionally check the .amdhsa_* directives of the kernel.
; GCN-LABEL: {{^}}singlethread_release:
; GCN-NOT: s_waitcnt vmcnt(0){{$}}
; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
; GFX10: .amdhsa_kernel singlethread_release
; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
; GFX10CU: .amdhsa_workgroup_processor_mode 0
; GFX10-NOT: .amdhsa_memory_ordered 0
define amdgpu_kernel void @singlethread_release(
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("singlethread") release, align 4
ret void
}
; Atomic seq_cst store to a flat (i32*) pointer at syncscope("singlethread").
; Even with seq_cst ordering the checks expect no `s_waitcnt vmcnt(0)` (nor a
; zero-count vmcnt/vscnt wait on GFX10) before the bare flat_store_dword.
; The GFX10 runs additionally check the .amdhsa_* directives of the kernel.
; GCN-LABEL: {{^}}singlethread_seq_cst:
; GCN-NOT: s_waitcnt vmcnt(0){{$}}
; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
; GFX10: .amdhsa_kernel singlethread_seq_cst
; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
; GFX10CU: .amdhsa_workgroup_processor_mode 0
; GFX10-NOT: .amdhsa_memory_ordered 0
define amdgpu_kernel void @singlethread_seq_cst(
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("singlethread") seq_cst, align 4
ret void
}
; Atomic unordered store to a flat (i32*) pointer at syncscope("agent").
; Expects a bare flat_store_dword with no `s_waitcnt vmcnt(0)` (nor, on GFX10,
; a zero-count s_waitcnt_vmcnt/s_waitcnt_vscnt) between the label and the store.
; The GFX10 runs additionally check the .amdhsa_* directives of the kernel.
; GCN-LABEL: {{^}}agent_unordered:
; GCN-NOT: s_waitcnt vmcnt(0){{$}}
; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
; GFX10: .amdhsa_kernel agent_unordered
; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
; GFX10CU: .amdhsa_workgroup_processor_mode 0
; GFX10-NOT: .amdhsa_memory_ordered 0
define amdgpu_kernel void @agent_unordered(
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("agent") unordered, align 4
ret void
}
; Atomic monotonic store to a flat (i32*) pointer at syncscope("agent").
; Expects a bare flat_store_dword with no `s_waitcnt vmcnt(0)` (nor, on GFX10,
; a zero-count s_waitcnt_vmcnt/s_waitcnt_vscnt) between the label and the store.
; The GFX10 runs additionally check the .amdhsa_* directives of the kernel.
; GCN-LABEL: {{^}}agent_monotonic:
; GCN-NOT: s_waitcnt vmcnt(0){{$}}
; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
; GFX10: .amdhsa_kernel agent_monotonic
; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
; GFX10CU: .amdhsa_workgroup_processor_mode 0
; GFX10-NOT: .amdhsa_memory_ordered 0
define amdgpu_kernel void @agent_monotonic(
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("agent") monotonic, align 4
ret void
}
; Atomic release store to a flat (i32*) pointer at syncscope("agent").
; Expects `s_waitcnt vmcnt(0) lgkmcnt(0)` on the gfx803/gfx900 runs, or
; `s_waitcnt lgkmcnt(0)` and then `s_waitcnt_vscnt null, 0x0` on the gfx1010
; runs, with the bare flat_store_dword on the line right after.
; GCN-LABEL: {{^}}agent_release:
; GFX89: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GFX10: s_waitcnt lgkmcnt(0){{$}}
; GFX10: s_waitcnt_vscnt null, 0x0{{$}}
; GCN-NEXT: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
; GFX10: .amdhsa_kernel agent_release
; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
; GFX10CU: .amdhsa_workgroup_processor_mode 0
; GFX10-NOT: .amdhsa_memory_ordered 0
define amdgpu_kernel void @agent_release(
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("agent") release, align 4
ret void
}
; Atomic seq_cst store to a flat (i32*) pointer at syncscope("agent").
; Expects `s_waitcnt vmcnt(0) lgkmcnt(0)` on the gfx803/gfx900 runs, or
; `s_waitcnt lgkmcnt(0)` and then `s_waitcnt_vscnt null, 0x0` on the gfx1010
; runs, with the bare flat_store_dword on the line right after.
; GCN-LABEL: {{^}}agent_seq_cst:
; GFX89: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GFX10: s_waitcnt lgkmcnt(0){{$}}
; GFX10: s_waitcnt_vscnt null, 0x0{{$}}
; GCN-NEXT: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
; GFX10: .amdhsa_kernel agent_seq_cst
; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
; GFX10CU: .amdhsa_workgroup_processor_mode 0
; GFX10-NOT: .amdhsa_memory_ordered 0
define amdgpu_kernel void @agent_seq_cst(
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("agent") seq_cst, align 4
ret void
}
; Atomic unordered store to a flat (i32*) pointer at syncscope("workgroup").
; Expects a bare flat_store_dword with no `s_waitcnt vmcnt(0)` (nor, on GFX10,
; a zero-count s_waitcnt_vmcnt/s_waitcnt_vscnt) between the label and the store.
; The GFX10 runs additionally check the .amdhsa_* directives of the kernel.
; GCN-LABEL: {{^}}workgroup_unordered:
; GCN-NOT: s_waitcnt vmcnt(0){{$}}
; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
; GFX10: .amdhsa_kernel workgroup_unordered
; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
; GFX10CU: .amdhsa_workgroup_processor_mode 0
; GFX10-NOT: .amdhsa_memory_ordered 0
define amdgpu_kernel void @workgroup_unordered(
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("workgroup") unordered, align 4
ret void
}
; Atomic monotonic store to a flat (i32*) pointer at syncscope("workgroup").
; Expects a bare flat_store_dword with no `s_waitcnt vmcnt(0)` (nor, on GFX10,
; a zero-count s_waitcnt_vmcnt/s_waitcnt_vscnt) between the label and the store.
; The GFX10 runs additionally check the .amdhsa_* directives of the kernel.
; GCN-LABEL: {{^}}workgroup_monotonic:
; GCN-NOT: s_waitcnt vmcnt(0){{$}}
; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
; GFX10: .amdhsa_kernel workgroup_monotonic
; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
; GFX10CU: .amdhsa_workgroup_processor_mode 0
; GFX10-NOT: .amdhsa_memory_ordered 0
define amdgpu_kernel void @workgroup_monotonic(
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("workgroup") monotonic, align 4
ret void
}
; Atomic release store to a flat (i32*) pointer at syncscope("workgroup").
; The gfx803/gfx900 runs and the gfx1010 +cumode run expect no bare
; `s_waitcnt vmcnt(0)` before the store; the default gfx1010 run expects
; `s_waitcnt vmcnt(0) lgkmcnt(0)` directly followed by
; `s_waitcnt_vscnt null, 0x0`. All runs then expect a bare flat_store_dword.
; GCN-LABEL: {{^}}workgroup_release:
; GFX89-NOT: s_waitcnt vmcnt(0){{$}}
; GFX10WGP: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}}
; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}}
; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
; GFX10: .amdhsa_kernel workgroup_release
; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
; GFX10CU: .amdhsa_workgroup_processor_mode 0
; GFX10-NOT: .amdhsa_memory_ordered 0
define amdgpu_kernel void @workgroup_release(
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("workgroup") release, align 4
ret void
}
; Atomic seq_cst store to a flat (i32*) pointer at syncscope("workgroup").
; The gfx803/gfx900 runs and the gfx1010 +cumode run expect no bare
; `s_waitcnt vmcnt(0)` before the store; the default gfx1010 run expects
; `s_waitcnt vmcnt(0) lgkmcnt(0)` directly followed by
; `s_waitcnt_vscnt null, 0x0`. All runs then expect a bare flat_store_dword.
; GCN-LABEL: {{^}}workgroup_seq_cst:
; GFX89-NOT: s_waitcnt vmcnt(0){{$}}
; GFX10WGP: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}}
; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}}
; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
; GFX10: .amdhsa_kernel workgroup_seq_cst
; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
; GFX10CU: .amdhsa_workgroup_processor_mode 0
; GFX10-NOT: .amdhsa_memory_ordered 0
define amdgpu_kernel void @workgroup_seq_cst(
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("workgroup") seq_cst, align 4
ret void
}
; Atomic unordered store to a flat (i32*) pointer at syncscope("wavefront").
; Expects a bare flat_store_dword with no `s_waitcnt vmcnt(0)` (nor, on GFX10,
; a zero-count s_waitcnt_vmcnt/s_waitcnt_vscnt) between the label and the store.
; The GFX10 runs additionally check the .amdhsa_* directives of the kernel.
; GCN-LABEL: {{^}}wavefront_unordered:
; GCN-NOT: s_waitcnt vmcnt(0){{$}}
; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
; GFX10: .amdhsa_kernel wavefront_unordered
; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
; GFX10CU: .amdhsa_workgroup_processor_mode 0
; GFX10-NOT: .amdhsa_memory_ordered 0
define amdgpu_kernel void @wavefront_unordered(
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("wavefront") unordered, align 4
ret void
}
; Atomic monotonic store to a flat (i32*) pointer at syncscope("wavefront").
; Expects a bare flat_store_dword with no `s_waitcnt vmcnt(0)` (nor, on GFX10,
; a zero-count s_waitcnt_vmcnt/s_waitcnt_vscnt) between the label and the store.
; The GFX10 runs additionally check the .amdhsa_* directives of the kernel.
; GCN-LABEL: {{^}}wavefront_monotonic:
; GCN-NOT: s_waitcnt vmcnt(0){{$}}
; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
; GFX10: .amdhsa_kernel wavefront_monotonic
; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
; GFX10CU: .amdhsa_workgroup_processor_mode 0
; GFX10-NOT: .amdhsa_memory_ordered 0
define amdgpu_kernel void @wavefront_monotonic(
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("wavefront") monotonic, align 4
ret void
}
; Atomic release store to a flat (i32*) pointer at syncscope("wavefront").
; Even with release ordering the checks expect no `s_waitcnt vmcnt(0)` (nor a
; zero-count vmcnt/vscnt wait on GFX10) before the bare flat_store_dword.
; The GFX10 runs additionally check the .amdhsa_* directives of the kernel.
; GCN-LABEL: {{^}}wavefront_release:
; GCN-NOT: s_waitcnt vmcnt(0){{$}}
; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
; GFX10: .amdhsa_kernel wavefront_release
; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
; GFX10CU: .amdhsa_workgroup_processor_mode 0
; GFX10-NOT: .amdhsa_memory_ordered 0
define amdgpu_kernel void @wavefront_release(
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("wavefront") release, align 4
ret void
}
; Atomic seq_cst store to a flat (i32*) pointer at syncscope("wavefront").
; Even with seq_cst ordering the checks expect no `s_waitcnt vmcnt(0)` (nor a
; zero-count vmcnt/vscnt wait on GFX10) before the bare flat_store_dword.
; The GFX10 runs additionally check the .amdhsa_* directives of the kernel.
; GCN-LABEL: {{^}}wavefront_seq_cst:
; GCN-NOT: s_waitcnt vmcnt(0){{$}}
; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
; GFX10: .amdhsa_kernel wavefront_seq_cst
; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
; GFX10CU: .amdhsa_workgroup_processor_mode 0
; GFX10-NOT: .amdhsa_memory_ordered 0
define amdgpu_kernel void @wavefront_seq_cst(
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("wavefront") seq_cst, align 4
ret void
}
; Metadata node attached as `!nontemporal !0` to the nontemporal_* stores above.
!0 = !{i32 1}