[AMDGPU] gfx1010 memory legalizer

Differential Revision: https://reviews.llvm.org/D61535 llvm-svn: 360087
2024-11-22 18:54:02 +01:00 · 2019-05-06 21:57:02 +00:00 · 2019-05-06 21:57:02 +00:00 · 32c2919cd9
commit 32c2919cd9
parent 55dff7252c
7 changed files with 4909 additions and 1009 deletions
--- a/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@ -352,6 +352,40 @@ public:

 };

+class SIGfx10CacheControl : public SIGfx7CacheControl {
+protected:
+  bool CuMode = false;
+
+  /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI
+  /// is modified, false otherwise.
+  bool enableDLCBit(const MachineBasicBlock::iterator &MI) const {
+    return enableNamedBit<AMDGPU::OpName::dlc>(MI);
+  }
+
+public:
+  /// \p CuMode is true when the subtarget has CU mode enabled (else WGP mode).
+  SIGfx10CacheControl(const GCNSubtarget &ST, bool CuMode) :
+    SIGfx7CacheControl(ST), CuMode(CuMode) {}
+
+  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
+                             SIAtomicScope Scope,
+                             SIAtomicAddrSpace AddrSpace) const override;
+
+  bool enableNonTemporal(const MachineBasicBlock::iterator &MI) const override;
+
+  bool insertCacheInvalidate(MachineBasicBlock::iterator &MI,
+                             SIAtomicScope Scope,
+                             SIAtomicAddrSpace AddrSpace,
+                             Position Pos) const override;
+
+  bool insertWait(MachineBasicBlock::iterator &MI,
+                  SIAtomicScope Scope,
+                  SIAtomicAddrSpace AddrSpace,
+                  SIMemOp Op,
+                  bool IsCrossAddrSpaceOrdering,
+                  Position Pos) const override;
+};
+
 class SIMemoryLegalizer final : public MachineFunctionPass {
 private:

@ -623,7 +657,9 @@ std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
  GCNSubtarget::Generation Generation = ST.getGeneration();
  if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
    return make_unique<SIGfx6CacheControl>(ST);
-  return make_unique<SIGfx7CacheControl>(ST);
+  if (Generation < AMDGPUSubtarget::GFX10)
+    return make_unique<SIGfx7CacheControl>(ST);
+  return make_unique<SIGfx10CacheControl>(ST, ST.isCuModeEnabled());
 }

 bool SIGfx6CacheControl::enableLoadCacheBypass(
@ -860,6 +896,231 @@ bool SIGfx7CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI,
  return Changed;
 }

+bool SIGfx10CacheControl::enableLoadCacheBypass(
+    const MachineBasicBlock::iterator &MI,
+    SIAtomicScope Scope,
+    SIAtomicAddrSpace AddrSpace) const {
+  assert(MI->mayLoad() && !MI->mayStore());
+  bool Changed = false;
+
+  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
+    /// TODO: Do not set glc for rmw atomic operations as they
+    /// implicitly bypass the L0/L1 caches.
+
+    switch (Scope) {
+    case SIAtomicScope::SYSTEM:
+    case SIAtomicScope::AGENT:
+      Changed |= enableGLCBit(MI);
+      Changed |= enableDLCBit(MI);
+      break;
+    case SIAtomicScope::WORKGROUP:
+      // In WGP mode the waves of a work-group can be executing on either CU of
+      // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
+      // CU mode all waves of a work-group are on the same CU, and so the
+      // L0 does not need to be bypassed.
+      if (!CuMode) Changed |= enableGLCBit(MI);
+      break;
+    case SIAtomicScope::WAVEFRONT:
+    case SIAtomicScope::SINGLETHREAD:
+      // No cache to bypass.
+      break;
+    default:
+      llvm_unreachable("Unsupported synchronization scope");
+    }
+  }
+
+  /// The scratch address space does not need the global memory caches
+  /// to be bypassed as all memory operations by the same thread are
+  /// sequentially consistent, and no other thread can access scratch
+  /// memory.
+
+  /// Other address spaces do not have a cache.
+
+  return Changed;
+}
+
+bool SIGfx10CacheControl::enableNonTemporal(
+    const MachineBasicBlock::iterator &MI) const {
+  assert(MI->mayLoad() ^ MI->mayStore());
+  bool Changed = false;
+
+  Changed |= enableSLCBit(MI);
+  /// TODO: For store (non-rmw atomic) instructions also enableGLCBit(MI).
+
+  return Changed;
+}
+
+bool SIGfx10CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI,
+                                                SIAtomicScope Scope,
+                                                SIAtomicAddrSpace AddrSpace,
+                                                Position Pos) const {
+  bool Changed = false;
+
+  MachineBasicBlock &MBB = *MI->getParent();
+  DebugLoc DL = MI->getDebugLoc();
+
+  if (Pos == Position::AFTER)
+    ++MI;
+
+  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
+    switch (Scope) {
+    case SIAtomicScope::SYSTEM:
+    case SIAtomicScope::AGENT:
+      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
+      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV));
+      Changed = true;
+      break;
+    case SIAtomicScope::WORKGROUP:
+      // In WGP mode the waves of a work-group can be executing on either CU of
+      // the WGP. Therefore need to invalidate the L0 which is per CU. Otherwise
+      // in CU mode all waves of a work-group are on the same CU, and so the
+      // L0 does not need to be invalidated.
+      if (!CuMode) {
+        BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
+        Changed = true;
+      }
+      break;
+    case SIAtomicScope::WAVEFRONT:
+    case SIAtomicScope::SINGLETHREAD:
+      // No cache to invalidate.
+      break;
+    default:
+      llvm_unreachable("Unsupported synchronization scope");
+    }
+  }
+
+  /// The scratch address space does not need the global memory cache
+  /// to be flushed as all memory operations by the same thread are
+  /// sequentially consistent, and no other thread can access scratch
+  /// memory.
+
+  /// Other address spaces do not have a cache.
+
+  if (Pos == Position::AFTER)
+    --MI;
+
+  return Changed;
+}
+
+bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
+                                     SIAtomicScope Scope,
+                                     SIAtomicAddrSpace AddrSpace,
+                                     SIMemOp Op,
+                                     bool IsCrossAddrSpaceOrdering,
+                                     Position Pos) const {
+  bool Changed = false;
+
+  MachineBasicBlock &MBB = *MI->getParent();
+  DebugLoc DL = MI->getDebugLoc();
+
+  if (Pos == Position::AFTER)
+    ++MI;
+
+  bool VMCnt = false;
+  bool VSCnt = false;
+  bool LGKMCnt = false;
+
+  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
+    switch (Scope) {
+    case SIAtomicScope::SYSTEM:
+    case SIAtomicScope::AGENT:
+      if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
+        VMCnt |= true;
+      if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
+        VSCnt |= true;
+      break;
+    case SIAtomicScope::WORKGROUP:
+      // In WGP mode the waves of a work-group can be executing on either CU of
+      // the WGP. Therefore need to wait for operations to complete to ensure
+      // they are visible to waves in the other CU as the L0 is per CU.
+      // Otherwise in CU mode all waves of a work-group are on the same CU
+      // which shares the same L0.
+      if (!CuMode) {
+        if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
+          VMCnt |= true;
+        if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
+          VSCnt |= true;
+      }
+      break;
+    case SIAtomicScope::WAVEFRONT:
+    case SIAtomicScope::SINGLETHREAD:
+      // The L0 cache keeps all memory operations in order for
+      // work-items in the same wavefront.
+      break;
+    default:
+      llvm_unreachable("Unsupported synchronization scope");
+    }
+  }
+
+  if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
+    switch (Scope) {
+    case SIAtomicScope::SYSTEM:
+    case SIAtomicScope::AGENT:
+    case SIAtomicScope::WORKGROUP:
+      // If no cross address space ordering then an LDS waitcnt is not
+      // needed as LDS operations for all waves are executed in a
+      // total global ordering as observed by all waves. Required if
+      // also synchronizing with global/GDS memory as LDS operations
+      // could be reordered with respect to later global/GDS memory
+      // operations of the same wave.
+      LGKMCnt |= IsCrossAddrSpaceOrdering;
+      break;
+    case SIAtomicScope::WAVEFRONT:
+    case SIAtomicScope::SINGLETHREAD:
+      // The LDS keeps all memory operations in order for
+      // the same wavefront.
+      break;
+    default:
+      llvm_unreachable("Unsupported synchronization scope");
+    }
+  }
+
+  if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
+    switch (Scope) {
+    case SIAtomicScope::SYSTEM:
+    case SIAtomicScope::AGENT:
+      // If no cross address space ordering then a GDS waitcnt is not
+      // needed as GDS operations for all waves are executed in a
+      // total global ordering as observed by all waves. Required if
+      // also synchronizing with global/LDS memory as GDS operations
+      // could be reordered with respect to later global/LDS memory
+      // operations of the same wave.
+      LGKMCnt |= IsCrossAddrSpaceOrdering;
+      break;
+    case SIAtomicScope::WORKGROUP:
+    case SIAtomicScope::WAVEFRONT:
+    case SIAtomicScope::SINGLETHREAD:
+      // The GDS keeps all memory operations in order for
+      // the same work-group.
+      break;
+    default:
+      llvm_unreachable("Unsupported synchronization scope");
+    }
+  }
+
+  if (VMCnt || LGKMCnt) {
+    unsigned WaitCntImmediate =
+      AMDGPU::encodeWaitcnt(IV,
+                            VMCnt ? 0 : getVmcntBitMask(IV),
+                            getExpcntBitMask(IV),
+                            LGKMCnt ? 0 : getLgkmcntBitMask(IV));
+    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
+    Changed = true;
+  }
+
+  if (VSCnt) {
+    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
+      .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
+      .addImm(0);
+    Changed = true;
+  }
+
+  if (Pos == Position::AFTER)
+    --MI;
+
+  return Changed;
+}
+
 bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
  if (AtomicPseudoMIs.empty())
    return false;
--- a/test/CodeGen/AMDGPU/memory-legalizer-atomic-cmpxchg.ll
+++ b/test/CodeGen/AMDGPU/memory-legalizer-atomic-cmpxchg.ll
--- a/test/CodeGen/AMDGPU/memory-legalizer-atomic-fence.ll
+++ b/test/CodeGen/AMDGPU/memory-legalizer-atomic-fence.ll
@ -1,6 +1,8 @@
 ; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN,GFX6,GFX68 %s
 ; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN,GFX8,GFX68 %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN,GFX8,GFX68 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+code-object-v3 -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN,GFX10,GFX10WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+code-object-v3,+cumode -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN,GFX10,GFX10CU %s

 ; FUNC-LABEL: {{^}}system_one_as_acquire:
 ; GCN:        %bb.0
@ -9,7 +11,15 @@
 ; GFX6-NEXT:  buffer_wbinvl1{{$}}
 ; GFX8:       s_waitcnt vmcnt(0){{$}}
 ; GFX8-NEXT:  buffer_wbinvl1_vol{{$}}
+; GFX10:      s_waitcnt vmcnt(0){{$}}
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
+; GFX10-NEXT: buffer_gl0_inv{{$}}
+; GFX10-NEXT: buffer_gl1_inv{{$}}
 ; GCN:        s_endpgm
+; GFX10:         .amdhsa_kernel system_one_as_acquire
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @system_one_as_acquire() {
 entry:
  fence syncscope("one-as") acquire
@ -20,7 +30,12 @@ entry:
 ; GCN:        %bb.0
 ; GCN-NOT:    ATOMIC_FENCE
 ; GCN:        s_waitcnt vmcnt(0){{$}}
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
 ; GCN:        s_endpgm
+; GFX10:         .amdhsa_kernel system_one_as_release
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @system_one_as_release() {
 entry:
  fence syncscope("one-as") release
@ -31,9 +46,16 @@ entry:
 ; GCN:        %bb.0
 ; GCN-NOT:    ATOMIC_FENCE
 ; GCN:        s_waitcnt vmcnt(0){{$}}
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
 ; GFX6:       buffer_wbinvl1{{$}}
 ; GFX8:       buffer_wbinvl1_vol{{$}}
+; GFX10-NEXT: buffer_gl0_inv{{$}}
+; GFX10-NEXT: buffer_gl1_inv{{$}}
 ; GCN:        s_endpgm
+; GFX10:         .amdhsa_kernel system_one_as_acq_rel
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @system_one_as_acq_rel() {
 entry:
  fence syncscope("one-as") acq_rel
@ -44,9 +66,16 @@ entry:
 ; GCN:        %bb.0
 ; GCN-NOT:    ATOMIC_FENCE
 ; GCN:        s_waitcnt vmcnt(0){{$}}
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
 ; GFX6:       buffer_wbinvl1{{$}}
 ; GFX8:       buffer_wbinvl1_vol{{$}}
+; GFX10-NEXT: buffer_gl0_inv{{$}}
+; GFX10-NEXT: buffer_gl1_inv{{$}}
 ; GCN:        s_endpgm
+; GFX10:         .amdhsa_kernel system_one_as_seq_cst
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @system_one_as_seq_cst() {
 entry:
  fence syncscope("one-as") seq_cst
@ -57,6 +86,10 @@ entry:
 ; GCN:        %bb.0
 ; GCN-NOT:    ATOMIC_FENCE
 ; GCN:        s_endpgm
+; GFX10:         .amdhsa_kernel singlethread_one_as_acquire
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @singlethread_one_as_acquire() {
 entry:
  fence syncscope("singlethread-one-as") acquire
@ -67,6 +100,10 @@ entry:
 ; GCN:        %bb.0
 ; GCN-NOT:    ATOMIC_FENCE
 ; GCN:        s_endpgm
+; GFX10:         .amdhsa_kernel singlethread_one_as_release
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @singlethread_one_as_release() {
 entry:
  fence syncscope("singlethread-one-as") release
@ -77,6 +114,10 @@ entry:
 ; GCN:        %bb.0
 ; GCN-NOT:    ATOMIC_FENCE
 ; GCN:        s_endpgm
+; GFX10:         .amdhsa_kernel singlethread_one_as_acq_rel
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @singlethread_one_as_acq_rel() {
 entry:
  fence syncscope("singlethread-one-as") acq_rel
@ -87,6 +128,10 @@ entry:
 ; GCN:        %bb.0
 ; GCN-NOT:    ATOMIC_FENCE
 ; GCN:        s_endpgm
+; GFX10:         .amdhsa_kernel singlethread_one_as_seq_cst
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @singlethread_one_as_seq_cst() {
 entry:
  fence syncscope("singlethread-one-as") seq_cst
@ -100,7 +145,15 @@ entry:
 ; GFX6-NEXT:  buffer_wbinvl1{{$}}
 ; GFX8:       s_waitcnt vmcnt(0){{$}}
 ; GFX8-NEXT:  buffer_wbinvl1_vol{{$}}
+; GFX10:      s_waitcnt vmcnt(0){{$}}
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
+; GFX10-NEXT: buffer_gl0_inv{{$}}
+; GFX10-NEXT: buffer_gl1_inv{{$}}
 ; GCN:        s_endpgm
+; GFX10:         .amdhsa_kernel agent_one_as_acquire
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @agent_one_as_acquire() {
 entry:
  fence syncscope("agent-one-as") acquire
@ -111,7 +164,12 @@ entry:
 ; GCN:        %bb.0
 ; GCN-NOT:    ATOMIC_FENCE
 ; GCN:        s_waitcnt vmcnt(0){{$}}
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
 ; GCN:        s_endpgm
+; GFX10:         .amdhsa_kernel agent_one_as_release
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @agent_one_as_release() {
 entry:
  fence syncscope("agent-one-as") release
@ -122,9 +180,16 @@ entry:
 ; GCN:        %bb.0
 ; GCN-NOT:    ATOMIC_FENCE
 ; GCN:        s_waitcnt vmcnt(0){{$}}
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
 ; GFX6:       buffer_wbinvl1{{$}}
 ; GFX8:       buffer_wbinvl1_vol{{$}}
+; GFX10-NEXT: buffer_gl0_inv{{$}}
+; GFX10-NEXT: buffer_gl1_inv{{$}}
 ; GCN:        s_endpgm
+; GFX10:         .amdhsa_kernel agent_one_as_acq_rel
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @agent_one_as_acq_rel() {
 entry:
  fence syncscope("agent-one-as") acq_rel
@ -135,53 +200,99 @@ entry:
 ; GCN:        %bb.0
 ; GCN-NOT:    ATOMIC_FENCE
 ; GCN:        s_waitcnt vmcnt(0){{$}}
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
 ; GFX6:       buffer_wbinvl1{{$}}
 ; GFX8:       buffer_wbinvl1_vol{{$}}
+; GFX10-NEXT: buffer_gl0_inv{{$}}
+; GFX10-NEXT: buffer_gl1_inv{{$}}
 ; GCN:        s_endpgm
+; GFX10:         .amdhsa_kernel agent_one_as_seq_cst
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @agent_one_as_seq_cst() {
 entry:
  fence syncscope("agent-one-as") seq_cst
  ret void
 }

-; FUNC-LABEL: {{^}}workgroup_one_as_acquire:
-; GCN:        %bb.0
-; GFX68-NOT:  s_waitcnt vmcnt(0){{$}}
-; GCN-NOT:    ATOMIC_FENCE
-; GCN:        s_endpgm
+; FUNC-LABEL:    {{^}}workgroup_one_as_acquire:
+; GCN:           %bb.0
+; GFX68-NOT:     s_waitcnt vmcnt(0){{$}}
+; GFX10WGP:      s_waitcnt vmcnt(0){{$}}
+; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
+; GFX10CU-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX10CU-NOT:   s_waitcnt_vscnt null, 0x0{{$}}
+; GFX10WGP-NEXT: buffer_gl0_inv{{$}}
+; GFX10CU-NOT:   buffer_gl0_inv{{$}}
+; GCN-NOT:       ATOMIC_FENCE
+; GCN:           s_endpgm
+; GFX10:         .amdhsa_kernel workgroup_one_as_acquire
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @workgroup_one_as_acquire() {
 entry:
  fence syncscope("workgroup-one-as") acquire
  ret void
 }

-; FUNC-LABEL: {{^}}workgroup_one_as_release:
-; GCN:        %bb.0
-; GFX68-NOT:  s_waitcnt vmcnt(0){{$}}
-; GCN-NOT:    ATOMIC_FENCE
-; GCN:        s_endpgm
+; FUNC-LABEL:    {{^}}workgroup_one_as_release:
+; GCN:           %bb.0
+; GFX68-NOT:     s_waitcnt vmcnt(0){{$}}
+; GFX10WGP:      s_waitcnt vmcnt(0){{$}}
+; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
+; GFX10CU-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX10CU-NOT:   s_waitcnt_vscnt null, 0x0{{$}}
+; GFX10-NOT:     buffer_gl0_inv
+; GCN-NOT:       ATOMIC_FENCE
+; GCN:           s_endpgm
+; GFX10:         .amdhsa_kernel workgroup_one_as_release
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @workgroup_one_as_release() {
 entry:
  fence syncscope("workgroup-one-as") release
  ret void
 }

-; FUNC-LABEL: {{^}}workgroup_one_as_acq_rel:
-; GCN:        %bb.0
-; GFX68-NOT:  s_waitcnt vmcnt(0){{$}}
-; GCN-NOT:    ATOMIC_FENCE
-; GCN:        s_endpgm
+; FUNC-LABEL:    {{^}}workgroup_one_as_acq_rel:
+; GCN:           %bb.0
+; GFX68-NOT:     s_waitcnt vmcnt(0){{$}}
+; GFX10WGP:      s_waitcnt vmcnt(0){{$}}
+; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
+; GFX10WGP-NEXT: buffer_gl0_inv{{$}}
+; GFX10CU-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX10CU-NOT:   s_waitcnt_vscnt null, 0x0{{$}}
+; GFX10CU-NOT:   buffer_gl0_inv{{$}}
+; GCN-NOT:       ATOMIC_FENCE
+; GCN:           s_endpgm
+; GFX10:         .amdhsa_kernel workgroup_one_as_acq_rel
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @workgroup_one_as_acq_rel() {
 entry:
  fence syncscope("workgroup-one-as") acq_rel
  ret void
 }

-; FUNC-LABEL: {{^}}workgroup_one_as_seq_cst:
-; GCN:        %bb.0
-; GFX68-NOT:  s_waitcnt vmcnt(0){{$}}
-; GCN-NOT:    ATOMIC_FENCE
-; GCN:        s_endpgm
+; FUNC-LABEL:    {{^}}workgroup_one_as_seq_cst:
+; GCN:           %bb.0
+; GFX68-NOT:     s_waitcnt vmcnt(0){{$}}
+; GFX10WGP:      s_waitcnt vmcnt(0){{$}}
+; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
+; GFX10WGP-NEXT: buffer_gl0_inv{{$}}
+; GFX10CU-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX10CU-NOT:   s_waitcnt_vscnt null, 0x0{{$}}
+; GFX10CU-NOT:   buffer_gl0_inv{{$}}
+; GCN-NOT:       ATOMIC_FENCE
+; GCN:           s_endpgm
+; GFX10:         .amdhsa_kernel workgroup_one_as_seq_cst
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @workgroup_one_as_seq_cst() {
 entry:
  fence syncscope("workgroup-one-as") seq_cst
@ -192,6 +303,10 @@ entry:
 ; GCN:        %bb.0
 ; GCN-NOT:    ATOMIC_FENCE
 ; GCN:        s_endpgm
+; GFX10:         .amdhsa_kernel wavefront_one_as_acquire
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @wavefront_one_as_acquire() {
 entry:
  fence syncscope("wavefront-one-as") acquire
@ -202,6 +317,10 @@ entry:
 ; GCN:        %bb.0
 ; GCN-NOT:    ATOMIC_FENCE
 ; GCN:        s_endpgm
+; GFX10:         .amdhsa_kernel wavefront_one_as_release
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @wavefront_one_as_release() {
 entry:
  fence syncscope("wavefront-one-as") release
@ -212,6 +331,10 @@ entry:
 ; GCN:        %bb.0
 ; GCN-NOT:    ATOMIC_FENCE
 ; GCN:        s_endpgm
+; GFX10:         .amdhsa_kernel wavefront_one_as_acq_rel
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @wavefront_one_as_acq_rel() {
 entry:
  fence syncscope("wavefront-one-as") acq_rel
@ -222,6 +345,10 @@ entry:
 ; GCN:        %bb.0
 ; GCN-NOT:    ATOMIC_FENCE
 ; GCN:        s_endpgm
+; GFX10:         .amdhsa_kernel wavefront_one_as_seq_cst
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @wavefront_one_as_seq_cst() {
 entry:
  fence syncscope("wavefront-one-as") seq_cst
@ -235,7 +362,15 @@ entry:
 ; GFX6-NEXT:  buffer_wbinvl1{{$}}
 ; GFX8:       s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GFX8-NEXT:  buffer_wbinvl1_vol{{$}}
+; GFX10:      s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
+; GFX10-NEXT: buffer_gl0_inv{{$}}
+; GFX10-NEXT: buffer_gl1_inv{{$}}
 ; GCN:        s_endpgm
+; GFX10:         .amdhsa_kernel system_acquire
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @system_acquire() {
 entry:
  fence acquire
@ -245,8 +380,15 @@ entry:
 ; FUNC-LABEL: {{^}}system_release:
 ; GCN:        %bb.0
 ; GCN-NOT:    ATOMIC_FENCE
-; GCN:        s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX6:       s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX8:       s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX10:      s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
 ; GCN:        s_endpgm
+; GFX10:         .amdhsa_kernel system_release
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @system_release() {
 entry:
  fence release
@ -256,10 +398,19 @@ entry:
 ; FUNC-LABEL: {{^}}system_acq_rel:
 ; GCN:        %bb.0
 ; GCN-NOT:    ATOMIC_FENCE
-; GCN:        s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX6:       s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX8:       s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX10:      s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
 ; GFX6:       buffer_wbinvl1{{$}}
 ; GFX8:       buffer_wbinvl1_vol{{$}}
+; GFX10-NEXT: buffer_gl0_inv{{$}}
+; GFX10-NEXT: buffer_gl1_inv{{$}}
 ; GCN:        s_endpgm
+; GFX10:         .amdhsa_kernel system_acq_rel
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @system_acq_rel() {
 entry:
  fence acq_rel
@ -269,10 +420,19 @@ entry:
 ; FUNC-LABEL: {{^}}system_seq_cst:
 ; GCN:        %bb.0
 ; GCN-NOT:    ATOMIC_FENCE
-; GCN:        s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX6:       s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX8:       s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX10:      s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
 ; GFX6:       buffer_wbinvl1{{$}}
 ; GFX8:       buffer_wbinvl1_vol{{$}}
+; GFX10-NEXT: buffer_gl0_inv{{$}}
+; GFX10-NEXT: buffer_gl1_inv{{$}}
 ; GCN:        s_endpgm
+; GFX10:         .amdhsa_kernel system_seq_cst
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @system_seq_cst() {
 entry:
  fence seq_cst
@ -283,6 +443,10 @@ entry:
 ; GCN:        %bb.0
 ; GCN-NOT:    ATOMIC_FENCE
 ; GCN:        s_endpgm
+; GFX10:         .amdhsa_kernel singlethread_acquire
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @singlethread_acquire() {
 entry:
  fence syncscope("singlethread") acquire
@ -293,6 +457,10 @@ entry:
 ; GCN:        %bb.0
 ; GCN-NOT:    ATOMIC_FENCE
 ; GCN:        s_endpgm
+; GFX10:         .amdhsa_kernel singlethread_release
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @singlethread_release() {
 entry:
  fence syncscope("singlethread") release
@ -303,6 +471,10 @@ entry:
 ; GCN:        %bb.0
 ; GCN-NOT:    ATOMIC_FENCE
 ; GCN:        s_endpgm
+; GFX10:         .amdhsa_kernel singlethread_acq_rel
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @singlethread_acq_rel() {
 entry:
  fence syncscope("singlethread") acq_rel
@ -313,6 +485,10 @@ entry:
 ; GCN:        %bb.0
 ; GCN-NOT:    ATOMIC_FENCE
 ; GCN:        s_endpgm
+; GFX10:         .amdhsa_kernel singlethread_seq_cst
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @singlethread_seq_cst() {
 entry:
  fence syncscope("singlethread") seq_cst
@ -326,7 +502,15 @@ entry:
 ; GFX6-NEXT:  buffer_wbinvl1{{$}}
 ; GFX8:       s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GFX8-NEXT:  buffer_wbinvl1_vol{{$}}
+; GFX10:      s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
+; GFX10-NEXT: buffer_gl0_inv{{$}}
+; GFX10-NEXT: buffer_gl1_inv{{$}}
 ; GCN:        s_endpgm
+; GFX10:         .amdhsa_kernel agent_acquire
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @agent_acquire() {
 entry:
  fence syncscope("agent") acquire
@ -336,8 +520,15 @@ entry:
 ; FUNC-LABEL: {{^}}agent_release:
 ; GCN:        %bb.0
 ; GCN-NOT:    ATOMIC_FENCE
-; GCN:        s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX6:       s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX8:       s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX10:      s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
 ; GCN:        s_endpgm
+; GFX10:         .amdhsa_kernel agent_release
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @agent_release() {
 entry:
  fence syncscope("agent") release
@ -347,10 +538,19 @@ entry:
 ; FUNC-LABEL: {{^}}agent_acq_rel:
 ; GCN:        %bb.0
 ; GCN-NOT:    ATOMIC_FENCE
-; GCN:        s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX6:       s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX8:       s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX10:      s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
 ; GFX6:       buffer_wbinvl1{{$}}
 ; GFX8:       buffer_wbinvl1_vol{{$}}
+; GFX10-NEXT: buffer_gl0_inv{{$}}
+; GFX10-NEXT: buffer_gl1_inv{{$}}
 ; GCN:        s_endpgm
+; GFX10:         .amdhsa_kernel agent_acq_rel
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @agent_acq_rel() {
 entry:
  fence syncscope("agent") acq_rel
@ -360,54 +560,102 @@ entry:
 ; FUNC-LABEL: {{^}}agent_seq_cst:
 ; GCN:        %bb.0
 ; GCN-NOT:    ATOMIC_FENCE
-; GCN:        s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX6:       s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX8:       s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX10:      s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
 ; GFX6:       buffer_wbinvl1{{$}}
 ; GFX8:       buffer_wbinvl1_vol{{$}}
+; GFX10-NEXT: buffer_gl0_inv{{$}}
+; GFX10-NEXT: buffer_gl1_inv{{$}}
 ; GCN:        s_endpgm
+; GFX10:         .amdhsa_kernel agent_seq_cst
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @agent_seq_cst() {
 entry:
  fence syncscope("agent") seq_cst
  ret void
 }

-; FUNC-LABEL: {{^}}workgroup_acquire:
-; GCN:        %bb.0
-; GFX68-NOT:  s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
-; GCN-NOT:    ATOMIC_FENCE
-; GCN:        s_endpgm
+; FUNC-LABEL:    {{^}}workgroup_acquire:
+; GCN:           %bb.0
+; GFX68-NOT:     s_waitcnt vmcnt(0){{$}}
+; GFX10WGP:      s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
+; GFX10CU-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX10CU-NOT:   s_waitcnt_vscnt null, 0x0{{$}}
+; GFX10WGP-NEXT: buffer_gl0_inv{{$}}
+; GFX10CU-NOT:   buffer_gl0_inv{{$}}
+; GCN-NOT:       ATOMIC_FENCE
+; GCN:           s_endpgm
+; GFX10:         .amdhsa_kernel workgroup_acquire
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @workgroup_acquire() {
 entry:
  fence syncscope("workgroup") acquire
  ret void
 }

-; FUNC-LABEL: {{^}}workgroup_release:
-; GCN:        %bb.0
-; GFX68-NOT:  s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
-; GCN-NOT:    ATOMIC_FENCE
-; GCN:        s_endpgm
+; FUNC-LABEL:    {{^}}workgroup_release:
+; GCN:           %bb.0
+; GFX68-NOT:     s_waitcnt vmcnt(0){{$}}
+; GFX10WGP:      s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
+; GFX10CU-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX10CU-NOT:   s_waitcnt_vscnt null, 0x0{{$}}
+; GFX10-NOT:     buffer_gl0_inv
+; GCN-NOT:       ATOMIC_FENCE
+; GCN:           s_endpgm
+; GFX10:         .amdhsa_kernel workgroup_release
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @workgroup_release() {
 entry:
  fence syncscope("workgroup") release
  ret void
 }

-; FUNC-LABEL: {{^}}workgroup_acq_rel:
-; GCN:        %bb.0
-; GFX68-NOT:  s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
-; GCN-NOT:    ATOMIC_FENCE
-; GCN:        s_endpgm
+; FUNC-LABEL:    {{^}}workgroup_acq_rel:
+; GCN:           %bb.0
+; GFX68-NOT:     s_waitcnt vmcnt(0){{$}}
+; GFX10WGP:      s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
+; GFX10WGP-NEXT: buffer_gl0_inv{{$}}
+; GFX10CU-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX10CU-NOT:   s_waitcnt_vscnt null, 0x0{{$}}
+; GFX10CU-NOT:   buffer_gl0_inv{{$}}
+; GCN-NOT:       ATOMIC_FENCE
+; GCN:           s_endpgm
+; GFX10:         .amdhsa_kernel workgroup_acq_rel
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @workgroup_acq_rel() {
 entry:
  fence syncscope("workgroup") acq_rel
  ret void
 }

-; FUNC-LABEL: {{^}}workgroup_seq_cst:
-; GCN:        %bb.0
-; GFX68-NOT:  s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
-; GCN-NOT:    ATOMIC_FENCE
-; GCN:        s_endpgm
+; FUNC-LABEL:    {{^}}workgroup_seq_cst:
+; GCN:           %bb.0
+; GFX68-NOT:     s_waitcnt vmcnt(0){{$}}
+; GFX10WGP:      s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
+; GFX10WGP-NEXT: buffer_gl0_inv{{$}}
+; GFX10CU-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX10CU-NOT:   s_waitcnt_vscnt null, 0x0{{$}}
+; GFX10CU-NOT:   buffer_gl0_inv{{$}}
+; GCN-NOT:       ATOMIC_FENCE
+; GCN:           s_endpgm
+; GFX10:         .amdhsa_kernel workgroup_seq_cst
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @workgroup_seq_cst() {
 entry:
  fence syncscope("workgroup") seq_cst
@ -418,6 +666,10 @@ entry:
 ; GCN:        %bb.0
 ; GCN-NOT:    ATOMIC_FENCE
 ; GCN:        s_endpgm
+; GFX10:         .amdhsa_kernel wavefront_acquire
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @wavefront_acquire() {
 entry:
  fence syncscope("wavefront") acquire
@ -428,6 +680,10 @@ entry:
 ; GCN:        %bb.0
 ; GCN-NOT:    ATOMIC_FENCE
 ; GCN:        s_endpgm
+; GFX10:         .amdhsa_kernel wavefront_release
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @wavefront_release() {
 entry:
  fence syncscope("wavefront") release
@ -438,6 +694,10 @@ entry:
 ; GCN:        %bb.0
 ; GCN-NOT:    ATOMIC_FENCE
 ; GCN:        s_endpgm
+; GFX10:         .amdhsa_kernel wavefront_acq_rel
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @wavefront_acq_rel() {
 entry:
  fence syncscope("wavefront") acq_rel
@ -448,6 +708,10 @@ entry:
 ; GCN:        %bb.0
 ; GCN-NOT:    ATOMIC_FENCE
 ; GCN:        s_endpgm
+; GFX10:         .amdhsa_kernel wavefront_seq_cst
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @wavefront_seq_cst() {
 entry:
  fence syncscope("wavefront") seq_cst
--- a/test/CodeGen/AMDGPU/memory-legalizer-atomic-rmw.ll
+++ b/test/CodeGen/AMDGPU/memory-legalizer-atomic-rmw.ll
--- a/test/CodeGen/AMDGPU/memory-legalizer-invalid-syncscope.ll
+++ b/test/CodeGen/AMDGPU/memory-legalizer-invalid-syncscope.ll
@ -1,5 +1,6 @@
 ; RUN: not llc -mtriple=amdgcn-amd- -mcpu=gfx803 -verify-machineinstrs < %s 2>&1 | FileCheck %s
 ; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -verify-machineinstrs < %s 2>&1 | FileCheck %s
+; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s 2>&1 | FileCheck %s

 ; CHECK: error: <unknown>:0:0: in function invalid_fence void (): Unsupported atomic synchronization scope
 define amdgpu_kernel void @invalid_fence() {
--- a/test/CodeGen/AMDGPU/memory-legalizer-load.ll
+++ b/test/CodeGen/AMDGPU/memory-legalizer-load.ll
@ -2,15 +2,24 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX8,GFX89 %s
 ; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx900 -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck --check-prefixes=GCN,GFX9,GFX89 %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck --check-prefixes=GCN,GFX9,GFX89 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+code-object-v3 -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck --check-prefixes=GCN,GFX10,GFX10WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+code-object-v3,+cumode -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck --check-prefixes=GCN,GFX10,GFX10CU %s

 declare i32 @llvm.amdgcn.workitem.id.x()

 ; GCN-LABEL: {{^}}system_one_as_unordered:
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 ; GCN:       flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 ; GFX89-NOT: buffer_wbinvl1_vol
+; GFX10-NOT: buffer_gl{{[01]}}_inv
 ; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+; GFX10:         .amdhsa_kernel system_one_as_unordered
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @system_one_as_unordered(
    i32* %in, i32* %out) {
 entry:
@ -21,10 +30,18 @@ entry:

 ; GCN-LABEL: {{^}}system_one_as_monotonic:
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 ; GFX89:     flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
+; GFX10:     flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc dlc{{$}}
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 ; GFX89-NOT: buffer_wbinvl1_vol
+; GFX10-NOT: buffer_gl{{[01]}}_inv
 ; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+; GFX10:         .amdhsa_kernel system_one_as_monotonic
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @system_one_as_monotonic(
    i32* %in, i32* %out) {
 entry:
@ -35,10 +52,18 @@ entry:

 ; GCN-LABEL: {{^}}system_one_as_acquire:
 ; GCN-NOT:    s_waitcnt vmcnt(0){{$}}
-; GCN:        flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
+; GFX10-NOT:  s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
+; GFX89:      flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
+; GFX10:      flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc dlc{{$}}
 ; GCN-NEXT:   s_waitcnt vmcnt(0){{$}}
 ; GFX89-NEXT: buffer_wbinvl1_vol
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
 ; GCN:        flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+; GFX10:         .amdhsa_kernel system_one_as_acquire
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @system_one_as_acquire(
    i32* %in, i32* %out) {
 entry:
@ -49,10 +74,18 @@ entry:

 ; GCN-LABEL: {{^}}system_one_as_seq_cst:
 ; GCN:        s_waitcnt vmcnt(0){{$}}
-; GCN-NEXT:   flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
+; GFX89-NEXT: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
+; GFX10-NEXT: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc dlc{{$}}
 ; GCN-NEXT:   s_waitcnt vmcnt(0){{$}}
 ; GFX89-NEXT: buffer_wbinvl1_vol
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
 ; GCN:        flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+; GFX10:         .amdhsa_kernel system_one_as_seq_cst
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @system_one_as_seq_cst(
    i32* %in, i32* %out) {
 entry:
@ -63,10 +96,17 @@ entry:

 ; GCN-LABEL: {{^}}singlethread_one_as_unordered:
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 ; GCN:       flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 ; GFX89-NOT: buffer_wbinvl1_vol
+; GFX10-NOT: buffer_gl{{[01]}}_inv
 ; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+; GFX10:         .amdhsa_kernel singlethread_one_as_unordered
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @singlethread_one_as_unordered(
    i32* %in, i32* %out) {
 entry:
@ -77,10 +117,17 @@ entry:

 ; GCN-LABEL: {{^}}singlethread_one_as_monotonic:
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 ; GCN:       flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 ; GFX89-NOT: buffer_wbinvl1_vol
+; GFX10-NOT: buffer_gl{{[01]}}_inv
 ; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+; GFX10:         .amdhsa_kernel singlethread_one_as_monotonic
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @singlethread_one_as_monotonic(
    i32* %in, i32* %out) {
 entry:
@ -91,10 +138,17 @@ entry:

 ; GCN-LABEL: {{^}}singlethread_one_as_acquire:
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 ; GCN:       flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 ; GFX89-NOT: buffer_wbinvl1_vol
+; GFX10-NOT: buffer_gl{{[01]}}_inv
 ; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+; GFX10:         .amdhsa_kernel singlethread_one_as_acquire
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @singlethread_one_as_acquire(
    i32* %in, i32* %out) {
 entry:
@ -105,10 +159,17 @@ entry:

 ; GCN-LABEL: {{^}}singlethread_one_as_seq_cst:
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 ; GCN:       flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 ; GFX89-NOT: buffer_wbinvl1_vol
+; GFX10-NOT: buffer_gl{{[01]}}_inv
 ; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+; GFX10:         .amdhsa_kernel singlethread_one_as_seq_cst
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @singlethread_one_as_seq_cst(
    i32* %in, i32* %out) {
 entry:
@ -119,10 +180,17 @@ entry:

 ; GCN-LABEL: {{^}}agent_one_as_unordered:
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 ; GCN:       flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 ; GFX89-NOT: buffer_wbinvl1_vol
+; GFX10-NOT: buffer_gl{{[01]}}_inv
 ; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+; GFX10:         .amdhsa_kernel agent_one_as_unordered
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @agent_one_as_unordered(
    i32* %in, i32* %out) {
 entry:
@ -133,10 +201,18 @@ entry:

 ; GCN-LABEL: {{^}}agent_one_as_monotonic:
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 ; GFX89:     flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
+; GFX10:     flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc dlc{{$}}
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 ; GFX89-NOT: buffer_wbinvl1_vol
+; GFX10-NOT: buffer_gl{{[01]}}_inv
 ; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+; GFX10:         .amdhsa_kernel agent_one_as_monotonic
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @agent_one_as_monotonic(
    i32* %in, i32* %out) {
 entry:
@ -147,10 +223,18 @@ entry:

 ; GCN-LABEL: {{^}}agent_one_as_acquire:
 ; GCN-NOT:    s_waitcnt vmcnt(0){{$}}
-; GCN:        flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
+; GFX10-NOT:  s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
+; GFX89:      flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
+; GFX10:      flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc dlc{{$}}
 ; GCN-NEXT:   s_waitcnt vmcnt(0){{$}}
 ; GFX89-NEXT: buffer_wbinvl1_vol
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
 ; GCN:        flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+; GFX10:         .amdhsa_kernel agent_one_as_acquire
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @agent_one_as_acquire(
    i32* %in, i32* %out) {
 entry:
@ -161,10 +245,18 @@ entry:

 ; GCN-LABEL: {{^}}agent_one_as_seq_cst:
 ; GCN:        s_waitcnt vmcnt(0){{$}}
-; GCN-NEXT:   flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
+; GFX89-NEXT: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
+; GFX10-NEXT: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc dlc{{$}}
 ; GCN-NEXT:   s_waitcnt vmcnt(0){{$}}
 ; GFX89-NEXT: buffer_wbinvl1_vol
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
 ; GCN:        flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+; GFX10:         .amdhsa_kernel agent_one_as_seq_cst
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @agent_one_as_seq_cst(
    i32* %in, i32* %out) {
 entry:
@ -175,10 +267,17 @@ entry:

 ; GCN-LABEL: {{^}}workgroup_one_as_unordered:
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 ; GCN:       flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 ; GFX89-NOT: buffer_wbinvl1_vol
+; GFX10-NOT: buffer_gl{{[01]}}_inv
 ; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+; GFX10:         .amdhsa_kernel workgroup_one_as_unordered
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @workgroup_one_as_unordered(
    i32* %in, i32* %out) {
 entry:
@ -187,12 +286,21 @@ entry:
  ret void
 }

-; GCN-LABEL: {{^}}workgroup_one_as_monotonic:
-; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
-; GFX89:     flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
-; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
-; GFX89-NOT: buffer_wbinvl1_vol
-; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+; GCN-LABEL:    {{^}}workgroup_one_as_monotonic:
+; GCN-NOT:      s_waitcnt vmcnt(0){{$}}
+; GFX10-NOT:    s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
+; GFX89:        flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
+; GFX10WGP:     flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
+; GFX10CU:      flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
+; GCN-NOT:      s_waitcnt vmcnt(0){{$}}
+; GFX10-NOT:    s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
+; GFX89-NOT:    buffer_wbinvl1_vol
+; GFX10-NOT:    buffer_gl{{[01]}}_inv
+; GCN:          flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+; GFX10:         .amdhsa_kernel workgroup_one_as_monotonic
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @workgroup_one_as_monotonic(
    i32* %in, i32* %out) {
 entry:
@ -201,12 +309,23 @@ entry:
  ret void
 }

-; GCN-LABEL: {{^}}workgroup_one_as_acquire:
-; GCN-NOT:    s_waitcnt vmcnt(0){{$}}
-; GFX89:      flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
-; GFX89-NOT:  s_waitcnt vmcnt(0){{$}}
-; GFX89-NOT:  buffer_wbinvl1_vol
-; GCN:        flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+; GCN-LABEL:     {{^}}workgroup_one_as_acquire:
+; GCN-NOT:       s_waitcnt vmcnt(0){{$}}
+; GFX10-NOT:     s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
+; GFX89:         flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
+; GFX10WGP:      flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
+; GFX10CU:       flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
+; GFX89-NOT:     s_waitcnt vmcnt(0){{$}}
+; GFX10WGP:      s_waitcnt vmcnt(0){{$}}
+; GFX10CU-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX89-NOT:     buffer_wbinvl1_vol
+; GFX10WGP-NEXT: buffer_gl0_inv
+; GFX10CU-NOT:   buffer_gl0_inv
+; GCN:           flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+; GFX10:         .amdhsa_kernel workgroup_one_as_acquire
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @workgroup_one_as_acquire(
    i32* %in, i32* %out) {
 entry:
@ -215,12 +334,26 @@ entry:
  ret void
 }

-; GCN-LABEL: {{^}}workgroup_one_as_seq_cst:
-; GFX89-NOT:  s_waitcnt vmcnt(0){{$}}
-; GFX89:      flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
-; GFX89-NOT:  s_waitcnt vmcnt(0){{$}}
-; GFX89-NOT:  buffer_wbinvl1_vol
-; GCN:        flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+; GCN-LABEL:     {{^}}workgroup_one_as_seq_cst:
+; GFX89-NOT:     s_waitcnt vmcnt(0){{$}}
+; GFX10WGP:      s_waitcnt vmcnt(0){{$}}
+; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
+; GFX10CU-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX10CU-NOT:   s_waitcnt_vscnt null, 0x0
+; GFX89:         flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
+; GFX10WGP:      flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
+; GFX10CU:       flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
+; GFX89-NOT:     s_waitcnt vmcnt(0){{$}}
+; GFX10WGP-NEXT: s_waitcnt vmcnt(0){{$}}
+; GFX10CU-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX89-NOT:     buffer_wbinvl1_vol
+; GFX10WGP-NEXT: buffer_gl0_inv
+; GFX10CU-NOT:   buffer_gl0_inv
+; GCN:           flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+; GFX10:         .amdhsa_kernel workgroup_one_as_seq_cst
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @workgroup_one_as_seq_cst(
    i32* %in, i32* %out) {
 entry:
@ -231,10 +364,17 @@ entry:

 ; GCN-LABEL: {{^}}wavefront_one_as_unordered:
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 ; GCN:       flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 ; GFX89-NOT: buffer_wbinvl1_vol
+; GFX10-NOT: buffer_gl{{[01]}}_inv
 ; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+; GFX10:         .amdhsa_kernel wavefront_one_as_unordered
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @wavefront_one_as_unordered(
    i32* %in, i32* %out) {
 entry:
@ -245,10 +385,17 @@ entry:

 ; GCN-LABEL: {{^}}wavefront_one_as_monotonic:
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 ; GCN:       flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 ; GFX89-NOT: buffer_wbinvl1_vol
+; GFX10-NOT: buffer_gl{{[01]}}_inv
 ; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+; GFX10:         .amdhsa_kernel wavefront_one_as_monotonic
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @wavefront_one_as_monotonic(
    i32* %in, i32* %out) {
 entry:
@ -259,10 +406,17 @@ entry:

 ; GCN-LABEL: {{^}}wavefront_one_as_acquire:
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 ; GCN:       flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 ; GFX89-NOT: buffer_wbinvl1_vol
+; GFX10-NOT: buffer_gl{{[01]}}_inv
 ; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+; GFX10:         .amdhsa_kernel wavefront_one_as_acquire
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @wavefront_one_as_acquire(
    i32* %in, i32* %out) {
 entry:
@ -273,10 +427,17 @@ entry:

 ; GCN-LABEL: {{^}}wavefront_one_as_seq_cst:
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 ; GCN:       flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 ; GFX89-NOT: buffer_wbinvl1_vol
+; GFX10-NOT: buffer_gl{{[01]}}_inv
 ; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+; GFX10:         .amdhsa_kernel wavefront_one_as_seq_cst
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @wavefront_one_as_seq_cst(
    i32* %in, i32* %out) {
 entry:
@ -287,6 +448,11 @@ entry:

 ; GCN-LABEL: {{^}}nontemporal_private_0:
 ; GFX89: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen glc slc{{$}}
+; GFX10: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen slc{{$}}
+; GFX10:         .amdhsa_kernel nontemporal_private_0
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @nontemporal_private_0(
    i32 addrspace(5)* %in, i32* %out) {
 entry:
@ -297,6 +463,11 @@ entry:

 ; GCN-LABEL: {{^}}nontemporal_private_1:
 ; GFX89: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen glc slc{{$}}
+; GFX10: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen slc{{$}}
+; GFX10:         .amdhsa_kernel nontemporal_private_1
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @nontemporal_private_1(
    i32 addrspace(5)* %in, i32* %out) {
 entry:
@ -309,6 +480,10 @@ entry:

 ; GCN-LABEL: {{^}}nontemporal_global_0:
 ; GCN: s_load_dword s{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0x0{{$}}
+; GFX10:         .amdhsa_kernel nontemporal_global_0
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @nontemporal_global_0(
    i32 addrspace(1)* %in, i32* %out) {
 entry:
@ -320,6 +495,11 @@ entry:
 ; GCN-LABEL: {{^}}nontemporal_global_1:
 ; GFX8:  flat_load_dword v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}] glc slc{{$}}
 ; GFX9:  global_load_dword v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}] glc slc{{$}}
+; GFX10: global_load_dword v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}] slc{{$}}
+; GFX10:         .amdhsa_kernel nontemporal_global_1
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @nontemporal_global_1(
    i32 addrspace(1)* %in, i32* %out) {
 entry:
@ -332,6 +512,10 @@ entry:

 ; GCN-LABEL: {{^}}nontemporal_local_0:
 ; GCN: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}}{{$}}
+; GFX10:         .amdhsa_kernel nontemporal_local_0
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @nontemporal_local_0(
    i32 addrspace(3)* %in, i32* %out) {
 entry:
@ -342,6 +526,10 @@ entry:

 ; GCN-LABEL: {{^}}nontemporal_local_1:
 ; GCN: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}}{{$}}
+; GFX10:         .amdhsa_kernel nontemporal_local_1
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @nontemporal_local_1(
    i32 addrspace(3)* %in, i32* %out) {
 entry:
@ -354,6 +542,11 @@ entry:

 ; GCN-LABEL: {{^}}nontemporal_flat_0:
 ; GFX89: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}] glc slc{{$}}
+; GFX10: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}] slc{{$}}
+; GFX10:         .amdhsa_kernel nontemporal_flat_0
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @nontemporal_flat_0(
    i32* %in, i32* %out) {
 entry:
@ -364,6 +557,11 @@ entry:

 ; GCN-LABEL: {{^}}nontemporal_flat_1:
 ; GFX89: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}] glc slc{{$}}
+; GFX10: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}] slc{{$}}
+; GFX10:         .amdhsa_kernel nontemporal_flat_1
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @nontemporal_flat_1(
    i32* %in, i32* %out) {
 entry:
@ -375,11 +573,18 @@ entry:
 }

 ; GCN-LABEL: {{^}}system_unordered:
-; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 ; GCN:       flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
-; GCN:       s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 ; GFX89-NOT: buffer_wbinvl1_vol
+; GFX10-NOT: buffer_gl{{[01]}}_inv
 ; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+; GFX10:         .amdhsa_kernel system_unordered
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @system_unordered(
    i32* %in, i32* %out) {
 entry:
@ -389,11 +594,19 @@ entry:
 }

 ; GCN-LABEL: {{^}}system_monotonic:
-; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 ; GFX89:     flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
-; GCN:       s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX10:     flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc dlc{{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 ; GFX89-NOT: buffer_wbinvl1_vol
+; GFX10-NOT: buffer_gl{{[01]}}_inv
 ; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+; GFX10:         .amdhsa_kernel system_monotonic
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @system_monotonic(
    i32* %in, i32* %out) {
 entry:
@ -403,11 +616,20 @@ entry:
 }

 ; GCN-LABEL: {{^}}system_acquire:
-; GCN-NOT:    s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
-; GCN:        flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
-; GCN-NEXT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GCN-NOT:    s_waitcnt vmcnt(0){{$}}
+; GFX10-NOT:  s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
+; GFX89:      flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
+; GFX10:      flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc dlc{{$}}
+; GFX89-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GFX89-NEXT: buffer_wbinvl1_vol
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
 ; GCN:        flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+; GFX10:         .amdhsa_kernel system_acquire
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @system_acquire(
    i32* %in, i32* %out) {
 entry:
@ -417,11 +639,21 @@ entry:
 }

 ; GCN-LABEL: {{^}}system_seq_cst:
-; GCN:        s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
-; GCN-NEXT:   flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
-; GCN-NEXT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX89:      s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX10:      s_waitcnt lgkmcnt(0){{$}}
+; GFX10:      s_waitcnt_vscnt null, 0x0{{$}}
+; GFX89-NEXT: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
+; GFX10-NEXT: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc dlc{{$}}
+; GFX89-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GFX89-NEXT: buffer_wbinvl1_vol
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
 ; GCN:        flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+; GFX10:         .amdhsa_kernel system_seq_cst
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @system_seq_cst(
    i32* %in, i32* %out) {
 entry:
@ -431,11 +663,18 @@ entry:
 }

 ; GCN-LABEL: {{^}}singlethread_unordered:
-; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 ; GCN:       flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
-; GCN:       s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 ; GFX89-NOT: buffer_wbinvl1_vol
+; GFX10-NOT: buffer_gl{{[01]}}_inv
 ; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+; GFX10:         .amdhsa_kernel singlethread_unordered
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @singlethread_unordered(
    i32* %in, i32* %out) {
 entry:
@ -445,11 +684,18 @@ entry:
 }

 ; GCN-LABEL: {{^}}singlethread_monotonic:
-; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 ; GCN:       flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
-; GCN:       s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 ; GFX89-NOT: buffer_wbinvl1_vol
+; GFX10-NOT: buffer_gl{{[01]}}_inv
 ; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+; GFX10:         .amdhsa_kernel singlethread_monotonic
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @singlethread_monotonic(
    i32* %in, i32* %out) {
 entry:
@ -459,11 +705,18 @@ entry:
 }

 ; GCN-LABEL: {{^}}singlethread_acquire:
-; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 ; GCN:       flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
-; GCN:       s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 ; GFX89-NOT: buffer_wbinvl1_vol
+; GFX10-NOT: buffer_gl{{[01]}}_inv
 ; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+; GFX10:         .amdhsa_kernel singlethread_acquire
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @singlethread_acquire(
    i32* %in, i32* %out) {
 entry:
@ -473,11 +726,18 @@ entry:
 }

 ; GCN-LABEL: {{^}}singlethread_seq_cst:
-; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 ; GCN:       flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
-; GCN:       s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 ; GFX89-NOT: buffer_wbinvl1_vol
+; GFX10-NOT: buffer_gl{{[01]}}_inv
 ; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+; GFX10:         .amdhsa_kernel singlethread_seq_cst
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @singlethread_seq_cst(
    i32* %in, i32* %out) {
 entry:
@ -487,11 +747,18 @@ entry:
 }

 ; GCN-LABEL: {{^}}agent_unordered:
-; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 ; GCN:       flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
-; GCN:       s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 ; GFX89-NOT: buffer_wbinvl1_vol
+; GFX10-NOT: buffer_gl{{[01]}}_inv
 ; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+; GFX10:         .amdhsa_kernel agent_unordered
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @agent_unordered(
    i32* %in, i32* %out) {
 entry:
@ -501,11 +768,19 @@ entry:
 }

 ; GCN-LABEL: {{^}}agent_monotonic:
-; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 ; GFX89:     flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
-; GCN:       s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX10:     flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc dlc{{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 ; GFX89-NOT: buffer_wbinvl1_vol
+; GFX10-NOT: buffer_gl{{[01]}}_inv
 ; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+; GFX10:         .amdhsa_kernel agent_monotonic
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @agent_monotonic(
    i32* %in, i32* %out) {
 entry:
@ -515,11 +790,20 @@ entry:
 }

 ; GCN-LABEL: {{^}}agent_acquire:
-; GCN-NOT:    s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
-; GCN:        flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
-; GCN-NEXT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GCN-NOT:    s_waitcnt vmcnt(0){{$}}
+; GFX10-NOT:  s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
+; GFX89:      flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
+; GFX10:      flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc dlc{{$}}
+; GFX89-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GFX89-NEXT: buffer_wbinvl1_vol
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
 ; GCN:        flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+; GFX10:         .amdhsa_kernel agent_acquire
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @agent_acquire(
    i32* %in, i32* %out) {
 entry:
@ -529,11 +813,21 @@ entry:
 }

 ; GCN-LABEL: {{^}}agent_seq_cst:
-; GCN:        s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
-; GCN-NEXT:   flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
-; GCN-NEXT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX89:      s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX10:      s_waitcnt lgkmcnt(0){{$}}
+; GFX10:      s_waitcnt_vscnt null, 0x0{{$}}
+; GFX89-NEXT: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
+; GFX10-NEXT: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc dlc{{$}}
+; GFX89-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GFX89-NEXT: buffer_wbinvl1_vol
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
 ; GCN:        flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+; GFX10:         .amdhsa_kernel agent_seq_cst
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @agent_seq_cst(
    i32* %in, i32* %out) {
 entry:
@ -543,11 +837,18 @@ entry:
 }

 ; GCN-LABEL: {{^}}workgroup_unordered:
-; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 ; GCN:       flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
-; GCN:       s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 ; GFX89-NOT: buffer_wbinvl1_vol
+; GFX10-NOT: buffer_gl{{[01]}}_inv
 ; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+; GFX10:         .amdhsa_kernel workgroup_unordered
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @workgroup_unordered(
    i32* %in, i32* %out) {
 entry:
@ -556,12 +857,21 @@ entry:
  ret void
 }

-; GCN-LABEL: {{^}}workgroup_monotonic:
-; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
-; GFX89:     flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
-; GCN:       s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
-; GFX89-NOT: buffer_wbinvl1_vol
-; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+; GCN-LABEL:    {{^}}workgroup_monotonic:
+; GCN-NOT:      s_waitcnt vmcnt(0){{$}}
+; GFX10-NOT:    s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
+; GFX89:        flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
+; GFX10WGP:     flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
+; GFX10CU-NOT:  flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
+; GCN-NOT:      s_waitcnt vmcnt(0){{$}}
+; GFX10-NOT:    s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
+; GFX89-NOT:    buffer_wbinvl1_vol
+; GFX10-NOT:    buffer_gl{{[01]}}_inv
+; GCN:          flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+; GFX10:         .amdhsa_kernel workgroup_monotonic
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @workgroup_monotonic(
    i32* %in, i32* %out) {
 entry:
@ -570,12 +880,21 @@ entry:
  ret void
 }

-; GCN-LABEL: {{^}}workgroup_acquire:
-; GCN-NOT:    s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
-; GFX89:      flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
-; GFX89:      s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
-; GFX89-NOT:  buffer_wbinvl1_vol
-; GCN:        flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+; GCN-LABEL:     {{^}}workgroup_acquire:
+; GFX10-NOT:     s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
+; GFX89:         flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
+; GFX10WGP:      flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
+; GFX10CU-NOT:   flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
+; GFX89-NOT:     s_waitcnt vmcnt(0){{$}}
+; GFX10WGP:      s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX89-NOT:     buffer_wbinvl1_vol
+; GFX10WGP-NEXT: buffer_gl0_inv
+; GFX10CU-NOT:   buffer_gl0_inv
+; GCN:           flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+; GFX10:         .amdhsa_kernel workgroup_acquire
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @workgroup_acquire(
    i32* %in, i32* %out) {
 entry:
@ -584,12 +903,25 @@ entry:
  ret void
 }

-; GCN-LABEL: {{^}}workgroup_seq_cst:
-; GFX89-NOT:  s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
-; GFX89:      flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
-; GFX89:      s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
-; GFX89-NOT:  buffer_wbinvl1_vol
-; GCN:        flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+; GCN-LABEL:     {{^}}workgroup_seq_cst:
+; GFX89-NOT:     s_waitcnt vmcnt(0){{$}}
+; GFX10WGP:      s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10CU-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX10CU-NOT:   s_waitcnt_vscnt null, 0x0
+; GFX89:         flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
+; GFX10WGP:      flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
+; GFX10CU:       flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
+; GFX89-NOT:     s_waitcnt vmcnt(0){{$}}
+; GFX10WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX89-NOT:     buffer_wbinvl1_vol
+; GFX10WGP-NEXT: buffer_gl0_inv
+; GFX10CU-NOT:   buffer_gl0_inv
+; GCN:           flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+; GFX10:         .amdhsa_kernel workgroup_seq_cst
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @workgroup_seq_cst(
    i32* %in, i32* %out) {
 entry:
@ -599,11 +931,18 @@ entry:
 }

 ; GCN-LABEL: {{^}}wavefront_unordered:
-; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 ; GCN:       flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
-; GCN:       s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 ; GFX89-NOT: buffer_wbinvl1_vol
+; GFX10-NOT: buffer_gl{{[01]}}_inv
 ; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+; GFX10:         .amdhsa_kernel wavefront_unordered
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @wavefront_unordered(
    i32* %in, i32* %out) {
 entry:
@ -613,11 +952,18 @@ entry:
 }

 ; GCN-LABEL: {{^}}wavefront_monotonic:
-; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 ; GCN:       flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
-; GCN:       s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 ; GFX89-NOT: buffer_wbinvl1_vol
+; GFX10-NOT: buffer_gl{{[01]}}_inv
 ; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+; GFX10:         .amdhsa_kernel wavefront_monotonic
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @wavefront_monotonic(
    i32* %in, i32* %out) {
 entry:
@ -627,11 +973,18 @@ entry:
 }

 ; GCN-LABEL: {{^}}wavefront_acquire:
-; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 ; GCN:       flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
-; GCN:       s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 ; GFX89-NOT: buffer_wbinvl1_vol
+; GFX10-NOT: buffer_gl{{[01]}}_inv
 ; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+; GFX10:         .amdhsa_kernel wavefront_acquire
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @wavefront_acquire(
    i32* %in, i32* %out) {
 entry:
@ -641,11 +994,18 @@ entry:
 }

 ; GCN-LABEL: {{^}}wavefront_seq_cst:
-; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 ; GCN:       flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
-; GCN:       s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 ; GFX89-NOT: buffer_wbinvl1_vol
+; GFX10-NOT: buffer_gl{{[01]}}_inv
 ; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+; GFX10:         .amdhsa_kernel wavefront_seq_cst
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @wavefront_seq_cst(
    i32* %in, i32* %out) {
 entry:
--- a/test/CodeGen/AMDGPU/memory-legalizer-store.ll
+++ b/test/CodeGen/AMDGPU/memory-legalizer-store.ll
@ -2,12 +2,19 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX8,GFX89 %s
 ; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx900 -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck --check-prefixes=GCN,GFX9,GFX89 %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck --check-prefixes=GCN,GFX9,GFX89 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+code-object-v3 -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck --check-prefixes=GCN,GFX10,GFX10WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+code-object-v3,+cumode -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck --check-prefixes=GCN,GFX10,GFX10CU %s

 declare i32 @llvm.amdgcn.workitem.id.x()

 ; GCN-LABEL: {{^}}system_one_as_unordered:
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 ; GCN:       flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
+; GFX10:         .amdhsa_kernel system_one_as_unordered
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @system_one_as_unordered(
    i32 %in, i32* %out) {
 entry:
@ -17,7 +24,12 @@ entry:

 ; GCN-LABEL: {{^}}system_one_as_monotonic:
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 ; GCN:       flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
+; GFX10:         .amdhsa_kernel system_one_as_monotonic
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @system_one_as_monotonic(
    i32 %in, i32* %out) {
 entry:
@ -27,7 +39,12 @@ entry:

 ; GCN-LABEL: {{^}}system_one_as_release:
 ; GCN:        s_waitcnt vmcnt(0){{$}}
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
 ; GCN-NEXT:   flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
+; GFX10:         .amdhsa_kernel system_one_as_release
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @system_one_as_release(
    i32 %in, i32* %out) {
 entry:
@ -37,7 +54,12 @@ entry:

 ; GCN-LABEL: {{^}}system_one_as_seq_cst:
 ; GCN:        s_waitcnt vmcnt(0){{$}}
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
 ; GCN-NEXT:   flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
+; GFX10:         .amdhsa_kernel system_one_as_seq_cst
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @system_one_as_seq_cst(
    i32 %in, i32* %out) {
 entry:
@ -47,7 +69,12 @@ entry:

 ; GCN-LABEL: {{^}}singlethread_one_as_unordered:
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 ; GCN:       flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
+; GFX10:         .amdhsa_kernel singlethread_one_as_unordered
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @singlethread_one_as_unordered(
    i32 %in, i32* %out) {
 entry:
@ -57,7 +84,12 @@ entry:

 ; GCN-LABEL: {{^}}singlethread_one_as_monotonic:
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 ; GCN:       flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
+; GFX10:         .amdhsa_kernel singlethread_one_as_monotonic
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @singlethread_one_as_monotonic(
    i32 %in, i32* %out) {
 entry:
@ -67,7 +99,12 @@ entry:

 ; GCN-LABEL: {{^}}singlethread_one_as_release:
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 ; GCN:       flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
+; GFX10:         .amdhsa_kernel singlethread_one_as_release
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @singlethread_one_as_release(
    i32 %in, i32* %out) {
 entry:
@ -77,7 +114,12 @@ entry:

 ; GCN-LABEL: {{^}}singlethread_one_as_seq_cst:
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 ; GCN:       flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
+; GFX10:         .amdhsa_kernel singlethread_one_as_seq_cst
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @singlethread_one_as_seq_cst(
    i32 %in, i32* %out) {
 entry:
@ -87,7 +129,12 @@ entry:

 ; GCN-LABEL: {{^}}agent_one_as_unordered:
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 ; GCN:       flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
+; GFX10:         .amdhsa_kernel agent_one_as_unordered
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @agent_one_as_unordered(
    i32 %in, i32* %out) {
 entry:
@ -97,7 +144,12 @@ entry:

 ; GCN-LABEL: {{^}}agent_one_as_monotonic:
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 ; GCN:       flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
+; GFX10:         .amdhsa_kernel agent_one_as_monotonic
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @agent_one_as_monotonic(
    i32 %in, i32* %out) {
 entry:
@ -107,7 +159,12 @@ entry:

 ; GCN-LABEL: {{^}}agent_one_as_release:
 ; GCN:        s_waitcnt vmcnt(0){{$}}
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
 ; GCN-NEXT:   flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
+; GFX10:         .amdhsa_kernel agent_one_as_release
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @agent_one_as_release(
    i32 %in, i32* %out) {
 entry:
@ -117,7 +174,12 @@ entry:

 ; GCN-LABEL: {{^}}agent_one_as_seq_cst:
 ; GCN:        s_waitcnt vmcnt(0){{$}}
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
 ; GCN-NEXT:   flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
+; GFX10:         .amdhsa_kernel agent_one_as_seq_cst
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @agent_one_as_seq_cst(
    i32 %in, i32* %out) {
 entry:
@ -127,7 +189,12 @@ entry:

 ; GCN-LABEL: {{^}}workgroup_one_as_unordered:
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 ; GCN:       flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
+; GFX10:         .amdhsa_kernel workgroup_one_as_unordered
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @workgroup_one_as_unordered(
    i32 %in, i32* %out) {
 entry:
@ -137,7 +204,12 @@ entry:

 ; GCN-LABEL: {{^}}workgroup_one_as_monotonic:
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 ; GCN:       flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
+; GFX10:         .amdhsa_kernel workgroup_one_as_monotonic
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @workgroup_one_as_monotonic(
    i32 %in, i32* %out) {
 entry:
@ -145,9 +217,17 @@ entry:
  ret void
 }

-; GCN-LABEL: {{^}}workgroup_one_as_release:
-; GFX89-NOT:  s_waitcnt vmcnt(0){{$}}
-; GCN:        flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
+; GCN-LABEL:     {{^}}workgroup_one_as_release:
+; GFX89-NOT:     s_waitcnt vmcnt(0){{$}}
+; GFX10WGP:      s_waitcnt vmcnt(0){{$}}
+; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
+; GFX10CU-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX10CU-NOT:   s_waitcnt_vscnt null, 0x0{{$}}
+; GCN:           flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
+; GFX10:         .amdhsa_kernel workgroup_one_as_release
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @workgroup_one_as_release(
    i32 %in, i32* %out) {
 entry:
@ -155,9 +235,17 @@ entry:
  ret void
 }

-; GCN-LABEL: {{^}}workgroup_one_as_seq_cst:
-; GFX89-NOT:  s_waitcnt vmcnt(0){{$}}
-; GCN:        flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
+; GCN-LABEL:     {{^}}workgroup_one_as_seq_cst:
+; GFX89-NOT:     s_waitcnt vmcnt(0){{$}}
+; GFX10WGP:      s_waitcnt vmcnt(0){{$}}
+; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
+; GFX10CU-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX10CU-NOT:   s_waitcnt_vscnt null, 0x0{{$}}
+; GCN:           flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
+; GFX10:         .amdhsa_kernel workgroup_one_as_seq_cst
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @workgroup_one_as_seq_cst(
    i32 %in, i32* %out) {
 entry:
@ -167,7 +255,12 @@ entry:

 ; GCN-LABEL: {{^}}wavefront_one_as_unordered:
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 ; GCN:       flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
+; GFX10:         .amdhsa_kernel wavefront_one_as_unordered
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @wavefront_one_as_unordered(
    i32 %in, i32* %out) {
 entry:
@ -177,7 +270,12 @@ entry:

 ; GCN-LABEL: {{^}}wavefront_one_as_monotonic:
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 ; GCN:       flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
+; GFX10:         .amdhsa_kernel wavefront_one_as_monotonic
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @wavefront_one_as_monotonic(
    i32 %in, i32* %out) {
 entry:
@ -187,7 +285,12 @@ entry:

 ; GCN-LABEL: {{^}}wavefront_one_as_release:
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 ; GCN:       flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
+; GFX10:         .amdhsa_kernel wavefront_one_as_release
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @wavefront_one_as_release(
    i32 %in, i32* %out) {
 entry:
@ -197,7 +300,12 @@ entry:

 ; GCN-LABEL: {{^}}wavefront_one_as_seq_cst:
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 ; GCN:       flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
+; GFX10:         .amdhsa_kernel wavefront_one_as_seq_cst
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @wavefront_one_as_seq_cst(
    i32 %in, i32* %out) {
 entry:
@ -207,6 +315,11 @@ entry:

 ; GCN-LABEL: {{^}}nontemporal_private_0:
 ; GFX89: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen glc slc{{$}}
+; GFX10: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen slc{{$}}
+; GFX10:         .amdhsa_kernel nontemporal_private_0
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @nontemporal_private_0(
    i32* %in, i32 addrspace(5)* %out) {
 entry:
@ -217,6 +330,11 @@ entry:

 ; GCN-LABEL: {{^}}nontemporal_private_1:
 ; GFX89: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen glc slc{{$}}
+; GFX10: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen slc{{$}}
+; GFX10:         .amdhsa_kernel nontemporal_private_1
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @nontemporal_private_1(
    i32* %in, i32 addrspace(5)* %out) {
 entry:
@ -230,6 +348,11 @@ entry:
 ; GCN-LABEL: {{^}}nontemporal_global_0:
 ; GFX8:  flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc slc{{$}}
 ; GFX9:  global_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, off glc slc{{$}}
+; GFX10: global_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, off slc{{$}}
+; GFX10:         .amdhsa_kernel nontemporal_global_0
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @nontemporal_global_0(
    i32* %in, i32 addrspace(1)* %out) {
 entry:
@ -241,6 +364,11 @@ entry:
 ; GCN-LABEL: {{^}}nontemporal_global_1:
 ; GFX8:  flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc slc{{$}}
 ; GFX9:  global_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] glc slc{{$}}
+; GFX10: global_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] slc{{$}}
+; GFX10:         .amdhsa_kernel nontemporal_global_1
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @nontemporal_global_1(
    i32* %in, i32 addrspace(1)* %out) {
 entry:
@ -253,6 +381,10 @@ entry:

 ; GCN-LABEL: {{^}}nontemporal_local_0:
 ; GCN: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}}{{$}}
+; GFX10:         .amdhsa_kernel nontemporal_local_0
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @nontemporal_local_0(
    i32* %in, i32 addrspace(3)* %out) {
 entry:
@ -263,6 +395,10 @@ entry:

 ; GCN-LABEL: {{^}}nontemporal_local_1:
 ; GCN: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}}{{$}}
+; GFX10:         .amdhsa_kernel nontemporal_local_1
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @nontemporal_local_1(
    i32* %in, i32 addrspace(3)* %out) {
 entry:
@ -275,6 +411,11 @@ entry:

 ; GCN-LABEL: {{^}}nontemporal_flat_0:
 ; GFX89: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc slc{{$}}
+; GFX10: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} slc{{$}}
+; GFX10:         .amdhsa_kernel nontemporal_flat_0
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @nontemporal_flat_0(
    i32* %in, i32* %out) {
 entry:
@ -285,6 +426,11 @@ entry:

 ; GCN-LABEL: {{^}}nontemporal_flat_1:
 ; GFX89: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc slc{{$}}
+; GFX10: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} slc{{$}}
+; GFX10:         .amdhsa_kernel nontemporal_flat_1
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @nontemporal_flat_1(
    i32* %in, i32* %out) {
 entry:
@ -296,8 +442,13 @@ entry:
 }

 ; GCN-LABEL: {{^}}system_unordered:
-; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 ; GCN:       flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
+; GFX10:         .amdhsa_kernel system_unordered
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @system_unordered(
    i32 %in, i32* %out) {
 entry:
@ -306,8 +457,13 @@ entry:
 }

 ; GCN-LABEL: {{^}}system_monotonic:
-; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 ; GCN:       flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
+; GFX10:         .amdhsa_kernel system_monotonic
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @system_monotonic(
    i32 %in, i32* %out) {
 entry:
@ -316,8 +472,14 @@ entry:
 }

 ; GCN-LABEL: {{^}}system_release:
-; GCN:        s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX89:      s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX10:      s_waitcnt lgkmcnt(0){{$}}
+; GFX10:      s_waitcnt_vscnt null, 0x0{{$}}
 ; GCN-NEXT:   flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
+; GFX10:         .amdhsa_kernel system_release
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @system_release(
    i32 %in, i32* %out) {
 entry:
@ -326,8 +488,14 @@ entry:
 }

 ; GCN-LABEL: {{^}}system_seq_cst:
-; GCN:        s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX89:      s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX10:      s_waitcnt lgkmcnt(0){{$}}
+; GFX10:      s_waitcnt_vscnt null, 0x0{{$}}
 ; GCN-NEXT:   flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
+; GFX10:         .amdhsa_kernel system_seq_cst
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @system_seq_cst(
    i32 %in, i32* %out) {
 entry:
@ -336,8 +504,13 @@ entry:
 }

 ; GCN-LABEL: {{^}}singlethread_unordered:
-; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 ; GCN:       flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
+; GFX10:         .amdhsa_kernel singlethread_unordered
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @singlethread_unordered(
    i32 %in, i32* %out) {
 entry:
@ -346,8 +519,13 @@ entry:
 }

 ; GCN-LABEL: {{^}}singlethread_monotonic:
-; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 ; GCN:       flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
+; GFX10:         .amdhsa_kernel singlethread_monotonic
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @singlethread_monotonic(
    i32 %in, i32* %out) {
 entry:
@ -356,8 +534,13 @@ entry:
 }

 ; GCN-LABEL: {{^}}singlethread_release:
-; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 ; GCN:       flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
+; GFX10:         .amdhsa_kernel singlethread_release
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @singlethread_release(
    i32 %in, i32* %out) {
 entry:
@ -366,8 +549,13 @@ entry:
 }

 ; GCN-LABEL: {{^}}singlethread_seq_cst:
-; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 ; GCN:       flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
+; GFX10:         .amdhsa_kernel singlethread_seq_cst
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @singlethread_seq_cst(
    i32 %in, i32* %out) {
 entry:
@ -376,8 +564,13 @@ entry:
 }

 ; GCN-LABEL: {{^}}agent_unordered:
-; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 ; GCN:       flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
+; GFX10:         .amdhsa_kernel agent_unordered
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @agent_unordered(
    i32 %in, i32* %out) {
 entry:
@ -386,8 +579,13 @@ entry:
 }

 ; GCN-LABEL: {{^}}agent_monotonic:
-; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 ; GCN:       flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
+; GFX10:         .amdhsa_kernel agent_monotonic
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @agent_monotonic(
    i32 %in, i32* %out) {
 entry:
@ -396,8 +594,14 @@ entry:
 }

 ; GCN-LABEL: {{^}}agent_release:
-; GCN:        s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX89:      s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX10:      s_waitcnt lgkmcnt(0){{$}}
+; GFX10:      s_waitcnt_vscnt null, 0x0{{$}}
 ; GCN-NEXT:   flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
+; GFX10:         .amdhsa_kernel agent_release
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @agent_release(
    i32 %in, i32* %out) {
 entry:
@ -406,8 +610,14 @@ entry:
 }

 ; GCN-LABEL: {{^}}agent_seq_cst:
-; GCN:        s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX89:      s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX10:      s_waitcnt lgkmcnt(0){{$}}
+; GFX10:      s_waitcnt_vscnt null, 0x0{{$}}
 ; GCN-NEXT:   flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
+; GFX10:         .amdhsa_kernel agent_seq_cst
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @agent_seq_cst(
    i32 %in, i32* %out) {
 entry:
@ -416,8 +626,13 @@ entry:
 }

 ; GCN-LABEL: {{^}}workgroup_unordered:
-; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 ; GCN:       flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
+; GFX10:         .amdhsa_kernel workgroup_unordered
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @workgroup_unordered(
    i32 %in, i32* %out) {
 entry:
@ -426,8 +641,13 @@ entry:
 }

 ; GCN-LABEL: {{^}}workgroup_monotonic:
-; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 ; GCN:       flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
+; GFX10:         .amdhsa_kernel workgroup_monotonic
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @workgroup_monotonic(
    i32 %in, i32* %out) {
 entry:
@ -435,9 +655,17 @@ entry:
  ret void
 }

-; GCN-LABEL: {{^}}workgroup_release:
-; GFX89-NOT:  s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
-; GCN:        flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
+; GCN-LABEL:     {{^}}workgroup_release:
+; GFX89-NOT:     s_waitcnt vmcnt(0){{$}}
+; GFX10WGP:      s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
+; GFX10CU-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX10CU-NOT:   s_waitcnt_vscnt null, 0x0{{$}}
+; GCN:           flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
+; GFX10:         .amdhsa_kernel workgroup_release
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @workgroup_release(
    i32 %in, i32* %out) {
 entry:
@ -445,9 +673,17 @@ entry:
  ret void
 }

-; GCN-LABEL: {{^}}workgroup_seq_cst:
-; GFX89-NOT:  s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
-; GCN:        flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
+; GCN-LABEL:     {{^}}workgroup_seq_cst:
+; GFX89-NOT:     s_waitcnt vmcnt(0){{$}}
+; GFX10WGP:      s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
+; GFX10CU-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX10CU-NOT:   s_waitcnt_vscnt null, 0x0{{$}}
+; GCN:           flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
+; GFX10:         .amdhsa_kernel workgroup_seq_cst
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @workgroup_seq_cst(
    i32 %in, i32* %out) {
 entry:
@ -456,8 +692,13 @@ entry:
 }

 ; GCN-LABEL: {{^}}wavefront_unordered:
-; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 ; GCN:       flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
+; GFX10:         .amdhsa_kernel wavefront_unordered
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @wavefront_unordered(
    i32 %in, i32* %out) {
 entry:
@ -466,8 +707,13 @@ entry:
 }

 ; GCN-LABEL: {{^}}wavefront_monotonic:
-; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 ; GCN:       flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
+; GFX10:         .amdhsa_kernel wavefront_monotonic
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @wavefront_monotonic(
    i32 %in, i32* %out) {
 entry:
@ -476,8 +722,13 @@ entry:
 }

 ; GCN-LABEL: {{^}}wavefront_release:
-; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 ; GCN:       flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
+; GFX10:         .amdhsa_kernel wavefront_release
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @wavefront_release(
    i32 %in, i32* %out) {
 entry:
@ -486,8 +737,13 @@ entry:
 }

 ; GCN-LABEL: {{^}}wavefront_seq_cst:
-; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 ; GCN:       flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
+; GFX10:         .amdhsa_kernel wavefront_seq_cst
+; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
+; GFX10CU:       .amdhsa_workgroup_processor_mode 0
+; GFX10-NOT:     .amdhsa_memory_ordered 0
 define amdgpu_kernel void @wavefront_seq_cst(
    i32 %in, i32* %out) {
 entry: