mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2025-01-31 20:51:52 +01:00
[AMDGPU] Add volatile support to SIMemoryLegalizer
Treat a non-atomic volatile load and store as a relaxed atomic at system scope for the address spaces accessed. This will ensure all relevant caches will be bypassed. A volatile atomic is not changed and still only bypasses caches upto the level specified by the SyncScope operand. Differential Revision: https://reviews.llvm.org/D94214
This commit is contained in:
parent
29f461772d
commit
ccb53c0a97
@ -4736,18 +4736,48 @@ in table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx6-gfx9-table`.
|
||||
|
||||
1. buffer/global/flat_load
|
||||
glc=1
|
||||
2. s_waitcnt vmcnt(0)
|
||||
|
||||
- nontemporal
|
||||
- Must happen before
|
||||
any following volatile
|
||||
global/generic
|
||||
load/store.
|
||||
- Ensures that
|
||||
volatile
|
||||
operations to
|
||||
different
|
||||
addresses will not
|
||||
be reordered by
|
||||
hardware.
|
||||
|
||||
- !volatile & nontemporal
|
||||
|
||||
1. buffer/global/flat_load
|
||||
glc=1 slc=1
|
||||
|
||||
load *none* *none* - local 1. ds_load
|
||||
store *none* *none* - global - !nontemporal
|
||||
store *none* *none* - global - !volatile & !nontemporal
|
||||
- generic
|
||||
- private 1. buffer/global/flat_store
|
||||
- constant
|
||||
- nontemporal
|
||||
- volatile & !nontemporal
|
||||
|
||||
1. buffer/global/flat_store
|
||||
2. s_waitcnt vmcnt(0)
|
||||
|
||||
- Must happen before
|
||||
any following volatile
|
||||
global/generic
|
||||
load/store.
|
||||
- Ensures that
|
||||
volatile
|
||||
operations to
|
||||
different
|
||||
addresses will not
|
||||
be reordered by
|
||||
hardware.
|
||||
|
||||
- !volatile & nontemporal
|
||||
|
||||
1. buffer/global/flat_store
|
||||
glc=1 slc=1
|
||||
@ -6008,18 +6038,48 @@ table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx10-table`.
|
||||
|
||||
1. buffer/global/flat_load
|
||||
glc=1 dlc=1
|
||||
2. s_waitcnt vmcnt(0)
|
||||
|
||||
- nontemporal
|
||||
- Must happen before
|
||||
any following volatile
|
||||
global/generic
|
||||
load/store.
|
||||
- Ensures that
|
||||
volatile
|
||||
operations to
|
||||
different
|
||||
addresses will not
|
||||
be reordered by
|
||||
hardware.
|
||||
|
||||
- !volatile & nontemporal
|
||||
|
||||
1. buffer/global/flat_load
|
||||
slc=1
|
||||
|
||||
load *none* *none* - local 1. ds_load
|
||||
store *none* *none* - global - !nontemporal
|
||||
store *none* *none* - global - !volatile & !nontemporal
|
||||
- generic
|
||||
- private 1. buffer/global/flat_store
|
||||
- constant
|
||||
- nontemporal
|
||||
- volatile & !nontemporal
|
||||
|
||||
1. buffer/global/flat_store
|
||||
2. s_waitcnt vscnt(0)
|
||||
|
||||
- Must happen before
|
||||
any following volatile
|
||||
global/generic
|
||||
load/store.
|
||||
- Ensures that
|
||||
volatile
|
||||
operations to
|
||||
different
|
||||
addresses will not
|
||||
be reordered by
|
||||
hardware.
|
||||
|
||||
- !volatile & nontemporal
|
||||
|
||||
1. buffer/global/flat_store
|
||||
slc=1
|
||||
|
@ -110,6 +110,7 @@ private:
|
||||
SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
|
||||
SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
|
||||
bool IsCrossAddressSpaceOrdering = false;
|
||||
bool IsVolatile = false;
|
||||
bool IsNonTemporal = false;
|
||||
|
||||
SIMemOpInfo(AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
|
||||
@ -119,11 +120,13 @@ private:
|
||||
bool IsCrossAddressSpaceOrdering = true,
|
||||
AtomicOrdering FailureOrdering =
|
||||
AtomicOrdering::SequentiallyConsistent,
|
||||
bool IsVolatile = false,
|
||||
bool IsNonTemporal = false)
|
||||
: Ordering(Ordering), FailureOrdering(FailureOrdering),
|
||||
Scope(Scope), OrderingAddrSpace(OrderingAddrSpace),
|
||||
InstrAddrSpace(InstrAddrSpace),
|
||||
IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
|
||||
IsVolatile(IsVolatile),
|
||||
IsNonTemporal(IsNonTemporal) {
|
||||
// There is also no cross address space ordering if the ordering
|
||||
// address space is the same as the instruction address space and
|
||||
@ -171,7 +174,13 @@ public:
|
||||
}
|
||||
|
||||
/// \returns True if memory access of the machine instruction used to
|
||||
/// create this SIMemOpInfo is non-temporal, false otherwise.
|
||||
/// create this SIMemOpInfo is volatile, false otherwise.
|
||||
bool isVolatile() const {
|
||||
return IsVolatile;
|
||||
}
|
||||
|
||||
/// \returns True if memory access of the machine instruction used to
|
||||
/// create this SIMemOpInfo is nontemporal, false otherwise.
|
||||
bool isNonTemporal() const {
|
||||
return IsNonTemporal;
|
||||
}
|
||||
@ -259,10 +268,13 @@ public:
|
||||
SIAtomicScope Scope,
|
||||
SIAtomicAddrSpace AddrSpace) const = 0;
|
||||
|
||||
/// Update \p MI memory instruction to indicate it is
|
||||
/// nontemporal. Return true iff the instruction was modified.
|
||||
virtual bool enableNonTemporal(const MachineBasicBlock::iterator &MI)
|
||||
const = 0;
|
||||
/// Update \p MI memory instruction of kind \p Op associated with address
|
||||
/// spaces \p AddrSpace to indicate it is volatile and/or nontemporal. Return
|
||||
/// true iff the instruction was modified.
|
||||
virtual bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
|
||||
SIAtomicAddrSpace AddrSpace,
|
||||
SIMemOp Op, bool IsVolatile,
|
||||
bool IsNonTemporal) const = 0;
|
||||
|
||||
/// Inserts any necessary instructions at position \p Pos relative
|
||||
/// to instruction \p MI to ensure memory instructions before \p Pos of kind
|
||||
@ -328,7 +340,10 @@ public:
|
||||
SIAtomicScope Scope,
|
||||
SIAtomicAddrSpace AddrSpace) const override;
|
||||
|
||||
bool enableNonTemporal(const MachineBasicBlock::iterator &MI) const override;
|
||||
bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
|
||||
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
|
||||
bool IsVolatile,
|
||||
bool IsNonTemporal) const override;
|
||||
|
||||
bool insertWait(MachineBasicBlock::iterator &MI,
|
||||
SIAtomicScope Scope,
|
||||
@ -378,7 +393,10 @@ public:
|
||||
SIAtomicScope Scope,
|
||||
SIAtomicAddrSpace AddrSpace) const override;
|
||||
|
||||
bool enableNonTemporal(const MachineBasicBlock::iterator &MI) const override;
|
||||
bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
|
||||
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
|
||||
bool IsVolatile,
|
||||
bool IsNonTemporal) const override;
|
||||
|
||||
bool insertWait(MachineBasicBlock::iterator &MI,
|
||||
SIAtomicScope Scope,
|
||||
@ -529,11 +547,13 @@ Optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
|
||||
AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
|
||||
SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
|
||||
bool IsNonTemporal = true;
|
||||
bool IsVolatile = false;
|
||||
|
||||
// Validator should check whether or not MMOs cover the entire set of
|
||||
// locations accessed by the memory instruction.
|
||||
for (const auto &MMO : MI->memoperands()) {
|
||||
IsNonTemporal &= MMO->isNonTemporal();
|
||||
IsVolatile |= MMO->isVolatile();
|
||||
InstrAddrSpace |=
|
||||
toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
|
||||
AtomicOrdering OpOrdering = MMO->getOrdering();
|
||||
@ -576,7 +596,8 @@ Optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
|
||||
}
|
||||
}
|
||||
return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
|
||||
IsCrossAddressSpaceOrdering, FailureOrdering, IsNonTemporal);
|
||||
IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile,
|
||||
IsNonTemporal);
|
||||
}
|
||||
|
||||
Optional<SIMemOpInfo> SIMemOpAccess::getLoadInfo(
|
||||
@ -703,14 +724,43 @@ bool SIGfx6CacheControl::enableLoadCacheBypass(
|
||||
return Changed;
|
||||
}
|
||||
|
||||
bool SIGfx6CacheControl::enableNonTemporal(
|
||||
const MachineBasicBlock::iterator &MI) const {
|
||||
bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
|
||||
MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
|
||||
bool IsVolatile, bool IsNonTemporal) const {
|
||||
// Only handle load and store, not atomic read-modify-write insructions. The
|
||||
// latter use glc to indicate if the atomic returns a result and so must not
|
||||
// be used for cache control.
|
||||
assert(MI->mayLoad() ^ MI->mayStore());
|
||||
|
||||
// Only update load and store, not LLVM IR atomic read-modify-write
|
||||
// instructions. The latter are always marked as volatile so cannot sensibly
|
||||
// handle it as do not want to pessimize all atomics. Also they do not support
|
||||
// the nontemporal attribute.
|
||||
assert( Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
|
||||
|
||||
bool Changed = false;
|
||||
|
||||
/// TODO: Do not enableGLCBit if rmw atomic.
|
||||
Changed |= enableGLCBit(MI);
|
||||
Changed |= enableSLCBit(MI);
|
||||
if (IsVolatile) {
|
||||
if (Op == SIMemOp::LOAD)
|
||||
Changed |= enableGLCBit(MI);
|
||||
|
||||
// Ensure operation has completed at system scope to cause all volatile
|
||||
// operations to be visible outside the program in a global order. Do not
|
||||
// request cross address space as only the global address space can be
|
||||
// observable outside the program, so no need to cause a waitcnt for LDS
|
||||
// address space operations.
|
||||
Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
|
||||
Position::AFTER);
|
||||
|
||||
return Changed;
|
||||
}
|
||||
|
||||
if (IsNonTemporal) {
|
||||
// Request L1 MISS_EVICT and L2 STREAM for load and store instructions.
|
||||
Changed |= enableGLCBit(MI);
|
||||
Changed |= enableSLCBit(MI);
|
||||
return Changed;
|
||||
}
|
||||
|
||||
return Changed;
|
||||
}
|
||||
@ -732,7 +782,8 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
|
||||
bool VMCnt = false;
|
||||
bool LGKMCnt = false;
|
||||
|
||||
if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
|
||||
if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
|
||||
SIAtomicAddrSpace::NONE) {
|
||||
switch (Scope) {
|
||||
case SIAtomicScope::SYSTEM:
|
||||
case SIAtomicScope::AGENT:
|
||||
@ -959,13 +1010,45 @@ bool SIGfx10CacheControl::enableLoadCacheBypass(
|
||||
return Changed;
|
||||
}
|
||||
|
||||
bool SIGfx10CacheControl::enableNonTemporal(
|
||||
const MachineBasicBlock::iterator &MI) const {
|
||||
bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
|
||||
MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
|
||||
bool IsVolatile, bool IsNonTemporal) const {
|
||||
|
||||
// Only handle load and store, not atomic read-modify-write insructions. The
|
||||
// latter use glc to indicate if the atomic returns a result and so must not
|
||||
// be used for cache control.
|
||||
assert(MI->mayLoad() ^ MI->mayStore());
|
||||
|
||||
// Only update load and store, not LLVM IR atomic read-modify-write
|
||||
// instructions. The latter are always marked as volatile so cannot sensibly
|
||||
// handle it as do not want to pessimize all atomics. Also they do not support
|
||||
// the nontemporal attribute.
|
||||
assert( Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
|
||||
|
||||
bool Changed = false;
|
||||
|
||||
Changed |= enableSLCBit(MI);
|
||||
/// TODO for store (non-rmw atomic) instructions also enableGLCBit(MI)
|
||||
if (IsVolatile) {
|
||||
|
||||
if (Op == SIMemOp::LOAD) {
|
||||
Changed |= enableGLCBit(MI);
|
||||
Changed |= enableDLCBit(MI);
|
||||
}
|
||||
|
||||
// Ensure operation has completed at system scope to cause all volatile
|
||||
// operations to be visible outside the program in a global order. Do not
|
||||
// request cross address space as only the global address space can be
|
||||
// observable outside the program, so no need to cause a waitcnt for LDS
|
||||
// address space operations.
|
||||
Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
|
||||
Position::AFTER);
|
||||
return Changed;
|
||||
}
|
||||
|
||||
if (IsNonTemporal) {
|
||||
// Request L0/L1 HIT_EVICT and L2 STREAM for load and store instructions.
|
||||
Changed |= enableSLCBit(MI);
|
||||
return Changed;
|
||||
}
|
||||
|
||||
return Changed;
|
||||
}
|
||||
@ -988,7 +1071,8 @@ bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
|
||||
bool VSCnt = false;
|
||||
bool LGKMCnt = false;
|
||||
|
||||
if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
|
||||
if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
|
||||
SIAtomicAddrSpace::NONE) {
|
||||
switch (Scope) {
|
||||
case SIAtomicScope::SYSTEM:
|
||||
case SIAtomicScope::AGENT:
|
||||
@ -1191,12 +1275,12 @@ bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
|
||||
return Changed;
|
||||
}
|
||||
|
||||
// Atomic instructions do not have the nontemporal attribute.
|
||||
if (MOI.isNonTemporal()) {
|
||||
Changed |= CC->enableNonTemporal(MI);
|
||||
return Changed;
|
||||
}
|
||||
|
||||
// Atomic instructions already bypass caches to the scope specified by the
|
||||
// SyncScope operand. Only non-atomic volatile and nontemporal instructions
|
||||
// need additional treatment.
|
||||
Changed |= CC->enableVolatileAndOrNonTemporal(MI, MOI.getInstrAddrSpace(),
|
||||
SIMemOp::LOAD, MOI.isVolatile(),
|
||||
MOI.isNonTemporal());
|
||||
return Changed;
|
||||
}
|
||||
|
||||
@ -1217,12 +1301,12 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
|
||||
return Changed;
|
||||
}
|
||||
|
||||
// Atomic instructions do not have the nontemporal attribute.
|
||||
if (MOI.isNonTemporal()) {
|
||||
Changed |= CC->enableNonTemporal(MI);
|
||||
return Changed;
|
||||
}
|
||||
|
||||
// Atomic instructions already bypass caches to the scope specified by the
|
||||
// SyncScope operand. Only non-atomic volatile and nontemporal instructions
|
||||
// need additional treatment.
|
||||
Changed |= CC->enableVolatileAndOrNonTemporal(
|
||||
MI, MOI.getInstrAddrSpace(), SIMemOp::STORE, MOI.isVolatile(),
|
||||
MOI.isNonTemporal());
|
||||
return Changed;
|
||||
}
|
||||
|
||||
|
@ -59,9 +59,11 @@ define amdgpu_kernel void @sgpr_trunc_brcond(i32 %cond) {
|
||||
; GCN-NEXT: ; %bb.1: ; %bb0
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GCN-NEXT: flat_store_dword v[0:1], v0
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: BB3_2: ; %bb1
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, 1
|
||||
; GCN-NEXT: flat_store_dword v[0:1], v0
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
entry:
|
||||
%trunc = trunc i32 %cond to i1
|
||||
br i1 %trunc, label %bb0, label %bb1
|
||||
@ -88,9 +90,11 @@ define amdgpu_kernel void @brcond_sgpr_trunc_and(i32 %cond0, i32 %cond1) {
|
||||
; GCN-NEXT: ; %bb.1: ; %bb0
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GCN-NEXT: flat_store_dword v[0:1], v0
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: BB4_2: ; %bb1
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, 1
|
||||
; GCN-NEXT: flat_store_dword v[0:1], v0
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
entry:
|
||||
%trunc0 = trunc i32 %cond0 to i1
|
||||
%trunc1 = trunc i32 %cond1 to i1
|
||||
|
@ -12,7 +12,8 @@ define i32 @divergent_if_swap_brtarget_order0(i32 %value) {
|
||||
; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc
|
||||
; CHECK-NEXT: s_cbranch_execz BB0_2
|
||||
; CHECK-NEXT: ; %bb.1: ; %if.true
|
||||
; CHECK-NEXT: global_load_dword v0, v[0:1], off
|
||||
; CHECK-NEXT: global_load_dword v0, v[0:1], off glc
|
||||
; CHECK-NEXT: s_waitcnt vmcnt(0)
|
||||
; CHECK-NEXT: BB0_2: ; %endif
|
||||
; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
|
||||
; CHECK-NEXT: s_waitcnt vmcnt(0)
|
||||
@ -39,7 +40,8 @@ define i32 @divergent_if_swap_brtarget_order1(i32 %value) {
|
||||
; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc
|
||||
; CHECK-NEXT: s_cbranch_execz BB1_2
|
||||
; CHECK-NEXT: ; %bb.1: ; %if.true
|
||||
; CHECK-NEXT: global_load_dword v0, v[0:1], off
|
||||
; CHECK-NEXT: global_load_dword v0, v[0:1], off glc
|
||||
; CHECK-NEXT: s_waitcnt vmcnt(0)
|
||||
; CHECK-NEXT: BB1_2: ; %endif
|
||||
; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
|
||||
; CHECK-NEXT: s_waitcnt vmcnt(0)
|
||||
@ -68,7 +70,8 @@ define i32 @divergent_if_nonboolean_condition0(i32 %value) {
|
||||
; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc
|
||||
; CHECK-NEXT: s_cbranch_execz BB2_2
|
||||
; CHECK-NEXT: ; %bb.1: ; %if.true
|
||||
; CHECK-NEXT: global_load_dword v0, v[0:1], off
|
||||
; CHECK-NEXT: global_load_dword v0, v[0:1], off glc
|
||||
; CHECK-NEXT: s_waitcnt vmcnt(0)
|
||||
; CHECK-NEXT: BB2_2: ; %endif
|
||||
; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
|
||||
; CHECK-NEXT: s_waitcnt vmcnt(0)
|
||||
@ -99,7 +102,8 @@ define i32 @divergent_if_nonboolean_condition1(i32 addrspace(1)* %ptr) {
|
||||
; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc
|
||||
; CHECK-NEXT: s_cbranch_execz BB3_2
|
||||
; CHECK-NEXT: ; %bb.1: ; %if.true
|
||||
; CHECK-NEXT: global_load_dword v0, v[0:1], off
|
||||
; CHECK-NEXT: global_load_dword v0, v[0:1], off glc
|
||||
; CHECK-NEXT: s_waitcnt vmcnt(0)
|
||||
; CHECK-NEXT: BB3_2: ; %endif
|
||||
; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
|
||||
; CHECK-NEXT: s_waitcnt vmcnt(0)
|
||||
@ -219,7 +223,7 @@ define amdgpu_kernel void @break_loop(i32 %arg) {
|
||||
; CHECK-NEXT: s_cbranch_vccnz BB5_1
|
||||
; CHECK-NEXT: ; %bb.3: ; %bb4
|
||||
; CHECK-NEXT: ; in Loop: Header=BB5_2 Depth=1
|
||||
; CHECK-NEXT: global_load_dword v2, v[0:1], off
|
||||
; CHECK-NEXT: global_load_dword v2, v[0:1], off glc
|
||||
; CHECK-NEXT: s_waitcnt vmcnt(0)
|
||||
; CHECK-NEXT: v_cmp_ge_i32_e64 s[2:3], v0, v2
|
||||
; CHECK-NEXT: s_branch BB5_1
|
||||
|
@ -15,8 +15,10 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
|
||||
; GFX9-NEXT: s_lshl_b32 s0, s0, 2
|
||||
; GFX9-NEXT: s_add_u32 s1, 4, s1
|
||||
; GFX9-NEXT: scratch_store_dword off, v0, s1
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_add_u32 s0, 4, s0
|
||||
; GFX9-NEXT: scratch_load_dword v0, off, s0
|
||||
; GFX9-NEXT: scratch_load_dword v0, off, s0 glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-LABEL: store_load_sindex_kernel:
|
||||
@ -34,7 +36,9 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
|
||||
; GFX10-NEXT: s_add_u32 s0, 4, s0
|
||||
; GFX10-NEXT: s_add_u32 s1, 4, s1
|
||||
; GFX10-NEXT: scratch_store_dword off, v0, s0
|
||||
; GFX10-NEXT: scratch_load_dword v0, off, s1
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: scratch_load_dword v0, off, s1 glc dlc
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: s_endpgm
|
||||
bb:
|
||||
%i = alloca [32 x float], align 4, addrspace(5)
|
||||
@ -61,8 +65,10 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
|
||||
; GFX9-NEXT: v_add_u32_e32 v1, v2, v1
|
||||
; GFX9-NEXT: v_mov_b32_e32 v3, 15
|
||||
; GFX9-NEXT: scratch_store_dword v1, v3, off
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_add_u32_e32 v0, v2, v0
|
||||
; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124
|
||||
; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-LABEL: store_load_vindex_kernel:
|
||||
@ -79,7 +85,9 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
|
||||
; GFX10-NEXT: v_add_nc_u32_e32 v0, v2, v0
|
||||
; GFX10-NEXT: v_add_nc_u32_e32 v1, v2, v1
|
||||
; GFX10-NEXT: scratch_store_dword v0, v3, off
|
||||
; GFX10-NEXT: scratch_load_dword v0, v1, off offset:124
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: scratch_load_dword v0, v1, off offset:124 glc dlc
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: s_endpgm
|
||||
bb:
|
||||
%i = alloca [32 x float], align 4, addrspace(5)
|
||||
@ -107,8 +115,9 @@ define void @store_load_vindex_foo(i32 %idx) {
|
||||
; GFX9-NEXT: v_add_u32_e32 v1, v2, v1
|
||||
; GFX9-NEXT: v_mov_b32_e32 v3, 15
|
||||
; GFX9-NEXT: scratch_store_dword v1, v3, off
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_add_u32_e32 v0, v2, v0
|
||||
; GFX9-NEXT: scratch_load_dword v0, v0, off
|
||||
; GFX9-NEXT: scratch_load_dword v0, v0, off glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
@ -124,9 +133,9 @@ define void @store_load_vindex_foo(i32 %idx) {
|
||||
; GFX10-NEXT: v_add_nc_u32_e32 v0, v2, v0
|
||||
; GFX10-NEXT: v_add_nc_u32_e32 v1, v2, v1
|
||||
; GFX10-NEXT: scratch_store_dword v0, v3, off
|
||||
; GFX10-NEXT: scratch_load_dword v0, v1, off
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: scratch_load_dword v0, v1, off glc dlc
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
bb:
|
||||
%i = alloca [32 x float], align 4, addrspace(5)
|
||||
@ -176,10 +185,13 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
|
||||
; GFX9-NEXT: s_and_b32 s0, s0, 15
|
||||
; GFX9-NEXT: s_lshl_b32 s0, s0, 2
|
||||
; GFX9-NEXT: s_add_u32 s1, 0x104, s1
|
||||
; GFX9-NEXT: scratch_load_dword v1, off, s2
|
||||
; GFX9-NEXT: scratch_load_dword v1, off, s2 glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: scratch_store_dword off, v0, s1
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_add_u32 s0, 0x104, s0
|
||||
; GFX9-NEXT: scratch_load_dword v0, off, s0
|
||||
; GFX9-NEXT: scratch_load_dword v0, off, s0 glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-LABEL: store_load_sindex_small_offset_kernel:
|
||||
@ -190,7 +202,7 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
|
||||
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
|
||||
; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24
|
||||
; GFX10-NEXT: s_add_u32 s1, 4, 0
|
||||
; GFX10-NEXT: scratch_load_dword v0, off, s1
|
||||
; GFX10-NEXT: scratch_load_dword v0, off, s1 glc dlc
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: v_mov_b32_e32 v0, 15
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -200,7 +212,9 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
|
||||
; GFX10-NEXT: s_add_u32 s0, 0x104, s0
|
||||
; GFX10-NEXT: s_add_u32 s1, 0x104, s1
|
||||
; GFX10-NEXT: scratch_store_dword off, v0, s0
|
||||
; GFX10-NEXT: scratch_load_dword v0, off, s1
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: scratch_load_dword v0, off, s1 glc dlc
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: s_endpgm
|
||||
bb:
|
||||
%padding = alloca [64 x i32], align 4, addrspace(5)
|
||||
@ -224,7 +238,7 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
|
||||
; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3
|
||||
; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
|
||||
; GFX9-NEXT: s_add_u32 s0, 4, 0
|
||||
; GFX9-NEXT: scratch_load_dword v1, off, s0
|
||||
; GFX9-NEXT: scratch_load_dword v1, off, s0 glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0
|
||||
; GFX9-NEXT: v_sub_u32_e32 v0, 0, v0
|
||||
@ -233,8 +247,10 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
|
||||
; GFX9-NEXT: v_add_u32_e32 v1, v2, v1
|
||||
; GFX9-NEXT: v_mov_b32_e32 v3, 15
|
||||
; GFX9-NEXT: scratch_store_dword v1, v3, off
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_add_u32_e32 v0, v2, v0
|
||||
; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124
|
||||
; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-LABEL: store_load_vindex_small_offset_kernel:
|
||||
@ -251,9 +267,12 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
|
||||
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v1
|
||||
; GFX10-NEXT: v_add_nc_u32_e32 v0, v2, v0
|
||||
; GFX10-NEXT: v_add_nc_u32_e32 v1, v2, v1
|
||||
; GFX10-NEXT: scratch_load_dword v2, off, s0
|
||||
; GFX10-NEXT: scratch_load_dword v2, off, s0 glc dlc
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: scratch_store_dword v0, v3, off
|
||||
; GFX10-NEXT: scratch_load_dword v0, v1, off offset:124
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: scratch_load_dword v0, v1, off offset:124 glc dlc
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: s_endpgm
|
||||
bb:
|
||||
%padding = alloca [64 x i32], align 4, addrspace(5)
|
||||
@ -278,9 +297,9 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
|
||||
; GFX9: ; %bb.0: ; %bb
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: s_add_u32 s0, s32, 0
|
||||
; GFX9-NEXT: scratch_load_dword v1, off, s0
|
||||
; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x100
|
||||
; GFX9-NEXT: scratch_load_dword v1, off, s0 glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x100
|
||||
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0
|
||||
; GFX9-NEXT: v_and_b32_e32 v0, 15, v0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, vcc_hi
|
||||
@ -288,8 +307,9 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
|
||||
; GFX9-NEXT: v_add_u32_e32 v1, v2, v1
|
||||
; GFX9-NEXT: v_mov_b32_e32 v3, 15
|
||||
; GFX9-NEXT: scratch_store_dword v1, v3, off
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_add_u32_e32 v0, v2, v0
|
||||
; GFX9-NEXT: scratch_load_dword v0, v0, off
|
||||
; GFX9-NEXT: scratch_load_dword v0, v0, off glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
@ -306,11 +326,12 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
|
||||
; GFX10-NEXT: s_add_u32 s0, s32, 0
|
||||
; GFX10-NEXT: v_add_nc_u32_e32 v0, v2, v0
|
||||
; GFX10-NEXT: v_add_nc_u32_e32 v1, v2, v1
|
||||
; GFX10-NEXT: scratch_load_dword v2, off, s0
|
||||
; GFX10-NEXT: scratch_store_dword v0, v3, off
|
||||
; GFX10-NEXT: scratch_load_dword v0, v1, off
|
||||
; GFX10-NEXT: scratch_load_dword v2, off, s0 glc dlc
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: scratch_store_dword v0, v3, off
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: scratch_load_dword v0, v1, off glc dlc
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
bb:
|
||||
%padding = alloca [64 x i32], align 4, addrspace(5)
|
||||
@ -341,10 +362,13 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) {
|
||||
; GFX9-NEXT: s_and_b32 s0, s0, 15
|
||||
; GFX9-NEXT: s_lshl_b32 s0, s0, 2
|
||||
; GFX9-NEXT: s_add_u32 s1, 0x4004, s1
|
||||
; GFX9-NEXT: scratch_load_dword v1, off, s2
|
||||
; GFX9-NEXT: scratch_load_dword v1, off, s2 glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: scratch_store_dword off, v0, s1
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_add_u32 s0, 0x4004, s0
|
||||
; GFX9-NEXT: scratch_load_dword v0, off, s0
|
||||
; GFX9-NEXT: scratch_load_dword v0, off, s0 glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-LABEL: store_load_sindex_large_offset_kernel:
|
||||
@ -355,7 +379,7 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) {
|
||||
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
|
||||
; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24
|
||||
; GFX10-NEXT: s_add_u32 s1, 4, 0
|
||||
; GFX10-NEXT: scratch_load_dword v0, off, s1
|
||||
; GFX10-NEXT: scratch_load_dword v0, off, s1 glc dlc
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: v_mov_b32_e32 v0, 15
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -365,7 +389,9 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) {
|
||||
; GFX10-NEXT: s_add_u32 s0, 0x4004, s0
|
||||
; GFX10-NEXT: s_add_u32 s1, 0x4004, s1
|
||||
; GFX10-NEXT: scratch_store_dword off, v0, s0
|
||||
; GFX10-NEXT: scratch_load_dword v0, off, s1
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: scratch_load_dword v0, off, s1 glc dlc
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: s_endpgm
|
||||
bb:
|
||||
%padding = alloca [4096 x i32], align 4, addrspace(5)
|
||||
@ -389,7 +415,7 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() {
|
||||
; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3
|
||||
; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
|
||||
; GFX9-NEXT: s_add_u32 s0, 4, 0
|
||||
; GFX9-NEXT: scratch_load_dword v1, off, s0
|
||||
; GFX9-NEXT: scratch_load_dword v1, off, s0 glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0
|
||||
; GFX9-NEXT: v_sub_u32_e32 v0, 0, v0
|
||||
@ -398,8 +424,10 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() {
|
||||
; GFX9-NEXT: v_add_u32_e32 v1, v2, v1
|
||||
; GFX9-NEXT: v_mov_b32_e32 v3, 15
|
||||
; GFX9-NEXT: scratch_store_dword v1, v3, off
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_add_u32_e32 v0, v2, v0
|
||||
; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124
|
||||
; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-LABEL: store_load_vindex_large_offset_kernel:
|
||||
@ -416,9 +444,12 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() {
|
||||
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v1
|
||||
; GFX10-NEXT: v_add_nc_u32_e32 v0, v2, v0
|
||||
; GFX10-NEXT: v_add_nc_u32_e32 v1, v2, v1
|
||||
; GFX10-NEXT: scratch_load_dword v2, off, s0
|
||||
; GFX10-NEXT: scratch_load_dword v2, off, s0 glc dlc
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: scratch_store_dword v0, v3, off
|
||||
; GFX10-NEXT: scratch_load_dword v0, v1, off offset:124
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: scratch_load_dword v0, v1, off offset:124 glc dlc
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: s_endpgm
|
||||
bb:
|
||||
%padding = alloca [4096 x i32], align 4, addrspace(5)
|
||||
@ -443,9 +474,9 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
|
||||
; GFX9: ; %bb.0: ; %bb
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: s_add_u32 s0, s32, 0
|
||||
; GFX9-NEXT: scratch_load_dword v1, off, s0
|
||||
; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000
|
||||
; GFX9-NEXT: scratch_load_dword v1, off, s0 glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000
|
||||
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0
|
||||
; GFX9-NEXT: v_and_b32_e32 v0, 15, v0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, vcc_hi
|
||||
@ -453,8 +484,9 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
|
||||
; GFX9-NEXT: v_add_u32_e32 v1, v2, v1
|
||||
; GFX9-NEXT: v_mov_b32_e32 v3, 15
|
||||
; GFX9-NEXT: scratch_store_dword v1, v3, off
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_add_u32_e32 v0, v2, v0
|
||||
; GFX9-NEXT: scratch_load_dword v0, v0, off
|
||||
; GFX9-NEXT: scratch_load_dword v0, v0, off glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
@ -471,11 +503,12 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
|
||||
; GFX10-NEXT: s_add_u32 s0, s32, 0
|
||||
; GFX10-NEXT: v_add_nc_u32_e32 v0, v2, v0
|
||||
; GFX10-NEXT: v_add_nc_u32_e32 v1, v2, v1
|
||||
; GFX10-NEXT: scratch_load_dword v2, off, s0
|
||||
; GFX10-NEXT: scratch_store_dword v0, v3, off
|
||||
; GFX10-NEXT: scratch_load_dword v0, v1, off
|
||||
; GFX10-NEXT: scratch_load_dword v2, off, s0 glc dlc
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: scratch_store_dword v0, v3, off
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: scratch_load_dword v0, v1, off glc dlc
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
bb:
|
||||
%padding = alloca [4096 x i32], align 4, addrspace(5)
|
||||
@ -501,11 +534,14 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, 13
|
||||
; GFX9-NEXT: s_add_u32 s0, 4, 0
|
||||
; GFX9-NEXT: scratch_store_dword off, v0, s0
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_movk_i32 s0, 0x3e80
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, 15
|
||||
; GFX9-NEXT: s_add_u32 s0, 4, s0
|
||||
; GFX9-NEXT: scratch_store_dword off, v0, s0
|
||||
; GFX9-NEXT: scratch_load_dword v0, off, s0
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: scratch_load_dword v0, off, s0 glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-LABEL: store_load_large_imm_offset_kernel:
|
||||
@ -520,8 +556,11 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
|
||||
; GFX10-NEXT: s_add_u32 s1, 4, 0
|
||||
; GFX10-NEXT: s_add_u32 s0, 4, s0
|
||||
; GFX10-NEXT: scratch_store_dword off, v0, s1
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: scratch_store_dword off, v1, s0
|
||||
; GFX10-NEXT: scratch_load_dword v0, off, s0
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: scratch_load_dword v0, off, s0 glc dlc
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: s_endpgm
|
||||
bb:
|
||||
%i = alloca [4096 x i32], align 4, addrspace(5)
|
||||
@ -541,11 +580,13 @@ define void @store_load_large_imm_offset_foo() {
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, 13
|
||||
; GFX9-NEXT: s_add_u32 s0, s32, 0
|
||||
; GFX9-NEXT: scratch_store_dword off, v0, s0
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_movk_i32 s0, 0x3e80
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, 15
|
||||
; GFX9-NEXT: s_add_u32 s0, s32, s0
|
||||
; GFX9-NEXT: scratch_store_dword off, v0, s0
|
||||
; GFX9-NEXT: scratch_load_dword v0, off, s0
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: scratch_load_dword v0, off, s0 glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
@ -559,10 +600,11 @@ define void @store_load_large_imm_offset_foo() {
|
||||
; GFX10-NEXT: s_add_u32 s1, s32, 0
|
||||
; GFX10-NEXT: s_add_u32 s0, s32, s0
|
||||
; GFX10-NEXT: scratch_store_dword off, v0, s1
|
||||
; GFX10-NEXT: scratch_store_dword off, v1, s0
|
||||
; GFX10-NEXT: scratch_load_dword v0, off, s0
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: scratch_store_dword off, v1, s0
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: scratch_load_dword v0, off, s0 glc dlc
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
bb:
|
||||
%i = alloca [4096 x i32], align 4, addrspace(5)
|
||||
@ -586,7 +628,9 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
|
||||
; GFX9-NEXT: v_add_lshl_u32 v0, s0, v0, 2
|
||||
; GFX9-NEXT: v_add_u32_e32 v0, 4, v0
|
||||
; GFX9-NEXT: scratch_store_dword v0, v1, off offset:1024
|
||||
; GFX9-NEXT: scratch_load_dword v0, v0, off offset:1024
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: scratch_load_dword v0, v0, off offset:1024 glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-LABEL: store_load_vidx_sidx_offset:
|
||||
@ -601,7 +645,9 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
|
||||
; GFX10-NEXT: v_add_lshl_u32 v0, s0, v0, 2
|
||||
; GFX10-NEXT: v_add_nc_u32_e32 v0, 4, v0
|
||||
; GFX10-NEXT: scratch_store_dword v0, v1, off offset:1024
|
||||
; GFX10-NEXT: scratch_load_dword v0, v0, off offset:1024
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: scratch_load_dword v0, v0, off offset:1024 glc dlc
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: s_endpgm
|
||||
bb:
|
||||
%alloca = alloca [32 x i32], align 4, addrspace(5)
|
||||
@ -621,7 +667,8 @@ define void @store_load_i64_aligned(i64 addrspace(5)* nocapture %arg) {
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, 15
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX9-NEXT: scratch_store_dwordx2 v0, v[1:2], off
|
||||
; GFX9-NEXT: scratch_load_dwordx2 v[0:1], v0, off
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: scratch_load_dwordx2 v[0:1], v0, off glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
@ -632,9 +679,9 @@ define void @store_load_i64_aligned(i64 addrspace(5)* nocapture %arg) {
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, 15
|
||||
; GFX10-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX10-NEXT: scratch_store_dwordx2 v0, v[1:2], off
|
||||
; GFX10-NEXT: scratch_load_dwordx2 v[0:1], v0, off
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: scratch_load_dwordx2 v[0:1], v0, off glc dlc
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
bb:
|
||||
store volatile i64 15, i64 addrspace(5)* %arg, align 8
|
||||
@ -649,7 +696,8 @@ define void @store_load_i64_unaligned(i64 addrspace(5)* nocapture %arg) {
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, 15
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX9-NEXT: scratch_store_dwordx2 v0, v[1:2], off
|
||||
; GFX9-NEXT: scratch_load_dwordx2 v[0:1], v0, off
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: scratch_load_dwordx2 v[0:1], v0, off glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
@ -660,9 +708,9 @@ define void @store_load_i64_unaligned(i64 addrspace(5)* nocapture %arg) {
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, 15
|
||||
; GFX10-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX10-NEXT: scratch_store_dwordx2 v0, v[1:2], off
|
||||
; GFX10-NEXT: scratch_load_dwordx2 v[0:1], v0, off
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: scratch_load_dwordx2 v[0:1], v0, off glc dlc
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
bb:
|
||||
store volatile i64 15, i64 addrspace(5)* %arg, align 1
|
||||
@ -681,7 +729,8 @@ define void @store_load_v3i32_unaligned(<3 x i32> addrspace(5)* nocapture %arg)
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX9-NEXT: scratch_store_dwordx3 v0, v[1:3], off
|
||||
; GFX9-NEXT: scratch_load_dwordx3 v[0:2], v0, off
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: scratch_load_dwordx3 v[0:2], v0, off glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
@ -696,9 +745,9 @@ define void @store_load_v3i32_unaligned(<3 x i32> addrspace(5)* nocapture %arg)
|
||||
; GFX10-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX10-NEXT: scratch_store_dwordx3 v0, v[1:3], off
|
||||
; GFX10-NEXT: scratch_load_dwordx3 v[0:2], v0, off
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: scratch_load_dwordx3 v[0:2], v0, off glc dlc
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
bb:
|
||||
store volatile <3 x i32> <i32 1, i32 2, i32 3>, <3 x i32> addrspace(5)* %arg, align 1
|
||||
@ -719,7 +768,8 @@ define void @store_load_v4i32_unaligned(<4 x i32> addrspace(5)* nocapture %arg)
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX9-NEXT: scratch_store_dwordx4 v0, v[1:4], off
|
||||
; GFX9-NEXT: scratch_load_dwordx4 v[0:3], v0, off
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: scratch_load_dwordx4 v[0:3], v0, off glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
@ -736,9 +786,9 @@ define void @store_load_v4i32_unaligned(<4 x i32> addrspace(5)* nocapture %arg)
|
||||
; GFX10-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX10-NEXT: scratch_store_dwordx4 v0, v[1:4], off
|
||||
; GFX10-NEXT: scratch_load_dwordx4 v[0:3], v0, off
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: scratch_load_dwordx4 v[0:3], v0, off glc dlc
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
bb:
|
||||
store volatile <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> addrspace(5)* %arg, align 1
|
||||
|
@ -13,14 +13,15 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(float addrs
|
||||
; SI-NEXT: s_mov_b32 s11, 0xf000
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
|
||||
; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
|
||||
; SI-NEXT: s_mov_b64 s[8:9], s[4:5]
|
||||
; SI-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64
|
||||
; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
|
||||
; SI-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64
|
||||
; SI-NEXT: s_waitcnt vmcnt(2)
|
||||
; SI-NEXT: v_sub_f32_e32 v2, 0x80000000, v2
|
||||
; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: s_mov_b64 s[8:9], s[4:5]
|
||||
; SI-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
|
||||
; SI-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: v_sub_f32_e32 v2, 0x80000000, v2
|
||||
; SI-NEXT: v_med3_f32 v2, v2, v3, v4
|
||||
; SI-NEXT: s_mov_b64 s[2:3], s[10:11]
|
||||
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
|
||||
@ -43,16 +44,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(float addrs
|
||||
; VI-NEXT: v_mov_b32_e32 v5, s7
|
||||
; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v8
|
||||
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
|
||||
; VI-NEXT: flat_load_dword v0, v[0:1]
|
||||
; VI-NEXT: flat_load_dword v1, v[2:3]
|
||||
; VI-NEXT: flat_load_dword v2, v[4:5]
|
||||
; VI-NEXT: flat_load_dword v0, v[0:1] glc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: flat_load_dword v1, v[2:3] glc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: flat_load_dword v2, v[4:5] glc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v7, s1
|
||||
; VI-NEXT: v_mov_b32_e32 v6, s0
|
||||
; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v8
|
||||
; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
|
||||
; VI-NEXT: s_waitcnt vmcnt(2)
|
||||
; VI-NEXT: v_sub_f32_e32 v0, 0x80000000, v0
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_med3_f32 v0, v0, v1, v2
|
||||
; VI-NEXT: flat_store_dword v[6:7], v0
|
||||
; VI-NEXT: s_endpgm
|
||||
@ -62,12 +64,13 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(float addrs
|
||||
; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
|
||||
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
|
||||
; GFX9-NEXT: global_load_dword v2, v0, s[4:5]
|
||||
; GFX9-NEXT: global_load_dword v3, v0, s[6:7]
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(2)
|
||||
; GFX9-NEXT: v_sub_f32_e32 v1, 0x80000000, v1
|
||||
; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_sub_f32_e32 v1, 0x80000000, v1
|
||||
; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3
|
||||
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
|
||||
; GFX9-NEXT: s_endpgm
|
||||
@ -98,19 +101,19 @@ define amdgpu_kernel void @v_test_no_global_nnans_med3_f32_pat0_srcmod0(float ad
|
||||
; SI-NEXT: s_mov_b32 s11, 0xf000
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
|
||||
; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
|
||||
; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: s_mov_b64 s[8:9], s[4:5]
|
||||
; SI-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64
|
||||
; SI-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
|
||||
; SI-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64
|
||||
; SI-NEXT: s_waitcnt vmcnt(2)
|
||||
; SI-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: v_sub_f32_e32 v2, 0x80000000, v2
|
||||
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
|
||||
; SI-NEXT: s_waitcnt vmcnt(1)
|
||||
; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3
|
||||
; SI-NEXT: v_min_f32_e32 v5, v2, v3
|
||||
; SI-NEXT: v_max_f32_e32 v2, v2, v3
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: v_mul_f32_e32 v3, 1.0, v4
|
||||
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
|
||||
; SI-NEXT: v_min_f32_e32 v2, v2, v3
|
||||
@ -138,21 +141,21 @@ define amdgpu_kernel void @v_test_no_global_nnans_med3_f32_pat0_srcmod0(float ad
|
||||
; VI-NEXT: v_mov_b32_e32 v5, s7
|
||||
; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v6
|
||||
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
|
||||
; VI-NEXT: flat_load_dword v7, v[0:1]
|
||||
; VI-NEXT: flat_load_dword v2, v[2:3]
|
||||
; VI-NEXT: flat_load_dword v3, v[4:5]
|
||||
; VI-NEXT: flat_load_dword v7, v[0:1] glc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: flat_load_dword v2, v[2:3] glc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: flat_load_dword v3, v[4:5] glc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v6
|
||||
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; VI-NEXT: s_waitcnt vmcnt(2)
|
||||
; VI-NEXT: v_sub_f32_e32 v4, 0x80000000, v7
|
||||
; VI-NEXT: s_waitcnt vmcnt(1)
|
||||
; VI-NEXT: v_mul_f32_e32 v2, 1.0, v2
|
||||
; VI-NEXT: v_mul_f32_e32 v4, 1.0, v4
|
||||
; VI-NEXT: v_min_f32_e32 v5, v4, v2
|
||||
; VI-NEXT: v_max_f32_e32 v2, v4, v2
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_mul_f32_e32 v3, 1.0, v3
|
||||
; VI-NEXT: v_mul_f32_e32 v2, 1.0, v2
|
||||
; VI-NEXT: v_min_f32_e32 v2, v2, v3
|
||||
@ -167,17 +170,17 @@ define amdgpu_kernel void @v_test_no_global_nnans_med3_f32_pat0_srcmod0(float ad
|
||||
; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
|
||||
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
|
||||
; GFX9-NEXT: global_load_dword v2, v0, s[4:5]
|
||||
; GFX9-NEXT: global_load_dword v3, v0, s[6:7]
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(2)
|
||||
; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_sub_f32_e32 v1, 0x80000000, v1
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(1)
|
||||
; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
|
||||
; GFX9-NEXT: v_max_f32_e32 v1, v1, v1
|
||||
; GFX9-NEXT: v_min_f32_e32 v4, v1, v2
|
||||
; GFX9-NEXT: v_max_f32_e32 v1, v1, v2
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_max_f32_e32 v3, v3, v3
|
||||
; GFX9-NEXT: v_max_f32_e32 v1, v1, v1
|
||||
; GFX9-NEXT: v_min_f32_e32 v1, v1, v3
|
||||
@ -213,15 +216,16 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(float add
|
||||
; SI-NEXT: s_mov_b32 s11, 0xf000
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
|
||||
; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
|
||||
; SI-NEXT: s_mov_b64 s[8:9], s[4:5]
|
||||
; SI-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64
|
||||
; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
|
||||
; SI-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64
|
||||
; SI-NEXT: s_mov_b32 s2, 0x80000000
|
||||
; SI-NEXT: s_waitcnt vmcnt(2)
|
||||
; SI-NEXT: v_sub_f32_e32 v2, s2, v2
|
||||
; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: s_mov_b64 s[8:9], s[4:5]
|
||||
; SI-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
|
||||
; SI-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: s_mov_b32 s2, 0x80000000
|
||||
; SI-NEXT: v_sub_f32_e32 v2, s2, v2
|
||||
; SI-NEXT: v_sub_f32_e64 v4, s2, |v4|
|
||||
; SI-NEXT: v_med3_f32 v2, v2, |v3|, v4
|
||||
; SI-NEXT: s_mov_b64 s[2:3], s[10:11]
|
||||
@ -245,17 +249,18 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(float add
|
||||
; VI-NEXT: v_mov_b32_e32 v5, s7
|
||||
; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v6
|
||||
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
|
||||
; VI-NEXT: flat_load_dword v7, v[0:1]
|
||||
; VI-NEXT: flat_load_dword v2, v[2:3]
|
||||
; VI-NEXT: flat_load_dword v3, v[4:5]
|
||||
; VI-NEXT: flat_load_dword v7, v[0:1] glc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: flat_load_dword v2, v[2:3] glc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: flat_load_dword v3, v[4:5] glc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-NEXT: s_mov_b32 s2, 0x80000000
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v6
|
||||
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; VI-NEXT: s_waitcnt vmcnt(2)
|
||||
; VI-NEXT: v_sub_f32_e32 v4, s2, v7
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_sub_f32_e64 v3, s2, |v3|
|
||||
; VI-NEXT: v_med3_f32 v2, v4, |v2|, v3
|
||||
; VI-NEXT: flat_store_dword v[0:1], v2
|
||||
@ -266,13 +271,14 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(float add
|
||||
; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
|
||||
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
|
||||
; GFX9-NEXT: global_load_dword v2, v0, s[4:5]
|
||||
; GFX9-NEXT: global_load_dword v3, v0, s[6:7]
|
||||
; GFX9-NEXT: s_mov_b32 s2, 0x80000000
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(2)
|
||||
; GFX9-NEXT: v_sub_f32_e32 v1, s2, v1
|
||||
; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_mov_b32 s2, 0x80000000
|
||||
; GFX9-NEXT: v_sub_f32_e32 v1, s2, v1
|
||||
; GFX9-NEXT: v_sub_f32_e64 v3, s2, |v3|
|
||||
; GFX9-NEXT: v_med3_f32 v1, v1, |v2|, v3
|
||||
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
|
||||
@ -310,17 +316,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(float add
|
||||
; SI-NEXT: s_mov_b32 s11, 0xf000
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
|
||||
; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
|
||||
; SI-NEXT: s_mov_b64 s[8:9], s[4:5]
|
||||
; SI-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64
|
||||
; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
|
||||
; SI-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64
|
||||
; SI-NEXT: s_mov_b32 s2, 0x80000000
|
||||
; SI-NEXT: s_waitcnt vmcnt(2)
|
||||
; SI-NEXT: v_sub_f32_e64 v2, s2, |v2|
|
||||
; SI-NEXT: s_waitcnt vmcnt(1)
|
||||
; SI-NEXT: v_sub_f32_e64 v3, s2, |v3|
|
||||
; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: s_mov_b64 s[8:9], s[4:5]
|
||||
; SI-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
|
||||
; SI-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: s_mov_b32 s2, 0x80000000
|
||||
; SI-NEXT: v_sub_f32_e64 v2, s2, |v2|
|
||||
; SI-NEXT: v_sub_f32_e64 v3, s2, |v3|
|
||||
; SI-NEXT: v_sub_f32_e64 v4, s2, |v4|
|
||||
; SI-NEXT: v_med3_f32 v2, v2, v3, v4
|
||||
; SI-NEXT: s_mov_b64 s[2:3], s[10:11]
|
||||
@ -344,19 +350,19 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(float add
|
||||
; VI-NEXT: v_mov_b32_e32 v5, s7
|
||||
; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v6
|
||||
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
|
||||
; VI-NEXT: flat_load_dword v7, v[0:1]
|
||||
; VI-NEXT: flat_load_dword v2, v[2:3]
|
||||
; VI-NEXT: flat_load_dword v3, v[4:5]
|
||||
; VI-NEXT: flat_load_dword v7, v[0:1] glc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: flat_load_dword v2, v[2:3] glc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: flat_load_dword v3, v[4:5] glc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: s_mov_b32 s2, 0x80000000
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v6
|
||||
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; VI-NEXT: s_waitcnt vmcnt(2)
|
||||
; VI-NEXT: v_sub_f32_e64 v4, s2, |v7|
|
||||
; VI-NEXT: s_waitcnt vmcnt(1)
|
||||
; VI-NEXT: v_sub_f32_e64 v2, s2, |v2|
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_sub_f32_e64 v3, s2, |v3|
|
||||
; VI-NEXT: v_med3_f32 v2, v4, v2, v3
|
||||
; VI-NEXT: flat_store_dword v[0:1], v2
|
||||
@ -367,15 +373,15 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(float add
|
||||
; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
|
||||
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
|
||||
; GFX9-NEXT: global_load_dword v2, v0, s[4:5]
|
||||
; GFX9-NEXT: global_load_dword v3, v0, s[6:7]
|
||||
; GFX9-NEXT: s_mov_b32 s2, 0x80000000
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(2)
|
||||
; GFX9-NEXT: v_sub_f32_e64 v1, s2, |v1|
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(1)
|
||||
; GFX9-NEXT: v_sub_f32_e64 v2, s2, |v2|
|
||||
; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_mov_b32 s2, 0x80000000
|
||||
; GFX9-NEXT: v_sub_f32_e64 v1, s2, |v1|
|
||||
; GFX9-NEXT: v_sub_f32_e64 v2, s2, |v2|
|
||||
; GFX9-NEXT: v_sub_f32_e64 v3, s2, |v3|
|
||||
; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3
|
||||
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
|
||||
@ -415,16 +421,16 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f32_pat0(float addrspace(1)* %out,
|
||||
; SI-NEXT: s_mov_b32 s11, 0xf000
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
|
||||
; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
|
||||
; SI-NEXT: s_mov_b64 s[8:9], s[4:5]
|
||||
; SI-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64
|
||||
; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
|
||||
; SI-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64
|
||||
; SI-NEXT: s_waitcnt vmcnt(2)
|
||||
; SI-NEXT: v_add_f32_e32 v2, 1.0, v2
|
||||
; SI-NEXT: s_waitcnt vmcnt(1)
|
||||
; SI-NEXT: v_add_f32_e32 v3, 2.0, v3
|
||||
; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: s_mov_b64 s[8:9], s[4:5]
|
||||
; SI-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
|
||||
; SI-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: v_add_f32_e32 v2, 1.0, v2
|
||||
; SI-NEXT: v_add_f32_e32 v3, 2.0, v3
|
||||
; SI-NEXT: v_add_f32_e32 v4, 4.0, v4
|
||||
; SI-NEXT: v_min_f32_e32 v5, v2, v3
|
||||
; SI-NEXT: v_max_f32_e32 v2, v2, v3
|
||||
@ -454,20 +460,20 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f32_pat0(float addrspace(1)* %out,
; VI-NEXT: v_mov_b32_e32 v5, s7
; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: flat_load_dword v7, v[0:1]
; VI-NEXT: flat_load_dword v2, v[2:3]
; VI-NEXT: flat_load_dword v3, v[4:5]
; VI-NEXT: flat_load_dword v7, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v2, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v3, v[4:5] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_add_f32_e32 v4, 1.0, v7
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_add_f32_e32 v2, 2.0, v2
; VI-NEXT: v_min_f32_e32 v5, v4, v2
; VI-NEXT: v_max_f32_e32 v2, v4, v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_f32_e32 v3, 4.0, v3
; VI-NEXT: v_mul_f32_e32 v2, 1.0, v2
; VI-NEXT: v_min_f32_e32 v2, v2, v3
@ -482,16 +488,16 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f32_pat0(float addrspace(1)* %out,
; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: global_load_dword v2, v0, s[4:5]
; GFX9-NEXT: global_load_dword v3, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_add_f32_e32 v2, 2.0, v2
; GFX9-NEXT: v_min_f32_e32 v4, v1, v2
; GFX9-NEXT: v_max_f32_e32 v1, v1, v2
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v3, 4.0, v3
; GFX9-NEXT: v_max_f32_e32 v1, v1, v1
; GFX9-NEXT: v_min_f32_e32 v1, v1, v3
@ -536,22 +542,23 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(float addrspace(
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b64 s[8:9], s[4:5]
; SI-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64
; SI-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_mov_b32 s3, s11
; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
; SI-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64
; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3
; SI-NEXT: v_min_f32_e32 v5, v2, v3
; SI-NEXT: v_max_f32_e32 v2, v2, v3
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v3, 1.0, v4
; SI-NEXT: buffer_store_dword v5, off, s[0:3], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
; SI-NEXT: v_min_f32_e32 v2, v2, v3
; SI-NEXT: v_mul_f32_e32 v3, 1.0, v5
@ -578,20 +585,20 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(float addrspace(
; VI-NEXT: v_mov_b32_e32 v5, s7
; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: flat_load_dword v7, v[0:1]
; VI-NEXT: flat_load_dword v2, v[2:3]
; VI-NEXT: flat_load_dword v3, v[4:5]
; VI-NEXT: flat_load_dword v7, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v2, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v3, v[4:5] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_mul_f32_e32 v4, 1.0, v7
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_mul_f32_e32 v2, 1.0, v2
; VI-NEXT: v_min_f32_e32 v5, v4, v2
; VI-NEXT: v_max_f32_e32 v2, v4, v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mul_f32_e32 v3, 1.0, v3
; VI-NEXT: v_mul_f32_e32 v2, 1.0, v2
; VI-NEXT: v_min_f32_e32 v2, v2, v3
@ -599,6 +606,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(float addrspace(
; VI-NEXT: v_mul_f32_e32 v2, 1.0, v2
; VI-NEXT: v_max_f32_e32 v2, v3, v2
; VI-NEXT: flat_store_dword v[0:1], v5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@ -607,17 +615,18 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(float addrspace(
; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: global_load_dword v2, v0, s[4:5]
; GFX9-NEXT: global_load_dword v3, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f32_e32 v1, v1, v1
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
; GFX9-NEXT: v_min_f32_e32 v4, v1, v2
; GFX9-NEXT: v_max_f32_e32 v1, v1, v2
; GFX9-NEXT: global_store_dword v[0:1], v4, off
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f32_e32 v3, v3, v3
; GFX9-NEXT: v_max_f32_e32 v1, v1, v1
; GFX9-NEXT: v_min_f32_e32 v1, v1, v3

@ -810,8 +810,11 @@ define void @dyn_insertelement_v8f64_const_s_v_v(double %val, i32 %idx) {
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v16, v16, v1, s[12:13]
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v18, v18, v1, s[14:15]
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[3:6], off
|
||||
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[7:10], off
|
||||
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[11:14], off
|
||||
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[15:18], off
|
||||
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
|
||||
; GPRIDX-NEXT: s_setpc_b64 s[30:31]
|
||||
@ -874,8 +877,11 @@ define void @dyn_insertelement_v8f64_const_s_v_v(double %val, i32 %idx) {
|
||||
; MOVREL-NEXT: v_cndmask_b32_e64 v16, v16, v1, s8
|
||||
; MOVREL-NEXT: v_cndmask_b32_e64 v18, v18, v1, s9
|
||||
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[3:6], off
|
||||
; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[7:10], off
|
||||
; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[11:14], off
|
||||
; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[15:18], off
|
||||
; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; MOVREL-NEXT: s_setpc_b64 s[30:31]
|
||||
@ -954,9 +960,13 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_s_v(<8 x double> inreg %vec, do
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v14, v14, v0, s[8:9]
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v16, v16, v0, s[10:11]
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[1:4], off
|
||||
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[5:8], off
|
||||
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[9:12], off
|
||||
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[13:16], off
|
||||
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
|
||||
; GPRIDX-NEXT: s_endpgm
|
||||
;
|
||||
; MOVREL-LABEL: dyn_insertelement_v8f64_s_s_v:
|
||||
@ -1020,9 +1030,13 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_s_v(<8 x double> inreg %vec, do
|
||||
; MOVREL-NEXT: v_cndmask_b32_e64 v15, v15, s30, s4
|
||||
; MOVREL-NEXT: v_cndmask_b32_e64 v16, v16, s31, s4
|
||||
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[1:4], off
|
||||
; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[5:8], off
|
||||
; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[9:12], off
|
||||
; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[13:16], off
|
||||
; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; MOVREL-NEXT: s_endpgm
|
||||
entry:
|
||||
%insert = insertelement <8 x double> %vec, double %val, i32 %idx
|
||||
@ -1078,9 +1092,13 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_v_s(<8 x double> inreg %vec, do
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v3, v1
|
||||
; GPRIDX-NEXT: s_set_gpr_idx_off
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
|
||||
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[6:9], off
|
||||
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[10:13], off
|
||||
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[14:17], off
|
||||
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
|
||||
; GPRIDX-NEXT: s_endpgm
|
||||
;
|
||||
; MOVREL-LABEL: dyn_insertelement_v8f64_s_v_s:
|
||||
@ -1121,9 +1139,13 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_v_s(<8 x double> inreg %vec, do
|
||||
; MOVREL-NEXT: v_movreld_b32_e32 v2, v0
|
||||
; MOVREL-NEXT: v_movreld_b32_e32 v3, v1
|
||||
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
|
||||
; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[6:9], off
|
||||
; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[10:13], off
|
||||
; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[14:17], off
|
||||
; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; MOVREL-NEXT: s_endpgm
|
||||
entry:
|
||||
%insert = insertelement <8 x double> %vec, double %val, i32 %idx
|
||||
@ -1147,9 +1169,13 @@ define amdgpu_ps void @dyn_insertelement_v8f64_v_s_s(<8 x double> %vec, double i
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GPRIDX-NEXT: s_set_gpr_idx_off
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
|
||||
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[4:7], off
|
||||
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[8:11], off
|
||||
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[12:15], off
|
||||
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
|
||||
; GPRIDX-NEXT: s_endpgm
|
||||
;
|
||||
; MOVREL-LABEL: dyn_insertelement_v8f64_v_s_s:
|
||||
@ -1158,9 +1184,13 @@ define amdgpu_ps void @dyn_insertelement_v8f64_v_s_s(<8 x double> %vec, double i
|
||||
; MOVREL-NEXT: v_movreld_b32_e32 v0, s2
|
||||
; MOVREL-NEXT: v_movreld_b32_e32 v1, s3
|
||||
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
|
||||
; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[4:7], off
|
||||
; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[8:11], off
|
||||
; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[12:15], off
|
||||
; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; MOVREL-NEXT: s_endpgm
|
||||
entry:
|
||||
%insert = insertelement <8 x double> %vec, double %val, i32 %idx
|
||||
@ -1235,9 +1265,13 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_v_v(<8 x double> inreg %vec, do
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v16, v16, v1, s[8:9]
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v18, v18, v1, s[10:11]
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[3:6], off
|
||||
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[7:10], off
|
||||
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[11:14], off
|
||||
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[15:18], off
|
||||
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
|
||||
; GPRIDX-NEXT: s_endpgm
|
||||
;
|
||||
; MOVREL-LABEL: dyn_insertelement_v8f64_s_v_v:
|
||||
@ -1299,9 +1333,13 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_v_v(<8 x double> inreg %vec, do
|
||||
; MOVREL-NEXT: v_cndmask_b32_e64 v16, v16, v1, s4
|
||||
; MOVREL-NEXT: v_cndmask_b32_e64 v18, v18, v1, s5
|
||||
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[3:6], off
|
||||
; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[7:10], off
|
||||
; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[11:14], off
|
||||
; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[15:18], off
|
||||
; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; MOVREL-NEXT: s_endpgm
|
||||
entry:
|
||||
%insert = insertelement <8 x double> %vec, double %val, i32 %idx
|
||||
@ -1346,9 +1384,13 @@ define amdgpu_ps void @dyn_insertelement_v8f64_v_s_v(<8 x double> %vec, double i
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v13, v13, v16, s[14:15]
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v15, v15, v16, s[12:13]
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
|
||||
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[4:7], off
|
||||
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[8:11], off
|
||||
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[12:15], off
|
||||
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
|
||||
; GPRIDX-NEXT: s_endpgm
|
||||
;
|
||||
; MOVREL-LABEL: dyn_insertelement_v8f64_v_s_v:
|
||||
@ -1378,9 +1420,13 @@ define amdgpu_ps void @dyn_insertelement_v8f64_v_s_v(<8 x double> %vec, double i
|
||||
; MOVREL-NEXT: v_cndmask_b32_e64 v14, v14, s2, vcc_lo
|
||||
; MOVREL-NEXT: v_cndmask_b32_e64 v15, v15, s3, vcc_lo
|
||||
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
|
||||
; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[4:7], off
|
||||
; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[8:11], off
|
||||
; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[12:15], off
|
||||
; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; MOVREL-NEXT: s_endpgm
|
||||
entry:
|
||||
%insert = insertelement <8 x double> %vec, double %val, i32 %idx
|
||||
@ -1404,9 +1450,13 @@ define amdgpu_ps void @dyn_insertelement_v8f64_v_v_s(<8 x double> %vec, double %
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v1, v17
|
||||
; GPRIDX-NEXT: s_set_gpr_idx_off
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
|
||||
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[4:7], off
|
||||
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[8:11], off
|
||||
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[12:15], off
|
||||
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
|
||||
; GPRIDX-NEXT: s_endpgm
|
||||
;
|
||||
; MOVREL-LABEL: dyn_insertelement_v8f64_v_v_s:
|
||||
@ -1415,9 +1465,13 @@ define amdgpu_ps void @dyn_insertelement_v8f64_v_v_s(<8 x double> %vec, double %
|
||||
; MOVREL-NEXT: v_movreld_b32_e32 v0, v16
|
||||
; MOVREL-NEXT: v_movreld_b32_e32 v1, v17
|
||||
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
|
||||
; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[4:7], off
|
||||
; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[8:11], off
|
||||
; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[12:15], off
|
||||
; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; MOVREL-NEXT: s_endpgm
|
||||
entry:
|
||||
%insert = insertelement <8 x double> %vec, double %val, i32 %idx
|
||||
@ -1460,9 +1514,13 @@ define amdgpu_ps void @dyn_insertelement_v8f64_v_v_v(<8 x double> %vec, double %
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v13, v13, v17, s[12:13]
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v15, v15, v17, s[10:11]
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
|
||||
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[4:7], off
|
||||
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[8:11], off
|
||||
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[12:15], off
|
||||
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
|
||||
; GPRIDX-NEXT: s_endpgm
|
||||
;
|
||||
; MOVREL-LABEL: dyn_insertelement_v8f64_v_v_v:
|
||||
@ -1494,9 +1552,13 @@ define amdgpu_ps void @dyn_insertelement_v8f64_v_v_v(<8 x double> %vec, double %
|
||||
; MOVREL-NEXT: v_cndmask_b32_e64 v13, v13, v17, s6
|
||||
; MOVREL-NEXT: v_cndmask_b32_e64 v15, v15, v17, s5
|
||||
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
|
||||
; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[4:7], off
|
||||
; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[8:11], off
|
||||
; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[12:15], off
|
||||
; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; MOVREL-NEXT: s_endpgm
|
||||
entry:
|
||||
%insert = insertelement <8 x double> %vec, double %val, i32 %idx
|
||||
@ -1978,24 +2040,25 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_s_s_add_1(<8 x double> inreg %v
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v2, s2
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v3, s3
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
|
||||
; GPRIDX-NEXT: s_nop 0
|
||||
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v1, s5
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v2, s6
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v3, s7
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
|
||||
; GPRIDX-NEXT: s_nop 0
|
||||
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v0, s8
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v1, s9
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v2, s10
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v3, s11
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
|
||||
; GPRIDX-NEXT: s_nop 0
|
||||
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v0, s12
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v1, s13
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v2, s14
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v3, s15
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
|
||||
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
|
||||
; GPRIDX-NEXT: s_endpgm
|
||||
;
|
||||
; MOVREL-LABEL: dyn_insertelement_v8f64_s_s_s_add_1:
|
||||
@ -2035,9 +2098,13 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_s_s_add_1(<8 x double> inreg %v
|
||||
; MOVREL-NEXT: v_mov_b32_e32 v14, s14
|
||||
; MOVREL-NEXT: v_mov_b32_e32 v15, s15
|
||||
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
|
||||
; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[4:7], off
|
||||
; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[8:11], off
|
||||
; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[12:15], off
|
||||
; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; MOVREL-NEXT: s_endpgm
|
||||
entry:
|
||||
%idx.add = add i32 %idx, 1
|
||||
@ -2082,9 +2149,13 @@ define amdgpu_ps void @dyn_insertelement_v8f64_v_v_v_add_1(<8 x double> %vec, do
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v13, v13, v17, s[12:13]
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v15, v15, v17, s[10:11]
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
|
||||
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[4:7], off
|
||||
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[8:11], off
|
||||
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[12:15], off
|
||||
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
|
||||
; GPRIDX-NEXT: s_endpgm
|
||||
;
|
||||
; MOVREL-LABEL: dyn_insertelement_v8f64_v_v_v_add_1:
|
||||
@ -2117,9 +2188,13 @@ define amdgpu_ps void @dyn_insertelement_v8f64_v_v_v_add_1(<8 x double> %vec, do
|
||||
; MOVREL-NEXT: v_cndmask_b32_e64 v13, v13, v17, s6
|
||||
; MOVREL-NEXT: v_cndmask_b32_e64 v15, v15, v17, s5
|
||||
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
|
||||
; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[4:7], off
|
||||
; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[8:11], off
|
||||
; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[12:15], off
|
||||
; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; MOVREL-NEXT: s_endpgm
|
||||
entry:
|
||||
%idx.add = add i32 %idx, 1
|
||||
|
@ -818,9 +818,12 @@ define amdgpu_kernel void @test_div_fmas_f32_logical_cond_to_vcc(float addrspace
|
||||
; GFX7-NEXT: s_mov_b32 s3, 0xf000
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: s_mov_b64 s[0:1], s[6:7]
|
||||
; GFX7-NEXT: buffer_load_dword v3, v[1:2], s[0:3], 0 addr64
|
||||
; GFX7-NEXT: buffer_load_dword v4, v[1:2], s[0:3], 0 addr64 offset:4
|
||||
; GFX7-NEXT: buffer_load_dword v1, v[1:2], s[0:3], 0 addr64 offset:8
|
||||
; GFX7-NEXT: buffer_load_dword v3, v[1:2], s[0:3], 0 addr64 glc
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX7-NEXT: buffer_load_dword v4, v[1:2], s[0:3], 0 addr64 offset:4 glc
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX7-NEXT: buffer_load_dword v1, v[1:2], s[0:3], 0 addr64 offset:8 glc
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX7-NEXT: s_cmp_lg_u32 s8, 0
|
||||
; GFX7-NEXT: s_cselect_b32 s0, 1, 0
|
||||
; GFX7-NEXT: s_and_b32 s0, 1, s0
|
||||
@ -829,7 +832,6 @@ define amdgpu_kernel void @test_div_fmas_f32_logical_cond_to_vcc(float addrspace
|
||||
; GFX7-NEXT: s_mov_b32 s2, -1
|
||||
; GFX7-NEXT: s_and_b64 vcc, vcc, s[0:1]
|
||||
; GFX7-NEXT: s_mov_b64 s[6:7], s[2:3]
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX7-NEXT: v_div_fmas_f32 v0, v3, v4, v1
|
||||
; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:8
|
||||
; GFX7-NEXT: s_endpgm
|
||||
@ -848,9 +850,12 @@ define amdgpu_kernel void @test_div_fmas_f32_logical_cond_to_vcc(float addrspace
|
||||
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
|
||||
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 8, v1
|
||||
; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v2, vcc
|
||||
; GFX8-NEXT: flat_load_dword v1, v[1:2]
|
||||
; GFX8-NEXT: flat_load_dword v2, v[3:4]
|
||||
; GFX8-NEXT: flat_load_dword v3, v[5:6]
|
||||
; GFX8-NEXT: flat_load_dword v1, v[1:2] glc
|
||||
; GFX8-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8-NEXT: flat_load_dword v2, v[3:4] glc
|
||||
; GFX8-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8-NEXT: flat_load_dword v3, v[5:6] glc
|
||||
; GFX8-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8-NEXT: s_add_u32 s0, s4, 8
|
||||
; GFX8-NEXT: s_addc_u32 s1, s5, 0
|
||||
; GFX8-NEXT: s_cmp_lg_u32 s2, 0
|
||||
@ -859,8 +864,7 @@ define amdgpu_kernel void @test_div_fmas_f32_logical_cond_to_vcc(float addrspace
|
||||
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
|
||||
; GFX8-NEXT: v_cmp_ne_u32_e64 s[2:3], 0, s2
|
||||
; GFX8-NEXT: s_and_b64 vcc, vcc, s[2:3]
|
||||
; GFX8-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8-NEXT: s_nop 0
|
||||
; GFX8-NEXT: s_nop 1
|
||||
; GFX8-NEXT: v_div_fmas_f32 v2, v1, v2, v3
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s1
|
||||
@ -874,16 +878,17 @@ define amdgpu_kernel void @test_div_fmas_f32_logical_cond_to_vcc(float addrspace
|
||||
; GFX10_W32-NEXT: s_load_dword s0, s[0:1], 0x54
|
||||
; GFX10_W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
|
||||
; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10_W32-NEXT: s_clause 0x2
|
||||
; GFX10_W32-NEXT: global_load_dword v2, v1, s[6:7]
|
||||
; GFX10_W32-NEXT: global_load_dword v3, v1, s[6:7] offset:4
|
||||
; GFX10_W32-NEXT: global_load_dword v1, v1, s[6:7] offset:8
|
||||
; GFX10_W32-NEXT: global_load_dword v2, v1, s[6:7] glc dlc
|
||||
; GFX10_W32-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10_W32-NEXT: global_load_dword v3, v1, s[6:7] offset:4 glc dlc
|
||||
; GFX10_W32-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10_W32-NEXT: global_load_dword v1, v1, s[6:7] offset:8 glc dlc
|
||||
; GFX10_W32-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10_W32-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GFX10_W32-NEXT: s_cselect_b32 s0, 1, 0
|
||||
; GFX10_W32-NEXT: s_and_b32 s0, 1, s0
|
||||
; GFX10_W32-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
|
||||
; GFX10_W32-NEXT: s_and_b32 vcc_lo, vcc_lo, s0
|
||||
; GFX10_W32-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10_W32-NEXT: v_div_fmas_f32 v0, v2, v3, v1
|
||||
; GFX10_W32-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX10_W32-NEXT: global_store_dword v1, v0, s[4:5] offset:8
|
||||
@ -896,16 +901,17 @@ define amdgpu_kernel void @test_div_fmas_f32_logical_cond_to_vcc(float addrspace
|
||||
; GFX10_W64-NEXT: s_load_dword s0, s[0:1], 0x54
|
||||
; GFX10_W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
|
||||
; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10_W64-NEXT: s_clause 0x2
|
||||
; GFX10_W64-NEXT: global_load_dword v2, v1, s[6:7]
|
||||
; GFX10_W64-NEXT: global_load_dword v3, v1, s[6:7] offset:4
|
||||
; GFX10_W64-NEXT: global_load_dword v1, v1, s[6:7] offset:8
|
||||
; GFX10_W64-NEXT: global_load_dword v2, v1, s[6:7] glc dlc
|
||||
; GFX10_W64-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10_W64-NEXT: global_load_dword v3, v1, s[6:7] offset:4 glc dlc
|
||||
; GFX10_W64-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10_W64-NEXT: global_load_dword v1, v1, s[6:7] offset:8 glc dlc
|
||||
; GFX10_W64-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10_W64-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GFX10_W64-NEXT: s_cselect_b32 s0, 1, 0
|
||||
; GFX10_W64-NEXT: s_and_b32 s0, 1, s0
|
||||
; GFX10_W64-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0
|
||||
; GFX10_W64-NEXT: s_and_b64 vcc, vcc, s[0:1]
|
||||
; GFX10_W64-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10_W64-NEXT: v_div_fmas_f32 v0, v2, v3, v1
|
||||
; GFX10_W64-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX10_W64-NEXT: global_store_dword v1, v0, s[4:5] offset:8
|
||||
|
@ -14,10 +14,11 @@ define amdgpu_kernel void @test_div_scale_f32_1(float addrspace(1)* %out, float
|
||||
; GFX7-NEXT: s_mov_b32 s7, 0xf000
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: s_mov_b64 s[4:5], s[2:3]
|
||||
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
|
||||
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 offset:4
|
||||
; GFX7-NEXT: s_mov_b32 s6, -1
|
||||
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 offset:4 glc
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX7-NEXT: s_mov_b32 s6, -1
|
||||
; GFX7-NEXT: v_div_scale_f32 v0, s[2:3], v0, v0, v2
|
||||
; GFX7-NEXT: s_mov_b64 s[2:3], s[6:7]
|
||||
; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
@ -34,8 +35,9 @@ define amdgpu_kernel void @test_div_scale_f32_1(float addrspace(1)* %out, float
|
||||
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0
|
||||
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
|
||||
; GFX8-NEXT: flat_load_dword v0, v[0:1]
|
||||
; GFX8-NEXT: flat_load_dword v1, v[2:3]
|
||||
; GFX8-NEXT: flat_load_dword v0, v[0:1] glc
|
||||
; GFX8-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8-NEXT: flat_load_dword v1, v[2:3] glc
|
||||
; GFX8-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], v1, v1, v0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s0
|
||||
@ -48,9 +50,9 @@ define amdgpu_kernel void @test_div_scale_f32_1(float addrspace(1)* %out, float
|
||||
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
|
||||
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: s_clause 0x1
|
||||
; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
|
||||
; GFX10-NEXT: global_load_dword v0, v0, s[2:3] offset:4
|
||||
; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: global_load_dword v0, v0, s[2:3] offset:4 glc dlc
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: v_div_scale_f32 v0, s2, v0, v0, v1
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, 0
|
||||
@ -79,10 +81,11 @@ define amdgpu_kernel void @test_div_scale_f32_2(float addrspace(1)* %out, float
|
||||
; GFX7-NEXT: s_mov_b32 s7, 0xf000
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: s_mov_b64 s[4:5], s[2:3]
|
||||
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
|
||||
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 offset:4
|
||||
; GFX7-NEXT: s_mov_b32 s6, -1
|
||||
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 offset:4 glc
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX7-NEXT: s_mov_b32 s6, -1
|
||||
; GFX7-NEXT: v_div_scale_f32 v0, s[2:3], v2, v0, v2
|
||||
; GFX7-NEXT: s_mov_b64 s[2:3], s[6:7]
|
||||
; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
@ -99,8 +102,9 @@ define amdgpu_kernel void @test_div_scale_f32_2(float addrspace(1)* %out, float
|
||||
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0
|
||||
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
|
||||
; GFX8-NEXT: flat_load_dword v0, v[0:1]
|
||||
; GFX8-NEXT: flat_load_dword v1, v[2:3]
|
||||
; GFX8-NEXT: flat_load_dword v0, v[0:1] glc
|
||||
; GFX8-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8-NEXT: flat_load_dword v1, v[2:3] glc
|
||||
; GFX8-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], v0, v1, v0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s0
|
||||
@ -113,9 +117,9 @@ define amdgpu_kernel void @test_div_scale_f32_2(float addrspace(1)* %out, float
|
||||
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
|
||||
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: s_clause 0x1
|
||||
; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
|
||||
; GFX10-NEXT: global_load_dword v0, v0, s[2:3] offset:4
|
||||
; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: global_load_dword v0, v0, s[2:3] offset:4 glc dlc
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: v_div_scale_f32 v0, s2, v1, v0, v1
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, 0
|
||||
@ -147,8 +151,9 @@ define amdgpu_kernel void @test_div_scale_f64_1(double addrspace(1)* %out, doubl
|
||||
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; GFX7-NEXT: v_add_i32_e32 v2, vcc, 8, v0
|
||||
; GFX7-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
|
||||
; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
|
||||
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
|
||||
; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[2:3] glc
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX7-NEXT: v_div_scale_f64 v[0:1], s[2:3], v[2:3], v[2:3], v[0:1]
|
||||
; GFX7-NEXT: v_mov_b32_e32 v3, s1
|
||||
@ -168,8 +173,9 @@ define amdgpu_kernel void @test_div_scale_f64_1(double addrspace(1)* %out, doubl
|
||||
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 8, v0
|
||||
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
|
||||
; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
|
||||
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
|
||||
; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
|
||||
; GFX8-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[2:3] glc
|
||||
; GFX8-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8-NEXT: v_div_scale_f64 v[0:1], s[2:3], v[2:3], v[2:3], v[0:1]
|
||||
; GFX8-NEXT: v_mov_b32_e32 v3, s1
|
||||
@ -183,9 +189,9 @@ define amdgpu_kernel void @test_div_scale_f64_1(double addrspace(1)* %out, doubl
|
||||
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0
|
||||
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: s_clause 0x1
|
||||
; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
|
||||
; GFX10-NEXT: global_load_dwordx2 v[2:3], v2, s[2:3] offset:8
|
||||
; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] glc dlc
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: global_load_dwordx2 v[2:3], v2, s[2:3] offset:8 glc dlc
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, v[2:3], v[2:3], v[0:1]
|
||||
; GFX10-NEXT: v_mov_b32_e32 v2, 0
|
||||
@ -217,8 +223,9 @@ define amdgpu_kernel void @test_div_scale_f64_2(double addrspace(1)* %out, doubl
|
||||
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; GFX7-NEXT: v_add_i32_e32 v2, vcc, 8, v0
|
||||
; GFX7-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
|
||||
; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
|
||||
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
|
||||
; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[2:3] glc
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX7-NEXT: v_div_scale_f64 v[0:1], s[2:3], v[0:1], v[2:3], v[0:1]
|
||||
; GFX7-NEXT: v_mov_b32_e32 v3, s1
|
||||
@ -238,8 +245,9 @@ define amdgpu_kernel void @test_div_scale_f64_2(double addrspace(1)* %out, doubl
|
||||
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 8, v0
|
||||
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
|
||||
; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
|
||||
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
|
||||
; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
|
||||
; GFX8-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[2:3] glc
|
||||
; GFX8-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8-NEXT: v_div_scale_f64 v[0:1], s[2:3], v[0:1], v[2:3], v[0:1]
|
||||
; GFX8-NEXT: v_mov_b32_e32 v3, s1
|
||||
@ -253,9 +261,9 @@ define amdgpu_kernel void @test_div_scale_f64_2(double addrspace(1)* %out, doubl
|
||||
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0
|
||||
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: s_clause 0x1
|
||||
; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
|
||||
; GFX10-NEXT: global_load_dwordx2 v[2:3], v2, s[2:3] offset:8
|
||||
; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] glc dlc
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: global_load_dwordx2 v[2:3], v2, s[2:3] offset:8 glc dlc
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, v[0:1], v[2:3], v[0:1]
|
||||
; GFX10-NEXT: v_mov_b32_e32 v2, 0
|
||||
@ -1056,12 +1064,12 @@ define amdgpu_kernel void @test_div_scale_f32_fabs_num(float addrspace(1)* %out,
|
||||
; GFX7-NEXT: s_mov_b32 s7, 0xf000
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: s_mov_b64 s[4:5], s[2:3]
|
||||
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
|
||||
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 offset:4
|
||||
; GFX7-NEXT: s_mov_b32 s6, -1
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(1)
|
||||
; GFX7-NEXT: v_and_b32_e32 v1, 0x7fffffff, v2
|
||||
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 offset:4 glc
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX7-NEXT: s_mov_b32 s6, -1
|
||||
; GFX7-NEXT: v_and_b32_e32 v1, 0x7fffffff, v2
|
||||
; GFX7-NEXT: v_div_scale_f32 v0, s[2:3], v0, v0, v1
|
||||
; GFX7-NEXT: s_mov_b64 s[2:3], s[6:7]
|
||||
; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
@ -1078,11 +1086,11 @@ define amdgpu_kernel void @test_div_scale_f32_fabs_num(float addrspace(1)* %out,
|
||||
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0
|
||||
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
|
||||
; GFX8-NEXT: flat_load_dword v0, v[0:1]
|
||||
; GFX8-NEXT: flat_load_dword v1, v[2:3]
|
||||
; GFX8-NEXT: s_waitcnt vmcnt(1)
|
||||
; GFX8-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
|
||||
; GFX8-NEXT: flat_load_dword v0, v[0:1] glc
|
||||
; GFX8-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8-NEXT: flat_load_dword v1, v[2:3] glc
|
||||
; GFX8-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
|
||||
; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], v1, v1, v0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s1
|
||||
@ -1094,12 +1102,11 @@ define amdgpu_kernel void @test_div_scale_f32_fabs_num(float addrspace(1)* %out,
|
||||
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
|
||||
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: s_clause 0x1
|
||||
; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
|
||||
; GFX10-NEXT: global_load_dword v0, v0, s[2:3] offset:4
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(1)
|
||||
; GFX10-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
|
||||
; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: global_load_dword v0, v0, s[2:3] offset:4 glc dlc
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
|
||||
; GFX10-NEXT: v_div_scale_f32 v0, s2, v0, v0, v1
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
|
||||
@ -1129,10 +1136,11 @@ define amdgpu_kernel void @test_div_scale_f32_fabs_den(float addrspace(1)* %out,
|
||||
; GFX7-NEXT: s_mov_b32 s7, 0xf000
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: s_mov_b64 s[4:5], s[2:3]
|
||||
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
|
||||
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 offset:4
|
||||
; GFX7-NEXT: s_mov_b32 s6, -1
|
||||
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 offset:4 glc
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX7-NEXT: s_mov_b32 s6, -1
|
||||
; GFX7-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
|
||||
; GFX7-NEXT: v_div_scale_f32 v0, s[2:3], v0, v0, v2
|
||||
; GFX7-NEXT: s_mov_b64 s[2:3], s[6:7]
|
||||
@ -1150,8 +1158,9 @@ define amdgpu_kernel void @test_div_scale_f32_fabs_den(float addrspace(1)* %out,
|
||||
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0
|
||||
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
|
||||
; GFX8-NEXT: flat_load_dword v0, v[0:1]
|
||||
; GFX8-NEXT: flat_load_dword v1, v[2:3]
|
||||
; GFX8-NEXT: flat_load_dword v0, v[0:1] glc
|
||||
; GFX8-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8-NEXT: flat_load_dword v1, v[2:3] glc
|
||||
; GFX8-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
|
||||
; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], v1, v1, v0
|
||||
@ -1165,9 +1174,9 @@ define amdgpu_kernel void @test_div_scale_f32_fabs_den(float addrspace(1)* %out,
|
||||
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
|
||||
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: s_clause 0x1
|
||||
; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
|
||||
; GFX10-NEXT: global_load_dword v0, v0, s[2:3] offset:4
|
||||
; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: global_load_dword v0, v0, s[2:3] offset:4 glc dlc
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
|
||||
; GFX10-NEXT: v_div_scale_f32 v0, s2, v0, v0, v1
|
||||
|
@ -16,11 +16,13 @@ define amdgpu_kernel void @test_wave32(i32 %arg0, [8 x i32], i32 %saved) {
; GCN-NEXT: ; %bb.1: ; %mid
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: global_store_dword v[0:1], v0, off
; GCN-NEXT: s_waitcnt_vscnt null, 0x0
; GCN-NEXT: BB0_2: ; %bb
; GCN-NEXT: s_waitcnt_depctr 0xffe3
; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: global_store_dword v[0:1], v0, off
; GCN-NEXT: s_waitcnt_vscnt null, 0x0
; GCN-NEXT: s_endpgm
entry:
%cond = icmp eq i32 %arg0, 0

@ -15,10 +15,12 @@ define amdgpu_kernel void @test_wave64(i32 %arg0, i64 %saved) {
; GCN-NEXT: ; %bb.1: ; %mid
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: global_store_dword v[0:1], v0, off
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: BB0_2: ; %bb
; GCN-NEXT: s_or_b64 exec, exec, s[0:1]
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: global_store_dword v[0:1], v0, off
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_endpgm
entry:
%cond = icmp eq i32 %arg0, 0

@ -15,6 +15,7 @@ define amdgpu_kernel void @test_wave32(i32 %arg0, [8 x i32], i32 %saved) {
; GCN-NEXT: s_or_b32 s0, s0, s1
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: global_store_dword v[0:1], v0, off
; GCN-NEXT: s_waitcnt_vscnt null, 0x0
; GCN-NEXT: s_endpgm
entry:
%cond = icmp eq i32 %arg0, 0

@ -15,6 +15,7 @@ define amdgpu_kernel void @test_wave64(i32 %arg0, [8 x i32], i64 %saved) {
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_endpgm
entry:
%cond = icmp eq i32 %arg0, 0

@ -14,9 +14,10 @@ define amdgpu_kernel void @is_private_vgpr(i8* addrspace(1)* %ptr.ptr) {
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: s_load_dword s0, s[4:5], 0x11
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v1
; CI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; CI-NEXT: flat_store_dword v[0:1], v0
@ -27,10 +28,10 @@ define amdgpu_kernel void @is_private_vgpr(i8* addrspace(1)* %ptr.ptr) {
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1]
; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 0, 16)
; GFX9-NEXT: s_lshl_b32 s0, s0, 16
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s0, v1
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX9-NEXT: global_store_dword v[0:1], v0, off
@ -59,6 +60,7 @@ define amdgpu_kernel void @is_private_sgpr(i8* %ptr) {
; CI-NEXT: ; %bb.1: ; %bb0
; CI-NEXT: v_mov_b32_e32 v0, 0
; CI-NEXT: flat_store_dword v[0:1], v0
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: BB1_2: ; %bb1
; CI-NEXT: s_endpgm
;
@ -76,6 +78,7 @@ define amdgpu_kernel void @is_private_sgpr(i8* %ptr) {
; GFX9-NEXT: ; %bb.1: ; %bb0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: global_store_dword v[0:1], v0, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: BB1_2: ; %bb1
; GFX9-NEXT: s_endpgm
%val = call i1 @llvm.amdgcn.is.private(i8* %ptr)

@ -14,9 +14,10 @@ define amdgpu_kernel void @is_local_vgpr(i8* addrspace(1)* %ptr.ptr) {
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: s_load_dword s0, s[4:5], 0x10
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v1
; CI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; CI-NEXT: flat_store_dword v[0:1], v0
@ -27,10 +28,10 @@ define amdgpu_kernel void @is_local_vgpr(i8* addrspace(1)* %ptr.ptr) {
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1]
; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 16, 16)
; GFX9-NEXT: s_lshl_b32 s0, s0, 16
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s0, v1
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX9-NEXT: global_store_dword v[0:1], v0, off
@ -59,6 +60,7 @@ define amdgpu_kernel void @is_local_sgpr(i8* %ptr) {
; CI-NEXT: ; %bb.1: ; %bb0
; CI-NEXT: v_mov_b32_e32 v0, 0
; CI-NEXT: flat_store_dword v[0:1], v0
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: BB1_2: ; %bb1
; CI-NEXT: s_endpgm
;
@ -76,6 +78,7 @@ define amdgpu_kernel void @is_local_sgpr(i8* %ptr) {
; GFX9-NEXT: ; %bb.1: ; %bb0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: global_store_dword v[0:1], v0, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: BB1_2: ; %bb1
; GFX9-NEXT: s_endpgm
%val = call i1 @llvm.amdgcn.is.shared(i8* %ptr)

@ -32,6 +32,7 @@ define amdgpu_kernel void @s_trig_preop_f64(double %a, i32 %b) {
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_trig_preop_f64 v[0:1], s[0:1], v0
; CI-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: s_endpgm
;
; VI-LABEL: s_trig_preop_f64:
@ -42,6 +43,7 @@ define amdgpu_kernel void @s_trig_preop_f64(double %a, i32 %b) {
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_trig_preop_f64 v[0:1], s[0:1], v0
; VI-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: s_trig_preop_f64:
@ -52,6 +54,7 @@ define amdgpu_kernel void @s_trig_preop_f64(double %a, i32 %b) {
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_trig_preop_f64 v[0:1], s[0:1], v0
; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
%result = call double @llvm.amdgcn.trig.preop.f64(double %a, i32 %b)
store volatile double %result, double* undef
@ -65,6 +68,7 @@ define amdgpu_kernel void @s_trig_preop_f64_imm(double %a, i32 %b) {
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_trig_preop_f64 v[0:1], s[0:1], 7
; GCN-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_endpgm
%result = call double @llvm.amdgcn.trig.preop.f64(double %a, i32 7)
store volatile double %result, double* undef

@ -17,17 +17,23 @@ define amdgpu_kernel void @localize_constants(i1 %cond) {
|
||||
; GFX9-NEXT: ; %bb.1: ; %bb1
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, 0x5be6
|
||||
; GFX9-NEXT: global_store_dword v[0:1], v0, off
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, 0x1c7
|
||||
; GFX9-NEXT: global_store_dword v[0:1], v0, off
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, 0x3e8
|
||||
; GFX9-NEXT: global_store_dword v[0:1], v0, off
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, 0x1c8
|
||||
; GFX9-NEXT: global_store_dword v[0:1], v0, off
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, 0x3e7
|
||||
; GFX9-NEXT: global_store_dword v[0:1], v0, off
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b
|
||||
; GFX9-NEXT: s_mov_b32 s0, 0
|
||||
; GFX9-NEXT: global_store_dword v[0:1], v0, off
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: BB0_2: ; %Flow
|
||||
; GFX9-NEXT: s_xor_b32 s0, s0, -1
|
||||
; GFX9-NEXT: s_and_b32 s0, s0, 1
|
||||
@ -36,16 +42,22 @@ define amdgpu_kernel void @localize_constants(i1 %cond) {
|
||||
; GFX9-NEXT: ; %bb.3: ; %bb0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b
|
||||
; GFX9-NEXT: global_store_dword v[0:1], v0, off
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, 0x1c8
|
||||
; GFX9-NEXT: global_store_dword v[0:1], v0, off
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, 0x3e7
|
||||
; GFX9-NEXT: global_store_dword v[0:1], v0, off
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, 0x3e8
|
||||
; GFX9-NEXT: global_store_dword v[0:1], v0, off
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, 0x1c7
|
||||
; GFX9-NEXT: global_store_dword v[0:1], v0, off
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, 0x5be6
|
||||
; GFX9-NEXT: global_store_dword v[0:1], v0, off
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: BB0_4: ; %bb2
|
||||
; GFX9-NEXT: s_endpgm
|
||||
entry:
|
||||
@ -104,7 +116,9 @@ define amdgpu_kernel void @localize_globals(i1 %cond) {
|
||||
; GFX9-NEXT: s_mov_b32 s0, 0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: global_store_dword v0, v0, s[4:5]
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: BB1_2: ; %Flow
|
||||
; GFX9-NEXT: s_xor_b32 s0, s0, -1
|
||||
; GFX9-NEXT: s_and_b32 s0, s0, 1
|
||||
@ -123,7 +137,9 @@ define amdgpu_kernel void @localize_globals(i1 %cond) {
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, 1
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: global_store_dword v0, v0, s[0:1]
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: BB1_4: ; %bb2
|
||||
; GFX9-NEXT: s_endpgm
|
||||
entry:
|
||||
@ -165,11 +181,13 @@ define void @localize_internal_globals(i1 %cond) {
|
||||
; GFX9-NEXT: s_addc_u32 s7, s7, static.gv2@rel32@hi+12
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX9-NEXT: global_store_dword v0, v0, s[6:7]
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_getpc_b64 s[6:7]
|
||||
; GFX9-NEXT: s_add_u32 s6, s6, static.gv3@rel32@lo+4
|
||||
; GFX9-NEXT: s_addc_u32 s7, s7, static.gv3@rel32@hi+12
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, 1
|
||||
; GFX9-NEXT: global_store_dword v0, v1, s[6:7]
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: BB2_2: ; %Flow
|
||||
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], s[4:5]
|
||||
; GFX9-NEXT: s_xor_b64 exec, exec, s[4:5]
|
||||
@ -180,11 +198,13 @@ define void @localize_internal_globals(i1 %cond) {
|
||||
; GFX9-NEXT: s_addc_u32 s7, s7, static.gv0@rel32@hi+12
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX9-NEXT: global_store_dword v0, v0, s[6:7]
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_getpc_b64 s[6:7]
|
||||
; GFX9-NEXT: s_add_u32 s6, s6, static.gv1@rel32@lo+4
|
||||
; GFX9-NEXT: s_addc_u32 s7, s7, static.gv1@rel32@hi+12
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, 1
|
||||
; GFX9-NEXT: global_store_dword v0, v1, s[6:7]
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: BB2_4: ; %bb2
|
||||
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
|
@ -468,7 +468,7 @@ define amdgpu_ps float @mubuf_load_sgpr_ptr(float addrspace(1)* inreg %ptr) {
|
||||
; GFX6-NEXT: s_mov_b32 s1, s3
|
||||
; GFX6-NEXT: s_mov_b32 s2, -1
|
||||
; GFX6-NEXT: s_mov_b32 s3, 0xf000
|
||||
; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0
|
||||
; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc
|
||||
; GFX6-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX6-NEXT: ; return to shader part epilog
|
||||
;
|
||||
@ -478,7 +478,7 @@ define amdgpu_ps float @mubuf_load_sgpr_ptr(float addrspace(1)* inreg %ptr) {
|
||||
; GFX7-NEXT: s_mov_b32 s1, s3
|
||||
; GFX7-NEXT: s_mov_b32 s2, -1
|
||||
; GFX7-NEXT: s_mov_b32 s3, 0xf000
|
||||
; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], 0
|
||||
; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX7-NEXT: ; return to shader part epilog
|
||||
%val = load volatile float, float addrspace(1)* %ptr
|
||||
@ -493,7 +493,7 @@ define amdgpu_ps float @mubuf_load_sgpr_ptr_offset4095(float addrspace(1)* inreg
|
||||
; GFX6-NEXT: s_mov_b32 s2, -1
|
||||
; GFX6-NEXT: s_mov_b32 s3, 0xf000
|
||||
; GFX6-NEXT: s_movk_i32 s4, 0x3ffc
|
||||
; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], s4
|
||||
; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], s4 glc
|
||||
; GFX6-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX6-NEXT: ; return to shader part epilog
|
||||
;
|
||||
@ -504,7 +504,7 @@ define amdgpu_ps float @mubuf_load_sgpr_ptr_offset4095(float addrspace(1)* inreg
|
||||
; GFX7-NEXT: s_mov_b32 s2, -1
|
||||
; GFX7-NEXT: s_mov_b32 s3, 0xf000
|
||||
; GFX7-NEXT: s_movk_i32 s4, 0x3ffc
|
||||
; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s4
|
||||
; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s4 glc
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX7-NEXT: ; return to shader part epilog
|
||||
%gep = getelementptr float, float addrspace(1)* %ptr, i64 4095
|
||||
@ -523,7 +523,7 @@ define amdgpu_ps float @mubuf_load_sgpr_ptr_offset4294967296(float addrspace(1)*
|
||||
; GFX6-NEXT: s_mov_b32 s3, 0xf000
|
||||
; GFX6-NEXT: s_mov_b32 s2, s4
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s5
|
||||
; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
|
||||
; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
|
||||
; GFX6-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX6-NEXT: ; return to shader part epilog
|
||||
;
|
||||
@ -537,7 +537,7 @@ define amdgpu_ps float @mubuf_load_sgpr_ptr_offset4294967296(float addrspace(1)*
|
||||
; GFX7-NEXT: s_mov_b32 s3, 0xf000
|
||||
; GFX7-NEXT: s_mov_b32 s2, s4
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s5
|
||||
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
|
||||
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX7-NEXT: ; return to shader part epilog
|
||||
%gep = getelementptr float, float addrspace(1)* %ptr, i64 4294967296
|
||||
@ -556,7 +556,7 @@ define amdgpu_ps float @mubuf_load_sgpr_ptr_offset4294967297(float addrspace(1)*
|
||||
; GFX6-NEXT: s_mov_b32 s2, 0
|
||||
; GFX6-NEXT: s_mov_b32 s3, 0xf000
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s5
|
||||
; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
|
||||
; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
|
||||
; GFX6-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX6-NEXT: ; return to shader part epilog
|
||||
;
|
||||
@ -570,7 +570,7 @@ define amdgpu_ps float @mubuf_load_sgpr_ptr_offset4294967297(float addrspace(1)*
|
||||
; GFX7-NEXT: s_mov_b32 s2, 0
|
||||
; GFX7-NEXT: s_mov_b32 s3, 0xf000
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s5
|
||||
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
|
||||
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX7-NEXT: ; return to shader part epilog
|
||||
%gep = getelementptr float, float addrspace(1)* %ptr, i64 4294967297
|
||||
@ -586,7 +586,7 @@ define amdgpu_ps float @mubuf_load_sgpr_ptr_offset4096(float addrspace(1)* inreg
; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: s_movk_i32 s4, 0x4000
; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], s4
; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], s4 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: ; return to shader part epilog
;
@ -597,7 +597,7 @@ define amdgpu_ps float @mubuf_load_sgpr_ptr_offset4096(float addrspace(1)* inreg
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_movk_i32 s4, 0x4000
; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s4
; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s4 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: ; return to shader part epilog
%gep = getelementptr float, float addrspace(1)* %ptr, i64 4096
@ -612,7 +612,7 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_offset4095(float addrspace(1)* %ptr)
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: s_mov_b64 s[0:1], 0
; GFX6-NEXT: s_movk_i32 s4, 0x3ffc
; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], s4 addr64
; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], s4 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: ; return to shader part epilog
;
@ -622,7 +622,7 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_offset4095(float addrspace(1)* %ptr)
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: s_movk_i32 s4, 0x3ffc
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], s4 addr64
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], s4 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: ; return to shader part epilog
%gep = getelementptr float, float addrspace(1)* %ptr, i64 4095
@ -637,7 +637,7 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_offset4294967296(float addrspace(1)*
; GFX6-NEXT: s_mov_b32 s1, 4
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: s_mov_b32 s2, s0
; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: ; return to shader part epilog
;
@ -647,7 +647,7 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_offset4294967296(float addrspace(1)*
; GFX7-NEXT: s_mov_b32 s1, 4
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_mov_b32 s2, s0
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: ; return to shader part epilog
%gep = getelementptr float, float addrspace(1)* %ptr, i64 4294967296
@ -662,7 +662,7 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_offset4294967297(float addrspace(1)*
; GFX6-NEXT: s_mov_b32 s1, s0
; GFX6-NEXT: s_mov_b32 s2, 0
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: ; return to shader part epilog
;
@ -672,7 +672,7 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_offset4294967297(float addrspace(1)*
; GFX7-NEXT: s_mov_b32 s1, s0
; GFX7-NEXT: s_mov_b32 s2, 0
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: ; return to shader part epilog
%gep = getelementptr float, float addrspace(1)* %ptr, i64 4294967297
@ -687,7 +687,7 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_offset4096(float addrspace(1)* %ptr)
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: s_mov_b64 s[0:1], 0
; GFX6-NEXT: s_movk_i32 s4, 0x4000
; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], s4 addr64
; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], s4 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: ; return to shader part epilog
;
@ -697,7 +697,7 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_offset4096(float addrspace(1)* %ptr)
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: s_movk_i32 s4, 0x4000
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], s4 addr64
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], s4 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: ; return to shader part epilog
%gep = getelementptr float, float addrspace(1)* %ptr, i64 4096
@ -716,7 +716,7 @@ define amdgpu_ps float @mubuf_load_sgpr_ptr_sgpr_offset(float addrspace(1)* inre
; GFX6-NEXT: s_mov_b32 s2, 0
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: v_mov_b32_e32 v1, s5
; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: ; return to shader part epilog
;
@ -730,7 +730,7 @@ define amdgpu_ps float @mubuf_load_sgpr_ptr_sgpr_offset(float addrspace(1)* inre
; GFX7-NEXT: s_mov_b32 s2, 0
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: ; return to shader part epilog
%gep = getelementptr float, float addrspace(1)* %ptr, i32 %soffset
@ -745,7 +745,7 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_sgpr_offset(float addrspace(1)* %ptr
; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GFX6-NEXT: s_mov_b32 s2, 0
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: ; return to shader part epilog
;
@ -755,7 +755,7 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_sgpr_offset(float addrspace(1)* %ptr
; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GFX7-NEXT: s_mov_b32 s2, 0
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: ; return to shader part epilog
%gep = getelementptr float, float addrspace(1)* %ptr, i32 %soffset
@ -770,7 +770,7 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_sgpr_offset_offset256(float addrspac
; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GFX6-NEXT: s_mov_b32 s2, 0
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 offset:1024
; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 offset:1024 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: ; return to shader part epilog
;
@ -780,7 +780,7 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_sgpr_offset_offset256(float addrspac
; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GFX7-NEXT: s_mov_b32 s2, 0
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 offset:1024
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 offset:1024 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: ; return to shader part epilog
%gep0 = getelementptr float, float addrspace(1)* %ptr, i32 %soffset
@ -802,7 +802,7 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_sgpr_offset256_offset(float addrspac
; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: s_mov_b32 s2, s5
; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: ; return to shader part epilog
;
@ -818,7 +818,7 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_sgpr_offset256_offset(float addrspac
; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_mov_b32 s2, s5
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: ; return to shader part epilog
%gep0 = getelementptr float, float addrspace(1)* %ptr, i64 256
@ -836,7 +836,7 @@ define amdgpu_ps float @mubuf_load_sgpr_ptr_vgpr_offset(float addrspace(1)* inre
; GFX6-NEXT: s_mov_b32 s1, s3
; GFX6-NEXT: s_mov_b32 s2, 0
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: ; return to shader part epilog
;
@ -848,7 +848,7 @@ define amdgpu_ps float @mubuf_load_sgpr_ptr_vgpr_offset(float addrspace(1)* inre
; GFX7-NEXT: s_mov_b32 s1, s3
; GFX7-NEXT: s_mov_b32 s2, 0
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: ; return to shader part epilog
%gep = getelementptr float, float addrspace(1)* %ptr, i32 %voffset
@ -866,7 +866,7 @@ define amdgpu_ps float @mubuf_load_sgpr_ptr_vgpr_offset_offset4095(float addrspa
; GFX6-NEXT: s_mov_b32 s2, 0
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: s_movk_i32 s4, 0x3ffc
; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], s4 addr64
; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], s4 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: ; return to shader part epilog
;
@ -879,7 +879,7 @@ define amdgpu_ps float @mubuf_load_sgpr_ptr_vgpr_offset_offset4095(float addrspa
; GFX7-NEXT: s_mov_b32 s2, 0
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_movk_i32 s4, 0x3ffc
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], s4 addr64
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], s4 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: ; return to shader part epilog
%gep0 = getelementptr float, float addrspace(1)* %ptr, i32 %voffset
@ -896,7 +896,7 @@ define amdgpu_ps float @mubuf_load_sgpr_ptr_offset4095_vgpr_offset(float addrspa
; GFX6-NEXT: s_mov_b32 s6, 0
; GFX6-NEXT: s_addc_u32 s5, s3, 0
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: ; return to shader part epilog
;
@ -908,7 +908,7 @@ define amdgpu_ps float @mubuf_load_sgpr_ptr_offset4095_vgpr_offset(float addrspa
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_addc_u32 s5, s3, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: ; return to shader part epilog
%gep0 = getelementptr float, float addrspace(1)* %ptr, i64 4095
|
@ -56,6 +56,7 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache
; GCN-NEXT: BB0_3: ; %bb.2
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: global_store_dword v[0:1], v0, off
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_endpgm

entry:
@ -129,6 +130,7 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache
; GCN-NEXT: BB1_2: ; %bb.1
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: global_store_dword v[0:1], v0, off
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_endpgm
entry:
%cond = icmp eq i32 %arg.cond, 0
@ -193,9 +195,9 @@ define void @func_non_entry_block_static_alloca_align4(i32 addrspace(1)* %out, i
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: global_store_dword v[0:1], v0, off
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_sub_u32 s32, s32, 0x400
; GCN-NEXT: s_mov_b32 s33, s8
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]

entry:
@ -257,9 +259,9 @@ define void @func_non_entry_block_static_alloca_align64(i32 addrspace(1)* %out,
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: global_store_dword v[0:1], v0, off
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_sub_u32 s32, s32, 0x2000
; GCN-NEXT: s_mov_b32 s33, s8
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
entry:
%cond = icmp eq i32 %arg.cond, 0
|
@ -2,7 +2,7 @@

; GCN-LABEL: {{^}}kernel_ieee_mode_default:
; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
; GCN-NEXT: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
; GCN-DAG: v_mul_f32_e32 [[QUIET0:v[0-9]+]], 1.0, [[VAL0]]
; GCN-DAG: v_mul_f32_e32 [[QUIET1:v[0-9]+]], 1.0, [[VAL1]]
; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]]
@ -17,7 +17,7 @@ define amdgpu_kernel void @kernel_ieee_mode_default() #0 {

; GCN-LABEL: {{^}}kernel_ieee_mode_on:
; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
; GCN-NEXT: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
; GCN-DAG: v_mul_f32_e32 [[QUIET0:v[0-9]+]], 1.0, [[VAL0]]
; GCN-DAG: v_mul_f32_e32 [[QUIET1:v[0-9]+]], 1.0, [[VAL1]]
; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]]
@ -32,7 +32,7 @@ define amdgpu_kernel void @kernel_ieee_mode_on() #1 {

; GCN-LABEL: {{^}}kernel_ieee_mode_off:
; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
; GCN-NEXT: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
; GCN-NOT: [[VAL0]]
; GCN-NOT: [[VAL1]]
; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[VAL0]], [[VAL1]]
@ -47,7 +47,7 @@ define amdgpu_kernel void @kernel_ieee_mode_off() #2 {

; GCN-LABEL: {{^}}func_ieee_mode_default:
; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
; GCN-NEXT: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
; GCN-DAG: v_mul_f32_e32 [[QUIET0:v[0-9]+]], 1.0, [[VAL0]]
; GCN-DAG: v_mul_f32_e32 [[QUIET1:v[0-9]+]], 1.0, [[VAL1]]
; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]]
@ -62,7 +62,7 @@ define void @func_ieee_mode_default() #0 {

; GCN-LABEL: {{^}}func_ieee_mode_on:
; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
; GCN-NEXT: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
; GCN-DAG: v_mul_f32_e32 [[QUIET0:v[0-9]+]], 1.0, [[VAL0]]
; GCN-DAG: v_mul_f32_e32 [[QUIET1:v[0-9]+]], 1.0, [[VAL1]]
; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]]
@ -77,7 +77,7 @@ define void @func_ieee_mode_on() #1 {

; GCN-LABEL: {{^}}func_ieee_mode_off:
; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
; GCN-NEXT: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
; GCN-NOT: [[VAL0]]
; GCN-NOT: [[VAL1]]
; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[VAL0]], [[VAL1]]
@ -92,7 +92,7 @@ define void @func_ieee_mode_off() #2 {

; GCN-LABEL: {{^}}cs_ieee_mode_default:
; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
; GCN-NEXT: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
; GCN-NOT: [[VAL0]]
; GCN-NOT: [[VAL1]]
; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]]
@ -107,7 +107,7 @@ define amdgpu_cs void @cs_ieee_mode_default() #0 {

; GCN-LABEL: {{^}}cs_ieee_mode_on:
; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
; GCN-NEXT: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
; GCN-DAG: v_mul_f32_e32 [[QUIET0:v[0-9]+]], 1.0, [[VAL0]]
; GCN-DAG: v_mul_f32_e32 [[QUIET1:v[0-9]+]], 1.0, [[VAL1]]
; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]]
@ -122,7 +122,7 @@ define amdgpu_cs void @cs_ieee_mode_on() #1 {

; GCN-LABEL: {{^}}cs_ieee_mode_off:
; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
; GCN-NEXT: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
; GCN-NOT: [[VAL0]]
; GCN-NOT: [[VAL1]]
; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[VAL0]], [[VAL1]]
@ -137,7 +137,7 @@ define amdgpu_cs void @cs_ieee_mode_off() #2 {

; GCN-LABEL: {{^}}ps_ieee_mode_default:
; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
; GCN-NEXT: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
; GCN-NOT: [[VAL0]]
; GCN-NOT: [[VAL1]]
; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[VAL0]], [[VAL1]]
@ -152,7 +152,7 @@ define amdgpu_ps void @ps_ieee_mode_default() #0 {

; GCN-LABEL: {{^}}ps_ieee_mode_on:
; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
; GCN-NEXT: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
; GCN-DAG: v_mul_f32_e32 [[QUIET0:v[0-9]+]], 1.0, [[VAL0]]
; GCN-DAG: v_mul_f32_e32 [[QUIET1:v[0-9]+]], 1.0, [[VAL1]]
; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]]
@ -167,7 +167,7 @@ define amdgpu_ps void @ps_ieee_mode_on() #1 {

; GCN-LABEL: {{^}}ps_ieee_mode_off:
; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
; GCN-NEXT: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
; GCN-NOT: [[VAL0]]
; GCN-NOT: [[VAL1]]
; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[VAL0]], [[VAL1]]
|
@ -1,4 +1,3 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs -amdgpu-s-branch-bits=7 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX1030 %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -amdgpu-s-branch-bits=7 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX1010 %s
@ -245,7 +245,7 @@ bb3:
|
||||
|
||||
; GCN: v_mov_b32_e32 [[BB4_K:v[0-9]+]], 63
|
||||
; GCN: buffer_store_dword [[BB4_K]]
|
||||
; GCN-NEXT: s_endpgm
|
||||
; GCN: s_endpgm
|
||||
; GCN-NEXT: .Lfunc_end{{[0-9]+}}:
|
||||
define amdgpu_kernel void @uniform_unconditional_min_long_forward_branch(i32 addrspace(1)* %arg, i32 %arg1) {
|
||||
bb0:
|
||||
|
@ -11,12 +11,12 @@
|
||||
; GCN: s_and_saveexec_b64
|
||||
; GCN: s_cbranch_execz [[BB1:BB[0-9]+_[0-9]+]]
|
||||
|
||||
; GCN: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s32{{$}}
|
||||
; GCN: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s32 glc{{$}}
|
||||
; GCN-NOT: s32
|
||||
; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s32{{$}}
|
||||
; GCN-NOT: s32
|
||||
|
||||
; GCN: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s32 offset:16{{$}}
|
||||
; GCN: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s32 offset:16 glc{{$}}
|
||||
; GCN-NOT: s32
|
||||
; GCN: buffer_store_dword [[LOAD1]], off, s[0:3], s32 offset:16{{$}}
|
||||
; GCN-NOT: s32
|
||||
|
@ -87,8 +87,6 @@ define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 {
|
||||
; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
|
||||
; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i1_signext@rel32@lo+4
|
||||
; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i1_signext@rel32@hi+12
|
||||
|
||||
; GCN: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: v_bfe_i32 v0, v0, 0, 1
|
||||
; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
|
||||
; GCN-NEXT: s_endpgm
|
||||
@ -110,9 +108,6 @@ define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 {
|
||||
; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
|
||||
; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i1_zeroext@rel32@lo+4
|
||||
; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i1_zeroext@rel32@hi+12
|
||||
|
||||
|
||||
; GCN: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: v_and_b32_e32 v0, 1, v0
|
||||
; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
|
||||
; GCN-NEXT: s_endpgm
|
||||
|
@ -57,10 +57,10 @@ define void @callee_with_stack() #0 {
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, 0{{$}}
|
||||
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4{{$}}
|
||||
; FLATSCR-NEXT: scratch_store_dword off, v0, s33 offset:4{{$}}
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; MUBUF-NEXT: s_sub_u32 s32, s32, 0x200
|
||||
; FLATSCR-NEXT: s_sub_u32 s32, s32, 8
|
||||
; GCN-NEXT: s_mov_b32 s33, [[FP_COPY]]
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: s_setpc_b64
|
||||
define void @callee_with_stack_no_fp_elim_all() #1 {
|
||||
%alloca = alloca i32, addrspace(5)
|
||||
@ -346,10 +346,10 @@ define void @no_new_vgpr_for_fp_csr() #1 {
|
||||
; GCN-NEXT: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
|
||||
; MUBUF-NEXT: buffer_store_dword [[ZERO]], off, s[0:3], s33
|
||||
; FLATSCR-NEXT: scratch_store_dword off, [[ZERO]], s33
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; MUBUF-NEXT: s_sub_u32 s32, s32, 0x100000
|
||||
; FLATSCR-NEXT: s_sub_u32 s32, s32, 0x4000
|
||||
; GCN-NEXT: s_mov_b32 s33, [[FP_COPY]]
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: s_setpc_b64
|
||||
define void @realign_stack_no_fp_elim() #1 {
|
||||
%alloca = alloca i32, align 8192, addrspace(5)
|
||||
@ -366,6 +366,7 @@ define void @realign_stack_no_fp_elim() #1 {
|
||||
; GCN: v_writelane_b32 v1, s31, 1
|
||||
; MUBUF: buffer_store_dword [[ZERO]], off, s[0:3], s33 offset:4
|
||||
; FLATSCR: scratch_store_dword off, [[ZERO]], s33 offset:4
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN: ;;#ASMSTART
|
||||
; MUBUF: v_readlane_b32 s4, v1, 0
|
||||
; MUBUF-NEXT: s_add_u32 s32, s32, 0x200
|
||||
@ -376,7 +377,6 @@ define void @realign_stack_no_fp_elim() #1 {
|
||||
; MUBUF-NEXT: s_sub_u32 s32, s32, 0x200
|
||||
; FLATSCR-NEXT: s_sub_u32 s32, s32, 8
|
||||
; GCN-NEXT: v_readlane_b32 s33, v1, 2
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; MUBUF-NEXT: s_setpc_b64 s[4:5]
|
||||
; FLATSCR-NEXT: s_setpc_b64 s[0:1]
|
||||
define void @no_unused_non_csr_sgpr_for_fp() #1 {
|
||||
|
@ -399,9 +399,7 @@ define amdgpu_kernel void @kern_indirect_other_arg_use_workitem_id_z() #1 {
|
||||
; VARABI: buffer_load_dword v32, off, s[0:3], s32{{$}}
|
||||
; VARABI: v_and_b32_e32 v32, 0x3ff, v32
|
||||
; VARABI: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, v32
|
||||
|
||||
; VARABI: s_waitcnt
|
||||
; VARABI-NEXT: s_setpc_b64
|
||||
; VARABI: s_setpc_b64
|
||||
|
||||
; FIXEDABI: v_and_b32_e32 v31, 0x3ff, v31
|
||||
; FIXEDABI: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32{{$}}
|
||||
@ -545,7 +543,7 @@ define void @too_many_args_call_too_many_args_use_workitem_id_x(
|
||||
; VARABI-NEXT: s_waitcnt
|
||||
; VARABI-NEXT: v_and_b32_e32 v32, 0x3ff, v32
|
||||
; VARABI-NEXT: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v32
|
||||
; VARABI: buffer_load_dword v0, off, s[0:3], s32{{$}}
|
||||
; VARABI: buffer_load_dword v0, off, s[0:3], s32 glc{{$}}
|
||||
; VARABI: s_setpc_b64
|
||||
|
||||
|
||||
@ -554,7 +552,7 @@ define void @too_many_args_call_too_many_args_use_workitem_id_x(
|
||||
|
||||
; FIXEDABI: buffer_load_dword v0, off, s[0:3], s32{{$}}
|
||||
; FIXEDABI: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
|
||||
; FIXEDABI: buffer_load_dword v0, off, s[0:3], s32 offset:4{{$}}
|
||||
; FIXEDABI: buffer_load_dword v0, off, s[0:3], s32 offset:4 glc{{$}}
|
||||
; FIXEDABI: s_setpc_b64
|
||||
define void @too_many_args_use_workitem_id_x_byval(
|
||||
i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7,
|
||||
@ -704,10 +702,8 @@ define void @func_call_too_many_args_use_workitem_id_x_byval() #1 {
|
||||
; VARABI: v_bfe_u32 [[BFE_Y:v[0-9]+]], v32, 10, 10
|
||||
; VARABI-NEXT: v_bfe_u32 [[BFE_Z:v[0-9]+]], v32, 20, 10
|
||||
; VARABI-NEXT: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[BFE_Y]]
|
||||
; VARABI-NEXT: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[BFE_Z]]
|
||||
|
||||
; VARABI: s_waitcnt
|
||||
; VARABI-NEXT: s_setpc_b64
|
||||
; VARABI: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[BFE_Z]]
|
||||
; VARABI: s_setpc_b64
|
||||
|
||||
|
||||
; FIXEDABI: v_and_b32_e32 [[AND_X:v[0-9]+]], 0x3ff, v31
|
||||
@ -717,7 +713,7 @@ define void @func_call_too_many_args_use_workitem_id_x_byval() #1 {
|
||||
; FIXEDABI: v_bfe_u32 [[BFE_Y:v[0-9]+]], v31, 10, 10
|
||||
; FIXEDABI-NEXT: v_bfe_u32 [[BFE_Z:v[0-9]+]], v31, 20, 10
|
||||
; FIXEDABI-NEXT: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[BFE_Y]]
|
||||
; FIXEDABI-NEXT: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[BFE_Z]]
|
||||
; FIXEDABI: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[BFE_Z]]
|
||||
|
||||
define void @too_many_args_use_workitem_id_xyz(
|
||||
i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7,
|
||||
@ -810,9 +806,7 @@ define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_xyz() #1 {
|
||||
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[IDY]]
|
||||
; GCN-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v31, 20, 10
|
||||
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[IDZ]]
|
||||
|
||||
; GCN: s_waitcnt
|
||||
; GCN-NEXT: s_setpc_b64
|
||||
; GCN: s_setpc_b64
|
||||
; GCN: ScratchSize: 0
|
||||
define void @too_many_args_use_workitem_id_x_stack_yz(
|
||||
i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7,
|
||||
|
@ -29,6 +29,7 @@ define amdgpu_kernel void @test_kern_stack() local_unnamed_addr #0 {
|
||||
; GFX803-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s5
|
||||
; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
|
||||
; GFX803-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX803-NEXT: s_endpgm
|
||||
;
|
||||
; GFX900-LABEL: test_kern_stack:
|
||||
@ -39,6 +40,7 @@ define amdgpu_kernel void @test_kern_stack() local_unnamed_addr #0 {
|
||||
; GFX900-NEXT: s_addc_u32 s1, s1, 0
|
||||
; GFX900-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
|
||||
; GFX900-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-NEXT: s_endpgm
|
||||
;
|
||||
; GFX1010-LABEL: test_kern_stack:
|
||||
@ -51,6 +53,7 @@ define amdgpu_kernel void @test_kern_stack() local_unnamed_addr #0 {
|
||||
; GFX1010-NEXT: s_add_u32 s0, s0, s7
|
||||
; GFX1010-NEXT: s_addc_u32 s1, s1, 0
|
||||
; GFX1010-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
|
||||
; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX1010-NEXT: s_endpgm
|
||||
entry:
|
||||
%x = alloca i32, align 4, addrspace(5)
|
||||
@ -119,6 +122,7 @@ define amdgpu_kernel void @test_kern_stack_and_call() local_unnamed_addr #0 {
|
||||
; GFX803-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12
|
||||
; GFX803-NEXT: s_movk_i32 s32, 0x400
|
||||
; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
|
||||
; GFX803-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX803-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; GFX803-NEXT: s_endpgm
|
||||
;
|
||||
@ -134,6 +138,7 @@ define amdgpu_kernel void @test_kern_stack_and_call() local_unnamed_addr #0 {
|
||||
; GFX900-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12
|
||||
; GFX900-NEXT: s_movk_i32 s32, 0x400
|
||||
; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
|
||||
; GFX900-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; GFX900-NEXT: s_endpgm
|
||||
;
|
||||
@ -151,6 +156,7 @@ define amdgpu_kernel void @test_kern_stack_and_call() local_unnamed_addr #0 {
|
||||
; GFX1010-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4
|
||||
; GFX1010-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12
|
||||
; GFX1010-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
|
||||
; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX1010-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; GFX1010-NEXT: s_endpgm
|
||||
entry:
|
||||
@ -190,6 +196,7 @@ define amdgpu_kernel void @test_force_fp_kern_stack() local_unnamed_addr #2 {
|
||||
; GFX803-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s5
|
||||
; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4
|
||||
; GFX803-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX803-NEXT: s_endpgm
|
||||
;
|
||||
; GFX900-LABEL: test_force_fp_kern_stack:
|
||||
@ -201,6 +208,7 @@ define amdgpu_kernel void @test_force_fp_kern_stack() local_unnamed_addr #2 {
|
||||
; GFX900-NEXT: s_addc_u32 s1, s1, 0
|
||||
; GFX900-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4
|
||||
; GFX900-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-NEXT: s_endpgm
|
||||
;
|
||||
; GFX1010-LABEL: test_force_fp_kern_stack:
|
||||
@ -214,6 +222,7 @@ define amdgpu_kernel void @test_force_fp_kern_stack() local_unnamed_addr #2 {
|
||||
; GFX1010-NEXT: s_add_u32 s0, s0, s7
|
||||
; GFX1010-NEXT: s_addc_u32 s1, s1, 0
|
||||
; GFX1010-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4
|
||||
; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX1010-NEXT: s_endpgm
|
||||
entry:
|
||||
%x = alloca i32, align 4, addrspace(5)
|
||||
@ -286,6 +295,7 @@ define amdgpu_kernel void @test_force_fp_kern_stack_and_call() local_unnamed_add
|
||||
; GFX803-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12
|
||||
; GFX803-NEXT: s_movk_i32 s32, 0x400
|
||||
; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4
|
||||
; GFX803-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX803-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; GFX803-NEXT: s_endpgm
|
||||
;
|
||||
@ -302,6 +312,7 @@ define amdgpu_kernel void @test_force_fp_kern_stack_and_call() local_unnamed_add
|
||||
; GFX900-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12
|
||||
; GFX900-NEXT: s_movk_i32 s32, 0x400
|
||||
; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4
|
||||
; GFX900-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; GFX900-NEXT: s_endpgm
|
||||
;
|
||||
@ -320,6 +331,7 @@ define amdgpu_kernel void @test_force_fp_kern_stack_and_call() local_unnamed_add
|
||||
; GFX1010-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4
|
||||
; GFX1010-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12
|
||||
; GFX1010-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4
|
||||
; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX1010-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; GFX1010-NEXT: s_endpgm
|
||||
entry:
|
||||
@ -336,10 +348,10 @@ define amdgpu_kernel void @test_sgpr_offset_kernel() #1 {
|
||||
; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
|
||||
; GFX803-NEXT: s_add_u32 s0, s0, s7
|
||||
; GFX803-NEXT: s_addc_u32 s1, s1, 0
|
||||
; GFX803-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8
|
||||
; GFX803-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 glc
|
||||
; GFX803-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX803-NEXT: s_mov_b32 s4, 0x40000
|
||||
; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s5
|
||||
; GFX803-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], s4 ; 4-byte Folded Spill
|
||||
; GFX803-NEXT: ;;#ASMSTART
|
||||
; GFX803-NEXT: ;;#ASMEND
|
||||
@ -347,6 +359,7 @@ define amdgpu_kernel void @test_sgpr_offset_kernel() #1 {
|
||||
; GFX803-NEXT: buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload
|
||||
; GFX803-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8
|
||||
; GFX803-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX803-NEXT: s_endpgm
|
||||
;
|
||||
; GFX900-LABEL: test_sgpr_offset_kernel:
|
||||
@ -355,9 +368,9 @@ define amdgpu_kernel void @test_sgpr_offset_kernel() #1 {
|
||||
; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s5, 0
|
||||
; GFX900-NEXT: s_add_u32 s0, s0, s7
|
||||
; GFX900-NEXT: s_addc_u32 s1, s1, 0
|
||||
; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8
|
||||
; GFX900-NEXT: s_mov_b32 s6, 0x40000
|
||||
; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 glc
|
||||
; GFX900-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-NEXT: s_mov_b32 s6, 0x40000
|
||||
; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s6 ; 4-byte Folded Spill
|
||||
; GFX900-NEXT: ;;#ASMSTART
|
||||
; GFX900-NEXT: ;;#ASMEND
|
||||
@ -365,6 +378,7 @@ define amdgpu_kernel void @test_sgpr_offset_kernel() #1 {
|
||||
; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s6 ; 4-byte Folded Reload
|
||||
; GFX900-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8
|
||||
; GFX900-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-NEXT: s_endpgm
|
||||
;
|
||||
; GFX1010-LABEL: test_sgpr_offset_kernel:
|
||||
@ -376,7 +390,7 @@ define amdgpu_kernel void @test_sgpr_offset_kernel() #1 {
|
||||
; GFX1010-NEXT: s_add_u32 s0, s0, s7
|
||||
; GFX1010-NEXT: s_addc_u32 s1, s1, 0
|
||||
; GFX1010-NEXT: s_mov_b32 s6, 0x20000
|
||||
; GFX1010-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8
|
||||
; GFX1010-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 glc dlc
|
||||
; GFX1010-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX1010-NEXT: buffer_store_dword v0, off, s[0:3], s6 ; 4-byte Folded Spill
|
||||
; GFX1010-NEXT: s_waitcnt_depctr 0xffe3
|
||||
@ -386,6 +400,7 @@ define amdgpu_kernel void @test_sgpr_offset_kernel() #1 {
|
||||
; GFX1010-NEXT: buffer_load_dword v0, off, s[0:3], s6 ; 4-byte Folded Reload
|
||||
; GFX1010-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX1010-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8
|
||||
; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX1010-NEXT: s_endpgm
|
||||
entry:
|
||||
; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not
|
||||
|
@ -1,3 +1,4 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
|
||||
; RUN: opt -S -codegenprepare -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 < %s | FileCheck -check-prefix=OPT %s
|
||||
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 < %s | FileCheck -check-prefix=GCN %s
|
||||
@ -36,7 +37,8 @@ define amdgpu_kernel void @test_sink_small_offset_global_atomic_fadd_f32(float a
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, 2.0
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: global_atomic_add_f32 v0, v1, s[2:3] offset:28
|
||||
; GCN-NEXT: global_load_dword v0, v[0:1], off
|
||||
; GCN-NEXT: global_load_dword v0, v[0:1], off glc
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: BB0_2: ; %endif
|
||||
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, 0x3d0000
|
||||
|
@ -137,7 +137,7 @@ done:
|
||||
; GCN-LABEL: {{^}}test_sink_scratch_small_offset_i32:
|
||||
; GCN: s_and_saveexec_b64
|
||||
; GCN: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4092{{$}}
|
||||
; GCN: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4092{{$}}
|
||||
; GCN: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4092 glc{{$}}
|
||||
; GCN: {{^}}BB4_2:
|
||||
define amdgpu_kernel void @test_sink_scratch_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) {
|
||||
entry:
|
||||
@ -177,7 +177,7 @@ done:
|
||||
; GCN: v_mov_b32_e32 [[BASE_FI0:v[0-9]+]], 4
|
||||
; GCN: buffer_store_dword {{v[0-9]+}}, [[BASE_FI0]], {{s\[[0-9]+:[0-9]+\]}}, 0 offen offset:4092{{$}}
|
||||
; GCN: v_mov_b32_e32 [[BASE_FI1:v[0-9]+]], 4
|
||||
; GCN: buffer_load_dword {{v[0-9]+}}, [[BASE_FI1]], {{s\[[0-9]+:[0-9]+\]}}, 0 offen offset:4092{{$}}
|
||||
; GCN: buffer_load_dword {{v[0-9]+}}, [[BASE_FI1]], {{s\[[0-9]+:[0-9]+\]}}, 0 offen offset:4092 glc{{$}}
|
||||
; GCN: {{^BB[0-9]+}}_2:
|
||||
|
||||
define amdgpu_kernel void @test_sink_scratch_small_offset_i32_reserved(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) {
|
||||
@ -215,7 +215,7 @@ done:
|
||||
; GCN-LABEL: {{^}}test_no_sink_scratch_large_offset_i32:
|
||||
; GCN: s_and_saveexec_b64
|
||||
; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen{{$}}
|
||||
; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen{{$}}
|
||||
; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen glc{{$}}
|
||||
; GCN: {{^BB[0-9]+}}_2:
|
||||
define amdgpu_kernel void @test_no_sink_scratch_large_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) {
|
||||
entry:
|
||||
|
@ -233,12 +233,15 @@ define amdgpu_kernel void @vload2_private(i16 addrspace(1)* nocapture readonly %
|
||||
; GFX900-NEXT: global_load_ushort v0, v2, s[4:5]
|
||||
; GFX900-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4
|
||||
; GFX900-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-NEXT: global_load_ushort v0, v2, s[4:5] offset:2
|
||||
; GFX900-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:6
|
||||
; GFX900-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-NEXT: global_load_ushort v0, v2, s[4:5] offset:4
|
||||
; GFX900-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:8
|
||||
; GFX900-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:4
|
||||
; GFX900-NEXT: buffer_load_ushort v3, off, s[0:3], 0 offset:6
|
||||
; GFX900-NEXT: s_waitcnt vmcnt(1)
|
||||
@ -262,14 +265,17 @@ define amdgpu_kernel void @vload2_private(i16 addrspace(1)* nocapture readonly %
|
||||
; FLATSCR-NEXT: global_load_ushort v0, v2, s[0:1]
|
||||
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
; FLATSCR-NEXT: scratch_store_short off, v0, vcc_hi offset:4
|
||||
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
; FLATSCR-NEXT: global_load_ushort v0, v2, s[0:1] offset:2
|
||||
; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0
|
||||
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
; FLATSCR-NEXT: scratch_store_short off, v0, vcc_hi offset:6
|
||||
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
; FLATSCR-NEXT: global_load_ushort v0, v2, s[0:1] offset:4
|
||||
; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0
|
||||
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
; FLATSCR-NEXT: scratch_store_short off, v0, vcc_hi offset:8
|
||||
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0
|
||||
; FLATSCR-NEXT: scratch_load_dword v0, off, vcc_hi offset:4
|
||||
; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0
|
||||
@ -387,10 +393,11 @@ define <2 x i16> @chain_hi_to_lo_global_other_dep(i16 addrspace(1)* %ptr) {
|
||||
; GCN-LABEL: chain_hi_to_lo_global_other_dep:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GCN-NEXT: global_load_ushort v2, v[0:1], off offset:2
|
||||
; GCN-NEXT: global_load_short_d16_hi v0, v[0:1], off
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, 0xffff
|
||||
; GCN-NEXT: global_load_ushort v2, v[0:1], off offset:2 glc
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: global_load_short_d16_hi v0, v[0:1], off glc
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, 0xffff
|
||||
; GCN-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0]
|
||||
; GCN-NEXT: v_bfi_b32 v0, v1, v2, v0
|
||||
; GCN-NEXT: s_setpc_b64 s[30:31]
|
||||
@ -409,10 +416,12 @@ define <2 x i16> @chain_hi_to_lo_flat_other_dep(i16 addrspace(0)* %ptr) {
|
||||
; GCN-LABEL: chain_hi_to_lo_flat_other_dep:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GCN-NEXT: flat_load_ushort v2, v[0:1] offset:2
|
||||
; GCN-NEXT: flat_load_short_d16_hi v0, v[0:1]
|
||||
; GCN-NEXT: flat_load_ushort v2, v[0:1] offset:2 glc
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: flat_load_short_d16_hi v0, v[0:1] glc
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, 0xffff
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0]
|
||||
; GCN-NEXT: v_bfi_b32 v0, v1, v2, v0
|
||||
; GCN-NEXT: s_setpc_b64 s[30:31]
|
||||
|
@ -64,7 +64,7 @@ define amdgpu_kernel void @commute_add_lit_fabs_f32(float addrspace(1)* %out, fl
|
||||
}
|
||||
|
||||
; FUNC-LABEL: @commute_add_fabs_f32
|
||||
; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
|
||||
; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
|
||||
; SI: v_add_f32_e64 [[REG:v[0-9]+]], [[X]], |[[Y]]|
|
||||
; SI: buffer_store_dword [[REG]]
|
||||
@ -81,7 +81,7 @@ define amdgpu_kernel void @commute_add_fabs_f32(float addrspace(1)* %out, float
|
||||
}
|
||||
|
||||
; FUNC-LABEL: @commute_mul_fneg_f32
|
||||
; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
|
||||
; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
|
||||
; SI: v_mul_f32_e64 [[REG:v[0-9]+]], [[X]], -[[Y]]
|
||||
; SI: buffer_store_dword [[REG]]
|
||||
@ -98,7 +98,7 @@ define amdgpu_kernel void @commute_mul_fneg_f32(float addrspace(1)* %out, float
|
||||
}
|
||||
|
||||
; FUNC-LABEL: @commute_mul_fabs_fneg_f32
|
||||
; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
|
||||
; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
|
||||
; SI: v_mul_f32_e64 [[REG:v[0-9]+]], [[X]], -|[[Y]]|
|
||||
; SI: buffer_store_dword [[REG]]
|
||||
@ -117,7 +117,7 @@ define amdgpu_kernel void @commute_mul_fabs_fneg_f32(float addrspace(1)* %out, f
|
||||
|
||||
; There's no reason to commute this.
|
||||
; FUNC-LABEL: @commute_mul_fabs_x_fabs_y_f32
|
||||
; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
|
||||
; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
|
||||
; SI: v_mul_f32_e64 [[REG:v[0-9]+]], |[[X]]|, |[[Y]]|
|
||||
; SI: buffer_store_dword [[REG]]
|
||||
@ -135,7 +135,7 @@ define amdgpu_kernel void @commute_mul_fabs_x_fabs_y_f32(float addrspace(1)* %ou
|
||||
}
|
||||
|
||||
; FUNC-LABEL: @commute_mul_fabs_x_fneg_fabs_y_f32
|
||||
; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
|
||||
; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
|
||||
; SI: v_mul_f32_e64 [[REG:v[0-9]+]], |[[X]]|, -|[[Y]]|
|
||||
; SI: buffer_store_dword [[REG]]
|
||||
@ -157,7 +157,7 @@ define amdgpu_kernel void @commute_mul_fabs_x_fneg_fabs_y_f32(float addrspace(1)
|
||||
; though we have negate modifier on src2.
|
||||
|
||||
; SI-LABEL: {{^}}fma_a_2.0_neg_b_f32
|
||||
; SI-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; SI-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
|
||||
; SI-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
|
||||
; SI: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, |[[R2]]|
|
||||
; SI: buffer_store_dword [[RESULT]]
|
||||
|
@ -567,7 +567,7 @@ define amdgpu_kernel void @test_copy_v4i8_volatile_load(<4 x i8> addrspace(1)* %
|
||||
; SI-NEXT: s_mov_b32 s5, s7
|
||||
; SI-NEXT: s_mov_b32 s6, s2
|
||||
; SI-NEXT: s_mov_b32 s7, s3
|
||||
; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0
|
||||
; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
@ -584,7 +584,7 @@ define amdgpu_kernel void @test_copy_v4i8_volatile_load(<4 x i8> addrspace(1)* %
|
||||
; VI-NEXT: s_mov_b32 s5, s7
|
||||
; VI-NEXT: s_mov_b32 s6, s2
|
||||
; VI-NEXT: s_mov_b32 s7, s3
|
||||
; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0
|
||||
; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 glc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; VI-NEXT: s_endpgm
|
||||
@ -612,9 +612,13 @@ define amdgpu_kernel void @test_copy_v4i8_volatile_store(<4 x i8> addrspace(1)*
|
||||
; SI-NEXT: s_mov_b32 s1, s5
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: buffer_store_byte v3, off, s[0:3], 0 offset:3
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: buffer_store_byte v2, off, s[0:3], 0 offset:2
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: buffer_store_byte v1, off, s[0:3], 0 offset:1
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: s_endpgm
|
||||
;
|
||||
; VI-LABEL: test_copy_v4i8_volatile_store:
|
||||
@ -635,9 +639,13 @@ define amdgpu_kernel void @test_copy_v4i8_volatile_store(<4 x i8> addrspace(1)*
|
||||
; VI-NEXT: s_mov_b32 s1, s5
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: buffer_store_byte v3, off, s[0:3], 0 offset:3
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: buffer_store_byte v2, off, s[0:3], 0 offset:2
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: buffer_store_byte v1, off, s[0:3], 0 offset:1
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: s_endpgm
|
||||
%val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
|
||||
store volatile <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4
|
||||
|
@ -50,6 +50,7 @@ define amdgpu_kernel void @write_ds_sub0_offset0_global_clamp_bit(float %dummy.v
|
||||
; CI-NEXT: s_mov_b32 s1, s0
|
||||
; CI-NEXT: ds_write_b32 v0, v2 offset:12
|
||||
; CI-NEXT: buffer_store_dword v1, off, s[0:3], 0
|
||||
; CI-NEXT: s_waitcnt vmcnt(0)
|
||||
; CI-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: write_ds_sub0_offset0_global_clamp_bit:
|
||||
@ -66,6 +67,7 @@ define amdgpu_kernel void @write_ds_sub0_offset0_global_clamp_bit(float %dummy.v
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX9-NEXT: global_store_dword v[0:1], v2, off
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_endpgm
|
||||
entry:
|
||||
%x.i = call i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
@ -237,6 +239,7 @@ define amdgpu_kernel void @add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_
|
||||
; CI-NEXT: s_mov_b32 s1, s0
|
||||
; CI-NEXT: ds_write2_b32 v0, v2, v3 offset1:1
|
||||
; CI-NEXT: buffer_store_dword v1, off, s[0:3], 0
|
||||
; CI-NEXT: s_waitcnt vmcnt(0)
|
||||
; CI-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_bit:
|
||||
@ -254,6 +257,7 @@ define amdgpu_kernel void @add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX9-NEXT: global_store_dword v[0:1], v2, off
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_endpgm
|
||||
%x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
|
||||
%neg = sub i32 0, %x.i
|
||||
|
@ -50,10 +50,11 @@ define amdgpu_kernel void @simple_write2_two_val_f32(float addrspace(1)* %C, flo
|
||||
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; CI-NEXT: v_mov_b32_e32 v1, 0
|
||||
; CI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
|
||||
; CI-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64 offset:4
|
||||
; CI-NEXT: s_mov_b32 m0, -1
|
||||
; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
|
||||
; CI-NEXT: s_waitcnt vmcnt(0)
|
||||
; CI-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64 offset:4 glc
|
||||
; CI-NEXT: s_waitcnt vmcnt(0)
|
||||
; CI-NEXT: s_mov_b32 m0, -1
|
||||
; CI-NEXT: ds_write2_b32 v0, v2, v1 offset1:8
|
||||
; CI-NEXT: s_endpgm
|
||||
;
|
||||
@ -62,8 +63,9 @@ define amdgpu_kernel void @simple_write2_two_val_f32(float addrspace(1)* %C, flo
|
||||
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
|
||||
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
|
||||
; GFX9-NEXT: global_load_dword v2, v0, s[0:1] offset:4
|
||||
; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset1:8
|
||||
; GFX9-NEXT: s_endpgm
|
||||
@ -92,12 +94,12 @@ define amdgpu_kernel void @simple_write2_two_val_f32_volatile_0(float addrspace(
|
||||
; CI-NEXT: s_mov_b64 s[0:1], s[4:5]
|
||||
; CI-NEXT: s_mov_b64 s[4:5], s[6:7]
|
||||
; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
|
||||
; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
|
||||
; CI-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64
|
||||
; CI-NEXT: s_mov_b32 m0, -1
|
||||
; CI-NEXT: s_waitcnt vmcnt(1)
|
||||
; CI-NEXT: ds_write_b32 v0, v2
|
||||
; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
|
||||
; CI-NEXT: s_waitcnt vmcnt(0)
|
||||
; CI-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64 glc
|
||||
; CI-NEXT: s_waitcnt vmcnt(0)
|
||||
; CI-NEXT: s_mov_b32 m0, -1
|
||||
; CI-NEXT: ds_write_b32 v0, v2
|
||||
; CI-NEXT: ds_write_b32 v0, v1 offset:32
|
||||
; CI-NEXT: s_endpgm
|
||||
;
|
||||
@ -106,11 +108,11 @@ define amdgpu_kernel void @simple_write2_two_val_f32_volatile_0(float addrspace(
|
||||
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c
|
||||
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
|
||||
; GFX9-NEXT: global_load_dword v2, v0, s[2:3]
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(1)
|
||||
; GFX9-NEXT: ds_write_b32 v0, v1
|
||||
; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: ds_write_b32 v0, v1
|
||||
; GFX9-NEXT: ds_write_b32 v0, v2 offset:32
|
||||
; GFX9-NEXT: s_endpgm
|
||||
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
@ -138,12 +140,12 @@ define amdgpu_kernel void @simple_write2_two_val_f32_volatile_1(float addrspace(
|
||||
; CI-NEXT: s_mov_b64 s[0:1], s[4:5]
|
||||
; CI-NEXT: s_mov_b64 s[4:5], s[6:7]
|
||||
; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
|
||||
; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
|
||||
; CI-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64
|
||||
; CI-NEXT: s_mov_b32 m0, -1
|
||||
; CI-NEXT: s_waitcnt vmcnt(1)
|
||||
; CI-NEXT: ds_write_b32 v0, v2
|
||||
; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
|
||||
; CI-NEXT: s_waitcnt vmcnt(0)
|
||||
; CI-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64 glc
|
||||
; CI-NEXT: s_waitcnt vmcnt(0)
|
||||
; CI-NEXT: s_mov_b32 m0, -1
|
||||
; CI-NEXT: ds_write_b32 v0, v2
|
||||
; CI-NEXT: ds_write_b32 v0, v1 offset:32
|
||||
; CI-NEXT: s_endpgm
|
||||
;
|
||||
@ -152,11 +154,11 @@ define amdgpu_kernel void @simple_write2_two_val_f32_volatile_1(float addrspace(
|
||||
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c
|
||||
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
|
||||
; GFX9-NEXT: global_load_dword v2, v0, s[2:3]
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(1)
|
||||
; GFX9-NEXT: ds_write_b32 v0, v1
|
||||
; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: ds_write_b32 v0, v1
|
||||
; GFX9-NEXT: ds_write_b32 v0, v2 offset:32
|
||||
; GFX9-NEXT: s_endpgm
|
||||
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
@ -186,11 +188,12 @@ define amdgpu_kernel void @simple_write2_two_val_subreg2_mixed_f32(float addrspa
|
||||
; CI-NEXT: s_mov_b32 s2, 0
|
||||
; CI-NEXT: v_mov_b32_e32 v2, 0
|
||||
; CI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; CI-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[0:3], 0 addr64
|
||||
; CI-NEXT: buffer_load_dwordx2 v[1:2], v[1:2], s[0:3], 0 addr64 offset:8
|
||||
; CI-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[0:3], 0 addr64 glc
|
||||
; CI-NEXT: s_waitcnt vmcnt(0)
|
||||
; CI-NEXT: buffer_load_dwordx2 v[1:2], v[1:2], s[0:3], 0 addr64 offset:8 glc
|
||||
; CI-NEXT: s_waitcnt vmcnt(0)
|
||||
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; CI-NEXT: s_mov_b32 m0, -1
|
||||
; CI-NEXT: s_waitcnt vmcnt(0)
|
||||
; CI-NEXT: ds_write2_b32 v0, v3, v2 offset1:8
; CI-NEXT: s_endpgm
;
@ -200,9 +203,9 @@ define amdgpu_kernel void @simple_write2_two_val_subreg2_mixed_f32(float addrspa
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 3, v0
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[1:2], v3, s[0:1]
; GFX9-NEXT: global_load_dwordx2 v[1:2], v3, s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[2:3], v3, s[0:1] offset:8
; GFX9-NEXT: global_load_dwordx2 v[2:3], v3, s[0:1] offset:8 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ds_write2_b32 v0, v1, v3 offset1:8
; GFX9-NEXT: s_endpgm
@ -308,10 +311,11 @@ define amdgpu_kernel void @simple_write2_two_val_max_offset_f32(float addrspace(
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64 offset:4
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64 offset:4 glc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_write2_b32 v0, v2, v1 offset1:255
; CI-NEXT: s_endpgm
;
@ -320,8 +324,9 @@ define amdgpu_kernel void @simple_write2_two_val_max_offset_f32(float addrspace(
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
; GFX9-NEXT: global_load_dword v2, v0, s[0:1] offset:4
; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset1:255
; GFX9-NEXT: s_endpgm
@ -730,10 +735,11 @@ define amdgpu_kernel void @simple_write2_two_val_f64(double addrspace(1)* %C, do
; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; CI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[0:3], 0 addr64 offset:8
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 glc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[0:3], 0 addr64 offset:8 glc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:8
; CI-NEXT: s_endpgm
;
@ -742,8 +748,9 @@ define amdgpu_kernel void @simple_write2_two_val_f64(double addrspace(1)* %C, do
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] offset:8
; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] offset:8 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:8
; GFX9-NEXT: s_endpgm
@ -27,11 +27,11 @@ define amdgpu_kernel void @simple_write2st64_one_val_f32_0_1(float addrspace(1)*
; CI-DAG: s_mov_b32 m0
; GFX9-NOT: m0

; CI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; CI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
; CI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4

; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], v{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]$}}
; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], v{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}} offset:4{{$}}
; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], v{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}} glc{{$}}
; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], v{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}} offset:4 glc{{$}}

; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
@ -56,10 +56,10 @@ define amdgpu_kernel void @simple_write2st64_two_val_f32_2_5(float addrspace(1)*
; CI-DAG: s_mov_b32 m0
; GFX9-NOT: m0

; CI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; CI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
; CI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4

; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], v{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}{{$}}
; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], v{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}} glc{{$}}
; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], v{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}} offset:4

; GCN-DAG: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 2, v{{[0-9]+}}
@ -84,10 +84,10 @@ define amdgpu_kernel void @simple_write2st64_two_val_max_offset_f32(float addrsp
; CI-DAG: s_mov_b32 m0
; GFX9-NOT: m0

; CI-DAG: buffer_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; CI-DAG: buffer_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
; CI-DAG: buffer_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8

; GFX9-DAG: global_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]$}}
; GFX9-DAG: global_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}} glc{{$}}
; GFX9-DAG: global_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}} offset:8

; GCN-DAG: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 3, v{{[0-9]+}}
@ -88,11 +88,13 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) {
; GFX7-NEXT: v_mov_b32_e32 v2, 9
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: BB1_2: ; %bb1
; GFX7-NEXT: v_mov_b32_e32 v0, 0
; GFX7-NEXT: v_mov_b32_e32 v2, 10
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_endpgm
;
; GFX9-LABEL: s_add_co_br_user:
@ -113,11 +115,13 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) {
; GFX9-NEXT: v_mov_b32_e32 v2, 9
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: BB1_2: ; %bb1
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 10
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: s_add_co_br_user:
@ -137,11 +141,13 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) {
; GFX10-NEXT: v_mov_b32_e32 v2, 9
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: global_store_dword v[0:1], v2, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: BB1_2: ; %bb1
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_mov_b32_e32 v2, 10
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: global_store_dword v[0:1], v2, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_endpgm
bb:
%i1 = add i32 %i, %i
@ -35,7 +35,7 @@ entry:
}

; FUNC-LABEL: {{^}}load_i16_zext_private:
; SI: buffer_load_ushort v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4{{$}}
; SI: buffer_load_ushort v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4 glc{{$}}
define amdgpu_kernel void @load_i16_zext_private(i32 addrspace(1)* %out) {
entry:
%tmp0 = alloca i16, addrspace(5)
@ -106,10 +106,10 @@ define amdgpu_kernel void @no_extract_volatile_load_extract0(i32 addrspace(1)* %
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s8, s2
; GCN-NEXT: s_mov_b32 s9, s3
; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 glc
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_mov_b32 s4, s0
; GCN-NEXT: s_mov_b32 s5, s1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT: s_endpgm
entry:
@ -130,10 +130,10 @@ define amdgpu_kernel void @no_extract_volatile_load_extract2(i32 addrspace(1)* %
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s8, s2
; GCN-NEXT: s_mov_b32 s9, s3
; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 glc
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_mov_b32 s4, s0
; GCN-NEXT: s_mov_b32 s5, s1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v2, off, s[4:7], 0
; GCN-NEXT: s_endpgm
entry:
@ -155,11 +155,11 @@ define amdgpu_kernel void @no_extract_volatile_load_dynextract(i32 addrspace(1)*
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s8, s6
; GCN-NEXT: s_mov_b32 s9, s7
; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 glc
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_mov_b32 s0, s4
; GCN-NEXT: s_mov_b32 s1, s5
; GCN-NEXT: v_cmp_eq_u32_e64 vcc, s12, 1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; GCN-NEXT: v_cmp_eq_u32_e64 vcc, s12, 2
; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
@ -231,12 +231,16 @@ define amdgpu_kernel void @fast_sub_fmuladd_fmul_multi_use_mul() #0 {
; GCN-FLUSH-NEXT: v_mac_f32_e32 [[MUL]], [[X]], [[Y]]
; GCN-FLUSH-NEXT: v_sub_f32_e32 [[SUB:v[0-9]+]], [[MUL]], [[Z]]
; GCN-FLUSH-NEXT: buffer_store_dword [[MUL]]
; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GCN-FLUSH-NEXT: buffer_store_dword [[SUB]]
; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)

; GCN-FASTFMA-NEXT: v_fma_f32 [[FMA:v[0-9]+]], [[X]], [[Y]], [[U]]
; GCN-FASTFMA-NEXT: v_sub_f32_e32 [[SUB:v[0-9]+]], [[FMA]], [[Z]]
; GCN-FASTFMA-NEXT: buffer_store_dword [[FMA]]
; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
; GCN-FASTFMA-NEXT: buffer_store_dword [[SUB]]
; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)

; GCN-SLOWFMA-DAG: v_mul_f32_e32 v{{[0-9]+}}, [[X]], [[Y]]
; GCN-SLOWFMA: v_add_f32_e32
@ -267,12 +271,16 @@ define amdgpu_kernel void @fast_sub_fmuladd_fmul_multi_use_fmuladd_lhs() #0 {
; GCN-FLUSH-NEXT: v_mac_f32_e32 [[MUL]], [[X]], [[Y]]
; GCN-FLUSH-NEXT: v_sub_f32_e32 [[SUB:v[0-9]+]], [[Z]], [[MUL]]
; GCN-FLUSH-NEXT: buffer_store_dword [[MUL]]
; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GCN-FLUSH-NEXT: buffer_store_dword [[SUB]]
; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)

; GCN-FASTFMA-NEXT: v_fma_f32 [[FMA:v[0-9]+]], [[X]], [[Y]], [[U]]
; GCN-FASTFMA-NEXT: v_sub_f32_e32 [[SUB:v[0-9]+]], [[Z]], [[FMA]]
; GCN-FASTFMA-NEXT: buffer_store_dword [[FMA]]
; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
; GCN-FASTFMA-NEXT: buffer_store_dword [[SUB]]
; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)

; GCN-SLOWFMA-DAG: v_mul_f32_e32 v{{[0-9]+}}, [[X]], [[Y]]
; GCN-SLOWFMA: v_add_f32_e32
@ -305,12 +313,16 @@ define amdgpu_kernel void @fast_sub_fmuladd_fmul_multi_use_fmuladd_rhs() #0 {
; GCN-FLUSH-NEXT: v_mac_f32_e32 [[MUL]], [[X]], [[Y]]
; GCN-FLUSH-NEXT: v_sub_f32_e32 [[SUB:v[0-9]+]], [[MUL]], [[Z]]
; GCN-FLUSH-NEXT: buffer_store_dword [[MUL]]
; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GCN-FLUSH-NEXT: buffer_store_dword [[SUB]]
; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)

; GCN-FASTFMA-NEXT: v_fma_f32 [[FMA:v[0-9]+]], [[X]], [[Y]], [[UFLOAT]]
; GCN-FASTFMA-NEXT: v_sub_f32_e32 [[SUB:v[0-9]+]], [[FMA]], [[Z]]
; GCN-FASTFMA-NEXT: buffer_store_dword [[FMA]]
; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
; GCN-FASTFMA-NEXT: buffer_store_dword [[SUB]]
; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)

; GCN-SLOWFMA-DAG: v_mul_f32_e32 v{{[0-9]+}}, [[X]], [[Y]]
; GCN-SLOWFMA: v_add_f32_e32
@ -344,12 +356,16 @@ define amdgpu_kernel void @fast_sub_fmuladd_fpext_fmul_multi_use_fmuladd_lhs() #
; GCN-FLUSH-NEXT: v_mac_f32_e32 [[MUL]], [[X]], [[Y]]
; GCN-FLUSH-NEXT: v_sub_f32_e32 [[SUB:v[0-9]+]], [[Z]], [[MUL]]
; GCN-FLUSH-NEXT: buffer_store_dword [[MUL]]
; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GCN-FLUSH-NEXT: buffer_store_dword [[SUB]]
; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)

; GCN-FASTFMA-NEXT: v_fma_f32 [[FMA:v[0-9]+]], [[X]], [[Y]], [[UFLOAT]]
; GCN-FASTFMA-NEXT: v_sub_f32_e32 [[SUB:v[0-9]+]], [[Z]], [[FMA]]
; GCN-FASTFMA-NEXT: buffer_store_dword [[FMA]]
; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
; GCN-FASTFMA-NEXT: buffer_store_dword [[SUB]]
; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)

; GCN-SLOWFMA-DAG: v_mul_f32_e32 v{{[0-9]+}}, [[X]], [[Y]]
; GCN-SLOWFMA: v_add_f32_e32
@ -201,8 +201,9 @@ define amdgpu_kernel void @store_flat_i8_neg_offset(i8* %fptr, i8 %x) #0 {
}

; CHECK-LABEL: {{^}}load_flat_i8_max_offset:
; CIVI: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}{{$}}
; GFX9: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} offset:4095{{$}}
; CIVI: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
; GFX9: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} offset:4095 glc{{$}}
; GFX10: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} glc dlc{{$}}
define amdgpu_kernel void @load_flat_i8_max_offset(i8* %fptr) #0 {
%fptr.offset = getelementptr inbounds i8, i8* %fptr, i64 4095
%val = load volatile i8, i8* %fptr.offset
@ -210,7 +211,9 @@ define amdgpu_kernel void @load_flat_i8_max_offset(i8* %fptr) #0 {
}

; CHECK-LABEL: {{^}}load_flat_i8_max_offset_p1:
; CHECK: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}{{$}}
; CIVI: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
; GFX9: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
; GFX10: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} glc dlc{{$}}
define amdgpu_kernel void @load_flat_i8_max_offset_p1(i8* %fptr) #0 {
%fptr.offset = getelementptr inbounds i8, i8* %fptr, i64 4096
%val = load volatile i8, i8* %fptr.offset
@ -218,11 +221,11 @@ define amdgpu_kernel void @load_flat_i8_max_offset_p1(i8* %fptr) #0 {
}

; CHECK-LABEL: {{^}}load_flat_i8_neg_offset:
; CIVI: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}{{$}}
; CIVI: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} glc{{$}}

; GFX9: v_add_co_u32_e64 v{{[0-9]+}}, vcc, -2, s
; GFX9: v_addc_co_u32_e32 v{{[0-9]+}}, vcc, -1,
; GFX9: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}{{$}}
; GFX9: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
define amdgpu_kernel void @load_flat_i8_neg_offset(i8* %fptr) #0 {
%fptr.offset = getelementptr inbounds i8, i8* %fptr, i64 -2
%val = load volatile i8, i8* %fptr.offset
File diff suppressed because it is too large
@ -16,9 +16,9 @@ declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) #0
|
||||
|
||||
; (fadd (fmul x, y), z) -> (fma x, y, z)
|
||||
; FUNC-LABEL: {{^}}combine_to_fma_f64_0:
|
||||
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16 glc{{$}}
|
||||
; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]]
|
||||
; SI: buffer_store_dwordx2 [[RESULT]]
|
||||
define amdgpu_kernel void @combine_to_fma_f64_0(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
|
||||
@ -40,10 +40,10 @@ define amdgpu_kernel void @combine_to_fma_f64_0(double addrspace(1)* noalias %ou
|
||||
|
||||
; (fadd (fmul x, y), z) -> (fma x, y, z)
|
||||
; FUNC-LABEL: {{^}}combine_to_fma_f64_0_2use:
|
||||
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16 glc{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24 glc{{$}}
|
||||
; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]]
|
||||
; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[D]]
|
||||
; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
@ -73,9 +73,9 @@ define amdgpu_kernel void @combine_to_fma_f64_0_2use(double addrspace(1)* noalia
|
||||
|
||||
; (fadd x, (fmul y, z)) -> (fma y, z, x)
|
||||
; FUNC-LABEL: {{^}}combine_to_fma_f64_1:
|
||||
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16 glc{{$}}
|
||||
; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]]
|
||||
; SI: buffer_store_dwordx2 [[RESULT]]
|
||||
define amdgpu_kernel void @combine_to_fma_f64_1(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
|
||||
@ -97,9 +97,9 @@ define amdgpu_kernel void @combine_to_fma_f64_1(double addrspace(1)* noalias %ou
|
||||
|
||||
; (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
|
||||
; FUNC-LABEL: {{^}}combine_to_fma_fsub_0_f64:
|
||||
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16 glc{{$}}
|
||||
; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[C]]
|
||||
; SI: buffer_store_dwordx2 [[RESULT]]
|
||||
define amdgpu_kernel void @combine_to_fma_fsub_0_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
|
||||
@ -121,10 +121,10 @@ define amdgpu_kernel void @combine_to_fma_fsub_0_f64(double addrspace(1)* noalia
|
||||
|
||||
; (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
|
||||
; FUNC-LABEL: {{^}}combine_to_fma_fsub_f64_0_2use:
|
||||
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16 glc{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24 glc{{$}}
|
||||
; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[C]]
|
||||
; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[D]]
|
||||
; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
@ -154,9 +154,9 @@ define amdgpu_kernel void @combine_to_fma_fsub_f64_0_2use(double addrspace(1)* n
|
||||
|
||||
; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
|
||||
; FUNC-LABEL: {{^}}combine_to_fma_fsub_1_f64:
|
||||
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16 glc{{$}}
|
||||
; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], [[C]]
|
||||
; SI: buffer_store_dwordx2 [[RESULT]]
|
||||
define amdgpu_kernel void @combine_to_fma_fsub_1_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
|
||||
@ -178,10 +178,10 @@ define amdgpu_kernel void @combine_to_fma_fsub_1_f64(double addrspace(1)* noalia
|
||||
|
||||
; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
|
||||
; FUNC-LABEL: {{^}}combine_to_fma_fsub_1_f64_2use:
|
||||
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16 glc{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24 glc{{$}}
|
||||
; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], [[C]]
|
||||
; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], [[D]]
|
||||
; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
@ -211,9 +211,9 @@ define amdgpu_kernel void @combine_to_fma_fsub_1_f64_2use(double addrspace(1)* n
|
||||
|
||||
; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
|
||||
; FUNC-LABEL: {{^}}combine_to_fma_fsub_2_f64:
|
||||
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16 glc{{$}}
|
||||
; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[C]]
|
||||
; SI: buffer_store_dwordx2 [[RESULT]]
|
||||
define amdgpu_kernel void @combine_to_fma_fsub_2_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
|
||||
@ -237,10 +237,10 @@ define amdgpu_kernel void @combine_to_fma_fsub_2_f64(double addrspace(1)* noalia
|
||||
|
||||
; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
|
||||
; FUNC-LABEL: {{^}}combine_to_fma_fsub_2_f64_2uses_neg:
|
||||
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16 glc{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24 glc{{$}}
|
||||
; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[C]]
|
||||
; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[D]]
|
||||
; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
@ -272,10 +272,10 @@ define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_neg(double addrspace(
|
||||
|
||||
; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
|
||||
; FUNC-LABEL: {{^}}combine_to_fma_fsub_2_f64_2uses_mul:
|
||||
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16 glc{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24 glc{{$}}
|
||||
; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[C]]
|
||||
; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[D]]
|
||||
; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
@ -308,11 +308,11 @@ define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_mul(double addrspace(
|
||||
; fold (fsub (fma x, y, (fmul u, v)), z) -> (fma x, y (fma u, v, (fneg z)))
|
||||
|
||||
; FUNC-LABEL: {{^}}aggressive_combine_to_fma_fsub_0_f64:
|
||||
; SI-DAG: buffer_load_dwordx2 [[X:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[Y:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[Z:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[U:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[V:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:32{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[X:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[Y:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[Z:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16 glc{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[U:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24 glc{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[V:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:32 glc{{$}}
|
||||
|
||||
; SI-SAFE: v_mul_f64 [[TMP0:v\[[0-9]+:[0-9]+\]]], [[U]], [[V]]
|
||||
; SI-SAFE: v_fma_f64 [[TMP1:v\[[0-9]+:[0-9]+\]]], [[X]], [[Y]], [[TMP0]]
|
||||
@ -349,11 +349,11 @@ define amdgpu_kernel void @aggressive_combine_to_fma_fsub_0_f64(double addrspace
|
||||
; -> (fma (fneg y), z, (fma (fneg u), v, x))
|
||||
|
||||
; FUNC-LABEL: {{^}}aggressive_combine_to_fma_fsub_1_f64:
|
||||
; SI-DAG: buffer_load_dwordx2 [[X:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[Y:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[Z:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[U:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[V:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:32{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[X:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[Y:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[Z:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16 glc{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[U:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24 glc{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[V:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:32 glc{{$}}
|
||||
|
||||
; SI-SAFE: v_mul_f64 [[TMP0:v\[[0-9]+:[0-9]+\]]], [[U]], [[V]]
|
||||
; SI-SAFE: v_fma_f64 [[TMP1:v\[[0-9]+:[0-9]+\]]], [[Y]], [[Z]], [[TMP0]]
|
||||
|
@ -4,7 +4,7 @@
|
||||
declare double @llvm.maxnum.f64(double, double) nounwind readnone
|
||||
|
||||
; SI-LABEL: {{^}}test_fmax3_f64:
|
||||
; SI: buffer_load_dwordx2 [[REGA:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+:[0-9]+}}], 0{{$}}
|
||||
; SI: buffer_load_dwordx2 [[REGA:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+:[0-9]+}}], 0 glc{{$}}
|
||||
; SI: buffer_load_dwordx2 [[REGB:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:8
|
||||
; SI: buffer_load_dwordx2 [[REGC:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:16
|
||||
; SI: v_max_f64 [[QUIET_A:v\[[0-9]+:[0-9]+\]]], [[REGA]], [[REGA]]
|
||||
|
@ -37,7 +37,9 @@ define amdgpu_kernel void @v_fneg_add_f32(float addrspace(1)* %out, float addrsp
|
||||
; GCN-DAG: v_add_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
|
||||
; GCN-DAG: v_xor_b32_e32 [[NEG_ADD:v[0-9]+]], 0x80000000, [[ADD]]
|
||||
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_ADD]]
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]]
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
define amdgpu_kernel void @v_fneg_add_store_use_add_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%tid.ext = sext i32 %tid to i64
|
||||
@ -65,7 +67,9 @@ define amdgpu_kernel void @v_fneg_add_store_use_add_f32(float addrspace(1)* %out
|
||||
; GCN-NSZ-NEXT: v_mul_f32_e32 [[MUL:v[0-9]+]], -4.0, [[NEG_ADD]]
|
||||
|
||||
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_ADD]]
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
define amdgpu_kernel void @v_fneg_add_multi_use_add_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%tid.ext = sext i32 %tid to i64
|
||||
@ -168,7 +172,9 @@ define amdgpu_kernel void @v_fneg_add_fneg_fneg_f32(float addrspace(1)* %out, fl
|
||||
; GCN-NSZ-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
|
||||
; GCN-NSZ-DAG: v_sub_f32_e32 [[NEG_ADD:v[0-9]+]], [[A]], [[B]]
|
||||
; GCN-NSZ-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_ADD]]
|
||||
; GCN-NSZ-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NSZ-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_A]]
|
||||
; GCN-NSZ-NEXT: s_waitcnt vmcnt(0)
|
||||
define amdgpu_kernel void @v_fneg_add_store_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%tid.ext = sext i32 %tid to i64
|
||||
@ -196,7 +202,9 @@ define amdgpu_kernel void @v_fneg_add_store_use_fneg_x_f32(float addrspace(1)* %
|
||||
; GCN-NSZ-DAG: v_sub_f32_e32 [[NEG_ADD:v[0-9]+]], [[A]], [[B]]
|
||||
; GCN-NSZ-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
|
||||
; GCN-NSZ-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_ADD]]
|
||||
; GCN-NSZ-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NSZ-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
|
||||
; GCN-NSZ-NEXT: s_waitcnt vmcnt(0)
|
||||
define amdgpu_kernel void @v_fneg_add_multi_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float %c) #0 {
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%tid.ext = sext i32 %tid to i64
|
||||
@ -308,7 +316,9 @@ define amdgpu_kernel void @v_fneg_mul_store_use_mul_f32(float addrspace(1)* %out
|
||||
; GCN-NEXT: v_mul_f32_e32 [[MUL1:v[0-9]+]], -4.0, [[MUL0]]
|
||||
|
||||
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL0]]
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]]
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
define amdgpu_kernel void @v_fneg_mul_multi_use_mul_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%tid.ext = sext i32 %tid to i64
|
||||
@ -806,7 +816,9 @@ define amdgpu_ps float @v_fneg_0_minnum_foldable_use_f32_no_ieee(float %a, float
|
||||
; GCN: v_max_f32_e32 [[MAX0:v[0-9]+]], [[NEG_QUIET_A]], [[NEG_QUIET_B]]
|
||||
; GCN-NEXT: v_mul_f32_e32 [[MUL1:v[0-9]+]], -4.0, [[MAX0]]
|
||||
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MAX0]]
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]]
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
define amdgpu_kernel void @v_fneg_minnum_multi_use_minnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%tid.ext = sext i32 %tid to i64
|
||||
@ -1042,7 +1054,9 @@ define amdgpu_ps float @v_fneg_0_maxnum_foldable_use_f32_no_ieee(float %a, float
|
||||
; GCN: v_min_f32_e32 [[MAX0:v[0-9]+]], [[NEG_QUIET_A]], [[NEG_QUIET_B]]
|
||||
; GCN-NEXT: v_mul_f32_e32 [[MUL1:v[0-9]+]], -4.0, [[MAX0]]
|
||||
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MAX0]]
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]]
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
define amdgpu_kernel void @v_fneg_maxnum_multi_use_maxnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%tid.ext = sext i32 %tid to i64
|
||||
@ -1111,7 +1125,9 @@ define amdgpu_kernel void @v_fneg_fma_f32(float addrspace(1)* %out, float addrsp
|
||||
; GCN-DAG: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], [[C]]
|
||||
; GCN-DAG: v_xor_b32_e32 [[NEG_FMA:v[0-9]+]], 0x80000000, [[FMA]]
|
||||
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_FMA]]
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA]]
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
define amdgpu_kernel void @v_fneg_fma_store_use_fma_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%tid.ext = sext i32 %tid to i64
|
||||
@ -1142,7 +1158,9 @@ define amdgpu_kernel void @v_fneg_fma_store_use_fma_f32(float addrspace(1)* %out
|
||||
; GCN-NSZ-NEXT: v_mul_f32_e32 [[MUL:v[0-9]+]], -4.0, [[NEG_FMA]]
|
||||
|
||||
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_FMA]]
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
define amdgpu_kernel void @v_fneg_fma_multi_use_fma_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%tid.ext = sext i32 %tid to i64
|
||||
@ -1349,7 +1367,9 @@ define amdgpu_kernel void @v_fneg_fma_store_use_fneg_x_y_f32(float addrspace(1)*
|
||||
|
||||
; GCN-NSZ-DAG: v_fma_f32 [[NEG_FMA:v[0-9]+]], [[A]], [[B]], -[[C]]
|
||||
; GCN-NSZ-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_FMA]]
|
||||
; GCN-NSZ-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NSZ-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
|
||||
; GCN-NSZ-NEXT: s_waitcnt vmcnt(0)
|
||||
define amdgpu_kernel void @v_fneg_fma_multi_use_fneg_x_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, float %d) #0 {
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%tid.ext = sext i32 %tid to i64
|
||||
@ -1434,7 +1454,9 @@ define amdgpu_kernel void @v_fneg_fmad_v4f32(<4 x float> addrspace(1)* %out, <4
|
||||
; GCN-NSZ-NEXT: v_mul_f32_e32 [[MUL:v[0-9]+]], -4.0, [[NEG_MAD]]
|
||||
|
||||
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_MAD]]
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
define amdgpu_kernel void @v_fneg_fmad_multi_use_fmad_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%tid.ext = sext i32 %tid to i64
|
||||
@ -1859,7 +1881,9 @@ define amdgpu_kernel void @v_fneg_mul_legacy_f32(float addrspace(1)* %out, float
|
||||
; GCN-DAG: v_mul_legacy_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
|
||||
; GCN-DAG: v_xor_b32_e32 [[NEG_MUL_LEGACY:v[0-9]+]], 0x80000000, [[ADD]]
|
||||
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_MUL_LEGACY]]
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]]
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
define amdgpu_kernel void @v_fneg_mul_legacy_store_use_mul_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%tid.ext = sext i32 %tid to i64
|
||||
@ -1881,7 +1905,9 @@ define amdgpu_kernel void @v_fneg_mul_legacy_store_use_mul_legacy_f32(float addr
|
||||
; GCN: v_mul_legacy_f32_e64 [[ADD:v[0-9]+]], [[A]], -[[B]]
|
||||
; GCN-NEXT: v_mul_legacy_f32_e64 [[MUL:v[0-9]+]], -[[ADD]], 4.0
|
||||
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]]
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
define amdgpu_kernel void @v_fneg_mul_legacy_multi_use_mul_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%tid.ext = sext i32 %tid to i64
|
||||
@ -2316,7 +2342,9 @@ define amdgpu_kernel void @v_fneg_inlineasm_multi_use_src_f32(float addrspace(1)
|
||||
; GCN-NEXT: v_fma_f32 [[FMA1:v[0-9]+]], -[[A]], [[C]], 2.0
|
||||
|
||||
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA0]]
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA1]]
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
define amdgpu_kernel void @multiuse_fneg_2_vop3_users_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%tid.ext = sext i32 %tid to i64
|
||||
@ -2348,7 +2376,9 @@ define amdgpu_kernel void @multiuse_fneg_2_vop3_users_f32(float addrspace(1)* %o
|
||||
; GCN: v_mul_f32_e64 [[MUL0:v[0-9]+]], -[[A]], [[B]]
|
||||
; GCN: v_mul_f32_e64 [[MUL1:v[0-9]+]], -[[A]], [[C]]
|
||||
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL0]]
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]]
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
define amdgpu_kernel void @multiuse_fneg_2_vop2_users_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%tid.ext = sext i32 %tid to i64
|
||||
@ -2379,7 +2409,9 @@ define amdgpu_kernel void @multiuse_fneg_2_vop2_users_f32(float addrspace(1)* %o
|
||||
; GCN: v_mul_f32_e64 [[MUL1:v[0-9]+]], -[[A]], [[C]]
|
||||
|
||||
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA0]]
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]]
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
define amdgpu_kernel void @multiuse_fneg_vop2_vop3_users_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%tid.ext = sext i32 %tid to i64
|
||||
@ -2418,7 +2450,9 @@ define amdgpu_kernel void @multiuse_fneg_vop2_vop3_users_f32(float addrspace(1)*
|
||||
; GCN-NSZ-DAG: v_mul_f32_e32 [[MUL2:v[0-9]+]], [[FMA0]], [[D]]
|
||||
|
||||
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]]
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL2]]
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
define amdgpu_kernel void @free_fold_src_code_size_cost_use_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, float addrspace(1)* %d.ptr) #0 {
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%tid.ext = sext i32 %tid to i64
|
||||
@ -2453,7 +2487,9 @@ define amdgpu_kernel void @free_fold_src_code_size_cost_use_f32(float addrspace(
|
||||
; GCN-DAG: v_mul_f64 [[MUL1:v\[[0-9]+:[0-9]+\]]], -[[FMA0]], [[D]]
|
||||
|
||||
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[MUL0]]
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]]
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
define amdgpu_kernel void @free_fold_src_code_size_cost_use_f64(double addrspace(1)* %out, double addrspace(1)* %a.ptr, double addrspace(1)* %b.ptr, double addrspace(1)* %c.ptr, double addrspace(1)* %d.ptr) #0 {
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%tid.ext = sext i32 %tid to i64
|
||||
|
@ -107,8 +107,8 @@ define void @func_store_private_arg_i32_ptr(i32 addrspace(5)* %ptr) #0 {
|
||||
|
||||
; GCN-LABEL: {{^}}func_load_private_arg_i32_ptr:
|
||||
; GCN: s_waitcnt
|
||||
; MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen{{$}}
|
||||
; GFX9-FLATSCR-NEXT: scratch_load_dword v0, v0, off{{$}}
|
||||
; MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen glc{{$}}
|
||||
; GFX9-FLATSCR-NEXT: scratch_load_dword v0, v0, off glc{{$}}
|
||||
define void @func_load_private_arg_i32_ptr(i32 addrspace(5)* %ptr) #0 {
|
||||
%val = load volatile i32, i32 addrspace(5)* %ptr
|
||||
ret void
|
||||
@ -162,11 +162,11 @@ define void @void_func_byval_struct_i8_i32_ptr_value({ i8, i32 } addrspace(5)* b
|
||||
; GCN: s_and_saveexec_b64
|
||||
|
||||
; CI: v_add_i32_e32 [[GEP:v[0-9]+]], vcc, 4, [[SHIFT]]
|
||||
; CI: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset:4{{$}}
|
||||
; CI: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset:4 glc{{$}}
|
||||
|
||||
; GFX9: v_add_u32_e32 [[GEP:v[0-9]+]], 4, [[SP]]
|
||||
; GFX9-MUBUF: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset:4{{$}}
|
||||
; GFX9-FLATSCR: scratch_load_dword v{{[0-9]+}}, off, s32 offset:4{{$}}
|
||||
; GFX9-MUBUF: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset:4 glc{{$}}
|
||||
; GFX9-FLATSCR: scratch_load_dword v{{[0-9]+}}, off, s32 offset:4 glc{{$}}
|
||||
|
||||
; GCN: ds_write_b32 v{{[0-9]+}}, [[GEP]]
|
||||
define void @void_func_byval_struct_i8_i32_ptr_nonentry_block({ i8, i32 } addrspace(5)* byval({ i8, i32 }) %arg0, i32 %arg2) #0 {
|
||||
|
@ -537,10 +537,10 @@ define void @void_func_byval_struct_i8_i32({ i8, i32 } addrspace(5)* byval({ i8,
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32_x2:
|
||||
; GCN: buffer_load_ubyte v[[ELT0_0:[0-9]+]], off, s[0:3], s32{{$}}
|
||||
; GCN: buffer_load_dword v[[ELT1_0:[0-9]+]], off, s[0:3], s32 offset:4{{$}}
|
||||
; GCN: buffer_load_ubyte v[[ELT0_1:[0-9]+]], off, s[0:3], s32 offset:8{{$}}
|
||||
; GCN: buffer_load_dword v[[ELT1_1:[0-9]+]], off, s[0:3], s32 offset:12{{$}}
|
||||
; GCN: buffer_load_ubyte v[[ELT0_0:[0-9]+]], off, s[0:3], s32 glc{{$}}
|
||||
; GCN: buffer_load_dword v[[ELT1_0:[0-9]+]], off, s[0:3], s32 offset:4 glc{{$}}
|
||||
; GCN: buffer_load_ubyte v[[ELT0_1:[0-9]+]], off, s[0:3], s32 offset:8 glc{{$}}
|
||||
; GCN: buffer_load_dword v[[ELT1_1:[0-9]+]], off, s[0:3], s32 offset:12 glc{{$}}
|
||||
|
||||
; GCN: ds_write_b32 v0, v0
|
||||
; GCN: s_setpc_b64
|
||||
|
@ -159,7 +159,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_signext(i32) #0 {
|
||||
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
|
||||
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
|
||||
; GFX9-NEXT: global_load_ubyte v0, v[0:1], off
|
||||
; GFX9-NEXT: global_load_ubyte v0, v[0:1], off glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_writelane_b32 v40, s33, 2
|
||||
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
|
||||
; GFX9-NEXT: s_mov_b32 s33, s32
|
||||
@ -168,7 +169,6 @@ define amdgpu_gfx void @test_call_external_void_func_i1_signext(i32) #0 {
|
||||
; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i1_signext@rel32@lo+4
|
||||
; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i1_signext@rel32@hi+12
|
||||
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
|
||||
; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], s32
|
||||
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
@ -190,7 +190,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_signext(i32) #0 {
|
||||
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
|
||||
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
|
||||
; GFX10-NEXT: s_mov_b32 exec_lo, s4
|
||||
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
|
||||
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off glc dlc
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: v_writelane_b32 v40, s33, 2
|
||||
; GFX10-NEXT: s_mov_b32 s33, s32
|
||||
; GFX10-NEXT: s_add_u32 s32, s32, 0x200
|
||||
@ -199,7 +200,6 @@ define amdgpu_gfx void @test_call_external_void_func_i1_signext(i32) #0 {
|
||||
; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_i1_signext@rel32@hi+12
|
||||
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
|
||||
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
|
||||
; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s32
|
||||
; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
@ -225,7 +225,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_zeroext(i32) #0 {
|
||||
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
|
||||
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
|
||||
; GFX9-NEXT: global_load_ubyte v0, v[0:1], off
|
||||
; GFX9-NEXT: global_load_ubyte v0, v[0:1], off glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_writelane_b32 v40, s33, 2
|
||||
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
|
||||
; GFX9-NEXT: s_mov_b32 s33, s32
|
||||
@ -234,7 +235,6 @@ define amdgpu_gfx void @test_call_external_void_func_i1_zeroext(i32) #0 {
|
||||
; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i1_zeroext@rel32@lo+4
|
||||
; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i1_zeroext@rel32@hi+12
|
||||
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
|
||||
; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], s32
|
||||
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
@ -256,7 +256,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_zeroext(i32) #0 {
|
||||
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
|
||||
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
|
||||
; GFX10-NEXT: s_mov_b32 exec_lo, s4
|
||||
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
|
||||
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off glc dlc
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: v_writelane_b32 v40, s33, 2
|
||||
; GFX10-NEXT: s_mov_b32 s33, s32
|
||||
; GFX10-NEXT: s_add_u32 s32, s32, 0x200
|
||||
@ -265,7 +266,6 @@ define amdgpu_gfx void @test_call_external_void_func_i1_zeroext(i32) #0 {
|
||||
; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_i1_zeroext@rel32@hi+12
|
||||
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
|
||||
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
|
||||
; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s32
|
||||
; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
@ -350,7 +350,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_signext(i32) #0 {
|
||||
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
|
||||
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
|
||||
; GFX9-NEXT: global_load_sbyte v0, v[0:1], off
|
||||
; GFX9-NEXT: global_load_sbyte v0, v[0:1], off glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_writelane_b32 v40, s33, 2
|
||||
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
|
||||
; GFX9-NEXT: s_mov_b32 s33, s32
|
||||
@ -378,7 +379,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_signext(i32) #0 {
|
||||
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
|
||||
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
|
||||
; GFX10-NEXT: s_mov_b32 exec_lo, s4
|
||||
; GFX10-NEXT: global_load_sbyte v0, v[0:1], off
|
||||
; GFX10-NEXT: global_load_sbyte v0, v[0:1], off glc dlc
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: v_writelane_b32 v40, s33, 2
|
||||
; GFX10-NEXT: s_mov_b32 s33, s32
|
||||
; GFX10-NEXT: s_add_u32 s32, s32, 0x200
|
||||
@ -410,7 +412,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_zeroext(i32) #0 {
|
||||
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
|
||||
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
|
||||
; GFX9-NEXT: global_load_ubyte v0, v[0:1], off
|
||||
; GFX9-NEXT: global_load_ubyte v0, v[0:1], off glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_writelane_b32 v40, s33, 2
|
||||
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
|
||||
; GFX9-NEXT: s_mov_b32 s33, s32
|
||||
@ -438,7 +441,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_zeroext(i32) #0 {
|
||||
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
|
||||
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
|
||||
; GFX10-NEXT: s_mov_b32 exec_lo, s4
|
||||
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
|
||||
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off glc dlc
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: v_writelane_b32 v40, s33, 2
|
||||
; GFX10-NEXT: s_mov_b32 s33, s32
|
||||
; GFX10-NEXT: s_add_u32 s32, s32, 0x200
|
||||
@ -529,7 +533,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_signext(i32) #0 {
|
||||
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
|
||||
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
|
||||
; GFX9-NEXT: global_load_ushort v0, v[0:1], off
|
||||
; GFX9-NEXT: global_load_ushort v0, v[0:1], off glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_writelane_b32 v40, s33, 2
|
||||
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
|
||||
; GFX9-NEXT: s_mov_b32 s33, s32
|
||||
@ -557,7 +562,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_signext(i32) #0 {
|
||||
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
|
||||
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
|
||||
; GFX10-NEXT: s_mov_b32 exec_lo, s4
|
||||
; GFX10-NEXT: global_load_ushort v0, v[0:1], off
|
||||
; GFX10-NEXT: global_load_ushort v0, v[0:1], off glc dlc
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: v_writelane_b32 v40, s33, 2
|
||||
; GFX10-NEXT: s_mov_b32 s33, s32
|
||||
; GFX10-NEXT: s_add_u32 s32, s32, 0x200
|
||||
@ -589,7 +595,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_zeroext(i32) #0 {
|
||||
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
|
||||
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
|
||||
; GFX9-NEXT: global_load_ushort v0, v[0:1], off
|
||||
; GFX9-NEXT: global_load_ushort v0, v[0:1], off glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_writelane_b32 v40, s33, 2
|
||||
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
|
||||
; GFX9-NEXT: s_mov_b32 s33, s32
|
||||
@ -617,7 +624,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_zeroext(i32) #0 {
|
||||
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
|
||||
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
|
||||
; GFX10-NEXT: s_mov_b32 exec_lo, s4
|
||||
; GFX10-NEXT: global_load_ushort v0, v[0:1], off
|
||||
; GFX10-NEXT: global_load_ushort v0, v[0:1], off glc dlc
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: v_writelane_b32 v40, s33, 2
|
||||
; GFX10-NEXT: s_mov_b32 s33, s32
|
||||
; GFX10-NEXT: s_add_u32 s32, s32, 0x200
|
||||
@ -2877,6 +2885,7 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(i32 addrspace(1)* %o
|
||||
; GFX9-NEXT: v_mov_b32_e32 v41, v1
|
||||
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; GFX9-NEXT: global_store_dword v[40:41], v0, off
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
|
||||
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
|
||||
; GFX9-NEXT: v_readlane_b32 s4, v42, 0
|
||||
@ -2912,6 +2921,7 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(i32 addrspace(1)* %o
|
||||
; GFX10-NEXT: v_writelane_b32 v42, s31, 1
|
||||
; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; GFX10-NEXT: global_store_dword v[40:41], v0, off
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: s_clause 0x1
|
||||
; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33
|
||||
; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4
|
||||
@ -2924,7 +2934,6 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(i32 addrspace(1)* %o
|
||||
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
|
||||
; GFX10-NEXT: s_mov_b32 exec_lo, s6
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: s_setpc_b64 s[4:5]
|
||||
%val = call amdgpu_gfx i32 @external_i32_func_i32(i32 42)
|
||||
store volatile i32 %val, i32 addrspace(1)* %out
|
||||
@ -3105,7 +3114,9 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st
|
||||
; GFX9-NEXT: v_readlane_b32 s33, v40, 2
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: global_store_byte v[0:1], v0, off
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: global_store_dword v[0:1], v1, off
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
|
||||
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[6:7]
@ -3145,13 +3156,14 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st
; GFX10-NEXT: v_readlane_b32 s33, v40, 2
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_byte v[0:1], v0, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_store_dword v[0:1], v1, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s6, -1
; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s6
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_setpc_b64 s[4:5]
%in.val = alloca { i8, i32 }, align 4, addrspace(5)
%out.val = alloca { i8, i32 }, align 4, addrspace(5)
@ -19,7 +19,9 @@ define amdgpu_ps void @load_1d_f16_tfe_dmask0(<8 x i32> inreg %rsrc, i32 %s) {
|
||||
; GFX9-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 unorm tfe d16
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: global_store_short v[0:1], v1, off
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: global_store_dword v[0:1], v2, off
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-LABEL: load_1d_f16_tfe_dmask0:
|
||||
@ -37,7 +39,9 @@ define amdgpu_ps void @load_1d_f16_tfe_dmask0(<8 x i32> inreg %rsrc, i32 %s) {
|
||||
; GFX10-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm tfe d16
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: global_store_short v[0:1], v1, off
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: global_store_dword v[0:1], v2, off
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: s_endpgm
|
||||
;
|
||||
; GFX8-UNPACKED-LABEL: load_1d_f16_tfe_dmask0:
|
||||
@ -55,7 +59,9 @@ define amdgpu_ps void @load_1d_f16_tfe_dmask0(<8 x i32> inreg %rsrc, i32 %s) {
|
||||
; GFX8-UNPACKED-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 unorm tfe d16
|
||||
; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8-UNPACKED-NEXT: flat_store_short v[0:1], v1
|
||||
; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8-UNPACKED-NEXT: s_endpgm
|
||||
%v = call { half, i32 } @llvm.amdgcn.image.load.1d.sl_f16i32s.i32(i32 0, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
|
||||
%v.data = extractvalue { half, i32 } %v, 0
|
||||
@ -81,7 +87,9 @@ define amdgpu_ps void @load_1d_f16_tfe_dmask1(<8 x i32> inreg %rsrc, i32 %s) {
|
||||
; GFX9-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 unorm tfe d16
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: global_store_short v[0:1], v1, off
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: global_store_dword v[0:1], v2, off
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-LABEL: load_1d_f16_tfe_dmask1:
|
||||
@ -99,7 +107,9 @@ define amdgpu_ps void @load_1d_f16_tfe_dmask1(<8 x i32> inreg %rsrc, i32 %s) {
|
||||
; GFX10-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm tfe d16
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: global_store_short v[0:1], v1, off
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: global_store_dword v[0:1], v2, off
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: s_endpgm
|
||||
;
|
||||
; GFX8-UNPACKED-LABEL: load_1d_f16_tfe_dmask1:
|
||||
@ -117,7 +127,9 @@ define amdgpu_ps void @load_1d_f16_tfe_dmask1(<8 x i32> inreg %rsrc, i32 %s) {
|
||||
; GFX8-UNPACKED-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 unorm tfe d16
|
||||
; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8-UNPACKED-NEXT: flat_store_short v[0:1], v1
|
||||
; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8-UNPACKED-NEXT: s_endpgm
|
||||
%v = call { half, i32 } @llvm.amdgcn.image.load.1d.sl_f16i32s.i32(i32 1, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
|
||||
%v.data = extractvalue { half, i32 } %v, 0
|
||||
@ -143,7 +155,9 @@ define amdgpu_ps void @load_1d_v2f16_tfe_dmask0(<8 x i32> inreg %rsrc, i32 %s) {
|
||||
; GFX9-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 unorm tfe d16
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: global_store_dword v[0:1], v1, off
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: global_store_dword v[0:1], v2, off
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-LABEL: load_1d_v2f16_tfe_dmask0:
|
||||
@ -161,7 +175,9 @@ define amdgpu_ps void @load_1d_v2f16_tfe_dmask0(<8 x i32> inreg %rsrc, i32 %s) {
|
||||
; GFX10-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm tfe d16
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: global_store_dword v[0:1], v1, off
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: global_store_dword v[0:1], v2, off
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: s_endpgm
|
||||
;
|
||||
; GFX8-UNPACKED-LABEL: load_1d_v2f16_tfe_dmask0:
|
||||
@ -179,7 +195,9 @@ define amdgpu_ps void @load_1d_v2f16_tfe_dmask0(<8 x i32> inreg %rsrc, i32 %s) {
|
||||
; GFX8-UNPACKED-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 unorm tfe d16
|
||||
; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v1
|
||||
; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8-UNPACKED-NEXT: s_endpgm
|
||||
%v = call { <2 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v2f16i32s.i32(i32 0, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
|
||||
%v.data = extractvalue { <2 x half>, i32 } %v, 0
|
||||
@ -205,7 +223,9 @@ define amdgpu_ps void @load_1d_v2f16_tfe_dmask1(<8 x i32> inreg %rsrc, i32 %s) {
|
||||
; GFX9-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 unorm tfe d16
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: global_store_dword v[0:1], v1, off
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: global_store_dword v[0:1], v2, off
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-LABEL: load_1d_v2f16_tfe_dmask1:
|
||||
@ -223,7 +243,9 @@ define amdgpu_ps void @load_1d_v2f16_tfe_dmask1(<8 x i32> inreg %rsrc, i32 %s) {
|
||||
; GFX10-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm tfe d16
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: global_store_dword v[0:1], v1, off
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: global_store_dword v[0:1], v2, off
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: s_endpgm
|
||||
;
|
||||
; GFX8-UNPACKED-LABEL: load_1d_v2f16_tfe_dmask1:
|
||||
@ -241,7 +263,9 @@ define amdgpu_ps void @load_1d_v2f16_tfe_dmask1(<8 x i32> inreg %rsrc, i32 %s) {
|
||||
; GFX8-UNPACKED-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 unorm tfe d16
|
||||
; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v1
|
||||
; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8-UNPACKED-NEXT: s_endpgm
|
||||
%v = call { <2 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v2f16i32s.i32(i32 1, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
|
||||
%v.data = extractvalue { <2 x half>, i32 } %v, 0
|
||||
@ -267,7 +291,9 @@ define amdgpu_ps void @load_1d_v2f16_tfe_dmask3(<8 x i32> inreg %rsrc, i32 %s) {
|
||||
; GFX9-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x3 unorm tfe d16
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: global_store_dword v[0:1], v1, off
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: global_store_dword v[0:1], v2, off
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-LABEL: load_1d_v2f16_tfe_dmask3:
|
||||
@ -285,7 +311,9 @@ define amdgpu_ps void @load_1d_v2f16_tfe_dmask3(<8 x i32> inreg %rsrc, i32 %s) {
|
||||
; GFX10-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm tfe d16
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: global_store_dword v[0:1], v1, off
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: global_store_dword v[0:1], v2, off
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: s_endpgm
|
||||
;
|
||||
; GFX8-UNPACKED-LABEL: load_1d_v2f16_tfe_dmask3:
|
||||
@ -306,7 +334,9 @@ define amdgpu_ps void @load_1d_v2f16_tfe_dmask3(<8 x i32> inreg %rsrc, i32 %s) {
|
||||
; GFX8-UNPACKED-NEXT: v_lshlrev_b32_e32 v0, 16, v2
|
||||
; GFX8-UNPACKED-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v0
|
||||
; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v3
|
||||
; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8-UNPACKED-NEXT: s_endpgm
|
||||
%v = call { <2 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v2f16i32s.i32(i32 3, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
|
||||
%v.data = extractvalue { <2 x half>, i32 } %v, 0
|
||||
@ -333,8 +363,11 @@ define amdgpu_ps void @load_1d_v3f16_tfe_dmask7(<8 x i32> inreg %rsrc, i32 %s) {
|
||||
; GFX9-NEXT: image_load v[1:3], v0, s[4:11] dmask:0x7 unorm tfe d16
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: global_store_short v[0:1], v2, off
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: global_store_dword v[0:1], v1, off
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: global_store_dword v[0:1], v3, off
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-LABEL: load_1d_v3f16_tfe_dmask7:
|
||||
@ -353,8 +386,11 @@ define amdgpu_ps void @load_1d_v3f16_tfe_dmask7(<8 x i32> inreg %rsrc, i32 %s) {
|
||||
; GFX10-NEXT: image_load v[1:3], v0, s[4:11] dmask:0x7 dim:SQ_RSRC_IMG_1D unorm tfe d16
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: global_store_short v[0:1], v2, off
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: global_store_dword v[0:1], v1, off
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: global_store_dword v[0:1], v3, off
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: s_endpgm
|
||||
;
|
||||
; GFX8-UNPACKED-LABEL: load_1d_v3f16_tfe_dmask7:
|
||||
@ -375,9 +411,12 @@ define amdgpu_ps void @load_1d_v3f16_tfe_dmask7(<8 x i32> inreg %rsrc, i32 %s) {
|
||||
; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8-UNPACKED-NEXT: v_lshlrev_b32_e32 v0, 16, v2
|
||||
; GFX8-UNPACKED-NEXT: flat_store_short v[0:1], v3
|
||||
; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8-UNPACKED-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v0
|
||||
; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v4
|
||||
; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8-UNPACKED-NEXT: s_endpgm
|
||||
%v = call { <3 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v3f16i32s.i32(i32 7, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
|
||||
%v.data = extractvalue { <3 x half>, i32 } %v, 0
|
||||
@ -404,7 +443,9 @@ define amdgpu_ps void @load_1d_v4f16_tfe_dmask15(<8 x i32> inreg %rsrc, i32 %s)
|
||||
; GFX9-NEXT: image_load v[1:3], v0, s[4:11] dmask:0xf unorm tfe d16
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: global_store_dwordx2 v[0:1], v[1:2], off
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: global_store_dword v[0:1], v3, off
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-LABEL: load_1d_v4f16_tfe_dmask15:
|
||||
@ -423,7 +464,9 @@ define amdgpu_ps void @load_1d_v4f16_tfe_dmask15(<8 x i32> inreg %rsrc, i32 %s)
|
||||
; GFX10-NEXT: image_load v[1:3], v0, s[4:11] dmask:0xf dim:SQ_RSRC_IMG_1D unorm tfe d16
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: global_store_dwordx2 v[0:1], v[1:2], off
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: global_store_dword v[0:1], v3, off
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: s_endpgm
|
||||
;
|
||||
; GFX8-UNPACKED-LABEL: load_1d_v4f16_tfe_dmask15:
|
||||
@ -448,7 +491,9 @@ define amdgpu_ps void @load_1d_v4f16_tfe_dmask15(<8 x i32> inreg %rsrc, i32 %s)
|
||||
; GFX8-UNPACKED-NEXT: v_or_b32_sdwa v2, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-UNPACKED-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-UNPACKED-NEXT: flat_store_dwordx2 v[0:1], v[1:2]
|
||||
; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v5
|
||||
; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8-UNPACKED-NEXT: s_endpgm
|
||||
%v = call { <4 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v4f16i32s.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
|
||||
%v.data = extractvalue { <4 x half>, i32 } %v, 0
|
||||
|
@ -14,6 +14,7 @@ define amdgpu_kernel void @store_inline_imm_neg_0.0_i16(i16 addrspace(1)* %out)
; GFX10-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x03,0x82,0xbe]
; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf]
; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
;
; VI-LABEL: store_inline_imm_neg_0.0_i16:
@ -24,6 +25,7 @@ define amdgpu_kernel void @store_inline_imm_neg_0.0_i16(i16 addrspace(1)* %out)
; VI-NEXT: v_mov_b32_e32 v0, 0xffff8000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x80,0xff,0xff]
; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf]
; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
; VI-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x0f,0x8c,0xbf]
; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
;
; SI-LABEL: store_inline_imm_neg_0.0_i16:
@ -34,6 +36,7 @@ define amdgpu_kernel void @store_inline_imm_neg_0.0_i16(i16 addrspace(1)* %out)
; SI-NEXT: v_mov_b32_e32 v0, 0x8000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
store volatile i16 -32768, i16 addrspace(1)* %out
ret void
@ -14,6 +14,7 @@ define amdgpu_kernel void @infinite_loop(i32 addrspace(1)* %out) {
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_branch BB0_1
; IR-LABEL: @infinite_loop(
; IR-NEXT: entry:
@ -45,6 +46,7 @@ define amdgpu_kernel void @infinite_loop_ret(i32 addrspace(1)* %out) {
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b64 vcc, vcc
; SI-NEXT: s_cbranch_vccnz BB1_2
; SI-NEXT: BB1_3: ; %UnifiedReturnBlock
@ -87,6 +89,7 @@ define amdgpu_kernel void @infinite_loops(i32 addrspace(1)* %out) {
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b64 vcc, vcc
; SI-NEXT: s_cbranch_vccnz BB2_2
; SI-NEXT: ; %bb.3: ; %Flow
@ -105,6 +108,7 @@ define amdgpu_kernel void @infinite_loops(i32 addrspace(1)* %out) {
; SI-NEXT: BB2_6: ; %loop1
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b64 vcc, vcc
; SI-NEXT: s_cbranch_vccz BB2_6
; SI-NEXT: BB2_7: ; %DummyReturnBlock
@ -156,6 +160,7 @@ define amdgpu_kernel void @infinite_loop_nest_ret(i32 addrspace(1)* %out) {
; SI-NEXT: v_mov_b32_e32 v0, 0x3e7
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_andn2_b64 exec, exec, s[2:3]
; SI-NEXT: s_cbranch_execnz BB3_3
; SI-NEXT: ; %bb.4: ; %loop.exit.guard
@ -245,6 +245,7 @@ entry:
; FIXME: Should probably be masking high bits of load.
; CHECK-LABEL: {{^}}i1_input_phys_vgpr_x2:
; CHECK: buffer_load_ubyte v0
; CHECK-NEXT: s_waitcnt
; CHECK-NEXT: buffer_load_ubyte v1
; CHECK-NEXT: s_waitcnt
; CHECK-NEXT: ASMSTART
@ -1577,7 +1577,8 @@ define amdgpu_kernel void @v_insertelement_v4i16_2(<4 x i16> addrspace(1)* %out,
|
||||
define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in, i32 %val) #0 {
|
||||
; GFX9-LABEL: v_insertelement_v4i16_dynamic_vgpr:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: global_load_dword v2, v[0:1], off
|
||||
; GFX9-NEXT: global_load_dword v2, v[0:1], off glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
|
||||
; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10
|
||||
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0
|
||||
@ -1585,7 +1586,6 @@ define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(<4 x i16> addrspac
|
||||
; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3]
|
||||
; GFX9-NEXT: s_mov_b32 s3, 0
|
||||
; GFX9-NEXT: s_mov_b32 s2, 0xffff
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(1)
|
||||
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 4, v2
|
||||
; GFX9-NEXT: v_lshlrev_b64 v[2:3], v2, s[2:3]
|
||||
; GFX9-NEXT: s_pack_ll_b32_b16 s2, s4, s4
|
||||
@ -1604,7 +1604,8 @@ define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(<4 x i16> addrspac
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s3
|
||||
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
|
||||
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; VI-NEXT: flat_load_dword v4, v[0:1]
|
||||
; VI-NEXT: flat_load_dword v4, v[0:1] glc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
|
||||
; VI-NEXT: s_mov_b32 s2, 0xffff
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s1
|
||||
@ -1614,7 +1615,6 @@ define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(<4 x i16> addrspac
|
||||
; VI-NEXT: s_lshl_b32 s0, s1, 16
|
||||
; VI-NEXT: s_or_b32 s0, s1, s0
|
||||
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
|
||||
; VI-NEXT: s_waitcnt vmcnt(1)
|
||||
; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v4
|
||||
; VI-NEXT: v_lshlrev_b64 v[4:5], v4, s[2:3]
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
@ -1632,7 +1632,8 @@ define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(<4 x i16> addrspac
|
||||
; CI-NEXT: v_mov_b32_e32 v1, s3
|
||||
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
|
||||
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; CI-NEXT: flat_load_dword v4, v[0:1]
|
||||
; CI-NEXT: flat_load_dword v4, v[0:1] glc
|
||||
; CI-NEXT: s_waitcnt vmcnt(0)
|
||||
; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
|
||||
; CI-NEXT: s_mov_b32 s3, 0
|
||||
; CI-NEXT: s_mov_b32 s2, 0xffff
|
||||
@ -1642,7 +1643,6 @@ define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(<4 x i16> addrspac
|
||||
; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
|
||||
; CI-NEXT: s_or_b32 s0, s4, s1
|
||||
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
|
||||
; CI-NEXT: s_waitcnt vmcnt(1)
|
||||
; CI-NEXT: v_lshlrev_b32_e32 v4, 4, v4
|
||||
; CI-NEXT: v_lshl_b64 v[4:5], s[2:3], v4
|
||||
; CI-NEXT: s_waitcnt vmcnt(0)
|
||||
|
@ -101,10 +101,11 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32(<2 x half> addrspace(1)* %out,
|
||||
; SI-NEXT: s_mov_b64 s[6:7], s[2:3]
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_mov_b64 s[0:1], s[10:11]
|
||||
; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
|
||||
; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
|
||||
; SI-NEXT: s_mov_b64 s[10:11], s[2:3]
|
||||
; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: s_mov_b64 s[10:11], s[2:3]
|
||||
; SI-NEXT: v_cvt_pkrtz_f16_f32_e32 v2, v2, v3
|
||||
; SI-NEXT: buffer_store_dword v2, v[0:1], s[8:11], 0 addr64
|
||||
; SI-NEXT: s_endpgm
|
||||
@ -121,12 +122,13 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32(<2 x half> addrspace(1)* %out,
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s1
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
|
||||
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
|
||||
; VI-NEXT: flat_load_dword v0, v[0:1]
|
||||
; VI-NEXT: flat_load_dword v1, v[2:3]
|
||||
; VI-NEXT: flat_load_dword v0, v[0:1] glc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: flat_load_dword v1, v[2:3] glc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v5, s5
|
||||
; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4
|
||||
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, v0, v1
|
||||
; VI-NEXT: flat_store_dword v[4:5], v0
|
||||
; VI-NEXT: s_endpgm
|
||||
@ -137,8 +139,9 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32(<2 x half> addrspace(1)* %out,
|
||||
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
|
||||
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
|
||||
; GFX9-NEXT: global_load_dword v2, v0, s[0:1]
|
||||
; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: global_load_dword v2, v0, s[0:1] glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, v1, v2
|
||||
; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
|
||||
@ -165,9 +168,9 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_reg_imm(<2 x half> addrspace(1)
|
||||
; SI-NEXT: v_mov_b32_e32 v1, 0
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
|
||||
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
|
||||
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
|
||||
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
|
||||
; SI-NEXT: v_cvt_pkrtz_f16_f32_e64 v2, v2, 1.0
|
||||
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
|
||||
; SI-NEXT: s_endpgm
|
||||
@ -180,11 +183,11 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_reg_imm(<2 x half> addrspace(1)
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s3
|
||||
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
|
||||
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; VI-NEXT: flat_load_dword v0, v[0:1]
|
||||
; VI-NEXT: flat_load_dword v0, v[0:1] glc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s1
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
|
||||
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, v0, 1.0
|
||||
; VI-NEXT: flat_store_dword v[2:3], v0
|
||||
; VI-NEXT: s_endpgm
|
||||
@ -194,7 +197,7 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_reg_imm(<2 x half> addrspace(1)
|
||||
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
|
||||
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
|
||||
; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, v1, 1.0
|
||||
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
|
||||
@ -219,9 +222,9 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_imm_reg(<2 x half> addrspace(1)
|
||||
; SI-NEXT: v_mov_b32_e32 v1, 0
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
|
||||
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
|
||||
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
|
||||
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
|
||||
; SI-NEXT: v_cvt_pkrtz_f16_f32_e32 v2, 1.0, v2
|
||||
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
|
||||
; SI-NEXT: s_endpgm
|
||||
@ -234,11 +237,11 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_imm_reg(<2 x half> addrspace(1)
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s3
|
||||
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
|
||||
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; VI-NEXT: flat_load_dword v0, v[0:1]
|
||||
; VI-NEXT: flat_load_dword v0, v[0:1] glc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s1
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
|
||||
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, 1.0, v0
|
||||
; VI-NEXT: flat_store_dword v[2:3], v0
|
||||
; VI-NEXT: s_endpgm
|
||||
@ -248,7 +251,7 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_imm_reg(<2 x half> addrspace(1)
|
||||
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
|
||||
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
|
||||
; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, 1.0, v1
|
||||
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
|
||||
@ -275,10 +278,11 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo(<2 x half> addrspace(1)
|
||||
; SI-NEXT: s_mov_b64 s[6:7], s[2:3]
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_mov_b64 s[0:1], s[10:11]
|
||||
; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
|
||||
; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
|
||||
; SI-NEXT: s_mov_b64 s[10:11], s[2:3]
|
||||
; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: s_mov_b64 s[10:11], s[2:3]
|
||||
; SI-NEXT: v_cvt_pkrtz_f16_f32_e64 v2, -v2, v3
|
||||
; SI-NEXT: buffer_store_dword v2, v[0:1], s[8:11], 0 addr64
|
||||
; SI-NEXT: s_endpgm
|
||||
@ -295,12 +299,13 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo(<2 x half> addrspace(1)
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s1
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
|
||||
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
|
||||
; VI-NEXT: flat_load_dword v0, v[0:1]
|
||||
; VI-NEXT: flat_load_dword v1, v[2:3]
|
||||
; VI-NEXT: flat_load_dword v0, v[0:1] glc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: flat_load_dword v1, v[2:3] glc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v5, s5
|
||||
; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4
|
||||
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, -v0, v1
|
||||
; VI-NEXT: flat_store_dword v[4:5], v0
|
||||
; VI-NEXT: s_endpgm
|
||||
@ -311,8 +316,9 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo(<2 x half> addrspace(1)
|
||||
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
|
||||
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
|
||||
; GFX9-NEXT: global_load_dword v2, v0, s[0:1]
|
||||
; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: global_load_dword v2, v0, s[0:1] glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, -v1, v2
|
||||
; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
|
||||
@ -342,10 +348,11 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_hi(<2 x half> addrspace(1)
|
||||
; SI-NEXT: s_mov_b64 s[6:7], s[2:3]
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_mov_b64 s[0:1], s[10:11]
|
||||
; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
|
||||
; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
|
||||
; SI-NEXT: s_mov_b64 s[10:11], s[2:3]
|
||||
; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: s_mov_b64 s[10:11], s[2:3]
|
||||
; SI-NEXT: v_cvt_pkrtz_f16_f32_e64 v2, v2, -v3
|
||||
; SI-NEXT: buffer_store_dword v2, v[0:1], s[8:11], 0 addr64
|
||||
; SI-NEXT: s_endpgm
|
||||
@ -362,12 +369,13 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_hi(<2 x half> addrspace(1)
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s1
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
|
||||
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
|
||||
; VI-NEXT: flat_load_dword v0, v[0:1]
|
||||
; VI-NEXT: flat_load_dword v1, v[2:3]
|
||||
; VI-NEXT: flat_load_dword v0, v[0:1] glc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: flat_load_dword v1, v[2:3] glc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v5, s5
|
||||
; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4
|
||||
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, v0, -v1
|
||||
; VI-NEXT: flat_store_dword v[4:5], v0
|
||||
; VI-NEXT: s_endpgm
|
||||
@ -378,8 +386,9 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_hi(<2 x half> addrspace(1)
|
||||
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
|
||||
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
|
||||
; GFX9-NEXT: global_load_dword v2, v0, s[0:1]
|
||||
; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: global_load_dword v2, v0, s[0:1] glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, v1, -v2
|
||||
; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
|
||||
@ -409,10 +418,11 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo_hi(<2 x half> addrspace
|
||||
; SI-NEXT: s_mov_b64 s[6:7], s[2:3]
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_mov_b64 s[0:1], s[10:11]
|
||||
; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
|
||||
; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
|
||||
; SI-NEXT: s_mov_b64 s[10:11], s[2:3]
|
||||
; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: s_mov_b64 s[10:11], s[2:3]
|
||||
; SI-NEXT: v_cvt_pkrtz_f16_f32_e64 v2, -v2, -v3
|
||||
; SI-NEXT: buffer_store_dword v2, v[0:1], s[8:11], 0 addr64
|
||||
; SI-NEXT: s_endpgm
|
||||
@ -429,12 +439,13 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo_hi(<2 x half> addrspace
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s1
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
|
||||
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
|
||||
; VI-NEXT: flat_load_dword v0, v[0:1]
|
||||
; VI-NEXT: flat_load_dword v1, v[2:3]
|
||||
; VI-NEXT: flat_load_dword v0, v[0:1] glc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: flat_load_dword v1, v[2:3] glc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v5, s5
|
||||
; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4
|
||||
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, -v0, -v1
|
||||
; VI-NEXT: flat_store_dword v[4:5], v0
|
||||
; VI-NEXT: s_endpgm
|
||||
@ -445,8 +456,9 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo_hi(<2 x half> addrspace
|
||||
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
|
||||
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
|
||||
; GFX9-NEXT: global_load_dword v2, v0, s[0:1]
|
||||
; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: global_load_dword v2, v0, s[0:1] glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, -v1, -v2
|
||||
; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
|
||||
@ -477,10 +489,11 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi(<2 x half>
|
||||
; SI-NEXT: s_mov_b64 s[6:7], s[2:3]
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_mov_b64 s[0:1], s[10:11]
|
||||
; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
|
||||
; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
|
||||
; SI-NEXT: s_mov_b64 s[10:11], s[2:3]
|
||||
; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: s_mov_b64 s[10:11], s[2:3]
|
||||
; SI-NEXT: v_cvt_pkrtz_f16_f32_e64 v2, -|v2|, -v3
|
||||
; SI-NEXT: buffer_store_dword v2, v[0:1], s[8:11], 0 addr64
|
||||
; SI-NEXT: s_endpgm
|
||||
@ -497,12 +510,13 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi(<2 x half>
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s1
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
|
||||
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
|
||||
; VI-NEXT: flat_load_dword v0, v[0:1]
|
||||
; VI-NEXT: flat_load_dword v1, v[2:3]
|
||||
; VI-NEXT: flat_load_dword v0, v[0:1] glc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: flat_load_dword v1, v[2:3] glc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v5, s5
|
||||
; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4
|
||||
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, -|v0|, -v1
|
||||
; VI-NEXT: flat_store_dword v[4:5], v0
|
||||
; VI-NEXT: s_endpgm
|
||||
@ -513,8 +527,9 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi(<2 x half>
|
||||
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
|
||||
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
|
||||
; GFX9-NEXT: global_load_dword v2, v0, s[0:1]
|
||||
; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: global_load_dword v2, v0, s[0:1] glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, -|v1|, -v2
|
||||
; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
@ -114,9 +114,9 @@ define amdgpu_kernel void @test_div_fmas_f32_imm_true_cond_to_vcc(float addrspac
}
; GCN-LABEL: {{^}}test_div_fmas_f32_logical_cond_to_vcc:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 glc{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
; SI-DAG: v_cmp_eq_u32_e32 [[CMP0:vcc]], 0, v{{[0-9]+}}
; SI-DAG: v_cmp_ne_u32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]], s{{[0-9]+}}, 0{{$}}
@ -22,13 +22,13 @@ define amdgpu_kernel void @maxnum_f16(
|
||||
; SI-NEXT: s_mov_b32 s15, s3
|
||||
; SI-NEXT: s_mov_b32 s10, s2
|
||||
; SI-NEXT: s_mov_b32 s11, s3
|
||||
; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0
|
||||
; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0
|
||||
; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: s_mov_b32 s0, s4
|
||||
; SI-NEXT: s_mov_b32 s1, s5
|
||||
; SI-NEXT: s_waitcnt vmcnt(1)
|
||||
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
|
||||
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
|
||||
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
|
||||
@ -50,13 +50,13 @@ define amdgpu_kernel void @maxnum_f16(
|
||||
; VI-NEXT: s_mov_b32 s15, s3
|
||||
; VI-NEXT: s_mov_b32 s10, s2
|
||||
; VI-NEXT: s_mov_b32 s11, s3
|
||||
; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0
|
||||
; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0
|
||||
; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: s_mov_b32 s0, s4
|
||||
; VI-NEXT: s_mov_b32 s1, s5
|
||||
; VI-NEXT: s_waitcnt vmcnt(1)
|
||||
; VI-NEXT: v_max_f16_e32 v0, v0, v0
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_max_f16_e32 v1, v1, v1
|
||||
; VI-NEXT: v_max_f16_e32 v0, v0, v1
|
||||
; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
|
||||
@ -75,13 +75,13 @@ define amdgpu_kernel void @maxnum_f16(
|
||||
; GFX9-NEXT: s_mov_b32 s15, s3
|
||||
; GFX9-NEXT: s_mov_b32 s10, s2
|
||||
; GFX9-NEXT: s_mov_b32 s11, s3
|
||||
; GFX9-NEXT: buffer_load_ushort v0, off, s[12:15], 0
|
||||
; GFX9-NEXT: buffer_load_ushort v1, off, s[8:11], 0
|
||||
; GFX9-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_mov_b32 s0, s4
|
||||
; GFX9-NEXT: s_mov_b32 s1, s5
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(1)
|
||||
; GFX9-NEXT: v_max_f16_e32 v0, v0, v0
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_max_f16_e32 v1, v1, v1
|
||||
; GFX9-NEXT: v_max_f16_e32 v0, v0, v1
|
||||
; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0
|
||||
|
@ -22,13 +22,13 @@ define amdgpu_kernel void @minnum_f16_ieee(
|
||||
; SI-NEXT: s_mov_b32 s15, s3
|
||||
; SI-NEXT: s_mov_b32 s10, s2
|
||||
; SI-NEXT: s_mov_b32 s11, s3
|
||||
; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0
|
||||
; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0
|
||||
; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: s_mov_b32 s0, s4
|
||||
; SI-NEXT: s_mov_b32 s1, s5
|
||||
; SI-NEXT: s_waitcnt vmcnt(1)
|
||||
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
|
||||
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
|
||||
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
|
||||
@ -50,13 +50,13 @@ define amdgpu_kernel void @minnum_f16_ieee(
|
||||
; VI-NEXT: s_mov_b32 s15, s3
|
||||
; VI-NEXT: s_mov_b32 s10, s2
|
||||
; VI-NEXT: s_mov_b32 s11, s3
|
||||
; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0
|
||||
; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0
|
||||
; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: s_mov_b32 s0, s4
|
||||
; VI-NEXT: s_mov_b32 s1, s5
|
||||
; VI-NEXT: s_waitcnt vmcnt(1)
|
||||
; VI-NEXT: v_max_f16_e32 v0, v0, v0
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_max_f16_e32 v1, v1, v1
|
||||
; VI-NEXT: v_min_f16_e32 v0, v0, v1
|
||||
; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
|
||||
@ -75,13 +75,13 @@ define amdgpu_kernel void @minnum_f16_ieee(
|
||||
; GFX9-NEXT: s_mov_b32 s15, s3
|
||||
; GFX9-NEXT: s_mov_b32 s10, s2
|
||||
; GFX9-NEXT: s_mov_b32 s11, s3
|
||||
; GFX9-NEXT: buffer_load_ushort v0, off, s[12:15], 0
|
||||
; GFX9-NEXT: buffer_load_ushort v1, off, s[8:11], 0
|
||||
; GFX9-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_mov_b32 s0, s4
|
||||
; GFX9-NEXT: s_mov_b32 s1, s5
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(1)
|
||||
; GFX9-NEXT: v_max_f16_e32 v0, v0, v0
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_max_f16_e32 v1, v1, v1
|
||||
; GFX9-NEXT: v_min_f16_e32 v0, v0, v1
|
||||
; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0
|
||||
|
@ -536,13 +536,13 @@ entry:
|
||||
; GCN: s_waitcnt
|
||||
; GFX900-MUBUFF: buffer_load_short_d16_hi v0, off, s[0:3], 0 offset:4094{{$}}
|
||||
; GFX900-FLATSCR: s_movk_i32 [[SOFF:[^,]+]], 0xffe
|
||||
; GFX900-FLATSCR: scratch_load_short_d16_hi v0, off, [[SOFF]]{{$}}
|
||||
; GFX900-FLATSCR: scratch_load_short_d16_hi v0, off, [[SOFF]] glc{{$}}
|
||||
; GFX900: s_waitcnt
|
||||
; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
|
||||
; GFX900-NEXT: s_waitcnt
|
||||
; GFX900-NEXT: s_setpc_b64
|
||||
|
||||
; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], 0 offset:4094{{$}}
|
||||
; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], 0 offset:4094 glc{{$}}
|
||||
define void @load_private_hi_v2i16_reglo_vreg_nooff(i16 addrspace(5)* byval(i16) %in, i16 %reg) #0 {
|
||||
entry:
|
||||
%load = load volatile i16, i16 addrspace(5)* inttoptr (i32 4094 to i16 addrspace(5)*)
|
||||
@ -554,15 +554,15 @@ entry:
|
||||
|
||||
; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg_nooff:
|
||||
; GCN: s_waitcnt
|
||||
; GFX900-MUBUF-NEXT: buffer_load_short_d16_hi v1, off, s[0:3], 0 offset:4094{{$}}
|
||||
; GFX900-MUBUF-NEXT: buffer_load_short_d16_hi v1, off, s[0:3], 0 offset:4094 glc{{$}}
|
||||
; GFX900-FLATSCR-NEXT: s_movk_i32 [[SOFF:[^,]+]], 0xffe
|
||||
; GFX900-FLATSCR-NEXT: scratch_load_short_d16_hi v1, off, [[SOFF]]{{$}}
|
||||
; GFX900-FLATSCR-NEXT: scratch_load_short_d16_hi v1, off, [[SOFF]] glc{{$}}
|
||||
; GFX900-NEXT: s_waitcnt
|
||||
; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
|
||||
; GFX900-NEXT: s_waitcnt
|
||||
; GFX900-NEXT: s_setpc_b64
|
||||
|
||||
; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], 0 offset:4094{{$}}
|
||||
; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], 0 offset:4094 glc{{$}}
|
||||
define void @load_private_hi_v2f16_reglo_vreg_nooff(half addrspace(5)* %in, half %reg) #0 {
|
||||
entry:
|
||||
%load = load volatile half, half addrspace(5)* inttoptr (i32 4094 to half addrspace(5)*)
|
||||
@ -660,15 +660,15 @@ entry:
|
||||
|
||||
; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_nooff_zexti8:
|
||||
; GCN: s_waitcnt
|
||||
; GFX900-MUBUF-NEXT: buffer_load_ubyte_d16_hi v1, off, s[0:3], 0 offset:4094{{$}}
|
||||
; GFX900-MUBUF-NEXT: buffer_load_ubyte_d16_hi v1, off, s[0:3], 0 offset:4094 glc{{$}}
|
||||
; GFX900-FLATSCR-NEXT: s_movk_i32 [[SOFF:[^,]+]], 0xffe
|
||||
; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16_hi v1, off, [[SOFF]]{{$}}
|
||||
; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16_hi v1, off, [[SOFF]] glc{{$}}
|
||||
; GFX900-NEXT: s_waitcnt
|
||||
; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
|
||||
; GFX900-NEXT: s_waitcnt
|
||||
; GFX900-NEXT: s_setpc_b64
|
||||
|
||||
; NO-D16-HI: buffer_load_ubyte v0, off, s[0:3], 0 offset:4094{{$}}
|
||||
; NO-D16-HI: buffer_load_ubyte v0, off, s[0:3], 0 offset:4094 glc{{$}}
|
||||
define void @load_private_hi_v2i16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in, i16 %reg) #0 {
|
||||
entry:
|
||||
%load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*)
|
||||
@ -681,15 +681,15 @@ entry:
|
||||
|
||||
; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_nooff_sexti8:
|
||||
; GCN: s_waitcnt
|
||||
; GFX900-MUBUF-NEXT: buffer_load_sbyte_d16_hi v1, off, s[0:3], 0 offset:4094{{$}}
|
||||
; GFX900-MUBUF-NEXT: buffer_load_sbyte_d16_hi v1, off, s[0:3], 0 offset:4094 glc{{$}}
|
||||
; GFX900-FLATSCR-NEXT: s_movk_i32 [[SOFF:[^,]+]], 0xffe
|
||||
; GFX900-FLATSCR-NEXT: scratch_load_sbyte_d16_hi v1, off, [[SOFF]]{{$}}
|
||||
; GFX900-FLATSCR-NEXT: scratch_load_sbyte_d16_hi v1, off, [[SOFF]] glc{{$}}
|
||||
; GFX900-NEXT: s_waitcnt
|
||||
; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
|
||||
; GFX900-NEXT: s_waitcnt
|
||||
; GFX900-NEXT: s_setpc_b64
|
||||
|
||||
; NO-D16-HI: buffer_load_sbyte v0, off, s[0:3], 0 offset:4094{{$}}
|
||||
; NO-D16-HI: buffer_load_sbyte v0, off, s[0:3], 0 offset:4094 glc{{$}}
|
||||
define void @load_private_hi_v2i16_reglo_vreg_nooff_sexti8(i8 addrspace(5)* %in, i16 %reg) #0 {
|
||||
entry:
|
||||
%load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*)
|
||||
@ -702,15 +702,15 @@ entry:
|
||||
|
||||
; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg_nooff_zexti8:
|
||||
; GCN: s_waitcnt
|
||||
; GFX900-MUBUF-NEXT: buffer_load_ubyte_d16_hi v1, off, s[0:3], 0 offset:4094{{$}}
|
||||
; GFX900-MUBUF-NEXT: buffer_load_ubyte_d16_hi v1, off, s[0:3], 0 offset:4094 glc{{$}}
|
||||
; GFX900-FLATSCR-NEXT: s_movk_i32 [[SOFF:[^,]+]], 0xffe
|
||||
; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16_hi v1, off, [[SOFF]]{{$}}
|
||||
; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16_hi v1, off, [[SOFF]] glc{{$}}
|
||||
; GFX900-NEXT: s_waitcnt
|
||||
; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
|
||||
; GFX900-NEXT: s_waitcnt
|
||||
; GFX900-NEXT: s_setpc_b64
|
||||
|
||||
; NO-D16-HI: buffer_load_ubyte v0, off, s[0:3], 0 offset:4094{{$}}
|
||||
; NO-D16-HI: buffer_load_ubyte v0, off, s[0:3], 0 offset:4094 glc{{$}}
|
||||
define void @load_private_hi_v2f16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in, half %reg) #0 {
|
||||
entry:
|
||||
%load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*)
|
||||
@ -805,9 +805,13 @@ entry:
|
||||
|
||||
; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_to_offset:
|
||||
; GFX900-MUBUF: buffer_store_dword
|
||||
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-MUBUF-NEXT: buffer_load_short_d16_hi v{{[0-9]+}}, off, s[0:3], s32 offset:4094
|
||||
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-FLATSCR: scratch_store_dword
|
||||
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-FLATSCR-NEXT: scratch_load_short_d16_hi v{{[0-9]+}}, off, s32 offset:4094
|
||||
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
define void @load_private_hi_v2i16_reglo_vreg_to_offset(i16 %reg) #0 {
|
||||
entry:
|
||||
%obj0 = alloca [10 x i32], align 4, addrspace(5)
|
||||
@ -824,9 +828,13 @@ entry:
|
||||
|
||||
; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_sexti8_to_offset:
|
||||
; GFX900-MUBUF: buffer_store_dword
|
||||
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-MUBUF-NEXT: buffer_load_sbyte_d16_hi v{{[0-9]+}}, off, s[0:3], s32 offset:4095
|
||||
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-FLATSCR: scratch_store_dword
|
||||
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-FLATSCR-NEXT: scratch_load_sbyte_d16_hi v{{[0-9]+}}, off, s32 offset:4095
|
||||
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
define void @load_private_hi_v2i16_reglo_vreg_sexti8_to_offset(i16 %reg) #0 {
|
||||
entry:
|
||||
%obj0 = alloca [10 x i32], align 4, addrspace(5)
|
||||
@ -844,9 +852,13 @@ entry:
|
||||
|
||||
; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_zexti8_to_offset:
|
||||
; GFX900-MUBUF: buffer_store_dword
|
||||
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-MUBUF-NEXT: buffer_load_ubyte_d16_hi v{{[0-9]+}}, off, s[0:3], s32 offset:4095
|
||||
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-FLATSCR: scratch_store_dword
|
||||
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16_hi v{{[0-9]+}}, off, s32 offset:4095
|
||||
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
define void @load_private_hi_v2i16_reglo_vreg_zexti8_to_offset(i16 %reg) #0 {
|
||||
entry:
|
||||
%obj0 = alloca [10 x i32], align 4, addrspace(5)
|
||||
@ -997,8 +1009,8 @@ entry:
|
||||
; FIXME: Is there a cost to using the extload over not?
|
||||
; GCN-LABEL: {{^}}load_private_v2i16_split:
|
||||
; GCN: s_waitcnt
|
||||
; GFX900-MUBUF: buffer_load_ushort v0, off, s[0:3], s32{{$}}
|
||||
; GFX900-FLATSCR: scratch_load_ushort v0, off, s32{{$}}
|
||||
; GFX900-MUBUF: buffer_load_ushort v0, off, s[0:3], s32 glc{{$}}
|
||||
; GFX900-FLATSCR: scratch_load_ushort v0, off, s32 glc{{$}}
|
||||
; GFX900-NEXT: s_waitcnt
|
||||
; GFX900-MUBUF-NEXT: buffer_load_short_d16_hi v0, off, s[0:3], s32 offset:2
|
||||
; GFX900-FLATSCR-NEXT: scratch_load_short_d16_hi v0, off, s32 offset:2
|
||||
|
@ -1333,7 +1333,7 @@ define void @load_private_lo_v2i16_reglo_vreg_nooff(i16 addrspace(5)* %in, i32 %
|
||||
; GFX900-MUBUF-LABEL: load_private_lo_v2i16_reglo_vreg_nooff:
|
||||
; GFX900-MUBUF: ; %bb.0: ; %entry
|
||||
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX900-MUBUF-NEXT: buffer_load_short_d16 v1, off, s[0:3], 0 offset:4094
|
||||
; GFX900-MUBUF-NEXT: buffer_load_short_d16 v1, off, s[0:3], 0 offset:4094 glc
|
||||
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v1, off
|
||||
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
@ -1342,9 +1342,9 @@ define void @load_private_lo_v2i16_reglo_vreg_nooff(i16 addrspace(5)* %in, i32 %
|
||||
; GFX906-LABEL: load_private_lo_v2i16_reglo_vreg_nooff:
|
||||
; GFX906: ; %bb.0: ; %entry
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX906-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:4094
|
||||
; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff
|
||||
; GFX906-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:4094 glc
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff
|
||||
; GFX906-NEXT: v_bfi_b32 v0, v2, v0, v1
|
||||
; GFX906-NEXT: global_store_dword v[0:1], v0, off
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0)
|
||||
@ -1353,9 +1353,9 @@ define void @load_private_lo_v2i16_reglo_vreg_nooff(i16 addrspace(5)* %in, i32 %
|
||||
; GFX803-LABEL: load_private_lo_v2i16_reglo_vreg_nooff:
|
||||
; GFX803: ; %bb.0: ; %entry
|
||||
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX803-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:4094
|
||||
; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
|
||||
; GFX803-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:4094 glc
|
||||
; GFX803-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
|
||||
; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
|
||||
; GFX803-NEXT: flat_store_dword v[0:1], v0
|
||||
; GFX803-NEXT: s_waitcnt vmcnt(0)
|
||||
@ -1365,7 +1365,7 @@ define void @load_private_lo_v2i16_reglo_vreg_nooff(i16 addrspace(5)* %in, i32 %
|
||||
; GFX900-FLATSCR: ; %bb.0: ; %entry
|
||||
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX900-FLATSCR-NEXT: s_movk_i32 s0, 0xffe
|
||||
; GFX900-FLATSCR-NEXT: scratch_load_short_d16 v1, off, s0
|
||||
; GFX900-FLATSCR-NEXT: scratch_load_short_d16 v1, off, s0 glc
|
||||
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v1, off
|
||||
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
@ -1382,7 +1382,7 @@ define void @load_private_lo_v2i16_reghi_vreg_nooff(i16 addrspace(5)* %in, i32 %
|
||||
; GFX900-MUBUF-LABEL: load_private_lo_v2i16_reghi_vreg_nooff:
|
||||
; GFX900-MUBUF: ; %bb.0: ; %entry
|
||||
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX900-MUBUF-NEXT: buffer_load_short_d16 v1, off, s[0:3], 0 offset:4094
|
||||
; GFX900-MUBUF-NEXT: buffer_load_short_d16 v1, off, s[0:3], 0 offset:4094 glc
|
||||
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v1, off
|
||||
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
@ -1391,9 +1391,9 @@ define void @load_private_lo_v2i16_reghi_vreg_nooff(i16 addrspace(5)* %in, i32 %
|
||||
; GFX906-LABEL: load_private_lo_v2i16_reghi_vreg_nooff:
|
||||
; GFX906: ; %bb.0: ; %entry
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX906-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:4094
|
||||
; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff
|
||||
; GFX906-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:4094 glc
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff
|
||||
; GFX906-NEXT: v_bfi_b32 v0, v2, v0, v1
|
||||
; GFX906-NEXT: global_store_dword v[0:1], v0, off
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0)
|
||||
@ -1402,9 +1402,9 @@ define void @load_private_lo_v2i16_reghi_vreg_nooff(i16 addrspace(5)* %in, i32 %
|
||||
; GFX803-LABEL: load_private_lo_v2i16_reghi_vreg_nooff:
|
||||
; GFX803: ; %bb.0: ; %entry
|
||||
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX803-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:4094
|
||||
; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
|
||||
; GFX803-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:4094 glc
|
||||
; GFX803-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
|
||||
; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
|
||||
; GFX803-NEXT: flat_store_dword v[0:1], v0
|
||||
; GFX803-NEXT: s_waitcnt vmcnt(0)
|
||||
@ -1414,7 +1414,7 @@ define void @load_private_lo_v2i16_reghi_vreg_nooff(i16 addrspace(5)* %in, i32 %
|
||||
; GFX900-FLATSCR: ; %bb.0: ; %entry
|
||||
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX900-FLATSCR-NEXT: s_movk_i32 s0, 0xffe
|
||||
; GFX900-FLATSCR-NEXT: scratch_load_short_d16 v1, off, s0
|
||||
; GFX900-FLATSCR-NEXT: scratch_load_short_d16 v1, off, s0 glc
|
||||
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v1, off
|
||||
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
@ -1431,7 +1431,7 @@ define void @load_private_lo_v2f16_reglo_vreg_nooff(half addrspace(5)* %in, i32
|
||||
; GFX900-MUBUF-LABEL: load_private_lo_v2f16_reglo_vreg_nooff:
|
||||
; GFX900-MUBUF: ; %bb.0: ; %entry
|
||||
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX900-MUBUF-NEXT: buffer_load_short_d16 v1, off, s[0:3], 0 offset:4094
|
||||
; GFX900-MUBUF-NEXT: buffer_load_short_d16 v1, off, s[0:3], 0 offset:4094 glc
|
||||
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v1, off
|
||||
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
@ -1440,9 +1440,9 @@ define void @load_private_lo_v2f16_reglo_vreg_nooff(half addrspace(5)* %in, i32
|
||||
; GFX906-LABEL: load_private_lo_v2f16_reglo_vreg_nooff:
|
||||
; GFX906: ; %bb.0: ; %entry
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX906-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:4094
|
||||
; GFX906-NEXT: v_lshrrev_b32_e32 v1, 16, v1
|
||||
; GFX906-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:4094 glc
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX906-NEXT: v_lshrrev_b32_e32 v1, 16, v1
|
||||
; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0
|
||||
; GFX906-NEXT: global_store_dword v[0:1], v0, off
|
||||
@ -1452,9 +1452,9 @@ define void @load_private_lo_v2f16_reglo_vreg_nooff(half addrspace(5)* %in, i32
|
||||
; GFX803-LABEL: load_private_lo_v2f16_reglo_vreg_nooff:
|
||||
; GFX803: ; %bb.0: ; %entry
|
||||
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX803-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:4094
|
||||
; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
|
||||
; GFX803-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:4094 glc
|
||||
; GFX803-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
|
||||
; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
|
||||
; GFX803-NEXT: flat_store_dword v[0:1], v0
|
||||
; GFX803-NEXT: s_waitcnt vmcnt(0)
|
||||
@ -1464,7 +1464,7 @@ define void @load_private_lo_v2f16_reglo_vreg_nooff(half addrspace(5)* %in, i32
|
||||
; GFX900-FLATSCR: ; %bb.0: ; %entry
|
||||
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX900-FLATSCR-NEXT: s_movk_i32 s0, 0xffe
|
||||
; GFX900-FLATSCR-NEXT: scratch_load_short_d16 v1, off, s0
|
||||
; GFX900-FLATSCR-NEXT: scratch_load_short_d16 v1, off, s0 glc
|
||||
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v1, off
|
||||
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
@ -1582,7 +1582,7 @@ define void @load_private_lo_v2i16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in,
|
||||
; GFX900-MUBUF-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_zexti8:
|
||||
; GFX900-MUBUF: ; %bb.0: ; %entry
|
||||
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX900-MUBUF-NEXT: buffer_load_ubyte_d16 v1, off, s[0:3], 0 offset:4094
|
||||
; GFX900-MUBUF-NEXT: buffer_load_ubyte_d16 v1, off, s[0:3], 0 offset:4094 glc
|
||||
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v1, off
|
||||
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
@ -1591,9 +1591,9 @@ define void @load_private_lo_v2i16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in,
|
||||
; GFX906-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_zexti8:
|
||||
; GFX906: ; %bb.0: ; %entry
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX906-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:4094
|
||||
; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff
|
||||
; GFX906-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:4094 glc
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff
|
||||
; GFX906-NEXT: v_bfi_b32 v0, v2, v0, v1
|
||||
; GFX906-NEXT: global_store_dword v[0:1], v0, off
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0)
|
||||
@ -1603,9 +1603,9 @@ define void @load_private_lo_v2i16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in,
|
||||
; GFX803: ; %bb.0: ; %entry
|
||||
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX803-NEXT: v_lshrrev_b32_e32 v0, 16, v1
|
||||
; GFX803-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 offset:4094
|
||||
; GFX803-NEXT: s_mov_b32 s4, 0x5040c00
|
||||
; GFX803-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 offset:4094 glc
|
||||
; GFX803-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX803-NEXT: s_mov_b32 s4, 0x5040c00
|
||||
; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4
|
||||
; GFX803-NEXT: flat_store_dword v[0:1], v0
|
||||
; GFX803-NEXT: s_waitcnt vmcnt(0)
|
||||
@ -1615,7 +1615,7 @@ define void @load_private_lo_v2i16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in,
|
||||
; GFX900-FLATSCR: ; %bb.0: ; %entry
|
||||
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX900-FLATSCR-NEXT: s_movk_i32 s0, 0xffe
|
||||
; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16 v1, off, s0
|
||||
; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16 v1, off, s0 glc
|
||||
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v1, off
|
||||
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
@ -1633,7 +1633,7 @@ define void @load_private_lo_v2i16_reglo_vreg_nooff_sexti8(i8 addrspace(5)* %in,
|
||||
; GFX900-MUBUF-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_sexti8:
|
||||
; GFX900-MUBUF: ; %bb.0: ; %entry
|
||||
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX900-MUBUF-NEXT: buffer_load_sbyte_d16 v1, off, s[0:3], 0 offset:4094
|
||||
; GFX900-MUBUF-NEXT: buffer_load_sbyte_d16 v1, off, s[0:3], 0 offset:4094 glc
|
||||
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v1, off
|
||||
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
@ -1642,9 +1642,9 @@ define void @load_private_lo_v2i16_reglo_vreg_nooff_sexti8(i8 addrspace(5)* %in,
|
||||
; GFX906-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_sexti8:
|
||||
; GFX906: ; %bb.0: ; %entry
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX906-NEXT: buffer_load_sbyte v0, off, s[0:3], 0 offset:4094
|
||||
; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff
|
||||
; GFX906-NEXT: buffer_load_sbyte v0, off, s[0:3], 0 offset:4094 glc
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff
|
||||
; GFX906-NEXT: v_bfi_b32 v0, v2, v0, v1
|
||||
; GFX906-NEXT: global_store_dword v[0:1], v0, off
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0)
|
||||
@ -1653,9 +1653,9 @@ define void @load_private_lo_v2i16_reglo_vreg_nooff_sexti8(i8 addrspace(5)* %in,
|
||||
; GFX803-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_sexti8:
|
||||
; GFX803: ; %bb.0: ; %entry
|
||||
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX803-NEXT: buffer_load_sbyte v0, off, s[0:3], 0 offset:4094
|
||||
; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
|
||||
; GFX803-NEXT: buffer_load_sbyte v0, off, s[0:3], 0 offset:4094 glc
|
||||
; GFX803-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
|
||||
; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX803-NEXT: flat_store_dword v[0:1], v0
|
||||
; GFX803-NEXT: s_waitcnt vmcnt(0)
|
||||
@ -1665,7 +1665,7 @@ define void @load_private_lo_v2i16_reglo_vreg_nooff_sexti8(i8 addrspace(5)* %in,
|
||||
; GFX900-FLATSCR: ; %bb.0: ; %entry
|
||||
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX900-FLATSCR-NEXT: s_movk_i32 s0, 0xffe
|
||||
; GFX900-FLATSCR-NEXT: scratch_load_sbyte_d16 v1, off, s0
|
||||
; GFX900-FLATSCR-NEXT: scratch_load_sbyte_d16 v1, off, s0 glc
|
||||
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v1, off
|
||||
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
@ -1683,7 +1683,7 @@ define void @load_private_lo_v2f16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in,
|
||||
; GFX900-MUBUF-LABEL: load_private_lo_v2f16_reglo_vreg_nooff_zexti8:
|
||||
; GFX900-MUBUF: ; %bb.0: ; %entry
|
||||
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX900-MUBUF-NEXT: buffer_load_ubyte_d16 v1, off, s[0:3], 0 offset:4094
|
||||
; GFX900-MUBUF-NEXT: buffer_load_ubyte_d16 v1, off, s[0:3], 0 offset:4094 glc
|
||||
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v1, off
|
||||
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
@ -1692,9 +1692,9 @@ define void @load_private_lo_v2f16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in,
|
||||
; GFX906-LABEL: load_private_lo_v2f16_reglo_vreg_nooff_zexti8:
|
||||
; GFX906: ; %bb.0: ; %entry
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX906-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:4094
|
||||
; GFX906-NEXT: v_lshrrev_b32_e32 v1, 16, v1
|
||||
; GFX906-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:4094 glc
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX906-NEXT: v_lshrrev_b32_e32 v1, 16, v1
|
||||
; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0
|
||||
; GFX906-NEXT: global_store_dword v[0:1], v0, off
|
||||
@ -1705,9 +1705,9 @@ define void @load_private_lo_v2f16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in,
|
||||
; GFX803: ; %bb.0: ; %entry
|
||||
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX803-NEXT: v_lshrrev_b32_e32 v0, 16, v1
|
||||
; GFX803-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 offset:4094
|
||||
; GFX803-NEXT: s_mov_b32 s4, 0x5040c00
|
||||
; GFX803-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 offset:4094 glc
|
||||
; GFX803-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX803-NEXT: s_mov_b32 s4, 0x5040c00
|
||||
; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4
|
||||
; GFX803-NEXT: flat_store_dword v[0:1], v0
|
||||
; GFX803-NEXT: s_waitcnt vmcnt(0)
|
||||
@ -1717,7 +1717,7 @@ define void @load_private_lo_v2f16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in,
|
||||
; GFX900-FLATSCR: ; %bb.0: ; %entry
|
||||
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX900-FLATSCR-NEXT: s_movk_i32 s0, 0xffe
|
||||
; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16 v1, off, s0
|
||||
; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16 v1, off, s0 glc
|
||||
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v1, off
|
||||
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
@ -1914,7 +1914,8 @@ define void @load_private_lo_v2i16_reglo_vreg_to_offset(i32 %reg) #0 {
|
||||
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX900-MUBUF-NEXT: v_mov_b32_e32 v1, 0x7b
|
||||
; GFX900-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s32
|
||||
; GFX900-MUBUF-NEXT: buffer_load_short_d16 v0, off, s[0:3], s32 offset:4094
|
||||
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-MUBUF-NEXT: buffer_load_short_d16 v0, off, s[0:3], s32 offset:4094 glc
|
||||
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v0, off
|
||||
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
@ -1925,9 +1926,10 @@ define void @load_private_lo_v2i16_reglo_vreg_to_offset(i32 %reg) #0 {
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX906-NEXT: v_mov_b32_e32 v1, 0x7b
|
||||
; GFX906-NEXT: buffer_store_dword v1, off, s[0:3], s32
|
||||
; GFX906-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:4094
|
||||
; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX906-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:4094 glc
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff
|
||||
; GFX906-NEXT: v_bfi_b32 v0, v2, v1, v0
|
||||
; GFX906-NEXT: global_store_dword v[0:1], v0, off
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0)
|
||||
@ -1938,9 +1940,10 @@ define void @load_private_lo_v2i16_reglo_vreg_to_offset(i32 %reg) #0 {
|
||||
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX803-NEXT: v_mov_b32_e32 v1, 0x7b
|
||||
; GFX803-NEXT: buffer_store_dword v1, off, s[0:3], s32
|
||||
; GFX803-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:4094
|
||||
; GFX803-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
|
||||
; GFX803-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX803-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:4094 glc
|
||||
; GFX803-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX803-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
|
||||
; GFX803-NEXT: v_or_b32_e32 v0, v1, v0
|
||||
; GFX803-NEXT: flat_store_dword v[0:1], v0
|
||||
; GFX803-NEXT: s_waitcnt vmcnt(0)
|
||||
@ -1951,7 +1954,8 @@ define void @load_private_lo_v2i16_reglo_vreg_to_offset(i32 %reg) #0 {
|
||||
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX900-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x7b
|
||||
; GFX900-FLATSCR-NEXT: scratch_store_dword off, v1, s32
|
||||
; GFX900-FLATSCR-NEXT: scratch_load_short_d16 v0, off, s32 offset:4094
|
||||
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-FLATSCR-NEXT: scratch_load_short_d16 v0, off, s32 offset:4094 glc
|
||||
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v0, off
|
||||
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
@ -1975,7 +1979,8 @@ define void @load_private_lo_v2i16_reglo_vreg_sexti8_to_offset(i32 %reg) #0 {
|
||||
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX900-MUBUF-NEXT: v_mov_b32_e32 v1, 0x7b
|
||||
; GFX900-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s32
|
||||
; GFX900-MUBUF-NEXT: buffer_load_sbyte_d16 v0, off, s[0:3], s32 offset:4095
|
||||
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-MUBUF-NEXT: buffer_load_sbyte_d16 v0, off, s[0:3], s32 offset:4095 glc
|
||||
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v0, off
|
||||
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
@ -1986,9 +1991,10 @@ define void @load_private_lo_v2i16_reglo_vreg_sexti8_to_offset(i32 %reg) #0 {
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX906-NEXT: v_mov_b32_e32 v1, 0x7b
|
||||
; GFX906-NEXT: buffer_store_dword v1, off, s[0:3], s32
|
||||
; GFX906-NEXT: buffer_load_sbyte v1, off, s[0:3], s32 offset:4095
|
||||
; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX906-NEXT: buffer_load_sbyte v1, off, s[0:3], s32 offset:4095 glc
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff
|
||||
; GFX906-NEXT: v_bfi_b32 v0, v2, v1, v0
|
||||
; GFX906-NEXT: global_store_dword v[0:1], v0, off
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0)
|
||||
@ -1999,9 +2005,10 @@ define void @load_private_lo_v2i16_reglo_vreg_sexti8_to_offset(i32 %reg) #0 {
|
||||
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX803-NEXT: v_mov_b32_e32 v1, 0x7b
|
||||
; GFX803-NEXT: buffer_store_dword v1, off, s[0:3], s32
|
||||
; GFX803-NEXT: buffer_load_sbyte v1, off, s[0:3], s32 offset:4095
|
||||
; GFX803-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
|
||||
; GFX803-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX803-NEXT: buffer_load_sbyte v1, off, s[0:3], s32 offset:4095 glc
|
||||
; GFX803-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX803-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
|
||||
; GFX803-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX803-NEXT: flat_store_dword v[0:1], v0
|
||||
; GFX803-NEXT: s_waitcnt vmcnt(0)
|
||||
@ -2012,7 +2019,8 @@ define void @load_private_lo_v2i16_reglo_vreg_sexti8_to_offset(i32 %reg) #0 {
|
||||
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX900-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x7b
|
||||
; GFX900-FLATSCR-NEXT: scratch_store_dword off, v1, s32
|
||||
; GFX900-FLATSCR-NEXT: scratch_load_sbyte_d16 v0, off, s32 offset:4095
|
||||
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-FLATSCR-NEXT: scratch_load_sbyte_d16 v0, off, s32 offset:4095 glc
|
||||
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v0, off
|
||||
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
@ -2037,7 +2045,8 @@ define void @load_private_lo_v2i16_reglo_vreg_zexti8_to_offset(i32 %reg) #0 {
|
||||
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX900-MUBUF-NEXT: v_mov_b32_e32 v1, 0x7b
|
||||
; GFX900-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s32
|
||||
; GFX900-MUBUF-NEXT: buffer_load_ubyte_d16 v0, off, s[0:3], s32 offset:4095
|
||||
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-MUBUF-NEXT: buffer_load_ubyte_d16 v0, off, s[0:3], s32 offset:4095 glc
|
||||
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v0, off
|
||||
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
@ -2048,9 +2057,10 @@ define void @load_private_lo_v2i16_reglo_vreg_zexti8_to_offset(i32 %reg) #0 {
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX906-NEXT: v_mov_b32_e32 v1, 0x7b
|
||||
; GFX906-NEXT: buffer_store_dword v1, off, s[0:3], s32
|
||||
; GFX906-NEXT: buffer_load_ubyte v1, off, s[0:3], s32 offset:4095
|
||||
; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX906-NEXT: buffer_load_ubyte v1, off, s[0:3], s32 offset:4095 glc
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff
|
||||
; GFX906-NEXT: v_bfi_b32 v0, v2, v1, v0
|
||||
; GFX906-NEXT: global_store_dword v[0:1], v0, off
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0)
|
||||
@ -2061,10 +2071,11 @@ define void @load_private_lo_v2i16_reglo_vreg_zexti8_to_offset(i32 %reg) #0 {
|
||||
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX803-NEXT: v_mov_b32_e32 v1, 0x7b
|
||||
; GFX803-NEXT: buffer_store_dword v1, off, s[0:3], s32
|
||||
; GFX803-NEXT: buffer_load_ubyte v1, off, s[0:3], s32 offset:4095
|
||||
; GFX803-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX803-NEXT: buffer_load_ubyte v1, off, s[0:3], s32 offset:4095 glc
|
||||
; GFX803-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX803-NEXT: v_lshrrev_b32_e32 v0, 16, v0
|
||||
; GFX803-NEXT: s_mov_b32 s4, 0x5040c00
|
||||
; GFX803-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4
|
||||
; GFX803-NEXT: flat_store_dword v[0:1], v0
|
||||
; GFX803-NEXT: s_waitcnt vmcnt(0)
|
||||
@ -2075,7 +2086,8 @@ define void @load_private_lo_v2i16_reglo_vreg_zexti8_to_offset(i32 %reg) #0 {
|
||||
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX900-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x7b
|
||||
; GFX900-FLATSCR-NEXT: scratch_store_dword off, v1, s32
|
||||
; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16 v0, off, s32 offset:4095
|
||||
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16 v0, off, s32 offset:4095 glc
|
||||
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v0, off
|
||||
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
@ -2100,7 +2112,8 @@ define void @load_private_lo_v2f16_reglo_vreg_sexti8_to_offset(i32 %reg) #0 {
|
||||
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX900-MUBUF-NEXT: v_mov_b32_e32 v1, 0x7b
|
||||
; GFX900-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s32
|
||||
; GFX900-MUBUF-NEXT: buffer_load_sbyte_d16 v0, off, s[0:3], s32 offset:4095
|
||||
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-MUBUF-NEXT: buffer_load_sbyte_d16 v0, off, s[0:3], s32 offset:4095 glc
|
||||
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v0, off
|
||||
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
@ -2111,9 +2124,10 @@ define void @load_private_lo_v2f16_reglo_vreg_sexti8_to_offset(i32 %reg) #0 {
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX906-NEXT: v_mov_b32_e32 v1, 0x7b
|
||||
; GFX906-NEXT: buffer_store_dword v1, off, s[0:3], s32
|
||||
; GFX906-NEXT: buffer_load_sbyte v1, off, s[0:3], s32 offset:4095
|
||||
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v0
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX906-NEXT: buffer_load_sbyte v1, off, s[0:3], s32 offset:4095 glc
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v0
|
||||
; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v1
|
||||
; GFX906-NEXT: v_lshl_or_b32 v0, v0, 16, v1
|
||||
; GFX906-NEXT: global_store_dword v[0:1], v0, off
|
||||
@ -2125,9 +2139,10 @@ define void @load_private_lo_v2f16_reglo_vreg_sexti8_to_offset(i32 %reg) #0 {
|
||||
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX803-NEXT: v_mov_b32_e32 v1, 0x7b
|
||||
; GFX803-NEXT: buffer_store_dword v1, off, s[0:3], s32
|
||||
; GFX803-NEXT: buffer_load_sbyte v1, off, s[0:3], s32 offset:4095
|
||||
; GFX803-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
|
||||
; GFX803-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX803-NEXT: buffer_load_sbyte v1, off, s[0:3], s32 offset:4095 glc
|
||||
; GFX803-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX803-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
|
||||
; GFX803-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX803-NEXT: flat_store_dword v[0:1], v0
|
||||
; GFX803-NEXT: s_waitcnt vmcnt(0)
|
||||
@ -2138,7 +2153,8 @@ define void @load_private_lo_v2f16_reglo_vreg_sexti8_to_offset(i32 %reg) #0 {
|
||||
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX900-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x7b
|
||||
; GFX900-FLATSCR-NEXT: scratch_store_dword off, v1, s32
|
||||
; GFX900-FLATSCR-NEXT: scratch_load_sbyte_d16 v0, off, s32 offset:4095
|
||||
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-FLATSCR-NEXT: scratch_load_sbyte_d16 v0, off, s32 offset:4095 glc
|
||||
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v0, off
|
||||
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
@ -2164,7 +2180,8 @@ define void @load_private_lo_v2f16_reglo_vreg_zexti8_to_offset(i32 %reg) #0 {
|
||||
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX900-MUBUF-NEXT: v_mov_b32_e32 v1, 0x7b
|
||||
; GFX900-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s32
|
||||
; GFX900-MUBUF-NEXT: buffer_load_ubyte_d16 v0, off, s[0:3], s32 offset:4095
|
||||
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-MUBUF-NEXT: buffer_load_ubyte_d16 v0, off, s[0:3], s32 offset:4095 glc
|
||||
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v0, off
|
||||
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
@ -2175,9 +2192,10 @@ define void @load_private_lo_v2f16_reglo_vreg_zexti8_to_offset(i32 %reg) #0 {
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX906-NEXT: v_mov_b32_e32 v1, 0x7b
|
||||
; GFX906-NEXT: buffer_store_dword v1, off, s[0:3], s32
|
||||
; GFX906-NEXT: buffer_load_ubyte v1, off, s[0:3], s32 offset:4095
|
||||
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v0
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX906-NEXT: buffer_load_ubyte v1, off, s[0:3], s32 offset:4095 glc
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v0
|
||||
; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v1
|
||||
; GFX906-NEXT: v_lshl_or_b32 v0, v0, 16, v1
|
||||
; GFX906-NEXT: global_store_dword v[0:1], v0, off
|
||||
@ -2189,10 +2207,11 @@ define void @load_private_lo_v2f16_reglo_vreg_zexti8_to_offset(i32 %reg) #0 {
|
||||
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX803-NEXT: v_mov_b32_e32 v1, 0x7b
|
||||
; GFX803-NEXT: buffer_store_dword v1, off, s[0:3], s32
|
||||
; GFX803-NEXT: buffer_load_ubyte v1, off, s[0:3], s32 offset:4095
|
||||
; GFX803-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX803-NEXT: buffer_load_ubyte v1, off, s[0:3], s32 offset:4095 glc
|
||||
; GFX803-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX803-NEXT: v_lshrrev_b32_e32 v0, 16, v0
|
||||
; GFX803-NEXT: s_mov_b32 s4, 0x5040c00
|
||||
; GFX803-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4
|
||||
; GFX803-NEXT: flat_store_dword v[0:1], v0
|
||||
; GFX803-NEXT: s_waitcnt vmcnt(0)
|
||||
@ -2203,7 +2222,8 @@ define void @load_private_lo_v2f16_reglo_vreg_zexti8_to_offset(i32 %reg) #0 {
|
||||
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX900-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x7b
|
||||
; GFX900-FLATSCR-NEXT: scratch_store_dword off, v1, s32
|
||||
; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16 v0, off, s32 offset:4095
|
||||
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16 v0, off, s32 offset:4095 glc
|
||||
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v0, off
|
||||
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
|
@ -30,27 +30,32 @@ define amdgpu_kernel void @local_stack_offset_uses_sp(i64 addrspace(1)* %out, i8
|
||||
; MUBUF-NEXT: v_mov_b32_e32 v3, 0x2000
|
||||
; MUBUF-NEXT: s_mov_b32 s6, 0
|
||||
; MUBUF-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
|
||||
; MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
; MUBUF-NEXT: BB0_1: ; %loadstoreloop
|
||||
; MUBUF-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; MUBUF-NEXT: v_add_u32_e32 v3, s6, v1
|
||||
; MUBUF-NEXT: s_add_i32 s6, s6, 1
|
||||
; MUBUF-NEXT: s_cmpk_lt_u32 s6, 0x2120
|
||||
; MUBUF-NEXT: buffer_store_byte v2, v3, s[0:3], 0 offen
|
||||
; MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
; MUBUF-NEXT: s_cbranch_scc1 BB0_1
|
||||
; MUBUF-NEXT: ; %bb.2: ; %split
|
||||
; MUBUF-NEXT: v_mov_b32_e32 v1, 0x3000
|
||||
; MUBUF-NEXT: v_add_u32_e32 v1, 0x20d0, v1
|
||||
; MUBUF-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen
|
||||
; MUBUF-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:4
|
||||
; MUBUF-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen
|
||||
; MUBUF-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen offset:4
|
||||
; MUBUF-NEXT: s_waitcnt vmcnt(1)
|
||||
; MUBUF-NEXT: v_add_co_u32_e32 v0, vcc, v2, v3
|
||||
; MUBUF-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen glc
|
||||
; MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
; MUBUF-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:4 glc
|
||||
; MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
; MUBUF-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen glc
|
||||
; MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
; MUBUF-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen offset:4 glc
|
||||
; MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
; MUBUF-NEXT: v_add_co_u32_e32 v0, vcc, v2, v3
|
||||
; MUBUF-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc
|
||||
; MUBUF-NEXT: v_mov_b32_e32 v2, 0
|
||||
; MUBUF-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; MUBUF-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
|
||||
; MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
; MUBUF-NEXT: s_endpgm
|
||||
;
|
||||
; FLATSCR-LABEL: local_stack_offset_uses_sp:
|
||||
@ -62,25 +67,29 @@ define amdgpu_kernel void @local_stack_offset_uses_sp(i64 addrspace(1)* %out, i8
|
||||
; FLATSCR-NEXT: s_movk_i32 vcc_hi, 0x2000
|
||||
; FLATSCR-NEXT: s_mov_b32 s2, 0
|
||||
; FLATSCR-NEXT: scratch_store_dword off, v0, vcc_hi
|
||||
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
; FLATSCR-NEXT: BB0_1: ; %loadstoreloop
|
||||
; FLATSCR-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; FLATSCR-NEXT: s_add_u32 s3, 0x3000, s2
|
||||
; FLATSCR-NEXT: s_add_i32 s2, s2, 1
|
||||
; FLATSCR-NEXT: s_cmpk_lt_u32 s2, 0x2120
|
||||
; FLATSCR-NEXT: scratch_store_byte off, v0, s3
|
||||
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
; FLATSCR-NEXT: s_cbranch_scc1 BB0_1
|
||||
; FLATSCR-NEXT: ; %bb.2: ; %split
|
||||
; FLATSCR-NEXT: s_movk_i32 s2, 0x2000
|
||||
; FLATSCR-NEXT: s_add_u32 s2, 0x3000, s2
|
||||
; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s2 offset:208
|
||||
; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s2 offset:208 glc
|
||||
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
; FLATSCR-NEXT: s_movk_i32 s2, 0x3000
|
||||
; FLATSCR-NEXT: scratch_load_dwordx2 v[2:3], off, s2 offset:64
|
||||
; FLATSCR-NEXT: scratch_load_dwordx2 v[2:3], off, s2 offset:64 glc
|
||||
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
; FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
|
||||
; FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
|
||||
; FLATSCR-NEXT: v_mov_b32_e32 v2, 0
|
||||
; FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; FLATSCR-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
|
||||
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
; FLATSCR-NEXT: s_endpgm
|
||||
entry:
|
||||
%pin.low = alloca i32, align 8192, addrspace(5)
|
||||
@ -111,26 +120,30 @@ define void @func_local_stack_offset_uses_sp(i64 addrspace(1)* %out, i8 addrspac
|
||||
; MUBUF-NEXT: s_mov_b32 s4, 0
|
||||
; MUBUF-NEXT: s_add_u32 s32, s32, 0x180000
|
||||
; MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], s33
|
||||
; MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
; MUBUF-NEXT: BB1_1: ; %loadstoreloop
|
||||
; MUBUF-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; MUBUF-NEXT: v_add_u32_e32 v5, s4, v3
|
||||
; MUBUF-NEXT: s_add_i32 s4, s4, 1
|
||||
; MUBUF-NEXT: s_cmpk_lt_u32 s4, 0x2120
|
||||
; MUBUF-NEXT: buffer_store_byte v4, v5, s[0:3], 0 offen
|
||||
; MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
; MUBUF-NEXT: s_cbranch_scc1 BB1_1
|
||||
; MUBUF-NEXT: ; %bb.2: ; %split
|
||||
; MUBUF-NEXT: v_lshrrev_b32_e64 v3, 6, s33
|
||||
; MUBUF-NEXT: v_add_u32_e32 v3, 0x1000, v3
|
||||
; MUBUF-NEXT: v_add_u32_e32 v3, 0x20d0, v3
|
||||
; MUBUF-NEXT: buffer_load_dword v4, v3, s[0:3], 0 offen
|
||||
; MUBUF-NEXT: buffer_load_dword v3, v3, s[0:3], 0 offen offset:4
|
||||
; MUBUF-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen
|
||||
; MUBUF-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:4
|
||||
; MUBUF-NEXT: buffer_load_dword v4, v3, s[0:3], 0 offen glc
|
||||
; MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
; MUBUF-NEXT: buffer_load_dword v3, v3, s[0:3], 0 offen offset:4 glc
|
||||
; MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
; MUBUF-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen glc
|
||||
; MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
; MUBUF-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:4 glc
|
||||
; MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
; MUBUF-NEXT: s_sub_u32 s32, s32, 0x180000
|
||||
; MUBUF-NEXT: s_mov_b32 s33, s5
|
||||
; MUBUF-NEXT: s_waitcnt vmcnt(1)
|
||||
; MUBUF-NEXT: v_add_co_u32_e32 v2, vcc, v4, v5
|
||||
; MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
; MUBUF-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v6, vcc
|
||||
; MUBUF-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
|
||||
; MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
@ -146,6 +159,7 @@ define void @func_local_stack_offset_uses_sp(i64 addrspace(1)* %out, i8 addrspac
|
||||
; FLATSCR-NEXT: s_mov_b32 s0, 0
|
||||
; FLATSCR-NEXT: s_add_u32 s32, s32, 0x6000
|
||||
; FLATSCR-NEXT: scratch_store_dword off, v2, s33
|
||||
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
; FLATSCR-NEXT: BB1_1: ; %loadstoreloop
|
||||
; FLATSCR-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; FLATSCR-NEXT: s_add_u32 vcc_hi, s33, 0x1000
|
||||
@ -153,17 +167,19 @@ define void @func_local_stack_offset_uses_sp(i64 addrspace(1)* %out, i8 addrspac
|
||||
; FLATSCR-NEXT: s_add_i32 s0, s0, 1
|
||||
; FLATSCR-NEXT: s_cmpk_lt_u32 s0, 0x2120
|
||||
; FLATSCR-NEXT: scratch_store_byte off, v2, s1
|
||||
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
; FLATSCR-NEXT: s_cbranch_scc1 BB1_1
|
||||
; FLATSCR-NEXT: ; %bb.2: ; %split
|
||||
; FLATSCR-NEXT: s_movk_i32 s0, 0x2000
|
||||
; FLATSCR-NEXT: s_add_u32 s1, s33, 0x1000
|
||||
; FLATSCR-NEXT: s_add_u32 s0, s1, s0
|
||||
; FLATSCR-NEXT: scratch_load_dwordx2 v[2:3], off, s0 offset:208
|
||||
; FLATSCR-NEXT: scratch_load_dwordx2 v[2:3], off, s0 offset:208 glc
|
||||
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
; FLATSCR-NEXT: s_add_u32 s0, s33, 0x1000
|
||||
; FLATSCR-NEXT: scratch_load_dwordx2 v[4:5], off, s0 offset:64
|
||||
; FLATSCR-NEXT: scratch_load_dwordx2 v[4:5], off, s0 offset:64 glc
|
||||
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
; FLATSCR-NEXT: s_sub_u32 s32, s32, 0x6000
|
||||
; FLATSCR-NEXT: s_mov_b32 s33, s2
|
||||
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
; FLATSCR-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
|
||||
; FLATSCR-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc
|
||||
; FLATSCR-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
|
||||
|
@ -17,9 +17,9 @@ declare float @llvm.fmuladd.f32(float, float, float) #0
|
||||
|
||||
; (fadd (fmul x, y), z) -> (fma x, y, z)
|
||||
; FUNC-LABEL: {{^}}combine_to_mad_f32_0:
|
||||
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
|
||||
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
|
||||
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
|
||||
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 glc{{$}}
|
||||
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
|
||||
|
||||
; SI-STD: v_mac_f32_e32 [[C]], [[A]], [[B]]
|
||||
|
||||
@ -52,10 +52,10 @@ define amdgpu_kernel void @combine_to_mad_f32_0(float addrspace(1)* noalias %out
|
||||
|
||||
; (fadd (fmul x, y), z) -> (fma x, y, z)
|
||||
; FUNC-LABEL: {{^}}combine_to_mad_f32_0_2use:
|
||||
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
|
||||
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
|
||||
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
|
||||
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
|
||||
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 glc{{$}}
|
||||
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
|
||||
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12 glc{{$}}
|
||||
|
||||
; SI-STD-DAG: v_mac_f32_e32 [[C]], [[A]], [[B]]
|
||||
; SI-STD-DAG: v_mac_f32_e32 [[D]], [[A]], [[B]]
|
||||
@ -97,9 +97,9 @@ define amdgpu_kernel void @combine_to_mad_f32_0_2use(float addrspace(1)* noalias
|
||||
|
||||
; (fadd x, (fmul y, z)) -> (fma y, z, x)
|
||||
; FUNC-LABEL: {{^}}combine_to_mad_f32_1:
|
||||
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
|
||||
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
|
||||
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
|
||||
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 glc{{$}}
|
||||
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
|
||||
|
||||
; SI-STD: v_mac_f32_e32 [[C]], [[A]], [[B]]
|
||||
; SI-DENORM-FASTFMAF: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]]
|
||||
@ -128,9 +128,9 @@ define amdgpu_kernel void @combine_to_mad_f32_1(float addrspace(1)* noalias %out
|
||||
|
||||
; (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
|
||||
; FUNC-LABEL: {{^}}combine_to_mad_fsub_0_f32:
|
||||
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
|
||||
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
|
||||
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
|
||||
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 glc{{$}}
|
||||
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
|
||||
|
||||
; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], -[[C]]
|
||||
; SI-DENORM-FASTFMAF: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], -[[C]]
|
||||
@ -158,10 +158,10 @@ define amdgpu_kernel void @combine_to_mad_fsub_0_f32(float addrspace(1)* noalias
|
||||
|
||||
; (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
|
||||
; FUNC-LABEL: {{^}}combine_to_mad_fsub_0_f32_2use:
|
||||
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
|
||||
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
|
||||
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
|
||||
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
|
||||
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 glc{{$}}
|
||||
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
|
||||
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12 glc{{$}}
|
||||
|
||||
; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], -[[C]]
|
||||
; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]]
|
||||
@ -200,9 +200,9 @@ define amdgpu_kernel void @combine_to_mad_fsub_0_f32_2use(float addrspace(1)* no
|
||||
|
||||
; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
|
||||
; FUNC-LABEL: {{^}}combine_to_mad_fsub_1_f32:
|
||||
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
|
||||
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
|
||||
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
|
||||
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 glc{{$}}
|
||||
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
|
||||
|
||||
; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], [[C]]
|
||||
; SI-DENORM-FASTFMAF: v_fma_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], [[C]]
|
||||
@ -230,10 +230,10 @@ define amdgpu_kernel void @combine_to_mad_fsub_1_f32(float addrspace(1)* noalias
|
||||
|
||||
; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
|
||||
; FUNC-LABEL: {{^}}combine_to_mad_fsub_1_f32_2use:
|
||||
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
|
||||
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
|
||||
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
|
||||
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
|
||||
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 glc{{$}}
|
||||
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
|
||||
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12 glc{{$}}
|
||||
|
||||
; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], [[C]]
|
||||
; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], [[D]]
|
||||
@ -272,9 +272,9 @@ define amdgpu_kernel void @combine_to_mad_fsub_1_f32_2use(float addrspace(1)* no
|
||||
|
||||
; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
|
||||
; FUNC-LABEL: {{^}}combine_to_mad_fsub_2_f32:
|
||||
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
|
||||
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
|
||||
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
|
||||
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 glc{{$}}
|
||||
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
|
||||
|
||||
; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], -[[B]], -[[C]]
|
||||
|
||||
@ -305,10 +305,10 @@ define amdgpu_kernel void @combine_to_mad_fsub_2_f32(float addrspace(1)* noalias
|
||||
|
||||
; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
|
||||
; FUNC-LABEL: {{^}}combine_to_mad_fsub_2_f32_2uses_neg:
|
||||
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
|
||||
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
|
||||
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
|
||||
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
|
||||
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 glc{{$}}
|
||||
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
|
||||
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12 glc{{$}}
|
||||
|
||||
; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], [[A]], -[[B]], -[[C]]
|
||||
; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], [[A]], -[[B]], -[[D]]
|
||||
@ -349,10 +349,10 @@ define amdgpu_kernel void @combine_to_mad_fsub_2_f32_2uses_neg(float addrspace(1
|
||||
|
||||
; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
|
||||
; FUNC-LABEL: {{^}}combine_to_mad_fsub_2_f32_2uses_mul:
|
||||
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
|
||||
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
|
||||
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
|
||||
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
|
||||
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 glc{{$}}
|
||||
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
|
||||
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12 glc{{$}}
|
||||
|
||||
; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]]
|
||||
; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]]
|
||||
@ -394,11 +394,11 @@ define amdgpu_kernel void @combine_to_mad_fsub_2_f32_2uses_mul(float addrspace(1
|
||||
; fold (fsub (fma x, y, (fmul u, v)), z) -> (fma x, y (fma u, v, (fneg z)))
|
||||
|
||||
; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_0_f32:
|
||||
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
|
||||
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
|
||||
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
|
||||
; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
|
||||
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
|
||||
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 glc{{$}}
|
||||
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
|
||||
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12 glc{{$}}
|
||||
; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16 glc{{$}}
|
||||
|
||||
; SI-STD: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
|
||||
; SI-STD: v_fma_f32 [[TMP1:v[0-9]+]], [[A]], [[B]], [[TMP0]]
|
||||
@ -436,11 +436,11 @@ define amdgpu_kernel void @aggressive_combine_to_mad_fsub_0_f32(float addrspace(
|
||||
; -> (fma (fneg y), z, (fma (fneg u), v, x))
|
||||
|
||||
; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_1_f32:
|
||||
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
|
||||
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
|
||||
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
|
||||
; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
|
||||
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
|
||||
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 glc{{$}}
|
||||
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
|
||||
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12 glc{{$}}
|
||||
; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16 glc{{$}}
|
||||
|
||||
; SI-STD: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
|
||||
; SI-STD: v_fma_f32 [[TMP1:v[0-9]+]], [[B]], [[C]], [[TMP0]]
|
||||
@ -478,11 +478,11 @@ define amdgpu_kernel void @aggressive_combine_to_mad_fsub_1_f32(float addrspace(
|
||||
; fold (fsub (fma x, y, (fmul u, v)), z) -> (fma x, y (fma u, v, (fneg z)))
|
||||
|
||||
; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_2_f32:
|
||||
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
|
||||
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
|
||||
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
|
||||
; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
|
||||
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
|
||||
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 glc{{$}}
|
||||
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
|
||||
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12 glc{{$}}
|
||||
; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16 glc{{$}}
|
||||
|
||||
; SI-STD-SAFE: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
|
||||
; SI-STD-SAFE: v_mac_f32_e32 [[TMP0]], [[A]], [[B]]
|
||||
@ -529,11 +529,11 @@ define amdgpu_kernel void @aggressive_combine_to_mad_fsub_2_f32(float addrspace(
|
||||
; -> (fmuladd (fneg y), z, (fmuladd (fneg u), v, x))
|
||||
|
||||
; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_3_f32:
|
||||
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
|
||||
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
|
||||
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
|
||||
; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
|
||||
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
|
||||
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 glc{{$}}
|
||||
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
|
||||
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12 glc{{$}}
|
||||
; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16 glc{{$}}
|
||||
|
||||
; SI-STD-SAFE: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
|
||||
; SI-STD-SAFE: v_mac_f32_e32 [[TMP0]], [[B]], [[C]]
|
||||
|
@ -120,8 +120,8 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt(half
|
||||
; GCN: s_waitcnt
|
||||
; GFX9-NEXT: v_mad_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]{{$}}
|
||||
; GFX9-NEXT: global_store_short v{{\[[0-9]+:[0-9]+\]}}, v3
|
||||
; GFX9-NEXT: v_mad_mixhi_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp{{$}}
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_mad_mixhi_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp{{$}}
|
||||
; GFX9-NEXT: s_setpc_b64
|
||||
define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use(half %src0, half %src1, half %src2) #0 {
|
||||
%src0.ext = fpext half %src0 to float
|
||||
|
@ -41,7 +41,7 @@ define amdgpu_kernel void @madak_f32(float addrspace(1)* noalias %out, float add
|
||||
; GCN-LABEL: {{^}}madak_2_use_f32:
|
||||
; GFX9: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
|
||||
; GFX10: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
|
||||
; GFX6-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; GFX6-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
|
||||
; GFX6-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
|
||||
; GFX6-DAG: buffer_load_dword [[VC:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8
|
||||
; GFX8_9_10: {{flat|global}}_load_dword [[VA:v[0-9]+]],
|
||||
|
@ -9,7 +9,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
|
||||
declare float @llvm.fabs.f32(float) nounwind readnone
|
||||
|
||||
; GCN-LABEL: {{^}}madmk_f32:
|
||||
; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
|
||||
; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
|
||||
; GCN: v_mac_f32_e32 [[VB]], 0x41200000, [[VA]]
|
||||
define amdgpu_kernel void @madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #0 {
|
||||
@ -28,7 +28,7 @@ define amdgpu_kernel void @madmk_f32(float addrspace(1)* noalias %out, float add
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}madmk_2_use_f32:
|
||||
; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
|
||||
; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
|
||||
; GCN-DAG: buffer_load_dword [[VC:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8
|
||||
; GCN-DAG: s_mov_b32 [[SK:s[0-9]+]], 0x41200000
|
||||
@ -61,7 +61,7 @@ define amdgpu_kernel void @madmk_2_use_f32(float addrspace(1)* noalias %out, flo
|
||||
|
||||
; We don't get any benefit if the constant is an inline immediate.
|
||||
; GCN-LABEL: {{^}}madmk_inline_imm_f32:
|
||||
; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
|
||||
; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
|
||||
; GCN: v_mac_f32_e32 [[VB]], 4.0, [[VA]]
|
||||
define amdgpu_kernel void @madmk_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #0 {
|
||||
@ -128,7 +128,7 @@ define amdgpu_kernel void @scalar_vector_madmk_f32(float addrspace(1)* noalias %
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}no_madmk_src0_modifier_f32:
|
||||
; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
|
||||
; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
|
||||
; GCN-DAG: s_mov_b32 [[SK:s[0-9]+]], 0x41200000
|
||||
; GCN: v_mad_f32 {{v[0-9]+}}, |[[VA]]|, [[SK]], [[VB]]
|
||||
@ -150,7 +150,7 @@ define amdgpu_kernel void @no_madmk_src0_modifier_f32(float addrspace(1)* noalia
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}no_madmk_src2_modifier_f32:
|
||||
; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
|
||||
; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
|
||||
; GCN: v_mad_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{[sv][0-9]+}}, |{{v[0-9]+}}|
|
||||
define amdgpu_kernel void @no_madmk_src2_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #0 {
|
||||
|
396
test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll
Normal file
@ -0,0 +1,396 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX7 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s

define amdgpu_kernel void @flat_nontemporal_load_0(
; GFX7-LABEL: flat_nontemporal_load_0:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dword v0, v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[2:3], v0
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_nontemporal_load_0:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc dlc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_nontemporal_load_0:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] glc dlc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_nontemporal_load_0:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] glc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0
; SKIP-CACHE-INV-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load volatile i32, i32* %in, align 4
store i32 %val, i32* %out
ret void
}

define amdgpu_kernel void @flat_nontemporal_load_1(
; GFX7-LABEL: flat_nontemporal_load_1:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_add_i32_e32 v2, vcc, s0, v2
; GFX7-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX7-NEXT: flat_load_dword v2, v[2:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_nontemporal_load_1:
|
||||
; GFX10-WGP: ; %bb.0: ; %entry
|
||||
; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
|
||||
; GFX10-WGP-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: v_add_co_u32_e64 v0, s0, s0, v0
|
||||
; GFX10-WGP-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0
|
||||
; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc dlc
|
||||
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX10-WGP-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-CU-LABEL: flat_nontemporal_load_1:
|
||||
; GFX10-CU: ; %bb.0: ; %entry
|
||||
; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
|
||||
; GFX10-CU-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: v_add_co_u32_e64 v0, s0, s0, v0
|
||||
; GFX10-CU-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0
|
||||
; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] glc dlc
|
||||
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX10-CU-NEXT: s_endpgm
|
||||
;
|
||||
; SKIP-CACHE-INV-LABEL: flat_nontemporal_load_1:
|
||||
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
|
||||
; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e32 v2, 2, v0
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
|
||||
; SKIP-CACHE-INV-NEXT: v_add_i32_e32 v2, vcc, s0, v2
|
||||
; SKIP-CACHE-INV-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
|
||||
; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[2:3] glc
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
|
||||
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
|
||||
; SKIP-CACHE-INV-NEXT: s_endpgm
|
||||
i32* %in, i32* %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%val.gep = getelementptr inbounds i32, i32* %in, i32 %tid
%val = load volatile i32, i32* %val.gep, align 4
store i32 %val, i32* %out
ret void
}

define amdgpu_kernel void @flat_nontemporal_store_0(
|
||||
; GFX7-LABEL: flat_nontemporal_store_0:
|
||||
; GFX7: ; %bb.0: ; %entry
|
||||
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX7-NEXT: flat_load_dword v0, v[0:1]
|
||||
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
||||
; GFX7-NEXT: v_mov_b32_e32 v3, s3
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX7-NEXT: flat_store_dword v[2:3], v0
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX7-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-WGP-LABEL: flat_nontemporal_store_0:
|
||||
; GFX10-WGP: ; %bb.0: ; %entry
|
||||
; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1]
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-WGP-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-CU-LABEL: flat_nontemporal_store_0:
|
||||
; GFX10-CU: ; %bb.0: ; %entry
|
||||
; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX10-CU-NEXT: flat_load_dword v2, v[0:1]
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-CU-NEXT: s_endpgm
|
||||
;
|
||||
; SKIP-CACHE-INV-LABEL: flat_nontemporal_store_0:
|
||||
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
|
||||
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
|
||||
; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1]
|
||||
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2
|
||||
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: s_endpgm
|
||||
i32* %in, i32* %out) {
entry:
%val = load i32, i32* %in, align 4
store volatile i32 %val, i32* %out
ret void
}

define amdgpu_kernel void @flat_nontemporal_store_1(
|
||||
; GFX7-LABEL: flat_nontemporal_store_1:
|
||||
; GFX7: ; %bb.0: ; %entry
|
||||
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
|
||||
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX7-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GFX7-NEXT: flat_load_dword v2, v[1:2]
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX7-NEXT: v_add_i32_e32 v0, vcc, s2, v0
|
||||
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX7-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX7-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-WGP-LABEL: flat_nontemporal_store_1:
|
||||
; GFX10-WGP: ; %bb.0: ; %entry
|
||||
; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
|
||||
; GFX10-WGP-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GFX10-WGP-NEXT: v_add_co_u32_e64 v0, s0, s2, v0
|
||||
; GFX10-WGP-NEXT: flat_load_dword v2, v[1:2]
|
||||
; GFX10-WGP-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0
|
||||
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-WGP-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-CU-LABEL: flat_nontemporal_store_1:
|
||||
; GFX10-CU: ; %bb.0: ; %entry
|
||||
; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
|
||||
; GFX10-CU-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GFX10-CU-NEXT: v_add_co_u32_e64 v0, s0, s2, v0
|
||||
; GFX10-CU-NEXT: flat_load_dword v2, v[1:2]
|
||||
; GFX10-CU-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0
|
||||
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-CU-NEXT: s_endpgm
|
||||
;
|
||||
; SKIP-CACHE-INV-LABEL: flat_nontemporal_store_1:
|
||||
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
|
||||
; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
|
||||
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
|
||||
; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[1:2]
|
||||
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
|
||||
; SKIP-CACHE-INV-NEXT: v_add_i32_e32 v0, vcc, s2, v0
|
||||
; SKIP-CACHE-INV-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: s_endpgm
|
||||
i32* %in, i32* %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%val = load i32, i32* %in, align 4
%out.gep = getelementptr inbounds i32, i32* %out, i32 %tid
store volatile i32 %val, i32* %out.gep
ret void
}

define amdgpu_kernel void @flat_volatile_workgroup_acquire_load(
|
||||
; GFX7-LABEL: flat_volatile_workgroup_acquire_load:
|
||||
; GFX7: ; %bb.0: ; %entry
|
||||
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX7-NEXT: flat_load_dword v0, v[0:1]
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
||||
; GFX7-NEXT: v_mov_b32_e32 v3, s3
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX7-NEXT: flat_store_dword v[2:3], v0
|
||||
; GFX7-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-WGP-LABEL: flat_volatile_workgroup_acquire_load:
|
||||
; GFX10-WGP: ; %bb.0: ; %entry
|
||||
; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc
|
||||
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: buffer_gl0_inv
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX10-WGP-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-CU-LABEL: flat_volatile_workgroup_acquire_load:
|
||||
; GFX10-CU: ; %bb.0: ; %entry
|
||||
; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX10-CU-NEXT: flat_load_dword v2, v[0:1]
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX10-CU-NEXT: s_endpgm
|
||||
;
|
||||
; SKIP-CACHE-INV-LABEL: flat_volatile_workgroup_acquire_load:
|
||||
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
|
||||
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
|
||||
; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1]
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2
|
||||
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0
|
||||
; SKIP-CACHE-INV-NEXT: s_endpgm
|
||||
i32* %in, i32* %out) {
entry:
%val = load atomic volatile i32, i32* %in syncscope("workgroup") acquire, align 4
store i32 %val, i32* %out
ret void
}

define amdgpu_kernel void @flat_volatile_workgroup_release_store(
|
||||
; GFX7-LABEL: flat_volatile_workgroup_release_store:
|
||||
; GFX7: ; %bb.0: ; %entry
|
||||
; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0
|
||||
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX7-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-WGP-LABEL: flat_volatile_workgroup_release_store:
|
||||
; GFX10-WGP: ; %bb.0: ; %entry
|
||||
; GFX10-WGP-NEXT: s_clause 0x1
|
||||
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
|
||||
; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
|
||||
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX10-WGP-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-CU-LABEL: flat_volatile_workgroup_release_store:
|
||||
; GFX10-CU: ; %bb.0: ; %entry
|
||||
; GFX10-CU-NEXT: s_clause 0x1
|
||||
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
|
||||
; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX10-CU-NEXT: s_endpgm
|
||||
;
|
||||
; SKIP-CACHE-INV-LABEL: flat_volatile_workgroup_release_store:
|
||||
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2
|
||||
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
|
||||
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
|
||||
; SKIP-CACHE-INV-NEXT: s_endpgm
|
||||
i32 %in, i32* %out) {
entry:
store atomic volatile i32 %in, i32* %out syncscope("workgroup") release, align 4
ret void
}

declare i32 @llvm.amdgcn.workitem.id.x()

test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll (new file, 458 lines)
@@ -0,0 +1,458 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX6 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX7 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s

define amdgpu_kernel void @global_volatile_load_0(
|
||||
; GFX6-LABEL: global_volatile_load_0:
|
||||
; GFX6: ; %bb.0: ; %entry
|
||||
; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
|
||||
; GFX6-NEXT: s_mov_b32 s3, 0xf000
|
||||
; GFX6-NEXT: s_mov_b32 s2, -1
|
||||
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX6-NEXT: s_mov_b32 s0, s4
|
||||
; GFX6-NEXT: s_mov_b32 s1, s5
|
||||
; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc
|
||||
; GFX6-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX6-NEXT: s_mov_b32 s4, s6
|
||||
; GFX6-NEXT: s_mov_b32 s5, s7
|
||||
; GFX6-NEXT: s_mov_b32 s6, s2
|
||||
; GFX6-NEXT: s_mov_b32 s7, s3
|
||||
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX7-LABEL: global_volatile_load_0:
|
||||
; GFX7: ; %bb.0: ; %entry
|
||||
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX7-NEXT: flat_load_dword v0, v[0:1] glc
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
||||
; GFX7-NEXT: v_mov_b32_e32 v3, s3
|
||||
; GFX7-NEXT: flat_store_dword v[2:3], v0
|
||||
; GFX7-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-WGP-LABEL: global_volatile_load_0:
|
||||
; GFX10-WGP: ; %bb.0: ; %entry
|
||||
; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: global_load_dword v1, v0, s[0:1] glc dlc
|
||||
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-WGP-NEXT: global_store_dword v0, v1, s[2:3]
|
||||
; GFX10-WGP-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-CU-LABEL: global_volatile_load_0:
|
||||
; GFX10-CU: ; %bb.0: ; %entry
|
||||
; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: global_load_dword v1, v0, s[0:1] glc dlc
|
||||
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-CU-NEXT: global_store_dword v0, v1, s[2:3]
|
||||
; GFX10-CU-NEXT: s_endpgm
|
||||
;
|
||||
; SKIP-CACHE-INV-LABEL: global_volatile_load_0:
|
||||
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5
|
||||
; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s6
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s7
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s2
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3
|
||||
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; SKIP-CACHE-INV-NEXT: s_endpgm
|
||||
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load volatile i32, i32 addrspace(1)* %in, align 4
store i32 %val, i32 addrspace(1)* %out
ret void
}

define amdgpu_kernel void @global_volatile_load_1(
|
||||
; GFX6-LABEL: global_volatile_load_1:
|
||||
; GFX6: ; %bb.0: ; %entry
|
||||
; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
|
||||
; GFX6-NEXT: s_mov_b32 s3, 0xf000
|
||||
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX6-NEXT: s_mov_b32 s2, -1
|
||||
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX6-NEXT: s_mov_b32 s0, s6
|
||||
; GFX6-NEXT: s_mov_b32 s1, s7
|
||||
; GFX6-NEXT: s_mov_b32 s6, 0
|
||||
; GFX6-NEXT: s_mov_b32 s7, s3
|
||||
; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 glc
|
||||
; GFX6-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX7-LABEL: global_volatile_load_1:
|
||||
; GFX7: ; %bb.0: ; %entry
|
||||
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
|
||||
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 2, v0
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: v_mov_b32_e32 v3, s1
|
||||
; GFX7-NEXT: v_add_i32_e32 v2, vcc, s0, v2
|
||||
; GFX7-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
|
||||
; GFX7-NEXT: flat_load_dword v2, v[2:3] glc
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX7-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX7-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-WGP-LABEL: global_volatile_load_1:
|
||||
; GFX10-WGP: ; %bb.0: ; %entry
|
||||
; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
|
||||
; GFX10-WGP-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: global_load_dword v0, v0, s[0:1] glc dlc
|
||||
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-WGP-NEXT: global_store_dword v1, v0, s[2:3]
|
||||
; GFX10-WGP-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-CU-LABEL: global_volatile_load_1:
|
||||
; GFX10-CU: ; %bb.0: ; %entry
|
||||
; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
|
||||
; GFX10-CU-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: global_load_dword v0, v0, s[0:1] glc dlc
|
||||
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-CU-NEXT: global_store_dword v1, v0, s[2:3]
|
||||
; GFX10-CU-NEXT: s_endpgm
|
||||
;
|
||||
; SKIP-CACHE-INV-LABEL: global_volatile_load_1:
|
||||
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000
|
||||
; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, 0
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s6
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s7
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3
|
||||
; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 glc
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; SKIP-CACHE-INV-NEXT: s_endpgm
|
||||
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%val.gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %tid
%val = load volatile i32, i32 addrspace(1)* %val.gep, align 4
store i32 %val, i32 addrspace(1)* %out
ret void
}

define amdgpu_kernel void @global_volatile_store_0(
|
||||
; GFX6-LABEL: global_volatile_store_0:
|
||||
; GFX6: ; %bb.0: ; %entry
|
||||
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
|
||||
; GFX6-NEXT: s_mov_b32 s7, 0xf000
|
||||
; GFX6-NEXT: s_mov_b32 s6, -1
|
||||
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0
|
||||
; GFX6-NEXT: s_mov_b32 s4, s2
|
||||
; GFX6-NEXT: s_mov_b32 s5, s3
|
||||
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX6-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; GFX6-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX7-LABEL: global_volatile_store_0:
|
||||
; GFX7: ; %bb.0: ; %entry
|
||||
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: v_mov_b32_e32 v2, s0
|
||||
; GFX7-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX7-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-WGP-LABEL: global_volatile_store_0:
|
||||
; GFX10-WGP: ; %bb.0: ; %entry
|
||||
; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: s_load_dword s0, s[0:1], 0x0
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX10-WGP-NEXT: global_store_dword v0, v1, s[2:3]
|
||||
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-WGP-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-CU-LABEL: global_volatile_store_0:
|
||||
; GFX10-CU: ; %bb.0: ; %entry
|
||||
; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: s_load_dword s0, s[0:1], 0x0
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX10-CU-NEXT: global_store_dword v0, v1, s[2:3]
|
||||
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-CU-NEXT: s_endpgm
|
||||
;
|
||||
; SKIP-CACHE-INV-LABEL: global_volatile_store_0:
|
||||
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x0
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s2
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s3
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
|
||||
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: s_endpgm
|
||||
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load i32, i32 addrspace(1)* %in, align 4
store volatile i32 %val, i32 addrspace(1)* %out
ret void
}

define amdgpu_kernel void @global_volatile_store_1(
|
||||
; GFX6-LABEL: global_volatile_store_1:
|
||||
; GFX6: ; %bb.0: ; %entry
|
||||
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
|
||||
; GFX6-NEXT: s_mov_b32 s7, 0xf000
|
||||
; GFX6-NEXT: s_mov_b32 s6, 0
|
||||
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0
|
||||
; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
|
||||
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX6-NEXT: v_mov_b32_e32 v2, s0
|
||||
; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
|
||||
; GFX6-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX7-LABEL: global_volatile_store_1:
|
||||
; GFX7: ; %bb.0: ; %entry
|
||||
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
|
||||
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX7-NEXT: v_add_i32_e32 v0, vcc, s2, v0
|
||||
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: v_mov_b32_e32 v2, s0
|
||||
; GFX7-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX7-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-WGP-LABEL: global_volatile_store_1:
|
||||
; GFX10-WGP: ; %bb.0: ; %entry
|
||||
; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
|
||||
; GFX10-WGP-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: s_load_dword s0, s[0:1], 0x0
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX10-WGP-NEXT: global_store_dword v0, v1, s[2:3]
|
||||
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-WGP-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-CU-LABEL: global_volatile_store_1:
|
||||
; GFX10-CU: ; %bb.0: ; %entry
|
||||
; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
|
||||
; GFX10-CU-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: s_load_dword s0, s[0:1], 0x0
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX10-CU-NEXT: global_store_dword v0, v1, s[2:3]
|
||||
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-CU-NEXT: s_endpgm
|
||||
;
|
||||
; SKIP-CACHE-INV-LABEL: global_volatile_store_1:
|
||||
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0
|
||||
; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, 0
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x0
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3]
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
|
||||
; SKIP-CACHE-INV-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: s_endpgm
|
||||
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%val = load i32, i32 addrspace(1)* %in, align 4
%out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid
store volatile i32 %val, i32 addrspace(1)* %out.gep
ret void
}

define amdgpu_kernel void @global_volatile_workgroup_acquire_load(
|
||||
; GFX6-LABEL: global_volatile_workgroup_acquire_load:
|
||||
; GFX6: ; %bb.0: ; %entry
|
||||
; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
|
||||
; GFX6-NEXT: s_mov_b32 s3, 0xf000
|
||||
; GFX6-NEXT: s_mov_b32 s2, -1
|
||||
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX6-NEXT: s_mov_b32 s0, s4
|
||||
; GFX6-NEXT: s_mov_b32 s1, s5
|
||||
; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0
|
||||
; GFX6-NEXT: s_mov_b32 s4, s6
|
||||
; GFX6-NEXT: s_mov_b32 s5, s7
|
||||
; GFX6-NEXT: s_mov_b32 s6, s2
|
||||
; GFX6-NEXT: s_mov_b32 s7, s3
|
||||
; GFX6-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX7-LABEL: global_volatile_workgroup_acquire_load:
|
||||
; GFX7: ; %bb.0: ; %entry
|
||||
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX7-NEXT: flat_load_dword v0, v[0:1]
|
||||
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
||||
; GFX7-NEXT: v_mov_b32_e32 v3, s3
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX7-NEXT: flat_store_dword v[2:3], v0
|
||||
; GFX7-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-WGP-LABEL: global_volatile_workgroup_acquire_load:
|
||||
; GFX10-WGP: ; %bb.0: ; %entry
|
||||
; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: global_load_dword v1, v0, s[0:1] glc
|
||||
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-WGP-NEXT: buffer_gl0_inv
|
||||
; GFX10-WGP-NEXT: global_store_dword v0, v1, s[2:3]
|
||||
; GFX10-WGP-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-CU-LABEL: global_volatile_workgroup_acquire_load:
|
||||
; GFX10-CU: ; %bb.0: ; %entry
|
||||
; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: global_load_dword v1, v0, s[0:1]
|
||||
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-CU-NEXT: global_store_dword v0, v1, s[2:3]
|
||||
; GFX10-CU-NEXT: s_endpgm
|
||||
;
|
||||
; SKIP-CACHE-INV-LABEL: global_volatile_workgroup_acquire_load:
|
||||
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5
|
||||
; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[0:3], 0
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s6
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s7
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s2
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; SKIP-CACHE-INV-NEXT: s_endpgm
|
||||
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load atomic volatile i32, i32 addrspace(1)* %in syncscope("workgroup") acquire, align 4
store i32 %val, i32 addrspace(1)* %out
ret void
}

define amdgpu_kernel void @global_volatile_workgroup_release_store(
|
||||
; GFX6-LABEL: global_volatile_workgroup_release_store:
|
||||
; GFX6: ; %bb.0: ; %entry
|
||||
; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9
|
||||
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
|
||||
; GFX6-NEXT: s_mov_b32 s3, 0xf000
|
||||
; GFX6-NEXT: s_mov_b32 s2, -1
|
||||
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX6-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX7-LABEL: global_volatile_workgroup_release_store:
|
||||
; GFX7: ; %bb.0: ; %entry
|
||||
; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0
|
||||
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX7-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-WGP-LABEL: global_volatile_workgroup_release_store:
|
||||
; GFX10-WGP: ; %bb.0: ; %entry
|
||||
; GFX10-WGP-NEXT: s_clause 0x1
|
||||
; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0
|
||||
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1]
|
||||
; GFX10-WGP-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-CU-LABEL: global_volatile_workgroup_release_store:
|
||||
; GFX10-CU: ; %bb.0: ; %entry
|
||||
; GFX10-CU-NEXT: s_clause 0x1
|
||||
; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0
|
||||
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1]
|
||||
; GFX10-CU-NEXT: s_endpgm
|
||||
;
|
||||
; SKIP-CACHE-INV-LABEL: global_volatile_workgroup_release_store:
|
||||
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x9
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; SKIP-CACHE-INV-NEXT: s_endpgm
|
||||
i32 %in, i32 addrspace(1)* %out) {
entry:
store atomic volatile i32 %in, i32 addrspace(1)* %out syncscope("workgroup") release, align 4
ret void
}

declare i32 @llvm.amdgcn.workitem.id.x()

test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll (new file, 440 lines)
@@ -0,0 +1,440 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX6 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX7 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s

define amdgpu_kernel void @local_volatile_load_0(
|
||||
; GFX6-LABEL: local_volatile_load_0:
|
||||
; GFX6: ; %bb.0: ; %entry
|
||||
; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9
|
||||
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
|
||||
; GFX6-NEXT: s_mov_b32 m0, -1
|
||||
; GFX6-NEXT: s_mov_b32 s3, 0xf000
|
||||
; GFX6-NEXT: s_mov_b32 s2, -1
|
||||
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX6-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX6-NEXT: ds_read_b32 v0, v0
|
||||
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX7-LABEL: local_volatile_load_0:
|
||||
; GFX7: ; %bb.0: ; %entry
|
||||
; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0
|
||||
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
|
||||
; GFX7-NEXT: s_mov_b32 m0, -1
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX7-NEXT: ds_read_b32 v2, v0
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX7-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-WGP-LABEL: local_volatile_load_0:
|
||||
; GFX10-WGP: ; %bb.0: ; %entry
|
||||
; GFX10-WGP-NEXT: s_clause 0x1
|
||||
; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0
|
||||
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX10-WGP-NEXT: ds_read_b32 v0, v0
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: global_store_dword v1, v0, s[0:1]
|
||||
; GFX10-WGP-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-CU-LABEL: local_volatile_load_0:
|
||||
; GFX10-CU: ; %bb.0: ; %entry
|
||||
; GFX10-CU-NEXT: s_clause 0x1
|
||||
; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0
|
||||
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX10-CU-NEXT: ds_read_b32 v0, v0
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: global_store_dword v1, v0, s[0:1]
|
||||
; GFX10-CU-NEXT: s_endpgm
|
||||
;
|
||||
; SKIP-CACHE-INV-LABEL: local_volatile_load_0:
|
||||
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x9
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
|
||||
; SKIP-CACHE-INV-NEXT: ds_read_b32 v0, v0
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; SKIP-CACHE-INV-NEXT: s_endpgm
|
||||
i32 addrspace(3)* %in, i32 addrspace(1)* %out) {
entry:
%val = load volatile i32, i32 addrspace(3)* %in, align 4
store i32 %val, i32 addrspace(1)* %out
ret void
}

define amdgpu_kernel void @local_volatile_load_1(
|
||||
; GFX6-LABEL: local_volatile_load_1:
|
||||
; GFX6: ; %bb.0: ; %entry
|
||||
; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9
|
||||
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
|
||||
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX6-NEXT: s_mov_b32 m0, -1
|
||||
; GFX6-NEXT: s_mov_b32 s3, 0xf000
|
||||
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v0
|
||||
; GFX6-NEXT: ds_read_b32 v0, v0
|
||||
; GFX6-NEXT: s_mov_b32 s2, -1
|
||||
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX7-LABEL: local_volatile_load_1:
|
||||
; GFX7: ; %bb.0: ; %entry
|
||||
; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0
|
||||
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
|
||||
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX7-NEXT: s_mov_b32 m0, -1
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: v_add_i32_e32 v0, vcc, s2, v0
|
||||
; GFX7-NEXT: ds_read_b32 v2, v0
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX7-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-WGP-LABEL: local_volatile_load_1:
|
||||
; GFX10-WGP: ; %bb.0: ; %entry
|
||||
; GFX10-WGP-NEXT: s_clause 0x1
|
||||
; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0
|
||||
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s2
|
||||
; GFX10-WGP-NEXT: ds_read_b32 v0, v0
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: global_store_dword v1, v0, s[0:1]
|
||||
; GFX10-WGP-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-CU-LABEL: local_volatile_load_1:
|
||||
; GFX10-CU: ; %bb.0: ; %entry
|
||||
; GFX10-CU-NEXT: s_clause 0x1
|
||||
; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0
|
||||
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s2
|
||||
; GFX10-CU-NEXT: ds_read_b32 v0, v0
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: global_store_dword v1, v0, s[0:1]
|
||||
; GFX10-CU-NEXT: s_endpgm
|
||||
;
|
||||
; SKIP-CACHE-INV-LABEL: local_volatile_load_1:
|
||||
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x9
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
|
||||
; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: v_add_i32_e32 v0, vcc, s4, v0
|
||||
; SKIP-CACHE-INV-NEXT: ds_read_b32 v0, v0
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; SKIP-CACHE-INV-NEXT: s_endpgm
|
||||
i32 addrspace(3)* %in, i32 addrspace(1)* %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%val.gep = getelementptr inbounds i32, i32 addrspace(3)* %in, i32 %tid
%val = load volatile i32, i32 addrspace(3)* %val.gep, align 4
store i32 %val, i32 addrspace(1)* %out
ret void
}

define amdgpu_kernel void @local_volatile_store_0(
|
||||
; GFX6-LABEL: local_volatile_store_0:
|
||||
; GFX6: ; %bb.0: ; %entry
|
||||
; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
|
||||
; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb
|
||||
; GFX6-NEXT: s_mov_b32 m0, -1
|
||||
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX6-NEXT: s_load_dword s1, s[2:3], 0x0
|
||||
; GFX6-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX6-NEXT: ds_write_b32 v0, v1
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX7-LABEL: local_volatile_store_0:
|
||||
; GFX7: ; %bb.0: ; %entry
|
||||
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
||||
; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2
|
||||
; GFX7-NEXT: s_mov_b32 m0, -1
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX7-NEXT: ds_write_b32 v0, v1
|
||||
; GFX7-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-WGP-LABEL: local_volatile_store_0:
|
||||
; GFX10-WGP: ; %bb.0: ; %entry
|
||||
; GFX10-WGP-NEXT: s_clause 0x1
|
||||
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
||||
; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: s_load_dword s0, s[0:1], 0x0
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX10-WGP-NEXT: ds_write_b32 v0, v1
|
||||
; GFX10-WGP-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-CU-LABEL: local_volatile_store_0:
|
||||
; GFX10-CU: ; %bb.0: ; %entry
|
||||
; GFX10-CU-NEXT: s_clause 0x1
|
||||
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
||||
; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: s_load_dword s0, s[0:1], 0x0
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX10-CU-NEXT: ds_write_b32 v0, v1
|
||||
; GFX10-CU-NEXT: s_endpgm
|
||||
;
|
||||
; SKIP-CACHE-INV-LABEL: local_volatile_store_0:
|
||||
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0
|
||||
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
|
||||
; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1
|
||||
; SKIP-CACHE-INV-NEXT: s_endpgm
|
||||
i32 addrspace(1)* %in, i32 addrspace(3)* %out) {
entry:
%val = load i32, i32 addrspace(1)* %in, align 4
store volatile i32 %val, i32 addrspace(3)* %out
ret void
}

define amdgpu_kernel void @local_volatile_store_1(
|
||||
; GFX6-LABEL: local_volatile_store_1:
|
||||
; GFX6: ; %bb.0: ; %entry
|
||||
; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
|
||||
; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb
|
||||
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX6-NEXT: s_mov_b32 m0, -1
|
||||
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX6-NEXT: s_load_dword s1, s[2:3], 0x0
|
||||
; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0
|
||||
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX6-NEXT: ds_write_b32 v0, v1
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX7-LABEL: local_volatile_store_1:
|
||||
; GFX7: ; %bb.0: ; %entry
|
||||
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
||||
; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2
|
||||
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX7-NEXT: s_mov_b32 m0, -1
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0
|
||||
; GFX7-NEXT: v_add_i32_e32 v0, vcc, s2, v0
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX7-NEXT: ds_write_b32 v0, v1
|
||||
; GFX7-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-WGP-LABEL: local_volatile_store_1:
|
||||
; GFX10-WGP: ; %bb.0: ; %entry
|
||||
; GFX10-WGP-NEXT: s_clause 0x1
|
||||
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
||||
; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: s_load_dword s0, s[0:1], 0x0
|
||||
; GFX10-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s2
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX10-WGP-NEXT: ds_write_b32 v0, v1
|
||||
; GFX10-WGP-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-CU-LABEL: local_volatile_store_1:
|
||||
; GFX10-CU: ; %bb.0: ; %entry
|
||||
; GFX10-CU-NEXT: s_clause 0x1
|
||||
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
||||
; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: s_load_dword s0, s[0:1], 0x0
|
||||
; GFX10-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s2
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX10-CU-NEXT: ds_write_b32 v0, v1
|
||||
; GFX10-CU-NEXT: s_endpgm
|
||||
;
|
||||
; SKIP-CACHE-INV-LABEL: local_volatile_store_1:
|
||||
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb
|
||||
; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0
|
||||
; SKIP-CACHE-INV-NEXT: v_add_i32_e32 v0, vcc, s0, v0
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
|
||||
; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1
|
||||
; SKIP-CACHE-INV-NEXT: s_endpgm
|
||||
i32 addrspace(1)* %in, i32 addrspace(3)* %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%val = load i32, i32 addrspace(1)* %in, align 4
%out.gep = getelementptr inbounds i32, i32 addrspace(3)* %out, i32 %tid
store volatile i32 %val, i32 addrspace(3)* %out.gep
ret void
}

define amdgpu_kernel void @local_volatile_workgroup_acquire_load(
|
||||
; GFX6-LABEL: local_volatile_workgroup_acquire_load:
|
||||
; GFX6: ; %bb.0: ; %entry
|
||||
; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9
|
||||
; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa
|
||||
; GFX6-NEXT: s_mov_b32 m0, -1
|
||||
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX6-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX6-NEXT: ds_read_b32 v0, v0
|
||||
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX6-NEXT: ds_write_b32 v1, v0
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX7-LABEL: local_volatile_workgroup_acquire_load:
|
||||
; GFX7: ; %bb.0: ; %entry
|
||||
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
||||
; GFX7-NEXT: s_mov_b32 m0, -1
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX7-NEXT: ds_read_b32 v0, v0
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX7-NEXT: ds_write_b32 v1, v0
|
||||
; GFX7-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-WGP-LABEL: local_volatile_workgroup_acquire_load:
|
||||
; GFX10-WGP: ; %bb.0: ; %entry
|
||||
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX10-WGP-NEXT: ds_read_b32 v0, v0
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: buffer_gl0_inv
|
||||
; GFX10-WGP-NEXT: ds_write_b32 v1, v0
|
||||
; GFX10-WGP-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-CU-LABEL: local_volatile_workgroup_acquire_load:
|
||||
; GFX10-CU: ; %bb.0: ; %entry
|
||||
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX10-CU-NEXT: ds_read_b32 v0, v0
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: ds_write_b32 v1, v0
|
||||
; GFX10-CU-NEXT: s_endpgm
|
||||
;
|
||||
; SKIP-CACHE-INV-LABEL: local_volatile_workgroup_acquire_load:
|
||||
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
|
||||
; SKIP-CACHE-INV-NEXT: ds_read_b32 v0, v0
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
|
||||
; SKIP-CACHE-INV-NEXT: ds_write_b32 v1, v0
|
||||
; SKIP-CACHE-INV-NEXT: s_endpgm
|
||||
i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
entry:
%val = load atomic volatile i32, i32 addrspace(3)* %in syncscope("workgroup") acquire, align 4
store i32 %val, i32 addrspace(3)* %out
ret void
}

define amdgpu_kernel void @local_volatile_workgroup_release_store(
|
||||
; GFX6-LABEL: local_volatile_workgroup_release_store:
|
||||
; GFX6: ; %bb.0: ; %entry
|
||||
; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9
|
||||
; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa
|
||||
; GFX6-NEXT: s_mov_b32 m0, -1
|
||||
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX6-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX6-NEXT: ds_write_b32 v0, v1
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX7-LABEL: local_volatile_workgroup_release_store:
|
||||
; GFX7: ; %bb.0: ; %entry
|
||||
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
||||
; GFX7-NEXT: s_mov_b32 m0, -1
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: ds_write_b32 v0, v1
|
||||
; GFX7-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-WGP-LABEL: local_volatile_workgroup_release_store:
|
||||
; GFX10-WGP: ; %bb.0: ; %entry
|
||||
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-WGP-NEXT: ds_write_b32 v0, v1
|
||||
; GFX10-WGP-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-CU-LABEL: local_volatile_workgroup_release_store:
|
||||
; GFX10-CU: ; %bb.0: ; %entry
|
||||
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: ds_write_b32 v0, v1
|
||||
; GFX10-CU-NEXT: s_endpgm
|
||||
;
|
||||
; SKIP-CACHE-INV-LABEL: local_volatile_workgroup_release_store:
|
||||
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
|
||||
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1
|
||||
; SKIP-CACHE-INV-NEXT: s_endpgm
|
||||
i32 %in, i32 addrspace(3)* %out) {
entry:
store atomic volatile i32 %in, i32 addrspace(3)* %out syncscope("workgroup") release, align 4
ret void
}

declare i32 @llvm.amdgcn.workitem.id.x()

test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll (new file, 404 lines)
@@ -0,0 +1,404 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX6 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX7 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s

define amdgpu_kernel void @private_volatile_load_0(
|
||||
; GFX6-LABEL: private_volatile_load_0:
|
||||
; GFX6: ; %bb.0: ; %entry
|
||||
; GFX6-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
|
||||
; GFX6-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
|
||||
; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9
|
||||
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
|
||||
; GFX6-NEXT: s_mov_b32 s10, -1
|
||||
; GFX6-NEXT: s_mov_b32 s11, 0xe8f000
|
||||
; GFX6-NEXT: s_add_u32 s8, s8, s3
|
||||
; GFX6-NEXT: s_addc_u32 s9, s9, 0
|
||||
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX6-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX6-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen glc
|
||||
; GFX6-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX6-NEXT: s_mov_b32 s3, 0xf000
|
||||
; GFX6-NEXT: s_mov_b32 s2, -1
|
||||
; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX7-LABEL: private_volatile_load_0:
|
||||
; GFX7: ; %bb.0: ; %entry
|
||||
; GFX7-NEXT: s_mov_b64 s[10:11], s[2:3]
|
||||
; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
|
||||
; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0
|
||||
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
|
||||
; GFX7-NEXT: s_add_u32 s8, s8, s7
|
||||
; GFX7-NEXT: s_addc_u32 s9, s9, 0
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX7-NEXT: buffer_load_dword v2, v0, s[8:11], 0 offen glc
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX7-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX7-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-WGP-LABEL: private_volatile_load_0:
|
||||
; GFX10-WGP: ; %bb.0: ; %entry
|
||||
; GFX10-WGP-NEXT: s_mov_b64 s[10:11], s[2:3]
|
||||
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[0:1]
|
||||
; GFX10-WGP-NEXT: s_clause 0x1
|
||||
; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0
|
||||
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
|
||||
; GFX10-WGP-NEXT: s_add_u32 s8, s8, s7
|
||||
; GFX10-WGP-NEXT: s_addc_u32 s9, s9, 0
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen glc dlc
|
||||
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-WGP-NEXT: global_store_dword v1, v0, s[0:1]
|
||||
; GFX10-WGP-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-CU-LABEL: private_volatile_load_0:
|
||||
; GFX10-CU: ; %bb.0: ; %entry
|
||||
; GFX10-CU-NEXT: s_mov_b64 s[10:11], s[2:3]
|
||||
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[0:1]
|
||||
; GFX10-CU-NEXT: s_clause 0x1
|
||||
; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0
|
||||
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
|
||||
; GFX10-CU-NEXT: s_add_u32 s8, s8, s7
|
||||
; GFX10-CU-NEXT: s_addc_u32 s9, s9, 0
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen glc dlc
|
||||
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-CU-NEXT: global_store_dword v1, v0, s[0:1]
|
||||
; GFX10-CU-NEXT: s_endpgm
|
||||
;
|
||||
; SKIP-CACHE-INV-LABEL: private_volatile_load_0:
|
||||
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
||||
; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[8:9]
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s0
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x9
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: s_add_u32 s8, s8, s3
|
||||
; SKIP-CACHE-INV-NEXT: s_addc_u32 s9, s9, 0
|
||||
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
|
||||
; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen glc
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000
|
||||
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; SKIP-CACHE-INV-NEXT: s_endpgm
|
||||
i32 addrspace(5)* %in, i32 addrspace(1)* %out) {
entry:
%val = load volatile i32, i32 addrspace(5)* %in, align 4
store i32 %val, i32 addrspace(1)* %out
ret void
}

define amdgpu_kernel void @private_volatile_load_1(
|
||||
; GFX6-LABEL: private_volatile_load_1:
|
||||
; GFX6: ; %bb.0: ; %entry
|
||||
; GFX6-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
|
||||
; GFX6-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
|
||||
; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9
|
||||
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
|
||||
; GFX6-NEXT: s_mov_b32 s10, -1
|
||||
; GFX6-NEXT: s_mov_b32 s11, 0xe8f000
|
||||
; GFX6-NEXT: s_add_u32 s8, s8, s3
|
||||
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX6-NEXT: s_addc_u32 s9, s9, 0
|
||||
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v0
|
||||
; GFX6-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen glc
|
||||
; GFX6-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX6-NEXT: s_mov_b32 s3, 0xf000
|
||||
; GFX6-NEXT: s_mov_b32 s2, -1
|
||||
; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX7-LABEL: private_volatile_load_1:
|
||||
; GFX7: ; %bb.0: ; %entry
|
||||
; GFX7-NEXT: s_mov_b64 s[10:11], s[2:3]
|
||||
; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
|
||||
; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0
|
||||
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
|
||||
; GFX7-NEXT: s_add_u32 s8, s8, s7
|
||||
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX7-NEXT: s_addc_u32 s9, s9, 0
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: v_add_i32_e32 v0, vcc, s2, v0
|
||||
; GFX7-NEXT: buffer_load_dword v2, v0, s[8:11], 0 offen glc
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX7-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX7-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-WGP-LABEL: private_volatile_load_1:
|
||||
; GFX10-WGP: ; %bb.0: ; %entry
|
||||
; GFX10-WGP-NEXT: s_mov_b64 s[10:11], s[2:3]
|
||||
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[0:1]
|
||||
; GFX10-WGP-NEXT: s_clause 0x1
|
||||
; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0
|
||||
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
|
||||
; GFX10-WGP-NEXT: s_add_u32 s8, s8, s7
|
||||
; GFX10-WGP-NEXT: s_addc_u32 s9, s9, 0
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s2
|
||||
; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen glc dlc
|
||||
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-WGP-NEXT: global_store_dword v1, v0, s[0:1]
|
||||
; GFX10-WGP-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-CU-LABEL: private_volatile_load_1:
|
||||
; GFX10-CU: ; %bb.0: ; %entry
|
||||
; GFX10-CU-NEXT: s_mov_b64 s[10:11], s[2:3]
|
||||
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[0:1]
|
||||
; GFX10-CU-NEXT: s_clause 0x1
|
||||
; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0
|
||||
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
|
||||
; GFX10-CU-NEXT: s_add_u32 s8, s8, s7
|
||||
; GFX10-CU-NEXT: s_addc_u32 s9, s9, 0
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s2
|
||||
; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen glc dlc
|
||||
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-CU-NEXT: global_store_dword v1, v0, s[0:1]
|
||||
; GFX10-CU-NEXT: s_endpgm
|
||||
;
|
||||
; SKIP-CACHE-INV-LABEL: private_volatile_load_1:
|
||||
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
||||
; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[8:9]
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s0
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x9
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
|
||||
; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: s_add_u32 s8, s8, s3
|
||||
; SKIP-CACHE-INV-NEXT: s_addc_u32 s9, s9, 0
|
||||
; SKIP-CACHE-INV-NEXT: v_add_i32_e32 v0, vcc, s4, v0
|
||||
; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen glc
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000
|
||||
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; SKIP-CACHE-INV-NEXT: s_endpgm
|
||||
i32 addrspace(5)* %in, i32 addrspace(1)* %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%val.gep = getelementptr inbounds i32, i32 addrspace(5)* %in, i32 %tid
%val = load volatile i32, i32 addrspace(5)* %val.gep, align 4
store i32 %val, i32 addrspace(1)* %out
ret void
}

define amdgpu_kernel void @private_volatile_store_0(
|
||||
; GFX6-LABEL: private_volatile_store_0:
|
||||
; GFX6: ; %bb.0: ; %entry
|
||||
; GFX6-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0
|
||||
; GFX6-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1
|
||||
; GFX6-NEXT: s_mov_b32 s6, -1
|
||||
; GFX6-NEXT: s_mov_b32 s7, 0xe8f000
|
||||
; GFX6-NEXT: s_add_u32 s4, s4, s3
|
||||
; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
|
||||
; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb
|
||||
; GFX6-NEXT: s_addc_u32 s5, s5, 0
|
||||
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX6-NEXT: s_load_dword s1, s[2:3], 0x0
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX6-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GFX6-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen
|
||||
; GFX6-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX7-LABEL: private_volatile_store_0:
|
||||
; GFX7: ; %bb.0: ; %entry
|
||||
; GFX7-NEXT: s_mov_b64 s[10:11], s[2:3]
|
||||
; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
|
||||
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
||||
; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2
|
||||
; GFX7-NEXT: s_add_u32 s8, s8, s7
|
||||
; GFX7-NEXT: s_addc_u32 s9, s9, 0
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX7-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX7-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-WGP-LABEL: private_volatile_store_0:
|
||||
; GFX10-WGP: ; %bb.0: ; %entry
|
||||
; GFX10-WGP-NEXT: s_mov_b64 s[10:11], s[2:3]
|
||||
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[0:1]
|
||||
; GFX10-WGP-NEXT: s_clause 0x1
|
||||
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
||||
; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8
|
||||
; GFX10-WGP-NEXT: s_add_u32 s8, s8, s7
|
||||
; GFX10-WGP-NEXT: s_addc_u32 s9, s9, 0
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: s_load_dword s0, s[0:1], 0x0
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen
|
||||
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-WGP-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-CU-LABEL: private_volatile_store_0:
|
||||
; GFX10-CU: ; %bb.0: ; %entry
|
||||
; GFX10-CU-NEXT: s_mov_b64 s[10:11], s[2:3]
|
||||
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[0:1]
|
||||
; GFX10-CU-NEXT: s_clause 0x1
|
||||
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
||||
; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8
|
||||
; GFX10-CU-NEXT: s_add_u32 s8, s8, s7
|
||||
; GFX10-CU-NEXT: s_addc_u32 s9, s9, 0
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: s_load_dword s0, s[0:1], 0x0
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen
|
||||
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-CU-NEXT: s_endpgm
|
||||
;
|
||||
; SKIP-CACHE-INV-LABEL: private_volatile_store_0:
|
||||
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
||||
; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[4:5]
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s4, s3
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb
|
||||
; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s5, 0
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0
|
||||
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
|
||||
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: s_endpgm
|
||||
i32 addrspace(1)* %in, i32 addrspace(5)* %out) {
entry:
%val = load i32, i32 addrspace(1)* %in, align 4
store volatile i32 %val, i32 addrspace(5)* %out
ret void
}

define amdgpu_kernel void @private_volatile_store_1(
|
||||
; GFX6-LABEL: private_volatile_store_1:
|
||||
; GFX6: ; %bb.0: ; %entry
|
||||
; GFX6-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0
|
||||
; GFX6-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1
|
||||
; GFX6-NEXT: s_mov_b32 s6, -1
|
||||
; GFX6-NEXT: s_mov_b32 s7, 0xe8f000
|
||||
; GFX6-NEXT: s_add_u32 s4, s4, s3
|
||||
; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
|
||||
; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb
|
||||
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX6-NEXT: s_addc_u32 s5, s5, 0
|
||||
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX6-NEXT: s_load_dword s1, s[2:3], 0x0
|
||||
; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0
|
||||
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX6-NEXT: buffer_store_dword v1, v0, s[4:7], 0 offen
|
||||
; GFX6-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX7-LABEL: private_volatile_store_1:
|
||||
; GFX7: ; %bb.0: ; %entry
|
||||
; GFX7-NEXT: s_mov_b64 s[10:11], s[2:3]
|
||||
; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
|
||||
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
||||
; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2
|
||||
; GFX7-NEXT: s_add_u32 s8, s8, s7
|
||||
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX7-NEXT: s_addc_u32 s9, s9, 0
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0
|
||||
; GFX7-NEXT: v_add_i32_e32 v0, vcc, s2, v0
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX7-NEXT: buffer_store_dword v1, v0, s[8:11], 0 offen
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX7-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-WGP-LABEL: private_volatile_store_1:
|
||||
; GFX10-WGP: ; %bb.0: ; %entry
|
||||
; GFX10-WGP-NEXT: s_mov_b64 s[10:11], s[2:3]
|
||||
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[0:1]
|
||||
; GFX10-WGP-NEXT: s_clause 0x1
|
||||
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
||||
; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8
|
||||
; GFX10-WGP-NEXT: s_add_u32 s8, s8, s7
|
||||
; GFX10-WGP-NEXT: s_addc_u32 s9, s9, 0
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: s_load_dword s0, s[0:1], 0x0
|
||||
; GFX10-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s2
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX10-WGP-NEXT: buffer_store_dword v1, v0, s[8:11], 0 offen
|
||||
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-WGP-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-CU-LABEL: private_volatile_store_1:
|
||||
; GFX10-CU: ; %bb.0: ; %entry
|
||||
; GFX10-CU-NEXT: s_mov_b64 s[10:11], s[2:3]
|
||||
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[0:1]
|
||||
; GFX10-CU-NEXT: s_clause 0x1
|
||||
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
||||
; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8
|
||||
; GFX10-CU-NEXT: s_add_u32 s8, s8, s7
|
||||
; GFX10-CU-NEXT: s_addc_u32 s9, s9, 0
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: s_load_dword s0, s[0:1], 0x0
|
||||
; GFX10-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s2
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX10-CU-NEXT: buffer_store_dword v1, v0, s[8:11], 0 offen
|
||||
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-CU-NEXT: s_endpgm
|
||||
;
|
||||
; SKIP-CACHE-INV-LABEL: private_volatile_store_1:
|
||||
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
||||
; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[4:5]
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0
|
||||
; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s4, s3
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb
|
||||
; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s5, 0
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0
|
||||
; SKIP-CACHE-INV-NEXT: v_add_i32_e32 v0, vcc, s0, v0
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
|
||||
; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v0, s[4:7], 0 offen
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: s_endpgm
|
||||
i32 addrspace(1)* %in, i32 addrspace(5)* %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%val = load i32, i32 addrspace(1)* %in, align 4
%out.gep = getelementptr inbounds i32, i32 addrspace(5)* %out, i32 %tid
store volatile i32 %val, i32 addrspace(5)* %out.gep
ret void
}

declare i32 @llvm.amdgcn.workitem.id.x()
@ -154,8 +154,10 @@ define void @slsr1_0(i32 %b.arg, i32 %s.arg) #0 {
|
||||
; GFX9-NEXT: v_mul_u32_u24_e32 v3, v0, v1
|
||||
; GFX9-NEXT: v_and_b32_e32 v2, 0xffffff, v1
|
||||
; GFX9-NEXT: global_store_dword v[0:1], v3, off
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_mad_u32_u24 v0, v0, v1, v2
|
||||
; GFX9-NEXT: global_store_dword v[0:1], v0, off
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_add_u32_e32 v0, v0, v2
|
||||
; GFX9-NEXT: global_store_dword v[0:1], v0, off
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
|
@ -234,8 +234,8 @@ define amdgpu_kernel void @multi_if_break_loop(i32 %arg) #0 {
|
||||
; GCN-NEXT: ; %bb.8: ; %case0
|
||||
; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1
|
||||
; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0
|
||||
; GCN-NEXT: s_mov_b64 s[8:9], 0
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: s_mov_b64 s[8:9], 0
|
||||
; GCN-NEXT: v_cmp_ge_i32_e32 vcc, v0, v1
|
||||
; GCN-NEXT: s_andn2_b64 s[6:7], s[6:7], exec
|
||||
; GCN-NEXT: s_and_b64 s[10:11], vcc, exec
|
||||
|
@ -186,6 +186,7 @@ define amdgpu_kernel void @nested_loop_conditions(i64 addrspace(1)* nocapture %a
|
||||
; GCN-NEXT: BB1_6: ; %bb31
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: s_endpgm
|
||||
; IR-LABEL: @nested_loop_conditions(
|
||||
; IR-NEXT: bb:
|
||||
|
@ -48,6 +48,7 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache
|
||||
; MUBUF-NEXT: BB0_3: ; %bb.2
|
||||
; MUBUF-NEXT: v_mov_b32_e32 v0, 0
|
||||
; MUBUF-NEXT: global_store_dword v[0:1], v0, off
|
||||
; MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
; MUBUF-NEXT: s_endpgm
|
||||
;
|
||||
; FLATSCR-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align4:
|
||||
@ -83,6 +84,7 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache
|
||||
; FLATSCR-NEXT: BB0_3: ; %bb.2
|
||||
; FLATSCR-NEXT: v_mov_b32_e32 v0, 0
|
||||
; FLATSCR-NEXT: global_store_dword v[0:1], v0, off
|
||||
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
; FLATSCR-NEXT: s_endpgm
|
||||
|
||||
entry:
|
||||
@ -151,6 +153,7 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache
|
||||
; MUBUF-NEXT: BB1_2: ; %bb.1
|
||||
; MUBUF-NEXT: v_mov_b32_e32 v0, 0
|
||||
; MUBUF-NEXT: global_store_dword v[0:1], v0, off
|
||||
; MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
; MUBUF-NEXT: s_endpgm
|
||||
;
|
||||
; FLATSCR-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align64:
|
||||
@ -181,6 +184,7 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache
|
||||
; FLATSCR-NEXT: BB1_2: ; %bb.1
|
||||
; FLATSCR-NEXT: v_mov_b32_e32 v0, 0
|
||||
; FLATSCR-NEXT: global_store_dword v[0:1], v0, off
|
||||
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
; FLATSCR-NEXT: s_endpgm
|
||||
entry:
|
||||
%cond = icmp eq i32 %arg.cond, 0
|
||||
@ -243,9 +247,9 @@ define void @func_non_entry_block_static_alloca_align4(i32 addrspace(1)* %out, i
|
||||
; MUBUF-NEXT: s_or_b64 exec, exec, s[4:5]
|
||||
; MUBUF-NEXT: v_mov_b32_e32 v0, 0
|
||||
; MUBUF-NEXT: global_store_dword v[0:1], v0, off
|
||||
; MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
; MUBUF-NEXT: s_sub_u32 s32, s32, 0x400
|
||||
; MUBUF-NEXT: s_mov_b32 s33, s7
|
||||
; MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
; MUBUF-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; FLATSCR-LABEL: func_non_entry_block_static_alloca_align4:
|
||||
@ -280,9 +284,9 @@ define void @func_non_entry_block_static_alloca_align4(i32 addrspace(1)* %out, i
|
||||
; FLATSCR-NEXT: s_or_b64 exec, exec, s[0:1]
|
||||
; FLATSCR-NEXT: v_mov_b32_e32 v0, 0
|
||||
; FLATSCR-NEXT: global_store_dword v[0:1], v0, off
|
||||
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
; FLATSCR-NEXT: s_sub_u32 s32, s32, 16
|
||||
; FLATSCR-NEXT: s_mov_b32 s33, s5
|
||||
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
; FLATSCR-NEXT: s_setpc_b64 s[30:31]
|
||||
|
||||
entry:
|
||||
@ -342,9 +346,9 @@ define void @func_non_entry_block_static_alloca_align64(i32 addrspace(1)* %out,
|
||||
; MUBUF-NEXT: s_or_b64 exec, exec, s[4:5]
|
||||
; MUBUF-NEXT: v_mov_b32_e32 v0, 0
|
||||
; MUBUF-NEXT: global_store_dword v[0:1], v0, off
|
||||
; MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
; MUBUF-NEXT: s_sub_u32 s32, s32, 0x2000
|
||||
; MUBUF-NEXT: s_mov_b32 s33, s7
|
||||
; MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
; MUBUF-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; FLATSCR-LABEL: func_non_entry_block_static_alloca_align64:
|
||||
@ -374,9 +378,9 @@ define void @func_non_entry_block_static_alloca_align64(i32 addrspace(1)* %out,
|
||||
; FLATSCR-NEXT: s_or_b64 exec, exec, s[0:1]
|
||||
; FLATSCR-NEXT: v_mov_b32_e32 v0, 0
|
||||
; FLATSCR-NEXT: global_store_dword v[0:1], v0, off
|
||||
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
; FLATSCR-NEXT: s_sub_u32 s32, s32, 0x80
|
||||
; FLATSCR-NEXT: s_mov_b32 s33, s3
|
||||
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
; FLATSCR-NEXT: s_setpc_b64 s[30:31]
|
||||
entry:
|
||||
%cond = icmp eq i32 %arg.cond, 0
|
||||
|
@ -34,7 +34,7 @@ define amdgpu_kernel void @store_to_inttoptr() #0 {
|
||||
; GCN-LABEL: {{^}}load_from_undef:
|
||||
; OPT-DAG: s_mov_b64 s{{\[}}[[RSRC_LO:[0-9]+]]:{{[0-9]+\]}}, s[0:1]
|
||||
; OPT-DAG: s_mov_b64 s{{\[[0-9]+}}:[[RSRC_HI:[0-9]+]]{{\]}}, s[2:3]
|
||||
; OPT: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, 0 offen{{$}}
|
||||
; OPT: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, 0 offen glc{{$}}
|
||||
define amdgpu_kernel void @load_from_undef() #0 {
|
||||
%ld = load volatile i32, i32 addrspace(5)* undef
|
||||
ret void
|
||||
@ -43,7 +43,7 @@ define amdgpu_kernel void @load_from_undef() #0 {
|
||||
; GCN-LABEL: {{^}}load_from_inttoptr:
|
||||
; OPT-DAG: s_mov_b64 s{{\[}}[[RSRC_LO:[0-9]+]]:{{[0-9]+\]}}, s[0:1]
|
||||
; OPT-DAG: s_mov_b64 s{{\[[0-9]+}}:[[RSRC_HI:[0-9]+]]{{\]}}, s[2:3]
|
||||
; OPT: buffer_load_dword v{{[0-9]+}}, off, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, 0 offset:124{{$}}
|
||||
; OPT: buffer_load_dword v{{[0-9]+}}, off, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, 0 offset:124 glc{{$}}
|
||||
define amdgpu_kernel void @load_from_inttoptr() #0 {
|
||||
%ld = load volatile i32, i32 addrspace(5)* inttoptr (i32 124 to i32 addrspace(5)*)
|
||||
ret void
|
||||
|
@ -44,7 +44,7 @@ define amdgpu_kernel void @rsq_f32_sgpr(float addrspace(1)* noalias %out, float
|
||||
; NOTE: c * rcp( sqrt(a) * b ) is generated when we move rcp generation to AMDGPUCodeGenPrepare.
|
||||
|
||||
; SI-LABEL: @rsqrt_fmul
|
||||
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
|
||||
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
|
||||
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8
|
||||
|
||||
|
@ -24,19 +24,19 @@ define amdgpu_kernel void @select_f16(
|
||||
; SI-NEXT: s_mov_b32 s11, s3
|
||||
; SI-NEXT: s_mov_b32 s14, s2
|
||||
; SI-NEXT: s_mov_b32 s15, s3
|
||||
; SI-NEXT: buffer_load_ushort v0, off, s[16:19], 0
|
||||
; SI-NEXT: buffer_load_ushort v1, off, s[20:23], 0
|
||||
; SI-NEXT: buffer_load_ushort v2, off, s[8:11], 0
|
||||
; SI-NEXT: buffer_load_ushort v3, off, s[12:15], 0
|
||||
; SI-NEXT: buffer_load_ushort v0, off, s[16:19], 0 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: buffer_load_ushort v1, off, s[20:23], 0 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: buffer_load_ushort v2, off, s[8:11], 0 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: buffer_load_ushort v3, off, s[12:15], 0 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: s_mov_b32 s0, s4
|
||||
; SI-NEXT: s_mov_b32 s1, s5
|
||||
; SI-NEXT: s_waitcnt vmcnt(3)
|
||||
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
|
||||
; SI-NEXT: s_waitcnt vmcnt(2)
|
||||
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
|
||||
; SI-NEXT: s_waitcnt vmcnt(1)
|
||||
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
|
||||
; SI-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
|
||||
; SI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
|
||||
@ -65,15 +65,17 @@ define amdgpu_kernel void @select_f16(
|
||||
; VI-NEXT: s_mov_b32 s11, s3
|
||||
; VI-NEXT: s_mov_b32 s14, s2
|
||||
; VI-NEXT: s_mov_b32 s15, s3
|
||||
; VI-NEXT: buffer_load_ushort v0, off, s[16:19], 0
|
||||
; VI-NEXT: buffer_load_ushort v1, off, s[20:23], 0
|
||||
; VI-NEXT: buffer_load_ushort v2, off, s[8:11], 0
|
||||
; VI-NEXT: buffer_load_ushort v3, off, s[12:15], 0
|
||||
; VI-NEXT: buffer_load_ushort v0, off, s[16:19], 0 glc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: buffer_load_ushort v1, off, s[20:23], 0 glc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: buffer_load_ushort v2, off, s[8:11], 0 glc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: buffer_load_ushort v3, off, s[12:15], 0 glc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: s_mov_b32 s0, s4
|
||||
; VI-NEXT: s_mov_b32 s1, s5
|
||||
; VI-NEXT: s_waitcnt vmcnt(2)
|
||||
; VI-NEXT: v_cmp_lt_f16_e32 vcc, v0, v1
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
|
||||
; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
|
||||
; VI-NEXT: s_endpgm
|
||||
@ -112,16 +114,16 @@ define amdgpu_kernel void @select_f16_imm_a(
|
||||
; SI-NEXT: s_mov_b32 s19, s11
|
||||
; SI-NEXT: s_mov_b32 s6, s10
|
||||
; SI-NEXT: s_mov_b32 s7, s11
|
||||
; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0
|
||||
; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0
|
||||
; SI-NEXT: buffer_load_ushort v2, off, s[4:7], 0
|
||||
; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: s_mov_b32 s8, s0
|
||||
; SI-NEXT: s_mov_b32 s9, s1
|
||||
; SI-NEXT: s_waitcnt vmcnt(2)
|
||||
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
|
||||
; SI-NEXT: s_waitcnt vmcnt(1)
|
||||
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
|
||||
; SI-NEXT: v_cmp_lt_f32_e32 vcc, 0.5, v0
|
||||
; SI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
|
||||
@ -147,14 +149,15 @@ define amdgpu_kernel void @select_f16_imm_a(
|
||||
; VI-NEXT: s_mov_b32 s19, s11
|
||||
; VI-NEXT: s_mov_b32 s6, s10
|
||||
; VI-NEXT: s_mov_b32 s7, s11
|
||||
; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0
|
||||
; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0
|
||||
; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0
|
||||
; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: s_mov_b32 s8, s0
|
||||
; VI-NEXT: s_mov_b32 s9, s1
|
||||
; VI-NEXT: s_waitcnt vmcnt(2)
|
||||
; VI-NEXT: v_cmp_lt_f16_e32 vcc, 0.5, v0
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
|
||||
; VI-NEXT: buffer_store_short v0, off, s[8:11], 0
|
||||
; VI-NEXT: s_endpgm
|
||||
@ -191,16 +194,16 @@ define amdgpu_kernel void @select_f16_imm_b(
|
||||
; SI-NEXT: s_mov_b32 s19, s11
|
||||
; SI-NEXT: s_mov_b32 s6, s10
|
||||
; SI-NEXT: s_mov_b32 s7, s11
|
||||
; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0
|
||||
; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0
|
||||
; SI-NEXT: buffer_load_ushort v2, off, s[4:7], 0
|
||||
; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: s_mov_b32 s8, s0
|
||||
; SI-NEXT: s_mov_b32 s9, s1
|
||||
; SI-NEXT: s_waitcnt vmcnt(2)
|
||||
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
|
||||
; SI-NEXT: s_waitcnt vmcnt(1)
|
||||
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
|
||||
; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0.5, v0
|
||||
; SI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
|
||||
@ -226,14 +229,15 @@ define amdgpu_kernel void @select_f16_imm_b(
|
||||
; VI-NEXT: s_mov_b32 s19, s11
|
||||
; VI-NEXT: s_mov_b32 s6, s10
|
||||
; VI-NEXT: s_mov_b32 s7, s11
|
||||
; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0
|
||||
; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0
|
||||
; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0
|
||||
; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: s_mov_b32 s8, s0
|
||||
; VI-NEXT: s_mov_b32 s9, s1
|
||||
; VI-NEXT: s_waitcnt vmcnt(2)
|
||||
; VI-NEXT: v_cmp_gt_f16_e32 vcc, 0.5, v0
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
|
||||
; VI-NEXT: buffer_store_short v0, off, s[8:11], 0
|
||||
; VI-NEXT: s_endpgm
|
||||
@ -270,16 +274,16 @@ define amdgpu_kernel void @select_f16_imm_c(
|
||||
; SI-NEXT: s_mov_b32 s19, s11
|
||||
; SI-NEXT: s_mov_b32 s6, s10
|
||||
; SI-NEXT: s_mov_b32 s7, s11
|
||||
; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0
|
||||
; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0
|
||||
; SI-NEXT: buffer_load_ushort v2, off, s[4:7], 0
|
||||
; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: s_mov_b32 s8, s0
|
||||
; SI-NEXT: s_mov_b32 s9, s1
|
||||
; SI-NEXT: s_waitcnt vmcnt(2)
|
||||
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
|
||||
; SI-NEXT: s_waitcnt vmcnt(1)
|
||||
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
|
||||
; SI-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v1
|
||||
; SI-NEXT: v_cndmask_b32_e32 v0, 0.5, v2, vcc
|
||||
@ -305,15 +309,16 @@ define amdgpu_kernel void @select_f16_imm_c(
|
||||
; VI-NEXT: s_mov_b32 s19, s11
|
||||
; VI-NEXT: s_mov_b32 s6, s10
|
||||
; VI-NEXT: s_mov_b32 s7, s11
|
||||
; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0
|
||||
; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0
|
||||
; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0
|
||||
; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v3, 0x3800
|
||||
; VI-NEXT: s_mov_b32 s8, s0
|
||||
; VI-NEXT: s_mov_b32 s9, s1
|
||||
; VI-NEXT: s_waitcnt vmcnt(1)
|
||||
; VI-NEXT: v_cmp_nlt_f16_e32 vcc, v0, v1
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
|
||||
; VI-NEXT: buffer_store_short v0, off, s[8:11], 0
|
||||
; VI-NEXT: s_endpgm
|
||||
@ -350,16 +355,16 @@ define amdgpu_kernel void @select_f16_imm_d(
|
||||
; SI-NEXT: s_mov_b32 s19, s11
|
||||
; SI-NEXT: s_mov_b32 s6, s10
|
||||
; SI-NEXT: s_mov_b32 s7, s11
|
||||
; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0
|
||||
; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0
|
||||
; SI-NEXT: buffer_load_ushort v2, off, s[4:7], 0
|
||||
; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: s_mov_b32 s8, s0
|
||||
; SI-NEXT: s_mov_b32 s9, s1
|
||||
; SI-NEXT: s_waitcnt vmcnt(2)
|
||||
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
|
||||
; SI-NEXT: s_waitcnt vmcnt(1)
|
||||
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
|
||||
; SI-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
|
||||
; SI-NEXT: v_cndmask_b32_e32 v0, 0.5, v2, vcc
|
||||
@ -385,15 +390,16 @@ define amdgpu_kernel void @select_f16_imm_d(
|
||||
; VI-NEXT: s_mov_b32 s19, s11
|
||||
; VI-NEXT: s_mov_b32 s6, s10
|
||||
; VI-NEXT: s_mov_b32 s7, s11
|
||||
; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0
|
||||
; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0
|
||||
; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0
|
||||
; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v3, 0x3800
|
||||
; VI-NEXT: s_mov_b32 s8, s0
|
||||
; VI-NEXT: s_mov_b32 s9, s1
|
||||
; VI-NEXT: s_waitcnt vmcnt(1)
|
||||
; VI-NEXT: v_cmp_lt_f16_e32 vcc, v0, v1
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
|
||||
; VI-NEXT: buffer_store_short v0, off, s[8:11], 0
|
||||
; VI-NEXT: s_endpgm
|
||||
|
@ -294,11 +294,12 @@ define amdgpu_kernel void @shl_i16_computed_amount(i16 addrspace(1)* %out, i16 a
|
||||
; GCN-NEXT: s_mov_b32 s14, 0
|
||||
; GCN-NEXT: s_mov_b32 s15, s3
|
||||
; GCN-NEXT: s_mov_b64 s[12:13], s[6:7]
|
||||
; GCN-NEXT: buffer_load_ushort v2, off, s[8:11], 0
|
||||
; GCN-NEXT: buffer_load_ushort v0, v[0:1], s[12:15], 0 addr64 offset:2
|
||||
; GCN-NEXT: buffer_load_ushort v2, off, s[8:11], 0 glc
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: buffer_load_ushort v0, v[0:1], s[12:15], 0 addr64 offset:2 glc
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: s_mov_b32 s0, s4
|
||||
; GCN-NEXT: s_mov_b32 s1, s5
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0
|
||||
; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GCN-NEXT: v_lshl_b32_e32 v0, v2, v0
|
||||
@ -1640,6 +1641,7 @@ define amdgpu_kernel void @test_mul2(i32 %p) {
|
||||
; GCN-NEXT: s_lshl_b32 s0, s0, 1
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: s_endpgm
|
||||
;
|
||||
; EG-LABEL: test_mul2:
|
||||
|
@ -82,15 +82,17 @@ define amdgpu_kernel void @v_test_i32_x_sub_64_multi_use(i32 addrspace(1)* %out,
|
||||
; SI-NEXT: v_mov_b32_e32 v1, 0
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
|
||||
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
|
||||
; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
|
||||
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
|
||||
; SI-NEXT: s_waitcnt vmcnt(1)
|
||||
; SI-NEXT: v_subrev_i32_e32 v2, vcc, 64, v2
|
||||
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
|
||||
; SI-NEXT: v_subrev_i32_e32 v2, vcc, 64, v2
|
||||
; SI-NEXT: v_subrev_i32_e32 v3, vcc, 64, v3
|
||||
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: buffer_store_dword v3, v[0:1], s[0:3], 0 addr64
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: s_endpgm
|
||||
;
|
||||
; VI-LABEL: v_test_i32_x_sub_64_multi_use:
|
||||
@ -101,17 +103,19 @@ define amdgpu_kernel void @v_test_i32_x_sub_64_multi_use(i32 addrspace(1)* %out,
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s3
|
||||
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
|
||||
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; VI-NEXT: flat_load_dword v3, v[0:1]
|
||||
; VI-NEXT: flat_load_dword v4, v[0:1]
|
||||
; VI-NEXT: flat_load_dword v3, v[0:1] glc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: flat_load_dword v4, v[0:1] glc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
|
||||
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; VI-NEXT: s_waitcnt vmcnt(1)
|
||||
; VI-NEXT: v_subrev_u32_e32 v2, vcc, 64, v3
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_subrev_u32_e32 v3, vcc, 64, v4
|
||||
; VI-NEXT: flat_store_dword v[0:1], v2
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: flat_store_dword v[0:1], v3
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: v_test_i32_x_sub_64_multi_use:
|
||||
@ -119,14 +123,16 @@ define amdgpu_kernel void @v_test_i32_x_sub_64_multi_use(i32 addrspace(1)* %out,
|
||||
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
|
||||
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
|
||||
; GFX9-NEXT: global_load_dword v2, v0, s[2:3]
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(1)
|
||||
; GFX9-NEXT: v_subrev_u32_e32 v1, 64, v1
|
||||
; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_subrev_u32_e32 v1, 64, v1
|
||||
; GFX9-NEXT: v_subrev_u32_e32 v2, 64, v2
|
||||
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: global_store_dword v0, v2, s[0:1]
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-LABEL: v_test_i32_x_sub_64_multi_use:
|
||||
@ -134,15 +140,16 @@ define amdgpu_kernel void @v_test_i32_x_sub_64_multi_use(i32 addrspace(1)* %out,
|
||||
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
|
||||
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: s_clause 0x1
|
||||
; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
|
||||
; GFX10-NEXT: global_load_dword v2, v0, s[2:3]
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(1)
|
||||
; GFX10-NEXT: v_subrev_nc_u32_e32 v1, 64, v1
|
||||
; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: v_subrev_nc_u32_e32 v1, 64, v1
|
||||
; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 64, v2
|
||||
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: global_store_dword v0, v2, s[0:1]
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: s_endpgm
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%tid.ext = sext i32 %tid to i64
|
||||
@ -804,15 +811,17 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_multi_use(i16 addrspace(1)* %out,
|
||||
; SI-NEXT: v_mov_b32_e32 v1, 0
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
|
||||
; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64
|
||||
; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64
|
||||
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
|
||||
; SI-NEXT: s_waitcnt vmcnt(1)
|
||||
; SI-NEXT: v_subrev_i32_e32 v2, vcc, 64, v2
|
||||
; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
|
||||
; SI-NEXT: v_subrev_i32_e32 v2, vcc, 64, v2
|
||||
; SI-NEXT: v_subrev_i32_e32 v3, vcc, 64, v3
|
||||
; SI-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: buffer_store_short v3, v[0:1], s[0:3], 0 addr64
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: s_endpgm
|
||||
;
|
||||
; VI-LABEL: v_test_i16_x_sub_64_multi_use:
|
||||
@ -823,17 +832,19 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_multi_use(i16 addrspace(1)* %out,
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s3
|
||||
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
|
||||
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; VI-NEXT: flat_load_ushort v3, v[0:1]
|
||||
; VI-NEXT: flat_load_ushort v4, v[0:1]
|
||||
; VI-NEXT: flat_load_ushort v3, v[0:1] glc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: flat_load_ushort v4, v[0:1] glc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; VI-NEXT: s_waitcnt vmcnt(1)
|
||||
; VI-NEXT: v_subrev_u16_e32 v2, 64, v3
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_subrev_u16_e32 v3, 64, v4
|
||||
; VI-NEXT: flat_store_short v[0:1], v2
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: flat_store_short v[0:1], v3
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: v_test_i16_x_sub_64_multi_use:
|
||||
@ -841,14 +852,16 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_multi_use(i16 addrspace(1)* %out,
|
||||
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
|
||||
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: global_load_ushort v1, v0, s[2:3]
|
||||
; GFX9-NEXT: global_load_ushort v2, v0, s[2:3]
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(1)
|
||||
; GFX9-NEXT: v_subrev_u16_e32 v1, 64, v1
|
||||
; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_subrev_u16_e32 v1, 64, v1
|
||||
; GFX9-NEXT: v_subrev_u16_e32 v2, 64, v2
|
||||
; GFX9-NEXT: global_store_short v0, v1, s[0:1]
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: global_store_short v0, v2, s[0:1]
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-LABEL: v_test_i16_x_sub_64_multi_use:
|
||||
@ -856,15 +869,16 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_multi_use(i16 addrspace(1)* %out,
|
||||
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
|
||||
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: s_clause 0x1
|
||||
; GFX10-NEXT: global_load_ushort v1, v0, s[2:3]
|
||||
; GFX10-NEXT: global_load_ushort v2, v0, s[2:3]
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(1)
|
||||
; GFX10-NEXT: v_sub_nc_u16_e64 v1, v1, 64
|
||||
; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: v_sub_nc_u16_e64 v1, v1, 64
|
||||
; GFX10-NEXT: v_sub_nc_u16_e64 v2, v2, 64
|
||||
; GFX10-NEXT: global_store_short v0, v1, s[0:1]
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: global_store_short v0, v2, s[0:1]
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: s_endpgm
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%tid.ext = sext i32 %tid to i64
|
||||
|
@ -212,6 +212,7 @@ define amdgpu_kernel void @loop_land_info_assert(i32 %c0, i32 %c1, i32 %c2, i32
|
||||
; SI-NEXT: ; in Loop: Header=BB3_4 Depth=1
|
||||
; SI-NEXT: s_mov_b64 s[12:13], 0
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: s_branch BB3_1
|
||||
; SI-NEXT: BB3_8: ; %loop.exit.guard4
|
||||
; SI-NEXT: ; in Loop: Header=BB3_4 Depth=1
|
||||
@ -284,6 +285,7 @@ define amdgpu_kernel void @loop_land_info_assert(i32 %c0, i32 %c1, i32 %c2, i32
|
||||
; FLAT-NEXT: ; in Loop: Header=BB3_4 Depth=1
|
||||
; FLAT-NEXT: s_mov_b64 s[12:13], 0
|
||||
; FLAT-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; FLAT-NEXT: s_waitcnt vmcnt(0)
|
||||
; FLAT-NEXT: s_branch BB3_1
|
||||
; FLAT-NEXT: BB3_8: ; %loop.exit.guard4
|
||||
; FLAT-NEXT: ; in Loop: Header=BB3_4 Depth=1
|
||||
|
@ -351,15 +351,16 @@ define amdgpu_kernel void @s_sext_v4i8_to_v4i32(i32 addrspace(1)* %out, i32 %a)
|
||||
; SI-NEXT: s_sext_i32_i8 s0, s0
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; SI-NEXT: s_waitcnt expcnt(0)
|
||||
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s3
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; SI-NEXT: s_waitcnt expcnt(0)
|
||||
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s2
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; SI-NEXT: s_waitcnt expcnt(0)
|
||||
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s1
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: s_endpgm
|
||||
;
|
||||
; VI-LABEL: s_sext_v4i8_to_v4i32:
|
||||
@ -376,11 +377,15 @@ define amdgpu_kernel void @s_sext_v4i8_to_v4i32(i32 addrspace(1)* %out, i32 %a)
|
||||
; VI-NEXT: v_bfe_i32 v0, v0, 0, 8
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s0
|
||||
; VI-NEXT: buffer_store_dword v1, off, s[4:7], 0
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s2
|
||||
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s1
|
||||
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: s_endpgm
|
||||
%cast = bitcast i32 %a to <4 x i8>
|
||||
%ext = sext <4 x i8> %cast to <4 x i32>
|
||||
@ -417,9 +422,13 @@ define amdgpu_kernel void @v_sext_v4i8_to_v4i32(i32 addrspace(1)* %out, i32 addr
|
||||
; SI-NEXT: v_bfe_i32 v3, v0, 8, 8
|
||||
; SI-NEXT: v_bfe_i32 v0, v0, 0, 8
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: buffer_store_dword v3, off, s[0:3], 0
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: buffer_store_dword v1, off, s[0:3], 0
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: s_endpgm
|
||||
;
|
||||
; VI-LABEL: v_sext_v4i8_to_v4i32:
|
||||
@ -442,9 +451,13 @@ define amdgpu_kernel void @v_sext_v4i8_to_v4i32(i32 addrspace(1)* %out, i32 addr
|
||||
; VI-NEXT: v_bfe_i32 v0, v0, 0, 8
|
||||
; VI-NEXT: v_bfe_i32 v1, v1, 0, 8
|
||||
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: buffer_store_dword v1, off, s[0:3], 0
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: buffer_store_dword v3, off, s[0:3], 0
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: buffer_store_dword v2, off, s[0:3], 0
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: s_endpgm
|
||||
%a = load i32, i32 addrspace(1)* %in
|
||||
%cast = bitcast i32 %a to <4 x i8>
|
||||
@ -475,16 +488,17 @@ define amdgpu_kernel void @s_sext_v4i16_to_v4i32(i32 addrspace(1)* %out, i64 %a)
|
||||
; SI-NEXT: s_sext_i32_i16 s6, s6
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s6
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; SI-NEXT: s_waitcnt expcnt(0)
|
||||
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s5
|
||||
; SI-NEXT: s_sext_i32_i16 s7, s7
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; SI-NEXT: s_waitcnt expcnt(0)
|
||||
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s7
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; SI-NEXT: s_waitcnt expcnt(0)
|
||||
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s4
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: s_endpgm
|
||||
;
|
||||
; VI-LABEL: s_sext_v4i16_to_v4i32:
|
||||
@ -500,13 +514,17 @@ define amdgpu_kernel void @s_sext_v4i16_to_v4i32(i32 addrspace(1)* %out, i64 %a)
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s6
|
||||
; VI-NEXT: s_ashr_i32 s4, s7, 16
|
||||
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s5
|
||||
; VI-NEXT: s_sext_i32_i16 s7, s7
|
||||
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s7
|
||||
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s4
|
||||
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: s_endpgm
|
||||
%cast = bitcast i64 %a to <4 x i16>
|
||||
%ext = sext <4 x i16> %cast to <4 x i32>
|
||||
@ -541,9 +559,13 @@ define amdgpu_kernel void @v_sext_v4i16_to_v4i32(i32 addrspace(1)* %out, i64 add
|
||||
; SI-NEXT: v_bfe_i32 v0, v0, 0, 16
|
||||
; SI-NEXT: v_bfe_i32 v1, v1, 0, 16
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: buffer_store_dword v3, off, s[0:3], 0
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: buffer_store_dword v1, off, s[0:3], 0
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: s_endpgm
|
||||
;
|
||||
; VI-LABEL: v_sext_v4i16_to_v4i32:
|
||||
@ -565,9 +587,13 @@ define amdgpu_kernel void @v_sext_v4i16_to_v4i32(i32 addrspace(1)* %out, i64 add
|
||||
; VI-NEXT: v_ashrrev_i32_e32 v2, 16, v1
|
||||
; VI-NEXT: v_bfe_i32 v1, v1, 0, 16
|
||||
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: buffer_store_dword v3, off, s[0:3], 0
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: buffer_store_dword v1, off, s[0:3], 0
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: buffer_store_dword v2, off, s[0:3], 0
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: s_endpgm
|
||||
%a = load i64, i64 addrspace(1)* %in
|
||||
%cast = bitcast i64 %a to <4 x i16>
|
||||
|
@ -13,6 +13,7 @@ define amdgpu_kernel void @max_alignment_128() #0 {
|
||||
; VI-NEXT: v_mov_b32_e32 v0, 9
|
||||
; VI-NEXT: s_mov_b32 flat_scratch_lo, s5
|
||||
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:128
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: s_endpgm
|
||||
; VI-NEXT: .section .rodata,#alloc
|
||||
; VI-NEXT: .p2align 6
|
||||
@ -59,6 +60,7 @@ define amdgpu_kernel void @max_alignment_128() #0 {
|
||||
; GFX9-NEXT: s_addc_u32 s1, s1, 0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, 9
|
||||
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:128
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_endpgm
|
||||
; GFX9-NEXT: .section .rodata,#alloc
|
||||
; GFX9-NEXT: .p2align 6
|
||||
@ -112,6 +114,7 @@ define amdgpu_kernel void @stackrealign_attr() #1 {
|
||||
; VI-NEXT: v_mov_b32_e32 v0, 9
|
||||
; VI-NEXT: s_mov_b32 flat_scratch_lo, s5
|
||||
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: s_endpgm
|
||||
; VI-NEXT: .section .rodata,#alloc
|
||||
; VI-NEXT: .p2align 6
|
||||
@ -158,6 +161,7 @@ define amdgpu_kernel void @stackrealign_attr() #1 {
|
||||
; GFX9-NEXT: s_addc_u32 s1, s1, 0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, 9
|
||||
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_endpgm
|
||||
; GFX9-NEXT: .section .rodata,#alloc
|
||||
; GFX9-NEXT: .p2align 6
|
||||
@ -211,6 +215,7 @@ define amdgpu_kernel void @alignstack_attr() #2 {
|
||||
; VI-NEXT: v_mov_b32_e32 v0, 9
|
||||
; VI-NEXT: s_mov_b32 flat_scratch_lo, s5
|
||||
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: s_endpgm
|
||||
; VI-NEXT: .section .rodata,#alloc
|
||||
; VI-NEXT: .p2align 6
|
||||
@ -257,6 +262,7 @@ define amdgpu_kernel void @alignstack_attr() #2 {
|
||||
; GFX9-NEXT: s_addc_u32 s1, s1, 0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, 9
|
||||
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_endpgm
|
||||
; GFX9-NEXT: .section .rodata,#alloc
|
||||
; GFX9-NEXT: .p2align 6
|
||||
|
@ -165,6 +165,7 @@ define void @func_call_align1024_bp_gets_vgpr_spill(<32 x i32> %a, i32 %b) #0 {
|
||||
; GCN: s_mov_b32 s34, s32
|
||||
; GCN: v_mov_b32_e32 v32, 0
|
||||
; GCN: buffer_store_dword v32, off, s[0:3], s33 offset:1024
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s34
|
||||
; GCN-NEXT: s_add_u32 s32, s32, 0x30000
|
||||
|
||||
@ -237,10 +238,10 @@ define void @no_free_scratch_sgpr_for_bp_copy(<32 x i32> %a, i32 %b) #0 {
|
||||
; GCN-NEXT: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s34
|
||||
; GCN: v_readlane_b32 s34, [[VGPR_REG:v[0-9]+]], 0
|
||||
; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33 offset:128
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: ;;#ASMSTART
|
||||
; GCN-NEXT: ;;#ASMEND
|
||||
; GCN: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: s_setpc_b64 s[30:31]
|
||||
; GCN: s_setpc_b64 s[30:31]
|
||||
%local_val = alloca i32, align 128, addrspace(5)
store volatile i32 %b, i32 addrspace(5)* %local_val, align 128
; Use all clobberable registers, so BP has to spill to a VGPR.
@ -646,9 +646,13 @@ entry:
|
||||
; GCN-LABEL: {{^}}store_private_hi_v2i16_to_offset:
|
||||
; GCN: s_waitcnt
|
||||
; GFX900-MUBUF: buffer_store_dword
|
||||
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-MUBUF-NEXT: buffer_store_short_d16_hi v0, off, s[0:3], s32 offset:4094
|
||||
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-FLATSCR: scratch_store_dword
|
||||
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-FLATSCR-NEXT: scratch_store_short_d16_hi off, v0, s32 offset:4094
|
||||
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
define void @store_private_hi_v2i16_to_offset(i32 %arg) #0 {
|
||||
entry:
|
||||
%obj0 = alloca [10 x i32], align 4, addrspace(5)
|
||||
@ -665,9 +669,12 @@ entry:
|
||||
; GCN-LABEL: {{^}}store_private_hi_v2i16_i8_to_offset:
|
||||
; GCN: s_waitcnt
|
||||
; GFX900-MUBUF: buffer_store_dword
|
||||
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-MUBUF-NEXT: buffer_store_byte_d16_hi v0, off, s[0:3], s32 offset:4095
|
||||
; GFX900-FLATSCR: scratch_store_dword
|
||||
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-FLATSCR-NEXT: scratch_store_byte_d16_hi off, v0, s32 offset:4095
|
||||
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
define void @store_private_hi_v2i16_i8_to_offset(i32 %arg) #0 {
|
||||
entry:
|
||||
%obj0 = alloca [10 x i32], align 4, addrspace(5)
|
||||
|
@ -13,10 +13,11 @@ define amdgpu_kernel void @v_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: global_load_dword v0, v0, s[8:9]
; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dword v0, v0, s[8:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: v_pk_sub_i16 v0, v1, v0
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm

@ -33,11 +34,12 @@ define amdgpu_kernel void @v_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: flat_load_dword v1, v[2:3]
; VI-NEXT: flat_load_dword v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v1, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_sub_u16_e32 v2, v0, v1
; VI-NEXT: v_sub_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_or_b32_e32 v0, v2, v0

@ -163,11 +165,11 @@ define amdgpu_kernel void @v_test_sub_v2i16_constant(<2 x i16> addrspace(1)* %ou
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v0, v0, s[6:7]
; GFX9-NEXT: global_load_dword v0, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: s_mov_b32 s4, 0x1c8007b
; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_sub_i16 v0, v0, s4
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm

@ -180,11 +182,11 @@ define amdgpu_kernel void @v_test_sub_v2i16_constant(<2 x i16> addrspace(1)* %ou
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: flat_load_dword v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, 0xfffffe38
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u16_e32 v2, 0xff85, v0
; VI-NEXT: v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v2, v0

@ -208,11 +210,11 @@ define amdgpu_kernel void @v_test_sub_v2i16_neg_constant(<2 x i16> addrspace(1)*
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v0, v0, s[6:7]
; GFX9-NEXT: global_load_dword v0, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: s_mov_b32 s4, 0xfc21fcb3
; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_sub_i16 v0, v0, s4
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm

@ -225,11 +227,11 @@ define amdgpu_kernel void @v_test_sub_v2i16_neg_constant(<2 x i16> addrspace(1)*
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: flat_load_dword v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, 0x3df
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u16_e32 v2, 0x34d, v0
; VI-NEXT: v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v2, v0

@ -252,10 +254,10 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(<2 x i16> addrspace(1)*
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v0, v0, s[6:7]
; GFX9-NEXT: global_load_dword v0, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_sub_i16 v0, v0, -1 op_sel_hi:[1,0]
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm

@ -268,11 +270,11 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(<2 x i16> addrspace(1)*
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: flat_load_dword v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, 1
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u16_e32 v2, 1, v0
; VI-NEXT: v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v2, v0

@ -295,10 +297,10 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(<2 x i16> addrspac
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v0, v0, s[6:7]
; GFX9-NEXT: global_load_dword v0, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_sub_i16 v0, v0, 32
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm

@ -311,10 +313,10 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(<2 x i16> addrspac
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: flat_load_dword v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
; VI-NEXT: v_subrev_u16_e32 v0, 32, v0
; VI-NEXT: v_or_b32_e32 v0, v0, v1

@ -338,11 +340,11 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(<2 x i16> addrspace(
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v0, v0, s[6:7]
; GFX9-NEXT: global_load_dword v0, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: s_mov_b32 s4, 1.0
; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_sub_i16 v0, v0, s4
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm

@ -357,11 +359,11 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(<2 x i16> addrspace(
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: flat_load_dword v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, 0xffffc080
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0

@ -383,11 +385,12 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i32(<2 x i32> addrspace(1)
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: global_load_dword v0, v0, s[0:1]
; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dword v0, v0, s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_sub_i16 v0, v1, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0

@ -406,11 +409,12 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i32(<2 x i32> addrspace(1)
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v1, v[0:1]
; VI-NEXT: flat_load_dword v2, v[2:3]
; VI-NEXT: flat_load_dword v1, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v2, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_sub_u16_e32 v0, v1, v2
; VI-NEXT: v_sub_u16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0

@ -437,11 +441,12 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(<2 x i64> addrspace(1)
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_mov_b32_e32 v3, v1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v2, v0, s[6:7]
; GFX9-NEXT: global_load_dword v0, v0, s[0:1]
; GFX9-NEXT: global_load_dword v2, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dword v0, v0, s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_sub_i16 v2, v2, v0
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2
; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2

@ -460,13 +465,14 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(<2 x i64> addrspace(1)
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v4, v[0:1]
; VI-NEXT: flat_load_dword v2, v[2:3]
; VI-NEXT: flat_load_dword v4, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v2, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v3, v1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_sub_u16_e32 v0, v4, v2
; VI-NEXT: v_sub_u16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0

@ -491,11 +497,12 @@ define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i32(<2 x i32> addrspace(1)
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: global_load_dword v0, v0, s[0:1]
; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dword v0, v0, s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_sub_i16 v0, v1, v0
; GFX9-NEXT: v_ashrrev_i32_e32 v1, 16, v0
; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 16

@ -514,11 +521,12 @@ define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i32(<2 x i32> addrspace(1)
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: flat_load_dword v1, v[2:3]
; VI-NEXT: flat_load_dword v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v1, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_sub_u16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_sub_u16_e32 v0, v0, v1
; VI-NEXT: v_bfe_i32 v0, v0, 0, 16

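Every hunk in the block above shows the same pattern: the volatile loads in these tests now carry glc (global_load_dword ... glc on GFX9, flat_load_dword ... glc on VI), and each one is followed by its own s_waitcnt vmcnt(0) instead of a single shared wait. A minimal sketch of the kind of input that exercises this (hypothetical IR, not one of the tests touched by this patch):

define amdgpu_kernel void @volatile_global_load_sketch(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
  ; The volatile load is expected to be selected as a glc load followed by
  ; s_waitcnt vmcnt(0), matching the GFX9/VI checks above; the plain store
  ; is left unchanged.
  %v = load volatile i32, i32 addrspace(1)* %in, align 4
  store i32 %v, i32 addrspace(1)* %out, align 4
  ret void
}
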
@ -44,11 +44,12 @@ define amdgpu_ps float @valley_partially_undef_copy() #0 {
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_mov_b32 s3, 0xf000
; CHECK-NEXT: s_mov_b32 s2, -1
; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], 0
; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], 0
; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v2, 0x7fc00000
; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], 0
; CHECK-NEXT: s_waitcnt vmcnt(1)
; CHECK-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v1
; CHECK-NEXT: BB1_1: ; %bb9

@ -102,6 +103,7 @@ define amdgpu_kernel void @partially_undef_copy() #0 {
; CHECK-NEXT: s_mov_b32 s2, -1
; CHECK-NEXT: v_mov_b32_e32 v0, v6
; CHECK-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: v_nop
; CHECK-NEXT: ;;#ASMEND

@ -42,12 +42,16 @@ define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_a_a_b(float addrspace(
; GCN-LABEL: {{^}}test_use_s_v_s:
; GCN-DAG: s_load_dwordx2 s{{\[}}[[SA:[0-9]+]]:[[SB:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
; SI: buffer_load_dword [[VA0:v[0-9]+]]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_dword [[VA1:v[0-9]+]]
; SI-NEXT: s_waitcnt vmcnt(0)

; GCN-NOT: v_mov_b32

; VI: buffer_load_dword [[VA0:v[0-9]+]]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_load_dword [[VA1:v[0-9]+]]
; VI-NEXT: s_waitcnt vmcnt(0)

; GCN-NOT: v_mov_b32
; GCN: v_mov_b32_e32 [[VB:v[0-9]+]], s[[SB]]

@ -3,7 +3,7 @@
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -denormal-fp-math=ieee -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s

; GCN-LABEL: {{^}}mac_vvv:
; GCN: buffer_load_dword [[A:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0{{$}}
; GCN: buffer_load_dword [[A:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 glc{{$}}
; GCN: buffer_load_dword [[B:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:4
; GCN: buffer_load_dword [[C:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:8
; GCN: v_mac_f32_e32 [[C]], [[A]], [[B]]

@ -82,19 +82,19 @@ define amdgpu_kernel void @madak_f16_use_2(
; SI-NEXT: s_mov_b32 s11, s3
; SI-NEXT: s_mov_b32 s14, s2
; SI-NEXT: s_mov_b32 s15, s3
; SI-NEXT: buffer_load_ushort v0, off, s[16:19], 0
; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0
; SI-NEXT: buffer_load_ushort v2, off, s[12:15], 0
; SI-NEXT: buffer_load_ushort v0, off, s[16:19], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v2, off, s[12:15], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mov_b32_e32 v3, 0x41200000
; SI-NEXT: s_mov_b32 s0, s4
; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: s_mov_b32 s8, s6
; SI-NEXT: s_mov_b32 s9, s7
; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-NEXT: v_madak_f32 v1, v0, v1, 0x41200000
; SI-NEXT: v_mac_f32_e32 v3, v0, v2

@ -121,17 +121,18 @@ define amdgpu_kernel void @madak_f16_use_2(
; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_mov_b32 s14, s2
; VI-NEXT: s_mov_b32 s15, s3
; VI-NEXT: buffer_load_ushort v0, off, s[16:19], 0
; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0
; VI-NEXT: buffer_load_ushort v2, off, s[12:15], 0
; VI-NEXT: buffer_load_ushort v0, off, s[16:19], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_load_ushort v2, off, s[12:15], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v3, 0x4900
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_mov_b32 s8, s6
; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_madak_f16 v1, v0, v1, 0x4900
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mac_f16_e32 v3, v0, v2
; VI-NEXT: buffer_store_short v1, off, s[0:3], 0
; VI-NEXT: buffer_store_short v3, off, s[8:11], 0

@ -114,9 +114,9 @@ define amdgpu_kernel void @extract_insert_same_dynelt_v4f32(float addrspace(1)*
; GCN-NEXT: v_lshlrev_b32_e32 v1, 4, v0
; GCN-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; GCN-NEXT: v_mov_b32_e32 v5, v2
; GCN-NEXT: buffer_load_dwordx4 v[0:3], v[1:2], s[0:3], 0 addr64
; GCN-NEXT: s_mov_b64 s[6:7], s[2:3]
; GCN-NEXT: buffer_load_dwordx4 v[0:3], v[1:2], s[0:3], 0 addr64 glc
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_mov_b64 s[6:7], s[2:3]
; GCN-NEXT: v_mov_b32_e32 v0, s8
; GCN-NEXT: buffer_store_dword v0, v[4:5], s[4:7], 0 addr64
; GCN-NEXT: s_endpgm