1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-11-25 04:02:41 +01:00

RegAlloc: Allow targets to split register allocation

AMDGPU normally spills SGPRs to VGPRs. Previously, since all register
classes are handled at the same time, this was problematic. We don't
know ahead of time how many registers will be needed to be reserved to
handle the spilling. If no VGPRs were left for spilling, we would have
to try to spill to memory. If the spilled SGPRs were required for exec
mask manipulation, it is highly problematic because the lanes active
at the point of spill are not necessarily the same as at the restore
point.

Avoid this problem by fully allocating SGPRs in a separate regalloc
run from VGPRs. This way we know the exact number of VGPRs needed, and
can reserve them for a second run.  This fixes the most serious
issues, but it is still possible using inline asm to make all VGPRs
unavailable. Start erroring in the case where we ever would require
memory for an SGPR spill.

This is implemented by giving each regalloc pass a callback which
reports if a register class should be handled or not. A few passes
need some small changes to deal with leftover virtual registers.

In the AMDGPU implementation, a new pass is introduced to take the
place of PrologEpilogInserter for SGPR spills emitted during the first
run.

One disadvantage of this is currently StackSlotColoring is no longer
used for SGPR spills. It would need to be run again, which will
require more work.

Error if the standard -regalloc option is used. Introduce new separate
-sgpr-regalloc and -vgpr-regalloc flags, so the two runs can be
controlled individually. PBQP is not currently supported, so this also
prevents using the unhandled allocator.
This commit is contained in:
Matt Arsenault 2018-09-27 09:36:28 +10:00
parent 60fc4afb9a
commit 74be1319be
40 changed files with 1396 additions and 723 deletions

View File

@ -16,6 +16,8 @@
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/Discriminator.h"
#include "llvm/CodeGen/RegAllocCommon.h"
#include <functional>
#include <string>
@ -173,16 +175,20 @@ namespace llvm {
/// possible. It is best suited for debug code where live ranges are short.
///
FunctionPass *createFastRegisterAllocator();
FunctionPass *createFastRegisterAllocator(RegClassFilterFunc F,
bool ClearVirtRegs);
/// BasicRegisterAllocation Pass - This pass implements a degenerate global
/// register allocator using the basic regalloc framework.
///
FunctionPass *createBasicRegisterAllocator();
FunctionPass *createBasicRegisterAllocator(RegClassFilterFunc F);
/// Greedy register allocation pass - This pass implements a global register
/// allocator for optimized builds.
///
FunctionPass *createGreedyRegisterAllocator();
FunctionPass *createGreedyRegisterAllocator(RegClassFilterFunc F);
/// PBQPRegisterAllocation Pass - This pass implements the Partitioned Boolean
/// Quadratic Programming (PBQP) based register allocator.

View File

@ -0,0 +1,32 @@
//===- RegAllocCommon.h - Utilities shared between allocators ---*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_CODEGEN_REGALLOCCOMMON_H
#define LLVM_CODEGEN_REGALLOCCOMMON_H
#include <functional>
namespace llvm {
class TargetRegisterClass;
class TargetRegisterInfo;
/// Callback used by the register allocators to decide whether a given register
/// class should be handled in the current allocation run. RegAllocBase::enqueue
/// skips (leaves virtual) any register whose class the filter rejects, so a
/// later allocation run can pick it up.
typedef std::function<bool(const TargetRegisterInfo &TRI,
const TargetRegisterClass &RC)> RegClassFilterFunc;
/// Default register class filter function for register allocation. All virtual
/// registers should be allocated.
static inline bool allocateAllRegClasses(const TargetRegisterInfo &,
const TargetRegisterClass &) {
return true;
}
}
#endif // LLVM_CODEGEN_REGALLOCCOMMON_H

View File

@ -14,6 +14,7 @@
#ifndef LLVM_CODEGEN_REGALLOCREGISTRY_H
#define LLVM_CODEGEN_REGALLOCREGISTRY_H
#include "llvm/CodeGen/RegAllocCommon.h"
#include "llvm/CodeGen/MachinePassRegistry.h"
namespace llvm {

View File

@ -713,10 +713,15 @@ void LiveIntervals::addKillFlags(const VirtRegMap *VRM) {
if (LI.empty())
continue;
// Target may have not allocated this yet.
Register PhysReg = VRM->getPhys(Reg);
if (!PhysReg)
continue;
// Find the regunit intervals for the assigned register. They may overlap
// the virtual register live range, cancelling any kills.
RU.clear();
for (MCRegUnitIterator Unit(VRM->getPhys(Reg), TRI); Unit.isValid();
for (MCRegUnitIterator Unit(PhysReg, TRI); Unit.isValid();
++Unit) {
const LiveRange &RURange = getRegUnit(*Unit);
if (RURange.empty())

View File

@ -175,3 +175,21 @@ void RegAllocBase::postOptimization() {
}
DeadRemats.clear();
}
/// Add \p LI to the allocation queue, unless it already has a physreg (from an
/// earlier split allocation run) or its register class is filtered out by
/// ShouldAllocateClass, in which case it is deliberately left virtual.
void RegAllocBase::enqueue(LiveInterval *LI) {
  const Register Reg = LI->reg();
  assert(Reg.isVirtual() && "Can only enqueue virtual registers");

  // Already assigned in a previous allocation run; nothing to do here.
  if (VRM->hasPhys(Reg))
    return;

  const TargetRegisterClass &RC = *MRI->getRegClass(Reg);
  if (!ShouldAllocateClass(*TRI, RC)) {
    // Filtered class: leave the register virtual for a later run.
    LLVM_DEBUG(dbgs() << "Not enqueueing " << printReg(Reg, TRI)
                      << " in skipped register class\n");
    return;
  }

  LLVM_DEBUG(dbgs() << "Enqueuing " << printReg(Reg, TRI) << '\n');
  enqueueImpl(LI);
}

View File

@ -37,6 +37,7 @@
#define LLVM_LIB_CODEGEN_REGALLOCBASE_H
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/CodeGen/RegAllocCommon.h"
#include "llvm/CodeGen/RegisterClassInfo.h"
namespace llvm {
@ -67,6 +68,7 @@ protected:
LiveIntervals *LIS = nullptr;
LiveRegMatrix *Matrix = nullptr;
RegisterClassInfo RegClassInfo;
const RegClassFilterFunc ShouldAllocateClass;
/// Inst which is a def of an original reg and whose defs are already all
/// dead after remat is saved in DeadRemats. The deletion of such inst is
@ -74,7 +76,9 @@ protected:
/// always available for the remat of all the siblings of the original reg.
SmallPtrSet<MachineInstr *, 32> DeadRemats;
RegAllocBase() = default;
RegAllocBase(const RegClassFilterFunc F = allocateAllRegClasses) :
ShouldAllocateClass(F) {}
virtual ~RegAllocBase() = default;
// A RegAlloc pass should call this before allocatePhysRegs.
@ -92,7 +96,10 @@ protected:
virtual Spiller &spiller() = 0;
/// enqueue - Add VirtReg to the priority queue of unassigned registers.
virtual void enqueue(LiveInterval *LI) = 0;
virtual void enqueueImpl(LiveInterval *LI) = 0;
/// enqueue - Add VirtReg to the priority queue of unassigned registers.
void enqueue(LiveInterval *LI);
/// dequeue - Return the next unassigned register, or NULL.
virtual LiveInterval *dequeue() = 0;

View File

@ -76,7 +76,7 @@ class RABasic : public MachineFunctionPass,
void LRE_WillShrinkVirtReg(Register) override;
public:
RABasic();
RABasic(const RegClassFilterFunc F = allocateAllRegClasses);
/// Return the pass name.
StringRef getPassName() const override { return "Basic Register Allocator"; }
@ -88,7 +88,7 @@ public:
Spiller &spiller() override { return *SpillerInstance; }
void enqueue(LiveInterval *LI) override {
void enqueueImpl(LiveInterval *LI) override {
Queue.push(LI);
}
@ -171,7 +171,9 @@ void RABasic::LRE_WillShrinkVirtReg(Register VirtReg) {
enqueue(&LI);
}
/// Construct the basic allocator, forwarding the register class filter to
/// RegAllocBase (defaults to allocating all classes).
// NOTE: the scrape fused the pre-change ctor line with the new one; only the
// filtered-constructor form is kept here.
RABasic::RABasic(RegClassFilterFunc F)
    : MachineFunctionPass(ID), RegAllocBase(F) {}
void RABasic::getAnalysisUsage(AnalysisUsage &AU) const {
@ -332,7 +334,10 @@ bool RABasic::runOnMachineFunction(MachineFunction &mf) {
return true;
}
/// Create a basic allocator that handles every register class.
// NOTE: the scrape kept both the old and new version of this definition; the
// duplicate old lines are removed here.
FunctionPass *llvm::createBasicRegisterAllocator() {
  return new RABasic();
}

/// Create a basic allocator restricted to the classes accepted by \p F.
FunctionPass *llvm::createBasicRegisterAllocator(RegClassFilterFunc F) {
  return new RABasic(F);
}

View File

@ -27,6 +27,7 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegAllocCommon.h"
#include "llvm/CodeGen/RegAllocRegistry.h"
#include "llvm/CodeGen/RegisterClassInfo.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
@ -69,7 +70,13 @@ namespace {
public:
static char ID;
RegAllocFast() : MachineFunctionPass(ID), StackSlotForVirtReg(-1) {}
RegAllocFast(const RegClassFilterFunc F = allocateAllRegClasses,
bool ClearVirtRegs_ = true) :
MachineFunctionPass(ID),
ShouldAllocateClass(F),
StackSlotForVirtReg(-1),
ClearVirtRegs(ClearVirtRegs_) {
}
private:
MachineFrameInfo *MFI;
@ -77,6 +84,7 @@ namespace {
const TargetRegisterInfo *TRI;
const TargetInstrInfo *TII;
RegisterClassInfo RegClassInfo;
const RegClassFilterFunc ShouldAllocateClass;
/// Basic block currently being allocated.
MachineBasicBlock *MBB;
@ -84,6 +92,8 @@ namespace {
/// Maps virtual regs to the frame index where these values are spilled.
IndexedMap<int, VirtReg2IndexFunctor> StackSlotForVirtReg;
bool ClearVirtRegs;
/// Everything we know about a live virtual register.
struct LiveReg {
MachineInstr *LastUse = nullptr; ///< Last instr to use reg.
@ -213,8 +223,12 @@ namespace {
}
// NOTE: the scrape left the pre-change "return ...set(" line fused into this
// method (unbalanced paren); only the new conditional form is kept.
MachineFunctionProperties getSetProperties() const override {
  // Only claim NoVRegs when this run actually clears every virtual register;
  // a filtered run leaves registers of the skipped classes virtual.
  if (ClearVirtRegs) {
    return MachineFunctionProperties().set(
        MachineFunctionProperties::Property::NoVRegs);
  }
  return MachineFunctionProperties();
}
MachineFunctionProperties getClearedProperties() const override {
@ -1539,9 +1553,11 @@ bool RegAllocFast::runOnMachineFunction(MachineFunction &MF) {
for (MachineBasicBlock &MBB : MF)
allocateBasicBlock(MBB);
// All machine operands and other references to virtual registers have been
// replaced. Remove the virtual registers.
MRI->clearVirtRegs();
if (ClearVirtRegs) {
// All machine operands and other references to virtual registers have been
// replaced. Remove the virtual registers.
MRI->clearVirtRegs();
}
StackSlotForVirtReg.clear();
LiveDbgValueMap.clear();
@ -1551,3 +1567,9 @@ bool RegAllocFast::runOnMachineFunction(MachineFunction &MF) {
/// Create a fast allocator that handles every register class and clears all
/// virtual registers.
FunctionPass *llvm::createFastRegisterAllocator() {
  return new RegAllocFast();
}

/// Create a fast allocator restricted to the classes accepted by \p Ftor.
/// When \p ClearVirtRegs is false, leftover virtual registers survive the run
/// so a later allocation pass can handle the skipped classes.
// Use the RegClassFilterFunc typedef (RegAllocCommon.h) rather than spelling
// out the std::function type, matching the declaration in Passes.h.
FunctionPass *llvm::createFastRegisterAllocator(RegClassFilterFunc Ftor,
                                                bool ClearVirtRegs) {
  return new RegAllocFast(Ftor, ClearVirtRegs);
}

View File

@ -412,7 +412,7 @@ class RAGreedy : public MachineFunctionPass,
ArrayRef<uint8_t> RegCosts;
public:
RAGreedy();
RAGreedy(const RegClassFilterFunc F = allocateAllRegClasses);
/// Return the pass name.
StringRef getPassName() const override { return "Greedy Register Allocator"; }
@ -421,7 +421,7 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override;
void releaseMemory() override;
Spiller &spiller() override { return *SpillerInstance; }
void enqueue(LiveInterval *LI) override;
void enqueueImpl(LiveInterval *LI) override;
LiveInterval *dequeue() override;
MCRegister selectOrSplit(LiveInterval &,
SmallVectorImpl<Register> &) override;
@ -636,7 +636,22 @@ FunctionPass* llvm::createGreedyRegisterAllocator() {
return new RAGreedy();
}
RAGreedy::RAGreedy(): MachineFunctionPass(ID) {
// Use the RegClassFilterFunc typedef (RegAllocCommon.h) rather than spelling
// out the std::function type, matching the declaration in Passes.h.
namespace llvm {
FunctionPass *createGreedyRegisterAllocator(RegClassFilterFunc Ftor);
}

/// Create a greedy allocator restricted to the classes accepted by \p Ftor.
FunctionPass *llvm::createGreedyRegisterAllocator(RegClassFilterFunc Ftor) {
  return new RAGreedy(Ftor);
}
/// Construct the greedy allocator, forwarding the register class filter to
/// RegAllocBase (defaults to allocating all classes).
RAGreedy::RAGreedy(RegClassFilterFunc F)
    : MachineFunctionPass(ID), RegAllocBase(F) {}
void RAGreedy::getAnalysisUsage(AnalysisUsage &AU) const {
@ -693,7 +708,7 @@ void RAGreedy::LRE_WillShrinkVirtReg(Register VirtReg) {
// Register is assigned, put it back on the queue for reassignment.
LiveInterval &LI = LIS->getInterval(VirtReg);
Matrix->unassign(LI);
enqueue(&LI);
RegAllocBase::enqueue(&LI);
}
void RAGreedy::LRE_DidCloneVirtReg(Register New, Register Old) {
@ -716,7 +731,7 @@ void RAGreedy::releaseMemory() {
GlobalCand.clear();
}
void RAGreedy::enqueue(LiveInterval *LI) { enqueue(Queue, LI); }
void RAGreedy::enqueueImpl(LiveInterval *LI) { enqueue(Queue, LI); }
void RAGreedy::enqueue(PQueue &CurQueue, LiveInterval *LI) {
// Prioritize live ranges by size, assigning larger ranges first.
@ -2936,7 +2951,12 @@ void RAGreedy::tryHintRecoloring(LiveInterval &VirtReg) {
if (Register::isPhysicalRegister(Reg))
continue;
assert(VRM->hasPhys(Reg) && "We have unallocated variable!!");
// This may be a skipped class
if (!VRM->hasPhys(Reg)) {
assert(!ShouldAllocateClass(*TRI, *MRI->getRegClass(Reg)) &&
"We have an unallocated variable which should have been handled");
continue;
}
// Get the live interval mapped with this virtual register to be able
// to check for the interference with the new color.

View File

@ -1333,8 +1333,8 @@ FunctionPass *TargetPassConfig::createRegAllocPass(bool Optimized) {
}
bool TargetPassConfig::addRegAssignAndRewriteFast() {
if (RegAlloc != &useDefaultRegisterAllocator &&
RegAlloc != &createFastRegisterAllocator)
if (RegAlloc != (RegisterRegAlloc::FunctionPassCtor)&useDefaultRegisterAllocator &&
RegAlloc != (RegisterRegAlloc::FunctionPassCtor)&createFastRegisterAllocator)
report_fatal_error("Must use fast (default) register allocator for unoptimized regalloc.");
addPass(createRegAllocPass(false));

View File

@ -32,6 +32,8 @@
#include "llvm/CodeGen/GlobalISel/Localizer.h"
#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
#include "llvm/CodeGen/MIRParser/MIParser.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/RegAllocRegistry.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/PassManager.h"
@ -52,6 +54,115 @@
using namespace llvm;
namespace {
/// Allocator registry backing the -sgpr-regalloc option (parsed via
/// RegisterPassParser below).
class SGPRRegisterRegAlloc : public RegisterRegAllocBase<SGPRRegisterRegAlloc> {
public:
SGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
: RegisterRegAllocBase(N, D, C) {}
};

/// Allocator registry backing the -vgpr-regalloc option (parsed via
/// RegisterPassParser below).
class VGPRRegisterRegAlloc : public RegisterRegAllocBase<VGPRRegisterRegAlloc> {
public:
VGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
: RegisterRegAllocBase(N, D, C) {}
};
/// Register class filter for the first allocation run: accept only SGPR
/// classes.
static bool onlyAllocateSGPRs(const TargetRegisterInfo &TRI,
                              const TargetRegisterClass &RC) {
  const auto &SRI = static_cast<const SIRegisterInfo &>(TRI);
  return SRI.isSGPRClass(&RC);
}

/// Register class filter for the second allocation run: accept every class
/// the SGPR run skipped (i.e. all non-SGPR classes).
static bool onlyAllocateVGPRs(const TargetRegisterInfo &TRI,
                              const TargetRegisterClass &RC) {
  const auto &SRI = static_cast<const SIRegisterInfo &>(TRI);
  return !SRI.isSGPRClass(&RC);
}
/// -{sgpr|vgpr}-regalloc=... command line option.
// Sentinel factory: create{S,V}GPRAllocPass compares the registry default
// against this address to detect whether the user overrode the allocator.
static FunctionPass *useDefaultRegisterAllocator() { return nullptr; }
/// A dummy default pass factory indicates whether the register allocator is
/// overridden on the command line.
static llvm::once_flag InitializeDefaultSGPRRegisterAllocatorFlag;
static llvm::once_flag InitializeDefaultVGPRRegisterAllocatorFlag;
// "default" registry entry so -sgpr-regalloc=default defers to the -O level.
static SGPRRegisterRegAlloc
defaultSGPRRegAlloc("default",
"pick SGPR register allocator based on -O option",
useDefaultRegisterAllocator);
// The split options replacing the generic -regalloc flag for amdgcn.
static cl::opt<SGPRRegisterRegAlloc::FunctionPassCtor, false,
RegisterPassParser<SGPRRegisterRegAlloc>>
SGPRRegAlloc("sgpr-regalloc", cl::Hidden, cl::init(&useDefaultRegisterAllocator),
cl::desc("Register allocator to use for SGPRs"));
static cl::opt<VGPRRegisterRegAlloc::FunctionPassCtor, false,
RegisterPassParser<VGPRRegisterRegAlloc>>
VGPRRegAlloc("vgpr-regalloc", cl::Hidden, cl::init(&useDefaultRegisterAllocator),
cl::desc("Register allocator to use for VGPRs"));
/// Install the parsed -sgpr-regalloc value as the registry default, exactly
/// once, unless a default has already been set.
// The original kept the getDefault() result in a local and dead-stored the
// option value into it; the local is removed.
static void initializeDefaultSGPRRegisterAllocatorOnce() {
  if (!SGPRRegisterRegAlloc::getDefault())
    SGPRRegisterRegAlloc::setDefault(SGPRRegAlloc);
}

/// Install the parsed -vgpr-regalloc value as the registry default, exactly
/// once, unless a default has already been set.
static void initializeDefaultVGPRRegisterAllocatorOnce() {
  if (!VGPRRegisterRegAlloc::getDefault())
    VGPRRegisterRegAlloc::setDefault(VGPRRegAlloc);
}
// Thin factory wrappers binding each generic allocator to one of the two
// register class filters; these are what the allocator registries expose.
static FunctionPass *createBasicSGPRRegisterAllocator() {
return createBasicRegisterAllocator(onlyAllocateSGPRs);
}
static FunctionPass *createGreedySGPRRegisterAllocator() {
return createGreedyRegisterAllocator(onlyAllocateSGPRs);
}
// ClearVirtRegs = false: the SGPR run must leave non-SGPR virtual registers
// in place for the second allocation run.
static FunctionPass *createFastSGPRRegisterAllocator() {
return createFastRegisterAllocator(onlyAllocateSGPRs, false);
}
static FunctionPass *createBasicVGPRRegisterAllocator() {
return createBasicRegisterAllocator(onlyAllocateVGPRs);
}
static FunctionPass *createGreedyVGPRRegisterAllocator() {
return createGreedyRegisterAllocator(onlyAllocateVGPRs);
}
// ClearVirtRegs = true: the VGPR run is the final one, so no virtual
// registers may survive it.
static FunctionPass *createFastVGPRRegisterAllocator() {
return createFastRegisterAllocator(onlyAllocateVGPRs, true);
}
// Register the three allocator variants under both option namespaces, using
// the same names ("basic", "greedy", "fast") as the generic -regalloc flag.
static SGPRRegisterRegAlloc basicRegAllocSGPR(
"basic", "basic register allocator", createBasicSGPRRegisterAllocator);
static SGPRRegisterRegAlloc greedyRegAllocSGPR(
"greedy", "greedy register allocator", createGreedySGPRRegisterAllocator);
static SGPRRegisterRegAlloc fastRegAllocSGPR(
"fast", "fast register allocator", createFastSGPRRegisterAllocator);
static VGPRRegisterRegAlloc basicRegAllocVGPR(
"basic", "basic register allocator", createBasicVGPRRegisterAllocator);
static VGPRRegisterRegAlloc greedyRegAllocVGPR(
"greedy", "greedy register allocator", createGreedyVGPRRegisterAllocator);
static VGPRRegisterRegAlloc fastRegAllocVGPR(
"fast", "fast register allocator", createFastVGPRRegisterAllocator);
} // end anonymous namespace
static cl::opt<bool> EnableR600StructurizeCFG(
"r600-ir-structurize",
cl::desc("Use StructurizeCFG IR pass"),
@ -852,6 +963,14 @@ public:
bool addGlobalInstructionSelect() override;
void addFastRegAlloc() override;
void addOptimizedRegAlloc() override;
FunctionPass *createSGPRAllocPass(bool Optimized);
FunctionPass *createVGPRAllocPass(bool Optimized);
FunctionPass *createRegAllocPass(bool Optimized) override;
bool addRegAssignAndRewriteFast() override;
bool addRegAssignAndRewriteOptimized() override;
void addPreRegAlloc() override;
bool addPreRewrite() override;
void addPostRegAlloc() override;
@ -1237,14 +1356,84 @@ bool GCNPassConfig::addPreRewrite() {
return true;
}
/// Create the allocator pass for the first (SGPR) run. Honors an explicit
/// -sgpr-regalloc choice; otherwise picks greedy or fast based on
/// \p Optimized.
FunctionPass *GCNPassConfig::createSGPRAllocPass(bool Optimized) {
  // Initialize the global default.
  llvm::call_once(InitializeDefaultSGPRRegisterAllocatorFlag,
                  initializeDefaultSGPRRegisterAllocatorOnce);

  // Anything other than the sentinel means the allocator was picked on the
  // command line.
  RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();
  if (Ctor != useDefaultRegisterAllocator)
    return Ctor();

  // Use the named factory helpers for consistency with createVGPRAllocPass;
  // the fast variant passes ClearVirtRegs = false so non-SGPR virtual
  // registers survive for the second run.
  if (Optimized)
    return createGreedySGPRRegisterAllocator();

  return createFastSGPRRegisterAllocator();
}
/// Create the allocator pass for the second (VGPR) run. Honors an explicit
/// -vgpr-regalloc choice; otherwise picks greedy or fast based on
/// \p Optimized.
FunctionPass *GCNPassConfig::createVGPRAllocPass(bool Optimized) {
// Initialize the global default.
llvm::call_once(InitializeDefaultVGPRRegisterAllocatorFlag,
initializeDefaultVGPRRegisterAllocatorOnce);
// Anything other than the sentinel means the allocator was picked on the
// command line.
RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault();
if (Ctor != useDefaultRegisterAllocator)
return Ctor();
// The fast VGPR variant clears virtual registers (ClearVirtRegs = true)
// since this is the final allocation run.
if (Optimized)
return createGreedyVGPRRegisterAllocator();
return createFastVGPRRegisterAllocator();
}
// The generic single-run allocator entry point must never be reached on
// amdgcn: addRegAssignAndRewrite{Fast,Optimized} below build the split
// SGPR/VGPR pipeline instead.
FunctionPass *GCNPassConfig::createRegAllocPass(bool Optimized) {
llvm_unreachable("should not be used");
}
// Error text emitted when the generic -regalloc option is used with amdgcn.
static const char RegAllocOptNotSupportedMessage[] =
"-regalloc not supported with amdgcn. Use -sgpr-regalloc and -vgpr-regalloc";
/// Unoptimized pipeline: fast SGPR allocation, SGPR spill lowering, then fast
/// VGPR allocation.
bool GCNPassConfig::addRegAssignAndRewriteFast() {
// The single -regalloc option cannot configure the split SGPR/VGPR runs.
if (!usingDefaultRegAlloc())
report_fatal_error(RegAllocOptNotSupportedMessage);
addPass(createSGPRAllocPass(false));
// Equivalent of PEI for SGPRs.
addPass(&SILowerSGPRSpillsID);
addPass(createVGPRAllocPass(false));
return true;
}
/// Optimized pipeline: greedy SGPR allocation, commit via the rewriter, SGPR
/// spill lowering, then greedy VGPR allocation and the final rewrite.
bool GCNPassConfig::addRegAssignAndRewriteOptimized() {
// The single -regalloc option cannot configure the split SGPR/VGPR runs.
if (!usingDefaultRegAlloc())
report_fatal_error(RegAllocOptNotSupportedMessage);
addPass(createSGPRAllocPass(true));
// Commit allocated register changes. This is mostly necessary because too
// many things rely on the use lists of the physical registers, such as the
// verifier. This is only necessary with allocators which use LiveIntervals,
// since FastRegAlloc does the replacements itself.
addPass(createVirtRegRewriter(false));
// Equivalent of PEI for SGPRs.
addPass(&SILowerSGPRSpillsID);
addPass(createVGPRAllocPass(true));
addPreRewrite();
addPass(&VirtRegRewriterID);
return true;
}
void GCNPassConfig::addPostRegAlloc() {
addPass(&SIFixVGPRCopiesID);
if (getOptLevel() > CodeGenOpt::None)
addPass(&SIOptimizeExecMaskingID);
TargetPassConfig::addPostRegAlloc();
// NOTE(review): SILowerSGPRSpills is already scheduled between the SGPR and
// VGPR runs in addRegAssignAndRewrite{Fast,Optimized}; confirm the addition
// below is intentional and not a leftover deleted line from the diff scrape.
// Equivalent of PEI for SGPRs.
addPass(&SILowerSGPRSpillsID);
}
void GCNPassConfig::addPreSched2() {

View File

@ -20,6 +20,12 @@ using namespace llvm;
#define DEBUG_TYPE "frame-info"
static cl::opt<bool> EnableSpillVGPRToAGPR(
"amdgpu-spill-vgpr-to-agpr",
cl::desc("Enable spilling VGPRs to AGPRs"),
cl::ReallyHidden,
cl::init(true));
// Find a scratch register that we can use in the prologue. We avoid using
// callee-save registers since they may appear to be free when this is called
// from canUseAsPrologue (during shrink wrapping), but then no longer be free
@ -1127,9 +1133,73 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized(
MachineFrameInfo &MFI = MF.getFrameInfo();
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
const SIRegisterInfo *TRI = ST.getRegisterInfo();
MachineRegisterInfo &MRI = MF.getRegInfo();
SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
const bool SpillVGPRToAGPR = ST.hasMAIInsts() && FuncInfo->hasSpilledVGPRs()
&& EnableSpillVGPRToAGPR;
if (SpillVGPRToAGPR) {
// To track the spill frame indices handled in this pass.
BitVector SpillFIs(MFI.getObjectIndexEnd(), false);
bool SeenDbgInstr = false;
for (MachineBasicBlock &MBB : MF) {
MachineBasicBlock::iterator Next;
for (auto I = MBB.begin(), E = MBB.end(); I != E; I = Next) {
MachineInstr &MI = *I;
Next = std::next(I);
if (MI.isDebugInstr())
SeenDbgInstr = true;
if (TII->isVGPRSpill(MI)) {
// Try to eliminate stack used by VGPR spills before frame
// finalization.
unsigned FIOp = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
AMDGPU::OpName::vaddr);
int FI = MI.getOperand(FIOp).getIndex();
Register VReg =
TII->getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
if (FuncInfo->allocateVGPRSpillToAGPR(MF, FI,
TRI->isAGPR(MRI, VReg))) {
// FIXME: change to enterBasicBlockEnd()
RS->enterBasicBlock(MBB);
TRI->eliminateFrameIndex(MI, 0, FIOp, RS);
SpillFIs.set(FI);
continue;
}
}
}
}
for (MachineBasicBlock &MBB : MF) {
for (MCPhysReg Reg : FuncInfo->getVGPRSpillAGPRs())
MBB.addLiveIn(Reg);
for (MCPhysReg Reg : FuncInfo->getAGPRSpillVGPRs())
MBB.addLiveIn(Reg);
MBB.sortUniqueLiveIns();
if (!SpillFIs.empty() && SeenDbgInstr) {
// FIXME: The dead frame indices are replaced with a null register from
// the debug value instructions. We should instead, update it with the
// correct register value. But not sure the register value alone is
for (MachineInstr &MI : MBB) {
if (MI.isDebugValue() && MI.getOperand(0).isFI() &&
SpillFIs[MI.getOperand(0).getIndex()]) {
MI.getOperand(0).ChangeToRegister(Register(), false /*isDef*/);
MI.getOperand(0).setIsDebug();
}
}
}
}
}
FuncInfo->removeDeadFrameIndices(MFI);
assert(allSGPRSpillsAreDead(MF) &&
"SGPR spill should have been removed in SILowerSGPRSpills");

View File

@ -31,12 +31,6 @@ using MBBVector = SmallVector<MachineBasicBlock *, 4>;
namespace {
static cl::opt<bool> EnableSpillVGPRToAGPR(
"amdgpu-spill-vgpr-to-agpr",
cl::desc("Enable spilling VGPRs to AGPRs"),
cl::ReallyHidden,
cl::init(true));
class SILowerSGPRSpills : public MachineFunctionPass {
private:
const SIRegisterInfo *TRI = nullptr;
@ -71,6 +65,7 @@ char SILowerSGPRSpills::ID = 0;
INITIALIZE_PASS_BEGIN(SILowerSGPRSpills, DEBUG_TYPE,
"SI lower SGPR spill instructions", false, false)
INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
INITIALIZE_PASS_DEPENDENCY(VirtRegMap)
INITIALIZE_PASS_END(SILowerSGPRSpills, DEBUG_TYPE,
"SI lower SGPR spill instructions", false, false)
@ -295,6 +290,7 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {
TRI = &TII->getRegisterInfo();
VRM = getAnalysisIfAvailable<VirtRegMap>();
LIS = getAnalysisIfAvailable<LiveIntervals>();
assert(SaveBlocks.empty() && RestoreBlocks.empty());
@ -318,21 +314,14 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {
return false;
}
const bool SpillVGPRToAGPR = ST.hasMAIInsts() && FuncInfo->hasSpilledVGPRs()
&& EnableSpillVGPRToAGPR;
bool MadeChange = false;
const bool SpillToAGPR = EnableSpillVGPRToAGPR && ST.hasMAIInsts();
std::unique_ptr<RegScavenger> RS;
bool NewReservedRegs = false;
// TODO: CSR VGPRs will never be spilled to AGPRs. These can probably be
// handled as SpilledToReg in regular PrologEpilogInserter.
const bool HasSGPRSpillToVGPR = TRI->spillSGPRToVGPR() &&
(HasCSRs || FuncInfo->hasSpilledSGPRs());
if (HasSGPRSpillToVGPR || SpillVGPRToAGPR) {
if (HasSGPRSpillToVGPR) {
// Process all SGPR spills before frame offsets are finalized. Ideally SGPRs
// are spilled to VGPRs, in which case we can eliminate the stack usage.
//
@ -350,36 +339,15 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {
MachineInstr &MI = *I;
Next = std::next(I);
if (SpillToAGPR && TII->isVGPRSpill(MI)) {
// Try to eliminate stack used by VGPR spills before frame
// finalization.
unsigned FIOp = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
AMDGPU::OpName::vaddr);
int FI = MI.getOperand(FIOp).getIndex();
Register VReg =
TII->getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
if (FuncInfo->allocateVGPRSpillToAGPR(MF, FI,
TRI->isAGPR(MRI, VReg))) {
NewReservedRegs = true;
if (!RS)
RS.reset(new RegScavenger());
// FIXME: change to enterBasicBlockEnd()
RS->enterBasicBlock(MBB);
TRI->eliminateFrameIndex(MI, 0, FIOp, RS.get());
SpillFIs.set(FI);
continue;
}
}
if (!TII->isSGPRSpill(MI) || !TRI->spillSGPRToVGPR())
if (!TII->isSGPRSpill(MI))
continue;
int FI = TII->getNamedOperand(MI, AMDGPU::OpName::addr)->getIndex();
assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
if (FuncInfo->allocateSGPRSpillToVGPR(MF, FI)) {
NewReservedRegs = true;
bool Spilled = TRI->eliminateSGPRToVGPRSpillFrameIndex(MI, FI, nullptr);
bool Spilled = TRI->eliminateSGPRToVGPRSpillFrameIndex(MI, FI,
nullptr, LIS);
(void)Spilled;
assert(Spilled && "failed to spill SGPR to VGPR when allocated");
SpillFIs.set(FI);
@ -387,16 +355,10 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {
}
}
// FIXME: Adding to live-ins redundant with reserving registers.
for (MachineBasicBlock &MBB : MF) {
for (auto SSpill : FuncInfo->getSGPRSpillVGPRs())
MBB.addLiveIn(SSpill.VGPR);
for (MCPhysReg Reg : FuncInfo->getVGPRSpillAGPRs())
MBB.addLiveIn(Reg);
for (MCPhysReg Reg : FuncInfo->getAGPRSpillVGPRs())
MBB.addLiveIn(Reg);
MBB.sortUniqueLiveIns();
// FIXME: The dead frame indices are replaced with a null register from

View File

@ -8,7 +8,22 @@
#include "SIMachineFunctionInfo.h"
#include "AMDGPUTargetMachine.h"
#include "AMDGPUSubtarget.h"
#include "SIRegisterInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/Optional.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/MIRParser/MIParser.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h"
#include <cassert>
#include <vector>
#define MAX_LANES 64
@ -315,6 +330,13 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF,
// partially spill the SGPR to VGPRs.
SGPRToVGPRSpills.erase(FI);
NumVGPRSpillLanes -= I;
#if 0
DiagnosticInfoResourceLimit DiagOutOfRegs(MF.getFunction(),
"VGPRs for SGPR spilling",
0, DS_Error);
MF.getFunction().getContext().diagnose(DiagOutOfRegs);
#endif
return false;
}

View File

@ -572,6 +572,17 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
reserveRegisterTuples(Reserved, Reg.first);
}
// Reserve VGPRs used for SGPR spilling.
// Note we treat freezeReservedRegs unusually because we run register
// allocation in two phases. It's OK to re-freeze with new registers for the
// second run.
#if 0
for (auto &SpilledFI : MFI->sgpr_spill_vgprs()) {
for (auto &SpilledVGPR : SpilledFI.second)
reserveRegisterTuples(Reserved, SpilledVGPR.VGPR);
}
#endif
// FIXME: Stop using reserved registers for this.
for (MCPhysReg Reg : MFI->getAGPRSpillVGPRs())
reserveRegisterTuples(Reserved, Reg);
@ -1311,6 +1322,7 @@ void SIRegisterInfo::buildVGPRSpillLoadStore(SGPRSpillBuilder &SB, int Index,
bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
int Index,
RegScavenger *RS,
LiveIntervals *LIS,
bool OnlyToVGPR) const {
SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS);
@ -1340,6 +1352,12 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
.addReg(SubReg, getKillRegState(UseKill))
.addImm(Spill.Lane)
.addReg(Spill.VGPR);
if (LIS) {
if (i == 0)
LIS->ReplaceMachineInstrInMaps(*MI, *MIB);
else
LIS->InsertMachineInstrInMaps(*MIB);
}
if (i == 0 && SB.NumSubRegs > 1) {
// We may be spilling a super-register which is only partially defined,
@ -1383,6 +1401,13 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
.addReg(SB.TmpVGPR, TmpVGPRFlags);
TmpVGPRFlags = 0;
if (LIS) {
if (i == 0)
LIS->ReplaceMachineInstrInMaps(*MI, *WriteLane);
else
LIS->InsertMachineInstrInMaps(*WriteLane);
}
// There could be undef components of a spilled super register.
// TODO: Can we detect this and skip the spill?
if (SB.NumSubRegs > 1) {
@ -1403,12 +1428,17 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
MI->eraseFromParent();
SB.MFI.addToSpilledSGPRs(SB.NumSubRegs);
if (LIS)
LIS->removeAllRegUnitsForPhysReg(SB.SuperReg);
return true;
}
bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
int Index,
RegScavenger *RS,
LiveIntervals *LIS,
bool OnlyToVGPR) const {
SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS);
@ -1432,6 +1462,13 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
.addImm(Spill.Lane);
if (SB.NumSubRegs > 1 && i == 0)
MIB.addReg(SB.SuperReg, RegState::ImplicitDefine);
if (LIS) {
if (i == e - 1)
LIS->ReplaceMachineInstrInMaps(*MI, *MIB);
else
LIS->InsertMachineInstrInMaps(*MIB);
}
}
} else {
SB.prepare();
@ -1459,6 +1496,12 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
.addImm(i);
if (SB.NumSubRegs > 1 && i == 0)
MIB.addReg(SB.SuperReg, RegState::ImplicitDefine);
if (LIS) {
if (i == e - 1)
LIS->ReplaceMachineInstrInMaps(*MI, *MIB);
else
LIS->InsertMachineInstrInMaps(*MIB);
}
}
}
@ -1466,6 +1509,10 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
}
MI->eraseFromParent();
if (LIS)
LIS->removeAllRegUnitsForPhysReg(SB.SuperReg);
return true;
}
@ -1475,7 +1522,8 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex(
MachineBasicBlock::iterator MI,
int FI,
RegScavenger *RS) const {
RegScavenger *RS,
LiveIntervals *LIS) const {
switch (MI->getOpcode()) {
case AMDGPU::SI_SPILL_S1024_SAVE:
case AMDGPU::SI_SPILL_S512_SAVE:
@ -1487,7 +1535,7 @@ bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex(
case AMDGPU::SI_SPILL_S96_SAVE:
case AMDGPU::SI_SPILL_S64_SAVE:
case AMDGPU::SI_SPILL_S32_SAVE:
return spillSGPR(MI, FI, RS, true);
return spillSGPR(MI, FI, RS, LIS, true);
case AMDGPU::SI_SPILL_S1024_RESTORE:
case AMDGPU::SI_SPILL_S512_RESTORE:
case AMDGPU::SI_SPILL_S256_RESTORE:
@ -1498,7 +1546,7 @@ bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex(
case AMDGPU::SI_SPILL_S96_RESTORE:
case AMDGPU::SI_SPILL_S64_RESTORE:
case AMDGPU::SI_SPILL_S32_RESTORE:
return restoreSGPR(MI, FI, RS, true);
return restoreSGPR(MI, FI, RS, LIS, true);
default:
llvm_unreachable("not an SGPR spill instruction");
}

View File

@ -114,10 +114,12 @@ public:
/// If \p OnlyToVGPR is true, this will only succeed if this
bool spillSGPR(MachineBasicBlock::iterator MI,
int FI, RegScavenger *RS,
LiveIntervals *LIS = nullptr,
bool OnlyToVGPR = false) const;
bool restoreSGPR(MachineBasicBlock::iterator MI,
int FI, RegScavenger *RS,
LiveIntervals *LIS = nullptr,
bool OnlyToVGPR = false) const;
void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj,
@ -125,7 +127,8 @@ public:
RegScavenger *RS) const override;
bool eliminateSGPRToVGPRSpillFrameIndex(MachineBasicBlock::iterator MI,
int FI, RegScavenger *RS) const;
int FI, RegScavenger *RS,
LiveIntervals *LIS = nullptr) const;
StringRef getRegAsmName(MCRegister Reg) const override;

View File

@ -12,20 +12,22 @@ define i32 @v_extract_v64i32_varidx(<64 x i32> addrspace(1)* %ptr, i32 %idx) {
; GCN-NEXT: s_add_i32 s33, s32, 0x3fc0
; GCN-NEXT: s_and_b32 s33, s33, 0xffffc000
; GCN-NEXT: v_add_co_u32_e32 v3, vcc, 64, v0
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s33 ; 4-byte Folded Spill
; GCN-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v1, vcc
; GCN-NEXT: s_movk_i32 s4, 0x80
; GCN-NEXT: global_load_dwordx4 v[8:11], v[3:4], off offset:16
@ -44,8 +46,8 @@ define i32 @v_extract_v64i32_varidx(<64 x i32> addrspace(1)* %ptr, i32 %idx) {
; GCN-NEXT: global_load_dwordx4 v[36:39], v[0:1], off offset:128
; GCN-NEXT: global_load_dwordx4 v[40:43], v[0:1], off offset:192
; GCN-NEXT: global_load_dwordx4 v[44:47], v[3:4], off offset:16
; GCN-NEXT: global_load_dwordx4 v[48:51], v[3:4], off offset:32
; GCN-NEXT: global_load_dwordx4 v[52:55], v[3:4], off offset:48
; GCN-NEXT: global_load_dwordx4 v[52:55], v[3:4], off offset:32
; GCN-NEXT: global_load_dwordx4 v[48:51], v[3:4], off offset:48
; GCN-NEXT: s_movk_i32 s4, 0xc0
; GCN-NEXT: v_mov_b32_e32 v6, s5
; GCN-NEXT: v_mov_b32_e32 v5, s4
@ -59,44 +61,25 @@ define i32 @v_extract_v64i32_varidx(<64 x i32> addrspace(1)* %ptr, i32 %idx) {
; GCN-NEXT: s_add_i32 s32, s32, 0x10000
; GCN-NEXT: s_add_i32 s32, s32, 0xffff0000
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:640 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:644 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:648 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:652 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:656 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:660 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:664 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:668 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:672 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:676 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:680 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:684 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:688 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:692 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:696 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:700 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill
; GCN-NEXT: global_load_dwordx4 v[4:7], v[60:61], off offset:16
; GCN-NEXT: global_load_dwordx4 v[52:55], v[60:61], off offset:32
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill
; GCN-NEXT: global_load_dwordx4 v[52:55], v[60:61], off offset:48
; GCN-NEXT: global_load_dwordx4 v[48:51], v[60:61], off offset:32
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt vmcnt(0)
@ -115,6 +98,7 @@ define i32 @v_extract_v64i32_varidx(<64 x i32> addrspace(1)* %ptr, i32 %idx) {
; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill
; GCN-NEXT: global_load_dwordx4 v[60:63], v[60:61], off offset:48
; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:256
; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:260
; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:264
@ -155,26 +139,26 @@ define i32 @v_extract_v64i32_varidx(<64 x i32> addrspace(1)* %ptr, i32 %idx) {
; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:404
; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:408
; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:412
; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:416
; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:420
; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:424
; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:428
; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:640 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:644 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:648 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:652 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:656 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:660 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:664 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:668 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:672 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:676 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:680 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:684 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:688 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:692 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:696 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s33 offset:700 ; 4-byte Folded Reload
; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:416
; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:420
; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:424
; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:428
; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:588 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:592 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:596 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:600 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:604 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:608 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:612 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:616 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:620 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:624 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:628 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v12, v20
; GCN-NEXT: v_mov_b32_e32 v13, v21
@ -192,31 +176,6 @@ define i32 @v_extract_v64i32_varidx(<64 x i32> addrspace(1)* %ptr, i32 %idx) {
; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:468
; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:472
; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:476
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:588 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:592 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:596 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:600 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:604 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:608 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:612 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:616 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:620 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:624 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:628 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v8, v11
; GCN-NEXT: v_mov_b32_e32 v9, v12
; GCN-NEXT: v_mov_b32_e32 v10, v13
; GCN-NEXT: v_mov_b32_e32 v11, v14
; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:480
; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:484
; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:488
; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:492
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload
@ -234,29 +193,35 @@ define i32 @v_extract_v64i32_varidx(<64 x i32> addrspace(1)* %ptr, i32 %idx) {
; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v12, v15
; GCN-NEXT: v_mov_b32_e32 v13, v16
; GCN-NEXT: v_mov_b32_e32 v14, v17
; GCN-NEXT: v_mov_b32_e32 v15, v18
; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:496
; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:500
; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:504
; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:508
; GCN-NEXT: v_mov_b32_e32 v8, v11
; GCN-NEXT: v_mov_b32_e32 v9, v12
; GCN-NEXT: v_mov_b32_e32 v10, v13
; GCN-NEXT: v_mov_b32_e32 v11, v14
; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:480
; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:484
; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:488
; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:492
; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:496
; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:500
; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:504
; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s33 offset:508
; GCN-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s33 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s33 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b32 s33, s6
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
@ -273,20 +238,22 @@ define i16 @v_extract_v128i16_varidx(<128 x i16> addrspace(1)* %ptr, i32 %idx) {
; GCN-NEXT: s_add_i32 s33, s32, 0x3fc0
; GCN-NEXT: s_and_b32 s33, s33, 0xffffc000
; GCN-NEXT: v_add_co_u32_e32 v3, vcc, 64, v0
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s33 ; 4-byte Folded Spill
; GCN-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v1, vcc
; GCN-NEXT: s_movk_i32 s4, 0x80
; GCN-NEXT: global_load_dwordx4 v[8:11], v[3:4], off offset:16
@ -305,8 +272,8 @@ define i16 @v_extract_v128i16_varidx(<128 x i16> addrspace(1)* %ptr, i32 %idx) {
; GCN-NEXT: global_load_dwordx4 v[36:39], v[0:1], off offset:128
; GCN-NEXT: global_load_dwordx4 v[40:43], v[0:1], off offset:192
; GCN-NEXT: global_load_dwordx4 v[44:47], v[3:4], off offset:16
; GCN-NEXT: global_load_dwordx4 v[48:51], v[3:4], off offset:32
; GCN-NEXT: global_load_dwordx4 v[52:55], v[3:4], off offset:48
; GCN-NEXT: global_load_dwordx4 v[52:55], v[3:4], off offset:32
; GCN-NEXT: global_load_dwordx4 v[48:51], v[3:4], off offset:48
; GCN-NEXT: s_movk_i32 s4, 0xc0
; GCN-NEXT: v_mov_b32_e32 v6, s5
; GCN-NEXT: v_mov_b32_e32 v5, s4
@ -319,44 +286,25 @@ define i16 @v_extract_v128i16_varidx(<128 x i16> addrspace(1)* %ptr, i32 %idx) {
; GCN-NEXT: s_add_i32 s32, s32, 0x10000
; GCN-NEXT: s_add_i32 s32, s32, 0xffff0000
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:640 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:644 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:648 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:652 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:656 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:660 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:664 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:668 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:672 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:676 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:680 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:684 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:688 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:692 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:696 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:700 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill
; GCN-NEXT: global_load_dwordx4 v[4:7], v[60:61], off offset:16
; GCN-NEXT: global_load_dwordx4 v[52:55], v[60:61], off offset:32
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill
; GCN-NEXT: global_load_dwordx4 v[52:55], v[60:61], off offset:48
; GCN-NEXT: global_load_dwordx4 v[48:51], v[60:61], off offset:32
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt vmcnt(0)
@ -375,6 +323,7 @@ define i16 @v_extract_v128i16_varidx(<128 x i16> addrspace(1)* %ptr, i32 %idx) {
; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill
; GCN-NEXT: global_load_dwordx4 v[60:63], v[60:61], off offset:48
; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:256
; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:260
; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:264
@ -415,26 +364,26 @@ define i16 @v_extract_v128i16_varidx(<128 x i16> addrspace(1)* %ptr, i32 %idx) {
; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:404
; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:408
; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:412
; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:416
; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:420
; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:424
; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:428
; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:640 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:644 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:648 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:652 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:656 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:660 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:664 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:668 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:672 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:676 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:680 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:684 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:688 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:692 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:696 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s33 offset:700 ; 4-byte Folded Reload
; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:416
; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:420
; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:424
; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:428
; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:588 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:592 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:596 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:600 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:604 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:608 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:612 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:616 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:620 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:624 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:628 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v12, v20
; GCN-NEXT: v_mov_b32_e32 v13, v21
@ -452,31 +401,6 @@ define i16 @v_extract_v128i16_varidx(<128 x i16> addrspace(1)* %ptr, i32 %idx) {
; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:468
; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:472
; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:476
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:588 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:592 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:596 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:600 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:604 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:608 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:612 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:616 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:620 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:624 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:628 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v8, v11
; GCN-NEXT: v_mov_b32_e32 v9, v12
; GCN-NEXT: v_mov_b32_e32 v10, v13
; GCN-NEXT: v_mov_b32_e32 v11, v14
; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:480
; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:484
; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:488
; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:492
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload
@ -494,34 +418,40 @@ define i16 @v_extract_v128i16_varidx(<128 x i16> addrspace(1)* %ptr, i32 %idx) {
; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v12, v15
; GCN-NEXT: v_mov_b32_e32 v13, v16
; GCN-NEXT: v_mov_b32_e32 v14, v17
; GCN-NEXT: v_mov_b32_e32 v15, v18
; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:496
; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:500
; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:504
; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:508
; GCN-NEXT: v_lshrrev_b32_e64 v15, 6, s33
; GCN-NEXT: v_add_u32_e32 v15, 0x100, v15
; GCN-NEXT: v_add_u32_e32 v0, v15, v0
; GCN-NEXT: v_mov_b32_e32 v8, v11
; GCN-NEXT: v_mov_b32_e32 v9, v12
; GCN-NEXT: v_mov_b32_e32 v10, v13
; GCN-NEXT: v_mov_b32_e32 v11, v14
; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:480
; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:484
; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:488
; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:492
; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:496
; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:500
; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:504
; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s33 offset:508
; GCN-NEXT: v_lshrrev_b32_e64 v11, 6, s33
; GCN-NEXT: v_add_u32_e32 v11, 0x100, v11
; GCN-NEXT: v_add_u32_e32 v0, v11, v0
; GCN-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s33 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s33 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b32 s33, s6
; GCN-NEXT: s_waitcnt vmcnt(14)
; GCN-NEXT: s_waitcnt vmcnt(16)
; GCN-NEXT: v_lshrrev_b32_e32 v0, v1, v0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
@ -538,20 +468,22 @@ define i64 @v_extract_v32i64_varidx(<32 x i64> addrspace(1)* %ptr, i32 %idx) {
; GCN-NEXT: s_add_i32 s33, s32, 0x3fc0
; GCN-NEXT: s_and_b32 s33, s33, 0xffffc000
; GCN-NEXT: v_add_co_u32_e32 v3, vcc, 64, v0
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s33 ; 4-byte Folded Spill
; GCN-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v1, vcc
; GCN-NEXT: s_movk_i32 s4, 0x80
; GCN-NEXT: global_load_dwordx4 v[8:11], v[3:4], off offset:16
@ -570,8 +502,8 @@ define i64 @v_extract_v32i64_varidx(<32 x i64> addrspace(1)* %ptr, i32 %idx) {
; GCN-NEXT: global_load_dwordx4 v[36:39], v[0:1], off offset:128
; GCN-NEXT: global_load_dwordx4 v[40:43], v[0:1], off offset:192
; GCN-NEXT: global_load_dwordx4 v[44:47], v[3:4], off offset:16
; GCN-NEXT: global_load_dwordx4 v[48:51], v[3:4], off offset:32
; GCN-NEXT: global_load_dwordx4 v[52:55], v[3:4], off offset:48
; GCN-NEXT: global_load_dwordx4 v[52:55], v[3:4], off offset:32
; GCN-NEXT: global_load_dwordx4 v[48:51], v[3:4], off offset:48
; GCN-NEXT: s_movk_i32 s4, 0xc0
; GCN-NEXT: v_mov_b32_e32 v6, s5
; GCN-NEXT: v_mov_b32_e32 v5, s4
@ -585,44 +517,25 @@ define i64 @v_extract_v32i64_varidx(<32 x i64> addrspace(1)* %ptr, i32 %idx) {
; GCN-NEXT: s_add_i32 s32, s32, 0x10000
; GCN-NEXT: s_add_i32 s32, s32, 0xffff0000
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:640 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:644 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:648 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:652 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:656 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:660 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:664 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:668 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:672 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:676 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:680 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:684 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:688 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:692 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:696 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:700 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill
; GCN-NEXT: global_load_dwordx4 v[4:7], v[60:61], off offset:16
; GCN-NEXT: global_load_dwordx4 v[52:55], v[60:61], off offset:32
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill
; GCN-NEXT: global_load_dwordx4 v[52:55], v[60:61], off offset:48
; GCN-NEXT: global_load_dwordx4 v[48:51], v[60:61], off offset:32
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt vmcnt(0)
@ -641,6 +554,7 @@ define i64 @v_extract_v32i64_varidx(<32 x i64> addrspace(1)* %ptr, i32 %idx) {
; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill
; GCN-NEXT: global_load_dwordx4 v[60:63], v[60:61], off offset:48
; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:256
; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:260
; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:264
@ -681,26 +595,26 @@ define i64 @v_extract_v32i64_varidx(<32 x i64> addrspace(1)* %ptr, i32 %idx) {
; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:404
; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:408
; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:412
; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:416
; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:420
; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:424
; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:428
; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:640 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:644 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:648 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:652 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:656 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:660 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:664 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:668 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:672 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:676 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:680 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:684 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:688 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:692 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:696 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s33 offset:700 ; 4-byte Folded Reload
; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:416
; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:420
; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:424
; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:428
; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:588 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:592 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:596 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:600 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:604 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:608 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:612 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:616 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:620 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:624 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:628 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v12, v20
; GCN-NEXT: v_mov_b32_e32 v13, v21
@ -718,31 +632,6 @@ define i64 @v_extract_v32i64_varidx(<32 x i64> addrspace(1)* %ptr, i32 %idx) {
; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:468
; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:472
; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:476
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:588 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:592 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:596 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:600 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:604 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:608 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:612 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:616 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:620 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:624 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:628 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v8, v11
; GCN-NEXT: v_mov_b32_e32 v9, v12
; GCN-NEXT: v_mov_b32_e32 v10, v13
; GCN-NEXT: v_mov_b32_e32 v11, v14
; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:480
; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:484
; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:488
; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:492
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload
@ -760,30 +649,36 @@ define i64 @v_extract_v32i64_varidx(<32 x i64> addrspace(1)* %ptr, i32 %idx) {
; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v12, v15
; GCN-NEXT: v_mov_b32_e32 v13, v16
; GCN-NEXT: v_mov_b32_e32 v14, v17
; GCN-NEXT: v_mov_b32_e32 v15, v18
; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:496
; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:500
; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:504
; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:508
; GCN-NEXT: v_mov_b32_e32 v8, v11
; GCN-NEXT: v_mov_b32_e32 v9, v12
; GCN-NEXT: v_mov_b32_e32 v10, v13
; GCN-NEXT: v_mov_b32_e32 v11, v14
; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:480
; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:484
; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:488
; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:492
; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:496
; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:500
; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:504
; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s33 offset:508
; GCN-NEXT: buffer_load_dword v0, v1, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:4
; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s33 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s33 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b32 s33, s6
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]

View File

@ -35,11 +35,11 @@ define void @func_areg_32() #0 {
; GCN-LABEL: {{^}}func_areg_33:
; GCN-NOT: a32
; GFX90A: buffer_store_dword a32, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX90A: v_accvgpr_read_b32 v0, a32 ; Reload Reuse
; GCN-NOT: a32
; GCN: use agpr32
; GCN-NOT: a32
; GFX90A: buffer_load_dword a32, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX90A: v_accvgpr_write_b32 a32, v0 ; Reload Reuse
; GCN-NOT: a32
; GCN: s_setpc_b64
define void @func_areg_33() #0 {
@ -50,9 +50,9 @@ define void @func_areg_33() #0 {
; GCN-LABEL: {{^}}func_areg_64:
; GFX908-NOT: buffer_
; GCN-NOT: v_accvgpr
; GFX90A: buffer_store_dword a63,
; GFX90A: v_accvgpr_read_b32 v0, a63 ; Reload Reuse
; GCN: use agpr63
; GFX90A: buffer_load_dword a63,
; GFX90A: v_accvgpr_write_b32 a63, v0 ; Reload Reuse
; GCN-NOT: v_accvgpr
; GCN: s_setpc_b64
define void @func_areg_64() #0 {
@ -62,12 +62,13 @@ define void @func_areg_64() #0 {
; GCN-LABEL: {{^}}func_areg_31_63:
; GFX908-NOT: buffer_
; GCN-NOT: v_accvgpr
; GFX90A: buffer_store_dword a63,
; GFX908-NOT: v_accvgpr
; GFX908-NOT: buffer
; GFX90A: v_accvgpr_read_b32 v0, a63 ; Reload Reuse
; GCN: use agpr31, agpr63
; GFX90A: buffer_load_dword a63,
; GCN-NOT: buffer_
; GCN-NOT: v_accvgpr
; GFX90A: v_accvgpr_write_b32 a63, v0 ; Reload Reuse
; GFX908-NOT: v_accvgpr
; GFX908-NOT: buffer
; GCN: s_setpc_b64
define void @func_areg_31_63() #0 {
call void asm sideeffect "; use agpr31, agpr63", "~{a31},~{a63}" ()

View File

@ -1,4 +1,4 @@
# RUN: llc -march=amdgcn -mcpu=gfx908 -start-before=greedy -stop-after=virtregrewriter -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GCN,GFX908 %s
# RUN: llc -march=amdgcn -mcpu=gfx908 -start-before=greedy,0 -stop-after=virtregrewriter,1 -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GCN,GFX908 %s
---
# GCN-LABEL: name: alloc_vgpr_64

View File

@ -1,4 +1,4 @@
# RUN: llc -march=amdgcn -mcpu=gfx90a -start-before=greedy -stop-after=virtregrewriter -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GCN,GFX90A %s
# RUN: llc -march=amdgcn -mcpu=gfx90a -start-before=greedy,0 -stop-after=virtregrewriter,1 -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GCN,GFX90A %s
# Using the unaligned vector tuples are OK as long as they aren't used
# in a real instruction.

View File

@ -1,11 +1,11 @@
; -enable-misched=false makes the register usage more predictable
; -regalloc=fast just makes the test run faster
; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-function-calls=false -enable-misched=false -regalloc=fast < %s | FileCheck %s --check-prefixes=GCN,GFX9
; RUN: llc -march=amdgcn -mcpu=gfx90a -amdgpu-function-calls=false -enable-misched=false -regalloc=fast < %s | FileCheck %s --check-prefixes=GCN,GFX90A
; RUN: llc -march=amdgcn -mcpu=gfx1010 -amdgpu-function-calls=false -enable-misched=false -regalloc=fast < %s | FileCheck %s --check-prefixes=GCN,GFX10WGP-WAVE32
; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-function-calls=false -enable-misched=false -regalloc=fast < %s | FileCheck %s --check-prefixes=GCN,GFX10WGP-WAVE64
; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+cumode -amdgpu-function-calls=false -enable-misched=false -regalloc=fast < %s | FileCheck %s --check-prefixes=GCN,GFX10CU-WAVE32
; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+cumode,+wavefrontsize64 -amdgpu-function-calls=false -enable-misched=false -regalloc=fast < %s | FileCheck %s --check-prefixes=GCN,GFX10CU-WAVE64
; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-function-calls=false -enable-misched=false -sgpr-regalloc=fast -vgpr-regalloc=fast < %s | FileCheck %s --check-prefixes=GCN,GFX9
; RUN: llc -march=amdgcn -mcpu=gfx90a -amdgpu-function-calls=false -enable-misched=false -sgpr-regalloc=fast -vgpr-regalloc=fast < %s | FileCheck %s --check-prefixes=GCN,GFX90A
; RUN: llc -march=amdgcn -mcpu=gfx1010 -amdgpu-function-calls=false -enable-misched=false -sgpr-regalloc=fast -vgpr-regalloc=fast < %s | FileCheck %s --check-prefixes=GCN,GFX10WGP-WAVE32
; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-function-calls=false -enable-misched=false --sgpr-regalloc=fast -vgpr-regalloc=fast < %s | FileCheck %s --check-prefixes=GCN,GFX10WGP-WAVE64
; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+cumode -amdgpu-function-calls=false -enable-misched=false -sgpr-regalloc=fast -vgpr-regalloc=fast < %s | FileCheck %s --check-prefixes=GCN,GFX10CU-WAVE32
; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+cumode,+wavefrontsize64 -amdgpu-function-calls=false -enable-misched=false -sgpr-regalloc=fast -vgpr-regalloc=fast < %s | FileCheck %s --check-prefixes=GCN,GFX10CU-WAVE64
define internal void @use256vgprs() {
%v0 = call i32 asm sideeffect "; def $0", "=v"()

View File

@ -273,22 +273,22 @@ define void @callee_with_stack_no_fp_elim_csr_vgpr() #1 {
; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s32 offset:12 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]]
; GCN-NEXT: v_writelane_b32 v1, s33, 63
; GCN-COUNT-60: v_writelane_b32 v1
; GCN-NEXT: v_writelane_b32 v0, s33, 63
; GCN-COUNT-60: v_writelane_b32 v0
; GCN: s_mov_b32 s33, s32
; GCN-COUNT-2: v_writelane_b32 v1
; GCN-COUNT-2: v_writelane_b32 v0
; MUBUF: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
; FLATSCR: scratch_store_dword off, v41, s33 ; 4-byte Folded Spill
; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33 offset:8
; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s33 offset:8
; GCN: ;;#ASMSTART
; GCN: v_writelane_b32 v1
; GCN: v_writelane_b32 v0
; MUBUF: s_addk_i32 s32, 0x400
; MUBUF: s_addk_i32 s32, 0xfc00
; FLATSCR: s_add_i32 s32, s32, 16
; FLATSCR: s_add_i32 s32, s32, -16
; GCN-NEXT: v_readlane_b32 s33, v1, 63
; GCN-NEXT: v_readlane_b32 s33, v0, 63
; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s32 offset:12 ; 4-byte Folded Reload
@ -318,21 +318,21 @@ define void @last_lane_vgpr_for_fp_csr() #1 {
; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s32 offset:12 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]]
; GCN-COUNT-62: v_writelane_b32 v1,
; GCN-COUNT-62: v_writelane_b32 v0,
; GCN: s_mov_b32 [[FP_COPY:s[0-9]+]], s33
; GCN-NEXT: s_mov_b32 s33, s32
; GCN: v_writelane_b32 v1,
; GCN: v_writelane_b32 v0,
; MUBUF: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
; FLATSCR: scratch_store_dword off, v41, s33 ; 4-byte Folded Spill
; MUBUF: buffer_store_dword
; FLATSCR: scratch_store_dword
; GCN: ;;#ASMSTART
; GCN: v_writelane_b32 v1,
; GCN: v_writelane_b32 v0,
; MUBUF: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
; FLATSCR: scratch_load_dword v41, off, s33 ; 4-byte Folded Reload
; MUBUF: s_addk_i32 s32, 0x400
; FLATSCR: s_add_i32 s32, s32, 16
; GCN-COUNT-64: v_readlane_b32 s{{[0-9]+}}, v1
; GCN-COUNT-64: v_readlane_b32 s{{[0-9]+}}, v0
; MUBUF-NEXT: s_addk_i32 s32, 0xfc00
; FLATSCR-NEXT: s_add_i32 s32, s32, -16
; GCN-NEXT: s_mov_b32 s33, [[FP_COPY]]
@ -388,24 +388,24 @@ define void @realign_stack_no_fp_elim() #1 {
; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s32 offset:8 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]]
; GCN-NEXT: v_writelane_b32 v1, s33, 2
; GCN-NEXT: v_writelane_b32 v1, s30, 0
; GCN-NEXT: v_writelane_b32 v0, s33, 2
; GCN-NEXT: v_writelane_b32 v0, s30, 0
; GCN-NEXT: s_mov_b32 s33, s32
; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
; GCN: v_writelane_b32 v1, s31, 1
; GCN: v_writelane_b32 v0, s31, 1
; MUBUF: buffer_store_dword [[ZERO]], off, s[0:3], s33 offset:4
; FLATSCR: scratch_store_dword off, [[ZERO]], s33 offset:4
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN: ;;#ASMSTART
; MUBUF: s_addk_i32 s32, 0x300
; MUBUF-NEXT: v_readlane_b32 s4, v1, 0
; MUBUF-NEXT: v_readlane_b32 s5, v1, 1
; MUBUF-NEXT: v_readlane_b32 s4, v0, 0
; MUBUF-NEXT: v_readlane_b32 s5, v0, 1
; FLATSCR: s_add_i32 s32, s32, 12
; FLATSCR-NEXT: v_readlane_b32 s0, v1, 0
; FLATSCR-NEXT: v_readlane_b32 s1, v1, 1
; FLATSCR-NEXT: v_readlane_b32 s0, v0, 0
; FLATSCR-NEXT: v_readlane_b32 s1, v0, 1
; MUBUF-NEXT: s_addk_i32 s32, 0xfd00
; FLATSCR-NEXT: s_add_i32 s32, s32, -12
; GCN-NEXT: v_readlane_b32 s33, v1, 2
; GCN-NEXT: v_readlane_b32 s33, v0, 2
; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s32 offset:8 ; 4-byte Folded Reload

View File

@ -4225,32 +4225,32 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(i32 addrspace(1)* %o
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-NEXT: v_writelane_b32 v42, s33, 2
; GFX9-NEXT: v_writelane_b32 v40, s33, 2
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: v_writelane_b32 v42, s30, 0
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_mov_b32_e32 v40, v0
; GFX9-NEXT: v_mov_b32_e32 v41, v0
; GFX9-NEXT: v_mov_b32_e32 v0, 42
; GFX9-NEXT: s_getpc_b64 s[4:5]
; GFX9-NEXT: s_add_u32 s4, s4, external_i32_func_i32@rel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, external_i32_func_i32@rel32@hi+12
; GFX9-NEXT: v_writelane_b32 v42, s31, 1
; GFX9-NEXT: v_mov_b32_e32 v41, v1
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: v_mov_b32_e32 v42, v1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT: global_store_dword v[40:41], v0, off
; GFX9-NEXT: global_store_dword v[41:42], v0, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: v_readlane_b32 s4, v42, 0
; GFX9-NEXT: v_readlane_b32 s5, v42, 1
; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
; GFX9-NEXT: v_readlane_b32 s33, v42, 2
; GFX9-NEXT: v_readlane_b32 s33, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[4:5]
@ -4260,34 +4260,34 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(i32 addrspace(1)* %o
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s4, -1
; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s4
; GFX10-NEXT: v_writelane_b32 v42, s33, 2
; GFX10-NEXT: v_writelane_b32 v40, s33, 2
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT: v_mov_b32_e32 v40, v0
; GFX10-NEXT: v_writelane_b32 v42, s30, 0
; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT: v_mov_b32_e32 v41, v0
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_mov_b32_e32 v0, 42
; GFX10-NEXT: s_getpc_b64 s[4:5]
; GFX10-NEXT: s_add_u32 s4, s4, external_i32_func_i32@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s5, s5, external_i32_func_i32@rel32@hi+12
; GFX10-NEXT: v_mov_b32_e32 v41, v1
; GFX10-NEXT: v_writelane_b32 v42, s31, 1
; GFX10-NEXT: v_mov_b32_e32 v42, v1
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX10-NEXT: global_store_dword v[40:41], v0, off
; GFX10-NEXT: global_store_dword v[41:42], v0, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33
; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4
; GFX10-NEXT: v_readlane_b32 s4, v42, 0
; GFX10-NEXT: v_readlane_b32 s5, v42, 1
; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33
; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
; GFX10-NEXT: v_readlane_b32 s33, v42, 2
; GFX10-NEXT: v_readlane_b32 s33, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s6, -1
; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s6
; GFX10-NEXT: s_waitcnt vmcnt(0)
@ -4298,34 +4298,34 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(i32 addrspace(1)* %o
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v42, s32 offset:8 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 offset:8 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v42, s33, 2
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v40, v0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v42, s30, 0
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v42, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v41, v0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 42
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_i32_func_i32@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_i32_func_i32@rel32@hi+12
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v41, v1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v42, s31, 1
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v42, v1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: global_store_dword v[40:41], v0, off
; GFX10-SCRATCH-NEXT: global_store_dword v[41:42], v0, off
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_clause 0x1
; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33
; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 offset:4
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v42, 0
; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v42, 1
; GFX10-SCRATCH-NEXT: scratch_load_dword v42, off, s33
; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0
; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v42, 2
; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s2, -1
; GFX10-SCRATCH-NEXT: scratch_load_dword v42, off, s32 offset:8 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 offset:8 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s2
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)

View File

@ -184,33 +184,33 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_v31(i32 addrspace(1)
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-NEXT: v_writelane_b32 v41, s33, 2
; GFX9-NEXT: v_writelane_b32 v41, s30, 0
; GFX9-NEXT: v_writelane_b32 v40, s33, 2
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: ;;#ASMSTART
; GFX9-NEXT: ; def v31
; GFX9-NEXT: ;;#ASMEND
; GFX9-NEXT: s_getpc_b64 s[4:5]
; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
; GFX9-NEXT: v_writelane_b32 v41, s31, 1
; GFX9-NEXT: v_mov_b32_e32 v40, v31
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: v_mov_b32_e32 v41, v31
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT: v_mov_b32_e32 v31, v40
; GFX9-NEXT: v_mov_b32_e32 v31, v41
; GFX9-NEXT: ;;#ASMSTART
; GFX9-NEXT: ; use v31
; GFX9-NEXT: ;;#ASMEND
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: v_readlane_b32 s4, v41, 0
; GFX9-NEXT: v_readlane_b32 s5, v41, 1
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
; GFX9-NEXT: v_readlane_b32 s33, v41, 2
; GFX9-NEXT: v_readlane_b32 s33, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[4:5]
@ -220,34 +220,34 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_v31(i32 addrspace(1)
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s4, -1
; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s4
; GFX10-NEXT: v_writelane_b32 v41, s33, 2
; GFX10-NEXT: v_writelane_b32 v40, s33, 2
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT: ;;#ASMSTART
; GFX10-NEXT: ; def v31
; GFX10-NEXT: ;;#ASMEND
; GFX10-NEXT: v_writelane_b32 v41, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: s_getpc_b64 s[4:5]
; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
; GFX10-NEXT: v_mov_b32_e32 v40, v31
; GFX10-NEXT: v_writelane_b32 v41, s31, 1
; GFX10-NEXT: v_mov_b32_e32 v41, v31
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX10-NEXT: v_mov_b32_e32 v31, v40
; GFX10-NEXT: v_mov_b32_e32 v31, v41
; GFX10-NEXT: ;;#ASMSTART
; GFX10-NEXT: ; use v31
; GFX10-NEXT: ;;#ASMEND
; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT: v_readlane_b32 s4, v41, 0
; GFX10-NEXT: v_readlane_b32 s5, v41, 1
; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
; GFX10-NEXT: v_readlane_b32 s33, v41, 2
; GFX10-NEXT: v_readlane_b32 s33, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s6, -1
; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s6
; GFX10-NEXT: s_waitcnt vmcnt(0)
@ -762,14 +762,14 @@ define amdgpu_gfx void @callee_saved_sgpr_vgpr_kernel() #1 {
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-NEXT: v_writelane_b32 v41, s33, 3
; GFX9-NEXT: v_writelane_b32 v41, s40, 0
; GFX9-NEXT: v_writelane_b32 v41, s30, 1
; GFX9-NEXT: v_writelane_b32 v40, s33, 3
; GFX9-NEXT: v_writelane_b32 v40, s40, 0
; GFX9-NEXT: v_writelane_b32 v40, s30, 1
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: ;;#ASMSTART
; GFX9-NEXT: ; def s40
; GFX9-NEXT: ;;#ASMEND
@ -779,23 +779,23 @@ define amdgpu_gfx void @callee_saved_sgpr_vgpr_kernel() #1 {
; GFX9-NEXT: s_getpc_b64 s[4:5]
; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
; GFX9-NEXT: v_writelane_b32 v41, s31, 2
; GFX9-NEXT: v_mov_b32_e32 v40, v32
; GFX9-NEXT: v_writelane_b32 v40, s31, 2
; GFX9-NEXT: v_mov_b32_e32 v41, v32
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT: ;;#ASMSTART
; GFX9-NEXT: ; use s40
; GFX9-NEXT: ;;#ASMEND
; GFX9-NEXT: ;;#ASMSTART
; GFX9-NEXT: ; use v40
; GFX9-NEXT: ; use v41
; GFX9-NEXT: ;;#ASMEND
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: v_readlane_b32 s4, v41, 1
; GFX9-NEXT: v_readlane_b32 s5, v41, 2
; GFX9-NEXT: v_readlane_b32 s40, v41, 0
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: v_readlane_b32 s4, v40, 1
; GFX9-NEXT: v_readlane_b32 s5, v40, 2
; GFX9-NEXT: v_readlane_b32 s40, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
; GFX9-NEXT: v_readlane_b32 s33, v41, 3
; GFX9-NEXT: v_readlane_b32 s33, v40, 3
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[4:5]
@ -805,41 +805,41 @@ define amdgpu_gfx void @callee_saved_sgpr_vgpr_kernel() #1 {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s4, -1
; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s4
; GFX10-NEXT: v_writelane_b32 v41, s33, 3
; GFX10-NEXT: v_writelane_b32 v40, s33, 3
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: s_getpc_b64 s[4:5]
; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT: v_writelane_b32 v41, s40, 0
; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT: v_writelane_b32 v40, s40, 0
; GFX10-NEXT: ;;#ASMSTART
; GFX10-NEXT: ; def s40
; GFX10-NEXT: ;;#ASMEND
; GFX10-NEXT: ;;#ASMSTART
; GFX10-NEXT: ; def v32
; GFX10-NEXT: ;;#ASMEND
; GFX10-NEXT: v_mov_b32_e32 v40, v32
; GFX10-NEXT: v_writelane_b32 v41, s30, 1
; GFX10-NEXT: v_writelane_b32 v41, s31, 2
; GFX10-NEXT: v_mov_b32_e32 v41, v32
; GFX10-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-NEXT: v_writelane_b32 v40, s31, 2
; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX10-NEXT: ;;#ASMSTART
; GFX10-NEXT: ; use s40
; GFX10-NEXT: ;;#ASMEND
; GFX10-NEXT: ;;#ASMSTART
; GFX10-NEXT: ; use v40
; GFX10-NEXT: ; use v41
; GFX10-NEXT: ;;#ASMEND
; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT: v_readlane_b32 s4, v41, 1
; GFX10-NEXT: v_readlane_b32 s5, v41, 2
; GFX10-NEXT: v_readlane_b32 s40, v41, 0
; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT: v_readlane_b32 s4, v40, 1
; GFX10-NEXT: v_readlane_b32 s5, v40, 2
; GFX10-NEXT: v_readlane_b32 s40, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
; GFX10-NEXT: v_readlane_b32 s33, v41, 3
; GFX10-NEXT: v_readlane_b32 s33, v40, 3
; GFX10-NEXT: s_or_saveexec_b32 s6, -1
; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s6
; GFX10-NEXT: s_waitcnt vmcnt(0)

View File

@ -598,44 +598,44 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_reuse(i32 %i, void(i32)* %fptr)
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: v_writelane_b32 v41, s33, 6
; GCN-NEXT: v_writelane_b32 v40, s33, 6
; GCN-NEXT: s_mov_b32 s33, s32
; GCN-NEXT: s_addk_i32 s32, 0x400
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GCN-NEXT: v_writelane_b32 v41, s34, 0
; GCN-NEXT: v_writelane_b32 v41, s35, 1
; GCN-NEXT: v_writelane_b32 v41, s36, 2
; GCN-NEXT: v_writelane_b32 v41, s37, 3
; GCN-NEXT: v_writelane_b32 v41, s30, 4
; GCN-NEXT: v_writelane_b32 v41, s31, 5
; GCN-NEXT: v_mov_b32_e32 v40, v0
; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
; GCN-NEXT: v_writelane_b32 v40, s34, 0
; GCN-NEXT: v_writelane_b32 v40, s35, 1
; GCN-NEXT: v_writelane_b32 v40, s36, 2
; GCN-NEXT: v_writelane_b32 v40, s37, 3
; GCN-NEXT: v_writelane_b32 v40, s30, 4
; GCN-NEXT: v_writelane_b32 v40, s31, 5
; GCN-NEXT: v_mov_b32_e32 v41, v0
; GCN-NEXT: s_mov_b64 s[34:35], exec
; GCN-NEXT: BB7_1: ; =>This Inner Loop Header: Depth=1
; GCN-NEXT: v_readfirstlane_b32 s4, v1
; GCN-NEXT: v_readfirstlane_b32 s5, v2
; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[1:2]
; GCN-NEXT: s_and_saveexec_b64 s[36:37], vcc
; GCN-NEXT: v_mov_b32_e32 v0, v40
; GCN-NEXT: v_mov_b32_e32 v0, v41
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GCN-NEXT: ; implicit-def: $vgpr1_vgpr2
; GCN-NEXT: s_xor_b64 exec, exec, s[36:37]
; GCN-NEXT: s_cbranch_execnz BB7_1
; GCN-NEXT: ; %bb.2:
; GCN-NEXT: s_mov_b64 exec, s[34:35]
; GCN-NEXT: v_mov_b32_e32 v0, v40
; GCN-NEXT: v_readlane_b32 s4, v41, 4
; GCN-NEXT: v_readlane_b32 s5, v41, 5
; GCN-NEXT: v_readlane_b32 s37, v41, 3
; GCN-NEXT: v_readlane_b32 s36, v41, 2
; GCN-NEXT: v_readlane_b32 s35, v41, 1
; GCN-NEXT: v_readlane_b32 s34, v41, 0
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GCN-NEXT: v_mov_b32_e32 v0, v41
; GCN-NEXT: v_readlane_b32 s4, v40, 4
; GCN-NEXT: v_readlane_b32 s5, v40, 5
; GCN-NEXT: v_readlane_b32 s37, v40, 3
; GCN-NEXT: v_readlane_b32 s36, v40, 2
; GCN-NEXT: v_readlane_b32 s35, v40, 1
; GCN-NEXT: v_readlane_b32 s34, v40, 0
; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
; GCN-NEXT: s_addk_i32 s32, 0xfc00
; GCN-NEXT: v_readlane_b32 s33, v41, 6
; GCN-NEXT: v_readlane_b32 s33, v40, 6
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[6:7]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[4:5]

View File

@ -123,8 +123,9 @@
; GCN-O0-NEXT: Live Register Matrix
; GCN-O0-NEXT: SI Pre-allocate WWM Registers
; GCN-O0-NEXT: Fast Register Allocator
; GCN-O0-NEXT: SI Fix VGPR copies
; GCN-O0-NEXT: SI lower SGPR spill instructions
; GCN-O0-NEXT: Fast Register Allocator
; GCN-O0-NEXT: SI Fix VGPR copies
; GCN-O0-NEXT: Fixup Statepoint Caller Saved
; GCN-O0-NEXT: Lazy Machine Block Frequency Analysis
; GCN-O0-NEXT: Machine Optimization Remark Emitter
@ -344,6 +345,12 @@
; GCN-O1-NEXT: Lazy Machine Block Frequency Analysis
; GCN-O1-NEXT: Machine Optimization Remark Emitter
; GCN-O1-NEXT: Greedy Register Allocator
; GCN-O1-NEXT: Virtual Register Rewriter
; GCN-O1-NEXT: SI lower SGPR spill instructions
; GCN-O1-NEXT: Virtual Register Map
; GCN-O1-NEXT: Live Register Matrix
; GCN-O1-NEXT: Machine Optimization Remark Emitter
; GCN-O1-NEXT: Greedy Register Allocator
; GCN-O1-NEXT: GCN NSA Reassign
; GCN-O1-NEXT: Virtual Register Rewriter
; GCN-O1-NEXT: Stack Slot Coloring
@ -351,7 +358,6 @@
; GCN-O1-NEXT: Machine Loop Invariant Code Motion
; GCN-O1-NEXT: SI Fix VGPR copies
; GCN-O1-NEXT: SI optimize exec mask operations
; GCN-O1-NEXT: SI lower SGPR spill instructions
; GCN-O1-NEXT: Fixup Statepoint Caller Saved
; GCN-O1-NEXT: PostRA Machine Sink
; GCN-O1-NEXT: MachineDominator Tree Construction
@ -622,6 +628,12 @@
; GCN-O1-OPTS-NEXT: Lazy Machine Block Frequency Analysis
; GCN-O1-OPTS-NEXT: Machine Optimization Remark Emitter
; GCN-O1-OPTS-NEXT: Greedy Register Allocator
; GCN-O1-OPTS-NEXT: Virtual Register Rewriter
; GCN-O1-OPTS-NEXT: SI lower SGPR spill instructions
; GCN-O1-OPTS-NEXT: Virtual Register Map
; GCN-O1-OPTS-NEXT: Live Register Matrix
; GCN-O1-OPTS-NEXT: Machine Optimization Remark Emitter
; GCN-O1-OPTS-NEXT: Greedy Register Allocator
; GCN-O1-OPTS-NEXT: GCN NSA Reassign
; GCN-O1-OPTS-NEXT: Virtual Register Rewriter
; GCN-O1-OPTS-NEXT: Stack Slot Coloring
@ -629,7 +641,6 @@
; GCN-O1-OPTS-NEXT: Machine Loop Invariant Code Motion
; GCN-O1-OPTS-NEXT: SI Fix VGPR copies
; GCN-O1-OPTS-NEXT: SI optimize exec mask operations
; GCN-O1-OPTS-NEXT: SI lower SGPR spill instructions
; GCN-O1-OPTS-NEXT: Fixup Statepoint Caller Saved
; GCN-O1-OPTS-NEXT: PostRA Machine Sink
; GCN-O1-OPTS-NEXT: MachineDominator Tree Construction
@ -901,6 +912,12 @@
; GCN-O2-NEXT: Lazy Machine Block Frequency Analysis
; GCN-O2-NEXT: Machine Optimization Remark Emitter
; GCN-O2-NEXT: Greedy Register Allocator
; GCN-O2-NEXT: Virtual Register Rewriter
; GCN-O2-NEXT: SI lower SGPR spill instructions
; GCN-O2-NEXT: Virtual Register Map
; GCN-O2-NEXT: Live Register Matrix
; GCN-O2-NEXT: Machine Optimization Remark Emitter
; GCN-O2-NEXT: Greedy Register Allocator
; GCN-O2-NEXT: GCN NSA Reassign
; GCN-O2-NEXT: Virtual Register Rewriter
; GCN-O2-NEXT: Stack Slot Coloring
@ -908,7 +925,6 @@
; GCN-O2-NEXT: Machine Loop Invariant Code Motion
; GCN-O2-NEXT: SI Fix VGPR copies
; GCN-O2-NEXT: SI optimize exec mask operations
; GCN-O2-NEXT: SI lower SGPR spill instructions
; GCN-O2-NEXT: Fixup Statepoint Caller Saved
; GCN-O2-NEXT: PostRA Machine Sink
; GCN-O2-NEXT: MachineDominator Tree Construction
@ -1193,6 +1209,12 @@
; GCN-O3-NEXT: Lazy Machine Block Frequency Analysis
; GCN-O3-NEXT: Machine Optimization Remark Emitter
; GCN-O3-NEXT: Greedy Register Allocator
; GCN-O3-NEXT: Virtual Register Rewriter
; GCN-O3-NEXT: SI lower SGPR spill instructions
; GCN-O3-NEXT: Virtual Register Map
; GCN-O3-NEXT: Live Register Matrix
; GCN-O3-NEXT: Machine Optimization Remark Emitter
; GCN-O3-NEXT: Greedy Register Allocator
; GCN-O3-NEXT: GCN NSA Reassign
; GCN-O3-NEXT: Virtual Register Rewriter
; GCN-O3-NEXT: Stack Slot Coloring
@ -1200,7 +1222,6 @@
; GCN-O3-NEXT: Machine Loop Invariant Code Motion
; GCN-O3-NEXT: SI Fix VGPR copies
; GCN-O3-NEXT: SI optimize exec mask operations
; GCN-O3-NEXT: SI lower SGPR spill instructions
; GCN-O3-NEXT: Fixup Statepoint Caller Saved
; GCN-O3-NEXT: PostRA Machine Sink
; GCN-O3-NEXT: MachineDominator Tree Construction

View File

@ -189,44 +189,44 @@ define void @slsr1_1(i32 %b.arg, i32 %s.arg) #0 {
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-NEXT: v_writelane_b32 v43, s33, 4
; GFX9-NEXT: v_writelane_b32 v40, s33, 4
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x800
; GFX9-NEXT: v_writelane_b32 v43, s34, 0
; GFX9-NEXT: v_writelane_b32 v40, s34, 0
; GFX9-NEXT: s_getpc_b64 s[4:5]
; GFX9-NEXT: s_add_u32 s4, s4, foo@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, foo@gotpcrel32@hi+12
; GFX9-NEXT: v_writelane_b32 v43, s35, 1
; GFX9-NEXT: v_writelane_b32 v40, s35, 1
; GFX9-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x0
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: v_mov_b32_e32 v40, v1
; GFX9-NEXT: v_mov_b32_e32 v41, v0
; GFX9-NEXT: v_writelane_b32 v43, s30, 2
; GFX9-NEXT: v_mul_u32_u24_e32 v0, v41, v40
; GFX9-NEXT: v_writelane_b32 v43, s31, 3
; GFX9-NEXT: v_and_b32_e32 v42, 0xffffff, v40
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: v_mov_b32_e32 v41, v1
; GFX9-NEXT: v_mov_b32_e32 v42, v0
; GFX9-NEXT: v_writelane_b32 v40, s30, 2
; GFX9-NEXT: v_mul_u32_u24_e32 v0, v42, v41
; GFX9-NEXT: v_writelane_b32 v40, s31, 3
; GFX9-NEXT: v_and_b32_e32 v43, 0xffffff, v41
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_mad_u32_u24 v40, v41, v40, v42
; GFX9-NEXT: v_mov_b32_e32 v0, v40
; GFX9-NEXT: v_mad_u32_u24 v41, v42, v41, v43
; GFX9-NEXT: v_mov_b32_e32 v0, v41
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_add_u32_e32 v0, v40, v42
; GFX9-NEXT: v_add_u32_e32 v0, v41, v43
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; GFX9-NEXT: v_readlane_b32 s4, v43, 2
; GFX9-NEXT: v_readlane_b32 s5, v43, 3
; GFX9-NEXT: v_readlane_b32 s35, v43, 1
; GFX9-NEXT: v_readlane_b32 s34, v43, 0
; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; GFX9-NEXT: v_readlane_b32 s4, v40, 2
; GFX9-NEXT: v_readlane_b32 s5, v40, 3
; GFX9-NEXT: v_readlane_b32 s35, v40, 1
; GFX9-NEXT: v_readlane_b32 s34, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xf800
; GFX9-NEXT: v_readlane_b32 s33, v43, 4
; GFX9-NEXT: v_readlane_b32 s33, v40, 4
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[4:5]

View File

@ -1,12 +1,12 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs -run-pass=prologepilog -o - %s | FileCheck -check-prefix=MUBUF %s
# RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs -run-pass=si-lower-sgpr-spills,prologepilog -o - %s | FileCheck -check-prefix=MUBUF-V2A %s
# RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs -amdgpu-enable-flat-scratch -run-pass=prologepilog -o - %s | FileCheck -check-prefix=FLATSCR %s
# RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs -amdgpu-enable-flat-scratch -run-pass=si-lower-sgpr-spills,prologepilog -o - %s | FileCheck -check-prefix=FLATSCR-V2A %s
# RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -run-pass=prologepilog -o - %s | FileCheck -check-prefix=MUBUF-GFX90A %s
# RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -run-pass=si-lower-sgpr-spills,prologepilog -o - %s | FileCheck -check-prefix=MUBUF-GFX90A-V2A %s
# RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -amdgpu-enable-flat-scratch -run-pass=prologepilog -o - %s | FileCheck -check-prefix=FLATSCR-GFX90A %s
# RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -amdgpu-enable-flat-scratch -run-pass=si-lower-sgpr-spills,prologepilog -o - %s | FileCheck -check-prefix=FLATSCR-GFX90A-V2A %s
# RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs -amdgpu-spill-vgpr-to-agpr=0 -run-pass=prologepilog -o - %s | FileCheck -check-prefix=MUBUF %s
# RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs -amdgpu-spill-vgpr-to-agpr=1 -run-pass=prologepilog -o - %s | FileCheck -check-prefix=MUBUF-V2A %s
# RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs -amdgpu-enable-flat-scratch -amdgpu-spill-vgpr-to-agpr=0 -run-pass=prologepilog -o - %s | FileCheck -check-prefix=FLATSCR %s
# RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs -amdgpu-enable-flat-scratch -amdgpu-spill-vgpr-to-agpr=1 -run-pass=prologepilog -o - %s | FileCheck -check-prefix=FLATSCR-V2A %s
# RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -amdgpu-spill-vgpr-to-agpr=0 -run-pass=prologepilog -o - %s | FileCheck -check-prefix=MUBUF-GFX90A %s
# RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -amdgpu-spill-vgpr-to-agpr=1 -run-pass=prologepilog -o - %s | FileCheck -check-prefix=MUBUF-GFX90A-V2A %s
# RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -amdgpu-enable-flat-scratch -amdgpu-spill-vgpr-to-agpr=0 -run-pass=prologepilog -o - %s | FileCheck -check-prefix=FLATSCR-GFX90A %s
# RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -amdgpu-enable-flat-scratch -amdgpu-spill-vgpr-to-agpr=1 -run-pass=prologepilog -o - %s | FileCheck -check-prefix=FLATSCR-GFX90A-V2A %s
---
name: test_spill_v1

View File

@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs --stress-regalloc=2 -start-before=greedy -stop-after=virtregrewriter -o - %s | FileCheck -check-prefix=GCN %s
# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs --stress-regalloc=2 -start-before=greedy,0 -stop-after=virtregrewriter,1 -o - %s | FileCheck -check-prefix=GCN %s
---
name: test_remat_v_mov_b32_e32

View File

@ -0,0 +1,108 @@
; REQUIRES: asserts
; RUN: llc -mtriple=amdgcn-amd-amdhsa -debug-pass=Structure -o /dev/null %s 2>&1 | FileCheck -check-prefix=DEFAULT %s
; RUN: llc -sgpr-regalloc=greedy -vgpr-regalloc=greedy -mtriple=amdgcn-amd-amdhsa -debug-pass=Structure -o /dev/null %s 2>&1 | FileCheck -check-prefix=DEFAULT %s
; RUN: llc -O0 -mtriple=amdgcn-amd-amdhsa -debug-pass=Structure -o /dev/null %s 2>&1 | FileCheck -check-prefix=O0 %s
; RUN: llc -vgpr-regalloc=basic -mtriple=amdgcn-amd-amdhsa -debug-pass=Structure -o /dev/null %s 2>&1 | FileCheck -check-prefix=DEFAULT-BASIC %s
; RUN: llc -sgpr-regalloc=basic -mtriple=amdgcn-amd-amdhsa -debug-pass=Structure -o /dev/null %s 2>&1 | FileCheck -check-prefix=BASIC-DEFAULT %s
; RUN: llc -sgpr-regalloc=basic -vgpr-regalloc=basic -mtriple=amdgcn-amd-amdhsa -debug-pass=Structure -o /dev/null %s 2>&1 | FileCheck -check-prefix=BASIC-BASIC %s
; RUN: not --crash llc -regalloc=basic -mtriple=amdgcn-amd-amdhsa -debug-pass=Structure -o /dev/null %s 2>&1 | FileCheck -check-prefix=REGALLOC %s
; RUN: not --crash llc -regalloc=fast -O0 -mtriple=amdgcn-amd-amdhsa -debug-pass=Structure -o /dev/null %s 2>&1 | FileCheck -check-prefix=REGALLOC %s
; REGALLOC: -regalloc not supported with amdgcn. Use -sgpr-regalloc and -vgpr-regalloc
; DEFAULT: Greedy Register Allocator
; DEFAULT-NEXT: Virtual Register Rewriter
; DEFAULT-NEXT: SI lower SGPR spill instructions
; DEFAULT-NEXT: Virtual Register Map
; DEFAULT-NEXT: Live Register Matrix
; DEFAULT-NEXT: Machine Optimization Remark Emitter
; DEFAULT-NEXT: Greedy Register Allocator
; DEFAULT-NEXT: GCN NSA Reassign
; DEFAULT-NEXT: Virtual Register Rewriter
; DEFAULT-NEXT: Stack Slot Coloring
; O0: Fast Register Allocator
; O0-NEXT: SI lower SGPR spill instructions
; O0-NEXT: Fast Register Allocator
; O0-NEXT: SI Fix VGPR copies
; BASIC-DEFAULT: Debug Variable Analysis
; BASIC-DEFAULT-NEXT: Live Stack Slot Analysis
; BASIC-DEFAULT-NEXT: Machine Natural Loop Construction
; BASIC-DEFAULT-NEXT: Machine Block Frequency Analysis
; BASIC-DEFAULT-NEXT: Virtual Register Map
; BASIC-DEFAULT-NEXT: Live Register Matrix
; BASIC-DEFAULT-NEXT: Basic Register Allocator
; BASIC-DEFAULT-NEXT: Virtual Register Rewriter
; BASIC-DEFAULT-NEXT: SI lower SGPR spill instructions
; BASIC-DEFAULT-NEXT: Virtual Register Map
; BASIC-DEFAULT-NEXT: Live Register Matrix
; BASIC-DEFAULT-NEXT: Bundle Machine CFG Edges
; BASIC-DEFAULT-NEXT: Spill Code Placement Analysis
; BASIC-DEFAULT-NEXT: Lazy Machine Block Frequency Analysis
; BASIC-DEFAULT-NEXT: Machine Optimization Remark Emitter
; BASIC-DEFAULT-NEXT: Greedy Register Allocator
; BASIC-DEFAULT-NEXT: GCN NSA Reassign
; BASIC-DEFAULT-NEXT: Virtual Register Rewriter
; BASIC-DEFAULT-NEXT: Stack Slot Coloring
; DEFAULT-BASIC: Greedy Register Allocator
; DEFAULT-BASIC-NEXT: Virtual Register Rewriter
; DEFAULT-BASIC-NEXT: SI lower SGPR spill instructions
; DEFAULT-BASIC-NEXT: Virtual Register Map
; DEFAULT-BASIC-NEXT: Live Register Matrix
; DEFAULT-BASIC-NEXT: Basic Register Allocator
; DEFAULT-BASIC-NEXT: GCN NSA Reassign
; DEFAULT-BASIC-NEXT: Virtual Register Rewriter
; DEFAULT-BASIC-NEXT: Stack Slot Coloring
; BASIC-BASIC: Debug Variable Analysis
; BASIC-BASIC-NEXT: Live Stack Slot Analysis
; BASIC-BASIC-NEXT: Machine Natural Loop Construction
; BASIC-BASIC-NEXT: Machine Block Frequency Analysis
; BASIC-BASIC-NEXT: Virtual Register Map
; BASIC-BASIC-NEXT: Live Register Matrix
; BASIC-BASIC-NEXT: Basic Register Allocator
; BASIC-BASIC-NEXT: Virtual Register Rewriter
; BASIC-BASIC-NEXT: SI lower SGPR spill instructions
; BASIC-BASIC-NEXT: Virtual Register Map
; BASIC-BASIC-NEXT: Live Register Matrix
; BASIC-BASIC-NEXT: Basic Register Allocator
; BASIC-BASIC-NEXT: GCN NSA Reassign
; BASIC-BASIC-NEXT: Virtual Register Rewriter
; BASIC-BASIC-NEXT: Stack Slot Coloring
declare void @bar()
; Something with some CSR SGPR spills
define void @foo() {
; Clobbering the callee-saved SGPR s33 via inline asm, combined with a
; real call to @bar, forces CSR SGPR spill/restore code to be emitted.
call void asm sideeffect "; clobber", "~{s33}"()
call void @bar()
ret void
}
; Block live out spills with fast regalloc
define amdgpu_kernel void @control_flow(i1 %cond) {
; %s33 is defined in the entry block but only used in %bb0, so its value
; is live out of the entry block across the conditional branch.
%s33 = call i32 asm sideeffect "; clobber", "={s33}"()
br i1 %cond, label %bb0, label %bb1
bb0:
call void asm sideeffect "; use %0", "s"(i32 %s33)
br label %bb1
bb1:
ret void
}

View File

@ -0,0 +1,234 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; The first 64 SGPR spills can go to a VGPR, but there isn't a second one,
; so some spills must be to memory. The last 16-element spill runs out of lanes at the 15th element.
define amdgpu_kernel void @partial_no_vgprs_last_sgpr_spill(i32 addrspace(1)* %out, i32 %in) #1 {
; GCN-LABEL: partial_no_vgprs_last_sgpr_spill:
; GCN: ; %bb.0:
; GCN-NEXT: s_add_u32 s0, s0, s7
; GCN-NEXT: s_addc_u32 s1, s1, 0
; GCN-NEXT: s_load_dword s4, s[4:5], 0x2
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ; def s[8:23]
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: v_writelane_b32 v23, s8, 0
; GCN-NEXT: v_writelane_b32 v23, s9, 1
; GCN-NEXT: v_writelane_b32 v23, s10, 2
; GCN-NEXT: v_writelane_b32 v23, s11, 3
; GCN-NEXT: v_writelane_b32 v23, s12, 4
; GCN-NEXT: v_writelane_b32 v23, s13, 5
; GCN-NEXT: v_writelane_b32 v23, s14, 6
; GCN-NEXT: v_writelane_b32 v23, s15, 7
; GCN-NEXT: v_writelane_b32 v23, s16, 8
; GCN-NEXT: v_writelane_b32 v23, s17, 9
; GCN-NEXT: v_writelane_b32 v23, s18, 10
; GCN-NEXT: v_writelane_b32 v23, s19, 11
; GCN-NEXT: v_writelane_b32 v23, s20, 12
; GCN-NEXT: v_writelane_b32 v23, s21, 13
; GCN-NEXT: v_writelane_b32 v23, s22, 14
; GCN-NEXT: v_writelane_b32 v23, s23, 15
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ; def s[8:23]
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: v_writelane_b32 v23, s8, 16
; GCN-NEXT: v_writelane_b32 v23, s9, 17
; GCN-NEXT: v_writelane_b32 v23, s10, 18
; GCN-NEXT: v_writelane_b32 v23, s11, 19
; GCN-NEXT: v_writelane_b32 v23, s12, 20
; GCN-NEXT: v_writelane_b32 v23, s13, 21
; GCN-NEXT: v_writelane_b32 v23, s14, 22
; GCN-NEXT: v_writelane_b32 v23, s15, 23
; GCN-NEXT: v_writelane_b32 v23, s16, 24
; GCN-NEXT: v_writelane_b32 v23, s17, 25
; GCN-NEXT: v_writelane_b32 v23, s18, 26
; GCN-NEXT: v_writelane_b32 v23, s19, 27
; GCN-NEXT: v_writelane_b32 v23, s20, 28
; GCN-NEXT: v_writelane_b32 v23, s21, 29
; GCN-NEXT: v_writelane_b32 v23, s22, 30
; GCN-NEXT: v_writelane_b32 v23, s23, 31
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ; def s[8:23]
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: v_writelane_b32 v23, s8, 32
; GCN-NEXT: v_writelane_b32 v23, s9, 33
; GCN-NEXT: v_writelane_b32 v23, s10, 34
; GCN-NEXT: v_writelane_b32 v23, s11, 35
; GCN-NEXT: v_writelane_b32 v23, s12, 36
; GCN-NEXT: v_writelane_b32 v23, s13, 37
; GCN-NEXT: v_writelane_b32 v23, s14, 38
; GCN-NEXT: v_writelane_b32 v23, s15, 39
; GCN-NEXT: v_writelane_b32 v23, s16, 40
; GCN-NEXT: v_writelane_b32 v23, s17, 41
; GCN-NEXT: v_writelane_b32 v23, s18, 42
; GCN-NEXT: v_writelane_b32 v23, s19, 43
; GCN-NEXT: v_writelane_b32 v23, s20, 44
; GCN-NEXT: v_writelane_b32 v23, s21, 45
; GCN-NEXT: v_writelane_b32 v23, s22, 46
; GCN-NEXT: v_writelane_b32 v23, s23, 47
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ; def s[8:23]
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: v_writelane_b32 v23, s8, 48
; GCN-NEXT: v_writelane_b32 v23, s9, 49
; GCN-NEXT: v_writelane_b32 v23, s10, 50
; GCN-NEXT: v_writelane_b32 v23, s11, 51
; GCN-NEXT: v_writelane_b32 v23, s12, 52
; GCN-NEXT: v_writelane_b32 v23, s13, 53
; GCN-NEXT: v_writelane_b32 v23, s14, 54
; GCN-NEXT: v_writelane_b32 v23, s15, 55
; GCN-NEXT: v_writelane_b32 v23, s16, 56
; GCN-NEXT: v_writelane_b32 v23, s17, 57
; GCN-NEXT: v_writelane_b32 v23, s18, 58
; GCN-NEXT: v_writelane_b32 v23, s19, 59
; GCN-NEXT: v_writelane_b32 v23, s20, 60
; GCN-NEXT: v_writelane_b32 v23, s21, 61
; GCN-NEXT: v_writelane_b32 v23, s22, 62
; GCN-NEXT: v_writelane_b32 v23, s23, 63
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ; def s[6:7]
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: s_mov_b64 s[8:9], exec
; GCN-NEXT: s_mov_b64 exec, 3
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GCN-NEXT: v_writelane_b32 v0, s6, 0
; GCN-NEXT: v_writelane_b32 v0, s7, 1
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Spill
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_mov_b64 exec, s[8:9]
; GCN-NEXT: s_mov_b32 s5, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_lg_u32 s4, s5
; GCN-NEXT: s_cbranch_scc1 BB0_2
; GCN-NEXT: ; %bb.1: ; %bb0
; GCN-NEXT: v_readlane_b32 s4, v23, 0
; GCN-NEXT: v_readlane_b32 s5, v23, 1
; GCN-NEXT: v_readlane_b32 s6, v23, 2
; GCN-NEXT: v_readlane_b32 s7, v23, 3
; GCN-NEXT: v_readlane_b32 s8, v23, 4
; GCN-NEXT: v_readlane_b32 s9, v23, 5
; GCN-NEXT: v_readlane_b32 s10, v23, 6
; GCN-NEXT: v_readlane_b32 s11, v23, 7
; GCN-NEXT: v_readlane_b32 s12, v23, 8
; GCN-NEXT: v_readlane_b32 s13, v23, 9
; GCN-NEXT: v_readlane_b32 s14, v23, 10
; GCN-NEXT: v_readlane_b32 s15, v23, 11
; GCN-NEXT: v_readlane_b32 s16, v23, 12
; GCN-NEXT: v_readlane_b32 s17, v23, 13
; GCN-NEXT: v_readlane_b32 s18, v23, 14
; GCN-NEXT: v_readlane_b32 s19, v23, 15
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ; use s[4:19]
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: v_readlane_b32 s4, v23, 16
; GCN-NEXT: v_readlane_b32 s5, v23, 17
; GCN-NEXT: v_readlane_b32 s6, v23, 18
; GCN-NEXT: v_readlane_b32 s7, v23, 19
; GCN-NEXT: v_readlane_b32 s8, v23, 20
; GCN-NEXT: v_readlane_b32 s9, v23, 21
; GCN-NEXT: v_readlane_b32 s10, v23, 22
; GCN-NEXT: v_readlane_b32 s11, v23, 23
; GCN-NEXT: v_readlane_b32 s12, v23, 24
; GCN-NEXT: v_readlane_b32 s13, v23, 25
; GCN-NEXT: v_readlane_b32 s14, v23, 26
; GCN-NEXT: v_readlane_b32 s15, v23, 27
; GCN-NEXT: v_readlane_b32 s16, v23, 28
; GCN-NEXT: v_readlane_b32 s17, v23, 29
; GCN-NEXT: v_readlane_b32 s18, v23, 30
; GCN-NEXT: v_readlane_b32 s19, v23, 31
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ; use s[4:19]
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: v_readlane_b32 s4, v23, 32
; GCN-NEXT: v_readlane_b32 s5, v23, 33
; GCN-NEXT: v_readlane_b32 s6, v23, 34
; GCN-NEXT: v_readlane_b32 s7, v23, 35
; GCN-NEXT: v_readlane_b32 s8, v23, 36
; GCN-NEXT: v_readlane_b32 s9, v23, 37
; GCN-NEXT: v_readlane_b32 s10, v23, 38
; GCN-NEXT: v_readlane_b32 s11, v23, 39
; GCN-NEXT: v_readlane_b32 s12, v23, 40
; GCN-NEXT: v_readlane_b32 s13, v23, 41
; GCN-NEXT: v_readlane_b32 s14, v23, 42
; GCN-NEXT: v_readlane_b32 s15, v23, 43
; GCN-NEXT: v_readlane_b32 s16, v23, 44
; GCN-NEXT: v_readlane_b32 s17, v23, 45
; GCN-NEXT: v_readlane_b32 s18, v23, 46
; GCN-NEXT: v_readlane_b32 s19, v23, 47
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ; use s[4:19]
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: v_readlane_b32 s8, v23, 48
; GCN-NEXT: v_readlane_b32 s9, v23, 49
; GCN-NEXT: v_readlane_b32 s10, v23, 50
; GCN-NEXT: v_readlane_b32 s11, v23, 51
; GCN-NEXT: v_readlane_b32 s12, v23, 52
; GCN-NEXT: v_readlane_b32 s13, v23, 53
; GCN-NEXT: v_readlane_b32 s14, v23, 54
; GCN-NEXT: v_readlane_b32 s15, v23, 55
; GCN-NEXT: v_readlane_b32 s16, v23, 56
; GCN-NEXT: v_readlane_b32 s17, v23, 57
; GCN-NEXT: v_readlane_b32 s18, v23, 58
; GCN-NEXT: v_readlane_b32 s19, v23, 59
; GCN-NEXT: v_readlane_b32 s20, v23, 60
; GCN-NEXT: v_readlane_b32 s21, v23, 61
; GCN-NEXT: v_readlane_b32 s22, v23, 62
; GCN-NEXT: v_readlane_b32 s23, v23, 63
; GCN-NEXT: s_mov_b64 s[6:7], exec
; GCN-NEXT: s_mov_b64 exec, 3
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_readlane_b32 s4, v0, 0
; GCN-NEXT: v_readlane_b32 s5, v0, 1
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_mov_b64 exec, s[6:7]
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ; use s[8:23]
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ; use s[4:5]
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: BB0_2: ; %ret
; GCN-NEXT: s_endpgm
; Reserve v0-v22 via inline-asm clobbers, leaving only a single VGPR
; available for SGPR-to-VGPR spilling (the checks above spill into v23).
; NOTE(review): presumably attribute #1's "amdgpu-waves-per-eu"="10,10"
; is what caps the VGPR budget this low -- confirm against the target.
call void asm sideeffect "", "~{v[0:7]}" () #0
call void asm sideeffect "", "~{v[8:15]}" () #0
call void asm sideeffect "", "~{v[16:19]}"() #0
call void asm sideeffect "", "~{v[20:21]}"() #0
call void asm sideeffect "", "~{v22}"() #0
; Four 16-wide SGPR defs fill all 64 lanes of the spill VGPR; the final
; 2-wide def (%wide.sgpr4) has no lanes left, and the checks above show
; it being saved/restored through scratch memory instead.
%wide.sgpr0 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0
%wide.sgpr1 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0
%wide.sgpr2 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0
%wide.sgpr3 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0
%wide.sgpr4 = call <2 x i32> asm sideeffect "; def $0", "=s" () #0
; The uses in %bb0 keep every def live across this branch, so all of the
; values above must actually be spilled and reloaded.
%cmp = icmp eq i32 %in, 0
br i1 %cmp, label %bb0, label %ret
bb0:
call void asm sideeffect "; use $0", "s"(<16 x i32> %wide.sgpr0) #0
call void asm sideeffect "; use $0", "s"(<16 x i32> %wide.sgpr1) #0
call void asm sideeffect "; use $0", "s"(<16 x i32> %wide.sgpr2) #0
call void asm sideeffect "; use $0", "s"(<16 x i32> %wide.sgpr3) #0
call void asm sideeffect "; use $0", "s"(<2 x i32> %wide.sgpr4) #0
br label %ret
ret:
ret void
}
attributes #0 = { nounwind }
attributes #1 = { nounwind "amdgpu-waves-per-eu"="10,10" }

View File

@ -1,5 +1,8 @@
# RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -stress-regalloc=3 -start-before=greedy -stop-after=stack-slot-coloring -o - %s | FileCheck -check-prefixes=SHARE,GCN %s
# RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -stress-regalloc=3 -start-before=greedy -stop-after=stack-slot-coloring -no-stack-slot-sharing -o - %s | FileCheck -check-prefixes=NOSHARE,GCN %s
# RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -stress-regalloc=3 -run-pass=greedy,virtregrewriter,stack-slot-coloring -o - %s | FileCheck -check-prefixes=SHARE,GCN %s
# RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -stress-regalloc=3 -run-pass=greedy,virtregrewriter,stack-slot-coloring -no-stack-slot-sharing -o - %s | FileCheck -check-prefixes=NOSHARE,GCN %s
# -run-pass is used to artificially avoid using split register allocation, which would avoid stressing StackSlotColoring.
# Make sure that stack slot coloring doesn't try to merge frame
# indexes used for SGPR spilling with those that aren't.

View File

@ -197,15 +197,15 @@ entry:
; Have another non-tail in the function
; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_other_call:
; GCN: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword [[CSRV:v[0-9]+]], off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec
; GCN: s_mov_b32 s33, s32
; GCN-DAG: s_addk_i32 s32, 0x400
; GCN-DAG: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GCN-DAG: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
; GCN-DAG: v_writelane_b32 v42, s34, 0
; GCN-DAG: v_writelane_b32 v42, s35, 1
; GCN-DAG: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GCN-DAG: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
; GCN-DAG: v_writelane_b32 [[CSRV]], s34, 0
; GCN-DAG: v_writelane_b32 [[CSRV]], s35, 1
; GCN-DAG: s_getpc_b64 s[4:5]
; GCN-DAG: s_add_u32 s4, s4, i32_fastcc_i32_i32@gotpcrel32@lo+4
@ -214,20 +214,20 @@ entry:
; GCN: s_swappc_b64
; GCN-DAG: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
; GCN-DAG: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GCN-DAG: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
; GCN-DAG: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GCN: s_getpc_b64 s[4:5]
; GCN-NEXT: s_add_u32 s4, s4, sibling_call_i32_fastcc_i32_i32@rel32@lo+4
; GCN-NEXT: s_addc_u32 s5, s5, sibling_call_i32_fastcc_i32_i32@rel32@hi+12
; GCN-DAG: v_readlane_b32 s34, v42, 0
; GCN-DAG: v_readlane_b32 s35, v42, 1
; GCN-DAG: v_readlane_b32 s34, [[CSRV]], 0
; GCN-DAG: v_readlane_b32 s35, [[CSRV]], 1
; GCN: s_addk_i32 s32, 0xfc00
; GCN-NEXT: v_readlane_b32 s33,
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword [[CSRV]], off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[6:7]
; GCN-NEXT: s_setpc_b64 s[4:5]
define fastcc i32 @sibling_call_i32_fastcc_i32_i32_other_call(i32 %a, i32 %b, i32 %c) #1 {

View File

@ -1,4 +1,4 @@
# RUN: llc -mtriple=amdgcn-amd-amdhsa -amdgpu-dce-in-ra=0 -verify-machineinstrs -stress-regalloc=1 -start-before=simple-register-coalescing -stop-after=greedy -o - %s | FileCheck %s
# RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -amdgpu-dce-in-ra=0 -stress-regalloc=1 -start-before=simple-register-coalescing -stop-after=greedy,1 -o - %s | FileCheck %s
# https://bugs.llvm.org/show_bug.cgi?id=33620
---

View File

@ -1,5 +1,5 @@
; RUN: llc -march=amdgcn -mcpu=verde -enable-misched=0 -post-RA-scheduler=0 -amdgpu-spill-sgpr-to-vgpr=0 < %s | FileCheck -check-prefixes=CHECK,GFX6 %s
; RUN: llc -regalloc=basic -march=amdgcn -mcpu=tonga -enable-misched=0 -post-RA-scheduler=0 -amdgpu-spill-sgpr-to-vgpr=0 < %s | FileCheck --check-prefix=CHECK %s
; RUN: llc -sgpr-regalloc=basic -vgpr-regalloc=basic -march=amdgcn -mcpu=tonga -enable-misched=0 -post-RA-scheduler=0 -amdgpu-spill-sgpr-to-vgpr=0 < %s | FileCheck --check-prefix=CHECK %s
; RUN: llc -march=amdgcn -mattr=-xnack -mcpu=gfx900 -enable-misched=0 -post-RA-scheduler=0 -amdgpu-spill-sgpr-to-vgpr=0 -amdgpu-enable-flat-scratch < %s | FileCheck -check-prefixes=CHECK,GFX9-FLATSCR,FLATSCR %s
; RUN: llc -march=amdgcn -mcpu=gfx1030 -enable-misched=0 -post-RA-scheduler=0 -amdgpu-spill-sgpr-to-vgpr=0 -amdgpu-enable-flat-scratch < %s | FileCheck -check-prefixes=CHECK,GFX10-FLATSCR,FLATSCR %s
;

View File

@ -22,12 +22,11 @@ define void @spill_more_than_wavesize_csr_sgprs() {
}
; CHECK-LABEL: {{^}}spill_more_than_wavesize_csr_sgprs_with_stack_object:
; CHECK-DAG: v_writelane_b32 v1, s98, 63
; CHECK-DAG: v_writelane_b32 v2, s99, 0
; CHECK-DAG: v_writelane_b32 v0, s98, 63
; CHECK-DAG: v_writelane_b32 v1, s99, 0
; CHECK-NOT: dummy
; CHECK-DAG: v_readlane_b32 s99, v2, 0
; CHECK-DAG: v_readlane_b32 s98, v1, 63
; CHECK-DAG: v_readlane_b32 s99, v1, 0
; CHECK-DAG: v_readlane_b32 s98, v0, 63
define void @spill_more_than_wavesize_csr_sgprs_with_stack_object() {
%alloca = alloca i32, align 4, addrspace(5)
store volatile i32 0, i32 addrspace(5)* %alloca

View File

@ -1,4 +1,7 @@
# RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -stress-regalloc=1 -start-before=greedy -stop-after=stack-slot-coloring -o - %s | FileCheck %s
# Note we are NOT using the normal register allocator pipeline. We are
# forcing allocation of VGPRs and SGPRs at the same time.
# RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -stress-regalloc=1 -run-pass=greedy,virtregrewriter,stack-slot-coloring -o - %s | FileCheck %s
---
# CHECK-LABEL: name: no_merge_sgpr_vgpr_spill_slot{{$}}

View File

@ -89,7 +89,6 @@ define hidden void @widget() {
; SI-OPT: bb12:
; SI-OPT-NEXT: store float 0.000000e+00, float addrspace(1)* null, align 8
; SI-OPT-NEXT: ret void
;
bb:
%tmp = load i32, i32 addrspace(1)* null, align 16
%tmp1 = icmp slt i32 %tmp, 21
@ -187,30 +186,30 @@ define hidden void @blam() {
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: v_writelane_b32 v43, s33, 4
; GCN-NEXT: v_writelane_b32 v40, s33, 4
; GCN-NEXT: s_mov_b32 s33, s32
; GCN-NEXT: s_addk_i32 s32, 0x800
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
; GCN-NEXT: v_writelane_b32 v43, s34, 0
; GCN-NEXT: v_writelane_b32 v43, s35, 1
; GCN-NEXT: v_writelane_b32 v43, s36, 2
; GCN-NEXT: v_writelane_b32 v43, s37, 3
; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill
; GCN-NEXT: v_writelane_b32 v40, s34, 0
; GCN-NEXT: v_writelane_b32 v40, s35, 1
; GCN-NEXT: v_writelane_b32 v40, s36, 2
; GCN-NEXT: v_writelane_b32 v40, s37, 3
; GCN-NEXT: s_mov_b64 s[4:5], 0
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: v_mov_b32_e32 v2, 0
; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GCN-NEXT: flat_load_dword v40, v[1:2]
; GCN-NEXT: v_mov_b32_e32 v42, 0
; GCN-NEXT: flat_load_dword v41, v[1:2]
; GCN-NEXT: v_mov_b32_e32 v43, 0
; GCN-NEXT: s_getpc_b64 s[36:37]
; GCN-NEXT: s_add_u32 s36, s36, spam@rel32@lo+4
; GCN-NEXT: s_addc_u32 s37, s37, spam@rel32@hi+12
; GCN-NEXT: v_lshlrev_b32_e32 v41, 2, v0
; GCN-NEXT: v_lshlrev_b32_e32 v42, 2, v0
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN-NEXT: v_cmp_eq_f32_e64 s[34:35], 0, v40
; GCN-NEXT: v_cmp_eq_f32_e64 s[34:35], 0, v41
; GCN-NEXT: s_branch BB1_3
; GCN-NEXT: BB1_1: ; %bb10
; GCN-NEXT: ; in Loop: Header=BB1_3 Depth=1
@ -229,7 +228,7 @@ define hidden void @blam() {
; GCN-NEXT: BB1_4: ; %bb2
; GCN-NEXT: ; Parent Loop BB1_3 Depth=1
; GCN-NEXT: ; => This Inner Loop Header: Depth=2
; GCN-NEXT: flat_load_dword v0, v[41:42]
; GCN-NEXT: flat_load_dword v0, v[42:43]
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0
; GCN-NEXT: s_waitcnt vmcnt(1)
@ -273,7 +272,7 @@ define hidden void @blam() {
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GCN-NEXT: BB1_10: ; %bb17
; GCN-NEXT: ; in Loop: Header=BB1_3 Depth=1
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], 0
; GCN-NEXT: buffer_store_dword v41, off, s[0:3], 0
; GCN-NEXT: s_branch BB1_2
bb:
%tmp = load float, float* null, align 16

View File

@ -8,11 +8,11 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
; preserved across the call and should get 8 scratch registers.
; GFX9-LABEL: non_preserved_vgpr_tuple8:
; GFX9: buffer_store_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GFX9: buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GFX9: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9: v_mov_b32_e32 v36, v16
; GFX9-NEXT: v_mov_b32_e32 v35, v15
@ -21,28 +21,28 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
; GFX9-NEXT: v_mov_b32_e32 v32, v12
; GFX9: ;;#ASMSTART
; GFX9-NEXT: ;;#ASMEND
; GFX9: image_gather4_c_b_cl v[40:43], v[32:36], s[4:11], s[4:7] dmask:0x1
; GFX9: image_gather4_c_b_cl v[41:44], v[32:36], s[4:11], s[4:7] dmask:0x1
; GFX9-NEXT: s_getpc_b64 s[4:5]
; GFX9-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX9-NEXT: v_writelane_b32 v44, s30, 0
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX9: buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
; GFX9: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; GFX9: buffer_load_dword v44, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
; GFX9: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; GFX9: s_setpc_b64 s[4:5]
;
; GFX10-LABEL: non_preserved_vgpr_tuple8:
; GFX10: buffer_store_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GFX10: buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GFX10: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10: v_mov_b32_e32 v36, v16
; GFX10-NEXT: v_mov_b32_e32 v35, v15
@ -53,7 +53,7 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
; GFX10: ;;#ASMSTART
; GFX10-NEXT: ;;#ASMEND
; GFX10: image_gather4_c_b_cl v[40:43], v[32:36], s[4:11], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D
; GFX10: image_gather4_c_b_cl v[41:44], v[32:36], s[4:11], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_getpc_b64 s[4:5]
; GFX10-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
@ -62,12 +62,12 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
; GFX10: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX10: buffer_load_dword v43, off, s[0:3], s33
; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4
; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8
; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12
; GFX10: buffer_load_dword v44, off, s[0:3], s33
; GFX10-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:4
; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8
; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:12
; GFX10: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; GFX10: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; GFX10: s_setpc_b64 s[4:5]
main_body:
call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}"() #0
@ -86,20 +86,20 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
; The upper 3 sub-registers are unused.
; GFX9-LABEL: call_preserved_vgpr_tuple8:
; GFX9: buffer_store_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GFX9: buffer_store_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9: buffer_store_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GFX9: buffer_store_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9: v_mov_b32_e32 v44, v16
; GFX9-NEXT: v_mov_b32_e32 v43, v15
; GFX9-NEXT: v_mov_b32_e32 v42, v14
; GFX9-NEXT: v_mov_b32_e32 v41, v13
; GFX9-NEXT: v_mov_b32_e32 v40, v12
; GFX9: v_mov_b32_e32 v45, v16
; GFX9-NEXT: v_mov_b32_e32 v44, v15
; GFX9-NEXT: v_mov_b32_e32 v43, v14
; GFX9-NEXT: v_mov_b32_e32 v42, v13
; GFX9-NEXT: v_mov_b32_e32 v41, v12
; GFX9: image_gather4_c_b_cl v[0:3], v[40:44], s[36:43], s[4:7] dmask:0x1
; GFX9: image_gather4_c_b_cl v[0:3], v[41:45], s[36:43], s[4:7] dmask:0x1
; GFX9-NEXT: s_getpc_b64 s[4:5]
; GFX9-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12
@ -108,24 +108,24 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT: image_gather4_c_b_cl v[0:3], v[40:44], s[36:43], s[4:7] dmask:0x1
; GFX9-NEXT: image_gather4_c_b_cl v[0:3], v[41:45], s[36:43], s[4:7] dmask:0x1
; GFX9: buffer_load_dword v44, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
; GFX9: buffer_load_dword v45, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
; GFX9: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; GFX9: buffer_load_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; GFX9: s_setpc_b64 s[4:5]
;
; GFX10-LABEL: call_preserved_vgpr_tuple8:
; GFX10: buffer_store_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GFX10: buffer_store_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10: buffer_store_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GFX10: buffer_store_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v45, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10: image_gather4_c_b_cl v[0:3], v[12:16], s[36:43], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D
@ -133,24 +133,24 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
; GFX10-NEXT: s_getpc_b64 s[4:5]
; GFX10-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
; GFX10-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12
; GFX10-NEXT: v_mov_b32_e32 v40, v16
; GFX10-NEXT: v_mov_b32_e32 v41, v16
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX10-NEXT: v_mov_b32_e32 v41, v15
; GFX10-NEXT: v_mov_b32_e32 v42, v14
; GFX10-NEXT: v_mov_b32_e32 v43, v13
; GFX10-NEXT: v_mov_b32_e32 v44, v12
; GFX10-NEXT: v_mov_b32_e32 v42, v15
; GFX10-NEXT: v_mov_b32_e32 v43, v14
; GFX10-NEXT: v_mov_b32_e32 v44, v13
; GFX10-NEXT: v_mov_b32_e32 v45, v12
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX10-NEXT: image_gather4_c_b_cl v[0:3], [v44, v43, v42, v41, v40], s[36:43], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: image_gather4_c_b_cl v[0:3], [v45, v44, v43, v42, v41], s[36:43], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D
; GFX10: buffer_load_dword v44, off, s[0:3], s33
; GFX10-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:4
; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8
; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:12
; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:16
; GFX10: buffer_load_dword v45, off, s[0:3], s32 offset:20
; GFX10: buffer_load_dword v45, off, s[0:3], s33{{$}}
; GFX10-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:4
; GFX10-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:8
; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:12
; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:16
; GFX10: buffer_load_dword v40, off, s[0:3], s32 offset:20
; GFX10: s_setpc_b64 s[4:5]
main_body:
%v = call <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.2d.v4f32.f32.f32(i32 1, float %bias, float %zcompare, float %s, float %t, float %clamp, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0)

View File

@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=amdgcn-amd-amdhsa -start-before=greedy -stop-after=virtregrewriter -verify-machineinstrs -o - %s | FileCheck %s
# RUN: llc -mtriple=amdgcn-amd-amdhsa -start-before=greedy,0 -stop-after=virtregrewriter,1 -verify-machineinstrs -o - %s | FileCheck %s
# The undef copy of %4 is allocated to $vgpr3, and the identity copy
# was deleted, and $vgpr3 was considered undef. The code to replace
@ -31,7 +31,7 @@ body: |
; CHECK-LABEL: name: undef_identity_copy
; CHECK: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = FLAT_LOAD_DWORDX4 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr :: (load (s128), addrspace 1)
; CHECK: renamable $sgpr6_sgpr7 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @foo + 4, target-flags(amdgpu-rel32-hi) @foo + 4, implicit-def dead $scc
; CHECK: ADJCALLSTACKUP 0, 0, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr95
; CHECK: ADJCALLSTACKUP 0, 0, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr95, implicit-def $scc
; CHECK: $sgpr4 = COPY $sgpr95
; CHECK: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr6_sgpr7, @foo, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4
; CHECK: ADJCALLSTACKDOWN 0, 4, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr95