Recommit r265309 after fixing an invalid memory reference bug that happened

when DenseMap grew and moved memory. I verified that it fixes the bootstrap problem on x86_64-linux-gnu, but I cannot verify whether it fixes the bootstrap error on clang-ppc64be-linux. I will watch the build-bot results closely. Replace analyzeSiblingValues with a new algorithm to fix its compile-time issue. The patch solves PR17409 and its duplicates. analyzeSiblingValues is an N x N complexity algorithm, where N is the number of siblings generated by register splitting. Although it causes a significant compile-time issue when N is large, it is also important for performance since it removes redundant spills and enables rematerialization. To solve the compile-time issue, the patch removes analyzeSiblingValues and replaces it with lower-cost alternatives consisting of two parts. The first part creates a new spill hoisting method in postOptimization of register allocation. It does spill hoisting all at once after all the spills are generated, instead of inside every instance of selectOrSplit. The second part queries the defining expression of the original register for rematerialization and keeps it always available during register allocation, even if it is already dead. It deletes those dead instructions only in postOptimization. With the two parts in the patch, analyzeSiblingValues can be removed without sacrificing performance. Differential Revision: http://reviews.llvm.org/D15302 llvm-svn: 265547
2024-11-24 19:52:54 +01:00 · 2016-04-06 15:41:07 +00:00 · 2016-04-06 15:41:07 +00:00 · ec02e9ab60
commit ec02e9ab60
parent aab80ed89c
16 changed files with 948 additions and 1053 deletions
--- a/include/llvm/CodeGen/LiveRangeEdit.h
+++ b/include/llvm/CodeGen/LiveRangeEdit.h
@ -72,6 +72,10 @@ private:
  /// ScannedRemattable - true when remattable values have been identified.
  bool ScannedRemattable;

+  /// DeadRemats - The saved instructions which have already been dead after
+  /// rematerialization but not deleted yet -- to be done in postOptimization.
+  SmallPtrSet<MachineInstr *, 32> *DeadRemats;
+
  /// Remattable - Values defined by remattable instructions as identified by
  /// tii.isTriviallyReMaterializable().
  SmallPtrSet<const VNInfo*,4> Remattable;
@ -116,13 +120,16 @@ public:
  /// @param vrm Map of virtual registers to physical registers for this
  ///            function.  If NULL, no virtual register map updates will
  ///            be done.  This could be the case if called before Regalloc.
+  /// @param deadRemats The collection of all the instructions defining an
+  ///                   original reg and are dead after remat.
  LiveRangeEdit(LiveInterval *parent, SmallVectorImpl<unsigned> &newRegs,
                MachineFunction &MF, LiveIntervals &lis, VirtRegMap *vrm,
-                Delegate *delegate = nullptr)
+                Delegate *delegate = nullptr,
+                SmallPtrSet<MachineInstr *, 32> *deadRemats = nullptr)
      : Parent(parent), NewRegs(newRegs), MRI(MF.getRegInfo()), LIS(lis),
-        VRM(vrm), TII(*MF.getSubtarget().getInstrInfo()),
-        TheDelegate(delegate), FirstNew(newRegs.size()),
-        ScannedRemattable(false) {
+        VRM(vrm), TII(*MF.getSubtarget().getInstrInfo()), TheDelegate(delegate),
+        FirstNew(newRegs.size()), ScannedRemattable(false),
+        DeadRemats(deadRemats) {
    MRI.setDelegate(this);
  }

@ -142,6 +149,16 @@ public:
  bool empty() const { return size() == 0; }
  unsigned get(unsigned idx) const { return NewRegs[idx+FirstNew]; }

+  /// pop_back - Allow LiveRangeEdit users to drop the most recently added
+  /// new register.
+  /// The context: when the original def instruction of a register is dead
+  /// after rematerialization, we still want to keep it for subsequent
+  /// rematerializations. We save the def instruction in DeadRemats and
+  /// replace the original dst register with a new dummy register, so that
+  /// the live range of the original dst register can be shrunk normally.
+  /// We don't want to allocate a physical register for the dummy register,
+  /// so we drop it from the NewRegs set.
+  /// Note: this forwards directly to NewRegs.pop_back() with no checks.
+  void pop_back() { NewRegs.pop_back(); }
+
  ArrayRef<unsigned> regs() const {
    return makeArrayRef(NewRegs).slice(FirstNew);
  }
@ -175,15 +192,15 @@ public:
  /// Remat - Information needed to rematerialize at a specific location.
  struct Remat {
    VNInfo *ParentVNI;      // parent_'s value at the remat location.
-    MachineInstr *OrigMI;   // Instruction defining ParentVNI.
+    MachineInstr *OrigMI;   // Instruction defining OrigVNI. It contains the
+                            // real expr for remat.
    explicit Remat(VNInfo *ParentVNI) : ParentVNI(ParentVNI), OrigMI(nullptr) {}
  };

  /// canRematerializeAt - Determine if ParentVNI can be rematerialized at
  /// UseIdx. It is assumed that parent_.getVNINfoAt(UseIdx) == ParentVNI.
  /// When cheapAsAMove is set, only cheap remats are allowed.
-  bool canRematerializeAt(Remat &RM,
-                          SlotIndex UseIdx,
+  bool canRematerializeAt(Remat &RM, VNInfo *OrigVNI, SlotIndex UseIdx,
                          bool cheapAsAMove);

  /// rematerializeAt - Rematerialize RM.ParentVNI into DestReg by inserting an
@ -208,6 +225,12 @@ public:
    return Rematted.count(ParentVNI);
  }

+  /// markDeadRemat - Record \p inst in the DeadRemats set supplied at
+  /// construction time. Does nothing when no set was supplied.
+  void markDeadRemat(MachineInstr *inst) {
+    // DeadRemats is an optional field; it is null unless the creator of this
+    // LiveRangeEdit passed a set to collect into.
+    if (DeadRemats)
+      DeadRemats->insert(inst);
+  }
+
  /// eraseVirtReg - Notify the delegate that Reg is no longer in use, and try
  /// to erase it from LIS.
  void eraseVirtReg(unsigned Reg);
@ -218,8 +241,11 @@ public:
  /// RegsBeingSpilled lists registers currently being spilled by the register
  /// allocator.  These registers should not be split into new intervals
  /// as currently those new intervals are not guaranteed to spill.
-  void eliminateDeadDefs(SmallVectorImpl<MachineInstr*> &Dead,
-                         ArrayRef<unsigned> RegsBeingSpilled = None);
+  /// NoSplit indicates this func is used after the iterations of selectOrSplit
+  /// where registers should not be split into new intervals.
+  void eliminateDeadDefs(SmallVectorImpl<MachineInstr *> &Dead,
+                         ArrayRef<unsigned> RegsBeingSpilled = None,
+                         bool NoSplit = false);

  /// calculateRegClassAndHint - Recompute register class and hint for each new
  /// register.
--- a/lib/CodeGen/InlineSpiller.cpp
+++ b/lib/CodeGen/InlineSpiller.cpp
--- a/lib/CodeGen/LiveRangeEdit.cpp
+++ b/lib/CodeGen/LiveRangeEdit.cpp
@ -63,10 +63,13 @@ void LiveRangeEdit::scanRemattable(AliasAnalysis *aa) {
  for (VNInfo *VNI : getParent().valnos) {
    if (VNI->isUnused())
      continue;
-    MachineInstr *DefMI = LIS.getInstructionFromIndex(VNI->def);
+    unsigned Original = VRM->getOriginal(getReg());
+    LiveInterval &OrigLI = LIS.getInterval(Original);
+    VNInfo *OrigVNI = OrigLI.getVNInfoAt(VNI->def);
+    MachineInstr *DefMI = LIS.getInstructionFromIndex(OrigVNI->def);
    if (!DefMI)
      continue;
-    checkRematerializable(VNI, DefMI, aa);
+    checkRematerializable(OrigVNI, DefMI, aa);
  }
  ScannedRemattable = true;
 }
@ -113,24 +116,18 @@ bool LiveRangeEdit::allUsesAvailableAt(const MachineInstr *OrigMI,
  return true;
 }

-bool LiveRangeEdit::canRematerializeAt(Remat &RM,
-                                       SlotIndex UseIdx,
-                                       bool cheapAsAMove) {
+bool LiveRangeEdit::canRematerializeAt(Remat &RM, VNInfo *OrigVNI,
+                                       SlotIndex UseIdx, bool cheapAsAMove) {
  assert(ScannedRemattable && "Call anyRematerializable first");

  // Use scanRemattable info.
-  if (!Remattable.count(RM.ParentVNI))
+  if (!Remattable.count(OrigVNI))
    return false;

  // No defining instruction provided.
  SlotIndex DefIdx;
-  if (RM.OrigMI)
-    DefIdx = LIS.getInstructionIndex(*RM.OrigMI);
-  else {
-    DefIdx = RM.ParentVNI->def;
-    RM.OrigMI = LIS.getInstructionFromIndex(DefIdx);
-    assert(RM.OrigMI && "No defining instruction for remattable value");
-  }
+  assert(RM.OrigMI && "No defining instruction for remattable value");
+  DefIdx = LIS.getInstructionIndex(*RM.OrigMI);

  // If only cheap remats were requested, bail out early.
  if (cheapAsAMove && !TII.isAsCheapAsAMove(RM.OrigMI))
@ -261,6 +258,15 @@ void LiveRangeEdit::eliminateDeadDef(MachineInstr *MI, ToShrinkSet &ToShrink) {
  // Collect virtual registers to be erased after MI is gone.
  SmallVector<unsigned, 8> RegsToErase;
  bool ReadsPhysRegs = false;
+  bool isOrigDef = false;
+  unsigned Dest;
+  if (VRM && MI->getOperand(0).isReg()) {
+    Dest = MI->getOperand(0).getReg();
+    unsigned Original = VRM->getOriginal(Dest);
+    LiveInterval &OrigLI = LIS.getInterval(Original);
+    VNInfo *OrigVNI = OrigLI.getVNInfoAt(Idx);
+    isOrigDef = SlotIndex::isSameInstr(OrigVNI->def, Idx);
+  }

  // Check for live intervals that may shrink
  for (MachineInstr::mop_iterator MOI = MI->operands_begin(),
@ -314,11 +320,24 @@ void LiveRangeEdit::eliminateDeadDef(MachineInstr *MI, ToShrinkSet &ToShrink) {
    }
    DEBUG(dbgs() << "Converted physregs to:\t" << *MI);
  } else {
-    if (TheDelegate)
-      TheDelegate->LRE_WillEraseInstruction(MI);
-    LIS.RemoveMachineInstrFromMaps(*MI);
-    MI->eraseFromParent();
-    ++NumDCEDeleted;
+    // If the dest of MI is an original reg, don't delete the inst. Replace
+    // the dest with a new reg, keep the inst for remat of other siblings.
+    // The inst is saved in LiveRangeEdit::DeadRemats and will be deleted
+    // after all the allocations of the func are done.
+    if (isOrigDef) {
+      unsigned NewDest = createFrom(Dest);
+      pop_back();
+      markDeadRemat(MI);
+      const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
+      MI->substituteRegister(Dest, NewDest, 0, TRI);
+      MI->getOperand(0).setIsDead(false);
+    } else {
+      if (TheDelegate)
+        TheDelegate->LRE_WillEraseInstruction(MI);
+      LIS.RemoveMachineInstrFromMaps(*MI);
+      MI->eraseFromParent();
+      ++NumDCEDeleted;
+    }
  }

  // Erase any virtregs that are now empty and unused. There may be <undef>
@ -332,8 +351,9 @@ void LiveRangeEdit::eliminateDeadDef(MachineInstr *MI, ToShrinkSet &ToShrink) {
  }
 }

-void LiveRangeEdit::eliminateDeadDefs(SmallVectorImpl<MachineInstr*> &Dead,
-                                      ArrayRef<unsigned> RegsBeingSpilled) {
+void LiveRangeEdit::eliminateDeadDefs(SmallVectorImpl<MachineInstr *> &Dead,
+                                      ArrayRef<unsigned> RegsBeingSpilled,
+                                      bool NoSplit) {
  ToShrinkSet ToShrink;

  for (;;) {
@ -355,6 +375,9 @@ void LiveRangeEdit::eliminateDeadDefs(SmallVectorImpl<MachineInstr*> &Dead,
    if (!LIS.shrinkToUses(LI, &Dead))
      continue;

+    if (NoSplit)
+      continue;
+
    // Don't create new intervals for a register being spilled.
    // The new intervals would have to be spilled anyway so its not worth it.
    // Also they currently aren't spilled so creating them and not spilling
--- a/lib/CodeGen/RegAllocBase.cpp
+++ b/lib/CodeGen/RegAllocBase.cpp
@ -153,3 +153,12 @@ void RegAllocBase::allocatePhysRegs() {
    }
  }
 }
+
+/// Run the post-allocation clean-up: first let the spiller run its own
+/// postOptimization(), then erase every instruction recorded in DeadRemats,
+/// removing each one from the LiveIntervals maps before it is erased.
+void RegAllocBase::postOptimization() {
+  // The spiller goes first; the recorded dead defs are only erased afterwards.
+  spiller().postOptimization();
+  for (auto DeadInst : DeadRemats) {
+    // Unregister the instruction from LIS before erasing it from its parent.
+    LIS->RemoveMachineInstrFromMaps(*DeadInst);
+    DeadInst->eraseFromParent();
+  }
+  // Every recorded instruction has been erased, so drop the stale pointers.
+  DeadRemats.clear();
+}
--- a/lib/CodeGen/RegAllocBase.h
+++ b/lib/CodeGen/RegAllocBase.h
@ -65,6 +65,12 @@ protected:
  LiveRegMatrix *Matrix;
  RegisterClassInfo RegClassInfo;

+  /// Inst which is a def of an original reg and whose defs are already all
+  /// dead after remat is saved in DeadRemats. The deletion of such inst is
+  /// postponed till all the allocations are done, so its remat expr is
+  /// always available for the remat of all the siblings of the original reg.
+  SmallPtrSet<MachineInstr *, 32> DeadRemats;
+
  RegAllocBase()
    : TRI(nullptr), MRI(nullptr), VRM(nullptr), LIS(nullptr), Matrix(nullptr) {}

@ -77,6 +83,10 @@ protected:
  // physical register assignments.
  void allocatePhysRegs();

+  // Include spiller post optimization and removing dead defs left because of
+  // rematerialization.
+  virtual void postOptimization();
+
  // Get a temporary reference to a Spiller instance.
  virtual Spiller &spiller() = 0;

--- a/lib/CodeGen/RegAllocBasic.cpp
+++ b/lib/CodeGen/RegAllocBasic.cpp
@ -199,7 +199,7 @@ bool RABasic::spillInterferences(LiveInterval &VirtReg, unsigned PhysReg,
    Matrix->unassign(Spill);

    // Spill the extracted interval.
-    LiveRangeEdit LRE(&Spill, SplitVRegs, *MF, *LIS, VRM);
+    LiveRangeEdit LRE(&Spill, SplitVRegs, *MF, *LIS, VRM, nullptr, &DeadRemats);
    spiller().spill(LRE);
  }
  return true;
@ -258,7 +258,7 @@ unsigned RABasic::selectOrSplit(LiveInterval &VirtReg,
  DEBUG(dbgs() << "spilling: " << VirtReg << '\n');
  if (!VirtReg.isSpillable())
    return ~0u;
-  LiveRangeEdit LRE(&VirtReg, SplitVRegs, *MF, *LIS, VRM);
+  LiveRangeEdit LRE(&VirtReg, SplitVRegs, *MF, *LIS, VRM, nullptr, &DeadRemats);
  spiller().spill(LRE);

  // The live virtual register requesting allocation was spilled, so tell
@ -283,6 +283,7 @@ bool RABasic::runOnMachineFunction(MachineFunction &mf) {
  SpillerInstance.reset(createInlineSpiller(*this, *MF, *VRM));

  allocatePhysRegs();
+  postOptimization();

  // Diagnostic output before rewriting
  DEBUG(dbgs() << "Post alloc VirtRegMap:\n" << *VRM << "\n");
--- a/lib/CodeGen/RegAllocGreedy.cpp
+++ b/lib/CodeGen/RegAllocGreedy.cpp
@ -12,7 +12,6 @@
 //
 //===----------------------------------------------------------------------===//

-#include "llvm/CodeGen/Passes.h"
 #include "AllocationOrder.h"
 #include "InterferenceCache.h"
 #include "LiveDebugVariables.h"
@ -33,6 +32,7 @@
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
 #include "llvm/CodeGen/RegAllocRegistry.h"
 #include "llvm/CodeGen/RegisterClassInfo.h"
 #include "llvm/CodeGen/VirtRegMap.h"
@ -44,6 +44,7 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/Timer.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
 #include <queue>

@ -55,14 +56,14 @@ STATISTIC(NumGlobalSplits, "Number of split global live ranges");
 STATISTIC(NumLocalSplits,  "Number of split local live ranges");
 STATISTIC(NumEvicted,      "Number of interferences evicted");

-static cl::opt<SplitEditor::ComplementSpillMode>
-SplitSpillMode("split-spill-mode", cl::Hidden,
-  cl::desc("Spill mode for splitting live ranges"),
-  cl::values(clEnumValN(SplitEditor::SM_Partition, "default", "Default"),
-             clEnumValN(SplitEditor::SM_Size,  "size",  "Optimize for size"),
-             clEnumValN(SplitEditor::SM_Speed, "speed", "Optimize for speed"),
-             clEnumValEnd),
-  cl::init(SplitEditor::SM_Partition));
+static cl::opt<SplitEditor::ComplementSpillMode> SplitSpillMode(
+    "split-spill-mode", cl::Hidden,
+    cl::desc("Spill mode for splitting live ranges"),
+    cl::values(clEnumValN(SplitEditor::SM_Partition, "default", "Default"),
+               clEnumValN(SplitEditor::SM_Size, "size", "Optimize for size"),
+               clEnumValN(SplitEditor::SM_Speed, "speed", "Optimize for speed"),
+               clEnumValEnd),
+    cl::init(SplitEditor::SM_Speed));

 static cl::opt<unsigned>
 LastChanceRecoloringMaxDepth("lcr-max-depth", cl::Hidden,
@ -1465,7 +1466,7 @@ unsigned RAGreedy::doRegionSplit(LiveInterval &VirtReg, unsigned BestCand,
                                 SmallVectorImpl<unsigned> &NewVRegs) {
  SmallVector<unsigned, 8> UsedCands;
  // Prepare split editor.
-  LiveRangeEdit LREdit(&VirtReg, NewVRegs, *MF, *LIS, VRM, this);
+  LiveRangeEdit LREdit(&VirtReg, NewVRegs, *MF, *LIS, VRM, this, &DeadRemats);
  SE->reset(LREdit, SplitSpillMode);

  // Assign all edge bundles to the preferred candidate, or NoCand.
@ -1513,7 +1514,7 @@ unsigned RAGreedy::tryBlockSplit(LiveInterval &VirtReg, AllocationOrder &Order,
  assert(&SA->getParent() == &VirtReg && "Live range wasn't analyzed");
  unsigned Reg = VirtReg.reg;
  bool SingleInstrs = RegClassInfo.isProperSubClass(MRI->getRegClass(Reg));
-  LiveRangeEdit LREdit(&VirtReg, NewVRegs, *MF, *LIS, VRM, this);
+  LiveRangeEdit LREdit(&VirtReg, NewVRegs, *MF, *LIS, VRM, this, &DeadRemats);
  SE->reset(LREdit, SplitSpillMode);
  ArrayRef<SplitAnalysis::BlockInfo> UseBlocks = SA->getUseBlocks();
  for (unsigned i = 0; i != UseBlocks.size(); ++i) {
@ -1585,7 +1586,7 @@ RAGreedy::tryInstructionSplit(LiveInterval &VirtReg, AllocationOrder &Order,

  // Always enable split spill mode, since we're effectively spilling to a
  // register.
-  LiveRangeEdit LREdit(&VirtReg, NewVRegs, *MF, *LIS, VRM, this);
+  LiveRangeEdit LREdit(&VirtReg, NewVRegs, *MF, *LIS, VRM, this, &DeadRemats);
  SE->reset(LREdit, SplitEditor::SM_Size);

  ArrayRef<SlotIndex> Uses = SA->getUseSlots();
@ -1908,7 +1909,7 @@ unsigned RAGreedy::tryLocalSplit(LiveInterval &VirtReg, AllocationOrder &Order,
               << '-' << Uses[BestAfter] << ", " << BestDiff
               << ", " << (BestAfter - BestBefore + 1) << " instrs\n");

-  LiveRangeEdit LREdit(&VirtReg, NewVRegs, *MF, *LIS, VRM, this);
+  LiveRangeEdit LREdit(&VirtReg, NewVRegs, *MF, *LIS, VRM, this, &DeadRemats);
  SE->reset(LREdit);

  SE->openIntv();
@ -2551,7 +2552,7 @@ unsigned RAGreedy::selectOrSplitImpl(LiveInterval &VirtReg,
    NewVRegs.push_back(VirtReg.reg);
  } else {
    NamedRegionTimer T("Spiller", TimerGroupName, TimePassesIsEnabled);
-    LiveRangeEdit LRE(&VirtReg, NewVRegs, *MF, *LIS, VRM, this);
+    LiveRangeEdit LRE(&VirtReg, NewVRegs, *MF, *LIS, VRM, this, &DeadRemats);
    spiller().spill(LRE);
    setStage(NewVRegs.begin(), NewVRegs.end(), RS_Done);

@ -2609,6 +2610,8 @@ bool RAGreedy::runOnMachineFunction(MachineFunction &mf) {

  allocatePhysRegs();
  tryHintsRecoloring();
+  postOptimization();
+
  releaseMemory();
  return true;
 }
--- a/lib/CodeGen/RegAllocPBQP.cpp
+++ b/lib/CodeGen/RegAllocPBQP.cpp
@ -123,6 +123,12 @@ private:

  RegSet VRegsToAlloc, EmptyIntervalVRegs;

+  /// Inst which is a def of an original reg and whose defs are already all
+  /// dead after remat is saved in DeadRemats. The deletion of such inst is
+  /// postponed till all the allocations are done, so its remat expr is
+  /// always available for the remat of all the siblings of the original reg.
+  SmallPtrSet<MachineInstr *, 32> DeadRemats;
+
  /// \brief Finds the initial set of vreg intervals to allocate.
  void findVRegIntervalsToAlloc(const MachineFunction &MF, LiveIntervals &LIS);

@ -146,6 +152,7 @@ private:
  void finalizeAlloc(MachineFunction &MF, LiveIntervals &LIS,
                     VirtRegMap &VRM) const;

+  void postOptimization(Spiller &VRegSpiller, LiveIntervals &LIS);
 };

 char RegAllocPBQP::ID = 0;
@ -631,7 +638,8 @@ void RegAllocPBQP::spillVReg(unsigned VReg,
                             VirtRegMap &VRM, Spiller &VRegSpiller) {

  VRegsToAlloc.erase(VReg);
-  LiveRangeEdit LRE(&LIS.getInterval(VReg), NewIntervals, MF, LIS, &VRM);
+  LiveRangeEdit LRE(&LIS.getInterval(VReg), NewIntervals, MF, LIS, &VRM,
+                    nullptr, &DeadRemats);
  VRegSpiller.spill(LRE);

  const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
@ -713,6 +721,16 @@ void RegAllocPBQP::finalizeAlloc(MachineFunction &MF,
  }
 }

+/// Run the post-allocation clean-up for the PBQP allocator.
+///
+/// \param VRegSpiller Spiller whose postOptimization() hook is invoked first.
+/// \param LIS         LiveIntervals from whose maps the erased instructions
+///                    are removed.
+void RegAllocPBQP::postOptimization(Spiller &VRegSpiller, LiveIntervals &LIS) {
+  VRegSpiller.postOptimization();
+  // Remove the dead defs that were recorded in DeadRemats because of
+  // rematerialization. Each one is removed from the LIS maps before being
+  // erased from its parent.
+  for (auto DeadInst : DeadRemats) {
+    LIS.RemoveMachineInstrFromMaps(*DeadInst);
+    DeadInst->eraseFromParent();
+  }
+  // Every recorded instruction has been erased, so drop the stale pointers.
+  DeadRemats.clear();
+}
+
 static inline float normalizePBQPSpillWeight(float UseDefFreq, unsigned Size,
                                         unsigned NumInstr) {
  // All intervals have a spill weight that is mostly proportional to the number
@ -798,6 +816,7 @@ bool RegAllocPBQP::runOnMachineFunction(MachineFunction &MF) {

  // Finalise allocation, allocate empty ranges.
  finalizeAlloc(MF, LIS, VRM);
+  postOptimization(*VRegSpiller, LIS);
  VRegsToAlloc.clear();
  EmptyIntervalVRegs.clear();

--- a/lib/CodeGen/Spiller.h
+++ b/lib/CodeGen/Spiller.h
@ -16,6 +16,7 @@ namespace llvm {
  class MachineFunction;
  class MachineFunctionPass;
  class VirtRegMap;
+  class LiveIntervals;

  /// Spiller interface.
  ///
@ -28,7 +29,7 @@ namespace llvm {

    /// spill - Spill the LRE.getParent() live interval.
    virtual void spill(LiveRangeEdit &LRE) = 0;
-
+    /// postOptimization - Optional hook. The default implementation does
+    /// nothing; being virtual, it can be overridden by a concrete spiller.
+    virtual void postOptimization(){};
  };

  /// Create and return a spiller that will insert spill code directly instead
@ -36,7 +37,6 @@ namespace llvm {
  Spiller *createInlineSpiller(MachineFunctionPass &pass,
                               MachineFunction &mf,
                               VirtRegMap &vrm);
-
 }

 #endif
--- a/lib/CodeGen/SplitKit.cpp
+++ b/lib/CodeGen/SplitKit.cpp
@ -16,6 +16,7 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/CodeGen/LiveIntervalAnalysis.h"
 #include "llvm/CodeGen/LiveRangeEdit.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
 #include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineLoopInfo.h"
@ -430,8 +431,13 @@ VNInfo *SplitEditor::defFromParent(unsigned RegIdx,
  bool Late = RegIdx != 0;

  // Attempt cheap-as-a-copy rematerialization.
+  unsigned Original = VRM.getOriginal(Edit->get(RegIdx));
+  LiveInterval &OrigLI = LIS.getInterval(Original);
+  VNInfo *OrigVNI = OrigLI.getVNInfoAt(UseIdx);
  LiveRangeEdit::Remat RM(ParentVNI);
-  if (Edit->canRematerializeAt(RM, UseIdx, true)) {
+  RM.OrigMI = LIS.getInstructionFromIndex(OrigVNI->def);
+
+  if (Edit->canRematerializeAt(RM, OrigVNI, UseIdx, true)) {
    Def = Edit->rematerializeAt(MBB, I, LI->reg, RM, TRI, Late);
    ++NumRemats;
  } else {
@ -716,7 +722,62 @@ SplitEditor::findShallowDominator(MachineBasicBlock *MBB,
  }
 }

-void SplitEditor::hoistCopiesForSize() {
+/// Collect the back-copies that are made redundant by another back-copy of
+/// the same parent value.
+///
+/// Only parent values whose id is in \p NotToHoistSet are examined. For each
+/// of them, the values of the complement interval (Edit->get(0)) that map to
+/// that parent value are compared pairwise. Whenever one def dominates the
+/// other, or precedes it inside the same block, the dominated value is
+/// appended to \p BackCopies. forceRecompute(0, ParentVNI) is called for every
+/// parent value that has at least one such redundant back-copy.
+///
+/// \param NotToHoistSet Ids of the parent values to examine.
+/// \param BackCopies    Output list; redundant values are appended to it.
+void SplitEditor::computeRedundantBackCopies(
+    DenseSet<unsigned> &NotToHoistSet, SmallVectorImpl<VNInfo *> &BackCopies) {
+  LiveInterval *LI = &LIS.getInterval(Edit->get(0));
+  LiveInterval *Parent = &Edit->getParent();
+  // EqualVNs is indexed by ParentVNI->id and holds the values of LI that map
+  // to that parent value.
+  SmallVector<SmallPtrSet<VNInfo *, 8>, 8> EqualVNs(Parent->getNumValNums());
+  // Scratch set, reused (and cleared) for every parent value.
+  SmallPtrSet<VNInfo *, 8> DominatedVNIs;
+
+  // Aggregate VNIs having the same value as ParentVNI.
+  for (VNInfo *VNI : LI->valnos) {
+    if (VNI->isUnused())
+      continue;
+    // NOTE(review): the lookup result is dereferenced unchecked; this assumes
+    // the parent interval is live at VNI->def.
+    VNInfo *ParentVNI = Edit->getParent().getVNInfoAt(VNI->def);
+    EqualVNs[ParentVNI->id].insert(VNI);
+  }
+
+  // For VNI aggregation of each ParentVNI, collect dominated, i.e.,
+  // redundant VNIs to BackCopies.
+  for (unsigned i = 0, e = Parent->getNumValNums(); i != e; ++i) {
+    VNInfo *ParentVNI = Parent->getValNumInfo(i);
+    // Parent values outside NotToHoistSet are not handled here.
+    if (!NotToHoistSet.count(ParentVNI->id))
+      continue;
+    SmallPtrSetIterator<VNInfo *> It1 = EqualVNs[ParentVNI->id].begin();
+    SmallPtrSetIterator<VNInfo *> It2 = It1;
+    // Visit every unordered pair (It1, It2) of values for this parent value.
+    for (; It1 != EqualVNs[ParentVNI->id].end(); ++It1) {
+      It2 = It1;
+      for (++It2; It2 != EqualVNs[ParentVNI->id].end(); ++It2) {
+        // Skip the pair if either member is already marked as dominated.
+        if (DominatedVNIs.count(*It1) || DominatedVNIs.count(*It2))
+          continue;
+
+        MachineBasicBlock *MBB1 = LIS.getMBBFromIndex((*It1)->def);
+        MachineBasicBlock *MBB2 = LIS.getMBBFromIndex((*It2)->def);
+        if (MBB1 == MBB2) {
+          // Same block: the def with the larger slot index is the redundant
+          // one.
+          DominatedVNIs.insert((*It1)->def < (*It2)->def ? (*It2) : (*It1));
+        } else if (MDT.dominates(MBB1, MBB2)) {
+          DominatedVNIs.insert(*It2);
+        } else if (MDT.dominates(MBB2, MBB1)) {
+          DominatedVNIs.insert(*It1);
+        }
+        // If neither block dominates the other, both values are kept.
+      }
+    }
+    if (!DominatedVNIs.empty()) {
+      forceRecompute(0, ParentVNI);
+      for (auto VNI : DominatedVNIs) {
+        BackCopies.push_back(VNI);
+      }
+      // Reset the scratch set before moving on to the next parent value.
+      DominatedVNIs.clear();
+    }
+  }
+}
+
+/// For SM_Size mode, find a common dominator for all the back-copies for
+/// the same ParentVNI and hoist the backcopies to the dominator BB.
+/// For SM_Speed mode, if the common dominator is hot and it is not beneficial
+/// to do the hoisting, simply remove the dominated backcopies for the same
+/// ParentVNI.
+void SplitEditor::hoistCopies() {
  // Get the complement interval, always RegIdx 0.
  LiveInterval *LI = &LIS.getInterval(Edit->get(0));
  LiveInterval *Parent = &Edit->getParent();
@ -725,6 +786,11 @@ void SplitEditor::hoistCopiesForSize() {
  // indexed by ParentVNI->id.
  typedef std::pair<MachineBasicBlock*, SlotIndex> DomPair;
  SmallVector<DomPair, 8> NearestDom(Parent->getNumValNums());
+  // The total cost of all the back-copies for each ParentVNI.
+  SmallVector<BlockFrequency, 8> Costs(Parent->getNumValNums());
+  // The ParentVNI->id set for which hoisting back-copies are not beneficial
+  // for Speed.
+  DenseSet<unsigned> NotToHoistSet;

  // Find the nearest common dominator for parent values with multiple
  // back-copies.  If a single back-copy dominates, put it in DomPair.second.
@ -740,6 +806,7 @@ void SplitEditor::hoistCopiesForSize() {
      continue;

    MachineBasicBlock *ValMBB = LIS.getMBBFromIndex(VNI->def);
+
    DomPair &Dom = NearestDom[ParentVNI->id];

    // Keep directly defined parent values.  This is either a PHI or an
@ -774,6 +841,7 @@ void SplitEditor::hoistCopiesForSize() {
      else if (Near != Dom.first)
        // None dominate. Hoist to common dominator, need new def.
        Dom = DomPair(Near, SlotIndex());
+      Costs[ParentVNI->id] += MBFI.getBlockFreq(ValMBB);
    }

    DEBUG(dbgs() << "Multi-mapped complement " << VNI->id << '@' << VNI->def
@ -792,6 +860,11 @@ void SplitEditor::hoistCopiesForSize() {
    MachineBasicBlock *DefMBB = LIS.getMBBFromIndex(ParentVNI->def);
    // Get a less loopy dominator than Dom.first.
    Dom.first = findShallowDominator(Dom.first, DefMBB);
+    if (SpillMode == SM_Speed &&
+        MBFI.getBlockFreq(Dom.first) > Costs[ParentVNI->id]) {
+      NotToHoistSet.insert(ParentVNI->id);
+      continue;
+    }
    SlotIndex Last = LIS.getMBBEndIdx(Dom.first).getPrevSlot();
    Dom.second =
      defFromParent(0, ParentVNI, Last, *Dom.first,
@ -806,11 +879,18 @@ void SplitEditor::hoistCopiesForSize() {
      continue;
    VNInfo *ParentVNI = Edit->getParent().getVNInfoAt(VNI->def);
    const DomPair &Dom = NearestDom[ParentVNI->id];
-    if (!Dom.first || Dom.second == VNI->def)
+    if (!Dom.first || Dom.second == VNI->def ||
+        NotToHoistSet.count(ParentVNI->id))
      continue;
    BackCopies.push_back(VNI);
    forceRecompute(0, ParentVNI);
  }
+
+  // If it is not beneficial to hoist all the BackCopies, simply remove
+  // redundant BackCopies in speed mode.
+  if (SpillMode == SM_Speed && !NotToHoistSet.empty())
+    computeRedundantBackCopies(NotToHoistSet, BackCopies);
+
  removeBackCopies(BackCopies);
 }

@ -1004,6 +1084,8 @@ void SplitEditor::deleteRematVictims() {
      // Dead defs end at the dead slot.
      if (S.end != S.valno->def.getDeadSlot())
        continue;
+      if (S.valno->isPHIDef())
+        continue;
      MachineInstr *MI = LIS.getInstructionFromIndex(S.valno->def);
      assert(MI && "Missing instruction for dead def");
      MI->addRegisterDead(LI->reg, &TRI);
@ -1048,10 +1130,9 @@ void SplitEditor::finish(SmallVectorImpl<unsigned> *LRMap) {
    // Leave all back-copies as is.
    break;
  case SM_Size:
-    hoistCopiesForSize();
-    break;
  case SM_Speed:
-    llvm_unreachable("Spill mode 'speed' not implemented yet");
+    // hoistCopies will behave differently between size and speed.
+    hoistCopies();
  }

  // Transfer the simply mapped values, check if any are skipped.
--- a/lib/CodeGen/SplitKit.h
+++ b/lib/CodeGen/SplitKit.h
@ -18,6 +18,7 @@
 #include "LiveRangeCalc.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/IntervalMap.h"
 #include "llvm/ADT/SmallPtrSet.h"

@ -329,9 +330,14 @@ private:
  MachineBasicBlock *findShallowDominator(MachineBasicBlock *MBB,
                                          MachineBasicBlock *DefMBB);

-  /// hoistCopiesForSize - Hoist back-copies to the complement interval in a
-  /// way that minimizes code size. This implements the SM_Size spill mode.
-  void hoistCopiesForSize();
+  /// Find out all the backCopies dominated by others.
+  void computeRedundantBackCopies(DenseSet<unsigned> &NotToHoistSet,
+                                  SmallVectorImpl<VNInfo *> &BackCopies);
+
+  /// Hoist back-copies to the complement interval. It tries to hoist all
+  /// the back-copies to one BB if it is beneficial, or else simply removes
+  /// redundant back-copies dominated by others.
+  void hoistCopies();

  /// transferValues - Transfer values to the new ranges.
  /// Return true if any ranges were skipped.
--- a/test/CodeGen/AArch64/aarch64-deferred-spilling.ll
+++ b/test/CodeGen/AArch64/aarch64-deferred-spilling.ll
@ -1,514 +0,0 @@
-;RUN: llc < %s -mtriple=aarch64--linux-android -regalloc=greedy -enable-deferred-spilling=true -mcpu=cortex-a57 -disable-fp-elim | FileCheck %s --check-prefix=CHECK --check-prefix=DEFERRED
-;RUN: llc < %s -mtriple=aarch64--linux-android -regalloc=greedy -enable-deferred-spilling=false -mcpu=cortex-a57 -disable-fp-elim | FileCheck %s --check-prefix=CHECK --check-prefix=REGULAR
-
-; Check that we do not end up with useless spill code.
-;
-; Move to the basic block we are interested in.
-;
-; CHECK: // %if.then.120
-;
-; REGULAR: str w21, [sp, #[[OFFSET:[0-9]+]]] // 4-byte Folded Spill
-; Check that w21 wouldn't need to be spilled since it is never reused.
-; REGULAR-NOT: {{[wx]}}21{{,?}}
-;
-; Check that w22 is used to carry a value through the call.
-; DEFERRED-NOT: str {{[wx]}}22,
-; DEFERRED: mov {{[wx]}}22,
-; DEFERRED-NOT: str {{[wx]}}22,
-;
-; CHECK:        bl      fprintf
-;
-; DEFERRED-NOT: ldr {{[wx]}}22,
-; DEFERRED: mov {{[wx][0-9]+}}, {{[wx]}}22
-; DEFERRED-NOT: ldr {{[wx]}}22,
-;
-; REGULAR-NOT: {{[wx]}}21{{,?}}
-; REGULAR: ldr w21, [sp, #[[OFFSET]]] // 4-byte Folded Reload
-;
-; End of the basic block we are interested in.
-; CHECK:        b
-; CHECK: {{[^:]+}}: // %sw.bb.123
-
-%struct.__sFILE = type { i8*, i32, i32, i32, i32, %struct.__sbuf, i32, i8*, i32 (i8*)*, i32 (i8*, i8*, i32)*, i64 (i8*, i64, i32)*, i32 (i8*, i8*, i32)*, %struct.__sbuf, i8*, i32, [3 x i8], [1 x i8], %struct.__sbuf, i32, i64 }
-%struct.__sbuf = type { i8*, i64 }
-%struct.DState = type { %struct.bz_stream*, i32, i8, i32, i8, i32, i32, i32, i32, i32, i8, i32, i32, i32, i32, i32, [256 x i32], i32, [257 x i32], [257 x i32], i32*, i16*, i8*, i32, i32, i32, i32, i32, [256 x i8], [16 x i8], [256 x i8], [4096 x i8], [16 x i32], [18002 x i8], [18002 x i8], [6 x [258 x i8]], [6 x [258 x i32]], [6 x [258 x i32]], [6 x [258 x i32]], [6 x i32], i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32*, i32*, i32* }
-%struct.bz_stream = type { i8*, i32, i32, i32, i8*, i32, i32, i32, i8*, i8* (i8*, i32, i32)*, void (i8*, i8*)*, i8* }
-
-@__sF = external global [0 x %struct.__sFILE], align 8
-@.str = private unnamed_addr constant [20 x i8] c"\0A    [%d: stuff+mf \00", align 1
-
-declare i32 @fprintf(%struct.__sFILE* nocapture, i8* nocapture readonly, ...)
-
-declare void @bar(i32)
-
-declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1)
-
-define i32 @foo(%struct.DState* %s) {
-entry:
-  %state = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 1
-  %tmp = load i32, i32* %state, align 4
-  %cmp = icmp eq i32 %tmp, 10
-  %save_i = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 40
-  br i1 %cmp, label %if.end.thread, label %if.end
-
-if.end.thread:                                    ; preds = %entry
-  %save_j = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 41
-  %save_t = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 42
-  %save_alphaSize = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 43
-  %save_nGroups = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 44
-  %save_nSelectors = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 45
-  %save_EOB = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 46
-  %save_groupNo = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 47
-  %save_groupPos = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 48
-  %save_nextSym = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 49
-  %save_nblockMAX = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 50
-  %save_nblock = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 51
-  %save_es = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 52
-  %save_N = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 53
-  %save_curr = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 54
-  %save_zt = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 55
-  %save_zn = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 56
-  %save_zvec = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 57
-  %save_zj = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 58
-  %tmp1 = bitcast i32* %save_i to i8*
-  call void @llvm.memset.p0i8.i64(i8* %tmp1, i8 0, i64 108, i32 4, i1 false)
-  br label %sw.default
-
-if.end:                                           ; preds = %entry
-  %.pre = load i32, i32* %save_i, align 4
-  %save_j3.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 41
-  %.pre406 = load i32, i32* %save_j3.phi.trans.insert, align 4
-  %save_t4.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 42
-  %.pre407 = load i32, i32* %save_t4.phi.trans.insert, align 4
-  %save_alphaSize5.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 43
-  %.pre408 = load i32, i32* %save_alphaSize5.phi.trans.insert, align 4
-  %save_nGroups6.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 44
-  %.pre409 = load i32, i32* %save_nGroups6.phi.trans.insert, align 4
-  %save_nSelectors7.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 45
-  %.pre410 = load i32, i32* %save_nSelectors7.phi.trans.insert, align 4
-  %save_EOB8.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 46
-  %.pre411 = load i32, i32* %save_EOB8.phi.trans.insert, align 4
-  %save_groupNo9.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 47
-  %.pre412 = load i32, i32* %save_groupNo9.phi.trans.insert, align 4
-  %save_groupPos10.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 48
-  %.pre413 = load i32, i32* %save_groupPos10.phi.trans.insert, align 4
-  %save_nextSym11.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 49
-  %.pre414 = load i32, i32* %save_nextSym11.phi.trans.insert, align 4
-  %save_nblockMAX12.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 50
-  %.pre415 = load i32, i32* %save_nblockMAX12.phi.trans.insert, align 4
-  %save_nblock13.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 51
-  %.pre416 = load i32, i32* %save_nblock13.phi.trans.insert, align 4
-  %save_es14.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 52
-  %.pre417 = load i32, i32* %save_es14.phi.trans.insert, align 4
-  %save_N15.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 53
-  %.pre418 = load i32, i32* %save_N15.phi.trans.insert, align 4
-  %save_curr16.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 54
-  %.pre419 = load i32, i32* %save_curr16.phi.trans.insert, align 4
-  %save_zt17.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 55
-  %.pre420 = load i32, i32* %save_zt17.phi.trans.insert, align 4
-  %save_zn18.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 56
-  %.pre421 = load i32, i32* %save_zn18.phi.trans.insert, align 4
-  %save_zvec19.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 57
-  %.pre422 = load i32, i32* %save_zvec19.phi.trans.insert, align 4
-  %save_zj20.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 58
-  %.pre423 = load i32, i32* %save_zj20.phi.trans.insert, align 4
-  switch i32 %tmp, label %sw.default [
-    i32 13, label %sw.bb
-    i32 14, label %if.end.sw.bb.65_crit_edge
-    i32 25, label %if.end.sw.bb.123_crit_edge
-  ]
-
-if.end.sw.bb.123_crit_edge:                       ; preds = %if.end
-  %.pre433 = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 8
-  br label %sw.bb.123
-
-if.end.sw.bb.65_crit_edge:                        ; preds = %if.end
-  %bsLive69.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 8
-  %.pre426 = load i32, i32* %bsLive69.phi.trans.insert, align 4
-  br label %sw.bb.65
-
-sw.bb:                                            ; preds = %if.end
-  %sunkaddr = ptrtoint %struct.DState* %s to i64
-  %sunkaddr485 = add i64 %sunkaddr, 8
-  %sunkaddr486 = inttoptr i64 %sunkaddr485 to i32*
-  store i32 13, i32* %sunkaddr486, align 4
-  %bsLive = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 8
-  %tmp2 = load i32, i32* %bsLive, align 4
-  %cmp28.400 = icmp sgt i32 %tmp2, 7
-  br i1 %cmp28.400, label %sw.bb.if.then.29_crit_edge, label %if.end.33.lr.ph
-
-sw.bb.if.then.29_crit_edge:                       ; preds = %sw.bb
-  %sunkaddr487 = ptrtoint %struct.DState* %s to i64
-  %sunkaddr488 = add i64 %sunkaddr487, 32
-  %sunkaddr489 = inttoptr i64 %sunkaddr488 to i32*
-  %.pre425 = load i32, i32* %sunkaddr489, align 4
-  br label %if.then.29
-
-if.end.33.lr.ph:                                  ; preds = %sw.bb
-  %tmp3 = bitcast %struct.DState* %s to %struct.bz_stream**
-  %.pre424 = load %struct.bz_stream*, %struct.bz_stream** %tmp3, align 8
-  %avail_in.phi.trans.insert = getelementptr inbounds %struct.bz_stream, %struct.bz_stream* %.pre424, i64 0, i32 1
-  %.pre430 = load i32, i32* %avail_in.phi.trans.insert, align 4
-  %tmp4 = add i32 %.pre430, -1
-  br label %if.end.33
-
-if.then.29:                                       ; preds = %while.body.backedge, %sw.bb.if.then.29_crit_edge
-  %tmp5 = phi i32 [ %.pre425, %sw.bb.if.then.29_crit_edge ], [ %or, %while.body.backedge ]
-  %.lcssa393 = phi i32 [ %tmp2, %sw.bb.if.then.29_crit_edge ], [ %add, %while.body.backedge ]
-  %sub = add nsw i32 %.lcssa393, -8
-  %shr = lshr i32 %tmp5, %sub
-  %and = and i32 %shr, 255
-  %sunkaddr491 = ptrtoint %struct.DState* %s to i64
-  %sunkaddr492 = add i64 %sunkaddr491, 36
-  %sunkaddr493 = inttoptr i64 %sunkaddr492 to i32*
-  store i32 %sub, i32* %sunkaddr493, align 4
-  %blockSize100k = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 9
-  store i32 %and, i32* %blockSize100k, align 4
-  %and.off = add nsw i32 %and, -49
-  %tmp6 = icmp ugt i32 %and.off, 8
-  br i1 %tmp6, label %save_state_and_return, label %if.end.62
-
-if.end.33:                                        ; preds = %while.body.backedge, %if.end.33.lr.ph
-  %lsr.iv482 = phi i32 [ %tmp4, %if.end.33.lr.ph ], [ %lsr.iv.next483, %while.body.backedge ]
-  %tmp7 = phi i32 [ %tmp2, %if.end.33.lr.ph ], [ %add, %while.body.backedge ]
-  %cmp35 = icmp eq i32 %lsr.iv482, -1
-  br i1 %cmp35, label %save_state_and_return, label %if.end.37
-
-if.end.37:                                        ; preds = %if.end.33
-  %tmp8 = bitcast %struct.bz_stream* %.pre424 to i8**
-  %sunkaddr494 = ptrtoint %struct.DState* %s to i64
-  %sunkaddr495 = add i64 %sunkaddr494, 32
-  %sunkaddr496 = inttoptr i64 %sunkaddr495 to i32*
-  %tmp9 = load i32, i32* %sunkaddr496, align 4
-  %shl = shl i32 %tmp9, 8
-  %tmp10 = load i8*, i8** %tmp8, align 8
-  %tmp11 = load i8, i8* %tmp10, align 1
-  %conv = zext i8 %tmp11 to i32
-  %or = or i32 %conv, %shl
-  store i32 %or, i32* %sunkaddr496, align 4
-  %add = add nsw i32 %tmp7, 8
-  %sunkaddr497 = ptrtoint %struct.DState* %s to i64
-  %sunkaddr498 = add i64 %sunkaddr497, 36
-  %sunkaddr499 = inttoptr i64 %sunkaddr498 to i32*
-  store i32 %add, i32* %sunkaddr499, align 4
-  %incdec.ptr = getelementptr inbounds i8, i8* %tmp10, i64 1
-  store i8* %incdec.ptr, i8** %tmp8, align 8
-  %sunkaddr500 = ptrtoint %struct.bz_stream* %.pre424 to i64
-  %sunkaddr501 = add i64 %sunkaddr500, 8
-  %sunkaddr502 = inttoptr i64 %sunkaddr501 to i32*
-  store i32 %lsr.iv482, i32* %sunkaddr502, align 4
-  %sunkaddr503 = ptrtoint %struct.bz_stream* %.pre424 to i64
-  %sunkaddr504 = add i64 %sunkaddr503, 12
-  %sunkaddr505 = inttoptr i64 %sunkaddr504 to i32*
-  %tmp12 = load i32, i32* %sunkaddr505, align 4
-  %inc = add i32 %tmp12, 1
-  store i32 %inc, i32* %sunkaddr505, align 4
-  %cmp49 = icmp eq i32 %inc, 0
-  br i1 %cmp49, label %if.then.51, label %while.body.backedge
-
-if.then.51:                                       ; preds = %if.end.37
-  %sunkaddr506 = ptrtoint %struct.bz_stream* %.pre424 to i64
-  %sunkaddr507 = add i64 %sunkaddr506, 16
-  %sunkaddr508 = inttoptr i64 %sunkaddr507 to i32*
-  %tmp13 = load i32, i32* %sunkaddr508, align 4
-  %inc53 = add i32 %tmp13, 1
-  store i32 %inc53, i32* %sunkaddr508, align 4
-  br label %while.body.backedge
-
-while.body.backedge:                              ; preds = %if.then.51, %if.end.37
-  %lsr.iv.next483 = add i32 %lsr.iv482, -1
-  %cmp28 = icmp sgt i32 %add, 7
-  br i1 %cmp28, label %if.then.29, label %if.end.33
-
-if.end.62:                                        ; preds = %if.then.29
-  %sub64 = add nsw i32 %and, -48
-  %sunkaddr509 = ptrtoint %struct.DState* %s to i64
-  %sunkaddr510 = add i64 %sunkaddr509, 40
-  %sunkaddr511 = inttoptr i64 %sunkaddr510 to i32*
-  store i32 %sub64, i32* %sunkaddr511, align 4
-  br label %sw.bb.65
-
-sw.bb.65:                                         ; preds = %if.end.62, %if.end.sw.bb.65_crit_edge
-  %bsLive69.pre-phi = phi i32* [ %bsLive69.phi.trans.insert, %if.end.sw.bb.65_crit_edge ], [ %bsLive, %if.end.62 ]
-  %tmp14 = phi i32 [ %.pre426, %if.end.sw.bb.65_crit_edge ], [ %sub, %if.end.62 ]
-  %sunkaddr512 = ptrtoint %struct.DState* %s to i64
-  %sunkaddr513 = add i64 %sunkaddr512, 8
-  %sunkaddr514 = inttoptr i64 %sunkaddr513 to i32*
-  store i32 14, i32* %sunkaddr514, align 4
-  %cmp70.397 = icmp sgt i32 %tmp14, 7
-  br i1 %cmp70.397, label %if.then.72, label %if.end.82.lr.ph
-
-if.end.82.lr.ph:                                  ; preds = %sw.bb.65
-  %tmp15 = bitcast %struct.DState* %s to %struct.bz_stream**
-  %.pre427 = load %struct.bz_stream*, %struct.bz_stream** %tmp15, align 8
-  %avail_in84.phi.trans.insert = getelementptr inbounds %struct.bz_stream, %struct.bz_stream* %.pre427, i64 0, i32 1
-  %.pre431 = load i32, i32* %avail_in84.phi.trans.insert, align 4
-  %tmp16 = add i32 %.pre431, -1
-  br label %if.end.82
-
-if.then.72:                                       ; preds = %while.body.68.backedge, %sw.bb.65
-  %.lcssa390 = phi i32 [ %tmp14, %sw.bb.65 ], [ %add97, %while.body.68.backedge ]
-  %sub76 = add nsw i32 %.lcssa390, -8
-  %sunkaddr516 = ptrtoint %struct.DState* %s to i64
-  %sunkaddr517 = add i64 %sunkaddr516, 36
-  %sunkaddr518 = inttoptr i64 %sunkaddr517 to i32*
-  store i32 %sub76, i32* %sunkaddr518, align 4
-  %currBlockNo = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 11
-  %tmp17 = load i32, i32* %currBlockNo, align 4
-  %inc117 = add nsw i32 %tmp17, 1
-  store i32 %inc117, i32* %currBlockNo, align 4
-  %verbosity = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 12
-  %tmp18 = load i32, i32* %verbosity, align 4
-  %cmp118 = icmp sgt i32 %tmp18, 1
-  br i1 %cmp118, label %if.then.120, label %sw.bb.123, !prof !0
-
-if.end.82:                                        ; preds = %while.body.68.backedge, %if.end.82.lr.ph
-  %lsr.iv480 = phi i32 [ %tmp16, %if.end.82.lr.ph ], [ %lsr.iv.next481, %while.body.68.backedge ]
-  %tmp19 = phi i32 [ %tmp14, %if.end.82.lr.ph ], [ %add97, %while.body.68.backedge ]
-  %cmp85 = icmp eq i32 %lsr.iv480, -1
-  br i1 %cmp85, label %save_state_and_return, label %if.end.88
-
-if.end.88:                                        ; preds = %if.end.82
-  %tmp20 = bitcast %struct.bz_stream* %.pre427 to i8**
-  %sunkaddr519 = ptrtoint %struct.DState* %s to i64
-  %sunkaddr520 = add i64 %sunkaddr519, 32
-  %sunkaddr521 = inttoptr i64 %sunkaddr520 to i32*
-  %tmp21 = load i32, i32* %sunkaddr521, align 4
-  %shl90 = shl i32 %tmp21, 8
-  %tmp22 = load i8*, i8** %tmp20, align 8
-  %tmp23 = load i8, i8* %tmp22, align 1
-  %conv93 = zext i8 %tmp23 to i32
-  %or94 = or i32 %conv93, %shl90
-  store i32 %or94, i32* %sunkaddr521, align 4
-  %add97 = add nsw i32 %tmp19, 8
-  %sunkaddr522 = ptrtoint %struct.DState* %s to i64
-  %sunkaddr523 = add i64 %sunkaddr522, 36
-  %sunkaddr524 = inttoptr i64 %sunkaddr523 to i32*
-  store i32 %add97, i32* %sunkaddr524, align 4
-  %incdec.ptr100 = getelementptr inbounds i8, i8* %tmp22, i64 1
-  store i8* %incdec.ptr100, i8** %tmp20, align 8
-  %sunkaddr525 = ptrtoint %struct.bz_stream* %.pre427 to i64
-  %sunkaddr526 = add i64 %sunkaddr525, 8
-  %sunkaddr527 = inttoptr i64 %sunkaddr526 to i32*
-  store i32 %lsr.iv480, i32* %sunkaddr527, align 4
-  %sunkaddr528 = ptrtoint %struct.bz_stream* %.pre427 to i64
-  %sunkaddr529 = add i64 %sunkaddr528, 12
-  %sunkaddr530 = inttoptr i64 %sunkaddr529 to i32*
-  %tmp24 = load i32, i32* %sunkaddr530, align 4
-  %inc106 = add i32 %tmp24, 1
-  store i32 %inc106, i32* %sunkaddr530, align 4
-  %cmp109 = icmp eq i32 %inc106, 0
-  br i1 %cmp109, label %if.then.111, label %while.body.68.backedge
-
-if.then.111:                                      ; preds = %if.end.88
-  %sunkaddr531 = ptrtoint %struct.bz_stream* %.pre427 to i64
-  %sunkaddr532 = add i64 %sunkaddr531, 16
-  %sunkaddr533 = inttoptr i64 %sunkaddr532 to i32*
-  %tmp25 = load i32, i32* %sunkaddr533, align 4
-  %inc114 = add i32 %tmp25, 1
-  store i32 %inc114, i32* %sunkaddr533, align 4
-  br label %while.body.68.backedge
-
-while.body.68.backedge:                           ; preds = %if.then.111, %if.end.88
-  %lsr.iv.next481 = add i32 %lsr.iv480, -1
-  %cmp70 = icmp sgt i32 %add97, 7
-  br i1 %cmp70, label %if.then.72, label %if.end.82
-
-if.then.120:                                      ; preds = %if.then.72
-  %call = tail call i32 (%struct.__sFILE*, i8*, ...) @fprintf(%struct.__sFILE* getelementptr inbounds ([0 x %struct.__sFILE], [0 x %struct.__sFILE]* @__sF, i64 0, i64 2), i8* getelementptr inbounds ([20 x i8], [20 x i8]* @.str, i64 0, i64 0), i32 %inc117)
-  br label %sw.bb.123
-
-sw.bb.123:                                        ; preds = %if.then.120, %if.then.72, %if.end.sw.bb.123_crit_edge
-  %bsLive127.pre-phi = phi i32* [ %.pre433, %if.end.sw.bb.123_crit_edge ], [ %bsLive69.pre-phi, %if.then.72 ], [ %bsLive69.pre-phi, %if.then.120 ]
-  %sunkaddr534 = ptrtoint %struct.DState* %s to i64
-  %sunkaddr535 = add i64 %sunkaddr534, 8
-  %sunkaddr536 = inttoptr i64 %sunkaddr535 to i32*
-  store i32 25, i32* %sunkaddr536, align 4
-  %tmp26 = load i32, i32* %bsLive127.pre-phi, align 4
-  %cmp128.395 = icmp sgt i32 %tmp26, 7
-  br i1 %cmp128.395, label %sw.bb.123.if.then.130_crit_edge, label %if.end.140.lr.ph
-
-sw.bb.123.if.then.130_crit_edge:                  ; preds = %sw.bb.123
-  %sunkaddr537 = ptrtoint %struct.DState* %s to i64
-  %sunkaddr538 = add i64 %sunkaddr537, 32
-  %sunkaddr539 = inttoptr i64 %sunkaddr538 to i32*
-  %.pre429 = load i32, i32* %sunkaddr539, align 4
-  br label %if.then.130
-
-if.end.140.lr.ph:                                 ; preds = %sw.bb.123
-  %tmp27 = bitcast %struct.DState* %s to %struct.bz_stream**
-  %.pre428 = load %struct.bz_stream*, %struct.bz_stream** %tmp27, align 8
-  %avail_in142.phi.trans.insert = getelementptr inbounds %struct.bz_stream, %struct.bz_stream* %.pre428, i64 0, i32 1
-  %.pre432 = load i32, i32* %avail_in142.phi.trans.insert, align 4
-  %tmp28 = add i32 %.pre432, -1
-  br label %if.end.140
-
-if.then.130:                                      ; preds = %while.body.126.backedge, %sw.bb.123.if.then.130_crit_edge
-  %tmp29 = phi i32 [ %.pre429, %sw.bb.123.if.then.130_crit_edge ], [ %or152, %while.body.126.backedge ]
-  %.lcssa = phi i32 [ %tmp26, %sw.bb.123.if.then.130_crit_edge ], [ %add155, %while.body.126.backedge ]
-  %sub134 = add nsw i32 %.lcssa, -8
-  %shr135 = lshr i32 %tmp29, %sub134
-  store i32 %sub134, i32* %bsLive127.pre-phi, align 4
-  %origPtr = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 13
-  %tmp30 = load i32, i32* %origPtr, align 4
-  %shl175 = shl i32 %tmp30, 8
-  %conv176 = and i32 %shr135, 255
-  %or177 = or i32 %shl175, %conv176
-  store i32 %or177, i32* %origPtr, align 4
-  %nInUse = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 27
-  %tmp31 = load i32, i32* %nInUse, align 4
-  %add179 = add nsw i32 %tmp31, 2
-  br label %save_state_and_return
-
-if.end.140:                                       ; preds = %while.body.126.backedge, %if.end.140.lr.ph
-  %lsr.iv = phi i32 [ %tmp28, %if.end.140.lr.ph ], [ %lsr.iv.next, %while.body.126.backedge ]
-  %tmp32 = phi i32 [ %tmp26, %if.end.140.lr.ph ], [ %add155, %while.body.126.backedge ]
-  %cmp143 = icmp eq i32 %lsr.iv, -1
-  br i1 %cmp143, label %save_state_and_return, label %if.end.146
-
-if.end.146:                                       ; preds = %if.end.140
-  %tmp33 = bitcast %struct.bz_stream* %.pre428 to i8**
-  %sunkaddr541 = ptrtoint %struct.DState* %s to i64
-  %sunkaddr542 = add i64 %sunkaddr541, 32
-  %sunkaddr543 = inttoptr i64 %sunkaddr542 to i32*
-  %tmp34 = load i32, i32* %sunkaddr543, align 4
-  %shl148 = shl i32 %tmp34, 8
-  %tmp35 = load i8*, i8** %tmp33, align 8
-  %tmp36 = load i8, i8* %tmp35, align 1
-  %conv151 = zext i8 %tmp36 to i32
-  %or152 = or i32 %conv151, %shl148
-  store i32 %or152, i32* %sunkaddr543, align 4
-  %add155 = add nsw i32 %tmp32, 8
-  store i32 %add155, i32* %bsLive127.pre-phi, align 4
-  %incdec.ptr158 = getelementptr inbounds i8, i8* %tmp35, i64 1
-  store i8* %incdec.ptr158, i8** %tmp33, align 8
-  %sunkaddr544 = ptrtoint %struct.bz_stream* %.pre428 to i64
-  %sunkaddr545 = add i64 %sunkaddr544, 8
-  %sunkaddr546 = inttoptr i64 %sunkaddr545 to i32*
-  store i32 %lsr.iv, i32* %sunkaddr546, align 4
-  %sunkaddr547 = ptrtoint %struct.bz_stream* %.pre428 to i64
-  %sunkaddr548 = add i64 %sunkaddr547, 12
-  %sunkaddr549 = inttoptr i64 %sunkaddr548 to i32*
-  %tmp37 = load i32, i32* %sunkaddr549, align 4
-  %inc164 = add i32 %tmp37, 1
-  store i32 %inc164, i32* %sunkaddr549, align 4
-  %cmp167 = icmp eq i32 %inc164, 0
-  br i1 %cmp167, label %if.then.169, label %while.body.126.backedge
-
-if.then.169:                                      ; preds = %if.end.146
-  %sunkaddr550 = ptrtoint %struct.bz_stream* %.pre428 to i64
-  %sunkaddr551 = add i64 %sunkaddr550, 16
-  %sunkaddr552 = inttoptr i64 %sunkaddr551 to i32*
-  %tmp38 = load i32, i32* %sunkaddr552, align 4
-  %inc172 = add i32 %tmp38, 1
-  store i32 %inc172, i32* %sunkaddr552, align 4
-  br label %while.body.126.backedge
-
-while.body.126.backedge:                          ; preds = %if.then.169, %if.end.146
-  %lsr.iv.next = add i32 %lsr.iv, -1
-  %cmp128 = icmp sgt i32 %add155, 7
-  br i1 %cmp128, label %if.then.130, label %if.end.140
-
-sw.default:                                       ; preds = %if.end, %if.end.thread
-  %tmp39 = phi i32 [ 0, %if.end.thread ], [ %.pre, %if.end ]
-  %tmp40 = phi i32 [ 0, %if.end.thread ], [ %.pre406, %if.end ]
-  %tmp41 = phi i32 [ 0, %if.end.thread ], [ %.pre407, %if.end ]
-  %tmp42 = phi i32 [ 0, %if.end.thread ], [ %.pre408, %if.end ]
-  %tmp43 = phi i32 [ 0, %if.end.thread ], [ %.pre409, %if.end ]
-  %tmp44 = phi i32 [ 0, %if.end.thread ], [ %.pre410, %if.end ]
-  %tmp45 = phi i32 [ 0, %if.end.thread ], [ %.pre411, %if.end ]
-  %tmp46 = phi i32 [ 0, %if.end.thread ], [ %.pre412, %if.end ]
-  %tmp47 = phi i32 [ 0, %if.end.thread ], [ %.pre413, %if.end ]
-  %tmp48 = phi i32 [ 0, %if.end.thread ], [ %.pre414, %if.end ]
-  %tmp49 = phi i32 [ 0, %if.end.thread ], [ %.pre415, %if.end ]
-  %tmp50 = phi i32 [ 0, %if.end.thread ], [ %.pre416, %if.end ]
-  %tmp51 = phi i32 [ 0, %if.end.thread ], [ %.pre417, %if.end ]
-  %tmp52 = phi i32 [ 0, %if.end.thread ], [ %.pre418, %if.end ]
-  %tmp53 = phi i32 [ 0, %if.end.thread ], [ %.pre419, %if.end ]
-  %tmp54 = phi i32 [ 0, %if.end.thread ], [ %.pre420, %if.end ]
-  %tmp55 = phi i32 [ 0, %if.end.thread ], [ %.pre421, %if.end ]
-  %tmp56 = phi i32 [ 0, %if.end.thread ], [ %.pre422, %if.end ]
-  %tmp57 = phi i32 [ 0, %if.end.thread ], [ %.pre423, %if.end ]
-  %save_j3.pre-phi469 = phi i32* [ %save_j, %if.end.thread ], [ %save_j3.phi.trans.insert, %if.end ]
-  %save_t4.pre-phi467 = phi i32* [ %save_t, %if.end.thread ], [ %save_t4.phi.trans.insert, %if.end ]
-  %save_alphaSize5.pre-phi465 = phi i32* [ %save_alphaSize, %if.end.thread ], [ %save_alphaSize5.phi.trans.insert, %if.end ]
-  %save_nGroups6.pre-phi463 = phi i32* [ %save_nGroups, %if.end.thread ], [ %save_nGroups6.phi.trans.insert, %if.end ]
-  %save_nSelectors7.pre-phi461 = phi i32* [ %save_nSelectors, %if.end.thread ], [ %save_nSelectors7.phi.trans.insert, %if.end ]
-  %save_EOB8.pre-phi459 = phi i32* [ %save_EOB, %if.end.thread ], [ %save_EOB8.phi.trans.insert, %if.end ]
-  %save_groupNo9.pre-phi457 = phi i32* [ %save_groupNo, %if.end.thread ], [ %save_groupNo9.phi.trans.insert, %if.end ]
-  %save_groupPos10.pre-phi455 = phi i32* [ %save_groupPos, %if.end.thread ], [ %save_groupPos10.phi.trans.insert, %if.end ]
-  %save_nextSym11.pre-phi453 = phi i32* [ %save_nextSym, %if.end.thread ], [ %save_nextSym11.phi.trans.insert, %if.end ]
-  %save_nblockMAX12.pre-phi451 = phi i32* [ %save_nblockMAX, %if.end.thread ], [ %save_nblockMAX12.phi.trans.insert, %if.end ]
-  %save_nblock13.pre-phi449 = phi i32* [ %save_nblock, %if.end.thread ], [ %save_nblock13.phi.trans.insert, %if.end ]
-  %save_es14.pre-phi447 = phi i32* [ %save_es, %if.end.thread ], [ %save_es14.phi.trans.insert, %if.end ]
-  %save_N15.pre-phi445 = phi i32* [ %save_N, %if.end.thread ], [ %save_N15.phi.trans.insert, %if.end ]
-  %save_curr16.pre-phi443 = phi i32* [ %save_curr, %if.end.thread ], [ %save_curr16.phi.trans.insert, %if.end ]
-  %save_zt17.pre-phi441 = phi i32* [ %save_zt, %if.end.thread ], [ %save_zt17.phi.trans.insert, %if.end ]
-  %save_zn18.pre-phi439 = phi i32* [ %save_zn, %if.end.thread ], [ %save_zn18.phi.trans.insert, %if.end ]
-  %save_zvec19.pre-phi437 = phi i32* [ %save_zvec, %if.end.thread ], [ %save_zvec19.phi.trans.insert, %if.end ]
-  %save_zj20.pre-phi435 = phi i32* [ %save_zj, %if.end.thread ], [ %save_zj20.phi.trans.insert, %if.end ]
-  tail call void @bar(i32 4001)
-  br label %save_state_and_return
-
-save_state_and_return:                            ; preds = %sw.default, %if.end.140, %if.then.130, %if.end.82, %if.end.33, %if.then.29
-  %tmp58 = phi i32 [ %tmp39, %sw.default ], [ %.pre, %if.then.29 ], [ %.pre, %if.then.130 ], [ %.pre, %if.end.140 ], [ %.pre, %if.end.82 ], [ %.pre, %if.end.33 ]
-  %tmp59 = phi i32 [ %tmp40, %sw.default ], [ %.pre406, %if.then.29 ], [ %.pre406, %if.then.130 ], [ %.pre406, %if.end.140 ], [ %.pre406, %if.end.82 ], [ %.pre406, %if.end.33 ]
-  %tmp60 = phi i32 [ %tmp41, %sw.default ], [ %.pre407, %if.then.29 ], [ %.pre407, %if.then.130 ], [ %.pre407, %if.end.140 ], [ %.pre407, %if.end.82 ], [ %.pre407, %if.end.33 ]
-  %tmp61 = phi i32 [ %tmp43, %sw.default ], [ %.pre409, %if.then.29 ], [ %.pre409, %if.then.130 ], [ %.pre409, %if.end.140 ], [ %.pre409, %if.end.82 ], [ %.pre409, %if.end.33 ]
-  %tmp62 = phi i32 [ %tmp44, %sw.default ], [ %.pre410, %if.then.29 ], [ %.pre410, %if.then.130 ], [ %.pre410, %if.end.140 ], [ %.pre410, %if.end.82 ], [ %.pre410, %if.end.33 ]
-  %tmp63 = phi i32 [ %tmp45, %sw.default ], [ %.pre411, %if.then.29 ], [ %.pre411, %if.then.130 ], [ %.pre411, %if.end.140 ], [ %.pre411, %if.end.82 ], [ %.pre411, %if.end.33 ]
-  %tmp64 = phi i32 [ %tmp46, %sw.default ], [ %.pre412, %if.then.29 ], [ %.pre412, %if.then.130 ], [ %.pre412, %if.end.140 ], [ %.pre412, %if.end.82 ], [ %.pre412, %if.end.33 ]
-  %tmp65 = phi i32 [ %tmp47, %sw.default ], [ %.pre413, %if.then.29 ], [ %.pre413, %if.then.130 ], [ %.pre413, %if.end.140 ], [ %.pre413, %if.end.82 ], [ %.pre413, %if.end.33 ]
-  %tmp66 = phi i32 [ %tmp48, %sw.default ], [ %.pre414, %if.then.29 ], [ %.pre414, %if.then.130 ], [ %.pre414, %if.end.140 ], [ %.pre414, %if.end.82 ], [ %.pre414, %if.end.33 ]
-  %tmp67 = phi i32 [ %tmp49, %sw.default ], [ %.pre415, %if.then.29 ], [ %.pre415, %if.then.130 ], [ %.pre415, %if.end.140 ], [ %.pre415, %if.end.82 ], [ %.pre415, %if.end.33 ]
-  %tmp68 = phi i32 [ %tmp51, %sw.default ], [ %.pre417, %if.then.29 ], [ %.pre417, %if.then.130 ], [ %.pre417, %if.end.140 ], [ %.pre417, %if.end.82 ], [ %.pre417, %if.end.33 ]
-  %tmp69 = phi i32 [ %tmp52, %sw.default ], [ %.pre418, %if.then.29 ], [ %.pre418, %if.then.130 ], [ %.pre418, %if.end.140 ], [ %.pre418, %if.end.82 ], [ %.pre418, %if.end.33 ]
-  %tmp70 = phi i32 [ %tmp53, %sw.default ], [ %.pre419, %if.then.29 ], [ %.pre419, %if.then.130 ], [ %.pre419, %if.end.140 ], [ %.pre419, %if.end.82 ], [ %.pre419, %if.end.33 ]
-  %tmp71 = phi i32 [ %tmp54, %sw.default ], [ %.pre420, %if.then.29 ], [ %.pre420, %if.then.130 ], [ %.pre420, %if.end.140 ], [ %.pre420, %if.end.82 ], [ %.pre420, %if.end.33 ]
-  %tmp72 = phi i32 [ %tmp55, %sw.default ], [ %.pre421, %if.then.29 ], [ %.pre421, %if.then.130 ], [ %.pre421, %if.end.140 ], [ %.pre421, %if.end.82 ], [ %.pre421, %if.end.33 ]
-  %tmp73 = phi i32 [ %tmp56, %sw.default ], [ %.pre422, %if.then.29 ], [ %.pre422, %if.then.130 ], [ %.pre422, %if.end.140 ], [ %.pre422, %if.end.82 ], [ %.pre422, %if.end.33 ]
-  %tmp74 = phi i32 [ %tmp57, %sw.default ], [ %.pre423, %if.then.29 ], [ %.pre423, %if.then.130 ], [ %.pre423, %if.end.140 ], [ %.pre423, %if.end.82 ], [ %.pre423, %if.end.33 ]
-  %save_j3.pre-phi468 = phi i32* [ %save_j3.pre-phi469, %sw.default ], [ %save_j3.phi.trans.insert, %if.then.29 ], [ %save_j3.phi.trans.insert, %if.then.130 ], [ %save_j3.phi.trans.insert, %if.end.140 ], [ %save_j3.phi.trans.insert, %if.end.82 ], [ %save_j3.phi.trans.insert, %if.end.33 ]
-  %save_t4.pre-phi466 = phi i32* [ %save_t4.pre-phi467, %sw.default ], [ %save_t4.phi.trans.insert, %if.then.29 ], [ %save_t4.phi.trans.insert, %if.then.130 ], [ %save_t4.phi.trans.insert, %if.end.140 ], [ %save_t4.phi.trans.insert, %if.end.82 ], [ %save_t4.phi.trans.insert, %if.end.33 ]
-  %save_alphaSize5.pre-phi464 = phi i32* [ %save_alphaSize5.pre-phi465, %sw.default ], [ %save_alphaSize5.phi.trans.insert, %if.then.29 ], [ %save_alphaSize5.phi.trans.insert, %if.then.130 ], [ %save_alphaSize5.phi.trans.insert, %if.end.140 ], [ %save_alphaSize5.phi.trans.insert, %if.end.82 ], [ %save_alphaSize5.phi.trans.insert, %if.end.33 ]
-  %save_nGroups6.pre-phi462 = phi i32* [ %save_nGroups6.pre-phi463, %sw.default ], [ %save_nGroups6.phi.trans.insert, %if.then.29 ], [ %save_nGroups6.phi.trans.insert, %if.then.130 ], [ %save_nGroups6.phi.trans.insert, %if.end.140 ], [ %save_nGroups6.phi.trans.insert, %if.end.82 ], [ %save_nGroups6.phi.trans.insert, %if.end.33 ]
-  %save_nSelectors7.pre-phi460 = phi i32* [ %save_nSelectors7.pre-phi461, %sw.default ], [ %save_nSelectors7.phi.trans.insert, %if.then.29 ], [ %save_nSelectors7.phi.trans.insert, %if.then.130 ], [ %save_nSelectors7.phi.trans.insert, %if.end.140 ], [ %save_nSelectors7.phi.trans.insert, %if.end.82 ], [ %save_nSelectors7.phi.trans.insert, %if.end.33 ]
-  %save_EOB8.pre-phi458 = phi i32* [ %save_EOB8.pre-phi459, %sw.default ], [ %save_EOB8.phi.trans.insert, %if.then.29 ], [ %save_EOB8.phi.trans.insert, %if.then.130 ], [ %save_EOB8.phi.trans.insert, %if.end.140 ], [ %save_EOB8.phi.trans.insert, %if.end.82 ], [ %save_EOB8.phi.trans.insert, %if.end.33 ]
-  %save_groupNo9.pre-phi456 = phi i32* [ %save_groupNo9.pre-phi457, %sw.default ], [ %save_groupNo9.phi.trans.insert, %if.then.29 ], [ %save_groupNo9.phi.trans.insert, %if.then.130 ], [ %save_groupNo9.phi.trans.insert, %if.end.140 ], [ %save_groupNo9.phi.trans.insert, %if.end.82 ], [ %save_groupNo9.phi.trans.insert, %if.end.33 ]
-  %save_groupPos10.pre-phi454 = phi i32* [ %save_groupPos10.pre-phi455, %sw.default ], [ %save_groupPos10.phi.trans.insert, %if.then.29 ], [ %save_groupPos10.phi.trans.insert, %if.then.130 ], [ %save_groupPos10.phi.trans.insert, %if.end.140 ], [ %save_groupPos10.phi.trans.insert, %if.end.82 ], [ %save_groupPos10.phi.trans.insert, %if.end.33 ]
-  %save_nextSym11.pre-phi452 = phi i32* [ %save_nextSym11.pre-phi453, %sw.default ], [ %save_nextSym11.phi.trans.insert, %if.then.29 ], [ %save_nextSym11.phi.trans.insert, %if.then.130 ], [ %save_nextSym11.phi.trans.insert, %if.end.140 ], [ %save_nextSym11.phi.trans.insert, %if.end.82 ], [ %save_nextSym11.phi.trans.insert, %if.end.33 ]
-  %save_nblockMAX12.pre-phi450 = phi i32* [ %save_nblockMAX12.pre-phi451, %sw.default ], [ %save_nblockMAX12.phi.trans.insert, %if.then.29 ], [ %save_nblockMAX12.phi.trans.insert, %if.then.130 ], [ %save_nblockMAX12.phi.trans.insert, %if.end.140 ], [ %save_nblockMAX12.phi.trans.insert, %if.end.82 ], [ %save_nblockMAX12.phi.trans.insert, %if.end.33 ]
-  %save_nblock13.pre-phi448 = phi i32* [ %save_nblock13.pre-phi449, %sw.default ], [ %save_nblock13.phi.trans.insert, %if.then.29 ], [ %save_nblock13.phi.trans.insert, %if.then.130 ], [ %save_nblock13.phi.trans.insert, %if.end.140 ], [ %save_nblock13.phi.trans.insert, %if.end.82 ], [ %save_nblock13.phi.trans.insert, %if.end.33 ]
-  %save_es14.pre-phi446 = phi i32* [ %save_es14.pre-phi447, %sw.default ], [ %save_es14.phi.trans.insert, %if.then.29 ], [ %save_es14.phi.trans.insert, %if.then.130 ], [ %save_es14.phi.trans.insert, %if.end.140 ], [ %save_es14.phi.trans.insert, %if.end.82 ], [ %save_es14.phi.trans.insert, %if.end.33 ]
-  %save_N15.pre-phi444 = phi i32* [ %save_N15.pre-phi445, %sw.default ], [ %save_N15.phi.trans.insert, %if.then.29 ], [ %save_N15.phi.trans.insert, %if.then.130 ], [ %save_N15.phi.trans.insert, %if.end.140 ], [ %save_N15.phi.trans.insert, %if.end.82 ], [ %save_N15.phi.trans.insert, %if.end.33 ]
-  %save_curr16.pre-phi442 = phi i32* [ %save_curr16.pre-phi443, %sw.default ], [ %save_curr16.phi.trans.insert, %if.then.29 ], [ %save_curr16.phi.trans.insert, %if.then.130 ], [ %save_curr16.phi.trans.insert, %if.end.140 ], [ %save_curr16.phi.trans.insert, %if.end.82 ], [ %save_curr16.phi.trans.insert, %if.end.33 ]
-  %save_zt17.pre-phi440 = phi i32* [ %save_zt17.pre-phi441, %sw.default ], [ %save_zt17.phi.trans.insert, %if.then.29 ], [ %save_zt17.phi.trans.insert, %if.then.130 ], [ %save_zt17.phi.trans.insert, %if.end.140 ], [ %save_zt17.phi.trans.insert, %if.end.82 ], [ %save_zt17.phi.trans.insert, %if.end.33 ]
-  %save_zn18.pre-phi438 = phi i32* [ %save_zn18.pre-phi439, %sw.default ], [ %save_zn18.phi.trans.insert, %if.then.29 ], [ %save_zn18.phi.trans.insert, %if.then.130 ], [ %save_zn18.phi.trans.insert, %if.end.140 ], [ %save_zn18.phi.trans.insert, %if.end.82 ], [ %save_zn18.phi.trans.insert, %if.end.33 ]
-  %save_zvec19.pre-phi436 = phi i32* [ %save_zvec19.pre-phi437, %sw.default ], [ %save_zvec19.phi.trans.insert, %if.then.29 ], [ %save_zvec19.phi.trans.insert, %if.then.130 ], [ %save_zvec19.phi.trans.insert, %if.end.140 ], [ %save_zvec19.phi.trans.insert, %if.end.82 ], [ %save_zvec19.phi.trans.insert, %if.end.33 ]
-  %save_zj20.pre-phi434 = phi i32* [ %save_zj20.pre-phi435, %sw.default ], [ %save_zj20.phi.trans.insert, %if.then.29 ], [ %save_zj20.phi.trans.insert, %if.then.130 ], [ %save_zj20.phi.trans.insert, %if.end.140 ], [ %save_zj20.phi.trans.insert, %if.end.82 ], [ %save_zj20.phi.trans.insert, %if.end.33 ]
-  %nblock.1 = phi i32 [ %tmp50, %sw.default ], [ %.pre416, %if.then.29 ], [ 0, %if.then.130 ], [ %.pre416, %if.end.140 ], [ %.pre416, %if.end.82 ], [ %.pre416, %if.end.33 ]
-  %alphaSize.1 = phi i32 [ %tmp42, %sw.default ], [ %.pre408, %if.then.29 ], [ %add179, %if.then.130 ], [ %.pre408, %if.end.140 ], [ %.pre408, %if.end.82 ], [ %.pre408, %if.end.33 ]
-  %retVal.0 = phi i32 [ 0, %sw.default ], [ -5, %if.then.29 ], [ -4, %if.then.130 ], [ 0, %if.end.140 ], [ 0, %if.end.82 ], [ 0, %if.end.33 ]
-  store i32 %tmp58, i32* %save_i, align 4
-  store i32 %tmp59, i32* %save_j3.pre-phi468, align 4
-  store i32 %tmp60, i32* %save_t4.pre-phi466, align 4
-  store i32 %alphaSize.1, i32* %save_alphaSize5.pre-phi464, align 4
-  store i32 %tmp61, i32* %save_nGroups6.pre-phi462, align 4
-  store i32 %tmp62, i32* %save_nSelectors7.pre-phi460, align 4
-  store i32 %tmp63, i32* %save_EOB8.pre-phi458, align 4
-  store i32 %tmp64, i32* %save_groupNo9.pre-phi456, align 4
-  store i32 %tmp65, i32* %save_groupPos10.pre-phi454, align 4
-  store i32 %tmp66, i32* %save_nextSym11.pre-phi452, align 4
-  store i32 %tmp67, i32* %save_nblockMAX12.pre-phi450, align 4
-  store i32 %nblock.1, i32* %save_nblock13.pre-phi448, align 4
-  store i32 %tmp68, i32* %save_es14.pre-phi446, align 4
-  store i32 %tmp69, i32* %save_N15.pre-phi444, align 4
-  store i32 %tmp70, i32* %save_curr16.pre-phi442, align 4
-  store i32 %tmp71, i32* %save_zt17.pre-phi440, align 4
-  store i32 %tmp72, i32* %save_zn18.pre-phi438, align 4
-  store i32 %tmp73, i32* %save_zvec19.pre-phi436, align 4
-  store i32 %tmp74, i32* %save_zj20.pre-phi434, align 4
-  ret i32 %retVal.0
-}
-
-!0 = !{!"branch_weights", i32 10, i32 1}
--- a/test/CodeGen/X86/fp128-compare.ll
+++ b/test/CodeGen/X86/fp128-compare.ll
@ -86,8 +86,8 @@ entry:
  %cond = select i1 %cmp, fp128 %x, fp128 %y
  ret fp128 %cond
 ; CHECK-LABEL: TestMax:
-; CHECK: movaps %xmm1
 ; CHECK: movaps %xmm0
+; CHECK: movaps %xmm1
 ; CHECK: callq __gttf2
 ; CHECK: movaps {{.*}}, %xmm0
 ; CHECK: testl %eax, %eax
--- a/test/CodeGen/X86/hoist-spill.ll
+++ b/test/CodeGen/X86/hoist-spill.ll
@ -0,0 +1,121 @@
+; RUN: llc < %s | FileCheck %s
+
+; grep 'Spill' |sed 's%.*\(-[0-9]\+(\%rsp)\).*%\1%g' |sort |uniq -d |awk '{if (/rsp/); exit -1}'
+; Check that no two spills go to the same stack slot after hoisting.
+; CHECK: mov{{.}} %{{.*}}, [[SPOFFSET1:-?[0-9]*]](%rsp)
+; CHECK: mov{{.}} %{{.*}}, [[SPOFFSET2:-?[0-9]*]](%rsp)
+; CHECK: mov{{.}} %{{.*}}, [[SPOFFSET3:-?[0-9]*]](%rsp)
+; CHECK-NOT: mov{{.}} %{{.*}}, [[SPOFFSET1]](%rsp)
+; CHECK-NOT: mov{{.}} %{{.*}}, [[SPOFFSET2]](%rsp)
+; CHECK-NOT: mov{{.}} %{{.*}}, [[SPOFFSET3]](%rsp)
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@a = external global i32*, align 8
+@b = external global i32, align 4
+@d = external global i32*, align 8
+
+; fn1: reduced loop nest that contains no ret instruction. Function Attrs: norecurse noreturn nounwind uwtable
+define void @fn1(i32 %p1) {
+entry:
+  %tmp = load i32*, i32** @d, align 8  ; base pointer for the loads in vector.body and for.body6
+  %tmp1 = load i32*, i32** @a, align 8  ; base pointer for the stores in vector.body and for.body6, and for %scevgep41
+  %tmp2 = sext i32 %p1 to i64
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc14, %entry
+  %indvar = phi i32 [ %indvar.next, %for.inc14 ], [ 0, %entry ]
+  %indvars.iv30.in = phi i32 [ %indvars.iv30, %for.inc14 ], [ %p1, %entry ]
+  %c.0 = phi i32 [ %inc15, %for.inc14 ], [ 1, %entry ]
+  %k.0 = phi i32 [ %k.1.lcssa, %for.inc14 ], [ undef, %entry ]
+  %tmp3 = icmp sgt i32 undef, 0
+  %smax52 = select i1 %tmp3, i32 undef, i32 0
+  %tmp4 = zext i32 %smax52 to i64
+  %tmp5 = icmp sgt i64 undef, %tmp4
+  %smax53 = select i1 %tmp5, i64 undef, i64 %tmp4
+  %tmp6 = add nsw i64 %smax53, 1
+  %tmp7 = sub nsw i64 %tmp6, %tmp4
+  %tmp8 = add nsw i64 %tmp7, -8
+  %tmp9 = sub i32 undef, %indvar
+  %tmp10 = icmp sgt i64 %tmp2, 0
+  %smax40 = select i1 %tmp10, i64 %tmp2, i64 0
+  %scevgep41 = getelementptr i32, i32* %tmp1, i64 %smax40
+  %indvars.iv30 = add i32 %indvars.iv30.in, -1
+  %tmp11 = icmp sgt i32 %indvars.iv30, 0
+  %smax = select i1 %tmp11, i32 %indvars.iv30, i32 0
+  %tmp12 = zext i32 %smax to i64
+  %sub = sub nsw i32 %p1, %c.0
+  %cmp = icmp sgt i32 %sub, 0
+  %sub. = select i1 %cmp, i32 %sub, i32 0
+  %cmp326 = icmp sgt i32 %k.0, %p1
+  br i1 %cmp326, label %for.cond4.preheader, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %for.cond
+  br label %for.body
+
+for.cond4.preheader:                              ; preds = %for.body, %for.cond
+  %k.1.lcssa = phi i32 [ %k.0, %for.cond ], [ %add, %for.body ]
+  %cmp528 = icmp sgt i32 %sub., %p1
+  br i1 %cmp528, label %for.inc14, label %for.body6.preheader
+
+for.body6.preheader:                              ; preds = %for.cond4.preheader
+  br i1 undef, label %for.body6, label %min.iters.checked
+
+min.iters.checked:                                ; preds = %for.body6.preheader
+  br i1 undef, label %for.body6, label %vector.memcheck
+
+vector.memcheck:                                  ; preds = %min.iters.checked
+  %bound1 = icmp ule i32* undef, %scevgep41
+  %memcheck.conflict = and i1 undef, %bound1
+  br i1 %memcheck.conflict, label %for.body6, label %vector.body.preheader
+
+vector.body.preheader:                            ; preds = %vector.memcheck
+  %lcmp.mod = icmp eq i64 undef, 0
+  br i1 %lcmp.mod, label %vector.body.preheader.split, label %vector.body.prol
+
+vector.body.prol:                                 ; preds = %vector.body.prol, %vector.body.preheader
+  %prol.iter.cmp = icmp eq i64 undef, 0
+  br i1 %prol.iter.cmp, label %vector.body.preheader.split, label %vector.body.prol
+
+vector.body.preheader.split:                      ; preds = %vector.body.prol, %vector.body.preheader
+  %tmp13 = icmp ult i64 %tmp8, 24
+  br i1 %tmp13, label %middle.block, label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.body.preheader.split
+  %index = phi i64 [ %index.next.3, %vector.body ], [ 0, %vector.body.preheader.split ]
+  %index.next = add i64 %index, 8
+  %offset.idx.1 = add i64 %tmp12, %index.next
+  %tmp14 = getelementptr inbounds i32, i32* %tmp, i64 %offset.idx.1
+  %tmp15 = bitcast i32* %tmp14 to <4 x i32>*
+  %wide.load.1 = load <4 x i32>, <4 x i32>* %tmp15, align 4
+  %tmp16 = getelementptr inbounds i32, i32* %tmp1, i64 %offset.idx.1
+  %tmp17 = bitcast i32* %tmp16 to <4 x i32>*
+  store <4 x i32> %wide.load.1, <4 x i32>* %tmp17, align 4
+  %index.next.3 = add i64 %index, 32
+  br i1 undef, label %middle.block, label %vector.body
+
+middle.block:                                     ; preds = %vector.body, %vector.body.preheader.split
+  br i1 undef, label %for.inc14, label %for.body6
+
+for.body:                                         ; preds = %for.body, %for.body.preheader
+  %k.127 = phi i32 [ %k.0, %for.body.preheader ], [ %add, %for.body ]
+  %add = add nsw i32 %k.127, 1
+  %tmp18 = load i32, i32* undef, align 4
+  store i32 %tmp18, i32* @b, align 4
+  br i1 undef, label %for.body, label %for.cond4.preheader
+
+for.body6:                                        ; preds = %for.body6, %middle.block, %vector.memcheck, %min.iters.checked, %for.body6.preheader
+  %indvars.iv32 = phi i64 [ undef, %for.body6 ], [ %tmp12, %vector.memcheck ], [ %tmp12, %min.iters.checked ], [ %tmp12, %for.body6.preheader ], [ undef, %middle.block ]
+  %arrayidx8 = getelementptr inbounds i32, i32* %tmp, i64 %indvars.iv32
+  %tmp19 = load i32, i32* %arrayidx8, align 4
+  %arrayidx10 = getelementptr inbounds i32, i32* %tmp1, i64 %indvars.iv32
+  store i32 %tmp19, i32* %arrayidx10, align 4
+  %cmp5 = icmp slt i64 %indvars.iv32, undef
+  br i1 %cmp5, label %for.body6, label %for.inc14
+
+for.inc14:                                        ; preds = %for.body6, %middle.block, %for.cond4.preheader
+  %inc15 = add nuw nsw i32 %c.0, 1
+  %indvar.next = add i32 %indvar, 1
+  br label %for.cond  ; unconditional back-edge to the outer loop header; no block in fn1 returns
+}
--- a/test/CodeGen/X86/new-remat.ll
+++ b/test/CodeGen/X86/new-remat.ll
@ -0,0 +1,70 @@
+; RUN: llc < %s | FileCheck %s
+; Check all spills are rematerialized.
+; CHECK-NOT: Spill
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@b = common global double 0.000000e+00, align 8
+@a = common global i32 0, align 4
+
+; uniform_testdata: calls pow in a prologue loop and in a main loop, then returns the value loaded from @a. Function Attrs: nounwind uwtable
+define i32 @uniform_testdata(i32 %p1) {
+entry:
+  %cmp3 = icmp sgt i32 %p1, 0
+  br i1 %cmp3, label %for.body.preheader, label %for.end
+
+for.body.preheader:                               ; preds = %entry
+  %tmp = add i32 %p1, -1
+  %xtraiter = and i32 %p1, 7  ; low 3 bits of %p1: initial value of the %prol.iter countdown in for.body.prol
+  %lcmp.mod = icmp eq i32 %xtraiter, 0
+  br i1 %lcmp.mod, label %for.body.preheader.split, label %for.body.prol.preheader
+
+for.body.prol.preheader:                          ; preds = %for.body.preheader
+  br label %for.body.prol
+
+for.body.prol:                                    ; preds = %for.body.prol, %for.body.prol.preheader
+  %i.04.prol = phi i32 [ %inc.prol, %for.body.prol ], [ 0, %for.body.prol.preheader ]
+  %prol.iter = phi i32 [ %prol.iter.sub, %for.body.prol ], [ %xtraiter, %for.body.prol.preheader ]
+  %tmp1 = load double, double* @b, align 8
+  %call.prol = tail call double @pow(double %tmp1, double 2.500000e-01)
+  %inc.prol = add nuw nsw i32 %i.04.prol, 1
+  %prol.iter.sub = add i32 %prol.iter, -1
+  %prol.iter.cmp = icmp eq i32 %prol.iter.sub, 0
+  br i1 %prol.iter.cmp, label %for.body.preheader.split.loopexit, label %for.body.prol
+
+for.body.preheader.split.loopexit:                ; preds = %for.body.prol
+  %inc.prol.lcssa = phi i32 [ %inc.prol, %for.body.prol ]
+  br label %for.body.preheader.split
+
+for.body.preheader.split:                         ; preds = %for.body.preheader.split.loopexit, %for.body.preheader
+  %i.04.unr = phi i32 [ 0, %for.body.preheader ], [ %inc.prol.lcssa, %for.body.preheader.split.loopexit ]
+  %tmp2 = icmp ult i32 %tmp, 7  ; when %p1 - 1 is below 7 (unsigned) the for.body loop is bypassed
+  br i1 %tmp2, label %for.end.loopexit, label %for.body.preheader.split.split
+
+for.body.preheader.split.split:                   ; preds = %for.body.preheader.split
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %for.body.preheader.split.split
+  %i.04 = phi i32 [ %i.04.unr, %for.body.preheader.split.split ], [ %inc.7, %for.body ]
+  %tmp3 = load double, double* @b, align 8
+  %call = tail call double @pow(double %tmp3, double 2.500000e-01)
+  %tmp4 = load double, double* @b, align 8
+  %call.1 = tail call double @pow(double %tmp4, double 2.500000e-01)
+  %inc.7 = add nsw i32 %i.04, 8  ; the counter advances by 8 per iteration of this loop
+  %exitcond.7 = icmp eq i32 %inc.7, %p1
+  br i1 %exitcond.7, label %for.end.loopexit.unr-lcssa, label %for.body
+
+for.end.loopexit.unr-lcssa:                       ; preds = %for.body
+  br label %for.end.loopexit
+
+for.end.loopexit:                                 ; preds = %for.end.loopexit.unr-lcssa, %for.body.preheader.split
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  %tmp5 = load i32, i32* @a, align 4
+  ret i32 %tmp5
+}
+
+; Function Attrs: nounwind
+declare double @pow(double, double)
--- a/test/CodeGen/X86/ragreedy-hoist-spill.ll
+++ b/test/CodeGen/X86/ragreedy-hoist-spill.ll
@ -1,7 +1,7 @@
 ; RUN: llc < %s -mtriple=x86_64-apple-macosx -regalloc=greedy | FileCheck %s

 ; This testing case is reduced from 254.gap SyFgets function.
-; We make sure a spill is not hoisted to a hotter outer loop.
+; We make sure a spill is hoisted to a cold BB inside the hotter outer loop.

 %struct.TMP.1 = type { %struct.TMP.2*, %struct.TMP.2*, [1024 x i8] }
 %struct.TMP.2 = type { i8*, i32, i32, i16, i16, %struct.TMP.3, i32, i8*, i32 (i8*)*, i32 (i8*, i8*, i32)*, i64 (i8*, i64, i32)*, i32 (i8*, i8*, i32)*, %struct.TMP.3, %struct.TMP.4*, i32, [3 x i8], [1 x i8], %struct.TMP.3, i32, i64 }
@ -181,6 +181,10 @@ sw.bb474:
  br i1 %cmp476, label %if.end517, label %do.body479.preheader

 do.body479.preheader:
+  ; CHECK: do.body479.preheader
+  ; The spill is hoisted here. Although the depth-1 loop is even hotter than the depth-2 loop, do.body479.preheader is cold.
+  ; CHECK: movq %r{{.*}}, {{[0-9]+}}(%rsp)
+  ; CHECK: land.rhs485
  %cmp4833314 = icmp eq i8 undef, 0
  br i1 %cmp4833314, label %if.end517, label %land.rhs485

@ -200,8 +204,8 @@ land.lhs.true490:

 lor.rhs500:
  ; CHECK: lor.rhs500
-  ; Make sure that we don't hoist the spill to outer loops.
-  ; CHECK: movq %r{{.*}}, {{[0-9]+}}(%rsp)
+  ; Make sure the spill is hoisted to a cold preheader in the outer loop.
+  ; CHECK-NOT: movq %r{{.*}}, {{[0-9]+}}(%rsp)
  ; CHECK: callq {{.*}}maskrune
  %call3.i.i2792 = call i32 @__maskrune(i32 undef, i64 256)
  br i1 undef, label %land.lhs.true504, label %do.body479.backedge