[ARM] Alter t2DoLoopStart to define lr

This changes the definition of t2DoLoopStart from "t2DoLoopStart rGPR" to "GPRlr = t2DoLoopStart rGPR". This will hopefully mean that low overhead loops are more tied together, and we can more reliably generate loops without reverting or being at the whims of the register allocator. This is a fairly simple change in itself, but leads to a number of other required alterations. - The hardware loop pass, if UsePhi is set, now generates loops of the form: %start = llvm.start.loop.iterations(%N) loop: %p = phi [%start], [%dec] %dec = llvm.loop.decrement.reg(%p, 1) %c = icmp ne %dec, 0 br %c, loop, exit - For this a new llvm.start.loop.iterations intrinsic was added, identical to llvm.set.loop.iterations but produces a value as seen above, gluing the loop together more through def-use chains. - This new intrinsic conceptually produces the same output as input, which is taught to SCEV so that the checks in MVETailPredication are not affected. - Some minor changes are needed to the ARMLowOverheadLoops pass, but it has been left mostly as before. We should now more reliably be able to tell that the t2DoLoopStart is correct without having to prove it, but t2WhileLoopStart and tail-predicated loops will remain the same. - And all the tests have been updated. There are a lot of them! This patch on its own might cause more trouble than it helps, with more tail-predicated loops being reverted, but some additional patches can hopefully improve upon that to get to something that is better overall. Differential Revision: https://reviews.llvm.org/D89881
2025-01-31 20:51:52 +01:00 · 2020-11-10 15:57:58 +00:00 · 2020-11-10 15:57:58 +00:00 · 0773b05cfa
commit 0773b05cfa
parent 13bea66c92
145 changed files with 2166 additions and 2227 deletions
--- a/docs/LangRef.rst
+++ b/docs/LangRef.rst
@ -15502,6 +15502,45 @@ on their operand. It's a hint to the backend that can use this to set up the
 hardware-loop count with a target specific instruction, usually a move of this
 value to a special register or a hardware-loop instruction.

+
+'``llvm.start.loop.iterations.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+This is an overloaded intrinsic.
+
+::
+
+      declare i32 @llvm.start.loop.iterations.i32(i32)
+      declare i64 @llvm.start.loop.iterations.i64(i64)
+
+Overview:
+"""""""""
+
+The '``llvm.start.loop.iterations.*``' intrinsics are similar to the
+'``llvm.set.loop.iterations.*``' intrinsics, used to specify the
+hardware-loop trip count but also produce a value identical to the input
+that can be used as the input to the loop. They are placed in the loop
+preheader basic block and the output is expected to be the input to the
+phi for the induction variable of the loop, decremented by the
+'``llvm.loop.decrement.reg.*``' intrinsic.
+
+Arguments:
+""""""""""
+
+The integer operand is the loop trip count of the hardware-loop, and thus
+not e.g. the loop back-edge taken count.
+
+Semantics:
+""""""""""
+
+The '``llvm.start.loop.iterations.*``' intrinsics do not perform any arithmetic
+on their operand. It's a hint to the backend that can use this to set up the
+hardware-loop count with a target specific instruction, usually a move of this
+value to a special register or a hardware-loop instruction.
+
 '``llvm.test.set.loop.iterations.*``' Intrinsic
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

--- a/include/llvm/IR/Intrinsics.td
+++ b/include/llvm/IR/Intrinsics.td
@ -1576,6 +1576,11 @@ def int_matrix_column_major_store
 def int_set_loop_iterations :
  DefaultAttrsIntrinsic<[], [llvm_anyint_ty], [IntrNoDuplicate]>;

+// Same as the above, but produces a value (the same as the input operand) to
+// be fed into the loop.
+def int_start_loop_iterations :
+  DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>], [IntrNoDuplicate]>;
+
 // Specify that the value given is the number of iterations that the next loop
 // will execute. Also test that the given count is not zero, allowing it to
 // control entry to a 'while' loop.
--- a/lib/Analysis/ScalarEvolution.cpp
+++ b/lib/Analysis/ScalarEvolution.cpp
@ -6672,6 +6672,10 @@ const SCEV *ScalarEvolution::createSCEV(Value *V) {
        const SCEV *ClampedX = getUMinExpr(X, getNotSCEV(Y));
        return getAddExpr(ClampedX, Y, SCEV::FlagNUW);
      }
+      case Intrinsic::start_loop_iterations:
+        // A start_loop_iterations is just equivalent to the first operand for
+        // SCEV purposes.
+        return getSCEV(II->getArgOperand(0));
      default:
        break;
      }
--- a/lib/CodeGen/HardwareLoops.cpp
+++ b/lib/CodeGen/HardwareLoops.cpp
@ -165,7 +165,7 @@ namespace {
    Value *InitLoopCount();

    // Insert the set_loop_iteration intrinsic.
-    void InsertIterationSetup(Value *LoopCountInit);
+    Value *InsertIterationSetup(Value *LoopCountInit);

    // Insert the loop_decrement intrinsic.
    void InsertLoopDec();
@ -325,11 +325,11 @@ void HardwareLoop::Create() {
    return;
  }

-  InsertIterationSetup(LoopCountInit);
+  Value *Setup = InsertIterationSetup(LoopCountInit);

  if (UsePHICounter || ForceHardwareLoopPHI) {
    Instruction *LoopDec = InsertLoopRegDec(LoopCountInit);
-    Value *EltsRem = InsertPHICounter(LoopCountInit, LoopDec);
+    Value *EltsRem = InsertPHICounter(Setup, LoopDec);
    LoopDec->setOperand(0, EltsRem);
    UpdateBranch(LoopDec);
  } else
@ -437,11 +437,13 @@ Value *HardwareLoop::InitLoopCount() {
  return Count;
 }

-void HardwareLoop::InsertIterationSetup(Value *LoopCountInit) {
+Value* HardwareLoop::InsertIterationSetup(Value *LoopCountInit) {
  IRBuilder<> Builder(BeginBB->getTerminator());
  Type *Ty = LoopCountInit->getType();
-  Intrinsic::ID ID = UseLoopGuard ?
-    Intrinsic::test_set_loop_iterations : Intrinsic::set_loop_iterations;
+  bool UsePhi = UsePHICounter || ForceHardwareLoopPHI;
+  Intrinsic::ID ID = UseLoopGuard ? Intrinsic::test_set_loop_iterations
+                                  : (UsePhi ? Intrinsic::start_loop_iterations
+                                           : Intrinsic::set_loop_iterations);
  Function *LoopIter = Intrinsic::getDeclaration(M, ID, Ty);
  Value *SetCount = Builder.CreateCall(LoopIter, LoopCountInit);

@ -457,6 +459,7 @@ void HardwareLoop::InsertIterationSetup(Value *LoopCountInit) {
  }
  LLVM_DEBUG(dbgs() << "HWLoops: Inserted loop counter: "
             << *SetCount << "\n");
+  return UseLoopGuard ? LoopCountInit : SetCount;
 }

 void HardwareLoop::InsertLoopDec() {
--- a/lib/Target/ARM/ARMInstrThumb2.td
+++ b/lib/Target/ARM/ARMInstrThumb2.td
@ -5420,9 +5420,11 @@ def t2LE : t2LOL<(outs ), (ins lelabel_u11:$label), "le", "$label"> {
  let isTerminator = 1;
 }

+let Predicates = [IsThumb2, HasV8_1MMainline, HasLOB] in {
+
 def t2DoLoopStart :
-  t2PseudoInst<(outs), (ins rGPR:$elts), 4, IIC_Br,
-  [(int_set_loop_iterations rGPR:$elts)]>, Sched<[WriteBr]>;
+  t2PseudoInst<(outs GPRlr:$X), (ins rGPR:$elts), 4, IIC_Br,
+  [(set GPRlr:$X, (int_start_loop_iterations rGPR:$elts))]>;

 let hasSideEffects = 0 in
 def t2LoopDec :
@ -5444,6 +5446,8 @@ def t2LoopEnd :

 } // end isBranch, isTerminator, hasSideEffects

+}
+
 } // end isNotDuplicable

 class CS<string iname, bits<4> opcode, list<dag> pattern=[]>
--- a/lib/Target/ARM/ARMLowOverheadLoops.cpp
+++ b/lib/Target/ARM/ARMLowOverheadLoops.cpp
@ -429,7 +429,10 @@ namespace {
    // Return the operand for the loop start instruction. This will be the loop
    // iteration count, or the number of elements if we're tail predicating.
    MachineOperand &getLoopStartOperand() {
-      return IsTailPredicationLegal() ? TPNumElements : Start->getOperand(0);
+      if (IsTailPredicationLegal())
+        return TPNumElements;
+      return Start->getOpcode() == ARM::t2DoLoopStart ? Start->getOperand(1)
+                                                      : Start->getOperand(0);
    }

    unsigned getStartOpcode() const {
@ -495,6 +498,7 @@ namespace {
    bool RevertNonLoops();

    void RevertWhile(MachineInstr *MI) const;
+    void RevertDo(MachineInstr *MI) const;

    bool RevertLoopDec(MachineInstr *MI) const;

@ -618,8 +622,12 @@ bool LowOverheadLoop::ValidateTailPredicate() {
  // count instead of iteration count, won't affect any other instructions
  // than the LoopStart and LoopDec.
  // TODO: We should try to insert the [W|D]LSTP after any of the other uses.
-  if (StartInsertPt == Start && Start->getOperand(0).getReg() == ARM::LR) {
-    if (auto *IterCount = RDA.getMIOperand(Start, 0)) {
+  Register StartReg = Start->getOpcode() == ARM::t2DoLoopStart
+                          ? Start->getOperand(1).getReg()
+                          : Start->getOperand(0).getReg();
+  if (StartInsertPt == Start && StartReg == ARM::LR) {
+    if (auto *IterCount = RDA.getMIOperand(
+            Start, Start->getOpcode() == ARM::t2DoLoopStart ? 1 : 0)) {
      SmallPtrSet<MachineInstr *, 2> Uses;
      RDA.getGlobalUses(IterCount, MCRegister::from(ARM::LR), Uses);
      for (auto *Use : Uses) {
@ -1053,53 +1061,15 @@ void LowOverheadLoop::Validate(ARMBasicBlockUtils *BBUtils) {
                                    MachineBasicBlock *&InsertBB,
                                    ReachingDefAnalysis &RDA,
                                    InstSet &ToRemove) {
-    // We can define LR because LR already contains the same value.
-    if (Start->getOperand(0).getReg() == ARM::LR) {
+    // For a t2DoLoopStart it is always valid to use the start insertion point.
+    // For WLS we can define LR if LR already contains the same value.
+    if (Start->getOpcode() == ARM::t2DoLoopStart ||
+        Start->getOperand(0).getReg() == ARM::LR) {
      InsertPt = MachineBasicBlock::iterator(Start);
      InsertBB = Start->getParent();
      return true;
    }

-    Register CountReg = Start->getOperand(0).getReg();
-    auto IsMoveLR = [&CountReg](MachineInstr *MI) {
-      return MI->getOpcode() == ARM::tMOVr &&
-             MI->getOperand(0).getReg() == ARM::LR &&
-             MI->getOperand(1).getReg() == CountReg &&
-             MI->getOperand(2).getImm() == ARMCC::AL;
-    };
-
-    // Find an insertion point:
-    // - Is there a (mov lr, Count) before Start? If so, and nothing else
-    //   writes to Count before Start, we can insert at start.
-    if (auto *LRDef =
-            RDA.getUniqueReachingMIDef(Start, MCRegister::from(ARM::LR))) {
-      if (IsMoveLR(LRDef) &&
-          RDA.hasSameReachingDef(Start, LRDef, CountReg.asMCReg())) {
-        SmallPtrSet<MachineInstr *, 2> Ignore = { Dec };
-        if (!TryRemove(LRDef, RDA, ToRemove, Ignore))
-          return false;
-        InsertPt = MachineBasicBlock::iterator(Start);
-        InsertBB = Start->getParent();
-        return true;
-      }
-    }
-
-    // - Is there a (mov lr, Count) after Start? If so, and nothing else writes
-    //   to Count after Start, we can insert at that mov (which will now be
-    //   dead).
-    MachineBasicBlock *MBB = Start->getParent();
-    if (auto *LRDef =
-            RDA.getLocalLiveOutMIDef(MBB, MCRegister::from(ARM::LR))) {
-      if (IsMoveLR(LRDef) && RDA.hasSameReachingDef(Start, LRDef, CountReg)) {
-        SmallPtrSet<MachineInstr *, 2> Ignore = { Start, Dec };
-        if (!TryRemove(LRDef, RDA, ToRemove, Ignore))
-          return false;
-        InsertPt = MachineBasicBlock::iterator(LRDef);
-        InsertBB = LRDef->getParent();
-        return true;
-      }
-    }
-
    // We've found no suitable LR def and Start doesn't use LR directly. Can we
    // just define LR anyway?
    if (!RDA.isSafeToDefRegAt(Start, MCRegister::from(ARM::LR)))
@ -1364,6 +1334,16 @@ void ARMLowOverheadLoops::RevertWhile(MachineInstr *MI) const {
  MI->eraseFromParent();
 }

+void ARMLowOverheadLoops::RevertDo(MachineInstr *MI) const {
+  LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to mov: " << *MI);
+  MachineBasicBlock *MBB = MI->getParent();
+  BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::tMOVr))
+      .add(MI->getOperand(0))
+      .add(MI->getOperand(1))
+      .add(predOps(ARMCC::AL));
+  MI->eraseFromParent();
+}
+
 bool ARMLowOverheadLoops::RevertLoopDec(MachineInstr *MI) const {
  LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to sub: " << *MI);
  MachineBasicBlock *MBB = MI->getParent();
@ -1432,7 +1412,7 @@ void ARMLowOverheadLoops::RevertLoopEnd(MachineInstr *MI, bool SkipCmp) const {
 //
 //   $lr = big-itercount-expression
 //   ..
-//   t2DoLoopStart renamable $lr
+//   $lr = t2DoLoopStart renamable $lr
 //   vector.body:
 //     ..
 //     $vpr = MVE_VCTP32 renamable $r3
@ -1455,7 +1435,8 @@ void ARMLowOverheadLoops::IterationCountDCE(LowOverheadLoop &LoLoop) {

  LLVM_DEBUG(dbgs() << "ARM Loops: Trying DCE on loop iteration count.\n");

-  MachineInstr *Def = RDA->getMIOperand(LoLoop.Start, 0);
+  MachineInstr *Def = RDA->getMIOperand(
+      LoLoop.Start, LoLoop.Start->getOpcode() == ARM::t2DoLoopStart ? 1 : 0);
  if (!Def) {
    LLVM_DEBUG(dbgs() << "ARM Loops: Couldn't find iteration count.\n");
    return;
@ -1634,7 +1615,7 @@ void ARMLowOverheadLoops::Expand(LowOverheadLoop &LoLoop) {
    if (LoLoop.Start->getOpcode() == ARM::t2WhileLoopStart)
      RevertWhile(LoLoop.Start);
    else
-      LoLoop.Start->eraseFromParent();
+      RevertDo(LoLoop.Start);
    bool FlagsAlreadySet = RevertLoopDec(LoLoop.Dec);
    RevertLoopEnd(LoLoop.End, FlagsAlreadySet);
  } else {
@ -1699,7 +1680,7 @@ bool ARMLowOverheadLoops::RevertNonLoops() {
      if (Start->getOpcode() == ARM::t2WhileLoopStart)
        RevertWhile(Start);
      else
-        Start->eraseFromParent();
+        RevertDo(Start);
    }
    for (auto *Dec : Decs)
      RevertLoopDec(Dec);
--- a/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/lib/Target/ARM/ARMTargetTransformInfo.cpp
@ -1679,7 +1679,7 @@ bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
      switch (Call->getIntrinsicID()) {
      default:
        break;
-      case Intrinsic::set_loop_iterations:
+      case Intrinsic::start_loop_iterations:
      case Intrinsic::test_set_loop_iterations:
      case Intrinsic::loop_decrement:
      case Intrinsic::loop_decrement_reg:
--- a/lib/Target/ARM/MVETailPredication.cpp
+++ b/lib/Target/ARM/MVETailPredication.cpp
@ -188,7 +188,7 @@ bool MVETailPredication::runOnLoop(Loop *L, LPPassManager&) {
        continue;

      Intrinsic::ID ID = Call->getIntrinsicID();
-      if (ID == Intrinsic::set_loop_iterations ||
+      if (ID == Intrinsic::start_loop_iterations ||
          ID == Intrinsic::test_set_loop_iterations)
        return cast<IntrinsicInst>(&I);
    }
--- a/test/CodeGen/ARM/machine-outliner-unoutlinable.mir
+++ b/test/CodeGen/ARM/machine-outliner-unoutlinable.mir
@ -152,7 +152,7 @@ body:             |
    $q5 = MVE_VDUP32 $r3, 0, $noreg, $q5
    $q4 = MVE_VDUP32 $r4, 0, $noreg, $q4
    $q0 = MVE_VADDf32 $q4, $q5, 0, $noreg, $q0
-    t2DoLoopStart $r4
+    $lr = t2DoLoopStart $r4
    $r0 = MVE_VMOV_from_lane_32 renamable $q0, 1, 14, $noreg
    tBL 14, $noreg, @z
  bb.1:
@ -160,7 +160,7 @@ body:             |
    $q5 = MVE_VDUP32 $r3, 0, $noreg, $q5
    $q4 = MVE_VDUP32 $r4, 0, $noreg, $q4
    $q0 = MVE_VADDf32 $q4, $q5, 0, $noreg, $q0
-    t2DoLoopStart $r4
+    $lr = t2DoLoopStart $r4
    $r0 = MVE_VMOV_from_lane_32 renamable $q0, 1, 14, $noreg
    tBL 14, $noreg, @z
  bb.2:
--- a/test/CodeGen/Thumb2/LowOverheadLoops/branch-targets.ll
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/branch-targets.ll
@ -21,7 +21,7 @@
 ; CHECK-END:   b .LBB0_2
 define void @check_loop_dec_brcond_combine(i32* nocapture %a, i32* nocapture readonly %b, i32* nocapture readonly %c, i32 %N) {
 entry:
-  call void @llvm.set.loop.iterations.i32(i32 %N)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 %N)
  br label %for.body.preheader

 for.body.preheader:
@ -49,7 +49,7 @@ for.header:
  %lsr.iv9 = phi i32* [ %scevgep8, %for.body.preheader ], [ %scevgep10, %for.body ]
  %lsr.iv5 = phi i32* [ %scevgep4, %for.body.preheader ], [ %scevgep6, %for.body ]
  %lsr.iv1 = phi i32* [ %scevgep, %for.body.preheader ], [ %scevgep2, %for.body ]
-  %count = phi i32 [ %N, %for.body.preheader ], [ %count.next, %for.body ]
+  %count = phi i32 [ %start, %for.body.preheader ], [ %count.next, %for.body ]
  br label %for.body

 for.cond.cleanup:
@ -64,7 +64,7 @@ for.cond.cleanup:
 ; CHECK-MID:   tB %bb.2
 define void @check_loop_dec_ugt_brcond_combine(i32* nocapture %a, i32* nocapture readonly %b, i32* nocapture readonly %c, i32 %N) {
 entry:
-  call void @llvm.set.loop.iterations.i32(i32 %N)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 %N)
  br label %for.body.preheader

 for.body.preheader:
@ -92,7 +92,7 @@ for.header:
  %lsr.iv9 = phi i32* [ %scevgep8, %for.body.preheader ], [ %scevgep10, %for.body ]
  %lsr.iv5 = phi i32* [ %scevgep4, %for.body.preheader ], [ %scevgep6, %for.body ]
  %lsr.iv1 = phi i32* [ %scevgep, %for.body.preheader ], [ %scevgep2, %for.body ]
-  %count = phi i32 [ %N, %for.body.preheader ], [ %count.next, %for.body ]
+  %count = phi i32 [ %start, %for.body.preheader ], [ %count.next, %for.body ]
  br label %for.body

 for.cond.cleanup:
@ -107,7 +107,7 @@ for.cond.cleanup:
 ; CHECK-MID:   tB %bb.2
 define void @check_loop_dec_ult_brcond_combine(i32* nocapture %a, i32* nocapture readonly %b, i32* nocapture readonly %c, i32 %N) {
 entry:
-  call void @llvm.set.loop.iterations.i32(i32 %N)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 %N)
  br label %for.body.preheader

 for.body.preheader:
@ -135,7 +135,7 @@ for.header:
  %lsr.iv9 = phi i32* [ %scevgep8, %for.body.preheader ], [ %scevgep10, %for.body ]
  %lsr.iv5 = phi i32* [ %scevgep4, %for.body.preheader ], [ %scevgep6, %for.body ]
  %lsr.iv1 = phi i32* [ %scevgep, %for.body.preheader ], [ %scevgep2, %for.body ]
-  %count = phi i32 [ %N, %for.body.preheader ], [ %count.next, %for.body ]
+  %count = phi i32 [ %start, %for.body.preheader ], [ %count.next, %for.body ]
  br label %for.body

 for.cond.cleanup:
@ -150,7 +150,7 @@ for.cond.cleanup:
 ; CHECK-MID:   tB %bb.2
 define void @check_loop_dec_ult_xor_brcond_combine(i32* nocapture %a, i32* nocapture readonly %b, i32* nocapture readonly %c, i32 %N) {
 entry:
-  call void @llvm.set.loop.iterations.i32(i32 %N)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 %N)
  br label %for.body.preheader

 for.body.preheader:
@ -179,7 +179,7 @@ for.header:
  %lsr.iv9 = phi i32* [ %scevgep8, %for.body.preheader ], [ %scevgep10, %for.body ]
  %lsr.iv5 = phi i32* [ %scevgep4, %for.body.preheader ], [ %scevgep6, %for.body ]
  %lsr.iv1 = phi i32* [ %scevgep, %for.body.preheader ], [ %scevgep2, %for.body ]
-  %count = phi i32 [ %N, %for.body.preheader ], [ %count.next, %for.body ]
+  %count = phi i32 [ %start, %for.body.preheader ], [ %count.next, %for.body ]
  br label %for.body

 for.cond.cleanup:
@ -194,7 +194,7 @@ for.cond.cleanup:
 ; CHECK-MID:   tB %bb.2
 define void @check_loop_dec_sgt_brcond_combine(i32* nocapture %a, i32* nocapture readonly %b, i32* nocapture readonly %c, i32 %N) {
 entry:
-  call void @llvm.set.loop.iterations.i32(i32 %N)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 %N)
  br label %for.body.preheader

 for.body.preheader:
@ -222,7 +222,7 @@ for.header:
  %lsr.iv9 = phi i32* [ %scevgep8, %for.body.preheader ], [ %scevgep10, %for.body ]
  %lsr.iv5 = phi i32* [ %scevgep4, %for.body.preheader ], [ %scevgep6, %for.body ]
  %lsr.iv1 = phi i32* [ %scevgep, %for.body.preheader ], [ %scevgep2, %for.body ]
-  %count = phi i32 [ %N, %for.body.preheader ], [ %count.next, %for.body ]
+  %count = phi i32 [ %start, %for.body.preheader ], [ %count.next, %for.body ]
  br label %for.body

 for.cond.cleanup:
@ -237,7 +237,7 @@ for.cond.cleanup:
 ; CHECK-MID:   tB %bb.2
 define void @check_loop_dec_sge_brcond_combine(i32* nocapture %a, i32* nocapture readonly %b, i32* nocapture readonly %c, i32 %N) {
 entry:
-  call void @llvm.set.loop.iterations.i32(i32 %N)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 %N)
  br label %for.body.preheader

 for.body.preheader:
@ -265,7 +265,7 @@ for.header:
  %lsr.iv9 = phi i32* [ %scevgep8, %for.body.preheader ], [ %scevgep10, %for.body ]
  %lsr.iv5 = phi i32* [ %scevgep4, %for.body.preheader ], [ %scevgep6, %for.body ]
  %lsr.iv1 = phi i32* [ %scevgep, %for.body.preheader ], [ %scevgep2, %for.body ]
-  %count = phi i32 [ %N, %for.body.preheader ], [ %count.next, %for.body ]
+  %count = phi i32 [ %start, %for.body.preheader ], [ %count.next, %for.body ]
  br label %for.body

 for.cond.cleanup:
@ -280,7 +280,7 @@ for.cond.cleanup:
 ; CHECK-MID:   tB %bb.2
 define void @check_loop_dec_sge_xor_brcond_combine(i32* nocapture %a, i32* nocapture readonly %b, i32* nocapture readonly %c, i32 %N) {
 entry:
-  call void @llvm.set.loop.iterations.i32(i32 %N)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 %N)
  br label %for.body.preheader

 for.body.preheader:
@ -309,7 +309,7 @@ for.header:
  %lsr.iv9 = phi i32* [ %scevgep8, %for.body.preheader ], [ %scevgep10, %for.body ]
  %lsr.iv5 = phi i32* [ %scevgep4, %for.body.preheader ], [ %scevgep6, %for.body ]
  %lsr.iv1 = phi i32* [ %scevgep, %for.body.preheader ], [ %scevgep2, %for.body ]
-  %count = phi i32 [ %N, %for.body.preheader ], [ %count.next, %for.body ]
+  %count = phi i32 [ %start, %for.body.preheader ], [ %count.next, %for.body ]
  br label %for.body

 for.cond.cleanup:
@ -324,7 +324,7 @@ for.cond.cleanup:
 ; CHECK-MID:   tB %bb.2
 define void @check_loop_dec_uge_brcond_combine(i32* nocapture %a, i32* nocapture readonly %b, i32* nocapture readonly %c, i32 %N) {
 entry:
-  call void @llvm.set.loop.iterations.i32(i32 %N)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 %N)
  br label %for.body.preheader

 for.body.preheader:
@ -352,7 +352,7 @@ for.header:
  %lsr.iv9 = phi i32* [ %scevgep8, %for.body.preheader ], [ %scevgep10, %for.body ]
  %lsr.iv5 = phi i32* [ %scevgep4, %for.body.preheader ], [ %scevgep6, %for.body ]
  %lsr.iv1 = phi i32* [ %scevgep, %for.body.preheader ], [ %scevgep2, %for.body ]
-  %count = phi i32 [ %N, %for.body.preheader ], [ %count.next, %for.body ]
+  %count = phi i32 [ %start, %for.body.preheader ], [ %count.next, %for.body ]
  br label %for.body

 for.cond.cleanup:
@ -367,7 +367,7 @@ for.cond.cleanup:
 ; CHECK-MID:   tB %bb.2
 define void @check_loop_dec_uge_xor_brcond_combine(i32* nocapture %a, i32* nocapture readonly %b, i32* nocapture readonly %c, i32 %N) {
 entry:
-  call void @llvm.set.loop.iterations.i32(i32 %N)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 %N)
  br label %for.body.preheader

 for.body.preheader:
@ -396,7 +396,7 @@ for.header:
  %lsr.iv9 = phi i32* [ %scevgep8, %for.body.preheader ], [ %scevgep10, %for.body ]
  %lsr.iv5 = phi i32* [ %scevgep4, %for.body.preheader ], [ %scevgep6, %for.body ]
  %lsr.iv1 = phi i32* [ %scevgep, %for.body.preheader ], [ %scevgep2, %for.body ]
-  %count = phi i32 [ %N, %for.body.preheader ], [ %count.next, %for.body ]
+  %count = phi i32 [ %start, %for.body.preheader ], [ %count.next, %for.body ]
  br label %for.body

 for.cond.cleanup:
@ -507,6 +507,6 @@ while.end:
  ret void
 }

-declare void @llvm.set.loop.iterations.i32(i32)
+declare i32 @llvm.start.loop.iterations.i32(i32)
 declare i1 @llvm.test.set.loop.iterations.i32(i32)
 declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
--- a/test/CodeGen/Thumb2/LowOverheadLoops/clear-maskedinsts.ll
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/clear-maskedinsts.ll
@ -17,17 +17,17 @@ define hidden i32 @_Z4loopPiPjiS0_i(i32* noalias nocapture readonly %s1, i32* no
 ; CHECK-NEXT:    [[TMP3:%.*]] = add nuw nsw i32 [[TMP1]], 1
 ; CHECK-NEXT:    br i1 [[TOBOOL]], label [[VECTOR_BODY75_PREHEADER:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.body75.preheader:
-; CHECK-NEXT:    call void @llvm.set.loop.iterations.i32(i32 [[TMP2]])
+; CHECK-NEXT:    [[START1:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[TMP2]])
 ; CHECK-NEXT:    br label [[VECTOR_BODY75:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT71:%.*]] = insertelement <4 x i32> undef, i32 [[X]], i32 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT72:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT71]], <4 x i32> undef, <4 x i32> zeroinitializer
-; CHECK-NEXT:    call void @llvm.set.loop.iterations.i32(i32 [[TMP3]])
+; CHECK-NEXT:    [[START2:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[TMP3]])
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[LSR_IV9:%.*]] = phi i32* [ [[SCEVGEP10:%.*]], [[VECTOR_BODY]] ], [ [[D:%.*]], [[VECTOR_PH]] ]
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP4:%.*]] = phi i32 [ [[TMP3]], [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = phi i32 [ [[START2]], [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP5:%.*]] = phi i32 [ [[N]], [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[LSR_IV911:%.*]] = bitcast i32* [[LSR_IV9]] to <4 x i32>*
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0
@ -48,7 +48,7 @@ define hidden i32 @_Z4loopPiPjiS0_i(i32* noalias nocapture readonly %s1, i32* no
 ; CHECK-NEXT:    [[LSR_IV3:%.*]] = phi i32* [ [[S2:%.*]], [[VECTOR_BODY75_PREHEADER]] ], [ [[SCEVGEP4:%.*]], [[VECTOR_BODY75]] ]
 ; CHECK-NEXT:    [[LSR_IV:%.*]] = phi i32* [ [[D]], [[VECTOR_BODY75_PREHEADER]] ], [ [[SCEVGEP:%.*]], [[VECTOR_BODY75]] ]
 ; CHECK-NEXT:    [[INDEX80:%.*]] = phi i32 [ [[INDEX_NEXT81:%.*]], [[VECTOR_BODY75]] ], [ 0, [[VECTOR_BODY75_PREHEADER]] ]
-; CHECK-NEXT:    [[TMP12:%.*]] = phi i32 [ [[TMP2]], [[VECTOR_BODY75_PREHEADER]] ], [ [[TMP17:%.*]], [[VECTOR_BODY75]] ]
+; CHECK-NEXT:    [[TMP12:%.*]] = phi i32 [ [[START1]], [[VECTOR_BODY75_PREHEADER]] ], [ [[TMP17:%.*]], [[VECTOR_BODY75]] ]
 ; CHECK-NEXT:    [[LSR_IV68:%.*]] = bitcast i32* [[LSR_IV6]] to <4 x i32>*
 ; CHECK-NEXT:    [[LSR_IV35:%.*]] = bitcast i32* [[LSR_IV3]] to <4 x i32>*
 ; CHECK-NEXT:    [[LSR_IV2:%.*]] = bitcast i32* [[LSR_IV]] to <4 x i32>*
@ -88,19 +88,19 @@ for.body.lr.ph:                                   ; preds = %entry
  br i1 %tobool, label %vector.body75.preheader, label %vector.ph

 vector.body75.preheader:                          ; preds = %for.body.lr.ph
-  call void @llvm.set.loop.iterations.i32(i32 %2)
+  %start1 = call i32 @llvm.start.loop.iterations.i32(i32 %2)
  br label %vector.body75

 vector.ph:                                        ; preds = %for.body.lr.ph
  %broadcast.splatinsert71 = insertelement <4 x i32> undef, i32 %x, i32 0
  %broadcast.splat72 = shufflevector <4 x i32> %broadcast.splatinsert71, <4 x i32> undef, <4 x i32> zeroinitializer
-  call void @llvm.set.loop.iterations.i32(i32 %3)
+  %start2 = call i32 @llvm.start.loop.iterations.i32(i32 %3)
  br label %vector.body

 vector.body:                                      ; preds = %vector.body, %vector.ph
  %lsr.iv9 = phi i32* [ %scevgep10, %vector.body ], [ %d, %vector.ph ]
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %4 = phi i32 [ %3, %vector.ph ], [ %8, %vector.body ]
+  %4 = phi i32 [ %start2, %vector.ph ], [ %8, %vector.body ]
  %lsr.iv911 = bitcast i32* %lsr.iv9 to <4 x i32>*
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
@ -120,7 +120,7 @@ vector.body75:                                    ; preds = %vector.body75, %vec
  %lsr.iv3 = phi i32* [ %s2, %vector.body75.preheader ], [ %scevgep4, %vector.body75 ]
  %lsr.iv = phi i32* [ %d, %vector.body75.preheader ], [ %scevgep, %vector.body75 ]
  %index80 = phi i32 [ %index.next81, %vector.body75 ], [ 0, %vector.body75.preheader ]
-  %10 = phi i32 [ %2, %vector.body75.preheader ], [ %15, %vector.body75 ]
+  %10 = phi i32 [ %start1, %vector.body75.preheader ], [ %15, %vector.body75 ]
  %lsr.iv68 = bitcast i32* %lsr.iv6 to <4 x i32>*
  %lsr.iv35 = bitcast i32* %lsr.iv3 to <4 x i32>*
  %lsr.iv2 = bitcast i32* %lsr.iv to <4 x i32>*
@ -148,7 +148,7 @@ for.cond.cleanup:                                 ; preds = %vector.body, %vecto
 declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>)
 declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
 declare <4 x i32> @llvm.usub.sat.v4i32(<4 x i32>, <4 x i32>)
-declare void @llvm.set.loop.iterations.i32(i32)
+declare i32 @llvm.start.loop.iterations.i32(i32)
 declare i32 @llvm.loop.decrement.reg.i32(i32, i32)

 declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
--- a/test/CodeGen/Thumb2/LowOverheadLoops/cmplx_cong.mir
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/cmplx_cong.mir
@ -79,7 +79,7 @@ body:             |
    $r4 = t2MOVTi16 killed $r4, target-flags(arm-hi16) @arm_cmplx_conj_f32_mve.cmplx_conj_sign, 14 /* CC::al */, $noreg
    renamable $q0 = nnan ninf nsz MVE_VLDRWU32 killed renamable $r4, 0, 0, $noreg
    renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r2, 19, 14 /* CC::al */, $noreg, $noreg
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr

  bb.1 (align 4):
    successors: %bb.1(0x7c000000), %bb.2(0x04000000)
--- a/test/CodeGen/Thumb2/LowOverheadLoops/cond-mov.mir
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/cond-mov.mir
@ -9,13 +9,13 @@
  entry:
    %scevgep = getelementptr i32, i32* %q, i32 -1
    %scevgep3 = getelementptr i32, i32* %p, i32 -1
-    call void @llvm.set.loop.iterations.i32(i32 %n)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %n)
    br label %while.body

  while.body:                                       ; preds = %while.body, %entry
    %lsr.iv4 = phi i32* [ %scevgep5, %while.body ], [ %scevgep3, %entry ]
    %lsr.iv = phi i32* [ %scevgep1, %while.body ], [ %scevgep, %entry ]
-    %0 = phi i32 [ %n, %entry ], [ %2, %while.body ]
+    %0 = phi i32 [ %start, %entry ], [ %2, %while.body ]
    %scevgep6 = getelementptr i32, i32* %lsr.iv, i32 1
    %scevgep2 = getelementptr i32, i32* %lsr.iv4, i32 1
    %1 = load i32, i32* %scevgep6, align 4
@ -30,7 +30,7 @@
    ret i32 0
  }

-  declare void @llvm.set.loop.iterations.i32(i32) #0
+  declare i32 @llvm.start.loop.iterations.i32(i32) #0
  declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #0

  attributes #0 = { noduplicate nounwind }
@ -112,7 +112,7 @@ body:             |
    frame-setup CFI_INSTRUCTION offset $lr, -4
    frame-setup CFI_INSTRUCTION offset $r7, -8
    $lr = tMOVr $r0, 14, $noreg
-    t2DoLoopStart killed $r0
+    $lr = t2DoLoopStart killed $r0
    renamable $r0, dead $cpsr = tSUBi3 killed renamable $r1, 4, 14, $noreg
    renamable $r1, dead $cpsr = tSUBi3 killed renamable $r2, 4, 14, $noreg

--- a/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll
@ -15,9 +15,9 @@ define dso_local i32 @vpsel_mul_reduce_add(i32* noalias nocapture readonly %a, i
 ; CHECK-NEXT:    bic r12, r12, #3
 ; CHECK-NEXT:    vmov.i32 q1, #0x0
 ; CHECK-NEXT:    sub.w r12, r12, #4
-; CHECK-NEXT:    add.w lr, lr, r12, lsr #2
+; CHECK-NEXT:    add.w r12, lr, r12, lsr #2
+; CHECK-NEXT:    dls lr, r12
 ; CHECK-NEXT:    mov.w r12, #0
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB0_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    and r4, r12, #15
@ -107,9 +107,9 @@ define dso_local i32 @vpsel_mul_reduce_add_2(i32* noalias nocapture readonly %a,
 ; CHECK-NEXT:    bic r4, r4, #3
 ; CHECK-NEXT:    sub.w lr, r4, #4
 ; CHECK-NEXT:    movs r4, #1
-; CHECK-NEXT:    add.w lr, r4, lr, lsr #2
+; CHECK-NEXT:    add.w r4, r4, lr, lsr #2
+; CHECK-NEXT:    dls lr, r4
 ; CHECK-NEXT:    movs r4, #0
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB1_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    and r5, r4, #15
@ -210,9 +210,9 @@ define dso_local i32 @and_mul_reduce_add(i32* noalias nocapture readonly %a, i32
 ; CHECK-NEXT:    bic r4, r4, #3
 ; CHECK-NEXT:    sub.w lr, r4, #4
 ; CHECK-NEXT:    movs r4, #1
-; CHECK-NEXT:    add.w lr, r4, lr, lsr #2
+; CHECK-NEXT:    add.w r4, r4, lr, lsr #2
+; CHECK-NEXT:    dls lr, r4
 ; CHECK-NEXT:    movs r4, #0
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB2_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vctp.32 r12
@ -309,9 +309,9 @@ define dso_local i32 @or_mul_reduce_add(i32* noalias nocapture readonly %a, i32*
 ; CHECK-NEXT:    bic r4, r4, #3
 ; CHECK-NEXT:    sub.w lr, r4, #4
 ; CHECK-NEXT:    movs r4, #1
-; CHECK-NEXT:    add.w lr, r4, lr, lsr #2
+; CHECK-NEXT:    add.w r4, r4, lr, lsr #2
+; CHECK-NEXT:    dls lr, r4
 ; CHECK-NEXT:    movs r4, #0
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB3_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vctp.32 r12
@ -402,8 +402,8 @@ define dso_local void @continue_on_zero(i32* noalias nocapture %arg, i32* noalia
 ; CHECK-NEXT:    it eq
 ; CHECK-NEXT:    popeq {r7, pc}
 ; CHECK-NEXT:  .LBB4_1: @ %bb3
-; CHECK-NEXT:    movs r3, #0
 ; CHECK-NEXT:    dlstp.32 lr, r2
+; CHECK-NEXT:    movs r3, #0
 ; CHECK-NEXT:  .LBB4_2: @ %bb9
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    adds r3, #4
@ -464,8 +464,8 @@ define dso_local arm_aapcs_vfpcc void @range_test(i32* noalias nocapture %arg, i
 ; CHECK-NEXT:    it eq
 ; CHECK-NEXT:    popeq {r7, pc}
 ; CHECK-NEXT:  .LBB5_1: @ %bb4
-; CHECK-NEXT:    mov.w r12, #0
 ; CHECK-NEXT:    dlstp.32 lr, r3
+; CHECK-NEXT:    mov.w r12, #0
 ; CHECK-NEXT:  .LBB5_2: @ %bb12
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrw.u32 q0, [r0]
--- a/test/CodeGen/Thumb2/LowOverheadLoops/ctlz-non-zeros.mir
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/ctlz-non-zeros.mir
@ -10,11 +10,11 @@
    br i1 %cmp, label %exit, label %loop.ph

  loop.ph:                                          ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %iters)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %iters)
    br label %loop.body

  loop.body:                                        ; preds = %loop.body, %loop.ph
-    %lsr.iv = phi i32 [ %lsr.iv.next, %loop.body ], [ %iters, %loop.ph ]
+    %lsr.iv = phi i32 [ %lsr.iv.next, %loop.body ], [ %start, %loop.ph ]
    %count = phi i32 [ %elts, %loop.ph ], [ %elts.rem, %loop.body ]
    %addr.a = phi <8 x i16>* [ %a, %loop.ph ], [ %addr.a.next, %loop.body ]
    %addr.b = phi <8 x i16>* [ %b, %loop.ph ], [ %addr.b.next, %loop.body ]
@ -46,11 +46,11 @@
    br i1 %cmp, label %exit, label %loop.ph

  loop.ph:                                          ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %iters)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %iters)
    br label %loop.body

  loop.body:                                        ; preds = %loop.body, %loop.ph
-    %lsr.iv = phi i32 [ %lsr.iv.next, %loop.body ], [ %iters, %loop.ph ]
+    %lsr.iv = phi i32 [ %lsr.iv.next, %loop.body ], [ %start, %loop.ph ]
    %count = phi i32 [ %elts, %loop.ph ], [ %elts.rem, %loop.body ]
    %addr.a = phi <4 x i32>* [ %a, %loop.ph ], [ %addr.a.next, %loop.body ]
    %addr.b = phi <4 x i32>* [ %b, %loop.ph ], [ %addr.b.next, %loop.body ]
@ -82,11 +82,11 @@
    br i1 %cmp, label %exit, label %loop.ph

  loop.ph:                                          ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %iters)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %iters)
    br label %loop.body

  loop.body:                                        ; preds = %loop.body, %loop.ph
-    %lsr.iv = phi i32 [ %lsr.iv.next, %loop.body ], [ %iters, %loop.ph ]
+    %lsr.iv = phi i32 [ %lsr.iv.next, %loop.body ], [ %start, %loop.ph ]
    %count = phi i32 [ %elts, %loop.ph ], [ %elts.rem, %loop.body ]
    %addr.a = phi <4 x i32>* [ %a, %loop.ph ], [ %addr.a.next, %loop.body ]
    %addr.b = phi <4 x i32>* [ %b, %loop.ph ], [ %addr.b.next, %loop.body ]
@ -115,7 +115,7 @@
  declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1 immarg)
  declare <8 x i16> @llvm.ctlz.v8i16(<8 x i16>, i1 immarg)
  declare <16 x i8> @llvm.ctlz.v16i8(<16 x i8>, i1 immarg)
-  declare void @llvm.set.loop.iterations.i32(i32)
+  declare i32 @llvm.start.loop.iterations.i32(i32)
  declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
  declare <4 x i1> @llvm.arm.mve.vctp32(i32)
  declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
@ -166,23 +166,23 @@ body:             |
  ; CHECK:   frame-setup CFI_INSTRUCTION offset $r4, -8
  ; CHECK:   tCMPi8 renamable $r3, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr
  ; CHECK:   t2IT 11, 8, implicit-def $itstate
-  ; CHECK:   tPOP_RET 11 /* CC::lt */, killed $cpsr, def $r4, def $pc, implicit killed $itstate
+  ; CHECK:   frame-destroy tPOP_RET 11 /* CC::lt */, killed $cpsr, def $r4, def $pc, implicit killed $itstate
  ; CHECK: bb.1.loop.ph:
  ; CHECK:   successors: %bb.2(0x80000000)
  ; CHECK:   liveins: $r0, $r1, $r2, $r3
-  ; CHECK:   renamable $r12 = t2LDRi12 $sp, 8, 14 /* CC::al */, $noreg :: (load 4 from %fixed-stack.0, align 8)
-  ; CHECK:   dead $lr = t2DLS renamable $r12
-  ; CHECK:   $r4 = tMOVr killed $r12, 14 /* CC::al */, $noreg
+  ; CHECK:   renamable $lr = t2LDRi12 $sp, 8, 14 /* CC::al */, $noreg :: (load 4 from %fixed-stack.0, align 8)
+  ; CHECK:   $lr = t2DLS killed renamable $lr
+  ; CHECK:   $r4 = tMOVr killed $lr, 14 /* CC::al */, $noreg
  ; CHECK: bb.2.loop.body:
  ; CHECK:   successors: %bb.2(0x7c000000), %bb.3(0x04000000)
  ; CHECK:   liveins: $r0, $r1, $r2, $r3, $r4
+  ; CHECK:   $lr = tMOVr $r4, 14 /* CC::al */, $noreg
  ; CHECK:   renamable $vpr = MVE_VCTP16 renamable $r3, 0, $noreg
  ; CHECK:   MVE_VPST 4, implicit $vpr
  ; CHECK:   renamable $r1, renamable $q0 = MVE_VLDRHU16_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.addr.b, align 2)
  ; CHECK:   renamable $q1 = MVE_VLDRHU16 killed renamable $r0, 0, 1, renamable $vpr :: (load 16 from %ir.addr.a, align 2)
-  ; CHECK:   $lr = tMOVr $r4, 14 /* CC::al */, $noreg
-  ; CHECK:   renamable $r4, dead $cpsr = tSUBi8 killed $r4, 1, 14 /* CC::al */, $noreg
  ; CHECK:   renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 8, 14 /* CC::al */, $noreg
+  ; CHECK:   renamable $r4, dead $cpsr = tSUBi8 killed $r4, 1, 14 /* CC::al */, $noreg
  ; CHECK:   renamable $q1 = MVE_VCLZs8 killed renamable $q1, 0, $noreg, undef renamable $q1
  ; CHECK:   $r0 = tMOVr $r1, 14 /* CC::al */, $noreg
  ; CHECK:   renamable $q1 = MVE_VQSHRUNs16th killed renamable $q1, killed renamable $q0, 1, 0, $noreg
@ -190,7 +190,7 @@ body:             |
  ; CHECK:   renamable $r2 = MVE_VSTRHU16_post killed renamable $q1, killed renamable $r2, 16, 1, killed renamable $vpr :: (store 16 into %ir.addr.c, align 2)
  ; CHECK:   dead $lr = t2LEUpdate killed renamable $lr, %bb.2
  ; CHECK: bb.3.exit:
-  ; CHECK:   tPOP_RET 14 /* CC::al */, $noreg, def $r4, def $pc
+  ; CHECK:   frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r4, def $pc
  bb.0.entry:
    successors: %bb.1(0x80000000)
    liveins: $r0, $r1, $r2, $r3, $r4, $lr
@ -201,27 +201,27 @@ body:             |
    frame-setup CFI_INSTRUCTION offset $r4, -8
    tCMPi8 renamable $r3, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr
    t2IT 11, 8, implicit-def $itstate
-    tPOP_RET 11 /* CC::lt */, killed $cpsr, def $r4, def $pc, implicit killed $itstate
+    frame-destroy tPOP_RET 11 /* CC::lt */, killed $cpsr, def $r4, def $pc, implicit killed $itstate

  bb.1.loop.ph:
    successors: %bb.2(0x80000000)
-    liveins: $r0, $r1, $r2, $r3, $r4, $lr
+    liveins: $r0, $r1, $r2, $r3

-    renamable $r12 = t2LDRi12 $sp, 8, 14 /* CC::al */, $noreg :: (load 4 from %fixed-stack.0, align 8)
-    t2DoLoopStart renamable $r12
-    $r4 = tMOVr killed $r12, 14 /* CC::al */, $noreg
+    renamable $lr = t2LDRi12 $sp, 8, 14 /* CC::al */, $noreg :: (load 4 from %fixed-stack.0, align 8)
+    renamable $lr = t2DoLoopStart killed renamable $lr
+    $r4 = tMOVr killed $lr, 14 /* CC::al */, $noreg

  bb.2.loop.body:
    successors: %bb.2(0x7c000000), %bb.3(0x04000000)
    liveins: $r0, $r1, $r2, $r3, $r4

+    $lr = tMOVr $r4, 14 /* CC::al */, $noreg
    renamable $vpr = MVE_VCTP16 renamable $r3, 0, $noreg
    MVE_VPST 4, implicit $vpr
    renamable $r1, renamable $q0 = MVE_VLDRHU16_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.addr.b, align 2)
    renamable $q1 = MVE_VLDRHU16 killed renamable $r0, 0, 1, renamable $vpr :: (load 16 from %ir.addr.a, align 2)
-    $lr = tMOVr $r4, 14 /* CC::al */, $noreg
-    renamable $r4, dead $cpsr = tSUBi8 killed $r4, 1, 14 /* CC::al */, $noreg
    renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 8, 14 /* CC::al */, $noreg
+    renamable $r4, dead $cpsr = tSUBi8 killed $r4, 1, 14 /* CC::al */, $noreg
    renamable $q1 = MVE_VCLZs8 killed renamable $q1, 0, $noreg, undef renamable $q1
    renamable $lr = t2LoopDec killed renamable $lr, 1
    $r0 = tMOVr $r1, 14 /* CC::al */, $noreg
@ -232,7 +232,7 @@ body:             |
    tB %bb.3, 14 /* CC::al */, $noreg

  bb.3.exit:
-    tPOP_RET 14 /* CC::al */, $noreg, def $r4, def $pc
+    frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r4, def $pc

 ...
 ---
@ -267,68 +267,69 @@ body:             |
  ; CHECK-LABEL: name: test_ctlz_i16
  ; CHECK: bb.0.entry:
  ; CHECK:   successors: %bb.1(0x80000000)
-  ; CHECK:   liveins: $lr, $r0, $r1, $r2, $r3, $r4
-  ; CHECK:   frame-setup tPUSH 14 /* CC::al */, $noreg, killed $r4, killed $lr, implicit-def $sp, implicit $sp
+  ; CHECK:   liveins: $lr, $r0, $r1, $r2, $r3, $r4, $r7
+  ; CHECK:   frame-setup tPUSH 14 /* CC::al */, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp
  ; CHECK:   frame-setup CFI_INSTRUCTION def_cfa_offset 8
  ; CHECK:   frame-setup CFI_INSTRUCTION offset $lr, -4
-  ; CHECK:   frame-setup CFI_INSTRUCTION offset $r4, -8
+  ; CHECK:   frame-setup CFI_INSTRUCTION offset $r7, -8
  ; CHECK:   tCMPi8 renamable $r3, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr
  ; CHECK:   t2IT 11, 8, implicit-def $itstate
-  ; CHECK:   tPOP_RET 11 /* CC::lt */, killed $cpsr, def $r4, def $pc, implicit killed $itstate
+  ; CHECK:   frame-destroy tPOP_RET 11 /* CC::lt */, killed $cpsr, def dead $r7, def $pc, implicit killed $itstate
  ; CHECK: bb.1.loop.ph:
  ; CHECK:   successors: %bb.2(0x80000000)
-  ; CHECK:   liveins: $r0, $r1, $r2, $r3
-  ; CHECK:   renamable $r4 = tLDRspi $sp, 2, 14 /* CC::al */, $noreg :: (load 4 from %fixed-stack.0, align 8)
-  ; CHECK:   dead $lr = t2DLS renamable $r4
-  ; CHECK:   $r12 = tMOVr killed $r4, 14 /* CC::al */, $noreg
+  ; CHECK:   liveins: $r0, $r1, $r2, $r3, $r4
+  ; CHECK:   renamable $lr = t2LDRi12 $sp, 8, 14 /* CC::al */, $noreg :: (load 4 from %fixed-stack.0, align 8)
+  ; CHECK:   $lr = t2DLS killed renamable $lr
+  ; CHECK:   $r12 = tMOVr killed $lr, 14 /* CC::al */, $noreg
  ; CHECK: bb.2.loop.body:
  ; CHECK:   successors: %bb.2(0x7c000000), %bb.3(0x04000000)
-  ; CHECK:   liveins: $r0, $r1, $r2, $r3, $r12
-  ; CHECK:   renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg
+  ; CHECK:   liveins: $r0, $r1, $r2, $r3, $r4, $r12
  ; CHECK:   $lr = tMOVr $r12, 14 /* CC::al */, $noreg
+  ; CHECK:   renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg
  ; CHECK:   MVE_VPST 4, implicit $vpr
  ; CHECK:   renamable $r1, renamable $q0 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.addr.b, align 4)
  ; CHECK:   renamable $r0, renamable $q1 = MVE_VLDRWU32_post killed renamable $r0, 16, 1, renamable $vpr :: (load 16 from %ir.addr.a, align 4)
-  ; CHECK:   renamable $r12 = t2SUBri killed $r12, 1, 14 /* CC::al */, $noreg, $noreg
  ; CHECK:   renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14 /* CC::al */, $noreg
+  ; CHECK:   renamable $r12 = t2SUBri killed $r12, 1, 14 /* CC::al */, $noreg, $noreg
  ; CHECK:   renamable $q1 = MVE_VCLZs16 killed renamable $q1, 0, $noreg, undef renamable $q1
  ; CHECK:   renamable $q1 = MVE_VQSHRUNs32th killed renamable $q1, killed renamable $q0, 3, 0, $noreg
  ; CHECK:   MVE_VPST 8, implicit $vpr
  ; CHECK:   renamable $r2 = MVE_VSTRWU32_post killed renamable $q1, killed renamable $r2, 16, 1, killed renamable $vpr :: (store 16 into %ir.addr.c, align 4)
  ; CHECK:   dead $lr = t2LEUpdate killed renamable $lr, %bb.2
  ; CHECK: bb.3.exit:
-  ; CHECK:   tPOP_RET 14 /* CC::al */, $noreg, def $r4, def $pc
+  ; CHECK:   liveins: $r4
+  ; CHECK:   frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def dead $r7, def $pc
  bb.0.entry:
    successors: %bb.1(0x80000000)
-    liveins: $r0, $r1, $r2, $r3, $r4, $lr
+    liveins: $r0, $r1, $r2, $r3, $r7, $lr

-    frame-setup tPUSH 14 /* CC::al */, $noreg, killed $r4, killed $lr, implicit-def $sp, implicit $sp
+    frame-setup tPUSH 14 /* CC::al */, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp
    frame-setup CFI_INSTRUCTION def_cfa_offset 8
    frame-setup CFI_INSTRUCTION offset $lr, -4
-    frame-setup CFI_INSTRUCTION offset $r4, -8
+    frame-setup CFI_INSTRUCTION offset $r7, -8
    tCMPi8 renamable $r3, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr
    t2IT 11, 8, implicit-def $itstate
-    tPOP_RET 11 /* CC::lt */, killed $cpsr, def $r4, def $pc, implicit killed $itstate
+    frame-destroy tPOP_RET 11 /* CC::lt */, killed $cpsr, def $r7, def $pc, implicit killed $itstate

  bb.1.loop.ph:
    successors: %bb.2(0x80000000)
-    liveins: $r0, $r1, $r2, $r3, $r4, $lr
+    liveins: $r0, $r1, $r2, $r3

-    renamable $r4 = tLDRspi $sp, 2, 14 /* CC::al */, $noreg :: (load 4 from %fixed-stack.0, align 8)
-    t2DoLoopStart renamable $r4
-    $r12 = tMOVr killed $r4, 14 /* CC::al */, $noreg
+    renamable $lr = t2LDRi12 $sp, 8, 14 /* CC::al */, $noreg :: (load 4 from %fixed-stack.0, align 8)
+    renamable $lr = t2DoLoopStart killed renamable $lr
+    $r12 = tMOVr killed $lr, 14 /* CC::al */, $noreg

  bb.2.loop.body:
    successors: %bb.2(0x7c000000), %bb.3(0x04000000)
    liveins: $r0, $r1, $r2, $r3, $r12

-    renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg
    $lr = tMOVr $r12, 14 /* CC::al */, $noreg
+    renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg
    MVE_VPST 4, implicit $vpr
    renamable $r1, renamable $q0 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.addr.b, align 4)
    renamable $r0, renamable $q1 = MVE_VLDRWU32_post killed renamable $r0, 16, 1, renamable $vpr :: (load 16 from %ir.addr.a, align 4)
-    renamable $r12 = t2SUBri killed $r12, 1, 14 /* CC::al */, $noreg, $noreg
    renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14 /* CC::al */, $noreg
+    renamable $r12 = t2SUBri killed $r12, 1, 14 /* CC::al */, $noreg, $noreg
    renamable $q1 = MVE_VCLZs16 killed renamable $q1, 0, $noreg, undef renamable $q1
    renamable $lr = t2LoopDec killed renamable $lr, 1
    renamable $q1 = MVE_VQSHRUNs32th killed renamable $q1, killed renamable $q0, 3, 0, $noreg
@ -338,7 +339,7 @@ body:             |
    tB %bb.3, 14 /* CC::al */, $noreg

  bb.3.exit:
-    tPOP_RET 14 /* CC::al */, $noreg, def $r4, def $pc
+    frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc

 ...
 ---
@ -373,68 +374,69 @@ body:             |
  ; CHECK-LABEL: name: test_ctlz_i32
  ; CHECK: bb.0.entry:
  ; CHECK:   successors: %bb.1(0x80000000)
-  ; CHECK:   liveins: $lr, $r0, $r1, $r2, $r3, $r4
-  ; CHECK:   frame-setup tPUSH 14 /* CC::al */, $noreg, killed $r4, killed $lr, implicit-def $sp, implicit $sp
+  ; CHECK:   liveins: $lr, $r0, $r1, $r2, $r3, $r4, $r7
+  ; CHECK:   frame-setup tPUSH 14 /* CC::al */, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp
  ; CHECK:   frame-setup CFI_INSTRUCTION def_cfa_offset 8
  ; CHECK:   frame-setup CFI_INSTRUCTION offset $lr, -4
-  ; CHECK:   frame-setup CFI_INSTRUCTION offset $r4, -8
+  ; CHECK:   frame-setup CFI_INSTRUCTION offset $r7, -8
  ; CHECK:   tCMPi8 renamable $r3, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr
  ; CHECK:   t2IT 11, 8, implicit-def $itstate
-  ; CHECK:   tPOP_RET 11 /* CC::lt */, killed $cpsr, def $r4, def $pc, implicit killed $itstate
+  ; CHECK:   frame-destroy tPOP_RET 11 /* CC::lt */, killed $cpsr, def dead $r7, def $pc, implicit killed $itstate
  ; CHECK: bb.1.loop.ph:
  ; CHECK:   successors: %bb.2(0x80000000)
-  ; CHECK:   liveins: $r0, $r1, $r2, $r3
-  ; CHECK:   renamable $r4 = tLDRspi $sp, 2, 14 /* CC::al */, $noreg :: (load 4 from %fixed-stack.0, align 8)
-  ; CHECK:   dead $lr = t2DLS renamable $r4
-  ; CHECK:   $r12 = tMOVr killed $r4, 14 /* CC::al */, $noreg
+  ; CHECK:   liveins: $r0, $r1, $r2, $r3, $r4
+  ; CHECK:   renamable $lr = t2LDRi12 $sp, 8, 14 /* CC::al */, $noreg :: (load 4 from %fixed-stack.0, align 8)
+  ; CHECK:   $lr = t2DLS killed renamable $lr
+  ; CHECK:   $r12 = tMOVr killed $lr, 14 /* CC::al */, $noreg
  ; CHECK: bb.2.loop.body:
  ; CHECK:   successors: %bb.2(0x7c000000), %bb.3(0x04000000)
-  ; CHECK:   liveins: $r0, $r1, $r2, $r3, $r12
-  ; CHECK:   renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg
+  ; CHECK:   liveins: $r0, $r1, $r2, $r3, $r4, $r12
  ; CHECK:   $lr = tMOVr $r12, 14 /* CC::al */, $noreg
+  ; CHECK:   renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg
  ; CHECK:   MVE_VPST 4, implicit $vpr
  ; CHECK:   renamable $r0, renamable $q0 = MVE_VLDRWU32_post killed renamable $r0, 16, 1, renamable $vpr :: (load 16 from %ir.addr.a, align 4)
  ; CHECK:   renamable $r1, renamable $q1 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.addr.b, align 4)
-  ; CHECK:   renamable $r12 = t2SUBri killed $r12, 1, 14 /* CC::al */, $noreg, $noreg
  ; CHECK:   renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14 /* CC::al */, $noreg
+  ; CHECK:   renamable $r12 = t2SUBri killed $r12, 1, 14 /* CC::al */, $noreg, $noreg
  ; CHECK:   renamable $q1 = MVE_VCLZs32 killed renamable $q1, 0, $noreg, undef renamable $q1
  ; CHECK:   renamable $q0 = MVE_VQSHRUNs32th killed renamable $q0, killed renamable $q1, 3, 0, $noreg
  ; CHECK:   MVE_VPST 8, implicit $vpr
  ; CHECK:   renamable $r2 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r2, 16, 1, killed renamable $vpr :: (store 16 into %ir.addr.c, align 4)
  ; CHECK:   dead $lr = t2LEUpdate killed renamable $lr, %bb.2
  ; CHECK: bb.3.exit:
-  ; CHECK:   tPOP_RET 14 /* CC::al */, $noreg, def $r4, def $pc
+  ; CHECK:   liveins: $r4
+  ; CHECK:   frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def dead $r7, def $pc
  bb.0.entry:
    successors: %bb.1(0x80000000)
-    liveins: $r0, $r1, $r2, $r3, $r4, $lr
+    liveins: $r0, $r1, $r2, $r3, $r7, $lr

-    frame-setup tPUSH 14 /* CC::al */, $noreg, killed $r4, killed $lr, implicit-def $sp, implicit $sp
+    frame-setup tPUSH 14 /* CC::al */, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp
    frame-setup CFI_INSTRUCTION def_cfa_offset 8
    frame-setup CFI_INSTRUCTION offset $lr, -4
-    frame-setup CFI_INSTRUCTION offset $r4, -8
+    frame-setup CFI_INSTRUCTION offset $r7, -8
    tCMPi8 renamable $r3, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr
    t2IT 11, 8, implicit-def $itstate
-    tPOP_RET 11 /* CC::lt */, killed $cpsr, def $r4, def $pc, implicit killed $itstate
+    frame-destroy tPOP_RET 11 /* CC::lt */, killed $cpsr, def $r7, def $pc, implicit killed $itstate

  bb.1.loop.ph:
    successors: %bb.2(0x80000000)
-    liveins: $r0, $r1, $r2, $r3, $r4, $lr
+    liveins: $r0, $r1, $r2, $r3

-    renamable $r4 = tLDRspi $sp, 2, 14 /* CC::al */, $noreg :: (load 4 from %fixed-stack.0, align 8)
-    t2DoLoopStart renamable $r4
-    $r12 = tMOVr killed $r4, 14 /* CC::al */, $noreg
+    renamable $lr = t2LDRi12 $sp, 8, 14 /* CC::al */, $noreg :: (load 4 from %fixed-stack.0, align 8)
+    renamable $lr = t2DoLoopStart killed renamable $lr
+    $r12 = tMOVr killed $lr, 14 /* CC::al */, $noreg

  bb.2.loop.body:
    successors: %bb.2(0x7c000000), %bb.3(0x04000000)
    liveins: $r0, $r1, $r2, $r3, $r12

-    renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg
    $lr = tMOVr $r12, 14 /* CC::al */, $noreg
+    renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg
    MVE_VPST 4, implicit $vpr
    renamable $r0, renamable $q0 = MVE_VLDRWU32_post killed renamable $r0, 16, 1, renamable $vpr :: (load 16 from %ir.addr.a, align 4)
    renamable $r1, renamable $q1 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.addr.b, align 4)
-    renamable $r12 = t2SUBri killed $r12, 1, 14 /* CC::al */, $noreg, $noreg
    renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14 /* CC::al */, $noreg
+    renamable $r12 = t2SUBri killed $r12, 1, 14 /* CC::al */, $noreg, $noreg
    renamable $q1 = MVE_VCLZs32 killed renamable $q1, 0, $noreg, undef renamable $q1
    renamable $lr = t2LoopDec killed renamable $lr, 1
    renamable $q0 = MVE_VQSHRUNs32th killed renamable $q0, killed renamable $q1, 3, 0, $noreg
@ -444,6 +446,6 @@ body:             |
    tB %bb.3, 14 /* CC::al */, $noreg

  bb.3.exit:
-    tPOP_RET 14 /* CC::al */, $noreg, def $r4, def $pc
+    frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc

 ...
--- a/test/CodeGen/Thumb2/LowOverheadLoops/disjoint-vcmp.mir
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/disjoint-vcmp.mir
@ -19,7 +19,7 @@
    br i1 %tmp, label %bb27, label %bb3

  bb3:                                              ; preds = %bb
-    call void @llvm.set.loop.iterations.i32(i32 %tmp6)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp6)
    %scevgep1 = getelementptr i32, i32* %arg3, i32 -4
    br label %bb9

@ -27,7 +27,7 @@
    %lsr.iv4 = phi i32* [ %scevgep6, %bb9 ], [ %scevgep1, %bb3 ]
    %lsr.iv2 = phi i32* [ %scevgep3, %bb9 ], [ %arg1, %bb3 ]
    %lsr.iv = phi i32* [ %scevgep, %bb9 ], [ %arg, %bb3 ]
-    %tmp7 = phi i32 [ %tmp6, %bb3 ], [ %tmp12, %bb9 ]
+    %tmp7 = phi i32 [ %start, %bb3 ], [ %tmp12, %bb9 ]
    %tmp8 = phi i32 [ %arg2, %bb3 ], [ %tmp11, %bb9 ]
    %lsr.iv1 = bitcast i32* %lsr.iv to <4 x i32>*
    %lsr.iv24 = bitcast i32* %lsr.iv2 to <4 x i32>*
@ -56,7 +56,7 @@
  }
  declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
  declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>)
-  declare void @llvm.set.loop.iterations.i32(i32)
+  declare i32 @llvm.start.loop.iterations.i32(i32)
  declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
  declare <4 x i1> @llvm.arm.mve.vctp32(i32)
  declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32)
@ -197,7 +197,7 @@ body:             |
    VSTR_P0_off killed renamable $vpr, $sp, 0, 14, $noreg :: (store 4 into %stack.0)
    renamable $q0 = MVE_VDUP32 killed renamable $r5, 0, $noreg, undef renamable $q0
    $r3 = tMOVr $r0, 14, $noreg
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr

  bb.2.bb9:
    successors: %bb.2(0x7c000000), %bb.3(0x04000000)
--- a/test/CodeGen/Thumb2/LowOverheadLoops/dont-ignore-vctp.mir
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/dont-ignore-vctp.mir
@ -11,14 +11,14 @@
    %2 = sub i32 %0, %smin
    %3 = lshr i32 %2, 2
    %4 = add nuw nsw i32 %3, 1
-    call void @llvm.set.loop.iterations.i32(i32 %4)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %4)
    br label %do.body

  do.body:                                          ; preds = %do.body, %entry
    %blkCnt.0 = phi i32 [ %mul, %entry ], [ %sub, %do.body ]
    %pDst.addr.0 = phi float* [ %pDst, %entry ], [ %add.ptr4, %do.body ]
    %pSrc.addr.0 = phi float* [ %pSrc, %entry ], [ %add.ptr, %do.body ]
-    %5 = phi i32 [ %4, %entry ], [ %9, %do.body ]
+    %5 = phi i32 [ %start, %entry ], [ %9, %do.body ]
    %6 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %blkCnt.0)
    %input_cast = bitcast float* %pSrc.addr.0 to <4 x float>*
    %7 = tail call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %input_cast, i32 4, <4 x i1> %6, <4 x float> undef)
@ -38,7 +38,7 @@
  declare <4 x i1> @llvm.arm.mve.vctp32(i32) #1
  declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32 immarg, <4 x i1>, <4 x float>)
  declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32 immarg, <4 x i1>)
-  declare void @llvm.set.loop.iterations.i32(i32)
+  declare i32 @llvm.start.loop.iterations.i32(i32)
  declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)

 ...
@ -136,7 +136,7 @@ body:             |
    renamable $lr = nuw nsw t2ADDrs killed renamable $r2, killed renamable $r12, 19, 14, $noreg, $noreg
    renamable $r2 = tLEApcrel %const.0, 14, $noreg
    renamable $q0 = MVE_VLDRWU32 killed renamable $r2, 0, 0, $noreg :: (load 16 from constant-pool)
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr

  bb.1.do.body (align 4):
    successors: %bb.1(0x7c000000), %bb.2(0x04000000)
--- a/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update.mir
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update.mir
@ -19,14 +19,14 @@
    br i1 %cmp8, label %vector.ph, label %for.cond.cleanup

  vector.ph:                                        ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %5)
    br label %vector.body

  vector.body:                                      ; preds = %vector.body, %vector.ph
    %lsr.iv17 = phi i32* [ %scevgep18, %vector.body ], [ %A, %vector.ph ]
    %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %C, %vector.ph ]
    %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %vector.ph ]
-    %6 = phi i32 [ %5, %vector.ph ], [ %11, %vector.body ]
+    %6 = phi i32 [ %start, %vector.ph ], [ %11, %vector.body ]
    %7 = phi i32 [ %N, %vector.ph ], [ %9, %vector.body ]
    %lsr.iv13 = bitcast i32* %lsr.iv to <4 x i32>*
    %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
@ -47,7 +47,7 @@
  for.cond.cleanup:                                 ; preds = %vector.body, %entry
    ret void
  }
-  declare void @llvm.set.loop.iterations.i32(i32)
+  declare i32 @llvm.start.loop.iterations.i32(i32)
  declare <4 x i1> @llvm.arm.vctp32(i32)
  declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
  declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture)
@ -162,7 +162,7 @@ body:             |
    renamable $r12 = t2BICri killed renamable $r12, 3, 14, $noreg, $noreg
    renamable $r12 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg
    renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14, $noreg, $noreg
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr

  bb.2.vector.body:
    successors: %bb.2(0x7c000000), %bb.3(0x04000000)
--- a/test/CodeGen/Thumb2/LowOverheadLoops/emptyblock.mir
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/emptyblock.mir
@ -24,14 +24,14 @@
    %5 = sub i32 %3, %smin36
    %6 = lshr i32 %5, 2
    %7 = add nuw nsw i32 %6, 1
-    call void @llvm.set.loop.iterations.i32(i32 %7)
+    %start1 = call i32 @llvm.start.loop.iterations.i32(i32 %7)
    br label %do.body
  
  do.body:                                          ; preds = %do.body, %entry
    %count.0 = phi i32 [ %0, %entry ], [ %12, %do.body ]
    %pInT.0 = phi float* [ %pIn, %entry ], [ %add.ptr, %do.body ]
    %sumVec.0 = phi <4 x float> [ zeroinitializer, %entry ], [ %11, %do.body ]
-    %8 = phi i32 [ %7, %entry ], [ %13, %do.body ]
+    %8 = phi i32 [ %start1, %entry ], [ %13, %do.body ]
    %pInT.033 = bitcast float* %pInT.0 to <4 x float>*
    %9 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %count.0)
    %10 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %pInT.033, i32 4, <4 x i1> %9, <4 x float> zeroinitializer)
@ -125,7 +125,7 @@
    %50 = bitcast float* %arrayidx17 to <4 x float>*
    %51 = load <4 x float>, <4 x float>* %50, align 4
    %52 = fmul fast <4 x float> %51, %40
-    call void @llvm.set.loop.iterations.i32(i32 %33)
+    %start2 = call i32 @llvm.start.loop.iterations.i32(i32 %33)
    br label %do.body24
  
  do.body24:                                        ; preds = %do.body24, %for.body
@ -138,7 +138,7 @@
    %sumVec1.0 = phi <4 x float> [ %46, %for.body ], [ %58, %do.body24 ]
    %sumVec2.0 = phi <4 x float> [ %49, %for.body ], [ %60, %do.body24 ]
    %sumVec3.0 = phi <4 x float> [ %52, %for.body ], [ %62, %do.body24 ]
-    %53 = phi i32 [ %33, %for.body ], [ %63, %do.body24 ]
+    %53 = phi i32 [ %start2, %for.body ], [ %63, %do.body24 ]
    %lsr.iv4 = bitcast float* %lsr.iv to <4 x float>*
    %lsr.iv911 = bitcast float* %lsr.iv9 to <4 x float>*
    %lsr.iv1618 = bitcast float* %lsr.iv16 to <4 x float>*
@ -219,7 +219,7 @@
    %k.1200 = phi i32 [ %inc, %do.end66 ], [ %k.0.lcssa, %for.body56.preheader ]
    %mul57 = mul i32 %k.1200, %0
    %arrayidx58 = getelementptr inbounds float, float* %2, i32 %mul57
-    call void @llvm.set.loop.iterations.i32(i32 %38)
+    %start3 = call i32 @llvm.start.loop.iterations.i32(i32 %38)
    br label %do.body59
  
  do.body59:                                        ; preds = %do.body59, %for.body56
@ -227,7 +227,7 @@
    %pInT.2 = phi float* [ %pIn, %for.body56 ], [ %add.ptr61, %do.body59 ]
    %pCos0.1 = phi float* [ %arrayidx58, %for.body56 ], [ %add.ptr62, %do.body59 ]
    %sumVec.1 = phi <4 x float> [ zeroinitializer, %for.body56 ], [ %93, %do.body59 ]
-    %89 = phi i32 [ %38, %for.body56 ], [ %95, %do.body59 ]
+    %89 = phi i32 [ %start3, %for.body56 ], [ %95, %do.body59 ]
    %pInT.21 = bitcast float* %pInT.2 to <4 x float>*
    %pCos0.12 = bitcast float* %pCos0.1 to <4 x float>*
    %90 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %count.2)
@ -264,7 +264,7 @@
  declare <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>, <4 x float>) #1
  declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) #3
  declare <4 x float> @llvm.arm.mve.fma.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x float>, <4 x i1>) #1
-  declare void @llvm.set.loop.iterations.i32(i32) #4
+  declare i32 @llvm.start.loop.iterations.i32(i32) #4
  declare i32 @llvm.loop.decrement.reg.i32(i32, i32) #4

 ...
@ -414,7 +414,7 @@ body:             |
    $r0 = tMOVr $r4, 14 /* CC::al */, $noreg
    renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0
    $r1 = tMOVr $r5, 14 /* CC::al */, $noreg
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr
  
  bb.1.do.body (align 4):
    successors: %bb.1(0x7c000000), %bb.2(0x04000000)
@ -503,7 +503,7 @@ body:             |
    $r3 = tMOVr $r10, 14 /* CC::al */, $noreg
    $r5 = tMOVr $r1, 14 /* CC::al */, $noreg
    $r4 = tMOVr $r12, 14 /* CC::al */, $noreg
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr
    $r7 = tMOVr $r6, 14 /* CC::al */, $noreg
    renamable $r11 = t2LDRi12 $sp, 16, 14 /* CC::al */, $noreg :: (load 4 from %stack.5)
  
@ -592,7 +592,7 @@ body:             |
    $r6 = tMOVr $r4, 14 /* CC::al */, $noreg
    $r7 = tMOVr $r5, 14 /* CC::al */, $noreg
    $lr = tMOVr $r3, 14 /* CC::al */, $noreg
-    t2DoLoopStart renamable $r3
+    $lr = t2DoLoopStart renamable $r3

  bb.13:
    successors: %bb.10(0x80000000)
--- a/test/CodeGen/Thumb2/LowOverheadLoops/end-positive-offset.mir
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/end-positive-offset.mir
@ -7,7 +7,7 @@

  define void @size_limit(i32* nocapture %a, i32* nocapture readonly %b, i32* nocapture readonly %c, i32 %N) {
  entry:
-    call void @llvm.set.loop.iterations.i32(i32 %N)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %N)
    %scevgep = getelementptr i32, i32* %a, i32 -1
    %scevgep4 = getelementptr i32, i32* %c, i32 -1
    %scevgep8 = getelementptr i32, i32* %b, i32 -1
@ -35,7 +35,7 @@
    %lsr.iv9 = phi i32* [ %scevgep8, %entry ], [ %scevgep10, %for.body ]
    %lsr.iv5 = phi i32* [ %scevgep4, %entry ], [ %scevgep6, %for.body ]
    %lsr.iv1 = phi i32* [ %scevgep, %entry ], [ %scevgep2, %for.body ]
-    %count = phi i32 [ %N, %entry ], [ %count.next, %for.body ]
+    %count = phi i32 [ %start, %entry ], [ %count.next, %for.body ]
    br label %for.body
  }

@ -43,7 +43,7 @@
  declare i32 @llvm.arm.space(i32 immarg, i32) #0

  ; Function Attrs: noduplicate nounwind
-  declare void @llvm.set.loop.iterations.i32(i32) #1
+  declare i32 @llvm.start.loop.iterations.i32(i32) #1

  ; Function Attrs: noduplicate nounwind
  declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #1
@ -184,7 +184,7 @@ body:             |
    frame-setup CFI_INSTRUCTION offset $r7, -8
    $sp = frame-setup tSUBspi $sp, 8, 14, $noreg
    frame-setup CFI_INSTRUCTION def_cfa_offset 40
-    t2DoLoopStart renamable $r3
+    $lr = t2DoLoopStart renamable $r3
    renamable $r0, dead $cpsr = tSUBi8 killed renamable $r0, 4, 14, $noreg
    renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14, $noreg
    renamable $r1, dead $cpsr = tSUBi8 killed renamable $r1, 4, 14, $noreg
--- a/test/CodeGen/Thumb2/LowOverheadLoops/exitcount.ll
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/exitcount.ll
@ -8,21 +8,21 @@ define void @foo(%struct.SpeexPreprocessState_* nocapture readonly %st, i16* %x)
 ; CHECK-NEXT:    .save {r4, lr}
 ; CHECK-NEXT:    push {r4, lr}
 ; CHECK-NEXT:    ldrd r12, r4, [r0]
-; CHECK-NEXT:    ldrd r3, r2, [r0, #8]
+; CHECK-NEXT:    ldrd r2, r3, [r0, #8]
 ; CHECK-NEXT:    rsb r12, r12, r4, lsl #1
+; CHECK-NEXT:    dlstp.16 lr, r12
 ; CHECK-NEXT:    mov r4, r12
-; CHECK-NEXT:    dlstp.16 lr, r4
 ; CHECK-NEXT:  .LBB0_1: @ %do.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vldrh.u16 q0, [r2], #16
-; CHECK-NEXT:    vstrh.16 q0, [r3], #16
+; CHECK-NEXT:    vldrh.u16 q0, [r3], #16
+; CHECK-NEXT:    vstrh.16 q0, [r2], #16
 ; CHECK-NEXT:    letp lr, .LBB0_1
 ; CHECK-NEXT:  @ %bb.2: @ %do.end
-; CHECK-NEXT:    ldr r3, [r0]
+; CHECK-NEXT:    ldr r2, [r0]
 ; CHECK-NEXT:    ldr r0, [r0, #8]
 ; CHECK-NEXT:    vmov.i16 q0, #0x1800
 ; CHECK-NEXT:    add.w r0, r0, r12, lsl #1
-; CHECK-NEXT:    dlstp.16 lr, r3
+; CHECK-NEXT:    dlstp.16 lr, r2
 ; CHECK-NEXT:  .LBB0_3: @ %do.body6
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrh.u16 q1, [r1], #16
--- a/test/CodeGen/Thumb2/LowOverheadLoops/extending-loads.ll
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/extending-loads.ll
@ -9,8 +9,8 @@ define dso_local arm_aapcs_vfpcc void @sext_i8(i16* noalias nocapture %a, i8* no
 ; CHECK-NEXT:    it eq
 ; CHECK-NEXT:    popeq {r7, pc}
 ; CHECK-NEXT:  .LBB0_1: @ %vector.ph
-; CHECK-NEXT:    movs r3, #0
 ; CHECK-NEXT:    dlstp.16 lr, r2
+; CHECK-NEXT:    movs r3, #0
 ; CHECK-NEXT:  .LBB0_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    adds r3, #8
@ -69,8 +69,8 @@ define dso_local arm_aapcs_vfpcc void @zext_i8(i16* noalias nocapture %a, i8* no
 ; CHECK-NEXT:    it eq
 ; CHECK-NEXT:    popeq {r7, pc}
 ; CHECK-NEXT:  .LBB1_1: @ %vector.ph
-; CHECK-NEXT:    movs r3, #0
 ; CHECK-NEXT:    dlstp.16 lr, r2
+; CHECK-NEXT:    movs r3, #0
 ; CHECK-NEXT:  .LBB1_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    adds r3, #8
@ -129,8 +129,8 @@ define dso_local arm_aapcs_vfpcc void @sext_i16(i32* noalias nocapture %a, i16*
 ; CHECK-NEXT:    it eq
 ; CHECK-NEXT:    popeq {r7, pc}
 ; CHECK-NEXT:  .LBB2_1: @ %vector.ph
-; CHECK-NEXT:    movs r3, #0
 ; CHECK-NEXT:    dlstp.32 lr, r2
+; CHECK-NEXT:    movs r3, #0
 ; CHECK-NEXT:  .LBB2_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    adds r3, #4
@ -189,8 +189,8 @@ define dso_local arm_aapcs_vfpcc void @zext_i16(i32* noalias nocapture %a, i16*
 ; CHECK-NEXT:    it eq
 ; CHECK-NEXT:    popeq {r7, pc}
 ; CHECK-NEXT:  .LBB3_1: @ %vector.ph
-; CHECK-NEXT:    movs r3, #0
 ; CHECK-NEXT:    dlstp.32 lr, r2
+; CHECK-NEXT:    movs r3, #0
 ; CHECK-NEXT:  .LBB3_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    adds r3, #4
--- a/test/CodeGen/Thumb2/LowOverheadLoops/extract-element.mir
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/extract-element.mir
@ -17,11 +17,11 @@
    br i1 %cmp9, label %for.cond.cleanup, label %vector.ph

  vector.ph:                                        ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %tmp5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp5)
    br label %vector.body

  vector.body:                                      ; preds = %vector.body, %vector.ph
-    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ]
+    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ]
    %lsr.iv18 = phi i16* [ %scevgep19, %vector.body ], [ %b, %vector.ph ]
    %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
    %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %tmp13, %vector.body ]
@ -52,7 +52,7 @@
    ret i32 %res.0.lcssa
  }
  declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>)
-  declare void @llvm.set.loop.iterations.i32(i32)
+  declare i32 @llvm.start.loop.iterations.i32(i32)
  declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
  declare <4 x i1> @llvm.arm.mve.vctp32(i32)

@ -155,7 +155,7 @@ body:             |
    renamable $r12 = t2SUBri killed renamable $r3, 4, 14, $noreg, $noreg
    renamable $r3, dead $cpsr = tMOVi8 1, 14, $noreg
    renamable $r12 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14, $noreg, $noreg
-    t2DoLoopStart renamable $r12
+    $lr = t2DoLoopStart renamable $r12
    $r3 = tMOVr killed $r12, 14, $noreg

  bb.2.vector.body:
--- a/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll
@ -49,10 +49,10 @@ define arm_aapcs_vfpcc void @fast_float_mul(float* nocapture %a, float* nocaptur
 ; CHECK-NEXT:  .LBB0_6: @ %for.body.preheader.new
 ; CHECK-NEXT:    bic r3, r3, #3
 ; CHECK-NEXT:    subs r3, #4
-; CHECK-NEXT:    add.w lr, r12, r3, lsr #2
-; CHECK-NEXT:    movs r3, #0
+; CHECK-NEXT:    add.w r3, r12, r3, lsr #2
 ; CHECK-NEXT:    mov.w r12, #0
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    dls lr, r3
+; CHECK-NEXT:    movs r3, #0
 ; CHECK-NEXT:  .LBB0_7: @ %for.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    adds r4, r1, r3
@ -228,9 +228,9 @@ define arm_aapcs_vfpcc float @fast_float_mac(float* nocapture readonly %b, float
 ; CHECK-NEXT:    bic r3, r3, #3
 ; CHECK-NEXT:    sub.w r12, r3, #4
 ; CHECK-NEXT:    movs r3, #1
-; CHECK-NEXT:    add.w lr, r3, r12, lsr #2
+; CHECK-NEXT:    add.w r3, r3, r12, lsr #2
+; CHECK-NEXT:    dls lr, r3
 ; CHECK-NEXT:    movs r3, #0
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB1_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vctp.32 r2
@ -321,11 +321,12 @@ define arm_aapcs_vfpcc float @fast_float_half_mac(half* nocapture readonly %b, h
 ; CHECK-NEXT:    bic r3, r3, #3
 ; CHECK-NEXT:    sub.w r12, r3, #4
 ; CHECK-NEXT:    movs r3, #1
-; CHECK-NEXT:    add.w lr, r3, r12, lsr #2
+; CHECK-NEXT:    add.w r3, r3, r12, lsr #2
 ; CHECK-NEXT:    sub.w r12, r2, #1
 ; CHECK-NEXT:    adr r2, .LCPI2_1
-; CHECK-NEXT:    movs r3, #0
+; CHECK-NEXT:    mov lr, r3
 ; CHECK-NEXT:    vldrw.u32 q0, [r2]
+; CHECK-NEXT:    movs r3, #0
 ; CHECK-NEXT:    vdup.32 q1, r12
 ; CHECK-NEXT:    vdup.32 q2, r12
 ; CHECK-NEXT:    vstrw.32 q0, [sp] @ 16-byte Spill
--- a/test/CodeGen/Thumb2/LowOverheadLoops/incorrect-sub-16.mir
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/incorrect-sub-16.mir
@ -13,14 +13,14 @@
    br i1 %cmp8, label %vector.ph, label %for.cond.cleanup

  vector.ph:                                        ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %5)
    br label %vector.body

  vector.body:                                      ; preds = %vector.body, %vector.ph
    %lsr.iv17 = phi i16* [ %scevgep18, %vector.body ], [ %A, %vector.ph ]
    %lsr.iv14 = phi i16* [ %scevgep15, %vector.body ], [ %C, %vector.ph ]
    %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %B, %vector.ph ]
-    %6 = phi i32 [ %5, %vector.ph ], [ %11, %vector.body ]
+    %6 = phi i32 [ %start, %vector.ph ], [ %11, %vector.body ]
    %7 = phi i32 [ %N, %vector.ph ], [ %9, %vector.body ]
    %lsr.iv13 = bitcast i16* %lsr.iv to <8 x i16>*
    %lsr.iv1416 = bitcast i16* %lsr.iv14 to <8 x i16>*
@ -41,7 +41,7 @@
  for.cond.cleanup:                                 ; preds = %vector.body, %entry
    ret void
  }
-  declare void @llvm.set.loop.iterations.i32(i32)
+  declare i32 @llvm.start.loop.iterations.i32(i32)
  declare <8 x i1> @llvm.arm.mve.vctp16(i32)
  declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
  declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32 immarg, <8 x i1>, <8 x i16>)
@ -149,7 +149,7 @@ body:             |
    renamable $r12 = t2BICri killed renamable $r12, 3, 14, $noreg, $noreg
    renamable $r12 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg
    renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14, $noreg, $noreg
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr

  bb.2.vector.body:
    successors: %bb.2(0x7c000000), %bb.3(0x04000000)
--- a/test/CodeGen/Thumb2/LowOverheadLoops/incorrect-sub-32.mir
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/incorrect-sub-32.mir
@ -20,14 +20,14 @@
    br i1 %cmp8, label %vector.ph, label %for.cond.cleanup

  vector.ph:                                        ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %5)
    br label %vector.body

  vector.body:                                      ; preds = %vector.body, %vector.ph
    %lsr.iv17 = phi i32* [ %scevgep18, %vector.body ], [ %A, %vector.ph ]
    %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %C, %vector.ph ]
    %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %vector.ph ]
-    %6 = phi i32 [ %5, %vector.ph ], [ %11, %vector.body ]
+    %6 = phi i32 [ %start, %vector.ph ], [ %11, %vector.body ]
    %7 = phi i32 [ %N, %vector.ph ], [ %9, %vector.body ]
    %lsr.iv13 = bitcast i32* %lsr.iv to <4 x i32>*
    %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
@ -48,7 +48,7 @@
  for.cond.cleanup:                                 ; preds = %vector.body, %entry
    ret void
  }
-  declare void @llvm.set.loop.iterations.i32(i32)
+  declare i32 @llvm.start.loop.iterations.i32(i32)
  declare <4 x i1> @llvm.arm.mve.vctp32(i32)
  declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
  declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
@ -157,7 +157,7 @@ body:             |
    renamable $r12 = t2BICri killed renamable $r12, 3, 14, $noreg, $noreg
    renamable $r12 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg
    renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14, $noreg, $noreg
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr

  bb.2.vector.body:
    successors: %bb.2(0x7c000000), %bb.3(0x04000000)
--- a/test/CodeGen/Thumb2/LowOverheadLoops/incorrect-sub-8.mir
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/incorrect-sub-8.mir
@ -13,14 +13,14 @@
    br i1 %cmp8, label %vector.ph, label %for.cond.cleanup

  vector.ph:                                        ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %5)
    br label %vector.body

  vector.body:                                      ; preds = %vector.body, %vector.ph
    %lsr.iv17 = phi i8* [ %scevgep18, %vector.body ], [ %A, %vector.ph ]
    %lsr.iv14 = phi i8* [ %scevgep15, %vector.body ], [ %C, %vector.ph ]
    %lsr.iv = phi i8* [ %scevgep, %vector.body ], [ %B, %vector.ph ]
-    %6 = phi i32 [ %5, %vector.ph ], [ %11, %vector.body ]
+    %6 = phi i32 [ %start, %vector.ph ], [ %11, %vector.body ]
    %7 = phi i32 [ %N, %vector.ph ], [ %9, %vector.body ]
    %lsr.iv13 = bitcast i8* %lsr.iv to <16 x i8>*
    %lsr.iv1416 = bitcast i8* %lsr.iv14 to <16 x i8>*
@ -41,7 +41,7 @@
  for.cond.cleanup:                                 ; preds = %vector.body, %entry
    ret void
  }
-  declare void @llvm.set.loop.iterations.i32(i32)
+  declare i32 @llvm.start.loop.iterations.i32(i32)
  declare <16 x i1> @llvm.arm.mve.vctp8(i32)
  declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
  declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32 immarg, <16 x i1>, <16 x i8>)
@ -150,7 +150,7 @@ body:             |
    renamable $r12 = t2BICri killed renamable $r12, 3, 14, $noreg, $noreg
    renamable $r12 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg
    renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14, $noreg, $noreg
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr

  bb.2.vector.body:
    successors: %bb.2(0x7c000000), %bb.3(0x04000000)
--- a/test/CodeGen/Thumb2/LowOverheadLoops/inloop-vpnot-1.mir
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/inloop-vpnot-1.mir
@ -16,11 +16,11 @@
    br i1 %cmp9, label %for.cond.cleanup, label %vector.ph

  vector.ph:                                        ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %tmp5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp5)
    br label %vector.body

  vector.body:                                      ; preds = %vector.body, %vector.ph
-    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ]
+    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ]
    %lsr.iv.e = phi i32* [ %scevgep.e, %vector.body ], [ %e, %vector.ph ]
    %lsr.iv.d = phi i16* [ %scevgep.d, %vector.body ], [ %d, %vector.ph ]
    %lsr.iv.c = phi i16* [ %scevgep.c, %vector.body ], [ %c, %vector.ph ]
@ -64,7 +64,7 @@
  }
  declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>) #1
  declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) #2
-  declare void @llvm.set.loop.iterations.i32(i32) #3
+  declare i32 @llvm.start.loop.iterations.i32(i32) #3
  declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3
  declare <4 x i1> @llvm.arm.mve.vctp32(i32) #4

@ -201,7 +201,7 @@ body:             |
    renamable $lr = t2SUBri killed renamable $lr, 4, 14, $noreg, $noreg
    renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0
    renamable $lr = nuw nsw t2ADDrs killed renamable $r4, killed renamable $lr, 19, 14, $noreg, $noreg
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr
    $r4 = tMOVr killed $lr, 14, $noreg

  bb.2.vector.body:
--- a/test/CodeGen/Thumb2/LowOverheadLoops/inloop-vpnot-2.mir
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/inloop-vpnot-2.mir
@ -16,11 +16,11 @@
    br i1 %cmp9, label %for.cond.cleanup, label %vector.ph

  vector.ph:                                        ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %tmp5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp5)
    br label %vector.body

  vector.body:                                      ; preds = %vector.body, %vector.ph
-    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ]
+    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ]
    %lsr.iv.e = phi i32* [ %scevgep.e, %vector.body ], [ %e, %vector.ph ]
    %lsr.iv.d = phi i16* [ %scevgep.d, %vector.body ], [ %d, %vector.ph ]
    %lsr.iv.c = phi i16* [ %scevgep.c, %vector.body ], [ %c, %vector.ph ]
@ -64,7 +64,7 @@
  }
  declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>) #1
  declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) #2
-  declare void @llvm.set.loop.iterations.i32(i32) #3
+  declare i32 @llvm.start.loop.iterations.i32(i32) #3
  declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3
  declare <4 x i1> @llvm.arm.mve.vctp32(i32) #4

@ -201,7 +201,7 @@ body:             |
    renamable $lr = t2SUBri killed renamable $lr, 4, 14, $noreg, $noreg
    renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0
    renamable $lr = nuw nsw t2ADDrs killed renamable $r4, killed renamable $lr, 19, 14, $noreg, $noreg
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr
    $r4 = tMOVr killed $lr, 14, $noreg

  bb.2.vector.body:
--- a/test/CodeGen/Thumb2/LowOverheadLoops/inloop-vpnot-3.mir
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/inloop-vpnot-3.mir
@ -16,11 +16,11 @@
    br i1 %cmp9, label %for.cond.cleanup, label %vector.ph

  vector.ph:                                        ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %tmp5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp5)
    br label %vector.body

  vector.body:                                      ; preds = %vector.body, %vector.ph
-    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ]
+    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ]
    %lsr.iv.e = phi i32* [ %scevgep.e, %vector.body ], [ %e, %vector.ph ]
    %lsr.iv.d = phi i16* [ %scevgep.d, %vector.body ], [ %d, %vector.ph ]
    %lsr.iv.c = phi i16* [ %scevgep.c, %vector.body ], [ %c, %vector.ph ]
@ -64,7 +64,7 @@
  }
  declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>) #1
  declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) #2
-  declare void @llvm.set.loop.iterations.i32(i32) #3
+  declare i32 @llvm.start.loop.iterations.i32(i32) #3
  declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3
  declare <4 x i1> @llvm.arm.mve.vctp32(i32) #4

@ -201,7 +201,7 @@ body:             |
    renamable $lr = t2SUBri killed renamable $lr, 4, 14, $noreg, $noreg
    renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0
    renamable $lr = nuw nsw t2ADDrs killed renamable $r4, killed renamable $lr, 19, 14, $noreg, $noreg
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr
    $r4 = tMOVr killed $lr, 14, $noreg

  bb.2.vector.body:
--- a/test/CodeGen/Thumb2/LowOverheadLoops/inloop-vpsel-1.mir
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/inloop-vpsel-1.mir
@ -16,11 +16,11 @@
    br i1 %cmp9, label %for.cond.cleanup, label %vector.ph

  vector.ph:                                        ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %tmp5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp5)
    br label %vector.body

  vector.body:                                      ; preds = %vector.body, %vector.ph
-    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ]
+    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ]
    %lsr.iv.d = phi i16* [ %scevgep.d, %vector.body ], [ %d, %vector.ph ]
    %lsr.iv.c = phi i16* [ %scevgep.c, %vector.body ], [ %c, %vector.ph ]
    %lsr.iv18 = phi i16* [ %scevgep19, %vector.body ], [ %b, %vector.ph ]
@ -65,7 +65,7 @@
  }
  declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>) #1
  declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) #2
-  declare void @llvm.set.loop.iterations.i32(i32) #3
+  declare i32 @llvm.start.loop.iterations.i32(i32) #3
  declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3
  declare <4 x i1> @llvm.arm.mve.vctp32(i32) #4

@ -200,7 +200,7 @@ body:             |
    renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0
    renamable $lr = t2SUBri killed renamable $lr, 4, 14, $noreg, $noreg
    renamable $r5 = nuw nsw t2ADDrs killed renamable $r4, killed renamable $lr, 19, 14, $noreg, $noreg
-    t2DoLoopStart renamable $r5
+    $lr = t2DoLoopStart renamable $r5
    $r4 = tMOVr killed $r5, 14, $noreg

  bb.2.vector.body:
--- a/test/CodeGen/Thumb2/LowOverheadLoops/inloop-vpsel-2.mir
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/inloop-vpsel-2.mir
@ -18,11 +18,11 @@
    br i1 %cmp9, label %for.cond.cleanup, label %vector.ph

  vector.ph:                                        ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %tmp5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp5)
    br label %vector.body

  vector.body:                                      ; preds = %vector.body, %vector.ph
-    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ]
+    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ]
    %lsr.iv.d = phi i16* [ %scevgep.d, %vector.body ], [ %d, %vector.ph ]
    %lsr.iv.c = phi i16* [ %scevgep.c, %vector.body ], [ %c, %vector.ph ]
    %lsr.iv18 = phi i16* [ %scevgep19, %vector.body ], [ %b, %vector.ph ]
@ -67,7 +67,7 @@
  }
  declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>) #1
  declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) #2
-  declare void @llvm.set.loop.iterations.i32(i32) #3
+  declare i32 @llvm.start.loop.iterations.i32(i32) #3
  declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3
  declare <4 x i1> @llvm.arm.mve.vctp32(i32) #4

@ -201,7 +201,7 @@ body:             |
    renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0
    renamable $lr = t2SUBri killed renamable $lr, 4, 14, $noreg, $noreg
    renamable $r5 = nuw nsw t2ADDrs killed renamable $r4, killed renamable $lr, 19, 14, $noreg, $noreg
-    t2DoLoopStart renamable $r5
+    $lr = t2DoLoopStart renamable $r5
    $r4 = tMOVr killed $r5, 14, $noreg

  bb.2.vector.body:
--- a/test/CodeGen/Thumb2/LowOverheadLoops/invariant-qreg.mir
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/invariant-qreg.mir
@ -14,11 +14,11 @@
    br i1 %cmp9, label %exit, label %vector.ph

  vector.ph:                                        ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %tmp5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp5)
    br label %vector.body

  vector.body:                                      ; preds = %vector.body, %vector.ph
-    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ]
+    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ]
    %lsr.iv20 = phi i32* [ %scevgep20, %vector.body ], [ %c, %vector.ph ]
    %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
    %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %tmp13, %vector.body ]
@ -55,11 +55,11 @@
    br i1 %cmp9, label %exit, label %vector.ph

  vector.ph:                                        ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %tmp5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp5)
    br label %vector.body

  vector.body:                                      ; preds = %vector.body, %vector.ph
-    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ]
+    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ]
    %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
    %tmp7 = phi i32 [ %N, %vector.ph ], [ %tmp9, %vector.body ]
    %lsr.iv17 = bitcast i16* %lsr.iv to <4 x i16>*
@ -92,11 +92,11 @@
    br i1 %cmp9, label %exit, label %vector.ph

  vector.ph:                                        ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %tmp5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp5)
    br label %vector.body

  vector.body:                                      ; preds = %vector.body, %vector.ph
-    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ]
+    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ]
    %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
    %tmp7 = phi i32 [ %N, %vector.ph ], [ %tmp9, %vector.body ]
    %lsr.iv17 = bitcast i16* %lsr.iv to <4 x i16>*
@ -120,7 +120,7 @@
  declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
  declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>)
  declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>)
-  declare void @llvm.set.loop.iterations.i32(i32)
+  declare i32 @llvm.start.loop.iterations.i32(i32)
  declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
  declare <4 x i1> @llvm.arm.mve.vctp32(i32)

@ -204,7 +204,7 @@ body:             |
    renamable $r12 = t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg
    renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg
    renamable $r12 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg
-    t2DoLoopStart renamable $r12
+    $lr = t2DoLoopStart renamable $r12
    $r3 = tMOVr killed $r12, 14 /* CC::al */, $noreg

  bb.2.vector.body:
@ -321,7 +321,7 @@ body:             |
    renamable $r3 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r1, 19, 14 /* CC::al */, $noreg, $noreg
    renamable $r1 = tADDrSPi $sp, 2, 14 /* CC::al */, $noreg
    renamable $q0 = MVE_VLDRWU32 killed renamable $r1, 0, 0, $noreg :: (load 16 from %fixed-stack.0, align 8)
-    t2DoLoopStart renamable $r3
+    $lr = t2DoLoopStart renamable $r3
    $r1 = tMOVr killed $r3, 14 /* CC::al */, $noreg

  bb.2.vector.body:
@ -440,7 +440,7 @@ body:             |
    renamable $r3 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r1, 19, 14 /* CC::al */, $noreg, $noreg
    renamable $r1 = tADDrSPi $sp, 2, 14 /* CC::al */, $noreg
    renamable $q0 = MVE_VLDRWU32 killed renamable $r1, 0, 0, $noreg :: (load 16 from %fixed-stack.0, align 8)
-    t2DoLoopStart renamable $r3
+    $lr = t2DoLoopStart renamable $r3
    $r1 = tMOVr killed $r3, 14 /* CC::al */, $noreg

  bb.2.vector.body:
--- a/test/CodeGen/Thumb2/LowOverheadLoops/it-block-chain-store.mir
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/it-block-chain-store.mir
@ -12,11 +12,11 @@
    %3 = lshr i32 %2, 2
    %4 = add nuw nsw i32 %3, 1
    store i32 %4, i32* %iter.addr, align 4
-    call void @llvm.set.loop.iterations.i32(i32 %4)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %4)
    br label %do.body

  do.body:                                          ; preds = %do.body, %entry
-    %lsr.iv = phi i32 [ %lsr.iv.next, %do.body ], [ %4, %entry ]
+    %lsr.iv = phi i32 [ %lsr.iv.next, %do.body ], [ %start, %entry ]
    %blkCnt.0 = phi i32 [ %mul, %entry ], [ %sub, %do.body ]
    %pDst.addr.0 = phi float* [ %pDst, %entry ], [ %add.ptr4, %do.body ]
    %pSrc.addr.0 = phi float* [ %pSrc, %entry ], [ %add.ptr, %do.body ]
@ -47,12 +47,12 @@
    %2 = sub i32 %0, %smin
    %3 = lshr i32 %2, 2
    %4 = add nuw nsw i32 %3, 1
-    call void @llvm.set.loop.iterations.i32(i32 %4)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %4)
    store i32 %4, i32* %iter.addr, align 4
    br label %do.body

  do.body:                                          ; preds = %do.body, %entry
-    %lsr.iv = phi i32 [ %lsr.iv.next, %do.body ], [ %4, %entry ]
+    %lsr.iv = phi i32 [ %lsr.iv.next, %do.body ], [ %start, %entry ]
    %blkCnt.0 = phi i32 [ %mul, %entry ], [ %sub, %do.body ]
    %pDst.addr.0 = phi float* [ %pDst, %entry ], [ %add.ptr4, %do.body ]
    %pSrc.addr.0 = phi float* [ %pSrc, %entry ], [ %add.ptr, %do.body ]
@ -84,7 +84,7 @@
  declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32 immarg, <4 x i1>) #3

  ; Function Attrs: noduplicate nounwind
-  declare void @llvm.set.loop.iterations.i32(i32) #4
+  declare i32 @llvm.start.loop.iterations.i32(i32) #4

  ; Function Attrs: noduplicate nounwind
  declare i32 @llvm.loop.decrement.reg.i32(i32, i32) #4
@ -178,7 +178,7 @@ body:             |
    renamable $r2, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg
    renamable $lr = nuw nsw t2ADDrs killed renamable $r2, killed renamable $lr, 19, 14 /* CC::al */, $noreg, $noreg
    t2STRi12 renamable $lr, killed renamable $r3, 0, 14 /* CC::al */, $noreg :: (store 4 into %ir.iter.addr)
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr
    $r2 = tMOVr killed $lr, 14 /* CC::al */, $noreg

  bb.1.do.body:
@ -247,8 +247,8 @@ body:             |
  ; CHECK:   renamable $lr = t2ADDri killed renamable $r2, 3, 14 /* CC::al */, $noreg, $noreg
  ; CHECK:   renamable $r2, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg
  ; CHECK:   renamable $lr = nuw nsw t2ADDrs killed renamable $r2, killed renamable $lr, 19, 14 /* CC::al */, $noreg, $noreg
-  ; CHECK:   $lr = t2DLS killed renamable $lr
  ; CHECK:   t2STRi12 renamable $lr, killed renamable $r3, 0, 14 /* CC::al */, $noreg :: (store 4 into %ir.iter.addr)
+  ; CHECK:   $lr = t2DLS killed renamable $lr
  ; CHECK:   $r2 = tMOVr killed $lr, 14 /* CC::al */, $noreg
  ; CHECK: bb.1.do.body:
  ; CHECK:   successors: %bb.1(0x7c000000), %bb.2(0x04000000)
@ -282,8 +282,8 @@ body:             |
    renamable $lr = t2ADDri killed renamable $r2, 3, 14 /* CC::al */, $noreg, $noreg
    renamable $r2, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg
    renamable $lr = nuw nsw t2ADDrs killed renamable $r2, killed renamable $lr, 19, 14 /* CC::al */, $noreg, $noreg
-    t2DoLoopStart renamable $lr
    t2STRi12 renamable $lr, killed renamable $r3, 0, 14 /* CC::al */, $noreg :: (store 4 into %ir.iter.addr)
+    $lr = t2DoLoopStart renamable $lr
    $r2 = tMOVr killed $lr, 14 /* CC::al */, $noreg

  bb.1.do.body:
--- a/test/CodeGen/Thumb2/LowOverheadLoops/it-block-chain.mir
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/it-block-chain.mir
@ -13,14 +13,14 @@
    %2 = sub i32 %0, %smin
    %3 = lshr i32 %2, 2
    %4 = add nuw nsw i32 %3, 1
-    call void @llvm.set.loop.iterations.i32(i32 %4)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %4)
    br label %do.body

  do.body:                                          ; preds = %do.body, %entry
    %blkCnt.0 = phi i32 [ %mul, %entry ], [ %sub, %do.body ]
    %pDst.addr.0 = phi float* [ %pDst, %entry ], [ %add.ptr4, %do.body ]
    %pSrc.addr.0 = phi float* [ %pSrc, %entry ], [ %add.ptr, %do.body ]
-    %5 = phi i32 [ %4, %entry ], [ %9, %do.body ]
+    %5 = phi i32 [ %start, %entry ], [ %9, %do.body ]
    %6 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %blkCnt.0)
    %input_cast = bitcast float* %pSrc.addr.0 to <4 x float>*
    %7 = tail call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %input_cast, i32 4, <4 x i1> %6, <4 x float> undef)
@ -40,7 +40,7 @@
  declare <4 x i1> @llvm.arm.mve.vctp32(i32) #1
  declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32 immarg, <4 x i1>, <4 x float>)
  declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32 immarg, <4 x i1>)
-  declare void @llvm.set.loop.iterations.i32(i32)
+  declare i32 @llvm.start.loop.iterations.i32(i32)
  declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)

 ...
@ -149,7 +149,7 @@ body:             |
    renamable $lr = nuw nsw t2ADDrs killed renamable $r2, killed renamable $r12, 19, 14, $noreg, $noreg
    renamable $r2 = tLEApcrel %const.0, 14, $noreg
    renamable $q0 = MVE_VLDRWU32 killed renamable $r2, 0, 0, $noreg :: (load 16 from constant-pool)
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr

  bb.1.do.body (align 4):
    successors: %bb.1(0x7c000000), %bb.2(0x04000000)
--- a/test/CodeGen/Thumb2/LowOverheadLoops/it-block-itercount.mir
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/it-block-itercount.mir
@ -14,14 +14,14 @@
    %2 = sub i32 %0, %smin
    %3 = lshr i32 %2, 2
    %4 = add nuw nsw i32 %3, 1
-    call void @llvm.set.loop.iterations.i32(i32 %4)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %4)
    br label %do.body

  do.body:                                          ; preds = %do.body, %entry
    %blkCnt.0 = phi i32 [ %mul, %entry ], [ %sub, %do.body ]
    %pDst.addr.0 = phi float* [ %pDst, %entry ], [ %add.ptr4, %do.body ]
    %pSrc.addr.0 = phi float* [ %pSrc, %entry ], [ %add.ptr, %do.body ]
-    %5 = phi i32 [ %4, %entry ], [ %9, %do.body ]
+    %5 = phi i32 [ %start, %entry ], [ %9, %do.body ]
    %6 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %blkCnt.0)
    %input_cast = bitcast float* %pSrc.addr.0 to <4 x float>*
    %7 = tail call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %input_cast, i32 4, <4 x i1> %6, <4 x float> undef)
@ -41,7 +41,7 @@
  declare <4 x i1> @llvm.arm.mve.vctp32(i32) #1
  declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32 immarg, <4 x i1>, <4 x float>)
  declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32 immarg, <4 x i1>)
-  declare void @llvm.set.loop.iterations.i32(i32)
+  declare i32 @llvm.start.loop.iterations.i32(i32)
  declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)

 ...
@ -140,7 +140,7 @@ body:             |
    renamable $lr = nuw nsw t2ADDrs killed renamable $r2, killed renamable $r12, 19, 14, $noreg, $noreg
    renamable $r2 = tLEApcrel %const.0, 14, $noreg
    renamable $q0 = MVE_VLDRWU32 killed renamable $r2, 0, 0, $noreg :: (load 16 from constant-pool)
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr

  bb.1.do.body (align 4):
    successors: %bb.1(0x7c000000), %bb.2(0x04000000)
--- a/test/CodeGen/Thumb2/LowOverheadLoops/it-block-mov.mir
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/it-block-mov.mir
@ -78,6 +78,7 @@ body:             |
  ; CHECK:   successors: %bb.5(0x80000000)
  ; CHECK:   liveins: $q0, $r0, $r1, $r2, $r4
  ; CHECK:   renamable $s4 = nnan ninf nsz VADDS renamable $s0, renamable $s1, 14 /* CC::al */, $noreg
+  ; CHECK:   dead $lr = tMOVr $r4, 14 /* CC::al */, $noreg
  ; CHECK:   $r3 = tMOVr $r1, 14 /* CC::al */, $noreg
  ; CHECK:   renamable $s4 = nnan ninf nsz VADDS renamable $s2, killed renamable $s4, 14 /* CC::al */, $noreg
  ; CHECK:   renamable $s0 = nnan ninf nsz VADDS killed renamable $s3, killed renamable $s4, 14 /* CC::al */, $noreg, implicit killed $q0
@ -151,7 +152,7 @@ body:             |
    renamable $r4 = nuw nsw t2ADDrs killed renamable $r4, killed renamable $r3, 19, 14, $noreg, $noreg
    renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0
    $r3 = tMOVr $r0, 14, $noreg
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr

  bb.3:
    successors: %bb.3(0x7c000000), %bb.4(0x04000000)
@ -178,7 +179,7 @@ body:             |
    renamable $s0 = nnan ninf nsz VADDS killed renamable $s3, killed renamable $s4, 14, $noreg, implicit $q0
    $s2 = VMOVSR $r1, 14, $noreg
    renamable $s2 = VUITOS killed renamable $s2, 14, $noreg
-    t2DoLoopStart killed $r4
+    $lr = t2DoLoopStart killed $r4
    renamable $s4 = nnan ninf nsz VDIVS killed renamable $s0, killed renamable $s2, 14, $noreg
    renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0

--- a/test/CodeGen/Thumb2/LowOverheadLoops/it-block-random.mir
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/it-block-random.mir
@ -15,14 +15,14 @@
    %2 = sub i32 %0, %smin
    %3 = lshr i32 %2, 2
    %4 = add nuw nsw i32 %3, 1
-    call void @llvm.set.loop.iterations.i32(i32 %4)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %4)
    br label %do.body

  do.body:                                          ; preds = %do.body, %entry
    %blkCnt.0 = phi i32 [ %mul, %entry ], [ %sub, %do.body ]
    %pDst.addr.0 = phi float* [ %pDst, %entry ], [ %add.ptr4, %do.body ]
    %pSrc.addr.0 = phi float* [ %pSrc, %entry ], [ %add.ptr, %do.body ]
-    %5 = phi i32 [ %4, %entry ], [ %9, %do.body ]
+    %5 = phi i32 [ %start, %entry ], [ %9, %do.body ]
    %6 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %blkCnt.0)
    %input_cast = bitcast float* %pSrc.addr.0 to <4 x float>*
    %7 = tail call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %input_cast, i32 4, <4 x i1> %6, <4 x float> undef)
@ -42,7 +42,7 @@
  declare <4 x i1> @llvm.arm.mve.vctp32(i32) #1
  declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32 immarg, <4 x i1>, <4 x float>)
  declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32 immarg, <4 x i1>)
-  declare void @llvm.set.loop.iterations.i32(i32)
+  declare i32 @llvm.start.loop.iterations.i32(i32)
  declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)

 ...
@ -149,7 +149,7 @@ body:             |
    renamable $lr = nuw nsw t2ADDrs killed renamable $r2, killed renamable $r12, 19, 14, $noreg, $noreg
    renamable $r2 = tLEApcrel %const.0, 14, $noreg
    renamable $q0 = MVE_VLDRWU32 killed renamable $r2, 0, 0, $noreg :: (load 16 from constant-pool)
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr

  bb.1.do.body (align 4):
    successors: %bb.1(0x7c000000), %bb.2(0x04000000)
--- a/test/CodeGen/Thumb2/LowOverheadLoops/iv-two-vcmp-reordered.mir
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/iv-two-vcmp-reordered.mir
@ -20,11 +20,11 @@
    %trip.count.minus.1 = add i32 %N, -1
    %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
    %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-    call void @llvm.set.loop.iterations.i32(i32 %tmp5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp5)
    br label %vector.body

  vector.body:                                      ; preds = %vector.body, %vector.ph
-    %lsr.iv = phi i32 [ %lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ]
+    %lsr.iv = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ]
    %lsr.iv3 = phi i32* [ %scevgep4, %vector.body ], [ %b, %vector.ph ]
    %lsr.iv1 = phi i32* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
    %vec.ind = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %vector.ph ], [ %vec.ind.next, %vector.body ]
@ -56,7 +56,7 @@
  declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
  declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>)
  declare <4 x i1> @llvm.arm.mve.vctp32(i32)
-  declare void @llvm.set.loop.iterations.i32(i32)
+  declare i32 @llvm.start.loop.iterations.i32(i32)
  declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)

 ...
@ -168,7 +168,7 @@ body:             |
    renamable $r12 = t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg
    renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg
    renamable $r3 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg
-    t2DoLoopStart renamable $r3
+    $lr = t2DoLoopStart renamable $r3
    $r4 = tMOVr killed $r3, 14 /* CC::al */, $noreg
    renamable $r3 = tLEApcrel %const.0, 14 /* CC::al */, $noreg
    renamable $q0 = MVE_VLDRWU32 killed renamable $r3, 0, 0, $noreg :: (load 16 from constant-pool)
--- a/test/CodeGen/Thumb2/LowOverheadLoops/iv-two-vcmp.mir
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/iv-two-vcmp.mir
@ -18,11 +18,11 @@
    %trip.count.minus.1 = add i32 %N, -1
    %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
    %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-    call void @llvm.set.loop.iterations.i32(i32 %tmp5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp5)
    br label %vector.body

  vector.body:                                      ; preds = %vector.body, %vector.ph
-    %lsr.iv = phi i32 [ %lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ]
+    %lsr.iv = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ]
    %lsr.iv3 = phi i32* [ %scevgep4, %vector.body ], [ %b, %vector.ph ]
    %lsr.iv1 = phi i32* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
    %vec.ind = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %vector.ph ], [ %vec.ind.next, %vector.body ]
@ -54,7 +54,7 @@
  declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
  declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>)
  declare <4 x i1> @llvm.arm.mve.vctp32(i32)
-  declare void @llvm.set.loop.iterations.i32(i32)
+  declare i32 @llvm.start.loop.iterations.i32(i32)
  declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
 ...
 ---
@ -165,7 +165,7 @@ body:             |
    renamable $r12 = t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg
    renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg
    renamable $r3 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg
-    t2DoLoopStart renamable $r3
+    $lr = t2DoLoopStart renamable $r3
    $r4 = tMOVr killed $r3, 14 /* CC::al */, $noreg
    renamable $r3 = tLEApcrel %const.0, 14 /* CC::al */, $noreg
    renamable $q0 = MVE_VLDRWU32 killed renamable $r3, 0, 0, $noreg :: (load 16 from constant-pool)
--- a/test/CodeGen/Thumb2/LowOverheadLoops/iv-vcmp.mir
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/iv-vcmp.mir
@ -19,7 +19,7 @@
    %trip.count.minus.1 = add i32 %N, -1
    %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
    %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-    call void @llvm.set.loop.iterations.i32(i32 %5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %5)
    br label %vector.body

  vector.body:                                      ; preds = %vector.body, %vector.ph
@ -27,7 +27,7 @@
    %lsr.iv1 = phi i32* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
    %vec.ind = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %vector.ph ], [ %vec.ind.next, %vector.body ]
    %elts.rem = phi i32 [ %N, %vector.ph ], [ %elts.rem.next, %vector.body ]
-    %6 = phi i32 [ %5, %vector.ph ], [ %12, %vector.body ]
+    %6 = phi i32 [ %start, %vector.ph ], [ %12, %vector.body ]
    %lsr.iv35 = bitcast i32* %lsr.iv3 to <4 x i32>*
    %lsr.iv12 = bitcast i32* %lsr.iv1 to <4 x i32>*
    %7 = insertelement <4 x i32> undef, i32 %div, i32 0
@ -52,7 +52,7 @@
  declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
  declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>)
  declare <4 x i1> @llvm.arm.mve.vctp32(i32)
-  declare void @llvm.set.loop.iterations.i32(i32)
+  declare i32 @llvm.start.loop.iterations.i32(i32)
  declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)

 ...
@ -147,7 +147,7 @@ body:             |
    renamable $q0 = MVE_VLDRWU32 killed renamable $r3, 0, 0, $noreg :: (load 16 from constant-pool)
    renamable $r3, dead $cpsr = tLSRri renamable $r2, 1, 14 /* CC::al */, $noreg
    renamable $q1 = MVE_VDUP32 killed renamable $r3, 0, $noreg, undef renamable $q1
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr

  bb.2.vector.body:
    successors: %bb.2(0x7c000000), %bb.3(0x04000000)
--- a/test/CodeGen/Thumb2/LowOverheadLoops/livereg-no-loop-def.mir
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/livereg-no-loop-def.mir
@ -14,11 +14,11 @@
    br i1 %cmp9, label %exit, label %vector.ph

  vector.ph:                                        ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %tmp5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp5)
    br label %vector.body

  vector.body:                                      ; preds = %vector.body, %vector.ph
-    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ]
+    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ]
    %lsr.iv18 = phi i16* [ %scevgep19, %vector.body ], [ %b, %vector.ph ]
    %lsr.iv20 = phi i32* [ %scevgep20, %vector.body ], [ %c, %vector.ph ]
    %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
@ -50,7 +50,7 @@

  declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>)
  declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>)
-  declare void @llvm.set.loop.iterations.i32(i32)
+  declare i32 @llvm.start.loop.iterations.i32(i32)
  declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
  declare <4 x i1> @llvm.arm.mve.vctp32(i32)

@ -136,7 +136,7 @@ body:             |
    renamable $q1 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q1
    renamable $r12 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg
    renamable $r4 = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14, $noreg, $noreg
-    t2DoLoopStart renamable $r4
+    $lr = t2DoLoopStart renamable $r4
    $r12 = tMOVr killed $r4, 14, $noreg

  bb.2.vector.body:
--- a/test/CodeGen/Thumb2/LowOverheadLoops/loop-dec-copy-chain.mir
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/loop-dec-copy-chain.mir
@ -18,7 +18,7 @@
    br i1 %tmp7, label %bb13, label %bb12

  bb12:                                             ; preds = %bb4
-    call void @llvm.set.loop.iterations.i32(i32 %tmp11)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp11)
    br label %bb28

  bb13:                                             ; preds = %bb28, %bb4
@ -45,7 +45,7 @@
    ret void

  bb28:                                             ; preds = %bb28, %bb12
-    %lsr.iv15 = phi i32 [ %lsr.iv.next16, %bb28 ], [ %tmp11, %bb12 ]
+    %lsr.iv15 = phi i32 [ %lsr.iv.next16, %bb28 ], [ %start, %bb12 ]
    %lsr.iv = phi i32 [ %lsr.iv.next, %bb28 ], [ 0, %bb12 ]
    %tmp29 = phi i32 [ 0, %bb12 ], [ %tmp54, %bb28 ]
    %0 = bitcast i32* %arg1 to i8*
@ -145,7 +145,7 @@
    br label %bb27
  }

-  declare void @llvm.set.loop.iterations.i32(i32)
+  declare i32 @llvm.start.loop.iterations.i32(i32)
  declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)

 ...
@ -387,7 +387,7 @@ body:             |
    renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14, $noreg
    renamable $r8 = t2MOVi 0, 14, $noreg, $noreg
    renamable $r3 = nuw nsw t2ADDrs killed renamable $r4, killed renamable $r3, 19, 14, $noreg, $noreg
-    t2DoLoopStart renamable $r3
+    $lr = t2DoLoopStart renamable $r3
    $r12 = tMOVr killed $r3, 14, $noreg
    renamable $r3, dead $cpsr = tMOVi8 0, 14, $noreg

--- a/test/CodeGen/Thumb2/LowOverheadLoops/loop-dec-copy-prev-iteration.mir
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/loop-dec-copy-prev-iteration.mir
@ -18,7 +18,7 @@
    br i1 %tmp7, label %bb13, label %bb12

  bb12:                                             ; preds = %bb4
-    call void @llvm.set.loop.iterations.i32(i32 %tmp11)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp11)
    br label %bb28

  bb13:                                             ; preds = %bb28, %bb4
@ -46,7 +46,7 @@
    ret i32 %res

  bb28:                                             ; preds = %bb28, %bb12
-    %lsr.iv15 = phi i32 [ %lsr.iv.next16, %bb28 ], [ %tmp11, %bb12 ]
+    %lsr.iv15 = phi i32 [ %lsr.iv.next16, %bb28 ], [ %start, %bb12 ]
    %lsr.iv = phi i32 [ %lsr.iv.next, %bb28 ], [ 0, %bb12 ]
    %tmp29 = phi i32 [ 0, %bb12 ], [ %tmp54, %bb28 ]
    %0 = bitcast i32* %arg1 to i8*
@ -146,7 +146,7 @@
    br label %bb27
  }

-  declare void @llvm.set.loop.iterations.i32(i32)
+  declare i32 @llvm.start.loop.iterations.i32(i32)
  declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)

 ...
@ -265,7 +265,8 @@ body:             |
  ; CHECK:   renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14 /* CC::al */, $noreg
  ; CHECK:   renamable $r8 = t2MOVi 0, 14 /* CC::al */, $noreg, $noreg
  ; CHECK:   renamable $r3 = nuw nsw t2ADDrs killed renamable $r4, killed renamable $r3, 19, 14 /* CC::al */, $noreg, $noreg
-  ; CHECK:   $lr = t2DLS killed renamable $r3
+  ; CHECK:   dead $lr = t2DLS renamable $r3
+  ; CHECK:   $lr = tMOVr killed $r3, 14 /* CC::al */, $noreg
  ; CHECK:   renamable $r3, dead $cpsr = tMOVi8 0, 14 /* CC::al */, $noreg
  ; CHECK: bb.5.bb28:
  ; CHECK:   successors: %bb.5(0x7c000000), %bb.6(0x04000000)
@ -403,7 +404,7 @@ body:             |
    renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14, $noreg
    renamable $r8 = t2MOVi 0, 14, $noreg, $noreg
    renamable $r3 = nuw nsw t2ADDrs killed renamable $r4, killed renamable $r3, 19, 14, $noreg, $noreg
-    t2DoLoopStart renamable $r3
+    $lr = t2DoLoopStart renamable $r3
    $lr = tMOVr killed $r3, 14, $noreg
    renamable $r3, dead $cpsr = tMOVi8 0, 14, $noreg

--- a/test/CodeGen/Thumb2/LowOverheadLoops/loop-dec-liveout.mir
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/loop-dec-liveout.mir
@ -18,7 +18,7 @@
    br i1 %tmp7, label %bb13, label %bb12

  bb12:                                             ; preds = %bb4
-    call void @llvm.set.loop.iterations.i32(i32 %tmp11)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp11)
    br label %bb28

  bb13:                                             ; preds = %bb28, %bb4
@ -46,7 +46,7 @@
    ret i32 %res

  bb28:                                             ; preds = %bb28, %bb12
-    %lsr.iv15 = phi i32 [ %lsr.iv.next16, %bb28 ], [ %tmp11, %bb12 ]
+    %lsr.iv15 = phi i32 [ %lsr.iv.next16, %bb28 ], [ %start, %bb12 ]
    %lsr.iv = phi i32 [ %lsr.iv.next, %bb28 ], [ 0, %bb12 ]
    %tmp29 = phi i32 [ 0, %bb12 ], [ %tmp54, %bb28 ]
    %0 = bitcast i32* %arg1 to i8*
@ -146,7 +146,7 @@
    br label %bb27
  }

-  declare void @llvm.set.loop.iterations.i32(i32)
+  declare i32 @llvm.start.loop.iterations.i32(i32)
  declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)

 ...
@ -265,7 +265,8 @@ body:             |
  ; CHECK:   renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14 /* CC::al */, $noreg
  ; CHECK:   renamable $r8 = t2MOVi 0, 14 /* CC::al */, $noreg, $noreg
  ; CHECK:   renamable $r3 = nuw nsw t2ADDrs killed renamable $r4, killed renamable $r3, 19, 14 /* CC::al */, $noreg, $noreg
-  ; CHECK:   $lr = t2DLS killed renamable $r3
+  ; CHECK:   dead $lr = t2DLS renamable $r3
+  ; CHECK:   $lr = tMOVr killed $r3, 14 /* CC::al */, $noreg
  ; CHECK:   renamable $r3, dead $cpsr = tMOVi8 0, 14 /* CC::al */, $noreg
  ; CHECK: bb.5.bb28:
  ; CHECK:   successors: %bb.5(0x7c000000), %bb.6(0x04000000)
@ -403,7 +404,7 @@ body:             |
    renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14, $noreg
    renamable $r8 = t2MOVi 0, 14, $noreg, $noreg
    renamable $r3 = nuw nsw t2ADDrs killed renamable $r4, killed renamable $r3, 19, 14, $noreg, $noreg
-    t2DoLoopStart renamable $r3
+    $lr = t2DoLoopStart renamable $r3
    $lr = tMOVr $r3, 14, $noreg
    renamable $r3, dead $cpsr = tMOVi8 0, 14, $noreg

--- a/test/CodeGen/Thumb2/LowOverheadLoops/loop-guards.ll
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/loop-guards.ll
@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=thumbv8.1m.main -disable-arm-loloops=false -mattr=+lob -stop-after=arm-low-overhead-loops --verify-machineinstrs %s -o - | FileCheck %s
 ; RUN: llc -mtriple=thumbv8.1m.main -disable-arm-loloops=false -mattr=+lob -stop-after=arm-low-overhead-loops --verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK-GLOBAL

@ -16,10 +15,10 @@
 ; CHECK: ne_and_guard
 ; CHECK: body:
 ; CHECK: bb.0.entry:
-; CHECK:   t2CMPri renamable $lr, 0
+; CHECK:   tCMPi8 renamable $r0, 0
 ; CHECK:   tBcc %bb.4
 ; CHECK: bb.2.while.body.preheader:
-; CHECK:   $lr = t2DLS killed renamable $lr
+; CHECK:   $lr = t2DLS killed renamable $r0
 ; CHECK: bb.3.while.body:
 ; CHECK:   $lr = t2LEUpdate killed renamable $lr, %bb.3
 define void @ne_and_guard(i1 zeroext %t1, i1 zeroext %t2, i32* nocapture %a, i32* nocapture readonly %b, i32 %N) {
@ -49,10 +48,10 @@ if.end:                                           ; preds = %while.body, %entry
 ; CHECK: ne_preheader
 ; CHECK: body:
 ; CHECK: bb.0.entry:
-; CHECK:   t2CMPri renamable $lr, 0
+; CHECK:   tCMPi8 renamable $r0, 0
 ; CHECK:   tBcc %bb.4
 ; CHECK: bb.2.while.body.preheader:
-; CHECK:   $lr = t2DLS killed renamable $lr
+; CHECK:   $lr = t2DLS killed renamable $r0
 ; CHECK: bb.3.while.body:
 ; CHECK:   $lr = t2LEUpdate killed renamable $lr, %bb.3
 define void @ne_preheader(i1 zeroext %t1, i1 zeroext %t2, i32* nocapture %a, i32* nocapture readonly %b, i32 %N) {
@ -84,10 +83,10 @@ if.end:                                           ; preds = %while.body, %while.
 ; CHECK: eq_preheader
 ; CHECK: body:
 ; CHECK: bb.0.entry:
-; CHECK:   t2CMPri renamable $lr, 0
+; CHECK:   tCMPi8 renamable $r0, 0
 ; CHECK:   tBcc %bb.4
 ; CHECK: bb.2.while.body.preheader:
-; CHECK:   $lr = t2DLS killed renamable $lr
+; CHECK:   $lr = t2DLS killed renamable $r0
 ; CHECK: bb.3.while.body:
 ; CHECK:   $lr = t2LEUpdate killed renamable $lr, %bb.3
 define void @eq_preheader(i1 zeroext %t1, i1 zeroext %t2, i32* nocapture %a, i32* nocapture readonly %b, i32 %N) {
@ -119,10 +118,10 @@ if.end:                                           ; preds = %while.body, %while.
 ; CHECK: ne_prepreheader
 ; CHECK: body:
 ; CHECK: bb.0.entry:
-; CHECK:   t2CMPri renamable $lr, 0
+; CHECK:   t2CMPri renamable $r12, 0
 ; CHECK:   tBcc %bb.4
 ; CHECK: bb.2.while.body.preheader:
-; CHECK:   $lr = t2DLS killed renamable $lr
+; CHECK:   $lr = t2DLS killed renamable $r12
 ; CHECK: bb.3.while.body:
 ; CHECK:   $lr = t2LEUpdate killed renamable $lr, %bb.3
 define void @ne_prepreheader(i1 zeroext %t1, i1 zeroext %t2, i32* nocapture %a, i32* nocapture readonly %b, i32 %N) {
@ -153,7 +152,7 @@ if.end:                                           ; preds = %while.body, %while.
 ; CHECK: be_ne
 ; CHECK: body:
 ; CHECK: bb.0.entry:
-; CHECK:   $lr = t2DLS killed renamable $lr
+; CHECK:   $lr = t2DLS killed renamable $r12
 ; CHECK: bb.2.do.body:
 ; CHECK:   $lr = t2LEUpdate killed renamable $lr, %bb.2
 define void @be_ne(i32* nocapture %a, i32* nocapture readonly %b, i32 %N) {
--- a/test/CodeGen/Thumb2/LowOverheadLoops/lstp-insertion-position.mir
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/lstp-insertion-position.mir
@ -15,14 +15,14 @@

  vector.ph:                                        ; preds = %entry
    %6 = insertelement <4 x float> <float undef, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, float %init, i32 0
-    call void @llvm.set.loop.iterations.i32(i32 %5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %5)
    br label %vector.body

  vector.body:                                      ; preds = %vector.body, %vector.ph
    %lsr.iv13 = phi float* [ %scevgep14, %vector.body ], [ %b, %vector.ph ]
    %lsr.iv = phi float* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
    %vec.phi = phi <4 x float> [ %6, %vector.ph ], [ %13, %vector.body ]
-    %7 = phi i32 [ %5, %vector.ph ], [ %14, %vector.body ]
+    %7 = phi i32 [ %start, %vector.ph ], [ %14, %vector.body ]
    %8 = phi i32 [ %N, %vector.ph ], [ %10, %vector.body ]
    %lsr.iv12 = bitcast float* %lsr.iv to <4 x float>*
    %lsr.iv1315 = bitcast float* %lsr.iv13 to <4 x float>*
@ -63,14 +63,14 @@

  vector.ph:                                        ; preds = %entry
    %6 = insertelement <4 x float> <float undef, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, float %init, i32 0
-    call void @llvm.set.loop.iterations.i32(i32 %5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %5)
    br label %vector.body

  vector.body:                                      ; preds = %vector.body, %vector.ph
    %lsr.iv14 = phi float* [ %scevgep15, %vector.body ], [ %b, %vector.ph ]
    %lsr.iv = phi float* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
    %vec.phi = phi <4 x float> [ %6, %vector.ph ], [ %13, %vector.body ]
-    %7 = phi i32 [ %5, %vector.ph ], [ %14, %vector.body ]
+    %7 = phi i32 [ %start, %vector.ph ], [ %14, %vector.body ]
    %8 = phi i32 [ %shr, %vector.ph ], [ %10, %vector.body ]
    %lsr.iv13 = bitcast float* %lsr.iv to <4 x float>*
    %lsr.iv1416 = bitcast float* %lsr.iv14 to <4 x float>*
@ -99,7 +99,7 @@
  declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
  declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32 immarg, <4 x i1>, <4 x float>)
  declare float @llvm.vector.reduce.fadd.f32.v4f32(float, <4 x float>)
-  declare void @llvm.set.loop.iterations.i32(i32)
+  declare i32 @llvm.start.loop.iterations.i32(i32)
  declare i32 @llvm.loop.decrement.reg.i32(i32, i32)
  declare <4 x i1> @llvm.arm.mve.vctp32(i32)

@ -205,7 +205,7 @@ body:             |
    renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg
    renamable $lr = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg
    renamable $r3 = tLDRpci %const.0, 14 /* CC::al */, $noreg :: (load 4 from constant-pool)
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr
    renamable $q1 = MVE_VDUP32 killed renamable $r3, 0, $noreg, undef renamable $q1
    $s4 = VMOVS killed $s0, 14 /* CC::al */, $noreg, implicit killed $q1, implicit-def $q1

@ -341,7 +341,7 @@ body:             |
    renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg
    renamable $lr = t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg
    renamable $r3 = tLDRpci %const.0, 14 /* CC::al */, $noreg :: (load 4 from constant-pool)
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr
    renamable $q1 = MVE_VDUP32 killed renamable $r3, 0, $noreg, undef renamable $q1
    renamable $r2, dead $cpsr = tLSRri killed renamable $r2, 2, 14 /* CC::al */, $noreg
    $s4 = VMOVS killed $s0, 14 /* CC::al */, $noreg, implicit killed $q1, implicit-def $q1
--- a/test/CodeGen/Thumb2/LowOverheadLoops/massive.mir
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/massive.mir
@ -16,7 +16,7 @@
    %scevgep = getelementptr i32, i32* %a, i32 -1
    %scevgep4 = getelementptr i32, i32* %c, i32 -1
    %scevgep8 = getelementptr i32, i32* %b, i32 -1
-    call void @llvm.set.loop.iterations.i32(i32 %N)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %N)
    br label %for.body

  for.cond.cleanup:                                 ; preds = %for.body, %entry
@ -26,7 +26,7 @@
    %lsr.iv9 = phi i32* [ %scevgep8, %for.body.preheader ], [ %scevgep10, %for.body ]
    %lsr.iv5 = phi i32* [ %scevgep4, %for.body.preheader ], [ %scevgep6, %for.body ]
    %lsr.iv1 = phi i32* [ %scevgep, %for.body.preheader ], [ %scevgep2, %for.body ]
-    %0 = phi i32 [ %N, %for.body.preheader ], [ %3, %for.body ]
+    %0 = phi i32 [ %start, %for.body.preheader ], [ %3, %for.body ]
    %size = call i32 @llvm.arm.space(i32 4096, i32 undef)
    %scevgep3 = getelementptr i32, i32* %lsr.iv9, i32 1
    %1 = load i32, i32* %scevgep3, align 4
@ -47,7 +47,7 @@
  declare i32 @llvm.arm.space(i32 immarg, i32) #0

  ; Function Attrs: noduplicate nounwind
-  declare void @llvm.set.loop.iterations.i32(i32) #1
+  declare i32 @llvm.start.loop.iterations.i32(i32) #1

  ; Function Attrs: noduplicate nounwind
  declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #1
@ -157,7 +157,7 @@ body:             |
    renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14, $noreg
    renamable $r0, dead $cpsr = tSUBi8 killed renamable $r0, 4, 14, $noreg
    $lr = tMOVr $r3, 14, $noreg
-    t2DoLoopStart killed $r3
+    $lr = t2DoLoopStart killed $r3

  bb.2.for.body:
    successors: %bb.2(0x7c000000), %bb.3(0x04000000)
--- a/test/CodeGen/Thumb2/LowOverheadLoops/matrix-debug.mir
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/matrix-debug.mir
@ -26,14 +26,14 @@
    call void @llvm.dbg.value(metadata i32 0, metadata !31, metadata !DIExpression()), !dbg !32
    %arrayidx7.us = getelementptr inbounds i32, i32* %e, i32 %i.031.us, !dbg !38
    %arrayidx7.promoted.us = load i32, i32* %arrayidx7.us, align 4, !dbg !41
-    call void @llvm.set.loop.iterations.i32(i32 %d), !dbg !46
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %d), !dbg !46
    br label %for.body3.us, !dbg !46

  for.body3.us:                                     ; preds = %for.body3.us, %for.cond1.preheader.us
    %lsr.iv5 = phi i16* [ %scevgep6, %for.body3.us ], [ %lsr.iv2, %for.cond1.preheader.us ], !dbg !32
    %lsr.iv1 = phi i16* [ %scevgep, %for.body3.us ], [ %l, %for.cond1.preheader.us ], !dbg !32
    %add829.us = phi i32 [ %arrayidx7.promoted.us, %for.cond1.preheader.us ], [ %add8.us, %for.body3.us ], !dbg !32
-    %1 = phi i32 [ %d, %for.cond1.preheader.us ], [ %4, %for.body3.us ], !dbg !32
+    %1 = phi i32 [ %start, %for.cond1.preheader.us ], [ %4, %for.body3.us ], !dbg !32
    call void @llvm.dbg.value(metadata i32 undef, metadata !31, metadata !DIExpression()), !dbg !32
    %2 = load i16, i16* %lsr.iv5, align 2, !dbg !47
    %conv.us = sext i16 %2 to i32, !dbg !47
@ -67,7 +67,7 @@
  }
  declare !dbg !4 dso_local arm_aapcscc signext i16 @get_input(i32, i32*, i16 signext)
  declare void @llvm.dbg.value(metadata, metadata, metadata)
-  declare void @llvm.set.loop.iterations.i32(i32)
+  declare i32 @llvm.start.loop.iterations.i32(i32)
  declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)

  !llvm.dbg.cu = !{!0}
@ -325,7 +325,7 @@ body:             |
    $r3 = tMOVr $r5, 14, $noreg, debug-location !32
    $r0 = tMOVr $r8, 14, $noreg, debug-location !32
    $lr = tMOVr $r10, 14, $noreg, debug-location !32
-    t2DoLoopStart renamable $r10, debug-location !46
+    $lr = t2DoLoopStart renamable $r10, debug-location !46

  bb.3.for.body3.us:
    successors: %bb.3(0x7c000000), %bb.4(0x04000000)
--- a/test/CodeGen/Thumb2/LowOverheadLoops/matrix.mir
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/matrix.mir
@ -10,7 +10,7 @@
    br i1 %cmp19.i, label %for.body.i.preheader, label %c.exit.thread

  for.body.i.preheader:                             ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %d)
+    %start1 = call i32 @llvm.start.loop.iterations.i32(i32 %d)
    br label %for.body.i

  c.exit.thread:                                    ; preds = %entry
@ -22,7 +22,7 @@
    %lsr.iv15 = phi i32* [ %e, %for.body.i.preheader ], [ %scevgep16, %for.body.i ]
    %h.022.i = phi i16 [ %h.1.i, %for.body.i ], [ 0, %for.body.i.preheader ]
    %f.020.i = phi i32 [ %f.1.i, %for.body.i ], [ undef, %for.body.i.preheader ]
-    %0 = phi i32 [ %d, %for.body.i.preheader ], [ %2, %for.body.i ]
+    %0 = phi i32 [ %start1, %for.body.i.preheader ], [ %2, %for.body.i ]
    %1 = load i32, i32* %lsr.iv15, align 4
    %add.i = add nsw i32 %1, %f.020.i
    %cmp1.i = icmp sgt i32 %add.i, 0
@ -60,14 +60,14 @@
    %arrayidx12.us = getelementptr inbounds i32, i32* %e, i32 %i.064.us
    %arrayidx12.promoted.us = load i32, i32* %arrayidx12.us, align 4
    %11 = insertelement <4 x i32> <i32 undef, i32 0, i32 0, i32 0>, i32 %arrayidx12.promoted.us, i32 0
-    call void @llvm.set.loop.iterations.i32(i32 %8)
+    %start2 = call i32 @llvm.start.loop.iterations.i32(i32 %8)
    br label %vector.body

  vector.body:                                      ; preds = %vector.body, %for.cond4.preheader.us
    %lsr.iv10 = phi i16* [ %scevgep11, %vector.body ], [ %lsr.iv7, %for.cond4.preheader.us ]
    %lsr.iv4 = phi i16* [ %scevgep5, %vector.body ], [ %l, %for.cond4.preheader.us ]
    %vec.phi = phi <4 x i32> [ %11, %for.cond4.preheader.us ], [ %19, %vector.body ]
-    %12 = phi i32 [ %8, %for.cond4.preheader.us ], [ %20, %vector.body ]
+    %12 = phi i32 [ %start2, %for.cond4.preheader.us ], [ %20, %vector.body ]
    %13 = phi i32 [ %d, %for.cond4.preheader.us ], [ %15, %vector.body ]
    %lsr.iv1012 = bitcast i16* %lsr.iv10 to <4 x i16>*
    %lsr.iv46 = bitcast i16* %lsr.iv4 to <4 x i16>*
@ -108,14 +108,14 @@
    br i1 %29, label %for.body.i57.preheader, label %c.exit59

  for.body.i57.preheader:                           ; preds = %for.end16
-    call void @llvm.set.loop.iterations.i32(i32 %d)
+    %start3 = call i32 @llvm.start.loop.iterations.i32(i32 %d)
    br label %for.body.i57

  for.body.i57:                                     ; preds = %for.body.i57, %for.body.i57.preheader
    %lsr.iv1 = phi i32* [ %e, %for.body.i57.preheader ], [ %scevgep, %for.body.i57 ]
    %h.022.i44 = phi i16 [ %h.1.i54, %for.body.i57 ], [ 0, %for.body.i57.preheader ]
    %f.020.i46 = phi i32 [ %f.1.i51, %for.body.i57 ], [ undef, %for.body.i57.preheader ]
-    %30 = phi i32 [ %d, %for.body.i57.preheader ], [ %32, %for.body.i57 ]
+    %30 = phi i32 [ %start3, %for.body.i57.preheader ], [ %32, %for.body.i57 ]
    %31 = load i32, i32* %lsr.iv1, align 4
    %add.i48 = add nsw i32 %31, %f.020.i46
    %cmp1.i49 = icmp sgt i32 %add.i48, 0
@ -142,7 +142,7 @@
  declare dso_local arm_aapcs_vfpcc signext i16 @crc16(...) local_unnamed_addr #0
  declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>) #1
  declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) #2
-  declare void @llvm.set.loop.iterations.i32(i32) #3
+  declare i32 @llvm.start.loop.iterations.i32(i32) #3
  declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3
  declare <4 x i1> @llvm.arm.mve.vctp32(i32) #4

@ -385,7 +385,7 @@ body:             |
    renamable $r2 = IMPLICIT_DEF
    $r10 = tMOVr $r0, 14, $noreg
    $lr = tMOVr $r0, 14, $noreg
-    t2DoLoopStart killed renamable $r0
+    $lr = t2DoLoopStart killed renamable $r0

  bb.2.for.body.i:
    successors: %bb.2(0x7c000000), %bb.3(0x04000000)
@ -443,7 +443,7 @@ body:             |
    $r6 = tMOVr $r5, 14, $noreg
    $r1 = tMOVr $r8, 14, $noreg
    $lr = tMOVr $r0, 14, $noreg
-    t2DoLoopStart renamable $r0
+    $lr = t2DoLoopStart renamable $r0

  bb.6.vector.body:
    successors: %bb.6(0x7c000000), %bb.7(0x04000000)
@ -488,7 +488,7 @@ body:             |

    renamable $r0, dead $cpsr = tMOVi8 0, 14, $noreg
    renamable $r1 = IMPLICIT_DEF
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr

  bb.10.for.body.i57:
    successors: %bb.10(0x7c000000), %bb.11(0x04000000)
--- a/test/CodeGen/Thumb2/LowOverheadLoops/mov-after-dls.mir
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/mov-after-dls.mir
@ -9,13 +9,13 @@
  entry:
    %scevgep = getelementptr i32, i32* %q, i32 -1
    %scevgep3 = getelementptr i32, i32* %p, i32 -1
-    call void @llvm.set.loop.iterations.i32(i32 %n)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %n)
    br label %while.body

  while.body:                                       ; preds = %while.body, %entry
    %lsr.iv4 = phi i32* [ %scevgep5, %while.body ], [ %scevgep3, %entry ]
    %lsr.iv = phi i32* [ %scevgep1, %while.body ], [ %scevgep, %entry ]
-    %0 = phi i32 [ %n, %entry ], [ %2, %while.body ]
+    %0 = phi i32 [ %start, %entry ], [ %2, %while.body ]
    %scevgep6 = getelementptr i32, i32* %lsr.iv, i32 1
    %scevgep2 = getelementptr i32, i32* %lsr.iv4, i32 1
    %1 = load i32, i32* %scevgep6, align 4
@ -30,7 +30,7 @@
    ret i32 0
  }

-  declare void @llvm.set.loop.iterations.i32(i32) #0
+  declare i32 @llvm.start.loop.iterations.i32(i32) #0
  declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #0

  attributes #0 = { noduplicate nounwind }
@ -91,7 +91,8 @@ body:             |
  ; CHECK:   frame-setup CFI_INSTRUCTION def_cfa_offset 8
  ; CHECK:   frame-setup CFI_INSTRUCTION offset $lr, -4
  ; CHECK:   frame-setup CFI_INSTRUCTION offset $r7, -8
-  ; CHECK:   $lr = t2DLS killed $r0
+  ; CHECK:   dead $lr = t2DLS $r0
+  ; CHECK:   $lr = tMOVr killed $r0, 14 /* CC::al */, $noreg
  ; CHECK:   renamable $r0, dead $cpsr = tSUBi3 killed renamable $r1, 4, 14 /* CC::al */, $noreg
  ; CHECK:   renamable $r1, dead $cpsr = tSUBi3 killed renamable $r2, 4, 14 /* CC::al */, $noreg
  ; CHECK: bb.1.while.body:
@ -111,7 +112,7 @@ body:             |
    frame-setup CFI_INSTRUCTION def_cfa_offset 8
    frame-setup CFI_INSTRUCTION offset $lr, -4
    frame-setup CFI_INSTRUCTION offset $r7, -8
-    t2DoLoopStart $r0
+    $lr = t2DoLoopStart $r0
    $lr = tMOVr killed $r0, 14, $noreg
    renamable $r0, dead $cpsr = tSUBi3 killed renamable $r1, 4, 14, $noreg
    renamable $r1, dead $cpsr = tSUBi3 killed renamable $r2, 4, 14, $noreg
--- a/test/CodeGen/Thumb2/LowOverheadLoops/mov-after-dlstp.mir
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/mov-after-dlstp.mir
@ -1,6 +1,8 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
 # RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -run-pass=arm-low-overhead-loops -tail-predication=enabled %s -o - | FileCheck %s

+# TODO: As far as I can tell this test is fine. Tail-predicating the second loop means we remove the instruction that would otherwise block the first.
+
 --- |
  define arm_aapcs_vfpcc void @arm_var_f32_mve(float* %pSrc, i32 %blockSize, float* nocapture %pResult) #0 {
  entry:
@ -15,14 +17,14 @@
    %6 = sub i32 %0, %smin3
    %7 = lshr i32 %6, 2
    %8 = add nuw nsw i32 %7, 1
-    call void @llvm.set.loop.iterations.i32(i32 %8)
+    %start1 = call i32 @llvm.start.loop.iterations.i32(i32 %8)
    br label %do.body.i

  do.body.i:                                        ; preds = %do.body.i, %entry
    %blkCnt.0.i = phi i32 [ %13, %do.body.i ], [ %blockSize, %entry ]
    %sumVec.0.i = phi <4 x float> [ %12, %do.body.i ], [ zeroinitializer, %entry ]
    %pSrc.addr.0.i = phi float* [ %add.ptr.i, %do.body.i ], [ %pSrc, %entry ]
-    %9 = phi i32 [ %8, %entry ], [ %14, %do.body.i ]
+    %9 = phi i32 [ %start1, %entry ], [ %14, %do.body.i ]
    %pSrc.addr.0.i2 = bitcast float* %pSrc.addr.0.i to <4 x float>*
    %10 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %blkCnt.0.i)
    %11 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %pSrc.addr.0.i2, i32 4, <4 x i1> %10, <4 x float> zeroinitializer)
@ -42,14 +44,14 @@
    %18 = insertelement <4 x i32> undef, i32 %17, i64 0
    %19 = shufflevector <4 x i32> %18, <4 x i32> undef, <4 x i32> zeroinitializer
    %20 = bitcast <4 x i32> %19 to <4 x float>
-    call void @llvm.set.loop.iterations.i32(i32 %4)
+    %start2 = call i32 @llvm.start.loop.iterations.i32(i32 %4)
    br label %do.body

  do.body:                                          ; preds = %do.body, %arm_mean_f32_mve.exit
    %blkCnt.0 = phi i32 [ %blockSize, %arm_mean_f32_mve.exit ], [ %26, %do.body ]
    %sumVec.0 = phi <4 x float> [ zeroinitializer, %arm_mean_f32_mve.exit ], [ %25, %do.body ]
    %pSrc.addr.0 = phi float* [ %pSrc, %arm_mean_f32_mve.exit ], [ %add.ptr, %do.body ]
-    %21 = phi i32 [ %4, %arm_mean_f32_mve.exit ], [ %27, %do.body ]
+    %21 = phi i32 [ %start2, %arm_mean_f32_mve.exit ], [ %27, %do.body ]
    %pSrc.addr.01 = bitcast float* %pSrc.addr.0 to <4 x float>*
    %22 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %blkCnt.0)
    %23 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %pSrc.addr.01, i32 4, <4 x i1> %22, <4 x float> zeroinitializer)
@ -87,7 +89,7 @@
  declare <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>, <4 x float>) #1

  ; Function Attrs: noduplicate nounwind
-  declare void @llvm.set.loop.iterations.i32(i32) #3
+  declare i32 @llvm.start.loop.iterations.i32(i32) #3

  ; Function Attrs: noduplicate nounwind
  declare i32 @llvm.loop.decrement.reg.i32(i32, i32) #3
@ -152,32 +154,22 @@ body:             |
  ; CHECK:   frame-setup CFI_INSTRUCTION def_cfa_offset 8
  ; CHECK:   frame-setup CFI_INSTRUCTION offset $lr, -4
  ; CHECK:   frame-setup CFI_INSTRUCTION offset $r4, -8
-  ; CHECK:   $r3 = tMOVr $r1, 14 /* CC::al */, $noreg
-  ; CHECK:   tCMPi8 renamable $r1, 4, 14 /* CC::al */, $noreg, implicit-def $cpsr
-  ; CHECK:   t2IT 10, 8, implicit-def $itstate
-  ; CHECK:   renamable $r3 = tMOVi8 $noreg, 4, 10 /* CC::ge */, killed $cpsr, implicit killed renamable $r3, implicit killed $itstate
-  ; CHECK:   renamable $r12 = t2MOVi 1, 14 /* CC::al */, $noreg, $noreg
-  ; CHECK:   renamable $r3, dead $cpsr = tSUBrr renamable $r1, killed renamable $r3, 14 /* CC::al */, $noreg
  ; CHECK:   renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0
-  ; CHECK:   renamable $r3, dead $cpsr = tADDi8 killed renamable $r3, 3, 14 /* CC::al */, $noreg
-  ; CHECK:   renamable $lr = nuw nsw t2ADDrs killed renamable $r12, killed renamable $r3, 19, 14 /* CC::al */, $noreg, $noreg
  ; CHECK:   $r3 = tMOVr $r1, 14 /* CC::al */, $noreg
  ; CHECK:   $r12 = tMOVr $r0, 14 /* CC::al */, $noreg
-  ; CHECK:   $lr = t2DLS killed renamable $lr
+  ; CHECK:   $lr = MVE_DLSTP_32 killed renamable $r3
  ; CHECK:   $r4 = tMOVr $lr, 14 /* CC::al */, $noreg
  ; CHECK: bb.1.do.body.i:
  ; CHECK:   successors: %bb.1(0x7c000000), %bb.2(0x04000000)
-  ; CHECK:   liveins: $lr, $q0, $r0, $r1, $r2, $r3, $r4, $r12
-  ; CHECK:   renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg
-  ; CHECK:   renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14 /* CC::al */, $noreg
-  ; CHECK:   MVE_VPST 4, implicit $vpr
-  ; CHECK:   renamable $r12, renamable $q1 = MVE_VLDRWU32_post killed renamable $r12, 16, 1, renamable $vpr :: (load 16 from %ir.pSrc.addr.0.i2, align 4)
-  ; CHECK:   renamable $q0 = nnan ninf nsz arcp contract afn reassoc MVE_VADDf32 killed renamable $q0, killed renamable $q1, 1, killed renamable $vpr, killed renamable $q0
-  ; CHECK:   $lr = t2LEUpdate killed renamable $lr, %bb.1
+  ; CHECK:   liveins: $lr, $q0, $r0, $r1, $r2, $r4, $r12
+  ; CHECK:   renamable $r12, renamable $q1 = MVE_VLDRWU32_post killed renamable $r12, 16, 0, $noreg :: (load 16 from %ir.pSrc.addr.0.i2, align 4)
+  ; CHECK:   renamable $q0 = nnan ninf nsz arcp contract afn reassoc MVE_VADDf32 killed renamable $q0, killed renamable $q1, 0, killed $noreg, killed renamable $q0
+  ; CHECK:   $lr = MVE_LETP killed renamable $lr, %bb.1
  ; CHECK: bb.2.arm_mean_f32_mve.exit:
  ; CHECK:   successors: %bb.3(0x80000000)
  ; CHECK:   liveins: $q0, $r0, $r1, $r2, $r4
  ; CHECK:   $s4 = VMOVSR $r1, 14 /* CC::al */, $noreg
+  ; CHECK:   dead $lr = tMOVr $r4, 14 /* CC::al */, $noreg
  ; CHECK:   renamable $s0 = nnan ninf nsz arcp contract afn reassoc VADDS killed renamable $s3, killed renamable $s3, 14 /* CC::al */, $noreg, implicit killed $q0
  ; CHECK:   $lr = t2DLS killed $r4
  ; CHECK:   renamable $s4 = VUITOS killed renamable $s4, 14 /* CC::al */, $noreg
@ -224,7 +216,7 @@ body:             |
    renamable $lr = nuw nsw t2ADDrs killed renamable $r12, killed renamable $r3, 19, 14 /* CC::al */, $noreg, $noreg
    $r3 = tMOVr $r1, 14 /* CC::al */, $noreg
    $r12 = tMOVr $r0, 14 /* CC::al */, $noreg
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr
    $r4 = tMOVr $lr, 14 /* CC::al */, $noreg

  bb.1.do.body.i:
@ -247,7 +239,7 @@ body:             |
    $s4 = VMOVSR $r1, 14 /* CC::al */, $noreg
    $lr = tMOVr $r4, 14 /* CC::al */, $noreg
    renamable $s0 = nnan ninf nsz arcp contract afn reassoc VADDS killed renamable $s3, renamable $s3, 14 /* CC::al */, $noreg, implicit $q0
-    t2DoLoopStart killed $r4
+    $lr = t2DoLoopStart killed $r4
    renamable $s4 = VUITOS killed renamable $s4, 14 /* CC::al */, $noreg
    renamable $s0 = nnan ninf nsz arcp contract afn reassoc VDIVS killed renamable $s0, killed renamable $s4, 14 /* CC::al */, $noreg
    renamable $r3 = VMOVRS killed renamable $s0, 14 /* CC::al */, $noreg
--- a/test/CodeGen/Thumb2/LowOverheadLoops/mov-lr-terminator.mir
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/mov-lr-terminator.mir
@ -14,13 +14,13 @@
    br i1 %cmp9, label %for.cond.cleanup, label %vector.ph

  vector.ph:                                        ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %5)
    br label %vector.body

  vector.body:                                      ; preds = %vector.body, %vector.ph
    %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
    %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-    %6 = phi i32 [ %5, %vector.ph ], [ %13, %vector.body ]
+    %6 = phi i32 [ %start, %vector.ph ], [ %13, %vector.body ]
    %7 = phi i32 [ %div, %vector.ph ], [ %9, %vector.body ]
    %lsr.iv1 = bitcast i32* %lsr.iv to <4 x i32>*
    %8 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %7)
@ -46,7 +46,7 @@
  }
  declare <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>*, i32 immarg, <4 x i1>, <4 x i8>)
  declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>)
-  declare void @llvm.set.loop.iterations.i32(i32)
+  declare i32 @llvm.start.loop.iterations.i32(i32)
  declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
  declare <4 x i1> @llvm.arm.mve.vctp32(i32)
 ...
@ -153,7 +153,7 @@ body:             |
    renamable $r5 = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14, $noreg, $noreg
    renamable $r12 = t2LSRri killed renamable $r3, 1, 14, $noreg, $noreg
    renamable $r3, dead $cpsr = tMOVi8 0, 14, $noreg
-    t2DoLoopStart renamable $r5
+    $lr = t2DoLoopStart renamable $r5
    $lr = tMOVr killed $r5, 14, $noreg

  bb.2.vector.body:
--- a/test/CodeGen/Thumb2/LowOverheadLoops/mov-operand.ll
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/mov-operand.ll
@ -6,35 +6,31 @@ define arm_aapcs_vfpcc void @arm_var_f32_mve(float* %pSrc, i32 %blockSize, float
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r4, lr}
 ; CHECK-NEXT:    push {r4, lr}
-; CHECK-NEXT:    mov r3, r1
+; CHECK-NEXT:    mov r4, r1
 ; CHECK-NEXT:    cmp r1, #4
 ; CHECK-NEXT:    it ge
-; CHECK-NEXT:    movge r3, #4
-; CHECK-NEXT:    mov.w r12, #1
-; CHECK-NEXT:    subs r3, r1, r3
+; CHECK-NEXT:    movge r4, #4
+; CHECK-NEXT:    movs r3, #1
+; CHECK-NEXT:    subs r4, r1, r4
 ; CHECK-NEXT:    vmov.i32 q0, #0x0
-; CHECK-NEXT:    adds r3, #3
-; CHECK-NEXT:    add.w lr, r12, r3, lsr #2
+; CHECK-NEXT:    adds r4, #3
+; CHECK-NEXT:    add.w r12, r3, r4, lsr #2
 ; CHECK-NEXT:    mov r3, r1
-; CHECK-NEXT:    mov r12, r0
-; CHECK-NEXT:    dls lr, lr
-; CHECK-NEXT:    mov r4, lr
+; CHECK-NEXT:    dlstp.32 lr, r3
+; CHECK-NEXT:    mov r4, r0
 ; CHECK-NEXT:  .LBB0_1: @ %do.body.i
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vctp.32 r3
-; CHECK-NEXT:    subs r3, #4
-; CHECK-NEXT:    vpstt
-; CHECK-NEXT:    vldrwt.u32 q1, [r12], #16
-; CHECK-NEXT:    vaddt.f32 q0, q0, q1
-; CHECK-NEXT:    le lr, .LBB0_1
+; CHECK-NEXT:    vldrw.u32 q1, [r4], #16
+; CHECK-NEXT:    vadd.f32 q0, q0, q1
+; CHECK-NEXT:    letp lr, .LBB0_1
 ; CHECK-NEXT:  @ %bb.2: @ %arm_mean_f32_mve.exit
 ; CHECK-NEXT:    vmov s4, r1
+; CHECK-NEXT:    dls lr, r12
 ; CHECK-NEXT:    vadd.f32 s0, s3, s3
 ; CHECK-NEXT:    mov r3, r1
 ; CHECK-NEXT:    vcvt.f32.u32 s4, s4
-; CHECK-NEXT:    dls lr, r4
 ; CHECK-NEXT:    vdiv.f32 s0, s0, s4
-; CHECK-NEXT:    vmov r12, s0
+; CHECK-NEXT:    vmov r4, s0
 ; CHECK-NEXT:    vmov.i32 q0, #0x0
 ; CHECK-NEXT:  .LBB0_3: @ %do.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
@ -42,7 +38,7 @@ define arm_aapcs_vfpcc void @arm_var_f32_mve(float* %pSrc, i32 %blockSize, float
 ; CHECK-NEXT:    subs r3, #4
 ; CHECK-NEXT:    vpsttt
 ; CHECK-NEXT:    vldrwt.u32 q1, [r0], #16
-; CHECK-NEXT:    vsubt.f32 q1, q1, r12
+; CHECK-NEXT:    vsubt.f32 q1, q1, r4
 ; CHECK-NEXT:    vfmat.f32 q0, q1, q1
 ; CHECK-NEXT:    le lr, .LBB0_3
 ; CHECK-NEXT:  @ %bb.4: @ %do.end
--- a/test/CodeGen/Thumb2/LowOverheadLoops/move-def-before-start.mir
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/move-def-before-start.mir
@ -18,13 +18,13 @@
    br i1 %cmp9, label %for.cond.cleanup, label %vector.ph

  vector.ph:                                        ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %5)
    br label %vector.body

  vector.body:                                      ; preds = %vector.body, %vector.ph
    %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
    %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-    %6 = phi i32 [ %5, %vector.ph ], [ %13, %vector.body ]
+    %6 = phi i32 [ %start, %vector.ph ], [ %13, %vector.body ]
    %7 = phi i32 [ %div, %vector.ph ], [ %9, %vector.body ]
    %lsr.iv1 = bitcast i32* %lsr.iv to <4 x i32>*
    %8 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %7)
@ -50,7 +50,7 @@
  }
  declare <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>*, i32 immarg, <4 x i1>, <4 x i8>) #1
  declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) #2
-  declare void @llvm.set.loop.iterations.i32(i32) #3
+  declare i32 @llvm.start.loop.iterations.i32(i32) #3
  declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3
  declare <4 x i1> @llvm.arm.mve.vctp32(i32) #4

@ -169,7 +169,7 @@ body:             |
    renamable $r12 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg
    renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14, $noreg, $noreg
    $r12 = t2MOVr killed $r3, 14, $noreg, $noreg
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr
    renamable $r3, dead $cpsr = tMOVi8 0, 14, $noreg
    renamable $r12 = t2LSRri killed renamable $r12, 1, 14, $noreg, $noreg

--- a/test/CodeGen/Thumb2/LowOverheadLoops/move-start-after-def.mir
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/move-start-after-def.mir
@ -18,13 +18,13 @@
    br i1 %cmp9, label %for.cond.cleanup, label %vector.ph

  vector.ph:                                        ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %5)
    br label %vector.body

  vector.body:                                      ; preds = %vector.body, %vector.ph
    %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
    %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-    %6 = phi i32 [ %5, %vector.ph ], [ %13, %vector.body ]
+    %6 = phi i32 [ %start, %vector.ph ], [ %13, %vector.body ]
    %7 = phi i32 [ %div, %vector.ph ], [ %9, %vector.body ]
    %lsr.iv1 = bitcast i32* %lsr.iv to <4 x i32>*
    %8 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %7)
@ -50,7 +50,7 @@
  }
  declare <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>*, i32 immarg, <4 x i1>, <4 x i8>) #1
  declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) #2
-  declare void @llvm.set.loop.iterations.i32(i32) #3
+  declare i32 @llvm.start.loop.iterations.i32(i32) #3
  declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3
  declare <4 x i1> @llvm.arm.mve.vctp32(i32) #4

@ -168,7 +168,7 @@ body:             |
    renamable $r12 = t2BICri killed renamable $r12, 3, 14, $noreg, $noreg
    renamable $r12 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg
    renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14, $noreg, $noreg
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr
    $r12 = t2MOVr killed $r3, 14, $noreg, $noreg
    renamable $r3, dead $cpsr = tMOVi8 0, 14, $noreg
    renamable $r12 = t2LSRri killed renamable $r12, 1, 14, $noreg, $noreg
--- a/test/CodeGen/Thumb2/LowOverheadLoops/multi-block-cond-iter-count.mir
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/multi-block-cond-iter-count.mir
@ -36,17 +36,17 @@
    br i1 %26, label %49, label %31

  31:                                               ; preds = %23
-    call void @llvm.set.loop.iterations.i32(i32 %30)
+    %start1 = call i32 @llvm.start.loop.iterations.i32(i32 %30)
    br label %65

  32:                                               ; preds = %11
-    call void @llvm.set.loop.iterations.i32(i32 %22)
+    %start2 = call i32 @llvm.start.loop.iterations.i32(i32 %22)
    br label %33

  33:                                               ; preds = %33, %32
    %34 = phi i32* [ %46, %33 ], [ %0, %32 ]
    %35 = phi i32* [ %45, %33 ], [ %1, %32 ]
-    %36 = phi i32 [ %22, %32 ], [ %47, %33 ]
+    %36 = phi i32 [ %start2, %32 ], [ %47, %33 ]
    %37 = phi i32 [ %9, %32 ], [ %41, %33 ]
    %38 = bitcast i32* %34 to <4 x i32>*
    %39 = bitcast i32* %35 to <4 x i32>*
@ -89,7 +89,7 @@
  65:                                               ; preds = %65, %31
    %66 = phi i32 [ %108, %65 ], [ 0, %31 ]
    %67 = phi i32 [ 0, %31 ], [ %107, %65 ]
-    %68 = phi i32 [ %30, %31 ], [ %109, %65 ]
+    %68 = phi i32 [ %start1, %31 ], [ %109, %65 ]
    %69 = bitcast i32* %0 to i8*
    %70 = bitcast i32* %1 to i8*
    %71 = getelementptr i8, i8* %70, i32 %66
@ -141,7 +141,7 @@

  declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) #1
  declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) #2
-  declare void @llvm.set.loop.iterations.i32(i32) #3
+  declare i32 @llvm.start.loop.iterations.i32(i32) #3
  declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3
  declare <4 x i1> @llvm.arm.mve.vctp32(i32) #4

@ -353,7 +353,7 @@ body:             |
    renamable $r2, dead $cpsr = tMOVi8 1, 14, $noreg
    renamable $lr = nuw nsw t2ADDrs killed renamable $r2, killed renamable $r12, 19, 14, $noreg, $noreg
    $r2 = tMOVr $r0, 14, $noreg
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr

  bb.3 (%ir-block.33):
    successors: %bb.3(0x7c000000), %bb.4(0x04000000)
@ -402,7 +402,7 @@ body:             |
    renamable $lr = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r2, 19, 14, $noreg, $noreg
    renamable $r2, dead $cpsr = tMOVi8 0, 14, $noreg
    renamable $r3, dead $cpsr = tMOVi8 0, 14, $noreg
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr

  bb.8 (%ir-block.65):
    successors: %bb.8(0x7c000000), %bb.9(0x04000000)
--- a/test/CodeGen/Thumb2/LowOverheadLoops/multi-cond-iter-count.mir
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/multi-cond-iter-count.mir
@ -18,13 +18,13 @@
    br i1 %10, label %34, label %17

  17:                                               ; preds = %4
-    call void @llvm.set.loop.iterations.i32(i32 %16)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %16)
    br label %18

  18:                                               ; preds = %18, %17
    %19 = phi i32* [ %31, %18 ], [ %0, %17 ]
    %20 = phi i32* [ %30, %18 ], [ %1, %17 ]
-    %21 = phi i32 [ %16, %17 ], [ %32, %18 ]
+    %21 = phi i32 [ %start, %17 ], [ %32, %18 ]
    %22 = phi i32 [ %9, %17 ], [ %26, %18 ]
    %23 = bitcast i32* %19 to <4 x i32>*
    %24 = bitcast i32* %20 to <4 x i32>*
@ -45,7 +45,7 @@
  }
  declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
  declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>)
-  declare void @llvm.set.loop.iterations.i32(i32)
+  declare i32 @llvm.start.loop.iterations.i32(i32)
  declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
  declare <4 x i1> @llvm.arm.mve.vctp32(i32)

@ -143,7 +143,7 @@ body:             |
    renamable $r3, dead $cpsr = tMOVi8 1, 14, $noreg
    renamable $lr = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14, $noreg, $noreg
    $r3 = tMOVr $r0, 14, $noreg
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr

  bb.2 (%ir-block.18):
    successors: %bb.2(0x7c000000), %bb.3(0x04000000)
--- a/test/CodeGen/Thumb2/LowOverheadLoops/multiblock-massive.mir
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/multiblock-massive.mir
@ -8,7 +8,7 @@
    br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader

  for.body.preheader:                               ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %N)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %N)
    br label %for.body

  for.cond.cleanup:                                 ; preds = %for.end, %entry
@ -18,7 +18,7 @@
    %lsr.iv4 = phi i32* [ %b, %for.body.preheader ], [ %scevgep5, %for.end ]
    %lsr.iv2 = phi i32* [ %c, %for.body.preheader ], [ %scevgep3, %for.end ]
    %lsr.iv1 = phi i32* [ %a, %for.body.preheader ], [ %scevgep, %for.end ]
-    %lsr.iv = phi i32 [ %N, %for.body.preheader ], [ %lsr.iv.next, %for.end ]
+    %lsr.iv = phi i32 [ %start, %for.body.preheader ], [ %lsr.iv.next, %for.end ]
    %size = call i32 @llvm.arm.space(i32 3072, i32 undef)
    %0 = load i32, i32* %lsr.iv4, align 4
    %1 = load i32, i32* %lsr.iv2, align 4
@ -46,7 +46,7 @@
  declare i32 @llvm.arm.space(i32 immarg, i32) #0

  ; Function Attrs: noduplicate nounwind
-  declare void @llvm.set.loop.iterations.i32(i32) #1
+  declare i32 @llvm.start.loop.iterations.i32(i32) #1

  ; Function Attrs: noduplicate nounwind
  declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #1
@ -166,7 +166,7 @@ body:             |
    liveins: $r0, $r1, $r2, $r3, $r4, $lr

    $lr = tMOVr $r3, 14, $noreg
-    t2DoLoopStart killed $r3
+    $lr = t2DoLoopStart killed $r3
    tB %bb.2, 14, $noreg

  bb.2.for.end:
--- a/test/CodeGen/Thumb2/LowOverheadLoops/multiple-do-loops.mir
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/multiple-do-loops.mir
@ -14,14 +14,14 @@
    br i1 %cmp30, label %for.cond.cleanup6, label %vector.ph

  vector.ph:                                        ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %5)
+    %start1 = call i32 @llvm.start.loop.iterations.i32(i32 %5)
    br label %vector.body

  vector.body:                                      ; preds = %vector.body, %vector.ph
    %lsr.iv68 = phi i32* [ %scevgep69, %vector.body ], [ %a, %vector.ph ]
    %lsr.iv65 = phi i32* [ %scevgep66, %vector.body ], [ %c, %vector.ph ]
    %lsr.iv62 = phi i32* [ %scevgep63, %vector.body ], [ %b, %vector.ph ]
-    %6 = phi i32 [ %5, %vector.ph ], [ %11, %vector.body ]
+    %6 = phi i32 [ %start1, %vector.ph ], [ %11, %vector.body ]
    %7 = phi i32 [ %N, %vector.ph ], [ %9, %vector.body ]
    %lsr.iv6870 = bitcast i32* %lsr.iv68 to <4 x i32>*
    %lsr.iv6567 = bitcast i32* %lsr.iv65 to <4 x i32>*
@ -50,14 +50,14 @@
    br i1 %13, label %for.cond.cleanup6, label %vector.ph39

  vector.ph39:                                      ; preds = %for.cond4.preheader
-    call void @llvm.set.loop.iterations.i32(i32 %19)
+    %start2 = call i32 @llvm.start.loop.iterations.i32(i32 %19)
    br label %vector.body38

  vector.body38:                                    ; preds = %vector.body38, %vector.ph39
    %lsr.iv59 = phi i32* [ %scevgep60, %vector.body38 ], [ %a, %vector.ph39 ]
    %lsr.iv56 = phi i32* [ %scevgep57, %vector.body38 ], [ %c, %vector.ph39 ]
    %lsr.iv = phi i32* [ %scevgep, %vector.body38 ], [ %b, %vector.ph39 ]
-    %20 = phi i32 [ %19, %vector.ph39 ], [ %26, %vector.body38 ]
+    %20 = phi i32 [ %start2, %vector.ph39 ], [ %26, %vector.body38 ]
    %21 = phi i32 [ %N, %vector.ph39 ], [ %23, %vector.body38 ]
    %lsr.iv5961 = bitcast i32* %lsr.iv59 to <4 x i32>*
    %lsr.iv5658 = bitcast i32* %lsr.iv56 to <4 x i32>*
@ -94,14 +94,14 @@
    br i1 %cmp30, label %for.cond4.preheader, label %vector.ph

  vector.ph:                                        ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %5)
+    %start1 = call i32 @llvm.start.loop.iterations.i32(i32 %5)
    br label %vector.body

  vector.body:                                      ; preds = %vector.body, %vector.ph
    %lsr.iv68 = phi i32* [ %scevgep69, %vector.body ], [ %a, %vector.ph ]
    %lsr.iv65 = phi i32* [ %scevgep66, %vector.body ], [ %c, %vector.ph ]
    %lsr.iv62 = phi i32* [ %scevgep63, %vector.body ], [ %b, %vector.ph ]
-    %6 = phi i32 [ %5, %vector.ph ], [ %11, %vector.body ]
+    %6 = phi i32 [ %start1, %vector.ph ], [ %11, %vector.body ]
    %7 = phi i32 [ %div, %vector.ph ], [ %9, %vector.body ]
    %lsr.iv6870 = bitcast i32* %lsr.iv68 to <4 x i32>*
    %lsr.iv6567 = bitcast i32* %lsr.iv65 to <4 x i32>*
@ -130,14 +130,14 @@
    br i1 %cmp528, label %for.cond.cleanup6, label %vector.ph39

  vector.ph39:                                      ; preds = %for.cond4.preheader
-    call void @llvm.set.loop.iterations.i32(i32 %18)
+    %start2 = call i32 @llvm.start.loop.iterations.i32(i32 %18)
    br label %vector.body38

  vector.body38:                                    ; preds = %vector.body38, %vector.ph39
    %lsr.iv59 = phi i32* [ %scevgep60, %vector.body38 ], [ %a, %vector.ph39 ]
    %lsr.iv56 = phi i32* [ %scevgep57, %vector.body38 ], [ %c, %vector.ph39 ]
    %lsr.iv = phi i32* [ %scevgep, %vector.body38 ], [ %b, %vector.ph39 ]
-    %19 = phi i32 [ %18, %vector.ph39 ], [ %25, %vector.body38 ]
+    %19 = phi i32 [ %start2, %vector.ph39 ], [ %25, %vector.body38 ]
    %20 = phi i32 [ %N, %vector.ph39 ], [ %22, %vector.body38 ]
    %lsr.iv5961 = bitcast i32* %lsr.iv59 to <4 x i32>*
    %lsr.iv5658 = bitcast i32* %lsr.iv56 to <4 x i32>*
@ -173,14 +173,14 @@
    br i1 %cmp54, label %for.cond.cleanup17, label %vector.ph

  vector.ph:                                        ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %5)
+    %start1 = call i32 @llvm.start.loop.iterations.i32(i32 %5)
    br label %vector.body

  vector.body:                                      ; preds = %vector.body, %vector.ph
    %lsr.iv123 = phi i32* [ %scevgep124, %vector.body ], [ %a, %vector.ph ]
    %lsr.iv120 = phi i32* [ %scevgep121, %vector.body ], [ %c, %vector.ph ]
    %lsr.iv117 = phi i32* [ %scevgep118, %vector.body ], [ %b, %vector.ph ]
-    %6 = phi i32 [ %5, %vector.ph ], [ %11, %vector.body ]
+    %6 = phi i32 [ %start1, %vector.ph ], [ %11, %vector.body ]
    %7 = phi i32 [ %N, %vector.ph ], [ %9, %vector.body ]
    %lsr.iv123125 = bitcast i32* %lsr.iv123 to <4 x i32>*
    %lsr.iv120122 = bitcast i32* %lsr.iv120 to <4 x i32>*
@ -210,14 +210,14 @@
    br i1 %cmp552, label %for.cond15.preheader, label %vector.ph66

  vector.ph66:                                      ; preds = %for.cond4.preheader
-    call void @llvm.set.loop.iterations.i32(i32 %18)
+    %start2 = call i32 @llvm.start.loop.iterations.i32(i32 %18)
    br label %vector.body65

  vector.body65:                                    ; preds = %vector.body65, %vector.ph66
    %lsr.iv114 = phi i32* [ %scevgep115, %vector.body65 ], [ %a, %vector.ph66 ]
    %lsr.iv111 = phi i32* [ %scevgep112, %vector.body65 ], [ %c, %vector.ph66 ]
    %lsr.iv108 = phi i32* [ %scevgep109, %vector.body65 ], [ %b, %vector.ph66 ]
-    %19 = phi i32 [ %18, %vector.ph66 ], [ %25, %vector.body65 ]
+    %19 = phi i32 [ %start2, %vector.ph66 ], [ %25, %vector.body65 ]
    %20 = phi i32 [ %div, %vector.ph66 ], [ %22, %vector.body65 ]
    %lsr.iv114116 = bitcast i32* %lsr.iv114 to <4 x i32>*
    %lsr.iv111113 = bitcast i32* %lsr.iv111 to <4 x i32>*
@ -248,14 +248,14 @@
    br i1 %27, label %for.cond.cleanup17, label %vector.ph85

  vector.ph85:                                      ; preds = %for.cond15.preheader
-    call void @llvm.set.loop.iterations.i32(i32 %33)
+    %start3 = call i32 @llvm.start.loop.iterations.i32(i32 %33)
    br label %vector.body84

  vector.body84:                                    ; preds = %vector.body84, %vector.ph85
    %lsr.iv105 = phi i32* [ %scevgep106, %vector.body84 ], [ %a, %vector.ph85 ]
    %lsr.iv102 = phi i32* [ %scevgep103, %vector.body84 ], [ %c, %vector.ph85 ]
    %lsr.iv = phi i32* [ %scevgep, %vector.body84 ], [ %b, %vector.ph85 ]
-    %34 = phi i32 [ %33, %vector.ph85 ], [ %40, %vector.body84 ]
+    %34 = phi i32 [ %start3, %vector.ph85 ], [ %40, %vector.body84 ]
    %35 = phi i32 [ %N, %vector.ph85 ], [ %37, %vector.body84 ]
    %lsr.iv105107 = bitcast i32* %lsr.iv105 to <4 x i32>*
    %lsr.iv102104 = bitcast i32* %lsr.iv102 to <4 x i32>*
@ -280,7 +280,7 @@
  }
  declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
  declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>)
-  declare void @llvm.set.loop.iterations.i32(i32)
+  declare i32 @llvm.start.loop.iterations.i32(i32)
  declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
  declare <4 x i1> @llvm.arm.mve.vctp32(i32)

@ -431,7 +431,7 @@ body:             |
    $r4 = tMOVr $r3, 14, $noreg
    renamable $lr = nuw nsw t2ADDrs killed renamable $r6, renamable $r12, 19, 14, $noreg, $noreg
    $r6 = tMOVr $r1, 14, $noreg
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr

  bb.2.vector.body:
    successors: %bb.2(0x7c000000), %bb.3(0x04000000)
@ -462,7 +462,7 @@ body:             |
    renamable $r6, dead $cpsr = tMOVi8 1, 14, $noreg
    renamable $lr = nuw nsw t2ADDrs killed renamable $r6, killed renamable $r12, 19, 14, $noreg, $noreg
    $r12 = tMOVr $r0, 14, $noreg
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr

  bb.5.vector.body38:
    successors: %bb.5(0x7c000000), %bb.6(0x04000000)
@ -637,7 +637,7 @@ body:             |
    renamable $r6, dead $cpsr = tSUBi8 killed renamable $r6, 4, 14, $noreg
    renamable $lr = nuw nsw t2ADDrs renamable $r12, killed renamable $r6, 19, 14, $noreg, $noreg
    $r6 = tMOVr $r2, 14, $noreg
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr

  bb.2.vector.body:
    successors: %bb.2(0x7c000000), %bb.3(0x04000000)
@ -670,7 +670,7 @@ body:             |
    renamable $r6 = t2BICri killed renamable $r6, 3, 14, $noreg, $noreg
    renamable $r6, dead $cpsr = tSUBi8 killed renamable $r6, 4, 14, $noreg
    renamable $lr = nuw nsw t2ADDrs killed renamable $r12, killed renamable $r6, 19, 14, $noreg, $noreg
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr

  bb.5.vector.body38:
    successors: %bb.5(0x7c000000), %bb.6(0x04000000)
@ -878,7 +878,7 @@ body:             |
    $r4 = tMOVr $r3, 14, $noreg
    renamable $lr = nuw nsw t2ADDrs killed renamable $r6, renamable $r12, 19, 14, $noreg, $noreg
    $r6 = tMOVr $r1, 14, $noreg
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr

  bb.2.vector.body:
    successors: %bb.2(0x7c000000), %bb.3(0x04000000)
@ -919,7 +919,7 @@ body:             |
    $r4 = tMOVr $r1, 14, $noreg
    renamable $lr = nuw nsw t2ADDrs renamable $r8, killed renamable $r6, 19, 14, $noreg, $noreg
    $r6 = tMOVr $r0, 14, $noreg
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr

  bb.5.vector.body65:
    successors: %bb.5(0x7c000000), %bb.6(0x04000000)
@ -952,7 +952,7 @@ body:             |

    renamable $lr = nuw nsw t2ADDrs killed renamable $r8, killed renamable $r12, 19, 14, $noreg, $noreg
    $r5 = tMOVr $r0, 14, $noreg
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr

  bb.8.vector.body84:
    successors: %bb.8(0x7c000000), %bb.9(0x04000000)
--- a/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll
@ -92,9 +92,9 @@ define arm_aapcs_vfpcc void @float_float_mul(float* nocapture readonly %a, float
 ; CHECK-NEXT:    sub.w r7, r12, #4
 ; CHECK-NEXT:    mov r4, r0
 ; CHECK-NEXT:    mov r5, r1
-; CHECK-NEXT:    add.w lr, r6, r7, lsr #2
+; CHECK-NEXT:    add.w r7, r6, r7, lsr #2
 ; CHECK-NEXT:    mov r6, r2
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    dls lr, r7
 ; CHECK-NEXT:  .LBB0_12: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrw.u32 q0, [r5], #16
@ -311,9 +311,9 @@ define arm_aapcs_vfpcc void @float_float_add(float* nocapture readonly %a, float
 ; CHECK-NEXT:    sub.w r7, r12, #4
 ; CHECK-NEXT:    mov r4, r0
 ; CHECK-NEXT:    mov r5, r1
-; CHECK-NEXT:    add.w lr, r6, r7, lsr #2
+; CHECK-NEXT:    add.w r7, r6, r7, lsr #2
 ; CHECK-NEXT:    mov r6, r2
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    dls lr, r7
 ; CHECK-NEXT:  .LBB1_12: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrw.u32 q0, [r5], #16
@ -530,9 +530,9 @@ define arm_aapcs_vfpcc void @float_float_sub(float* nocapture readonly %a, float
 ; CHECK-NEXT:    sub.w r7, r12, #4
 ; CHECK-NEXT:    mov r4, r0
 ; CHECK-NEXT:    mov r5, r1
-; CHECK-NEXT:    add.w lr, r6, r7, lsr #2
+; CHECK-NEXT:    add.w r7, r6, r7, lsr #2
 ; CHECK-NEXT:    mov r6, r2
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    dls lr, r7
 ; CHECK-NEXT:  .LBB2_12: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrw.u32 q0, [r5], #16
@ -680,9 +680,9 @@ define arm_aapcs_vfpcc void @float_int_mul(float* nocapture readonly %a, i32* no
 ; CHECK-NEXT:    sub.w r7, r12, #4
 ; CHECK-NEXT:    mov r4, r0
 ; CHECK-NEXT:    mov r5, r1
-; CHECK-NEXT:    add.w lr, r6, r7, lsr #2
+; CHECK-NEXT:    add.w r7, r6, r7, lsr #2
 ; CHECK-NEXT:    mov r6, r2
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    dls lr, r7
 ; CHECK-NEXT:  .LBB3_4: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrw.u32 q0, [r5], #16
@ -889,10 +889,10 @@ define arm_aapcs_vfpcc void @float_int_int_mul(i32* nocapture readonly %a, i32*
 ; CHECK-NEXT:    movs r5, #1
 ; CHECK-NEXT:    sub.w r6, r12, #4
 ; CHECK-NEXT:    mov r4, r0
-; CHECK-NEXT:    add.w lr, r5, r6, lsr #2
+; CHECK-NEXT:    add.w r6, r5, r6, lsr #2
 ; CHECK-NEXT:    mov r5, r1
+; CHECK-NEXT:    dls lr, r6
 ; CHECK-NEXT:    mov r6, r2
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB4_4: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrw.u32 q0, [r4], #16
@ -906,11 +906,11 @@ define arm_aapcs_vfpcc void @float_int_int_mul(i32* nocapture readonly %a, i32*
 ; CHECK-NEXT:    it eq
 ; CHECK-NEXT:    popeq {r4, r5, r6, pc}
 ; CHECK-NEXT:  .LBB4_6: @ %for.body.preheader11
-; CHECK-NEXT:    sub.w lr, r3, r12
+; CHECK-NEXT:    sub.w r3, r3, r12
 ; CHECK-NEXT:    add.w r0, r0, r12, lsl #2
 ; CHECK-NEXT:    add.w r1, r1, r12, lsl #2
 ; CHECK-NEXT:    add.w r2, r2, r12, lsl #2
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    dls lr, r3
 ; CHECK-NEXT:  .LBB4_7: @ %for.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldr r3, [r0], #4
@ -994,10 +994,10 @@ define arm_aapcs_vfpcc void @half_half_mul(half* nocapture readonly %a, half* no
 ; CHECK-NEXT:    movs r5, #1
 ; CHECK-NEXT:    sub.w r6, r12, #4
 ; CHECK-NEXT:    mov r4, r0
-; CHECK-NEXT:    add.w lr, r5, r6, lsr #2
+; CHECK-NEXT:    add.w r6, r5, r6, lsr #2
 ; CHECK-NEXT:    mov r5, r1
+; CHECK-NEXT:    dls lr, r6
 ; CHECK-NEXT:    mov r6, r2
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB5_4: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldr.w r9, [r4]
@ -1021,11 +1021,11 @@ define arm_aapcs_vfpcc void @half_half_mul(half* nocapture readonly %a, half* no
 ; CHECK-NEXT:    cmp r12, r3
 ; CHECK-NEXT:    beq .LBB5_8
 ; CHECK-NEXT:  .LBB5_6: @ %for.body.preheader11
-; CHECK-NEXT:    sub.w lr, r3, r12
+; CHECK-NEXT:    sub.w r3, r3, r12
 ; CHECK-NEXT:    add.w r0, r0, r12, lsl #1
 ; CHECK-NEXT:    add.w r1, r1, r12, lsl #1
 ; CHECK-NEXT:    add.w r2, r2, r12, lsl #2
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    dls lr, r3
 ; CHECK-NEXT:  .LBB5_7: @ %for.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldr.16 s0, [r1]
@ -1111,10 +1111,10 @@ define arm_aapcs_vfpcc void @half_half_add(half* nocapture readonly %a, half* no
 ; CHECK-NEXT:    movs r5, #1
 ; CHECK-NEXT:    sub.w r6, r12, #4
 ; CHECK-NEXT:    mov r4, r0
-; CHECK-NEXT:    add.w lr, r5, r6, lsr #2
+; CHECK-NEXT:    add.w r6, r5, r6, lsr #2
 ; CHECK-NEXT:    mov r5, r1
+; CHECK-NEXT:    dls lr, r6
 ; CHECK-NEXT:    mov r6, r2
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB6_4: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldr.w r9, [r4]
@ -1138,11 +1138,11 @@ define arm_aapcs_vfpcc void @half_half_add(half* nocapture readonly %a, half* no
 ; CHECK-NEXT:    cmp r12, r3
 ; CHECK-NEXT:    beq .LBB6_8
 ; CHECK-NEXT:  .LBB6_6: @ %for.body.preheader11
-; CHECK-NEXT:    sub.w lr, r3, r12
+; CHECK-NEXT:    sub.w r3, r3, r12
 ; CHECK-NEXT:    add.w r0, r0, r12, lsl #1
 ; CHECK-NEXT:    add.w r1, r1, r12, lsl #1
 ; CHECK-NEXT:    add.w r2, r2, r12, lsl #2
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    dls lr, r3
 ; CHECK-NEXT:  .LBB6_7: @ %for.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldr.16 s0, [r1]
@ -1228,10 +1228,10 @@ define arm_aapcs_vfpcc void @half_half_sub(half* nocapture readonly %a, half* no
 ; CHECK-NEXT:    movs r5, #1
 ; CHECK-NEXT:    sub.w r6, r12, #4
 ; CHECK-NEXT:    mov r4, r0
-; CHECK-NEXT:    add.w lr, r5, r6, lsr #2
+; CHECK-NEXT:    add.w r6, r5, r6, lsr #2
 ; CHECK-NEXT:    mov r5, r1
+; CHECK-NEXT:    dls lr, r6
 ; CHECK-NEXT:    mov r6, r2
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB7_4: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldr.w r9, [r4]
@ -1255,11 +1255,11 @@ define arm_aapcs_vfpcc void @half_half_sub(half* nocapture readonly %a, half* no
 ; CHECK-NEXT:    cmp r12, r3
 ; CHECK-NEXT:    beq .LBB7_8
 ; CHECK-NEXT:  .LBB7_6: @ %for.body.preheader11
-; CHECK-NEXT:    sub.w lr, r3, r12
+; CHECK-NEXT:    sub.w r3, r3, r12
 ; CHECK-NEXT:    add.w r0, r0, r12, lsl #1
 ; CHECK-NEXT:    add.w r1, r1, r12, lsl #1
 ; CHECK-NEXT:    add.w r2, r2, r12, lsl #2
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    dls lr, r3
 ; CHECK-NEXT:  .LBB7_7: @ %for.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldr.16 s0, [r1]
@ -1345,10 +1345,10 @@ define arm_aapcs_vfpcc void @half_short_mul(half* nocapture readonly %a, i16* no
 ; CHECK-NEXT:    movs r5, #1
 ; CHECK-NEXT:    sub.w r6, r12, #4
 ; CHECK-NEXT:    mov r4, r0
-; CHECK-NEXT:    add.w lr, r5, r6, lsr #2
+; CHECK-NEXT:    add.w r6, r5, r6, lsr #2
 ; CHECK-NEXT:    mov r5, r1
+; CHECK-NEXT:    dls lr, r6
 ; CHECK-NEXT:    mov r6, r2
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB8_4: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrh.u32 q0, [r5], #8
@ -1377,11 +1377,11 @@ define arm_aapcs_vfpcc void @half_short_mul(half* nocapture readonly %a, i16* no
 ; CHECK-NEXT:    cmp r12, r3
 ; CHECK-NEXT:    beq .LBB8_8
 ; CHECK-NEXT:  .LBB8_6: @ %for.body.preheader13
-; CHECK-NEXT:    sub.w lr, r3, r12
+; CHECK-NEXT:    sub.w r3, r3, r12
 ; CHECK-NEXT:    add.w r0, r0, r12, lsl #1
 ; CHECK-NEXT:    add.w r1, r1, r12, lsl #1
 ; CHECK-NEXT:    add.w r2, r2, r12, lsl #2
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    dls lr, r3
 ; CHECK-NEXT:  .LBB8_7: @ %for.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldrsh r3, [r1], #2
@ -1476,9 +1476,9 @@ define arm_aapcs_vfpcc float @half_half_mac(half* nocapture readonly %a, half* n
 ; CHECK-NEXT:    subs r2, #4
 ; CHECK-NEXT:    vldr s0, .LCPI9_0
 ; CHECK-NEXT:    mov.w r12, #0
-; CHECK-NEXT:    add.w lr, r3, r2, lsr #2
+; CHECK-NEXT:    add.w r2, r3, r2, lsr #2
 ; CHECK-NEXT:    movs r3, #0
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    dls lr, r2
 ; CHECK-NEXT:  .LBB9_5: @ %for.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    adds r4, r0, r3
@ -1633,9 +1633,9 @@ define arm_aapcs_vfpcc float @half_half_acc(half* nocapture readonly %a, half* n
 ; CHECK-NEXT:    subs r2, #4
 ; CHECK-NEXT:    vldr s0, .LCPI10_0
 ; CHECK-NEXT:    mov.w r12, #0
-; CHECK-NEXT:    add.w lr, r3, r2, lsr #2
+; CHECK-NEXT:    add.w r2, r3, r2, lsr #2
 ; CHECK-NEXT:    movs r3, #0
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    dls lr, r2
 ; CHECK-NEXT:  .LBB10_5: @ %for.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    adds r4, r0, r3
@ -1790,10 +1790,10 @@ define arm_aapcs_vfpcc float @half_short_mac(half* nocapture readonly %a, i16* n
 ; CHECK-NEXT:    subs r2, #4
 ; CHECK-NEXT:    vldr s0, .LCPI11_0
 ; CHECK-NEXT:    mov.w r12, #0
-; CHECK-NEXT:    add.w lr, r3, r2, lsr #2
+; CHECK-NEXT:    add.w r2, r3, r2, lsr #2
 ; CHECK-NEXT:    adds r3, r1, #4
+; CHECK-NEXT:    dls lr, r2
 ; CHECK-NEXT:    adds r2, r0, #4
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB11_5: @ %for.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldrsh.w r4, [r3, #2]
--- a/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll
@ -15,9 +15,9 @@ define arm_aapcs_vfpcc i32 @test_acc_scalar_char(i8 zeroext %a, i8* nocapture re
 ; CHECK-NEXT:    bic r3, r3, #3
 ; CHECK-NEXT:    sub.w r12, r3, #4
 ; CHECK-NEXT:    movs r3, #1
-; CHECK-NEXT:    add.w lr, r3, r12, lsr #2
+; CHECK-NEXT:    add.w r3, r3, r12, lsr #2
+; CHECK-NEXT:    dls lr, r3
 ; CHECK-NEXT:    movs r3, #0
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB0_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vctp.32 r2
@ -91,9 +91,9 @@ define arm_aapcs_vfpcc i32 @test_acc_scalar_short(i16 signext %a, i16* nocapture
 ; CHECK-NEXT:    bic r3, r3, #3
 ; CHECK-NEXT:    sub.w r12, r3, #4
 ; CHECK-NEXT:    movs r3, #1
-; CHECK-NEXT:    add.w lr, r3, r12, lsr #2
+; CHECK-NEXT:    add.w r3, r3, r12, lsr #2
+; CHECK-NEXT:    dls lr, r3
 ; CHECK-NEXT:    movs r3, #0
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB1_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vctp.32 r2
@ -167,9 +167,9 @@ define arm_aapcs_vfpcc i32 @test_acc_scalar_uchar(i8 zeroext %a, i8* nocapture r
 ; CHECK-NEXT:    bic r3, r3, #3
 ; CHECK-NEXT:    sub.w r12, r3, #4
 ; CHECK-NEXT:    movs r3, #1
-; CHECK-NEXT:    add.w lr, r3, r12, lsr #2
+; CHECK-NEXT:    add.w r3, r3, r12, lsr #2
+; CHECK-NEXT:    dls lr, r3
 ; CHECK-NEXT:    movs r3, #0
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB2_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vctp.32 r2
@ -243,9 +243,9 @@ define arm_aapcs_vfpcc i32 @test_acc_scalar_ushort(i16 signext %a, i16* nocaptur
 ; CHECK-NEXT:    bic r3, r3, #3
 ; CHECK-NEXT:    sub.w r12, r3, #4
 ; CHECK-NEXT:    movs r3, #1
-; CHECK-NEXT:    add.w lr, r3, r12, lsr #2
+; CHECK-NEXT:    add.w r3, r3, r12, lsr #2
+; CHECK-NEXT:    dls lr, r3
 ; CHECK-NEXT:    movs r3, #0
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB3_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vctp.32 r2
@ -319,9 +319,9 @@ define arm_aapcs_vfpcc i32 @test_acc_scalar_int(i32 %a, i32* nocapture readonly
 ; CHECK-NEXT:    bic r3, r3, #3
 ; CHECK-NEXT:    sub.w r12, r3, #4
 ; CHECK-NEXT:    movs r3, #1
-; CHECK-NEXT:    add.w lr, r3, r12, lsr #2
+; CHECK-NEXT:    add.w r3, r3, r12, lsr #2
+; CHECK-NEXT:    dls lr, r3
 ; CHECK-NEXT:    movs r3, #0
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB4_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vctp.32 r2
@ -430,10 +430,10 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_char(i8* nocapture readonly
 ; CHECK-NEXT:    add.w r4, r3, #8
 ; CHECK-NEXT:    subs r5, #4
 ; CHECK-NEXT:    mov.w r12, #0
-; CHECK-NEXT:    add.w lr, r6, r5, lsr #2
+; CHECK-NEXT:    add.w r6, r6, r5, lsr #2
 ; CHECK-NEXT:    adds r5, r0, #3
+; CHECK-NEXT:    dls lr, r6
 ; CHECK-NEXT:    adds r6, r1, #1
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB5_7: @ %for.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldrb r8, [r5, #-3]
@ -624,8 +624,8 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_short(i16* nocapture readon
 ; CHECK-NEXT:    it eq
 ; CHECK-NEXT:    popeq {r4, pc}
 ; CHECK-NEXT:  .LBB6_1: @ %vector.ph
-; CHECK-NEXT:    movs r4, #0
 ; CHECK-NEXT:    dlstp.32 lr, r12
+; CHECK-NEXT:    movs r4, #0
 ; CHECK-NEXT:  .LBB6_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    adds r4, #4
@ -732,10 +732,10 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_uchar(i8* nocapture readonl
 ; CHECK-NEXT:    add.w r4, r3, #8
 ; CHECK-NEXT:    subs r5, #4
 ; CHECK-NEXT:    mov.w r12, #0
-; CHECK-NEXT:    add.w lr, r6, r5, lsr #2
+; CHECK-NEXT:    add.w r6, r6, r5, lsr #2
 ; CHECK-NEXT:    adds r5, r0, #3
+; CHECK-NEXT:    dls lr, r6
 ; CHECK-NEXT:    adds r6, r1, #1
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB7_7: @ %for.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldrb r8, [r5, #-3]
@ -926,8 +926,8 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_ushort(i16* nocapture reado
 ; CHECK-NEXT:    it eq
 ; CHECK-NEXT:    popeq {r4, pc}
 ; CHECK-NEXT:  .LBB8_1: @ %vector.ph
-; CHECK-NEXT:    movs r4, #0
 ; CHECK-NEXT:    dlstp.32 lr, r12
+; CHECK-NEXT:    movs r4, #0
 ; CHECK-NEXT:  .LBB8_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    adds r4, #4
@ -1034,10 +1034,10 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_int(i32* nocapture readonly
 ; CHECK-NEXT:    add.w r4, r3, #8
 ; CHECK-NEXT:    subs r5, #4
 ; CHECK-NEXT:    mov.w r12, #0
-; CHECK-NEXT:    add.w lr, r6, r5, lsr #2
+; CHECK-NEXT:    add.w r6, r6, r5, lsr #2
 ; CHECK-NEXT:    add.w r5, r0, #8
+; CHECK-NEXT:    dls lr, r6
 ; CHECK-NEXT:    add.w r6, r1, #8
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB9_7: @ %for.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldr r8, [r5, #-8]
@ -1214,8 +1214,8 @@ define dso_local arm_aapcs_vfpcc void @test_v8i8_to_v8i16(i16* noalias nocapture
 ; CHECK-NEXT:    it eq
 ; CHECK-NEXT:    popeq {r7, pc}
 ; CHECK-NEXT:  .LBB10_1: @ %vector.ph
-; CHECK-NEXT:    mov.w r12, #0
 ; CHECK-NEXT:    dlstp.16 lr, r3
+; CHECK-NEXT:    mov.w r12, #0
 ; CHECK-NEXT:  .LBB10_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    add.w r12, r12, #8
--- a/test/CodeGen/Thumb2/LowOverheadLoops/nested.ll
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/nested.ll
@ -12,47 +12,47 @@ define void @mat_vec_sext_i16(i16** nocapture readonly %A, i16* nocapture readon
 ; CHECK-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = add i32 [[N]], -1
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT28:%.*]] = insertelement <4 x i32> undef, i32 [[TRIP_COUNT_MINUS_1]], i32 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT29:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT28]], <4 x i32> undef, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP:%.*]] = add i32 [[N_VEC]], -4
-; CHECK-NEXT:    [[TMP1:%.*]] = lshr i32 [[TMP]], 2
-; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw i32 [[TMP1]], 1
+; CHECK-NEXT:    [[TT:%.*]] = add i32 [[N_VEC]], -4
+; CHECK-NEXT:    [[TT1:%.*]] = lshr i32 [[TT]], 2
+; CHECK-NEXT:    [[TT2:%.*]] = add nuw nsw i32 [[TT1]], 1
 ; CHECK-NEXT:    br label [[FOR_COND1_PREHEADER_US:%.*]]
 ; CHECK:       for.cond1.preheader.us:
 ; CHECK-NEXT:    [[I_025_US:%.*]] = phi i32 [ [[INC10_US:%.*]], [[MIDDLE_BLOCK:%.*]] ], [ 0, [[FOR_COND1_PREHEADER_US_PREHEADER]] ]
 ; CHECK-NEXT:    [[ARRAYIDX_US:%.*]] = getelementptr inbounds i16*, i16** [[A:%.*]], i32 [[I_025_US]]
-; CHECK-NEXT:    [[TMP3:%.*]] = load i16*, i16** [[ARRAYIDX_US]], align 4
+; CHECK-NEXT:    [[TT3:%.*]] = load i16*, i16** [[ARRAYIDX_US]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX8_US:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i32 [[I_025_US]]
 ; CHECK-NEXT:    [[ARRAYIDX8_PROMOTED_US:%.*]] = load i32, i32* [[ARRAYIDX8_US]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> <i32 undef, i32 0, i32 0, i32 0>, i32 [[ARRAYIDX8_PROMOTED_US]], i32 0
-; CHECK-NEXT:    call void @llvm.set.loop.iterations.i32(i32 [[TMP2]])
+; CHECK-NEXT:    [[TT4:%.*]] = insertelement <4 x i32> <i32 undef, i32 0, i32 0, i32 0>, i32 [[ARRAYIDX8_PROMOTED_US]], i32 0
+; CHECK-NEXT:    [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[TT2]])
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[FOR_COND1_PREHEADER_US]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ [[TMP4]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP5:%.*]] = phi i32 [ [[TMP2]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ [[TT4]], [[FOR_COND1_PREHEADER_US]] ], [ [[TT14:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TT5:%.*]] = phi i32 [ [[START]], [[FOR_COND1_PREHEADER_US]] ], [ [[TT15:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = phi i32 [ [[N]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[TMP3]], i32 [[INDEX]]
+; CHECK-NEXT:    [[TT6:%.*]] = getelementptr inbounds i16, i16* [[TT3]], i32 [[INDEX]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP0]])
 ; CHECK-NEXT:    [[TMP2]] = sub i32 [[TMP0]], 4
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i16* [[TMP6]] to <4 x i16>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* [[TMP8]], i32 2, <4 x i1> [[TMP1]], <4 x i16> undef)
-; CHECK-NEXT:    [[TMP9:%.*]] = sext <4 x i16> [[WIDE_MASKED_LOAD]] to <4 x i32>
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i16, i16* [[B:%.*]], i32 [[INDEX]]
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i16* [[TMP10]] to <4 x i16>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD30:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* [[TMP11]], i32 2, <4 x i1> [[TMP1]], <4 x i16> undef)
-; CHECK-NEXT:    [[TMP12:%.*]] = sext <4 x i16> [[WIDE_MASKED_LOAD30]] to <4 x i32>
-; CHECK-NEXT:    [[TMP13:%.*]] = mul nsw <4 x i32> [[TMP12]], [[TMP9]]
-; CHECK-NEXT:    [[TMP14]] = add nsw <4 x i32> [[TMP13]], [[VEC_PHI]]
+; CHECK-NEXT:    [[TT8:%.*]] = bitcast i16* [[TT6]] to <4 x i16>*
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* [[TT8]], i32 2, <4 x i1> [[TMP1]], <4 x i16> undef)
+; CHECK-NEXT:    [[TT9:%.*]] = sext <4 x i16> [[WIDE_MASKED_LOAD]] to <4 x i32>
+; CHECK-NEXT:    [[TT10:%.*]] = getelementptr inbounds i16, i16* [[B:%.*]], i32 [[INDEX]]
+; CHECK-NEXT:    [[TT11:%.*]] = bitcast i16* [[TT10]] to <4 x i16>*
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD30:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* [[TT11]], i32 2, <4 x i1> [[TMP1]], <4 x i16> undef)
+; CHECK-NEXT:    [[TT12:%.*]] = sext <4 x i16> [[WIDE_MASKED_LOAD30]] to <4 x i32>
+; CHECK-NEXT:    [[TT13:%.*]] = mul nsw <4 x i32> [[TT12]], [[TT9]]
+; CHECK-NEXT:    [[TT14]] = add nsw <4 x i32> [[TT13]], [[VEC_PHI]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP15]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP5]], i32 1)
-; CHECK-NEXT:    [[TMP16:%.*]] = icmp ne i32 [[TMP15]], 0
-; CHECK-NEXT:    br i1 [[TMP16]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK]]
+; CHECK-NEXT:    [[TT15]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TT5]], i32 1)
+; CHECK-NEXT:    [[TT16:%.*]] = icmp ne i32 [[TT15]], 0
+; CHECK-NEXT:    br i1 [[TT16]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP17:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[TMP14]], <4 x i32> [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP18:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP17]])
-; CHECK-NEXT:    store i32 [[TMP18]], i32* [[ARRAYIDX8_US]], align 4
+; CHECK-NEXT:    [[TT17:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[TT14]], <4 x i32> [[VEC_PHI]]
+; CHECK-NEXT:    [[TT18:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TT17]])
+; CHECK-NEXT:    store i32 [[TT18]], i32* [[ARRAYIDX8_US]], align 4
 ; CHECK-NEXT:    [[INC10_US]] = add nuw i32 [[I_025_US]], 1
 ; CHECK-NEXT:    [[EXITCOND27:%.*]] = icmp eq i32 [[INC10_US]], [[N]]
 ; CHECK-NEXT:    br i1 [[EXITCOND27]], label [[FOR_COND_CLEANUP]], label [[FOR_COND1_PREHEADER_US]]
@ -69,51 +69,51 @@ for.cond1.preheader.us.preheader:                 ; preds = %entry
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert28 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat29 = shufflevector <4 x i32> %broadcast.splatinsert28, <4 x i32> undef, <4 x i32> zeroinitializer
-  %tmp = add i32 %n.vec, -4
-  %tmp1 = lshr i32 %tmp, 2
-  %tmp2 = add nuw nsw i32 %tmp1, 1
+  %tt = add i32 %n.vec, -4
+  %tt1 = lshr i32 %tt, 2
+  %tt2 = add nuw nsw i32 %tt1, 1
  br label %for.cond1.preheader.us

 for.cond1.preheader.us:                           ; preds = %middle.block, %for.cond1.preheader.us.preheader
  %i.025.us = phi i32 [ %inc10.us, %middle.block ], [ 0, %for.cond1.preheader.us.preheader ]
  %arrayidx.us = getelementptr inbounds i16*, i16** %A, i32 %i.025.us
-  %tmp3 = load i16*, i16** %arrayidx.us, align 4
+  %tt3 = load i16*, i16** %arrayidx.us, align 4
  %arrayidx8.us = getelementptr inbounds i32, i32* %C, i32 %i.025.us
  %arrayidx8.promoted.us = load i32, i32* %arrayidx8.us, align 4
-  %tmp4 = insertelement <4 x i32> <i32 undef, i32 0, i32 0, i32 0>, i32 %arrayidx8.promoted.us, i32 0
-  call void @llvm.set.loop.iterations.i32(i32 %tmp2)
+  %tt4 = insertelement <4 x i32> <i32 undef, i32 0, i32 0, i32 0>, i32 %arrayidx8.promoted.us, i32 0
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tt2)
  br label %vector.body

 vector.body:                                      ; preds = %vector.body, %for.cond1.preheader.us
  %index = phi i32 [ 0, %for.cond1.preheader.us ], [ %index.next, %vector.body ]
-  %vec.phi = phi <4 x i32> [ %tmp4, %for.cond1.preheader.us ], [ %tmp14, %vector.body ]
-  %tmp5 = phi i32 [ %tmp2, %for.cond1.preheader.us ], [ %tmp15, %vector.body ]
+  %vec.phi = phi <4 x i32> [ %tt4, %for.cond1.preheader.us ], [ %tt14, %vector.body ]
+  %tt5 = phi i32 [ %start, %for.cond1.preheader.us ], [ %tt15, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
-  %tmp6 = getelementptr inbounds i16, i16* %tmp3, i32 %index
+  %tt6 = getelementptr inbounds i16, i16* %tt3, i32 %index

-  ; %tmp7 = icmp ule <4 x i32> %induction, %broadcast.splat29
-  %tmp7 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
+  ; %tt7 = icmp ule <4 x i32> %induction, %broadcast.splat29
+  %tt7 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)

-  %tmp8 = bitcast i16* %tmp6 to <4 x i16>*
-  %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %tmp8, i32 2, <4 x i1> %tmp7, <4 x i16> undef)
-  %tmp9 = sext <4 x i16> %wide.masked.load to <4 x i32>
-  %tmp10 = getelementptr inbounds i16, i16* %B, i32 %index
-  %tmp11 = bitcast i16* %tmp10 to <4 x i16>*
-  %wide.masked.load30 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %tmp11, i32 2, <4 x i1> %tmp7, <4 x i16> undef)
-  %tmp12 = sext <4 x i16> %wide.masked.load30 to <4 x i32>
-  %tmp13 = mul nsw <4 x i32> %tmp12, %tmp9
-  %tmp14 = add nsw <4 x i32> %tmp13, %vec.phi
+  %tt8 = bitcast i16* %tt6 to <4 x i16>*
+  %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %tt8, i32 2, <4 x i1> %tt7, <4 x i16> undef)
+  %tt9 = sext <4 x i16> %wide.masked.load to <4 x i32>
+  %tt10 = getelementptr inbounds i16, i16* %B, i32 %index
+  %tt11 = bitcast i16* %tt10 to <4 x i16>*
+  %wide.masked.load30 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %tt11, i32 2, <4 x i1> %tt7, <4 x i16> undef)
+  %tt12 = sext <4 x i16> %wide.masked.load30 to <4 x i32>
+  %tt13 = mul nsw <4 x i32> %tt12, %tt9
+  %tt14 = add nsw <4 x i32> %tt13, %vec.phi
  %index.next = add i32 %index, 4
-  %tmp15 = call i32 @llvm.loop.decrement.reg.i32(i32 %tmp5, i32 1)
-  %tmp16 = icmp ne i32 %tmp15, 0
-  br i1 %tmp16, label %vector.body, label %middle.block
+  %tt15 = call i32 @llvm.loop.decrement.reg.i32(i32 %tt5, i32 1)
+  %tt16 = icmp ne i32 %tt15, 0
+  br i1 %tt16, label %vector.body, label %middle.block

 middle.block:                                     ; preds = %vector.body
-  %tmp17 = select <4 x i1> %tmp7, <4 x i32> %tmp14, <4 x i32> %vec.phi
-  %tmp18 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %tmp17)
-  store i32 %tmp18, i32* %arrayidx8.us, align 4
+  %tt17 = select <4 x i1> %tt7, <4 x i32> %tt14, <4 x i32> %vec.phi
+  %tt18 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %tt17)
+  store i32 %tt18, i32* %arrayidx8.us, align 4
  %inc10.us = add nuw i32 %i.025.us, 1
  %exitcond27 = icmp eq i32 %inc10.us, %N
  br i1 %exitcond27, label %for.cond.cleanup, label %for.cond1.preheader.us
@ -133,45 +133,45 @@ define void @mat_vec_i32(i32** nocapture readonly %A, i32* nocapture readonly %B
 ; CHECK-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = add i32 [[N]], -1
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT27:%.*]] = insertelement <4 x i32> undef, i32 [[TRIP_COUNT_MINUS_1]], i32 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT28:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT27]], <4 x i32> undef, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP:%.*]] = add i32 [[N_VEC]], -4
-; CHECK-NEXT:    [[TMP1:%.*]] = lshr i32 [[TMP]], 2
-; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw i32 [[TMP1]], 1
+; CHECK-NEXT:    [[TT:%.*]] = add i32 [[N_VEC]], -4
+; CHECK-NEXT:    [[TT1:%.*]] = lshr i32 [[TT]], 2
+; CHECK-NEXT:    [[TT2:%.*]] = add nuw nsw i32 [[TT1]], 1
 ; CHECK-NEXT:    br label [[FOR_COND1_PREHEADER_US:%.*]]
 ; CHECK:       for.cond1.preheader.us:
 ; CHECK-NEXT:    [[I_024_US:%.*]] = phi i32 [ [[INC9_US:%.*]], [[MIDDLE_BLOCK:%.*]] ], [ 0, [[FOR_COND1_PREHEADER_US_PREHEADER]] ]
 ; CHECK-NEXT:    [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32*, i32** [[A:%.*]], i32 [[I_024_US]]
-; CHECK-NEXT:    [[TMP3:%.*]] = load i32*, i32** [[ARRAYIDX_US]], align 4
+; CHECK-NEXT:    [[TT3:%.*]] = load i32*, i32** [[ARRAYIDX_US]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX7_US:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i32 [[I_024_US]]
 ; CHECK-NEXT:    [[ARRAYIDX7_PROMOTED_US:%.*]] = load i32, i32* [[ARRAYIDX7_US]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> <i32 undef, i32 0, i32 0, i32 0>, i32 [[ARRAYIDX7_PROMOTED_US]], i32 0
-; CHECK-NEXT:    call void @llvm.set.loop.iterations.i32(i32 [[TMP2]])
+; CHECK-NEXT:    [[TT4:%.*]] = insertelement <4 x i32> <i32 undef, i32 0, i32 0, i32 0>, i32 [[ARRAYIDX7_PROMOTED_US]], i32 0
+; CHECK-NEXT:    [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[TT2]])
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[FOR_COND1_PREHEADER_US]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ [[TMP4]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP5:%.*]] = phi i32 [ [[TMP2]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ [[TT4]], [[FOR_COND1_PREHEADER_US]] ], [ [[TT12:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TT5:%.*]] = phi i32 [ [[START]], [[FOR_COND1_PREHEADER_US]] ], [ [[TT13:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = phi i32 [ [[N]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP3]], i32 [[INDEX]]
+; CHECK-NEXT:    [[TT6:%.*]] = getelementptr inbounds i32, i32* [[TT3]], i32 [[INDEX]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP0]])
 ; CHECK-NEXT:    [[TMP2]] = sub i32 [[TMP0]], 4
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP8]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef)
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 [[INDEX]]
-; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD29:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP10]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef)
-; CHECK-NEXT:    [[TMP11:%.*]] = mul nsw <4 x i32> [[WIDE_MASKED_LOAD29]], [[WIDE_MASKED_LOAD]]
-; CHECK-NEXT:    [[TMP12]] = add nsw <4 x i32> [[VEC_PHI]], [[TMP11]]
+; CHECK-NEXT:    [[TT8:%.*]] = bitcast i32* [[TT6]] to <4 x i32>*
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TT8]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef)
+; CHECK-NEXT:    [[TT9:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 [[INDEX]]
+; CHECK-NEXT:    [[TT10:%.*]] = bitcast i32* [[TT9]] to <4 x i32>*
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD29:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TT10]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef)
+; CHECK-NEXT:    [[TT11:%.*]] = mul nsw <4 x i32> [[WIDE_MASKED_LOAD29]], [[WIDE_MASKED_LOAD]]
+; CHECK-NEXT:    [[TT12]] = add nsw <4 x i32> [[VEC_PHI]], [[TT11]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP13]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP5]], i32 1)
-; CHECK-NEXT:    [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0
-; CHECK-NEXT:    br i1 [[TMP14]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK]]
+; CHECK-NEXT:    [[TT13]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TT5]], i32 1)
+; CHECK-NEXT:    [[TT14:%.*]] = icmp ne i32 [[TT13]], 0
+; CHECK-NEXT:    br i1 [[TT14]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP15:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[TMP12]], <4 x i32> [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP15]])
-; CHECK-NEXT:    store i32 [[TMP16]], i32* [[ARRAYIDX7_US]], align 4
+; CHECK-NEXT:    [[TT15:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[TT12]], <4 x i32> [[VEC_PHI]]
+; CHECK-NEXT:    [[TT16:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TT15]])
+; CHECK-NEXT:    store i32 [[TT16]], i32* [[ARRAYIDX7_US]], align 4
 ; CHECK-NEXT:    [[INC9_US]] = add nuw i32 [[I_024_US]], 1
 ; CHECK-NEXT:    [[EXITCOND26:%.*]] = icmp eq i32 [[INC9_US]], [[N]]
 ; CHECK-NEXT:    br i1 [[EXITCOND26]], label [[FOR_COND_CLEANUP]], label [[FOR_COND1_PREHEADER_US]]
@ -188,49 +188,49 @@ for.cond1.preheader.us.preheader:                 ; preds = %entry
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert27 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat28 = shufflevector <4 x i32> %broadcast.splatinsert27, <4 x i32> undef, <4 x i32> zeroinitializer
-  %tmp = add i32 %n.vec, -4
-  %tmp1 = lshr i32 %tmp, 2
-  %tmp2 = add nuw nsw i32 %tmp1, 1
+  %tt = add i32 %n.vec, -4
+  %tt1 = lshr i32 %tt, 2
+  %tt2 = add nuw nsw i32 %tt1, 1
  br label %for.cond1.preheader.us

 for.cond1.preheader.us:                           ; preds = %middle.block, %for.cond1.preheader.us.preheader
  %i.024.us = phi i32 [ %inc9.us, %middle.block ], [ 0, %for.cond1.preheader.us.preheader ]
  %arrayidx.us = getelementptr inbounds i32*, i32** %A, i32 %i.024.us
-  %tmp3 = load i32*, i32** %arrayidx.us, align 4
+  %tt3 = load i32*, i32** %arrayidx.us, align 4
  %arrayidx7.us = getelementptr inbounds i32, i32* %C, i32 %i.024.us
  %arrayidx7.promoted.us = load i32, i32* %arrayidx7.us, align 4
-  %tmp4 = insertelement <4 x i32> <i32 undef, i32 0, i32 0, i32 0>, i32 %arrayidx7.promoted.us, i32 0
-  call void @llvm.set.loop.iterations.i32(i32 %tmp2)
+  %tt4 = insertelement <4 x i32> <i32 undef, i32 0, i32 0, i32 0>, i32 %arrayidx7.promoted.us, i32 0
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tt2)
  br label %vector.body

 vector.body:                                      ; preds = %vector.body, %for.cond1.preheader.us
  %index = phi i32 [ 0, %for.cond1.preheader.us ], [ %index.next, %vector.body ]
-  %vec.phi = phi <4 x i32> [ %tmp4, %for.cond1.preheader.us ], [ %tmp12, %vector.body ]
-  %tmp5 = phi i32 [ %tmp2, %for.cond1.preheader.us ], [ %tmp13, %vector.body ]
+  %vec.phi = phi <4 x i32> [ %tt4, %for.cond1.preheader.us ], [ %tt12, %vector.body ]
+  %tt5 = phi i32 [ %start, %for.cond1.preheader.us ], [ %tt13, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
-  %tmp6 = getelementptr inbounds i32, i32* %tmp3, i32 %index
+  %tt6 = getelementptr inbounds i32, i32* %tt3, i32 %index

-  ; %tmp7 = icmp ule <4 x i32> %induction, %broadcast.splat28
-  %tmp7 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
+  ; %tt7 = icmp ule <4 x i32> %induction, %broadcast.splat28
+  %tt7 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)

-  %tmp8 = bitcast i32* %tmp6 to <4 x i32>*
-  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp8, i32 4, <4 x i1> %tmp7, <4 x i32> undef)
-  %tmp9 = getelementptr inbounds i32, i32* %B, i32 %index
-  %tmp10 = bitcast i32* %tmp9 to <4 x i32>*
-  %wide.masked.load29 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp10, i32 4, <4 x i1> %tmp7, <4 x i32> undef)
-  %tmp11 = mul nsw <4 x i32> %wide.masked.load29, %wide.masked.load
-  %tmp12 = add nsw <4 x i32> %vec.phi, %tmp11
+  %tt8 = bitcast i32* %tt6 to <4 x i32>*
+  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tt8, i32 4, <4 x i1> %tt7, <4 x i32> undef)
+  %tt9 = getelementptr inbounds i32, i32* %B, i32 %index
+  %tt10 = bitcast i32* %tt9 to <4 x i32>*
+  %wide.masked.load29 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tt10, i32 4, <4 x i1> %tt7, <4 x i32> undef)
+  %tt11 = mul nsw <4 x i32> %wide.masked.load29, %wide.masked.load
+  %tt12 = add nsw <4 x i32> %vec.phi, %tt11
  %index.next = add i32 %index, 4
-  %tmp13 = call i32 @llvm.loop.decrement.reg.i32(i32 %tmp5, i32 1)
-  %tmp14 = icmp ne i32 %tmp13, 0
-  br i1 %tmp14, label %vector.body, label %middle.block
+  %tt13 = call i32 @llvm.loop.decrement.reg.i32(i32 %tt5, i32 1)
+  %tt14 = icmp ne i32 %tt13, 0
+  br i1 %tt14, label %vector.body, label %middle.block

 middle.block:                                     ; preds = %vector.body
-  %tmp15 = select <4 x i1> %tmp7, <4 x i32> %tmp12, <4 x i32> %vec.phi
-  %tmp16 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %tmp15)
-  store i32 %tmp16, i32* %arrayidx7.us, align 4
+  %tt15 = select <4 x i1> %tt7, <4 x i32> %tt12, <4 x i32> %vec.phi
+  %tt16 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %tt15)
+  store i32 %tt16, i32* %arrayidx7.us, align 4
  %inc9.us = add nuw i32 %i.024.us, 1
  %exitcond26 = icmp eq i32 %inc9.us, %N
  br i1 %exitcond26, label %for.cond.cleanup, label %for.cond1.preheader.us
@ -250,7 +250,7 @@ declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i
 declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) #1

 ; Function Attrs: noduplicate nounwind
-declare void @llvm.set.loop.iterations.i32(i32) #2
+declare i32 @llvm.start.loop.iterations.i32(i32) #2

 ; Function Attrs: noduplicate nounwind
 declare i32 @llvm.loop.decrement.reg.i32(i32, i32) #2
--- a/test/CodeGen/Thumb2/LowOverheadLoops/no-vpsel-liveout.mir
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/no-vpsel-liveout.mir
@ -13,11 +13,11 @@
    br i1 %cmp9, label %for.cond.cleanup, label %vector.ph

  vector.ph:                                        ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %tmp5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp5)
    br label %vector.body

  vector.body:                                      ; preds = %vector.body, %vector.ph
-    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ]
+    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ]
    %lsr.iv18 = phi i16* [ %scevgep19, %vector.body ], [ %b, %vector.ph ]
    %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
    %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %tmp13, %vector.body ]
@ -49,7 +49,7 @@
  }
  declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>) #1
  declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) #2
-  declare void @llvm.set.loop.iterations.i32(i32) #3
+  declare i32 @llvm.start.loop.iterations.i32(i32) #3
  declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3
  declare <4 x i1> @llvm.arm.mve.vctp32(i32) #4

@ -152,7 +152,7 @@ body:             |
    renamable $r12 = t2SUBri killed renamable $r3, 4, 14, $noreg, $noreg
    renamable $r3, dead $cpsr = tMOVi8 1, 14, $noreg
    renamable $r12 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14, $noreg, $noreg
-    t2DoLoopStart renamable $r12
+    $lr = t2DoLoopStart renamable $r12
    $r3 = tMOVr killed $r12, 14, $noreg

  bb.2.vector.body:
--- a/test/CodeGen/Thumb2/LowOverheadLoops/non-masked-load.mir
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/non-masked-load.mir
@ -14,7 +14,7 @@
    br i1 %cmp11, label %for.cond.cleanup, label %vector.ph

  vector.ph:                                        ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %5)
    %6 = shl i32 %4, 3
    %7 = sub i32 %N, %6
    br label %vector.body
@ -23,7 +23,7 @@
    %lsr.iv20 = phi i8* [ %scevgep21, %vector.body ], [ %b, %vector.ph ]
    %lsr.iv = phi i8* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
    %vec.phi = phi <16 x i8> [ zeroinitializer, %vector.ph ], [ %13, %vector.body ]
-    %8 = phi i32 [ %5, %vector.ph ], [ %14, %vector.body ]
+    %8 = phi i32 [ %start, %vector.ph ], [ %14, %vector.body ]
    %9 = phi i32 [ %N, %vector.ph ], [ %11, %vector.body ]
    %lsr.iv2022 = bitcast i8* %lsr.iv20 to <16 x i8>*
    %lsr.iv19 = bitcast i8* %lsr.iv to <16 x i8>*
@ -54,7 +54,7 @@

  declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32 immarg, <16 x i1>, <16 x i8>) #1
  declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>) #2
-  declare void @llvm.set.loop.iterations.i32(i32) #3
+  declare i32 @llvm.start.loop.iterations.i32(i32) #3
  declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3
  declare <16 x i1> @llvm.arm.mve.vctp8(i32) #4

@ -180,7 +180,7 @@ body:             |
    renamable $lr = nuw nsw t2ADDrs killed renamable $r3, renamable $r12, 35, 14, $noreg, $noreg
    renamable $r3 = t2LSRri killed renamable $r12, 4, 14, $noreg, $noreg
    renamable $r3 = t2SUBrs renamable $r2, killed renamable $r3, 34, 14, $noreg, $noreg
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr

  bb.2.vector.body:
    successors: %bb.2(0x7c000000), %bb.3(0x04000000)
--- a/test/CodeGen/Thumb2/LowOverheadLoops/non-masked-store.mir
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/non-masked-store.mir
@ -14,14 +14,14 @@
    br i1 %cmp10, label %for.cond.cleanup, label %vector.ph

  vector.ph:                                        ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %5)
    br label %vector.body

  vector.body:                                      ; preds = %vector.body, %vector.ph
    %lsr.iv19 = phi i8* [ %scevgep20, %vector.body ], [ %res, %vector.ph ]
    %lsr.iv16 = phi i8* [ %scevgep17, %vector.body ], [ %b, %vector.ph ]
    %lsr.iv = phi i8* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
-    %6 = phi i32 [ %5, %vector.ph ], [ %11, %vector.body ]
+    %6 = phi i32 [ %start, %vector.ph ], [ %11, %vector.body ]
    %7 = phi i32 [ %N, %vector.ph ], [ %9, %vector.body ]
    %lsr.iv1921 = bitcast i8* %lsr.iv19 to <16 x i8>*
    %lsr.iv1618 = bitcast i8* %lsr.iv16 to <16 x i8>*
@ -45,7 +45,7 @@

  declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32 immarg, <16 x i1>, <16 x i8>)
  declare void @llvm.masked.store.v16i8.p0v16i8(<16 x i8>, <16 x i8>*, i32 immarg, <16 x i1>)
-  declare void @llvm.set.loop.iterations.i32(i32)
+  declare i32 @llvm.start.loop.iterations.i32(i32)
  declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
  declare <16 x i1> @llvm.arm.mve.vctp8(i32)

@ -155,7 +155,7 @@ body:             |
    renamable $r12 = t2BICri killed renamable $r12, 15, 14, $noreg, $noreg
    renamable $r12 = t2SUBri killed renamable $r12, 16, 14, $noreg, $noreg
    renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 35, 14, $noreg, $noreg
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr

  bb.2.vector.body:
    successors: %bb.2(0x7c000000), %bb.3(0x04000000)
--- a/test/CodeGen/Thumb2/LowOverheadLoops/predicated-invariant.mir
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/predicated-invariant.mir
@ -14,11 +14,11 @@
    br i1 %cmp9, label %exit, label %vector.ph

  vector.ph:                                        ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %tmp5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp5)
    br label %vector.body

  vector.body:                                      ; preds = %vector.body, %vector.ph
-    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ]
+    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ]
    %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
    %tmp7 = phi i32 [ %N, %vector.ph ], [ %tmp9, %vector.body ]
    %lsr.iv17 = bitcast i16* %lsr.iv to <4 x i16>*
@ -39,7 +39,7 @@
  }

  declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>)
-  declare void @llvm.set.loop.iterations.i32(i32)
+  declare i32 @llvm.start.loop.iterations.i32(i32)
  declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
  declare <4 x i1> @llvm.arm.mve.vctp32(i32)
  declare <4 x i32> @llvm.arm.mve.add.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>)
@ -123,7 +123,7 @@ body:             |
    renamable $r3 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r1, 19, 14 /* CC::al */, $noreg, $noreg
    renamable $r1 = tADDrSPi $sp, 2, 14 /* CC::al */, $noreg
    renamable $q0 = MVE_VLDRWU32 killed renamable $r1, 0, 0, $noreg :: (load 16 from %fixed-stack.0, align 8)
-    t2DoLoopStart renamable $r3
+    $lr = t2DoLoopStart renamable $r3
    $r1 = tMOVr killed $r3, 14 /* CC::al */, $noreg

  bb.2.vector.body:
--- a/test/CodeGen/Thumb2/LowOverheadLoops/reductions-vpt-liveout.mir
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/reductions-vpt-liveout.mir
@ -14,14 +14,14 @@
    br i1 %cmp9.not, label %for.cond.cleanup, label %vector.ph

  vector.ph:                                        ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %5)
    br label %vector.body

  vector.body:                                      ; preds = %vector.body, %vector.ph
    %lsr.iv14 = phi i8* [ %scevgep15, %vector.body ], [ %b, %vector.ph ]
    %lsr.iv = phi i8* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
    %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %14, %vector.body ]
-    %6 = phi i32 [ %5, %vector.ph ], [ %15, %vector.body ]
+    %6 = phi i32 [ %start, %vector.ph ], [ %15, %vector.body ]
    %7 = phi i32 [ %N, %vector.ph ], [ %9, %vector.body ]
    %lsr.iv13 = bitcast i8* %lsr.iv to <4 x i8>*
    %lsr.iv1416 = bitcast i8* %lsr.iv14 to <4 x i8>*
@ -61,14 +61,14 @@
    br i1 %cmp10.not, label %for.cond.cleanup, label %vector.ph

  vector.ph:                                        ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %5)
    br label %vector.body

  vector.body:                                      ; preds = %vector.body, %vector.ph
    %lsr.iv15 = phi i8* [ %scevgep16, %vector.body ], [ %b, %vector.ph ]
    %lsr.iv = phi i8* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
    %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %14, %vector.body ]
-    %6 = phi i32 [ %5, %vector.ph ], [ %15, %vector.body ]
+    %6 = phi i32 [ %start, %vector.ph ], [ %15, %vector.body ]
    %7 = phi i32 [ %N, %vector.ph ], [ %9, %vector.body ]
    %lsr.iv14 = bitcast i8* %lsr.iv to <4 x i8>*
    %lsr.iv1517 = bitcast i8* %lsr.iv15 to <4 x i8>*
@ -108,14 +108,14 @@
    br i1 %cmp9.not, label %for.cond.cleanup, label %vector.ph

  vector.ph:                                        ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %5)
    br label %vector.body

  vector.body:                                      ; preds = %vector.body, %vector.ph
    %lsr.iv14 = phi i16* [ %scevgep15, %vector.body ], [ %b, %vector.ph ]
    %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
    %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %14, %vector.body ]
-    %6 = phi i32 [ %5, %vector.ph ], [ %15, %vector.body ]
+    %6 = phi i32 [ %start, %vector.ph ], [ %15, %vector.body ]
    %7 = phi i32 [ %N, %vector.ph ], [ %9, %vector.body ]
    %lsr.iv13 = bitcast i16* %lsr.iv to <4 x i16>*
    %lsr.iv1416 = bitcast i16* %lsr.iv14 to <4 x i16>*
@ -155,14 +155,14 @@
    br i1 %cmp10.not, label %for.cond.cleanup, label %vector.ph

  vector.ph:                                        ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %5)
    br label %vector.body

  vector.body:                                      ; preds = %vector.body, %vector.ph
    %lsr.iv15 = phi i16* [ %scevgep16, %vector.body ], [ %b, %vector.ph ]
    %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
    %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %14, %vector.body ]
-    %6 = phi i32 [ %5, %vector.ph ], [ %15, %vector.body ]
+    %6 = phi i32 [ %start, %vector.ph ], [ %15, %vector.body ]
    %7 = phi i32 [ %N, %vector.ph ], [ %9, %vector.body ]
    %lsr.iv14 = bitcast i16* %lsr.iv to <4 x i16>*
    %lsr.iv1517 = bitcast i16* %lsr.iv15 to <4 x i16>*
@ -203,14 +203,14 @@
    br i1 %cmp8.not, label %for.cond.cleanup, label %vector.ph

  vector.ph:                                        ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %5)
    br label %vector.body

  vector.body:                                      ; preds = %vector.body, %vector.ph
    %lsr.iv13 = phi i32* [ %scevgep14, %vector.body ], [ %b, %vector.ph ]
    %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
    %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %12, %vector.body ]
-    %6 = phi i32 [ %5, %vector.ph ], [ %13, %vector.body ]
+    %6 = phi i32 [ %start, %vector.ph ], [ %13, %vector.body ]
    %7 = phi i32 [ %N, %vector.ph ], [ %9, %vector.body ]
    %lsr.iv12 = bitcast i32* %lsr.iv to <4 x i32>*
    %lsr.iv1315 = bitcast i32* %lsr.iv13 to <4 x i32>*
@ -249,14 +249,14 @@
    br i1 %cmp9.not, label %for.cond.cleanup, label %vector.ph

  vector.ph:                                        ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %5)
    br label %vector.body

  vector.body:                                      ; preds = %vector.body, %vector.ph
    %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %b, %vector.ph ]
    %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
    %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %12, %vector.body ]
-    %6 = phi i32 [ %5, %vector.ph ], [ %13, %vector.body ]
+    %6 = phi i32 [ %start, %vector.ph ], [ %13, %vector.body ]
    %7 = phi i32 [ %N, %vector.ph ], [ %9, %vector.body ]
    %lsr.iv13 = bitcast i32* %lsr.iv to <4 x i32>*
    %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
@ -286,7 +286,7 @@
  declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>)
  declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
  declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
-  declare void @llvm.set.loop.iterations.i32(i32)
+  declare i32 @llvm.start.loop.iterations.i32(i32)
  declare i32 @llvm.loop.decrement.reg.i32(i32, i32)
  declare <4 x i1> @llvm.arm.mve.vctp32(i32)

@ -372,7 +372,7 @@ body:             |
    renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg
    renamable $lr = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg
    renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr

  bb.2.vector.body (align 4):
    successors: %bb.2(0x7c000000), %bb.3(0x04000000)
@ -478,7 +478,7 @@ body:             |
    renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg
    renamable $lr = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg
    renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr

  bb.2.vector.body (align 4):
    successors: %bb.2(0x7c000000), %bb.3(0x04000000)
@ -585,7 +585,7 @@ body:             |
    renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg
    renamable $lr = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg
    renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr

  bb.2.vector.body (align 4):
    successors: %bb.2(0x7c000000), %bb.3(0x04000000)
@ -691,7 +691,7 @@ body:             |
    renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg
    renamable $lr = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg
    renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr

  bb.2.vector.body (align 4):
    successors: %bb.2(0x7c000000), %bb.3(0x04000000)
@ -797,7 +797,7 @@ body:             |
    renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg
    renamable $lr = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg
    renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr

  bb.2.vector.body (align 4):
    successors: %bb.2(0x7c000000), %bb.3(0x04000000)
@ -903,7 +903,7 @@ body:             |
    renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg
    renamable $lr = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg
    renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr

  bb.2.vector.body (align 4):
    successors: %bb.2(0x7c000000), %bb.3(0x04000000)
--- a/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll
@ -69,26 +69,26 @@ define dso_local arm_aapcs_vfpcc signext i16 @one_loop_add_add_v8i16(i8* nocaptu
 ; CHECK-NEXT:  .LBB1_1: @ %vector.ph
 ; CHECK-NEXT:    push {r7, lr}
 ; CHECK-NEXT:    adds r3, r2, #7
-; CHECK-NEXT:    vmov.i32 q1, #0x0
+; CHECK-NEXT:    vmov.i32 q0, #0x0
 ; CHECK-NEXT:    bic r3, r3, #7
 ; CHECK-NEXT:    sub.w r12, r3, #8
 ; CHECK-NEXT:    movs r3, #1
-; CHECK-NEXT:    add.w lr, r3, r12, lsr #3
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    add.w r3, r3, r12, lsr #3
+; CHECK-NEXT:    dls lr, r3
 ; CHECK-NEXT:  .LBB1_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vctp.16 r2
-; CHECK-NEXT:    vmov q0, q1
+; CHECK-NEXT:    vmov q1, q0
 ; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vldrbt.u16 q1, [r0], #8
+; CHECK-NEXT:    vldrbt.u16 q0, [r0], #8
 ; CHECK-NEXT:    subs r2, #8
-; CHECK-NEXT:    vadd.i16 q1, q0, q1
+; CHECK-NEXT:    vadd.i16 q0, q1, q0
 ; CHECK-NEXT:    vpst
 ; CHECK-NEXT:    vldrbt.u16 q2, [r1], #8
-; CHECK-NEXT:    vadd.i16 q1, q1, q2
+; CHECK-NEXT:    vadd.i16 q0, q0, q2
 ; CHECK-NEXT:    le lr, .LBB1_2
 ; CHECK-NEXT:  @ %bb.3: @ %middle.block
-; CHECK-NEXT:    vpsel q0, q1, q0
+; CHECK-NEXT:    vpsel q0, q0, q1
 ; CHECK-NEXT:    vaddv.u16 r0, q0
 ; CHECK-NEXT:    pop.w {r7, lr}
 ; CHECK-NEXT:    sxth r0, r0
@ -142,25 +142,25 @@ define dso_local arm_aapcs_vfpcc zeroext i8 @one_loop_sub_add_v16i8(i8* nocaptur
 ; CHECK-NEXT:  .LBB2_1: @ %vector.ph
 ; CHECK-NEXT:    push {r7, lr}
 ; CHECK-NEXT:    add.w r3, r2, #15
-; CHECK-NEXT:    vmov.i32 q1, #0x0
+; CHECK-NEXT:    vmov.i32 q0, #0x0
 ; CHECK-NEXT:    bic r3, r3, #15
 ; CHECK-NEXT:    sub.w r12, r3, #16
 ; CHECK-NEXT:    movs r3, #1
-; CHECK-NEXT:    add.w lr, r3, r12, lsr #4
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    add.w r3, r3, r12, lsr #4
+; CHECK-NEXT:    dls lr, r3
 ; CHECK-NEXT:  .LBB2_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vctp.8 r2
-; CHECK-NEXT:    vmov q0, q1
+; CHECK-NEXT:    vmov q1, q0
 ; CHECK-NEXT:    vpstt
-; CHECK-NEXT:    vldrbt.u8 q1, [r1], #16
+; CHECK-NEXT:    vldrbt.u8 q0, [r1], #16
 ; CHECK-NEXT:    vldrbt.u8 q2, [r0], #16
 ; CHECK-NEXT:    subs r2, #16
-; CHECK-NEXT:    vsub.i8 q1, q2, q1
-; CHECK-NEXT:    vadd.i8 q1, q1, q0
+; CHECK-NEXT:    vsub.i8 q0, q2, q0
+; CHECK-NEXT:    vadd.i8 q0, q0, q1
 ; CHECK-NEXT:    le lr, .LBB2_2
 ; CHECK-NEXT:  @ %bb.3: @ %middle.block
-; CHECK-NEXT:    vpsel q0, q1, q0
+; CHECK-NEXT:    vpsel q0, q0, q1
 ; CHECK-NEXT:    vaddv.u8 r0, q0
 ; CHECK-NEXT:    pop.w {r7, lr}
 ; CHECK-NEXT:    uxtb r0, r0
@ -212,25 +212,25 @@ define dso_local arm_aapcs_vfpcc signext i16 @one_loop_sub_add_v8i16(i8* nocaptu
 ; CHECK-NEXT:  .LBB3_1: @ %vector.ph
 ; CHECK-NEXT:    push {r7, lr}
 ; CHECK-NEXT:    adds r3, r2, #7
-; CHECK-NEXT:    vmov.i32 q1, #0x0
+; CHECK-NEXT:    vmov.i32 q0, #0x0
 ; CHECK-NEXT:    bic r3, r3, #7
 ; CHECK-NEXT:    sub.w r12, r3, #8
 ; CHECK-NEXT:    movs r3, #1
-; CHECK-NEXT:    add.w lr, r3, r12, lsr #3
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    add.w r3, r3, r12, lsr #3
+; CHECK-NEXT:    dls lr, r3
 ; CHECK-NEXT:  .LBB3_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vctp.16 r2
-; CHECK-NEXT:    vmov q0, q1
+; CHECK-NEXT:    vmov q1, q0
 ; CHECK-NEXT:    vpstt
-; CHECK-NEXT:    vldrbt.u16 q1, [r0], #8
+; CHECK-NEXT:    vldrbt.u16 q0, [r0], #8
 ; CHECK-NEXT:    vldrbt.u16 q2, [r1], #8
 ; CHECK-NEXT:    subs r2, #8
-; CHECK-NEXT:    vsub.i16 q1, q2, q1
-; CHECK-NEXT:    vadd.i16 q1, q1, q0
+; CHECK-NEXT:    vsub.i16 q0, q2, q0
+; CHECK-NEXT:    vadd.i16 q0, q0, q1
 ; CHECK-NEXT:    le lr, .LBB3_2
 ; CHECK-NEXT:  @ %bb.3: @ %middle.block
-; CHECK-NEXT:    vpsel q0, q1, q0
+; CHECK-NEXT:    vpsel q0, q0, q1
 ; CHECK-NEXT:    vaddv.u16 r0, q0
 ; CHECK-NEXT:    pop.w {r7, lr}
 ; CHECK-NEXT:    sxth r0, r0
@ -284,25 +284,25 @@ define dso_local arm_aapcs_vfpcc zeroext i8 @one_loop_mul_add_v16i8(i8* nocaptur
 ; CHECK-NEXT:  .LBB4_1: @ %vector.ph
 ; CHECK-NEXT:    push {r7, lr}
 ; CHECK-NEXT:    add.w r3, r2, #15
-; CHECK-NEXT:    vmov.i32 q1, #0x0
+; CHECK-NEXT:    vmov.i32 q0, #0x0
 ; CHECK-NEXT:    bic r3, r3, #15
 ; CHECK-NEXT:    sub.w r12, r3, #16
 ; CHECK-NEXT:    movs r3, #1
-; CHECK-NEXT:    add.w lr, r3, r12, lsr #4
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    add.w r3, r3, r12, lsr #4
+; CHECK-NEXT:    dls lr, r3
 ; CHECK-NEXT:  .LBB4_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vctp.8 r2
-; CHECK-NEXT:    vmov q0, q1
+; CHECK-NEXT:    vmov q1, q0
 ; CHECK-NEXT:    vpstt
-; CHECK-NEXT:    vldrbt.u8 q1, [r0], #16
+; CHECK-NEXT:    vldrbt.u8 q0, [r0], #16
 ; CHECK-NEXT:    vldrbt.u8 q2, [r1], #16
 ; CHECK-NEXT:    subs r2, #16
-; CHECK-NEXT:    vmul.i8 q1, q2, q1
-; CHECK-NEXT:    vadd.i8 q1, q1, q0
+; CHECK-NEXT:    vmul.i8 q0, q2, q0
+; CHECK-NEXT:    vadd.i8 q0, q0, q1
 ; CHECK-NEXT:    le lr, .LBB4_2
 ; CHECK-NEXT:  @ %bb.3: @ %middle.block
-; CHECK-NEXT:    vpsel q0, q1, q0
+; CHECK-NEXT:    vpsel q0, q0, q1
 ; CHECK-NEXT:    vaddv.u8 r0, q0
 ; CHECK-NEXT:    pop.w {r7, lr}
 ; CHECK-NEXT:    uxtb r0, r0
@ -354,25 +354,25 @@ define dso_local arm_aapcs_vfpcc signext i16 @one_loop_mul_add_v8i16(i8* nocaptu
 ; CHECK-NEXT:  .LBB5_1: @ %vector.ph
 ; CHECK-NEXT:    push {r7, lr}
 ; CHECK-NEXT:    adds r3, r2, #7
-; CHECK-NEXT:    vmov.i32 q1, #0x0
+; CHECK-NEXT:    vmov.i32 q0, #0x0
 ; CHECK-NEXT:    bic r3, r3, #7
 ; CHECK-NEXT:    sub.w r12, r3, #8
 ; CHECK-NEXT:    movs r3, #1
-; CHECK-NEXT:    add.w lr, r3, r12, lsr #3
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    add.w r3, r3, r12, lsr #3
+; CHECK-NEXT:    dls lr, r3
 ; CHECK-NEXT:  .LBB5_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vctp.16 r2
-; CHECK-NEXT:    vmov q0, q1
+; CHECK-NEXT:    vmov q1, q0
 ; CHECK-NEXT:    vpstt
-; CHECK-NEXT:    vldrbt.u16 q1, [r0], #8
+; CHECK-NEXT:    vldrbt.u16 q0, [r0], #8
 ; CHECK-NEXT:    vldrbt.u16 q2, [r1], #8
 ; CHECK-NEXT:    subs r2, #8
-; CHECK-NEXT:    vmul.i16 q1, q2, q1
-; CHECK-NEXT:    vadd.i16 q1, q1, q0
+; CHECK-NEXT:    vmul.i16 q0, q2, q0
+; CHECK-NEXT:    vadd.i16 q0, q0, q1
 ; CHECK-NEXT:    le lr, .LBB5_2
 ; CHECK-NEXT:  @ %bb.3: @ %middle.block
-; CHECK-NEXT:    vpsel q0, q1, q0
+; CHECK-NEXT:    vpsel q0, q0, q1
 ; CHECK-NEXT:    vaddv.u16 r0, q0
 ; CHECK-NEXT:    pop.w {r7, lr}
 ; CHECK-NEXT:    sxth r0, r0
@ -423,36 +423,36 @@ define dso_local arm_aapcs_vfpcc i32 @two_loops_mul_add_v4i32(i8* nocapture read
 ; CHECK-NEXT:    beq .LBB6_8
 ; CHECK-NEXT:  @ %bb.1: @ %vector.ph
 ; CHECK-NEXT:    adds r3, r2, #3
-; CHECK-NEXT:    vmov.i32 q1, #0x0
+; CHECK-NEXT:    vmov.i32 q0, #0x0
 ; CHECK-NEXT:    bic r3, r3, #3
 ; CHECK-NEXT:    mov r4, r0
 ; CHECK-NEXT:    subs r6, r3, #4
 ; CHECK-NEXT:    movs r3, #1
 ; CHECK-NEXT:    mov r5, r1
-; CHECK-NEXT:    add.w lr, r3, r6, lsr #2
+; CHECK-NEXT:    add.w r3, r3, r6, lsr #2
+; CHECK-NEXT:    dls lr, r3
 ; CHECK-NEXT:    mov r3, r2
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB6_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vctp.32 r3
-; CHECK-NEXT:    vmov q0, q1
+; CHECK-NEXT:    vmov q1, q0
 ; CHECK-NEXT:    vpstt
-; CHECK-NEXT:    vldrbt.u32 q1, [r4], #4
+; CHECK-NEXT:    vldrbt.u32 q0, [r4], #4
 ; CHECK-NEXT:    vldrbt.u32 q2, [r5], #4
 ; CHECK-NEXT:    subs r3, #4
-; CHECK-NEXT:    vmul.i32 q1, q2, q1
-; CHECK-NEXT:    vadd.i32 q1, q1, q0
+; CHECK-NEXT:    vmul.i32 q0, q2, q0
+; CHECK-NEXT:    vadd.i32 q0, q0, q1
 ; CHECK-NEXT:    le lr, .LBB6_2
 ; CHECK-NEXT:  @ %bb.3: @ %middle.block
-; CHECK-NEXT:    vpsel q0, q1, q0
+; CHECK-NEXT:    vpsel q0, q0, q1
 ; CHECK-NEXT:    vaddv.u32 r12, q0
 ; CHECK-NEXT:    cbz r2, .LBB6_7
 ; CHECK-NEXT:  @ %bb.4: @ %vector.ph47
 ; CHECK-NEXT:    movs r3, #1
-; CHECK-NEXT:    add.w lr, r3, r6, lsr #2
-; CHECK-NEXT:    movs r3, #0
-; CHECK-NEXT:    dls lr, lr
-; CHECK-NEXT:    vdup.32 q0, r3
+; CHECK-NEXT:    add.w r3, r3, r6, lsr #2
+; CHECK-NEXT:    movs r6, #0
+; CHECK-NEXT:    vdup.32 q0, r6
+; CHECK-NEXT:    dls lr, r3
 ; CHECK-NEXT:    vmov.32 q0[0], r12
 ; CHECK-NEXT:  .LBB6_5: @ %vector.body46
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
@ -550,32 +550,32 @@ define dso_local arm_aapcs_vfpcc void @two_reductions_mul_add_v8i16(i8* nocaptur
 ; CHECK-NEXT:    cbz r2, .LBB7_4
 ; CHECK-NEXT:  @ %bb.1: @ %vector.ph
 ; CHECK-NEXT:    adds r3, r2, #7
-; CHECK-NEXT:    vmov.i32 q1, #0x0
-; CHECK-NEXT:    bic r3, r3, #7
 ; CHECK-NEXT:    movs r4, #1
+; CHECK-NEXT:    bic r3, r3, #7
+; CHECK-NEXT:    vmov.i32 q0, #0x0
 ; CHECK-NEXT:    subs r3, #8
-; CHECK-NEXT:    vmov q3, q1
-; CHECK-NEXT:    add.w lr, r4, r3, lsr #3
-; CHECK-NEXT:    mov r3, r0
+; CHECK-NEXT:    vmov q3, q0
+; CHECK-NEXT:    add.w r3, r4, r3, lsr #3
 ; CHECK-NEXT:    mov r4, r1
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    dls lr, r3
+; CHECK-NEXT:    mov r3, r0
 ; CHECK-NEXT:  .LBB7_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vctp.16 r2
-; CHECK-NEXT:    vmov q0, q1
+; CHECK-NEXT:    vmov q1, q0
 ; CHECK-NEXT:    vpstt
-; CHECK-NEXT:    vldrbt.u16 q1, [r3], #8
+; CHECK-NEXT:    vldrbt.u16 q0, [r3], #8
 ; CHECK-NEXT:    vldrbt.u16 q4, [r4], #8
 ; CHECK-NEXT:    vmov q2, q3
-; CHECK-NEXT:    vsub.i16 q3, q4, q1
-; CHECK-NEXT:    vmul.i16 q1, q4, q1
+; CHECK-NEXT:    vsub.i16 q3, q4, q0
+; CHECK-NEXT:    vmul.i16 q0, q4, q0
 ; CHECK-NEXT:    subs r2, #8
 ; CHECK-NEXT:    vadd.i16 q3, q3, q2
-; CHECK-NEXT:    vadd.i16 q1, q1, q0
+; CHECK-NEXT:    vadd.i16 q0, q0, q1
 ; CHECK-NEXT:    le lr, .LBB7_2
 ; CHECK-NEXT:  @ %bb.3: @ %middle.block
 ; CHECK-NEXT:    vpsel q2, q3, q2
-; CHECK-NEXT:    vpsel q0, q1, q0
+; CHECK-NEXT:    vpsel q0, q0, q1
 ; CHECK-NEXT:    vaddv.u16 r4, q2
 ; CHECK-NEXT:    vaddv.u16 r2, q0
 ; CHECK-NEXT:    b .LBB7_5
@ -643,40 +643,40 @@ define i32 @wrongop(%struct.date* nocapture readonly %pd) {
 ; CHECK-NEXT:    push {r4, lr}
 ; CHECK-NEXT:    mov r1, r0
 ; CHECK-NEXT:    movw r12, #47184
-; CHECK-NEXT:    movw r3, #23593
 ; CHECK-NEXT:    ldrd r2, lr, [r1, #4]
+; CHECK-NEXT:    movw r1, #23593
 ; CHECK-NEXT:    movt r12, #1310
-; CHECK-NEXT:    movt r3, #49807
-; CHECK-NEXT:    mla r3, lr, r3, r12
-; CHECK-NEXT:    movw r1, #55051
+; CHECK-NEXT:    movt r1, #49807
+; CHECK-NEXT:    mla r1, lr, r1, r12
+; CHECK-NEXT:    movw r3, #55051
 ; CHECK-NEXT:    movw r4, #23593
-; CHECK-NEXT:    movt r1, #163
+; CHECK-NEXT:    movt r3, #163
 ; CHECK-NEXT:    ldr r0, [r0]
 ; CHECK-NEXT:    movt r4, #655
-; CHECK-NEXT:    ror.w r12, r3, #4
-; CHECK-NEXT:    cmp r12, r1
-; CHECK-NEXT:    cset r1, lo
-; CHECK-NEXT:    ror.w r3, r3, #2
+; CHECK-NEXT:    ror.w r12, r1, #4
+; CHECK-NEXT:    cmp r12, r3
+; CHECK-NEXT:    cset r3, lo
+; CHECK-NEXT:    ror.w r1, r1, #2
 ; CHECK-NEXT:    mov.w r12, #1
-; CHECK-NEXT:    cmp r3, r4
-; CHECK-NEXT:    csel r3, r1, r12, lo
+; CHECK-NEXT:    cmp r1, r4
+; CHECK-NEXT:    csel r1, r3, r12, lo
 ; CHECK-NEXT:    lsls.w r4, lr, #30
-; CHECK-NEXT:    csel r1, r1, r3, ne
+; CHECK-NEXT:    csel r3, r3, r1, ne
 ; CHECK-NEXT:    cmp r2, #1
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    poplt {r4, pc}
 ; CHECK-NEXT:  .LBB8_1: @ %vector.ph
-; CHECK-NEXT:    adds r3, r2, #3
-; CHECK-NEXT:    movs r4, #52
-; CHECK-NEXT:    bic r3, r3, #3
-; CHECK-NEXT:    subs r3, #4
-; CHECK-NEXT:    add.w lr, r12, r3, lsr #2
-; CHECK-NEXT:    movw r3, :lower16:days
-; CHECK-NEXT:    movt r3, :upper16:days
-; CHECK-NEXT:    dls lr, lr
-; CHECK-NEXT:    mla r1, r1, r4, r3
+; CHECK-NEXT:    adds r1, r2, #3
+; CHECK-NEXT:    bic r1, r1, #3
+; CHECK-NEXT:    subs r1, #4
+; CHECK-NEXT:    add.w r4, r12, r1, lsr #2
+; CHECK-NEXT:    movw r12, :lower16:days
+; CHECK-NEXT:    movt r12, :upper16:days
+; CHECK-NEXT:    movs r1, #52
+; CHECK-NEXT:    mla r1, r3, r1, r12
 ; CHECK-NEXT:    movs r3, #0
 ; CHECK-NEXT:    vdup.32 q0, r3
+; CHECK-NEXT:    dls lr, r4
 ; CHECK-NEXT:    vmov.32 q0[0], r0
 ; CHECK-NEXT:  .LBB8_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
--- a/test/CodeGen/Thumb2/LowOverheadLoops/remat-vctp.ll
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/remat-vctp.ll
@ -105,8 +105,8 @@ define void @dont_remat_predicated_vctp(i32* %arg, i32* %arg1, i32* %arg2, i32*
 ; CHECK-NEXT:    vmov.i32 q2, #0x1
 ; CHECK-NEXT:    add.w lr, r5, #3
 ; CHECK-NEXT:    movs r5, #1
-; CHECK-NEXT:    add.w lr, r5, lr, lsr #2
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    add.w r5, r5, lr, lsr #2
+; CHECK-NEXT:    dls lr, r5
 ; CHECK-NEXT:  .LBB1_1: @ %bb6
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vctp.32 r12
--- a/test/CodeGen/Thumb2/LowOverheadLoops/remove-elem-moves.mir
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/remove-elem-moves.mir
@ -31,13 +31,13 @@
    %ind.end17 = getelementptr float, float* %pDst, i32 %n.vec
    %scevgep9 = getelementptr float, float* %pDst, i32 -4
    %scevgep14 = getelementptr float, float* %pSrc, i32 -4
-    call void @llvm.set.loop.iterations.i32(i32 %4)
+    %start1 = call i32 @llvm.start.loop.iterations.i32(i32 %4)
    br label %vector.body

  vector.body:                                      ; preds = %vector.body, %vector.ph
    %lsr.iv15 = phi float* [ %scevgep16, %vector.body ], [ %scevgep14, %vector.ph ]
    %lsr.iv10 = phi float* [ %scevgep11, %vector.body ], [ %scevgep9, %vector.ph ]
-    %5 = phi i32 [ %4, %vector.ph ], [ %7, %vector.body ]
+    %5 = phi i32 [ %start1, %vector.ph ], [ %7, %vector.body ]
    %lsr.iv1517 = bitcast float* %lsr.iv15 to <4 x float>*
    %lsr.iv1012 = bitcast float* %lsr.iv10 to <4 x float>*
    %scevgep18 = getelementptr <4 x float>, <4 x float>* %lsr.iv1517, i32 1
@ -61,13 +61,13 @@
    %pDst.addr.06.ph = phi float* [ %pDst, %vector.memcheck ], [ %pDst, %while.body.preheader ], [ %ind.end17, %middle.block ]
    %scevgep1 = getelementptr float, float* %pSrc.addr.07.ph, i32 -1
    %scevgep4 = getelementptr float, float* %pDst.addr.06.ph, i32 -1
-    call void @llvm.set.loop.iterations.i32(i32 %blkCnt.08.ph)
+    %start2 = call i32 @llvm.start.loop.iterations.i32(i32 %blkCnt.08.ph)
    br label %while.body

  while.body:                                       ; preds = %while.body, %while.body.preheader19
    %lsr.iv5 = phi float* [ %scevgep6, %while.body ], [ %scevgep4, %while.body.preheader19 ]
    %lsr.iv = phi float* [ %scevgep2, %while.body ], [ %scevgep1, %while.body.preheader19 ]
-    %9 = phi i32 [ %blkCnt.08.ph, %while.body.preheader19 ], [ %12, %while.body ]
+    %9 = phi i32 [ %start2, %while.body.preheader19 ], [ %12, %while.body ]
    %scevgep3 = getelementptr float, float* %lsr.iv, i32 1
    %scevgep7 = getelementptr float, float* %lsr.iv5, i32 1
    %10 = load float, float* %scevgep3, align 4
@ -84,7 +84,7 @@
  }
  declare float @llvm.fabs.f32(float)
  declare <4 x float> @llvm.fabs.v4f32(<4 x float>)
-  declare void @llvm.set.loop.iterations.i32(i32)
+  declare i32 @llvm.start.loop.iterations.i32(i32)
  declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)

 ...
@ -262,7 +262,7 @@ body:             |
    renamable $r7, dead $cpsr = tSUBrr renamable $r2, renamable $r4, 14, $noreg
    renamable $r3 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14, $noreg, $noreg
    renamable $r12 = t2ADDrs renamable $r0, renamable $r4, 18, 14, $noreg, $noreg
-    t2DoLoopStart renamable $r3
+    $lr = t2DoLoopStart renamable $r3
    renamable $r0, dead $cpsr = tSUBi8 killed renamable $r0, 16, 14, $noreg
    $r5 = tMOVr killed $r3, 14, $noreg
    renamable $r3 = t2ADDrs renamable $r1, renamable $r4, 18, 14, $noreg, $noreg
@ -305,7 +305,7 @@ body:             |

    renamable $r0, dead $cpsr = tSUBi3 killed renamable $r3, 4, 14, $noreg
    renamable $r1 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr

  bb.8.while.body:
    successors: %bb.8(0x7c000000), %bb.9(0x04000000)
--- a/test/CodeGen/Thumb2/LowOverheadLoops/revert-after-call.mir
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/revert-after-call.mir
@ -14,12 +14,12 @@
    br i1 %cmp6, label %while.end, label %while.body.preheader
  
  while.body.preheader:                             ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %n)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %n)
    br label %while.body
  
  while.body:                                       ; preds = %while.body, %while.body.preheader
    %res.07 = phi i32 [ %add, %while.body ], [ 0, %while.body.preheader ]
-    %0 = phi i32 [ %n, %while.body.preheader ], [ %1, %while.body ]
+    %0 = phi i32 [ %start, %while.body.preheader ], [ %1, %while.body ]
    %call = tail call i32 bitcast (i32 (...)* @bar to i32 ()*)()
    %add = add nsw i32 %call, %res.07
    %1 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %0, i32 1)
@ -33,7 +33,7 @@
  
  declare i32 @bar(...) local_unnamed_addr #0
  
-  declare void @llvm.set.loop.iterations.i32(i32) #1
+  declare i32 @llvm.start.loop.iterations.i32(i32) #1
  declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #1
  
  attributes #0 = { "target-features"="+mve.fp" }
@ -109,7 +109,7 @@ body:             |
  
    $lr = tMOVr $r0, 14, $noreg
    renamable $r4, dead $cpsr = tMOVi8 0, 14, $noreg
-    t2DoLoopStart killed $r0
+    $lr = t2DoLoopStart killed $r0
  
  bb.2.while.body:
    successors: %bb.2(0x7c000000), %bb.3(0x04000000)
--- a/test/CodeGen/Thumb2/LowOverheadLoops/revert-after-read.mir
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/revert-after-read.mir
@ -14,11 +14,11 @@
    br i1 %cmp6, label %while.end, label %while.body.preheader
  
  while.body.preheader:                             ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %n)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %n)
    br label %while.body
  
  while.body:                                       ; preds = %while.body, %while.body.preheader
-    %0 = phi i32 [ %n, %while.body.preheader ], [ %1, %while.body ]
+    %0 = phi i32 [ %start, %while.body.preheader ], [ %1, %while.body ]
    %1 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %0, i32 1)
    %add = add i32 %1, 0
    %2 = icmp ne i32 %1, 0
@ -29,7 +29,7 @@
    ret i32 %res.0.lcssa
  }
  
-  declare void @llvm.set.loop.iterations.i32(i32) #1
+  declare i32 @llvm.start.loop.iterations.i32(i32) #1
  declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #1
  
  attributes #0 = { "target-features"="+mve.fp" }
@ -96,7 +96,7 @@ body:             |
    liveins: $r0
  
    $lr = tMOVr $r0, 14, $noreg
-    t2DoLoopStart killed $r0
+    $lr = t2DoLoopStart killed $r0
  
  bb.2.while.body:
    successors: %bb.2(0x7c000000), %bb.3(0x04000000)
--- a/test/CodeGen/Thumb2/LowOverheadLoops/revert-after-write.mir
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/revert-after-write.mir
@ -14,11 +14,11 @@
    br i1 %cmp6, label %while.end, label %while.body.preheader
  
  while.body.preheader:                             ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %n)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %n)
    br label %while.body
  
  while.body:                                       ; preds = %while.body, %while.body.preheader
-    %0 = phi i32 [ %n, %while.body.preheader ], [ %1, %while.body ]
+    %0 = phi i32 [ %start, %while.body.preheader ], [ %1, %while.body ]
    %1 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %0, i32 1)
    %add = add i32 %1, 2
    %2 = icmp ne i32 %1, 0
@ -30,7 +30,7 @@
  }
  
  ; Function Attrs: noduplicate nounwind
-  declare void @llvm.set.loop.iterations.i32(i32) #1
+  declare i32 @llvm.start.loop.iterations.i32(i32) #1
  
  ; Function Attrs: noduplicate nounwind
  declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #1
@ -102,7 +102,7 @@ body:             |
    liveins: $r0
  
    $lr = tMOVr $r0, 14, $noreg
-    t2DoLoopStart killed $r0
+    $lr = t2DoLoopStart killed $r0
  
  bb.2.while.body:
    successors: %bb.2(0x7c000000), %bb.3(0x04000000)
--- a/test/CodeGen/Thumb2/LowOverheadLoops/revert-non-header.mir
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/revert-non-header.mir
@ -30,7 +30,7 @@
    %gap.057 = sdiv i32 %gap.057.in, 2
    %cmp252 = icmp slt i32 %gap.057, %n
    %tmp = sub i32 %n, %gap.057
-    call void @llvm.set.loop.iterations.i32(i32 %tmp)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp)
    br i1 %cmp252, label %for.cond4.preheader.preheader, label %for.cond.loopexit
  
  for.cond4.preheader.preheader:                    ; preds = %for.cond1.preheader
@ -44,7 +44,7 @@
    %lsr.iv2 = phi i32* [ %scevgep3, %for.inc16 ], [ %scevgep1, %for.cond4.preheader.preheader ]
    %lsr.iv = phi i32* [ %v, %for.cond4.preheader.preheader ], [ %scevgep, %for.inc16 ]
    %i.053 = phi i32 [ %inc, %for.inc16 ], [ %gap.057, %for.cond4.preheader.preheader ]
-    %tmp8 = phi i32 [ %tmp, %for.cond4.preheader.preheader ], [ %tmp16, %for.inc16 ]
+    %tmp8 = phi i32 [ %start, %for.cond4.preheader.preheader ], [ %tmp16, %for.inc16 ]
    %j.048 = sub nsw i32 %i.053, %gap.057
    %cmp549 = icmp sgt i32 %j.048, -1
    br i1 %cmp549, label %land.rhs.preheader, label %for.inc16
@ -93,7 +93,7 @@
  }
  
  ; Function Attrs: noduplicate nounwind
-  declare void @llvm.set.loop.iterations.i32(i32) #0
+  declare i32 @llvm.start.loop.iterations.i32(i32) #0
  
  ; Function Attrs: noduplicate nounwind
  declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #0
@ -208,7 +208,7 @@ body:             |
    renamable $lr = t2SUBrs renamable $r1, renamable $r2, 9, 14, $noreg, $noreg
    renamable $r9 = t2ASRri renamable $r2, 1, 14, $noreg, $noreg
    t2CMPrs renamable $r1, killed renamable $r2, 9, 14, $noreg, implicit-def $cpsr
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr
    tBcc %bb.2, 13, killed $cpsr
  
  bb.4.for.cond4.preheader.preheader:
--- a/test/CodeGen/Thumb2/LowOverheadLoops/safe-def-no-mov.mir
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/safe-def-no-mov.mir
@ -11,7 +11,7 @@
  entry:
    %scevgep = getelementptr i32, i32* %q, i32 -1
    %scevgep3 = getelementptr i32, i32* %p, i32 -1
-    call void @llvm.set.loop.iterations.i32(i32 %n)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %n)
    br label %preheader

  preheader:
@ -20,7 +20,7 @@
  while.body:                                       ; preds = %while.body, %entry
    %lsr.iv4 = phi i32* [ %scevgep5, %while.body ], [ %scevgep3, %preheader ]
    %lsr.iv = phi i32* [ %scevgep1, %while.body ], [ %scevgep, %preheader ]
-    %0 = phi i32 [ %n, %preheader ], [ %2, %while.body ]
+    %0 = phi i32 [ %start, %preheader ], [ %2, %while.body ]
    %scevgep6 = getelementptr i32, i32* %lsr.iv, i32 1
    %scevgep2 = getelementptr i32, i32* %lsr.iv4, i32 1
    %1 = load i32, i32* %scevgep6, align 4
@ -35,7 +35,7 @@
    ret i32 0
  }

-  declare void @llvm.set.loop.iterations.i32(i32) #0
+  declare i32 @llvm.start.loop.iterations.i32(i32) #0
  declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #0

  attributes #0 = { noduplicate nounwind }
@ -120,7 +120,7 @@ body:             |
    frame-setup CFI_INSTRUCTION def_cfa_offset 8
    frame-setup CFI_INSTRUCTION offset $lr, -4
    frame-setup CFI_INSTRUCTION offset $r7, -8
-    t2DoLoopStart $r0
+    $lr = t2DoLoopStart $r0
    renamable $r0, dead $cpsr = tSUBi3 killed renamable $r1, 4, 14, $noreg
    renamable $r1, dead $cpsr = tSUBi3 killed renamable $r2, 4, 14, $noreg

--- a/test/CodeGen/Thumb2/LowOverheadLoops/safe-retaining.mir
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/safe-retaining.mir
@ -8,11 +8,11 @@
    br i1 %cmp, label %exit, label %loop.ph

  loop.ph:                                          ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %iters)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %iters)
    br label %loop.body

  loop.body:                                        ; preds = %loop.body, %loop.ph
-    %lsr.iv = phi i32 [ %lsr.iv.next, %loop.body ], [ %iters, %loop.ph ]
+    %lsr.iv = phi i32 [ %lsr.iv.next, %loop.body ], [ %start, %loop.ph ]
    %count = phi i32 [ %elts, %loop.ph ], [ %elts.rem, %loop.body ]
    %addr.a = phi <4 x i32>* [ %a, %loop.ph ], [ %addr.a.next, %loop.body ]
    %addr.b = phi <4 x i32>* [ %b, %loop.ph ], [ %addr.b.next, %loop.body ]
@ -43,11 +43,11 @@
    br i1 %cmp, label %exit, label %loop.ph

  loop.ph:                                          ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %iters)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %iters)
    br label %loop.body

  loop.body:                                        ; preds = %loop.body, %loop.ph
-    %lsr.iv = phi i32 [ %lsr.iv.next, %loop.body ], [ %iters, %loop.ph ]
+    %lsr.iv = phi i32 [ %lsr.iv.next, %loop.body ], [ %start, %loop.ph ]
    %count = phi i32 [ %elts, %loop.ph ], [ %elts.rem, %loop.body ]
    %addr.a = phi <8 x i16>* [ %a, %loop.ph ], [ %addr.a.next, %loop.body ]
    %addr.b = phi <8 x i16>* [ %b, %loop.ph ], [ %addr.b.next, %loop.body ]
@ -72,7 +72,7 @@
    ret void
  }

-  declare void @llvm.set.loop.iterations.i32(i32)
+  declare i32 @llvm.start.loop.iterations.i32(i32)
  declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
  declare <4 x i1> @llvm.arm.mve.vctp32(i32)
  declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
@ -160,7 +160,7 @@ body:             |
    liveins: $r0, $r1, $r2, $r3, $r4, $lr

    renamable $r4 = tLDRspi $sp, 2, 14 /* CC::al */, $noreg :: (load 4 from %fixed-stack.0, align 8)
-    t2DoLoopStart renamable $r4
+    $lr = t2DoLoopStart renamable $r4
    $r12 = tMOVr killed $r4, 14 /* CC::al */, $noreg

  bb.2.loop.body:
@ -261,7 +261,7 @@ body:             |
    liveins: $r0, $r1, $r2, $r3, $r4, $lr

    renamable $r12 = t2LDRi12 $sp, 8, 14 /* CC::al */, $noreg :: (load 4 from %fixed-stack.0, align 8)
-    t2DoLoopStart renamable $r12
+    $lr = t2DoLoopStart renamable $r12
    $r4 = tMOVr killed $r12, 14 /* CC::al */, $noreg

  bb.2.loop.body:
--- a/test/CodeGen/Thumb2/LowOverheadLoops/sibling-loops.ll
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/sibling-loops.ll
@ -15,29 +15,29 @@ define arm_aapcs_vfpcc void @test(i16* noalias nocapture readonly %off, i16* noa
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
 ; CHECK-NEXT:    @ Child Loop BB0_3 Depth 2
 ; CHECK-NEXT:    @ Child Loop BB0_5 Depth 2
-; CHECK-NEXT:    movs r5, #0
 ; CHECK-NEXT:    dls lr, r3
+; CHECK-NEXT:    movs r6, #0
 ; CHECK-NEXT:  .LBB0_3: @ %for.body4.us
 ; CHECK-NEXT:    @ Parent Loop BB0_2 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    ldrh.w r6, [r0, r5, lsl #1]
-; CHECK-NEXT:    ldrh.w r7, [r1, r5, lsl #1]
-; CHECK-NEXT:    add r6, r7
-; CHECK-NEXT:    strh.w r6, [r4, r5, lsl #1]
-; CHECK-NEXT:    adds r5, #1
+; CHECK-NEXT:    ldrh.w r5, [r0, r6, lsl #1]
+; CHECK-NEXT:    ldrh.w r7, [r1, r6, lsl #1]
+; CHECK-NEXT:    add r5, r7
+; CHECK-NEXT:    strh.w r5, [r4, r6, lsl #1]
+; CHECK-NEXT:    adds r6, #1
 ; CHECK-NEXT:    le lr, .LBB0_3
 ; CHECK-NEXT:  @ %bb.4: @ %for.body15.us.preheader
 ; CHECK-NEXT:    @ in Loop: Header=BB0_2 Depth=1
-; CHECK-NEXT:    movs r5, #0
 ; CHECK-NEXT:    dls lr, r3
+; CHECK-NEXT:    movs r6, #0
 ; CHECK-NEXT:  .LBB0_5: @ %for.body15.us
 ; CHECK-NEXT:    @ Parent Loop BB0_2 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    ldrh.w r7, [r0, r5, lsl #1]
-; CHECK-NEXT:    ldrh.w r6, [r1, r5, lsl #1]
-; CHECK-NEXT:    add r6, r7
-; CHECK-NEXT:    strh.w r6, [r2, r5, lsl #1]
-; CHECK-NEXT:    adds r5, #1
+; CHECK-NEXT:    ldrh.w r7, [r0, r6, lsl #1]
+; CHECK-NEXT:    ldrh.w r5, [r1, r6, lsl #1]
+; CHECK-NEXT:    add r5, r7
+; CHECK-NEXT:    strh.w r5, [r2, r6, lsl #1]
+; CHECK-NEXT:    adds r6, #1
 ; CHECK-NEXT:    le lr, .LBB0_5
 ; CHECK-NEXT:  @ %bb.6: @ %for.cond.cleanup14.us
 ; CHECK-NEXT:    @ in Loop: Header=BB0_2 Depth=1
--- a/test/CodeGen/Thumb2/LowOverheadLoops/size-limit.mir
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/size-limit.mir
@ -16,7 +16,7 @@
    %scevgep = getelementptr i32, i32* %a, i32 -1
    %scevgep4 = getelementptr i32, i32* %c, i32 -1
    %scevgep8 = getelementptr i32, i32* %b, i32 -1
-    call void @llvm.set.loop.iterations.i32(i32 %N)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %N)
    br label %for.body

  for.cond.cleanup:                                 ; preds = %for.body, %entry
@ -26,7 +26,7 @@
    %lsr.iv9 = phi i32* [ %scevgep8, %for.body.preheader ], [ %scevgep10, %for.body ]
    %lsr.iv5 = phi i32* [ %scevgep4, %for.body.preheader ], [ %scevgep6, %for.body ]
    %lsr.iv1 = phi i32* [ %scevgep, %for.body.preheader ], [ %scevgep2, %for.body ]
-    %0 = phi i32 [ %N, %for.body.preheader ], [ %3, %for.body ]
+    %0 = phi i32 [ %start, %for.body.preheader ], [ %3, %for.body ]
    %size = call i32 @llvm.arm.space(i32 4070, i32 undef)
    %scevgep3 = getelementptr i32, i32* %lsr.iv9, i32 1
    %1 = load i32, i32* %scevgep3, align 4
@ -47,7 +47,7 @@
  declare i32 @llvm.arm.space(i32 immarg, i32) #0

  ; Function Attrs: noduplicate nounwind
-  declare void @llvm.set.loop.iterations.i32(i32) #1
+  declare i32 @llvm.start.loop.iterations.i32(i32) #1

  ; Function Attrs: noduplicate nounwind
  declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #1
@ -155,7 +155,7 @@ body:             |
    renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14, $noreg
    renamable $r0, dead $cpsr = tSUBi8 killed renamable $r0, 4, 14, $noreg
    $lr = tMOVr $r3, 14, $noreg
-    t2DoLoopStart killed $r3
+    $lr = t2DoLoopStart killed $r3

  bb.2.for.body:
    successors: %bb.2(0x7c000000), %bb.3(0x04000000)
--- a/test/CodeGen/Thumb2/LowOverheadLoops/skip-debug.mir
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/skip-debug.mir
@ -20,7 +20,7 @@

  vector.ph:                                        ; preds = %entry
    %7 = insertelement <4 x i32> <i32 undef, i32 0, i32 0, i32 0>, i32 %0, i32 0, !dbg !32
-    call void @llvm.set.loop.iterations.i32(i32 %6), !dbg !32
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %6), !dbg !32
    %8 = shl i32 %5, 2, !dbg !32
    %9 = sub i32 %N, %8, !dbg !32
    br label %vector.body, !dbg !32
@ -28,7 +28,7 @@
  vector.body:                                      ; preds = %vector.body, %vector.ph
    %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %b, %vector.ph ], !dbg !33
    %vec.phi = phi <4 x i32> [ %7, %vector.ph ], [ %15, %vector.body ]
-    %10 = phi i32 [ %6, %vector.ph ], [ %16, %vector.body ]
+    %10 = phi i32 [ %start, %vector.ph ], [ %16, %vector.body ]
    %11 = phi i32 [ %N, %vector.ph ], [ %13, %vector.body ]
    %lsr.iv14 = bitcast i16* %lsr.iv to <4 x i16>*
    %12 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %11), !dbg !34
@ -59,7 +59,7 @@
  declare void @llvm.dbg.value(metadata, metadata, metadata)
  declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>)
  declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
-  declare void @llvm.set.loop.iterations.i32(i32)
+  declare i32 @llvm.start.loop.iterations.i32(i32)
  declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
  declare <4 x i1> @llvm.arm.mve.vctp32(i32)

@ -261,7 +261,7 @@ body:             |
    renamable $lr = nuw nsw t2ADDrs killed renamable $lr, renamable $r3, 19, 14, $noreg, $noreg, debug-location !32
    renamable $r3, dead $cpsr = tLSRri killed renamable $r3, 2, 14, $noreg, debug-location !32
    renamable $r3 = t2SUBrs renamable $r2, killed renamable $r3, 18, 14, $noreg, $noreg, debug-location !32
-    t2DoLoopStart renamable $lr, debug-location !32
+    $lr = t2DoLoopStart renamable $lr, debug-location !32

  bb.2.vector.body:
    successors: %bb.2(0x7c000000), %bb.3(0x04000000)
--- a/test/CodeGen/Thumb2/LowOverheadLoops/switch.mir
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/switch.mir
@ -17,7 +17,7 @@
    br i1 %cmp11, label %for.cond.cleanup, label %for.body.preheader
  
  for.body.preheader:                               ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %N)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %N)
    br label %for.body
  
  for.cond.cleanup:                                 ; preds = %for.inc, %entry
@ -30,7 +30,7 @@
    %lsr.iv1 = phi i8* [ %c, %for.body.preheader ], [ %scevgep, %for.inc ]
    %spaces.013 = phi i32 [ %spaces.1, %for.inc ], [ 0, %for.body.preheader ]
    %found.012 = phi i32 [ %found.1, %for.inc ], [ 0, %for.body.preheader ]
-    %0 = phi i32 [ %N, %for.body.preheader ], [ %3, %for.inc ]
+    %0 = phi i32 [ %start, %for.body.preheader ], [ %3, %for.inc ]
    %1 = load i8, i8* %lsr.iv1, align 1
    %2 = zext i8 %1 to i32
    switch i32 %2, label %for.inc [
@ -58,7 +58,7 @@
  }
  
  ; Function Attrs: noduplicate nounwind
-  declare void @llvm.set.loop.iterations.i32(i32) #0
+  declare i32 @llvm.start.loop.iterations.i32(i32) #0
  
  ; Function Attrs: noduplicate nounwind
  declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #0
@ -130,7 +130,7 @@ body:             |
    liveins: $r0, $r1
  
    $lr = tMOVr $r1, 14, $noreg
-    t2DoLoopStart killed $r1
+    $lr = t2DoLoopStart killed $r1
    renamable $r1, dead $cpsr = tMOVi8 0, 14, $noreg
    renamable $r12 = t2MOVi 1, 14, $noreg, $noreg
    renamable $r2, dead $cpsr = tMOVi8 0, 14, $noreg
--- a/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-basic.ll
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-basic.ll
@ -25,12 +25,12 @@ vector.ph:                                        ; preds = %entry
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert10 = insertelement <16 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat11 = shufflevector <16 x i32> %broadcast.splatinsert10, <16 x i32> undef, <16 x i32> zeroinitializer
-  call void @llvm.set.loop.iterations.i32(i32 %tmp13)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

 vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
+  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
  %broadcast.splatinsert = insertelement <16 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <16 x i32> %broadcast.splatinsert, <16 x i32> undef, <16 x i32> zeroinitializer
  %induction = or <16 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@ -82,12 +82,12 @@ vector.ph:                                        ; preds = %entry
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert10 = insertelement <8 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat11 = shufflevector <8 x i32> %broadcast.splatinsert10, <8 x i32> undef, <8 x i32> zeroinitializer
-  call void @llvm.set.loop.iterations.i32(i32 %tmp13)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

 vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
+  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
  %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
  %induction = add <8 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@ -138,12 +138,12 @@ vector.ph:                                        ; preds = %entry
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
-  call void @llvm.set.loop.iterations.i32(i32 %tmp13)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

 vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
+  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
@ -193,12 +193,12 @@ vector.ph:                                        ; preds = %entry
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
-  call void @llvm.set.loop.iterations.i32(i32 %tmp13)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

 vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
+  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
@ -252,12 +252,12 @@ vector.ph:                                        ; preds = %entry
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
-  call void @llvm.set.loop.iterations.i32(i32 %tmp13)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

 vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
+  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
@ -311,12 +311,12 @@ vector.ph:                                        ; preds = %entry
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
-  call void @llvm.set.loop.iterations.i32(i32 %tmp13)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

 vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
+  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
@ -374,7 +374,7 @@ vector.ph:
  %scevgep = getelementptr i32, i32* %A, i32 8
  %scevgep30 = getelementptr i32, i32* %C, i32 8
  %scevgep37 = getelementptr i32, i32* %B, i32 8
-  call void @llvm.set.loop.iterations.i32(i32 %v5)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 %v5)
  br label %vector.body

 vector.body:
@ -382,7 +382,7 @@ vector.body:
  %lsr.iv31 = phi i32* [ %scevgep32, %vector.body ], [ %scevgep30, %vector.ph ]
  %lsr.iv = phi i32* [ %scevgep25, %vector.body ], [ %scevgep, %vector.ph ]
  %index = phi i32 [ 0, %vector.ph ], [ %v14, %vector.body ]
-  %v6 = phi i32 [ %v5, %vector.ph ], [ %v15, %vector.body ]
+  %v6 = phi i32 [ %start, %vector.ph ], [ %v15, %vector.body ]
  %lsr.iv3840 = bitcast i32* %lsr.iv38 to <4 x i32>*
  %lsr.iv3133 = bitcast i32* %lsr.iv31 to <4 x i32>*
  %lsr.iv26 = bitcast i32* %lsr.iv to <4 x i32>*
@ -447,7 +447,7 @@ entry:
  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup

 vector.ph:
-  call void @llvm.set.loop.iterations.i32(i32 %5)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 %5)
  br label %vector.body

 vector.body:                                      ; preds = %vector.body, %vector.ph
@ -455,7 +455,7 @@ vector.body:                                      ; preds = %vector.body, %vecto
  %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %C, %vector.ph ]
  %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %vector.ph ]
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %6 = phi i32 [ %5, %vector.ph ], [ %8, %vector.body ]
+  %6 = phi i32 [ %start, %vector.ph ], [ %8, %vector.body ]
  %lsr.iv13 = bitcast i32* %lsr.iv to <4 x i32>*
  %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
  %lsr.iv1719 = bitcast i32* %lsr.iv17 to <4 x i32>*
@ -496,7 +496,7 @@ entry:

 vector.ph:                                        ; preds = %entry
  %trip.count.minus.1 = add i32 %N, -1
-  call void @llvm.set.loop.iterations.i32(i32 %5)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 %5)
  br label %vector.body

 vector.body:                                      ; preds = %vector.body, %vector.ph
@ -504,7 +504,7 @@ vector.body:                                      ; preds = %vector.body, %vecto
  %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %C, %vector.ph ]
  %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %vector.ph ]
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %6 = phi i32 [ %5, %vector.ph ], [ %8, %vector.body ]
+  %6 = phi i32 [ %start, %vector.ph ], [ %8, %vector.body ]

  %lsr.iv13 = bitcast i32* %lsr.iv to <4 x i32>*
  %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
@ -547,7 +547,7 @@ entry:

 vector.ph:                                        ; preds = %entry
  %trip.count.minus.1 = add i32 %N, -1
-  call void @llvm.set.loop.iterations.i32(i32 %5)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 %5)
  br label %vector.body

 vector.body:                                      ; preds = %vector.body, %vector.ph
@ -558,7 +558,7 @@ vector.body:                                      ; preds = %vector.body, %vecto
 ; AddRec base is not 0:
  %index = phi i32 [ 1, %vector.ph ], [ %index.next, %vector.body ]

-  %6 = phi i32 [ %5, %vector.ph ], [ %8, %vector.body ]
+  %6 = phi i32 [ %start, %vector.ph ], [ %8, %vector.body ]
  %lsr.iv13 = bitcast i32* %lsr.iv to <4 x i32>*
  %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
  %lsr.iv1719 = bitcast i32* %lsr.iv17 to <4 x i32>*
@ -589,7 +589,7 @@ declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i
 declare void @llvm.masked.store.v2i64.p0v2i64(<2 x i64>, <2 x i64>*, i32 immarg, <2 x i1>)
 declare <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>*, i32 immarg, <2 x i1>, <2 x i64>)
 declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>)
-declare void @llvm.set.loop.iterations.i32(i32)
+declare i32 @llvm.start.loop.iterations.i32(i32)
 declare i32 @llvm.loop.decrement.reg.i32(i32, i32)
 declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
 declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32)
--- a/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-const.ll
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-const.ll
@ -4,14 +4,14 @@
 define dso_local void @foo(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 {
 ; CHECK-LABEL: @foo(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    call void @llvm.set.loop.iterations.i32(i32 8001)
+; CHECK-NEXT:    [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 8001)
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[LSR_IV14:%.*]] = phi i32* [ [[SCEVGEP15:%.*]], [[VECTOR_BODY]] ], [ [[A:%.*]], [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    [[LSR_IV11:%.*]] = phi i32* [ [[SCEVGEP12:%.*]], [[VECTOR_BODY]] ], [ [[C:%.*]], [[ENTRY]] ]
 ; CHECK-NEXT:    [[LSR_IV:%.*]] = phi i32* [ [[SCEVGEP:%.*]], [[VECTOR_BODY]] ], [ [[B:%.*]], [[ENTRY]] ]
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = phi i32 [ 8001, [[ENTRY]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = phi i32 [ [[START]], [[ENTRY]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP1:%.*]] = phi i32 [ 32003, [[ENTRY]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[LSR_IV1416:%.*]] = bitcast i32* [[LSR_IV14]] to <4 x i32>*
 ; CHECK-NEXT:    [[LSR_IV1113:%.*]] = bitcast i32* [[LSR_IV11]] to <4 x i32>*
@ -36,7 +36,7 @@ define dso_local void @foo(i32* noalias nocapture %A, i32* noalias nocapture rea
 ; CHECK-NEXT:    ret void
 ;
 entry:
-  call void @llvm.set.loop.iterations.i32(i32 8001)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 8001)
  br label %vector.body

 vector.body:
@ -44,7 +44,7 @@ vector.body:
  %lsr.iv11 = phi i32* [ %scevgep12, %vector.body ], [ %C, %entry ]
  %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %entry ]
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
-  %0 = phi i32 [ 8001, %entry ], [ %3, %vector.body ]
+  %0 = phi i32 [ %start, %entry ], [ %3, %vector.body ]
  %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
  %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>*
  %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>*
@ -77,13 +77,13 @@ for.cond.cleanup:
 define dso_local void @foo2(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 {
 ; CHECK-LABEL: @foo2(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    call void @llvm.set.loop.iterations.i32(i32 2000)
+; CHECK-NEXT:    [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 2000)
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[LSR_IV14:%.*]] = phi i32* [ [[SCEVGEP15:%.*]], [[VECTOR_BODY]] ], [ [[A:%.*]], [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    [[LSR_IV11:%.*]] = phi i32* [ [[SCEVGEP12:%.*]], [[VECTOR_BODY]] ], [ [[C:%.*]], [[ENTRY]] ]
 ; CHECK-NEXT:    [[LSR_IV:%.*]] = phi i32* [ [[SCEVGEP:%.*]], [[VECTOR_BODY]] ], [ [[B:%.*]], [[ENTRY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = phi i32 [ 2000, [[ENTRY]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = phi i32 [ [[START]], [[ENTRY]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[LSR_IV1416:%.*]] = bitcast i32* [[LSR_IV14]] to <4 x i32>*
 ; CHECK-NEXT:    [[LSR_IV1113:%.*]] = bitcast i32* [[LSR_IV11]] to <4 x i32>*
 ; CHECK-NEXT:    [[LSR_IV10:%.*]] = bitcast i32* [[LSR_IV]] to <4 x i32>*
@ -101,14 +101,14 @@ define dso_local void @foo2(i32* noalias nocapture %A, i32* noalias nocapture re
 ; CHECK-NEXT:    ret void
 ;
 entry:
-  call void @llvm.set.loop.iterations.i32(i32 2000)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 2000)
  br label %vector.body

 vector.body:
  %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %A, %entry ]
  %lsr.iv11 = phi i32* [ %scevgep12, %vector.body ], [ %C, %entry ]
  %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %entry ]
-  %0 = phi i32 [ 2000, %entry ], [ %2, %vector.body ]
+  %0 = phi i32 [ %start, %entry ], [ %2, %vector.body ]
  %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
  %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>*
  %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>*
@ -131,14 +131,14 @@ for.cond.cleanup:
 define dso_local void @foo3(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 {
 ; CHECK-LABEL: @foo3(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    call void @llvm.set.loop.iterations.i32(i32 8001)
+; CHECK-NEXT:    [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 8001)
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[LSR_IV14:%.*]] = phi i32* [ [[SCEVGEP15:%.*]], [[VECTOR_BODY]] ], [ [[A:%.*]], [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    [[LSR_IV11:%.*]] = phi i32* [ [[SCEVGEP12:%.*]], [[VECTOR_BODY]] ], [ [[C:%.*]], [[ENTRY]] ]
 ; CHECK-NEXT:    [[LSR_IV:%.*]] = phi i32* [ [[SCEVGEP:%.*]], [[VECTOR_BODY]] ], [ [[B:%.*]], [[ENTRY]] ]
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = phi i32 [ 8001, [[ENTRY]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = phi i32 [ [[START]], [[ENTRY]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[LSR_IV1416:%.*]] = bitcast i32* [[LSR_IV14]] to <4 x i32>*
 ; CHECK-NEXT:    [[LSR_IV1113:%.*]] = bitcast i32* [[LSR_IV11]] to <4 x i32>*
 ; CHECK-NEXT:    [[LSR_IV10:%.*]] = bitcast i32* [[LSR_IV]] to <4 x i32>*
@ -161,7 +161,7 @@ define dso_local void @foo3(i32* noalias nocapture %A, i32* noalias nocapture re
 ; CHECK-NEXT:    ret void
 ;
 entry:
-  call void @llvm.set.loop.iterations.i32(i32 8001)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 8001)
  br label %vector.body

 vector.body:
@ -169,7 +169,7 @@ vector.body:
  %lsr.iv11 = phi i32* [ %scevgep12, %vector.body ], [ %C, %entry ]
  %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %entry ]
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
-  %0 = phi i32 [ 8001, %entry ], [ %3, %vector.body ]
+  %0 = phi i32 [ %start, %entry ], [ %3, %vector.body ]
  %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
  %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>*
  %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>*
@ -199,14 +199,14 @@ for.cond.cleanup:
 define dso_local void @foo5(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 {
 ; CHECK-LABEL: @foo5(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    call void @llvm.set.loop.iterations.i32(i32 8001)
+; CHECK-NEXT:    [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 8001)
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[LSR_IV14:%.*]] = phi i32* [ [[SCEVGEP15:%.*]], [[VECTOR_BODY]] ], [ [[A:%.*]], [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    [[LSR_IV11:%.*]] = phi i32* [ [[SCEVGEP12:%.*]], [[VECTOR_BODY]] ], [ [[C:%.*]], [[ENTRY]] ]
 ; CHECK-NEXT:    [[LSR_IV:%.*]] = phi i32* [ [[SCEVGEP:%.*]], [[VECTOR_BODY]] ], [ [[B:%.*]], [[ENTRY]] ]
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = phi i32 [ 8001, [[ENTRY]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = phi i32 [ [[START]], [[ENTRY]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[LSR_IV1416:%.*]] = bitcast i32* [[LSR_IV14]] to <4 x i32>*
 ; CHECK-NEXT:    [[LSR_IV1113:%.*]] = bitcast i32* [[LSR_IV11]] to <4 x i32>*
 ; CHECK-NEXT:    [[LSR_IV10:%.*]] = bitcast i32* [[LSR_IV]] to <4 x i32>*
@ -229,7 +229,7 @@ define dso_local void @foo5(i32* noalias nocapture %A, i32* noalias nocapture re
 ; CHECK-NEXT:    ret void
 ;
 entry:
-  call void @llvm.set.loop.iterations.i32(i32 8001)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 8001)
  br label %vector.body

 vector.body:
@ -237,7 +237,7 @@ vector.body:
  %lsr.iv11 = phi i32* [ %scevgep12, %vector.body ], [ %C, %entry ]
  %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %entry ]
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
-  %0 = phi i32 [ 8001, %entry ], [ %3, %vector.body ]
+  %0 = phi i32 [ %start, %entry ], [ %3, %vector.body ]
  %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
  %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>*
  %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>*
@ -273,7 +273,7 @@ for.cond.cleanup:
 ;
 define dso_local void @inconsistent_tripcounts(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 {
 entry:
-  call void @llvm.set.loop.iterations.i32(i32 8001)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 8001)
  br label %vector.body

 vector.body:
@ -281,7 +281,7 @@ vector.body:
  %lsr.iv11 = phi i32* [ %scevgep12, %vector.body ], [ %C, %entry ]
  %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %entry ]
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
-  %0 = phi i32 [ 8001, %entry ], [ %3, %vector.body ]
+  %0 = phi i32 [ %start, %entry ], [ %3, %vector.body ]
  %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
  %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>*
  %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>*
@ -316,7 +316,7 @@ for.cond.cleanup:
 ;
 define dso_local void @overflow_in_sub(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 {
 entry:
-  call void @llvm.set.loop.iterations.i32(i32 1073741824)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 1073741824)
  br label %vector.body

 vector.body:
@ -324,7 +324,7 @@ vector.body:
  %lsr.iv11 = phi i32* [ %scevgep12, %vector.body ], [ %C, %entry ]
  %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %entry ]
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
-  %0 = phi i32 [ 8001, %entry ], [ %3, %vector.body ]
+  %0 = phi i32 [ %start, %entry ], [ %3, %vector.body ]
  %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
  %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>*
  %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>*
@ -359,7 +359,7 @@ for.cond.cleanup:
 ;
 define dso_local void @IV_not_an_induction(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 {
 entry:
-  call void @llvm.set.loop.iterations.i32(i32 8001)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 8001)
  br label %vector.body

 vector.body:
@ -367,7 +367,7 @@ vector.body:
  %lsr.iv11 = phi i32* [ %scevgep12, %vector.body ], [ %C, %entry ]
  %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %entry ]
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
-  %0 = phi i32 [ 8001, %entry ], [ %3, %vector.body ]
+  %0 = phi i32 [ %start, %entry ], [ %3, %vector.body ]
  %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
  %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>*
  %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>*
@ -402,7 +402,7 @@ for.cond.cleanup:
 ;
 define dso_local void @IV_wrong_step(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 {
 entry:
-  call void @llvm.set.loop.iterations.i32(i32 8001)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 8001)
  br label %vector.body

 vector.body:
@ -410,7 +410,7 @@ vector.body:
  %lsr.iv11 = phi i32* [ %scevgep12, %vector.body ], [ %C, %entry ]
  %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %entry ]
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
-  %0 = phi i32 [ 8001, %entry ], [ %3, %vector.body ]
+  %0 = phi i32 [ %start, %entry ], [ %3, %vector.body ]
  %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
  %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>*
  %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>*
@ -448,7 +448,7 @@ for.cond.cleanup:
 ;
 define dso_local void @IV_step_not_constant(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 {
 entry:
-  call void @llvm.set.loop.iterations.i32(i32 8001)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 8001)
  br label %vector.body

 vector.body:
@ -456,7 +456,7 @@ vector.body:
  %lsr.iv11 = phi i32* [ %scevgep12, %vector.body ], [ %C, %entry ]
  %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %entry ]
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
-  %0 = phi i32 [ 8001, %entry ], [ %3, %vector.body ]
+  %0 = phi i32 [ %start, %entry ], [ %3, %vector.body ]
  %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
  %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>*
  %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>*
@ -502,7 +502,7 @@ vector.ph:                                        ; preds = %vector.ph.preheader
  %lsr.iv31 = phi i32* [ %C, %vector.ph.preheader ], [ %scevgep32, %for.cond.cleanup3 ]
  %lsr.iv = phi i32* [ %A, %vector.ph.preheader ], [ %scevgep, %for.cond.cleanup3 ]
  %j.025 = phi i32 [ %inc11, %for.cond.cleanup3 ], [ 0, %vector.ph.preheader ]
-  call void @llvm.set.loop.iterations.i32(i32 1025)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 1025)
  br label %vector.body

 vector.body:                                      ; preds = %vector.body, %vector.ph
@ -510,7 +510,7 @@ vector.body:                                      ; preds = %vector.body, %vecto
  %lsr.iv33 = phi i32* [ %scevgep34, %vector.body ], [ %lsr.iv31, %vector.ph ]
  %lsr.iv28 = phi i32* [ %scevgep29, %vector.body ], [ %lsr.iv, %vector.ph ]
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %0 = phi i32 [ 1025, %vector.ph ], [ %2, %vector.body ]
+  %0 = phi i32 [ %start, %vector.ph ], [ %2, %vector.body ]
  %lsr.iv3840 = bitcast i32* %lsr.iv38 to <4 x i32>*
  %lsr.iv3335 = bitcast i32* %lsr.iv33 to <4 x i32>*
  %lsr.iv2830 = bitcast i32* %lsr.iv28 to <4 x i32>*
@ -546,5 +546,5 @@ for.cond.cleanup3:                                ; preds = %vector.body
 declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) #1
 declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) #2
 declare i32 @llvm.loop.decrement.reg.i32(i32 , i32 )
-declare void @llvm.set.loop.iterations.i32(i32)
+declare i32 @llvm.start.loop.iterations.i32(i32)
 declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
--- a/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-disabled-in-loloops.ll
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-disabled-in-loloops.ll
@ -83,7 +83,7 @@ entry:

 vector.ph:                                        ; preds = %entry
  %trip.count.minus.1 = add i32 %N, -1
-  call void @llvm.set.loop.iterations.i32(i32 %5)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 %5)
  br label %vector.body

 vector.body:                                      ; preds = %vector.body, %vector.ph
@ -91,7 +91,7 @@ vector.body:                                      ; preds = %vector.body, %vecto
  %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %C, %vector.ph ]
  %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %vector.ph ]
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %6 = phi i32 [ %5, %vector.ph ], [ %8, %vector.body ]
+  %6 = phi i32 [ %start, %vector.ph ], [ %8, %vector.body ]

  %lsr.iv13 = bitcast i32* %lsr.iv to <4 x i32>*
  %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
@ -118,6 +118,6 @@ for.cond.cleanup:                                 ; preds = %vector.body, %entry

 declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
 declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>)
-declare void @llvm.set.loop.iterations.i32(i32)
+declare i32 @llvm.start.loop.iterations.i32(i32)
 declare i32 @llvm.loop.decrement.reg.i32(i32, i32)
 declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
--- a/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-round.ll
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-round.ll
@ -246,11 +246,11 @@ define arm_aapcs_vfpcc void @nearbyint(float* noalias nocapture readonly %pSrcA,
 ; CHECK-NEXT:    bic r3, r3, #3
 ; CHECK-NEXT:    sub.w r12, r3, #4
 ; CHECK-NEXT:    movs r3, #1
-; CHECK-NEXT:    add.w lr, r3, r12, lsr #2
+; CHECK-NEXT:    add.w r3, r3, r12, lsr #2
+; CHECK-NEXT:    mov.w r12, #0
+; CHECK-NEXT:    dls lr, r3
 ; CHECK-NEXT:    adr r3, .LCPI5_0
 ; CHECK-NEXT:    vldrw.u32 q0, [r3]
-; CHECK-NEXT:    mov.w r12, #0
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB5_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vadd.i32 q2, q0, r12
--- a/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-narrow.ll
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-narrow.ll
@ -18,12 +18,12 @@ vector.ph:                                        ; preds = %entry
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
-  call void @llvm.set.loop.iterations.i32(i32 %tmp13)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

 vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
+  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
@ -50,5 +50,5 @@ for.cond.cleanup:                                 ; preds = %vector.body, %entry

 declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
 declare void @llvm.masked.store.v4i16.p0v4i16(<4 x i16>, <4 x i16>*, i32 immarg, <4 x i1>)
-declare void @llvm.set.loop.iterations.i32(i32)
+declare i32 @llvm.start.loop.iterations.i32(i32)
 declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
--- a/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-pattern-fail.ll
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-pattern-fail.ll
@ -20,12 +20,12 @@ vector.ph:                                        ; preds = %entry
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 1
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
-  call void @llvm.set.loop.iterations.i32(i32 %tmp13)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

 vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
+  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
@ -65,12 +65,12 @@ vector.ph:                                        ; preds = %entry
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert10 = insertelement <4 x i32> <i32 1, i32 1, i32 1, i32 1>, i32 %trip.count.minus.1, i32 0
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
-  call void @llvm.set.loop.iterations.i32(i32 %tmp13)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

 vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
+  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
@ -110,12 +110,12 @@ vector.ph:                                        ; preds = %entry
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> <i32 1, i32 1, i32 1, i32 1>, <4 x i32> zeroinitializer
-  call void @llvm.set.loop.iterations.i32(i32 %tmp13)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

 vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
+  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
@ -155,12 +155,12 @@ vector.ph:                                        ; preds = %entry
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-  call void @llvm.set.loop.iterations.i32(i32 %tmp13)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

 vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
+  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
@ -200,12 +200,12 @@ vector.ph:                                        ; preds = %entry
  %trip.count.minus.2 = add i32 %N, -2
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.2, i32 1
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
-  call void @llvm.set.loop.iterations.i32(i32 %tmp13)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

 vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
+  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
@ -245,12 +245,12 @@ vector.ph:                                        ; preds = %entry
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
-  call void @llvm.set.loop.iterations.i32(i32 %tmp13)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

 vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
+  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 1
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
@ -289,12 +289,12 @@ vector.ph:                                        ; preds = %entry
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
-  call void @llvm.set.loop.iterations.i32(i32 %tmp13)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

 vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
+  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
  %incorrect = add i32 %index, 1
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %incorrect, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
@ -335,12 +335,12 @@ vector.ph:                                        ; preds = %entry
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
-  call void @llvm.set.loop.iterations.i32(i32 %tmp13)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

 vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
+  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
@ -380,12 +380,12 @@ vector.ph:                                        ; preds = %entry
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
-  call void @llvm.set.loop.iterations.i32(i32 %tmp13)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

 vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
+  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 1, i32 2, i32 3, i32 4>
@ -425,12 +425,12 @@ vector.ph:                                        ; preds = %entry
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
-  call void @llvm.set.loop.iterations.i32(i32 %tmp13)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

 vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
+  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, %offsets
@ -470,12 +470,12 @@ vector.ph:                                        ; preds = %entry
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
-  call void @llvm.set.loop.iterations.i32(i32 %tmp13)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

 vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
+  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
@ -501,6 +501,6 @@ for.cond.cleanup:                                 ; preds = %vector.body, %entry

 declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) #1
 declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) #2
-declare void @llvm.set.loop.iterations.i32(i32) #3
+declare i32 @llvm.start.loop.iterations.i32(i32) #3
 declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3

--- a/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-reduce.ll
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-reduce.ll
@ -23,13 +23,13 @@ vector.ph:
  %0 = add i32 %n.vec, -8
  %1 = lshr i32 %0, 3
  %2 = add i32 %1, 1
-  call void @llvm.set.loop.iterations.i32(i32 %2)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 %2)
  br label %vector.body

 vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph], [ %index.next, %vector.body ]
  %vec.phi = phi <8 x i16> [ zeroinitializer, %vector.ph], [ %tmp8, %vector.body ]
-  %3 = phi i32 [ %2, %vector.ph], [ %4, %vector.body ]
+  %3 = phi i32 [ %start, %vector.ph], [ %4, %vector.body ]
  %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
  %induction = add <8 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@ -94,13 +94,13 @@ vector.ph:
  %0 = add i32 %n.vec, -8
  %1 = lshr i32 %0, 3
  %2 = add nuw nsw i32 %1, 1
-  call void @llvm.set.loop.iterations.i32(i32 %2)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 %2)
  br label %vector.body

 vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph], [ %index.next, %vector.body ]
  %vec.phi = phi <8 x i16> [ zeroinitializer, %vector.ph], [ %tmp6, %vector.body ]
-  %3 = phi i32 [ %2, %vector.ph], [ %4, %vector.body ]
+  %3 = phi i32 [ %start, %vector.ph], [ %4, %vector.body ]
  %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
  %induction = add <8 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@ -158,13 +158,13 @@ entry:
  %0 = add i32 %n.vec, -8
  %1 = lshr i32 %0, 3
  %2 = add nuw nsw i32 %1, 1
-  call void @llvm.set.loop.iterations.i32(i32 %2)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 %2)
  br label %vector.body

 vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %entry], [ %index.next, %vector.body ]
  %vec.phi = phi <8 x i16> [ zeroinitializer, %entry], [ %tmp6, %vector.body ]
-  %3 = phi i32 [ %2, %entry ], [ %4, %vector.body ]
+  %3 = phi i32 [ %start, %entry ], [ %4, %vector.body ]
  %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
  %induction = add <8 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@ -228,7 +228,7 @@ for.body:

 vector.ph:                                        ; preds = %for.body
  %trip.count.minus.1 = add i32 %8, -1
-  call void @llvm.set.loop.iterations.i32(i32 %7)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 %7)
  br label %vector.body

 vector.body:                                      ; preds = %vector.body, %vector.ph
@ -236,7 +236,7 @@ vector.body:                                      ; preds = %vector.body, %vecto
  %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %Input, %vector.ph ]
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %16, %vector.body ]
-  %9 = phi i32 [ %7, %vector.ph ], [ %17, %vector.body ]
+  %9 = phi i32 [ %start, %vector.ph ], [ %17, %vector.body ]
  %lsr.iv4850 = bitcast i16* %lsr.iv48 to <4 x i16>*
  %lsr.iv45 = bitcast i16* %lsr.iv to <4 x i16>*
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %8)
@ -278,7 +278,7 @@ for.end17:                                        ; preds = %for.end, %entry
 }

 declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32 immarg, <8 x i1>, <8 x i16>)
-declare void @llvm.set.loop.iterations.i32(i32)
+declare i32 @llvm.start.loop.iterations.i32(i32)
 declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
 declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
 declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32)
--- a/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-widen.ll
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-widen.ll
@ -17,12 +17,12 @@ vector.ph:                                        ; preds = %entry
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert10 = insertelement <8 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat11 = shufflevector <8 x i32> %broadcast.splatinsert10, <8 x i32> undef, <8 x i32> zeroinitializer
-  call void @llvm.set.loop.iterations.i32(i32 %tmp13)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

 vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
+  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
  %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
  %induction = add <8 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@ -76,13 +76,13 @@ vector.ph:                                        ; preds = %entry
  %broadcast.splat11 = shufflevector <8 x i32> %broadcast.splatinsert10, <8 x i32> undef, <8 x i32> zeroinitializer
  %broadcast.splatinsert10.store = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat11.store = shufflevector <4 x i32> %broadcast.splatinsert10.store, <4 x i32> undef, <4 x i32> zeroinitializer
-  call void @llvm.set.loop.iterations.i32(i32 %tmp13)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

 vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %store.idx = phi i32 [ 0, %vector.ph ], [ %store.idx.next, %vector.body ]
-  %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
+  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
  %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
  %induction = add <8 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@ -139,12 +139,12 @@ vector.ph:                                        ; preds = %entry
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
-  call void @llvm.set.loop.iterations.i32(i32 %tmp13)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

 vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
+  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
@ -178,7 +178,7 @@ declare void @llvm.masked.store.v8i32.p0v8i32(<8 x i32>, <8 x i32>*, i32 immarg,
 declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
 declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>)
 declare void @llvm.masked.store.v4i64.p0v4i64(<4 x i64>, <4 x i64>*, i32 immarg, <4 x i1>)
-declare void @llvm.set.loop.iterations.i32(i32)
+declare i32 @llvm.start.loop.iterations.i32(i32)
 declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
 declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
 declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32)
--- a/test/CodeGen/Thumb2/LowOverheadLoops/tp-multiple-vpst.ll
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/tp-multiple-vpst.ll
@ -7,14 +7,14 @@ define dso_local arm_aapcs_vfpcc i32 @minmaxval4(i32* nocapture readonly %x, i32
 ; CHECK-NEXT:    push {r7, lr}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11}
 ; CHECK-NEXT:    sub sp, #8
+; CHECK-NEXT:    movs r2, #3
 ; CHECK-NEXT:    adr r3, .LCPI0_0
-; CHECK-NEXT:    mov.w lr, #3
+; CHECK-NEXT:    dls lr, r2
 ; CHECK-NEXT:    vldrw.u32 q2, [r3]
 ; CHECK-NEXT:    vmov.i32 q0, #0x80000000
 ; CHECK-NEXT:    vmvn.i32 q1, #0x80000000
 ; CHECK-NEXT:    movs r2, #0
 ; CHECK-NEXT:    vmov.i32 q3, #0xa
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB0_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vadd.i32 q4, q2, r2
--- a/test/CodeGen/Thumb2/LowOverheadLoops/unpredicated-max.mir
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/unpredicated-max.mir
@ -14,11 +14,11 @@
    br i1 %cmp9, label %exit, label %vector.ph

  vector.ph:                                        ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %tmp5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp5)
    br label %vector.body

  vector.body:                                      ; preds = %vector.body, %vector.ph
-    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ]
+    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ]
    %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
    %lsr.iv.2 = phi i16* [ %scevgep.2, %vector.body ], [ %c, %vector.ph ]
    %tmp7 = phi i32 [ %N, %vector.ph ], [ %tmp9, %vector.body ]
@ -40,7 +40,7 @@
  }

  declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32 immarg, <8 x i1>, <8 x i16>)
-  declare void @llvm.set.loop.iterations.i32(i32)
+  declare i32 @llvm.start.loop.iterations.i32(i32)
  declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
  declare <8 x i1> @llvm.arm.mve.vctp16(i32)
  declare i16 @llvm.vector.reduce.smax.v8i16(<8 x i16>)
@ -132,7 +132,7 @@ body:             |
    renamable $r3 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg
    $r12 = t2MOVi16 32768, 14 /* CC::al */, $noreg
    $r12 = t2MOVTi16 killed $r12, 65535, 14 /* CC::al */, $noreg
-    t2DoLoopStart renamable $r3
+    $lr = t2DoLoopStart renamable $r3
    $r5 = tMOVr killed $r3, 14 /* CC::al */, $noreg

  bb.2.vector.body:
--- a/test/CodeGen/Thumb2/LowOverheadLoops/unpredload.ll
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/unpredload.ll
@ -57,9 +57,9 @@ define i32 @bad(i32* readonly %x, i32* nocapture readonly %y, i32 %n) {
 ; CHECK-NEXT:    subs r3, r2, r3
 ; CHECK-NEXT:    add.w r12, r3, #3
 ; CHECK-NEXT:    movs r3, #1
-; CHECK-NEXT:    add.w lr, r3, r12, lsr #2
+; CHECK-NEXT:    add.w r3, r3, r12, lsr #2
 ; CHECK-NEXT:    mov.w r12, #0
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    dls lr, r3
 ; CHECK-NEXT:  .LBB1_1: @ %do.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vctp.32 r2
--- a/test/CodeGen/Thumb2/LowOverheadLoops/unrolled-and-vector.mir
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/unrolled-and-vector.mir
@ -37,18 +37,18 @@
    br i1 %7, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new

  for.body.preheader.new:                           ; preds = %for.body.preheader
-    call void @llvm.set.loop.iterations.i32(i32 %11)
+    %start1 = call i32 @llvm.start.loop.iterations.i32(i32 %11)
    br label %for.body

  vector.ph:                                        ; preds = %vector.memcheck
-    call void @llvm.set.loop.iterations.i32(i32 %5)
+    %start2 = call i32 @llvm.start.loop.iterations.i32(i32 %5)
    br label %vector.body

  vector.body:                                      ; preds = %vector.body, %vector.ph
    %lsr.iv50 = phi i8* [ %scevgep51, %vector.body ], [ %res, %vector.ph ]
    %lsr.iv47 = phi i8* [ %scevgep48, %vector.body ], [ %b, %vector.ph ]
    %lsr.iv = phi i8* [ %scevgep45, %vector.body ], [ %a, %vector.ph ]
-    %12 = phi i32 [ %5, %vector.ph ], [ %17, %vector.body ]
+    %12 = phi i32 [ %start2, %vector.ph ], [ %17, %vector.body ]
    %13 = phi i32 [ %N, %vector.ph ], [ %15, %vector.body ]
    %lsr.iv5052 = bitcast i8* %lsr.iv50 to <16 x i8>*
    %lsr.iv4749 = bitcast i8* %lsr.iv47 to <16 x i8>*
@ -88,7 +88,7 @@

  for.body:                                         ; preds = %for.body, %for.body.preheader.new
    %i.011 = phi i32 [ 0, %for.body.preheader.new ], [ %inc.3, %for.body ]
-    %21 = phi i32 [ %11, %for.body.preheader.new ], [ %30, %for.body ]
+    %21 = phi i32 [ %start1, %for.body.preheader.new ], [ %30, %for.body ]
    %scevgep23 = getelementptr i8, i8* %a, i32 %i.011
    %scevgep2453 = bitcast i8* %scevgep23 to i8*
    %22 = load i8, i8* %scevgep2453, align 1
@ -159,7 +159,7 @@

  declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32 immarg, <16 x i1>, <16 x i8>) #1
  declare void @llvm.masked.store.v16i8.p0v16i8(<16 x i8>, <16 x i8>*, i32 immarg, <16 x i1>) #2
-  declare void @llvm.set.loop.iterations.i32(i32) #3
+  declare i32 @llvm.start.loop.iterations.i32(i32) #3
  declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3
  declare <16 x i1> @llvm.arm.mve.vctp8(i32) #4

@ -429,7 +429,7 @@ body:             |
    renamable $r6 = t2BICri killed renamable $r6, 15, 14, $noreg, $noreg
    renamable $r6, dead $cpsr = tSUBi8 killed renamable $r6, 16, 14, $noreg
    renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r6, 35, 14, $noreg, $noreg
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr

  bb.5.vector.body:
    successors: %bb.5(0x7c000000), %bb.11(0x04000000)
@ -455,7 +455,7 @@ body:             |
    renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14, $noreg
    renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r3, 19, 14, $noreg, $noreg
    renamable $r3, dead $cpsr = tMOVi8 0, 14, $noreg
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr

  bb.7.for.body:
    successors: %bb.7(0x7c000000), %bb.8(0x04000000)
--- a/test/CodeGen/Thumb2/LowOverheadLoops/unsafe-cpsr-loop-def.mir
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/unsafe-cpsr-loop-def.mir
@ -7,14 +7,14 @@
  entry:
    %scevgep = getelementptr i32, i32* %q, i32 -1
    %scevgep3 = getelementptr i32, i32* %p, i32 -1
-    call void @llvm.set.loop.iterations.i32(i32 %n)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %n)
    %limit = lshr i32 %n, 1
    br label %while.body

  while.body:                                       ; preds = %while.body, %entry
    %lsr.iv4 = phi i32* [ %scevgep5, %while.body ], [ %scevgep3, %entry ]
    %lsr.iv = phi i32* [ %scevgep1, %while.body ], [ %scevgep, %entry ]
-    %tmp = phi i32 [ %n, %entry ], [ %tmp2, %while.body ]
+    %tmp = phi i32 [ %start, %entry ], [ %tmp2, %while.body ]
    %scevgep7 = getelementptr i32, i32* %lsr.iv, i32 1
    %scevgep4 = getelementptr i32, i32* %lsr.iv4, i32 1
    %tmp1 = load i32, i32* %scevgep7, align 4
@ -33,7 +33,7 @@
  }

  ; Function Attrs: noduplicate nounwind
-  declare void @llvm.set.loop.iterations.i32(i32) #0
+  declare i32 @llvm.start.loop.iterations.i32(i32) #0

  ; Function Attrs: noduplicate nounwind
  declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #0
@ -130,7 +130,7 @@ body:             |
    renamable $r0, dead $cpsr = tSUBi3 killed renamable $r1, 4, 14, $noreg
    renamable $r1, dead $cpsr = tSUBi3 killed renamable $r2, 4, 14, $noreg
    renamable $r2 = t2LSRri renamable $lr, 1, 14, $noreg, $noreg
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr

  bb.1.while.body:
    successors: %bb.1(0x7c000000), %bb.2(0x04000000)
--- a/test/CodeGen/Thumb2/LowOverheadLoops/unsafe-cpsr-loop-use.mir
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/unsafe-cpsr-loop-use.mir
@ -7,14 +7,14 @@
  entry:
    %scevgep = getelementptr i32, i32* %q, i32 -1
    %scevgep3 = getelementptr i32, i32* %p, i32 -1
-    call void @llvm.set.loop.iterations.i32(i32 %n)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %n)
    %limit = lshr i32 %n, 1
    br label %while.body

  while.body:                                       ; preds = %while.body, %entry
    %lsr.iv4 = phi i32* [ %scevgep5, %while.body ], [ %scevgep3, %entry ]
    %lsr.iv = phi i32* [ %scevgep1, %while.body ], [ %scevgep, %entry ]
-    %tmp = phi i32 [ %n, %entry ], [ %tmp2, %while.body ]
+    %tmp = phi i32 [ %start, %entry ], [ %tmp2, %while.body ]
    %scevgep7 = getelementptr i32, i32* %lsr.iv, i32 1
    %scevgep4 = getelementptr i32, i32* %lsr.iv4, i32 1
    %tmp1 = load i32, i32* %scevgep7, align 4
@ -33,7 +33,7 @@
  }

  ; Function Attrs: noduplicate nounwind
-  declare void @llvm.set.loop.iterations.i32(i32) #0
+  declare i32 @llvm.start.loop.iterations.i32(i32) #0

  ; Function Attrs: noduplicate nounwind
  declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #0
@ -129,7 +129,7 @@ body:             |
    frame-setup CFI_INSTRUCTION offset $r7, -8
    renamable $r0, dead $cpsr = tSUBi3 killed renamable $r1, 4, 14, $noreg
    renamable $r1, dead $cpsr = tSUBi3 killed renamable $r2, 4, 14, $noreg
-    t2DoLoopStart renamable $r0
+    $lr = t2DoLoopStart renamable $r0
    renamable $r2 = t2LSRri renamable $r0, 1, 14, $noreg, $noreg
    $lr = tMOVr $r0, 14, $noreg

--- a/test/CodeGen/Thumb2/LowOverheadLoops/unsafe-liveout.mir
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/unsafe-liveout.mir
@ -1,122 +0,0 @@
-# RUN: llc -mtriple=thumbv8.1m.main -run-pass=arm-low-overhead-loops %s -verify-machineinstrs -o - | FileCheck %s
-# CHECK-NOT: $lr = t2DLS
-# CHECK: $lr = tMOVr $r0, 14
-# CHECK-NOT: $lr = t2LEUpdate
-
--- |
-  target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
-  target triple = "thumbv8.1m.main"
-  
-  define i32 @do_copy(i32 %n, i32* nocapture %p, i32* nocapture readonly %q) {
-  entry:
-    %scevgep = getelementptr i32, i32* %q, i32 -1
-    %scevgep3 = getelementptr i32, i32* %p, i32 -1
-    call void @llvm.set.loop.iterations.i32(i32 %n)
-    br label %preheader
-
-  preheader:
-    br label %while.body
-  
-  while.body:                                       ; preds = %while.body, %entry
-    %lsr.iv4 = phi i32* [ %scevgep5, %while.body ], [ %scevgep3, %preheader ]
-    %lsr.iv = phi i32* [ %scevgep1, %while.body ], [ %scevgep, %preheader ]
-    %0 = phi i32 [ %n, %preheader ], [ %2, %while.body ]
-    %scevgep6 = getelementptr i32, i32* %lsr.iv, i32 1
-    %scevgep2 = getelementptr i32, i32* %lsr.iv4, i32 1
-    %1 = load i32, i32* %scevgep6, align 4
-    store i32 %1, i32* %scevgep2, align 4
-    %scevgep1 = getelementptr i32, i32* %lsr.iv, i32 1
-    %scevgep5 = getelementptr i32, i32* %lsr.iv4, i32 1
-    %2 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %0, i32 1)
-    %3 = icmp ne i32 %2, 0
-    br i1 %3, label %while.body, label %while.end
-  
-  while.end:                                        ; preds = %while.body
-    ret i32 0
-  }
-  
-  declare void @llvm.set.loop.iterations.i32(i32) #0
-  declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #0
-  
-  attributes #0 = { noduplicate nounwind }
-  attributes #1 = { nounwind }
-
-...
---
-name:            do_copy
-alignment:       2
-exposesReturnsTwice: false
-legalized:       false
-regBankSelected: false
-selected:        false
-failedISel:      false
-tracksRegLiveness: true
-hasWinCFI:       false
-registers:       []
-liveins:
-  - { reg: '$r0', virtual-reg: '' }
-  - { reg: '$r1', virtual-reg: '' }
-  - { reg: '$r2', virtual-reg: '' }
-frameInfo:
-  isFrameAddressTaken: false
-  isReturnAddressTaken: false
-  hasStackMap:     false
-  hasPatchPoint:   false
-  stackSize:       8
-  offsetAdjustment: 0
-  maxAlignment:    4
-  adjustsStack:    false
-  hasCalls:        false
-  stackProtector:  ''
-  maxCallFrameSize: 0
-  cvBytesOfCalleeSavedRegisters: 0
-  hasOpaqueSPAdjustment: false
-  hasVAStart:      false
-  hasMustTailInVarArgFunc: false
-  localFrameSize:  0
-  savePoint:       ''
-  restorePoint:    ''
-fixedStack:      []
-stack:
-  - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4, 
-      stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false, 
-      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
-  - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4, 
-      stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true, 
-      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
-callSites:       []
-constants:       []
-machineFunctionInfo: {}
-body:             |
-  bb.0.entry:
-    successors: %bb.1(0x80000000)
-    liveins: $r0, $r1, $r2, $r7, $lr
-  
-    frame-setup tPUSH 14, $noreg, killed $r7, implicit-def $sp, implicit $sp
-    frame-setup CFI_INSTRUCTION def_cfa_offset 8
-    frame-setup CFI_INSTRUCTION offset $lr, -4
-    frame-setup CFI_INSTRUCTION offset $r7, -8
-    t2DoLoopStart $r0
-    renamable $r0, dead $cpsr = tSUBi3 killed renamable $r1, 4, 14, $noreg
-    renamable $r1, dead $cpsr = tSUBi3 killed renamable $r2, 4, 14, $noreg
-
-  bb.1.preheader:
-    successors: %bb.2(0x80000000)
-    liveins: $r0, $r1, $lr
-    $lr = tMOVr $r0, 14, $noreg
-  
-  bb.2.while.body:
-    successors: %bb.2(0x7c000000), %bb.3(0x04000000)
-    liveins: $lr, $r0, $r1
-  
-    renamable $r2, renamable $r1 = t2LDR_PRE killed renamable $r1, 4, 14, $noreg :: (load 4 from %ir.scevgep6)
-    early-clobber renamable $r0 = t2STR_PRE killed renamable $r2, killed renamable $r0, 4, 14, $noreg :: (store 4 into %ir.scevgep2)
-    renamable $lr = t2LoopDec killed renamable $lr, 1
-    t2LoopEnd renamable $lr, %bb.2, implicit-def dead $cpsr
-    tB %bb.3, 14, $noreg
-  
-  bb.3.while.end:
-    $r0, dead $cpsr = tMOVi8 0, 14, $noreg
-    tPOP_RET 14, $noreg, def $r7, def $pc, implicit killed $r0
-
-...
--- a/test/CodeGen/Thumb2/LowOverheadLoops/unsafe-retaining.mir
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/unsafe-retaining.mir
@ -8,11 +8,11 @@
    br i1 %cmp, label %exit, label %loop.ph

  loop.ph:                                          ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %iters)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %iters)
    br label %loop.body

  loop.body:                                        ; preds = %loop.body, %loop.ph
-    %lsr.iv = phi i32 [ %lsr.iv.next, %loop.body ], [ %iters, %loop.ph ]
+    %lsr.iv = phi i32 [ %lsr.iv.next, %loop.body ], [ %start, %loop.ph ]
    %count = phi i32 [ %elts, %loop.ph ], [ %elts.rem, %loop.body ]
    %addr.a = phi <4 x i32>* [ %a, %loop.ph ], [ %addr.a.next, %loop.body ]
    %addr.b = phi <4 x i32>* [ %b, %loop.ph ], [ %addr.b.next, %loop.body ]
@ -44,11 +44,11 @@
    br i1 %cmp, label %exit, label %loop.ph

  loop.ph:                                          ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %iters)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %iters)
    br label %loop.body

  loop.body:                                        ; preds = %loop.body, %loop.ph
-    %lsr.iv = phi i32 [ %lsr.iv.next, %loop.body ], [ %iters, %loop.ph ]
+    %lsr.iv = phi i32 [ %lsr.iv.next, %loop.body ], [ %start, %loop.ph ]
    %count = phi i32 [ %elts, %loop.ph ], [ %elts.rem, %loop.body ]
    %addr.a = phi <4 x i32>* [ %a, %loop.ph ], [ %addr.a.next, %loop.body ]
    %addr.b = phi <4 x i32>* [ %b, %loop.ph ], [ %addr.b.next, %loop.body ]
@ -75,7 +75,7 @@
    ret void
  }

-  declare void @llvm.set.loop.iterations.i32(i32)
+  declare i32 @llvm.start.loop.iterations.i32(i32)
  declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
  declare <4 x i1> @llvm.arm.mve.vctp32(i32)
  declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
@ -163,7 +163,7 @@ body:             |
    liveins: $r0, $r1, $r2, $r3, $r4, $lr

    renamable $r4 = tLDRspi $sp, 2, 14 /* CC::al */, $noreg :: (load 4 from %fixed-stack.0, align 8)
-    t2DoLoopStart renamable $r4
+    $lr = t2DoLoopStart renamable $r4
    $r12 = tMOVr killed $r4, 14 /* CC::al */, $noreg

  bb.2.loop.body:
@ -269,7 +269,7 @@ body:             |
    liveins: $r0, $r1, $r2, $r3, $r4, $lr

    renamable $r4 = tLDRspi $sp, 2, 14 /* CC::al */, $noreg :: (load 4 from %fixed-stack.0, align 8)
-    t2DoLoopStart renamable $r4
+    $lr = t2DoLoopStart renamable $r4
    $r12 = tMOVr killed $r4, 14 /* CC::al */, $noreg

  bb.2.loop.body:
--- a/test/CodeGen/Thumb2/LowOverheadLoops/unsafe-use-after.mir
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/unsafe-use-after.mir
@ -9,7 +9,7 @@
  entry:
    %scevgep = getelementptr i32, i32* %q, i32 -1
    %scevgep3 = getelementptr i32, i32* %p, i32 -1
-    call void @llvm.set.loop.iterations.i32(i32 %n)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %n)
    br label %preheader

  preheader:
@ -18,7 +18,7 @@
  while.body:                                       ; preds = %while.body, %entry
    %lsr.iv4 = phi i32* [ %scevgep5, %while.body ], [ %scevgep3, %preheader ]
    %lsr.iv = phi i32* [ %scevgep1, %while.body ], [ %scevgep, %preheader ]
-    %0 = phi i32 [ %n, %preheader ], [ %2, %while.body ]
+    %0 = phi i32 [ %start, %preheader ], [ %2, %while.body ]
    %scevgep6 = getelementptr i32, i32* %lsr.iv, i32 1
    %scevgep2 = getelementptr i32, i32* %lsr.iv4, i32 1
    %1 = load i32, i32* %scevgep6, align 4
@ -33,7 +33,7 @@
    ret i32 0
  }

-  declare void @llvm.set.loop.iterations.i32(i32) #0
+  declare i32 @llvm.start.loop.iterations.i32(i32) #0
  declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #0

  attributes #0 = { noduplicate nounwind }
@ -89,11 +89,12 @@ body:             |
  ; CHECK-LABEL: name: do_copy
  ; CHECK: bb.0.entry:
  ; CHECK:   successors: %bb.1(0x80000000)
-  ; CHECK:   liveins: $lr, $r2, $r7
+  ; CHECK:   liveins: $r0, $r2, $r7
  ; CHECK:   frame-setup tPUSH 14 /* CC::al */, $noreg, killed $r7, implicit-def $sp, implicit $sp
  ; CHECK:   frame-setup CFI_INSTRUCTION def_cfa_offset 8
  ; CHECK:   frame-setup CFI_INSTRUCTION offset $lr, -4
  ; CHECK:   frame-setup CFI_INSTRUCTION offset $r7, -8
+  ; CHECK:   $lr = t2DLS killed $r0
  ; CHECK:   renamable $r0 = t2SUBri killed renamable $lr, 4, 14 /* CC::al */, $noreg, def dead $cpsr
  ; CHECK:   renamable $r1, dead $cpsr = tSUBi3 killed renamable $r2, 4, 14 /* CC::al */, $noreg
  ; CHECK: bb.1.preheader:
@ -105,9 +106,7 @@ body:             |
  ; CHECK:   liveins: $lr, $r0, $r1
  ; CHECK:   renamable $r2, renamable $r1 = t2LDR_PRE killed renamable $r1, 4, 14 /* CC::al */, $noreg :: (load 4 from %ir.scevgep6)
  ; CHECK:   early-clobber renamable $r0 = t2STR_PRE killed renamable $r2, killed renamable $r0, 4, 14 /* CC::al */, $noreg :: (store 4 into %ir.scevgep2)
-  ; CHECK:   $lr = t2SUBri killed renamable $lr, 1, 14 /* CC::al */, $noreg, def $cpsr
-  ; CHECK:   tBcc %bb.2, 1 /* CC::ne */, killed $cpsr
-  ; CHECK:   tB %bb.3, 14 /* CC::al */, $noreg
+  ; CHECK:   $lr = t2LEUpdate killed renamable $lr, %bb.2
  ; CHECK: bb.3.while.end:
  ; CHECK:   $r0, dead $cpsr = tMOVi8 0, 14 /* CC::al */, $noreg
  ; CHECK:   tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc, implicit killed $r0
@ -119,7 +118,7 @@ body:             |
    frame-setup CFI_INSTRUCTION def_cfa_offset 8
    frame-setup CFI_INSTRUCTION offset $lr, -4
    frame-setup CFI_INSTRUCTION offset $r7, -8
-    t2DoLoopStart $r0
+    $lr = t2DoLoopStart $r0
    renamable $r0 = t2SUBri killed renamable $lr, 4, 14, $noreg, def $cpsr
    renamable $r1, dead $cpsr = tSUBi3 killed renamable $r2, 4, 14, $noreg

--- a/test/CodeGen/Thumb2/LowOverheadLoops/vaddv.mir
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/vaddv.mir
@ -13,11 +13,11 @@
    br i1 %cmp9, label %exit, label %vector.ph

  vector.ph:                                        ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %tmp5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp5)
    br label %vector.body

  vector.body:                                      ; preds = %vector.body, %vector.ph
-    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ]
+    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ]
    %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
    %store.addr = phi i32* [ %c, %vector.ph ], [ %store.next, %vector.body ]
    %tmp7 = phi i32 [ %N, %vector.ph ], [ %tmp9, %vector.body ]
@ -51,11 +51,11 @@
    br i1 %cmp9, label %exit, label %vector.ph

  vector.ph:                                        ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %tmp5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp5)
    br label %vector.body

  vector.body:                                      ; preds = %vector.body, %vector.ph
-    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ]
+    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ]
    %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
    %store.addr = phi i32* [ %c, %vector.ph ], [ %store.next, %vector.body ]
    %tmp7 = phi i32 [ %N, %vector.ph ], [ %tmp9, %vector.body ]
@ -89,11 +89,11 @@
    br i1 %cmp9, label %exit, label %vector.ph

  vector.ph:                                        ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %tmp5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp5)
    br label %vector.body

  vector.body:                                      ; preds = %vector.body, %vector.ph
-    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ]
+    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ]
    %lsr.iv = phi i8* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
    %store.addr = phi i32* [ %c, %vector.ph ], [ %store.next, %vector.body ]
    %tmp7 = phi i32 [ %N, %vector.ph ], [ %tmp9, %vector.body ]
@ -127,11 +127,11 @@
    br i1 %cmp9, label %exit, label %vector.ph

  vector.ph:                                        ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %tmp5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp5)
    br label %vector.body

  vector.body:                                      ; preds = %vector.body, %vector.ph
-    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ]
+    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ]
    %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
    %tmp7 = phi i32 [ %N, %vector.ph ], [ %tmp9, %vector.body ]
    %acc = phi i32 [ 0, %vector.ph ], [ %acc.next, %vector.body ]
@ -165,11 +165,11 @@
    br i1 %cmp9, label %exit, label %vector.ph

  vector.ph:                                        ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %tmp5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp5)
    br label %vector.body

  vector.body:                                      ; preds = %vector.body, %vector.ph
-    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ]
+    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ]
    %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
    %store.addr = phi i32* [ %c, %vector.ph ], [ %store.next, %vector.body ]
    %tmp7 = phi i32 [ %N, %vector.ph ], [ %tmp9, %vector.body ]
@ -204,11 +204,11 @@
    br i1 %cmp9, label %exit, label %vector.ph

  vector.ph:                                        ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %tmp5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp5)
    br label %vector.body

  vector.body:                                      ; preds = %vector.body, %vector.ph
-    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ]
+    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ]
    %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
    %tmp7 = phi i32 [ %N, %vector.ph ], [ %tmp9, %vector.body ]
    %acc = phi i32 [ 0, %vector.ph ], [ %acc.next, %vector.body ]
@ -243,11 +243,11 @@
    br i1 %cmp9, label %exit, label %vector.ph

  vector.ph:                                        ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %tmp5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp5)
    br label %vector.body

  vector.body:                                      ; preds = %vector.body, %vector.ph
-    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ]
+    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ]
    %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
    %store.addr = phi i32* [ %c, %vector.ph ], [ %store.next, %vector.body ]
    %tmp7 = phi i32 [ %N, %vector.ph ], [ %tmp9, %vector.body ]
@ -282,11 +282,11 @@
    br i1 %cmp9, label %exit, label %vector.ph

  vector.ph:                                        ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %tmp5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp5)
    br label %vector.body

  vector.body:                                      ; preds = %vector.body, %vector.ph
-    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ]
+    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ]
    %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
    %tmp7 = phi i32 [ %N, %vector.ph ], [ %tmp9, %vector.body ]
    %acc = phi i32 [ 0, %vector.ph ], [ %acc.next, %vector.body ]
@ -321,11 +321,11 @@
    br i1 %cmp9, label %exit, label %vector.ph

  vector.ph:                                        ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %tmp5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp5)
    br label %vector.body

  vector.body:                                      ; preds = %vector.body, %vector.ph
-    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ]
+    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ]
    %lsr.iv = phi i8* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
    %store.addr = phi i32* [ %c, %vector.ph ], [ %store.next, %vector.body ]
    %tmp7 = phi i32 [ %N, %vector.ph ], [ %tmp9, %vector.body ]
@ -361,11 +361,11 @@
    br i1 %cmp9, label %exit, label %vector.ph

  vector.ph:                                        ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %tmp5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp5)
    br label %vector.body

  vector.body:                                      ; preds = %vector.body, %vector.ph
-    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ]
+    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ]
    %lsr.iv = phi i8* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
    %tmp7 = phi i32 [ %N, %vector.ph ], [ %tmp9, %vector.body ]
    %acc = phi i32 [ 0, %vector.ph ], [ %acc.next, %vector.body ]
@ -401,11 +401,11 @@
    br i1 %cmp9, label %exit, label %vector.ph

  vector.ph:                                        ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %tmp5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp5)
    br label %vector.body

  vector.body:                                      ; preds = %vector.body, %vector.ph
-    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ]
+    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ]
    %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
    %store.addr = phi i32* [ %c, %vector.ph ], [ %store.next, %vector.body ]
    %tmp7 = phi i32 [ %N, %vector.ph ], [ %tmp9, %vector.body ]
@ -440,11 +440,11 @@
    br i1 %cmp9, label %exit, label %vector.ph

  vector.ph:                                        ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %tmp5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp5)
    br label %vector.body

  vector.body:                                      ; preds = %vector.body, %vector.ph
-    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ]
+    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ]
    %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
    %tmp7 = phi i32 [ %N, %vector.ph ], [ %tmp9, %vector.body ]
    %acc = phi i32 [ 0, %vector.ph ], [ %acc.next, %vector.body ]
@ -479,11 +479,11 @@
    br i1 %cmp9, label %exit, label %vector.ph

  vector.ph:                                        ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %tmp5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp5)
    br label %vector.body

  vector.body:                                      ; preds = %vector.body, %vector.ph
-    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ]
+    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ]
    %lsr.iv = phi i8* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
    %store.addr = phi i32* [ %c, %vector.ph ], [ %store.next, %vector.body ]
    %tmp7 = phi i32 [ %N, %vector.ph ], [ %tmp9, %vector.body ]
@ -518,11 +518,11 @@
    br i1 %cmp9, label %exit, label %vector.ph

  vector.ph:                                        ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %tmp5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp5)
    br label %vector.body

  vector.body:                                      ; preds = %vector.body, %vector.ph
-    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ]
+    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ]
    %lsr.iv = phi i8* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
    %tmp7 = phi i32 [ %N, %vector.ph ], [ %tmp9, %vector.body ]
    %acc = phi i32 [ 0, %vector.ph ], [ %acc.next, %vector.body ]
@ -557,11 +557,11 @@
    br i1 %cmp9, label %exit, label %vector.ph

  vector.ph:                                        ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %tmp5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp5)
    br label %vector.body

  vector.body:                                      ; preds = %vector.body, %vector.ph
-    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ]
+    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ]
    %lsr.iv = phi i8* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
    %store.addr = phi i32* [ %c, %vector.ph ], [ %store.next, %vector.body ]
    %tmp7 = phi i32 [ %N, %vector.ph ], [ %tmp9, %vector.body ]
@ -596,11 +596,11 @@
    br i1 %cmp9, label %exit, label %vector.ph

  vector.ph:                                        ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %tmp5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp5)
    br label %vector.body

  vector.body:                                      ; preds = %vector.body, %vector.ph
-    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ]
+    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ]
    %lsr.iv = phi i8* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
    %tmp7 = phi i32 [ %N, %vector.ph ], [ %tmp9, %vector.body ]
    %acc = phi i32 [ 0, %vector.ph ], [ %acc.next, %vector.body ]
@ -635,7 +635,7 @@
    br i1 %cmp22, label %while.body.preheader, label %while.end

  while.body.preheader:                             ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %4)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %4)
    br label %while.body

  while.body:                                       ; preds = %while.body.preheader, %while.body
@ -643,7 +643,7 @@
    %y.addr.025 = phi i16* [ %add.ptr4, %while.body ], [ %y, %while.body.preheader ]
    %n.addr.023 = phi i32 [ %sub, %while.body ], [ %n, %while.body.preheader ]
    %acc = phi i32 [ %acc.next, %while.body ], [ 0, %while.body.preheader ]
-    %5 = phi i32 [ %4, %while.body.preheader ], [ %6, %while.body ]
+    %5 = phi i32 [ %start, %while.body.preheader ], [ %6, %while.body ]
    %tmp3 = bitcast i16* %y.addr.025 to <4 x i16>*
    %tmp1 = bitcast i16* %x.addr.026 to <4 x i16>*
    %tmp = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %n.addr.023)
@ -678,7 +678,7 @@
    br i1 %cmp22, label %while.body.preheader, label %while.end

  while.body.preheader:                             ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %4)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %4)
    br label %while.body

  while.body:                                       ; preds = %while.body.preheader, %while.body
@ -686,7 +686,7 @@
    %y.addr.025 = phi i16* [ %add.ptr4, %while.body ], [ %y, %while.body.preheader ]
    %n.addr.023 = phi i32 [ %sub, %while.body ], [ %n, %while.body.preheader ]
    %acc = phi i32 [ %acc.next, %while.body ], [ 0, %while.body.preheader ]
-    %5 = phi i32 [ %4, %while.body.preheader ], [ %6, %while.body ]
+    %5 = phi i32 [ %start, %while.body.preheader ], [ %6, %while.body ]
    %tmp3 = bitcast i16* %y.addr.025 to <8 x i16>*
    %tmp1 = bitcast i16* %x.addr.026 to <8 x i16>*
    %tmp = tail call <8 x i1> @llvm.arm.mve.vctp16(i32 %n.addr.023)
@ -720,7 +720,7 @@
    br i1 %cmp22, label %while.body.preheader, label %while.end

  while.body.preheader:                             ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %4)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %4)
    br label %while.body

  while.body:                                       ; preds = %while.body.preheader, %while.body
@ -728,7 +728,7 @@
    %y.addr.025 = phi i16* [ %add.ptr4, %while.body ], [ %y, %while.body.preheader ]
    %n.addr.023 = phi i32 [ %sub, %while.body ], [ %n, %while.body.preheader ]
    %acc = phi i32 [ %acc.next, %while.body ], [ 0, %while.body.preheader ]
-    %5 = phi i32 [ %4, %while.body.preheader ], [ %6, %while.body ]
+    %5 = phi i32 [ %start, %while.body.preheader ], [ %6, %while.body ]
    %tmp3 = bitcast i16* %y.addr.025 to <8 x i16>*
    %tmp1 = bitcast i16* %x.addr.026 to <8 x i16>*
    %tmp = tail call <8 x i1> @llvm.arm.mve.vctp16(i32 %n.addr.023)
@ -763,7 +763,7 @@
    br i1 %cmp22, label %while.body.preheader, label %while.end

  while.body.preheader:                             ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %4)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %4)
    br label %while.body

  while.body:                                       ; preds = %while.body.preheader, %while.body
@ -771,7 +771,7 @@
    %y.addr.025 = phi i16* [ %add.ptr4, %while.body ], [ %y, %while.body.preheader ]
    %n.addr.023 = phi i32 [ %sub, %while.body ], [ %n, %while.body.preheader ]
    %acc = phi i32 [ %acc.next, %while.body ], [ 0, %while.body.preheader ]
-    %5 = phi i32 [ %4, %while.body.preheader ], [ %6, %while.body ]
+    %5 = phi i32 [ %start, %while.body.preheader ], [ %6, %while.body ]
    %tmp3 = bitcast i16* %y.addr.025 to <8 x i16>*
    %tmp1 = bitcast i16* %x.addr.026 to <8 x i16>*
    %tmp = tail call <8 x i1> @llvm.arm.mve.vctp16(i32 %n.addr.023)
@ -803,7 +803,7 @@
  declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>)
  declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
  declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>)
-  declare void @llvm.set.loop.iterations.i32(i32)
+  declare i32 @llvm.start.loop.iterations.i32(i32)
  declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
  declare <4 x i32> @llvm.arm.mve.vmull.v4i32.v8i16(<8 x i16>, <8 x i16>, i32, i32)
  declare <4 x i1> @llvm.arm.mve.vctp32(i32)
@ -887,7 +887,7 @@ body:             |
    renamable $r12 = t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg
    renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg
    renamable $r12 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg
-    t2DoLoopStart renamable $r12
+    $lr = t2DoLoopStart renamable $r12
    $r3 = tMOVr killed $r12, 14 /* CC::al */, $noreg

  bb.2.vector.body:
@ -986,7 +986,7 @@ body:             |
    renamable $r12 = t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg
    renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg
    renamable $r12 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg
-    t2DoLoopStart renamable $r12
+    $lr = t2DoLoopStart renamable $r12
    $r3 = tMOVr killed $r12, 14 /* CC::al */, $noreg

  bb.2.vector.body:
@ -1085,7 +1085,7 @@ body:             |
    renamable $r12 = t2SUBri killed renamable $r3, 7, 14 /* CC::al */, $noreg, $noreg
    renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg
    renamable $r12 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 27, 14 /* CC::al */, $noreg, $noreg
-    t2DoLoopStart renamable $r12
+    $lr = t2DoLoopStart renamable $r12
    $r3 = tMOVr killed $r12, 14 /* CC::al */, $noreg

  bb.2.vector.body:
@ -1185,7 +1185,7 @@ body:             |
    renamable $r2 = t2BICri killed renamable $r2, 3, 14 /* CC::al */, $noreg, $noreg
    renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14 /* CC::al */, $noreg
    renamable $r2 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r2, 19, 14 /* CC::al */, $noreg, $noreg
-    t2DoLoopStart renamable $r2
+    $lr = t2DoLoopStart renamable $r2
    $r3 = tMOVr killed $r2, 14 /* CC::al */, $noreg
    renamable $r2, dead $cpsr = tMOVi8 0, 14 /* CC::al */, $noreg

@ -1304,7 +1304,7 @@ body:             |
    renamable $r12 = t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg
    renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg
    renamable $r12 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg
-    t2DoLoopStart renamable $r12
+    $lr = t2DoLoopStart renamable $r12
    $r3 = tMOVr killed $r12, 14 /* CC::al */, $noreg

  bb.2.vector.body:
@ -1417,7 +1417,7 @@ body:             |
    renamable $r2 = t2BICri killed renamable $r2, 3, 14 /* CC::al */, $noreg, $noreg
    renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14 /* CC::al */, $noreg
    renamable $r2 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r2, 19, 14 /* CC::al */, $noreg, $noreg
-    t2DoLoopStart renamable $r2
+    $lr = t2DoLoopStart renamable $r2
    $r3 = tMOVr killed $r2, 14 /* CC::al */, $noreg
    renamable $r2, dead $cpsr = tMOVi8 0, 14 /* CC::al */, $noreg

@ -1537,7 +1537,7 @@ body:             |
    renamable $r12 = t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg
    renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg
    renamable $r12 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg
-    t2DoLoopStart renamable $r12
+    $lr = t2DoLoopStart renamable $r12
    $r3 = tMOVr killed $r12, 14 /* CC::al */, $noreg

  bb.2.vector.body:
@ -1650,7 +1650,7 @@ body:             |
    renamable $r2 = t2BICri killed renamable $r2, 3, 14 /* CC::al */, $noreg, $noreg
    renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14 /* CC::al */, $noreg
    renamable $r2 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r2, 19, 14 /* CC::al */, $noreg, $noreg
-    t2DoLoopStart renamable $r2
+    $lr = t2DoLoopStart renamable $r2
    $r3 = tMOVr killed $r2, 14 /* CC::al */, $noreg
    renamable $r2, dead $cpsr = tMOVi8 0, 14 /* CC::al */, $noreg

@ -1779,7 +1779,7 @@ body:             |
    renamable $r12 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg
    renamable $r3 = tADDrSPi $sp, 2, 14 /* CC::al */, $noreg
    renamable $q0 = MVE_VLDRWU32 killed renamable $r3, 0, 0, $noreg :: (load 16 from %fixed-stack.0, align 8)
-    t2DoLoopStart renamable $r12
+    $lr = t2DoLoopStart renamable $r12
    $r4 = tMOVr killed $r12, 14 /* CC::al */, $noreg

  bb.2.vector.body:
@ -1904,7 +1904,7 @@ body:             |
    renamable $d1 = VLDRD $sp, 2, 14 /* CC::al */, $noreg, implicit killed $q0, implicit-def $q0 :: (load 8 from %fixed-stack.0)
    renamable $r2 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r2, 19, 14 /* CC::al */, $noreg, $noreg
    renamable $r3, dead $cpsr = tMOVi8 0, 14 /* CC::al */, $noreg
-    t2DoLoopStart renamable $r2
+    $lr = t2DoLoopStart renamable $r2
    $r4 = tMOVr killed $r2, 14 /* CC::al */, $noreg

  bb.2.vector.body:
@ -2032,7 +2032,7 @@ body:             |
    renamable $r12 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg
    renamable $r3 = tADDrSPi $sp, 2, 14 /* CC::al */, $noreg
    renamable $q0 = MVE_VLDRWU32 killed renamable $r3, 0, 0, $noreg :: (load 16 from %fixed-stack.0, align 8)
-    t2DoLoopStart renamable $r12
+    $lr = t2DoLoopStart renamable $r12
    $r4 = tMOVr killed $r12, 14 /* CC::al */, $noreg

  bb.2.vector.body:
@ -2157,7 +2157,7 @@ body:             |
    renamable $d1 = VLDRD $sp, 2, 14 /* CC::al */, $noreg, implicit killed $q0, implicit-def $q0 :: (load 8 from %fixed-stack.0)
    renamable $r2 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r2, 19, 14 /* CC::al */, $noreg, $noreg
    renamable $r3, dead $cpsr = tMOVi8 0, 14 /* CC::al */, $noreg
-    t2DoLoopStart renamable $r2
+    $lr = t2DoLoopStart renamable $r2
    $r4 = tMOVr killed $r2, 14 /* CC::al */, $noreg

  bb.2.vector.body:
@ -2285,7 +2285,7 @@ body:             |
    renamable $r12 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 27, 14 /* CC::al */, $noreg, $noreg
    renamable $r3 = tADDrSPi $sp, 2, 14 /* CC::al */, $noreg
    renamable $q0 = MVE_VLDRWU32 killed renamable $r3, 0, 0, $noreg :: (load 16 from %fixed-stack.0, align 8)
-    t2DoLoopStart renamable $r12
+    $lr = t2DoLoopStart renamable $r12
    $r4 = tMOVr killed $r12, 14 /* CC::al */, $noreg

  bb.2.vector.body:
@ -2410,7 +2410,7 @@ body:             |
    renamable $d1 = VLDRD $sp, 2, 14 /* CC::al */, $noreg, implicit killed $q0, implicit-def $q0 :: (load 8 from %fixed-stack.0)
    renamable $r2 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r2, 27, 14 /* CC::al */, $noreg, $noreg
    renamable $r3, dead $cpsr = tMOVi8 0, 14 /* CC::al */, $noreg
-    t2DoLoopStart renamable $r2
+    $lr = t2DoLoopStart renamable $r2
    $r4 = tMOVr killed $r2, 14 /* CC::al */, $noreg

  bb.2.vector.body:
@ -2538,7 +2538,7 @@ body:             |
    renamable $r12 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 27, 14 /* CC::al */, $noreg, $noreg
    renamable $r3 = tADDrSPi $sp, 2, 14 /* CC::al */, $noreg
    renamable $q0 = MVE_VLDRWU32 killed renamable $r3, 0, 0, $noreg :: (load 16 from %fixed-stack.0, align 8)
-    t2DoLoopStart renamable $r12
+    $lr = t2DoLoopStart renamable $r12
    $r4 = tMOVr killed $r12, 14 /* CC::al */, $noreg

  bb.2.vector.body:
@ -2663,7 +2663,7 @@ body:             |
    renamable $d1 = VLDRD $sp, 2, 14 /* CC::al */, $noreg, implicit killed $q0, implicit-def $q0 :: (load 8 from %fixed-stack.0)
    renamable $r2 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r2, 27, 14 /* CC::al */, $noreg, $noreg
    renamable $r3, dead $cpsr = tMOVi8 0, 14 /* CC::al */, $noreg
-    t2DoLoopStart renamable $r2
+    $lr = t2DoLoopStart renamable $r2
    $r4 = tMOVr killed $r2, 14 /* CC::al */, $noreg

  bb.2.vector.body:
@ -2781,7 +2781,7 @@ body:             |
    renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg
    renamable $lr = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg
    renamable $r12 = t2MOVi 0, 14 /* CC::al */, $noreg, $noreg
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr

  bb.2.while.body:
    successors: %bb.2(0x7c000000), %bb.3(0x04000000)
@ -2897,7 +2897,7 @@ body:             |
    renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg
    renamable $lr = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 27, 14 /* CC::al */, $noreg, $noreg
    renamable $r3, dead $cpsr = tMOVi8 0, 14 /* CC::al */, $noreg
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr

  bb.2.while.body:
    successors: %bb.2(0x7c000000), %bb.3(0x04000000)
@ -3026,7 +3026,7 @@ body:             |
    renamable $r2, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg
    renamable $lr = nuw nsw t2ADDrs killed renamable $r2, killed renamable $r12, 27, 14 /* CC::al */, $noreg, $noreg
    renamable $r2, dead $cpsr = tMOVi8 0, 14 /* CC::al */, $noreg
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr

  bb.2.while.body:
    successors: %bb.2(0x7c000000), %bb.3(0x04000000)
@ -3150,7 +3150,7 @@ body:             |
    renamable $r12 = t2ADDri killed renamable $r2, 7, 14 /* CC::al */, $noreg, $noreg
    renamable $r2, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg
    renamable $r2 = nuw nsw t2ADDrs killed renamable $r2, killed renamable $r12, 27, 14 /* CC::al */, $noreg, $noreg
-    t2DoLoopStart renamable $r2
+    $lr = t2DoLoopStart renamable $r2
    $r12 = tMOVr killed $r2, 14 /* CC::al */, $noreg
    renamable $r2, dead $cpsr = tMOVi8 0, 14 /* CC::al */, $noreg

--- a/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll
@ -26,7 +26,7 @@ define dso_local void @varying_outer_2d_reduction(i16* nocapture readonly %Input
 ; ENABLED-NEXT:    ldr r0, [sp, #36]
 ; ENABLED-NEXT:    add.w r12, r2, #3
 ; ENABLED-NEXT:    ldr.w r10, [sp] @ 4-byte Reload
-; ENABLED-NEXT:    movs r6, #0
+; ENABLED-NEXT:    mov.w r8, #0
 ; ENABLED-NEXT:    mov r9, r12
 ; ENABLED-NEXT:    uxth r0, r0
 ; ENABLED-NEXT:    rsbs r5, r0, #0
@ -37,32 +37,32 @@ define dso_local void @varying_outer_2d_reduction(i16* nocapture readonly %Input
 ; ENABLED-NEXT:    @ in Loop: Header=BB0_4 Depth=1
 ; ENABLED-NEXT:    lsrs r0, r0, #16
 ; ENABLED-NEXT:    sub.w r9, r9, #1
-; ENABLED-NEXT:    strh.w r0, [r1, r6, lsl #1]
-; ENABLED-NEXT:    adds r6, #1
+; ENABLED-NEXT:    strh.w r0, [r1, r8, lsl #1]
+; ENABLED-NEXT:    add.w r8, r8, #1
 ; ENABLED-NEXT:    add.w r10, r10, #2
-; ENABLED-NEXT:    cmp r6, r3
+; ENABLED-NEXT:    cmp r8, r3
 ; ENABLED-NEXT:    beq .LBB0_8
 ; ENABLED-NEXT:  .LBB0_4: @ %for.body
 ; ENABLED-NEXT:    @ =>This Loop Header: Depth=1
 ; ENABLED-NEXT:    @ Child Loop BB0_6 Depth 2
-; ENABLED-NEXT:    cmp r2, r6
+; ENABLED-NEXT:    cmp r2, r8
 ; ENABLED-NEXT:    ble .LBB0_2
 ; ENABLED-NEXT:  @ %bb.5: @ %vector.ph
 ; ENABLED-NEXT:    @ in Loop: Header=BB0_4 Depth=1
 ; ENABLED-NEXT:    bic r0, r9, #3
 ; ENABLED-NEXT:    movs r7, #1
 ; ENABLED-NEXT:    subs r0, #4
-; ENABLED-NEXT:    subs r4, r2, r6
+; ENABLED-NEXT:    sub.w r4, r2, r8
 ; ENABLED-NEXT:    vmov.i32 q1, #0x0
-; ENABLED-NEXT:    add.w r8, r7, r0, lsr #2
-; ENABLED-NEXT:    sub.w r0, r12, r6
+; ENABLED-NEXT:    add.w r6, r7, r0, lsr #2
+; ENABLED-NEXT:    sub.w r0, r12, r8
 ; ENABLED-NEXT:    bic r0, r0, #3
 ; ENABLED-NEXT:    subs r0, #4
 ; ENABLED-NEXT:    add.w r0, r7, r0, lsr #2
 ; ENABLED-NEXT:    mov r7, r10
 ; ENABLED-NEXT:    dls lr, r0
 ; ENABLED-NEXT:    ldr r0, [sp] @ 4-byte Reload
-; ENABLED:  .LBB0_6: @ %vector.body
+; ENABLED-NEXT:  .LBB0_6: @ %vector.body
 ; ENABLED-NEXT:    @ Parent Loop BB0_4 Depth=1
 ; ENABLED-NEXT:    @ => This Inner Loop Header: Depth=2
 ; ENABLED-NEXT:    vctp.32 r4
@ -70,9 +70,9 @@ define dso_local void @varying_outer_2d_reduction(i16* nocapture readonly %Input
 ; ENABLED-NEXT:    vpstt
 ; ENABLED-NEXT:    vldrht.s32 q1, [r0], #8
 ; ENABLED-NEXT:    vldrht.s32 q2, [r7], #8
-; ENABLED-NEXT:    mov lr, r8
+; ENABLED-NEXT:    mov lr, r6
 ; ENABLED-NEXT:    vmul.i32 q1, q2, q1
-; ENABLED-NEXT:    sub.w r8, r8, #1
+; ENABLED-NEXT:    subs r6, #1
 ; ENABLED-NEXT:    vshl.s32 q1, r5
 ; ENABLED-NEXT:    subs r4, #4
 ; ENABLED-NEXT:    vadd.i32 q1, q1, q0
@ -97,7 +97,7 @@ define dso_local void @varying_outer_2d_reduction(i16* nocapture readonly %Input
 ; NOREDUCTIONS-NEXT:    ldr r0, [sp, #36]
 ; NOREDUCTIONS-NEXT:    add.w r12, r2, #3
 ; NOREDUCTIONS-NEXT:    ldr.w r10, [sp] @ 4-byte Reload
-; NOREDUCTIONS-NEXT:    movs r6, #0
+; NOREDUCTIONS-NEXT:    mov.w r8, #0
 ; NOREDUCTIONS-NEXT:    mov r9, r12
 ; NOREDUCTIONS-NEXT:    uxth r0, r0
 ; NOREDUCTIONS-NEXT:    rsbs r5, r0, #0
@ -108,31 +108,31 @@ define dso_local void @varying_outer_2d_reduction(i16* nocapture readonly %Input
 ; NOREDUCTIONS-NEXT:    @ in Loop: Header=BB0_4 Depth=1
 ; NOREDUCTIONS-NEXT:    lsrs r0, r0, #16
 ; NOREDUCTIONS-NEXT:    sub.w r9, r9, #1
-; NOREDUCTIONS-NEXT:    strh.w r0, [r1, r6, lsl #1]
-; NOREDUCTIONS-NEXT:    adds r6, #1
+; NOREDUCTIONS-NEXT:    strh.w r0, [r1, r8, lsl #1]
+; NOREDUCTIONS-NEXT:    add.w r8, r8, #1
 ; NOREDUCTIONS-NEXT:    add.w r10, r10, #2
-; NOREDUCTIONS-NEXT:    cmp r6, r3
-; NOREDUCTIONS:         beq .LBB0_8
+; NOREDUCTIONS-NEXT:    cmp r8, r3
+; NOREDUCTIONS-NEXT:    beq .LBB0_8
 ; NOREDUCTIONS-NEXT:  .LBB0_4: @ %for.body
 ; NOREDUCTIONS-NEXT:    @ =>This Loop Header: Depth=1
 ; NOREDUCTIONS-NEXT:    @ Child Loop BB0_6 Depth 2
-; NOREDUCTIONS-NEXT:    cmp r2, r6
+; NOREDUCTIONS-NEXT:    cmp r2, r8
 ; NOREDUCTIONS-NEXT:    ble .LBB0_2
 ; NOREDUCTIONS-NEXT:  @ %bb.5: @ %vector.ph
 ; NOREDUCTIONS-NEXT:    @ in Loop: Header=BB0_4 Depth=1
 ; NOREDUCTIONS-NEXT:    bic r0, r9, #3
 ; NOREDUCTIONS-NEXT:    movs r7, #1
 ; NOREDUCTIONS-NEXT:    subs r0, #4
-; NOREDUCTIONS-NEXT:    subs r4, r2, r6
+; NOREDUCTIONS-NEXT:    sub.w r4, r2, r8
 ; NOREDUCTIONS-NEXT:    vmov.i32 q1, #0x0
-; NOREDUCTIONS-NEXT:    add.w r8, r7, r0, lsr #2
-; NOREDUCTIONS-NEXT:    sub.w r0, r12, r6
+; NOREDUCTIONS-NEXT:    add.w r6, r7, r0, lsr #2
+; NOREDUCTIONS-NEXT:    sub.w r0, r12, r8
 ; NOREDUCTIONS-NEXT:    bic r0, r0, #3
 ; NOREDUCTIONS-NEXT:    subs r0, #4
 ; NOREDUCTIONS-NEXT:    add.w r0, r7, r0, lsr #2
 ; NOREDUCTIONS-NEXT:    mov r7, r10
 ; NOREDUCTIONS-NEXT:    dls lr, r0
-; NOREDUCTIONS:         ldr r0, [sp] @ 4-byte Reload
+; NOREDUCTIONS-NEXT:    ldr r0, [sp] @ 4-byte Reload
 ; NOREDUCTIONS-NEXT:  .LBB0_6: @ %vector.body
 ; NOREDUCTIONS-NEXT:    @ Parent Loop BB0_4 Depth=1
 ; NOREDUCTIONS-NEXT:    @ => This Inner Loop Header: Depth=2
@ -141,9 +141,9 @@ define dso_local void @varying_outer_2d_reduction(i16* nocapture readonly %Input
 ; NOREDUCTIONS-NEXT:    vpstt
 ; NOREDUCTIONS-NEXT:    vldrht.s32 q1, [r0], #8
 ; NOREDUCTIONS-NEXT:    vldrht.s32 q2, [r7], #8
-; NOREDUCTIONS-NEXT:    mov lr, r8
+; NOREDUCTIONS-NEXT:    mov lr, r6
 ; NOREDUCTIONS-NEXT:    vmul.i32 q1, q2, q1
-; NOREDUCTIONS-NEXT:    sub.w r8, r8, #1
+; NOREDUCTIONS-NEXT:    subs r6, #1
 ; NOREDUCTIONS-NEXT:    vshl.s32 q1, r5
 ; NOREDUCTIONS-NEXT:    subs r4, #4
 ; NOREDUCTIONS-NEXT:    vadd.i32 q1, q1, q0
@ -184,7 +184,7 @@ for.body:                                         ; preds = %for.end, %for.body.

 vector.ph:                                        ; preds = %for.body
  %trip.count.minus.1 = add i32 %i8, -1
-  call void @llvm.set.loop.iterations.i32(i32 %i7)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 %i7)
  br label %vector.body

 vector.body:                                      ; preds = %vector.body, %vector.ph
@ -192,7 +192,7 @@ vector.body:                                      ; preds = %vector.body, %vecto
  %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %Input, %vector.ph ]
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %i16, %vector.body ]
-  %i9 = phi i32 [ %i7, %vector.ph ], [ %i17, %vector.body ]
+  %i9 = phi i32 [ %start, %vector.ph ], [ %i17, %vector.body ]
  %lsr.iv4850 = bitcast i16* %lsr.iv48 to <4 x i16>*
  %lsr.iv45 = bitcast i16* %lsr.iv to <4 x i16>*
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %i8)
@ -237,4 +237,4 @@ declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
 declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>)
 declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
 declare i32 @llvm.loop.decrement.reg.i32(i32, i32)
-declare void @llvm.set.loop.iterations.i32(i32)
+declare i32 @llvm.start.loop.iterations.i32(i32)
--- a/Show More
+++ b/Show More