diff --git a/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll b/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll
index a664f819d46..3de54247ad8 100644
--- a/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll
@@ -47,22 +47,13 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %N, 3
   %n.vec = and i32 %n.rnd.up, -4
-  %trip.count.minus.1 = add i32 %N, -1
-  %broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
-  %broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
   %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %add, %vector.body ]
-  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %tmp = getelementptr inbounds i32, i32* %a, i32 %index
-
-;  %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat12
   %tmp1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
-
   %tmp2 = bitcast i32* %tmp to <4 x i32>*
   %wide.masked.load.a = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
   %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
@@ -145,22 +136,13 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %N, 3
   %n.vec = and i32 %n.rnd.up, -4
-  %trip.count.minus.1 = add i32 %N, -1
-  %broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
-  %broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
   %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %add, %vector.body ]
-  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %tmp = getelementptr inbounds i32, i32* %a, i32 %index
-
-;  %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat12
   %tmp1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
-
   %tmp2 = bitcast i32* %tmp to <4 x i32>*
   %wide.masked.load.a = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
   %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
@@ -203,13 +185,13 @@ define dso_local i32 @and_mul_reduce_add(i32* noalias nocapture readonly %a, i32
 ; CHECK-NEXT:    cmp.w r12, #0
 ; CHECK-NEXT:    beq .LBB2_4
 ; CHECK-NEXT:  @ %bb.1: @ %vector.ph
-; CHECK-NEXT:    add.w r4, r12, #3
-; CHECK-NEXT:    vmov.i32 q1, #0x0
-; CHECK-NEXT:    bic r4, r4, #3
-; CHECK-NEXT:    sub.w lr, r4, #4
+; CHECK-NEXT:    add.w lr, r12, #3
 ; CHECK-NEXT:    movs r4, #1
-; CHECK-NEXT:    add.w lr, r4, lr, lsr #2
-; CHECK-NEXT:    movs r4, #0
+; CHECK-NEXT:    bic lr, lr, #3
+; CHECK-NEXT:    vmov.i32 q1, #0x0
+; CHECK-NEXT:    sub.w lr, lr, #4
+; CHECK-NEXT:    add.w r4, r4, lr, lsr #2
+; CHECK-NEXT:    dls lr, r4
 ; CHECK-NEXT:  .LBB2_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vctp.32 r12
@@ -219,12 +201,11 @@ define dso_local i32 @and_mul_reduce_add(i32* noalias nocapture readonly %a, i32
 ; CHECK-NEXT:    vldrwt.u32 q2, [r0], #16
 ; CHECK-NEXT:    vstr p0, [sp] @ 4-byte Spill
 ; CHECK-NEXT:    vsub.i32 q1, q2, q1
-; CHECK-NEXT:    adds r4, #4
+; CHECK-NEXT:    sub.w r12, r12, #4
 ; CHECK-NEXT:    vpsttt
 ; CHECK-NEXT:    vcmpt.i32 eq, q1, zr
 ; CHECK-NEXT:    vldrwt.u32 q1, [r3], #16
 ; CHECK-NEXT:    vldrwt.u32 q2, [r2], #16
-; CHECK-NEXT:    sub.w r12, r12, #4
 ; CHECK-NEXT:    vmul.i32 q1, q2, q1
 ; CHECK-NEXT:    vadd.i32 q1, q1, q0
 ; CHECK-NEXT:    le lr, .LBB2_2
@@ -246,22 +227,13 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %N, 3
   %n.vec = and i32 %n.rnd.up, -4
-  %trip.count.minus.1 = add i32 %N, -1
-  %broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
-  %broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
   %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %add, %vector.body ]
-  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %tmp = getelementptr inbounds i32, i32* %a, i32 %index
-
-;  %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat12
   %tmp1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
-
   %tmp2 = bitcast i32* %tmp to <4 x i32>*
   %wide.masked.load.a = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
   %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
@@ -301,13 +273,13 @@ define dso_local i32 @or_mul_reduce_add(i32* noalias nocapture readonly %a, i32*
 ; CHECK-NEXT:    cmp.w r12, #0
 ; CHECK-NEXT:    beq .LBB3_4
 ; CHECK-NEXT:  @ %bb.1: @ %vector.ph
-; CHECK-NEXT:    add.w r4, r12, #3
-; CHECK-NEXT:    vmov.i32 q1, #0x0
-; CHECK-NEXT:    bic r4, r4, #3
-; CHECK-NEXT:    sub.w lr, r4, #4
+; CHECK-NEXT:    add.w lr, r12, #3
 ; CHECK-NEXT:    movs r4, #1
-; CHECK-NEXT:    add.w lr, r4, lr, lsr #2
-; CHECK-NEXT:    movs r4, #0
+; CHECK-NEXT:    bic lr, lr, #3
+; CHECK-NEXT:    vmov.i32 q1, #0x0
+; CHECK-NEXT:    sub.w lr, lr, #4
+; CHECK-NEXT:    add.w r4, r4, lr, lsr #2
+; CHECK-NEXT:    dls lr, r4
 ; CHECK-NEXT:  .LBB3_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vctp.32 r12
@@ -322,9 +294,8 @@ define dso_local i32 @or_mul_reduce_add(i32* noalias nocapture readonly %a, i32*
 ; CHECK-NEXT:    vcmpt.i32 ne, q1, zr
 ; CHECK-NEXT:    vldrwe.u32 q1, [r3], #16
 ; CHECK-NEXT:    vldrwe.u32 q2, [r2], #16
-; CHECK-NEXT:    adds r4, #4
-; CHECK-NEXT:    vmul.i32 q1, q2, q1
 ; CHECK-NEXT:    sub.w r12, r12, #4
+; CHECK-NEXT:    vmul.i32 q1, q2, q1
 ; CHECK-NEXT:    vadd.i32 q1, q1, q0
 ; CHECK-NEXT:    le lr, .LBB3_2
 ; CHECK-NEXT:  @ %bb.3: @ %middle.block
@@ -344,22 +315,13 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %N, 3
   %n.vec = and i32 %n.rnd.up, -4
-  %trip.count.minus.1 = add i32 %N, -1
-  %broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
-  %broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
   %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %add, %vector.body ]
-  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %tmp = getelementptr inbounds i32, i32* %a, i32 %index
-
-;  %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat12
   %tmp1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
-
   %tmp2 = bitcast i32* %tmp to <4 x i32>*
   %wide.masked.load.a = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
   %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
@@ -398,11 +360,9 @@ define dso_local void @continue_on_zero(i32* noalias nocapture %arg, i32* noalia
 ; CHECK-NEXT:    it eq
 ; CHECK-NEXT:    popeq {r7, pc}
 ; CHECK-NEXT:  .LBB4_1: @ %bb3
-; CHECK-NEXT:    movs r3, #0
 ; CHECK-NEXT:    dlstp.32 lr, r2
 ; CHECK-NEXT:  .LBB4_2: @ %bb9
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    adds r3, #4
 ; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
 ; CHECK-NEXT:    vpt.i32 ne, q0, zr
 ; CHECK-NEXT:    vldrwt.u32 q1, [r0]
@@ -419,21 +379,12 @@ bb:
 bb3:                                              ; preds = %bb
   %tmp4 = add i32 %arg2, 3
   %tmp5 = and i32 %tmp4, -4
-  %tmp6 = add i32 %arg2, -1
-  %tmp7 = insertelement <4 x i32> undef, i32 %tmp6, i32 0
-  %tmp8 = shufflevector <4 x i32> %tmp7, <4 x i32> undef, <4 x i32> zeroinitializer
   br label %bb9
 
 bb9:                                              ; preds = %bb9, %bb3
   %tmp10 = phi i32 [ 0, %bb3 ], [ %tmp25, %bb9 ]
-  %tmp11 = insertelement <4 x i32> undef, i32 %tmp10, i32 0
-  %tmp12 = shufflevector <4 x i32> %tmp11, <4 x i32> undef, <4 x i32> zeroinitializer
-  %tmp13 = add <4 x i32> %tmp12, <i32 0, i32 1, i32 2, i32 3>
   %tmp14 = getelementptr inbounds i32, i32* %arg1, i32 %tmp10
-
-  ; %tmp15 = icmp ule <4 x i32> %tmp13, %tmp8
   %tmp15 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %tmp10, i32 %arg2)
-
   %tmp16 = bitcast i32* %tmp14 to <4 x i32>*
   %tmp17 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp16, i32 4, <4 x i1> %tmp15, <4 x i32> undef)
   %tmp18 = icmp ne <4 x i32> %tmp17, zeroinitializer
@@ -460,7 +411,6 @@ define dso_local arm_aapcs_vfpcc void @range_test(i32* noalias nocapture %arg, i
 ; CHECK-NEXT:    it eq
 ; CHECK-NEXT:    popeq {r7, pc}
 ; CHECK-NEXT:  .LBB5_1: @ %bb4
-; CHECK-NEXT:    mov.w r12, #0
 ; CHECK-NEXT:    dlstp.32 lr, r3
 ; CHECK-NEXT:  .LBB5_2: @ %bb12
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
@@ -468,7 +418,6 @@ define dso_local arm_aapcs_vfpcc void @range_test(i32* noalias nocapture %arg, i
 ; CHECK-NEXT:    vptt.i32 ne, q0, zr
 ; CHECK-NEXT:    vcmpt.s32 le, q0, r2
 ; CHECK-NEXT:    vldrwt.u32 q1, [r1], #16
-; CHECK-NEXT:    add.w r12, r12, #4
 ; CHECK-NEXT:    vmul.i32 q0, q1, q0
 ; CHECK-NEXT:    vpst
 ; CHECK-NEXT:    vstrwt.32 q0, [r0], #16
@@ -482,23 +431,14 @@ bb:
 bb4:                                              ; preds = %bb
   %tmp5 = add i32 %arg3, 3
   %tmp6 = and i32 %tmp5, -4
-  %tmp7 = add i32 %arg3, -1
-  %tmp8 = insertelement <4 x i32> undef, i32 %tmp7, i32 0
-  %tmp9 = shufflevector <4 x i32> %tmp8, <4 x i32> undef, <4 x i32> zeroinitializer
   %tmp10 = insertelement <4 x i32> undef, i32 %arg2, i32 0
   %tmp11 = shufflevector <4 x i32> %tmp10, <4 x i32> undef, <4 x i32> zeroinitializer
   br label %bb12
 
 bb12:                                             ; preds = %bb12, %bb4
   %tmp13 = phi i32 [ 0, %bb4 ], [ %tmp30, %bb12 ]
-  %tmp14 = insertelement <4 x i32> undef, i32 %tmp13, i32 0
-  %tmp15 = shufflevector <4 x i32> %tmp14, <4 x i32> undef, <4 x i32> zeroinitializer
-  %tmp16 = add <4 x i32> %tmp15, <i32 0, i32 1, i32 2, i32 3>
   %tmp17 = getelementptr inbounds i32, i32* %arg, i32 %tmp13
-
-  ; %tmp18 = icmp ule <4 x i32> %tmp16, %tmp9
   %tmp18= call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %tmp13, i32 %arg3)
-
   %tmp19 = bitcast i32* %tmp17 to <4 x i32>*
   %tmp20 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp19, i32 4, <4 x i1> %tmp18, <4 x i32> undef)
   %tmp21 = icmp ne <4 x i32> %tmp20, zeroinitializer
diff --git a/test/CodeGen/Thumb2/LowOverheadLoops/extending-loads.ll b/test/CodeGen/Thumb2/LowOverheadLoops/extending-loads.ll
index 2627965913e..01564487b57 100644
--- a/test/CodeGen/Thumb2/LowOverheadLoops/extending-loads.ll
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/extending-loads.ll
@@ -9,11 +9,9 @@ define dso_local arm_aapcs_vfpcc void @sext_i8(i16* noalias nocapture %a, i8* no
 ; CHECK-NEXT:    it eq
 ; CHECK-NEXT:    popeq {r7, pc}
 ; CHECK-NEXT:  .LBB0_1: @ %vector.ph
-; CHECK-NEXT:    movs r3, #0
 ; CHECK-NEXT:    dlstp.16 lr, r2
 ; CHECK-NEXT:  .LBB0_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    adds r3, #8
 ; CHECK-NEXT:    vldrb.s16 q0, [r1], #8
 ; CHECK-NEXT:    vldrh.u16 q1, [r0]
 ; CHECK-NEXT:    vadd.i16 q0, q1, q0
@@ -28,21 +26,12 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %N, 7
   %n.vec = and i32 %n.rnd.up, -8
-  %trip.count.minus.1 = add i32 %N, -1
-  %broadcast.splatinsert10 = insertelement <8 x i32> undef, i32 %trip.count.minus.1, i32 0
-  %broadcast.splat11 = shufflevector <8 x i32> %broadcast.splatinsert10, <8 x i32> undef, <8 x i32> zeroinitializer
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
-  %induction = or <8 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %0 = getelementptr inbounds i8, i8* %b, i32 %index
-
-  ; %1 = icmp ule <8 x i32> %induction, %broadcast.splat11
   %1 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N)
-
   %2 = bitcast i8* %0 to <8 x i8>*
   %wide.masked.load = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %2, i32 1, <8 x i1> %1, <8 x i8> undef)
   %3 = sext <8 x i8> %wide.masked.load to <8 x i16>
@@ -69,11 +58,9 @@ define dso_local arm_aapcs_vfpcc void @zext_i8(i16* noalias nocapture %a, i8* no
 ; CHECK-NEXT:    it eq
 ; CHECK-NEXT:    popeq {r7, pc}
 ; CHECK-NEXT:  .LBB1_1: @ %vector.ph
-; CHECK-NEXT:    movs r3, #0
 ; CHECK-NEXT:    dlstp.16 lr, r2
 ; CHECK-NEXT:  .LBB1_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    adds r3, #8
 ; CHECK-NEXT:    vldrb.u16 q0, [r1], #8
 ; CHECK-NEXT:    vldrh.u16 q1, [r0]
 ; CHECK-NEXT:    vadd.i16 q0, q1, q0
@@ -88,21 +75,12 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %N, 7
   %n.vec = and i32 %n.rnd.up, -8
-  %trip.count.minus.1 = add i32 %N, -1
-  %broadcast.splatinsert10 = insertelement <8 x i32> undef, i32 %trip.count.minus.1, i32 0
-  %broadcast.splat11 = shufflevector <8 x i32> %broadcast.splatinsert10, <8 x i32> undef, <8 x i32> zeroinitializer
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
-  %induction = or <8 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %0 = getelementptr inbounds i8, i8* %b, i32 %index
-
-  ; %1 = icmp ule <8 x i32> %induction, %broadcast.splat11
   %1 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N)
-
   %2 = bitcast i8* %0 to <8 x i8>*
   %wide.masked.load = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %2, i32 1, <8 x i1> %1, <8 x i8> undef)
   %3 = zext <8 x i8> %wide.masked.load to <8 x i16>
@@ -129,11 +107,9 @@ define dso_local arm_aapcs_vfpcc void @sext_i16(i32* noalias nocapture %a, i16*
 ; CHECK-NEXT:    it eq
 ; CHECK-NEXT:    popeq {r7, pc}
 ; CHECK-NEXT:  .LBB2_1: @ %vector.ph
-; CHECK-NEXT:    movs r3, #0
 ; CHECK-NEXT:    dlstp.32 lr, r2
 ; CHECK-NEXT:  .LBB2_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    adds r3, #4
 ; CHECK-NEXT:    vldrh.s32 q0, [r1], #8
 ; CHECK-NEXT:    vldrw.u32 q1, [r0]
 ; CHECK-NEXT:    vadd.i32 q0, q1, q0
@@ -148,21 +124,12 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %N, 3
   %n.vec = and i32 %n.rnd.up, -4
-  %trip.count.minus.1 = add i32 %N, -1
-  %broadcast.splatinsert8 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
-  %broadcast.splat9 = shufflevector <4 x i32> %broadcast.splatinsert8, <4 x i32> undef, <4 x i32> zeroinitializer
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-  %induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %0 = getelementptr inbounds i16, i16* %b, i32 %index
-
-  ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat9
   %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
-
   %2 = bitcast i16* %0 to <4 x i16>*
   %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %2, i32 2, <4 x i1> %1, <4 x i16> undef)
   %3 = sext <4 x i16> %wide.masked.load to <4 x i32>
@@ -189,11 +156,9 @@ define dso_local arm_aapcs_vfpcc void @zext_i16(i32* noalias nocapture %a, i16*
 ; CHECK-NEXT:    it eq
 ; CHECK-NEXT:    popeq {r7, pc}
 ; CHECK-NEXT:  .LBB3_1: @ %vector.ph
-; CHECK-NEXT:    movs r3, #0
 ; CHECK-NEXT:    dlstp.32 lr, r2
 ; CHECK-NEXT:  .LBB3_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    adds r3, #4
 ; CHECK-NEXT:    vldrh.u32 q0, [r1], #8
 ; CHECK-NEXT:    vldrw.u32 q1, [r0]
 ; CHECK-NEXT:    vadd.i32 q0, q1, q0
@@ -208,21 +173,12 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %N, 3
   %n.vec = and i32 %n.rnd.up, -4
-  %trip.count.minus.1 = add i32 %N, -1
-  %broadcast.splatinsert8 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
-  %broadcast.splat9 = shufflevector <4 x i32> %broadcast.splatinsert8, <4 x i32> undef, <4 x i32> zeroinitializer
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-  %induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %0 = getelementptr inbounds i16, i16* %b, i32 %index
-
-  ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat9
   %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
-
   %2 = bitcast i16* %0 to <4 x i16>*
   %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %2, i32 2, <4 x i1> %1, <4 x i16> undef)
   %3 = zext <4 x i16> %wide.masked.load to <4 x i32>
diff --git a/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll b/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll
index faf4a2e5998..f101dd4fcec 100644
--- a/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll
@@ -6,7 +6,7 @@ define arm_aapcs_vfpcc void @fast_float_mul(float* nocapture %a, float* nocaptur
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
 ; CHECK-NEXT:    cmp r3, #0
-; CHECK-NEXT:    beq.w .LBB0_11
+; CHECK-NEXT:    beq .LBB0_11
 ; CHECK-NEXT:  @ %bb.1: @ %vector.memcheck
 ; CHECK-NEXT:    add.w r5, r0, r3, lsl #2
 ; CHECK-NEXT:    add.w r4, r2, r3, lsl #2
@@ -32,11 +32,9 @@ define arm_aapcs_vfpcc void @fast_float_mul(float* nocapture %a, float* nocaptur
 ; CHECK-NEXT:    movs r3, #0
 ; CHECK-NEXT:    b .LBB0_8
 ; CHECK-NEXT:  .LBB0_4: @ %vector.ph
-; CHECK-NEXT:    mov.w r12, #0
 ; CHECK-NEXT:    dlstp.32 lr, r3
 ; CHECK-NEXT:  .LBB0_5: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    add.w r12, r12, #4
 ; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
 ; CHECK-NEXT:    vldrw.u32 q1, [r2], #16
 ; CHECK-NEXT:    vmul.f32 q0, q1, q0
@@ -117,21 +115,12 @@ for.body.preheader.new:                           ; preds = %for.body.preheader
 vector.ph:                                        ; preds = %vector.memcheck
   %n.rnd.up = add i32 %N, 3
   %n.vec = and i32 %n.rnd.up, -4
-  %trip.count.minus.1 = add i32 %N, -1
-  %broadcast.splatinsert21 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
-  %broadcast.splat22 = shufflevector <4 x i32> %broadcast.splatinsert21, <4 x i32> undef, <4 x i32> zeroinitializer
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %2 = getelementptr inbounds float, float* %b, i32 %index
-
-  ; %3 = icmp ule <4 x i32> %induction, %broadcast.splat22
   %3 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
-
   %4 = bitcast float* %2 to <4 x float>*
   %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %3, <4 x float> undef)
   %5 = getelementptr inbounds float, float* %c, i32 %index
@@ -219,13 +208,11 @@ define arm_aapcs_vfpcc float @fast_float_mac(float* nocapture readonly %b, float
 ; CHECK-NEXT:    bic r3, r3, #3
 ; CHECK-NEXT:    vmov.i32 q0, #0x0
 ; CHECK-NEXT:    subs r3, #4
-; CHECK-NEXT:    add.w r12, r12, r3, lsr #2
-; CHECK-NEXT:    movs r3, #0
-; CHECK-NEXT:    dls lr, r12
+; CHECK-NEXT:    add.w r3, r12, r3, lsr #2
+; CHECK-NEXT:    dls lr, r3
 ; CHECK-NEXT:  .LBB1_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vctp.32 r2
-; CHECK-NEXT:    adds r3, #4
 ; CHECK-NEXT:    subs r2, #4
 ; CHECK-NEXT:    vmov q1, q0
 ; CHECK-NEXT:    vpstt
@@ -257,22 +244,13 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %N, 3
   %n.vec = and i32 %n.rnd.up, -4
-  %trip.count.minus.1 = add i32 %N, -1
-  %broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
-  %broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
   %vec.phi = phi <4 x float> [ zeroinitializer, %vector.ph ], [ %6, %vector.body ]
-  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %0 = getelementptr inbounds float, float* %b, i32 %index
-
-;  %1 = icmp ule <4 x i32> %induction, %broadcast.splat12
   %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
-
   %2 = bitcast float* %0 to <4 x float>*
   %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
   %3 = getelementptr inbounds float, float* %c, i32 %index
diff --git a/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll b/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll
index db86cacded2..c69321daca3 100644
--- a/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll
@@ -15,13 +15,11 @@ define arm_aapcs_vfpcc i32 @test_acc_scalar_char(i8 zeroext %a, i8* nocapture re
 ; CHECK-NEXT:    bic r3, r3, #3
 ; CHECK-NEXT:    sub.w r12, r3, #4
 ; CHECK-NEXT:    movs r3, #1
-; CHECK-NEXT:    add.w r12, r3, r12, lsr #2
-; CHECK-NEXT:    movs r3, #0
-; CHECK-NEXT:    dls lr, r12
+; CHECK-NEXT:    add.w r3, r3, r12, lsr #2
+; CHECK-NEXT:    dls lr, r3
 ; CHECK-NEXT:  .LBB0_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vctp.32 r2
-; CHECK-NEXT:    adds r3, #4
 ; CHECK-NEXT:    subs r2, #4
 ; CHECK-NEXT:    vmov q1, q0
 ; CHECK-NEXT:    vpst
@@ -40,9 +38,6 @@ vector.ph:                                        ; preds = %entry
   %conv = zext i8 %a to i32
   %n.rnd.up = add i32 %N, 3
   %n.vec = and i32 %n.rnd.up, -4
-  %trip.count.minus.1 = add i32 %N, -1
-  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
-  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
   %broadcast.splatinsert12 = insertelement <4 x i32> undef, i32 %conv, i32 0
   %broadcast.splat13 = shufflevector <4 x i32> %broadcast.splatinsert12, <4 x i32> undef, <4 x i32> zeroinitializer
   br label %vector.body
@@ -50,14 +45,8 @@ vector.ph:                                        ; preds = %entry
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
   %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %5, %vector.body ]
-  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %0 = getelementptr inbounds i8, i8* %b, i32 %index
-
-; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
   %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
-
   %2 = bitcast i8* %0 to <4 x i8>*
   %wide.masked.load = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %2, i32 1, <4 x i1> %1, <4 x i8> undef)
   %3 = zext <4 x i8> %wide.masked.load to <4 x i32>
@@ -91,13 +80,11 @@ define arm_aapcs_vfpcc i32 @test_acc_scalar_short(i16 signext %a, i16* nocapture
 ; CHECK-NEXT:    bic r3, r3, #3
 ; CHECK-NEXT:    sub.w r12, r3, #4
 ; CHECK-NEXT:    movs r3, #1
-; CHECK-NEXT:    add.w r12, r3, r12, lsr #2
-; CHECK-NEXT:    movs r3, #0
-; CHECK-NEXT:    dls lr, r12
+; CHECK-NEXT:    add.w r3, r3, r12, lsr #2
+; CHECK-NEXT:    dls lr, r3
 ; CHECK-NEXT:  .LBB1_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vctp.32 r2
-; CHECK-NEXT:    adds r3, #4
 ; CHECK-NEXT:    subs r2, #4
 ; CHECK-NEXT:    vmov q1, q0
 ; CHECK-NEXT:    vpst
@@ -116,9 +103,6 @@ vector.ph:                                        ; preds = %entry
   %conv = sext i16 %a to i32
   %n.rnd.up = add i32 %N, 3
   %n.vec = and i32 %n.rnd.up, -4
-  %trip.count.minus.1 = add i32 %N, -1
-  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
-  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
   %broadcast.splatinsert12 = insertelement <4 x i32> undef, i32 %conv, i32 0
   %broadcast.splat13 = shufflevector <4 x i32> %broadcast.splatinsert12, <4 x i32> undef, <4 x i32> zeroinitializer
   br label %vector.body
@@ -126,14 +110,8 @@ vector.ph:                                        ; preds = %entry
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
   %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %5, %vector.body ]
-  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %0 = getelementptr inbounds i16, i16* %b, i32 %index
-
-;  %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
   %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
-
   %2 = bitcast i16* %0 to <4 x i16>*
   %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %2, i32 2, <4 x i1> %1, <4 x i16> undef)
   %3 = sext <4 x i16> %wide.masked.load to <4 x i32>
@@ -167,13 +145,11 @@ define arm_aapcs_vfpcc i32 @test_acc_scalar_uchar(i8 zeroext %a, i8* nocapture r
 ; CHECK-NEXT:    bic r3, r3, #3
 ; CHECK-NEXT:    sub.w r12, r3, #4
 ; CHECK-NEXT:    movs r3, #1
-; CHECK-NEXT:    add.w r12, r3, r12, lsr #2
-; CHECK-NEXT:    movs r3, #0
-; CHECK-NEXT:    dls lr, r12
+; CHECK-NEXT:    add.w r3, r3, r12, lsr #2
+; CHECK-NEXT:    dls lr, r3
 ; CHECK-NEXT:  .LBB2_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vctp.32 r2
-; CHECK-NEXT:    adds r3, #4
 ; CHECK-NEXT:    subs r2, #4
 ; CHECK-NEXT:    vmov q1, q0
 ; CHECK-NEXT:    vpst
@@ -192,9 +168,6 @@ vector.ph:                                        ; preds = %entry
   %conv = zext i8 %a to i32
   %n.rnd.up = add i32 %N, 3
   %n.vec = and i32 %n.rnd.up, -4
-  %trip.count.minus.1 = add i32 %N, -1
-  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
-  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
   %broadcast.splatinsert12 = insertelement <4 x i32> undef, i32 %conv, i32 0
   %broadcast.splat13 = shufflevector <4 x i32> %broadcast.splatinsert12, <4 x i32> undef, <4 x i32> zeroinitializer
   br label %vector.body
@@ -202,14 +175,8 @@ vector.ph:                                        ; preds = %entry
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
   %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %5, %vector.body ]
-  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %0 = getelementptr inbounds i8, i8* %b, i32 %index
-
-;  %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
   %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
-
   %2 = bitcast i8* %0 to <4 x i8>*
   %wide.masked.load = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %2, i32 1, <4 x i1> %1, <4 x i8> undef)
   %3 = zext <4 x i8> %wide.masked.load to <4 x i32>
@@ -243,13 +210,11 @@ define arm_aapcs_vfpcc i32 @test_acc_scalar_ushort(i16 signext %a, i16* nocaptur
 ; CHECK-NEXT:    bic r3, r3, #3
 ; CHECK-NEXT:    sub.w r12, r3, #4
 ; CHECK-NEXT:    movs r3, #1
-; CHECK-NEXT:    add.w r12, r3, r12, lsr #2
-; CHECK-NEXT:    movs r3, #0
-; CHECK-NEXT:    dls lr, r12
+; CHECK-NEXT:    add.w r3, r3, r12, lsr #2
+; CHECK-NEXT:    dls lr, r3
 ; CHECK-NEXT:  .LBB3_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vctp.32 r2
-; CHECK-NEXT:    adds r3, #4
 ; CHECK-NEXT:    subs r2, #4
 ; CHECK-NEXT:    vmov q1, q0
 ; CHECK-NEXT:    vpst
@@ -268,9 +233,6 @@ vector.ph:                                        ; preds = %entry
   %conv = sext i16 %a to i32
   %n.rnd.up = add i32 %N, 3
   %n.vec = and i32 %n.rnd.up, -4
-  %trip.count.minus.1 = add i32 %N, -1
-  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
-  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
   %broadcast.splatinsert12 = insertelement <4 x i32> undef, i32 %conv, i32 0
   %broadcast.splat13 = shufflevector <4 x i32> %broadcast.splatinsert12, <4 x i32> undef, <4 x i32> zeroinitializer
   br label %vector.body
@@ -278,14 +240,8 @@ vector.ph:                                        ; preds = %entry
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
   %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %5, %vector.body ]
-  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %0 = getelementptr inbounds i16, i16* %b, i32 %index
-
-;  %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
   %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
-
   %2 = bitcast i16* %0 to <4 x i16>*
   %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %2, i32 2, <4 x i1> %1, <4 x i16> undef)
   %3 = zext <4 x i16> %wide.masked.load to <4 x i32>
@@ -319,13 +275,11 @@ define arm_aapcs_vfpcc i32 @test_acc_scalar_int(i32 %a, i32* nocapture readonly
 ; CHECK-NEXT:    bic r3, r3, #3
 ; CHECK-NEXT:    sub.w r12, r3, #4
 ; CHECK-NEXT:    movs r3, #1
-; CHECK-NEXT:    add.w r12, r3, r12, lsr #2
-; CHECK-NEXT:    movs r3, #0
-; CHECK-NEXT:    dls lr, r12
+; CHECK-NEXT:    add.w r3, r3, r12, lsr #2
+; CHECK-NEXT:    dls lr, r3
 ; CHECK-NEXT:  .LBB4_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vctp.32 r2
-; CHECK-NEXT:    adds r3, #4
 ; CHECK-NEXT:    subs r2, #4
 ; CHECK-NEXT:    vmov q1, q0
 ; CHECK-NEXT:    vpst
@@ -343,9 +297,6 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %N, 3
   %n.vec = and i32 %n.rnd.up, -4
-  %trip.count.minus.1 = add i32 %N, -1
-  %broadcast.splatinsert9 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
-  %broadcast.splat10 = shufflevector <4 x i32> %broadcast.splatinsert9, <4 x i32> undef, <4 x i32> zeroinitializer
   %broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %a, i32 0
   %broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer
   br label %vector.body
@@ -353,14 +304,8 @@ vector.ph:                                        ; preds = %entry
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
   %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %4, %vector.body ]
-  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %0 = getelementptr inbounds i32, i32* %b, i32 %index
-
-;  %1 = icmp ule <4 x i32> %induction, %broadcast.splat10
   %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
-
   %2 = bitcast i32* %0 to <4 x i32>*
   %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef)
   %3 = mul nsw <4 x i32> %wide.masked.load, %broadcast.splat12
@@ -411,11 +356,9 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_char(i8* nocapture readonly
 ; CHECK-NEXT:    mov.w r12, #0
 ; CHECK-NEXT:    b .LBB5_8
 ; CHECK-NEXT:  .LBB5_4: @ %vector.ph
-; CHECK-NEXT:    movs r7, #0
 ; CHECK-NEXT:    dlstp.32 lr, r12
 ; CHECK-NEXT:  .LBB5_5: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    adds r7, #4
 ; CHECK-NEXT:    vldrb.u32 q0, [r0], #4
 ; CHECK-NEXT:    vldrb.u32 q1, [r1], #4
 ; CHECK-NEXT:    vmlas.u32 q1, q0, r2
@@ -498,23 +441,14 @@ for.body.preheader.new:                           ; preds = %for.body.preheader
 vector.ph:                                        ; preds = %for.body.lr.ph
   %n.rnd.up = add i32 %N, 3
   %n.vec = and i32 %n.rnd.up, -4
-  %trip.count.minus.1 = add i32 %N, -1
-  %broadcast.splatinsert19 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
-  %broadcast.splat20 = shufflevector <4 x i32> %broadcast.splatinsert19, <4 x i32> undef, <4 x i32> zeroinitializer
   %broadcast.splatinsert22 = insertelement <4 x i32> undef, i32 %conv3, i32 0
   %broadcast.splat23 = shufflevector <4 x i32> %broadcast.splatinsert22, <4 x i32> undef, <4 x i32> zeroinitializer
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %2 = getelementptr inbounds i8, i8* %a, i32 %index
-
-  ; %3 = icmp ule <4 x i32> %induction, %broadcast.splat20
   %3 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
-
   %4 = bitcast i8* %2 to <4 x i8>*
   %wide.masked.load = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %4, i32 1, <4 x i1> %3, <4 x i8> undef)
   %5 = zext <4 x i8> %wide.masked.load to <4 x i32>
@@ -618,11 +552,9 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_short(i16* nocapture readon
 ; CHECK-NEXT:    it eq
 ; CHECK-NEXT:    popeq {r4, pc}
 ; CHECK-NEXT:  .LBB6_1: @ %vector.ph
-; CHECK-NEXT:    movs r4, #0
 ; CHECK-NEXT:    dlstp.32 lr, r12
 ; CHECK-NEXT:  .LBB6_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    adds r4, #4
 ; CHECK-NEXT:    vldrh.s32 q0, [r0], #8
 ; CHECK-NEXT:    vldrh.s32 q1, [r1], #8
 ; CHECK-NEXT:    vmlas.u32 q1, q0, r2
@@ -638,23 +570,14 @@ vector.ph:                                        ; preds = %entry
   %conv3 = sext i16 %c to i32
   %n.rnd.up = add i32 %N, 3
   %n.vec = and i32 %n.rnd.up, -4
-  %trip.count.minus.1 = add i32 %N, -1
-  %broadcast.splatinsert12 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
-  %broadcast.splat13 = shufflevector <4 x i32> %broadcast.splatinsert12, <4 x i32> undef, <4 x i32> zeroinitializer
   %broadcast.splatinsert15 = insertelement <4 x i32> undef, i32 %conv3, i32 0
   %broadcast.splat16 = shufflevector <4 x i32> %broadcast.splatinsert15, <4 x i32> undef, <4 x i32> zeroinitializer
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %0 = getelementptr inbounds i16, i16* %a, i32 %index
-
-;  %1 = icmp ule <4 x i32> %induction, %broadcast.splat13
   %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
-
   %2 = bitcast i16* %0 to <4 x i16>*
   %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %2, i32 2, <4 x i1> %1, <4 x i16> undef)
   %3 = sext <4 x i16> %wide.masked.load to <4 x i32>
@@ -707,11 +630,9 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_uchar(i8* nocapture readonl
 ; CHECK-NEXT:    mov.w r12, #0
 ; CHECK-NEXT:    b .LBB7_8
 ; CHECK-NEXT:  .LBB7_4: @ %vector.ph
-; CHECK-NEXT:    movs r7, #0
 ; CHECK-NEXT:    dlstp.32 lr, r12
 ; CHECK-NEXT:  .LBB7_5: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    adds r7, #4
 ; CHECK-NEXT:    vldrb.u32 q0, [r0], #4
 ; CHECK-NEXT:    vldrb.u32 q1, [r1], #4
 ; CHECK-NEXT:    vmlas.u32 q1, q0, r2
@@ -794,23 +715,14 @@ for.body.preheader.new:                           ; preds = %for.body.preheader
 vector.ph:                                        ; preds = %for.body.lr.ph
   %n.rnd.up = add i32 %N, 3
   %n.vec = and i32 %n.rnd.up, -4
-  %trip.count.minus.1 = add i32 %N, -1
-  %broadcast.splatinsert19 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
-  %broadcast.splat20 = shufflevector <4 x i32> %broadcast.splatinsert19, <4 x i32> undef, <4 x i32> zeroinitializer
   %broadcast.splatinsert22 = insertelement <4 x i32> undef, i32 %conv3, i32 0
   %broadcast.splat23 = shufflevector <4 x i32> %broadcast.splatinsert22, <4 x i32> undef, <4 x i32> zeroinitializer
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %2 = getelementptr inbounds i8, i8* %a, i32 %index
-
-;  %3 = icmp ule <4 x i32> %induction, %broadcast.splat20
   %3 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
-
   %4 = bitcast i8* %2 to <4 x i8>*
   %wide.masked.load = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %4, i32 1, <4 x i1> %3, <4 x i8> undef)
   %5 = zext <4 x i8> %wide.masked.load to <4 x i32>
@@ -914,11 +826,9 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_ushort(i16* nocapture reado
 ; CHECK-NEXT:    it eq
 ; CHECK-NEXT:    popeq {r4, pc}
 ; CHECK-NEXT:  .LBB8_1: @ %vector.ph
-; CHECK-NEXT:    movs r4, #0
 ; CHECK-NEXT:    dlstp.32 lr, r12
 ; CHECK-NEXT:  .LBB8_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    adds r4, #4
 ; CHECK-NEXT:    vldrh.u32 q0, [r0], #8
 ; CHECK-NEXT:    vldrh.u32 q1, [r1], #8
 ; CHECK-NEXT:    vmlas.u32 q1, q0, r2
@@ -934,23 +844,14 @@ vector.ph:                                        ; preds = %entry
   %conv3 = sext i16 %c to i32
   %n.rnd.up = add i32 %N, 3
   %n.vec = and i32 %n.rnd.up, -4
-  %trip.count.minus.1 = add i32 %N, -1
-  %broadcast.splatinsert12 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
-  %broadcast.splat13 = shufflevector <4 x i32> %broadcast.splatinsert12, <4 x i32> undef, <4 x i32> zeroinitializer
   %broadcast.splatinsert15 = insertelement <4 x i32> undef, i32 %conv3, i32 0
   %broadcast.splat16 = shufflevector <4 x i32> %broadcast.splatinsert15, <4 x i32> undef, <4 x i32> zeroinitializer
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %0 = getelementptr inbounds i16, i16* %a, i32 %index
-
-;  %1 = icmp ule <4 x i32> %induction, %broadcast.splat13
   %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
-
   %2 = bitcast i16* %0 to <4 x i16>*
   %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %2, i32 2, <4 x i1> %1, <4 x i16> undef)
   %3 = zext <4 x i16> %wide.masked.load to <4 x i32>
@@ -1003,11 +904,9 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_int(i32* nocapture readonly
 ; CHECK-NEXT:    mov.w r12, #0
 ; CHECK-NEXT:    b .LBB9_8
 ; CHECK-NEXT:  .LBB9_4: @ %vector.ph
-; CHECK-NEXT:    movs r7, #0
 ; CHECK-NEXT:    dlstp.32 lr, r12
 ; CHECK-NEXT:  .LBB9_5: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    adds r7, #4
 ; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
 ; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
 ; CHECK-NEXT:    vmlas.u32 q1, q0, r2
@@ -1089,23 +988,14 @@ for.body.preheader.new:                           ; preds = %for.body.preheader
 vector.ph:                                        ; preds = %vector.memcheck
   %n.rnd.up = add i32 %N, 3
   %n.vec = and i32 %n.rnd.up, -4
-  %trip.count.minus.1 = add i32 %N, -1
-  %broadcast.splatinsert21 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
-  %broadcast.splat22 = shufflevector <4 x i32> %broadcast.splatinsert21, <4 x i32> undef, <4 x i32> zeroinitializer
   %broadcast.splatinsert24 = insertelement <4 x i32> undef, i32 %c, i32 0
   %broadcast.splat25 = shufflevector <4 x i32> %broadcast.splatinsert24, <4 x i32> undef, <4 x i32> zeroinitializer
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %2 = getelementptr inbounds i32, i32* %a, i32 %index
-
-;  %3 = icmp ule <4 x i32> %induction, %broadcast.splat22
   %3 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
-
   %4 = bitcast i32* %2 to <4 x i32>*
   %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %4, i32 4, <4 x i1> %3, <4 x i32> undef)
   %5 = getelementptr inbounds i32, i32* %b, i32 %index
@@ -1196,11 +1086,9 @@ define dso_local arm_aapcs_vfpcc void @test_v8i8_to_v8i16(i16* noalias nocapture
 ; CHECK-NEXT:    it eq
 ; CHECK-NEXT:    popeq {r7, pc}
 ; CHECK-NEXT:  .LBB10_1: @ %vector.ph
-; CHECK-NEXT:    mov.w r12, #0
 ; CHECK-NEXT:    dlstp.16 lr, r3
 ; CHECK-NEXT:  .LBB10_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    add.w r12, r12, #8
 ; CHECK-NEXT:    vldrb.u16 q0, [r1], #8
 ; CHECK-NEXT:    vldrb.u16 q1, [r2], #8
 ; CHECK-NEXT:    vmul.i16 q0, q1, q0
@@ -1215,21 +1103,12 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %N, 7
   %n.vec = and i32 %n.rnd.up, -8
-  %trip.count.minus.1 = add i32 %N, -1
-  %broadcast.splatinsert12 = insertelement <8 x i32> undef, i32 %trip.count.minus.1, i32 0
-  %broadcast.splat13 = shufflevector <8 x i32> %broadcast.splatinsert12, <8 x i32> undef, <8 x i32> zeroinitializer
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
-  %induction = add <8 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %0 = getelementptr inbounds i8, i8* %b, i32 %index
-
-;  %1 = icmp ule <8 x i32> %induction, %broadcast.splat13
   %1 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N)
-
   %2 = bitcast i8* %0 to <8 x i8>*
   %wide.masked.load = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %2, i32 1, <8 x i1> %1, <8 x i8> undef)
   %3 = zext <8 x i8> %wide.masked.load to <8 x i16>
diff --git a/test/CodeGen/Thumb2/LowOverheadLoops/nested.ll b/test/CodeGen/Thumb2/LowOverheadLoops/nested.ll
index c8d38032a6a..6b71a070d65 100644
--- a/test/CodeGen/Thumb2/LowOverheadLoops/nested.ll
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/nested.ll
@@ -9,9 +9,6 @@ define void @mat_vec_sext_i16(i16** nocapture readonly %A, i16* nocapture readon
 ; CHECK:       for.cond1.preheader.us.preheader:
 ; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i32 [[N]], 3
 ; CHECK-NEXT:    [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -4
-; CHECK-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = add i32 [[N]], -1
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT28:%.*]] = insertelement <4 x i32> undef, i32 [[TRIP_COUNT_MINUS_1]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT29:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT28]], <4 x i32> undef, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TT:%.*]] = add i32 [[N_VEC]], -4
 ; CHECK-NEXT:    [[TT1:%.*]] = lshr i32 [[TT]], 2
 ; CHECK-NEXT:    [[TT2:%.*]] = add nuw nsw i32 [[TT1]], 1
@@ -30,9 +27,6 @@ define void @mat_vec_sext_i16(i16** nocapture readonly %A, i16* nocapture readon
 ; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ [[TT4]], [[FOR_COND1_PREHEADER_US]] ], [ [[TT14:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TT5:%.*]] = phi i32 [ [[START]], [[FOR_COND1_PREHEADER_US]] ], [ [[TT15:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = phi i32 [ [[N]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    [[TT6:%.*]] = getelementptr inbounds i16, i16* [[TT3]], i32 [[INDEX]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP0]])
 ; CHECK-NEXT:    [[TMP2]] = sub i32 [[TMP0]], 4
@@ -66,9 +60,6 @@ entry:
 for.cond1.preheader.us.preheader:                 ; preds = %entry
   %n.rnd.up = add i32 %N, 3
   %n.vec = and i32 %n.rnd.up, -4
-  %trip.count.minus.1 = add i32 %N, -1
-  %broadcast.splatinsert28 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
-  %broadcast.splat29 = shufflevector <4 x i32> %broadcast.splatinsert28, <4 x i32> undef, <4 x i32> zeroinitializer
   %tt = add i32 %n.vec, -4
   %tt1 = lshr i32 %tt, 2
   %tt2 = add nuw nsw i32 %tt1, 1
@@ -88,14 +79,8 @@ vector.body:                                      ; preds = %vector.body, %for.c
   %index = phi i32 [ 0, %for.cond1.preheader.us ], [ %index.next, %vector.body ]
   %vec.phi = phi <4 x i32> [ %tt4, %for.cond1.preheader.us ], [ %tt14, %vector.body ]
   %tt5 = phi i32 [ %start, %for.cond1.preheader.us ], [ %tt15, %vector.body ]
-  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %tt6 = getelementptr inbounds i16, i16* %tt3, i32 %index
-
-  ; %tt7 = icmp ule <4 x i32> %induction, %broadcast.splat29
   %tt7 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
-
   %tt8 = bitcast i16* %tt6 to <4 x i16>*
   %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %tt8, i32 2, <4 x i1> %tt7, <4 x i16> undef)
   %tt9 = sext <4 x i16> %wide.masked.load to <4 x i32>
@@ -130,9 +115,6 @@ define void @mat_vec_i32(i32** nocapture readonly %A, i32* nocapture readonly %B
 ; CHECK:       for.cond1.preheader.us.preheader:
 ; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i32 [[N]], 3
 ; CHECK-NEXT:    [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -4
-; CHECK-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = add i32 [[N]], -1
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT27:%.*]] = insertelement <4 x i32> undef, i32 [[TRIP_COUNT_MINUS_1]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT28:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT27]], <4 x i32> undef, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TT:%.*]] = add i32 [[N_VEC]], -4
 ; CHECK-NEXT:    [[TT1:%.*]] = lshr i32 [[TT]], 2
 ; CHECK-NEXT:    [[TT2:%.*]] = add nuw nsw i32 [[TT1]], 1
@@ -151,9 +133,6 @@ define void @mat_vec_i32(i32** nocapture readonly %A, i32* nocapture readonly %B
 ; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ [[TT4]], [[FOR_COND1_PREHEADER_US]] ], [ [[TT12:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TT5:%.*]] = phi i32 [ [[START]], [[FOR_COND1_PREHEADER_US]] ], [ [[TT13:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = phi i32 [ [[N]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    [[TT6:%.*]] = getelementptr inbounds i32, i32* [[TT3]], i32 [[INDEX]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP0]])
 ; CHECK-NEXT:    [[TMP2]] = sub i32 [[TMP0]], 4
@@ -185,9 +164,6 @@ entry:
 for.cond1.preheader.us.preheader:                 ; preds = %entry
   %n.rnd.up = add i32 %N, 3
   %n.vec = and i32 %n.rnd.up, -4
-  %trip.count.minus.1 = add i32 %N, -1
-  %broadcast.splatinsert27 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
-  %broadcast.splat28 = shufflevector <4 x i32> %broadcast.splatinsert27, <4 x i32> undef, <4 x i32> zeroinitializer
   %tt = add i32 %n.vec, -4
   %tt1 = lshr i32 %tt, 2
   %tt2 = add nuw nsw i32 %tt1, 1
@@ -207,14 +183,8 @@ vector.body:                                      ; preds = %vector.body, %for.c
   %index = phi i32 [ 0, %for.cond1.preheader.us ], [ %index.next, %vector.body ]
   %vec.phi = phi <4 x i32> [ %tt4, %for.cond1.preheader.us ], [ %tt12, %vector.body ]
   %tt5 = phi i32 [ %start, %for.cond1.preheader.us ], [ %tt13, %vector.body ]
-  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %tt6 = getelementptr inbounds i32, i32* %tt3, i32 %index
-
-  ; %tt7 = icmp ule <4 x i32> %induction, %broadcast.splat28
   %tt7 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
-
   %tt8 = bitcast i32* %tt6 to <4 x i32>*
   %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tt8, i32 4, <4 x i1> %tt7, <4 x i32> undef)
   %tt9 = getelementptr inbounds i32, i32* %B, i32 %index
diff --git a/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll b/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll
index 8e8c37bf7f5..d58ad4c697d 100644
--- a/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll
@@ -30,7 +30,6 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %N, 15
   %n.vec = and i32 %n.rnd.up, -16
-  %trip.count.minus.1 = add i32 %N, -1
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
@@ -100,7 +99,6 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %N, 7
   %n.vec = and i32 %n.rnd.up, -8
-  %trip.count.minus.1 = add i32 %N, -1
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
@@ -172,7 +170,6 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %N, 15
   %n.vec = and i32 %n.rnd.up, -16
-  %trip.count.minus.1 = add i32 %N, -1
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
@@ -242,7 +239,6 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %N, 7
   %n.vec = and i32 %n.rnd.up, -8
-  %trip.count.minus.1 = add i32 %N, -1
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
@@ -314,7 +310,6 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %N, 15
   %n.vec = and i32 %n.rnd.up, -16
-  %trip.count.minus.1 = add i32 %N, -1
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
@@ -384,7 +379,6 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %N, 7
   %n.vec = and i32 %n.rnd.up, -8
-  %trip.count.minus.1 = add i32 %N, -1
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
@@ -481,7 +475,6 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %N, 3
   %n.vec = and i32 %n.rnd.up, -4
-  %trip.count.minus.1 = add i32 %N, -1
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
@@ -510,7 +503,6 @@ middle.block:                                     ; preds = %vector.body
 vector.ph47:                                      ; preds = %middle.block
   %n.rnd.up48 = add i32 %N, 3
   %n.vec50 = and i32 %n.rnd.up48, -4
-  %trip.count.minus.154 = add i32 %N, -1
   %i11 = insertelement <4 x i32> <i32 undef, i32 0, i32 0, i32 0>, i32 %i10, i32 0
   br label %vector.body46
 
@@ -594,7 +586,6 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %N, 7
   %n.vec = and i32 %n.rnd.up, -8
-  %trip.count.minus.1 = add i32 %N, -1
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
@@ -719,7 +710,6 @@ lor.end:                                          ; preds = %entry, %lor.rhs
 vector.ph:                                        ; preds = %lor.end
   %n.rnd.up = add i32 %4, 3
   %n.vec = and i32 %n.rnd.up, -4
-  %trip.count.minus.1 = add i32 %4, -1
   %5 = insertelement <4 x i32> <i32 undef, i32 0, i32 0, i32 0>, i32 %0, i32 0
   br label %vector.body
 
diff --git a/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-basic.ll b/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-basic.ll
index 1492a01a272..d10cbffe2dd 100644
--- a/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-basic.ll
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-basic.ll
@@ -22,23 +22,14 @@ entry:
   br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
 
 vector.ph:                                        ; preds = %entry
-  %trip.count.minus.1 = add i32 %N, -1
-  %broadcast.splatinsert10 = insertelement <16 x i32> undef, i32 %trip.count.minus.1, i32 0
-  %broadcast.splat11 = shufflevector <16 x i32> %broadcast.splatinsert10, <16 x i32> undef, <16 x i32> zeroinitializer
   %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
   %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
-  %broadcast.splatinsert = insertelement <16 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <16 x i32> %broadcast.splatinsert, <16 x i32> undef, <16 x i32> zeroinitializer
-  %induction = or <16 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   %tmp = getelementptr inbounds i8, i8* %a, i32 %index
-
-;  %tmp1 = icmp ule <16 x i32> %induction, %broadcast.splat11
   %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %N)
-
   %tmp2 = bitcast i8* %tmp to <16 x i8>*
   %wide.masked.load = tail call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %tmp2, i32 4, <16 x i1> %active.lane.mask, <16 x i8> undef)
   %tmp3 = getelementptr inbounds i8, i8* %b, i32 %index
@@ -79,23 +70,14 @@ entry:
   br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
 
 vector.ph:                                        ; preds = %entry
-  %trip.count.minus.1 = add i32 %N, -1
-  %broadcast.splatinsert10 = insertelement <8 x i32> undef, i32 %trip.count.minus.1, i32 0
-  %broadcast.splat11 = shufflevector <8 x i32> %broadcast.splatinsert10, <8 x i32> undef, <8 x i32> zeroinitializer
   %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
   %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
-  %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
-  %induction = add <8 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %tmp = getelementptr inbounds i16, i16* %a, i32 %index
-
-;  %tmp1 = icmp ule <8 x i32> %induction, %broadcast.splat11
   %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N)
-
   %tmp2 = bitcast i16* %tmp to <8 x i16>*
   %wide.masked.load = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp2, i32 4, <8 x i1> %active.lane.mask, <8 x i16> undef)
   %tmp3 = getelementptr inbounds i16, i16* %b, i32 %index
@@ -135,20 +117,13 @@ entry:
   br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
 
 vector.ph:                                        ; preds = %entry
-  %trip.count.minus.1 = add i32 %N, -1
-  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
-  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
   %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
   %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
-  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-  %induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %tmp = getelementptr inbounds i32, i32* %a, i32 %index
- ; %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
   %tmp2 = bitcast i32* %tmp to <4 x i32>*
   %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
   %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
@@ -190,20 +165,13 @@ entry:
   br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
 
 vector.ph:                                        ; preds = %entry
-  %trip.count.minus.1 = add i32 %N, -1
-  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
-  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
   %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
   %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
-  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %tmp = getelementptr inbounds i32, i32* %a, i32 %index
-;  %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
   %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
   %tmp2 = bitcast i32* %tmp to <4 x i32>*
   %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
@@ -262,10 +230,7 @@ vector.body:                                      ; preds = %vector.body, %vecto
   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
   %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %tmp = getelementptr inbounds i32, i32* %a, i32 %index
-
-;  %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
   %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
-
   %wrong = icmp ult <4 x i32> %induction, %broadcast.splat11
   %tmp2 = bitcast i32* %tmp to <4 x i32>*
   %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
@@ -321,10 +286,7 @@ vector.body:                                      ; preds = %vector.body, %vecto
   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
   %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %tmp = getelementptr inbounds i32, i32* %a, i32 %index
-
-;  %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
   %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
-
   %wrong = icmp ult <4 x i32> %induction, %broadcast.splat11
   %tmp2 = bitcast i32* %tmp to <4 x i32>*
   %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
@@ -370,7 +332,6 @@ entry:
 
 
 vector.ph:
-  %trip.count.minus.1 = add i32 %N, -1
   %scevgep = getelementptr i32, i32* %A, i32 8
   %scevgep30 = getelementptr i32, i32* %C, i32 8
   %scevgep37 = getelementptr i32, i32* %B, i32 8
@@ -459,9 +420,7 @@ vector.body:                                      ; preds = %vector.body, %vecto
   %lsr.iv13 = bitcast i32* %lsr.iv to <4 x i32>*
   %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
   %lsr.iv1719 = bitcast i32* %lsr.iv17 to <4 x i32>*
-
   %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 42)
-
   %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv13, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
   %wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
   %7 = add nsw <4 x i32> %wide.masked.load12, %wide.masked.load
@@ -495,7 +454,6 @@ entry:
   br i1 %cmp8, label %vector.ph, label %for.cond.cleanup
 
 vector.ph:                                        ; preds = %entry
-  %trip.count.minus.1 = add i32 %N, -1
   %start = call i32 @llvm.start.loop.iterations.i32(i32 %5)
   br label %vector.body
 
@@ -509,9 +467,7 @@ vector.body:                                      ; preds = %vector.body, %vecto
   %lsr.iv13 = bitcast i32* %lsr.iv to <4 x i32>*
   %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
   %lsr.iv1719 = bitcast i32* %lsr.iv17 to <4 x i32>*
-
   %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %index)
-
   %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv13, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
   %wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
   %7 = add nsw <4 x i32> %wide.masked.load12, %wide.masked.load
@@ -546,7 +502,6 @@ entry:
   br i1 %cmp8, label %vector.ph, label %for.cond.cleanup
 
 vector.ph:                                        ; preds = %entry
-  %trip.count.minus.1 = add i32 %N, -1
   %start = call i32 @llvm.start.loop.iterations.i32(i32 %5)
   br label %vector.body
 
diff --git a/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-const.ll b/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-const.ll
index 4682f1d36f3..1c173e9dfd1 100644
--- a/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-const.ll
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-const.ll
@@ -10,22 +10,17 @@ define dso_local void @foo(i32* noalias nocapture %A, i32* noalias nocapture rea
 ; CHECK-NEXT:    [[LSR_IV14:%.*]] = phi i32* [ [[SCEVGEP15:%.*]], [[VECTOR_BODY]] ], [ [[A:%.*]], [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    [[LSR_IV11:%.*]] = phi i32* [ [[SCEVGEP12:%.*]], [[VECTOR_BODY]] ], [ [[C:%.*]], [[ENTRY]] ]
 ; CHECK-NEXT:    [[LSR_IV:%.*]] = phi i32* [ [[SCEVGEP:%.*]], [[VECTOR_BODY]] ], [ [[B:%.*]], [[ENTRY]] ]
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = phi i32 [ [[START]], [[ENTRY]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP1:%.*]] = phi i32 [ 32003, [[ENTRY]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[LSR_IV1416:%.*]] = bitcast i32* [[LSR_IV14]] to <4 x i32>*
 ; CHECK-NEXT:    [[LSR_IV1113:%.*]] = bitcast i32* [[LSR_IV11]] to <4 x i32>*
 ; CHECK-NEXT:    [[LSR_IV10:%.*]] = bitcast i32* [[LSR_IV]] to <4 x i32>*
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP1]])
 ; CHECK-NEXT:    [[TMP3]] = sub i32 [[TMP1]], 4
 ; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[LSR_IV10]], i32 4, <4 x i1> [[TMP2]], <4 x i32> undef)
 ; CHECK-NEXT:    [[WIDE_MASKED_LOAD9:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[LSR_IV1113]], i32 4, <4 x i1> [[TMP2]], <4 x i32> undef)
 ; CHECK-NEXT:    [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD9]], [[WIDE_MASKED_LOAD]]
 ; CHECK-NEXT:    call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[TMP4]], <4 x i32>* [[LSR_IV1416]], i32 4, <4 x i1> [[TMP2]])
-; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
 ; CHECK-NEXT:    [[SCEVGEP]] = getelementptr i32, i32* [[LSR_IV]], i32 4
 ; CHECK-NEXT:    [[SCEVGEP12]] = getelementptr i32, i32* [[LSR_IV11]], i32 4
 ; CHECK-NEXT:    [[SCEVGEP15]] = getelementptr i32, i32* [[LSR_IV14]], i32 4
@@ -48,13 +43,7 @@ vector.body:
   %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
   %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>*
   %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>*
-  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
-
-  ; %1 = icmp ult <4 x i32> %induction, <i32 32002, i32 32002, i32 32002, i32 32002>
   %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 32003)
-
   %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef)
   %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef)
   %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load
@@ -244,11 +233,7 @@ vector.body:
   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
   %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
-
-; Non-uniform constant vector here. This can't be represented with
-; @llvm.get.active.lane.mask, but let's keep this test as a sanity check:
   %1 = icmp ult <4 x i32> %induction, <i32 0, i32 3200, i32 32002, i32 32002>
-
   %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef)
   %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef)
   %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load
@@ -285,13 +270,8 @@ vector.body:
   %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
   %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>*
   %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>*
-  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
-
 ; BTC = UINT_MAX, and scalar trip count BTC + 1 would overflow:
   %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 4294967295)
-
   %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef)
   %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef)
   %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load
@@ -328,12 +308,7 @@ vector.body:
   %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
   %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>*
   %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>*
-  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
-
   %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 32003)
-
   %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef)
   %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef)
   %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load
@@ -371,13 +346,8 @@ vector.body:
   %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
   %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>*
   %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>*
-  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
-
 ; The induction variable %N is not an IV:
   %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %N, i32 32003)
-
   %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef)
   %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef)
   %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load
@@ -414,12 +384,7 @@ vector.body:
   %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
   %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>*
   %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>*
-  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
-
   %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 32003)
-
   %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef)
   %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef)
   %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load
@@ -460,9 +425,6 @@ vector.body:
   %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
   %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>*
   %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>*
-  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 32003)
   %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef)
   %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef)
@@ -514,10 +476,8 @@ vector.body:                                      ; preds = %vector.body, %vecto
   %lsr.iv3840 = bitcast i32* %lsr.iv38 to <4 x i32>*
   %lsr.iv3335 = bitcast i32* %lsr.iv33 to <4 x i32>*
   %lsr.iv2830 = bitcast i32* %lsr.iv28 to <4 x i32>*
-
 ; It's using %j.025, the induction variable from its outer loop:
   %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %j.025, i32 4096)
-
   %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv3840, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
   %wide.masked.load27 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv3335, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
   %1 = add nsw <4 x i32> %wide.masked.load27, %wide.masked.load
diff --git a/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-disabled-in-loloops.ll b/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-disabled-in-loloops.ll
index a2361f51863..73865945cdc 100644
--- a/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-disabled-in-loloops.ll
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-disabled-in-loloops.ll
@@ -82,7 +82,6 @@ entry:
   br i1 %cmp8, label %vector.ph, label %for.cond.cleanup
 
 vector.ph:                                        ; preds = %entry
-  %trip.count.minus.1 = add i32 %N, -1
   %start = call i32 @llvm.start.loop.iterations.i32(i32 %5)
   br label %vector.body
 
@@ -92,13 +91,10 @@ vector.body:                                      ; preds = %vector.body, %vecto
   %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %vector.ph ]
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
   %6 = phi i32 [ %start, %vector.ph ], [ %8, %vector.body ]
-
   %lsr.iv13 = bitcast i32* %lsr.iv to <4 x i32>*
   %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
   %lsr.iv1719 = bitcast i32* %lsr.iv17 to <4 x i32>*
-
   %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
-
   %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv13, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
   %wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
   %7 = add nsw <4 x i32> %wide.masked.load12, %wide.masked.load
diff --git a/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-add-sat.ll b/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-add-sat.ll
index c0b2a036f37..14f1d0c0020 100644
--- a/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-add-sat.ll
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-add-sat.ll
@@ -27,7 +27,6 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %blockSize, 7
   %n.vec = and i32 %n.rnd.up, -8
-  %trip.count.minus.1 = add i32 %blockSize, -1
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
@@ -77,7 +76,6 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %blockSize, 7
   %n.vec = and i32 %n.rnd.up, -8
-  %trip.count.minus.1 = add i32 %blockSize, -1
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
diff --git a/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-fabs.ll b/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-fabs.ll
index 5ad6d911230..66216022d64 100644
--- a/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-fabs.ll
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-fabs.ll
@@ -26,7 +26,6 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %blockSize, 3
   %n.vec = and i32 %n.rnd.up, -4
-  %trip.count.minus.1 = add i32 %blockSize, -1
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
diff --git a/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-round.ll b/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-round.ll
index bd927fdcf85..024857b6580 100644
--- a/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-round.ll
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-round.ll
@@ -26,7 +26,6 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %n, 3
   %n.vec = and i32 %n.rnd.up, -4
-  %trip.count.minus.1 = add i32 %n, -1
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
@@ -72,7 +71,6 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %n, 3
   %n.vec = and i32 %n.rnd.up, -4
-  %trip.count.minus.1 = add i32 %n, -1
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
@@ -118,7 +116,6 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %n, 3
   %n.vec = and i32 %n.rnd.up, -4
-  %trip.count.minus.1 = add i32 %n, -1
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
@@ -164,7 +161,6 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %n, 3
   %n.vec = and i32 %n.rnd.up, -4
-  %trip.count.minus.1 = add i32 %n, -1
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
@@ -210,7 +206,6 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %n, 3
   %n.vec = and i32 %n.rnd.up, -4
-  %trip.count.minus.1 = add i32 %n, -1
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
@@ -260,7 +255,6 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %n, 3
   %n.vec = and i32 %n.rnd.up, -4
-  %trip.count.minus.1 = add i32 %n, -1
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
diff --git a/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-sub-sat.ll b/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-sub-sat.ll
index 98d48d49539..0e51661e8f5 100644
--- a/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-sub-sat.ll
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-sub-sat.ll
@@ -27,7 +27,6 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %blockSize, 7
   %n.vec = and i32 %n.rnd.up, -8
-  %trip.count.minus.1 = add i32 %blockSize, -1
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
@@ -77,7 +76,6 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %blockSize, 7
   %n.vec = and i32 %n.rnd.up, -8
-  %trip.count.minus.1 = add i32 %blockSize, -1
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
diff --git a/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-reduce.ll b/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-reduce.ll
index ef79f27ce5d..01bbd3ac28e 100644
--- a/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-reduce.ll
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-reduce.ll
@@ -18,8 +18,6 @@ vector.ph:
   %tmp = add i32 %N, -1
   %n.rnd.up = add i32 %tmp, 8
   %n.vec = and i32 %n.rnd.up, -8
-  %broadcast.splatinsert1 = insertelement <8 x i32> undef, i32 %tmp, i32 0
-  %broadcast.splat2 = shufflevector <8 x i32> %broadcast.splatinsert1, <8 x i32> undef, <8 x i32> zeroinitializer
   %0 = add i32 %n.vec, -8
   %1 = lshr i32 %0, 3
   %2 = add i32 %1, 1
@@ -30,14 +28,8 @@ vector.body:                                      ; preds = %vector.body, %vecto
   %index = phi i32 [ 0, %vector.ph], [ %index.next, %vector.body ]
   %vec.phi = phi <8 x i16> [ zeroinitializer, %vector.ph], [ %tmp8, %vector.body ]
   %3 = phi i32 [ %start, %vector.ph], [ %4, %vector.body ]
-  %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
-  %induction = add <8 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %tmp2 = getelementptr inbounds i16, i16* %A, i32 %index
-
-  ; %tmp3 = icmp ule <8 x i32> %induction, %broadcast.splat2
   %tmp3 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N)
-
   %tmp4 = bitcast i16* %tmp2 to <8 x i16>*
   %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp4, i32 4, <8 x i1> %tmp3, <8 x i16> undef)
   %tmp5 = getelementptr inbounds i16, i16* %B, i32 %index
@@ -87,8 +79,6 @@ vector.ph:
   %tmp = add i32 %N, -1
   %n.rnd.up = add nuw nsw i32 %tmp, 8
   %n.vec = and i32 %n.rnd.up, -8
-  %broadcast.splatinsert1 = insertelement <8 x i32> undef, i32 %tmp, i32 0
-  %broadcast.splat2 = shufflevector <8 x i32> %broadcast.splatinsert1, <8 x i32> undef, <8 x i32> zeroinitializer
   %broadcast.splatinsert3 = insertelement <8 x i16> undef, i16 %B, i32 0
   %broadcast.splat4 = shufflevector <8 x i16> %broadcast.splatinsert3, <8 x i16> undef, <8 x i32> zeroinitializer
   %0 = add i32 %n.vec, -8
@@ -101,14 +91,8 @@ vector.body:                                      ; preds = %vector.body, %vecto
   %index = phi i32 [ 0, %vector.ph], [ %index.next, %vector.body ]
   %vec.phi = phi <8 x i16> [ zeroinitializer, %vector.ph], [ %tmp6, %vector.body ]
   %3 = phi i32 [ %start, %vector.ph], [ %4, %vector.body ]
-  %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
-  %induction = add <8 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %tmp2 = getelementptr inbounds i16, i16* %A, i32 %index
-
-  ; %tmp3 = icmp ule <8 x i32> %induction, %broadcast.splat2
   %tmp3 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N)
-
   %tmp4 = bitcast i16* %tmp2 to <8 x i16>*
   %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp4, i32 4, <8 x i1> %tmp3, <8 x i16> undef)
   %tmp5 = add <8 x i16> %vec.phi, %broadcast.splat4
@@ -151,8 +135,6 @@ entry:
   %tmp = add i32 %N, -1
   %n.rnd.up = add nuw nsw i32 %tmp, 8
   %n.vec = and i32 %n.rnd.up, -8
-  %broadcast.splatinsert1 = insertelement <8 x i32> undef, i32 %tmp, i32 0
-  %broadcast.splat2 = shufflevector <8 x i32> %broadcast.splatinsert1, <8 x i32> undef, <8 x i32> zeroinitializer
   %broadcast.splatinsert3 = insertelement <8 x i16> undef, i16 %B, i32 0
   %broadcast.splat4 = shufflevector <8 x i16> %broadcast.splatinsert3, <8 x i16> undef, <8 x i32> zeroinitializer
   %0 = add i32 %n.vec, -8
@@ -165,14 +147,8 @@ vector.body:                                      ; preds = %vector.body, %vecto
   %index = phi i32 [ 0, %entry], [ %index.next, %vector.body ]
   %vec.phi = phi <8 x i16> [ zeroinitializer, %entry], [ %tmp6, %vector.body ]
   %3 = phi i32 [ %start, %entry ], [ %4, %vector.body ]
-  %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
-  %induction = add <8 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %tmp2 = getelementptr inbounds i16, i16* %A, i32 %index
-
-  ; %tmp3 = icmp ule <8 x i32> %induction, %broadcast.splat2
   %tmp3 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N)
-
   %tmp4 = bitcast i16* %tmp2 to <8 x i16>*
   %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp4, i32 4, <8 x i1> %tmp3, <8 x i16> undef)
   %tmp5 = add <8 x i16> %vec.phi, %broadcast.splat4
@@ -227,7 +203,6 @@ for.body:
   br i1 %cmp433, label %vector.ph, label %for.end
 
 vector.ph:                                        ; preds = %for.body
-  %trip.count.minus.1 = add i32 %8, -1
   %start = call i32 @llvm.start.loop.iterations.i32(i32 %7)
   br label %vector.body
 
diff --git a/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-widen.ll b/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-widen.ll
index 939d3cc5e55..c9b2905755e 100644
--- a/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-widen.ll
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-widen.ll
@@ -14,23 +14,14 @@ entry:
   br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
 
 vector.ph:                                        ; preds = %entry
-  %trip.count.minus.1 = add i32 %N, -1
-  %broadcast.splatinsert10 = insertelement <8 x i32> undef, i32 %trip.count.minus.1, i32 0
-  %broadcast.splat11 = shufflevector <8 x i32> %broadcast.splatinsert10, <8 x i32> undef, <8 x i32> zeroinitializer
   %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
   %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
-  %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
-  %induction = add <8 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %tmp = getelementptr inbounds i16, i16* %a, i32 %index
-
-  ; %tmp1 = icmp ule <8 x i32> %induction, %broadcast.splat11
-   %tmp1 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N)
-
+  %tmp1 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N)
   %tmp2 = bitcast i16* %tmp to <8 x i16>*
   %wide.masked.load = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp2, i32 4, <8 x i1> %tmp1, <8 x i16> undef)
   %tmp3 = getelementptr inbounds i16, i16* %b, i32 %index
@@ -72,8 +63,6 @@ entry:
 
 vector.ph:                                        ; preds = %entry
   %trip.count.minus.1 = add i32 %N, -1
-  %broadcast.splatinsert10 = insertelement <8 x i32> undef, i32 %trip.count.minus.1, i32 0
-  %broadcast.splat11 = shufflevector <8 x i32> %broadcast.splatinsert10, <8 x i32> undef, <8 x i32> zeroinitializer
   %broadcast.splatinsert10.store = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
   %broadcast.splat11.store = shufflevector <4 x i32> %broadcast.splatinsert10.store, <4 x i32> undef, <4 x i32> zeroinitializer
   %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
@@ -83,14 +72,8 @@ vector.body:                                      ; preds = %vector.body, %vecto
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
   %store.idx = phi i32 [ 0, %vector.ph ], [ %store.idx.next, %vector.body ]
   %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
-  %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
-  %induction = add <8 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %tmp = getelementptr inbounds i16, i16* %a, i32 %index
-
-  ; %tmp1 = icmp ule <8 x i32> %induction, %broadcast.splat11
   %tmp1 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N)
-
   %tmp2 = bitcast i16* %tmp to <8 x i16>*
   %wide.masked.load = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp2, i32 4, <8 x i1> %tmp1, <8 x i16> undef)
   %tmp3 = getelementptr inbounds i16, i16* %b, i32 %index
@@ -136,23 +119,14 @@ entry:
   br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
 
 vector.ph:                                        ; preds = %entry
-  %trip.count.minus.1 = add i32 %N, -1
-  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
-  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
   %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
   %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
-  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %tmp = getelementptr inbounds i32, i32* %a, i32 %index
-
-  ; %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
   %tmp1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
-
   %tmp2 = bitcast i32* %tmp to <4 x i32>*
   %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
   %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
diff --git a/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll b/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll
index 1ea183d4a5f..af5c76fd447 100644
--- a/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll
@@ -183,7 +183,6 @@ for.body:                                         ; preds = %for.end, %for.body.
   br i1 %cmp433, label %vector.ph, label %for.end
 
 vector.ph:                                        ; preds = %for.body
-  %trip.count.minus.1 = add i32 %i8, -1
   %start = call i32 @llvm.start.loop.iterations.i32(i32 %i7)
   br label %vector.body
 
diff --git a/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll b/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll
index 1a370c483ba..015af0b4097 100644
--- a/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll
@@ -11,27 +11,25 @@ define dso_local i32 @mul_reduce_add(i32* noalias nocapture readonly %a, i32* no
 ; CHECK-NEXT:  .LBB0_1: @ %vector.ph
 ; CHECK-NEXT:    push {r7, lr}
 ; CHECK-NEXT:    adds r3, r2, #3
-; CHECK-NEXT:    vmov.i32 q0, #0x0
+; CHECK-NEXT:    vmov.i32 q1, #0x0
 ; CHECK-NEXT:    bic r3, r3, #3
 ; CHECK-NEXT:    sub.w r12, r3, #4
 ; CHECK-NEXT:    movs r3, #1
-; CHECK-NEXT:    add.w r12, r3, r12, lsr #2
-; CHECK-NEXT:    movs r3, #0
-; CHECK-NEXT:    dls lr, r12
+; CHECK-NEXT:    add.w r3, r3, r12, lsr #2
+; CHECK-NEXT:    dls lr, r3
 ; CHECK-NEXT:  .LBB0_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vctp.32 r2
-; CHECK-NEXT:    vmov q1, q0
+; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    vpstt
-; CHECK-NEXT:    vldrwt.u32 q0, [r0], #16
+; CHECK-NEXT:    vldrwt.u32 q1, [r0], #16
 ; CHECK-NEXT:    vldrwt.u32 q2, [r1], #16
-; CHECK-NEXT:    adds r3, #4
-; CHECK-NEXT:    vmul.i32 q0, q2, q0
 ; CHECK-NEXT:    subs r2, #4
-; CHECK-NEXT:    vadd.i32 q0, q0, q1
+; CHECK-NEXT:    vmul.i32 q1, q2, q1
+; CHECK-NEXT:    vadd.i32 q1, q1, q0
 ; CHECK-NEXT:    le lr, .LBB0_2
 ; CHECK-NEXT:  @ %bb.3: @ %middle.block
-; CHECK-NEXT:    vpsel q0, q0, q1
+; CHECK-NEXT:    vpsel q0, q1, q0
 ; CHECK-NEXT:    vaddv.u32 r0, q0
 ; CHECK-NEXT:    pop {r7, pc}
 entry:
@@ -41,22 +39,13 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %N, 3
   %n.vec = and i32 %n.rnd.up, -4
-  %trip.count.minus.1 = add i32 %N, -1
-  %broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
-  %broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
   %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %6, %vector.body ]
-  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %0 = getelementptr inbounds i32, i32* %a, i32 %index
-
-  ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat12
   %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
-
   %2 = bitcast i32* %0 to <4 x i32>*
   %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef)
   %3 = getelementptr inbounds i32, i32* %b, i32 %index
@@ -92,16 +81,14 @@ define dso_local i32 @mul_reduce_add_const(i32* noalias nocapture readonly %a, i
 ; CHECK-NEXT:    bic r1, r1, #3
 ; CHECK-NEXT:    vmov.i32 q0, #0x0
 ; CHECK-NEXT:    subs r1, #4
-; CHECK-NEXT:    add.w r3, r3, r1, lsr #2
-; CHECK-NEXT:    movs r1, #0
-; CHECK-NEXT:    dls lr, r3
+; CHECK-NEXT:    add.w r1, r3, r1, lsr #2
+; CHECK-NEXT:    dls lr, r1
 ; CHECK-NEXT:  .LBB1_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vctp.32 r2
 ; CHECK-NEXT:    vmov q1, q0
 ; CHECK-NEXT:    vpst
 ; CHECK-NEXT:    vldrwt.u32 q0, [r0], #16
-; CHECK-NEXT:    adds r1, #4
 ; CHECK-NEXT:    subs r2, #4
 ; CHECK-NEXT:    vadd.i32 q0, q0, q1
 ; CHECK-NEXT:    le lr, .LBB1_2
@@ -116,22 +103,13 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %N, 3
   %n.vec = and i32 %n.rnd.up, -4
-  %trip.count.minus.1 = add i32 %N, -1
-  %broadcast.splatinsert9 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
-  %broadcast.splat10 = shufflevector <4 x i32> %broadcast.splatinsert9, <4 x i32> undef, <4 x i32> zeroinitializer
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
   %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %3, %vector.body ]
-  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %0 = getelementptr inbounds i32, i32* %a, i32 %index
-
-  ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat10
   %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
-
   %2 = bitcast i32* %0 to <4 x i32>*
   %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef)
   %3 = add nsw <4 x i32> %wide.masked.load, %vec.phi
@@ -163,16 +141,14 @@ define dso_local i32 @add_reduce_add_const(i32* noalias nocapture readonly %a, i
 ; CHECK-NEXT:    bic r1, r1, #3
 ; CHECK-NEXT:    vmov.i32 q0, #0x0
 ; CHECK-NEXT:    subs r1, #4
-; CHECK-NEXT:    add.w r3, r3, r1, lsr #2
-; CHECK-NEXT:    movs r1, #0
-; CHECK-NEXT:    dls lr, r3
+; CHECK-NEXT:    add.w r1, r3, r1, lsr #2
+; CHECK-NEXT:    dls lr, r1
 ; CHECK-NEXT:  .LBB2_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vctp.32 r2
 ; CHECK-NEXT:    vmov q1, q0
 ; CHECK-NEXT:    vpst
 ; CHECK-NEXT:    vldrwt.u32 q0, [r0], #16
-; CHECK-NEXT:    adds r1, #4
 ; CHECK-NEXT:    subs r2, #4
 ; CHECK-NEXT:    vadd.i32 q0, q0, q1
 ; CHECK-NEXT:    le lr, .LBB2_2
@@ -187,22 +163,13 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %N, 3
   %n.vec = and i32 %n.rnd.up, -4
-  %trip.count.minus.1 = add i32 %N, -1
-  %broadcast.splatinsert9 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
-  %broadcast.splat10 = shufflevector <4 x i32> %broadcast.splatinsert9, <4 x i32> undef, <4 x i32> zeroinitializer
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
   %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %3, %vector.body ]
-  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %0 = getelementptr inbounds i32, i32* %a, i32 %index
-
-  ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat10
   %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
-
   %2 = bitcast i32* %0 to <4 x i32>*
   %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef)
   %3 = add nsw <4 x i32> %wide.masked.load, %vec.phi
@@ -228,11 +195,9 @@ define dso_local void @vector_mul_const(i32* noalias nocapture %a, i32* noalias
 ; CHECK-NEXT:    it eq
 ; CHECK-NEXT:    popeq {r7, pc}
 ; CHECK-NEXT:  .LBB3_1: @ %vector.ph
-; CHECK-NEXT:    mov.w r12, #0
 ; CHECK-NEXT:    dlstp.32 lr, r3
 ; CHECK-NEXT:  .LBB3_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    add.w r12, r12, #4
 ; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
 ; CHECK-NEXT:    vmul.i32 q0, q0, r2
 ; CHECK-NEXT:    vstrw.32 q0, [r0], #16
@@ -246,23 +211,14 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %N, 3
   %n.vec = and i32 %n.rnd.up, -4
-  %trip.count.minus.1 = add i32 %N, -1
-  %broadcast.splatinsert8 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
-  %broadcast.splat9 = shufflevector <4 x i32> %broadcast.splatinsert8, <4 x i32> undef, <4 x i32> zeroinitializer
   %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %c, i32 0
   %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %0 = getelementptr inbounds i32, i32* %b, i32 %index
-
-  ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat9
   %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
-
   %2 = bitcast i32* %0 to <4 x i32>*
   %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef)
   %3 = mul nsw <4 x i32> %wide.masked.load, %broadcast.splat11
@@ -285,11 +241,9 @@ define dso_local void @vector_add_const(i32* noalias nocapture %a, i32* noalias
 ; CHECK-NEXT:    it eq
 ; CHECK-NEXT:    popeq {r7, pc}
 ; CHECK-NEXT:  .LBB4_1: @ %vector.ph
-; CHECK-NEXT:    mov.w r12, #0
 ; CHECK-NEXT:    dlstp.32 lr, r3
 ; CHECK-NEXT:  .LBB4_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    add.w r12, r12, #4
 ; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
 ; CHECK-NEXT:    vadd.i32 q0, q0, r2
 ; CHECK-NEXT:    vstrw.32 q0, [r0], #16
@@ -303,23 +257,14 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %N, 3
   %n.vec = and i32 %n.rnd.up, -4
-  %trip.count.minus.1 = add i32 %N, -1
-  %broadcast.splatinsert8 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
-  %broadcast.splat9 = shufflevector <4 x i32> %broadcast.splatinsert8, <4 x i32> undef, <4 x i32> zeroinitializer
   %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %c, i32 0
   %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %0 = getelementptr inbounds i32, i32* %b, i32 %index
-
-  ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat9
   %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
-
   %2 = bitcast i32* %0 to <4 x i32>*
   %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef)
   %3 = add nsw <4 x i32> %wide.masked.load, %broadcast.splat11
@@ -342,11 +287,9 @@ define dso_local arm_aapcs_vfpcc void @vector_mul_vector_i8(i8* noalias nocaptur
 ; CHECK-NEXT:    it eq
 ; CHECK-NEXT:    popeq {r7, pc}
 ; CHECK-NEXT:  .LBB5_1: @ %vector.ph
-; CHECK-NEXT:    mov.w r12, #0
 ; CHECK-NEXT:    dlstp.8 lr, r3
 ; CHECK-NEXT:  .LBB5_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    add.w r12, r12, #16
 ; CHECK-NEXT:    vldrb.u8 q0, [r1], #16
 ; CHECK-NEXT:    vldrb.u8 q1, [r2], #16
 ; CHECK-NEXT:    vmul.i8 q0, q1, q0
@@ -361,21 +304,12 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %N, 15
   %n.vec = and i32 %n.rnd.up, -16
-  %trip.count.minus.1 = add i32 %N, -1
-  %broadcast.splatinsert12 = insertelement <16 x i32> undef, i32 %trip.count.minus.1, i32 0
-  %broadcast.splat13 = shufflevector <16 x i32> %broadcast.splatinsert12, <16 x i32> undef, <16 x i32> zeroinitializer
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %broadcast.splatinsert = insertelement <16 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <16 x i32> %broadcast.splatinsert, <16 x i32> undef, <16 x i32> zeroinitializer
-  %induction = add <16 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   %0 = getelementptr inbounds i8, i8* %b, i32 %index
-
-  ; %1 = icmp ule <16 x i32> %induction, %broadcast.splat13
   %1 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %N)
-
   %2 = bitcast i8* %0 to <16 x i8>*
   %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %2, i32 1, <16 x i1> %1, <16 x i8> undef)
   %3 = getelementptr inbounds i8, i8* %c, i32 %index
@@ -402,11 +336,9 @@ define dso_local arm_aapcs_vfpcc void @vector_mul_vector_i16(i16* noalias nocapt
 ; CHECK-NEXT:    it eq
 ; CHECK-NEXT:    popeq {r7, pc}
 ; CHECK-NEXT:  .LBB6_1: @ %vector.ph
-; CHECK-NEXT:    mov.w r12, #0
 ; CHECK-NEXT:    dlstp.16 lr, r3
 ; CHECK-NEXT:  .LBB6_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    add.w r12, r12, #8
 ; CHECK-NEXT:    vldrh.u16 q0, [r1], #16
 ; CHECK-NEXT:    vldrh.u16 q1, [r2], #16
 ; CHECK-NEXT:    vmul.i16 q0, q1, q0
@@ -421,21 +353,12 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %N, 7
   %n.vec = and i32 %n.rnd.up, -8
-  %trip.count.minus.1 = add i32 %N, -1
-  %broadcast.splatinsert12 = insertelement <8 x i32> undef, i32 %trip.count.minus.1, i32 0
-  %broadcast.splat13 = shufflevector <8 x i32> %broadcast.splatinsert12, <8 x i32> undef, <8 x i32> zeroinitializer
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
-  %induction = add <8 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %0 = getelementptr inbounds i16, i16* %b, i32 %index
-
-  ; %1 = icmp ule <8 x i32> %induction, %broadcast.splat13
   %1 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N)
-
   %2 = bitcast i16* %0 to <8 x i16>*
   %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %2, i32 2, <8 x i1> %1, <8 x i16> undef)
   %3 = getelementptr inbounds i16, i16* %c, i32 %index
diff --git a/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll b/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll
index e8da32611be..ec6a7554b3e 100644
--- a/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll
@@ -30,9 +30,6 @@ entry:
   br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
 
 vector.ph:                                        ; preds = %entry
-  %trip.count.minus.1 = add i32 %N, -1
-  %broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
-  %broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer
   %start = call i32 @llvm.start.loop.iterations.i32(i32 %5)
   br label %vector.body
 
@@ -44,13 +41,7 @@ vector.body:                                      ; preds = %vector.body, %vecto
   %6 = phi i32 [ %start, %vector.ph ], [ %10, %vector.body ]
   %lsr.iv24 = bitcast i32* %lsr.iv2 to <4 x i32>*
   %lsr.iv1 = bitcast i32* %lsr.iv to <4 x i32>*
-  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
-
-  ; %7 = icmp ule <4 x i32> %induction, %broadcast.splat12
   %7 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
-
   %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv24, i32 4, <4 x i1> %7, <4 x i32> undef)
   %wide.masked.load13 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1, i32 4, <4 x i1> %7, <4 x i32> undef)
   %8 = mul nsw <4 x i32> %wide.masked.load13, %wide.masked.load
diff --git a/test/CodeGen/Thumb2/mve-fma-loops.ll b/test/CodeGen/Thumb2/mve-fma-loops.ll
index 7609c16ea84..a34a278103a 100644
--- a/test/CodeGen/Thumb2/mve-fma-loops.ll
+++ b/test/CodeGen/Thumb2/mve-fma-loops.ll
@@ -11,11 +11,9 @@ define arm_aapcs_vfpcc void @fmas1(float* nocapture readonly %x, float* nocaptur
 ; CHECK-NEXT:    poplt {r4, pc}
 ; CHECK-NEXT:  .LBB0_1: @ %vector.ph
 ; CHECK-NEXT:    vmov r12, s0
-; CHECK-NEXT:    movs r4, #0
 ; CHECK-NEXT:    dlstp.32 lr, r3
 ; CHECK-NEXT:  .LBB0_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    adds r4, #4
 ; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
 ; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
 ; CHECK-NEXT:    vfmas.f32 q1, q0, r12
@@ -30,23 +28,14 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %n, 3
   %n.vec = and i32 %n.rnd.up, -4
-  %trip.count.minus.1 = add i32 %n, -1
-  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
-  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
   %broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0
   %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-  %induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %0 = getelementptr inbounds float, float* %x, i32 %index
-
-  ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
   %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
-
   %2 = bitcast float* %0 to <4 x float>*
   %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
   %3 = getelementptr inbounds float, float* %y, i32 %index
@@ -74,11 +63,9 @@ define arm_aapcs_vfpcc void @fmas2(float* nocapture readonly %x, float* nocaptur
 ; CHECK-NEXT:    poplt {r4, pc}
 ; CHECK-NEXT:  .LBB1_1: @ %vector.ph
 ; CHECK-NEXT:    vmov r12, s0
-; CHECK-NEXT:    movs r4, #0
 ; CHECK-NEXT:    dlstp.32 lr, r3
 ; CHECK-NEXT:  .LBB1_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    adds r4, #4
 ; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
 ; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
 ; CHECK-NEXT:    vfmas.f32 q1, q0, r12
@@ -93,23 +80,14 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %n, 3
   %n.vec = and i32 %n.rnd.up, -4
-  %trip.count.minus.1 = add i32 %n, -1
-  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
-  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
   %broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0
   %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-  %induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %0 = getelementptr inbounds float, float* %x, i32 %index
-
-  ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
   %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
-
   %2 = bitcast float* %0 to <4 x float>*
   %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
   %3 = getelementptr inbounds float, float* %y, i32 %index
@@ -138,11 +116,9 @@ define arm_aapcs_vfpcc void @fma1(float* nocapture readonly %x, float* nocapture
 ; CHECK-NEXT:    poplt {r4, pc}
 ; CHECK-NEXT:  .LBB2_1: @ %vector.ph
 ; CHECK-NEXT:    vmov r12, s0
-; CHECK-NEXT:    movs r4, #0
 ; CHECK-NEXT:    dlstp.32 lr, r3
 ; CHECK-NEXT:  .LBB2_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    adds r4, #4
 ; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
 ; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
 ; CHECK-NEXT:    vfma.f32 q1, q0, r12
@@ -157,23 +133,14 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %n, 3
   %n.vec = and i32 %n.rnd.up, -4
-  %trip.count.minus.1 = add i32 %n, -1
-  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
-  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
   %broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0
   %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-  %induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %0 = getelementptr inbounds float, float* %x, i32 %index
-
-  ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
   %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
-
   %2 = bitcast float* %0 to <4 x float>*
   %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
   %3 = getelementptr inbounds float, float* %y, i32 %index
@@ -201,11 +168,9 @@ define arm_aapcs_vfpcc void @fma2(float* nocapture readonly %x, float* nocapture
 ; CHECK-NEXT:    poplt {r4, pc}
 ; CHECK-NEXT:  .LBB3_1: @ %vector.ph
 ; CHECK-NEXT:    vmov r12, s0
-; CHECK-NEXT:    movs r4, #0
 ; CHECK-NEXT:    dlstp.32 lr, r3
 ; CHECK-NEXT:  .LBB3_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    adds r4, #4
 ; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
 ; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
 ; CHECK-NEXT:    vfma.f32 q1, q0, r12
@@ -220,23 +185,14 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %n, 3
   %n.vec = and i32 %n.rnd.up, -4
-  %trip.count.minus.1 = add i32 %n, -1
-  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
-  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
   %broadcast.splatinsert12 = insertelement <4 x float> undef, float %a, i32 0
   %broadcast.splat13 = shufflevector <4 x float> %broadcast.splatinsert12, <4 x float> undef, <4 x i32> zeroinitializer
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-  %induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %0 = getelementptr inbounds float, float* %x, i32 %index
-
-  ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
   %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
-
   %2 = bitcast float* %0 to <4 x float>*
   %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
   %3 = fmul fast <4 x float> %wide.masked.load, %broadcast.splat13
@@ -265,12 +221,10 @@ define arm_aapcs_vfpcc void @fmss1(float* nocapture readonly %x, float* nocaptur
 ; CHECK-NEXT:    poplt {r4, pc}
 ; CHECK-NEXT:  .LBB4_1: @ %vector.ph
 ; CHECK-NEXT:    vmov r12, s0
-; CHECK-NEXT:    movs r4, #0
 ; CHECK-NEXT:    eor r12, r12, #-2147483648
 ; CHECK-NEXT:    dlstp.32 lr, r3
 ; CHECK-NEXT:  .LBB4_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    adds r4, #4
 ; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
 ; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
 ; CHECK-NEXT:    vfmas.f32 q1, q0, r12
@@ -286,23 +240,14 @@ vector.ph:                                        ; preds = %entry
   %fneg = fneg fast float %a
   %n.rnd.up = add i32 %n, 3
   %n.vec = and i32 %n.rnd.up, -4
-  %trip.count.minus.1 = add i32 %n, -1
-  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
-  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
   %broadcast.splatinsert13 = insertelement <4 x float> undef, float %fneg, i32 0
   %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-  %induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %0 = getelementptr inbounds float, float* %x, i32 %index
-
-  ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
   %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
-
   %2 = bitcast float* %0 to <4 x float>*
   %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
   %3 = getelementptr inbounds float, float* %y, i32 %index
@@ -329,14 +274,12 @@ define arm_aapcs_vfpcc void @fmss2(float* nocapture readonly %x, float* nocaptur
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    poplt {r4, pc}
 ; CHECK-NEXT:  .LBB5_1: @ %vector.ph
-; CHECK-NEXT:    vmov lr, s0
-; CHECK-NEXT:    vdup.32 q0, lr
+; CHECK-NEXT:    vmov r12, s0
+; CHECK-NEXT:    vdup.32 q0, r12
 ; CHECK-NEXT:    vneg.f32 q0, q0
-; CHECK-NEXT:    mov.w r12, #0
 ; CHECK-NEXT:    dlstp.32 lr, r3
 ; CHECK-NEXT:  .LBB5_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    add.w r12, r12, #4
 ; CHECK-NEXT:    vmov q3, q0
 ; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
 ; CHECK-NEXT:    vldrw.u32 q2, [r1], #16
@@ -352,23 +295,14 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %n, 3
   %n.vec = and i32 %n.rnd.up, -4
-  %trip.count.minus.1 = add i32 %n, -1
-  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
-  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
   %broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0
   %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-  %induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %0 = getelementptr inbounds float, float* %x, i32 %index
-
-  ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
   %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
-
   %2 = bitcast float* %0 to <4 x float>*
   %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
   %3 = getelementptr inbounds float, float* %y, i32 %index
@@ -398,11 +332,9 @@ define arm_aapcs_vfpcc void @fmss3(float* nocapture readonly %x, float* nocaptur
 ; CHECK-NEXT:  .LBB6_1: @ %vector.ph
 ; CHECK-NEXT:    vmov r4, s0
 ; CHECK-NEXT:    vdup.32 q0, r4
-; CHECK-NEXT:    mov.w r12, #0
 ; CHECK-NEXT:    dlstp.32 lr, r3
 ; CHECK-NEXT:  .LBB6_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    add.w r12, r12, #4
 ; CHECK-NEXT:    vmov q3, q0
 ; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
 ; CHECK-NEXT:    vldrw.u32 q2, [r1], #16
@@ -418,23 +350,14 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %n, 3
   %n.vec = and i32 %n.rnd.up, -4
-  %trip.count.minus.1 = add i32 %n, -1
-  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
-  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
   %broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0
   %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-  %induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %0 = getelementptr inbounds float, float* %x, i32 %index
-
-  ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
   %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
-
   %2 = bitcast float* %0 to <4 x float>*
   %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
   %3 = getelementptr inbounds float, float* %y, i32 %index
@@ -464,11 +387,9 @@ define arm_aapcs_vfpcc void @fmss4(float* nocapture readonly %x, float* nocaptur
 ; CHECK-NEXT:  .LBB7_1: @ %vector.ph
 ; CHECK-NEXT:    vmov r4, s0
 ; CHECK-NEXT:    vdup.32 q0, r4
-; CHECK-NEXT:    mov.w r12, #0
 ; CHECK-NEXT:    dlstp.32 lr, r3
 ; CHECK-NEXT:  .LBB7_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    add.w r12, r12, #4
 ; CHECK-NEXT:    vmov q3, q0
 ; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
 ; CHECK-NEXT:    vldrw.u32 q2, [r1], #16
@@ -484,23 +405,14 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %n, 3
   %n.vec = and i32 %n.rnd.up, -4
-  %trip.count.minus.1 = add i32 %n, -1
-  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
-  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
   %broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0
   %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-  %induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %0 = getelementptr inbounds float, float* %x, i32 %index
-
-  ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
   %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
-
   %2 = bitcast float* %0 to <4 x float>*
   %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
   %3 = getelementptr inbounds float, float* %y, i32 %index
@@ -529,12 +441,10 @@ define arm_aapcs_vfpcc void @fms1(float* nocapture readonly %x, float* nocapture
 ; CHECK-NEXT:    poplt {r4, pc}
 ; CHECK-NEXT:  .LBB8_1: @ %vector.ph
 ; CHECK-NEXT:    vmov r12, s0
-; CHECK-NEXT:    movs r4, #0
 ; CHECK-NEXT:    eor r12, r12, #-2147483648
 ; CHECK-NEXT:    dlstp.32 lr, r3
 ; CHECK-NEXT:  .LBB8_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    adds r4, #4
 ; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
 ; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
 ; CHECK-NEXT:    vfma.f32 q1, q0, r12
@@ -550,23 +460,14 @@ vector.ph:                                        ; preds = %entry
   %fneg = fneg fast float %a
   %n.rnd.up = add i32 %n, 3
   %n.vec = and i32 %n.rnd.up, -4
-  %trip.count.minus.1 = add i32 %n, -1
-  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
-  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
   %broadcast.splatinsert13 = insertelement <4 x float> undef, float %fneg, i32 0
   %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-  %induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %0 = getelementptr inbounds float, float* %x, i32 %index
-
-  ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
   %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
-
   %2 = bitcast float* %0 to <4 x float>*
   %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
   %3 = getelementptr inbounds float, float* %y, i32 %index
@@ -595,11 +496,9 @@ define arm_aapcs_vfpcc void @fms2(float* nocapture readonly %x, float* nocapture
 ; CHECK-NEXT:  .LBB9_1: @ %vector.ph
 ; CHECK-NEXT:    vmov r4, s0
 ; CHECK-NEXT:    vdup.32 q0, r4
-; CHECK-NEXT:    mov.w r12, #0
 ; CHECK-NEXT:    dlstp.32 lr, r3
 ; CHECK-NEXT:  .LBB9_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    add.w r12, r12, #4
 ; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
 ; CHECK-NEXT:    vldrw.u32 q2, [r1], #16
 ; CHECK-NEXT:    vfms.f32 q2, q1, q0
@@ -614,23 +513,14 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %n, 3
   %n.vec = and i32 %n.rnd.up, -4
-  %trip.count.minus.1 = add i32 %n, -1
-  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
-  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
   %broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0
   %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-  %induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %0 = getelementptr inbounds float, float* %x, i32 %index
-
-  ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
   %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
-
   %2 = bitcast float* %0 to <4 x float>*
   %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
   %3 = getelementptr inbounds float, float* %y, i32 %index
@@ -659,16 +549,14 @@ define arm_aapcs_vfpcc void @fms3(float* nocapture readonly %x, float* nocapture
 ; CHECK-NEXT:    poplt {r4, pc}
 ; CHECK-NEXT:  .LBB10_1: @ %vector.ph
 ; CHECK-NEXT:    vmov r12, s0
-; CHECK-NEXT:    movs r4, #0
 ; CHECK-NEXT:    dlstp.32 lr, r3
 ; CHECK-NEXT:  .LBB10_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
-; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
-; CHECK-NEXT:    adds r4, #4
-; CHECK-NEXT:    vneg.f32 q1, q1
-; CHECK-NEXT:    vfma.f32 q1, q0, r12
-; CHECK-NEXT:    vstrw.32 q1, [r2], #16
+; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
+; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
+; CHECK-NEXT:    vneg.f32 q0, q0
+; CHECK-NEXT:    vfma.f32 q0, q1, r12
+; CHECK-NEXT:    vstrw.32 q0, [r2], #16
 ; CHECK-NEXT:    letp lr, .LBB10_2
 ; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
 ; CHECK-NEXT:    pop {r4, pc}
@@ -679,23 +567,14 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %n, 3
   %n.vec = and i32 %n.rnd.up, -4
-  %trip.count.minus.1 = add i32 %n, -1
-  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
-  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
   %broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0
   %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-  %induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %0 = getelementptr inbounds float, float* %x, i32 %index
-
-  ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
   %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
-
   %2 = bitcast float* %0 to <4 x float>*
   %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
   %3 = getelementptr inbounds float, float* %y, i32 %index
@@ -724,16 +603,14 @@ define arm_aapcs_vfpcc void @fms4(float* nocapture readonly %x, float* nocapture
 ; CHECK-NEXT:    poplt {r4, pc}
 ; CHECK-NEXT:  .LBB11_1: @ %vector.ph
 ; CHECK-NEXT:    vmov r12, s0
-; CHECK-NEXT:    movs r4, #0
 ; CHECK-NEXT:    dlstp.32 lr, r3
 ; CHECK-NEXT:  .LBB11_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
-; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
-; CHECK-NEXT:    adds r4, #4
-; CHECK-NEXT:    vneg.f32 q1, q1
-; CHECK-NEXT:    vfma.f32 q1, q0, r12
-; CHECK-NEXT:    vstrw.32 q1, [r2], #16
+; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
+; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
+; CHECK-NEXT:    vneg.f32 q0, q0
+; CHECK-NEXT:    vfma.f32 q0, q1, r12
+; CHECK-NEXT:    vstrw.32 q0, [r2], #16
 ; CHECK-NEXT:    letp lr, .LBB11_2
 ; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
 ; CHECK-NEXT:    pop {r4, pc}
@@ -744,23 +621,14 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %n, 3
   %n.vec = and i32 %n.rnd.up, -4
-  %trip.count.minus.1 = add i32 %n, -1
-  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
-  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
   %broadcast.splatinsert12 = insertelement <4 x float> undef, float %a, i32 0
   %broadcast.splat13 = shufflevector <4 x float> %broadcast.splatinsert12, <4 x float> undef, <4 x i32> zeroinitializer
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-  %induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %0 = getelementptr inbounds float, float* %x, i32 %index
-
-  ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
   %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
-
   %2 = bitcast float* %0 to <4 x float>*
   %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
   %3 = fmul fast <4 x float> %wide.masked.load, %broadcast.splat13
diff --git a/test/CodeGen/Thumb2/mve-gather-increment.ll b/test/CodeGen/Thumb2/mve-gather-increment.ll
index eb088263117..13d47becdda 100644
--- a/test/CodeGen/Thumb2/mve-gather-increment.ll
+++ b/test/CodeGen/Thumb2/mve-gather-increment.ll
@@ -232,7 +232,6 @@ define arm_aapcs_vfpcc void @gather_pre_inc(i32* noalias nocapture readonly %dat
 ; CHECK-NEXT:    .long 4294967272 @ 0xffffffe8
 ; CHECK-NEXT:    .long 0 @ 0x0
 vector.ph:                                        ; preds = %for.body.preheader
-  %ind.end = shl i32 %n.vec, 1
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
@@ -276,7 +275,6 @@ define arm_aapcs_vfpcc void @gather_post_inc(i32* noalias nocapture readonly %da
 ; CHECK-NEXT:    .long 4294967248 @ 0xffffffd0
 ; CHECK-NEXT:    .long 4294967272 @ 0xffffffe8
 vector.ph41:                                      ; preds = %for.body6.preheader
-  %ind.end47 = shl i32 %n.vec43, 1
   br label %vector.body39
 
 vector.body39:                                    ; preds = %vector.body39, %vector.ph41
diff --git a/test/CodeGen/Thumb2/mve-gather-optimisation-deep.ll b/test/CodeGen/Thumb2/mve-gather-optimisation-deep.ll
index 4c5bcd836c3..83f788df61c 100644
--- a/test/CodeGen/Thumb2/mve-gather-optimisation-deep.ll
+++ b/test/CodeGen/Thumb2/mve-gather-optimisation-deep.ll
@@ -1,18 +1,15 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-
-
-; RUN: opt --arm-mve-gather-scatter-lowering -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp %s -S -o 2>/dev/null - | FileCheck %s
+; RUN: opt --arm-mve-gather-scatter-lowering -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp %s -S -o - | FileCheck %s
 
 define arm_aapcs_vfpcc void @push_out_add_sub_block(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, i32 %n.vec) {
 ; CHECK-LABEL: @push_out_add_sub_block(
 ; CHECK-NEXT:  vector.ph:
-; CHECK-NEXT:    [[IND_END:%.*]] = shl i32 [[N_VEC:%.*]], 1
 ; CHECK-NEXT:    [[PUSHEDOUTADD:%.*]] = add <4 x i32> <i32 0, i32 2, i32 4, i32 6>, <i32 6, i32 6, i32 6, i32 6>
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY_END:%.*]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ [[PUSHEDOUTADD]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY_END]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq i32 [[INDEX]], 50
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq i32 [[INDEX]], 48
 ; CHECK-NEXT:    br i1 [[TMP0]], label [[LOWER_BLOCK:%.*]], label [[END:%.*]]
 ; CHECK:       lower.block:
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i32.v4i32(i32* [[DATA:%.*]], <4 x i32> [[VEC_IND]], i32 32, i32 2, i32 1)
@@ -23,20 +20,19 @@ define arm_aapcs_vfpcc void @push_out_add_sub_block(i32* noalias nocapture reado
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8>
 ; CHECK-NEXT:    br label [[VECTOR_BODY_END]]
 ; CHECK:       vector.body.end:
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC:%.*]]
 ; CHECK-NEXT:    br i1 [[TMP4]], label [[END]], label [[VECTOR_BODY]]
 ; CHECK:       end:
 ; CHECK-NEXT:    ret void
 ;
 
 vector.ph:
-  %ind.end = shl i32 %n.vec, 1
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body.end ]
   %vec.ind = phi <4 x i32> [ <i32 0, i32 2, i32 4, i32 6>, %vector.ph ], [ %vec.ind.next, %vector.body.end ]
-  %0 = icmp eq i32 %index, 50
+  %0 = icmp eq i32 %index, 48
   br i1 %0, label %lower.block, label %end
 
 lower.block:                             ; preds = %vector.body
@@ -61,7 +57,6 @@ end:
 define arm_aapcs_vfpcc void @push_out_mul_sub_block(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, i32 %n.vec) {
 ; CHECK-LABEL: @push_out_mul_sub_block(
 ; CHECK-NEXT:  vector.ph:
-; CHECK-NEXT:    [[IND_END:%.*]] = shl i32 [[N_VEC:%.*]], 1
 ; CHECK-NEXT:    [[PUSHEDOUTMUL:%.*]] = mul <4 x i32> <i32 0, i32 2, i32 4, i32 6>, <i32 3, i32 3, i32 3, i32 3>
 ; CHECK-NEXT:    [[PRODUCT:%.*]] = mul <4 x i32> <i32 8, i32 8, i32 8, i32 8>, <i32 3, i32 3, i32 3, i32 3>
 ; CHECK-NEXT:    [[PUSHEDOUTADD:%.*]] = add <4 x i32> [[PUSHEDOUTMUL]], <i32 6, i32 6, i32 6, i32 6>
@@ -69,7 +64,7 @@ define arm_aapcs_vfpcc void @push_out_mul_sub_block(i32* noalias nocapture reado
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY_END:%.*]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ [[PUSHEDOUTADD]], [[VECTOR_PH]] ], [ [[INCREMENTPUSHEDOUTMUL:%.*]], [[VECTOR_BODY_END]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq i32 [[INDEX]], 50
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq i32 [[INDEX]], 48
 ; CHECK-NEXT:    br i1 [[TMP0]], label [[LOWER_BLOCK:%.*]], label [[END:%.*]]
 ; CHECK:       lower.block:
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i32.v4i32(i32* [[DATA:%.*]], <4 x i32> [[VEC_IND]], i32 32, i32 2, i32 1)
@@ -80,20 +75,19 @@ define arm_aapcs_vfpcc void @push_out_mul_sub_block(i32* noalias nocapture reado
 ; CHECK-NEXT:    br label [[VECTOR_BODY_END]]
 ; CHECK:       vector.body.end:
 ; CHECK-NEXT:    [[INCREMENTPUSHEDOUTMUL]] = add <4 x i32> [[VEC_IND]], [[PRODUCT]]
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC:%.*]]
 ; CHECK-NEXT:    br i1 [[TMP4]], label [[END]], label [[VECTOR_BODY]]
 ; CHECK:       end:
 ; CHECK-NEXT:    ret void
 ;
 
 vector.ph:
-  %ind.end = shl i32 %n.vec, 1
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body.end ]
   %vec.ind = phi <4 x i32> [ <i32 0, i32 2, i32 4, i32 6>, %vector.ph ], [ %vec.ind.next, %vector.body.end ]
-  %0 = icmp eq i32 %index, 50
+  %0 = icmp eq i32 %index, 48
   br i1 %0, label %lower.block, label %end
 
 lower.block:                             ; preds = %vector.body
@@ -120,7 +114,6 @@ end:
 define arm_aapcs_vfpcc void @push_out_mul_sub_loop(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, i32 %n.vec) {
 ; CHECK-LABEL: @push_out_mul_sub_loop(
 ; CHECK-NEXT:  vector.ph:
-; CHECK-NEXT:    [[IND_END:%.*]] = shl i32 [[N_VEC:%.*]], 2
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY_END:%.*]] ]
@@ -138,19 +131,18 @@ define arm_aapcs_vfpcc void @push_out_mul_sub_loop(i32* noalias nocapture readon
 ; CHECK-NEXT:    br label [[VECTOR_2_BODY_END:%.*]]
 ; CHECK:       vector.2.body.end:
 ; CHECK-NEXT:    [[INDEX_2_NEXT:%.*]] = add i32 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[INDEX_2_NEXT]], 15
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[INDEX_2_NEXT]], 16
 ; CHECK-NEXT:    br i1 [[TMP5]], label [[VECTOR_BODY_END]], label [[VECTOR_2_BODY]]
 ; CHECK:       vector.body.end:
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8>
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC:%.*]]
 ; CHECK-NEXT:    br i1 [[TMP6]], label [[END:%.*]], label [[VECTOR_BODY]]
 ; CHECK:       end:
 ; CHECK-NEXT:    ret void
 ;
 
 vector.ph:
-  %ind.end = shl i32 %n.vec, 2
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
@@ -162,7 +154,6 @@ vector.2.ph:
   br label %vector.2.body
 
 vector.2.body:                             ; preds = %vector.body
-  %index.2 = phi i32 [ 0, %vector.2.ph ], [ %index.2.next, %vector.2.body.end ]
   %0 = mul <4 x i32> %vec.ind, <i32 3, i32 3, i32 3, i32 3>
   %1 = add <4 x i32> %0, <i32 6, i32 6, i32 6, i32 6>
   %2 = getelementptr inbounds i32, i32* %data, <4 x i32> %1
@@ -174,7 +165,7 @@ vector.2.body:                             ; preds = %vector.body
 
 vector.2.body.end:                             ; preds = %lower.block
   %index.2.next = add i32 %index, 4
-  %5 = icmp eq i32 %index.2.next, 15
+  %5 = icmp eq i32 %index.2.next, 16
   br i1 %5, label %vector.body.end, label %vector.2.body
 
 vector.body.end:                             ; preds = %lower.block
diff --git a/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll b/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll
index d6e505e1707..32a183cdac1 100644
--- a/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll
+++ b/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll
@@ -39,7 +39,6 @@ define arm_aapcs_vfpcc void @push_out_mul_gather(i32* noalias nocapture readonly
 ; CHECK-NEXT:    .long 4294967272 @ 0xffffffe8
 
 vector.ph:                                        ; preds = %for.body.preheader
-  %ind.end = shl i32 %n.vec, 1
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
@@ -83,7 +82,6 @@ define arm_aapcs_vfpcc void @push_out_add_gather(i32* noalias nocapture readonly
 ; CHECK-NEXT:    .long 16 @ 0x10
 
 vector.ph:                                        ; preds = %for.body.preheader
-  %ind.end = shl i32 %n.vec, 1
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
@@ -127,7 +125,6 @@ define arm_aapcs_vfpcc void @push_out_mul_add_gather(i32* noalias nocapture read
 ; CHECK-NEXT:    .long 0 @ 0x0
 
 vector.ph:                                        ; preds = %for.body.preheader
-  %ind.end = shl i32 %n.vec, 1
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
@@ -173,7 +170,6 @@ define arm_aapcs_vfpcc void @push_out_mul_scatter(i32* noalias nocapture readonl
                                                   <4 x i32> %to.store) {
 
 vector.ph:                                        ; preds = %for.body.preheader
-  %ind.end = shl i32 %n.vec, 1
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
@@ -215,7 +211,6 @@ define arm_aapcs_vfpcc void @push_out_add_scatter(i32* noalias nocapture readonl
                                                   <4 x i32> %to.store) {
 
 vector.ph:                                        ; preds = %for.body.preheader
-  %ind.end = shl i32 %n.vec, 1
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
@@ -259,7 +254,6 @@ define arm_aapcs_vfpcc void @push_out_mul_gather_scatter(i32* noalias nocapture
                                                          i32* noalias nocapture %dst, i32 %n.vec) {
 
 vector.ph:                                        ; preds = %for.body.preheader
-  %ind.end = shl i32 %n.vec, 1
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
@@ -301,7 +295,6 @@ define arm_aapcs_vfpcc void @push_out_add_sub_block(i32* noalias nocapture reado
 ; CHECK-NEXT:    .long 16 @ 0x10
 
 vector.ph:                                        ; preds = %for.body.preheader
-  %ind.end = shl i32 %n.vec, 1
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
@@ -358,7 +351,6 @@ define arm_aapcs_vfpcc void @non_gatscat_use1(i32* noalias nocapture readonly %d
 ; CHECK-NEXT:    .long 6 @ 0x6
 
 vector.ph:                                        ; preds = %for.body.preheader
-  %ind.end = shl i32 %n.vec, 1
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
@@ -411,7 +403,6 @@ define arm_aapcs_vfpcc void @non_gatscat_use2(i32* noalias nocapture readonly %d
 ; CHECK-NEXT:    .long 6 @ 0x6
 
 vector.ph:                                        ; preds = %for.body.preheader
-  %ind.end = shl i32 %n.vec, 1
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
@@ -528,7 +519,6 @@ for.cond8.preheader.us.us.preheader.preheader:    ; preds = %entry
   %2 = add nuw i32 %1, 1
   %min.iters.check = icmp ult i32 %0, 6
   %n.vec = and i32 %2, -4
-  %ind.end = shl i32 %n.vec, 1
   %broadcast.splatinsert86 = insertelement <4 x i32> undef, i32 %m, i32 0
   %broadcast.splat87 = shufflevector <4 x i32> %broadcast.splatinsert86, <4 x i32> undef, <4 x i32> zeroinitializer
   %cmp.n = icmp eq i32 %2, %n.vec
@@ -983,7 +973,6 @@ for.body10.i:                                     ; preds = %for.cond.cleanup20.
   br i1 0, label %for.cond.cleanup20.i, label %for.cond22.preheader.lr.ph.i
 
 for.cond22.preheader.lr.ph.i:                     ; preds = %for.body10.i
-  %ind.end = add nsw i32 0, %n.vec
   %.splatinsert = insertelement <4 x i32> undef, i32 0, i32 0
   %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
   %induction = add <4 x i32> %.splat, <i32 0, i32 1, i32 2, i32 3>
diff --git a/test/CodeGen/Thumb2/mve-vecreduce-loops.ll b/test/CodeGen/Thumb2/mve-vecreduce-loops.ll
index e7886dca32f..7bcc0193217 100644
--- a/test/CodeGen/Thumb2/mve-vecreduce-loops.ll
+++ b/test/CodeGen/Thumb2/mve-vecreduce-loops.ll
@@ -1667,7 +1667,6 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %n, 3
   %n.vec = and i32 %n.rnd.up, -4
-  %trip.count.minus.1 = add i32 %n, -1
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
@@ -1718,7 +1717,6 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %n, 3
   %n.vec = and i32 %n.rnd.up, -4
-  %trip.count.minus.1 = add i32 %n, -1
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
@@ -1772,7 +1770,6 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %n, 7
   %n.vec = and i32 %n.rnd.up, -8
-  %trip.count.minus.1 = add i32 %n, -1
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
@@ -1824,7 +1821,6 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %n, 7
   %n.vec = and i32 %n.rnd.up, -8
-  %trip.count.minus.1 = add i32 %n, -1
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
@@ -1880,7 +1876,6 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %n, 15
   %n.vec = and i32 %n.rnd.up, -16
-  %trip.count.minus.1 = add i32 %n, -1
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
@@ -1932,7 +1927,6 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %n, 15
   %n.vec = and i32 %n.rnd.up, -16
-  %trip.count.minus.1 = add i32 %n, -1
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
@@ -1988,7 +1982,6 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %n, 7
   %n.vec = and i32 %n.rnd.up, -8
-  %trip.count.minus.1 = add i32 %n, -1
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
@@ -2039,7 +2032,6 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %n, 7
   %n.vec = and i32 %n.rnd.up, -8
-  %trip.count.minus.1 = add i32 %n, -1
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
@@ -2093,7 +2085,6 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %n, 15
   %n.vec = and i32 %n.rnd.up, -16
-  %trip.count.minus.1 = add i32 %n, -1
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
@@ -2145,7 +2136,6 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %n, 15
   %n.vec = and i32 %n.rnd.up, -16
-  %trip.count.minus.1 = add i32 %n, -1
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
@@ -2201,7 +2191,6 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %n, 15
   %n.vec = and i32 %n.rnd.up, -16
-  %trip.count.minus.1 = add i32 %n, -1
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
@@ -2252,7 +2241,6 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %n, 15
   %n.vec = and i32 %n.rnd.up, -16
-  %trip.count.minus.1 = add i32 %n, -1
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
@@ -2308,7 +2296,6 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %n, 3
   %n.vec = and i32 %n.rnd.up, -4
-  %trip.count.minus.1 = add i32 %n, -1
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
@@ -2362,7 +2349,6 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %n, 3
   %n.vec = and i32 %n.rnd.up, -4
-  %trip.count.minus.1 = add i32 %n, -1
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
@@ -2421,7 +2407,6 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %n, 7
   %n.vec = and i32 %n.rnd.up, -8
-  %trip.count.minus.1 = add i32 %n, -1
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph