diff --git a/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll b/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll index a664f819d46..3de54247ad8 100644 --- a/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll +++ b/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll @@ -47,22 +47,13 @@ entry: vector.ph: ; preds = %entry %n.rnd.up = add i32 %N, 3 %n.vec = and i32 %n.rnd.up, -4 - %trip.count.minus.1 = add i32 %N, -1 - %broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 - %broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %add, %vector.body ] - %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 - %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer - %induction = add <4 x i32> %broadcast.splat, %tmp = getelementptr inbounds i32, i32* %a, i32 %index - -; %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat12 %tmp1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N) - %tmp2 = bitcast i32* %tmp to <4 x i32>* %wide.masked.load.a = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef) %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index @@ -145,22 +136,13 @@ entry: vector.ph: ; preds = %entry %n.rnd.up = add i32 %N, 3 %n.vec = and i32 %n.rnd.up, -4 - %trip.count.minus.1 = add i32 %N, -1 - %broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 - %broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %add, %vector.body ] - %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 - %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer - %induction = add <4 x i32> %broadcast.splat, %tmp = getelementptr inbounds i32, i32* %a, i32 %index - -; %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat12 %tmp1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N) - %tmp2 = bitcast i32* %tmp to <4 x i32>* %wide.masked.load.a = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef) %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index @@ -203,13 +185,13 @@ define dso_local i32 @and_mul_reduce_add(i32* noalias nocapture readonly %a, i32 ; CHECK-NEXT: cmp.w r12, #0 ; CHECK-NEXT: beq .LBB2_4 ; CHECK-NEXT: @ %bb.1: @ %vector.ph -; CHECK-NEXT: add.w r4, r12, #3 -; CHECK-NEXT: vmov.i32 q1, #0x0 -; CHECK-NEXT: bic r4, r4, #3 -; CHECK-NEXT: sub.w lr, r4, #4 +; CHECK-NEXT: add.w lr, r12, #3 ; CHECK-NEXT: movs r4, #1 -; CHECK-NEXT: add.w lr, r4, lr, lsr #2 -; CHECK-NEXT: movs r4, #0 +; CHECK-NEXT: bic lr, lr, #3 +; CHECK-NEXT: vmov.i32 q1, #0x0 +; CHECK-NEXT: sub.w lr, lr, #4 +; CHECK-NEXT: add.w r4, r4, lr, lsr #2 +; CHECK-NEXT: dls lr, r4 ; CHECK-NEXT: .LBB2_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.32 r12 @@ -219,12 +201,11 @@ define dso_local i32 @and_mul_reduce_add(i32* noalias nocapture readonly %a, i32 ; CHECK-NEXT: vldrwt.u32 q2, [r0], #16 ; CHECK-NEXT: vstr p0, [sp] @ 4-byte Spill ; CHECK-NEXT: vsub.i32 q1, q2, q1 -; CHECK-NEXT: adds r4, #4 +; CHECK-NEXT: sub.w r12, r12, #4 ; CHECK-NEXT: vpsttt ; CHECK-NEXT: vcmpt.i32 eq, q1, zr ; CHECK-NEXT: vldrwt.u32 q1, [r3], #16 ; CHECK-NEXT: vldrwt.u32 q2, [r2], #16 -; CHECK-NEXT: sub.w r12, r12, #4 ; CHECK-NEXT: vmul.i32 q1, q2, q1 ; CHECK-NEXT: vadd.i32 q1, q1, q0 ; CHECK-NEXT: le lr, .LBB2_2 @@ -246,22 +227,13 @@ entry: vector.ph: ; preds = %entry %n.rnd.up = add i32 %N, 3 %n.vec = and i32 %n.rnd.up, -4 - %trip.count.minus.1 = add i32 %N, -1 - %broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 - %broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %add, %vector.body ] - %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 - %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer - %induction = add <4 x i32> %broadcast.splat, %tmp = getelementptr inbounds i32, i32* %a, i32 %index - -; %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat12 %tmp1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N) - %tmp2 = bitcast i32* %tmp to <4 x i32>* %wide.masked.load.a = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef) %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index @@ -301,13 +273,13 @@ define dso_local i32 @or_mul_reduce_add(i32* noalias nocapture readonly %a, i32* ; CHECK-NEXT: cmp.w r12, #0 ; CHECK-NEXT: beq .LBB3_4 ; CHECK-NEXT: @ %bb.1: @ %vector.ph -; CHECK-NEXT: add.w r4, r12, #3 -; CHECK-NEXT: vmov.i32 q1, #0x0 -; CHECK-NEXT: bic r4, r4, #3 -; CHECK-NEXT: sub.w lr, r4, #4 +; CHECK-NEXT: add.w lr, r12, #3 ; CHECK-NEXT: movs r4, #1 -; CHECK-NEXT: add.w lr, r4, lr, lsr #2 -; CHECK-NEXT: movs r4, #0 +; CHECK-NEXT: bic lr, lr, #3 +; CHECK-NEXT: vmov.i32 q1, #0x0 +; CHECK-NEXT: sub.w lr, lr, #4 +; CHECK-NEXT: add.w r4, r4, lr, lsr #2 +; CHECK-NEXT: dls lr, r4 ; CHECK-NEXT: .LBB3_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.32 r12 @@ -322,9 +294,8 @@ define dso_local i32 @or_mul_reduce_add(i32* noalias nocapture readonly %a, i32* ; CHECK-NEXT: vcmpt.i32 ne, q1, zr ; CHECK-NEXT: vldrwe.u32 q1, [r3], #16 ; CHECK-NEXT: vldrwe.u32 q2, [r2], #16 -; CHECK-NEXT: adds r4, #4 -; CHECK-NEXT: vmul.i32 q1, q2, q1 ; CHECK-NEXT: sub.w r12, r12, #4 +; CHECK-NEXT: vmul.i32 q1, q2, q1 ; CHECK-NEXT: vadd.i32 q1, q1, q0 ; CHECK-NEXT: le lr, .LBB3_2 ; CHECK-NEXT: @ %bb.3: @ %middle.block @@ -344,22 +315,13 @@ entry: vector.ph: ; preds = %entry %n.rnd.up = add i32 %N, 3 %n.vec = and i32 %n.rnd.up, -4 - %trip.count.minus.1 = add i32 %N, -1 - %broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 - %broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %add, %vector.body ] - %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 - %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer - %induction = add <4 x i32> %broadcast.splat, %tmp = getelementptr inbounds i32, i32* %a, i32 %index - -; %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat12 %tmp1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N) - %tmp2 = bitcast i32* %tmp to <4 x i32>* %wide.masked.load.a = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef) %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index @@ -398,11 +360,9 @@ define dso_local void @continue_on_zero(i32* noalias nocapture %arg, i32* noalia ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} ; CHECK-NEXT: .LBB4_1: @ %bb3 -; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: dlstp.32 lr, r2 ; CHECK-NEXT: .LBB4_2: @ %bb9 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: adds r3, #4 ; CHECK-NEXT: vldrw.u32 q0, [r1], #16 ; CHECK-NEXT: vpt.i32 ne, q0, zr ; CHECK-NEXT: vldrwt.u32 q1, [r0] @@ -419,21 +379,12 @@ bb: bb3: ; preds = %bb %tmp4 = add i32 %arg2, 3 %tmp5 = and i32 %tmp4, -4 - %tmp6 = add i32 %arg2, -1 - %tmp7 = insertelement <4 x i32> undef, i32 %tmp6, i32 0 - %tmp8 = shufflevector <4 x i32> %tmp7, <4 x i32> undef, <4 x i32> zeroinitializer br label %bb9 bb9: ; preds = %bb9, %bb3 %tmp10 = phi i32 [ 0, %bb3 ], [ %tmp25, %bb9 ] - %tmp11 = insertelement <4 x i32> undef, i32 %tmp10, i32 0 - %tmp12 = shufflevector <4 x i32> %tmp11, <4 x i32> undef, <4 x i32> zeroinitializer - %tmp13 = add <4 x i32> %tmp12, %tmp14 = getelementptr inbounds i32, i32* %arg1, i32 %tmp10 - - ; %tmp15 = icmp ule <4 x i32> %tmp13, %tmp8 %tmp15 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %tmp10, i32 %arg2) - %tmp16 = bitcast i32* %tmp14 to <4 x i32>* %tmp17 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp16, i32 4, <4 x i1> %tmp15, <4 x i32> undef) %tmp18 = icmp ne <4 x i32> %tmp17, zeroinitializer @@ -460,7 +411,6 @@ define dso_local arm_aapcs_vfpcc void @range_test(i32* noalias nocapture %arg, i ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} ; CHECK-NEXT: .LBB5_1: @ %bb4 -; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: .LBB5_2: @ %bb12 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 @@ -468,7 +418,6 @@ define dso_local arm_aapcs_vfpcc void @range_test(i32* noalias nocapture %arg, i ; CHECK-NEXT: vptt.i32 ne, q0, zr ; CHECK-NEXT: vcmpt.s32 le, q0, r2 ; CHECK-NEXT: vldrwt.u32 q1, [r1], #16 -; CHECK-NEXT: add.w r12, r12, #4 ; CHECK-NEXT: vmul.i32 q0, q1, q0 ; CHECK-NEXT: vpst ; CHECK-NEXT: vstrwt.32 q0, [r0], #16 @@ -482,23 +431,14 @@ bb: bb4: ; preds = %bb %tmp5 = add i32 %arg3, 3 %tmp6 = and i32 %tmp5, -4 - %tmp7 = add i32 %arg3, -1 - %tmp8 = insertelement <4 x i32> undef, i32 %tmp7, i32 0 - %tmp9 = shufflevector <4 x i32> %tmp8, <4 x i32> undef, <4 x i32> zeroinitializer %tmp10 = insertelement <4 x i32> undef, i32 %arg2, i32 0 %tmp11 = shufflevector <4 x i32> %tmp10, <4 x i32> undef, <4 x i32> zeroinitializer br label %bb12 bb12: ; preds = %bb12, %bb4 %tmp13 = phi i32 [ 0, %bb4 ], [ %tmp30, %bb12 ] - %tmp14 = insertelement <4 x i32> undef, i32 %tmp13, i32 0 - %tmp15 = shufflevector <4 x i32> %tmp14, <4 x i32> undef, <4 x i32> zeroinitializer - %tmp16 = add <4 x i32> %tmp15, %tmp17 = getelementptr inbounds i32, i32* %arg, i32 %tmp13 - - ; %tmp18 = icmp ule <4 x i32> %tmp16, %tmp9 %tmp18= call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %tmp13, i32 %arg3) - %tmp19 = bitcast i32* %tmp17 to <4 x i32>* %tmp20 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp19, i32 4, <4 x i1> %tmp18, <4 x i32> undef) %tmp21 = icmp ne <4 x i32> %tmp20, zeroinitializer diff --git a/test/CodeGen/Thumb2/LowOverheadLoops/extending-loads.ll b/test/CodeGen/Thumb2/LowOverheadLoops/extending-loads.ll index 2627965913e..01564487b57 100644 --- a/test/CodeGen/Thumb2/LowOverheadLoops/extending-loads.ll +++ b/test/CodeGen/Thumb2/LowOverheadLoops/extending-loads.ll @@ -9,11 +9,9 @@ define dso_local arm_aapcs_vfpcc void @sext_i8(i16* noalias nocapture %a, i8* no ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} ; CHECK-NEXT: .LBB0_1: @ %vector.ph -; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: dlstp.16 lr, r2 ; CHECK-NEXT: .LBB0_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: adds r3, #8 ; CHECK-NEXT: vldrb.s16 q0, [r1], #8 ; CHECK-NEXT: vldrh.u16 q1, [r0] ; CHECK-NEXT: vadd.i16 q0, q1, q0 @@ -28,21 +26,12 @@ entry: vector.ph: ; preds = %entry %n.rnd.up = add i32 %N, 7 %n.vec = and i32 %n.rnd.up, -8 - %trip.count.minus.1 = add i32 %N, -1 - %broadcast.splatinsert10 = insertelement <8 x i32> undef, i32 %trip.count.minus.1, i32 0 - %broadcast.splat11 = shufflevector <8 x i32> %broadcast.splatinsert10, <8 x i32> undef, <8 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] - %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0 - %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer - %induction = or <8 x i32> %broadcast.splat, %0 = getelementptr inbounds i8, i8* %b, i32 %index - - ; %1 = icmp ule <8 x i32> %induction, %broadcast.splat11 %1 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N) - %2 = bitcast i8* %0 to <8 x i8>* %wide.masked.load = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %2, i32 1, <8 x i1> %1, <8 x i8> undef) %3 = sext <8 x i8> %wide.masked.load to <8 x i16> @@ -69,11 +58,9 @@ define dso_local arm_aapcs_vfpcc void @zext_i8(i16* noalias nocapture %a, i8* no ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} ; CHECK-NEXT: .LBB1_1: @ %vector.ph -; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: dlstp.16 lr, r2 ; CHECK-NEXT: .LBB1_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: adds r3, #8 ; CHECK-NEXT: vldrb.u16 q0, [r1], #8 ; CHECK-NEXT: vldrh.u16 q1, [r0] ; CHECK-NEXT: vadd.i16 q0, q1, q0 @@ -88,21 +75,12 @@ entry: vector.ph: ; preds = %entry %n.rnd.up = add i32 %N, 7 %n.vec = and i32 %n.rnd.up, -8 - %trip.count.minus.1 = add i32 %N, -1 - %broadcast.splatinsert10 = insertelement <8 x i32> undef, i32 %trip.count.minus.1, i32 0 - %broadcast.splat11 = shufflevector <8 x i32> %broadcast.splatinsert10, <8 x i32> undef, <8 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] - %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0 - %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer - %induction = or <8 x i32> %broadcast.splat, %0 = getelementptr inbounds i8, i8* %b, i32 %index - - ; %1 = icmp ule <8 x i32> %induction, %broadcast.splat11 %1 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N) - %2 = bitcast i8* %0 to <8 x i8>* %wide.masked.load = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %2, i32 1, <8 x i1> %1, <8 x i8> undef) %3 = zext <8 x i8> %wide.masked.load to <8 x i16> @@ -129,11 +107,9 @@ define dso_local arm_aapcs_vfpcc void @sext_i16(i32* noalias nocapture %a, i16* ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} ; CHECK-NEXT: .LBB2_1: @ %vector.ph -; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: dlstp.32 lr, r2 ; CHECK-NEXT: .LBB2_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: adds r3, #4 ; CHECK-NEXT: vldrh.s32 q0, [r1], #8 ; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: vadd.i32 q0, q1, q0 @@ -148,21 +124,12 @@ entry: vector.ph: ; preds = %entry %n.rnd.up = add i32 %N, 3 %n.vec = and i32 %n.rnd.up, -4 - %trip.count.minus.1 = add i32 %N, -1 - %broadcast.splatinsert8 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 - %broadcast.splat9 = shufflevector <4 x i32> %broadcast.splatinsert8, <4 x i32> undef, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] - %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 - %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer - %induction = or <4 x i32> %broadcast.splat, %0 = getelementptr inbounds i16, i16* %b, i32 %index - - ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat9 %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N) - %2 = bitcast i16* %0 to <4 x i16>* %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %2, i32 2, <4 x i1> %1, <4 x i16> undef) %3 = sext <4 x i16> %wide.masked.load to <4 x i32> @@ -189,11 +156,9 @@ define dso_local arm_aapcs_vfpcc void @zext_i16(i32* noalias nocapture %a, i16* ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} ; CHECK-NEXT: .LBB3_1: @ %vector.ph -; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: dlstp.32 lr, r2 ; CHECK-NEXT: .LBB3_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: adds r3, #4 ; CHECK-NEXT: vldrh.u32 q0, [r1], #8 ; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: vadd.i32 q0, q1, q0 @@ -208,21 +173,12 @@ entry: vector.ph: ; preds = %entry %n.rnd.up = add i32 %N, 3 %n.vec = and i32 %n.rnd.up, -4 - %trip.count.minus.1 = add i32 %N, -1 - %broadcast.splatinsert8 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 - %broadcast.splat9 = shufflevector <4 x i32> %broadcast.splatinsert8, <4 x i32> undef, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] - %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 - %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer - %induction = or <4 x i32> %broadcast.splat, %0 = getelementptr inbounds i16, i16* %b, i32 %index - - ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat9 %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N) - %2 = bitcast i16* %0 to <4 x i16>* %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %2, i32 2, <4 x i1> %1, <4 x i16> undef) %3 = zext <4 x i16> %wide.masked.load to <4 x i32> diff --git a/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll b/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll index faf4a2e5998..f101dd4fcec 100644 --- a/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll +++ b/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll @@ -6,7 +6,7 @@ define arm_aapcs_vfpcc void @fast_float_mul(float* nocapture %a, float* nocaptur ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: push {r4, r5, r6, r7, lr} ; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: beq.w .LBB0_11 +; CHECK-NEXT: beq .LBB0_11 ; CHECK-NEXT: @ %bb.1: @ %vector.memcheck ; CHECK-NEXT: add.w r5, r0, r3, lsl #2 ; CHECK-NEXT: add.w r4, r2, r3, lsl #2 @@ -32,11 +32,9 @@ define arm_aapcs_vfpcc void @fast_float_mul(float* nocapture %a, float* nocaptur ; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: b .LBB0_8 ; CHECK-NEXT: .LBB0_4: @ %vector.ph -; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: .LBB0_5: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: add.w r12, r12, #4 ; CHECK-NEXT: vldrw.u32 q0, [r1], #16 ; CHECK-NEXT: vldrw.u32 q1, [r2], #16 ; CHECK-NEXT: vmul.f32 q0, q1, q0 @@ -117,21 +115,12 @@ for.body.preheader.new: ; preds = %for.body.preheader vector.ph: ; preds = %vector.memcheck %n.rnd.up = add i32 %N, 3 %n.vec = and i32 %n.rnd.up, -4 - %trip.count.minus.1 = add i32 %N, -1 - %broadcast.splatinsert21 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 - %broadcast.splat22 = shufflevector <4 x i32> %broadcast.splatinsert21, <4 x i32> undef, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] - %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 - %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer - %induction = add <4 x i32> %broadcast.splat, %2 = getelementptr inbounds float, float* %b, i32 %index - - ; %3 = icmp ule <4 x i32> %induction, %broadcast.splat22 %3 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N) - %4 = bitcast float* %2 to <4 x float>* %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %3, <4 x float> undef) %5 = getelementptr inbounds float, float* %c, i32 %index @@ -219,13 +208,11 @@ define arm_aapcs_vfpcc float @fast_float_mac(float* nocapture readonly %b, float ; CHECK-NEXT: bic r3, r3, #3 ; CHECK-NEXT: vmov.i32 q0, #0x0 ; CHECK-NEXT: subs r3, #4 -; CHECK-NEXT: add.w r12, r12, r3, lsr #2 -; CHECK-NEXT: movs r3, #0 -; CHECK-NEXT: dls lr, r12 +; CHECK-NEXT: add.w r3, r12, r3, lsr #2 +; CHECK-NEXT: dls lr, r3 ; CHECK-NEXT: .LBB1_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.32 r2 -; CHECK-NEXT: adds r3, #4 ; CHECK-NEXT: subs r2, #4 ; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: vpstt @@ -257,22 +244,13 @@ entry: vector.ph: ; preds = %entry %n.rnd.up = add i32 %N, 3 %n.vec = and i32 %n.rnd.up, -4 - %trip.count.minus.1 = add i32 %N, -1 - %broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 - %broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] %vec.phi = phi <4 x float> [ zeroinitializer, %vector.ph ], [ %6, %vector.body ] - %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 - %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer - %induction = add <4 x i32> %broadcast.splat, %0 = getelementptr inbounds float, float* %b, i32 %index - -; %1 = icmp ule <4 x i32> %induction, %broadcast.splat12 %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N) - %2 = bitcast float* %0 to <4 x float>* %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef) %3 = getelementptr inbounds float, float* %c, i32 %index diff --git a/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll b/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll index db86cacded2..c69321daca3 100644 --- a/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll +++ b/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll @@ -15,13 +15,11 @@ define arm_aapcs_vfpcc i32 @test_acc_scalar_char(i8 zeroext %a, i8* nocapture re ; CHECK-NEXT: bic r3, r3, #3 ; CHECK-NEXT: sub.w r12, r3, #4 ; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: add.w r12, r3, r12, lsr #2 -; CHECK-NEXT: movs r3, #0 -; CHECK-NEXT: dls lr, r12 +; CHECK-NEXT: add.w r3, r3, r12, lsr #2 +; CHECK-NEXT: dls lr, r3 ; CHECK-NEXT: .LBB0_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.32 r2 -; CHECK-NEXT: adds r3, #4 ; CHECK-NEXT: subs r2, #4 ; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: vpst @@ -40,9 +38,6 @@ vector.ph: ; preds = %entry %conv = zext i8 %a to i32 %n.rnd.up = add i32 %N, 3 %n.vec = and i32 %n.rnd.up, -4 - %trip.count.minus.1 = add i32 %N, -1 - %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 - %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer %broadcast.splatinsert12 = insertelement <4 x i32> undef, i32 %conv, i32 0 %broadcast.splat13 = shufflevector <4 x i32> %broadcast.splatinsert12, <4 x i32> undef, <4 x i32> zeroinitializer br label %vector.body @@ -50,14 +45,8 @@ vector.ph: ; preds = %entry vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %5, %vector.body ] - %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 - %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer - %induction = add <4 x i32> %broadcast.splat, %0 = getelementptr inbounds i8, i8* %b, i32 %index - -; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N) - %2 = bitcast i8* %0 to <4 x i8>* %wide.masked.load = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %2, i32 1, <4 x i1> %1, <4 x i8> undef) %3 = zext <4 x i8> %wide.masked.load to <4 x i32> @@ -91,13 +80,11 @@ define arm_aapcs_vfpcc i32 @test_acc_scalar_short(i16 signext %a, i16* nocapture ; CHECK-NEXT: bic r3, r3, #3 ; CHECK-NEXT: sub.w r12, r3, #4 ; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: add.w r12, r3, r12, lsr #2 -; CHECK-NEXT: movs r3, #0 -; CHECK-NEXT: dls lr, r12 +; CHECK-NEXT: add.w r3, r3, r12, lsr #2 +; CHECK-NEXT: dls lr, r3 ; CHECK-NEXT: .LBB1_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.32 r2 -; CHECK-NEXT: adds r3, #4 ; CHECK-NEXT: subs r2, #4 ; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: vpst @@ -116,9 +103,6 @@ vector.ph: ; preds = %entry %conv = sext i16 %a to i32 %n.rnd.up = add i32 %N, 3 %n.vec = and i32 %n.rnd.up, -4 - %trip.count.minus.1 = add i32 %N, -1 - %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 - %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer %broadcast.splatinsert12 = insertelement <4 x i32> undef, i32 %conv, i32 0 %broadcast.splat13 = shufflevector <4 x i32> %broadcast.splatinsert12, <4 x i32> undef, <4 x i32> zeroinitializer br label %vector.body @@ -126,14 +110,8 @@ vector.ph: ; preds = %entry vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %5, %vector.body ] - %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 - %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer - %induction = add <4 x i32> %broadcast.splat, %0 = getelementptr inbounds i16, i16* %b, i32 %index - -; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N) - %2 = bitcast i16* %0 to <4 x i16>* %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %2, i32 2, <4 x i1> %1, <4 x i16> undef) %3 = sext <4 x i16> %wide.masked.load to <4 x i32> @@ -167,13 +145,11 @@ define arm_aapcs_vfpcc i32 @test_acc_scalar_uchar(i8 zeroext %a, i8* nocapture r ; CHECK-NEXT: bic r3, r3, #3 ; CHECK-NEXT: sub.w r12, r3, #4 ; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: add.w r12, r3, r12, lsr #2 -; CHECK-NEXT: movs r3, #0 -; CHECK-NEXT: dls lr, r12 +; CHECK-NEXT: add.w r3, r3, r12, lsr #2 +; CHECK-NEXT: dls lr, r3 ; CHECK-NEXT: .LBB2_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.32 r2 -; CHECK-NEXT: adds r3, #4 ; CHECK-NEXT: subs r2, #4 ; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: vpst @@ -192,9 +168,6 @@ vector.ph: ; preds = %entry %conv = zext i8 %a to i32 %n.rnd.up = add i32 %N, 3 %n.vec = and i32 %n.rnd.up, -4 - %trip.count.minus.1 = add i32 %N, -1 - %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 - %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer %broadcast.splatinsert12 = insertelement <4 x i32> undef, i32 %conv, i32 0 %broadcast.splat13 = shufflevector <4 x i32> %broadcast.splatinsert12, <4 x i32> undef, <4 x i32> zeroinitializer br label %vector.body @@ -202,14 +175,8 @@ vector.ph: ; preds = %entry vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %5, %vector.body ] - %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 - %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer - %induction = add <4 x i32> %broadcast.splat, %0 = getelementptr inbounds i8, i8* %b, i32 %index - -; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N) - %2 = bitcast i8* %0 to <4 x i8>* %wide.masked.load = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %2, i32 1, <4 x i1> %1, <4 x i8> undef) %3 = zext <4 x i8> %wide.masked.load to <4 x i32> @@ -243,13 +210,11 @@ define arm_aapcs_vfpcc i32 @test_acc_scalar_ushort(i16 signext %a, i16* nocaptur ; CHECK-NEXT: bic r3, r3, #3 ; CHECK-NEXT: sub.w r12, r3, #4 ; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: add.w r12, r3, r12, lsr #2 -; CHECK-NEXT: movs r3, #0 -; CHECK-NEXT: dls lr, r12 +; CHECK-NEXT: add.w r3, r3, r12, lsr #2 +; CHECK-NEXT: dls lr, r3 ; CHECK-NEXT: .LBB3_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.32 r2 -; CHECK-NEXT: adds r3, #4 ; CHECK-NEXT: subs r2, #4 ; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: vpst @@ -268,9 +233,6 @@ vector.ph: ; preds = %entry %conv = sext i16 %a to i32 %n.rnd.up = add i32 %N, 3 %n.vec = and i32 %n.rnd.up, -4 - %trip.count.minus.1 = add i32 %N, -1 - %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 - %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer %broadcast.splatinsert12 = insertelement <4 x i32> undef, i32 %conv, i32 0 %broadcast.splat13 = shufflevector <4 x i32> %broadcast.splatinsert12, <4 x i32> undef, <4 x i32> zeroinitializer br label %vector.body @@ -278,14 +240,8 @@ vector.ph: ; preds = %entry vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %5, %vector.body ] - %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 - %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer - %induction = add <4 x i32> %broadcast.splat, %0 = getelementptr inbounds i16, i16* %b, i32 %index - -; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N) - %2 = bitcast i16* %0 to <4 x i16>* %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %2, i32 2, <4 x i1> %1, <4 x i16> undef) %3 = zext <4 x i16> %wide.masked.load to <4 x i32> @@ -319,13 +275,11 @@ define arm_aapcs_vfpcc i32 @test_acc_scalar_int(i32 %a, i32* nocapture readonly ; CHECK-NEXT: bic r3, r3, #3 ; CHECK-NEXT: sub.w r12, r3, #4 ; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: add.w r12, r3, r12, lsr #2 -; CHECK-NEXT: movs r3, #0 -; CHECK-NEXT: dls lr, r12 +; CHECK-NEXT: add.w r3, r3, r12, lsr #2 +; CHECK-NEXT: dls lr, r3 ; CHECK-NEXT: .LBB4_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.32 r2 -; CHECK-NEXT: adds r3, #4 ; CHECK-NEXT: subs r2, #4 ; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: vpst @@ -343,9 +297,6 @@ entry: vector.ph: ; preds = %entry %n.rnd.up = add i32 %N, 3 %n.vec = and i32 %n.rnd.up, -4 - %trip.count.minus.1 = add i32 %N, -1 - %broadcast.splatinsert9 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 - %broadcast.splat10 = shufflevector <4 x i32> %broadcast.splatinsert9, <4 x i32> undef, <4 x i32> zeroinitializer %broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %a, i32 0 %broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer br label %vector.body @@ -353,14 +304,8 @@ vector.ph: ; preds = %entry vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %4, %vector.body ] - %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 - %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer - %induction = add <4 x i32> %broadcast.splat, %0 = getelementptr inbounds i32, i32* %b, i32 %index - -; %1 = icmp ule <4 x i32> %induction, %broadcast.splat10 %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N) - %2 = bitcast i32* %0 to <4 x i32>* %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef) %3 = mul nsw <4 x i32> %wide.masked.load, %broadcast.splat12 @@ -411,11 +356,9 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_char(i8* nocapture readonly ; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: b .LBB5_8 ; CHECK-NEXT: .LBB5_4: @ %vector.ph -; CHECK-NEXT: movs r7, #0 ; CHECK-NEXT: dlstp.32 lr, r12 ; CHECK-NEXT: .LBB5_5: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: adds r7, #4 ; CHECK-NEXT: vldrb.u32 q0, [r0], #4 ; CHECK-NEXT: vldrb.u32 q1, [r1], #4 ; CHECK-NEXT: vmlas.u32 q1, q0, r2 @@ -498,23 +441,14 @@ for.body.preheader.new: ; preds = %for.body.preheader vector.ph: ; preds = %for.body.lr.ph %n.rnd.up = add i32 %N, 3 %n.vec = and i32 %n.rnd.up, -4 - %trip.count.minus.1 = add i32 %N, -1 - %broadcast.splatinsert19 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 - %broadcast.splat20 = shufflevector <4 x i32> %broadcast.splatinsert19, <4 x i32> undef, <4 x i32> zeroinitializer %broadcast.splatinsert22 = insertelement <4 x i32> undef, i32 %conv3, i32 0 %broadcast.splat23 = shufflevector <4 x i32> %broadcast.splatinsert22, <4 x i32> undef, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] - %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 - %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer - %induction = add <4 x i32> %broadcast.splat, %2 = getelementptr inbounds i8, i8* %a, i32 %index - - ; %3 = icmp ule <4 x i32> %induction, %broadcast.splat20 %3 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N) - %4 = bitcast i8* %2 to <4 x i8>* %wide.masked.load = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %4, i32 1, <4 x i1> %3, <4 x i8> undef) %5 = zext <4 x i8> %wide.masked.load to <4 x i32> @@ -618,11 +552,9 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_short(i16* nocapture readon ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r4, pc} ; CHECK-NEXT: .LBB6_1: @ %vector.ph -; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: dlstp.32 lr, r12 ; CHECK-NEXT: .LBB6_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: adds r4, #4 ; CHECK-NEXT: vldrh.s32 q0, [r0], #8 ; CHECK-NEXT: vldrh.s32 q1, [r1], #8 ; CHECK-NEXT: vmlas.u32 q1, q0, r2 @@ -638,23 +570,14 @@ vector.ph: ; preds = %entry %conv3 = sext i16 %c to i32 %n.rnd.up = add i32 %N, 3 %n.vec = and i32 %n.rnd.up, -4 - %trip.count.minus.1 = add i32 %N, -1 - %broadcast.splatinsert12 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 - %broadcast.splat13 = shufflevector <4 x i32> %broadcast.splatinsert12, <4 x i32> undef, <4 x i32> zeroinitializer %broadcast.splatinsert15 = insertelement <4 x i32> undef, i32 %conv3, i32 0 %broadcast.splat16 = shufflevector <4 x i32> %broadcast.splatinsert15, <4 x i32> undef, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] - %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 - %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer - %induction = add <4 x i32> %broadcast.splat, %0 = getelementptr inbounds i16, i16* %a, i32 %index - -; %1 = icmp ule <4 x i32> %induction, %broadcast.splat13 %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N) - %2 = bitcast i16* %0 to <4 x i16>* %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %2, i32 2, <4 x i1> %1, <4 x i16> undef) %3 = sext <4 x i16> %wide.masked.load to <4 x i32> @@ -707,11 +630,9 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_uchar(i8* nocapture readonl ; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: b .LBB7_8 ; CHECK-NEXT: .LBB7_4: @ %vector.ph -; CHECK-NEXT: movs r7, #0 ; CHECK-NEXT: dlstp.32 lr, r12 ; CHECK-NEXT: .LBB7_5: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: adds r7, #4 ; CHECK-NEXT: vldrb.u32 q0, [r0], #4 ; CHECK-NEXT: vldrb.u32 q1, [r1], #4 ; CHECK-NEXT: vmlas.u32 q1, q0, r2 @@ -794,23 +715,14 @@ for.body.preheader.new: ; preds = %for.body.preheader vector.ph: ; preds = %for.body.lr.ph %n.rnd.up = add i32 %N, 3 %n.vec = and i32 %n.rnd.up, -4 - %trip.count.minus.1 = add i32 %N, -1 - %broadcast.splatinsert19 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 - %broadcast.splat20 = shufflevector <4 x i32> %broadcast.splatinsert19, <4 x i32> undef, <4 x i32> zeroinitializer %broadcast.splatinsert22 = insertelement <4 x i32> undef, i32 %conv3, i32 0 %broadcast.splat23 = shufflevector <4 x i32> %broadcast.splatinsert22, <4 x i32> undef, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] - %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 - %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer - %induction = add <4 x i32> %broadcast.splat, %2 = getelementptr inbounds i8, i8* %a, i32 %index - -; %3 = icmp ule <4 x i32> %induction, %broadcast.splat20 %3 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N) - %4 = bitcast i8* %2 to <4 x i8>* %wide.masked.load = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %4, i32 1, <4 x i1> %3, <4 x i8> undef) %5 = zext <4 x i8> %wide.masked.load to <4 x i32> @@ -914,11 +826,9 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_ushort(i16* nocapture reado ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r4, pc} ; CHECK-NEXT: .LBB8_1: @ %vector.ph -; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: dlstp.32 lr, r12 ; CHECK-NEXT: .LBB8_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: adds r4, #4 ; CHECK-NEXT: vldrh.u32 q0, [r0], #8 ; CHECK-NEXT: vldrh.u32 q1, [r1], #8 ; CHECK-NEXT: vmlas.u32 q1, q0, r2 @@ -934,23 +844,14 @@ vector.ph: ; preds = %entry %conv3 = sext i16 %c to i32 %n.rnd.up = add i32 %N, 3 %n.vec = and i32 %n.rnd.up, -4 - %trip.count.minus.1 = add i32 %N, -1 - %broadcast.splatinsert12 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 - %broadcast.splat13 = shufflevector <4 x i32> %broadcast.splatinsert12, <4 x i32> undef, <4 x i32> zeroinitializer %broadcast.splatinsert15 = insertelement <4 x i32> undef, i32 %conv3, i32 0 %broadcast.splat16 = shufflevector <4 x i32> %broadcast.splatinsert15, <4 x i32> undef, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] - %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 - %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer - %induction = add <4 x i32> %broadcast.splat, %0 = getelementptr inbounds i16, i16* %a, i32 %index - -; %1 = icmp ule <4 x i32> %induction, %broadcast.splat13 %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N) - %2 = bitcast i16* %0 to <4 x i16>* %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %2, i32 2, <4 x i1> %1, <4 x i16> undef) %3 = zext <4 x i16> %wide.masked.load to <4 x i32> @@ -1003,11 +904,9 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_int(i32* nocapture readonly ; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: b .LBB9_8 ; CHECK-NEXT: .LBB9_4: @ %vector.ph -; CHECK-NEXT: movs r7, #0 ; CHECK-NEXT: dlstp.32 lr, r12 ; CHECK-NEXT: .LBB9_5: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: adds r7, #4 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 ; CHECK-NEXT: vldrw.u32 q1, [r1], #16 ; CHECK-NEXT: vmlas.u32 q1, q0, r2 @@ -1089,23 +988,14 @@ for.body.preheader.new: ; preds = %for.body.preheader vector.ph: ; preds = %vector.memcheck %n.rnd.up = add i32 %N, 3 %n.vec = and i32 %n.rnd.up, -4 - %trip.count.minus.1 = add i32 %N, -1 - %broadcast.splatinsert21 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 - %broadcast.splat22 = shufflevector <4 x i32> %broadcast.splatinsert21, <4 x i32> undef, <4 x i32> zeroinitializer %broadcast.splatinsert24 = insertelement <4 x i32> undef, i32 %c, i32 0 %broadcast.splat25 = shufflevector <4 x i32> %broadcast.splatinsert24, <4 x i32> undef, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] - %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 - %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer - %induction = add <4 x i32> %broadcast.splat, %2 = getelementptr inbounds i32, i32* %a, i32 %index - -; %3 = icmp ule <4 x i32> %induction, %broadcast.splat22 %3 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N) - %4 = bitcast i32* %2 to <4 x i32>* %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %4, i32 4, <4 x i1> %3, <4 x i32> undef) %5 = getelementptr inbounds i32, i32* %b, i32 %index @@ -1196,11 +1086,9 @@ define dso_local arm_aapcs_vfpcc void @test_v8i8_to_v8i16(i16* noalias nocapture ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} ; CHECK-NEXT: .LBB10_1: @ %vector.ph -; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: dlstp.16 lr, r3 ; CHECK-NEXT: .LBB10_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: add.w r12, r12, #8 ; CHECK-NEXT: vldrb.u16 q0, [r1], #8 ; CHECK-NEXT: vldrb.u16 q1, [r2], #8 ; CHECK-NEXT: vmul.i16 q0, q1, q0 @@ -1215,21 +1103,12 @@ entry: vector.ph: ; preds = %entry %n.rnd.up = add i32 %N, 7 %n.vec = and i32 %n.rnd.up, -8 - %trip.count.minus.1 = add i32 %N, -1 - %broadcast.splatinsert12 = insertelement <8 x i32> undef, i32 %trip.count.minus.1, i32 0 - %broadcast.splat13 = shufflevector <8 x i32> %broadcast.splatinsert12, <8 x i32> undef, <8 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] - %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0 - %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer - %induction = add <8 x i32> %broadcast.splat, %0 = getelementptr inbounds i8, i8* %b, i32 %index - -; %1 = icmp ule <8 x i32> %induction, %broadcast.splat13 %1 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N) - %2 = bitcast i8* %0 to <8 x i8>* %wide.masked.load = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %2, i32 1, <8 x i1> %1, <8 x i8> undef) %3 = zext <8 x i8> %wide.masked.load to <8 x i16> diff --git a/test/CodeGen/Thumb2/LowOverheadLoops/nested.ll b/test/CodeGen/Thumb2/LowOverheadLoops/nested.ll index c8d38032a6a..6b71a070d65 100644 --- a/test/CodeGen/Thumb2/LowOverheadLoops/nested.ll +++ b/test/CodeGen/Thumb2/LowOverheadLoops/nested.ll @@ -9,9 +9,6 @@ define void @mat_vec_sext_i16(i16** nocapture readonly %A, i16* nocapture readon ; CHECK: for.cond1.preheader.us.preheader: ; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 3 ; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -4 -; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = add i32 [[N]], -1 -; CHECK-NEXT: [[BROADCAST_SPLATINSERT28:%.*]] = insertelement <4 x i32> undef, i32 [[TRIP_COUNT_MINUS_1]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT29:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT28]], <4 x i32> undef, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TT:%.*]] = add i32 [[N_VEC]], -4 ; CHECK-NEXT: [[TT1:%.*]] = lshr i32 [[TT]], 2 ; CHECK-NEXT: [[TT2:%.*]] = add nuw nsw i32 [[TT1]], 1 @@ -30,9 +27,6 @@ define void @mat_vec_sext_i16(i16** nocapture readonly %A, i16* nocapture readon ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[TT4]], [[FOR_COND1_PREHEADER_US]] ], [ [[TT14:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TT5:%.*]] = phi i32 [ [[START]], [[FOR_COND1_PREHEADER_US]] ], [ [[TT15:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[N]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer -; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], ; CHECK-NEXT: [[TT6:%.*]] = getelementptr inbounds i16, i16* [[TT3]], i32 [[INDEX]] ; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP0]]) ; CHECK-NEXT: [[TMP2]] = sub i32 [[TMP0]], 4 @@ -66,9 +60,6 @@ entry: for.cond1.preheader.us.preheader: ; preds = %entry %n.rnd.up = add i32 %N, 3 %n.vec = and i32 %n.rnd.up, -4 - %trip.count.minus.1 = add i32 %N, -1 - %broadcast.splatinsert28 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 - %broadcast.splat29 = shufflevector <4 x i32> %broadcast.splatinsert28, <4 x i32> undef, <4 x i32> zeroinitializer %tt = add i32 %n.vec, -4 %tt1 = lshr i32 %tt, 2 %tt2 = add nuw nsw i32 %tt1, 1 @@ -88,14 +79,8 @@ vector.body: ; preds = %vector.body, %for.c %index = phi i32 [ 0, %for.cond1.preheader.us ], [ %index.next, %vector.body ] %vec.phi = phi <4 x i32> [ %tt4, %for.cond1.preheader.us ], [ %tt14, %vector.body ] %tt5 = phi i32 [ %start, %for.cond1.preheader.us ], [ %tt15, %vector.body ] - %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 - %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer - %induction = add <4 x i32> %broadcast.splat, %tt6 = getelementptr inbounds i16, i16* %tt3, i32 %index - - ; %tt7 = icmp ule <4 x i32> %induction, %broadcast.splat29 %tt7 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N) - %tt8 = bitcast i16* %tt6 to <4 x i16>* %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %tt8, i32 2, <4 x i1> %tt7, <4 x i16> undef) %tt9 = sext <4 x i16> %wide.masked.load to <4 x i32> @@ -130,9 +115,6 @@ define void @mat_vec_i32(i32** nocapture readonly %A, i32* nocapture readonly %B ; CHECK: for.cond1.preheader.us.preheader: ; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 3 ; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -4 -; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = add i32 [[N]], -1 -; CHECK-NEXT: [[BROADCAST_SPLATINSERT27:%.*]] = insertelement <4 x i32> undef, i32 [[TRIP_COUNT_MINUS_1]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT28:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT27]], <4 x i32> undef, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TT:%.*]] = add i32 [[N_VEC]], -4 ; CHECK-NEXT: [[TT1:%.*]] = lshr i32 [[TT]], 2 ; CHECK-NEXT: [[TT2:%.*]] = add nuw nsw i32 [[TT1]], 1 @@ -151,9 +133,6 @@ define void @mat_vec_i32(i32** nocapture readonly %A, i32* nocapture readonly %B ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[TT4]], [[FOR_COND1_PREHEADER_US]] ], [ [[TT12:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TT5:%.*]] = phi i32 [ [[START]], [[FOR_COND1_PREHEADER_US]] ], [ [[TT13:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[N]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer -; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], ; CHECK-NEXT: [[TT6:%.*]] = getelementptr inbounds i32, i32* [[TT3]], i32 [[INDEX]] ; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP0]]) ; CHECK-NEXT: [[TMP2]] = sub i32 [[TMP0]], 4 @@ -185,9 +164,6 @@ entry: for.cond1.preheader.us.preheader: ; preds = %entry %n.rnd.up = add i32 %N, 3 %n.vec = and i32 %n.rnd.up, -4 - %trip.count.minus.1 = add i32 %N, -1 - %broadcast.splatinsert27 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 - %broadcast.splat28 = shufflevector <4 x i32> %broadcast.splatinsert27, <4 x i32> undef, <4 x i32> zeroinitializer %tt = add i32 %n.vec, -4 %tt1 = lshr i32 %tt, 2 %tt2 = add nuw nsw i32 %tt1, 1 @@ -207,14 +183,8 @@ vector.body: ; preds = %vector.body, %for.c %index = phi i32 [ 0, %for.cond1.preheader.us ], [ %index.next, %vector.body ] %vec.phi = phi <4 x i32> [ %tt4, %for.cond1.preheader.us ], [ %tt12, %vector.body ] %tt5 = phi i32 [ %start, %for.cond1.preheader.us ], [ %tt13, %vector.body ] - %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 - %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer - %induction = add <4 x i32> %broadcast.splat, %tt6 = getelementptr inbounds i32, i32* %tt3, i32 %index - - ; %tt7 = icmp ule <4 x i32> %induction, %broadcast.splat28 %tt7 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N) - %tt8 = bitcast i32* %tt6 to <4 x i32>* %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tt8, i32 4, <4 x i1> %tt7, <4 x i32> undef) %tt9 = getelementptr inbounds i32, i32* %B, i32 %index diff --git a/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll b/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll index 8e8c37bf7f5..d58ad4c697d 100644 --- a/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll +++ b/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll @@ -30,7 +30,6 @@ entry: vector.ph: ; preds = %entry %n.rnd.up = add i32 %N, 15 %n.vec = and i32 %n.rnd.up, -16 - %trip.count.minus.1 = add i32 %N, -1 br label %vector.body vector.body: ; preds = %vector.body, %vector.ph @@ -100,7 +99,6 @@ entry: vector.ph: ; preds = %entry %n.rnd.up = add i32 %N, 7 %n.vec = and i32 %n.rnd.up, -8 - %trip.count.minus.1 = add i32 %N, -1 br label %vector.body vector.body: ; preds = %vector.body, %vector.ph @@ -172,7 +170,6 @@ entry: vector.ph: ; preds = %entry %n.rnd.up = add i32 %N, 15 %n.vec = and i32 %n.rnd.up, -16 - %trip.count.minus.1 = add i32 %N, -1 br label %vector.body vector.body: ; preds = %vector.body, %vector.ph @@ -242,7 +239,6 @@ entry: vector.ph: ; preds = %entry %n.rnd.up = add i32 %N, 7 %n.vec = and i32 %n.rnd.up, -8 - %trip.count.minus.1 = add i32 %N, -1 br label %vector.body vector.body: ; preds = %vector.body, %vector.ph @@ -314,7 +310,6 @@ entry: vector.ph: ; preds = %entry %n.rnd.up = add i32 %N, 15 %n.vec = and i32 %n.rnd.up, -16 - %trip.count.minus.1 = add i32 %N, -1 br label %vector.body vector.body: ; preds = %vector.body, %vector.ph @@ -384,7 +379,6 @@ entry: vector.ph: ; preds = %entry %n.rnd.up = add i32 %N, 7 %n.vec = and i32 %n.rnd.up, -8 - %trip.count.minus.1 = add i32 %N, -1 br label %vector.body vector.body: ; preds = %vector.body, %vector.ph @@ -481,7 +475,6 @@ entry: vector.ph: ; preds = %entry %n.rnd.up = add i32 %N, 3 %n.vec = and i32 %n.rnd.up, -4 - %trip.count.minus.1 = add i32 %N, -1 br label %vector.body vector.body: ; preds = %vector.body, %vector.ph @@ -510,7 +503,6 @@ middle.block: ; preds = %vector.body vector.ph47: ; preds = %middle.block %n.rnd.up48 = add i32 %N, 3 %n.vec50 = and i32 %n.rnd.up48, -4 - %trip.count.minus.154 = add i32 %N, -1 %i11 = insertelement <4 x i32> , i32 %i10, i32 0 br label %vector.body46 @@ -594,7 +586,6 @@ entry: vector.ph: ; preds = %entry %n.rnd.up = add i32 %N, 7 %n.vec = and i32 %n.rnd.up, -8 - %trip.count.minus.1 = add i32 %N, -1 br label %vector.body vector.body: ; preds = %vector.body, %vector.ph @@ -719,7 +710,6 @@ lor.end: ; preds = %entry, %lor.rhs vector.ph: ; preds = %lor.end %n.rnd.up = add i32 %4, 3 %n.vec = and i32 %n.rnd.up, -4 - %trip.count.minus.1 = add i32 %4, -1 %5 = insertelement <4 x i32> , i32 %0, i32 0 br label %vector.body diff --git a/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-basic.ll b/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-basic.ll index 1492a01a272..d10cbffe2dd 100644 --- a/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-basic.ll +++ b/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-basic.ll @@ -22,23 +22,14 @@ entry: br i1 %cmp8, label %for.cond.cleanup, label %vector.ph vector.ph: ; preds = %entry - %trip.count.minus.1 = add i32 %N, -1 - %broadcast.splatinsert10 = insertelement <16 x i32> undef, i32 %trip.count.minus.1, i32 0 - %broadcast.splat11 = shufflevector <16 x i32> %broadcast.splatinsert10, <16 x i32> undef, <16 x i32> zeroinitializer %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ] - %broadcast.splatinsert = insertelement <16 x i32> undef, i32 %index, i32 0 - %broadcast.splat = shufflevector <16 x i32> %broadcast.splatinsert, <16 x i32> undef, <16 x i32> zeroinitializer - %induction = or <16 x i32> %broadcast.splat, %tmp = getelementptr inbounds i8, i8* %a, i32 %index - -; %tmp1 = icmp ule <16 x i32> %induction, %broadcast.splat11 %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %N) - %tmp2 = bitcast i8* %tmp to <16 x i8>* %wide.masked.load = tail call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %tmp2, i32 4, <16 x i1> %active.lane.mask, <16 x i8> undef) %tmp3 = getelementptr inbounds i8, i8* %b, i32 %index @@ -79,23 +70,14 @@ entry: br i1 %cmp8, label %for.cond.cleanup, label %vector.ph vector.ph: ; preds = %entry - %trip.count.minus.1 = add i32 %N, -1 - %broadcast.splatinsert10 = insertelement <8 x i32> undef, i32 %trip.count.minus.1, i32 0 - %broadcast.splat11 = shufflevector <8 x i32> %broadcast.splatinsert10, <8 x i32> undef, <8 x i32> zeroinitializer %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ] - %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0 - %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer - %induction = add <8 x i32> %broadcast.splat, %tmp = getelementptr inbounds i16, i16* %a, i32 %index - -; %tmp1 = icmp ule <8 x i32> %induction, %broadcast.splat11 %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N) - %tmp2 = bitcast i16* %tmp to <8 x i16>* %wide.masked.load = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp2, i32 4, <8 x i1> %active.lane.mask, <8 x i16> undef) %tmp3 = getelementptr inbounds i16, i16* %b, i32 %index @@ -135,20 +117,13 @@ entry: br i1 %cmp8, label %for.cond.cleanup, label %vector.ph vector.ph: ; preds = %entry - %trip.count.minus.1 = add i32 %N, -1 - %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 - %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ] - %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 - %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer - %induction = or <4 x i32> %broadcast.splat, %tmp = getelementptr inbounds i32, i32* %a, i32 %index - ; %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11 %tmp2 = bitcast i32* %tmp to <4 x i32>* %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N) %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef) @@ -190,20 +165,13 @@ entry: br i1 %cmp8, label %for.cond.cleanup, label %vector.ph vector.ph: ; preds = %entry - %trip.count.minus.1 = add i32 %N, -1 - %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 - %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ] - %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 - %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer - %induction = add <4 x i32> %broadcast.splat, %tmp = getelementptr inbounds i32, i32* %a, i32 %index -; %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11 %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N) %tmp2 = bitcast i32* %tmp to <4 x i32>* %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef) @@ -262,10 +230,7 @@ vector.body: ; preds = %vector.body, %vecto %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, %tmp = getelementptr inbounds i32, i32* %a, i32 %index - -; %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11 %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N) - %wrong = icmp ult <4 x i32> %induction, %broadcast.splat11 %tmp2 = bitcast i32* %tmp to <4 x i32>* %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef) @@ -321,10 +286,7 @@ vector.body: ; preds = %vector.body, %vecto %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, %tmp = getelementptr inbounds i32, i32* %a, i32 %index - -; %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11 %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N) - %wrong = icmp ult <4 x i32> %induction, %broadcast.splat11 %tmp2 = bitcast i32* %tmp to <4 x i32>* %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef) @@ -370,7 +332,6 @@ entry: vector.ph: - %trip.count.minus.1 = add i32 %N, -1 %scevgep = getelementptr i32, i32* %A, i32 8 %scevgep30 = getelementptr i32, i32* %C, i32 8 %scevgep37 = getelementptr i32, i32* %B, i32 8 @@ -459,9 +420,7 @@ vector.body: ; preds = %vector.body, %vecto %lsr.iv13 = bitcast i32* %lsr.iv to <4 x i32>* %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>* %lsr.iv1719 = bitcast i32* %lsr.iv17 to <4 x i32>* - %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 42) - %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv13, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef) %wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef) %7 = add nsw <4 x i32> %wide.masked.load12, %wide.masked.load @@ -495,7 +454,6 @@ entry: br i1 %cmp8, label %vector.ph, label %for.cond.cleanup vector.ph: ; preds = %entry - %trip.count.minus.1 = add i32 %N, -1 %start = call i32 @llvm.start.loop.iterations.i32(i32 %5) br label %vector.body @@ -509,9 +467,7 @@ vector.body: ; preds = %vector.body, %vecto %lsr.iv13 = bitcast i32* %lsr.iv to <4 x i32>* %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>* %lsr.iv1719 = bitcast i32* %lsr.iv17 to <4 x i32>* - %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %index) - %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv13, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef) %wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef) %7 = add nsw <4 x i32> %wide.masked.load12, %wide.masked.load @@ -546,7 +502,6 @@ entry: br i1 %cmp8, label %vector.ph, label %for.cond.cleanup vector.ph: ; preds = %entry - %trip.count.minus.1 = add i32 %N, -1 %start = call i32 @llvm.start.loop.iterations.i32(i32 %5) br label %vector.body diff --git a/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-const.ll b/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-const.ll index 4682f1d36f3..1c173e9dfd1 100644 --- a/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-const.ll +++ b/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-const.ll @@ -10,22 +10,17 @@ define dso_local void @foo(i32* noalias nocapture %A, i32* noalias nocapture rea ; CHECK-NEXT: [[LSR_IV14:%.*]] = phi i32* [ [[SCEVGEP15:%.*]], [[VECTOR_BODY]] ], [ [[A:%.*]], [[ENTRY:%.*]] ] ; CHECK-NEXT: [[LSR_IV11:%.*]] = phi i32* [ [[SCEVGEP12:%.*]], [[VECTOR_BODY]] ], [ [[C:%.*]], [[ENTRY]] ] ; CHECK-NEXT: [[LSR_IV:%.*]] = phi i32* [ [[SCEVGEP:%.*]], [[VECTOR_BODY]] ], [ [[B:%.*]], [[ENTRY]] ] -; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[START]], [[ENTRY]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ 32003, [[ENTRY]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[LSR_IV1416:%.*]] = bitcast i32* [[LSR_IV14]] to <4 x i32>* ; CHECK-NEXT: [[LSR_IV1113:%.*]] = bitcast i32* [[LSR_IV11]] to <4 x i32>* ; CHECK-NEXT: [[LSR_IV10:%.*]] = bitcast i32* [[LSR_IV]] to <4 x i32>* -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer -; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], ; CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP1]]) ; CHECK-NEXT: [[TMP3]] = sub i32 [[TMP1]], 4 ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[LSR_IV10]], i32 4, <4 x i1> [[TMP2]], <4 x i32> undef) ; CHECK-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[LSR_IV1113]], i32 4, <4 x i1> [[TMP2]], <4 x i32> undef) ; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD9]], [[WIDE_MASKED_LOAD]] ; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[TMP4]], <4 x i32>* [[LSR_IV1416]], i32 4, <4 x i1> [[TMP2]]) -; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 ; CHECK-NEXT: [[SCEVGEP]] = getelementptr i32, i32* [[LSR_IV]], i32 4 ; CHECK-NEXT: [[SCEVGEP12]] = getelementptr i32, i32* [[LSR_IV11]], i32 4 ; CHECK-NEXT: [[SCEVGEP15]] = getelementptr i32, i32* [[LSR_IV14]], i32 4 @@ -48,13 +43,7 @@ vector.body: %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>* %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>* %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>* - %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 - %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer - %induction = add <4 x i32> %broadcast.splat, - - ; %1 = icmp ult <4 x i32> %induction, %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 32003) - %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef) %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef) %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load @@ -244,11 +233,7 @@ vector.body: %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, - -; Non-uniform constant vector here. This can't be represented with -; @llvm.get.active.lane.mask, but let's keep this test as a sanity check: %1 = icmp ult <4 x i32> %induction, - %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef) %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef) %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load @@ -285,13 +270,8 @@ vector.body: %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>* %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>* %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>* - %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 - %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer - %induction = add <4 x i32> %broadcast.splat, - ; BTC = UINT_MAX, and scalar trip count BTC + 1 would overflow: %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 4294967295) - %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef) %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef) %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load @@ -328,12 +308,7 @@ vector.body: %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>* %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>* %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>* - %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 - %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer - %induction = add <4 x i32> %broadcast.splat, - %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 32003) - %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef) %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef) %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load @@ -371,13 +346,8 @@ vector.body: %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>* %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>* %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>* - %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 - %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer - %induction = add <4 x i32> %broadcast.splat, - ; The induction variable %N is not an IV: %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %N, i32 32003) - %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef) %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef) %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load @@ -414,12 +384,7 @@ vector.body: %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>* %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>* %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>* - %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 - %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer - %induction = add <4 x i32> %broadcast.splat, - %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 32003) - %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef) %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef) %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load @@ -460,9 +425,6 @@ vector.body: %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>* %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>* %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>* - %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 - %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer - %induction = add <4 x i32> %broadcast.splat, %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 32003) %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef) %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef) @@ -514,10 +476,8 @@ vector.body: ; preds = %vector.body, %vecto %lsr.iv3840 = bitcast i32* %lsr.iv38 to <4 x i32>* %lsr.iv3335 = bitcast i32* %lsr.iv33 to <4 x i32>* %lsr.iv2830 = bitcast i32* %lsr.iv28 to <4 x i32>* - ; It's using %j.025, the induction variable from its outer loop: %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %j.025, i32 4096) - %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv3840, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef) %wide.masked.load27 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv3335, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef) %1 = add nsw <4 x i32> %wide.masked.load27, %wide.masked.load diff --git a/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-disabled-in-loloops.ll b/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-disabled-in-loloops.ll index a2361f51863..73865945cdc 100644 --- a/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-disabled-in-loloops.ll +++ b/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-disabled-in-loloops.ll @@ -82,7 +82,6 @@ entry: br i1 %cmp8, label %vector.ph, label %for.cond.cleanup vector.ph: ; preds = %entry - %trip.count.minus.1 = add i32 %N, -1 %start = call i32 @llvm.start.loop.iterations.i32(i32 %5) br label %vector.body @@ -92,13 +91,10 @@ vector.body: ; preds = %vector.body, %vecto %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %vector.ph ] %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] %6 = phi i32 [ %start, %vector.ph ], [ %8, %vector.body ] - %lsr.iv13 = bitcast i32* %lsr.iv to <4 x i32>* %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>* %lsr.iv1719 = bitcast i32* %lsr.iv17 to <4 x i32>* - %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N) - %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv13, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef) %wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef) %7 = add nsw <4 x i32> %wide.masked.load12, %wide.masked.load diff --git a/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-add-sat.ll b/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-add-sat.ll index c0b2a036f37..14f1d0c0020 100644 --- a/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-add-sat.ll +++ b/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-add-sat.ll @@ -27,7 +27,6 @@ entry: vector.ph: ; preds = %entry %n.rnd.up = add i32 %blockSize, 7 %n.vec = and i32 %n.rnd.up, -8 - %trip.count.minus.1 = add i32 %blockSize, -1 br label %vector.body vector.body: ; preds = %vector.body, %vector.ph @@ -77,7 +76,6 @@ entry: vector.ph: ; preds = %entry %n.rnd.up = add i32 %blockSize, 7 %n.vec = and i32 %n.rnd.up, -8 - %trip.count.minus.1 = add i32 %blockSize, -1 br label %vector.body vector.body: ; preds = %vector.body, %vector.ph diff --git a/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-fabs.ll b/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-fabs.ll index 5ad6d911230..66216022d64 100644 --- a/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-fabs.ll +++ b/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-fabs.ll @@ -26,7 +26,6 @@ entry: vector.ph: ; preds = %entry %n.rnd.up = add i32 %blockSize, 3 %n.vec = and i32 %n.rnd.up, -4 - %trip.count.minus.1 = add i32 %blockSize, -1 br label %vector.body vector.body: ; preds = %vector.body, %vector.ph diff --git a/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-round.ll b/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-round.ll index bd927fdcf85..024857b6580 100644 --- a/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-round.ll +++ b/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-round.ll @@ -26,7 +26,6 @@ entry: vector.ph: ; preds = %entry %n.rnd.up = add i32 %n, 3 %n.vec = and i32 %n.rnd.up, -4 - %trip.count.minus.1 = add i32 %n, -1 br label %vector.body vector.body: ; preds = %vector.body, %vector.ph @@ -72,7 +71,6 @@ entry: vector.ph: ; preds = %entry %n.rnd.up = add i32 %n, 3 %n.vec = and i32 %n.rnd.up, -4 - %trip.count.minus.1 = add i32 %n, -1 br label %vector.body vector.body: ; preds = %vector.body, %vector.ph @@ -118,7 +116,6 @@ entry: vector.ph: ; preds = %entry %n.rnd.up = add i32 %n, 3 %n.vec = and i32 %n.rnd.up, -4 - %trip.count.minus.1 = add i32 %n, -1 br label %vector.body vector.body: ; preds = %vector.body, %vector.ph @@ -164,7 +161,6 @@ entry: vector.ph: ; preds = %entry %n.rnd.up = add i32 %n, 3 %n.vec = and i32 %n.rnd.up, -4 - %trip.count.minus.1 = add i32 %n, -1 br label %vector.body vector.body: ; preds = %vector.body, %vector.ph @@ -210,7 +206,6 @@ entry: vector.ph: ; preds = %entry %n.rnd.up = add i32 %n, 3 %n.vec = and i32 %n.rnd.up, -4 - %trip.count.minus.1 = add i32 %n, -1 br label %vector.body vector.body: ; preds = %vector.body, %vector.ph @@ -260,7 +255,6 @@ entry: vector.ph: ; preds = %entry %n.rnd.up = add i32 %n, 3 %n.vec = and i32 %n.rnd.up, -4 - %trip.count.minus.1 = add i32 %n, -1 br label %vector.body vector.body: ; preds = %vector.body, %vector.ph diff --git a/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-sub-sat.ll b/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-sub-sat.ll index 98d48d49539..0e51661e8f5 100644 --- a/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-sub-sat.ll +++ b/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-sub-sat.ll @@ -27,7 +27,6 @@ entry: vector.ph: ; preds = %entry %n.rnd.up = add i32 %blockSize, 7 %n.vec = and i32 %n.rnd.up, -8 - %trip.count.minus.1 = add i32 %blockSize, -1 br label %vector.body vector.body: ; preds = %vector.body, %vector.ph @@ -77,7 +76,6 @@ entry: vector.ph: ; preds = %entry %n.rnd.up = add i32 %blockSize, 7 %n.vec = and i32 %n.rnd.up, -8 - %trip.count.minus.1 = add i32 %blockSize, -1 br label %vector.body vector.body: ; preds = %vector.body, %vector.ph diff --git a/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-reduce.ll b/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-reduce.ll index ef79f27ce5d..01bbd3ac28e 100644 --- a/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-reduce.ll +++ b/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-reduce.ll @@ -18,8 +18,6 @@ vector.ph: %tmp = add i32 %N, -1 %n.rnd.up = add i32 %tmp, 8 %n.vec = and i32 %n.rnd.up, -8 - %broadcast.splatinsert1 = insertelement <8 x i32> undef, i32 %tmp, i32 0 - %broadcast.splat2 = shufflevector <8 x i32> %broadcast.splatinsert1, <8 x i32> undef, <8 x i32> zeroinitializer %0 = add i32 %n.vec, -8 %1 = lshr i32 %0, 3 %2 = add i32 %1, 1 @@ -30,14 +28,8 @@ vector.body: ; preds = %vector.body, %vecto %index = phi i32 [ 0, %vector.ph], [ %index.next, %vector.body ] %vec.phi = phi <8 x i16> [ zeroinitializer, %vector.ph], [ %tmp8, %vector.body ] %3 = phi i32 [ %start, %vector.ph], [ %4, %vector.body ] - %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0 - %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer - %induction = add <8 x i32> %broadcast.splat, %tmp2 = getelementptr inbounds i16, i16* %A, i32 %index - - ; %tmp3 = icmp ule <8 x i32> %induction, %broadcast.splat2 %tmp3 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N) - %tmp4 = bitcast i16* %tmp2 to <8 x i16>* %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp4, i32 4, <8 x i1> %tmp3, <8 x i16> undef) %tmp5 = getelementptr inbounds i16, i16* %B, i32 %index @@ -87,8 +79,6 @@ vector.ph: %tmp = add i32 %N, -1 %n.rnd.up = add nuw nsw i32 %tmp, 8 %n.vec = and i32 %n.rnd.up, -8 - %broadcast.splatinsert1 = insertelement <8 x i32> undef, i32 %tmp, i32 0 - %broadcast.splat2 = shufflevector <8 x i32> %broadcast.splatinsert1, <8 x i32> undef, <8 x i32> zeroinitializer %broadcast.splatinsert3 = insertelement <8 x i16> undef, i16 %B, i32 0 %broadcast.splat4 = shufflevector <8 x i16> %broadcast.splatinsert3, <8 x i16> undef, <8 x i32> zeroinitializer %0 = add i32 %n.vec, -8 @@ -101,14 +91,8 @@ vector.body: ; preds = %vector.body, %vecto %index = phi i32 [ 0, %vector.ph], [ %index.next, %vector.body ] %vec.phi = phi <8 x i16> [ zeroinitializer, %vector.ph], [ %tmp6, %vector.body ] %3 = phi i32 [ %start, %vector.ph], [ %4, %vector.body ] - %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0 - %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer - %induction = add <8 x i32> %broadcast.splat, %tmp2 = getelementptr inbounds i16, i16* %A, i32 %index - - ; %tmp3 = icmp ule <8 x i32> %induction, %broadcast.splat2 %tmp3 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N) - %tmp4 = bitcast i16* %tmp2 to <8 x i16>* %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp4, i32 4, <8 x i1> %tmp3, <8 x i16> undef) %tmp5 = add <8 x i16> %vec.phi, %broadcast.splat4 @@ -151,8 +135,6 @@ entry: %tmp = add i32 %N, -1 %n.rnd.up = add nuw nsw i32 %tmp, 8 %n.vec = and i32 %n.rnd.up, -8 - %broadcast.splatinsert1 = insertelement <8 x i32> undef, i32 %tmp, i32 0 - %broadcast.splat2 = shufflevector <8 x i32> %broadcast.splatinsert1, <8 x i32> undef, <8 x i32> zeroinitializer %broadcast.splatinsert3 = insertelement <8 x i16> undef, i16 %B, i32 0 %broadcast.splat4 = shufflevector <8 x i16> %broadcast.splatinsert3, <8 x i16> undef, <8 x i32> zeroinitializer %0 = add i32 %n.vec, -8 @@ -165,14 +147,8 @@ vector.body: ; preds = %vector.body, %vecto %index = phi i32 [ 0, %entry], [ %index.next, %vector.body ] %vec.phi = phi <8 x i16> [ zeroinitializer, %entry], [ %tmp6, %vector.body ] %3 = phi i32 [ %start, %entry ], [ %4, %vector.body ] - %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0 - %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer - %induction = add <8 x i32> %broadcast.splat, %tmp2 = getelementptr inbounds i16, i16* %A, i32 %index - - ; %tmp3 = icmp ule <8 x i32> %induction, %broadcast.splat2 %tmp3 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N) - %tmp4 = bitcast i16* %tmp2 to <8 x i16>* %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp4, i32 4, <8 x i1> %tmp3, <8 x i16> undef) %tmp5 = add <8 x i16> %vec.phi, %broadcast.splat4 @@ -227,7 +203,6 @@ for.body: br i1 %cmp433, label %vector.ph, label %for.end vector.ph: ; preds = %for.body - %trip.count.minus.1 = add i32 %8, -1 %start = call i32 @llvm.start.loop.iterations.i32(i32 %7) br label %vector.body diff --git a/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-widen.ll b/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-widen.ll index 939d3cc5e55..c9b2905755e 100644 --- a/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-widen.ll +++ b/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-widen.ll @@ -14,23 +14,14 @@ entry: br i1 %cmp8, label %for.cond.cleanup, label %vector.ph vector.ph: ; preds = %entry - %trip.count.minus.1 = add i32 %N, -1 - %broadcast.splatinsert10 = insertelement <8 x i32> undef, i32 %trip.count.minus.1, i32 0 - %broadcast.splat11 = shufflevector <8 x i32> %broadcast.splatinsert10, <8 x i32> undef, <8 x i32> zeroinitializer %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ] - %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0 - %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer - %induction = add <8 x i32> %broadcast.splat, %tmp = getelementptr inbounds i16, i16* %a, i32 %index - - ; %tmp1 = icmp ule <8 x i32> %induction, %broadcast.splat11 - %tmp1 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N) - + %tmp1 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N) %tmp2 = bitcast i16* %tmp to <8 x i16>* %wide.masked.load = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp2, i32 4, <8 x i1> %tmp1, <8 x i16> undef) %tmp3 = getelementptr inbounds i16, i16* %b, i32 %index @@ -72,8 +63,6 @@ entry: vector.ph: ; preds = %entry %trip.count.minus.1 = add i32 %N, -1 - %broadcast.splatinsert10 = insertelement <8 x i32> undef, i32 %trip.count.minus.1, i32 0 - %broadcast.splat11 = shufflevector <8 x i32> %broadcast.splatinsert10, <8 x i32> undef, <8 x i32> zeroinitializer %broadcast.splatinsert10.store = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 %broadcast.splat11.store = shufflevector <4 x i32> %broadcast.splatinsert10.store, <4 x i32> undef, <4 x i32> zeroinitializer %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13) @@ -83,14 +72,8 @@ vector.body: ; preds = %vector.body, %vecto %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] %store.idx = phi i32 [ 0, %vector.ph ], [ %store.idx.next, %vector.body ] %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ] - %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0 - %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer - %induction = add <8 x i32> %broadcast.splat, %tmp = getelementptr inbounds i16, i16* %a, i32 %index - - ; %tmp1 = icmp ule <8 x i32> %induction, %broadcast.splat11 %tmp1 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N) - %tmp2 = bitcast i16* %tmp to <8 x i16>* %wide.masked.load = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp2, i32 4, <8 x i1> %tmp1, <8 x i16> undef) %tmp3 = getelementptr inbounds i16, i16* %b, i32 %index @@ -136,23 +119,14 @@ entry: br i1 %cmp8, label %for.cond.cleanup, label %vector.ph vector.ph: ; preds = %entry - %trip.count.minus.1 = add i32 %N, -1 - %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 - %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ] - %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 - %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer - %induction = add <4 x i32> %broadcast.splat, %tmp = getelementptr inbounds i32, i32* %a, i32 %index - - ; %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11 %tmp1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N) - %tmp2 = bitcast i32* %tmp to <4 x i32>* %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef) %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index diff --git a/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll b/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll index 1ea183d4a5f..af5c76fd447 100644 --- a/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll +++ b/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll @@ -183,7 +183,6 @@ for.body: ; preds = %for.end, %for.body. br i1 %cmp433, label %vector.ph, label %for.end vector.ph: ; preds = %for.body - %trip.count.minus.1 = add i32 %i8, -1 %start = call i32 @llvm.start.loop.iterations.i32(i32 %i7) br label %vector.body diff --git a/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll b/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll index 1a370c483ba..015af0b4097 100644 --- a/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll +++ b/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll @@ -11,27 +11,25 @@ define dso_local i32 @mul_reduce_add(i32* noalias nocapture readonly %a, i32* no ; CHECK-NEXT: .LBB0_1: @ %vector.ph ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: adds r3, r2, #3 -; CHECK-NEXT: vmov.i32 q0, #0x0 +; CHECK-NEXT: vmov.i32 q1, #0x0 ; CHECK-NEXT: bic r3, r3, #3 ; CHECK-NEXT: sub.w r12, r3, #4 ; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: add.w r12, r3, r12, lsr #2 -; CHECK-NEXT: movs r3, #0 -; CHECK-NEXT: dls lr, r12 +; CHECK-NEXT: add.w r3, r3, r12, lsr #2 +; CHECK-NEXT: dls lr, r3 ; CHECK-NEXT: .LBB0_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.32 r2 -; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: vpstt -; CHECK-NEXT: vldrwt.u32 q0, [r0], #16 +; CHECK-NEXT: vldrwt.u32 q1, [r0], #16 ; CHECK-NEXT: vldrwt.u32 q2, [r1], #16 -; CHECK-NEXT: adds r3, #4 -; CHECK-NEXT: vmul.i32 q0, q2, q0 ; CHECK-NEXT: subs r2, #4 -; CHECK-NEXT: vadd.i32 q0, q0, q1 +; CHECK-NEXT: vmul.i32 q1, q2, q1 +; CHECK-NEXT: vadd.i32 q1, q1, q0 ; CHECK-NEXT: le lr, .LBB0_2 ; CHECK-NEXT: @ %bb.3: @ %middle.block -; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: vaddv.u32 r0, q0 ; CHECK-NEXT: pop {r7, pc} entry: @@ -41,22 +39,13 @@ entry: vector.ph: ; preds = %entry %n.rnd.up = add i32 %N, 3 %n.vec = and i32 %n.rnd.up, -4 - %trip.count.minus.1 = add i32 %N, -1 - %broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 - %broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %6, %vector.body ] - %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 - %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer - %induction = add <4 x i32> %broadcast.splat, %0 = getelementptr inbounds i32, i32* %a, i32 %index - - ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat12 %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N) - %2 = bitcast i32* %0 to <4 x i32>* %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef) %3 = getelementptr inbounds i32, i32* %b, i32 %index @@ -92,16 +81,14 @@ define dso_local i32 @mul_reduce_add_const(i32* noalias nocapture readonly %a, i ; CHECK-NEXT: bic r1, r1, #3 ; CHECK-NEXT: vmov.i32 q0, #0x0 ; CHECK-NEXT: subs r1, #4 -; CHECK-NEXT: add.w r3, r3, r1, lsr #2 -; CHECK-NEXT: movs r1, #0 -; CHECK-NEXT: dls lr, r3 +; CHECK-NEXT: add.w r1, r3, r1, lsr #2 +; CHECK-NEXT: dls lr, r1 ; CHECK-NEXT: .LBB1_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.32 r2 ; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: vpst ; CHECK-NEXT: vldrwt.u32 q0, [r0], #16 -; CHECK-NEXT: adds r1, #4 ; CHECK-NEXT: subs r2, #4 ; CHECK-NEXT: vadd.i32 q0, q0, q1 ; CHECK-NEXT: le lr, .LBB1_2 @@ -116,22 +103,13 @@ entry: vector.ph: ; preds = %entry %n.rnd.up = add i32 %N, 3 %n.vec = and i32 %n.rnd.up, -4 - %trip.count.minus.1 = add i32 %N, -1 - %broadcast.splatinsert9 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 - %broadcast.splat10 = shufflevector <4 x i32> %broadcast.splatinsert9, <4 x i32> undef, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %3, %vector.body ] - %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 - %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer - %induction = add <4 x i32> %broadcast.splat, %0 = getelementptr inbounds i32, i32* %a, i32 %index - - ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat10 %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N) - %2 = bitcast i32* %0 to <4 x i32>* %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef) %3 = add nsw <4 x i32> %wide.masked.load, %vec.phi @@ -163,16 +141,14 @@ define dso_local i32 @add_reduce_add_const(i32* noalias nocapture readonly %a, i ; CHECK-NEXT: bic r1, r1, #3 ; CHECK-NEXT: vmov.i32 q0, #0x0 ; CHECK-NEXT: subs r1, #4 -; CHECK-NEXT: add.w r3, r3, r1, lsr #2 -; CHECK-NEXT: movs r1, #0 -; CHECK-NEXT: dls lr, r3 +; CHECK-NEXT: add.w r1, r3, r1, lsr #2 +; CHECK-NEXT: dls lr, r1 ; CHECK-NEXT: .LBB2_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.32 r2 ; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: vpst ; CHECK-NEXT: vldrwt.u32 q0, [r0], #16 -; CHECK-NEXT: adds r1, #4 ; CHECK-NEXT: subs r2, #4 ; CHECK-NEXT: vadd.i32 q0, q0, q1 ; CHECK-NEXT: le lr, .LBB2_2 @@ -187,22 +163,13 @@ entry: vector.ph: ; preds = %entry %n.rnd.up = add i32 %N, 3 %n.vec = and i32 %n.rnd.up, -4 - %trip.count.minus.1 = add i32 %N, -1 - %broadcast.splatinsert9 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 - %broadcast.splat10 = shufflevector <4 x i32> %broadcast.splatinsert9, <4 x i32> undef, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %3, %vector.body ] - %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 - %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer - %induction = add <4 x i32> %broadcast.splat, %0 = getelementptr inbounds i32, i32* %a, i32 %index - - ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat10 %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N) - %2 = bitcast i32* %0 to <4 x i32>* %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef) %3 = add nsw <4 x i32> %wide.masked.load, %vec.phi @@ -228,11 +195,9 @@ define dso_local void @vector_mul_const(i32* noalias nocapture %a, i32* noalias ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} ; CHECK-NEXT: .LBB3_1: @ %vector.ph -; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: .LBB3_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: add.w r12, r12, #4 ; CHECK-NEXT: vldrw.u32 q0, [r1], #16 ; CHECK-NEXT: vmul.i32 q0, q0, r2 ; CHECK-NEXT: vstrw.32 q0, [r0], #16 @@ -246,23 +211,14 @@ entry: vector.ph: ; preds = %entry %n.rnd.up = add i32 %N, 3 %n.vec = and i32 %n.rnd.up, -4 - %trip.count.minus.1 = add i32 %N, -1 - %broadcast.splatinsert8 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 - %broadcast.splat9 = shufflevector <4 x i32> %broadcast.splatinsert8, <4 x i32> undef, <4 x i32> zeroinitializer %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %c, i32 0 %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] - %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 - %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer - %induction = add <4 x i32> %broadcast.splat, %0 = getelementptr inbounds i32, i32* %b, i32 %index - - ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat9 %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N) - %2 = bitcast i32* %0 to <4 x i32>* %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef) %3 = mul nsw <4 x i32> %wide.masked.load, %broadcast.splat11 @@ -285,11 +241,9 @@ define dso_local void @vector_add_const(i32* noalias nocapture %a, i32* noalias ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} ; CHECK-NEXT: .LBB4_1: @ %vector.ph -; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: .LBB4_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: add.w r12, r12, #4 ; CHECK-NEXT: vldrw.u32 q0, [r1], #16 ; CHECK-NEXT: vadd.i32 q0, q0, r2 ; CHECK-NEXT: vstrw.32 q0, [r0], #16 @@ -303,23 +257,14 @@ entry: vector.ph: ; preds = %entry %n.rnd.up = add i32 %N, 3 %n.vec = and i32 %n.rnd.up, -4 - %trip.count.minus.1 = add i32 %N, -1 - %broadcast.splatinsert8 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 - %broadcast.splat9 = shufflevector <4 x i32> %broadcast.splatinsert8, <4 x i32> undef, <4 x i32> zeroinitializer %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %c, i32 0 %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] - %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 - %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer - %induction = add <4 x i32> %broadcast.splat, %0 = getelementptr inbounds i32, i32* %b, i32 %index - - ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat9 %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N) - %2 = bitcast i32* %0 to <4 x i32>* %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef) %3 = add nsw <4 x i32> %wide.masked.load, %broadcast.splat11 @@ -342,11 +287,9 @@ define dso_local arm_aapcs_vfpcc void @vector_mul_vector_i8(i8* noalias nocaptur ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} ; CHECK-NEXT: .LBB5_1: @ %vector.ph -; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: dlstp.8 lr, r3 ; CHECK-NEXT: .LBB5_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: add.w r12, r12, #16 ; CHECK-NEXT: vldrb.u8 q0, [r1], #16 ; CHECK-NEXT: vldrb.u8 q1, [r2], #16 ; CHECK-NEXT: vmul.i8 q0, q1, q0 @@ -361,21 +304,12 @@ entry: vector.ph: ; preds = %entry %n.rnd.up = add i32 %N, 15 %n.vec = and i32 %n.rnd.up, -16 - %trip.count.minus.1 = add i32 %N, -1 - %broadcast.splatinsert12 = insertelement <16 x i32> undef, i32 %trip.count.minus.1, i32 0 - %broadcast.splat13 = shufflevector <16 x i32> %broadcast.splatinsert12, <16 x i32> undef, <16 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] - %broadcast.splatinsert = insertelement <16 x i32> undef, i32 %index, i32 0 - %broadcast.splat = shufflevector <16 x i32> %broadcast.splatinsert, <16 x i32> undef, <16 x i32> zeroinitializer - %induction = add <16 x i32> %broadcast.splat, %0 = getelementptr inbounds i8, i8* %b, i32 %index - - ; %1 = icmp ule <16 x i32> %induction, %broadcast.splat13 %1 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %N) - %2 = bitcast i8* %0 to <16 x i8>* %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %2, i32 1, <16 x i1> %1, <16 x i8> undef) %3 = getelementptr inbounds i8, i8* %c, i32 %index @@ -402,11 +336,9 @@ define dso_local arm_aapcs_vfpcc void @vector_mul_vector_i16(i16* noalias nocapt ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} ; CHECK-NEXT: .LBB6_1: @ %vector.ph -; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: dlstp.16 lr, r3 ; CHECK-NEXT: .LBB6_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: add.w r12, r12, #8 ; CHECK-NEXT: vldrh.u16 q0, [r1], #16 ; CHECK-NEXT: vldrh.u16 q1, [r2], #16 ; CHECK-NEXT: vmul.i16 q0, q1, q0 @@ -421,21 +353,12 @@ entry: vector.ph: ; preds = %entry %n.rnd.up = add i32 %N, 7 %n.vec = and i32 %n.rnd.up, -8 - %trip.count.minus.1 = add i32 %N, -1 - %broadcast.splatinsert12 = insertelement <8 x i32> undef, i32 %trip.count.minus.1, i32 0 - %broadcast.splat13 = shufflevector <8 x i32> %broadcast.splatinsert12, <8 x i32> undef, <8 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] - %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0 - %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer - %induction = add <8 x i32> %broadcast.splat, %0 = getelementptr inbounds i16, i16* %b, i32 %index - - ; %1 = icmp ule <8 x i32> %induction, %broadcast.splat13 %1 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N) - %2 = bitcast i16* %0 to <8 x i16>* %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %2, i32 2, <8 x i1> %1, <8 x i16> undef) %3 = getelementptr inbounds i16, i16* %c, i32 %index diff --git a/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll b/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll index e8da32611be..ec6a7554b3e 100644 --- a/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll +++ b/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll @@ -30,9 +30,6 @@ entry: br i1 %cmp8, label %for.cond.cleanup, label %vector.ph vector.ph: ; preds = %entry - %trip.count.minus.1 = add i32 %N, -1 - %broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 - %broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer %start = call i32 @llvm.start.loop.iterations.i32(i32 %5) br label %vector.body @@ -44,13 +41,7 @@ vector.body: ; preds = %vector.body, %vecto %6 = phi i32 [ %start, %vector.ph ], [ %10, %vector.body ] %lsr.iv24 = bitcast i32* %lsr.iv2 to <4 x i32>* %lsr.iv1 = bitcast i32* %lsr.iv to <4 x i32>* - %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 - %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer - %induction = add <4 x i32> %broadcast.splat, - - ; %7 = icmp ule <4 x i32> %induction, %broadcast.splat12 %7 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N) - %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv24, i32 4, <4 x i1> %7, <4 x i32> undef) %wide.masked.load13 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1, i32 4, <4 x i1> %7, <4 x i32> undef) %8 = mul nsw <4 x i32> %wide.masked.load13, %wide.masked.load diff --git a/test/CodeGen/Thumb2/mve-fma-loops.ll b/test/CodeGen/Thumb2/mve-fma-loops.ll index 7609c16ea84..a34a278103a 100644 --- a/test/CodeGen/Thumb2/mve-fma-loops.ll +++ b/test/CodeGen/Thumb2/mve-fma-loops.ll @@ -11,11 +11,9 @@ define arm_aapcs_vfpcc void @fmas1(float* nocapture readonly %x, float* nocaptur ; CHECK-NEXT: poplt {r4, pc} ; CHECK-NEXT: .LBB0_1: @ %vector.ph ; CHECK-NEXT: vmov r12, s0 -; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: .LBB0_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: adds r4, #4 ; CHECK-NEXT: vldrw.u32 q0, [r1], #16 ; CHECK-NEXT: vldrw.u32 q1, [r0], #16 ; CHECK-NEXT: vfmas.f32 q1, q0, r12 @@ -30,23 +28,14 @@ entry: vector.ph: ; preds = %entry %n.rnd.up = add i32 %n, 3 %n.vec = and i32 %n.rnd.up, -4 - %trip.count.minus.1 = add i32 %n, -1 - %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 - %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer %broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0 %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] - %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 - %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer - %induction = or <4 x i32> %broadcast.splat, %0 = getelementptr inbounds float, float* %x, i32 %index - - ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n) - %2 = bitcast float* %0 to <4 x float>* %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef) %3 = getelementptr inbounds float, float* %y, i32 %index @@ -74,11 +63,9 @@ define arm_aapcs_vfpcc void @fmas2(float* nocapture readonly %x, float* nocaptur ; CHECK-NEXT: poplt {r4, pc} ; CHECK-NEXT: .LBB1_1: @ %vector.ph ; CHECK-NEXT: vmov r12, s0 -; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: .LBB1_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: adds r4, #4 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 ; CHECK-NEXT: vldrw.u32 q1, [r1], #16 ; CHECK-NEXT: vfmas.f32 q1, q0, r12 @@ -93,23 +80,14 @@ entry: vector.ph: ; preds = %entry %n.rnd.up = add i32 %n, 3 %n.vec = and i32 %n.rnd.up, -4 - %trip.count.minus.1 = add i32 %n, -1 - %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 - %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer %broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0 %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] - %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 - %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer - %induction = or <4 x i32> %broadcast.splat, %0 = getelementptr inbounds float, float* %x, i32 %index - - ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n) - %2 = bitcast float* %0 to <4 x float>* %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef) %3 = getelementptr inbounds float, float* %y, i32 %index @@ -138,11 +116,9 @@ define arm_aapcs_vfpcc void @fma1(float* nocapture readonly %x, float* nocapture ; CHECK-NEXT: poplt {r4, pc} ; CHECK-NEXT: .LBB2_1: @ %vector.ph ; CHECK-NEXT: vmov r12, s0 -; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: .LBB2_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: adds r4, #4 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 ; CHECK-NEXT: vldrw.u32 q1, [r1], #16 ; CHECK-NEXT: vfma.f32 q1, q0, r12 @@ -157,23 +133,14 @@ entry: vector.ph: ; preds = %entry %n.rnd.up = add i32 %n, 3 %n.vec = and i32 %n.rnd.up, -4 - %trip.count.minus.1 = add i32 %n, -1 - %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 - %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer %broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0 %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] - %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 - %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer - %induction = or <4 x i32> %broadcast.splat, %0 = getelementptr inbounds float, float* %x, i32 %index - - ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n) - %2 = bitcast float* %0 to <4 x float>* %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef) %3 = getelementptr inbounds float, float* %y, i32 %index @@ -201,11 +168,9 @@ define arm_aapcs_vfpcc void @fma2(float* nocapture readonly %x, float* nocapture ; CHECK-NEXT: poplt {r4, pc} ; CHECK-NEXT: .LBB3_1: @ %vector.ph ; CHECK-NEXT: vmov r12, s0 -; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: .LBB3_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: adds r4, #4 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 ; CHECK-NEXT: vldrw.u32 q1, [r1], #16 ; CHECK-NEXT: vfma.f32 q1, q0, r12 @@ -220,23 +185,14 @@ entry: vector.ph: ; preds = %entry %n.rnd.up = add i32 %n, 3 %n.vec = and i32 %n.rnd.up, -4 - %trip.count.minus.1 = add i32 %n, -1 - %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 - %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer %broadcast.splatinsert12 = insertelement <4 x float> undef, float %a, i32 0 %broadcast.splat13 = shufflevector <4 x float> %broadcast.splatinsert12, <4 x float> undef, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] - %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 - %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer - %induction = or <4 x i32> %broadcast.splat, %0 = getelementptr inbounds float, float* %x, i32 %index - - ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n) - %2 = bitcast float* %0 to <4 x float>* %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef) %3 = fmul fast <4 x float> %wide.masked.load, %broadcast.splat13 @@ -265,12 +221,10 @@ define arm_aapcs_vfpcc void @fmss1(float* nocapture readonly %x, float* nocaptur ; CHECK-NEXT: poplt {r4, pc} ; CHECK-NEXT: .LBB4_1: @ %vector.ph ; CHECK-NEXT: vmov r12, s0 -; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: eor r12, r12, #-2147483648 ; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: .LBB4_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: adds r4, #4 ; CHECK-NEXT: vldrw.u32 q0, [r1], #16 ; CHECK-NEXT: vldrw.u32 q1, [r0], #16 ; CHECK-NEXT: vfmas.f32 q1, q0, r12 @@ -286,23 +240,14 @@ vector.ph: ; preds = %entry %fneg = fneg fast float %a %n.rnd.up = add i32 %n, 3 %n.vec = and i32 %n.rnd.up, -4 - %trip.count.minus.1 = add i32 %n, -1 - %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 - %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer %broadcast.splatinsert13 = insertelement <4 x float> undef, float %fneg, i32 0 %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] - %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 - %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer - %induction = or <4 x i32> %broadcast.splat, %0 = getelementptr inbounds float, float* %x, i32 %index - - ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n) - %2 = bitcast float* %0 to <4 x float>* %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef) %3 = getelementptr inbounds float, float* %y, i32 %index @@ -329,14 +274,12 @@ define arm_aapcs_vfpcc void @fmss2(float* nocapture readonly %x, float* nocaptur ; CHECK-NEXT: it lt ; CHECK-NEXT: poplt {r4, pc} ; CHECK-NEXT: .LBB5_1: @ %vector.ph -; CHECK-NEXT: vmov lr, s0 -; CHECK-NEXT: vdup.32 q0, lr +; CHECK-NEXT: vmov r12, s0 +; CHECK-NEXT: vdup.32 q0, r12 ; CHECK-NEXT: vneg.f32 q0, q0 -; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: .LBB5_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: add.w r12, r12, #4 ; CHECK-NEXT: vmov q3, q0 ; CHECK-NEXT: vldrw.u32 q1, [r0], #16 ; CHECK-NEXT: vldrw.u32 q2, [r1], #16 @@ -352,23 +295,14 @@ entry: vector.ph: ; preds = %entry %n.rnd.up = add i32 %n, 3 %n.vec = and i32 %n.rnd.up, -4 - %trip.count.minus.1 = add i32 %n, -1 - %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 - %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer %broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0 %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] - %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 - %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer - %induction = or <4 x i32> %broadcast.splat, %0 = getelementptr inbounds float, float* %x, i32 %index - - ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n) - %2 = bitcast float* %0 to <4 x float>* %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef) %3 = getelementptr inbounds float, float* %y, i32 %index @@ -398,11 +332,9 @@ define arm_aapcs_vfpcc void @fmss3(float* nocapture readonly %x, float* nocaptur ; CHECK-NEXT: .LBB6_1: @ %vector.ph ; CHECK-NEXT: vmov r4, s0 ; CHECK-NEXT: vdup.32 q0, r4 -; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: .LBB6_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: add.w r12, r12, #4 ; CHECK-NEXT: vmov q3, q0 ; CHECK-NEXT: vldrw.u32 q1, [r0], #16 ; CHECK-NEXT: vldrw.u32 q2, [r1], #16 @@ -418,23 +350,14 @@ entry: vector.ph: ; preds = %entry %n.rnd.up = add i32 %n, 3 %n.vec = and i32 %n.rnd.up, -4 - %trip.count.minus.1 = add i32 %n, -1 - %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 - %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer %broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0 %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] - %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 - %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer - %induction = or <4 x i32> %broadcast.splat, %0 = getelementptr inbounds float, float* %x, i32 %index - - ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n) - %2 = bitcast float* %0 to <4 x float>* %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef) %3 = getelementptr inbounds float, float* %y, i32 %index @@ -464,11 +387,9 @@ define arm_aapcs_vfpcc void @fmss4(float* nocapture readonly %x, float* nocaptur ; CHECK-NEXT: .LBB7_1: @ %vector.ph ; CHECK-NEXT: vmov r4, s0 ; CHECK-NEXT: vdup.32 q0, r4 -; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: .LBB7_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: add.w r12, r12, #4 ; CHECK-NEXT: vmov q3, q0 ; CHECK-NEXT: vldrw.u32 q1, [r0], #16 ; CHECK-NEXT: vldrw.u32 q2, [r1], #16 @@ -484,23 +405,14 @@ entry: vector.ph: ; preds = %entry %n.rnd.up = add i32 %n, 3 %n.vec = and i32 %n.rnd.up, -4 - %trip.count.minus.1 = add i32 %n, -1 - %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 - %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer %broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0 %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] - %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 - %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer - %induction = or <4 x i32> %broadcast.splat, %0 = getelementptr inbounds float, float* %x, i32 %index - - ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n) - %2 = bitcast float* %0 to <4 x float>* %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef) %3 = getelementptr inbounds float, float* %y, i32 %index @@ -529,12 +441,10 @@ define arm_aapcs_vfpcc void @fms1(float* nocapture readonly %x, float* nocapture ; CHECK-NEXT: poplt {r4, pc} ; CHECK-NEXT: .LBB8_1: @ %vector.ph ; CHECK-NEXT: vmov r12, s0 -; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: eor r12, r12, #-2147483648 ; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: .LBB8_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: adds r4, #4 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 ; CHECK-NEXT: vldrw.u32 q1, [r1], #16 ; CHECK-NEXT: vfma.f32 q1, q0, r12 @@ -550,23 +460,14 @@ vector.ph: ; preds = %entry %fneg = fneg fast float %a %n.rnd.up = add i32 %n, 3 %n.vec = and i32 %n.rnd.up, -4 - %trip.count.minus.1 = add i32 %n, -1 - %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 - %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer %broadcast.splatinsert13 = insertelement <4 x float> undef, float %fneg, i32 0 %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] - %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 - %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer - %induction = or <4 x i32> %broadcast.splat, %0 = getelementptr inbounds float, float* %x, i32 %index - - ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n) - %2 = bitcast float* %0 to <4 x float>* %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef) %3 = getelementptr inbounds float, float* %y, i32 %index @@ -595,11 +496,9 @@ define arm_aapcs_vfpcc void @fms2(float* nocapture readonly %x, float* nocapture ; CHECK-NEXT: .LBB9_1: @ %vector.ph ; CHECK-NEXT: vmov r4, s0 ; CHECK-NEXT: vdup.32 q0, r4 -; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: .LBB9_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: add.w r12, r12, #4 ; CHECK-NEXT: vldrw.u32 q1, [r0], #16 ; CHECK-NEXT: vldrw.u32 q2, [r1], #16 ; CHECK-NEXT: vfms.f32 q2, q1, q0 @@ -614,23 +513,14 @@ entry: vector.ph: ; preds = %entry %n.rnd.up = add i32 %n, 3 %n.vec = and i32 %n.rnd.up, -4 - %trip.count.minus.1 = add i32 %n, -1 - %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 - %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer %broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0 %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] - %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 - %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer - %induction = or <4 x i32> %broadcast.splat, %0 = getelementptr inbounds float, float* %x, i32 %index - - ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n) - %2 = bitcast float* %0 to <4 x float>* %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef) %3 = getelementptr inbounds float, float* %y, i32 %index @@ -659,16 +549,14 @@ define arm_aapcs_vfpcc void @fms3(float* nocapture readonly %x, float* nocapture ; CHECK-NEXT: poplt {r4, pc} ; CHECK-NEXT: .LBB10_1: @ %vector.ph ; CHECK-NEXT: vmov r12, s0 -; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: .LBB10_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q0, [r0], #16 -; CHECK-NEXT: vldrw.u32 q1, [r1], #16 -; CHECK-NEXT: adds r4, #4 -; CHECK-NEXT: vneg.f32 q1, q1 -; CHECK-NEXT: vfma.f32 q1, q0, r12 -; CHECK-NEXT: vstrw.32 q1, [r2], #16 +; CHECK-NEXT: vldrw.u32 q0, [r1], #16 +; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: vneg.f32 q0, q0 +; CHECK-NEXT: vfma.f32 q0, q1, r12 +; CHECK-NEXT: vstrw.32 q0, [r2], #16 ; CHECK-NEXT: letp lr, .LBB10_2 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-NEXT: pop {r4, pc} @@ -679,23 +567,14 @@ entry: vector.ph: ; preds = %entry %n.rnd.up = add i32 %n, 3 %n.vec = and i32 %n.rnd.up, -4 - %trip.count.minus.1 = add i32 %n, -1 - %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 - %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer %broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0 %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] - %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 - %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer - %induction = or <4 x i32> %broadcast.splat, %0 = getelementptr inbounds float, float* %x, i32 %index - - ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n) - %2 = bitcast float* %0 to <4 x float>* %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef) %3 = getelementptr inbounds float, float* %y, i32 %index @@ -724,16 +603,14 @@ define arm_aapcs_vfpcc void @fms4(float* nocapture readonly %x, float* nocapture ; CHECK-NEXT: poplt {r4, pc} ; CHECK-NEXT: .LBB11_1: @ %vector.ph ; CHECK-NEXT: vmov r12, s0 -; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: .LBB11_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q0, [r0], #16 -; CHECK-NEXT: vldrw.u32 q1, [r1], #16 -; CHECK-NEXT: adds r4, #4 -; CHECK-NEXT: vneg.f32 q1, q1 -; CHECK-NEXT: vfma.f32 q1, q0, r12 -; CHECK-NEXT: vstrw.32 q1, [r2], #16 +; CHECK-NEXT: vldrw.u32 q0, [r1], #16 +; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: vneg.f32 q0, q0 +; CHECK-NEXT: vfma.f32 q0, q1, r12 +; CHECK-NEXT: vstrw.32 q0, [r2], #16 ; CHECK-NEXT: letp lr, .LBB11_2 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-NEXT: pop {r4, pc} @@ -744,23 +621,14 @@ entry: vector.ph: ; preds = %entry %n.rnd.up = add i32 %n, 3 %n.vec = and i32 %n.rnd.up, -4 - %trip.count.minus.1 = add i32 %n, -1 - %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 - %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer %broadcast.splatinsert12 = insertelement <4 x float> undef, float %a, i32 0 %broadcast.splat13 = shufflevector <4 x float> %broadcast.splatinsert12, <4 x float> undef, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] - %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 - %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer - %induction = or <4 x i32> %broadcast.splat, %0 = getelementptr inbounds float, float* %x, i32 %index - - ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n) - %2 = bitcast float* %0 to <4 x float>* %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef) %3 = fmul fast <4 x float> %wide.masked.load, %broadcast.splat13 diff --git a/test/CodeGen/Thumb2/mve-gather-increment.ll b/test/CodeGen/Thumb2/mve-gather-increment.ll index eb088263117..13d47becdda 100644 --- a/test/CodeGen/Thumb2/mve-gather-increment.ll +++ b/test/CodeGen/Thumb2/mve-gather-increment.ll @@ -232,7 +232,6 @@ define arm_aapcs_vfpcc void @gather_pre_inc(i32* noalias nocapture readonly %dat ; CHECK-NEXT: .long 4294967272 @ 0xffffffe8 ; CHECK-NEXT: .long 0 @ 0x0 vector.ph: ; preds = %for.body.preheader - %ind.end = shl i32 %n.vec, 1 br label %vector.body vector.body: ; preds = %vector.body, %vector.ph @@ -276,7 +275,6 @@ define arm_aapcs_vfpcc void @gather_post_inc(i32* noalias nocapture readonly %da ; CHECK-NEXT: .long 4294967248 @ 0xffffffd0 ; CHECK-NEXT: .long 4294967272 @ 0xffffffe8 vector.ph41: ; preds = %for.body6.preheader - %ind.end47 = shl i32 %n.vec43, 1 br label %vector.body39 vector.body39: ; preds = %vector.body39, %vector.ph41 diff --git a/test/CodeGen/Thumb2/mve-gather-optimisation-deep.ll b/test/CodeGen/Thumb2/mve-gather-optimisation-deep.ll index 4c5bcd836c3..83f788df61c 100644 --- a/test/CodeGen/Thumb2/mve-gather-optimisation-deep.ll +++ b/test/CodeGen/Thumb2/mve-gather-optimisation-deep.ll @@ -1,18 +1,15 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py - - -; RUN: opt --arm-mve-gather-scatter-lowering -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp %s -S -o 2>/dev/null - | FileCheck %s +; RUN: opt --arm-mve-gather-scatter-lowering -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp %s -S -o - | FileCheck %s define arm_aapcs_vfpcc void @push_out_add_sub_block(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, i32 %n.vec) { ; CHECK-LABEL: @push_out_add_sub_block( ; CHECK-NEXT: vector.ph: -; CHECK-NEXT: [[IND_END:%.*]] = shl i32 [[N_VEC:%.*]], 1 ; CHECK-NEXT: [[PUSHEDOUTADD:%.*]] = add <4 x i32> , ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY_END:%.*]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ [[PUSHEDOUTADD]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY_END]] ] -; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i32 [[INDEX]], 50 +; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i32 [[INDEX]], 48 ; CHECK-NEXT: br i1 [[TMP0]], label [[LOWER_BLOCK:%.*]], label [[END:%.*]] ; CHECK: lower.block: ; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i32.v4i32(i32* [[DATA:%.*]], <4 x i32> [[VEC_IND]], i32 32, i32 2, i32 1) @@ -23,20 +20,19 @@ define arm_aapcs_vfpcc void @push_out_add_sub_block(i32* noalias nocapture reado ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], ; CHECK-NEXT: br label [[VECTOR_BODY_END]] ; CHECK: vector.body.end: -; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC:%.*]] ; CHECK-NEXT: br i1 [[TMP4]], label [[END]], label [[VECTOR_BODY]] ; CHECK: end: ; CHECK-NEXT: ret void ; vector.ph: - %ind.end = shl i32 %n.vec, 1 br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body.end ] %vec.ind = phi <4 x i32> [ , %vector.ph ], [ %vec.ind.next, %vector.body.end ] - %0 = icmp eq i32 %index, 50 + %0 = icmp eq i32 %index, 48 br i1 %0, label %lower.block, label %end lower.block: ; preds = %vector.body @@ -61,7 +57,6 @@ end: define arm_aapcs_vfpcc void @push_out_mul_sub_block(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, i32 %n.vec) { ; CHECK-LABEL: @push_out_mul_sub_block( ; CHECK-NEXT: vector.ph: -; CHECK-NEXT: [[IND_END:%.*]] = shl i32 [[N_VEC:%.*]], 1 ; CHECK-NEXT: [[PUSHEDOUTMUL:%.*]] = mul <4 x i32> , ; CHECK-NEXT: [[PRODUCT:%.*]] = mul <4 x i32> , ; CHECK-NEXT: [[PUSHEDOUTADD:%.*]] = add <4 x i32> [[PUSHEDOUTMUL]], @@ -69,7 +64,7 @@ define arm_aapcs_vfpcc void @push_out_mul_sub_block(i32* noalias nocapture reado ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY_END:%.*]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ [[PUSHEDOUTADD]], [[VECTOR_PH]] ], [ [[INCREMENTPUSHEDOUTMUL:%.*]], [[VECTOR_BODY_END]] ] -; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i32 [[INDEX]], 50 +; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i32 [[INDEX]], 48 ; CHECK-NEXT: br i1 [[TMP0]], label [[LOWER_BLOCK:%.*]], label [[END:%.*]] ; CHECK: lower.block: ; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i32.v4i32(i32* [[DATA:%.*]], <4 x i32> [[VEC_IND]], i32 32, i32 2, i32 1) @@ -80,20 +75,19 @@ define arm_aapcs_vfpcc void @push_out_mul_sub_block(i32* noalias nocapture reado ; CHECK-NEXT: br label [[VECTOR_BODY_END]] ; CHECK: vector.body.end: ; CHECK-NEXT: [[INCREMENTPUSHEDOUTMUL]] = add <4 x i32> [[VEC_IND]], [[PRODUCT]] -; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC:%.*]] ; CHECK-NEXT: br i1 [[TMP4]], label [[END]], label [[VECTOR_BODY]] ; CHECK: end: ; CHECK-NEXT: ret void ; vector.ph: - %ind.end = shl i32 %n.vec, 1 br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body.end ] %vec.ind = phi <4 x i32> [ , %vector.ph ], [ %vec.ind.next, %vector.body.end ] - %0 = icmp eq i32 %index, 50 + %0 = icmp eq i32 %index, 48 br i1 %0, label %lower.block, label %end lower.block: ; preds = %vector.body @@ -120,7 +114,6 @@ end: define arm_aapcs_vfpcc void @push_out_mul_sub_loop(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, i32 %n.vec) { ; CHECK-LABEL: @push_out_mul_sub_loop( ; CHECK-NEXT: vector.ph: -; CHECK-NEXT: [[IND_END:%.*]] = shl i32 [[N_VEC:%.*]], 2 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY_END:%.*]] ] @@ -138,19 +131,18 @@ define arm_aapcs_vfpcc void @push_out_mul_sub_loop(i32* noalias nocapture readon ; CHECK-NEXT: br label [[VECTOR_2_BODY_END:%.*]] ; CHECK: vector.2.body.end: ; CHECK-NEXT: [[INDEX_2_NEXT:%.*]] = add i32 [[INDEX]], 4 -; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_2_NEXT]], 15 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_2_NEXT]], 16 ; CHECK-NEXT: br i1 [[TMP5]], label [[VECTOR_BODY_END]], label [[VECTOR_2_BODY]] ; CHECK: vector.body.end: ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], -; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC:%.*]] ; CHECK-NEXT: br i1 [[TMP6]], label [[END:%.*]], label [[VECTOR_BODY]] ; CHECK: end: ; CHECK-NEXT: ret void ; vector.ph: - %ind.end = shl i32 %n.vec, 2 br label %vector.body vector.body: ; preds = %vector.body, %vector.ph @@ -162,7 +154,6 @@ vector.2.ph: br label %vector.2.body vector.2.body: ; preds = %vector.body - %index.2 = phi i32 [ 0, %vector.2.ph ], [ %index.2.next, %vector.2.body.end ] %0 = mul <4 x i32> %vec.ind, %1 = add <4 x i32> %0, %2 = getelementptr inbounds i32, i32* %data, <4 x i32> %1 @@ -174,7 +165,7 @@ vector.2.body: ; preds = %vector.body vector.2.body.end: ; preds = %lower.block %index.2.next = add i32 %index, 4 - %5 = icmp eq i32 %index.2.next, 15 + %5 = icmp eq i32 %index.2.next, 16 br i1 %5, label %vector.body.end, label %vector.2.body vector.body.end: ; preds = %lower.block diff --git a/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll b/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll index d6e505e1707..32a183cdac1 100644 --- a/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll +++ b/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll @@ -39,7 +39,6 @@ define arm_aapcs_vfpcc void @push_out_mul_gather(i32* noalias nocapture readonly ; CHECK-NEXT: .long 4294967272 @ 0xffffffe8 vector.ph: ; preds = %for.body.preheader - %ind.end = shl i32 %n.vec, 1 br label %vector.body vector.body: ; preds = %vector.body, %vector.ph @@ -83,7 +82,6 @@ define arm_aapcs_vfpcc void @push_out_add_gather(i32* noalias nocapture readonly ; CHECK-NEXT: .long 16 @ 0x10 vector.ph: ; preds = %for.body.preheader - %ind.end = shl i32 %n.vec, 1 br label %vector.body vector.body: ; preds = %vector.body, %vector.ph @@ -127,7 +125,6 @@ define arm_aapcs_vfpcc void @push_out_mul_add_gather(i32* noalias nocapture read ; CHECK-NEXT: .long 0 @ 0x0 vector.ph: ; preds = %for.body.preheader - %ind.end = shl i32 %n.vec, 1 br label %vector.body vector.body: ; preds = %vector.body, %vector.ph @@ -173,7 +170,6 @@ define arm_aapcs_vfpcc void @push_out_mul_scatter(i32* noalias nocapture readonl <4 x i32> %to.store) { vector.ph: ; preds = %for.body.preheader - %ind.end = shl i32 %n.vec, 1 br label %vector.body vector.body: ; preds = %vector.body, %vector.ph @@ -215,7 +211,6 @@ define arm_aapcs_vfpcc void @push_out_add_scatter(i32* noalias nocapture readonl <4 x i32> %to.store) { vector.ph: ; preds = %for.body.preheader - %ind.end = shl i32 %n.vec, 1 br label %vector.body vector.body: ; preds = %vector.body, %vector.ph @@ -259,7 +254,6 @@ define arm_aapcs_vfpcc void @push_out_mul_gather_scatter(i32* noalias nocapture i32* noalias nocapture %dst, i32 %n.vec) { vector.ph: ; preds = %for.body.preheader - %ind.end = shl i32 %n.vec, 1 br label %vector.body vector.body: ; preds = %vector.body, %vector.ph @@ -301,7 +295,6 @@ define arm_aapcs_vfpcc void @push_out_add_sub_block(i32* noalias nocapture reado ; CHECK-NEXT: .long 16 @ 0x10 vector.ph: ; preds = %for.body.preheader - %ind.end = shl i32 %n.vec, 1 br label %vector.body vector.body: ; preds = %vector.body, %vector.ph @@ -358,7 +351,6 @@ define arm_aapcs_vfpcc void @non_gatscat_use1(i32* noalias nocapture readonly %d ; CHECK-NEXT: .long 6 @ 0x6 vector.ph: ; preds = %for.body.preheader - %ind.end = shl i32 %n.vec, 1 br label %vector.body vector.body: ; preds = %vector.body, %vector.ph @@ -411,7 +403,6 @@ define arm_aapcs_vfpcc void @non_gatscat_use2(i32* noalias nocapture readonly %d ; CHECK-NEXT: .long 6 @ 0x6 vector.ph: ; preds = %for.body.preheader - %ind.end = shl i32 %n.vec, 1 br label %vector.body vector.body: ; preds = %vector.body, %vector.ph @@ -528,7 +519,6 @@ for.cond8.preheader.us.us.preheader.preheader: ; preds = %entry %2 = add nuw i32 %1, 1 %min.iters.check = icmp ult i32 %0, 6 %n.vec = and i32 %2, -4 - %ind.end = shl i32 %n.vec, 1 %broadcast.splatinsert86 = insertelement <4 x i32> undef, i32 %m, i32 0 %broadcast.splat87 = shufflevector <4 x i32> %broadcast.splatinsert86, <4 x i32> undef, <4 x i32> zeroinitializer %cmp.n = icmp eq i32 %2, %n.vec @@ -983,7 +973,6 @@ for.body10.i: ; preds = %for.cond.cleanup20. br i1 0, label %for.cond.cleanup20.i, label %for.cond22.preheader.lr.ph.i for.cond22.preheader.lr.ph.i: ; preds = %for.body10.i - %ind.end = add nsw i32 0, %n.vec %.splatinsert = insertelement <4 x i32> undef, i32 0, i32 0 %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %.splat, diff --git a/test/CodeGen/Thumb2/mve-vecreduce-loops.ll b/test/CodeGen/Thumb2/mve-vecreduce-loops.ll index e7886dca32f..7bcc0193217 100644 --- a/test/CodeGen/Thumb2/mve-vecreduce-loops.ll +++ b/test/CodeGen/Thumb2/mve-vecreduce-loops.ll @@ -1667,7 +1667,6 @@ entry: vector.ph: ; preds = %entry %n.rnd.up = add i32 %n, 3 %n.vec = and i32 %n.rnd.up, -4 - %trip.count.minus.1 = add i32 %n, -1 br label %vector.body vector.body: ; preds = %vector.body, %vector.ph @@ -1718,7 +1717,6 @@ entry: vector.ph: ; preds = %entry %n.rnd.up = add i32 %n, 3 %n.vec = and i32 %n.rnd.up, -4 - %trip.count.minus.1 = add i32 %n, -1 br label %vector.body vector.body: ; preds = %vector.body, %vector.ph @@ -1772,7 +1770,6 @@ entry: vector.ph: ; preds = %entry %n.rnd.up = add i32 %n, 7 %n.vec = and i32 %n.rnd.up, -8 - %trip.count.minus.1 = add i32 %n, -1 br label %vector.body vector.body: ; preds = %vector.body, %vector.ph @@ -1824,7 +1821,6 @@ entry: vector.ph: ; preds = %entry %n.rnd.up = add i32 %n, 7 %n.vec = and i32 %n.rnd.up, -8 - %trip.count.minus.1 = add i32 %n, -1 br label %vector.body vector.body: ; preds = %vector.body, %vector.ph @@ -1880,7 +1876,6 @@ entry: vector.ph: ; preds = %entry %n.rnd.up = add i32 %n, 15 %n.vec = and i32 %n.rnd.up, -16 - %trip.count.minus.1 = add i32 %n, -1 br label %vector.body vector.body: ; preds = %vector.body, %vector.ph @@ -1932,7 +1927,6 @@ entry: vector.ph: ; preds = %entry %n.rnd.up = add i32 %n, 15 %n.vec = and i32 %n.rnd.up, -16 - %trip.count.minus.1 = add i32 %n, -1 br label %vector.body vector.body: ; preds = %vector.body, %vector.ph @@ -1988,7 +1982,6 @@ entry: vector.ph: ; preds = %entry %n.rnd.up = add i32 %n, 7 %n.vec = and i32 %n.rnd.up, -8 - %trip.count.minus.1 = add i32 %n, -1 br label %vector.body vector.body: ; preds = %vector.body, %vector.ph @@ -2039,7 +2032,6 @@ entry: vector.ph: ; preds = %entry %n.rnd.up = add i32 %n, 7 %n.vec = and i32 %n.rnd.up, -8 - %trip.count.minus.1 = add i32 %n, -1 br label %vector.body vector.body: ; preds = %vector.body, %vector.ph @@ -2093,7 +2085,6 @@ entry: vector.ph: ; preds = %entry %n.rnd.up = add i32 %n, 15 %n.vec = and i32 %n.rnd.up, -16 - %trip.count.minus.1 = add i32 %n, -1 br label %vector.body vector.body: ; preds = %vector.body, %vector.ph @@ -2145,7 +2136,6 @@ entry: vector.ph: ; preds = %entry %n.rnd.up = add i32 %n, 15 %n.vec = and i32 %n.rnd.up, -16 - %trip.count.minus.1 = add i32 %n, -1 br label %vector.body vector.body: ; preds = %vector.body, %vector.ph @@ -2201,7 +2191,6 @@ entry: vector.ph: ; preds = %entry %n.rnd.up = add i32 %n, 15 %n.vec = and i32 %n.rnd.up, -16 - %trip.count.minus.1 = add i32 %n, -1 br label %vector.body vector.body: ; preds = %vector.body, %vector.ph @@ -2252,7 +2241,6 @@ entry: vector.ph: ; preds = %entry %n.rnd.up = add i32 %n, 15 %n.vec = and i32 %n.rnd.up, -16 - %trip.count.minus.1 = add i32 %n, -1 br label %vector.body vector.body: ; preds = %vector.body, %vector.ph @@ -2308,7 +2296,6 @@ entry: vector.ph: ; preds = %entry %n.rnd.up = add i32 %n, 3 %n.vec = and i32 %n.rnd.up, -4 - %trip.count.minus.1 = add i32 %n, -1 br label %vector.body vector.body: ; preds = %vector.body, %vector.ph @@ -2362,7 +2349,6 @@ entry: vector.ph: ; preds = %entry %n.rnd.up = add i32 %n, 3 %n.vec = and i32 %n.rnd.up, -4 - %trip.count.minus.1 = add i32 %n, -1 br label %vector.body vector.body: ; preds = %vector.body, %vector.ph @@ -2421,7 +2407,6 @@ entry: vector.ph: ; preds = %entry %n.rnd.up = add i32 %n, 7 %n.vec = and i32 %n.rnd.up, -8 - %trip.count.minus.1 = add i32 %n, -1 br label %vector.body vector.body: ; preds = %vector.body, %vector.ph