1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2025-01-31 20:51:52 +01:00

[ARM] Clean up some tests, removing dead instructions. NFC

This commit is contained in:
David Green 2021-05-22 13:38:00 +01:00
parent 99610346a2
commit aca1951bd7
23 changed files with 62 additions and 756 deletions

View File

@ -47,22 +47,13 @@ entry:
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %N, 3
%n.vec = and i32 %n.rnd.up, -4
%trip.count.minus.1 = add i32 %N, -1
%broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %add, %vector.body ]
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
%tmp = getelementptr inbounds i32, i32* %a, i32 %index
; %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat12
%tmp1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
%tmp2 = bitcast i32* %tmp to <4 x i32>*
%wide.masked.load.a = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
%tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
@ -145,22 +136,13 @@ entry:
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %N, 3
%n.vec = and i32 %n.rnd.up, -4
%trip.count.minus.1 = add i32 %N, -1
%broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %add, %vector.body ]
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
%tmp = getelementptr inbounds i32, i32* %a, i32 %index
; %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat12
%tmp1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
%tmp2 = bitcast i32* %tmp to <4 x i32>*
%wide.masked.load.a = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
%tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
@ -203,13 +185,13 @@ define dso_local i32 @and_mul_reduce_add(i32* noalias nocapture readonly %a, i32
; CHECK-NEXT: cmp.w r12, #0
; CHECK-NEXT: beq .LBB2_4
; CHECK-NEXT: @ %bb.1: @ %vector.ph
; CHECK-NEXT: add.w r4, r12, #3
; CHECK-NEXT: vmov.i32 q1, #0x0
; CHECK-NEXT: bic r4, r4, #3
; CHECK-NEXT: sub.w lr, r4, #4
; CHECK-NEXT: add.w lr, r12, #3
; CHECK-NEXT: movs r4, #1
; CHECK-NEXT: add.w lr, r4, lr, lsr #2
; CHECK-NEXT: movs r4, #0
; CHECK-NEXT: bic lr, lr, #3
; CHECK-NEXT: vmov.i32 q1, #0x0
; CHECK-NEXT: sub.w lr, lr, #4
; CHECK-NEXT: add.w r4, r4, lr, lsr #2
; CHECK-NEXT: dls lr, r4
; CHECK-NEXT: .LBB2_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.32 r12
@ -219,12 +201,11 @@ define dso_local i32 @and_mul_reduce_add(i32* noalias nocapture readonly %a, i32
; CHECK-NEXT: vldrwt.u32 q2, [r0], #16
; CHECK-NEXT: vstr p0, [sp] @ 4-byte Spill
; CHECK-NEXT: vsub.i32 q1, q2, q1
; CHECK-NEXT: adds r4, #4
; CHECK-NEXT: sub.w r12, r12, #4
; CHECK-NEXT: vpsttt
; CHECK-NEXT: vcmpt.i32 eq, q1, zr
; CHECK-NEXT: vldrwt.u32 q1, [r3], #16
; CHECK-NEXT: vldrwt.u32 q2, [r2], #16
; CHECK-NEXT: sub.w r12, r12, #4
; CHECK-NEXT: vmul.i32 q1, q2, q1
; CHECK-NEXT: vadd.i32 q1, q1, q0
; CHECK-NEXT: le lr, .LBB2_2
@ -246,22 +227,13 @@ entry:
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %N, 3
%n.vec = and i32 %n.rnd.up, -4
%trip.count.minus.1 = add i32 %N, -1
%broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %add, %vector.body ]
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
%tmp = getelementptr inbounds i32, i32* %a, i32 %index
; %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat12
%tmp1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
%tmp2 = bitcast i32* %tmp to <4 x i32>*
%wide.masked.load.a = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
%tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
@ -301,13 +273,13 @@ define dso_local i32 @or_mul_reduce_add(i32* noalias nocapture readonly %a, i32*
; CHECK-NEXT: cmp.w r12, #0
; CHECK-NEXT: beq .LBB3_4
; CHECK-NEXT: @ %bb.1: @ %vector.ph
; CHECK-NEXT: add.w r4, r12, #3
; CHECK-NEXT: vmov.i32 q1, #0x0
; CHECK-NEXT: bic r4, r4, #3
; CHECK-NEXT: sub.w lr, r4, #4
; CHECK-NEXT: add.w lr, r12, #3
; CHECK-NEXT: movs r4, #1
; CHECK-NEXT: add.w lr, r4, lr, lsr #2
; CHECK-NEXT: movs r4, #0
; CHECK-NEXT: bic lr, lr, #3
; CHECK-NEXT: vmov.i32 q1, #0x0
; CHECK-NEXT: sub.w lr, lr, #4
; CHECK-NEXT: add.w r4, r4, lr, lsr #2
; CHECK-NEXT: dls lr, r4
; CHECK-NEXT: .LBB3_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.32 r12
@ -322,9 +294,8 @@ define dso_local i32 @or_mul_reduce_add(i32* noalias nocapture readonly %a, i32*
; CHECK-NEXT: vcmpt.i32 ne, q1, zr
; CHECK-NEXT: vldrwe.u32 q1, [r3], #16
; CHECK-NEXT: vldrwe.u32 q2, [r2], #16
; CHECK-NEXT: adds r4, #4
; CHECK-NEXT: vmul.i32 q1, q2, q1
; CHECK-NEXT: sub.w r12, r12, #4
; CHECK-NEXT: vmul.i32 q1, q2, q1
; CHECK-NEXT: vadd.i32 q1, q1, q0
; CHECK-NEXT: le lr, .LBB3_2
; CHECK-NEXT: @ %bb.3: @ %middle.block
@ -344,22 +315,13 @@ entry:
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %N, 3
%n.vec = and i32 %n.rnd.up, -4
%trip.count.minus.1 = add i32 %N, -1
%broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %add, %vector.body ]
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
%tmp = getelementptr inbounds i32, i32* %a, i32 %index
; %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat12
%tmp1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
%tmp2 = bitcast i32* %tmp to <4 x i32>*
%wide.masked.load.a = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
%tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
@ -398,11 +360,9 @@ define dso_local void @continue_on_zero(i32* noalias nocapture %arg, i32* noalia
; CHECK-NEXT: it eq
; CHECK-NEXT: popeq {r7, pc}
; CHECK-NEXT: .LBB4_1: @ %bb3
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: dlstp.32 lr, r2
; CHECK-NEXT: .LBB4_2: @ %bb9
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: adds r3, #4
; CHECK-NEXT: vldrw.u32 q0, [r1], #16
; CHECK-NEXT: vpt.i32 ne, q0, zr
; CHECK-NEXT: vldrwt.u32 q1, [r0]
@ -419,21 +379,12 @@ bb:
bb3: ; preds = %bb
%tmp4 = add i32 %arg2, 3
%tmp5 = and i32 %tmp4, -4
%tmp6 = add i32 %arg2, -1
%tmp7 = insertelement <4 x i32> undef, i32 %tmp6, i32 0
%tmp8 = shufflevector <4 x i32> %tmp7, <4 x i32> undef, <4 x i32> zeroinitializer
br label %bb9
bb9: ; preds = %bb9, %bb3
%tmp10 = phi i32 [ 0, %bb3 ], [ %tmp25, %bb9 ]
%tmp11 = insertelement <4 x i32> undef, i32 %tmp10, i32 0
%tmp12 = shufflevector <4 x i32> %tmp11, <4 x i32> undef, <4 x i32> zeroinitializer
%tmp13 = add <4 x i32> %tmp12, <i32 0, i32 1, i32 2, i32 3>
%tmp14 = getelementptr inbounds i32, i32* %arg1, i32 %tmp10
; %tmp15 = icmp ule <4 x i32> %tmp13, %tmp8
%tmp15 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %tmp10, i32 %arg2)
%tmp16 = bitcast i32* %tmp14 to <4 x i32>*
%tmp17 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp16, i32 4, <4 x i1> %tmp15, <4 x i32> undef)
%tmp18 = icmp ne <4 x i32> %tmp17, zeroinitializer
@ -460,7 +411,6 @@ define dso_local arm_aapcs_vfpcc void @range_test(i32* noalias nocapture %arg, i
; CHECK-NEXT: it eq
; CHECK-NEXT: popeq {r7, pc}
; CHECK-NEXT: .LBB5_1: @ %bb4
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: dlstp.32 lr, r3
; CHECK-NEXT: .LBB5_2: @ %bb12
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
@ -468,7 +418,6 @@ define dso_local arm_aapcs_vfpcc void @range_test(i32* noalias nocapture %arg, i
; CHECK-NEXT: vptt.i32 ne, q0, zr
; CHECK-NEXT: vcmpt.s32 le, q0, r2
; CHECK-NEXT: vldrwt.u32 q1, [r1], #16
; CHECK-NEXT: add.w r12, r12, #4
; CHECK-NEXT: vmul.i32 q0, q1, q0
; CHECK-NEXT: vpst
; CHECK-NEXT: vstrwt.32 q0, [r0], #16
@ -482,23 +431,14 @@ bb:
bb4: ; preds = %bb
%tmp5 = add i32 %arg3, 3
%tmp6 = and i32 %tmp5, -4
%tmp7 = add i32 %arg3, -1
%tmp8 = insertelement <4 x i32> undef, i32 %tmp7, i32 0
%tmp9 = shufflevector <4 x i32> %tmp8, <4 x i32> undef, <4 x i32> zeroinitializer
%tmp10 = insertelement <4 x i32> undef, i32 %arg2, i32 0
%tmp11 = shufflevector <4 x i32> %tmp10, <4 x i32> undef, <4 x i32> zeroinitializer
br label %bb12
bb12: ; preds = %bb12, %bb4
%tmp13 = phi i32 [ 0, %bb4 ], [ %tmp30, %bb12 ]
%tmp14 = insertelement <4 x i32> undef, i32 %tmp13, i32 0
%tmp15 = shufflevector <4 x i32> %tmp14, <4 x i32> undef, <4 x i32> zeroinitializer
%tmp16 = add <4 x i32> %tmp15, <i32 0, i32 1, i32 2, i32 3>
%tmp17 = getelementptr inbounds i32, i32* %arg, i32 %tmp13
; %tmp18 = icmp ule <4 x i32> %tmp16, %tmp9
%tmp18= call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %tmp13, i32 %arg3)
%tmp19 = bitcast i32* %tmp17 to <4 x i32>*
%tmp20 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp19, i32 4, <4 x i1> %tmp18, <4 x i32> undef)
%tmp21 = icmp ne <4 x i32> %tmp20, zeroinitializer

View File

@ -9,11 +9,9 @@ define dso_local arm_aapcs_vfpcc void @sext_i8(i16* noalias nocapture %a, i8* no
; CHECK-NEXT: it eq
; CHECK-NEXT: popeq {r7, pc}
; CHECK-NEXT: .LBB0_1: @ %vector.ph
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: dlstp.16 lr, r2
; CHECK-NEXT: .LBB0_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: adds r3, #8
; CHECK-NEXT: vldrb.s16 q0, [r1], #8
; CHECK-NEXT: vldrh.u16 q1, [r0]
; CHECK-NEXT: vadd.i16 q0, q1, q0
@ -28,21 +26,12 @@ entry:
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %N, 7
%n.vec = and i32 %n.rnd.up, -8
%trip.count.minus.1 = add i32 %N, -1
%broadcast.splatinsert10 = insertelement <8 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat11 = shufflevector <8 x i32> %broadcast.splatinsert10, <8 x i32> undef, <8 x i32> zeroinitializer
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
%induction = or <8 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%0 = getelementptr inbounds i8, i8* %b, i32 %index
; %1 = icmp ule <8 x i32> %induction, %broadcast.splat11
%1 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N)
%2 = bitcast i8* %0 to <8 x i8>*
%wide.masked.load = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %2, i32 1, <8 x i1> %1, <8 x i8> undef)
%3 = sext <8 x i8> %wide.masked.load to <8 x i16>
@ -69,11 +58,9 @@ define dso_local arm_aapcs_vfpcc void @zext_i8(i16* noalias nocapture %a, i8* no
; CHECK-NEXT: it eq
; CHECK-NEXT: popeq {r7, pc}
; CHECK-NEXT: .LBB1_1: @ %vector.ph
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: dlstp.16 lr, r2
; CHECK-NEXT: .LBB1_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: adds r3, #8
; CHECK-NEXT: vldrb.u16 q0, [r1], #8
; CHECK-NEXT: vldrh.u16 q1, [r0]
; CHECK-NEXT: vadd.i16 q0, q1, q0
@ -88,21 +75,12 @@ entry:
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %N, 7
%n.vec = and i32 %n.rnd.up, -8
%trip.count.minus.1 = add i32 %N, -1
%broadcast.splatinsert10 = insertelement <8 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat11 = shufflevector <8 x i32> %broadcast.splatinsert10, <8 x i32> undef, <8 x i32> zeroinitializer
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
%induction = or <8 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%0 = getelementptr inbounds i8, i8* %b, i32 %index
; %1 = icmp ule <8 x i32> %induction, %broadcast.splat11
%1 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N)
%2 = bitcast i8* %0 to <8 x i8>*
%wide.masked.load = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %2, i32 1, <8 x i1> %1, <8 x i8> undef)
%3 = zext <8 x i8> %wide.masked.load to <8 x i16>
@ -129,11 +107,9 @@ define dso_local arm_aapcs_vfpcc void @sext_i16(i32* noalias nocapture %a, i16*
; CHECK-NEXT: it eq
; CHECK-NEXT: popeq {r7, pc}
; CHECK-NEXT: .LBB2_1: @ %vector.ph
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: dlstp.32 lr, r2
; CHECK-NEXT: .LBB2_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: adds r3, #4
; CHECK-NEXT: vldrh.s32 q0, [r1], #8
; CHECK-NEXT: vldrw.u32 q1, [r0]
; CHECK-NEXT: vadd.i32 q0, q1, q0
@ -148,21 +124,12 @@ entry:
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %N, 3
%n.vec = and i32 %n.rnd.up, -4
%trip.count.minus.1 = add i32 %N, -1
%broadcast.splatinsert8 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat9 = shufflevector <4 x i32> %broadcast.splatinsert8, <4 x i32> undef, <4 x i32> zeroinitializer
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
%0 = getelementptr inbounds i16, i16* %b, i32 %index
; %1 = icmp ule <4 x i32> %induction, %broadcast.splat9
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
%2 = bitcast i16* %0 to <4 x i16>*
%wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %2, i32 2, <4 x i1> %1, <4 x i16> undef)
%3 = sext <4 x i16> %wide.masked.load to <4 x i32>
@ -189,11 +156,9 @@ define dso_local arm_aapcs_vfpcc void @zext_i16(i32* noalias nocapture %a, i16*
; CHECK-NEXT: it eq
; CHECK-NEXT: popeq {r7, pc}
; CHECK-NEXT: .LBB3_1: @ %vector.ph
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: dlstp.32 lr, r2
; CHECK-NEXT: .LBB3_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: adds r3, #4
; CHECK-NEXT: vldrh.u32 q0, [r1], #8
; CHECK-NEXT: vldrw.u32 q1, [r0]
; CHECK-NEXT: vadd.i32 q0, q1, q0
@ -208,21 +173,12 @@ entry:
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %N, 3
%n.vec = and i32 %n.rnd.up, -4
%trip.count.minus.1 = add i32 %N, -1
%broadcast.splatinsert8 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat9 = shufflevector <4 x i32> %broadcast.splatinsert8, <4 x i32> undef, <4 x i32> zeroinitializer
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
%0 = getelementptr inbounds i16, i16* %b, i32 %index
; %1 = icmp ule <4 x i32> %induction, %broadcast.splat9
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
%2 = bitcast i16* %0 to <4 x i16>*
%wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %2, i32 2, <4 x i1> %1, <4 x i16> undef)
%3 = zext <4 x i16> %wide.masked.load to <4 x i32>

View File

@ -6,7 +6,7 @@ define arm_aapcs_vfpcc void @fast_float_mul(float* nocapture %a, float* nocaptur
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: push {r4, r5, r6, r7, lr}
; CHECK-NEXT: cmp r3, #0
; CHECK-NEXT: beq.w .LBB0_11
; CHECK-NEXT: beq .LBB0_11
; CHECK-NEXT: @ %bb.1: @ %vector.memcheck
; CHECK-NEXT: add.w r5, r0, r3, lsl #2
; CHECK-NEXT: add.w r4, r2, r3, lsl #2
@ -32,11 +32,9 @@ define arm_aapcs_vfpcc void @fast_float_mul(float* nocapture %a, float* nocaptur
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: b .LBB0_8
; CHECK-NEXT: .LBB0_4: @ %vector.ph
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: dlstp.32 lr, r3
; CHECK-NEXT: .LBB0_5: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: add.w r12, r12, #4
; CHECK-NEXT: vldrw.u32 q0, [r1], #16
; CHECK-NEXT: vldrw.u32 q1, [r2], #16
; CHECK-NEXT: vmul.f32 q0, q1, q0
@ -117,21 +115,12 @@ for.body.preheader.new: ; preds = %for.body.preheader
vector.ph: ; preds = %vector.memcheck
%n.rnd.up = add i32 %N, 3
%n.vec = and i32 %n.rnd.up, -4
%trip.count.minus.1 = add i32 %N, -1
%broadcast.splatinsert21 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat22 = shufflevector <4 x i32> %broadcast.splatinsert21, <4 x i32> undef, <4 x i32> zeroinitializer
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
%2 = getelementptr inbounds float, float* %b, i32 %index
; %3 = icmp ule <4 x i32> %induction, %broadcast.splat22
%3 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
%4 = bitcast float* %2 to <4 x float>*
%wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %3, <4 x float> undef)
%5 = getelementptr inbounds float, float* %c, i32 %index
@ -219,13 +208,11 @@ define arm_aapcs_vfpcc float @fast_float_mac(float* nocapture readonly %b, float
; CHECK-NEXT: bic r3, r3, #3
; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: subs r3, #4
; CHECK-NEXT: add.w r12, r12, r3, lsr #2
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: dls lr, r12
; CHECK-NEXT: add.w r3, r12, r3, lsr #2
; CHECK-NEXT: dls lr, r3
; CHECK-NEXT: .LBB1_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.32 r2
; CHECK-NEXT: adds r3, #4
; CHECK-NEXT: subs r2, #4
; CHECK-NEXT: vmov q1, q0
; CHECK-NEXT: vpstt
@ -257,22 +244,13 @@ entry:
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %N, 3
%n.vec = and i32 %n.rnd.up, -4
%trip.count.minus.1 = add i32 %N, -1
%broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.phi = phi <4 x float> [ zeroinitializer, %vector.ph ], [ %6, %vector.body ]
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
%0 = getelementptr inbounds float, float* %b, i32 %index
; %1 = icmp ule <4 x i32> %induction, %broadcast.splat12
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
%2 = bitcast float* %0 to <4 x float>*
%wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
%3 = getelementptr inbounds float, float* %c, i32 %index

View File

@ -15,13 +15,11 @@ define arm_aapcs_vfpcc i32 @test_acc_scalar_char(i8 zeroext %a, i8* nocapture re
; CHECK-NEXT: bic r3, r3, #3
; CHECK-NEXT: sub.w r12, r3, #4
; CHECK-NEXT: movs r3, #1
; CHECK-NEXT: add.w r12, r3, r12, lsr #2
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: dls lr, r12
; CHECK-NEXT: add.w r3, r3, r12, lsr #2
; CHECK-NEXT: dls lr, r3
; CHECK-NEXT: .LBB0_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.32 r2
; CHECK-NEXT: adds r3, #4
; CHECK-NEXT: subs r2, #4
; CHECK-NEXT: vmov q1, q0
; CHECK-NEXT: vpst
@ -40,9 +38,6 @@ vector.ph: ; preds = %entry
%conv = zext i8 %a to i32
%n.rnd.up = add i32 %N, 3
%n.vec = and i32 %n.rnd.up, -4
%trip.count.minus.1 = add i32 %N, -1
%broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
%broadcast.splatinsert12 = insertelement <4 x i32> undef, i32 %conv, i32 0
%broadcast.splat13 = shufflevector <4 x i32> %broadcast.splatinsert12, <4 x i32> undef, <4 x i32> zeroinitializer
br label %vector.body
@ -50,14 +45,8 @@ vector.ph: ; preds = %entry
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %5, %vector.body ]
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
%0 = getelementptr inbounds i8, i8* %b, i32 %index
; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
%2 = bitcast i8* %0 to <4 x i8>*
%wide.masked.load = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %2, i32 1, <4 x i1> %1, <4 x i8> undef)
%3 = zext <4 x i8> %wide.masked.load to <4 x i32>
@ -91,13 +80,11 @@ define arm_aapcs_vfpcc i32 @test_acc_scalar_short(i16 signext %a, i16* nocapture
; CHECK-NEXT: bic r3, r3, #3
; CHECK-NEXT: sub.w r12, r3, #4
; CHECK-NEXT: movs r3, #1
; CHECK-NEXT: add.w r12, r3, r12, lsr #2
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: dls lr, r12
; CHECK-NEXT: add.w r3, r3, r12, lsr #2
; CHECK-NEXT: dls lr, r3
; CHECK-NEXT: .LBB1_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.32 r2
; CHECK-NEXT: adds r3, #4
; CHECK-NEXT: subs r2, #4
; CHECK-NEXT: vmov q1, q0
; CHECK-NEXT: vpst
@ -116,9 +103,6 @@ vector.ph: ; preds = %entry
%conv = sext i16 %a to i32
%n.rnd.up = add i32 %N, 3
%n.vec = and i32 %n.rnd.up, -4
%trip.count.minus.1 = add i32 %N, -1
%broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
%broadcast.splatinsert12 = insertelement <4 x i32> undef, i32 %conv, i32 0
%broadcast.splat13 = shufflevector <4 x i32> %broadcast.splatinsert12, <4 x i32> undef, <4 x i32> zeroinitializer
br label %vector.body
@ -126,14 +110,8 @@ vector.ph: ; preds = %entry
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %5, %vector.body ]
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
%0 = getelementptr inbounds i16, i16* %b, i32 %index
; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
%2 = bitcast i16* %0 to <4 x i16>*
%wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %2, i32 2, <4 x i1> %1, <4 x i16> undef)
%3 = sext <4 x i16> %wide.masked.load to <4 x i32>
@ -167,13 +145,11 @@ define arm_aapcs_vfpcc i32 @test_acc_scalar_uchar(i8 zeroext %a, i8* nocapture r
; CHECK-NEXT: bic r3, r3, #3
; CHECK-NEXT: sub.w r12, r3, #4
; CHECK-NEXT: movs r3, #1
; CHECK-NEXT: add.w r12, r3, r12, lsr #2
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: dls lr, r12
; CHECK-NEXT: add.w r3, r3, r12, lsr #2
; CHECK-NEXT: dls lr, r3
; CHECK-NEXT: .LBB2_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.32 r2
; CHECK-NEXT: adds r3, #4
; CHECK-NEXT: subs r2, #4
; CHECK-NEXT: vmov q1, q0
; CHECK-NEXT: vpst
@ -192,9 +168,6 @@ vector.ph: ; preds = %entry
%conv = zext i8 %a to i32
%n.rnd.up = add i32 %N, 3
%n.vec = and i32 %n.rnd.up, -4
%trip.count.minus.1 = add i32 %N, -1
%broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
%broadcast.splatinsert12 = insertelement <4 x i32> undef, i32 %conv, i32 0
%broadcast.splat13 = shufflevector <4 x i32> %broadcast.splatinsert12, <4 x i32> undef, <4 x i32> zeroinitializer
br label %vector.body
@ -202,14 +175,8 @@ vector.ph: ; preds = %entry
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %5, %vector.body ]
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
%0 = getelementptr inbounds i8, i8* %b, i32 %index
; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
%2 = bitcast i8* %0 to <4 x i8>*
%wide.masked.load = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %2, i32 1, <4 x i1> %1, <4 x i8> undef)
%3 = zext <4 x i8> %wide.masked.load to <4 x i32>
@ -243,13 +210,11 @@ define arm_aapcs_vfpcc i32 @test_acc_scalar_ushort(i16 signext %a, i16* nocaptur
; CHECK-NEXT: bic r3, r3, #3
; CHECK-NEXT: sub.w r12, r3, #4
; CHECK-NEXT: movs r3, #1
; CHECK-NEXT: add.w r12, r3, r12, lsr #2
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: dls lr, r12
; CHECK-NEXT: add.w r3, r3, r12, lsr #2
; CHECK-NEXT: dls lr, r3
; CHECK-NEXT: .LBB3_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.32 r2
; CHECK-NEXT: adds r3, #4
; CHECK-NEXT: subs r2, #4
; CHECK-NEXT: vmov q1, q0
; CHECK-NEXT: vpst
@ -268,9 +233,6 @@ vector.ph: ; preds = %entry
%conv = sext i16 %a to i32
%n.rnd.up = add i32 %N, 3
%n.vec = and i32 %n.rnd.up, -4
%trip.count.minus.1 = add i32 %N, -1
%broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
%broadcast.splatinsert12 = insertelement <4 x i32> undef, i32 %conv, i32 0
%broadcast.splat13 = shufflevector <4 x i32> %broadcast.splatinsert12, <4 x i32> undef, <4 x i32> zeroinitializer
br label %vector.body
@ -278,14 +240,8 @@ vector.ph: ; preds = %entry
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %5, %vector.body ]
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
%0 = getelementptr inbounds i16, i16* %b, i32 %index
; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
%2 = bitcast i16* %0 to <4 x i16>*
%wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %2, i32 2, <4 x i1> %1, <4 x i16> undef)
%3 = zext <4 x i16> %wide.masked.load to <4 x i32>
@ -319,13 +275,11 @@ define arm_aapcs_vfpcc i32 @test_acc_scalar_int(i32 %a, i32* nocapture readonly
; CHECK-NEXT: bic r3, r3, #3
; CHECK-NEXT: sub.w r12, r3, #4
; CHECK-NEXT: movs r3, #1
; CHECK-NEXT: add.w r12, r3, r12, lsr #2
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: dls lr, r12
; CHECK-NEXT: add.w r3, r3, r12, lsr #2
; CHECK-NEXT: dls lr, r3
; CHECK-NEXT: .LBB4_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.32 r2
; CHECK-NEXT: adds r3, #4
; CHECK-NEXT: subs r2, #4
; CHECK-NEXT: vmov q1, q0
; CHECK-NEXT: vpst
@ -343,9 +297,6 @@ entry:
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %N, 3
%n.vec = and i32 %n.rnd.up, -4
%trip.count.minus.1 = add i32 %N, -1
%broadcast.splatinsert9 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat10 = shufflevector <4 x i32> %broadcast.splatinsert9, <4 x i32> undef, <4 x i32> zeroinitializer
%broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %a, i32 0
%broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer
br label %vector.body
@ -353,14 +304,8 @@ vector.ph: ; preds = %entry
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %4, %vector.body ]
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
%0 = getelementptr inbounds i32, i32* %b, i32 %index
; %1 = icmp ule <4 x i32> %induction, %broadcast.splat10
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
%2 = bitcast i32* %0 to <4 x i32>*
%wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef)
%3 = mul nsw <4 x i32> %wide.masked.load, %broadcast.splat12
@ -411,11 +356,9 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_char(i8* nocapture readonly
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: b .LBB5_8
; CHECK-NEXT: .LBB5_4: @ %vector.ph
; CHECK-NEXT: movs r7, #0
; CHECK-NEXT: dlstp.32 lr, r12
; CHECK-NEXT: .LBB5_5: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: adds r7, #4
; CHECK-NEXT: vldrb.u32 q0, [r0], #4
; CHECK-NEXT: vldrb.u32 q1, [r1], #4
; CHECK-NEXT: vmlas.u32 q1, q0, r2
@ -498,23 +441,14 @@ for.body.preheader.new: ; preds = %for.body.preheader
vector.ph: ; preds = %for.body.lr.ph
%n.rnd.up = add i32 %N, 3
%n.vec = and i32 %n.rnd.up, -4
%trip.count.minus.1 = add i32 %N, -1
%broadcast.splatinsert19 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat20 = shufflevector <4 x i32> %broadcast.splatinsert19, <4 x i32> undef, <4 x i32> zeroinitializer
%broadcast.splatinsert22 = insertelement <4 x i32> undef, i32 %conv3, i32 0
%broadcast.splat23 = shufflevector <4 x i32> %broadcast.splatinsert22, <4 x i32> undef, <4 x i32> zeroinitializer
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
%2 = getelementptr inbounds i8, i8* %a, i32 %index
; %3 = icmp ule <4 x i32> %induction, %broadcast.splat20
%3 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
%4 = bitcast i8* %2 to <4 x i8>*
%wide.masked.load = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %4, i32 1, <4 x i1> %3, <4 x i8> undef)
%5 = zext <4 x i8> %wide.masked.load to <4 x i32>
@ -618,11 +552,9 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_short(i16* nocapture readon
; CHECK-NEXT: it eq
; CHECK-NEXT: popeq {r4, pc}
; CHECK-NEXT: .LBB6_1: @ %vector.ph
; CHECK-NEXT: movs r4, #0
; CHECK-NEXT: dlstp.32 lr, r12
; CHECK-NEXT: .LBB6_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: adds r4, #4
; CHECK-NEXT: vldrh.s32 q0, [r0], #8
; CHECK-NEXT: vldrh.s32 q1, [r1], #8
; CHECK-NEXT: vmlas.u32 q1, q0, r2
@ -638,23 +570,14 @@ vector.ph: ; preds = %entry
%conv3 = sext i16 %c to i32
%n.rnd.up = add i32 %N, 3
%n.vec = and i32 %n.rnd.up, -4
%trip.count.minus.1 = add i32 %N, -1
%broadcast.splatinsert12 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat13 = shufflevector <4 x i32> %broadcast.splatinsert12, <4 x i32> undef, <4 x i32> zeroinitializer
%broadcast.splatinsert15 = insertelement <4 x i32> undef, i32 %conv3, i32 0
%broadcast.splat16 = shufflevector <4 x i32> %broadcast.splatinsert15, <4 x i32> undef, <4 x i32> zeroinitializer
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
%0 = getelementptr inbounds i16, i16* %a, i32 %index
; %1 = icmp ule <4 x i32> %induction, %broadcast.splat13
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
%2 = bitcast i16* %0 to <4 x i16>*
%wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %2, i32 2, <4 x i1> %1, <4 x i16> undef)
%3 = sext <4 x i16> %wide.masked.load to <4 x i32>
@ -707,11 +630,9 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_uchar(i8* nocapture readonl
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: b .LBB7_8
; CHECK-NEXT: .LBB7_4: @ %vector.ph
; CHECK-NEXT: movs r7, #0
; CHECK-NEXT: dlstp.32 lr, r12
; CHECK-NEXT: .LBB7_5: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: adds r7, #4
; CHECK-NEXT: vldrb.u32 q0, [r0], #4
; CHECK-NEXT: vldrb.u32 q1, [r1], #4
; CHECK-NEXT: vmlas.u32 q1, q0, r2
@ -794,23 +715,14 @@ for.body.preheader.new: ; preds = %for.body.preheader
vector.ph: ; preds = %for.body.lr.ph
%n.rnd.up = add i32 %N, 3
%n.vec = and i32 %n.rnd.up, -4
%trip.count.minus.1 = add i32 %N, -1
%broadcast.splatinsert19 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat20 = shufflevector <4 x i32> %broadcast.splatinsert19, <4 x i32> undef, <4 x i32> zeroinitializer
%broadcast.splatinsert22 = insertelement <4 x i32> undef, i32 %conv3, i32 0
%broadcast.splat23 = shufflevector <4 x i32> %broadcast.splatinsert22, <4 x i32> undef, <4 x i32> zeroinitializer
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
%2 = getelementptr inbounds i8, i8* %a, i32 %index
; %3 = icmp ule <4 x i32> %induction, %broadcast.splat20
%3 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
%4 = bitcast i8* %2 to <4 x i8>*
%wide.masked.load = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %4, i32 1, <4 x i1> %3, <4 x i8> undef)
%5 = zext <4 x i8> %wide.masked.load to <4 x i32>
@ -914,11 +826,9 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_ushort(i16* nocapture reado
; CHECK-NEXT: it eq
; CHECK-NEXT: popeq {r4, pc}
; CHECK-NEXT: .LBB8_1: @ %vector.ph
; CHECK-NEXT: movs r4, #0
; CHECK-NEXT: dlstp.32 lr, r12
; CHECK-NEXT: .LBB8_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: adds r4, #4
; CHECK-NEXT: vldrh.u32 q0, [r0], #8
; CHECK-NEXT: vldrh.u32 q1, [r1], #8
; CHECK-NEXT: vmlas.u32 q1, q0, r2
@ -934,23 +844,14 @@ vector.ph: ; preds = %entry
%conv3 = sext i16 %c to i32
%n.rnd.up = add i32 %N, 3
%n.vec = and i32 %n.rnd.up, -4
%trip.count.minus.1 = add i32 %N, -1
%broadcast.splatinsert12 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat13 = shufflevector <4 x i32> %broadcast.splatinsert12, <4 x i32> undef, <4 x i32> zeroinitializer
%broadcast.splatinsert15 = insertelement <4 x i32> undef, i32 %conv3, i32 0
%broadcast.splat16 = shufflevector <4 x i32> %broadcast.splatinsert15, <4 x i32> undef, <4 x i32> zeroinitializer
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
%0 = getelementptr inbounds i16, i16* %a, i32 %index
; %1 = icmp ule <4 x i32> %induction, %broadcast.splat13
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
%2 = bitcast i16* %0 to <4 x i16>*
%wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %2, i32 2, <4 x i1> %1, <4 x i16> undef)
%3 = zext <4 x i16> %wide.masked.load to <4 x i32>
@ -1003,11 +904,9 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_int(i32* nocapture readonly
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: b .LBB9_8
; CHECK-NEXT: .LBB9_4: @ %vector.ph
; CHECK-NEXT: movs r7, #0
; CHECK-NEXT: dlstp.32 lr, r12
; CHECK-NEXT: .LBB9_5: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: adds r7, #4
; CHECK-NEXT: vldrw.u32 q0, [r0], #16
; CHECK-NEXT: vldrw.u32 q1, [r1], #16
; CHECK-NEXT: vmlas.u32 q1, q0, r2
@ -1089,23 +988,14 @@ for.body.preheader.new: ; preds = %for.body.preheader
vector.ph: ; preds = %vector.memcheck
%n.rnd.up = add i32 %N, 3
%n.vec = and i32 %n.rnd.up, -4
%trip.count.minus.1 = add i32 %N, -1
%broadcast.splatinsert21 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat22 = shufflevector <4 x i32> %broadcast.splatinsert21, <4 x i32> undef, <4 x i32> zeroinitializer
%broadcast.splatinsert24 = insertelement <4 x i32> undef, i32 %c, i32 0
%broadcast.splat25 = shufflevector <4 x i32> %broadcast.splatinsert24, <4 x i32> undef, <4 x i32> zeroinitializer
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
%2 = getelementptr inbounds i32, i32* %a, i32 %index
; %3 = icmp ule <4 x i32> %induction, %broadcast.splat22
%3 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
%4 = bitcast i32* %2 to <4 x i32>*
%wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %4, i32 4, <4 x i1> %3, <4 x i32> undef)
%5 = getelementptr inbounds i32, i32* %b, i32 %index
@ -1196,11 +1086,9 @@ define dso_local arm_aapcs_vfpcc void @test_v8i8_to_v8i16(i16* noalias nocapture
; CHECK-NEXT: it eq
; CHECK-NEXT: popeq {r7, pc}
; CHECK-NEXT: .LBB10_1: @ %vector.ph
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: dlstp.16 lr, r3
; CHECK-NEXT: .LBB10_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: add.w r12, r12, #8
; CHECK-NEXT: vldrb.u16 q0, [r1], #8
; CHECK-NEXT: vldrb.u16 q1, [r2], #8
; CHECK-NEXT: vmul.i16 q0, q1, q0
@ -1215,21 +1103,12 @@ entry:
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %N, 7
%n.vec = and i32 %n.rnd.up, -8
%trip.count.minus.1 = add i32 %N, -1
%broadcast.splatinsert12 = insertelement <8 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat13 = shufflevector <8 x i32> %broadcast.splatinsert12, <8 x i32> undef, <8 x i32> zeroinitializer
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
%induction = add <8 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%0 = getelementptr inbounds i8, i8* %b, i32 %index
; %1 = icmp ule <8 x i32> %induction, %broadcast.splat13
%1 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N)
%2 = bitcast i8* %0 to <8 x i8>*
%wide.masked.load = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %2, i32 1, <8 x i1> %1, <8 x i8> undef)
%3 = zext <8 x i8> %wide.masked.load to <8 x i16>

View File

@ -9,9 +9,6 @@ define void @mat_vec_sext_i16(i16** nocapture readonly %A, i16* nocapture readon
; CHECK: for.cond1.preheader.us.preheader:
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 3
; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -4
; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = add i32 [[N]], -1
; CHECK-NEXT: [[BROADCAST_SPLATINSERT28:%.*]] = insertelement <4 x i32> undef, i32 [[TRIP_COUNT_MINUS_1]], i32 0
; CHECK-NEXT: [[BROADCAST_SPLAT29:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT28]], <4 x i32> undef, <4 x i32> zeroinitializer
; CHECK-NEXT: [[TT:%.*]] = add i32 [[N_VEC]], -4
; CHECK-NEXT: [[TT1:%.*]] = lshr i32 [[TT]], 2
; CHECK-NEXT: [[TT2:%.*]] = add nuw nsw i32 [[TT1]], 1
@ -30,9 +27,6 @@ define void @mat_vec_sext_i16(i16** nocapture readonly %A, i16* nocapture readon
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[TT4]], [[FOR_COND1_PREHEADER_US]] ], [ [[TT14:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TT5:%.*]] = phi i32 [ [[START]], [[FOR_COND1_PREHEADER_US]] ], [ [[TT15:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[N]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT: [[TT6:%.*]] = getelementptr inbounds i16, i16* [[TT3]], i32 [[INDEX]]
; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP0]])
; CHECK-NEXT: [[TMP2]] = sub i32 [[TMP0]], 4
@ -66,9 +60,6 @@ entry:
for.cond1.preheader.us.preheader: ; preds = %entry
%n.rnd.up = add i32 %N, 3
%n.vec = and i32 %n.rnd.up, -4
%trip.count.minus.1 = add i32 %N, -1
%broadcast.splatinsert28 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat29 = shufflevector <4 x i32> %broadcast.splatinsert28, <4 x i32> undef, <4 x i32> zeroinitializer
%tt = add i32 %n.vec, -4
%tt1 = lshr i32 %tt, 2
%tt2 = add nuw nsw i32 %tt1, 1
@ -88,14 +79,8 @@ vector.body: ; preds = %vector.body, %for.c
%index = phi i32 [ 0, %for.cond1.preheader.us ], [ %index.next, %vector.body ]
%vec.phi = phi <4 x i32> [ %tt4, %for.cond1.preheader.us ], [ %tt14, %vector.body ]
%tt5 = phi i32 [ %start, %for.cond1.preheader.us ], [ %tt15, %vector.body ]
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
%tt6 = getelementptr inbounds i16, i16* %tt3, i32 %index
; %tt7 = icmp ule <4 x i32> %induction, %broadcast.splat29
%tt7 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
%tt8 = bitcast i16* %tt6 to <4 x i16>*
%wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %tt8, i32 2, <4 x i1> %tt7, <4 x i16> undef)
%tt9 = sext <4 x i16> %wide.masked.load to <4 x i32>
@ -130,9 +115,6 @@ define void @mat_vec_i32(i32** nocapture readonly %A, i32* nocapture readonly %B
; CHECK: for.cond1.preheader.us.preheader:
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 3
; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -4
; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = add i32 [[N]], -1
; CHECK-NEXT: [[BROADCAST_SPLATINSERT27:%.*]] = insertelement <4 x i32> undef, i32 [[TRIP_COUNT_MINUS_1]], i32 0
; CHECK-NEXT: [[BROADCAST_SPLAT28:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT27]], <4 x i32> undef, <4 x i32> zeroinitializer
; CHECK-NEXT: [[TT:%.*]] = add i32 [[N_VEC]], -4
; CHECK-NEXT: [[TT1:%.*]] = lshr i32 [[TT]], 2
; CHECK-NEXT: [[TT2:%.*]] = add nuw nsw i32 [[TT1]], 1
@ -151,9 +133,6 @@ define void @mat_vec_i32(i32** nocapture readonly %A, i32* nocapture readonly %B
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[TT4]], [[FOR_COND1_PREHEADER_US]] ], [ [[TT12:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TT5:%.*]] = phi i32 [ [[START]], [[FOR_COND1_PREHEADER_US]] ], [ [[TT13:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[N]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT: [[TT6:%.*]] = getelementptr inbounds i32, i32* [[TT3]], i32 [[INDEX]]
; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP0]])
; CHECK-NEXT: [[TMP2]] = sub i32 [[TMP0]], 4
@ -185,9 +164,6 @@ entry:
for.cond1.preheader.us.preheader: ; preds = %entry
%n.rnd.up = add i32 %N, 3
%n.vec = and i32 %n.rnd.up, -4
%trip.count.minus.1 = add i32 %N, -1
%broadcast.splatinsert27 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat28 = shufflevector <4 x i32> %broadcast.splatinsert27, <4 x i32> undef, <4 x i32> zeroinitializer
%tt = add i32 %n.vec, -4
%tt1 = lshr i32 %tt, 2
%tt2 = add nuw nsw i32 %tt1, 1
@ -207,14 +183,8 @@ vector.body: ; preds = %vector.body, %for.c
%index = phi i32 [ 0, %for.cond1.preheader.us ], [ %index.next, %vector.body ]
%vec.phi = phi <4 x i32> [ %tt4, %for.cond1.preheader.us ], [ %tt12, %vector.body ]
%tt5 = phi i32 [ %start, %for.cond1.preheader.us ], [ %tt13, %vector.body ]
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
%tt6 = getelementptr inbounds i32, i32* %tt3, i32 %index
; %tt7 = icmp ule <4 x i32> %induction, %broadcast.splat28
%tt7 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
%tt8 = bitcast i32* %tt6 to <4 x i32>*
%wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tt8, i32 4, <4 x i1> %tt7, <4 x i32> undef)
%tt9 = getelementptr inbounds i32, i32* %B, i32 %index

View File

@ -30,7 +30,6 @@ entry:
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %N, 15
%n.vec = and i32 %n.rnd.up, -16
%trip.count.minus.1 = add i32 %N, -1
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
@ -100,7 +99,6 @@ entry:
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %N, 7
%n.vec = and i32 %n.rnd.up, -8
%trip.count.minus.1 = add i32 %N, -1
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
@ -172,7 +170,6 @@ entry:
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %N, 15
%n.vec = and i32 %n.rnd.up, -16
%trip.count.minus.1 = add i32 %N, -1
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
@ -242,7 +239,6 @@ entry:
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %N, 7
%n.vec = and i32 %n.rnd.up, -8
%trip.count.minus.1 = add i32 %N, -1
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
@ -314,7 +310,6 @@ entry:
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %N, 15
%n.vec = and i32 %n.rnd.up, -16
%trip.count.minus.1 = add i32 %N, -1
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
@ -384,7 +379,6 @@ entry:
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %N, 7
%n.vec = and i32 %n.rnd.up, -8
%trip.count.minus.1 = add i32 %N, -1
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
@ -481,7 +475,6 @@ entry:
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %N, 3
%n.vec = and i32 %n.rnd.up, -4
%trip.count.minus.1 = add i32 %N, -1
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
@ -510,7 +503,6 @@ middle.block: ; preds = %vector.body
vector.ph47: ; preds = %middle.block
%n.rnd.up48 = add i32 %N, 3
%n.vec50 = and i32 %n.rnd.up48, -4
%trip.count.minus.154 = add i32 %N, -1
%i11 = insertelement <4 x i32> <i32 undef, i32 0, i32 0, i32 0>, i32 %i10, i32 0
br label %vector.body46
@ -594,7 +586,6 @@ entry:
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %N, 7
%n.vec = and i32 %n.rnd.up, -8
%trip.count.minus.1 = add i32 %N, -1
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
@ -719,7 +710,6 @@ lor.end: ; preds = %entry, %lor.rhs
vector.ph: ; preds = %lor.end
%n.rnd.up = add i32 %4, 3
%n.vec = and i32 %n.rnd.up, -4
%trip.count.minus.1 = add i32 %4, -1
%5 = insertelement <4 x i32> <i32 undef, i32 0, i32 0, i32 0>, i32 %0, i32 0
br label %vector.body

View File

@ -22,23 +22,14 @@ entry:
br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
vector.ph: ; preds = %entry
%trip.count.minus.1 = add i32 %N, -1
%broadcast.splatinsert10 = insertelement <16 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat11 = shufflevector <16 x i32> %broadcast.splatinsert10, <16 x i32> undef, <16 x i32> zeroinitializer
%start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
%broadcast.splatinsert = insertelement <16 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <16 x i32> %broadcast.splatinsert, <16 x i32> undef, <16 x i32> zeroinitializer
%induction = or <16 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%tmp = getelementptr inbounds i8, i8* %a, i32 %index
; %tmp1 = icmp ule <16 x i32> %induction, %broadcast.splat11
%active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %N)
%tmp2 = bitcast i8* %tmp to <16 x i8>*
%wide.masked.load = tail call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %tmp2, i32 4, <16 x i1> %active.lane.mask, <16 x i8> undef)
%tmp3 = getelementptr inbounds i8, i8* %b, i32 %index
@ -79,23 +70,14 @@ entry:
br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
vector.ph: ; preds = %entry
%trip.count.minus.1 = add i32 %N, -1
%broadcast.splatinsert10 = insertelement <8 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat11 = shufflevector <8 x i32> %broadcast.splatinsert10, <8 x i32> undef, <8 x i32> zeroinitializer
%start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
%broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
%induction = add <8 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%tmp = getelementptr inbounds i16, i16* %a, i32 %index
; %tmp1 = icmp ule <8 x i32> %induction, %broadcast.splat11
%active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N)
%tmp2 = bitcast i16* %tmp to <8 x i16>*
%wide.masked.load = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp2, i32 4, <8 x i1> %active.lane.mask, <8 x i16> undef)
%tmp3 = getelementptr inbounds i16, i16* %b, i32 %index
@ -135,20 +117,13 @@ entry:
br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
vector.ph: ; preds = %entry
%trip.count.minus.1 = add i32 %N, -1
%broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
%start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
%tmp = getelementptr inbounds i32, i32* %a, i32 %index
; %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
%tmp2 = bitcast i32* %tmp to <4 x i32>*
%active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
%wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
@ -190,20 +165,13 @@ entry:
br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
vector.ph: ; preds = %entry
%trip.count.minus.1 = add i32 %N, -1
%broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
%start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
%tmp = getelementptr inbounds i32, i32* %a, i32 %index
; %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
%active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
%tmp2 = bitcast i32* %tmp to <4 x i32>*
%wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
@ -262,10 +230,7 @@ vector.body: ; preds = %vector.body, %vecto
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
%tmp = getelementptr inbounds i32, i32* %a, i32 %index
; %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
%active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
%wrong = icmp ult <4 x i32> %induction, %broadcast.splat11
%tmp2 = bitcast i32* %tmp to <4 x i32>*
%wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
@ -321,10 +286,7 @@ vector.body: ; preds = %vector.body, %vecto
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
%tmp = getelementptr inbounds i32, i32* %a, i32 %index
; %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
%active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
%wrong = icmp ult <4 x i32> %induction, %broadcast.splat11
%tmp2 = bitcast i32* %tmp to <4 x i32>*
%wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
@ -370,7 +332,6 @@ entry:
vector.ph:
%trip.count.minus.1 = add i32 %N, -1
%scevgep = getelementptr i32, i32* %A, i32 8
%scevgep30 = getelementptr i32, i32* %C, i32 8
%scevgep37 = getelementptr i32, i32* %B, i32 8
@ -459,9 +420,7 @@ vector.body: ; preds = %vector.body, %vecto
%lsr.iv13 = bitcast i32* %lsr.iv to <4 x i32>*
%lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
%lsr.iv1719 = bitcast i32* %lsr.iv17 to <4 x i32>*
%active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 42)
%wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv13, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
%wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
%7 = add nsw <4 x i32> %wide.masked.load12, %wide.masked.load
@ -495,7 +454,6 @@ entry:
br i1 %cmp8, label %vector.ph, label %for.cond.cleanup
vector.ph: ; preds = %entry
%trip.count.minus.1 = add i32 %N, -1
%start = call i32 @llvm.start.loop.iterations.i32(i32 %5)
br label %vector.body
@ -509,9 +467,7 @@ vector.body: ; preds = %vector.body, %vecto
%lsr.iv13 = bitcast i32* %lsr.iv to <4 x i32>*
%lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
%lsr.iv1719 = bitcast i32* %lsr.iv17 to <4 x i32>*
%active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %index)
%wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv13, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
%wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
%7 = add nsw <4 x i32> %wide.masked.load12, %wide.masked.load
@ -546,7 +502,6 @@ entry:
br i1 %cmp8, label %vector.ph, label %for.cond.cleanup
vector.ph: ; preds = %entry
%trip.count.minus.1 = add i32 %N, -1
%start = call i32 @llvm.start.loop.iterations.i32(i32 %5)
br label %vector.body

View File

@ -10,22 +10,17 @@ define dso_local void @foo(i32* noalias nocapture %A, i32* noalias nocapture rea
; CHECK-NEXT: [[LSR_IV14:%.*]] = phi i32* [ [[SCEVGEP15:%.*]], [[VECTOR_BODY]] ], [ [[A:%.*]], [[ENTRY:%.*]] ]
; CHECK-NEXT: [[LSR_IV11:%.*]] = phi i32* [ [[SCEVGEP12:%.*]], [[VECTOR_BODY]] ], [ [[C:%.*]], [[ENTRY]] ]
; CHECK-NEXT: [[LSR_IV:%.*]] = phi i32* [ [[SCEVGEP:%.*]], [[VECTOR_BODY]] ], [ [[B:%.*]], [[ENTRY]] ]
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[START]], [[ENTRY]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ 32003, [[ENTRY]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[LSR_IV1416:%.*]] = bitcast i32* [[LSR_IV14]] to <4 x i32>*
; CHECK-NEXT: [[LSR_IV1113:%.*]] = bitcast i32* [[LSR_IV11]] to <4 x i32>*
; CHECK-NEXT: [[LSR_IV10:%.*]] = bitcast i32* [[LSR_IV]] to <4 x i32>*
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP1]])
; CHECK-NEXT: [[TMP3]] = sub i32 [[TMP1]], 4
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[LSR_IV10]], i32 4, <4 x i1> [[TMP2]], <4 x i32> undef)
; CHECK-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[LSR_IV1113]], i32 4, <4 x i1> [[TMP2]], <4 x i32> undef)
; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD9]], [[WIDE_MASKED_LOAD]]
; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[TMP4]], <4 x i32>* [[LSR_IV1416]], i32 4, <4 x i1> [[TMP2]])
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
; CHECK-NEXT: [[SCEVGEP]] = getelementptr i32, i32* [[LSR_IV]], i32 4
; CHECK-NEXT: [[SCEVGEP12]] = getelementptr i32, i32* [[LSR_IV11]], i32 4
; CHECK-NEXT: [[SCEVGEP15]] = getelementptr i32, i32* [[LSR_IV14]], i32 4
@ -48,13 +43,7 @@ vector.body:
%lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
%lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>*
%lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>*
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
; %1 = icmp ult <4 x i32> %induction, <i32 32002, i32 32002, i32 32002, i32 32002>
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 32003)
%wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef)
%wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef)
%2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load
@ -244,11 +233,7 @@ vector.body:
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
; Non-uniform constant vector here. This can't be represented with
; @llvm.get.active.lane.mask, but let's keep this test as a sanity check:
%1 = icmp ult <4 x i32> %induction, <i32 0, i32 3200, i32 32002, i32 32002>
%wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef)
%wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef)
%2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load
@ -285,13 +270,8 @@ vector.body:
%lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
%lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>*
%lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>*
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
; BTC = UINT_MAX, and scalar trip count BTC + 1 would overflow:
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 4294967295)
%wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef)
%wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef)
%2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load
@ -328,12 +308,7 @@ vector.body:
%lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
%lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>*
%lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>*
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 32003)
%wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef)
%wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef)
%2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load
@ -371,13 +346,8 @@ vector.body:
%lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
%lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>*
%lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>*
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
; The induction variable %N is not an IV:
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %N, i32 32003)
%wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef)
%wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef)
%2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load
@ -414,12 +384,7 @@ vector.body:
%lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
%lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>*
%lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>*
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 32003)
%wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef)
%wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef)
%2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load
@ -460,9 +425,6 @@ vector.body:
%lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
%lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>*
%lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>*
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 32003)
%wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef)
%wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef)
@ -514,10 +476,8 @@ vector.body: ; preds = %vector.body, %vecto
%lsr.iv3840 = bitcast i32* %lsr.iv38 to <4 x i32>*
%lsr.iv3335 = bitcast i32* %lsr.iv33 to <4 x i32>*
%lsr.iv2830 = bitcast i32* %lsr.iv28 to <4 x i32>*
; It's using %j.025, the induction variable from its outer loop:
%active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %j.025, i32 4096)
%wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv3840, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
%wide.masked.load27 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv3335, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
%1 = add nsw <4 x i32> %wide.masked.load27, %wide.masked.load

View File

@ -82,7 +82,6 @@ entry:
br i1 %cmp8, label %vector.ph, label %for.cond.cleanup
vector.ph: ; preds = %entry
%trip.count.minus.1 = add i32 %N, -1
%start = call i32 @llvm.start.loop.iterations.i32(i32 %5)
br label %vector.body
@ -92,13 +91,10 @@ vector.body: ; preds = %vector.body, %vecto
%lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %vector.ph ]
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%6 = phi i32 [ %start, %vector.ph ], [ %8, %vector.body ]
%lsr.iv13 = bitcast i32* %lsr.iv to <4 x i32>*
%lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
%lsr.iv1719 = bitcast i32* %lsr.iv17 to <4 x i32>*
%active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
%wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv13, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
%wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
%7 = add nsw <4 x i32> %wide.masked.load12, %wide.masked.load

View File

@ -27,7 +27,6 @@ entry:
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %blockSize, 7
%n.vec = and i32 %n.rnd.up, -8
%trip.count.minus.1 = add i32 %blockSize, -1
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
@ -77,7 +76,6 @@ entry:
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %blockSize, 7
%n.vec = and i32 %n.rnd.up, -8
%trip.count.minus.1 = add i32 %blockSize, -1
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph

View File

@ -26,7 +26,6 @@ entry:
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %blockSize, 3
%n.vec = and i32 %n.rnd.up, -4
%trip.count.minus.1 = add i32 %blockSize, -1
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph

View File

@ -26,7 +26,6 @@ entry:
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %n, 3
%n.vec = and i32 %n.rnd.up, -4
%trip.count.minus.1 = add i32 %n, -1
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
@ -72,7 +71,6 @@ entry:
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %n, 3
%n.vec = and i32 %n.rnd.up, -4
%trip.count.minus.1 = add i32 %n, -1
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
@ -118,7 +116,6 @@ entry:
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %n, 3
%n.vec = and i32 %n.rnd.up, -4
%trip.count.minus.1 = add i32 %n, -1
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
@ -164,7 +161,6 @@ entry:
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %n, 3
%n.vec = and i32 %n.rnd.up, -4
%trip.count.minus.1 = add i32 %n, -1
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
@ -210,7 +206,6 @@ entry:
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %n, 3
%n.vec = and i32 %n.rnd.up, -4
%trip.count.minus.1 = add i32 %n, -1
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
@ -260,7 +255,6 @@ entry:
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %n, 3
%n.vec = and i32 %n.rnd.up, -4
%trip.count.minus.1 = add i32 %n, -1
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph

View File

@ -27,7 +27,6 @@ entry:
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %blockSize, 7
%n.vec = and i32 %n.rnd.up, -8
%trip.count.minus.1 = add i32 %blockSize, -1
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
@ -77,7 +76,6 @@ entry:
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %blockSize, 7
%n.vec = and i32 %n.rnd.up, -8
%trip.count.minus.1 = add i32 %blockSize, -1
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph

View File

@ -18,8 +18,6 @@ vector.ph:
%tmp = add i32 %N, -1
%n.rnd.up = add i32 %tmp, 8
%n.vec = and i32 %n.rnd.up, -8
%broadcast.splatinsert1 = insertelement <8 x i32> undef, i32 %tmp, i32 0
%broadcast.splat2 = shufflevector <8 x i32> %broadcast.splatinsert1, <8 x i32> undef, <8 x i32> zeroinitializer
%0 = add i32 %n.vec, -8
%1 = lshr i32 %0, 3
%2 = add i32 %1, 1
@ -30,14 +28,8 @@ vector.body: ; preds = %vector.body, %vecto
%index = phi i32 [ 0, %vector.ph], [ %index.next, %vector.body ]
%vec.phi = phi <8 x i16> [ zeroinitializer, %vector.ph], [ %tmp8, %vector.body ]
%3 = phi i32 [ %start, %vector.ph], [ %4, %vector.body ]
%broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
%induction = add <8 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%tmp2 = getelementptr inbounds i16, i16* %A, i32 %index
; %tmp3 = icmp ule <8 x i32> %induction, %broadcast.splat2
%tmp3 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N)
%tmp4 = bitcast i16* %tmp2 to <8 x i16>*
%wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp4, i32 4, <8 x i1> %tmp3, <8 x i16> undef)
%tmp5 = getelementptr inbounds i16, i16* %B, i32 %index
@ -87,8 +79,6 @@ vector.ph:
%tmp = add i32 %N, -1
%n.rnd.up = add nuw nsw i32 %tmp, 8
%n.vec = and i32 %n.rnd.up, -8
%broadcast.splatinsert1 = insertelement <8 x i32> undef, i32 %tmp, i32 0
%broadcast.splat2 = shufflevector <8 x i32> %broadcast.splatinsert1, <8 x i32> undef, <8 x i32> zeroinitializer
%broadcast.splatinsert3 = insertelement <8 x i16> undef, i16 %B, i32 0
%broadcast.splat4 = shufflevector <8 x i16> %broadcast.splatinsert3, <8 x i16> undef, <8 x i32> zeroinitializer
%0 = add i32 %n.vec, -8
@ -101,14 +91,8 @@ vector.body: ; preds = %vector.body, %vecto
%index = phi i32 [ 0, %vector.ph], [ %index.next, %vector.body ]
%vec.phi = phi <8 x i16> [ zeroinitializer, %vector.ph], [ %tmp6, %vector.body ]
%3 = phi i32 [ %start, %vector.ph], [ %4, %vector.body ]
%broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
%induction = add <8 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%tmp2 = getelementptr inbounds i16, i16* %A, i32 %index
; %tmp3 = icmp ule <8 x i32> %induction, %broadcast.splat2
%tmp3 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N)
%tmp4 = bitcast i16* %tmp2 to <8 x i16>*
%wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp4, i32 4, <8 x i1> %tmp3, <8 x i16> undef)
%tmp5 = add <8 x i16> %vec.phi, %broadcast.splat4
@ -151,8 +135,6 @@ entry:
%tmp = add i32 %N, -1
%n.rnd.up = add nuw nsw i32 %tmp, 8
%n.vec = and i32 %n.rnd.up, -8
%broadcast.splatinsert1 = insertelement <8 x i32> undef, i32 %tmp, i32 0
%broadcast.splat2 = shufflevector <8 x i32> %broadcast.splatinsert1, <8 x i32> undef, <8 x i32> zeroinitializer
%broadcast.splatinsert3 = insertelement <8 x i16> undef, i16 %B, i32 0
%broadcast.splat4 = shufflevector <8 x i16> %broadcast.splatinsert3, <8 x i16> undef, <8 x i32> zeroinitializer
%0 = add i32 %n.vec, -8
@ -165,14 +147,8 @@ vector.body: ; preds = %vector.body, %vecto
%index = phi i32 [ 0, %entry], [ %index.next, %vector.body ]
%vec.phi = phi <8 x i16> [ zeroinitializer, %entry], [ %tmp6, %vector.body ]
%3 = phi i32 [ %start, %entry ], [ %4, %vector.body ]
%broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
%induction = add <8 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%tmp2 = getelementptr inbounds i16, i16* %A, i32 %index
; %tmp3 = icmp ule <8 x i32> %induction, %broadcast.splat2
%tmp3 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N)
%tmp4 = bitcast i16* %tmp2 to <8 x i16>*
%wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp4, i32 4, <8 x i1> %tmp3, <8 x i16> undef)
%tmp5 = add <8 x i16> %vec.phi, %broadcast.splat4
@ -227,7 +203,6 @@ for.body:
br i1 %cmp433, label %vector.ph, label %for.end
vector.ph: ; preds = %for.body
%trip.count.minus.1 = add i32 %8, -1
%start = call i32 @llvm.start.loop.iterations.i32(i32 %7)
br label %vector.body

View File

@ -14,23 +14,14 @@ entry:
br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
vector.ph: ; preds = %entry
%trip.count.minus.1 = add i32 %N, -1
%broadcast.splatinsert10 = insertelement <8 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat11 = shufflevector <8 x i32> %broadcast.splatinsert10, <8 x i32> undef, <8 x i32> zeroinitializer
%start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
%broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
%induction = add <8 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%tmp = getelementptr inbounds i16, i16* %a, i32 %index
; %tmp1 = icmp ule <8 x i32> %induction, %broadcast.splat11
%tmp1 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N)
%tmp1 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N)
%tmp2 = bitcast i16* %tmp to <8 x i16>*
%wide.masked.load = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp2, i32 4, <8 x i1> %tmp1, <8 x i16> undef)
%tmp3 = getelementptr inbounds i16, i16* %b, i32 %index
@ -72,8 +63,6 @@ entry:
vector.ph: ; preds = %entry
%trip.count.minus.1 = add i32 %N, -1
%broadcast.splatinsert10 = insertelement <8 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat11 = shufflevector <8 x i32> %broadcast.splatinsert10, <8 x i32> undef, <8 x i32> zeroinitializer
%broadcast.splatinsert10.store = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat11.store = shufflevector <4 x i32> %broadcast.splatinsert10.store, <4 x i32> undef, <4 x i32> zeroinitializer
%start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
@ -83,14 +72,8 @@ vector.body: ; preds = %vector.body, %vecto
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%store.idx = phi i32 [ 0, %vector.ph ], [ %store.idx.next, %vector.body ]
%tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
%broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
%induction = add <8 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%tmp = getelementptr inbounds i16, i16* %a, i32 %index
; %tmp1 = icmp ule <8 x i32> %induction, %broadcast.splat11
%tmp1 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N)
%tmp2 = bitcast i16* %tmp to <8 x i16>*
%wide.masked.load = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp2, i32 4, <8 x i1> %tmp1, <8 x i16> undef)
%tmp3 = getelementptr inbounds i16, i16* %b, i32 %index
@ -136,23 +119,14 @@ entry:
br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
vector.ph: ; preds = %entry
%trip.count.minus.1 = add i32 %N, -1
%broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
%start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
%tmp = getelementptr inbounds i32, i32* %a, i32 %index
; %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
%tmp1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
%tmp2 = bitcast i32* %tmp to <4 x i32>*
%wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
%tmp3 = getelementptr inbounds i32, i32* %b, i32 %index

View File

@ -183,7 +183,6 @@ for.body: ; preds = %for.end, %for.body.
br i1 %cmp433, label %vector.ph, label %for.end
vector.ph: ; preds = %for.body
%trip.count.minus.1 = add i32 %i8, -1
%start = call i32 @llvm.start.loop.iterations.i32(i32 %i7)
br label %vector.body

View File

@ -11,27 +11,25 @@ define dso_local i32 @mul_reduce_add(i32* noalias nocapture readonly %a, i32* no
; CHECK-NEXT: .LBB0_1: @ %vector.ph
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: adds r3, r2, #3
; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: vmov.i32 q1, #0x0
; CHECK-NEXT: bic r3, r3, #3
; CHECK-NEXT: sub.w r12, r3, #4
; CHECK-NEXT: movs r3, #1
; CHECK-NEXT: add.w r12, r3, r12, lsr #2
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: dls lr, r12
; CHECK-NEXT: add.w r3, r3, r12, lsr #2
; CHECK-NEXT: dls lr, r3
; CHECK-NEXT: .LBB0_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.32 r2
; CHECK-NEXT: vmov q1, q0
; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: vpstt
; CHECK-NEXT: vldrwt.u32 q0, [r0], #16
; CHECK-NEXT: vldrwt.u32 q1, [r0], #16
; CHECK-NEXT: vldrwt.u32 q2, [r1], #16
; CHECK-NEXT: adds r3, #4
; CHECK-NEXT: vmul.i32 q0, q2, q0
; CHECK-NEXT: subs r2, #4
; CHECK-NEXT: vadd.i32 q0, q0, q1
; CHECK-NEXT: vmul.i32 q1, q2, q1
; CHECK-NEXT: vadd.i32 q1, q1, q0
; CHECK-NEXT: le lr, .LBB0_2
; CHECK-NEXT: @ %bb.3: @ %middle.block
; CHECK-NEXT: vpsel q0, q0, q1
; CHECK-NEXT: vpsel q0, q1, q0
; CHECK-NEXT: vaddv.u32 r0, q0
; CHECK-NEXT: pop {r7, pc}
entry:
@ -41,22 +39,13 @@ entry:
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %N, 3
%n.vec = and i32 %n.rnd.up, -4
%trip.count.minus.1 = add i32 %N, -1
%broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %6, %vector.body ]
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
%0 = getelementptr inbounds i32, i32* %a, i32 %index
; %1 = icmp ule <4 x i32> %induction, %broadcast.splat12
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
%2 = bitcast i32* %0 to <4 x i32>*
%wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef)
%3 = getelementptr inbounds i32, i32* %b, i32 %index
@ -92,16 +81,14 @@ define dso_local i32 @mul_reduce_add_const(i32* noalias nocapture readonly %a, i
; CHECK-NEXT: bic r1, r1, #3
; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: subs r1, #4
; CHECK-NEXT: add.w r3, r3, r1, lsr #2
; CHECK-NEXT: movs r1, #0
; CHECK-NEXT: dls lr, r3
; CHECK-NEXT: add.w r1, r3, r1, lsr #2
; CHECK-NEXT: dls lr, r1
; CHECK-NEXT: .LBB1_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.32 r2
; CHECK-NEXT: vmov q1, q0
; CHECK-NEXT: vpst
; CHECK-NEXT: vldrwt.u32 q0, [r0], #16
; CHECK-NEXT: adds r1, #4
; CHECK-NEXT: subs r2, #4
; CHECK-NEXT: vadd.i32 q0, q0, q1
; CHECK-NEXT: le lr, .LBB1_2
@ -116,22 +103,13 @@ entry:
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %N, 3
%n.vec = and i32 %n.rnd.up, -4
%trip.count.minus.1 = add i32 %N, -1
%broadcast.splatinsert9 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat10 = shufflevector <4 x i32> %broadcast.splatinsert9, <4 x i32> undef, <4 x i32> zeroinitializer
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %3, %vector.body ]
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
%0 = getelementptr inbounds i32, i32* %a, i32 %index
; %1 = icmp ule <4 x i32> %induction, %broadcast.splat10
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
%2 = bitcast i32* %0 to <4 x i32>*
%wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef)
%3 = add nsw <4 x i32> %wide.masked.load, %vec.phi
@ -163,16 +141,14 @@ define dso_local i32 @add_reduce_add_const(i32* noalias nocapture readonly %a, i
; CHECK-NEXT: bic r1, r1, #3
; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: subs r1, #4
; CHECK-NEXT: add.w r3, r3, r1, lsr #2
; CHECK-NEXT: movs r1, #0
; CHECK-NEXT: dls lr, r3
; CHECK-NEXT: add.w r1, r3, r1, lsr #2
; CHECK-NEXT: dls lr, r1
; CHECK-NEXT: .LBB2_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.32 r2
; CHECK-NEXT: vmov q1, q0
; CHECK-NEXT: vpst
; CHECK-NEXT: vldrwt.u32 q0, [r0], #16
; CHECK-NEXT: adds r1, #4
; CHECK-NEXT: subs r2, #4
; CHECK-NEXT: vadd.i32 q0, q0, q1
; CHECK-NEXT: le lr, .LBB2_2
@ -187,22 +163,13 @@ entry:
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %N, 3
%n.vec = and i32 %n.rnd.up, -4
%trip.count.minus.1 = add i32 %N, -1
%broadcast.splatinsert9 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat10 = shufflevector <4 x i32> %broadcast.splatinsert9, <4 x i32> undef, <4 x i32> zeroinitializer
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %3, %vector.body ]
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
%0 = getelementptr inbounds i32, i32* %a, i32 %index
; %1 = icmp ule <4 x i32> %induction, %broadcast.splat10
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
%2 = bitcast i32* %0 to <4 x i32>*
%wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef)
%3 = add nsw <4 x i32> %wide.masked.load, %vec.phi
@ -228,11 +195,9 @@ define dso_local void @vector_mul_const(i32* noalias nocapture %a, i32* noalias
; CHECK-NEXT: it eq
; CHECK-NEXT: popeq {r7, pc}
; CHECK-NEXT: .LBB3_1: @ %vector.ph
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: dlstp.32 lr, r3
; CHECK-NEXT: .LBB3_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: add.w r12, r12, #4
; CHECK-NEXT: vldrw.u32 q0, [r1], #16
; CHECK-NEXT: vmul.i32 q0, q0, r2
; CHECK-NEXT: vstrw.32 q0, [r0], #16
@ -246,23 +211,14 @@ entry:
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %N, 3
%n.vec = and i32 %n.rnd.up, -4
%trip.count.minus.1 = add i32 %N, -1
%broadcast.splatinsert8 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat9 = shufflevector <4 x i32> %broadcast.splatinsert8, <4 x i32> undef, <4 x i32> zeroinitializer
%broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %c, i32 0
%broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
%0 = getelementptr inbounds i32, i32* %b, i32 %index
; %1 = icmp ule <4 x i32> %induction, %broadcast.splat9
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
%2 = bitcast i32* %0 to <4 x i32>*
%wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef)
%3 = mul nsw <4 x i32> %wide.masked.load, %broadcast.splat11
@ -285,11 +241,9 @@ define dso_local void @vector_add_const(i32* noalias nocapture %a, i32* noalias
; CHECK-NEXT: it eq
; CHECK-NEXT: popeq {r7, pc}
; CHECK-NEXT: .LBB4_1: @ %vector.ph
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: dlstp.32 lr, r3
; CHECK-NEXT: .LBB4_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: add.w r12, r12, #4
; CHECK-NEXT: vldrw.u32 q0, [r1], #16
; CHECK-NEXT: vadd.i32 q0, q0, r2
; CHECK-NEXT: vstrw.32 q0, [r0], #16
@ -303,23 +257,14 @@ entry:
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %N, 3
%n.vec = and i32 %n.rnd.up, -4
%trip.count.minus.1 = add i32 %N, -1
%broadcast.splatinsert8 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat9 = shufflevector <4 x i32> %broadcast.splatinsert8, <4 x i32> undef, <4 x i32> zeroinitializer
%broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %c, i32 0
%broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
%0 = getelementptr inbounds i32, i32* %b, i32 %index
; %1 = icmp ule <4 x i32> %induction, %broadcast.splat9
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
%2 = bitcast i32* %0 to <4 x i32>*
%wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef)
%3 = add nsw <4 x i32> %wide.masked.load, %broadcast.splat11
@ -342,11 +287,9 @@ define dso_local arm_aapcs_vfpcc void @vector_mul_vector_i8(i8* noalias nocaptur
; CHECK-NEXT: it eq
; CHECK-NEXT: popeq {r7, pc}
; CHECK-NEXT: .LBB5_1: @ %vector.ph
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: dlstp.8 lr, r3
; CHECK-NEXT: .LBB5_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: add.w r12, r12, #16
; CHECK-NEXT: vldrb.u8 q0, [r1], #16
; CHECK-NEXT: vldrb.u8 q1, [r2], #16
; CHECK-NEXT: vmul.i8 q0, q1, q0
@ -361,21 +304,12 @@ entry:
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %N, 15
%n.vec = and i32 %n.rnd.up, -16
%trip.count.minus.1 = add i32 %N, -1
%broadcast.splatinsert12 = insertelement <16 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat13 = shufflevector <16 x i32> %broadcast.splatinsert12, <16 x i32> undef, <16 x i32> zeroinitializer
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%broadcast.splatinsert = insertelement <16 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <16 x i32> %broadcast.splatinsert, <16 x i32> undef, <16 x i32> zeroinitializer
%induction = add <16 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%0 = getelementptr inbounds i8, i8* %b, i32 %index
; %1 = icmp ule <16 x i32> %induction, %broadcast.splat13
%1 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %N)
%2 = bitcast i8* %0 to <16 x i8>*
%wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %2, i32 1, <16 x i1> %1, <16 x i8> undef)
%3 = getelementptr inbounds i8, i8* %c, i32 %index
@ -402,11 +336,9 @@ define dso_local arm_aapcs_vfpcc void @vector_mul_vector_i16(i16* noalias nocapt
; CHECK-NEXT: it eq
; CHECK-NEXT: popeq {r7, pc}
; CHECK-NEXT: .LBB6_1: @ %vector.ph
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: dlstp.16 lr, r3
; CHECK-NEXT: .LBB6_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: add.w r12, r12, #8
; CHECK-NEXT: vldrh.u16 q0, [r1], #16
; CHECK-NEXT: vldrh.u16 q1, [r2], #16
; CHECK-NEXT: vmul.i16 q0, q1, q0
@ -421,21 +353,12 @@ entry:
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %N, 7
%n.vec = and i32 %n.rnd.up, -8
%trip.count.minus.1 = add i32 %N, -1
%broadcast.splatinsert12 = insertelement <8 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat13 = shufflevector <8 x i32> %broadcast.splatinsert12, <8 x i32> undef, <8 x i32> zeroinitializer
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
%induction = add <8 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%0 = getelementptr inbounds i16, i16* %b, i32 %index
; %1 = icmp ule <8 x i32> %induction, %broadcast.splat13
%1 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N)
%2 = bitcast i16* %0 to <8 x i16>*
%wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %2, i32 2, <8 x i1> %1, <8 x i16> undef)
%3 = getelementptr inbounds i16, i16* %c, i32 %index

View File

@ -30,9 +30,6 @@ entry:
br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
vector.ph: ; preds = %entry
%trip.count.minus.1 = add i32 %N, -1
%broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer
%start = call i32 @llvm.start.loop.iterations.i32(i32 %5)
br label %vector.body
@ -44,13 +41,7 @@ vector.body: ; preds = %vector.body, %vecto
%6 = phi i32 [ %start, %vector.ph ], [ %10, %vector.body ]
%lsr.iv24 = bitcast i32* %lsr.iv2 to <4 x i32>*
%lsr.iv1 = bitcast i32* %lsr.iv to <4 x i32>*
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
; %7 = icmp ule <4 x i32> %induction, %broadcast.splat12
%7 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
%wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv24, i32 4, <4 x i1> %7, <4 x i32> undef)
%wide.masked.load13 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1, i32 4, <4 x i1> %7, <4 x i32> undef)
%8 = mul nsw <4 x i32> %wide.masked.load13, %wide.masked.load

View File

@ -11,11 +11,9 @@ define arm_aapcs_vfpcc void @fmas1(float* nocapture readonly %x, float* nocaptur
; CHECK-NEXT: poplt {r4, pc}
; CHECK-NEXT: .LBB0_1: @ %vector.ph
; CHECK-NEXT: vmov r12, s0
; CHECK-NEXT: movs r4, #0
; CHECK-NEXT: dlstp.32 lr, r3
; CHECK-NEXT: .LBB0_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: adds r4, #4
; CHECK-NEXT: vldrw.u32 q0, [r1], #16
; CHECK-NEXT: vldrw.u32 q1, [r0], #16
; CHECK-NEXT: vfmas.f32 q1, q0, r12
@ -30,23 +28,14 @@ entry:
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %n, 3
%n.vec = and i32 %n.rnd.up, -4
%trip.count.minus.1 = add i32 %n, -1
%broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
%broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0
%broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
%0 = getelementptr inbounds float, float* %x, i32 %index
; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
%2 = bitcast float* %0 to <4 x float>*
%wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
%3 = getelementptr inbounds float, float* %y, i32 %index
@ -74,11 +63,9 @@ define arm_aapcs_vfpcc void @fmas2(float* nocapture readonly %x, float* nocaptur
; CHECK-NEXT: poplt {r4, pc}
; CHECK-NEXT: .LBB1_1: @ %vector.ph
; CHECK-NEXT: vmov r12, s0
; CHECK-NEXT: movs r4, #0
; CHECK-NEXT: dlstp.32 lr, r3
; CHECK-NEXT: .LBB1_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: adds r4, #4
; CHECK-NEXT: vldrw.u32 q0, [r0], #16
; CHECK-NEXT: vldrw.u32 q1, [r1], #16
; CHECK-NEXT: vfmas.f32 q1, q0, r12
@ -93,23 +80,14 @@ entry:
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %n, 3
%n.vec = and i32 %n.rnd.up, -4
%trip.count.minus.1 = add i32 %n, -1
%broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
%broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0
%broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
%0 = getelementptr inbounds float, float* %x, i32 %index
; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
%2 = bitcast float* %0 to <4 x float>*
%wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
%3 = getelementptr inbounds float, float* %y, i32 %index
@ -138,11 +116,9 @@ define arm_aapcs_vfpcc void @fma1(float* nocapture readonly %x, float* nocapture
; CHECK-NEXT: poplt {r4, pc}
; CHECK-NEXT: .LBB2_1: @ %vector.ph
; CHECK-NEXT: vmov r12, s0
; CHECK-NEXT: movs r4, #0
; CHECK-NEXT: dlstp.32 lr, r3
; CHECK-NEXT: .LBB2_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: adds r4, #4
; CHECK-NEXT: vldrw.u32 q0, [r0], #16
; CHECK-NEXT: vldrw.u32 q1, [r1], #16
; CHECK-NEXT: vfma.f32 q1, q0, r12
@ -157,23 +133,14 @@ entry:
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %n, 3
%n.vec = and i32 %n.rnd.up, -4
%trip.count.minus.1 = add i32 %n, -1
%broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
%broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0
%broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
%0 = getelementptr inbounds float, float* %x, i32 %index
; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
%2 = bitcast float* %0 to <4 x float>*
%wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
%3 = getelementptr inbounds float, float* %y, i32 %index
@ -201,11 +168,9 @@ define arm_aapcs_vfpcc void @fma2(float* nocapture readonly %x, float* nocapture
; CHECK-NEXT: poplt {r4, pc}
; CHECK-NEXT: .LBB3_1: @ %vector.ph
; CHECK-NEXT: vmov r12, s0
; CHECK-NEXT: movs r4, #0
; CHECK-NEXT: dlstp.32 lr, r3
; CHECK-NEXT: .LBB3_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: adds r4, #4
; CHECK-NEXT: vldrw.u32 q0, [r0], #16
; CHECK-NEXT: vldrw.u32 q1, [r1], #16
; CHECK-NEXT: vfma.f32 q1, q0, r12
@ -220,23 +185,14 @@ entry:
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %n, 3
%n.vec = and i32 %n.rnd.up, -4
%trip.count.minus.1 = add i32 %n, -1
%broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
%broadcast.splatinsert12 = insertelement <4 x float> undef, float %a, i32 0
%broadcast.splat13 = shufflevector <4 x float> %broadcast.splatinsert12, <4 x float> undef, <4 x i32> zeroinitializer
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
%0 = getelementptr inbounds float, float* %x, i32 %index
; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
%2 = bitcast float* %0 to <4 x float>*
%wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
%3 = fmul fast <4 x float> %wide.masked.load, %broadcast.splat13
@ -265,12 +221,10 @@ define arm_aapcs_vfpcc void @fmss1(float* nocapture readonly %x, float* nocaptur
; CHECK-NEXT: poplt {r4, pc}
; CHECK-NEXT: .LBB4_1: @ %vector.ph
; CHECK-NEXT: vmov r12, s0
; CHECK-NEXT: movs r4, #0
; CHECK-NEXT: eor r12, r12, #-2147483648
; CHECK-NEXT: dlstp.32 lr, r3
; CHECK-NEXT: .LBB4_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: adds r4, #4
; CHECK-NEXT: vldrw.u32 q0, [r1], #16
; CHECK-NEXT: vldrw.u32 q1, [r0], #16
; CHECK-NEXT: vfmas.f32 q1, q0, r12
@ -286,23 +240,14 @@ vector.ph: ; preds = %entry
%fneg = fneg fast float %a
%n.rnd.up = add i32 %n, 3
%n.vec = and i32 %n.rnd.up, -4
%trip.count.minus.1 = add i32 %n, -1
%broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
%broadcast.splatinsert13 = insertelement <4 x float> undef, float %fneg, i32 0
%broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
%0 = getelementptr inbounds float, float* %x, i32 %index
; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
%2 = bitcast float* %0 to <4 x float>*
%wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
%3 = getelementptr inbounds float, float* %y, i32 %index
@ -329,14 +274,12 @@ define arm_aapcs_vfpcc void @fmss2(float* nocapture readonly %x, float* nocaptur
; CHECK-NEXT: it lt
; CHECK-NEXT: poplt {r4, pc}
; CHECK-NEXT: .LBB5_1: @ %vector.ph
; CHECK-NEXT: vmov lr, s0
; CHECK-NEXT: vdup.32 q0, lr
; CHECK-NEXT: vmov r12, s0
; CHECK-NEXT: vdup.32 q0, r12
; CHECK-NEXT: vneg.f32 q0, q0
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: dlstp.32 lr, r3
; CHECK-NEXT: .LBB5_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: add.w r12, r12, #4
; CHECK-NEXT: vmov q3, q0
; CHECK-NEXT: vldrw.u32 q1, [r0], #16
; CHECK-NEXT: vldrw.u32 q2, [r1], #16
@ -352,23 +295,14 @@ entry:
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %n, 3
%n.vec = and i32 %n.rnd.up, -4
%trip.count.minus.1 = add i32 %n, -1
%broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
%broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0
%broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
%0 = getelementptr inbounds float, float* %x, i32 %index
; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
%2 = bitcast float* %0 to <4 x float>*
%wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
%3 = getelementptr inbounds float, float* %y, i32 %index
@ -398,11 +332,9 @@ define arm_aapcs_vfpcc void @fmss3(float* nocapture readonly %x, float* nocaptur
; CHECK-NEXT: .LBB6_1: @ %vector.ph
; CHECK-NEXT: vmov r4, s0
; CHECK-NEXT: vdup.32 q0, r4
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: dlstp.32 lr, r3
; CHECK-NEXT: .LBB6_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: add.w r12, r12, #4
; CHECK-NEXT: vmov q3, q0
; CHECK-NEXT: vldrw.u32 q1, [r0], #16
; CHECK-NEXT: vldrw.u32 q2, [r1], #16
@ -418,23 +350,14 @@ entry:
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %n, 3
%n.vec = and i32 %n.rnd.up, -4
%trip.count.minus.1 = add i32 %n, -1
%broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
%broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0
%broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
%0 = getelementptr inbounds float, float* %x, i32 %index
; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
%2 = bitcast float* %0 to <4 x float>*
%wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
%3 = getelementptr inbounds float, float* %y, i32 %index
@ -464,11 +387,9 @@ define arm_aapcs_vfpcc void @fmss4(float* nocapture readonly %x, float* nocaptur
; CHECK-NEXT: .LBB7_1: @ %vector.ph
; CHECK-NEXT: vmov r4, s0
; CHECK-NEXT: vdup.32 q0, r4
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: dlstp.32 lr, r3
; CHECK-NEXT: .LBB7_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: add.w r12, r12, #4
; CHECK-NEXT: vmov q3, q0
; CHECK-NEXT: vldrw.u32 q1, [r0], #16
; CHECK-NEXT: vldrw.u32 q2, [r1], #16
@ -484,23 +405,14 @@ entry:
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %n, 3
%n.vec = and i32 %n.rnd.up, -4
%trip.count.minus.1 = add i32 %n, -1
%broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
%broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0
%broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
%0 = getelementptr inbounds float, float* %x, i32 %index
; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
%2 = bitcast float* %0 to <4 x float>*
%wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
%3 = getelementptr inbounds float, float* %y, i32 %index
@ -529,12 +441,10 @@ define arm_aapcs_vfpcc void @fms1(float* nocapture readonly %x, float* nocapture
; CHECK-NEXT: poplt {r4, pc}
; CHECK-NEXT: .LBB8_1: @ %vector.ph
; CHECK-NEXT: vmov r12, s0
; CHECK-NEXT: movs r4, #0
; CHECK-NEXT: eor r12, r12, #-2147483648
; CHECK-NEXT: dlstp.32 lr, r3
; CHECK-NEXT: .LBB8_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: adds r4, #4
; CHECK-NEXT: vldrw.u32 q0, [r0], #16
; CHECK-NEXT: vldrw.u32 q1, [r1], #16
; CHECK-NEXT: vfma.f32 q1, q0, r12
@ -550,23 +460,14 @@ vector.ph: ; preds = %entry
%fneg = fneg fast float %a
%n.rnd.up = add i32 %n, 3
%n.vec = and i32 %n.rnd.up, -4
%trip.count.minus.1 = add i32 %n, -1
%broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
%broadcast.splatinsert13 = insertelement <4 x float> undef, float %fneg, i32 0
%broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
%0 = getelementptr inbounds float, float* %x, i32 %index
; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
%2 = bitcast float* %0 to <4 x float>*
%wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
%3 = getelementptr inbounds float, float* %y, i32 %index
@ -595,11 +496,9 @@ define arm_aapcs_vfpcc void @fms2(float* nocapture readonly %x, float* nocapture
; CHECK-NEXT: .LBB9_1: @ %vector.ph
; CHECK-NEXT: vmov r4, s0
; CHECK-NEXT: vdup.32 q0, r4
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: dlstp.32 lr, r3
; CHECK-NEXT: .LBB9_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: add.w r12, r12, #4
; CHECK-NEXT: vldrw.u32 q1, [r0], #16
; CHECK-NEXT: vldrw.u32 q2, [r1], #16
; CHECK-NEXT: vfms.f32 q2, q1, q0
@ -614,23 +513,14 @@ entry:
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %n, 3
%n.vec = and i32 %n.rnd.up, -4
%trip.count.minus.1 = add i32 %n, -1
%broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
%broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0
%broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
%0 = getelementptr inbounds float, float* %x, i32 %index
; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
%2 = bitcast float* %0 to <4 x float>*
%wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
%3 = getelementptr inbounds float, float* %y, i32 %index
@ -659,16 +549,14 @@ define arm_aapcs_vfpcc void @fms3(float* nocapture readonly %x, float* nocapture
; CHECK-NEXT: poplt {r4, pc}
; CHECK-NEXT: .LBB10_1: @ %vector.ph
; CHECK-NEXT: vmov r12, s0
; CHECK-NEXT: movs r4, #0
; CHECK-NEXT: dlstp.32 lr, r3
; CHECK-NEXT: .LBB10_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r0], #16
; CHECK-NEXT: vldrw.u32 q1, [r1], #16
; CHECK-NEXT: adds r4, #4
; CHECK-NEXT: vneg.f32 q1, q1
; CHECK-NEXT: vfma.f32 q1, q0, r12
; CHECK-NEXT: vstrw.32 q1, [r2], #16
; CHECK-NEXT: vldrw.u32 q0, [r1], #16
; CHECK-NEXT: vldrw.u32 q1, [r0], #16
; CHECK-NEXT: vneg.f32 q0, q0
; CHECK-NEXT: vfma.f32 q0, q1, r12
; CHECK-NEXT: vstrw.32 q0, [r2], #16
; CHECK-NEXT: letp lr, .LBB10_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT: pop {r4, pc}
@ -679,23 +567,14 @@ entry:
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %n, 3
%n.vec = and i32 %n.rnd.up, -4
%trip.count.minus.1 = add i32 %n, -1
%broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
%broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0
%broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
%0 = getelementptr inbounds float, float* %x, i32 %index
; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
%2 = bitcast float* %0 to <4 x float>*
%wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
%3 = getelementptr inbounds float, float* %y, i32 %index
@ -724,16 +603,14 @@ define arm_aapcs_vfpcc void @fms4(float* nocapture readonly %x, float* nocapture
; CHECK-NEXT: poplt {r4, pc}
; CHECK-NEXT: .LBB11_1: @ %vector.ph
; CHECK-NEXT: vmov r12, s0
; CHECK-NEXT: movs r4, #0
; CHECK-NEXT: dlstp.32 lr, r3
; CHECK-NEXT: .LBB11_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r0], #16
; CHECK-NEXT: vldrw.u32 q1, [r1], #16
; CHECK-NEXT: adds r4, #4
; CHECK-NEXT: vneg.f32 q1, q1
; CHECK-NEXT: vfma.f32 q1, q0, r12
; CHECK-NEXT: vstrw.32 q1, [r2], #16
; CHECK-NEXT: vldrw.u32 q0, [r1], #16
; CHECK-NEXT: vldrw.u32 q1, [r0], #16
; CHECK-NEXT: vneg.f32 q0, q0
; CHECK-NEXT: vfma.f32 q0, q1, r12
; CHECK-NEXT: vstrw.32 q0, [r2], #16
; CHECK-NEXT: letp lr, .LBB11_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT: pop {r4, pc}
@ -744,23 +621,14 @@ entry:
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %n, 3
%n.vec = and i32 %n.rnd.up, -4
%trip.count.minus.1 = add i32 %n, -1
%broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
%broadcast.splatinsert12 = insertelement <4 x float> undef, float %a, i32 0
%broadcast.splat13 = shufflevector <4 x float> %broadcast.splatinsert12, <4 x float> undef, <4 x i32> zeroinitializer
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
%0 = getelementptr inbounds float, float* %x, i32 %index
; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
%2 = bitcast float* %0 to <4 x float>*
%wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
%3 = fmul fast <4 x float> %wide.masked.load, %broadcast.splat13

View File

@ -232,7 +232,6 @@ define arm_aapcs_vfpcc void @gather_pre_inc(i32* noalias nocapture readonly %dat
; CHECK-NEXT: .long 4294967272 @ 0xffffffe8
; CHECK-NEXT: .long 0 @ 0x0
vector.ph: ; preds = %for.body.preheader
%ind.end = shl i32 %n.vec, 1
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
@ -276,7 +275,6 @@ define arm_aapcs_vfpcc void @gather_post_inc(i32* noalias nocapture readonly %da
; CHECK-NEXT: .long 4294967248 @ 0xffffffd0
; CHECK-NEXT: .long 4294967272 @ 0xffffffe8
vector.ph41: ; preds = %for.body6.preheader
%ind.end47 = shl i32 %n.vec43, 1
br label %vector.body39
vector.body39: ; preds = %vector.body39, %vector.ph41

View File

@ -1,18 +1,15 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt --arm-mve-gather-scatter-lowering -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp %s -S -o 2>/dev/null - | FileCheck %s
; RUN: opt --arm-mve-gather-scatter-lowering -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp %s -S -o - | FileCheck %s
define arm_aapcs_vfpcc void @push_out_add_sub_block(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, i32 %n.vec) {
; CHECK-LABEL: @push_out_add_sub_block(
; CHECK-NEXT: vector.ph:
; CHECK-NEXT: [[IND_END:%.*]] = shl i32 [[N_VEC:%.*]], 1
; CHECK-NEXT: [[PUSHEDOUTADD:%.*]] = add <4 x i32> <i32 0, i32 2, i32 4, i32 6>, <i32 6, i32 6, i32 6, i32 6>
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY_END:%.*]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ [[PUSHEDOUTADD]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY_END]] ]
; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i32 [[INDEX]], 50
; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i32 [[INDEX]], 48
; CHECK-NEXT: br i1 [[TMP0]], label [[LOWER_BLOCK:%.*]], label [[END:%.*]]
; CHECK: lower.block:
; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i32.v4i32(i32* [[DATA:%.*]], <4 x i32> [[VEC_IND]], i32 32, i32 2, i32 1)
@ -23,20 +20,19 @@ define arm_aapcs_vfpcc void @push_out_add_sub_block(i32* noalias nocapture reado
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8>
; CHECK-NEXT: br label [[VECTOR_BODY_END]]
; CHECK: vector.body.end:
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC:%.*]]
; CHECK-NEXT: br i1 [[TMP4]], label [[END]], label [[VECTOR_BODY]]
; CHECK: end:
; CHECK-NEXT: ret void
;
vector.ph:
%ind.end = shl i32 %n.vec, 1
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body.end ]
%vec.ind = phi <4 x i32> [ <i32 0, i32 2, i32 4, i32 6>, %vector.ph ], [ %vec.ind.next, %vector.body.end ]
%0 = icmp eq i32 %index, 50
%0 = icmp eq i32 %index, 48
br i1 %0, label %lower.block, label %end
lower.block: ; preds = %vector.body
@ -61,7 +57,6 @@ end:
define arm_aapcs_vfpcc void @push_out_mul_sub_block(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, i32 %n.vec) {
; CHECK-LABEL: @push_out_mul_sub_block(
; CHECK-NEXT: vector.ph:
; CHECK-NEXT: [[IND_END:%.*]] = shl i32 [[N_VEC:%.*]], 1
; CHECK-NEXT: [[PUSHEDOUTMUL:%.*]] = mul <4 x i32> <i32 0, i32 2, i32 4, i32 6>, <i32 3, i32 3, i32 3, i32 3>
; CHECK-NEXT: [[PRODUCT:%.*]] = mul <4 x i32> <i32 8, i32 8, i32 8, i32 8>, <i32 3, i32 3, i32 3, i32 3>
; CHECK-NEXT: [[PUSHEDOUTADD:%.*]] = add <4 x i32> [[PUSHEDOUTMUL]], <i32 6, i32 6, i32 6, i32 6>
@ -69,7 +64,7 @@ define arm_aapcs_vfpcc void @push_out_mul_sub_block(i32* noalias nocapture reado
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY_END:%.*]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ [[PUSHEDOUTADD]], [[VECTOR_PH]] ], [ [[INCREMENTPUSHEDOUTMUL:%.*]], [[VECTOR_BODY_END]] ]
; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i32 [[INDEX]], 50
; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i32 [[INDEX]], 48
; CHECK-NEXT: br i1 [[TMP0]], label [[LOWER_BLOCK:%.*]], label [[END:%.*]]
; CHECK: lower.block:
; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i32.v4i32(i32* [[DATA:%.*]], <4 x i32> [[VEC_IND]], i32 32, i32 2, i32 1)
@ -80,20 +75,19 @@ define arm_aapcs_vfpcc void @push_out_mul_sub_block(i32* noalias nocapture reado
; CHECK-NEXT: br label [[VECTOR_BODY_END]]
; CHECK: vector.body.end:
; CHECK-NEXT: [[INCREMENTPUSHEDOUTMUL]] = add <4 x i32> [[VEC_IND]], [[PRODUCT]]
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC:%.*]]
; CHECK-NEXT: br i1 [[TMP4]], label [[END]], label [[VECTOR_BODY]]
; CHECK: end:
; CHECK-NEXT: ret void
;
vector.ph:
%ind.end = shl i32 %n.vec, 1
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body.end ]
%vec.ind = phi <4 x i32> [ <i32 0, i32 2, i32 4, i32 6>, %vector.ph ], [ %vec.ind.next, %vector.body.end ]
%0 = icmp eq i32 %index, 50
%0 = icmp eq i32 %index, 48
br i1 %0, label %lower.block, label %end
lower.block: ; preds = %vector.body
@ -120,7 +114,6 @@ end:
define arm_aapcs_vfpcc void @push_out_mul_sub_loop(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, i32 %n.vec) {
; CHECK-LABEL: @push_out_mul_sub_loop(
; CHECK-NEXT: vector.ph:
; CHECK-NEXT: [[IND_END:%.*]] = shl i32 [[N_VEC:%.*]], 2
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY_END:%.*]] ]
@ -138,19 +131,18 @@ define arm_aapcs_vfpcc void @push_out_mul_sub_loop(i32* noalias nocapture readon
; CHECK-NEXT: br label [[VECTOR_2_BODY_END:%.*]]
; CHECK: vector.2.body.end:
; CHECK-NEXT: [[INDEX_2_NEXT:%.*]] = add i32 [[INDEX]], 4
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_2_NEXT]], 15
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_2_NEXT]], 16
; CHECK-NEXT: br i1 [[TMP5]], label [[VECTOR_BODY_END]], label [[VECTOR_2_BODY]]
; CHECK: vector.body.end:
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8>
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC:%.*]]
; CHECK-NEXT: br i1 [[TMP6]], label [[END:%.*]], label [[VECTOR_BODY]]
; CHECK: end:
; CHECK-NEXT: ret void
;
vector.ph:
%ind.end = shl i32 %n.vec, 2
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
@ -162,7 +154,6 @@ vector.2.ph:
br label %vector.2.body
vector.2.body: ; preds = %vector.body
%index.2 = phi i32 [ 0, %vector.2.ph ], [ %index.2.next, %vector.2.body.end ]
%0 = mul <4 x i32> %vec.ind, <i32 3, i32 3, i32 3, i32 3>
%1 = add <4 x i32> %0, <i32 6, i32 6, i32 6, i32 6>
%2 = getelementptr inbounds i32, i32* %data, <4 x i32> %1
@ -174,7 +165,7 @@ vector.2.body: ; preds = %vector.body
vector.2.body.end: ; preds = %lower.block
%index.2.next = add i32 %index, 4
%5 = icmp eq i32 %index.2.next, 15
%5 = icmp eq i32 %index.2.next, 16
br i1 %5, label %vector.body.end, label %vector.2.body
vector.body.end: ; preds = %lower.block

View File

@ -39,7 +39,6 @@ define arm_aapcs_vfpcc void @push_out_mul_gather(i32* noalias nocapture readonly
; CHECK-NEXT: .long 4294967272 @ 0xffffffe8
vector.ph: ; preds = %for.body.preheader
%ind.end = shl i32 %n.vec, 1
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
@ -83,7 +82,6 @@ define arm_aapcs_vfpcc void @push_out_add_gather(i32* noalias nocapture readonly
; CHECK-NEXT: .long 16 @ 0x10
vector.ph: ; preds = %for.body.preheader
%ind.end = shl i32 %n.vec, 1
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
@ -127,7 +125,6 @@ define arm_aapcs_vfpcc void @push_out_mul_add_gather(i32* noalias nocapture read
; CHECK-NEXT: .long 0 @ 0x0
vector.ph: ; preds = %for.body.preheader
%ind.end = shl i32 %n.vec, 1
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
@ -173,7 +170,6 @@ define arm_aapcs_vfpcc void @push_out_mul_scatter(i32* noalias nocapture readonl
<4 x i32> %to.store) {
vector.ph: ; preds = %for.body.preheader
%ind.end = shl i32 %n.vec, 1
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
@ -215,7 +211,6 @@ define arm_aapcs_vfpcc void @push_out_add_scatter(i32* noalias nocapture readonl
<4 x i32> %to.store) {
vector.ph: ; preds = %for.body.preheader
%ind.end = shl i32 %n.vec, 1
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
@ -259,7 +254,6 @@ define arm_aapcs_vfpcc void @push_out_mul_gather_scatter(i32* noalias nocapture
i32* noalias nocapture %dst, i32 %n.vec) {
vector.ph: ; preds = %for.body.preheader
%ind.end = shl i32 %n.vec, 1
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
@ -301,7 +295,6 @@ define arm_aapcs_vfpcc void @push_out_add_sub_block(i32* noalias nocapture reado
; CHECK-NEXT: .long 16 @ 0x10
vector.ph: ; preds = %for.body.preheader
%ind.end = shl i32 %n.vec, 1
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
@ -358,7 +351,6 @@ define arm_aapcs_vfpcc void @non_gatscat_use1(i32* noalias nocapture readonly %d
; CHECK-NEXT: .long 6 @ 0x6
vector.ph: ; preds = %for.body.preheader
%ind.end = shl i32 %n.vec, 1
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
@ -411,7 +403,6 @@ define arm_aapcs_vfpcc void @non_gatscat_use2(i32* noalias nocapture readonly %d
; CHECK-NEXT: .long 6 @ 0x6
vector.ph: ; preds = %for.body.preheader
%ind.end = shl i32 %n.vec, 1
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
@ -528,7 +519,6 @@ for.cond8.preheader.us.us.preheader.preheader: ; preds = %entry
%2 = add nuw i32 %1, 1
%min.iters.check = icmp ult i32 %0, 6
%n.vec = and i32 %2, -4
%ind.end = shl i32 %n.vec, 1
%broadcast.splatinsert86 = insertelement <4 x i32> undef, i32 %m, i32 0
%broadcast.splat87 = shufflevector <4 x i32> %broadcast.splatinsert86, <4 x i32> undef, <4 x i32> zeroinitializer
%cmp.n = icmp eq i32 %2, %n.vec
@ -983,7 +973,6 @@ for.body10.i: ; preds = %for.cond.cleanup20.
br i1 0, label %for.cond.cleanup20.i, label %for.cond22.preheader.lr.ph.i
for.cond22.preheader.lr.ph.i: ; preds = %for.body10.i
%ind.end = add nsw i32 0, %n.vec
%.splatinsert = insertelement <4 x i32> undef, i32 0, i32 0
%.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = add <4 x i32> %.splat, <i32 0, i32 1, i32 2, i32 3>

View File

@ -1667,7 +1667,6 @@ entry:
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %n, 3
%n.vec = and i32 %n.rnd.up, -4
%trip.count.minus.1 = add i32 %n, -1
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
@ -1718,7 +1717,6 @@ entry:
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %n, 3
%n.vec = and i32 %n.rnd.up, -4
%trip.count.minus.1 = add i32 %n, -1
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
@ -1772,7 +1770,6 @@ entry:
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %n, 7
%n.vec = and i32 %n.rnd.up, -8
%trip.count.minus.1 = add i32 %n, -1
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
@ -1824,7 +1821,6 @@ entry:
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %n, 7
%n.vec = and i32 %n.rnd.up, -8
%trip.count.minus.1 = add i32 %n, -1
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
@ -1880,7 +1876,6 @@ entry:
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %n, 15
%n.vec = and i32 %n.rnd.up, -16
%trip.count.minus.1 = add i32 %n, -1
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
@ -1932,7 +1927,6 @@ entry:
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %n, 15
%n.vec = and i32 %n.rnd.up, -16
%trip.count.minus.1 = add i32 %n, -1
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
@ -1988,7 +1982,6 @@ entry:
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %n, 7
%n.vec = and i32 %n.rnd.up, -8
%trip.count.minus.1 = add i32 %n, -1
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
@ -2039,7 +2032,6 @@ entry:
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %n, 7
%n.vec = and i32 %n.rnd.up, -8
%trip.count.minus.1 = add i32 %n, -1
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
@ -2093,7 +2085,6 @@ entry:
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %n, 15
%n.vec = and i32 %n.rnd.up, -16
%trip.count.minus.1 = add i32 %n, -1
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
@ -2145,7 +2136,6 @@ entry:
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %n, 15
%n.vec = and i32 %n.rnd.up, -16
%trip.count.minus.1 = add i32 %n, -1
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
@ -2201,7 +2191,6 @@ entry:
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %n, 15
%n.vec = and i32 %n.rnd.up, -16
%trip.count.minus.1 = add i32 %n, -1
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
@ -2252,7 +2241,6 @@ entry:
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %n, 15
%n.vec = and i32 %n.rnd.up, -16
%trip.count.minus.1 = add i32 %n, -1
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
@ -2308,7 +2296,6 @@ entry:
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %n, 3
%n.vec = and i32 %n.rnd.up, -4
%trip.count.minus.1 = add i32 %n, -1
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
@ -2362,7 +2349,6 @@ entry:
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %n, 3
%n.vec = and i32 %n.rnd.up, -4
%trip.count.minus.1 = add i32 %n, -1
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
@ -2421,7 +2407,6 @@ entry:
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %n, 7
%n.vec = and i32 %n.rnd.up, -8
%trip.count.minus.1 = add i32 %n, -1
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph