1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-11-23 11:13:28 +01:00
llvm-mirror/test/CodeGen/Thumb2/mve-qrintr.ll
David Green 151143cebb [ARM] Sink splats to MVE intrinsics
The predicated MVE intrinsics are generated as, for example,
llvm.arm.mve.add.predicated(x, splat(y), p). We need to sink the splat
value back into the loop, like we do for other instructions, so we can
re-select qr variants.

Differential Revision: https://reviews.llvm.org/D87693
2020-09-17 16:00:51 +01:00

694 lines
32 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s
; vadd: predicated loop that applies llvm.arm.mve.add.predicated to four i32
; lanes of %s1 per iteration, with a splat of the scalar %c0 as second operand.
; The splat is created once in while.body.lr.ph, outside the loop; the expected
; assembly below uses the vector-by-scalar-register form (`vadd.i32 q0, q0, r1`)
; inside a dlstp.32/letp loop.
define void @vadd(i32* %s1, i32 %c0, i32 %N) {
; CHECK-LABEL: vadd:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cmp r2, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: poplt {r7, pc}
; CHECK-NEXT: .LBB0_1: @ %while.body.lr.ph
; CHECK-NEXT: dlstp.32 lr, r2
; CHECK-NEXT: .LBB0_2: @ %while.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r0]
; CHECK-NEXT: vadd.i32 q0, q0, r1
; CHECK-NEXT: vstrw.32 q0, [r0], #16
; CHECK-NEXT: letp lr, .LBB0_2
; CHECK-NEXT: @ %bb.3: @ %while.end
; CHECK-NEXT: pop {r7, pc}
entry:
; Skip the loop entirely when %N is not positive.
%cmp11 = icmp sgt i32 %N, 0
br i1 %cmp11, label %while.body.lr.ph, label %while.end
while.body.lr.ph: ; preds = %entry
; Build the splat of %c0 once, outside the loop.
%.splatinsert = insertelement <4 x i32> undef, i32 %c0, i32 0
%.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
br label %while.body
while.body: ; preds = %while.body.lr.ph, %while.body
%s1.addr.013 = phi i32* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
%N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ]
; Lane predicate for this iteration, computed from the remaining count.
%0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012)
%1 = bitcast i32* %s1.addr.013 to <4 x i32>*
%2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
; Operation under test: the loaded vector plus the out-of-loop splat, using
; the same predicate as the masked load and store.
%3 = tail call <4 x i32> @llvm.arm.mve.add.predicated.v4i32.v4i1(<4 x i32> %2, <4 x i32> %.splat, <4 x i1> %0, <4 x i32> %2)
tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %1, i32 4, <4 x i1> %0)
; Step the pointer by four elements and the count by -4; iterate again only
; if more than four elements remained at the start of this iteration.
%add.ptr = getelementptr inbounds i32, i32* %s1.addr.013, i32 4
%sub = add nsw i32 %N.addr.012, -4
%cmp = icmp sgt i32 %N.addr.012, 4
br i1 %cmp, label %while.body, label %while.end
while.end: ; preds = %while.body, %entry
ret void
}
; vsub: predicated loop that applies llvm.arm.mve.sub.predicated to four i32
; lanes of %s1 per iteration, with a splat of the scalar %c0 as second operand.
; The splat is created once in while.body.lr.ph, outside the loop; the expected
; assembly below uses the vector-by-scalar-register form (`vsub.i32 q0, q0, r1`)
; inside a dlstp.32/letp loop.
define void @vsub(i32* %s1, i32 %c0, i32 %N) {
; CHECK-LABEL: vsub:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cmp r2, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: poplt {r7, pc}
; CHECK-NEXT: .LBB1_1: @ %while.body.lr.ph
; CHECK-NEXT: dlstp.32 lr, r2
; CHECK-NEXT: .LBB1_2: @ %while.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r0]
; CHECK-NEXT: vsub.i32 q0, q0, r1
; CHECK-NEXT: vstrw.32 q0, [r0], #16
; CHECK-NEXT: letp lr, .LBB1_2
; CHECK-NEXT: @ %bb.3: @ %while.end
; CHECK-NEXT: pop {r7, pc}
entry:
; Skip the loop entirely when %N is not positive.
%cmp11 = icmp sgt i32 %N, 0
br i1 %cmp11, label %while.body.lr.ph, label %while.end
while.body.lr.ph: ; preds = %entry
; Build the splat of %c0 once, outside the loop.
%.splatinsert = insertelement <4 x i32> undef, i32 %c0, i32 0
%.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
br label %while.body
while.body: ; preds = %while.body.lr.ph, %while.body
%s1.addr.013 = phi i32* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
%N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ]
; Lane predicate for this iteration, computed from the remaining count.
%0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012)
%1 = bitcast i32* %s1.addr.013 to <4 x i32>*
%2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
; Operation under test: the loaded vector minus the out-of-loop splat, using
; the same predicate as the masked load and store.
%3 = tail call <4 x i32> @llvm.arm.mve.sub.predicated.v4i32.v4i1(<4 x i32> %2, <4 x i32> %.splat, <4 x i1> %0, <4 x i32> %2)
tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %1, i32 4, <4 x i1> %0)
; Step the pointer by four elements and the count by -4; iterate again only
; if more than four elements remained at the start of this iteration.
%add.ptr = getelementptr inbounds i32, i32* %s1.addr.013, i32 4
%sub = add nsw i32 %N.addr.012, -4
%cmp = icmp sgt i32 %N.addr.012, 4
br i1 %cmp, label %while.body, label %while.end
while.end: ; preds = %while.body, %entry
ret void
}
; vmul: predicated loop that applies llvm.arm.mve.mul.predicated to four i32
; lanes of %s1 per iteration, with a splat of the scalar %c0 as second operand.
; The splat is created once in while.body.lr.ph, outside the loop; the expected
; assembly below uses the vector-by-scalar-register form (`vmul.i32 q0, q0, r1`)
; inside a dlstp.32/letp loop.
define void @vmul(i32* %s1, i32 %c0, i32 %N) {
; CHECK-LABEL: vmul:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cmp r2, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: poplt {r7, pc}
; CHECK-NEXT: .LBB2_1: @ %while.body.lr.ph
; CHECK-NEXT: dlstp.32 lr, r2
; CHECK-NEXT: .LBB2_2: @ %while.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r0]
; CHECK-NEXT: vmul.i32 q0, q0, r1
; CHECK-NEXT: vstrw.32 q0, [r0], #16
; CHECK-NEXT: letp lr, .LBB2_2
; CHECK-NEXT: @ %bb.3: @ %while.end
; CHECK-NEXT: pop {r7, pc}
entry:
; Skip the loop entirely when %N is not positive.
%cmp11 = icmp sgt i32 %N, 0
br i1 %cmp11, label %while.body.lr.ph, label %while.end
while.body.lr.ph: ; preds = %entry
; Build the splat of %c0 once, outside the loop.
%.splatinsert = insertelement <4 x i32> undef, i32 %c0, i32 0
%.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
br label %while.body
while.body: ; preds = %while.body.lr.ph, %while.body
%s1.addr.013 = phi i32* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
%N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ]
; Lane predicate for this iteration, computed from the remaining count.
%0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012)
%1 = bitcast i32* %s1.addr.013 to <4 x i32>*
%2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
; Operation under test: the loaded vector times the out-of-loop splat, using
; the same predicate as the masked load and store.
%3 = tail call <4 x i32> @llvm.arm.mve.mul.predicated.v4i32.v4i1(<4 x i32> %2, <4 x i32> %.splat, <4 x i1> %0, <4 x i32> %2)
tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %1, i32 4, <4 x i1> %0)
; Step the pointer by four elements and the count by -4; iterate again only
; if more than four elements remained at the start of this iteration.
%add.ptr = getelementptr inbounds i32, i32* %s1.addr.013, i32 4
%sub = add nsw i32 %N.addr.012, -4
%cmp = icmp sgt i32 %N.addr.012, 4
br i1 %cmp, label %while.body, label %while.end
while.end: ; preds = %while.body, %entry
ret void
}
; vqadd: predicated loop that applies llvm.arm.mve.qadd.predicated to four i32
; lanes of %s1 per iteration, with a splat of the scalar %c0 as second operand.
; The splat is created once in while.body.lr.ph, outside the loop; the expected
; assembly below uses the vector-by-scalar-register form
; (`vqadd.s32 q0, q0, r1`) inside a dlstp.32/letp loop.
define void @vqadd(i32* %s1, i32 %c0, i32 %N) {
; CHECK-LABEL: vqadd:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cmp r2, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: poplt {r7, pc}
; CHECK-NEXT: .LBB3_1: @ %while.body.lr.ph
; CHECK-NEXT: dlstp.32 lr, r2
; CHECK-NEXT: .LBB3_2: @ %while.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r0]
; CHECK-NEXT: vqadd.s32 q0, q0, r1
; CHECK-NEXT: vstrw.32 q0, [r0], #16
; CHECK-NEXT: letp lr, .LBB3_2
; CHECK-NEXT: @ %bb.3: @ %while.end
; CHECK-NEXT: pop {r7, pc}
entry:
; Skip the loop entirely when %N is not positive.
%cmp11 = icmp sgt i32 %N, 0
br i1 %cmp11, label %while.body.lr.ph, label %while.end
while.body.lr.ph: ; preds = %entry
; Build the splat of %c0 once, outside the loop.
%.splatinsert = insertelement <4 x i32> undef, i32 %c0, i32 0
%.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
br label %while.body
while.body: ; preds = %while.body.lr.ph, %while.body
%s1.addr.013 = phi i32* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
%N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ]
; Lane predicate for this iteration, computed from the remaining count.
%0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012)
%1 = bitcast i32* %s1.addr.013 to <4 x i32>*
%2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
; Operation under test, taking the out-of-loop splat as its second operand.
; NOTE(review): the `i32 0` operand presumably selects the signed variant
; (the expected instruction is vqadd.s32) - confirm against the intrinsic
; definition.
%3 = tail call <4 x i32> @llvm.arm.mve.qadd.predicated.v4i32.v4i1(<4 x i32> %2, <4 x i32> %.splat, i32 0, <4 x i1> %0, <4 x i32> %2)
tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %1, i32 4, <4 x i1> %0)
; Step the pointer by four elements and the count by -4; iterate again only
; if more than four elements remained at the start of this iteration.
%add.ptr = getelementptr inbounds i32, i32* %s1.addr.013, i32 4
%sub = add nsw i32 %N.addr.012, -4
%cmp = icmp sgt i32 %N.addr.012, 4
br i1 %cmp, label %while.body, label %while.end
while.end: ; preds = %while.body, %entry
ret void
}
; vqsub: predicated loop that applies llvm.arm.mve.qsub.predicated to four i32
; lanes of %s1 per iteration, with a splat of the scalar %c0 as second operand.
; The splat is created once in while.body.lr.ph, outside the loop; the expected
; assembly below uses the vector-by-scalar-register form
; (`vqsub.s32 q0, q0, r1`) inside a dlstp.32/letp loop.
define void @vqsub(i32* %s1, i32 %c0, i32 %N) {
; CHECK-LABEL: vqsub:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cmp r2, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: poplt {r7, pc}
; CHECK-NEXT: .LBB4_1: @ %while.body.lr.ph
; CHECK-NEXT: dlstp.32 lr, r2
; CHECK-NEXT: .LBB4_2: @ %while.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r0]
; CHECK-NEXT: vqsub.s32 q0, q0, r1
; CHECK-NEXT: vstrw.32 q0, [r0], #16
; CHECK-NEXT: letp lr, .LBB4_2
; CHECK-NEXT: @ %bb.3: @ %while.end
; CHECK-NEXT: pop {r7, pc}
entry:
; Skip the loop entirely when %N is not positive.
%cmp11 = icmp sgt i32 %N, 0
br i1 %cmp11, label %while.body.lr.ph, label %while.end
while.body.lr.ph: ; preds = %entry
; Build the splat of %c0 once, outside the loop.
%.splatinsert = insertelement <4 x i32> undef, i32 %c0, i32 0
%.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
br label %while.body
while.body: ; preds = %while.body.lr.ph, %while.body
%s1.addr.013 = phi i32* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
%N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ]
; Lane predicate for this iteration, computed from the remaining count.
%0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012)
%1 = bitcast i32* %s1.addr.013 to <4 x i32>*
%2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
; Operation under test, taking the out-of-loop splat as its second operand.
; NOTE(review): the `i32 0` operand presumably selects the signed variant
; (the expected instruction is vqsub.s32) - confirm against the intrinsic
; definition.
%3 = tail call <4 x i32> @llvm.arm.mve.qsub.predicated.v4i32.v4i1(<4 x i32> %2, <4 x i32> %.splat, i32 0, <4 x i1> %0, <4 x i32> %2)
tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %1, i32 4, <4 x i1> %0)
; Step the pointer by four elements and the count by -4; iterate again only
; if more than four elements remained at the start of this iteration.
%add.ptr = getelementptr inbounds i32, i32* %s1.addr.013, i32 4
%sub = add nsw i32 %N.addr.012, -4
%cmp = icmp sgt i32 %N.addr.012, 4
br i1 %cmp, label %while.body, label %while.end
while.end: ; preds = %while.body, %entry
ret void
}
; vhadd: predicated loop that applies llvm.arm.mve.hadd.predicated to four i32
; lanes of %s1 per iteration, with a splat of the scalar %c0 as second operand.
; The splat is created once in while.body.lr.ph, outside the loop; the expected
; assembly below uses the vector-by-scalar-register form
; (`vhadd.s32 q0, q0, r1`) inside a dlstp.32/letp loop.
define void @vhadd(i32* %s1, i32 %c0, i32 %N) {
; CHECK-LABEL: vhadd:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cmp r2, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: poplt {r7, pc}
; CHECK-NEXT: .LBB5_1: @ %while.body.lr.ph
; CHECK-NEXT: dlstp.32 lr, r2
; CHECK-NEXT: .LBB5_2: @ %while.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r0]
; CHECK-NEXT: vhadd.s32 q0, q0, r1
; CHECK-NEXT: vstrw.32 q0, [r0], #16
; CHECK-NEXT: letp lr, .LBB5_2
; CHECK-NEXT: @ %bb.3: @ %while.end
; CHECK-NEXT: pop {r7, pc}
entry:
; Skip the loop entirely when %N is not positive.
%cmp11 = icmp sgt i32 %N, 0
br i1 %cmp11, label %while.body.lr.ph, label %while.end
while.body.lr.ph: ; preds = %entry
; Build the splat of %c0 once, outside the loop.
%.splatinsert = insertelement <4 x i32> undef, i32 %c0, i32 0
%.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
br label %while.body
while.body: ; preds = %while.body.lr.ph, %while.body
%s1.addr.013 = phi i32* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
%N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ]
; Lane predicate for this iteration, computed from the remaining count.
%0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012)
%1 = bitcast i32* %s1.addr.013 to <4 x i32>*
%2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
; Operation under test, taking the out-of-loop splat as its second operand.
; NOTE(review): the `i32 0` operand presumably selects the signed variant
; (the expected instruction is vhadd.s32) - confirm against the intrinsic
; definition.
%3 = tail call <4 x i32> @llvm.arm.mve.hadd.predicated.v4i32.v4i1(<4 x i32> %2, <4 x i32> %.splat, i32 0, <4 x i1> %0, <4 x i32> %2)
tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %1, i32 4, <4 x i1> %0)
; Step the pointer by four elements and the count by -4; iterate again only
; if more than four elements remained at the start of this iteration.
%add.ptr = getelementptr inbounds i32, i32* %s1.addr.013, i32 4
%sub = add nsw i32 %N.addr.012, -4
%cmp = icmp sgt i32 %N.addr.012, 4
br i1 %cmp, label %while.body, label %while.end
while.end: ; preds = %while.body, %entry
ret void
}
; vhsub: predicated loop that applies llvm.arm.mve.hsub.predicated to four i32
; lanes of %s1 per iteration, with a splat of the scalar %c0 as second operand.
; The splat is created once in while.body.lr.ph, outside the loop; the expected
; assembly below uses the vector-by-scalar-register form
; (`vhsub.s32 q0, q0, r1`) inside a dlstp.32/letp loop.
define void @vhsub(i32* %s1, i32 %c0, i32 %N) {
; CHECK-LABEL: vhsub:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cmp r2, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: poplt {r7, pc}
; CHECK-NEXT: .LBB6_1: @ %while.body.lr.ph
; CHECK-NEXT: dlstp.32 lr, r2
; CHECK-NEXT: .LBB6_2: @ %while.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r0]
; CHECK-NEXT: vhsub.s32 q0, q0, r1
; CHECK-NEXT: vstrw.32 q0, [r0], #16
; CHECK-NEXT: letp lr, .LBB6_2
; CHECK-NEXT: @ %bb.3: @ %while.end
; CHECK-NEXT: pop {r7, pc}
entry:
; Skip the loop entirely when %N is not positive.
%cmp11 = icmp sgt i32 %N, 0
br i1 %cmp11, label %while.body.lr.ph, label %while.end
while.body.lr.ph: ; preds = %entry
; Build the splat of %c0 once, outside the loop.
%.splatinsert = insertelement <4 x i32> undef, i32 %c0, i32 0
%.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
br label %while.body
while.body: ; preds = %while.body.lr.ph, %while.body
%s1.addr.013 = phi i32* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
%N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ]
; Lane predicate for this iteration, computed from the remaining count.
%0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012)
%1 = bitcast i32* %s1.addr.013 to <4 x i32>*
%2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
; Operation under test, taking the out-of-loop splat as its second operand.
; NOTE(review): the `i32 0` operand presumably selects the signed variant
; (the expected instruction is vhsub.s32) - confirm against the intrinsic
; definition.
%3 = tail call <4 x i32> @llvm.arm.mve.hsub.predicated.v4i32.v4i1(<4 x i32> %2, <4 x i32> %.splat, i32 0, <4 x i1> %0, <4 x i32> %2)
tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %1, i32 4, <4 x i1> %0)
; Step the pointer by four elements and the count by -4; iterate again only
; if more than four elements remained at the start of this iteration.
%add.ptr = getelementptr inbounds i32, i32* %s1.addr.013, i32 4
%sub = add nsw i32 %N.addr.012, -4
%cmp = icmp sgt i32 %N.addr.012, 4
br i1 %cmp, label %while.body, label %while.end
while.end: ; preds = %while.body, %entry
ret void
}
; vqdmull: predicated loop around llvm.arm.mve.vqdmull.predicated. Each
; iteration loads four i16 values from %s1, sign-extends them to i32, views the
; result as <8 x i16>, and combines it with an <8 x i16> splat of %c0 truncated
; to i16. The <4 x i32> result is stored back through %s1.
; The splat is created once in while.body.lr.ph, outside the loop; the expected
; assembly below uses a widening load (`vldrh.s32`) followed by the
; vector-by-scalar-register form (`vqdmullb.s16 q0, q0, r1`) inside a
; dlstp.32/letp loop.
define void @vqdmull(i32* %s1, i32 %c0, i32 %N) {
; CHECK-LABEL: vqdmull:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cmp r2, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: poplt {r7, pc}
; CHECK-NEXT: .LBB7_1: @ %while.body.lr.ph
; CHECK-NEXT: dlstp.32 lr, r2
; CHECK-NEXT: .LBB7_2: @ %while.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrh.s32 q0, [r0]
; CHECK-NEXT: vqdmullb.s16 q0, q0, r1
; CHECK-NEXT: vstrw.32 q0, [r0], #16
; CHECK-NEXT: letp lr, .LBB7_2
; CHECK-NEXT: @ %bb.3: @ %while.end
; CHECK-NEXT: pop {r7, pc}
entry:
; Skip the loop entirely when %N is not positive.
%cmp11 = icmp sgt i32 %N, 0
br i1 %cmp11, label %while.body.lr.ph, label %while.end
while.body.lr.ph: ; preds = %entry
; Truncate the scalar to i16 and splat it across eight lanes, once, outside
; the loop.
%conv = trunc i32 %c0 to i16
%.splatinsert = insertelement <8 x i16> undef, i16 %conv, i32 0
%.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer
br label %while.body
while.body: ; preds = %while.body.lr.ph, %while.body
%s1.addr.013 = phi i32* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
%N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ]
; Lane predicate for this iteration, computed from the remaining count.
%0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012)
; Load four i16 values through the i32 pointer, then sign-extend to i32 and
; reinterpret the result as eight i16 lanes for the intrinsic.
%1 = bitcast i32* %s1.addr.013 to <4 x i16>*
%2 = tail call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %1, i32 2, <4 x i1> %0, <4 x i16> zeroinitializer)
%3 = sext <4 x i16> %2 to <4 x i32>
%4 = bitcast <4 x i32> %3 to <8 x i16>
; Operation under test, taking the out-of-loop splat as its second operand.
; NOTE(review): the `i32 0` operand presumably selects the bottom-half variant
; (the expected instruction is vqdmullb) - confirm against the intrinsic
; definition.
%5 = tail call <4 x i32> @llvm.arm.mve.vqdmull.predicated.v4i32.v8i16.v4i1(<8 x i16> %4, <8 x i16> %.splat, i32 0, <4 x i1> %0, <4 x i32> %3)
%6 = bitcast i32* %s1.addr.013 to <4 x i32>*
tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %5, <4 x i32>* %6, i32 4, <4 x i1> %0)
; Step the pointer by four i32 elements and the count by -4; iterate again
; only if more than four elements remained at the start of this iteration.
%add.ptr = getelementptr inbounds i32, i32* %s1.addr.013, i32 4
%sub = add nsw i32 %N.addr.012, -4
%cmp = icmp sgt i32 %N.addr.012, 4
br i1 %cmp, label %while.body, label %while.end
while.end: ; preds = %while.body, %entry
ret void
}
; vqdmulh: predicated loop that applies llvm.arm.mve.qdmulh.predicated to four
; i32 lanes of %s1 per iteration, with a splat of the scalar %c0 as second
; operand.
; The splat is created once in while.body.lr.ph, outside the loop; the expected
; assembly below uses the vector-by-scalar-register form
; (`vqdmulh.s32 q0, q0, r1`) inside a dlstp.32/letp loop.
define void @vqdmulh(i32* %s1, i32 %c0, i32 %N) {
; CHECK-LABEL: vqdmulh:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cmp r2, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: poplt {r7, pc}
; CHECK-NEXT: .LBB8_1: @ %while.body.lr.ph
; CHECK-NEXT: dlstp.32 lr, r2
; CHECK-NEXT: .LBB8_2: @ %while.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r0]
; CHECK-NEXT: vqdmulh.s32 q0, q0, r1
; CHECK-NEXT: vstrw.32 q0, [r0], #16
; CHECK-NEXT: letp lr, .LBB8_2
; CHECK-NEXT: @ %bb.3: @ %while.end
; CHECK-NEXT: pop {r7, pc}
entry:
; Skip the loop entirely when %N is not positive.
%cmp11 = icmp sgt i32 %N, 0
br i1 %cmp11, label %while.body.lr.ph, label %while.end
while.body.lr.ph: ; preds = %entry
; Build the splat of %c0 once, outside the loop.
%.splatinsert = insertelement <4 x i32> undef, i32 %c0, i32 0
%.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
br label %while.body
while.body: ; preds = %while.body.lr.ph, %while.body
%s1.addr.013 = phi i32* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
%N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ]
; Lane predicate for this iteration, computed from the remaining count.
%0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012)
%1 = bitcast i32* %s1.addr.013 to <4 x i32>*
%2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
; Operation under test, taking the out-of-loop splat as its second operand
; and using the same predicate as the masked load and store.
%3 = tail call <4 x i32> @llvm.arm.mve.qdmulh.predicated.v4i32.v4i1(<4 x i32> %2, <4 x i32> %.splat, <4 x i1> %0, <4 x i32> %2)
tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %1, i32 4, <4 x i1> %0)
; Step the pointer by four elements and the count by -4; iterate again only
; if more than four elements remained at the start of this iteration.
%add.ptr = getelementptr inbounds i32, i32* %s1.addr.013, i32 4
%sub = add nsw i32 %N.addr.012, -4
%cmp = icmp sgt i32 %N.addr.012, 4
br i1 %cmp, label %while.body, label %while.end
while.end: ; preds = %while.body, %entry
ret void
}
; vqrdmulh: predicated loop that applies llvm.arm.mve.qrdmulh.predicated to
; four i32 lanes of %s1 per iteration, with a splat of the scalar %c0 as second
; operand.
; The splat is created once in while.body.lr.ph, outside the loop; the expected
; assembly below uses the vector-by-scalar-register form
; (`vqrdmulh.s32 q0, q0, r1`) inside a dlstp.32/letp loop.
define void @vqrdmulh(i32* %s1, i32 %c0, i32 %N) {
; CHECK-LABEL: vqrdmulh:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cmp r2, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: poplt {r7, pc}
; CHECK-NEXT: .LBB9_1: @ %while.body.lr.ph
; CHECK-NEXT: dlstp.32 lr, r2
; CHECK-NEXT: .LBB9_2: @ %while.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r0]
; CHECK-NEXT: vqrdmulh.s32 q0, q0, r1
; CHECK-NEXT: vstrw.32 q0, [r0], #16
; CHECK-NEXT: letp lr, .LBB9_2
; CHECK-NEXT: @ %bb.3: @ %while.end
; CHECK-NEXT: pop {r7, pc}
entry:
; Skip the loop entirely when %N is not positive.
%cmp11 = icmp sgt i32 %N, 0
br i1 %cmp11, label %while.body.lr.ph, label %while.end
while.body.lr.ph: ; preds = %entry
; Build the splat of %c0 once, outside the loop.
%.splatinsert = insertelement <4 x i32> undef, i32 %c0, i32 0
%.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
br label %while.body
while.body: ; preds = %while.body.lr.ph, %while.body
%s1.addr.013 = phi i32* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
%N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ]
; Lane predicate for this iteration, computed from the remaining count.
%0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012)
%1 = bitcast i32* %s1.addr.013 to <4 x i32>*
%2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
; Operation under test, taking the out-of-loop splat as its second operand
; and using the same predicate as the masked load and store.
%3 = tail call <4 x i32> @llvm.arm.mve.qrdmulh.predicated.v4i32.v4i1(<4 x i32> %2, <4 x i32> %.splat, <4 x i1> %0, <4 x i32> %2)
tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %1, i32 4, <4 x i1> %0)
; Step the pointer by four elements and the count by -4; iterate again only
; if more than four elements remained at the start of this iteration.
%add.ptr = getelementptr inbounds i32, i32* %s1.addr.013, i32 4
%sub = add nsw i32 %N.addr.012, -4
%cmp = icmp sgt i32 %N.addr.012, 4
br i1 %cmp, label %while.body, label %while.end
while.end: ; preds = %while.body, %entry
ret void
}
; vaddf: floating-point counterpart of the integer add test. A predicated loop
; applies llvm.arm.mve.add.predicated (v4f32) to four float lanes of %s1 per
; iteration, with a splat of the float scalar %c0 as second operand.
; The splat is created once in while.body.lr.ph, outside the loop; the expected
; assembly below uses the vector-by-scalar-register form
; (`vadd.f32 q0, q0, r1`) inside a dlstp.32/letp loop.
define void @vaddf(float* %s1, float %c0, i32 %N) {
; CHECK-LABEL: vaddf:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cmp r2, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: poplt {r7, pc}
; CHECK-NEXT: .LBB10_1: @ %while.body.lr.ph
; CHECK-NEXT: dlstp.32 lr, r2
; CHECK-NEXT: .LBB10_2: @ %while.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r0]
; CHECK-NEXT: vadd.f32 q0, q0, r1
; CHECK-NEXT: vstrw.32 q0, [r0], #16
; CHECK-NEXT: letp lr, .LBB10_2
; CHECK-NEXT: @ %bb.3: @ %while.end
; CHECK-NEXT: pop {r7, pc}
entry:
; Skip the loop entirely when %N is not positive.
%cmp11 = icmp sgt i32 %N, 0
br i1 %cmp11, label %while.body.lr.ph, label %while.end
while.body.lr.ph: ; preds = %entry
; Build the splat of %c0 once, outside the loop.
%.splatinsert = insertelement <4 x float> undef, float %c0, i32 0
%.splat = shufflevector <4 x float> %.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer
br label %while.body
while.body: ; preds = %while.body.lr.ph, %while.body
%s1.addr.013 = phi float* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
%N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ]
; Lane predicate for this iteration, computed from the remaining count.
%0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012)
%1 = bitcast float* %s1.addr.013 to <4 x float>*
%2 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %1, i32 4, <4 x i1> %0, <4 x float> zeroinitializer)
; Operation under test: the loaded vector plus the out-of-loop splat, using
; the same predicate as the masked load and store.
%3 = tail call fast <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> %2, <4 x float> %.splat, <4 x i1> %0, <4 x float> %2)
tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %3, <4 x float>* %1, i32 4, <4 x i1> %0)
; Step the pointer by four elements and the count by -4; iterate again only
; if more than four elements remained at the start of this iteration.
%add.ptr = getelementptr inbounds float, float* %s1.addr.013, i32 4
%sub = add nsw i32 %N.addr.012, -4
%cmp = icmp sgt i32 %N.addr.012, 4
br i1 %cmp, label %while.body, label %while.end
while.end: ; preds = %while.body, %entry
ret void
}
; vsubf: floating-point counterpart of the integer sub test. A predicated loop
; applies llvm.arm.mve.sub.predicated (v4f32) to four float lanes of %s1 per
; iteration, with a splat of the float scalar %c0 as second operand.
; The splat is created once in while.body.lr.ph, outside the loop; the expected
; assembly below uses the vector-by-scalar-register form
; (`vsub.f32 q0, q0, r1`) inside a dlstp.32/letp loop.
define void @vsubf(float* %s1, float %c0, i32 %N) {
; CHECK-LABEL: vsubf:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cmp r2, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: poplt {r7, pc}
; CHECK-NEXT: .LBB11_1: @ %while.body.lr.ph
; CHECK-NEXT: dlstp.32 lr, r2
; CHECK-NEXT: .LBB11_2: @ %while.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r0]
; CHECK-NEXT: vsub.f32 q0, q0, r1
; CHECK-NEXT: vstrw.32 q0, [r0], #16
; CHECK-NEXT: letp lr, .LBB11_2
; CHECK-NEXT: @ %bb.3: @ %while.end
; CHECK-NEXT: pop {r7, pc}
entry:
; Skip the loop entirely when %N is not positive.
%cmp11 = icmp sgt i32 %N, 0
br i1 %cmp11, label %while.body.lr.ph, label %while.end
while.body.lr.ph: ; preds = %entry
; Build the splat of %c0 once, outside the loop.
%.splatinsert = insertelement <4 x float> undef, float %c0, i32 0
%.splat = shufflevector <4 x float> %.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer
br label %while.body
while.body: ; preds = %while.body.lr.ph, %while.body
%s1.addr.013 = phi float* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
%N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ]
; Lane predicate for this iteration, computed from the remaining count.
%0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012)
%1 = bitcast float* %s1.addr.013 to <4 x float>*
%2 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %1, i32 4, <4 x i1> %0, <4 x float> zeroinitializer)
; Operation under test: the loaded vector minus the out-of-loop splat, using
; the same predicate as the masked load and store.
%3 = tail call fast <4 x float> @llvm.arm.mve.sub.predicated.v4f32.v4i1(<4 x float> %2, <4 x float> %.splat, <4 x i1> %0, <4 x float> %2)
tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %3, <4 x float>* %1, i32 4, <4 x i1> %0)
; Step the pointer by four elements and the count by -4; iterate again only
; if more than four elements remained at the start of this iteration.
%add.ptr = getelementptr inbounds float, float* %s1.addr.013, i32 4
%sub = add nsw i32 %N.addr.012, -4
%cmp = icmp sgt i32 %N.addr.012, 4
br i1 %cmp, label %while.body, label %while.end
while.end: ; preds = %while.body, %entry
ret void
}
; vmulf: floating-point counterpart of the integer mul test. A predicated loop
; applies llvm.arm.mve.mul.predicated (v4f32) to four float lanes of %s1 per
; iteration, with a splat of the float scalar %c0 as second operand.
; The splat is created once in while.body.lr.ph, outside the loop; the expected
; assembly below uses the vector-by-scalar-register form
; (`vmul.f32 q0, q0, r1`) inside a dlstp.32/letp loop.
define void @vmulf(float* %s1, float %c0, i32 %N) {
; CHECK-LABEL: vmulf:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cmp r2, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: poplt {r7, pc}
; CHECK-NEXT: .LBB12_1: @ %while.body.lr.ph
; CHECK-NEXT: dlstp.32 lr, r2
; CHECK-NEXT: .LBB12_2: @ %while.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r0]
; CHECK-NEXT: vmul.f32 q0, q0, r1
; CHECK-NEXT: vstrw.32 q0, [r0], #16
; CHECK-NEXT: letp lr, .LBB12_2
; CHECK-NEXT: @ %bb.3: @ %while.end
; CHECK-NEXT: pop {r7, pc}
entry:
; Skip the loop entirely when %N is not positive.
%cmp11 = icmp sgt i32 %N, 0
br i1 %cmp11, label %while.body.lr.ph, label %while.end
while.body.lr.ph: ; preds = %entry
; Build the splat of %c0 once, outside the loop.
%.splatinsert = insertelement <4 x float> undef, float %c0, i32 0
%.splat = shufflevector <4 x float> %.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer
br label %while.body
while.body: ; preds = %while.body.lr.ph, %while.body
%s1.addr.013 = phi float* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
%N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ]
; Lane predicate for this iteration, computed from the remaining count.
%0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012)
%1 = bitcast float* %s1.addr.013 to <4 x float>*
%2 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %1, i32 4, <4 x i1> %0, <4 x float> zeroinitializer)
; Operation under test: the loaded vector times the out-of-loop splat, using
; the same predicate as the masked load and store.
%3 = tail call fast <4 x float> @llvm.arm.mve.mul.predicated.v4f32.v4i1(<4 x float> %2, <4 x float> %.splat, <4 x i1> %0, <4 x float> %2)
tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %3, <4 x float>* %1, i32 4, <4 x i1> %0)
; Step the pointer by four elements and the count by -4; iterate again only
; if more than four elements remained at the start of this iteration.
%add.ptr = getelementptr inbounds float, float* %s1.addr.013, i32 4
%sub = add nsw i32 %N.addr.012, -4
%cmp = icmp sgt i32 %N.addr.012, 4
br i1 %cmp, label %while.body, label %while.end
while.end: ; preds = %while.body, %entry
ret void
}
; vfma: predicated loop around llvm.arm.mve.fma.predicated where the splat of
; the float scalar %c0 is the SECOND operand. The first operand (%4) is loaded
; from %s2 and the third (%3) from %s1; the result is stored back through %s1.
; The splat is created once in while.body.lr.ph, outside the loop; the expected
; assembly below uses the vector-by-scalar-register form
; (`vfma.f32 q1, q0, r2`) inside a dlstp.32/letp loop.
; Only %s1 is advanced: the %s2 vector pointer (%0) is computed once outside
; the loop, so the same %s2 address is read on every iteration.
define void @vfma(float* %s1, float* %s2, float %c0, i32 %N) {
; CHECK-LABEL: vfma:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cmp r3, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: poplt {r7, pc}
; CHECK-NEXT: .LBB13_1: @ %while.body.lr.ph
; CHECK-NEXT: dlstp.32 lr, r3
; CHECK-NEXT: .LBB13_2: @ %while.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r1]
; CHECK-NEXT: vldrw.u32 q1, [r0]
; CHECK-NEXT: vfma.f32 q1, q0, r2
; CHECK-NEXT: vstrw.32 q1, [r0], #16
; CHECK-NEXT: letp lr, .LBB13_2
; CHECK-NEXT: @ %bb.3: @ %while.end
; CHECK-NEXT: pop {r7, pc}
entry:
; Skip the loop entirely when %N is not positive.
%cmp12 = icmp sgt i32 %N, 0
br i1 %cmp12, label %while.body.lr.ph, label %while.end
while.body.lr.ph: ; preds = %entry
; Loop-invariant setup: the vector view of %s2 and the splat of %c0.
%0 = bitcast float* %s2 to <4 x float>*
%.splatinsert = insertelement <4 x float> undef, float %c0, i32 0
%.splat = shufflevector <4 x float> %.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer
br label %while.body
while.body: ; preds = %while.body.lr.ph, %while.body
%s1.addr.014 = phi float* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
%N.addr.013 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ]
; Lane predicate for this iteration, computed from the remaining count.
%1 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.013)
%2 = bitcast float* %s1.addr.014 to <4 x float>*
; %3 comes from the advancing %s1 pointer, %4 from the fixed %s2 pointer.
%3 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> zeroinitializer)
%4 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %1, <4 x float> zeroinitializer)
; Operation under test: the out-of-loop splat is the second operand.
%5 = tail call fast <4 x float> @llvm.arm.mve.fma.predicated.v4f32.v4i1(<4 x float> %4, <4 x float> %.splat, <4 x float> %3, <4 x i1> %1)
tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %5, <4 x float>* %2, i32 4, <4 x i1> %1)
; Step the %s1 pointer by four elements and the count by -4; iterate again
; only if more than four elements remained at the start of this iteration.
%add.ptr = getelementptr inbounds float, float* %s1.addr.014, i32 4
%sub = add nsw i32 %N.addr.013, -4
%cmp = icmp sgt i32 %N.addr.013, 4
br i1 %cmp, label %while.body, label %while.end
while.end: ; preds = %while.body, %entry
ret void
}
; vfmas: predicated loop around llvm.arm.mve.fma.predicated where the splat of
; the float scalar %c0 is the THIRD operand. The first operand (%3) is loaded
; from %s1 and the second (%4) from %s2; the result is stored back through %s1.
; The splat is created once in while.body.lr.ph, outside the loop; the expected
; assembly below uses the vector-by-scalar-register form
; (`vfmas.f32 q1, q0, r2`) inside a dlstp.32/letp loop.
; Only %s1 is advanced: the %s2 vector pointer (%0) is computed once outside
; the loop, so the same %s2 address is read on every iteration.
define void @vfmas(float* %s1, float* %s2, float %c0, i32 %N) {
; CHECK-LABEL: vfmas:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cmp r3, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: poplt {r7, pc}
; CHECK-NEXT: .LBB14_1: @ %while.body.lr.ph
; CHECK-NEXT: dlstp.32 lr, r3
; CHECK-NEXT: .LBB14_2: @ %while.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r1]
; CHECK-NEXT: vldrw.u32 q1, [r0]
; CHECK-NEXT: vfmas.f32 q1, q0, r2
; CHECK-NEXT: vstrw.32 q1, [r0], #16
; CHECK-NEXT: letp lr, .LBB14_2
; CHECK-NEXT: @ %bb.3: @ %while.end
; CHECK-NEXT: pop {r7, pc}
entry:
; Skip the loop entirely when %N is not positive.
%cmp12 = icmp sgt i32 %N, 0
br i1 %cmp12, label %while.body.lr.ph, label %while.end
while.body.lr.ph: ; preds = %entry
; Loop-invariant setup: the vector view of %s2 and the splat of %c0.
%0 = bitcast float* %s2 to <4 x float>*
%.splatinsert = insertelement <4 x float> undef, float %c0, i32 0
%.splat = shufflevector <4 x float> %.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer
br label %while.body
while.body: ; preds = %while.body.lr.ph, %while.body
%s1.addr.014 = phi float* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
%N.addr.013 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ]
; Lane predicate for this iteration, computed from the remaining count.
%1 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.013)
%2 = bitcast float* %s1.addr.014 to <4 x float>*
; %3 comes from the advancing %s1 pointer, %4 from the fixed %s2 pointer.
%3 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> zeroinitializer)
%4 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %1, <4 x float> zeroinitializer)
; Operation under test: the out-of-loop splat is the third operand.
%5 = tail call fast <4 x float> @llvm.arm.mve.fma.predicated.v4f32.v4i1(<4 x float> %3, <4 x float> %4, <4 x float> %.splat, <4 x i1> %1)
tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %5, <4 x float>* %2, i32 4, <4 x i1> %1)
; Step the %s1 pointer by four elements and the count by -4; iterate again
; only if more than four elements remained at the start of this iteration.
%add.ptr = getelementptr inbounds float, float* %s1.addr.014, i32 4
%sub = add nsw i32 %N.addr.013, -4
%cmp = icmp sgt i32 %N.addr.013, 4
br i1 %cmp, label %while.body, label %while.end
while.end: ; preds = %while.body, %entry
ret void
}
; Declarations of the intrinsics called by the test functions in this file.
; They are listed in order of first use: the predicate generator, the masked
; load/store helpers, and the predicated MVE arithmetic intrinsics under test.

; Predicate generation and i32 masked memory access, plus the i32 arithmetic
; intrinsics that take (lhs, rhs, predicate, extra vector) operands.
declare <4 x i1> @llvm.arm.mve.vctp32(i32)
declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
declare <4 x i32> @llvm.arm.mve.add.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>)
declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>)
declare <4 x i32> @llvm.arm.mve.sub.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>)
declare <4 x i32> @llvm.arm.mve.mul.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>)
; These variants carry an additional i32 operand between the vector inputs and
; the predicate; every call in this file passes 0 for it.
declare <4 x i32> @llvm.arm.mve.qadd.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <4 x i32>)
declare <4 x i32> @llvm.arm.mve.qsub.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <4 x i32>)
declare <4 x i32> @llvm.arm.mve.hadd.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <4 x i32>)
declare <4 x i32> @llvm.arm.mve.hsub.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <4 x i32>)
; i16 masked load and the <8 x i16> -> <4 x i32> intrinsic used by @vqdmull.
declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>)
declare <4 x i32> @llvm.arm.mve.vqdmull.predicated.v4i32.v8i16.v4i1(<8 x i16>, <8 x i16>, i32, <4 x i1>, <4 x i32>)
declare <4 x i32> @llvm.arm.mve.qdmulh.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>)
declare <4 x i32> @llvm.arm.mve.qrdmulh.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>)
; Floating-point masked memory access and arithmetic intrinsics. Note that the
; fma intrinsic takes three vector operands followed by the predicate, with no
; trailing vector operand.
declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32 immarg, <4 x i1>, <4 x float>)
declare <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>, <4 x float>)
declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32 immarg, <4 x i1>)
declare <4 x float> @llvm.arm.mve.sub.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>, <4 x float>)
declare <4 x float> @llvm.arm.mve.mul.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>, <4 x float>)
declare <4 x float> @llvm.arm.mve.fma.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x float>, <4 x i1>)