1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2025-02-01 13:11:39 +01:00
llvm-mirror/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll
David Green b9acbecea9 [ARM][HWLoops] Create hardware loops for sibling loops
Given a loop with two subloops, it should be possible for both to be
converted to hardware loops. That's what this patch does, simply enough.
It slightly alters the loop iterating order to try and convert all
subloops. If one (or more) succeeds, it stops as before.

Differential Revision: https://reviews.llvm.org/D78502
2020-07-03 17:20:02 +01:00

1098 lines
50 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -enable-arm-maskedgatscat %s -o 2>/dev/null - | FileCheck %s
; Module-level metadata nodes.
; !0 and !1 are (i32, name, i32) tuples - presumably members of
; !llvm.module.flags, which is not visible in this part of the file; confirm.
!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{i32 1, !"min_enum_size", i32 4}
; Producer string - presumably the operand of !llvm.ident; confirm.
!2 = !{!"clang version 11.0.0 (git@github.com:llvm/llvm-project.git 26f04d01a39a33d73fd23165c208b215bf5c350d)"}
; TBAA nodes: !3 is the tag attached through !tbaa to the memory accesses of
; the arm_mat_mult_* functions. It refers to the "int" node (!4), which hangs
; off "omnipotent char" (!5) under the "Simple C/C++ TBAA" root (!6).
!3 = !{!4, !4, i64 0}
!4 = !{!"int", !5, i64 0}
!5 = !{!"omnipotent char", !6, i64 0}
!6 = !{!"Simple C/C++ TBAA"}
; Loop IDs: !7 is attached through !llvm.loop to the vector.body back-edges of
; the arm_mat_mult_* functions and carries the "isvectorized" property (!8).
!7 = distinct !{!7, !8}
!8 = !{!"llvm.loop.isvectorized", i32 1}
; !9 adds "unroll.runtime.disable" (!10) on top of !8; no function in the
; portion of the file reviewed here references it.
!9 = distinct !{!9, !10, !8}
!10 = !{!"llvm.loop.unroll.runtime.disable"}
; Gather whose offsets are a constant multiple of the induction vector: every
; iteration loads four i32 values from %data at element offsets %vec.ind * 3
; and stores them contiguously to %dst + %index.
; The expected assembly has no multiply left in the loop. The gather works on
; a vector of addresses with pre-increment writeback of #96 bytes
; (%vec.ind step 8 * multiplier 3 * 4-byte elements). The constant pool holds
; the initial byte offsets <0, 24, 48, 72> pre-decremented by 96, and %data
; (r0) is added to it once before the loop.
define arm_aapcs_vfpcc void @push_out_mul_gather(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, i32 %n.vec) {
; CHECK-LABEL: push_out_mul_gather:
; CHECK: @ %bb.0: @ %vector.ph
; CHECK-NEXT: adr r3, .LCPI0_0
; CHECK-NEXT: vldrw.u32 q0, [r3]
; CHECK-NEXT: vadd.i32 q0, q0, r0
; CHECK-NEXT: .LBB0_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q1, [q0, #96]!
; CHECK-NEXT: subs r2, #4
; CHECK-NEXT: vstrb.8 q1, [r1], #16
; CHECK-NEXT: bne .LBB0_1
; CHECK-NEXT: @ %bb.2: @ %end
; CHECK-NEXT: bx lr
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.3:
; CHECK-NEXT: .LCPI0_0:
; CHECK-NEXT: .long 4294967200 @ 0xffffffa0
; CHECK-NEXT: .long 4294967224 @ 0xffffffb8
; CHECK-NEXT: .long 4294967248 @ 0xffffffd0
; CHECK-NEXT: .long 4294967272 @ 0xffffffe8
; Entry block (nothing branches here despite the 'preds' note); %ind.end has
; no uses.
vector.ph: ; preds = %for.body.preheader
%ind.end = shl i32 %n.vec, 1
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.ind = phi <4 x i32> [ <i32 0, i32 2, i32 4, i32 6>, %vector.ph ], [ %vec.ind.next, %vector.body ]
; Offset computation that is expected to be hoisted out of the loop.
%0 = mul <4 x i32> %vec.ind, <i32 3, i32 3, i32 3, i32 3>
%1 = getelementptr inbounds i32, i32* %data, <4 x i32> %0
%wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %1, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
%2 = getelementptr inbounds i32, i32* %dst, i32 %index
%3 = bitcast i32* %2 to <4 x i32>*
store <4 x i32> %wide.masked.gather, <4 x i32>* %3, align 4
%index.next = add i32 %index, 4
%vec.ind.next = add <4 x i32> %vec.ind, <i32 8, i32 8, i32 8, i32 8>
%4 = icmp eq i32 %index.next, %n.vec
br i1 %4, label %end, label %vector.body
end:
ret void;
}
; Gather whose offsets are the induction vector plus a constant: every
; iteration loads four i32 values from %data at element offsets %vec.ind + 6
; and stores them contiguously to %dst + %index.
; The expected assembly has no add left in the loop. The gather uses
; pre-increment writeback of #32 bytes (%vec.ind step 8 * 4-byte elements),
; and the constant pool holds the initial byte offsets <24, 32, 40, 48>
; pre-decremented by 32.
define arm_aapcs_vfpcc void @push_out_add_gather(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, i32 %n.vec) {
; CHECK-LABEL: push_out_add_gather:
; CHECK: @ %bb.0: @ %vector.ph
; CHECK-NEXT: adr r3, .LCPI1_0
; CHECK-NEXT: vldrw.u32 q0, [r3]
; CHECK-NEXT: vadd.i32 q0, q0, r0
; CHECK-NEXT: .LBB1_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q1, [q0, #32]!
; CHECK-NEXT: subs r2, #4
; CHECK-NEXT: vstrb.8 q1, [r1], #16
; CHECK-NEXT: bne .LBB1_1
; CHECK-NEXT: @ %bb.2: @ %end
; CHECK-NEXT: bx lr
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.3:
; CHECK-NEXT: .LCPI1_0:
; CHECK-NEXT: .long 4294967288 @ 0xfffffff8
; CHECK-NEXT: .long 0 @ 0x0
; CHECK-NEXT: .long 8 @ 0x8
; CHECK-NEXT: .long 16 @ 0x10
; Entry block (nothing branches here despite the 'preds' note); %ind.end has
; no uses.
vector.ph: ; preds = %for.body.preheader
%ind.end = shl i32 %n.vec, 1
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.ind = phi <4 x i32> [ <i32 0, i32 2, i32 4, i32 6>, %vector.ph ], [ %vec.ind.next, %vector.body ]
; Offset computation that is expected to be hoisted out of the loop.
%0 = add <4 x i32> %vec.ind, <i32 6, i32 6, i32 6, i32 6>
%1 = getelementptr inbounds i32, i32* %data, <4 x i32> %0
%wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %1, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
%2 = getelementptr inbounds i32, i32* %dst, i32 %index
%3 = bitcast i32* %2 to <4 x i32>*
store <4 x i32> %wide.masked.gather, <4 x i32>* %3, align 4
%index.next = add i32 %index, 4
%vec.ind.next = add <4 x i32> %vec.ind, <i32 8, i32 8, i32 8, i32 8>
%4 = icmp eq i32 %index.next, %n.vec
br i1 %4, label %end, label %vector.body
end:
ret void;
}
; Gather whose offsets go through a multiply and then an add: every iteration
; loads four i32 values from %data at element offsets %vec.ind * 3 + 6 and
; stores them contiguously to %dst + %index.
; The expected assembly has neither operation left in the loop. The gather
; uses pre-increment writeback of #96 bytes (%vec.ind step 8 * multiplier 3 *
; 4-byte elements), and the constant pool holds the initial byte offsets
; <24, 48, 72, 96> pre-decremented by 96.
define arm_aapcs_vfpcc void @push_out_mul_add_gather(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, i32 %n.vec) {
; CHECK-LABEL: push_out_mul_add_gather:
; CHECK: @ %bb.0: @ %vector.ph
; CHECK-NEXT: adr r3, .LCPI2_0
; CHECK-NEXT: vldrw.u32 q0, [r3]
; CHECK-NEXT: vadd.i32 q0, q0, r0
; CHECK-NEXT: .LBB2_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q1, [q0, #96]!
; CHECK-NEXT: subs r2, #4
; CHECK-NEXT: vstrb.8 q1, [r1], #16
; CHECK-NEXT: bne .LBB2_1
; CHECK-NEXT: @ %bb.2: @ %end
; CHECK-NEXT: bx lr
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.3:
; CHECK-NEXT: .LCPI2_0:
; CHECK-NEXT: .long 4294967224 @ 0xffffffb8
; CHECK-NEXT: .long 4294967248 @ 0xffffffd0
; CHECK-NEXT: .long 4294967272 @ 0xffffffe8
; CHECK-NEXT: .long 0 @ 0x0
; Entry block (nothing branches here despite the 'preds' note); %ind.end has
; no uses.
vector.ph: ; preds = %for.body.preheader
%ind.end = shl i32 %n.vec, 1
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.ind = phi <4 x i32> [ <i32 0, i32 2, i32 4, i32 6>, %vector.ph ], [ %vec.ind.next, %vector.body ]
; Two-step offset computation that is expected to be hoisted out of the loop.
%0 = mul <4 x i32> %vec.ind, <i32 3, i32 3, i32 3, i32 3>
%1 = add <4 x i32> %0, <i32 6, i32 6, i32 6, i32 6>
%2 = getelementptr inbounds i32, i32* %data, <4 x i32> %1
%wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %2, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
%3 = getelementptr inbounds i32, i32* %dst, i32 %index
%4 = bitcast i32* %3 to <4 x i32>*
store <4 x i32> %wide.masked.gather, <4 x i32>* %4, align 4
%index.next = add i32 %index, 4
%vec.ind.next = add <4 x i32> %vec.ind, <i32 8, i32 8, i32 8, i32 8>
%5 = icmp eq i32 %index.next, %n.vec
br i1 %5, label %end, label %vector.body
end:
ret void;
}
; Scatter counterpart of the multiply case: every iteration stores %to.store
; to %data at element offsets %vec.ind * 3. %dst is not used by the body, and
; %index only drives the loop exit test.
; The expected assembly has no multiply left in the loop. The scatter works on
; a vector of addresses with pre-increment writeback of #96 bytes
; (%vec.ind step 8 * multiplier 3 * 4-byte elements), and the constant pool
; holds the initial byte offsets <0, 24, 48, 72> pre-decremented by 96.
; NOTE(review): %data carries the 'readonly' attribute although the scatter
; stores through pointers derived from it - confirm whether that is intended.
define arm_aapcs_vfpcc void @push_out_mul_scatter(i32* noalias nocapture readonly %data,
; CHECK-LABEL: push_out_mul_scatter:
; CHECK: @ %bb.0: @ %vector.ph
; CHECK-NEXT: adr r1, .LCPI3_0
; CHECK-NEXT: vldrw.u32 q1, [r1]
; CHECK-NEXT: vadd.i32 q1, q1, r0
; CHECK-NEXT: .LBB3_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: subs r2, #4
; CHECK-NEXT: vstrw.32 q0, [q1, #96]!
; CHECK-NEXT: bne .LBB3_1
; CHECK-NEXT: @ %bb.2: @ %end
; CHECK-NEXT: bx lr
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.3:
; CHECK-NEXT: .LCPI3_0:
; CHECK-NEXT: .long 4294967200 @ 0xffffffa0
; CHECK-NEXT: .long 4294967224 @ 0xffffffb8
; CHECK-NEXT: .long 4294967248 @ 0xffffffd0
; CHECK-NEXT: .long 4294967272 @ 0xffffffe8
i32* noalias nocapture %dst, i32 %n.vec,
<4 x i32> %to.store) {
; Entry block (nothing branches here despite the 'preds' note); %ind.end has
; no uses.
vector.ph: ; preds = %for.body.preheader
%ind.end = shl i32 %n.vec, 1
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.ind = phi <4 x i32> [ <i32 0, i32 2, i32 4, i32 6>, %vector.ph ], [ %vec.ind.next, %vector.body ]
; Offset computation that is expected to be hoisted out of the loop.
%0 = mul <4 x i32> %vec.ind, <i32 3, i32 3, i32 3, i32 3>
%1 = getelementptr inbounds i32, i32* %data, <4 x i32> %0
call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %to.store, <4 x i32*> %1, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
%index.next = add i32 %index, 4
%vec.ind.next = add <4 x i32> %vec.ind, <i32 8, i32 8, i32 8, i32 8>
%2 = icmp eq i32 %index.next, %n.vec
br i1 %2, label %end, label %vector.body
end:
ret void;
}
; Scatter counterpart of the add case: every iteration stores %to.store to
; %data at element offsets %vec.ind + 6. %dst is not used by the body, and
; %index only drives the loop exit test.
; The expected assembly has no add left in the loop. The scatter uses
; pre-increment writeback of #32 bytes (%vec.ind step 8 * 4-byte elements),
; and the constant pool holds the initial byte offsets <24, 32, 40, 48>
; pre-decremented by 32.
; NOTE(review): %data carries the 'readonly' attribute although the scatter
; stores through pointers derived from it - confirm whether that is intended.
define arm_aapcs_vfpcc void @push_out_add_scatter(i32* noalias nocapture readonly %data,
; CHECK-LABEL: push_out_add_scatter:
; CHECK: @ %bb.0: @ %vector.ph
; CHECK-NEXT: adr r1, .LCPI4_0
; CHECK-NEXT: vldrw.u32 q1, [r1]
; CHECK-NEXT: vadd.i32 q1, q1, r0
; CHECK-NEXT: .LBB4_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: subs r2, #4
; CHECK-NEXT: vstrw.32 q0, [q1, #32]!
; CHECK-NEXT: bne .LBB4_1
; CHECK-NEXT: @ %bb.2: @ %end
; CHECK-NEXT: bx lr
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.3:
; CHECK-NEXT: .LCPI4_0:
; CHECK-NEXT: .long 4294967288 @ 0xfffffff8
; CHECK-NEXT: .long 0 @ 0x0
; CHECK-NEXT: .long 8 @ 0x8
; CHECK-NEXT: .long 16 @ 0x10
i32* noalias nocapture %dst, i32 %n.vec,
<4 x i32> %to.store) {
; Entry block (nothing branches here despite the 'preds' note); %ind.end has
; no uses.
vector.ph: ; preds = %for.body.preheader
%ind.end = shl i32 %n.vec, 1
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.ind = phi <4 x i32> [ <i32 0, i32 2, i32 4, i32 6>, %vector.ph ], [ %vec.ind.next, %vector.body ]
; Offset computation that is expected to be hoisted out of the loop.
%0 = add <4 x i32> %vec.ind, <i32 6, i32 6, i32 6, i32 6>
%1 = getelementptr inbounds i32, i32* %data, <4 x i32> %0
call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %to.store, <4 x i32*> %1, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
%index.next = add i32 %index, 4
%vec.ind.next = add <4 x i32> %vec.ind, <i32 8, i32 8, i32 8, i32 8>
%2 = icmp eq i32 %index.next, %n.vec
br i1 %2, label %end, label %vector.body
end:
ret void;
}
; One address vector shared by a gather and a scatter: every iteration loads
; four i32 values from %data at element offsets %vec.ind * 3 and scatters them
; straight back to the same addresses. %dst is not used by the body.
; The expected assembly still has no multiply in the loop, but it does not use
; the writeback form either. The offset vector starts at <0, 6, 12, 18>
; (%vec.ind * 3), is advanced by a splat of #0x18 (24 = step 8 * 3) each
; iteration, and both accesses use base + offset addressing
; ([r0, q1, uxtw #2]).
; NOTE(review): presumably writeback is skipped because the address vector has
; two memory users - confirm against the optimisation pass.
; NOTE(review): %data carries the 'readonly' attribute although the scatter
; stores through pointers derived from it - confirm whether that is intended.
define arm_aapcs_vfpcc void @push_out_mul_gather_scatter(i32* noalias nocapture readonly %data,
; CHECK-LABEL: push_out_mul_gather_scatter:
; CHECK: @ %bb.0: @ %vector.ph
; CHECK-NEXT: adr r1, .LCPI5_0
; CHECK-NEXT: vmov.i32 q0, #0x18
; CHECK-NEXT: vldrw.u32 q1, [r1]
; CHECK-NEXT: .LBB5_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q2, [r0, q1, uxtw #2]
; CHECK-NEXT: vadd.i32 q3, q1, q0
; CHECK-NEXT: subs r2, #4
; CHECK-NEXT: vstrw.32 q2, [r0, q1, uxtw #2]
; CHECK-NEXT: vmov q1, q3
; CHECK-NEXT: bne .LBB5_1
; CHECK-NEXT: @ %bb.2: @ %end
; CHECK-NEXT: bx lr
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.3:
; CHECK-NEXT: .LCPI5_0:
; CHECK-NEXT: .long 0 @ 0x0
; CHECK-NEXT: .long 6 @ 0x6
; CHECK-NEXT: .long 12 @ 0xc
; CHECK-NEXT: .long 18 @ 0x12
i32* noalias nocapture %dst, i32 %n.vec) {
; Entry block (nothing branches here despite the 'preds' note); %ind.end has
; no uses.
vector.ph: ; preds = %for.body.preheader
%ind.end = shl i32 %n.vec, 1
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.ind = phi <4 x i32> [ <i32 0, i32 2, i32 4, i32 6>, %vector.ph ], [ %vec.ind.next, %vector.body ]
%0 = mul <4 x i32> %vec.ind, <i32 3, i32 3, i32 3, i32 3>
; %1 feeds both the gather and the scatter below.
%1 = getelementptr inbounds i32, i32* %data, <4 x i32> %0
%wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %1, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %wide.masked.gather, <4 x i32*> %1, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
%index.next = add i32 %index, 4
%vec.ind.next = add <4 x i32> %vec.ind, <i32 8, i32 8, i32 8, i32 8>
%2 = icmp eq i32 %index.next, %n.vec
br i1 %2, label %end, label %vector.body
end:
ret void;
}
; Same computation as a gather at element offsets %vec.ind + 6 stored
; contiguously to %dst + %index, but with the loop body split over three
; blocks: the phis live in vector.body, the offset add and the gather live in
; lower.block, and the exit test lives in vector.body.end.
; The expected assembly is a single loop block that uses pre-increment
; writeback of #32 bytes (%vec.ind step 8 * 4-byte elements), with the
; constant pool holding the initial byte offsets <24, 32, 40, 48>
; pre-decremented by 32.
define arm_aapcs_vfpcc void @push_out_add_sub_block(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, i32 %n.vec) {
; CHECK-LABEL: push_out_add_sub_block:
; CHECK: @ %bb.0: @ %vector.ph
; CHECK-NEXT: adr r3, .LCPI6_0
; CHECK-NEXT: vldrw.u32 q0, [r3]
; CHECK-NEXT: vadd.i32 q0, q0, r0
; CHECK-NEXT: .LBB6_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q1, [q0, #32]!
; CHECK-NEXT: subs r2, #4
; CHECK-NEXT: vstrb.8 q1, [r1], #16
; CHECK-NEXT: bne .LBB6_1
; CHECK-NEXT: @ %bb.2: @ %end
; CHECK-NEXT: bx lr
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.3:
; CHECK-NEXT: .LCPI6_0:
; CHECK-NEXT: .long 4294967288 @ 0xfffffff8
; CHECK-NEXT: .long 0 @ 0x0
; CHECK-NEXT: .long 8 @ 0x8
; CHECK-NEXT: .long 16 @ 0x10
; Entry block (nothing branches here despite the 'preds' note); %ind.end has
; no uses.
vector.ph: ; preds = %for.body.preheader
%ind.end = shl i32 %n.vec, 1
br label %vector.body
; Loop header: the back-edge comes from vector.body.end, as the phi operands
; show, not from vector.body itself as the 'preds' note says.
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body.end ]
%vec.ind = phi <4 x i32> [ <i32 0, i32 2, i32 4, i32 6>, %vector.ph ], [ %vec.ind.next, %vector.body.end ]
br label %lower.block;
; The offset add sits in a different block from the induction phi.
lower.block: ; preds = %vector.body
%0 = add <4 x i32> %vec.ind, <i32 6, i32 6, i32 6, i32 6>
%1 = getelementptr inbounds i32, i32* %data, <4 x i32> %0
%wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %1, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
%2 = getelementptr inbounds i32, i32* %dst, i32 %index
%3 = bitcast i32* %2 to <4 x i32>*
store <4 x i32> %wide.masked.gather, <4 x i32>* %3, align 4
%index.next = add i32 %index, 4
%vec.ind.next = add <4 x i32> %vec.ind, <i32 8, i32 8, i32 8, i32 8>
br label %vector.body.end
vector.body.end: ; preds = %lower.block
%4 = icmp eq i32 %index.next, %n.vec
br i1 %4, label %end, label %vector.body
end:
ret void;
}
; Case where the intermediate offset value has a user that is neither a
; gather nor a scatter. The gather reads %data at element offsets
; %vec.ind * 3 + 6, as in the plain mul + add case, but the multiply result %0
; also feeds %non_gatscat_use.
; The expected assembly keeps the address computation inside the loop. The raw
; induction vector <0, 2, 4, 6> is advanced by a splat of #0x8, the addresses
; are rebuilt every iteration with vmlas against a splat of #0xc (3 * 4 bytes)
; and r0, and the gather uses a fixed #24 (6 * 4 bytes) offset without
; writeback.
define arm_aapcs_vfpcc void @non_gatscat_use1(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, i32 %n.vec) {
; CHECK-LABEL: non_gatscat_use1:
; CHECK: @ %bb.0: @ %vector.ph
; CHECK-NEXT: .vsave {d8, d9}
; CHECK-NEXT: vpush {d8, d9}
; CHECK-NEXT: adr r3, .LCPI7_0
; CHECK-NEXT: vmov.i32 q0, #0x8
; CHECK-NEXT: vldrw.u32 q2, [r3]
; CHECK-NEXT: vmov.i32 q1, #0xc
; CHECK-NEXT: .LBB7_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vadd.i32 q3, q2, q0
; CHECK-NEXT: vmlas.u32 q2, q1, r0
; CHECK-NEXT: vldrw.u32 q4, [q2, #24]
; CHECK-NEXT: subs r2, #4
; CHECK-NEXT: vmov q2, q3
; CHECK-NEXT: vstrb.8 q4, [r1], #16
; CHECK-NEXT: bne .LBB7_1
; CHECK-NEXT: @ %bb.2: @ %end
; CHECK-NEXT: vpop {d8, d9}
; CHECK-NEXT: bx lr
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.3:
; CHECK-NEXT: .LCPI7_0:
; CHECK-NEXT: .long 0 @ 0x0
; CHECK-NEXT: .long 2 @ 0x2
; CHECK-NEXT: .long 4 @ 0x4
; CHECK-NEXT: .long 6 @ 0x6
; Entry block (nothing branches here despite the 'preds' note); %ind.end has
; no uses.
vector.ph: ; preds = %for.body.preheader
%ind.end = shl i32 %n.vec, 1
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.ind = phi <4 x i32> [ <i32 0, i32 2, i32 4, i32 6>, %vector.ph ], [ %vec.ind.next, %vector.body ]
%0 = mul <4 x i32> %vec.ind, <i32 3, i32 3, i32 3, i32 3>
%1 = add <4 x i32> %0, <i32 6, i32 6, i32 6, i32 6>
%2 = getelementptr inbounds i32, i32* %data, <4 x i32> %1
%wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %2, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
%3 = getelementptr inbounds i32, i32* %dst, i32 %index
%4 = bitcast i32* %3 to <4 x i32>*
store <4 x i32> %wide.masked.gather, <4 x i32>* %4, align 4
; Second user of the multiply result %0; the value defined here has no users
; of its own.
%non_gatscat_use = mul <4 x i32> %0, <i32 3, i32 3, i32 3, i32 3>
%index.next = add i32 %index, 4
%vec.ind.next = add <4 x i32> %vec.ind, <i32 8, i32 8, i32 8, i32 8>
%5 = icmp eq i32 %index.next, %n.vec
br i1 %5, label %end, label %vector.body
end:
ret void;
}
; Variant of the extra-user case where the additional non-gather user hangs
; off the add result %1 (the final offsets %vec.ind * 3 + 6) rather than off
; the multiply result.
; The expected assembly matches the other extra-user case. The raw induction
; vector <0, 2, 4, 6> is advanced by a splat of #0x8, the addresses are
; rebuilt every iteration with vmlas against a splat of #0xc (3 * 4 bytes) and
; r0, and the gather uses a fixed #24 (6 * 4 bytes) offset without writeback.
define arm_aapcs_vfpcc void @non_gatscat_use2(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, i32 %n.vec) {
; CHECK-LABEL: non_gatscat_use2:
; CHECK: @ %bb.0: @ %vector.ph
; CHECK-NEXT: .vsave {d8, d9}
; CHECK-NEXT: vpush {d8, d9}
; CHECK-NEXT: adr r3, .LCPI8_0
; CHECK-NEXT: vmov.i32 q0, #0x8
; CHECK-NEXT: vldrw.u32 q2, [r3]
; CHECK-NEXT: vmov.i32 q1, #0xc
; CHECK-NEXT: .LBB8_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vadd.i32 q3, q2, q0
; CHECK-NEXT: vmlas.u32 q2, q1, r0
; CHECK-NEXT: vldrw.u32 q4, [q2, #24]
; CHECK-NEXT: subs r2, #4
; CHECK-NEXT: vmov q2, q3
; CHECK-NEXT: vstrb.8 q4, [r1], #16
; CHECK-NEXT: bne .LBB8_1
; CHECK-NEXT: @ %bb.2: @ %end
; CHECK-NEXT: vpop {d8, d9}
; CHECK-NEXT: bx lr
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.3:
; CHECK-NEXT: .LCPI8_0:
; CHECK-NEXT: .long 0 @ 0x0
; CHECK-NEXT: .long 2 @ 0x2
; CHECK-NEXT: .long 4 @ 0x4
; CHECK-NEXT: .long 6 @ 0x6
; Entry block (nothing branches here despite the 'preds' note); %ind.end has
; no uses.
vector.ph: ; preds = %for.body.preheader
%ind.end = shl i32 %n.vec, 1
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.ind = phi <4 x i32> [ <i32 0, i32 2, i32 4, i32 6>, %vector.ph ], [ %vec.ind.next, %vector.body ]
%0 = mul <4 x i32> %vec.ind, <i32 3, i32 3, i32 3, i32 3>
%1 = add <4 x i32> %0, <i32 6, i32 6, i32 6, i32 6>
%2 = getelementptr inbounds i32, i32* %data, <4 x i32> %1
%wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %2, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
%3 = getelementptr inbounds i32, i32* %dst, i32 %index
%4 = bitcast i32* %3 to <4 x i32>*
store <4 x i32> %wide.masked.gather, <4 x i32>* %4, align 4
; Second user of the add result %1; the value defined here has no users of
; its own.
%non_gatscat_use = mul <4 x i32> %1, <i32 3, i32 3, i32 3, i32 3>
%index.next = add i32 %index, 4
%vec.ind.next = add <4 x i32> %vec.ind, <i32 8, i32 8, i32 8, i32 8>
%5 = icmp eq i32 %index.next, %n.vec
br i1 %5, label %end, label %vector.body
end:
ret void;
}
; Three-deep loop nest built around two gathers.
;   * outer loop  (%i.054.us)    runs until the counter equals %n
;   * middle loop (%j.051.us.us) runs until the counter equals %m
;   * vector.body accumulates the element-wise product of the two gathers
;     into %vec.phi until %index.next equals %n.vec
; middle.block reduces the accumulator and stores the sum to %C[j + i * %m].
; Gather addressing in the IR:
;   * %A at element offsets %vec.ind + splat(i * %l)
;   * %B at element offsets %vec.ind * splat(%m) + splat(j)
; Expected assembly: the innermost loop is a dls/le hardware loop. The %A
; gather uses a vector of addresses with pre-increment writeback (#32 bytes =
; %vec.ind step 8 * 4-byte elements). The %B gather uses base + offset-vector
; addressing, and its offsets are advanced by adding a loop-invariant vector
; (q3) rather than being re-multiplied each iteration.
; NOTE(review): the name suggests this was reduced from a q31 matrix multiply
; - confirm.
define dso_local void @arm_mat_mult_q31(i32* noalias nocapture readonly %A, i32* noalias nocapture readonly %B, i32* noalias nocapture %C, i32 %n, i32 %m, i32 %l) local_unnamed_addr #0 {
; CHECK-LABEL: arm_mat_mult_q31:
; CHECK: @ %bb.0: @ %for.cond8.preheader.us.us.preheader.preheader
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: .pad #4
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: .pad #24
; CHECK-NEXT: sub sp, #24
; CHECK-NEXT: ldrd r9, r12, [sp, #128]
; CHECK-NEXT: sub.w r7, r12, #1
; CHECK-NEXT: movs r6, #1
; CHECK-NEXT: adr r5, .LCPI9_0
; CHECK-NEXT: add.w r7, r6, r7, lsr #1
; CHECK-NEXT: vdup.32 q1, r9
; CHECK-NEXT: bic r7, r7, #3
; CHECK-NEXT: vldrw.u32 q2, [r5]
; CHECK-NEXT: subs r7, #4
; CHECK-NEXT: mov.w r8, #0
; CHECK-NEXT: vshl.i32 q3, q1, #3
; CHECK-NEXT: add.w r7, r6, r7, lsr #2
; CHECK-NEXT: adr r6, .LCPI9_1
; CHECK-NEXT: vldrw.u32 q0, [r6]
; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill
; CHECK-NEXT: .LBB9_1: @ %for.cond8.preheader.us.us.preheader
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB9_2 Depth 2
; CHECK-NEXT: @ Child Loop BB9_3 Depth 3
; CHECK-NEXT: mul r10, r8, r9
; CHECK-NEXT: movs r5, #0
; CHECK-NEXT: mul r11, r8, r12
; CHECK-NEXT: .LBB9_2: @ %vector.ph
; CHECK-NEXT: @ Parent Loop BB9_1 Depth=1
; CHECK-NEXT: @ => This Loop Header: Depth=2
; CHECK-NEXT: @ Child Loop BB9_3 Depth 3
; CHECK-NEXT: vdup.32 q5, r11
; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload
; CHECK-NEXT: vshl.i32 q5, q5, #2
; CHECK-NEXT: vmov q6, q1
; CHECK-NEXT: vadd.i32 q5, q5, r0
; CHECK-NEXT: vmov.i32 q4, #0x0
; CHECK-NEXT: vadd.i32 q5, q5, q0
; CHECK-NEXT: vmlas.u32 q6, q2, r5
; CHECK-NEXT: dls lr, r7
; CHECK-NEXT: .LBB9_3: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB9_1 Depth=1
; CHECK-NEXT: @ Parent Loop BB9_2 Depth=2
; CHECK-NEXT: @ => This Inner Loop Header: Depth=3
; CHECK-NEXT: vadd.i32 q7, q6, q3
; CHECK-NEXT: vldrw.u32 q0, [r1, q6, uxtw #2]
; CHECK-NEXT: vldrw.u32 q6, [q5, #32]!
; CHECK-NEXT: vmul.i32 q0, q0, q6
; CHECK-NEXT: vmov q6, q7
; CHECK-NEXT: vadd.i32 q4, q0, q4
; CHECK-NEXT: le lr, .LBB9_3
; CHECK-NEXT: @ %bb.4: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB9_2 Depth=2
; CHECK-NEXT: add.w r6, r5, r10
; CHECK-NEXT: adds r5, #1
; CHECK-NEXT: vaddv.u32 r4, q4
; CHECK-NEXT: cmp r5, r9
; CHECK-NEXT: str.w r4, [r2, r6, lsl #2]
; CHECK-NEXT: bne .LBB9_2
; CHECK-NEXT: @ %bb.5: @ %for.cond4.for.cond.cleanup6_crit_edge.us
; CHECK-NEXT: @ in Loop: Header=BB9_1 Depth=1
; CHECK-NEXT: add.w r8, r8, #1
; CHECK-NEXT: cmp r8, r3
; CHECK-NEXT: bne .LBB9_1
; CHECK-NEXT: @ %bb.6: @ %for.end25
; CHECK-NEXT: add sp, #24
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.7:
; CHECK-NEXT: .LCPI9_0:
; CHECK-NEXT: .long 0 @ 0x0
; CHECK-NEXT: .long 2 @ 0x2
; CHECK-NEXT: .long 4 @ 0x4
; CHECK-NEXT: .long 6 @ 0x6
; CHECK-NEXT: .LCPI9_1:
; CHECK-NEXT: .long 4294967264 @ 0xffffffe0
; CHECK-NEXT: .long 4294967272 @ 0xffffffe8
; CHECK-NEXT: .long 4294967280 @ 0xfffffff0
; CHECK-NEXT: .long 4294967288 @ 0xfffffff8
; Entry block: there is no %entry block in this function despite the 'preds'
; note. It computes the vector trip count %n.vec from %l and the splat of %m.
; %min.iters.check, %ind.end and %cmp.n have no uses.
for.cond8.preheader.us.us.preheader.preheader: ; preds = %entry
%0 = add i32 %l, -1
%1 = lshr i32 %0, 1
%2 = add nuw i32 %1, 1
%min.iters.check = icmp ult i32 %0, 6
%n.vec = and i32 %2, -4
%ind.end = shl i32 %n.vec, 1
%broadcast.splatinsert86 = insertelement <4 x i32> undef, i32 %m, i32 0
%broadcast.splat87 = shufflevector <4 x i32> %broadcast.splatinsert86, <4 x i32> undef, <4 x i32> zeroinitializer
%cmp.n = icmp eq i32 %2, %n.vec
br label %for.cond8.preheader.us.us.preheader
; Outer loop header: per-row values i * %l (splatted for the %A offsets) and
; i * %m (used for the %C index).
for.cond8.preheader.us.us.preheader: ; preds = %for.cond8.preheader.us.us.preheader.preheader, %for.cond4.for.cond.cleanup6_crit_edge.us
%i.054.us = phi i32 [ %inc24.us, %for.cond4.for.cond.cleanup6_crit_edge.us ], [ 0, %for.cond8.preheader.us.us.preheader.preheader ]
%mul.us = mul i32 %i.054.us, %l
%mul18.us = mul i32 %i.054.us, %m
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %mul.us, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
br label %vector.ph
; Outer loop latch. It is reached from middle.block; the block named in the
; 'preds' note only exists as a commented-out label further down.
for.cond4.for.cond.cleanup6_crit_edge.us: ; preds = %for.cond8.for.cond.cleanup10_crit_edge.us.us
%inc24.us = add nuw nsw i32 %i.054.us, 1
%exitcond85 = icmp eq i32 %inc24.us, %n
br i1 %exitcond85, label %for.end25, label %for.cond8.preheader.us.us.preheader
; Middle loop header: splats the column counter j for the %B offsets.
vector.ph: ; preds = %middle.block, %for.cond8.preheader.us.us.preheader
%j.051.us.us = phi i32 [ %inc.us.us, %middle.block ], [ 0, %for.cond8.preheader.us.us.preheader ]
%broadcast.splatinsert88 = insertelement <4 x i32> undef, i32 %j.051.us.us, i32 0
%broadcast.splat89 = shufflevector <4 x i32> %broadcast.splatinsert88, <4 x i32> undef, <4 x i32> zeroinitializer
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.ind = phi <4 x i32> [ <i32 0, i32 2, i32 4, i32 6>, %vector.ph ], [ %vec.ind.next, %vector.body ]
%vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %9, %vector.body ]
; %A gather: induction vector plus a loop-invariant splat.
%3 = add <4 x i32> %vec.ind, %broadcast.splat
%4 = getelementptr inbounds i32, i32* %A, <4 x i32> %3
%wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %4, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), !tbaa !3
; %B gather: induction vector times splat(%m), plus splat(j).
%5 = mul <4 x i32> %vec.ind, %broadcast.splat87
%6 = add <4 x i32> %5, %broadcast.splat89
%7 = getelementptr inbounds i32, i32* %B, <4 x i32> %6
%wide.masked.gather90 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %7, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), !tbaa !3
%8 = mul nsw <4 x i32> %wide.masked.gather90, %wide.masked.gather
%9 = add <4 x i32> %8, %vec.phi
%index.next = add i32 %index, 4
%vec.ind.next = add <4 x i32> %vec.ind, <i32 8, i32 8, i32 8, i32 8>
%10 = icmp eq i32 %index.next, %n.vec
br i1 %10, label %middle.block, label %vector.body, !llvm.loop !7
middle.block: ; preds = %vector.body
%11 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %9)
; The label on the next line is commented out, so the instructions after it
; still belong to middle.block.
;for.cond8.for.cond.cleanup10_crit_edge.us.us: ; preds = %for.body11.us.us, %middle.block
%add19.us.us = add i32 %j.051.us.us, %mul18.us
%arrayidx20.us.us = getelementptr inbounds i32, i32* %C, i32 %add19.us.us
store i32 %11, i32* %arrayidx20.us.us, align 4, !tbaa !3
%inc.us.us = add nuw nsw i32 %j.051.us.us, 1
%exitcond = icmp eq i32 %inc.us.us, %m
br i1 %exitcond, label %for.cond4.for.cond.cleanup6_crit_edge.us, label %vector.ph
; Function exit; only the outer loop latch branches here (no %entry block
; exists in this function).
for.end25: ; preds = %for.cond4.for.cond.cleanup6_crit_edge.us, %entry
ret void
}
define dso_local void @arm_mat_mult_q15(i16* noalias nocapture readonly %A, i16* noalias nocapture readonly %B, i16* noalias nocapture %C, i32 %n, i32 %m, i32 %l) local_unnamed_addr #0 {
; CHECK-LABEL: arm_mat_mult_q15:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: .pad #4
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT: .pad #24
; CHECK-NEXT: sub sp, #24
; CHECK-NEXT: strd r0, r2, [sp, #16] @ 8-byte Folded Spill
; CHECK-NEXT: cmp r3, #0
; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill
; CHECK-NEXT: mov r0, r3
; CHECK-NEXT: itt ne
; CHECK-NEXT: ldrne r0, [sp, #112]
; CHECK-NEXT: cmpne r0, #0
; CHECK-NEXT: bne .LBB10_2
; CHECK-NEXT: .LBB10_1: @ %for.cond.cleanup
; CHECK-NEXT: add sp, #24
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
; CHECK-NEXT: .LBB10_2: @ %for.cond1.preheader.us.preheader
; CHECK-NEXT: ldr.w r11, [sp, #116]
; CHECK-NEXT: mov r6, r1
; CHECK-NEXT: movs r1, #1
; CHECK-NEXT: mov.w r9, #0
; CHECK-NEXT: bic r10, r11, #3
; CHECK-NEXT: sub.w r0, r10, #4
; CHECK-NEXT: add.w r8, r1, r0, lsr #2
; CHECK-NEXT: ldr r1, [sp, #112]
; CHECK-NEXT: lsl.w r0, r11, #1
; CHECK-NEXT: str r0, [sp] @ 4-byte Spill
; CHECK-NEXT: adr r0, .LCPI10_0
; CHECK-NEXT: vdup.32 q4, r1
; CHECK-NEXT: vldrw.u32 q5, [r0]
; CHECK-NEXT: lsls r4, r1, #1
; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload
; CHECK-NEXT: vshl.i32 q6, q4, #2
; CHECK-NEXT: movs r1, #0
; CHECK-NEXT: str r0, [sp, #12] @ 4-byte Spill
; CHECK-NEXT: b .LBB10_5
; CHECK-NEXT: .LBB10_3: @ %for.cond5.preheader.us73.preheader
; CHECK-NEXT: @ in Loop: Header=BB10_5 Depth=1
; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload
; CHECK-NEXT: mov r1, r4
; CHECK-NEXT: add.w r0, r0, r12, lsl #1
; CHECK-NEXT: bl __aeabi_memclr
; CHECK-NEXT: .LBB10_4: @ %for.cond1.for.cond.cleanup3_crit_edge.us
; CHECK-NEXT: @ in Loop: Header=BB10_5 Depth=1
; CHECK-NEXT: ldr r0, [sp] @ 4-byte Reload
; CHECK-NEXT: add r9, r11
; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload
; CHECK-NEXT: add r1, r0
; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill
; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload
; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload
; CHECK-NEXT: adds r1, #1
; CHECK-NEXT: cmp r1, r0
; CHECK-NEXT: beq .LBB10_1
; CHECK-NEXT: .LBB10_5: @ %for.cond1.preheader.us
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB10_8 Depth 2
; CHECK-NEXT: @ Child Loop BB10_11 Depth 3
; CHECK-NEXT: @ Child Loop BB10_14 Depth 3
; CHECK-NEXT: ldr r0, [sp, #112]
; CHECK-NEXT: cmp.w r11, #0
; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill
; CHECK-NEXT: mul r12, r1, r0
; CHECK-NEXT: beq .LBB10_3
; CHECK-NEXT: @ %bb.6: @ %for.cond5.preheader.us.us.preheader
; CHECK-NEXT: @ in Loop: Header=BB10_5 Depth=1
; CHECK-NEXT: movs r1, #0
; CHECK-NEXT: b .LBB10_8
; CHECK-NEXT: .LBB10_7: @ %for.cond5.for.cond.cleanup7_crit_edge.us.us
; CHECK-NEXT: @ in Loop: Header=BB10_8 Depth=2
; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload
; CHECK-NEXT: add.w r3, r1, r12
; CHECK-NEXT: adds r1, #1
; CHECK-NEXT: strh.w r2, [r0, r3, lsl #1]
; CHECK-NEXT: ldr r0, [sp, #112]
; CHECK-NEXT: cmp r1, r0
; CHECK-NEXT: beq .LBB10_4
; CHECK-NEXT: .LBB10_8: @ %for.cond5.preheader.us.us
; CHECK-NEXT: @ Parent Loop BB10_5 Depth=1
; CHECK-NEXT: @ => This Loop Header: Depth=2
; CHECK-NEXT: @ Child Loop BB10_11 Depth 3
; CHECK-NEXT: @ Child Loop BB10_14 Depth 3
; CHECK-NEXT: cmp.w r11, #3
; CHECK-NEXT: bhi .LBB10_10
; CHECK-NEXT: @ %bb.9: @ in Loop: Header=BB10_8 Depth=2
; CHECK-NEXT: movs r5, #0
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: b .LBB10_13
; CHECK-NEXT: .LBB10_10: @ %vector.ph
; CHECK-NEXT: @ in Loop: Header=BB10_8 Depth=2
; CHECK-NEXT: vmov q1, q4
; CHECK-NEXT: ldr r2, [sp, #12] @ 4-byte Reload
; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: vmlas.u32 q1, q5, r1
; CHECK-NEXT: dls lr, r8
; CHECK-NEXT: .LBB10_11: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB10_5 Depth=1
; CHECK-NEXT: @ Parent Loop BB10_8 Depth=2
; CHECK-NEXT: @ => This Inner Loop Header: Depth=3
; CHECK-NEXT: vadd.i32 q2, q1, q6
; CHECK-NEXT: vldrh.s32 q3, [r6, q1, uxtw #1]
; CHECK-NEXT: vldrh.s32 q1, [r2], #8
; CHECK-NEXT: vmul.i32 q1, q3, q1
; CHECK-NEXT: vadd.i32 q0, q1, q0
; CHECK-NEXT: vmov q1, q2
; CHECK-NEXT: le lr, .LBB10_11
; CHECK-NEXT: @ %bb.12: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB10_8 Depth=2
; CHECK-NEXT: vaddv.u32 r2, q0
; CHECK-NEXT: cmp r10, r11
; CHECK-NEXT: mov r5, r10
; CHECK-NEXT: beq .LBB10_7
; CHECK-NEXT: .LBB10_13: @ %for.body8.us.us.preheader
; CHECK-NEXT: @ in Loop: Header=BB10_8 Depth=2
; CHECK-NEXT: ldr r0, [sp, #112]
; CHECK-NEXT: sub.w lr, r11, r5
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: mla r3, r0, r5, r1
; CHECK-NEXT: add r5, r9
; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload
; CHECK-NEXT: add.w r5, r0, r5, lsl #1
; CHECK-NEXT: add.w r3, r6, r3, lsl #1
; CHECK-NEXT: .LBB10_14: @ %for.body8.us.us
; CHECK-NEXT: @ Parent Loop BB10_5 Depth=1
; CHECK-NEXT: @ Parent Loop BB10_8 Depth=2
; CHECK-NEXT: @ => This Inner Loop Header: Depth=3
; CHECK-NEXT: ldrsh.w r0, [r3]
; CHECK-NEXT: add r3, r4
; CHECK-NEXT: ldrsh r7, [r5], #2
; CHECK-NEXT: smlabb r2, r0, r7, r2
; CHECK-NEXT: le lr, .LBB10_14
; CHECK-NEXT: b .LBB10_7
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.15:
; CHECK-NEXT: .LCPI10_0:
; CHECK-NEXT: .long 0 @ 0x0
; CHECK-NEXT: .long 1 @ 0x1
; CHECK-NEXT: .long 2 @ 0x2
; CHECK-NEXT: .long 3 @ 0x3
entry:
%cmp48 = icmp eq i32 %n, 0
br i1 %cmp48, label %for.cond.cleanup, label %for.cond1.preheader.lr.ph
for.cond1.preheader.lr.ph: ; preds = %entry
%cmp245 = icmp eq i32 %m, 0
%cmp642 = icmp eq i32 %l, 0
br i1 %cmp245, label %for.cond.cleanup, label %for.cond1.preheader.us.preheader
for.cond1.preheader.us.preheader: ; preds = %for.cond1.preheader.lr.ph
%0 = shl nuw i32 %m, 1
%min.iters.check = icmp ult i32 %l, 4
%n.vec = and i32 %l, -4
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %m, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%cmp.n = icmp eq i32 %n.vec, %l
br label %for.cond1.preheader.us
for.cond1.preheader.us: ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %for.cond1.preheader.us.preheader
%i.049.us = phi i32 [ %inc23.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ 0, %for.cond1.preheader.us.preheader ]
%1 = mul i32 %i.049.us, %m
%mul.us = mul i32 %i.049.us, %l
br i1 %cmp642, label %for.cond5.preheader.us73.preheader, label %for.cond5.preheader.us.us
for.cond5.preheader.us73.preheader: ; preds = %for.cond1.preheader.us
%scevgep = getelementptr i16, i16* %C, i32 %1
%scevgep82 = bitcast i16* %scevgep to i8*
call void @llvm.memset.p0i8.i32(i8* align 2 %scevgep82, i8 0, i32 %0, i1 false)
br label %for.cond1.for.cond.cleanup3_crit_edge.us
for.cond1.for.cond.cleanup3_crit_edge.us: ; preds = %for.cond5.for.cond.cleanup7_crit_edge.us.us, %for.cond5.preheader.us73.preheader
%inc23.us = add nuw nsw i32 %i.049.us, 1
%exitcond84 = icmp eq i32 %inc23.us, %n
br i1 %exitcond84, label %for.cond.cleanup, label %for.cond1.preheader.us
for.cond5.preheader.us.us: ; preds = %for.cond1.preheader.us, %for.cond5.for.cond.cleanup7_crit_edge.us.us
%j.046.us.us = phi i32 [ %inc20.us.us, %for.cond5.for.cond.cleanup7_crit_edge.us.us ], [ 0, %for.cond1.preheader.us ]
br i1 %min.iters.check, label %for.body8.us.us.preheader, label %vector.ph
for.body8.us.us.preheader: ; preds = %middle.block, %for.cond5.preheader.us.us
%k.044.us.us.ph = phi i32 [ 0, %for.cond5.preheader.us.us ], [ %n.vec, %middle.block ]
%sum.043.us.us.ph = phi i32 [ 0, %for.cond5.preheader.us.us ], [ %13, %middle.block ]
br label %for.body8.us.us
vector.ph: ; preds = %for.cond5.preheader.us.us
%broadcast.splatinsert85 = insertelement <4 x i32> undef, i32 %j.046.us.us, i32 0
%broadcast.splat86 = shufflevector <4 x i32> %broadcast.splatinsert85, <4 x i32> undef, <4 x i32> zeroinitializer
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.ind = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %vector.ph ], [ %vec.ind.next, %vector.body ]
%vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %11, %vector.body ]
%2 = add i32 %index, %mul.us
%3 = getelementptr inbounds i16, i16* %A, i32 %2
%4 = bitcast i16* %3 to <4 x i16>*
%wide.load = load <4 x i16>, <4 x i16>* %4, align 2, !tbaa !3
%5 = sext <4 x i16> %wide.load to <4 x i32>
%6 = mul <4 x i32> %vec.ind, %broadcast.splat
%7 = add <4 x i32> %6, %broadcast.splat86
%8 = getelementptr inbounds i16, i16* %B, <4 x i32> %7
%wide.masked.gather = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %8, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i16> undef), !tbaa !3
%9 = sext <4 x i16> %wide.masked.gather to <4 x i32>
%10 = mul nsw <4 x i32> %9, %5
%11 = add <4 x i32> %10, %vec.phi
%index.next = add i32 %index, 4
%vec.ind.next = add <4 x i32> %vec.ind, <i32 4, i32 4, i32 4, i32 4>
%12 = icmp eq i32 %index.next, %n.vec
br i1 %12, label %middle.block, label %vector.body, !llvm.loop !7
middle.block: ; preds = %vector.body
%13 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %11)
br i1 %cmp.n, label %for.cond5.for.cond.cleanup7_crit_edge.us.us, label %for.body8.us.us.preheader
for.cond5.for.cond.cleanup7_crit_edge.us.us: ; preds = %for.body8.us.us, %middle.block
%add14.us.us.lcssa = phi i32 [ %13, %middle.block ], [ %add14.us.us, %for.body8.us.us ]
%conv15.us.us = trunc i32 %add14.us.us.lcssa to i16
%add17.us.us = add i32 %j.046.us.us, %1
%arrayidx18.us.us = getelementptr inbounds i16, i16* %C, i32 %add17.us.us
store i16 %conv15.us.us, i16* %arrayidx18.us.us, align 2, !tbaa !3
%inc20.us.us = add nuw nsw i32 %j.046.us.us, 1
%exitcond83 = icmp eq i32 %inc20.us.us, %m
br i1 %exitcond83, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.cond5.preheader.us.us
for.body8.us.us: ; preds = %for.body8.us.us.preheader, %for.body8.us.us
%k.044.us.us = phi i32 [ %inc.us.us, %for.body8.us.us ], [ %k.044.us.us.ph, %for.body8.us.us.preheader ]
%sum.043.us.us = phi i32 [ %add14.us.us, %for.body8.us.us ], [ %sum.043.us.us.ph, %for.body8.us.us.preheader ]
%add.us.us = add i32 %k.044.us.us, %mul.us
%arrayidx.us.us = getelementptr inbounds i16, i16* %A, i32 %add.us.us
%14 = load i16, i16* %arrayidx.us.us, align 2, !tbaa !3
%conv.us.us = sext i16 %14 to i32
%mul9.us.us = mul i32 %k.044.us.us, %m
%add10.us.us = add i32 %mul9.us.us, %j.046.us.us
%arrayidx11.us.us = getelementptr inbounds i16, i16* %B, i32 %add10.us.us
%15 = load i16, i16* %arrayidx11.us.us, align 2, !tbaa !3
%conv12.us.us = sext i16 %15 to i32
%mul13.us.us = mul nsw i32 %conv12.us.us, %conv.us.us
%add14.us.us = add nsw i32 %mul13.us.us, %sum.043.us.us
%inc.us.us = add nuw nsw i32 %k.044.us.us, 1
%exitcond = icmp eq i32 %inc.us.us, %l
br i1 %exitcond, label %for.cond5.for.cond.cleanup7_crit_edge.us.us, label %for.body8.us.us, !llvm.loop !9
for.cond.cleanup: ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %for.cond1.preheader.lr.ph, %entry
ret void
}
; arm_depthwise_conv_s8: five nested loops (for.body10.i, for.cond22.preheader.i,
; for.body27.i, for.body78.us.i, vector.body) around a vector body that performs
; two <4 x i8> masked gathers, one from %input and one from %kernel. Both gathers
; use offset vectors derived from the induction vector %vec.ind.
;
; The expected assembly below keeps both gathers inside .LBB11_5 as vldrb.s32
; with a vector offset, builds the initial offset vectors with vmla.u32 in
; .LBB11_4, and advances them every iteration with vadd.i32 by q1 (the splat of
; r2 shifted left by 2). The loop over %i_ker_y is expected to be emitted as a
; dls/le loop with a count of 6.
;
; Several branches in the IR use the constant condition 0; they are called out
; at each site. The accumulated sum %tmp89 is never stored or returned; the
; function always returns 0.
define hidden arm_aapcs_vfpcc i32 @arm_depthwise_conv_s8(i8* nocapture readonly %input, i16 zeroext %input_x, i16 zeroext %input_y, i16 zeroext %input_ch, i8* nocapture readonly %kernel, i16 zeroext %output_ch, i16 zeroext %ch_mult, i16 zeroext %kernel_x, i16 zeroext %kernel_y, i16 zeroext %pad_x, i16 zeroext %pad_y, i16 zeroext %stride_x, i16 zeroext %stride_y, i32* nocapture readonly %bias, i8* nocapture %output, i32* nocapture readonly %output_shift, i32* nocapture readonly %output_mult, i16 zeroext %output_x, i16 zeroext %output_y, i32 %output_offset, i32 %input_offset, i32 %output_activation_min, i32 %output_activation_max, i16 zeroext %dilation_x, i16 zeroext %dilation_y, i16* nocapture readnone %buffer_a) local_unnamed_addr #0 {
; CHECK-LABEL: arm_depthwise_conv_s8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: .pad #4
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT: .pad #8
; CHECK-NEXT: sub sp, #8
; CHECK-NEXT: ldrd r2, r7, [sp, #104]
; CHECK-NEXT: add.w r12, r7, #10
; CHECK-NEXT: adr r7, .LCPI11_0
; CHECK-NEXT: ldr r1, [sp, #96]
; CHECK-NEXT: vdup.32 q1, r2
; CHECK-NEXT: vldrw.u32 q0, [r7]
; CHECK-NEXT: mov.w r9, #0
; CHECK-NEXT: mov.w r10, #11
; CHECK-NEXT: vshl.i32 q1, q1, #2
; CHECK-NEXT: movs r6, #0
; CHECK-NEXT: .LBB11_1: @ %for.body10.i
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB11_2 Depth 2
; CHECK-NEXT: @ Child Loop BB11_3 Depth 3
; CHECK-NEXT: @ Child Loop BB11_4 Depth 4
; CHECK-NEXT: @ Child Loop BB11_5 Depth 5
; CHECK-NEXT: mov.w r8, #0
; CHECK-NEXT: str r6, [sp, #4] @ 4-byte Spill
; CHECK-NEXT: .LBB11_2: @ %for.cond22.preheader.i
; CHECK-NEXT: @ Parent Loop BB11_1 Depth=1
; CHECK-NEXT: @ => This Loop Header: Depth=2
; CHECK-NEXT: @ Child Loop BB11_3 Depth 3
; CHECK-NEXT: @ Child Loop BB11_4 Depth 4
; CHECK-NEXT: @ Child Loop BB11_5 Depth 5
; CHECK-NEXT: movs r6, #0
; CHECK-NEXT: .LBB11_3: @ %for.body27.i
; CHECK-NEXT: @ Parent Loop BB11_1 Depth=1
; CHECK-NEXT: @ Parent Loop BB11_2 Depth=2
; CHECK-NEXT: @ => This Loop Header: Depth=3
; CHECK-NEXT: @ Child Loop BB11_4 Depth 4
; CHECK-NEXT: @ Child Loop BB11_5 Depth 5
; CHECK-NEXT: mov.w lr, #6
; CHECK-NEXT: movs r4, #0
; CHECK-NEXT: movs r5, #4
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB11_4: @ %for.body78.us.i
; CHECK-NEXT: @ Parent Loop BB11_1 Depth=1
; CHECK-NEXT: @ Parent Loop BB11_2 Depth=2
; CHECK-NEXT: @ Parent Loop BB11_3 Depth=3
; CHECK-NEXT: @ => This Loop Header: Depth=4
; CHECK-NEXT: @ Child Loop BB11_5 Depth 5
; CHECK-NEXT: mul r7, r5, r10
; CHECK-NEXT: vdup.32 q3, r6
; CHECK-NEXT: vdup.32 q2, r8
; CHECK-NEXT: mov r11, r12
; CHECK-NEXT: vadd.i32 q4, q0, r7
; CHECK-NEXT: vmla.u32 q3, q4, r2
; CHECK-NEXT: adds r7, #113
; CHECK-NEXT: vadd.i32 q4, q0, r7
; CHECK-NEXT: vmla.u32 q2, q4, r2
; CHECK-NEXT: .LBB11_5: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB11_1 Depth=1
; CHECK-NEXT: @ Parent Loop BB11_2 Depth=2
; CHECK-NEXT: @ Parent Loop BB11_3 Depth=3
; CHECK-NEXT: @ Parent Loop BB11_4 Depth=4
; CHECK-NEXT: @ => This Inner Loop Header: Depth=5
; CHECK-NEXT: vldrb.s32 q6, [r0, q2]
; CHECK-NEXT: vadd.i32 q5, q2, q1
; CHECK-NEXT: vadd.i32 q4, q3, q1
; CHECK-NEXT: subs.w r11, r11, #4
; CHECK-NEXT: vadd.i32 q2, q6, r2
; CHECK-NEXT: vldrb.s32 q6, [r1, q3]
; CHECK-NEXT: vmov q3, q4
; CHECK-NEXT: vmlava.u32 r4, q2, q6
; CHECK-NEXT: vmov q2, q5
; CHECK-NEXT: bne .LBB11_5
; CHECK-NEXT: @ %bb.6: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB11_4 Depth=4
; CHECK-NEXT: adds r5, #1
; CHECK-NEXT: le lr, .LBB11_4
; CHECK-NEXT: @ %bb.7: @ %for.cond.cleanup77.i
; CHECK-NEXT: @ in Loop: Header=BB11_3 Depth=3
; CHECK-NEXT: adds r6, #1
; CHECK-NEXT: add.w r9, r9, #1
; CHECK-NEXT: cmp r6, r2
; CHECK-NEXT: bne .LBB11_3
; CHECK-NEXT: @ %bb.8: @ %for.cond.cleanup26.i
; CHECK-NEXT: @ in Loop: Header=BB11_2 Depth=2
; CHECK-NEXT: add.w r8, r8, #1
; CHECK-NEXT: cmp r8, r3
; CHECK-NEXT: bne .LBB11_2
; CHECK-NEXT: @ %bb.9: @ %for.cond.cleanup20.i
; CHECK-NEXT: @ in Loop: Header=BB11_1 Depth=1
; CHECK-NEXT: ldr r6, [sp, #4] @ 4-byte Reload
; CHECK-NEXT: ldr r7, [sp, #148]
; CHECK-NEXT: adds r6, #1
; CHECK-NEXT: cmp r6, r7
; CHECK-NEXT: it eq
; CHECK-NEXT: moveq r6, #0
; CHECK-NEXT: b .LBB11_1
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.10:
; CHECK-NEXT: .LCPI11_0:
; CHECK-NEXT: .long 0 @ 0x0
; CHECK-NEXT: .long 1 @ 0x1
; CHECK-NEXT: .long 2 @ 0x2
; CHECK-NEXT: .long 3 @ 0x3
entry:
; Widen the i16 arguments used as loop bounds and multipliers to i32.
%conv = zext i16 %ch_mult to i32
%conv6.i = zext i16 %output_x to i32
%conv17.i = zext i16 %input_ch to i32
%conv60.i = zext i16 %kernel_x to i32
; Three separate but identical splats of %conv (ch_mult); each has its own use
; in vector.body (%tmp77, %tmp80 and %tmp84 respectively).
%broadcast.splatinsert63 = insertelement <4 x i32> undef, i32 %conv, i32 0
%broadcast.splat64 = shufflevector <4 x i32> %broadcast.splatinsert63, <4 x i32> undef, <4 x i32> zeroinitializer
%broadcast.splatinsert69 = insertelement <4 x i32> undef, i32 %conv, i32 0
%broadcast.splat70 = shufflevector <4 x i32> %broadcast.splatinsert69, <4 x i32> undef, <4 x i32> zeroinitializer
%broadcast.splatinsert73 = insertelement <4 x i32> undef, i32 %conv, i32 0
%broadcast.splat74 = shufflevector <4 x i32> %broadcast.splatinsert73, <4 x i32> undef, <4 x i32> zeroinitializer
; Only consumed by the %niter phi in for.body27.i.us.
%unroll_iter = and i32 %conv, 65534
br label %for.body.i38
; Outermost loop, counting %i_out_y.023.i.
for.body.i38: ; preds = %for.cond.cleanup9.i, %entry
%i_out.024.i = phi i32 [ 0, %entry ], [ %i_out.1.lcssa.i, %for.cond.cleanup9.i ]
%i_out_y.023.i = phi i32 [ 0, %entry ], [ %inc140.i, %for.cond.cleanup9.i ]
br label %for.body10.i
for.cond.cleanup9.i: ; preds = %for.cond.cleanup20.i
%i_out.1.lcssa.i = phi i32 [ %i_out.2.lcssa.i, %for.cond.cleanup20.i ]
%inc140.i = add nuw nsw i32 %i_out_y.023.i, 1
; Constant-false condition: the back edge to for.body.i38 is always taken.
br i1 0, label %if.end, label %for.body.i38
; Loop over %i_out_x.019.i, bounded by %conv6.i (output_x) in for.cond.cleanup20.i.
for.body10.i: ; preds = %for.cond.cleanup20.i, %for.body.i38
%i_out.120.i = phi i32 [ %i_out.024.i, %for.body.i38 ], [ %i_out.2.lcssa.i, %for.cond.cleanup20.i ]
%i_out_x.019.i = phi i32 [ 0, %for.body.i38 ], [ %inc137.i, %for.cond.cleanup20.i ]
; Exit bound of vector.body: %index advances by 4 until it equals kernel_x + 10.
%n.vec = add nsw i32 %conv60.i, 10
; Constant-false condition: always continues to for.cond22.preheader.lr.ph.i.
br i1 0, label %for.cond.cleanup20.i, label %for.cond22.preheader.lr.ph.i
for.cond22.preheader.lr.ph.i: ; preds = %for.body10.i
; %ind.end and %cmp.n below have no users in this function.
%ind.end = add nsw i32 0, %n.vec
; Initial induction vector: splat(0) + <0, 1, 2, 3>.
%.splatinsert = insertelement <4 x i32> undef, i32 0, i32 0
%.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = add <4 x i32> %.splat, <i32 0, i32 1, i32 2, i32 3>
%cmp.n = icmp eq i32 10, %n.vec
br label %for.cond22.preheader.i
; Loop over %i_input_ch.014.i, bounded by %conv17.i (input_ch) in for.cond.cleanup26.i.
for.cond22.preheader.i: ; preds = %for.cond.cleanup26.i, %for.cond22.preheader.lr.ph.i
%i_out.216.i = phi i32 [ %i_out.120.i, %for.cond22.preheader.lr.ph.i ], [ %i_out.3.lcssa.i, %for.cond.cleanup26.i ]
%i_input_ch.014.i = phi i32 [ 0, %for.cond22.preheader.lr.ph.i ], [ %inc134.i, %for.cond.cleanup26.i ]
; Constant-false condition: always continues to for.body27.lr.ph.i.
br i1 0, label %for.cond.cleanup26.i, label %for.body27.lr.ph.i
for.body27.lr.ph.i: ; preds = %for.cond22.preheader.i
; Constant-false condition: for.body27.i.preheader is always taken, so the
; for.body27.i.us* blocks are never entered at run time.
br i1 0, label %for.body27.i.us.preheader, label %for.body27.i.preheader
for.body27.i.preheader: ; preds = %for.body27.lr.ph.i
; Splat of the current input channel index, added to the %input gather offsets.
%broadcast.splatinsert65 = insertelement <4 x i32> undef, i32 %i_input_ch.014.i, i32 0
%broadcast.splat66 = shufflevector <4 x i32> %broadcast.splatinsert65, <4 x i32> undef, <4 x i32> zeroinitializer
br label %for.body27.i
for.body27.i.us.preheader: ; preds = %for.body27.lr.ph.i
br i1 0, label %for.cond.cleanup26.i.loopexit.unr-lcssa, label %for.body27.i.us
; Self-looping block that steps both counters by 2; reachable only through the
; constant-false branch in for.body27.lr.ph.i.
for.body27.i.us: ; preds = %for.body27.i.us, %for.body27.i.us.preheader
%i_out.311.i.us = phi i32 [ %inc128.i.us.1, %for.body27.i.us ], [ %i_out.216.i, %for.body27.i.us.preheader ]
%i_ch_mult.010.i.us = phi i32 [ %inc131.i.us.1, %for.body27.i.us ], [ 0, %for.body27.i.us.preheader ]
%niter = phi i32 [ 0, %for.body27.i.us ], [ %unroll_iter, %for.body27.i.us.preheader ]
%inc128.i.us.1 = add nsw i32 %i_out.311.i.us, 2
%inc131.i.us.1 = add nuw nsw i32 %i_ch_mult.010.i.us, 2
br i1 0, label %for.cond.cleanup26.i.loopexit.unr-lcssa, label %for.body27.i.us
; Latch of the %i_out_x loop.
for.cond.cleanup20.i: ; preds = %for.cond.cleanup26.i, %for.body10.i
%i_out.2.lcssa.i = phi i32 [ %i_out.120.i, %for.body10.i ], [ %i_out.3.lcssa.i, %for.cond.cleanup26.i ]
%inc137.i = add nuw nsw i32 %i_out_x.019.i, 1
%exitcond27.i = icmp eq i32 %inc137.i, %conv6.i
br i1 %exitcond27.i, label %for.cond.cleanup9.i, label %for.body10.i
for.cond.cleanup26.i.loopexit.unr-lcssa: ; preds = %for.body27.i.us, %for.body27.i.us.preheader
%inc128.i.us.lcssa.ph = phi i32 [ undef, %for.body27.i.us.preheader ], [ %inc128.i.us.1, %for.body27.i.us ]
br label %for.cond.cleanup26.i
; Latch of the %i_input_ch loop.
for.cond.cleanup26.i: ; preds = %for.cond.cleanup77.i, %for.cond.cleanup26.i.loopexit.unr-lcssa, %for.cond22.preheader.i
%i_out.3.lcssa.i = phi i32 [ %i_out.216.i, %for.cond22.preheader.i ], [ %inc128.i.us.lcssa.ph, %for.cond.cleanup26.i.loopexit.unr-lcssa ], [ %inc128.i, %for.cond.cleanup77.i ]
%inc134.i = add nuw nsw i32 %i_input_ch.014.i, 1
%exitcond26.i = icmp eq i32 %inc134.i, %conv17.i
br i1 %exitcond26.i, label %for.cond.cleanup20.i, label %for.cond22.preheader.i
; Loop over %i_ch_mult.010.i, bounded by %conv (ch_mult) in for.cond.cleanup77.i.
for.body27.i: ; preds = %for.cond.cleanup77.i, %for.body27.i.preheader
%i_out.311.i = phi i32 [ %inc128.i, %for.cond.cleanup77.i ], [ %i_out.216.i, %for.body27.i.preheader ]
%i_ch_mult.010.i = phi i32 [ %inc131.i, %for.cond.cleanup77.i ], [ 0, %for.body27.i.preheader ]
; Splat of the current channel-multiplier index, added to the %kernel gather offsets.
%broadcast.splatinsert71 = insertelement <4 x i32> undef, i32 %i_ch_mult.010.i, i32 0
%broadcast.splat72 = shufflevector <4 x i32> %broadcast.splatinsert71, <4 x i32> undef, <4 x i32> zeroinitializer
br label %for.body78.us.i
; Loop over %i_ker_y.06.us.i: starts at 4 and repeats while the incremented
; value is < 10 (see middle.block), i.e. 6 iterations.
for.body78.us.i: ; preds = %middle.block, %for.body27.i
%i_ker_y.06.us.i = phi i32 [ %inc110.us.i, %middle.block ], [ 4, %for.body27.i ]
%acc_0.05.us.i = phi i32 [ %tmp89, %middle.block ], [ 0, %for.body27.i ]
; %add87.us.i44 = (i_ker_y + 10) * 11 + 3, the scalar base for the %input offsets.
%add80.us.i43 = add nsw i32 %i_ker_y.06.us.i, 10
%mul89.us.i = mul nsw i32 %add80.us.i43, 11
%add87.us.i44 = add i32 %mul89.us.i, 3
; %mul95.us.i = i_ker_y * 11, the scalar base for the %kernel offsets.
%mul95.us.i = mul nsw i32 %i_ker_y.06.us.i, 11
br label %vector.ph
vector.ph: ; preds = %for.body78.us.i
; Broadcast both scalar bases so they can be combined with %vec.ind.
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %add87.us.i44, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%broadcast.splatinsert67 = insertelement <4 x i32> undef, i32 %mul95.us.i, i32 0
%broadcast.splat68 = shufflevector <4 x i32> %broadcast.splatinsert67, <4 x i32> undef, <4 x i32> zeroinitializer
br label %vector.body
; Innermost loop: four lanes per iteration, with a scalar running sum in %vec.phi.
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.ind = phi <4 x i32> [ %induction, %vector.ph ], [ %vec.ind.next, %vector.body ]
%vec.phi = phi i32 [ %acc_0.05.us.i, %vector.ph ], [ %tmp89, %vector.body ]
; %input offsets: (splat(%add87.us.i44) + %vec.ind) * splat(%conv) + splat(%i_input_ch.014.i)
%tmp76 = add <4 x i32> %broadcast.splat, %vec.ind
%tmp77 = mul nsw <4 x i32> %tmp76, %broadcast.splat64
%tmp78 = add nsw <4 x i32> %tmp77, %broadcast.splat66
; %kernel offsets: splat(%conv) * (%vec.ind + splat(%mul95.us.i)) + splat(%i_ch_mult.010.i)
%tmp79 = add nsw <4 x i32> %vec.ind, %broadcast.splat68
%tmp80 = mul nsw <4 x i32> %broadcast.splat70, %tmp79
%tmp81 = add nsw <4 x i32> %tmp80, %broadcast.splat72
; Gather four i8 values from %input (all lanes enabled), sign-extend them and
; add splat(%conv).
%tmp82 = getelementptr inbounds i8, i8* %input, <4 x i32> %tmp78
%wide.masked.gather = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> %tmp82, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i8> undef)
%tmp83 = sext <4 x i8> %wide.masked.gather to <4 x i32>
%tmp84 = add nsw <4 x i32> %broadcast.splat74, %tmp83
; Gather four i8 values from %kernel (all lanes enabled) and sign-extend them.
%tmp85 = getelementptr inbounds i8, i8* %kernel, <4 x i32> %tmp81
%wide.masked.gather75 = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> %tmp85, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i8> undef)
%tmp86 = sext <4 x i8> %wide.masked.gather75 to <4 x i32>
; Lane-wise multiply, horizontal add, then accumulate into the running sum.
%tmp87 = mul nsw <4 x i32> %tmp84, %tmp86
%tmp88 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %tmp87)
%tmp89 = add i32 %tmp88, %vec.phi
%index.next = add i32 %index, 4
%vec.ind.next = add <4 x i32> %vec.ind, <i32 4, i32 4, i32 4, i32 4>
%tmp90 = icmp eq i32 %index.next, %n.vec
br i1 %tmp90, label %middle.block, label %vector.body
; Latch of the %i_ker_y loop.
middle.block: ; preds = %vector.body
%inc110.us.i = add nsw i32 %i_ker_y.06.us.i, 1
%cmp75.us.i = icmp slt i32 %inc110.us.i, 10
br i1 %cmp75.us.i, label %for.body78.us.i, label %for.cond.cleanup77.i
; Latch of the %i_ch_mult loop.
for.cond.cleanup77.i: ; preds = %middle.block
%inc128.i = add nsw i32 %i_out.311.i, 1
%inc131.i = add nuw nsw i32 %i_ch_mult.010.i, 1
%exitcond.i50 = icmp eq i32 %inc131.i, %conv
br i1 %exitcond.i50, label %for.cond.cleanup26.i, label %for.body27.i
; Only targeted by the constant-false branch in for.cond.cleanup9.i.
if.end: ; preds = %for.cond.cleanup9.i
ret i32 0
}
; Declarations of the intrinsics called by the test functions in this file.
; Masked gathers take (vector of pointers, alignment, lane mask, pass-through).
; Only the v4i16 and v4i8 variants are called in the two functions directly above.
declare <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*>, i32, <4 x i1>, <4 x i32>)
declare <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*>, i32, <4 x i1>, <4 x i16>)
; NOTE(review): this is the only gather declaration that spells out immarg on
; the alignment operand and that references an attribute group (#3); no
; definition of #3 is visible in this part of the file - confirm it is intended.
declare <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*>, i32 immarg, <4 x i1>, <4 x i8>) #3
; Horizontal integer add across the four lanes of a vector.
declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>)
; memset with an i8* destination and an i32 length.
declare void @llvm.memset.p0i8.i32(i8* align 2, i8, i32, i1)
; Masked scatter takes (value vector, vector of pointers, alignment, lane mask).
declare void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32>, <4 x i32*>, i32, <4 x i1>)