llvm-mirror/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll
David Green 67f2592469 [ARM][RegAlloc] Add t2LoopEndDec
We currently have problems with the way low overhead loops are
specified: LR can be spilled between the t2LoopDec and the t2LoopEnd,
forcing the entire loop to be reverted late in the backend. As the two
will eventually become a single instruction anyway, this patch
introduces t2LoopEndDec, the combination of the two, formed before
register allocation to make sure this cannot happen.
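Roughly, the combination looks like the following MIR sketch (operand
lists are simplified and illustrative, not the exact pseudo-instruction
signatures):

  ; before: decrement and branch are separate pseudos, so LR can be
  ; spilled between them and the whole loop has to be reverted
  $lr = t2LoopDec $lr, 1
  t2LoopEnd $lr, %bb.loop

  ; after: one terminator that both decrements and branches, formed
  ; before register allocation so nothing can be inserted between them
  $lr = t2LoopEndDec $lr, %bb.loop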

Unfortunately this instruction is a terminator that produces a value
(and also branches - it only produces the value around the branching
edge). So this needs some adjustment to phi elimination and the register
allocator to make sure that we do not spill this LR def around the loop
(which would need the spill to be placed after the terminator). We treat
the loop very carefully, making sure that there is nothing else, such as
calls, that would break its ability to use LR. For that, this adds an
isUnspillableTerminator hook to opt in to the new behaviour.

There is a chance that this could cause problems, so I have added an
escape option just in case. But I have not seen any problems in the
testing I have tried, and not reverting low overhead loops is important
for our performance. If this works out, we can hopefully do the same for
the t2WhileLoopStart and t2DoLoopStart instructions.

This patch also contains the code needed to convert or revert the
t2LoopEndDec in the backend (which just needs a subs; bne) and the
pre-ra code to create them.
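As a sketch of the final assembly (register choices illustrative), a
t2LoopEndDec either becomes the low overhead loop back-edge seen in the
CHECK lines below, or is reverted to a plain decrement and branch:

  ; converted: hardware low overhead loop
  le lr, .LBB0_12

  ; reverted: subs; bne
  subs lr, lr, #1
  bne .LBB0_12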

Differential Revision: https://reviews.llvm.org/D91358
2020-12-10 12:14:23 +00:00


; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp,+fp-armv8d16sp,+fp16,+fullfp16 %s -o - | FileCheck %s
define arm_aapcs_vfpcc void @float_float_mul(float* nocapture readonly %a, float* nocapture readonly %b, float* nocapture %c, i32 %N) {
; CHECK-LABEL: float_float_mul:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: push {r4, r5, r6, r7, lr}
; CHECK-NEXT: cmp r3, #0
; CHECK-NEXT: beq .LBB0_10
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: cmp r3, #3
; CHECK-NEXT: bhi .LBB0_3
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: b .LBB0_4
; CHECK-NEXT: .LBB0_3: @ %vector.memcheck
; CHECK-NEXT: add.w r5, r2, r3, lsl #2
; CHECK-NEXT: add.w r6, r1, r3, lsl #2
; CHECK-NEXT: cmp r5, r1
; CHECK-NEXT: add.w r4, r0, r3, lsl #2
; CHECK-NEXT: cset r7, hi
; CHECK-NEXT: cmp r6, r2
; CHECK-NEXT: cset r6, hi
; CHECK-NEXT: cmp r5, r0
; CHECK-NEXT: cset r5, hi
; CHECK-NEXT: cmp r4, r2
; CHECK-NEXT: cset r4, hi
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: ands r5, r4
; CHECK-NEXT: lsls r5, r5, #31
; CHECK-NEXT: itt eq
; CHECK-NEXT: andeq r7, r6
; CHECK-NEXT: lslseq.w r7, r7, #31
; CHECK-NEXT: beq .LBB0_11
; CHECK-NEXT: .LBB0_4: @ %for.body.preheader22
; CHECK-NEXT: mvn.w r7, r12
; CHECK-NEXT: adds r4, r7, r3
; CHECK-NEXT: and lr, r3, #3
; CHECK-NEXT: wls lr, lr, .LBB0_7
; CHECK-NEXT: @ %bb.5: @ %for.body.prol.preheader
; CHECK-NEXT: add.w r5, r0, r12, lsl #2
; CHECK-NEXT: add.w r6, r1, r12, lsl #2
; CHECK-NEXT: add.w r7, r2, r12, lsl #2
; CHECK-NEXT: .LBB0_6: @ %for.body.prol
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldr s0, [r6]
; CHECK-NEXT: adds r6, #4
; CHECK-NEXT: vldr s2, [r5]
; CHECK-NEXT: adds r5, #4
; CHECK-NEXT: add.w r12, r12, #1
; CHECK-NEXT: vmul.f32 s0, s2, s0
; CHECK-NEXT: vstr s0, [r7]
; CHECK-NEXT: adds r7, #4
; CHECK-NEXT: le lr, .LBB0_6
; CHECK-NEXT: .LBB0_7: @ %for.body.prol.loopexit
; CHECK-NEXT: cmp r4, #3
; CHECK-NEXT: blo .LBB0_10
; CHECK-NEXT: @ %bb.8: @ %for.body.preheader1
; CHECK-NEXT: sub.w r3, r3, r12
; CHECK-NEXT: lsl.w r12, r12, #2
; CHECK-NEXT: .LBB0_9: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: add.w r7, r1, r12
; CHECK-NEXT: add.w r6, r0, r12
; CHECK-NEXT: add.w r5, r2, r12
; CHECK-NEXT: adds r0, #16
; CHECK-NEXT: vldr s0, [r7]
; CHECK-NEXT: adds r1, #16
; CHECK-NEXT: vldr s2, [r6]
; CHECK-NEXT: adds r2, #16
; CHECK-NEXT: subs r3, #4
; CHECK-NEXT: vmul.f32 s0, s2, s0
; CHECK-NEXT: vstr s0, [r5]
; CHECK-NEXT: vldr s0, [r7, #4]
; CHECK-NEXT: vldr s2, [r6, #4]
; CHECK-NEXT: vmul.f32 s0, s2, s0
; CHECK-NEXT: vstr s0, [r5, #4]
; CHECK-NEXT: vldr s0, [r7, #8]
; CHECK-NEXT: vldr s2, [r6, #8]
; CHECK-NEXT: vmul.f32 s0, s2, s0
; CHECK-NEXT: vstr s0, [r5, #8]
; CHECK-NEXT: vldr s0, [r7, #12]
; CHECK-NEXT: vldr s2, [r6, #12]
; CHECK-NEXT: vmul.f32 s0, s2, s0
; CHECK-NEXT: vstr s0, [r5, #12]
; CHECK-NEXT: bne .LBB0_9
; CHECK-NEXT: .LBB0_10: @ %for.cond.cleanup
; CHECK-NEXT: pop {r4, r5, r6, r7, pc}
; CHECK-NEXT: .LBB0_11: @ %vector.ph
; CHECK-NEXT: bic r12, r3, #3
; CHECK-NEXT: movs r6, #1
; CHECK-NEXT: sub.w r7, r12, #4
; CHECK-NEXT: mov r4, r0
; CHECK-NEXT: mov r5, r1
; CHECK-NEXT: add.w lr, r6, r7, lsr #2
; CHECK-NEXT: mov r6, r2
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB0_12: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r5], #16
; CHECK-NEXT: vldrw.u32 q1, [r4], #16
; CHECK-NEXT: vmul.f32 q0, q1, q0
; CHECK-NEXT: vstrb.8 q0, [r6], #16
; CHECK-NEXT: le lr, .LBB0_12
; CHECK-NEXT: @ %bb.13: @ %middle.block
; CHECK-NEXT: cmp r12, r3
; CHECK-NEXT: bne .LBB0_4
; CHECK-NEXT: b .LBB0_10
entry:
%cmp8 = icmp eq i32 %N, 0
br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
for.body.preheader: ; preds = %entry
%min.iters.check = icmp ult i32 %N, 4
br i1 %min.iters.check, label %for.body.preheader22, label %vector.memcheck
for.body.preheader22: ; preds = %middle.block, %vector.memcheck, %for.body.preheader
%i.09.ph = phi i32 [ 0, %vector.memcheck ], [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
%0 = xor i32 %i.09.ph, -1
%1 = add i32 %0, %N
%xtraiter = and i32 %N, 3
%lcmp.mod = icmp eq i32 %xtraiter, 0
br i1 %lcmp.mod, label %for.body.prol.loopexit, label %for.body.prol
for.body.prol: ; preds = %for.body.preheader22, %for.body.prol
%i.09.prol = phi i32 [ %inc.prol, %for.body.prol ], [ %i.09.ph, %for.body.preheader22 ]
%prol.iter = phi i32 [ %prol.iter.sub, %for.body.prol ], [ %xtraiter, %for.body.preheader22 ]
%arrayidx.prol = getelementptr inbounds float, float* %a, i32 %i.09.prol
%2 = load float, float* %arrayidx.prol, align 4
%arrayidx1.prol = getelementptr inbounds float, float* %b, i32 %i.09.prol
%3 = load float, float* %arrayidx1.prol, align 4
%mul.prol = fmul float %2, %3
%arrayidx2.prol = getelementptr inbounds float, float* %c, i32 %i.09.prol
store float %mul.prol, float* %arrayidx2.prol, align 4
%inc.prol = add nuw i32 %i.09.prol, 1
%prol.iter.sub = add i32 %prol.iter, -1
%prol.iter.cmp = icmp eq i32 %prol.iter.sub, 0
br i1 %prol.iter.cmp, label %for.body.prol.loopexit, label %for.body.prol
for.body.prol.loopexit: ; preds = %for.body.prol, %for.body.preheader22
%i.09.unr = phi i32 [ %i.09.ph, %for.body.preheader22 ], [ %inc.prol, %for.body.prol ]
%4 = icmp ult i32 %1, 3
br i1 %4, label %for.cond.cleanup, label %for.body
vector.memcheck: ; preds = %for.body.preheader
%scevgep = getelementptr float, float* %c, i32 %N
%scevgep13 = getelementptr float, float* %a, i32 %N
%scevgep16 = getelementptr float, float* %b, i32 %N
%bound0 = icmp ugt float* %scevgep13, %c
%bound1 = icmp ugt float* %scevgep, %a
%found.conflict = and i1 %bound0, %bound1
%bound018 = icmp ugt float* %scevgep16, %c
%bound119 = icmp ugt float* %scevgep, %b
%found.conflict20 = and i1 %bound018, %bound119
%conflict.rdx = or i1 %found.conflict, %found.conflict20
br i1 %conflict.rdx, label %for.body.preheader22, label %vector.ph
vector.ph: ; preds = %vector.memcheck
%n.vec = and i32 %N, -4
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%5 = getelementptr inbounds float, float* %a, i32 %index
%6 = bitcast float* %5 to <4 x float>*
%wide.load = load <4 x float>, <4 x float>* %6, align 4
%7 = getelementptr inbounds float, float* %b, i32 %index
%8 = bitcast float* %7 to <4 x float>*
%wide.load21 = load <4 x float>, <4 x float>* %8, align 4
%9 = fmul <4 x float> %wide.load, %wide.load21
%10 = getelementptr inbounds float, float* %c, i32 %index
%11 = bitcast float* %10 to <4 x float>*
store <4 x float> %9, <4 x float>* %11, align 4
%index.next = add i32 %index, 4
%12 = icmp eq i32 %index.next, %n.vec
br i1 %12, label %middle.block, label %vector.body
middle.block: ; preds = %vector.body
%cmp.n = icmp eq i32 %n.vec, %N
br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader22
for.cond.cleanup: ; preds = %for.body.prol.loopexit, %for.body, %middle.block, %entry
ret void
for.body: ; preds = %for.body.prol.loopexit, %for.body
%i.09 = phi i32 [ %inc.3, %for.body ], [ %i.09.unr, %for.body.prol.loopexit ]
%arrayidx = getelementptr inbounds float, float* %a, i32 %i.09
%13 = load float, float* %arrayidx, align 4
%arrayidx1 = getelementptr inbounds float, float* %b, i32 %i.09
%14 = load float, float* %arrayidx1, align 4
%mul = fmul float %13, %14
%arrayidx2 = getelementptr inbounds float, float* %c, i32 %i.09
store float %mul, float* %arrayidx2, align 4
%inc = add nuw i32 %i.09, 1
%arrayidx.1 = getelementptr inbounds float, float* %a, i32 %inc
%15 = load float, float* %arrayidx.1, align 4
%arrayidx1.1 = getelementptr inbounds float, float* %b, i32 %inc
%16 = load float, float* %arrayidx1.1, align 4
%mul.1 = fmul float %15, %16
%arrayidx2.1 = getelementptr inbounds float, float* %c, i32 %inc
store float %mul.1, float* %arrayidx2.1, align 4
%inc.1 = add nuw i32 %i.09, 2
%arrayidx.2 = getelementptr inbounds float, float* %a, i32 %inc.1
%17 = load float, float* %arrayidx.2, align 4
%arrayidx1.2 = getelementptr inbounds float, float* %b, i32 %inc.1
%18 = load float, float* %arrayidx1.2, align 4
%mul.2 = fmul float %17, %18
%arrayidx2.2 = getelementptr inbounds float, float* %c, i32 %inc.1
store float %mul.2, float* %arrayidx2.2, align 4
%inc.2 = add nuw i32 %i.09, 3
%arrayidx.3 = getelementptr inbounds float, float* %a, i32 %inc.2
%19 = load float, float* %arrayidx.3, align 4
%arrayidx1.3 = getelementptr inbounds float, float* %b, i32 %inc.2
%20 = load float, float* %arrayidx1.3, align 4
%mul.3 = fmul float %19, %20
%arrayidx2.3 = getelementptr inbounds float, float* %c, i32 %inc.2
store float %mul.3, float* %arrayidx2.3, align 4
%inc.3 = add nuw i32 %i.09, 4
%exitcond.3 = icmp eq i32 %inc.3, %N
br i1 %exitcond.3, label %for.cond.cleanup, label %for.body
}
define arm_aapcs_vfpcc void @float_float_add(float* nocapture readonly %a, float* nocapture readonly %b, float* nocapture %c, i32 %N) {
; CHECK-LABEL: float_float_add:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: push {r4, r5, r6, r7, lr}
; CHECK-NEXT: cmp r3, #0
; CHECK-NEXT: beq .LBB1_10
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: cmp r3, #3
; CHECK-NEXT: bhi .LBB1_3
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: b .LBB1_4
; CHECK-NEXT: .LBB1_3: @ %vector.memcheck
; CHECK-NEXT: add.w r5, r2, r3, lsl #2
; CHECK-NEXT: add.w r6, r1, r3, lsl #2
; CHECK-NEXT: cmp r5, r1
; CHECK-NEXT: add.w r4, r0, r3, lsl #2
; CHECK-NEXT: cset r7, hi
; CHECK-NEXT: cmp r6, r2
; CHECK-NEXT: cset r6, hi
; CHECK-NEXT: cmp r5, r0
; CHECK-NEXT: cset r5, hi
; CHECK-NEXT: cmp r4, r2
; CHECK-NEXT: cset r4, hi
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: ands r5, r4
; CHECK-NEXT: lsls r5, r5, #31
; CHECK-NEXT: itt eq
; CHECK-NEXT: andeq r7, r6
; CHECK-NEXT: lslseq.w r7, r7, #31
; CHECK-NEXT: beq .LBB1_11
; CHECK-NEXT: .LBB1_4: @ %for.body.preheader22
; CHECK-NEXT: mvn.w r7, r12
; CHECK-NEXT: adds r4, r7, r3
; CHECK-NEXT: and lr, r3, #3
; CHECK-NEXT: wls lr, lr, .LBB1_7
; CHECK-NEXT: @ %bb.5: @ %for.body.prol.preheader
; CHECK-NEXT: add.w r5, r0, r12, lsl #2
; CHECK-NEXT: add.w r6, r1, r12, lsl #2
; CHECK-NEXT: add.w r7, r2, r12, lsl #2
; CHECK-NEXT: .LBB1_6: @ %for.body.prol
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldr s0, [r6]
; CHECK-NEXT: adds r6, #4
; CHECK-NEXT: vldr s2, [r5]
; CHECK-NEXT: adds r5, #4
; CHECK-NEXT: add.w r12, r12, #1
; CHECK-NEXT: vadd.f32 s0, s2, s0
; CHECK-NEXT: vstr s0, [r7]
; CHECK-NEXT: adds r7, #4
; CHECK-NEXT: le lr, .LBB1_6
; CHECK-NEXT: .LBB1_7: @ %for.body.prol.loopexit
; CHECK-NEXT: cmp r4, #3
; CHECK-NEXT: blo .LBB1_10
; CHECK-NEXT: @ %bb.8: @ %for.body.preheader1
; CHECK-NEXT: sub.w r3, r3, r12
; CHECK-NEXT: lsl.w r12, r12, #2
; CHECK-NEXT: .LBB1_9: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: add.w r7, r1, r12
; CHECK-NEXT: add.w r6, r0, r12
; CHECK-NEXT: add.w r5, r2, r12
; CHECK-NEXT: adds r0, #16
; CHECK-NEXT: vldr s0, [r7]
; CHECK-NEXT: adds r1, #16
; CHECK-NEXT: vldr s2, [r6]
; CHECK-NEXT: adds r2, #16
; CHECK-NEXT: subs r3, #4
; CHECK-NEXT: vadd.f32 s0, s2, s0
; CHECK-NEXT: vstr s0, [r5]
; CHECK-NEXT: vldr s0, [r7, #4]
; CHECK-NEXT: vldr s2, [r6, #4]
; CHECK-NEXT: vadd.f32 s0, s2, s0
; CHECK-NEXT: vstr s0, [r5, #4]
; CHECK-NEXT: vldr s0, [r7, #8]
; CHECK-NEXT: vldr s2, [r6, #8]
; CHECK-NEXT: vadd.f32 s0, s2, s0
; CHECK-NEXT: vstr s0, [r5, #8]
; CHECK-NEXT: vldr s0, [r7, #12]
; CHECK-NEXT: vldr s2, [r6, #12]
; CHECK-NEXT: vadd.f32 s0, s2, s0
; CHECK-NEXT: vstr s0, [r5, #12]
; CHECK-NEXT: bne .LBB1_9
; CHECK-NEXT: .LBB1_10: @ %for.cond.cleanup
; CHECK-NEXT: pop {r4, r5, r6, r7, pc}
; CHECK-NEXT: .LBB1_11: @ %vector.ph
; CHECK-NEXT: bic r12, r3, #3
; CHECK-NEXT: movs r6, #1
; CHECK-NEXT: sub.w r7, r12, #4
; CHECK-NEXT: mov r4, r0
; CHECK-NEXT: mov r5, r1
; CHECK-NEXT: add.w lr, r6, r7, lsr #2
; CHECK-NEXT: mov r6, r2
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB1_12: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r5], #16
; CHECK-NEXT: vldrw.u32 q1, [r4], #16
; CHECK-NEXT: vadd.f32 q0, q1, q0
; CHECK-NEXT: vstrb.8 q0, [r6], #16
; CHECK-NEXT: le lr, .LBB1_12
; CHECK-NEXT: @ %bb.13: @ %middle.block
; CHECK-NEXT: cmp r12, r3
; CHECK-NEXT: bne .LBB1_4
; CHECK-NEXT: b .LBB1_10
entry:
%cmp8 = icmp eq i32 %N, 0
br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
for.body.preheader: ; preds = %entry
%min.iters.check = icmp ult i32 %N, 4
br i1 %min.iters.check, label %for.body.preheader22, label %vector.memcheck
for.body.preheader22: ; preds = %middle.block, %vector.memcheck, %for.body.preheader
%i.09.ph = phi i32 [ 0, %vector.memcheck ], [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
%0 = xor i32 %i.09.ph, -1
%1 = add i32 %0, %N
%xtraiter = and i32 %N, 3
%lcmp.mod = icmp eq i32 %xtraiter, 0
br i1 %lcmp.mod, label %for.body.prol.loopexit, label %for.body.prol
for.body.prol: ; preds = %for.body.preheader22, %for.body.prol
%i.09.prol = phi i32 [ %inc.prol, %for.body.prol ], [ %i.09.ph, %for.body.preheader22 ]
%prol.iter = phi i32 [ %prol.iter.sub, %for.body.prol ], [ %xtraiter, %for.body.preheader22 ]
%arrayidx.prol = getelementptr inbounds float, float* %a, i32 %i.09.prol
%2 = load float, float* %arrayidx.prol, align 4
%arrayidx1.prol = getelementptr inbounds float, float* %b, i32 %i.09.prol
%3 = load float, float* %arrayidx1.prol, align 4
%add.prol = fadd float %2, %3
%arrayidx2.prol = getelementptr inbounds float, float* %c, i32 %i.09.prol
store float %add.prol, float* %arrayidx2.prol, align 4
%inc.prol = add nuw i32 %i.09.prol, 1
%prol.iter.sub = add i32 %prol.iter, -1
%prol.iter.cmp = icmp eq i32 %prol.iter.sub, 0
br i1 %prol.iter.cmp, label %for.body.prol.loopexit, label %for.body.prol
for.body.prol.loopexit: ; preds = %for.body.prol, %for.body.preheader22
%i.09.unr = phi i32 [ %i.09.ph, %for.body.preheader22 ], [ %inc.prol, %for.body.prol ]
%4 = icmp ult i32 %1, 3
br i1 %4, label %for.cond.cleanup, label %for.body
vector.memcheck: ; preds = %for.body.preheader
%scevgep = getelementptr float, float* %c, i32 %N
%scevgep13 = getelementptr float, float* %a, i32 %N
%scevgep16 = getelementptr float, float* %b, i32 %N
%bound0 = icmp ugt float* %scevgep13, %c
%bound1 = icmp ugt float* %scevgep, %a
%found.conflict = and i1 %bound0, %bound1
%bound018 = icmp ugt float* %scevgep16, %c
%bound119 = icmp ugt float* %scevgep, %b
%found.conflict20 = and i1 %bound018, %bound119
%conflict.rdx = or i1 %found.conflict, %found.conflict20
br i1 %conflict.rdx, label %for.body.preheader22, label %vector.ph
vector.ph: ; preds = %vector.memcheck
%n.vec = and i32 %N, -4
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%5 = getelementptr inbounds float, float* %a, i32 %index
%6 = bitcast float* %5 to <4 x float>*
%wide.load = load <4 x float>, <4 x float>* %6, align 4
%7 = getelementptr inbounds float, float* %b, i32 %index
%8 = bitcast float* %7 to <4 x float>*
%wide.load21 = load <4 x float>, <4 x float>* %8, align 4
%9 = fadd <4 x float> %wide.load, %wide.load21
%10 = getelementptr inbounds float, float* %c, i32 %index
%11 = bitcast float* %10 to <4 x float>*
store <4 x float> %9, <4 x float>* %11, align 4
%index.next = add i32 %index, 4
%12 = icmp eq i32 %index.next, %n.vec
br i1 %12, label %middle.block, label %vector.body
middle.block: ; preds = %vector.body
%cmp.n = icmp eq i32 %n.vec, %N
br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader22
for.cond.cleanup: ; preds = %for.body.prol.loopexit, %for.body, %middle.block, %entry
ret void
for.body: ; preds = %for.body.prol.loopexit, %for.body
%i.09 = phi i32 [ %inc.3, %for.body ], [ %i.09.unr, %for.body.prol.loopexit ]
%arrayidx = getelementptr inbounds float, float* %a, i32 %i.09
%13 = load float, float* %arrayidx, align 4
%arrayidx1 = getelementptr inbounds float, float* %b, i32 %i.09
%14 = load float, float* %arrayidx1, align 4
%add = fadd float %13, %14
%arrayidx2 = getelementptr inbounds float, float* %c, i32 %i.09
store float %add, float* %arrayidx2, align 4
%inc = add nuw i32 %i.09, 1
%arrayidx.1 = getelementptr inbounds float, float* %a, i32 %inc
%15 = load float, float* %arrayidx.1, align 4
%arrayidx1.1 = getelementptr inbounds float, float* %b, i32 %inc
%16 = load float, float* %arrayidx1.1, align 4
%add.1 = fadd float %15, %16
%arrayidx2.1 = getelementptr inbounds float, float* %c, i32 %inc
store float %add.1, float* %arrayidx2.1, align 4
%inc.1 = add nuw i32 %i.09, 2
%arrayidx.2 = getelementptr inbounds float, float* %a, i32 %inc.1
%17 = load float, float* %arrayidx.2, align 4
%arrayidx1.2 = getelementptr inbounds float, float* %b, i32 %inc.1
%18 = load float, float* %arrayidx1.2, align 4
%add.2 = fadd float %17, %18
%arrayidx2.2 = getelementptr inbounds float, float* %c, i32 %inc.1
store float %add.2, float* %arrayidx2.2, align 4
%inc.2 = add nuw i32 %i.09, 3
%arrayidx.3 = getelementptr inbounds float, float* %a, i32 %inc.2
%19 = load float, float* %arrayidx.3, align 4
%arrayidx1.3 = getelementptr inbounds float, float* %b, i32 %inc.2
%20 = load float, float* %arrayidx1.3, align 4
%add.3 = fadd float %19, %20
%arrayidx2.3 = getelementptr inbounds float, float* %c, i32 %inc.2
store float %add.3, float* %arrayidx2.3, align 4
%inc.3 = add nuw i32 %i.09, 4
%exitcond.3 = icmp eq i32 %inc.3, %N
br i1 %exitcond.3, label %for.cond.cleanup, label %for.body
}
define arm_aapcs_vfpcc void @float_float_sub(float* nocapture readonly %a, float* nocapture readonly %b, float* nocapture %c, i32 %N) {
; CHECK-LABEL: float_float_sub:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: push {r4, r5, r6, r7, lr}
; CHECK-NEXT: cmp r3, #0
; CHECK-NEXT: beq .LBB2_10
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: cmp r3, #3
; CHECK-NEXT: bhi .LBB2_3
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: b .LBB2_4
; CHECK-NEXT: .LBB2_3: @ %vector.memcheck
; CHECK-NEXT: add.w r5, r2, r3, lsl #2
; CHECK-NEXT: add.w r6, r1, r3, lsl #2
; CHECK-NEXT: cmp r5, r1
; CHECK-NEXT: add.w r4, r0, r3, lsl #2
; CHECK-NEXT: cset r7, hi
; CHECK-NEXT: cmp r6, r2
; CHECK-NEXT: cset r6, hi
; CHECK-NEXT: cmp r5, r0
; CHECK-NEXT: cset r5, hi
; CHECK-NEXT: cmp r4, r2
; CHECK-NEXT: cset r4, hi
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: ands r5, r4
; CHECK-NEXT: lsls r5, r5, #31
; CHECK-NEXT: itt eq
; CHECK-NEXT: andeq r7, r6
; CHECK-NEXT: lslseq.w r7, r7, #31
; CHECK-NEXT: beq .LBB2_11
; CHECK-NEXT: .LBB2_4: @ %for.body.preheader22
; CHECK-NEXT: mvn.w r7, r12
; CHECK-NEXT: adds r4, r7, r3
; CHECK-NEXT: and lr, r3, #3
; CHECK-NEXT: wls lr, lr, .LBB2_7
; CHECK-NEXT: @ %bb.5: @ %for.body.prol.preheader
; CHECK-NEXT: add.w r5, r0, r12, lsl #2
; CHECK-NEXT: add.w r6, r1, r12, lsl #2
; CHECK-NEXT: add.w r7, r2, r12, lsl #2
; CHECK-NEXT: .LBB2_6: @ %for.body.prol
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldr s0, [r6]
; CHECK-NEXT: adds r6, #4
; CHECK-NEXT: vldr s2, [r5]
; CHECK-NEXT: adds r5, #4
; CHECK-NEXT: add.w r12, r12, #1
; CHECK-NEXT: vsub.f32 s0, s2, s0
; CHECK-NEXT: vstr s0, [r7]
; CHECK-NEXT: adds r7, #4
; CHECK-NEXT: le lr, .LBB2_6
; CHECK-NEXT: .LBB2_7: @ %for.body.prol.loopexit
; CHECK-NEXT: cmp r4, #3
; CHECK-NEXT: blo .LBB2_10
; CHECK-NEXT: @ %bb.8: @ %for.body.preheader1
; CHECK-NEXT: sub.w r3, r3, r12
; CHECK-NEXT: lsl.w r12, r12, #2
; CHECK-NEXT: .LBB2_9: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: add.w r7, r1, r12
; CHECK-NEXT: add.w r6, r0, r12
; CHECK-NEXT: add.w r5, r2, r12
; CHECK-NEXT: adds r0, #16
; CHECK-NEXT: vldr s0, [r7]
; CHECK-NEXT: adds r1, #16
; CHECK-NEXT: vldr s2, [r6]
; CHECK-NEXT: adds r2, #16
; CHECK-NEXT: subs r3, #4
; CHECK-NEXT: vsub.f32 s0, s2, s0
; CHECK-NEXT: vstr s0, [r5]
; CHECK-NEXT: vldr s0, [r7, #4]
; CHECK-NEXT: vldr s2, [r6, #4]
; CHECK-NEXT: vsub.f32 s0, s2, s0
; CHECK-NEXT: vstr s0, [r5, #4]
; CHECK-NEXT: vldr s0, [r7, #8]
; CHECK-NEXT: vldr s2, [r6, #8]
; CHECK-NEXT: vsub.f32 s0, s2, s0
; CHECK-NEXT: vstr s0, [r5, #8]
; CHECK-NEXT: vldr s0, [r7, #12]
; CHECK-NEXT: vldr s2, [r6, #12]
; CHECK-NEXT: vsub.f32 s0, s2, s0
; CHECK-NEXT: vstr s0, [r5, #12]
; CHECK-NEXT: bne .LBB2_9
; CHECK-NEXT: .LBB2_10: @ %for.cond.cleanup
; CHECK-NEXT: pop {r4, r5, r6, r7, pc}
; CHECK-NEXT: .LBB2_11: @ %vector.ph
; CHECK-NEXT: bic r12, r3, #3
; CHECK-NEXT: movs r6, #1
; CHECK-NEXT: sub.w r7, r12, #4
; CHECK-NEXT: mov r4, r0
; CHECK-NEXT: mov r5, r1
; CHECK-NEXT: add.w lr, r6, r7, lsr #2
; CHECK-NEXT: mov r6, r2
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB2_12: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r5], #16
; CHECK-NEXT: vldrw.u32 q1, [r4], #16
; CHECK-NEXT: vsub.f32 q0, q1, q0
; CHECK-NEXT: vstrb.8 q0, [r6], #16
; CHECK-NEXT: le lr, .LBB2_12
; CHECK-NEXT: @ %bb.13: @ %middle.block
; CHECK-NEXT: cmp r12, r3
; CHECK-NEXT: bne .LBB2_4
; CHECK-NEXT: b .LBB2_10
entry:
%cmp8 = icmp eq i32 %N, 0
br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
for.body.preheader: ; preds = %entry
%min.iters.check = icmp ult i32 %N, 4
br i1 %min.iters.check, label %for.body.preheader22, label %vector.memcheck
for.body.preheader22: ; preds = %middle.block, %vector.memcheck, %for.body.preheader
%i.09.ph = phi i32 [ 0, %vector.memcheck ], [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
%0 = xor i32 %i.09.ph, -1
%1 = add i32 %0, %N
%xtraiter = and i32 %N, 3
%lcmp.mod = icmp eq i32 %xtraiter, 0
br i1 %lcmp.mod, label %for.body.prol.loopexit, label %for.body.prol
for.body.prol: ; preds = %for.body.preheader22, %for.body.prol
%i.09.prol = phi i32 [ %inc.prol, %for.body.prol ], [ %i.09.ph, %for.body.preheader22 ]
%prol.iter = phi i32 [ %prol.iter.sub, %for.body.prol ], [ %xtraiter, %for.body.preheader22 ]
%arrayidx.prol = getelementptr inbounds float, float* %a, i32 %i.09.prol
%2 = load float, float* %arrayidx.prol, align 4
%arrayidx1.prol = getelementptr inbounds float, float* %b, i32 %i.09.prol
%3 = load float, float* %arrayidx1.prol, align 4
%sub.prol = fsub float %2, %3
%arrayidx2.prol = getelementptr inbounds float, float* %c, i32 %i.09.prol
store float %sub.prol, float* %arrayidx2.prol, align 4
%inc.prol = add nuw i32 %i.09.prol, 1
%prol.iter.sub = add i32 %prol.iter, -1
%prol.iter.cmp = icmp eq i32 %prol.iter.sub, 0
br i1 %prol.iter.cmp, label %for.body.prol.loopexit, label %for.body.prol
for.body.prol.loopexit: ; preds = %for.body.prol, %for.body.preheader22
%i.09.unr = phi i32 [ %i.09.ph, %for.body.preheader22 ], [ %inc.prol, %for.body.prol ]
%4 = icmp ult i32 %1, 3
br i1 %4, label %for.cond.cleanup, label %for.body
vector.memcheck: ; preds = %for.body.preheader
%scevgep = getelementptr float, float* %c, i32 %N
%scevgep13 = getelementptr float, float* %a, i32 %N
%scevgep16 = getelementptr float, float* %b, i32 %N
%bound0 = icmp ugt float* %scevgep13, %c
%bound1 = icmp ugt float* %scevgep, %a
%found.conflict = and i1 %bound0, %bound1
%bound018 = icmp ugt float* %scevgep16, %c
%bound119 = icmp ugt float* %scevgep, %b
%found.conflict20 = and i1 %bound018, %bound119
%conflict.rdx = or i1 %found.conflict, %found.conflict20
br i1 %conflict.rdx, label %for.body.preheader22, label %vector.ph
vector.ph: ; preds = %vector.memcheck
%n.vec = and i32 %N, -4
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%5 = getelementptr inbounds float, float* %a, i32 %index
%6 = bitcast float* %5 to <4 x float>*
%wide.load = load <4 x float>, <4 x float>* %6, align 4
%7 = getelementptr inbounds float, float* %b, i32 %index
%8 = bitcast float* %7 to <4 x float>*
%wide.load21 = load <4 x float>, <4 x float>* %8, align 4
%9 = fsub <4 x float> %wide.load, %wide.load21
%10 = getelementptr inbounds float, float* %c, i32 %index
%11 = bitcast float* %10 to <4 x float>*
store <4 x float> %9, <4 x float>* %11, align 4
%index.next = add i32 %index, 4
%12 = icmp eq i32 %index.next, %n.vec
br i1 %12, label %middle.block, label %vector.body
middle.block: ; preds = %vector.body
%cmp.n = icmp eq i32 %n.vec, %N
br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader22
for.cond.cleanup: ; preds = %for.body.prol.loopexit, %for.body, %middle.block, %entry
ret void
for.body: ; preds = %for.body.prol.loopexit, %for.body
%i.09 = phi i32 [ %inc.3, %for.body ], [ %i.09.unr, %for.body.prol.loopexit ]
%arrayidx = getelementptr inbounds float, float* %a, i32 %i.09
%13 = load float, float* %arrayidx, align 4
%arrayidx1 = getelementptr inbounds float, float* %b, i32 %i.09
%14 = load float, float* %arrayidx1, align 4
%sub = fsub float %13, %14
%arrayidx2 = getelementptr inbounds float, float* %c, i32 %i.09
store float %sub, float* %arrayidx2, align 4
%inc = add nuw i32 %i.09, 1
%arrayidx.1 = getelementptr inbounds float, float* %a, i32 %inc
%15 = load float, float* %arrayidx.1, align 4
%arrayidx1.1 = getelementptr inbounds float, float* %b, i32 %inc
%16 = load float, float* %arrayidx1.1, align 4
%sub.1 = fsub float %15, %16
%arrayidx2.1 = getelementptr inbounds float, float* %c, i32 %inc
store float %sub.1, float* %arrayidx2.1, align 4
%inc.1 = add nuw i32 %i.09, 2
%arrayidx.2 = getelementptr inbounds float, float* %a, i32 %inc.1
%17 = load float, float* %arrayidx.2, align 4
%arrayidx1.2 = getelementptr inbounds float, float* %b, i32 %inc.1
%18 = load float, float* %arrayidx1.2, align 4
%sub.2 = fsub float %17, %18
%arrayidx2.2 = getelementptr inbounds float, float* %c, i32 %inc.1
store float %sub.2, float* %arrayidx2.2, align 4
%inc.2 = add nuw i32 %i.09, 3
%arrayidx.3 = getelementptr inbounds float, float* %a, i32 %inc.2
%19 = load float, float* %arrayidx.3, align 4
%arrayidx1.3 = getelementptr inbounds float, float* %b, i32 %inc.2
%20 = load float, float* %arrayidx1.3, align 4
%sub.3 = fsub float %19, %20
%arrayidx2.3 = getelementptr inbounds float, float* %c, i32 %inc.2
store float %sub.3, float* %arrayidx2.3, align 4
%inc.3 = add nuw i32 %i.09, 4
%exitcond.3 = icmp eq i32 %inc.3, %N
br i1 %exitcond.3, label %for.cond.cleanup, label %for.body
}
define arm_aapcs_vfpcc void @float_int_mul(float* nocapture readonly %a, i32* nocapture readonly %b, float* nocapture %c, i32 %N) {
; CHECK-LABEL: float_int_mul:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr}
; CHECK-NEXT: cmp r3, #0
; CHECK-NEXT: beq.w .LBB3_13
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: cmp r3, #3
; CHECK-NEXT: bls .LBB3_6
; CHECK-NEXT: @ %bb.2: @ %vector.memcheck
; CHECK-NEXT: add.w r7, r0, r3, lsl #2
; CHECK-NEXT: cmp r7, r2
; CHECK-NEXT: itt hi
; CHECK-NEXT: addhi.w r7, r2, r3, lsl #2
; CHECK-NEXT: cmphi r7, r0
; CHECK-NEXT: bhi .LBB3_6
; CHECK-NEXT: @ %bb.3: @ %vector.ph
; CHECK-NEXT: bic r12, r3, #3
; CHECK-NEXT: movs r6, #1
; CHECK-NEXT: sub.w r7, r12, #4
; CHECK-NEXT: mov r4, r0
; CHECK-NEXT: mov r5, r1
; CHECK-NEXT: add.w lr, r6, r7, lsr #2
; CHECK-NEXT: mov r6, r2
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB3_4: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r5], #16
; CHECK-NEXT: vldrw.u32 q1, [r4], #16
; CHECK-NEXT: vcvt.f32.s32 q0, q0
; CHECK-NEXT: vmul.f32 q0, q1, q0
; CHECK-NEXT: vstrb.8 q0, [r6], #16
; CHECK-NEXT: le lr, .LBB3_4
; CHECK-NEXT: @ %bb.5: @ %middle.block
; CHECK-NEXT: cmp r12, r3
; CHECK-NEXT: bne .LBB3_7
; CHECK-NEXT: b .LBB3_13
; CHECK-NEXT: .LBB3_6:
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: .LBB3_7: @ %for.body.preheader16
; CHECK-NEXT: mvn.w r7, r12
; CHECK-NEXT: add.w r8, r7, r3
; CHECK-NEXT: and lr, r3, #3
; CHECK-NEXT: wls lr, lr, .LBB3_10
; CHECK-NEXT: @ %bb.8: @ %for.body.prol.preheader
; CHECK-NEXT: add.w r5, r0, r12, lsl #2
; CHECK-NEXT: add.w r6, r1, r12, lsl #2
; CHECK-NEXT: add.w r7, r2, r12, lsl #2
; CHECK-NEXT: .LBB3_9: @ %for.body.prol
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr r4, [r6], #4
; CHECK-NEXT: add.w r12, r12, #1
; CHECK-NEXT: vldr s2, [r5]
; CHECK-NEXT: adds r5, #4
; CHECK-NEXT: vmov s0, r4
; CHECK-NEXT: vcvt.f32.s32 s0, s0
; CHECK-NEXT: vmul.f32 s0, s2, s0
; CHECK-NEXT: vstr s0, [r7]
; CHECK-NEXT: adds r7, #4
; CHECK-NEXT: le lr, .LBB3_9
; CHECK-NEXT: .LBB3_10: @ %for.body.prol.loopexit
; CHECK-NEXT: cmp.w r8, #3
; CHECK-NEXT: blo .LBB3_13
; CHECK-NEXT: @ %bb.11: @ %for.body.preheader1
; CHECK-NEXT: add.w r1, r1, r12, lsl #2
; CHECK-NEXT: sub.w r3, r3, r12
; CHECK-NEXT: adds r1, #8
; CHECK-NEXT: lsl.w r12, r12, #2
; CHECK-NEXT: .LBB3_12: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldr s0, [r1, #-8]
; CHECK-NEXT: add.w r7, r0, r12
; CHECK-NEXT: add.w r6, r2, r12
; CHECK-NEXT: adds r0, #16
; CHECK-NEXT: vcvt.f32.s32 s0, s0
; CHECK-NEXT: vldr s2, [r7]
; CHECK-NEXT: adds r2, #16
; CHECK-NEXT: subs r3, #4
; CHECK-NEXT: vmul.f32 s0, s2, s0
; CHECK-NEXT: vstr s0, [r6]
; CHECK-NEXT: vldr s0, [r1, #-4]
; CHECK-NEXT: vldr s2, [r7, #4]
; CHECK-NEXT: vcvt.f32.s32 s0, s0
; CHECK-NEXT: vmul.f32 s0, s2, s0
; CHECK-NEXT: vstr s0, [r6, #4]
; CHECK-NEXT: vldr s0, [r1]
; CHECK-NEXT: vldr s2, [r7, #8]
; CHECK-NEXT: vcvt.f32.s32 s0, s0
; CHECK-NEXT: vmul.f32 s0, s2, s0
; CHECK-NEXT: vstr s0, [r6, #8]
; CHECK-NEXT: vldr s0, [r1, #4]
; CHECK-NEXT: add.w r1, r1, #16
; CHECK-NEXT: vldr s2, [r7, #12]
; CHECK-NEXT: vcvt.f32.s32 s0, s0
; CHECK-NEXT: vmul.f32 s0, s2, s0
; CHECK-NEXT: vstr s0, [r6, #12]
; CHECK-NEXT: bne .LBB3_12
; CHECK-NEXT: .LBB3_13: @ %for.cond.cleanup
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc}
entry:
%cmp8 = icmp eq i32 %N, 0
br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
for.body.preheader: ; preds = %entry
%min.iters.check = icmp ult i32 %N, 4
br i1 %min.iters.check, label %for.body.preheader16, label %vector.memcheck
for.body.preheader16: ; preds = %middle.block, %vector.memcheck, %for.body.preheader
%i.09.ph = phi i32 [ 0, %vector.memcheck ], [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
%0 = xor i32 %i.09.ph, -1
%1 = add i32 %0, %N
%xtraiter = and i32 %N, 3
%lcmp.mod = icmp eq i32 %xtraiter, 0
br i1 %lcmp.mod, label %for.body.prol.loopexit, label %for.body.prol
for.body.prol: ; preds = %for.body.preheader16, %for.body.prol
%i.09.prol = phi i32 [ %inc.prol, %for.body.prol ], [ %i.09.ph, %for.body.preheader16 ]
%prol.iter = phi i32 [ %prol.iter.sub, %for.body.prol ], [ %xtraiter, %for.body.preheader16 ]
%arrayidx.prol = getelementptr inbounds float, float* %a, i32 %i.09.prol
%2 = load float, float* %arrayidx.prol, align 4
%arrayidx1.prol = getelementptr inbounds i32, i32* %b, i32 %i.09.prol
%3 = load i32, i32* %arrayidx1.prol, align 4
%conv.prol = sitofp i32 %3 to float
%mul.prol = fmul float %2, %conv.prol
%arrayidx2.prol = getelementptr inbounds float, float* %c, i32 %i.09.prol
store float %mul.prol, float* %arrayidx2.prol, align 4
%inc.prol = add nuw i32 %i.09.prol, 1
%prol.iter.sub = add i32 %prol.iter, -1
%prol.iter.cmp = icmp eq i32 %prol.iter.sub, 0
br i1 %prol.iter.cmp, label %for.body.prol.loopexit, label %for.body.prol
for.body.prol.loopexit: ; preds = %for.body.prol, %for.body.preheader16
%i.09.unr = phi i32 [ %i.09.ph, %for.body.preheader16 ], [ %inc.prol, %for.body.prol ]
%4 = icmp ult i32 %1, 3
br i1 %4, label %for.cond.cleanup, label %for.body
vector.memcheck: ; preds = %for.body.preheader
%scevgep = getelementptr float, float* %c, i32 %N
%scevgep13 = getelementptr float, float* %a, i32 %N
%bound0 = icmp ugt float* %scevgep13, %c
%bound1 = icmp ugt float* %scevgep, %a
%found.conflict = and i1 %bound0, %bound1
br i1 %found.conflict, label %for.body.preheader16, label %vector.ph
vector.ph: ; preds = %vector.memcheck
%n.vec = and i32 %N, -4
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%5 = getelementptr inbounds float, float* %a, i32 %index
%6 = bitcast float* %5 to <4 x float>*
%wide.load = load <4 x float>, <4 x float>* %6, align 4
%7 = getelementptr inbounds i32, i32* %b, i32 %index
%8 = bitcast i32* %7 to <4 x i32>*
%wide.load15 = load <4 x i32>, <4 x i32>* %8, align 4
%9 = sitofp <4 x i32> %wide.load15 to <4 x float>
%10 = fmul <4 x float> %wide.load, %9
%11 = getelementptr inbounds float, float* %c, i32 %index
%12 = bitcast float* %11 to <4 x float>*
store <4 x float> %10, <4 x float>* %12, align 4
%index.next = add i32 %index, 4
%13 = icmp eq i32 %index.next, %n.vec
br i1 %13, label %middle.block, label %vector.body
middle.block: ; preds = %vector.body
%cmp.n = icmp eq i32 %n.vec, %N
br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader16
for.cond.cleanup: ; preds = %for.body.prol.loopexit, %for.body, %middle.block, %entry
ret void
for.body: ; preds = %for.body.prol.loopexit, %for.body
%i.09 = phi i32 [ %inc.3, %for.body ], [ %i.09.unr, %for.body.prol.loopexit ]
%arrayidx = getelementptr inbounds float, float* %a, i32 %i.09
%14 = load float, float* %arrayidx, align 4
%arrayidx1 = getelementptr inbounds i32, i32* %b, i32 %i.09
%15 = load i32, i32* %arrayidx1, align 4
%conv = sitofp i32 %15 to float
%mul = fmul float %14, %conv
%arrayidx2 = getelementptr inbounds float, float* %c, i32 %i.09
store float %mul, float* %arrayidx2, align 4
%inc = add nuw i32 %i.09, 1
%arrayidx.1 = getelementptr inbounds float, float* %a, i32 %inc
%16 = load float, float* %arrayidx.1, align 4
%arrayidx1.1 = getelementptr inbounds i32, i32* %b, i32 %inc
%17 = load i32, i32* %arrayidx1.1, align 4
%conv.1 = sitofp i32 %17 to float
%mul.1 = fmul float %16, %conv.1
%arrayidx2.1 = getelementptr inbounds float, float* %c, i32 %inc
store float %mul.1, float* %arrayidx2.1, align 4
%inc.1 = add nuw i32 %i.09, 2
%arrayidx.2 = getelementptr inbounds float, float* %a, i32 %inc.1
%18 = load float, float* %arrayidx.2, align 4
%arrayidx1.2 = getelementptr inbounds i32, i32* %b, i32 %inc.1
%19 = load i32, i32* %arrayidx1.2, align 4
%conv.2 = sitofp i32 %19 to float
%mul.2 = fmul float %18, %conv.2
%arrayidx2.2 = getelementptr inbounds float, float* %c, i32 %inc.1
store float %mul.2, float* %arrayidx2.2, align 4
%inc.2 = add nuw i32 %i.09, 3
%arrayidx.3 = getelementptr inbounds float, float* %a, i32 %inc.2
%20 = load float, float* %arrayidx.3, align 4
%arrayidx1.3 = getelementptr inbounds i32, i32* %b, i32 %inc.2
%21 = load i32, i32* %arrayidx1.3, align 4
%conv.3 = sitofp i32 %21 to float
%mul.3 = fmul float %20, %conv.3
%arrayidx2.3 = getelementptr inbounds float, float* %c, i32 %inc.2
store float %mul.3, float* %arrayidx2.3, align 4
%inc.3 = add nuw i32 %i.09, 4
%exitcond.3 = icmp eq i32 %inc.3, %N
br i1 %exitcond.3, label %for.cond.cleanup, label %for.body
}
define arm_aapcs_vfpcc void @float_int_int_mul(i32* nocapture readonly %a, i32* nocapture readonly %b, float* nocapture %c, i32 %N) {
; CHECK-LABEL: float_int_int_mul:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: push {r4, r5, r6, lr}
; CHECK-NEXT: cbz r3, .LBB4_8
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: cmp r3, #3
; CHECK-NEXT: bhi .LBB4_3
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: b .LBB4_6
; CHECK-NEXT: .LBB4_3: @ %vector.ph
; CHECK-NEXT: bic r12, r3, #3
; CHECK-NEXT: movs r5, #1
; CHECK-NEXT: sub.w r6, r12, #4
; CHECK-NEXT: mov r4, r0
; CHECK-NEXT: add.w lr, r5, r6, lsr #2
; CHECK-NEXT: mov r5, r1
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: mov r6, r2
; CHECK-NEXT: .LBB4_4: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r4], #16
; CHECK-NEXT: vldrw.u32 q1, [r5], #16
; CHECK-NEXT: vmul.i32 q0, q1, q0
; CHECK-NEXT: vcvt.f32.s32 q0, q0
; CHECK-NEXT: vstrb.8 q0, [r6], #16
; CHECK-NEXT: le lr, .LBB4_4
; CHECK-NEXT: @ %bb.5: @ %middle.block
; CHECK-NEXT: cmp r12, r3
; CHECK-NEXT: it eq
; CHECK-NEXT: popeq {r4, r5, r6, pc}
; CHECK-NEXT: .LBB4_6: @ %for.body.preheader11
; CHECK-NEXT: sub.w lr, r3, r12
; CHECK-NEXT: add.w r0, r0, r12, lsl #2
; CHECK-NEXT: add.w r1, r1, r12, lsl #2
; CHECK-NEXT: add.w r2, r2, r12, lsl #2
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB4_7: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr r3, [r0], #4
; CHECK-NEXT: ldr r6, [r1], #4
; CHECK-NEXT: muls r3, r6, r3
; CHECK-NEXT: vmov s0, r3
; CHECK-NEXT: vcvt.f32.s32 s0, s0
; CHECK-NEXT: vstmia r2!, {s0}
; CHECK-NEXT: le lr, .LBB4_7
; CHECK-NEXT: .LBB4_8: @ %for.cond.cleanup
; CHECK-NEXT: pop {r4, r5, r6, pc}
entry:
%cmp8 = icmp eq i32 %N, 0
br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
for.body.preheader: ; preds = %entry
%min.iters.check = icmp ult i32 %N, 4
br i1 %min.iters.check, label %for.body.preheader11, label %vector.ph
for.body.preheader11: ; preds = %middle.block, %for.body.preheader
%i.09.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
br label %for.body
vector.ph: ; preds = %for.body.preheader
%n.vec = and i32 %N, -4
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%0 = getelementptr inbounds i32, i32* %a, i32 %index
%1 = bitcast i32* %0 to <4 x i32>*
%wide.load = load <4 x i32>, <4 x i32>* %1, align 4
%2 = getelementptr inbounds i32, i32* %b, i32 %index
%3 = bitcast i32* %2 to <4 x i32>*
%wide.load10 = load <4 x i32>, <4 x i32>* %3, align 4
%4 = mul nsw <4 x i32> %wide.load10, %wide.load
%5 = sitofp <4 x i32> %4 to <4 x float>
%6 = getelementptr inbounds float, float* %c, i32 %index
%7 = bitcast float* %6 to <4 x float>*
store <4 x float> %5, <4 x float>* %7, align 4
%index.next = add i32 %index, 4
%8 = icmp eq i32 %index.next, %n.vec
br i1 %8, label %middle.block, label %vector.body
middle.block: ; preds = %vector.body
%cmp.n = icmp eq i32 %n.vec, %N
br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader11
for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
ret void
for.body: ; preds = %for.body.preheader11, %for.body
%i.09 = phi i32 [ %inc, %for.body ], [ %i.09.ph, %for.body.preheader11 ]
%arrayidx = getelementptr inbounds i32, i32* %a, i32 %i.09
%9 = load i32, i32* %arrayidx, align 4
%arrayidx1 = getelementptr inbounds i32, i32* %b, i32 %i.09
%10 = load i32, i32* %arrayidx1, align 4
%mul = mul nsw i32 %10, %9
%conv = sitofp i32 %mul to float
%arrayidx2 = getelementptr inbounds float, float* %c, i32 %i.09
store float %conv, float* %arrayidx2, align 4
%inc = add nuw i32 %i.09, 1
%exitcond = icmp eq i32 %inc, %N
br i1 %exitcond, label %for.cond.cleanup, label %for.body
}
define arm_aapcs_vfpcc void @half_half_mul(half* nocapture readonly %a, half* nocapture readonly %b, float* nocapture %c, i32 %N) {
; CHECK-LABEL: half_half_mul:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr}
; CHECK-NEXT: cmp r3, #0
; CHECK-NEXT: beq .LBB5_8
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: cmp r3, #3
; CHECK-NEXT: bhi .LBB5_3
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: b .LBB5_6
; CHECK-NEXT: .LBB5_3: @ %vector.ph
; CHECK-NEXT: bic r12, r3, #3
; CHECK-NEXT: movs r5, #1
; CHECK-NEXT: sub.w r6, r12, #4
; CHECK-NEXT: mov r4, r0
; CHECK-NEXT: add.w lr, r5, r6, lsr #2
; CHECK-NEXT: mov r5, r1
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: mov r6, r2
; CHECK-NEXT: .LBB5_4: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr.w r9, [r4]
; CHECK-NEXT: ldr r7, [r5]
; CHECK-NEXT: ldr.w r8, [r4, #4]
; CHECK-NEXT: vmov.32 q0[0], r9
; CHECK-NEXT: ldr.w r10, [r5, #4]
; CHECK-NEXT: vmov.32 q1[0], r7
; CHECK-NEXT: vmov.32 q0[1], r8
; CHECK-NEXT: adds r4, #8
; CHECK-NEXT: vmov.32 q1[1], r10
; CHECK-NEXT: adds r5, #8
; CHECK-NEXT: vmul.f16 q0, q0, q1
; CHECK-NEXT: vcvtt.f32.f16 s7, s1
; CHECK-NEXT: vcvtb.f32.f16 s6, s1
; CHECK-NEXT: vcvtt.f32.f16 s5, s0
; CHECK-NEXT: vcvtb.f32.f16 s4, s0
; CHECK-NEXT: vstrb.8 q1, [r6], #16
; CHECK-NEXT: le lr, .LBB5_4
; CHECK-NEXT: @ %bb.5: @ %middle.block
; CHECK-NEXT: cmp r12, r3
; CHECK-NEXT: beq .LBB5_8
; CHECK-NEXT: .LBB5_6: @ %for.body.preheader11
; CHECK-NEXT: sub.w lr, r3, r12
; CHECK-NEXT: add.w r0, r0, r12, lsl #1
; CHECK-NEXT: add.w r1, r1, r12, lsl #1
; CHECK-NEXT: add.w r2, r2, r12, lsl #2
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB5_7: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldr.16 s0, [r1]
; CHECK-NEXT: vldr.16 s2, [r0]
; CHECK-NEXT: adds r0, #2
; CHECK-NEXT: adds r1, #2
; CHECK-NEXT: vmul.f16 s0, s2, s0
; CHECK-NEXT: vcvtb.f32.f16 s0, s0
; CHECK-NEXT: vstr s0, [r2]
; CHECK-NEXT: adds r2, #4
; CHECK-NEXT: le lr, .LBB5_7
; CHECK-NEXT: .LBB5_8: @ %for.cond.cleanup
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc}
entry:
%cmp8 = icmp eq i32 %N, 0
br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
for.body.preheader: ; preds = %entry
%min.iters.check = icmp ult i32 %N, 4
br i1 %min.iters.check, label %for.body.preheader11, label %vector.ph
for.body.preheader11: ; preds = %middle.block, %for.body.preheader
%i.09.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
br label %for.body
vector.ph: ; preds = %for.body.preheader
%n.vec = and i32 %N, -4
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%0 = getelementptr inbounds half, half* %a, i32 %index
%1 = bitcast half* %0 to <4 x half>*
%wide.load = load <4 x half>, <4 x half>* %1, align 2
%2 = getelementptr inbounds half, half* %b, i32 %index
%3 = bitcast half* %2 to <4 x half>*
%wide.load10 = load <4 x half>, <4 x half>* %3, align 2
%4 = fmul <4 x half> %wide.load, %wide.load10
%5 = fpext <4 x half> %4 to <4 x float>
%6 = getelementptr inbounds float, float* %c, i32 %index
%7 = bitcast float* %6 to <4 x float>*
store <4 x float> %5, <4 x float>* %7, align 4
%index.next = add i32 %index, 4
%8 = icmp eq i32 %index.next, %n.vec
br i1 %8, label %middle.block, label %vector.body
middle.block: ; preds = %vector.body
%cmp.n = icmp eq i32 %n.vec, %N
br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader11
for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
ret void
for.body: ; preds = %for.body.preheader11, %for.body
%i.09 = phi i32 [ %inc, %for.body ], [ %i.09.ph, %for.body.preheader11 ]
%arrayidx = getelementptr inbounds half, half* %a, i32 %i.09
%9 = load half, half* %arrayidx, align 2
%arrayidx1 = getelementptr inbounds half, half* %b, i32 %i.09
%10 = load half, half* %arrayidx1, align 2
%mul = fmul half %9, %10
%conv = fpext half %mul to float
%arrayidx2 = getelementptr inbounds float, float* %c, i32 %i.09
store float %conv, float* %arrayidx2, align 4
%inc = add nuw i32 %i.09, 1
%exitcond = icmp eq i32 %inc, %N
br i1 %exitcond, label %for.cond.cleanup, label %for.body
}
define arm_aapcs_vfpcc void @half_half_add(half* nocapture readonly %a, half* nocapture readonly %b, float* nocapture %c, i32 %N) {
; CHECK-LABEL: half_half_add:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr}
; CHECK-NEXT: cmp r3, #0
; CHECK-NEXT: beq .LBB6_8
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: cmp r3, #3
; CHECK-NEXT: bhi .LBB6_3
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: b .LBB6_6
; CHECK-NEXT: .LBB6_3: @ %vector.ph
; CHECK-NEXT: bic r12, r3, #3
; CHECK-NEXT: movs r5, #1
; CHECK-NEXT: sub.w r6, r12, #4
; CHECK-NEXT: mov r4, r0
; CHECK-NEXT: add.w lr, r5, r6, lsr #2
; CHECK-NEXT: mov r5, r1
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: mov r6, r2
; CHECK-NEXT: .LBB6_4: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr.w r9, [r4]
; CHECK-NEXT: ldr r7, [r5]
; CHECK-NEXT: ldr.w r8, [r4, #4]
; CHECK-NEXT: vmov.32 q0[0], r9
; CHECK-NEXT: ldr.w r10, [r5, #4]
; CHECK-NEXT: vmov.32 q1[0], r7
; CHECK-NEXT: vmov.32 q0[1], r8
; CHECK-NEXT: adds r4, #8
; CHECK-NEXT: vmov.32 q1[1], r10
; CHECK-NEXT: adds r5, #8
; CHECK-NEXT: vadd.f16 q0, q0, q1
; CHECK-NEXT: vcvtt.f32.f16 s7, s1
; CHECK-NEXT: vcvtb.f32.f16 s6, s1
; CHECK-NEXT: vcvtt.f32.f16 s5, s0
; CHECK-NEXT: vcvtb.f32.f16 s4, s0
; CHECK-NEXT: vstrb.8 q1, [r6], #16
; CHECK-NEXT: le lr, .LBB6_4
; CHECK-NEXT: @ %bb.5: @ %middle.block
; CHECK-NEXT: cmp r12, r3
; CHECK-NEXT: beq .LBB6_8
; CHECK-NEXT: .LBB6_6: @ %for.body.preheader11
; CHECK-NEXT: sub.w lr, r3, r12
; CHECK-NEXT: add.w r0, r0, r12, lsl #1
; CHECK-NEXT: add.w r1, r1, r12, lsl #1
; CHECK-NEXT: add.w r2, r2, r12, lsl #2
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB6_7: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldr.16 s0, [r1]
; CHECK-NEXT: vldr.16 s2, [r0]
; CHECK-NEXT: adds r0, #2
; CHECK-NEXT: adds r1, #2
; CHECK-NEXT: vadd.f16 s0, s2, s0
; CHECK-NEXT: vcvtb.f32.f16 s0, s0
; CHECK-NEXT: vstr s0, [r2]
; CHECK-NEXT: adds r2, #4
; CHECK-NEXT: le lr, .LBB6_7
; CHECK-NEXT: .LBB6_8: @ %for.cond.cleanup
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc}
entry:
%cmp8 = icmp eq i32 %N, 0
br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
for.body.preheader: ; preds = %entry
%min.iters.check = icmp ult i32 %N, 4
br i1 %min.iters.check, label %for.body.preheader11, label %vector.ph
for.body.preheader11: ; preds = %middle.block, %for.body.preheader
%i.09.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
br label %for.body
vector.ph: ; preds = %for.body.preheader
%n.vec = and i32 %N, -4
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%0 = getelementptr inbounds half, half* %a, i32 %index
%1 = bitcast half* %0 to <4 x half>*
%wide.load = load <4 x half>, <4 x half>* %1, align 2
%2 = getelementptr inbounds half, half* %b, i32 %index
%3 = bitcast half* %2 to <4 x half>*
%wide.load10 = load <4 x half>, <4 x half>* %3, align 2
%4 = fadd <4 x half> %wide.load, %wide.load10
%5 = fpext <4 x half> %4 to <4 x float>
%6 = getelementptr inbounds float, float* %c, i32 %index
%7 = bitcast float* %6 to <4 x float>*
store <4 x float> %5, <4 x float>* %7, align 4
%index.next = add i32 %index, 4
%8 = icmp eq i32 %index.next, %n.vec
br i1 %8, label %middle.block, label %vector.body
middle.block: ; preds = %vector.body
%cmp.n = icmp eq i32 %n.vec, %N
br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader11
for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
ret void
for.body: ; preds = %for.body.preheader11, %for.body
%i.09 = phi i32 [ %inc, %for.body ], [ %i.09.ph, %for.body.preheader11 ]
%arrayidx = getelementptr inbounds half, half* %a, i32 %i.09
%9 = load half, half* %arrayidx, align 2
%arrayidx1 = getelementptr inbounds half, half* %b, i32 %i.09
%10 = load half, half* %arrayidx1, align 2
%add = fadd half %9, %10
%conv = fpext half %add to float
%arrayidx2 = getelementptr inbounds float, float* %c, i32 %i.09
store float %conv, float* %arrayidx2, align 4
%inc = add nuw i32 %i.09, 1
%exitcond = icmp eq i32 %inc, %N
br i1 %exitcond, label %for.cond.cleanup, label %for.body
}
define arm_aapcs_vfpcc void @half_half_sub(half* nocapture readonly %a, half* nocapture readonly %b, float* nocapture %c, i32 %N) {
; CHECK-LABEL: half_half_sub:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr}
; CHECK-NEXT: cmp r3, #0
; CHECK-NEXT: beq .LBB7_8
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: cmp r3, #3
; CHECK-NEXT: bhi .LBB7_3
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: b .LBB7_6
; CHECK-NEXT: .LBB7_3: @ %vector.ph
; CHECK-NEXT: bic r12, r3, #3
; CHECK-NEXT: movs r5, #1
; CHECK-NEXT: sub.w r6, r12, #4
; CHECK-NEXT: mov r4, r0
; CHECK-NEXT: add.w lr, r5, r6, lsr #2
; CHECK-NEXT: mov r5, r1
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: mov r6, r2
; CHECK-NEXT: .LBB7_4: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr.w r9, [r4]
; CHECK-NEXT: ldr r7, [r5]
; CHECK-NEXT: ldr.w r8, [r4, #4]
; CHECK-NEXT: vmov.32 q0[0], r9
; CHECK-NEXT: ldr.w r10, [r5, #4]
; CHECK-NEXT: vmov.32 q1[0], r7
; CHECK-NEXT: vmov.32 q0[1], r8
; CHECK-NEXT: adds r4, #8
; CHECK-NEXT: vmov.32 q1[1], r10
; CHECK-NEXT: adds r5, #8
; CHECK-NEXT: vsub.f16 q0, q0, q1
; CHECK-NEXT: vcvtt.f32.f16 s7, s1
; CHECK-NEXT: vcvtb.f32.f16 s6, s1
; CHECK-NEXT: vcvtt.f32.f16 s5, s0
; CHECK-NEXT: vcvtb.f32.f16 s4, s0
; CHECK-NEXT: vstrb.8 q1, [r6], #16
; CHECK-NEXT: le lr, .LBB7_4
; CHECK-NEXT: @ %bb.5: @ %middle.block
; CHECK-NEXT: cmp r12, r3
; CHECK-NEXT: beq .LBB7_8
; CHECK-NEXT: .LBB7_6: @ %for.body.preheader11
; CHECK-NEXT: sub.w lr, r3, r12
; CHECK-NEXT: add.w r0, r0, r12, lsl #1
; CHECK-NEXT: add.w r1, r1, r12, lsl #1
; CHECK-NEXT: add.w r2, r2, r12, lsl #2
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB7_7: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldr.16 s0, [r1]
; CHECK-NEXT: vldr.16 s2, [r0]
; CHECK-NEXT: adds r0, #2
; CHECK-NEXT: adds r1, #2
; CHECK-NEXT: vsub.f16 s0, s2, s0
; CHECK-NEXT: vcvtb.f32.f16 s0, s0
; CHECK-NEXT: vstr s0, [r2]
; CHECK-NEXT: adds r2, #4
; CHECK-NEXT: le lr, .LBB7_7
; CHECK-NEXT: .LBB7_8: @ %for.cond.cleanup
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc}
entry:
%cmp8 = icmp eq i32 %N, 0
br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
for.body.preheader: ; preds = %entry
%min.iters.check = icmp ult i32 %N, 4
br i1 %min.iters.check, label %for.body.preheader11, label %vector.ph
for.body.preheader11: ; preds = %middle.block, %for.body.preheader
%i.09.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
br label %for.body
vector.ph: ; preds = %for.body.preheader
%n.vec = and i32 %N, -4
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%0 = getelementptr inbounds half, half* %a, i32 %index
%1 = bitcast half* %0 to <4 x half>*
%wide.load = load <4 x half>, <4 x half>* %1, align 2
%2 = getelementptr inbounds half, half* %b, i32 %index
%3 = bitcast half* %2 to <4 x half>*
%wide.load10 = load <4 x half>, <4 x half>* %3, align 2
%4 = fsub <4 x half> %wide.load, %wide.load10
%5 = fpext <4 x half> %4 to <4 x float>
%6 = getelementptr inbounds float, float* %c, i32 %index
%7 = bitcast float* %6 to <4 x float>*
store <4 x float> %5, <4 x float>* %7, align 4
%index.next = add i32 %index, 4
%8 = icmp eq i32 %index.next, %n.vec
br i1 %8, label %middle.block, label %vector.body
middle.block: ; preds = %vector.body
%cmp.n = icmp eq i32 %n.vec, %N
br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader11
for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
ret void
for.body: ; preds = %for.body.preheader11, %for.body
%i.09 = phi i32 [ %inc, %for.body ], [ %i.09.ph, %for.body.preheader11 ]
%arrayidx = getelementptr inbounds half, half* %a, i32 %i.09
%9 = load half, half* %arrayidx, align 2
%arrayidx1 = getelementptr inbounds half, half* %b, i32 %i.09
%10 = load half, half* %arrayidx1, align 2
%sub = fsub half %9, %10
%conv = fpext half %sub to float
%arrayidx2 = getelementptr inbounds float, float* %c, i32 %i.09
store float %conv, float* %arrayidx2, align 4
%inc = add nuw i32 %i.09, 1
%exitcond = icmp eq i32 %inc, %N
br i1 %exitcond, label %for.cond.cleanup, label %for.body
}
define arm_aapcs_vfpcc void @half_short_mul(half* nocapture readonly %a, i16* nocapture readonly %b, float* nocapture %c, i32 %N) {
; CHECK-LABEL: half_short_mul:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr}
; CHECK-NEXT: cmp r3, #0
; CHECK-NEXT: beq .LBB8_8
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: cmp r3, #3
; CHECK-NEXT: bhi .LBB8_3
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: b .LBB8_6
; CHECK-NEXT: .LBB8_3: @ %vector.ph
; CHECK-NEXT: bic r12, r3, #3
; CHECK-NEXT: movs r5, #1
; CHECK-NEXT: sub.w r6, r12, #4
; CHECK-NEXT: mov r4, r0
; CHECK-NEXT: add.w lr, r5, r6, lsr #2
; CHECK-NEXT: mov r5, r1
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: mov r6, r2
; CHECK-NEXT: .LBB8_4: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrh.u32 q0, [r5], #8
; CHECK-NEXT: ldr.w r9, [r4]
; CHECK-NEXT: ldr.w r8, [r4, #4]
; CHECK-NEXT: adds r4, #8
; CHECK-NEXT: vmov r7, s0
; CHECK-NEXT: vmov.16 q1[0], r7
; CHECK-NEXT: vmov r7, s1
; CHECK-NEXT: vmov.16 q1[1], r7
; CHECK-NEXT: vmov r7, s2
; CHECK-NEXT: vmov.16 q1[2], r7
; CHECK-NEXT: vmov r7, s3
; CHECK-NEXT: vmov.16 q1[3], r7
; CHECK-NEXT: vcvt.f16.s16 q0, q1
; CHECK-NEXT: vmov.32 q1[0], r9
; CHECK-NEXT: vmov.32 q1[1], r8
; CHECK-NEXT: vmul.f16 q0, q1, q0
; CHECK-NEXT: vcvtt.f32.f16 s7, s1
; CHECK-NEXT: vcvtb.f32.f16 s6, s1
; CHECK-NEXT: vcvtt.f32.f16 s5, s0
; CHECK-NEXT: vcvtb.f32.f16 s4, s0
; CHECK-NEXT: vstrb.8 q1, [r6], #16
; CHECK-NEXT: le lr, .LBB8_4
; CHECK-NEXT: @ %bb.5: @ %middle.block
; CHECK-NEXT: cmp r12, r3
; CHECK-NEXT: beq .LBB8_8
; CHECK-NEXT: .LBB8_6: @ %for.body.preheader13
; CHECK-NEXT: sub.w lr, r3, r12
; CHECK-NEXT: add.w r0, r0, r12, lsl #1
; CHECK-NEXT: add.w r1, r1, r12, lsl #1
; CHECK-NEXT: add.w r2, r2, r12, lsl #2
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB8_7: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldrsh r3, [r1], #2
; CHECK-NEXT: vldr.16 s0, [r0]
; CHECK-NEXT: adds r0, #2
; CHECK-NEXT: vmov s2, r3
; CHECK-NEXT: vcvt.f16.s32 s2, s2
; CHECK-NEXT: vmul.f16 s0, s0, s2
; CHECK-NEXT: vcvtb.f32.f16 s0, s0
; CHECK-NEXT: vstr s0, [r2]
; CHECK-NEXT: adds r2, #4
; CHECK-NEXT: le lr, .LBB8_7
; CHECK-NEXT: .LBB8_8: @ %for.cond.cleanup
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc}
entry:
%cmp10 = icmp eq i32 %N, 0
br i1 %cmp10, label %for.cond.cleanup, label %for.body.preheader
for.body.preheader: ; preds = %entry
%min.iters.check = icmp ult i32 %N, 4
br i1 %min.iters.check, label %for.body.preheader13, label %vector.ph
for.body.preheader13: ; preds = %middle.block, %for.body.preheader
%i.011.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
br label %for.body
vector.ph: ; preds = %for.body.preheader
%n.vec = and i32 %N, -4
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%0 = getelementptr inbounds half, half* %a, i32 %index
%1 = bitcast half* %0 to <4 x half>*
%wide.load = load <4 x half>, <4 x half>* %1, align 2
%2 = getelementptr inbounds i16, i16* %b, i32 %index
%3 = bitcast i16* %2 to <4 x i16>*
%wide.load12 = load <4 x i16>, <4 x i16>* %3, align 2
%4 = sitofp <4 x i16> %wide.load12 to <4 x half>
%5 = fmul <4 x half> %wide.load, %4
%6 = fpext <4 x half> %5 to <4 x float>
%7 = getelementptr inbounds float, float* %c, i32 %index
%8 = bitcast float* %7 to <4 x float>*
store <4 x float> %6, <4 x float>* %8, align 4
%index.next = add i32 %index, 4
%9 = icmp eq i32 %index.next, %n.vec
br i1 %9, label %middle.block, label %vector.body
middle.block: ; preds = %vector.body
%cmp.n = icmp eq i32 %n.vec, %N
br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader13
for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
ret void
for.body: ; preds = %for.body.preheader13, %for.body
%i.011 = phi i32 [ %inc, %for.body ], [ %i.011.ph, %for.body.preheader13 ]
%arrayidx = getelementptr inbounds half, half* %a, i32 %i.011
%10 = load half, half* %arrayidx, align 2
%arrayidx1 = getelementptr inbounds i16, i16* %b, i32 %i.011
%11 = load i16, i16* %arrayidx1, align 2
%conv2 = sitofp i16 %11 to half
%mul = fmul half %10, %conv2
%conv3 = fpext half %mul to float
%arrayidx4 = getelementptr inbounds float, float* %c, i32 %i.011
store float %conv3, float* %arrayidx4, align 4
%inc = add nuw i32 %i.011, 1
%exitcond = icmp eq i32 %inc, %N
br i1 %exitcond, label %for.cond.cleanup, label %for.body
}
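
; Scalar f16 multiply-accumulate reduction: each product of %a[i] and %b[i] is
; widened to f32 before being added to the running sum. The IR is manually
; unrolled by four with a scalar remainder loop; as the CHECK lines show, both
; loops are still lowered to low-overhead loops (dls/le for the main body,
; wls/le for the epilogue).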
define arm_aapcs_vfpcc float @half_half_mac(half* nocapture readonly %a, half* nocapture readonly %b, i32 %N) {
; CHECK-LABEL: half_half_mac:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: push {r4, r5, r7, lr}
; CHECK-NEXT: cbz r2, .LBB9_3
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: subs r3, r2, #1
; CHECK-NEXT: and r5, r2, #3
; CHECK-NEXT: cmp r3, #3
; CHECK-NEXT: bhs .LBB9_4
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: vldr s0, .LCPI9_0
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: b .LBB9_6
; CHECK-NEXT: .LBB9_3:
; CHECK-NEXT: vldr s0, .LCPI9_0
; CHECK-NEXT: b .LBB9_9
; CHECK-NEXT: .LBB9_4: @ %for.body.preheader.new
; CHECK-NEXT: bic r2, r2, #3
; CHECK-NEXT: movs r3, #1
; CHECK-NEXT: subs r2, #4
; CHECK-NEXT: vldr s0, .LCPI9_0
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: add.w lr, r3, r2, lsr #2
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB9_5: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: adds r4, r0, r3
; CHECK-NEXT: adds r2, r1, r3
; CHECK-NEXT: vldr.16 s2, [r2, #6]
; CHECK-NEXT: vldr.16 s4, [r4, #6]
; CHECK-NEXT: vldr.16 s6, [r4, #4]
; CHECK-NEXT: vldr.16 s8, [r4, #2]
; CHECK-NEXT: vmul.f16 s2, s4, s2
; CHECK-NEXT: vldr.16 s4, [r2, #4]
; CHECK-NEXT: vldr.16 s10, [r4]
; CHECK-NEXT: vcvtb.f32.f16 s2, s2
; CHECK-NEXT: vmul.f16 s4, s6, s4
; CHECK-NEXT: vldr.16 s6, [r2, #2]
; CHECK-NEXT: vcvtb.f32.f16 s4, s4
; CHECK-NEXT: adds r3, #8
; CHECK-NEXT: vmul.f16 s6, s8, s6
; CHECK-NEXT: vldr.16 s8, [r2]
; CHECK-NEXT: vcvtb.f32.f16 s6, s6
; CHECK-NEXT: add.w r12, r12, #4
; CHECK-NEXT: vmul.f16 s8, s10, s8
; CHECK-NEXT: vcvtb.f32.f16 s8, s8
; CHECK-NEXT: vadd.f32 s0, s0, s8
; CHECK-NEXT: vadd.f32 s0, s0, s6
; CHECK-NEXT: vadd.f32 s0, s0, s4
; CHECK-NEXT: vadd.f32 s0, s0, s2
; CHECK-NEXT: le lr, .LBB9_5
; CHECK-NEXT: .LBB9_6: @ %for.cond.cleanup.loopexit.unr-lcssa
; CHECK-NEXT: wls lr, r5, .LBB9_9
; CHECK-NEXT: @ %bb.7: @ %for.body.epil.preheader
; CHECK-NEXT: add.w r0, r0, r12, lsl #1
; CHECK-NEXT: add.w r1, r1, r12, lsl #1
; CHECK-NEXT: mov lr, r5
; CHECK-NEXT: .LBB9_8: @ %for.body.epil
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldr.16 s2, [r1]
; CHECK-NEXT: vldr.16 s4, [r0]
; CHECK-NEXT: adds r0, #2
; CHECK-NEXT: adds r1, #2
; CHECK-NEXT: vmul.f16 s2, s4, s2
; CHECK-NEXT: vcvtb.f32.f16 s2, s2
; CHECK-NEXT: vadd.f32 s0, s0, s2
; CHECK-NEXT: le lr, .LBB9_8
; CHECK-NEXT: .LBB9_9: @ %for.cond.cleanup
; CHECK-NEXT: pop {r4, r5, r7, pc}
; CHECK-NEXT: .p2align 2
; CHECK-NEXT: @ %bb.10:
; CHECK-NEXT: .LCPI9_0:
; CHECK-NEXT: .long 0x00000000 @ float 0
entry:
%cmp8 = icmp eq i32 %N, 0
br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
for.body.preheader: ; preds = %entry
%0 = add i32 %N, -1
%xtraiter = and i32 %N, 3
%1 = icmp ult i32 %0, 3
br i1 %1, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new
for.body.preheader.new: ; preds = %for.body.preheader
%unroll_iter = sub i32 %N, %xtraiter
br label %for.body
for.cond.cleanup.loopexit.unr-lcssa: ; preds = %for.body, %for.body.preheader
%add.lcssa.ph = phi float [ undef, %for.body.preheader ], [ %add.3, %for.body ]
%i.010.unr = phi i32 [ 0, %for.body.preheader ], [ %inc.3, %for.body ]
%res.09.unr = phi float [ 0.000000e+00, %for.body.preheader ], [ %add.3, %for.body ]
%lcmp.mod = icmp eq i32 %xtraiter, 0
br i1 %lcmp.mod, label %for.cond.cleanup, label %for.body.epil
for.body.epil: ; preds = %for.cond.cleanup.loopexit.unr-lcssa, %for.body.epil
%i.010.epil = phi i32 [ %inc.epil, %for.body.epil ], [ %i.010.unr, %for.cond.cleanup.loopexit.unr-lcssa ]
%res.09.epil = phi float [ %add.epil, %for.body.epil ], [ %res.09.unr, %for.cond.cleanup.loopexit.unr-lcssa ]
%epil.iter = phi i32 [ %epil.iter.sub, %for.body.epil ], [ %xtraiter, %for.cond.cleanup.loopexit.unr-lcssa ]
%arrayidx.epil = getelementptr inbounds half, half* %a, i32 %i.010.epil
%2 = load half, half* %arrayidx.epil, align 2
%arrayidx1.epil = getelementptr inbounds half, half* %b, i32 %i.010.epil
%3 = load half, half* %arrayidx1.epil, align 2
%mul.epil = fmul half %2, %3
%conv.epil = fpext half %mul.epil to float
%add.epil = fadd float %res.09.epil, %conv.epil
%inc.epil = add nuw i32 %i.010.epil, 1
%epil.iter.sub = add i32 %epil.iter, -1
%epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
br i1 %epil.iter.cmp, label %for.cond.cleanup, label %for.body.epil
for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit.unr-lcssa, %for.body.epil, %entry
%res.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add.lcssa.ph, %for.cond.cleanup.loopexit.unr-lcssa ], [ %add.epil, %for.body.epil ]
ret float %res.0.lcssa
for.body: ; preds = %for.body, %for.body.preheader.new
%i.010 = phi i32 [ 0, %for.body.preheader.new ], [ %inc.3, %for.body ]
%res.09 = phi float [ 0.000000e+00, %for.body.preheader.new ], [ %add.3, %for.body ]
%niter = phi i32 [ %unroll_iter, %for.body.preheader.new ], [ %niter.nsub.3, %for.body ]
%arrayidx = getelementptr inbounds half, half* %a, i32 %i.010
%4 = load half, half* %arrayidx, align 2
%arrayidx1 = getelementptr inbounds half, half* %b, i32 %i.010
%5 = load half, half* %arrayidx1, align 2
%mul = fmul half %4, %5
%conv = fpext half %mul to float
%add = fadd float %res.09, %conv
%inc = or i32 %i.010, 1
%arrayidx.1 = getelementptr inbounds half, half* %a, i32 %inc
%6 = load half, half* %arrayidx.1, align 2
%arrayidx1.1 = getelementptr inbounds half, half* %b, i32 %inc
%7 = load half, half* %arrayidx1.1, align 2
%mul.1 = fmul half %6, %7
%conv.1 = fpext half %mul.1 to float
%add.1 = fadd float %add, %conv.1
%inc.1 = or i32 %i.010, 2
%arrayidx.2 = getelementptr inbounds half, half* %a, i32 %inc.1
%8 = load half, half* %arrayidx.2, align 2
%arrayidx1.2 = getelementptr inbounds half, half* %b, i32 %inc.1
%9 = load half, half* %arrayidx1.2, align 2
%mul.2 = fmul half %8, %9
%conv.2 = fpext half %mul.2 to float
%add.2 = fadd float %add.1, %conv.2
%inc.2 = or i32 %i.010, 3
%arrayidx.3 = getelementptr inbounds half, half* %a, i32 %inc.2
%10 = load half, half* %arrayidx.3, align 2
%arrayidx1.3 = getelementptr inbounds half, half* %b, i32 %inc.2
%11 = load half, half* %arrayidx1.3, align 2
%mul.3 = fmul half %10, %11
%conv.3 = fpext half %mul.3 to float
%add.3 = fadd float %add.2, %conv.3
%inc.3 = add nuw i32 %i.010, 4
%niter.nsub.3 = add i32 %niter, -4
%niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
br i1 %niter.ncmp.3, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body
}
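
; Same shape as half_half_mac but with an f16 add instead of a multiply: the
; f16 sum of %a[i] and %b[i] is widened to f32 and accumulated. Again unrolled
; by four with a scalar epilogue, and both loops use dls/le and wls/le.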
define arm_aapcs_vfpcc float @half_half_acc(half* nocapture readonly %a, half* nocapture readonly %b, i32 %N) {
; CHECK-LABEL: half_half_acc:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: push {r4, r5, r7, lr}
; CHECK-NEXT: cbz r2, .LBB10_3
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: subs r3, r2, #1
; CHECK-NEXT: and r5, r2, #3
; CHECK-NEXT: cmp r3, #3
; CHECK-NEXT: bhs .LBB10_4
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: vldr s0, .LCPI10_0
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: b .LBB10_6
; CHECK-NEXT: .LBB10_3:
; CHECK-NEXT: vldr s0, .LCPI10_0
; CHECK-NEXT: b .LBB10_9
; CHECK-NEXT: .LBB10_4: @ %for.body.preheader.new
; CHECK-NEXT: bic r2, r2, #3
; CHECK-NEXT: movs r3, #1
; CHECK-NEXT: subs r2, #4
; CHECK-NEXT: vldr s0, .LCPI10_0
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: add.w lr, r3, r2, lsr #2
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB10_5: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: adds r4, r0, r3
; CHECK-NEXT: adds r2, r1, r3
; CHECK-NEXT: vldr.16 s2, [r2, #6]
; CHECK-NEXT: vldr.16 s4, [r4, #6]
; CHECK-NEXT: vldr.16 s6, [r4, #4]
; CHECK-NEXT: vldr.16 s8, [r4, #2]
; CHECK-NEXT: vadd.f16 s2, s4, s2
; CHECK-NEXT: vldr.16 s4, [r2, #4]
; CHECK-NEXT: vldr.16 s10, [r4]
; CHECK-NEXT: vcvtb.f32.f16 s2, s2
; CHECK-NEXT: vadd.f16 s4, s6, s4
; CHECK-NEXT: vldr.16 s6, [r2, #2]
; CHECK-NEXT: vcvtb.f32.f16 s4, s4
; CHECK-NEXT: adds r3, #8
; CHECK-NEXT: vadd.f16 s6, s8, s6
; CHECK-NEXT: vldr.16 s8, [r2]
; CHECK-NEXT: vcvtb.f32.f16 s6, s6
; CHECK-NEXT: add.w r12, r12, #4
; CHECK-NEXT: vadd.f16 s8, s10, s8
; CHECK-NEXT: vcvtb.f32.f16 s8, s8
; CHECK-NEXT: vadd.f32 s0, s0, s8
; CHECK-NEXT: vadd.f32 s0, s0, s6
; CHECK-NEXT: vadd.f32 s0, s0, s4
; CHECK-NEXT: vadd.f32 s0, s0, s2
; CHECK-NEXT: le lr, .LBB10_5
; CHECK-NEXT: .LBB10_6: @ %for.cond.cleanup.loopexit.unr-lcssa
; CHECK-NEXT: wls lr, r5, .LBB10_9
; CHECK-NEXT: @ %bb.7: @ %for.body.epil.preheader
; CHECK-NEXT: add.w r0, r0, r12, lsl #1
; CHECK-NEXT: add.w r1, r1, r12, lsl #1
; CHECK-NEXT: mov lr, r5
; CHECK-NEXT: .LBB10_8: @ %for.body.epil
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldr.16 s2, [r1]
; CHECK-NEXT: vldr.16 s4, [r0]
; CHECK-NEXT: adds r0, #2
; CHECK-NEXT: adds r1, #2
; CHECK-NEXT: vadd.f16 s2, s4, s2
; CHECK-NEXT: vcvtb.f32.f16 s2, s2
; CHECK-NEXT: vadd.f32 s0, s0, s2
; CHECK-NEXT: le lr, .LBB10_8
; CHECK-NEXT: .LBB10_9: @ %for.cond.cleanup
; CHECK-NEXT: pop {r4, r5, r7, pc}
; CHECK-NEXT: .p2align 2
; CHECK-NEXT: @ %bb.10:
; CHECK-NEXT: .LCPI10_0:
; CHECK-NEXT: .long 0x00000000 @ float 0
entry:
%cmp9 = icmp eq i32 %N, 0
br i1 %cmp9, label %for.cond.cleanup, label %for.body.preheader
for.body.preheader: ; preds = %entry
%0 = add i32 %N, -1
%xtraiter = and i32 %N, 3
%1 = icmp ult i32 %0, 3
br i1 %1, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new
for.body.preheader.new: ; preds = %for.body.preheader
%unroll_iter = sub i32 %N, %xtraiter
br label %for.body
for.cond.cleanup.loopexit.unr-lcssa: ; preds = %for.body, %for.body.preheader
%add2.lcssa.ph = phi float [ undef, %for.body.preheader ], [ %add2.3, %for.body ]
%i.011.unr = phi i32 [ 0, %for.body.preheader ], [ %inc.3, %for.body ]
%res.010.unr = phi float [ 0.000000e+00, %for.body.preheader ], [ %add2.3, %for.body ]
%lcmp.mod = icmp eq i32 %xtraiter, 0
br i1 %lcmp.mod, label %for.cond.cleanup, label %for.body.epil
for.body.epil: ; preds = %for.cond.cleanup.loopexit.unr-lcssa, %for.body.epil
%i.011.epil = phi i32 [ %inc.epil, %for.body.epil ], [ %i.011.unr, %for.cond.cleanup.loopexit.unr-lcssa ]
%res.010.epil = phi float [ %add2.epil, %for.body.epil ], [ %res.010.unr, %for.cond.cleanup.loopexit.unr-lcssa ]
%epil.iter = phi i32 [ %epil.iter.sub, %for.body.epil ], [ %xtraiter, %for.cond.cleanup.loopexit.unr-lcssa ]
%arrayidx.epil = getelementptr inbounds half, half* %a, i32 %i.011.epil
%2 = load half, half* %arrayidx.epil, align 2
%arrayidx1.epil = getelementptr inbounds half, half* %b, i32 %i.011.epil
%3 = load half, half* %arrayidx1.epil, align 2
%add.epil = fadd half %2, %3
%conv.epil = fpext half %add.epil to float
%add2.epil = fadd float %res.010.epil, %conv.epil
%inc.epil = add nuw i32 %i.011.epil, 1
%epil.iter.sub = add i32 %epil.iter, -1
%epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
br i1 %epil.iter.cmp, label %for.cond.cleanup, label %for.body.epil
for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit.unr-lcssa, %for.body.epil, %entry
%res.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add2.lcssa.ph, %for.cond.cleanup.loopexit.unr-lcssa ], [ %add2.epil, %for.body.epil ]
ret float %res.0.lcssa
for.body: ; preds = %for.body, %for.body.preheader.new
%i.011 = phi i32 [ 0, %for.body.preheader.new ], [ %inc.3, %for.body ]
%res.010 = phi float [ 0.000000e+00, %for.body.preheader.new ], [ %add2.3, %for.body ]
%niter = phi i32 [ %unroll_iter, %for.body.preheader.new ], [ %niter.nsub.3, %for.body ]
%arrayidx = getelementptr inbounds half, half* %a, i32 %i.011
%4 = load half, half* %arrayidx, align 2
%arrayidx1 = getelementptr inbounds half, half* %b, i32 %i.011
%5 = load half, half* %arrayidx1, align 2
%add = fadd half %4, %5
%conv = fpext half %add to float
%add2 = fadd float %res.010, %conv
%inc = or i32 %i.011, 1
%arrayidx.1 = getelementptr inbounds half, half* %a, i32 %inc
%6 = load half, half* %arrayidx.1, align 2
%arrayidx1.1 = getelementptr inbounds half, half* %b, i32 %inc
%7 = load half, half* %arrayidx1.1, align 2
%add.1 = fadd half %6, %7
%conv.1 = fpext half %add.1 to float
%add2.1 = fadd float %add2, %conv.1
%inc.1 = or i32 %i.011, 2
%arrayidx.2 = getelementptr inbounds half, half* %a, i32 %inc.1
%8 = load half, half* %arrayidx.2, align 2
%arrayidx1.2 = getelementptr inbounds half, half* %b, i32 %inc.1
%9 = load half, half* %arrayidx1.2, align 2
%add.2 = fadd half %8, %9
%conv.2 = fpext half %add.2 to float
%add2.2 = fadd float %add2.1, %conv.2
%inc.2 = or i32 %i.011, 3
%arrayidx.3 = getelementptr inbounds half, half* %a, i32 %inc.2
%10 = load half, half* %arrayidx.3, align 2
%arrayidx1.3 = getelementptr inbounds half, half* %b, i32 %inc.2
%11 = load half, half* %arrayidx1.3, align 2
%add.3 = fadd half %10, %11
%conv.3 = fpext half %add.3 to float
%add2.3 = fadd float %add2.2, %conv.3
%inc.3 = add nuw i32 %i.011, 4
%niter.nsub.3 = add i32 %niter, -4
%niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
br i1 %niter.ncmp.3, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body
}
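
; Mixed-type reduction: the i16 element of %b is converted to f16 (sitofp),
; multiplied by the f16 element of %a, widened to f32 and accumulated.
; Unrolled by four with a scalar epilogue; lowered with dls/le and wls/le
; low-overhead loops as in the cases above.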
define arm_aapcs_vfpcc float @half_short_mac(half* nocapture readonly %a, i16* nocapture readonly %b, i32 %N) {
; CHECK-LABEL: half_short_mac:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: push {r4, r5, r6, lr}
; CHECK-NEXT: cbz r2, .LBB11_3
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: subs r3, r2, #1
; CHECK-NEXT: and r6, r2, #3
; CHECK-NEXT: cmp r3, #3
; CHECK-NEXT: bhs .LBB11_4
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: vldr s0, .LCPI11_0
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: b .LBB11_6
; CHECK-NEXT: .LBB11_3:
; CHECK-NEXT: vldr s0, .LCPI11_0
; CHECK-NEXT: b .LBB11_9
; CHECK-NEXT: .LBB11_4: @ %for.body.preheader.new
; CHECK-NEXT: bic r2, r2, #3
; CHECK-NEXT: movs r3, #1
; CHECK-NEXT: subs r2, #4
; CHECK-NEXT: vldr s0, .LCPI11_0
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: add.w lr, r3, r2, lsr #2
; CHECK-NEXT: adds r3, r1, #4
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: adds r2, r0, #4
; CHECK-NEXT: .LBB11_5: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldrsh.w r4, [r3, #2]
; CHECK-NEXT: vldr.16 s2, [r2, #2]
; CHECK-NEXT: add.w r12, r12, #4
; CHECK-NEXT: vmov s4, r4
; CHECK-NEXT: ldrsh r4, [r3], #8
; CHECK-NEXT: vcvt.f16.s32 s4, s4
; CHECK-NEXT: ldrsh r5, [r3, #-10]
; CHECK-NEXT: vmul.f16 s2, s2, s4
; CHECK-NEXT: vmov s6, r4
; CHECK-NEXT: vldr.16 s4, [r2]
; CHECK-NEXT: vcvt.f16.s32 s6, s6
; CHECK-NEXT: ldrsh r4, [r3, #-12]
; CHECK-NEXT: vmul.f16 s4, s4, s6
; CHECK-NEXT: vmov s8, r5
; CHECK-NEXT: vldr.16 s6, [r2, #-2]
; CHECK-NEXT: vcvt.f16.s32 s8, s8
; CHECK-NEXT: vmov s10, r4
; CHECK-NEXT: vcvtb.f32.f16 s4, s4
; CHECK-NEXT: vmul.f16 s6, s6, s8
; CHECK-NEXT: vldr.16 s8, [r2, #-4]
; CHECK-NEXT: vcvt.f16.s32 s10, s10
; CHECK-NEXT: vcvtb.f32.f16 s6, s6
; CHECK-NEXT: vmul.f16 s8, s8, s10
; CHECK-NEXT: vcvtb.f32.f16 s2, s2
; CHECK-NEXT: vcvtb.f32.f16 s8, s8
; CHECK-NEXT: adds r2, #8
; CHECK-NEXT: vadd.f32 s0, s0, s8
; CHECK-NEXT: vadd.f32 s0, s0, s6
; CHECK-NEXT: vadd.f32 s0, s0, s4
; CHECK-NEXT: vadd.f32 s0, s0, s2
; CHECK-NEXT: le lr, .LBB11_5
; CHECK-NEXT: .LBB11_6: @ %for.cond.cleanup.loopexit.unr-lcssa
; CHECK-NEXT: wls lr, r6, .LBB11_9
; CHECK-NEXT: @ %bb.7: @ %for.body.epil.preheader
; CHECK-NEXT: add.w r0, r0, r12, lsl #1
; CHECK-NEXT: add.w r1, r1, r12, lsl #1
; CHECK-NEXT: mov lr, r6
; CHECK-NEXT: .LBB11_8: @ %for.body.epil
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldrsh r2, [r1], #2
; CHECK-NEXT: vldr.16 s2, [r0]
; CHECK-NEXT: adds r0, #2
; CHECK-NEXT: vmov s4, r2
; CHECK-NEXT: vcvt.f16.s32 s4, s4
; CHECK-NEXT: vmul.f16 s2, s2, s4
; CHECK-NEXT: vcvtb.f32.f16 s2, s2
; CHECK-NEXT: vadd.f32 s0, s0, s2
; CHECK-NEXT: le lr, .LBB11_8
; CHECK-NEXT: .LBB11_9: @ %for.cond.cleanup
; CHECK-NEXT: pop {r4, r5, r6, pc}
; CHECK-NEXT: .p2align 2
; CHECK-NEXT: @ %bb.10:
; CHECK-NEXT: .LCPI11_0:
; CHECK-NEXT: .long 0x00000000 @ float 0
entry:
%cmp10 = icmp eq i32 %N, 0
br i1 %cmp10, label %for.cond.cleanup, label %for.body.preheader
for.body.preheader: ; preds = %entry
%0 = add i32 %N, -1
%xtraiter = and i32 %N, 3
%1 = icmp ult i32 %0, 3
br i1 %1, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new
for.body.preheader.new: ; preds = %for.body.preheader
%unroll_iter = sub i32 %N, %xtraiter
br label %for.body
for.cond.cleanup.loopexit.unr-lcssa: ; preds = %for.body, %for.body.preheader
%add.lcssa.ph = phi float [ undef, %for.body.preheader ], [ %add.3, %for.body ]
%i.012.unr = phi i32 [ 0, %for.body.preheader ], [ %inc.3, %for.body ]
%res.011.unr = phi float [ 0.000000e+00, %for.body.preheader ], [ %add.3, %for.body ]
%lcmp.mod = icmp eq i32 %xtraiter, 0
br i1 %lcmp.mod, label %for.cond.cleanup, label %for.body.epil
for.body.epil: ; preds = %for.cond.cleanup.loopexit.unr-lcssa, %for.body.epil
%i.012.epil = phi i32 [ %inc.epil, %for.body.epil ], [ %i.012.unr, %for.cond.cleanup.loopexit.unr-lcssa ]
%res.011.epil = phi float [ %add.epil, %for.body.epil ], [ %res.011.unr, %for.cond.cleanup.loopexit.unr-lcssa ]
%epil.iter = phi i32 [ %epil.iter.sub, %for.body.epil ], [ %xtraiter, %for.cond.cleanup.loopexit.unr-lcssa ]
%arrayidx.epil = getelementptr inbounds half, half* %a, i32 %i.012.epil
%2 = load half, half* %arrayidx.epil, align 2
%arrayidx1.epil = getelementptr inbounds i16, i16* %b, i32 %i.012.epil
%3 = load i16, i16* %arrayidx1.epil, align 2
%conv2.epil = sitofp i16 %3 to half
%mul.epil = fmul half %2, %conv2.epil
%conv3.epil = fpext half %mul.epil to float
%add.epil = fadd float %res.011.epil, %conv3.epil
%inc.epil = add nuw i32 %i.012.epil, 1
%epil.iter.sub = add i32 %epil.iter, -1
%epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
br i1 %epil.iter.cmp, label %for.cond.cleanup, label %for.body.epil
for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit.unr-lcssa, %for.body.epil, %entry
%res.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add.lcssa.ph, %for.cond.cleanup.loopexit.unr-lcssa ], [ %add.epil, %for.body.epil ]
ret float %res.0.lcssa
for.body: ; preds = %for.body, %for.body.preheader.new
%i.012 = phi i32 [ 0, %for.body.preheader.new ], [ %inc.3, %for.body ]
%res.011 = phi float [ 0.000000e+00, %for.body.preheader.new ], [ %add.3, %for.body ]
%niter = phi i32 [ %unroll_iter, %for.body.preheader.new ], [ %niter.nsub.3, %for.body ]
%arrayidx = getelementptr inbounds half, half* %a, i32 %i.012
%4 = load half, half* %arrayidx, align 2
%arrayidx1 = getelementptr inbounds i16, i16* %b, i32 %i.012
%5 = load i16, i16* %arrayidx1, align 2
%conv2 = sitofp i16 %5 to half
%mul = fmul half %4, %conv2
%conv3 = fpext half %mul to float
%add = fadd float %res.011, %conv3
%inc = or i32 %i.012, 1
%arrayidx.1 = getelementptr inbounds half, half* %a, i32 %inc
%6 = load half, half* %arrayidx.1, align 2
%arrayidx1.1 = getelementptr inbounds i16, i16* %b, i32 %inc
%7 = load i16, i16* %arrayidx1.1, align 2
%conv2.1 = sitofp i16 %7 to half
%mul.1 = fmul half %6, %conv2.1
%conv3.1 = fpext half %mul.1 to float
%add.1 = fadd float %add, %conv3.1
%inc.1 = or i32 %i.012, 2
%arrayidx.2 = getelementptr inbounds half, half* %a, i32 %inc.1
%8 = load half, half* %arrayidx.2, align 2
%arrayidx1.2 = getelementptr inbounds i16, i16* %b, i32 %inc.1
%9 = load i16, i16* %arrayidx1.2, align 2
%conv2.2 = sitofp i16 %9 to half
%mul.2 = fmul half %8, %conv2.2
%conv3.2 = fpext half %mul.2 to float
%add.2 = fadd float %add.1, %conv3.2
%inc.2 = or i32 %i.012, 3
%arrayidx.3 = getelementptr inbounds half, half* %a, i32 %inc.2
%10 = load half, half* %arrayidx.3, align 2
%arrayidx1.3 = getelementptr inbounds i16, i16* %b, i32 %inc.2
%11 = load i16, i16* %arrayidx1.3, align 2
%conv2.3 = sitofp i16 %11 to half
%mul.3 = fmul half %10, %conv2.3
%conv3.3 = fpext half %mul.3 to float
%add.3 = fadd float %add.2, %conv3.3
%inc.3 = add nuw i32 %i.012, 4
%niter.nsub.3 = add i32 %niter, -4
%niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
br i1 %niter.ncmp.3, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body
}