1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2025-02-01 05:01:59 +01:00
llvm-mirror/test/CodeGen/Thumb2/mve-vecreduce-loops.ll
David Green a9b961b8da [ARM] CSEL generation
This adds a peephole optimisation to turn a t2MOVccr that could not be
folded into any other instruction into a CSEL on 8.1-m. The t2MOVccr
would usually be expanded into a conditional mov, that becomes an IT;
MOV pair. We can instead generate a CSEL instruction, which can
potentially be smaller and allows better register allocation freedom,
which can help reduce codesize. Performance is more variable and may
depend on the microarchitecture details, but initial results look good.
If we need to control this per-cpu, we can add a subtarget feature as we
need it.

Original patch by David Penry.

Differential Revision: https://reviews.llvm.org/D83566
2020-07-16 11:10:53 +01:00

1712 lines
68 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s
; add_i32: integer add reduction over the first %n i32 elements of %x.
; Returns 0 when %n <= 0 (see the phi in for.cond.cleanup).
; The IR has a 4-wide vector.body covering the first %n.vec = %n & -4
; elements, followed by a scalar for.body covering the remaining elements.
; In this test the vector reduction is done inside vector.body and added to a
; scalar accumulator phi; the assertions below expect that to become a single
; vaddva.u32 inside a dls/le low-overhead loop.
define i32 @add_i32(i32* nocapture readonly %x, i32 %n) {
; CHECK-LABEL: add_i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cmp r1, #1
; CHECK-NEXT: blt .LBB0_3
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: mov r12, r0
; CHECK-NEXT: cmp r1, #4
; CHECK-NEXT: bhs .LBB0_4
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: movs r0, #0
; CHECK-NEXT: b .LBB0_7
; CHECK-NEXT: .LBB0_3:
; CHECK-NEXT: movs r0, #0
; CHECK-NEXT: b .LBB0_9
; CHECK-NEXT: .LBB0_4: @ %vector.ph
; CHECK-NEXT: bic r3, r1, #3
; CHECK-NEXT: movs r2, #1
; CHECK-NEXT: subs r0, r3, #4
; CHECK-NEXT: add.w lr, r2, r0, lsr #2
; CHECK-NEXT: movs r0, #0
; CHECK-NEXT: mov r2, r12
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB0_5: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r2], #16
; CHECK-NEXT: vaddva.u32 r0, q0
; CHECK-NEXT: le lr, .LBB0_5
; CHECK-NEXT: @ %bb.6: @ %middle.block
; CHECK-NEXT: cmp r3, r1
; CHECK-NEXT: it eq
; CHECK-NEXT: popeq {r7, pc}
; CHECK-NEXT: .LBB0_7: @ %for.body.preheader1
; CHECK-NEXT: sub.w lr, r1, r3
; CHECK-NEXT: add.w r1, r12, r3, lsl #2
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB0_8: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr r2, [r1], #4
; CHECK-NEXT: add r0, r2
; CHECK-NEXT: le lr, .LBB0_8
; CHECK-NEXT: .LBB0_9: @ %for.cond.cleanup
; CHECK-NEXT: pop {r7, pc}
; Guard: with %n <= 0 no loop runs and the result is the identity value 0.
entry:
%cmp6 = icmp sgt i32 %n, 0
br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
; Fewer than 4 elements: bypass the vector loop and use only the scalar loop.
for.body.preheader: ; preds = %entry
%min.iters.check = icmp ult i32 %n, 4
br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph
; %n.vec is %n rounded down to a multiple of 4 (the vector loop trip bound).
vector.ph: ; preds = %for.body.preheader
%n.vec = and i32 %n, -4
br label %vector.body
; Vector loop: load 4 x i32, reduce them to a scalar with the add-reduction
; intrinsic and add that to the running scalar sum %vec.phi.
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.phi = phi i32 [ 0, %vector.ph ], [ %3, %vector.body ]
%0 = getelementptr inbounds i32, i32* %x, i32 %index
%1 = bitcast i32* %0 to <4 x i32>*
%wide.load = load <4 x i32>, <4 x i32>* %1, align 4
%2 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %wide.load)
%3 = add i32 %2, %vec.phi
%index.next = add i32 %index, 4
%4 = icmp eq i32 %index.next, %n.vec
br i1 %4, label %middle.block, label %vector.body
; If %n was already a multiple of 4 there is no remainder to process.
middle.block: ; preds = %vector.body
%cmp.n = icmp eq i32 %n.vec, %n
br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
; Merge the start index and partial sum from the two ways of reaching the
; scalar loop (vector loop skipped vs. vector loop finished).
for.body.preheader1: ; preds = %middle.block, %for.body.preheader
%i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
%r.07.ph = phi i32 [ 0, %for.body.preheader ], [ %3, %middle.block ]
br label %for.body
; Scalar loop: one element per iteration until the index reaches %n.
for.body: ; preds = %for.body.preheader1, %for.body
%i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
%r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
%arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
%5 = load i32, i32* %arrayidx, align 4
%add = add nsw i32 %5, %r.07
%inc = add nuw nsw i32 %i.08, 1
%exitcond = icmp eq i32 %inc, %n
br i1 %exitcond, label %for.cond.cleanup, label %for.body
; Select the final result according to which path reached the exit.
for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
%r.0.lcssa = phi i32 [ 0, %entry ], [ %3, %middle.block ], [ %add, %for.body ]
ret i32 %r.0.lcssa
}
; mul_i32: integer multiply reduction over the first %n i32 elements of %x.
; Returns 1 when %n <= 0 (see the phi in for.cond.cleanup).
; vector.body keeps a <4 x i32> accumulator (initially all ones) and multiplies
; each loaded vector into it; the mul-reduction intrinsic is only called once,
; in middle.block. The assertions below expect vmul.i32 in the loop and the
; final reduction expanded into per-lane vmov plus scalar mul instructions.
define i32 @mul_i32(i32* nocapture readonly %x, i32 %n) {
; CHECK-LABEL: mul_i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: movs r2, #1
; CHECK-NEXT: cmp r1, #1
; CHECK-NEXT: blt .LBB1_8
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: cmp r1, #4
; CHECK-NEXT: bhs .LBB1_3
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: b .LBB1_6
; CHECK-NEXT: .LBB1_3: @ %vector.ph
; CHECK-NEXT: bic r12, r1, #3
; CHECK-NEXT: vmov.i32 q0, #0x1
; CHECK-NEXT: sub.w r3, r12, #4
; CHECK-NEXT: add.w lr, r2, r3, lsr #2
; CHECK-NEXT: mov r2, r0
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB1_4: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q1, [r2], #16
; CHECK-NEXT: vmul.i32 q0, q1, q0
; CHECK-NEXT: le lr, .LBB1_4
; CHECK-NEXT: @ %bb.5: @ %middle.block
; CHECK-NEXT: vmov r2, s3
; CHECK-NEXT: cmp r12, r1
; CHECK-NEXT: vmov r3, s2
; CHECK-NEXT: mul lr, r3, r2
; CHECK-NEXT: vmov r3, s1
; CHECK-NEXT: vmov r2, s0
; CHECK-NEXT: mul r2, r3, r2
; CHECK-NEXT: mul r2, r2, lr
; CHECK-NEXT: beq .LBB1_8
; CHECK-NEXT: .LBB1_6: @ %for.body.preheader1
; CHECK-NEXT: sub.w lr, r1, r12
; CHECK-NEXT: add.w r0, r0, r12, lsl #2
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB1_7: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr r1, [r0], #4
; CHECK-NEXT: muls r2, r1, r2
; CHECK-NEXT: le lr, .LBB1_7
; CHECK-NEXT: .LBB1_8: @ %for.cond.cleanup
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: pop {r7, pc}
; Guard: with %n <= 0 no loop runs and the result is the identity value 1.
entry:
%cmp6 = icmp sgt i32 %n, 0
br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
; Fewer than 4 elements: bypass the vector loop and use only the scalar loop.
for.body.preheader: ; preds = %entry
%min.iters.check = icmp ult i32 %n, 4
br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph
; %n.vec is %n rounded down to a multiple of 4 (the vector loop trip bound).
vector.ph: ; preds = %for.body.preheader
%n.vec = and i32 %n, -4
br label %vector.body
; Vector loop: lane-wise multiply of the loaded vector into the accumulator.
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.phi = phi <4 x i32> [ <i32 1, i32 1, i32 1, i32 1>, %vector.ph ], [ %2, %vector.body ]
%0 = getelementptr inbounds i32, i32* %x, i32 %index
%1 = bitcast i32* %0 to <4 x i32>*
%wide.load = load <4 x i32>, <4 x i32>* %1, align 4
%2 = mul <4 x i32> %wide.load, %vec.phi
%index.next = add i32 %index, 4
%3 = icmp eq i32 %index.next, %n.vec
br i1 %3, label %middle.block, label %vector.body
; Collapse the four lanes to one scalar, then check for a scalar remainder.
middle.block: ; preds = %vector.body
%4 = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> %2)
%cmp.n = icmp eq i32 %n.vec, %n
br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
; Merge the start index and partial product from the two ways of reaching the
; scalar loop (vector loop skipped vs. vector loop finished).
for.body.preheader1: ; preds = %middle.block, %for.body.preheader
%i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
%r.07.ph = phi i32 [ 1, %for.body.preheader ], [ %4, %middle.block ]
br label %for.body
; Scalar loop: one element per iteration until the index reaches %n.
; Note that %add is only a value name here; the operation is a multiply.
for.body: ; preds = %for.body.preheader1, %for.body
%i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
%r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
%arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
%5 = load i32, i32* %arrayidx, align 4
%add = mul nsw i32 %5, %r.07
%inc = add nuw nsw i32 %i.08, 1
%exitcond = icmp eq i32 %inc, %n
br i1 %exitcond, label %for.cond.cleanup, label %for.body
; Select the final result according to which path reached the exit.
for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
%r.0.lcssa = phi i32 [ 1, %entry ], [ %4, %middle.block ], [ %add, %for.body ]
ret i32 %r.0.lcssa
}
; and_i32: bitwise AND reduction over the first %n i32 elements of %x.
; Returns -1 (all bits set) when %n <= 0 (see the phi in for.cond.cleanup).
; vector.body keeps a <4 x i32> accumulator (initially all -1) and ANDs each
; loaded vector into it; the and-reduction intrinsic is called once, in
; middle.block. The assertions below expect vand in the loop and the final
; reduction expanded into per-lane vmov plus scalar and.w instructions.
define i32 @and_i32(i32* nocapture readonly %x, i32 %n) {
; CHECK-LABEL: and_i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cmp r1, #1
; CHECK-NEXT: blt .LBB2_3
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: cmp r1, #4
; CHECK-NEXT: bhs .LBB2_4
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: mov.w r2, #-1
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: b .LBB2_7
; CHECK-NEXT: .LBB2_3:
; CHECK-NEXT: mov.w r2, #-1
; CHECK-NEXT: b .LBB2_9
; CHECK-NEXT: .LBB2_4: @ %vector.ph
; CHECK-NEXT: bic r3, r1, #3
; CHECK-NEXT: movs r2, #1
; CHECK-NEXT: sub.w r12, r3, #4
; CHECK-NEXT: vmov.i8 q0, #0xff
; CHECK-NEXT: add.w lr, r2, r12, lsr #2
; CHECK-NEXT: mov r2, r0
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB2_5: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q1, [r2], #16
; CHECK-NEXT: vand q0, q1, q0
; CHECK-NEXT: le lr, .LBB2_5
; CHECK-NEXT: @ %bb.6: @ %middle.block
; CHECK-NEXT: vmov r12, s3
; CHECK-NEXT: cmp r3, r1
; CHECK-NEXT: vmov r2, s2
; CHECK-NEXT: vmov lr, s1
; CHECK-NEXT: and.w r12, r12, r2
; CHECK-NEXT: vmov r2, s0
; CHECK-NEXT: and.w r2, r2, lr
; CHECK-NEXT: and.w r2, r2, r12
; CHECK-NEXT: beq .LBB2_9
; CHECK-NEXT: .LBB2_7: @ %for.body.preheader1
; CHECK-NEXT: sub.w lr, r1, r3
; CHECK-NEXT: add.w r0, r0, r3, lsl #2
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB2_8: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr r1, [r0], #4
; CHECK-NEXT: ands r2, r1
; CHECK-NEXT: le lr, .LBB2_8
; CHECK-NEXT: .LBB2_9: @ %for.cond.cleanup
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: pop {r7, pc}
; Guard: with %n <= 0 no loop runs and the result is the identity value -1.
entry:
%cmp6 = icmp sgt i32 %n, 0
br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
; Fewer than 4 elements: bypass the vector loop and use only the scalar loop.
for.body.preheader: ; preds = %entry
%min.iters.check = icmp ult i32 %n, 4
br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph
; %n.vec is %n rounded down to a multiple of 4 (the vector loop trip bound).
vector.ph: ; preds = %for.body.preheader
%n.vec = and i32 %n, -4
br label %vector.body
; Vector loop: lane-wise AND of the loaded vector into the accumulator.
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.phi = phi <4 x i32> [ <i32 -1, i32 -1, i32 -1, i32 -1>, %vector.ph ], [ %2, %vector.body ]
%0 = getelementptr inbounds i32, i32* %x, i32 %index
%1 = bitcast i32* %0 to <4 x i32>*
%wide.load = load <4 x i32>, <4 x i32>* %1, align 4
%2 = and <4 x i32> %wide.load, %vec.phi
%index.next = add i32 %index, 4
%3 = icmp eq i32 %index.next, %n.vec
br i1 %3, label %middle.block, label %vector.body
; Collapse the four lanes to one scalar, then check for a scalar remainder.
middle.block: ; preds = %vector.body
%4 = call i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32> %2)
%cmp.n = icmp eq i32 %n.vec, %n
br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
; Merge the start index and partial result from the two ways of reaching the
; scalar loop (vector loop skipped vs. vector loop finished).
for.body.preheader1: ; preds = %middle.block, %for.body.preheader
%i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
%r.07.ph = phi i32 [ -1, %for.body.preheader ], [ %4, %middle.block ]
br label %for.body
; Scalar loop: one element per iteration until the index reaches %n.
; Note that %add is only a value name here; the operation is a bitwise AND.
for.body: ; preds = %for.body.preheader1, %for.body
%i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
%r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
%arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
%5 = load i32, i32* %arrayidx, align 4
%add = and i32 %5, %r.07
%inc = add nuw nsw i32 %i.08, 1
%exitcond = icmp eq i32 %inc, %n
br i1 %exitcond, label %for.cond.cleanup, label %for.body
; Select the final result according to which path reached the exit.
for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
%r.0.lcssa = phi i32 [ -1, %entry ], [ %4, %middle.block ], [ %add, %for.body ]
ret i32 %r.0.lcssa
}
; or_i32: bitwise OR reduction over the first %n i32 elements of %x.
; Returns 0 when %n <= 0 (see the phi in for.cond.cleanup).
; vector.body keeps a zero-initialised <4 x i32> accumulator and ORs each
; loaded vector into it; the or-reduction intrinsic is called once, in
; middle.block. The assertions below expect vorr in the loop and the final
; reduction expanded into per-lane vmov plus scalar orr.w instructions.
define i32 @or_i32(i32* nocapture readonly %x, i32 %n) {
; CHECK-LABEL: or_i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cmp r1, #1
; CHECK-NEXT: blt .LBB3_3
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: cmp r1, #4
; CHECK-NEXT: bhs .LBB3_4
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: b .LBB3_7
; CHECK-NEXT: .LBB3_3:
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: b .LBB3_9
; CHECK-NEXT: .LBB3_4: @ %vector.ph
; CHECK-NEXT: bic r3, r1, #3
; CHECK-NEXT: movs r2, #1
; CHECK-NEXT: sub.w r12, r3, #4
; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: add.w lr, r2, r12, lsr #2
; CHECK-NEXT: mov r2, r0
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB3_5: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q1, [r2], #16
; CHECK-NEXT: vorr q0, q1, q0
; CHECK-NEXT: le lr, .LBB3_5
; CHECK-NEXT: @ %bb.6: @ %middle.block
; CHECK-NEXT: vmov r12, s3
; CHECK-NEXT: cmp r3, r1
; CHECK-NEXT: vmov r2, s2
; CHECK-NEXT: vmov lr, s1
; CHECK-NEXT: orr.w r12, r12, r2
; CHECK-NEXT: vmov r2, s0
; CHECK-NEXT: orr.w r2, r2, lr
; CHECK-NEXT: orr.w r2, r2, r12
; CHECK-NEXT: beq .LBB3_9
; CHECK-NEXT: .LBB3_7: @ %for.body.preheader1
; CHECK-NEXT: sub.w lr, r1, r3
; CHECK-NEXT: add.w r0, r0, r3, lsl #2
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB3_8: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr r1, [r0], #4
; CHECK-NEXT: orrs r2, r1
; CHECK-NEXT: le lr, .LBB3_8
; CHECK-NEXT: .LBB3_9: @ %for.cond.cleanup
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: pop {r7, pc}
; Guard: with %n <= 0 no loop runs and the result is the identity value 0.
entry:
%cmp6 = icmp sgt i32 %n, 0
br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
; Fewer than 4 elements: bypass the vector loop and use only the scalar loop.
for.body.preheader: ; preds = %entry
%min.iters.check = icmp ult i32 %n, 4
br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph
; %n.vec is %n rounded down to a multiple of 4 (the vector loop trip bound).
vector.ph: ; preds = %for.body.preheader
%n.vec = and i32 %n, -4
br label %vector.body
; Vector loop: lane-wise OR of the loaded vector into the accumulator.
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %2, %vector.body ]
%0 = getelementptr inbounds i32, i32* %x, i32 %index
%1 = bitcast i32* %0 to <4 x i32>*
%wide.load = load <4 x i32>, <4 x i32>* %1, align 4
%2 = or <4 x i32> %wide.load, %vec.phi
%index.next = add i32 %index, 4
%3 = icmp eq i32 %index.next, %n.vec
br i1 %3, label %middle.block, label %vector.body
; Collapse the four lanes to one scalar, then check for a scalar remainder.
middle.block: ; preds = %vector.body
%4 = call i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32> %2)
%cmp.n = icmp eq i32 %n.vec, %n
br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
; Merge the start index and partial result from the two ways of reaching the
; scalar loop (vector loop skipped vs. vector loop finished).
for.body.preheader1: ; preds = %middle.block, %for.body.preheader
%i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
%r.07.ph = phi i32 [ 0, %for.body.preheader ], [ %4, %middle.block ]
br label %for.body
; Scalar loop: one element per iteration until the index reaches %n.
; Note that %add is only a value name here; the operation is a bitwise OR.
for.body: ; preds = %for.body.preheader1, %for.body
%i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
%r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
%arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
%5 = load i32, i32* %arrayidx, align 4
%add = or i32 %5, %r.07
%inc = add nuw nsw i32 %i.08, 1
%exitcond = icmp eq i32 %inc, %n
br i1 %exitcond, label %for.cond.cleanup, label %for.body
; Select the final result according to which path reached the exit.
for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
%r.0.lcssa = phi i32 [ 0, %entry ], [ %4, %middle.block ], [ %add, %for.body ]
ret i32 %r.0.lcssa
}
; xor_i32: bitwise XOR reduction over the first %n i32 elements of %x.
; Returns 0 when %n <= 0 (see the phi in for.cond.cleanup).
; vector.body keeps a zero-initialised <4 x i32> accumulator and XORs each
; loaded vector into it; the xor-reduction intrinsic is called once, in
; middle.block. The assertions below expect veor in the loop and the final
; reduction expanded into per-lane vmov plus scalar eor.w instructions.
define i32 @xor_i32(i32* nocapture readonly %x, i32 %n) {
; CHECK-LABEL: xor_i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cmp r1, #1
; CHECK-NEXT: blt .LBB4_3
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: cmp r1, #4
; CHECK-NEXT: bhs .LBB4_4
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: b .LBB4_7
; CHECK-NEXT: .LBB4_3:
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: b .LBB4_9
; CHECK-NEXT: .LBB4_4: @ %vector.ph
; CHECK-NEXT: bic r3, r1, #3
; CHECK-NEXT: movs r2, #1
; CHECK-NEXT: sub.w r12, r3, #4
; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: add.w lr, r2, r12, lsr #2
; CHECK-NEXT: mov r2, r0
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB4_5: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q1, [r2], #16
; CHECK-NEXT: veor q0, q1, q0
; CHECK-NEXT: le lr, .LBB4_5
; CHECK-NEXT: @ %bb.6: @ %middle.block
; CHECK-NEXT: vmov r12, s3
; CHECK-NEXT: cmp r3, r1
; CHECK-NEXT: vmov r2, s2
; CHECK-NEXT: vmov lr, s1
; CHECK-NEXT: eor.w r12, r12, r2
; CHECK-NEXT: vmov r2, s0
; CHECK-NEXT: eor.w r2, r2, lr
; CHECK-NEXT: eor.w r2, r2, r12
; CHECK-NEXT: beq .LBB4_9
; CHECK-NEXT: .LBB4_7: @ %for.body.preheader1
; CHECK-NEXT: sub.w lr, r1, r3
; CHECK-NEXT: add.w r0, r0, r3, lsl #2
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB4_8: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr r1, [r0], #4
; CHECK-NEXT: eors r2, r1
; CHECK-NEXT: le lr, .LBB4_8
; CHECK-NEXT: .LBB4_9: @ %for.cond.cleanup
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: pop {r7, pc}
; Guard: with %n <= 0 no loop runs and the result is the identity value 0.
entry:
%cmp6 = icmp sgt i32 %n, 0
br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
; Fewer than 4 elements: bypass the vector loop and use only the scalar loop.
for.body.preheader: ; preds = %entry
%min.iters.check = icmp ult i32 %n, 4
br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph
; %n.vec is %n rounded down to a multiple of 4 (the vector loop trip bound).
vector.ph: ; preds = %for.body.preheader
%n.vec = and i32 %n, -4
br label %vector.body
; Vector loop: lane-wise XOR of the loaded vector into the accumulator.
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %2, %vector.body ]
%0 = getelementptr inbounds i32, i32* %x, i32 %index
%1 = bitcast i32* %0 to <4 x i32>*
%wide.load = load <4 x i32>, <4 x i32>* %1, align 4
%2 = xor <4 x i32> %wide.load, %vec.phi
%index.next = add i32 %index, 4
%3 = icmp eq i32 %index.next, %n.vec
br i1 %3, label %middle.block, label %vector.body
; Collapse the four lanes to one scalar, then check for a scalar remainder.
middle.block: ; preds = %vector.body
%4 = call i32 @llvm.experimental.vector.reduce.xor.v4i32(<4 x i32> %2)
%cmp.n = icmp eq i32 %n.vec, %n
br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
; Merge the start index and partial result from the two ways of reaching the
; scalar loop (vector loop skipped vs. vector loop finished).
for.body.preheader1: ; preds = %middle.block, %for.body.preheader
%i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
%r.07.ph = phi i32 [ 0, %for.body.preheader ], [ %4, %middle.block ]
br label %for.body
; Scalar loop: one element per iteration until the index reaches %n.
; Note that %add is only a value name here; the operation is a bitwise XOR.
for.body: ; preds = %for.body.preheader1, %for.body
%i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
%r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
%arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
%5 = load i32, i32* %arrayidx, align 4
%add = xor i32 %5, %r.07
%inc = add nuw nsw i32 %i.08, 1
%exitcond = icmp eq i32 %inc, %n
br i1 %exitcond, label %for.cond.cleanup, label %for.body
; Select the final result according to which path reached the exit.
for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
%r.0.lcssa = phi i32 [ 0, %entry ], [ %4, %middle.block ], [ %add, %for.body ]
ret i32 %r.0.lcssa
}
; fadd_f32: floating-point add reduction over the first %n floats of %x.
; Returns 0.0 when %n <= 0 (see the phi in for.cond.cleanup).
; vector.body keeps a zero-initialised <4 x float> accumulator and adds each
; loaded vector into it; the fadd-reduction intrinsic (start value 0.0) is
; called once, in middle.block. All fadds and the reduce call carry the `fast`
; flag. The assertions below expect vadd.f32 on q registers in the loop, three
; scalar vadd.f32 for the final reduction, and the scalar 0.0 start value to
; be loaded from a constant-pool entry (.LCPI5_0).
define float @fadd_f32(float* nocapture readonly %x, i32 %n) {
; CHECK-LABEL: fadd_f32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cmp r1, #1
; CHECK-NEXT: blt .LBB5_3
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: cmp r1, #4
; CHECK-NEXT: bhs .LBB5_4
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: vldr s0, .LCPI5_0
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: b .LBB5_7
; CHECK-NEXT: .LBB5_3:
; CHECK-NEXT: vldr s0, .LCPI5_0
; CHECK-NEXT: b .LBB5_9
; CHECK-NEXT: .LBB5_4: @ %vector.ph
; CHECK-NEXT: bic r2, r1, #3
; CHECK-NEXT: movs r3, #1
; CHECK-NEXT: sub.w r12, r2, #4
; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: add.w lr, r3, r12, lsr #2
; CHECK-NEXT: mov r3, r0
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB5_5: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q1, [r3], #16
; CHECK-NEXT: vadd.f32 q0, q1, q0
; CHECK-NEXT: le lr, .LBB5_5
; CHECK-NEXT: @ %bb.6: @ %middle.block
; CHECK-NEXT: vadd.f32 s4, s2, s3
; CHECK-NEXT: cmp r2, r1
; CHECK-NEXT: vadd.f32 s0, s0, s1
; CHECK-NEXT: vadd.f32 s0, s0, s4
; CHECK-NEXT: beq .LBB5_9
; CHECK-NEXT: .LBB5_7: @ %for.body.preheader1
; CHECK-NEXT: sub.w lr, r1, r2
; CHECK-NEXT: add.w r0, r0, r2, lsl #2
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB5_8: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldr s2, [r0]
; CHECK-NEXT: adds r0, #4
; CHECK-NEXT: vadd.f32 s0, s2, s0
; CHECK-NEXT: le lr, .LBB5_8
; CHECK-NEXT: .LBB5_9: @ %for.cond.cleanup
; CHECK-NEXT: vmov r0, s0
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .p2align 2
; CHECK-NEXT: @ %bb.10:
; CHECK-NEXT: .LCPI5_0:
; CHECK-NEXT: .long 0x00000000 @ float 0
; Guard: with %n <= 0 no loop runs and the result is the identity value 0.0.
entry:
%cmp6 = icmp sgt i32 %n, 0
br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
; Fewer than 4 elements: bypass the vector loop and use only the scalar loop.
for.body.preheader: ; preds = %entry
%min.iters.check = icmp ult i32 %n, 4
br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph
; %n.vec is %n rounded down to a multiple of 4 (the vector loop trip bound).
vector.ph: ; preds = %for.body.preheader
%n.vec = and i32 %n, -4
br label %vector.body
; Vector loop: lane-wise fast fadd of the loaded vector into the accumulator.
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.phi = phi <4 x float> [ zeroinitializer, %vector.ph ], [ %2, %vector.body ]
%0 = getelementptr inbounds float, float* %x, i32 %index
%1 = bitcast float* %0 to <4 x float>*
%wide.load = load <4 x float>, <4 x float>* %1, align 4
%2 = fadd fast <4 x float> %wide.load, %vec.phi
%index.next = add i32 %index, 4
%3 = icmp eq i32 %index.next, %n.vec
br i1 %3, label %middle.block, label %vector.body
; Collapse the four lanes to one scalar, then check for a scalar remainder.
; NOTE(review): the `fast` flag is presumably what allows the pairwise
; expansion seen in the expected output - confirm against the backend.
middle.block: ; preds = %vector.body
%4 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> %2)
%cmp.n = icmp eq i32 %n.vec, %n
br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
; Merge the start index and partial sum from the two ways of reaching the
; scalar loop (vector loop skipped vs. vector loop finished).
for.body.preheader1: ; preds = %middle.block, %for.body.preheader
%i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
%r.07.ph = phi float [ 0.000000e+00, %for.body.preheader ], [ %4, %middle.block ]
br label %for.body
; Scalar loop: one element per iteration until the index reaches %n.
for.body: ; preds = %for.body.preheader1, %for.body
%i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
%r.07 = phi float [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
%arrayidx = getelementptr inbounds float, float* %x, i32 %i.08
%5 = load float, float* %arrayidx, align 4
%add = fadd fast float %5, %r.07
%inc = add nuw nsw i32 %i.08, 1
%exitcond = icmp eq i32 %inc, %n
br i1 %exitcond, label %for.cond.cleanup, label %for.body
; Select the final result according to which path reached the exit.
for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
%r.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %4, %middle.block ], [ %add, %for.body ]
ret float %r.0.lcssa
}
; fmul_f32: floating-point multiply reduction over the first %n floats of %x.
; Returns 1.0 when %n <= 0 (see the phi in for.cond.cleanup).
; vector.body keeps a <4 x float> accumulator (initially all 1.0) and
; multiplies each loaded vector into it; the fmul-reduction intrinsic (start
; value 1.0) is called once, in middle.block. All fmuls and the reduce call
; carry the `fast` flag. The assertions below expect vmul.f32 on q registers
; in the loop and three scalar vmul.f32 for the final reduction.
define float @fmul_f32(float* nocapture readonly %x, i32 %n) {
; CHECK-LABEL: fmul_f32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cmp r1, #1
; CHECK-NEXT: blt .LBB6_3
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: cmp r1, #4
; CHECK-NEXT: bhs .LBB6_4
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: vmov.f32 s0, #1.000000e+00
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: b .LBB6_7
; CHECK-NEXT: .LBB6_3:
; CHECK-NEXT: vmov.f32 s0, #1.000000e+00
; CHECK-NEXT: b .LBB6_9
; CHECK-NEXT: .LBB6_4: @ %vector.ph
; CHECK-NEXT: bic r2, r1, #3
; CHECK-NEXT: movs r3, #1
; CHECK-NEXT: sub.w r12, r2, #4
; CHECK-NEXT: vmov.f32 q0, #1.000000e+00
; CHECK-NEXT: add.w lr, r3, r12, lsr #2
; CHECK-NEXT: mov r3, r0
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB6_5: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q1, [r3], #16
; CHECK-NEXT: vmul.f32 q0, q1, q0
; CHECK-NEXT: le lr, .LBB6_5
; CHECK-NEXT: @ %bb.6: @ %middle.block
; CHECK-NEXT: vmul.f32 s4, s2, s3
; CHECK-NEXT: cmp r2, r1
; CHECK-NEXT: vmul.f32 s0, s0, s1
; CHECK-NEXT: vmul.f32 s0, s0, s4
; CHECK-NEXT: beq .LBB6_9
; CHECK-NEXT: .LBB6_7: @ %for.body.preheader1
; CHECK-NEXT: sub.w lr, r1, r2
; CHECK-NEXT: add.w r0, r0, r2, lsl #2
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB6_8: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldr s2, [r0]
; CHECK-NEXT: adds r0, #4
; CHECK-NEXT: vmul.f32 s0, s2, s0
; CHECK-NEXT: le lr, .LBB6_8
; CHECK-NEXT: .LBB6_9: @ %for.cond.cleanup
; CHECK-NEXT: vmov r0, s0
; CHECK-NEXT: pop {r7, pc}
; Guard: with %n <= 0 no loop runs and the result is the identity value 1.0.
entry:
%cmp6 = icmp sgt i32 %n, 0
br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
; Fewer than 4 elements: bypass the vector loop and use only the scalar loop.
for.body.preheader: ; preds = %entry
%min.iters.check = icmp ult i32 %n, 4
br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph
; %n.vec is %n rounded down to a multiple of 4 (the vector loop trip bound).
vector.ph: ; preds = %for.body.preheader
%n.vec = and i32 %n, -4
br label %vector.body
; Vector loop: lane-wise fast fmul of the loaded vector into the accumulator.
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.phi = phi <4 x float> [ <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, %vector.ph ], [ %2, %vector.body ]
%0 = getelementptr inbounds float, float* %x, i32 %index
%1 = bitcast float* %0 to <4 x float>*
%wide.load = load <4 x float>, <4 x float>* %1, align 4
%2 = fmul fast <4 x float> %wide.load, %vec.phi
%index.next = add i32 %index, 4
%3 = icmp eq i32 %index.next, %n.vec
br i1 %3, label %middle.block, label %vector.body
; Collapse the four lanes to one scalar, then check for a scalar remainder.
middle.block: ; preds = %vector.body
%4 = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float 1.000000e+00, <4 x float> %2)
%cmp.n = icmp eq i32 %n.vec, %n
br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
; Merge the start index and partial product from the two ways of reaching the
; scalar loop (vector loop skipped vs. vector loop finished).
for.body.preheader1: ; preds = %middle.block, %for.body.preheader
%i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
%r.07.ph = phi float [ 1.000000e+00, %for.body.preheader ], [ %4, %middle.block ]
br label %for.body
; Scalar loop: one element per iteration until the index reaches %n.
; Note that %add is only a value name here; the operation is an fmul.
for.body: ; preds = %for.body.preheader1, %for.body
%i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
%r.07 = phi float [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
%arrayidx = getelementptr inbounds float, float* %x, i32 %i.08
%5 = load float, float* %arrayidx, align 4
%add = fmul fast float %5, %r.07
%inc = add nuw nsw i32 %i.08, 1
%exitcond = icmp eq i32 %inc, %n
br i1 %exitcond, label %for.cond.cleanup, label %for.body
; Select the final result according to which path reached the exit.
for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
%r.0.lcssa = phi float [ 1.000000e+00, %entry ], [ %4, %middle.block ], [ %add, %for.body ]
ret float %r.0.lcssa
}
; smin_i32: signed-minimum reduction over the first %n i32 elements of %x.
; Returns 2147483647 when %n <= 0 (see the phi in for.cond.cleanup).
; vector.body keeps a <4 x i32> accumulator (initially all 2147483647) and
; takes the lane-wise signed minimum via icmp slt + select; the smin-reduction
; intrinsic is called once, in middle.block. The assertions below expect
; vmin.s32 in the loop, a vminv.s32 whose scalar operand is first set with
; mvn #-2147483648, and cmp + csel (lt) in the scalar remainder loop.
define i32 @smin_i32(i32* nocapture readonly %x, i32 %n) {
; CHECK-LABEL: smin_i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cmp r1, #1
; CHECK-NEXT: blt .LBB7_3
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: cmp r1, #4
; CHECK-NEXT: bhs .LBB7_4
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: mvn r2, #-2147483648
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: b .LBB7_7
; CHECK-NEXT: .LBB7_3:
; CHECK-NEXT: mvn r2, #-2147483648
; CHECK-NEXT: b .LBB7_9
; CHECK-NEXT: .LBB7_4: @ %vector.ph
; CHECK-NEXT: bic r3, r1, #3
; CHECK-NEXT: movs r2, #1
; CHECK-NEXT: sub.w r12, r3, #4
; CHECK-NEXT: vmvn.i32 q0, #0x80000000
; CHECK-NEXT: add.w lr, r2, r12, lsr #2
; CHECK-NEXT: mov r2, r0
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB7_5: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q1, [r2], #16
; CHECK-NEXT: vmin.s32 q0, q0, q1
; CHECK-NEXT: le lr, .LBB7_5
; CHECK-NEXT: @ %bb.6: @ %middle.block
; CHECK-NEXT: mvn r2, #-2147483648
; CHECK-NEXT: cmp r3, r1
; CHECK-NEXT: vminv.s32 r2, q0
; CHECK-NEXT: beq .LBB7_9
; CHECK-NEXT: .LBB7_7: @ %for.body.preheader1
; CHECK-NEXT: sub.w lr, r1, r3
; CHECK-NEXT: add.w r0, r0, r3, lsl #2
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB7_8: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr r1, [r0], #4
; CHECK-NEXT: cmp r2, r1
; CHECK-NEXT: csel r2, r2, r1, lt
; CHECK-NEXT: le lr, .LBB7_8
; CHECK-NEXT: .LBB7_9: @ %for.cond.cleanup
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: pop {r7, pc}
; Guard: with %n <= 0 no loop runs and the result is the start value
; 2147483647.
entry:
%cmp6 = icmp sgt i32 %n, 0
br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
; Fewer than 4 elements: bypass the vector loop and use only the scalar loop.
for.body.preheader: ; preds = %entry
%min.iters.check = icmp ult i32 %n, 4
br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph
; %n.vec is %n rounded down to a multiple of 4 (the vector loop trip bound).
vector.ph: ; preds = %for.body.preheader
%n.vec = and i32 %n, -4
br label %vector.body
; Vector loop: lane-wise signed min, written as compare + select.
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.phi = phi <4 x i32> [ <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>, %vector.ph ], [ %3, %vector.body ]
%0 = getelementptr inbounds i32, i32* %x, i32 %index
%1 = bitcast i32* %0 to <4 x i32>*
%wide.load = load <4 x i32>, <4 x i32>* %1, align 4
%2 = icmp slt <4 x i32> %vec.phi, %wide.load
%3 = select <4 x i1> %2, <4 x i32> %vec.phi, <4 x i32> %wide.load
%index.next = add i32 %index, 4
%4 = icmp eq i32 %index.next, %n.vec
br i1 %4, label %middle.block, label %vector.body
; Collapse the four lanes to one scalar, then check for a scalar remainder.
middle.block: ; preds = %vector.body
%5 = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> %3)
%cmp.n = icmp eq i32 %n.vec, %n
br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
; Merge the start index and partial minimum from the two ways of reaching the
; scalar loop (vector loop skipped vs. vector loop finished).
for.body.preheader1: ; preds = %middle.block, %for.body.preheader
%i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
%r.07.ph = phi i32 [ 2147483647, %for.body.preheader ], [ %5, %middle.block ]
br label %for.body
; Scalar loop: one element per iteration until the index reaches %n.
; Note that %add is only a value name here; it holds the selected minimum.
for.body: ; preds = %for.body.preheader1, %for.body
%i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
%r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
%arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
%6 = load i32, i32* %arrayidx, align 4
%c = icmp slt i32 %r.07, %6
%add = select i1 %c, i32 %r.07, i32 %6
%inc = add nuw nsw i32 %i.08, 1
%exitcond = icmp eq i32 %inc, %n
br i1 %exitcond, label %for.cond.cleanup, label %for.body
; Select the final result according to which path reached the exit.
for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
%r.0.lcssa = phi i32 [ 2147483647, %entry ], [ %5, %middle.block ], [ %add, %for.body ]
ret i32 %r.0.lcssa
}
; smin_i32_inloop: signed-minimum reduction over the first %n i32 elements of
; %x, with the vector reduction performed inside the vector loop.
; Returns 2147483647 when %n <= 0 (see the phi in for.cond.cleanup).
; Each vector.body iteration reduces the loaded <4 x i32> to a scalar with the
; smin-reduction intrinsic and then takes the scalar minimum (icmp slt +
; select) against the running accumulator phi. The assertions below expect,
; per iteration, r4 set with mvn #-2147483648, a vminv.s32 into r4, and a
; cmp + csel (lt) against r0; r4 is saved and restored by the function.
define i32 @smin_i32_inloop(i32* nocapture readonly %x, i32 %n) {
; CHECK-LABEL: smin_i32_inloop:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, lr}
; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: cmp r1, #1
; CHECK-NEXT: blt .LBB8_3
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: mov r12, r0
; CHECK-NEXT: mvn r0, #-2147483648
; CHECK-NEXT: cmp r1, #4
; CHECK-NEXT: bhs .LBB8_4
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: b .LBB8_7
; CHECK-NEXT: .LBB8_3:
; CHECK-NEXT: mvn r0, #-2147483648
; CHECK-NEXT: b .LBB8_9
; CHECK-NEXT: .LBB8_4: @ %vector.ph
; CHECK-NEXT: bic r3, r1, #3
; CHECK-NEXT: movs r2, #1
; CHECK-NEXT: sub.w lr, r3, #4
; CHECK-NEXT: add.w lr, r2, lr, lsr #2
; CHECK-NEXT: mov r2, r12
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB8_5: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r2], #16
; CHECK-NEXT: mvn r4, #-2147483648
; CHECK-NEXT: vminv.s32 r4, q0
; CHECK-NEXT: cmp r0, r4
; CHECK-NEXT: csel r0, r0, r4, lt
; CHECK-NEXT: le lr, .LBB8_5
; CHECK-NEXT: @ %bb.6: @ %middle.block
; CHECK-NEXT: cmp r3, r1
; CHECK-NEXT: it eq
; CHECK-NEXT: popeq {r4, pc}
; CHECK-NEXT: .LBB8_7: @ %for.body.preheader1
; CHECK-NEXT: sub.w lr, r1, r3
; CHECK-NEXT: add.w r1, r12, r3, lsl #2
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB8_8: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr r2, [r1], #4
; CHECK-NEXT: cmp r0, r2
; CHECK-NEXT: csel r0, r0, r2, lt
; CHECK-NEXT: le lr, .LBB8_8
; CHECK-NEXT: .LBB8_9: @ %for.cond.cleanup
; CHECK-NEXT: pop {r4, pc}
; Guard: with %n <= 0 no loop runs and the result is the start value
; 2147483647.
entry:
%cmp6 = icmp sgt i32 %n, 0
br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
; Fewer than 4 elements: bypass the vector loop and use only the scalar loop.
for.body.preheader: ; preds = %entry
%min.iters.check = icmp ult i32 %n, 4
br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph
; %n.vec is %n rounded down to a multiple of 4 (the vector loop trip bound).
vector.ph: ; preds = %for.body.preheader
%n.vec = and i32 %n, -4
br label %vector.body
; Vector loop: reduce the loaded vector to its signed minimum %l5, then keep
; the smaller of %l5 and the running scalar minimum %vec.phi.
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.phi = phi i32 [ 2147483647, %vector.ph ], [ %3, %vector.body ]
%0 = getelementptr inbounds i32, i32* %x, i32 %index
%1 = bitcast i32* %0 to <4 x i32>*
%wide.load = load <4 x i32>, <4 x i32>* %1, align 4
%l5 = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> %wide.load)
%2 = icmp slt i32 %vec.phi, %l5
%3 = select i1 %2, i32 %vec.phi, i32 %l5
%index.next = add i32 %index, 4
%4 = icmp eq i32 %index.next, %n.vec
br i1 %4, label %middle.block, label %vector.body
; %5 is a single-entry phi that just forwards the loop result %3; then check
; whether a scalar remainder is left.
middle.block: ; preds = %vector.body
%5 = phi i32 [ %3, %vector.body ]
%cmp.n = icmp eq i32 %n.vec, %n
br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
; Merge the start index and partial minimum from the two ways of reaching the
; scalar loop (vector loop skipped vs. vector loop finished).
for.body.preheader1: ; preds = %middle.block, %for.body.preheader
%i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
%r.07.ph = phi i32 [ 2147483647, %for.body.preheader ], [ %5, %middle.block ]
br label %for.body
; Scalar loop: one element per iteration until the index reaches %n.
; Note that %add is only a value name here; it holds the selected minimum.
for.body: ; preds = %for.body.preheader1, %for.body
%i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
%r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
%arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
%6 = load i32, i32* %arrayidx, align 4
%c = icmp slt i32 %r.07, %6
%add = select i1 %c, i32 %r.07, i32 %6
%inc = add nuw nsw i32 %i.08, 1
%exitcond = icmp eq i32 %inc, %n
br i1 %exitcond, label %for.cond.cleanup, label %for.body
; Select the final result according to which path reached the exit.
for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
%r.0.lcssa = phi i32 [ 2147483647, %entry ], [ %5, %middle.block ], [ %add, %for.body ]
ret i32 %r.0.lcssa
}
; smax_i32: signed-maximum reduction over the %n i32 values starting at %x.
; The vector loop carries a <4 x i32> accumulator (every lane seeded with
; -2147483648) and reduces it to a scalar once, in middle.block. A scalar tail
; loop then folds in the elements past the last multiple of four (or all of
; them when %n < 4). Returns -2147483648 when %n <= 0.
define i32 @smax_i32(i32* nocapture readonly %x, i32 %n) {
; CHECK-LABEL: smax_i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cmp r1, #1
; CHECK-NEXT: blt .LBB9_3
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: cmp r1, #4
; CHECK-NEXT: bhs .LBB9_4
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: mov.w r2, #-2147483648
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: b .LBB9_7
; CHECK-NEXT: .LBB9_3:
; CHECK-NEXT: mov.w r2, #-2147483648
; CHECK-NEXT: b .LBB9_9
; CHECK-NEXT: .LBB9_4: @ %vector.ph
; CHECK-NEXT: bic r3, r1, #3
; CHECK-NEXT: movs r2, #1
; CHECK-NEXT: sub.w r12, r3, #4
; CHECK-NEXT: vmov.i32 q0, #0x80000000
; CHECK-NEXT: add.w lr, r2, r12, lsr #2
; CHECK-NEXT: mov r2, r0
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB9_5: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q1, [r2], #16
; CHECK-NEXT: vmax.s32 q0, q0, q1
; CHECK-NEXT: le lr, .LBB9_5
; CHECK-NEXT: @ %bb.6: @ %middle.block
; CHECK-NEXT: mov.w r2, #-2147483648
; CHECK-NEXT: cmp r3, r1
; CHECK-NEXT: vmaxv.s32 r2, q0
; CHECK-NEXT: beq .LBB9_9
; CHECK-NEXT: .LBB9_7: @ %for.body.preheader1
; CHECK-NEXT: sub.w lr, r1, r3
; CHECK-NEXT: add.w r0, r0, r3, lsl #2
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB9_8: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr r1, [r0], #4
; CHECK-NEXT: cmp r2, r1
; CHECK-NEXT: csel r2, r2, r1, gt
; CHECK-NEXT: le lr, .LBB9_8
; CHECK-NEXT: .LBB9_9: @ %for.cond.cleanup
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: pop {r7, pc}
entry:
%cmp6 = icmp sgt i32 %n, 0
br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
for.body.preheader: ; preds = %entry
; Fewer than four elements: bypass the vector loop and run only the scalar tail.
%min.iters.check = icmp ult i32 %n, 4
br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph
vector.ph: ; preds = %for.body.preheader
; %n.vec is %n rounded down to a multiple of four; it bounds the vector loop.
%n.vec = and i32 %n, -4
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.phi = phi <4 x i32> [ <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>, %vector.ph ], [ %3, %vector.body ]
%0 = getelementptr inbounds i32, i32* %x, i32 %index
%1 = bitcast i32* %0 to <4 x i32>*
%wide.load = load <4 x i32>, <4 x i32>* %1, align 4
; Lane-wise signed max of the accumulator and the freshly loaded vector.
%2 = icmp sgt <4 x i32> %vec.phi, %wide.load
%3 = select <4 x i1> %2, <4 x i32> %vec.phi, <4 x i32> %wide.load
%index.next = add i32 %index, 4
%4 = icmp eq i32 %index.next, %n.vec
br i1 %4, label %middle.block, label %vector.body
middle.block: ; preds = %vector.body
; Single horizontal reduction of the vector accumulator after the loop.
%5 = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> %3)
%cmp.n = icmp eq i32 %n.vec, %n
br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
for.body.preheader1: ; preds = %middle.block, %for.body.preheader
%i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
%r.07.ph = phi i32 [ -2147483648, %for.body.preheader ], [ %5, %middle.block ]
br label %for.body
for.body: ; preds = %for.body.preheader1, %for.body
%i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
%r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
%arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
%6 = load i32, i32* %arrayidx, align 4
; Despite its name, %add is the signed max of the running value and x[i].
%c = icmp sgt i32 %r.07, %6
%add = select i1 %c, i32 %r.07, i32 %6
%inc = add nuw nsw i32 %i.08, 1
%exitcond = icmp eq i32 %inc, %n
br i1 %exitcond, label %for.cond.cleanup, label %for.body
for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
%r.0.lcssa = phi i32 [ -2147483648, %entry ], [ %5, %middle.block ], [ %add, %for.body ]
ret i32 %r.0.lcssa
}
; smax_i32_inloop: signed-maximum reduction over the %n i32 values at %x, with
; the horizontal reduction performed inside the vector loop. Each iteration
; reduces the loaded <4 x i32> to a scalar and folds it into a scalar i32
; accumulator seeded with -2147483648. A scalar tail loop handles the elements
; past the last multiple of four. Returns -2147483648 when %n <= 0.
define i32 @smax_i32_inloop(i32* nocapture readonly %x, i32 %n) {
; CHECK-LABEL: smax_i32_inloop:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, lr}
; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: cmp r1, #1
; CHECK-NEXT: blt .LBB10_3
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: mov r12, r0
; CHECK-NEXT: mov.w r0, #-2147483648
; CHECK-NEXT: cmp r1, #4
; CHECK-NEXT: bhs .LBB10_4
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: b .LBB10_7
; CHECK-NEXT: .LBB10_3:
; CHECK-NEXT: mov.w r0, #-2147483648
; CHECK-NEXT: b .LBB10_9
; CHECK-NEXT: .LBB10_4: @ %vector.ph
; CHECK-NEXT: bic r3, r1, #3
; CHECK-NEXT: movs r2, #1
; CHECK-NEXT: sub.w lr, r3, #4
; CHECK-NEXT: add.w lr, r2, lr, lsr #2
; CHECK-NEXT: mov r2, r12
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB10_5: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r2], #16
; CHECK-NEXT: mov.w r4, #-2147483648
; CHECK-NEXT: vmaxv.s32 r4, q0
; CHECK-NEXT: cmp r0, r4
; CHECK-NEXT: csel r0, r0, r4, gt
; CHECK-NEXT: le lr, .LBB10_5
; CHECK-NEXT: @ %bb.6: @ %middle.block
; CHECK-NEXT: cmp r3, r1
; CHECK-NEXT: it eq
; CHECK-NEXT: popeq {r4, pc}
; CHECK-NEXT: .LBB10_7: @ %for.body.preheader1
; CHECK-NEXT: sub.w lr, r1, r3
; CHECK-NEXT: add.w r1, r12, r3, lsl #2
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB10_8: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr r2, [r1], #4
; CHECK-NEXT: cmp r0, r2
; CHECK-NEXT: csel r0, r0, r2, gt
; CHECK-NEXT: le lr, .LBB10_8
; CHECK-NEXT: .LBB10_9: @ %for.cond.cleanup
; CHECK-NEXT: pop {r4, pc}
entry:
%cmp6 = icmp sgt i32 %n, 0
br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
for.body.preheader: ; preds = %entry
; Fewer than four elements: bypass the vector loop and run only the scalar tail.
%min.iters.check = icmp ult i32 %n, 4
br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph
vector.ph: ; preds = %for.body.preheader
; %n.vec is %n rounded down to a multiple of four; it bounds the vector loop.
%n.vec = and i32 %n, -4
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.phi = phi i32 [ -2147483648, %vector.ph ], [ %3, %vector.body ]
%0 = getelementptr inbounds i32, i32* %x, i32 %index
%1 = bitcast i32* %0 to <4 x i32>*
%wide.load = load <4 x i32>, <4 x i32>* %1, align 4
; In-loop horizontal reduction, then a scalar signed max with the accumulator.
%l5 = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> %wide.load)
%2 = icmp sgt i32 %vec.phi, %l5
%3 = select i1 %2, i32 %vec.phi, i32 %l5
%index.next = add i32 %index, 4
%4 = icmp eq i32 %index.next, %n.vec
br i1 %4, label %middle.block, label %vector.body
middle.block: ; preds = %vector.body
%5 = phi i32 [ %3, %vector.body ]
%cmp.n = icmp eq i32 %n.vec, %n
br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
for.body.preheader1: ; preds = %middle.block, %for.body.preheader
%i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
%r.07.ph = phi i32 [ -2147483648, %for.body.preheader ], [ %5, %middle.block ]
br label %for.body
for.body: ; preds = %for.body.preheader1, %for.body
%i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
%r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
%arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
%6 = load i32, i32* %arrayidx, align 4
; Despite its name, %add is the signed max of the running value and x[i].
%c = icmp sgt i32 %r.07, %6
%add = select i1 %c, i32 %r.07, i32 %6
%inc = add nuw nsw i32 %i.08, 1
%exitcond = icmp eq i32 %inc, %n
br i1 %exitcond, label %for.cond.cleanup, label %for.body
for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
%r.0.lcssa = phi i32 [ -2147483648, %entry ], [ %5, %middle.block ], [ %add, %for.body ]
ret i32 %r.0.lcssa
}
; umin_i32: unsigned-minimum reduction over the %n i32 values starting at %x.
; The vector loop carries a <4 x i32> accumulator (every lane seeded with -1,
; i.e. all bits set) and reduces it to a scalar once, in middle.block. A scalar
; tail loop then folds in the elements past the last multiple of four (or all
; of them when %n < 4). Returns -1 when %n <= 0.
define i32 @umin_i32(i32* nocapture readonly %x, i32 %n) {
; CHECK-LABEL: umin_i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cmp r1, #1
; CHECK-NEXT: blt .LBB11_3
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: cmp r1, #4
; CHECK-NEXT: bhs .LBB11_4
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: mov.w r2, #-1
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: b .LBB11_7
; CHECK-NEXT: .LBB11_3:
; CHECK-NEXT: mov.w r2, #-1
; CHECK-NEXT: b .LBB11_9
; CHECK-NEXT: .LBB11_4: @ %vector.ph
; CHECK-NEXT: bic r3, r1, #3
; CHECK-NEXT: movs r2, #1
; CHECK-NEXT: sub.w r12, r3, #4
; CHECK-NEXT: vmov.i8 q0, #0xff
; CHECK-NEXT: add.w lr, r2, r12, lsr #2
; CHECK-NEXT: mov r2, r0
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB11_5: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q1, [r2], #16
; CHECK-NEXT: vmin.u32 q0, q0, q1
; CHECK-NEXT: le lr, .LBB11_5
; CHECK-NEXT: @ %bb.6: @ %middle.block
; CHECK-NEXT: mov.w r2, #-1
; CHECK-NEXT: cmp r3, r1
; CHECK-NEXT: vminv.u32 r2, q0
; CHECK-NEXT: beq .LBB11_9
; CHECK-NEXT: .LBB11_7: @ %for.body.preheader1
; CHECK-NEXT: sub.w lr, r1, r3
; CHECK-NEXT: add.w r0, r0, r3, lsl #2
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB11_8: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr r1, [r0], #4
; CHECK-NEXT: cmp r2, r1
; CHECK-NEXT: csel r2, r2, r1, lo
; CHECK-NEXT: le lr, .LBB11_8
; CHECK-NEXT: .LBB11_9: @ %for.cond.cleanup
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: pop {r7, pc}
entry:
%cmp6 = icmp sgt i32 %n, 0
br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
for.body.preheader: ; preds = %entry
; Fewer than four elements: bypass the vector loop and run only the scalar tail.
%min.iters.check = icmp ult i32 %n, 4
br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph
vector.ph: ; preds = %for.body.preheader
; %n.vec is %n rounded down to a multiple of four; it bounds the vector loop.
%n.vec = and i32 %n, -4
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.phi = phi <4 x i32> [ <i32 -1, i32 -1, i32 -1, i32 -1>, %vector.ph ], [ %3, %vector.body ]
%0 = getelementptr inbounds i32, i32* %x, i32 %index
%1 = bitcast i32* %0 to <4 x i32>*
%wide.load = load <4 x i32>, <4 x i32>* %1, align 4
; Lane-wise unsigned min of the accumulator and the freshly loaded vector.
%2 = icmp ult <4 x i32> %vec.phi, %wide.load
%3 = select <4 x i1> %2, <4 x i32> %vec.phi, <4 x i32> %wide.load
%index.next = add i32 %index, 4
%4 = icmp eq i32 %index.next, %n.vec
br i1 %4, label %middle.block, label %vector.body
middle.block: ; preds = %vector.body
; Single horizontal reduction of the vector accumulator after the loop.
%5 = call i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32> %3)
%cmp.n = icmp eq i32 %n.vec, %n
br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
for.body.preheader1: ; preds = %middle.block, %for.body.preheader
%i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
%r.07.ph = phi i32 [ -1, %for.body.preheader ], [ %5, %middle.block ]
br label %for.body
for.body: ; preds = %for.body.preheader1, %for.body
%i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
%r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
%arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
%6 = load i32, i32* %arrayidx, align 4
; Despite its name, %add is the unsigned min of the running value and x[i].
%c = icmp ult i32 %r.07, %6
%add = select i1 %c, i32 %r.07, i32 %6
%inc = add nuw nsw i32 %i.08, 1
%exitcond = icmp eq i32 %inc, %n
br i1 %exitcond, label %for.cond.cleanup, label %for.body
for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
%r.0.lcssa = phi i32 [ -1, %entry ], [ %5, %middle.block ], [ %add, %for.body ]
ret i32 %r.0.lcssa
}
; umin_i32_inloop: unsigned-minimum reduction over the %n i32 values at %x,
; with the horizontal reduction performed inside the vector loop. Each
; iteration reduces the loaded <4 x i32> to a scalar and folds it into a scalar
; i32 accumulator seeded with -1 (all bits set). A scalar tail loop handles the
; elements past the last multiple of four. Returns -1 when %n <= 0.
;
; NOTE: the scalar tail loop uses an unsigned less-than compare so that it
; computes a minimum like the vector body; the expected csel condition in the
; %for.body assertions is "lo" accordingly. Re-run
; utils/update_llc_test_checks.py to confirm the assertions after any edit.
define i32 @umin_i32_inloop(i32* nocapture readonly %x, i32 %n) {
; CHECK-LABEL: umin_i32_inloop:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, lr}
; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: cmp r1, #1
; CHECK-NEXT: blt .LBB12_3
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: mov r12, r0
; CHECK-NEXT: mov.w r0, #-1
; CHECK-NEXT: cmp r1, #4
; CHECK-NEXT: bhs .LBB12_4
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: b .LBB12_7
; CHECK-NEXT: .LBB12_3:
; CHECK-NEXT: mov.w r0, #-1
; CHECK-NEXT: b .LBB12_9
; CHECK-NEXT: .LBB12_4: @ %vector.ph
; CHECK-NEXT: bic r3, r1, #3
; CHECK-NEXT: movs r2, #1
; CHECK-NEXT: sub.w lr, r3, #4
; CHECK-NEXT: add.w lr, r2, lr, lsr #2
; CHECK-NEXT: mov r2, r12
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB12_5: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r2], #16
; CHECK-NEXT: mov.w r4, #-1
; CHECK-NEXT: vminv.u32 r4, q0
; CHECK-NEXT: cmp r0, r4
; CHECK-NEXT: csel r0, r0, r4, lo
; CHECK-NEXT: le lr, .LBB12_5
; CHECK-NEXT: @ %bb.6: @ %middle.block
; CHECK-NEXT: cmp r3, r1
; CHECK-NEXT: it eq
; CHECK-NEXT: popeq {r4, pc}
; CHECK-NEXT: .LBB12_7: @ %for.body.preheader1
; CHECK-NEXT: sub.w lr, r1, r3
; CHECK-NEXT: add.w r1, r12, r3, lsl #2
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB12_8: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr r2, [r1], #4
; CHECK-NEXT: cmp r0, r2
; CHECK-NEXT: csel r0, r0, r2, lo
; CHECK-NEXT: le lr, .LBB12_8
; CHECK-NEXT: .LBB12_9: @ %for.cond.cleanup
; CHECK-NEXT: pop {r4, pc}
entry:
%cmp6 = icmp sgt i32 %n, 0
br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
for.body.preheader: ; preds = %entry
; Fewer than four elements: bypass the vector loop and run only the scalar tail.
%min.iters.check = icmp ult i32 %n, 4
br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph
vector.ph: ; preds = %for.body.preheader
; %n.vec is %n rounded down to a multiple of four; it bounds the vector loop.
%n.vec = and i32 %n, -4
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.phi = phi i32 [ -1, %vector.ph ], [ %3, %vector.body ]
%0 = getelementptr inbounds i32, i32* %x, i32 %index
%1 = bitcast i32* %0 to <4 x i32>*
%wide.load = load <4 x i32>, <4 x i32>* %1, align 4
; In-loop horizontal reduction, then a scalar unsigned min with the accumulator.
%l5 = call i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32> %wide.load)
%2 = icmp ult i32 %vec.phi, %l5
%3 = select i1 %2, i32 %vec.phi, i32 %l5
%index.next = add i32 %index, 4
%4 = icmp eq i32 %index.next, %n.vec
br i1 %4, label %middle.block, label %vector.body
middle.block: ; preds = %vector.body
%5 = phi i32 [ %3, %vector.body ]
%cmp.n = icmp eq i32 %n.vec, %n
br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
for.body.preheader1: ; preds = %middle.block, %for.body.preheader
%i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
%r.07.ph = phi i32 [ -1, %for.body.preheader ], [ %5, %middle.block ]
br label %for.body
for.body: ; preds = %for.body.preheader1, %for.body
%i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
%r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
%arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
%6 = load i32, i32* %arrayidx, align 4
; Unsigned min of the running value and x[i]; must use the same predicate as
; the vector body (ult), otherwise the tail would compute a maximum.
%c = icmp ult i32 %r.07, %6
%add = select i1 %c, i32 %r.07, i32 %6
%inc = add nuw nsw i32 %i.08, 1
%exitcond = icmp eq i32 %inc, %n
br i1 %exitcond, label %for.cond.cleanup, label %for.body
for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
%r.0.lcssa = phi i32 [ -1, %entry ], [ %5, %middle.block ], [ %add, %for.body ]
ret i32 %r.0.lcssa
}
; umax_i32: unsigned-maximum reduction over the %n i32 values starting at %x.
; The vector loop carries a <4 x i32> accumulator (seeded with zeroinitializer)
; and reduces it to a scalar once, in middle.block. A scalar tail loop then
; folds in the elements past the last multiple of four (or all of them when
; %n < 4). Returns 0 when %n <= 0.
define i32 @umax_i32(i32* nocapture readonly %x, i32 %n) {
; CHECK-LABEL: umax_i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cmp r1, #1
; CHECK-NEXT: blt .LBB13_3
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: cmp r1, #4
; CHECK-NEXT: bhs .LBB13_4
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: b .LBB13_7
; CHECK-NEXT: .LBB13_3:
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: b .LBB13_9
; CHECK-NEXT: .LBB13_4: @ %vector.ph
; CHECK-NEXT: bic r3, r1, #3
; CHECK-NEXT: movs r2, #1
; CHECK-NEXT: sub.w r12, r3, #4
; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: add.w lr, r2, r12, lsr #2
; CHECK-NEXT: mov r2, r0
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB13_5: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q1, [r2], #16
; CHECK-NEXT: vmax.u32 q0, q0, q1
; CHECK-NEXT: le lr, .LBB13_5
; CHECK-NEXT: @ %bb.6: @ %middle.block
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: cmp r3, r1
; CHECK-NEXT: vmaxv.u32 r2, q0
; CHECK-NEXT: beq .LBB13_9
; CHECK-NEXT: .LBB13_7: @ %for.body.preheader1
; CHECK-NEXT: sub.w lr, r1, r3
; CHECK-NEXT: add.w r0, r0, r3, lsl #2
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB13_8: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr r1, [r0], #4
; CHECK-NEXT: cmp r2, r1
; CHECK-NEXT: csel r2, r2, r1, hi
; CHECK-NEXT: le lr, .LBB13_8
; CHECK-NEXT: .LBB13_9: @ %for.cond.cleanup
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: pop {r7, pc}
entry:
%cmp6 = icmp sgt i32 %n, 0
br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
for.body.preheader: ; preds = %entry
; Fewer than four elements: bypass the vector loop and run only the scalar tail.
%min.iters.check = icmp ult i32 %n, 4
br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph
vector.ph: ; preds = %for.body.preheader
; %n.vec is %n rounded down to a multiple of four; it bounds the vector loop.
%n.vec = and i32 %n, -4
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %3, %vector.body ]
%0 = getelementptr inbounds i32, i32* %x, i32 %index
%1 = bitcast i32* %0 to <4 x i32>*
%wide.load = load <4 x i32>, <4 x i32>* %1, align 4
; Lane-wise unsigned max of the accumulator and the freshly loaded vector.
%2 = icmp ugt <4 x i32> %vec.phi, %wide.load
%3 = select <4 x i1> %2, <4 x i32> %vec.phi, <4 x i32> %wide.load
%index.next = add i32 %index, 4
%4 = icmp eq i32 %index.next, %n.vec
br i1 %4, label %middle.block, label %vector.body
middle.block: ; preds = %vector.body
; Single horizontal reduction of the vector accumulator after the loop.
%5 = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> %3)
%cmp.n = icmp eq i32 %n.vec, %n
br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
for.body.preheader1: ; preds = %middle.block, %for.body.preheader
%i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
%r.07.ph = phi i32 [ 0, %for.body.preheader ], [ %5, %middle.block ]
br label %for.body
for.body: ; preds = %for.body.preheader1, %for.body
%i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
%r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
%arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
%6 = load i32, i32* %arrayidx, align 4
; Despite its name, %add is the unsigned max of the running value and x[i].
%c = icmp ugt i32 %r.07, %6
%add = select i1 %c, i32 %r.07, i32 %6
%inc = add nuw nsw i32 %i.08, 1
%exitcond = icmp eq i32 %inc, %n
br i1 %exitcond, label %for.cond.cleanup, label %for.body
for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
%r.0.lcssa = phi i32 [ 0, %entry ], [ %5, %middle.block ], [ %add, %for.body ]
ret i32 %r.0.lcssa
}
; umax_i32_inloop: unsigned-maximum reduction over the %n i32 values at %x,
; with the horizontal reduction performed inside the vector loop. Each
; iteration reduces the loaded <4 x i32> to a scalar and folds it into a scalar
; i32 accumulator seeded with 0. A scalar tail loop handles the elements past
; the last multiple of four. Returns 0 when %n <= 0.
define i32 @umax_i32_inloop(i32* nocapture readonly %x, i32 %n) {
; CHECK-LABEL: umax_i32_inloop:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, lr}
; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: cmp r1, #1
; CHECK-NEXT: blt .LBB14_8
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: mov r12, r0
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: cmp r1, #4
; CHECK-NEXT: mov.w r0, #0
; CHECK-NEXT: blo .LBB14_5
; CHECK-NEXT: @ %bb.2: @ %vector.ph
; CHECK-NEXT: bic r3, r1, #3
; CHECK-NEXT: movs r2, #1
; CHECK-NEXT: subs r0, r3, #4
; CHECK-NEXT: add.w lr, r2, r0, lsr #2
; CHECK-NEXT: movs r0, #0
; CHECK-NEXT: mov r2, r12
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB14_3: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r2], #16
; CHECK-NEXT: movs r4, #0
; CHECK-NEXT: vmaxv.u32 r4, q0
; CHECK-NEXT: cmp r0, r4
; CHECK-NEXT: csel r0, r0, r4, hi
; CHECK-NEXT: le lr, .LBB14_3
; CHECK-NEXT: @ %bb.4: @ %middle.block
; CHECK-NEXT: cmp r3, r1
; CHECK-NEXT: it eq
; CHECK-NEXT: popeq {r4, pc}
; CHECK-NEXT: .LBB14_5: @ %for.body.preheader1
; CHECK-NEXT: sub.w lr, r1, r3
; CHECK-NEXT: add.w r1, r12, r3, lsl #2
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB14_6: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr r2, [r1], #4
; CHECK-NEXT: cmp r0, r2
; CHECK-NEXT: csel r0, r0, r2, hi
; CHECK-NEXT: le lr, .LBB14_6
; CHECK-NEXT: @ %bb.7: @ %for.cond.cleanup
; CHECK-NEXT: pop {r4, pc}
; CHECK-NEXT: .LBB14_8:
; CHECK-NEXT: movs r0, #0
; CHECK-NEXT: pop {r4, pc}
entry:
%cmp6 = icmp sgt i32 %n, 0
br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
for.body.preheader: ; preds = %entry
; Fewer than four elements: bypass the vector loop and run only the scalar tail.
%min.iters.check = icmp ult i32 %n, 4
br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph
vector.ph: ; preds = %for.body.preheader
; %n.vec is %n rounded down to a multiple of four; it bounds the vector loop.
%n.vec = and i32 %n, -4
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.phi = phi i32 [ 0, %vector.ph ], [ %3, %vector.body ]
%0 = getelementptr inbounds i32, i32* %x, i32 %index
%1 = bitcast i32* %0 to <4 x i32>*
%wide.load = load <4 x i32>, <4 x i32>* %1, align 4
; In-loop horizontal reduction, then a scalar unsigned max with the accumulator.
%l5 = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> %wide.load)
%2 = icmp ugt i32 %vec.phi, %l5
%3 = select i1 %2, i32 %vec.phi, i32 %l5
%index.next = add i32 %index, 4
%4 = icmp eq i32 %index.next, %n.vec
br i1 %4, label %middle.block, label %vector.body
middle.block: ; preds = %vector.body
%5 = phi i32 [ %3, %vector.body ]
%cmp.n = icmp eq i32 %n.vec, %n
br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
for.body.preheader1: ; preds = %middle.block, %for.body.preheader
%i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
%r.07.ph = phi i32 [ 0, %for.body.preheader ], [ %5, %middle.block ]
br label %for.body
for.body: ; preds = %for.body.preheader1, %for.body
%i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
%r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
%arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
%6 = load i32, i32* %arrayidx, align 4
; Despite its name, %add is the unsigned max of the running value and x[i].
%c = icmp ugt i32 %r.07, %6
%add = select i1 %c, i32 %r.07, i32 %6
%inc = add nuw nsw i32 %i.08, 1
%exitcond = icmp eq i32 %inc, %n
br i1 %exitcond, label %for.cond.cleanup, label %for.body
for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
%r.0.lcssa = phi i32 [ 0, %entry ], [ %5, %middle.block ], [ %add, %for.body ]
ret i32 %r.0.lcssa
}
; fmin_f32: floating-point minimum reduction over the %n floats starting at %x.
; The vector loop carries a <4 x float> accumulator and reduces it once, in
; middle.block, via llvm.experimental.vector.reduce.fmin; a scalar tail loop
; handles the elements past the last multiple of four. Both loops select with
; an unordered-or-less-than compare (fcmp ult).
; Note that the accumulator is seeded with 0.0 (zeroinitializer), not
; +infinity, and the function returns 0.0 when %n <= 0.
define float @fmin_f32(float* nocapture readonly %x, i32 %n) {
; CHECK-LABEL: fmin_f32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cmp r1, #1
; CHECK-NEXT: blt .LBB15_3
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: cmp r1, #4
; CHECK-NEXT: bhs .LBB15_4
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: vldr s0, .LCPI15_0
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: b .LBB15_7
; CHECK-NEXT: .LBB15_3:
; CHECK-NEXT: vldr s0, .LCPI15_0
; CHECK-NEXT: b .LBB15_9
; CHECK-NEXT: .LBB15_4: @ %vector.ph
; CHECK-NEXT: bic r2, r1, #3
; CHECK-NEXT: movs r3, #1
; CHECK-NEXT: sub.w r12, r2, #4
; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: add.w lr, r3, r12, lsr #2
; CHECK-NEXT: mov r3, r0
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB15_5: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q1, [r3], #16
; CHECK-NEXT: vcmp.f32 lt, q0, q1
; CHECK-NEXT: vpsel q0, q0, q1
; CHECK-NEXT: le lr, .LBB15_5
; CHECK-NEXT: @ %bb.6: @ %middle.block
; CHECK-NEXT: vmov.f32 s4, s2
; CHECK-NEXT: cmp r2, r1
; CHECK-NEXT: vmov.f32 s5, s3
; CHECK-NEXT: vminnm.f32 q0, q0, q1
; CHECK-NEXT: vmov r3, s1
; CHECK-NEXT: vdup.32 q1, r3
; CHECK-NEXT: vminnm.f32 q0, q0, q1
; CHECK-NEXT: beq .LBB15_9
; CHECK-NEXT: .LBB15_7: @ %for.body.preheader1
; CHECK-NEXT: sub.w lr, r1, r2
; CHECK-NEXT: add.w r0, r0, r2, lsl #2
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB15_8: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldmia r0!, {s4}
; CHECK-NEXT: vcmp.f32 s0, s4
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: vselge.f32 s0, s4, s0
; CHECK-NEXT: le lr, .LBB15_8
; CHECK-NEXT: .LBB15_9: @ %for.cond.cleanup
; CHECK-NEXT: vmov r0, s0
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .p2align 2
; CHECK-NEXT: @ %bb.10:
; CHECK-NEXT: .LCPI15_0:
; CHECK-NEXT: .long 0x00000000 @ float 0
entry:
%cmp6 = icmp sgt i32 %n, 0
br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
for.body.preheader: ; preds = %entry
; Fewer than four elements: bypass the vector loop and run only the scalar tail.
%min.iters.check = icmp ult i32 %n, 4
br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph
vector.ph: ; preds = %for.body.preheader
; %n.vec is %n rounded down to a multiple of four; it bounds the vector loop.
%n.vec = and i32 %n, -4
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.phi = phi <4 x float> [ zeroinitializer, %vector.ph ], [ %3, %vector.body ]
%0 = getelementptr inbounds float, float* %x, i32 %index
%1 = bitcast float* %0 to <4 x float>*
%wide.load = load <4 x float>, <4 x float>* %1, align 4
; Lane-wise select: keep the accumulator lane when it is less than, or
; unordered with, the loaded lane.
%2 = fcmp ult <4 x float> %vec.phi, %wide.load
%3 = select <4 x i1> %2, <4 x float> %vec.phi, <4 x float> %wide.load
%index.next = add i32 %index, 4
%4 = icmp eq i32 %index.next, %n.vec
br i1 %4, label %middle.block, label %vector.body
middle.block: ; preds = %vector.body
; Single horizontal reduction of the vector accumulator after the loop.
%5 = call float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float> %3)
%cmp.n = icmp eq i32 %n.vec, %n
br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
for.body.preheader1: ; preds = %middle.block, %for.body.preheader
%i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
%r.07.ph = phi float [ 0.0, %for.body.preheader ], [ %5, %middle.block ]
br label %for.body
for.body: ; preds = %for.body.preheader1, %for.body
%i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
%r.07 = phi float [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
%arrayidx = getelementptr inbounds float, float* %x, i32 %i.08
%6 = load float, float* %arrayidx, align 4
; Despite its name, %add is a compare-and-select, not an addition.
%c = fcmp ult float %r.07, %6
%add = select i1 %c, float %r.07, float %6
%inc = add nuw nsw i32 %i.08, 1
%exitcond = icmp eq i32 %inc, %n
br i1 %exitcond, label %for.cond.cleanup, label %for.body
for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
%r.0.lcssa = phi float [ 0.0, %entry ], [ %5, %middle.block ], [ %add, %for.body ]
ret float %r.0.lcssa
}
; fmax_f32: floating-point maximum reduction over the %n floats starting at %x.
; The vector loop carries a <4 x float> accumulator and reduces it once, in
; middle.block, via llvm.experimental.vector.reduce.fmax; a scalar tail loop
; handles the elements past the last multiple of four. Both loops select with
; an unordered-or-greater-than compare (fcmp ugt).
; Note that the accumulator is seeded with 0.0 (zeroinitializer), not
; -infinity, and the function returns 0.0 when %n <= 0.
define float @fmax_f32(float* nocapture readonly %x, i32 %n) {
; CHECK-LABEL: fmax_f32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cmp r1, #1
; CHECK-NEXT: blt .LBB16_3
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: cmp r1, #4
; CHECK-NEXT: bhs .LBB16_4
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: vldr s0, .LCPI16_0
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: b .LBB16_7
; CHECK-NEXT: .LBB16_3:
; CHECK-NEXT: vldr s0, .LCPI16_0
; CHECK-NEXT: b .LBB16_9
; CHECK-NEXT: .LBB16_4: @ %vector.ph
; CHECK-NEXT: bic r2, r1, #3
; CHECK-NEXT: movs r3, #1
; CHECK-NEXT: sub.w r12, r2, #4
; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: add.w lr, r3, r12, lsr #2
; CHECK-NEXT: mov r3, r0
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB16_5: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q1, [r3], #16
; CHECK-NEXT: vcmp.f32 lt, q1, q0
; CHECK-NEXT: vpsel q0, q0, q1
; CHECK-NEXT: le lr, .LBB16_5
; CHECK-NEXT: @ %bb.6: @ %middle.block
; CHECK-NEXT: vmov.f32 s4, s2
; CHECK-NEXT: cmp r2, r1
; CHECK-NEXT: vmov.f32 s5, s3
; CHECK-NEXT: vmaxnm.f32 q0, q0, q1
; CHECK-NEXT: vmov r3, s1
; CHECK-NEXT: vdup.32 q1, r3
; CHECK-NEXT: vmaxnm.f32 q0, q0, q1
; CHECK-NEXT: beq .LBB16_9
; CHECK-NEXT: .LBB16_7: @ %for.body.preheader1
; CHECK-NEXT: sub.w lr, r1, r2
; CHECK-NEXT: add.w r0, r0, r2, lsl #2
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB16_8: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldmia r0!, {s4}
; CHECK-NEXT: vcmp.f32 s4, s0
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: vselge.f32 s0, s4, s0
; CHECK-NEXT: le lr, .LBB16_8
; CHECK-NEXT: .LBB16_9: @ %for.cond.cleanup
; CHECK-NEXT: vmov r0, s0
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .p2align 2
; CHECK-NEXT: @ %bb.10:
; CHECK-NEXT: .LCPI16_0:
; CHECK-NEXT: .long 0x00000000 @ float 0
entry:
%cmp6 = icmp sgt i32 %n, 0
br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
for.body.preheader: ; preds = %entry
; Fewer than four elements: bypass the vector loop and run only the scalar tail.
%min.iters.check = icmp ult i32 %n, 4
br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph
vector.ph: ; preds = %for.body.preheader
; %n.vec is %n rounded down to a multiple of four; it bounds the vector loop.
%n.vec = and i32 %n, -4
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.phi = phi <4 x float> [ zeroinitializer, %vector.ph ], [ %3, %vector.body ]
%0 = getelementptr inbounds float, float* %x, i32 %index
%1 = bitcast float* %0 to <4 x float>*
%wide.load = load <4 x float>, <4 x float>* %1, align 4
; Lane-wise select: keep the accumulator lane when it is greater than, or
; unordered with, the loaded lane.
%2 = fcmp ugt <4 x float> %vec.phi, %wide.load
%3 = select <4 x i1> %2, <4 x float> %vec.phi, <4 x float> %wide.load
%index.next = add i32 %index, 4
%4 = icmp eq i32 %index.next, %n.vec
br i1 %4, label %middle.block, label %vector.body
middle.block: ; preds = %vector.body
; Single horizontal reduction of the vector accumulator after the loop.
%5 = call float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float> %3)
%cmp.n = icmp eq i32 %n.vec, %n
br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
for.body.preheader1: ; preds = %middle.block, %for.body.preheader
%i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
%r.07.ph = phi float [ 0.0, %for.body.preheader ], [ %5, %middle.block ]
br label %for.body
for.body: ; preds = %for.body.preheader1, %for.body
%i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
%r.07 = phi float [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
%arrayidx = getelementptr inbounds float, float* %x, i32 %i.08
%6 = load float, float* %arrayidx, align 4
; Despite its name, %add is a compare-and-select, not an addition.
%c = fcmp ugt float %r.07, %6
%add = select i1 %c, float %r.07, float %6
%inc = add nuw nsw i32 %i.08, 1
%exitcond = icmp eq i32 %inc, %n
br i1 %exitcond, label %for.cond.cleanup, label %for.body
for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
%r.0.lcssa = phi float [ 0.0, %entry ], [ %5, %middle.block ], [ %add, %for.body ]
ret float %r.0.lcssa
}
; Declarations of the experimental vector-reduction intrinsics. Each one takes
; a four-element vector (<4 x i32> or <4 x float>) and returns a scalar of the
; element type; the "v2" fadd/fmul forms additionally take a scalar float as
; their first operand.
declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>)
declare i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32>)
declare i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32>)
declare i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32>)
declare i32 @llvm.experimental.vector.reduce.xor.v4i32(<4 x i32>)
declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float, <4 x float>)
declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float, <4 x float>)
declare i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32>)
declare i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32>)
declare i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32>)
declare i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32>)
declare float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float>)
declare float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float>)