mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2025-02-01 05:01:59 +01:00
a9b961b8da
This adds a peephole optimisation to turn a t2MOVccr that could not be folded into any other instruction into a CSEL on 8.1-m. The t2MOVccr would usually be expanded into a conditional mov, that becomes an IT; MOV pair. We can instead generate a CSEL instruction, which can potentially be smaller and allows better register allocation freedom, which can help reduce codesize. Performance is more variable and may depend on the microarchitecture details, but initial results look good. If we need to control this per-cpu, we can add a subtarget feature as we need it. Original patch by David Penry. Differential Revision: https://reviews.llvm.org/D83566
1712 lines
68 KiB
LLVM
1712 lines
68 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
|
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s
|
|
|
|
; add_i32: returns the sum of the %n i32 elements starting at %x
; (0 when %n <= 0).
;
; The IR is already vectorised by four: vector.body loads a <4 x i32>,
; reduces it with llvm.experimental.vector.reduce.add *inside* the loop and
; adds the scalar result to the running sum. for.body is the scalar loop
; for the elements past %n.vec (= %n & -4), or for the whole input when
; %n < 4.
;
; The assertions below expect the in-loop reduce+add to be selected as a
; single accumulating vaddva.u32, and both loops to be emitted as dls/le
; low-overhead loops.
define i32 @add_i32(i32* nocapture readonly %x, i32 %n) {
; CHECK-LABEL: add_i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cmp r1, #1
; CHECK-NEXT: blt .LBB0_3
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: mov r12, r0
; CHECK-NEXT: cmp r1, #4
; CHECK-NEXT: bhs .LBB0_4
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: movs r0, #0
; CHECK-NEXT: b .LBB0_7
; CHECK-NEXT: .LBB0_3:
; CHECK-NEXT: movs r0, #0
; CHECK-NEXT: b .LBB0_9
; CHECK-NEXT: .LBB0_4: @ %vector.ph
; CHECK-NEXT: bic r3, r1, #3
; CHECK-NEXT: movs r2, #1
; CHECK-NEXT: subs r0, r3, #4
; CHECK-NEXT: add.w lr, r2, r0, lsr #2
; CHECK-NEXT: movs r0, #0
; CHECK-NEXT: mov r2, r12
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB0_5: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r2], #16
; CHECK-NEXT: vaddva.u32 r0, q0
; CHECK-NEXT: le lr, .LBB0_5
; CHECK-NEXT: @ %bb.6: @ %middle.block
; CHECK-NEXT: cmp r3, r1
; CHECK-NEXT: it eq
; CHECK-NEXT: popeq {r7, pc}
; CHECK-NEXT: .LBB0_7: @ %for.body.preheader1
; CHECK-NEXT: sub.w lr, r1, r3
; CHECK-NEXT: add.w r1, r12, r3, lsl #2
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB0_8: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr r2, [r1], #4
; CHECK-NEXT: add r0, r2
; CHECK-NEXT: le lr, .LBB0_8
; CHECK-NEXT: .LBB0_9: @ %for.cond.cleanup
; CHECK-NEXT: pop {r7, pc}
entry:
  ; Nothing to do for a non-positive element count.
  %cmp6 = icmp sgt i32 %n, 0
  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:  ; preds = %entry
  ; Fewer than four elements: skip the vector loop entirely.
  %min.iters.check = icmp ult i32 %n, 4
  br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph

vector.ph:  ; preds = %for.body.preheader
  ; %n.vec = %n rounded down to a multiple of 4.
  %n.vec = and i32 %n, -4
  br label %vector.body

vector.body:  ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi i32 [ 0, %vector.ph ], [ %3, %vector.body ]
  %0 = getelementptr inbounds i32, i32* %x, i32 %index
  %1 = bitcast i32* %0 to <4 x i32>*
  %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
  ; In-loop reduction: the accumulator is a scalar, not a vector.
  %2 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %wide.load)
  %3 = add i32 %2, %vec.phi
  %index.next = add i32 %index, 4
  %4 = icmp eq i32 %index.next, %n.vec
  br i1 %4, label %middle.block, label %vector.body

middle.block:  ; preds = %vector.body
  ; Done if the vector loop consumed every element.
  %cmp.n = icmp eq i32 %n.vec, %n
  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1

for.body.preheader1:  ; preds = %middle.block, %for.body.preheader
  %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
  %r.07.ph = phi i32 [ 0, %for.body.preheader ], [ %3, %middle.block ]
  br label %for.body

for.body:  ; preds = %for.body.preheader1, %for.body
  ; Scalar remainder loop.
  %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
  %r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
  %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
  %5 = load i32, i32* %arrayidx, align 4
  %add = add nsw i32 %5, %r.07
  %inc = add nuw nsw i32 %i.08, 1
  %exitcond = icmp eq i32 %inc, %n
  br i1 %exitcond, label %for.cond.cleanup, label %for.body

for.cond.cleanup:  ; preds = %for.body, %middle.block, %entry
  %r.0.lcssa = phi i32 [ 0, %entry ], [ %3, %middle.block ], [ %add, %for.body ]
  ret i32 %r.0.lcssa
}
|
|
|
|
; mul_i32: returns the product of the %n i32 elements starting at %x
; (1 when %n <= 0).
;
; vector.body keeps a <4 x i32> accumulator seeded with all-ones lanes
; (value 1) and multiplies each loaded vector into it; the horizontal
; llvm.experimental.vector.reduce.mul happens once, in middle.block.
; for.body is the scalar loop for the elements past %n.vec (= %n & -4).
;
; The assertions below expect a vmul.i32 in the vector loop and the
; reduction to be expanded into lane extracts (vmov) plus scalar mul.
define i32 @mul_i32(i32* nocapture readonly %x, i32 %n) {
; CHECK-LABEL: mul_i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: movs r2, #1
; CHECK-NEXT: cmp r1, #1
; CHECK-NEXT: blt .LBB1_8
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: cmp r1, #4
; CHECK-NEXT: bhs .LBB1_3
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: b .LBB1_6
; CHECK-NEXT: .LBB1_3: @ %vector.ph
; CHECK-NEXT: bic r12, r1, #3
; CHECK-NEXT: vmov.i32 q0, #0x1
; CHECK-NEXT: sub.w r3, r12, #4
; CHECK-NEXT: add.w lr, r2, r3, lsr #2
; CHECK-NEXT: mov r2, r0
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB1_4: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q1, [r2], #16
; CHECK-NEXT: vmul.i32 q0, q1, q0
; CHECK-NEXT: le lr, .LBB1_4
; CHECK-NEXT: @ %bb.5: @ %middle.block
; CHECK-NEXT: vmov r2, s3
; CHECK-NEXT: cmp r12, r1
; CHECK-NEXT: vmov r3, s2
; CHECK-NEXT: mul lr, r3, r2
; CHECK-NEXT: vmov r3, s1
; CHECK-NEXT: vmov r2, s0
; CHECK-NEXT: mul r2, r3, r2
; CHECK-NEXT: mul r2, r2, lr
; CHECK-NEXT: beq .LBB1_8
; CHECK-NEXT: .LBB1_6: @ %for.body.preheader1
; CHECK-NEXT: sub.w lr, r1, r12
; CHECK-NEXT: add.w r0, r0, r12, lsl #2
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB1_7: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr r1, [r0], #4
; CHECK-NEXT: muls r2, r1, r2
; CHECK-NEXT: le lr, .LBB1_7
; CHECK-NEXT: .LBB1_8: @ %for.cond.cleanup
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: pop {r7, pc}
entry:
  ; Nothing to do for a non-positive element count.
  %cmp6 = icmp sgt i32 %n, 0
  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:  ; preds = %entry
  ; Fewer than four elements: skip the vector loop entirely.
  %min.iters.check = icmp ult i32 %n, 4
  br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph

vector.ph:  ; preds = %for.body.preheader
  ; %n.vec = %n rounded down to a multiple of 4.
  %n.vec = and i32 %n, -4
  br label %vector.body

vector.body:  ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  ; Vector accumulator starts at the multiplicative identity.
  %vec.phi = phi <4 x i32> [ <i32 1, i32 1, i32 1, i32 1>, %vector.ph ], [ %2, %vector.body ]
  %0 = getelementptr inbounds i32, i32* %x, i32 %index
  %1 = bitcast i32* %0 to <4 x i32>*
  %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
  %2 = mul <4 x i32> %wide.load, %vec.phi
  %index.next = add i32 %index, 4
  %3 = icmp eq i32 %index.next, %n.vec
  br i1 %3, label %middle.block, label %vector.body

middle.block:  ; preds = %vector.body
  ; Horizontal reduction of the four partial products.
  %4 = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> %2)
  %cmp.n = icmp eq i32 %n.vec, %n
  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1

for.body.preheader1:  ; preds = %middle.block, %for.body.preheader
  %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
  %r.07.ph = phi i32 [ 1, %for.body.preheader ], [ %4, %middle.block ]
  br label %for.body

for.body:  ; preds = %for.body.preheader1, %for.body
  ; Scalar remainder loop (%add holds a product despite its name).
  %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
  %r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
  %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
  %5 = load i32, i32* %arrayidx, align 4
  %add = mul nsw i32 %5, %r.07
  %inc = add nuw nsw i32 %i.08, 1
  %exitcond = icmp eq i32 %inc, %n
  br i1 %exitcond, label %for.cond.cleanup, label %for.body

for.cond.cleanup:  ; preds = %for.body, %middle.block, %entry
  %r.0.lcssa = phi i32 [ 1, %entry ], [ %4, %middle.block ], [ %add, %for.body ]
  ret i32 %r.0.lcssa
}
|
|
|
|
; and_i32: returns the bitwise AND of the %n i32 elements starting at %x
; (-1, i.e. all bits set, when %n <= 0).
;
; vector.body keeps a <4 x i32> accumulator seeded with -1 in every lane
; and ANDs each loaded vector into it; the horizontal
; llvm.experimental.vector.reduce.and happens once, in middle.block.
; for.body is the scalar loop for the elements past %n.vec (= %n & -4).
;
; The assertions below expect a vand in the vector loop and the reduction
; to be expanded into lane extracts (vmov) plus scalar and.w.
define i32 @and_i32(i32* nocapture readonly %x, i32 %n) {
; CHECK-LABEL: and_i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cmp r1, #1
; CHECK-NEXT: blt .LBB2_3
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: cmp r1, #4
; CHECK-NEXT: bhs .LBB2_4
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: mov.w r2, #-1
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: b .LBB2_7
; CHECK-NEXT: .LBB2_3:
; CHECK-NEXT: mov.w r2, #-1
; CHECK-NEXT: b .LBB2_9
; CHECK-NEXT: .LBB2_4: @ %vector.ph
; CHECK-NEXT: bic r3, r1, #3
; CHECK-NEXT: movs r2, #1
; CHECK-NEXT: sub.w r12, r3, #4
; CHECK-NEXT: vmov.i8 q0, #0xff
; CHECK-NEXT: add.w lr, r2, r12, lsr #2
; CHECK-NEXT: mov r2, r0
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB2_5: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q1, [r2], #16
; CHECK-NEXT: vand q0, q1, q0
; CHECK-NEXT: le lr, .LBB2_5
; CHECK-NEXT: @ %bb.6: @ %middle.block
; CHECK-NEXT: vmov r12, s3
; CHECK-NEXT: cmp r3, r1
; CHECK-NEXT: vmov r2, s2
; CHECK-NEXT: vmov lr, s1
; CHECK-NEXT: and.w r12, r12, r2
; CHECK-NEXT: vmov r2, s0
; CHECK-NEXT: and.w r2, r2, lr
; CHECK-NEXT: and.w r2, r2, r12
; CHECK-NEXT: beq .LBB2_9
; CHECK-NEXT: .LBB2_7: @ %for.body.preheader1
; CHECK-NEXT: sub.w lr, r1, r3
; CHECK-NEXT: add.w r0, r0, r3, lsl #2
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB2_8: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr r1, [r0], #4
; CHECK-NEXT: ands r2, r1
; CHECK-NEXT: le lr, .LBB2_8
; CHECK-NEXT: .LBB2_9: @ %for.cond.cleanup
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: pop {r7, pc}
entry:
  ; Nothing to do for a non-positive element count.
  %cmp6 = icmp sgt i32 %n, 0
  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:  ; preds = %entry
  ; Fewer than four elements: skip the vector loop entirely.
  %min.iters.check = icmp ult i32 %n, 4
  br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph

vector.ph:  ; preds = %for.body.preheader
  ; %n.vec = %n rounded down to a multiple of 4.
  %n.vec = and i32 %n, -4
  br label %vector.body

vector.body:  ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  ; Vector accumulator starts at the AND identity (all bits set).
  %vec.phi = phi <4 x i32> [ <i32 -1, i32 -1, i32 -1, i32 -1>, %vector.ph ], [ %2, %vector.body ]
  %0 = getelementptr inbounds i32, i32* %x, i32 %index
  %1 = bitcast i32* %0 to <4 x i32>*
  %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
  %2 = and <4 x i32> %wide.load, %vec.phi
  %index.next = add i32 %index, 4
  %3 = icmp eq i32 %index.next, %n.vec
  br i1 %3, label %middle.block, label %vector.body

middle.block:  ; preds = %vector.body
  ; Horizontal reduction of the four partial results.
  %4 = call i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32> %2)
  %cmp.n = icmp eq i32 %n.vec, %n
  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1

for.body.preheader1:  ; preds = %middle.block, %for.body.preheader
  %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
  %r.07.ph = phi i32 [ -1, %for.body.preheader ], [ %4, %middle.block ]
  br label %for.body

for.body:  ; preds = %for.body.preheader1, %for.body
  ; Scalar remainder loop (%add holds an AND result despite its name).
  %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
  %r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
  %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
  %5 = load i32, i32* %arrayidx, align 4
  %add = and i32 %5, %r.07
  %inc = add nuw nsw i32 %i.08, 1
  %exitcond = icmp eq i32 %inc, %n
  br i1 %exitcond, label %for.cond.cleanup, label %for.body

for.cond.cleanup:  ; preds = %for.body, %middle.block, %entry
  %r.0.lcssa = phi i32 [ -1, %entry ], [ %4, %middle.block ], [ %add, %for.body ]
  ret i32 %r.0.lcssa
}
|
|
|
|
; or_i32: returns the bitwise OR of the %n i32 elements starting at %x
; (0 when %n <= 0).
;
; vector.body keeps a zero-initialised <4 x i32> accumulator and ORs each
; loaded vector into it; the horizontal
; llvm.experimental.vector.reduce.or happens once, in middle.block.
; for.body is the scalar loop for the elements past %n.vec (= %n & -4).
;
; The assertions below expect a vorr in the vector loop and the reduction
; to be expanded into lane extracts (vmov) plus scalar orr.w.
define i32 @or_i32(i32* nocapture readonly %x, i32 %n) {
; CHECK-LABEL: or_i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cmp r1, #1
; CHECK-NEXT: blt .LBB3_3
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: cmp r1, #4
; CHECK-NEXT: bhs .LBB3_4
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: b .LBB3_7
; CHECK-NEXT: .LBB3_3:
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: b .LBB3_9
; CHECK-NEXT: .LBB3_4: @ %vector.ph
; CHECK-NEXT: bic r3, r1, #3
; CHECK-NEXT: movs r2, #1
; CHECK-NEXT: sub.w r12, r3, #4
; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: add.w lr, r2, r12, lsr #2
; CHECK-NEXT: mov r2, r0
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB3_5: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q1, [r2], #16
; CHECK-NEXT: vorr q0, q1, q0
; CHECK-NEXT: le lr, .LBB3_5
; CHECK-NEXT: @ %bb.6: @ %middle.block
; CHECK-NEXT: vmov r12, s3
; CHECK-NEXT: cmp r3, r1
; CHECK-NEXT: vmov r2, s2
; CHECK-NEXT: vmov lr, s1
; CHECK-NEXT: orr.w r12, r12, r2
; CHECK-NEXT: vmov r2, s0
; CHECK-NEXT: orr.w r2, r2, lr
; CHECK-NEXT: orr.w r2, r2, r12
; CHECK-NEXT: beq .LBB3_9
; CHECK-NEXT: .LBB3_7: @ %for.body.preheader1
; CHECK-NEXT: sub.w lr, r1, r3
; CHECK-NEXT: add.w r0, r0, r3, lsl #2
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB3_8: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr r1, [r0], #4
; CHECK-NEXT: orrs r2, r1
; CHECK-NEXT: le lr, .LBB3_8
; CHECK-NEXT: .LBB3_9: @ %for.cond.cleanup
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: pop {r7, pc}
entry:
  ; Nothing to do for a non-positive element count.
  %cmp6 = icmp sgt i32 %n, 0
  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:  ; preds = %entry
  ; Fewer than four elements: skip the vector loop entirely.
  %min.iters.check = icmp ult i32 %n, 4
  br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph

vector.ph:  ; preds = %for.body.preheader
  ; %n.vec = %n rounded down to a multiple of 4.
  %n.vec = and i32 %n, -4
  br label %vector.body

vector.body:  ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  ; Vector accumulator starts at the OR identity (zero).
  %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %2, %vector.body ]
  %0 = getelementptr inbounds i32, i32* %x, i32 %index
  %1 = bitcast i32* %0 to <4 x i32>*
  %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
  %2 = or <4 x i32> %wide.load, %vec.phi
  %index.next = add i32 %index, 4
  %3 = icmp eq i32 %index.next, %n.vec
  br i1 %3, label %middle.block, label %vector.body

middle.block:  ; preds = %vector.body
  ; Horizontal reduction of the four partial results.
  %4 = call i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32> %2)
  %cmp.n = icmp eq i32 %n.vec, %n
  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1

for.body.preheader1:  ; preds = %middle.block, %for.body.preheader
  %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
  %r.07.ph = phi i32 [ 0, %for.body.preheader ], [ %4, %middle.block ]
  br label %for.body

for.body:  ; preds = %for.body.preheader1, %for.body
  ; Scalar remainder loop (%add holds an OR result despite its name).
  %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
  %r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
  %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
  %5 = load i32, i32* %arrayidx, align 4
  %add = or i32 %5, %r.07
  %inc = add nuw nsw i32 %i.08, 1
  %exitcond = icmp eq i32 %inc, %n
  br i1 %exitcond, label %for.cond.cleanup, label %for.body

for.cond.cleanup:  ; preds = %for.body, %middle.block, %entry
  %r.0.lcssa = phi i32 [ 0, %entry ], [ %4, %middle.block ], [ %add, %for.body ]
  ret i32 %r.0.lcssa
}
|
|
|
|
; xor_i32: returns the bitwise XOR of the %n i32 elements starting at %x
; (0 when %n <= 0).
;
; vector.body keeps a zero-initialised <4 x i32> accumulator and XORs each
; loaded vector into it; the horizontal
; llvm.experimental.vector.reduce.xor happens once, in middle.block.
; for.body is the scalar loop for the elements past %n.vec (= %n & -4).
;
; The assertions below expect a veor in the vector loop and the reduction
; to be expanded into lane extracts (vmov) plus scalar eor.w.
define i32 @xor_i32(i32* nocapture readonly %x, i32 %n) {
; CHECK-LABEL: xor_i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cmp r1, #1
; CHECK-NEXT: blt .LBB4_3
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: cmp r1, #4
; CHECK-NEXT: bhs .LBB4_4
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: b .LBB4_7
; CHECK-NEXT: .LBB4_3:
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: b .LBB4_9
; CHECK-NEXT: .LBB4_4: @ %vector.ph
; CHECK-NEXT: bic r3, r1, #3
; CHECK-NEXT: movs r2, #1
; CHECK-NEXT: sub.w r12, r3, #4
; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: add.w lr, r2, r12, lsr #2
; CHECK-NEXT: mov r2, r0
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB4_5: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q1, [r2], #16
; CHECK-NEXT: veor q0, q1, q0
; CHECK-NEXT: le lr, .LBB4_5
; CHECK-NEXT: @ %bb.6: @ %middle.block
; CHECK-NEXT: vmov r12, s3
; CHECK-NEXT: cmp r3, r1
; CHECK-NEXT: vmov r2, s2
; CHECK-NEXT: vmov lr, s1
; CHECK-NEXT: eor.w r12, r12, r2
; CHECK-NEXT: vmov r2, s0
; CHECK-NEXT: eor.w r2, r2, lr
; CHECK-NEXT: eor.w r2, r2, r12
; CHECK-NEXT: beq .LBB4_9
; CHECK-NEXT: .LBB4_7: @ %for.body.preheader1
; CHECK-NEXT: sub.w lr, r1, r3
; CHECK-NEXT: add.w r0, r0, r3, lsl #2
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB4_8: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr r1, [r0], #4
; CHECK-NEXT: eors r2, r1
; CHECK-NEXT: le lr, .LBB4_8
; CHECK-NEXT: .LBB4_9: @ %for.cond.cleanup
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: pop {r7, pc}
entry:
  ; Nothing to do for a non-positive element count.
  %cmp6 = icmp sgt i32 %n, 0
  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:  ; preds = %entry
  ; Fewer than four elements: skip the vector loop entirely.
  %min.iters.check = icmp ult i32 %n, 4
  br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph

vector.ph:  ; preds = %for.body.preheader
  ; %n.vec = %n rounded down to a multiple of 4.
  %n.vec = and i32 %n, -4
  br label %vector.body

vector.body:  ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  ; Vector accumulator starts at the XOR identity (zero).
  %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %2, %vector.body ]
  %0 = getelementptr inbounds i32, i32* %x, i32 %index
  %1 = bitcast i32* %0 to <4 x i32>*
  %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
  %2 = xor <4 x i32> %wide.load, %vec.phi
  %index.next = add i32 %index, 4
  %3 = icmp eq i32 %index.next, %n.vec
  br i1 %3, label %middle.block, label %vector.body

middle.block:  ; preds = %vector.body
  ; Horizontal reduction of the four partial results.
  %4 = call i32 @llvm.experimental.vector.reduce.xor.v4i32(<4 x i32> %2)
  %cmp.n = icmp eq i32 %n.vec, %n
  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1

for.body.preheader1:  ; preds = %middle.block, %for.body.preheader
  %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
  %r.07.ph = phi i32 [ 0, %for.body.preheader ], [ %4, %middle.block ]
  br label %for.body

for.body:  ; preds = %for.body.preheader1, %for.body
  ; Scalar remainder loop (%add holds an XOR result despite its name).
  %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
  %r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
  %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
  %5 = load i32, i32* %arrayidx, align 4
  %add = xor i32 %5, %r.07
  %inc = add nuw nsw i32 %i.08, 1
  %exitcond = icmp eq i32 %inc, %n
  br i1 %exitcond, label %for.cond.cleanup, label %for.body

for.cond.cleanup:  ; preds = %for.body, %middle.block, %entry
  %r.0.lcssa = phi i32 [ 0, %entry ], [ %4, %middle.block ], [ %add, %for.body ]
  ret i32 %r.0.lcssa
}
|
|
|
|
; fadd_f32: returns the sum of the %n float elements starting at %x
; (0.0 when %n <= 0). All floating-point adds carry the `fast` flag.
;
; vector.body keeps a zero-initialised <4 x float> accumulator and adds
; each loaded vector into it; the horizontal
; llvm.experimental.vector.reduce.v2.fadd (start value 0.0) happens once,
; in middle.block. for.body is the scalar loop for the elements past
; %n.vec (= %n & -4).
;
; The assertions below expect a vector vadd.f32 in the loop, the reduction
; to be expanded into three scalar vadd.f32 on the s-register lanes, and
; the scalar 0.0 start value to be loaded from a constant pool entry.
define float @fadd_f32(float* nocapture readonly %x, i32 %n) {
; CHECK-LABEL: fadd_f32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cmp r1, #1
; CHECK-NEXT: blt .LBB5_3
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: cmp r1, #4
; CHECK-NEXT: bhs .LBB5_4
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: vldr s0, .LCPI5_0
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: b .LBB5_7
; CHECK-NEXT: .LBB5_3:
; CHECK-NEXT: vldr s0, .LCPI5_0
; CHECK-NEXT: b .LBB5_9
; CHECK-NEXT: .LBB5_4: @ %vector.ph
; CHECK-NEXT: bic r2, r1, #3
; CHECK-NEXT: movs r3, #1
; CHECK-NEXT: sub.w r12, r2, #4
; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: add.w lr, r3, r12, lsr #2
; CHECK-NEXT: mov r3, r0
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB5_5: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q1, [r3], #16
; CHECK-NEXT: vadd.f32 q0, q1, q0
; CHECK-NEXT: le lr, .LBB5_5
; CHECK-NEXT: @ %bb.6: @ %middle.block
; CHECK-NEXT: vadd.f32 s4, s2, s3
; CHECK-NEXT: cmp r2, r1
; CHECK-NEXT: vadd.f32 s0, s0, s1
; CHECK-NEXT: vadd.f32 s0, s0, s4
; CHECK-NEXT: beq .LBB5_9
; CHECK-NEXT: .LBB5_7: @ %for.body.preheader1
; CHECK-NEXT: sub.w lr, r1, r2
; CHECK-NEXT: add.w r0, r0, r2, lsl #2
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB5_8: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldr s2, [r0]
; CHECK-NEXT: adds r0, #4
; CHECK-NEXT: vadd.f32 s0, s2, s0
; CHECK-NEXT: le lr, .LBB5_8
; CHECK-NEXT: .LBB5_9: @ %for.cond.cleanup
; CHECK-NEXT: vmov r0, s0
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .p2align 2
; CHECK-NEXT: @ %bb.10:
; CHECK-NEXT: .LCPI5_0:
; CHECK-NEXT: .long 0x00000000 @ float 0
entry:
  ; Nothing to do for a non-positive element count.
  %cmp6 = icmp sgt i32 %n, 0
  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:  ; preds = %entry
  ; Fewer than four elements: skip the vector loop entirely.
  %min.iters.check = icmp ult i32 %n, 4
  br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph

vector.ph:  ; preds = %for.body.preheader
  ; %n.vec = %n rounded down to a multiple of 4.
  %n.vec = and i32 %n, -4
  br label %vector.body

vector.body:  ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  ; Vector accumulator starts at 0.0 in every lane.
  %vec.phi = phi <4 x float> [ zeroinitializer, %vector.ph ], [ %2, %vector.body ]
  %0 = getelementptr inbounds float, float* %x, i32 %index
  %1 = bitcast float* %0 to <4 x float>*
  %wide.load = load <4 x float>, <4 x float>* %1, align 4
  %2 = fadd fast <4 x float> %wide.load, %vec.phi
  %index.next = add i32 %index, 4
  %3 = icmp eq i32 %index.next, %n.vec
  br i1 %3, label %middle.block, label %vector.body

middle.block:  ; preds = %vector.body
  ; Horizontal reduction of the four partial sums.
  %4 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> %2)
  %cmp.n = icmp eq i32 %n.vec, %n
  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1

for.body.preheader1:  ; preds = %middle.block, %for.body.preheader
  %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
  %r.07.ph = phi float [ 0.000000e+00, %for.body.preheader ], [ %4, %middle.block ]
  br label %for.body

for.body:  ; preds = %for.body.preheader1, %for.body
  ; Scalar remainder loop.
  %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
  %r.07 = phi float [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
  %arrayidx = getelementptr inbounds float, float* %x, i32 %i.08
  %5 = load float, float* %arrayidx, align 4
  %add = fadd fast float %5, %r.07
  %inc = add nuw nsw i32 %i.08, 1
  %exitcond = icmp eq i32 %inc, %n
  br i1 %exitcond, label %for.cond.cleanup, label %for.body

for.cond.cleanup:  ; preds = %for.body, %middle.block, %entry
  %r.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %4, %middle.block ], [ %add, %for.body ]
  ret float %r.0.lcssa
}
|
|
|
|
; fmul_f32: returns the product of the %n float elements starting at %x
; (1.0 when %n <= 0). All floating-point multiplies carry the `fast` flag.
;
; vector.body keeps a <4 x float> accumulator seeded with 1.0 in every
; lane and multiplies each loaded vector into it; the horizontal
; llvm.experimental.vector.reduce.v2.fmul (start value 1.0) happens once,
; in middle.block. for.body is the scalar loop for the elements past
; %n.vec (= %n & -4).
;
; The assertions below expect a vector vmul.f32 in the loop and the
; reduction to be expanded into three scalar vmul.f32 on the s-register
; lanes.
define float @fmul_f32(float* nocapture readonly %x, i32 %n) {
; CHECK-LABEL: fmul_f32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cmp r1, #1
; CHECK-NEXT: blt .LBB6_3
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: cmp r1, #4
; CHECK-NEXT: bhs .LBB6_4
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: vmov.f32 s0, #1.000000e+00
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: b .LBB6_7
; CHECK-NEXT: .LBB6_3:
; CHECK-NEXT: vmov.f32 s0, #1.000000e+00
; CHECK-NEXT: b .LBB6_9
; CHECK-NEXT: .LBB6_4: @ %vector.ph
; CHECK-NEXT: bic r2, r1, #3
; CHECK-NEXT: movs r3, #1
; CHECK-NEXT: sub.w r12, r2, #4
; CHECK-NEXT: vmov.f32 q0, #1.000000e+00
; CHECK-NEXT: add.w lr, r3, r12, lsr #2
; CHECK-NEXT: mov r3, r0
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB6_5: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q1, [r3], #16
; CHECK-NEXT: vmul.f32 q0, q1, q0
; CHECK-NEXT: le lr, .LBB6_5
; CHECK-NEXT: @ %bb.6: @ %middle.block
; CHECK-NEXT: vmul.f32 s4, s2, s3
; CHECK-NEXT: cmp r2, r1
; CHECK-NEXT: vmul.f32 s0, s0, s1
; CHECK-NEXT: vmul.f32 s0, s0, s4
; CHECK-NEXT: beq .LBB6_9
; CHECK-NEXT: .LBB6_7: @ %for.body.preheader1
; CHECK-NEXT: sub.w lr, r1, r2
; CHECK-NEXT: add.w r0, r0, r2, lsl #2
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB6_8: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldr s2, [r0]
; CHECK-NEXT: adds r0, #4
; CHECK-NEXT: vmul.f32 s0, s2, s0
; CHECK-NEXT: le lr, .LBB6_8
; CHECK-NEXT: .LBB6_9: @ %for.cond.cleanup
; CHECK-NEXT: vmov r0, s0
; CHECK-NEXT: pop {r7, pc}
entry:
  ; Nothing to do for a non-positive element count.
  %cmp6 = icmp sgt i32 %n, 0
  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:  ; preds = %entry
  ; Fewer than four elements: skip the vector loop entirely.
  %min.iters.check = icmp ult i32 %n, 4
  br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph

vector.ph:  ; preds = %for.body.preheader
  ; %n.vec = %n rounded down to a multiple of 4.
  %n.vec = and i32 %n, -4
  br label %vector.body

vector.body:  ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  ; Vector accumulator starts at the multiplicative identity.
  %vec.phi = phi <4 x float> [ <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, %vector.ph ], [ %2, %vector.body ]
  %0 = getelementptr inbounds float, float* %x, i32 %index
  %1 = bitcast float* %0 to <4 x float>*
  %wide.load = load <4 x float>, <4 x float>* %1, align 4
  %2 = fmul fast <4 x float> %wide.load, %vec.phi
  %index.next = add i32 %index, 4
  %3 = icmp eq i32 %index.next, %n.vec
  br i1 %3, label %middle.block, label %vector.body

middle.block:  ; preds = %vector.body
  ; Horizontal reduction of the four partial products.
  %4 = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float 1.000000e+00, <4 x float> %2)
  %cmp.n = icmp eq i32 %n.vec, %n
  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1

for.body.preheader1:  ; preds = %middle.block, %for.body.preheader
  %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
  %r.07.ph = phi float [ 1.000000e+00, %for.body.preheader ], [ %4, %middle.block ]
  br label %for.body

for.body:  ; preds = %for.body.preheader1, %for.body
  ; Scalar remainder loop (%add holds a product despite its name).
  %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
  %r.07 = phi float [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
  %arrayidx = getelementptr inbounds float, float* %x, i32 %i.08
  %5 = load float, float* %arrayidx, align 4
  %add = fmul fast float %5, %r.07
  %inc = add nuw nsw i32 %i.08, 1
  %exitcond = icmp eq i32 %inc, %n
  br i1 %exitcond, label %for.cond.cleanup, label %for.body

for.cond.cleanup:  ; preds = %for.body, %middle.block, %entry
  %r.0.lcssa = phi float [ 1.000000e+00, %entry ], [ %4, %middle.block ], [ %add, %for.body ]
  ret float %r.0.lcssa
}
|
|
|
|
; smin_i32: returns the signed minimum of the %n i32 elements starting at
; %x (2147483647 when %n <= 0).
;
; vector.body keeps a <4 x i32> accumulator seeded with 2147483647 in
; every lane and updates it with an icmp slt + select pair; the horizontal
; llvm.experimental.vector.reduce.smin happens once, in middle.block.
; for.body is the scalar loop for the elements past %n.vec (= %n & -4) and
; uses the same icmp slt + select pattern.
;
; The assertions below expect the vector compare/select to become
; vmin.s32, the reduction to become vminv.s32 on a register preloaded via
; `mvn r2, #-2147483648`, and the scalar compare/select to become
; cmp + csel.
define i32 @smin_i32(i32* nocapture readonly %x, i32 %n) {
; CHECK-LABEL: smin_i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cmp r1, #1
; CHECK-NEXT: blt .LBB7_3
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: cmp r1, #4
; CHECK-NEXT: bhs .LBB7_4
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: mvn r2, #-2147483648
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: b .LBB7_7
; CHECK-NEXT: .LBB7_3:
; CHECK-NEXT: mvn r2, #-2147483648
; CHECK-NEXT: b .LBB7_9
; CHECK-NEXT: .LBB7_4: @ %vector.ph
; CHECK-NEXT: bic r3, r1, #3
; CHECK-NEXT: movs r2, #1
; CHECK-NEXT: sub.w r12, r3, #4
; CHECK-NEXT: vmvn.i32 q0, #0x80000000
; CHECK-NEXT: add.w lr, r2, r12, lsr #2
; CHECK-NEXT: mov r2, r0
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB7_5: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q1, [r2], #16
; CHECK-NEXT: vmin.s32 q0, q0, q1
; CHECK-NEXT: le lr, .LBB7_5
; CHECK-NEXT: @ %bb.6: @ %middle.block
; CHECK-NEXT: mvn r2, #-2147483648
; CHECK-NEXT: cmp r3, r1
; CHECK-NEXT: vminv.s32 r2, q0
; CHECK-NEXT: beq .LBB7_9
; CHECK-NEXT: .LBB7_7: @ %for.body.preheader1
; CHECK-NEXT: sub.w lr, r1, r3
; CHECK-NEXT: add.w r0, r0, r3, lsl #2
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB7_8: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr r1, [r0], #4
; CHECK-NEXT: cmp r2, r1
; CHECK-NEXT: csel r2, r2, r1, lt
; CHECK-NEXT: le lr, .LBB7_8
; CHECK-NEXT: .LBB7_9: @ %for.cond.cleanup
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: pop {r7, pc}
entry:
  ; Nothing to do for a non-positive element count.
  %cmp6 = icmp sgt i32 %n, 0
  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:  ; preds = %entry
  ; Fewer than four elements: skip the vector loop entirely.
  %min.iters.check = icmp ult i32 %n, 4
  br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph

vector.ph:  ; preds = %for.body.preheader
  ; %n.vec = %n rounded down to a multiple of 4.
  %n.vec = and i32 %n, -4
  br label %vector.body

vector.body:  ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  ; Vector accumulator starts at the largest signed i32 in every lane.
  %vec.phi = phi <4 x i32> [ <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>, %vector.ph ], [ %3, %vector.body ]
  %0 = getelementptr inbounds i32, i32* %x, i32 %index
  %1 = bitcast i32* %0 to <4 x i32>*
  %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
  ; Lane-wise signed min written as compare + select.
  %2 = icmp slt <4 x i32> %vec.phi, %wide.load
  %3 = select <4 x i1> %2, <4 x i32> %vec.phi, <4 x i32> %wide.load
  %index.next = add i32 %index, 4
  %4 = icmp eq i32 %index.next, %n.vec
  br i1 %4, label %middle.block, label %vector.body

middle.block:  ; preds = %vector.body
  ; Horizontal reduction of the four partial minima.
  %5 = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> %3)
  %cmp.n = icmp eq i32 %n.vec, %n
  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1

for.body.preheader1:  ; preds = %middle.block, %for.body.preheader
  %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
  %r.07.ph = phi i32 [ 2147483647, %for.body.preheader ], [ %5, %middle.block ]
  br label %for.body

for.body:  ; preds = %for.body.preheader1, %for.body
  ; Scalar remainder loop (%add holds the running minimum despite its name).
  %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
  %r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
  %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
  %6 = load i32, i32* %arrayidx, align 4
  %c = icmp slt i32 %r.07, %6
  %add = select i1 %c, i32 %r.07, i32 %6
  %inc = add nuw nsw i32 %i.08, 1
  %exitcond = icmp eq i32 %inc, %n
  br i1 %exitcond, label %for.cond.cleanup, label %for.body

for.cond.cleanup:  ; preds = %for.body, %middle.block, %entry
  %r.0.lcssa = phi i32 [ 2147483647, %entry ], [ %5, %middle.block ], [ %add, %for.body ]
  ret i32 %r.0.lcssa
}
|
|
|
|
; Signed-minimum reduction over x[0..n) with the vector reduction done inside
; the vector loop: every iteration loads four lanes, reduces them with
; llvm.experimental.vector.reduce.smin and folds the scalar result into a
; running i32 accumulator seeded with 2147483647. Elements past the
; multiple-of-four prefix (or all of them when n < 4) go through the scalar
; loop. Returns 2147483647 when n <= 0.
define i32 @smin_i32_inloop(i32* nocapture readonly %x, i32 %n) {
; CHECK-LABEL: smin_i32_inloop:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, lr}
; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: cmp r1, #1
; CHECK-NEXT: blt .LBB8_3
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: mov r12, r0
; CHECK-NEXT: mvn r0, #-2147483648
; CHECK-NEXT: cmp r1, #4
; CHECK-NEXT: bhs .LBB8_4
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: b .LBB8_7
; CHECK-NEXT: .LBB8_3:
; CHECK-NEXT: mvn r0, #-2147483648
; CHECK-NEXT: b .LBB8_9
; CHECK-NEXT: .LBB8_4: @ %vector.ph
; CHECK-NEXT: bic r3, r1, #3
; CHECK-NEXT: movs r2, #1
; CHECK-NEXT: sub.w lr, r3, #4
; CHECK-NEXT: add.w lr, r2, lr, lsr #2
; CHECK-NEXT: mov r2, r12
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB8_5: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r2], #16
; CHECK-NEXT: mvn r4, #-2147483648
; CHECK-NEXT: vminv.s32 r4, q0
; CHECK-NEXT: cmp r0, r4
; CHECK-NEXT: csel r0, r0, r4, lt
; CHECK-NEXT: le lr, .LBB8_5
; CHECK-NEXT: @ %bb.6: @ %middle.block
; CHECK-NEXT: cmp r3, r1
; CHECK-NEXT: it eq
; CHECK-NEXT: popeq {r4, pc}
; CHECK-NEXT: .LBB8_7: @ %for.body.preheader1
; CHECK-NEXT: sub.w lr, r1, r3
; CHECK-NEXT: add.w r1, r12, r3, lsl #2
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB8_8: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr r2, [r1], #4
; CHECK-NEXT: cmp r0, r2
; CHECK-NEXT: csel r0, r0, r2, lt
; CHECK-NEXT: le lr, .LBB8_8
; CHECK-NEXT: .LBB8_9: @ %for.cond.cleanup
; CHECK-NEXT: pop {r4, pc}
entry:
  ; Nothing to do unless n is strictly positive.
  %has.elems = icmp sgt i32 %n, 0
  br i1 %has.elems, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:                               ; preds = %entry
  ; Fewer than four elements: go straight to the scalar loop.
  %too.short = icmp ult i32 %n, 4
  br i1 %too.short, label %for.body.preheader1, label %vector.ph

vector.ph:                                        ; preds = %for.body.preheader
  ; n rounded down to a multiple of four.
  %vec.trip = and i32 %n, -4
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %vec.iv = phi i32 [ 0, %vector.ph ], [ %vec.iv.next, %vector.body ]
  %vec.acc = phi i32 [ 2147483647, %vector.ph ], [ %vec.acc.next, %vector.body ]
  %chunk.addr = getelementptr inbounds i32, i32* %x, i32 %vec.iv
  %chunk.vptr = bitcast i32* %chunk.addr to <4 x i32>*
  %chunk = load <4 x i32>, <4 x i32>* %chunk.vptr, align 4
  %chunk.min = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> %chunk)
  %keep.acc = icmp slt i32 %vec.acc, %chunk.min
  %vec.acc.next = select i1 %keep.acc, i32 %vec.acc, i32 %chunk.min
  %vec.iv.next = add i32 %vec.iv, 4
  %vec.done = icmp eq i32 %vec.iv.next, %vec.trip
  br i1 %vec.done, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  %vec.result = phi i32 [ %vec.acc.next, %vector.body ]
  ; No scalar tail is needed when n is already a multiple of four.
  %no.tail = icmp eq i32 %vec.trip, %n
  br i1 %no.tail, label %for.cond.cleanup, label %for.body.preheader1

for.body.preheader1:                              ; preds = %middle.block, %for.body.preheader
  %tail.start = phi i32 [ 0, %for.body.preheader ], [ %vec.trip, %middle.block ]
  %tail.init = phi i32 [ 2147483647, %for.body.preheader ], [ %vec.result, %middle.block ]
  br label %for.body

for.body:                                         ; preds = %for.body.preheader1, %for.body
  %tail.iv = phi i32 [ %tail.iv.next, %for.body ], [ %tail.start, %for.body.preheader1 ]
  %tail.acc = phi i32 [ %tail.acc.next, %for.body ], [ %tail.init, %for.body.preheader1 ]
  %elem.addr = getelementptr inbounds i32, i32* %x, i32 %tail.iv
  %elem = load i32, i32* %elem.addr, align 4
  %keep.tail = icmp slt i32 %tail.acc, %elem
  %tail.acc.next = select i1 %keep.tail, i32 %tail.acc, i32 %elem
  %tail.iv.next = add nuw nsw i32 %tail.iv, 1
  %tail.done = icmp eq i32 %tail.iv.next, %n
  br i1 %tail.done, label %for.cond.cleanup, label %for.body

for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
  %result = phi i32 [ 2147483647, %entry ], [ %vec.result, %middle.block ], [ %tail.acc.next, %for.body ]
  ret i32 %result
}
|
|
|
|
; Signed-maximum reduction over x[0..n). The vector loop keeps a <4 x i32>
; accumulator (seeded with -2147483648 in every lane) updated with a lane-wise
; icmp sgt + select; the lanes are collapsed once, after the loop, with
; llvm.experimental.vector.reduce.smax. A scalar loop handles the remaining
; elements (or all of them when n < 4). Returns -2147483648 when n <= 0.
define i32 @smax_i32(i32* nocapture readonly %x, i32 %n) {
; CHECK-LABEL: smax_i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cmp r1, #1
; CHECK-NEXT: blt .LBB9_3
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: cmp r1, #4
; CHECK-NEXT: bhs .LBB9_4
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: mov.w r2, #-2147483648
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: b .LBB9_7
; CHECK-NEXT: .LBB9_3:
; CHECK-NEXT: mov.w r2, #-2147483648
; CHECK-NEXT: b .LBB9_9
; CHECK-NEXT: .LBB9_4: @ %vector.ph
; CHECK-NEXT: bic r3, r1, #3
; CHECK-NEXT: movs r2, #1
; CHECK-NEXT: sub.w r12, r3, #4
; CHECK-NEXT: vmov.i32 q0, #0x80000000
; CHECK-NEXT: add.w lr, r2, r12, lsr #2
; CHECK-NEXT: mov r2, r0
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB9_5: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q1, [r2], #16
; CHECK-NEXT: vmax.s32 q0, q0, q1
; CHECK-NEXT: le lr, .LBB9_5
; CHECK-NEXT: @ %bb.6: @ %middle.block
; CHECK-NEXT: mov.w r2, #-2147483648
; CHECK-NEXT: cmp r3, r1
; CHECK-NEXT: vmaxv.s32 r2, q0
; CHECK-NEXT: beq .LBB9_9
; CHECK-NEXT: .LBB9_7: @ %for.body.preheader1
; CHECK-NEXT: sub.w lr, r1, r3
; CHECK-NEXT: add.w r0, r0, r3, lsl #2
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB9_8: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr r1, [r0], #4
; CHECK-NEXT: cmp r2, r1
; CHECK-NEXT: csel r2, r2, r1, gt
; CHECK-NEXT: le lr, .LBB9_8
; CHECK-NEXT: .LBB9_9: @ %for.cond.cleanup
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: pop {r7, pc}
entry:
  ; Nothing to do unless n is strictly positive.
  %has.elems = icmp sgt i32 %n, 0
  br i1 %has.elems, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:                               ; preds = %entry
  ; Fewer than four elements: go straight to the scalar loop.
  %too.short = icmp ult i32 %n, 4
  br i1 %too.short, label %for.body.preheader1, label %vector.ph

vector.ph:                                        ; preds = %for.body.preheader
  ; n rounded down to a multiple of four.
  %vec.trip = and i32 %n, -4
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %vec.iv = phi i32 [ 0, %vector.ph ], [ %vec.iv.next, %vector.body ]
  %vec.acc = phi <4 x i32> [ <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>, %vector.ph ], [ %vec.acc.next, %vector.body ]
  %chunk.addr = getelementptr inbounds i32, i32* %x, i32 %vec.iv
  %chunk.vptr = bitcast i32* %chunk.addr to <4 x i32>*
  %chunk = load <4 x i32>, <4 x i32>* %chunk.vptr, align 4
  %keep.acc = icmp sgt <4 x i32> %vec.acc, %chunk
  %vec.acc.next = select <4 x i1> %keep.acc, <4 x i32> %vec.acc, <4 x i32> %chunk
  %vec.iv.next = add i32 %vec.iv, 4
  %vec.done = icmp eq i32 %vec.iv.next, %vec.trip
  br i1 %vec.done, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  ; Collapse the four lane-wise maxima into one scalar.
  %vec.result = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> %vec.acc.next)
  %no.tail = icmp eq i32 %vec.trip, %n
  br i1 %no.tail, label %for.cond.cleanup, label %for.body.preheader1

for.body.preheader1:                              ; preds = %middle.block, %for.body.preheader
  %tail.start = phi i32 [ 0, %for.body.preheader ], [ %vec.trip, %middle.block ]
  %tail.init = phi i32 [ -2147483648, %for.body.preheader ], [ %vec.result, %middle.block ]
  br label %for.body

for.body:                                         ; preds = %for.body.preheader1, %for.body
  %tail.iv = phi i32 [ %tail.iv.next, %for.body ], [ %tail.start, %for.body.preheader1 ]
  %tail.acc = phi i32 [ %tail.acc.next, %for.body ], [ %tail.init, %for.body.preheader1 ]
  %elem.addr = getelementptr inbounds i32, i32* %x, i32 %tail.iv
  %elem = load i32, i32* %elem.addr, align 4
  %keep.tail = icmp sgt i32 %tail.acc, %elem
  %tail.acc.next = select i1 %keep.tail, i32 %tail.acc, i32 %elem
  %tail.iv.next = add nuw nsw i32 %tail.iv, 1
  %tail.done = icmp eq i32 %tail.iv.next, %n
  br i1 %tail.done, label %for.cond.cleanup, label %for.body

for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
  %result = phi i32 [ -2147483648, %entry ], [ %vec.result, %middle.block ], [ %tail.acc.next, %for.body ]
  ret i32 %result
}
|
|
|
|
; Signed-maximum reduction over x[0..n) with the vector reduction done inside
; the vector loop: each iteration reduces a four-lane load with
; llvm.experimental.vector.reduce.smax and merges it into a scalar i32
; accumulator seeded with -2147483648. The scalar loop covers whatever the
; vector loop did not. Returns -2147483648 when n <= 0.
define i32 @smax_i32_inloop(i32* nocapture readonly %x, i32 %n) {
; CHECK-LABEL: smax_i32_inloop:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, lr}
; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: cmp r1, #1
; CHECK-NEXT: blt .LBB10_3
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: mov r12, r0
; CHECK-NEXT: mov.w r0, #-2147483648
; CHECK-NEXT: cmp r1, #4
; CHECK-NEXT: bhs .LBB10_4
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: b .LBB10_7
; CHECK-NEXT: .LBB10_3:
; CHECK-NEXT: mov.w r0, #-2147483648
; CHECK-NEXT: b .LBB10_9
; CHECK-NEXT: .LBB10_4: @ %vector.ph
; CHECK-NEXT: bic r3, r1, #3
; CHECK-NEXT: movs r2, #1
; CHECK-NEXT: sub.w lr, r3, #4
; CHECK-NEXT: add.w lr, r2, lr, lsr #2
; CHECK-NEXT: mov r2, r12
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB10_5: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r2], #16
; CHECK-NEXT: mov.w r4, #-2147483648
; CHECK-NEXT: vmaxv.s32 r4, q0
; CHECK-NEXT: cmp r0, r4
; CHECK-NEXT: csel r0, r0, r4, gt
; CHECK-NEXT: le lr, .LBB10_5
; CHECK-NEXT: @ %bb.6: @ %middle.block
; CHECK-NEXT: cmp r3, r1
; CHECK-NEXT: it eq
; CHECK-NEXT: popeq {r4, pc}
; CHECK-NEXT: .LBB10_7: @ %for.body.preheader1
; CHECK-NEXT: sub.w lr, r1, r3
; CHECK-NEXT: add.w r1, r12, r3, lsl #2
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB10_8: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr r2, [r1], #4
; CHECK-NEXT: cmp r0, r2
; CHECK-NEXT: csel r0, r0, r2, gt
; CHECK-NEXT: le lr, .LBB10_8
; CHECK-NEXT: .LBB10_9: @ %for.cond.cleanup
; CHECK-NEXT: pop {r4, pc}
entry:
  ; Nothing to do unless n is strictly positive.
  %has.elems = icmp sgt i32 %n, 0
  br i1 %has.elems, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:                               ; preds = %entry
  ; Fewer than four elements: go straight to the scalar loop.
  %too.short = icmp ult i32 %n, 4
  br i1 %too.short, label %for.body.preheader1, label %vector.ph

vector.ph:                                        ; preds = %for.body.preheader
  ; n rounded down to a multiple of four.
  %vec.trip = and i32 %n, -4
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %vec.iv = phi i32 [ 0, %vector.ph ], [ %vec.iv.next, %vector.body ]
  %vec.acc = phi i32 [ -2147483648, %vector.ph ], [ %vec.acc.next, %vector.body ]
  %chunk.addr = getelementptr inbounds i32, i32* %x, i32 %vec.iv
  %chunk.vptr = bitcast i32* %chunk.addr to <4 x i32>*
  %chunk = load <4 x i32>, <4 x i32>* %chunk.vptr, align 4
  %chunk.max = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> %chunk)
  %keep.acc = icmp sgt i32 %vec.acc, %chunk.max
  %vec.acc.next = select i1 %keep.acc, i32 %vec.acc, i32 %chunk.max
  %vec.iv.next = add i32 %vec.iv, 4
  %vec.done = icmp eq i32 %vec.iv.next, %vec.trip
  br i1 %vec.done, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  %vec.result = phi i32 [ %vec.acc.next, %vector.body ]
  ; No scalar tail is needed when n is already a multiple of four.
  %no.tail = icmp eq i32 %vec.trip, %n
  br i1 %no.tail, label %for.cond.cleanup, label %for.body.preheader1

for.body.preheader1:                              ; preds = %middle.block, %for.body.preheader
  %tail.start = phi i32 [ 0, %for.body.preheader ], [ %vec.trip, %middle.block ]
  %tail.init = phi i32 [ -2147483648, %for.body.preheader ], [ %vec.result, %middle.block ]
  br label %for.body

for.body:                                         ; preds = %for.body.preheader1, %for.body
  %tail.iv = phi i32 [ %tail.iv.next, %for.body ], [ %tail.start, %for.body.preheader1 ]
  %tail.acc = phi i32 [ %tail.acc.next, %for.body ], [ %tail.init, %for.body.preheader1 ]
  %elem.addr = getelementptr inbounds i32, i32* %x, i32 %tail.iv
  %elem = load i32, i32* %elem.addr, align 4
  %keep.tail = icmp sgt i32 %tail.acc, %elem
  %tail.acc.next = select i1 %keep.tail, i32 %tail.acc, i32 %elem
  %tail.iv.next = add nuw nsw i32 %tail.iv, 1
  %tail.done = icmp eq i32 %tail.iv.next, %n
  br i1 %tail.done, label %for.cond.cleanup, label %for.body

for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
  %result = phi i32 [ -2147483648, %entry ], [ %vec.result, %middle.block ], [ %tail.acc.next, %for.body ]
  ret i32 %result
}
|
|
|
|
; Unsigned-minimum reduction over x[0..n). The vector loop keeps a <4 x i32>
; accumulator (all lanes seeded with -1, i.e. all bits set) updated with a
; lane-wise icmp ult + select; llvm.experimental.vector.reduce.umin collapses
; the lanes once after the loop. A scalar loop handles the remaining elements
; (or all of them when n < 4). Returns -1 when n <= 0.
define i32 @umin_i32(i32* nocapture readonly %x, i32 %n) {
; CHECK-LABEL: umin_i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cmp r1, #1
; CHECK-NEXT: blt .LBB11_3
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: cmp r1, #4
; CHECK-NEXT: bhs .LBB11_4
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: mov.w r2, #-1
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: b .LBB11_7
; CHECK-NEXT: .LBB11_3:
; CHECK-NEXT: mov.w r2, #-1
; CHECK-NEXT: b .LBB11_9
; CHECK-NEXT: .LBB11_4: @ %vector.ph
; CHECK-NEXT: bic r3, r1, #3
; CHECK-NEXT: movs r2, #1
; CHECK-NEXT: sub.w r12, r3, #4
; CHECK-NEXT: vmov.i8 q0, #0xff
; CHECK-NEXT: add.w lr, r2, r12, lsr #2
; CHECK-NEXT: mov r2, r0
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB11_5: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q1, [r2], #16
; CHECK-NEXT: vmin.u32 q0, q0, q1
; CHECK-NEXT: le lr, .LBB11_5
; CHECK-NEXT: @ %bb.6: @ %middle.block
; CHECK-NEXT: mov.w r2, #-1
; CHECK-NEXT: cmp r3, r1
; CHECK-NEXT: vminv.u32 r2, q0
; CHECK-NEXT: beq .LBB11_9
; CHECK-NEXT: .LBB11_7: @ %for.body.preheader1
; CHECK-NEXT: sub.w lr, r1, r3
; CHECK-NEXT: add.w r0, r0, r3, lsl #2
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB11_8: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr r1, [r0], #4
; CHECK-NEXT: cmp r2, r1
; CHECK-NEXT: csel r2, r2, r1, lo
; CHECK-NEXT: le lr, .LBB11_8
; CHECK-NEXT: .LBB11_9: @ %for.cond.cleanup
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: pop {r7, pc}
entry:
  ; Nothing to do unless n is strictly positive.
  %has.elems = icmp sgt i32 %n, 0
  br i1 %has.elems, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:                               ; preds = %entry
  ; Fewer than four elements: go straight to the scalar loop.
  %too.short = icmp ult i32 %n, 4
  br i1 %too.short, label %for.body.preheader1, label %vector.ph

vector.ph:                                        ; preds = %for.body.preheader
  ; n rounded down to a multiple of four.
  %vec.trip = and i32 %n, -4
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %vec.iv = phi i32 [ 0, %vector.ph ], [ %vec.iv.next, %vector.body ]
  %vec.acc = phi <4 x i32> [ <i32 -1, i32 -1, i32 -1, i32 -1>, %vector.ph ], [ %vec.acc.next, %vector.body ]
  %chunk.addr = getelementptr inbounds i32, i32* %x, i32 %vec.iv
  %chunk.vptr = bitcast i32* %chunk.addr to <4 x i32>*
  %chunk = load <4 x i32>, <4 x i32>* %chunk.vptr, align 4
  %keep.acc = icmp ult <4 x i32> %vec.acc, %chunk
  %vec.acc.next = select <4 x i1> %keep.acc, <4 x i32> %vec.acc, <4 x i32> %chunk
  %vec.iv.next = add i32 %vec.iv, 4
  %vec.done = icmp eq i32 %vec.iv.next, %vec.trip
  br i1 %vec.done, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  ; Collapse the four lane-wise minima into one scalar.
  %vec.result = call i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32> %vec.acc.next)
  %no.tail = icmp eq i32 %vec.trip, %n
  br i1 %no.tail, label %for.cond.cleanup, label %for.body.preheader1

for.body.preheader1:                              ; preds = %middle.block, %for.body.preheader
  %tail.start = phi i32 [ 0, %for.body.preheader ], [ %vec.trip, %middle.block ]
  %tail.init = phi i32 [ -1, %for.body.preheader ], [ %vec.result, %middle.block ]
  br label %for.body

for.body:                                         ; preds = %for.body.preheader1, %for.body
  %tail.iv = phi i32 [ %tail.iv.next, %for.body ], [ %tail.start, %for.body.preheader1 ]
  %tail.acc = phi i32 [ %tail.acc.next, %for.body ], [ %tail.init, %for.body.preheader1 ]
  %elem.addr = getelementptr inbounds i32, i32* %x, i32 %tail.iv
  %elem = load i32, i32* %elem.addr, align 4
  %keep.tail = icmp ult i32 %tail.acc, %elem
  %tail.acc.next = select i1 %keep.tail, i32 %tail.acc, i32 %elem
  %tail.iv.next = add nuw nsw i32 %tail.iv, 1
  %tail.done = icmp eq i32 %tail.iv.next, %n
  br i1 %tail.done, label %for.cond.cleanup, label %for.body

for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
  %result = phi i32 [ -1, %entry ], [ %vec.result, %middle.block ], [ %tail.acc.next, %for.body ]
  ret i32 %result
}
|
|
|
|
; Unsigned-minimum reduction over x[0..n) with the vector reduction done inside
; the vector loop: each iteration reduces a four-lane load with
; llvm.experimental.vector.reduce.umin and merges it into a scalar i32
; accumulator seeded with -1 (all bits set). The scalar loop covers the
; elements the vector loop did not, using the same unsigned less-than
; selection so the whole function computes one consistent minimum.
; Returns -1 when n <= 0.
;
; NOTE(review): the scalar-tail predicate (icmp ult) and the matching expected
; "csel ..., lo" line for %for.body were adjusted by hand; regenerate the
; assertions with utils/update_llc_test_checks.py to confirm the exact output.
define i32 @umin_i32_inloop(i32* nocapture readonly %x, i32 %n) {
; CHECK-LABEL: umin_i32_inloop:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, lr}
; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: cmp r1, #1
; CHECK-NEXT: blt .LBB12_3
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: mov r12, r0
; CHECK-NEXT: mov.w r0, #-1
; CHECK-NEXT: cmp r1, #4
; CHECK-NEXT: bhs .LBB12_4
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: b .LBB12_7
; CHECK-NEXT: .LBB12_3:
; CHECK-NEXT: mov.w r0, #-1
; CHECK-NEXT: b .LBB12_9
; CHECK-NEXT: .LBB12_4: @ %vector.ph
; CHECK-NEXT: bic r3, r1, #3
; CHECK-NEXT: movs r2, #1
; CHECK-NEXT: sub.w lr, r3, #4
; CHECK-NEXT: add.w lr, r2, lr, lsr #2
; CHECK-NEXT: mov r2, r12
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB12_5: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r2], #16
; CHECK-NEXT: mov.w r4, #-1
; CHECK-NEXT: vminv.u32 r4, q0
; CHECK-NEXT: cmp r0, r4
; CHECK-NEXT: csel r0, r0, r4, lo
; CHECK-NEXT: le lr, .LBB12_5
; CHECK-NEXT: @ %bb.6: @ %middle.block
; CHECK-NEXT: cmp r3, r1
; CHECK-NEXT: it eq
; CHECK-NEXT: popeq {r4, pc}
; CHECK-NEXT: .LBB12_7: @ %for.body.preheader1
; CHECK-NEXT: sub.w lr, r1, r3
; CHECK-NEXT: add.w r1, r12, r3, lsl #2
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB12_8: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr r2, [r1], #4
; CHECK-NEXT: cmp r0, r2
; CHECK-NEXT: csel r0, r0, r2, lo
; CHECK-NEXT: le lr, .LBB12_8
; CHECK-NEXT: .LBB12_9: @ %for.cond.cleanup
; CHECK-NEXT: pop {r4, pc}
entry:
  ; Nothing to do unless n is strictly positive.
  %cmp6 = icmp sgt i32 %n, 0
  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:                               ; preds = %entry
  ; Fewer than four elements: go straight to the scalar loop.
  %min.iters.check = icmp ult i32 %n, 4
  br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph

vector.ph:                                        ; preds = %for.body.preheader
  ; n rounded down to a multiple of four.
  %n.vec = and i32 %n, -4
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi i32 [ -1, %vector.ph ], [ %3, %vector.body ]
  %0 = getelementptr inbounds i32, i32* %x, i32 %index
  %1 = bitcast i32* %0 to <4 x i32>*
  %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
  ; Reduce the four loaded lanes, then keep the smaller of it and the accumulator.
  %l5 = call i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32> %wide.load)
  %2 = icmp ult i32 %vec.phi, %l5
  %3 = select i1 %2, i32 %vec.phi, i32 %l5
  %index.next = add i32 %index, 4
  %4 = icmp eq i32 %index.next, %n.vec
  br i1 %4, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  %5 = phi i32 [ %3, %vector.body ]
  ; No scalar tail is needed when n is already a multiple of four.
  %cmp.n = icmp eq i32 %n.vec, %n
  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1

for.body.preheader1:                              ; preds = %middle.block, %for.body.preheader
  %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
  %r.07.ph = phi i32 [ -1, %for.body.preheader ], [ %5, %middle.block ]
  br label %for.body

for.body:                                         ; preds = %for.body.preheader1, %for.body
  %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
  %r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
  %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
  %6 = load i32, i32* %arrayidx, align 4
  ; Unsigned less-than: keep the running minimum, matching the vector loop.
  %c = icmp ult i32 %r.07, %6
  %add = select i1 %c, i32 %r.07, i32 %6
  %inc = add nuw nsw i32 %i.08, 1
  %exitcond = icmp eq i32 %inc, %n
  br i1 %exitcond, label %for.cond.cleanup, label %for.body

for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
  %r.0.lcssa = phi i32 [ -1, %entry ], [ %5, %middle.block ], [ %add, %for.body ]
  ret i32 %r.0.lcssa
}
|
|
|
|
; Unsigned-maximum reduction over x[0..n). The vector loop keeps a <4 x i32>
; accumulator (zero-initialised) updated with a lane-wise icmp ugt + select;
; llvm.experimental.vector.reduce.umax collapses the lanes once after the
; loop. A scalar loop handles the remaining elements (or all of them when
; n < 4). Returns 0 when n <= 0.
define i32 @umax_i32(i32* nocapture readonly %x, i32 %n) {
; CHECK-LABEL: umax_i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cmp r1, #1
; CHECK-NEXT: blt .LBB13_3
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: cmp r1, #4
; CHECK-NEXT: bhs .LBB13_4
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: b .LBB13_7
; CHECK-NEXT: .LBB13_3:
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: b .LBB13_9
; CHECK-NEXT: .LBB13_4: @ %vector.ph
; CHECK-NEXT: bic r3, r1, #3
; CHECK-NEXT: movs r2, #1
; CHECK-NEXT: sub.w r12, r3, #4
; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: add.w lr, r2, r12, lsr #2
; CHECK-NEXT: mov r2, r0
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB13_5: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q1, [r2], #16
; CHECK-NEXT: vmax.u32 q0, q0, q1
; CHECK-NEXT: le lr, .LBB13_5
; CHECK-NEXT: @ %bb.6: @ %middle.block
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: cmp r3, r1
; CHECK-NEXT: vmaxv.u32 r2, q0
; CHECK-NEXT: beq .LBB13_9
; CHECK-NEXT: .LBB13_7: @ %for.body.preheader1
; CHECK-NEXT: sub.w lr, r1, r3
; CHECK-NEXT: add.w r0, r0, r3, lsl #2
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB13_8: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr r1, [r0], #4
; CHECK-NEXT: cmp r2, r1
; CHECK-NEXT: csel r2, r2, r1, hi
; CHECK-NEXT: le lr, .LBB13_8
; CHECK-NEXT: .LBB13_9: @ %for.cond.cleanup
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: pop {r7, pc}
entry:
  ; Nothing to do unless n is strictly positive.
  %has.elems = icmp sgt i32 %n, 0
  br i1 %has.elems, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:                               ; preds = %entry
  ; Fewer than four elements: go straight to the scalar loop.
  %too.short = icmp ult i32 %n, 4
  br i1 %too.short, label %for.body.preheader1, label %vector.ph

vector.ph:                                        ; preds = %for.body.preheader
  ; n rounded down to a multiple of four.
  %vec.trip = and i32 %n, -4
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %vec.iv = phi i32 [ 0, %vector.ph ], [ %vec.iv.next, %vector.body ]
  %vec.acc = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %vec.acc.next, %vector.body ]
  %chunk.addr = getelementptr inbounds i32, i32* %x, i32 %vec.iv
  %chunk.vptr = bitcast i32* %chunk.addr to <4 x i32>*
  %chunk = load <4 x i32>, <4 x i32>* %chunk.vptr, align 4
  %keep.acc = icmp ugt <4 x i32> %vec.acc, %chunk
  %vec.acc.next = select <4 x i1> %keep.acc, <4 x i32> %vec.acc, <4 x i32> %chunk
  %vec.iv.next = add i32 %vec.iv, 4
  %vec.done = icmp eq i32 %vec.iv.next, %vec.trip
  br i1 %vec.done, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  ; Collapse the four lane-wise maxima into one scalar.
  %vec.result = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> %vec.acc.next)
  %no.tail = icmp eq i32 %vec.trip, %n
  br i1 %no.tail, label %for.cond.cleanup, label %for.body.preheader1

for.body.preheader1:                              ; preds = %middle.block, %for.body.preheader
  %tail.start = phi i32 [ 0, %for.body.preheader ], [ %vec.trip, %middle.block ]
  %tail.init = phi i32 [ 0, %for.body.preheader ], [ %vec.result, %middle.block ]
  br label %for.body

for.body:                                         ; preds = %for.body.preheader1, %for.body
  %tail.iv = phi i32 [ %tail.iv.next, %for.body ], [ %tail.start, %for.body.preheader1 ]
  %tail.acc = phi i32 [ %tail.acc.next, %for.body ], [ %tail.init, %for.body.preheader1 ]
  %elem.addr = getelementptr inbounds i32, i32* %x, i32 %tail.iv
  %elem = load i32, i32* %elem.addr, align 4
  %keep.tail = icmp ugt i32 %tail.acc, %elem
  %tail.acc.next = select i1 %keep.tail, i32 %tail.acc, i32 %elem
  %tail.iv.next = add nuw nsw i32 %tail.iv, 1
  %tail.done = icmp eq i32 %tail.iv.next, %n
  br i1 %tail.done, label %for.cond.cleanup, label %for.body

for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
  %result = phi i32 [ 0, %entry ], [ %vec.result, %middle.block ], [ %tail.acc.next, %for.body ]
  ret i32 %result
}
|
|
|
|
; Unsigned-maximum reduction over x[0..n) with the vector reduction done inside
; the vector loop: each iteration reduces a four-lane load with
; llvm.experimental.vector.reduce.umax and merges it into a scalar i32
; accumulator seeded with 0. The scalar loop covers whatever the vector loop
; did not. Returns 0 when n <= 0.
define i32 @umax_i32_inloop(i32* nocapture readonly %x, i32 %n) {
; CHECK-LABEL: umax_i32_inloop:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, lr}
; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: cmp r1, #1
; CHECK-NEXT: blt .LBB14_8
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: mov r12, r0
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: cmp r1, #4
; CHECK-NEXT: mov.w r0, #0
; CHECK-NEXT: blo .LBB14_5
; CHECK-NEXT: @ %bb.2: @ %vector.ph
; CHECK-NEXT: bic r3, r1, #3
; CHECK-NEXT: movs r2, #1
; CHECK-NEXT: subs r0, r3, #4
; CHECK-NEXT: add.w lr, r2, r0, lsr #2
; CHECK-NEXT: movs r0, #0
; CHECK-NEXT: mov r2, r12
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB14_3: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r2], #16
; CHECK-NEXT: movs r4, #0
; CHECK-NEXT: vmaxv.u32 r4, q0
; CHECK-NEXT: cmp r0, r4
; CHECK-NEXT: csel r0, r0, r4, hi
; CHECK-NEXT: le lr, .LBB14_3
; CHECK-NEXT: @ %bb.4: @ %middle.block
; CHECK-NEXT: cmp r3, r1
; CHECK-NEXT: it eq
; CHECK-NEXT: popeq {r4, pc}
; CHECK-NEXT: .LBB14_5: @ %for.body.preheader1
; CHECK-NEXT: sub.w lr, r1, r3
; CHECK-NEXT: add.w r1, r12, r3, lsl #2
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB14_6: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr r2, [r1], #4
; CHECK-NEXT: cmp r0, r2
; CHECK-NEXT: csel r0, r0, r2, hi
; CHECK-NEXT: le lr, .LBB14_6
; CHECK-NEXT: @ %bb.7: @ %for.cond.cleanup
; CHECK-NEXT: pop {r4, pc}
; CHECK-NEXT: .LBB14_8:
; CHECK-NEXT: movs r0, #0
; CHECK-NEXT: pop {r4, pc}
entry:
  ; Nothing to do unless n is strictly positive.
  %has.elems = icmp sgt i32 %n, 0
  br i1 %has.elems, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:                               ; preds = %entry
  ; Fewer than four elements: go straight to the scalar loop.
  %too.short = icmp ult i32 %n, 4
  br i1 %too.short, label %for.body.preheader1, label %vector.ph

vector.ph:                                        ; preds = %for.body.preheader
  ; n rounded down to a multiple of four.
  %vec.trip = and i32 %n, -4
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %vec.iv = phi i32 [ 0, %vector.ph ], [ %vec.iv.next, %vector.body ]
  %vec.acc = phi i32 [ 0, %vector.ph ], [ %vec.acc.next, %vector.body ]
  %chunk.addr = getelementptr inbounds i32, i32* %x, i32 %vec.iv
  %chunk.vptr = bitcast i32* %chunk.addr to <4 x i32>*
  %chunk = load <4 x i32>, <4 x i32>* %chunk.vptr, align 4
  %chunk.max = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> %chunk)
  %keep.acc = icmp ugt i32 %vec.acc, %chunk.max
  %vec.acc.next = select i1 %keep.acc, i32 %vec.acc, i32 %chunk.max
  %vec.iv.next = add i32 %vec.iv, 4
  %vec.done = icmp eq i32 %vec.iv.next, %vec.trip
  br i1 %vec.done, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  %vec.result = phi i32 [ %vec.acc.next, %vector.body ]
  ; No scalar tail is needed when n is already a multiple of four.
  %no.tail = icmp eq i32 %vec.trip, %n
  br i1 %no.tail, label %for.cond.cleanup, label %for.body.preheader1

for.body.preheader1:                              ; preds = %middle.block, %for.body.preheader
  %tail.start = phi i32 [ 0, %for.body.preheader ], [ %vec.trip, %middle.block ]
  %tail.init = phi i32 [ 0, %for.body.preheader ], [ %vec.result, %middle.block ]
  br label %for.body

for.body:                                         ; preds = %for.body.preheader1, %for.body
  %tail.iv = phi i32 [ %tail.iv.next, %for.body ], [ %tail.start, %for.body.preheader1 ]
  %tail.acc = phi i32 [ %tail.acc.next, %for.body ], [ %tail.init, %for.body.preheader1 ]
  %elem.addr = getelementptr inbounds i32, i32* %x, i32 %tail.iv
  %elem = load i32, i32* %elem.addr, align 4
  %keep.tail = icmp ugt i32 %tail.acc, %elem
  %tail.acc.next = select i1 %keep.tail, i32 %tail.acc, i32 %elem
  %tail.iv.next = add nuw nsw i32 %tail.iv, 1
  %tail.done = icmp eq i32 %tail.iv.next, %n
  br i1 %tail.done, label %for.cond.cleanup, label %for.body

for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
  %result = phi i32 [ 0, %entry ], [ %vec.result, %middle.block ], [ %tail.acc.next, %for.body ]
  ret i32 %result
}
|
|
|
|
; Floating-point minimum reduction over x[0..n). The vector loop keeps a
; <4 x float> accumulator (zero-initialised) updated with a lane-wise
; fcmp ult + select; llvm.experimental.vector.reduce.fmin collapses the lanes
; once after the loop. A scalar loop with the same fcmp ult + select handles
; the remaining elements (or all of them when n < 4). Returns 0.0 when n <= 0.
define float @fmin_f32(float* nocapture readonly %x, i32 %n) {
; CHECK-LABEL: fmin_f32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cmp r1, #1
; CHECK-NEXT: blt .LBB15_3
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: cmp r1, #4
; CHECK-NEXT: bhs .LBB15_4
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: vldr s0, .LCPI15_0
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: b .LBB15_7
; CHECK-NEXT: .LBB15_3:
; CHECK-NEXT: vldr s0, .LCPI15_0
; CHECK-NEXT: b .LBB15_9
; CHECK-NEXT: .LBB15_4: @ %vector.ph
; CHECK-NEXT: bic r2, r1, #3
; CHECK-NEXT: movs r3, #1
; CHECK-NEXT: sub.w r12, r2, #4
; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: add.w lr, r3, r12, lsr #2
; CHECK-NEXT: mov r3, r0
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB15_5: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q1, [r3], #16
; CHECK-NEXT: vcmp.f32 lt, q0, q1
; CHECK-NEXT: vpsel q0, q0, q1
; CHECK-NEXT: le lr, .LBB15_5
; CHECK-NEXT: @ %bb.6: @ %middle.block
; CHECK-NEXT: vmov.f32 s4, s2
; CHECK-NEXT: cmp r2, r1
; CHECK-NEXT: vmov.f32 s5, s3
; CHECK-NEXT: vminnm.f32 q0, q0, q1
; CHECK-NEXT: vmov r3, s1
; CHECK-NEXT: vdup.32 q1, r3
; CHECK-NEXT: vminnm.f32 q0, q0, q1
; CHECK-NEXT: beq .LBB15_9
; CHECK-NEXT: .LBB15_7: @ %for.body.preheader1
; CHECK-NEXT: sub.w lr, r1, r2
; CHECK-NEXT: add.w r0, r0, r2, lsl #2
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB15_8: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldmia r0!, {s4}
; CHECK-NEXT: vcmp.f32 s0, s4
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: vselge.f32 s0, s4, s0
; CHECK-NEXT: le lr, .LBB15_8
; CHECK-NEXT: .LBB15_9: @ %for.cond.cleanup
; CHECK-NEXT: vmov r0, s0
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .p2align 2
; CHECK-NEXT: @ %bb.10:
; CHECK-NEXT: .LCPI15_0:
; CHECK-NEXT: .long 0x00000000 @ float 0
entry:
  ; Nothing to do unless n is strictly positive.
  %has.elems = icmp sgt i32 %n, 0
  br i1 %has.elems, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:                               ; preds = %entry
  ; Fewer than four elements: go straight to the scalar loop.
  %too.short = icmp ult i32 %n, 4
  br i1 %too.short, label %for.body.preheader1, label %vector.ph

vector.ph:                                        ; preds = %for.body.preheader
  ; n rounded down to a multiple of four.
  %vec.trip = and i32 %n, -4
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %vec.iv = phi i32 [ 0, %vector.ph ], [ %vec.iv.next, %vector.body ]
  %vec.acc = phi <4 x float> [ zeroinitializer, %vector.ph ], [ %vec.acc.next, %vector.body ]
  %chunk.addr = getelementptr inbounds float, float* %x, i32 %vec.iv
  %chunk.vptr = bitcast float* %chunk.addr to <4 x float>*
  %chunk = load <4 x float>, <4 x float>* %chunk.vptr, align 4
  %keep.acc = fcmp ult <4 x float> %vec.acc, %chunk
  %vec.acc.next = select <4 x i1> %keep.acc, <4 x float> %vec.acc, <4 x float> %chunk
  %vec.iv.next = add i32 %vec.iv, 4
  %vec.done = icmp eq i32 %vec.iv.next, %vec.trip
  br i1 %vec.done, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  ; Collapse the four lane-wise minima into one scalar.
  %vec.result = call float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float> %vec.acc.next)
  %no.tail = icmp eq i32 %vec.trip, %n
  br i1 %no.tail, label %for.cond.cleanup, label %for.body.preheader1

for.body.preheader1:                              ; preds = %middle.block, %for.body.preheader
  %tail.start = phi i32 [ 0, %for.body.preheader ], [ %vec.trip, %middle.block ]
  %tail.init = phi float [ 0.0, %for.body.preheader ], [ %vec.result, %middle.block ]
  br label %for.body

for.body:                                         ; preds = %for.body.preheader1, %for.body
  %tail.iv = phi i32 [ %tail.iv.next, %for.body ], [ %tail.start, %for.body.preheader1 ]
  %tail.acc = phi float [ %tail.acc.next, %for.body ], [ %tail.init, %for.body.preheader1 ]
  %elem.addr = getelementptr inbounds float, float* %x, i32 %tail.iv
  %elem = load float, float* %elem.addr, align 4
  %keep.tail = fcmp ult float %tail.acc, %elem
  %tail.acc.next = select i1 %keep.tail, float %tail.acc, float %elem
  %tail.iv.next = add nuw nsw i32 %tail.iv, 1
  %tail.done = icmp eq i32 %tail.iv.next, %n
  br i1 %tail.done, label %for.cond.cleanup, label %for.body

for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
  %result = phi float [ 0.0, %entry ], [ %vec.result, %middle.block ], [ %tail.acc.next, %for.body ]
  ret float %result
}
|
|
|
|
; fmax_f32: floating-point "max" reduction over x[0..n), written out in
; pre-vectorized form: a 4-wide vector loop over the first (n & -4) elements,
; a horizontal fmax reduction of the vector accumulator, and a scalar loop for
; the remaining elements.
;
; Both loops use the `fcmp ugt` + `select` idiom, and both accumulators are
; seeded with 0.0 (zeroinitializer for the vector one), not with x[0]; the
; function also returns 0.0 when n <= 0.
;
; The autogenerated assertions below expect both loops to be emitted as
; dls/le loops, the vector compare/select as vcmp.f32 + vpsel, the horizontal
; reduction as two vmaxnm.f32 steps, and the scalar select as vselge.f32.
define float @fmax_f32(float* nocapture readonly %x, i32 %n) {
; CHECK-LABEL: fmax_f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r1, #1
; CHECK-NEXT:    blt .LBB16_3
; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
; CHECK-NEXT:    cmp r1, #4
; CHECK-NEXT:    bhs .LBB16_4
; CHECK-NEXT:  @ %bb.2:
; CHECK-NEXT:    vldr s0, .LCPI16_0
; CHECK-NEXT:    movs r2, #0
; CHECK-NEXT:    b .LBB16_7
; CHECK-NEXT:  .LBB16_3:
; CHECK-NEXT:    vldr s0, .LCPI16_0
; CHECK-NEXT:    b .LBB16_9
; CHECK-NEXT:  .LBB16_4: @ %vector.ph
; CHECK-NEXT:    bic r2, r1, #3
; CHECK-NEXT:    movs r3, #1
; CHECK-NEXT:    sub.w r12, r2, #4
; CHECK-NEXT:    vmov.i32 q0, #0x0
; CHECK-NEXT:    add.w lr, r3, r12, lsr #2
; CHECK-NEXT:    mov r3, r0
; CHECK-NEXT:    dls lr, lr
; CHECK-NEXT:  .LBB16_5: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q1, [r3], #16
; CHECK-NEXT:    vcmp.f32 lt, q1, q0
; CHECK-NEXT:    vpsel q0, q0, q1
; CHECK-NEXT:    le lr, .LBB16_5
; CHECK-NEXT:  @ %bb.6: @ %middle.block
; CHECK-NEXT:    vmov.f32 s4, s2
; CHECK-NEXT:    cmp r2, r1
; CHECK-NEXT:    vmov.f32 s5, s3
; CHECK-NEXT:    vmaxnm.f32 q0, q0, q1
; CHECK-NEXT:    vmov r3, s1
; CHECK-NEXT:    vdup.32 q1, r3
; CHECK-NEXT:    vmaxnm.f32 q0, q0, q1
; CHECK-NEXT:    beq .LBB16_9
; CHECK-NEXT:  .LBB16_7: @ %for.body.preheader1
; CHECK-NEXT:    sub.w lr, r1, r2
; CHECK-NEXT:    add.w r0, r0, r2, lsl #2
; CHECK-NEXT:    dls lr, lr
; CHECK-NEXT:  .LBB16_8: @ %for.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldmia r0!, {s4}
; CHECK-NEXT:    vcmp.f32 s4, s0
; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
; CHECK-NEXT:    vselge.f32 s0, s4, s0
; CHECK-NEXT:    le lr, .LBB16_8
; CHECK-NEXT:  .LBB16_9: @ %for.cond.cleanup
; CHECK-NEXT:    vmov r0, s0
; CHECK-NEXT:    pop {r7, pc}
; CHECK-NEXT:    .p2align 2
; CHECK-NEXT:  @ %bb.10:
; CHECK-NEXT:  .LCPI16_0:
; CHECK-NEXT:    .long 0x00000000 @ float 0
entry:
  ; n <= 0: skip both loops; the exit phi then yields 0.0.
  %cmp6 = icmp sgt i32 %n, 0
  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:                               ; preds = %entry
  ; Fewer than 4 elements: go straight to the scalar loop.
  %min.iters.check = icmp ult i32 %n, 4
  br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph

vector.ph:                                        ; preds = %for.body.preheader
  ; n.vec = n rounded down to a multiple of 4 (vector-loop trip count * 4).
  %n.vec = and i32 %n, -4
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  ; Per-lane running value, seeded with zeros.
  %vec.phi = phi <4 x float> [ zeroinitializer, %vector.ph ], [ %3, %vector.body ]
  ; Load x[index .. index+3]; the load is only element-aligned (align 4).
  %0 = getelementptr inbounds float, float* %x, i32 %index
  %1 = bitcast float* %0 to <4 x float>*
  %wide.load = load <4 x float>, <4 x float>* %1, align 4
  ; Keep the accumulator lane where it is greater than (or unordered with)
  ; the loaded lane, otherwise take the loaded lane.
  %2 = fcmp ugt <4 x float> %vec.phi, %wide.load
  %3 = select <4 x i1> %2, <4 x float> %vec.phi, <4 x float> %wide.load
  %index.next = add i32 %index, 4
  %4 = icmp eq i32 %index.next, %n.vec
  br i1 %4, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  ; Collapse the four lanes into one scalar.
  %5 = call float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float> %3)
  ; If n was already a multiple of 4 there is no scalar remainder.
  %cmp.n = icmp eq i32 %n.vec, %n
  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1

for.body.preheader1:                              ; preds = %middle.block, %for.body.preheader
  ; Scalar loop start state: (0, 0.0) when the vector loop was skipped,
  ; otherwise (n.vec, reduced vector result).
  %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
  %r.07.ph = phi float [ 0.0, %for.body.preheader ], [ %5, %middle.block ]
  br label %for.body

for.body:                                         ; preds = %for.body.preheader1, %for.body
  %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
  %r.07 = phi float [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
  %arrayidx = getelementptr inbounds float, float* %x, i32 %i.08
  %6 = load float, float* %arrayidx, align 4
  ; Scalar form of the compare/select used in vector.body. Despite its name,
  ; %add holds the selected value, not a sum.
  %c = fcmp ugt float %r.07, %6
  %add = select i1 %c, float %r.07, float %6
  %inc = add nuw nsw i32 %i.08, 1
  %exitcond = icmp eq i32 %inc, %n
  br i1 %exitcond, label %for.cond.cleanup, label %for.body

for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
  ; Result: 0.0 for n <= 0, the vector reduction when there was no scalar
  ; remainder, otherwise the value produced by the scalar loop.
  %r.0.lcssa = phi float [ 0.0, %entry ], [ %5, %middle.block ], [ %add, %for.body ]
  ret float %r.0.lcssa
}
|
|
|
|
; Declarations of the vector-reduction intrinsics, all on 4-element vectors.
; NOTE(review): presumably each is referenced by a test function earlier in
; this file; only the fmin/fmax uses are visible from this section.

; Integer arithmetic and bitwise reductions.
declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>)
declare i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32>)
declare i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32>)
declare i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32>)
declare i32 @llvm.experimental.vector.reduce.xor.v4i32(<4 x i32>)

; Floating-point add/mul reductions ("v2" forms): these take an extra scalar
; float operand ahead of the vector operand.
declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float, <4 x float>)
declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float, <4 x float>)

; Integer signed/unsigned min/max reductions.
declare i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32>)
declare i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32>)
declare i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32>)
declare i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32>)

; Floating-point min/max reductions.
declare float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float>)
declare float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float>)
|