mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-11-26 04:32:44 +01:00
18fc27f084
Recently we improved the lowering of low overhead loops and tail predicated loops, but concentrated first on the DLS do style loops. This extends those improvements over to the WLS while loops, improving the chance of lowering them successfully. To do this the lowering has to change a little as the instructions are terminators that produce a value - something that needs to be treated carefully.

Lowering starts at the Hardware Loop pass, inserting a new llvm.test.start.loop.iterations that produces both an i1 to control the loop entry and an i32 similar to the llvm.start.loop.iterations intrinsic added for do loops. This feeds into the loop phi, properly gluing the values together:

  %wls = call { i32, i1 } @llvm.test.start.loop.iterations.i32(i32 %div)
  %wls0 = extractvalue { i32, i1 } %wls, 0
  %wls1 = extractvalue { i32, i1 } %wls, 1
  br i1 %wls1, label %loop.ph, label %loop.exit
  ...
loop:
  %lsr.iv = phi i32 [ %wls0, %loop.ph ], [ %iv.next, %loop ]
  ..
  %iv.next = call i32 @llvm.loop.decrement.reg.i32(i32 %lsr.iv, i32 1)
  %cmp = icmp ne i32 %iv.next, 0
  br i1 %cmp, label %loop, label %loop.exit

The llvm.test.start.loop.iterations intrinsic needs to be lowered through ISel lowering as a pair of WLS and WLSSETUP nodes, which each get converted to t2WhileLoopSetup and t2WhileLoopStart Pseudos. This helps prevent t2WhileLoopStart from being a terminator that produces a value, something difficult to control at that stage in the pipeline. Instead the t2WhileLoopSetup produces the value of LR (essentially acting as a lr = subs rn, 0), and t2WhileLoopStart consumes that lr value (the Bcc).

These are then converted into a single t2WhileLoopStartLR at the same point as t2DoLoopStartTP and t2LoopEndDec. Otherwise we revert the loop to prevent them from progressing further in the pipeline. The t2WhileLoopStartLR is a single instruction that takes a GPR and produces LR, similar to the WLS instruction.

  %1:gprlr = t2WhileLoopStartLR %0:rgpr, %bb.3
  t2B %bb.1
  ...
bb.2.loop:
  %2:gprlr = PHI %1:gprlr, %bb.1, %3:gprlr, %bb.2
  ...
  %3:gprlr = t2LoopEndDec %2:gprlr, %bb.2
  t2B %bb.3

The t2WhileLoopStartLR can then be treated similar to the other low overhead loop pseudos, eventually being lowered to a WLS providing the branches are within range.

Differential Revision: https://reviews.llvm.org/D97729
596 lines
22 KiB
LLVM
596 lines
22 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
|
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp %s -o - | FileCheck %s
|
|
|
|
; F32
|
|
|
|
; maxnum(fabs(%a), fabs(%b)) on <4 x float>. The expected output is a single
; vmaxnma.f32 with no separate absolute-value instructions.
define arm_aapcs_vfpcc <4 x float> @maxf32(<4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: maxf32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmaxnma.f32 q0, q1
; CHECK-NEXT:    bx lr
  %aa = tail call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %a)
  %bb = tail call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %b)
  %c = tail call fast <4 x float> @llvm.maxnum.v4f32(<4 x float> %aa, <4 x float> %bb)
  ret <4 x float> %c
}
|
|
|
|
; Commuted form of @maxf32: the maxnum operands are passed as (%bb, %aa).
; The expected output is still the single vmaxnma.f32 q0, q1.
define arm_aapcs_vfpcc <4 x float> @maxf32_c(<4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: maxf32_c:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmaxnma.f32 q0, q1
; CHECK-NEXT:    bx lr
  %aa = tail call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %a)
  %bb = tail call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %b)
  %c = tail call fast <4 x float> @llvm.maxnum.v4f32(<4 x float> %bb, <4 x float> %aa)
  ret <4 x float> %c
}
|
|
|
|
; minnum(fabs(%a), fabs(%b)) on <4 x float>. The expected output is a single
; vminnma.f32 with no separate absolute-value instructions.
define arm_aapcs_vfpcc <4 x float> @minf32(<4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: minf32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vminnma.f32 q0, q1
; CHECK-NEXT:    bx lr
  %aa = tail call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %a)
  %bb = tail call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %b)
  %c = tail call fast <4 x float> @llvm.minnum.v4f32(<4 x float> %aa, <4 x float> %bb)
  ret <4 x float> %c
}
|
|
|
|
; Commuted form of @minf32: the minnum operands are passed as (%bb, %aa).
; The expected output is still the single vminnma.f32 q0, q1.
define arm_aapcs_vfpcc <4 x float> @minf32_c(<4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: minf32_c:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vminnma.f32 q0, q1
; CHECK-NEXT:    bx lr
  %aa = tail call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %a)
  %bb = tail call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %b)
  %c = tail call fast <4 x float> @llvm.minnum.v4f32(<4 x float> %bb, <4 x float> %aa)
  ret <4 x float> %c
}
|
|
|
|
|
|
; Predicated vmaxnma intrinsic on <4 x float> with operands (%a, %b) and a mask
; computed as fcmp olt %a, %b. The expected output is a VPT block
; (vpt.f32 gt, q1, q0) containing vmaxnmat.f32 q0, q1.
define arm_aapcs_vfpcc <4 x float> @maxpredf32(<4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: maxpredf32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vpt.f32 gt, q1, q0
; CHECK-NEXT:    vmaxnmat.f32 q0, q1
; CHECK-NEXT:    bx lr
  %c = fcmp olt <4 x float> %a, %b
  %s = tail call fast <4 x float> @llvm.arm.mve.vmaxnma.predicated.v4f32.v4i1(<4 x float> %a, <4 x float> %b, <4 x i1> %c)
  ret <4 x float> %s
}
|
|
|
|
; Swapped-operand form of @maxpredf32: the intrinsic receives (%b, %a).
; The expected output keeps that order (vmaxnmat.f32 q1, q0) and then copies
; the result into the return register with vmov q0, q1.
define arm_aapcs_vfpcc <4 x float> @maxpredf32_c(<4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: maxpredf32_c:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vpt.f32 gt, q1, q0
; CHECK-NEXT:    vmaxnmat.f32 q1, q0
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    bx lr
  %c = fcmp olt <4 x float> %a, %b
  %s = tail call fast <4 x float> @llvm.arm.mve.vmaxnma.predicated.v4f32.v4i1(<4 x float> %b, <4 x float> %a, <4 x i1> %c)
  ret <4 x float> %s
}
|
|
|
|
; Predicated vminnma intrinsic on <4 x float> with operands (%a, %b) and a mask
; computed as fcmp olt %a, %b. The expected output is a VPT block
; (vpt.f32 gt, q1, q0) containing vminnmat.f32 q0, q1.
define arm_aapcs_vfpcc <4 x float> @minpredf32(<4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: minpredf32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vpt.f32 gt, q1, q0
; CHECK-NEXT:    vminnmat.f32 q0, q1
; CHECK-NEXT:    bx lr
  %c = fcmp olt <4 x float> %a, %b
  %s = tail call fast <4 x float> @llvm.arm.mve.vminnma.predicated.v4f32.v4i1(<4 x float> %a, <4 x float> %b, <4 x i1> %c)
  ret <4 x float> %s
}
|
|
|
|
; Swapped-operand form of @minpredf32: the intrinsic receives (%b, %a).
; The expected output keeps that order (vminnmat.f32 q1, q0) and then copies
; the result into the return register with vmov q0, q1.
define arm_aapcs_vfpcc <4 x float> @minpredf32_c(<4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: minpredf32_c:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vpt.f32 gt, q1, q0
; CHECK-NEXT:    vminnmat.f32 q1, q0
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    bx lr
  %c = fcmp olt <4 x float> %a, %b
  %s = tail call fast <4 x float> @llvm.arm.mve.vminnma.predicated.v4f32.v4i1(<4 x float> %b, <4 x float> %a, <4 x i1> %c)
  ret <4 x float> %s
}
|
|
|
|
|
|
|
|
; F16
|
|
|
|
; maxnum(fabs(%a), fabs(%b)) on <8 x half>. The expected output is a single
; vmaxnma.f16 with no separate absolute-value instructions.
define arm_aapcs_vfpcc <8 x half> @maxf16(<8 x half> %a, <8 x half> %b) {
; CHECK-LABEL: maxf16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmaxnma.f16 q0, q1
; CHECK-NEXT:    bx lr
  %aa = tail call fast <8 x half> @llvm.fabs.v8f16(<8 x half> %a)
  %bb = tail call fast <8 x half> @llvm.fabs.v8f16(<8 x half> %b)
  %c = tail call fast <8 x half> @llvm.maxnum.v8f16(<8 x half> %aa, <8 x half> %bb)
  ret <8 x half> %c
}
|
|
|
|
; Commuted form of @maxf16: the maxnum operands are passed as (%bb, %aa).
; The expected output is still the single vmaxnma.f16 q0, q1.
define arm_aapcs_vfpcc <8 x half> @maxf16_c(<8 x half> %a, <8 x half> %b) {
; CHECK-LABEL: maxf16_c:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmaxnma.f16 q0, q1
; CHECK-NEXT:    bx lr
  %aa = tail call fast <8 x half> @llvm.fabs.v8f16(<8 x half> %a)
  %bb = tail call fast <8 x half> @llvm.fabs.v8f16(<8 x half> %b)
  %c = tail call fast <8 x half> @llvm.maxnum.v8f16(<8 x half> %bb, <8 x half> %aa)
  ret <8 x half> %c
}
|
|
|
|
; minnum(fabs(%a), fabs(%b)) on <8 x half>. The expected output is a single
; vminnma.f16 with no separate absolute-value instructions.
define arm_aapcs_vfpcc <8 x half> @minf16(<8 x half> %a, <8 x half> %b) {
; CHECK-LABEL: minf16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vminnma.f16 q0, q1
; CHECK-NEXT:    bx lr
  %aa = tail call fast <8 x half> @llvm.fabs.v8f16(<8 x half> %a)
  %bb = tail call fast <8 x half> @llvm.fabs.v8f16(<8 x half> %b)
  %c = tail call fast <8 x half> @llvm.minnum.v8f16(<8 x half> %aa, <8 x half> %bb)
  ret <8 x half> %c
}
|
|
|
|
; Commuted form of @minf16: the minnum operands are passed as (%bb, %aa).
; The expected output is still the single vminnma.f16 q0, q1.
define arm_aapcs_vfpcc <8 x half> @minf16_c(<8 x half> %a, <8 x half> %b) {
; CHECK-LABEL: minf16_c:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vminnma.f16 q0, q1
; CHECK-NEXT:    bx lr
  %aa = tail call fast <8 x half> @llvm.fabs.v8f16(<8 x half> %a)
  %bb = tail call fast <8 x half> @llvm.fabs.v8f16(<8 x half> %b)
  %c = tail call fast <8 x half> @llvm.minnum.v8f16(<8 x half> %bb, <8 x half> %aa)
  ret <8 x half> %c
}
|
|
|
|
; Predicated vmaxnma intrinsic on <8 x half> with operands (%a, %b) and a mask
; computed as fcmp olt %a, %b. The expected output is a VPT block
; (vpt.f16 gt, q1, q0) containing vmaxnmat.f16 q0, q1.
define arm_aapcs_vfpcc <8 x half> @maxpredf16(<8 x half> %a, <8 x half> %b) {
; CHECK-LABEL: maxpredf16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vpt.f16 gt, q1, q0
; CHECK-NEXT:    vmaxnmat.f16 q0, q1
; CHECK-NEXT:    bx lr
  %c = fcmp olt <8 x half> %a, %b
  %s = tail call fast <8 x half> @llvm.arm.mve.vmaxnma.predicated.v8f16.v8i1(<8 x half> %a, <8 x half> %b, <8 x i1> %c)
  ret <8 x half> %s
}
|
|
|
|
; Swapped-operand form of @maxpredf16: the intrinsic receives (%b, %a).
; The expected output keeps that order (vmaxnmat.f16 q1, q0) and then copies
; the result into the return register with vmov q0, q1.
define arm_aapcs_vfpcc <8 x half> @maxpredf16_c(<8 x half> %a, <8 x half> %b) {
; CHECK-LABEL: maxpredf16_c:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vpt.f16 gt, q1, q0
; CHECK-NEXT:    vmaxnmat.f16 q1, q0
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    bx lr
  %c = fcmp olt <8 x half> %a, %b
  %s = tail call fast <8 x half> @llvm.arm.mve.vmaxnma.predicated.v8f16.v8i1(<8 x half> %b, <8 x half> %a, <8 x i1> %c)
  ret <8 x half> %s
}
|
|
|
|
; Predicated vminnma intrinsic on <8 x half> with operands (%a, %b) and a mask
; computed as fcmp olt %a, %b. The expected output is a VPT block
; (vpt.f16 gt, q1, q0) containing vminnmat.f16 q0, q1.
define arm_aapcs_vfpcc <8 x half> @minpredf16(<8 x half> %a, <8 x half> %b) {
; CHECK-LABEL: minpredf16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vpt.f16 gt, q1, q0
; CHECK-NEXT:    vminnmat.f16 q0, q1
; CHECK-NEXT:    bx lr
  %c = fcmp olt <8 x half> %a, %b
  %s = tail call fast <8 x half> @llvm.arm.mve.vminnma.predicated.v8f16.v8i1(<8 x half> %a, <8 x half> %b, <8 x i1> %c)
  ret <8 x half> %s
}
|
|
|
|
; Swapped-operand form of @minpredf16: the intrinsic receives (%b, %a).
; The expected output keeps that order (vminnmat.f16 q1, q0) and then copies
; the result into the return register with vmov q0, q1.
define arm_aapcs_vfpcc <8 x half> @minpredf16_c(<8 x half> %a, <8 x half> %b) {
; CHECK-LABEL: minpredf16_c:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vpt.f16 gt, q1, q0
; CHECK-NEXT:    vminnmat.f16 q1, q0
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    bx lr
  %c = fcmp olt <8 x half> %a, %b
  %s = tail call fast <8 x half> @llvm.arm.mve.vminnma.predicated.v8f16.v8i1(<8 x half> %b, <8 x half> %a, <8 x i1> %c)
  ret <8 x half> %s
}
|
|
|
|
|
|
; Loops
|
|
|
|
; Absolute-maximum reduction over float data.
;   %0 - source pointer, read as <4 x float> and advanced by 4 floats per
;        iteration
;   %1 - element count; the loop runs (%1 >> 3) times and is skipped when that
;        is zero
;   %2 - destination for the scalar result
; Each iteration computes acc = maxnum(fabs(acc), fabs(load)); after the loop
; the vector is reduced with llvm.arm.mve.maxnmav and stored to %2.
; The expected output is a wls/le low-overhead loop whose body folds both fabs
; calls into vmaxnma.f32 q0, q1.
; NOTE(review): the trip count shifts by 3 although each iteration consumes 4
; floats - presumably incidental to the codegen being tested; confirm before
; reusing this IR as a reference kernel.
define void @loop_absmax32(float* nocapture readonly %0, i32 %1, float* nocapture %2) {
; CHECK-LABEL: loop_absmax32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    vmov.i32 q0, #0x0
; CHECK-NEXT:    lsrs r1, r1, #3
; CHECK-NEXT:    wls lr, r1, .LBB16_3
; CHECK-NEXT:  @ %bb.1: @ %.preheader
; CHECK-NEXT:    vmov.i32 q0, #0x0
; CHECK-NEXT:  .LBB16_2: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
; CHECK-NEXT:    vmaxnma.f32 q0, q1
; CHECK-NEXT:    le lr, .LBB16_2
; CHECK-NEXT:  .LBB16_3:
; CHECK-NEXT:    vldr s4, .LCPI16_0
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vmaxnmav.f32 r0, q0
; CHECK-NEXT:    vmov s0, r0
; CHECK-NEXT:    vstr s0, [r2]
; CHECK-NEXT:    pop {r7, pc}
; CHECK-NEXT:    .p2align 2
; CHECK-NEXT:  @ %bb.4:
; CHECK-NEXT:  .LCPI16_0:
; CHECK-NEXT:    .long 0x00000000 @ float 0
  %4 = lshr i32 %1, 3
  %5 = icmp eq i32 %4, 0
  br i1 %5, label %18, label %6

6:                                                ; preds = %3, %6
  %7 = phi i32 [ %16, %6 ], [ %4, %3 ]
  %8 = phi <4 x float> [ %15, %6 ], [ zeroinitializer, %3 ]
  %9 = phi float* [ %12, %6 ], [ %0, %3 ]
  %10 = bitcast float* %9 to <4 x float>*
  %11 = load <4 x float>, <4 x float>* %10, align 4
  %12 = getelementptr inbounds float, float* %9, i32 4
  %13 = tail call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %11)
  %14 = tail call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %8)
  %15 = tail call fast <4 x float> @llvm.maxnum.v4f32(<4 x float> %14, <4 x float> %13)
  %16 = add nsw i32 %7, -1
  %17 = icmp eq i32 %16, 0
  br i1 %17, label %18, label %6

18:                                               ; preds = %6, %3
  %19 = phi <4 x float> [ zeroinitializer, %3 ], [ %15, %6 ]
  %20 = tail call fast float @llvm.arm.mve.maxnmav.f32.v4f32(float 0.000000e+00, <4 x float> %19)
  store float %20, float* %2, align 4
  ret void
}
|
|
|
|
; Commuted form of @loop_absmax32: the loop body computes
; maxnum(fabs(load), fabs(acc)) instead of maxnum(fabs(acc), fabs(load)).
;   %0 - source pointer, read as <4 x float> and advanced by 4 floats per
;        iteration
;   %1 - element count; the loop runs (%1 >> 3) times and is skipped when that
;        is zero
;   %2 - destination for the scalar result
; The expected output is the same wls/le loop with vmaxnma.f32 q0, q1 in the
; body, i.e. the accumulator stays in q0 with no extra vector move.
define void @loop_absmax32_c(float* nocapture readonly %0, i32 %1, float* nocapture %2) {
; CHECK-LABEL: loop_absmax32_c:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    vmov.i32 q0, #0x0
; CHECK-NEXT:    lsrs r1, r1, #3
; CHECK-NEXT:    wls lr, r1, .LBB17_3
; CHECK-NEXT:  @ %bb.1: @ %.preheader
; CHECK-NEXT:    vmov.i32 q0, #0x0
; CHECK-NEXT:  .LBB17_2: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
; CHECK-NEXT:    vmaxnma.f32 q0, q1
; CHECK-NEXT:    le lr, .LBB17_2
; CHECK-NEXT:  .LBB17_3:
; CHECK-NEXT:    vldr s4, .LCPI17_0
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vmaxnmav.f32 r0, q0
; CHECK-NEXT:    vmov s0, r0
; CHECK-NEXT:    vstr s0, [r2]
; CHECK-NEXT:    pop {r7, pc}
; CHECK-NEXT:    .p2align 2
; CHECK-NEXT:  @ %bb.4:
; CHECK-NEXT:  .LCPI17_0:
; CHECK-NEXT:    .long 0x00000000 @ float 0
  %4 = lshr i32 %1, 3
  %5 = icmp eq i32 %4, 0
  br i1 %5, label %18, label %6

6:                                                ; preds = %3, %6
  %7 = phi i32 [ %16, %6 ], [ %4, %3 ]
  %8 = phi <4 x float> [ %15, %6 ], [ zeroinitializer, %3 ]
  %9 = phi float* [ %12, %6 ], [ %0, %3 ]
  %10 = bitcast float* %9 to <4 x float>*
  %11 = load <4 x float>, <4 x float>* %10, align 4
  %12 = getelementptr inbounds float, float* %9, i32 4
  %13 = tail call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %11)
  %14 = tail call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %8)
  %15 = tail call fast <4 x float> @llvm.maxnum.v4f32(<4 x float> %13, <4 x float> %14)
  %16 = add nsw i32 %7, -1
  %17 = icmp eq i32 %16, 0
  br i1 %17, label %18, label %6

18:                                               ; preds = %6, %3
  %19 = phi <4 x float> [ zeroinitializer, %3 ], [ %15, %6 ]
  %20 = tail call fast float @llvm.arm.mve.maxnmav.f32.v4f32(float 0.000000e+00, <4 x float> %19)
  store float %20, float* %2, align 4
  ret void
}
|
|
|
|
; Tail-predicated absolute-maximum reduction over float data.
;   %0 - source pointer, advanced by 4 floats per iteration
;   %1 - remaining element count; it feeds llvm.arm.mve.vctp32, drops by 4 per
;        iteration, and the loop repeats while the count is > 4 (signed)
;   %2 - destination for the scalar result
; The vctp32 mask guards both the masked load and the predicated vmaxnma
; intrinsic, which is called as (accumulator, loaded value).
; The expected output is a dlstp.32/letp loop with vmaxnma.f32 q0, q1 in the
; body.
define void @loop_absmax32_pred(float* %0, i32 %1, float* nocapture %2) {
; CHECK-LABEL: loop_absmax32_pred:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    vmov.i32 q0, #0x0
; CHECK-NEXT:    dlstp.32 lr, r1
; CHECK-NEXT:  .LBB18_1: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
; CHECK-NEXT:    vmaxnma.f32 q0, q1
; CHECK-NEXT:    letp lr, .LBB18_1
; CHECK-NEXT:  @ %bb.2:
; CHECK-NEXT:    vldr s4, .LCPI18_0
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vmaxnmav.f32 r0, q0
; CHECK-NEXT:    vmov s0, r0
; CHECK-NEXT:    vstr s0, [r2]
; CHECK-NEXT:    pop {r7, pc}
; CHECK-NEXT:    .p2align 2
; CHECK-NEXT:  @ %bb.3:
; CHECK-NEXT:  .LCPI18_0:
; CHECK-NEXT:    .long 0x00000000 @ float 0
  br label %4

4:                                                ; preds = %4, %3
  %5 = phi <4 x float> [ zeroinitializer, %3 ], [ %12, %4 ]
  %6 = phi i32 [ %1, %3 ], [ %13, %4 ]
  %7 = phi float* [ %0, %3 ], [ %11, %4 ]
  %8 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %6)
  %9 = bitcast float* %7 to <4 x float>*
  %10 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %9, i32 4, <4 x i1> %8, <4 x float> zeroinitializer)
  %11 = getelementptr inbounds float, float* %7, i32 4
  %12 = tail call fast <4 x float> @llvm.arm.mve.vmaxnma.predicated.v4f32.v4i1(<4 x float> %5, <4 x float> %10, <4 x i1> %8)
  %13 = add nsw i32 %6, -4
  %14 = icmp sgt i32 %6, 4
  br i1 %14, label %4, label %15

15:                                               ; preds = %4
  %16 = tail call fast float @llvm.arm.mve.maxnmav.f32.v4f32(float 0.000000e+00, <4 x float> %12)
  store float %16, float* %2, align 4
  ret void
}
|
|
|
|
; Swapped-operand form of @loop_absmax32_pred: the predicated vmaxnma
; intrinsic is called as (loaded value, accumulator).
;   %0 - source pointer, advanced by 4 floats per iteration
;   %1 - remaining element count; it feeds llvm.arm.mve.vctp32, drops by 4 per
;        iteration, and the loop repeats while the count is > 4 (signed)
;   %2 - destination for the scalar result
; The expected output keeps the swapped order inside the dlstp.32/letp loop:
; vmaxnma.f32 q1, q0 followed by vmov q0, q1, with the final reduction reading
; q1.
define void @loop_absmax32_pred_c(float* %0, i32 %1, float* nocapture %2) {
; CHECK-LABEL: loop_absmax32_pred_c:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    vmov.i32 q0, #0x0
; CHECK-NEXT:    dlstp.32 lr, r1
; CHECK-NEXT:  .LBB19_1: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
; CHECK-NEXT:    vmaxnma.f32 q1, q0
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    letp lr, .LBB19_1
; CHECK-NEXT:  @ %bb.2:
; CHECK-NEXT:    vldr s0, .LCPI19_0
; CHECK-NEXT:    vmov r0, s0
; CHECK-NEXT:    vmaxnmav.f32 r0, q1
; CHECK-NEXT:    vmov s0, r0
; CHECK-NEXT:    vstr s0, [r2]
; CHECK-NEXT:    pop {r7, pc}
; CHECK-NEXT:    .p2align 2
; CHECK-NEXT:  @ %bb.3:
; CHECK-NEXT:  .LCPI19_0:
; CHECK-NEXT:    .long 0x00000000 @ float 0
  br label %4

4:                                                ; preds = %4, %3
  %5 = phi <4 x float> [ zeroinitializer, %3 ], [ %12, %4 ]
  %6 = phi i32 [ %1, %3 ], [ %13, %4 ]
  %7 = phi float* [ %0, %3 ], [ %11, %4 ]
  %8 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %6)
  %9 = bitcast float* %7 to <4 x float>*
  %10 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %9, i32 4, <4 x i1> %8, <4 x float> zeroinitializer)
  %11 = getelementptr inbounds float, float* %7, i32 4
  %12 = tail call fast <4 x float> @llvm.arm.mve.vmaxnma.predicated.v4f32.v4i1(<4 x float> %10, <4 x float> %5, <4 x i1> %8)
  %13 = add nsw i32 %6, -4
  %14 = icmp sgt i32 %6, 4
  br i1 %14, label %4, label %15

15:                                               ; preds = %4
  %16 = tail call fast float @llvm.arm.mve.maxnmav.f32.v4f32(float 0.000000e+00, <4 x float> %12)
  store float %16, float* %2, align 4
  ret void
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
; Absolute-maximum reduction over half data.
;   %0 - source pointer, read as <8 x half>
;   %1 - element count; the loop runs (%1 >> 3) times and is skipped when that
;        is zero
;   %2 - destination for the scalar result
; Each iteration computes acc = maxnum(fabs(acc), fabs(load)); after the loop
; the vector is reduced with llvm.arm.mve.maxnmav and stored to %2.
; The expected output is a wls/le low-overhead loop whose body folds both fabs
; calls into vmaxnma.f16 q0, q1.
; NOTE(review): each iteration loads 8 halves but advances the pointer by only
; 4 (post-increment #8 in the expected output), so consecutive loads overlap -
; presumably incidental to the codegen being tested; confirm before changing.
define void @loop_absmax16(half* nocapture readonly %0, i32 %1, half* nocapture %2) {
; CHECK-LABEL: loop_absmax16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    vmov.i32 q0, #0x0
; CHECK-NEXT:    lsrs r1, r1, #3
; CHECK-NEXT:    wls lr, r1, .LBB20_3
; CHECK-NEXT:  @ %bb.1: @ %.preheader
; CHECK-NEXT:    vmov.i32 q0, #0x0
; CHECK-NEXT:  .LBB20_2: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q1, [r0], #8
; CHECK-NEXT:    vmaxnma.f16 q0, q1
; CHECK-NEXT:    le lr, .LBB20_2
; CHECK-NEXT:  .LBB20_3:
; CHECK-NEXT:    vldr.16 s4, .LCPI20_0
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vmaxnmav.f16 r0, q0
; CHECK-NEXT:    vmov s0, r0
; CHECK-NEXT:    vstr.16 s0, [r2]
; CHECK-NEXT:    pop {r7, pc}
; CHECK-NEXT:    .p2align 1
; CHECK-NEXT:  @ %bb.4:
; CHECK-NEXT:  .LCPI20_0:
; CHECK-NEXT:    .short 0x0000 @ half 0
  %4 = lshr i32 %1, 3
  %5 = icmp eq i32 %4, 0
  br i1 %5, label %18, label %6

6:                                                ; preds = %3, %6
  %7 = phi i32 [ %16, %6 ], [ %4, %3 ]
  %8 = phi <8 x half> [ %15, %6 ], [ zeroinitializer, %3 ]
  %9 = phi half* [ %12, %6 ], [ %0, %3 ]
  %10 = bitcast half* %9 to <8 x half>*
  %11 = load <8 x half>, <8 x half>* %10, align 4
  %12 = getelementptr inbounds half, half* %9, i32 4
  %13 = tail call fast <8 x half> @llvm.fabs.v8f16(<8 x half> %11)
  %14 = tail call fast <8 x half> @llvm.fabs.v8f16(<8 x half> %8)
  %15 = tail call fast <8 x half> @llvm.maxnum.v8f16(<8 x half> %14, <8 x half> %13)
  %16 = add nsw i32 %7, -1
  %17 = icmp eq i32 %16, 0
  br i1 %17, label %18, label %6

18:                                               ; preds = %6, %3
  %19 = phi <8 x half> [ zeroinitializer, %3 ], [ %15, %6 ]
  %20 = tail call fast half @llvm.arm.mve.maxnmav.f16.v8f16(half 0.000000e+00, <8 x half> %19)
  store half %20, half* %2, align 4
  ret void
}
|
|
|
|
; Commuted form of @loop_absmax16: the loop body computes
; maxnum(fabs(load), fabs(acc)) instead of maxnum(fabs(acc), fabs(load)).
;   %0 - source pointer, read as <8 x half>
;   %1 - element count; the loop runs (%1 >> 3) times and is skipped when that
;        is zero
;   %2 - destination for the scalar result
; The expected output is the same wls/le loop with vmaxnma.f16 q0, q1 in the
; body, i.e. the accumulator stays in q0 with no extra vector move.
; NOTE(review): each iteration loads 8 halves but advances the pointer by only
; 4 (post-increment #8 in the expected output), so consecutive loads overlap -
; presumably incidental to the codegen being tested; confirm before changing.
define void @loop_absmax16_c(half* nocapture readonly %0, i32 %1, half* nocapture %2) {
; CHECK-LABEL: loop_absmax16_c:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    vmov.i32 q0, #0x0
; CHECK-NEXT:    lsrs r1, r1, #3
; CHECK-NEXT:    wls lr, r1, .LBB21_3
; CHECK-NEXT:  @ %bb.1: @ %.preheader
; CHECK-NEXT:    vmov.i32 q0, #0x0
; CHECK-NEXT:  .LBB21_2: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q1, [r0], #8
; CHECK-NEXT:    vmaxnma.f16 q0, q1
; CHECK-NEXT:    le lr, .LBB21_2
; CHECK-NEXT:  .LBB21_3:
; CHECK-NEXT:    vldr.16 s4, .LCPI21_0
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vmaxnmav.f16 r0, q0
; CHECK-NEXT:    vmov s0, r0
; CHECK-NEXT:    vstr.16 s0, [r2]
; CHECK-NEXT:    pop {r7, pc}
; CHECK-NEXT:    .p2align 1
; CHECK-NEXT:  @ %bb.4:
; CHECK-NEXT:  .LCPI21_0:
; CHECK-NEXT:    .short 0x0000 @ half 0
  %4 = lshr i32 %1, 3
  %5 = icmp eq i32 %4, 0
  br i1 %5, label %18, label %6

6:                                                ; preds = %3, %6
  %7 = phi i32 [ %16, %6 ], [ %4, %3 ]
  %8 = phi <8 x half> [ %15, %6 ], [ zeroinitializer, %3 ]
  %9 = phi half* [ %12, %6 ], [ %0, %3 ]
  %10 = bitcast half* %9 to <8 x half>*
  %11 = load <8 x half>, <8 x half>* %10, align 4
  %12 = getelementptr inbounds half, half* %9, i32 4
  %13 = tail call fast <8 x half> @llvm.fabs.v8f16(<8 x half> %11)
  %14 = tail call fast <8 x half> @llvm.fabs.v8f16(<8 x half> %8)
  %15 = tail call fast <8 x half> @llvm.maxnum.v8f16(<8 x half> %13, <8 x half> %14)
  %16 = add nsw i32 %7, -1
  %17 = icmp eq i32 %16, 0
  br i1 %17, label %18, label %6

18:                                               ; preds = %6, %3
  %19 = phi <8 x half> [ zeroinitializer, %3 ], [ %15, %6 ]
  %20 = tail call fast half @llvm.arm.mve.maxnmav.f16.v8f16(half 0.000000e+00, <8 x half> %19)
  store half %20, half* %2, align 4
  ret void
}
|
|
|
|
; Tail-predicated absolute-maximum reduction over half data.
;   %0 - source pointer
;   %1 - remaining element count; it feeds llvm.arm.mve.vctp16, drops by 8 per
;        iteration, and the loop repeats while the count is > 8 (signed)
;   %2 - destination for the scalar result
; The vctp16 mask guards both the masked load and the predicated vmaxnma
; intrinsic, which is called as (accumulator, loaded value).
; The expected output is a dlstp.16/letp loop with vmaxnma.f16 q0, q1 in the
; body.
; NOTE(review): the count drops by 8 per iteration but the pointer advances by
; only 4 halves (post-increment #8 in the expected output) - presumably
; incidental to the codegen being tested; confirm before changing.
define void @loop_absmax16_pred(half* %0, i32 %1, half* nocapture %2) {
; CHECK-LABEL: loop_absmax16_pred:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    vmov.i32 q0, #0x0
; CHECK-NEXT:    dlstp.16 lr, r1
; CHECK-NEXT:  .LBB22_1: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrh.u16 q1, [r0], #8
; CHECK-NEXT:    vmaxnma.f16 q0, q1
; CHECK-NEXT:    letp lr, .LBB22_1
; CHECK-NEXT:  @ %bb.2:
; CHECK-NEXT:    vldr.16 s4, .LCPI22_0
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vmaxnmav.f16 r0, q0
; CHECK-NEXT:    vmov s0, r0
; CHECK-NEXT:    vstr.16 s0, [r2]
; CHECK-NEXT:    pop {r7, pc}
; CHECK-NEXT:    .p2align 1
; CHECK-NEXT:  @ %bb.3:
; CHECK-NEXT:  .LCPI22_0:
; CHECK-NEXT:    .short 0x0000 @ half 0
  br label %4

4:                                                ; preds = %4, %3
  %5 = phi <8 x half> [ zeroinitializer, %3 ], [ %12, %4 ]
  %6 = phi i32 [ %1, %3 ], [ %13, %4 ]
  %7 = phi half* [ %0, %3 ], [ %11, %4 ]
  %8 = tail call <8 x i1> @llvm.arm.mve.vctp16(i32 %6)
  %9 = bitcast half* %7 to <8 x half>*
  %10 = tail call fast <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %9, i32 4, <8 x i1> %8, <8 x half> zeroinitializer)
  %11 = getelementptr inbounds half, half* %7, i32 4
  %12 = tail call fast <8 x half> @llvm.arm.mve.vmaxnma.predicated.v8f16.v8i1(<8 x half> %5, <8 x half> %10, <8 x i1> %8)
  %13 = add nsw i32 %6, -8
  %14 = icmp sgt i32 %6, 8
  br i1 %14, label %4, label %15

15:                                               ; preds = %4
  %16 = tail call fast half @llvm.arm.mve.maxnmav.f16.v8f16(half 0.000000e+00, <8 x half> %12)
  store half %16, half* %2, align 4
  ret void
}
|
|
|
|
; Swapped-operand form of @loop_absmax16_pred: the predicated vmaxnma
; intrinsic is called as (loaded value, accumulator).
;   %0 - source pointer
;   %1 - remaining element count; it feeds llvm.arm.mve.vctp16, drops by 8 per
;        iteration, and the loop repeats while the count is > 8 (signed)
;   %2 - destination for the scalar result
; The expected output keeps the swapped order inside the dlstp.16/letp loop:
; vmaxnma.f16 q1, q0 followed by vmov q0, q1, with the final reduction reading
; q1.
; NOTE(review): the count drops by 8 per iteration but the pointer advances by
; only 4 halves (post-increment #8 in the expected output) - presumably
; incidental to the codegen being tested; confirm before changing.
define void @loop_absmax16_pred_c(half* %0, i32 %1, half* nocapture %2) {
; CHECK-LABEL: loop_absmax16_pred_c:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    vmov.i32 q0, #0x0
; CHECK-NEXT:    dlstp.16 lr, r1
; CHECK-NEXT:  .LBB23_1: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrh.u16 q1, [r0], #8
; CHECK-NEXT:    vmaxnma.f16 q1, q0
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    letp lr, .LBB23_1
; CHECK-NEXT:  @ %bb.2:
; CHECK-NEXT:    vldr.16 s0, .LCPI23_0
; CHECK-NEXT:    vmov r0, s0
; CHECK-NEXT:    vmaxnmav.f16 r0, q1
; CHECK-NEXT:    vmov s0, r0
; CHECK-NEXT:    vstr.16 s0, [r2]
; CHECK-NEXT:    pop {r7, pc}
; CHECK-NEXT:    .p2align 1
; CHECK-NEXT:  @ %bb.3:
; CHECK-NEXT:  .LCPI23_0:
; CHECK-NEXT:    .short 0x0000 @ half 0
  br label %4

4:                                                ; preds = %4, %3
  %5 = phi <8 x half> [ zeroinitializer, %3 ], [ %12, %4 ]
  %6 = phi i32 [ %1, %3 ], [ %13, %4 ]
  %7 = phi half* [ %0, %3 ], [ %11, %4 ]
  %8 = tail call <8 x i1> @llvm.arm.mve.vctp16(i32 %6)
  %9 = bitcast half* %7 to <8 x half>*
  %10 = tail call fast <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %9, i32 4, <8 x i1> %8, <8 x half> zeroinitializer)
  %11 = getelementptr inbounds half, half* %7, i32 4
  %12 = tail call fast <8 x half> @llvm.arm.mve.vmaxnma.predicated.v8f16.v8i1(<8 x half> %10, <8 x half> %5, <8 x i1> %8)
  %13 = add nsw i32 %6, -8
  %14 = icmp sgt i32 %6, 8
  br i1 %14, label %4, label %15

15:                                               ; preds = %4
  %16 = tail call fast half @llvm.arm.mve.maxnmav.f16.v8f16(half 0.000000e+00, <8 x half> %12)
  store half %16, half* %2, align 4
  ret void
}
|
|
|
|
|
|
|
|
|
|
|
|
; Intrinsics referenced by the <4 x float> functions in this file.
declare <4 x i1> @llvm.arm.mve.vctp32(i32)
declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32 immarg, <4 x i1>, <4 x float>)
declare <4 x float> @llvm.arm.mve.vminnma.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>)
declare <4 x float> @llvm.arm.mve.vmaxnma.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>)
declare float @llvm.arm.mve.maxnmav.f32.v4f32(float, <4 x float>)
declare <4 x float> @llvm.fabs.v4f32(<4 x float>)
declare <4 x float> @llvm.maxnum.v4f32(<4 x float>, <4 x float>)
declare <4 x float> @llvm.minnum.v4f32(<4 x float>, <4 x float>)

; Intrinsics referenced by the <8 x half> functions in this file.
declare <8 x i1> @llvm.arm.mve.vctp16(i32)
declare <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>*, i32 immarg, <8 x i1>, <8 x half>)
declare <8 x half> @llvm.arm.mve.vminnma.predicated.v8f16.v8i1(<8 x half>, <8 x half>, <8 x i1>)
declare <8 x half> @llvm.arm.mve.vmaxnma.predicated.v8f16.v8i1(<8 x half>, <8 x half>, <8 x i1>)
declare half @llvm.arm.mve.maxnmav.f16.v8f16(half, <8 x half>)
declare <8 x half> @llvm.fabs.v8f16(<8 x half>)
declare <8 x half> @llvm.maxnum.v8f16(<8 x half>, <8 x half>)
declare <8 x half> @llvm.minnum.v8f16(<8 x half>, <8 x half>)
|
|
|
|
|