llvm-mirror/test/CodeGen/Thumb2/mve-pred-vctpvpsel.ll
David Green fcac3fa8b2 [ARM] Convert VPSEL to VMOV in tail predicated loops
VPSEL has slightly different semantics under tail predication (it can
end up selecting from Qn, Qm and Qd). We do not model that at the moment,
so these instructions block tail predicated loops from being formed.

This just converts them into a predicated VMOV instead (via a VORR),
allowing tail predication to happen whilst still modelling the original
behaviour of the input.
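
As a minimal sketch of the rewrite (hand-written here for illustration, with
register choices taken from the test below rather than fixed by the
transform), the select in the loop kernel goes from

  vcmp.f32 ge, q1, q4
  vpsel    q1, q4, q1    @ semantics change once the loop is tail predicated

to

  vcmp.f32 ge, q1, q4
  vpst
  vmovt    q1, q4        @ predicated move, encoded as vorrt q1, q4, q4

Since the inactive lanes of a predicated VMOV simply keep the previous value
of the destination, this reproduces the VPSEL result while still allowing
the loop to be tail predicated.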

Differential Revision: https://reviews.llvm.org/D85110
2020-08-03 22:03:14 +01:00


; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp %s -verify-machineinstrs -o - | FileCheck %s
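
; The loop below keeps a running minimum vector and the matching lane indices.
; The two selects in %do.body would lower to VPSELs, which would block tail
; predication; with this change they become the predicated VMOVs under the
; vpstt in the kernel, so the loop can use dlstp/letp.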
define void @arm_min_helium_f32(float* %pSrc, i32 %blockSize, float* nocapture %pResult, i32* nocapture %pIndex) {
; CHECK-LABEL: arm_min_helium_f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r6, r7, lr}
; CHECK-NEXT:    push {r4, r6, r7, lr}
; CHECK-NEXT:    .vsave {d8, d9}
; CHECK-NEXT:    vpush {d8, d9}
; CHECK-NEXT:    movs r6, #0
; CHECK-NEXT:    vidup.u32 q2, r6, #1
; CHECK-NEXT:    adr r4, .LCPI0_0
; CHECK-NEXT:    vmov.i32 q0, #0x0
; CHECK-NEXT:    vldrw.u32 q1, [r4]
; CHECK-NEXT:    vmov.i32 q3, #0x4
; CHECK-NEXT:    mov r12, r1
; CHECK-NEXT:    dlstp.32 lr, r12
; CHECK-NEXT:  .LBB0_1: @ %do.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q4, [r0], #16
; CHECK-NEXT:    vcmp.f32 ge, q1, q4
; CHECK-NEXT:    vpstt
; CHECK-NEXT:    vmovt q1, q4
; CHECK-NEXT:    vmovt q0, q2
; CHECK-NEXT:    vadd.i32 q2, q2, q3
; CHECK-NEXT:    letp lr, .LBB0_1
; CHECK-NEXT:  @ %bb.2: @ %do.end
; CHECK-NEXT:    vldr s8, .LCPI0_1
; CHECK-NEXT:    vdup.32 q3, r1
; CHECK-NEXT:    vmov r0, s8
; CHECK-NEXT:    vminnmv.f32 r0, q1
; CHECK-NEXT:    vcmp.f32 le, q1, r0
; CHECK-NEXT:    vmov s8, r0
; CHECK-NEXT:    vpsel q0, q0, q3
; CHECK-NEXT:    vminv.u32 r1, q0
; CHECK-NEXT:    str r1, [r3]
; CHECK-NEXT:    vstr s8, [r2]
; CHECK-NEXT:    vpop {d8, d9}
; CHECK-NEXT:    pop {r4, r6, r7, pc}
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  @ %bb.3:
; CHECK-NEXT:  .LCPI0_0:
; CHECK-NEXT:    .long 0x5368d4a5 @ float 9.99999995E+11
; CHECK-NEXT:    .long 0x5368d4a5 @ float 9.99999995E+11
; CHECK-NEXT:    .long 0x5368d4a5 @ float 9.99999995E+11
; CHECK-NEXT:    .long 0x5368d4a5 @ float 9.99999995E+11
; CHECK-NEXT:  .LCPI0_1:
; CHECK-NEXT:    .long 0x5368d4a5 @ float 9.99999995E+11
entry:
  %0 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32 0, i32 1)
  %1 = extractvalue { <4 x i32>, i32 } %0, 0
  br label %do.body

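; Each iteration vctp32 predicates the lanes that are still in bounds, the
; masked load reads up to four floats, and the two selects update the running
; minimum and its index vector only in lanes where a new minimum was found.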
do.body:                                          ; preds = %do.body, %entry
  %curExtremValVec.0 = phi <4 x float> [ <float 0x426D1A94A0000000, float 0x426D1A94A0000000, float 0x426D1A94A0000000, float 0x426D1A94A0000000>, %entry ], [ %8, %do.body ]
  %indexVec.0 = phi <4 x i32> [ %1, %entry ], [ %11, %do.body ]
  %2 = phi <4 x float> [ zeroinitializer, %entry ], [ %10, %do.body ]
  %blkCnt.0 = phi i32 [ %blockSize, %entry ], [ %sub, %do.body ]
  %pSrc.addr.0 = phi float* [ %pSrc, %entry ], [ %add.ptr, %do.body ]
  %3 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %blkCnt.0)
  %4 = bitcast float* %pSrc.addr.0 to <4 x float>*
  %5 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %3, <4 x float> zeroinitializer)
  %6 = fcmp fast ole <4 x float> %5, %curExtremValVec.0
  %7 = and <4 x i1> %6, %3
  %8 = select fast <4 x i1> %7, <4 x float> %5, <4 x float> %curExtremValVec.0
  %9 = bitcast <4 x i32> %indexVec.0 to <4 x float>
  %10 = select fast <4 x i1> %7, <4 x float> %9, <4 x float> %2
  %11 = add <4 x i32> %indexVec.0, <i32 4, i32 4, i32 4, i32 4>
  %add.ptr = getelementptr inbounds float, float* %pSrc.addr.0, i32 4
  %sub = add nsw i32 %blkCnt.0, -4
  %cmp = icmp sgt i32 %blkCnt.0, 4
  br i1 %cmp, label %do.body, label %do.end

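; After the loop, vminnmv reduces the minimum across the lanes; lanes whose
; value matches that minimum keep their index while the others take
; %blockSize, and vminv then returns the smallest surviving index.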
do.end:                                           ; preds = %do.body
  %12 = bitcast <4 x float> %10 to <4 x i32>
  %13 = tail call fast float @llvm.arm.mve.minnmv.f32.v4f32(float 0x426D1A94A0000000, <4 x float> %8)
  %.splatinsert = insertelement <4 x float> undef, float %13, i32 0
  %.splat = shufflevector <4 x float> %.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer
  %14 = fcmp fast ole <4 x float> %8, %.splat
  %.splatinsert1 = insertelement <4 x i32> undef, i32 %blockSize, i32 0
  %.splat2 = shufflevector <4 x i32> %.splatinsert1, <4 x i32> undef, <4 x i32> zeroinitializer
  %15 = select <4 x i1> %14, <4 x i32> %12, <4 x i32> %.splat2
  %16 = tail call i32 @llvm.arm.mve.minv.v4i32(i32 %blockSize, <4 x i32> %15, i32 1)
  store i32 %16, i32* %pIndex, align 4
  store float %13, float* %pResult, align 4
  ret void
}

declare { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32, i32) #1
declare <4 x i1> @llvm.arm.mve.vctp32(i32) #1
declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32 immarg, <4 x i1>, <4 x float>) #2
declare float @llvm.arm.mve.minnmv.f32.v4f32(float, <4 x float>) #1
declare i32 @llvm.arm.mve.minv.v4i32(i32, <4 x i32>, i32) #1