1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-10-19 02:52:53 +02:00

[ARM] Convert VPSEL to VMOV in tail predicated loops

VPSEL has slightly different semantics under tail predication (it can
end up selecting from Qn, Qm and Qd). We do not model that at the moment
so they block tail predicated loops from being formed.

This just converts them into a predicated VMOV instead (via a VORR),
allowing tail predication to happen whilst still modelling the original
behaviour of the input.

Differential Revision: https://reviews.llvm.org/D85110
This commit is contained in:
David Green 2020-08-03 22:03:14 +01:00
parent d648be35e8
commit fcac3fa8b2
5 changed files with 110 additions and 62 deletions

View File

@ -57,6 +57,7 @@ private:
Register Target);
bool ReduceOldVCCRValueUses(MachineBasicBlock &MBB);
bool ReplaceVCMPsByVPNOTs(MachineBasicBlock &MBB);
bool ConvertVPSEL(MachineBasicBlock &MBB);
};
char MVEVPTOptimisations::ID = 0;
@ -356,7 +357,7 @@ bool MVEVPTOptimisations::ReduceOldVCCRValueUses(MachineBasicBlock &MBB) {
}
for (MachineInstr *DeadInstruction : DeadInstructions)
DeadInstruction->removeFromParent();
DeadInstruction->eraseFromParent();
return Modified;
}
@ -430,7 +431,44 @@ bool MVEVPTOptimisations::ReplaceVCMPsByVPNOTs(MachineBasicBlock &MBB) {
}
for (MachineInstr *DeadInstruction : DeadInstructions)
DeadInstruction->removeFromParent();
DeadInstruction->eraseFromParent();
return !DeadInstructions.empty();
}
// Replace VPSELs with a predicated VMOV in any block containing a VCTP. This
// is a somewhat blunt approximation that permits tail predication of loops
// containing vpsel instructions: vselect is lowered to VPSEL in ISel, but
// VPSEL behaves slightly differently under tail predication. Until that is
// modelled, rewrite it as a VMOVT (encoded as a predicated VORR).
bool MVEVPTOptimisations::ConvertVPSEL(MachineBasicBlock &MBB) {
  bool SeenVCTP = false;
  SmallVector<MachineInstr *, 4> ToErase;

  for (MachineInstr &Instr : MBB.instrs()) {
    // Only rewrite VPSELs that appear after a VCTP in this block.
    if (isVCTP(&Instr)) {
      SeenVCTP = true;
      continue;
    }
    if (Instr.getOpcode() != ARM::MVE_VPSEL || !SeenVCTP)
      continue;

    // VPSEL Qd, Qn, Qm, pred  ->  VORR Qd, Qn, Qn (Then, pred), tied to Qm:
    // active lanes take Qn, inactive lanes keep Qm.
    MachineInstrBuilder Replacement =
        BuildMI(MBB, &Instr, Instr.getDebugLoc(), TII->get(ARM::MVE_VORR))
            .add(Instr.getOperand(0))  // Qd
            .add(Instr.getOperand(1))  // Qn
            .add(Instr.getOperand(1))  // Qn again (VORR x,x == MOV x)
            .addImm(ARMVCC::Then)      // predicated "then" encoding
            .add(Instr.getOperand(4))  // predicate register
            .add(Instr.getOperand(2)); // Qm as the inactive-lane value
    LLVM_DEBUG(dbgs() << "Replacing VPSEL: "; Instr.dump();
               dbgs() << " with VMOVT: "; Replacement.getInstr()->dump());
    ToErase.push_back(&Instr);
  }

  for (MachineInstr *Dead : ToErase)
    Dead->eraseFromParent();

  return !ToErase.empty();
}
@ -452,6 +490,7 @@ bool MVEVPTOptimisations::runOnMachineFunction(MachineFunction &Fn) {
for (MachineBasicBlock &MBB : Fn) {
Modified |= ReplaceVCMPsByVPNOTs(MBB);
Modified |= ReduceOldVCCRValueUses(MBB);
Modified |= ConvertVPSEL(MBB);
}
LLVM_DEBUG(dbgs() << "**************************************\n");

View File

@ -23,14 +23,14 @@ define dso_local i32 @vpsel_mul_reduce_add(i32* noalias nocapture readonly %a, i
; CHECK-NEXT: vctp.32 r3
; CHECK-NEXT: and r4, r12, #15
; CHECK-NEXT: vstr p0, [sp] @ 4-byte Spill
; CHECK-NEXT: vdup.32 q3, r4
; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: vpstt
; CHECK-NEXT: vldrwt.u32 q1, [r2], #16
; CHECK-NEXT: vldrwt.u32 q2, [r1], #16
; CHECK-NEXT: vcmp.i32 eq, q3, zr
; CHECK-NEXT: vdup.32 q3, r4
; CHECK-NEXT: vpt.i32 eq, q3, zr
; CHECK-NEXT: vmovt q1, q2
; CHECK-NEXT: add.w r12, r12, #4
; CHECK-NEXT: vpsel q1, q2, q1
; CHECK-NEXT: vldr p0, [sp] @ 4-byte Reload
; CHECK-NEXT: vpst
; CHECK-NEXT: vldrwt.u32 q2, [r0], #16

View File

@ -1739,9 +1739,10 @@ define arm_aapcs_vfpcc <4 x i32> @icmp_slt_v4i32_y(<4 x i32> %x, <4 x i32> %y, i
; CHECK-LABEL: icmp_slt_v4i32_y:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vctp.32 r0
; CHECK-NEXT: vpst
; CHECK-NEXT: vpstt
; CHECK-NEXT: vcmpt.s32 gt, q1, q0
; CHECK-NEXT: vpsel q0, q0, q1
; CHECK-NEXT: vmovt q1, q0
; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: bx lr
entry:
%c = call <4 x i1> @llvm.arm.mve.vctp32(i32 %n)
@ -1755,9 +1756,10 @@ define arm_aapcs_vfpcc <8 x i16> @icmp_slt_v8i16_y(<8 x i16> %x, <8 x i16> %y, i
; CHECK-LABEL: icmp_slt_v8i16_y:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vctp.16 r0
; CHECK-NEXT: vpst
; CHECK-NEXT: vpstt
; CHECK-NEXT: vcmpt.s16 gt, q1, q0
; CHECK-NEXT: vpsel q0, q0, q1
; CHECK-NEXT: vmovt q1, q0
; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: bx lr
entry:
%c = call <8 x i1> @llvm.arm.mve.vctp16(i32 %n)
@ -1771,9 +1773,10 @@ define arm_aapcs_vfpcc <16 x i8> @icmp_slt_v16i8_y(<16 x i8> %x, <16 x i8> %y, i
; CHECK-LABEL: icmp_slt_v16i8_y:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vctp.8 r0
; CHECK-NEXT: vpst
; CHECK-NEXT: vpstt
; CHECK-NEXT: vcmpt.s8 gt, q1, q0
; CHECK-NEXT: vpsel q0, q0, q1
; CHECK-NEXT: vmovt q1, q0
; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: bx lr
entry:
%c = call <16 x i1> @llvm.arm.mve.vctp8(i32 %n)
@ -1787,9 +1790,10 @@ define arm_aapcs_vfpcc <4 x i32> @icmp_sgt_v4i32_y(<4 x i32> %x, <4 x i32> %y, i
; CHECK-LABEL: icmp_sgt_v4i32_y:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vctp.32 r0
; CHECK-NEXT: vpst
; CHECK-NEXT: vpstt
; CHECK-NEXT: vcmpt.s32 gt, q0, q1
; CHECK-NEXT: vpsel q0, q0, q1
; CHECK-NEXT: vmovt q1, q0
; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: bx lr
entry:
%c = call <4 x i1> @llvm.arm.mve.vctp32(i32 %n)
@ -1803,9 +1807,10 @@ define arm_aapcs_vfpcc <8 x i16> @icmp_sgt_v8i16_y(<8 x i16> %x, <8 x i16> %y, i
; CHECK-LABEL: icmp_sgt_v8i16_y:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vctp.16 r0
; CHECK-NEXT: vpst
; CHECK-NEXT: vpstt
; CHECK-NEXT: vcmpt.s16 gt, q0, q1
; CHECK-NEXT: vpsel q0, q0, q1
; CHECK-NEXT: vmovt q1, q0
; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: bx lr
entry:
%c = call <8 x i1> @llvm.arm.mve.vctp16(i32 %n)
@ -1819,9 +1824,10 @@ define arm_aapcs_vfpcc <16 x i8> @icmp_sgt_v16i8_y(<16 x i8> %x, <16 x i8> %y, i
; CHECK-LABEL: icmp_sgt_v16i8_y:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vctp.8 r0
; CHECK-NEXT: vpst
; CHECK-NEXT: vpstt
; CHECK-NEXT: vcmpt.s8 gt, q0, q1
; CHECK-NEXT: vpsel q0, q0, q1
; CHECK-NEXT: vmovt q1, q0
; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: bx lr
entry:
%c = call <16 x i1> @llvm.arm.mve.vctp8(i32 %n)
@ -1835,9 +1841,10 @@ define arm_aapcs_vfpcc <4 x i32> @icmp_ult_v4i32_y(<4 x i32> %x, <4 x i32> %y, i
; CHECK-LABEL: icmp_ult_v4i32_y:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vctp.32 r0
; CHECK-NEXT: vpst
; CHECK-NEXT: vpstt
; CHECK-NEXT: vcmpt.u32 hi, q1, q0
; CHECK-NEXT: vpsel q0, q0, q1
; CHECK-NEXT: vmovt q1, q0
; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: bx lr
entry:
%c = call <4 x i1> @llvm.arm.mve.vctp32(i32 %n)
@ -1851,9 +1858,10 @@ define arm_aapcs_vfpcc <8 x i16> @icmp_ult_v8i16_y(<8 x i16> %x, <8 x i16> %y, i
; CHECK-LABEL: icmp_ult_v8i16_y:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vctp.16 r0
; CHECK-NEXT: vpst
; CHECK-NEXT: vpstt
; CHECK-NEXT: vcmpt.u16 hi, q1, q0
; CHECK-NEXT: vpsel q0, q0, q1
; CHECK-NEXT: vmovt q1, q0
; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: bx lr
entry:
%c = call <8 x i1> @llvm.arm.mve.vctp16(i32 %n)
@ -1867,9 +1875,10 @@ define arm_aapcs_vfpcc <16 x i8> @icmp_ult_v16i8_y(<16 x i8> %x, <16 x i8> %y, i
; CHECK-LABEL: icmp_ult_v16i8_y:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vctp.8 r0
; CHECK-NEXT: vpst
; CHECK-NEXT: vpstt
; CHECK-NEXT: vcmpt.u8 hi, q1, q0
; CHECK-NEXT: vpsel q0, q0, q1
; CHECK-NEXT: vmovt q1, q0
; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: bx lr
entry:
%c = call <16 x i1> @llvm.arm.mve.vctp8(i32 %n)
@ -1883,9 +1892,10 @@ define arm_aapcs_vfpcc <4 x i32> @icmp_ugt_v4i32_y(<4 x i32> %x, <4 x i32> %y, i
; CHECK-LABEL: icmp_ugt_v4i32_y:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vctp.32 r0
; CHECK-NEXT: vpst
; CHECK-NEXT: vpstt
; CHECK-NEXT: vcmpt.u32 hi, q0, q1
; CHECK-NEXT: vpsel q0, q0, q1
; CHECK-NEXT: vmovt q1, q0
; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: bx lr
entry:
%c = call <4 x i1> @llvm.arm.mve.vctp32(i32 %n)
@ -1899,9 +1909,10 @@ define arm_aapcs_vfpcc <8 x i16> @icmp_ugt_v8i16_y(<8 x i16> %x, <8 x i16> %y, i
; CHECK-LABEL: icmp_ugt_v8i16_y:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vctp.16 r0
; CHECK-NEXT: vpst
; CHECK-NEXT: vpstt
; CHECK-NEXT: vcmpt.u16 hi, q0, q1
; CHECK-NEXT: vpsel q0, q0, q1
; CHECK-NEXT: vmovt q1, q0
; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: bx lr
entry:
%c = call <8 x i1> @llvm.arm.mve.vctp16(i32 %n)
@ -1915,9 +1926,10 @@ define arm_aapcs_vfpcc <16 x i8> @icmp_ugt_v16i8_y(<16 x i8> %x, <16 x i8> %y, i
; CHECK-LABEL: icmp_ugt_v16i8_y:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vctp.8 r0
; CHECK-NEXT: vpst
; CHECK-NEXT: vpstt
; CHECK-NEXT: vcmpt.u8 hi, q0, q1
; CHECK-NEXT: vpsel q0, q0, q1
; CHECK-NEXT: vmovt q1, q0
; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: bx lr
entry:
%c = call <16 x i1> @llvm.arm.mve.vctp8(i32 %n)
@ -1931,9 +1943,10 @@ define arm_aapcs_vfpcc <4 x float> @fcmp_fast_olt_v4f32_y(<4 x float> %x, <4 x f
; CHECK-LABEL: fcmp_fast_olt_v4f32_y:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vctp.32 r0
; CHECK-NEXT: vpst
; CHECK-NEXT: vpstt
; CHECK-NEXT: vcmpt.f32 gt, q1, q0
; CHECK-NEXT: vpsel q0, q0, q1
; CHECK-NEXT: vmovt q1, q0
; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: bx lr
entry:
%c = call <4 x i1> @llvm.arm.mve.vctp32(i32 %n)
@ -1947,9 +1960,10 @@ define arm_aapcs_vfpcc <8 x half> @fcmp_fast_olt_v8f16_y(<8 x half> %x, <8 x hal
; CHECK-LABEL: fcmp_fast_olt_v8f16_y:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vctp.16 r0
; CHECK-NEXT: vpst
; CHECK-NEXT: vpstt
; CHECK-NEXT: vcmpt.f16 gt, q1, q0
; CHECK-NEXT: vpsel q0, q0, q1
; CHECK-NEXT: vmovt q1, q0
; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: bx lr
entry:
%c = call <8 x i1> @llvm.arm.mve.vctp16(i32 %n)
@ -1963,9 +1977,10 @@ define arm_aapcs_vfpcc <4 x float> @fcmp_fast_ogt_v4f32_y(<4 x float> %x, <4 x f
; CHECK-LABEL: fcmp_fast_ogt_v4f32_y:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vctp.32 r0
; CHECK-NEXT: vpst
; CHECK-NEXT: vpstt
; CHECK-NEXT: vcmpt.f32 gt, q0, q1
; CHECK-NEXT: vpsel q0, q0, q1
; CHECK-NEXT: vmovt q1, q0
; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: bx lr
entry:
%c = call <4 x i1> @llvm.arm.mve.vctp32(i32 %n)
@ -1979,9 +1994,10 @@ define arm_aapcs_vfpcc <8 x half> @fcmp_fast_ogt_v8f16_y(<8 x half> %x, <8 x hal
; CHECK-LABEL: fcmp_fast_ogt_v8f16_y:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vctp.16 r0
; CHECK-NEXT: vpst
; CHECK-NEXT: vpstt
; CHECK-NEXT: vcmpt.f16 gt, q0, q1
; CHECK-NEXT: vpsel q0, q0, q1
; CHECK-NEXT: vmovt q1, q0
; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: bx lr
entry:
%c = call <8 x i1> @llvm.arm.mve.vctp16(i32 %n)

View File

@ -9,32 +9,22 @@ define void @arm_min_helium_f32(float* %pSrc, i32 %blockSize, float* nocapture %
; CHECK-NEXT: .vsave {d8, d9}
; CHECK-NEXT: vpush {d8, d9}
; CHECK-NEXT: movs r6, #0
; CHECK-NEXT: mov r12, r1
; CHECK-NEXT: vidup.u32 q2, r6, #1
; CHECK-NEXT: cmp r1, #4
; CHECK-NEXT: it ge
; CHECK-NEXT: movge.w r12, #4
; CHECK-NEXT: sub.w r6, r1, r12
; CHECK-NEXT: adds r6, #3
; CHECK-NEXT: mov.w lr, #1
; CHECK-NEXT: adr r4, .LCPI0_0
; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: add.w lr, lr, r6, lsr #2
; CHECK-NEXT: vldrw.u32 q1, [r4]
; CHECK-NEXT: vmov.i32 q3, #0x4
; CHECK-NEXT: mov r12, r1
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: dlstp.32 lr, r12
; CHECK-NEXT: .LBB0_1: @ %do.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.32 r12
; CHECK-NEXT: sub.w r12, r12, #4
; CHECK-NEXT: vldrw.u32 q4, [r0], #16
; CHECK-NEXT: vcmp.f32 ge, q1, q4
; CHECK-NEXT: vpstt
; CHECK-NEXT: vldrwt.u32 q4, [r0], #16
; CHECK-NEXT: vcmpt.f32 ge, q1, q4
; CHECK-NEXT: vpsel q0, q2, q0
; CHECK-NEXT: vpsel q1, q4, q1
; CHECK-NEXT: vmovt q1, q4
; CHECK-NEXT: vmovt q0, q2
; CHECK-NEXT: vadd.i32 q2, q2, q3
; CHECK-NEXT: le lr, .LBB0_1
; CHECK-NEXT: letp lr, .LBB0_1
; CHECK-NEXT: @ %bb.2: @ %do.end
; CHECK-NEXT: vldr s8, .LCPI0_1
; CHECK-NEXT: vdup.32 q3, r1

View File

@ -4,10 +4,11 @@
define void @vctp8(i32 %arg, <16 x i8> *%in, <16 x i8>* %out) {
; CHECK-LABEL: vctp8:
; CHECK: @ %bb.0:
; CHECK-NEXT: vldrw.u32 q1, [r1]
; CHECK-NEXT: vctp.8 r0
; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: vpsel q0, q1, q0
; CHECK-NEXT: vctp.8 r0
; CHECK-NEXT: vldrw.u32 q1, [r1]
; CHECK-NEXT: vpst
; CHECK-NEXT: vmovt q0, q1
; CHECK-NEXT: vstrw.32 q0, [r2]
; CHECK-NEXT: bx lr
%pred = call <16 x i1> @llvm.arm.mve.vctp8(i32 %arg)
@ -20,10 +21,11 @@ define void @vctp8(i32 %arg, <16 x i8> *%in, <16 x i8>* %out) {
define void @vctp16(i32 %arg, <8 x i16> *%in, <8 x i16>* %out) {
; CHECK-LABEL: vctp16:
; CHECK: @ %bb.0:
; CHECK-NEXT: vldrw.u32 q1, [r1]
; CHECK-NEXT: vctp.16 r0
; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: vpsel q0, q1, q0
; CHECK-NEXT: vctp.16 r0
; CHECK-NEXT: vldrw.u32 q1, [r1]
; CHECK-NEXT: vpst
; CHECK-NEXT: vmovt q0, q1
; CHECK-NEXT: vstrw.32 q0, [r2]
; CHECK-NEXT: bx lr
%pred = call <8 x i1> @llvm.arm.mve.vctp16(i32 %arg)
@ -36,10 +38,11 @@ define void @vctp16(i32 %arg, <8 x i16> *%in, <8 x i16>* %out) {
define void @vctp32(i32 %arg, <4 x i32> *%in, <4 x i32>* %out) {
; CHECK-LABEL: vctp32:
; CHECK: @ %bb.0:
; CHECK-NEXT: vldrw.u32 q1, [r1]
; CHECK-NEXT: vctp.32 r0
; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: vpsel q0, q1, q0
; CHECK-NEXT: vctp.32 r0
; CHECK-NEXT: vldrw.u32 q1, [r1]
; CHECK-NEXT: vpst
; CHECK-NEXT: vmovt q0, q1
; CHECK-NEXT: vstrw.32 q0, [r2]
; CHECK-NEXT: bx lr
%pred = call <4 x i1> @llvm.arm.mve.vctp32(i32 %arg)