1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-10-19 02:52:53 +02:00

[ARM] Use mov operand if the mov cannot be moved while tail predicating

In some cases the instruction that sets up the iteration count for a
tail-predicated loop cannot be moved before the dlstp, which previously
stopped tail predication entirely. When that instruction is a mov, this
patch checks whether the mov's source operand still holds the same value
at the insertion point and, if so, uses that operand directly instead.

Differential Revision: https://reviews.llvm.org/D86087
This commit is contained in:
Sam Tebbs 2020-08-17 16:03:55 +01:00
parent af8011f23e
commit 928822abd7
3 changed files with 375 additions and 12 deletions

View File

@ -226,6 +226,7 @@ namespace {
MachineInstr *Dec = nullptr;
MachineInstr *End = nullptr;
MachineInstr *VCTP = nullptr;
MachineOperand TPNumElements;
SmallPtrSet<MachineInstr*, 4> SecondaryVCTPs;
VPTBlock *CurrentBlock = nullptr;
SetVector<MachineInstr*> CurrentPredicate;
@ -239,7 +240,8 @@ namespace {
LowOverheadLoop(MachineLoop &ML, MachineLoopInfo &MLI,
ReachingDefAnalysis &RDA, const TargetRegisterInfo &TRI,
const ARMBaseInstrInfo &TII)
: ML(ML), MLI(MLI), RDA(RDA), TRI(TRI), TII(TII) {
: ML(ML), MLI(MLI), RDA(RDA), TRI(TRI), TII(TII),
TPNumElements(MachineOperand::CreateImm(0)) {
MF = ML.getHeader()->getParent();
if (auto *MBB = ML.getLoopPreheader())
Preheader = MBB;
@ -291,11 +293,10 @@ namespace {
SmallVectorImpl<VPTBlock> &getVPTBlocks() { return VPTBlocks; }
// Return the loop iteration count, or the number of elements if we're tail
// predicating.
MachineOperand &getCount() {
return IsTailPredicationLegal() ?
VCTP->getOperand(1) : Start->getOperand(0);
// Return the operand to use for the loop start instruction: the number of
// elements (TPNumElements, captured from the vctp in ValidateTailPredicate)
// when tail predication is legal, otherwise the loop iteration count taken
// from the start instruction's first operand.
MachineOperand &getLoopStartOperand() {
return IsTailPredicationLegal() ? TPNumElements : Start->getOperand(0);
}
unsigned getStartOpcode() const {
@ -453,7 +454,8 @@ bool LowOverheadLoop::ValidateTailPredicate(MachineInstr *StartInsertPt) {
// of the iteration count, to the loop start instruction. The number of
// elements is provided to the vctp instruction, so we need to check that
// we can use this register at InsertPt.
Register NumElements = VCTP->getOperand(1).getReg();
TPNumElements = VCTP->getOperand(1);
Register NumElements = TPNumElements.getReg();
// If the register is defined within loop, then we can't perform TP.
// TODO: Check whether this is just a mov of a register that would be
@ -466,9 +468,8 @@ bool LowOverheadLoop::ValidateTailPredicate(MachineInstr *StartInsertPt) {
// The element count register maybe defined after InsertPt, in which case we
// need to try to move either InsertPt or the def so that the [w|d]lstp can
// use the value.
// TODO: On failing to move an instruction, check if the count is provided by
// a mov and whether we can use the mov operand directly.
MachineBasicBlock *InsertBB = StartInsertPt->getParent();
if (!RDA.isReachingDefLiveOut(StartInsertPt, NumElements)) {
if (auto *ElemDef = RDA.getLocalLiveOutMIDef(InsertBB, NumElements)) {
if (RDA.isSafeToMoveForwards(ElemDef, StartInsertPt)) {
@ -482,9 +483,21 @@ bool LowOverheadLoop::ValidateTailPredicate(MachineInstr *StartInsertPt) {
StartInsertPt);
LLVM_DEBUG(dbgs() << "ARM Loops: Moved start past: " << *ElemDef);
} else {
LLVM_DEBUG(dbgs() << "ARM Loops: Unable to move element count to loop "
<< "start instruction.\n");
return false;
// If we fail to move an instruction and the element count is provided
// by a mov, use the mov operand if it will have the same value at the
// insertion point
MachineOperand Operand = ElemDef->getOperand(1);
if (isMovRegOpcode(ElemDef->getOpcode()) &&
RDA.getUniqueReachingMIDef(ElemDef, Operand.getReg()) ==
RDA.getUniqueReachingMIDef(StartInsertPt, Operand.getReg())) {
TPNumElements = Operand;
NumElements = TPNumElements.getReg();
} else {
LLVM_DEBUG(dbgs()
<< "ARM Loops: Unable to move element count to loop "
<< "start instruction.\n");
return false;
}
}
}
}

View File

@ -0,0 +1,269 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -run-pass=arm-low-overhead-loops -tail-predication=enabled %s -o - | FileCheck %s
--- |
define arm_aapcs_vfpcc void @arm_var_f32_mve(float* %pSrc, i32 %blockSize, float* nocapture %pResult) #0 {
entry:
%0 = add i32 %blockSize, 3
%1 = icmp slt i32 %blockSize, 4
%smin = select i1 %1, i32 %blockSize, i32 4
%2 = sub i32 %0, %smin
%3 = lshr i32 %2, 2
%4 = add nuw nsw i32 %3, 1
%5 = icmp slt i32 %blockSize, 4
%smin3 = select i1 %5, i32 %blockSize, i32 4
%6 = sub i32 %0, %smin3
%7 = lshr i32 %6, 2
%8 = add nuw nsw i32 %7, 1
call void @llvm.set.loop.iterations.i32(i32 %8)
br label %do.body.i
do.body.i: ; preds = %do.body.i, %entry
%blkCnt.0.i = phi i32 [ %13, %do.body.i ], [ %blockSize, %entry ]
%sumVec.0.i = phi <4 x float> [ %12, %do.body.i ], [ zeroinitializer, %entry ]
%pSrc.addr.0.i = phi float* [ %add.ptr.i, %do.body.i ], [ %pSrc, %entry ]
%9 = phi i32 [ %8, %entry ], [ %14, %do.body.i ]
%pSrc.addr.0.i2 = bitcast float* %pSrc.addr.0.i to <4 x float>*
%10 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %blkCnt.0.i)
%11 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %pSrc.addr.0.i2, i32 4, <4 x i1> %10, <4 x float> zeroinitializer)
%12 = tail call fast <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> %sumVec.0.i, <4 x float> %11, <4 x i1> %10, <4 x float> %sumVec.0.i)
%add.ptr.i = getelementptr inbounds float, float* %pSrc.addr.0.i, i32 4
%13 = add i32 %blkCnt.0.i, -4
%14 = call i32 @llvm.loop.decrement.reg.i32(i32 %9, i32 1)
%15 = icmp ne i32 %14, 0
br i1 %15, label %do.body.i, label %arm_mean_f32_mve.exit
arm_mean_f32_mve.exit: ; preds = %do.body.i
%16 = extractelement <4 x float> %12, i32 3
%add2.i.i = fadd fast float %16, %16
%conv.i = uitofp i32 %blockSize to float
%div.i = fdiv fast float %add2.i.i, %conv.i
%17 = bitcast float %div.i to i32
%18 = insertelement <4 x i32> undef, i32 %17, i64 0
%19 = shufflevector <4 x i32> %18, <4 x i32> undef, <4 x i32> zeroinitializer
%20 = bitcast <4 x i32> %19 to <4 x float>
call void @llvm.set.loop.iterations.i32(i32 %4)
br label %do.body
do.body: ; preds = %do.body, %arm_mean_f32_mve.exit
%blkCnt.0 = phi i32 [ %blockSize, %arm_mean_f32_mve.exit ], [ %26, %do.body ]
%sumVec.0 = phi <4 x float> [ zeroinitializer, %arm_mean_f32_mve.exit ], [ %25, %do.body ]
%pSrc.addr.0 = phi float* [ %pSrc, %arm_mean_f32_mve.exit ], [ %add.ptr, %do.body ]
%21 = phi i32 [ %4, %arm_mean_f32_mve.exit ], [ %27, %do.body ]
%pSrc.addr.01 = bitcast float* %pSrc.addr.0 to <4 x float>*
%22 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %blkCnt.0)
%23 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %pSrc.addr.01, i32 4, <4 x i1> %22, <4 x float> zeroinitializer)
%24 = tail call fast <4 x float> @llvm.arm.mve.sub.predicated.v4f32.v4i1(<4 x float> %23, <4 x float> %20, <4 x i1> %22, <4 x float> undef)
%25 = tail call fast <4 x float> @llvm.arm.mve.fma.predicated.v4f32.v4i1(<4 x float> %24, <4 x float> %24, <4 x float> %sumVec.0, <4 x i1> %22)
%add.ptr = getelementptr inbounds float, float* %pSrc.addr.0, i32 4
%26 = add i32 %blkCnt.0, -4
%27 = call i32 @llvm.loop.decrement.reg.i32(i32 %21, i32 1)
%28 = icmp ne i32 %27, 0
br i1 %28, label %do.body, label %do.end
do.end: ; preds = %do.body
%29 = extractelement <4 x float> %25, i32 3
%add2.i = fadd fast float %29, %29
%sub2 = add i32 %blockSize, -1
%conv = uitofp i32 %sub2 to float
%div = fdiv fast float %add2.i, %conv
store float %div, float* %pResult, align 4
ret void
}
; Function Attrs: nounwind readnone
declare <4 x float> @llvm.arm.mve.sub.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>, <4 x float>) #1
; Function Attrs: nounwind readnone
declare <4 x float> @llvm.arm.mve.fma.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x float>, <4 x i1>) #1
; Function Attrs: nounwind readnone
declare <4 x i1> @llvm.arm.mve.vctp32(i32) #1
; Function Attrs: argmemonly nounwind readonly willreturn
declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32 immarg, <4 x i1>, <4 x float>) #2
; Function Attrs: nounwind readnone
declare <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>, <4 x float>) #1
; Function Attrs: noduplicate nounwind
declare void @llvm.set.loop.iterations.i32(i32) #3
; Function Attrs: noduplicate nounwind
declare i32 @llvm.loop.decrement.reg.i32(i32, i32) #3
attributes #0 = { "target-features"="+mve.fp" }
attributes #1 = { nounwind readnone "target-features"="+mve.fp" }
attributes #2 = { argmemonly nounwind readonly willreturn "target-features"="+mve.fp" }
attributes #3 = { noduplicate nounwind }
...
---
# MIR for arm_var_f32_mve: two consecutive hardware loops (t2DoLoopStart /
# t2LoopDec / t2LoopEnd, each with an MVE_VCTP32 inside). The autogenerated
# CHECK lines in the body expect BOTH loops to be tail-predicated into
# MVE_DLSTP_32 / MVE_LETP by the arm-low-overhead-loops pass.
name: arm_var_f32_mve
alignment: 2
exposesReturnsTwice: false
legalized: false
regBankSelected: false
selected: false
failedISel: false
tracksRegLiveness: true
hasWinCFI: false
registers: []
liveins:
- { reg: '$r0', virtual-reg: '' }
- { reg: '$r1', virtual-reg: '' }
- { reg: '$r2', virtual-reg: '' }
frameInfo:
isFrameAddressTaken: false
isReturnAddressTaken: false
hasStackMap: false
hasPatchPoint: false
stackSize: 8
offsetAdjustment: 0
maxAlignment: 4
adjustsStack: false
hasCalls: false
stackProtector: ''
maxCallFrameSize: 0
cvBytesOfCalleeSavedRegisters: 0
hasOpaqueSPAdjustment: false
hasVAStart: false
hasMustTailInVarArgFunc: false
localFrameSize: 0
savePoint: ''
restorePoint: ''
fixedStack: []
stack:
- { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4,
stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false,
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
- { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4,
stack-id: default, callee-saved-register: '$r4', callee-saved-restored: true,
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
callSites: []
constants: []
machineFunctionInfo: {}
# The CHECK block at the top of the body is the expected post-pass output;
# the unprefixed MIR after it is the input fed to arm-low-overhead-loops.
body: |
; CHECK-LABEL: name: arm_var_f32_mve
; CHECK: bb.0.entry:
; CHECK: successors: %bb.1(0x80000000)
; CHECK: liveins: $lr, $r0, $r1, $r2, $r4
; CHECK: frame-setup tPUSH 14 /* CC::al */, $noreg, killed $r4, killed $lr, implicit-def $sp, implicit $sp
; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8
; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4
; CHECK: frame-setup CFI_INSTRUCTION offset $r4, -8
; CHECK: renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0
; CHECK: $r3 = tMOVr $r1, 14 /* CC::al */, $noreg
; CHECK: $r12 = tMOVr $r0, 14 /* CC::al */, $noreg
; CHECK: $lr = MVE_DLSTP_32 killed renamable $r3
; CHECK: bb.1.do.body.i:
; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000)
; CHECK: liveins: $lr, $q0, $r0, $r1, $r2, $r12
; CHECK: renamable $r12, renamable $q1 = MVE_VLDRWU32_post killed renamable $r12, 16, 0, $noreg :: (load 16 from %ir.pSrc.addr.0.i2, align 4)
; CHECK: renamable $q0 = nnan ninf nsz arcp contract afn reassoc MVE_VADDf32 killed renamable $q0, killed renamable $q1, 0, killed $noreg, killed renamable $q0
; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.1
; CHECK: bb.2.arm_mean_f32_mve.exit:
; CHECK: successors: %bb.3(0x80000000)
; CHECK: liveins: $q0, $r0, $r1, $r2
; CHECK: $s4 = VMOVSR $r1, 14 /* CC::al */, $noreg
; CHECK: $lr = MVE_DLSTP_32 $r1
; CHECK: renamable $s0 = nnan ninf nsz arcp contract afn reassoc VADDS killed renamable $s3, killed renamable $s3, 14 /* CC::al */, $noreg, implicit killed $q0
; CHECK: renamable $s4 = VUITOS killed renamable $s4, 14 /* CC::al */, $noreg
; CHECK: renamable $s0 = nnan ninf nsz arcp contract afn reassoc VDIVS killed renamable $s0, killed renamable $s4, 14 /* CC::al */, $noreg
; CHECK: renamable $r3 = VMOVRS killed renamable $s0, 14 /* CC::al */, $noreg
; CHECK: renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0
; CHECK: renamable $q1 = MVE_VDUP32 killed renamable $r3, 0, $noreg, undef renamable $q1
; CHECK: $r3 = tMOVr $r1, 14 /* CC::al */, $noreg
; CHECK: bb.3.do.body:
; CHECK: successors: %bb.3(0x7c000000), %bb.4(0x04000000)
; CHECK: liveins: $lr, $q0, $q1, $r0, $r1, $r2, $r3
; CHECK: renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14 /* CC::al */, $noreg
; CHECK: renamable $r0, renamable $q2 = MVE_VLDRWU32_post killed renamable $r0, 16, 0, $noreg :: (load 16 from %ir.pSrc.addr.01, align 4)
; CHECK: renamable $q2 = nnan ninf nsz arcp contract afn reassoc MVE_VSUBf32 killed renamable $q2, renamable $q1, 0, $noreg, undef renamable $q2
; CHECK: renamable $q0 = nnan ninf nsz arcp contract afn reassoc MVE_VFMAf32 killed renamable $q0, killed renamable $q2, killed renamable $q2, 0, killed $noreg
; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.3
; CHECK: bb.4.do.end:
; CHECK: liveins: $q0, $r1, $r2
; CHECK: renamable $r0, dead $cpsr = tSUBi3 killed renamable $r1, 1, 14 /* CC::al */, $noreg
; CHECK: renamable $s0 = nnan ninf nsz arcp contract afn reassoc VADDS killed renamable $s3, killed renamable $s3, 14 /* CC::al */, $noreg, implicit killed $q0
; CHECK: $s2 = VMOVSR killed $r0, 14 /* CC::al */, $noreg
; CHECK: renamable $s2 = VUITOS killed renamable $s2, 14 /* CC::al */, $noreg
; CHECK: renamable $s0 = nnan ninf nsz arcp contract afn reassoc VDIVS killed renamable $s0, killed renamable $s2, 14 /* CC::al */, $noreg
; CHECK: VSTRS killed renamable $s0, killed renamable $r2, 0, 14 /* CC::al */, $noreg :: (store 4 into %ir.pResult)
; CHECK: frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r4, def $pc
; Input MIR (expected output is given by the CHECK lines above).
bb.0.entry:
successors: %bb.1(0x80000000)
liveins: $r0, $r1, $r2, $r4, $lr
frame-setup tPUSH 14 /* CC::al */, $noreg, killed $r4, killed $lr, implicit-def $sp, implicit $sp
frame-setup CFI_INSTRUCTION def_cfa_offset 8
frame-setup CFI_INSTRUCTION offset $lr, -4
frame-setup CFI_INSTRUCTION offset $r4, -8
$r3 = tMOVr $r1, 14 /* CC::al */, $noreg
tCMPi8 renamable $r1, 4, 14 /* CC::al */, $noreg, implicit-def $cpsr
t2IT 10, 8, implicit-def $itstate
renamable $r3 = tMOVi8 $noreg, 4, 10 /* CC::ge */, killed $cpsr, implicit killed renamable $r3, implicit killed $itstate
renamable $r12 = t2MOVi 1, 14 /* CC::al */, $noreg, $noreg
renamable $r3, dead $cpsr = tSUBrr renamable $r1, killed renamable $r3, 14 /* CC::al */, $noreg
renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0
renamable $r3, dead $cpsr = tADDi8 killed renamable $r3, 3, 14 /* CC::al */, $noreg
renamable $lr = nuw nsw t2ADDrs killed renamable $r12, killed renamable $r3, 19, 14 /* CC::al */, $noreg, $noreg
$r3 = tMOVr $r1, 14 /* CC::al */, $noreg
$r12 = tMOVr $r0, 14 /* CC::al */, $noreg
t2DoLoopStart renamable $lr
$r4 = tMOVr $lr, 14 /* CC::al */, $noreg
bb.1.do.body.i:
successors: %bb.1(0x7c000000), %bb.2(0x04000000)
liveins: $lr, $q0, $r0, $r1, $r2, $r3, $r4, $r12
renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg
renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14 /* CC::al */, $noreg
renamable $lr = t2LoopDec killed renamable $lr, 1
MVE_VPST 4, implicit $vpr
renamable $r12, renamable $q1 = MVE_VLDRWU32_post killed renamable $r12, 16, 1, renamable $vpr :: (load 16 from %ir.pSrc.addr.0.i2, align 4)
renamable $q0 = nnan ninf nsz arcp contract afn reassoc MVE_VADDf32 killed renamable $q0, killed renamable $q1, 1, killed renamable $vpr, renamable $q0
t2LoopEnd renamable $lr, %bb.1, implicit-def dead $cpsr
tB %bb.2, 14 /* CC::al */, $noreg
; Second loop: its element count register ($r3, read by MVE_VCTP32 in bb.3)
; is defined by "tMOVr $r1" at the end of this block, after the loop start.
; The CHECK above expects the pass to fall back to the mov's source operand:
; "$lr = MVE_DLSTP_32 $r1".
bb.2.arm_mean_f32_mve.exit:
successors: %bb.3(0x80000000)
liveins: $q0, $r0, $r1, $r2, $r4
$s4 = VMOVSR $r1, 14 /* CC::al */, $noreg
$lr = tMOVr $r4, 14 /* CC::al */, $noreg
renamable $s0 = nnan ninf nsz arcp contract afn reassoc VADDS killed renamable $s3, renamable $s3, 14 /* CC::al */, $noreg, implicit $q0
t2DoLoopStart killed $r4
renamable $s4 = VUITOS killed renamable $s4, 14 /* CC::al */, $noreg
renamable $s0 = nnan ninf nsz arcp contract afn reassoc VDIVS killed renamable $s0, killed renamable $s4, 14 /* CC::al */, $noreg
renamable $r3 = VMOVRS killed renamable $s0, 14 /* CC::al */, $noreg
renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0
renamable $q1 = MVE_VDUP32 killed renamable $r3, 0, $noreg, undef renamable $q1
$r3 = tMOVr $r1, 14 /* CC::al */, $noreg
bb.3.do.body:
successors: %bb.3(0x7c000000), %bb.4(0x04000000)
liveins: $lr, $q0, $q1, $r0, $r1, $r2, $r3
renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg
renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14 /* CC::al */, $noreg
renamable $lr = t2LoopDec killed renamable $lr, 1
MVE_VPST 2, implicit $vpr
renamable $r0, renamable $q2 = MVE_VLDRWU32_post killed renamable $r0, 16, 1, renamable $vpr :: (load 16 from %ir.pSrc.addr.01, align 4)
renamable $q2 = nnan ninf nsz arcp contract afn reassoc MVE_VSUBf32 killed renamable $q2, renamable $q1, 1, renamable $vpr, undef renamable $q2
renamable $q0 = nnan ninf nsz arcp contract afn reassoc MVE_VFMAf32 killed renamable $q0, killed renamable $q2, renamable $q2, 1, killed renamable $vpr
t2LoopEnd renamable $lr, %bb.3, implicit-def dead $cpsr
tB %bb.4, 14 /* CC::al */, $noreg
bb.4.do.end:
liveins: $q0, $r1, $r2
renamable $r0, dead $cpsr = tSUBi3 killed renamable $r1, 1, 14 /* CC::al */, $noreg
renamable $s0 = nnan ninf nsz arcp contract afn reassoc VADDS killed renamable $s3, renamable $s3, 14 /* CC::al */, $noreg, implicit $q0
$s2 = VMOVSR killed $r0, 14 /* CC::al */, $noreg
renamable $s2 = VUITOS killed renamable $s2, 14 /* CC::al */, $noreg
renamable $s0 = nnan ninf nsz arcp contract afn reassoc VDIVS killed renamable $s0, killed renamable $s2, 14 /* CC::al */, $noreg
VSTRS killed renamable $s0, killed renamable $r2, 0, 14 /* CC::al */, $noreg :: (store 4 into %ir.pResult)
frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r4, def $pc
...

View File

@ -0,0 +1,81 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve.fp -verify-machineinstrs -tail-predication=enabled -o - %s | FileCheck %s
define arm_aapcs_vfpcc void @arm_var_f32_mve(float* %pSrc, i32 %blockSize, float* nocapture %pResult) {
; Two chained do-while loops over pSrc, each guarded by llvm.arm.mve.vctp32.
; The second loop's element count is %blockSize (second argument, so r1 under
; AAPCS); the autogenerated CHECK lines expect "dlstp.32 lr, r1", i.e. the
; low-overhead-loops pass used the mov's source register for the dlstp.
; CHECK-LABEL: .LBB0_1: @ %do.body.i
; CHECK: dlstp.32 lr, r1
; CHECK-NEXT: vadd.f32 s0, s3, s3
; CHECK-NEXT: vcvt.f32.u32 s4, s4
; CHECK-NEXT: vdiv.f32 s0, s0, s4
; CHECK-NEXT: vmov r3, s0
; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: vdup.32 q1, r3
; CHECK-NEXT: mov r3, r1
; CHECK-NEXT: .LBB0_3: @ %do.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: subs r3, #4
; CHECK-NEXT: vldrw.u32 q2, [r0], #16
; CHECK-NEXT: vsub.f32 q2, q2, q1
; CHECK-NEXT: vfma.f32 q0, q2, q2
; CHECK-NEXT: letp lr, .LBB0_3
entry:
br label %do.body.i
; First loop: predicated sum of pSrc in 4-float chunks.
do.body.i: ; preds = %entry, %do.body.i
%blkCnt.0.i = phi i32 [ %sub.i, %do.body.i ], [ %blockSize, %entry ]
%sumVec.0.i = phi <4 x float> [ %3, %do.body.i ], [ zeroinitializer, %entry ]
%pSrc.addr.0.i = phi float* [ %add.ptr.i, %do.body.i ], [ %pSrc, %entry ]
%0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %blkCnt.0.i)
%1 = bitcast float* %pSrc.addr.0.i to <4 x float>*
%2 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %1, i32 4, <4 x i1> %0, <4 x float> zeroinitializer)
%3 = tail call fast <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> %sumVec.0.i, <4 x float> %2, <4 x i1> %0, <4 x float> %sumVec.0.i)
%sub.i = add nsw i32 %blkCnt.0.i, -4
%add.ptr.i = getelementptr inbounds float, float* %pSrc.addr.0.i, i32 4
%cmp.i = icmp sgt i32 %blkCnt.0.i, 4
br i1 %cmp.i, label %do.body.i, label %arm_mean_f32_mve.exit
; Compute the mean and splat it for the second loop.
arm_mean_f32_mve.exit: ; preds = %do.body.i
%4 = extractelement <4 x float> %3, i32 3
%add2.i.i = fadd fast float %4, %4
%conv.i = uitofp i32 %blockSize to float
%div.i = fdiv fast float %add2.i.i, %conv.i
%.splatinsert = insertelement <4 x float> undef, float %div.i, i32 0
%.splat = shufflevector <4 x float> %.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer
br label %do.body
; Second loop: predicated sum of squared differences (the loop the CHECK
; lines above verify is tail-predicated).
do.body: ; preds = %do.body, %arm_mean_f32_mve.exit
%blkCnt.0 = phi i32 [ %blockSize, %arm_mean_f32_mve.exit ], [ %sub, %do.body ]
%sumVec.0 = phi <4 x float> [ zeroinitializer, %arm_mean_f32_mve.exit ], [ %9, %do.body ]
%pSrc.addr.0 = phi float* [ %pSrc, %arm_mean_f32_mve.exit ], [ %add.ptr, %do.body ]
%5 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %blkCnt.0)
%6 = bitcast float* %pSrc.addr.0 to <4 x float>*
%7 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %6, i32 4, <4 x i1> %5, <4 x float> zeroinitializer)
%8 = tail call fast <4 x float> @llvm.arm.mve.sub.predicated.v4f32.v4i1(<4 x float> %7, <4 x float> %.splat, <4 x i1> %5, <4 x float> undef)
%9 = tail call fast <4 x float> @llvm.arm.mve.fma.predicated.v4f32.v4i1(<4 x float> %8, <4 x float> %8, <4 x float> %sumVec.0, <4 x i1> %5)
%sub = add nsw i32 %blkCnt.0, -4
%add.ptr = getelementptr inbounds float, float* %pSrc.addr.0, i32 4
%cmp1 = icmp sgt i32 %blkCnt.0, 4
br i1 %cmp1, label %do.body, label %do.end
do.end: ; preds = %do.body
%10 = extractelement <4 x float> %9, i32 3
%add2.i = fadd fast float %10, %10
%sub2 = add i32 %blockSize, -1
%conv = uitofp i32 %sub2 to float
%div = fdiv fast float %add2.i, %conv
br label %cleanup
cleanup: ; preds = %entry, %do.end
store float %div, float* %pResult, align 4
ret void
}
declare <4 x float> @llvm.arm.mve.sub.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>, <4 x float>)
declare <4 x float> @llvm.arm.mve.fma.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x float>, <4 x i1>)
declare <4 x i1> @llvm.arm.mve.vctp32(i32)
declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32 immarg, <4 x i1>, <4 x float>)
declare <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>, <4 x float>)