From 9fd9749580b25d110980052e3b20df755091d764 Mon Sep 17 00:00:00 2001 From: David Green Date: Sun, 13 Jun 2021 13:55:34 +0100 Subject: [PATCH] [ARM] Introduce t2WhileLoopStartTP This adds t2WhileLoopStartTP, similar to the t2DoLoopStartTP added in D90591. It keeps a reference to both the tripcount register and the element count register, so that the ARMLowOverheadLoops pass in the backend can pick the correct one without having to search for it from the operand of a VCTP. Differential Revision: https://reviews.llvm.org/D103236 --- lib/Target/ARM/ARMBaseInstrInfo.cpp | 5 +- lib/Target/ARM/ARMBaseInstrInfo.h | 9 +--- lib/Target/ARM/ARMBlockPlacement.cpp | 11 +++-- lib/Target/ARM/ARMInstrThumb2.td | 24 ++++++---- lib/Target/ARM/ARMLowOverheadLoops.cpp | 42 ++++++++-------- .../ARM/MVETPAndVPTOptimisationsPass.cpp | 19 +++++--- lib/Target/ARM/MVETailPredUtils.h | 32 +++++++++---- .../Thumb2/LowOverheadLoops/memcall.ll | 9 ++-- .../LowOverheadLoops/wls-search-pred.mir | 4 +- .../Thumb2/mve-gather-scatter-optimisation.ll | 3 +- test/CodeGen/Thumb2/mve-memtp-loop.ll | 48 ++++++++----------- 11 files changed, 109 insertions(+), 97 deletions(-) diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp index aff8b8e21bd..b55d5492615 100644 --- a/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -6122,8 +6122,9 @@ ARMBaseInstrInfo::getOutliningType(MachineBasicBlock::iterator &MIT, // Be conservative with ARMv8.1 MVE instructions. if (Opc == ARM::t2BF_LabelPseudo || Opc == ARM::t2DoLoopStart || Opc == ARM::t2DoLoopStartTP || Opc == ARM::t2WhileLoopStart || - Opc == ARM::t2WhileLoopStartLR || Opc == ARM::t2LoopDec || - Opc == ARM::t2LoopEnd || Opc == ARM::t2LoopEndDec) + Opc == ARM::t2WhileLoopStartLR || Opc == ARM::t2WhileLoopStartTP || + Opc == ARM::t2LoopDec || Opc == ARM::t2LoopEnd || + Opc == ARM::t2LoopEndDec) return outliner::InstrType::Illegal; const MCInstrDesc &MCID = MI.getDesc(); diff --git a/lib/Target/ARM/ARMBaseInstrInfo.h b/lib/Target/ARM/ARMBaseInstrInfo.h index 6eb997ba1a1..0ebba0d9fdd 100644 --- a/lib/Target/ARM/ARMBaseInstrInfo.h +++ b/lib/Target/ARM/ARMBaseInstrInfo.h @@ -367,7 +367,8 @@ public: bool isUnspillableTerminatorImpl(const MachineInstr *MI) const override { return MI->getOpcode() == ARM::t2LoopEndDec || MI->getOpcode() == ARM::t2DoLoopStartTP || - MI->getOpcode() == ARM::t2WhileLoopStartLR; + MI->getOpcode() == ARM::t2WhileLoopStartLR || + MI->getOpcode() == ARM::t2WhileLoopStartTP; } private: @@ -645,12 +646,6 @@ static inline bool isJumpTableBranchOpcode(int Opc) { Opc == ARM::t2BR_JT; } -static inline bool isLowOverheadTerminatorOpcode(int Opc) { - return Opc == ARM::t2DoLoopStartTP || Opc == ARM::t2WhileLoopStart || - Opc == ARM::t2WhileLoopStartLR || Opc == ARM::t2LoopEnd || - Opc == ARM::t2LoopEndDec; -} - static inline bool isIndirectBranchOpcode(int Opc) { return Opc == ARM::BX || Opc == ARM::MOVPCRX || Opc == ARM::tBRIND; diff --git a/lib/Target/ARM/ARMBlockPlacement.cpp b/lib/Target/ARM/ARMBlockPlacement.cpp index 539db713f17..5ea47f529b2 100644 --- a/lib/Target/ARM/ARMBlockPlacement.cpp +++ b/lib/Target/ARM/ARMBlockPlacement.cpp @@ -15,6 +15,7 @@ #include "ARMBaseInstrInfo.h" #include "ARMBasicBlockInfo.h" #include "ARMSubtarget.h" +#include "MVETailPredUtils.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineLoopInfo.h" @@ -61,13 +62,13 @@ INITIALIZE_PASS(ARMBlockPlacement, DEBUG_TYPE, "ARM block placement", false, static MachineInstr *findWLSInBlock(MachineBasicBlock *MBB) { for (auto &Terminator : MBB->terminators()) { - if (Terminator.getOpcode() == ARM::t2WhileLoopStartLR) + if (isWhileLoopStart(Terminator)) return &Terminator; } return nullptr; } -/// Find t2WhileLoopStartLR in the loop predecessor BB or otherwise in its only +/// Find WhileLoopStart in the loop predecessor BB or otherwise in its only /// predecessor. If found, returns (BB, WLS Instr) pair, otherwise a null pair. static MachineInstr *findWLS(MachineLoop *ML) { MachineBasicBlock *Predecessor = ML->getLoopPredecessor(); @@ -93,7 +94,7 @@ bool ARMBlockPlacement::fixBackwardsWLS(MachineLoop *ML) { return false; MachineBasicBlock *Predecessor = WlsInstr->getParent(); - MachineBasicBlock *LoopExit = WlsInstr->getOperand(2).getMBB(); + MachineBasicBlock *LoopExit = getWhileLoopStartTargetBB(*WlsInstr); // We don't want to move Preheader to before the function's entry block. if (!LoopExit->getPrevNode()) @@ -118,9 +119,9 @@ bool ARMBlockPlacement::fixBackwardsWLS(MachineLoop *ML) { ++It) { MachineBasicBlock *MBB = &*It; for (auto &Terminator : MBB->terminators()) { - if (Terminator.getOpcode() != ARM::t2WhileLoopStartLR) + if (!isWhileLoopStart(Terminator)) continue; - MachineBasicBlock *WLSTarget = Terminator.getOperand(2).getMBB(); + MachineBasicBlock *WLSTarget = getWhileLoopStartTargetBB(Terminator); // TODO: Analyse the blocks to make a decision if it would be worth // moving Preheader even if we'd introduce a backwards WLS if (WLSTarget == Predecessor) { diff --git a/lib/Target/ARM/ARMInstrThumb2.td b/lib/Target/ARM/ARMInstrThumb2.td index 5f7cfa371ff..1258c70b81f 100644 --- a/lib/Target/ARM/ARMInstrThumb2.td +++ b/lib/Target/ARM/ARMInstrThumb2.td @@ -5479,8 +5479,8 @@ let Predicates = [IsThumb2, HasV8_1MMainline, HasLOB] in { // t2DoLoopStart a pseudo for DLS hardware loops. Lowered into a DLS in // ARMLowOverheadLoops if possible, or reverted to a Mov if not. def t2DoLoopStart : - t2PseudoInst<(outs GPRlr:$X), (ins rGPR:$elts), 4, IIC_Br, - [(set GPRlr:$X, (int_start_loop_iterations rGPR:$elts))]>; + t2PseudoInst<(outs GPRlr:$X), (ins rGPR:$tc), 4, IIC_Br, + [(set GPRlr:$X, (int_start_loop_iterations rGPR:$tc))]>; // A pseudo for a DLSTP, created in the MVETPAndVPTOptimizationPass from a // t2DoLoopStart if the loops is tail predicated. Holds both the element @@ -5488,7 +5488,7 @@ def t2DoLoopStart : // ARMLowOverheadLoops when it is converted to a DLSTP or DLS as required. let isTerminator = 1, hasSideEffects = 1 in def t2DoLoopStartTP : - t2PseudoInst<(outs GPRlr:$X), (ins rGPR:$elts, rGPR:$count), 4, IIC_Br, []>; + t2PseudoInst<(outs GPRlr:$X), (ins rGPR:$tc, rGPR:$elts), 4, IIC_Br, []>; // Setup for a t2WhileLoopStart. A pair of t2WhileLoopSetup and t2WhileLoopStart // will be created post-ISel from a llvm.test.start.loop.iterations. This @@ -5496,7 +5496,7 @@ def t2DoLoopStartTP : // valid after reg alloc, as it should be lowered during MVETPAndVPTOptimisations // into a t2WhileLoopStartLR (or expanded). def t2WhileLoopSetup : - t2PseudoInst<(outs GPRlr:$lr), (ins rGPR:$elts), 4, IIC_Br, []>; + t2PseudoInst<(outs GPRlr:$lr), (ins rGPR:$tc), 4, IIC_Br, []>; // A pseudo to represent the decrement in a low overhead loop. A t2LoopDec and // t2LoopEnd together represent a LE instruction. Ideally these are converted @@ -5511,7 +5511,7 @@ let isBranch = 1, isTerminator = 1, hasSideEffects = 1, Defs = [CPSR] in { // into a t2WhileLoopStartLR that does both the LR setup and branch. def t2WhileLoopStart : t2PseudoInst<(outs), - (ins GPRlr:$elts, brtarget:$target), + (ins GPRlr:$tc, brtarget:$target), 4, IIC_Br, []>, Sched<[WriteBr]>; @@ -5521,13 +5521,21 @@ def t2WhileLoopStart : // converted into t2CMP and t2Bcc. def t2WhileLoopStartLR : t2PseudoInst<(outs GPRlr:$lr), - (ins rGPR:$elts, brtarget:$target), + (ins rGPR:$tc, brtarget:$target), + 8, IIC_Br, []>, + Sched<[WriteBr]>; + +// Similar to a t2DoLoopStartTP, a t2WhileLoopStartTP is a pseudo for a WLSTP +// holding both the element count and the tripcount of the loop. +def t2WhileLoopStartTP : + t2PseudoInst<(outs GPRlr:$lr), + (ins rGPR:$tc, rGPR:$elts, brtarget:$target), 8, IIC_Br, []>, Sched<[WriteBr]>; // t2LoopEnd - the branch half of a t2LoopDec/t2LoopEnd pair. def t2LoopEnd : - t2PseudoInst<(outs), (ins GPRlr:$elts, brtarget:$target), + t2PseudoInst<(outs), (ins GPRlr:$tc, brtarget:$target), 8, IIC_Br, []>, Sched<[WriteBr]>; // The combination of a t2LoopDec and t2LoopEnd, performing both the LR @@ -5535,7 +5543,7 @@ def t2LoopEnd : // LETP in ARMLowOverheadLoops as appropriate, or converted to t2CMP/t2Bcc // if the branches are out of range. def t2LoopEndDec : - t2PseudoInst<(outs GPRlr:$Rm), (ins GPRlr:$elts, brtarget:$target), + t2PseudoInst<(outs GPRlr:$Rm), (ins GPRlr:$tc, brtarget:$target), 8, IIC_Br, []>, Sched<[WriteBr]>; } // end isBranch, isTerminator, hasSideEffects diff --git a/lib/Target/ARM/ARMLowOverheadLoops.cpp b/lib/Target/ARM/ARMLowOverheadLoops.cpp index e1d77a585d2..ecdb380cc34 100644 --- a/lib/Target/ARM/ARMLowOverheadLoops.cpp +++ b/lib/Target/ARM/ARMLowOverheadLoops.cpp @@ -101,10 +101,6 @@ static bool shouldInspect(MachineInstr &MI) { return isDomainMVE(&MI) || isVectorPredicate(&MI) || hasVPRUse(MI); } -static bool isDo(MachineInstr *MI) { - return MI->getOpcode() != ARM::t2WhileLoopStartLR; -} - namespace { using InstSet = SmallPtrSetImpl; @@ -446,7 +442,7 @@ namespace { } unsigned getStartOpcode() const { - bool IsDo = isDo(Start); + bool IsDo = isDoLoopStart(*Start); if (!IsTailPredicationLegal()) return IsDo ? ARM::t2DLS : ARM::t2WLS; @@ -635,7 +631,8 @@ bool LowOverheadLoop::ValidateTailPredicate() { // elements is provided to the vctp instruction, so we need to check that // we can use this register at InsertPt. MachineInstr *VCTP = VCTPs.back(); - if (Start->getOpcode() == ARM::t2DoLoopStartTP) { + if (Start->getOpcode() == ARM::t2DoLoopStartTP || + Start->getOpcode() == ARM::t2WhileLoopStartTP) { TPNumElements = Start->getOperand(2); StartInsertPt = Start; StartInsertBB = Start->getParent(); @@ -778,10 +775,12 @@ bool LowOverheadLoop::ValidateTailPredicate() { } } - // If we converted the LoopStart to a t2DoLoopStartTP, we can also remove any - // extra instructions in the preheader, which often includes a now unused MOV. - if (Start->getOpcode() == ARM::t2DoLoopStartTP && Preheader && - !Preheader->empty() && + // If we converted the LoopStart to a t2DoLoopStartTP/t2WhileLoopStartTP, we + // can also remove any extra instructions in the preheader, which often + // includes a now unused MOV. + if ((Start->getOpcode() == ARM::t2DoLoopStartTP || + Start->getOpcode() == ARM::t2WhileLoopStartTP) && + Preheader && !Preheader->empty() && !RDA.hasLocalDefBefore(VCTP, VCTP->getOperand(1).getReg())) { if (auto *Def = RDA.getUniqueReachingMIDef( &Preheader->back(), VCTP->getOperand(1).getReg().asMCReg())) { @@ -1045,12 +1044,13 @@ void LowOverheadLoop::Validate(ARMBasicBlockUtils *BBUtils) { return false; } - if (Start->getOpcode() == ARM::t2WhileLoopStartLR && - (BBUtils->getOffsetOf(Start) > - BBUtils->getOffsetOf(Start->getOperand(2).getMBB()) || - !BBUtils->isBBInRange(Start, Start->getOperand(2).getMBB(), 4094))) { - LLVM_DEBUG(dbgs() << "ARM Loops: WLS offset is out-of-range!\n"); - return false; + if (isWhileLoopStart(*Start)) { + MachineBasicBlock *TargetBB = getWhileLoopStartTargetBB(*Start); + if (BBUtils->getOffsetOf(Start) > BBUtils->getOffsetOf(TargetBB) || + !BBUtils->isBBInRange(Start, TargetBB, 4094)) { + LLVM_DEBUG(dbgs() << "ARM Loops: WLS offset is out-of-range!\n"); + return false; + } } return true; }; @@ -1289,7 +1289,7 @@ bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) { // another low register. void ARMLowOverheadLoops::RevertWhile(MachineInstr *MI) const { LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to cmp: " << *MI); - MachineBasicBlock *DestBB = MI->getOperand(2).getMBB(); + MachineBasicBlock *DestBB = getWhileLoopStartTargetBB(*MI); unsigned BrOpc = BBUtils->isBBInRange(MI, DestBB, 254) ? ARM::tBcc : ARM::t2Bcc; @@ -1426,8 +1426,8 @@ MachineInstr* ARMLowOverheadLoops::ExpandLoopStart(LowOverheadLoop &LoLoop) { MIB.addDef(ARM::LR); MIB.add(Count); - if (!isDo(Start)) - MIB.add(Start->getOperand(2)); + if (isWhileLoopStart(*Start)) + MIB.addMBB(getWhileLoopStartTargetBB(*Start)); LLVM_DEBUG(dbgs() << "ARM Loops: Inserted start: " << *MIB); NewStart = &*MIB; @@ -1612,7 +1612,7 @@ void ARMLowOverheadLoops::Expand(LowOverheadLoop &LoLoop) { }; if (LoLoop.Revert) { - if (LoLoop.Start->getOpcode() == ARM::t2WhileLoopStartLR) + if (isWhileLoopStart(*LoLoop.Start)) RevertWhile(LoLoop.Start); else RevertDo(LoLoop.Start); @@ -1683,7 +1683,7 @@ bool ARMLowOverheadLoops::RevertNonLoops() { Changed = true; for (auto *Start : Starts) { - if (Start->getOpcode() == ARM::t2WhileLoopStartLR) + if (isWhileLoopStart(*Start)) RevertWhile(Start); else RevertDo(Start); diff --git a/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp b/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp index 2aa5d6ad842..6fa5402096a 100644 --- a/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp +++ b/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp @@ -429,7 +429,8 @@ bool MVETPAndVPTOptimisations::ConvertTailPredLoop(MachineLoop *ML, MachineInstr *LoopEnd, *LoopPhi, *LoopStart, *LoopDec; if (!findLoopComponents(ML, MRI, LoopStart, LoopPhi, LoopDec, LoopEnd)) return false; - if (LoopDec != LoopEnd || LoopStart->getOpcode() != ARM::t2DoLoopStart) + if (LoopDec != LoopEnd || (LoopStart->getOpcode() != ARM::t2DoLoopStart && + LoopStart->getOpcode() != ARM::t2WhileLoopStartLR)) return false; SmallVector VCTPs; @@ -494,12 +495,16 @@ bool MVETPAndVPTOptimisations::ConvertTailPredLoop(MachineLoop *ML, return false; } - MachineInstrBuilder MI = BuildMI(*MBB, InsertPt, LoopStart->getDebugLoc(), - TII->get(ARM::t2DoLoopStartTP)) - .add(LoopStart->getOperand(0)) - .add(LoopStart->getOperand(1)) - .addReg(CountReg); - (void)MI; + unsigned NewOpc = LoopStart->getOpcode() == ARM::t2DoLoopStart + ? ARM::t2DoLoopStartTP + : ARM::t2WhileLoopStartTP; + MachineInstrBuilder MI = + BuildMI(*MBB, InsertPt, LoopStart->getDebugLoc(), TII->get(NewOpc)) + .add(LoopStart->getOperand(0)) + .add(LoopStart->getOperand(1)) + .addReg(CountReg); + if (NewOpc == ARM::t2WhileLoopStartTP) + MI.add(LoopStart->getOperand(2)); LLVM_DEBUG(dbgs() << "Replacing " << *LoopStart << " with " << *MI.getInstr()); MRI->constrainRegClass(CountReg, &ARM::rGPRRegClass); diff --git a/lib/Target/ARM/MVETailPredUtils.h b/lib/Target/ARM/MVETailPredUtils.h index b0c003120fa..8c64893d448 100644 --- a/lib/Target/ARM/MVETailPredUtils.h +++ b/lib/Target/ARM/MVETailPredUtils.h @@ -68,11 +68,26 @@ static inline bool isVCTP(const MachineInstr *MI) { return false; } -static inline bool isLoopStart(MachineInstr &MI) { +static inline bool isDoLoopStart(const MachineInstr &MI) { return MI.getOpcode() == ARM::t2DoLoopStart || - MI.getOpcode() == ARM::t2DoLoopStartTP || - MI.getOpcode() == ARM::t2WhileLoopStart || - MI.getOpcode() == ARM::t2WhileLoopStartLR; + MI.getOpcode() == ARM::t2DoLoopStartTP; +} + +static inline bool isWhileLoopStart(const MachineInstr &MI) { + return MI.getOpcode() == ARM::t2WhileLoopStart || + MI.getOpcode() == ARM::t2WhileLoopStartLR || + MI.getOpcode() == ARM::t2WhileLoopStartTP; +} + +static inline bool isLoopStart(const MachineInstr &MI) { + return isDoLoopStart(MI) || isWhileLoopStart(MI); +} + +// Return the TargetBB stored in a t2WhileLoopStartLR/t2WhileLoopStartTP. +inline MachineBasicBlock *getWhileLoopStartTargetBB(const MachineInstr &MI) { + assert(isWhileLoopStart(MI) && "Expected WhileLoopStart!"); + unsigned Op = MI.getOpcode() == ARM::t2WhileLoopStartTP ? 3 : 2; + return MI.getOperand(Op).getMBB(); } // WhileLoopStart holds the exit block, so produce a subs Op0, Op1, 0 and then a @@ -84,8 +99,9 @@ inline void RevertWhileLoopStartLR(MachineInstr *MI, const TargetInstrInfo *TII, unsigned BrOpc = ARM::t2Bcc, bool UseCmp = false) { MachineBasicBlock *MBB = MI->getParent(); - assert(MI->getOpcode() == ARM::t2WhileLoopStartLR && - "Only expected a t2WhileLoopStartLR in RevertWhileLoopStartLR!"); + assert((MI->getOpcode() == ARM::t2WhileLoopStartLR || + MI->getOpcode() == ARM::t2WhileLoopStartTP) && + "Only expected a t2WhileLoopStartLR/TP in RevertWhileLoopStartLR!"); // Subs/Cmp if (UseCmp) { @@ -109,8 +125,8 @@ inline void RevertWhileLoopStartLR(MachineInstr *MI, const TargetInstrInfo *TII, // Branch MachineInstrBuilder MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(BrOpc)); - MIB.add(MI->getOperand(2)); // branch target - MIB.addImm(ARMCC::EQ); // condition code + MIB.addMBB(getWhileLoopStartTargetBB(*MI)); // branch target + MIB.addImm(ARMCC::EQ); // condition code MIB.addReg(ARM::CPSR); MI->eraseFromParent(); diff --git a/test/CodeGen/Thumb2/LowOverheadLoops/memcall.ll b/test/CodeGen/Thumb2/LowOverheadLoops/memcall.ll index 8c8f6784425..040e026e6a8 100644 --- a/test/CodeGen/Thumb2/LowOverheadLoops/memcall.ll +++ b/test/CodeGen/Thumb2/LowOverheadLoops/memcall.ll @@ -17,8 +17,7 @@ define void @test_memcpy(i32* nocapture %x, i32* nocapture readonly %y, i32 %n, ; CHECK-NEXT: @ Child Loop BB0_4 Depth 2 ; CHECK-NEXT: adds r4, r1, r7 ; CHECK-NEXT: adds r5, r0, r7 -; CHECK-NEXT: mov r6, r3 -; CHECK-NEXT: wlstp.8 lr, r6, .LBB0_3 +; CHECK-NEXT: wlstp.8 lr, r3, .LBB0_3 ; CHECK-NEXT: b .LBB0_4 ; CHECK-NEXT: .LBB0_3: @ %for.body ; CHECK-NEXT: @ in Loop: Header=BB0_2 Depth=1 @@ -71,8 +70,7 @@ define void @test_memset(i32* nocapture %x, i32 %n, i32 %m) { ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB1_4 Depth 2 ; CHECK-NEXT: mov r4, r0 -; CHECK-NEXT: mov r3, r2 -; CHECK-NEXT: wlstp.8 lr, r3, .LBB1_3 +; CHECK-NEXT: wlstp.8 lr, r2, .LBB1_3 ; CHECK-NEXT: b .LBB1_4 ; CHECK-NEXT: .LBB1_3: @ %for.body ; CHECK-NEXT: @ in Loop: Header=BB1_2 Depth=1 @@ -285,8 +283,7 @@ define void @test_memset_preheader(i8* %x, i8* %y, i32 %n) { ; CHECK-NEXT: @ %bb.1: @ %prehead ; CHECK-NEXT: vmov.i32 q0, #0x0 ; CHECK-NEXT: mov r12, r0 -; CHECK-NEXT: mov r3, r2 -; CHECK-NEXT: wlstp.8 lr, r3, .LBB6_3 +; CHECK-NEXT: wlstp.8 lr, r2, .LBB6_3 ; CHECK-NEXT: .LBB6_2: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vstrb.8 q0, [r12], #16 ; CHECK-NEXT: letp lr, .LBB6_2 diff --git a/test/CodeGen/Thumb2/LowOverheadLoops/wls-search-pred.mir b/test/CodeGen/Thumb2/LowOverheadLoops/wls-search-pred.mir index 234b112050d..e94af93d8cf 100644 --- a/test/CodeGen/Thumb2/LowOverheadLoops/wls-search-pred.mir +++ b/test/CodeGen/Thumb2/LowOverheadLoops/wls-search-pred.mir @@ -63,11 +63,11 @@ body: | ; CHECK: [[t2ADDri:%[0-9]+]]:rgpr = t2ADDri [[COPY]], 15, 14 /* CC::al */, $noreg, $noreg ; CHECK: [[t2BICri:%[0-9]+]]:rgpr = t2BICri killed [[t2ADDri]], 16, 14 /* CC::al */, $noreg, $noreg ; CHECK: [[t2LSRri:%[0-9]+]]:gprlr = t2LSRri killed [[t2BICri]], 4, 14 /* CC::al */, $noreg, $noreg - ; CHECK: [[t2WhileLoopStartLR:%[0-9]+]]:gprlr = t2WhileLoopStartLR killed [[t2LSRri]], %bb.3, implicit-def $cpsr + ; CHECK: [[t2WhileLoopStartTP:%[0-9]+]]:gprlr = t2WhileLoopStartTP killed [[t2LSRri]], [[COPY]], %bb.3, implicit-def $cpsr ; CHECK: bb.2: ; CHECK: successors: %bb.2(0x40000000), %bb.3(0x40000000) ; CHECK: [[PHI:%[0-9]+]]:rgpr = PHI [[COPY2]], %bb.1, %11, %bb.2 - ; CHECK: [[PHI1:%[0-9]+]]:gprlr = PHI [[t2WhileLoopStartLR]], %bb.1, %13, %bb.2 + ; CHECK: [[PHI1:%[0-9]+]]:gprlr = PHI [[t2WhileLoopStartTP]], %bb.1, %13, %bb.2 ; CHECK: [[PHI2:%[0-9]+]]:rgpr = PHI [[COPY]], %bb.1, %15, %bb.2 ; CHECK: [[MVE_VCTP8_:%[0-9]+]]:vccr = MVE_VCTP8 [[PHI2]], 0, $noreg ; CHECK: [[t2SUBri:%[0-9]+]]:rgpr = t2SUBri [[PHI2]], 16, 14 /* CC::al */, $noreg, $noreg diff --git a/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll b/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll index 21ce978ee49..56421bde7b6 100644 --- a/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll +++ b/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll @@ -634,8 +634,7 @@ define dso_local void @arm_mat_mult_q15(i16* noalias nocapture readonly %A, i16* ; CHECK-NEXT: @ in Loop: Header=BB10_5 Depth=1 ; CHECK-NEXT: ldr r0, [sp, #28] @ 4-byte Reload ; CHECK-NEXT: add.w r3, r0, r5, lsl #1 -; CHECK-NEXT: mov r5, r6 -; CHECK-NEXT: wlstp.8 lr, r5, .LBB10_4 +; CHECK-NEXT: wlstp.8 lr, r6, .LBB10_4 ; CHECK-NEXT: b .LBB10_15 ; CHECK-NEXT: .LBB10_4: @ %for.cond1.for.cond.cleanup3_crit_edge.us ; CHECK-NEXT: @ in Loop: Header=BB10_5 Depth=1 diff --git a/test/CodeGen/Thumb2/mve-memtp-loop.ll b/test/CodeGen/Thumb2/mve-memtp-loop.ll index 8929e082864..ea1f75bbc70 100644 --- a/test/CodeGen/Thumb2/mve-memtp-loop.ll +++ b/test/CodeGen/Thumb2/mve-memtp-loop.ll @@ -235,8 +235,7 @@ define void @test11(i8* nocapture %x, i8* nocapture %y, i32 %n) { ; CHECK-NEXT: .LBB10_1: @ %prehead ; CHECK-NEXT: mov r12, r1 ; CHECK-NEXT: mov r4, r0 -; CHECK-NEXT: mov r3, r2 -; CHECK-NEXT: wlstp.8 lr, r3, .LBB10_3 +; CHECK-NEXT: wlstp.8 lr, r2, .LBB10_3 ; CHECK-NEXT: .LBB10_2: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrb.u8 q0, [r12], #16 ; CHECK-NEXT: vstrb.8 q0, [r4], #16 @@ -318,8 +317,7 @@ define void @twoloops(i32* %X, i32 %n, i32 %m) { ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: vmov.i32 q0, #0x0 ; CHECK-NEXT: mov r3, r0 -; CHECK-NEXT: mov r1, r2 -; CHECK-NEXT: wlstp.8 lr, r1, .LBB13_2 +; CHECK-NEXT: wlstp.8 lr, r2, .LBB13_2 ; CHECK-NEXT: .LBB13_1: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vstrb.8 q0, [r3], #16 ; CHECK-NEXT: letp lr, .LBB13_1 @@ -489,8 +487,7 @@ define void @multilooped_exit(i32 %b) { ; CHECK-NEXT: movt r3, :upper16:arr_56 ; CHECK-NEXT: lsr.w r12, r1, #4 ; CHECK-NEXT: mov r2, r3 -; CHECK-NEXT: mov r1, r0 -; CHECK-NEXT: wlstp.8 lr, r1, .LBB18_5 +; CHECK-NEXT: wlstp.8 lr, r0, .LBB18_5 ; CHECK-NEXT: .LBB18_4: @ Parent Loop BB18_3 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 ; CHECK-NEXT: vstrb.8 q0, [r2], #16 @@ -498,8 +495,7 @@ define void @multilooped_exit(i32 %b) { ; CHECK-NEXT: .LBB18_5: @ %loop ; CHECK-NEXT: @ in Loop: Header=BB18_3 Depth=1 ; CHECK-NEXT: mov r2, r3 -; CHECK-NEXT: mov r1, r0 -; CHECK-NEXT: wlstp.8 lr, r1, .LBB18_7 +; CHECK-NEXT: wlstp.8 lr, r0, .LBB18_7 ; CHECK-NEXT: .LBB18_6: @ Parent Loop BB18_3 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 ; CHECK-NEXT: vstrb.8 q0, [r2], #16 @@ -507,8 +503,7 @@ define void @multilooped_exit(i32 %b) { ; CHECK-NEXT: .LBB18_7: @ %loop ; CHECK-NEXT: @ in Loop: Header=BB18_3 Depth=1 ; CHECK-NEXT: mov r2, r3 -; CHECK-NEXT: mov r1, r0 -; CHECK-NEXT: wlstp.8 lr, r1, .LBB18_9 +; CHECK-NEXT: wlstp.8 lr, r0, .LBB18_9 ; CHECK-NEXT: .LBB18_8: @ Parent Loop BB18_3 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 ; CHECK-NEXT: vstrb.8 q0, [r2], #16 @@ -567,12 +562,10 @@ define i32 @reverted(i1 zeroext %b) { ; CHECK-NEXT: movw r0, :lower16:arr_22 ; CHECK-NEXT: vmov.i32 q0, #0x0 ; CHECK-NEXT: movt r0, :upper16:arr_22 -; CHECK-NEXT: str r2, [sp, #4] @ 4-byte Spill ; CHECK-NEXT: add.w r1, r2, #15 ; CHECK-NEXT: lsrs r3, r1, #4 -; CHECK-NEXT: mov r1, r2 -; CHECK-NEXT: str r3, [sp] @ 4-byte Spill -; CHECK-NEXT: wlstp.8 lr, r1, .LBB19_2 +; CHECK-NEXT: strd r3, r2, [sp] @ 8-byte Folded Spill +; CHECK-NEXT: wlstp.8 lr, r2, .LBB19_2 ; CHECK-NEXT: .LBB19_1: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vstrb.8 q0, [r0], #16 ; CHECK-NEXT: letp lr, .LBB19_1 @@ -621,11 +614,12 @@ define i32 @reverted(i1 zeroext %b) { ; CHECK-NEXT: le lr, .LBB19_3 ; CHECK-NEXT: @ %bb.4: @ %for.cond.cleanup6 ; CHECK-NEXT: movw r0, :lower16:arr_22 -; CHECK-NEXT: ldrd r2, r1, [sp] @ 8-byte Folded Reload +; CHECK-NEXT: ldr r2, [sp, #4] @ 4-byte Reload ; CHECK-NEXT: movt r0, :upper16:arr_22 -; CHECK-NEXT: vmov.i32 q1, #0x0 +; CHECK-NEXT: ldr r3, [sp] @ 4-byte Reload ; CHECK-NEXT: add.w r0, r0, #1824 -; CHECK-NEXT: wlstp.8 lr, r1, .LBB19_6 +; CHECK-NEXT: vmov.i32 q1, #0x0 +; CHECK-NEXT: wlstp.8 lr, r2, .LBB19_6 ; CHECK-NEXT: .LBB19_5: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vstrb.8 q1, [r0], #16 ; CHECK-NEXT: letp lr, .LBB19_5 @@ -675,11 +669,12 @@ define i32 @reverted(i1 zeroext %b) { ; CHECK-NEXT: le lr, .LBB19_7 ; CHECK-NEXT: @ %bb.8: @ %for.cond.cleanup6.1 ; CHECK-NEXT: movw r0, :lower16:arr_22 -; CHECK-NEXT: ldrd r2, r1, [sp] @ 8-byte Folded Reload +; CHECK-NEXT: ldr r2, [sp, #4] @ 4-byte Reload ; CHECK-NEXT: movt r0, :upper16:arr_22 -; CHECK-NEXT: vmov.i32 q1, #0x0 +; CHECK-NEXT: ldr r3, [sp] @ 4-byte Reload ; CHECK-NEXT: add.w r0, r0, #3648 -; CHECK-NEXT: wlstp.8 lr, r1, .LBB19_10 +; CHECK-NEXT: vmov.i32 q1, #0x0 +; CHECK-NEXT: wlstp.8 lr, r2, .LBB19_10 ; CHECK-NEXT: .LBB19_9: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vstrb.8 q1, [r0], #16 ; CHECK-NEXT: letp lr, .LBB19_9 @@ -731,19 +726,14 @@ define i32 @reverted(i1 zeroext %b) { ; CHECK-NEXT: le lr, .LBB19_11 ; CHECK-NEXT: @ %bb.12: @ %for.cond.cleanup6.2 ; CHECK-NEXT: movw r0, :lower16:arr_22 -; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload +; CHECK-NEXT: ldrd r2, r1, [sp] @ 8-byte Folded Reload ; CHECK-NEXT: movt r0, :upper16:arr_22 ; CHECK-NEXT: vmov.i32 q1, #0x0 ; CHECK-NEXT: add.w r0, r0, #5472 -; CHECK-NEXT: wls lr, r1, .LBB19_14 +; CHECK-NEXT: wlstp.8 lr, r1, .LBB19_14 ; CHECK-NEXT: .LBB19_13: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldr r1, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: vctp.8 r1 -; CHECK-NEXT: subs r1, #16 -; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill -; CHECK-NEXT: vpst -; CHECK-NEXT: vstrbt.8 q1, [r0], #16 -; CHECK-NEXT: le lr, .LBB19_13 +; CHECK-NEXT: vstrb.8 q1, [r0], #16 +; CHECK-NEXT: letp lr, .LBB19_13 ; CHECK-NEXT: .LBB19_14: @ %for.cond.cleanup6.2 ; CHECK-NEXT: movw r2, :lower16:arr_21 ; CHECK-NEXT: movw r1, #5508