1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-11-25 20:23:11 +01:00

[ARM] Introduce t2WhileLoopStartTP

This adds t2WhileLoopStartTP, similar to the t2DoLoopStartTP added in
D90591. It keeps a reference to both the tripcount register and the
element count register, so that the ARMLowOverheadLoops pass in the
backend can pick the correct one without having to search for it from
the operand of a VCTP.

Differential Revision: https://reviews.llvm.org/D103236
This commit is contained in:
David Green 2021-06-13 13:55:34 +01:00
parent 416150a164
commit 9fd9749580
11 changed files with 109 additions and 97 deletions

View File

@ -6122,8 +6122,9 @@ ARMBaseInstrInfo::getOutliningType(MachineBasicBlock::iterator &MIT,
// Be conservative with ARMv8.1 MVE instructions.
if (Opc == ARM::t2BF_LabelPseudo || Opc == ARM::t2DoLoopStart ||
Opc == ARM::t2DoLoopStartTP || Opc == ARM::t2WhileLoopStart ||
Opc == ARM::t2WhileLoopStartLR || Opc == ARM::t2LoopDec ||
Opc == ARM::t2LoopEnd || Opc == ARM::t2LoopEndDec)
Opc == ARM::t2WhileLoopStartLR || Opc == ARM::t2WhileLoopStartTP ||
Opc == ARM::t2LoopDec || Opc == ARM::t2LoopEnd ||
Opc == ARM::t2LoopEndDec)
return outliner::InstrType::Illegal;
const MCInstrDesc &MCID = MI.getDesc();

View File

@ -367,7 +367,8 @@ public:
bool isUnspillableTerminatorImpl(const MachineInstr *MI) const override {
return MI->getOpcode() == ARM::t2LoopEndDec ||
MI->getOpcode() == ARM::t2DoLoopStartTP ||
MI->getOpcode() == ARM::t2WhileLoopStartLR;
MI->getOpcode() == ARM::t2WhileLoopStartLR ||
MI->getOpcode() == ARM::t2WhileLoopStartTP;
}
private:
@ -645,12 +646,6 @@ static inline bool isJumpTableBranchOpcode(int Opc) {
Opc == ARM::t2BR_JT;
}
static inline bool isLowOverheadTerminatorOpcode(int Opc) {
return Opc == ARM::t2DoLoopStartTP || Opc == ARM::t2WhileLoopStart ||
Opc == ARM::t2WhileLoopStartLR || Opc == ARM::t2LoopEnd ||
Opc == ARM::t2LoopEndDec;
}
static inline
bool isIndirectBranchOpcode(int Opc) {
return Opc == ARM::BX || Opc == ARM::MOVPCRX || Opc == ARM::tBRIND;

View File

@ -15,6 +15,7 @@
#include "ARMBaseInstrInfo.h"
#include "ARMBasicBlockInfo.h"
#include "ARMSubtarget.h"
#include "MVETailPredUtils.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
@ -61,13 +62,13 @@ INITIALIZE_PASS(ARMBlockPlacement, DEBUG_TYPE, "ARM block placement", false,
static MachineInstr *findWLSInBlock(MachineBasicBlock *MBB) {
for (auto &Terminator : MBB->terminators()) {
if (Terminator.getOpcode() == ARM::t2WhileLoopStartLR)
if (isWhileLoopStart(Terminator))
return &Terminator;
}
return nullptr;
}
/// Find t2WhileLoopStartLR in the loop predecessor BB or otherwise in its only
/// Find WhileLoopStart in the loop predecessor BB or otherwise in its only
/// predecessor. Returns the WLS instruction if found, otherwise a null pointer.
static MachineInstr *findWLS(MachineLoop *ML) {
MachineBasicBlock *Predecessor = ML->getLoopPredecessor();
@ -93,7 +94,7 @@ bool ARMBlockPlacement::fixBackwardsWLS(MachineLoop *ML) {
return false;
MachineBasicBlock *Predecessor = WlsInstr->getParent();
MachineBasicBlock *LoopExit = WlsInstr->getOperand(2).getMBB();
MachineBasicBlock *LoopExit = getWhileLoopStartTargetBB(*WlsInstr);
// We don't want to move Preheader to before the function's entry block.
if (!LoopExit->getPrevNode())
@ -118,9 +119,9 @@ bool ARMBlockPlacement::fixBackwardsWLS(MachineLoop *ML) {
++It) {
MachineBasicBlock *MBB = &*It;
for (auto &Terminator : MBB->terminators()) {
if (Terminator.getOpcode() != ARM::t2WhileLoopStartLR)
if (!isWhileLoopStart(Terminator))
continue;
MachineBasicBlock *WLSTarget = Terminator.getOperand(2).getMBB();
MachineBasicBlock *WLSTarget = getWhileLoopStartTargetBB(Terminator);
// TODO: Analyse the blocks to make a decision if it would be worth
// moving Preheader even if we'd introduce a backwards WLS
if (WLSTarget == Predecessor) {

View File

@ -5479,8 +5479,8 @@ let Predicates = [IsThumb2, HasV8_1MMainline, HasLOB] in {
// t2DoLoopStart a pseudo for DLS hardware loops. Lowered into a DLS in
// ARMLowOverheadLoops if possible, or reverted to a Mov if not.
def t2DoLoopStart :
t2PseudoInst<(outs GPRlr:$X), (ins rGPR:$elts), 4, IIC_Br,
[(set GPRlr:$X, (int_start_loop_iterations rGPR:$elts))]>;
t2PseudoInst<(outs GPRlr:$X), (ins rGPR:$tc), 4, IIC_Br,
[(set GPRlr:$X, (int_start_loop_iterations rGPR:$tc))]>;
// A pseudo for a DLSTP, created in the MVETPAndVPTOptimizationPass from a
// t2DoLoopStart if the loop is tail predicated. Holds both the element
@ -5488,7 +5488,7 @@ def t2DoLoopStart :
// ARMLowOverheadLoops when it is converted to a DLSTP or DLS as required.
let isTerminator = 1, hasSideEffects = 1 in
def t2DoLoopStartTP :
t2PseudoInst<(outs GPRlr:$X), (ins rGPR:$elts, rGPR:$count), 4, IIC_Br, []>;
t2PseudoInst<(outs GPRlr:$X), (ins rGPR:$tc, rGPR:$elts), 4, IIC_Br, []>;
// Setup for a t2WhileLoopStart. A pair of t2WhileLoopSetup and t2WhileLoopStart
// will be created post-ISel from a llvm.test.start.loop.iterations. This
@ -5496,7 +5496,7 @@ def t2DoLoopStartTP :
// valid after reg alloc, as it should be lowered during MVETPAndVPTOptimisations
// into a t2WhileLoopStartLR (or expanded).
def t2WhileLoopSetup :
t2PseudoInst<(outs GPRlr:$lr), (ins rGPR:$elts), 4, IIC_Br, []>;
t2PseudoInst<(outs GPRlr:$lr), (ins rGPR:$tc), 4, IIC_Br, []>;
// A pseudo to represent the decrement in a low overhead loop. A t2LoopDec and
// t2LoopEnd together represent a LE instruction. Ideally these are converted
@ -5511,7 +5511,7 @@ let isBranch = 1, isTerminator = 1, hasSideEffects = 1, Defs = [CPSR] in {
// into a t2WhileLoopStartLR that does both the LR setup and branch.
def t2WhileLoopStart :
t2PseudoInst<(outs),
(ins GPRlr:$elts, brtarget:$target),
(ins GPRlr:$tc, brtarget:$target),
4, IIC_Br, []>,
Sched<[WriteBr]>;
@ -5521,13 +5521,21 @@ def t2WhileLoopStart :
// converted into t2CMP and t2Bcc.
def t2WhileLoopStartLR :
t2PseudoInst<(outs GPRlr:$lr),
(ins rGPR:$elts, brtarget:$target),
(ins rGPR:$tc, brtarget:$target),
8, IIC_Br, []>,
Sched<[WriteBr]>;
// Similar to a t2DoLoopStartTP, a t2WhileLoopStartTP is a pseudo for a WLSTP
// holding both the element count and the tripcount of the loop.
//
// Machine operand layout (defs first, then uses):
//   0: $lr     - the GPRlr result defined by the instruction
//   1: $tc     - the tripcount
//   2: $elts   - the element count
//   3: $target - the branch target block
// C++ code that reads the element count as operand 2 or the branch target as
// operand 3 depends on this order; keep them in sync when changing the operands.
def t2WhileLoopStartTP :
t2PseudoInst<(outs GPRlr:$lr),
(ins rGPR:$tc, rGPR:$elts, brtarget:$target),
8, IIC_Br, []>,
Sched<[WriteBr]>;
// t2LoopEnd - the branch half of a t2LoopDec/t2LoopEnd pair.
def t2LoopEnd :
t2PseudoInst<(outs), (ins GPRlr:$elts, brtarget:$target),
t2PseudoInst<(outs), (ins GPRlr:$tc, brtarget:$target),
8, IIC_Br, []>, Sched<[WriteBr]>;
// The combination of a t2LoopDec and t2LoopEnd, performing both the LR
@ -5535,7 +5543,7 @@ def t2LoopEnd :
// LETP in ARMLowOverheadLoops as appropriate, or converted to t2CMP/t2Bcc
// if the branches are out of range.
def t2LoopEndDec :
t2PseudoInst<(outs GPRlr:$Rm), (ins GPRlr:$elts, brtarget:$target),
t2PseudoInst<(outs GPRlr:$Rm), (ins GPRlr:$tc, brtarget:$target),
8, IIC_Br, []>, Sched<[WriteBr]>;
} // end isBranch, isTerminator, hasSideEffects

View File

@ -101,10 +101,6 @@ static bool shouldInspect(MachineInstr &MI) {
return isDomainMVE(&MI) || isVectorPredicate(&MI) || hasVPRUse(MI);
}
static bool isDo(MachineInstr *MI) {
return MI->getOpcode() != ARM::t2WhileLoopStartLR;
}
namespace {
using InstSet = SmallPtrSetImpl<MachineInstr *>;
@ -446,7 +442,7 @@ namespace {
}
unsigned getStartOpcode() const {
bool IsDo = isDo(Start);
bool IsDo = isDoLoopStart(*Start);
if (!IsTailPredicationLegal())
return IsDo ? ARM::t2DLS : ARM::t2WLS;
@ -635,7 +631,8 @@ bool LowOverheadLoop::ValidateTailPredicate() {
// elements is provided to the vctp instruction, so we need to check that
// we can use this register at InsertPt.
MachineInstr *VCTP = VCTPs.back();
if (Start->getOpcode() == ARM::t2DoLoopStartTP) {
if (Start->getOpcode() == ARM::t2DoLoopStartTP ||
Start->getOpcode() == ARM::t2WhileLoopStartTP) {
TPNumElements = Start->getOperand(2);
StartInsertPt = Start;
StartInsertBB = Start->getParent();
@ -778,10 +775,12 @@ bool LowOverheadLoop::ValidateTailPredicate() {
}
}
// If we converted the LoopStart to a t2DoLoopStartTP, we can also remove any
// extra instructions in the preheader, which often includes a now unused MOV.
if (Start->getOpcode() == ARM::t2DoLoopStartTP && Preheader &&
!Preheader->empty() &&
// If we converted the LoopStart to a t2DoLoopStartTP/t2WhileLoopStartTP, we
// can also remove any extra instructions in the preheader, which often
// includes a now unused MOV.
if ((Start->getOpcode() == ARM::t2DoLoopStartTP ||
Start->getOpcode() == ARM::t2WhileLoopStartTP) &&
Preheader && !Preheader->empty() &&
!RDA.hasLocalDefBefore(VCTP, VCTP->getOperand(1).getReg())) {
if (auto *Def = RDA.getUniqueReachingMIDef(
&Preheader->back(), VCTP->getOperand(1).getReg().asMCReg())) {
@ -1045,12 +1044,13 @@ void LowOverheadLoop::Validate(ARMBasicBlockUtils *BBUtils) {
return false;
}
if (Start->getOpcode() == ARM::t2WhileLoopStartLR &&
(BBUtils->getOffsetOf(Start) >
BBUtils->getOffsetOf(Start->getOperand(2).getMBB()) ||
!BBUtils->isBBInRange(Start, Start->getOperand(2).getMBB(), 4094))) {
LLVM_DEBUG(dbgs() << "ARM Loops: WLS offset is out-of-range!\n");
return false;
if (isWhileLoopStart(*Start)) {
MachineBasicBlock *TargetBB = getWhileLoopStartTargetBB(*Start);
if (BBUtils->getOffsetOf(Start) > BBUtils->getOffsetOf(TargetBB) ||
!BBUtils->isBBInRange(Start, TargetBB, 4094)) {
LLVM_DEBUG(dbgs() << "ARM Loops: WLS offset is out-of-range!\n");
return false;
}
}
return true;
};
@ -1289,7 +1289,7 @@ bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) {
// another low register.
void ARMLowOverheadLoops::RevertWhile(MachineInstr *MI) const {
LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to cmp: " << *MI);
MachineBasicBlock *DestBB = MI->getOperand(2).getMBB();
MachineBasicBlock *DestBB = getWhileLoopStartTargetBB(*MI);
unsigned BrOpc = BBUtils->isBBInRange(MI, DestBB, 254) ?
ARM::tBcc : ARM::t2Bcc;
@ -1426,8 +1426,8 @@ MachineInstr* ARMLowOverheadLoops::ExpandLoopStart(LowOverheadLoop &LoLoop) {
MIB.addDef(ARM::LR);
MIB.add(Count);
if (!isDo(Start))
MIB.add(Start->getOperand(2));
if (isWhileLoopStart(*Start))
MIB.addMBB(getWhileLoopStartTargetBB(*Start));
LLVM_DEBUG(dbgs() << "ARM Loops: Inserted start: " << *MIB);
NewStart = &*MIB;
@ -1612,7 +1612,7 @@ void ARMLowOverheadLoops::Expand(LowOverheadLoop &LoLoop) {
};
if (LoLoop.Revert) {
if (LoLoop.Start->getOpcode() == ARM::t2WhileLoopStartLR)
if (isWhileLoopStart(*LoLoop.Start))
RevertWhile(LoLoop.Start);
else
RevertDo(LoLoop.Start);
@ -1683,7 +1683,7 @@ bool ARMLowOverheadLoops::RevertNonLoops() {
Changed = true;
for (auto *Start : Starts) {
if (Start->getOpcode() == ARM::t2WhileLoopStartLR)
if (isWhileLoopStart(*Start))
RevertWhile(Start);
else
RevertDo(Start);

View File

@ -429,7 +429,8 @@ bool MVETPAndVPTOptimisations::ConvertTailPredLoop(MachineLoop *ML,
MachineInstr *LoopEnd, *LoopPhi, *LoopStart, *LoopDec;
if (!findLoopComponents(ML, MRI, LoopStart, LoopPhi, LoopDec, LoopEnd))
return false;
if (LoopDec != LoopEnd || LoopStart->getOpcode() != ARM::t2DoLoopStart)
if (LoopDec != LoopEnd || (LoopStart->getOpcode() != ARM::t2DoLoopStart &&
LoopStart->getOpcode() != ARM::t2WhileLoopStartLR))
return false;
SmallVector<MachineInstr *, 4> VCTPs;
@ -494,12 +495,16 @@ bool MVETPAndVPTOptimisations::ConvertTailPredLoop(MachineLoop *ML,
return false;
}
MachineInstrBuilder MI = BuildMI(*MBB, InsertPt, LoopStart->getDebugLoc(),
TII->get(ARM::t2DoLoopStartTP))
.add(LoopStart->getOperand(0))
.add(LoopStart->getOperand(1))
.addReg(CountReg);
(void)MI;
unsigned NewOpc = LoopStart->getOpcode() == ARM::t2DoLoopStart
? ARM::t2DoLoopStartTP
: ARM::t2WhileLoopStartTP;
MachineInstrBuilder MI =
BuildMI(*MBB, InsertPt, LoopStart->getDebugLoc(), TII->get(NewOpc))
.add(LoopStart->getOperand(0))
.add(LoopStart->getOperand(1))
.addReg(CountReg);
if (NewOpc == ARM::t2WhileLoopStartTP)
MI.add(LoopStart->getOperand(2));
LLVM_DEBUG(dbgs() << "Replacing " << *LoopStart << " with "
<< *MI.getInstr());
MRI->constrainRegClass(CountReg, &ARM::rGPRRegClass);

View File

@ -68,11 +68,26 @@ static inline bool isVCTP(const MachineInstr *MI) {
return false;
}
static inline bool isLoopStart(MachineInstr &MI) {
static inline bool isDoLoopStart(const MachineInstr &MI) {
return MI.getOpcode() == ARM::t2DoLoopStart ||
MI.getOpcode() == ARM::t2DoLoopStartTP ||
MI.getOpcode() == ARM::t2WhileLoopStart ||
MI.getOpcode() == ARM::t2WhileLoopStartLR;
MI.getOpcode() == ARM::t2DoLoopStartTP;
}
// Returns true if MI is one of the while-loop start pseudos:
// t2WhileLoopStart, t2WhileLoopStartLR or t2WhileLoopStartTP.
static inline bool isWhileLoopStart(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  case ARM::t2WhileLoopStart:
  case ARM::t2WhileLoopStartLR:
  case ARM::t2WhileLoopStartTP:
    return true;
  default:
    return false;
  }
}
// Returns true if MI is any low-overhead loop start pseudo, i.e. it is
// accepted by either isDoLoopStart or isWhileLoopStart.
static inline bool isLoopStart(const MachineInstr &MI) {
  if (isDoLoopStart(MI))
    return true;
  return isWhileLoopStart(MI);
}
// Return the target block stored in a WhileLoopStart pseudo.
//
// The index of the brtarget operand depends on the opcode, because the
// variants carry different operands ahead of it:
//   t2WhileLoopStart   : (ins $tc, $target), no defs            -> operand 1
//   t2WhileLoopStartLR : (outs $lr), (ins $tc, $target)         -> operand 2
//   t2WhileLoopStartTP : (outs $lr), (ins $tc, $elts, $target)  -> operand 3
// MI must satisfy isWhileLoopStart; this is asserted.
inline MachineBasicBlock *getWhileLoopStartTargetBB(const MachineInstr &MI) {
  assert(isWhileLoopStart(MI) && "Expected WhileLoopStart!");
  unsigned Op;
  switch (MI.getOpcode()) {
  case ARM::t2WhileLoopStart:
    // No LR def on this variant, so the target directly follows $tc.
    Op = 1;
    break;
  case ARM::t2WhileLoopStartTP:
    // The extra $elts operand pushes the target one slot further out.
    Op = 3;
    break;
  default:
    // t2WhileLoopStartLR.
    Op = 2;
    break;
  }
  return MI.getOperand(Op).getMBB();
}
// WhileLoopStart holds the exit block, so produce a subs Op0, Op1, 0 and then a
@ -84,8 +99,9 @@ inline void RevertWhileLoopStartLR(MachineInstr *MI, const TargetInstrInfo *TII,
unsigned BrOpc = ARM::t2Bcc,
bool UseCmp = false) {
MachineBasicBlock *MBB = MI->getParent();
assert(MI->getOpcode() == ARM::t2WhileLoopStartLR &&
"Only expected a t2WhileLoopStartLR in RevertWhileLoopStartLR!");
assert((MI->getOpcode() == ARM::t2WhileLoopStartLR ||
MI->getOpcode() == ARM::t2WhileLoopStartTP) &&
"Only expected a t2WhileLoopStartLR/TP in RevertWhileLoopStartLR!");
// Subs/Cmp
if (UseCmp) {
@ -109,8 +125,8 @@ inline void RevertWhileLoopStartLR(MachineInstr *MI, const TargetInstrInfo *TII,
// Branch
MachineInstrBuilder MIB =
BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(BrOpc));
MIB.add(MI->getOperand(2)); // branch target
MIB.addImm(ARMCC::EQ); // condition code
MIB.addMBB(getWhileLoopStartTargetBB(*MI)); // branch target
MIB.addImm(ARMCC::EQ); // condition code
MIB.addReg(ARM::CPSR);
MI->eraseFromParent();

View File

@ -17,8 +17,7 @@ define void @test_memcpy(i32* nocapture %x, i32* nocapture readonly %y, i32 %n,
; CHECK-NEXT: @ Child Loop BB0_4 Depth 2
; CHECK-NEXT: adds r4, r1, r7
; CHECK-NEXT: adds r5, r0, r7
; CHECK-NEXT: mov r6, r3
; CHECK-NEXT: wlstp.8 lr, r6, .LBB0_3
; CHECK-NEXT: wlstp.8 lr, r3, .LBB0_3
; CHECK-NEXT: b .LBB0_4
; CHECK-NEXT: .LBB0_3: @ %for.body
; CHECK-NEXT: @ in Loop: Header=BB0_2 Depth=1
@ -71,8 +70,7 @@ define void @test_memset(i32* nocapture %x, i32 %n, i32 %m) {
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB1_4 Depth 2
; CHECK-NEXT: mov r4, r0
; CHECK-NEXT: mov r3, r2
; CHECK-NEXT: wlstp.8 lr, r3, .LBB1_3
; CHECK-NEXT: wlstp.8 lr, r2, .LBB1_3
; CHECK-NEXT: b .LBB1_4
; CHECK-NEXT: .LBB1_3: @ %for.body
; CHECK-NEXT: @ in Loop: Header=BB1_2 Depth=1
@ -285,8 +283,7 @@ define void @test_memset_preheader(i8* %x, i8* %y, i32 %n) {
; CHECK-NEXT: @ %bb.1: @ %prehead
; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: mov r12, r0
; CHECK-NEXT: mov r3, r2
; CHECK-NEXT: wlstp.8 lr, r3, .LBB6_3
; CHECK-NEXT: wlstp.8 lr, r2, .LBB6_3
; CHECK-NEXT: .LBB6_2: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vstrb.8 q0, [r12], #16
; CHECK-NEXT: letp lr, .LBB6_2

View File

@ -63,11 +63,11 @@ body: |
; CHECK: [[t2ADDri:%[0-9]+]]:rgpr = t2ADDri [[COPY]], 15, 14 /* CC::al */, $noreg, $noreg
; CHECK: [[t2BICri:%[0-9]+]]:rgpr = t2BICri killed [[t2ADDri]], 16, 14 /* CC::al */, $noreg, $noreg
; CHECK: [[t2LSRri:%[0-9]+]]:gprlr = t2LSRri killed [[t2BICri]], 4, 14 /* CC::al */, $noreg, $noreg
; CHECK: [[t2WhileLoopStartLR:%[0-9]+]]:gprlr = t2WhileLoopStartLR killed [[t2LSRri]], %bb.3, implicit-def $cpsr
; CHECK: [[t2WhileLoopStartTP:%[0-9]+]]:gprlr = t2WhileLoopStartTP killed [[t2LSRri]], [[COPY]], %bb.3, implicit-def $cpsr
; CHECK: bb.2:
; CHECK: successors: %bb.2(0x40000000), %bb.3(0x40000000)
; CHECK: [[PHI:%[0-9]+]]:rgpr = PHI [[COPY2]], %bb.1, %11, %bb.2
; CHECK: [[PHI1:%[0-9]+]]:gprlr = PHI [[t2WhileLoopStartLR]], %bb.1, %13, %bb.2
; CHECK: [[PHI1:%[0-9]+]]:gprlr = PHI [[t2WhileLoopStartTP]], %bb.1, %13, %bb.2
; CHECK: [[PHI2:%[0-9]+]]:rgpr = PHI [[COPY]], %bb.1, %15, %bb.2
; CHECK: [[MVE_VCTP8_:%[0-9]+]]:vccr = MVE_VCTP8 [[PHI2]], 0, $noreg
; CHECK: [[t2SUBri:%[0-9]+]]:rgpr = t2SUBri [[PHI2]], 16, 14 /* CC::al */, $noreg, $noreg

View File

@ -634,8 +634,7 @@ define dso_local void @arm_mat_mult_q15(i16* noalias nocapture readonly %A, i16*
; CHECK-NEXT: @ in Loop: Header=BB10_5 Depth=1
; CHECK-NEXT: ldr r0, [sp, #28] @ 4-byte Reload
; CHECK-NEXT: add.w r3, r0, r5, lsl #1
; CHECK-NEXT: mov r5, r6
; CHECK-NEXT: wlstp.8 lr, r5, .LBB10_4
; CHECK-NEXT: wlstp.8 lr, r6, .LBB10_4
; CHECK-NEXT: b .LBB10_15
; CHECK-NEXT: .LBB10_4: @ %for.cond1.for.cond.cleanup3_crit_edge.us
; CHECK-NEXT: @ in Loop: Header=BB10_5 Depth=1

View File

@ -235,8 +235,7 @@ define void @test11(i8* nocapture %x, i8* nocapture %y, i32 %n) {
; CHECK-NEXT: .LBB10_1: @ %prehead
; CHECK-NEXT: mov r12, r1
; CHECK-NEXT: mov r4, r0
; CHECK-NEXT: mov r3, r2
; CHECK-NEXT: wlstp.8 lr, r3, .LBB10_3
; CHECK-NEXT: wlstp.8 lr, r2, .LBB10_3
; CHECK-NEXT: .LBB10_2: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrb.u8 q0, [r12], #16
; CHECK-NEXT: vstrb.8 q0, [r4], #16
@ -318,8 +317,7 @@ define void @twoloops(i32* %X, i32 %n, i32 %m) {
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: mov r3, r0
; CHECK-NEXT: mov r1, r2
; CHECK-NEXT: wlstp.8 lr, r1, .LBB13_2
; CHECK-NEXT: wlstp.8 lr, r2, .LBB13_2
; CHECK-NEXT: .LBB13_1: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vstrb.8 q0, [r3], #16
; CHECK-NEXT: letp lr, .LBB13_1
@ -489,8 +487,7 @@ define void @multilooped_exit(i32 %b) {
; CHECK-NEXT: movt r3, :upper16:arr_56
; CHECK-NEXT: lsr.w r12, r1, #4
; CHECK-NEXT: mov r2, r3
; CHECK-NEXT: mov r1, r0
; CHECK-NEXT: wlstp.8 lr, r1, .LBB18_5
; CHECK-NEXT: wlstp.8 lr, r0, .LBB18_5
; CHECK-NEXT: .LBB18_4: @ Parent Loop BB18_3 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: vstrb.8 q0, [r2], #16
@ -498,8 +495,7 @@ define void @multilooped_exit(i32 %b) {
; CHECK-NEXT: .LBB18_5: @ %loop
; CHECK-NEXT: @ in Loop: Header=BB18_3 Depth=1
; CHECK-NEXT: mov r2, r3
; CHECK-NEXT: mov r1, r0
; CHECK-NEXT: wlstp.8 lr, r1, .LBB18_7
; CHECK-NEXT: wlstp.8 lr, r0, .LBB18_7
; CHECK-NEXT: .LBB18_6: @ Parent Loop BB18_3 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: vstrb.8 q0, [r2], #16
@ -507,8 +503,7 @@ define void @multilooped_exit(i32 %b) {
; CHECK-NEXT: .LBB18_7: @ %loop
; CHECK-NEXT: @ in Loop: Header=BB18_3 Depth=1
; CHECK-NEXT: mov r2, r3
; CHECK-NEXT: mov r1, r0
; CHECK-NEXT: wlstp.8 lr, r1, .LBB18_9
; CHECK-NEXT: wlstp.8 lr, r0, .LBB18_9
; CHECK-NEXT: .LBB18_8: @ Parent Loop BB18_3 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: vstrb.8 q0, [r2], #16
@ -567,12 +562,10 @@ define i32 @reverted(i1 zeroext %b) {
; CHECK-NEXT: movw r0, :lower16:arr_22
; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: movt r0, :upper16:arr_22
; CHECK-NEXT: str r2, [sp, #4] @ 4-byte Spill
; CHECK-NEXT: add.w r1, r2, #15
; CHECK-NEXT: lsrs r3, r1, #4
; CHECK-NEXT: mov r1, r2
; CHECK-NEXT: str r3, [sp] @ 4-byte Spill
; CHECK-NEXT: wlstp.8 lr, r1, .LBB19_2
; CHECK-NEXT: strd r3, r2, [sp] @ 8-byte Folded Spill
; CHECK-NEXT: wlstp.8 lr, r2, .LBB19_2
; CHECK-NEXT: .LBB19_1: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vstrb.8 q0, [r0], #16
; CHECK-NEXT: letp lr, .LBB19_1
@ -621,11 +614,12 @@ define i32 @reverted(i1 zeroext %b) {
; CHECK-NEXT: le lr, .LBB19_3
; CHECK-NEXT: @ %bb.4: @ %for.cond.cleanup6
; CHECK-NEXT: movw r0, :lower16:arr_22
; CHECK-NEXT: ldrd r2, r1, [sp] @ 8-byte Folded Reload
; CHECK-NEXT: ldr r2, [sp, #4] @ 4-byte Reload
; CHECK-NEXT: movt r0, :upper16:arr_22
; CHECK-NEXT: vmov.i32 q1, #0x0
; CHECK-NEXT: ldr r3, [sp] @ 4-byte Reload
; CHECK-NEXT: add.w r0, r0, #1824
; CHECK-NEXT: wlstp.8 lr, r1, .LBB19_6
; CHECK-NEXT: vmov.i32 q1, #0x0
; CHECK-NEXT: wlstp.8 lr, r2, .LBB19_6
; CHECK-NEXT: .LBB19_5: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vstrb.8 q1, [r0], #16
; CHECK-NEXT: letp lr, .LBB19_5
@ -675,11 +669,12 @@ define i32 @reverted(i1 zeroext %b) {
; CHECK-NEXT: le lr, .LBB19_7
; CHECK-NEXT: @ %bb.8: @ %for.cond.cleanup6.1
; CHECK-NEXT: movw r0, :lower16:arr_22
; CHECK-NEXT: ldrd r2, r1, [sp] @ 8-byte Folded Reload
; CHECK-NEXT: ldr r2, [sp, #4] @ 4-byte Reload
; CHECK-NEXT: movt r0, :upper16:arr_22
; CHECK-NEXT: vmov.i32 q1, #0x0
; CHECK-NEXT: ldr r3, [sp] @ 4-byte Reload
; CHECK-NEXT: add.w r0, r0, #3648
; CHECK-NEXT: wlstp.8 lr, r1, .LBB19_10
; CHECK-NEXT: vmov.i32 q1, #0x0
; CHECK-NEXT: wlstp.8 lr, r2, .LBB19_10
; CHECK-NEXT: .LBB19_9: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vstrb.8 q1, [r0], #16
; CHECK-NEXT: letp lr, .LBB19_9
@ -731,19 +726,14 @@ define i32 @reverted(i1 zeroext %b) {
; CHECK-NEXT: le lr, .LBB19_11
; CHECK-NEXT: @ %bb.12: @ %for.cond.cleanup6.2
; CHECK-NEXT: movw r0, :lower16:arr_22
; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload
; CHECK-NEXT: ldrd r2, r1, [sp] @ 8-byte Folded Reload
; CHECK-NEXT: movt r0, :upper16:arr_22
; CHECK-NEXT: vmov.i32 q1, #0x0
; CHECK-NEXT: add.w r0, r0, #5472
; CHECK-NEXT: wls lr, r1, .LBB19_14
; CHECK-NEXT: wlstp.8 lr, r1, .LBB19_14
; CHECK-NEXT: .LBB19_13: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr r1, [sp, #4] @ 4-byte Reload
; CHECK-NEXT: vctp.8 r1
; CHECK-NEXT: subs r1, #16
; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill
; CHECK-NEXT: vpst
; CHECK-NEXT: vstrbt.8 q1, [r0], #16
; CHECK-NEXT: le lr, .LBB19_13
; CHECK-NEXT: vstrb.8 q1, [r0], #16
; CHECK-NEXT: letp lr, .LBB19_13
; CHECK-NEXT: .LBB19_14: @ %for.cond.cleanup6.2
; CHECK-NEXT: movw r2, :lower16:arr_21
; CHECK-NEXT: movw r1, #5508