mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-11-22 02:33:06 +01:00
[ARM] Transforming memcpy to Tail predicated Loop
This patch converts the llvm.memcpy intrinsic into Tail Predicated Hardware loops for a target that supports the Arm M-profile Vector Extension (MVE). From an implementation point of view, the patch - adds an ARM-specific SDAG node (to which the llvm.memcpy intrinsic is lowered during the first phase of ISel) - adds a corresponding TableGen entry to generate a pseudo instruction, with a custom inserter, on matching the above node. - adds a custom inserter function that expands the pseudo instruction into MIR suitable to be transformed (by later passes) into a WLSTP loop. Reviewed By: dmgreen Differential Revision: https://reviews.llvm.org/D99723
This commit is contained in:
parent
6e72fd82c1
commit
4adadbc511
@ -1802,6 +1802,7 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
|
||||
MAKE_CASE(ARMISD::CSINV)
|
||||
MAKE_CASE(ARMISD::CSNEG)
|
||||
MAKE_CASE(ARMISD::CSINC)
|
||||
MAKE_CASE(ARMISD::MEMCPYLOOP)
|
||||
#undef MAKE_CASE
|
||||
}
|
||||
return nullptr;
|
||||
@ -11097,6 +11098,141 @@ static bool checkAndUpdateCPSRKill(MachineBasicBlock::iterator SelectItr,
|
||||
return true;
|
||||
}
|
||||
|
||||
/// Adds logic in loop entry MBB to calculate loop iteration count and adds
|
||||
/// t2WhileLoopSetup and t2WhileLoopStart to generate WLS loop
|
||||
static Register genTPEntry(MachineBasicBlock *TpEntry,
|
||||
MachineBasicBlock *TpLoopBody,
|
||||
MachineBasicBlock *TpExit, Register OpSizeReg,
|
||||
const TargetInstrInfo *TII, DebugLoc Dl,
|
||||
MachineRegisterInfo &MRI) {
|
||||
|
||||
// Calculates loop iteration count = ceil(n/16)/16 = ((n + 15)&(-16)) / 16.
|
||||
Register AddDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
|
||||
BuildMI(TpEntry, Dl, TII->get(ARM::t2ADDri), AddDestReg)
|
||||
.addUse(OpSizeReg)
|
||||
.addImm(15)
|
||||
.add(predOps(ARMCC::AL))
|
||||
.addReg(0);
|
||||
|
||||
Register BicDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
|
||||
BuildMI(TpEntry, Dl, TII->get(ARM::t2BICri), BicDestReg)
|
||||
.addUse(AddDestReg, RegState::Kill)
|
||||
.addImm(16)
|
||||
.add(predOps(ARMCC::AL))
|
||||
.addReg(0);
|
||||
|
||||
Register LsrDestReg = MRI.createVirtualRegister(&ARM::GPRlrRegClass);
|
||||
BuildMI(TpEntry, Dl, TII->get(ARM::t2LSRri), LsrDestReg)
|
||||
.addUse(BicDestReg, RegState::Kill)
|
||||
.addImm(4)
|
||||
.add(predOps(ARMCC::AL))
|
||||
.addReg(0);
|
||||
|
||||
Register TotalIterationsReg = MRI.createVirtualRegister(&ARM::GPRlrRegClass);
|
||||
BuildMI(TpEntry, Dl, TII->get(ARM::t2WhileLoopSetup), TotalIterationsReg)
|
||||
.addUse(LsrDestReg, RegState::Kill);
|
||||
|
||||
BuildMI(TpEntry, Dl, TII->get(ARM::t2WhileLoopStart))
|
||||
.addUse(TotalIterationsReg)
|
||||
.addMBB(TpExit);
|
||||
|
||||
return TotalIterationsReg;
|
||||
}
|
||||
|
||||
/// Adds logic in the loopBody MBB to generate MVE_VCTP, t2DoLoopDec and
/// t2DoLoopEnd. These are used by later passes to generate tail predicated
/// loops.
///
/// Each iteration copies 16 bytes via predicated, post-incremented
/// VLDRB/VSTRB; the VCTP-generated lane mask disables the excess byte lanes
/// on the final (partial) iteration.
///
/// \param TpLoopBody         Loop body MBB the instructions are appended to.
/// \param TpEntry            Predecessor block (initial PHI inputs).
/// \param TpExit             Block branched to once the loop finishes.
/// \param OpSrcReg           Initial source pointer.
/// \param OpDestReg          Initial destination pointer.
/// \param ElementCountReg    Total number of bytes to copy.
/// \param TotalIterationsReg Trip count computed in the entry block.
static void genTPLoopBody(MachineBasicBlock *TpLoopBody,
                          MachineBasicBlock *TpEntry, MachineBasicBlock *TpExit,
                          const TargetInstrInfo *TII, DebugLoc Dl,
                          MachineRegisterInfo &MRI, Register OpSrcReg,
                          Register OpDestReg, Register ElementCountReg,
                          Register TotalIterationsReg) {

  // First insert 4 PHI nodes for: the current position in the src array, the
  // current position in the dest array, the loop iteration counter and the
  // predication (remaining-elements) counter. Each PHI merges its initial
  // value (flowing in from TpEntry) with the value recomputed on the
  // back-edge from TpLoopBody itself.

  // Current position in the src array
  Register SrcPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
  Register CurrSrcReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
  BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), SrcPhiReg)
      .addUse(OpSrcReg)
      .addMBB(TpEntry)
      .addUse(CurrSrcReg)
      .addMBB(TpLoopBody);

  // Current position in the dest array
  Register DestPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
  Register CurrDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
  BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), DestPhiReg)
      .addUse(OpDestReg)
      .addMBB(TpEntry)
      .addUse(CurrDestReg)
      .addMBB(TpLoopBody);

  // Current loop counter
  Register LoopCounterPhiReg = MRI.createVirtualRegister(&ARM::GPRlrRegClass);
  Register RemainingLoopIterationsReg =
      MRI.createVirtualRegister(&ARM::GPRlrRegClass);
  BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), LoopCounterPhiReg)
      .addUse(TotalIterationsReg)
      .addMBB(TpEntry)
      .addUse(RemainingLoopIterationsReg)
      .addMBB(TpLoopBody);

  // Predication counter (bytes still to be copied)
  Register PredCounterPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
  Register RemainingElementsReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
  BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), PredCounterPhiReg)
      .addUse(ElementCountReg)
      .addMBB(TpEntry)
      .addUse(RemainingElementsReg)
      .addMBB(TpLoopBody);

  // Pass the predication counter to VCTP, which produces a lane mask with
  // one active byte lane per remaining element (up to all 16 lanes).
  Register VccrReg = MRI.createVirtualRegister(&ARM::VCCRRegClass);
  BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VCTP8), VccrReg)
      .addUse(PredCounterPhiReg)
      .addImm(ARMVCC::None)
      .addReg(0);

  // 16 bytes are consumed per iteration, so decrement the predication
  // counter accordingly for the next iteration's VCTP.
  BuildMI(TpLoopBody, Dl, TII->get(ARM::t2SUBri), RemainingElementsReg)
      .addUse(PredCounterPhiReg)
      .addImm(16)
      .add(predOps(ARMCC::AL))
      .addReg(0);

  // VLDRB and VSTRB instructions, predicated using VPR (VccrReg, "Then"
  // predication). Both post-increment their pointer by 16, defining the
  // back-edge PHI inputs CurrSrcReg / CurrDestReg.
  Register LoadedValueReg = MRI.createVirtualRegister(&ARM::MQPRRegClass);
  BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VLDRBU8_post))
      .addDef(CurrSrcReg)
      .addDef(LoadedValueReg)
      .addReg(SrcPhiReg)
      .addImm(16)
      .addImm(ARMVCC::Then)
      .addUse(VccrReg);

  BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VSTRBU8_post))
      .addDef(CurrDestReg)
      .addUse(LoadedValueReg, RegState::Kill)
      .addReg(DestPhiReg)
      .addImm(16)
      .addImm(ARMVCC::Then)
      .addUse(VccrReg);

  // Add the pseudoInstrs for decrementing the loop counter and marking the
  // end: t2DoLoopDec and t2DoLoopEnd. t2LoopEnd branches back to the body
  // while iterations remain.
  BuildMI(TpLoopBody, Dl, TII->get(ARM::t2LoopDec), RemainingLoopIterationsReg)
      .addUse(LoopCounterPhiReg)
      .addImm(1);

  BuildMI(TpLoopBody, Dl, TII->get(ARM::t2LoopEnd))
      .addUse(RemainingLoopIterationsReg)
      .addMBB(TpLoopBody);

  // Otherwise control falls through to this unconditional branch to the
  // exit block.
  BuildMI(TpLoopBody, Dl, TII->get(ARM::t2B))
      .addMBB(TpExit)
      .add(predOps(ARMCC::AL));
}
|
||||
|
||||
MachineBasicBlock *
|
||||
ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
|
||||
MachineBasicBlock *BB) const {
|
||||
@ -11123,6 +11259,95 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
|
||||
return BB;
|
||||
}
|
||||
|
||||
case ARM::MVE_MEMCPYLOOPINST: {
|
||||
|
||||
// Transformation below expands MVE_MEMCPYLOOPINST Pseudo instruction
|
||||
// into a Tail Predicated (TP) Loop. It adds the instructions to calculate
|
||||
// the iteration count =ceil(size_in_bytes/16)) in the TP entry block and
|
||||
// adds the relevant instructions in the TP loop Body for generation of a
|
||||
// WLSTP loop.
|
||||
|
||||
// Below is relevant portion of the CFG after the transformation.
|
||||
// The Machine Basic Blocks are shown along with branch conditions (in
|
||||
// brackets). Note that TP entry/exit MBBs depict the entry/exit of this
|
||||
// portion of the CFG and may not necessarily be the entry/exit of the
|
||||
// function.
|
||||
|
||||
// (Relevant) CFG after transformation:
|
||||
// TP entry MBB
|
||||
// |
|
||||
// |-----------------|
|
||||
// (n <= 0) (n > 0)
|
||||
// | |
|
||||
// | TP loop Body MBB<--|
|
||||
// | | |
|
||||
// \ |___________|
|
||||
// \ /
|
||||
// TP exit MBB
|
||||
|
||||
MachineFunction *MF = BB->getParent();
|
||||
MachineFunctionProperties &Properties = MF->getProperties();
|
||||
MachineRegisterInfo &MRI = MF->getRegInfo();
|
||||
|
||||
Register OpDestReg = MI.getOperand(0).getReg();
|
||||
Register OpSrcReg = MI.getOperand(1).getReg();
|
||||
Register OpSizeReg = MI.getOperand(2).getReg();
|
||||
|
||||
// Allocate the required MBBs and add to parent function.
|
||||
MachineBasicBlock *TpEntry = BB;
|
||||
MachineBasicBlock *TpLoopBody = MF->CreateMachineBasicBlock();
|
||||
MachineBasicBlock *TpExit;
|
||||
|
||||
MF->push_back(TpLoopBody);
|
||||
|
||||
// If any instructions are present in the current block after
|
||||
// MVE_MEMCPYLOOPINST, split the current block and move the instructions
|
||||
// into the newly created exit block. If there are no instructions
|
||||
// add an explicit branch to the FallThrough block and then split.
|
||||
//
|
||||
// The split is required for two reasons:
|
||||
// 1) A terminator(t2WhileLoopStart) will be placed at that site.
|
||||
// 2) Since a TPLoopBody will be added later, any phis in successive blocks
|
||||
// need to be updated. splitAt() already handles this.
|
||||
TpExit = BB->splitAt(MI, false);
|
||||
if (TpExit == BB) {
|
||||
assert(BB->canFallThrough() &&
|
||||
"Exit block must be FallThrough of the block containing memcpy");
|
||||
TpExit = BB->getFallThrough();
|
||||
BuildMI(BB, dl, TII->get(ARM::t2B))
|
||||
.addMBB(TpExit)
|
||||
.add(predOps(ARMCC::AL));
|
||||
TpExit = BB->splitAt(MI, false);
|
||||
}
|
||||
|
||||
// Add logic for iteration count
|
||||
Register TotalIterationsReg =
|
||||
genTPEntry(TpEntry, TpLoopBody, TpExit, OpSizeReg, TII, dl, MRI);
|
||||
|
||||
// Add the vectorized (and predicated) loads/store instructions
|
||||
genTPLoopBody(TpLoopBody, TpEntry, TpExit, TII, dl, MRI, OpSrcReg,
|
||||
OpDestReg, OpSizeReg, TotalIterationsReg);
|
||||
|
||||
// Required to avoid conflict with the MachineVerifier during testing.
|
||||
Properties.reset(MachineFunctionProperties::Property::NoPHIs);
|
||||
|
||||
// Connect the blocks
|
||||
TpEntry->addSuccessor(TpLoopBody);
|
||||
TpLoopBody->addSuccessor(TpLoopBody);
|
||||
TpLoopBody->addSuccessor(TpExit);
|
||||
|
||||
// Reorder for a more natural layout
|
||||
TpLoopBody->moveAfter(TpEntry);
|
||||
TpExit->moveAfter(TpLoopBody);
|
||||
|
||||
// Finally, remove the memcpy Psuedo Instruction
|
||||
MI.eraseFromParent();
|
||||
|
||||
// Return the exit block as it may contain other instructions requiring a
|
||||
// custom inserter
|
||||
return TpExit;
|
||||
}
|
||||
|
||||
// The Thumb2 pre-indexed stores have the same MI operands, they just
|
||||
// define them differently in the .td files from the isel patterns, so
|
||||
// they need pseudos.
|
||||
|
@ -300,6 +300,10 @@ class VectorType;
|
||||
// instructions.
|
||||
MEMCPY,
|
||||
|
||||
// Pseudo-instruction representing a memory copy using a tail predicated
|
||||
// loop
|
||||
MEMCPYLOOP,
|
||||
|
||||
// V8.1MMainline condition select
|
||||
CSINV, // Conditional select invert.
|
||||
CSNEG, // Conditional select negate.
|
||||
|
@ -6865,6 +6865,18 @@ class MVE_WLSTP<string asm, bits<2> size>
|
||||
let isTerminator = 1;
|
||||
}
|
||||
|
||||
def SDT_MVEMEMCPYLOOPNODE
|
||||
: SDTypeProfile<0, 3, [SDTCisPtrTy<0>, SDTCisPtrTy<1>, SDTCisVT<2, i32>]>;
|
||||
def MVE_MEMCPYLOOPNODE : SDNode<"ARMISD::MEMCPYLOOP", SDT_MVEMEMCPYLOOPNODE,
|
||||
[SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
|
||||
|
||||
let usesCustomInserter = 1, hasNoSchedulingInfo = 1 in {
|
||||
def MVE_MEMCPYLOOPINST : PseudoInst<(outs),
|
||||
(ins rGPR:$dst, rGPR:$src, rGPR:$sz),
|
||||
NoItinerary,
|
||||
[(MVE_MEMCPYLOOPNODE rGPR:$dst, rGPR:$src, rGPR:$sz)]>;
|
||||
}
|
||||
|
||||
def MVE_DLSTP_8 : MVE_DLSTP<"dlstp.8", 0b00>;
|
||||
def MVE_DLSTP_16 : MVE_DLSTP<"dlstp.16", 0b01>;
|
||||
def MVE_DLSTP_32 : MVE_DLSTP<"dlstp.32", 0b10>;
|
||||
|
@ -11,12 +11,27 @@
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "ARMTargetMachine.h"
|
||||
#include "ARMTargetTransformInfo.h"
|
||||
#include "llvm/CodeGen/SelectionDAG.h"
|
||||
#include "llvm/IR/DerivedTypes.h"
|
||||
#include "llvm/Support/CommandLine.h"
|
||||
using namespace llvm;
|
||||
|
||||
#define DEBUG_TYPE "arm-selectiondag-info"
|
||||
|
||||
// Command-line control for converting memcpy/memtransfer calls into an MVE
// Tail Predicated (WLSTP) loop. Defaults to ForceDisabled; "allow" defers
// the decision to the profitability checks in EmitTargetCodeForMemcpy.
// NOTE(review): not static / not in an anonymous namespace -- presumably
// referenced from another translation unit; confirm before narrowing linkage.
cl::opt<TPLoop::MemTransfer> EnableMemtransferTPLoop(
    "arm-memtransfer-tploop", cl::Hidden,
    cl::desc("Control conversion of memcpy to "
             "Tail predicated loops (WLSTP)"),
    cl::init(TPLoop::ForceDisabled),
    cl::values(clEnumValN(TPLoop::ForceDisabled, "force-disabled",
                          "Don't convert memcpy to TP loop."),
               clEnumValN(TPLoop::ForceEnabled, "force-enabled",
                          "Always convert memcpy to TP loop."),
               clEnumValN(TPLoop::Allow, "allow",
                          "Allow (may be subject to certain conditions) "
                          "conversion of memcpy to TP loop.")));
|
||||
|
||||
// Emit, if possible, a specialized version of the given Libcall. Typically this
|
||||
// means selecting the appropriately aligned version, but we also convert memset
|
||||
// of 0 into memclr.
|
||||
@ -130,13 +145,40 @@ SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(
|
||||
MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
|
||||
const ARMSubtarget &Subtarget =
|
||||
DAG.getMachineFunction().getSubtarget<ARMSubtarget>();
|
||||
ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
|
||||
|
||||
auto GenInlineTP = [&](const ARMSubtarget &Subtarget,
|
||||
const SelectionDAG &DAG) {
|
||||
auto &F = DAG.getMachineFunction().getFunction();
|
||||
if (!EnableMemtransferTPLoop)
|
||||
return false;
|
||||
if (EnableMemtransferTPLoop == TPLoop::ForceEnabled)
|
||||
return true;
|
||||
// Do not generate inline TP loop if optimizations is disabled,
|
||||
// or if optimization for size (-Os or -Oz) is on.
|
||||
if (F.hasOptNone() || F.hasOptSize())
|
||||
return false;
|
||||
// If cli option is unset
|
||||
if (!ConstantSize && Alignment >= Align(4))
|
||||
return true;
|
||||
if (ConstantSize &&
|
||||
ConstantSize->getZExtValue() > Subtarget.getMaxInlineSizeThreshold() &&
|
||||
ConstantSize->getZExtValue() <
|
||||
Subtarget.getMaxTPLoopInlineSizeThreshold())
|
||||
return true;
|
||||
return false;
|
||||
};
|
||||
|
||||
if (Subtarget.hasMVEIntegerOps() && GenInlineTP(Subtarget, DAG))
|
||||
return DAG.getNode(ARMISD::MEMCPYLOOP, dl, MVT::Other, Chain, Dst, Src,
|
||||
DAG.getZExtOrTrunc(Size, dl, MVT::i32));
|
||||
|
||||
// Do repeated 4-byte loads and stores. To be improved.
|
||||
// This requires 4-byte alignment.
|
||||
if (Alignment < Align(4))
|
||||
return SDValue();
|
||||
// This requires the copy size to be a constant, preferably
|
||||
// within a subtarget-specific limit.
|
||||
ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
|
||||
if (!ConstantSize)
|
||||
return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
|
||||
Alignment.value(), RTLIB::MEMCPY);
|
||||
|
@ -538,6 +538,11 @@ public:
|
||||
return 64;
|
||||
}
|
||||
|
||||
/// getMaxTPLoopInlineSizeThreshold - Returns the maximum memcpy size
/// (in bytes) that still makes it profitable to inline the call as a Tail
/// Predicated loop.
unsigned getMaxTPLoopInlineSizeThreshold() const { return 128; }
|
||||
|
||||
/// ParseSubtargetFeatures - Parses features string setting specified
|
||||
/// subtarget options. Definition of function is auto generated by tblgen.
|
||||
void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS);
|
||||
|
@ -48,6 +48,11 @@ namespace TailPredication {
|
||||
};
|
||||
}
|
||||
|
||||
// For controlling conversion of memcpy into Tail Predicated loop.
namespace TPLoop {
// ForceDisabled: never convert memcpy to a TP loop (the cl::opt default).
// ForceEnabled:  always convert memcpy to a TP loop.
// Allow:         convert when the profitability conditions permit.
enum MemTransfer { ForceDisabled = 0, ForceEnabled, Allow };
}
|
||||
|
||||
class ARMTTIImpl : public BasicTTIImplBase<ARMTTIImpl> {
|
||||
using BaseT = BasicTTIImplBase<ARMTTIImpl>;
|
||||
using TTI = TargetTransformInfo;
|
||||
|
@ -1,34 +1,39 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs -tail-predication=enabled -o - %s | FileCheck %s
|
||||
; RUN: llc --arm-memtransfer-tploop=allow -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs -tail-predication=enabled -o - %s | FileCheck %s
|
||||
|
||||
define void @test_memcpy(i32* nocapture %x, i32* nocapture readonly %y, i32 %n, i32 %m) {
|
||||
; CHECK-LABEL: test_memcpy:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, lr}
|
||||
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr}
|
||||
; CHECK-NEXT: .pad #4
|
||||
; CHECK-NEXT: sub sp, #4
|
||||
; CHECK-NEXT: .save {r4, r5, r6, r7, lr}
|
||||
; CHECK-NEXT: push {r4, r5, r6, r7, lr}
|
||||
; CHECK-NEXT: cmp r2, #1
|
||||
; CHECK-NEXT: blt .LBB0_3
|
||||
; CHECK-NEXT: blt .LBB0_5
|
||||
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
|
||||
; CHECK-NEXT: mov r8, r3
|
||||
; CHECK-NEXT: mov r5, r2
|
||||
; CHECK-NEXT: mov r9, r1
|
||||
; CHECK-NEXT: mov r7, r0
|
||||
; CHECK-NEXT: lsls r4, r3, #2
|
||||
; CHECK-NEXT: movs r6, #0
|
||||
; CHECK-NEXT: lsl.w r12, r3, #2
|
||||
; CHECK-NEXT: movs r7, #0
|
||||
; CHECK-NEXT: b .LBB0_2
|
||||
; CHECK-NEXT: .LBB0_2: @ %for.body
|
||||
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
||||
; CHECK-NEXT: adds r0, r7, r6
|
||||
; CHECK-NEXT: add.w r1, r9, r6
|
||||
; CHECK-NEXT: mov r2, r8
|
||||
; CHECK-NEXT: bl __aeabi_memcpy4
|
||||
; CHECK-NEXT: add r6, r4
|
||||
; CHECK-NEXT: subs r5, #1
|
||||
; CHECK-NEXT: bne .LBB0_2
|
||||
; CHECK-NEXT: .LBB0_3: @ %for.cond.cleanup
|
||||
; CHECK-NEXT: add sp, #4
|
||||
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc}
|
||||
; CHECK-NEXT: @ =>This Loop Header: Depth=1
|
||||
; CHECK-NEXT: @ Child Loop BB0_4 Depth 2
|
||||
; CHECK-NEXT: adds r4, r1, r7
|
||||
; CHECK-NEXT: adds r5, r0, r7
|
||||
; CHECK-NEXT: mov r6, r3
|
||||
; CHECK-NEXT: wlstp.8 lr, r6, .LBB0_3
|
||||
; CHECK-NEXT: b .LBB0_4
|
||||
; CHECK-NEXT: .LBB0_3: @ %for.body
|
||||
; CHECK-NEXT: @ in Loop: Header=BB0_2 Depth=1
|
||||
; CHECK-NEXT: add r7, r12
|
||||
; CHECK-NEXT: subs r2, #1
|
||||
; CHECK-NEXT: beq .LBB0_5
|
||||
; CHECK-NEXT: b .LBB0_2
|
||||
; CHECK-NEXT: .LBB0_4: @ Parent Loop BB0_2 Depth=1
|
||||
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
|
||||
; CHECK-NEXT: vldrb.u8 q0, [r4], #16
|
||||
; CHECK-NEXT: vstrb.8 q0, [r5], #16
|
||||
; CHECK-NEXT: letp lr, .LBB0_4
|
||||
; CHECK-NEXT: b .LBB0_3
|
||||
; CHECK-NEXT: .LBB0_5: @ %for.cond.cleanup
|
||||
; CHECK-NEXT: pop {r4, r5, r6, r7, pc}
|
||||
entry:
|
||||
%cmp8 = icmp sgt i32 %n, 0
|
||||
br i1 %cmp8, label %for.body, label %for.cond.cleanup
|
||||
|
285
test/CodeGen/Thumb2/mve-tp-loop.ll
Normal file
285
test/CodeGen/Thumb2/mve-tp-loop.ll
Normal file
@ -0,0 +1,285 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc --arm-memtransfer-tploop=allow -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve --verify-machineinstrs %s -o - | FileCheck %s
|
||||
|
||||
; Check that WLSTP loop is not generated for alignment < 4
|
||||
; void test1(char* dest, char* src, int n){
|
||||
; memcpy(dest, src, n);
|
||||
; }
|
||||
|
||||
declare void @llvm.memcpy.p0i8.p0i8.i32(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i32, i1 immarg) #1
|
||||
declare void @llvm.memcpy.p0i8.p0i8.i64(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i64, i1 immarg) #1
|
||||
|
||||
define void @test1(i8* noalias nocapture %X, i8* noalias nocapture readonly %Y, i32 %n){
|
||||
; CHECK-LABEL: test1:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: .save {r7, lr}
|
||||
; CHECK-NEXT: push {r7, lr}
|
||||
; CHECK-NEXT: bl __aeabi_memcpy
|
||||
; CHECK-NEXT: pop {r7, pc}
|
||||
entry:
|
||||
call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %X, i8* align 1 %Y, i32 %n, i1 false)
|
||||
ret void
|
||||
}
|
||||
|
||||
|
||||
; Check that WLSTP loop is generated for alignment >= 4
|
||||
; void test2(int* restrict X, int* restrict Y, int n){
|
||||
; memcpy(X, Y, n);
|
||||
; }
|
||||
|
||||
define void @test2(i32* noalias %X, i32* noalias readonly %Y, i32 %n){
|
||||
; CHECK-LABEL: test2:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: .save {r7, lr}
|
||||
; CHECK-NEXT: push {r7, lr}
|
||||
; CHECK-NEXT: wlstp.8 lr, r2, .LBB1_2
|
||||
; CHECK-NEXT: .LBB1_1: @ =>This Inner Loop Header: Depth=1
|
||||
; CHECK-NEXT: vldrb.u8 q0, [r1], #16
|
||||
; CHECK-NEXT: vstrb.8 q0, [r0], #16
|
||||
; CHECK-NEXT: letp lr, .LBB1_1
|
||||
; CHECK-NEXT: .LBB1_2: @ %entry
|
||||
; CHECK-NEXT: pop {r7, pc}
|
||||
entry:
|
||||
%0 = bitcast i32* %X to i8*
|
||||
%1 = bitcast i32* %Y to i8*
|
||||
call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %0, i8* align 4 %1, i32 %n, i1 false)
|
||||
ret void
|
||||
}
|
||||
|
||||
|
||||
; Checks that transform handles some arithmetic on the input arguments.
|
||||
; void test3(int* restrict X, int* restrict Y, int n)
|
||||
; {
|
||||
; memcpy(X+2, Y+3, (n*2)+10);
|
||||
; }
|
||||
|
||||
define void @test3(i32* noalias nocapture %X, i32* noalias nocapture readonly %Y, i32 %n) {
|
||||
; CHECK-LABEL: test3:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: .save {r7, lr}
|
||||
; CHECK-NEXT: push {r7, lr}
|
||||
; CHECK-NEXT: movs r3, #10
|
||||
; CHECK-NEXT: add.w r2, r3, r2, lsl #1
|
||||
; CHECK-NEXT: adds r1, #12
|
||||
; CHECK-NEXT: adds r0, #8
|
||||
; CHECK-NEXT: wlstp.8 lr, r2, .LBB2_2
|
||||
; CHECK-NEXT: .LBB2_1: @ =>This Inner Loop Header: Depth=1
|
||||
; CHECK-NEXT: vldrb.u8 q0, [r1], #16
|
||||
; CHECK-NEXT: vstrb.8 q0, [r0], #16
|
||||
; CHECK-NEXT: letp lr, .LBB2_1
|
||||
; CHECK-NEXT: .LBB2_2: @ %entry
|
||||
; CHECK-NEXT: pop {r7, pc}
|
||||
entry:
|
||||
%add.ptr = getelementptr inbounds i32, i32* %X, i32 2
|
||||
%0 = bitcast i32* %add.ptr to i8*
|
||||
%add.ptr1 = getelementptr inbounds i32, i32* %Y, i32 3
|
||||
%1 = bitcast i32* %add.ptr1 to i8*
|
||||
%mul = shl nsw i32 %n, 1
|
||||
%add = add nsw i32 %mul, 10
|
||||
call void @llvm.memcpy.p0i8.p0i8.i32(i8* nonnull align 4 %0, i8* nonnull align 4 %1, i32 %add, i1 false)
|
||||
ret void
|
||||
}
|
||||
|
||||
|
||||
; Checks that transform handles for loops that are implicitly converted to mempcy
|
||||
; void test4(int* restrict X, int* restrict Y, int n){
|
||||
; for(int i = 0; i < n; ++i){
|
||||
; X[i] = Y[i];
|
||||
; }
|
||||
; }
|
||||
|
||||
define void @test4(i32* noalias %X, i32* noalias readonly %Y, i32 %n) {
|
||||
; CHECK-LABEL: test4:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: cmp r2, #1
|
||||
; CHECK-NEXT: it lt
|
||||
; CHECK-NEXT: bxlt lr
|
||||
; CHECK-NEXT: .LBB3_1: @ %for.body.preheader
|
||||
; CHECK-NEXT: .save {r7, lr}
|
||||
; CHECK-NEXT: push {r7, lr}
|
||||
; CHECK-NEXT: wlstp.8 lr, r2, .LBB3_3
|
||||
; CHECK-NEXT: .LBB3_2: @ =>This Inner Loop Header: Depth=1
|
||||
; CHECK-NEXT: vldrb.u8 q0, [r1], #16
|
||||
; CHECK-NEXT: vstrb.8 q0, [r0], #16
|
||||
; CHECK-NEXT: letp lr, .LBB3_2
|
||||
; CHECK-NEXT: .LBB3_3: @ %for.body.preheader
|
||||
; CHECK-NEXT: pop.w {r7, lr}
|
||||
; CHECK-NEXT: bx lr
|
||||
entry:
|
||||
%cmp6 = icmp sgt i32 %n, 0
|
||||
br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
|
||||
|
||||
for.body.preheader: ; preds = %entry
|
||||
%X.bits = bitcast i32* %X to i8*
|
||||
%Y.bits = bitcast i32* %Y to i8*
|
||||
call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %X.bits, i8* align 4 %Y.bits, i32 %n, i1 false)
|
||||
br label %for.cond.cleanup
|
||||
|
||||
for.cond.cleanup: ; preds = %for.body.preheader, %entry
|
||||
ret void
|
||||
}
|
||||
|
||||
; Checks that transform can handle > i32 size inputs
|
||||
define void @test5(i8* noalias %X, i8* noalias %Y, i64 %n){
|
||||
; CHECK-LABEL: test5:
|
||||
; CHECK: @ %bb.0:
|
||||
; CHECK-NEXT: .save {r7, lr}
|
||||
; CHECK-NEXT: push {r7, lr}
|
||||
; CHECK-NEXT: wlstp.8 lr, r2, .LBB4_2
|
||||
; CHECK-NEXT: .LBB4_1: @ =>This Inner Loop Header: Depth=1
|
||||
; CHECK-NEXT: vldrb.u8 q0, [r1], #16
|
||||
; CHECK-NEXT: vstrb.8 q0, [r0], #16
|
||||
; CHECK-NEXT: letp lr, .LBB4_1
|
||||
; CHECK-NEXT: .LBB4_2:
|
||||
; CHECK-NEXT: pop {r7, pc}
|
||||
call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %X, i8* align 4 %Y, i64 %n, i1 false)
|
||||
ret void
|
||||
}
|
||||
|
||||
; Checks the transform is applied for constant size inputs below a certain threshold (128 in this case)
|
||||
define void @test6(i32* noalias nocapture %X, i32* noalias nocapture readonly %Y, i32 %n) {
|
||||
; CHECK-LABEL: test6:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: .save {r7, lr}
|
||||
; CHECK-NEXT: push {r7, lr}
|
||||
; CHECK-NEXT: movs r2, #127
|
||||
; CHECK-NEXT: wlstp.8 lr, r2, .LBB5_2
|
||||
; CHECK-NEXT: .LBB5_1: @ =>This Inner Loop Header: Depth=1
|
||||
; CHECK-NEXT: vldrb.u8 q0, [r1], #16
|
||||
; CHECK-NEXT: vstrb.8 q0, [r0], #16
|
||||
; CHECK-NEXT: letp lr, .LBB5_1
|
||||
; CHECK-NEXT: .LBB5_2: @ %entry
|
||||
; CHECK-NEXT: pop {r7, pc}
|
||||
entry:
|
||||
%0 = bitcast i32* %X to i8*
|
||||
%1 = bitcast i32* %Y to i8*
|
||||
call void @llvm.memcpy.p0i8.p0i8.i32(i8* noundef nonnull align 4 dereferenceable(127) %0, i8* noundef nonnull align 4 dereferenceable(127) %1, i32 127, i1 false)
|
||||
ret void
|
||||
}
|
||||
|
||||
; Checks the transform is NOT applied for constant size inputs above a certain threshold (128 in this case)
|
||||
define void @test7(i32* noalias nocapture %X, i32* noalias nocapture readonly %Y, i32 %n) {
|
||||
; CHECK-LABEL: test7:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: .save {r7, lr}
|
||||
; CHECK-NEXT: push {r7, lr}
|
||||
; CHECK-NEXT: movs r2, #128
|
||||
; CHECK-NEXT: bl __aeabi_memcpy4
|
||||
; CHECK-NEXT: pop {r7, pc}
|
||||
entry:
|
||||
%0 = bitcast i32* %X to i8*
|
||||
%1 = bitcast i32* %Y to i8*
|
||||
call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %0, i8* align 4 %1, i32 128, i1 false)
|
||||
ret void
|
||||
}
|
||||
|
||||
; Checks the transform is NOT applied for constant size inputs below a certain threshold (64 in this case)
|
||||
define void @test8(i32* noalias nocapture %X, i32* noalias nocapture readonly %Y, i32 %n) {
|
||||
; CHECK-LABEL: test8:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: .save {r4, lr}
|
||||
; CHECK-NEXT: push {r4, lr}
|
||||
; CHECK-NEXT: ldm.w r1!, {r2, r3, r4, r12, lr}
|
||||
; CHECK-NEXT: stm.w r0!, {r2, r3, r4, r12, lr}
|
||||
; CHECK-NEXT: ldm.w r1!, {r2, r3, r4, r12, lr}
|
||||
; CHECK-NEXT: stm.w r0!, {r2, r3, r4, r12, lr}
|
||||
; CHECK-NEXT: ldm.w r1, {r2, r3, r4, r12, lr}
|
||||
; CHECK-NEXT: stm.w r0, {r2, r3, r4, r12, lr}
|
||||
; CHECK-NEXT: pop {r4, pc}
|
||||
entry:
|
||||
%0 = bitcast i32* %X to i8*
|
||||
%1 = bitcast i32* %Y to i8*
|
||||
call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %0, i8* align 4 %1, i32 60, i1 false)
|
||||
ret void
|
||||
}
|
||||
|
||||
; Checks the transform is NOT applied (regardless of alignment) when optimizations are disabled
|
||||
define void @test9(i32* noalias nocapture %X, i32* noalias nocapture readonly %Y, i32 %n) #0 {
|
||||
; CHECK-LABEL: test9:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: .save {r7, lr}
|
||||
; CHECK-NEXT: push {r7, lr}
|
||||
; CHECK-NEXT: bl __aeabi_memcpy4
|
||||
; CHECK-NEXT: pop {r7, pc}
|
||||
entry:
|
||||
%0 = bitcast i32* %X to i8*
|
||||
%1 = bitcast i32* %Y to i8*
|
||||
call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %0, i8* align 4 %1, i32 %n, i1 false)
|
||||
ret void
|
||||
}
|
||||
|
||||
; Checks the transform is NOT applied (regardless of alignment) when optimization for size is on (-Os or -Oz)
|
||||
define void @test10(i32* noalias nocapture %X, i32* noalias nocapture readonly %Y, i32 %n) #1 {
|
||||
; CHECK-LABEL: test10:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: .save {r7, lr}
|
||||
; CHECK-NEXT: push {r7, lr}
|
||||
; CHECK-NEXT: bl __aeabi_memcpy4
|
||||
; CHECK-NEXT: pop {r7, pc}
|
||||
entry:
|
||||
%0 = bitcast i32* %X to i8*
|
||||
%1 = bitcast i32* %Y to i8*
|
||||
call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %0, i8* align 4 %1, i32 %n, i1 false)
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @test11(i8* nocapture %x, i8* nocapture %y, i32 %n) {
|
||||
; CHECK-LABEL: test11:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: .save {r4, lr}
|
||||
; CHECK-NEXT: push {r4, lr}
|
||||
; CHECK-NEXT: cmp.w r2, #-1
|
||||
; CHECK-NEXT: it gt
|
||||
; CHECK-NEXT: popgt {r4, pc}
|
||||
; CHECK-NEXT: .LBB10_1: @ %prehead
|
||||
; CHECK-NEXT: add.w r3, r2, #15
|
||||
; CHECK-NEXT: mov r12, r1
|
||||
; CHECK-NEXT: bic r3, r3, #16
|
||||
; CHECK-NEXT: mov r4, r0
|
||||
; CHECK-NEXT: lsr.w lr, r3, #4
|
||||
; CHECK-NEXT: mov r3, r2
|
||||
; CHECK-NEXT: subs.w lr, lr, #0
|
||||
; CHECK-NEXT: beq .LBB10_3
|
||||
; CHECK-NEXT: .LBB10_2: @ =>This Inner Loop Header: Depth=1
|
||||
; CHECK-NEXT: vctp.8 r3
|
||||
; CHECK-NEXT: subs r3, #16
|
||||
; CHECK-NEXT: vpstt
|
||||
; CHECK-NEXT: vldrbt.u8 q0, [r12], #16
|
||||
; CHECK-NEXT: vstrbt.8 q0, [r4], #16
|
||||
; CHECK-NEXT: subs.w lr, lr, #1
|
||||
; CHECK-NEXT: bne .LBB10_2
|
||||
; CHECK-NEXT: b .LBB10_3
|
||||
; CHECK-NEXT: .LBB10_3: @ %for.body
|
||||
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
||||
; CHECK-NEXT: ldrb r3, [r0], #1
|
||||
; CHECK-NEXT: subs r2, #2
|
||||
; CHECK-NEXT: strb r3, [r1], #1
|
||||
; CHECK-NEXT: bne .LBB10_3
|
||||
; CHECK-NEXT: @ %bb.4: @ %for.cond.cleanup
|
||||
; CHECK-NEXT: pop {r4, pc}
|
||||
entry:
|
||||
%cmp6 = icmp slt i32 %n, 0
|
||||
br i1 %cmp6, label %prehead, label %for.cond.cleanup
|
||||
|
||||
prehead: ; preds = %entry
|
||||
call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %x, i8* align 4 %y, i32 %n, i1 false)
|
||||
br label %for.body
|
||||
|
||||
for.body: ; preds = %for.body, %prehead
|
||||
%i.09 = phi i32 [ %inc, %for.body ], [ 0, %prehead ]
|
||||
%x.addr.08 = phi i8* [ %add.ptr, %for.body ], [ %x, %prehead ]
|
||||
%y.addr.07 = phi i8* [ %add.ptr1, %for.body ], [ %y, %prehead ]
|
||||
%add.ptr = getelementptr inbounds i8, i8* %x.addr.08, i32 1
|
||||
%add.ptr1 = getelementptr inbounds i8, i8* %y.addr.07, i32 1
|
||||
%l = load i8, i8* %x.addr.08, align 1
|
||||
store i8 %l, i8* %y.addr.07, align 1
|
||||
%inc = add nuw nsw i32 %i.09, 2
|
||||
%exitcond.not = icmp eq i32 %inc, %n
|
||||
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
|
||||
|
||||
for.cond.cleanup: ; preds = %entry
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { noinline optnone }
|
||||
attributes #1 = { optsize }
|
127
test/CodeGen/Thumb2/mve-tp-loop.mir
Normal file
127
test/CodeGen/Thumb2/mve-tp-loop.mir
Normal file
@ -0,0 +1,127 @@
|
||||
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
|
||||
# RUN: llc -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve -simplify-mir --verify-machineinstrs -run-pass=finalize-isel %s -o - | FileCheck %s
|
||||
--- |
|
||||
target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
|
||||
target triple = "arm-arm-none-eabi"
|
||||
|
||||
; Function Attrs: argmemonly nofree nosync nounwind willreturn
|
||||
declare void @llvm.memcpy.p0i8.p0i8.i32(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i32, i1 immarg)
|
||||
|
||||
define void @test1(i32* noalias %X, i32* noalias readonly %Y, i32 %n) {
|
||||
entry:
|
||||
%0 = bitcast i32* %X to i8*
|
||||
%1 = bitcast i32* %Y to i8*
|
||||
call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %0, i8* align 4 %1, i32 %n, i1 false)
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @test2(i32* noalias %X, i32* noalias readonly %Y, i32 %n) {
|
||||
entry:
|
||||
%cmp6 = icmp sgt i32 %n, 0
|
||||
br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
|
||||
|
||||
for.body.preheader: ; preds = %entry
|
||||
%X.bits = bitcast i32* %X to i8*
|
||||
%Y.bits = bitcast i32* %Y to i8*
|
||||
call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %X.bits, i8* align 4 %Y.bits, i32 %n, i1 false)
|
||||
br label %for.cond.cleanup
|
||||
|
||||
for.cond.cleanup: ; preds = %for.body.preheader, %entry
|
||||
ret void
|
||||
}
|
||||
|
||||
...
|
||||
---
|
||||
name: test1
|
||||
tracksRegLiveness: true
|
||||
body: |
|
||||
bb.0.entry:
|
||||
liveins: $r0, $r1, $r2
|
||||
|
||||
; CHECK-LABEL: name: test1
|
||||
; CHECK: liveins: $r0, $r1, $r2
|
||||
; CHECK: [[COPY:%[0-9]+]]:rgpr = COPY $r2
|
||||
; CHECK: [[COPY1:%[0-9]+]]:rgpr = COPY $r1
|
||||
; CHECK: [[COPY2:%[0-9]+]]:rgpr = COPY $r0
|
||||
; CHECK: [[t2ADDri:%[0-9]+]]:rgpr = t2ADDri [[COPY]], 15, 14 /* CC::al */, $noreg, $noreg
|
||||
; CHECK: [[t2BICri:%[0-9]+]]:rgpr = t2BICri killed [[t2ADDri]], 16, 14 /* CC::al */, $noreg, $noreg
|
||||
; CHECK: [[t2LSRri:%[0-9]+]]:gprlr = t2LSRri killed [[t2BICri]], 4, 14 /* CC::al */, $noreg, $noreg
|
||||
; CHECK: [[t2WhileLoopSetup:%[0-9]+]]:gprlr = t2WhileLoopSetup killed [[t2LSRri]]
|
||||
; CHECK: t2WhileLoopStart [[t2WhileLoopSetup]], %bb.2, implicit-def $cpsr
|
||||
; CHECK: .1:
|
||||
; CHECK: [[PHI:%[0-9]+]]:rgpr = PHI [[COPY1]], %bb.0, %8, %bb.1
|
||||
; CHECK: [[PHI1:%[0-9]+]]:rgpr = PHI [[COPY2]], %bb.0, %10, %bb.1
|
||||
; CHECK: [[PHI2:%[0-9]+]]:gprlr = PHI [[t2WhileLoopSetup]], %bb.0, %12, %bb.1
|
||||
; CHECK: [[PHI3:%[0-9]+]]:rgpr = PHI [[COPY]], %bb.0, %14, %bb.1
|
||||
; CHECK: [[MVE_VCTP8_:%[0-9]+]]:vccr = MVE_VCTP8 [[PHI3]], 0, $noreg
|
||||
; CHECK: [[t2SUBri:%[0-9]+]]:rgpr = t2SUBri [[PHI3]], 16, 14 /* CC::al */, $noreg, $noreg
|
||||
; CHECK: [[MVE_VLDRBU8_post:%[0-9]+]]:rgpr, [[MVE_VLDRBU8_post1:%[0-9]+]]:mqpr = MVE_VLDRBU8_post [[PHI]], 16, 1, [[MVE_VCTP8_]]
|
||||
; CHECK: [[MVE_VSTRBU8_post:%[0-9]+]]:rgpr = MVE_VSTRBU8_post killed [[MVE_VLDRBU8_post1]], [[PHI1]], 16, 1, [[MVE_VCTP8_]]
|
||||
; CHECK: [[t2LoopDec:%[0-9]+]]:gprlr = t2LoopDec [[PHI2]], 1
|
||||
; CHECK: t2LoopEnd [[t2LoopDec]], %bb.1, implicit-def $cpsr
|
||||
; CHECK: t2B %bb.2, 14 /* CC::al */, $noreg
|
||||
; CHECK: .2.entry:
|
||||
; CHECK: tBX_RET 14 /* CC::al */, $noreg
|
||||
%2:rgpr = COPY $r2
|
||||
%1:rgpr = COPY $r1
|
||||
%0:rgpr = COPY $r0
|
||||
MVE_MEMCPYLOOPINST %0, %1, %2
|
||||
tBX_RET 14 /* CC::al */, $noreg
|
||||
|
||||
...
|
||||
---
|
||||
name: test2
|
||||
tracksRegLiveness: true
|
||||
body: |
|
||||
; CHECK-LABEL: name: test2
|
||||
; CHECK: bb.0.entry:
|
||||
; CHECK: successors: %bb.1(0x50000000), %bb.2(0x30000000)
|
||||
; CHECK: liveins: $r0, $r1, $r2
|
||||
; CHECK: [[COPY:%[0-9]+]]:rgpr = COPY $r2
|
||||
; CHECK: [[COPY1:%[0-9]+]]:rgpr = COPY $r1
|
||||
; CHECK: [[COPY2:%[0-9]+]]:rgpr = COPY $r0
|
||||
; CHECK: t2CMPri [[COPY]], 1, 14 /* CC::al */, $noreg, implicit-def $cpsr
|
||||
; CHECK: t2Bcc %bb.2, 11 /* CC::lt */, $cpsr
|
||||
; CHECK: t2B %bb.1, 14 /* CC::al */, $noreg
|
||||
; CHECK: bb.1.for.body.preheader:
|
||||
; CHECK: [[t2ADDri:%[0-9]+]]:rgpr = t2ADDri [[COPY]], 15, 14 /* CC::al */, $noreg, $noreg
|
||||
; CHECK: [[t2BICri:%[0-9]+]]:rgpr = t2BICri killed [[t2ADDri]], 16, 14 /* CC::al */, $noreg, $noreg
|
||||
; CHECK: [[t2LSRri:%[0-9]+]]:gprlr = t2LSRri killed [[t2BICri]], 4, 14 /* CC::al */, $noreg, $noreg
|
||||
; CHECK: [[t2WhileLoopSetup:%[0-9]+]]:gprlr = t2WhileLoopSetup killed [[t2LSRri]]
|
||||
; CHECK: t2WhileLoopStart [[t2WhileLoopSetup]], %bb.4, implicit-def $cpsr
|
||||
; CHECK: bb.3:
|
||||
; CHECK: [[PHI:%[0-9]+]]:rgpr = PHI [[COPY1]], %bb.1, %8, %bb.3
|
||||
; CHECK: [[PHI1:%[0-9]+]]:rgpr = PHI [[COPY2]], %bb.1, %10, %bb.3
|
||||
; CHECK: [[PHI2:%[0-9]+]]:gprlr = PHI [[t2WhileLoopSetup]], %bb.1, %12, %bb.3
|
||||
; CHECK: [[PHI3:%[0-9]+]]:rgpr = PHI [[COPY]], %bb.1, %14, %bb.3
|
||||
; CHECK: [[MVE_VCTP8_:%[0-9]+]]:vccr = MVE_VCTP8 [[PHI3]], 0, $noreg
|
||||
; CHECK: [[t2SUBri:%[0-9]+]]:rgpr = t2SUBri [[PHI3]], 16, 14 /* CC::al */, $noreg, $noreg
|
||||
; CHECK: [[MVE_VLDRBU8_post:%[0-9]+]]:rgpr, [[MVE_VLDRBU8_post1:%[0-9]+]]:mqpr = MVE_VLDRBU8_post [[PHI]], 16, 1, [[MVE_VCTP8_]]
|
||||
; CHECK: [[MVE_VSTRBU8_post:%[0-9]+]]:rgpr = MVE_VSTRBU8_post killed [[MVE_VLDRBU8_post1]], [[PHI1]], 16, 1, [[MVE_VCTP8_]]
|
||||
; CHECK: [[t2LoopDec:%[0-9]+]]:gprlr = t2LoopDec [[PHI2]], 1
|
||||
; CHECK: t2LoopEnd [[t2LoopDec]], %bb.3, implicit-def $cpsr
|
||||
; CHECK: t2B %bb.4, 14 /* CC::al */, $noreg
|
||||
; CHECK: bb.4.for.body.preheader:
|
||||
; CHECK: t2B %bb.2, 14 /* CC::al */, $noreg
|
||||
; CHECK: bb.2.for.cond.cleanup:
|
||||
; CHECK: tBX_RET 14 /* CC::al */, $noreg
|
||||
bb.0.entry:
|
||||
successors: %bb.1(0x50000000), %bb.2(0x30000000)
|
||||
liveins: $r0, $r1, $r2
|
||||
|
||||
%2:rgpr = COPY $r2
|
||||
%1:rgpr = COPY $r1
|
||||
%0:rgpr = COPY $r0
|
||||
t2CMPri %2, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr
|
||||
t2Bcc %bb.2, 11 /* CC::lt */, $cpsr
|
||||
t2B %bb.1, 14 /* CC::al */, $noreg
|
||||
|
||||
bb.1.for.body.preheader:
|
||||
successors: %bb.2(0x80000000)
|
||||
|
||||
MVE_MEMCPYLOOPINST %0, %1, %2
|
||||
|
||||
bb.2.for.cond.cleanup:
|
||||
tBX_RET 14 /* CC::al */, $noreg
|
||||
|
||||
...
|
Loading…
Reference in New Issue
Block a user