mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2025-01-31 20:51:52 +01:00
[ARM] Transforming memset to Tail predicated Loop
This patch converts llvm.memset intrinsic into Tail Predicated Hardware loops for a target that supports the Arm M-profile Vector Extension (MVE). The llvm.memset is converted to a TP loop for both constant and non-constant input sizes (of llvm.memset). Reviewed By: dmgreen Differential Revision: https://reviews.llvm.org/D100435
This commit is contained in:
parent
6aa024bb17
commit
0a055c77d0
@ -1803,6 +1803,7 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
|
||||
MAKE_CASE(ARMISD::CSNEG)
|
||||
MAKE_CASE(ARMISD::CSINC)
|
||||
MAKE_CASE(ARMISD::MEMCPYLOOP)
|
||||
MAKE_CASE(ARMISD::MEMSETLOOP)
|
||||
#undef MAKE_CASE
|
||||
}
|
||||
return nullptr;
|
||||
@ -11105,7 +11106,6 @@ static Register genTPEntry(MachineBasicBlock *TpEntry,
|
||||
MachineBasicBlock *TpExit, Register OpSizeReg,
|
||||
const TargetInstrInfo *TII, DebugLoc Dl,
|
||||
MachineRegisterInfo &MRI) {
|
||||
|
||||
// Calculates loop iteration count = ceil(n/16) = ((n + 15)&(-16)) / 16.
|
||||
Register AddDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
|
||||
BuildMI(TpEntry, Dl, TII->get(ARM::t2ADDri), AddDestReg)
|
||||
@ -11147,17 +11147,21 @@ static void genTPLoopBody(MachineBasicBlock *TpLoopBody,
|
||||
const TargetInstrInfo *TII, DebugLoc Dl,
|
||||
MachineRegisterInfo &MRI, Register OpSrcReg,
|
||||
Register OpDestReg, Register ElementCountReg,
|
||||
Register TotalIterationsReg) {
|
||||
Register TotalIterationsReg, bool IsMemcpy) {
|
||||
// First insert 4 PHI nodes for: Current pointer to Src (if memcpy), Dest
|
||||
// array, loop iteration counter, predication counter.
|
||||
|
||||
// First insert 4 PHI nodes for: Current pointer to Src, Dest array, loop
|
||||
// iteration counter, predication counter Current position in the src array
|
||||
Register SrcPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
|
||||
Register CurrSrcReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
|
||||
BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), SrcPhiReg)
|
||||
.addUse(OpSrcReg)
|
||||
.addMBB(TpEntry)
|
||||
.addUse(CurrSrcReg)
|
||||
.addMBB(TpLoopBody);
|
||||
Register SrcPhiReg, CurrSrcReg;
|
||||
if (IsMemcpy) {
|
||||
// Current position in the src array
|
||||
SrcPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
|
||||
CurrSrcReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
|
||||
BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), SrcPhiReg)
|
||||
.addUse(OpSrcReg)
|
||||
.addMBB(TpEntry)
|
||||
.addUse(CurrSrcReg)
|
||||
.addMBB(TpLoopBody);
|
||||
}
|
||||
|
||||
// Current position in the dest array
|
||||
Register DestPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
|
||||
@ -11200,19 +11204,23 @@ static void genTPLoopBody(MachineBasicBlock *TpLoopBody,
|
||||
.add(predOps(ARMCC::AL))
|
||||
.addReg(0);
|
||||
|
||||
// VLDRB and VSTRB instructions, predicated using VPR
|
||||
Register LoadedValueReg = MRI.createVirtualRegister(&ARM::MQPRRegClass);
|
||||
BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VLDRBU8_post))
|
||||
.addDef(CurrSrcReg)
|
||||
.addDef(LoadedValueReg)
|
||||
.addReg(SrcPhiReg)
|
||||
.addImm(16)
|
||||
.addImm(ARMVCC::Then)
|
||||
.addUse(VccrReg);
|
||||
// VLDRB (only if memcpy) and VSTRB instructions, predicated using VPR
|
||||
Register SrcValueReg;
|
||||
if (IsMemcpy) {
|
||||
SrcValueReg = MRI.createVirtualRegister(&ARM::MQPRRegClass);
|
||||
BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VLDRBU8_post))
|
||||
.addDef(CurrSrcReg)
|
||||
.addDef(SrcValueReg)
|
||||
.addReg(SrcPhiReg)
|
||||
.addImm(16)
|
||||
.addImm(ARMVCC::Then)
|
||||
.addUse(VccrReg);
|
||||
} else
|
||||
SrcValueReg = OpSrcReg;
|
||||
|
||||
BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VSTRBU8_post))
|
||||
.addDef(CurrDestReg)
|
||||
.addUse(LoadedValueReg, RegState::Kill)
|
||||
.addUse(SrcValueReg)
|
||||
.addReg(DestPhiReg)
|
||||
.addImm(16)
|
||||
.addImm(ARMVCC::Then)
|
||||
@ -11259,9 +11267,10 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
|
||||
return BB;
|
||||
}
|
||||
|
||||
case ARM::MVE_MEMCPYLOOPINST: {
|
||||
case ARM::MVE_MEMCPYLOOPINST:
|
||||
case ARM::MVE_MEMSETLOOPINST: {
|
||||
|
||||
// Transformation below expands MVE_MEMCPYLOOPINST Pseudo instruction
|
||||
// Transformation below expands MVE_MEMCPYLOOPINST/MVE_MEMSETLOOPINST Pseudo
|
||||
// into a Tail Predicated (TP) Loop. It adds the instructions to calculate
|
||||
// the iteration count = ceil(size_in_bytes/16) in the TP entry block and
|
||||
// adds the relevant instructions in the TP loop Body for generation of a
|
||||
@ -11301,23 +11310,24 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
|
||||
MF->push_back(TpLoopBody);
|
||||
|
||||
// If any instructions are present in the current block after
|
||||
// MVE_MEMCPYLOOPINST, split the current block and move the instructions
|
||||
// into the newly created exit block. If there are no instructions
|
||||
// add an explicit branch to the FallThrough block and then split.
|
||||
// MVE_MEMCPYLOOPINST or MVE_MEMSETLOOPINST, split the current block and
|
||||
// move the instructions into the newly created exit block. If there are no
|
||||
// instructions add an explicit branch to the FallThrough block and then
|
||||
// split.
|
||||
//
|
||||
// The split is required for two reasons:
|
||||
// 1) A terminator(t2WhileLoopStart) will be placed at that site.
|
||||
// 2) Since a TPLoopBody will be added later, any phis in successive blocks
|
||||
// need to be updated. splitAt() already handles this.
|
||||
TpExit = BB->splitAt(MI, false);
|
||||
TpExit = BB->splitAt(MI);
|
||||
if (TpExit == BB) {
|
||||
assert(BB->canFallThrough() &&
|
||||
"Exit block must be FallThrough of the block containing memcpy");
|
||||
assert(BB->canFallThrough() && "Exit Block must be Fallthrough of the "
|
||||
"block containing memcpy/memset Pseudo");
|
||||
TpExit = BB->getFallThrough();
|
||||
BuildMI(BB, dl, TII->get(ARM::t2B))
|
||||
.addMBB(TpExit)
|
||||
.add(predOps(ARMCC::AL));
|
||||
TpExit = BB->splitAt(MI, false);
|
||||
TpExit = BB->splitAt(MI);
|
||||
}
|
||||
|
||||
// Add logic for iteration count
|
||||
@ -11325,8 +11335,9 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
|
||||
genTPEntry(TpEntry, TpLoopBody, TpExit, OpSizeReg, TII, dl, MRI);
|
||||
|
||||
// Add the vectorized (and predicated) loads/store instructions
|
||||
bool IsMemcpy = MI.getOpcode() == ARM::MVE_MEMCPYLOOPINST;
|
||||
genTPLoopBody(TpLoopBody, TpEntry, TpExit, TII, dl, MRI, OpSrcReg,
|
||||
OpDestReg, OpSizeReg, TotalIterationsReg);
|
||||
OpDestReg, OpSizeReg, TotalIterationsReg, IsMemcpy);
|
||||
|
||||
// Required to avoid conflict with the MachineVerifier during testing.
|
||||
Properties.reset(MachineFunctionProperties::Property::NoPHIs);
|
||||
|
@ -303,6 +303,9 @@ class VectorType;
|
||||
// Pseudo-instruction representing a memory copy using a tail predicated
|
||||
// loop
|
||||
MEMCPYLOOP,
|
||||
// Pseudo-instruction representing a memset using a tail predicated
|
||||
// loop
|
||||
MEMSETLOOP,
|
||||
|
||||
// V8.1MMainline condition select
|
||||
CSINV, // Conditional select invert.
|
||||
|
@ -6877,6 +6877,18 @@ let usesCustomInserter = 1, hasNoSchedulingInfo = 1 in {
|
||||
[(MVE_MEMCPYLOOPNODE rGPR:$dst, rGPR:$src, rGPR:$sz)]>;
|
||||
}
|
||||
|
||||
def SDT_MVEMEMSETLOOPNODE
|
||||
: SDTypeProfile<0, 3, [SDTCisPtrTy<0>, SDTCisVT<1, v16i8>, SDTCisVT<2, i32>]>;
|
||||
def MVE_MEMSETLOOPNODE : SDNode<"ARMISD::MEMSETLOOP", SDT_MVEMEMSETLOOPNODE,
|
||||
[SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
|
||||
|
||||
let usesCustomInserter = 1, hasNoSchedulingInfo = 1 in {
|
||||
def MVE_MEMSETLOOPINST : PseudoInst<(outs),
|
||||
(ins rGPR:$dst, MQPR:$src, rGPR:$sz),
|
||||
NoItinerary,
|
||||
[(MVE_MEMSETLOOPNODE rGPR:$dst, MQPR:$src, rGPR:$sz)]>;
|
||||
}
|
||||
|
||||
def MVE_DLSTP_8 : MVE_DLSTP<"dlstp.8", 0b00>;
|
||||
def MVE_DLSTP_16 : MVE_DLSTP<"dlstp.16", 0b01>;
|
||||
def MVE_DLSTP_32 : MVE_DLSTP<"dlstp.32", 0b10>;
|
||||
|
@ -139,6 +139,33 @@ SDValue ARMSelectionDAGInfo::EmitSpecializedLibcall(
|
||||
return CallResult.second;
|
||||
}
|
||||
|
||||
/// Decide whether a memcpy/memset should be lowered to an inline Tail
/// Predicated (TP) loop.
///
/// \param Subtarget     Queried for the inline-size thresholds.
/// \param DAG           Used to reach the enclosing function's attributes.
/// \param ConstantSize  The transfer size when it is a constant node, null
///                      otherwise.
/// \param Alignment     Only consulted for memcpy with a non-constant size.
/// \param IsMemcpy      True for memcpy, false for memset.
/// \returns true if an inline TP loop should be generated.
static bool shouldGenerateInlineTPLoop(const ARMSubtarget &Subtarget,
|
||||
const SelectionDAG &DAG,
|
||||
ConstantSDNode *ConstantSize,
|
||||
Align Alignment, bool IsMemcpy) {
|
||||
auto &F = DAG.getMachineFunction().getFunction();
|
||||
// The option evaluates to false: never generate the loop.
// NOTE(review): presumably this is the "force disabled" setting of the
// TPLoop enum -- confirm against its definition.
if (!EnableMemtransferTPLoop)
|
||||
return false;
|
||||
// Forced on: skip every heuristic below.
if (EnableMemtransferTPLoop == TPLoop::ForceEnabled)
|
||||
return true;
|
||||
// Do not generate inline TP loop if optimization is disabled,
|
||||
// or if optimization for size (-Os or -Oz) is on.
|
||||
if (F.hasOptNone() || F.hasOptSize())
|
||||
return false;
|
||||
// If the command-line option is unset, always generate an inline TP loop
|
||||
// for memset. For memcpy, apply the size/alignment checks below.
|
||||
if (!IsMemcpy)
|
||||
return true;
|
||||
// memcpy with a non-constant size: require at least 4-byte alignment.
if (!ConstantSize && Alignment >= Align(4))
|
||||
return true;
|
||||
// memcpy with a constant size: only sizes strictly between the two
// subtarget thresholds qualify.
if (ConstantSize &&
|
||||
ConstantSize->getZExtValue() > Subtarget.getMaxInlineSizeThreshold() &&
|
||||
ConstantSize->getZExtValue() <
|
||||
Subtarget.getMaxMemcpyTPInlineSizeThreshold())
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(
|
||||
SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
|
||||
SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
|
||||
@ -147,29 +174,8 @@ SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(
|
||||
DAG.getMachineFunction().getSubtarget<ARMSubtarget>();
|
||||
ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
|
||||
|
||||
auto GenInlineTP = [&](const ARMSubtarget &Subtarget,
|
||||
const SelectionDAG &DAG) {
|
||||
auto &F = DAG.getMachineFunction().getFunction();
|
||||
if (!EnableMemtransferTPLoop)
|
||||
return false;
|
||||
if (EnableMemtransferTPLoop == TPLoop::ForceEnabled)
|
||||
return true;
|
||||
// Do not generate inline TP loop if optimizations is disabled,
|
||||
// or if optimization for size (-Os or -Oz) is on.
|
||||
if (F.hasOptNone() || F.hasOptSize())
|
||||
return false;
|
||||
// If cli option is unset
|
||||
if (!ConstantSize && Alignment >= Align(4))
|
||||
return true;
|
||||
if (ConstantSize &&
|
||||
ConstantSize->getZExtValue() > Subtarget.getMaxInlineSizeThreshold() &&
|
||||
ConstantSize->getZExtValue() <
|
||||
Subtarget.getMaxTPLoopInlineSizeThreshold())
|
||||
return true;
|
||||
return false;
|
||||
};
|
||||
|
||||
if (Subtarget.hasMVEIntegerOps() && GenInlineTP(Subtarget, DAG))
|
||||
if (Subtarget.hasMVEIntegerOps() &&
|
||||
shouldGenerateInlineTPLoop(Subtarget, DAG, ConstantSize, Alignment, true))
|
||||
return DAG.getNode(ARMISD::MEMCPYLOOP, dl, MVT::Other, Chain, Dst, Src,
|
||||
DAG.getZExtOrTrunc(Size, dl, MVT::i32));
|
||||
|
||||
@ -292,6 +298,22 @@ SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemset(
|
||||
SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
|
||||
SDValue Size, Align Alignment, bool isVolatile,
|
||||
MachinePointerInfo DstPtrInfo) const {
|
||||
|
||||
const ARMSubtarget &Subtarget =
|
||||
DAG.getMachineFunction().getSubtarget<ARMSubtarget>();
|
||||
|
||||
ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
|
||||
|
||||
// Generate TP loop for llvm.memset
|
||||
if (Subtarget.hasMVEIntegerOps() &&
|
||||
shouldGenerateInlineTPLoop(Subtarget, DAG, ConstantSize, Alignment,
|
||||
false)) {
|
||||
Src = DAG.getSplatBuildVector(MVT::v16i8, dl,
|
||||
DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src));
|
||||
return DAG.getNode(ARMISD::MEMSETLOOP, dl, MVT::Other, Chain, Dst, Src,
|
||||
DAG.getZExtOrTrunc(Size, dl, MVT::i32));
|
||||
}
|
||||
|
||||
return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
|
||||
Alignment.value(), RTLIB::MEMSET);
|
||||
}
|
||||
|
@ -538,10 +538,11 @@ public:
|
||||
return 64;
|
||||
}
|
||||
|
||||
/// getMaxTPLoopSizeThreshold - Returns the maximum memcpy size
|
||||
/// that still makes it profitable to inline the call as a Tail
|
||||
/// Predicated loop
|
||||
unsigned getMaxTPLoopInlineSizeThreshold() const { return 128; }
|
||||
/// getMaxMemcpyTPInlineSizeThreshold - Returns the maximum size
|
||||
/// that still makes it profitable to inline a llvm.memcpy as a Tail
|
||||
/// Predicated loop.
|
||||
/// This threshold should only be used for constant size inputs.
|
||||
unsigned getMaxMemcpyTPInlineSizeThreshold() const { return 128; }
|
||||
|
||||
/// ParseSubtargetFeatures - Parses features string setting specified
|
||||
/// subtarget options. Definition of function is auto generated by tblgen.
|
||||
|
@ -58,28 +58,35 @@ for.body: ; preds = %entry, %for.body
|
||||
define void @test_memset(i32* nocapture %x, i32 %n, i32 %m) {
|
||||
; CHECK-LABEL: test_memset:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: .save {r4, r5, r6, r7, lr}
|
||||
; CHECK-NEXT: push {r4, r5, r6, r7, lr}
|
||||
; CHECK-NEXT: .pad #4
|
||||
; CHECK-NEXT: sub sp, #4
|
||||
; CHECK-NEXT: .save {r4, lr}
|
||||
; CHECK-NEXT: push {r4, lr}
|
||||
; CHECK-NEXT: cmp r1, #1
|
||||
; CHECK-NEXT: blt .LBB1_3
|
||||
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
|
||||
; CHECK-NEXT: mov r4, r2
|
||||
; CHECK-NEXT: mov r5, r1
|
||||
; CHECK-NEXT: mov r6, r0
|
||||
; CHECK-NEXT: lsls r7, r2, #2
|
||||
; CHECK-NEXT: it lt
|
||||
; CHECK-NEXT: poplt {r4, pc}
|
||||
; CHECK-NEXT: .LBB1_1: @ %for.body.preheader
|
||||
; CHECK-NEXT: lsl.w r12, r2, #2
|
||||
; CHECK-NEXT: vmov.i32 q0, #0x0
|
||||
; CHECK-NEXT: b .LBB1_2
|
||||
; CHECK-NEXT: .LBB1_2: @ %for.body
|
||||
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
||||
; CHECK-NEXT: mov r0, r6
|
||||
; CHECK-NEXT: mov r1, r4
|
||||
; CHECK-NEXT: bl __aeabi_memclr4
|
||||
; CHECK-NEXT: add r6, r7
|
||||
; CHECK-NEXT: subs r5, #1
|
||||
; CHECK-NEXT: bne .LBB1_2
|
||||
; CHECK-NEXT: .LBB1_3: @ %for.cond.cleanup
|
||||
; CHECK-NEXT: add sp, #4
|
||||
; CHECK-NEXT: pop {r4, r5, r6, r7, pc}
|
||||
; CHECK-NEXT: @ =>This Loop Header: Depth=1
|
||||
; CHECK-NEXT: @ Child Loop BB1_4 Depth 2
|
||||
; CHECK-NEXT: mov r4, r0
|
||||
; CHECK-NEXT: mov r3, r2
|
||||
; CHECK-NEXT: wlstp.8 lr, r3, .LBB1_3
|
||||
; CHECK-NEXT: b .LBB1_4
|
||||
; CHECK-NEXT: .LBB1_3: @ %for.body
|
||||
; CHECK-NEXT: @ in Loop: Header=BB1_2 Depth=1
|
||||
; CHECK-NEXT: add r0, r12
|
||||
; CHECK-NEXT: subs r1, #1
|
||||
; CHECK-NEXT: beq .LBB1_5
|
||||
; CHECK-NEXT: b .LBB1_2
|
||||
; CHECK-NEXT: .LBB1_4: @ Parent Loop BB1_2 Depth=1
|
||||
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
|
||||
; CHECK-NEXT: vstrb.8 q0, [r4], #16
|
||||
; CHECK-NEXT: letp lr, .LBB1_4
|
||||
; CHECK-NEXT: b .LBB1_3
|
||||
; CHECK-NEXT: .LBB1_5: @ %for.cond.cleanup
|
||||
; CHECK-NEXT: pop {r4, pc}
|
||||
entry:
|
||||
%cmp5 = icmp sgt i32 %n, 0
|
||||
br i1 %cmp5, label %for.body, label %for.cond.cleanup
|
||||
|
@ -1,6 +1,6 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
|
||||
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp %s -o 2>/dev/null - | FileCheck %s
|
||||
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp --arm-memtransfer-tploop=allow %s -o 2>/dev/null - | FileCheck %s
|
||||
|
||||
!0 = !{i32 1, !"wchar_size", i32 4}
|
||||
!1 = !{i32 1, !"min_enum_size", i32 4}
|
||||
@ -592,141 +592,147 @@ define dso_local void @arm_mat_mult_q15(i16* noalias nocapture readonly %A, i16*
|
||||
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
|
||||
; CHECK-NEXT: .pad #4
|
||||
; CHECK-NEXT: sub sp, #4
|
||||
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
|
||||
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
|
||||
; CHECK-NEXT: .pad #24
|
||||
; CHECK-NEXT: sub sp, #24
|
||||
; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill
|
||||
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
|
||||
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
|
||||
; CHECK-NEXT: .pad #32
|
||||
; CHECK-NEXT: sub sp, #32
|
||||
; CHECK-NEXT: strd r0, r2, [sp, #24] @ 8-byte Folded Spill
|
||||
; CHECK-NEXT: cmp r3, #0
|
||||
; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill
|
||||
; CHECK-NEXT: str r3, [sp, #8] @ 4-byte Spill
|
||||
; CHECK-NEXT: mov r0, r3
|
||||
; CHECK-NEXT: itt ne
|
||||
; CHECK-NEXT: ldrne r0, [sp, #112]
|
||||
; CHECK-NEXT: ldrne r0, [sp, #136]
|
||||
; CHECK-NEXT: cmpne r0, #0
|
||||
; CHECK-NEXT: bne .LBB10_2
|
||||
; CHECK-NEXT: .LBB10_1: @ %for.cond.cleanup
|
||||
; CHECK-NEXT: add sp, #24
|
||||
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
|
||||
; CHECK-NEXT: add sp, #32
|
||||
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
|
||||
; CHECK-NEXT: add sp, #4
|
||||
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
|
||||
; CHECK-NEXT: .LBB10_2: @ %for.cond1.preheader.us.preheader
|
||||
; CHECK-NEXT: ldr.w r9, [sp, #116]
|
||||
; CHECK-NEXT: mov r6, r1
|
||||
; CHECK-NEXT: movs r1, #1
|
||||
; CHECK-NEXT: mov r11, r2
|
||||
; CHECK-NEXT: bic r10, r9, #3
|
||||
; CHECK-NEXT: mov.w r8, #0
|
||||
; CHECK-NEXT: sub.w r0, r10, #4
|
||||
; CHECK-NEXT: add.w r0, r1, r0, lsr #2
|
||||
; CHECK-NEXT: ldr r1, [sp, #112]
|
||||
; CHECK-NEXT: str r0, [sp, #12] @ 4-byte Spill
|
||||
; CHECK-NEXT: lsl.w r0, r9, #1
|
||||
; CHECK-NEXT: str r0, [sp] @ 4-byte Spill
|
||||
; CHECK-NEXT: adr r0, .LCPI10_0
|
||||
; CHECK-NEXT: vdup.32 q4, r1
|
||||
; CHECK-NEXT: vldrw.u32 q5, [r0]
|
||||
; CHECK-NEXT: lsls r4, r1, #1
|
||||
; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload
|
||||
; CHECK-NEXT: vshl.i32 q6, q4, #2
|
||||
; CHECK-NEXT: movs r1, #0
|
||||
; CHECK-NEXT: ldr.w r12, [sp, #140]
|
||||
; CHECK-NEXT: movs r7, #1
|
||||
; CHECK-NEXT: mov.w r11, #0
|
||||
; CHECK-NEXT: vmov.i32 q0, #0x0
|
||||
; CHECK-NEXT: bic r2, r12, #3
|
||||
; CHECK-NEXT: subs r3, r2, #4
|
||||
; CHECK-NEXT: add.w r0, r7, r3, lsr #2
|
||||
; CHECK-NEXT: ldr r7, [sp, #136]
|
||||
; CHECK-NEXT: adr r3, .LCPI10_0
|
||||
; CHECK-NEXT: str r0, [sp, #16] @ 4-byte Spill
|
||||
; CHECK-NEXT: lsl.w r0, r12, #1
|
||||
; CHECK-NEXT: vdup.32 q1, r7
|
||||
; CHECK-NEXT: vldrw.u32 q2, [r3]
|
||||
; CHECK-NEXT: str r0, [sp, #4] @ 4-byte Spill
|
||||
; CHECK-NEXT: ldr r0, [sp, #24] @ 4-byte Reload
|
||||
; CHECK-NEXT: lsls r6, r7, #1
|
||||
; CHECK-NEXT: vshl.i32 q3, q1, #2
|
||||
; CHECK-NEXT: movs r3, #0
|
||||
; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill
|
||||
; CHECK-NEXT: b .LBB10_5
|
||||
; CHECK-NEXT: .LBB10_3: @ %for.cond5.preheader.us73.preheader
|
||||
; CHECK-NEXT: @ in Loop: Header=BB10_5 Depth=1
|
||||
; CHECK-NEXT: add.w r0, r11, r12, lsl #1
|
||||
; CHECK-NEXT: mov r1, r4
|
||||
; CHECK-NEXT: bl __aeabi_memclr
|
||||
; CHECK-NEXT: ldr r0, [sp, #28] @ 4-byte Reload
|
||||
; CHECK-NEXT: add.w r3, r0, r5, lsl #1
|
||||
; CHECK-NEXT: mov r5, r6
|
||||
; CHECK-NEXT: wlstp.8 lr, r5, .LBB10_4
|
||||
; CHECK-NEXT: b .LBB10_15
|
||||
; CHECK-NEXT: .LBB10_4: @ %for.cond1.for.cond.cleanup3_crit_edge.us
|
||||
; CHECK-NEXT: @ in Loop: Header=BB10_5 Depth=1
|
||||
; CHECK-NEXT: ldr r0, [sp] @ 4-byte Reload
|
||||
; CHECK-NEXT: add r8, r9
|
||||
; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload
|
||||
; CHECK-NEXT: add r1, r0
|
||||
; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill
|
||||
; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload
|
||||
; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload
|
||||
; CHECK-NEXT: adds r1, #1
|
||||
; CHECK-NEXT: cmp r1, r0
|
||||
; CHECK-NEXT: add r11, r12
|
||||
; CHECK-NEXT: ldr r3, [sp, #20] @ 4-byte Reload
|
||||
; CHECK-NEXT: add r3, r0
|
||||
; CHECK-NEXT: str r3, [sp, #20] @ 4-byte Spill
|
||||
; CHECK-NEXT: ldr r3, [sp, #12] @ 4-byte Reload
|
||||
; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload
|
||||
; CHECK-NEXT: adds r3, #1
|
||||
; CHECK-NEXT: cmp r3, r0
|
||||
; CHECK-NEXT: beq .LBB10_1
|
||||
; CHECK-NEXT: .LBB10_5: @ %for.cond1.preheader.us
|
||||
; CHECK-NEXT: @ =>This Loop Header: Depth=1
|
||||
; CHECK-NEXT: @ Child Loop BB10_8 Depth 2
|
||||
; CHECK-NEXT: @ Child Loop BB10_11 Depth 3
|
||||
; CHECK-NEXT: @ Child Loop BB10_14 Depth 3
|
||||
; CHECK-NEXT: ldr r0, [sp, #112]
|
||||
; CHECK-NEXT: cmp.w r9, #0
|
||||
; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill
|
||||
; CHECK-NEXT: mul r12, r1, r0
|
||||
; CHECK-NEXT: @ Child Loop BB10_15 Depth 2
|
||||
; CHECK-NEXT: mul r5, r3, r7
|
||||
; CHECK-NEXT: cmp.w r12, #0
|
||||
; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill
|
||||
; CHECK-NEXT: beq .LBB10_3
|
||||
; CHECK-NEXT: @ %bb.6: @ %for.cond5.preheader.us.us.preheader
|
||||
; CHECK-NEXT: @ in Loop: Header=BB10_5 Depth=1
|
||||
; CHECK-NEXT: movs r1, #0
|
||||
; CHECK-NEXT: mov.w r8, #0
|
||||
; CHECK-NEXT: b .LBB10_8
|
||||
; CHECK-NEXT: .LBB10_7: @ %for.cond5.for.cond.cleanup7_crit_edge.us.us
|
||||
; CHECK-NEXT: @ in Loop: Header=BB10_8 Depth=2
|
||||
; CHECK-NEXT: ldr r0, [sp, #112]
|
||||
; CHECK-NEXT: add.w r3, r1, r12
|
||||
; CHECK-NEXT: adds r1, #1
|
||||
; CHECK-NEXT: cmp r1, r0
|
||||
; CHECK-NEXT: strh.w r2, [r11, r3, lsl #1]
|
||||
; CHECK-NEXT: ldr r3, [sp, #28] @ 4-byte Reload
|
||||
; CHECK-NEXT: add.w r0, r8, r5
|
||||
; CHECK-NEXT: add.w r8, r8, #1
|
||||
; CHECK-NEXT: cmp r8, r7
|
||||
; CHECK-NEXT: strh.w r10, [r3, r0, lsl #1]
|
||||
; CHECK-NEXT: beq .LBB10_4
|
||||
; CHECK-NEXT: .LBB10_8: @ %for.cond5.preheader.us.us
|
||||
; CHECK-NEXT: @ Parent Loop BB10_5 Depth=1
|
||||
; CHECK-NEXT: @ => This Loop Header: Depth=2
|
||||
; CHECK-NEXT: @ Child Loop BB10_11 Depth 3
|
||||
; CHECK-NEXT: @ Child Loop BB10_14 Depth 3
|
||||
; CHECK-NEXT: cmp.w r9, #3
|
||||
; CHECK-NEXT: cmp.w r12, #3
|
||||
; CHECK-NEXT: bhi .LBB10_10
|
||||
; CHECK-NEXT: @ %bb.9: @ in Loop: Header=BB10_8 Depth=2
|
||||
; CHECK-NEXT: movs r7, #0
|
||||
; CHECK-NEXT: movs r2, #0
|
||||
; CHECK-NEXT: movs r4, #0
|
||||
; CHECK-NEXT: mov.w r10, #0
|
||||
; CHECK-NEXT: b .LBB10_13
|
||||
; CHECK-NEXT: .LBB10_10: @ %vector.ph
|
||||
; CHECK-NEXT: @ in Loop: Header=BB10_8 Depth=2
|
||||
; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload
|
||||
; CHECK-NEXT: vmov q1, q4
|
||||
; CHECK-NEXT: vmov.i32 q0, #0x0
|
||||
; CHECK-NEXT: vmlas.u32 q1, q5, r1
|
||||
; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload
|
||||
; CHECK-NEXT: vmov q5, q1
|
||||
; CHECK-NEXT: vmov.i32 q4, #0x0
|
||||
; CHECK-NEXT: vmlas.u32 q5, q2, r8
|
||||
; CHECK-NEXT: dls lr, r0
|
||||
; CHECK-NEXT: ldr r2, [sp, #16] @ 4-byte Reload
|
||||
; CHECK-NEXT: ldr r3, [sp, #20] @ 4-byte Reload
|
||||
; CHECK-NEXT: .LBB10_11: @ %vector.body
|
||||
; CHECK-NEXT: @ Parent Loop BB10_5 Depth=1
|
||||
; CHECK-NEXT: @ Parent Loop BB10_8 Depth=2
|
||||
; CHECK-NEXT: @ => This Inner Loop Header: Depth=3
|
||||
; CHECK-NEXT: vadd.i32 q2, q1, q6
|
||||
; CHECK-NEXT: vldrh.s32 q3, [r6, q1, uxtw #1]
|
||||
; CHECK-NEXT: vldrh.s32 q1, [r2], #8
|
||||
; CHECK-NEXT: vmul.i32 q1, q3, q1
|
||||
; CHECK-NEXT: vadd.i32 q0, q1, q0
|
||||
; CHECK-NEXT: vmov q1, q2
|
||||
; CHECK-NEXT: vadd.i32 q6, q5, q3
|
||||
; CHECK-NEXT: vldrh.s32 q7, [r1, q5, uxtw #1]
|
||||
; CHECK-NEXT: vldrh.s32 q5, [r3], #8
|
||||
; CHECK-NEXT: vmul.i32 q5, q7, q5
|
||||
; CHECK-NEXT: vadd.i32 q4, q5, q4
|
||||
; CHECK-NEXT: vmov q5, q6
|
||||
; CHECK-NEXT: le lr, .LBB10_11
|
||||
; CHECK-NEXT: @ %bb.12: @ %middle.block
|
||||
; CHECK-NEXT: @ in Loop: Header=BB10_8 Depth=2
|
||||
; CHECK-NEXT: vaddv.u32 r2, q0
|
||||
; CHECK-NEXT: cmp r10, r9
|
||||
; CHECK-NEXT: mov r7, r10
|
||||
; CHECK-NEXT: vaddv.u32 r10, q4
|
||||
; CHECK-NEXT: cmp r2, r12
|
||||
; CHECK-NEXT: mov r4, r2
|
||||
; CHECK-NEXT: beq .LBB10_7
|
||||
; CHECK-NEXT: .LBB10_13: @ %for.body8.us.us.preheader
|
||||
; CHECK-NEXT: @ in Loop: Header=BB10_8 Depth=2
|
||||
; CHECK-NEXT: ldr r0, [sp, #112]
|
||||
; CHECK-NEXT: add.w r5, r8, r7
|
||||
; CHECK-NEXT: sub.w lr, r9, r7
|
||||
; CHECK-NEXT: mla r3, r0, r7, r1
|
||||
; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload
|
||||
; CHECK-NEXT: add.w r5, r0, r5, lsl #1
|
||||
; CHECK-NEXT: add.w r3, r6, r3, lsl #1
|
||||
; CHECK-NEXT: mla r3, r7, r4, r8
|
||||
; CHECK-NEXT: add.w r0, r11, r4
|
||||
; CHECK-NEXT: ldr r7, [sp, #24] @ 4-byte Reload
|
||||
; CHECK-NEXT: sub.w lr, r12, r4
|
||||
; CHECK-NEXT: add.w r9, r7, r0, lsl #1
|
||||
; CHECK-NEXT: ldr r7, [sp, #136]
|
||||
; CHECK-NEXT: add.w r3, r1, r3, lsl #1
|
||||
; CHECK-NEXT: .LBB10_14: @ %for.body8.us.us
|
||||
; CHECK-NEXT: @ Parent Loop BB10_5 Depth=1
|
||||
; CHECK-NEXT: @ Parent Loop BB10_8 Depth=2
|
||||
; CHECK-NEXT: @ => This Inner Loop Header: Depth=3
|
||||
; CHECK-NEXT: ldrsh.w r0, [r3]
|
||||
; CHECK-NEXT: add r3, r4
|
||||
; CHECK-NEXT: ldrsh r7, [r5], #2
|
||||
; CHECK-NEXT: smlabb r2, r0, r7, r2
|
||||
; CHECK-NEXT: ldrsh.w r4, [r3]
|
||||
; CHECK-NEXT: add r3, r6
|
||||
; CHECK-NEXT: ldrsh r0, [r9], #2
|
||||
; CHECK-NEXT: smlabb r10, r4, r0, r10
|
||||
; CHECK-NEXT: le lr, .LBB10_14
|
||||
; CHECK-NEXT: b .LBB10_7
|
||||
; CHECK-NEXT: .LBB10_15: @ Parent Loop BB10_5 Depth=1
|
||||
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
|
||||
; CHECK-NEXT: vstrb.8 q0, [r3], #16
|
||||
; CHECK-NEXT: letp lr, .LBB10_15
|
||||
; CHECK-NEXT: b .LBB10_4
|
||||
; CHECK-NEXT: .p2align 4
|
||||
; CHECK-NEXT: @ %bb.15:
|
||||
; CHECK-NEXT: @ %bb.16:
|
||||
; CHECK-NEXT: .LCPI10_0:
|
||||
; CHECK-NEXT: .long 0 @ 0x0
|
||||
; CHECK-NEXT: .long 1 @ 0x1
|
||||
|
@ -1,5 +1,5 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc -O3 -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s
|
||||
; RUN: llc -O3 -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp --arm-memtransfer-tploop=allow -verify-machineinstrs %s -o - | FileCheck %s
|
||||
|
||||
; verify-machineinstrs previously caught the incorrect use of QPR in the stack reloads.
|
||||
|
||||
@ -147,65 +147,74 @@ define dso_local i32 @e() #0 {
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr}
|
||||
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr}
|
||||
; CHECK-NEXT: .vsave {d8, d9, d10, d11}
|
||||
; CHECK-NEXT: vpush {d8, d9, d10, d11}
|
||||
; CHECK-NEXT: .pad #392
|
||||
; CHECK-NEXT: sub sp, #392
|
||||
; CHECK-NEXT: movw r9, :lower16:.L_MergedGlobals
|
||||
; CHECK-NEXT: vldr s0, .LCPI1_0
|
||||
; CHECK-NEXT: movt r9, :upper16:.L_MergedGlobals
|
||||
; CHECK-NEXT: vldr s3, .LCPI1_1
|
||||
; CHECK-NEXT: mov r7, r9
|
||||
; CHECK-NEXT: mov r5, r9
|
||||
; CHECK-NEXT: ldr r0, [r7, #4]!
|
||||
; CHECK-NEXT: movw r4, :lower16:e
|
||||
; CHECK-NEXT: ldr r1, [r5, #8]!
|
||||
; CHECK-NEXT: movt r4, :upper16:e
|
||||
; CHECK-NEXT: vmov r6, s3
|
||||
; CHECK-NEXT: vdup.32 q4, r7
|
||||
; CHECK-NEXT: vmov s1, r7
|
||||
; CHECK-NEXT: vmov q1[2], q1[0], r5, r5
|
||||
; CHECK-NEXT: vmov s9, r4
|
||||
; CHECK-NEXT: vmov q1[3], q1[1], r6, r4
|
||||
; CHECK-NEXT: vmov.f32 s2, s1
|
||||
; CHECK-NEXT: vmov q3, q4
|
||||
; CHECK-NEXT: vmov.f32 s8, s0
|
||||
; CHECK-NEXT: vmov q5, q4
|
||||
; CHECK-NEXT: vmov.f32 s10, s1
|
||||
; CHECK-NEXT: vstrw.32 q1, [sp, #76]
|
||||
; CHECK-NEXT: vmov q1[2], q1[0], r7, r6
|
||||
; CHECK-NEXT: mov.w r8, #4
|
||||
; CHECK-NEXT: mov.w r10, #0
|
||||
; CHECK-NEXT: vmov q1[3], q1[1], r7, r4
|
||||
; CHECK-NEXT: vmov.32 q3[0], r4
|
||||
; CHECK-NEXT: vmov.32 q5[1], r4
|
||||
; CHECK-NEXT: str r1, [r0]
|
||||
; CHECK-NEXT: vmov.f32 s11, s3
|
||||
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
|
||||
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
|
||||
; CHECK-NEXT: .pad #416
|
||||
; CHECK-NEXT: sub sp, #416
|
||||
; CHECK-NEXT: movw r7, :lower16:.L_MergedGlobals
|
||||
; CHECK-NEXT: vldr s12, .LCPI1_0
|
||||
; CHECK-NEXT: movt r7, :upper16:.L_MergedGlobals
|
||||
; CHECK-NEXT: vldr s15, .LCPI1_1
|
||||
; CHECK-NEXT: mov r3, r7
|
||||
; CHECK-NEXT: mov r4, r7
|
||||
; CHECK-NEXT: ldr r0, [r3, #4]!
|
||||
; CHECK-NEXT: movw r2, :lower16:e
|
||||
; CHECK-NEXT: ldr r6, [r4, #8]!
|
||||
; CHECK-NEXT: vmov r5, s15
|
||||
; CHECK-NEXT: vmov s13, r3
|
||||
; CHECK-NEXT: vmov.i32 q0, #0x0
|
||||
; CHECK-NEXT: movt r2, :upper16:e
|
||||
; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill
|
||||
; CHECK-NEXT: vmov q0[2], q0[0], r4, r4
|
||||
; CHECK-NEXT: vmov s21, r2
|
||||
; CHECK-NEXT: vmov.f32 s14, s13
|
||||
; CHECK-NEXT: vmov q0[3], q0[1], r5, r2
|
||||
; CHECK-NEXT: vmov.f32 s20, s12
|
||||
; CHECK-NEXT: vdup.32 q7, r3
|
||||
; CHECK-NEXT: vmov q6[2], q6[0], r3, r5
|
||||
; CHECK-NEXT: vmov.f32 s22, s13
|
||||
; CHECK-NEXT: vstrw.32 q0, [sp, #100]
|
||||
; CHECK-NEXT: vmov q0, q7
|
||||
; CHECK-NEXT: vmov q6[3], q6[1], r3, r2
|
||||
; CHECK-NEXT: vmov q4, q7
|
||||
; CHECK-NEXT: vmov.32 q0[0], r2
|
||||
; CHECK-NEXT: vmov.32 q7[1], r2
|
||||
; CHECK-NEXT: vmov.f32 s23, s15
|
||||
; CHECK-NEXT: movs r1, #64
|
||||
; CHECK-NEXT: strh.w r8, [sp, #390]
|
||||
; CHECK-NEXT: strd r0, r10, [sp, #24]
|
||||
; CHECK-NEXT: vstrw.32 q0, [sp, #44]
|
||||
; CHECK-NEXT: str r0, [r0]
|
||||
; CHECK-NEXT: vstrw.32 q2, [r0]
|
||||
; CHECK-NEXT: str r0, [sp, #48]
|
||||
; CHECK-NEXT: vstrw.32 q5, [r0]
|
||||
; CHECK-NEXT: vstrw.32 q3, [r0]
|
||||
; CHECK-NEXT: vstrw.32 q1, [r0]
|
||||
; CHECK-NEXT: bl __aeabi_memclr4
|
||||
; CHECK-NEXT: vmov q0[2], q0[0], r5, r7
|
||||
; CHECK-NEXT: vmov q1[2], q1[0], r7, r7
|
||||
; CHECK-NEXT: vmov q0[3], q0[1], r4, r5
|
||||
; CHECK-NEXT: vmov q1[3], q1[1], r5, r6
|
||||
; CHECK-NEXT: vmov.32 q4[0], r10
|
||||
; CHECK-NEXT: str r6, [r0]
|
||||
; CHECK-NEXT: vstrw.32 q7, [r0]
|
||||
; CHECK-NEXT: str r0, [r0]
|
||||
; CHECK-NEXT: vstrw.32 q0, [r0]
|
||||
; CHECK-NEXT: str.w r10, [r9]
|
||||
; CHECK-NEXT: vstrw.32 q4, [r0]
|
||||
; CHECK-NEXT: vstrw.32 q6, [r0]
|
||||
; CHECK-NEXT: mov.w r8, #0
|
||||
; CHECK-NEXT: vmov q1[2], q1[0], r4, r3
|
||||
; CHECK-NEXT: vmov q2[2], q2[0], r3, r3
|
||||
; CHECK-NEXT: mov.w r12, #4
|
||||
; CHECK-NEXT: vmov q1[3], q1[1], r2, r4
|
||||
; CHECK-NEXT: vmov q2[3], q2[1], r4, r5
|
||||
; CHECK-NEXT: vmov.32 q4[0], r8
|
||||
; CHECK-NEXT: @ implicit-def: $r2
|
||||
; CHECK-NEXT: str.w r8, [sp, #52]
|
||||
; CHECK-NEXT: strh.w r12, [sp, #414]
|
||||
; CHECK-NEXT: vstrw.32 q3, [sp, #68]
|
||||
; CHECK-NEXT: wlstp.8 lr, r1, .LBB1_2
|
||||
; CHECK-NEXT: .LBB1_1: @ =>This Inner Loop Header: Depth=1
|
||||
; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload
|
||||
; CHECK-NEXT: vstrb.8 q0, [r2], #16
|
||||
; CHECK-NEXT: letp lr, .LBB1_1
|
||||
; CHECK-NEXT: .LBB1_2: @ %entry
|
||||
; CHECK-NEXT: vstrw.32 q1, [r0]
|
||||
; CHECK-NEXT: str.w r8, [sp, #308]
|
||||
; CHECK-NEXT: .LBB1_1: @ %for.cond
|
||||
; CHECK-NEXT: str.w r8, [r7]
|
||||
; CHECK-NEXT: vstrw.32 q4, [r0]
|
||||
; CHECK-NEXT: vstrw.32 q2, [r0]
|
||||
; CHECK-NEXT: str.w r12, [sp, #332]
|
||||
; CHECK-NEXT: .LBB1_3: @ %for.cond
|
||||
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
||||
; CHECK-NEXT: b .LBB1_1
|
||||
; CHECK-NEXT: b .LBB1_3
|
||||
; CHECK-NEXT: .p2align 2
|
||||
; CHECK-NEXT: @ %bb.2:
|
||||
; CHECK-NEXT: @ %bb.4:
|
||||
; CHECK-NEXT: .LCPI1_0:
|
||||
; CHECK-NEXT: .long 0x00000004 @ float 5.60519386E-45
|
||||
; CHECK-NEXT: .LCPI1_1:
|
||||
|
@ -8,6 +8,7 @@
|
||||
|
||||
declare void @llvm.memcpy.p0i8.p0i8.i32(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i32, i1 immarg) #1
|
||||
declare void @llvm.memcpy.p0i8.p0i8.i64(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i64, i1 immarg) #1
|
||||
declare void @llvm.memset.p0i8.i32(i8* nocapture writeonly, i8, i32, i1 immarg)
|
||||
|
||||
define void @test1(i8* noalias nocapture %X, i8* noalias nocapture readonly %Y, i32 %n){
|
||||
; CHECK-LABEL: test1:
|
||||
@ -281,5 +282,132 @@ for.cond.cleanup: ; preds = %entry
|
||||
ret void
|
||||
}
|
||||
|
||||
; Check that WLSTP loop is generated for simplest case of align = 1
|
||||
define void @test12(i8* %X, i8 zeroext %c, i32 %n) {
|
||||
; CHECK-LABEL: test12:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: .save {r7, lr}
|
||||
; CHECK-NEXT: push {r7, lr}
|
||||
; CHECK-NEXT: vdup.8 q0, r1
|
||||
; CHECK-NEXT: wlstp.8 lr, r2, .LBB11_2
|
||||
; CHECK-NEXT: .LBB11_1: @ =>This Inner Loop Header: Depth=1
|
||||
; CHECK-NEXT: vstrb.8 q0, [r0], #16
|
||||
; CHECK-NEXT: letp lr, .LBB11_1
|
||||
; CHECK-NEXT: .LBB11_2: @ %entry
|
||||
; CHECK-NEXT: pop {r7, pc}
|
||||
; Input IR: a single memset of %n bytes (length is a function argument, not a
; constant) through an align 1 pointer. The expected assembly above splats the
; byte with vdup.8 and stores 16 bytes per iteration with a post-incrementing
; vstrb.8 inside a wlstp.8/letp loop.
entry:
|
||||
call void @llvm.memset.p0i8.i32(i8* align 1 %X, i8 %c, i32 %n, i1 false)
|
||||
ret void
|
||||
}
|
||||
|
||||
|
||||
; Check that WLSTP loop is generated for alignment >= 4
|
||||
define void @test13(i32* %X, i8 zeroext %c, i32 %n) {
|
||||
; CHECK-LABEL: test13:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: .save {r7, lr}
|
||||
; CHECK-NEXT: push {r7, lr}
|
||||
; CHECK-NEXT: vdup.8 q0, r1
|
||||
; CHECK-NEXT: wlstp.8 lr, r2, .LBB12_2
|
||||
; CHECK-NEXT: .LBB12_1: @ =>This Inner Loop Header: Depth=1
|
||||
; CHECK-NEXT: vstrb.8 q0, [r0], #16
|
||||
; CHECK-NEXT: letp lr, .LBB12_1
|
||||
; CHECK-NEXT: .LBB12_2: @ %entry
|
||||
; CHECK-NEXT: pop {r7, pc}
|
||||
; Input IR: the destination is an i32* bitcast to i8* and the memset carries
; align 4. The expected assembly above still uses byte stores (vstrb.8) with a
; 16-byte post-increment inside the wlstp.8/letp loop.
entry:
|
||||
%0 = bitcast i32* %X to i8*
|
||||
call void @llvm.memset.p0i8.i32(i8* align 4 %0, i8 %c, i32 %n, i1 false)
|
||||
ret void
|
||||
}
|
||||
|
||||
|
||||
; Checks that transform correctly handles input with some arithmetic on input arguments.
|
||||
; void test14(int* X, char c, int n)
|
||||
; {
|
||||
; memset(X+2, c, (n*2)+10);
|
||||
; }
|
||||
|
||||
define void @test14(i32* %X, i8 zeroext %c, i32 %n) {
|
||||
; CHECK-LABEL: test14:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: .save {r7, lr}
|
||||
; CHECK-NEXT: push {r7, lr}
|
||||
; CHECK-NEXT: movs r3, #10
|
||||
; CHECK-NEXT: add.w r2, r3, r2, lsl #1
|
||||
; CHECK-NEXT: vdup.8 q0, r1
|
||||
; CHECK-NEXT: adds r0, #8
|
||||
; CHECK-NEXT: wlstp.8 lr, r2, .LBB13_2
|
||||
; CHECK-NEXT: .LBB13_1: @ =>This Inner Loop Header: Depth=1
|
||||
; CHECK-NEXT: vstrb.8 q0, [r0], #16
|
||||
; CHECK-NEXT: letp lr, .LBB13_1
|
||||
; CHECK-NEXT: .LBB13_2: @ %entry
|
||||
; CHECK-NEXT: pop {r7, pc}
|
||||
; Input IR: both memset operands are computed first. The destination is %X
; advanced by two i32 elements (the "adds r0, #8" in the expected assembly) and
; the length is (%n << 1) + 10 (the "movs r3, #10" / "add.w ... lsl #1" pair).
; Only then does the expected assembly enter the wlstp.8/letp loop.
entry:
|
||||
%add.ptr = getelementptr inbounds i32, i32* %X, i32 2
|
||||
%0 = bitcast i32* %add.ptr to i8*
|
||||
%mul = shl nsw i32 %n, 1
|
||||
%add = add nsw i32 %mul, 10
|
||||
call void @llvm.memset.p0i8.i32(i8* nonnull align 4 %0, i8 %c, i32 %add, i1 false)
|
||||
ret void
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
; Checks that transform handles for-loops (that get implicitly converted to memset)
|
||||
; void test15(char* X, char c, int n){
|
||||
; for(int i = 0; i < n; ++i){
|
||||
; X[i] = c;
|
||||
; }
|
||||
; }
|
||||
|
||||
define void @test15(i8* nocapture %X, i8 zeroext %c, i32 %n) {
|
||||
; CHECK-LABEL: test15:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: cmp r2, #1
|
||||
; CHECK-NEXT: it lt
|
||||
; CHECK-NEXT: bxlt lr
|
||||
; CHECK-NEXT: .LBB14_1: @ %for.body.preheader
|
||||
; CHECK-NEXT: .save {r7, lr}
|
||||
; CHECK-NEXT: push {r7, lr}
|
||||
; CHECK-NEXT: vdup.8 q0, r1
|
||||
; CHECK-NEXT: wlstp.8 lr, r2, .LBB14_3
|
||||
; CHECK-NEXT: .LBB14_2: @ =>This Inner Loop Header: Depth=1
|
||||
; CHECK-NEXT: vstrb.8 q0, [r0], #16
|
||||
; CHECK-NEXT: letp lr, .LBB14_2
|
||||
; CHECK-NEXT: .LBB14_3: @ %for.body.preheader
|
||||
; CHECK-NEXT: pop.w {r7, lr}
|
||||
; CHECK-NEXT: bx lr
|
||||
; Input IR: the memset sits in %for.body.preheader and is only reached when
; %n > 0. The expected assembly above keeps that guard as an early return
; (cmp r2, #1 / bxlt lr) ahead of the wlstp.8/letp loop.
entry:
|
||||
%cmp4 = icmp sgt i32 %n, 0
|
||||
br i1 %cmp4, label %for.body.preheader, label %for.cond.cleanup
|
||||
|
||||
for.body.preheader: ; preds = %entry
|
||||
call void @llvm.memset.p0i8.i32(i8* align 4 %X, i8 %c, i32 %n, i1 false)
|
||||
br label %for.cond.cleanup
|
||||
|
||||
for.cond.cleanup: ; preds = %for.body.preheader, %entry
|
||||
ret void
|
||||
}
|
||||
|
||||
; Checks that the transform handles a constant 0 as the value to set. No difference in the generated loop is expected.
|
||||
define void @test16(i32* %X, i8 zeroext %c, i32 %n) {
|
||||
; CHECK-LABEL: test16:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: .save {r7, lr}
|
||||
; CHECK-NEXT: push {r7, lr}
|
||||
; CHECK-NEXT: vmov.i32 q0, #0x0
|
||||
; CHECK-NEXT: wlstp.8 lr, r2, .LBB15_2
|
||||
; CHECK-NEXT: .LBB15_1: @ =>This Inner Loop Header: Depth=1
|
||||
; CHECK-NEXT: vstrb.8 q0, [r0], #16
|
||||
; CHECK-NEXT: letp lr, .LBB15_1
|
||||
; CHECK-NEXT: .LBB15_2: @ %entry
|
||||
; CHECK-NEXT: pop {r7, pc}
|
||||
; Input IR: the memset value is the literal i8 0; the %c argument is not used.
; The expected assembly above materialises the vector with vmov.i32 q0, #0x0
; and then runs the wlstp.8/letp store loop.
entry:
|
||||
%0 = bitcast i32* %X to i8*
|
||||
call void @llvm.memset.p0i8.i32(i8* align 4 %0, i8 0, i32 %n, i1 false)
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { noinline optnone }
|
||||
attributes #1 = { optsize }
|
||||
|
@ -6,6 +6,8 @@
|
||||
|
||||
; Function Attrs: argmemonly nofree nosync nounwind willreturn
|
||||
declare void @llvm.memcpy.p0i8.p0i8.i32(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i32, i1 immarg)
|
||||
; Function Attrs: argmemonly nofree nosync nounwind willreturn writeonly
|
||||
declare void @llvm.memset.p0i8.i32(i8* nocapture writeonly, i8, i32, i1 immarg)
|
||||
|
||||
define void @test1(i32* noalias %X, i32* noalias readonly %Y, i32 %n) {
|
||||
entry:
|
||||
@ -30,6 +32,27 @@
|
||||
ret void
|
||||
}
|
||||
|
||||
; IR for the machine function named test3 in this file: one tail-called memset
; of %n bytes with align 4, no surrounding control flow.
define void @test3(i32* nocapture %X, i8 zeroext %c, i32 %n) {
|
||||
entry:
|
||||
%0 = bitcast i32* %X to i8*
|
||||
tail call void @llvm.memset.p0i8.i32(i8* align 4 %0, i8 %c, i32 %n, i1 false)
|
||||
ret void
|
||||
}
|
||||
|
||||
|
||||
; IR for the machine function named test4 in this file: an align 1 memset of
; %n bytes that is only executed when %n > 0.
define void @test4(i8* nocapture %X, i8 zeroext %c, i32 %n) {
|
||||
entry:
|
||||
%cmp4 = icmp sgt i32 %n, 0
|
||||
br i1 %cmp4, label %for.body.preheader, label %for.cond.cleanup
|
||||
|
||||
for.body.preheader: ; preds = %entry
|
||||
call void @llvm.memset.p0i8.i32(i8* align 1 %X, i8 %c, i32 %n, i1 false)
|
||||
br label %for.cond.cleanup
|
||||
|
||||
for.cond.cleanup: ; preds = %for.body.preheader, %entry
|
||||
ret void
|
||||
}
|
||||
|
||||
...
|
||||
---
|
||||
name: test1
|
||||
@ -56,7 +79,7 @@ body: |
|
||||
; CHECK: [[MVE_VCTP8_:%[0-9]+]]:vccr = MVE_VCTP8 [[PHI3]], 0, $noreg
|
||||
; CHECK: [[t2SUBri:%[0-9]+]]:rgpr = t2SUBri [[PHI3]], 16, 14 /* CC::al */, $noreg, $noreg
|
||||
; CHECK: [[MVE_VLDRBU8_post:%[0-9]+]]:rgpr, [[MVE_VLDRBU8_post1:%[0-9]+]]:mqpr = MVE_VLDRBU8_post [[PHI]], 16, 1, [[MVE_VCTP8_]]
|
||||
; CHECK: [[MVE_VSTRBU8_post:%[0-9]+]]:rgpr = MVE_VSTRBU8_post killed [[MVE_VLDRBU8_post1]], [[PHI1]], 16, 1, [[MVE_VCTP8_]]
|
||||
; CHECK: [[MVE_VSTRBU8_post:%[0-9]+]]:rgpr = MVE_VSTRBU8_post [[MVE_VLDRBU8_post1]], [[PHI1]], 16, 1, [[MVE_VCTP8_]]
|
||||
; CHECK: [[t2LoopDec:%[0-9]+]]:gprlr = t2LoopDec [[PHI2]], 1
|
||||
; CHECK: t2LoopEnd [[t2LoopDec]], %bb.1, implicit-def $cpsr
|
||||
; CHECK: t2B %bb.2, 14 /* CC::al */, $noreg
|
||||
@ -97,7 +120,7 @@ body: |
|
||||
; CHECK: [[MVE_VCTP8_:%[0-9]+]]:vccr = MVE_VCTP8 [[PHI3]], 0, $noreg
|
||||
; CHECK: [[t2SUBri:%[0-9]+]]:rgpr = t2SUBri [[PHI3]], 16, 14 /* CC::al */, $noreg, $noreg
|
||||
; CHECK: [[MVE_VLDRBU8_post:%[0-9]+]]:rgpr, [[MVE_VLDRBU8_post1:%[0-9]+]]:mqpr = MVE_VLDRBU8_post [[PHI]], 16, 1, [[MVE_VCTP8_]]
|
||||
; CHECK: [[MVE_VSTRBU8_post:%[0-9]+]]:rgpr = MVE_VSTRBU8_post killed [[MVE_VLDRBU8_post1]], [[PHI1]], 16, 1, [[MVE_VCTP8_]]
|
||||
; CHECK: [[MVE_VSTRBU8_post:%[0-9]+]]:rgpr = MVE_VSTRBU8_post [[MVE_VLDRBU8_post1]], [[PHI1]], 16, 1, [[MVE_VCTP8_]]
|
||||
; CHECK: [[t2LoopDec:%[0-9]+]]:gprlr = t2LoopDec [[PHI2]], 1
|
||||
; CHECK: t2LoopEnd [[t2LoopDec]], %bb.3, implicit-def $cpsr
|
||||
; CHECK: t2B %bb.4, 14 /* CC::al */, $noreg
|
||||
@ -125,3 +148,92 @@ body: |
|
||||
tBX_RET 14 /* CC::al */, $noreg
|
||||
|
||||
...
|
||||
---
|
||||
name: test3
|
||||
tracksRegLiveness: true
|
||||
body: |
|
||||
bb.0.entry:
|
||||
liveins: $r0, $r1, $r2
|
||||
|
||||
; Expected output: the single MVE_MEMSETLOOPINST pseudo at the bottom of this
; body is replaced by an iteration-count computation (ADD 15, BIC, LSR 4), a
; t2WhileLoopSetup/t2WhileLoopStart pair, and a loop block with three PHIs
; whose only memory operation is a VCTP8-predicated MVE_VSTRBU8_post.
; CHECK-LABEL: name: test3
|
||||
; CHECK: liveins: $r0, $r1, $r2
|
||||
; CHECK: [[COPY:%[0-9]+]]:rgpr = COPY $r2
|
||||
; CHECK: [[COPY1:%[0-9]+]]:mqpr = COPY $r1
|
||||
; CHECK: [[COPY2:%[0-9]+]]:rgpr = COPY $r0
|
||||
; CHECK: [[t2ADDri:%[0-9]+]]:rgpr = t2ADDri [[COPY]], 15, 14 /* CC::al */, $noreg, $noreg
|
||||
; NOTE(review): the expected BIC immediate below is 16, which clears bit 4 only.
; Rounding (n + 15) down to a multiple of 16 would need a mask of 15; confirm
; against the lowering code before relying on this value.
; CHECK: [[t2BICri:%[0-9]+]]:rgpr = t2BICri killed [[t2ADDri]], 16, 14 /* CC::al */, $noreg, $noreg
|
||||
; CHECK: [[t2LSRri:%[0-9]+]]:gprlr = t2LSRri killed [[t2BICri]], 4, 14 /* CC::al */, $noreg, $noreg
|
||||
; CHECK: [[t2WhileLoopSetup:%[0-9]+]]:gprlr = t2WhileLoopSetup killed [[t2LSRri]]
|
||||
; CHECK: t2WhileLoopStart [[t2WhileLoopSetup]], %bb.2, implicit-def $cpsr
|
||||
; CHECK: .1:
|
||||
; CHECK: [[PHI:%[0-9]+]]:rgpr = PHI [[COPY2]], %bb.0, %8, %bb.1
|
||||
; CHECK: [[PHI1:%[0-9]+]]:gprlr = PHI [[t2WhileLoopSetup]], %bb.0, %10, %bb.1
|
||||
; CHECK: [[PHI2:%[0-9]+]]:rgpr = PHI [[COPY]], %bb.0, %12, %bb.1
|
||||
; CHECK: [[MVE_VCTP8_:%[0-9]+]]:vccr = MVE_VCTP8 [[PHI2]], 0, $noreg
|
||||
; CHECK: [[t2SUBri:%[0-9]+]]:rgpr = t2SUBri [[PHI2]], 16, 14 /* CC::al */, $noreg, $noreg
|
||||
; CHECK: [[MVE_VSTRBU8_post:%[0-9]+]]:rgpr = MVE_VSTRBU8_post [[COPY1]], [[PHI]], 16, 1, [[MVE_VCTP8_]]
|
||||
; CHECK: [[t2LoopDec:%[0-9]+]]:gprlr = t2LoopDec [[PHI1]], 1
|
||||
; CHECK: t2LoopEnd [[t2LoopDec]], %bb.1, implicit-def $cpsr
|
||||
; CHECK: t2B %bb.2, 14 /* CC::al */, $noreg
|
||||
; CHECK: .2.entry:
|
||||
; CHECK: tBX_RET 14 /* CC::al */, $noreg
|
||||
; Test input: three argument copies feeding the memset pseudo, then return.
%2:rgpr = COPY $r2
|
||||
%1:mqpr = COPY $r1
|
||||
%0:rgpr = COPY $r0
|
||||
MVE_MEMSETLOOPINST %0, %1, %2
|
||||
tBX_RET 14 /* CC::al */, $noreg
|
||||
|
||||
...
|
||||
---
|
||||
name: test4
|
||||
alignment: 2
|
||||
tracksRegLiveness: true
|
||||
body: |
|
||||
; Expected output: the compare-and-branch guard in bb.0 is kept as is. The
; MVE_MEMSETLOOPINST pseudo in bb.1 is replaced by the iteration-count
; computation plus t2WhileLoopSetup/t2WhileLoopStart, a new loop block (bb.3)
; and a new exit block (bb.4) that branches on to bb.2.
; CHECK-LABEL: name: test4
|
||||
; CHECK: bb.0.entry:
|
||||
; CHECK: successors: %bb.1(0x50000000), %bb.2(0x30000000)
|
||||
; CHECK: liveins: $r0, $r1, $r2
|
||||
; CHECK: [[COPY:%[0-9]+]]:rgpr = COPY $r2
|
||||
; CHECK: [[COPY1:%[0-9]+]]:mqpr = COPY $r1
|
||||
; CHECK: [[COPY2:%[0-9]+]]:rgpr = COPY $r0
|
||||
; CHECK: t2CMPri [[COPY]], 1, 14 /* CC::al */, $noreg, implicit-def $cpsr
|
||||
; CHECK: t2Bcc %bb.2, 11 /* CC::lt */, $cpsr
|
||||
; CHECK: t2B %bb.1, 14 /* CC::al */, $noreg
|
||||
; CHECK: bb.1.for.body.preheader:
|
||||
; CHECK: [[t2ADDri:%[0-9]+]]:rgpr = t2ADDri [[COPY]], 15, 14 /* CC::al */, $noreg, $noreg
|
||||
; NOTE(review): the expected BIC immediate below is 16, which clears bit 4 only.
; Rounding (n + 15) down to a multiple of 16 would need a mask of 15; confirm
; against the lowering code before relying on this value.
; CHECK: [[t2BICri:%[0-9]+]]:rgpr = t2BICri killed [[t2ADDri]], 16, 14 /* CC::al */, $noreg, $noreg
|
||||
; CHECK: [[t2LSRri:%[0-9]+]]:gprlr = t2LSRri killed [[t2BICri]], 4, 14 /* CC::al */, $noreg, $noreg
|
||||
; CHECK: [[t2WhileLoopSetup:%[0-9]+]]:gprlr = t2WhileLoopSetup killed [[t2LSRri]]
|
||||
; CHECK: t2WhileLoopStart [[t2WhileLoopSetup]], %bb.4, implicit-def $cpsr
|
||||
; CHECK: bb.3:
|
||||
; CHECK: [[PHI:%[0-9]+]]:rgpr = PHI [[COPY2]], %bb.1, %8, %bb.3
|
||||
; CHECK: [[PHI1:%[0-9]+]]:gprlr = PHI [[t2WhileLoopSetup]], %bb.1, %10, %bb.3
|
||||
; CHECK: [[PHI2:%[0-9]+]]:rgpr = PHI [[COPY]], %bb.1, %12, %bb.3
|
||||
; CHECK: [[MVE_VCTP8_:%[0-9]+]]:vccr = MVE_VCTP8 [[PHI2]], 0, $noreg
|
||||
; CHECK: [[t2SUBri:%[0-9]+]]:rgpr = t2SUBri [[PHI2]], 16, 14 /* CC::al */, $noreg, $noreg
|
||||
; CHECK: [[MVE_VSTRBU8_post:%[0-9]+]]:rgpr = MVE_VSTRBU8_post [[COPY1]], [[PHI]], 16, 1, [[MVE_VCTP8_]]
|
||||
; CHECK: [[t2LoopDec:%[0-9]+]]:gprlr = t2LoopDec [[PHI1]], 1
|
||||
; CHECK: t2LoopEnd [[t2LoopDec]], %bb.3, implicit-def $cpsr
|
||||
; CHECK: t2B %bb.4, 14 /* CC::al */, $noreg
|
||||
; CHECK: bb.4.for.body.preheader:
|
||||
; CHECK: t2B %bb.2, 14 /* CC::al */, $noreg
|
||||
; CHECK: bb.2.for.cond.cleanup:
|
||||
; CHECK: tBX_RET 14 /* CC::al */, $noreg
|
||||
; Test input: bb.0 skips to bb.2 when the length in $r2 is less than 1;
; otherwise bb.1 holds only the memset pseudo and has no explicit terminator.
bb.0.entry:
|
||||
successors: %bb.1(0x50000000), %bb.2(0x30000000)
|
||||
liveins: $r0, $r1, $r2
|
||||
|
||||
%2:rgpr = COPY $r2
|
||||
%1:mqpr = COPY $r1
|
||||
%0:rgpr = COPY $r0
|
||||
t2CMPri %2, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr
|
||||
t2Bcc %bb.2, 11 /* CC::lt */, $cpsr
|
||||
t2B %bb.1, 14 /* CC::al */, $noreg
|
||||
|
||||
bb.1.for.body.preheader:
|
||||
MVE_MEMSETLOOPINST %0, %1, %2
|
||||
|
||||
bb.2.for.cond.cleanup:
|
||||
tBX_RET 14 /* CC::al */, $noreg
|
||||
|
||||
...
|
||||
|
Loading…
x
Reference in New Issue
Block a user