mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2025-01-31 20:51:52 +01:00
Add codegen support for using post-increment NEON load/store instructions.
The vld1-lane, vld1-dup and vst1-lane instructions do not yet have post-increment versions supported, but all the other NEON load/store instructions should now be handled.
llvm-svn: 125014
This commit is contained in:
parent
46b105c6a2
commit
65f4a70b82
@ -196,26 +196,30 @@ private:
|
||||
/// 1, 2, 3 or 4. The opcode arrays specify the instructions used for
|
||||
/// loads of D registers and even subregs and odd subregs of Q registers.
|
||||
/// For NumVecs <= 2, QOpcodes1 is not used.
|
||||
SDNode *SelectVLD(SDNode *N, unsigned NumVecs, unsigned *DOpcodes,
|
||||
SDNode *SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs,
|
||||
unsigned *DOpcodes,
|
||||
unsigned *QOpcodes0, unsigned *QOpcodes1);
|
||||
|
||||
/// SelectVST - Select NEON store intrinsics. NumVecs should
|
||||
/// be 1, 2, 3 or 4. The opcode arrays specify the instructions used for
|
||||
/// stores of D registers and even subregs and odd subregs of Q registers.
|
||||
/// For NumVecs <= 2, QOpcodes1 is not used.
|
||||
SDNode *SelectVST(SDNode *N, unsigned NumVecs, unsigned *DOpcodes,
|
||||
SDNode *SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs,
|
||||
unsigned *DOpcodes,
|
||||
unsigned *QOpcodes0, unsigned *QOpcodes1);
|
||||
|
||||
/// SelectVLDSTLane - Select NEON load/store lane intrinsics. NumVecs should
|
||||
/// be 2, 3 or 4. The opcode arrays specify the instructions used for
|
||||
/// load/store of D registers and Q registers.
|
||||
SDNode *SelectVLDSTLane(SDNode *N, bool IsLoad, unsigned NumVecs,
|
||||
SDNode *SelectVLDSTLane(SDNode *N, bool IsLoad,
|
||||
bool isUpdating, unsigned NumVecs,
|
||||
unsigned *DOpcodes, unsigned *QOpcodes);
|
||||
|
||||
/// SelectVLDDup - Select NEON load-duplicate intrinsics. NumVecs
|
||||
/// should be 2, 3 or 4. The opcode array specifies the instructions used
|
||||
/// for loading D registers. (Q registers are not supported.)
|
||||
SDNode *SelectVLDDup(SDNode *N, unsigned NumVecs, unsigned *Opcodes);
|
||||
SDNode *SelectVLDDup(SDNode *N, bool isUpdating, unsigned NumVecs,
|
||||
unsigned *Opcodes);
|
||||
|
||||
/// SelectVTBL - Select NEON VTBL and VTBX intrinsics. NumVecs should be 2,
|
||||
/// 3 or 4. These are custom-selected so that a REG_SEQUENCE can be
|
||||
@ -1439,14 +1443,15 @@ SDValue ARMDAGToDAGISel::GetVLDSTAlign(SDValue Align, unsigned NumVecs,
|
||||
return CurDAG->getTargetConstant(Alignment, MVT::i32);
|
||||
}
|
||||
|
||||
SDNode *ARMDAGToDAGISel::SelectVLD(SDNode *N, unsigned NumVecs,
|
||||
SDNode *ARMDAGToDAGISel::SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs,
|
||||
unsigned *DOpcodes, unsigned *QOpcodes0,
|
||||
unsigned *QOpcodes1) {
|
||||
assert(NumVecs >= 1 && NumVecs <= 4 && "VLD NumVecs out-of-range");
|
||||
DebugLoc dl = N->getDebugLoc();
|
||||
|
||||
SDValue MemAddr, Align;
|
||||
if (!SelectAddrMode6(N, N->getOperand(2), MemAddr, Align))
|
||||
unsigned AddrOpIdx = isUpdating ? 1 : 2;
|
||||
if (!SelectAddrMode6(N, N->getOperand(AddrOpIdx), MemAddr, Align))
|
||||
return NULL;
|
||||
|
||||
SDValue Chain = N->getOperand(0);
|
||||
@ -1482,46 +1487,39 @@ SDNode *ARMDAGToDAGISel::SelectVLD(SDNode *N, unsigned NumVecs,
|
||||
ResTyElts *= 2;
|
||||
ResTy = EVT::getVectorVT(*CurDAG->getContext(), MVT::i64, ResTyElts);
|
||||
}
|
||||
std::vector<EVT> ResTys;
|
||||
ResTys.push_back(ResTy);
|
||||
if (isUpdating)
|
||||
ResTys.push_back(MVT::i32);
|
||||
ResTys.push_back(MVT::Other);
|
||||
|
||||
SDValue Pred = getAL(CurDAG);
|
||||
SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
|
||||
SDValue SuperReg;
|
||||
if (is64BitVector) {
|
||||
const SDValue Ops[] = { MemAddr, Align, Pred, Reg0, Chain };
|
||||
SDNode *VLd = CurDAG->getMachineNode(DOpcodes[OpcodeIndex], dl,
|
||||
ResTy, MVT::Other, Ops, 5);
|
||||
if (NumVecs == 1)
|
||||
return VLd;
|
||||
SDNode *VLd;
|
||||
SmallVector<SDValue, 7> Ops;
|
||||
|
||||
SuperReg = SDValue(VLd, 0);
|
||||
assert(ARM::dsub_7 == ARM::dsub_0+7 && "Unexpected subreg numbering");
|
||||
for (unsigned Vec = 0; Vec < NumVecs; ++Vec) {
|
||||
SDValue D = CurDAG->getTargetExtractSubreg(ARM::dsub_0+Vec,
|
||||
dl, VT, SuperReg);
|
||||
ReplaceUses(SDValue(N, Vec), D);
|
||||
// Double registers and VLD1/VLD2 quad registers are directly supported.
|
||||
if (is64BitVector || NumVecs <= 2) {
|
||||
unsigned Opc = (is64BitVector ? DOpcodes[OpcodeIndex] :
|
||||
QOpcodes0[OpcodeIndex]);
|
||||
Ops.push_back(MemAddr);
|
||||
Ops.push_back(Align);
|
||||
if (isUpdating) {
|
||||
SDValue Inc = N->getOperand(AddrOpIdx + 1);
|
||||
Ops.push_back(isa<ConstantSDNode>(Inc.getNode()) ? Reg0 : Inc);
|
||||
}
|
||||
ReplaceUses(SDValue(N, NumVecs), SDValue(VLd, 1));
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (NumVecs <= 2) {
|
||||
// Quad registers are directly supported for VLD1 and VLD2,
|
||||
// loading pairs of D regs.
|
||||
const SDValue Ops[] = { MemAddr, Align, Pred, Reg0, Chain };
|
||||
SDNode *VLd = CurDAG->getMachineNode(QOpcodes0[OpcodeIndex], dl,
|
||||
ResTy, MVT::Other, Ops, 5);
|
||||
if (NumVecs == 1)
|
||||
return VLd;
|
||||
|
||||
SuperReg = SDValue(VLd, 0);
|
||||
Chain = SDValue(VLd, 1);
|
||||
Ops.push_back(Pred);
|
||||
Ops.push_back(Reg0);
|
||||
Ops.push_back(Chain);
|
||||
VLd = CurDAG->getMachineNode(Opc, dl, ResTys, Ops.data(), Ops.size());
|
||||
|
||||
} else {
|
||||
// Otherwise, quad registers are loaded with two separate instructions,
|
||||
// where one loads the even registers and the other loads the odd registers.
|
||||
EVT AddrTy = MemAddr.getValueType();
|
||||
|
||||
// Load the even subregs.
|
||||
// Load the even subregs. This is always an updating load, so that it
|
||||
// provides the address to the second load for the odd subregs.
|
||||
SDValue ImplDef =
|
||||
SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, dl, ResTy), 0);
|
||||
const SDValue OpsA[] = { MemAddr, Align, Reg0, ImplDef, Pred, Reg0, Chain };
|
||||
@ -1530,37 +1528,54 @@ SDNode *ARMDAGToDAGISel::SelectVLD(SDNode *N, unsigned NumVecs,
|
||||
Chain = SDValue(VLdA, 2);
|
||||
|
||||
// Load the odd subregs.
|
||||
const SDValue OpsB[] = { SDValue(VLdA, 1), Align, SDValue(VLdA, 0),
|
||||
Pred, Reg0, Chain };
|
||||
SDNode *VLdB = CurDAG->getMachineNode(QOpcodes1[OpcodeIndex], dl,
|
||||
ResTy, MVT::Other, OpsB, 6);
|
||||
SuperReg = SDValue(VLdB, 0);
|
||||
Chain = SDValue(VLdB, 1);
|
||||
Ops.push_back(SDValue(VLdA, 1));
|
||||
Ops.push_back(Align);
|
||||
if (isUpdating) {
|
||||
SDValue Inc = N->getOperand(AddrOpIdx + 1);
|
||||
assert(isa<ConstantSDNode>(Inc.getNode()) &&
|
||||
"only constant post-increment update allowed for VLD3/4");
|
||||
(void)Inc;
|
||||
Ops.push_back(Reg0);
|
||||
}
|
||||
Ops.push_back(SDValue(VLdA, 0));
|
||||
Ops.push_back(Pred);
|
||||
Ops.push_back(Reg0);
|
||||
Ops.push_back(Chain);
|
||||
VLd = CurDAG->getMachineNode(QOpcodes1[OpcodeIndex], dl, ResTys,
|
||||
Ops.data(), Ops.size());
|
||||
}
|
||||
|
||||
// Extract out the Q registers.
|
||||
assert(ARM::qsub_3 == ARM::qsub_0+3 && "Unexpected subreg numbering");
|
||||
for (unsigned Vec = 0; Vec < NumVecs; ++Vec) {
|
||||
SDValue Q = CurDAG->getTargetExtractSubreg(ARM::qsub_0+Vec,
|
||||
dl, VT, SuperReg);
|
||||
ReplaceUses(SDValue(N, Vec), Q);
|
||||
}
|
||||
ReplaceUses(SDValue(N, NumVecs), Chain);
|
||||
if (NumVecs == 1)
|
||||
return VLd;
|
||||
|
||||
// Extract out the subregisters.
|
||||
SDValue SuperReg = SDValue(VLd, 0);
|
||||
assert(ARM::dsub_7 == ARM::dsub_0+7 &&
|
||||
ARM::qsub_3 == ARM::qsub_0+3 && "Unexpected subreg numbering");
|
||||
unsigned Sub0 = (is64BitVector ? ARM::dsub_0 : ARM::qsub_0);
|
||||
for (unsigned Vec = 0; Vec < NumVecs; ++Vec)
|
||||
ReplaceUses(SDValue(N, Vec),
|
||||
CurDAG->getTargetExtractSubreg(Sub0 + Vec, dl, VT, SuperReg));
|
||||
ReplaceUses(SDValue(N, NumVecs), SDValue(VLd, 1));
|
||||
if (isUpdating)
|
||||
ReplaceUses(SDValue(N, NumVecs + 1), SDValue(VLd, 2));
|
||||
return NULL;
|
||||
}
|
||||
|
||||
SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, unsigned NumVecs,
|
||||
SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs,
|
||||
unsigned *DOpcodes, unsigned *QOpcodes0,
|
||||
unsigned *QOpcodes1) {
|
||||
assert(NumVecs >= 1 && NumVecs <= 4 && "VST NumVecs out-of-range");
|
||||
DebugLoc dl = N->getDebugLoc();
|
||||
|
||||
SDValue MemAddr, Align;
|
||||
if (!SelectAddrMode6(N, N->getOperand(2), MemAddr, Align))
|
||||
unsigned AddrOpIdx = isUpdating ? 1 : 2;
|
||||
unsigned Vec0Idx = 3; // AddrOpIdx + (isUpdating ? 2 : 1)
|
||||
if (!SelectAddrMode6(N, N->getOperand(AddrOpIdx), MemAddr, Align))
|
||||
return NULL;
|
||||
|
||||
SDValue Chain = N->getOperand(0);
|
||||
EVT VT = N->getOperand(3).getValueType();
|
||||
EVT VT = N->getOperand(Vec0Idx).getValueType();
|
||||
bool is64BitVector = VT.is64BitVector();
|
||||
Align = GetVLDSTAlign(Align, NumVecs, is64BitVector);
|
||||
|
||||
@ -1583,64 +1598,71 @@ SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, unsigned NumVecs,
|
||||
break;
|
||||
}
|
||||
|
||||
std::vector<EVT> ResTys;
|
||||
if (isUpdating)
|
||||
ResTys.push_back(MVT::i32);
|
||||
ResTys.push_back(MVT::Other);
|
||||
|
||||
SDValue Pred = getAL(CurDAG);
|
||||
SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
|
||||
SmallVector<SDValue, 7> Ops;
|
||||
|
||||
if (is64BitVector) {
|
||||
// Double registers and VST1/VST2 quad registers are directly supported.
|
||||
if (is64BitVector || NumVecs <= 2) {
|
||||
SDValue SrcReg;
|
||||
if (NumVecs == 1) {
|
||||
SrcReg = N->getOperand(3);
|
||||
} else {
|
||||
SDValue V0 = N->getOperand(0+3);
|
||||
SDValue V1 = N->getOperand(1+3);
|
||||
|
||||
SrcReg = N->getOperand(Vec0Idx);
|
||||
} else if (is64BitVector) {
|
||||
// Form a REG_SEQUENCE to force register allocation.
|
||||
SDValue V0 = N->getOperand(Vec0Idx + 0);
|
||||
SDValue V1 = N->getOperand(Vec0Idx + 1);
|
||||
if (NumVecs == 2)
|
||||
SrcReg = SDValue(PairDRegs(MVT::v2i64, V0, V1), 0);
|
||||
else {
|
||||
SDValue V2 = N->getOperand(2+3);
|
||||
SDValue V2 = N->getOperand(Vec0Idx + 2);
|
||||
// If it's a vst3, form a quad D-register and leave the last part as
|
||||
// an undef.
|
||||
SDValue V3 = (NumVecs == 3)
|
||||
? SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,dl,VT), 0)
|
||||
: N->getOperand(3+3);
|
||||
: N->getOperand(Vec0Idx + 3);
|
||||
SrcReg = SDValue(QuadDRegs(MVT::v4i64, V0, V1, V2, V3), 0);
|
||||
}
|
||||
}
|
||||
const SDValue Ops[] = { MemAddr, Align, SrcReg, Pred, Reg0, Chain };
|
||||
return CurDAG->getMachineNode(DOpcodes[OpcodeIndex], dl,
|
||||
MVT::Other, Ops, 6);
|
||||
}
|
||||
|
||||
if (NumVecs <= 2) {
|
||||
// Quad registers are directly supported for VST1 and VST2.
|
||||
SDValue SrcReg;
|
||||
if (NumVecs == 1) {
|
||||
SrcReg = N->getOperand(3);
|
||||
} else {
|
||||
// Form a QQ register.
|
||||
SDValue Q0 = N->getOperand(3);
|
||||
SDValue Q1 = N->getOperand(4);
|
||||
SDValue Q0 = N->getOperand(Vec0Idx);
|
||||
SDValue Q1 = N->getOperand(Vec0Idx + 1);
|
||||
SrcReg = SDValue(PairQRegs(MVT::v4i64, Q0, Q1), 0);
|
||||
}
|
||||
const SDValue Ops[] = { MemAddr, Align, SrcReg, Pred, Reg0, Chain };
|
||||
return CurDAG->getMachineNode(QOpcodes0[OpcodeIndex], dl,
|
||||
MVT::Other, Ops, 6);
|
||||
|
||||
unsigned Opc = (is64BitVector ? DOpcodes[OpcodeIndex] :
|
||||
QOpcodes0[OpcodeIndex]);
|
||||
Ops.push_back(MemAddr);
|
||||
Ops.push_back(Align);
|
||||
if (isUpdating) {
|
||||
SDValue Inc = N->getOperand(AddrOpIdx + 1);
|
||||
Ops.push_back(isa<ConstantSDNode>(Inc.getNode()) ? Reg0 : Inc);
|
||||
}
|
||||
Ops.push_back(SrcReg);
|
||||
Ops.push_back(Pred);
|
||||
Ops.push_back(Reg0);
|
||||
Ops.push_back(Chain);
|
||||
return CurDAG->getMachineNode(Opc, dl, ResTys, Ops.data(), Ops.size());
|
||||
}
|
||||
|
||||
// Otherwise, quad registers are stored with two separate instructions,
|
||||
// where one stores the even registers and the other stores the odd registers.
|
||||
|
||||
// Form the QQQQ REG_SEQUENCE.
|
||||
SDValue V0 = N->getOperand(0+3);
|
||||
SDValue V1 = N->getOperand(1+3);
|
||||
SDValue V2 = N->getOperand(2+3);
|
||||
SDValue V0 = N->getOperand(Vec0Idx + 0);
|
||||
SDValue V1 = N->getOperand(Vec0Idx + 1);
|
||||
SDValue V2 = N->getOperand(Vec0Idx + 2);
|
||||
SDValue V3 = (NumVecs == 3)
|
||||
? SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, dl, VT), 0)
|
||||
: N->getOperand(3+3);
|
||||
: N->getOperand(Vec0Idx + 3);
|
||||
SDValue RegSeq = SDValue(QuadQRegs(MVT::v8i64, V0, V1, V2, V3), 0);
|
||||
|
||||
// Store the even D registers.
|
||||
// Store the even D registers. This is always an updating store, so that it
|
||||
// provides the address to the second store for the odd subregs.
|
||||
const SDValue OpsA[] = { MemAddr, Align, Reg0, RegSeq, Pred, Reg0, Chain };
|
||||
SDNode *VStA = CurDAG->getMachineNode(QOpcodes0[OpcodeIndex], dl,
|
||||
MemAddr.getValueType(),
|
||||
@ -1648,28 +1670,40 @@ SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, unsigned NumVecs,
|
||||
Chain = SDValue(VStA, 1);
|
||||
|
||||
// Store the odd D registers.
|
||||
const SDValue OpsB[] = { SDValue(VStA, 0), Align, RegSeq, Pred, Reg0, Chain };
|
||||
SDNode *VStB = CurDAG->getMachineNode(QOpcodes1[OpcodeIndex], dl,
|
||||
MVT::Other, OpsB, 6);
|
||||
Chain = SDValue(VStB, 0);
|
||||
ReplaceUses(SDValue(N, 0), Chain);
|
||||
return NULL;
|
||||
Ops.push_back(SDValue(VStA, 0));
|
||||
Ops.push_back(Align);
|
||||
if (isUpdating) {
|
||||
SDValue Inc = N->getOperand(AddrOpIdx + 1);
|
||||
assert(isa<ConstantSDNode>(Inc.getNode()) &&
|
||||
"only constant post-increment update allowed for VST3/4");
|
||||
(void)Inc;
|
||||
Ops.push_back(Reg0);
|
||||
}
|
||||
Ops.push_back(RegSeq);
|
||||
Ops.push_back(Pred);
|
||||
Ops.push_back(Reg0);
|
||||
Ops.push_back(Chain);
|
||||
return CurDAG->getMachineNode(QOpcodes1[OpcodeIndex], dl, ResTys,
|
||||
Ops.data(), Ops.size());
|
||||
}
|
||||
|
||||
SDNode *ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad,
|
||||
unsigned NumVecs, unsigned *DOpcodes,
|
||||
bool isUpdating, unsigned NumVecs,
|
||||
unsigned *DOpcodes,
|
||||
unsigned *QOpcodes) {
|
||||
assert(NumVecs >=2 && NumVecs <= 4 && "VLDSTLane NumVecs out-of-range");
|
||||
DebugLoc dl = N->getDebugLoc();
|
||||
|
||||
SDValue MemAddr, Align;
|
||||
if (!SelectAddrMode6(N, N->getOperand(2), MemAddr, Align))
|
||||
unsigned AddrOpIdx = isUpdating ? 1 : 2;
|
||||
unsigned Vec0Idx = 3; // AddrOpIdx + (isUpdating ? 2 : 1)
|
||||
if (!SelectAddrMode6(N, N->getOperand(AddrOpIdx), MemAddr, Align))
|
||||
return NULL;
|
||||
|
||||
SDValue Chain = N->getOperand(0);
|
||||
unsigned Lane =
|
||||
cast<ConstantSDNode>(N->getOperand(NumVecs+3))->getZExtValue();
|
||||
EVT VT = IsLoad ? N->getValueType(0) : N->getOperand(3).getValueType();
|
||||
cast<ConstantSDNode>(N->getOperand(Vec0Idx + NumVecs))->getZExtValue();
|
||||
EVT VT = N->getOperand(Vec0Idx).getValueType();
|
||||
bool is64BitVector = VT.is64BitVector();
|
||||
|
||||
unsigned Alignment = 0;
|
||||
@ -1701,29 +1735,42 @@ SDNode *ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad,
|
||||
case MVT::v4i32: OpcodeIndex = 1; break;
|
||||
}
|
||||
|
||||
std::vector<EVT> ResTys;
|
||||
if (IsLoad) {
|
||||
unsigned ResTyElts = (NumVecs == 3) ? 4 : NumVecs;
|
||||
if (!is64BitVector)
|
||||
ResTyElts *= 2;
|
||||
ResTys.push_back(EVT::getVectorVT(*CurDAG->getContext(),
|
||||
MVT::i64, ResTyElts));
|
||||
}
|
||||
if (isUpdating)
|
||||
ResTys.push_back(MVT::i32);
|
||||
ResTys.push_back(MVT::Other);
|
||||
|
||||
SDValue Pred = getAL(CurDAG);
|
||||
SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
|
||||
|
||||
SmallVector<SDValue, 7> Ops;
|
||||
SmallVector<SDValue, 8> Ops;
|
||||
Ops.push_back(MemAddr);
|
||||
Ops.push_back(Align);
|
||||
|
||||
unsigned Opc = (is64BitVector ? DOpcodes[OpcodeIndex] :
|
||||
QOpcodes[OpcodeIndex]);
|
||||
if (isUpdating) {
|
||||
SDValue Inc = N->getOperand(AddrOpIdx + 1);
|
||||
Ops.push_back(isa<ConstantSDNode>(Inc.getNode()) ? Reg0 : Inc);
|
||||
}
|
||||
|
||||
SDValue SuperReg;
|
||||
SDValue V0 = N->getOperand(0+3);
|
||||
SDValue V1 = N->getOperand(1+3);
|
||||
SDValue V0 = N->getOperand(Vec0Idx + 0);
|
||||
SDValue V1 = N->getOperand(Vec0Idx + 1);
|
||||
if (NumVecs == 2) {
|
||||
if (is64BitVector)
|
||||
SuperReg = SDValue(PairDRegs(MVT::v2i64, V0, V1), 0);
|
||||
else
|
||||
SuperReg = SDValue(PairQRegs(MVT::v4i64, V0, V1), 0);
|
||||
} else {
|
||||
SDValue V2 = N->getOperand(2+3);
|
||||
SDValue V2 = N->getOperand(Vec0Idx + 2);
|
||||
SDValue V3 = (NumVecs == 3)
|
||||
? SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,dl,VT), 0)
|
||||
: N->getOperand(3+3);
|
||||
? SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, dl, VT), 0)
|
||||
: N->getOperand(Vec0Idx + 3);
|
||||
if (is64BitVector)
|
||||
SuperReg = SDValue(QuadDRegs(MVT::v4i64, V0, V1, V2, V3), 0);
|
||||
else
|
||||
@ -1735,33 +1782,29 @@ SDNode *ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad,
|
||||
Ops.push_back(Reg0);
|
||||
Ops.push_back(Chain);
|
||||
|
||||
unsigned Opc = (is64BitVector ? DOpcodes[OpcodeIndex] :
|
||||
QOpcodes[OpcodeIndex]);
|
||||
SDNode *VLdLn = CurDAG->getMachineNode(Opc, dl, ResTys,
|
||||
Ops.data(), Ops.size());
|
||||
if (!IsLoad)
|
||||
return CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops.data(), 7);
|
||||
|
||||
EVT ResTy;
|
||||
unsigned ResTyElts = (NumVecs == 3) ? 4 : NumVecs;
|
||||
if (!is64BitVector)
|
||||
ResTyElts *= 2;
|
||||
ResTy = EVT::getVectorVT(*CurDAG->getContext(), MVT::i64, ResTyElts);
|
||||
|
||||
SDNode *VLdLn = CurDAG->getMachineNode(Opc, dl, ResTy, MVT::Other,
|
||||
Ops.data(), 7);
|
||||
SuperReg = SDValue(VLdLn, 0);
|
||||
Chain = SDValue(VLdLn, 1);
|
||||
return VLdLn;
|
||||
|
||||
// Extract the subregisters.
|
||||
assert(ARM::dsub_7 == ARM::dsub_0+7 && "Unexpected subreg numbering");
|
||||
assert(ARM::qsub_3 == ARM::qsub_0+3 && "Unexpected subreg numbering");
|
||||
unsigned SubIdx = is64BitVector ? ARM::dsub_0 : ARM::qsub_0;
|
||||
SuperReg = SDValue(VLdLn, 0);
|
||||
assert(ARM::dsub_7 == ARM::dsub_0+7 &&
|
||||
ARM::qsub_3 == ARM::qsub_0+3 && "Unexpected subreg numbering");
|
||||
unsigned Sub0 = is64BitVector ? ARM::dsub_0 : ARM::qsub_0;
|
||||
for (unsigned Vec = 0; Vec < NumVecs; ++Vec)
|
||||
ReplaceUses(SDValue(N, Vec),
|
||||
CurDAG->getTargetExtractSubreg(SubIdx+Vec, dl, VT, SuperReg));
|
||||
ReplaceUses(SDValue(N, NumVecs), Chain);
|
||||
CurDAG->getTargetExtractSubreg(Sub0 + Vec, dl, VT, SuperReg));
|
||||
ReplaceUses(SDValue(N, NumVecs), SDValue(VLdLn, 1));
|
||||
if (isUpdating)
|
||||
ReplaceUses(SDValue(N, NumVecs + 1), SDValue(VLdLn, 2));
|
||||
return NULL;
|
||||
}
|
||||
|
||||
SDNode *ARMDAGToDAGISel::SelectVLDDup(SDNode *N, unsigned NumVecs,
|
||||
unsigned *Opcodes) {
|
||||
SDNode *ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool isUpdating,
|
||||
unsigned NumVecs, unsigned *Opcodes) {
|
||||
assert(NumVecs >=2 && NumVecs <= 4 && "VLDDup NumVecs out-of-range");
|
||||
DebugLoc dl = N->getDebugLoc();
|
||||
|
||||
@ -1800,13 +1843,26 @@ SDNode *ARMDAGToDAGISel::SelectVLDDup(SDNode *N, unsigned NumVecs,
|
||||
SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
|
||||
SDValue SuperReg;
|
||||
unsigned Opc = Opcodes[OpcodeIndex];
|
||||
const SDValue Ops[] = { MemAddr, Align, Pred, Reg0, Chain };
|
||||
SmallVector<SDValue, 6> Ops;
|
||||
Ops.push_back(MemAddr);
|
||||
Ops.push_back(Align);
|
||||
if (isUpdating) {
|
||||
SDValue Inc = N->getOperand(2);
|
||||
Ops.push_back(isa<ConstantSDNode>(Inc.getNode()) ? Reg0 : Inc);
|
||||
}
|
||||
Ops.push_back(Pred);
|
||||
Ops.push_back(Reg0);
|
||||
Ops.push_back(Chain);
|
||||
|
||||
unsigned ResTyElts = (NumVecs == 3) ? 4 : NumVecs;
|
||||
EVT ResTy = EVT::getVectorVT(*CurDAG->getContext(), MVT::i64, ResTyElts);
|
||||
SDNode *VLdDup = CurDAG->getMachineNode(Opc, dl, ResTy, MVT::Other, Ops, 5);
|
||||
std::vector<EVT> ResTys;
|
||||
ResTys.push_back(EVT::getVectorVT(*CurDAG->getContext(), MVT::i64, ResTyElts));
|
||||
if (isUpdating)
|
||||
ResTys.push_back(MVT::i32);
|
||||
ResTys.push_back(MVT::Other);
|
||||
SDNode *VLdDup =
|
||||
CurDAG->getMachineNode(Opc, dl, ResTys, Ops.data(), Ops.size());
|
||||
SuperReg = SDValue(VLdDup, 0);
|
||||
Chain = SDValue(VLdDup, 1);
|
||||
|
||||
// Extract the subregisters.
|
||||
assert(ARM::dsub_7 == ARM::dsub_0+7 && "Unexpected subreg numbering");
|
||||
@ -1814,7 +1870,9 @@ SDNode *ARMDAGToDAGISel::SelectVLDDup(SDNode *N, unsigned NumVecs,
|
||||
for (unsigned Vec = 0; Vec < NumVecs; ++Vec)
|
||||
ReplaceUses(SDValue(N, Vec),
|
||||
CurDAG->getTargetExtractSubreg(SubIdx+Vec, dl, VT, SuperReg));
|
||||
ReplaceUses(SDValue(N, NumVecs), Chain);
|
||||
ReplaceUses(SDValue(N, NumVecs), SDValue(VLdDup, 1));
|
||||
if (isUpdating)
|
||||
ReplaceUses(SDValue(N, NumVecs + 1), SDValue(VLdDup, 2));
|
||||
return NULL;
|
||||
}
|
||||
|
||||
@ -2470,19 +2528,165 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
|
||||
case ARMISD::VLD2DUP: {
|
||||
unsigned Opcodes[] = { ARM::VLD2DUPd8Pseudo, ARM::VLD2DUPd16Pseudo,
|
||||
ARM::VLD2DUPd32Pseudo };
|
||||
return SelectVLDDup(N, 2, Opcodes);
|
||||
return SelectVLDDup(N, false, 2, Opcodes);
|
||||
}
|
||||
|
||||
case ARMISD::VLD3DUP: {
|
||||
unsigned Opcodes[] = { ARM::VLD3DUPd8Pseudo, ARM::VLD3DUPd16Pseudo,
|
||||
ARM::VLD3DUPd32Pseudo };
|
||||
return SelectVLDDup(N, 3, Opcodes);
|
||||
return SelectVLDDup(N, false, 3, Opcodes);
|
||||
}
|
||||
|
||||
case ARMISD::VLD4DUP: {
|
||||
unsigned Opcodes[] = { ARM::VLD4DUPd8Pseudo, ARM::VLD4DUPd16Pseudo,
|
||||
ARM::VLD4DUPd32Pseudo };
|
||||
return SelectVLDDup(N, 4, Opcodes);
|
||||
return SelectVLDDup(N, false, 4, Opcodes);
|
||||
}
|
||||
|
||||
case ARMISD::VLD2DUP_UPD: {
|
||||
unsigned Opcodes[] = { ARM::VLD2DUPd8Pseudo_UPD, ARM::VLD2DUPd16Pseudo_UPD,
|
||||
ARM::VLD2DUPd32Pseudo_UPD };
|
||||
return SelectVLDDup(N, true, 2, Opcodes);
|
||||
}
|
||||
|
||||
case ARMISD::VLD3DUP_UPD: {
|
||||
unsigned Opcodes[] = { ARM::VLD3DUPd8Pseudo_UPD, ARM::VLD3DUPd16Pseudo_UPD,
|
||||
ARM::VLD3DUPd32Pseudo_UPD };
|
||||
return SelectVLDDup(N, true, 3, Opcodes);
|
||||
}
|
||||
|
||||
case ARMISD::VLD4DUP_UPD: {
|
||||
unsigned Opcodes[] = { ARM::VLD4DUPd8Pseudo_UPD, ARM::VLD4DUPd16Pseudo_UPD,
|
||||
ARM::VLD4DUPd32Pseudo_UPD };
|
||||
return SelectVLDDup(N, true, 4, Opcodes);
|
||||
}
|
||||
|
||||
case ARMISD::VLD1_UPD: {
|
||||
unsigned DOpcodes[] = { ARM::VLD1d8_UPD, ARM::VLD1d16_UPD,
|
||||
ARM::VLD1d32_UPD, ARM::VLD1d64_UPD };
|
||||
unsigned QOpcodes[] = { ARM::VLD1q8Pseudo_UPD, ARM::VLD1q16Pseudo_UPD,
|
||||
ARM::VLD1q32Pseudo_UPD, ARM::VLD1q64Pseudo_UPD };
|
||||
return SelectVLD(N, true, 1, DOpcodes, QOpcodes, 0);
|
||||
}
|
||||
|
||||
case ARMISD::VLD2_UPD: {
|
||||
unsigned DOpcodes[] = { ARM::VLD2d8Pseudo_UPD, ARM::VLD2d16Pseudo_UPD,
|
||||
ARM::VLD2d32Pseudo_UPD, ARM::VLD1q64Pseudo_UPD };
|
||||
unsigned QOpcodes[] = { ARM::VLD2q8Pseudo_UPD, ARM::VLD2q16Pseudo_UPD,
|
||||
ARM::VLD2q32Pseudo_UPD };
|
||||
return SelectVLD(N, true, 2, DOpcodes, QOpcodes, 0);
|
||||
}
|
||||
|
||||
case ARMISD::VLD3_UPD: {
|
||||
unsigned DOpcodes[] = { ARM::VLD3d8Pseudo_UPD, ARM::VLD3d16Pseudo_UPD,
|
||||
ARM::VLD3d32Pseudo_UPD, ARM::VLD1d64TPseudo_UPD };
|
||||
unsigned QOpcodes0[] = { ARM::VLD3q8Pseudo_UPD,
|
||||
ARM::VLD3q16Pseudo_UPD,
|
||||
ARM::VLD3q32Pseudo_UPD };
|
||||
unsigned QOpcodes1[] = { ARM::VLD3q8oddPseudo_UPD,
|
||||
ARM::VLD3q16oddPseudo_UPD,
|
||||
ARM::VLD3q32oddPseudo_UPD };
|
||||
return SelectVLD(N, true, 3, DOpcodes, QOpcodes0, QOpcodes1);
|
||||
}
|
||||
|
||||
case ARMISD::VLD4_UPD: {
|
||||
unsigned DOpcodes[] = { ARM::VLD4d8Pseudo_UPD, ARM::VLD4d16Pseudo_UPD,
|
||||
ARM::VLD4d32Pseudo_UPD, ARM::VLD1d64QPseudo_UPD };
|
||||
unsigned QOpcodes0[] = { ARM::VLD4q8Pseudo_UPD,
|
||||
ARM::VLD4q16Pseudo_UPD,
|
||||
ARM::VLD4q32Pseudo_UPD };
|
||||
unsigned QOpcodes1[] = { ARM::VLD4q8oddPseudo_UPD,
|
||||
ARM::VLD4q16oddPseudo_UPD,
|
||||
ARM::VLD4q32oddPseudo_UPD };
|
||||
return SelectVLD(N, true, 4, DOpcodes, QOpcodes0, QOpcodes1);
|
||||
}
|
||||
|
||||
case ARMISD::VLD2LN_UPD: {
|
||||
unsigned DOpcodes[] = { ARM::VLD2LNd8Pseudo_UPD, ARM::VLD2LNd16Pseudo_UPD,
|
||||
ARM::VLD2LNd32Pseudo_UPD };
|
||||
unsigned QOpcodes[] = { ARM::VLD2LNq16Pseudo_UPD,
|
||||
ARM::VLD2LNq32Pseudo_UPD };
|
||||
return SelectVLDSTLane(N, true, true, 2, DOpcodes, QOpcodes);
|
||||
}
|
||||
|
||||
case ARMISD::VLD3LN_UPD: {
|
||||
unsigned DOpcodes[] = { ARM::VLD3LNd8Pseudo_UPD, ARM::VLD3LNd16Pseudo_UPD,
|
||||
ARM::VLD3LNd32Pseudo_UPD };
|
||||
unsigned QOpcodes[] = { ARM::VLD3LNq16Pseudo_UPD,
|
||||
ARM::VLD3LNq32Pseudo_UPD };
|
||||
return SelectVLDSTLane(N, true, true, 3, DOpcodes, QOpcodes);
|
||||
}
|
||||
|
||||
case ARMISD::VLD4LN_UPD: {
|
||||
unsigned DOpcodes[] = { ARM::VLD4LNd8Pseudo_UPD, ARM::VLD4LNd16Pseudo_UPD,
|
||||
ARM::VLD4LNd32Pseudo_UPD };
|
||||
unsigned QOpcodes[] = { ARM::VLD4LNq16Pseudo_UPD,
|
||||
ARM::VLD4LNq32Pseudo_UPD };
|
||||
return SelectVLDSTLane(N, true, true, 4, DOpcodes, QOpcodes);
|
||||
}
|
||||
|
||||
case ARMISD::VST1_UPD: {
|
||||
unsigned DOpcodes[] = { ARM::VST1d8_UPD, ARM::VST1d16_UPD,
|
||||
ARM::VST1d32_UPD, ARM::VST1d64_UPD };
|
||||
unsigned QOpcodes[] = { ARM::VST1q8Pseudo_UPD, ARM::VST1q16Pseudo_UPD,
|
||||
ARM::VST1q32Pseudo_UPD, ARM::VST1q64Pseudo_UPD };
|
||||
return SelectVST(N, true, 1, DOpcodes, QOpcodes, 0);
|
||||
}
|
||||
|
||||
case ARMISD::VST2_UPD: {
|
||||
unsigned DOpcodes[] = { ARM::VST2d8Pseudo_UPD, ARM::VST2d16Pseudo_UPD,
|
||||
ARM::VST2d32Pseudo_UPD, ARM::VST1q64Pseudo_UPD };
|
||||
unsigned QOpcodes[] = { ARM::VST2q8Pseudo_UPD, ARM::VST2q16Pseudo_UPD,
|
||||
ARM::VST2q32Pseudo_UPD };
|
||||
return SelectVST(N, true, 2, DOpcodes, QOpcodes, 0);
|
||||
}
|
||||
|
||||
case ARMISD::VST3_UPD: {
|
||||
unsigned DOpcodes[] = { ARM::VST3d8Pseudo_UPD, ARM::VST3d16Pseudo_UPD,
|
||||
ARM::VST3d32Pseudo_UPD, ARM::VST1d64TPseudo_UPD };
|
||||
unsigned QOpcodes0[] = { ARM::VST3q8Pseudo_UPD,
|
||||
ARM::VST3q16Pseudo_UPD,
|
||||
ARM::VST3q32Pseudo_UPD };
|
||||
unsigned QOpcodes1[] = { ARM::VST3q8oddPseudo_UPD,
|
||||
ARM::VST3q16oddPseudo_UPD,
|
||||
ARM::VST3q32oddPseudo_UPD };
|
||||
return SelectVST(N, true, 3, DOpcodes, QOpcodes0, QOpcodes1);
|
||||
}
|
||||
|
||||
case ARMISD::VST4_UPD: {
|
||||
unsigned DOpcodes[] = { ARM::VST4d8Pseudo_UPD, ARM::VST4d16Pseudo_UPD,
|
||||
ARM::VST4d32Pseudo_UPD, ARM::VST1d64QPseudo_UPD };
|
||||
unsigned QOpcodes0[] = { ARM::VST4q8Pseudo_UPD,
|
||||
ARM::VST4q16Pseudo_UPD,
|
||||
ARM::VST4q32Pseudo_UPD };
|
||||
unsigned QOpcodes1[] = { ARM::VST4q8oddPseudo_UPD,
|
||||
ARM::VST4q16oddPseudo_UPD,
|
||||
ARM::VST4q32oddPseudo_UPD };
|
||||
return SelectVST(N, true, 4, DOpcodes, QOpcodes0, QOpcodes1);
|
||||
}
|
||||
|
||||
case ARMISD::VST2LN_UPD: {
|
||||
unsigned DOpcodes[] = { ARM::VST2LNd8Pseudo_UPD, ARM::VST2LNd16Pseudo_UPD,
|
||||
ARM::VST2LNd32Pseudo_UPD };
|
||||
unsigned QOpcodes[] = { ARM::VST2LNq16Pseudo_UPD,
|
||||
ARM::VST2LNq32Pseudo_UPD };
|
||||
return SelectVLDSTLane(N, false, true, 2, DOpcodes, QOpcodes);
|
||||
}
|
||||
|
||||
case ARMISD::VST3LN_UPD: {
|
||||
unsigned DOpcodes[] = { ARM::VST3LNd8Pseudo_UPD, ARM::VST3LNd16Pseudo_UPD,
|
||||
ARM::VST3LNd32Pseudo_UPD };
|
||||
unsigned QOpcodes[] = { ARM::VST3LNq16Pseudo_UPD,
|
||||
ARM::VST3LNq32Pseudo_UPD };
|
||||
return SelectVLDSTLane(N, false, true, 3, DOpcodes, QOpcodes);
|
||||
}
|
||||
|
||||
case ARMISD::VST4LN_UPD: {
|
||||
unsigned DOpcodes[] = { ARM::VST4LNd8Pseudo_UPD, ARM::VST4LNd16Pseudo_UPD,
|
||||
ARM::VST4LNd32Pseudo_UPD };
|
||||
unsigned QOpcodes[] = { ARM::VST4LNq16Pseudo_UPD,
|
||||
ARM::VST4LNq32Pseudo_UPD };
|
||||
return SelectVLDSTLane(N, false, true, 4, DOpcodes, QOpcodes);
|
||||
}
|
||||
|
||||
case ISD::INTRINSIC_VOID:
|
||||
@ -2497,7 +2701,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
|
||||
ARM::VLD1d32, ARM::VLD1d64 };
|
||||
unsigned QOpcodes[] = { ARM::VLD1q8Pseudo, ARM::VLD1q16Pseudo,
|
||||
ARM::VLD1q32Pseudo, ARM::VLD1q64Pseudo };
|
||||
return SelectVLD(N, 1, DOpcodes, QOpcodes, 0);
|
||||
return SelectVLD(N, false, 1, DOpcodes, QOpcodes, 0);
|
||||
}
|
||||
|
||||
case Intrinsic::arm_neon_vld2: {
|
||||
@ -2505,7 +2709,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
|
||||
ARM::VLD2d32Pseudo, ARM::VLD1q64Pseudo };
|
||||
unsigned QOpcodes[] = { ARM::VLD2q8Pseudo, ARM::VLD2q16Pseudo,
|
||||
ARM::VLD2q32Pseudo };
|
||||
return SelectVLD(N, 2, DOpcodes, QOpcodes, 0);
|
||||
return SelectVLD(N, false, 2, DOpcodes, QOpcodes, 0);
|
||||
}
|
||||
|
||||
case Intrinsic::arm_neon_vld3: {
|
||||
@ -2517,7 +2721,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
|
||||
unsigned QOpcodes1[] = { ARM::VLD3q8oddPseudo,
|
||||
ARM::VLD3q16oddPseudo,
|
||||
ARM::VLD3q32oddPseudo };
|
||||
return SelectVLD(N, 3, DOpcodes, QOpcodes0, QOpcodes1);
|
||||
return SelectVLD(N, false, 3, DOpcodes, QOpcodes0, QOpcodes1);
|
||||
}
|
||||
|
||||
case Intrinsic::arm_neon_vld4: {
|
||||
@ -2529,28 +2733,28 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
|
||||
unsigned QOpcodes1[] = { ARM::VLD4q8oddPseudo,
|
||||
ARM::VLD4q16oddPseudo,
|
||||
ARM::VLD4q32oddPseudo };
|
||||
return SelectVLD(N, 4, DOpcodes, QOpcodes0, QOpcodes1);
|
||||
return SelectVLD(N, false, 4, DOpcodes, QOpcodes0, QOpcodes1);
|
||||
}
|
||||
|
||||
case Intrinsic::arm_neon_vld2lane: {
|
||||
unsigned DOpcodes[] = { ARM::VLD2LNd8Pseudo, ARM::VLD2LNd16Pseudo,
|
||||
ARM::VLD2LNd32Pseudo };
|
||||
unsigned QOpcodes[] = { ARM::VLD2LNq16Pseudo, ARM::VLD2LNq32Pseudo };
|
||||
return SelectVLDSTLane(N, true, 2, DOpcodes, QOpcodes);
|
||||
return SelectVLDSTLane(N, true, false, 2, DOpcodes, QOpcodes);
|
||||
}
|
||||
|
||||
case Intrinsic::arm_neon_vld3lane: {
|
||||
unsigned DOpcodes[] = { ARM::VLD3LNd8Pseudo, ARM::VLD3LNd16Pseudo,
|
||||
ARM::VLD3LNd32Pseudo };
|
||||
unsigned QOpcodes[] = { ARM::VLD3LNq16Pseudo, ARM::VLD3LNq32Pseudo };
|
||||
return SelectVLDSTLane(N, true, 3, DOpcodes, QOpcodes);
|
||||
return SelectVLDSTLane(N, true, false, 3, DOpcodes, QOpcodes);
|
||||
}
|
||||
|
||||
case Intrinsic::arm_neon_vld4lane: {
|
||||
unsigned DOpcodes[] = { ARM::VLD4LNd8Pseudo, ARM::VLD4LNd16Pseudo,
|
||||
ARM::VLD4LNd32Pseudo };
|
||||
unsigned QOpcodes[] = { ARM::VLD4LNq16Pseudo, ARM::VLD4LNq32Pseudo };
|
||||
return SelectVLDSTLane(N, true, 4, DOpcodes, QOpcodes);
|
||||
return SelectVLDSTLane(N, true, false, 4, DOpcodes, QOpcodes);
|
||||
}
|
||||
|
||||
case Intrinsic::arm_neon_vst1: {
|
||||
@ -2558,7 +2762,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
|
||||
ARM::VST1d32, ARM::VST1d64 };
|
||||
unsigned QOpcodes[] = { ARM::VST1q8Pseudo, ARM::VST1q16Pseudo,
|
||||
ARM::VST1q32Pseudo, ARM::VST1q64Pseudo };
|
||||
return SelectVST(N, 1, DOpcodes, QOpcodes, 0);
|
||||
return SelectVST(N, false, 1, DOpcodes, QOpcodes, 0);
|
||||
}
|
||||
|
||||
case Intrinsic::arm_neon_vst2: {
|
||||
@ -2566,7 +2770,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
|
||||
ARM::VST2d32Pseudo, ARM::VST1q64Pseudo };
|
||||
unsigned QOpcodes[] = { ARM::VST2q8Pseudo, ARM::VST2q16Pseudo,
|
||||
ARM::VST2q32Pseudo };
|
||||
return SelectVST(N, 2, DOpcodes, QOpcodes, 0);
|
||||
return SelectVST(N, false, 2, DOpcodes, QOpcodes, 0);
|
||||
}
|
||||
|
||||
case Intrinsic::arm_neon_vst3: {
|
||||
@ -2578,7 +2782,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
|
||||
unsigned QOpcodes1[] = { ARM::VST3q8oddPseudo,
|
||||
ARM::VST3q16oddPseudo,
|
||||
ARM::VST3q32oddPseudo };
|
||||
return SelectVST(N, 3, DOpcodes, QOpcodes0, QOpcodes1);
|
||||
return SelectVST(N, false, 3, DOpcodes, QOpcodes0, QOpcodes1);
|
||||
}
|
||||
|
||||
case Intrinsic::arm_neon_vst4: {
|
||||
@ -2590,28 +2794,28 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
|
||||
unsigned QOpcodes1[] = { ARM::VST4q8oddPseudo,
|
||||
ARM::VST4q16oddPseudo,
|
||||
ARM::VST4q32oddPseudo };
|
||||
return SelectVST(N, 4, DOpcodes, QOpcodes0, QOpcodes1);
|
||||
return SelectVST(N, false, 4, DOpcodes, QOpcodes0, QOpcodes1);
|
||||
}
|
||||
|
||||
case Intrinsic::arm_neon_vst2lane: {
|
||||
unsigned DOpcodes[] = { ARM::VST2LNd8Pseudo, ARM::VST2LNd16Pseudo,
|
||||
ARM::VST2LNd32Pseudo };
|
||||
unsigned QOpcodes[] = { ARM::VST2LNq16Pseudo, ARM::VST2LNq32Pseudo };
|
||||
return SelectVLDSTLane(N, false, 2, DOpcodes, QOpcodes);
|
||||
return SelectVLDSTLane(N, false, false, 2, DOpcodes, QOpcodes);
|
||||
}
|
||||
|
||||
case Intrinsic::arm_neon_vst3lane: {
|
||||
unsigned DOpcodes[] = { ARM::VST3LNd8Pseudo, ARM::VST3LNd16Pseudo,
|
||||
ARM::VST3LNd32Pseudo };
|
||||
unsigned QOpcodes[] = { ARM::VST3LNq16Pseudo, ARM::VST3LNq32Pseudo };
|
||||
return SelectVLDSTLane(N, false, 3, DOpcodes, QOpcodes);
|
||||
return SelectVLDSTLane(N, false, false, 3, DOpcodes, QOpcodes);
|
||||
}
|
||||
|
||||
case Intrinsic::arm_neon_vst4lane: {
|
||||
unsigned DOpcodes[] = { ARM::VST4LNd8Pseudo, ARM::VST4LNd16Pseudo,
|
||||
ARM::VST4LNd32Pseudo };
|
||||
unsigned QOpcodes[] = { ARM::VST4LNq16Pseudo, ARM::VST4LNq32Pseudo };
|
||||
return SelectVLDSTLane(N, false, 4, DOpcodes, QOpcodes);
|
||||
return SelectVLDSTLane(N, false, false, 4, DOpcodes, QOpcodes);
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
@ -457,6 +457,8 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
|
||||
setOperationAction(ISD::VSETCC, MVT::v1i64, Expand);
|
||||
setOperationAction(ISD::VSETCC, MVT::v2i64, Expand);
|
||||
|
||||
setTargetDAGCombine(ISD::INTRINSIC_VOID);
|
||||
setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
|
||||
setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
|
||||
setTargetDAGCombine(ISD::SHL);
|
||||
setTargetDAGCombine(ISD::SRL);
|
||||
@ -857,6 +859,23 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
|
||||
case ARMISD::VLD2DUP: return "ARMISD::VLD2DUP";
|
||||
case ARMISD::VLD3DUP: return "ARMISD::VLD3DUP";
|
||||
case ARMISD::VLD4DUP: return "ARMISD::VLD4DUP";
|
||||
case ARMISD::VLD1_UPD: return "ARMISD::VLD1_UPD";
|
||||
case ARMISD::VLD2_UPD: return "ARMISD::VLD2_UPD";
|
||||
case ARMISD::VLD3_UPD: return "ARMISD::VLD3_UPD";
|
||||
case ARMISD::VLD4_UPD: return "ARMISD::VLD4_UPD";
|
||||
case ARMISD::VLD2LN_UPD: return "ARMISD::VLD2LN_UPD";
|
||||
case ARMISD::VLD3LN_UPD: return "ARMISD::VLD3LN_UPD";
|
||||
case ARMISD::VLD4LN_UPD: return "ARMISD::VLD4LN_UPD";
|
||||
case ARMISD::VLD2DUP_UPD: return "ARMISD::VLD2DUP_UPD";
|
||||
case ARMISD::VLD3DUP_UPD: return "ARMISD::VLD3DUP_UPD";
|
||||
case ARMISD::VLD4DUP_UPD: return "ARMISD::VLD4DUP_UPD";
|
||||
case ARMISD::VST1_UPD: return "ARMISD::VST1_UPD";
|
||||
case ARMISD::VST2_UPD: return "ARMISD::VST2_UPD";
|
||||
case ARMISD::VST3_UPD: return "ARMISD::VST3_UPD";
|
||||
case ARMISD::VST4_UPD: return "ARMISD::VST4_UPD";
|
||||
case ARMISD::VST2LN_UPD: return "ARMISD::VST2LN_UPD";
|
||||
case ARMISD::VST3LN_UPD: return "ARMISD::VST3LN_UPD";
|
||||
case ARMISD::VST4LN_UPD: return "ARMISD::VST4LN_UPD";
|
||||
}
|
||||
}
|
||||
|
||||
@ -5210,6 +5229,138 @@ static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) {
|
||||
DAG.getUNDEF(VT), NewMask.data());
|
||||
}
|
||||
|
||||
/// CombineBaseUpdate - Target-specific DAG combine function for VLDDUP and
|
||||
/// NEON load/store intrinsics to merge base address updates.
|
||||
static SDValue CombineBaseUpdate(SDNode *N,
|
||||
TargetLowering::DAGCombinerInfo &DCI) {
|
||||
if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
|
||||
return SDValue();
|
||||
|
||||
SelectionDAG &DAG = DCI.DAG;
|
||||
bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID ||
|
||||
N->getOpcode() == ISD::INTRINSIC_W_CHAIN);
|
||||
unsigned AddrOpIdx = (isIntrinsic ? 2 : 1);
|
||||
SDValue Addr = N->getOperand(AddrOpIdx);
|
||||
|
||||
// Search for a use of the address operand that is an increment.
|
||||
for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
|
||||
UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
|
||||
SDNode *User = *UI;
|
||||
if (User->getOpcode() != ISD::ADD ||
|
||||
UI.getUse().getResNo() != Addr.getResNo())
|
||||
continue;
|
||||
|
||||
// Check that the add is independent of the load/store. Otherwise, folding
|
||||
// it would create a cycle.
|
||||
if (User->isPredecessorOf(N) || N->isPredecessorOf(User))
|
||||
continue;
|
||||
|
||||
// Find the new opcode for the updating load/store.
|
||||
bool isLoad = true;
|
||||
bool isLaneOp = false;
|
||||
unsigned NewOpc = 0;
|
||||
unsigned NumVecs = 0;
|
||||
if (isIntrinsic) {
|
||||
unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
|
||||
switch (IntNo) {
|
||||
default: assert(0 && "unexpected intrinsic for Neon base update");
|
||||
case Intrinsic::arm_neon_vld1: NewOpc = ARMISD::VLD1_UPD;
|
||||
NumVecs = 1; break;
|
||||
case Intrinsic::arm_neon_vld2: NewOpc = ARMISD::VLD2_UPD;
|
||||
NumVecs = 2; break;
|
||||
case Intrinsic::arm_neon_vld3: NewOpc = ARMISD::VLD3_UPD;
|
||||
NumVecs = 3; break;
|
||||
case Intrinsic::arm_neon_vld4: NewOpc = ARMISD::VLD4_UPD;
|
||||
NumVecs = 4; break;
|
||||
case Intrinsic::arm_neon_vld2lane: NewOpc = ARMISD::VLD2LN_UPD;
|
||||
NumVecs = 2; isLaneOp = true; break;
|
||||
case Intrinsic::arm_neon_vld3lane: NewOpc = ARMISD::VLD3LN_UPD;
|
||||
NumVecs = 3; isLaneOp = true; break;
|
||||
case Intrinsic::arm_neon_vld4lane: NewOpc = ARMISD::VLD4LN_UPD;
|
||||
NumVecs = 4; isLaneOp = true; break;
|
||||
case Intrinsic::arm_neon_vst1: NewOpc = ARMISD::VST1_UPD;
|
||||
NumVecs = 1; isLoad = false; break;
|
||||
case Intrinsic::arm_neon_vst2: NewOpc = ARMISD::VST2_UPD;
|
||||
NumVecs = 2; isLoad = false; break;
|
||||
case Intrinsic::arm_neon_vst3: NewOpc = ARMISD::VST3_UPD;
|
||||
NumVecs = 3; isLoad = false; break;
|
||||
case Intrinsic::arm_neon_vst4: NewOpc = ARMISD::VST4_UPD;
|
||||
NumVecs = 4; isLoad = false; break;
|
||||
case Intrinsic::arm_neon_vst2lane: NewOpc = ARMISD::VST2LN_UPD;
|
||||
NumVecs = 2; isLoad = false; isLaneOp = true; break;
|
||||
case Intrinsic::arm_neon_vst3lane: NewOpc = ARMISD::VST3LN_UPD;
|
||||
NumVecs = 3; isLoad = false; isLaneOp = true; break;
|
||||
case Intrinsic::arm_neon_vst4lane: NewOpc = ARMISD::VST4LN_UPD;
|
||||
NumVecs = 4; isLoad = false; isLaneOp = true; break;
|
||||
}
|
||||
} else {
|
||||
isLaneOp = true;
|
||||
switch (N->getOpcode()) {
|
||||
default: assert(0 && "unexpected opcode for Neon base update");
|
||||
case ARMISD::VLD2DUP: NewOpc = ARMISD::VLD2DUP_UPD; NumVecs = 2; break;
|
||||
case ARMISD::VLD3DUP: NewOpc = ARMISD::VLD3DUP_UPD; NumVecs = 3; break;
|
||||
case ARMISD::VLD4DUP: NewOpc = ARMISD::VLD4DUP_UPD; NumVecs = 4; break;
|
||||
}
|
||||
}
|
||||
|
||||
// Find the size of memory referenced by the load/store.
|
||||
EVT VecTy;
|
||||
if (isLoad)
|
||||
VecTy = N->getValueType(0);
|
||||
else
|
||||
VecTy = N->getOperand(AddrOpIdx+1).getValueType();
|
||||
unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
|
||||
if (isLaneOp)
|
||||
NumBytes /= VecTy.getVectorNumElements();
|
||||
|
||||
// If the increment is a constant, it must match the memory ref size.
|
||||
SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
|
||||
if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
|
||||
uint64_t IncVal = CInc->getZExtValue();
|
||||
if (IncVal != NumBytes)
|
||||
continue;
|
||||
} else if (NumBytes >= 3 * 16) {
|
||||
// VLD3/4 and VST3/4 for 128-bit vectors are implemented with two
|
||||
// separate instructions that make it harder to use a non-constant update.
|
||||
continue;
|
||||
}
|
||||
|
||||
// Create the new updating load/store node.
|
||||
EVT Tys[6];
|
||||
unsigned NumResultVecs = (isLoad ? NumVecs : 0);
|
||||
unsigned n;
|
||||
for (n = 0; n < NumResultVecs; ++n)
|
||||
Tys[n] = VecTy;
|
||||
Tys[n++] = MVT::i32;
|
||||
Tys[n] = MVT::Other;
|
||||
SDVTList SDTys = DAG.getVTList(Tys, NumResultVecs+2);
|
||||
SmallVector<SDValue, 8> Ops;
|
||||
Ops.push_back(N->getOperand(0)); // incoming chain
|
||||
Ops.push_back(N->getOperand(AddrOpIdx));
|
||||
Ops.push_back(Inc);
|
||||
for (unsigned i = AddrOpIdx + 1; i < N->getNumOperands(); ++i) {
|
||||
Ops.push_back(N->getOperand(i));
|
||||
}
|
||||
MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N);
|
||||
SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, N->getDebugLoc(), SDTys,
|
||||
Ops.data(), Ops.size(),
|
||||
MemInt->getMemoryVT(),
|
||||
MemInt->getMemOperand());
|
||||
|
||||
// Update the uses.
|
||||
std::vector<SDValue> NewResults;
|
||||
for (unsigned i = 0; i < NumResultVecs; ++i) {
|
||||
NewResults.push_back(SDValue(UpdN.getNode(), i));
|
||||
}
|
||||
NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs+1)); // chain
|
||||
DCI.CombineTo(N, NewResults);
|
||||
DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
|
||||
|
||||
break;
|
||||
}
|
||||
return SDValue();
|
||||
}
|
||||
|
||||
/// CombineVLDDUP - For a VDUPLANE node N, check if its source operand is a
|
||||
/// vldN-lane (N > 1) intrinsic, and if all the other uses of that intrinsic
|
||||
/// are also VDUPLANEs. If so, combine them to a vldN-dup operation and
|
||||
@ -5720,6 +5871,31 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
|
||||
case ISD::ZERO_EXTEND:
|
||||
case ISD::ANY_EXTEND: return PerformExtendCombine(N, DCI.DAG, Subtarget);
|
||||
case ISD::SELECT_CC: return PerformSELECT_CCCombine(N, DCI.DAG, Subtarget);
|
||||
case ARMISD::VLD2DUP:
|
||||
case ARMISD::VLD3DUP:
|
||||
case ARMISD::VLD4DUP:
|
||||
return CombineBaseUpdate(N, DCI);
|
||||
case ISD::INTRINSIC_VOID:
|
||||
case ISD::INTRINSIC_W_CHAIN:
|
||||
switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
|
||||
case Intrinsic::arm_neon_vld1:
|
||||
case Intrinsic::arm_neon_vld2:
|
||||
case Intrinsic::arm_neon_vld3:
|
||||
case Intrinsic::arm_neon_vld4:
|
||||
case Intrinsic::arm_neon_vld2lane:
|
||||
case Intrinsic::arm_neon_vld3lane:
|
||||
case Intrinsic::arm_neon_vld4lane:
|
||||
case Intrinsic::arm_neon_vst1:
|
||||
case Intrinsic::arm_neon_vst2:
|
||||
case Intrinsic::arm_neon_vst3:
|
||||
case Intrinsic::arm_neon_vst4:
|
||||
case Intrinsic::arm_neon_vst2lane:
|
||||
case Intrinsic::arm_neon_vst3lane:
|
||||
case Intrinsic::arm_neon_vst4lane:
|
||||
return CombineBaseUpdate(N, DCI);
|
||||
default: break;
|
||||
}
|
||||
break;
|
||||
}
|
||||
return SDValue();
|
||||
}
|
||||
|
@ -181,7 +181,28 @@ namespace llvm {
|
||||
// Vector load N-element structure to all lanes:
|
||||
VLD2DUP = ISD::FIRST_TARGET_MEMORY_OPCODE,
|
||||
VLD3DUP,
|
||||
VLD4DUP
|
||||
VLD4DUP,
|
||||
|
||||
// NEON loads with post-increment base updates:
|
||||
VLD1_UPD,
|
||||
VLD2_UPD,
|
||||
VLD3_UPD,
|
||||
VLD4_UPD,
|
||||
VLD2LN_UPD,
|
||||
VLD3LN_UPD,
|
||||
VLD4LN_UPD,
|
||||
VLD2DUP_UPD,
|
||||
VLD3DUP_UPD,
|
||||
VLD4DUP_UPD,
|
||||
|
||||
// NEON stores with post-increment base updates:
|
||||
VST1_UPD,
|
||||
VST2_UPD,
|
||||
VST3_UPD,
|
||||
VST4_UPD,
|
||||
VST2LN_UPD,
|
||||
VST3LN_UPD,
|
||||
VST4LN_UPD
|
||||
};
|
||||
}
|
||||
|
||||
|
@ -16,6 +16,18 @@ define <4 x i16> @vld1i16(i16* %A) nounwind {
|
||||
ret <4 x i16> %tmp1
|
||||
}
|
||||
|
||||
;Check for a post-increment updating load.
|
||||
define <4 x i16> @vld1i16_update(i16** %ptr) nounwind {
|
||||
;CHECK: vld1i16_update:
|
||||
;CHECK: vld1.16 {d16}, [r1]!
|
||||
%A = load i16** %ptr
|
||||
%tmp0 = bitcast i16* %A to i8*
|
||||
%tmp1 = call <4 x i16> @llvm.arm.neon.vld1.v4i16(i8* %tmp0, i32 1)
|
||||
%tmp2 = getelementptr i16* %A, i32 4
|
||||
store i16* %tmp2, i16** %ptr
|
||||
ret <4 x i16> %tmp1
|
||||
}
|
||||
|
||||
define <2 x i32> @vld1i32(i32* %A) nounwind {
|
||||
;CHECK: vld1i32:
|
||||
;CHECK: vld1.32
|
||||
@ -24,6 +36,18 @@ define <2 x i32> @vld1i32(i32* %A) nounwind {
|
||||
ret <2 x i32> %tmp1
|
||||
}
|
||||
|
||||
;Check for a post-increment updating load with register increment.
|
||||
define <2 x i32> @vld1i32_update(i32** %ptr, i32 %inc) nounwind {
|
||||
;CHECK: vld1i32_update:
|
||||
;CHECK: vld1.32 {d16}, [r2], r1
|
||||
%A = load i32** %ptr
|
||||
%tmp0 = bitcast i32* %A to i8*
|
||||
%tmp1 = call <2 x i32> @llvm.arm.neon.vld1.v2i32(i8* %tmp0, i32 1)
|
||||
%tmp2 = getelementptr i32* %A, i32 %inc
|
||||
store i32* %tmp2, i32** %ptr
|
||||
ret <2 x i32> %tmp1
|
||||
}
|
||||
|
||||
define <2 x float> @vld1f(float* %A) nounwind {
|
||||
;CHECK: vld1f:
|
||||
;CHECK: vld1.32
|
||||
@ -48,6 +72,17 @@ define <16 x i8> @vld1Qi8(i8* %A) nounwind {
|
||||
ret <16 x i8> %tmp1
|
||||
}
|
||||
|
||||
;Check for a post-increment updating load.
|
||||
define <16 x i8> @vld1Qi8_update(i8** %ptr) nounwind {
|
||||
;CHECK: vld1Qi8_update:
|
||||
;CHECK: vld1.8 {d16, d17}, [r1, :64]!
|
||||
%A = load i8** %ptr
|
||||
%tmp1 = call <16 x i8> @llvm.arm.neon.vld1.v16i8(i8* %A, i32 8)
|
||||
%tmp2 = getelementptr i8* %A, i32 16
|
||||
store i8* %tmp2, i8** %ptr
|
||||
ret <16 x i8> %tmp1
|
||||
}
|
||||
|
||||
define <8 x i16> @vld1Qi16(i16* %A) nounwind {
|
||||
;CHECK: vld1Qi16:
|
||||
;Check the alignment value. Max for this instruction is 128 bits:
|
||||
|
@ -56,6 +56,21 @@ define <2 x float> @vld2f(float* %A) nounwind {
|
||||
ret <2 x float> %tmp4
|
||||
}
|
||||
|
||||
;Check for a post-increment updating load.
|
||||
define <2 x float> @vld2f_update(float** %ptr) nounwind {
|
||||
;CHECK: vld2f_update:
|
||||
;CHECK: vld2.32 {d16, d17}, [r1]!
|
||||
%A = load float** %ptr
|
||||
%tmp0 = bitcast float* %A to i8*
|
||||
%tmp1 = call %struct.__neon_float32x2x2_t @llvm.arm.neon.vld2.v2f32(i8* %tmp0, i32 1)
|
||||
%tmp2 = extractvalue %struct.__neon_float32x2x2_t %tmp1, 0
|
||||
%tmp3 = extractvalue %struct.__neon_float32x2x2_t %tmp1, 1
|
||||
%tmp4 = fadd <2 x float> %tmp2, %tmp3
|
||||
%tmp5 = getelementptr float* %A, i32 4
|
||||
store float* %tmp5, float** %ptr
|
||||
ret <2 x float> %tmp4
|
||||
}
|
||||
|
||||
define <1 x i64> @vld2i64(i64* %A) nounwind {
|
||||
;CHECK: vld2i64:
|
||||
;Check the alignment value. Max for this instruction is 128 bits:
|
||||
@ -79,6 +94,20 @@ define <16 x i8> @vld2Qi8(i8* %A) nounwind {
|
||||
ret <16 x i8> %tmp4
|
||||
}
|
||||
|
||||
;Check for a post-increment updating load with register increment.
|
||||
define <16 x i8> @vld2Qi8_update(i8** %ptr, i32 %inc) nounwind {
|
||||
;CHECK: vld2Qi8_update:
|
||||
;CHECK: vld2.8 {d16, d17, d18, d19}, [r2, :128], r1
|
||||
%A = load i8** %ptr
|
||||
%tmp1 = call %struct.__neon_int8x16x2_t @llvm.arm.neon.vld2.v16i8(i8* %A, i32 16)
|
||||
%tmp2 = extractvalue %struct.__neon_int8x16x2_t %tmp1, 0
|
||||
%tmp3 = extractvalue %struct.__neon_int8x16x2_t %tmp1, 1
|
||||
%tmp4 = add <16 x i8> %tmp2, %tmp3
|
||||
%tmp5 = getelementptr i8* %A, i32 %inc
|
||||
store i8* %tmp5, i8** %ptr
|
||||
ret <16 x i8> %tmp4
|
||||
}
|
||||
|
||||
define <8 x i16> @vld2Qi16(i16* %A) nounwind {
|
||||
;CHECK: vld2Qi16:
|
||||
;Check the alignment value. Max for this instruction is 256 bits:
|
||||
|
@ -33,6 +33,21 @@ define <4 x i16> @vld3i16(i16* %A) nounwind {
|
||||
ret <4 x i16> %tmp4
|
||||
}
|
||||
|
||||
;Check for a post-increment updating load with register increment.
|
||||
define <4 x i16> @vld3i16_update(i16** %ptr, i32 %inc) nounwind {
|
||||
;CHECK: vld3i16_update:
|
||||
;CHECK: vld3.16 {d16, d17, d18}, [r2], r1
|
||||
%A = load i16** %ptr
|
||||
%tmp0 = bitcast i16* %A to i8*
|
||||
%tmp1 = call %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3.v4i16(i8* %tmp0, i32 1)
|
||||
%tmp2 = extractvalue %struct.__neon_int16x4x3_t %tmp1, 0
|
||||
%tmp3 = extractvalue %struct.__neon_int16x4x3_t %tmp1, 2
|
||||
%tmp4 = add <4 x i16> %tmp2, %tmp3
|
||||
%tmp5 = getelementptr i16* %A, i32 %inc
|
||||
store i16* %tmp5, i16** %ptr
|
||||
ret <4 x i16> %tmp4
|
||||
}
|
||||
|
||||
define <2 x i32> @vld3i32(i32* %A) nounwind {
|
||||
;CHECK: vld3i32:
|
||||
;CHECK: vld3.32
|
||||
@ -103,6 +118,22 @@ define <4 x i32> @vld3Qi32(i32* %A) nounwind {
|
||||
ret <4 x i32> %tmp4
|
||||
}
|
||||
|
||||
;Check for a post-increment updating load.
|
||||
define <4 x i32> @vld3Qi32_update(i32** %ptr) nounwind {
|
||||
;CHECK: vld3Qi32_update:
|
||||
;CHECK: vld3.32 {d16, d18, d20}, [r1]!
|
||||
;CHECK: vld3.32 {d17, d19, d21}, [r1]!
|
||||
%A = load i32** %ptr
|
||||
%tmp0 = bitcast i32* %A to i8*
|
||||
%tmp1 = call %struct.__neon_int32x4x3_t @llvm.arm.neon.vld3.v4i32(i8* %tmp0, i32 1)
|
||||
%tmp2 = extractvalue %struct.__neon_int32x4x3_t %tmp1, 0
|
||||
%tmp3 = extractvalue %struct.__neon_int32x4x3_t %tmp1, 2
|
||||
%tmp4 = add <4 x i32> %tmp2, %tmp3
|
||||
%tmp5 = getelementptr i32* %A, i32 12
|
||||
store i32* %tmp5, i32** %ptr
|
||||
ret <4 x i32> %tmp4
|
||||
}
|
||||
|
||||
define <4 x float> @vld3Qf(float* %A) nounwind {
|
||||
;CHECK: vld3Qf:
|
||||
;CHECK: vld3.32
|
||||
|
@ -22,6 +22,20 @@ define <8 x i8> @vld4i8(i8* %A) nounwind {
|
||||
ret <8 x i8> %tmp4
|
||||
}
|
||||
|
||||
;Check for a post-increment updating load with register increment.
|
||||
define <8 x i8> @vld4i8_update(i8** %ptr, i32 %inc) nounwind {
|
||||
;CHECK: vld4i8_update:
|
||||
;CHECK: vld4.8 {d16, d17, d18, d19}, [r2, :128], r1
|
||||
%A = load i8** %ptr
|
||||
%tmp1 = call %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4.v8i8(i8* %A, i32 16)
|
||||
%tmp2 = extractvalue %struct.__neon_int8x8x4_t %tmp1, 0
|
||||
%tmp3 = extractvalue %struct.__neon_int8x8x4_t %tmp1, 2
|
||||
%tmp4 = add <8 x i8> %tmp2, %tmp3
|
||||
%tmp5 = getelementptr i8* %A, i32 %inc
|
||||
store i8* %tmp5, i8** %ptr
|
||||
ret <8 x i8> %tmp4
|
||||
}
|
||||
|
||||
define <4 x i16> @vld4i16(i16* %A) nounwind {
|
||||
;CHECK: vld4i16:
|
||||
;Check the alignment value. Max for this instruction is 256 bits:
|
||||
@ -94,6 +108,22 @@ define <8 x i16> @vld4Qi16(i16* %A) nounwind {
|
||||
ret <8 x i16> %tmp4
|
||||
}
|
||||
|
||||
;Check for a post-increment updating load.
|
||||
define <8 x i16> @vld4Qi16_update(i16** %ptr) nounwind {
|
||||
;CHECK: vld4Qi16_update:
|
||||
;CHECK: vld4.16 {d16, d18, d20, d22}, [r1, :64]!
|
||||
;CHECK: vld4.16 {d17, d19, d21, d23}, [r1, :64]!
|
||||
%A = load i16** %ptr
|
||||
%tmp0 = bitcast i16* %A to i8*
|
||||
%tmp1 = call %struct.__neon_int16x8x4_t @llvm.arm.neon.vld4.v8i16(i8* %tmp0, i32 8)
|
||||
%tmp2 = extractvalue %struct.__neon_int16x8x4_t %tmp1, 0
|
||||
%tmp3 = extractvalue %struct.__neon_int16x8x4_t %tmp1, 2
|
||||
%tmp4 = add <8 x i16> %tmp2, %tmp3
|
||||
%tmp5 = getelementptr i16* %A, i32 32
|
||||
store i16* %tmp5, i16** %ptr
|
||||
ret <8 x i16> %tmp4
|
||||
}
|
||||
|
||||
define <4 x i32> @vld4Qi32(i32* %A) nounwind {
|
||||
;CHECK: vld4Qi32:
|
||||
;CHECK: vld4.32
|
||||
|
@ -89,6 +89,22 @@ define <4 x i16> @vld2dupi16(i16* %A) nounwind {
|
||||
ret <4 x i16> %tmp5
|
||||
}
|
||||
|
||||
;Check for a post-increment updating load.
|
||||
define <4 x i16> @vld2dupi16_update(i16** %ptr) nounwind {
|
||||
;CHECK: vld2dupi16_update:
|
||||
;CHECK: vld2.16 {d16[], d17[]}, [r1]!
|
||||
%A = load i16** %ptr
|
||||
%tmp0 = tail call %struct.__neon_int4x16x2_t @llvm.arm.neon.vld2lane.v4i16(i16* %A, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
|
||||
%tmp1 = extractvalue %struct.__neon_int4x16x2_t %tmp0, 0
|
||||
%tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> zeroinitializer
|
||||
%tmp3 = extractvalue %struct.__neon_int4x16x2_t %tmp0, 1
|
||||
%tmp4 = shufflevector <4 x i16> %tmp3, <4 x i16> undef, <4 x i32> zeroinitializer
|
||||
%tmp5 = add <4 x i16> %tmp2, %tmp4
|
||||
%tmp6 = getelementptr i16* %A, i32 2
|
||||
store i16* %tmp6, i16** %ptr
|
||||
ret <4 x i16> %tmp5
|
||||
}
|
||||
|
||||
define <2 x i32> @vld2dupi32(i32* %A) nounwind {
|
||||
;CHECK: vld2dupi32:
|
||||
;Check the alignment value. Max for this instruction is 64 bits:
|
||||
@ -106,8 +122,28 @@ declare %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8(i8*, <8 x i8>, <8
|
||||
declare %struct.__neon_int4x16x2_t @llvm.arm.neon.vld2lane.v4i16(i16*, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
|
||||
declare %struct.__neon_int2x32x2_t @llvm.arm.neon.vld2lane.v2i32(i32*, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly
|
||||
|
||||
%struct.__neon_int8x8x3_t = type { <8 x i8>, <8 x i8>, <8 x i8> }
|
||||
%struct.__neon_int16x4x3_t = type { <4 x i16>, <4 x i16>, <4 x i16> }
|
||||
|
||||
;Check for a post-increment updating load with register increment.
|
||||
define <8 x i8> @vld3dupi8_update(i8** %ptr, i32 %inc) nounwind {
|
||||
;CHECK: vld3dupi8_update:
|
||||
;CHECK: vld3.8 {d16[], d17[], d18[]}, [r2], r1
|
||||
%A = load i8** %ptr
|
||||
%tmp0 = tail call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3lane.v8i8(i8* %A, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, i32 0, i32 8)
|
||||
%tmp1 = extractvalue %struct.__neon_int8x8x3_t %tmp0, 0
|
||||
%tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> zeroinitializer
|
||||
%tmp3 = extractvalue %struct.__neon_int8x8x3_t %tmp0, 1
|
||||
%tmp4 = shufflevector <8 x i8> %tmp3, <8 x i8> undef, <8 x i32> zeroinitializer
|
||||
%tmp5 = extractvalue %struct.__neon_int8x8x3_t %tmp0, 2
|
||||
%tmp6 = shufflevector <8 x i8> %tmp5, <8 x i8> undef, <8 x i32> zeroinitializer
|
||||
%tmp7 = add <8 x i8> %tmp2, %tmp4
|
||||
%tmp8 = add <8 x i8> %tmp7, %tmp6
|
||||
%tmp9 = getelementptr i8* %A, i32 %inc
|
||||
store i8* %tmp9, i8** %ptr
|
||||
ret <8 x i8> %tmp8
|
||||
}
|
||||
|
||||
define <4 x i16> @vld3dupi16(i16* %A) nounwind {
|
||||
;CHECK: vld3dupi16:
|
||||
;Check the (default) alignment value. VLD3 does not support alignment.
|
||||
@ -124,10 +160,34 @@ define <4 x i16> @vld3dupi16(i16* %A) nounwind {
|
||||
ret <4 x i16> %tmp8
|
||||
}
|
||||
|
||||
declare %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly
|
||||
declare %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3lane.v4i16(i16*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
|
||||
|
||||
%struct.__neon_int16x4x4_t = type { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }
|
||||
%struct.__neon_int32x2x4_t = type { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }
|
||||
|
||||
;Check for a post-increment updating load.
|
||||
define <4 x i16> @vld4dupi16_update(i16** %ptr) nounwind {
|
||||
;CHECK: vld4dupi16_update:
|
||||
;CHECK: vld4.16 {d16[], d17[], d18[], d19[]}, [r1]!
|
||||
%A = load i16** %ptr
|
||||
%tmp0 = tail call %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4lane.v4i16(i16* %A, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 1)
|
||||
%tmp1 = extractvalue %struct.__neon_int16x4x4_t %tmp0, 0
|
||||
%tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> zeroinitializer
|
||||
%tmp3 = extractvalue %struct.__neon_int16x4x4_t %tmp0, 1
|
||||
%tmp4 = shufflevector <4 x i16> %tmp3, <4 x i16> undef, <4 x i32> zeroinitializer
|
||||
%tmp5 = extractvalue %struct.__neon_int16x4x4_t %tmp0, 2
|
||||
%tmp6 = shufflevector <4 x i16> %tmp5, <4 x i16> undef, <4 x i32> zeroinitializer
|
||||
%tmp7 = extractvalue %struct.__neon_int16x4x4_t %tmp0, 3
|
||||
%tmp8 = shufflevector <4 x i16> %tmp7, <4 x i16> undef, <4 x i32> zeroinitializer
|
||||
%tmp9 = add <4 x i16> %tmp2, %tmp4
|
||||
%tmp10 = add <4 x i16> %tmp6, %tmp8
|
||||
%tmp11 = add <4 x i16> %tmp9, %tmp10
|
||||
%tmp12 = getelementptr i16* %A, i32 4
|
||||
store i16* %tmp12, i16** %ptr
|
||||
ret <4 x i16> %tmp11
|
||||
}
|
||||
|
||||
define <2 x i32> @vld4dupi32(i32* %A) nounwind {
|
||||
;CHECK: vld4dupi32:
|
||||
;Check the alignment value. An 8-byte alignment is allowed here even though
|
||||
@ -148,4 +208,5 @@ define <2 x i32> @vld4dupi32(i32* %A) nounwind {
|
||||
ret <2 x i32> %tmp11
|
||||
}
|
||||
|
||||
declare %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4lane.v4i16(i16*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
|
||||
declare %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4lane.v2i32(i32*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly
|
||||
|
@ -121,6 +121,22 @@ define <2 x i32> @vld2lanei32(i32* %A, <2 x i32>* %B) nounwind {
|
||||
ret <2 x i32> %tmp5
|
||||
}
|
||||
|
||||
;Check for a post-increment updating load.
|
||||
define <2 x i32> @vld2lanei32_update(i32** %ptr, <2 x i32>* %B) nounwind {
|
||||
;CHECK: vld2lanei32_update:
|
||||
;CHECK: vld2.32 {d16[1], d17[1]}, [r1]!
|
||||
%A = load i32** %ptr
|
||||
%tmp0 = bitcast i32* %A to i8*
|
||||
%tmp1 = load <2 x i32>* %B
|
||||
%tmp2 = call %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2lane.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 1)
|
||||
%tmp3 = extractvalue %struct.__neon_int32x2x2_t %tmp2, 0
|
||||
%tmp4 = extractvalue %struct.__neon_int32x2x2_t %tmp2, 1
|
||||
%tmp5 = add <2 x i32> %tmp3, %tmp4
|
||||
%tmp6 = getelementptr i32* %A, i32 2
|
||||
store i32* %tmp6, i32** %ptr
|
||||
ret <2 x i32> %tmp5
|
||||
}
|
||||
|
||||
define <2 x float> @vld2lanef(float* %A, <2 x float>* %B) nounwind {
|
||||
;CHECK: vld2lanef:
|
||||
;CHECK: vld2.32
|
||||
@ -260,6 +276,24 @@ define <8 x i16> @vld3laneQi16(i16* %A, <8 x i16>* %B) nounwind {
|
||||
ret <8 x i16> %tmp7
|
||||
}
|
||||
|
||||
;Check for a post-increment updating load with register increment.
|
||||
define <8 x i16> @vld3laneQi16_update(i16** %ptr, <8 x i16>* %B, i32 %inc) nounwind {
|
||||
;CHECK: vld3laneQi16_update:
|
||||
;CHECK: vld3.16 {d16[1], d18[1], d20[1]}, [r2], r1
|
||||
%A = load i16** %ptr
|
||||
%tmp0 = bitcast i16* %A to i8*
|
||||
%tmp1 = load <8 x i16>* %B
|
||||
%tmp2 = call %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1, i32 8)
|
||||
%tmp3 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 0
|
||||
%tmp4 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 1
|
||||
%tmp5 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 2
|
||||
%tmp6 = add <8 x i16> %tmp3, %tmp4
|
||||
%tmp7 = add <8 x i16> %tmp5, %tmp6
|
||||
%tmp8 = getelementptr i16* %A, i32 %inc
|
||||
store i16* %tmp8, i16** %ptr
|
||||
ret <8 x i16> %tmp7
|
||||
}
|
||||
|
||||
define <4 x i32> @vld3laneQi32(i32* %A, <4 x i32>* %B) nounwind {
|
||||
;CHECK: vld3laneQi32:
|
||||
;CHECK: vld3.32
|
||||
@ -322,6 +356,25 @@ define <8 x i8> @vld4lanei8(i8* %A, <8 x i8>* %B) nounwind {
|
||||
ret <8 x i8> %tmp9
|
||||
}
|
||||
|
||||
;Check for a post-increment updating load.
|
||||
define <8 x i8> @vld4lanei8_update(i8** %ptr, <8 x i8>* %B) nounwind {
|
||||
;CHECK: vld4lanei8_update:
|
||||
;CHECK: vld4.8 {d16[1], d17[1], d18[1], d19[1]}, [r1, :32]!
|
||||
%A = load i8** %ptr
|
||||
%tmp1 = load <8 x i8>* %B
|
||||
%tmp2 = call %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 8)
|
||||
%tmp3 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 0
|
||||
%tmp4 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 1
|
||||
%tmp5 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 2
|
||||
%tmp6 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 3
|
||||
%tmp7 = add <8 x i8> %tmp3, %tmp4
|
||||
%tmp8 = add <8 x i8> %tmp5, %tmp6
|
||||
%tmp9 = add <8 x i8> %tmp7, %tmp8
|
||||
%tmp10 = getelementptr i8* %A, i32 4
|
||||
store i8* %tmp10, i8** %ptr
|
||||
ret <8 x i8> %tmp9
|
||||
}
|
||||
|
||||
define <4 x i16> @vld4lanei16(i16* %A, <4 x i16>* %B) nounwind {
|
||||
;CHECK: vld4lanei16:
|
||||
;Check that a power-of-two alignment smaller than the total size of the memory
|
||||
|
@ -36,6 +36,19 @@ define void @vst1f(float* %A, <2 x float>* %B) nounwind {
|
||||
ret void
|
||||
}
|
||||
|
||||
;Check for a post-increment updating store.
|
||||
define void @vst1f_update(float** %ptr, <2 x float>* %B) nounwind {
|
||||
;CHECK: vst1f_update:
|
||||
;CHECK: vst1.32 {d16}, [r1]!
|
||||
%A = load float** %ptr
|
||||
%tmp0 = bitcast float* %A to i8*
|
||||
%tmp1 = load <2 x float>* %B
|
||||
call void @llvm.arm.neon.vst1.v2f32(i8* %tmp0, <2 x float> %tmp1, i32 1)
|
||||
%tmp2 = getelementptr float* %A, i32 2
|
||||
store float* %tmp2, float** %ptr
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @vst1i64(i64* %A, <1 x i64>* %B) nounwind {
|
||||
;CHECK: vst1i64:
|
||||
;CHECK: vst1.64
|
||||
@ -64,6 +77,19 @@ define void @vst1Qi16(i16* %A, <8 x i16>* %B) nounwind {
|
||||
ret void
|
||||
}
|
||||
|
||||
;Check for a post-increment updating store with register increment.
|
||||
define void @vst1Qi16_update(i16** %ptr, <8 x i16>* %B, i32 %inc) nounwind {
|
||||
;CHECK: vst1Qi16_update:
|
||||
;CHECK: vst1.16 {d16, d17}, [r1, :64], r2
|
||||
%A = load i16** %ptr
|
||||
%tmp0 = bitcast i16* %A to i8*
|
||||
%tmp1 = load <8 x i16>* %B
|
||||
call void @llvm.arm.neon.vst1.v8i16(i8* %tmp0, <8 x i16> %tmp1, i32 8)
|
||||
%tmp2 = getelementptr i16* %A, i32 %inc
|
||||
store i16* %tmp2, i16** %ptr
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @vst1Qi32(i32* %A, <4 x i32>* %B) nounwind {
|
||||
;CHECK: vst1Qi32:
|
||||
;CHECK: vst1.32
|
||||
|
@ -9,6 +9,18 @@ define void @vst2i8(i8* %A, <8 x i8>* %B) nounwind {
|
||||
ret void
|
||||
}
|
||||
|
||||
;Check for a post-increment updating store with register increment.
|
||||
define void @vst2i8_update(i8** %ptr, <8 x i8>* %B, i32 %inc) nounwind {
|
||||
;CHECK: vst2i8_update:
|
||||
;CHECK: vst2.8 {d16, d17}, [r1], r2
|
||||
%A = load i8** %ptr
|
||||
%tmp1 = load <8 x i8>* %B
|
||||
call void @llvm.arm.neon.vst2.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 4)
|
||||
%tmp2 = getelementptr i8* %A, i32 %inc
|
||||
store i8* %tmp2, i8** %ptr
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @vst2i16(i16* %A, <4 x i16>* %B) nounwind {
|
||||
;CHECK: vst2i16:
|
||||
;Check the alignment value. Max for this instruction is 128 bits:
|
||||
@ -47,6 +59,19 @@ define void @vst2i64(i64* %A, <1 x i64>* %B) nounwind {
|
||||
ret void
|
||||
}
|
||||
|
||||
;Check for a post-increment updating store.
|
||||
define void @vst2i64_update(i64** %ptr, <1 x i64>* %B) nounwind {
|
||||
;CHECK: vst2i64_update:
|
||||
;CHECK: vst1.64 {d16, d17}, [r1, :64]!
|
||||
%A = load i64** %ptr
|
||||
%tmp0 = bitcast i64* %A to i8*
|
||||
%tmp1 = load <1 x i64>* %B
|
||||
call void @llvm.arm.neon.vst2.v1i64(i8* %tmp0, <1 x i64> %tmp1, <1 x i64> %tmp1, i32 8)
|
||||
%tmp2 = getelementptr i64* %A, i32 2
|
||||
store i64* %tmp2, i64** %ptr
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @vst2Qi8(i8* %A, <16 x i8>* %B) nounwind {
|
||||
;CHECK: vst2Qi8:
|
||||
;Check the alignment value. Max for this instruction is 256 bits:
|
||||
|
@ -28,6 +28,19 @@ define void @vst3i32(i32* %A, <2 x i32>* %B) nounwind {
|
||||
ret void
|
||||
}
|
||||
|
||||
;Check for a post-increment updating store.
|
||||
define void @vst3i32_update(i32** %ptr, <2 x i32>* %B) nounwind {
|
||||
;CHECK: vst3i32_update:
|
||||
;CHECK: vst3.32 {d{{.*}}, d{{.*}}, d{{.*}}}, [r{{.*}}]!
|
||||
%A = load i32** %ptr
|
||||
%tmp0 = bitcast i32* %A to i8*
|
||||
%tmp1 = load <2 x i32>* %B
|
||||
call void @llvm.arm.neon.vst3.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1)
|
||||
%tmp2 = getelementptr i32* %A, i32 6
|
||||
store i32* %tmp2, i32** %ptr
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @vst3f(float* %A, <2 x float>* %B) nounwind {
|
||||
;CHECK: vst3f:
|
||||
;CHECK: vst3.32
|
||||
@ -69,6 +82,20 @@ define void @vst3Qi16(i16* %A, <8 x i16>* %B) nounwind {
|
||||
ret void
|
||||
}
|
||||
|
||||
;Check for a post-increment updating store.
|
||||
define void @vst3Qi16_update(i16** %ptr, <8 x i16>* %B) nounwind {
|
||||
;CHECK: vst3Qi16_update:
|
||||
;CHECK: vst3.16 {d{{.*}}, d{{.*}}, d{{.*}}}, [r{{.*}}]!
|
||||
;CHECK: vst3.16 {d{{.*}}, d{{.*}}, d{{.*}}}, [r{{.*}}]!
|
||||
%A = load i16** %ptr
|
||||
%tmp0 = bitcast i16* %A to i8*
|
||||
%tmp1 = load <8 x i16>* %B
|
||||
call void @llvm.arm.neon.vst3.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1)
|
||||
%tmp2 = getelementptr i16* %A, i32 24
|
||||
store i16* %tmp2, i16** %ptr
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @vst3Qi32(i32* %A, <4 x i32>* %B) nounwind {
|
||||
;CHECK: vst3Qi32:
|
||||
;CHECK: vst3.32
|
||||
|
@ -9,6 +9,18 @@ define void @vst4i8(i8* %A, <8 x i8>* %B) nounwind {
|
||||
ret void
|
||||
}
|
||||
|
||||
;Check for a post-increment updating store with register increment.
;Unlike the constant-increment tests, the pointer is advanced by the runtime
;value %inc, so the expected vst4.8 takes its post-increment from a register
;operand instead of using the "!" form.
;NOTE(review): the trailing "i32 16" of the intrinsic call is presumably the
;alignment in bytes, which would correspond to the ":128" in the expected
;output - confirm against the intrinsic definition.
|
||||
define void @vst4i8_update(i8** %ptr, <8 x i8>* %B, i32 %inc) nounwind {
|
||||
;CHECK: vst4i8_update:
|
||||
;CHECK: vst4.8 {d16, d17, d18, d19}, [r1, :128], r2
|
||||
%A = load i8** %ptr
|
||||
%tmp1 = load <8 x i8>* %B
|
||||
call void @llvm.arm.neon.vst4.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 16)
|
||||
;Variable increment: the pointer moves by %inc i8 elements, not by the store size.
%tmp2 = getelementptr i8* %A, i32 %inc
|
||||
store i8* %tmp2, i8** %ptr
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @vst4i16(i16* %A, <4 x i16>* %B) nounwind {
|
||||
;CHECK: vst4i16:
|
||||
;Check the alignment value. Max for this instruction is 256 bits:
|
||||
@ -89,6 +101,20 @@ define void @vst4Qf(float* %A, <4 x float>* %B) nounwind {
|
||||
ret void
|
||||
}
|
||||
|
||||
;Check for a post-increment updating store.
;Four <4 x float> vectors cover 64 bytes and the pointer is advanced by 16
;floats (also 64 bytes). The expected output is two vst4.32 instructions, the
;first on d16/d18/d20/d22 and the second on d17/d19/d21/d23, both using the
;writeback form on r1.
|
||||
define void @vst4Qf_update(float** %ptr, <4 x float>* %B) nounwind {
|
||||
;CHECK: vst4Qf_update:
|
||||
;CHECK: vst4.32 {d16, d18, d20, d22}, [r1]!
|
||||
;CHECK: vst4.32 {d17, d19, d21, d23}, [r1]!
|
||||
%A = load float** %ptr
|
||||
%tmp0 = bitcast float* %A to i8*
|
||||
%tmp1 = load <4 x float>* %B
|
||||
call void @llvm.arm.neon.vst4.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, i32 1)
|
||||
;Advance by exactly the number of bytes stored, then publish the new pointer.
%tmp2 = getelementptr float* %A, i32 16
|
||||
store float* %tmp2, float** %ptr
|
||||
ret void
|
||||
}
|
||||
|
||||
declare void @llvm.arm.neon.vst4.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32) nounwind
|
||||
declare void @llvm.arm.neon.vst4.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32) nounwind
|
||||
declare void @llvm.arm.neon.vst4.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32) nounwind
|
||||
|
@ -94,6 +94,19 @@ define void @vst2lanei16(i16* %A, <4 x i16>* %B) nounwind {
|
||||
ret void
|
||||
}
|
||||
|
||||
;Check for a post-increment updating store with register increment.
;This is the single-lane form of vst2: the expected output stores lane 1 of
;two D registers and takes its post-increment from a register operand,
;because the pointer is advanced by the runtime value %inc.
;NOTE(review): in the intrinsic call "i32 1" appears to be the lane index
;(the expected output uses [1]) and "i32 2" the alignment - confirm.
|
||||
define void @vst2lanei16_update(i16** %ptr, <4 x i16>* %B, i32 %inc) nounwind {
|
||||
;CHECK: vst2lanei16_update:
|
||||
;CHECK: vst2.16 {d16[1], d17[1]}, [r1], r2
|
||||
%A = load i16** %ptr
|
||||
%tmp0 = bitcast i16* %A to i8*
|
||||
%tmp1 = load <4 x i16>* %B
|
||||
call void @llvm.arm.neon.vst2lane.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 2)
|
||||
;Variable increment: the pointer moves by %inc i16 elements, not by the store size.
%tmp2 = getelementptr i16* %A, i32 %inc
|
||||
store i16* %tmp2, i16** %ptr
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @vst2lanei32(i32* %A, <2 x i32>* %B) nounwind {
|
||||
;CHECK: vst2lanei32:
|
||||
;CHECK: vst2.32
|
||||
@ -205,6 +218,19 @@ define void @vst3laneQi32(i32* %A, <4 x i32>* %B) nounwind {
|
||||
ret void
|
||||
}
|
||||
|
||||
;Check for a post-increment updating store.
;A three-register lane store of i32 elements writes 3 * 4 = 12 bytes, and the
;pointer is advanced by 3 i32 elements (also 12 bytes). The expected output
;stores lane 0 of d16, d18 and d20 with the writeback form "[r1]!".
|
||||
define void @vst3laneQi32_update(i32** %ptr, <4 x i32>* %B) nounwind {
|
||||
;CHECK: vst3laneQi32_update:
|
||||
;CHECK: vst3.32 {d16[0], d18[0], d20[0]}, [r1]!
|
||||
%A = load i32** %ptr
|
||||
%tmp0 = bitcast i32* %A to i8*
|
||||
%tmp1 = load <4 x i32>* %B
|
||||
call void @llvm.arm.neon.vst3lane.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 0, i32 1)
|
||||
;Advance by exactly the number of bytes stored, then publish the new pointer.
%tmp2 = getelementptr i32* %A, i32 3
|
||||
store i32* %tmp2, i32** %ptr
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @vst3laneQf(float* %A, <4 x float>* %B) nounwind {
|
||||
;CHECK: vst3laneQf:
|
||||
;CHECK: vst3.32
|
||||
@ -233,6 +259,18 @@ define void @vst4lanei8(i8* %A, <8 x i8>* %B) nounwind {
|
||||
ret void
|
||||
}
|
||||
|
||||
;Check for a post-increment updating store.
;A four-register lane store of i8 elements writes 4 bytes, and the pointer is
;advanced by 4 i8 elements. The expected output stores lane 1 of d16-d19 with
;the writeback form "!".
;NOTE(review): the call passes "i32 8" as what looks like the alignment while
;the expected output shows ":32" - presumably the largest alignment this lane
;form can encode; confirm against the instruction reference.
|
||||
define void @vst4lanei8_update(i8** %ptr, <8 x i8>* %B) nounwind {
|
||||
;CHECK: vst4lanei8_update:
|
||||
;CHECK: vst4.8 {d16[1], d17[1], d18[1], d19[1]}, [r1, :32]!
|
||||
%A = load i8** %ptr
|
||||
%tmp1 = load <8 x i8>* %B
|
||||
call void @llvm.arm.neon.vst4lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 8)
|
||||
;Advance by exactly the number of bytes stored, then publish the new pointer.
%tmp2 = getelementptr i8* %A, i32 4
|
||||
store i8* %tmp2, i8** %ptr
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @vst4lanei16(i16* %A, <4 x i16>* %B) nounwind {
|
||||
;CHECK: vst4lanei16:
|
||||
;CHECK: vst4.16
|
||||
|
Loading…
x
Reference in New Issue
Block a user