diff --git a/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/lib/Target/ARM/ARMExpandPseudoInsts.cpp
index 34f602bfda6..f65be1cb179 100644
--- a/lib/Target/ARM/ARMExpandPseudoInsts.cpp
+++ b/lib/Target/ARM/ARMExpandPseudoInsts.cpp
@@ -140,6 +140,13 @@ static const NEONLdStTableEntry NEONLdStTable[] = {
 { ARM::VLD1q8Pseudo,     ARM::VLD1q8,     true, false, SingleSpc, 2, 8 },
 { ARM::VLD1q8Pseudo_UPD, ARM::VLD1q8_UPD, true, true,  SingleSpc, 2, 8 },
+{ ARM::VLD2DUPd16Pseudo,     ARM::VLD2DUPd16,     true, false, SingleSpc, 2, 4},
+{ ARM::VLD2DUPd16Pseudo_UPD, ARM::VLD2DUPd16_UPD, true, true,  SingleSpc, 2, 4},
+{ ARM::VLD2DUPd32Pseudo,     ARM::VLD2DUPd32,     true, false, SingleSpc, 2, 2},
+{ ARM::VLD2DUPd32Pseudo_UPD, ARM::VLD2DUPd32_UPD, true, true,  SingleSpc, 2, 2},
+{ ARM::VLD2DUPd8Pseudo,      ARM::VLD2DUPd8,      true, false, SingleSpc, 2, 8},
+{ ARM::VLD2DUPd8Pseudo_UPD,  ARM::VLD2DUPd8_UPD,  true, true,  SingleSpc, 2, 8},
+
 { ARM::VLD2LNd16Pseudo,     ARM::VLD2LNd16,     true, false, SingleSpc, 2, 4 },
 { ARM::VLD2LNd16Pseudo_UPD, ARM::VLD2LNd16_UPD, true, true,  SingleSpc, 2, 4 },
 { ARM::VLD2LNd32Pseudo,     ARM::VLD2LNd32,     true, false, SingleSpc, 2, 2 },
 
@@ -933,6 +940,12 @@ bool ARMExpandPseudo::ExpandMBB(MachineBasicBlock &MBB) {
     case ARM::VLD1DUPq8Pseudo_UPD:
     case ARM::VLD1DUPq16Pseudo_UPD:
     case ARM::VLD1DUPq32Pseudo_UPD:
+    case ARM::VLD2DUPd8Pseudo:
+    case ARM::VLD2DUPd16Pseudo:
+    case ARM::VLD2DUPd32Pseudo:
+    case ARM::VLD2DUPd8Pseudo_UPD:
+    case ARM::VLD2DUPd16Pseudo_UPD:
+    case ARM::VLD2DUPd32Pseudo_UPD:
       ExpandVLD(MBBI);
       break;
 
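Note: each row in NEONLdStTable above pairs a pseudo-instruction with the real NEON opcode it expands to. Paraphrasing the NEONLdStTableEntry struct defined near the top of ARMExpandPseudoInsts.cpp (a reading aid only, not new code in the patch):

    struct NEONLdStTableEntry {
      unsigned PseudoOpc;        // pseudo-instruction to expand
      unsigned RealOpc;          // real NEON instruction it expands to
      bool IsLoad;               // VLDn if true, VSTn if false
      bool HasWriteback;         // updates the base-address register
      NEONRegSpacing RegSpacing; // SingleSpc or the double-spaced variants
      unsigned NumRegs;          // D registers in the register list
      unsigned RegElts;          // elements per D register
    };

So the new rows read: each VLD2DUP loads two single-spaced D registers holding 8, 4, or 2 elements apiece for the 8-, 16-, and 32-bit element sizes.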
diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp
index 4648a69a1b2..b9fbdc58a1e 100644
--- a/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -197,6 +197,11 @@ private:
   SDNode *SelectVLDSTLane(SDNode *N, bool IsLoad, unsigned NumVecs,
                           unsigned *DOpcodes, unsigned *QOpcodes);
 
+  /// SelectVLDDup - Select NEON load-duplicate intrinsics.  NumVecs
+  /// should be 2, 3 or 4.  The opcode array specifies the instructions used
+  /// for loading D registers.  (Q registers are not supported.)
+  SDNode *SelectVLDDup(SDNode *N, unsigned NumVecs, unsigned *Opcodes);
+
   /// SelectVTBL - Select NEON VTBL and VTBX intrinsics.  NumVecs should be 2,
   /// 3 or 4.  These are custom-selected so that a REG_SEQUENCE can be
   /// generated to force the table registers to be consecutive.
@@ -1643,6 +1648,62 @@ SDNode *ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad,
   return NULL;
 }
 
+SDNode *ARMDAGToDAGISel::SelectVLDDup(SDNode *N, unsigned NumVecs,
+                                      unsigned *Opcodes) {
+  assert(NumVecs >= 2 && NumVecs <= 4 && "VLDDup NumVecs out-of-range");
+  DebugLoc dl = N->getDebugLoc();
+
+  SDValue MemAddr, Align;
+  if (!SelectAddrMode6(N, N->getOperand(1), MemAddr, Align))
+    return NULL;
+
+  SDValue Chain = N->getOperand(0);
+  EVT VT = N->getValueType(0);
+
+  unsigned Alignment = 0;
+  if (NumVecs != 3) {
+    Alignment = cast<ConstantSDNode>(Align)->getZExtValue();
+    unsigned NumBytes = NumVecs * VT.getVectorElementType().getSizeInBits()/8;
+    if (Alignment > NumBytes)
+      Alignment = NumBytes;
+    // Alignment must be a power of two; make sure of that.
+    Alignment = (Alignment & -Alignment);
+    if (Alignment == 1)
+      Alignment = 0;
+  }
+  Align = CurDAG->getTargetConstant(Alignment, MVT::i32);
+
+  unsigned OpcodeIndex;
+  switch (VT.getSimpleVT().SimpleTy) {
+  default: llvm_unreachable("unhandled vld-dup type");
+  case MVT::v8i8:  OpcodeIndex = 0; break;
+  case MVT::v4i16: OpcodeIndex = 1; break;
+  case MVT::v2f32:
+  case MVT::v2i32: OpcodeIndex = 2; break;
+  }
+
+  SDValue Pred = getAL(CurDAG);
+  SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
+  SDValue SuperReg;
+  unsigned Opc = Opcodes[OpcodeIndex];
+  const SDValue Ops[] = { MemAddr, Align, Pred, Reg0, Chain };
+
+  unsigned ResTyElts = (NumVecs == 3) ? 4 : NumVecs;
+  EVT ResTy = EVT::getVectorVT(*CurDAG->getContext(), MVT::i64, ResTyElts);
+  SDNode *VLdDup = CurDAG->getMachineNode(Opc, dl, ResTy, MVT::Other, Ops, 5);
+  SuperReg = SDValue(VLdDup, 0);
+  Chain = SDValue(VLdDup, 1);
+
+  // Extract the subregisters.
+  assert(ARM::dsub_7 == ARM::dsub_0+7 && "Unexpected subreg numbering");
+  unsigned SubIdx = ARM::dsub_0;
+  for (unsigned Vec = 0; Vec < NumVecs; ++Vec)
+    ReplaceUses(SDValue(N, Vec),
+                CurDAG->getTargetExtractSubreg(SubIdx+Vec, dl, VT, SuperReg));
+  ReplaceUses(SDValue(N, NumVecs), Chain);
+  return NULL;
+}
+
 SDNode *ARMDAGToDAGISel::SelectVTBL(SDNode *N, bool IsExt, unsigned NumVecs,
                                     unsigned Opc) {
   assert(NumVecs >= 2 && NumVecs <= 4 && "VTBL NumVecs out-of-range");
@@ -2294,6 +2355,12 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
                          N->getOperand(2), N->getOperand(3));
   }
 
+  case ARMISD::VLD2DUP: {
+    unsigned Opcodes[] = { ARM::VLD2DUPd8Pseudo, ARM::VLD2DUPd16Pseudo,
+                           ARM::VLD2DUPd32Pseudo };
+    return SelectVLDDup(N, 2, Opcodes);
+  }
+
   case ISD::INTRINSIC_VOID:
   case ISD::INTRINSIC_W_CHAIN: {
     unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
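Aside: the alignment clamp in SelectVLDDup is easy to misread. "Alignment & -Alignment" keeps only the lowest set bit, which is always a power of two and never larger than the input. A self-contained sketch of the same computation; the helper name and free-standing form are illustrative, not LLVM API:

    // Mirrors SelectVLDDup's alignment handling (sketch).
    static unsigned clampVLDDupAlign(unsigned Requested, unsigned NumVecs,
                                     unsigned EltBits) {
      unsigned NumBytes = NumVecs * EltBits / 8;   // bytes actually accessed
      unsigned Align = Requested > NumBytes ? NumBytes : Requested;
      Align &= -Align;                  // lowest set bit, e.g. 12 -> 4
      return Align == 1 ? 0 : Align;    // byte alignment is encoded as 0
    }

For v2i32 with a requested 16-byte alignment this yields 2 * 32 / 8 = 8 bytes, which the asm printer emits as ":64" (see the test at the end of this patch).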
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index ff22e3fadae..da274de5a0f 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -824,6 +824,9 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
   case ARMISD::BFI:       return "ARMISD::BFI";
   case ARMISD::VORRIMM:   return "ARMISD::VORRIMM";
   case ARMISD::VBICIMM:   return "ARMISD::VBICIMM";
+  case ARMISD::VLD2DUP:   return "ARMISD::VLD2DUP";
+  case ARMISD::VLD3DUP:   return "ARMISD::VLD3DUP";
+  case ARMISD::VLD4DUP:   return "ARMISD::VLD4DUP";
   }
 }
 
@@ -4836,15 +4839,100 @@ static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) {
                               DAG.getUNDEF(VT), NewMask.data());
 }
 
+/// CombineVLDDUP - For a VDUPLANE node N, check if its source operand is a
+/// vldN-lane (N > 1) intrinsic, and if all the other uses of that intrinsic
+/// are also VDUPLANEs.  If so, combine them to a vldN-dup operation and
+/// return true.
+static bool CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
+  SelectionDAG &DAG = DCI.DAG;
+  EVT VT = N->getValueType(0);
+  // vldN-dup instructions only support 64-bit vectors for N > 1.
+  if (!VT.is64BitVector())
+    return false;
+
+  // Check if the VDUPLANE operand is a vldN-lane intrinsic.
+  SDNode *VLD = N->getOperand(0).getNode();
+  if (VLD->getOpcode() != ISD::INTRINSIC_W_CHAIN)
+    return false;
+  unsigned NumVecs = 0;
+  unsigned NewOpc = 0;
+  unsigned IntNo = cast<ConstantSDNode>(VLD->getOperand(1))->getZExtValue();
+  if (IntNo == Intrinsic::arm_neon_vld2lane) {
+    NumVecs = 2;
+    NewOpc = ARMISD::VLD2DUP;
+  } else if (IntNo == Intrinsic::arm_neon_vld3lane) {
+    NumVecs = 3;
+    NewOpc = ARMISD::VLD3DUP;
+  } else if (IntNo == Intrinsic::arm_neon_vld4lane) {
+    NumVecs = 4;
+    NewOpc = ARMISD::VLD4DUP;
+  } else {
+    return false;
+  }
+
+  // First check that all the vldN-lane uses are VDUPLANEs and that the lane
+  // numbers match the load.
+  unsigned VLDLaneNo =
+    cast<ConstantSDNode>(VLD->getOperand(NumVecs+3))->getZExtValue();
+  for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
+       UI != UE; ++UI) {
+    // Ignore uses of the chain result.
+    if (UI.getUse().getResNo() == NumVecs)
+      continue;
+    SDNode *User = *UI;
+    if (User->getOpcode() != ARMISD::VDUPLANE ||
+        VLDLaneNo != cast<ConstantSDNode>(User->getOperand(1))->getZExtValue())
+      return false;
+  }
+
+  // Create the vldN-dup node.
+  EVT Tys[5];
+  unsigned n;
+  for (n = 0; n < NumVecs; ++n)
+    Tys[n] = VT;
+  Tys[n] = MVT::Other;
+  SDVTList SDTys = DAG.getVTList(Tys, NumVecs+1);
+  SDValue Ops[] = { VLD->getOperand(0), VLD->getOperand(2) };
+  MemIntrinsicSDNode *VLDMemInt = cast<MemIntrinsicSDNode>(VLD);
+  SDValue VLDDup = DAG.getMemIntrinsicNode(NewOpc, VLD->getDebugLoc(), SDTys,
+                                           Ops, 2, VLDMemInt->getMemoryVT(),
+                                           VLDMemInt->getMemOperand());
+
+  // Update the uses.
+  for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
+       UI != UE; ++UI) {
+    unsigned ResNo = UI.getUse().getResNo();
+    // Ignore uses of the chain result.
+    if (ResNo == NumVecs)
+      continue;
+    SDNode *User = *UI;
+    DCI.CombineTo(User, SDValue(VLDDup.getNode(), ResNo));
+  }
+
+  // Now the vldN-lane intrinsic is dead except for its chain result.
+  // Update uses of the chain.
+  std::vector<SDValue> VLDDupResults;
+  for (unsigned n = 0; n < NumVecs; ++n)
+    VLDDupResults.push_back(SDValue(VLDDup.getNode(), n));
+  VLDDupResults.push_back(SDValue(VLDDup.getNode(), NumVecs));
+  DCI.CombineTo(VLD, VLDDupResults);
+
+  return true;
+}
+
 /// PerformVDUPLANECombine - Target-specific dag combine xforms for
 /// ARMISD::VDUPLANE.
-static SDValue PerformVDUPLANECombine(SDNode *N, SelectionDAG &DAG) {
-  // If the source is already a VMOVIMM or VMVNIMM splat, the VDUPLANE is
-  // redundant.
+static SDValue PerformVDUPLANECombine(SDNode *N,
+                                      TargetLowering::DAGCombinerInfo &DCI) {
   SDValue Op = N->getOperand(0);
-  EVT VT = N->getValueType(0);
 
-  // Ignore bit_converts.
+  // If the source is a vldN-lane (N > 1) intrinsic, and all the other uses
+  // of that intrinsic are also VDUPLANEs, combine them to a vldN-dup
+  // operation.
+  if (CombineVLDDUP(N, DCI))
+    return SDValue(N, 0);
+
+  // If the source is already a VMOVIMM or VMVNIMM splat, the VDUPLANE is
+  // redundant.  Ignore bitcasts for now; element sizes are checked below.
 while (Op.getOpcode() == ISD::BITCAST)
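For context on where this pattern comes from: there is no dedicated LLVM intrinsic for the NEON vldN_dup operations at this point, so a front end emits them as a vldN-lane load of lane 0 into undef vectors, followed by a splat (VDUPLANE) of each result. That is exactly the shape CombineVLDDUP matches. A sketch using standard arm_neon.h intrinsics (the description of the front-end lowering is an assumption inferred from the test case at the end of this patch):

    #include <arm_neon.h>

    // Loads the pair {A[0], A[1]} and replicates each element across a
    // whole D register.  With this patch the function should compile to a
    // single "vld2.8 {d0[], d1[]}, [r0]" rather than a lane load plus two
    // vdup instructions (register numbers depend on allocation).
    uint8x8x2_t load_and_splat_pair(const uint8_t *A) {
      return vld2_dup_u8(A);
    }

Note that the combine also forms VLD3DUP and VLD4DUP nodes, while only VLD2DUP selection is wired up in this patch; the VLD3/VLD4 all-lanes instruction definitions below are still marked FIXME.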
   while (Op.getOpcode() == ISD::BITCAST)
     Op = Op.getOperand(0);
   if (Op.getOpcode() != ARMISD::VMOVIMM && Op.getOpcode() != ARMISD::VMVNIMM)
@@ -4857,10 +4945,11 @@ static SDValue PerformVDUPLANECombine(SDNode *N, SelectionDAG &DAG) {
   unsigned EltBits;
   if (ARM_AM::decodeNEONModImm(Imm, EltBits) == 0)
     EltSize = 8;
+  EVT VT = N->getValueType(0);
   if (EltSize > VT.getVectorElementType().getSizeInBits())
     return SDValue();
 
-  return DAG.getNode(ISD::BITCAST, N->getDebugLoc(), VT, Op);
+  return DCI.DAG.getNode(ISD::BITCAST, N->getDebugLoc(), VT, Op);
 }
 
 /// getVShiftImm - Check if this is a valid build_vector for the immediate
@@ -5248,7 +5337,7 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
   case ARMISD::VMOVDRR:   return PerformVMOVDRRCombine(N, DCI.DAG);
   case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DCI.DAG);
   case ISD::VECTOR_SHUFFLE: return PerformVECTOR_SHUFFLECombine(N, DCI.DAG);
-  case ARMISD::VDUPLANE:  return PerformVDUPLANECombine(N, DCI.DAG);
+  case ARMISD::VDUPLANE:  return PerformVDUPLANECombine(N, DCI);
   case ISD::INTRINSIC_WO_CHAIN: return PerformIntrinsicCombine(N, DCI.DAG);
   case ISD::SHL:
   case ISD::SRA:
diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h
index 4a4b83db0f5..38f73991104 100644
--- a/lib/Target/ARM/ARMISelLowering.h
+++ b/lib/Target/ARM/ARMISelLowering.h
@@ -172,7 +172,12 @@ namespace llvm {
       // Vector OR with immediate
       VORRIMM,
       // Vector AND with NOT of immediate
-      VBICIMM
+      VBICIMM,
+
+      // Vector load N-element structure to all lanes:
+      VLD2DUP = ISD::FIRST_TARGET_MEMORY_OPCODE,
+      VLD3DUP,
+      VLD4DUP
     };
   }
 
diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td
index 4e4aa196c48..fd9b8234123 100644
--- a/lib/Target/ARM/ARMInstrNEON.td
+++ b/lib/Target/ARM/ARMInstrNEON.td
@@ -854,6 +854,47 @@ def VLD1DUPq16Pseudo_UPD : VLDQWBPseudo<IIC_VLD1dupu>;
 def VLD1DUPq32Pseudo_UPD : VLDQWBPseudo<IIC_VLD1dupu>;
 
 // VLD2DUP : Vector Load (single 2-element structure to all lanes)
+class VLD2DUP<bits<4> op7_4, string Dt>
+  : NLdSt<1, 0b10, 0b1101, op7_4, (outs DPR:$Vd, DPR:$dst2),
+          (ins addrmode6:$Rn), IIC_VLD2dup,
+          "vld2", Dt, "\\{$Vd[], $dst2[]\\}, $Rn", "", []> {
+  let Rm = 0b1111;
+  let Inst{4} = Rn{4};
+}
+
+def VLD2DUPd8  : VLD2DUP<{0,0,0,?}, "8">;
+def VLD2DUPd16 : VLD2DUP<{0,1,0,?}, "16">;
+def VLD2DUPd32 : VLD2DUP<{1,0,0,?}, "32">;
+
+def VLD2DUPd8Pseudo  : VLDQPseudo<IIC_VLD2dup>;
+def VLD2DUPd16Pseudo : VLDQPseudo<IIC_VLD2dup>;
+def VLD2DUPd32Pseudo : VLDQPseudo<IIC_VLD2dup>;
+
+// ...with double-spaced registers (not used for codegen):
+def VLD2DUPd8Q  : VLD2DUP<{0,0,1,?}, "8">;
+def VLD2DUPd16Q : VLD2DUP<{0,1,1,?}, "16">;
+def VLD2DUPd32Q : VLD2DUP<{1,0,1,?}, "32">;
+
+// ...with address register writeback:
+class VLD2DUPWB<bits<4> op7_4, string Dt>
+  : NLdSt<1, 0b10, 0b1101, op7_4, (outs DPR:$Vd, DPR:$dst2, GPR:$wb),
+          (ins addrmode6:$Rn, am6offset:$Rm), IIC_VLD2dupu,
+          "vld2", Dt, "\\{$Vd[], $dst2[]\\}, $Rn$Rm", "$Rn.addr = $wb", []> {
+  let Inst{4} = Rn{4};
+}
+
+def VLD2DUPd8_UPD  : VLD2DUPWB<{0,0,0,0}, "8">;
+def VLD2DUPd16_UPD : VLD2DUPWB<{0,1,0,?}, "16">;
+def VLD2DUPd32_UPD : VLD2DUPWB<{1,0,0,?}, "32">;
+
+def VLD2DUPd8Q_UPD  : VLD2DUPWB<{0,0,1,0}, "8">;
+def VLD2DUPd16Q_UPD : VLD2DUPWB<{0,1,1,?}, "16">;
+def VLD2DUPd32Q_UPD : VLD2DUPWB<{1,0,1,?}, "32">;
+
+def VLD2DUPd8Pseudo_UPD  : VLDQWBPseudo<IIC_VLD2dupu>;
+def VLD2DUPd16Pseudo_UPD : VLDQWBPseudo<IIC_VLD2dupu>;
+def VLD2DUPd32Pseudo_UPD : VLDQWBPseudo<IIC_VLD2dupu>;
+
 // VLD3DUP : Vector Load (single 3-element structure to all lanes)
 // VLD4DUP : Vector Load (single 4-element structure to all lanes)
 // FIXME: Not yet implemented.
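For orientation, the op7_4 bits in these definitions follow the VLD2 (single 2-element structure to all lanes) field layout: two element-size bits, the register-spacing bit T, and the alignment bit a, with a wired to bit 4 of the address operand by "let Inst{4} = Rn{4}". A sketch of that packing; the helper is illustrative only, and the field layout is my reading of the ARM architecture manual rather than something stated in the patch:

    // Compose op7_4 for a VLD2DUP variant (sketch).
    static unsigned vld2dupOp7_4(unsigned SizeField,  // 0=8-bit, 1=16-bit, 2=32-bit
                                 bool DoubleSpaced,   // the "Q" variants
                                 bool Aligned) {
      return (SizeField << 2) | (unsigned(DoubleSpaced) << 1)
                              | unsigned(Aligned);
    }

Read back against the defs: VLD2DUPd16Q's {0,1,1,?} means 16-bit elements in double-spaced registers, with the alignment bit taken from the address operand.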
diff --git a/lib/Target/ARM/ARMSchedule.td b/lib/Target/ARM/ARMSchedule.td
index acef2f6bdbc..5202da46390 100644
--- a/lib/Target/ARM/ARMSchedule.td
+++ b/lib/Target/ARM/ARMSchedule.td
@@ -146,6 +146,8 @@ def IIC_VLD2u : InstrItinClass;
 def IIC_VLD2x2u : InstrItinClass;
 def IIC_VLD2ln : InstrItinClass;
 def IIC_VLD2lnu : InstrItinClass;
+def IIC_VLD2dup : InstrItinClass;
+def IIC_VLD2dupu : InstrItinClass;
 def IIC_VLD3 : InstrItinClass;
 def IIC_VLD3ln : InstrItinClass;
 def IIC_VLD3u : InstrItinClass;
diff --git a/lib/Target/ARM/ARMScheduleA8.td b/lib/Target/ARM/ARMScheduleA8.td
index 44756b7a068..a364c4eccb1 100644
--- a/lib/Target/ARM/ARMScheduleA8.td
+++ b/lib/Target/ARM/ARMScheduleA8.td
@@ -523,6 +523,18 @@ def CortexA8Itineraries : ProcessorItineraries<
                               InstrStage<3, [A8_LSPipe]>],
                              [3, 3, 2, 1, 1, 1, 1, 1]>,
   //
+  // VLD2dup
+  InstrItinData<IIC_VLD2dup, [InstrStage<2, [A8_Pipe0, A8_Pipe1], 0>,
+                              InstrStage<2, [A8_NLSPipe], 0>,
+                              InstrStage<2, [A8_LSPipe]>],
+                             [2, 1]>,
+  //
+  // VLD2dupu
+  InstrItinData<IIC_VLD2dupu, [InstrStage<2, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NLSPipe], 0>,
+                               InstrStage<2, [A8_LSPipe]>],
+                              [2, 2, 1, 1]>,
+  //
   // VLD3
   InstrItinData<IIC_VLD3,    [InstrStage<4, [A8_Pipe0, A8_Pipe1], 0>,
                               InstrStage<4, [A8_NLSPipe], 0>,
diff --git a/lib/Target/ARM/ARMScheduleA9.td b/lib/Target/ARM/ARMScheduleA9.td
index 0f4e79aeee3..53c53bf1af8 100644
--- a/lib/Target/ARM/ARMScheduleA9.td
+++ b/lib/Target/ARM/ARMScheduleA9.td
@@ -887,6 +887,24 @@ def CortexA9Itineraries : ProcessorItineraries<
                               InstrStage<3, [A9_LSUnit]>],
                              [4, 4, 2, 1, 1, 1, 1, 1]>,
   //
+  // VLD2dup
+  InstrItinData<IIC_VLD2dup, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                              InstrStage<1, [A9_MUX0], 0>,
+                              InstrStage<1, [A9_DRegsN],   0, Required>,
+                              InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                              InstrStage<2, [A9_NPipe], 0>,
+                              InstrStage<2, [A9_LSUnit]>],
+                             [3, 1]>,
+  //
+  // VLD2dupu
+  InstrItinData<IIC_VLD2dupu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [3, 2, 1, 1]>,
+  //
   // VLD3
   InstrItinData<IIC_VLD3, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                            InstrStage<1, [A9_MUX0], 0>,
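Reading these itinerary entries (standard InstrItinData semantics; treat this gloss as informal): the stage lists say a VLD2DUP occupies the NEON load/store pipe and the load/store unit for two cycles, and on Cortex-A9 additionally reserves the DRegsVFP unit for 8 cycles to model the NEON/VFP interlock. The trailing lists give per-operand cycles, defs first:

    // Operand-cycle lists from the entries above (defs, then uses).
    static const unsigned A8_VLD2dup[]  = { 2, 1 };        // D defs at 2, base use at 1
    static const unsigned A8_VLD2dupu[] = { 2, 2, 1, 1 };  // plus writeback def at 2
    static const unsigned A9_VLD2dup[]  = { 3, 1 };
    static const unsigned A9_VLD2dupu[] = { 3, 2, 1, 1 };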
diff --git a/test/CodeGen/ARM/vlddup.ll b/test/CodeGen/ARM/vlddup.ll
index f172ed56d96..00dd134d770 100644
--- a/test/CodeGen/ARM/vlddup.ll
+++ b/test/CodeGen/ARM/vlddup.ll
@@ -39,3 +39,35 @@ define <16 x i8> @vld1dupQi8(i8* %A) nounwind {
         %tmp3 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <16 x i32> zeroinitializer
         ret <16 x i8> %tmp3
 }
+
+%struct.__neon_int8x8x2_t = type { <8 x i8>, <8 x i8> }
+%struct.__neon_int2x32x2_t = type { <2 x i32>, <2 x i32> }
+
+define <8 x i8> @vld2dupi8(i8* %A) nounwind {
+;CHECK: vld2dupi8:
+;Check the (default) alignment value.
+;CHECK: vld2.8 {d16[], d17[]}, [r0]
+        %tmp0 = tail call %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8(i8* %A, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
+        %tmp1 = extractvalue %struct.__neon_int8x8x2_t %tmp0, 0
+        %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> zeroinitializer
+        %tmp3 = extractvalue %struct.__neon_int8x8x2_t %tmp0, 1
+        %tmp4 = shufflevector <8 x i8> %tmp3, <8 x i8> undef, <8 x i32> zeroinitializer
+        %tmp5 = add <8 x i8> %tmp2, %tmp4
+        ret <8 x i8> %tmp5
+}
+
+define <2 x i32> @vld2dupi32(i32* %A) nounwind {
+;CHECK: vld2dupi32:
+;Check the alignment value.  Max for this instruction is 64 bits:
+;CHECK: vld2.32 {d16[], d17[]}, [r0, :64]
+        %tmp0 = tail call %struct.__neon_int2x32x2_t @llvm.arm.neon.vld2lane.v2i32(i32* %A, <2 x i32> undef, <2 x i32> undef, i32 0, i32 16)
+        %tmp1 = extractvalue %struct.__neon_int2x32x2_t %tmp0, 0
+        %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> zeroinitializer
+        %tmp3 = extractvalue %struct.__neon_int2x32x2_t %tmp0, 1
+        %tmp4 = shufflevector <2 x i32> %tmp3, <2 x i32> undef, <2 x i32> zeroinitializer
+        %tmp5 = add <2 x i32> %tmp2, %tmp4
+        ret <2 x i32> %tmp5
+}
+
+declare %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly
+declare %struct.__neon_int2x32x2_t @llvm.arm.neon.vld2lane.v2i32(i32*, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly
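As a final cross-check of vld2dupi32 against the clampVLDDupAlign sketch in the ARMISelDAGToDAG notes above (hypothetical helper, not part of the patch):

    #include <cassert>

    void checkVld2DupAlign() {
      // "i32 16" requests 16-byte alignment; two <2 x i32> results span
      // only 8 bytes, so the clamp yields 8 bytes = 64 bits, hence the
      // "[r0, :64]" CHECK line.  The i8 test passes alignment 1, which
      // canonicalizes to 0 and prints no alignment suffix at all.
      assert(clampVLDDupAlign(/*Requested=*/16, /*NumVecs=*/2,
                              /*EltBits=*/32) == 8);
    }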