From c485ff3ced6c032b811d81934e9ae11c52f676dc Mon Sep 17 00:00:00 2001
From: Bob Wilson
Date: Fri, 7 Jan 2011 21:37:30 +0000
Subject: [PATCH] Lower some BUILD_VECTORS using VEXT+shuffle.

Patch by Tim Northover.

llvm-svn: 123035
---
 lib/Target/ARM/ARMISelLowering.cpp | 135 ++++++++++++++++++++++++++++-
 lib/Target/ARM/ARMISelLowering.h   |   4 +
 test/CodeGen/ARM/vext.ll           |  59 +++++++++++++
 3 files changed, 196 insertions(+), 2 deletions(-)

diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index b4432d52369..98939732a2b 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -3530,8 +3530,8 @@ static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG,
 
 // If this is a case we can't handle, return null and let the default
 // expansion code take care of it.
-static SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
-                                 const ARMSubtarget *ST) {
+SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
+                                             const ARMSubtarget *ST) const {
   BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
   DebugLoc dl = Op.getDebugLoc();
   EVT VT = Op.getValueType();
@@ -3622,6 +3622,13 @@ static SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
   if (isConstant)
     return SDValue();
 
+  // Empirical tests suggest this is rarely worth it for vectors of length <= 2.
+  if (NumElts >= 4) {
+    SDValue shuffle = ReconstructShuffle(Op, DAG);
+    if (shuffle != SDValue())
+      return shuffle;
+  }
+
   // Vectors with 32- or 64-bit elements can be built by directly assigning
   // the subregisters.  Lower it to an ARMISD::BUILD_VECTOR so the operands
   // will be legalized.
@@ -3640,6 +3647,130 @@ static SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
   return SDValue();
 }
 
+// Gather data to see if the operation can be modelled as a
+// shuffle in combination with VEXTs.
+SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  DebugLoc dl = Op.getDebugLoc();
+  EVT VT = Op.getValueType();
+  unsigned NumElts = VT.getVectorNumElements();
+
+  SmallVector<SDValue, 2> SourceVecs;
+  SmallVector<unsigned, 2> MinElts;
+  SmallVector<unsigned, 2> MaxElts;
+
+  for (unsigned i = 0; i < NumElts; ++i) {
+    SDValue V = Op.getOperand(i);
+    if (V.getOpcode() == ISD::UNDEF)
+      continue;
+    else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) {
+      // A shuffle can only come from building a vector from various
+      // elements of other vectors.
+      return SDValue();
+    }
+
+    // Record this extraction against the appropriate vector if possible...
+    SDValue SourceVec = V.getOperand(0);
+    unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue();
+    bool FoundSource = false;
+    for (unsigned j = 0; j < SourceVecs.size(); ++j) {
+      if (SourceVecs[j] == SourceVec) {
+        if (MinElts[j] > EltNo)
+          MinElts[j] = EltNo;
+        if (MaxElts[j] < EltNo)
+          MaxElts[j] = EltNo;
+        FoundSource = true;
+        break;
+      }
+    }
+
+    // Or record a new source if not...
+    if (!FoundSource) {
+      SourceVecs.push_back(SourceVec);
+      MinElts.push_back(EltNo);
+      MaxElts.push_back(EltNo);
+    }
+  }
+
+  // Currently only do something sane when at most two source vectors
+  // are involved.
+  if (SourceVecs.size() > 2)
+    return SDValue();
+
+  SDValue ShuffleSrcs[2] = { DAG.getUNDEF(VT), DAG.getUNDEF(VT) };
+  int VEXTOffsets[2] = { 0, 0 };
+
+  // This loop extracts the usage patterns of the source vectors
+  // and prepares appropriate SDValues for a shuffle if possible.
+  for (unsigned i = 0; i < SourceVecs.size(); ++i) {
+    if (SourceVecs[i].getValueType() == VT) {
+      // No VEXT necessary
+      ShuffleSrcs[i] = SourceVecs[i];
+      VEXTOffsets[i] = 0;
+      continue;
+    } else if (SourceVecs[i].getValueType().getVectorNumElements() < NumElts) {
+      // It probably isn't worth padding out a smaller vector just to
+      // break it down again in a shuffle.
+      return SDValue();
+    }
+
+    unsigned SrcNumElts = SourceVecs[i].getValueType().getVectorNumElements();
+
+    // Since only 64-bit and 128-bit vectors are legal on ARM and
+    // we've eliminated the other cases...
+    assert(SrcNumElts == 2*NumElts);
+
+    if (MaxElts[i] - MinElts[i] >= NumElts) {
+      // Span too large for a VEXT to cope
+      return SDValue();
+    }
+
+    if (MinElts[i] >= NumElts) {
+      // The extraction can just take the second half
+      VEXTOffsets[i] = NumElts;
+      ShuffleSrcs[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
+                                   SourceVecs[i],
+                                   DAG.getIntPtrConstant(NumElts));
+    } else if (MaxElts[i] < NumElts) {
+      // The extraction can just take the first half
+      VEXTOffsets[i] = 0;
+      ShuffleSrcs[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
+                                   SourceVecs[i],
+                                   DAG.getIntPtrConstant(0));
+    } else {
+      // An actual VEXT is needed
+      VEXTOffsets[i] = MinElts[i];
+      SDValue VEXTSrc1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
+                                     SourceVecs[i],
+                                     DAG.getIntPtrConstant(0));
+      SDValue VEXTSrc2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
+                                     SourceVecs[i],
+                                     DAG.getIntPtrConstant(NumElts));
+      ShuffleSrcs[i] = DAG.getNode(ARMISD::VEXT, dl, VT, VEXTSrc1, VEXTSrc2,
+                                   DAG.getConstant(VEXTOffsets[i], MVT::i32));
+    }
+  }
+
+  SmallVector<int, 8> Mask;
+
+  for (unsigned i = 0; i < NumElts; ++i) {
+    SDValue Entry = Op.getOperand(i);
+    if (Entry.getOpcode() == ISD::UNDEF) {
+      Mask.push_back(-1);
+      continue;
+    }
+
+    SDValue ExtractVec = Entry.getOperand(0);
+    int ExtractElt = cast<ConstantSDNode>(Op.getOperand(i).getOperand(1))->getSExtValue();
+    if (ExtractVec == SourceVecs[0]) {
+      Mask.push_back(ExtractElt - VEXTOffsets[0]);
+    } else {
+      Mask.push_back(ExtractElt + NumElts - VEXTOffsets[1]);
+    }
+  }
+
+  // Final check before we try to produce nonsense...
+  if (isShuffleMaskLegal(Mask, VT))
+    return DAG.getVectorShuffle(VT, dl, ShuffleSrcs[0], ShuffleSrcs[1],
+                                &Mask[0]);
+
+  return SDValue();
+}
+
 /// isShuffleMaskLegal - Targets can use this to indicate that they only
 /// support *some* VECTOR_SHUFFLE operations, those with specific masks.
 /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h
index 6d11bad357e..b25cb96d3e3 100644
--- a/lib/Target/ARM/ARMISelLowering.h
+++ b/lib/Target/ARM/ARMISelLowering.h
@@ -378,6 +378,10 @@ namespace llvm {
     SDValue LowerShiftRightParts(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
+                              const ARMSubtarget *ST) const;
+
+    SDValue ReconstructShuffle(SDValue Op, SelectionDAG &DAG) const;
 
     SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
                             CallingConv::ID CallConv, bool isVarArg,
diff --git a/test/CodeGen/ARM/vext.ll b/test/CodeGen/ARM/vext.ll
index e460a84f626..55abefef0fa 100644
--- a/test/CodeGen/ARM/vext.ll
+++ b/test/CodeGen/ARM/vext.ll
@@ -74,3 +74,62 @@ define <16 x i8> @test_vextRq_undef(<16 x i8>* %A, <16 x i8>* %B) nounwind {
   ret <16 x i8> %tmp3
 }
 
+; Tests for ReconstructShuffle function.  Indices have to be carefully
+; chosen to reach the lowering phase as a BUILD_VECTOR.
+
+; One vector needs vext, the other can be handled by extract_subvector.
+; Also checks that interleaving of sources is handled correctly.
+; Essence: a vext is used on %A and something saner than a stack load/store
+; is used for the final result.
+define <4 x i16> @test_interleaved(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK: test_interleaved:
+;CHECK: vext.16
+;CHECK-NOT: vext.16
+;CHECK: vzip.16
+  %tmp1 = load <8 x i16>* %A
+  %tmp2 = load <8 x i16>* %B
+  %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <4 x i32> <i32 3, i32 8, i32 5, i32 9>
+  ret <4 x i16> %tmp3
+}
+
+; An undef in the shuffle list should still be optimizable.
+define <4 x i16> @test_undef(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK: test_undef:
+;CHECK: vzip.16
+  %tmp1 = load <8 x i16>* %A
+  %tmp2 = load <8 x i16>* %B
+  %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <4 x i32> <i32 undef, i32 8, i32 5, i32 9>
+  ret <4 x i16> %tmp3
+}
+
+; We should ignore a build_vector with more than two sources.
+; Use the illegal <32 x i16> type to produce such a shuffle after legalizing types.
+; Check for the fallback to stack expansion.
+define <4 x i16> @test_multisource(<32 x i16>* %B) nounwind {
+;CHECK: test_multisource:
+;CHECK: vst1.16
+  %tmp1 = load <32 x i16>* %B
+  %tmp2 = shufflevector <32 x i16> %tmp1, <32 x i16> undef, <4 x i32> <i32 0, i32 8, i32 16, i32 24>
+  ret <4 x i16> %tmp2
+}
+
+; We don't handle shuffles using more than half of a 128-bit vector.
+; Again, check for the fallback to stack expansion.
+define <4 x i16> @test_largespan(<8 x i16>* %B) nounwind {
+;CHECK: test_largespan:
+;CHECK: vst1.16
+  %tmp1 = load <8 x i16>* %B
+  %tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  ret <4 x i16> %tmp2
+}
+
+; The actual shuffle code only handles some cases; make sure we check
+; this rather than blindly emitting a VECTOR_SHUFFLE (an infinite
+; lowering loop can result otherwise).
+define <8 x i8> @test_illegal(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK: test_illegal:
+;CHECK: vst1.8
+  %tmp1 = load <16 x i8>* %A
+  %tmp2 = load <16 x i8>* %B
+  %tmp3 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <8 x i32> <i32 6, i32 16, i32 11, i32 17, i32 7, i32 18, i32 9, i32 19>
+  ret <8 x i8> %tmp3
+}
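
As a worked illustration of the new path (a sketch, not part of the patch): the function name @reconstruct_demo, the shuffle mask, and the RUN invocation below are illustrative assumptions modelled on test_interleaved above, chosen so that the rebased mask is exactly a VZIP; they are not taken from the commit itself.

; Hypothetical input, assuming a RUN line along the lines of the one vext.ll
; already uses (not shown in this hunk), e.g.
;   llc < %s -march=arm -mattr=+neon | FileCheck %s
define <4 x i16> @reconstruct_demo(<8 x i16>* %A, <8 x i16>* %B) nounwind {
  ; The <4 x i16> result cannot be formed by extracting an aligned half of
  ; %tmp1 (its elements 3 and 4 straddle the middle), so the DAG builder
  ; should fall back to a BUILD_VECTOR of EXTRACT_VECTOR_ELTs, which is what
  ; ReconstructShuffle sees.
  %tmp1 = load <8 x i16>* %A
  %tmp2 = load <8 x i16>* %B
  ; For %tmp1: MinElts = 3, MaxElts = 4, so an ARMISD::VEXT with offset 3 is
  ; built.  For %tmp2: indices 8 and 9 are its elements 0 and 1, so its low
  ; half is taken with a plain EXTRACT_SUBVECTOR (offset 0).  Rebasing the
  ; mask by those offsets gives <0, 4, 1, 5>, a VZIP mask that
  ; isShuffleMaskLegal accepts, so the expected output is one vext.16 feeding
  ; one vzip.16 rather than an element-by-element stack expansion.
  %out = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <4 x i32> <i32 3, i32 8, i32 4, i32 9>
  ret <4 x i16> %out
}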