
[ARM] Update ReconstructShuffle to handle mismatched types

Summary:
Port the ReconstructShuffle function from AArch64 to ARM
to handle mismatched incoming types in the BUILD_VECTOR
node.

This fixes an outstanding FIXME in the ReconstructShuffle
code.

Reviewers: t.p.northover, rengolin

Subscribers: aemerson, llvm-commits, rengolin

Differential Revision: http://reviews.llvm.org/D11720

llvm-svn: 244314
Silviu Baranga 2015-08-07 11:40:46 +00:00
parent 093484721d
commit f1f3f2018c
3 changed files with 264 additions and 93 deletions


@@ -5529,18 +5529,46 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
return SDValue();
}
/// getExtFactor - Determine the adjustment factor for the position when
/// generating an "extract from vector registers" instruction.
static unsigned getExtFactor(SDValue &V) {
EVT EltType = V.getValueType().getVectorElementType();
return EltType.getSizeInBits() / 8;
}
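// Worked example (hypothetical operand type, not taken from a specific test):
// for a vector of i16 elements this returns 16 / 8 = 2, i.e. the element size
// in bytes; it is used further down to scale Src.MinElt when forming the VEXT
// immediate.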
// Gather data to see if the operation can be modelled as a
// shuffle in combination with VEXTs.
SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
SelectionDAG &DAG) const {
assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
SDLoc dl(Op);
EVT VT = Op.getValueType();
unsigned NumElts = VT.getVectorNumElements();
SmallVector<SDValue, 2> SourceVecs;
SmallVector<unsigned, 2> MinElts;
SmallVector<unsigned, 2> MaxElts;
struct ShuffleSourceInfo {
SDValue Vec;
unsigned MinElt;
unsigned MaxElt;
// We may insert some combination of BITCASTs and VEXT nodes to force Vec to
// be compatible with the shuffle we intend to construct. As a result
// ShuffleVec will be some sliding window into the original Vec.
SDValue ShuffleVec;
// Code should guarantee that element i in Vec starts at element "WindowBase
// + i * WindowScale in ShuffleVec".
int WindowBase;
int WindowScale;
bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
ShuffleSourceInfo(SDValue Vec)
: Vec(Vec), MinElt(UINT_MAX), MaxElt(0), ShuffleVec(Vec), WindowBase(0),
WindowScale(1) {}
};
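// Illustrative example of the invariant above (types assumed for the sake of
// the example): if Vec is a v4i16 source that later gets bitcast to a v8i8
// ShuffleVec, WindowScale becomes 2, so element 3 of Vec occupies ShuffleVec
// lanes WindowBase + 6 and WindowBase + 7.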
// First gather all vectors used as an immediate source for this BUILD_VECTOR
// node.
SmallVector<ShuffleSourceInfo, 2> Sources;
for (unsigned i = 0; i < NumElts; ++i) {
SDValue V = Op.getOperand(i);
if (V.getOpcode() == ISD::UNDEF)
@@ -5549,127 +5577,161 @@ SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
// A shuffle can only come from building a vector from various
// elements of other vectors.
return SDValue();
} else if (V.getOperand(0).getValueType().getVectorElementType() !=
VT.getVectorElementType()) {
// This code doesn't know how to handle shuffles where the vector
// element types do not match (this happens because type legalization
// promotes the return type of EXTRACT_VECTOR_ELT).
// FIXME: It might be appropriate to extend this code to handle
// mismatched types.
return SDValue();
}
// Record this extraction against the appropriate vector if possible...
// Add this element source to the list if it's not already there.
SDValue SourceVec = V.getOperand(0);
// If the element number isn't a constant, we can't effectively
// analyze what's going on.
if (!isa<ConstantSDNode>(V.getOperand(1)))
return SDValue();
unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue();
bool FoundSource = false;
for (unsigned j = 0; j < SourceVecs.size(); ++j) {
if (SourceVecs[j] == SourceVec) {
if (MinElts[j] > EltNo)
MinElts[j] = EltNo;
if (MaxElts[j] < EltNo)
MaxElts[j] = EltNo;
FoundSource = true;
break;
}
}
auto Source = std::find(Sources.begin(), Sources.end(), SourceVec);
if (Source == Sources.end())
Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));
// Or record a new source if not...
if (!FoundSource) {
SourceVecs.push_back(SourceVec);
MinElts.push_back(EltNo);
MaxElts.push_back(EltNo);
}
// Update the minimum and maximum lane number seen.
unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue();
Source->MinElt = std::min(Source->MinElt, EltNo);
Source->MaxElt = std::max(Source->MaxElt, EltNo);
}
// Currently only do something sane when at most two source vectors
// involved.
if (SourceVecs.size() > 2)
// are involved.
if (Sources.size() > 2)
return SDValue();
SDValue ShuffleSrcs[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT) };
int VEXTOffsets[2] = {0, 0};
// Find out the smallest element size among result and two sources, and use
// it as element size to build the shuffle_vector.
EVT SmallestEltTy = VT.getVectorElementType();
for (auto &Source : Sources) {
EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
if (SrcEltTy.bitsLT(SmallestEltTy))
SmallestEltTy = SrcEltTy;
}
unsigned ResMultiplier =
VT.getVectorElementType().getSizeInBits() / SmallestEltTy.getSizeInBits();
NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits();
EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);
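// Worked example (hypothetical types): for VT = v4i16 with one v4i16 source
// and one v8i8 source, SmallestEltTy is i8, so ResMultiplier = 16 / 8 = 2,
// NumElts = 64 / 8 = 8 and ShuffleVT = v8i8; each result element will then
// cover two lanes of the shuffle mask built below.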
// This loop extracts the usage patterns of the source vectors
// and prepares appropriate SDValues for a shuffle if possible.
for (unsigned i = 0; i < SourceVecs.size(); ++i) {
if (SourceVecs[i].getValueType() == VT) {
// No VEXT necessary
ShuffleSrcs[i] = SourceVecs[i];
VEXTOffsets[i] = 0;
// If the source vector is too wide or too narrow, we may nevertheless be able
// to construct a compatible shuffle either by concatenating it with UNDEF or
// extracting a suitable range of elements.
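// For example (assumed types): a v4i16 source feeding a 128-bit VT is
// concatenated with undef to form a v8i16, while a v4i32 source feeding a
// 64-bit VT is narrowed to a v2i32 slice taken from whichever half holds the
// used elements; sources whose used elements straddle both halves are handled
// with a VEXT instead.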
for (auto &Src : Sources) {
EVT SrcVT = Src.ShuffleVec.getValueType();
if (SrcVT.getSizeInBits() == VT.getSizeInBits())
continue;
} else if (SourceVecs[i].getValueType().getVectorNumElements() < NumElts) {
// It probably isn't worth padding out a smaller vector just to
// break it down again in a shuffle.
// This stage of the search produces a source with the same element type as
// the original, but with a total width matching the BUILD_VECTOR output.
EVT EltVT = SrcVT.getVectorElementType();
unsigned NumSrcElts = VT.getSizeInBits() / EltVT.getSizeInBits();
EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);
if (SrcVT.getSizeInBits() < VT.getSizeInBits()) {
if (2 * SrcVT.getSizeInBits() != VT.getSizeInBits())
return SDValue();
// We can pad out the smaller vector for free, so if it's part of a
// shuffle...
Src.ShuffleVec =
DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec,
DAG.getUNDEF(Src.ShuffleVec.getValueType()));
continue;
}
// Since only 64-bit and 128-bit vectors are legal on ARM and
// we've eliminated the other cases...
assert(SourceVecs[i].getValueType().getVectorNumElements() == 2*NumElts &&
"unexpected vector sizes in ReconstructShuffle");
if (SrcVT.getSizeInBits() != 2 * VT.getSizeInBits())
return SDValue();
if (MaxElts[i] - MinElts[i] >= NumElts) {
if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
// Span too large for a VEXT to cope
return SDValue();
}
if (MinElts[i] >= NumElts) {
if (Src.MinElt >= NumSrcElts) {
// The extraction can just take the second half
VEXTOffsets[i] = NumElts;
ShuffleSrcs[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
SourceVecs[i],
DAG.getIntPtrConstant(NumElts, dl));
} else if (MaxElts[i] < NumElts) {
Src.ShuffleVec =
DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
DAG.getConstant(NumSrcElts, dl, MVT::i32));
Src.WindowBase = -NumSrcElts;
} else if (Src.MaxElt < NumSrcElts) {
// The extraction can just take the first half
VEXTOffsets[i] = 0;
ShuffleSrcs[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
SourceVecs[i],
DAG.getIntPtrConstant(0, dl));
Src.ShuffleVec =
DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
DAG.getConstant(0, dl, MVT::i32));
} else {
// An actual VEXT is needed
VEXTOffsets[i] = MinElts[i];
SDValue VEXTSrc1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
SourceVecs[i],
DAG.getIntPtrConstant(0, dl));
SDValue VEXTSrc2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
SourceVecs[i],
DAG.getIntPtrConstant(NumElts, dl));
ShuffleSrcs[i] = DAG.getNode(ARMISD::VEXT, dl, VT, VEXTSrc1, VEXTSrc2,
DAG.getConstant(VEXTOffsets[i], dl,
MVT::i32));
SDValue VEXTSrc1 =
DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
DAG.getConstant(0, dl, MVT::i32));
SDValue VEXTSrc2 =
DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
DAG.getConstant(NumSrcElts, dl, MVT::i32));
unsigned Imm = Src.MinElt * getExtFactor(VEXTSrc1);
Src.ShuffleVec = DAG.getNode(ARMISD::VEXT, dl, DestVT, VEXTSrc1,
VEXTSrc2,
DAG.getConstant(Imm, dl, MVT::i32));
Src.WindowBase = -Src.MinElt;
}
}
SmallVector<int, 8> Mask;
for (unsigned i = 0; i < NumElts; ++i) {
SDValue Entry = Op.getOperand(i);
if (Entry.getOpcode() == ISD::UNDEF) {
Mask.push_back(-1);
// Another possible incompatibility occurs from the vector element types. We
// can fix this by bitcasting the source vectors to the same type we intend
// for the shuffle.
for (auto &Src : Sources) {
EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
if (SrcEltTy == SmallestEltTy)
continue;
assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec);
Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits();
Src.WindowBase *= Src.WindowScale;
}
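// Illustrative example (assumed types): a v2i32 slice taken from the high
// half of a v4i32 source starts out with WindowBase = -2; after the bitcast
// to a v8i8 ShuffleVT, WindowScale becomes 32 / 8 = 4 and WindowBase is
// rescaled to -8.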
SDValue ExtractVec = Entry.getOperand(0);
int ExtractElt = cast<ConstantSDNode>(Op.getOperand(i)
.getOperand(1))->getSExtValue();
if (ExtractVec == SourceVecs[0]) {
Mask.push_back(ExtractElt - VEXTOffsets[0]);
} else {
Mask.push_back(ExtractElt + NumElts - VEXTOffsets[1]);
}
// Final sanity check before we try to actually produce a shuffle.
for (auto Src : Sources)
assert(Src.ShuffleVec.getValueType() == ShuffleVT);
// The stars all align, our next step is to produce the mask for the shuffle.
SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
int BitsPerShuffleLane = ShuffleVT.getVectorElementType().getSizeInBits();
for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
SDValue Entry = Op.getOperand(i);
if (Entry.getOpcode() == ISD::UNDEF)
continue;
auto Src = std::find(Sources.begin(), Sources.end(), Entry.getOperand(0));
int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();
// EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
// trunc. So only std::min(SrcBits, DestBits) actually get defined in this
// segment.
EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
int BitsDefined = std::min(OrigEltTy.getSizeInBits(),
VT.getVectorElementType().getSizeInBits());
int LanesDefined = BitsDefined / BitsPerShuffleLane;
// This source is expected to fill ResMultiplier lanes of the final shuffle,
// starting at the appropriate offset.
int *LaneMask = &Mask[i * ResMultiplier];
int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
ExtractBase += NumElts * (Src - Sources.begin());
for (int j = 0; j < LanesDefined; ++j)
LaneMask[j] = ExtractBase + j;
}
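// Worked example (roughly the shape the vuzp_trunc test added by this patch
// reaches after legalization; the exact DAG is not guaranteed): a v8i8
// BUILD_VECTOR fed by two v4i16 sources gives ShuffleVT = v8i8 and
// WindowScale = 2 for each source, so entry i extracting element e from
// source s contributes ExtractBase = 2 * e + 8 * s with LanesDefined = 1.
// That yields the mask <0,2,4,6,8,10,12,14>, i.e. a VUZP-style shuffle.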
// Final check before we try to produce nonsense...
if (isShuffleMaskLegal(Mask, VT))
return DAG.getVectorShuffle(VT, dl, ShuffleSrcs[0], ShuffleSrcs[1],
&Mask[0]);
if (!isShuffleMaskLegal(Mask, ShuffleVT))
return SDValue();
// We can't handle more than two sources. This should have already
// been checked before this point.
assert(Sources.size() <= 2 && "Too many sources!");
SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
for (unsigned i = 0; i < Sources.size(); ++i)
ShuffleOps[i] = Sources[i].ShuffleVec;
SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
ShuffleOps[1], &Mask[0]);
return DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
}
/// isShuffleMaskLegal - Targets can use this to indicate that they only


@@ -335,3 +335,39 @@ entry:
%0 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 5, i32 3, i32 7>
ret <8 x i16> %0
}
; Here we get a build_vector node where all of the incoming extract_element
; values have their type modified. However, the input types differ: some of
; them get truncated from i32 to i8 (from comparing cmp0 with cmp1) and some
; get truncated from i16 to i8 (from comparing cmp2 with cmp3).
define <8 x i8> @vtrn_mismatched_builvector0(<8 x i8> %tr0, <8 x i8> %tr1,
<4 x i32> %cmp0, <4 x i32> %cmp1,
<4 x i16> %cmp2, <4 x i16> %cmp3) {
; CHECK-LABEL: vtrn_mismatched_builvector0
; CHECK: vmovn.i32
; CHECK: vtrn
; CHECK: vbsl
%c0 = icmp ult <4 x i32> %cmp0, %cmp1
%c1 = icmp ult <4 x i16> %cmp2, %cmp3
%c = shufflevector <4 x i1> %c0, <4 x i1> %c1, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
%rv = select <8 x i1> %c, <8 x i8> %tr0, <8 x i8> %tr1
ret <8 x i8> %rv
}
; Here we get a build_vector node where half of the incoming extract_element
; values keep their original type (the values from cmp2), while the other half
; have theirs truncated (the values from the icmp operation).
define <8 x i8> @vtrn_mismatched_builvector1(<8 x i8> %tr0, <8 x i8> %tr1,
<4 x i32> %cmp0, <4 x i32> %cmp1, <4 x i8> *%cmp2_ptr) {
; CHECK-LABEL: vtrn_mismatched_builvector1
; We need to extend the 4 x i8 to 4 x i16 in order to perform the vtrn
; CHECK: vmovl
; CHECK: vtrn.8
; CHECK: vbsl
%cmp2_load = load <4 x i8>, <4 x i8> * %cmp2_ptr, align 4
%cmp2 = trunc <4 x i8> %cmp2_load to <4 x i1>
%c0 = icmp ult <4 x i32> %cmp0, %cmp1
%c = shufflevector <4 x i1> %c0, <4 x i1> %cmp2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
%rv = select <8 x i1> %c, <8 x i8> %tr0, <8 x i8> %tr1
ret <8 x i8> %rv
}


@@ -285,3 +285,76 @@ entry:
%0 = shufflevector <2 x i32> %tmp1, <2 x i32> %tmp2, <4 x i32> <i32 0, i32 0, i32 1, i32 3>
ret <4 x i32> %0
}
define <8 x i8> @vuzp_trunc(<8 x i8> %in0, <8 x i8> %in1, <8 x i32> %cmp0, <8 x i32> %cmp1) {
; In order to create the select we need to truncate the vcgt result from a vector of i32 to a vector of i8.
; This results in a build_vector with mismatched types. We will generate two vmovn.i32 instructions to
; truncate from i32 to i16 and one vuzp to perform the final truncation to i8.
; CHECK-LABEL: vuzp_trunc
; CHECK: vmovn.i32
; CHECK: vmovn.i32
; CHECK: vuzp
; CHECK: vbsl
%c = icmp ult <8 x i32> %cmp0, %cmp1
%res = select <8 x i1> %c, <8 x i8> %in0, <8 x i8> %in1
ret <8 x i8> %res
}
; Shuffle the result from the compare with a <4 x i8>.
; We need to extend the loaded <4 x i8> to <4 x i16>. Otherwise we wouldn't be able
; to perform the vuzp and get the vbsl mask.
define <8 x i8> @vuzp_trunc_and_shuffle(<8 x i8> %tr0, <8 x i8> %tr1,
<4 x i32> %cmp0, <4 x i32> %cmp1, <4 x i8> *%cmp2_ptr) {
; CHECK-LABEL: vuzp_trunc_and_shuffle
; CHECK: vmovl
; CHECK: vuzp
; CHECK: vbsl
%cmp2_load = load <4 x i8>, <4 x i8> * %cmp2_ptr, align 4
%cmp2 = trunc <4 x i8> %cmp2_load to <4 x i1>
%c0 = icmp ult <4 x i32> %cmp0, %cmp1
%c = shufflevector <4 x i1> %c0, <4 x i1> %cmp2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%rv = select <8 x i1> %c, <8 x i8> %tr0, <8 x i8> %tr1
ret <8 x i8> %rv
}
; Use an undef value for the <4 x i8> that is being shuffled with the compare result.
; This produces a build_vector with some of the operands undefs.
define <8 x i8> @vuzp_trunc_and_shuffle_undef_right(<8 x i8> %tr0, <8 x i8> %tr1,
<4 x i32> %cmp0, <4 x i32> %cmp1, <4 x i8> *%cmp2_ptr) {
; CHECK-LABEL: vuzp_trunc_and_shuffle_undef_right
; CHECK: vuzp
; CHECK: vbsl
%cmp2_load = load <4 x i8>, <4 x i8> * %cmp2_ptr, align 4
%cmp2 = trunc <4 x i8> %cmp2_load to <4 x i1>
%c0 = icmp ult <4 x i32> %cmp0, %cmp1
%c = shufflevector <4 x i1> %c0, <4 x i1> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%rv = select <8 x i1> %c, <8 x i8> %tr0, <8 x i8> %tr1
ret <8 x i8> %rv
}
define <8 x i8> @vuzp_trunc_and_shuffle_undef_left(<8 x i8> %tr0, <8 x i8> %tr1,
<4 x i32> %cmp0, <4 x i32> %cmp1, <4 x i8> *%cmp2_ptr) {
; CHECK-LABEL: vuzp_trunc_and_shuffle_undef_left
; CHECK: vuzp
; CHECK: vbsl
%cmp2_load = load <4 x i8>, <4 x i8> * %cmp2_ptr, align 4
%cmp2 = trunc <4 x i8> %cmp2_load to <4 x i1>
%c0 = icmp ult <4 x i32> %cmp0, %cmp1
%c = shufflevector <4 x i1> undef, <4 x i1> %c0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%rv = select <8 x i1> %c, <8 x i8> %tr0, <8 x i8> %tr1
ret <8 x i8> %rv
}
; We're using non-power-of-two vector types here, so the operands have to be padded
; with undef values until we reach a vector size that the target can represent.
define <10 x i8> @vuzp_wide_type(<10 x i8> %tr0, <10 x i8> %tr1,
<5 x i32> %cmp0, <5 x i32> %cmp1, <5 x i8> *%cmp2_ptr) {
; CHECK-LABEL: vuzp_wide_type
; CHECK: vbsl
%cmp2_load = load <5 x i8>, <5 x i8> * %cmp2_ptr, align 4
%cmp2 = trunc <5 x i8> %cmp2_load to <5 x i1>
%c0 = icmp ult <5 x i32> %cmp0, %cmp1
%c = shufflevector <5 x i1> %c0, <5 x i1> %cmp2, <10 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
%rv = select <10 x i1> %c, <10 x i8> %tr0, <10 x i8> %tr1
ret <10 x i8> %rv
}