1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-11-23 11:13:28 +01:00

[X86][SSE] Pull out variable shuffle mask combine logic. NFCI.

Hopefully this will make it easier to vary the combine depth threshold per-target.

llvm-svn: 314337
This commit is contained in:
Simon Pilgrim 2017-09-27 20:19:53 +00:00
parent 10acb5d949
commit 0686462fbf

View File

@ -27779,12 +27779,16 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
if (Depth < 2)
return SDValue();
// Depth threshold above which we can efficiently use variable mask shuffles.
// TODO This should probably be target specific.
bool AllowVariableMask = (Depth >= 3) || HasVariableMask;
bool MaskContainsZeros =
any_of(Mask, [](int M) { return M == SM_SentinelZero; });
if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
// If we have a single input lane-crossing shuffle then lower to VPERMV.
if (UnaryShuffle && (Depth >= 3 || HasVariableMask) && !MaskContainsZeros &&
if (UnaryShuffle && AllowVariableMask && !MaskContainsZeros &&
((Subtarget.hasAVX2() &&
(MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
(Subtarget.hasAVX512() &&
@ -27805,7 +27809,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
// Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
// vector as the second source.
if (UnaryShuffle && (Depth >= 3 || HasVariableMask) &&
if (UnaryShuffle && AllowVariableMask &&
((Subtarget.hasAVX512() &&
(MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
@ -27833,7 +27837,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
}
// If we have a dual input lane-crossing shuffle then lower to VPERMV3.
if ((Depth >= 3 || HasVariableMask) && !MaskContainsZeros &&
if (AllowVariableMask && !MaskContainsZeros &&
((Subtarget.hasAVX512() &&
(MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
@ -27859,7 +27863,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
// See if we can combine a single input shuffle with zeros to a bit-mask,
// which is much simpler than any shuffle.
if (UnaryShuffle && MaskContainsZeros && (Depth >= 3 || HasVariableMask) &&
if (UnaryShuffle && MaskContainsZeros && AllowVariableMask &&
isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) {
APInt Zero = APInt::getNullValue(MaskEltSizeInBits);
@ -27890,7 +27894,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
// If we have a single input shuffle with different shuffle patterns in the
// the 128-bit lanes use the variable mask to VPERMILPS.
// TODO Combine other mask types at higher depths.
if (UnaryShuffle && HasVariableMask && !MaskContainsZeros &&
if (UnaryShuffle && AllowVariableMask && !MaskContainsZeros &&
((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
(MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
SmallVector<SDValue, 16> VPermIdx;
@ -27910,7 +27914,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
// With XOP, binary shuffles of 128/256-bit floating point vectors can combine
// to VPERMIL2PD/VPERMIL2PS.
if ((Depth >= 3 || HasVariableMask) && Subtarget.hasXOP() &&
if (AllowVariableMask && Subtarget.hasXOP() &&
(MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 ||
MaskVT == MVT::v8f32)) {
// VPERMIL2 Operation.
@ -27952,7 +27956,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
// Intel's manuals suggest only using PSHUFB if doing so replacing 5
// instructions, but in practice PSHUFB tends to be *very* fast so we're
// more aggressive.
if (UnaryShuffle && (Depth >= 3 || HasVariableMask) &&
if (UnaryShuffle && AllowVariableMask &&
((RootVT.is128BitVector() && Subtarget.hasSSSE3()) ||
(RootVT.is256BitVector() && Subtarget.hasAVX2()) ||
(RootVT.is512BitVector() && Subtarget.hasBWI()))) {
@ -27970,7 +27974,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
continue;
}
M = Ratio * M + i % Ratio;
assert ((M / 16) == (i / 16) && "Lane crossing detected");
assert((M / 16) == (i / 16) && "Lane crossing detected");
PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
}
MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
@ -27986,8 +27990,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
// With XOP, if we have a 128-bit binary input shuffle we can always combine
// to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
// slower than PSHUFB on targets that support both.
if ((Depth >= 3 || HasVariableMask) && RootVT.is128BitVector() &&
Subtarget.hasXOP()) {
if (AllowVariableMask && RootVT.is128BitVector() && Subtarget.hasXOP()) {
// VPPERM Mask Operation
// Bits[4:0] - Byte Index (0 - 31)
// Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)