[X86][SSE] Pull out variable shuffle mask combine logic. NFCI.

Hopefully this will make it easier to vary the combine depth threshold per-target. llvm-svn: 314337
2024-11-23 11:13:28 +01:00 · 2017-09-27 20:19:53 +00:00 · 2017-09-27 20:19:53 +00:00 · 0686462fbf
commit 0686462fbf
parent 10acb5d949
1 changed files with 13 additions and 10 deletions
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@ -27779,12 +27779,16 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
  if (Depth < 2)
    return SDValue();

+  // Depth threshold above which we can efficiently use variable mask shuffles.
+  // TODO This should probably be target specific.
+  bool AllowVariableMask = (Depth >= 3) || HasVariableMask;
+
  bool MaskContainsZeros =
      any_of(Mask, [](int M) { return M == SM_SentinelZero; });

  if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
    // If we have a single input lane-crossing shuffle then lower to VPERMV.
-    if (UnaryShuffle && (Depth >= 3 || HasVariableMask) && !MaskContainsZeros &&
+    if (UnaryShuffle && AllowVariableMask && !MaskContainsZeros &&
        ((Subtarget.hasAVX2() &&
          (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
         (Subtarget.hasAVX512() &&
@ -27805,7 +27809,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,

    // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
    // vector as the second source.
-    if (UnaryShuffle && (Depth >= 3 || HasVariableMask) &&
+    if (UnaryShuffle && AllowVariableMask &&
        ((Subtarget.hasAVX512() &&
          (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
           MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
@ -27833,7 +27837,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
    }

    // If we have a dual input lane-crossing shuffle then lower to VPERMV3.
-    if ((Depth >= 3 || HasVariableMask) && !MaskContainsZeros &&
+    if (AllowVariableMask && !MaskContainsZeros &&
        ((Subtarget.hasAVX512() &&
          (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
           MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
@ -27859,7 +27863,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,

  // See if we can combine a single input shuffle with zeros to a bit-mask,
  // which is much simpler than any shuffle.
-  if (UnaryShuffle && MaskContainsZeros && (Depth >= 3 || HasVariableMask) &&
+  if (UnaryShuffle && MaskContainsZeros && AllowVariableMask &&
      isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
      DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) {
    APInt Zero = APInt::getNullValue(MaskEltSizeInBits);
@ -27890,7 +27894,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
  // If we have a single input shuffle with different shuffle patterns in the
  // the 128-bit lanes use the variable mask to VPERMILPS.
  // TODO Combine other mask types at higher depths.
-  if (UnaryShuffle && HasVariableMask && !MaskContainsZeros &&
+  if (UnaryShuffle && AllowVariableMask && !MaskContainsZeros &&
      ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
       (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
    SmallVector<SDValue, 16> VPermIdx;
@ -27910,7 +27914,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,

  // With XOP, binary shuffles of 128/256-bit floating point vectors can combine
  // to VPERMIL2PD/VPERMIL2PS.
-  if ((Depth >= 3 || HasVariableMask) && Subtarget.hasXOP() &&
+  if (AllowVariableMask && Subtarget.hasXOP() &&
      (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 ||
       MaskVT == MVT::v8f32)) {
    // VPERMIL2 Operation.
@ -27952,7 +27956,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
  // Intel's manuals suggest only using PSHUFB if doing so replacing 5
  // instructions, but in practice PSHUFB tends to be *very* fast so we're
  // more aggressive.
-  if (UnaryShuffle && (Depth >= 3 || HasVariableMask) &&
+  if (UnaryShuffle && AllowVariableMask &&
      ((RootVT.is128BitVector() && Subtarget.hasSSSE3()) ||
       (RootVT.is256BitVector() && Subtarget.hasAVX2()) ||
       (RootVT.is512BitVector() && Subtarget.hasBWI()))) {
@ -27970,7 +27974,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
        continue;
      }
      M = Ratio * M + i % Ratio;
-      assert ((M / 16) == (i / 16) && "Lane crossing detected");
+      assert((M / 16) == (i / 16) && "Lane crossing detected");
      PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
    }
    MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
@ -27986,8 +27990,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
  // With XOP, if we have a 128-bit binary input shuffle we can always combine
  // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
  // slower than PSHUFB on targets that support both.
-  if ((Depth >= 3 || HasVariableMask) && RootVT.is128BitVector() &&
-      Subtarget.hasXOP()) {
+  if (AllowVariableMask && RootVT.is128BitVector() && Subtarget.hasXOP()) {
    // VPPERM Mask Operation
    // Bits[4:0] - Byte Index (0 - 31)
    // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)