[x86] Teach a bunch of the x86-specific shuffle combining to work with

256-bit vectors as well as 128-bit vectors. Fixes some of the redundant shuffles for v16i16. llvm-svn: 230752
2024-11-24 03:33:20 +01:00 · 2015-02-27 11:45:13 +00:00 · 2015-02-27 11:45:13 +00:00 · 36698dfd52
commit 36698dfd52
parent a1c6bfd527
2 changed files with 29 additions and 15 deletions
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@ -19435,12 +19435,26 @@ static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root,
 /// This is a very minor wrapper around getTargetShuffleMask to easy forming v4
 /// PSHUF-style masks that can be reused with such instructions.
 static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
+  MVT VT = N.getSimpleValueType();
  SmallVector<int, 4> Mask;
  bool IsUnary;
-  bool HaveMask = getTargetShuffleMask(N.getNode(), N.getSimpleValueType(), Mask, IsUnary);
+  bool HaveMask = getTargetShuffleMask(N.getNode(), VT, Mask, IsUnary);
  (void)HaveMask;
  assert(HaveMask);

+  // If we have more than 128-bits, only the low 128-bits of shuffle mask
+  // matter. Check that the upper masks are repeats and remove them.
+  if (VT.getSizeInBits() > 128) {
+    int LaneElts = 128 / VT.getScalarSizeInBits();
+#ifndef NDEBUG
+    for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
+      for (int j = 0; j < LaneElts; ++j)
+        assert(Mask[j] == Mask[i * LaneElts + j] - LaneElts &&
+               "Mask doesn't repeat in high 128-bit lanes!");
+#endif
+    Mask.resize(LaneElts);
+  }
+
  switch (N.getOpcode()) {
  case X86ISD::PSHUFD:
    return Mask;
@ -19513,7 +19527,8 @@ combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
    case X86ISD::UNPCKH:
      // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
      // shuffle into a preceding word shuffle.
-      if (V.getValueType() != MVT::v16i8 && V.getValueType() != MVT::v8i16)
+      if (V.getSimpleValueType().getScalarType() != MVT::i8 &&
+          V.getSimpleValueType().getScalarType() != MVT::i16)
        return SDValue();

      // Search for a half-shuffle which we can combine with.
@ -19687,8 +19702,7 @@ static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG,
    break;
  case X86ISD::PSHUFLW:
  case X86ISD::PSHUFHW:
-    assert(VT == MVT::v8i16);
-    (void)VT;
+    assert(VT.getScalarType() == MVT::i16 && "Bad word shuffle type!");

    if (combineRedundantHalfShuffle(N, Mask, DAG, DCI))
      return SDValue(); // We combined away this shuffle, so we're done.
@ -19701,12 +19715,13 @@ static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG,
      int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
      DMask[DOffset + 0] = DOffset + 1;
      DMask[DOffset + 1] = DOffset + 0;
-      V = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V);
+      MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
+      V = DAG.getNode(ISD::BITCAST, DL, DVT, V);
      DCI.AddToWorklist(V.getNode());
-      V = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V,
+      V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
                      getV4X86ShuffleImm8ForMask(DMask, DAG));
      DCI.AddToWorklist(V.getNode());
-      return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V);
+      return DAG.getNode(ISD::BITCAST, DL, VT, V);
    }

    // Look for shuffle patterns which can be implemented as a single unpack.
@ -19741,11 +19756,11 @@ static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG,
            std::equal(std::begin(MappedMask), std::end(MappedMask),
                       std::begin(UnpackHiMask))) {
          // We can replace all three shuffles with an unpack.
-          V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, D.getOperand(0));
+          V = DAG.getNode(ISD::BITCAST, DL, VT, D.getOperand(0));
          DCI.AddToWorklist(V.getNode());
          return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
                                                : X86ISD::UNPCKH,
-                             DL, MVT::v8i16, V, V);
+                             DL, VT, V, V);
        }
      }
    }
@ -19893,10 +19908,6 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
    }
  }

-  // Only handle 128 wide vector from here on.
-  if (!VT.is128BitVector())
-    return SDValue();
-
  // Combine a vector_shuffle that is equal to build_vector load1, load2, load3,
  // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
  // consecutive, non-overlapping, and in the right order.
@ -19914,6 +19925,10 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
    if (Shuffle.getNode())
      return Shuffle;

+    // Only handle 128 wide vector from here on.
+    if (!VT.is128BitVector())
+      return SDValue();
+
    // Try recursively combining arbitrary sequences of x86 shuffle
    // instructions into higher-order shuffles. We do this after combining
    // specific PSHUF instruction sequences into their minimal form so that we
--- a/test/CodeGen/X86/vector-shuffle-256-v16.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v16.ll
@ -2380,8 +2380,7 @@ define <16 x i16> @shuffle_v16i16_04_05_06_03_uu_uu_uu_uu_12_13_14_11_uu_uu_uu_u
 ; AVX2:       # BB#0:
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,1,2,0,7,5,6,4]
 ; AVX2-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,3,2,3,4,5,6,7,8,11,10,11,12,13,14,15]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[2,3,0,1,4,5,6,7,10,11,8,9,12,13,14,15]
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
 ; AVX2-NEXT:    retq
  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 5, i32 6, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 12, i32 13, i32 14, i32 11, i32 undef, i32 undef, i32 undef, i32 undef>
  ret <16 x i16> %shuffle