[X86][AVX] lowerV4X128Shuffle - attempt to widen to 2x256 to simplify shuffles

If we are lowering to X86ISD::SHUF128 we are going to lose track of individual 128-bit lanes that are UNDEF, so if we can widen these to guarantee that they are sequential with their neighbour we should. This helps with later shuffle combines.
2024-10-19 19:12:56 +02:00 · 2020-03-30 12:22:06 +01:00 · 2020-03-30 12:22:06 +01:00 · 2b1f75c079
commit 2b1f75c079
parent fcea358269
2 changed files with 31 additions and 24 deletions
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@ -16861,13 +16861,14 @@ static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
  assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");

  // TODO - use Zeroable like we do for lowerV2X128VectorShuffle?
-  SmallVector<int, 4> WidenedMask;
-  if (!canWidenShuffleElements(Mask, WidenedMask))
+  SmallVector<int, 4> Widened128Mask;
+  if (!canWidenShuffleElements(Mask, Widened128Mask))
    return SDValue();
+  assert(Widened128Mask.size() == 4 && "Shuffle widening mismatch");

  // Try to use an insert into a zero vector.
-  if (WidenedMask[0] == 0 && (Zeroable & 0xf0) == 0xf0 &&
-      (WidenedMask[1] == 1 || (Zeroable & 0x0c) == 0x0c)) {
+  if (Widened128Mask[0] == 0 && (Zeroable & 0xf0) == 0xf0 &&
+      (Widened128Mask[1] == 1 || (Zeroable & 0x0c) == 0x0c)) {
    unsigned NumElts = ((Zeroable & 0x0c) == 0x0c) ? 2 : 4;
    MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
    SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
@ -16879,37 +16880,34 @@ static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,

  // Check for patterns which can be matched with a single insert of a 256-bit
  // subvector.
-  bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask,
-                                        {0, 1, 2, 3, 0, 1, 2, 3});
-  if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask,
-                                        {0, 1, 2, 3, 8, 9, 10, 11})) {
+  bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 2, 3, 0, 1, 2, 3});
+  if (OnlyUsesV1 ||
+      isShuffleEquivalent(V1, V2, Mask, {0, 1, 2, 3, 8, 9, 10, 11})) {
    MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
-    SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
-                                 OnlyUsesV1 ? V1 : V2,
-                              DAG.getIntPtrConstant(0, DL));
+    SDValue SubVec =
+        DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2,
+                    DAG.getIntPtrConstant(0, DL));
    return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
                       DAG.getIntPtrConstant(4, DL));
  }

-  assert(WidenedMask.size() == 4);
-
  // See if this is an insertion of the lower 128-bits of V2 into V1.
  bool IsInsert = true;
  int V2Index = -1;
  for (int i = 0; i < 4; ++i) {
-    assert(WidenedMask[i] >= -1);
-    if (WidenedMask[i] < 0)
+    assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
+    if (Widened128Mask[i] < 0)
      continue;

    // Make sure all V1 subvectors are in place.
-    if (WidenedMask[i] < 4) {
-      if (WidenedMask[i] != i) {
+    if (Widened128Mask[i] < 4) {
+      if (Widened128Mask[i] != i) {
        IsInsert = false;
        break;
      }
    } else {
      // Make sure we only have a single V2 index and its the lowest 128-bits.
-      if (V2Index >= 0 || WidenedMask[i] != 4) {
+      if (V2Index >= 0 || Widened128Mask[i] != 4) {
        IsInsert = false;
        break;
      }
@ -16923,16 +16921,26 @@ static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
    return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
  }

+  // See if we can widen to a 256-bit lane shuffle, we're going to lose 128-lane
+  // UNDEF info by lowering to X86ISD::SHUF128 anyway, so by widening where
+  // possible we at least ensure the lanes stay sequential to help later
+  // combines.
+  SmallVector<int, 2> Widened256Mask;
+  if (canWidenShuffleElements(Widened128Mask, Widened256Mask)) {
+    Widened128Mask.clear();
+    llvm::scaleShuffleMask<int>(2, Widened256Mask, Widened128Mask);
+  }
+
  // Try to lower to vshuf64x2/vshuf32x4.
  SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
  unsigned PermMask = 0;
  // Insure elements came from the same Op.
  for (int i = 0; i < 4; ++i) {
-    assert(WidenedMask[i] >= -1);
-    if (WidenedMask[i] < 0)
+    assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
+    if (Widened128Mask[i] < 0)
      continue;

-    SDValue Op = WidenedMask[i] >= 4 ? V2 : V1;
+    SDValue Op = Widened128Mask[i] >= 4 ? V2 : V1;
    unsigned OpIndex = i / 2;
    if (Ops[OpIndex].isUndef())
      Ops[OpIndex] = Op;
@ -16941,7 +16949,7 @@ static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,

    // Convert the 128-bit shuffle mask selection values into 128-bit selection
    // bits defined by a vshuf64x2 instruction's immediate control byte.
-    PermMask |= (WidenedMask[i] % 4) << (i * 2);
+    PermMask |= (Widened128Mask[i] % 4) << (i * 2);
  }

  return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
--- a/test/CodeGen/X86/vector-shuffle-512-v8.ll
+++ b/test/CodeGen/X86/vector-shuffle-512-v8.ll
@ -2147,8 +2147,7 @@ define <8 x i64> @shuffle_v8i64_4567uuuu(<8 x i64> %a0, <8 x i64> %a1) {
 define <8 x i64> @shuffle_v8i64_uu67zzzz(<8 x i64> %a0, <8 x i64> %a1) {
 ; ALL-LABEL: shuffle_v8i64_uu67zzzz:
 ; ALL:       # %bb.0:
-; ALL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; ALL-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,6,7],zmm1[4,5,6,7]
+; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
 ; ALL-NEXT:    ret{{[l|q]}}
  %1 = shufflevector <8 x i64> %a0, <8 x i64> zeroinitializer, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 8, i32 8, i32 8, i32 8>
  ret <8 x i64> %1