[X86][AVX] combineX86ShufflesRecursively - attempt to constant fold before widening shuffle inputs
combineX86ShufflesConstants/canonicalizeShuffleMaskWithHorizOp can both handle (or early-out on) shuffles whose inputs have different widths, so delay widening as late as possible to make it easier to match constant folds etc.

The plan is to eventually move the widening inside combineX86ShuffleChain so that we don't create any new nodes unless we successfully combine the shuffles.
commit 484ce58d87
parent 4ca1cdd110
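For orientation, here is a condensed sketch of how the tail of combineX86ShufflesRecursively reads after this patch, assembled from the two hunks below. It is not a standalone, compilable excerpt: surrounding declarations are omitted, and the body of the widening step (which sits between the two hunks and is unchanged by the patch) is summarised by a comment.

// Condensed post-patch ordering (sketch, not standalone code):

// 1) Run the folds that tolerate (or early-out on) mixed-width inputs first.

// Attempt to constant fold all of the constant source ops.
if (SDValue Cst = combineX86ShufflesConstants(
        Ops, Mask, Root, HasVariableMask, DAG, Subtarget))
  return Cst;

// Canonicalize the combined shuffle mask chain with horizontal ops.
// NOTE: This will update the Ops and Mask.
if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
        Ops, Mask, RootSizeInBits, SDLoc(Root), DAG, Subtarget))
  return DAG.getBitcast(Root.getValueType(), HOp);

// 2) Only then widen any narrower shuffle inputs up to the root width, so no
//    widening nodes are created for cases the folds above already handle.
if (any_of(Ops, [RootSizeInBits](SDValue Op) {
      return Op.getValueSizeInBits() < RootSizeInBits;
    })) {
  // ... widen each Op to RootSizeInBits (body elided between the hunks) ...
  resolveTargetShuffleInputsAndMask(Ops, Mask);
}

// We can only combine unary and binary shuffle mask cases.
if (Ops.size() <= 2) {
  // ...
}

In the PR46178 checks further down, the visible effect of this reordering is that the X86 lowering no longer needs the constant-pool shuffle-index load, the zero vector and the variable vpermi2pd, and uses an immediate-controlled vshufpd instead.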
@@ -36610,6 +36610,17 @@ static SDValue combineX86ShufflesRecursively(
     }
   }
 
+  // Attempt to constant fold all of the constant source ops.
+  if (SDValue Cst = combineX86ShufflesConstants(
+          Ops, Mask, Root, HasVariableMask, DAG, Subtarget))
+    return Cst;
+
+  // Canonicalize the combined shuffle mask chain with horizontal ops.
+  // NOTE: This will update the Ops and Mask.
+  if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
+          Ops, Mask, RootSizeInBits, SDLoc(Root), DAG, Subtarget))
+    return DAG.getBitcast(Root.getValueType(), HOp);
+
   // Widen any subvector shuffle inputs we've collected.
   if (any_of(Ops, [RootSizeInBits](SDValue Op) {
         return Op.getValueSizeInBits() < RootSizeInBits;
@@ -36622,17 +36633,6 @@ static SDValue combineX86ShufflesRecursively(
     resolveTargetShuffleInputsAndMask(Ops, Mask);
   }
 
-  // Attempt to constant fold all of the constant source ops.
-  if (SDValue Cst = combineX86ShufflesConstants(
-          Ops, Mask, Root, HasVariableMask, DAG, Subtarget))
-    return Cst;
-
-  // Canonicalize the combined shuffle mask chain with horizontal ops.
-  // NOTE: This will update the Ops and Mask.
-  if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
-          Ops, Mask, RootSizeInBits, SDLoc(Root), DAG, Subtarget))
-    return DAG.getBitcast(Root.getValueType(), HOp);
-
   // We can only combine unary and binary shuffle mask cases.
   if (Ops.size() <= 2) {
     // Minor canonicalization of the accumulated shuffle mask to make it easier
@@ -108,13 +108,12 @@ define void @PR46178(i16* %0) {
 ; X86-NEXT:    vmovdqu (%eax), %ymm1
 ; X86-NEXT:    vpmovqw %ymm0, %xmm0
 ; X86-NEXT:    vpmovqw %ymm1, %xmm1
-; X86-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; X86-NEXT:    vpsllw $8, %ymm0, %ymm0
-; X86-NEXT:    vpsraw $8, %ymm0, %ymm0
-; X86-NEXT:    vmovapd {{.*#+}} ymm1 = [0,0,2,0,4,0,4,0]
-; X86-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
-; X86-NEXT:    vpermi2pd %ymm2, %ymm0, %ymm1
-; X86-NEXT:    vmovupd %ymm1, (%eax)
+; X86-NEXT:    vpsllw $8, %xmm1, %xmm1
+; X86-NEXT:    vpsraw $8, %xmm1, %xmm1
+; X86-NEXT:    vpsllw $8, %xmm0, %xmm0
+; X86-NEXT:    vpsraw $8, %xmm0, %xmm0
+; X86-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[3]
+; X86-NEXT:    vmovupd %ymm0, (%eax)
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
 ;