X86: improve (V)PMADDWD detection (2)

Implement "full" pattern.
2024-11-21 18:22:53 +01:00 · 2021-11-16 13:50:49 +03:00 · 2021-11-16 13:50:49 +03:00 · 1cc7bdd501
commit 1cc7bdd501
parent 610c27aa1c
1 changed files with 122 additions and 0 deletions
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@ -43550,6 +43550,18 @@ static SDValue combineMulToPMADDWD(SDNode *N, SelectionDAG &DAG,
    }
  }
  if (N0.getOpcode() == ISD::SIGN_EXTEND && N0.hasOneUse() &&
      N1.getOpcode() == ISD::SIGN_EXTEND && N1.hasOneUse() &&
      N0.getOperand(0).getScalarValueSizeInBits() == 16 &&
      N1.getOperand(0).getScalarValueSizeInBits() == 16) {
    // If both arguments are sign-extended, try to replace sign extends
    // with zero extends, which should qualify for the optimization.
    // Otherwise just fallback to zero-extension check.
    Mask17 = 0;
    N0 = DAG.getNode(ISD::ZERO_EXTEND, N0.getNode(), VT, N0.getOperand(0));
    N1 = DAG.getNode(ISD::ZERO_EXTEND, N1.getNode(), VT, N1.getOperand(0));
  }
  if (!!Mask17 && N0.getOpcode() == ISD::SRA) {
    if (isa<ConstantSDNode>(N0.getOperand(1).getOperand(0)) &&
        DAG.ComputeNumSignBits(N1) >= 17 &&
@ -50186,6 +50198,114 @@ static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1,
                          PMADDBuilder);
 }
 // Attempt to fold the sum of two i32 vector multiplies into one (V)PMADDWD
 // when every multiplicand is an arithmetic shift right by a splat 16:
 //   (add (mul (sra A, 16), (sra B, 16)), (mul (sra C, 16), (sra D, 16)))
 //
 // N0 and N1 are the two operands of the add; VT is the type of the add.
 // Two shapes are handled:
 //  * One multiply uses the sign-extended low halves and the other one the
 //    high halves of the same two values, i.e. A = (shl X, 16),
 //    B = (shl Y, 16), C = X and D = Y (the multiplies may appear in either
 //    order). The sum is then PMADDWD(X, Y).
 //  * At least one source of each multiply is not a SHL. The high halves of
 //    A and C are packed into the low/high words of one vector, those of B
 //    and D into another, and the two vectors are fed to PMADDWD.
 // Returns an empty SDValue when nothing matches.
 static SDValue matchPMADDWD_3(SelectionDAG &DAG, SDValue N0, SDValue N1,
                               const SDLoc &DL, EVT VT,
                               const X86Subtarget &Subtarget) {
  if (!Subtarget.hasSSE2() || Subtarget.isPMADDWDSlow())
    return SDValue();
  if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
    return SDValue();
  if (!VT.isVector() || VT.getVectorElementType() != MVT::i32)
    return SDValue();
  // Make sure the type is legal or will be widened to a legal type.
  if (VT != MVT::v2i32 && !DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return SDValue();
  MVT WVT = MVT::getVectorVT(MVT::i16, 2 * VT.getVectorNumElements());
  // Without BWI, we would need to split v32i16.
  if (WVT == MVT::v32i16 && !Subtarget.hasBWI())
    return SDValue();
  SDValue N00 = N0.getOperand(0);
  SDValue N01 = N0.getOperand(1);
  SDValue N10 = N1.getOperand(0);
  SDValue N11 = N1.getOperand(1);

  // True if the shift node V shifts every lane by the constant 16. Going
  // through isConstOrConstSplat avoids poking at operand 0 of a shift amount
  // that may not be a BUILD_VECTOR at all.
  auto IsShiftBySplat16 = [](SDValue V) {
    ConstantSDNode *Amt = isConstOrConstSplat(V.getOperand(1));
    return Amt && Amt->getAPIntValue() == 16;
  };

  // Only (sra X, 16) multiplicands are handled. Everything else has to be
  // rejected right here: N0 and N1 still are the MUL nodes at this point, and
  // PMADDWD of the two products is not equal to their sum.
  if (N00.getOpcode() != ISD::SRA || N01.getOpcode() != ISD::SRA ||
      N10.getOpcode() != ISD::SRA || N11.getOpcode() != ISD::SRA)
    return SDValue();
  if (!IsShiftBySplat16(N00) || !IsShiftBySplat16(N01) ||
      !IsShiftBySplat16(N10) || !IsShiftBySplat16(N11))
    return SDValue();

  SDValue S00 = N00.getOperand(0);
  SDValue S01 = N01.getOperand(0);
  SDValue S10 = N10.getOperand(0);
  SDValue S11 = N11.getOperand(0);
  // If the second multiply is the one fed by SHLs, exchange the multiplies so
  // that the code below only has to look at the first one.
  if (S10.getOpcode() == ISD::SHL && S11.getOpcode() == ISD::SHL) {
    std::swap(S00, S10);
    std::swap(S01, S11);
    std::swap(N00, N10);
    std::swap(N01, N11);
  }

  if (S00.getOpcode() == ISD::SHL && S01.getOpcode() == ISD::SHL) {
    // (sra (shl X, 16), 16) is the sign-extended low half of X only when the
    // SHL shifts by exactly 16, and both multiplies must use the same X / Y
    // for PMADDWD(X, Y) to be the sum. Other SHL-based forms are not handled.
    if (!IsShiftBySplat16(S00) || !IsShiftBySplat16(S01) ||
        S00.getOperand(0) != S10 || S01.getOperand(0) != S11)
      return SDValue();
    // Multiplication components are of the same sources.
    N0 = S10;
    N1 = S11;
  } else {
    // Move the high word of the first multiply's sources into the low word,
    // keep the high word of the second multiply's sources in place, and
    // merge them so that each i32 lane holds both i16 PMADDWD inputs.
    SDValue Lo0 = DAG.getNode(ISD::SRL, DL, VT, S00, N00.getOperand(1));
    SDValue Lo1 = DAG.getNode(ISD::SRL, DL, VT, S01, N01.getOperand(1));
    SDValue Hi0 = DAG.getNode(ISD::AND, DL, VT, S10,
                              DAG.getConstant(0xffff0000u, DL, VT));
    SDValue Hi1 = DAG.getNode(ISD::AND, DL, VT, S11,
                              DAG.getConstant(0xffff0000u, DL, VT));
    N0 = DAG.getNode(ISD::OR, DL, VT, Lo0, Hi0);
    N1 = DAG.getNode(ISD::OR, DL, VT, Lo1, Hi1);
  }

  // Use SplitOpsAndApply to handle AVX splitting.
  auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
                           ArrayRef<SDValue> Ops) {
    MVT OpVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
    return DAG.getNode(X86ISD::VPMADDWD, DL, OpVT, Ops);
  };
  return SplitOpsAndApply(DAG, Subtarget, DL, VT,
                          { DAG.getBitcast(WVT, N0), DAG.getBitcast(WVT, N1) },
                          PMADDWDBuilder);
 }
 /// CMOV of constants requires materializing constant operands in registers.
 /// Try to fold those constants into an 'add' instruction to reduce instruction
 /// count. We do this with CMOV rather the generic 'select' because there are
@ -50240,6 +50360,8 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
    return MAdd;
  if (SDValue MAdd = matchPMADDWD_2(DAG, Op0, Op1, SDLoc(N), VT, Subtarget))
    return MAdd;
  if (SDValue MAdd = matchPMADDWD_3(DAG, Op0, Op1, SDLoc(N), VT, Subtarget))
    return MAdd;
  // Try to synthesize horizontal adds from adds of shuffles.
  if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))