[DAGCombiner] Add vector demanded elements support to ComputeNumSignBits

Currently ComputeNumSignBits returns the minimum number of sign bits for all elements of vector data, when we may only be interested in one/some of the elements. This patch adds a DemandedElts argument that allows us to specify the elements we actually care about. The original ComputeNumSignBits implementation calls with a DemandedElts demanding all elements to match current behaviour. Scalar types set this to 1. I've only added support for BUILD_VECTOR and EXTRACT_VECTOR_ELT so far, all others will default to demanding all elements but can be updated in due course. Followup to D25691. Differential Revision: https://reviews.llvm.org/D31311 llvm-svn: 299219
2024-11-23 11:13:28 +01:00 · 2017-03-31 13:54:09 +00:00 · 2017-03-31 13:54:09 +00:00 · 026e8c9b44
commit 026e8c9b44
parent 1397ecba80
9 changed files with 65 additions and 24 deletions
--- a/include/llvm/CodeGen/SelectionDAG.h
+++ b/include/llvm/CodeGen/SelectionDAG.h
@ -1317,6 +1317,17 @@ public:
  /// target nodes to be understood.
  unsigned ComputeNumSignBits(SDValue Op, unsigned Depth = 0) const;

+  /// Return the number of times the sign bit of the register is replicated into
+  /// the other bits. We know that at least 1 bit is always equal to the sign
+  /// bit (itself), but other cases can give us information. For example,
+  /// immediately after an "SRA X, 2", we know that the top 3 bits are all equal
+  /// to each other, so we return 3. The DemandedElts argument allows
+  /// us to only collect the minimum sign bits of the requested vector elements.
+  /// Targets can implement the ComputeNumSignBitsForTarget method in the
+  /// TargetLowering class to allow target nodes to be understood.
+  unsigned ComputeNumSignBits(SDValue Op, const APInt &DemandedElts,
+                              unsigned Depth = 0) const;
+
  /// Return true if the specified operand is an ISD::ADD with a ConstantSDNode
  /// on the right-hand side, or if it is an ISD::OR with a ConstantSDNode that
  /// is guaranteed to have the same semantics as an ADD. This handles the
--- a/include/llvm/Target/TargetLowering.h
+++ b/include/llvm/Target/TargetLowering.h
@ -2432,8 +2432,11 @@ public:
                                             unsigned Depth = 0) const;

  /// This method can be implemented by targets that want to expose additional
-  /// information about sign bits to the DAG Combiner.
+  /// information about sign bits to the DAG Combiner. The DemandedElts
+  /// argument allows us to only collect the minimum sign bits that are shared
+  /// by the requested vector elements.
  virtual unsigned ComputeNumSignBitsForTargetNode(SDValue Op,
+                                                   const APInt &DemandedElts,
                                                   const SelectionDAG &DAG,
                                                   unsigned Depth = 0) const;

--- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@ -2899,6 +2899,15 @@ bool SelectionDAG::isKnownToBeAPowerOfTwo(SDValue Val) const {

 unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const {
  EVT VT = Op.getValueType();
+  APInt DemandedElts = VT.isVector()
+                           ? APInt::getAllOnesValue(VT.getVectorNumElements())
+                           : APInt(1, 1);
+  return ComputeNumSignBits(Op, DemandedElts, Depth);
+}
+
+unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts,
+                                          unsigned Depth) const {
+  EVT VT = Op.getValueType();
  assert(VT.isInteger() && "Invalid VT!");
  unsigned VTBits = VT.getScalarSizeInBits();
  unsigned Tmp, Tmp2;
@ -2907,6 +2916,9 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const {
  if (Depth == 6)
    return 1;  // Limit search depth.

+  if (!DemandedElts)
+    return 1;  // No demanded elts, better to assume we don't know anything.
+
  switch (Op.getOpcode()) {
  default: break;
  case ISD::AssertSext:
@ -2924,6 +2936,9 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const {
  case ISD::BUILD_VECTOR:
    Tmp = VTBits;
    for (unsigned i = 0, e = Op.getNumOperands(); (i < e) && (Tmp > 1); ++i) {
+      if (!DemandedElts[i])
+        continue;
+
      SDValue SrcOp = Op.getOperand(i);
      Tmp2 = ComputeNumSignBits(Op.getOperand(i), Depth + 1);

@ -3116,18 +3131,28 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const {
    return std::max(std::min(KnownSign - rIndex * BitWidth, BitWidth), 0);
  }
  case ISD::EXTRACT_VECTOR_ELT: {
-    // At the moment we keep this simple and skip tracking the specific
-    // element. This way we get the lowest common denominator for all elements
-    // of the vector.
-    // TODO: get information for given vector element
+    SDValue InVec = Op.getOperand(0);
+    SDValue EltNo = Op.getOperand(1);
+    EVT VecVT = InVec.getValueType();
    const unsigned BitWidth = Op.getValueSizeInBits();
    const unsigned EltBitWidth = Op.getOperand(0).getScalarValueSizeInBits();
+    const unsigned NumSrcElts = VecVT.getVectorNumElements();
+
    // If BitWidth > EltBitWidth the value is anyext:ed, and we do not know
    // anything about sign bits. But if the sizes match we can derive knowledge
    // about sign bits from the vector operand.
-    if (BitWidth == EltBitWidth)
-      return ComputeNumSignBits(Op.getOperand(0), Depth+1);
-    break;
+    if (BitWidth != EltBitWidth)
+      break;
+
+    // If we know the element index, just demand that vector element, else for
+    // an unknown element index, ignore DemandedElts and demand them all.
+    APInt DemandedSrcElts = APInt::getAllOnesValue(NumSrcElts);
+    ConstantSDNode *ConstEltNo = dyn_cast<ConstantSDNode>(EltNo);
+    if (ConstEltNo && ConstEltNo->getAPIntValue().ult(NumSrcElts))
+      DemandedSrcElts =
+          APInt::getOneBitSet(NumSrcElts, ConstEltNo->getZExtValue());
+
+    return ComputeNumSignBits(InVec, DemandedSrcElts, Depth + 1);
  }
  case ISD::EXTRACT_SUBVECTOR:
    return ComputeNumSignBits(Op.getOperand(0), Depth + 1);
@ -3162,7 +3187,8 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const {
      Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN ||
      Op.getOpcode() == ISD::INTRINSIC_W_CHAIN ||
      Op.getOpcode() == ISD::INTRINSIC_VOID) {
-    unsigned NumBits = TLI->ComputeNumSignBitsForTargetNode(Op, *this, Depth);
+    unsigned NumBits =
+        TLI->ComputeNumSignBitsForTargetNode(Op, DemandedElts, *this, Depth);
    if (NumBits > 1)
      FirstAnswer = std::max(FirstAnswer, NumBits);
  }
@ -3170,7 +3196,7 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const {
  // Finally, if we can prove that the top bits of the result are 0's or 1's,
  // use this information.
  APInt KnownZero, KnownOne;
-  computeKnownBits(Op, KnownZero, KnownOne, Depth);
+  computeKnownBits(Op, KnownZero, KnownOne, DemandedElts, Depth);

  APInt Mask;
  if (KnownZero.isNegative()) {        // sign bit is 0
--- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@ -1338,6 +1338,7 @@ void TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
 /// This method can be implemented by targets that want to expose additional
 /// information about sign bits to the DAG Combiner.
 unsigned TargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op,
+                                                         const APInt &,
                                                         const SelectionDAG &,
                                                         unsigned Depth) const {
  assert((Op.getOpcode() >= ISD::BUILTIN_OP_END ||
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@ -3596,7 +3596,8 @@ void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
 }

 unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
-    SDValue Op, const SelectionDAG &DAG, unsigned Depth) const {
+    SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
+    unsigned Depth) const {
  switch (Op.getOpcode()) {
  case AMDGPUISD::BFE_I32: {
    ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h
@ -206,7 +206,8 @@ public:
                                     const SelectionDAG &DAG,
                                     unsigned Depth = 0) const override;

-  unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const SelectionDAG &DAG,
+  unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const APInt &DemandedElts,
+                                           const SelectionDAG &DAG,
                                           unsigned Depth = 0) const override;

  /// \brief Helper function that adds Reg to the LiveIn list of the DAG's
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@ -26745,7 +26745,8 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
 }

 unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
-    SDValue Op, const SelectionDAG &DAG, unsigned Depth) const {
+    SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
+    unsigned Depth) const {
  unsigned VTBits = Op.getScalarValueSizeInBits();
  unsigned Opcode = Op.getOpcode();
  switch (Opcode) {
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@ -832,6 +832,7 @@ namespace llvm {

    /// Determine the number of bits in the operation that are sign bits.
    unsigned ComputeNumSignBitsForTargetNode(SDValue Op,
+                                             const APInt &DemandedElts,
                                             const SelectionDAG &DAG,
                                             unsigned Depth) const override;

--- a/test/CodeGen/X86/known-bits-vector.ll
+++ b/test/CodeGen/X86/known-bits-vector.ll
@ -23,18 +23,14 @@ define i32 @knownbits_mask_extract_sext(<8 x i16> %a0) nounwind {
 define float @knownbits_mask_extract_uitofp(<2 x i64> %a0) nounwind {
 ; X32-LABEL: knownbits_mask_extract_uitofp:
 ; X32:       # BB#0:
-; X32-NEXT:    pushl %ebp
-; X32-NEXT:    movl %esp, %ebp
-; X32-NEXT:    andl $-8, %esp
-; X32-NEXT:    subl $16, %esp
+; X32-NEXT:    pushl %eax
 ; X32-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; X32-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4,5,6,7]
-; X32-NEXT:    vmovq %xmm0, {{[0-9]+}}(%esp)
-; X32-NEXT:    fildll {{[0-9]+}}(%esp)
-; X32-NEXT:    fstps {{[0-9]+}}(%esp)
-; X32-NEXT:    flds {{[0-9]+}}(%esp)
-; X32-NEXT:    movl %ebp, %esp
-; X32-NEXT:    popl %ebp
+; X32-NEXT:    vmovd %xmm0, %eax
+; X32-NEXT:    vcvtsi2ssl %eax, %xmm2, %xmm0
+; X32-NEXT:    vmovss %xmm0, (%esp)
+; X32-NEXT:    flds (%esp)
+; X32-NEXT:    popl %eax
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: knownbits_mask_extract_uitofp:
@ -42,7 +38,7 @@ define float @knownbits_mask_extract_uitofp(<2 x i64> %a0) nounwind {
 ; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; X64-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4,5,6,7]
 ; X64-NEXT:    vmovq %xmm0, %rax
-; X64-NEXT:    vcvtsi2ssq %rax, %xmm2, %xmm0
+; X64-NEXT:    vcvtsi2ssl %eax, %xmm2, %xmm0
 ; X64-NEXT:    retq
  %1 = and <2 x i64> %a0, <i64 65535, i64 -1>
  %2 = extractelement <2 x i64> %1, i32 0