
[DAGCombiner] Use computeKnownBits to match rotate patterns that have had their amount masking modified by simplifyDemandedBits

SimplifyDemandedBits can remove bits from the shift-amount masks that we need to see in order to detect rotates.

This patch uses zeroes from computeKnownBits to fill in some of these mask bits to make the match work.

As currently written, this calls computeKnownBits even when the mask hasn't been simplified, because that keeps the code simpler. If we're worried about compile-time performance, we can improve this.

I know we're talking about adding a rotate intrinsic, but hopefully we can go ahead with this change and just make sure the rotate intrinsic handles it as well.
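As a sanity check of the idea, here is a minimal standalone C++20 sketch of the new acceptance test, using plain integers in place of LLVM's APInt/KnownBits (the helper name is hypothetical, not from the patch):

// Sketch: a mask constant C on a shift amount is still compatible with a
// rotate of width EltSize if C, together with the amount's known-zero bits,
// covers all log2(EltSize) low bits, and C sets no bits above them.
#include <bit>
#include <cassert>
#include <cstdint>

static bool maskCoversRotateAmount(uint64_t C, uint64_t KnownZero,
                                   unsigned EltSize) {
  int Bits = std::countr_zero(EltSize); // log2 of a power-of-2 EltSize
  if (C >> Bits)                        // mask sets bits above the amount
    return false;
  // Analog of (C | Known.Zero).countTrailingOnes() >= Bits in the patch.
  return std::countr_one(C | KnownZero) >= Bits;
}

int main() {
  // The old check only accepted C == EltSize - 1.
  assert(maskCoversRotateAmount(31, 0, 32));
  // New: SimplifyDemandedBits shrank the mask to 30 because bit 0 of the
  // amount is already known zero (e.g. the amount is y << 1).
  assert(maskCoversRotateAmount(30, /*KnownZero=*/1, 32));
  // Without that known-zero bit, a mask of 30 must still be rejected.
  assert(!maskCoversRotateAmount(30, 0, 32));
}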

Differential Revision: https://reviews.llvm.org/D47116

llvm-svn: 332895
Craig Topper 2018-05-21 21:09:18 +00:00
parent 93fe0f0f18
commit ac8a899f0f
3 changed files with 30 additions and 57 deletions

lib/CodeGen/SelectionDAG/DAGCombiner.cpp

@@ -4823,7 +4823,8 @@ bool DAGCombiner::MatchRotateHalf(SDValue Op, SDValue &Shift, SDValue &Mask) {
 // reduces to a rotate in direction shift2 by Pos or (equivalently) a rotate
 // in direction shift1 by Neg. The range [0, EltSize) means that we only need
 // to consider shift amounts with defined behavior.
-static bool matchRotateSub(SDValue Pos, SDValue Neg, unsigned EltSize) {
+static bool matchRotateSub(SDValue Pos, SDValue Neg, unsigned EltSize,
+                           SelectionDAG &DAG) {
   // If EltSize is a power of 2 then:
   //
   // (a) (Pos == 0 ? 0 : EltSize - Pos) == (EltSize - Pos) & (EltSize - 1)
@@ -4858,9 +4859,13 @@ static bool matchRotateSub(SDValue Pos, SDValue Neg, unsigned EltSize) {
   unsigned MaskLoBits = 0;
   if (Neg.getOpcode() == ISD::AND && isPowerOf2_64(EltSize)) {
     if (ConstantSDNode *NegC = isConstOrConstSplat(Neg.getOperand(1))) {
-      if (NegC->getAPIntValue() == EltSize - 1) {
+      KnownBits Known;
+      DAG.computeKnownBits(Neg.getOperand(0), Known);
+      unsigned Bits = Log2_64(EltSize);
+      if (NegC->getAPIntValue().getActiveBits() <= Bits &&
+          ((NegC->getAPIntValue() | Known.Zero).countTrailingOnes() >= Bits)) {
         Neg = Neg.getOperand(0);
-        MaskLoBits = Log2_64(EltSize);
+        MaskLoBits = Bits;
       }
     }
   }
@@ -4875,10 +4880,16 @@ static bool matchRotateSub(SDValue Pos, SDValue Neg, unsigned EltSize) {
   // On the RHS of [A], if Pos is Pos' & (EltSize - 1), just replace Pos with
   // Pos'. The truncation is redundant for the purpose of the equality.
-  if (MaskLoBits && Pos.getOpcode() == ISD::AND)
-    if (ConstantSDNode *PosC = isConstOrConstSplat(Pos.getOperand(1)))
-      if (PosC->getAPIntValue() == EltSize - 1)
+  if (MaskLoBits && Pos.getOpcode() == ISD::AND) {
+    if (ConstantSDNode *PosC = isConstOrConstSplat(Pos.getOperand(1))) {
+      KnownBits Known;
+      DAG.computeKnownBits(Pos.getOperand(0), Known);
+      if (PosC->getAPIntValue().getActiveBits() <= MaskLoBits &&
+          ((PosC->getAPIntValue() | Known.Zero).countTrailingOnes() >=
+           MaskLoBits))
         Pos = Pos.getOperand(0);
+    }
+  }
 
   // The condition we need is now:
   //
@@ -4934,7 +4945,7 @@ SDNode *DAGCombiner::MatchRotatePosNeg(SDValue Shifted, SDValue Pos,
   //          (srl x, (*ext y))) ->
   //   (rotr x, y) or (rotl x, (sub 32, y))
   EVT VT = Shifted.getValueType();
-  if (matchRotateSub(InnerPos, InnerNeg, VT.getScalarSizeInBits())) {
+  if (matchRotateSub(InnerPos, InnerNeg, VT.getScalarSizeInBits(), DAG)) {
     bool HasPos = TLI.isOperationLegalOrCustom(PosOpcode, VT);
     return DAG.getNode(HasPos ? PosOpcode : NegOpcode, DL, VT, Shifted,
                        HasPos ? Pos : Neg).getNode();
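For intuition about the test updates below: when the rotate amount is known even, masking the negated amount with 30 yields the same value as masking with 31, so the shl/srl/or expansion is still a rotate and can be lowered to roll/vprotd/vprolvd. A brute-force C++ self-check of that identity (an illustrative aside, not code from this commit):

// Verifies (X << Amt) | (X >> ((0 - Amt) & 30)) == rotl(X, Amt)
// for every even amount Amt, i.e. whenever bit 0 of Amt is known zero.
#include <cassert>
#include <cstdint>

static uint32_t rotl32(uint32_t X, unsigned R) {
  R &= 31;
  return R == 0 ? X : (X << R) | (X >> (32 - R));
}

int main() {
  const uint32_t X = 0x12345678u;
  for (unsigned Y = 0; Y < 64; ++Y) {
    unsigned Amt = Y & 30; // even amount: bit 0 is known zero
    // The IR shape in the tests: shift left by Amt, shift right by the
    // negated amount masked with 30 instead of 31, then or the halves.
    uint32_t Expanded = (X << Amt) | (X >> ((0u - Amt) & 30));
    assert(Expanded == rotl32(X, Amt));
  }
}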

test/CodeGen/X86/combine-rotates.ll

@@ -61,27 +61,14 @@ define <4 x i32> @combine_vec_rot_rot_splat_zero(<4 x i32> %x) {
 define <4 x i32> @rotate_demanded_bits(<4 x i32>, <4 x i32>) {
 ; XOP-LABEL: rotate_demanded_bits:
 ; XOP:       # %bb.0:
-; XOP-NEXT:    vmovdqa {{.*#+}} xmm2 = [30,30,30,30]
-; XOP-NEXT:    vpand %xmm2, %xmm1, %xmm1
-; XOP-NEXT:    vpshld %xmm1, %xmm0, %xmm3
-; XOP-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; XOP-NEXT:    vpsubd %xmm1, %xmm4, %xmm1
-; XOP-NEXT:    vpand %xmm2, %xmm1, %xmm1
-; XOP-NEXT:    vpsubd %xmm1, %xmm4, %xmm1
-; XOP-NEXT:    vpshld %xmm1, %xmm0, %xmm0
-; XOP-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; XOP-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
+; XOP-NEXT:    vprotd %xmm1, %xmm0, %xmm0
 ; XOP-NEXT:    retq
 ;
 ; AVX512-LABEL: rotate_demanded_bits:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [30,30,30,30]
-; AVX512-NEXT:    vpand %xmm2, %xmm1, %xmm1
-; AVX512-NEXT:    vpsllvd %xmm1, %xmm0, %xmm3
-; AVX512-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; AVX512-NEXT:    vpsubd %xmm1, %xmm4, %xmm1
-; AVX512-NEXT:    vpand %xmm2, %xmm1, %xmm1
-; AVX512-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; AVX512-NEXT:    vpandd {{.*}}(%rip){1to4}, %xmm1, %xmm1
+; AVX512-NEXT:    vprolvd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    retq
   %3 = and <4 x i32> %1, <i32 30, i32 30, i32 30, i32 30>
   %4 = shl <4 x i32> %0, %3
@@ -117,28 +104,15 @@ define <4 x i32> @rotate_demanded_bits_3(<4 x i32>, <4 x i32>) {
 ; XOP-LABEL: rotate_demanded_bits_3:
 ; XOP:       # %bb.0:
 ; XOP-NEXT:    vpaddd %xmm1, %xmm1, %xmm1
-; XOP-NEXT:    vmovdqa {{.*#+}} xmm2 = [30,30,30,30]
-; XOP-NEXT:    vpand %xmm2, %xmm1, %xmm3
-; XOP-NEXT:    vpshld %xmm3, %xmm0, %xmm3
-; XOP-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; XOP-NEXT:    vpsubd %xmm1, %xmm4, %xmm1
-; XOP-NEXT:    vpand %xmm2, %xmm1, %xmm1
-; XOP-NEXT:    vpsubd %xmm1, %xmm4, %xmm1
-; XOP-NEXT:    vpshld %xmm1, %xmm0, %xmm0
-; XOP-NEXT:    vpor %xmm0, %xmm3, %xmm0
+; XOP-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
+; XOP-NEXT:    vprotd %xmm1, %xmm0, %xmm0
 ; XOP-NEXT:    retq
 ;
 ; AVX512-LABEL: rotate_demanded_bits_3:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vpaddd %xmm1, %xmm1, %xmm1
-; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [30,30,30,30]
-; AVX512-NEXT:    vpand %xmm2, %xmm1, %xmm3
-; AVX512-NEXT:    vpsllvd %xmm3, %xmm0, %xmm3
-; AVX512-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; AVX512-NEXT:    vpsubd %xmm1, %xmm4, %xmm1
-; AVX512-NEXT:    vpand %xmm2, %xmm1, %xmm1
-; AVX512-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpor %xmm0, %xmm3, %xmm0
+; AVX512-NEXT:    vpandd {{.*}}(%rip){1to4}, %xmm1, %xmm1
+; AVX512-NEXT:    vprolvd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    retq
   %3 = shl <4 x i32> %1, <i32 1, i32 1, i32 1, i32 1>
   %4 = and <4 x i32> %3, <i32 30, i32 30, i32 30, i32 30>

test/CodeGen/X86/rotate4.ll

@@ -284,15 +284,9 @@ define void @rotate_right_m16(i16* %p, i32 %amount) {
 define i32 @rotate_demanded_bits(i32, i32) {
 ; CHECK-LABEL: rotate_demanded_bits:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    andb $30, %sil
 ; CHECK-NEXT:    movl %esi, %ecx
-; CHECK-NEXT:    andl $30, %ecx
-; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    shll %cl, %eax
-; CHECK-NEXT:    negl %ecx
-; CHECK-NEXT:    andb $30, %cl
-; CHECK-NEXT:    # kill: def $cl killed $cl killed $ecx
-; CHECK-NEXT:    shrl %cl, %edi
-; CHECK-NEXT:    orl %eax, %edi
+; CHECK-NEXT:    roll %cl, %edi
 ; CHECK-NEXT:    movl %edi, %eax
 ; CHECK-NEXT:    retq
   %3 = and i32 %1, 30
@@ -324,16 +318,10 @@ define i32 @rotate_demanded_bits_2(i32, i32) {
 define i32 @rotate_demanded_bits_3(i32, i32) {
 ; CHECK-LABEL: rotate_demanded_bits_3:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    addl %esi, %esi
-; CHECK-NEXT:    movl %esi, %ecx
-; CHECK-NEXT:    andb $30, %cl
-; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    shll %cl, %eax
-; CHECK-NEXT:    negl %esi
-; CHECK-NEXT:    addb %sil, %sil
+; CHECK-NEXT:    andb $30, %sil
 ; CHECK-NEXT:    movl %esi, %ecx
-; CHECK-NEXT:    shrl %cl, %edi
-; CHECK-NEXT:    orl %eax, %edi
+; CHECK-NEXT:    roll %cl, %edi
 ; CHECK-NEXT:    movl %edi, %eax
 ; CHECK-NEXT:    retq
   %3 = shl i32 %1, 1