
[X86] Move matching of (and (srl/sra, C), (1<<C) - 1) to BEXTR/BEXTRI instruction to custom isel

Recognizing this pattern during DAG combine hides information about the 'and' and the shift from other combines. I think it should be recognized at isel so it is done as late as possible. But it can't be done with table-based isel because you need to be able to look at both immediates. This patch moves it to custom isel in X86ISelDAGToDAG.cpp.
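
For illustration, here is a minimal standalone sketch (plain C++, not part of the patch) of the control value the custom isel builds for BEXTR/BEXTRI: the start bit goes in bits 7:0 and the field length in bits 15:8, which is the Shift | (MaskSize << 8) computation in matchBEXTRFromAnd below. The input value and field width are arbitrary examples.

#include <cassert>
#include <cstdint>

// Control word layout used by BEXTR/BEXTRI: start bit in bits 7:0,
// field length (popcount of the mask) in bits 15:8.
static uint32_t bextrControl(uint32_t Shift, uint32_t MaskSize) {
  return Shift | (MaskSize << 8);
}

// Software model of the extract for in-range fields; assumes
// Shift + Len <= 32 and 0 < Len < 32, as the isel's checks guarantee.
static uint32_t extractField(uint32_t Src, uint32_t Shift, uint32_t Len) {
  return (Src >> Shift) & ((1u << Len) - 1);
}

int main() {
  // (x >> 4) & 0xfff: Shift = 4, MaskSize = 12, control value = 0xc04.
  assert(bextrControl(4, 12) == 0xc04);
  uint32_t X = 0x12345678;
  assert(extractField(X, 4, 12) == ((X >> 4) & 0xfff));
  return 0;
}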

This does break a couple of tests in tbm_patterns because we are now emitting an and_flag node or (cmp and, 0) that we don't recognize yet. We already had this problem for several other TBM patterns, so I think this is fine and we can address all of them together.

I've also fixed a bug where the combine to BEXTR was preventing us from using the trick of zero-extending AH to handle extracts of bits 15:8. We might still want to use BEXTR if it enables load folding, but honestly I hope we have narrowed the load instead before we get to isel.

I think we should probably also support matching BEXTR from (srl/sra (and x, mask << C), C), but that should be a different patch.
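
For reference, that follow-up pattern computes the same bitfield as the one handled in this patch: masking with a pre-shifted mask and then shifting right is equivalent to shifting right first and then masking. A small standalone check of that identity (plain C++, arbitrary values, not part of the patch):

#include <cassert>
#include <cstdint>

int main() {
  uint64_t X = 0xdeadbeefcafef00dULL; // arbitrary input
  uint64_t Mask = 0xfff;              // contiguous low-bit mask (isMask_64)
  unsigned C = 4;                     // shift amount

  // (srl (and X, Mask << C), C) extracts the same field as the
  // (and (srl X, C), Mask) form matched by this patch.
  assert(((X & (Mask << C)) >> C) == ((X >> C) & Mask));
  return 0;
}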

Differential Revision: https://reviews.llvm.org/D37592

llvm-svn: 313054
Craig Topper 2017-09-12 17:40:25 +00:00
parent 4386c540a8
commit 91215998d2
6 changed files with 144 additions and 54 deletions


@@ -422,6 +422,8 @@ namespace {
}
bool foldLoadStoreIntoMemOperand(SDNode *Node);
bool matchBEXTRFromAnd(SDNode *Node);
};
}
@@ -2280,6 +2282,90 @@ bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) {
return true;
}
// See if this is an (X >> C1) & C2 that we can match to BEXTR/BEXTRI.
bool X86DAGToDAGISel::matchBEXTRFromAnd(SDNode *Node) {
MVT NVT = Node->getSimpleValueType(0);
SDLoc dl(Node);
SDValue N0 = Node->getOperand(0);
SDValue N1 = Node->getOperand(1);
if (!Subtarget->hasBMI() && !Subtarget->hasTBM())
return false;
// Must have a shift right.
if (N0->getOpcode() != ISD::SRL && N0->getOpcode() != ISD::SRA)
return false;
// Shift can't have additional users.
if (!N0->hasOneUse())
return false;
// Only supported for 32 and 64 bits.
if (NVT != MVT::i32 && NVT != MVT::i64)
return false;
// Shift amount and RHS of and must be constant.
ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(N1);
ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(N0->getOperand(1));
if (!MaskCst || !ShiftCst)
return false;
// And RHS must be a mask.
uint64_t Mask = MaskCst->getZExtValue();
if (!isMask_64(Mask))
return false;
uint64_t Shift = ShiftCst->getZExtValue();
uint64_t MaskSize = countPopulation(Mask);
// Don't interfere with something that can be handled by extracting AH.
// TODO: If we are able to fold a load, BEXTR might still be better than AH.
if (Shift == 8 && MaskSize == 8)
return false;
// Make sure we are only using bits that were in the original value, not
// shifted in.
if (Shift + MaskSize > NVT.getSizeInBits())
return false;
SDValue New = CurDAG->getTargetConstant(Shift | (MaskSize << 8), dl, NVT);
unsigned ROpc = NVT == MVT::i64 ? X86::BEXTRI64ri : X86::BEXTRI32ri;
unsigned MOpc = NVT == MVT::i64 ? X86::BEXTRI64mi : X86::BEXTRI32mi;
// BMI requires the immediate to be placed in a register.
if (!Subtarget->hasTBM()) {
ROpc = NVT == MVT::i64 ? X86::BEXTR64rr : X86::BEXTR32rr;
MOpc = NVT == MVT::i64 ? X86::BEXTR64rm : X86::BEXTR32rm;
SDNode *Move = CurDAG->getMachineNode(X86::MOV32ri, dl, NVT, New);
New = SDValue(Move, 0);
}
MachineSDNode *NewNode;
SDValue Input = N0->getOperand(0);
SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
if (tryFoldLoad(Node, Input, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, New, Input.getOperand(0) };
SDVTList VTs = CurDAG->getVTList(NVT, MVT::Other);
NewNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
// Update the chain.
ReplaceUses(Input.getValue(1), SDValue(NewNode, 1));
// Record the mem-refs
LoadSDNode *LoadNode = cast<LoadSDNode>(Input);
if (LoadNode) {
MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
MemOp[0] = LoadNode->getMemOperand();
NewNode->setMemRefs(MemOp, MemOp + 1);
}
} else {
NewNode = CurDAG->getMachineNode(ROpc, dl, NVT, Input, New);
}
ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
CurDAG->RemoveDeadNode(Node);
return true;
}
void X86DAGToDAGISel::Select(SDNode *Node) {
MVT NVT = Node->getSimpleValueType(0);
unsigned Opc, MOpc;
@@ -2333,8 +2419,14 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
}
case ISD::AND:
// Try to match BEXTR/BEXTRI instruction.
if (matchBEXTRFromAnd(Node))
return;
LLVM_FALLTHROUGH;
case ISD::OR:
case ISD::XOR: {
// For operations of the form (x << C1) op C2, check if we can use a smaller
// encoding for C2 by transforming it into (x op (C2>>C1)) << C1.
SDValue N0 = Node->getOperand(0);


@@ -24586,7 +24586,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::OR: return "X86ISD::OR";
case X86ISD::XOR: return "X86ISD::XOR";
case X86ISD::AND: return "X86ISD::AND";
case X86ISD::BEXTR: return "X86ISD::BEXTR";
case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM";
case X86ISD::MOVMSK: return "X86ISD::MOVMSK";
case X86ISD::PTEST: return "X86ISD::PTEST";
@@ -32162,9 +32161,6 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
return ShiftRight;
EVT VT = N->getValueType(0);
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
SDLoc DL(N);
// Attempt to recursively combine a bitmask AND with shuffles.
if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
@@ -32177,29 +32173,6 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
return SDValue(); // This routine will use CombineTo to replace N.
}
// Create BEXTR instructions
// BEXTR is ((X >> imm) & (2**size-1))
if (VT != MVT::i32 && VT != MVT::i64)
return SDValue();
if (!Subtarget.hasBMI() && !Subtarget.hasTBM())
return SDValue();
if (N0.getOpcode() != ISD::SRA && N0.getOpcode() != ISD::SRL)
return SDValue();
ConstantSDNode *MaskNode = dyn_cast<ConstantSDNode>(N1);
ConstantSDNode *ShiftNode = dyn_cast<ConstantSDNode>(N0.getOperand(1));
if (MaskNode && ShiftNode) {
uint64_t Mask = MaskNode->getZExtValue();
uint64_t Shift = ShiftNode->getZExtValue();
if (isMask_64(Mask)) {
uint64_t MaskSize = countPopulation(Mask);
if (Shift + MaskSize <= VT.getSizeInBits())
return DAG.getNode(X86ISD::BEXTR, DL, VT, N0.getOperand(0),
DAG.getConstant(Shift | (MaskSize << 8), DL,
VT));
}
}
return SDValue();
}


@@ -346,9 +346,6 @@ namespace llvm {
ADD, SUB, ADC, SBB, SMUL,
INC, DEC, OR, XOR, AND,
// Bit field extract.
BEXTR,
// LOW, HI, FLAGS = umul LHS, RHS.
UMUL,


@@ -271,8 +271,6 @@ def X86lock_and : SDNode<"X86ISD::LAND", SDTLockBinaryArithWithFlags,
[SDNPHasChain, SDNPMayStore, SDNPMayLoad,
SDNPMemOperand]>;
def X86bextr : SDNode<"X86ISD::BEXTR", SDTIntBinOp>;
def X86mul_imm : SDNode<"X86ISD::MUL_IMM", SDTIntBinOp>;
def X86WinAlloca : SDNode<"X86ISD::WIN_ALLOCA", SDT_X86WIN_ALLOCA,
@@ -2439,17 +2437,6 @@ let Predicates = [HasBMI2] in {
(INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$lz, sub_32bit))>;
} // HasBMI2
let Predicates = [HasBMI] in {
def : Pat<(X86bextr GR32:$src1, GR32:$src2),
(BEXTR32rr GR32:$src1, GR32:$src2)>;
def : Pat<(X86bextr (loadi32 addr:$src1), GR32:$src2),
(BEXTR32rm addr:$src1, GR32:$src2)>;
def : Pat<(X86bextr GR64:$src1, GR64:$src2),
(BEXTR64rr GR64:$src1, GR64:$src2)>;
def : Pat<(X86bextr (loadi64 addr:$src1), GR64:$src2),
(BEXTR64rm addr:$src1, GR64:$src2)>;
} // HasBMI
multiclass bmi_pdep_pext<string mnemonic, RegisterClass RC,
X86MemOperand x86memop, Intrinsic Int,
PatFrag ld_frag> {
@@ -2651,15 +2638,6 @@ def : InstAlias<"clzero\t{%rax|rax}", (CLZEROr)>, Requires<[In64BitMode]>;
//===----------------------------------------------------------------------===//
let Predicates = [HasTBM] in {
def : Pat<(X86bextr GR32:$src1, (i32 imm:$src2)),
(BEXTRI32ri GR32:$src1, imm:$src2)>;
def : Pat<(X86bextr (loadi32 addr:$src1), (i32 imm:$src2)),
(BEXTRI32mi addr:$src1, imm:$src2)>;
def : Pat<(X86bextr GR64:$src1, i64immSExt32:$src2),
(BEXTRI64ri GR64:$src1, i64immSExt32:$src2)>;
def : Pat<(X86bextr (loadi64 addr:$src1), i64immSExt32:$src2),
(BEXTRI64mi addr:$src1, i64immSExt32:$src2)>;
// FIXME: patterns for the load versions are not implemented
def : Pat<(and GR32:$src, (add GR32:$src, 1)),
(BLCFILL32rr GR32:$src)>;


@@ -311,6 +311,18 @@ define i32 @bextr32b(i32 %x) uwtable ssp {
ret i32 %2
}
; Make sure we still use AH subreg trick to extract 15:8
define i32 @bextr32_subreg(i32 %x) uwtable ssp {
; CHECK-LABEL: bextr32_subreg:
; CHECK: # BB#0:
; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: movzbl %ah, %eax # NOREX
; CHECK-NEXT: retq
%1 = lshr i32 %x, 8
%2 = and i32 %1, 255
ret i32 %2
}
define i32 @bextr32b_load(i32* %x) uwtable ssp {
; CHECK-LABEL: bextr32b_load:
; CHECK: # BB#0:
@@ -357,6 +369,18 @@ define i64 @bextr64b(i64 %x) uwtable ssp {
ret i64 %2
}
; Make sure we still use the AH subreg trick to extract 15:8
define i64 @bextr64_subreg(i64 %x) uwtable ssp {
; CHECK-LABEL: bextr64_subreg:
; CHECK: # BB#0:
; CHECK-NEXT: movq %rdi, %rax
; CHECK-NEXT: movzbl %ah, %eax # NOREX
; CHECK-NEXT: retq
%1 = lshr i64 %x, 8
%2 = and i64 %1, 255
ret i64 %2
}
define i64 @bextr64b_load(i64* %x) {
; CHECK-LABEL: bextr64b_load:
; CHECK: # BB#0:


@@ -13,6 +13,18 @@ define i32 @test_x86_tbm_bextri_u32(i32 %a) nounwind {
ret i32 %t1
}
; Make sure we still use AH subreg trick for extracting bits 15:8
define i32 @test_x86_tbm_bextri_u32_subreg(i32 %a) nounwind {
; CHECK-LABEL: test_x86_tbm_bextri_u32_subreg:
; CHECK: # BB#0:
; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: movzbl %ah, %eax # NOREX
; CHECK-NEXT: retq
%t0 = lshr i32 %a, 8
%t1 = and i32 %t0, 255
ret i32 %t1
}
define i32 @test_x86_tbm_bextri_u32_m(i32* nocapture %a) nounwind {
; CHECK-LABEL: test_x86_tbm_bextri_u32_m:
; CHECK: # BB#0:
@@ -40,7 +52,8 @@ define i32 @test_x86_tbm_bextri_u32_z(i32 %a, i32 %b) nounwind {
define i32 @test_x86_tbm_bextri_u32_z2(i32 %a, i32 %b, i32 %c) nounwind {
; CHECK-LABEL: test_x86_tbm_bextri_u32_z2:
; CHECK: # BB#0:
; CHECK-NEXT: bextr $3076, %edi, %eax # imm = 0xC04
; CHECK-NEXT: shrl $4, %edi
; CHECK-NEXT: testw $4095, %di # imm = 0xFFF
; CHECK-NEXT: cmovnel %edx, %esi
; CHECK-NEXT: movl %esi, %eax
; CHECK-NEXT: retq
@@ -61,6 +74,18 @@ define i64 @test_x86_tbm_bextri_u64(i64 %a) nounwind {
ret i64 %t1
}
; Make sure we still use AH subreg trick for extracting bits 15:8
define i64 @test_x86_tbm_bextri_u64_subreg(i64 %a) nounwind {
; CHECK-LABEL: test_x86_tbm_bextri_u64_subreg:
; CHECK: # BB#0:
; CHECK-NEXT: movq %rdi, %rax
; CHECK-NEXT: movzbl %ah, %eax # NOREX
; CHECK-NEXT: retq
%t0 = lshr i64 %a, 8
%t1 = and i64 %t0, 255
ret i64 %t1
}
define i64 @test_x86_tbm_bextri_u64_m(i64* nocapture %a) nounwind {
; CHECK-LABEL: test_x86_tbm_bextri_u64_m:
; CHECK: # BB#0:
@@ -88,7 +113,8 @@ define i64 @test_x86_tbm_bextri_u64_z(i64 %a, i64 %b) nounwind {
define i64 @test_x86_tbm_bextri_u64_z2(i64 %a, i64 %b, i64 %c) nounwind {
; CHECK-LABEL: test_x86_tbm_bextri_u64_z2:
; CHECK: # BB#0:
; CHECK-NEXT: bextr $3076, %edi, %eax # imm = 0xC04
; CHECK-NEXT: shrl $4, %edi
; CHECK-NEXT: testw $4095, %di # imm = 0xFFF
; CHECK-NEXT: cmovneq %rdx, %rsi
; CHECK-NEXT: movq %rsi, %rax
; CHECK-NEXT: retq