[X86] Avoid using high register trick for test instruction

Summary: It seems it's main effect is to create addition copies when values are inr register that do not support this trick, which increase register pressure and makes the code bigger. The main noteworthy regression I was able to observe was pattern of the type (setcc (trunc (and X, C)), 0) where C is such as it would benefit from the hi register trick. To prevent this, a new pattern is added to materialize such pattern using a 32 bits test. This has the added benefit of working with any constant that is materializable as a 32bits immediate, not just the ones that can leverage the high register trick, as demonstrated by the test case in test-shrink.ll using the constant 2049 . Reviewers: craig.topper, niravd, spatel, hfinkel Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D42646 llvm-svn: 323690
2024-10-19 11:02:59 +02:00 · 2018-01-29 20:54:33 +00:00 · 2018-01-29 20:54:33 +00:00 · 90c261493f
commit 90c261493f
parent f20f873912
7 changed files with 29 additions and 82 deletions
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@ -3073,67 +3073,33 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
        return;
      }

-      // For example, "testl %eax, $2048" to "testb %ah, $8".
-      if (isShiftedUInt<8, 8>(Mask) &&
-          (!(Mask & 0x8000) || hasNoSignedComparisonUses(Node))) {
-        // Shift the immediate right by 8 bits.
-        SDValue ShiftedImm = CurDAG->getTargetConstant(Mask >> 8, dl, MVT::i8);
-        SDValue Reg = N0.getOperand(0);
-
-        // Extract the h-register.
-        SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_8bit_hi, dl,
-                                                        MVT::i8, Reg);
-
-        // Emit a testb.  The EXTRACT_SUBREG becomes a COPY that can only
-        // target GR8_NOREX registers, so make sure the register class is
-        // forced.
-        SDNode *NewNode = CurDAG->getMachineNode(X86::TEST8ri_NOREX, dl,
-                                                 MVT::i32, Subreg, ShiftedImm);
-        // Replace SUB|CMP with TEST, since SUB has two outputs while TEST has
-        // one, do not call ReplaceAllUsesWith.
-        ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)),
-                    SDValue(NewNode, 0));
-        CurDAG->RemoveDeadNode(Node);
-        return;
-      }
-
-      // For example, "testl %eax, $32776" to "testw %ax, $32776".
-      // NOTE: We only want to form TESTW instructions if optimizing for
-      // min size. Otherwise we only save one byte and possibly get a length
-      // changing prefix penalty in the decoders.
-      if (OptForMinSize && isUInt<16>(Mask) && N0.getValueType() != MVT::i16 &&
-          (!(Mask & 0x8000) || hasNoSignedComparisonUses(Node))) {
-        SDValue Imm = CurDAG->getTargetConstant(Mask, dl, MVT::i16);
-        SDValue Reg = N0.getOperand(0);
-
-        // Extract the 16-bit subregister.
-        SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_16bit, dl,
-                                                        MVT::i16, Reg);
-
-        // Emit a testw.
-        SDNode *NewNode = CurDAG->getMachineNode(X86::TEST16ri, dl, MVT::i32,
-                                                 Subreg, Imm);
-        // Replace SUB|CMP with TEST, since SUB has two outputs while TEST has
-        // one, do not call ReplaceAllUsesWith.
-        ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)),
-                    SDValue(NewNode, 0));
-        CurDAG->RemoveDeadNode(Node);
-        return;
-      }
-
      // For example, "testq %rax, $268468232" to "testl %eax, $268468232".
-      if (isUInt<32>(Mask) && N0.getValueType() == MVT::i64 &&
+      if (isUInt<32>(Mask) &&
          (!(Mask & 0x80000000) || hasNoSignedComparisonUses(Node))) {
-        SDValue Imm = CurDAG->getTargetConstant(Mask, dl, MVT::i32);
+        MVT VT = MVT::i32;
+        int SubRegOp = X86::sub_32bit;
+        unsigned Op = X86::TEST32ri;
+
+        // For example, "testl %eax, $32776" to "testw %ax, $32776".
+        // NOTE: We only want to form TESTW instructions if optimizing for
+        // min size. Otherwise we only save one byte and possibly get a length
+        // changing prefix penalty in the decoders.
+        if (OptForMinSize && isUInt<16>(Mask) &&
+            (!(Mask & 0x8000) || hasNoSignedComparisonUses(Node))) {
+          VT = MVT::i16;
+          SubRegOp = X86::sub_16bit;
+          Op = X86::TEST16ri;
+        }
+
+        SDValue Imm = CurDAG->getTargetConstant(Mask, dl, VT);
        SDValue Reg = N0.getOperand(0);

-        // Extract the 32-bit subregister.
-        SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_32bit, dl,
-                                                        MVT::i32, Reg);
+        // Extract the subregister if necessary.
+        if (N0.getValueType() != VT)
+          Reg = CurDAG->getTargetExtractSubreg(SubRegOp, dl, VT, Reg);

-        // Emit a testl.
-        SDNode *NewNode = CurDAG->getMachineNode(X86::TEST32ri, dl, MVT::i32,
-                                                 Subreg, Imm);
+        // Emit a testl or testw.
+        SDNode *NewNode = CurDAG->getMachineNode(Op, dl, MVT::i32, Reg, Imm);
        // Replace SUB|CMP with TEST, since SUB has two outputs while TEST has
        // one, do not call ReplaceAllUsesWith.
        ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)),
--- a/lib/Target/X86/X86InstrArithmetic.td
+++ b/lib/Target/X86/X86InstrArithmetic.td
@ -1257,14 +1257,6 @@ let isCompare = 1 in {
    def TEST32mi   : BinOpMI_F<0xF6, "test", Xi32, X86testpat, MRM0m>;
    let Predicates = [In64BitMode] in
    def TEST64mi32 : BinOpMI_F<0xF6, "test", Xi64, X86testpat, MRM0m>;
-
-    // When testing the result of EXTRACT_SUBREG sub_8bit_hi, make sure the
-    // register class is constrained to GR8_NOREX. This pseudo is explicitly
-    // marked side-effect free, since it doesn't have an isel pattern like
-    // other test instructions.
-    let isPseudo = 1, hasSideEffects = 0 in
-    def TEST8ri_NOREX : I<0, Pseudo, (outs), (ins GR8_NOREX:$src, i8imm:$mask),
-                          "", [], IIC_BIN_NONMEM>, Sched<[WriteALU]>;
  } // Defs = [EFLAGS]

  def TEST8i8    : BinOpAI_F<0xA8, "test", Xi8 , AL,
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@ -8018,9 +8018,6 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
  case X86::VMOVUPSZ256mr_NOVLX:
    return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVUPSYmr),
                            get(X86::VEXTRACTF64x4Zmr), X86::sub_ymm);
-  case X86::TEST8ri_NOREX:
-    MI.setDesc(get(X86::TEST8ri));
-    return true;
  case X86::MOV32ri64:
    MI.setDesc(get(X86::MOV32ri));
    return true;
--- a/lib/Target/X86/X86MacroFusion.cpp
+++ b/lib/Target/X86/X86MacroFusion.cpp
@ -86,7 +86,6 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII,
  case X86::TEST16mr:
  case X86::TEST32mr:
  case X86::TEST64mr:
-  case X86::TEST8ri_NOREX:
  case X86::AND16i16:
  case X86::AND16ri:
  case X86::AND16ri8:
--- a/test/CodeGen/X86/test-shrink.ll
+++ b/test/CodeGen/X86/test-shrink.ll
@ -484,8 +484,7 @@ no:
 define void @truncand32(i16 inreg %x) nounwind {
 ; CHECK-LINUX64-LABEL: truncand32:
 ; CHECK-LINUX64:       # %bb.0:
-; CHECK-LINUX64-NEXT:    andl $2049, %edi # imm = 0x801
-; CHECK-LINUX64-NEXT:    testw %di, %di
+; CHECK-LINUX64-NEXT:    testl $2049, %edi # imm = 0x801
 ; CHECK-LINUX64-NEXT:    je .LBB11_1
 ; CHECK-LINUX64-NEXT:  # %bb.2: # %no
 ; CHECK-LINUX64-NEXT:    retq
@ -498,8 +497,7 @@ define void @truncand32(i16 inreg %x) nounwind {
 ; CHECK-WIN32-64-LABEL: truncand32:
 ; CHECK-WIN32-64:       # %bb.0:
 ; CHECK-WIN32-64-NEXT:    subq $40, %rsp
-; CHECK-WIN32-64-NEXT:    andl $2049, %ecx # imm = 0x801
-; CHECK-WIN32-64-NEXT:    testw %cx, %cx
+; CHECK-WIN32-64-NEXT:    testl $2049, %ecx # imm = 0x801
 ; CHECK-WIN32-64-NEXT:    je .LBB11_1
 ; CHECK-WIN32-64-NEXT:  # %bb.2: # %no
 ; CHECK-WIN32-64-NEXT:    addq $40, %rsp
@ -511,8 +509,7 @@ define void @truncand32(i16 inreg %x) nounwind {
 ;
 ; CHECK-X86-LABEL: truncand32:
 ; CHECK-X86:       # %bb.0:
-; CHECK-X86-NEXT:    andl $2049, %eax # imm = 0x801
-; CHECK-X86-NEXT:    testw %ax, %ax
+; CHECK-X86-NEXT:    testl $2049, %eax # imm = 0x801
 ; CHECK-X86-NEXT:    je .LBB11_1
 ; CHECK-X86-NEXT:  # %bb.2: # %no
 ; CHECK-X86-NEXT:    retl
--- a/test/CodeGen/X86/testb-je-fusion.ll
+++ b/test/CodeGen/X86/testb-je-fusion.ll
@ -6,9 +6,8 @@
 define i32 @check_flag(i32 %flags, ...) nounwind {
 ; CHECK-LABEL: check_flag:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    movl %edi, %ecx
 ; CHECK-NEXT:    xorl %eax, %eax
-; CHECK-NEXT:    testb $2, %ch
+; CHECK-NEXT:    testl $512, %edi # imm = 0x200
 ; CHECK-NEXT:    je .LBB0_2
 ; CHECK-NEXT:  # %bb.1: # %if.then
 ; CHECK-NEXT:    movl $1, %eax
--- a/test/CodeGen/X86/vastart-defs-eflags.ll
+++ b/test/CodeGen/X86/vastart-defs-eflags.ll
@ -8,9 +8,7 @@ target triple = "x86_64-apple-macosx10.10.0"
 define i32 @check_flag(i32 %flags, ...) nounwind {
 ; CHECK-LABEL: check_flag:
 ; CHECK:       ## %bb.0: ## %entry
-; CHECK-NEXT:    pushq %rbx
-; CHECK-NEXT:    subq $48, %rsp
-; CHECK-NEXT:    movl %edi, %ebx
+; CHECK-NEXT:    subq $56, %rsp
 ; CHECK-NEXT:    testb %al, %al
 ; CHECK-NEXT:    je LBB0_2
 ; CHECK-NEXT:  ## %bb.1: ## %entry
@ -29,7 +27,7 @@ define i32 @check_flag(i32 %flags, ...) nounwind {
 ; CHECK-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    movq %rsi, -{{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    xorl %eax, %eax
-; CHECK-NEXT:    testb $2, %bh
+; CHECK-NEXT:    testl $512, %edi ## imm = 0x200
 ; CHECK-NEXT:    je LBB0_4
 ; CHECK-NEXT:  ## %bb.3: ## %if.then
 ; CHECK-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax
@ -40,8 +38,7 @@ define i32 @check_flag(i32 %flags, ...) nounwind {
 ; CHECK-NEXT:    movl $8, 0
 ; CHECK-NEXT:    movl $1, %eax
 ; CHECK-NEXT:  LBB0_4: ## %if.end
-; CHECK-NEXT:    addq $48, %rsp
-; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    addq $56, %rsp
 ; CHECK-NEXT:    retq
 entry:
  %and = and i32 %flags, 512