[X86] Use push-pop for materializing small constants under 'minsize'

Use the 3-byte (4 with REX prefix) push-pop sequence for materializing small constants. This is smaller than using a mov (5, 6 or 7 bytes depending on size and REX prefix), but it's likely to be slower, so only used for 'minsize'. This is a follow-up to r255656. Differential Revision: http://reviews.llvm.org/D15549 llvm-svn: 255936
2024-11-22 18:54:02 +01:00 · 2015-12-17 23:18:39 +00:00 · 2015-12-17 23:18:39 +00:00 · 6b696434e4
commit 6b696434e4
parent 72c4ebb415
8 changed files with 262 additions and 106 deletions
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@ -157,9 +157,13 @@ namespace {
    /// performance.
    bool OptForSize;

+    /// If true, selector should try to optimize for minimum code size.
+    bool OptForMinSize;
+
  public:
    explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOpt::Level OptLevel)
-        : SelectionDAGISel(tm, OptLevel), OptForSize(false) {}
+        : SelectionDAGISel(tm, OptLevel), OptForSize(false),
+          OptForMinSize(false) {}

    const char *getPassName() const override {
      return "X86 DAG->DAG Instruction Selection";
@ -531,8 +535,10 @@ static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) {
 }

 void X86DAGToDAGISel::PreprocessISelDAG() {
-  // OptForSize is used in pattern predicates that isel is matching.
+  // OptFor[Min]Size are used in pattern predicates that isel is matching.
  OptForSize = MF->getFunction()->optForSize();
+  OptForMinSize = MF->getFunction()->optForMinSize();
+  assert((!OptForMinSize || OptForSize) && "OptForMinSize implies OptForSize");

  for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
       E = CurDAG->allnodes_end(); I != E; ) {
--- a/lib/Target/X86/X86InstrCompiler.td
+++ b/lib/Target/X86/X86InstrCompiler.td
@ -250,7 +250,7 @@ def MORESTACK_RET_RESTORE_R10 : I<0, Pseudo, (outs), (ins),
 // Alias instruction mapping movr0 to xor.
 // FIXME: remove when we can teach regalloc that xor reg, reg is ok.
 let Defs = [EFLAGS], isReMaterializable = 1, isAsCheapAsAMove = 1,
-    isPseudo = 1 in
+    isPseudo = 1, AddedComplexity = 20 in
 def MOV32r0  : I<0, Pseudo, (outs GR32:$dst), (ins), "",
                 [(set GR32:$dst, 0)], IIC_ALU_NONMEM>, Sched<[WriteZero]>;

@ -263,7 +263,7 @@ def : Pat<(i64 0), (SUBREG_TO_REG (i64 0), (MOV32r0), sub_32bit)> {
 }

 let Predicates = [OptForSize, NotSlowIncDec, Not64BitMode],
-    AddedComplexity = 1 in {
+    AddedComplexity = 15 in {
  // Pseudo instructions for materializing 1 and -1 using XOR+INC/DEC,
  // which only require 3 bytes compared to MOV32ri which requires 5.
  let Defs = [EFLAGS], isReMaterializable = 1, isPseudo = 1 in {
@ -278,6 +278,17 @@ let Predicates = [OptForSize, NotSlowIncDec, Not64BitMode],
  def : Pat<(i16 -1), (EXTRACT_SUBREG (MOV32r_1), sub_16bit)>;
 }

+let isReMaterializable = 1, isPseudo = 1, AddedComplexity = 10 in {
+// AddedComplexity higher than MOV64ri but lower than MOV32r0 and MOV32r1.
+// FIXME: Add itinerary class and Schedule.
+def MOV32ImmSExti8 : I<0, Pseudo, (outs GR32:$dst), (ins i32i8imm:$src), "",
+                       [(set GR32:$dst, i32immSExt8:$src)]>,
+                     Requires<[OptForMinSize]>;
+def MOV64ImmSExti8 : I<0, Pseudo, (outs GR64:$dst), (ins i64i8imm:$src), "",
+                       [(set GR64:$dst, i64immSExt8:$src)]>,
+                     Requires<[OptForMinSize, NotWin64WithoutFP]>;
+}
+
 // Materialize i64 constant where top 32-bits are zero. This could theoretically
 // use MOV32ri with a SUBREG_TO_REG to represent the zero-extension, however
 // that would make it more difficult to rematerialize.
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@ -23,6 +23,7 @@
 #include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/StackMaps.h"
 #include "llvm/IR/DerivedTypes.h"
@ -5297,6 +5298,50 @@ static bool expandMOV32r1(MachineInstrBuilder &MIB, const TargetInstrInfo &TII,
  return true;
 }

+bool X86InstrInfo::ExpandMOVImmSExti8(MachineInstrBuilder &MIB) const {
+  MachineBasicBlock &MBB = *MIB->getParent();
+  DebugLoc DL = MIB->getDebugLoc();
+  int64_t Imm = MIB->getOperand(1).getImm();
+  assert(Imm != 0 && "Using push/pop for 0 is not efficient.");
+  MachineBasicBlock::iterator I = MIB.getInstr();
+
+  int StackAdjustment;
+
+  if (Subtarget.is64Bit()) {
+    assert(MIB->getOpcode() == X86::MOV64ImmSExti8 ||
+           MIB->getOpcode() == X86::MOV32ImmSExti8);
+    // 64-bit mode doesn't have 32-bit push/pop, so use 64-bit operations and
+    // widen the register if necessary.
+    StackAdjustment = 8;
+    BuildMI(MBB, I, DL, get(X86::PUSH64i8)).addImm(Imm);
+    MIB->setDesc(get(X86::POP64r));
+    MIB->getOperand(0)
+        .setReg(getX86SubSuperRegister(MIB->getOperand(0).getReg(), MVT::i64));
+  } else {
+    assert(MIB->getOpcode() == X86::MOV32ImmSExti8);
+    StackAdjustment = 4;
+    BuildMI(MBB, I, DL, get(X86::PUSH32i8)).addImm(Imm);
+    MIB->setDesc(get(X86::POP32r));
+  }
+
+  // Build CFI if necessary.
+  MachineFunction &MF = *MBB.getParent();
+  const X86FrameLowering *TFL = Subtarget.getFrameLowering();
+  bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
+  bool NeedsDwarfCFI =
+      !IsWin64Prologue &&
+      (MF.getMMI().hasDebugInfo() || MF.getFunction()->needsUnwindTableEntry());
+  bool EmitCFI = !TFL->hasFP(MF) && NeedsDwarfCFI;
+  if (EmitCFI) {
+    TFL->BuildCFI(MBB, I, DL,
+        MCCFIInstruction::createAdjustCfaOffset(nullptr, StackAdjustment));
+    TFL->BuildCFI(MBB, std::next(I), DL,
+        MCCFIInstruction::createAdjustCfaOffset(nullptr, -StackAdjustment));
+  }
+
+  return true;
+}
+
 // LoadStackGuard has so far only been implemented for 64-bit MachO. Different
 // code sequence is needed for other targets.
 static void expandLoadStackGuard(MachineInstrBuilder &MIB,
@ -5329,6 +5374,9 @@ bool X86InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
    return expandMOV32r1(MIB, *this, /*MinusOne=*/ false);
  case X86::MOV32r_1:
    return expandMOV32r1(MIB, *this, /*MinusOne=*/ true);
+  case X86::MOV32ImmSExti8:
+  case X86::MOV64ImmSExti8:
+    return ExpandMOVImmSExti8(MIB);
  case X86::SETB_C8r:
    return Expand2AddrUndef(MIB, get(X86::SBB8rr));
  case X86::SETB_C16r:
--- a/lib/Target/X86/X86InstrInfo.h
+++ b/lib/Target/X86/X86InstrInfo.h
@ -23,6 +23,7 @@
 #include "X86GenInstrInfo.inc"

 namespace llvm {
+  class MachineInstrBuilder;
  class X86RegisterInfo;
  class X86Subtarget;

@ -564,6 +565,9 @@ private:
  /// operand and follow operands form a reference to the stack frame.
  bool isFrameOperand(const MachineInstr *MI, unsigned int Op,
                      int &FrameIndex) const;
+
+  /// Expand the MOVImmSExti8 pseudo-instructions.
+  bool ExpandMOVImmSExti8(MachineInstrBuilder &MIB) const;
 };

 } // End llvm namespace
--- a/lib/Target/X86/X86InstrInfo.td
+++ b/lib/Target/X86/X86InstrInfo.td
@ -820,6 +820,8 @@ def In32BitMode  : Predicate<"Subtarget->is32Bit()">,
                             AssemblerPredicate<"Mode32Bit", "32-bit mode">;
 def IsWin64      : Predicate<"Subtarget->isTargetWin64()">;
 def NotWin64     : Predicate<"!Subtarget->isTargetWin64()">;
+def NotWin64WithoutFP : Predicate<"!Subtarget->isTargetWin64() ||"
+                                  "Subtarget->getFrameLowering()->hasFP(*MF)">;
 def IsPS4        : Predicate<"Subtarget->isTargetPS4()">;
 def NotPS4       : Predicate<"!Subtarget->isTargetPS4()">;
 def IsNaCl       : Predicate<"Subtarget->isTargetNaCl()">;
@ -833,6 +835,7 @@ def NearData     : Predicate<"TM.getCodeModel() == CodeModel::Small ||"
 def IsStatic     : Predicate<"TM.getRelocationModel() == Reloc::Static">;
 def IsNotPIC     : Predicate<"TM.getRelocationModel() != Reloc::PIC_">;
 def OptForSize   : Predicate<"OptForSize">;
+def OptForMinSize : Predicate<"OptForMinSize">;
 def OptForSpeed  : Predicate<"!OptForSize">;
 def FastBTMem    : Predicate<"!Subtarget->isBTMemSlow()">;
 def CallImmAddr  : Predicate<"Subtarget->IsLegalToCallImmediateAddr(TM)">;
--- a/test/CodeGen/X86/materialize-one.ll
+++ b/test/CodeGen/X86/materialize-one.ll
@ -1,100 +0,0 @@
-; RUN: llc -mtriple=i686-unknown-linux-gnu -mattr=+cmov %s -o - | FileCheck %s --check-prefix=CHECK32
-; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+cmov %s -o - | FileCheck %s --check-prefix=CHECK64
-
-define i32 @one32() optsize {
-entry:
-  ret i32 1
-
-; CHECK32-LABEL: one32
-; CHECK32:       xorl %eax, %eax
-; CHECK32-NEXT:  incl %eax
-; CHECK32-NEXT:  ret
-
-; FIXME: Figure out the best approach in 64-bit mode.
-; CHECK64-LABEL: one32
-; CHECK64:       movl $1, %eax
-; CHECK64-NEXT:  retq
-}
-
-define i32 @minus_one32() optsize {
-entry:
-  ret i32 -1
-
-; CHECK32-LABEL: minus_one32
-; CHECK32:       xorl %eax, %eax
-; CHECK32-NEXT:  decl %eax
-; CHECK32-NEXT:  ret
-}
-
-define i16 @one16() optsize {
-entry:
-  ret i16 1
-
-; CHECK32-LABEL: one16
-; CHECK32:       xorl %eax, %eax
-; CHECK32-NEXT:  incl %eax
-; CHECK32-NEXT:  retl
-}
-
-define i16 @minus_one16() optsize {
-entry:
-  ret i16 -1
-
-; CHECK32-LABEL: minus_one16
-; CHECK32:       xorl %eax, %eax
-; CHECK32-NEXT:  decl %eax
-; CHECK32-NEXT:  retl
-}
-
-define i32 @test_rematerialization() optsize {
-entry:
-  ; Materialize -1 (thiscall forces it into %ecx).
-  tail call x86_thiscallcc void @f(i32 -1)
-
-  ; Clobber all registers except %esp, leaving nowhere to store the -1 besides
-  ; spilling it to the stack.
-  tail call void asm sideeffect "", "~{eax},~{ebx},~{ecx},~{edx},~{edi},~{esi},~{ebp},~{dirflag},~{fpsr},~{flags}"()
-
-  ; -1 should be re-materialized here instead of getting spilled above.
-  ret i32 -1
-
-; CHECK32-LABEL: test_rematerialization
-; CHECK32:       xorl %ecx, %ecx
-; CHECK32-NEXT:  decl %ecx
-; CHECK32:       calll
-; CHECK32:       xorl %eax, %eax
-; CHECK32-NEXT:  decl %eax
-; CHECK32-NOT:   %eax
-; CHECK32:       retl
-}
-
-define i32 @test_rematerialization2(i32 %x) optsize {
-entry:
-  ; Materialize -1 (thiscall forces it into %ecx).
-  tail call x86_thiscallcc void @f(i32 -1)
-
-  ; Clobber all registers except %esp, leaving nowhere to store the -1 besides
-  ; spilling it to the stack.
-  tail call void asm sideeffect "", "~{eax},~{ebx},~{ecx},~{edx},~{edi},~{esi},~{ebp},~{dirflag},~{fpsr},~{flags}"()
-
-  ; Define eflags.
-  %a = icmp ne i32 %x, 123
-  %b = zext i1 %a to i32
-  ; Cause -1 to be rematerialized right in front of the cmov, which needs eflags.
-  ; It must therefore not use the xor-dec lowering.
-  %c = select i1 %a, i32 %b, i32 -1
-  ret i32 %c
-
-; CHECK32-LABEL: test_rematerialization2
-; CHECK32:       xorl %ecx, %ecx
-; CHECK32-NEXT:  decl %ecx
-; CHECK32:       calll
-; CHECK32:       cmpl
-; CHECK32:       setne
-; CHECK32-NOT:   xorl
-; CHECK32:       movl $-1
-; CHECK32:       cmov
-; CHECK32:       retl
-}
-
-declare x86_thiscallcc void @f(i32)
--- a/test/CodeGen/X86/materialize.ll
+++ b/test/CodeGen/X86/materialize.ll
@ -0,0 +1,184 @@
+; RUN: llc -mtriple=i686-unknown-linux-gnu -mattr=+cmov %s -o - | FileCheck %s --check-prefix=CHECK32
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+cmov %s -o - | FileCheck %s --check-prefix=CHECK64
+; RUN: llc -mtriple=x86_64-pc-win32 -mattr=+cmov %s -o - | FileCheck %s --check-prefix=CHECKWIN64
+
+define i32 @one32_nooptsize() {
+entry:
+  ret i32 1
+
+; When not optimizing for size, use mov.
+; CHECK32-LABEL: one32_nooptsize:
+; CHECK32:       movl $1, %eax
+; CHECK32-NEXT:  retl
+; CHECK64-LABEL: one32_nooptsize:
+; CHECK64:       movl $1, %eax
+; CHECK64-NEXT:  retq
+}
+
+define i32 @one32() optsize {
+entry:
+  ret i32 1
+
+; CHECK32-LABEL: one32:
+; CHECK32:       xorl %eax, %eax
+; CHECK32-NEXT:  incl %eax
+; CHECK32-NEXT:  retl
+
+; FIXME: Figure out the best approach in 64-bit mode.
+; CHECK64-LABEL: one32:
+; CHECK64:       movl $1, %eax
+; CHECK64-NEXT:  retq
+}
+
+define i32 @one32_minsize() minsize {
+entry:
+  ret i32 1
+
+; On 32-bit, xor-inc is preferred over push-pop.
+; CHECK32-LABEL: one32_minsize:
+; CHECK32:       xorl %eax, %eax
+; CHECK32-NEXT:  incl %eax
+; CHECK32-NEXT:  retl
+
+; On 64-bit we don't do xor-inc yet, so push-pop it is. Note that we have to
+; pop into a 64-bit register even when we just need 32 bits.
+; CHECK64-LABEL: one32_minsize:
+; CHECK64:       pushq $1
+; CHECK64:       .cfi_adjust_cfa_offset 8
+; CHECK64:       popq %rax
+; CHECK64:       .cfi_adjust_cfa_offset -8
+; CHECK64-NEXT:  retq
+}
+
+define i64 @one64_minsize() minsize {
+entry:
+  ret i64 1
+; On 64-bit we don't do xor-inc yet, so push-pop it is.
+; CHECK64-LABEL: one64_minsize:
+; CHECK64:       pushq $1
+; CHECK64:       .cfi_adjust_cfa_offset 8
+; CHECK64:       popq %rax
+; CHECK64:       .cfi_adjust_cfa_offset -8
+; CHECK64-NEXT:  retq
+
+; On Win64 we can't adjust the stack unless there's a frame pointer.
+; CHECKWIN64-LABEL: one64_minsize:
+; CHECKWIN64:       movl $1, %eax
+; CHECKWIN64-NEXT:  retq
+}
+
+define i32 @minus_one32() optsize {
+entry:
+  ret i32 -1
+
+; CHECK32-LABEL: minus_one32:
+; CHECK32:       xorl %eax, %eax
+; CHECK32-NEXT:  decl %eax
+; CHECK32-NEXT:  retl
+}
+
+define i32 @minus_one32_minsize() minsize {
+entry:
+  ret i32 -1
+
+; xor-dec is preferred over push-pop.
+; CHECK32-LABEL: minus_one32_minsize:
+; CHECK32:       xorl %eax, %eax
+; CHECK32-NEXT:  decl %eax
+; CHECK32-NEXT:  retl
+}
+
+define i16 @one16() optsize {
+entry:
+  ret i16 1
+
+; CHECK32-LABEL: one16:
+; CHECK32:       xorl %eax, %eax
+; CHECK32-NEXT:  incl %eax
+; CHECK32-NEXT:  retl
+}
+
+define i16 @minus_one16() optsize {
+entry:
+  ret i16 -1
+
+; CHECK32-LABEL: minus_one16:
+; CHECK32:       xorl %eax, %eax
+; CHECK32-NEXT:  decl %eax
+; CHECK32-NEXT:  retl
+}
+
+define i32 @minus_five32() minsize {
+entry:
+  ret i32 -5
+
+; CHECK32-LABEL: minus_five32:
+; CHECK32: pushl $-5
+; CHECK32: popl %eax
+; CHECK32: retl
+}
+
+define i64 @minus_five64() minsize {
+entry:
+  ret i64 -5
+
+; CHECK64-LABEL: minus_five64:
+; CHECK64: pushq $-5
+; CHECK64:       .cfi_adjust_cfa_offset 8
+; CHECK64: popq %rax
+; CHECK64:       .cfi_adjust_cfa_offset -8
+; CHECK64: retq
+}
+
+define i32 @rematerialize_minus_one() optsize {
+entry:
+  ; Materialize -1 (thiscall forces it into %ecx).
+  tail call x86_thiscallcc void @f(i32 -1)
+
+  ; Clobber all registers except %esp, leaving nowhere to store the -1 besides
+  ; spilling it to the stack.
+  tail call void asm sideeffect "", "~{eax},~{ebx},~{ecx},~{edx},~{edi},~{esi},~{ebp},~{dirflag},~{fpsr},~{flags}"()
+
+  ; -1 should be re-materialized here instead of getting spilled above.
+  ret i32 -1
+
+; CHECK32-LABEL: rematerialize_minus_one
+; CHECK32:       xorl %ecx, %ecx
+; CHECK32-NEXT:  decl %ecx
+; CHECK32:       calll
+; CHECK32:       xorl %eax, %eax
+; CHECK32-NEXT:  decl %eax
+; CHECK32-NOT:   %eax
+; CHECK32:       retl
+}
+
+define i32 @rematerialize_minus_one_eflags(i32 %x) optsize {
+entry:
+  ; Materialize -1 (thiscall forces it into %ecx).
+  tail call x86_thiscallcc void @f(i32 -1)
+
+  ; Clobber all registers except %esp, leaving nowhere to store the -1 besides
+  ; spilling it to the stack.
+  tail call void asm sideeffect "", "~{eax},~{ebx},~{ecx},~{edx},~{edi},~{esi},~{ebp},~{dirflag},~{fpsr},~{flags}"()
+
+  ; Define eflags.
+  %a = icmp ne i32 %x, 123
+  %b = zext i1 %a to i32
+  ; Cause -1 to be rematerialized right in front of the cmov, which needs eflags.
+  ; It must therefore not use the xor-dec lowering.
+  %c = select i1 %a, i32 %b, i32 -1
+  ret i32 %c
+
+; CHECK32-LABEL: rematerialize_minus_one_eflags
+; CHECK32:       xorl %ecx, %ecx
+; CHECK32-NEXT:  decl %ecx
+; CHECK32:       calll
+; CHECK32:       cmpl
+; CHECK32:       setne
+; CHECK32-NOT:   xorl
+; CHECK32:       movl $-1
+; CHECK32:       cmov
+; CHECK32:       retl
+}
+
+declare x86_thiscallcc void @f(i32)
--- a/test/CodeGen/X86/powi.ll
+++ b/test/CodeGen/X86/powi.ll
@ -29,9 +29,9 @@ define double @pow_wrapper_optsize(double %a) optsize {
 define double @pow_wrapper_minsize(double %a) minsize {
 ; CHECK-LABEL: pow_wrapper_minsize:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movl  $15, %edi
+; CHECK-NEXT:    movl  $128, %edi
 ; CHECK-NEXT:    jmp
-  %ret = tail call double @llvm.powi.f64(double %a, i32 15) nounwind ; <double> [#uses=1]
+  %ret = tail call double @llvm.powi.f64(double %a, i32 128) nounwind ; <double> [#uses=1]
  ret double %ret
 }