1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-11-22 18:54:02 +01:00

[X86] Use push-pop for materializing small constants under 'minsize'

Use the 3-byte (4 with REX prefix) push-pop sequence for materializing
small constants. This is smaller than using a mov (5, 6 or 7 bytes
depending on size and REX prefix), but it's likely to be slower, so
it is only used for 'minsize'.

This is a follow-up to r255656.

Differential Revision: http://reviews.llvm.org/D15549

llvm-svn: 255936
This commit is contained in:
Hans Wennborg 2015-12-17 23:18:39 +00:00
parent 72c4ebb415
commit 6b696434e4
8 changed files with 262 additions and 106 deletions

View File

@ -157,9 +157,13 @@ namespace {
/// performance.
bool OptForSize;
/// If true, selector should try to optimize for minimum code size.
bool OptForMinSize;
public:
explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOpt::Level OptLevel)
: SelectionDAGISel(tm, OptLevel), OptForSize(false) {}
: SelectionDAGISel(tm, OptLevel), OptForSize(false),
OptForMinSize(false) {}
const char *getPassName() const override {
return "X86 DAG->DAG Instruction Selection";
@ -531,8 +535,10 @@ static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) {
}
void X86DAGToDAGISel::PreprocessISelDAG() {
// OptForSize is used in pattern predicates that isel is matching.
// OptFor[Min]Size are used in pattern predicates that isel is matching.
OptForSize = MF->getFunction()->optForSize();
OptForMinSize = MF->getFunction()->optForMinSize();
assert((!OptForMinSize || OptForSize) && "OptForMinSize implies OptForSize");
for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
E = CurDAG->allnodes_end(); I != E; ) {

View File

@ -250,7 +250,7 @@ def MORESTACK_RET_RESTORE_R10 : I<0, Pseudo, (outs), (ins),
// Alias instruction mapping movr0 to xor.
// FIXME: remove when we can teach regalloc that xor reg, reg is ok.
let Defs = [EFLAGS], isReMaterializable = 1, isAsCheapAsAMove = 1,
isPseudo = 1 in
isPseudo = 1, AddedComplexity = 20 in
def MOV32r0 : I<0, Pseudo, (outs GR32:$dst), (ins), "",
[(set GR32:$dst, 0)], IIC_ALU_NONMEM>, Sched<[WriteZero]>;
@ -263,7 +263,7 @@ def : Pat<(i64 0), (SUBREG_TO_REG (i64 0), (MOV32r0), sub_32bit)> {
}
let Predicates = [OptForSize, NotSlowIncDec, Not64BitMode],
AddedComplexity = 1 in {
AddedComplexity = 15 in {
// Pseudo instructions for materializing 1 and -1 using XOR+INC/DEC,
// which only require 3 bytes compared to MOV32ri which requires 5.
let Defs = [EFLAGS], isReMaterializable = 1, isPseudo = 1 in {
@ -278,6 +278,17 @@ let Predicates = [OptForSize, NotSlowIncDec, Not64BitMode],
def : Pat<(i16 -1), (EXTRACT_SUBREG (MOV32r_1), sub_16bit)>;
}
// Pseudo instructions for materializing a small constant (one that fits in a
// sign-extended 8-bit immediate) when optimizing for minimum size. They are
// expanded by X86InstrInfo::expandPostRAPseudo into a push-immediate /
// pop-register pair.
let isReMaterializable = 1, isPseudo = 1, AddedComplexity = 10 in {
// AddedComplexity higher than MOV64ri but lower than MOV32r0 and MOV32r1.
// (MOV32r0 uses 20 and the xor+inc/dec pseudos use 15, so those still win
// for 0, 1 and -1 wherever their own predicates hold.)
// FIXME: Add itinerary class and Schedule.
// 32-bit destination; only selected under 'minsize'.
def MOV32ImmSExti8 : I<0, Pseudo, (outs GR32:$dst), (ins i32i8imm:$src), "",
[(set GR32:$dst, i32immSExt8:$src)]>,
Requires<[OptForMinSize]>;
// 64-bit destination; additionally excluded on Win64 when the function has no
// frame pointer, because the stack cannot be adjusted there.
def MOV64ImmSExti8 : I<0, Pseudo, (outs GR64:$dst), (ins i64i8imm:$src), "",
[(set GR64:$dst, i64immSExt8:$src)]>,
Requires<[OptForMinSize, NotWin64WithoutFP]>;
}
// Materialize i64 constant where top 32-bits are zero. This could theoretically
// use MOV32ri with a SUBREG_TO_REG to represent the zero-extension, however
// that would make it more difficult to rematerialize.

View File

@ -23,6 +23,7 @@
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/StackMaps.h"
#include "llvm/IR/DerivedTypes.h"
@ -5297,6 +5298,50 @@ static bool expandMOV32r1(MachineInstrBuilder &MIB, const TargetInstrInfo &TII,
return true;
}
/// Expand the MOV32ImmSExti8 / MOV64ImmSExti8 pseudo-instructions into a
/// push-immediate / pop-register pair.
///
/// \p MIB wraps the pseudo. Its operand 0 is the destination register and its
/// operand 1 is the (non-zero) immediate to materialize. The push is inserted
/// in front of the pseudo, and the pseudo itself is rewritten in place into
/// the matching pop. If the function has no frame pointer and DWARF CFI is
/// needed, the CFA offset is adjusted after the push and restored after the
/// pop.
///
/// \returns true; the pseudo is always expanded.
bool X86InstrInfo::ExpandMOVImmSExti8(MachineInstrBuilder &MIB) const {
  MachineBasicBlock &MBB = *MIB->getParent();
  DebugLoc DL = MIB->getDebugLoc();
  int64_t Imm = MIB->getOperand(1).getImm();
  assert(Imm != 0 && "Using push/pop for 0 is not efficient.");
  MachineBasicBlock::iterator I = MIB.getInstr();

  // Number of bytes the push moves the stack pointer by; used for CFI below.
  int StackAdjustment;

  // NOTE(review): the push writes just below the current stack pointer.
  // Confirm this expansion cannot be reached in functions that keep live data
  // in the x86-64 red zone.
  if (Subtarget.is64Bit()) {
    assert(MIB->getOpcode() == X86::MOV64ImmSExti8 ||
           MIB->getOpcode() == X86::MOV32ImmSExti8);
    // 64-bit mode doesn't have 32-bit push/pop, so use 64-bit operations and
    // widen the register if necessary.
    StackAdjustment = 8;
    BuildMI(MBB, I, DL, get(X86::PUSH64i8)).addImm(Imm);
    MIB->setDesc(get(X86::POP64r));
    MIB->getOperand(0)
        .setReg(getX86SubSuperRegister(MIB->getOperand(0).getReg(), MVT::i64));
  } else {
    assert(MIB->getOpcode() == X86::MOV32ImmSExti8);
    StackAdjustment = 4;
    BuildMI(MBB, I, DL, get(X86::PUSH32i8)).addImm(Imm);
    MIB->setDesc(get(X86::POP32r));
  }

  // The immediate now lives on the PUSH. The POP only takes the destination
  // register, so drop the stale immediate operand left over from the pseudo.
  MIB->RemoveOperand(1);

  // Build CFI if necessary.
  MachineFunction &MF = *MBB.getParent();
  const X86FrameLowering *TFL = Subtarget.getFrameLowering();
  bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
  bool NeedsDwarfCFI =
      !IsWin64Prologue &&
      (MF.getMMI().hasDebugInfo() || MF.getFunction()->needsUnwindTableEntry());
  // With a frame pointer the CFA does not depend on the stack pointer, so no
  // adjustment is emitted in that case.
  bool EmitCFI = !TFL->hasFP(MF) && NeedsDwarfCFI;
  if (EmitCFI) {
    // Inserted before the pop, i.e. right after the push.
    TFL->BuildCFI(MBB, I, DL,
        MCCFIInstruction::createAdjustCfaOffset(nullptr, StackAdjustment));
    // Inserted after the pop, undoing the adjustment.
    TFL->BuildCFI(MBB, std::next(I), DL,
        MCCFIInstruction::createAdjustCfaOffset(nullptr, -StackAdjustment));
  }

  return true;
}
// LoadStackGuard has so far only been implemented for 64-bit MachO. Different
// code sequence is needed for other targets.
static void expandLoadStackGuard(MachineInstrBuilder &MIB,
@ -5329,6 +5374,9 @@ bool X86InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
return expandMOV32r1(MIB, *this, /*MinusOne=*/ false);
case X86::MOV32r_1:
return expandMOV32r1(MIB, *this, /*MinusOne=*/ true);
case X86::MOV32ImmSExti8:
case X86::MOV64ImmSExti8:
return ExpandMOVImmSExti8(MIB);
case X86::SETB_C8r:
return Expand2AddrUndef(MIB, get(X86::SBB8rr));
case X86::SETB_C16r:

View File

@ -23,6 +23,7 @@
#include "X86GenInstrInfo.inc"
namespace llvm {
class MachineInstrBuilder;
class X86RegisterInfo;
class X86Subtarget;
@ -564,6 +565,9 @@ private:
/// operand and follow operands form a reference to the stack frame.
bool isFrameOperand(const MachineInstr *MI, unsigned int Op,
int &FrameIndex) const;
/// Expand the MOV32ImmSExti8 and MOV64ImmSExti8 pseudo-instructions into a
/// push-immediate / pop-register sequence, adding CFA-offset CFI around it
/// when DWARF CFI is needed and there is no frame pointer. Always returns
/// true.
bool ExpandMOVImmSExti8(MachineInstrBuilder &MIB) const;
};
} // End llvm namespace

View File

@ -820,6 +820,8 @@ def In32BitMode : Predicate<"Subtarget->is32Bit()">,
AssemblerPredicate<"Mode32Bit", "32-bit mode">;
def IsWin64 : Predicate<"Subtarget->isTargetWin64()">;
def NotWin64 : Predicate<"!Subtarget->isTargetWin64()">;
// Read as NOT(Win64 without a frame pointer): true on every non-Win64 target,
// and on Win64 only when the current function has a frame pointer.
def NotWin64WithoutFP : Predicate<"!Subtarget->isTargetWin64() ||"
"Subtarget->getFrameLowering()->hasFP(*MF)">;
def IsPS4 : Predicate<"Subtarget->isTargetPS4()">;
def NotPS4 : Predicate<"!Subtarget->isTargetPS4()">;
def IsNaCl : Predicate<"Subtarget->isTargetNaCl()">;
@ -833,6 +835,7 @@ def NearData : Predicate<"TM.getCodeModel() == CodeModel::Small ||"
def IsStatic : Predicate<"TM.getRelocationModel() == Reloc::Static">;
def IsNotPIC : Predicate<"TM.getRelocationModel() != Reloc::PIC_">;
def OptForSize : Predicate<"OptForSize">;
def OptForMinSize : Predicate<"OptForMinSize">;
def OptForSpeed : Predicate<"!OptForSize">;
def FastBTMem : Predicate<"!Subtarget->isBTMemSlow()">;
def CallImmAddr : Predicate<"Subtarget->IsLegalToCallImmediateAddr(TM)">;

View File

@ -1,100 +0,0 @@
; RUN: llc -mtriple=i686-unknown-linux-gnu -mattr=+cmov %s -o - | FileCheck %s --check-prefix=CHECK32
; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+cmov %s -o - | FileCheck %s --check-prefix=CHECK64
define i32 @one32() optsize {
entry:
ret i32 1
; CHECK32-LABEL: one32
; CHECK32: xorl %eax, %eax
; CHECK32-NEXT: incl %eax
; CHECK32-NEXT: ret
; FIXME: Figure out the best approach in 64-bit mode.
; CHECK64-LABEL: one32
; CHECK64: movl $1, %eax
; CHECK64-NEXT: retq
}
define i32 @minus_one32() optsize {
entry:
ret i32 -1
; CHECK32-LABEL: minus_one32
; CHECK32: xorl %eax, %eax
; CHECK32-NEXT: decl %eax
; CHECK32-NEXT: ret
}
define i16 @one16() optsize {
entry:
ret i16 1
; CHECK32-LABEL: one16
; CHECK32: xorl %eax, %eax
; CHECK32-NEXT: incl %eax
; CHECK32-NEXT: retl
}
define i16 @minus_one16() optsize {
entry:
ret i16 -1
; CHECK32-LABEL: minus_one16
; CHECK32: xorl %eax, %eax
; CHECK32-NEXT: decl %eax
; CHECK32-NEXT: retl
}
define i32 @test_rematerialization() optsize {
entry:
; Materialize -1 (thiscall forces it into %ecx).
tail call x86_thiscallcc void @f(i32 -1)
; Clobber all registers except %esp, leaving nowhere to store the -1 besides
; spilling it to the stack.
tail call void asm sideeffect "", "~{eax},~{ebx},~{ecx},~{edx},~{edi},~{esi},~{ebp},~{dirflag},~{fpsr},~{flags}"()
; -1 should be re-materialized here instead of getting spilled above.
ret i32 -1
; CHECK32-LABEL: test_rematerialization
; CHECK32: xorl %ecx, %ecx
; CHECK32-NEXT: decl %ecx
; CHECK32: calll
; CHECK32: xorl %eax, %eax
; CHECK32-NEXT: decl %eax
; CHECK32-NOT: %eax
; CHECK32: retl
}
define i32 @test_rematerialization2(i32 %x) optsize {
entry:
; Materialize -1 (thiscall forces it into %ecx).
tail call x86_thiscallcc void @f(i32 -1)
; Clobber all registers except %esp, leaving nowhere to store the -1 besides
; spilling it to the stack.
tail call void asm sideeffect "", "~{eax},~{ebx},~{ecx},~{edx},~{edi},~{esi},~{ebp},~{dirflag},~{fpsr},~{flags}"()
; Define eflags.
%a = icmp ne i32 %x, 123
%b = zext i1 %a to i32
; Cause -1 to be rematerialized right in front of the cmov, which needs eflags.
; It must therefore not use the xor-dec lowering.
%c = select i1 %a, i32 %b, i32 -1
ret i32 %c
; CHECK32-LABEL: test_rematerialization2
; CHECK32: xorl %ecx, %ecx
; CHECK32-NEXT: decl %ecx
; CHECK32: calll
; CHECK32: cmpl
; CHECK32: setne
; CHECK32-NOT: xorl
; CHECK32: movl $-1
; CHECK32: cmov
; CHECK32: retl
}
declare x86_thiscallcc void @f(i32)

View File

@ -0,0 +1,184 @@
; RUN: llc -mtriple=i686-unknown-linux-gnu -mattr=+cmov %s -o - | FileCheck %s --check-prefix=CHECK32
; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+cmov %s -o - | FileCheck %s --check-prefix=CHECK64
; RUN: llc -mtriple=x86_64-pc-win32 -mattr=+cmov %s -o - | FileCheck %s --check-prefix=CHECKWIN64
; Baseline with no size attribute: constant 1 is expected to be materialized
; with a plain mov on both the 32-bit and the 64-bit Linux targets.
define i32 @one32_nooptsize() {
entry:
ret i32 1
; When not optimizing for size, use mov.
; CHECK32-LABEL: one32_nooptsize:
; CHECK32: movl $1, %eax
; CHECK32-NEXT: retl
; CHECK64-LABEL: one32_nooptsize:
; CHECK64: movl $1, %eax
; CHECK64-NEXT: retq
}
; optsize: constant 1 is expected to use xor+inc on the 32-bit target, while
; the 64-bit target still uses mov.
define i32 @one32() optsize {
entry:
ret i32 1
; CHECK32-LABEL: one32:
; CHECK32: xorl %eax, %eax
; CHECK32-NEXT: incl %eax
; CHECK32-NEXT: retl
; FIXME: Figure out the best approach in 64-bit mode.
; CHECK64-LABEL: one32:
; CHECK64: movl $1, %eax
; CHECK64-NEXT: retq
}
; minsize: constant 1 as i32. The 32-bit target keeps xor+inc; the 64-bit
; target is expected to use push/pop with CFA-offset adjustments around it.
define i32 @one32_minsize() minsize {
entry:
ret i32 1
; On 32-bit, xor-inc is preferred over push-pop.
; CHECK32-LABEL: one32_minsize:
; CHECK32: xorl %eax, %eax
; CHECK32-NEXT: incl %eax
; CHECK32-NEXT: retl
; On 64-bit we don't do xor-inc yet, so push-pop it is. Note that we have to
; pop into a 64-bit register even when we just need 32 bits.
; CHECK64-LABEL: one32_minsize:
; CHECK64: pushq $1
; CHECK64: .cfi_adjust_cfa_offset 8
; CHECK64: popq %rax
; CHECK64: .cfi_adjust_cfa_offset -8
; CHECK64-NEXT: retq
}
; minsize: constant 1 as i64. Checked on x86_64 Linux (push/pop expected) and
; on Win64 (mov expected); there are no 32-bit checks for this function.
define i64 @one64_minsize() minsize {
entry:
ret i64 1
; On 64-bit we don't do xor-inc yet, so push-pop it is.
; CHECK64-LABEL: one64_minsize:
; CHECK64: pushq $1
; CHECK64: .cfi_adjust_cfa_offset 8
; CHECK64: popq %rax
; CHECK64: .cfi_adjust_cfa_offset -8
; CHECK64-NEXT: retq
; On Win64 we can't adjust the stack unless there's a frame pointer.
; CHECKWIN64-LABEL: one64_minsize:
; CHECKWIN64: movl $1, %eax
; CHECKWIN64-NEXT: retq
}
; optsize: constant -1 is expected to use xor+dec on the 32-bit target. Only
; the 32-bit output is checked.
define i32 @minus_one32() optsize {
entry:
ret i32 -1
; CHECK32-LABEL: minus_one32:
; CHECK32: xorl %eax, %eax
; CHECK32-NEXT: decl %eax
; CHECK32-NEXT: retl
}
; minsize: constant -1 on the 32-bit target. Only the 32-bit output is
; checked.
define i32 @minus_one32_minsize() minsize {
entry:
ret i32 -1
; xor-dec is preferred over push-pop.
; CHECK32-LABEL: minus_one32_minsize:
; CHECK32: xorl %eax, %eax
; CHECK32-NEXT: decl %eax
; CHECK32-NEXT: retl
}
; optsize: an i16 constant 1 is expected to be built in the full 32-bit
; register with xor+inc. Only the 32-bit output is checked.
define i16 @one16() optsize {
entry:
ret i16 1
; CHECK32-LABEL: one16:
; CHECK32: xorl %eax, %eax
; CHECK32-NEXT: incl %eax
; CHECK32-NEXT: retl
}
; optsize: an i16 constant -1 is expected to be built in the full 32-bit
; register with xor+dec. Only the 32-bit output is checked.
define i16 @minus_one16() optsize {
entry:
ret i16 -1
; CHECK32-LABEL: minus_one16:
; CHECK32: xorl %eax, %eax
; CHECK32-NEXT: decl %eax
; CHECK32-NEXT: retl
}
; minsize: -5 is a small constant other than 0, 1 or -1, so the 32-bit target
; is expected to use push/pop. Only the 32-bit output is checked.
define i32 @minus_five32() minsize {
entry:
ret i32 -5
; CHECK32-LABEL: minus_five32:
; CHECK32: pushl $-5
; CHECK32: popl %eax
; CHECK32: retl
}
; minsize: i64 constant -5 on x86_64 Linux is expected to use push/pop with
; CFA-offset adjustments around it. Only the 64-bit Linux output is checked.
define i64 @minus_five64() minsize {
entry:
ret i64 -5
; CHECK64-LABEL: minus_five64:
; CHECK64: pushq $-5
; CHECK64: .cfi_adjust_cfa_offset 8
; CHECK64: popq %rax
; CHECK64: .cfi_adjust_cfa_offset -8
; CHECK64: retq
}
; optsize: checks that -1 is rematerialized with xor+dec after every
; general-purpose register has been clobbered, instead of being spilled.
define i32 @rematerialize_minus_one() optsize {
entry:
; Materialize -1 (thiscall forces it into %ecx).
tail call x86_thiscallcc void @f(i32 -1)
; Clobber all registers except %esp, leaving nowhere to store the -1 besides
; spilling it to the stack.
tail call void asm sideeffect "", "~{eax},~{ebx},~{ecx},~{edx},~{edi},~{esi},~{ebp},~{dirflag},~{fpsr},~{flags}"()
; -1 should be re-materialized here instead of getting spilled above.
ret i32 -1
; The trailing colon anchors the label to this function's own symbol; without
; it the name is also a prefix of rematerialize_minus_one_eflags.
; CHECK32-LABEL: rematerialize_minus_one:
; CHECK32: xorl %ecx, %ecx
; CHECK32-NEXT: decl %ecx
; CHECK32: calll
; CHECK32: xorl %eax, %eax
; CHECK32-NEXT: decl %eax
; CHECK32-NOT: %eax
; CHECK32: retl
}
; optsize: checks that when -1 has to be rematerialized while EFLAGS is live
; (between the compare and the cmov), a plain mov is used instead of the
; flag-clobbering xor+dec sequence.
define i32 @rematerialize_minus_one_eflags(i32 %x) optsize {
entry:
; Materialize -1 (thiscall forces it into %ecx).
tail call x86_thiscallcc void @f(i32 -1)
; Clobber all registers except %esp, leaving nowhere to store the -1 besides
; spilling it to the stack.
tail call void asm sideeffect "", "~{eax},~{ebx},~{ecx},~{edx},~{edi},~{esi},~{ebp},~{dirflag},~{fpsr},~{flags}"()
; Define eflags.
%a = icmp ne i32 %x, 123
%b = zext i1 %a to i32
; Cause -1 to be rematerialized right in front of the cmov, which needs eflags.
; It must therefore not use the xor-dec lowering.
%c = select i1 %a, i32 %b, i32 -1
ret i32 %c
; The trailing colon matches the label style used by the other functions in
; this file and anchors the check to the symbol definition.
; CHECK32-LABEL: rematerialize_minus_one_eflags:
; CHECK32: xorl %ecx, %ecx
; CHECK32-NEXT: decl %ecx
; CHECK32: calll
; CHECK32: cmpl
; CHECK32: setne
; CHECK32-NOT: xorl
; CHECK32: movl $-1
; CHECK32: cmov
; CHECK32: retl
}
declare x86_thiscallcc void @f(i32)

View File

@ -29,9 +29,9 @@ define double @pow_wrapper_optsize(double %a) optsize {
define double @pow_wrapper_minsize(double %a) minsize {
; CHECK-LABEL: pow_wrapper_minsize:
; CHECK: # BB#0:
; CHECK-NEXT: movl $15, %edi
; CHECK-NEXT: movl $128, %edi
; CHECK-NEXT: jmp
%ret = tail call double @llvm.powi.f64(double %a, i32 15) nounwind ; <double> [#uses=1]
%ret = tail call double @llvm.powi.f64(double %a, i32 128) nounwind ; <double> [#uses=1]
ret double %ret
}