mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-10-21 03:53:04 +02:00
Enable register mask operands for x86 calls.
Call instructions no longer have a list of 43 call-clobbered registers. Instead, they get a single register mask operand with a bit vector of call-preserved registers. This saves a lot of memory, 42 x 32 bytes = 1344 bytes per call instruction, and it speeds up building call instructions because those 43 imp-def operands no longer need to be added to use-def lists. (And removed and shifted and re-added for every explicit call operand). Passes like LiveVariables, LiveIntervals, RAGreedy, PEI, and BranchFolding are significantly faster because they can deal with call clobbers in bulk. Overall, clang -O2 is between 0% and 8% faster, uniformly distributed depending on call density in the compiled code. Debug builds using clang -O0 are 0% - 3% faster. I have verified that this patch doesn't change the assembly generated for the LLVM nightly test suite when building with -disable-copyprop and -disable-branch-fold. Branch folding behaves slightly differently in a few cases because call instructions have different hash values now. Copy propagation flushes its data structures when it crosses a register mask operand. This causes it to leave a few dead copies behind, on the order of 20 instructions across the entire nightly test suite, including SPEC. Fixing this properly would require the pass to use different data structures. llvm-svn: 150638
This commit is contained in:
parent
29bf5e7b09
commit
278e98bcc9
@ -1853,6 +1853,10 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) {
|
|||||||
for (unsigned i = 0, e = RegArgs.size(); i != e; ++i)
|
for (unsigned i = 0, e = RegArgs.size(); i != e; ++i)
|
||||||
MIB.addReg(RegArgs[i]);
|
MIB.addReg(RegArgs[i]);
|
||||||
|
|
||||||
|
// Add a register mask with the call-preserved registers.
|
||||||
|
// Proper defs for return values will be added by setPhysRegsDeadExcept().
|
||||||
|
MIB.addRegMask(TRI.getCallPreservedMask(CS.getCallingConv()));
|
||||||
|
|
||||||
// Issue CALLSEQ_END
|
// Issue CALLSEQ_END
|
||||||
unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
|
unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
|
||||||
unsigned NumBytesCallee = 0;
|
unsigned NumBytesCallee = 0;
|
||||||
|
@ -44,7 +44,6 @@
|
|||||||
#include "llvm/ADT/StringExtras.h"
|
#include "llvm/ADT/StringExtras.h"
|
||||||
#include "llvm/ADT/VariadicFunction.h"
|
#include "llvm/ADT/VariadicFunction.h"
|
||||||
#include "llvm/Support/CallSite.h"
|
#include "llvm/Support/CallSite.h"
|
||||||
#include "llvm/Support/CommandLine.h"
|
|
||||||
#include "llvm/Support/Debug.h"
|
#include "llvm/Support/Debug.h"
|
||||||
#include "llvm/Support/Dwarf.h"
|
#include "llvm/Support/Dwarf.h"
|
||||||
#include "llvm/Support/ErrorHandling.h"
|
#include "llvm/Support/ErrorHandling.h"
|
||||||
@ -57,9 +56,6 @@ using namespace dwarf;
|
|||||||
|
|
||||||
STATISTIC(NumTailCalls, "Number of tail calls");
|
STATISTIC(NumTailCalls, "Number of tail calls");
|
||||||
|
|
||||||
static cl::opt<bool> UseRegMask("x86-use-regmask",
|
|
||||||
cl::desc("Use register masks for x86 calls"));
|
|
||||||
|
|
||||||
// Forward declarations.
|
// Forward declarations.
|
||||||
static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
|
static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
|
||||||
SDValue V2);
|
SDValue V2);
|
||||||
@ -2510,13 +2506,11 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
|
|||||||
if (Is64Bit && isVarArg && !IsWin64)
|
if (Is64Bit && isVarArg && !IsWin64)
|
||||||
Ops.push_back(DAG.getRegister(X86::AL, MVT::i8));
|
Ops.push_back(DAG.getRegister(X86::AL, MVT::i8));
|
||||||
|
|
||||||
// Experimental: Add a register mask operand representing the call-preserved
|
// Add a register mask operand representing the call-preserved registers.
|
||||||
// registers.
|
const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
|
||||||
if (UseRegMask) {
|
const uint32_t *Mask = TRI->getCallPreservedMask(CallConv);
|
||||||
const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
|
assert(Mask && "Missing call preserved mask for calling convention");
|
||||||
if (const uint32_t *Mask = TRI->getCallPreservedMask(CallConv))
|
Ops.push_back(DAG.getRegisterMask(Mask));
|
||||||
Ops.push_back(DAG.getRegisterMask(Mask));
|
|
||||||
}
|
|
||||||
|
|
||||||
if (InFlag.getNode())
|
if (InFlag.getNode())
|
||||||
Ops.push_back(InFlag);
|
Ops.push_back(InFlag);
|
||||||
@ -12227,17 +12221,23 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB,
|
|||||||
BuildMI(bumpMBB, DL, TII->get(X86::JMP_4)).addMBB(continueMBB);
|
BuildMI(bumpMBB, DL, TII->get(X86::JMP_4)).addMBB(continueMBB);
|
||||||
|
|
||||||
// Calls into a routine in libgcc to allocate more space from the heap.
|
// Calls into a routine in libgcc to allocate more space from the heap.
|
||||||
|
const uint32_t *RegMask =
|
||||||
|
getTargetMachine().getRegisterInfo()->getCallPreservedMask(CallingConv::C);
|
||||||
if (Is64Bit) {
|
if (Is64Bit) {
|
||||||
BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
|
BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
|
||||||
.addReg(sizeVReg);
|
.addReg(sizeVReg);
|
||||||
BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
|
BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
|
||||||
.addExternalSymbol("__morestack_allocate_stack_space").addReg(X86::RDI);
|
.addExternalSymbol("__morestack_allocate_stack_space").addReg(X86::RDI)
|
||||||
|
.addRegMask(RegMask)
|
||||||
|
.addReg(X86::RAX, RegState::ImplicitDefine);
|
||||||
} else {
|
} else {
|
||||||
BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
|
BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
|
||||||
.addImm(12);
|
.addImm(12);
|
||||||
BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
|
BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
|
||||||
BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
|
BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
|
||||||
.addExternalSymbol("__morestack_allocate_stack_space");
|
.addExternalSymbol("__morestack_allocate_stack_space")
|
||||||
|
.addRegMask(RegMask)
|
||||||
|
.addReg(X86::EAX, RegState::ImplicitDefine);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!Is64Bit)
|
if (!Is64Bit)
|
||||||
@ -12335,6 +12335,11 @@ X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI,
|
|||||||
assert(Subtarget->isTargetDarwin() && "Darwin only instr emitted?");
|
assert(Subtarget->isTargetDarwin() && "Darwin only instr emitted?");
|
||||||
assert(MI->getOperand(3).isGlobal() && "This should be a global");
|
assert(MI->getOperand(3).isGlobal() && "This should be a global");
|
||||||
|
|
||||||
|
// Get a register mask for the lowered call.
|
||||||
|
// FIXME: The 32-bit calls have non-standard calling conventions. Use a
|
||||||
|
// proper register mask.
|
||||||
|
const uint32_t *RegMask =
|
||||||
|
getTargetMachine().getRegisterInfo()->getCallPreservedMask(CallingConv::C);
|
||||||
if (Subtarget->is64Bit()) {
|
if (Subtarget->is64Bit()) {
|
||||||
MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
|
MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
|
||||||
TII->get(X86::MOV64rm), X86::RDI)
|
TII->get(X86::MOV64rm), X86::RDI)
|
||||||
@ -12345,6 +12350,7 @@ X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI,
|
|||||||
.addReg(0);
|
.addReg(0);
|
||||||
MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
|
MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
|
||||||
addDirectMem(MIB, X86::RDI);
|
addDirectMem(MIB, X86::RDI);
|
||||||
|
MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
|
||||||
} else if (getTargetMachine().getRelocationModel() != Reloc::PIC_) {
|
} else if (getTargetMachine().getRelocationModel() != Reloc::PIC_) {
|
||||||
MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
|
MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
|
||||||
TII->get(X86::MOV32rm), X86::EAX)
|
TII->get(X86::MOV32rm), X86::EAX)
|
||||||
@ -12355,6 +12361,7 @@ X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI,
|
|||||||
.addReg(0);
|
.addReg(0);
|
||||||
MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
|
MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
|
||||||
addDirectMem(MIB, X86::EAX);
|
addDirectMem(MIB, X86::EAX);
|
||||||
|
MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
|
||||||
} else {
|
} else {
|
||||||
MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
|
MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
|
||||||
TII->get(X86::MOV32rm), X86::EAX)
|
TII->get(X86::MOV32rm), X86::EAX)
|
||||||
@ -12365,6 +12372,7 @@ X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI,
|
|||||||
.addReg(0);
|
.addReg(0);
|
||||||
MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
|
MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
|
||||||
addDirectMem(MIB, X86::EAX);
|
addDirectMem(MIB, X86::EAX);
|
||||||
|
MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
|
||||||
}
|
}
|
||||||
|
|
||||||
MI->eraseFromParent(); // The pseudo instruction is gone now.
|
MI->eraseFromParent(); // The pseudo instruction is gone now.
|
||||||
|
@ -141,11 +141,7 @@ let isCall = 1 in
|
|||||||
// a use to prevent stack-pointer assignments that appear immediately
|
// a use to prevent stack-pointer assignments that appear immediately
|
||||||
// before calls from potentially appearing dead. Uses for argument
|
// before calls from potentially appearing dead. Uses for argument
|
||||||
// registers are added manually.
|
// registers are added manually.
|
||||||
let Defs = [EAX, ECX, EDX, FP0, FP1, FP2, FP3, FP4, FP5, FP6, ST0,
|
let Uses = [ESP] in {
|
||||||
MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7,
|
|
||||||
XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
|
|
||||||
XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS],
|
|
||||||
Uses = [ESP] in {
|
|
||||||
def CALLpcrel32 : Ii32PCRel<0xE8, RawFrm,
|
def CALLpcrel32 : Ii32PCRel<0xE8, RawFrm,
|
||||||
(outs), (ins i32imm_pcrel:$dst,variable_ops),
|
(outs), (ins i32imm_pcrel:$dst,variable_ops),
|
||||||
"call{l}\t$dst", [], IIC_CALL_RI>, Requires<[In32BitMode]>;
|
"call{l}\t$dst", [], IIC_CALL_RI>, Requires<[In32BitMode]>;
|
||||||
@ -182,11 +178,7 @@ let isCall = 1 in
|
|||||||
|
|
||||||
let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
|
let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
|
||||||
isCodeGenOnly = 1 in
|
isCodeGenOnly = 1 in
|
||||||
let Defs = [EAX, ECX, EDX, FP0, FP1, FP2, FP3, FP4, FP5, FP6, ST0,
|
let Uses = [ESP] in {
|
||||||
MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7,
|
|
||||||
XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
|
|
||||||
XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS],
|
|
||||||
Uses = [ESP] in {
|
|
||||||
def TCRETURNdi : PseudoI<(outs),
|
def TCRETURNdi : PseudoI<(outs),
|
||||||
(ins i32imm_pcrel:$dst, i32imm:$offset, variable_ops), []>;
|
(ins i32imm_pcrel:$dst, i32imm:$offset, variable_ops), []>;
|
||||||
def TCRETURNri : PseudoI<(outs),
|
def TCRETURNri : PseudoI<(outs),
|
||||||
@ -217,12 +209,7 @@ let isCall = 1 in
|
|||||||
// a use to prevent stack-pointer assignments that appear immediately
|
// a use to prevent stack-pointer assignments that appear immediately
|
||||||
// before calls from potentially appearing dead. Uses for argument
|
// before calls from potentially appearing dead. Uses for argument
|
||||||
// registers are added manually.
|
// registers are added manually.
|
||||||
let Defs = [RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11,
|
let Uses = [RSP] in {
|
||||||
FP0, FP1, FP2, FP3, FP4, FP5, FP6, ST0, ST1,
|
|
||||||
MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7,
|
|
||||||
XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
|
|
||||||
XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS],
|
|
||||||
Uses = [RSP] in {
|
|
||||||
|
|
||||||
// NOTE: this pattern doesn't match "X86call imm", because we do not know
|
// NOTE: this pattern doesn't match "X86call imm", because we do not know
|
||||||
// that the offset between an arbitrary immediate and the call will fit in
|
// that the offset between an arbitrary immediate and the call will fit in
|
||||||
@ -251,11 +238,7 @@ let isCall = 1, isCodeGenOnly = 1 in
|
|||||||
// a use to prevent stack-pointer assignments that appear immediately
|
// a use to prevent stack-pointer assignments that appear immediately
|
||||||
// before calls from potentially appearing dead. Uses for argument
|
// before calls from potentially appearing dead. Uses for argument
|
||||||
// registers are added manually.
|
// registers are added manually.
|
||||||
let Defs = [RAX, RCX, RDX, R8, R9, R10, R11,
|
let Uses = [RSP] in {
|
||||||
FP0, FP1, FP2, FP3, FP4, FP5, FP6, ST0, ST1,
|
|
||||||
MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7,
|
|
||||||
XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, YMM_HI_6_15, EFLAGS],
|
|
||||||
Uses = [RSP] in {
|
|
||||||
def WINCALL64pcrel32 : Ii32PCRel<0xE8, RawFrm,
|
def WINCALL64pcrel32 : Ii32PCRel<0xE8, RawFrm,
|
||||||
(outs), (ins i64i32imm_pcrel:$dst, variable_ops),
|
(outs), (ins i64i32imm_pcrel:$dst, variable_ops),
|
||||||
"call{q}\t$dst", [], IIC_CALL_RI>,
|
"call{q}\t$dst", [], IIC_CALL_RI>,
|
||||||
@ -284,12 +267,7 @@ let isCall = 1, isCodeGenOnly = 1 in
|
|||||||
|
|
||||||
let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
|
let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
|
||||||
isCodeGenOnly = 1 in
|
isCodeGenOnly = 1 in
|
||||||
// AMD64 cc clobbers RSI, RDI, XMM6-XMM15.
|
let Uses = [RSP],
|
||||||
let Defs = [RAX, RCX, RDX, R8, R9, R10, R11,
|
|
||||||
FP0, FP1, FP2, FP3, FP4, FP5, FP6, ST0, ST1,
|
|
||||||
MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7,
|
|
||||||
XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, EFLAGS],
|
|
||||||
Uses = [RSP],
|
|
||||||
usesCustomInserter = 1 in {
|
usesCustomInserter = 1 in {
|
||||||
def TCRETURNdi64 : PseudoI<(outs),
|
def TCRETURNdi64 : PseudoI<(outs),
|
||||||
(ins i64i32imm_pcrel:$dst, i32imm:$offset, variable_ops),
|
(ins i64i32imm_pcrel:$dst, i32imm:$offset, variable_ops),
|
||||||
|
Loading…
Reference in New Issue
Block a user