
AMDGPU: Directly emit m0 initialization with s_mov_b32

Currently what comes out of instruction selection is a
register initialized to -1, and then copied to m0.
MachineCSE doesn't consider copies, but we want these
to be CSEed. This isn't much of a problem currently,
because SIFoldOperands is run immediately after.

This avoids regressions from leaving plain copies to m0 behind
when SIFoldOperands is run later.
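
As a rough sketch (illustrative only; the virtual register name is
made up, this is not actual compiler output), the selected code for
an m0 initialization used to look like

    %sreg = S_MOV_B32 -1
    m0 = COPY %sreg

and with the SI_INIT_M0 pseudo expanded by the custom inserter it
becomes a single instruction that MachineCSE can merge:

    m0 = S_MOV_B32 -1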

llvm-svn: 266377
Matt Arsenault 2016-04-14 21:58:15 +00:00
parent e73cb153a7
commit 61abb9daf9
2 changed files with 37 additions and 14 deletions


@@ -1103,10 +1103,18 @@ unsigned SITargetLowering::getRegisterByName(const char* RegName, EVT VT,
                            + StringRef(RegName) + "\"."));
 }
 
-MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
-    MachineInstr * MI, MachineBasicBlock * BB) const {
+MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
+    MachineInstr *MI, MachineBasicBlock *BB) const {
   switch (MI->getOpcode()) {
+  case AMDGPU::SI_INIT_M0: {
+    const SIInstrInfo *TII =
+      static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
+    BuildMI(*BB, MI->getIterator(), MI->getDebugLoc(),
+            TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
+      .addOperand(MI->getOperand(0));
+    MI->eraseFromParent();
+    break;
+  }
   case AMDGPU::BRANCH:
     return BB;
   case AMDGPU::GET_GROUPSTATICSIZE: {
@@ -1395,19 +1403,18 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
 SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain, SDLoc DL,
                                    SDValue V) const {
+  // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
+  // the destination register.
+  //
   // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
   // so we will end up with redundant moves to m0.
   //
-  // We can't use S_MOV_B32, because there is no way to specify m0 as the
-  // destination register.
-  //
-  // We have to use them both. Machine cse will combine all the S_MOV_B32
-  // instructions and the register coalescer eliminate the extra copies.
-  SDNode *M0 = DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, V.getValueType(), V);
-  return DAG.getCopyToReg(Chain, DL, DAG.getRegister(AMDGPU::M0, MVT::i32),
-                          SDValue(M0, 0), SDValue()); // Glue
-                                                      // A Null SDValue creates
-                                                      // a glue result.
+  // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.
+  //
+  // A Null SDValue creates a glue result.
+  SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue,
+                                  V, Chain);
+  return SDValue(M0, 0);
 }
 
 SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG,
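
At the SelectionDAG level (the node names below are illustrative, not
actual -debug output), copyToM0 used to build an S_MOV_B32 into a
virtual register followed by a CopyToReg of m0:

    t1: i32 = S_MOV_B32 t0
    t2: ch,glue = CopyToReg chain, Register:i32 %m0, t1

It now builds the single glued SI_INIT_M0 pseudo, which the custom
inserter above later expands to an s_mov_b32 writing m0 directly:

    t1: ch,glue = SI_INIT_M0 t0, chain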


@@ -2014,7 +2014,23 @@ def SI_KILL : InstSI <
 } // End mayLoad = 1, mayStore = 1, hasSideEffects = 1
 
-let Uses = [EXEC], Defs = [EXEC,VCC,M0] in {
+// Used as an isel pseudo to directly emit initialization with an
+// s_mov_b32 rather than a copy of another initialized
+// register. MachineCSE skips copies, and we don't want to have to
+// fold operands before it runs.
+def SI_INIT_M0 : InstSI <
+  (outs),
+  (ins SSrc_32:$src), "", []> {
+  let Defs = [M0];
+  let usesCustomInserter = 1;
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let isAsCheapAsAMove = 1;
+  let SALU = 1;
+  let isReMaterializable = 1;
+}
+
+let Uses = [EXEC], Defs = [EXEC, VCC, M0] in {
 
 class SI_INDIRECT_SRC<RegisterClass rc> : InstSI <
   (outs VGPR_32:$dst, SReg_64:$temp),