mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2025-02-01 05:01:59 +01:00
Revert "[AMDGPU] Insert PS early exit at end of control flow"
This reverts commit 2bfcacf0ad362956277a1c2c9ba00ddc453a42ce. There appears to be an issue with analysis preservation.
This commit is contained in:
parent
5e4157d196
commit
5eef7b47fb
@ -447,15 +447,6 @@ bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) {
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
case AMDGPU::SI_KILL_CLEANUP:
|
|
||||||
if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS &&
|
|
||||||
dominatesAllReachable(MBB)) {
|
|
||||||
KillInstrs.push_back(&MI);
|
|
||||||
} else {
|
|
||||||
MI.eraseFromParent();
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
|
|
||||||
default:
|
default:
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
@ -379,9 +379,6 @@ multiclass PseudoInstKill <dag ins> {
|
|||||||
defm SI_KILL_I1 : PseudoInstKill <(ins SCSrc_i1:$src, i1imm:$killvalue)>;
|
defm SI_KILL_I1 : PseudoInstKill <(ins SCSrc_i1:$src, i1imm:$killvalue)>;
|
||||||
defm SI_KILL_F32_COND_IMM : PseudoInstKill <(ins VSrc_b32:$src0, i32imm:$src1, i32imm:$cond)>;
|
defm SI_KILL_F32_COND_IMM : PseudoInstKill <(ins VSrc_b32:$src0, i32imm:$src1, i32imm:$cond)>;
|
||||||
|
|
||||||
let Defs = [EXEC] in
|
|
||||||
def SI_KILL_CLEANUP : SPseudoInstSI <(outs), (ins)>;
|
|
||||||
|
|
||||||
let Defs = [EXEC,VCC] in
|
let Defs = [EXEC,VCC] in
|
||||||
def SI_ILLEGAL_COPY : SPseudoInstSI <
|
def SI_ILLEGAL_COPY : SPseudoInstSI <
|
||||||
(outs unknown:$dst), (ins unknown:$src),
|
(outs unknown:$dst), (ins unknown:$src),
|
||||||
|
@ -89,10 +89,8 @@ private:
|
|||||||
MachineRegisterInfo *MRI = nullptr;
|
MachineRegisterInfo *MRI = nullptr;
|
||||||
SetVector<MachineInstr*> LoweredEndCf;
|
SetVector<MachineInstr*> LoweredEndCf;
|
||||||
DenseSet<Register> LoweredIf;
|
DenseSet<Register> LoweredIf;
|
||||||
SmallSet<MachineInstr *, 16> NeedsKillCleanup;
|
|
||||||
|
|
||||||
const TargetRegisterClass *BoolRC = nullptr;
|
const TargetRegisterClass *BoolRC = nullptr;
|
||||||
bool InsertKillCleanups;
|
|
||||||
unsigned AndOpc;
|
unsigned AndOpc;
|
||||||
unsigned OrOpc;
|
unsigned OrOpc;
|
||||||
unsigned XorOpc;
|
unsigned XorOpc;
|
||||||
@ -113,8 +111,6 @@ private:
|
|||||||
|
|
||||||
void combineMasks(MachineInstr &MI);
|
void combineMasks(MachineInstr &MI);
|
||||||
|
|
||||||
void process(MachineInstr &MI);
|
|
||||||
|
|
||||||
// Skip to the next instruction, ignoring debug instructions, and trivial
|
// Skip to the next instruction, ignoring debug instructions, and trivial
|
||||||
// block boundaries (blocks that have one (typically fallthrough) successor,
|
// block boundaries (blocks that have one (typically fallthrough) successor,
|
||||||
// and the successor has one predecessor.
|
// and the successor has one predecessor.
|
||||||
@ -164,28 +160,8 @@ static void setImpSCCDefDead(MachineInstr &MI, bool IsDead) {
|
|||||||
|
|
||||||
char &llvm::SILowerControlFlowID = SILowerControlFlow::ID;
|
char &llvm::SILowerControlFlowID = SILowerControlFlow::ID;
|
||||||
|
|
||||||
static bool hasKill(const MachineBasicBlock *Begin,
|
static bool isSimpleIf(const MachineInstr &MI, const MachineRegisterInfo *MRI,
|
||||||
const MachineBasicBlock *End, const SIInstrInfo *TII) {
|
const SIInstrInfo *TII) {
|
||||||
DenseSet<const MachineBasicBlock*> Visited;
|
|
||||||
SmallVector<MachineBasicBlock *, 4> Worklist(Begin->succ_begin(),
|
|
||||||
Begin->succ_end());
|
|
||||||
|
|
||||||
while (!Worklist.empty()) {
|
|
||||||
MachineBasicBlock *MBB = Worklist.pop_back_val();
|
|
||||||
|
|
||||||
if (MBB == End || !Visited.insert(MBB).second)
|
|
||||||
continue;
|
|
||||||
for (auto &Term : MBB->terminators())
|
|
||||||
if (TII->isKillTerminator(Term.getOpcode()))
|
|
||||||
return true;
|
|
||||||
|
|
||||||
Worklist.append(MBB->succ_begin(), MBB->succ_end());
|
|
||||||
}
|
|
||||||
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
static bool isSimpleIf(const MachineInstr &MI, const MachineRegisterInfo *MRI) {
|
|
||||||
Register SaveExecReg = MI.getOperand(0).getReg();
|
Register SaveExecReg = MI.getOperand(0).getReg();
|
||||||
auto U = MRI->use_instr_nodbg_begin(SaveExecReg);
|
auto U = MRI->use_instr_nodbg_begin(SaveExecReg);
|
||||||
|
|
||||||
@ -194,6 +170,26 @@ static bool isSimpleIf(const MachineInstr &MI, const MachineRegisterInfo *MRI) {
|
|||||||
U->getOpcode() != AMDGPU::SI_END_CF)
|
U->getOpcode() != AMDGPU::SI_END_CF)
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
|
// Check for SI_KILL_*_TERMINATOR on path from if to endif.
|
||||||
|
// if there is any such terminator simplififcations are not safe.
|
||||||
|
auto SMBB = MI.getParent();
|
||||||
|
auto EMBB = U->getParent();
|
||||||
|
DenseSet<const MachineBasicBlock*> Visited;
|
||||||
|
SmallVector<MachineBasicBlock*, 4> Worklist(SMBB->succ_begin(),
|
||||||
|
SMBB->succ_end());
|
||||||
|
|
||||||
|
while (!Worklist.empty()) {
|
||||||
|
MachineBasicBlock *MBB = Worklist.pop_back_val();
|
||||||
|
|
||||||
|
if (MBB == EMBB || !Visited.insert(MBB).second)
|
||||||
|
continue;
|
||||||
|
for(auto &Term : MBB->terminators())
|
||||||
|
if (TII->isKillTerminator(Term.getOpcode()))
|
||||||
|
return false;
|
||||||
|
|
||||||
|
Worklist.append(MBB->succ_begin(), MBB->succ_end());
|
||||||
|
}
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -211,35 +207,7 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) {
|
|||||||
// If there is only one use of save exec register and that use is SI_END_CF,
|
// If there is only one use of save exec register and that use is SI_END_CF,
|
||||||
// we can optimize SI_IF by returning the full saved exec mask instead of
|
// we can optimize SI_IF by returning the full saved exec mask instead of
|
||||||
// just cleared bits.
|
// just cleared bits.
|
||||||
bool SimpleIf = isSimpleIf(MI, MRI);
|
bool SimpleIf = isSimpleIf(MI, MRI, TII);
|
||||||
|
|
||||||
if (InsertKillCleanups) {
|
|
||||||
// Check for SI_KILL_*_TERMINATOR on full path of control flow and
|
|
||||||
// flag the associated SI_END_CF for insertion of a kill cleanup.
|
|
||||||
auto UseMI = MRI->use_instr_nodbg_begin(SaveExecReg);
|
|
||||||
while (UseMI->getOpcode() != AMDGPU::SI_END_CF) {
|
|
||||||
assert(std::next(UseMI) == MRI->use_instr_nodbg_end());
|
|
||||||
assert(UseMI->getOpcode() == AMDGPU::SI_ELSE);
|
|
||||||
MachineOperand &NextExec = UseMI->getOperand(0);
|
|
||||||
Register NextExecReg = NextExec.getReg();
|
|
||||||
if (NextExec.isDead()) {
|
|
||||||
assert(!SimpleIf);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
UseMI = MRI->use_instr_nodbg_begin(NextExecReg);
|
|
||||||
}
|
|
||||||
if (UseMI->getOpcode() == AMDGPU::SI_END_CF) {
|
|
||||||
if (hasKill(MI.getParent(), UseMI->getParent(), TII)) {
|
|
||||||
NeedsKillCleanup.insert(&*UseMI);
|
|
||||||
SimpleIf = false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else if (SimpleIf) {
|
|
||||||
// Check for SI_KILL_*_TERMINATOR on path from if to endif.
|
|
||||||
// if there is any such terminator simplifications are not safe.
|
|
||||||
auto UseMI = MRI->use_instr_nodbg_begin(SaveExecReg);
|
|
||||||
SimpleIf = !hasKill(MI.getParent(), UseMI->getParent(), TII);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Add an implicit def of exec to discourage scheduling VALU after this which
|
// Add an implicit def of exec to discourage scheduling VALU after this which
|
||||||
// will interfere with trying to form s_and_saveexec_b64 later.
|
// will interfere with trying to form s_and_saveexec_b64 later.
|
||||||
@ -459,8 +427,6 @@ SILowerControlFlow::skipIgnoreExecInstsTrivialSucc(
|
|||||||
|
|
||||||
auto E = B->end();
|
auto E = B->end();
|
||||||
for ( ; It != E; ++It) {
|
for ( ; It != E; ++It) {
|
||||||
if (It->getOpcode() == AMDGPU::SI_KILL_CLEANUP)
|
|
||||||
continue;
|
|
||||||
if (TII->mayReadEXEC(*MRI, *It))
|
if (TII->mayReadEXEC(*MRI, *It))
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@ -495,18 +461,8 @@ void SILowerControlFlow::emitEndCf(MachineInstr &MI) {
|
|||||||
|
|
||||||
LoweredEndCf.insert(NewMI);
|
LoweredEndCf.insert(NewMI);
|
||||||
|
|
||||||
// If this ends control flow which contains kills (as flagged in emitIf)
|
if (LIS)
|
||||||
// then insert an SI_KILL_CLEANUP immediately following the exec mask
|
|
||||||
// manipulation. This can be lowered to early termination if appropriate.
|
|
||||||
MachineInstr *CleanUpMI = nullptr;
|
|
||||||
if (NeedsKillCleanup.count(&MI))
|
|
||||||
CleanUpMI = BuildMI(MBB, InsPt, DL, TII->get(AMDGPU::SI_KILL_CLEANUP));
|
|
||||||
|
|
||||||
if (LIS) {
|
|
||||||
LIS->ReplaceMachineInstrInMaps(MI, *NewMI);
|
LIS->ReplaceMachineInstrInMaps(MI, *NewMI);
|
||||||
if (CleanUpMI)
|
|
||||||
LIS->InsertMachineInstrInMaps(*CleanUpMI);
|
|
||||||
}
|
|
||||||
|
|
||||||
MI.eraseFromParent();
|
MI.eraseFromParent();
|
||||||
|
|
||||||
@ -597,10 +553,47 @@ void SILowerControlFlow::optimizeEndCf() {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void SILowerControlFlow::process(MachineInstr &MI) {
|
bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
|
||||||
MachineBasicBlock &MBB = *MI.getParent();
|
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
|
||||||
MachineBasicBlock::iterator I(MI);
|
TII = ST.getInstrInfo();
|
||||||
MachineInstr *Prev = (I != MBB.begin()) ? &*(std::prev(I)) : nullptr;
|
TRI = &TII->getRegisterInfo();
|
||||||
|
|
||||||
|
// This doesn't actually need LiveIntervals, but we can preserve them.
|
||||||
|
LIS = getAnalysisIfAvailable<LiveIntervals>();
|
||||||
|
MRI = &MF.getRegInfo();
|
||||||
|
BoolRC = TRI->getBoolRC();
|
||||||
|
|
||||||
|
if (ST.isWave32()) {
|
||||||
|
AndOpc = AMDGPU::S_AND_B32;
|
||||||
|
OrOpc = AMDGPU::S_OR_B32;
|
||||||
|
XorOpc = AMDGPU::S_XOR_B32;
|
||||||
|
MovTermOpc = AMDGPU::S_MOV_B32_term;
|
||||||
|
Andn2TermOpc = AMDGPU::S_ANDN2_B32_term;
|
||||||
|
XorTermrOpc = AMDGPU::S_XOR_B32_term;
|
||||||
|
OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B32;
|
||||||
|
Exec = AMDGPU::EXEC_LO;
|
||||||
|
} else {
|
||||||
|
AndOpc = AMDGPU::S_AND_B64;
|
||||||
|
OrOpc = AMDGPU::S_OR_B64;
|
||||||
|
XorOpc = AMDGPU::S_XOR_B64;
|
||||||
|
MovTermOpc = AMDGPU::S_MOV_B64_term;
|
||||||
|
Andn2TermOpc = AMDGPU::S_ANDN2_B64_term;
|
||||||
|
XorTermrOpc = AMDGPU::S_XOR_B64_term;
|
||||||
|
OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B64;
|
||||||
|
Exec = AMDGPU::EXEC;
|
||||||
|
}
|
||||||
|
|
||||||
|
MachineFunction::iterator NextBB;
|
||||||
|
for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
|
||||||
|
BI != BE; BI = NextBB) {
|
||||||
|
NextBB = std::next(BI);
|
||||||
|
MachineBasicBlock &MBB = *BI;
|
||||||
|
|
||||||
|
MachineBasicBlock::iterator I, Next, Last;
|
||||||
|
|
||||||
|
for (I = MBB.begin(), Last = MBB.end(); I != MBB.end(); I = Next) {
|
||||||
|
Next = std::next(I);
|
||||||
|
MachineInstr &MI = *I;
|
||||||
|
|
||||||
switch (MI.getOpcode()) {
|
switch (MI.getOpcode()) {
|
||||||
case AMDGPU::SI_IF:
|
case AMDGPU::SI_IF:
|
||||||
@ -623,105 +616,29 @@ void SILowerControlFlow::process(MachineInstr &MI) {
|
|||||||
emitEndCf(MI);
|
emitEndCf(MI);
|
||||||
break;
|
break;
|
||||||
|
|
||||||
default:
|
|
||||||
assert(false && "Attempt to process unsupported instruction");
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
MachineBasicBlock::iterator Next;
|
|
||||||
for (I = Prev ? Prev->getIterator() : MBB.begin(); I != MBB.end(); I = Next) {
|
|
||||||
Next = std::next(I);
|
|
||||||
MachineInstr &MaskMI = *I;
|
|
||||||
switch (MaskMI.getOpcode()) {
|
|
||||||
case AMDGPU::S_AND_B64:
|
case AMDGPU::S_AND_B64:
|
||||||
case AMDGPU::S_OR_B64:
|
case AMDGPU::S_OR_B64:
|
||||||
case AMDGPU::S_AND_B32:
|
case AMDGPU::S_AND_B32:
|
||||||
case AMDGPU::S_OR_B32:
|
case AMDGPU::S_OR_B32:
|
||||||
// Cleanup bit manipulations on exec mask
|
// Cleanup bit manipulations on exec mask
|
||||||
combineMasks(MaskMI);
|
combineMasks(MI);
|
||||||
break;
|
Last = I;
|
||||||
default:
|
continue;
|
||||||
I = MBB.end();
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
|
|
||||||
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
|
|
||||||
TII = ST.getInstrInfo();
|
|
||||||
TRI = &TII->getRegisterInfo();
|
|
||||||
|
|
||||||
// This doesn't actually need LiveIntervals, but we can preserve them.
|
|
||||||
LIS = getAnalysisIfAvailable<LiveIntervals>();
|
|
||||||
MRI = &MF.getRegInfo();
|
|
||||||
BoolRC = TRI->getBoolRC();
|
|
||||||
InsertKillCleanups =
|
|
||||||
MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS;
|
|
||||||
|
|
||||||
if (ST.isWave32()) {
|
|
||||||
AndOpc = AMDGPU::S_AND_B32;
|
|
||||||
OrOpc = AMDGPU::S_OR_B32;
|
|
||||||
XorOpc = AMDGPU::S_XOR_B32;
|
|
||||||
MovTermOpc = AMDGPU::S_MOV_B32_term;
|
|
||||||
Andn2TermOpc = AMDGPU::S_ANDN2_B32_term;
|
|
||||||
XorTermrOpc = AMDGPU::S_XOR_B32_term;
|
|
||||||
OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B32;
|
|
||||||
Exec = AMDGPU::EXEC_LO;
|
|
||||||
} else {
|
|
||||||
AndOpc = AMDGPU::S_AND_B64;
|
|
||||||
OrOpc = AMDGPU::S_OR_B64;
|
|
||||||
XorOpc = AMDGPU::S_XOR_B64;
|
|
||||||
MovTermOpc = AMDGPU::S_MOV_B64_term;
|
|
||||||
Andn2TermOpc = AMDGPU::S_ANDN2_B64_term;
|
|
||||||
XorTermrOpc = AMDGPU::S_XOR_B64_term;
|
|
||||||
OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B64;
|
|
||||||
Exec = AMDGPU::EXEC;
|
|
||||||
}
|
|
||||||
|
|
||||||
SmallVector<MachineInstr *, 32> Worklist;
|
|
||||||
|
|
||||||
MachineFunction::iterator NextBB;
|
|
||||||
for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
|
|
||||||
BI != BE; BI = NextBB) {
|
|
||||||
NextBB = std::next(BI);
|
|
||||||
MachineBasicBlock &MBB = *BI;
|
|
||||||
|
|
||||||
MachineBasicBlock::iterator I, Next;
|
|
||||||
for (I = MBB.begin(); I != MBB.end(); I = Next) {
|
|
||||||
Next = std::next(I);
|
|
||||||
MachineInstr &MI = *I;
|
|
||||||
|
|
||||||
switch (MI.getOpcode()) {
|
|
||||||
case AMDGPU::SI_IF:
|
|
||||||
process(MI);
|
|
||||||
break;
|
|
||||||
|
|
||||||
case AMDGPU::SI_ELSE:
|
|
||||||
case AMDGPU::SI_IF_BREAK:
|
|
||||||
case AMDGPU::SI_LOOP:
|
|
||||||
case AMDGPU::SI_END_CF:
|
|
||||||
// Only build worklist if SI_IF instructions must be processed first.
|
|
||||||
if (InsertKillCleanups)
|
|
||||||
Worklist.push_back(&MI);
|
|
||||||
else
|
|
||||||
process(MI);
|
|
||||||
break;
|
|
||||||
|
|
||||||
default:
|
default:
|
||||||
break;
|
Last = I;
|
||||||
}
|
continue;
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
for (MachineInstr *MI : Worklist)
|
// Replay newly inserted code to combine masks
|
||||||
process(*MI);
|
Next = (Last == MBB.end()) ? MBB.begin() : Last;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
optimizeEndCf();
|
optimizeEndCf();
|
||||||
|
|
||||||
LoweredEndCf.clear();
|
LoweredEndCf.clear();
|
||||||
LoweredIf.clear();
|
LoweredIf.clear();
|
||||||
NeedsKillCleanup.clear();
|
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
@ -61,11 +61,9 @@ loop:
|
|||||||
br label %loop
|
br label %loop
|
||||||
}
|
}
|
||||||
|
|
||||||
; Check that the epilog is the final block
|
; In case there's an epilog, we shouldn't have to do this.
|
||||||
; CHECK-LABEL: return_nonvoid
|
; CHECK-LABEL: return_nonvoid
|
||||||
; CHECK: exp null off, off, off, off done vm
|
; CHECK-NOT: exp null off, off, off, off done vm
|
||||||
; CHECK-NEXT: s_endpgm
|
|
||||||
; CHECK-NEXT: BB{{[0-9]+}}_{{[0-9]+}}:
|
|
||||||
define amdgpu_ps float @return_nonvoid(float %0) #0 {
|
define amdgpu_ps float @return_nonvoid(float %0) #0 {
|
||||||
main_body:
|
main_body:
|
||||||
%cmp = fcmp olt float %0, 1.000000e+01
|
%cmp = fcmp olt float %0, 1.000000e+01
|
||||||
|
@ -470,11 +470,7 @@ bb9: ; preds = %bb4
|
|||||||
}
|
}
|
||||||
|
|
||||||
; CHECK-LABEL: {{^}}cbranch_kill:
|
; CHECK-LABEL: {{^}}cbranch_kill:
|
||||||
; CHECK: ; %bb.{{[0-9]+}}: ; %export
|
; CHECK-NOT: exp null off, off, off, off done vm
|
||||||
; CHECK-NEXT: s_or_b64
|
|
||||||
; CHECK-NEXT: s_cbranch_execz [[EXIT:BB[0-9]+_[0-9]+]]
|
|
||||||
; CHECK: [[EXIT]]:
|
|
||||||
; CHECK-NEXT: exp null off, off, off, off done vm
|
|
||||||
define amdgpu_ps void @cbranch_kill(i32 inreg %0, <2 x float> %1) {
|
define amdgpu_ps void @cbranch_kill(i32 inreg %0, <2 x float> %1) {
|
||||||
.entry:
|
.entry:
|
||||||
%val0 = extractelement <2 x float> %1, i32 0
|
%val0 = extractelement <2 x float> %1, i32 0
|
||||||
|
Loading…
x
Reference in New Issue
Block a user