mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2025-01-31 20:51:52 +01:00
[AMDGPU] Unify early PS termination blocks
Generate a single out-of-line early-exit block and branch to it when all lanes have been killed. This avoids taking a branch while lanes are still active. Reviewed By: nhaehnle. Differential Revision: https://reviews.llvm.org/D82641
This commit is contained in:
parent
f3b790c8b1
commit
d40c701079
@ -1052,8 +1052,8 @@ void GCNPassConfig::addPreEmitPass() {
|
||||
addPass(&SIInsertHardClausesID);
|
||||
|
||||
addPass(&SIRemoveShortExecBranchesID);
|
||||
addPass(&SIPreEmitPeepholeID);
|
||||
addPass(&SIInsertSkipsPassID);
|
||||
addPass(&SIPreEmitPeepholeID);
|
||||
addPass(&BranchRelaxationPassID);
|
||||
}
|
||||
|
||||
|
@ -57,10 +57,13 @@ private:
|
||||
unsigned SkipThreshold = 0;
|
||||
MachineDominatorTree *MDT = nullptr;
|
||||
|
||||
MachineBasicBlock *EarlyExitBlock = nullptr;
|
||||
|
||||
bool shouldSkip(const MachineBasicBlock &From,
|
||||
const MachineBasicBlock &To) const;
|
||||
|
||||
bool dominatesAllReachable(MachineBasicBlock &MBB);
|
||||
void createEarlyExitBlock(MachineBasicBlock &MBB);
|
||||
void skipIfDead(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
|
||||
DebugLoc DL);
|
||||
|
||||
@ -161,6 +164,33 @@ bool SIInsertSkips::dominatesAllReachable(MachineBasicBlock &MBB) {
|
||||
return true;
|
||||
}
|
||||
|
||||
/// Emit the two-instruction program-end sequence into \p MBB, inserting both
/// instructions before \p I and tagging them with \p DL: an EXP_DONE whose
/// first immediate is 0x09, followed by S_ENDPGM with immediate 0.
/// The four register operands of the export are VGPR0 marked undef, and the
/// opcode descriptions are looked up through \p TII.
static void generatePsEndPgm(MachineBasicBlock &MBB,
|
||||
MachineBasicBlock::iterator I, DebugLoc DL,
|
||||
const SIInstrInfo *TII) {
|
||||
// Generate "null export; s_endpgm".
|
||||
BuildMI(MBB, I, DL, TII->get(AMDGPU::EXP_DONE))
|
||||
.addImm(0x09) // V_008DFC_SQ_EXP_NULL
|
||||
.addReg(AMDGPU::VGPR0, RegState::Undef)
|
||||
.addReg(AMDGPU::VGPR0, RegState::Undef)
|
||||
.addReg(AMDGPU::VGPR0, RegState::Undef)
|
||||
.addReg(AMDGPU::VGPR0, RegState::Undef)
|
||||
.addImm(1) // vm
|
||||
.addImm(0) // compr
|
||||
.addImm(0); // en
|
||||
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ENDPGM)).addImm(0);
|
||||
}
|
||||
|
||||
/// Create the early-exit block and store it in EarlyExitBlock.
/// A new MachineBasicBlock is created in the function that owns \p MBB,
/// inserted at the end of that function, and filled by generatePsEndPgm.
/// \p MBB is used only to reach its parent function. Asserts that
/// EarlyExitBlock is still null on entry.
void SIInsertSkips::createEarlyExitBlock(MachineBasicBlock &MBB) {
|
||||
MachineFunction *MF = MBB.getParent();
|
||||
// Default-constructed DebugLoc is what gets passed to generatePsEndPgm.
DebugLoc DL;
|
||||
|
||||
assert(!EarlyExitBlock);
|
||||
EarlyExitBlock = MF->CreateMachineBasicBlock();
|
||||
// Append at the end of the function, after all existing blocks.
MF->insert(MF->end(), EarlyExitBlock);
|
||||
|
||||
generatePsEndPgm(*EarlyExitBlock, EarlyExitBlock->end(), DL, TII);
|
||||
}
|
||||
|
||||
/// Insert an "if exec=0 { null export; s_endpgm }" sequence before the given
|
||||
/// iterator. Only applies to pixel shaders.
|
||||
void SIInsertSkips::skipIfDead(MachineBasicBlock &MBB,
|
||||
@ -168,11 +198,6 @@ void SIInsertSkips::skipIfDead(MachineBasicBlock &MBB,
|
||||
MachineFunction *MF = MBB.getParent();
|
||||
assert(MF->getFunction().getCallingConv() == CallingConv::AMDGPU_PS);
|
||||
|
||||
// Currently, SI_KILL_*_TERMINATOR is expected to occur only as the last
|
||||
// terminator of a basic block. If this ever changes, we need to optionally
|
||||
// split MBB here.
|
||||
assert(I == MBB.end());
|
||||
|
||||
// It is possible for an SI_KILL_*_TERMINATOR to sit at the bottom of a
|
||||
// basic block that has no further successors (e.g., there was an
|
||||
// `unreachable` there in IR). This can happen with original source of the
|
||||
@ -186,34 +211,40 @@ void SIInsertSkips::skipIfDead(MachineBasicBlock &MBB,
|
||||
// In this case, we write the "null_export; s_endpgm" skip code in the
|
||||
// already-existing basic block.
|
||||
auto NextBBI = std::next(MBB.getIterator());
|
||||
bool NoSuccessor = llvm::find(MBB.successors(), &*NextBBI) == MBB.succ_end();
|
||||
MachineBasicBlock *SkipBB;
|
||||
bool NoSuccessor = I == MBB.end() &&
|
||||
llvm::find(MBB.successors(), &*NextBBI) == MBB.succ_end();
|
||||
|
||||
if (NoSuccessor) {
|
||||
SkipBB = &MBB;
|
||||
generatePsEndPgm(MBB, I, DL, TII);
|
||||
} else {
|
||||
// Create a new basic block that will contain the "null export; s_endpgm"
|
||||
// and set up the branching to go around it.
|
||||
SkipBB = MF->CreateMachineBasicBlock();
|
||||
MF->insert(NextBBI, SkipBB);
|
||||
if (!EarlyExitBlock) {
|
||||
createEarlyExitBlock(MBB);
|
||||
// Update next block pointer to reflect any new blocks
|
||||
NextBBI = std::next(MBB.getIterator());
|
||||
}
|
||||
|
||||
BuildMI(&MBB, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(&*NextBBI);
|
||||
MBB.addSuccessor(SkipBB);
|
||||
auto BranchMI = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
|
||||
.addMBB(EarlyExitBlock);
|
||||
|
||||
MDT->addNewBlock(SkipBB, &MBB);
|
||||
// Split the block if the branch will not come at the end.
|
||||
auto Next = std::next(BranchMI->getIterator());
|
||||
if (Next != MBB.end() && !Next->isTerminator()) {
|
||||
MachineBasicBlock *SplitBB =
|
||||
MF->CreateMachineBasicBlock(MBB.getBasicBlock());
|
||||
MF->insert(NextBBI, SplitBB);
|
||||
SplitBB->splice(SplitBB->begin(), &MBB, I, MBB.end());
|
||||
SplitBB->transferSuccessorsAndUpdatePHIs(&MBB);
|
||||
// FIXME: the expectation is that this will be used near the beginning
|
||||
// of a block so just assume all registers are still live.
|
||||
for (auto LiveIn : MBB.liveins())
|
||||
SplitBB->addLiveIn(LiveIn);
|
||||
MBB.addSuccessor(SplitBB);
|
||||
MDT->addNewBlock(SplitBB, &MBB);
|
||||
}
|
||||
|
||||
MBB.addSuccessor(EarlyExitBlock);
|
||||
MDT->getBase().insertEdge(&MBB, EarlyExitBlock);
|
||||
}
|
||||
|
||||
// Generate "null export; s_endpgm".
|
||||
BuildMI(SkipBB, DL, TII->get(AMDGPU::EXP_DONE))
|
||||
.addImm(0x09) // V_008DFC_SQ_EXP_NULL
|
||||
.addReg(AMDGPU::VGPR0, RegState::Undef)
|
||||
.addReg(AMDGPU::VGPR0, RegState::Undef)
|
||||
.addReg(AMDGPU::VGPR0, RegState::Undef)
|
||||
.addReg(AMDGPU::VGPR0, RegState::Undef)
|
||||
.addImm(1) // vm
|
||||
.addImm(0) // compr
|
||||
.addImm(0); // en
|
||||
BuildMI(SkipBB, DL, TII->get(AMDGPU::S_ENDPGM)).addImm(0);
|
||||
}
|
||||
|
||||
/// Translate a SI_KILL_*_TERMINATOR into exec-manipulating instructions.
|
||||
@ -428,6 +459,7 @@ bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) {
|
||||
Kill->eraseFromParent();
|
||||
}
|
||||
KillInstrs.clear();
|
||||
EarlyExitBlock = nullptr;
|
||||
|
||||
return MadeChange;
|
||||
}
|
||||
|
@ -14,15 +14,15 @@
|
||||
|
||||
# CHECK: bb.1:
|
||||
# CHECK: V_CMPX_LE_F32_e32
|
||||
# CHECK-NEXT: S_CBRANCH_EXECNZ %bb.2, implicit $exec
|
||||
# CHECK-NEXT: S_CBRANCH_EXECZ %bb.3, implicit $exec
|
||||
|
||||
# CHECK: bb.2:
|
||||
# CHECK: S_ENDPGM 0
|
||||
|
||||
# CHECK: bb.3:
|
||||
# CHECK-NEXT: EXP_DONE
|
||||
# CHECK: S_ENDPGM 0
|
||||
|
||||
# CHECK: bb.2:
|
||||
# CHECK: S_ENDPGM 0
|
||||
|
||||
name: kill_uncond_branch
|
||||
|
||||
body: |
|
||||
|
@ -12,11 +12,11 @@ define amdgpu_ps void @test_kill_depth_0_imm_pos() #0 {
|
||||
; CHECK-LABEL: {{^}}test_kill_depth_0_imm_neg:
|
||||
; CHECK-NEXT: ; %bb.0:
|
||||
; CHECK-NEXT: s_mov_b64 exec, 0
|
||||
; CHECK-NEXT: s_cbranch_execnz BB1_2
|
||||
; CHECK-NEXT: s_cbranch_execz BB1_2
|
||||
; CHECK-NEXT: ; %bb.1:
|
||||
; CHECK-NEXT: exp null off, off, off, off done vm
|
||||
; CHECK-NEXT: s_endpgm
|
||||
; CHECK-NEXT: BB1_2:
|
||||
; CHECK-NEXT: exp null off, off, off, off done vm
|
||||
; CHECK-NEXT: s_endpgm
|
||||
define amdgpu_ps void @test_kill_depth_0_imm_neg() #0 {
|
||||
call void @llvm.amdgcn.kill(i1 false)
|
||||
@ -27,15 +27,14 @@ define amdgpu_ps void @test_kill_depth_0_imm_neg() #0 {
|
||||
; CHECK-LABEL: {{^}}test_kill_depth_0_imm_neg_x2:
|
||||
; CHECK-NEXT: ; %bb.0:
|
||||
; CHECK-NEXT: s_mov_b64 exec, 0
|
||||
; CHECK-NEXT: s_cbranch_execnz BB2_2
|
||||
; CHECK: exp null
|
||||
; CHECK-NEXT: s_endpgm
|
||||
; CHECK: BB2_2:
|
||||
; CHECK-NEXT: s_cbranch_execz BB2_3
|
||||
; CHECK-NEXT: ; %bb.1:
|
||||
; CHECK-NEXT: s_mov_b64 exec, 0
|
||||
; CHECK-NEXT: s_cbranch_execnz BB2_4
|
||||
; CHECK: exp null
|
||||
; CHECK-NEXT: s_cbranch_execz BB2_3
|
||||
; CHECK-NEXT: ; %bb.2:
|
||||
; CHECK-NEXT: s_endpgm
|
||||
; CHECK-NEXT: BB2_4:
|
||||
; CHECK-NEXT: BB2_3:
|
||||
; CHECK: exp null
|
||||
; CHECK-NEXT: s_endpgm
|
||||
define amdgpu_ps void @test_kill_depth_0_imm_neg_x2() #0 {
|
||||
call void @llvm.amdgcn.kill(i1 false)
|
||||
@ -46,10 +45,11 @@ define amdgpu_ps void @test_kill_depth_0_imm_neg_x2() #0 {
|
||||
; CHECK-LABEL: {{^}}test_kill_depth_var:
|
||||
; CHECK-NEXT: ; %bb.0:
|
||||
; CHECK-NEXT: v_cmpx_gt_f32_e32 vcc, 0, v0
|
||||
; CHECK-NEXT: s_cbranch_execnz BB3_2
|
||||
; CHECK: exp null
|
||||
; CHECK-NEXT: s_cbranch_execz BB3_2
|
||||
; CHECK-NEXT: ; %bb.1:
|
||||
; CHECK-NEXT: s_endpgm
|
||||
; CHECK-NEXT: BB3_2:
|
||||
; CHECK: exp null
|
||||
; CHECK-NEXT: s_endpgm
|
||||
define amdgpu_ps void @test_kill_depth_var(float %x) #0 {
|
||||
%cmp = fcmp olt float %x, 0.0
|
||||
@ -61,15 +61,14 @@ define amdgpu_ps void @test_kill_depth_var(float %x) #0 {
|
||||
; CHECK-LABEL: {{^}}test_kill_depth_var_x2_same:
|
||||
; CHECK-NEXT: ; %bb.0:
|
||||
; CHECK-NEXT: v_cmpx_gt_f32_e32 vcc, 0, v0
|
||||
; CHECK-NEXT: s_cbranch_execnz BB4_2
|
||||
; CHECK: exp null
|
||||
; CHECK-NEXT: s_endpgm
|
||||
; CHECK-NEXT: BB4_2:
|
||||
; CHECK-NEXT: s_cbranch_execz BB4_3
|
||||
; CHECK-NEXT: ; %bb.1:
|
||||
; CHECK-NEXT: v_cmpx_gt_f32_e32 vcc, 0, v0
|
||||
; CHECK-NEXT: s_cbranch_execnz BB4_4
|
||||
; CHECK: exp null
|
||||
; CHECK-NEXT: s_cbranch_execz BB4_3
|
||||
; CHECK-NEXT: ; %bb.2:
|
||||
; CHECK-NEXT: s_endpgm
|
||||
; CHECK-NEXT: BB4_4:
|
||||
; CHECK-NEXT: BB4_3:
|
||||
; CHECK: exp null
|
||||
; CHECK-NEXT: s_endpgm
|
||||
define amdgpu_ps void @test_kill_depth_var_x2_same(float %x) #0 {
|
||||
%cmp = fcmp olt float %x, 0.0
|
||||
@ -82,15 +81,14 @@ define amdgpu_ps void @test_kill_depth_var_x2_same(float %x) #0 {
|
||||
; CHECK-LABEL: {{^}}test_kill_depth_var_x2:
|
||||
; CHECK-NEXT: ; %bb.0:
|
||||
; CHECK-NEXT: v_cmpx_gt_f32_e32 vcc, 0, v0
|
||||
; CHECK-NEXT: s_cbranch_execnz BB5_2
|
||||
; CHECK: exp null
|
||||
; CHECK-NEXT: s_endpgm
|
||||
; CHECK-NEXT: BB5_2:
|
||||
; CHECK-NEXT: s_cbranch_execz BB5_3
|
||||
; CHECK-NEXT: ; %bb.1
|
||||
; CHECK-NEXT: v_cmpx_gt_f32_e32 vcc, 0, v1
|
||||
; CHECK-NEXT: s_cbranch_execnz BB5_4
|
||||
; CHECK: exp null
|
||||
; CHECK-NEXT: s_cbranch_execz BB5_3
|
||||
; CHECK-NEXT: ; %bb.2
|
||||
; CHECK-NEXT: s_endpgm
|
||||
; CHECK-NEXT: BB5_4:
|
||||
; CHECK-NEXT: BB5_3:
|
||||
; CHECK: exp null
|
||||
; CHECK-NEXT: s_endpgm
|
||||
define amdgpu_ps void @test_kill_depth_var_x2(float %x, float %y) #0 {
|
||||
%cmp.x = fcmp olt float %x, 0.0
|
||||
@ -103,18 +101,15 @@ define amdgpu_ps void @test_kill_depth_var_x2(float %x, float %y) #0 {
|
||||
; CHECK-LABEL: {{^}}test_kill_depth_var_x2_instructions:
|
||||
; CHECK-NEXT: ; %bb.0:
|
||||
; CHECK-NEXT: v_cmpx_gt_f32_e32 vcc, 0, v0
|
||||
; CHECK-NEXT: s_cbranch_execnz BB6_2
|
||||
; CHECK-NEXT: s_cbranch_execz BB6_3
|
||||
; CHECK-NEXT: ; %bb.1:
|
||||
; CHECK-NEXT: exp
|
||||
; CHECK-NEXT: s_endpgm
|
||||
; CHECK-NEXT: BB6_2:
|
||||
; CHECK: v_mov_b32_e64 v7, -1
|
||||
; CHECK: v_cmpx_gt_f32_e32 vcc, 0, v7
|
||||
; CHECK-NEXT: s_cbranch_execnz BB6_4
|
||||
; CHECK-NEXT: ; %bb.3:
|
||||
; CHECK-NEXT: exp
|
||||
; CHECK-NEXT: s_cbranch_execz BB6_3
|
||||
; CHECK-NEXT: ; %bb.2:
|
||||
; CHECK-NEXT: s_endpgm
|
||||
; CHECK-NEXT: BB6_4:
|
||||
; CHECK-NEXT: BB6_3:
|
||||
; CHECK-NEXT: exp null
|
||||
; CHECK-NEXT: s_endpgm
|
||||
define amdgpu_ps void @test_kill_depth_var_x2_instructions(float %x) #0 {
|
||||
%cmp.x = fcmp olt float %x, 0.0
|
||||
@ -237,6 +232,62 @@ exit:
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: {{^}}test_kill_control_flow_return:
|
||||
|
||||
; CHECK: v_cmp_eq_u32_e64 [[KILL_CC:s\[[0-9]+:[0-9]+\]]], s0, 1
|
||||
; CHECK: s_and_b64 exec, exec, s[2:3]
|
||||
; CHECK-NEXT: s_cbranch_execz [[EXIT_BB:BB[0-9]+_[0-9]+]]
|
||||
|
||||
; CHECK: s_cmp_lg_u32 s{{[0-9]+}}, 0
|
||||
; CHECK: s_cbranch_scc0 [[COND_BB:BB[0-9]+_[0-9]+]]
|
||||
; CHECK: s_branch [[RETURN_BB:BB[0-9]+_[0-9]+]]
|
||||
|
||||
; CHECK: [[COND_BB]]:
|
||||
; CHECK: v_mov_b32_e64 v7, -1
|
||||
; CHECK: v_nop_e64
|
||||
; CHECK: v_nop_e64
|
||||
; CHECK: v_nop_e64
|
||||
; CHECK: v_nop_e64
|
||||
; CHECK: v_nop_e64
|
||||
; CHECK: v_nop_e64
|
||||
; CHECK: v_nop_e64
|
||||
; CHECK: v_nop_e64
|
||||
; CHECK: v_nop_e64
|
||||
; CHECK: v_nop_e64
|
||||
; CHECK: v_mov_b32_e32 v0, v7
|
||||
|
||||
; CHECK: [[EXIT_BB]]:
|
||||
; CHECK-NEXT: exp null
|
||||
; CHECK-NEXT: s_endpgm
|
||||
|
||||
; CHECK: [[RETURN_BB]]:
|
||||
define amdgpu_ps float @test_kill_control_flow_return(i32 inreg %arg) #0 {
|
||||
entry:
|
||||
%kill = icmp eq i32 %arg, 1
|
||||
%cmp = icmp eq i32 %arg, 0
|
||||
call void @llvm.amdgcn.kill(i1 %kill)
|
||||
br i1 %cmp, label %bb, label %exit
|
||||
|
||||
bb:
|
||||
%var = call float asm sideeffect "
|
||||
v_mov_b32_e64 v7, -1
|
||||
v_nop_e64
|
||||
v_nop_e64
|
||||
v_nop_e64
|
||||
v_nop_e64
|
||||
v_nop_e64
|
||||
v_nop_e64
|
||||
v_nop_e64
|
||||
v_nop_e64
|
||||
v_nop_e64
|
||||
v_nop_e64", "={v7}"()
|
||||
br label %exit
|
||||
|
||||
exit:
|
||||
%ret = phi float [ %var, %bb ], [ 0.0, %entry ]
|
||||
ret float %ret
|
||||
}
|
||||
|
||||
; CHECK-LABEL: {{^}}test_kill_divergent_loop:
|
||||
; CHECK: v_cmp_eq_u32_e32 vcc, 0, v0
|
||||
; CHECK-NEXT: s_and_saveexec_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], vcc
|
||||
@ -295,12 +346,9 @@ exit:
|
||||
; CHECK-LABEL: {{^}}phi_use_def_before_kill:
|
||||
; CHECK: v_cndmask_b32_e64 [[PHIREG:v[0-9]+]], 0, -1.0,
|
||||
; CHECK: v_cmpx_lt_f32_e32 vcc, 0,
|
||||
; CHECK-NEXT: s_cbranch_execnz [[BB4:BB[0-9]+_[0-9]+]]
|
||||
; CHECK-NEXT: s_cbranch_execz [[EXITBB:BB[0-9]+_[0-9]+]]
|
||||
|
||||
; CHECK: exp
|
||||
; CHECK-NEXT: s_endpgm
|
||||
|
||||
; CHECK: [[KILLBB:BB[0-9]+_[0-9]+]]:
|
||||
; CHECK: ; %[[KILLBB:bb.[0-9]+]]:
|
||||
; CHECK-NEXT: s_cbranch_scc0 [[PHIBB:BB[0-9]+_[0-9]+]]
|
||||
|
||||
; CHECK: [[PHIBB]]:
|
||||
@ -313,6 +361,10 @@ exit:
|
||||
|
||||
; CHECK: [[ENDBB]]:
|
||||
; CHECK-NEXT: s_endpgm
|
||||
|
||||
; CHECK: [[EXITBB]]:
|
||||
; CHECK: exp null
|
||||
; CHECK-NEXT: s_endpgm
|
||||
define amdgpu_ps void @phi_use_def_before_kill(float inreg %x) #0 {
|
||||
bb:
|
||||
%tmp = fadd float %x, 1.000000e+00
|
||||
|
Loading…
x
Reference in New Issue
Block a user