
[AMDGPU] Rename SIInsertSkips Pass

The pass no longer handles skips. It now removes unnecessary
unconditional branches and lowers early termination branches,
hence the rename to SILateBranchLowering.

Move the code that handles returns to the epilog from
SIPreEmitPeephole into SILateBranchLowering. This means
SIPreEmitPeephole contains only optional optimisations, while all
required transforms live in SILateBranchLowering.

Reviewed By: arsenm

Differential Revision: https://reviews.llvm.org/D98915
This commit is contained in:
Carl Ritson 2021-03-20 11:27:52 +09:00
parent 6af08589c4
commit ab6ec1f384
11 changed files with 88 additions and 91 deletions
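
Taken together, the AMDGPUTargetMachine.cpp hunk further down changes the pre-emit pipeline as sketched here; this is an illustrative consolidation of that hunk, not additional code in the patch:

void GCNPassConfig::addPreEmitPass() {
  if (getOptLevel() > CodeGenOpt::None)
    addPass(&SIInsertHardClausesID);
  // Required lowering: runs at every optimisation level, including -O0.
  addPass(&SILateBranchLoweringPassID);
  // Purely an optimisation after this patch, so it is skipped at -O0.
  if (getOptLevel() > CodeGenOpt::None)
    addPass(&SIPreEmitPeepholeID);
  // (the remainder of addPreEmitPass is unchanged)
}

Running SILateBranchLowering unconditionally keeps -O0 output correct, since the branch removal and early-termination lowering it performs are required transforms rather than optimisations.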

View File

@@ -207,8 +207,8 @@ extern char &SILowerControlFlowID;
void initializeSIPreEmitPeepholePass(PassRegistry &);
extern char &SIPreEmitPeepholeID;
void initializeSIInsertSkipsPass(PassRegistry &);
extern char &SIInsertSkipsPassID;
void initializeSILateBranchLoweringPass(PassRegistry &);
extern char &SILateBranchLoweringPassID;
void initializeSIOptimizeExecMaskingPass(PassRegistry &);
extern char &SIOptimizeExecMaskingID;

View File

@@ -250,7 +250,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeSIWholeQuadModePass(*PR);
initializeSILowerControlFlowPass(*PR);
initializeSIPreEmitPeepholePass(*PR);
initializeSIInsertSkipsPass(*PR);
initializeSILateBranchLoweringPass(*PR);
initializeSIMemoryLegalizerPass(*PR);
initializeSIOptimizeExecMaskingPass(*PR);
initializeSIPreAllocateWWMRegsPass(*PR);
@@ -1214,8 +1214,9 @@ void GCNPassConfig::addPreEmitPass() {
if (getOptLevel() > CodeGenOpt::None)
addPass(&SIInsertHardClausesID);
addPass(&SIInsertSkipsPassID);
addPass(&SIPreEmitPeepholeID);
addPass(&SILateBranchLoweringPassID);
if (getOptLevel() > CodeGenOpt::None)
addPass(&SIPreEmitPeepholeID);
// The hazard recognizer that runs as part of the post-ra scheduler does not
// guarantee to be able handle all hazards correctly. This is because if there
// are multiple scheduling regions in a basic block, the regions are scheduled

View File

@@ -119,7 +119,7 @@ add_llvm_target(AMDGPUCodeGen
SIFormMemoryClauses.cpp
SIFrameLowering.cpp
SIInsertHardClauses.cpp
SIInsertSkips.cpp
SILateBranchLowering.cpp
SIInsertWaitcnts.cpp
SIInstrInfo.cpp
SIISelLowering.cpp

View File

@@ -1,4 +1,4 @@
//===-- SIInsertSkips.cpp - Use predicates for control flow ---------------===//
//===-- SILateBranchLowering.cpp - Final preparation of branches ----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -14,28 +14,23 @@
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/InitializePasses.h"
using namespace llvm;
#define DEBUG_TYPE "si-insert-skips"
#define DEBUG_TYPE "si-late-branch-lowering"
namespace {
class SIInsertSkips : public MachineFunctionPass {
class SILateBranchLowering : public MachineFunctionPass {
private:
const SIRegisterInfo *TRI = nullptr;
const SIInstrInfo *TII = nullptr;
MachineDominatorTree *MDT = nullptr;
MachineBasicBlock *EarlyExitBlock = nullptr;
bool EarlyExitClearsExec = false;
void ensureEarlyExitBlock(MachineBasicBlock &MBB, bool ClearExec);
void earlyTerm(MachineInstr &MI);
void earlyTerm(MachineInstr &MI, MachineBasicBlock *EarlyExitBlock);
public:
static char ID;
@@ -43,12 +38,12 @@ public:
unsigned MovOpc;
Register ExecReg;
SIInsertSkips() : MachineFunctionPass(ID) {}
SILateBranchLowering() : MachineFunctionPass(ID) {}
bool runOnMachineFunction(MachineFunction &MF) override;
StringRef getPassName() const override {
return "SI insert s_cbranch_execz instructions";
return "SI Final Branch Preparation";
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
@@ -60,15 +55,15 @@ public:
} // end anonymous namespace
char SIInsertSkips::ID = 0;
char SILateBranchLowering::ID = 0;
INITIALIZE_PASS_BEGIN(SIInsertSkips, DEBUG_TYPE,
INITIALIZE_PASS_BEGIN(SILateBranchLowering, DEBUG_TYPE,
"SI insert s_cbranch_execz instructions", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
INITIALIZE_PASS_END(SIInsertSkips, DEBUG_TYPE,
INITIALIZE_PASS_END(SILateBranchLowering, DEBUG_TYPE,
"SI insert s_cbranch_execz instructions", false, false)
char &llvm::SIInsertSkipsPassID = SIInsertSkips::ID;
char &llvm::SILateBranchLoweringPassID = SILateBranchLowering::ID;
static void generateEndPgm(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I, DebugLoc DL,
@@ -89,27 +84,6 @@ static void generateEndPgm(MachineBasicBlock &MBB,
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ENDPGM)).addImm(0);
}
void SIInsertSkips::ensureEarlyExitBlock(MachineBasicBlock &MBB,
bool ClearExec) {
MachineFunction *MF = MBB.getParent();
DebugLoc DL;
if (!EarlyExitBlock) {
EarlyExitBlock = MF->CreateMachineBasicBlock();
MF->insert(MF->end(), EarlyExitBlock);
generateEndPgm(*EarlyExitBlock, EarlyExitBlock->end(), DL, TII,
MF->getFunction().getCallingConv() ==
CallingConv::AMDGPU_PS);
EarlyExitClearsExec = false;
}
if (ClearExec && !EarlyExitClearsExec) {
auto ExitI = EarlyExitBlock->getFirstNonPHI();
BuildMI(*EarlyExitBlock, ExitI, DL, TII->get(MovOpc), ExecReg).addImm(0);
EarlyExitClearsExec = true;
}
}
static void splitBlock(MachineBasicBlock &MBB, MachineInstr &MI,
MachineDominatorTree *MDT) {
MachineBasicBlock *SplitBB = MBB.splitAt(MI, /*UpdateLiveIns*/ true);
@@ -125,12 +99,11 @@ static void splitBlock(MachineBasicBlock &MBB, MachineInstr &MI,
MDT->getBase().applyUpdates(DTUpdates);
}
void SIInsertSkips::earlyTerm(MachineInstr &MI) {
void SILateBranchLowering::earlyTerm(MachineInstr &MI,
MachineBasicBlock *EarlyExitBlock) {
MachineBasicBlock &MBB = *MI.getParent();
const DebugLoc DL = MI.getDebugLoc();
ensureEarlyExitBlock(MBB, true);
auto BranchMI = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC0))
.addMBB(EarlyExitBlock);
auto Next = std::next(MI.getIterator());
@@ -142,7 +115,7 @@ void SIInsertSkips::earlyTerm(MachineInstr &MI) {
MDT->getBase().insertEdge(&MBB, EarlyExitBlock);
}
bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) {
bool SILateBranchLowering::runOnMachineFunction(MachineFunction &MF) {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
TII = ST.getInstrInfo();
TRI = &TII->getRegisterInfo();
@@ -152,6 +125,7 @@ bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) {
ExecReg = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
SmallVector<MachineInstr *, 4> EarlyTermInstrs;
SmallVector<MachineInstr *, 1> EpilogInstrs;
bool MadeChange = false;
for (MachineBasicBlock &MBB : MF) {
@@ -163,7 +137,7 @@ bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) {
switch (MI.getOpcode()) {
case AMDGPU::S_BRANCH:
// Optimize out branches to the next block.
// FIXME: Shouldn't this be handled by BranchFolding?
// This only occurs in -O0 when BranchFolding is not executed.
if (MBB.isLayoutSuccessor(MI.getOperand(0).getMBB())) {
assert(&MI == &MBB.back());
MI.eraseFromParent();
@@ -175,20 +149,72 @@ bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) {
EarlyTermInstrs.push_back(&MI);
break;
case AMDGPU::SI_RETURN_TO_EPILOG:
EpilogInstrs.push_back(&MI);
break;
default:
break;
}
}
}
for (MachineInstr *Instr : EarlyTermInstrs) {
// Early termination in GS does nothing
if (MF.getFunction().getCallingConv() != CallingConv::AMDGPU_GS)
earlyTerm(*Instr);
Instr->eraseFromParent();
// Lower any early exit branches first
if (!EarlyTermInstrs.empty()) {
MachineBasicBlock *EarlyExitBlock = MF.CreateMachineBasicBlock();
DebugLoc DL;
MF.insert(MF.end(), EarlyExitBlock);
BuildMI(*EarlyExitBlock, EarlyExitBlock->end(), DL, TII->get(MovOpc),
ExecReg)
.addImm(0);
generateEndPgm(*EarlyExitBlock, EarlyExitBlock->end(), DL, TII,
MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS);
for (MachineInstr *Instr : EarlyTermInstrs) {
// Early termination in GS does nothing
if (MF.getFunction().getCallingConv() != CallingConv::AMDGPU_GS)
earlyTerm(*Instr, EarlyExitBlock);
Instr->eraseFromParent();
}
EarlyTermInstrs.clear();
MadeChange = true;
}
// Now check return to epilog instructions occur at function end
if (!EpilogInstrs.empty()) {
MachineBasicBlock *EmptyMBBAtEnd = nullptr;
assert(!MF.getInfo<SIMachineFunctionInfo>()->returnsVoid());
// If there are multiple returns to epilog then all will
// become jumps to new empty end block.
if (EpilogInstrs.size() > 1) {
EmptyMBBAtEnd = MF.CreateMachineBasicBlock();
MF.insert(MF.end(), EmptyMBBAtEnd);
}
for (auto MI : EpilogInstrs) {
auto MBB = MI->getParent();
if (MBB == &MF.back() && MI == &MBB->back())
continue;
// SI_RETURN_TO_EPILOG is not the last instruction.
// Jump to empty block at function end.
if (!EmptyMBBAtEnd) {
EmptyMBBAtEnd = MF.CreateMachineBasicBlock();
MF.insert(MF.end(), EmptyMBBAtEnd);
}
MBB->addSuccessor(EmptyMBBAtEnd);
BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::S_BRANCH))
.addMBB(EmptyMBBAtEnd);
MI->eraseFromParent();
MadeChange = true;
}
EpilogInstrs.clear();
}
EarlyTermInstrs.clear();
EarlyExitBlock = nullptr;
return MadeChange;
}
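
A condensed sketch of the epilog handling that now lives at the end of runOnMachineFunction above (a paraphrase of the new code in the hunk, with the rationale carried over from the comment removed from SIPreEmitPeephole below), not additional code in the patch:

// Returns to the epilog must come last: graphics shaders that return
// non-void must not contain S_ENDPGM, because external bytecode is
// appended after the shader. Any SI_RETURN_TO_EPILOG that is not already
// the final instruction of the final block is turned into an S_BRANCH to
// an empty block inserted at the end of the function.
if (!EpilogInstrs.empty()) {
  MachineBasicBlock *EmptyMBBAtEnd = nullptr;
  assert(!MF.getInfo<SIMachineFunctionInfo>()->returnsVoid());

  // With more than one return, all of them branch to the same end block.
  if (EpilogInstrs.size() > 1) {
    EmptyMBBAtEnd = MF.CreateMachineBasicBlock();
    MF.insert(MF.end(), EmptyMBBAtEnd);
  }

  for (MachineInstr *MI : EpilogInstrs) {
    MachineBasicBlock *MBB = MI->getParent();
    if (MBB == &MF.back() && MI == &MBB->back())
      continue; // already the last instruction of the last block

    if (!EmptyMBBAtEnd) {
      EmptyMBBAtEnd = MF.CreateMachineBasicBlock();
      MF.insert(MF.end(), EmptyMBBAtEnd);
    }

    MBB->addSuccessor(EmptyMBBAtEnd);
    BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::S_BRANCH))
        .addMBB(EmptyMBBAtEnd);
    MI->eraseFromParent();
    MadeChange = true;
  }

  EpilogInstrs.clear();
}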

View File

@@ -14,7 +14,6 @@
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
using namespace llvm;
@@ -345,7 +344,6 @@ bool SIPreEmitPeephole::runOnMachineFunction(MachineFunction &MF) {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
TII = ST.getInstrInfo();
TRI = &TII->getRegisterInfo();
MachineBasicBlock *EmptyMBBAtEnd = nullptr;
bool Changed = false;
MF.RenumberBlocks();
@@ -368,34 +366,6 @@ bool SIPreEmitPeephole::runOnMachineFunction(MachineFunction &MF) {
break;
}
}
// Check all terminators for SI_RETURN_TO_EPILOG
// FIXME: This is not an optimization and should be moved somewhere else.
while (TermI != MBB.end()) {
MachineInstr &MI = *TermI;
if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) {
assert(!MF.getInfo<SIMachineFunctionInfo>()->returnsVoid());
// Graphics shaders returning non-void shouldn't contain S_ENDPGM,
// because external bytecode will be appended at the end.
if (&MBB != &MF.back() || &MI != &MBB.back()) {
// SI_RETURN_TO_EPILOG is not the last instruction. Add an empty block
// at the end and jump there.
if (!EmptyMBBAtEnd) {
EmptyMBBAtEnd = MF.CreateMachineBasicBlock();
MF.insert(MF.end(), EmptyMBBAtEnd);
}
MBB.addSuccessor(EmptyMBBAtEnd);
BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(AMDGPU::S_BRANCH))
.addMBB(EmptyMBBAtEnd);
MI.eraseFromParent();
MBBE = MBB.getFirstTerminator();
TermI = MBBE;
continue;
}
}
TermI++;
}
if (!ST.hasVGPRIndexMode())
continue;

View File

@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=si-insert-skips -verify-machineinstrs %s -o - | FileCheck %s
# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=si-late-branch-lowering -verify-machineinstrs %s -o - | FileCheck %s
--- |
define amdgpu_ps void @early_term_scc0_end_block() {

View File

@@ -50,7 +50,7 @@ end:
; CHECK-LABEL: only_kill
; CHECK: exp null off, off, off, off done vm
; CHECK-NEXT: s_endpgm
; SIInsertSkips inserts an extra null export here, but it should be harmless.
; SILateBranchLowering inserts an extra null export here, but it should be harmless.
; CHECK: exp null off, off, off, off done vm
; CHECK-NEXT: s_endpgm
define amdgpu_ps void @only_kill() #0 {

View File

@@ -1,4 +1,4 @@
# RUN: llc -o - %s -march=amdgcn -mcpu=fiji -run-pass=si-insert-skips -verify-machineinstrs | FileCheck -check-prefix=GCN %s
# RUN: llc -o - %s -march=amdgcn -mcpu=fiji -run-pass=si-late-branch-lowering -verify-machineinstrs | FileCheck -check-prefix=GCN %s
# GCN-LABEL: readlane_exec0
# GCN: bb.0

View File

@@ -1,4 +1,4 @@
# RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -start-before si-shrink-instructions -stop-before si-insert-skips -o - %s | FileCheck -check-prefix=GCN %s
# RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -start-before si-shrink-instructions -stop-before si-late-branch-lowering -o - %s | FileCheck -check-prefix=GCN %s
# GCN-LABEL: name: subbrev{{$}}
# GCN: V_SUBBREV_U32_e32 0, undef $vgpr0, implicit-def $vcc, implicit killed $vcc, implicit $exec

View File

@@ -1,4 +1,4 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -stop-after=si-insert-skips < %s | FileCheck --check-prefix=GCN %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -stop-after=si-late-branch-lowering < %s | FileCheck --check-prefix=GCN %s
; GCN-LABEL: name: syncscopes
; GCN: FLAT_STORE_DWORD killed renamable $vgpr1_vgpr2, killed renamable $vgpr0, 0, 0, implicit $exec, implicit $flat_scr :: (store syncscope("agent") seq_cst 4 into %ir.agent_out)

View File

@@ -203,7 +203,7 @@ static_library("LLVMAMDGPUCodeGen") {
"SIFrameLowering.cpp",
"SIISelLowering.cpp",
"SIInsertHardClauses.cpp",
"SIInsertSkips.cpp",
"SILateBranchLowering.cpp",
"SIInsertWaitcnts.cpp",
"SIInstrInfo.cpp",
"SILoadStoreOptimizer.cpp",