BranchRelaxation: Support expanding unconditional branches
AMDGPU needs to expand unconditional branches in a new block with an indirect branch.

llvm-svn: 283464

parent 8534eb14b7
commit 7acab19d31
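
In rough terms, when the 16-bit signed immediate of s_branch cannot reach the destination, the pass now places the jump in a fresh block that computes the target address explicitly. A minimal sketch of the emitted sequence (the register pair s[8:9] and the label names are illustrative, not taken from the patch):

    s_getpc_b64 s[8:9]                      ; address of the following instruction
    s_add_u32   s8, s8, dest_bb-post_getpc  ; s_sub_u32/s_subb_u32 for backward branches
    s_addc_u32  s9, s9, 0
    s_setpc_b64 s[8:9]                      ; indirect branch to dest_bb

The tests added at the end of this commit check for exactly this pattern.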
@@ -455,6 +455,19 @@ public:
    llvm_unreachable("target did not implement");
  }

  /// Insert an unconditional indirect branch at the end of \p MBB to \p
  /// NewDestBB. \p BrOffset indicates the offset of \p NewDestBB relative to
  /// the offset of the position to insert the new branch.
  ///
  /// \returns The number of bytes added to the block.
  virtual unsigned insertIndirectBranch(MachineBasicBlock &MBB,
                                        MachineBasicBlock &NewDestBB,
                                        const DebugLoc &DL,
                                        int64_t BrOffset = 0,
                                        RegScavenger *RS = nullptr) const {
    llvm_unreachable("target did not implement");
  }

  /// Analyze the branching code at the end of MBB, returning
  /// true if it cannot be understood (e.g. it's a switch dispatch or isn't
  /// implemented for a target). Upon success, this returns false and returns
@@ -11,6 +11,7 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#include "llvm/Support/Debug.h"
@@ -23,6 +24,7 @@ using namespace llvm;

STATISTIC(NumSplit, "Number of basic blocks split");
STATISTIC(NumConditionalRelaxed, "Number of conditional branches relaxed");
STATISTIC(NumUnconditionalRelaxed, "Number of unconditional branches relaxed");

#define BRANCH_RELAX_NAME "Branch relaxation pass"

@@ -66,17 +68,22 @@ class BranchRelaxation : public MachineFunctionPass {
  };

  SmallVector<BasicBlockInfo, 16> BlockInfo;
  std::unique_ptr<RegScavenger> RS;

  MachineFunction *MF;
  const TargetInstrInfo *TII;

  bool relaxBranchInstructions();
  void scanFunction();

  MachineBasicBlock *createNewBlockAfter(MachineBasicBlock &BB);

  MachineBasicBlock *splitBlockBeforeInstr(MachineInstr &MI);
  void adjustBlockOffsets(MachineBasicBlock &MBB);
  bool isBlockInRange(const MachineInstr &MI, const MachineBasicBlock &BB) const;

  bool fixupConditionalBranch(MachineInstr &MI);
  bool fixupUnconditionalBranch(MachineInstr &MI);
  uint64_t computeBlockSize(const MachineBasicBlock &MBB) const;
  unsigned getInstrOffset(const MachineInstr &MI) const;
  void dumpBBs();
@@ -182,6 +189,19 @@ void BranchRelaxation::adjustBlockOffsets(MachineBasicBlock &Start) {
  }
}

/// Insert a new empty basic block after \p BB.
MachineBasicBlock *BranchRelaxation::createNewBlockAfter(MachineBasicBlock &BB) {
  // Create a new MBB for the code after the OrigBB.
  MachineBasicBlock *NewBB =
    MF->CreateMachineBasicBlock(BB.getBasicBlock());
  MF->insert(++BB.getIterator(), NewBB);

  // Insert an entry into BlockInfo to align it properly with the block numbers.
  BlockInfo.insert(BlockInfo.begin() + NewBB->getNumber(), BasicBlockInfo());

  return NewBB;
}

/// Split the basic block containing MI into two blocks, which are joined by
/// an unconditional branch. Update data structures and renumber blocks to
/// account for this change and return the newly created block.
@@ -333,16 +353,55 @@ bool BranchRelaxation::fixupConditionalBranch(MachineInstr &MI) {
  return true;
}

bool BranchRelaxation::fixupUnconditionalBranch(MachineInstr &MI) {
  MachineBasicBlock *MBB = MI.getParent();

  unsigned OldBrSize = TII->getInstSizeInBytes(MI);
  MachineBasicBlock *DestBB = TII->getBranchDestBlock(MI);

  int64_t DestOffset = BlockInfo[DestBB->getNumber()].Offset;
  int64_t SrcOffset = getInstrOffset(MI);

  assert(!TII->isBranchOffsetInRange(MI.getOpcode(), DestOffset - SrcOffset));

  BlockInfo[MBB->getNumber()].Size -= OldBrSize;

  MachineBasicBlock *BranchBB = MBB;

  // If this was an expanded conditional branch, there is already a single
  // unconditional branch in a block.
  if (!MBB->empty()) {
    BranchBB = createNewBlockAfter(*MBB);

    // Add live outs.
    for (const MachineBasicBlock *Succ : MBB->successors()) {
      for (const MachineBasicBlock::RegisterMaskPair &LiveIn : Succ->liveins())
        BranchBB->addLiveIn(LiveIn);
    }

    BranchBB->addSuccessor(DestBB);
    MBB->replaceSuccessor(DestBB, BranchBB);
  }

  DebugLoc DL = MI.getDebugLoc();
  MI.eraseFromParent();

  // insertUnconditionalBranch may have inserted a new block.
  BlockInfo[MBB->getNumber()].Size += TII->insertIndirectBranch(
    *BranchBB, *DestBB, DL, DestOffset - SrcOffset, RS.get());

  computeBlockSize(*BranchBB);
  adjustBlockOffsets(*MBB);
  return true;
}

bool BranchRelaxation::relaxBranchInstructions() {
  bool Changed = false;

  // Relaxing branches involves creating new basic blocks, so re-eval
  // end() for termination.
  for (MachineFunction::iterator I = MF->begin(); I != MF->end(); ++I) {
    MachineBasicBlock &MBB = *I;
    MachineBasicBlock::iterator J = MBB.getFirstTerminator();
    if (J == MBB.end())
      continue;

    MachineBasicBlock::iterator Next;
    for (MachineBasicBlock::iterator J = MBB.getFirstTerminator();
@@ -377,6 +436,21 @@ bool BranchRelaxation::relaxBranchInstructions() {
          Next = MBB.getFirstTerminator();
        }
      }

      if (MI.isUnconditionalBranch()) {
        // Unconditional branch destination might be unanalyzable, assume these
        // are OK.
        if (MachineBasicBlock *DestBB = TII->getBranchDestBlock(MI)) {
          if (!isBlockInRange(MI, *DestBB)) {
            fixupUnconditionalBranch(MI);
            ++NumUnconditionalRelaxed;
            Changed = true;
          }
        }

        // Unconditional branch is the last terminator.
        break;
      }
    }
  }
@@ -388,7 +462,12 @@ bool BranchRelaxation::runOnMachineFunction(MachineFunction &mf) {

  DEBUG(dbgs() << "***** BranchRelaxation *****\n");

-  TII = MF->getSubtarget().getInstrInfo();
+  const TargetSubtargetInfo &ST = MF->getSubtarget();
+  TII = ST.getInstrInfo();
+
+  const TargetRegisterInfo *TRI = ST.getRegisterInfo();
+  if (TRI->trackLivenessAfterRegAlloc(*MF))
+    RS.reset(new RegScavenger());

  // Renumber all of the machine basic blocks in the function, guaranteeing that
  // the numbers agree with the position of the block in the function.
@@ -120,6 +120,21 @@ void AMDGPUAsmPrinter::EmitStartOfAsmFile(Module &M) {
  emitStartOfRuntimeMetadata(M);
}

bool AMDGPUAsmPrinter::isBlockOnlyReachableByFallthrough(
  const MachineBasicBlock *MBB) const {
  if (!AsmPrinter::isBlockOnlyReachableByFallthrough(MBB))
    return false;

  if (MBB->empty())
    return true;

  // If this is a block implementing a long branch, an expression relative to
  // the start of the block is needed.
  // XXX - Is there a smarter way to check this?
  return (MBB->back().getOpcode() != AMDGPU::S_SETPC_B64);
}

void AMDGPUAsmPrinter::EmitFunctionBodyStart() {
  const AMDGPUSubtarget &STM = MF->getSubtarget<AMDGPUSubtarget>();
  SIProgramInfo KernelInfo;
@@ -131,6 +131,9 @@ public:

  void EmitStartOfAsmFile(Module &M) override;

  bool isBlockOnlyReachableByFallthrough(
    const MachineBasicBlock *MBB) const override;

  bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
                       unsigned AsmVariant, const char *ExtraCode,
                       raw_ostream &O) override;
@@ -47,6 +47,27 @@ static MCSymbolRefExpr::VariantKind getVariantKind(unsigned MOFlags) {
  }
}

const MCExpr *AMDGPUMCInstLower::getLongBranchBlockExpr(
  const MachineBasicBlock &SrcBB,
  const MachineOperand &MO) const {
  const MCExpr *DestBBSym
    = MCSymbolRefExpr::create(MO.getMBB()->getSymbol(), Ctx);
  const MCExpr *SrcBBSym = MCSymbolRefExpr::create(SrcBB.getSymbol(), Ctx);

  assert(SrcBB.front().getOpcode() == AMDGPU::S_GETPC_B64 &&
         ST.getInstrInfo()->get(AMDGPU::S_GETPC_B64).Size == 4);

  // s_getpc_b64 returns the address of the next instruction.
  const MCConstantExpr *One = MCConstantExpr::create(4, Ctx);
  SrcBBSym = MCBinaryExpr::createAdd(SrcBBSym, One, Ctx);

  if (MO.getTargetFlags() == AMDGPU::TF_LONG_BRANCH_FORWARD)
    return MCBinaryExpr::createSub(DestBBSym, SrcBBSym, Ctx);

  assert(MO.getTargetFlags() == AMDGPU::TF_LONG_BRANCH_BACKWARD);
  return MCBinaryExpr::createSub(SrcBBSym, DestBBSym, Ctx);
}

void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {

  int MCOpcode = ST.getInstrInfo()->pseudoToMCOpcode(MI->getOpcode());
@@ -71,8 +92,14 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
      MCOp = MCOperand::createReg(AMDGPU::getMCReg(MO.getReg(), ST));
      break;
    case MachineOperand::MO_MachineBasicBlock:
-     MCOp = MCOperand::createExpr(MCSymbolRefExpr::create(
-       MO.getMBB()->getSymbol(), Ctx));
+     if (MO.getTargetFlags() != 0) {
+       MCOp = MCOperand::createExpr(
+         getLongBranchBlockExpr(*MI->getParent(), MO));
+     } else {
+       MCOp = MCOperand::createExpr(
+         MCSymbolRefExpr::create(MO.getMBB()->getSymbol(), Ctx));
+     }
+
      break;
    case MachineOperand::MO_GlobalAddress: {
      const GlobalValue *GV = MO.getGlobal();
@@ -93,6 +120,10 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
      MCOp = MCOperand::createExpr(Expr);
      break;
    }
    case MachineOperand::MO_MCSymbol:
      MCOp = MCOperand::createExpr(
        MCSymbolRefExpr::create(MO.getMCSymbol(), Ctx));
      break;
    }
    OutMI.addOperand(MCOp);
  }
@@ -14,8 +14,11 @@ namespace llvm {

class AMDGPUSubtarget;
class AsmPrinter;
class MachineBasicBlock;
class MachineInstr;
class MachineOperand;
class MCContext;
class MCExpr;
class MCInst;

class AMDGPUMCInstLower {
@@ -23,6 +26,9 @@ class AMDGPUMCInstLower {
  const AMDGPUSubtarget &ST;
  const AsmPrinter &AP;

  const MCExpr *getLongBranchBlockExpr(const MachineBasicBlock &SrcBB,
                                       const MachineOperand &MO) const;

public:
  AMDGPUMCInstLower(MCContext &ctx, const AMDGPUSubtarget &ST,
                    const AsmPrinter &AP);
@@ -602,6 +602,7 @@ void GCNPassConfig::addPreEmitPass() {
  addPass(createSIShrinkInstructionsPass());
  addPass(&SIInsertSkipsPassID);
  addPass(createSIDebuggerInsertNopsPass());
  addPass(&BranchRelaxationPassID);
}

TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {
@@ -38,12 +38,15 @@ unsigned AMDGPUELFObjectWriter::getRelocType(MCContext &Ctx,
                                             const MCValue &Target,
                                             const MCFixup &Fixup,
                                             bool IsPCRel) const {
-  // SCRATCH_RSRC_DWORD[01] is a special global variable that represents
-  // the scratch buffer.
-  if (Target.getSymA()->getSymbol().getName() == "SCRATCH_RSRC_DWORD0")
-    return ELF::R_AMDGPU_ABS32_LO;
-  if (Target.getSymA()->getSymbol().getName() == "SCRATCH_RSRC_DWORD1")
-    return ELF::R_AMDGPU_ABS32_HI;
+  if (const auto *SymA = Target.getSymA()) {
+    // SCRATCH_RSRC_DWORD[01] is a special global variable that represents
+    // the scratch buffer.
+    if (SymA->getSymbol().getName() == "SCRATCH_RSRC_DWORD0")
+      return ELF::R_AMDGPU_ABS32_LO;
+
+    if (SymA->getSymbol().getName() == "SCRATCH_RSRC_DWORD1")
+      return ELF::R_AMDGPU_ABS32_HI;
+  }

  switch (Target.getAccessVariant()) {
  default:
@@ -28,6 +28,13 @@

using namespace llvm;

// Must be at least 4 to be able to branch over minimum unconditional branch
// code. This is only for making it possible to write reasonably small tests for
// long branches.
static cl::opt<unsigned>
BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
                 cl::desc("Restrict range of branch instructions (DEBUG)"));

SIInstrInfo::SIInstrInfo(const SISubtarget &ST)
  : AMDGPUInstrInfo(ST), RI(), ST(ST) {}

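The tests below use this escape hatch to keep long-branch test cases small; for example, mirroring the RUN lines added in this commit (input.ll is a placeholder name):

    llc -march=amdgcn -verify-machineinstrs -amdgpu-s-branch-bits=4 < input.ll

With only 4 offset bits, any branch landing more than +7 or -8 dwords from the next instruction must be relaxed.
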
@@ -1045,6 +1052,128 @@ bool SIInstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx0,
  return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
}

bool SIInstrInfo::isBranchOffsetInRange(unsigned BranchOp,
                                        int64_t BrOffset) const {
  // BranchRelaxation should never have to check s_setpc_b64 because its dest
  // block is unanalyzable.
  assert(BranchOp != AMDGPU::S_SETPC_B64);

  // Convert to dwords.
  BrOffset /= 4;

  // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is
  // from the next instruction.
  BrOffset -= 1;

  return isIntN(BranchOffsetBits, BrOffset);
}

MachineBasicBlock *SIInstrInfo::getBranchDestBlock(
  const MachineInstr &MI) const {
  if (MI.getOpcode() == AMDGPU::S_SETPC_B64) {
    // This would be a difficult analysis to perform, but can always be legal so
    // there's no need to analyze it.
    return nullptr;
  }

  return MI.getOperand(0).getMBB();
}

unsigned SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
                                           MachineBasicBlock &DestBB,
                                           const DebugLoc &DL,
                                           int64_t BrOffset,
                                           RegScavenger *RS) const {
  assert(RS && "RegScavenger required for long branching");
  assert(MBB.empty() &&
         "new block should be inserted for expanding unconditional branch");
  assert(MBB.pred_size() == 1);

  MachineFunction *MF = MBB.getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();

  // FIXME: Virtual register workaround for RegScavenger not working with empty
  // blocks.
  unsigned PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);

  auto I = MBB.end();

  // We need to compute the offset relative to the instruction immediately after
  // s_getpc_b64. Insert pc arithmetic code before last terminator.
  MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg);

  // TODO: Handle > 32-bit block address.
  if (BrOffset >= 0) {
    BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32))
      .addReg(PCReg, RegState::Define, AMDGPU::sub0)
      .addReg(PCReg, 0, AMDGPU::sub0)
      .addMBB(&DestBB, AMDGPU::TF_LONG_BRANCH_FORWARD);
    BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32))
      .addReg(PCReg, RegState::Define, AMDGPU::sub1)
      .addReg(PCReg, 0, AMDGPU::sub1)
      .addImm(0);
  } else {
    // Backwards branch.
    BuildMI(MBB, I, DL, get(AMDGPU::S_SUB_U32))
      .addReg(PCReg, RegState::Define, AMDGPU::sub0)
      .addReg(PCReg, 0, AMDGPU::sub0)
      .addMBB(&DestBB, AMDGPU::TF_LONG_BRANCH_BACKWARD);
    BuildMI(MBB, I, DL, get(AMDGPU::S_SUBB_U32))
      .addReg(PCReg, RegState::Define, AMDGPU::sub1)
      .addReg(PCReg, 0, AMDGPU::sub1)
      .addImm(0);
  }

  // Insert the indirect branch after the other terminator.
  BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64))
    .addReg(PCReg);

  // FIXME: If spilling is necessary, this will fail because this scavenger has
  // no emergency stack slots. It is non-trivial to spill in this situation,
  // because the restore code needs to be specially placed after the
  // jump. BranchRelaxation then needs to be made aware of the newly inserted
  // block.
  //
  // If a spill is needed for the pc register pair, we need to insert a spill
  // restore block right before the destination block, and insert a short branch
  // into the old destination block's fallthrough predecessor.
  // e.g.:
  //
  // s_cbranch_scc0 skip_long_branch:
  //
  // long_branch_bb:
  //   spill s[8:9]
  //   s_getpc_b64 s[8:9]
  //   s_add_u32 s8, s8, restore_bb
  //   s_addc_u32 s9, s9, 0
  //   s_setpc_b64 s[8:9]
  //
  // skip_long_branch:
  //   foo;
  //
  // .....
  //
  // dest_bb_fallthrough_predecessor:
  //   bar;
  //   s_branch dest_bb
  //
  // restore_bb:
  //   restore s[8:9]
  //   fallthrough dest_bb
  //
  // dest_bb:
  //   buzz;

  RS->enterBasicBlockEnd(MBB);
  unsigned Scav = RS->scavengeRegister(&AMDGPU::SReg_64RegClass,
                                       MachineBasicBlock::iterator(GetPC), 0);
  MRI.replaceRegWith(PCReg, Scav);
  MRI.clearVirtRegs();
  RS->setRegUsed(Scav);

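  // Breakdown of the size returned below (inferred; the commit does not spell
  // it out): s_getpc_b64 (4 bytes) + s_add_u32/s_sub_u32 with a 32-bit
  // literal (8) + s_addc_u32/s_subb_u32 with an inline 0 (4) + s_setpc_b64
  // (4) = 20 bytes.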
  return 4 + 8 + 4 + 4;
}

unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
  switch (Cond) {
  case SIInstrInfo::SCC_TRUE:
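As a standalone sanity check of the offset arithmetic in isBranchOffsetInRange, here is a minimal sketch (not part of the commit; branchOffsetInRange is a hypothetical stand-in, and the open-coded range test mirrors what llvm::isIntN does):

    #include <cassert>
    #include <cstdint>

    // Byte offset of the branch target -> dword offset from the next
    // instruction, since the hardware computes PC += signext(SIMM16 * 4) + 4.
    static bool branchOffsetInRange(int64_t BrOffsetBytes, unsigned OffsetBits) {
      int64_t Dwords = BrOffsetBytes / 4 - 1;
      int64_t Lo = -(INT64_C(1) << (OffsetBits - 1));    // smallest signed value
      int64_t Hi = (INT64_C(1) << (OffsetBits - 1)) - 1; // largest signed value
      return Dwords >= Lo && Dwords <= Hi;
    }

    int main() {
      // With -amdgpu-s-branch-bits=4, the reachable window is [-8, +7] dwords.
      assert(branchOffsetInRange(32, 4));  // 32/4 - 1 = +7 -> reachable
      assert(!branchOffsetInRange(36, 4)); // 36/4 - 1 = +8 -> must be relaxed
      return 0;
    }

This matches the comment in the branch-relaxation.ll test below: "Restrict maximum branch to between +7 and -8 dwords".
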
@@ -1083,15 +1212,12 @@ SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
  }
}

-bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
-                                MachineBasicBlock *&FBB,
-                                SmallVectorImpl<MachineOperand> &Cond,
-                                bool AllowModify) const {
-  MachineBasicBlock::iterator I = MBB.getFirstTerminator();
-
-  if (I == MBB.end())
-    return false;
-
+bool SIInstrInfo::analyzeBranchImpl(MachineBasicBlock &MBB,
+                                    MachineBasicBlock::iterator I,
+                                    MachineBasicBlock *&TBB,
+                                    MachineBasicBlock *&FBB,
+                                    SmallVectorImpl<MachineOperand> &Cond,
+                                    bool AllowModify) const {
  if (I->getOpcode() == AMDGPU::S_BRANCH) {
    // Unconditional Branch
    TBB = I->getOperand(0).getMBB();
@@ -1122,6 +1248,44 @@ bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
  return true;
}

bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
                                MachineBasicBlock *&FBB,
                                SmallVectorImpl<MachineOperand> &Cond,
                                bool AllowModify) const {
  MachineBasicBlock::iterator I = MBB.getFirstTerminator();
  if (I == MBB.end())
    return false;

  if (I->getOpcode() != AMDGPU::SI_MASK_BRANCH)
    return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);

  ++I;

  // TODO: Should be able to treat as fallthrough?
  if (I == MBB.end())
    return true;

  if (analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify))
    return true;

  MachineBasicBlock *MaskBrDest = I->getOperand(0).getMBB();

  // Specifically handle the case where the conditional branch is to the same
  // destination as the mask branch. e.g.
  //
  // si_mask_branch BB8
  // s_cbranch_execz BB8
  // s_cbranch BB9
  //
  // This is required to understand divergent loops which may need the branches
  // to be relaxed.
  if (TBB != MaskBrDest || Cond.empty())
    return true;

  auto Pred = Cond[0].getImm();
  return (Pred != EXECZ && Pred != EXECNZ);
}

unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB,
                                   int *BytesRemoved) const {
  MachineBasicBlock::iterator I = MBB.getFirstTerminator();
@@ -1130,6 +1294,11 @@ unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB,
  unsigned RemovedSize = 0;
  while (I != MBB.end()) {
    MachineBasicBlock::iterator Next = std::next(I);
    if (I->getOpcode() == AMDGPU::SI_MASK_BRANCH) {
      I = Next;
      continue;
    }

    RemovedSize += getInstSizeInBytes(*I);
    I->eraseFromParent();
    ++Count;
@@ -158,6 +158,24 @@ public:
  bool findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1,
                             unsigned &SrcOpIdx2) const override;

  bool isBranchOffsetInRange(unsigned BranchOpc,
                             int64_t BrOffset) const override;

  MachineBasicBlock *getBranchDestBlock(const MachineInstr &MI) const override;

  unsigned insertIndirectBranch(MachineBasicBlock &MBB,
                                MachineBasicBlock &NewDestBB,
                                const DebugLoc &DL,
                                int64_t BrOffset,
                                RegScavenger *RS = nullptr) const override;

  bool analyzeBranchImpl(MachineBasicBlock &MBB,
                         MachineBasicBlock::iterator I,
                         MachineBasicBlock *&TBB,
                         MachineBasicBlock *&FBB,
                         SmallVectorImpl<MachineOperand> &Cond,
                         bool AllowModify) const;

  bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
                     MachineBasicBlock *&FBB,
                     SmallVectorImpl<MachineOperand> &Cond,
@@ -618,6 +636,12 @@ namespace AMDGPU {
  const uint64_t RSRC_ELEMENT_SIZE_SHIFT = (32 + 19);
  const uint64_t RSRC_INDEX_STRIDE_SHIFT = (32 + 21);
  const uint64_t RSRC_TID_ENABLE = UINT64_C(1) << (32 + 23);

  // For MachineOperands.
  enum TargetFlags {
    TF_LONG_BRANCH_FORWARD = 1 << 0,
    TF_LONG_BRANCH_BACKWARD = 1 << 1
  };
} // End namespace AMDGPU

namespace SI {
@@ -25,6 +25,7 @@ class SOP1_Pseudo <string opName, dag outs, dag ins,
  let SALU = 1;
  let SOP1 = 1;
  let SchedRW = [WriteSALU];
  let Size = 4;
  let UseNamedOperandTable = 1;

  string Mnemonic = opName;
@@ -41,6 +42,7 @@ class SOP1_Real<bits<8> op, SOP1_Pseudo ps> :

  let isPseudo = 0;
  let isCodeGenOnly = 0;
  let Size = 4;

  // copy relevant pseudo op flags
  let SubtargetPredicate = ps.SubtargetPredicate;
test/CodeGen/AMDGPU/branch-relax-spill.ll (new file, 238 lines)
@@ -0,0 +1,238 @@
; RUN: not llc -march=amdgcn -verify-machineinstrs -amdgpu-s-branch-bits=4 < %s 2>&1 | FileCheck -check-prefix=FAIL %s

; FIXME: This should be able to compile, but requires inserting an
; extra block to restore the scavenged register.

; FAIL: LLVM ERROR: Error while trying to spill VCC from class SReg_64: Cannot scavenge register without an emergency spill slot!

define void @spill(i32 addrspace(1)* %arg, i32 %cnd) #0 {
entry:
  %sgpr0 = tail call i32 asm sideeffect "s_mov_b32 s0, 0", "={SGPR0}"() #0
  %sgpr1 = tail call i32 asm sideeffect "s_mov_b32 s1, 0", "={SGPR1}"() #0
  %sgpr2 = tail call i32 asm sideeffect "s_mov_b32 s2, 0", "={SGPR2}"() #0
  %sgpr3 = tail call i32 asm sideeffect "s_mov_b32 s3, 0", "={SGPR3}"() #0
  %sgpr4 = tail call i32 asm sideeffect "s_mov_b32 s4, 0", "={SGPR4}"() #0
  %sgpr5 = tail call i32 asm sideeffect "s_mov_b32 s5, 0", "={SGPR5}"() #0
  %sgpr6 = tail call i32 asm sideeffect "s_mov_b32 s6, 0", "={SGPR6}"() #0
  %sgpr7 = tail call i32 asm sideeffect "s_mov_b32 s7, 0", "={SGPR7}"() #0
  %sgpr8 = tail call i32 asm sideeffect "s_mov_b32 s8, 0", "={SGPR8}"() #0
  %sgpr9 = tail call i32 asm sideeffect "s_mov_b32 s9, 0", "={SGPR9}"() #0
  %sgpr10 = tail call i32 asm sideeffect "s_mov_b32 s10, 0", "={SGPR10}"() #0
  %sgpr11 = tail call i32 asm sideeffect "s_mov_b32 s11, 0", "={SGPR11}"() #0
  %sgpr12 = tail call i32 asm sideeffect "s_mov_b32 s12, 0", "={SGPR12}"() #0
  %sgpr13 = tail call i32 asm sideeffect "s_mov_b32 s13, 0", "={SGPR13}"() #0
  %sgpr14 = tail call i32 asm sideeffect "s_mov_b32 s14, 0", "={SGPR14}"() #0
  %sgpr15 = tail call i32 asm sideeffect "s_mov_b32 s15, 0", "={SGPR15}"() #0
  %sgpr16 = tail call i32 asm sideeffect "s_mov_b32 s16, 0", "={SGPR16}"() #0
  %sgpr17 = tail call i32 asm sideeffect "s_mov_b32 s17, 0", "={SGPR17}"() #0
  %sgpr18 = tail call i32 asm sideeffect "s_mov_b32 s18, 0", "={SGPR18}"() #0
  %sgpr19 = tail call i32 asm sideeffect "s_mov_b32 s19, 0", "={SGPR19}"() #0
  %sgpr20 = tail call i32 asm sideeffect "s_mov_b32 s20, 0", "={SGPR20}"() #0
  %sgpr21 = tail call i32 asm sideeffect "s_mov_b32 s21, 0", "={SGPR21}"() #0
  %sgpr22 = tail call i32 asm sideeffect "s_mov_b32 s22, 0", "={SGPR22}"() #0
  %sgpr23 = tail call i32 asm sideeffect "s_mov_b32 s23, 0", "={SGPR23}"() #0
  %sgpr24 = tail call i32 asm sideeffect "s_mov_b32 s24, 0", "={SGPR24}"() #0
  %sgpr25 = tail call i32 asm sideeffect "s_mov_b32 s25, 0", "={SGPR25}"() #0
  %sgpr26 = tail call i32 asm sideeffect "s_mov_b32 s26, 0", "={SGPR26}"() #0
  %sgpr27 = tail call i32 asm sideeffect "s_mov_b32 s27, 0", "={SGPR27}"() #0
  %sgpr28 = tail call i32 asm sideeffect "s_mov_b32 s28, 0", "={SGPR28}"() #0
  %sgpr29 = tail call i32 asm sideeffect "s_mov_b32 s29, 0", "={SGPR29}"() #0
  %sgpr30 = tail call i32 asm sideeffect "s_mov_b32 s30, 0", "={SGPR30}"() #0
  %sgpr31 = tail call i32 asm sideeffect "s_mov_b32 s31, 0", "={SGPR31}"() #0
  %sgpr32 = tail call i32 asm sideeffect "s_mov_b32 s32, 0", "={SGPR32}"() #0
  %sgpr33 = tail call i32 asm sideeffect "s_mov_b32 s33, 0", "={SGPR33}"() #0
  %sgpr34 = tail call i32 asm sideeffect "s_mov_b32 s34, 0", "={SGPR34}"() #0
  %sgpr35 = tail call i32 asm sideeffect "s_mov_b32 s35, 0", "={SGPR35}"() #0
  %sgpr36 = tail call i32 asm sideeffect "s_mov_b32 s36, 0", "={SGPR36}"() #0
  %sgpr37 = tail call i32 asm sideeffect "s_mov_b32 s37, 0", "={SGPR37}"() #0
  %sgpr38 = tail call i32 asm sideeffect "s_mov_b32 s38, 0", "={SGPR38}"() #0
  %sgpr39 = tail call i32 asm sideeffect "s_mov_b32 s39, 0", "={SGPR39}"() #0
  %sgpr40 = tail call i32 asm sideeffect "s_mov_b32 s40, 0", "={SGPR40}"() #0
  %sgpr41 = tail call i32 asm sideeffect "s_mov_b32 s41, 0", "={SGPR41}"() #0
  %sgpr42 = tail call i32 asm sideeffect "s_mov_b32 s42, 0", "={SGPR42}"() #0
  %sgpr43 = tail call i32 asm sideeffect "s_mov_b32 s43, 0", "={SGPR43}"() #0
  %sgpr44 = tail call i32 asm sideeffect "s_mov_b32 s44, 0", "={SGPR44}"() #0
  %sgpr45 = tail call i32 asm sideeffect "s_mov_b32 s45, 0", "={SGPR45}"() #0
  %sgpr46 = tail call i32 asm sideeffect "s_mov_b32 s46, 0", "={SGPR46}"() #0
  %sgpr47 = tail call i32 asm sideeffect "s_mov_b32 s47, 0", "={SGPR47}"() #0
  %sgpr48 = tail call i32 asm sideeffect "s_mov_b32 s48, 0", "={SGPR48}"() #0
  %sgpr49 = tail call i32 asm sideeffect "s_mov_b32 s49, 0", "={SGPR49}"() #0
  %sgpr50 = tail call i32 asm sideeffect "s_mov_b32 s50, 0", "={SGPR50}"() #0
  %sgpr51 = tail call i32 asm sideeffect "s_mov_b32 s51, 0", "={SGPR51}"() #0
  %sgpr52 = tail call i32 asm sideeffect "s_mov_b32 s52, 0", "={SGPR52}"() #0
  %sgpr53 = tail call i32 asm sideeffect "s_mov_b32 s53, 0", "={SGPR53}"() #0
  %sgpr54 = tail call i32 asm sideeffect "s_mov_b32 s54, 0", "={SGPR54}"() #0
  %sgpr55 = tail call i32 asm sideeffect "s_mov_b32 s55, 0", "={SGPR55}"() #0
  %sgpr56 = tail call i32 asm sideeffect "s_mov_b32 s56, 0", "={SGPR56}"() #0
  %sgpr57 = tail call i32 asm sideeffect "s_mov_b32 s57, 0", "={SGPR57}"() #0
  %sgpr58 = tail call i32 asm sideeffect "s_mov_b32 s58, 0", "={SGPR58}"() #0
  %sgpr59 = tail call i32 asm sideeffect "s_mov_b32 s59, 0", "={SGPR59}"() #0
  %sgpr60 = tail call i32 asm sideeffect "s_mov_b32 s60, 0", "={SGPR60}"() #0
  %sgpr61 = tail call i32 asm sideeffect "s_mov_b32 s61, 0", "={SGPR61}"() #0
  %sgpr62 = tail call i32 asm sideeffect "s_mov_b32 s62, 0", "={SGPR62}"() #0
  %sgpr63 = tail call i32 asm sideeffect "s_mov_b32 s63, 0", "={SGPR63}"() #0
  %sgpr64 = tail call i32 asm sideeffect "s_mov_b32 s64, 0", "={SGPR64}"() #0
  %sgpr65 = tail call i32 asm sideeffect "s_mov_b32 s65, 0", "={SGPR65}"() #0
  %sgpr66 = tail call i32 asm sideeffect "s_mov_b32 s66, 0", "={SGPR66}"() #0
  %sgpr67 = tail call i32 asm sideeffect "s_mov_b32 s67, 0", "={SGPR67}"() #0
  %sgpr68 = tail call i32 asm sideeffect "s_mov_b32 s68, 0", "={SGPR68}"() #0
  %sgpr69 = tail call i32 asm sideeffect "s_mov_b32 s69, 0", "={SGPR69}"() #0
  %sgpr70 = tail call i32 asm sideeffect "s_mov_b32 s70, 0", "={SGPR70}"() #0
  %sgpr71 = tail call i32 asm sideeffect "s_mov_b32 s71, 0", "={SGPR71}"() #0
  %sgpr72 = tail call i32 asm sideeffect "s_mov_b32 s72, 0", "={SGPR72}"() #0
  %sgpr73 = tail call i32 asm sideeffect "s_mov_b32 s73, 0", "={SGPR73}"() #0
  %sgpr74 = tail call i32 asm sideeffect "s_mov_b32 s74, 0", "={SGPR74}"() #0
  %sgpr75 = tail call i32 asm sideeffect "s_mov_b32 s75, 0", "={SGPR75}"() #0
  %sgpr76 = tail call i32 asm sideeffect "s_mov_b32 s76, 0", "={SGPR76}"() #0
  %sgpr77 = tail call i32 asm sideeffect "s_mov_b32 s77, 0", "={SGPR77}"() #0
  %sgpr78 = tail call i32 asm sideeffect "s_mov_b32 s78, 0", "={SGPR78}"() #0
  %sgpr79 = tail call i32 asm sideeffect "s_mov_b32 s79, 0", "={SGPR79}"() #0
  %sgpr80 = tail call i32 asm sideeffect "s_mov_b32 s80, 0", "={SGPR80}"() #0
  %sgpr81 = tail call i32 asm sideeffect "s_mov_b32 s81, 0", "={SGPR81}"() #0
  %sgpr82 = tail call i32 asm sideeffect "s_mov_b32 s82, 0", "={SGPR82}"() #0
  %sgpr83 = tail call i32 asm sideeffect "s_mov_b32 s83, 0", "={SGPR83}"() #0
  %sgpr84 = tail call i32 asm sideeffect "s_mov_b32 s84, 0", "={SGPR84}"() #0
  %sgpr85 = tail call i32 asm sideeffect "s_mov_b32 s85, 0", "={SGPR85}"() #0
  %sgpr86 = tail call i32 asm sideeffect "s_mov_b32 s86, 0", "={SGPR86}"() #0
  %sgpr87 = tail call i32 asm sideeffect "s_mov_b32 s87, 0", "={SGPR87}"() #0
  %sgpr88 = tail call i32 asm sideeffect "s_mov_b32 s88, 0", "={SGPR88}"() #0
  %sgpr89 = tail call i32 asm sideeffect "s_mov_b32 s89, 0", "={SGPR89}"() #0
  %sgpr90 = tail call i32 asm sideeffect "s_mov_b32 s90, 0", "={SGPR90}"() #0
  %sgpr91 = tail call i32 asm sideeffect "s_mov_b32 s91, 0", "={SGPR91}"() #0
  %sgpr92 = tail call i32 asm sideeffect "s_mov_b32 s92, 0", "={SGPR92}"() #0
  %sgpr93 = tail call i32 asm sideeffect "s_mov_b32 s93, 0", "={SGPR93}"() #0
  %sgpr94 = tail call i32 asm sideeffect "s_mov_b32 s94, 0", "={SGPR94}"() #0
  %sgpr95 = tail call i32 asm sideeffect "s_mov_b32 s95, 0", "={SGPR95}"() #0
  %sgpr96 = tail call i32 asm sideeffect "s_mov_b32 s96, 0", "={SGPR96}"() #0
  %sgpr97 = tail call i32 asm sideeffect "s_mov_b32 s97, 0", "={SGPR97}"() #0
  %sgpr98 = tail call i32 asm sideeffect "s_mov_b32 s98, 0", "={SGPR98}"() #0
  %sgpr99 = tail call i32 asm sideeffect "s_mov_b32 s99, 0", "={SGPR99}"() #0
  %sgpr100 = tail call i32 asm sideeffect "s_mov_b32 s100, 0", "={SGPR100}"() #0
  %sgpr101 = tail call i32 asm sideeffect "s_mov_b32 s101, 0", "={SGPR101}"() #0
  %sgpr102 = tail call i32 asm sideeffect "s_mov_b32 s102, 0", "={SGPR102}"() #0
  %sgpr103 = tail call i32 asm sideeffect "s_mov_b32 s103, 0", "={SGPR103}"() #0
  %vcc_lo = tail call i32 asm sideeffect "s_mov_b32 $0, 0", "={VCC_LO}"() #0
  %vcc_hi = tail call i32 asm sideeffect "s_mov_b32 $0, 0", "={VCC_HI}"() #0
  %cmp = icmp eq i32 %cnd, 0
  br i1 %cmp, label %bb3, label %bb2 ; +8 dword branch

bb2: ; 28 bytes
  ; 24 byte asm
  call void asm sideeffect
   "v_nop_e64
    v_nop_e64
    v_nop_e64
    v_nop_e64",""() #0
  br label %bb3

bb3:
  tail call void asm sideeffect "; reg use $0", "{SGPR0}"(i32 %sgpr0) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR1}"(i32 %sgpr1) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR2}"(i32 %sgpr2) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR3}"(i32 %sgpr3) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR4}"(i32 %sgpr4) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR5}"(i32 %sgpr5) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR6}"(i32 %sgpr6) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR7}"(i32 %sgpr7) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR8}"(i32 %sgpr8) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR9}"(i32 %sgpr9) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR10}"(i32 %sgpr10) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR11}"(i32 %sgpr11) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR12}"(i32 %sgpr12) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR13}"(i32 %sgpr13) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR14}"(i32 %sgpr14) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR15}"(i32 %sgpr15) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR16}"(i32 %sgpr16) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR17}"(i32 %sgpr17) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR18}"(i32 %sgpr18) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR19}"(i32 %sgpr19) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR20}"(i32 %sgpr20) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR21}"(i32 %sgpr21) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR22}"(i32 %sgpr22) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR23}"(i32 %sgpr23) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR24}"(i32 %sgpr24) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR25}"(i32 %sgpr25) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR26}"(i32 %sgpr26) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR27}"(i32 %sgpr27) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR28}"(i32 %sgpr28) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR29}"(i32 %sgpr29) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR30}"(i32 %sgpr30) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR31}"(i32 %sgpr31) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR32}"(i32 %sgpr32) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR33}"(i32 %sgpr33) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR34}"(i32 %sgpr34) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR35}"(i32 %sgpr35) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR36}"(i32 %sgpr36) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR37}"(i32 %sgpr37) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR38}"(i32 %sgpr38) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR39}"(i32 %sgpr39) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR40}"(i32 %sgpr40) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR41}"(i32 %sgpr41) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR42}"(i32 %sgpr42) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR43}"(i32 %sgpr43) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR44}"(i32 %sgpr44) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR45}"(i32 %sgpr45) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR46}"(i32 %sgpr46) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR47}"(i32 %sgpr47) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR48}"(i32 %sgpr48) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR49}"(i32 %sgpr49) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR50}"(i32 %sgpr50) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR51}"(i32 %sgpr51) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR52}"(i32 %sgpr52) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR53}"(i32 %sgpr53) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR54}"(i32 %sgpr54) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR55}"(i32 %sgpr55) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR56}"(i32 %sgpr56) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR57}"(i32 %sgpr57) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR58}"(i32 %sgpr58) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR59}"(i32 %sgpr59) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR60}"(i32 %sgpr60) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR61}"(i32 %sgpr61) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR62}"(i32 %sgpr62) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR63}"(i32 %sgpr63) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR64}"(i32 %sgpr64) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR65}"(i32 %sgpr65) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR66}"(i32 %sgpr66) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR67}"(i32 %sgpr67) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR68}"(i32 %sgpr68) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR69}"(i32 %sgpr69) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR70}"(i32 %sgpr70) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR71}"(i32 %sgpr71) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR72}"(i32 %sgpr72) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR73}"(i32 %sgpr73) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR74}"(i32 %sgpr74) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR75}"(i32 %sgpr75) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR76}"(i32 %sgpr76) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR77}"(i32 %sgpr77) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR78}"(i32 %sgpr78) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR79}"(i32 %sgpr79) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR80}"(i32 %sgpr80) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR81}"(i32 %sgpr81) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR82}"(i32 %sgpr82) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR83}"(i32 %sgpr83) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR84}"(i32 %sgpr84) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR85}"(i32 %sgpr85) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR86}"(i32 %sgpr86) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR87}"(i32 %sgpr87) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR88}"(i32 %sgpr88) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR89}"(i32 %sgpr89) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR90}"(i32 %sgpr90) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR91}"(i32 %sgpr91) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR92}"(i32 %sgpr92) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR93}"(i32 %sgpr93) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR94}"(i32 %sgpr94) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR95}"(i32 %sgpr95) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR96}"(i32 %sgpr96) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR97}"(i32 %sgpr97) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR98}"(i32 %sgpr98) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR99}"(i32 %sgpr99) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR100}"(i32 %sgpr100) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR101}"(i32 %sgpr101) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR102}"(i32 %sgpr102) #0
  tail call void asm sideeffect "; reg use $0", "{SGPR103}"(i32 %sgpr103) #0
  tail call void asm sideeffect "; reg use $0", "{VCC_LO}"(i32 %vcc_lo) #0
  tail call void asm sideeffect "; reg use $0", "{VCC_HI}"(i32 %vcc_hi) #0
  ret void
}

attributes #0 = { nounwind }
test/CodeGen/AMDGPU/branch-relaxation.ll (new file, 479 lines)
@@ -0,0 +1,479 @@
; RUN: llc -march=amdgcn -verify-machineinstrs -amdgpu-s-branch-bits=4 < %s | FileCheck -check-prefix=GCN %s
; Restrict maximum branch to between +7 and -8 dwords

; Used to emit an instruction that is always 4 bytes. Inline asm is always
; assumed to be the maximum possible instruction size.
declare void @llvm.amdgcn.s.sleep(i32) #0

declare i32 @llvm.amdgcn.workitem.id.x() #1


; GCN-LABEL: {{^}}uniform_conditional_max_short_forward_branch:
; GCN: s_load_dword [[CND:s[0-9]+]]
; GCN: s_cmp_eq_u32 [[CND]], 0
; GCN-NEXT: s_cbranch_scc1 [[BB3:BB[0-9]+_[0-9]+]]

; GCN-NEXT: ; BB#1: ; %bb2
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: v_nop_e64
; GCN-NEXT: v_nop_e64
; GCN-NEXT: v_nop_e64
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: s_sleep 0

; GCN-NEXT: [[BB3]]: ; %bb3
; GCN: v_mov_b32_e32 [[V_CND:v[0-9]+]], [[CND]]
; GCN: buffer_store_dword [[V_CND]]
; GCN: s_endpgm
define void @uniform_conditional_max_short_forward_branch(i32 addrspace(1)* %arg, i32 %cnd) #0 {
bb:
  %cmp = icmp eq i32 %cnd, 0
  br i1 %cmp, label %bb3, label %bb2 ; +8 dword branch

bb2:
  ; 24 bytes
  call void asm sideeffect
   "v_nop_e64
    v_nop_e64
    v_nop_e64", ""() #0
  call void @llvm.amdgcn.s.sleep(i32 0)
  br label %bb3

bb3:
  store volatile i32 %cnd, i32 addrspace(1)* %arg
  ret void
}

; GCN-LABEL: {{^}}uniform_conditional_min_long_forward_branch:
; GCN: s_load_dword [[CND:s[0-9]+]]
; GCN: s_cmp_eq_u32 [[CND]], 0
; GCN-NEXT: s_cbranch_scc0 [[LONGBB:BB[0-9]+_[0-9]+]]

; GCN-NEXT: [[LONG_JUMP:BB[0-9]+_[0-9]+]]: ; %bb0
; GCN-NEXT: s_getpc_b64 vcc
; GCN-NEXT: s_add_u32 vcc_lo, vcc_lo, [[ENDBB:BB[0-9]+_[0-9]+]]-([[LONG_JUMP]]+4)
; GCN-NEXT: s_addc_u32 vcc_hi, vcc_hi, 0
; GCN-NEXT: s_setpc_b64 vcc

; GCN-NEXT: [[LONGBB]]:
; GCN-NEXT: ;;#ASMSTART
; GCN: v_nop_e64
; GCN: v_nop_e64
; GCN: v_nop_e64
; GCN: v_nop_e64
; GCN-NEXT: ;;#ASMEND

; GCN-NEXT: [[ENDBB]]:
; GCN: v_mov_b32_e32 [[V_CND:v[0-9]+]], [[CND]]
; GCN: buffer_store_dword [[V_CND]]
; GCN: s_endpgm
define void @uniform_conditional_min_long_forward_branch(i32 addrspace(1)* %arg, i32 %cnd) #0 {
bb0:
  %cmp = icmp eq i32 %cnd, 0
  br i1 %cmp, label %bb3, label %bb2 ; +9 dword branch

bb2:
  ; 32 bytes
  call void asm sideeffect
   "v_nop_e64
    v_nop_e64
    v_nop_e64
    v_nop_e64", ""() #0
  br label %bb3

bb3:
  store volatile i32 %cnd, i32 addrspace(1)* %arg
  ret void
}

; GCN-LABEL: {{^}}uniform_conditional_min_long_forward_vcnd_branch:
; GCN: s_load_dword [[CND:s[0-9]+]]
; GCN-DAG: v_cmp_eq_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], [[CND]], 0
; GCN-DAG: v_mov_b32_e32 [[V_CND:v[0-9]+]], [[CND]]
; GCN: s_and_b64 vcc, exec, [[CMP]]
; GCN-NEXT: s_cbranch_vccz [[LONGBB:BB[0-9]+_[0-9]+]]

; GCN-NEXT: [[LONG_JUMP:BB[0-9]+_[0-9]+]]: ; %bb0
; GCN-NEXT: s_getpc_b64 vcc
; GCN-NEXT: s_add_u32 vcc_lo, vcc_lo, [[ENDBB:BB[0-9]+_[0-9]+]]-([[LONG_JUMP]]+4)
; GCN-NEXT: s_addc_u32 vcc_hi, vcc_hi, 0
; GCN-NEXT: s_setpc_b64 vcc

; GCN-NEXT: [[LONGBB]]:
; GCN: v_nop_e64
; GCN: v_nop_e64
; GCN: v_nop_e64
; GCN: v_nop_e64

; GCN: [[ENDBB]]:
; GCN: buffer_store_dword [[V_CND]]
; GCN: s_endpgm
define void @uniform_conditional_min_long_forward_vcnd_branch(float addrspace(1)* %arg, float %cnd) #0 {
bb0:
  %cmp = fcmp oeq float %cnd, 0.0
  br i1 %cmp, label %bb3, label %bb2 ; + 8 dword branch

bb2:
  call void asm sideeffect " ; 32 bytes
  v_nop_e64
  v_nop_e64
  v_nop_e64
  v_nop_e64", ""() #0
  br label %bb3

bb3:
  store volatile float %cnd, float addrspace(1)* %arg
  ret void
}

; GCN-LABEL: {{^}}min_long_forward_vbranch:

; GCN: buffer_load_dword
; GCN: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
; GCN: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], vcc
; GCN: s_xor_b64 [[SAVE]], exec, [[SAVE]]

; GCN: v_nop_e64
; GCN: v_nop_e64
; GCN: v_nop_e64
; GCN: v_nop_e64

; GCN: s_or_b64 exec, exec, [[SAVE]]
; GCN: buffer_store_dword
; GCN: s_endpgm
define void @min_long_forward_vbranch(i32 addrspace(1)* %arg) #0 {
bb:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = zext i32 %tid to i64
  %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tid.ext
  %load = load volatile i32, i32 addrspace(1)* %gep
  %cmp = icmp eq i32 %load, 0
  br i1 %cmp, label %bb3, label %bb2 ; + 8 dword branch

bb2:
  call void asm sideeffect " ; 32 bytes
  v_nop_e64
  v_nop_e64
  v_nop_e64
  v_nop_e64", ""() #0
  br label %bb3

bb3:
  store volatile i32 %load, i32 addrspace(1)* %gep
  ret void
}

; FIXME: Should be able to use s_cbranch_scc0
; GCN-LABEL: {{^}}long_backward_sbranch:
; GCN: v_mov_b32_e32 [[LOOPIDX:v[0-9]+]], 0{{$}}

; GCN: [[LOOPBB:BB[0-9]+_[0-9]+]]: ; %bb2
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN-NEXT: v_add_i32_e32 [[INC:v[0-9]+]], vcc, 1, [[LOOPIDX]]
; GCN-NEXT: v_cmp_gt_i32_e32 vcc, 10, [[INC]]

; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: v_nop_e64
; GCN-NEXT: v_nop_e64
; GCN-NEXT: v_nop_e64
; GCN-NEXT: ;;#ASMEND

; GCN-NEXT: s_and_b64 vcc, exec, vcc
; GCN-NEXT: s_cbranch_vccz [[ENDBB:BB[0-9]+_[0-9]+]]

; GCN-NEXT: [[LONG_JUMP:BB[0-9]+_[0-9]+]]: ; %bb2
; GCN-NEXT: ; in Loop: Header=[[LOOPBB]] Depth=1
; GCN-NEXT: s_getpc_b64 vcc
; GCN-NEXT: s_sub_u32 vcc_lo, vcc_lo, ([[LONG_JUMP]]+4)-[[LOOPBB]]
; GCN-NEXT: s_subb_u32 vcc_hi, vcc_hi, 0
; GCN-NEXT: s_setpc_b64 vcc

; GCN-NEXT: [[ENDBB]]:
; GCN-NEXT: s_endpgm
define void @long_backward_sbranch(i32 addrspace(1)* %arg) #0 {
bb:
  br label %bb2

bb2:
  %loop.idx = phi i32 [ 0, %bb ], [ %inc, %bb2 ]
  ; 24 bytes
  call void asm sideeffect
   "v_nop_e64
    v_nop_e64
    v_nop_e64", ""() #0
  %inc = add nsw i32 %loop.idx, 1 ; add cost 4
  %cmp = icmp slt i32 %inc, 10 ; condition cost = 8
  br i1 %cmp, label %bb2, label %bb3 ; -

bb3:
  ret void
}

; Requires expansion of unconditional branch from %bb2 to %bb4 (and
; expansion of conditional branch from %bb0 to %bb3).

; GCN-LABEL: {{^}}uniform_unconditional_min_long_forward_branch:
; GCN: s_cmp_eq_u32
; GCN-NEXT: s_cbranch_scc0 [[BB2:BB[0-9]+_[0-9]+]]

; GCN-NEXT: [[LONG_JUMP0:BB[0-9]+_[0-9]+]]: ; %bb0
; GCN-NEXT: s_getpc_b64 vcc
; GCN-NEXT: s_add_u32 vcc_lo, vcc_lo, [[BB3:BB[0-9]_[0-9]+]]-([[LONG_JUMP0]]+4)
; GCN-NEXT: s_addc_u32 vcc_hi, vcc_hi, 0{{$}}
; GCN-NEXT: s_setpc_b64 vcc

; GCN-NEXT: [[BB2]]: ; %bb2
; GCN: v_mov_b32_e32 [[BB2_K:v[0-9]+]], 17
; GCN: buffer_store_dword [[BB2_K]]
; GCN: s_waitcnt vmcnt(0)

; GCN-NEXT: [[LONG_JUMP1:BB[0-9]+_[0-9]+]]: ; %bb2
; GCN-NEXT: s_getpc_b64 vcc
; GCN-NEXT: s_add_u32 vcc_lo, vcc_lo, [[BB4:BB[0-9]_[0-9]+]]-([[LONG_JUMP1]]+4)
; GCN-NEXT: s_addc_u32 vcc_hi, vcc_hi, 0{{$}}
; GCN-NEXT: s_setpc_b64 vcc

; GCN: [[BB3]]: ; %bb3
; GCN: v_nop_e64
; GCN: v_nop_e64
; GCN: v_nop_e64
; GCN: v_nop_e64
; GCN: ;;#ASMEND

; GCN-NEXT: [[BB4]]: ; %bb4
; GCN: v_mov_b32_e32 [[BB4_K:v[0-9]+]], 63
; GCN: buffer_store_dword [[BB4_K]]
; GCN-NEXT: s_endpgm
; GCN-NEXT: .Lfunc_end{{[0-9]+}}:
define void @uniform_unconditional_min_long_forward_branch(i32 addrspace(1)* %arg, i32 %arg1) {
bb0:
  %tmp = icmp ne i32 %arg1, 0
  br i1 %tmp, label %bb2, label %bb3

bb2:
  store volatile i32 17, i32 addrspace(1)* undef
  br label %bb4

bb3:
  ; 32 byte asm
  call void asm sideeffect
   "v_nop_e64
    v_nop_e64
    v_nop_e64
    v_nop_e64", ""() #0
  br label %bb4

bb4:
  store volatile i32 63, i32 addrspace(1)* %arg
  ret void
}

; GCN-LABEL: {{^}}uniform_unconditional_min_long_backward_branch:
; GCN-NEXT: ; BB#0: ; %entry

; GCN-NEXT: [[LOOP:BB[0-9]_[0-9]+]]: ; %loop
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: v_nop_e64
; GCN-NEXT: v_nop_e64
; GCN-NEXT: v_nop_e64
; GCN-NEXT: v_nop_e64
; GCN-NEXT: ;;#ASMEND

; GCN-NEXT: [[LONGBB:BB[0-9]+_[0-9]+]]: ; %loop
; GCN-NEXT: ; in Loop: Header=[[LOOP]] Depth=1
; GCN-NEXT: s_getpc_b64 vcc
; GCN-NEXT: s_sub_u32 vcc_lo, vcc_lo, ([[LONGBB]]+4)-[[LOOP]]
; GCN-NEXT: s_subb_u32 vcc_hi, vcc_hi, 0{{$}}
; GCN-NEXT: s_setpc_b64 vcc
; GCN-NEXT: .Lfunc_end{{[0-9]+}}:
define void @uniform_unconditional_min_long_backward_branch(i32 addrspace(1)* %arg, i32 %arg1) {
entry:
  br label %loop

loop:
  ; 32 byte asm
  call void asm sideeffect
   "v_nop_e64
    v_nop_e64
    v_nop_e64
    v_nop_e64", ""() #0
  br label %loop
}

; Expansion of branch from %bb1 to %bb3 introduces need to expand
; branch from %bb0 to %bb2

; GCN-LABEL: {{^}}expand_requires_expand:
; GCN-NEXT: ; BB#0: ; %bb0
; GCN: s_load_dword
; GCN: s_cmp_lt_i32 s{{[0-9]+}}, 0{{$}}
; GCN-NEXT: s_cbranch_scc0 [[BB1:BB[0-9]+_[0-9]+]]

; GCN-NEXT: [[LONGBB0:BB[0-9]+_[0-9]+]]: ; %bb0
; GCN-NEXT: s_getpc_b64 vcc
; GCN-NEXT: s_add_u32 vcc_lo, vcc_lo, [[BB2:BB[0-9]_[0-9]+]]-([[LONGBB0]]+4)
; GCN-NEXT: s_addc_u32 vcc_hi, vcc_hi, 0{{$}}
; GCN-NEXT: s_setpc_b64 vcc

; GCN-NEXT: [[BB1]]: ; %bb1
; GCN-NEXT: s_load_dword
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_eq_u32 s{{[0-9]+}}, 3{{$}}
; GCN-NEXT: s_cbranch_scc0 [[BB2:BB[0-9]_[0-9]+]]

; GCN-NEXT: [[LONGBB1:BB[0-9]+_[0-9]+]]: ; %bb1
; GCN-NEXT: s_getpc_b64 vcc
; GCN-NEXT: s_add_u32 vcc_lo, vcc_lo, [[BB3:BB[0-9]+_[0-9]+]]-([[LONGBB1]]+4)
; GCN-NEXT: s_addc_u32 vcc_hi, vcc_hi, 0{{$}}
; GCN-NEXT: s_setpc_b64 vcc

; GCN-NEXT: [[BB2]]: ; %bb2
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: v_nop_e64
; GCN-NEXT: v_nop_e64
; GCN-NEXT: v_nop_e64
; GCN-NEXT: v_nop_e64
; GCN-NEXT: ;;#ASMEND

; GCN-NEXT: [[BB3]]: ; %bb3
; GCN-NEXT: s_endpgm
define void @expand_requires_expand(i32 %cond0) #0 {
bb0:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %cmp0 = icmp slt i32 %cond0, 0
  br i1 %cmp0, label %bb2, label %bb1

bb1:
  %val = load volatile i32, i32 addrspace(2)* undef
  %cmp1 = icmp eq i32 %val, 3
  br i1 %cmp1, label %bb3, label %bb2

bb2:
  call void asm sideeffect
   "v_nop_e64
    v_nop_e64
    v_nop_e64
    v_nop_e64", ""() #0
  br label %bb3

bb3:
  ret void
}

; Requires expanding of required skip branch.

; GCN-LABEL: {{^}}uniform_inside_divergent:
; GCN: v_cmp_gt_u32_e32 vcc, 16, v{{[0-9]+}}
; GCN-NEXT: s_and_saveexec_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], vcc
; GCN-NEXT: s_xor_b64 [[MASK1:s\[[0-9]+:[0-9]+\]]], exec, [[MASK]]
; GCN-NEXT: ; mask branch [[ENDIF:BB[0-9]+_[0-9]+]]
; GCN-NEXT: s_cbranch_execnz [[IF:BB[0-9]+_[0-9]+]]

; GCN-NEXT: [[LONGBB:BB[0-9]+_[0-9]+]]: ; %entry
; GCN-NEXT: s_getpc_b64 vcc
; GCN-NEXT: s_add_u32 vcc_lo, vcc_lo, [[BB2:BB[0-9]_[0-9]+]]-([[LONGBB]]+4)
; GCN-NEXT: s_addc_u32 vcc_hi, vcc_hi, 0{{$}}
; GCN-NEXT: s_setpc_b64 vcc

; GCN-NEXT: [[IF]]: ; %if
; GCN: buffer_store_dword
; GCN: s_cmp_lg_u32
; GCN: s_cbranch_scc1 [[ENDIF]]

; GCN-NEXT: ; BB#2: ; %if_uniform
; GCN: buffer_store_dword
; GCN: s_waitcnt vmcnt(0)

; GCN-NEXT: [[ENDIF]]: ; %endif
; GCN-NEXT: s_or_b64 exec, exec, [[MASK]]
; GCN-NEXT: s_endpgm
define void @uniform_inside_divergent(i32 addrspace(1)* %out, i32 %cond) #0 {
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %d_cmp = icmp ult i32 %tid, 16
  br i1 %d_cmp, label %if, label %endif

if:
  store i32 0, i32 addrspace(1)* %out
  %u_cmp = icmp eq i32 %cond, 0
  br i1 %u_cmp, label %if_uniform, label %endif

if_uniform:
  store i32 1, i32 addrspace(1)* %out
  br label %endif

endif:
  ret void
}

; si_mask_branch
; s_cbranch_execz
; s_branch

; GCN-LABEL: {{^}}analyze_mask_branch:
; GCN: v_cmp_lt_f32_e32 vcc
; GCN-NEXT: s_and_saveexec_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], vcc
; GCN-NEXT: s_xor_b64 [[MASK]], exec, [[MASK]]
; GCN-NEXT: ; mask branch [[RET:BB[0-9]+_[0-9]+]]
; GCN-NEXT: s_cbranch_execz [[BRANCH_SKIP:BB[0-9]+_[0-9]+]]
; GCN-NEXT: s_branch [[LOOP_BODY:BB[0-9]+_[0-9]+]]

; GCN-NEXT: [[BRANCH_SKIP]]: ; %entry
; GCN-NEXT: s_getpc_b64 vcc
; GCN-NEXT: s_add_u32 vcc_lo, vcc_lo, [[RET]]-([[BRANCH_SKIP]]+4)
; GCN-NEXT: s_addc_u32 vcc_hi, vcc_hi, 0
; GCN-NEXT: s_setpc_b64 vcc

; GCN-NEXT: [[LOOP_BODY]]: ; %loop_body
; GCN: v_nop_e64
; GCN: v_nop_e64
; GCN: v_nop_e64
; GCN: v_nop_e64
; GCN: v_nop_e64
; GCN: v_nop_e64
; GCN: ;;#ASMEND
; GCN-NEXT: s_and_b64 vcc, exec, -1{{$}}
; GCN-NEXT: s_cbranch_vccz [[RET]]

; GCN-NEXT: [[LONGBB:BB[0-9]+_[0-9]+]]: ; %loop_body
; GCN-NEXT: ; in Loop: Header=[[LOOP_BODY]] Depth=1
; GCN-NEXT: s_getpc_b64 vcc
; GCN-NEXT: s_sub_u32 vcc_lo, vcc_lo, ([[LONGBB]]+4)-[[LOOP_BODY]]
; GCN-NEXT: s_subb_u32 vcc_hi, vcc_hi, 0
; GCN-NEXT: s_setpc_b64 vcc

; GCN-NEXT: [[RET]]: ; %Flow
; GCN-NEXT: s_or_b64 exec, exec, [[MASK]]
; GCN: buffer_store_dword
; GCN-NEXT: s_endpgm
define void @analyze_mask_branch() #0 {
entry:
  %reg = call float asm sideeffect "v_mov_b32_e64 $0, 0", "=v"()
  %cmp0 = fcmp ogt float %reg, 0.000000e+00
  br i1 %cmp0, label %loop, label %ret

loop:
  %phi = phi float [ 0.000000e+00, %loop_body ], [ 1.000000e+00, %entry ]
  call void asm sideeffect
   "v_nop_e64
    v_nop_e64", ""() #0
  %cmp1 = fcmp olt float %phi, 8.0
  br i1 %cmp1, label %loop_body, label %ret

loop_body:
  call void asm sideeffect
   "v_nop_e64
    v_nop_e64
    v_nop_e64
    v_nop_e64", ""() #0
  br label %loop

ret:
  store volatile i32 7, i32 addrspace(1)* undef
  ret void
}

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
@@ -4,24 +4,27 @@
; GCN: v_cmp_eq_u32
; GCN: s_and_saveexec_b64
; GCN: s_xor_b64
-; GCN: s_branch BB0_1
+; GCN: ; mask branch [[RET:BB[0-9]+]]
+; GCN: s_branch [[UNREACHABLE:BB[0-9]+_[0-9]+]]

+; GCN: [[RET]]
+; GCN: s_or_b64 exec, exec
; GCN: s_endpgm

+; GCN: [[UNREACHABLE]]:
; GCN: ds_write_b32
; GCN: s_waitcnt
define void @lower_control_flow_unreachable_terminator() #0 {
bb:
  %tmp15 = tail call i32 @llvm.amdgcn.workitem.id.y()
  %tmp63 = icmp eq i32 %tmp15, 32
-  br i1 %tmp63, label %bb64, label %bb68
+  br i1 %tmp63, label %unreachable, label %ret

-bb64:
+unreachable:
  store volatile i32 0, i32 addrspace(3)* undef, align 4
  unreachable

-bb68:
+ret:
  ret void
}

@@ -29,21 +32,25 @@ bb68:
; GCN: v_cmp_eq_u32
; GCN: s_and_saveexec_b64
; GCN: s_xor_b64
-; GCN: s_endpgm
+; GCN: ; mask branch [[UNREACHABLE:BB[0-9]+_[0-9]+]]

+; GCN: s_or_b64 exec, exec
+; GCN-NEXT: ; %ret
+; GCN-NEXT: s_endpgm

+; GCN-NEXT: [[UNREACHABLE]]:
+; GCN-NEXT: s_or_b64 exec, exec
; GCN: ds_write_b32
; GCN: s_waitcnt
define void @lower_control_flow_unreachable_terminator_swap_block_order() #0 {
bb:
  %tmp15 = tail call i32 @llvm.amdgcn.workitem.id.y()
  %tmp63 = icmp eq i32 %tmp15, 32
-  br i1 %tmp63, label %bb68, label %bb64
+  br i1 %tmp63, label %ret, label %unreachable

-bb68:
+ret:
  ret void

-bb64:
+unreachable:
  store volatile i32 0, i32 addrspace(3)* undef, align 4
  unreachable
}
@@ -105,7 +105,7 @@ define amdgpu_ps void @test_kill_depth_var_x2_instructions(float %x) #0 {

; CHECK: v_cmpx_le_f32_e32 vcc, 0, v7
; CHECK-NEXT: s_cbranch_execnz [[SPLIT_BB:BB[0-9]+_[0-9]+]]
-; CHECK-NEXT: ; BB#3:
+; CHECK-NEXT: ; BB#2:
; CHECK-NEXT: exp 0, 9, 0, 1, 1, v0, v0, v0, v0
; CHECK-NEXT: s_endpgm

@@ -156,7 +156,7 @@ exit:
; CHECK: v_cmpx_le_f32_e32 vcc, 0, v7
; CHECK-NEXT: s_cbranch_execnz [[SPLIT_BB:BB[0-9]+_[0-9]+]]

-; CHECK-NEXT: ; BB#4:
+; CHECK-NEXT: ; BB#2:
; CHECK-NEXT: exp 0, 9, 0, 1, 1, v0, v0, v0, v0
; CHECK-NEXT: s_endpgm

@@ -270,7 +270,7 @@ exit:
; CHECK: s_and_b64 vcc, exec, vcc
; CHECK: s_cbranch_vccz [[ENDBB:BB[0-9]+_[0-9]+]]

-; CHECK: ; BB#3: ; %bb10
+; CHECK: ; %bb10
; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 9
; CHECK: buffer_store_dword

@@ -306,7 +306,7 @@ end:
; CHECK: s_and_b64 vcc, exec,
; CHECK: s_cbranch_vccz [[SKIPKILL:BB[0-9]+_[0-9]+]]

-; CHECK: ; BB#3: ; %bb6
+; CHECK: ; %bb6
; CHECK: s_mov_b64 exec, 0

; CHECK: [[SKIPKILL]]: