mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-10-18 18:42:46 +02:00
[AMDGPU] gfx1010 loop alignment
Differential Revision: https://reviews.llvm.org/D61529 llvm-svn: 359935
This commit is contained in:
parent
ab504d3276
commit
ca03fb25e6
@ -99,6 +99,11 @@ static cl::opt<unsigned> AssumeFrameIndexHighZeroBits(
|
||||
cl::init(5),
|
||||
cl::ReallyHidden);
|
||||
|
||||
// Escape hatch: when set, getPrefLoopAlignment() below bails out early, so
// loop headers keep the target-default alignment and no S_INST_PREFETCH
// instructions are inserted around loops.
static cl::opt<bool> DisableLoopAlignment(
  "amdgpu-disable-loop-alignment",
  cl::desc("Do not align and prefetch loops"),
  cl::init(false));
|
||||
|
||||
static unsigned findFirstFreeSGPR(CCState &CCInfo) {
|
||||
unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
|
||||
for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
|
||||
@ -9966,6 +9971,77 @@ void SITargetLowering::computeKnownBitsForFrameIndex(const SDValue Op,
|
||||
Known.Zero.setHighBits(AssumeFrameIndexHighZeroBits);
|
||||
}
|
||||
|
||||
/// Return the preferred alignment (log2, in bytes) for the header of loop
/// \p ML. On GFX10, loops that fit the instruction-prefetch window may get
/// cache-line alignment, and loops between 128 and 192 bytes are additionally
/// bracketed with S_INST_PREFETCH instructions to shift the prefetch window.
/// Note: as a side effect this may insert machine instructions into the
/// loop's preheader and exit block.
unsigned SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
  const unsigned PrefAlign = TargetLowering::getPrefLoopAlignment(ML);
  const unsigned CacheLineAlign = 6; // log2(64)

  // Pre-GFX10 target did not benefit from loop alignment
  if (!ML || DisableLoopAlignment ||
      (getSubtarget()->getGeneration() < AMDGPUSubtarget::GFX10) ||
      getSubtarget()->hasInstFwdPrefetchBug())
    return PrefAlign;

  // On GFX10 I$ is 4 x 64 bytes cache lines.
  // By default prefetcher keeps one cache line behind and reads two ahead.
  // We can modify it with S_INST_PREFETCH for larger loops to have two lines
  // behind and one ahead.
  // Therefore we can benefit from aligning loop headers if loop fits 192 bytes.
  // If loop fits 64 bytes it always spans no more than two cache lines and
  // does not need an alignment.
  // Else if loop is less or equal 128 bytes we do not need to modify prefetch,
  // Else if loop is less or equal 192 bytes we need two lines behind.

  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
  const MachineBasicBlock *Header = ML->getHeader();
  if (Header->getAlignment() != PrefAlign)
    return Header->getAlignment(); // Already processed.

  // Estimate the loop's code size in bytes; bail out to the default alignment
  // as soon as it is known to exceed the useful 192-byte window.
  unsigned LoopSize = 0;
  for (const MachineBasicBlock *MBB : ML->blocks()) {
    // If inner loop block is aligned assume in average half of the alignment
    // size to be added as nops.
    if (MBB != Header)
      LoopSize += (1 << MBB->getAlignment()) / 2;

    for (const MachineInstr &MI : *MBB) {
      LoopSize += TII->getInstSizeInBytes(MI);
      if (LoopSize > 192)
        return PrefAlign;
    }
  }

  if (LoopSize <= 64)
    return PrefAlign;

  if (LoopSize <= 128)
    return CacheLineAlign;

  // If any of parent loops is surrounded by prefetch instructions do not
  // insert new for inner loop, which would reset parent's settings.
  for (MachineLoop *P = ML->getParentLoop(); P; P = P->getParentLoop()) {
    if (MachineBasicBlock *Exit = P->getExitBlock()) {
      auto I = Exit->getFirstNonDebugInstr();
      if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
        return CacheLineAlign;
    }
  }

  // Bracket the loop with prefetch-mode switches. This is only possible when
  // the loop has both a unique preheader and a unique exit block.
  MachineBasicBlock *Pre = ML->getLoopPreheader();
  MachineBasicBlock *Exit = ML->getExitBlock();

  if (Pre && Exit) {
    BuildMI(*Pre, Pre->getFirstTerminator(), DebugLoc(),
            TII->get(AMDGPU::S_INST_PREFETCH))
      .addImm(1); // prefetch 2 lines behind PC

    BuildMI(*Exit, Exit->getFirstNonDebugInstr(), DebugLoc(),
            TII->get(AMDGPU::S_INST_PREFETCH))
      .addImm(2); // prefetch 1 line behind PC
  }

  return CacheLineAlign;
}
|
||||
|
||||
LLVM_ATTRIBUTE_UNUSED
|
||||
static bool isCopyFromRegOfInlineAsm(const SDNode *N) {
|
||||
assert(N->getOpcode() == ISD::CopyFromReg);
|
||||
|
@ -367,6 +367,8 @@ public:
|
||||
bool SNaN = false,
|
||||
unsigned Depth = 0) const override;
|
||||
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override;

// Preferred (log2) alignment for the header of loop ML. The implementation
// may also insert S_INST_PREFETCH instructions around eligible GFX10 loops.
unsigned getPrefLoopAlignment(MachineLoop *ML) const override;
||||
};
|
||||
|
||||
} // End namespace llvm
|
||||
|
Loading…
Reference in New Issue
Block a user