[AMDGPU] gfx1010 loop alignment
Differential Revision: https://reviews.llvm.org/D61529

llvm-svn: 359935
commit ca03fb25e6
parent ab504d3276

lib/Target/AMDGPU/SIISelLowering.cpp

@@ -99,6 +99,11 @@ static cl::opt<unsigned> AssumeFrameIndexHighZeroBits(
   cl::init(5),
   cl::ReallyHidden);
 
+static cl::opt<bool> DisableLoopAlignment(
+  "amdgpu-disable-loop-alignment",
+  cl::desc("Do not align and prefetch loops"),
+  cl::init(false));
+
 static unsigned findFirstFreeSGPR(CCState &CCInfo) {
   unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
   for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
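The new flag is an ordinary llvm::cl boolean option, so it can be flipped on the compiler command line to switch the heuristic off. Below is a minimal standalone sketch (not part of this patch) of how such an option is declared and read; it only assumes the LLVM Support headers, and the main() driver is purely illustrative.

// Sketch only: a boolean llvm::cl option equivalent to the one added above,
// read from the command line of a standalone tool. Requires LLVM headers.
#include "llvm/Support/CommandLine.h"
#include <cstdio>

static llvm::cl::opt<bool> DisableLoopAlignment(
    "amdgpu-disable-loop-alignment",
    llvm::cl::desc("Do not align and prefetch loops"),
    llvm::cl::init(false));

int main(int argc, char **argv) {
  llvm::cl::ParseCommandLineOptions(argc, argv);
  std::printf("loop alignment %s\n",
              DisableLoopAlignment ? "disabled" : "enabled");
  return 0;
}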
@@ -9966,6 +9971,77 @@ void SITargetLowering::computeKnownBitsForFrameIndex(const SDValue Op,
   Known.Zero.setHighBits(AssumeFrameIndexHighZeroBits);
 }
 
+unsigned SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
+  const unsigned PrefAlign = TargetLowering::getPrefLoopAlignment(ML);
+  const unsigned CacheLineAlign = 6; // log2(64)
+
+  // Pre-GFX10 targets did not benefit from loop alignment
+  if (!ML || DisableLoopAlignment ||
+      (getSubtarget()->getGeneration() < AMDGPUSubtarget::GFX10) ||
+      getSubtarget()->hasInstFwdPrefetchBug())
+    return PrefAlign;
+
+  // On GFX10 I$ is 4 x 64 bytes cache lines.
+  // By default prefetcher keeps one cache line behind and reads two ahead.
+  // We can modify it with S_INST_PREFETCH for larger loops to have two lines
+  // behind and one ahead.
+  // Therefore we can benefit from aligning loop headers if loop fits 192 bytes.
+  // If loop fits 64 bytes it always spans no more than two cache lines and
+  // does not need an alignment.
+  // Else if loop is less than or equal to 128 bytes we do not need to modify
+  // prefetch, else if loop is less than or equal to 192 bytes we need two
+  // lines behind.
+
+  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
+  const MachineBasicBlock *Header = ML->getHeader();
+  if (Header->getAlignment() != PrefAlign)
+    return Header->getAlignment(); // Already processed.
+
+  unsigned LoopSize = 0;
+  for (const MachineBasicBlock *MBB : ML->blocks()) {
+    // If inner loop block is aligned assume in average half of the alignment
+    // size to be added as nops.
+    if (MBB != Header)
+      LoopSize += (1 << MBB->getAlignment()) / 2;
+
+    for (const MachineInstr &MI : *MBB) {
+      LoopSize += TII->getInstSizeInBytes(MI);
+      if (LoopSize > 192)
+        return PrefAlign;
+    }
+  }
+
+  if (LoopSize <= 64)
+    return PrefAlign;
+
+  if (LoopSize <= 128)
+    return CacheLineAlign;
+
+  // If any of the parent loops is surrounded by prefetch instructions do not
+  // insert new ones for the inner loop, which would reset parent's settings.
+  for (MachineLoop *P = ML->getParentLoop(); P; P = P->getParentLoop()) {
+    if (MachineBasicBlock *Exit = P->getExitBlock()) {
+      auto I = Exit->getFirstNonDebugInstr();
+      if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
+        return CacheLineAlign;
+    }
+  }
+
+  MachineBasicBlock *Pre = ML->getLoopPreheader();
+  MachineBasicBlock *Exit = ML->getExitBlock();
+
+  if (Pre && Exit) {
+    BuildMI(*Pre, Pre->getFirstTerminator(), DebugLoc(),
+            TII->get(AMDGPU::S_INST_PREFETCH))
+      .addImm(1); // prefetch 2 lines behind PC
+
+    BuildMI(*Exit, Exit->getFirstNonDebugInstr(), DebugLoc(),
+            TII->get(AMDGPU::S_INST_PREFETCH))
+      .addImm(2); // prefetch 1 line behind PC
+  }
+
+  return CacheLineAlign;
+}
+
 LLVM_ATTRIBUTE_UNUSED
 static bool isCopyFromRegOfInlineAsm(const SDNode *N) {
   assert(N->getOpcode() == ISD::CopyFromReg);
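The comment block above carries the whole size heuristic: loops of at most 64 bytes need nothing, loops up to 128 bytes only get cache-line alignment, loops up to 192 bytes additionally get the S_INST_PREFETCH window shifted to keep two lines behind, and anything larger falls back to the target default. A small self-contained sketch of just that classification follows; the enum and function names are illustrative and not part of the patch.

// Standalone sketch of the size thresholds used by the new heuristic; names
// are illustrative. Assumes the GFX10 parameters stated in the patch comments
// (a 4 x 64-byte I$ with an adjustable S_INST_PREFETCH window).
#include <cstdio>

enum class LoopAlignChoice {
  Default,               // <= 64 bytes: spans at most two lines, no alignment
  CacheLine,             // <= 128 bytes: 64-byte alignment, default prefetch
  CacheLinePlusPrefetch, // <= 192 bytes: 64-byte alignment, two lines behind
  TooLarge               // > 192 bytes: fall back to the target default
};

static LoopAlignChoice classifyLoop(unsigned LoopSizeInBytes) {
  if (LoopSizeInBytes <= 64)
    return LoopAlignChoice::Default;
  if (LoopSizeInBytes <= 128)
    return LoopAlignChoice::CacheLine;
  if (LoopSizeInBytes <= 192)
    return LoopAlignChoice::CacheLinePlusPrefetch;
  return LoopAlignChoice::TooLarge;
}

int main() {
  for (unsigned Size : {48u, 100u, 180u, 256u})
    std::printf("loop of %u bytes -> choice %d\n", Size,
                static_cast<int>(classifyLoop(Size)));
  return 0;
}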
lib/Target/AMDGPU/SIISelLowering.h

@@ -367,6 +367,8 @@ public:
                                     bool SNaN = false,
                                     unsigned Depth = 0) const override;
   AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override;
+
+  unsigned getPrefLoopAlignment(MachineLoop *ML) const override;
 };
 
 } // End namespace llvm