
[AMDGPU] Enable base pointer.

When the callee requires dynamic stack realignment,
it is not possible to correctly access the incoming
stack arguments using the stack pointer. We reserve a
base pointer in such cases to access the function arguments
inside the callee. The base pointer will hold the incoming
stack pointer value before any kind of delta is added to it.

Reviewed By: arsenm, scott.linder

Differential Revision: https://reviews.llvm.org/D78811
Christudasan Devadasan, 2020-04-21 15:04:33 +05:30
commit 52dc890b04 (parent 3438b08380)
11 changed files with 390 additions and 92 deletions
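To make the problem concrete before the diff: below is a minimal, self-contained C++ model (an editorial illustration, not part of the commit; all names and numbers are invented) of why no compile-time SP/FP-relative offset can reach the incoming stack arguments once the prologue realigns the stack:

#include <cassert>
#include <cstdint>

// Incoming stack arguments sit at fixed positive offsets from the stack
// pointer value the caller handed us ("incoming SP").
uint64_t incomingArgAddr(uint64_t incomingSP, uint64_t argOffset) {
  return incomingSP + argOffset;
}

// The prologue realigns by rounding the incoming SP up to the requested
// alignment; the size of that delta is only known at run time.
uint64_t realign(uint64_t incomingSP, uint64_t alignment) {
  return (incomingSP + alignment - 1) & ~(alignment - 1);
}

int main() {
  uint64_t incomingSP = 0x1040; // arbitrary runtime value
  uint64_t fp = realign(incomingSP, 0x400);
  // The gap between the realigned FP and the incoming SP (0x3c0 here)
  // depends on the runtime value of SP, so the incoming arguments cannot be
  // addressed relative to FP with a fixed offset. A base pointer that
  // captures the incoming SP before the delta is applied solves this.
  assert(fp % 0x400 == 0 && fp >= incomingSP);
  return incomingArgAddr(incomingSP, 16) == incomingSP + 16 ? 0 : 1;
}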


@@ -6717,16 +6717,13 @@ describes how the AMDGPU implements function calls:
 1. SGPR33 is used as a frame pointer (FP) if necessary. Like the SP it is an
    unswizzled scratch address. It is only needed if runtime sized ``alloca``
    are used, or for the reasons defined in ``SIFrameLowering``.
-2. Runtime stack alignment is not currently supported.
-
-   .. TODO::
-
-      - If runtime stack alignment is supported, then will an extra argument
-        pointer register be used?
-
-2. Allocating SGPR arguments on the stack are not supported.
+2. Runtime stack alignment is supported. SGPR34 is used as a base pointer (BP)
+   to access the incoming stack arguments in the function. The BP is needed
+   only when the function requires the runtime stack alignment.
+
+3. Allocating SGPR arguments on the stack are not supported.
 
-3. No CFI is currently generated. See
+4. No CFI is currently generated. See
    :ref:`amdgpu-dwarf-call-frame-information`.
 
 .. note::
@@ -6745,12 +6742,12 @@ describes how the AMDGPU implements function calls:
    local variables and register spill slots are accessed as positive offsets
    relative to ``DW_AT_frame_base``.
 
-4. Function argument passing is implemented by copying the input physical
+5. Function argument passing is implemented by copying the input physical
    registers to virtual registers on entry. The register allocator can spill if
    necessary. These are copied back to physical registers at call sites. The
    net effect is that each function call can have these values in entirely
    distinct locations. The IPRA can help avoid shuffling argument registers.
 
-5. Call sites are implemented by setting up the arguments at positive offsets
+6. Call sites are implemented by setting up the arguments at positive offsets
    from SP. Then SP is incremented to account for the known frame size before
    the call and decremented after the call.
@@ -6759,7 +6756,7 @@ describes how the AMDGPU implements function calls:
    The CFI will reflect the changed calculation needed to compute the CFA
    from SP.
 
-6. 4 byte spill slots are used in the stack frame. One slot is allocated for an
+7. 4 byte spill slots are used in the stack frame. One slot is allocated for an
    emergency spill slot. Buffer instructions are used for stack accesses and
    not the ``flat_scratch`` instruction.
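As a worked illustration of item 6 above (an editorial sketch, not from the patch; the frame size and offsets are made up), the SP bookkeeping around a call site amounts to:

#include <cstdint>

// Toy model of a call site: outgoing arguments are stored at positive
// offsets from SP, then SP is bumped by the compile-time frame size for the
// duration of the call and restored afterwards.
static uint8_t scratch[0x4000];      // stand-in for scratch (stack) memory
static uint32_t sp = 0x2000;         // unswizzled scratch offset (example)
constexpr uint32_t FrameSize = 0x30; // known at compile time

void callSite() {
  scratch[sp + 0] = 42; // outgoing argument at a positive offset from SP
  sp += FrameSize;      // s_add_u32 s32, s32, FrameSize (before the call)
  // ... s_swappc_b64 ...  the callee sees this bumped SP as its incoming SP
  sp -= FrameSize;      // s_sub_u32 s32, s32, FrameSize (after the call)
}

int main() {
  callSite();
  return sp == 0x2000 ? 0 : 1; // SP is back to its pre-call value
}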


@@ -78,11 +78,64 @@ static MCRegister findScratchNonCalleeSaveRegister(MachineRegisterInfo &MRI,
   return MCRegister();
 }
 
-static MCPhysReg findUnusedSGPRNonCalleeSaved(MachineRegisterInfo &MRI) {
-  LivePhysRegs LiveRegs;
-  LiveRegs.init(*MRI.getTargetRegisterInfo());
-  return findScratchNonCalleeSaveRegister(
-      MRI, LiveRegs, AMDGPU::SReg_32_XM0_XEXECRegClass, true);
-}
+static void getVGPRSpillLaneOrTempRegister(MachineFunction &MF,
+                                           LivePhysRegs &LiveRegs,
+                                           Register &TempSGPR,
+                                           Optional<int> &FrameIndex,
+                                           bool IsFP) {
+  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
+
+#ifndef NDEBUG
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  const SIRegisterInfo *TRI = ST.getRegisterInfo();
+#endif
+
+  // We need to save and restore the current FP/BP.
+
+  // 1: If there is already a VGPR with free lanes, use it. We
+  // may already have to pay the penalty for spilling a CSR VGPR.
+  if (MFI->haveFreeLanesForSGPRSpill(MF, 1)) {
+    int NewFI = FrameInfo.CreateStackObject(4, 4, true, nullptr,
+                                            TargetStackID::SGPRSpill);
+
+    if (!MFI->allocateSGPRSpillToVGPR(MF, NewFI))
+      llvm_unreachable("allocate SGPR spill should have worked");
+
+    FrameIndex = NewFI;
+
+    LLVM_DEBUG(auto Spill = MFI->getSGPRToVGPRSpills(NewFI).front();
+               dbgs() << "Spilling " << (IsFP ? "FP" : "BP") << " to "
+                      << printReg(Spill.VGPR, TRI) << ':' << Spill.Lane
+                      << '\n');
+    return;
+  }
+
+  // 2: Next, try to save the FP/BP in an unused SGPR.
+  TempSGPR = findScratchNonCalleeSaveRegister(
+      MF.getRegInfo(), LiveRegs, AMDGPU::SReg_32_XM0_XEXECRegClass, true);
+
+  if (!TempSGPR) {
+    int NewFI = FrameInfo.CreateStackObject(4, 4, true, nullptr,
+                                            TargetStackID::SGPRSpill);
+
+    if (MFI->allocateSGPRSpillToVGPR(MF, NewFI)) {
+      // 3: There's no free lane to spill, and no free register to save FP/BP,
+      // so we're forced to spill another VGPR to use for the spill.
+      FrameIndex = NewFI;
+    } else {
+      // 4: If all else fails, spill the FP/BP to memory.
+      FrameIndex = FrameInfo.CreateSpillStackObject(4, Align(4));
+    }
+
+    LLVM_DEBUG(auto Spill = MFI->getSGPRToVGPRSpills(NewFI).front();
+               dbgs() << (IsFP ? "FP" : "BP") << " requires fallback spill to "
+                      << printReg(Spill.VGPR, TRI) << ':' << Spill.Lane
+                      << '\n';);
+  } else {
+    LLVM_DEBUG(dbgs() << "Saving " << (IsFP ? "FP" : "BP") << " with copy to "
+                      << printReg(TempSGPR, TRI) << '\n');
+  }
+}
// We need to specially emit stack operations here because a different frame
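Condensed, the new helper above tries four save locations in decreasing order of preference. A hypothetical sketch (editorial illustration, not LLVM code) of that cascade:

enum class SaveKind { VGPRLane, ScratchSGPR, NewVGPRLane, Memory };

// Mirrors the numbered comments in getVGPRSpillLaneOrTempRegister.
SaveKind chooseSaveLocation(bool freeCSRVGPRLane, bool unusedSGPR,
                            bool canReserveVGPR) {
  if (freeCSRVGPRLane) return SaveKind::VGPRLane;    // 1: CSR VGPR already paid for
  if (unusedSGPR)      return SaveKind::ScratchSGPR; // 2: a plain SGPR copy
  if (canReserveVGPR)  return SaveKind::NewVGPRLane; // 3: spill another VGPR for its lanes
  return SaveKind::Memory;                           // 4: last resort, scratch memory
}

int main() { return static_cast<int>(chooseSaveLocation(false, true, true)); }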
@@ -613,6 +666,9 @@ static Register buildScratchExecCopy(LivePhysRegs &LiveRegs,
LiveRegs.addLiveIns(MBB);
if (FuncInfo->SGPRForFPSaveRestoreCopy)
LiveRegs.removeReg(FuncInfo->SGPRForFPSaveRestoreCopy);
if (FuncInfo->SGPRForBPSaveRestoreCopy)
LiveRegs.removeReg(FuncInfo->SGPRForBPSaveRestoreCopy);
} else {
// In epilog.
LiveRegs.init(*ST.getRegisterInfo());
@@ -650,12 +706,15 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
Register StackPtrReg = FuncInfo->getStackPtrOffsetReg();
Register FramePtrReg = FuncInfo->getFrameOffsetReg();
Register BasePtrReg =
TRI.hasBasePointer(MF) ? TRI.getBaseRegister() : Register();
LivePhysRegs LiveRegs;
MachineBasicBlock::iterator MBBI = MBB.begin();
DebugLoc DL;
bool HasFP = false;
bool HasBP = false;
uint32_t NumBytes = MFI.getStackSize();
uint32_t RoundedSize = NumBytes;
// To avoid clobbering VGPRs in lanes that weren't active on function entry,
@@ -671,14 +730,46 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
                        TargetStackID::SGPRSpill;
   }
 
+  bool HasBPSaveIndex = FuncInfo->BasePointerSaveIndex.hasValue();
+  bool SpillBPToMemory = false;
+  // A StackID of SGPRSpill implies that this is a spill from SGPR to VGPR.
+  // Otherwise we are spilling the BP to memory.
+  if (HasBPSaveIndex) {
+    SpillBPToMemory = MFI.getStackID(*FuncInfo->BasePointerSaveIndex) !=
+                      TargetStackID::SGPRSpill;
+  }
+
   // Emit the copy if we need an FP, and are using a free SGPR to save it.
   if (FuncInfo->SGPRForFPSaveRestoreCopy) {
     BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FuncInfo->SGPRForFPSaveRestoreCopy)
         .addReg(FramePtrReg)
         .setMIFlag(MachineInstr::FrameSetup);
-    // Make the register live throughout the function.
-    for (MachineBasicBlock &MBB : MF)
-      MBB.addLiveIn(FuncInfo->SGPRForFPSaveRestoreCopy);
   }
 
+  // Emit the copy if we need a BP, and are using a free SGPR to save it.
+  if (FuncInfo->SGPRForBPSaveRestoreCopy) {
+    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY),
+            FuncInfo->SGPRForBPSaveRestoreCopy)
+        .addReg(BasePtrReg)
+        .setMIFlag(MachineInstr::FrameSetup);
+  }
+
+  // If a copy has been emitted for FP and/or BP, make the SGPRs
+  // used in the copy instructions live throughout the function.
+  SmallVector<MCPhysReg, 2> TempSGPRs;
+  if (FuncInfo->SGPRForFPSaveRestoreCopy)
+    TempSGPRs.push_back(FuncInfo->SGPRForFPSaveRestoreCopy);
+
+  if (FuncInfo->SGPRForBPSaveRestoreCopy)
+    TempSGPRs.push_back(FuncInfo->SGPRForBPSaveRestoreCopy);
+
+  if (!TempSGPRs.empty()) {
+    for (MachineBasicBlock &MBB : MF) {
+      for (MCPhysReg Reg : TempSGPRs)
+        MBB.addLiveIn(Reg);
+
+      MBB.sortUniqueLiveIns();
+    }
+  }
for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg
@@ -712,6 +803,23 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
FuncInfo->FramePointerSaveIndex.getValue());
}
if (HasBPSaveIndex && SpillBPToMemory) {
assert(!MFI.isDeadObjectIndex(*FuncInfo->BasePointerSaveIndex));
if (!ScratchExecCopy)
ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, true);
MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
MRI, LiveRegs, AMDGPU::VGPR_32RegClass);
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
.addReg(BasePtrReg);
buildPrologSpill(LiveRegs, MBB, MBBI, TII, TmpVGPR,
FuncInfo->getScratchRSrcReg(), StackPtrReg,
*FuncInfo->BasePointerSaveIndex);
}
if (ScratchExecCopy) {
// FIXME: Split block and make terminator.
unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
@@ -740,6 +848,25 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
.addReg(Spill[0].VGPR, RegState::Undef);
}
// In this case, spill the BP to a reserved VGPR.
if (HasBPSaveIndex && !SpillBPToMemory) {
const int BasePtrFI = *FuncInfo->BasePointerSaveIndex;
assert(!MFI.isDeadObjectIndex(BasePtrFI));
assert(MFI.getStackID(BasePtrFI) == TargetStackID::SGPRSpill);
ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill =
FuncInfo->getSGPRToVGPRSpills(BasePtrFI);
assert(Spill.size() == 1);
// Save BP before setting it up.
// FIXME: This should respect spillSGPRToVGPR;
BuildMI(MBB, MBBI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
Spill[0].VGPR)
.addReg(BasePtrReg)
.addImm(Spill[0].Lane)
.addReg(Spill[0].VGPR, RegState::Undef);
}
if (TRI.needsStackRealignment(MF)) {
HasFP = true;
const unsigned Alignment = MFI.getMaxAlign().value();
@@ -749,11 +876,13 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
       LiveRegs.init(TRI);
       LiveRegs.addLiveIns(MBB);
       LiveRegs.addReg(FuncInfo->SGPRForFPSaveRestoreCopy);
+      LiveRegs.addReg(FuncInfo->SGPRForBPSaveRestoreCopy);
     }
 
     Register ScratchSPReg = findScratchNonCalleeSaveRegister(
         MRI, LiveRegs, AMDGPU::SReg_32_XM0RegClass);
-    assert(ScratchSPReg && ScratchSPReg != FuncInfo->SGPRForFPSaveRestoreCopy);
+    assert(ScratchSPReg && ScratchSPReg != FuncInfo->SGPRForFPSaveRestoreCopy &&
+           ScratchSPReg != FuncInfo->SGPRForBPSaveRestoreCopy);
 
     // s_add_u32 tmp_reg, s32, NumBytes
     // s_and_b32 s32, tmp_reg, 0b111...0000
@@ -767,15 +896,21 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
         .setMIFlag(MachineInstr::FrameSetup);
     FuncInfo->setIsStackRealigned(true);
   } else if ((HasFP = hasFP(MF))) {
-    // If we need a base pointer, set it up here. It's whatever the value of
-    // the stack pointer is at this point. Any variable size objects will be
-    // allocated after this, so we can still use the base pointer to reference
-    // locals.
     BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg)
         .addReg(StackPtrReg)
         .setMIFlag(MachineInstr::FrameSetup);
   }
 
+  // If we need a base pointer, set it up here. It's whatever the value of
+  // the stack pointer is at this point. Any variable size objects will be
+  // allocated after this, so we can still use the base pointer to reference
+  // the incoming arguments.
+  if ((HasBP = TRI.hasBasePointer(MF))) {
+    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), BasePtrReg)
+        .addReg(StackPtrReg)
+        .setMIFlag(MachineInstr::FrameSetup);
+  }
if (HasFP && RoundedSize != 0) {
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), StackPtrReg)
.addReg(StackPtrReg)
@@ -790,6 +925,14 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
assert((HasFP || (!FuncInfo->SGPRForFPSaveRestoreCopy &&
!FuncInfo->FramePointerSaveIndex)) &&
"Saved FP but didn't need it");
assert((!HasBP || (FuncInfo->SGPRForBPSaveRestoreCopy ||
FuncInfo->BasePointerSaveIndex)) &&
"Needed to save BP but didn't save it anywhere");
assert((HasBP || (!FuncInfo->SGPRForBPSaveRestoreCopy &&
!FuncInfo->BasePointerSaveIndex)) &&
"Saved BP but didn't need it");
}
void SIFrameLowering::emitEpilogue(MachineFunction &MF,
@@ -801,6 +944,7 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
MachineRegisterInfo &MRI = MF.getRegInfo();
const SIRegisterInfo &TRI = TII->getRegisterInfo();
MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
LivePhysRegs LiveRegs;
DebugLoc DL;
@@ -812,6 +956,8 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
: NumBytes;
const Register StackPtrReg = FuncInfo->getStackPtrOffsetReg();
const Register FramePtrReg = FuncInfo->getFrameOffsetReg();
const Register BasePtrReg =
TRI.hasBasePointer(MF) ? TRI.getBaseRegister() : Register();
bool HasFPSaveIndex = FuncInfo->FramePointerSaveIndex.hasValue();
bool SpillFPToMemory = false;
@@ -820,6 +966,13 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
TargetStackID::SGPRSpill;
}
bool HasBPSaveIndex = FuncInfo->BasePointerSaveIndex.hasValue();
bool SpillBPToMemory = false;
if (HasBPSaveIndex) {
SpillBPToMemory = MFI.getStackID(*FuncInfo->BasePointerSaveIndex) !=
TargetStackID::SGPRSpill;
}
if (RoundedSize != 0 && hasFP(MF)) {
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_SUB_U32), StackPtrReg)
.addReg(StackPtrReg)
@@ -833,6 +986,12 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
.setMIFlag(MachineInstr::FrameSetup);
}
if (FuncInfo->SGPRForBPSaveRestoreCopy) {
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), BasePtrReg)
.addReg(FuncInfo->SGPRForBPSaveRestoreCopy)
.setMIFlag(MachineInstr::FrameSetup);
}
Register ScratchExecCopy;
if (HasFPSaveIndex) {
const int FI = FuncInfo->FramePointerSaveIndex.getValue();
@@ -860,6 +1019,32 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
}
}
if (HasBPSaveIndex) {
const int BasePtrFI = *FuncInfo->BasePointerSaveIndex;
assert(!MFI.isDeadObjectIndex(BasePtrFI));
if (SpillBPToMemory) {
if (!ScratchExecCopy)
ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, false);
MCPhysReg TempVGPR = findScratchNonCalleeSaveRegister(
MRI, LiveRegs, AMDGPU::VGPR_32RegClass);
buildEpilogReload(LiveRegs, MBB, MBBI, TII, TempVGPR,
FuncInfo->getScratchRSrcReg(), StackPtrReg, BasePtrFI);
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), BasePtrReg)
.addReg(TempVGPR, RegState::Kill);
} else {
// Reload from VGPR spill.
assert(MFI.getStackID(BasePtrFI) == TargetStackID::SGPRSpill);
ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill =
FuncInfo->getSGPRToVGPRSpills(BasePtrFI);
assert(Spill.size() == 1);
BuildMI(MBB, MBBI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32),
BasePtrReg)
.addReg(Spill[0].VGPR)
.addImm(Spill[0].Lane);
}
}
for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg :
FuncInfo->getSGPRSpillVGPRs()) {
if (!Reg.FI.hasValue())
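The BP save/restore above goes through a single VGPR lane. As a conceptual, stand-alone model (editorial sketch, not LLVM code; a 64-lane wave is assumed) of what v_writelane_b32 and v_readlane_b32 do:

#include <array>
#include <cstdint>

// One VGPR holds a 32-bit value per lane of the wave.
struct VGPR {
  std::array<uint32_t, 64> lane{};
};

// v_writelane_b32: store a scalar into exactly one lane, leaving the other
// 63 untouched -- which is why one CSR VGPR can host several SGPR saves.
void writelane(VGPR &v, uint32_t sgprVal, unsigned laneIdx) {
  v.lane[laneIdx] = sgprVal;
}

// v_readlane_b32: read the saved scalar back out of that lane.
uint32_t readlane(const VGPR &v, unsigned laneIdx) {
  return v.lane[laneIdx];
}

int main() {
  VGPR v;
  writelane(v, 0xdeadbeefu, 2);                 // prologue: save BP in lane 2
  return readlane(v, 2) == 0xdeadbeefu ? 0 : 1; // epilogue: restore it
}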
@@ -896,12 +1081,14 @@ static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) {
 #ifndef NDEBUG
 static bool allSGPRSpillsAreDead(const MachineFrameInfo &MFI,
-                                 Optional<int> FramePointerSaveIndex) {
+                                 Optional<int> FramePointerSaveIndex,
+                                 Optional<int> BasePointerSaveIndex) {
   for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
        I != E; ++I) {
     if (!MFI.isDeadObjectIndex(I) &&
         MFI.getStackID(I) == TargetStackID::SGPRSpill &&
-        FramePointerSaveIndex && I != FramePointerSaveIndex) {
+        ((FramePointerSaveIndex && I != FramePointerSaveIndex) ||
+         (BasePointerSaveIndex && I != BasePointerSaveIndex))) {
       return false;
     }
   }
@@ -928,7 +1115,7 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized(
   SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
 
   FuncInfo->removeDeadFrameIndices(MFI);
-  assert(allSGPRSpillsAreDead(MFI, None) &&
+  assert(allSGPRSpillsAreDead(MFI, None, None) &&
          "SGPR spill should have been removed in SILowerSGPRSpills");
// FIXME: The other checks should be redundant with allStackObjectsAreDead,
@@ -984,54 +1171,19 @@ void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
   for (auto SSpill : MFI->getSGPRSpillVGPRs())
     SavedVGPRs.reset(SSpill.VGPR);
 
-  const bool HasFP = WillHaveFP || hasFP(MF);
-  if (!HasFP)
-    return;
-
   LivePhysRegs LiveRegs;
   LiveRegs.init(*TRI);
 
-  // We need to save and restore the current FP.
-
-  // 1: If there is already a VGPR with free lanes, use it. We
-  // may already have to pay the penalty for spilling a CSR VGPR.
-  if (MFI->haveFreeLanesForSGPRSpill(MF, 1)) {
-    int NewFI = MF.getFrameInfo().CreateStackObject(4, 4, true, nullptr,
-                                                    TargetStackID::SGPRSpill);
-
-    if (!MFI->allocateSGPRSpillToVGPR(MF, NewFI))
-      llvm_unreachable("allocate SGPR spill should have worked");
-
-    MFI->FramePointerSaveIndex = NewFI;
-
-    LLVM_DEBUG(
-      auto Spill = MFI->getSGPRToVGPRSpills(NewFI).front();
-      dbgs() << "Spilling FP to " << printReg(Spill.VGPR, TRI)
-             << ':' << Spill.Lane << '\n');
-    return;
+  if (WillHaveFP || hasFP(MF)) {
+    getVGPRSpillLaneOrTempRegister(MF, LiveRegs, MFI->SGPRForFPSaveRestoreCopy,
+                                   MFI->FramePointerSaveIndex, true);
   }
 
-  // 2: Next, try to save the FP in an unused SGPR.
-  MFI->SGPRForFPSaveRestoreCopy = findUnusedSGPRNonCalleeSaved(MF.getRegInfo());
-
-  if (!MFI->SGPRForFPSaveRestoreCopy) {
-    int NewFI = MF.getFrameInfo().CreateStackObject(4, 4, true, nullptr,
-                                                    TargetStackID::SGPRSpill);
-    if (MFI->allocateSGPRSpillToVGPR(MF, NewFI)) {
-      // 3: There's no free lane to spill, and no free register to save FP, so
-      // we're forced to spill another VGPR to use for the spill.
-      MFI->FramePointerSaveIndex = NewFI;
-    } else {
-      // 4: If all else fails, spill the FP to memory.
-      MFI->FramePointerSaveIndex =
-          FrameInfo.CreateSpillStackObject(4, Align(4));
-    }
-
-    LLVM_DEBUG(
-      auto Spill = MFI->getSGPRToVGPRSpills(NewFI).front();
-      dbgs() << "FP requires fallback spill to " << printReg(Spill.VGPR, TRI)
-             << ':' << Spill.Lane << '\n';);
-  } else {
-    LLVM_DEBUG(dbgs() << "Saving FP with copy to " <<
-               printReg(MFI->SGPRForFPSaveRestoreCopy, TRI) << '\n');
+  if (TRI->hasBasePointer(MF)) {
+    if (MFI->SGPRForFPSaveRestoreCopy)
+      LiveRegs.addReg(MFI->SGPRForFPSaveRestoreCopy);
+    getVGPRSpillLaneOrTempRegister(MF, LiveRegs, MFI->SGPRForBPSaveRestoreCopy,
+                                   MFI->BasePointerSaveIndex, false);
   }
 }
@@ -1058,14 +1210,31 @@ bool SIFrameLowering::assignCalleeSavedSpillSlots(
     return true; // Early exit if no callee saved registers are modified!
 
   const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
-  if (!FuncInfo->SGPRForFPSaveRestoreCopy)
+  if (!FuncInfo->SGPRForFPSaveRestoreCopy &&
+      !FuncInfo->SGPRForBPSaveRestoreCopy)
     return false;
 
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  const SIRegisterInfo *RI = ST.getRegisterInfo();
+  Register FramePtrReg = FuncInfo->getFrameOffsetReg();
+  Register BasePtrReg = RI->getBaseRegister();
+  unsigned NumModifiedRegs = 0;
+
+  if (FuncInfo->SGPRForFPSaveRestoreCopy)
+    NumModifiedRegs++;
+  if (FuncInfo->SGPRForBPSaveRestoreCopy)
+    NumModifiedRegs++;
+
   for (auto &CS : CSI) {
-    if (CS.getReg() == FuncInfo->getFrameOffsetReg()) {
-      if (FuncInfo->SGPRForFPSaveRestoreCopy)
-        CS.setDstReg(FuncInfo->SGPRForFPSaveRestoreCopy);
-      break;
+    if (CS.getReg() == FramePtrReg && FuncInfo->SGPRForFPSaveRestoreCopy) {
+      CS.setDstReg(FuncInfo->SGPRForFPSaveRestoreCopy);
+      if (--NumModifiedRegs)
+        break;
+    } else if (CS.getReg() == BasePtrReg &&
+               FuncInfo->SGPRForBPSaveRestoreCopy) {
+      CS.setDstReg(FuncInfo->SGPRForBPSaveRestoreCopy);
+      if (--NumModifiedRegs)
+        break;
     }
   }


@@ -426,9 +426,9 @@ bool SIMachineFunctionInfo::allocateVGPRSpillToAGPR(MachineFunction &MF,
 }
 
 void SIMachineFunctionInfo::removeDeadFrameIndices(MachineFrameInfo &MFI) {
-  // The FP spill hasn't been inserted yet, so keep it around.
+  // The FP & BP spills haven't been inserted yet, so keep them around.
   for (auto &R : SGPRToVGPRSpills) {
-    if (R.first != FramePointerSaveIndex)
+    if (R.first != FramePointerSaveIndex && R.first != BasePointerSaveIndex)
       MFI.RemoveStackObject(R.first);
   }
@@ -436,7 +436,7 @@ void SIMachineFunctionInfo::removeDeadFrameIndices(MachineFrameInfo &MFI) {
   // ID.
   for (int i = MFI.getObjectIndexBegin(), e = MFI.getObjectIndexEnd(); i != e;
        ++i)
-    if (i != FramePointerSaveIndex)
+    if (i != FramePointerSaveIndex && i != BasePointerSaveIndex)
      MFI.setStackID(i, TargetStackID::Default);
 
 for (auto &R : VGPRToAGPRSpills) {


@@ -485,6 +485,11 @@ public: // FIXME
Register SGPRForFPSaveRestoreCopy;
Optional<int> FramePointerSaveIndex;
/// If this is set, an SGPR used for save/restore of the register used for the
/// base pointer.
Register SGPRForBPSaveRestoreCopy;
Optional<int> BasePointerSaveIndex;
Register VGPRReservedForSGPRSpill;
bool isCalleeSavedReg(const MCPhysReg *CSRegs, MCPhysReg Reg);


@@ -112,6 +112,15 @@ Register SIRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
: FuncInfo->getStackPtrOffsetReg();
}
bool SIRegisterInfo::hasBasePointer(const MachineFunction &MF) const {
// When we need stack realignment, we can't reference off of the
// stack pointer, so we reserve a base pointer.
const MachineFrameInfo &MFI = MF.getFrameInfo();
return MFI.getNumFixedObjects() && needsStackRealignment(MF);
}
Register SIRegisterInfo::getBaseRegister() const { return AMDGPU::SGPR34; }
const uint32_t *SIRegisterInfo::getAllVGPRRegMask() const {
return CSR_AMDGPU_AllVGPRs_RegMask;
}
@@ -309,6 +318,12 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
assert(!isSubRegister(ScratchRSrcReg, FrameReg));
}
if (hasBasePointer(MF)) {
MCRegister BasePtrReg = getBaseRegister();
reserveRegisterTuples(Reserved, BasePtrReg);
assert(!isSubRegister(ScratchRSrcReg, BasePtrReg));
}
for (MCRegister Reg : MFI->WWMReservedRegs) {
reserveRegisterTuples(Reserved, Reg);
}
@@ -1058,7 +1073,9 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
   MachineOperand &FIOp = MI->getOperand(FIOperandNum);
   int Index = MI->getOperand(FIOperandNum).getIndex();
 
-  Register FrameReg = getFrameRegister(*MF);
+  Register FrameReg = FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(*MF)
+                          ? getBaseRegister()
+                          : getFrameRegister(*MF);
 
   switch (MI->getOpcode()) {
     // SGPR register spill


@@ -65,6 +65,9 @@ public:
Register getFrameRegister(const MachineFunction &MF) const override;
bool hasBasePointer(const MachineFunction &MF) const;
Register getBaseRegister() const;
bool canRealignStack(const MachineFunction &MF) const override;
bool requiresRegisterScavenging(const MachineFunction &Fn) const override;


@@ -10,17 +10,17 @@
 define i32 @fp_save_restore_in_temp_sgpr(%struct.Data addrspace(5)* nocapture readonly byval(%struct.Data) align 4 %arg) #0 {
 ; GCN-LABEL: name: fp_save_restore_in_temp_sgpr
 ; GCN: bb.0.begin:
-; GCN: liveins: $sgpr30_sgpr31, $sgpr7
+; GCN: liveins: $sgpr7, $sgpr30_sgpr31
 ; GCN: $sgpr7 = frame-setup COPY $sgpr33
 ; GCN: $sgpr33 = frame-setup COPY $sgpr32
 ; GCN: bb.1.lp_end:
-; GCN: liveins: $sgpr6, $vgpr1, $sgpr4_sgpr5, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr30_sgpr31, $sgpr7
+; GCN: liveins: $sgpr6, $sgpr7, $vgpr1, $sgpr4_sgpr5, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr30_sgpr31
 ; GCN: bb.2.lp_begin:
-; GCN: liveins: $sgpr6, $vgpr1, $sgpr4_sgpr5, $sgpr8_sgpr9, $sgpr30_sgpr31, $sgpr7
+; GCN: liveins: $sgpr6, $sgpr7, $vgpr1, $sgpr4_sgpr5, $sgpr8_sgpr9, $sgpr30_sgpr31
 ; GCN: bb.3.Flow:
-; GCN: liveins: $sgpr6, $vgpr0, $vgpr1, $sgpr4_sgpr5, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr30_sgpr31, $sgpr7
+; GCN: liveins: $sgpr6, $sgpr7, $vgpr0, $vgpr1, $sgpr4_sgpr5, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr30_sgpr31
 ; GCN: bb.4.end:
-; GCN: liveins: $vgpr0, $sgpr4_sgpr5, $sgpr30_sgpr31, $sgpr7
+; GCN: liveins: $sgpr7, $vgpr0, $sgpr4_sgpr5, $sgpr30_sgpr31
 ; GCN: $sgpr33 = frame-setup COPY $sgpr7
 begin:
   br label %lp_begin


@@ -27,7 +27,7 @@ body: |
liveins: $vgpr1
; CHECK-LABEL: name: scavenge_sgpr_pei_no_sgprs
-; CHECK: liveins: $vgpr1
+; CHECK: liveins: $sgpr27, $vgpr1
; CHECK: $sgpr27 = frame-setup COPY $sgpr33
; CHECK: $sgpr4 = frame-setup S_ADD_U32 $sgpr32, 524224, implicit-def $scc
; CHECK: $sgpr33 = frame-setup S_AND_B32 killed $sgpr4, 4294443008, implicit-def $scc
@@ -69,7 +69,7 @@ body: |
liveins: $vgpr1
; CHECK-LABEL: name: scavenge_sgpr_pei_one_sgpr
-; CHECK: liveins: $vgpr1
+; CHECK: liveins: $sgpr27, $vgpr1
; CHECK: $sgpr27 = frame-setup COPY $sgpr33
; CHECK: $sgpr4 = frame-setup S_ADD_U32 $sgpr32, 524224, implicit-def $scc
; CHECK: $sgpr33 = frame-setup S_AND_B32 killed $sgpr4, 4294443008, implicit-def $scc
@@ -109,7 +109,7 @@ body: |
liveins: $vgpr1
; CHECK-LABEL: name: scavenge_sgpr_pei_one_sgpr_64
-; CHECK: liveins: $vgpr1
+; CHECK: liveins: $sgpr27, $vgpr1
; CHECK: $sgpr27 = frame-setup COPY $sgpr33
; CHECK: $sgpr4 = frame-setup S_ADD_U32 $sgpr32, 524224, implicit-def $scc
; CHECK: $sgpr33 = frame-setup S_AND_B32 killed $sgpr4, 4294443008, implicit-def $scc
@@ -148,7 +148,7 @@ body: |
liveins: $vgpr1
; CHECK-LABEL: name: scavenge_sgpr_pei_prefer_vcc
-; CHECK: liveins: $vgpr1
+; CHECK: liveins: $sgpr27, $vgpr1
; CHECK: $sgpr27 = frame-setup COPY $sgpr33
; CHECK: $sgpr4 = frame-setup S_ADD_U32 $sgpr32, 524224, implicit-def $scc
; CHECK: $sgpr33 = frame-setup S_AND_B32 killed $sgpr4, 4294443008, implicit-def $scc


@@ -22,7 +22,7 @@ body: |
liveins: $vgpr1
; CHECK-LABEL: name: scavenge_sgpr_pei_no_sgprs
-; CHECK: liveins: $vgpr1
+; CHECK: liveins: $sgpr27, $vgpr1
; CHECK: $sgpr27 = frame-setup COPY $sgpr33
; CHECK: $sgpr4 = frame-setup S_ADD_U32 $sgpr32, 524224, implicit-def $scc
; CHECK: $sgpr33 = frame-setup S_AND_B32 killed $sgpr4, 4294443008, implicit-def $scc


@@ -22,7 +22,7 @@ body: |
liveins: $vgpr1
; CHECK-LABEL: name: scavenge_sgpr_pei
-; CHECK: liveins: $vgpr1
+; CHECK: liveins: $sgpr27, $vgpr1
; CHECK: $sgpr27 = frame-setup COPY $sgpr33
; CHECK: $sgpr4 = frame-setup S_ADD_U32 $sgpr32, 262080, implicit-def $scc
; CHECK: $sgpr33 = frame-setup S_AND_B32 killed $sgpr4, 4294705152, implicit-def $scc


@@ -148,7 +148,114 @@ define void @disable_realign_align128(i32 %idx) #3 {
ret void
}
declare void @extern_func(<32 x i32>, i32) #0
define void @func_call_align1024_bp_gets_vgpr_spill(<32 x i32> %a, i32 %b) #0 {
; The test forces the stack to be realigned to a new boundary
; since there is a local object with an alignment of 1024.
; Should use BP to access the incoming stack arguments.
; The BP value is saved/restored with a VGPR spill.
; GCN-LABEL: func_call_align1024_bp_gets_vgpr_spill:
; GCN: buffer_store_dword [[VGPR_REG:v[0-9]+]], off, s[0:3], s32 offset:1028 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: v_writelane_b32 [[VGPR_REG]], s33, 2
; GCN-NEXT: v_writelane_b32 [[VGPR_REG]], s34, 3
; GCN: s_add_u32 [[SCRATCH_REG:s[0-9]+]], s32, 0xffc0
; GCN: s_and_b32 s33, [[SCRATCH_REG]], 0xffff0000
; GCN: s_mov_b32 s34, s32
; GCN-NEXT: v_mov_b32_e32 v32, 0
; GCN: buffer_store_dword v32, off, s[0:3], s33 offset:1024
; GCN-NEXT: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s34
; GCN-NEXT: s_add_u32 s32, s32, 0x30000
; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GCN: v_readlane_b32 s33, [[VGPR_REG]], 2
; GCN-NEXT: s_sub_u32 s32, s32, 0x30000
; GCN-NEXT: v_readlane_b32 s34, [[VGPR_REG]], 3
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-NEXT: buffer_load_dword [[VGPR_REG]], off, s[0:3], s32 offset:1028 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[6:7]
%temp = alloca i32, align 1024, addrspace(5)
store volatile i32 0, i32 addrspace(5)* %temp, align 1024
call void @extern_func(<32 x i32> %a, i32 %b)
ret void
}
%struct.Data = type { [9 x i32] }
define i32 @needs_align1024_stack_args_used_inside_loop(%struct.Data addrspace(5)* nocapture readonly byval(%struct.Data) align 8 %arg) local_unnamed_addr #4 {
; The local object allocation needs an alignment of 1024.
; Since the function argument is accessed in a loop with an
; index variable, the base pointer first gets loaded into a VGPR
; and that value is then used to load the incoming argument values.
; The BP value is saved/restored in an SGPR at the prologue/epilogue.
; GCN-LABEL: needs_align1024_stack_args_used_inside_loop:
; GCN: s_mov_b32 [[BP_COPY:s[0-9]+]], s34
; GCN-NEXT: s_mov_b32 s34, s32
; GCN-NEXT: s_add_u32 [[SCRATCH_REG:s[0-9]+]], s32, 0xffc0
; GCN-NEXT: s_mov_b32 [[FP_COPY:s[0-9]+]], s33
; GCN-NEXT: s_and_b32 s33, [[SCRATCH_REG]], 0xffff0000
; GCN-NEXT: v_mov_b32_e32 v{{[0-9]+}}, 0
; GCN-NEXT: v_lshrrev_b32_e64 [[VGPR_REG:v[0-9]+]], 6, s34
; GCN: s_add_u32 s32, s32, 0x30000
; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33 offset:1024
; GCN: buffer_load_dword v{{[0-9]+}}, [[VGPR_REG]], s[0:3], 0 offen
; GCN: v_add_u32_e32 [[VGPR_REG]], vcc, 4, [[VGPR_REG]]
; GCN: s_sub_u32 s32, s32, 0x30000
; GCN-NEXT: s_mov_b32 s33, [[FP_COPY]]
; GCN-NEXT: s_mov_b32 s34, [[BP_COPY]]
; GCN-NEXT: s_setpc_b64 s[30:31]
begin:
%local_var = alloca i32, align 1024, addrspace(5)
store volatile i32 0, i32 addrspace(5)* %local_var, align 1024
br label %loop_body
loop_end: ; preds = %loop_body
%idx_next = add nuw nsw i32 %lp_idx, 1
%lp_exit_cond = icmp eq i32 %idx_next, 9
br i1 %lp_exit_cond, label %exit, label %loop_body
loop_body: ; preds = %loop_end, %begin
%lp_idx = phi i32 [ 0, %begin ], [ %idx_next, %loop_end ]
%ptr = getelementptr inbounds %struct.Data, %struct.Data addrspace(5)* %arg, i32 0, i32 0, i32 %lp_idx
%val = load i32, i32 addrspace(5)* %ptr, align 8
%lp_cond = icmp eq i32 %val, %lp_idx
br i1 %lp_cond, label %loop_end, label %exit
exit: ; preds = %loop_end, %loop_body
%out = phi i32 [ 0, %loop_body ], [ 1, %loop_end ]
ret i32 %out
}
define void @no_free_scratch_sgpr_for_bp_copy(<32 x i32> %a, i32 %b) #0 {
; GCN-LABEL: no_free_scratch_sgpr_for_bp_copy:
; GCN: ; %bb.0:
; GCN: v_writelane_b32 [[VGPR_REG:v[0-9]+]], s34, 0
; GCN-NEXT: s_mov_b32 s34, s32
; GCN-NEXT: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s34
; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33 offset:128
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ;;#ASMEND
; GCN: v_readlane_b32 s34, [[VGPR_REG:v[0-9]+]], 0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
%local_val = alloca i32, align 128, addrspace(5)
store volatile i32 %b, i32 addrspace(5)* %local_val, align 128
; Use all clobberable registers, so BP has to spill to a VGPR.
call void asm sideeffect "",
"~{s0},~{s1},~{s2},~{s3},~{s4},~{s5},~{s6},~{s7},~{s8},~{s9}
,~{s10},~{s11},~{s12},~{s13},~{s14},~{s15},~{s16},~{s17},~{s18},~{s19}
,~{s20},~{s21},~{s22},~{s23},~{s24},~{s25},~{s26},~{s27},~{s28},~{s29}
,~{vcc_hi}"() #0
ret void
}
attributes #0 = { noinline nounwind }
attributes #1 = { noinline nounwind "stackrealign" }
attributes #2 = { noinline nounwind alignstack=4 }
attributes #3 = { noinline nounwind "no-realign-stack" }
attributes #4 = { noinline nounwind "frame-pointer"="all"}