1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-11-23 03:02:36 +01:00

[AMDGPU] Save all lanes for reserved VGPRs

When SGPRs are spilled to VGPRs, they can overwrite any lane. We need
to preserve the value of inactive lanes in function calls, so we save
the register even if it is marked as caller saved.

Also, teach buildPrologSpill to work when no registers are free like in
CodeGen/AMDGPU/pei-scavenge-vgpr-spill.mir and update the comment on
findScratchNonCalleeSaveRegister as it is not used anymore to realign
the stack pointer since D95865.

Differential Revision: https://reviews.llvm.org/D95946
This commit is contained in:
Sebastian Neubauer 2021-02-04 09:56:36 +01:00
parent ba7695d4ea
commit ab4f1aa423
8 changed files with 185 additions and 68 deletions

View File

@ -20,18 +20,10 @@ using namespace llvm;
#define DEBUG_TYPE "frame-info"
// Find a scratch register that we can use at the start of the prologue to
// re-align the stack pointer. We avoid using callee-save registers since they
// may appear to be free when this is called from canUseAsPrologue (during
// shrink wrapping), but then no longer be free when this is called from
// emitPrologue.
//
// FIXME: This is a bit conservative, since in the above case we could use one
// of the callee-save registers as a scratch temp to re-align the stack pointer,
// but we would then have to make sure that we were in fact saving at least one
// callee-save register in the prologue, which is additional complexity that
// doesn't seem worth the benefit.
// Find a scratch register that we can use in the prologue. We avoid using
// callee-save registers since they may appear to be free when this is called
// from canUseAsPrologue (during shrink wrapping), but then no longer be free
// when this is called from emitPrologue.
static MCRegister findScratchNonCalleeSaveRegister(MachineRegisterInfo &MRI,
LivePhysRegs &LiveRegs,
const TargetRegisterClass &RC,
@ -55,12 +47,6 @@ static MCRegister findScratchNonCalleeSaveRegister(MachineRegisterInfo &MRI,
}
}
// If we require an unused register, this is used in contexts where failure is
// an option and has an alternative plan. In other contexts, this must
// succeed0.
if (!Unused)
report_fatal_error("failed to find free scratch register");
return MCRegister();
}
@ -178,22 +164,35 @@ static void buildPrologSpill(const GCNSubtarget &ST, LivePhysRegs &LiveRegs,
MCPhysReg OffsetReg = findScratchNonCalleeSaveRegister(
MF->getRegInfo(), LiveRegs, AMDGPU::SReg_32_XM0RegClass);
bool HasOffsetReg = OffsetReg;
if (!HasOffsetReg) {
// No free register, use stack pointer and restore afterwards.
OffsetReg = SPReg;
}
BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_ADD_U32), OffsetReg)
.addReg(SPReg)
.addImm(Offset);
BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::SCRATCH_STORE_DWORD_SADDR))
.addReg(SpillReg, RegState::Kill)
.addReg(OffsetReg, RegState::Kill)
.addImm(0)
.addReg(OffsetReg, HasOffsetReg ? RegState::Kill : 0)
.addImm(0) // offset
.addImm(0) // glc
.addImm(0) // slc
.addImm(0) // dlc
.addMemOperand(MMO);
if (!HasOffsetReg) {
BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_SUB_U32), OffsetReg)
.addReg(SPReg)
.addImm(Offset);
}
} else {
MCPhysReg OffsetReg = findScratchNonCalleeSaveRegister(
MF->getRegInfo(), LiveRegs, AMDGPU::VGPR_32RegClass);
if (OffsetReg) {
BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32), OffsetReg)
.addImm(Offset);
@ -202,13 +201,35 @@ static void buildPrologSpill(const GCNSubtarget &ST, LivePhysRegs &LiveRegs,
.addReg(OffsetReg, RegState::Kill)
.addReg(ScratchRsrcReg)
.addReg(SPReg)
.addImm(0)
.addImm(0) // offset
.addImm(0) // glc
.addImm(0) // slc
.addImm(0) // tfe
.addImm(0) // dlc
.addImm(0) // swz
.addMemOperand(MMO);
} else {
// No free register, use stack pointer and restore afterwards.
BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_ADD_U32), SPReg)
.addReg(SPReg)
.addImm(Offset);
BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::BUFFER_STORE_DWORD_OFFSET))
.addReg(SpillReg, RegState::Kill)
.addReg(ScratchRsrcReg)
.addReg(SPReg)
.addImm(0) // offset
.addImm(0) // glc
.addImm(0) // slc
.addImm(0) // tfe
.addImm(0) // dlc
.addImm(0) // swz
.addMemOperand(MMO);
BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_SUB_U32), SPReg)
.addReg(SPReg)
.addImm(Offset);
}
}
LiveRegs.removeReg(SpillReg);
@ -241,6 +262,8 @@ static void buildEpilogReload(const GCNSubtarget &ST, LivePhysRegs &LiveRegs,
}
MCPhysReg OffsetReg = findScratchNonCalleeSaveRegister(
MF->getRegInfo(), LiveRegs, AMDGPU::SReg_32_XM0RegClass);
if (!OffsetReg)
report_fatal_error("failed to find free scratch register");
BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_ADD_U32), OffsetReg)
.addReg(SPReg)
@ -273,6 +296,8 @@ static void buildEpilogReload(const GCNSubtarget &ST, LivePhysRegs &LiveRegs,
MCPhysReg OffsetReg = findScratchNonCalleeSaveRegister(
MF->getRegInfo(), LiveRegs, AMDGPU::VGPR_32RegClass);
if (!OffsetReg)
report_fatal_error("failed to find free scratch register");
BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32), OffsetReg)
.addImm(Offset);
@ -821,6 +846,8 @@ static Register buildScratchExecCopy(LivePhysRegs &LiveRegs,
ScratchExecCopy = findScratchNonCalleeSaveRegister(
MRI, LiveRegs, *TRI.getWaveMaskRegClass());
if (!ScratchExecCopy)
report_fatal_error("failed to find free scratch register");
if (!IsProlog)
LiveRegs.removeReg(ScratchExecCopy);
@ -903,6 +930,8 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
MRI, LiveRegs, AMDGPU::VGPR_32RegClass);
if (!TmpVGPR)
report_fatal_error("failed to find free scratch register");
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
.addReg(FramePtrReg);
@ -920,6 +949,8 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
MRI, LiveRegs, AMDGPU::VGPR_32RegClass);
if (!TmpVGPR)
report_fatal_error("failed to find free scratch register");
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
.addReg(BasePtrReg);
@ -1140,6 +1171,8 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
MCPhysReg TempVGPR = findScratchNonCalleeSaveRegister(
MRI, LiveRegs, AMDGPU::VGPR_32RegClass);
if (!TempVGPR)
report_fatal_error("failed to find free scratch register");
buildEpilogReload(ST, LiveRegs, MBB, MBBI, TII, TempVGPR,
FuncInfo->getScratchRSrcReg(), StackPtrReg, FI);
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), FramePtrReg)
@ -1165,6 +1198,8 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
MCPhysReg TempVGPR = findScratchNonCalleeSaveRegister(
MRI, LiveRegs, AMDGPU::VGPR_32RegClass);
if (!TempVGPR)
report_fatal_error("failed to find free scratch register");
buildEpilogReload(ST, LiveRegs, MBB, MBBI, TII, TempVGPR,
FuncInfo->getScratchRSrcReg(), StackPtrReg, BasePtrFI);
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), BasePtrReg)

View File

@ -289,8 +289,6 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF,
assert(Size >= 4 && "invalid sgpr spill size");
assert(TRI->spillSGPRToVGPR() && "not spilling SGPRs to VGPRs");
const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs();
// Make sure to handle the case where a wide SGPR spill may span between two
// VGPRs.
for (unsigned I = 0; I < NumLanes; ++I, ++NumVGPRSpillLanes) {
@ -316,13 +314,14 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF,
return false;
}
Optional<int> CSRSpillFI;
if ((FrameInfo.hasCalls() || !isEntryFunction()) && CSRegs &&
isCalleeSavedReg(CSRegs, LaneVGPR)) {
CSRSpillFI = FrameInfo.CreateSpillStackObject(4, Align(4));
Optional<int> SpillFI;
// We need to preserve inactive lanes, so always save, even caller-save
// registers.
if (!isEntryFunction()) {
SpillFI = FrameInfo.CreateSpillStackObject(4, Align(4));
}
SpillVGPRs.push_back(SGPRSpillVGPRCSR(LaneVGPR, CSRSpillFI));
SpillVGPRs.push_back(SGPRSpillVGPRCSR(LaneVGPR, SpillFI));
// Add this register as live-in to all blocks to avoid machine verifer
// complaining about use of an undefined physical register.

View File

@ -214,11 +214,20 @@ define void @callee_func_sgpr_spill_no_calls(i32 %in) #0 {
; GCN-LABEL: {{^}}spill_only_csr_sgpr:
; GCN: s_waitcnt
; GCN-NEXT: s_or_saveexec_b64
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
; FLATSCR-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec,
; GCN-NEXT: v_writelane_b32 v0, s42, 0
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ; clobber s42
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: v_readlane_b32 s42, v0, 0
; GCN-NEXT: s_or_saveexec_b64
; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; FLATSCR-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec,
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define void @spill_only_csr_sgpr() {
call void asm sideeffect "; clobber s42", "~{s42}"()

View File

@ -7,11 +7,12 @@
name: merge_sgpr_spill_into_copy_from_exec_lo
tracksRegLiveness: true
machineFunctionInfo:
isEntryFunction: true
body: |
bb.0:
; CHECK-LABEL: name: merge_sgpr_spill_into_copy_from_exec_lo
; CHECK: liveins: $vgpr0
; CHECK: S_WAITCNT 0
; CHECK: S_NOP 0, implicit-def $exec_lo
; CHECK: $sgpr0 = S_MOV_B32 $exec_lo
; CHECK: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0
@ -31,11 +32,12 @@ body: |
name: merge_sgpr_spill_into_copy_from_exec_hi
tracksRegLiveness: true
machineFunctionInfo:
isEntryFunction: true
body: |
bb.0:
; CHECK-LABEL: name: merge_sgpr_spill_into_copy_from_exec_hi
; CHECK: liveins: $vgpr0
; CHECK: S_WAITCNT 0
; CHECK: S_NOP 0, implicit-def $exec_hi
; CHECK: $sgpr0 = S_MOV_B32 $exec_hi
; CHECK: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0
@ -55,11 +57,12 @@ body: |
name: merge_sgpr_spill_into_copy_from_exec
tracksRegLiveness: true
machineFunctionInfo:
isEntryFunction: true
body: |
bb.0:
; CHECK-LABEL: name: merge_sgpr_spill_into_copy_from_exec
; CHECK: liveins: $vgpr0
; CHECK: S_WAITCNT 0
; CHECK: S_NOP 0, implicit-def $exec
; CHECK: $sgpr0_sgpr1 = S_MOV_B64 $exec
; CHECK: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0, implicit-def $sgpr0_sgpr1, implicit $sgpr0_sgpr1
@ -85,11 +88,12 @@ body: |
name: reload_sgpr_spill_into_copy_to_exec_lo
tracksRegLiveness: true
machineFunctionInfo:
isEntryFunction: true
body: |
bb.0:
; CHECK-LABEL: name: reload_sgpr_spill_into_copy_to_exec_lo
; CHECK: liveins: $vgpr0
; CHECK: S_WAITCNT 0
; CHECK: S_NOP 0, implicit-def renamable $sgpr0, implicit-def dead renamable $sgpr1, implicit-def $exec_lo
; CHECK: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0
; CHECK: $sgpr0 = V_READLANE_B32 $vgpr0, 0
@ -107,11 +111,12 @@ body: |
name: reload_sgpr_spill_into_copy_to_exec_hi
tracksRegLiveness: true
machineFunctionInfo:
isEntryFunction: true
body: |
bb.0:
; CHECK-LABEL: name: reload_sgpr_spill_into_copy_to_exec_hi
; CHECK: liveins: $vgpr0
; CHECK: S_WAITCNT 0
; CHECK: S_NOP 0, implicit-def renamable $sgpr0, implicit-def dead renamable $sgpr1, implicit-def $exec_hi
; CHECK: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0
; CHECK: $sgpr0 = V_READLANE_B32 $vgpr0, 0
@ -129,11 +134,12 @@ body: |
name: reload_sgpr_spill_into_copy_to_exec
tracksRegLiveness: true
machineFunctionInfo:
isEntryFunction: true
body: |
bb.0:
; CHECK-LABEL: name: reload_sgpr_spill_into_copy_to_exec
; CHECK: liveins: $vgpr0
; CHECK: S_WAITCNT 0
; CHECK: S_NOP 0, implicit-def renamable $sgpr0_sgpr1, implicit-def dead renamable $sgpr2_sgpr3, implicit-def $exec
; CHECK: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0, implicit-def $sgpr0_sgpr1, implicit $sgpr0_sgpr1
; CHECK: $vgpr0 = V_WRITELANE_B32 killed $sgpr1, 1, $vgpr0, implicit $sgpr0_sgpr1

View File

@ -7,12 +7,13 @@
name: merge_sgpr_spill_into_copy_from_m0
tracksRegLiveness: true
machineFunctionInfo:
isEntryFunction: true
body: |
bb.0:
; CHECK-LABEL: name: merge_sgpr_spill_into_copy_from_m0
; CHECK: liveins: $vgpr0
; CHECK: S_WAITCNT 0
; CHECK: S_NOP 0, implicit-def $m0
; CHECK: $sgpr0 = S_MOV_B32 $m0
; CHECK: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0
@ -36,12 +37,13 @@ body: |
name: reload_sgpr_spill_into_copy_to_m0
tracksRegLiveness: true
machineFunctionInfo:
isEntryFunction: true
body: |
bb.0:
; CHECK-LABEL: name: reload_sgpr_spill_into_copy_to_m0
; CHECK: liveins: $vgpr0
; CHECK: S_WAITCNT 0
; CHECK: S_NOP 0, implicit-def renamable $sgpr0, implicit-def dead renamable $sgpr1, implicit-def $m0
; CHECK: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0
; CHECK: $sgpr0 = V_READLANE_B32 $vgpr0, 0

View File

@ -488,22 +488,39 @@ define hidden void @void_func_void_clobber_s33() #1 {
; GFX9-LABEL: void_func_void_clobber_s33:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-NEXT: v_writelane_b32 v0, s33, 0
; GFX9-NEXT: ;;#ASMSTART
; GFX9-NEXT: ; clobber
; GFX9-NEXT: ;;#ASMEND
; GFX9-NEXT: v_readlane_b32 s33, v0, 0
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: void_func_void_clobber_s33:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s4, -1
; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s4
; GFX10-NEXT: v_writelane_b32 v0, s33, 0
; GFX10-NEXT: ;;#ASMSTART
; GFX10-NEXT: ; clobber
; GFX10-NEXT: ;;#ASMEND
; GFX10-NEXT: v_readlane_b32 s33, v0, 0
; GFX10-NEXT: s_or_saveexec_b32 s4, -1
; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s4
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_setpc_b64 s[30:31]
call void asm sideeffect "; clobber", "~{s33}"() #0
ret void
@ -513,22 +530,39 @@ define hidden void @void_func_void_clobber_s34() #1 {
; GFX9-LABEL: void_func_void_clobber_s34:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-NEXT: v_writelane_b32 v0, s34, 0
; GFX9-NEXT: ;;#ASMSTART
; GFX9-NEXT: ; clobber
; GFX9-NEXT: ;;#ASMEND
; GFX9-NEXT: v_readlane_b32 s34, v0, 0
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: void_func_void_clobber_s34:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s4, -1
; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s4
; GFX10-NEXT: v_writelane_b32 v0, s34, 0
; GFX10-NEXT: ;;#ASMSTART
; GFX10-NEXT: ; clobber
; GFX10-NEXT: ;;#ASMEND
; GFX10-NEXT: v_readlane_b32 s34, v0, 0
; GFX10-NEXT: s_or_saveexec_b32 s4, -1
; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s4
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_setpc_b64 s[30:31]
call void asm sideeffect "; clobber", "~{s34}"() #0
ret void

View File

@ -73,6 +73,9 @@ define internal fastcc void @csr_vgpr_spill_fp_tailcall_callee() #0 {
; CHECK-LABEL: csr_vgpr_spill_fp_tailcall_callee:
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_or_saveexec_b64 s[4:5], -1
; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b64 exec, s[4:5]
; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; clobber csr v40
@ -83,6 +86,9 @@ define internal fastcc void @csr_vgpr_spill_fp_tailcall_callee() #0 {
; CHECK-NEXT: s_add_u32 s4, s4, callee_has_fp@rel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, callee_has_fp@rel32@hi+12
; CHECK-NEXT: v_readlane_b32 s33, v1, 0
; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b64 exec, s[6:7]
; CHECK-NEXT: s_setpc_b64 s[4:5]
bb:
call void asm sideeffect "; clobber csr v40", "~{v40}"()

View File

@ -25,37 +25,59 @@ body: |
; GFX8-LABEL: name: pei_scavenge_vgpr_spill
; GFX8: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr2
; GFX8: $sgpr4_sgpr5 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def $scc, implicit $exec
; GFX8: $sgpr32 = S_ADD_U32 $sgpr32, 8196, implicit-def $scc
; GFX8: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.3, addrspace 5)
; GFX8: $sgpr32 = S_SUB_U32 $sgpr32, 8196, implicit-def $scc
; GFX8: $exec = S_MOV_B64 killed $sgpr4_sgpr5
; GFX8: $vgpr2 = V_WRITELANE_B32 $sgpr33, 0, undef $vgpr2
; GFX8: $sgpr33 = frame-setup S_ADD_U32 $sgpr32, 524224, implicit-def $scc
; GFX8: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def $scc
; GFX8: $sgpr32 = frame-setup S_ADD_U32 $sgpr32, 1572864, implicit-def $scc
; GFX8: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec
; GFX8: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 12, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.3, addrspace 5)
; GFX8: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 20, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.4, addrspace 5)
; GFX8: $vgpr3 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec
; GFX8: $vcc_lo = S_MOV_B32 8192
; GFX8: $vgpr3, dead $vcc = V_ADD_CO_U32_e64 killed $vcc_lo, killed $vgpr3, 0, implicit $exec
; GFX8: $vgpr0 = V_OR_B32_e32 killed $vgpr3, $vgpr1, implicit $exec
; GFX8: $sgpr32 = frame-destroy S_SUB_U32 $sgpr32, 1572864, implicit-def $scc
; GFX8: $sgpr33 = V_READLANE_B32 $vgpr2, 0
; GFX8: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 12, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.3, addrspace 5)
; GFX8: $sgpr4_sgpr5 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def $scc, implicit $exec
; GFX8: $vgpr0 = V_MOV_B32_e32 8196, implicit $exec
; GFX8: $vgpr2 = BUFFER_LOAD_DWORD_OFFEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.3, addrspace 5)
; GFX8: $exec = S_MOV_B64 killed $sgpr4_sgpr5
; GFX8: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 20, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.4, addrspace 5)
; GFX8: S_ENDPGM 0, csr_amdgpu_allvgprs
; GFX9-LABEL: name: pei_scavenge_vgpr_spill
; GFX9: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr2
; GFX9: $sgpr4_sgpr5 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def $scc, implicit $exec
; GFX9: $sgpr32 = S_ADD_U32 $sgpr32, 8196, implicit-def $scc
; GFX9: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.3, addrspace 5)
; GFX9: $sgpr32 = S_SUB_U32 $sgpr32, 8196, implicit-def $scc
; GFX9: $exec = S_MOV_B64 killed $sgpr4_sgpr5
; GFX9: $vgpr2 = V_WRITELANE_B32 $sgpr33, 0, undef $vgpr2
; GFX9: $sgpr33 = frame-setup S_ADD_U32 $sgpr32, 524224, implicit-def $scc
; GFX9: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def $scc
; GFX9: $sgpr32 = frame-setup S_ADD_U32 $sgpr32, 1572864, implicit-def $scc
; GFX9: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec
; GFX9: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 12, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.3, addrspace 5)
; GFX9: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 20, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.4, addrspace 5)
; GFX9: $vgpr3 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec
; GFX9: $vgpr3 = V_ADD_U32_e32 8192, killed $vgpr3, implicit $exec
; GFX9: $vgpr0 = V_OR_B32_e32 killed $vgpr3, $vgpr1, implicit $exec
; GFX9: $sgpr32 = frame-destroy S_SUB_U32 $sgpr32, 1572864, implicit-def $scc
; GFX9: $sgpr33 = V_READLANE_B32 $vgpr2, 0
; GFX9: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 12, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.3, addrspace 5)
; GFX9: $sgpr4_sgpr5 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def $scc, implicit $exec
; GFX9: $vgpr0 = V_MOV_B32_e32 8196, implicit $exec
; GFX9: $vgpr2 = BUFFER_LOAD_DWORD_OFFEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.3, addrspace 5)
; GFX9: $exec = S_MOV_B64 killed $sgpr4_sgpr5
; GFX9: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 20, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.4, addrspace 5)
; GFX9: S_ENDPGM 0, csr_amdgpu_allvgprs
; GFX9-FLATSCR-LABEL: name: pei_scavenge_vgpr_spill
; GFX9-FLATSCR: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr2
; GFX9-FLATSCR: $sgpr4_sgpr5 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def $scc, implicit $exec
; GFX9-FLATSCR: $sgpr4 = S_ADD_U32 $sgpr32, 8196, implicit-def $scc
; GFX9-FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr2, killed $sgpr4, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into %stack.3, addrspace 5)
; GFX9-FLATSCR: $exec = S_MOV_B64 killed $sgpr4_sgpr5
; GFX9-FLATSCR: $vgpr2 = V_WRITELANE_B32 $sgpr33, 0, undef $vgpr2
; GFX9-FLATSCR: $sgpr33 = frame-setup S_ADD_U32 $sgpr32, 8191, implicit-def $scc
; GFX9-FLATSCR: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294959104, implicit-def $scc
@ -65,6 +87,10 @@ body: |
; GFX9-FLATSCR: $vgpr0 = V_OR_B32_e32 killed $vcc_hi, $vgpr1, implicit $exec
; GFX9-FLATSCR: $sgpr32 = frame-destroy S_SUB_U32 $sgpr32, 24576, implicit-def $scc
; GFX9-FLATSCR: $sgpr33 = V_READLANE_B32 $vgpr2, 0
; GFX9-FLATSCR: $sgpr4_sgpr5 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def $scc, implicit $exec
; GFX9-FLATSCR: $sgpr4 = S_ADD_U32 $sgpr32, 8196, implicit-def $scc
; GFX9-FLATSCR: $vgpr2 = SCRATCH_LOAD_DWORD_SADDR killed $sgpr4, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.3, addrspace 5)
; GFX9-FLATSCR: $exec = S_MOV_B64 killed $sgpr4_sgpr5
; GFX9-FLATSCR: S_ENDPGM 0, csr_amdgpu_allvgprs
$vgpr0 = V_MOV_B32_e32 %stack.0, implicit $exec
$vgpr0 = V_OR_B32_e32 %stack.1, $vgpr1, implicit $exec