From d4ef89cda8ad3b055f05f604dd48655f12d52b1a Mon Sep 17 00:00:00 2001
From: Ruiling Song
Date: Wed, 10 Mar 2021 11:04:54 +0800
Subject: [PATCH] [AMDGPU] Always create Stack Object for reserved VGPR

As we may overwrite inactive lanes of a caller-save-vgpr, we should
always save/restore the reserved vgpr for sgpr spill.

Reviewed by: arsenm

Differential Revision: https://reviews.llvm.org/D98319
---
 lib/Target/AMDGPU/SILowerSGPRSpills.cpp   |  9 ++--
 test/CodeGen/AMDGPU/callee-frame-setup.ll | 59 +++++++++++++++++------
 test/CodeGen/AMDGPU/stack-realign.ll      |  8 +--
 3 files changed, 50 insertions(+), 26 deletions(-)

diff --git a/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
index add83746b26..ba96ae6b1b3 100644
--- a/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
+++ b/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
@@ -262,13 +262,10 @@ static bool lowerShiftReservedVGPR(MachineFunction &MF,
   if (!LowestAvailableVGPR)
     LowestAvailableVGPR = PreReservedVGPR;

-  const MCPhysReg *CSRegs = MF.getRegInfo().getCalleeSavedRegs();
   MachineFrameInfo &FrameInfo = MF.getFrameInfo();
-  Optional<int> FI;
-  // Check if we are reserving a CSR. Create a stack object for a possible spill
-  // in the function prologue.
-  if (FuncInfo->isCalleeSavedReg(CSRegs, LowestAvailableVGPR))
-    FI = FrameInfo.CreateSpillStackObject(4, Align(4));
+  // Create a stack object for a possible spill in the function prologue.
+  // Note Non-CSR VGPR also need this as we may overwrite inactive lanes.
+  Optional<int> FI = FrameInfo.CreateSpillStackObject(4, Align(4));

   // Find saved info about the pre-reserved register.
   const auto *ReservedVGPRInfoItr =
diff --git a/test/CodeGen/AMDGPU/callee-frame-setup.ll b/test/CodeGen/AMDGPU/callee-frame-setup.ll
index 3cadd4ca636..c7dfa16e4d3 100644
--- a/test/CodeGen/AMDGPU/callee-frame-setup.ll
+++ b/test/CodeGen/AMDGPU/callee-frame-setup.ll
@@ -269,6 +269,10 @@ define void @callee_with_stack_no_fp_elim_csr_vgpr() #1 {
 ; Use a copy to a free SGPR instead of introducing a second CSR VGPR.
 ; GCN-LABEL: {{^}}last_lane_vgpr_for_fp_csr:
 ; GCN: s_waitcnt
+; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}}
+; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s32 offset:12 ; 4-byte Folded Spill
+; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]]
 ; GCN-NEXT: v_writelane_b32 v1, s33, 63
 ; GCN-COUNT-60: v_writelane_b32 v1
 ; GCN: s_mov_b32 s33, s32
@@ -280,11 +284,15 @@ define void @callee_with_stack_no_fp_elim_csr_vgpr() #1 {
 ; GCN: ;;#ASMSTART
 ; GCN: v_writelane_b32 v1
-; MUBUF: s_add_u32 s32, s32, 0x300
-; MUBUF: s_sub_u32 s32, s32, 0x300
-; FLATSCR: s_add_u32 s32, s32, 12
-; FLATSCR: s_sub_u32 s32, s32, 12
+; MUBUF: s_add_u32 s32, s32, 0x400
+; MUBUF: s_sub_u32 s32, s32, 0x400
+; FLATSCR: s_add_u32 s32, s32, 16
+; FLATSCR: s_sub_u32 s32, s32, 16
 ; GCN-NEXT: v_readlane_b32 s33, v1, 63
+; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
+; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s32 offset:12 ; 4-byte Folded Reload
+; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NEXT: s_setpc_b64
 define void @last_lane_vgpr_for_fp_csr() #1 {
@@ -306,6 +314,10 @@ define void @last_lane_vgpr_for_fp_csr() #1 {
 ; Use a copy to a free SGPR instead of introducing a second CSR VGPR.
 ; GCN-LABEL: {{^}}no_new_vgpr_for_fp_csr:
 ; GCN: s_waitcnt
+; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}}
+; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s32 offset:12 ; 4-byte Folded Spill
+; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]]
 ; GCN-COUNT-62: v_writelane_b32 v1,
 ; GCN: s_mov_b32 [[FP_COPY:s[0-9]+]], s33
 ; GCN-NEXT: s_mov_b32 s33, s32
@@ -318,12 +330,16 @@ define void @last_lane_vgpr_for_fp_csr() #1 {
 ; GCN: v_writelane_b32 v1,
 ; MUBUF: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
 ; FLATSCR: scratch_load_dword v41, off, s33 ; 4-byte Folded Reload
-; MUBUF: s_add_u32 s32, s32, 0x300
-; FLATSCR: s_add_u32 s32, s32, 12
+; MUBUF: s_add_u32 s32, s32, 0x400
+; FLATSCR: s_add_u32 s32, s32, 16
 ; GCN-COUNT-64: v_readlane_b32 s{{[0-9]+}}, v1
-; MUBUF-NEXT: s_sub_u32 s32, s32, 0x300
-; FLATSCR-NEXT: s_sub_u32 s32, s32, 12
+; MUBUF-NEXT: s_sub_u32 s32, s32, 0x400
+; FLATSCR-NEXT: s_sub_u32 s32, s32, 16
 ; GCN-NEXT: s_mov_b32 s33, [[FP_COPY]]
+; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
+; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s32 offset:12 ; 4-byte Folded Reload
+; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NEXT: s_setpc_b64
 define void @no_new_vgpr_for_fp_csr() #1 {
@@ -368,6 +384,10 @@ define void @realign_stack_no_fp_elim() #1 {

 ; GCN-LABEL: {{^}}no_unused_non_csr_sgpr_for_fp:
 ; GCN: s_waitcnt
+; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}}
+; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s32 offset:8 ; 4-byte Folded Spill
+; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]]
 ; GCN-NEXT: v_writelane_b32 v1, s33, 2
 ; GCN-NEXT: v_writelane_b32 v1, s30, 0
 ; GCN-NEXT: s_mov_b32 s33, s32
@@ -377,15 +397,20 @@ define void @realign_stack_no_fp_elim() #1 {
 ; FLATSCR: scratch_store_dword off, [[ZERO]], s33 offset:4
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 ; GCN: ;;#ASMSTART
-; MUBUF: v_readlane_b32 s4, v1, 0
-; MUBUF-NEXT: s_add_u32 s32, s32, 0x200
+; MUBUF: s_add_u32 s32, s32, 0x300
+; MUBUF-NEXT: v_readlane_b32 s4, v1, 0
 ; MUBUF-NEXT: v_readlane_b32 s5, v1, 1
-; FLATSCR: v_readlane_b32 s0, v1, 0
-; FLATSCR-NEXT: s_add_u32 s32, s32, 8
+; FLATSCR: s_add_u32 s32, s32, 12
+; FLATSCR-NEXT: v_readlane_b32 s0, v1, 0
 ; FLATSCR-NEXT: v_readlane_b32 s1, v1, 1
-; MUBUF-NEXT: s_sub_u32 s32, s32, 0x200
-; FLATSCR-NEXT: s_sub_u32 s32, s32, 8
+; MUBUF-NEXT: s_sub_u32 s32, s32, 0x300
+; FLATSCR-NEXT: s_sub_u32 s32, s32, 12
 ; GCN-NEXT: v_readlane_b32 s33, v1, 2
+; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
+; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s32 offset:8 ; 4-byte Folded Reload
+; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
+; GCN-NEXT: s_waitcnt vmcnt(0)
 ; MUBUF-NEXT: s_setpc_b64 s[4:5]
 ; FLATSCR-NEXT: s_setpc_b64 s[0:1]
 define void @no_unused_non_csr_sgpr_for_fp() #1 {
@@ -645,9 +670,11 @@ define void @callee_need_to_spill_fp_to_reg() #1 {
 ; scratch VGPR to hold the offset.
 ; GCN-LABEL: {{^}}spill_fp_to_memory_scratch_reg_needed_mubuf_offset
 ; MUBUF: s_or_saveexec_b64 s[4:5], -1
+; MUBUF: v_mov_b32_e32 v0, 0x1008
+; MUBUF-NEXT: buffer_store_dword v39, v0, s[0:3], s32 offen ; 4-byte Folded Spill
 ; MUBUF: v_mov_b32_e32 v0, s33
-; GCN-NOT: v_mov_b32_e32 v0, 0x1008
-; MUBUF-NEXT: v_mov_b32_e32 v1, 0x1008
+; GCN-NOT: v_mov_b32_e32 v0, 0x100c
+; MUBUF-NEXT: v_mov_b32_e32 v1, 0x100c
 ; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], s32 offen ; 4-byte Folded Spill
 ; FLATSCR: s_add_u32 [[SOFF:s[0-9]+]], s33, 0x1004
 ; FLATSCR: v_mov_b32_e32 v0, 0
diff --git a/test/CodeGen/AMDGPU/stack-realign.ll b/test/CodeGen/AMDGPU/stack-realign.ll
index 3132668b505..b53c10ce599 100644
--- a/test/CodeGen/AMDGPU/stack-realign.ll
+++ b/test/CodeGen/AMDGPU/stack-realign.ll
@@ -293,12 +293,12 @@ define void @spill_bp_to_memory_scratch_reg_needed_mubuf_offset(<32 x i32> %a, i
 ; GCN-LABEL: spill_bp_to_memory_scratch_reg_needed_mubuf_offset
 ; GCN: s_or_saveexec_b64 s[4:5], -1
 ; GCN: v_mov_b32_e32 v0, s33
-; GCN-NOT: v_mov_b32_e32 v0, 0x1084
-; GCN-NEXT: v_mov_b32_e32 v1, 0x1084
-; GCN-NEXT: buffer_store_dword v0, v1, s[0:3], s32 offen
-; GCN: v_mov_b32_e32 v0, s34
 ; GCN-NOT: v_mov_b32_e32 v0, 0x1088
 ; GCN-NEXT: v_mov_b32_e32 v1, 0x1088
+; GCN-NEXT: buffer_store_dword v0, v1, s[0:3], s32 offen
+; GCN: v_mov_b32_e32 v0, s34
+; GCN-NOT: v_mov_b32_e32 v0, 0x108c
+; GCN-NEXT: v_mov_b32_e32 v1, 0x108c
 ; GCN-NEXT: buffer_store_dword v0, v1, s[0:3], s32 offen
 %local_val = alloca i32, align 128, addrspace(5)
 store volatile i32 %b, i32 addrspace(5)* %local_val, align 128
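
Note (illustrative, not part of the patch): the FileCheck lines added above all describe the same prologue/epilogue shape around the VGPR reserved for SGPR spilling. For the MUBUF configuration it looks roughly like the sketch below; the register numbers and the frame offset are taken from the last_lane_vgpr_for_fp_csr checks and are only an example.

  s_or_saveexec_b64 s[4:5], -1                       ; enable all lanes, keep the old exec mask in s[4:5]
  buffer_store_dword v1, off, s[0:3], s32 offset:12  ; save the reserved VGPR with every lane active
  s_mov_b64 exec, s[4:5]                             ; restore the caller's exec mask
  ; ... lanes of v1 are now used as SGPR spill slots via v_writelane_b32 ...
  s_or_saveexec_b64 s[4:5], -1
  buffer_load_dword v1, off, s[0:3], s32 offset:12   ; reload the reserved VGPR with every lane active
  s_mov_b64 exec, s[4:5]

Storing and reloading with exec = -1 is what preserves the caller's values in lanes that are inactive at the call site. Before this change the stack slot (and with it the save/restore) was only created when the chosen VGPR was a callee-saved register, so picking a non-CSR VGPR could clobber the caller's inactive lanes; the patch makes the spill slot unconditional.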