From 21c0f36daba2463932cfeddb229ebb578e06fcfc Mon Sep 17 00:00:00 2001
From: Matt Arsenault
Date: Tue, 28 Jul 2020 17:36:14 -0400
Subject: [PATCH] AMDGPU: Correct prolog SP initialization logic

Having callees that will read SP is not the only reason we need to
reference the stack pointer.
---
 lib/Target/AMDGPU/SIFrameLowering.cpp         | 43 +++++++++++++++++--
 lib/Target/AMDGPU/SIFrameLowering.h           |  2 +
 .../GlobalISel/dynamic-alloca-uniform.ll      |  6 +++
 .../AMDGPU/GlobalISel/non-entry-alloca.ll     |  6 ++-
 test/CodeGen/AMDGPU/non-entry-alloca.ll       |  2 +
 5 files changed, 54 insertions(+), 5 deletions(-)

diff --git a/lib/Target/AMDGPU/SIFrameLowering.cpp b/lib/Target/AMDGPU/SIFrameLowering.cpp
index a5b04570655..fdccd833b36 100644
--- a/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -447,7 +447,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
   }
   assert(ScratchWaveOffsetReg);
 
-  if (MF.getFrameInfo().hasCalls()) {
+  if (requiresStackPointerReference(MF)) {
     Register SPReg = MFI->getStackPtrOffsetReg();
     assert(SPReg != AMDGPU::SP_REG);
     BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), SPReg)
@@ -1262,6 +1262,20 @@ MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr(
   return MBB.erase(I);
 }
 
+/// Returns true if the frame will require a reference to the stack pointer.
+///
+/// This is the set of conditions common to setting up the stack pointer in a
+/// kernel, and to using a frame pointer in a callable function.
+///
+/// FIXME: Should also check hasOpaqueSPAdjustment and if any inline asm
+/// references SP.
+static bool frameTriviallyRequiresSP(const MachineFrameInfo &MFI) {
+  return MFI.hasVarSizedObjects() || MFI.hasStackMap() || MFI.hasPatchPoint();
+}
+
+// The FP for kernels is always known 0, so we never really need to set up an
+// explicit register for it. However, DisableFramePointerElim will force us to
+// use a register for it.
 bool SIFrameLowering::hasFP(const MachineFunction &MF) const {
   const MachineFrameInfo &MFI = MF.getFrameInfo();
 
@@ -1277,8 +1291,31 @@ bool SIFrameLowering::hasFP(const MachineFunction &MF) const {
     return MFI.getStackSize() != 0;
   }
 
-  return MFI.hasVarSizedObjects() || MFI.isFrameAddressTaken() ||
-         MFI.hasStackMap() || MFI.hasPatchPoint() ||
+  return frameTriviallyRequiresSP(MFI) || MFI.isFrameAddressTaken() ||
          MF.getSubtarget().getRegisterInfo()->needsStackRealignment(MF) ||
          MF.getTarget().Options.DisableFramePointerElim(MF);
 }
+
+// This is essentially a reduced version of hasFP for entry functions. Since the
+// stack pointer is known 0 on entry to kernels, we never really need an FP
+// register. We may need to initialize the stack pointer depending on the frame
+// properties, which logically overlaps many of the cases where an ordinary
+// function would require an FP.
+bool SIFrameLowering::requiresStackPointerReference(
+    const MachineFunction &MF) const {
+  // Callable functions always require a stack pointer reference.
+  assert(MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction() &&
+         "only expected to call this for entry points");
+
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+
+  // Entry points ordinarily don't need to initialize SP. We have to set it up
+  // for callees if there are any. Also note tail calls are impossible/don't
+  // make any sense for kernels.
+  if (MFI.hasCalls())
+    return true;
+
+  // We still need to initialize the SP if we're doing anything weird that
+  // references the SP, like variable sized stack objects.
+  return frameTriviallyRequiresSP(MFI);
+}
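Taken in isolation, the new predicate split is easy to model: frameTriviallyRequiresSP
names the conditions shared by kernels (SP setup) and callable functions (FP use),
and requiresStackPointerReference layers the has-calls case on top. Below is a
minimal standalone C++ sketch of that decision logic; FrameProps and the main()
driver are hypothetical stand-ins for MachineFrameInfo and the real query sites,
not LLVM's API. Only the boolean structure mirrors the patch.

    #include <cstdio>

    // Hypothetical stand-in for the MachineFrameInfo queries the patch uses.
    struct FrameProps {
      bool HasCalls = false;
      bool HasVarSizedObjects = false;
      bool HasStackMap = false;
      bool HasPatchPoint = false;
    };

    // Mirrors frameTriviallyRequiresSP: the conditions common to SP setup in
    // a kernel and FP use in a callable function.
    static bool frameTriviallyRequiresSP(const FrameProps &MFI) {
      return MFI.HasVarSizedObjects || MFI.HasStackMap || MFI.HasPatchPoint;
    }

    // Mirrors requiresStackPointerReference for entry functions: callees need
    // SP set up, and so does anything else that reads SP.
    static bool requiresStackPointerReference(const FrameProps &MFI) {
      if (MFI.HasCalls)
        return true;
      return frameTriviallyRequiresSP(MFI);
    }

    int main() {
      FrameProps CallsOnly;
      CallsOnly.HasCalls = true;
      FrameProps DynAllocaOnly; // the case the old hasCalls() check missed
      DynAllocaOnly.HasVarSizedObjects = true;
      FrameProps Empty;

      std::printf("calls only     -> %d\n", requiresStackPointerReference(CallsOnly));     // 1
      std::printf("dynamic alloca -> %d\n", requiresStackPointerReference(DynAllocaOnly)); // 1, newly true
      std::printf("empty kernel   -> %d\n", requiresStackPointerReference(Empty));         // 0
    }

The dynamic-alloca case is exactly what the test updates below exercise: those
kernels make no calls, so the old hasCalls() check left s32 uninitialized.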
diff --git a/lib/Target/AMDGPU/SIFrameLowering.h b/lib/Target/AMDGPU/SIFrameLowering.h
index e8943204066..b95dd066278 100644
--- a/lib/Target/AMDGPU/SIFrameLowering.h
+++ b/lib/Target/AMDGPU/SIFrameLowering.h
@@ -71,6 +71,8 @@ private:
 
 public:
   bool hasFP(const MachineFunction &MF) const override;
+
+  bool requiresStackPointerReference(const MachineFunction &MF) const;
 };
 
 } // end namespace llvm
diff --git a/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll b/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll
index 197068d395d..f854d2facb0 100644
--- a/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll
+++ b/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll
@@ -15,6 +15,7 @@ define amdgpu_kernel void @kernel_dynamic_stackalloc_sgpr_align4(i32 %n) {
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_lshl2_add_u32 s4, s4, 15
 ; GFX9-NEXT:    s_and_b32 s4, s4, -16
+; GFX9-NEXT:    s_movk_i32 s32, 0x400
 ; GFX9-NEXT:    s_lshl_b32 s4, s4, 6
 ; GFX9-NEXT:    s_add_u32 s4, s32, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
@@ -26,6 +27,7 @@ define amdgpu_kernel void @kernel_dynamic_stackalloc_sgpr_align4(i32 %n) {
 ; GFX10-LABEL: kernel_dynamic_stackalloc_sgpr_align4:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_add_u32 s6, s6, s9
+; GFX10-NEXT:    s_movk_i32 s32, 0x200
 ; GFX10-NEXT:    s_mov_b32 s33, 0
 ; GFX10-NEXT:    s_addc_u32 s7, s7, 0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6
@@ -117,6 +119,7 @@ define amdgpu_kernel void @kernel_dynamic_stackalloc_sgpr_align16(i32 %n) {
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_lshl2_add_u32 s4, s4, 15
 ; GFX9-NEXT:    s_and_b32 s4, s4, -16
+; GFX9-NEXT:    s_movk_i32 s32, 0x400
 ; GFX9-NEXT:    s_lshl_b32 s4, s4, 6
 ; GFX9-NEXT:    s_add_u32 s4, s32, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
@@ -128,6 +131,7 @@ define amdgpu_kernel void @kernel_dynamic_stackalloc_sgpr_align16(i32 %n) {
 ; GFX10-LABEL: kernel_dynamic_stackalloc_sgpr_align16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_add_u32 s6, s6, s9
+; GFX10-NEXT:    s_movk_i32 s32, 0x200
 ; GFX10-NEXT:    s_mov_b32 s33, 0
 ; GFX10-NEXT:    s_addc_u32 s7, s7, 0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6
@@ -219,6 +223,7 @@ define amdgpu_kernel void @kernel_dynamic_stackalloc_sgpr_align32(i32 %n) {
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_lshl2_add_u32 s4, s4, 15
 ; GFX9-NEXT:    s_and_b32 s4, s4, -16
+; GFX9-NEXT:    s_movk_i32 s32, 0x800
 ; GFX9-NEXT:    s_lshl_b32 s4, s4, 6
 ; GFX9-NEXT:    s_add_u32 s4, s32, s4
 ; GFX9-NEXT:    s_and_b32 s4, s4, 0xfffff800
@@ -231,6 +236,7 @@ define amdgpu_kernel void @kernel_dynamic_stackalloc_sgpr_align32(i32 %n) {
 ; GFX10-LABEL: kernel_dynamic_stackalloc_sgpr_align32:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_add_u32 s6, s6, s9
+; GFX10-NEXT:    s_movk_i32 s32, 0x400
 ; GFX10-NEXT:    s_mov_b32 s33, 0
 ; GFX10-NEXT:    s_addc_u32 s7, s7, 0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6
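The new s_movk_i32 s32 immediates above are consistent with a wave-scaled
stack pointer: the scalar SP counts bytes summed across the whole wavefront,
so a per-lane frame size is multiplied by the wave size before it lands in
s32 (the GFX9 output applies the same x64 scaling to the dynamic size via
s_lshl_b32 s4, s4, 6). The sketch below checks that model against the emitted
constants; the per-lane sizes and the wave32 assumption for these GFX10 runs
are inferred from the constants, not taken from the .ll source.

    #include <cstdint>

    // Assumed model: the scalar SP holds per-lane bytes scaled by wave size.
    constexpr uint32_t spInit(uint32_t PerLaneBytes, uint32_t WaveSize) {
      return PerLaneBytes * WaveSize;
    }

    // GFX9 runs are wave64; these GFX10 runs appear to be wave32.
    static_assert(spInit(16, 64) == 0x400, "align4/align16 kernels, GFX9");
    static_assert(spInit(16, 32) == 0x200, "align4/align16 kernels, GFX10");
    static_assert(spInit(32, 64) == 0x800, "align32 kernel, GFX9");
    static_assert(spInit(32, 32) == 0x400, "align32 kernel, GFX10");

    int main() {} // compile-time checks only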
diff --git a/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll b/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll
index 0466a61b4b1..3fa2613a450 100644
--- a/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll
+++ b/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll
@@ -18,12 +18,13 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache
 ; GCN-NEXT:    s_addc_u32 flat_scratch_hi, s7, 0
 ; GCN-NEXT:    s_add_u32 s0, s0, s9
 ; GCN-NEXT:    s_addc_u32 s1, s1, 0
-; GCN-NEXT:    s_mov_b32 s33, 0
+; GCN-NEXT:    s_movk_i32 s32, 0x400
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_cmp_lg_u32 s6, 0
 ; GCN-NEXT:    s_cselect_b32 s6, 1, 0
 ; GCN-NEXT:    s_and_b32 s6, s6, 1
 ; GCN-NEXT:    s_cmp_lg_u32 s6, 0
+; GCN-NEXT:    s_mov_b32 s33, 0
 ; GCN-NEXT:    s_cbranch_scc1 BB0_3
 ; GCN-NEXT:  ; %bb.1: ; %bb.0
 ; GCN-NEXT:    s_load_dword s6, s[4:5], 0xc
@@ -99,12 +100,13 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache
 ; GCN-NEXT:    s_addc_u32 flat_scratch_hi, s7, 0
 ; GCN-NEXT:    s_add_u32 s0, s0, s9
 ; GCN-NEXT:    s_addc_u32 s1, s1, 0
-; GCN-NEXT:    s_mov_b32 s33, 0
+; GCN-NEXT:    s_movk_i32 s32, 0x1000
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_cmp_lg_u32 s6, 0
 ; GCN-NEXT:    s_cselect_b32 s6, 1, 0
 ; GCN-NEXT:    s_and_b32 s6, s6, 1
 ; GCN-NEXT:    s_cmp_lg_u32 s6, 0
+; GCN-NEXT:    s_mov_b32 s33, 0
 ; GCN-NEXT:    s_cbranch_scc1 BB1_2
 ; GCN-NEXT:  ; %bb.1: ; %bb.0
 ; GCN-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
diff --git a/test/CodeGen/AMDGPU/non-entry-alloca.ll b/test/CodeGen/AMDGPU/non-entry-alloca.ll
index 0cd60bd8203..58085f89e04 100644
--- a/test/CodeGen/AMDGPU/non-entry-alloca.ll
+++ b/test/CodeGen/AMDGPU/non-entry-alloca.ll
@@ -18,6 +18,7 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache
 ; GCN-NEXT:    s_add_u32 s0, s0, s9
 ; GCN-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x8
 ; GCN-NEXT:    s_addc_u32 s1, s1, 0
+; GCN-NEXT:    s_movk_i32 s32, 0x400
 ; GCN-NEXT:    s_mov_b32 s33, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_cmp_lg_u32 s8, 0
@@ -89,6 +90,7 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache
 ; GCN-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x8
 ; GCN-NEXT:    s_add_u32 s0, s0, s9
 ; GCN-NEXT:    s_addc_u32 s1, s1, 0
+; GCN-NEXT:    s_movk_i32 s32, 0x1000
 ; GCN-NEXT:    s_mov_b32 s33, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_cmp_lg_u32 s6, 0
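The non-entry-alloca tests fit the same wave-scaled model under wave64, with
16 and 64 per-lane bytes of static frame for the two kernels respectively
(again inferred from the emitted constants, not read from the .ll source).
Note that s_mov_b32 s33, 0 survives in every kernel: the frame pointer is
still the known-zero value described in the hasFP comment, and where it moves
relative to the new s32 write, that is presumably just instruction scheduling.

    #include <cstdint>

    // Same assumed wave-scaled model as above, applied to the
    // non-entry-alloca kernels (all wave64 runs here).
    constexpr uint32_t spInit(uint32_t PerLaneBytes, uint32_t WaveSize) {
      return PerLaneBytes * WaveSize;
    }

    static_assert(spInit(16, 64) == 0x400, "first kernel");
    static_assert(spInit(64, 64) == 0x1000, "second kernel");

    int main() {} // compile-time checks only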