1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-11-22 18:54:02 +01:00

[AMDGPU][IndirectCalls] Fix register usage propagation for indirect/external calls

This patch computes max SGPRs and VGPRs used by module
in presence of indirect calls and makes that
as register requirement for functions/kernels
which makes indirect calls.

This patch also refactors code AMDGPUSubTarget.cpp
which add a "base" variants of getMaxNumSGPRs which
is used by MachineFunction and new Function version.

Reviewed By: arsenm

Differential Revision: https://reviews.llvm.org/D103636
This commit is contained in:
madhur13490 2021-06-03 22:34:10 +05:30
parent 09c0e58fa9
commit 1d0f3b309b
8 changed files with 178 additions and 56 deletions

View File

@ -627,6 +627,21 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
return false;
}
bool AMDGPUAsmPrinter::doInitialization(Module &M) {
NonKernelMaxSGPRs = 0;
NonKernelMaxVGPRs = 0;
// Compute upper bound on the number of SGPRs and VGPRs
// for non-kernel functions.
for (const Function &F : M) {
if (!AMDGPU::isEntryFunctionCC(F.getCallingConv())) {
const GCNSubtarget &STM = TM.getSubtarget<GCNSubtarget>(F);
NonKernelMaxSGPRs = std::max(NonKernelMaxSGPRs, STM.getMaxNumSGPRs(F));
NonKernelMaxVGPRs = std::max(NonKernelMaxVGPRs, STM.getMaxNumVGPRs(F));
}
}
return AsmPrinter::doInitialization(M);
}
// TODO: Fold this into emitFunctionBodyStart.
void AMDGPUAsmPrinter::initializeTargetID(const Module &M) {
// In the beginning all features are either 'Any' or 'NotSupported',
@ -1020,14 +1035,20 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
AMDGPU::isEntryFunctionCC(Callee->getCallingConv()))
report_fatal_error("invalid call to entry function");
// If this is a call to an external function, we can't do much. Make
// conservative guesses.
// 48 SGPRs - vcc, - flat_scr, -xnack
int MaxSGPRGuess =
47 - IsaInfo::getNumExtraSGPRs(&ST, true, ST.hasFlatAddressSpace());
MaxSGPR = std::max(MaxSGPR, MaxSGPRGuess);
MaxVGPR = std::max(MaxVGPR, 23);
unsigned ExtraSGPRs = IsaInfo::getNumExtraSGPRs(
TM.getMCSubtargetInfo(), false, ST.hasFlatAddressSpace());
// If this is a call to an external function, we put the
// max values computed in doInitialization().
// Subtract extra SGPRs in case of indirect calls.
// For indirect calls, we take the max for the module
// and use that as the register budget for functions
// which makes an indirect calls. This max value
// includes extra SGPRs too (e.g. flatscratch and vcc).
// which are getting added later.
// Subtract them here so that they don't get added twice.
MaxSGPR = NonKernelMaxSGPRs - ExtraSGPRs - 1;
MaxVGPR = NonKernelMaxVGPRs - 1;
// TODO: handle AGPRs
MaxAGPR = std::max(MaxAGPR, 23);
CalleeFrameSize = std::max(CalleeFrameSize,

View File

@ -58,6 +58,8 @@ private:
void initializeTargetID(const Module &M);
bool doInitialization(Module &M) override;
SIProgramInfo CurrentProgramInfo;
DenseMap<const Function *, SIFunctionResourceInfo> CallGraphResourceInfo;
@ -101,6 +103,11 @@ public:
explicit AMDGPUAsmPrinter(TargetMachine &TM,
std::unique_ptr<MCStreamer> Streamer);
// To memoize max SGPR usage of non-kernel functions of the module.
unsigned NonKernelMaxSGPRs = 0;
// To memoize max VGPR usage of non-kernel functions of the module.
unsigned NonKernelMaxVGPRs = 0;
StringRef getPassName() const override;
const MCSubtargetInfo* getGlobalSTI() const;

View File

@ -698,12 +698,12 @@ unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
}
unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
unsigned
GCNSubtarget::getBaseReservedNumSGPRs(const bool HasFlatScratchInit) const {
if (getGeneration() >= AMDGPUSubtarget::GFX10)
return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.
if (MFI.hasFlatScratchInit()) {
if (HasFlatScratchInit) {
if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
@ -715,6 +715,26 @@ unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
return 2; // VCC.
}
unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
return getBaseReservedNumSGPRs(MFI.hasFlatScratchInit());
}
unsigned GCNSubtarget::getReservedNumSGPRs(const Function &F) const {
// The logic to detect if the function has
// flat scratch init is same as how MachineFunctionInfo derives.
bool FunctionHasFlatScratchInit = false;
bool HasCalls = F.hasFnAttribute("amdgpu-calls");
bool HasStackObjects = F.hasFnAttribute("amdgpu-stack-objects");
if (hasFlatAddressSpace() && AMDGPU::isEntryFunctionCC(F.getCallingConv()) &&
(isAmdHsaOrMesa(F) || enableFlatScratch()) &&
!flatScratchIsArchitected()) {
if (HasCalls || HasStackObjects || enableFlatScratch())
FunctionHasFlatScratchInit = true;
}
return getBaseReservedNumSGPRs(FunctionHasFlatScratchInit);
}
unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
unsigned NumSGPRs,
unsigned NumVGPRs) const {
@ -728,13 +748,11 @@ unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
return Occupancy;
}
unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
const Function &F = MF.getFunction();
const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
unsigned GCNSubtarget::getBaseMaxNumSGPRs(
const Function &F, std::pair<unsigned, unsigned> WavesPerEU,
unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const {
// Compute maximum number of SGPRs function can use using default/requested
// minimum number of waves per execution unit.
std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);
@ -745,7 +763,7 @@ unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
F, "amdgpu-num-sgpr", MaxNumSGPRs);
// Make sure requested value does not violate subtarget's specifications.
if (Requested && (Requested <= getReservedNumSGPRs(MF)))
if (Requested && (Requested <= ReservedNumSGPRs))
Requested = 0;
// If more SGPRs are required to support the input user/system SGPRs,
@ -755,7 +773,7 @@ unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
// of reserved special registers in total. Theoretically you could re-use
// the last input registers for these special registers, but this would
// require a lot of complexity to deal with the weird aliasing.
unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
unsigned InputNumSGPRs = PreloadedSGPRs;
if (Requested && Requested < InputNumSGPRs)
Requested = InputNumSGPRs;
@ -774,17 +792,43 @@ unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
if (hasSGPRInitBug())
MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
MaxAddressableNumSGPRs);
return std::min(MaxNumSGPRs - ReservedNumSGPRs, MaxAddressableNumSGPRs);
}
unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
const Function &F = MF.getFunction();
const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
return getBaseMaxNumSGPRs(F, MFI.getWavesPerEU(), MFI.getNumPreloadedSGPRs(),
getReservedNumSGPRs(MF));
}
static unsigned getMaxNumPreloadedSGPRs() {
// Max number of user SGPRs
unsigned MaxUserSGPRs = 4 + // private segment buffer
2 + // Dispatch ptr
2 + // queue ptr
2 + // kernel segment ptr
2 + // dispatch ID
2 + // flat scratch init
2; // Implicit buffer ptr
// Max number of system SGPRs
unsigned MaxSystemSGPRs = 1 + // WorkGroupIDX
1 + // WorkGroupIDY
1 + // WorkGroupIDZ
1 + // WorkGroupInfo
1; // private segment wave byte offset
return MaxUserSGPRs + MaxSystemSGPRs;
}
unsigned GCNSubtarget::getMaxNumSGPRs(const Function &F) const {
return getBaseMaxNumSGPRs(F, getWavesPerEU(F), getMaxNumPreloadedSGPRs(),
getReservedNumSGPRs(F));
}
unsigned GCNSubtarget::getBaseMaxNumVGPRs(
const Function &F, std::pair<unsigned, unsigned> WavesPerEU) const {
// Compute maximum number of VGPRs function can use using default/requested
// minimum number of waves per execution unit.
std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);
// Check if maximum number of VGPRs was explicitly requested using
@ -811,6 +855,16 @@ unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
return MaxNumVGPRs;
}
unsigned GCNSubtarget::getMaxNumVGPRs(const Function &F) const {
return getBaseMaxNumVGPRs(F, getWavesPerEU(F));
}
unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
const Function &F = MF.getFunction();
const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
return getBaseMaxNumVGPRs(F, MFI.getWavesPerEU());
}
void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
int UseOpIdx, SDep &Dep) const {
if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||

View File

@ -1033,9 +1033,24 @@ public:
return AMDGPU::IsaInfo::getMaxNumSGPRs(this, WavesPerEU, Addressable);
}
/// \returns Reserved number of SGPRs for given function \p MF.
/// \returns Reserved number of SGPRs. This is common
/// utility function called by MachineFunction and
/// Function variants of getReservedNumSGPRs.
unsigned getBaseReservedNumSGPRs(const bool HasFlatScratchInit) const;
/// \returns Reserved number of SGPRs for given machine function \p MF.
unsigned getReservedNumSGPRs(const MachineFunction &MF) const;
/// \returns Reserved number of SGPRs for given function \p F.
unsigned getReservedNumSGPRs(const Function &F) const;
/// \returns max num SGPRs. This is the common utility
/// function called by MachineFunction and Function
/// variants of getMaxNumSGPRs.
unsigned getBaseMaxNumSGPRs(const Function &F,
std::pair<unsigned, unsigned> WavesPerEU,
unsigned PreloadedSGPRs,
unsigned ReservedNumSGPRs) const;
/// \returns Maximum number of SGPRs that meets number of waves per execution
/// unit requirement for function \p MF, or number of SGPRs explicitly
/// requested using "amdgpu-num-sgpr" attribute attached to function \p MF.
@ -1046,6 +1061,16 @@ public:
/// unit requirement.
unsigned getMaxNumSGPRs(const MachineFunction &MF) const;
/// \returns Maximum number of SGPRs that meets number of waves per execution
/// unit requirement for function \p F, or number of SGPRs explicitly
/// requested using "amdgpu-num-sgpr" attribute attached to function \p F.
///
/// \returns Value that meets number of waves per execution unit requirement
/// if explicitly requested value cannot be converted to integer, violates
/// subtarget's specifications, or does not meet number of waves per execution
/// unit requirement.
unsigned getMaxNumSGPRs(const Function &F) const;
/// \returns VGPR allocation granularity supported by the subtarget.
unsigned getVGPRAllocGranule() const {
return AMDGPU::IsaInfo::getVGPRAllocGranule(this);
@ -1078,6 +1103,20 @@ public:
return AMDGPU::IsaInfo::getMaxNumVGPRs(this, WavesPerEU);
}
/// \returns max num VGPRs. This is the common utility function
/// called by MachineFunction and Function variants of getMaxNumVGPRs.
unsigned getBaseMaxNumVGPRs(const Function &F,
std::pair<unsigned, unsigned> WavesPerEU) const;
/// \returns Maximum number of VGPRs that meets number of waves per execution
/// unit requirement for function \p F, or number of VGPRs explicitly
/// requested using "amdgpu-num-vgpr" attribute attached to function \p F.
///
/// \returns Value that meets number of waves per execution unit requirement
/// if explicitly requested value cannot be converted to integer, violates
/// subtarget's specifications, or does not meet number of waves per execution
/// unit requirement.
unsigned getMaxNumVGPRs(const Function &F) const;
/// \returns Maximum number of VGPRs that meets number of waves per execution
/// unit requirement for function \p MF, or number of VGPRs explicitly
/// requested using "amdgpu-num-vgpr" attribute attached to function \p MF.

View File

@ -154,21 +154,22 @@ bb:
declare void @undef_func()
; GCN-LABEL: {{^}}kernel_call_undef_func:
; GFX908: .amdhsa_next_free_vgpr 24
; GFX90A: .amdhsa_next_free_vgpr 48
; GFX90A: .amdhsa_accum_offset 24
; GCN: NumVgprs: 24
; GFX908: .amdhsa_next_free_vgpr 128
; GFX90A: .amdhsa_next_free_vgpr 280
; GFX90A: .amdhsa_accum_offset 256
; GCN908: NumVgprs: 128
; GCN90A: NumVgprs: 256
; GCN: NumAgprs: 24
; GFX908: TotalNumVgprs: 24
; GFX90A: TotalNumVgprs: 48
; GFX908: VGPRBlocks: 5
; GFX90A: VGPRBlocks: 5
; GFX908: NumVGPRsForWavesPerEU: 24
; GFX90A: NumVGPRsForWavesPerEU: 48
; GFX90A: AccumOffset: 24
; GFX908: Occupancy: 10
; GFX90A: Occupancy: 8
; GFX90A: COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: 5
; GFX908: TotalNumVgprs: 128
; GFX90A: TotalNumVgprs: 280
; GFX908: VGPRBlocks: 31
; GFX90A: VGPRBlocks: 34
; GFX908: NumVGPRsForWavesPerEU: 128
; GFX90A: NumVGPRsForWavesPerEU: 280
; GFX90A: AccumOffset: 256
; GFX908: Occupancy: 2
; GFX90A: Occupancy: 1
; GFX90A: COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: 63
define amdgpu_kernel void @kernel_call_undef_func() #0 {
bb:
call void @undef_func()

View File

@ -147,9 +147,9 @@ attributes #0 = { nounwind }
; GCN: amdpal.pipelines:
; GCN-NEXT: - .registers:
; SDAG-NEXT: 0x2e12 (COMPUTE_PGM_RSRC1): 0xaf01ca{{$}}
; SDAG-NEXT: 0x2e12 (COMPUTE_PGM_RSRC1): 0xaf03cf{{$}}
; SDAG-NEXT: 0x2e13 (COMPUTE_PGM_RSRC2): 0x8001{{$}}
; GISEL-NEXT: 0x2e12 (COMPUTE_PGM_RSRC1): 0xaf01ce{{$}}
; GISEL-NEXT: 0x2e12 (COMPUTE_PGM_RSRC1): 0xaf03cf{{$}}
; GISEL-NEXT: 0x2e13 (COMPUTE_PGM_RSRC2): 0x8001{{$}}
; GCN-NEXT: .shader_functions:
; GCN-NEXT: dynamic_stack:

View File

@ -227,10 +227,10 @@ define amdgpu_kernel void @usage_direct_recursion(i32 %n) #0 {
; Make sure there's no assert when a sgpr96 is used.
; GCN-LABEL: {{^}}count_use_sgpr96_external_call
; GCN: ; sgpr96 s[{{[0-9]+}}:{{[0-9]+}}]
; CI: NumSgprs: 48
; VI-NOBUG: NumSgprs: 48
; CI: NumSgprs: 102
; VI-NOBUG: NumSgprs: 102
; VI-BUG: NumSgprs: 96
; GCN: NumVgprs: 24
; GCN: NumVgprs: 64
define amdgpu_kernel void @count_use_sgpr96_external_call() {
entry:
tail call void asm sideeffect "; sgpr96 $0", "s"(<3 x i32> <i32 10, i32 11, i32 12>) #1
@ -241,10 +241,10 @@ entry:
; Make sure there's no assert when a sgpr160 is used.
; GCN-LABEL: {{^}}count_use_sgpr160_external_call
; GCN: ; sgpr160 s[{{[0-9]+}}:{{[0-9]+}}]
; CI: NumSgprs: 48
; VI-NOBUG: NumSgprs: 48
; CI: NumSgprs: 102
; VI-NOBUG: NumSgprs: 102
; VI-BUG: NumSgprs: 96
; GCN: NumVgprs: 24
; GCN: NumVgprs: 64
define amdgpu_kernel void @count_use_sgpr160_external_call() {
entry:
tail call void asm sideeffect "; sgpr160 $0", "s"(<5 x i32> <i32 10, i32 11, i32 12, i32 13, i32 14>) #1
@ -255,10 +255,10 @@ entry:
; Make sure there's no assert when a vgpr160 is used.
; GCN-LABEL: {{^}}count_use_vgpr160_external_call
; GCN: ; vgpr160 v[{{[0-9]+}}:{{[0-9]+}}]
; CI: NumSgprs: 48
; VI-NOBUG: NumSgprs: 48
; CI: NumSgprs: 102
; VI-NOBUG: NumSgprs: 102
; VI-BUG: NumSgprs: 96
; GCN: NumVgprs: 24
; GCN: NumVgprs: 64
define amdgpu_kernel void @count_use_vgpr160_external_call() {
entry:
tail call void asm sideeffect "; vgpr160 $0", "v"(<5 x i32> <i32 10, i32 11, i32 12, i32 13, i32 14>) #1

View File

@ -15,8 +15,8 @@ define amdgpu_kernel void @test_indirect_call_sgpr_ptr() {
; GCN-NEXT: amd_machine_version_stepping = 0
; GCN-NEXT: kernel_code_entry_byte_offset = 256
; GCN-NEXT: kernel_code_prefetch_byte_size = 0
; GCN-NEXT: granulated_workitem_vgpr_count = 7
; GCN-NEXT: granulated_wavefront_sgpr_count = 5
; GCN-NEXT: granulated_workitem_vgpr_count = 15
; GCN-NEXT: granulated_wavefront_sgpr_count = 12
; GCN-NEXT: priority = 0
; GCN-NEXT: float_mode = 240
; GCN-NEXT: priv = 0
@ -59,8 +59,8 @@ define amdgpu_kernel void @test_indirect_call_sgpr_ptr() {
; GCN-NEXT: gds_segment_byte_size = 0
; GCN-NEXT: kernarg_segment_byte_size = 0
; GCN-NEXT: workgroup_fbarrier_count = 0
; GCN-NEXT: wavefront_sgpr_count = 48
; GCN-NEXT: workitem_vgpr_count = 32
; GCN-NEXT: wavefront_sgpr_count = 102
; GCN-NEXT: workitem_vgpr_count = 64
; GCN-NEXT: reserved_vgpr_first = 0
; GCN-NEXT: reserved_vgpr_count = 0
; GCN-NEXT: reserved_sgpr_first = 0
@ -111,8 +111,8 @@ define amdgpu_kernel void @test_indirect_call_sgpr_ptr_arg() {
; GCN-NEXT: amd_machine_version_stepping = 0
; GCN-NEXT: kernel_code_entry_byte_offset = 256
; GCN-NEXT: kernel_code_prefetch_byte_size = 0
; GCN-NEXT: granulated_workitem_vgpr_count = 7
; GCN-NEXT: granulated_wavefront_sgpr_count = 5
; GCN-NEXT: granulated_workitem_vgpr_count = 15
; GCN-NEXT: granulated_wavefront_sgpr_count = 12
; GCN-NEXT: priority = 0
; GCN-NEXT: float_mode = 240
; GCN-NEXT: priv = 0
@ -155,8 +155,8 @@ define amdgpu_kernel void @test_indirect_call_sgpr_ptr_arg() {
; GCN-NEXT: gds_segment_byte_size = 0
; GCN-NEXT: kernarg_segment_byte_size = 0
; GCN-NEXT: workgroup_fbarrier_count = 0
; GCN-NEXT: wavefront_sgpr_count = 48
; GCN-NEXT: workitem_vgpr_count = 32
; GCN-NEXT: wavefront_sgpr_count = 102
; GCN-NEXT: workitem_vgpr_count = 64
; GCN-NEXT: reserved_vgpr_first = 0
; GCN-NEXT: reserved_vgpr_count = 0
; GCN-NEXT: reserved_sgpr_first = 0