[AMDGPU][IndirectCalls] Fix register usage propagation for indirect/external calls

This patch computes the maximum number of SGPRs and VGPRs used by the module in the presence of indirect calls and uses that as the register requirement for functions/kernels which make indirect calls. This patch also refactors code in AMDGPUSubtarget.cpp, adding "base" variants of getMaxNumSGPRs which are used by the MachineFunction version and the new Function version. Reviewed By: arsenm Differential Revision: https://reviews.llvm.org/D103636
2024-11-26 12:43:36 +01:00 · 2021-06-03 22:34:10 +05:30 · 2021-06-03 22:34:10 +05:30 · 1d0f3b309b
commit 1d0f3b309b
parent 09c0e58fa9
8 changed files with 178 additions and 56 deletions
--- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@ -627,6 +627,21 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
  return false;
 }

+bool AMDGPUAsmPrinter::doInitialization(Module &M) {
+  NonKernelMaxSGPRs = 0;
+  NonKernelMaxVGPRs = 0;
+  // Compute upper bound on the number of SGPRs and VGPRs
+  // for non-kernel functions.
+  for (const Function &F : M) {
+    if (!AMDGPU::isEntryFunctionCC(F.getCallingConv())) {
+      const GCNSubtarget &STM = TM.getSubtarget<GCNSubtarget>(F);
+      NonKernelMaxSGPRs = std::max(NonKernelMaxSGPRs, STM.getMaxNumSGPRs(F));
+      NonKernelMaxVGPRs = std::max(NonKernelMaxVGPRs, STM.getMaxNumVGPRs(F));
+    }
+  }
+  return AsmPrinter::doInitialization(M);
+}
+
 // TODO: Fold this into emitFunctionBodyStart.
 void AMDGPUAsmPrinter::initializeTargetID(const Module &M) {
  // In the beginning all features are either 'Any' or 'NotSupported',
@ -1020,14 +1035,20 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
              AMDGPU::isEntryFunctionCC(Callee->getCallingConv()))
            report_fatal_error("invalid call to entry function");

-          // If this is a call to an external function, we can't do much. Make
-          // conservative guesses.
-
-          // 48 SGPRs - vcc, - flat_scr, -xnack
-          int MaxSGPRGuess =
-            47 - IsaInfo::getNumExtraSGPRs(&ST, true, ST.hasFlatAddressSpace());
-          MaxSGPR = std::max(MaxSGPR, MaxSGPRGuess);
-          MaxVGPR = std::max(MaxVGPR, 23);
+          unsigned ExtraSGPRs = IsaInfo::getNumExtraSGPRs(
+              TM.getMCSubtargetInfo(), false, ST.hasFlatAddressSpace());
+          // If this is a call to an external function, we use the
+          // max values computed in doInitialization().
+          // Subtract extra SGPRs in case of indirect calls.
+          // For indirect calls, we take the max for the module
+          // and use that as the register budget for functions
+          // which make indirect calls. This max value
+          // includes extra SGPRs too (e.g. flatscratch and vcc),
+          // which get added later.
+          // Subtract them here so that they don't get added twice.
+          MaxSGPR = NonKernelMaxSGPRs - ExtraSGPRs - 1;
+          MaxVGPR = NonKernelMaxVGPRs - 1;
+          // TODO: handle AGPRs
          MaxAGPR = std::max(MaxAGPR, 23);

          CalleeFrameSize = std::max(CalleeFrameSize,
--- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
+++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
@ -58,6 +58,8 @@ private:

  void initializeTargetID(const Module &M);

+  bool doInitialization(Module &M) override;
+
  SIProgramInfo CurrentProgramInfo;
  DenseMap<const Function *, SIFunctionResourceInfo> CallGraphResourceInfo;

@ -101,6 +103,11 @@ public:
  explicit AMDGPUAsmPrinter(TargetMachine &TM,
                            std::unique_ptr<MCStreamer> Streamer);

+  // To memoize max SGPR usage of non-kernel functions of the module.
+  unsigned NonKernelMaxSGPRs = 0;
+  // To memoize max VGPR usage of non-kernel functions of the module.
+  unsigned NonKernelMaxVGPRs = 0;
+
  StringRef getPassName() const override;

  const MCSubtargetInfo* getGlobalSTI() const;
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@ -698,12 +698,12 @@ unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
 }

-unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
-  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
+unsigned
+GCNSubtarget::getBaseReservedNumSGPRs(const bool HasFlatScratchInit) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

-  if (MFI.hasFlatScratchInit()) {
+  if (HasFlatScratchInit) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
@ -715,6 +715,26 @@ unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  return 2; // VCC.
 }

+unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
+  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
+  return getBaseReservedNumSGPRs(MFI.hasFlatScratchInit());
+}
+
+unsigned GCNSubtarget::getReservedNumSGPRs(const Function &F) const {
+  // The logic to detect whether the function has flat scratch
+  // init is the same as the one MachineFunctionInfo uses to derive it.
+  bool FunctionHasFlatScratchInit = false;
+  bool HasCalls = F.hasFnAttribute("amdgpu-calls");
+  bool HasStackObjects = F.hasFnAttribute("amdgpu-stack-objects");
+  if (hasFlatAddressSpace() && AMDGPU::isEntryFunctionCC(F.getCallingConv()) &&
+      (isAmdHsaOrMesa(F) || enableFlatScratch()) &&
+      !flatScratchIsArchitected()) {
+    if (HasCalls || HasStackObjects || enableFlatScratch())
+      FunctionHasFlatScratchInit = true;
+  }
+  return getBaseReservedNumSGPRs(FunctionHasFlatScratchInit);
+}
+
 unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
                                        unsigned NumSGPRs,
                                        unsigned NumVGPRs) const {
@ -728,13 +748,11 @@ unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
  return Occupancy;
 }

-unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
-  const Function &F = MF.getFunction();
-  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
-
+unsigned GCNSubtarget::getBaseMaxNumSGPRs(
+    const Function &F, std::pair<unsigned, unsigned> WavesPerEU,
+    unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const {
  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
-  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

@ -745,7 +763,7 @@ unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
-    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
+    if (Requested && (Requested <= ReservedNumSGPRs))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
@ -755,7 +773,7 @@ unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
-    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
+    unsigned InputNumSGPRs = PreloadedSGPRs;
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

@ -774,17 +792,43 @@ unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

-  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
-                  MaxAddressableNumSGPRs);
+  return std::min(MaxNumSGPRs - ReservedNumSGPRs, MaxAddressableNumSGPRs);
 }

-unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
+unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
+  return getBaseMaxNumSGPRs(F, MFI.getWavesPerEU(), MFI.getNumPreloadedSGPRs(),
+                            getReservedNumSGPRs(MF));
+}

+static unsigned getMaxNumPreloadedSGPRs() {
+  // Max number of user SGPRs
+  unsigned MaxUserSGPRs = 4 + // private segment buffer
+                          2 + // Dispatch ptr
+                          2 + // queue ptr
+                          2 + // kernel segment ptr
+                          2 + // dispatch ID
+                          2 + // flat scratch init
+                          2;  // Implicit buffer ptr
+  // Max number of system SGPRs
+  unsigned MaxSystemSGPRs = 1 + // WorkGroupIDX
+                            1 + // WorkGroupIDY
+                            1 + // WorkGroupIDZ
+                            1 + // WorkGroupInfo
+                            1;  // private segment wave byte offset
+  return MaxUserSGPRs + MaxSystemSGPRs;
+}
+
+unsigned GCNSubtarget::getMaxNumSGPRs(const Function &F) const {
+  return getBaseMaxNumSGPRs(F, getWavesPerEU(F), getMaxNumPreloadedSGPRs(),
+                            getReservedNumSGPRs(F));
+}
+
+unsigned GCNSubtarget::getBaseMaxNumVGPRs(
+    const Function &F, std::pair<unsigned, unsigned> WavesPerEU) const {
  // Compute maximum number of VGPRs function can use using default/requested
  // minimum number of waves per execution unit.
-  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
@ -811,6 +855,16 @@ unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  return MaxNumVGPRs;
 }

+unsigned GCNSubtarget::getMaxNumVGPRs(const Function &F) const {
+  return getBaseMaxNumVGPRs(F, getWavesPerEU(F));
+}
+
+unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
+  const Function &F = MF.getFunction();
+  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
+  return getBaseMaxNumVGPRs(F, MFI.getWavesPerEU());
+}
+
 void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
                                         int UseOpIdx, SDep &Dep) const {
  if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
--- a/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/lib/Target/AMDGPU/GCNSubtarget.h
@ -1033,9 +1033,24 @@ public:
    return AMDGPU::IsaInfo::getMaxNumSGPRs(this, WavesPerEU, Addressable);
  }

-  /// \returns Reserved number of SGPRs for given function \p MF.
+  /// \returns Reserved number of SGPRs. This is common
+  /// utility function called by MachineFunction and
+  /// Function variants of getReservedNumSGPRs.
+  unsigned getBaseReservedNumSGPRs(const bool HasFlatScratchInit) const;
+  /// \returns Reserved number of SGPRs for given machine function \p MF.
  unsigned getReservedNumSGPRs(const MachineFunction &MF) const;

+  /// \returns Reserved number of SGPRs for given function \p F.
+  unsigned getReservedNumSGPRs(const Function &F) const;
+
+  /// \returns max num SGPRs. This is the common utility
+  /// function called by MachineFunction and Function
+  /// variants of getMaxNumSGPRs.
+  unsigned getBaseMaxNumSGPRs(const Function &F,
+                              std::pair<unsigned, unsigned> WavesPerEU,
+                              unsigned PreloadedSGPRs,
+                              unsigned ReservedNumSGPRs) const;
+
  /// \returns Maximum number of SGPRs that meets number of waves per execution
  /// unit requirement for function \p MF, or number of SGPRs explicitly
  /// requested using "amdgpu-num-sgpr" attribute attached to function \p MF.
@ -1046,6 +1061,16 @@ public:
  /// unit requirement.
  unsigned getMaxNumSGPRs(const MachineFunction &MF) const;

+  /// \returns Maximum number of SGPRs that meets number of waves per execution
+  /// unit requirement for function \p F, or number of SGPRs explicitly
+  /// requested using "amdgpu-num-sgpr" attribute attached to function \p F.
+  ///
+  /// \returns Value that meets number of waves per execution unit requirement
+  /// if explicitly requested value cannot be converted to integer, violates
+  /// subtarget's specifications, or does not meet number of waves per execution
+  /// unit requirement.
+  unsigned getMaxNumSGPRs(const Function &F) const;
+
  /// \returns VGPR allocation granularity supported by the subtarget.
  unsigned getVGPRAllocGranule() const {
    return AMDGPU::IsaInfo::getVGPRAllocGranule(this);
@ -1078,6 +1103,20 @@ public:
    return AMDGPU::IsaInfo::getMaxNumVGPRs(this, WavesPerEU);
  }

+  /// \returns max num VGPRs. This is the common utility function
+  /// called by MachineFunction and Function variants of getMaxNumVGPRs.
+  unsigned getBaseMaxNumVGPRs(const Function &F,
+                              std::pair<unsigned, unsigned> WavesPerEU) const;
+  /// \returns Maximum number of VGPRs that meets number of waves per execution
+  /// unit requirement for function \p F, or number of VGPRs explicitly
+  /// requested using "amdgpu-num-vgpr" attribute attached to function \p F.
+  ///
+  /// \returns Value that meets number of waves per execution unit requirement
+  /// if explicitly requested value cannot be converted to integer, violates
+  /// subtarget's specifications, or does not meet number of waves per execution
+  /// unit requirement.
+  unsigned getMaxNumVGPRs(const Function &F) const;
+
  /// \returns Maximum number of VGPRs that meets number of waves per execution
  /// unit requirement for function \p MF, or number of VGPRs explicitly
  /// requested using "amdgpu-num-vgpr" attribute attached to function \p MF.
--- a/test/CodeGen/AMDGPU/agpr-register-count.ll
+++ b/test/CodeGen/AMDGPU/agpr-register-count.ll
@ -154,21 +154,22 @@ bb:
 declare void @undef_func()

 ; GCN-LABEL: {{^}}kernel_call_undef_func:
-; GFX908: .amdhsa_next_free_vgpr 24
-; GFX90A: .amdhsa_next_free_vgpr 48
-; GFX90A: .amdhsa_accum_offset 24
-; GCN:    NumVgprs: 24
+; GFX908: .amdhsa_next_free_vgpr 128
+; GFX90A: .amdhsa_next_free_vgpr 280
+; GFX90A: .amdhsa_accum_offset 256
+; GFX908: NumVgprs: 128
+; GFX90A: NumVgprs: 256
 ; GCN:    NumAgprs: 24
-; GFX908: TotalNumVgprs: 24
-; GFX90A: TotalNumVgprs: 48
-; GFX908: VGPRBlocks: 5
-; GFX90A: VGPRBlocks: 5
-; GFX908: NumVGPRsForWavesPerEU: 24
-; GFX90A: NumVGPRsForWavesPerEU: 48
-; GFX90A: AccumOffset: 24
-; GFX908: Occupancy: 10
-; GFX90A: Occupancy: 8
-; GFX90A: COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: 5
+; GFX908: TotalNumVgprs: 128
+; GFX90A: TotalNumVgprs: 280
+; GFX908: VGPRBlocks: 31
+; GFX90A: VGPRBlocks: 34
+; GFX908: NumVGPRsForWavesPerEU: 128
+; GFX90A: NumVGPRsForWavesPerEU: 280
+; GFX90A: AccumOffset: 256
+; GFX908: Occupancy: 2
+; GFX90A: Occupancy: 1
+; GFX90A: COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: 63
 define amdgpu_kernel void @kernel_call_undef_func() #0 {
 bb:
  call void @undef_func()
--- a/test/CodeGen/AMDGPU/amdpal-callable.ll
+++ b/test/CodeGen/AMDGPU/amdpal-callable.ll
@ -147,9 +147,9 @@ attributes #0 = { nounwind }

 ; GCN: amdpal.pipelines:
 ; GCN-NEXT:  - .registers:
-; SDAG-NEXT:      0x2e12 (COMPUTE_PGM_RSRC1): 0xaf01ca{{$}}
+; SDAG-NEXT:      0x2e12 (COMPUTE_PGM_RSRC1): 0xaf03cf{{$}}
 ; SDAG-NEXT:      0x2e13 (COMPUTE_PGM_RSRC2): 0x8001{{$}}
-; GISEL-NEXT:      0x2e12 (COMPUTE_PGM_RSRC1): 0xaf01ce{{$}}
+; GISEL-NEXT:      0x2e12 (COMPUTE_PGM_RSRC1): 0xaf03cf{{$}}
 ; GISEL-NEXT:      0x2e13 (COMPUTE_PGM_RSRC2): 0x8001{{$}}
 ; GCN-NEXT:    .shader_functions:
 ; GCN-NEXT:      dynamic_stack:
--- a/test/CodeGen/AMDGPU/call-graph-register-usage.ll
+++ b/test/CodeGen/AMDGPU/call-graph-register-usage.ll
@ -227,10 +227,10 @@ define amdgpu_kernel void @usage_direct_recursion(i32 %n) #0 {
 ; Make sure there's no assert when a sgpr96 is used.
 ; GCN-LABEL: {{^}}count_use_sgpr96_external_call
 ; GCN: ; sgpr96 s[{{[0-9]+}}:{{[0-9]+}}]
-; CI: NumSgprs: 48
-; VI-NOBUG: NumSgprs: 48
+; CI: NumSgprs: 102
+; VI-NOBUG: NumSgprs: 102
 ; VI-BUG: NumSgprs: 96
-; GCN: NumVgprs: 24
+; GCN: NumVgprs: 64
 define amdgpu_kernel void @count_use_sgpr96_external_call()  {
 entry:
  tail call void asm sideeffect "; sgpr96 $0", "s"(<3 x i32> <i32 10, i32 11, i32 12>) #1
@ -241,10 +241,10 @@ entry:
 ; Make sure there's no assert when a sgpr160 is used.
 ; GCN-LABEL: {{^}}count_use_sgpr160_external_call
 ; GCN: ; sgpr160 s[{{[0-9]+}}:{{[0-9]+}}]
-; CI: NumSgprs: 48
-; VI-NOBUG: NumSgprs: 48
+; CI: NumSgprs: 102
+; VI-NOBUG: NumSgprs: 102
 ; VI-BUG: NumSgprs: 96
-; GCN: NumVgprs: 24
+; GCN: NumVgprs: 64
 define amdgpu_kernel void @count_use_sgpr160_external_call()  {
 entry:
  tail call void asm sideeffect "; sgpr160 $0", "s"(<5 x i32> <i32 10, i32 11, i32 12, i32 13, i32 14>) #1
@ -255,10 +255,10 @@ entry:
 ; Make sure there's no assert when a vgpr160 is used.
 ; GCN-LABEL: {{^}}count_use_vgpr160_external_call
 ; GCN: ; vgpr160 v[{{[0-9]+}}:{{[0-9]+}}]
-; CI: NumSgprs: 48
-; VI-NOBUG: NumSgprs: 48
+; CI: NumSgprs: 102
+; VI-NOBUG: NumSgprs: 102
 ; VI-BUG: NumSgprs: 96
-; GCN: NumVgprs: 24
+; GCN: NumVgprs: 64
 define amdgpu_kernel void @count_use_vgpr160_external_call()  {
 entry:
  tail call void asm sideeffect "; vgpr160 $0", "v"(<5 x i32> <i32 10, i32 11, i32 12, i32 13, i32 14>) #1
--- a/test/CodeGen/AMDGPU/indirect-call.ll
+++ b/test/CodeGen/AMDGPU/indirect-call.ll
@ -15,8 +15,8 @@ define amdgpu_kernel void @test_indirect_call_sgpr_ptr() {
 ; GCN-NEXT:     amd_machine_version_stepping = 0
 ; GCN-NEXT:     kernel_code_entry_byte_offset = 256
 ; GCN-NEXT:     kernel_code_prefetch_byte_size = 0
-; GCN-NEXT:     granulated_workitem_vgpr_count = 7
-; GCN-NEXT:     granulated_wavefront_sgpr_count = 5
+; GCN-NEXT:     granulated_workitem_vgpr_count = 15
+; GCN-NEXT:     granulated_wavefront_sgpr_count = 12
 ; GCN-NEXT:     priority = 0
 ; GCN-NEXT:     float_mode = 240
 ; GCN-NEXT:     priv = 0
@ -59,8 +59,8 @@ define amdgpu_kernel void @test_indirect_call_sgpr_ptr() {
 ; GCN-NEXT:     gds_segment_byte_size = 0
 ; GCN-NEXT:     kernarg_segment_byte_size = 0
 ; GCN-NEXT:     workgroup_fbarrier_count = 0
-; GCN-NEXT:     wavefront_sgpr_count = 48
-; GCN-NEXT:     workitem_vgpr_count = 32
+; GCN-NEXT:     wavefront_sgpr_count = 102
+; GCN-NEXT:     workitem_vgpr_count = 64
 ; GCN-NEXT:     reserved_vgpr_first = 0
 ; GCN-NEXT:     reserved_vgpr_count = 0
 ; GCN-NEXT:     reserved_sgpr_first = 0
@ -111,8 +111,8 @@ define amdgpu_kernel void @test_indirect_call_sgpr_ptr_arg() {
 ; GCN-NEXT:     amd_machine_version_stepping = 0
 ; GCN-NEXT:     kernel_code_entry_byte_offset = 256
 ; GCN-NEXT:     kernel_code_prefetch_byte_size = 0
-; GCN-NEXT:     granulated_workitem_vgpr_count = 7
-; GCN-NEXT:     granulated_wavefront_sgpr_count = 5
+; GCN-NEXT:     granulated_workitem_vgpr_count = 15
+; GCN-NEXT:     granulated_wavefront_sgpr_count = 12
 ; GCN-NEXT:     priority = 0
 ; GCN-NEXT:     float_mode = 240
 ; GCN-NEXT:     priv = 0
@ -155,8 +155,8 @@ define amdgpu_kernel void @test_indirect_call_sgpr_ptr_arg() {
 ; GCN-NEXT:     gds_segment_byte_size = 0
 ; GCN-NEXT:     kernarg_segment_byte_size = 0
 ; GCN-NEXT:     workgroup_fbarrier_count = 0
-; GCN-NEXT:     wavefront_sgpr_count = 48
-; GCN-NEXT:     workitem_vgpr_count = 32
+; GCN-NEXT:     wavefront_sgpr_count = 102
+; GCN-NEXT:     workitem_vgpr_count = 64
 ; GCN-NEXT:     reserved_vgpr_first = 0
 ; GCN-NEXT:     reserved_vgpr_count = 0
 ; GCN-NEXT:     reserved_sgpr_first = 0