[AMDGPU] Fix MaxWorkGroupsPerCU for large workgroups

This patch corrects the maximum number of workgroups per CU for large workgroups (flat workgroup size greater than 128). This value feeds into the occupancy calculation with respect to LDS size. Differential Revision: https://reviews.llvm.org/D29974 llvm-svn: 295134
2024-10-19 11:02:59 +02:00 · 2017-02-15 01:03:59 +00:00 · 2017-02-15 01:03:59 +00:00 · 479d45f82d
commit 479d45f82d
parent 14aaabfcef
2 changed files with 9 additions and 3 deletions
--- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@ -151,7 +151,11 @@ unsigned getMaxWorkGroupsPerCU(const FeatureBitset &Features,
                               unsigned FlatWorkGroupSize) {
  if (!Features.test(FeatureGCN))
    return 8;
-  return getWavesPerWorkGroup(Features, FlatWorkGroupSize) == 1 ? 40 : 16;
+  unsigned N = getWavesPerWorkGroup(Features, FlatWorkGroupSize);
+  if (N == 1)
+    return 40;
+  N = 40 / N;
+  return std::min(N, 16u);
 }

 unsigned getMaxWavesPerCU(const FeatureBitset &Features) {
--- a/test/CodeGen/AMDGPU/large-work-group-promote-alloca.ll
+++ b/test/CodeGen/AMDGPU/large-work-group-promote-alloca.ll
@ -69,7 +69,8 @@ entry:
 }

 ; ALL-LABEL: @occupancy_0(
-; ALL: alloca [5 x i32]
+; CI-NOT: alloca [5 x i32]
+; SI: alloca [5 x i32]
 define void @occupancy_0(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #3 {
 entry:
  %stack = alloca [5 x i32], align 4
@ -91,7 +92,8 @@ entry:
 }

 ; ALL-LABEL: @occupancy_max(
-; ALL: alloca [5 x i32]
+; CI-NOT: alloca [5 x i32]
+; SI: alloca [5 x i32]
 define void @occupancy_max(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #4 {
 entry:
  %stack = alloca [5 x i32], align 4