
[AMDGPU] Support for "uniform-work-group-size" attribute

Updated the annotate-kernel-features pass to support propagation of the uniform-work-group-size attribute from a kernel to the functions it calls. Once this pass is run, all kernels, even those that initially did not have the attribute, will indicate whether or not they have a uniform work-group size through the value of the attribute.

Differential Revision: https://reviews.llvm.org/D50200

llvm-svn: 348971
Aakanksha Patil 2018-12-12 20:49:17 +00:00
parent 93b0314296
commit 57ff848c4d
9 changed files with 263 additions and 31 deletions
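To illustrate the effect of the pass, here is a minimal, hypothetical IR sketch (function names invented for this illustration; they do not appear in the patch): a kernel that carries "uniform-work-group-size"="true" propagates the attribute to the function it calls, while a kernel without the attribute would get "false" on itself and its callees.

; Hypothetical input, before -amdgpu-annotate-kernel-features:
define void @callee() {
  ret void
}
define amdgpu_kernel void @kern() #0 {
  call void @callee()
  ret void
}
attributes #0 = { "uniform-work-group-size"="true" }

; Expected result after the pass: @callee picks up the kernel's value,
; i.e. it ends up in an attribute group containing
; "uniform-work-group-size"="true", matching the tests added below.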


@ -46,8 +46,11 @@ namespace {
class AMDGPUAnnotateKernelFeatures : public CallGraphSCCPass {
private:
const TargetMachine *TM = nullptr;
SmallVector<CallGraphNode*, 8> NodeList;
bool addFeatureAttributes(Function &F);
bool processUniformWorkGroupAttribute();
bool propagateUniformWorkGroupAttribute(Function &Caller, Function &Callee);
public:
static char ID;
@ -186,7 +189,6 @@ static bool handleAttr(Function &Parent, const Function &Callee,
Parent.addFnAttr(Name);
return true;
}
return false;
}
@ -213,6 +215,56 @@ static void copyFeaturesToFunction(Function &Parent, const Function &Callee,
handleAttr(Parent, Callee, AttrName);
}
bool AMDGPUAnnotateKernelFeatures::processUniformWorkGroupAttribute() {
bool Changed = false;
for (auto *Node : reverse(NodeList)) {
Function *Caller = Node->getFunction();
for (auto I : *Node) {
Function *Callee = std::get<1>(I)->getFunction();
if (Callee)
Changed = propagateUniformWorkGroupAttribute(*Caller, *Callee);
}
}
return Changed;
}
bool AMDGPUAnnotateKernelFeatures::propagateUniformWorkGroupAttribute(
Function &Caller, Function &Callee) {
// Check for externally defined function
if (!Callee.hasExactDefinition()) {
Callee.addFnAttr("uniform-work-group-size", "false");
if (!Caller.hasFnAttribute("uniform-work-group-size"))
Caller.addFnAttr("uniform-work-group-size", "false");
return true;
}
// Check if the Caller has the attribute
if (Caller.hasFnAttribute("uniform-work-group-size")) {
// Check if the value of the attribute is true
if (Caller.getFnAttribute("uniform-work-group-size")
.getValueAsString().equals("true")) {
// Propagate the attribute to the Callee, if it does not have it
if (!Callee.hasFnAttribute("uniform-work-group-size")) {
Callee.addFnAttr("uniform-work-group-size", "true");
return true;
}
} else {
Callee.addFnAttr("uniform-work-group-size", "false");
return true;
}
} else {
// If the attribute is absent, set it as false
Caller.addFnAttr("uniform-work-group-size", "false");
Callee.addFnAttr("uniform-work-group-size", "false");
return true;
}
return false;
}
bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
bool HasFlat = ST.hasFlatAddressSpace();
@ -293,15 +345,19 @@ bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
}
bool AMDGPUAnnotateKernelFeatures::runOnSCC(CallGraphSCC &SCC) {
Module &M = SCC.getCallGraph().getModule();
Triple TT(M.getTargetTriple());
bool Changed = false;
for (CallGraphNode *I : SCC) {
Function *F = I->getFunction();
// Build a list of CallGraphNodes, from the most uses to the least
if (I->getNumReferences())
NodeList.push_back(I);
else
processUniformWorkGroupAttribute();
Function *F = I->getFunction();
// Add feature attributes
if (!F || F->isDeclaration())
continue;
Changed |= addFeatureAttributes(*F);
}


@ -683,6 +683,9 @@ void AMDGPUPassConfig::addIRPasses() {
}
void AMDGPUPassConfig::addCodeGenPrepare() {
if (TM->getTargetTriple().getArch() == Triple::amdgcn)
addPass(createAMDGPUAnnotateKernelFeaturesPass());
if (TM->getTargetTriple().getArch() == Triple::amdgcn &&
EnableLowerKernelArguments)
addPass(createAMDGPULowerKernelArgumentsPass());
@ -770,7 +773,6 @@ bool GCNPassConfig::addPreISel() {
// FIXME: We need to run a pass to propagate the attributes when calls are
// supported.
addPass(createAMDGPUAnnotateKernelFeaturesPass());
// Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
// regions formed by them.


@ -244,52 +244,52 @@ define amdgpu_kernel void @kern_use_implicitarg_ptr() #1 {
ret void
}
; HSA: define void @use_implicitarg_ptr() #15 {
; HSA: define void @use_implicitarg_ptr() #16 {
define void @use_implicitarg_ptr() #1 {
%implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
store volatile i8 addrspace(4)* %implicitarg.ptr, i8 addrspace(4)* addrspace(1)* undef
ret void
}
; HSA: define void @func_indirect_use_implicitarg_ptr() #15 {
; HSA: define void @func_indirect_use_implicitarg_ptr() #16 {
define void @func_indirect_use_implicitarg_ptr() #1 {
call void @use_implicitarg_ptr()
ret void
}
; HSA: declare void @external.func() #16
; HSA: declare void @external.func() #17
declare void @external.func() #3
; HSA: define internal void @defined.func() #16 {
; HSA: define internal void @defined.func() #17 {
define internal void @defined.func() #3 {
ret void
}
; HSA: define void @func_call_external() #16 {
; HSA: define void @func_call_external() #17 {
define void @func_call_external() #3 {
call void @external.func()
ret void
}
; HSA: define void @func_call_defined() #16 {
; HSA: define void @func_call_defined() #17 {
define void @func_call_defined() #3 {
call void @defined.func()
ret void
}
; HSA: define void @func_call_asm() #16 {
; HSA: define void @func_call_asm() #18 {
define void @func_call_asm() #3 {
call void asm sideeffect "", ""() #3
ret void
}
; HSA: define amdgpu_kernel void @kern_call_external() #17 {
; HSA: define amdgpu_kernel void @kern_call_external() #19 {
define amdgpu_kernel void @kern_call_external() #3 {
call void @external.func()
ret void
}
; HSA: define amdgpu_kernel void @func_kern_defined() #17 {
; HSA: define amdgpu_kernel void @func_kern_defined() #19 {
define amdgpu_kernel void @func_kern_defined() #3 {
call void @defined.func()
ret void
@ -301,20 +301,22 @@ attributes #2 = { nounwind "target-cpu"="gfx900" }
attributes #3 = { nounwind }
; HSA: attributes #0 = { nounwind readnone speculatable }
; HSA: attributes #1 = { nounwind "amdgpu-work-item-id-x" "target-cpu"="fiji" }
; HSA: attributes #2 = { nounwind "amdgpu-work-item-id-y" "target-cpu"="fiji" }
; HSA: attributes #3 = { nounwind "amdgpu-work-item-id-z" "target-cpu"="fiji" }
; HSA: attributes #4 = { nounwind "amdgpu-work-group-id-x" "target-cpu"="fiji" }
; HSA: attributes #5 = { nounwind "amdgpu-work-group-id-y" "target-cpu"="fiji" }
; HSA: attributes #6 = { nounwind "amdgpu-work-group-id-z" "target-cpu"="fiji" }
; HSA: attributes #7 = { nounwind "amdgpu-dispatch-ptr" "target-cpu"="fiji" }
; HSA: attributes #8 = { nounwind "amdgpu-queue-ptr" "target-cpu"="fiji" }
; HSA: attributes #9 = { nounwind "amdgpu-dispatch-id" "target-cpu"="fiji" }
; HSA: attributes #1 = { nounwind "amdgpu-work-item-id-x" "target-cpu"="fiji" "uniform-work-group-size"="false" }
; HSA: attributes #2 = { nounwind "amdgpu-work-item-id-y" "target-cpu"="fiji" "uniform-work-group-size"="false" }
; HSA: attributes #3 = { nounwind "amdgpu-work-item-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" }
; HSA: attributes #4 = { nounwind "amdgpu-work-group-id-x" "target-cpu"="fiji" "uniform-work-group-size"="false" }
; HSA: attributes #5 = { nounwind "amdgpu-work-group-id-y" "target-cpu"="fiji" "uniform-work-group-size"="false" }
; HSA: attributes #6 = { nounwind "amdgpu-work-group-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" }
; HSA: attributes #7 = { nounwind "amdgpu-dispatch-ptr" "target-cpu"="fiji" "uniform-work-group-size"="false" }
; HSA: attributes #8 = { nounwind "amdgpu-queue-ptr" "target-cpu"="fiji" "uniform-work-group-size"="false" }
; HSA: attributes #9 = { nounwind "amdgpu-dispatch-id" "target-cpu"="fiji" "uniform-work-group-size"="false" }
; HSA: attributes #10 = { nounwind "amdgpu-work-group-id-y" "amdgpu-work-group-id-z" "target-cpu"="fiji" }
; HSA: attributes #11 = { nounwind "target-cpu"="fiji" }
; HSA: attributes #12 = { nounwind "target-cpu"="gfx900" }
; HSA: attributes #13 = { nounwind "amdgpu-queue-ptr" "target-cpu"="gfx900" }
; HSA: attributes #14 = { nounwind "amdgpu-kernarg-segment-ptr" "target-cpu"="fiji" }
; HSA: attributes #11 = { nounwind "target-cpu"="fiji" "uniform-work-group-size"="false" }
; HSA: attributes #12 = { nounwind "target-cpu"="gfx900" "uniform-work-group-size"="false" }
; HSA: attributes #13 = { nounwind "amdgpu-queue-ptr" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
; HSA: attributes #14 = { nounwind "amdgpu-kernarg-segment-ptr" "target-cpu"="fiji" "uniform-work-group-size"="false" }
; HSA: attributes #15 = { nounwind "amdgpu-implicitarg-ptr" "target-cpu"="fiji" }
; HSA: attributes #16 = { nounwind }
; HSA: attributes #17 = { nounwind "amdgpu-flat-scratch" }
; HSA: attributes #16 = { nounwind "amdgpu-implicitarg-ptr" "target-cpu"="fiji" "uniform-work-group-size"="false" }
; HSA: attributes #17 = { nounwind "uniform-work-group-size"="false" }
; HSA: attributes #18 = { nounwind }
; HSA: attributes #19 = { nounwind "amdgpu-flat-scratch" "uniform-work-group-size"="false" }


@ -0,0 +1,18 @@
; RUN: opt -S -mtriple=amdgcn-amd- -amdgpu-annotate-kernel-features %s | FileCheck %s
; If the kernel does not have the uniform-work-group-size attribute, set it to false on both the caller and the callee
; CHECK: define void @foo() #[[FOO:[0-9]+]] {
define void @foo() #0 {
ret void
}
; CHECK: define amdgpu_kernel void @kernel1() #[[FOO]] {
define amdgpu_kernel void @kernel1() #1 {
call void @foo()
ret void
}
attributes #0 = { "uniform-work-group-size"="true" }
; CHECK: attributes #[[FOO]] = { "uniform-work-group-size"="false" }


@ -0,0 +1,24 @@
; RUN: opt -S -mtriple=amdgcn-amd- -amdgpu-annotate-kernel-features %s | FileCheck %s
; Test to verify if the attribute gets propagated across nested function calls
; CHECK: define void @func1() #[[FUNC:[0-9]+]] {
define void @func1() #0 {
ret void
}
; CHECK: define void @func2() #[[FUNC]] {
define void @func2() #1 {
call void @func1()
ret void
}
; CHECK: define amdgpu_kernel void @kernel3() #[[FUNC:[0-9]+]] {
define amdgpu_kernel void @kernel3() #2 {
call void @func2()
ret void
}
attributes #2 = { "uniform-work-group-size"="true" }
; CHECK: attributes #[[FUNC]] = { "uniform-work-group-size"="true" }


@ -0,0 +1,25 @@
; RUN: opt -S -mtriple=amdgcn-amd- -amdgpu-annotate-kernel-features %s | FileCheck %s
; Two kernels with different values of the uniform-work-group-size attribute call the same function
; CHECK: define void @func() #[[FUNC:[0-9]+]] {
define void @func() #0 {
ret void
}
; CHECK: define amdgpu_kernel void @kernel1() #[[KERNEL1:[0-9]+]] {
define amdgpu_kernel void @kernel1() #1 {
call void @func()
ret void
}
; CHECK: define amdgpu_kernel void @kernel2() #[[FUNC]] {
define amdgpu_kernel void @kernel2() #2 {
call void @func()
ret void
}
attributes #1 = { "uniform-work-group-size"="true" }
; CHECK: attributes #[[FUNC]] = { "uniform-work-group-size"="false" }
; CHECK: attributes #[[KERNEL1]] = { "uniform-work-group-size"="true" }


@ -0,0 +1,33 @@
; RUN: opt -S -mtriple=amdgcn-amd- -amdgpu-annotate-kernel-features %s | FileCheck %s
; Propagate the uniform-work-group-size attribute from the kernel to the callee if the callee does not have it
; CHECK: define void @func() #[[FUNC:[0-9]+]] {
define void @func() #0 {
ret void
}
; CHECK: define amdgpu_kernel void @kernel1() #[[KERNEL1:[0-9]+]] {
define amdgpu_kernel void @kernel1() #1 {
call void @func()
ret void
}
; A weak_odr function is not an exact definition, so it is treated like an external declaration
; CHECK: define weak_odr void @weak_func() #[[FUNC]] {
define weak_odr void @weak_func() #0 {
ret void
}
; CHECK: define amdgpu_kernel void @kernel2() #[[KERNEL2:[0-9]+]] {
define amdgpu_kernel void @kernel2() #2 {
call void @weak_func()
ret void
}
attributes #0 = { nounwind }
attributes #1 = { "uniform-work-group-size"="false" }
attributes #2 = { "uniform-work-group-size"="true" }
; CHECK: attributes #[[FUNC]] = { nounwind "uniform-work-group-size"="false" }
; CHECK: attributes #[[KERNEL1]] = { "uniform-work-group-size"="false" }
; CHECK: attributes #[[KERNEL2]] = { "uniform-work-group-size"="true" }


@ -0,0 +1,37 @@
; RUN: opt -S -mtriple=amdgcn-amd- -amdgpu-annotate-kernel-features %s | FileCheck %s
; Test to ensure recursive functions exhibit proper behaviour
; Test to generate fibonacci numbers
; CHECK: define i32 @fib(i32 %n) #[[FIB:[0-9]+]] {
define i32 @fib(i32 %n) #0 {
%cmp1 = icmp eq i32 %n, 0
br i1 %cmp1, label %exit, label %cont1
cont1:
%cmp2 = icmp eq i32 %n, 1
br i1 %cmp2, label %exit, label %cont2
cont2:
%nm1 = sub i32 %n, 1
%fibm1 = call i32 @fib(i32 %nm1)
%nm2 = sub i32 %n, 2
%fibm2 = call i32 @fib(i32 %nm2)
%retval = add i32 %fibm1, %fibm2
ret i32 %retval
exit:
ret i32 1
}
; CHECK: define amdgpu_kernel void @kernel(i32 addrspace(1)* %m) #[[FIB]] {
define amdgpu_kernel void @kernel(i32 addrspace(1)* %m) #1 {
%r = call i32 @fib(i32 5)
store i32 %r, i32 addrspace(1)* %m
ret void
}
attributes #1 = { "uniform-work-group-size"="true" }
; CHECK: attributes #[[FIB]] = { "uniform-work-group-size"="true" }


@ -0,0 +1,35 @@
; RUN: opt -S -mtriple=amdgcn-amd- -amdgpu-annotate-kernel-features %s | FileCheck %s
; CHECK: define void @func1() #[[FUNC:[0-9]+]] {
define void @func1() #0 {
ret void
}
; CHECK: define void @func4() #[[FUNC]] {
define void @func4() #1 {
ret void
}
; CHECK: define void @func2() #[[FUNC]] {
define void @func2() #1 {
call void @func4()
call void @func1()
ret void
}
; CHECK: define void @func3() #[[FUNC]] {
define void @func3() #1 {
call void @func1()
ret void
}
; CHECK: define amdgpu_kernel void @kernel3() #[[FUNC]] {
define amdgpu_kernel void @kernel3() #2 {
call void @func2()
call void @func3()
ret void
}
attributes #2 = { "uniform-work-group-size"="true" }
; CHECK: attributes #[[FUNC]] = { "uniform-work-group-size"="true" }