[AMDGPU] [IndirectCalls] Don't propagate attributes to address taken functions and their callees

Don't propagate launch bound related attributes to address taken functions and their callees. The idea is to do a traversal over the call graph starting at address taken functions and erase the attributes set by previous logic i.e. process(). This two phase approach makes sure that we don't miss out on deep nested callees from address taken functions as a function might be called directly as well as indirectly. This patch is also reattempt to D94585 as latent issues are fixed in hasAddressTaken function in the recent past. Reviewed By: arsenm Differential Revision: https://reviews.llvm.org/D103138
2024-11-22 10:42:39 +01:00 · 2021-05-26 11:17:03 +05:30 · 2021-05-26 11:17:03 +05:30 · aa50d7ed0d
commit aa50d7ed0d
parent f0ab92c1c6
7 changed files with 267 additions and 12 deletions
--- a/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp
+++ b/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp
@ -30,11 +30,13 @@
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "Utils/AMDGPUBaseInfo.h"
 #include "llvm/ADT/SmallSet.h"
+#include "llvm/Analysis/CallGraph.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/CodeGen/TargetSubtargetInfo.h"
 #include "llvm/IR/InstrTypes.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Transforms/Utils/Cloning.h"
+#include <queue>

 #define DEBUG_TYPE "amdgpu-propagate-attributes"

@ -113,12 +115,17 @@ class AMDGPUPropagateAttributes {
  // Clone functions as needed or just set attributes.
  bool AllowClone;

+  CallGraph *ModuleCG = nullptr;
+
  // Option propagation roots.
  SmallSet<Function *, 32> Roots;

  // Clones of functions with their attributes.
  SmallVector<Clone, 32> Clones;

+  // To memoize address taken functions.
+  SmallSet<Function *, 32> AddressTakenFunctions;
+
  // Find a clone with required features.
  Function *findFunction(const FnProperties &PropsNeeded,
                         Function *OrigF);
@ -139,14 +146,23 @@ class AMDGPUPropagateAttributes {
  bool process();

 public:
-  AMDGPUPropagateAttributes(const TargetMachine *TM, bool AllowClone) :
-    TM(TM), AllowClone(AllowClone) {}
+  AMDGPUPropagateAttributes(const TargetMachine *TM, bool AllowClone)
+      : TM(TM), AllowClone(AllowClone) {}

  // Use F as a root and propagate its attributes.
  bool process(Function &F);

  // Propagate attributes starting from kernel functions.
-  bool process(Module &M);
+  bool process(Module &M, CallGraph *CG);
+
+  // Remove attributes from F.
+  // This is used in presence of address taken functions.
+  bool removeAttributes(Function *F);
+
+  // Handle call graph rooted at address taken functions.
+  // This function will erase all attributes present
+  // on all functions called from address taken functions transitively.
+  bool handleAddressTakenFunctions(CallGraph *CG);
 };

 // Allows to propagate attributes early, but no clonning is allowed as it must
@ -182,6 +198,7 @@ public:
      *PassRegistry::getPassRegistry());
  }

+  void getAnalysisUsage(AnalysisUsage &AU) const override;
  bool runOnModule(Module &M) override;
 };

@ -199,6 +216,49 @@ INITIALIZE_PASS(AMDGPUPropagateAttributesLate,
                "Late propagate attributes from kernels to functions",
                false, false)

+bool AMDGPUPropagateAttributes::removeAttributes(Function *F) {
+  bool Changed = false;
+  if (!F)
+    return Changed;
+  LLVM_DEBUG(dbgs() << "Removing attributes from " << F->getName() << '\n');
+  for (unsigned I = 0; I < NumAttr; ++I) {
+    if (F->hasFnAttribute(AttributeNames[I])) {
+      F->removeFnAttr(AttributeNames[I]);
+      Changed = true;
+    }
+  }
+  return Changed;
+}
+
+bool AMDGPUPropagateAttributes::handleAddressTakenFunctions(CallGraph *CG) {
+  assert(ModuleCG && "Call graph not present");
+
+  bool Changed = false;
+  SmallSet<CallGraphNode *, 32> Visited;
+
+  for (Function *F : AddressTakenFunctions) {
+    CallGraphNode *CGN = (*CG)[F];
+    if (!Visited.count(CGN)) {
+      Changed |= removeAttributes(F);
+      Visited.insert(CGN);
+    }
+
+    std::queue<CallGraphNode *> SubGraph;
+    SubGraph.push(CGN);
+    while (!SubGraph.empty()) {
+      CallGraphNode *CGN = SubGraph.front();
+      SubGraph.pop();
+      if (!Visited.count(CGN)) {
+        Changed |= removeAttributes(CGN->getFunction());
+        Visited.insert(CGN);
+      }
+      for (auto N : *CGN)
+        SubGraph.push(N.second);
+    }
+  }
+  return Changed;
+}
+
 Function *
 AMDGPUPropagateAttributes::findFunction(const FnProperties &PropsNeeded,
                                        Function *OrigF) {
@ -210,11 +270,12 @@ AMDGPUPropagateAttributes::findFunction(const FnProperties &PropsNeeded,
  return nullptr;
 }

-bool AMDGPUPropagateAttributes::process(Module &M) {
+bool AMDGPUPropagateAttributes::process(Module &M, CallGraph *CG) {
  for (auto &F : M.functions())
    if (AMDGPU::isEntryFunctionCC(F.getCallingConv()))
      Roots.insert(&F);

+  ModuleCG = CG;
  return process();
 }

@ -240,6 +301,9 @@ bool AMDGPUPropagateAttributes::process() {
      if (F.isDeclaration())
        continue;

+      if (F.hasAddressTaken(nullptr, true, true, true))
+        AddressTakenFunctions.insert(&F);
+
      const FnProperties CalleeProps(*TM, F);
      SmallVector<std::pair<CallBase *, Function *>, 32> ToReplace;
      SmallSet<CallBase *, 32> Visited;
@ -310,6 +374,12 @@ bool AMDGPUPropagateAttributes::process() {
  Roots.clear();
  Clones.clear();

+  // Keep the post processing related to indirect
+  // calls separate to handle them gracefully.
+  // The core traversal need not be affected by this.
+  if (AllowClone)
+    Changed |= handleAddressTakenFunctions(ModuleCG);
+
  return Changed;
 }

@ -389,6 +459,10 @@ bool AMDGPUPropagateAttributesEarly::runOnFunction(Function &F) {
  return AMDGPUPropagateAttributes(TM, false).process(F);
 }

+void AMDGPUPropagateAttributesLate::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.addRequired<CallGraphWrapperPass>();
+}
+
 bool AMDGPUPropagateAttributesLate::runOnModule(Module &M) {
  if (!TM) {
    auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
@ -397,8 +471,8 @@ bool AMDGPUPropagateAttributesLate::runOnModule(Module &M) {

    TM = &TPC->getTM<TargetMachine>();
  }
-
-  return AMDGPUPropagateAttributes(TM, true).process(M);
+  CallGraph &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph();
+  return AMDGPUPropagateAttributes(TM, true).process(M, &CG);
 }

 FunctionPass
@ -423,8 +497,9 @@ AMDGPUPropagateAttributesEarlyPass::run(Function &F,
 }

 PreservedAnalyses
-AMDGPUPropagateAttributesLatePass::run(Module &M, ModuleAnalysisManager &AM) {
-  return AMDGPUPropagateAttributes(&TM, true).process(M)
-             ? PreservedAnalyses::none()
-             : PreservedAnalyses::all();
+AMDGPUPropagateAttributesLatePass::run(Module &M, ModuleAnalysisManager &MAM) {
+  AMDGPUPropagateAttributes APA(&TM, true);
+  CallGraph &CG = MAM.getResult<CallGraphAnalysis>(M);
+  const bool Changed = APA.process(M, &CG);
+  return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
 }
--- a/test/CodeGen/AMDGPU/opt-pipeline.ll
+++ b/test/CodeGen/AMDGPU/opt-pipeline.ll
@ -65,6 +65,7 @@
 ; GCN-O1-NEXT:     AMDGPU Printf lowering
 ; GCN-O1-NEXT:       FunctionPass Manager
 ; GCN-O1-NEXT:         Dominator Tree Construction
+; GCN-O1-NEXT:     CallGraph Construction
 ; GCN-O1-NEXT:     Late propagate attributes from kernels to functions
 ; GCN-O1-NEXT:     Interprocedural Sparse Conditional Constant Propagation
 ; GCN-O1-NEXT:       FunctionPass Manager
@ -374,6 +375,7 @@
 ; GCN-O2-NEXT:     AMDGPU Printf lowering
 ; GCN-O2-NEXT:       FunctionPass Manager
 ; GCN-O2-NEXT:         Dominator Tree Construction
+; GCN-O2-NEXT:     CallGraph Construction
 ; GCN-O2-NEXT:     Late propagate attributes from kernels to functions
 ; GCN-O2-NEXT:     Interprocedural Sparse Conditional Constant Propagation
 ; GCN-O2-NEXT:       FunctionPass Manager
@ -728,6 +730,7 @@
 ; GCN-O3-NEXT:     AMDGPU Printf lowering
 ; GCN-O3-NEXT:       FunctionPass Manager
 ; GCN-O3-NEXT:         Dominator Tree Construction
+; GCN-O3-NEXT:     CallGraph Construction
 ; GCN-O3-NEXT:     Late propagate attributes from kernels to functions
 ; GCN-O3-NEXT:     FunctionPass Manager
 ; GCN-O3-NEXT:       Dominator Tree Construction
--- a/test/CodeGen/AMDGPU/propagate-attributes-common-callees.ll
+++ b/test/CodeGen/AMDGPU/propagate-attributes-common-callees.ll
@ -0,0 +1,55 @@
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-propagate-attributes-late  %s | FileCheck %s
+
+; CHECK-LABEL: define float @common() {
+define float @common() {
+   ret float 0.0
+}
+
+; CHECK-LABEL: define float @foo() {
+define float @foo() {
+  %direct_call = call contract float @common()
+  ret float %direct_call
+}
+
+; CHECK-LABEL: define float @bar() {
+define float @bar() {
+   ret float 0.0
+}
+
+; CHECK-LABEL: define float @baz() {
+define float @baz() {
+   ret float 0.0
+}
+
+define amdgpu_kernel void @switch_indirect_kernel(float *%result, i32 %type) #1 {
+  %fn = alloca float ()*
+  switch i32 %type, label %sw.default [
+    i32 1, label %sw.bb
+    i32 2, label %sw.bb2
+    i32 3, label %sw.bb3
+  ]
+
+sw.bb:
+  store float ()* @foo, float ()** %fn
+  br label %sw.epilog
+
+sw.bb2:
+  store float ()* @bar, float ()** %fn
+  br label %sw.epilog
+
+sw.bb3:
+  store float ()* @baz, float ()** %fn
+  br label %sw.epilog
+
+sw.default:
+  br label %sw.epilog
+
+sw.epilog:
+  %fp = load float ()*, float ()** %fn
+  %direct_call = call contract float @common()
+  %indirect_call = call contract float %fp()
+  store float %indirect_call, float* %result
+  ret void
+}
+
+attributes #1 = { "amdgpu-flat-work-group-size"="1,256" }
--- a/test/CodeGen/AMDGPU/propagate-attributes-direct-indirect-common-callee.ll
+++ b/test/CodeGen/AMDGPU/propagate-attributes-direct-indirect-common-callee.ll
@ -0,0 +1,53 @@
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-propagate-attributes-late  %s | FileCheck %s
+
+; Test to check if we skip propgating attributes even if
+; a function is called directly as well as
+; indirectly. "baz" is called directly as well indirectly.
+
+; CHECK-LABEL: define float @foo()
+define float @foo() #1 {
+  ret float 0.0
+}
+
+; CHECK-LABEL: define float @bar()
+define float @bar() #1 {
+ ret float 0.0
+}
+
+; CHECK-LABEL: define float @baz()
+define float @baz() #1 {
+ ret float 0.0
+}
+
+define amdgpu_kernel void @switch_indirect_kernel(float *%result, i32 %type) #1 {
+  %fn = alloca float ()*
+  switch i32 %type, label %sw.default [
+    i32 1, label %sw.bb
+    i32 2, label %sw.bb2
+    i32 3, label %sw.bb3
+  ]
+
+sw.bb:
+  store float ()* @foo, float ()** %fn
+  br label %sw.epilog
+
+sw.bb2:
+  store float ()* @bar, float ()** %fn
+  br label %sw.epilog
+
+sw.bb3:
+  store float ()* @baz, float ()** %fn
+  br label %sw.epilog
+
+sw.default:
+  br label %sw.epilog
+
+sw.epilog:
+  %fp = load float ()*, float ()** %fn
+  %direct_call = call contract float @baz()
+  %indirect_call = call contract float %fp()
+  store float %indirect_call, float* %result
+  ret void
+}
+
+attributes #1 = { "amdgpu-flat-work-group-size"="1,256" }
--- a/test/CodeGen/AMDGPU/propagate-attributes-function-pointer-argument.ll
+++ b/test/CodeGen/AMDGPU/propagate-attributes-function-pointer-argument.ll
@ -30,11 +30,14 @@ define private void @f(void ()* nocapture %0) #0 {
 ; In order to expose this bug, it is necessary that `g` have one of the
 ; propagated attributes, so that a clone and substitution would take place if g
 ; were actually the function being called.
-; CHECK-DAG: define private void @g.1() #1
-; CHECK-DAG: define internal void @g() #2
+; CHECK-DAG: define private void @g.1() #0
+; CHECK-DAG: define internal void @g() #1
 define private void @g() #1 {
    ret void
 }

 attributes #0 = { noinline }
 attributes #1 = { noinline "amdgpu-waves-per-eu"="1,10" }
+
+; CHECK: attributes #0 = { noinline }
+; CHECK-NEXT: attributes #1 = { noinline "target-features"="+enable-ds128,+enable-prt-strict-null,+flat-address-space,+flat-for-global,+load-store-opt,+promote-alloca,+trap-handler,+unaligned-access-mode,-wavefrontsize16,-wavefrontsize32,+wavefrontsize64" }
--- a/test/CodeGen/AMDGPU/propagate-attributes-indirect.ll
+++ b/test/CodeGen/AMDGPU/propagate-attributes-indirect.ll
@ -0,0 +1,34 @@
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-propagate-attributes-late  %s | FileCheck %s
+
+; Test to check if we skip attributes on address
+; taken functions and its callees.
+
+; CHECK-LABEL: define float @bar() {
+define float @bar() {
+  ret float 0.0
+}
+
+; CHECK-LABEL: define float @baz() {
+define float @baz() {
+  ret float 0.0
+}
+
+; CHECK-LABEL: define float @foo() {
+define float @foo() {
+  %v1 = call contract float @bar()
+  %v2 = call contract float @baz()
+  %v3 = fadd float %v1, %v2
+  ret float %v3
+}
+
+; CHECK-LABEL: define amdgpu_kernel void @kernel(float* %result, i32 %type) #0 {
+define amdgpu_kernel void @kernel(float *%result, i32 %type) #1 {
+  %fn = alloca float ()*
+  store float ()* @foo, float ()** %fn
+  %fp = load float ()*, float ()** %fn
+  %indirect_call = call contract float %fp()
+  store float %indirect_call, float* %result
+  ret void
+}
+
+attributes #1 = { "amdgpu-flat-work-group-size"="1,256" }
--- a/test/CodeGen/AMDGPU/propagate-attributes-launch-bounds.ll
+++ b/test/CodeGen/AMDGPU/propagate-attributes-launch-bounds.ll
@ -0,0 +1,32 @@
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-propagate-attributes-late  %s | FileCheck %s
+
+; Test attributes on a function which
+; is called indirectly from two kernels
+; having different launch bounds.
+
+; This function should not have any attributes on it.
+; CHECK-LABEL: define float @foo() {
+define float @foo() {
+   ret float 0.0
+}
+
+define amdgpu_kernel void @kernel1(float *%result, i32 %type) #1 {
+  %fn = alloca float ()*
+  store float ()* @foo, float ()** %fn
+  %fp = load float ()*, float ()** %fn
+  %indirect_call = call contract float %fp()
+  store float %indirect_call, float* %result
+  ret void
+}
+
+define amdgpu_kernel void @kernel2(float *%result, i32 %type) #2 {
+  %fn = alloca float ()*
+  store float ()* @foo, float ()** %fn
+  %fp = load float ()*, float ()** %fn
+  %indirect_call = call contract float %fp()
+  store float %indirect_call, float* %result
+  ret void
+}
+
+attributes #1 = { "amdgpu-flat-work-group-size"="1,256" }
+attributes #2 = { "amdgpu-flat-work-group-size"="1,512" }