From aa50d7ed0d260ee5e837a44ac4cf40f1548a7a14 Mon Sep 17 00:00:00 2001 From: madhur13490 Date: Wed, 26 May 2021 11:17:03 +0530 Subject: [PATCH] [AMDGPU] [IndirectCalls] Don't propagate attributes to address taken functions and their callees Don't propagate launch bound related attributes to address taken functions and their callees. The idea is to do a traversal over the call graph starting at address taken functions and erase the attributes set by previous logic i.e. process(). This two phase approach makes sure that we don't miss out on deep nested callees from address taken functions as a function might be called directly as well as indirectly. This patch is also reattempt to D94585 as latent issues are fixed in hasAddressTaken function in the recent past. Reviewed By: arsenm Differential Revision: https://reviews.llvm.org/D103138 --- .../AMDGPU/AMDGPUPropagateAttributes.cpp | 95 +++++++++++++++++-- test/CodeGen/AMDGPU/opt-pipeline.ll | 3 + .../propagate-attributes-common-callees.ll | 55 +++++++++++ ...ttributes-direct-indirect-common-callee.ll | 53 +++++++++++ ...te-attributes-function-pointer-argument.ll | 7 +- .../AMDGPU/propagate-attributes-indirect.ll | 34 +++++++ .../propagate-attributes-launch-bounds.ll | 32 +++++++ 7 files changed, 267 insertions(+), 12 deletions(-) create mode 100644 test/CodeGen/AMDGPU/propagate-attributes-common-callees.ll create mode 100644 test/CodeGen/AMDGPU/propagate-attributes-direct-indirect-common-callee.ll create mode 100644 test/CodeGen/AMDGPU/propagate-attributes-indirect.ll create mode 100644 test/CodeGen/AMDGPU/propagate-attributes-launch-bounds.ll diff --git a/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp b/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp index 0e4c26170a8..fbad7d9fdcf 100644 --- a/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp +++ b/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp @@ -30,11 +30,13 @@ #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "Utils/AMDGPUBaseInfo.h" #include "llvm/ADT/SmallSet.h" +#include "llvm/Analysis/CallGraph.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/IR/InstrTypes.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Transforms/Utils/Cloning.h" +#include #define DEBUG_TYPE "amdgpu-propagate-attributes" @@ -113,12 +115,17 @@ class AMDGPUPropagateAttributes { // Clone functions as needed or just set attributes. bool AllowClone; + CallGraph *ModuleCG = nullptr; + // Option propagation roots. SmallSet Roots; // Clones of functions with their attributes. SmallVector Clones; + // To memoize address taken functions. + SmallSet AddressTakenFunctions; + // Find a clone with required features. Function *findFunction(const FnProperties &PropsNeeded, Function *OrigF); @@ -139,14 +146,23 @@ class AMDGPUPropagateAttributes { bool process(); public: - AMDGPUPropagateAttributes(const TargetMachine *TM, bool AllowClone) : - TM(TM), AllowClone(AllowClone) {} + AMDGPUPropagateAttributes(const TargetMachine *TM, bool AllowClone) + : TM(TM), AllowClone(AllowClone) {} // Use F as a root and propagate its attributes. bool process(Function &F); // Propagate attributes starting from kernel functions. - bool process(Module &M); + bool process(Module &M, CallGraph *CG); + + // Remove attributes from F. + // This is used in presence of address taken functions. + bool removeAttributes(Function *F); + + // Handle call graph rooted at address taken functions. + // This function will erase all attributes present + // on all functions called from address taken functions transitively. + bool handleAddressTakenFunctions(CallGraph *CG); }; // Allows to propagate attributes early, but no clonning is allowed as it must @@ -182,6 +198,7 @@ public: *PassRegistry::getPassRegistry()); } + void getAnalysisUsage(AnalysisUsage &AU) const override; bool runOnModule(Module &M) override; }; @@ -199,6 +216,49 @@ INITIALIZE_PASS(AMDGPUPropagateAttributesLate, "Late propagate attributes from kernels to functions", false, false) +bool AMDGPUPropagateAttributes::removeAttributes(Function *F) { + bool Changed = false; + if (!F) + return Changed; + LLVM_DEBUG(dbgs() << "Removing attributes from " << F->getName() << '\n'); + for (unsigned I = 0; I < NumAttr; ++I) { + if (F->hasFnAttribute(AttributeNames[I])) { + F->removeFnAttr(AttributeNames[I]); + Changed = true; + } + } + return Changed; +} + +bool AMDGPUPropagateAttributes::handleAddressTakenFunctions(CallGraph *CG) { + assert(ModuleCG && "Call graph not present"); + + bool Changed = false; + SmallSet Visited; + + for (Function *F : AddressTakenFunctions) { + CallGraphNode *CGN = (*CG)[F]; + if (!Visited.count(CGN)) { + Changed |= removeAttributes(F); + Visited.insert(CGN); + } + + std::queue SubGraph; + SubGraph.push(CGN); + while (!SubGraph.empty()) { + CallGraphNode *CGN = SubGraph.front(); + SubGraph.pop(); + if (!Visited.count(CGN)) { + Changed |= removeAttributes(CGN->getFunction()); + Visited.insert(CGN); + } + for (auto N : *CGN) + SubGraph.push(N.second); + } + } + return Changed; +} + Function * AMDGPUPropagateAttributes::findFunction(const FnProperties &PropsNeeded, Function *OrigF) { @@ -210,11 +270,12 @@ AMDGPUPropagateAttributes::findFunction(const FnProperties &PropsNeeded, return nullptr; } -bool AMDGPUPropagateAttributes::process(Module &M) { +bool AMDGPUPropagateAttributes::process(Module &M, CallGraph *CG) { for (auto &F : M.functions()) if (AMDGPU::isEntryFunctionCC(F.getCallingConv())) Roots.insert(&F); + ModuleCG = CG; return process(); } @@ -240,6 +301,9 @@ bool AMDGPUPropagateAttributes::process() { if (F.isDeclaration()) continue; + if (F.hasAddressTaken(nullptr, true, true, true)) + AddressTakenFunctions.insert(&F); + const FnProperties CalleeProps(*TM, F); SmallVector, 32> ToReplace; SmallSet Visited; @@ -310,6 +374,12 @@ bool AMDGPUPropagateAttributes::process() { Roots.clear(); Clones.clear(); + // Keep the post processing related to indirect + // calls separate to handle them gracefully. + // The core traversal need not be affected by this. + if (AllowClone) + Changed |= handleAddressTakenFunctions(ModuleCG); + return Changed; } @@ -389,6 +459,10 @@ bool AMDGPUPropagateAttributesEarly::runOnFunction(Function &F) { return AMDGPUPropagateAttributes(TM, false).process(F); } +void AMDGPUPropagateAttributesLate::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired(); +} + bool AMDGPUPropagateAttributesLate::runOnModule(Module &M) { if (!TM) { auto *TPC = getAnalysisIfAvailable(); @@ -397,8 +471,8 @@ bool AMDGPUPropagateAttributesLate::runOnModule(Module &M) { TM = &TPC->getTM(); } - - return AMDGPUPropagateAttributes(TM, true).process(M); + CallGraph &CG = getAnalysis().getCallGraph(); + return AMDGPUPropagateAttributes(TM, true).process(M, &CG); } FunctionPass @@ -423,8 +497,9 @@ AMDGPUPropagateAttributesEarlyPass::run(Function &F, } PreservedAnalyses -AMDGPUPropagateAttributesLatePass::run(Module &M, ModuleAnalysisManager &AM) { - return AMDGPUPropagateAttributes(&TM, true).process(M) - ? PreservedAnalyses::none() - : PreservedAnalyses::all(); +AMDGPUPropagateAttributesLatePass::run(Module &M, ModuleAnalysisManager &MAM) { + AMDGPUPropagateAttributes APA(&TM, true); + CallGraph &CG = MAM.getResult(M); + const bool Changed = APA.process(M, &CG); + return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all(); } diff --git a/test/CodeGen/AMDGPU/opt-pipeline.ll b/test/CodeGen/AMDGPU/opt-pipeline.ll index c18898164e2..25051b885f9 100644 --- a/test/CodeGen/AMDGPU/opt-pipeline.ll +++ b/test/CodeGen/AMDGPU/opt-pipeline.ll @@ -65,6 +65,7 @@ ; GCN-O1-NEXT: AMDGPU Printf lowering ; GCN-O1-NEXT: FunctionPass Manager ; GCN-O1-NEXT: Dominator Tree Construction +; GCN-O1-NEXT: CallGraph Construction ; GCN-O1-NEXT: Late propagate attributes from kernels to functions ; GCN-O1-NEXT: Interprocedural Sparse Conditional Constant Propagation ; GCN-O1-NEXT: FunctionPass Manager @@ -374,6 +375,7 @@ ; GCN-O2-NEXT: AMDGPU Printf lowering ; GCN-O2-NEXT: FunctionPass Manager ; GCN-O2-NEXT: Dominator Tree Construction +; GCN-O2-NEXT: CallGraph Construction ; GCN-O2-NEXT: Late propagate attributes from kernels to functions ; GCN-O2-NEXT: Interprocedural Sparse Conditional Constant Propagation ; GCN-O2-NEXT: FunctionPass Manager @@ -728,6 +730,7 @@ ; GCN-O3-NEXT: AMDGPU Printf lowering ; GCN-O3-NEXT: FunctionPass Manager ; GCN-O3-NEXT: Dominator Tree Construction +; GCN-O3-NEXT: CallGraph Construction ; GCN-O3-NEXT: Late propagate attributes from kernels to functions ; GCN-O3-NEXT: FunctionPass Manager ; GCN-O3-NEXT: Dominator Tree Construction diff --git a/test/CodeGen/AMDGPU/propagate-attributes-common-callees.ll b/test/CodeGen/AMDGPU/propagate-attributes-common-callees.ll new file mode 100644 index 00000000000..394a390980a --- /dev/null +++ b/test/CodeGen/AMDGPU/propagate-attributes-common-callees.ll @@ -0,0 +1,55 @@ +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-propagate-attributes-late %s | FileCheck %s + +; CHECK-LABEL: define float @common() { +define float @common() { + ret float 0.0 +} + +; CHECK-LABEL: define float @foo() { +define float @foo() { + %direct_call = call contract float @common() + ret float %direct_call +} + +; CHECK-LABEL: define float @bar() { +define float @bar() { + ret float 0.0 +} + +; CHECK-LABEL: define float @baz() { +define float @baz() { + ret float 0.0 +} + +define amdgpu_kernel void @switch_indirect_kernel(float *%result, i32 %type) #1 { + %fn = alloca float ()* + switch i32 %type, label %sw.default [ + i32 1, label %sw.bb + i32 2, label %sw.bb2 + i32 3, label %sw.bb3 + ] + +sw.bb: + store float ()* @foo, float ()** %fn + br label %sw.epilog + +sw.bb2: + store float ()* @bar, float ()** %fn + br label %sw.epilog + +sw.bb3: + store float ()* @baz, float ()** %fn + br label %sw.epilog + +sw.default: + br label %sw.epilog + +sw.epilog: + %fp = load float ()*, float ()** %fn + %direct_call = call contract float @common() + %indirect_call = call contract float %fp() + store float %indirect_call, float* %result + ret void +} + +attributes #1 = { "amdgpu-flat-work-group-size"="1,256" } diff --git a/test/CodeGen/AMDGPU/propagate-attributes-direct-indirect-common-callee.ll b/test/CodeGen/AMDGPU/propagate-attributes-direct-indirect-common-callee.ll new file mode 100644 index 00000000000..f3e88ea1290 --- /dev/null +++ b/test/CodeGen/AMDGPU/propagate-attributes-direct-indirect-common-callee.ll @@ -0,0 +1,53 @@ +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-propagate-attributes-late %s | FileCheck %s + +; Test to check if we skip propgating attributes even if +; a function is called directly as well as +; indirectly. "baz" is called directly as well indirectly. + +; CHECK-LABEL: define float @foo() +define float @foo() #1 { + ret float 0.0 +} + +; CHECK-LABEL: define float @bar() +define float @bar() #1 { + ret float 0.0 +} + +; CHECK-LABEL: define float @baz() +define float @baz() #1 { + ret float 0.0 +} + +define amdgpu_kernel void @switch_indirect_kernel(float *%result, i32 %type) #1 { + %fn = alloca float ()* + switch i32 %type, label %sw.default [ + i32 1, label %sw.bb + i32 2, label %sw.bb2 + i32 3, label %sw.bb3 + ] + +sw.bb: + store float ()* @foo, float ()** %fn + br label %sw.epilog + +sw.bb2: + store float ()* @bar, float ()** %fn + br label %sw.epilog + +sw.bb3: + store float ()* @baz, float ()** %fn + br label %sw.epilog + +sw.default: + br label %sw.epilog + +sw.epilog: + %fp = load float ()*, float ()** %fn + %direct_call = call contract float @baz() + %indirect_call = call contract float %fp() + store float %indirect_call, float* %result + ret void +} + +attributes #1 = { "amdgpu-flat-work-group-size"="1,256" } diff --git a/test/CodeGen/AMDGPU/propagate-attributes-function-pointer-argument.ll b/test/CodeGen/AMDGPU/propagate-attributes-function-pointer-argument.ll index 370c5cc17e9..44d52fb934f 100644 --- a/test/CodeGen/AMDGPU/propagate-attributes-function-pointer-argument.ll +++ b/test/CodeGen/AMDGPU/propagate-attributes-function-pointer-argument.ll @@ -30,11 +30,14 @@ define private void @f(void ()* nocapture %0) #0 { ; In order to expose this bug, it is necessary that `g` have one of the ; propagated attributes, so that a clone and substitution would take place if g ; were actually the function being called. -; CHECK-DAG: define private void @g.1() #1 -; CHECK-DAG: define internal void @g() #2 +; CHECK-DAG: define private void @g.1() #0 +; CHECK-DAG: define internal void @g() #1 define private void @g() #1 { ret void } attributes #0 = { noinline } attributes #1 = { noinline "amdgpu-waves-per-eu"="1,10" } + +; CHECK: attributes #0 = { noinline } +; CHECK-NEXT: attributes #1 = { noinline "target-features"="+enable-ds128,+enable-prt-strict-null,+flat-address-space,+flat-for-global,+load-store-opt,+promote-alloca,+trap-handler,+unaligned-access-mode,-wavefrontsize16,-wavefrontsize32,+wavefrontsize64" } diff --git a/test/CodeGen/AMDGPU/propagate-attributes-indirect.ll b/test/CodeGen/AMDGPU/propagate-attributes-indirect.ll new file mode 100644 index 00000000000..2401f4c81a7 --- /dev/null +++ b/test/CodeGen/AMDGPU/propagate-attributes-indirect.ll @@ -0,0 +1,34 @@ +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-propagate-attributes-late %s | FileCheck %s + +; Test to check if we skip attributes on address +; taken functions and its callees. + +; CHECK-LABEL: define float @bar() { +define float @bar() { + ret float 0.0 +} + +; CHECK-LABEL: define float @baz() { +define float @baz() { + ret float 0.0 +} + +; CHECK-LABEL: define float @foo() { +define float @foo() { + %v1 = call contract float @bar() + %v2 = call contract float @baz() + %v3 = fadd float %v1, %v2 + ret float %v3 +} + +; CHECK-LABEL: define amdgpu_kernel void @kernel(float* %result, i32 %type) #0 { +define amdgpu_kernel void @kernel(float *%result, i32 %type) #1 { + %fn = alloca float ()* + store float ()* @foo, float ()** %fn + %fp = load float ()*, float ()** %fn + %indirect_call = call contract float %fp() + store float %indirect_call, float* %result + ret void +} + +attributes #1 = { "amdgpu-flat-work-group-size"="1,256" } diff --git a/test/CodeGen/AMDGPU/propagate-attributes-launch-bounds.ll b/test/CodeGen/AMDGPU/propagate-attributes-launch-bounds.ll new file mode 100644 index 00000000000..362c40293c3 --- /dev/null +++ b/test/CodeGen/AMDGPU/propagate-attributes-launch-bounds.ll @@ -0,0 +1,32 @@ +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-propagate-attributes-late %s | FileCheck %s + +; Test attributes on a function which +; is called indirectly from two kernels +; having different launch bounds. + +; This function should not have any attributes on it. +; CHECK-LABEL: define float @foo() { +define float @foo() { + ret float 0.0 +} + +define amdgpu_kernel void @kernel1(float *%result, i32 %type) #1 { + %fn = alloca float ()* + store float ()* @foo, float ()** %fn + %fp = load float ()*, float ()** %fn + %indirect_call = call contract float %fp() + store float %indirect_call, float* %result + ret void +} + +define amdgpu_kernel void @kernel2(float *%result, i32 %type) #2 { + %fn = alloca float ()* + store float ()* @foo, float ()** %fn + %fp = load float ()*, float ()** %fn + %indirect_call = call contract float %fp() + store float %indirect_call, float* %result + ret void +} + +attributes #1 = { "amdgpu-flat-work-group-size"="1,256" } +attributes #2 = { "amdgpu-flat-work-group-size"="1,512" }