1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-11-22 10:42:39 +01:00

[AMDGPU] [IndirectCalls] Don't propagate attributes to address taken functions and their callees

Don't propagate launch bound related attributes to
address taken functions and their callees. The idea
is to do a traversal over the call graph starting at
address taken functions and erase the attributes
set by previous logic i.e. process().

This two phase approach makes sure that we don't
miss out on deep nested callees from address taken
functions as a function might be called directly as
well as indirectly.

This patch is also reattempt to D94585 as latent issues
are fixed in hasAddressTaken function in the recent
past.

Reviewed By: arsenm

Differential Revision: https://reviews.llvm.org/D103138
This commit is contained in:
madhur13490 2021-05-26 11:17:03 +05:30
parent f0ab92c1c6
commit aa50d7ed0d
7 changed files with 267 additions and 12 deletions

View File

@ -30,11 +30,13 @@
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include <queue>
#define DEBUG_TYPE "amdgpu-propagate-attributes"
@ -113,12 +115,17 @@ class AMDGPUPropagateAttributes {
// Clone functions as needed or just set attributes.
bool AllowClone;
CallGraph *ModuleCG = nullptr;
// Option propagation roots.
SmallSet<Function *, 32> Roots;
// Clones of functions with their attributes.
SmallVector<Clone, 32> Clones;
// To memoize address taken functions.
SmallSet<Function *, 32> AddressTakenFunctions;
// Find a clone with required features.
Function *findFunction(const FnProperties &PropsNeeded,
Function *OrigF);
@ -139,14 +146,23 @@ class AMDGPUPropagateAttributes {
bool process();
public:
AMDGPUPropagateAttributes(const TargetMachine *TM, bool AllowClone) :
TM(TM), AllowClone(AllowClone) {}
AMDGPUPropagateAttributes(const TargetMachine *TM, bool AllowClone)
: TM(TM), AllowClone(AllowClone) {}
// Use F as a root and propagate its attributes.
bool process(Function &F);
// Propagate attributes starting from kernel functions.
bool process(Module &M);
bool process(Module &M, CallGraph *CG);
// Remove attributes from F.
// This is used in presence of address taken functions.
bool removeAttributes(Function *F);
// Handle call graph rooted at address taken functions.
// This function will erase all attributes present
// on all functions called from address taken functions transitively.
bool handleAddressTakenFunctions(CallGraph *CG);
};
// Allows to propagate attributes early, but no clonning is allowed as it must
@ -182,6 +198,7 @@ public:
*PassRegistry::getPassRegistry());
}
void getAnalysisUsage(AnalysisUsage &AU) const override;
bool runOnModule(Module &M) override;
};
@ -199,6 +216,49 @@ INITIALIZE_PASS(AMDGPUPropagateAttributesLate,
"Late propagate attributes from kernels to functions",
false, false)
bool AMDGPUPropagateAttributes::removeAttributes(Function *F) {
bool Changed = false;
if (!F)
return Changed;
LLVM_DEBUG(dbgs() << "Removing attributes from " << F->getName() << '\n');
for (unsigned I = 0; I < NumAttr; ++I) {
if (F->hasFnAttribute(AttributeNames[I])) {
F->removeFnAttr(AttributeNames[I]);
Changed = true;
}
}
return Changed;
}
bool AMDGPUPropagateAttributes::handleAddressTakenFunctions(CallGraph *CG) {
assert(ModuleCG && "Call graph not present");
bool Changed = false;
SmallSet<CallGraphNode *, 32> Visited;
for (Function *F : AddressTakenFunctions) {
CallGraphNode *CGN = (*CG)[F];
if (!Visited.count(CGN)) {
Changed |= removeAttributes(F);
Visited.insert(CGN);
}
std::queue<CallGraphNode *> SubGraph;
SubGraph.push(CGN);
while (!SubGraph.empty()) {
CallGraphNode *CGN = SubGraph.front();
SubGraph.pop();
if (!Visited.count(CGN)) {
Changed |= removeAttributes(CGN->getFunction());
Visited.insert(CGN);
}
for (auto N : *CGN)
SubGraph.push(N.second);
}
}
return Changed;
}
Function *
AMDGPUPropagateAttributes::findFunction(const FnProperties &PropsNeeded,
Function *OrigF) {
@ -210,11 +270,12 @@ AMDGPUPropagateAttributes::findFunction(const FnProperties &PropsNeeded,
return nullptr;
}
bool AMDGPUPropagateAttributes::process(Module &M) {
bool AMDGPUPropagateAttributes::process(Module &M, CallGraph *CG) {
for (auto &F : M.functions())
if (AMDGPU::isEntryFunctionCC(F.getCallingConv()))
Roots.insert(&F);
ModuleCG = CG;
return process();
}
@ -240,6 +301,9 @@ bool AMDGPUPropagateAttributes::process() {
if (F.isDeclaration())
continue;
if (F.hasAddressTaken(nullptr, true, true, true))
AddressTakenFunctions.insert(&F);
const FnProperties CalleeProps(*TM, F);
SmallVector<std::pair<CallBase *, Function *>, 32> ToReplace;
SmallSet<CallBase *, 32> Visited;
@ -310,6 +374,12 @@ bool AMDGPUPropagateAttributes::process() {
Roots.clear();
Clones.clear();
// Keep the post processing related to indirect
// calls separate to handle them gracefully.
// The core traversal need not be affected by this.
if (AllowClone)
Changed |= handleAddressTakenFunctions(ModuleCG);
return Changed;
}
@ -389,6 +459,10 @@ bool AMDGPUPropagateAttributesEarly::runOnFunction(Function &F) {
return AMDGPUPropagateAttributes(TM, false).process(F);
}
void AMDGPUPropagateAttributesLate::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addRequired<CallGraphWrapperPass>();
}
bool AMDGPUPropagateAttributesLate::runOnModule(Module &M) {
if (!TM) {
auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
@ -397,8 +471,8 @@ bool AMDGPUPropagateAttributesLate::runOnModule(Module &M) {
TM = &TPC->getTM<TargetMachine>();
}
return AMDGPUPropagateAttributes(TM, true).process(M);
CallGraph &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph();
return AMDGPUPropagateAttributes(TM, true).process(M, &CG);
}
FunctionPass
@ -423,8 +497,9 @@ AMDGPUPropagateAttributesEarlyPass::run(Function &F,
}
PreservedAnalyses
AMDGPUPropagateAttributesLatePass::run(Module &M, ModuleAnalysisManager &AM) {
return AMDGPUPropagateAttributes(&TM, true).process(M)
? PreservedAnalyses::none()
: PreservedAnalyses::all();
AMDGPUPropagateAttributesLatePass::run(Module &M, ModuleAnalysisManager &MAM) {
AMDGPUPropagateAttributes APA(&TM, true);
CallGraph &CG = MAM.getResult<CallGraphAnalysis>(M);
const bool Changed = APA.process(M, &CG);
return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
}

View File

@ -65,6 +65,7 @@
; GCN-O1-NEXT: AMDGPU Printf lowering
; GCN-O1-NEXT: FunctionPass Manager
; GCN-O1-NEXT: Dominator Tree Construction
; GCN-O1-NEXT: CallGraph Construction
; GCN-O1-NEXT: Late propagate attributes from kernels to functions
; GCN-O1-NEXT: Interprocedural Sparse Conditional Constant Propagation
; GCN-O1-NEXT: FunctionPass Manager
@ -374,6 +375,7 @@
; GCN-O2-NEXT: AMDGPU Printf lowering
; GCN-O2-NEXT: FunctionPass Manager
; GCN-O2-NEXT: Dominator Tree Construction
; GCN-O2-NEXT: CallGraph Construction
; GCN-O2-NEXT: Late propagate attributes from kernels to functions
; GCN-O2-NEXT: Interprocedural Sparse Conditional Constant Propagation
; GCN-O2-NEXT: FunctionPass Manager
@ -728,6 +730,7 @@
; GCN-O3-NEXT: AMDGPU Printf lowering
; GCN-O3-NEXT: FunctionPass Manager
; GCN-O3-NEXT: Dominator Tree Construction
; GCN-O3-NEXT: CallGraph Construction
; GCN-O3-NEXT: Late propagate attributes from kernels to functions
; GCN-O3-NEXT: FunctionPass Manager
; GCN-O3-NEXT: Dominator Tree Construction

View File

@ -0,0 +1,55 @@
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-propagate-attributes-late %s | FileCheck %s
; CHECK-LABEL: define float @common() {
define float @common() {
ret float 0.0
}
; CHECK-LABEL: define float @foo() {
define float @foo() {
%direct_call = call contract float @common()
ret float %direct_call
}
; CHECK-LABEL: define float @bar() {
define float @bar() {
ret float 0.0
}
; CHECK-LABEL: define float @baz() {
define float @baz() {
ret float 0.0
}
define amdgpu_kernel void @switch_indirect_kernel(float *%result, i32 %type) #1 {
%fn = alloca float ()*
switch i32 %type, label %sw.default [
i32 1, label %sw.bb
i32 2, label %sw.bb2
i32 3, label %sw.bb3
]
sw.bb:
store float ()* @foo, float ()** %fn
br label %sw.epilog
sw.bb2:
store float ()* @bar, float ()** %fn
br label %sw.epilog
sw.bb3:
store float ()* @baz, float ()** %fn
br label %sw.epilog
sw.default:
br label %sw.epilog
sw.epilog:
%fp = load float ()*, float ()** %fn
%direct_call = call contract float @common()
%indirect_call = call contract float %fp()
store float %indirect_call, float* %result
ret void
}
attributes #1 = { "amdgpu-flat-work-group-size"="1,256" }

View File

@ -0,0 +1,53 @@
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-propagate-attributes-late %s | FileCheck %s
; Test to check if we skip propgating attributes even if
; a function is called directly as well as
; indirectly. "baz" is called directly as well indirectly.
; CHECK-LABEL: define float @foo()
define float @foo() #1 {
ret float 0.0
}
; CHECK-LABEL: define float @bar()
define float @bar() #1 {
ret float 0.0
}
; CHECK-LABEL: define float @baz()
define float @baz() #1 {
ret float 0.0
}
define amdgpu_kernel void @switch_indirect_kernel(float *%result, i32 %type) #1 {
%fn = alloca float ()*
switch i32 %type, label %sw.default [
i32 1, label %sw.bb
i32 2, label %sw.bb2
i32 3, label %sw.bb3
]
sw.bb:
store float ()* @foo, float ()** %fn
br label %sw.epilog
sw.bb2:
store float ()* @bar, float ()** %fn
br label %sw.epilog
sw.bb3:
store float ()* @baz, float ()** %fn
br label %sw.epilog
sw.default:
br label %sw.epilog
sw.epilog:
%fp = load float ()*, float ()** %fn
%direct_call = call contract float @baz()
%indirect_call = call contract float %fp()
store float %indirect_call, float* %result
ret void
}
attributes #1 = { "amdgpu-flat-work-group-size"="1,256" }

View File

@ -30,11 +30,14 @@ define private void @f(void ()* nocapture %0) #0 {
; In order to expose this bug, it is necessary that `g` have one of the
; propagated attributes, so that a clone and substitution would take place if g
; were actually the function being called.
; CHECK-DAG: define private void @g.1() #1
; CHECK-DAG: define internal void @g() #2
; CHECK-DAG: define private void @g.1() #0
; CHECK-DAG: define internal void @g() #1
define private void @g() #1 {
ret void
}
attributes #0 = { noinline }
attributes #1 = { noinline "amdgpu-waves-per-eu"="1,10" }
; CHECK: attributes #0 = { noinline }
; CHECK-NEXT: attributes #1 = { noinline "target-features"="+enable-ds128,+enable-prt-strict-null,+flat-address-space,+flat-for-global,+load-store-opt,+promote-alloca,+trap-handler,+unaligned-access-mode,-wavefrontsize16,-wavefrontsize32,+wavefrontsize64" }

View File

@ -0,0 +1,34 @@
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-propagate-attributes-late %s | FileCheck %s
; Test to check if we skip attributes on address
; taken functions and its callees.
; CHECK-LABEL: define float @bar() {
define float @bar() {
ret float 0.0
}
; CHECK-LABEL: define float @baz() {
define float @baz() {
ret float 0.0
}
; CHECK-LABEL: define float @foo() {
define float @foo() {
%v1 = call contract float @bar()
%v2 = call contract float @baz()
%v3 = fadd float %v1, %v2
ret float %v3
}
; CHECK-LABEL: define amdgpu_kernel void @kernel(float* %result, i32 %type) #0 {
define amdgpu_kernel void @kernel(float *%result, i32 %type) #1 {
%fn = alloca float ()*
store float ()* @foo, float ()** %fn
%fp = load float ()*, float ()** %fn
%indirect_call = call contract float %fp()
store float %indirect_call, float* %result
ret void
}
attributes #1 = { "amdgpu-flat-work-group-size"="1,256" }

View File

@ -0,0 +1,32 @@
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-propagate-attributes-late %s | FileCheck %s
; Test attributes on a function which
; is called indirectly from two kernels
; having different launch bounds.
; This function should not have any attributes on it.
; CHECK-LABEL: define float @foo() {
define float @foo() {
ret float 0.0
}
define amdgpu_kernel void @kernel1(float *%result, i32 %type) #1 {
%fn = alloca float ()*
store float ()* @foo, float ()** %fn
%fp = load float ()*, float ()** %fn
%indirect_call = call contract float %fp()
store float %indirect_call, float* %result
ret void
}
define amdgpu_kernel void @kernel2(float *%result, i32 %type) #2 {
%fn = alloca float ()*
store float ()* @foo, float ()** %fn
%fp = load float ()*, float ()** %fn
%indirect_call = call contract float %fp()
store float %indirect_call, float* %result
ret void
}
attributes #1 = { "amdgpu-flat-work-group-size"="1,256" }
attributes #2 = { "amdgpu-flat-work-group-size"="1,512" }