mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-11-23 11:13:28 +01:00
2618da3d3b
Relying on any MachineFunction state in the MachineFunctionInfo constructor is hazardous, because the construction time is unclear and determined by the first use. The function may be only partially constructed, which is part of why we have many of these hacky string attributes to track what we need for ABI lowering. For SelectionDAG, all stack objects are created up-front before calling convention lowering so stack objects are visible at construction time. For GlobalISel, none of the IR function has been visited yet and the allocas haven't been added to the MachineFrameInfo yet. This should fix failing to set flat_scratch_init in GlobalISel when needed. This pass really needs to be turned into some kind of analysis, but I haven't found a nice way to use one here.
406 lines
12 KiB
C++
406 lines
12 KiB
C++
//===- AMDGPUAnnotateKernelFeaturesPass.cpp -------------------------------===//
|
|
//
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
//
|
|
/// \file This pass adds target attributes to functions which use intrinsics
|
|
/// which will impact calling convention lowering.
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
#include "AMDGPU.h"
|
|
#include "AMDGPUSubtarget.h"
|
|
#include "Utils/AMDGPUBaseInfo.h"
|
|
#include "llvm/ADT/SmallPtrSet.h"
|
|
#include "llvm/ADT/SmallVector.h"
|
|
#include "llvm/ADT/StringRef.h"
|
|
#include "llvm/ADT/Triple.h"
|
|
#include "llvm/Analysis/CallGraph.h"
|
|
#include "llvm/Analysis/CallGraphSCCPass.h"
|
|
#include "llvm/CodeGen/TargetPassConfig.h"
|
|
#include "llvm/IR/Constant.h"
|
|
#include "llvm/IR/Constants.h"
|
|
#include "llvm/IR/Function.h"
|
|
#include "llvm/IR/Instruction.h"
|
|
#include "llvm/IR/Instructions.h"
|
|
#include "llvm/IR/Intrinsics.h"
|
|
#include "llvm/IR/Module.h"
|
|
#include "llvm/IR/Type.h"
|
|
#include "llvm/IR/Use.h"
|
|
#include "llvm/Pass.h"
|
|
#include "llvm/Support/Casting.h"
|
|
#include "llvm/Support/ErrorHandling.h"
|
|
#include "llvm/Target/TargetMachine.h"
|
|
|
|
#define DEBUG_TYPE "amdgpu-annotate-kernel-features"
|
|
|
|
using namespace llvm;
|
|
|
|
namespace {
|
|
|
|
class AMDGPUAnnotateKernelFeatures : public CallGraphSCCPass {
|
|
private:
|
|
const TargetMachine *TM = nullptr;
|
|
SmallVector<CallGraphNode*, 8> NodeList;
|
|
|
|
bool addFeatureAttributes(Function &F);
|
|
bool processUniformWorkGroupAttribute();
|
|
bool propagateUniformWorkGroupAttribute(Function &Caller, Function &Callee);
|
|
|
|
public:
|
|
static char ID;
|
|
|
|
AMDGPUAnnotateKernelFeatures() : CallGraphSCCPass(ID) {}
|
|
|
|
bool doInitialization(CallGraph &CG) override;
|
|
bool runOnSCC(CallGraphSCC &SCC) override;
|
|
|
|
StringRef getPassName() const override {
|
|
return "AMDGPU Annotate Kernel Features";
|
|
}
|
|
|
|
void getAnalysisUsage(AnalysisUsage &AU) const override {
|
|
AU.setPreservesAll();
|
|
CallGraphSCCPass::getAnalysisUsage(AU);
|
|
}
|
|
|
|
static bool visitConstantExpr(const ConstantExpr *CE);
|
|
static bool visitConstantExprsRecursively(
|
|
const Constant *EntryC,
|
|
SmallPtrSet<const Constant *, 8> &ConstantExprVisited, bool IsFunc,
|
|
bool HasApertureRegs);
|
|
};
|
|
|
|
} // end anonymous namespace
|
|
|
|
char AMDGPUAnnotateKernelFeatures::ID = 0;
|
|
|
|
char &llvm::AMDGPUAnnotateKernelFeaturesID = AMDGPUAnnotateKernelFeatures::ID;
|
|
|
|
INITIALIZE_PASS(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE,
|
|
"Add AMDGPU function attributes", false, false)
|
|
|
|
|
|
// The queue ptr is only needed when casting to flat, not from it.
|
|
static bool castRequiresQueuePtr(unsigned SrcAS) {
|
|
return SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::PRIVATE_ADDRESS;
|
|
}
|
|
|
|
static bool castRequiresQueuePtr(const AddrSpaceCastInst *ASC) {
|
|
return castRequiresQueuePtr(ASC->getSrcAddressSpace());
|
|
}
|
|
|
|
static bool isDSAddress(const Constant *C) {
|
|
const GlobalValue *GV = dyn_cast<GlobalValue>(C);
|
|
if (!GV)
|
|
return false;
|
|
unsigned AS = GV->getAddressSpace();
|
|
return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS;
|
|
}
|
|
|
|
bool AMDGPUAnnotateKernelFeatures::visitConstantExpr(const ConstantExpr *CE) {
|
|
if (CE->getOpcode() == Instruction::AddrSpaceCast) {
|
|
unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace();
|
|
return castRequiresQueuePtr(SrcAS);
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
bool AMDGPUAnnotateKernelFeatures::visitConstantExprsRecursively(
|
|
const Constant *EntryC,
|
|
SmallPtrSet<const Constant *, 8> &ConstantExprVisited,
|
|
bool IsFunc, bool HasApertureRegs) {
|
|
|
|
if (!ConstantExprVisited.insert(EntryC).second)
|
|
return false;
|
|
|
|
SmallVector<const Constant *, 16> Stack;
|
|
Stack.push_back(EntryC);
|
|
|
|
while (!Stack.empty()) {
|
|
const Constant *C = Stack.pop_back_val();
|
|
|
|
// We need to trap on DS globals in non-entry functions.
|
|
if (IsFunc && isDSAddress(C))
|
|
return true;
|
|
|
|
// Check this constant expression.
|
|
if (const auto *CE = dyn_cast<ConstantExpr>(C)) {
|
|
if (!HasApertureRegs && visitConstantExpr(CE))
|
|
return true;
|
|
}
|
|
|
|
// Visit all sub-expressions.
|
|
for (const Use &U : C->operands()) {
|
|
const auto *OpC = dyn_cast<Constant>(U);
|
|
if (!OpC)
|
|
continue;
|
|
|
|
if (!ConstantExprVisited.insert(OpC).second)
|
|
continue;
|
|
|
|
Stack.push_back(OpC);
|
|
}
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
// We do not need to note the x workitem or workgroup id because they are always
|
|
// initialized.
|
|
//
|
|
// TODO: We should not add the attributes if the known compile time workgroup
|
|
// size is 1 for y/z.
|
|
static StringRef intrinsicToAttrName(Intrinsic::ID ID,
|
|
bool &NonKernelOnly,
|
|
bool &IsQueuePtr) {
|
|
switch (ID) {
|
|
case Intrinsic::amdgcn_workitem_id_x:
|
|
NonKernelOnly = true;
|
|
return "amdgpu-work-item-id-x";
|
|
case Intrinsic::amdgcn_workgroup_id_x:
|
|
NonKernelOnly = true;
|
|
return "amdgpu-work-group-id-x";
|
|
case Intrinsic::amdgcn_workitem_id_y:
|
|
case Intrinsic::r600_read_tidig_y:
|
|
return "amdgpu-work-item-id-y";
|
|
case Intrinsic::amdgcn_workitem_id_z:
|
|
case Intrinsic::r600_read_tidig_z:
|
|
return "amdgpu-work-item-id-z";
|
|
case Intrinsic::amdgcn_workgroup_id_y:
|
|
case Intrinsic::r600_read_tgid_y:
|
|
return "amdgpu-work-group-id-y";
|
|
case Intrinsic::amdgcn_workgroup_id_z:
|
|
case Intrinsic::r600_read_tgid_z:
|
|
return "amdgpu-work-group-id-z";
|
|
case Intrinsic::amdgcn_dispatch_ptr:
|
|
return "amdgpu-dispatch-ptr";
|
|
case Intrinsic::amdgcn_dispatch_id:
|
|
return "amdgpu-dispatch-id";
|
|
case Intrinsic::amdgcn_kernarg_segment_ptr:
|
|
return "amdgpu-kernarg-segment-ptr";
|
|
case Intrinsic::amdgcn_implicitarg_ptr:
|
|
return "amdgpu-implicitarg-ptr";
|
|
case Intrinsic::amdgcn_queue_ptr:
|
|
case Intrinsic::amdgcn_is_shared:
|
|
case Intrinsic::amdgcn_is_private:
|
|
// TODO: Does not require queue ptr on gfx9+
|
|
case Intrinsic::trap:
|
|
case Intrinsic::debugtrap:
|
|
IsQueuePtr = true;
|
|
return "amdgpu-queue-ptr";
|
|
default:
|
|
return "";
|
|
}
|
|
}
|
|
|
|
// Copies the function attribute \p Name onto \p Parent when \p Callee has
// it. Returns true if the attribute was added to \p Parent.
static bool handleAttr(Function &Parent, const Function &Callee,
                       StringRef Name) {
  if (!Callee.hasFnAttribute(Name))
    return false;

  Parent.addFnAttr(Name);
  return true;
}
|
|
|
|
// Mirrors the feature attributes found on \p Callee onto \p Parent.
// \p NeedQueuePtr is set to true when "amdgpu-queue-ptr" was copied and is
// otherwise left as it was.
static void copyFeaturesToFunction(Function &Parent, const Function &Callee,
                                   bool &NeedQueuePtr) {
  // X ids unnecessarily propagated to kernels.
  static constexpr StringLiteral AttrNames[] = {
      "amdgpu-work-item-id-x",  "amdgpu-work-item-id-y",
      "amdgpu-work-item-id-z",  "amdgpu-work-group-id-x",
      "amdgpu-work-group-id-y", "amdgpu-work-group-id-z",
      "amdgpu-dispatch-ptr",    "amdgpu-dispatch-id",
      "amdgpu-implicitarg-ptr"};

  // The queue pointer is handled separately because the caller tracks it.
  const bool CopiedQueuePtr = handleAttr(Parent, Callee, "amdgpu-queue-ptr");
  NeedQueuePtr |= CopiedQueuePtr;

  for (const StringLiteral &Attr : AttrNames)
    handleAttr(Parent, Callee, Attr);
}
|
|
|
|
bool AMDGPUAnnotateKernelFeatures::processUniformWorkGroupAttribute() {
|
|
bool Changed = false;
|
|
|
|
for (auto *Node : reverse(NodeList)) {
|
|
Function *Caller = Node->getFunction();
|
|
|
|
for (auto I : *Node) {
|
|
Function *Callee = std::get<1>(I)->getFunction();
|
|
if (Callee)
|
|
Changed = propagateUniformWorkGroupAttribute(*Caller, *Callee);
|
|
}
|
|
}
|
|
|
|
return Changed;
|
|
}
|
|
|
|
/// Pushes the "uniform-work-group-size" attribute from \p Caller down to
/// \p Callee.
///
/// - A callee without an exact definition is marked "false", and so is the
///   caller if it has no value yet.
/// - A caller without the attribute makes both functions "false".
/// - A caller whose value is not "true" makes the callee "false".
/// - A caller whose value is "true" passes "true" to a callee that has no
///   value yet; a callee that already has one is left alone.
///
/// \returns false only in the last case when the callee already had the
///          attribute; true otherwise.
bool AMDGPUAnnotateKernelFeatures::propagateUniformWorkGroupAttribute(
    Function &Caller, Function &Callee) {
  // Externally defined callee.
  if (!Callee.hasExactDefinition()) {
    Callee.addFnAttr("uniform-work-group-size", "false");
    if (!Caller.hasFnAttribute("uniform-work-group-size"))
      Caller.addFnAttr("uniform-work-group-size", "false");

    return true;
  }

  // An absent attribute on the caller counts as false for both sides.
  if (!Caller.hasFnAttribute("uniform-work-group-size")) {
    Caller.addFnAttr("uniform-work-group-size", "false");
    Callee.addFnAttr("uniform-work-group-size", "false");
    return true;
  }

  const bool CallerIsUniform =
      Caller.getFnAttribute("uniform-work-group-size")
          .getValueAsString().equals("true");
  if (!CallerIsUniform) {
    Callee.addFnAttr("uniform-work-group-size", "false");
    return true;
  }

  // The caller is uniform: hand that down unless the callee already has a
  // value of its own.
  if (Callee.hasFnAttribute("uniform-work-group-size"))
    return false;

  Callee.addFnAttr("uniform-work-group-size", "true");
  return true;
}
|
|
|
|
/// Scans every instruction of \p F and attaches string attributes describing
/// what the function uses:
///  - attributes copied from directly called non-intrinsic functions,
///  - attributes derived from the intrinsics it calls,
///  - "amdgpu-queue-ptr" when a call, an addrspacecast or a constant operand
///    requires the queue pointer,
///  - "amdgpu-calls" for entry functions containing a call,
///  - "amdgpu-stack-objects" when the function contains an alloca.
///
/// \returns true if an attribute was added to \p F.
bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
  const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
  bool HasApertureRegs = ST.hasApertureRegs();
  SmallPtrSet<const Constant *, 8> ConstantExprVisited;

  bool HaveStackObjects = false;
  bool Changed = false;
  bool NeedQueuePtr = false;
  bool HaveCall = false;
  bool IsFunc = !AMDGPU::isEntryFunctionCC(F.getCallingConv());

  for (BasicBlock &BB : F) {
    for (Instruction &I : BB) {
      if (isa<AllocaInst>(I)) {
        HaveStackObjects = true;
        continue;
      }

      if (auto *CB = dyn_cast<CallBase>(&I)) {
        const Function *Callee =
            dyn_cast<Function>(CB->getCalledOperand()->stripPointerCasts());

        // TODO: Do something with indirect calls.
        if (!Callee) {
          // Inline asm is not counted as a call.
          if (!CB->isInlineAsm())
            HaveCall = true;
          continue;
        }

        Intrinsic::ID IID = Callee->getIntrinsicID();
        if (IID == Intrinsic::not_intrinsic) {
          HaveCall = true;
          copyFeaturesToFunction(F, *Callee, NeedQueuePtr);
          Changed = true;
        } else {
          bool NonKernelOnly = false;

          if (!IsFunc && IID == Intrinsic::amdgcn_kernarg_segment_ptr) {
            F.addFnAttr("amdgpu-kernarg-segment-ptr");
            // Adding the attribute modifies the function, so it has to be
            // reported like every other attribute added here.
            Changed = true;
          } else {
            StringRef AttrName = intrinsicToAttrName(IID, NonKernelOnly,
                                                     NeedQueuePtr);
            // Attributes flagged NonKernelOnly are only added to non-entry
            // functions.
            if (!AttrName.empty() && (IsFunc || !NonKernelOnly)) {
              F.addFnAttr(AttrName);
              Changed = true;
            }
          }
        }
      }

      // Skip the remaining checks once the queue pointer is known to be
      // needed, or for entry functions on subtargets with aperture registers.
      if (NeedQueuePtr || (!IsFunc && HasApertureRegs))
        continue;

      if (const AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(&I)) {
        if (!HasApertureRegs && castRequiresQueuePtr(ASC)) {
          NeedQueuePtr = true;
          continue;
        }
      }

      // Look through the constant operands of the instruction.
      for (const Use &U : I.operands()) {
        const auto *OpC = dyn_cast<Constant>(U);
        if (!OpC)
          continue;

        if (visitConstantExprsRecursively(OpC, ConstantExprVisited, IsFunc,
                                          HasApertureRegs)) {
          NeedQueuePtr = true;
          break;
        }
      }
    }
  }

  if (NeedQueuePtr) {
    F.addFnAttr("amdgpu-queue-ptr");
    Changed = true;
  }

  // TODO: We could refine this to captured pointers that could possibly be
  // accessed by flat instructions. For now this is mostly a poor way of
  // estimating whether there are calls before argument lowering.
  if (!IsFunc && HaveCall) {
    F.addFnAttr("amdgpu-calls");
    Changed = true;
  }

  if (HaveStackObjects) {
    F.addFnAttr("amdgpu-stack-objects");
    Changed = true;
  }

  return Changed;
}
|
|
|
|
/// Handles one SCC of the call graph.
///
/// Nodes that are still referenced are queued in NodeList. When a node with
/// no references is reached, the queued nodes are processed for
/// "uniform-work-group-size" propagation and the queue is cleared. Every
/// node that has a function definition then gets its feature attributes.
///
/// \returns true if any function was modified.
bool AMDGPUAnnotateKernelFeatures::runOnSCC(CallGraphSCC &SCC) {
  bool Changed = false;

  for (CallGraphNode *I : SCC) {
    // Collect referenced nodes; an unreferenced node flushes the list.
    if (I->getNumReferences())
      NodeList.push_back(I);
    else {
      // Propagation can add attributes to functions, so its result has to
      // be folded into the return value instead of being dropped.
      Changed |= processUniformWorkGroupAttribute();
      NodeList.clear();
    }

    Function *F = I->getFunction();
    // Add feature attributes
    if (!F || F->isDeclaration())
      continue;
    Changed |= addFeatureAttributes(*F);
  }

  return Changed;
}
|
|
|
|
/// Fetches the TargetMachine from the TargetPassConfig and stores it in TM.
/// Reports a fatal error when no TargetPassConfig is available.
///
/// \returns false on success.
bool AMDGPUAnnotateKernelFeatures::doInitialization(CallGraph &CG) {
  if (auto *PassConfig = getAnalysisIfAvailable<TargetPassConfig>()) {
    TM = &PassConfig->getTM<TargetMachine>();
    return false;
  }

  report_fatal_error("TargetMachine is required");
}
|
|
|
|
/// Factory: returns a newly allocated AMDGPUAnnotateKernelFeatures pass.
Pass *llvm::createAMDGPUAnnotateKernelFeaturesPass() {
  Pass *NewPass = new AMDGPUAnnotateKernelFeatures();
  return NewPass;
}
|