//===- AMDGPUAttributor.cpp -----------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass uses the Attributor framework to deduce AMDGPU attributes.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/IPO/Attributor.h"

#define DEBUG_TYPE "amdgpu-attributor"

using namespace llvm;

static constexpr StringLiteral ImplicitAttrNames[] = {
    // X ids unnecessarily propagated to kernels.
    "amdgpu-work-item-id-x",  "amdgpu-work-item-id-y",
    "amdgpu-work-item-id-z",  "amdgpu-work-group-id-x",
    "amdgpu-work-group-id-y", "amdgpu-work-group-id-z",
    "amdgpu-dispatch-ptr",    "amdgpu-dispatch-id",
    "amdgpu-queue-ptr",       "amdgpu-implicitarg-ptr"};

// We do not need to note the x workitem or workgroup id because they are
// always initialized.
//
// TODO: We should not add the attributes if the known compile time workgroup
// size is 1 for y/z.
static StringRef intrinsicToAttrName(Intrinsic::ID ID, bool &NonKernelOnly,
                                     bool &IsQueuePtr) {
  switch (ID) {
  case Intrinsic::amdgcn_workitem_id_x:
    NonKernelOnly = true;
    return "amdgpu-work-item-id-x";
  case Intrinsic::amdgcn_workgroup_id_x:
    NonKernelOnly = true;
    return "amdgpu-work-group-id-x";
  case Intrinsic::amdgcn_workitem_id_y:
  case Intrinsic::r600_read_tidig_y:
    return "amdgpu-work-item-id-y";
  case Intrinsic::amdgcn_workitem_id_z:
  case Intrinsic::r600_read_tidig_z:
    return "amdgpu-work-item-id-z";
  case Intrinsic::amdgcn_workgroup_id_y:
  case Intrinsic::r600_read_tgid_y:
    return "amdgpu-work-group-id-y";
  case Intrinsic::amdgcn_workgroup_id_z:
  case Intrinsic::r600_read_tgid_z:
    return "amdgpu-work-group-id-z";
  case Intrinsic::amdgcn_dispatch_ptr:
    return "amdgpu-dispatch-ptr";
  case Intrinsic::amdgcn_dispatch_id:
    return "amdgpu-dispatch-id";
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    return "amdgpu-kernarg-segment-ptr";
  case Intrinsic::amdgcn_implicitarg_ptr:
    return "amdgpu-implicitarg-ptr";
  case Intrinsic::amdgcn_queue_ptr:
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private:
    // TODO: Does not require queue ptr on gfx9+
  case Intrinsic::trap:
  case Intrinsic::debugtrap:
    IsQueuePtr = true;
    return "amdgpu-queue-ptr";
  default:
    return "";
  }
}

static bool castRequiresQueuePtr(unsigned SrcAS) {
  return SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::PRIVATE_ADDRESS;
}

static bool isDSAddress(const Constant *C) {
  const GlobalValue *GV = dyn_cast<GlobalValue>(C);
  if (!GV)
    return false;
  unsigned AS = GV->getAddressSpace();
  return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS;
}

class AMDGPUInformationCache : public InformationCache {
public:
  AMDGPUInformationCache(const Module &M, AnalysisGetter &AG,
                         BumpPtrAllocator &Allocator,
                         SetVector<Function *> *CGSCC, TargetMachine &TM)
      : InformationCache(M, AG, Allocator, CGSCC), TM(TM) {}
  TargetMachine &TM;

  enum ConstantStatus { DS_GLOBAL = 1 << 0, ADDR_SPACE_CAST = 1 << 1 };

  /// Check if the subtarget has aperture regs.
  bool hasApertureRegs(Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.hasApertureRegs();
  }

private:
  /// Check if the ConstantExpr \p CE requires the queue ptr attribute.
  static bool visitConstExpr(const ConstantExpr *CE) {
    if (CE->getOpcode() == Instruction::AddrSpaceCast) {
      unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace();
      return castRequiresQueuePtr(SrcAS);
    }
    return false;
  }

  /// Get the constant access bitmap for \p C.
  uint8_t getConstantAccess(const Constant *C) {
    auto It = ConstantStatus.find(C);
    if (It != ConstantStatus.end())
      return It->second;

    uint8_t Result = 0;
    if (isDSAddress(C))
      Result = DS_GLOBAL;

    if (const auto *CE = dyn_cast<ConstantExpr>(C))
      if (visitConstExpr(CE))
        Result |= ADDR_SPACE_CAST;

    for (const Use &U : C->operands()) {
      const auto *OpC = dyn_cast<Constant>(U);
      if (!OpC)
        continue;

      Result |= getConstantAccess(OpC);
    }
    return Result;
  }

public:
  /// Returns true if \p Fn needs the queue ptr attribute because of \p C.
  bool needsQueuePtr(const Constant *C, Function &Fn) {
    bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(Fn.getCallingConv());
    bool HasAperture = hasApertureRegs(Fn);

    // No need to explore the constants.
    if (!IsNonEntryFunc && HasAperture)
      return false;

    uint8_t Access = getConstantAccess(C);

    // We need to trap on DS globals in non-entry functions.
    if (IsNonEntryFunc && (Access & DS_GLOBAL))
      return true;

    return !HasAperture && (Access & ADDR_SPACE_CAST);
  }

private:
  /// Used to determine if the Constant needs the queue ptr attribute.
  DenseMap<const Constant *, uint8_t> ConstantStatus;
};

struct AAAMDAttributes : public StateWrapper<BooleanState, AbstractAttribute> {
  using Base = StateWrapper<BooleanState, AbstractAttribute>;
  AAAMDAttributes(const IRPosition &IRP, Attributor &A) : Base(IRP) {}

  /// Create an abstract attribute view for the position \p IRP.
  static AAAMDAttributes &createForPosition(const IRPosition &IRP,
                                            Attributor &A);

  /// See AbstractAttribute::getName().
  const std::string getName() const override { return "AAAMDAttributes"; }

  /// See AbstractAttribute::getIdAddr().
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAAMDAttributes.
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  virtual const DenseSet<StringRef> &getAttributes() const = 0;

  /// Unique ID (due to the unique address)
  static const char ID;
};
const char AAAMDAttributes::ID = 0;

struct AAAMDWorkGroupSize
    : public StateWrapper<BooleanState, AbstractAttribute> {
  using Base = StateWrapper<BooleanState, AbstractAttribute>;
  AAAMDWorkGroupSize(const IRPosition &IRP, Attributor &A) : Base(IRP) {}

  /// Create an abstract attribute view for the position \p IRP.
  static AAAMDWorkGroupSize &createForPosition(const IRPosition &IRP,
                                               Attributor &A);

  /// See AbstractAttribute::getName().
  const std::string getName() const override { return "AAAMDWorkGroupSize"; }

  /// See AbstractAttribute::getIdAddr().
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAAMDWorkGroupSize.
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  /// Unique ID (due to the unique address)
  static const char ID;
};
const char AAAMDWorkGroupSize::ID = 0;

struct AAAMDWorkGroupSizeFunction : public AAAMDWorkGroupSize {
  AAAMDWorkGroupSizeFunction(const IRPosition &IRP, Attributor &A)
      : AAAMDWorkGroupSize(IRP, A) {}

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();
    CallingConv::ID CC = F->getCallingConv();

    if (CC != CallingConv::AMDGPU_KERNEL)
      return;

    bool InitialValue = false;
    if (F->hasFnAttribute("uniform-work-group-size"))
      InitialValue = F->getFnAttribute("uniform-work-group-size")
                         .getValueAsString()
                         .equals("true");

    if (InitialValue)
      indicateOptimisticFixpoint();
    else
      indicatePessimisticFixpoint();
  }

  ChangeStatus updateImpl(Attributor &A) override {
    ChangeStatus Change = ChangeStatus::UNCHANGED;

    auto CheckCallSite = [&](AbstractCallSite CS) {
      Function *Caller = CS.getInstruction()->getFunction();
      LLVM_DEBUG(dbgs() << "[AAAMDWorkGroupSize] Call " << Caller->getName()
                        << "->" << getAssociatedFunction()->getName() << "\n");

      const auto &CallerInfo = A.getAAFor<AAAMDWorkGroupSize>(
          *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);

      Change = Change | clampStateAndIndicateChange(this->getState(),
                                                    CallerInfo.getState());

      return true;
    };

    bool AllCallSitesKnown = true;
    if (!A.checkForAllCallSites(CheckCallSite, *this, true, AllCallSitesKnown))
      indicatePessimisticFixpoint();

    return Change;
  }

  ChangeStatus manifest(Attributor &A) override {
    SmallVector<Attribute, 8> AttrList;
    LLVMContext &Ctx = getAssociatedFunction()->getContext();

    AttrList.push_back(Attribute::get(Ctx, "uniform-work-group-size",
                                      getAssumed() ? "true" : "false"));
    return IRAttributeManifest::manifestAttrs(A, getIRPosition(), AttrList,
                                              /* ForceReplace */ true);
  }

  bool isValidState() const override {
    // This state is always valid, even when the assumed value is false.
    return true;
  }

  const std::string getAsStr() const override {
    return "AMDWorkGroupSize[" + std::to_string(getAssumed()) + "]";
  }

  /// See AbstractAttribute::trackStatistics()
  void trackStatistics() const override {}
};

AAAMDWorkGroupSize &AAAMDWorkGroupSize::createForPosition(const IRPosition &IRP,
                                                          Attributor &A) {
  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
    return *new (A.Allocator) AAAMDWorkGroupSizeFunction(IRP, A);
  llvm_unreachable("AAAMDWorkGroupSize is only valid for function position");
}

struct AAAMDAttributesFunction : public AAAMDAttributes {
  AAAMDAttributesFunction(const IRPosition &IRP, Attributor &A)
      : AAAMDAttributes(IRP, A) {}

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();
    CallingConv::ID CC = F->getCallingConv();
    bool CallingConvSupportsAllImplicits = (CC != CallingConv::AMDGPU_Gfx);

    // Don't add attributes to intrinsics.
    if (F->isIntrinsic()) {
      indicatePessimisticFixpoint();
      return;
    }

    // Ignore functions with graphics calling conventions; these are currently
    // not allowed to have kernel arguments.
    if (AMDGPU::isGraphics(F->getCallingConv())) {
      indicatePessimisticFixpoint();
      return;
    }

    for (StringRef Attr : ImplicitAttrNames) {
      if (F->hasFnAttribute(Attr))
        Attributes.insert(Attr);
    }

    // TODO: We shouldn't need this in the future.
    if (CallingConvSupportsAllImplicits &&
        F->hasAddressTaken(nullptr, true, true, true)) {
      for (StringRef AttrName : ImplicitAttrNames) {
        Attributes.insert(AttrName);
      }
    }
  }

  ChangeStatus updateImpl(Attributor &A) override {
    Function *F = getAssociatedFunction();
    ChangeStatus Change = ChangeStatus::UNCHANGED;
    bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(F->getCallingConv());
    CallingConv::ID CC = F->getCallingConv();
    bool CallingConvSupportsAllImplicits = (CC != CallingConv::AMDGPU_Gfx);
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());

    auto AddAttribute = [&](StringRef AttrName) {
      if (Attributes.insert(AttrName).second)
        Change = ChangeStatus::CHANGED;
    };

    // Check for intrinsics and propagate attributes.
    const AACallEdges &AAEdges = A.getAAFor<AACallEdges>(
        *this, this->getIRPosition(), DepClassTy::REQUIRED);

    // We have to assume that we can reach a function with these attributes.
    // We do not consider inline assembly as an unknown callee.
    if (CallingConvSupportsAllImplicits && AAEdges.hasNonAsmUnknownCallee()) {
      for (StringRef AttrName : ImplicitAttrNames) {
        AddAttribute(AttrName);
      }
    }

    bool NeedsQueuePtr = false;
    bool HasCall = false;
    for (Function *Callee : AAEdges.getOptimisticEdges()) {
      Intrinsic::ID IID = Callee->getIntrinsicID();
      if (IID != Intrinsic::not_intrinsic) {
        if (!IsNonEntryFunc && IID == Intrinsic::amdgcn_kernarg_segment_ptr) {
          AddAttribute("amdgpu-kernarg-segment-ptr");
          continue;
        }

        bool NonKernelOnly = false;
        StringRef AttrName =
            intrinsicToAttrName(IID, NonKernelOnly, NeedsQueuePtr);

        if (!AttrName.empty() && (IsNonEntryFunc || !NonKernelOnly))
          AddAttribute(AttrName);

        continue;
      }

      HasCall = true;
      const AAAMDAttributes &AAAMD = A.getAAFor<AAAMDAttributes>(
          *this, IRPosition::function(*Callee), DepClassTy::REQUIRED);
      const DenseSet<StringRef> &CalleeAttributes = AAAMD.getAttributes();
      // Propagate implicit attributes from the called function.
      for (StringRef AttrName : ImplicitAttrNames)
        if (CalleeAttributes.count(AttrName))
          AddAttribute(AttrName);
    }

    HasCall |= AAEdges.hasUnknownCallee();
    if (!IsNonEntryFunc && HasCall)
      AddAttribute("amdgpu-calls");

    // Check the function body.
    auto CheckAlloca = [&](Instruction &I) {
      AddAttribute("amdgpu-stack-objects");
      return false;
    };

    bool UsedAssumedInformation = false;
    A.checkForAllInstructions(CheckAlloca, *this, {Instruction::Alloca},
                              UsedAssumedInformation);

    // If we found that we need amdgpu-queue-ptr, there is nothing else to do.
    if (NeedsQueuePtr || Attributes.count("amdgpu-queue-ptr")) {
      AddAttribute("amdgpu-queue-ptr");
      return Change;
    }

    auto CheckAddrSpaceCasts = [&](Instruction &I) {
      unsigned SrcAS = static_cast<AddrSpaceCastInst &>(I).getSrcAddressSpace();
      if (castRequiresQueuePtr(SrcAS)) {
        NeedsQueuePtr = true;
        return false;
      }
      return true;
    };

    bool HasApertureRegs = InfoCache.hasApertureRegs(*F);

    // `checkForAllInstructions` is much cheaper than walking all instructions
    // ourselves, so try it first.
    // amdgpu-queue-ptr is not needed if aperture regs are present.
    if (!HasApertureRegs)
      A.checkForAllInstructions(CheckAddrSpaceCasts, *this,
                                {Instruction::AddrSpaceCast},
                                UsedAssumedInformation);

    // If we found that we need amdgpu-queue-ptr, there is nothing else to do.
    if (NeedsQueuePtr) {
      AddAttribute("amdgpu-queue-ptr");
      return Change;
    }

    if (!IsNonEntryFunc && HasApertureRegs)
      return Change;

    for (BasicBlock &BB : *F) {
      for (Instruction &I : BB) {
        for (const Use &U : I.operands()) {
          if (const auto *C = dyn_cast<Constant>(U)) {
            if (InfoCache.needsQueuePtr(C, *F)) {
              AddAttribute("amdgpu-queue-ptr");
              return Change;
            }
          }
        }
      }
    }

    return Change;
  }

  ChangeStatus manifest(Attributor &A) override {
    SmallVector<Attribute, 8> AttrList;
    LLVMContext &Ctx = getAssociatedFunction()->getContext();

    for (StringRef AttrName : Attributes)
      AttrList.push_back(Attribute::get(Ctx, AttrName));

    return IRAttributeManifest::manifestAttrs(A, getIRPosition(), AttrList,
                                              /* ForceReplace */ true);
  }

  const std::string getAsStr() const override {
    return "AMDInfo[" + std::to_string(Attributes.size()) + "]";
  }

  const DenseSet<StringRef> &getAttributes() const override {
    return Attributes;
  }

  /// See AbstractAttribute::trackStatistics()
  void trackStatistics() const override {}

private:
  DenseSet<StringRef> Attributes;
};

AAAMDAttributes &AAAMDAttributes::createForPosition(const IRPosition &IRP,
                                                    Attributor &A) {
  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
    return *new (A.Allocator) AAAMDAttributesFunction(IRP, A);
  llvm_unreachable("AAAMDAttributes is only valid for function position");
}

class AMDGPUAttributor : public ModulePass {
public:
  AMDGPUAttributor() : ModulePass(ID) {}

  /// doInitialization - Virtual method overridden by subclasses to do
  /// any necessary initialization before any pass is run.
  bool doInitialization(Module &) override {
    auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
    if (!TPC)
      report_fatal_error("TargetMachine is required");

    TM = &TPC->getTM<TargetMachine>();
    return false;
  }

  bool runOnModule(Module &M) override {
    SetVector<Function *> Functions;
    AnalysisGetter AG;
    for (Function &F : M)
      Functions.insert(&F);

    CallGraphUpdater CGUpdater;
    BumpPtrAllocator Allocator;
    AMDGPUInformationCache InfoCache(M, AG, Allocator, nullptr, *TM);
    Attributor A(Functions, InfoCache, CGUpdater);

    for (Function &F : M) {
      A.getOrCreateAAFor<AAAMDAttributes>(IRPosition::function(F));
      A.getOrCreateAAFor<AAAMDWorkGroupSize>(IRPosition::function(F));
    }

    ChangeStatus Change = A.run();
    return Change == ChangeStatus::CHANGED;
  }

  StringRef getPassName() const override { return "AMDGPU Attributor"; }
  TargetMachine *TM;
  static char ID;
};

char AMDGPUAttributor::ID = 0;

Pass *llvm::createAMDGPUAttributorPass() { return new AMDGPUAttributor(); }

INITIALIZE_PASS(AMDGPUAttributor, DEBUG_TYPE, "AMDGPU Attributor", false, false)