From 37c462f96a52a75ab49d72c6169ff00a90f8b64e Mon Sep 17 00:00:00 2001 From: hsmahesha Date: Mon, 21 Jun 2021 10:55:23 +0530 Subject: [PATCH] [AMDGPU] Replace non-kernel function uses of LDS globals by pointers. The main motivation behind pointer replacement of LDS use within non-kernel functions is - to *avoid* subsequent LDS lowering pass from directly packing LDS (assume large LDS) into a struct type which would otherwise cause allocating huge memory for struct instance within every kernel. Reviewed By: rampitec Differential Revision: https://reviews.llvm.org/D103225 --- lib/Target/AMDGPU/AMDGPU.h | 9 + .../AMDGPU/AMDGPULowerModuleLDSPass.cpp | 7 + .../AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp | 460 ++++++++++++++++++ lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 19 +- lib/Target/AMDGPU/CMakeLists.txt | 1 + lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp | 185 +++++++ lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h | 20 + test/CodeGen/AMDGPU/llc-pipeline.ll | 5 + .../replace-lds-by-ptr-call-diamond-shape.ll | 88 ++++ ...lace-lds-by-ptr-call-selected_functions.ll | 130 +++++ ...lace-lds-by-ptr-ignore-global-scope-use.ll | 53 ++ ...place-lds-by-ptr-ignore-inline-asm-call.ll | 30 ++ ...-lds-by-ptr-ignore-kernel-only-used-lds.ll | 25 + ...ace-lds-by-ptr-ignore-not-reachable-lds.ll | 28 ++ .../replace-lds-by-ptr-ignore-small-lds.ll | 31 ++ ...-lds-by-ptr-indirect-call-diamond-shape.ll | 95 ++++ ...by-ptr-indirect-call-selected_functions.ll | 151 ++++++ ...ds-by-ptr-indirect-call-signature-match.ll | 94 ++++ .../AMDGPU/replace-lds-by-ptr-lds-offsets.ll | 214 ++++++++ .../replace-lds-by-ptr-use-multiple-lds.ll | 66 +++ .../AMDGPU/replace-lds-by-ptr-use-same-lds.ll | 53 ++ ...place-lds-by-ptr-use-within-const-expr1.ll | 54 ++ ...place-lds-by-ptr-use-within-const-expr2.ll | 58 +++ .../replace-lds-by-ptr-use-within-phi-inst.ll | 93 ++++ 24 files changed, 1968 insertions(+), 1 deletion(-) create mode 100644 lib/Target/AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp create mode 100644 
test/CodeGen/AMDGPU/replace-lds-by-ptr-call-diamond-shape.ll create mode 100644 test/CodeGen/AMDGPU/replace-lds-by-ptr-call-selected_functions.ll create mode 100644 test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-global-scope-use.ll create mode 100644 test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-inline-asm-call.ll create mode 100644 test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-kernel-only-used-lds.ll create mode 100644 test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-not-reachable-lds.ll create mode 100644 test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-small-lds.ll create mode 100644 test/CodeGen/AMDGPU/replace-lds-by-ptr-indirect-call-diamond-shape.ll create mode 100644 test/CodeGen/AMDGPU/replace-lds-by-ptr-indirect-call-selected_functions.ll create mode 100644 test/CodeGen/AMDGPU/replace-lds-by-ptr-indirect-call-signature-match.ll create mode 100644 test/CodeGen/AMDGPU/replace-lds-by-ptr-lds-offsets.ll create mode 100644 test/CodeGen/AMDGPU/replace-lds-by-ptr-use-multiple-lds.ll create mode 100644 test/CodeGen/AMDGPU/replace-lds-by-ptr-use-same-lds.ll create mode 100644 test/CodeGen/AMDGPU/replace-lds-by-ptr-use-within-const-expr1.ll create mode 100644 test/CodeGen/AMDGPU/replace-lds-by-ptr-use-within-const-expr2.ll create mode 100644 test/CodeGen/AMDGPU/replace-lds-by-ptr-use-within-phi-inst.ll diff --git a/lib/Target/AMDGPU/AMDGPU.h b/lib/Target/AMDGPU/AMDGPU.h index a38d0a779bd..2cfda5533db 100644 --- a/lib/Target/AMDGPU/AMDGPU.h +++ b/lib/Target/AMDGPU/AMDGPU.h @@ -71,6 +71,7 @@ FunctionPass *createAMDGPUMachineCFGStructurizerPass(); FunctionPass *createAMDGPUPropagateAttributesEarlyPass(const TargetMachine *); ModulePass *createAMDGPUPropagateAttributesLatePass(const TargetMachine *); FunctionPass *createAMDGPURewriteOutArgumentsPass(); +ModulePass *createAMDGPUReplaceLDSUseWithPointerPass(); ModulePass *createAMDGPULowerModuleLDSPass(); FunctionPass *createSIModeRegisterPass(); @@ -146,6 +147,14 @@ private: TargetMachine &TM; }; +void 
initializeAMDGPUReplaceLDSUseWithPointerPass(PassRegistry &); +extern char &AMDGPUReplaceLDSUseWithPointerID; + +struct AMDGPUReplaceLDSUseWithPointerPass + : PassInfoMixin { + PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); +}; + void initializeAMDGPULowerModuleLDSPass(PassRegistry &); extern char &AMDGPULowerModuleLDSID; diff --git a/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp index 937daea6bc2..48e3ad68a4f 100644 --- a/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp +++ b/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp @@ -24,6 +24,13 @@ // A possible future refinement is to specialise the structure per-kernel, so // that fields can be elided based on more expensive analysis. // +// NOTE: Since this pass will directly pack LDS (assume large LDS) into a struct +// type which would cause allocating huge memory for struct instance within +// every kernel. Hence, before running this pass, it is advisable to run the +// pass "amdgpu-replace-lds-use-with-pointer" which will replace LDS uses within +// non-kernel functions by pointers and thereby minimizes the unnecessary per +// kernel allocation of LDS memory. +// //===----------------------------------------------------------------------===// #include "AMDGPU.h" diff --git a/lib/Target/AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp b/lib/Target/AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp new file mode 100644 index 00000000000..ca35cc5da5b --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp @@ -0,0 +1,460 @@ +//===-- AMDGPUReplaceLDSUseWithPointer.cpp --------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This pass replaces all the uses of LDS within non-kernel functions by +// corresponding pointer counter-parts. +// +// The main motivation behind this pass is - to *avoid* subsequent LDS lowering +// pass from directly packing LDS (assume large LDS) into a struct type which +// would otherwise cause allocating huge memory for struct instance within every +// kernel. +// +// Brief sketch of the algorithm implemented in this pass is as below: +// +// 1. Collect all the LDS defined in the module which qualify for pointer +// replacement, say it is, LDSGlobals set. +// +// 2. Collect all the reachable callees for each kernel defined in the module, +// say it is, KernelToCallees map. +// +// 3. FOR (each global GV from LDSGlobals set) DO +// LDSUsedNonKernels = Collect all non-kernel functions which use GV. +// FOR (each kernel K in KernelToCallees map) DO +// ReachableCallees = KernelToCallees[K] +// ReachableAndLDSUsedCallees = +// SetIntersect(LDSUsedNonKernels, ReachableCallees) +// IF (ReachableAndLDSUsedCallees is not empty) THEN +// Pointer = Create a pointer to point-to GV if not created. +// Initialize Pointer to point-to GV within kernel K. +// ENDIF +// ENDFOR +// Replace all uses of GV within non kernel functions by Pointer. 
+// ENDFOR +// +// LLVM IR example: +// +// Input IR: +// +// @lds = internal addrspace(3) global [4 x i32] undef, align 16 +// +// define internal void @f0() { +// entry: +// %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @lds, +// i32 0, i32 0 +// ret void +// } +// +// define protected amdgpu_kernel void @k0() { +// entry: +// call void @f0() +// ret void +// } +// +// Output IR: +// +// @lds = internal addrspace(3) global [4 x i32] undef, align 16 +// @lds.ptr = internal unnamed_addr addrspace(3) global i16 undef, align 2 +// +// define internal void @f0() { +// entry: +// %0 = load i16, i16 addrspace(3)* @lds.ptr, align 2 +// %1 = getelementptr i8, i8 addrspace(3)* null, i16 %0 +// %2 = bitcast i8 addrspace(3)* %1 to [4 x i32] addrspace(3)* +// %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* %2, +// i32 0, i32 0 +// ret void +// } +// +// define protected amdgpu_kernel void @k0() { +// entry: +// store i16 ptrtoint ([4 x i32] addrspace(3)* @lds to i16), +// i16 addrspace(3)* @lds.ptr, align 2 +// call void @f0() +// ret void +// } +// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "GCNSubtarget.h" +#include "Utils/AMDGPUBaseInfo.h" +#include "Utils/AMDGPULDSUtils.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SetOperations.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InlineAsm.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicsAMDGPU.h" +#include "llvm/IR/ReplaceConstant.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" +#include "llvm/Support/Debug.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/ModuleUtils.h" +#include +#include + +#define DEBUG_TYPE "amdgpu-replace-lds-use-with-pointer" + 
+using namespace llvm; + +namespace { + +class ReplaceLDSUseImpl { + Module &M; + LLVMContext &Ctx; + const DataLayout &DL; + Constant *LDSMemBaseAddr; + + DenseMap LDSToPointer; + DenseMap> LDSToNonKernels; + DenseMap> KernelToCallees; + DenseMap> KernelToLDSPointers; + DenseMap KernelToInitBB; + DenseMap> + FunctionToLDSToReplaceInst; + + // Collect LDS which requires their uses to be replaced by pointer. + std::vector collectLDSRequiringPointerReplace() { + // Collect LDS which requires module lowering. + std::vector LDSGlobals = AMDGPU::findVariablesToLower(M); + + // Remove LDS which don't qualify for replacement. + LDSGlobals.erase(std::remove_if(LDSGlobals.begin(), LDSGlobals.end(), + [&](GlobalVariable *GV) { + return shouldIgnorePointerReplacement(GV); + }), + LDSGlobals.end()); + + return LDSGlobals; + } + + // Returns true if uses of given LDS global within non-kernel functions should + // be keep as it is without pointer replacement. + bool shouldIgnorePointerReplacement(GlobalVariable *GV) { + // LDS whose size is very small and doesn`t exceed pointer size is not worth + // replacing. + if (DL.getTypeAllocSize(GV->getValueType()) <= 2) + return true; + + // LDS which is not used from non-kernel function scope or it is used from + // global scope does not qualify for replacement. + LDSToNonKernels[GV] = AMDGPU::collectNonKernelAccessorsOfLDS(GV); + return LDSToNonKernels[GV].empty(); + + // FIXME: When GV is used within all (or within most of the kernels), then + // it does not make sense to create a pointer for it. + } + + // Insert new global LDS pointer which points to LDS. + GlobalVariable *createLDSPointer(GlobalVariable *GV) { + // LDS pointer which points to LDS is already created? return it. + auto PointerEntry = LDSToPointer.insert(std::make_pair(GV, nullptr)); + if (!PointerEntry.second) + return PointerEntry.first->second; + + // We need to create new LDS pointer which points to LDS. 
+ // + // Each CU owns at max 64K of LDS memory, so LDS address ranges from 0 to + // 2^16 - 1. Hence 16 bit pointer is enough to hold the LDS address. + auto *I16Ty = Type::getInt16Ty(Ctx); + GlobalVariable *LDSPointer = new GlobalVariable( + M, I16Ty, false, GlobalValue::InternalLinkage, UndefValue::get(I16Ty), + GV->getName() + Twine(".ptr"), nullptr, GlobalVariable::NotThreadLocal, + AMDGPUAS::LOCAL_ADDRESS); + + LDSPointer->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); + LDSPointer->setAlignment(AMDGPU::getAlign(DL, LDSPointer)); + + // Mark that an associated LDS pointer is created for LDS. + LDSToPointer[GV] = LDSPointer; + + return LDSPointer; + } + + // Split entry basic block in such a way that only lane 0 of each wave does + // the LDS pointer initialization, and return newly created basic block. + BasicBlock *activateLaneZero(Function *K) { + // If the entry basic block of kernel K is already splitted, then return + // newly created basic block. + auto BasicBlockEntry = KernelToInitBB.insert(std::make_pair(K, nullptr)); + if (!BasicBlockEntry.second) + return BasicBlockEntry.first->second; + + // Split entry basic block of kernel K. + auto *EI = &(*(K->getEntryBlock().getFirstInsertionPt())); + IRBuilder<> Builder(EI); + + Value *Mbcnt = + Builder.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {}, + {Builder.getInt32(-1), Builder.getInt32(0)}); + Value *Cond = Builder.CreateICmpEQ(Mbcnt, Builder.getInt32(0)); + Instruction *WB = cast( + Builder.CreateIntrinsic(Intrinsic::amdgcn_wave_barrier, {}, {})); + + BasicBlock *NBB = SplitBlockAndInsertIfThen(Cond, WB, false)->getParent(); + + // Mark that the entry basic block of kernel K is splitted. + KernelToInitBB[K] = NBB; + + return NBB; + } + + // Within given kernel, initialize given LDS pointer to point to given LDS. + void initializeLDSPointer(Function *K, GlobalVariable *GV, + GlobalVariable *LDSPointer) { + // If LDS pointer is already initialized within K, then nothing to do. 
+ auto PointerEntry = KernelToLDSPointers.insert( + std::make_pair(K, SmallPtrSet())); + if (!PointerEntry.second) + if (PointerEntry.first->second.contains(LDSPointer)) + return; + + // Insert instructions at EI which initialize LDS pointer to point-to LDS + // within kernel K. + // + // That is, convert pointer type of GV to i16, and then store this converted + // i16 value within LDSPointer which is of type i16*. + auto *EI = &(*(activateLaneZero(K)->getFirstInsertionPt())); + IRBuilder<> Builder(EI); + Builder.CreateStore(Builder.CreatePtrToInt(GV, Type::getInt16Ty(Ctx)), + LDSPointer); + + // Mark that LDS pointer is initialized within kernel K. + KernelToLDSPointers[K].insert(LDSPointer); + } + + // We have created an LDS pointer for LDS, and initialized it to point-to LDS + // within all relevent kernels. Now replace all the uses of LDS within + // non-kernel functions by LDS pointer. + void replaceLDSUseByPointer(GlobalVariable *GV, GlobalVariable *LDSPointer) { + SmallVector LDSUsers(GV->users()); + for (auto *U : LDSUsers) { + // When `U` is a constant expression, it is possible that same constant + // expression exists within multiple instructions, and within multiple + // non-kernel functions. Collect all those non-kernel functions and all + // those instructions within which `U` exist. + auto FunctionToInsts = + AMDGPU::getFunctionToInstsMap(U, false /*=CollectKernelInsts*/); + + for (auto FI = FunctionToInsts.begin(), FE = FunctionToInsts.end(); + FI != FE; ++FI) { + Function *F = FI->first; + auto &Insts = FI->second; + for (auto *I : Insts) { + // If `U` is a constant expression, then we need to break the + // associated instruction into a set of separate instructions by + // converting constant expressions into instructions. + SmallPtrSet UserInsts; + + if (U == I) { + // `U` is an instruction, conversion from constant expression to + // set of instructions is *not* required. 
+ UserInsts.insert(I); + } else { + // `U` is a constant expression, convert it into corresponding set + // of instructions. + auto *CE = cast(U); + convertConstantExprsToInstructions(I, CE, &UserInsts); + } + + // Go through all the user instrutions, if LDS exist within them as an + // operand, then replace it by replace instruction. + for (auto *II : UserInsts) { + auto *ReplaceInst = getReplacementInst(F, GV, LDSPointer); + II->replaceUsesOfWith(GV, ReplaceInst); + } + } + } + } + } + + // Create a set of replacement instructions which together replace LDS within + // non-kernel function F by accessing LDS indirectly using LDS pointer. + Value *getReplacementInst(Function *F, GlobalVariable *GV, + GlobalVariable *LDSPointer) { + // If the instruction which replaces LDS within F is already created, then + // return it. + auto LDSEntry = FunctionToLDSToReplaceInst.insert( + std::make_pair(F, DenseMap())); + if (!LDSEntry.second) { + auto ReplaceInstEntry = + LDSEntry.first->second.insert(std::make_pair(GV, nullptr)); + if (!ReplaceInstEntry.second) + return ReplaceInstEntry.first->second; + } + + // Get the instruction insertion point within the beginning of the entry + // block of current non-kernel function. + auto *EI = &(*(F->getEntryBlock().getFirstInsertionPt())); + IRBuilder<> Builder(EI); + + // Insert required set of instructions which replace LDS within F. + auto *V = Builder.CreateBitCast( + Builder.CreateGEP( + LDSMemBaseAddr, + Builder.CreateLoad(LDSPointer->getValueType(), LDSPointer)), + GV->getType()); + + // Mark that the replacement instruction which replace LDS within F is + // created. 
+ FunctionToLDSToReplaceInst[F][GV] = V; + + return V; + } + +public: + ReplaceLDSUseImpl(Module &M) + : M(M), Ctx(M.getContext()), DL(M.getDataLayout()) { + LDSMemBaseAddr = Constant::getIntegerValue( + PointerType::get(Type::getInt8Ty(M.getContext()), + AMDGPUAS::LOCAL_ADDRESS), + APInt(32, 0)); + } + + // Entry-point function which interface ReplaceLDSUseImpl with outside of the + // class. + bool replaceLDSUse(); + +private: + // For a given LDS from collected LDS globals set, replace its non-kernel + // function scope uses by pointer. + bool replaceLDSUse(GlobalVariable *GV); +}; + +// For given LDS from collected LDS globals set, replace its non-kernel function +// scope uses by pointer. +bool ReplaceLDSUseImpl::replaceLDSUse(GlobalVariable *GV) { + // Holds all those non-kernel functions within which LDS is being accessed. + SmallPtrSet &LDSAccessors = LDSToNonKernels[GV]; + + // The LDS pointer which points to LDS and replaces all the uses of LDS. + GlobalVariable *LDSPointer = nullptr; + + // Traverse through each kernel K, check and if required, initialize the + // LDS pointer to point to LDS within K. + for (auto KI = KernelToCallees.begin(), KE = KernelToCallees.end(); KI != KE; + ++KI) { + Function *K = KI->first; + SmallPtrSet Callees = KI->second; + + // Compute reachable and LDS used callees for kernel K. + set_intersect(Callees, LDSAccessors); + + // None of the LDS accessing non-kernel functions are reachable from + // kernel K. Hence, no need to initialize LDS pointer within kernel K. + if (Callees.empty()) + continue; + + // We have found reachable and LDS used callees for kernel K, and we need to + // initialize LDS pointer within kernel K, and we need to replace LDS use + // within those callees by LDS pointer. + // + // But, first check if LDS pointer is already created, if not create one. + LDSPointer = createLDSPointer(GV); + + // Initialize LDS pointer to point to LDS within kernel K. 
+ initializeLDSPointer(K, GV, LDSPointer); + } + + // We have not found reachable and LDS used callees for any of the kernels, + // and hence we have not created LDS pointer. + if (!LDSPointer) + return false; + + // We have created an LDS pointer for LDS, and initialized it to point-to LDS + // within all relevent kernels. Now replace all the uses of LDS within + // non-kernel functions by LDS pointer. + replaceLDSUseByPointer(GV, LDSPointer); + + return true; +} + +// Entry-point function which interface ReplaceLDSUseImpl with outside of the +// class. +bool ReplaceLDSUseImpl::replaceLDSUse() { + // Collect LDS which requires their uses to be replaced by pointer. + std::vector LDSGlobals = + collectLDSRequiringPointerReplace(); + + // No LDS to pointer-replace. Nothing to do. + if (LDSGlobals.empty()) + return false; + + // Collect reachable callee set for each kernel defined in the module. + AMDGPU::collectReachableCallees(M, KernelToCallees); + + if (KernelToCallees.empty()) { + // Either module does not have any kernel definitions, or none of the kernel + // has a call to non-kernel functions, or we could not resolve any of the + // call sites to proper non-kernel functions, because of the situations like + // inline asm calls. Nothing to replace. + return false; + } + + // For every LDS from collected LDS globals set, replace its non-kernel + // function scope use by pointer. 
+ bool Changed = false; + for (auto *GV : LDSGlobals) + Changed |= replaceLDSUse(GV); + + return Changed; +} + +class AMDGPUReplaceLDSUseWithPointer : public ModulePass { +public: + static char ID; + + AMDGPUReplaceLDSUseWithPointer() : ModulePass(ID) { + initializeAMDGPUReplaceLDSUseWithPointerPass( + *PassRegistry::getPassRegistry()); + } + + bool runOnModule(Module &M) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + } +}; + +} // namespace + +char AMDGPUReplaceLDSUseWithPointer::ID = 0; +char &llvm::AMDGPUReplaceLDSUseWithPointerID = + AMDGPUReplaceLDSUseWithPointer::ID; + +INITIALIZE_PASS_BEGIN( + AMDGPUReplaceLDSUseWithPointer, DEBUG_TYPE, + "Replace within non-kernel function use of LDS with pointer", + false /*only look at the cfg*/, false /*analysis pass*/) +INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) +INITIALIZE_PASS_END( + AMDGPUReplaceLDSUseWithPointer, DEBUG_TYPE, + "Replace within non-kernel function use of LDS with pointer", + false /*only look at the cfg*/, false /*analysis pass*/) + +bool AMDGPUReplaceLDSUseWithPointer::runOnModule(Module &M) { + ReplaceLDSUseImpl LDSUseReplacer{M}; + return LDSUseReplacer.replaceLDSUse(); +} + +ModulePass *llvm::createAMDGPUReplaceLDSUseWithPointerPass() { + return new AMDGPUReplaceLDSUseWithPointer(); +} + +PreservedAnalyses +AMDGPUReplaceLDSUseWithPointerPass::run(Module &M, ModuleAnalysisManager &AM) { + ReplaceLDSUseImpl LDSUseReplacer{M}; + LDSUseReplacer.replaceLDSUse(); + return PreservedAnalyses::all(); +} diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 81a86b2ac1d..63b3a8d3b29 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -193,6 +193,11 @@ static cl::opt EnableStructurizerWorkarounds( cl::desc("Enable workarounds for the StructurizeCFG pass"), cl::init(true), cl::Hidden); +static cl::opt EnableLDSReplaceWithPointer( + 
"amdgpu-enable-lds-replace-with-pointer", + cl::desc("Enable LDS replace with pointer pass"), cl::init(true), + cl::Hidden); + static cl::opt EnableLowerModuleLDS( "amdgpu-enable-lower-module-lds", cl::desc("Enable lower module lds pass"), cl::location(AMDGPUTargetMachine::EnableLowerModuleLDS), cl::init(true), @@ -240,6 +245,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeAMDGPULateCodeGenPreparePass(*PR); initializeAMDGPUPropagateAttributesEarlyPass(*PR); initializeAMDGPUPropagateAttributesLatePass(*PR); + initializeAMDGPUReplaceLDSUseWithPointerPass(*PR); initializeAMDGPULowerModuleLDSPass(*PR); initializeAMDGPURewriteOutArgumentsPass(*PR); initializeAMDGPUUnifyMetadataPass(*PR); @@ -505,6 +511,10 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { PM.addPass(AMDGPUAlwaysInlinePass()); return true; } + if (PassName == "amdgpu-replace-lds-use-with-pointer") { + PM.addPass(AMDGPUReplaceLDSUseWithPointerPass()); + return true; + } if (PassName == "amdgpu-lower-module-lds") { PM.addPass(AMDGPULowerModuleLDSPass()); return true; @@ -889,8 +899,15 @@ void AMDGPUPassConfig::addIRPasses() { addPass(createAMDGPUOpenCLEnqueuedBlockLoweringPass()); // Can increase LDS used by kernel so runs before PromoteAlloca - if (EnableLowerModuleLDS) + if (EnableLowerModuleLDS) { + // The pass "amdgpu-replace-lds-use-with-pointer" need to be run before the + // pass "amdgpu-lower-module-lds", and also it required to be run only if + // "amdgpu-lower-module-lds" pass is enabled. 
+ if (EnableLDSReplaceWithPointer) + addPass(createAMDGPUReplaceLDSUseWithPointerPass()); + addPass(createAMDGPULowerModuleLDSPass()); + } if (TM.getOptLevel() > CodeGenOpt::None) { addPass(createInferAddressSpacesPass()); diff --git a/lib/Target/AMDGPU/CMakeLists.txt b/lib/Target/AMDGPU/CMakeLists.txt index 41d58d5b76b..bf44ad6a000 100644 --- a/lib/Target/AMDGPU/CMakeLists.txt +++ b/lib/Target/AMDGPU/CMakeLists.txt @@ -81,6 +81,7 @@ add_llvm_target(AMDGPUCodeGen AMDGPUPropagateAttributes.cpp AMDGPURegBankCombiner.cpp AMDGPURegisterBankInfo.cpp + AMDGPUReplaceLDSUseWithPointer.cpp AMDGPURewriteOutArguments.cpp AMDGPUSubtarget.cpp AMDGPUTargetMachine.cpp diff --git a/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp b/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp index fd704faab2a..a8a023f4b54 100644 --- a/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp +++ b/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp @@ -12,7 +12,9 @@ #include "AMDGPULDSUtils.h" #include "Utils/AMDGPUBaseInfo.h" +#include "llvm/ADT/SCCIterator.h" #include "llvm/ADT/SetVector.h" +#include "llvm/Analysis/CallGraph.h" #include "llvm/IR/Constants.h" #include "llvm/IR/ReplaceConstant.h" @@ -22,6 +24,189 @@ namespace llvm { namespace AMDGPU { +// An helper class for collecting all reachable callees for each kernel defined +// within the module. +class CollectReachableCallees { + Module &M; + CallGraph CG; + SmallPtrSet AddressTakenFunctions; + + // Collect all address taken functions within the module. + void collectAddressTakenFunctions() { + auto *ECNode = CG.getExternalCallingNode(); + + for (auto GI = ECNode->begin(), GE = ECNode->end(); GI != GE; ++GI) { + auto *CGN = GI->second; + auto *F = CGN->getFunction(); + if (!F || F->isDeclaration() || AMDGPU::isKernelCC(F)) + continue; + AddressTakenFunctions.insert(CGN); + } + } + + // For a given caller node, collect all reachable callee nodes. 
+ SmallPtrSet collectCGNodes(CallGraphNode *CGN) { + SmallPtrSet CGNodes; + + for (scc_iterator I = scc_begin(CGN); !I.isAtEnd(); ++I) { + const std::vector &SCC = *I; + assert(!SCC.empty() && "SCC with no functions?"); + for (auto *CGNode : SCC) + CGNodes.insert(CGNode); + } + + return CGNodes; + } + + // For given kernel, collect all its reachable non-kernel functions. + SmallPtrSet collectReachableCallees(Function *K) { + SmallPtrSet ReachableCallees; + + // Call graph node which represents this kernel. + auto *KCGN = CG[K]; + + // Collect all reachable call graph nodes from the node representing this + // kernel. + SmallPtrSet CGNodes = collectCGNodes(KCGN); + + // Go through collected reachable nodes, visit all thier call sites, if the + // call site is direct, add corresponding callee to reachable callee set, if + // it is indirect, resolve the indirect call site to potential reachable + // callees, add them to reachable callee set, and repeat the process for the + // newly added potential callee nodes. + // + // FIXME: Need to handle bit-casted function pointers. 
+ // + SmallVector CGNStack(CGNodes.begin(), CGNodes.end()); + SmallPtrSet VisitedCGNodes; + while (!CGNStack.empty()) { + auto *CGN = CGNStack.pop_back_val(); + + if (!VisitedCGNodes.insert(CGN).second) + continue; + + for (auto GI = CGN->begin(), GE = CGN->end(); GI != GE; ++GI) { + auto *RCB = cast(GI->first.getValue()); + auto *RCGN = GI->second; + + if (auto *DCallee = RCGN->getFunction()) { + ReachableCallees.insert(DCallee); + } else if (RCB->isIndirectCall()) { + auto *RCBFTy = RCB->getFunctionType(); + for (auto *ACGN : AddressTakenFunctions) { + auto *ACallee = ACGN->getFunction(); + if (ACallee->getFunctionType() == RCBFTy) { + ReachableCallees.insert(ACallee); + SmallPtrSet IGCNNodes = collectCGNodes(ACGN); + for (auto *IGCN : IGCNNodes) + CGNStack.push_back(IGCN); + } + } + } + } + } + + return ReachableCallees; + } + +public: + explicit CollectReachableCallees(Module &M) : M(M), CG(CallGraph(M)) { + // Collect address taken functions. + collectAddressTakenFunctions(); + } + + void collectReachableCallees( + DenseMap> &KernelToCallees) { + // Collect reachable callee set for each kernel defined in the module. + for (Function &F : M.functions()) { + if (!AMDGPU::isKernelCC(&F)) + continue; + Function *K = &F; + KernelToCallees[K] = collectReachableCallees(K); + } + } +}; + +void collectReachableCallees( + Module &M, + DenseMap> &KernelToCallees) { + CollectReachableCallees CRC{M}; + CRC.collectReachableCallees(KernelToCallees); +} + +SmallPtrSet collectNonKernelAccessorsOfLDS(GlobalVariable *GV) { + SmallPtrSet LDSAccessors; + SmallVector UserStack(GV->users()); + SmallPtrSet VisitedUsers; + + while (!UserStack.empty()) { + auto *U = UserStack.pop_back_val(); + + // `U` is already visited? continue to next one. + if (!VisitedUsers.insert(U).second) + continue; + + // `U` is a global variable which is initialized with LDS. Ignore LDS. + if (isa(U)) + return SmallPtrSet(); + + // Recursively explore constant users. 
+ if (isa(U)) { + append_range(UserStack, U->users()); + continue; + } + + // `U` should be an instruction, if it belongs to a non-kernel function F, + // then collect F. + Function *F = cast(U)->getFunction(); + if (!AMDGPU::isKernelCC(F)) + LDSAccessors.insert(F); + } + + return LDSAccessors; +} + +DenseMap> +getFunctionToInstsMap(User *U, bool CollectKernelInsts) { + DenseMap> FunctionToInsts; + SmallVector UserStack; + SmallPtrSet VisitedUsers; + + UserStack.push_back(U); + + while (!UserStack.empty()) { + auto *UU = UserStack.pop_back_val(); + + if (!VisitedUsers.insert(UU).second) + continue; + + if (isa(UU)) + continue; + + if (isa(UU)) { + append_range(UserStack, UU->users()); + continue; + } + + auto *I = cast(UU); + Function *F = I->getFunction(); + if (CollectKernelInsts) { + if (!AMDGPU::isKernelCC(F)) { + continue; + } + } else { + if (AMDGPU::isKernelCC(F)) { + continue; + } + } + + FunctionToInsts.insert(std::make_pair(F, SmallPtrSet())); + FunctionToInsts[F].insert(I); + } + + return FunctionToInsts; +} + bool isKernelCC(const Function *Func) { return AMDGPU::isModuleEntryFunctionCC(Func->getCallingConv()); } diff --git a/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h b/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h index 95011ee3e62..ffcafb9b76c 100644 --- a/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h +++ b/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h @@ -14,6 +14,8 @@ #define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULDSUTILS_H #include "AMDGPU.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/IR/Constants.h" namespace llvm { @@ -21,6 +23,24 @@ class ConstantExpr; namespace AMDGPU { +/// Collect reachable callees for each kernel defined in the module \p M and +/// return collected callees at \p KernelToCallees. +void collectReachableCallees( + Module &M, + DenseMap> &KernelToCallees); + +/// For the given LDS global \p GV, visit all its users and collect all +/// non-kernel functions within which \p GV is used and return collected list of +/// such non-kernel functions. 
+SmallPtrSet collectNonKernelAccessorsOfLDS(GlobalVariable *GV); + +/// Collect all the instructions where user \p U belongs to. \p U could be +/// instruction itself or it could be a constant expression which is used within +/// an instruction. If \p CollectKernelInsts is true, collect instructions only +/// from kernels, otherwise collect instructions only from non-kernel functions. +DenseMap> +getFunctionToInstsMap(User *U, bool CollectKernelInsts); + bool isKernelCC(const Function *Func); Align getAlign(DataLayout const &DL, const GlobalVariable *GV); diff --git a/test/CodeGen/AMDGPU/llc-pipeline.ll b/test/CodeGen/AMDGPU/llc-pipeline.ll index 698891aaec1..d3d2b694960 100644 --- a/test/CodeGen/AMDGPU/llc-pipeline.ll +++ b/test/CodeGen/AMDGPU/llc-pipeline.ll @@ -42,6 +42,7 @@ ; GCN-O0-NEXT: Inliner for always_inline functions ; GCN-O0-NEXT: A No-Op Barrier Pass ; GCN-O0-NEXT: Lower OpenCL enqueued blocks +; GCN-O0-NEXT: Replace within non-kernel function use of LDS with pointer ; GCN-O0-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O0-NEXT: FunctionPass Manager ; GCN-O0-NEXT: Dominator Tree Construction @@ -192,6 +193,7 @@ ; GCN-O1-NEXT: Inliner for always_inline functions ; GCN-O1-NEXT: A No-Op Barrier Pass ; GCN-O1-NEXT: Lower OpenCL enqueued blocks +; GCN-O1-NEXT: Replace within non-kernel function use of LDS with pointer ; GCN-O1-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O1-NEXT: FunctionPass Manager ; GCN-O1-NEXT: Infer address spaces @@ -438,6 +440,7 @@ ; GCN-O1-OPTS-NEXT: Inliner for always_inline functions ; GCN-O1-OPTS-NEXT: A No-Op Barrier Pass ; GCN-O1-OPTS-NEXT: Lower OpenCL enqueued blocks +; GCN-O1-OPTS-NEXT: Replace within non-kernel function use of LDS with pointer ; GCN-O1-OPTS-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O1-OPTS-NEXT: FunctionPass Manager ; GCN-O1-OPTS-NEXT: Infer address spaces @@ -717,6 +720,7 @@ ; GCN-O2-NEXT: Inliner for always_inline functions ; GCN-O2-NEXT: A 
No-Op Barrier Pass ; GCN-O2-NEXT: Lower OpenCL enqueued blocks +; GCN-O2-NEXT: Replace within non-kernel function use of LDS with pointer ; GCN-O2-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O2-NEXT: FunctionPass Manager ; GCN-O2-NEXT: Infer address spaces @@ -997,6 +1001,7 @@ ; GCN-O3-NEXT: Inliner for always_inline functions ; GCN-O3-NEXT: A No-Op Barrier Pass ; GCN-O3-NEXT: Lower OpenCL enqueued blocks +; GCN-O3-NEXT: Replace within non-kernel function use of LDS with pointer ; GCN-O3-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O3-NEXT: FunctionPass Manager ; GCN-O3-NEXT: Infer address spaces diff --git a/test/CodeGen/AMDGPU/replace-lds-by-ptr-call-diamond-shape.ll b/test/CodeGen/AMDGPU/replace-lds-by-ptr-call-diamond-shape.ll new file mode 100644 index 00000000000..5498809e198 --- /dev/null +++ b/test/CodeGen/AMDGPU/replace-lds-by-ptr-call-diamond-shape.ll @@ -0,0 +1,88 @@ +; RUN: opt -S -mtriple=amdgcn-- -amdgpu-replace-lds-use-with-pointer < %s | FileCheck %s + +; DESCRIPTION: +; +; The lds global @lds_used_within_func is used within non-kernel function @func_uses_lds +; which is reachable from kernel @kernel_reaches_lds, hence pointer replacement takes place +; for @lds_used_within_func. +; + +; Original LDS should exist. +; CHECK: @lds_used_within_func = internal addrspace(3) global [4 x i32] undef, align 4 +@lds_used_within_func = internal addrspace(3) global [4 x i32] undef, align 4 + +; Pointer should be created. +; CHECK: @lds_used_within_func.ptr = internal unnamed_addr addrspace(3) global i16 undef, align 2 + +; Pointer replacement code should be added.
+define internal void @func_uses_lds() { +; CHECK-LABEL: entry: +; CHECK: %0 = load i16, i16 addrspace(3)* @lds_used_within_func.ptr, align 2 +; CHECK: %1 = getelementptr i8, i8 addrspace(3)* null, i16 %0 +; CHECK: %2 = bitcast i8 addrspace(3)* %1 to [4 x i32] addrspace(3)* +; CHECK: %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* %2, i32 0, i32 0 +; CHECK: ret void +entry: + %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @lds_used_within_func, i32 0, i32 0 + ret void +} + +; No change +define internal void @func_does_not_use_lds_3() { +; CHECK-LABEL: entry: +; CHECK: call void @func_uses_lds() +; CHECK: ret void +entry: + call void @func_uses_lds() + ret void +} + +; No change +define internal void @func_does_not_use_lds_2() { +; CHECK-LABEL: entry: +; CHECK: call void @func_uses_lds() +; CHECK: ret void +entry: + call void @func_uses_lds() + ret void +} + +; No change +define internal void @func_does_not_use_lds_1() { +; CHECK-LABEL: entry: +; CHECK: call void @func_does_not_use_lds_2() +; CHECK: call void @func_does_not_use_lds_3() +; CHECK: ret void +entry: + call void @func_does_not_use_lds_2() + call void @func_does_not_use_lds_3() + ret void +} + +; Pointer initialization code should be added +define protected amdgpu_kernel void @kernel_reaches_lds() { +; CHECK-LABEL: entry: +; CHECK: %0 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) +; CHECK: %1 = icmp eq i32 %0, 0 +; CHECK: br i1 %1, label %2, label %3 +; +; CHECK-LABEL: 2: +; CHECK: store i16 ptrtoint ([4 x i32] addrspace(3)* @lds_used_within_func to i16), i16 addrspace(3)* @lds_used_within_func.ptr, align 2 +; CHECK: br label %3 +; +; CHECK-LABEL: 3: +; CHECK: call void @llvm.amdgcn.wave.barrier() +; CHECK: call void @func_does_not_use_lds_1() +; CHECK: ret void +entry: + call void @func_does_not_use_lds_1() + ret void +} + +; No change here since this kernel does not reach @func_uses_lds which uses lds.
+define protected amdgpu_kernel void @kernel_does_not_reach_lds() { +; CHECK-LABEL: entry: +; CHECK: ret void +entry: + ret void +} diff --git a/test/CodeGen/AMDGPU/replace-lds-by-ptr-call-selected_functions.ll b/test/CodeGen/AMDGPU/replace-lds-by-ptr-call-selected_functions.ll new file mode 100644 index 00000000000..abc17deaebe --- /dev/null +++ b/test/CodeGen/AMDGPU/replace-lds-by-ptr-call-selected_functions.ll @@ -0,0 +1,130 @@ +; RUN: opt -S -mtriple=amdgcn-- -amdgpu-replace-lds-use-with-pointer < %s | FileCheck %s + +; DESCRIPTION: +; +; There are three lds globals defined here, and these three lds are used respectively within +; three non-kernel functions. There are three kernels, which call two of the non-kernel functions. +; Hence pointer replacement should take place for all three lds, and pointer initialization within +; kernel should selectively happen depending on which lds is reachable from the kernel. +; + +; Original LDS should exist. +; CHECK: @lds_used_within_function_1 = internal addrspace(3) global [1 x i32] undef, align 4 +; CHECK: @lds_used_within_function_2 = internal addrspace(3) global [2 x i32] undef, align 4 +; CHECK: @lds_used_within_function_3 = internal addrspace(3) global [3 x i32] undef, align 4 +@lds_used_within_function_1 = internal addrspace(3) global [1 x i32] undef, align 4 +@lds_used_within_function_2 = internal addrspace(3) global [2 x i32] undef, align 4 +@lds_used_within_function_3 = internal addrspace(3) global [3 x i32] undef, align 4 + +; Pointers should be created. +; CHECK: @lds_used_within_function_1.ptr = internal unnamed_addr addrspace(3) global i16 undef, align 2 +; CHECK: @lds_used_within_function_2.ptr = internal unnamed_addr addrspace(3) global i16 undef, align 2 +; CHECK: @lds_used_within_function_3.ptr = internal unnamed_addr addrspace(3) global i16 undef, align 2 + +; Pointer replacement code should be added. 
+define internal void @function_3() { +; CHECK-LABEL: entry: +; CHECK: %0 = load i16, i16 addrspace(3)* @lds_used_within_function_3.ptr, align 2 +; CHECK: %1 = getelementptr i8, i8 addrspace(3)* null, i16 %0 +; CHECK: %2 = bitcast i8 addrspace(3)* %1 to [3 x i32] addrspace(3)* +; CHECK: %gep = getelementptr inbounds [3 x i32], [3 x i32] addrspace(3)* %2, i32 0, i32 0 +; CHECK: ret void +entry: + %gep = getelementptr inbounds [3 x i32], [3 x i32] addrspace(3)* @lds_used_within_function_3, i32 0, i32 0 + ret void +} + +; Pointer replacement code should be added. +define internal void @function_2() { +; CHECK-LABEL: entry: +; CHECK: %0 = load i16, i16 addrspace(3)* @lds_used_within_function_2.ptr, align 2 +; CHECK: %1 = getelementptr i8, i8 addrspace(3)* null, i16 %0 +; CHECK: %2 = bitcast i8 addrspace(3)* %1 to [2 x i32] addrspace(3)* +; CHECK: %gep = getelementptr inbounds [2 x i32], [2 x i32] addrspace(3)* %2, i32 0, i32 0 +; CHECK: ret void +entry: + %gep = getelementptr inbounds [2 x i32], [2 x i32] addrspace(3)* @lds_used_within_function_2, i32 0, i32 0 + ret void +} + +; Pointer replacement code should be added. 
+define internal void @function_1() { +; CHECK-LABEL: entry: +; CHECK: %0 = load i16, i16 addrspace(3)* @lds_used_within_function_1.ptr, align 2 +; CHECK: %1 = getelementptr i8, i8 addrspace(3)* null, i16 %0 +; CHECK: %2 = bitcast i8 addrspace(3)* %1 to [1 x i32] addrspace(3)* +; CHECK: %gep = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* %2, i32 0, i32 0 +; CHECK: ret void +entry: + %gep = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* @lds_used_within_function_1, i32 0, i32 0 + ret void +} + +; Pointer initialization code should be added +define protected amdgpu_kernel void @kernel_calls_function_3_and_1() { +; CHECK-LABEL: entry: +; CHECK: %0 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) +; CHECK: %1 = icmp eq i32 %0, 0 +; CHECK: br i1 %1, label %2, label %3 +; +; CHECK-LABEL: 2: +; CHECK: store i16 ptrtoint ([3 x i32] addrspace(3)* @lds_used_within_function_3 to i16), i16 addrspace(3)* @lds_used_within_function_3.ptr, align 2 +; CHECK: store i16 ptrtoint ([1 x i32] addrspace(3)* @lds_used_within_function_1 to i16), i16 addrspace(3)* @lds_used_within_function_1.ptr, align 2 +; CHECK: br label %3 +; +; CHECK-LABEL: 3: +; CHECK: call void @llvm.amdgcn.wave.barrier() +; CHECK: call void @function_3() +; CHECK: call void @function_1() +; CHECK: ret void +entry: + call void @function_3() + call void @function_1() + ret void +} + +; Pointer initialization code should be added +define protected amdgpu_kernel void @kernel_calls_function_2_and_3() { +; CHECK-LABEL: entry: +; CHECK: %0 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) +; CHECK: %1 = icmp eq i32 %0, 0 +; CHECK: br i1 %1, label %2, label %3 +; +; CHECK-LABEL: 2: +; CHECK: store i16 ptrtoint ([3 x i32] addrspace(3)* @lds_used_within_function_3 to i16), i16 addrspace(3)* @lds_used_within_function_3.ptr, align 2 +; CHECK: store i16 ptrtoint ([2 x i32] addrspace(3)* @lds_used_within_function_2 to i16), i16 addrspace(3)* @lds_used_within_function_2.ptr, align 2 +; CHECK: br label %3 +; +;
CHECK-LABEL: 3: +; CHECK: call void @llvm.amdgcn.wave.barrier() +; CHECK: call void @function_2() +; CHECK: call void @function_3() +; CHECK: ret void +entry: + call void @function_2() + call void @function_3() + ret void +} + +; Pointer initialization code should be added +define protected amdgpu_kernel void @kernel_calls_function_1_and_2() { +; CHECK-LABEL: entry: +; CHECK: %0 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) +; CHECK: %1 = icmp eq i32 %0, 0 +; CHECK: br i1 %1, label %2, label %3 +; +; CHECK-LABEL: 2: +; CHECK: store i16 ptrtoint ([2 x i32] addrspace(3)* @lds_used_within_function_2 to i16), i16 addrspace(3)* @lds_used_within_function_2.ptr, align 2 +; CHECK: store i16 ptrtoint ([1 x i32] addrspace(3)* @lds_used_within_function_1 to i16), i16 addrspace(3)* @lds_used_within_function_1.ptr, align 2 +; CHECK: br label %3 +; +; CHECK-LABEL: 3: +; CHECK: call void @llvm.amdgcn.wave.barrier() +; CHECK: call void @function_1() +; CHECK: call void @function_2() +; CHECK: ret void +entry: + call void @function_1() + call void @function_2() + ret void +} diff --git a/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-global-scope-use.ll b/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-global-scope-use.ll new file mode 100644 index 00000000000..34a97da86ab --- /dev/null +++ b/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-global-scope-use.ll @@ -0,0 +1,53 @@ +; RUN: opt -S -mtriple=amdgcn-- -amdgpu-replace-lds-use-with-pointer < %s | FileCheck %s + +; DESCRIPTION: +; +; None of the lds are pointer-replaced since they are all used in global scope in one way or another.
+; + +; CHECK: @lds = internal addrspace(3) global [4 x i32] undef, align 4 +; CHECK: @lds.1 = addrspace(3) global i16 undef, align 2 +; CHECK: @lds.2 = addrspace(3) global i32 undef, align 4 +; CHECK: @lds.3 = internal unnamed_addr addrspace(3) global [1 x i8] undef, align 1 +@lds = internal addrspace(3) global [4 x i32] undef, align 4 +@lds.1 = addrspace(3) global i16 undef, align 2 +@lds.2 = addrspace(3) global i32 undef, align 4 +@lds.3 = internal unnamed_addr addrspace(3) global [1 x i8] undef, align 1 + +; CHECK: @global_var = addrspace(1) global float* addrspacecast (float addrspace(3)* bitcast ([4 x i32] addrspace(3)* @lds to float addrspace(3)*) to float*), align 8 +; CHECK: @llvm.used = appending global [1 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (i16 addrspace(3)* @lds.1 to i8 addrspace(3)*) to i8*)], section "llvm.metadata" +; CHECK: @llvm.compiler.used = appending global [1 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (i32 addrspace(3)* @lds.2 to i8 addrspace(3)*) to i8*)], section "llvm.metadata" +; CHECK: @alias.to.lds.3 = alias [1 x i8], [1 x i8] addrspace(3)* @lds.3 +@global_var = addrspace(1) global float* addrspacecast ([4 x i32] addrspace(3)* @lds to float*), align 8 +@llvm.used = appending global [1 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (i16 addrspace(3)* @lds.1 to i8 addrspace(3)*) to i8*)], section "llvm.metadata" +@llvm.compiler.used = appending global [1 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (i32 addrspace(3)* @lds.2 to i8 addrspace(3)*) to i8*)], section "llvm.metadata" +@alias.to.lds.3 = alias [1 x i8], [1 x i8] addrspace(3)* @lds.3 + +; CHECK-NOT: @lds.ptr +; CHECK-NOT: @lds.1.ptr +; CHECK-NOT: @lds.2.ptr +; CHECK-NOT: @lds.3.ptr + +define void @f0() { +; CHECK-LABEL: entry: +; CHECK: %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @lds, i32 0, i32 0 +; CHECK: %ld1 = load i16, i16 addrspace(3)* @lds.1 +; CHECK: %ld2 = load i32, i32 addrspace(3)* @lds.2 +; CHECK: %gep2 = 
getelementptr inbounds [1 x i8], [1 x i8] addrspace(3)* @lds.3, i32 0, i32 0 +; CHECK: ret void +entry: + %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @lds, i32 0, i32 0 + %ld1 = load i16, i16 addrspace(3)* @lds.1 + %ld2 = load i32, i32 addrspace(3)* @lds.2 + %gep2 = getelementptr inbounds [1 x i8], [1 x i8] addrspace(3)* @lds.3, i32 0, i32 0 + ret void +} + +define protected amdgpu_kernel void @k0() { +; CHECK-LABEL: entry: +; CHECK: call void @f0() +; CHECK: ret void +entry: + call void @f0() + ret void +} diff --git a/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-inline-asm-call.ll b/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-inline-asm-call.ll new file mode 100644 index 00000000000..580b3752f88 --- /dev/null +++ b/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-inline-asm-call.ll @@ -0,0 +1,30 @@ +; RUN: opt -S -mtriple=amdgcn-- -amdgpu-replace-lds-use-with-pointer < %s | FileCheck %s + +; DESCRIPTION: +; +; We do not know what to do with inline asm call, we ignore it, hence pointer replacement for +; @used_only_within_func does not take place. 
+; + +; CHECK: @used_only_within_func = addrspace(3) global [4 x i32] undef, align 4 +@used_only_within_func = addrspace(3) global [4 x i32] undef, align 4 + +; CHECK-NOT: @used_only_within_func.ptr + +define void @f0(i32 %x) { +; CHECK-LABEL: entry: +; CHECK: store i32 %x, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @used_only_within_func, i32 0, i32 0) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @used_only_within_func, i32 0, i32 0) to i32*) to i64)) to i32*), align 4 +; CHECK: ret void +entry: + store i32 %x, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast ([4 x i32] addrspace(3)* @used_only_within_func to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast ([4 x i32] addrspace(3)* @used_only_within_func to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4 + ret void +} + +define amdgpu_kernel void @k0() { +; CHECK-LABEL: entry: +; CHECK: call i32 asm "s_mov_b32 $0, 0", "=s"() +; CHECK: ret void +entry: + call i32 asm "s_mov_b32 $0, 0", "=s"() + ret void +} diff --git a/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-kernel-only-used-lds.ll b/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-kernel-only-used-lds.ll new file mode 100644 index 00000000000..34a624b7bc2 --- /dev/null +++ b/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-kernel-only-used-lds.ll @@ -0,0 +1,25 @@ +; RUN: opt -S -mtriple=amdgcn-- -amdgpu-replace-lds-use-with-pointer < %s | FileCheck %s + +; DESCRIPTION ; +; +; LDS global @used_only_within_kern is used only within kernel @k0, hence pointer replacement +; does not take place for @used_only_within_kern. 
+; + +; CHECK: @used_only_within_kern = addrspace(3) global [4 x i32] undef, align 4 +@used_only_within_kern = addrspace(3) global [4 x i32] undef, align 4 + +; CHECK-NOT: @used_only_within_kern.ptr + +define amdgpu_kernel void @k0() { +; CHECK-LABEL: entry: +; CHECK: %ld = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @used_only_within_kern, i32 0, i32 0) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @used_only_within_kern, i32 0, i32 0) to i32*) to i64)) to i32*), align 4 +; CHECK: %mul = mul i32 %ld, 2 +; CHECK: store i32 %mul, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @used_only_within_kern, i32 0, i32 0) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @used_only_within_kern, i32 0, i32 0) to i32*) to i64)) to i32*), align 4 +; CHECK: ret void +entry: + %ld = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast ([4 x i32] addrspace(3)* @used_only_within_kern to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast ([4 x i32] addrspace(3)* @used_only_within_kern to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4 + %mul = mul i32 %ld, 2 + store i32 %mul, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast ([4 x i32] addrspace(3)* @used_only_within_kern to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast ([4 x i32] addrspace(3)* @used_only_within_kern to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4 + ret void +} diff --git a/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-not-reachable-lds.ll 
b/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-not-reachable-lds.ll new file mode 100644 index 00000000000..385d78c8223 --- /dev/null +++ b/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-not-reachable-lds.ll @@ -0,0 +1,28 @@ +; RUN: opt -S -mtriple=amdgcn-- -amdgpu-replace-lds-use-with-pointer < %s | FileCheck %s + +; DESCRIPTION ; +; +; LDS global @not-reachable-lds is used within non-kernel function @f0, but @f0 is *not* +; reachable from kernel @k0, hence pointer replacement does not take place. +; + +; CHECK: @not-reachable-lds = internal addrspace(3) global [4 x i32] undef, align 4 +@not-reachable-lds = internal addrspace(3) global [4 x i32] undef, align 4 + +; CHECK-NOT: @not-reachable-lds.ptr + +define internal void @f0() { +; CHECK-LABEL: entry: +; CHECK: %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @not-reachable-lds, i32 0, i32 0 +; CHECK: ret void +entry: + %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @not-reachable-lds, i32 0, i32 0 + ret void +} + +define protected amdgpu_kernel void @k0() { +; CHECK-LABEL: entry: +; CHECK: ret void +entry: + ret void +} diff --git a/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-small-lds.ll b/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-small-lds.ll new file mode 100644 index 00000000000..7f731774756 --- /dev/null +++ b/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-small-lds.ll @@ -0,0 +1,31 @@ +; RUN: opt -S -mtriple=amdgcn-- -amdgpu-replace-lds-use-with-pointer < %s | FileCheck %s + +; DESCRIPTION ; +; +; LDS global @small_lds is used within non-kernel function @f0, and @f0 is reachable +; from kernel @k0, but since @small_lds is too small for pointer replacement, pointer +; replacement does not take place.
+; + +; CHECK: @small_lds = addrspace(3) global i8 undef, align 1 +@small_lds = addrspace(3) global i8 undef, align 1 + +; CHECK-NOT: @small_lds.ptr + +define void @f0() { +; CHECK-LABEL: entry: +; CHECK: store i8 1, i8 addrspace(3)* @small_lds, align 1 +; CHECK: ret void +entry: + store i8 1, i8 addrspace(3)* @small_lds, align 1 + ret void +} + +define amdgpu_kernel void @k0() { +; CHECK-LABEL: entry: +; CHECK: call void @f0() +; CHECK: ret void +entry: + call void @f0() + ret void +} diff --git a/test/CodeGen/AMDGPU/replace-lds-by-ptr-indirect-call-diamond-shape.ll b/test/CodeGen/AMDGPU/replace-lds-by-ptr-indirect-call-diamond-shape.ll new file mode 100644 index 00000000000..df3cfe7c8f3 --- /dev/null +++ b/test/CodeGen/AMDGPU/replace-lds-by-ptr-indirect-call-diamond-shape.ll @@ -0,0 +1,95 @@ +; RUN: opt -S -mtriple=amdgcn-- -amdgpu-replace-lds-use-with-pointer < %s | FileCheck %s + +; DESCRIPTION: +; +; The lds global @lds_used_within_func is used within non-kernel function @func_uses_lds +; which is *indirectly* reachable from kernel @kernel_reaches_lds, hence pointer replacement +; takes place for @lds_used_within_func. + +; Original LDS should exist. +; CHECK: @lds_used_within_func = internal addrspace(3) global [4 x i32] undef, align 4 +@lds_used_within_func = internal addrspace(3) global [4 x i32] undef, align 4 + +; Function pointer should exist as it is. +; CHECK: @ptr_to_func = internal local_unnamed_addr externally_initialized global void ()* @func_uses_lds, align 8 +@ptr_to_func = internal local_unnamed_addr externally_initialized global void ()* @func_uses_lds, align 8 + +; Pointer should be created. +; CHECK: @lds_used_within_func.ptr = internal unnamed_addr addrspace(3) global i16 undef, align 2 + +; Pointer replacement code should be added.
+define internal void @func_uses_lds() { +; CHECK-LABEL: entry: +; CHECK: %0 = load i16, i16 addrspace(3)* @lds_used_within_func.ptr, align 2 +; CHECK: %1 = getelementptr i8, i8 addrspace(3)* null, i16 %0 +; CHECK: %2 = bitcast i8 addrspace(3)* %1 to [4 x i32] addrspace(3)* +; CHECK: %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* %2, i32 0, i32 0 +; CHECK: ret void +entry: + %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @lds_used_within_func, i32 0, i32 0 + ret void +} + +; No change +define internal void @func_does_not_use_lds_3() { +; CHECK-LABEL: entry: +; CHECK: %fptr = load void ()*, void ()** @ptr_to_func, align 8 +; CHECK: call void %fptr() +; CHECK: ret void +entry: + %fptr = load void ()*, void ()** @ptr_to_func, align 8 + call void %fptr() + ret void +} + +; No change +define internal void @func_does_not_use_lds_2() { +; CHECK-LABEL: entry: +; CHECK: %fptr = load void ()*, void ()** @ptr_to_func, align 8 +; CHECK: call void %fptr() +; CHECK: ret void +entry: + %fptr = load void ()*, void ()** @ptr_to_func, align 8 + call void %fptr() + ret void +} + +; No change +define internal void @func_does_not_use_lds_1() { +; CHECK-LABEL: entry: +; CHECK: call void @func_does_not_use_lds_2() +; CHECK: call void @func_does_not_use_lds_3() +; CHECK: ret void +entry: + call void @func_does_not_use_lds_2() + call void @func_does_not_use_lds_3() + ret void +} + +; Pointer initialization code should be added +define protected amdgpu_kernel void @kernel_reaches_lds() { +; CHECK-LABEL: entry: +; CHECK: %0 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) +; CHECK: %1 = icmp eq i32 %0, 0 +; CHECK: br i1 %1, label %2, label %3 +; +; CHECK-LABEL: 2: +; CHECK: store i16 ptrtoint ([4 x i32] addrspace(3)* @lds_used_within_func to i16), i16 addrspace(3)* @lds_used_within_func.ptr, align 2 +; CHECK: br label %3 +; +; CHECK-LABEL: 3: +; CHECK: call void @llvm.amdgcn.wave.barrier() +; CHECK: call void @func_does_not_use_lds_1() +; CHECK: ret void
+entry: + call void @func_does_not_use_lds_1() + ret void +} + +; No change here since this kernel does not reach @func_uses_lds which uses lds. +define protected amdgpu_kernel void @kernel_does_not_reach_lds() { +; CHECK-LABEL: entry: +; CHECK: ret void +entry: + ret void +} diff --git a/test/CodeGen/AMDGPU/replace-lds-by-ptr-indirect-call-selected_functions.ll b/test/CodeGen/AMDGPU/replace-lds-by-ptr-indirect-call-selected_functions.ll new file mode 100644 index 00000000000..41ab55cc108 --- /dev/null +++ b/test/CodeGen/AMDGPU/replace-lds-by-ptr-indirect-call-selected_functions.ll @@ -0,0 +1,151 @@ +; RUN: opt -S -mtriple=amdgcn-- -amdgpu-replace-lds-use-with-pointer < %s | FileCheck %s + +; DESCRIPTION: +; +; There are three lds globals defined here, and these three lds are used respectively within +; three non-kernel functions. There are three kernels, which *indirectly* call two of the +; non-kernel functions. Hence pointer replacement should take place for all three lds, and +; pointer initialization within kernel should selectively happen depending on which lds is +; reachable from the kernel. +; + +; Original LDS should exist. +; CHECK: @lds_used_within_function_1 = internal addrspace(3) global [4 x i32] undef, align 4 +; CHECK: @lds_used_within_function_2 = internal addrspace(3) global [4 x i32] undef, align 4 +; CHECK: @lds_used_within_function_3 = internal addrspace(3) global [4 x i32] undef, align 4 +@lds_used_within_function_1 = internal addrspace(3) global [4 x i32] undef, align 4 +@lds_used_within_function_2 = internal addrspace(3) global [4 x i32] undef, align 4 +@lds_used_within_function_3 = internal addrspace(3) global [4 x i32] undef, align 4 + +; Function pointers should exist. 
+; CHECK: @ptr_to_func1 = internal local_unnamed_addr externally_initialized global void (float)* @function_1, align 8 +; CHECK: @ptr_to_func2 = internal local_unnamed_addr externally_initialized global void (i16)* @function_2, align 8 +; CHECK: @ptr_to_func3 = internal local_unnamed_addr externally_initialized global void (i8)* @function_3, align 8 +@ptr_to_func1 = internal local_unnamed_addr externally_initialized global void (float)* @function_1, align 8 +@ptr_to_func2 = internal local_unnamed_addr externally_initialized global void (i16)* @function_2, align 8 +@ptr_to_func3 = internal local_unnamed_addr externally_initialized global void (i8)* @function_3, align 8 + +; Pointers should be created. +; CHECK: @lds_used_within_function_1.ptr = internal unnamed_addr addrspace(3) global i16 undef, align 2 +; CHECK: @lds_used_within_function_2.ptr = internal unnamed_addr addrspace(3) global i16 undef, align 2 +; CHECK: @lds_used_within_function_3.ptr = internal unnamed_addr addrspace(3) global i16 undef, align 2 + +; Pointer replacement code should be added. +define internal void @function_3(i8 %c) { +; CHECK-LABEL: entry: +; CHECK: %0 = load i16, i16 addrspace(3)* @lds_used_within_function_3.ptr, align 2 +; CHECK: %1 = getelementptr i8, i8 addrspace(3)* null, i16 %0 +; CHECK: %2 = bitcast i8 addrspace(3)* %1 to [4 x i32] addrspace(3)* +; CHECK: %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* %2, i32 0, i32 0 +; CHECK: ret void +entry: + %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @lds_used_within_function_3, i32 0, i32 0 + ret void +} + +; Pointer replacement code should be added. 
+define internal void @function_2(i16 %i) { +; CHECK-LABEL: entry: +; CHECK: %0 = load i16, i16 addrspace(3)* @lds_used_within_function_2.ptr, align 2 +; CHECK: %1 = getelementptr i8, i8 addrspace(3)* null, i16 %0 +; CHECK: %2 = bitcast i8 addrspace(3)* %1 to [4 x i32] addrspace(3)* +; CHECK: %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* %2, i32 0, i32 0 +; CHECK: ret void +entry: + %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @lds_used_within_function_2, i32 0, i32 0 + ret void +} + +; Pointer replacement code should be added. +define internal void @function_1(float %f) { +; CHECK-LABEL: entry: +; CHECK: %0 = load i16, i16 addrspace(3)* @lds_used_within_function_1.ptr, align 2 +; CHECK: %1 = getelementptr i8, i8 addrspace(3)* null, i16 %0 +; CHECK: %2 = bitcast i8 addrspace(3)* %1 to [4 x i32] addrspace(3)* +; CHECK: %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* %2, i32 0, i32 0 +; CHECK: ret void +entry: + %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @lds_used_within_function_1, i32 0, i32 0 + ret void +} + +; Pointer initialization code should be added +define protected amdgpu_kernel void @kernel_calls_function_3_and_1() { +; CHECK-LABEL: entry: +; CHECK: %0 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) +; CHECK: %1 = icmp eq i32 %0, 0 +; CHECK: br i1 %1, label %2, label %3 +; +; CHECK-LABEL: 2: +; CHECK: store i16 ptrtoint ([4 x i32] addrspace(3)* @lds_used_within_function_3 to i16), i16 addrspace(3)* @lds_used_within_function_3.ptr, align 2 +; CHECK: store i16 ptrtoint ([4 x i32] addrspace(3)* @lds_used_within_function_1 to i16), i16 addrspace(3)* @lds_used_within_function_1.ptr, align 2 +; CHECK: br label %3 +; +; CHECK-LABEL: 3: +; CHECK: call void @llvm.amdgcn.wave.barrier() +; CHECK: %fptr3 = load void (i8)*, void (i8)** @ptr_to_func3, align 8 +; CHECK: %fptr1 = load void (float)*, void (float)** @ptr_to_func1, align 8 +; CHECK: call void %fptr3(i8 1) +; CHECK: call void
%fptr1(float 2.000000e+00) +; CHECK: ret void +entry: + %fptr3 = load void (i8)*, void (i8)** @ptr_to_func3, align 8 + %fptr1 = load void (float)*, void (float)** @ptr_to_func1, align 8 + call void %fptr3(i8 1) + call void %fptr1(float 2.0) + ret void +} + +; Pointer initialization code should be added +define protected amdgpu_kernel void @kernel_calls_function_2_and_3() { +; CHECK-LABEL: entry: +; CHECK: %0 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) +; CHECK: %1 = icmp eq i32 %0, 0 +; CHECK: br i1 %1, label %2, label %3 +; +; CHECK-LABEL: 2: +; CHECK: store i16 ptrtoint ([4 x i32] addrspace(3)* @lds_used_within_function_3 to i16), i16 addrspace(3)* @lds_used_within_function_3.ptr, align 2 +; CHECK: store i16 ptrtoint ([4 x i32] addrspace(3)* @lds_used_within_function_2 to i16), i16 addrspace(3)* @lds_used_within_function_2.ptr, align 2 +; CHECK: br label %3 +; +; CHECK-LABEL: 3: +; CHECK: call void @llvm.amdgcn.wave.barrier() +; CHECK: %fptr2 = load void (i16)*, void (i16)** @ptr_to_func2, align 8 +; CHECK: %fptr3 = load void (i8)*, void (i8)** @ptr_to_func3, align 8 +; CHECK: call void %fptr2(i16 3) +; CHECK: call void %fptr3(i8 4) +; CHECK: ret void +entry: + %fptr2 = load void (i16)*, void (i16)** @ptr_to_func2, align 8 + %fptr3 = load void (i8)*, void (i8)** @ptr_to_func3, align 8 + call void %fptr2(i16 3) + call void %fptr3(i8 4) + ret void +} + +; Pointer initialization code should be added +define protected amdgpu_kernel void @kernel_calls_function_1_and_2() { +; CHECK-LABEL: entry: +; CHECK: %0 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) +; CHECK: %1 = icmp eq i32 %0, 0 +; CHECK: br i1 %1, label %2, label %3 +; +; CHECK-LABEL: 2: +; CHECK: store i16 ptrtoint ([4 x i32] addrspace(3)* @lds_used_within_function_2 to i16), i16 addrspace(3)* @lds_used_within_function_2.ptr, align 2 +; CHECK: store i16 ptrtoint ([4 x i32] addrspace(3)* @lds_used_within_function_1 to i16), i16 addrspace(3)* @lds_used_within_function_1.ptr, align 2 +; CHECK: br label %3 +;
+; CHECK-LABEL: 3: +; CHECK: call void @llvm.amdgcn.wave.barrier() +; CHECK: %fptr1 = load void (float)*, void (float)** @ptr_to_func1, align 8 +; CHECK: %fptr2 = load void (i16)*, void (i16)** @ptr_to_func2, align 8 +; CHECK: call void %fptr1(float 5.000000e+00) +; CHECK: call void %fptr2(i16 6) +; CHECK: ret void +entry: + %fptr1 = load void (float)*, void (float)** @ptr_to_func1, align 8 + %fptr2 = load void (i16)*, void (i16)** @ptr_to_func2, align 8 + call void %fptr1(float 5.0) + call void %fptr2(i16 6) + ret void +} diff --git a/test/CodeGen/AMDGPU/replace-lds-by-ptr-indirect-call-signature-match.ll b/test/CodeGen/AMDGPU/replace-lds-by-ptr-indirect-call-signature-match.ll new file mode 100644 index 00000000000..671f989424e --- /dev/null +++ b/test/CodeGen/AMDGPU/replace-lds-by-ptr-indirect-call-signature-match.ll @@ -0,0 +1,94 @@ +; RUN: opt -S -mtriple=amdgcn-- -amdgpu-replace-lds-use-with-pointer < %s | FileCheck %s + +; DESCRIPTION: +; +; There are three lds globals defined here, and these three lds are used respectively within +; three non-kernel functions. There is one kernel which *indirectly* calls one of the non-kernel +; functions. But since all the three non-kernel functions have same signature, all three +; non-kernel functions are resolved as potential callees for indirect call-site. Hence we land-up +; pointer replacement for three lds globals. +; + +; Original LDS should exist. +; CHECK: @lds_used_within_function_1 = internal addrspace(3) global [4 x i32] undef, align 4 +; CHECK: @lds_used_within_function_2 = internal addrspace(3) global [4 x i32] undef, align 4 +; CHECK: @lds_used_within_function_3 = internal addrspace(3) global [4 x i32] undef, align 4 +@lds_used_within_function_1 = internal addrspace(3) global [4 x i32] undef, align 4 +@lds_used_within_function_2 = internal addrspace(3) global [4 x i32] undef, align 4 +@lds_used_within_function_3 = internal addrspace(3) global [4 x i32] undef, align 4 + +; Function pointers should exist. 
+; CHECK: @ptr_to_func1 = internal local_unnamed_addr externally_initialized global void (i16)* @function_1, align 8 +; CHECK: @ptr_to_func2 = internal local_unnamed_addr externally_initialized global void (i16)* @function_2, align 8 +; CHECK: @ptr_to_func3 = internal local_unnamed_addr externally_initialized global void (i16)* @function_3, align 8 +@ptr_to_func1 = internal local_unnamed_addr externally_initialized global void (i16)* @function_1, align 8 +@ptr_to_func2 = internal local_unnamed_addr externally_initialized global void (i16)* @function_2, align 8 +@ptr_to_func3 = internal local_unnamed_addr externally_initialized global void (i16)* @function_3, align 8 + +; Pointers should be created. +; CHECK: @lds_used_within_function_1.ptr = internal unnamed_addr addrspace(3) global i16 undef, align 2 +; CHECK: @lds_used_within_function_2.ptr = internal unnamed_addr addrspace(3) global i16 undef, align 2 +; CHECK: @lds_used_within_function_3.ptr = internal unnamed_addr addrspace(3) global i16 undef, align 2 + +; Pointer replacement code should be added. +define internal void @function_3(i16 %i) { +; CHECK-LABEL: entry: +; CHECK: %0 = load i16, i16 addrspace(3)* @lds_used_within_function_3.ptr, align 2 +; CHECK: %1 = getelementptr i8, i8 addrspace(3)* null, i16 %0 +; CHECK: %2 = bitcast i8 addrspace(3)* %1 to [4 x i32] addrspace(3)* +; CHECK: %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* %2, i32 0, i32 0 +; CHECK: ret void +entry: + %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @lds_used_within_function_3, i32 0, i32 0 + ret void +} + +; Pointer replacement code should be added. 
+define internal void @function_2(i16 %i) { +; CHECK-LABEL: entry: +; CHECK: %0 = load i16, i16 addrspace(3)* @lds_used_within_function_2.ptr, align 2 +; CHECK: %1 = getelementptr i8, i8 addrspace(3)* null, i16 %0 +; CHECK: %2 = bitcast i8 addrspace(3)* %1 to [4 x i32] addrspace(3)* +; CHECK: %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* %2, i32 0, i32 0 +; CHECK: ret void +entry: + %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @lds_used_within_function_2, i32 0, i32 0 + ret void +} + +; Pointer replacement code should be added. +define internal void @function_1(i16 %i) { +; CHECK-LABEL: entry: +; CHECK: %0 = load i16, i16 addrspace(3)* @lds_used_within_function_1.ptr, align 2 +; CHECK: %1 = getelementptr i8, i8 addrspace(3)* null, i16 %0 +; CHECK: %2 = bitcast i8 addrspace(3)* %1 to [4 x i32] addrspace(3)* +; CHECK: %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* %2, i32 0, i32 0 +; CHECK: ret void +entry: + %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @lds_used_within_function_1, i32 0, i32 0 + ret void +} + +; Pointer initialization code should be added +define protected amdgpu_kernel void @kernel_indirectly_calls_function_1() { +; CHECK-LABEL: entry: +; CHECK: %0 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) +; CHECK: %1 = icmp eq i32 %0, 0 +; CHECK: br i1 %1, label %2, label %3 +; +; CHECK-LABEL: 2: +; CHECK: store i16 ptrtoint ([4 x i32] addrspace(3)* @lds_used_within_function_3 to i16), i16 addrspace(3)* @lds_used_within_function_3.ptr, align 2 +; CHECK: store i16 ptrtoint ([4 x i32] addrspace(3)* @lds_used_within_function_2 to i16), i16 addrspace(3)* @lds_used_within_function_2.ptr, align 2 +; CHECK: store i16 ptrtoint ([4 x i32] addrspace(3)* @lds_used_within_function_1 to i16), i16 addrspace(3)* @lds_used_within_function_1.ptr, align 2 +; CHECK: br label %3 +; +; CHECK-LABEL: 3: +; CHECK: call void @llvm.amdgcn.wave.barrier() +; CHECK: %fptr1 = load void (i16)*, void (i16)**
@ptr_to_func1, align 8 +; CHECK: call void %fptr1(i16 6) +; CHECK: ret void +entry: + %fptr1 = load void (i16)*, void (i16)** @ptr_to_func1, align 8 + call void %fptr1(i16 6) + ret void +} diff --git a/test/CodeGen/AMDGPU/replace-lds-by-ptr-lds-offsets.ll b/test/CodeGen/AMDGPU/replace-lds-by-ptr-lds-offsets.ll new file mode 100644 index 00000000000..de439da28e7 --- /dev/null +++ b/test/CodeGen/AMDGPU/replace-lds-by-ptr-lds-offsets.ll @@ -0,0 +1,214 @@ +; RUN: opt -S -mtriple=amdgcn-- -amdgpu-replace-lds-use-with-pointer < %s | FileCheck --check-prefix=POINTER-REPLACE %s +; RUN: opt -S -mtriple=amdgcn-- -amdgpu-replace-lds-use-with-pointer -amdgpu-lower-module-lds < %s | FileCheck --check-prefix=LOWER_LDS %s +; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 < %s | FileCheck --check-prefix=GCN %s + +; +; DESCRIPTION: +; +; 1. There are three lds defined - @lds.1, @lds.2 and @lds.3, which are of types i32, i64, and [2 x i64]. +; @lds.3 is aliased to @alias.to.lds.3 +; 2. @lds.1 is used in function @f1, and @lds.2 is used in function @f2, @alias.to.lds.3 is used in kernel @k1. + +; 3. Pointer-replacement pass replaces @lds.1 and @lds.2 by pointers @lds.1.ptr and @lds.2.ptr respectively. +; However it does not touch @lds.3 since it is used in global scope (aliased). +; +; 4. LDS-lowering pass sees use of @lds.1.ptr in function @f1, use of @lds.2.ptr in function @f2, and use of +; @lds.3 (via alias @alias.to.lds.3) in kernel @k1. Hence it module lowers these lds into struct instance +; @llvm.amdgcn.module.lds. +; +; The struct member order is - [lds.3, lds.1.ptr, lds.2.ptr]. Since @llvm.amdgcn.module.lds itself is allocated +; on address 0, lds.3 is allocated on address 0, lds.1.ptr is allocated on address 16, and lds.2.ptr is allocated +; on address 18. +; +; Again LDS-lowering pass sees use of @lds.1 and @lds.2 in kernel. Hence it kernel lowers these lds into struct +; instance @llvm.amdgcn.kernel.k1.lds. +; +; The struct member order is - [@lds.2, @lds.1]. 
By now, already (16 + 2 + 2) 20 byte of memory allocated, @lds.2 +; is allocated on address 24 since it needs to be allocated on 8 byte boundary, and @lds.1 is allocated on address +; 32. +; +; 5. Hence the final GCN ISA looks as below: +; +; Within kernel @k1: +; address 24 is stored in address 18. +; address 32 is stored in address 16 +; +; Within function @f1: +; address 32 is loaded from address 16 +; +; Within function @f2: +; address 24 is loaded from address 18 +; + + +; POINTER-REPLACE: @lds.1 = addrspace(3) global i32 undef, align 4 +; POINTER-REPLACE: @lds.2 = addrspace(3) global i64 undef, align 8 +; POINTER-REPLACE: @lds.3 = addrspace(3) global [2 x i64] undef, align 16 +; POINTER-REPLACE: @lds.1.ptr = internal unnamed_addr addrspace(3) global i16 undef, align 2 +; POINTER-REPLACE: @lds.2.ptr = internal unnamed_addr addrspace(3) global i16 undef, align 2 +; POINTER-REPLACE: @alias.to.lds.3 = alias [2 x i64], [2 x i64] addrspace(3)* @lds.3 + + +; LOWER_LDS-NOT: @lds.1 +; LOWER_LDS-NOT: @lds.2 +; LOWER_LDS-NOT: @lds.3 +; LOWER_LDS: %llvm.amdgcn.module.lds.t = type { [2 x i64], i16, i16 } +; LOWER_LDS: %llvm.amdgcn.kernel.k1.lds.t = type { i64, i32 } +; LOWER_LDS: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t undef, align 16 +; LOWER_LDS: @llvm.compiler.used = appending global [1 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds to i8 addrspace(3)*) to i8*)], section "llvm.metadata" +; LOWER_LDS: @llvm.amdgcn.kernel.k1.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k1.lds.t undef, align 8 +; LOWER_LDS: @alias.to.lds.3 = alias [2 x i64], getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0) + +@lds.1 = addrspace(3) global i32 undef, align 4 +@lds.2 = addrspace(3) global i64 undef, align 8 +@lds.3 = addrspace(3) global [2 x i64] undef, align 16 +@alias.to.lds.3 = alias [2 x 
i64], [2 x i64] addrspace(3)* @lds.3 + +; POINTER-REPLACE-LABEL: @f1 +; POINTER-REPLACE: %1 = load i16, i16 addrspace(3)* @lds.1.ptr, align 2 +; POINTER-REPLACE: %2 = getelementptr i8, i8 addrspace(3)* null, i16 %1 +; POINTER-REPLACE: %3 = bitcast i8 addrspace(3)* %2 to i32 addrspace(3)* +; POINTER-REPLACE: store i32 7, i32 addrspace(3)* %3, align 4 +; POINTER-REPLACE: ret void + + +; LOWER_LDS-LABEL: @f1 +; LOWER_LDS: %1 = load i16, i16 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1), align 2 +; LOWER_LDS: %2 = getelementptr i8, i8 addrspace(3)* null, i16 %1 +; LOWER_LDS: %3 = bitcast i8 addrspace(3)* %2 to i32 addrspace(3)* +; LOWER_LDS: store i32 7, i32 addrspace(3)* %3, align 4 +; LOWER_LDS: ret void + + +; GCN-LABEL: f1: +; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN: v_mov_b32_e32 v0, 0 +; GCN: ds_read_i16 v0, v0 offset:16 +; GCN: v_mov_b32_e32 v1, 7 +; GCN: s_waitcnt lgkmcnt(0) +; GCN: ds_write_b32 v0, v1 +; GCN: s_waitcnt lgkmcnt(0) +; GCN: s_setpc_b64 s[30:31] +define void @f1() { + store i32 7, i32 addrspace(3)* @lds.1 + ret void +} + +; POINTER-REPLACE-LABEL: @f2 +; POINTER-REPLACE: %1 = load i16, i16 addrspace(3)* @lds.2.ptr, align 2 +; POINTER-REPLACE: %2 = getelementptr i8, i8 addrspace(3)* null, i16 %1 +; POINTER-REPLACE: %3 = bitcast i8 addrspace(3)* %2 to i64 addrspace(3)* +; POINTER-REPLACE: store i64 15, i64 addrspace(3)* %3, align 4 +; POINTER-REPLACE: ret void + + +; LOWER_LDS-LABEL: @f2 +; LOWER_LDS: %1 = load i16, i16 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 2), align 2 +; LOWER_LDS: %2 = getelementptr i8, i8 addrspace(3)* null, i16 %1 +; LOWER_LDS: %3 = bitcast i8 addrspace(3)* %2 to i64 addrspace(3)* +; LOWER_LDS: store i64 15, i64 addrspace(3)* %3, align 4 +; LOWER_LDS: ret void + + +; GCN-LABEL: f2: +; GCN: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GCN: v_mov_b32_e32 v1, 0 +; GCN: ds_read_i16 v2, v1 offset:18 +; GCN: v_mov_b32_e32 v0, 15 +; GCN: s_waitcnt lgkmcnt(0) +; GCN: ds_write_b64 v2, v[0:1] +; GCN: s_waitcnt lgkmcnt(0) +; GCN: s_setpc_b64 s[30:31] +define void @f2() { + store i64 15, i64 addrspace(3)* @lds.2 + ret void +} + +; POINTER-REPLACE-LABEL: @k1 +; POINTER-REPLACE: %1 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) +; POINTER-REPLACE: %2 = icmp eq i32 %1, 0 +; POINTER-REPLACE: br i1 %2, label %3, label %4 +; +; POINTER-REPLACE-LABEL: 3: +; POINTER-REPLACE: store i16 ptrtoint (i64 addrspace(3)* @lds.2 to i16), i16 addrspace(3)* @lds.2.ptr, align 2 +; POINTER-REPLACE: store i16 ptrtoint (i32 addrspace(3)* @lds.1 to i16), i16 addrspace(3)* @lds.1.ptr, align 2 +; POINTER-REPLACE: br label %4 +; +; POINTER-REPLACE-LABEL: 4: +; POINTER-REPLACE: call void @llvm.amdgcn.wave.barrier() +; POINTER-REPLACE: %bc = bitcast [2 x i64] addrspace(3)* @alias.to.lds.3 to i8 addrspace(3)* +; POINTER-REPLACE: store i8 3, i8 addrspace(3)* %bc, align 2 +; POINTER-REPLACE: call void @f1() +; POINTER-REPLACE: call void @f2() +; POINTER-REPLACE: ret void + + +; LOWER_LDS-LABEL: @k1 +; LOWER_LDS: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ] +; LOWER_LDS: %1 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) +; LOWER_LDS: %2 = icmp eq i32 %1, 0 +; LOWER_LDS: br i1 %2, label %3, label %6 +; +; LOWER_LDS-LABEL: 3: +; LOWER_LDS: %4 = ptrtoint i64 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, %llvm.amdgcn.kernel.k1.lds.t addrspace(3)* @llvm.amdgcn.kernel.k1.lds, i32 0, i32 0) to i16 +; LOWER_LDS: store i16 %4, i16 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 2), align 2 +; LOWER_LDS: %5 = ptrtoint i32 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, %llvm.amdgcn.kernel.k1.lds.t addrspace(3)* 
@llvm.amdgcn.kernel.k1.lds, i32 0, i32 1) to i16 +; LOWER_LDS: store i16 %5, i16 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1), align 2 +; LOWER_LDS: br label %6 +; +; LOWER_LDS-LABEL: 6: +; LOWER_LDS: call void @llvm.amdgcn.wave.barrier() +; LOWER_LDS: %bc = bitcast [2 x i64] addrspace(3)* @alias.to.lds.3 to i8 addrspace(3)* +; LOWER_LDS: store i8 3, i8 addrspace(3)* %bc, align 2 +; LOWER_LDS: call void @f1() +; LOWER_LDS: call void @f2() +; LOWER_LDS: ret void + + +; GCN-LABEL: k1: +; GCN: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GCN: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GCN: s_mov_b32 s10, -1 +; GCN: s_mov_b32 s11, 0xe00000 +; GCN: s_add_u32 s8, s8, s1 +; GCN: v_mbcnt_lo_u32_b32 v0, -1, 0 +; GCN: s_addc_u32 s9, s9, 0 +; GCN: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN: s_mov_b32 s32, 0 +; GCN: s_and_saveexec_b64 s[0:1], vcc +; GCN: s_cbranch_execz BB2_2 +; GCN: v_mov_b32_e32 v0, 24 +; GCN: v_mov_b32_e32 v1, 0 +; GCN: ds_write_b16 v1, v0 offset:18 +; GCN: v_mov_b32_e32 v0, 32 +; GCN: ds_write_b16 v1, v0 offset:16 +; GCN-LABEL: BB2_2: +; GCN: s_or_b64 exec, exec, s[0:1] +; GCN: s_getpc_b64 s[0:1] +; GCN: s_add_u32 s0, s0, f1@gotpcrel32@lo+4 +; GCN: s_addc_u32 s1, s1, f1@gotpcrel32@hi+12 +; GCN: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GCN: s_mov_b64 s[0:1], s[8:9] +; GCN: s_mov_b64 s[2:3], s[10:11] +; GCN: v_mov_b32_e32 v0, alias.to.lds.3@abs32@lo +; GCN: v_mov_b32_e32 v1, 3 +; ; wave barrier +; GCN: ds_write_b8 v0, v1 +; GCN: s_waitcnt lgkmcnt(0) +; GCN: s_swappc_b64 s[30:31], s[4:5] +; GCN: s_getpc_b64 s[0:1] +; GCN: s_add_u32 s0, s0, f2@gotpcrel32@lo+4 +; GCN: s_addc_u32 s1, s1, f2@gotpcrel32@hi+12 +; GCN: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GCN: s_mov_b64 s[0:1], s[8:9] +; GCN: s_mov_b64 s[2:3], s[10:11] +; GCN: s_waitcnt lgkmcnt(0) +; GCN: s_swappc_b64 s[30:31], s[4:5] +; GCN: s_endpgm +define amdgpu_kernel void @k1() { + %bc = bitcast [2 x i64] addrspace(3)* @alias.to.lds.3 
to i8 addrspace(3)* + store i8 3, i8 addrspace(3)* %bc, align 2 + call void @f1() + call void @f2() + ret void +} diff --git a/test/CodeGen/AMDGPU/replace-lds-by-ptr-use-multiple-lds.ll b/test/CodeGen/AMDGPU/replace-lds-by-ptr-use-multiple-lds.ll new file mode 100644 index 00000000000..314f9094ce5 --- /dev/null +++ b/test/CodeGen/AMDGPU/replace-lds-by-ptr-use-multiple-lds.ll @@ -0,0 +1,66 @@ +; RUN: opt -S -mtriple=amdgcn-- -amdgpu-replace-lds-use-with-pointer < %s | FileCheck %s + +; DESCRIPTION: +; +; There are three lds globals defined here, and these three lds are used within a single +; non-kernel function, and this non-kernel function is reachable from kernel. Hence pointer +; replacement is required for all three lds globals. +; + +; Original LDS should exist. +; CHECK: @lds1 = internal addrspace(3) global [1 x i32] undef, align 4 +; CHECK: @lds2 = internal addrspace(3) global [2 x i32] undef, align 4 +; CHECK: @lds3 = internal addrspace(3) global [3 x i32] undef, align 4 +@lds1 = internal addrspace(3) global [1 x i32] undef, align 4 +@lds2 = internal addrspace(3) global [2 x i32] undef, align 4 +@lds3 = internal addrspace(3) global [3 x i32] undef, align 4 + +; Pointers should be created. +; CHECK: @lds1.ptr = internal unnamed_addr addrspace(3) global i16 undef, align 2 +; CHECK: @lds2.ptr = internal unnamed_addr addrspace(3) global i16 undef, align 2 +; CHECK: @lds3.ptr = internal unnamed_addr addrspace(3) global i16 undef, align 2 + +; Pointer replacement code should be added. 
+define internal void @function() { +; CHECK-LABEL: entry: +; CHECK: %0 = load i16, i16 addrspace(3)* @lds3.ptr, align 2 +; CHECK: %1 = getelementptr i8, i8 addrspace(3)* null, i16 %0 +; CHECK: %2 = bitcast i8 addrspace(3)* %1 to [3 x i32] addrspace(3)* +; CHECK: %3 = load i16, i16 addrspace(3)* @lds2.ptr, align 2 +; CHECK: %4 = getelementptr i8, i8 addrspace(3)* null, i16 %3 +; CHECK: %5 = bitcast i8 addrspace(3)* %4 to [2 x i32] addrspace(3)* +; CHECK: %6 = load i16, i16 addrspace(3)* @lds1.ptr, align 2 +; CHECK: %7 = getelementptr i8, i8 addrspace(3)* null, i16 %6 +; CHECK: %8 = bitcast i8 addrspace(3)* %7 to [1 x i32] addrspace(3)* +; CHECK: %gep1 = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* %8, i32 0, i32 0 +; CHECK: %gep2 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(3)* %5, i32 0, i32 0 +; CHECK: %gep3 = getelementptr inbounds [3 x i32], [3 x i32] addrspace(3)* %2, i32 0, i32 0 +; CHECK: ret void +entry: + %gep1 = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* @lds1, i32 0, i32 0 + %gep2 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(3)* @lds2, i32 0, i32 0 + %gep3 = getelementptr inbounds [3 x i32], [3 x i32] addrspace(3)* @lds3, i32 0, i32 0 + ret void +} + +; Pointer initialization code should be added. +define protected amdgpu_kernel void @kernel() { +; CHECK-LABEL: entry: +; CHECK: %0 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) +; CHECK: %1 = icmp eq i32 %0, 0 +; CHECK: br i1 %1, label %2, label %3 +; +; CHECK-LABEL: 2: +; CHECK: store i16 ptrtoint ([3 x i32] addrspace(3)* @lds3 to i16), i16 addrspace(3)* @lds3.ptr, align 2 +; CHECK: store i16 ptrtoint ([2 x i32] addrspace(3)* @lds2 to i16), i16 addrspace(3)* @lds2.ptr, align 2 +; CHECK: store i16 ptrtoint ([1 x i32] addrspace(3)* @lds1 to i16), i16 addrspace(3)* @lds1.ptr, align 2 +; CHECK: br label %3 +; +; CHECK-LABEL: 3: +; CHECK: call void @llvm.amdgcn.wave.barrier() +; CHECK: call void @function() +; CHECK: ret void +entry: + call void @function() + 
ret void +} diff --git a/test/CodeGen/AMDGPU/replace-lds-by-ptr-use-same-lds.ll b/test/CodeGen/AMDGPU/replace-lds-by-ptr-use-same-lds.ll new file mode 100644 index 00000000000..453b5fd73ee --- /dev/null +++ b/test/CodeGen/AMDGPU/replace-lds-by-ptr-use-same-lds.ll @@ -0,0 +1,53 @@ +; RUN: opt -S -mtriple=amdgcn-- -amdgpu-replace-lds-use-with-pointer < %s | FileCheck %s + +; DESCRIPTION: +; +; There is one lds global defined here, and this lds is used within a single non-kernel +; function multiple times, and this non-kernel function is reachable from kernel. Hence +; pointer replacement takes place. But the important point is that the store to / load from +; the pointer should happen only once, irrespective of the number of uses. +; + +; Original LDS should exist. +; CHECK: @lds1 = internal addrspace(3) global [1 x i32] undef, align 4 +@lds1 = internal addrspace(3) global [1 x i32] undef, align 4 + +; Pointers should be created. +; CHECK: @lds1.ptr = internal unnamed_addr addrspace(3) global i16 undef, align 2 + +; Pointer replacement code should be added. 
+define internal void @function() { +; CHECK-LABEL: entry: +; CHECK: %0 = load i16, i16 addrspace(3)* @lds1.ptr, align 2 +; CHECK: %1 = getelementptr i8, i8 addrspace(3)* null, i16 %0 +; CHECK: %2 = bitcast i8 addrspace(3)* %1 to [1 x i32] addrspace(3)* +; CHECK: %gep1 = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* %2, i32 0, i32 0 +; CHECK: %gep2 = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* %2, i32 0, i32 0 +; CHECK: %gep3 = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* %2, i32 0, i32 0 +; CHECK: ret void +entry: + %gep1 = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* @lds1, i32 0, i32 0 + %gep2 = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* @lds1, i32 0, i32 0 + %gep3 = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* @lds1, i32 0, i32 0 + ret void +} + +; Pointer initialization code should be added. +define protected amdgpu_kernel void @kernel() { +; CHECK-LABEL: entry: +; CHECK: %0 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) +; CHECK: %1 = icmp eq i32 %0, 0 +; CHECK: br i1 %1, label %2, label %3 +; +; CHECK-LABEL: 2: +; CHECK: store i16 ptrtoint ([1 x i32] addrspace(3)* @lds1 to i16), i16 addrspace(3)* @lds1.ptr, align 2 +; CHECK: br label %3 +; +; CHECK-LABEL: 3: +; CHECK: call void @llvm.amdgcn.wave.barrier() +; CHECK: call void @function() +; CHECK: ret void +entry: + call void @function() + ret void +} diff --git a/test/CodeGen/AMDGPU/replace-lds-by-ptr-use-within-const-expr1.ll b/test/CodeGen/AMDGPU/replace-lds-by-ptr-use-within-const-expr1.ll new file mode 100644 index 00000000000..763af6a1b5a --- /dev/null +++ b/test/CodeGen/AMDGPU/replace-lds-by-ptr-use-within-const-expr1.ll @@ -0,0 +1,54 @@ +; RUN: opt -S -mtriple=amdgcn-- -amdgpu-replace-lds-use-with-pointer < %s | FileCheck %s + +; DESCRIPTION: +; +; There is one lds global defined here, and this lds is used within a single non-kernel +; function, as an operand of nested constant expression, and this non-kernel function is 
+; reachable from kernel. Hence nested constant expression should be converted into a +; series of instructions and pointer replacement should take place. +; + +; Original LDS should exist. +; CHECK: @used_only_within_func = addrspace(3) global [4 x i32] undef, align 4 +@used_only_within_func = addrspace(3) global [4 x i32] undef, align 4 + +; Pointers should be created. +; CHECK: @used_only_within_func.ptr = internal unnamed_addr addrspace(3) global i16 undef, align 2 + +; Pointer replacement code should be added. +define void @f0(i32 %x) { +; CHECK-LABEL: entry: +; CHECK: %0 = load i16, i16 addrspace(3)* @used_only_within_func.ptr, align 2 +; CHECK: %1 = getelementptr i8, i8 addrspace(3)* null, i16 %0 +; CHECK: %2 = bitcast i8 addrspace(3)* %1 to [4 x i32] addrspace(3)* +; CHECK: %3 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* %2, i32 0, i32 0 +; CHECK: %4 = addrspacecast i32 addrspace(3)* %3 to i32* +; CHECK: %5 = ptrtoint i32* %4 to i64 +; CHECK: %6 = add i64 %5, %5 +; CHECK: %7 = inttoptr i64 %6 to i32* +; CHECK: store i32 %x, i32* %7, align 4 +; CHECK: ret void +entry: + store i32 %x, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast ([4 x i32] addrspace(3)* @used_only_within_func to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast ([4 x i32] addrspace(3)* @used_only_within_func to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4 + ret void +} + +; Pointer initialization code should be added +define amdgpu_kernel void @k0() { +; CHECK-LABEL: entry: +; CHECK: %0 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) +; CHECK: %1 = icmp eq i32 %0, 0 +; CHECK: br i1 %1, label %2, label %3 +; +; CHECK-LABEL: 2: +; CHECK: store i16 ptrtoint ([4 x i32] addrspace(3)* @used_only_within_func to i16), i16 addrspace(3)* @used_only_within_func.ptr, align 2 +; CHECK: br label %3 +; +; CHECK-LABEL: 3: +; CHECK: call void @llvm.amdgcn.wave.barrier() +; CHECK: call void @f0(i32 
0) +; CHECK: ret void +entry: + call void @f0(i32 0) + ret void +} diff --git a/test/CodeGen/AMDGPU/replace-lds-by-ptr-use-within-const-expr2.ll b/test/CodeGen/AMDGPU/replace-lds-by-ptr-use-within-const-expr2.ll new file mode 100644 index 00000000000..24bcee35b8e --- /dev/null +++ b/test/CodeGen/AMDGPU/replace-lds-by-ptr-use-within-const-expr2.ll @@ -0,0 +1,58 @@ +; RUN: opt -S -mtriple=amdgcn-- -amdgpu-replace-lds-use-with-pointer < %s | FileCheck %s + +; DESCRIPTION: +; There is one lds global defined here, and this lds is used within a single non-kernel +; function, as an operand of nested constant expression, and this non-kernel function is +; reachable from kernel. Hence nested constant expression should be converted into a +; series of instructions and pointer replacement should take place. But, important note +; is - only constant expression operands which use lds should be converted into +; instructions, other constant expression operands which do not use lds should be left +; untouched. +; + +; Original LDS should exist. +; CHECK: @lds_used_within_function = internal addrspace(3) global [4 x i32] undef, align 4 +@lds_used_within_function = internal addrspace(3) global [4 x i32] undef, align 4 + +; Non-LDS global should exist as it is. +; CHECK: @global_var = internal addrspace(1) global [4 x i32] undef, align 4 +@global_var = internal addrspace(1) global [4 x i32] undef, align 4 + +; Pointer should be created. +; CHECK: @lds_used_within_function.ptr = internal unnamed_addr addrspace(3) global i16 undef, align 2 + +; Pointer replacement code should be added. 
+define internal void @function() { +; CHECK-LABEL: entry: +; CHECK: %0 = load i16, i16 addrspace(3)* @lds_used_within_function.ptr, align 2 +; CHECK: %1 = getelementptr i8, i8 addrspace(3)* null, i16 %0 +; CHECK: %2 = bitcast i8 addrspace(3)* %1 to [4 x i32] addrspace(3)* +; CHECK: %3 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* %2, i32 0, i32 2 +; CHECK: %4 = addrspacecast i32 addrspace(3)* %3 to i32* +; CHECK: %5 = ptrtoint i32* %4 to i32 +; CHECK: %6 = add i32 %5, ptrtoint (i32 addrspace(1)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(1)* @global_var, i32 0, i32 2) to i32) +; CHECK: ret void +entry: + %0 = add i32 ptrtoint (i32* addrspacecast (i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @lds_used_within_function, i32 0, i32 2) to i32*) to i32), ptrtoint (i32 addrspace(1)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(1)* @global_var, i32 0, i32 2) to i32) + ret void +} + +; Pointer initialization code should be added +define protected amdgpu_kernel void @kernel() { +; CHECK-LABEL: entry: +; CHECK: %0 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) +; CHECK: %1 = icmp eq i32 %0, 0 +; CHECK: br i1 %1, label %2, label %3 +; +; CHECK-LABEL: 2: +; CHECK: store i16 ptrtoint ([4 x i32] addrspace(3)* @lds_used_within_function to i16), i16 addrspace(3)* @lds_used_within_function.ptr, align 2 +; CHECK: br label %3 +; +; CHECK-LABEL: 3: +; CHECK: call void @llvm.amdgcn.wave.barrier() +; CHECK: call void @function() +; CHECK: ret void +entry: + call void @function() + ret void +} diff --git a/test/CodeGen/AMDGPU/replace-lds-by-ptr-use-within-phi-inst.ll b/test/CodeGen/AMDGPU/replace-lds-by-ptr-use-within-phi-inst.ll new file mode 100644 index 00000000000..616439ac655 --- /dev/null +++ b/test/CodeGen/AMDGPU/replace-lds-by-ptr-use-within-phi-inst.ll @@ -0,0 +1,93 @@ +; RUN: opt -S -mtriple=amdgcn-- -amdgpu-replace-lds-use-with-pointer < %s | FileCheck %s + +; DESCRIPTION: +; +; Replace lds globals used within 
phi instruction. +; + +; Original LDS should exist. +; CHECK: @lds.1 = addrspace(3) global i32 undef, align 4 +; CHECK: @lds.2 = addrspace(3) global i32 undef, align 4 +@lds.1 = addrspace(3) global i32 undef, align 4 +@lds.2 = addrspace(3) global i32 undef, align 4 + +; Pointers should be created. +; CHECK: @lds.1.ptr = internal unnamed_addr addrspace(3) global i16 undef, align 2 +; CHECK: @lds.2.ptr = internal unnamed_addr addrspace(3) global i16 undef, align 2 + +define void @f0(i32 %arg) { +; CHECK-LABEL: bb: +; CHECK: %0 = load i16, i16 addrspace(3)* @lds.2.ptr, align 2 +; CHECK: %1 = getelementptr i8, i8 addrspace(3)* null, i16 %0 +; CHECK: %2 = bitcast i8 addrspace(3)* %1 to i32 addrspace(3)* +; CHECK: %3 = load i16, i16 addrspace(3)* @lds.1.ptr, align 2 +; CHECK: %4 = getelementptr i8, i8 addrspace(3)* null, i16 %3 +; CHECK: %5 = bitcast i8 addrspace(3)* %4 to i32 addrspace(3)* +; CHECK: %id = call i32 @llvm.amdgcn.workitem.id.x() +; CHECK: %my.tmp = sub i32 %id, %arg +; CHECK: br label %bb1 +bb: + %id = call i32 @llvm.amdgcn.workitem.id.x() + %my.tmp = sub i32 %id, %arg + br label %bb1 + +; CHECK-LABEL: bb1: +; CHECK: %lsr.iv = phi i32 [ undef, %bb ], [ %my.tmp2, %Flow ] +; CHECK: %6 = icmp ne i32 addrspace(3)* inttoptr (i32 4 to i32 addrspace(3)*), %5 +; CHECK: %lsr.iv.next = add i32 %lsr.iv, 1 +; CHECK: %cmp0 = icmp slt i32 %lsr.iv.next, 0 +; CHECK: br i1 %cmp0, label %bb4, label %Flow +bb1: + %lsr.iv = phi i32 [ undef, %bb ], [ %my.tmp2, %Flow ] + %lsr.iv.next = add i32 %lsr.iv, 1 + %cmp0 = icmp slt i32 %lsr.iv.next, 0 + br i1 %cmp0, label %bb4, label %Flow + +; CHECK-LABEL: bb4: +; CHECK: %load = load volatile i32, i32 addrspace(1)* undef, align 4 +; CHECK: %cmp1 = icmp sge i32 %my.tmp, %load +; CHECK: br label %Flow +bb4: + %load = load volatile i32, i32 addrspace(1)* undef, align 4 + %cmp1 = icmp sge i32 %my.tmp, %load + br label %Flow + +; CHECK-LABEL: Flow: +; CHECK: %my.tmp2 = phi i32 [ %lsr.iv.next, %bb4 ], [ undef, %bb1 ] +; CHECK: %my.tmp3 = phi 
i32 addrspace(3)* [ %2, %bb4 ], [ %5, %bb1 ] +; CHECK: %my.tmp4 = phi i1 [ %cmp1, %bb4 ], [ %6, %bb1 ] +; CHECK: br i1 %my.tmp4, label %bb9, label %bb1 +Flow: + %my.tmp2 = phi i32 [ %lsr.iv.next, %bb4 ], [ undef, %bb1 ] + %my.tmp3 = phi i32 addrspace(3)* [@lds.2, %bb4 ], [ @lds.1, %bb1 ] + %my.tmp4 = phi i1 [ %cmp1, %bb4 ], [ icmp ne (i32 addrspace(3)* inttoptr (i32 4 to i32 addrspace(3)*), i32 addrspace(3)* @lds.1), %bb1 ] + br i1 %my.tmp4, label %bb9, label %bb1 + +; CHECK-LABEL: bb9: +; CHECK: store volatile i32 7, i32 addrspace(3)* undef, align 4 +; CHECK: ret void +bb9: + store volatile i32 7, i32 addrspace(3)* undef + ret void +} + +; CHECK-LABEL: @k0 +; CHECK: %1 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) +; CHECK: %2 = icmp eq i32 %1, 0 +; CHECK: br i1 %2, label %3, label %4 +; +; CHECK-LABEL: 3: +; CHECK: store i16 ptrtoint (i32 addrspace(3)* @lds.2 to i16), i16 addrspace(3)* @lds.2.ptr, align 2 +; CHECK: store i16 ptrtoint (i32 addrspace(3)* @lds.1 to i16), i16 addrspace(3)* @lds.1.ptr, align 2 +; CHECK: br label %4 +; +; CHECK-LABEL: 4: +; CHECK: call void @llvm.amdgcn.wave.barrier() +; CHECK: call void @f0(i32 %arg) +; CHECK: ret void +define amdgpu_kernel void @k0(i32 %arg) { + call void @f0(i32 %arg) + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x()