mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-11-22 18:54:02 +01:00
[AMDGPU] Replace non-kernel function uses of LDS globals by pointers.
The main motivation behind replacing LDS uses within non-kernel functions by pointers is to *prevent* the subsequent LDS lowering pass from directly packing LDS (assume large LDS) into a struct type, which would otherwise cause a huge amount of memory to be allocated for the struct instance within every kernel. Reviewed By: rampitec Differential Revision: https://reviews.llvm.org/D103225
This commit is contained in:
parent
0317b20934
commit
37c462f96a
@ -71,6 +71,7 @@ FunctionPass *createAMDGPUMachineCFGStructurizerPass();
|
||||
FunctionPass *createAMDGPUPropagateAttributesEarlyPass(const TargetMachine *);
|
||||
ModulePass *createAMDGPUPropagateAttributesLatePass(const TargetMachine *);
|
||||
FunctionPass *createAMDGPURewriteOutArgumentsPass();
|
||||
ModulePass *createAMDGPUReplaceLDSUseWithPointerPass();
|
||||
ModulePass *createAMDGPULowerModuleLDSPass();
|
||||
FunctionPass *createSIModeRegisterPass();
|
||||
|
||||
@ -146,6 +147,14 @@ private:
|
||||
TargetMachine &TM;
|
||||
};
|
||||
|
||||
void initializeAMDGPUReplaceLDSUseWithPointerPass(PassRegistry &);
|
||||
extern char &AMDGPUReplaceLDSUseWithPointerID;
|
||||
|
||||
/// New pass manager form of the "replace LDS use with pointer" pass: a module
/// pass whose run() is invoked with the module and its analysis manager.
struct AMDGPUReplaceLDSUseWithPointerPass
|
||||
: PassInfoMixin<AMDGPUReplaceLDSUseWithPointerPass> {
|
||||
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
|
||||
};
|
||||
|
||||
void initializeAMDGPULowerModuleLDSPass(PassRegistry &);
|
||||
extern char &AMDGPULowerModuleLDSID;
|
||||
|
||||
|
@ -24,6 +24,13 @@
|
||||
// A possible future refinement is to specialise the structure per-kernel, so
|
||||
// that fields can be elided based on more expensive analysis.
|
||||
//
|
||||
// NOTE: Since this pass will directly pack LDS (assume large LDS) into a struct
|
||||
// type which would cause allocating huge memory for struct instance within
|
||||
// every kernel. Hence, before running this pass, it is advisable to run the
|
||||
// pass "amdgpu-replace-lds-use-with-pointer" which will replace LDS uses within
|
||||
// non-kernel functions by pointers and thereby minimizes the unnecessary per
|
||||
// kernel allocation of LDS memory.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "AMDGPU.h"
|
||||
|
460
lib/Target/AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp
Normal file
460
lib/Target/AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp
Normal file
@ -0,0 +1,460 @@
|
||||
//===-- AMDGPUReplaceLDSUseWithPointer.cpp --------------------------------===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// This pass replaces all the uses of LDS within non-kernel functions by
|
||||
// corresponding pointer counter-parts.
|
||||
//
|
||||
// The main motivation behind this pass is - to *avoid* subsequent LDS lowering
|
||||
// pass from directly packing LDS (assume large LDS) into a struct type which
|
||||
// would otherwise cause allocating huge memory for struct instance within every
|
||||
// kernel.
|
||||
//
|
||||
// Brief sketch of the algorithm implemented in this pass is as below:
|
||||
//
|
||||
// 1. Collect all the LDS defined in the module which qualify for pointer
|
||||
// replacement, say it is, LDSGlobals set.
|
||||
//
|
||||
// 2. Collect all the reachable callees for each kernel defined in the module,
|
||||
// say it is, KernelToCallees map.
|
||||
//
|
||||
// 3. FOR (each global GV from LDSGlobals set) DO
|
||||
// LDSUsedNonKernels = Collect all non-kernel functions which use GV.
|
||||
// FOR (each kernel K in KernelToCallees map) DO
|
||||
// ReachableCallees = KernelToCallees[K]
|
||||
// ReachableAndLDSUsedCallees =
|
||||
// SetIntersect(LDSUsedNonKernels, ReachableCallees)
|
||||
// IF (ReachableAndLDSUsedCallees is not empty) THEN
|
||||
// Pointer = Create a pointer to point-to GV if not created.
|
||||
// Initialize Pointer to point-to GV within kernel K.
|
||||
// ENDIF
|
||||
// ENDFOR
|
||||
// Replace all uses of GV within non kernel functions by Pointer.
|
||||
// ENDFOR
|
||||
//
|
||||
// LLVM IR example:
|
||||
//
|
||||
// Input IR:
|
||||
//
|
||||
// @lds = internal addrspace(3) global [4 x i32] undef, align 16
|
||||
//
|
||||
// define internal void @f0() {
|
||||
// entry:
|
||||
// %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @lds,
|
||||
// i32 0, i32 0
|
||||
// ret void
|
||||
// }
|
||||
//
|
||||
// define protected amdgpu_kernel void @k0() {
|
||||
// entry:
|
||||
// call void @f0()
|
||||
// ret void
|
||||
// }
|
||||
//
|
||||
// Output IR:
|
||||
//
|
||||
// @lds = internal addrspace(3) global [4 x i32] undef, align 16
|
||||
// @lds.ptr = internal unnamed_addr addrspace(3) global i16 undef, align 2
|
||||
//
|
||||
// define internal void @f0() {
|
||||
// entry:
|
||||
// %0 = load i16, i16 addrspace(3)* @lds.ptr, align 2
|
||||
// %1 = getelementptr i8, i8 addrspace(3)* null, i16 %0
|
||||
// %2 = bitcast i8 addrspace(3)* %1 to [4 x i32] addrspace(3)*
|
||||
// %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* %2,
|
||||
// i32 0, i32 0
|
||||
// ret void
|
||||
// }
|
||||
//
|
||||
// define protected amdgpu_kernel void @k0() {
|
||||
// entry:
|
||||
// store i16 ptrtoint ([4 x i32] addrspace(3)* @lds to i16),
|
||||
// i16 addrspace(3)* @lds.ptr, align 2
|
||||
// call void @f0()
|
||||
// ret void
|
||||
// }
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "AMDGPU.h"
|
||||
#include "GCNSubtarget.h"
|
||||
#include "Utils/AMDGPUBaseInfo.h"
|
||||
#include "Utils/AMDGPULDSUtils.h"
|
||||
#include "llvm/ADT/DenseMap.h"
|
||||
#include "llvm/ADT/STLExtras.h"
|
||||
#include "llvm/ADT/SetOperations.h"
|
||||
#include "llvm/CodeGen/TargetPassConfig.h"
|
||||
#include "llvm/IR/Constants.h"
|
||||
#include "llvm/IR/DerivedTypes.h"
|
||||
#include "llvm/IR/IRBuilder.h"
|
||||
#include "llvm/IR/InlineAsm.h"
|
||||
#include "llvm/IR/Instructions.h"
|
||||
#include "llvm/IR/IntrinsicsAMDGPU.h"
|
||||
#include "llvm/IR/ReplaceConstant.h"
|
||||
#include "llvm/InitializePasses.h"
|
||||
#include "llvm/Pass.h"
|
||||
#include "llvm/Support/Debug.h"
|
||||
#include "llvm/Target/TargetMachine.h"
|
||||
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
|
||||
#include "llvm/Transforms/Utils/ModuleUtils.h"
|
||||
#include <algorithm>
|
||||
#include <vector>
|
||||
|
||||
#define DEBUG_TYPE "amdgpu-replace-lds-use-with-pointer"
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
namespace {
|
||||
|
||||
class ReplaceLDSUseImpl {
|
||||
Module &M;
|
||||
LLVMContext &Ctx;
|
||||
const DataLayout &DL;
|
||||
Constant *LDSMemBaseAddr;
|
||||
|
||||
DenseMap<GlobalVariable *, GlobalVariable *> LDSToPointer;
|
||||
DenseMap<GlobalVariable *, SmallPtrSet<Function *, 8>> LDSToNonKernels;
|
||||
DenseMap<Function *, SmallPtrSet<Function *, 8>> KernelToCallees;
|
||||
DenseMap<Function *, SmallPtrSet<GlobalVariable *, 8>> KernelToLDSPointers;
|
||||
DenseMap<Function *, BasicBlock *> KernelToInitBB;
|
||||
DenseMap<Function *, DenseMap<GlobalVariable *, Value *>>
|
||||
FunctionToLDSToReplaceInst;
|
||||
|
||||
// Collect LDS which requires their uses to be replaced by pointer.
|
||||
std::vector<GlobalVariable *> collectLDSRequiringPointerReplace() {
|
||||
// Collect LDS which requires module lowering.
|
||||
std::vector<GlobalVariable *> LDSGlobals = AMDGPU::findVariablesToLower(M);
|
||||
|
||||
// Remove LDS which don't qualify for replacement.
|
||||
LDSGlobals.erase(std::remove_if(LDSGlobals.begin(), LDSGlobals.end(),
|
||||
[&](GlobalVariable *GV) {
|
||||
return shouldIgnorePointerReplacement(GV);
|
||||
}),
|
||||
LDSGlobals.end());
|
||||
|
||||
return LDSGlobals;
|
||||
}
|
||||
|
||||
// Returns true if uses of given LDS global within non-kernel functions should
|
||||
// be keep as it is without pointer replacement.
|
||||
bool shouldIgnorePointerReplacement(GlobalVariable *GV) {
|
||||
// LDS whose size is very small and doesn`t exceed pointer size is not worth
|
||||
// replacing.
|
||||
if (DL.getTypeAllocSize(GV->getValueType()) <= 2)
|
||||
return true;
|
||||
|
||||
// LDS which is not used from non-kernel function scope or it is used from
|
||||
// global scope does not qualify for replacement.
|
||||
LDSToNonKernels[GV] = AMDGPU::collectNonKernelAccessorsOfLDS(GV);
|
||||
return LDSToNonKernels[GV].empty();
|
||||
|
||||
// FIXME: When GV is used within all (or within most of the kernels), then
|
||||
// it does not make sense to create a pointer for it.
|
||||
}
|
||||
|
||||
// Insert new global LDS pointer which points to LDS.
|
||||
GlobalVariable *createLDSPointer(GlobalVariable *GV) {
|
||||
// LDS pointer which points to LDS is already created? return it.
|
||||
auto PointerEntry = LDSToPointer.insert(std::make_pair(GV, nullptr));
|
||||
if (!PointerEntry.second)
|
||||
return PointerEntry.first->second;
|
||||
|
||||
// We need to create new LDS pointer which points to LDS.
|
||||
//
|
||||
// Each CU owns at max 64K of LDS memory, so LDS address ranges from 0 to
|
||||
// 2^16 - 1. Hence 16 bit pointer is enough to hold the LDS address.
|
||||
auto *I16Ty = Type::getInt16Ty(Ctx);
|
||||
GlobalVariable *LDSPointer = new GlobalVariable(
|
||||
M, I16Ty, false, GlobalValue::InternalLinkage, UndefValue::get(I16Ty),
|
||||
GV->getName() + Twine(".ptr"), nullptr, GlobalVariable::NotThreadLocal,
|
||||
AMDGPUAS::LOCAL_ADDRESS);
|
||||
|
||||
LDSPointer->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
|
||||
LDSPointer->setAlignment(AMDGPU::getAlign(DL, LDSPointer));
|
||||
|
||||
// Mark that an associated LDS pointer is created for LDS.
|
||||
LDSToPointer[GV] = LDSPointer;
|
||||
|
||||
return LDSPointer;
|
||||
}
|
||||
|
||||
// Split entry basic block in such a way that only lane 0 of each wave does
|
||||
// the LDS pointer initialization, and return newly created basic block.
|
||||
BasicBlock *activateLaneZero(Function *K) {
|
||||
// If the entry basic block of kernel K is already splitted, then return
|
||||
// newly created basic block.
|
||||
auto BasicBlockEntry = KernelToInitBB.insert(std::make_pair(K, nullptr));
|
||||
if (!BasicBlockEntry.second)
|
||||
return BasicBlockEntry.first->second;
|
||||
|
||||
// Split entry basic block of kernel K.
|
||||
auto *EI = &(*(K->getEntryBlock().getFirstInsertionPt()));
|
||||
IRBuilder<> Builder(EI);
|
||||
|
||||
Value *Mbcnt =
|
||||
Builder.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},
|
||||
{Builder.getInt32(-1), Builder.getInt32(0)});
|
||||
Value *Cond = Builder.CreateICmpEQ(Mbcnt, Builder.getInt32(0));
|
||||
Instruction *WB = cast<Instruction>(
|
||||
Builder.CreateIntrinsic(Intrinsic::amdgcn_wave_barrier, {}, {}));
|
||||
|
||||
BasicBlock *NBB = SplitBlockAndInsertIfThen(Cond, WB, false)->getParent();
|
||||
|
||||
// Mark that the entry basic block of kernel K is splitted.
|
||||
KernelToInitBB[K] = NBB;
|
||||
|
||||
return NBB;
|
||||
}
|
||||
|
||||
// Within given kernel, initialize given LDS pointer to point to given LDS.
|
||||
void initializeLDSPointer(Function *K, GlobalVariable *GV,
|
||||
GlobalVariable *LDSPointer) {
|
||||
// If LDS pointer is already initialized within K, then nothing to do.
|
||||
auto PointerEntry = KernelToLDSPointers.insert(
|
||||
std::make_pair(K, SmallPtrSet<GlobalVariable *, 8>()));
|
||||
if (!PointerEntry.second)
|
||||
if (PointerEntry.first->second.contains(LDSPointer))
|
||||
return;
|
||||
|
||||
// Insert instructions at EI which initialize LDS pointer to point-to LDS
|
||||
// within kernel K.
|
||||
//
|
||||
// That is, convert pointer type of GV to i16, and then store this converted
|
||||
// i16 value within LDSPointer which is of type i16*.
|
||||
auto *EI = &(*(activateLaneZero(K)->getFirstInsertionPt()));
|
||||
IRBuilder<> Builder(EI);
|
||||
Builder.CreateStore(Builder.CreatePtrToInt(GV, Type::getInt16Ty(Ctx)),
|
||||
LDSPointer);
|
||||
|
||||
// Mark that LDS pointer is initialized within kernel K.
|
||||
KernelToLDSPointers[K].insert(LDSPointer);
|
||||
}
|
||||
|
||||
// We have created an LDS pointer for LDS, and initialized it to point-to LDS
|
||||
// within all relevent kernels. Now replace all the uses of LDS within
|
||||
// non-kernel functions by LDS pointer.
|
||||
void replaceLDSUseByPointer(GlobalVariable *GV, GlobalVariable *LDSPointer) {
|
||||
SmallVector<User *, 8> LDSUsers(GV->users());
|
||||
for (auto *U : LDSUsers) {
|
||||
// When `U` is a constant expression, it is possible that same constant
|
||||
// expression exists within multiple instructions, and within multiple
|
||||
// non-kernel functions. Collect all those non-kernel functions and all
|
||||
// those instructions within which `U` exist.
|
||||
auto FunctionToInsts =
|
||||
AMDGPU::getFunctionToInstsMap(U, false /*=CollectKernelInsts*/);
|
||||
|
||||
for (auto FI = FunctionToInsts.begin(), FE = FunctionToInsts.end();
|
||||
FI != FE; ++FI) {
|
||||
Function *F = FI->first;
|
||||
auto &Insts = FI->second;
|
||||
for (auto *I : Insts) {
|
||||
// If `U` is a constant expression, then we need to break the
|
||||
// associated instruction into a set of separate instructions by
|
||||
// converting constant expressions into instructions.
|
||||
SmallPtrSet<Instruction *, 8> UserInsts;
|
||||
|
||||
if (U == I) {
|
||||
// `U` is an instruction, conversion from constant expression to
|
||||
// set of instructions is *not* required.
|
||||
UserInsts.insert(I);
|
||||
} else {
|
||||
// `U` is a constant expression, convert it into corresponding set
|
||||
// of instructions.
|
||||
auto *CE = cast<ConstantExpr>(U);
|
||||
convertConstantExprsToInstructions(I, CE, &UserInsts);
|
||||
}
|
||||
|
||||
// Go through all the user instrutions, if LDS exist within them as an
|
||||
// operand, then replace it by replace instruction.
|
||||
for (auto *II : UserInsts) {
|
||||
auto *ReplaceInst = getReplacementInst(F, GV, LDSPointer);
|
||||
II->replaceUsesOfWith(GV, ReplaceInst);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Create a set of replacement instructions which together replace LDS within
|
||||
// non-kernel function F by accessing LDS indirectly using LDS pointer.
|
||||
Value *getReplacementInst(Function *F, GlobalVariable *GV,
|
||||
GlobalVariable *LDSPointer) {
|
||||
// If the instruction which replaces LDS within F is already created, then
|
||||
// return it.
|
||||
auto LDSEntry = FunctionToLDSToReplaceInst.insert(
|
||||
std::make_pair(F, DenseMap<GlobalVariable *, Value *>()));
|
||||
if (!LDSEntry.second) {
|
||||
auto ReplaceInstEntry =
|
||||
LDSEntry.first->second.insert(std::make_pair(GV, nullptr));
|
||||
if (!ReplaceInstEntry.second)
|
||||
return ReplaceInstEntry.first->second;
|
||||
}
|
||||
|
||||
// Get the instruction insertion point within the beginning of the entry
|
||||
// block of current non-kernel function.
|
||||
auto *EI = &(*(F->getEntryBlock().getFirstInsertionPt()));
|
||||
IRBuilder<> Builder(EI);
|
||||
|
||||
// Insert required set of instructions which replace LDS within F.
|
||||
auto *V = Builder.CreateBitCast(
|
||||
Builder.CreateGEP(
|
||||
LDSMemBaseAddr,
|
||||
Builder.CreateLoad(LDSPointer->getValueType(), LDSPointer)),
|
||||
GV->getType());
|
||||
|
||||
// Mark that the replacement instruction which replace LDS within F is
|
||||
// created.
|
||||
FunctionToLDSToReplaceInst[F][GV] = V;
|
||||
|
||||
return V;
|
||||
}
|
||||
|
||||
public:
|
||||
// Bind the implementation to module M and precompute the LDS base address,
// i.e. the integer 0 expressed as an i8 pointer in the LOCAL address space.
ReplaceLDSUseImpl(Module &M)
    : M(M), Ctx(M.getContext()), DL(M.getDataLayout()),
      LDSMemBaseAddr(Constant::getIntegerValue(
          PointerType::get(Type::getInt8Ty(M.getContext()),
                           AMDGPUAS::LOCAL_ADDRESS),
          APInt(32, 0))) {}
|
||||
|
||||
// Entry-point function which interface ReplaceLDSUseImpl with outside of the
|
||||
// class.
|
||||
bool replaceLDSUse();
|
||||
|
||||
private:
|
||||
// For a given LDS from collected LDS globals set, replace its non-kernel
|
||||
// function scope uses by pointer.
|
||||
bool replaceLDSUse(GlobalVariable *GV);
|
||||
};
|
||||
|
||||
// For given LDS from collected LDS globals set, replace its non-kernel function
|
||||
// scope uses by pointer.
|
||||
bool ReplaceLDSUseImpl::replaceLDSUse(GlobalVariable *GV) {
|
||||
// Holds all those non-kernel functions within which LDS is being accessed.
|
||||
SmallPtrSet<Function *, 8> &LDSAccessors = LDSToNonKernels[GV];
|
||||
|
||||
// The LDS pointer which points to LDS and replaces all the uses of LDS.
|
||||
GlobalVariable *LDSPointer = nullptr;
|
||||
|
||||
// Traverse through each kernel K, check and if required, initialize the
|
||||
// LDS pointer to point to LDS within K.
|
||||
for (auto KI = KernelToCallees.begin(), KE = KernelToCallees.end(); KI != KE;
|
||||
++KI) {
|
||||
Function *K = KI->first;
|
||||
SmallPtrSet<Function *, 8> Callees = KI->second;
|
||||
|
||||
// Compute reachable and LDS used callees for kernel K.
|
||||
set_intersect(Callees, LDSAccessors);
|
||||
|
||||
// None of the LDS accessing non-kernel functions are reachable from
|
||||
// kernel K. Hence, no need to initialize LDS pointer within kernel K.
|
||||
if (Callees.empty())
|
||||
continue;
|
||||
|
||||
// We have found reachable and LDS used callees for kernel K, and we need to
|
||||
// initialize LDS pointer within kernel K, and we need to replace LDS use
|
||||
// within those callees by LDS pointer.
|
||||
//
|
||||
// But, first check if LDS pointer is already created, if not create one.
|
||||
LDSPointer = createLDSPointer(GV);
|
||||
|
||||
// Initialize LDS pointer to point to LDS within kernel K.
|
||||
initializeLDSPointer(K, GV, LDSPointer);
|
||||
}
|
||||
|
||||
// We have not found reachable and LDS used callees for any of the kernels,
|
||||
// and hence we have not created LDS pointer.
|
||||
if (!LDSPointer)
|
||||
return false;
|
||||
|
||||
// We have created an LDS pointer for LDS, and initialized it to point-to LDS
|
||||
// within all relevent kernels. Now replace all the uses of LDS within
|
||||
// non-kernel functions by LDS pointer.
|
||||
replaceLDSUseByPointer(GV, LDSPointer);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// Entry-point function which interface ReplaceLDSUseImpl with outside of the
|
||||
// class.
|
||||
bool ReplaceLDSUseImpl::replaceLDSUse() {
|
||||
// Collect LDS which requires their uses to be replaced by pointer.
|
||||
std::vector<GlobalVariable *> LDSGlobals =
|
||||
collectLDSRequiringPointerReplace();
|
||||
|
||||
// No LDS to pointer-replace. Nothing to do.
|
||||
if (LDSGlobals.empty())
|
||||
return false;
|
||||
|
||||
// Collect reachable callee set for each kernel defined in the module.
|
||||
AMDGPU::collectReachableCallees(M, KernelToCallees);
|
||||
|
||||
if (KernelToCallees.empty()) {
|
||||
// Either module does not have any kernel definitions, or none of the kernel
|
||||
// has a call to non-kernel functions, or we could not resolve any of the
|
||||
// call sites to proper non-kernel functions, because of the situations like
|
||||
// inline asm calls. Nothing to replace.
|
||||
return false;
|
||||
}
|
||||
|
||||
// For every LDS from collected LDS globals set, replace its non-kernel
|
||||
// function scope use by pointer.
|
||||
bool Changed = false;
|
||||
for (auto *GV : LDSGlobals)
|
||||
Changed |= replaceLDSUse(GV);
|
||||
|
||||
return Changed;
|
||||
}
|
||||
|
||||
class AMDGPUReplaceLDSUseWithPointer : public ModulePass {
|
||||
public:
|
||||
static char ID;
|
||||
|
||||
AMDGPUReplaceLDSUseWithPointer() : ModulePass(ID) {
|
||||
initializeAMDGPUReplaceLDSUseWithPointerPass(
|
||||
*PassRegistry::getPassRegistry());
|
||||
}
|
||||
|
||||
bool runOnModule(Module &M) override;
|
||||
|
||||
void getAnalysisUsage(AnalysisUsage &AU) const override {
|
||||
AU.addRequired<TargetPassConfig>();
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace
|
||||
|
||||
char AMDGPUReplaceLDSUseWithPointer::ID = 0;
|
||||
char &llvm::AMDGPUReplaceLDSUseWithPointerID =
|
||||
AMDGPUReplaceLDSUseWithPointer::ID;
|
||||
|
||||
INITIALIZE_PASS_BEGIN(
|
||||
AMDGPUReplaceLDSUseWithPointer, DEBUG_TYPE,
|
||||
"Replace within non-kernel function use of LDS with pointer",
|
||||
false /*only look at the cfg*/, false /*analysis pass*/)
|
||||
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
|
||||
INITIALIZE_PASS_END(
|
||||
AMDGPUReplaceLDSUseWithPointer, DEBUG_TYPE,
|
||||
"Replace within non-kernel function use of LDS with pointer",
|
||||
false /*only look at the cfg*/, false /*analysis pass*/)
|
||||
|
||||
// Run the replacement over module M; returns true if M was modified.
bool AMDGPUReplaceLDSUseWithPointer::runOnModule(Module &M) {
  return ReplaceLDSUseImpl(M).replaceLDSUse();
}
|
||||
|
||||
// Factory for the legacy pass manager form of the pass; the returned pass is
// heap allocated and handed over to the caller.
ModulePass *llvm::createAMDGPUReplaceLDSUseWithPointerPass() {
  ModulePass *Pass = new AMDGPUReplaceLDSUseWithPointer();
  return Pass;
}
|
||||
|
||||
// New pass manager entry point: run the replacement over module M.
//
// The replacement may create new globals, split kernel entry blocks and
// rewrite instructions within non-kernel functions, so analyses may only be
// reported as preserved when nothing was changed. Previously
// PreservedAnalyses::all() was returned unconditionally, which could leave
// stale analysis results cached after the module had been modified.
PreservedAnalyses
AMDGPUReplaceLDSUseWithPointerPass::run(Module &M, ModuleAnalysisManager &AM) {
  ReplaceLDSUseImpl LDSUseReplacer{M};
  if (!LDSUseReplacer.replaceLDSUse())
    return PreservedAnalyses::all();
  return PreservedAnalyses::none();
}
|
@ -193,6 +193,11 @@ static cl::opt<bool> EnableStructurizerWorkarounds(
|
||||
cl::desc("Enable workarounds for the StructurizeCFG pass"), cl::init(true),
|
||||
cl::Hidden);
|
||||
|
||||
static cl::opt<bool> EnableLDSReplaceWithPointer(
|
||||
"amdgpu-enable-lds-replace-with-pointer",
|
||||
cl::desc("Enable LDS replace with pointer pass"), cl::init(true),
|
||||
cl::Hidden);
|
||||
|
||||
static cl::opt<bool, true> EnableLowerModuleLDS(
|
||||
"amdgpu-enable-lower-module-lds", cl::desc("Enable lower module lds pass"),
|
||||
cl::location(AMDGPUTargetMachine::EnableLowerModuleLDS), cl::init(true),
|
||||
@ -240,6 +245,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
|
||||
initializeAMDGPULateCodeGenPreparePass(*PR);
|
||||
initializeAMDGPUPropagateAttributesEarlyPass(*PR);
|
||||
initializeAMDGPUPropagateAttributesLatePass(*PR);
|
||||
initializeAMDGPUReplaceLDSUseWithPointerPass(*PR);
|
||||
initializeAMDGPULowerModuleLDSPass(*PR);
|
||||
initializeAMDGPURewriteOutArgumentsPass(*PR);
|
||||
initializeAMDGPUUnifyMetadataPass(*PR);
|
||||
@ -505,6 +511,10 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
|
||||
PM.addPass(AMDGPUAlwaysInlinePass());
|
||||
return true;
|
||||
}
|
||||
if (PassName == "amdgpu-replace-lds-use-with-pointer") {
|
||||
PM.addPass(AMDGPUReplaceLDSUseWithPointerPass());
|
||||
return true;
|
||||
}
|
||||
if (PassName == "amdgpu-lower-module-lds") {
|
||||
PM.addPass(AMDGPULowerModuleLDSPass());
|
||||
return true;
|
||||
@ -889,8 +899,15 @@ void AMDGPUPassConfig::addIRPasses() {
|
||||
addPass(createAMDGPUOpenCLEnqueuedBlockLoweringPass());
|
||||
|
||||
// Can increase LDS used by kernel so runs before PromoteAlloca
|
||||
if (EnableLowerModuleLDS)
|
||||
if (EnableLowerModuleLDS) {
|
||||
// The pass "amdgpu-replace-lds-use-with-pointer" needs to be run before the
|
||||
// pass "amdgpu-lower-module-lds", and it is also required to be run only if
|
||||
// "amdgpu-lower-module-lds" pass is enabled.
|
||||
if (EnableLDSReplaceWithPointer)
|
||||
addPass(createAMDGPUReplaceLDSUseWithPointerPass());
|
||||
|
||||
addPass(createAMDGPULowerModuleLDSPass());
|
||||
}
|
||||
|
||||
if (TM.getOptLevel() > CodeGenOpt::None) {
|
||||
addPass(createInferAddressSpacesPass());
|
||||
|
@ -81,6 +81,7 @@ add_llvm_target(AMDGPUCodeGen
|
||||
AMDGPUPropagateAttributes.cpp
|
||||
AMDGPURegBankCombiner.cpp
|
||||
AMDGPURegisterBankInfo.cpp
|
||||
AMDGPUReplaceLDSUseWithPointer.cpp
|
||||
AMDGPURewriteOutArguments.cpp
|
||||
AMDGPUSubtarget.cpp
|
||||
AMDGPUTargetMachine.cpp
|
||||
|
@ -12,7 +12,9 @@
|
||||
|
||||
#include "AMDGPULDSUtils.h"
|
||||
#include "Utils/AMDGPUBaseInfo.h"
|
||||
#include "llvm/ADT/SCCIterator.h"
|
||||
#include "llvm/ADT/SetVector.h"
|
||||
#include "llvm/Analysis/CallGraph.h"
|
||||
#include "llvm/IR/Constants.h"
|
||||
#include "llvm/IR/ReplaceConstant.h"
|
||||
|
||||
@ -22,6 +24,189 @@ namespace llvm {
|
||||
|
||||
namespace AMDGPU {
|
||||
|
||||
// A helper class for collecting all reachable callees for each kernel defined
|
||||
// within the module.
|
||||
class CollectReachableCallees {
|
||||
Module &M;
|
||||
CallGraph CG;
|
||||
SmallPtrSet<CallGraphNode *, 8> AddressTakenFunctions;
|
||||
|
||||
// Collect all address taken functions within the module.
|
||||
void collectAddressTakenFunctions() {
|
||||
auto *ECNode = CG.getExternalCallingNode();
|
||||
|
||||
for (auto GI = ECNode->begin(), GE = ECNode->end(); GI != GE; ++GI) {
|
||||
auto *CGN = GI->second;
|
||||
auto *F = CGN->getFunction();
|
||||
if (!F || F->isDeclaration() || AMDGPU::isKernelCC(F))
|
||||
continue;
|
||||
AddressTakenFunctions.insert(CGN);
|
||||
}
|
||||
}
|
||||
|
||||
// For a given caller node, collect all reachable callee nodes.
|
||||
SmallPtrSet<CallGraphNode *, 8> collectCGNodes(CallGraphNode *CGN) {
|
||||
SmallPtrSet<CallGraphNode *, 8> CGNodes;
|
||||
|
||||
for (scc_iterator<CallGraphNode *> I = scc_begin(CGN); !I.isAtEnd(); ++I) {
|
||||
const std::vector<CallGraphNode *> &SCC = *I;
|
||||
assert(!SCC.empty() && "SCC with no functions?");
|
||||
for (auto *CGNode : SCC)
|
||||
CGNodes.insert(CGNode);
|
||||
}
|
||||
|
||||
return CGNodes;
|
||||
}
|
||||
|
||||
// For given kernel, collect all its reachable non-kernel functions.
|
||||
SmallPtrSet<Function *, 8> collectReachableCallees(Function *K) {
|
||||
SmallPtrSet<Function *, 8> ReachableCallees;
|
||||
|
||||
// Call graph node which represents this kernel.
|
||||
auto *KCGN = CG[K];
|
||||
|
||||
// Collect all reachable call graph nodes from the node representing this
|
||||
// kernel.
|
||||
SmallPtrSet<CallGraphNode *, 8> CGNodes = collectCGNodes(KCGN);
|
||||
|
||||
// Go through collected reachable nodes, visit all thier call sites, if the
|
||||
// call site is direct, add corresponding callee to reachable callee set, if
|
||||
// it is indirect, resolve the indirect call site to potential reachable
|
||||
// callees, add them to reachable callee set, and repeat the process for the
|
||||
// newly added potential callee nodes.
|
||||
//
|
||||
// FIXME: Need to handle bit-casted function pointers.
|
||||
//
|
||||
SmallVector<CallGraphNode *, 8> CGNStack(CGNodes.begin(), CGNodes.end());
|
||||
SmallPtrSet<CallGraphNode *, 8> VisitedCGNodes;
|
||||
while (!CGNStack.empty()) {
|
||||
auto *CGN = CGNStack.pop_back_val();
|
||||
|
||||
if (!VisitedCGNodes.insert(CGN).second)
|
||||
continue;
|
||||
|
||||
for (auto GI = CGN->begin(), GE = CGN->end(); GI != GE; ++GI) {
|
||||
auto *RCB = cast<CallBase>(GI->first.getValue());
|
||||
auto *RCGN = GI->second;
|
||||
|
||||
if (auto *DCallee = RCGN->getFunction()) {
|
||||
ReachableCallees.insert(DCallee);
|
||||
} else if (RCB->isIndirectCall()) {
|
||||
auto *RCBFTy = RCB->getFunctionType();
|
||||
for (auto *ACGN : AddressTakenFunctions) {
|
||||
auto *ACallee = ACGN->getFunction();
|
||||
if (ACallee->getFunctionType() == RCBFTy) {
|
||||
ReachableCallees.insert(ACallee);
|
||||
SmallPtrSet<CallGraphNode *, 8> IGCNNodes = collectCGNodes(ACGN);
|
||||
for (auto *IGCN : IGCNNodes)
|
||||
CGNStack.push_back(IGCN);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return ReachableCallees;
|
||||
}
|
||||
|
||||
public:
|
||||
// Build the call graph of module M and precompute the set of address taken
// functions used for resolving indirect calls.
explicit CollectReachableCallees(Module &M) : M(M), CG(M) {
  collectAddressTakenFunctions();
}
|
||||
|
||||
// Fill KernelToCallees with one entry per function of the module which has a
// kernel calling convention, mapping it to its reachable callee set.
void collectReachableCallees(
    DenseMap<Function *, SmallPtrSet<Function *, 8>> &KernelToCallees) {
  for (Function &F : M.functions()) {
    if (AMDGPU::isKernelCC(&F))
      KernelToCallees[&F] = collectReachableCallees(&F);
  }
}
|
||||
};
|
||||
|
||||
// Collect the reachable callees of each kernel of module M into
// KernelToCallees, using a throwaway CollectReachableCallees helper.
void collectReachableCallees(
    Module &M,
    DenseMap<Function *, SmallPtrSet<Function *, 8>> &KernelToCallees) {
  CollectReachableCallees(M).collectReachableCallees(KernelToCallees);
}
|
||||
|
||||
// Collect the non-kernel functions which use the LDS global GV, either
// directly or through constant users. The result is empty when GV is (even
// transitively through constants) used by a global value.
SmallPtrSet<Function *, 8> collectNonKernelAccessorsOfLDS(GlobalVariable *GV) {
  SmallPtrSet<Function *, 8> Accessors;
  SmallPtrSet<User *, 8> Seen;
  SmallVector<User *, 8> Worklist(GV->users());

  while (!Worklist.empty()) {
    User *Current = Worklist.pop_back_val();

    // Each user is processed only once.
    if (!Seen.insert(Current).second)
      continue;

    // A global value which uses the LDS, e.g. a global variable initialized
    // with it, disqualifies the LDS altogether.
    if (isa<GlobalValue>(Current))
      return SmallPtrSet<Function *, 8>();

    if (isa<Constant>(Current)) {
      // Look through other constants to their own users.
      append_range(Worklist, Current->users());
    } else {
      // What remains should be an instruction; record its function unless
      // that function is a kernel.
      Function *Parent = cast<Instruction>(Current)->getFunction();
      if (!AMDGPU::isKernelCC(Parent))
        Accessors.insert(Parent);
    }
  }

  return Accessors;
}
|
||||
|
||||
// Group by function the instructions which user U belongs to. U may be an
// instruction itself, or a constant which is (possibly through other
// constants) used within instructions. When CollectKernelInsts is true only
// instructions within kernels are collected, otherwise only instructions
// within non-kernel functions. Users which are global values are skipped.
DenseMap<Function *, SmallPtrSet<Instruction *, 8>>
getFunctionToInstsMap(User *U, bool CollectKernelInsts) {
  DenseMap<Function *, SmallPtrSet<Instruction *, 8>> InstsByFunction;
  SmallPtrSet<User *, 8> Seen;
  SmallVector<User *, 8> Worklist;
  Worklist.push_back(U);

  while (!Worklist.empty()) {
    User *Current = Worklist.pop_back_val();

    // Skip users which were handled already, and global values.
    if (!Seen.insert(Current).second || isa<GlobalValue>(Current))
      continue;

    // Look through other constants to their own users.
    if (isa<Constant>(Current)) {
      append_range(Worklist, Current->users());
      continue;
    }

    // Keep the instruction only if the kind of its parent function (kernel or
    // non-kernel) is the kind which was asked for.
    auto *Inst = cast<Instruction>(Current);
    Function *Parent = Inst->getFunction();
    if (AMDGPU::isKernelCC(Parent) != CollectKernelInsts)
      continue;

    InstsByFunction[Parent].insert(Inst);
  }

  return InstsByFunction;
}
|
||||
|
||||
bool isKernelCC(const Function *Func) {
|
||||
return AMDGPU::isModuleEntryFunctionCC(Func->getCallingConv());
|
||||
}
|
||||
|
@ -14,6 +14,8 @@
|
||||
#define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULDSUTILS_H
|
||||
|
||||
#include "AMDGPU.h"
|
||||
#include "llvm/ADT/DenseMap.h"
|
||||
#include "llvm/IR/Constants.h"
|
||||
|
||||
namespace llvm {
|
||||
|
||||
@ -21,6 +23,24 @@ class ConstantExpr;
|
||||
|
||||
namespace AMDGPU {
|
||||
|
||||
/// Collect reachable callees for each kernel defined in the module \p M and
|
||||
/// return collected callees at \p KernelToCallees.
|
||||
void collectReachableCallees(
|
||||
Module &M,
|
||||
DenseMap<Function *, SmallPtrSet<Function *, 8>> &KernelToCallees);
|
||||
|
||||
/// For the given LDS global \p GV, visit all its users and collect all
|
||||
/// non-kernel functions within which \p GV is used and return collected list of
|
||||
/// such non-kernel functions.
|
||||
SmallPtrSet<Function *, 8> collectNonKernelAccessorsOfLDS(GlobalVariable *GV);
|
||||
|
||||
/// Collect all the instructions where user \p U belongs to. \p U could be
|
||||
/// instruction itself or it could be a constant expression which is used within
|
||||
/// an instruction. If \p CollectKernelInsts is true, collect instructions only
|
||||
/// from kernels, otherwise collect instructions only from non-kernel functions.
|
||||
DenseMap<Function *, SmallPtrSet<Instruction *, 8>>
|
||||
getFunctionToInstsMap(User *U, bool CollectKernelInsts);
|
||||
|
||||
bool isKernelCC(const Function *Func);
|
||||
|
||||
Align getAlign(DataLayout const &DL, const GlobalVariable *GV);
|
||||
|
@ -42,6 +42,7 @@
|
||||
; GCN-O0-NEXT: Inliner for always_inline functions
|
||||
; GCN-O0-NEXT: A No-Op Barrier Pass
|
||||
; GCN-O0-NEXT: Lower OpenCL enqueued blocks
|
||||
; GCN-O0-NEXT: Replace within non-kernel function use of LDS with pointer
|
||||
; GCN-O0-NEXT: Lower uses of LDS variables from non-kernel functions
|
||||
; GCN-O0-NEXT: FunctionPass Manager
|
||||
; GCN-O0-NEXT: Dominator Tree Construction
|
||||
@ -192,6 +193,7 @@
|
||||
; GCN-O1-NEXT: Inliner for always_inline functions
|
||||
; GCN-O1-NEXT: A No-Op Barrier Pass
|
||||
; GCN-O1-NEXT: Lower OpenCL enqueued blocks
|
||||
; GCN-O1-NEXT: Replace within non-kernel function use of LDS with pointer
|
||||
; GCN-O1-NEXT: Lower uses of LDS variables from non-kernel functions
|
||||
; GCN-O1-NEXT: FunctionPass Manager
|
||||
; GCN-O1-NEXT: Infer address spaces
|
||||
@ -438,6 +440,7 @@
|
||||
; GCN-O1-OPTS-NEXT: Inliner for always_inline functions
|
||||
; GCN-O1-OPTS-NEXT: A No-Op Barrier Pass
|
||||
; GCN-O1-OPTS-NEXT: Lower OpenCL enqueued blocks
|
||||
; GCN-O1-OPTS-NEXT: Replace within non-kernel function use of LDS with pointer
|
||||
; GCN-O1-OPTS-NEXT: Lower uses of LDS variables from non-kernel functions
|
||||
; GCN-O1-OPTS-NEXT: FunctionPass Manager
|
||||
; GCN-O1-OPTS-NEXT: Infer address spaces
|
||||
@ -717,6 +720,7 @@
|
||||
; GCN-O2-NEXT: Inliner for always_inline functions
|
||||
; GCN-O2-NEXT: A No-Op Barrier Pass
|
||||
; GCN-O2-NEXT: Lower OpenCL enqueued blocks
|
||||
; GCN-O2-NEXT: Replace within non-kernel function use of LDS with pointer
|
||||
; GCN-O2-NEXT: Lower uses of LDS variables from non-kernel functions
|
||||
; GCN-O2-NEXT: FunctionPass Manager
|
||||
; GCN-O2-NEXT: Infer address spaces
|
||||
@ -997,6 +1001,7 @@
|
||||
; GCN-O3-NEXT: Inliner for always_inline functions
|
||||
; GCN-O3-NEXT: A No-Op Barrier Pass
|
||||
; GCN-O3-NEXT: Lower OpenCL enqueued blocks
|
||||
; GCN-O3-NEXT: Replace within non-kernel function use of LDS with pointer
|
||||
; GCN-O3-NEXT: Lower uses of LDS variables from non-kernel functions
|
||||
; GCN-O3-NEXT: FunctionPass Manager
|
||||
; GCN-O3-NEXT: Infer address spaces
|
||||
|
88
test/CodeGen/AMDGPU/replace-lds-by-ptr-call-diamond-shape.ll
Normal file
88
test/CodeGen/AMDGPU/replace-lds-by-ptr-call-diamond-shape.ll
Normal file
@ -0,0 +1,88 @@
|
||||
; RUN: opt -S -mtriple=amdgcn-- -amdgpu-replace-lds-use-with-pointer < %s | FileCheck %s
|
||||
|
||||
; DESCRIPTION:
|
||||
;
|
||||
; The lds global @lds_used_within_func is used within non-kernel function @func_uses_lds
|
||||
; which is reachable from kernel @kernel_reaches_lds, hence pointer replacement takes place
|
||||
; for @lds_used_within_func.
|
||||
;
|
||||
|
||||
; Original LDS should exist.
|
||||
; CHECK: @lds_used_within_func = internal addrspace(3) global [4 x i32] undef, align 4
|
||||
@lds_used_within_func = internal addrspace(3) global [4 x i32] undef, align 4
|
||||
|
||||
; Pointer should be created.
|
||||
; CHECK: @lds_used_within_func.ptr = internal unnamed_addr addrspace(3) global i16 undef, align 2
|
||||
|
||||
; Pointer replacement code should be added.
|
||||
define internal void @func_uses_lds() {
|
||||
; CHECK-LABEL: entry:
|
||||
; CHECK: %0 = load i16, i16 addrspace(3)* @lds_used_within_func.ptr, align 2
|
||||
; CHECK: %1 = getelementptr i8, i8 addrspace(3)* null, i16 %0
|
||||
; CHECK: %2 = bitcast i8 addrspace(3)* %1 to [4 x i32] addrspace(3)*
|
||||
; CHECK: %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* %2, i32 0, i32 0
|
||||
; CHECK: ret void
|
||||
entry:
|
||||
%gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @lds_used_within_func, i32 0, i32 0
|
||||
ret void
|
||||
}
|
||||
|
||||
; No change
|
||||
define internal void @func_does_not_use_lds_3() {
|
||||
; CHECK-LABEL: entry:
|
||||
; CHECK: call void @func_uses_lds()
|
||||
; CHECK: ret void
|
||||
entry:
|
||||
call void @func_uses_lds()
|
||||
ret void
|
||||
}
|
||||
|
||||
; No change
|
||||
define internal void @func_does_not_use_lds_2() {
|
||||
; CHECK-LABEL: entry:
|
||||
; CHECK: call void @func_uses_lds()
|
||||
; CHECK: ret void
|
||||
entry:
|
||||
call void @func_uses_lds()
|
||||
ret void
|
||||
}
|
||||
|
||||
; No change
|
||||
define internal void @func_does_not_use_lds_1() {
|
||||
; CHECK-LABEL: entry:
|
||||
; CHECK: call void @func_does_not_use_lds_2()
|
||||
; CHECK: call void @func_does_not_use_lds_3()
|
||||
; CHECK: ret void
|
||||
entry:
|
||||
call void @func_does_not_use_lds_2()
|
||||
call void @func_does_not_use_lds_3()
|
||||
ret void
|
||||
}
|
||||
|
||||
; Pointer initialization code should be added
|
||||
define protected amdgpu_kernel void @kernel_reaches_lds() {
|
||||
; CHECK-LABEL: entry:
|
||||
; CHECK: %0 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
|
||||
; CHECK: %1 = icmp eq i32 %0, 0
|
||||
; CHECK: br i1 %1, label %2, label %3
|
||||
;
|
||||
; CHECK-LABEL: 2:
|
||||
; CHECK: store i16 ptrtoint ([4 x i32] addrspace(3)* @lds_used_within_func to i16), i16 addrspace(3)* @lds_used_within_func.ptr, align 2
|
||||
; CHECK: br label %3
|
||||
;
|
||||
; CHECK-LABEL: 3:
|
||||
; CHECK: call void @llvm.amdgcn.wave.barrier()
|
||||
; CHECK: call void @func_does_not_use_lds_1()
|
||||
; CHECK: ret void
|
||||
entry:
|
||||
call void @func_does_not_use_lds_1()
|
||||
ret void
|
||||
}
|
||||
|
||||
; No change here since this kernel does not reach @func_uses_lds which uses lds.
|
||||
define protected amdgpu_kernel void @kernel_does_not_reach_lds() {
|
||||
; CHECK-LABEL: entry:
|
||||
; CHECK: ret void
|
||||
entry:
|
||||
ret void
|
||||
}
|
@ -0,0 +1,130 @@
|
||||
; RUN: opt -S -mtriple=amdgcn-- -amdgpu-replace-lds-use-with-pointer < %s | FileCheck %s
|
||||
|
||||
; DESCRIPTION:
|
||||
;
|
||||
; There are three lds globals defined here, and these three lds are used respectively within
|
||||
; three non-kernel functions. There are three kernels, which call two of the non-kernel functions.
|
||||
; Hence pointer replacement should take place for all three lds, and pointer initialization within
|
||||
; kernel should selectively happen depending on which lds is reachable from the kernel.
|
||||
;
|
||||
|
||||
; Original LDS should exist.
|
||||
; CHECK: @lds_used_within_function_1 = internal addrspace(3) global [1 x i32] undef, align 4
|
||||
; CHECK: @lds_used_within_function_2 = internal addrspace(3) global [2 x i32] undef, align 4
|
||||
; CHECK: @lds_used_within_function_3 = internal addrspace(3) global [3 x i32] undef, align 4
|
||||
@lds_used_within_function_1 = internal addrspace(3) global [1 x i32] undef, align 4
|
||||
@lds_used_within_function_2 = internal addrspace(3) global [2 x i32] undef, align 4
|
||||
@lds_used_within_function_3 = internal addrspace(3) global [3 x i32] undef, align 4
|
||||
|
||||
; Pointers should be created.
|
||||
; CHECK: @lds_used_within_function_1.ptr = internal unnamed_addr addrspace(3) global i16 undef, align 2
|
||||
; CHECK: @lds_used_within_function_2.ptr = internal unnamed_addr addrspace(3) global i16 undef, align 2
|
||||
; CHECK: @lds_used_within_function_3.ptr = internal unnamed_addr addrspace(3) global i16 undef, align 2
|
||||
|
||||
; Pointer replacement code should be added.
|
||||
define internal void @function_3() {
|
||||
; CHECK-LABEL: entry:
|
||||
; CHECK: %0 = load i16, i16 addrspace(3)* @lds_used_within_function_3.ptr, align 2
|
||||
; CHECK: %1 = getelementptr i8, i8 addrspace(3)* null, i16 %0
|
||||
; CHECK: %2 = bitcast i8 addrspace(3)* %1 to [3 x i32] addrspace(3)*
|
||||
; CHECK: %gep = getelementptr inbounds [3 x i32], [3 x i32] addrspace(3)* %2, i32 0, i32 0
|
||||
; CHECK: ret void
|
||||
entry:
|
||||
%gep = getelementptr inbounds [3 x i32], [3 x i32] addrspace(3)* @lds_used_within_function_3, i32 0, i32 0
|
||||
ret void
|
||||
}
|
||||
|
||||
; Pointer replacement code should be added.
|
||||
define internal void @function_2() {
|
||||
; CHECK-LABEL: entry:
|
||||
; CHECK: %0 = load i16, i16 addrspace(3)* @lds_used_within_function_2.ptr, align 2
|
||||
; CHECK: %1 = getelementptr i8, i8 addrspace(3)* null, i16 %0
|
||||
; CHECK: %2 = bitcast i8 addrspace(3)* %1 to [2 x i32] addrspace(3)*
|
||||
; CHECK: %gep = getelementptr inbounds [2 x i32], [2 x i32] addrspace(3)* %2, i32 0, i32 0
|
||||
; CHECK: ret void
|
||||
entry:
|
||||
%gep = getelementptr inbounds [2 x i32], [2 x i32] addrspace(3)* @lds_used_within_function_2, i32 0, i32 0
|
||||
ret void
|
||||
}
|
||||
|
||||
; Pointer replacement code should be added.
|
||||
define internal void @function_1() {
|
||||
; CHECK-LABEL: entry:
|
||||
; CHECK: %0 = load i16, i16 addrspace(3)* @lds_used_within_function_1.ptr, align 2
|
||||
; CHECK: %1 = getelementptr i8, i8 addrspace(3)* null, i16 %0
|
||||
; CHECK: %2 = bitcast i8 addrspace(3)* %1 to [1 x i32] addrspace(3)*
|
||||
; CHECK: %gep = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* %2, i32 0, i32 0
|
||||
; CHECK: ret void
|
||||
entry:
|
||||
%gep = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* @lds_used_within_function_1, i32 0, i32 0
|
||||
ret void
|
||||
}
|
||||
|
||||
; Pointer initialization code should be added
|
||||
define protected amdgpu_kernel void @kernel_calls_function_3_and_1() {
|
||||
; CHECK-LABEL: entry:
|
||||
; CHECK: %0 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
|
||||
; CHECK: %1 = icmp eq i32 %0, 0
|
||||
; CHECK: br i1 %1, label %2, label %3
|
||||
;
|
||||
; CHECK-LABEL: 2:
|
||||
; CHECK: store i16 ptrtoint ([3 x i32] addrspace(3)* @lds_used_within_function_3 to i16), i16 addrspace(3)* @lds_used_within_function_3.ptr, align 2
|
||||
; CHECK: store i16 ptrtoint ([1 x i32] addrspace(3)* @lds_used_within_function_1 to i16), i16 addrspace(3)* @lds_used_within_function_1.ptr, align 2
|
||||
; CHECK: br label %3
|
||||
;
|
||||
; CHECK-LABEL: 3:
|
||||
; CHECK: call void @llvm.amdgcn.wave.barrier()
|
||||
; CHECK: call void @function_3()
|
||||
; CHECK: call void @function_1()
|
||||
; CHECK: ret void
|
||||
entry:
|
||||
call void @function_3()
|
||||
call void @function_1()
|
||||
ret void
|
||||
}
|
||||
|
||||
; Pointer initialization code should be added
|
||||
define protected amdgpu_kernel void @kernel_calls_function_2_and_3() {
|
||||
; CHECK-LABEL: entry:
|
||||
; CHECK: %0 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
|
||||
; CHECK: %1 = icmp eq i32 %0, 0
|
||||
; CHECK: br i1 %1, label %2, label %3
|
||||
;
|
||||
; CHECK-LABEL: 2:
|
||||
; CHECK: store i16 ptrtoint ([3 x i32] addrspace(3)* @lds_used_within_function_3 to i16), i16 addrspace(3)* @lds_used_within_function_3.ptr, align 2
|
||||
; CHECK: store i16 ptrtoint ([2 x i32] addrspace(3)* @lds_used_within_function_2 to i16), i16 addrspace(3)* @lds_used_within_function_2.ptr, align 2
|
||||
; CHECK: br label %3
|
||||
;
|
||||
; CHECK-LABEL: 3:
|
||||
; CHECK: call void @llvm.amdgcn.wave.barrier()
|
||||
; CHECK: call void @function_2()
|
||||
; CHECK: call void @function_3()
|
||||
; CHECK: ret void
|
||||
entry:
|
||||
call void @function_2()
|
||||
call void @function_3()
|
||||
ret void
|
||||
}
|
||||
|
||||
; Pointer initialization code should be added
|
||||
define protected amdgpu_kernel void @kernel_calls_function_1_and_2() {
|
||||
; CHECK-LABEL: entry:
|
||||
; CHECK: %0 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
|
||||
; CHECK: %1 = icmp eq i32 %0, 0
|
||||
; CHECK: br i1 %1, label %2, label %3
|
||||
;
|
||||
; CHECK-LABEL: 2:
|
||||
; CHECK: store i16 ptrtoint ([2 x i32] addrspace(3)* @lds_used_within_function_2 to i16), i16 addrspace(3)* @lds_used_within_function_2.ptr, align 2
|
||||
; CHECK: store i16 ptrtoint ([1 x i32] addrspace(3)* @lds_used_within_function_1 to i16), i16 addrspace(3)* @lds_used_within_function_1.ptr, align 2
|
||||
; CHECK: br label %3
|
||||
;
|
||||
; CHECK-LABEL: 3:
|
||||
; CHECK: call void @llvm.amdgcn.wave.barrier()
|
||||
; CHECK: call void @function_1()
|
||||
; CHECK: call void @function_2()
|
||||
; CHECK: ret void
|
||||
entry:
|
||||
call void @function_1()
|
||||
call void @function_2()
|
||||
ret void
|
||||
}
|
@ -0,0 +1,53 @@
|
||||
; RUN: opt -S -mtriple=amdgcn-- -amdgpu-replace-lds-use-with-pointer < %s | FileCheck %s
|
||||
|
||||
; DESCRIPTION:
|
||||
;
|
||||
; None of the lds globals are pointer-replaced since they are all used in global scope in one way or another.
|
||||
;
|
||||
|
||||
; CHECK: @lds = internal addrspace(3) global [4 x i32] undef, align 4
|
||||
; CHECK: @lds.1 = addrspace(3) global i16 undef, align 2
|
||||
; CHECK: @lds.2 = addrspace(3) global i32 undef, align 4
|
||||
; CHECK: @lds.3 = internal unnamed_addr addrspace(3) global [1 x i8] undef, align 1
|
||||
@lds = internal addrspace(3) global [4 x i32] undef, align 4
|
||||
@lds.1 = addrspace(3) global i16 undef, align 2
|
||||
@lds.2 = addrspace(3) global i32 undef, align 4
|
||||
@lds.3 = internal unnamed_addr addrspace(3) global [1 x i8] undef, align 1
|
||||
|
||||
; CHECK: @global_var = addrspace(1) global float* addrspacecast (float addrspace(3)* bitcast ([4 x i32] addrspace(3)* @lds to float addrspace(3)*) to float*), align 8
|
||||
; CHECK: @llvm.used = appending global [1 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (i16 addrspace(3)* @lds.1 to i8 addrspace(3)*) to i8*)], section "llvm.metadata"
|
||||
; CHECK: @llvm.compiler.used = appending global [1 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (i32 addrspace(3)* @lds.2 to i8 addrspace(3)*) to i8*)], section "llvm.metadata"
|
||||
; CHECK: @alias.to.lds.3 = alias [1 x i8], [1 x i8] addrspace(3)* @lds.3
|
||||
@global_var = addrspace(1) global float* addrspacecast ([4 x i32] addrspace(3)* @lds to float*), align 8
|
||||
@llvm.used = appending global [1 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (i16 addrspace(3)* @lds.1 to i8 addrspace(3)*) to i8*)], section "llvm.metadata"
|
||||
@llvm.compiler.used = appending global [1 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (i32 addrspace(3)* @lds.2 to i8 addrspace(3)*) to i8*)], section "llvm.metadata"
|
||||
@alias.to.lds.3 = alias [1 x i8], [1 x i8] addrspace(3)* @lds.3
|
||||
|
||||
; CHECK-NOT: @lds.ptr
|
||||
; CHECK-NOT: @lds.1.ptr
|
||||
; CHECK-NOT: @lds.2.ptr
|
||||
; CHECK-NOT: @lds.3.ptr
|
||||
|
||||
define void @f0() {
|
||||
; CHECK-LABEL: entry:
|
||||
; CHECK: %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @lds, i32 0, i32 0
|
||||
; CHECK: %ld1 = load i16, i16 addrspace(3)* @lds.1
|
||||
; CHECK: %ld2 = load i32, i32 addrspace(3)* @lds.2
|
||||
; CHECK: %gep2 = getelementptr inbounds [1 x i8], [1 x i8] addrspace(3)* @lds.3, i32 0, i32 0
|
||||
; CHECK: ret void
|
||||
entry:
|
||||
%gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @lds, i32 0, i32 0
|
||||
%ld1 = load i16, i16 addrspace(3)* @lds.1
|
||||
%ld2 = load i32, i32 addrspace(3)* @lds.2
|
||||
%gep2 = getelementptr inbounds [1 x i8], [1 x i8] addrspace(3)* @lds.3, i32 0, i32 0
|
||||
ret void
|
||||
}
|
||||
|
||||
define protected amdgpu_kernel void @k0() {
|
||||
; CHECK-LABEL: entry:
|
||||
; CHECK: call void @f0()
|
||||
; CHECK: ret void
|
||||
entry:
|
||||
call void @f0()
|
||||
ret void
|
||||
}
|
@ -0,0 +1,30 @@
|
||||
; RUN: opt -S -mtriple=amdgcn-- -amdgpu-replace-lds-use-with-pointer < %s | FileCheck %s
|
||||
|
||||
; DESCRIPTION:
|
||||
;
|
||||
; We do not know what to do with inline asm call, we ignore it, hence pointer replacement for
|
||||
; @used_only_within_func does not take place.
|
||||
;
|
||||
|
||||
; CHECK: @used_only_within_func = addrspace(3) global [4 x i32] undef, align 4
|
||||
@used_only_within_func = addrspace(3) global [4 x i32] undef, align 4
|
||||
|
||||
; CHECK-NOT: @used_only_within_func.ptr
|
||||
|
||||
define void @f0(i32 %x) {
|
||||
; CHECK-LABEL: entry:
|
||||
; CHECK: store i32 %x, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @used_only_within_func, i32 0, i32 0) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @used_only_within_func, i32 0, i32 0) to i32*) to i64)) to i32*), align 4
|
||||
; CHECK: ret void
|
||||
entry:
|
||||
store i32 %x, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast ([4 x i32] addrspace(3)* @used_only_within_func to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast ([4 x i32] addrspace(3)* @used_only_within_func to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_kernel void @k0() {
|
||||
; CHECK-LABEL: entry:
|
||||
; CHECK: call i32 asm "s_mov_b32 $0, 0", "=s"()
|
||||
; CHECK: ret void
|
||||
entry:
|
||||
call i32 asm "s_mov_b32 $0, 0", "=s"()
|
||||
ret void
|
||||
}
|
@ -0,0 +1,25 @@
|
||||
; RUN: opt -S -mtriple=amdgcn-- -amdgpu-replace-lds-use-with-pointer < %s | FileCheck %s
|
||||
|
||||
; DESCRIPTION ;
|
||||
;
|
||||
; LDS global @used_only_within_kern is used only within kernel @k0, hence pointer replacement
|
||||
; does not take place for @used_only_within_kern.
|
||||
;
|
||||
|
||||
; CHECK: @used_only_within_kern = addrspace(3) global [4 x i32] undef, align 4
|
||||
@used_only_within_kern = addrspace(3) global [4 x i32] undef, align 4
|
||||
|
||||
; CHECK-NOT: @used_only_within_kern.ptr
|
||||
|
||||
define amdgpu_kernel void @k0() {
|
||||
; CHECK-LABEL: entry:
|
||||
; CHECK: %ld = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @used_only_within_kern, i32 0, i32 0) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @used_only_within_kern, i32 0, i32 0) to i32*) to i64)) to i32*), align 4
|
||||
; CHECK: %mul = mul i32 %ld, 2
|
||||
; CHECK: store i32 %mul, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @used_only_within_kern, i32 0, i32 0) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @used_only_within_kern, i32 0, i32 0) to i32*) to i64)) to i32*), align 4
|
||||
; CHECK: ret void
|
||||
entry:
|
||||
%ld = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast ([4 x i32] addrspace(3)* @used_only_within_kern to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast ([4 x i32] addrspace(3)* @used_only_within_kern to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
|
||||
%mul = mul i32 %ld, 2
|
||||
store i32 %mul, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast ([4 x i32] addrspace(3)* @used_only_within_kern to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast ([4 x i32] addrspace(3)* @used_only_within_kern to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
|
||||
ret void
|
||||
}
|
@ -0,0 +1,28 @@
|
||||
; RUN: opt -S -mtriple=amdgcn-- -amdgpu-replace-lds-use-with-pointer < %s | FileCheck %s
|
||||
|
||||
; DESCRIPTION ;
|
||||
;
|
||||
; LDS global @not-reachable-lds is used within non-kernel function @f0, but @f0 is *not*
|
||||
; reachable from kernel @k, hence pointer replacement does not take place.
|
||||
;
|
||||
|
||||
; CHECK: @not-reachable-lds = internal addrspace(3) global [4 x i32] undef, align 4
|
||||
@not-reachable-lds = internal addrspace(3) global [4 x i32] undef, align 4
|
||||
|
||||
; CHECK-NOT: @not-reachable-lds.ptr
|
||||
|
||||
define internal void @f0() {
|
||||
; CHECK-LABEL: entry:
|
||||
; CHECK: %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @not-reachable-lds, i32 0, i32 0
|
||||
; CHECK: ret void
|
||||
entry:
|
||||
%gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @not-reachable-lds, i32 0, i32 0
|
||||
ret void
|
||||
}
|
||||
|
||||
define protected amdgpu_kernel void @k0() {
|
||||
; CHECK-LABEL: entry:
|
||||
; CHECK: ret void
|
||||
entry:
|
||||
ret void
|
||||
}
|
31
test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-small-lds.ll
Normal file
31
test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-small-lds.ll
Normal file
@ -0,0 +1,31 @@
|
||||
; RUN: opt -S -mtriple=amdgcn-- -amdgpu-replace-lds-use-with-pointer < %s | FileCheck %s
|
||||
|
||||
; DESCRIPTION ;
|
||||
;
|
||||
; LDS global @small_lds is used within non-kernel function @f0, and @f0 is reachable
|
||||
; from kernel @k0, but since @small_lds is too small for pointer replacement, pointer
|
||||
; replacement does not take place.
|
||||
;
|
||||
|
||||
; CHECK: @small_lds = addrspace(3) global i8 undef, align 1
|
||||
@small_lds = addrspace(3) global i8 undef, align 1
|
||||
|
||||
; CHECK-NOT: @small_lds.ptr
|
||||
|
||||
define void @f0() {
|
||||
; CHECK-LABEL: entry:
|
||||
; CHECK: store i8 1, i8 addrspace(3)* @small_lds, align 1
|
||||
; CHECK: ret void
|
||||
entry:
|
||||
store i8 1, i8 addrspace(3)* @small_lds, align 1
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_kernel void @k0() {
|
||||
; CHECK-LABEL: entry:
|
||||
; CHECK: call void @f0()
|
||||
; CHECK: ret void
|
||||
entry:
|
||||
call void @f0()
|
||||
ret void
|
||||
}
|
@ -0,0 +1,95 @@
|
||||
; RUN: opt -S -mtriple=amdgcn-- -amdgpu-replace-lds-use-with-pointer < %s | FileCheck %s
|
||||
|
||||
; DESCRIPTION:
|
||||
;
|
||||
; The lds global @lds_used_within_func is used within non-kernel function @func_uses_lds
|
||||
; which is *indirectly* reachable from kernel @kernel_reaches_lds, hence pointer replacement
|
||||
; takes place for @lds_used_within_func.
|
||||
|
||||
; Original LDS should exist.
|
||||
; CHECK: @lds_used_within_func = internal addrspace(3) global [4 x i32] undef, align 4
|
||||
@lds_used_within_func = internal addrspace(3) global [4 x i32] undef, align 4
|
||||
|
||||
; Function pointer should exist as it is.
|
||||
; CHECK: @ptr_to_func = internal local_unnamed_addr externally_initialized global void ()* @func_uses_lds, align 8
|
||||
@ptr_to_func = internal local_unnamed_addr externally_initialized global void ()* @func_uses_lds, align 8
|
||||
|
||||
; Pointer should be created.
|
||||
; CHECK: @lds_used_within_func.ptr = internal unnamed_addr addrspace(3) global i16 undef, align 2
|
||||
|
||||
; Pointer replacement code should be added.
|
||||
define internal void @func_uses_lds() {
|
||||
; CHECK-LABEL: entry:
|
||||
; CHECK: %0 = load i16, i16 addrspace(3)* @lds_used_within_func.ptr, align 2
|
||||
; CHECK: %1 = getelementptr i8, i8 addrspace(3)* null, i16 %0
|
||||
; CHECK: %2 = bitcast i8 addrspace(3)* %1 to [4 x i32] addrspace(3)*
|
||||
; CHECK: %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* %2, i32 0, i32 0
|
||||
; CHECK: ret void
|
||||
entry:
|
||||
%gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @lds_used_within_func, i32 0, i32 0
|
||||
ret void
|
||||
}
|
||||
|
||||
; No change
|
||||
define internal void @func_does_not_use_lds_3() {
|
||||
; CHECK-LABEL: entry:
|
||||
; CHECK: %fptr = load void ()*, void ()** @ptr_to_func, align 8
|
||||
; CHECK: call void %fptr()
|
||||
; CHECK: ret void
|
||||
entry:
|
||||
%fptr = load void ()*, void ()** @ptr_to_func, align 8
|
||||
call void %fptr()
|
||||
ret void
|
||||
}
|
||||
|
||||
; No change
|
||||
define internal void @func_does_not_use_lds_2() {
|
||||
; CHECK-LABEL: entry:
|
||||
; CHECK: %fptr = load void ()*, void ()** @ptr_to_func, align 8
|
||||
; CHECK: call void %fptr()
|
||||
; CHECK: ret void
|
||||
entry:
|
||||
%fptr = load void ()*, void ()** @ptr_to_func, align 8
|
||||
call void %fptr()
|
||||
ret void
|
||||
}
|
||||
|
||||
; No change
|
||||
define internal void @func_does_not_use_lds_1() {
|
||||
; CHECK-LABEL: entry:
|
||||
; CHECK: call void @func_does_not_use_lds_2()
|
||||
; CHECK: call void @func_does_not_use_lds_3()
|
||||
; CHECK: ret void
|
||||
entry:
|
||||
call void @func_does_not_use_lds_2()
|
||||
call void @func_does_not_use_lds_3()
|
||||
ret void
|
||||
}
|
||||
|
||||
; Pointer initialization code should be added
|
||||
define protected amdgpu_kernel void @kernel_reaches_lds() {
|
||||
; CHECK-LABEL: entry:
|
||||
; CHECK: %0 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
|
||||
; CHECK: %1 = icmp eq i32 %0, 0
|
||||
; CHECK: br i1 %1, label %2, label %3
|
||||
;
|
||||
; CHECK-LABEL: 2:
|
||||
; CHECK: store i16 ptrtoint ([4 x i32] addrspace(3)* @lds_used_within_func to i16), i16 addrspace(3)* @lds_used_within_func.ptr, align 2
|
||||
; CHECK: br label %3
|
||||
;
|
||||
; CHECK-LABEL: 3:
|
||||
; CHECK: call void @llvm.amdgcn.wave.barrier()
|
||||
; CHECK: call void @func_does_not_use_lds_1()
|
||||
; CHECK: ret void
|
||||
entry:
|
||||
call void @func_does_not_use_lds_1()
|
||||
ret void
|
||||
}
|
||||
|
||||
; No change here since this kernel does not reach @func_uses_lds which uses lds.
|
||||
define protected amdgpu_kernel void @kernel_does_not_reach_lds() {
|
||||
; CHECK-LABEL: entry:
|
||||
; CHECK: ret void
|
||||
entry:
|
||||
ret void
|
||||
}
|
@ -0,0 +1,151 @@
|
||||
; RUN: opt -S -mtriple=amdgcn-- -amdgpu-replace-lds-use-with-pointer < %s | FileCheck %s
|
||||
|
||||
; DESCRIPTION:
|
||||
;
|
||||
; There are three lds globals defined here, and these three lds are used respectively within
|
||||
; three non-kernel functions. There are three kernels, which *indirectly* call two of the
|
||||
; non-kernel functions. Hence pointer replacement should take place for all three lds, and
|
||||
; pointer initialization within kernel should selectively happen depending on which lds is
|
||||
; reachable from the kernel.
|
||||
;
|
||||
|
||||
; Original LDS should exist.
|
||||
; CHECK: @lds_used_within_function_1 = internal addrspace(3) global [4 x i32] undef, align 4
|
||||
; CHECK: @lds_used_within_function_2 = internal addrspace(3) global [4 x i32] undef, align 4
|
||||
; CHECK: @lds_used_within_function_3 = internal addrspace(3) global [4 x i32] undef, align 4
|
||||
@lds_used_within_function_1 = internal addrspace(3) global [4 x i32] undef, align 4
|
||||
@lds_used_within_function_2 = internal addrspace(3) global [4 x i32] undef, align 4
|
||||
@lds_used_within_function_3 = internal addrspace(3) global [4 x i32] undef, align 4
|
||||
|
||||
; Function pointers should exist.
|
||||
; CHECK: @ptr_to_func1 = internal local_unnamed_addr externally_initialized global void (float)* @function_1, align 8
|
||||
; CHECK: @ptr_to_func2 = internal local_unnamed_addr externally_initialized global void (i16)* @function_2, align 8
|
||||
; CHECK: @ptr_to_func3 = internal local_unnamed_addr externally_initialized global void (i8)* @function_3, align 8
|
||||
@ptr_to_func1 = internal local_unnamed_addr externally_initialized global void (float)* @function_1, align 8
|
||||
@ptr_to_func2 = internal local_unnamed_addr externally_initialized global void (i16)* @function_2, align 8
|
||||
@ptr_to_func3 = internal local_unnamed_addr externally_initialized global void (i8)* @function_3, align 8
|
||||
|
||||
; Pointers should be created.
|
||||
; CHECK: @lds_used_within_function_1.ptr = internal unnamed_addr addrspace(3) global i16 undef, align 2
|
||||
; CHECK: @lds_used_within_function_2.ptr = internal unnamed_addr addrspace(3) global i16 undef, align 2
|
||||
; CHECK: @lds_used_within_function_3.ptr = internal unnamed_addr addrspace(3) global i16 undef, align 2
|
||||
|
||||
; Pointer replacement code should be added.
|
||||
define internal void @function_3(i8 %c) {
|
||||
; CHECK-LABEL: entry:
|
||||
; CHECK: %0 = load i16, i16 addrspace(3)* @lds_used_within_function_3.ptr, align 2
|
||||
; CHECK: %1 = getelementptr i8, i8 addrspace(3)* null, i16 %0
|
||||
; CHECK: %2 = bitcast i8 addrspace(3)* %1 to [4 x i32] addrspace(3)*
|
||||
; CHECK: %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* %2, i32 0, i32 0
|
||||
; CHECK: ret void
|
||||
entry:
|
||||
%gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @lds_used_within_function_3, i32 0, i32 0
|
||||
ret void
|
||||
}
|
||||
|
||||
; Pointer replacement code should be added.
|
||||
define internal void @function_2(i16 %i) {
|
||||
; CHECK-LABEL: entry:
|
||||
; CHECK: %0 = load i16, i16 addrspace(3)* @lds_used_within_function_2.ptr, align 2
|
||||
; CHECK: %1 = getelementptr i8, i8 addrspace(3)* null, i16 %0
|
||||
; CHECK: %2 = bitcast i8 addrspace(3)* %1 to [4 x i32] addrspace(3)*
|
||||
; CHECK: %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* %2, i32 0, i32 0
|
||||
; CHECK: ret void
|
||||
entry:
|
||||
%gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @lds_used_within_function_2, i32 0, i32 0
|
||||
ret void
|
||||
}
|
||||
|
||||
; Pointer replacement code should be added.
|
||||
define internal void @function_1(float %f) {
|
||||
; CHECK-LABEL: entry:
|
||||
; CHECK: %0 = load i16, i16 addrspace(3)* @lds_used_within_function_1.ptr, align 2
|
||||
; CHECK: %1 = getelementptr i8, i8 addrspace(3)* null, i16 %0
|
||||
; CHECK: %2 = bitcast i8 addrspace(3)* %1 to [4 x i32] addrspace(3)*
|
||||
; CHECK: %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* %2, i32 0, i32 0
|
||||
; CHECK: ret void
|
||||
entry:
|
||||
%gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @lds_used_within_function_1, i32 0, i32 0
|
||||
ret void
|
||||
}
|
||||
|
||||
; Pointer initialization code should be added
|
||||
define protected amdgpu_kernel void @kernel_calls_function_3_and_1() {
|
||||
; CHECK-LABEL: entry:
|
||||
; CHECK: %0 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
|
||||
; CHECK: %1 = icmp eq i32 %0, 0
|
||||
; CHECK: br i1 %1, label %2, label %3
|
||||
;
|
||||
; CHECK-LABEL: 2:
|
||||
; CHECK: store i16 ptrtoint ([4 x i32] addrspace(3)* @lds_used_within_function_3 to i16), i16 addrspace(3)* @lds_used_within_function_3.ptr, align 2
|
||||
; CHECK: store i16 ptrtoint ([4 x i32] addrspace(3)* @lds_used_within_function_1 to i16), i16 addrspace(3)* @lds_used_within_function_1.ptr, align 2
|
||||
; CHECK: br label %3
|
||||
;
|
||||
; CHECK-LABEL: 3:
|
||||
; CHECK: call void @llvm.amdgcn.wave.barrier()
|
||||
; CHECK: %fptr3 = load void (i8)*, void (i8)** @ptr_to_func3, align 8
|
||||
; CHECK: %fptr1 = load void (float)*, void (float)** @ptr_to_func1, align 8
|
||||
; CHECK: call void %fptr3(i8 1)
|
||||
; CHECK: call void %fptr1(float 2.000000e+00)
|
||||
; CHECK: ret void
|
||||
entry:
|
||||
%fptr3 = load void (i8)*, void (i8)** @ptr_to_func3, align 8
|
||||
%fptr1 = load void (float)*, void (float)** @ptr_to_func1, align 8
|
||||
call void %fptr3(i8 1)
|
||||
call void %fptr1(float 2.0)
|
||||
ret void
|
||||
}
|
||||
|
||||
; Pointer initialization code should be added
|
||||
define protected amdgpu_kernel void @kernel_calls_function_2_and_3() {
|
||||
; CHECK-LABEL: entry:
|
||||
; CHECK: %0 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
|
||||
; CHECK: %1 = icmp eq i32 %0, 0
|
||||
; CHECK: br i1 %1, label %2, label %3
|
||||
;
|
||||
; CHECK-LABEL: 2:
|
||||
; CHECK: store i16 ptrtoint ([4 x i32] addrspace(3)* @lds_used_within_function_3 to i16), i16 addrspace(3)* @lds_used_within_function_3.ptr, align 2
|
||||
; CHECK: store i16 ptrtoint ([4 x i32] addrspace(3)* @lds_used_within_function_2 to i16), i16 addrspace(3)* @lds_used_within_function_2.ptr, align 2
|
||||
; CHECK: br label %3
|
||||
;
|
||||
; CHECK-LABEL: 3:
|
||||
; CHECK: call void @llvm.amdgcn.wave.barrier()
|
||||
; CHECK: %fptr2 = load void (i16)*, void (i16)** @ptr_to_func2, align 8
|
||||
; CHECK: %fptr3 = load void (i8)*, void (i8)** @ptr_to_func3, align 8
|
||||
; CHECK: call void %fptr2(i16 3)
|
||||
; CHECK: call void %fptr3(i8 4)
|
||||
; CHECK: ret void
|
||||
entry:
|
||||
%fptr2 = load void (i16)*, void (i16)** @ptr_to_func2, align 8
|
||||
%fptr3 = load void (i8)*, void (i8)** @ptr_to_func3, align 8
|
||||
call void %fptr2(i16 3)
|
||||
call void %fptr3(i8 4)
|
||||
ret void
|
||||
}
|
||||
|
||||
; Pointer initialization code should be added
|
||||
define protected amdgpu_kernel void @kernel_calls_function_1_and_2() {
|
||||
; CHECK-LABEL: entry:
|
||||
; CHECK: %0 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
|
||||
; CHECK: %1 = icmp eq i32 %0, 0
|
||||
; CHECK: br i1 %1, label %2, label %3
|
||||
;
|
||||
; CHECK-LABEL: 2:
|
||||
; CHECK: store i16 ptrtoint ([4 x i32] addrspace(3)* @lds_used_within_function_2 to i16), i16 addrspace(3)* @lds_used_within_function_2.ptr, align 2
|
||||
; CHECK: store i16 ptrtoint ([4 x i32] addrspace(3)* @lds_used_within_function_1 to i16), i16 addrspace(3)* @lds_used_within_function_1.ptr, align 2
|
||||
; CHECK: br label %3
|
||||
;
|
||||
; CHECK-LABEL: 3:
|
||||
; CHECK: call void @llvm.amdgcn.wave.barrier()
|
||||
; CHECK: %fptr1 = load void (float)*, void (float)** @ptr_to_func1, align 8
|
||||
; CHECK: %fptr2 = load void (i16)*, void (i16)** @ptr_to_func2, align 8
|
||||
; CHECK: call void %fptr1(float 5.000000e+00)
|
||||
; CHECK: call void %fptr2(i16 6)
|
||||
; CHECK: ret void
|
||||
entry:
|
||||
%fptr1 = load void (float)*, void (float)** @ptr_to_func1, align 8
|
||||
%fptr2 = load void (i16)*, void (i16)** @ptr_to_func2, align 8
|
||||
call void %fptr1(float 5.0)
|
||||
call void %fptr2(i16 6)
|
||||
ret void
|
||||
}
|
@ -0,0 +1,94 @@
|
||||
; RUN: opt -S -mtriple=amdgcn-- -amdgpu-replace-lds-use-with-pointer < %s | FileCheck %s
|
||||
|
||||
; DESCRIPTION:
|
||||
;
|
||||
; There are three lds globals defined here, and these three lds are used respectively within
|
||||
; three non-kernel functions. There is one kernel which *indirectly* calls one of the non-kernel
|
||||
; functions. But since all the three non-kernel functions have same signature, all three
|
||||
; non-kernel functions are resolved as potential callees for indirect call-site. Hence we end up with
|
||||
; pointer replacement for three lds globals.
|
||||
;
|
||||
|
||||
; Original LDS should exist.
|
||||
; CHECK: @lds_used_within_function_1 = internal addrspace(3) global [4 x i32] undef, align 4
|
||||
; CHECK: @lds_used_within_function_2 = internal addrspace(3) global [4 x i32] undef, align 4
|
||||
; CHECK: @lds_used_within_function_3 = internal addrspace(3) global [4 x i32] undef, align 4
|
||||
@lds_used_within_function_1 = internal addrspace(3) global [4 x i32] undef, align 4
|
||||
@lds_used_within_function_2 = internal addrspace(3) global [4 x i32] undef, align 4
|
||||
@lds_used_within_function_3 = internal addrspace(3) global [4 x i32] undef, align 4
|
||||
|
||||
; Function pointers should exist.
|
||||
; CHECK: @ptr_to_func1 = internal local_unnamed_addr externally_initialized global void (i16)* @function_1, align 8
|
||||
; CHECK: @ptr_to_func2 = internal local_unnamed_addr externally_initialized global void (i16)* @function_2, align 8
|
||||
; CHECK: @ptr_to_func3 = internal local_unnamed_addr externally_initialized global void (i16)* @function_3, align 8
|
||||
@ptr_to_func1 = internal local_unnamed_addr externally_initialized global void (i16)* @function_1, align 8
|
||||
@ptr_to_func2 = internal local_unnamed_addr externally_initialized global void (i16)* @function_2, align 8
|
||||
@ptr_to_func3 = internal local_unnamed_addr externally_initialized global void (i16)* @function_3, align 8
|
||||
|
||||
; Pointers should be created.
|
||||
; CHECK: @lds_used_within_function_1.ptr = internal unnamed_addr addrspace(3) global i16 undef, align 2
|
||||
; CHECK: @lds_used_within_function_2.ptr = internal unnamed_addr addrspace(3) global i16 undef, align 2
|
||||
; CHECK: @lds_used_within_function_3.ptr = internal unnamed_addr addrspace(3) global i16 undef, align 2
|
||||
|
||||
; Pointer replacement code should be added.
|
||||
; Non-kernel function whose only use of LDS is a GEP on
; @lds_used_within_function_3; the %i argument is not used. The expected output
; loads an i16 from @lds_used_within_function_3.ptr, rebuilds a pointer from it
; (GEP off a null i8 addrspace(3)* plus a bitcast) and feeds that to the GEP.
define internal void @function_3(i16 %i) {
|
||||
; CHECK-LABEL: entry:
|
||||
; CHECK: %0 = load i16, i16 addrspace(3)* @lds_used_within_function_3.ptr, align 2
|
||||
; CHECK: %1 = getelementptr i8, i8 addrspace(3)* null, i16 %0
|
||||
; CHECK: %2 = bitcast i8 addrspace(3)* %1 to [4 x i32] addrspace(3)*
|
||||
; CHECK: %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* %2, i32 0, i32 0
|
||||
; CHECK: ret void
|
||||
entry:
|
||||
%gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @lds_used_within_function_3, i32 0, i32 0
|
||||
ret void
|
||||
}
|
||||
|
||||
; Pointer replacement code should be added.
|
||||
; Non-kernel function whose only use of LDS is a GEP on
; @lds_used_within_function_2; the %i argument is not used. The expected output
; loads an i16 from @lds_used_within_function_2.ptr, rebuilds a pointer from it
; (GEP off a null i8 addrspace(3)* plus a bitcast) and feeds that to the GEP.
define internal void @function_2(i16 %i) {
|
||||
; CHECK-LABEL: entry:
|
||||
; CHECK: %0 = load i16, i16 addrspace(3)* @lds_used_within_function_2.ptr, align 2
|
||||
; CHECK: %1 = getelementptr i8, i8 addrspace(3)* null, i16 %0
|
||||
; CHECK: %2 = bitcast i8 addrspace(3)* %1 to [4 x i32] addrspace(3)*
|
||||
; CHECK: %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* %2, i32 0, i32 0
|
||||
; CHECK: ret void
|
||||
entry:
|
||||
%gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @lds_used_within_function_2, i32 0, i32 0
|
||||
ret void
|
||||
}
|
||||
|
||||
; Pointer replacement code should be added.
|
||||
; Non-kernel function whose only use of LDS is a GEP on
; @lds_used_within_function_1; the %i argument is not used. The expected output
; loads an i16 from @lds_used_within_function_1.ptr, rebuilds a pointer from it
; (GEP off a null i8 addrspace(3)* plus a bitcast) and feeds that to the GEP.
define internal void @function_1(i16 %i) {
|
||||
; CHECK-LABEL: entry:
|
||||
; CHECK: %0 = load i16, i16 addrspace(3)* @lds_used_within_function_1.ptr, align 2
|
||||
; CHECK: %1 = getelementptr i8, i8 addrspace(3)* null, i16 %0
|
||||
; CHECK: %2 = bitcast i8 addrspace(3)* %1 to [4 x i32] addrspace(3)*
|
||||
; CHECK: %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* %2, i32 0, i32 0
|
||||
; CHECK: ret void
|
||||
entry:
|
||||
%gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @lds_used_within_function_1, i32 0, i32 0
|
||||
ret void
|
||||
}
|
||||
|
||||
; Pointer initialization code should be added.
|
||||
; Kernel under test: loads the function pointer held in @ptr_to_func1 and calls
; it indirectly with i16 6. The expected output initializes the .ptr globals of
; all three LDS variables (in the order 3, 2, 1) in a block entered only when
; mbcnt.lo == 0, then issues a wave barrier ahead of the original body.
define protected amdgpu_kernel void @kernel_indirectly_calls_function_1() {
|
||||
; CHECK-LABEL: entry:
|
||||
; CHECK: %0 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
|
||||
; CHECK: %1 = icmp eq i32 %0, 0
|
||||
; CHECK: br i1 %1, label %2, label %3
|
||||
;
|
||||
; CHECK-LABEL: 2:
|
||||
; CHECK: store i16 ptrtoint ([4 x i32] addrspace(3)* @lds_used_within_function_3 to i16), i16 addrspace(3)* @lds_used_within_function_3.ptr, align 2
|
||||
; CHECK: store i16 ptrtoint ([4 x i32] addrspace(3)* @lds_used_within_function_2 to i16), i16 addrspace(3)* @lds_used_within_function_2.ptr, align 2
|
||||
; CHECK: store i16 ptrtoint ([4 x i32] addrspace(3)* @lds_used_within_function_1 to i16), i16 addrspace(3)* @lds_used_within_function_1.ptr, align 2
|
||||
; CHECK: br label %3
|
||||
;
|
||||
; CHECK-LABEL: 3:
|
||||
; CHECK: call void @llvm.amdgcn.wave.barrier()
|
||||
; CHECK: %fptr1 = load void (i16)*, void (i16)** @ptr_to_func1, align 8
|
||||
; CHECK: call void %fptr1(i16 6)
|
||||
; CHECK: ret void
|
||||
entry:
|
||||
%fptr1 = load void (i16)*, void (i16)** @ptr_to_func1, align 8
|
||||
call void %fptr1(i16 6)
|
||||
ret void
|
||||
}
|
214
test/CodeGen/AMDGPU/replace-lds-by-ptr-lds-offsets.ll
Normal file
214
test/CodeGen/AMDGPU/replace-lds-by-ptr-lds-offsets.ll
Normal file
@ -0,0 +1,214 @@
|
||||
; RUN: opt -S -mtriple=amdgcn-- -amdgpu-replace-lds-use-with-pointer < %s | FileCheck --check-prefix=POINTER-REPLACE %s
|
||||
; RUN: opt -S -mtriple=amdgcn-- -amdgpu-replace-lds-use-with-pointer -amdgpu-lower-module-lds < %s | FileCheck --check-prefix=LOWER_LDS %s
|
||||
; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 < %s | FileCheck --check-prefix=GCN %s
|
||||
|
||||
;
|
||||
; DESCRIPTION:
|
||||
;
|
||||
; 1. There are three lds defined - @lds.1, @lds.2 and @lds.3, which are of types i32, i64, and [2 x i64].
|
||||
; @lds.3 is aliased to @alias.to.lds.3
|
||||
; 2. @lds.1 is used in function @f1, and @lds.2 is used in function @f2, @alias.to.lds.3 is used in kernel @k1.
|
||||
|
||||
; 3. Pointer-replacement pass replaces @lds.1 and @lds.2 by pointers @lds.1.ptr and @lds.2.ptr respectively.
|
||||
; However it does not touch @lds.3 since it is used in global scope (aliased).
|
||||
;
|
||||
; 4. LDS-lowering pass sees use of @lds.1.ptr in function @f1, use of @lds.2.ptr in function @f2, and use of
|
||||
; @lds.3 (via alias @alias.to.lds.3) in kernel @k1. Hence it module lowers these lds into struct instance
|
||||
; @llvm.amdgcn.module.lds.
|
||||
;
|
||||
; The struct member order is - [lds.3, lds.1.ptr, lds.2.ptr]. Since @llvm.amdgcn.module.lds itself is allocated
|
||||
; on address 0, lds.3 is allocated on address 0, lds.1.ptr is allocated on address 16, and lds.2.ptr is allocated
|
||||
; on address 18.
|
||||
;
|
||||
; Again LDS-lowering pass sees use of @lds.1 and @lds.2 in kernel. Hence it kernel lowers these lds into struct
|
||||
; instance @llvm.amdgcn.kernel.k1.lds.
|
||||
;
|
||||
; The struct member order is - [@lds.2, @lds.1]. By now, (16 + 2 + 2) 20 bytes of memory are already allocated; @lds.2
|
||||
; is allocated on address 24 since it needs to be allocated on 8 byte boundary, and @lds.1 is allocated on address
|
||||
; 32.
|
||||
;
|
||||
; 5. Hence the final GCN ISA looks as below:
|
||||
;
|
||||
; Within kernel @k1:
|
||||
; address 24 is stored in address 18.
|
||||
; address 32 is stored in address 16
|
||||
;
|
||||
; Within function @f1:
|
||||
; address 32 is loaded from address 16
|
||||
;
|
||||
; Within function @f2:
|
||||
; address 24 is loaded from address 18
|
||||
;
|
||||
|
||||
|
||||
; POINTER-REPLACE: @lds.1 = addrspace(3) global i32 undef, align 4
|
||||
; POINTER-REPLACE: @lds.2 = addrspace(3) global i64 undef, align 8
|
||||
; POINTER-REPLACE: @lds.3 = addrspace(3) global [2 x i64] undef, align 16
|
||||
; POINTER-REPLACE: @lds.1.ptr = internal unnamed_addr addrspace(3) global i16 undef, align 2
|
||||
; POINTER-REPLACE: @lds.2.ptr = internal unnamed_addr addrspace(3) global i16 undef, align 2
|
||||
; POINTER-REPLACE: @alias.to.lds.3 = alias [2 x i64], [2 x i64] addrspace(3)* @lds.3
|
||||
|
||||
|
||||
; LOWER_LDS-NOT: @lds.1
|
||||
; LOWER_LDS-NOT: @lds.2
|
||||
; LOWER_LDS-NOT: @lds.3
|
||||
; LOWER_LDS: %llvm.amdgcn.module.lds.t = type { [2 x i64], i16, i16 }
|
||||
; LOWER_LDS: %llvm.amdgcn.kernel.k1.lds.t = type { i64, i32 }
|
||||
; LOWER_LDS: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t undef, align 16
|
||||
; LOWER_LDS: @llvm.compiler.used = appending global [1 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds to i8 addrspace(3)*) to i8*)], section "llvm.metadata"
|
||||
; LOWER_LDS: @llvm.amdgcn.kernel.k1.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k1.lds.t undef, align 8
|
||||
; LOWER_LDS: @alias.to.lds.3 = alias [2 x i64], getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0)
|
||||
|
||||
@lds.1 = addrspace(3) global i32 undef, align 4
|
||||
@lds.2 = addrspace(3) global i64 undef, align 8
|
||||
@lds.3 = addrspace(3) global [2 x i64] undef, align 16
|
||||
@alias.to.lds.3 = alias [2 x i64], [2 x i64] addrspace(3)* @lds.3
|
||||
|
||||
; POINTER-REPLACE-LABEL: @f1
|
||||
; POINTER-REPLACE: %1 = load i16, i16 addrspace(3)* @lds.1.ptr, align 2
|
||||
; POINTER-REPLACE: %2 = getelementptr i8, i8 addrspace(3)* null, i16 %1
|
||||
; POINTER-REPLACE: %3 = bitcast i8 addrspace(3)* %2 to i32 addrspace(3)*
|
||||
; POINTER-REPLACE: store i32 7, i32 addrspace(3)* %3, align 4
|
||||
; POINTER-REPLACE: ret void
|
||||
|
||||
|
||||
; LOWER_LDS-LABEL: @f1
|
||||
; LOWER_LDS: %1 = load i16, i16 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1), align 2
|
||||
; LOWER_LDS: %2 = getelementptr i8, i8 addrspace(3)* null, i16 %1
|
||||
; LOWER_LDS: %3 = bitcast i8 addrspace(3)* %2 to i32 addrspace(3)*
|
||||
; LOWER_LDS: store i32 7, i32 addrspace(3)* %3, align 4
|
||||
; LOWER_LDS: ret void
|
||||
|
||||
|
||||
; GCN-LABEL: f1:
|
||||
; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GCN: v_mov_b32_e32 v0, 0
|
||||
; GCN: ds_read_i16 v0, v0 offset:16
|
||||
; GCN: v_mov_b32_e32 v1, 7
|
||||
; GCN: s_waitcnt lgkmcnt(0)
|
||||
; GCN: ds_write_b32 v0, v1
|
||||
; GCN: s_waitcnt lgkmcnt(0)
|
||||
; GCN: s_setpc_b64 s[30:31]
|
||||
; Non-kernel function under test: stores the constant i32 7 directly to the LDS
; global @lds.1. It is called from kernel @k1 in this file.
define void @f1() {
|
||||
store i32 7, i32 addrspace(3)* @lds.1
|
||||
ret void
|
||||
}
|
||||
|
||||
; POINTER-REPLACE-LABEL: @f2
|
||||
; POINTER-REPLACE: %1 = load i16, i16 addrspace(3)* @lds.2.ptr, align 2
|
||||
; POINTER-REPLACE: %2 = getelementptr i8, i8 addrspace(3)* null, i16 %1
|
||||
; POINTER-REPLACE: %3 = bitcast i8 addrspace(3)* %2 to i64 addrspace(3)*
|
||||
; POINTER-REPLACE: store i64 15, i64 addrspace(3)* %3, align 4
|
||||
; POINTER-REPLACE: ret void
|
||||
|
||||
|
||||
; LOWER_LDS-LABEL: @f2
|
||||
; LOWER_LDS: %1 = load i16, i16 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 2), align 2
|
||||
; LOWER_LDS: %2 = getelementptr i8, i8 addrspace(3)* null, i16 %1
|
||||
; LOWER_LDS: %3 = bitcast i8 addrspace(3)* %2 to i64 addrspace(3)*
|
||||
; LOWER_LDS: store i64 15, i64 addrspace(3)* %3, align 4
|
||||
; LOWER_LDS: ret void
|
||||
|
||||
|
||||
; GCN-LABEL: f2:
|
||||
; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GCN: v_mov_b32_e32 v1, 0
|
||||
; GCN: ds_read_i16 v2, v1 offset:18
|
||||
; GCN: v_mov_b32_e32 v0, 15
|
||||
; GCN: s_waitcnt lgkmcnt(0)
|
||||
; GCN: ds_write_b64 v2, v[0:1]
|
||||
; GCN: s_waitcnt lgkmcnt(0)
|
||||
; GCN: s_setpc_b64 s[30:31]
|
||||
; Non-kernel function under test: stores the constant i64 15 directly to the LDS
; global @lds.2. It is called from kernel @k1 in this file.
define void @f2() {
|
||||
store i64 15, i64 addrspace(3)* @lds.2
|
||||
ret void
|
||||
}
|
||||
|
||||
; POINTER-REPLACE-LABEL: @k1
|
||||
; POINTER-REPLACE: %1 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
|
||||
; POINTER-REPLACE: %2 = icmp eq i32 %1, 0
|
||||
; POINTER-REPLACE: br i1 %2, label %3, label %4
|
||||
;
|
||||
; POINTER-REPLACE-LABEL: 3:
|
||||
; POINTER-REPLACE: store i16 ptrtoint (i64 addrspace(3)* @lds.2 to i16), i16 addrspace(3)* @lds.2.ptr, align 2
|
||||
; POINTER-REPLACE: store i16 ptrtoint (i32 addrspace(3)* @lds.1 to i16), i16 addrspace(3)* @lds.1.ptr, align 2
|
||||
; POINTER-REPLACE: br label %4
|
||||
;
|
||||
; POINTER-REPLACE-LABEL: 4:
|
||||
; POINTER-REPLACE: call void @llvm.amdgcn.wave.barrier()
|
||||
; POINTER-REPLACE: %bc = bitcast [2 x i64] addrspace(3)* @alias.to.lds.3 to i8 addrspace(3)*
|
||||
; POINTER-REPLACE: store i8 3, i8 addrspace(3)* %bc, align 2
|
||||
; POINTER-REPLACE: call void @f1()
|
||||
; POINTER-REPLACE: call void @f2()
|
||||
; POINTER-REPLACE: ret void
|
||||
|
||||
|
||||
; LOWER_LDS-LABEL: @k1
|
||||
; LOWER_LDS: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ]
|
||||
; LOWER_LDS: %1 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
|
||||
; LOWER_LDS: %2 = icmp eq i32 %1, 0
|
||||
; LOWER_LDS: br i1 %2, label %3, label %6
|
||||
;
|
||||
; LOWER_LDS-LABEL: 3:
|
||||
; LOWER_LDS: %4 = ptrtoint i64 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, %llvm.amdgcn.kernel.k1.lds.t addrspace(3)* @llvm.amdgcn.kernel.k1.lds, i32 0, i32 0) to i16
|
||||
; LOWER_LDS: store i16 %4, i16 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 2), align 2
|
||||
; LOWER_LDS: %5 = ptrtoint i32 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, %llvm.amdgcn.kernel.k1.lds.t addrspace(3)* @llvm.amdgcn.kernel.k1.lds, i32 0, i32 1) to i16
|
||||
; LOWER_LDS: store i16 %5, i16 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1), align 2
|
||||
; LOWER_LDS: br label %6
|
||||
;
|
||||
; LOWER_LDS-LABEL: 6:
|
||||
; LOWER_LDS: call void @llvm.amdgcn.wave.barrier()
|
||||
; LOWER_LDS: %bc = bitcast [2 x i64] addrspace(3)* @alias.to.lds.3 to i8 addrspace(3)*
|
||||
; LOWER_LDS: store i8 3, i8 addrspace(3)* %bc, align 2
|
||||
; LOWER_LDS: call void @f1()
|
||||
; LOWER_LDS: call void @f2()
|
||||
; LOWER_LDS: ret void
|
||||
|
||||
|
||||
; GCN-LABEL: k1:
|
||||
; GCN: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
|
||||
; GCN: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
|
||||
; GCN: s_mov_b32 s10, -1
|
||||
; GCN: s_mov_b32 s11, 0xe00000
|
||||
; GCN: s_add_u32 s8, s8, s1
|
||||
; GCN: v_mbcnt_lo_u32_b32 v0, -1, 0
|
||||
; GCN: s_addc_u32 s9, s9, 0
|
||||
; GCN: v_cmp_eq_u32_e32 vcc, 0, v0
|
||||
; GCN: s_mov_b32 s32, 0
|
||||
; GCN: s_and_saveexec_b64 s[0:1], vcc
|
||||
; GCN: s_cbranch_execz BB2_2
|
||||
; GCN: v_mov_b32_e32 v0, 24
|
||||
; GCN: v_mov_b32_e32 v1, 0
|
||||
; GCN: ds_write_b16 v1, v0 offset:18
|
||||
; GCN: v_mov_b32_e32 v0, 32
|
||||
; GCN: ds_write_b16 v1, v0 offset:16
|
||||
; GCN-LABEL: BB2_2:
|
||||
; GCN: s_or_b64 exec, exec, s[0:1]
|
||||
; GCN: s_getpc_b64 s[0:1]
|
||||
; GCN: s_add_u32 s0, s0, f1@gotpcrel32@lo+4
|
||||
; GCN: s_addc_u32 s1, s1, f1@gotpcrel32@hi+12
|
||||
; GCN: s_load_dwordx2 s[4:5], s[0:1], 0x0
|
||||
; GCN: s_mov_b64 s[0:1], s[8:9]
|
||||
; GCN: s_mov_b64 s[2:3], s[10:11]
|
||||
; GCN: v_mov_b32_e32 v0, alias.to.lds.3@abs32@lo
|
||||
; GCN: v_mov_b32_e32 v1, 3
|
||||
; ; wave barrier
|
||||
; GCN: ds_write_b8 v0, v1
|
||||
; GCN: s_waitcnt lgkmcnt(0)
|
||||
; GCN: s_swappc_b64 s[30:31], s[4:5]
|
||||
; GCN: s_getpc_b64 s[0:1]
|
||||
; GCN: s_add_u32 s0, s0, f2@gotpcrel32@lo+4
|
||||
; GCN: s_addc_u32 s1, s1, f2@gotpcrel32@hi+12
|
||||
; GCN: s_load_dwordx2 s[4:5], s[0:1], 0x0
|
||||
; GCN: s_mov_b64 s[0:1], s[8:9]
|
||||
; GCN: s_mov_b64 s[2:3], s[10:11]
|
||||
; GCN: s_waitcnt lgkmcnt(0)
|
||||
; GCN: s_swappc_b64 s[30:31], s[4:5]
|
||||
; GCN: s_endpgm
|
||||
; Kernel under test: writes i8 3 through a bitcast of @alias.to.lds.3 (so the
; kernel itself only touches LDS via the alias), then calls @f1 and @f2
; directly.
define amdgpu_kernel void @k1() {
|
||||
%bc = bitcast [2 x i64] addrspace(3)* @alias.to.lds.3 to i8 addrspace(3)*
|
||||
store i8 3, i8 addrspace(3)* %bc, align 2
|
||||
call void @f1()
|
||||
call void @f2()
|
||||
ret void
|
||||
}
|
66
test/CodeGen/AMDGPU/replace-lds-by-ptr-use-multiple-lds.ll
Normal file
66
test/CodeGen/AMDGPU/replace-lds-by-ptr-use-multiple-lds.ll
Normal file
@ -0,0 +1,66 @@
|
||||
; RUN: opt -S -mtriple=amdgcn-- -amdgpu-replace-lds-use-with-pointer < %s | FileCheck %s
|
||||
|
||||
; DESCRIPTION:
|
||||
;
|
||||
; There are three lds globals defined here, and these three lds are used within a single
|
||||
; non-kernel function, and this non-kernel function is reachable from kernel. Hence pointer
|
||||
; replacement is required for all three lds globals.
|
||||
;
|
||||
|
||||
; Original LDS should exist.
|
||||
; CHECK: @lds1 = internal addrspace(3) global [1 x i32] undef, align 4
|
||||
; CHECK: @lds2 = internal addrspace(3) global [2 x i32] undef, align 4
|
||||
; CHECK: @lds3 = internal addrspace(3) global [3 x i32] undef, align 4
|
||||
@lds1 = internal addrspace(3) global [1 x i32] undef, align 4
|
||||
@lds2 = internal addrspace(3) global [2 x i32] undef, align 4
|
||||
@lds3 = internal addrspace(3) global [3 x i32] undef, align 4
|
||||
|
||||
; Pointers should be created.
|
||||
; CHECK: @lds1.ptr = internal unnamed_addr addrspace(3) global i16 undef, align 2
|
||||
; CHECK: @lds2.ptr = internal unnamed_addr addrspace(3) global i16 undef, align 2
|
||||
; CHECK: @lds3.ptr = internal unnamed_addr addrspace(3) global i16 undef, align 2
|
||||
|
||||
; Pointer replacement code should be added.
|
||||
; Non-kernel function that takes a GEP on each of @lds1, @lds2 and @lds3. The
; expected output materializes one pointer per LDS global at the top of the
; entry block (in the order lds3, lds2, lds1) by loading the matching .ptr
; global, and rewrites each GEP to use the corresponding rebuilt pointer.
define internal void @function() {
|
||||
; CHECK-LABEL: entry:
|
||||
; CHECK: %0 = load i16, i16 addrspace(3)* @lds3.ptr, align 2
|
||||
; CHECK: %1 = getelementptr i8, i8 addrspace(3)* null, i16 %0
|
||||
; CHECK: %2 = bitcast i8 addrspace(3)* %1 to [3 x i32] addrspace(3)*
|
||||
; CHECK: %3 = load i16, i16 addrspace(3)* @lds2.ptr, align 2
|
||||
; CHECK: %4 = getelementptr i8, i8 addrspace(3)* null, i16 %3
|
||||
; CHECK: %5 = bitcast i8 addrspace(3)* %4 to [2 x i32] addrspace(3)*
|
||||
; CHECK: %6 = load i16, i16 addrspace(3)* @lds1.ptr, align 2
|
||||
; CHECK: %7 = getelementptr i8, i8 addrspace(3)* null, i16 %6
|
||||
; CHECK: %8 = bitcast i8 addrspace(3)* %7 to [1 x i32] addrspace(3)*
|
||||
; CHECK: %gep1 = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* %8, i32 0, i32 0
|
||||
; CHECK: %gep2 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(3)* %5, i32 0, i32 0
|
||||
; CHECK: %gep3 = getelementptr inbounds [3 x i32], [3 x i32] addrspace(3)* %2, i32 0, i32 0
|
||||
; CHECK: ret void
|
||||
entry:
|
||||
%gep1 = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* @lds1, i32 0, i32 0
|
||||
%gep2 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(3)* @lds2, i32 0, i32 0
|
||||
%gep3 = getelementptr inbounds [3 x i32], [3 x i32] addrspace(3)* @lds3, i32 0, i32 0
|
||||
ret void
|
||||
}
|
||||
|
||||
; Pointer initialization code should be added.
|
||||
; Kernel under test: its original body only calls @function. The expected
; output prepends a block, entered when mbcnt.lo == 0, that stores the addresses
; of @lds3, @lds2 and @lds1 into their .ptr globals, followed by a wave barrier
; ahead of the call.
define protected amdgpu_kernel void @kernel() {
|
||||
; CHECK-LABEL: entry:
|
||||
; CHECK: %0 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
|
||||
; CHECK: %1 = icmp eq i32 %0, 0
|
||||
; CHECK: br i1 %1, label %2, label %3
|
||||
;
|
||||
; CHECK-LABEL: 2:
|
||||
; CHECK: store i16 ptrtoint ([3 x i32] addrspace(3)* @lds3 to i16), i16 addrspace(3)* @lds3.ptr, align 2
|
||||
; CHECK: store i16 ptrtoint ([2 x i32] addrspace(3)* @lds2 to i16), i16 addrspace(3)* @lds2.ptr, align 2
|
||||
; CHECK: store i16 ptrtoint ([1 x i32] addrspace(3)* @lds1 to i16), i16 addrspace(3)* @lds1.ptr, align 2
|
||||
; CHECK: br label %3
|
||||
;
|
||||
; CHECK-LABEL: 3:
|
||||
; CHECK: call void @llvm.amdgcn.wave.barrier()
|
||||
; CHECK: call void @function()
|
||||
; CHECK: ret void
|
||||
entry:
|
||||
call void @function()
|
||||
ret void
|
||||
}
|
53
test/CodeGen/AMDGPU/replace-lds-by-ptr-use-same-lds.ll
Normal file
53
test/CodeGen/AMDGPU/replace-lds-by-ptr-use-same-lds.ll
Normal file
@ -0,0 +1,53 @@
|
||||
; RUN: opt -S -mtriple=amdgcn-- -amdgpu-replace-lds-use-with-pointer < %s | FileCheck %s
|
||||
|
||||
; DESCRIPTION:
|
||||
;
|
||||
; There is one lds global defined here, and this lds is used within a single non-kernel
|
||||
; function multiple times, and this non-kernel function is reachable from kernel. Hence
|
||||
; pointer replacement takes place. But an important note is - store-to/load-from pointer should happen
|
||||
; only once irrespective of number of uses.
|
||||
;
|
||||
|
||||
; Original LDS should exist.
|
||||
; CHECK: @lds1 = internal addrspace(3) global [1 x i32] undef, align 4
|
||||
@lds1 = internal addrspace(3) global [1 x i32] undef, align 4
|
||||
|
||||
; Pointers should be created.
|
||||
; CHECK: @lds1.ptr = internal unnamed_addr addrspace(3) global i16 undef, align 2
|
||||
|
||||
; Pointer replacement code should be added.
|
||||
; Non-kernel function that takes three identical GEPs on @lds1. The expected
; output loads @lds1.ptr and rebuilds the pointer once (%0-%2), and all three
; GEPs share that single rebuilt pointer %2.
define internal void @function() {
|
||||
; CHECK-LABEL: entry:
|
||||
; CHECK: %0 = load i16, i16 addrspace(3)* @lds1.ptr, align 2
|
||||
; CHECK: %1 = getelementptr i8, i8 addrspace(3)* null, i16 %0
|
||||
; CHECK: %2 = bitcast i8 addrspace(3)* %1 to [1 x i32] addrspace(3)*
|
||||
; CHECK: %gep1 = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* %2, i32 0, i32 0
|
||||
; CHECK: %gep2 = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* %2, i32 0, i32 0
|
||||
; CHECK: %gep3 = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* %2, i32 0, i32 0
|
||||
; CHECK: ret void
|
||||
entry:
|
||||
%gep1 = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* @lds1, i32 0, i32 0
|
||||
%gep2 = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* @lds1, i32 0, i32 0
|
||||
%gep3 = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* @lds1, i32 0, i32 0
|
||||
ret void
|
||||
}
|
||||
|
||||
; Pointer initialization code should be added.
|
||||
; Kernel under test: its original body only calls @function. The expected
; output prepends a block, entered when mbcnt.lo == 0, containing a single store
; of the address of @lds1 into @lds1.ptr, followed by a wave barrier ahead of
; the call.
define protected amdgpu_kernel void @kernel() {
|
||||
; CHECK-LABEL: entry:
|
||||
; CHECK: %0 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
|
||||
; CHECK: %1 = icmp eq i32 %0, 0
|
||||
; CHECK: br i1 %1, label %2, label %3
|
||||
;
|
||||
; CHECK-LABEL: 2:
|
||||
; CHECK: store i16 ptrtoint ([1 x i32] addrspace(3)* @lds1 to i16), i16 addrspace(3)* @lds1.ptr, align 2
|
||||
; CHECK: br label %3
|
||||
;
|
||||
; CHECK-LABEL: 3:
|
||||
; CHECK: call void @llvm.amdgcn.wave.barrier()
|
||||
; CHECK: call void @function()
|
||||
; CHECK: ret void
|
||||
entry:
|
||||
call void @function()
|
||||
ret void
|
||||
}
|
@ -0,0 +1,54 @@
|
||||
; RUN: opt -S -mtriple=amdgcn-- -amdgpu-replace-lds-use-with-pointer < %s | FileCheck %s
|
||||
|
||||
; DESCRIPTION:
|
||||
;
|
||||
; There is one lds global defined here, and this lds is used within a single non-kernel
|
||||
; function, as an operand of nested constant expression, and this non-kernel function is
|
||||
; reachable from kernel. Hence nested constant expression should be converted into a
|
||||
; series of instructions and pointer replacement should take place.
|
||||
;
|
||||
|
||||
; Original LDS should exist.
|
||||
; CHECK: @used_only_within_func = addrspace(3) global [4 x i32] undef, align 4
|
||||
@used_only_within_func = addrspace(3) global [4 x i32] undef, align 4
|
||||
|
||||
; Pointers should be created.
|
||||
; CHECK: @used_only_within_func.ptr = internal unnamed_addr addrspace(3) global i16 undef, align 2
|
||||
|
||||
; Pointer replacement code should be added.
|
||||
; Non-kernel function that stores %x to an address given by one nested constant
; expression: inttoptr(add(P, P)) where P is ptrtoint(addrspacecast(bitcast of
; @used_only_within_func)). The expected output rebuilds the LDS pointer from
; @used_only_within_func.ptr and expands the expression into instructions
; %3-%7 before the store.
define void @f0(i32 %x) {
|
||||
; CHECK-LABEL: entry:
|
||||
; CHECK: %0 = load i16, i16 addrspace(3)* @used_only_within_func.ptr, align 2
|
||||
; CHECK: %1 = getelementptr i8, i8 addrspace(3)* null, i16 %0
|
||||
; CHECK: %2 = bitcast i8 addrspace(3)* %1 to [4 x i32] addrspace(3)*
|
||||
; CHECK: %3 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* %2, i32 0, i32 0
|
||||
; CHECK: %4 = addrspacecast i32 addrspace(3)* %3 to i32*
|
||||
; CHECK: %5 = ptrtoint i32* %4 to i64
|
||||
; CHECK: %6 = add i64 %5, %5
|
||||
; CHECK: %7 = inttoptr i64 %6 to i32*
|
||||
; CHECK: store i32 %x, i32* %7, align 4
|
||||
; CHECK: ret void
|
||||
entry:
|
||||
store i32 %x, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast ([4 x i32] addrspace(3)* @used_only_within_func to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast ([4 x i32] addrspace(3)* @used_only_within_func to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; Pointer initialization code should be added.
|
||||
; Kernel under test: its original body only calls @f0 with i32 0. The expected
; output prepends a block, entered when mbcnt.lo == 0, that stores the address
; of @used_only_within_func into @used_only_within_func.ptr, followed by a wave
; barrier ahead of the call.
define amdgpu_kernel void @k0() {
|
||||
; CHECK-LABEL: entry:
|
||||
; CHECK: %0 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
|
||||
; CHECK: %1 = icmp eq i32 %0, 0
|
||||
; CHECK: br i1 %1, label %2, label %3
|
||||
;
|
||||
; CHECK-LABEL: 2:
|
||||
; CHECK: store i16 ptrtoint ([4 x i32] addrspace(3)* @used_only_within_func to i16), i16 addrspace(3)* @used_only_within_func.ptr, align 2
|
||||
; CHECK: br label %3
|
||||
;
|
||||
; CHECK-LABEL: 3:
|
||||
; CHECK: call void @llvm.amdgcn.wave.barrier()
|
||||
; CHECK: call void @f0(i32 0)
|
||||
; CHECK: ret void
|
||||
entry:
|
||||
call void @f0(i32 0)
|
||||
ret void
|
||||
}
|
@ -0,0 +1,58 @@
|
||||
; RUN: opt -S -mtriple=amdgcn-- -amdgpu-replace-lds-use-with-pointer < %s | FileCheck %s
|
||||
|
||||
; DESCRIPTION:
|
||||
; There is one lds global defined here, and this lds is used within a single non-kernel
|
||||
; function, as an operand of nested constant expression, and this non-kernel function is
|
||||
; reachable from kernel. Hence nested constant expression should be converted into a
|
||||
; series of instructions and pointer replacement should take place. But, important note
|
||||
; is - only constant expression operands which use lds should be converted into
|
||||
; instructions, other constant expression operands which do not use lds should be left
|
||||
; untouched.
|
||||
;
|
||||
|
||||
; Original LDS should exist.
|
||||
; CHECK: @lds_used_within_function = internal addrspace(3) global [4 x i32] undef, align 4
|
||||
@lds_used_within_function = internal addrspace(3) global [4 x i32] undef, align 4
|
||||
|
||||
; Non-LDS global should exist as it is.
|
||||
; CHECK: @global_var = internal addrspace(1) global [4 x i32] undef, align 4
|
||||
@global_var = internal addrspace(1) global [4 x i32] undef, align 4
|
||||
|
||||
; Pointer should be created.
|
||||
; CHECK: @lds_used_within_function.ptr = internal unnamed_addr addrspace(3) global i16 undef, align 2
|
||||
|
||||
; Pointer replacement code should be added.
|
||||
; Non-kernel function whose single add has two constant-expression operands:
; one derived from @lds_used_within_function (LDS) and one derived from
; @global_var (addrspace(1)). The expected output expands only the LDS operand
; into instructions (%3-%5); the @global_var operand stays a constant expression
; in the final add.
define internal void @function() {
|
||||
; CHECK-LABEL: entry:
|
||||
; CHECK: %0 = load i16, i16 addrspace(3)* @lds_used_within_function.ptr, align 2
|
||||
; CHECK: %1 = getelementptr i8, i8 addrspace(3)* null, i16 %0
|
||||
; CHECK: %2 = bitcast i8 addrspace(3)* %1 to [4 x i32] addrspace(3)*
|
||||
; CHECK: %3 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* %2, i32 0, i32 2
|
||||
; CHECK: %4 = addrspacecast i32 addrspace(3)* %3 to i32*
|
||||
; CHECK: %5 = ptrtoint i32* %4 to i32
|
||||
; CHECK: %6 = add i32 %5, ptrtoint (i32 addrspace(1)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(1)* @global_var, i32 0, i32 2) to i32)
|
||||
; CHECK: ret void
|
||||
entry:
|
||||
%0 = add i32 ptrtoint (i32* addrspacecast (i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @lds_used_within_function, i32 0, i32 2) to i32*) to i32), ptrtoint (i32 addrspace(1)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(1)* @global_var, i32 0, i32 2) to i32)
|
||||
ret void
|
||||
}
|
||||
|
||||
; Pointer initialization code should be added.
|
||||
; Kernel under test: its original body only calls @function. The expected
; output prepends a block, entered when mbcnt.lo == 0, that stores the address
; of @lds_used_within_function into @lds_used_within_function.ptr, followed by a
; wave barrier ahead of the call.
define protected amdgpu_kernel void @kernel() {
|
||||
; CHECK-LABEL: entry:
|
||||
; CHECK: %0 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
|
||||
; CHECK: %1 = icmp eq i32 %0, 0
|
||||
; CHECK: br i1 %1, label %2, label %3
|
||||
;
|
||||
; CHECK-LABEL: 2:
|
||||
; CHECK: store i16 ptrtoint ([4 x i32] addrspace(3)* @lds_used_within_function to i16), i16 addrspace(3)* @lds_used_within_function.ptr, align 2
|
||||
; CHECK: br label %3
|
||||
;
|
||||
; CHECK-LABEL: 3:
|
||||
; CHECK: call void @llvm.amdgcn.wave.barrier()
|
||||
; CHECK: call void @function()
|
||||
; CHECK: ret void
|
||||
entry:
|
||||
call void @function()
|
||||
ret void
|
||||
}
|
@ -0,0 +1,93 @@
|
||||
; RUN: opt -S -mtriple=amdgcn-- -amdgpu-replace-lds-use-with-pointer < %s | FileCheck %s
|
||||
|
||||
; DESCRIPTION:
|
||||
;
|
||||
; Replace lds globals used within phi instruction.
|
||||
;
|
||||
|
||||
; Original LDS should exist.
|
||||
; CHECK: @lds.1 = addrspace(3) global i32 undef, align 4
|
||||
; CHECK: @lds.2 = addrspace(3) global i32 undef, align 4
|
||||
@lds.1 = addrspace(3) global i32 undef, align 4
|
||||
@lds.2 = addrspace(3) global i32 undef, align 4
|
||||
|
||||
; Pointers should be created.
|
||||
; CHECK: @lds.1.ptr = internal unnamed_addr addrspace(3) global i16 undef, align 2
|
||||
; CHECK: @lds.2.ptr = internal unnamed_addr addrspace(3) global i16 undef, align 2
|
||||
|
||||
; Non-kernel function whose phi nodes in %Flow reference @lds.1 and @lds.2, both
; directly (%my.tmp3) and through a constant icmp expression (%my.tmp4). The
; checks expect the loads of @lds.2.ptr and @lds.1.ptr, and the pointer
; reconstruction from them, to be emitted in the entry block %bb, and the phi
; operands to be rewritten to the resulting values.
define void @f0(i32 %arg) {
|
||||
; CHECK-LABEL: bb:
|
||||
; CHECK: %0 = load i16, i16 addrspace(3)* @lds.2.ptr, align 2
|
||||
; CHECK: %1 = getelementptr i8, i8 addrspace(3)* null, i16 %0
|
||||
; CHECK: %2 = bitcast i8 addrspace(3)* %1 to i32 addrspace(3)*
|
||||
; CHECK: %3 = load i16, i16 addrspace(3)* @lds.1.ptr, align 2
|
||||
; CHECK: %4 = getelementptr i8, i8 addrspace(3)* null, i16 %3
|
||||
; CHECK: %5 = bitcast i8 addrspace(3)* %4 to i32 addrspace(3)*
|
||||
; CHECK: %id = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
; CHECK: %my.tmp = sub i32 %id, %arg
|
||||
; CHECK: br label %bb1
|
||||
bb:
|
||||
%id = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%my.tmp = sub i32 %id, %arg
|
||||
br label %bb1
|
||||
|
||||
; CHECK-LABEL: bb1:
|
||||
; CHECK: %lsr.iv = phi i32 [ undef, %bb ], [ %my.tmp2, %Flow ]
|
||||
; CHECK: %6 = icmp ne i32 addrspace(3)* inttoptr (i32 4 to i32 addrspace(3)*), %5
|
||||
; CHECK: %lsr.iv.next = add i32 %lsr.iv, 1
|
||||
; CHECK: %cmp0 = icmp slt i32 %lsr.iv.next, 0
|
||||
; CHECK: br i1 %cmp0, label %bb4, label %Flow
|
||||
bb1:
|
||||
%lsr.iv = phi i32 [ undef, %bb ], [ %my.tmp2, %Flow ]
|
||||
%lsr.iv.next = add i32 %lsr.iv, 1
|
||||
%cmp0 = icmp slt i32 %lsr.iv.next, 0
|
||||
br i1 %cmp0, label %bb4, label %Flow
|
||||
|
||||
; CHECK-LABEL: bb4:
|
||||
; CHECK: %load = load volatile i32, i32 addrspace(1)* undef, align 4
|
||||
; CHECK: %cmp1 = icmp sge i32 %my.tmp, %load
|
||||
; CHECK: br label %Flow
|
||||
bb4:
|
||||
%load = load volatile i32, i32 addrspace(1)* undef, align 4
|
||||
%cmp1 = icmp sge i32 %my.tmp, %load
|
||||
br label %Flow
|
||||
|
||||
; The constant icmp on @lds.1 feeding %my.tmp4 from %bb1 is expected to become
; the instruction %6 placed in %bb1, so the phi refers to %6 instead.
; CHECK-LABEL: Flow:
|
||||
; CHECK: %my.tmp2 = phi i32 [ %lsr.iv.next, %bb4 ], [ undef, %bb1 ]
|
||||
; CHECK: %my.tmp3 = phi i32 addrspace(3)* [ %2, %bb4 ], [ %5, %bb1 ]
|
||||
; CHECK: %my.tmp4 = phi i1 [ %cmp1, %bb4 ], [ %6, %bb1 ]
|
||||
; CHECK: br i1 %my.tmp4, label %bb9, label %bb1
|
||||
Flow:
|
||||
%my.tmp2 = phi i32 [ %lsr.iv.next, %bb4 ], [ undef, %bb1 ]
|
||||
%my.tmp3 = phi i32 addrspace(3)* [@lds.2, %bb4 ], [ @lds.1, %bb1 ]
|
||||
%my.tmp4 = phi i1 [ %cmp1, %bb4 ], [ icmp ne (i32 addrspace(3)* inttoptr (i32 4 to i32 addrspace(3)*), i32 addrspace(3)* @lds.1), %bb1 ]
|
||||
br i1 %my.tmp4, label %bb9, label %bb1
|
||||
|
||||
; CHECK-LABEL: bb9:
|
||||
; CHECK: store volatile i32 7, i32 addrspace(3)* undef, align 4
|
||||
; CHECK: ret void
|
||||
bb9:
|
||||
store volatile i32 7, i32 addrspace(3)* undef
|
||||
ret void
|
||||
}
|
||||
|
||||
; Kernel @k0 calls @f0, which references @lds.1 and @lds.2. The checks expect
; @k0 to begin with a store of each global's address (converted to i16) into
; @lds.2.ptr and @lds.1.ptr, guarded by a comparison of the mbcnt.lo result
; against 0, then a wave barrier ahead of the original call.
; CHECK-LABEL: @k0
|
||||
; CHECK: %1 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
|
||||
; CHECK: %2 = icmp eq i32 %1, 0
|
||||
; CHECK: br i1 %2, label %3, label %4
|
||||
;
|
||||
; CHECK-LABEL: 3:
|
||||
; CHECK: store i16 ptrtoint (i32 addrspace(3)* @lds.2 to i16), i16 addrspace(3)* @lds.2.ptr, align 2
|
||||
; CHECK: store i16 ptrtoint (i32 addrspace(3)* @lds.1 to i16), i16 addrspace(3)* @lds.1.ptr, align 2
|
||||
; CHECK: br label %4
|
||||
;
|
||||
; CHECK-LABEL: 4:
|
||||
; CHECK: call void @llvm.amdgcn.wave.barrier()
|
||||
; CHECK: call void @f0(i32 %arg)
|
||||
; CHECK: ret void
|
||||
define amdgpu_kernel void @k0(i32 %arg) {
|
||||
call void @f0(i32 %arg)
|
||||
ret void
|
||||
}
|
||||
|
||||
declare i32 @llvm.amdgcn.workitem.id.x()
|
Loading…
Reference in New Issue
Block a user