[AMDGPU] Lower kernel LDS into a sorted structure

Differential Revision: https://reviews.llvm.org/D102954

parent 831c97ae39
commit b8b00b1711
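In brief (a condensed sketch drawn from the lower-kernel-lds.ll test added below): LDS globals used from a kernel are replaced by members of a single per-kernel struct instance, sorted by decreasing alignment so that padding is minimized.

; Before the pass: independent LDS globals, each used from kernel @k0
@lds.size.4.align.4 = internal unnamed_addr addrspace(3) global [4 x i8] undef, align 4
@lds.size.16.align.16 = internal unnamed_addr addrspace(3) global [16 x i8] undef, align 16

; After the pass: one struct per kernel, members sorted by decreasing alignment
%llvm.amdgcn.kernel.k0.lds.t = type { [16 x i8], [4 x i8] }
@llvm.amdgcn.kernel.k0.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k0.lds.t undef, align 16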
@@ -73,6 +73,10 @@ class AMDGPULowerModuleLDS : public ModulePass {
      GV->eraseFromParent();

    for (Constant *C : ToRemove) {
      C->removeDeadConstantUsers();
    }

    if (!Init.empty()) {
      ArrayType *ATy =
          ArrayType::get(Type::getInt8PtrTy(M.getContext()), Init.size());
@@ -129,6 +133,9 @@ class AMDGPULowerModuleLDS : public ModulePass {
        "");
  }

private:
  SmallPtrSet<GlobalValue *, 32> UsedList;

public:
  static char ID;

@@ -137,13 +144,28 @@ public:
  }

  bool runOnModule(Module &M) override {
    UsedList = AMDGPU::getUsedList(M);

    bool Changed = processUsedLDS(M);

    for (Function &F : M.functions()) {
      if (!AMDGPU::isKernelCC(&F))
        continue;
      Changed |= processUsedLDS(M, &F);
    }

    UsedList.clear();
    return Changed;
  }

private:
  bool processUsedLDS(Module &M, Function *F = nullptr) {
    LLVMContext &Ctx = M.getContext();
    const DataLayout &DL = M.getDataLayout();
    SmallPtrSet<GlobalValue *, 32> UsedList = AMDGPU::getUsedList(M);

    // Find variables to move into new struct instance
    std::vector<GlobalVariable *> FoundLocalVars =
        AMDGPU::findVariablesToLower(M, UsedList);
        AMDGPU::findVariablesToLower(M, UsedList, F);

    if (FoundLocalVars.empty()) {
      // No variables to rewrite, no changes made.
@@ -207,21 +229,25 @@ public:
        LocalVars.cbegin(), LocalVars.cend(), std::back_inserter(LocalVarTypes),
        [](const GlobalVariable *V) -> Type * { return V->getValueType(); });

    StructType *LDSTy = StructType::create(
        Ctx, LocalVarTypes, llvm::StringRef("llvm.amdgcn.module.lds.t"));
    std::string VarName(
        F ? (Twine("llvm.amdgcn.kernel.") + F->getName() + ".lds").str()
          : "llvm.amdgcn.module.lds");
    StructType *LDSTy = StructType::create(Ctx, LocalVarTypes, VarName + ".t");

    Align MaxAlign =
        AMDGPU::getAlign(DL, LocalVars[0]); // was sorted on alignment

    GlobalVariable *SGV = new GlobalVariable(
        M, LDSTy, false, GlobalValue::InternalLinkage, UndefValue::get(LDSTy),
        "llvm.amdgcn.module.lds", nullptr, GlobalValue::NotThreadLocal,
        AMDGPUAS::LOCAL_ADDRESS, false);
        VarName, nullptr, GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS,
        false);
    SGV->setAlignment(MaxAlign);
    appendToCompilerUsed(
        M, {static_cast<GlobalValue *>(
               ConstantExpr::getPointerBitCastOrAddrSpaceCast(
                   cast<Constant>(SGV), Type::getInt8PtrTy(Ctx)))});
    if (!F) {
      appendToCompilerUsed(
          M, {static_cast<GlobalValue *>(
                 ConstantExpr::getPointerBitCastOrAddrSpaceCast(
                     cast<Constant>(SGV), Type::getInt8PtrTy(Ctx)))});
    }

    // The verifier rejects used lists containing an inttoptr of a constant
    // so remove the variables from these lists before replaceAllUsesWith
@@ -233,16 +259,25 @@ public:
    for (size_t I = 0; I < LocalVars.size(); I++) {
      GlobalVariable *GV = LocalVars[I];
      Constant *GEPIdx[] = {ConstantInt::get(I32, 0), ConstantInt::get(I32, I)};
      GV->replaceAllUsesWith(
          ConstantExpr::getGetElementPtr(LDSTy, SGV, GEPIdx));
      GV->eraseFromParent();
      Constant *GEP = ConstantExpr::getGetElementPtr(LDSTy, SGV, GEPIdx);
      if (F) {
        GV->replaceUsesWithIf(GEP, [F](Use &U) {
          return AMDGPU::isUsedOnlyFromFunction(U.getUser(), F);
        });
      } else {
        GV->replaceAllUsesWith(GEP);
      }
      if (GV->use_empty()) {
        UsedList.erase(GV);
        GV->eraseFromParent();
      }
    }

    // Mark kernels with asm that reads the address of the allocated structure
    // This is not necessary for lowering. This lets other passes, specifically
    // PromoteAlloca, accurately calculate how much LDS will be used by the
    // kernel after lowering.
    {
    if (!F) {
      IRBuilder<> Builder(Ctx);
      SmallPtrSet<Function *, 32> Kernels;
      for (auto &I : M.functions()) {
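The replaceUsesWithIf call in the hunks above rewrites only the uses reachable from the kernel being processed, and each rewritten use becomes a constant GEP into the new struct. A minimal sketch of the resulting IR, using names from the lower-kernel-lds-constexpr.ll test added below:

@lds.2 = internal unnamed_addr addrspace(3) global i32 undef, align 4
%llvm.amdgcn.kernel.k2.lds.t = type { i32 }
@llvm.amdgcn.kernel.k2.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k2.lds.t undef, align 4

define amdgpu_kernel void @k2() {
  ; the former use of @lds.2 now addresses member 0 of the kernel struct
  %ptr = bitcast i32 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k2.lds.t, %llvm.amdgcn.kernel.k2.lds.t addrspace(3)* @llvm.amdgcn.kernel.k2.lds, i32 0, i32 0) to i8 addrspace(3)*
  store i8 1, i8 addrspace(3)* %ptr, align 4
  ret void
}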
@@ -20,7 +20,7 @@ namespace llvm {

namespace AMDGPU {

bool isKernelCC(Function *Func) {
bool isKernelCC(const Function *Func) {
  return AMDGPU::isModuleEntryFunctionCC(Func->getCallingConv());
}

@@ -29,18 +29,33 @@ Align getAlign(DataLayout const &DL, const GlobalVariable *GV) {
                 GV->getValueType());
}

bool userRequiresLowering(const SmallPtrSetImpl<GlobalValue *> &UsedList,
                          User *InitialUser) {
bool isUsedOnlyFromFunction(const User *U, const Function *F) {
  if (auto *I = dyn_cast<Instruction>(U)) {
    return I->getFunction() == F;
  }

  if (auto *C = dyn_cast<ConstantExpr>(U)) {
    return all_of(U->users(),
                  [F](const User *U) { return isUsedOnlyFromFunction(U, F); });
  }

  return false;
}

bool shouldLowerLDSToStruct(const SmallPtrSetImpl<GlobalValue *> &UsedList,
                            const GlobalVariable &GV, const Function *F) {
  // Any LDS variable can be lowered by moving into the created struct
  // Each variable so lowered is allocated in every kernel, so variables
  // whose users are all known to be safe to lower without the transform
  // are left unchanged.
  SmallPtrSet<User *, 8> Visited;
  SmallVector<User *, 16> Stack;
  Stack.push_back(InitialUser);
  bool Ret = false;
  SmallPtrSet<const User *, 8> Visited;
  SmallVector<const User *, 16> Stack(GV.users());

  assert(!F || isKernelCC(F));

  while (!Stack.empty()) {
    User *V = Stack.pop_back_val();
    const User *V = Stack.pop_back_val();
    Visited.insert(V);

    if (auto *G = dyn_cast<GlobalValue>(V->stripPointerCasts())) {
@@ -50,31 +65,44 @@ bool userRequiresLowering(const SmallPtrSetImpl<GlobalValue *> &UsedList,
    }

    if (auto *I = dyn_cast<Instruction>(V)) {
      if (isKernelCC(I->getFunction())) {
        continue;
      const Function *UF = I->getFunction();
      if (UF == F) {
        // Used from this kernel, we want to put it into the structure.
        Ret = true;
      } else if (!F) {
        Ret |= !isKernelCC(UF);
      }
      continue;
    }

    if (auto *E = dyn_cast<ConstantExpr>(V)) {
      for (Value::user_iterator EU = E->user_begin(); EU != E->user_end();
           ++EU) {
        if (Visited.insert(*EU).second) {
          Stack.push_back(*EU);
      if (F) {
        // Any use which does not end up an instruction disqualifies a
        // variable to be put into a kernel's LDS structure because later
        // we will need to replace only this kernel's uses for which we
        // need to identify a using function.
        return isUsedOnlyFromFunction(E, F);
      }
      for (const User *U : E->users()) {
        if (Visited.insert(U).second) {
          Stack.push_back(U);
        }
      }
      continue;
    }

    // Unknown user, conservatively lower the variable
    return true;
    // Unknown user, conservatively lower the variable.
    // For module LDS conservatively means place it into the module LDS struct.
    // For kernel LDS it means lower as a standalone variable.
    return !F;
  }

  return false;
  return Ret;
}

std::vector<GlobalVariable *>
findVariablesToLower(Module &M,
                     const SmallPtrSetImpl<GlobalValue *> &UsedList) {
findVariablesToLower(Module &M, const SmallPtrSetImpl<GlobalValue *> &UsedList,
                     const Function *F) {
  std::vector<llvm::GlobalVariable *> LocalVars;
  for (auto &GV : M.globals()) {
    if (GV.getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) {
@@ -98,9 +126,7 @@ findVariablesToLower(Module &M,
      // dropped by the back end if not. This pass skips over it.
      continue;
    }
    if (std::none_of(GV.user_begin(), GV.user_end(), [&](User *U) {
          return userRequiresLowering(UsedList, U);
        })) {
    if (!shouldLowerLDSToStruct(UsedList, GV, F)) {
      continue;
    }
    LocalVars.push_back(&GV);
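The effect of this predicate, as exercised by the module-lds test at the end of this diff: with a kernel F given, a variable qualifies only when every use is (transitively) an instruction inside that kernel; with no F, any use from a non-kernel function forces lowering into the module struct. Roughly:

@kern = addrspace(3) global float undef, align 4  ; used only from kernel @timestwo
@func = addrspace(3) global float undef, align 4  ; used from a non-kernel function

; @kern ends up in the per-kernel struct @llvm.amdgcn.kernel.timestwo.lds,
; while @func ends up in the module struct @llvm.amdgcn.module.lds.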
@@ -19,18 +19,29 @@ namespace llvm {

namespace AMDGPU {

bool isKernelCC(Function *Func);
bool isKernelCC(const Function *Func);

Align getAlign(DataLayout const &DL, const GlobalVariable *GV);

bool userRequiresLowering(const SmallPtrSetImpl<GlobalValue *> &UsedList,
                          User *InitialUser);
/// \returns true if an LDS global requires lowering to a module LDS structure
/// if \p F is not given. If \p F is given it must be a kernel and function
/// \returns true if an LDS global is directly used from that kernel and it
/// is safe to replace its uses with a kernel LDS structure member.
/// \p UsedList contains a union of llvm.used and llvm.compiler.used variables
/// which do not count as a use.
bool shouldLowerLDSToStruct(const SmallPtrSetImpl<GlobalValue *> &UsedList,
                            const GlobalVariable &GV,
                            const Function *F = nullptr);

std::vector<GlobalVariable *>
findVariablesToLower(Module &M, const SmallPtrSetImpl<GlobalValue *> &UsedList);
findVariablesToLower(Module &M, const SmallPtrSetImpl<GlobalValue *> &UsedList,
                     const Function *F = nullptr);

SmallPtrSet<GlobalValue *, 32> getUsedList(Module &M);

/// \returns true if all uses of \p U end up in a function \p F.
bool isUsedOnlyFromFunction(const User *U, const Function *F);

} // end namespace AMDGPU

} // end namespace llvm
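One subtlety the doc comment calls out: being listed in llvm.used or llvm.compiler.used is not itself a use. A hedged sketch, with a hypothetical variable name:

@only.listed = internal addrspace(3) global i32 undef, align 4   ; hypothetical
@llvm.compiler.used = appending global [1 x i8*] [i8* addrspacecast (i32 addrspace(3)* @only.listed to i8*)], section "llvm.metadata"
; getUsedList(M) collects @only.listed, so shouldLowerLDSToStruct sees no
; lowering-relevant use from the used-list entry itself.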
@@ -83,7 +83,7 @@ define amdgpu_kernel void @mul_32bit_ptr(float addrspace(1)* %out, [3 x float] a
@g_lds = addrspace(3) global float undef, align 4

; FUNC-LABEL: {{^}}infer_ptr_alignment_global_offset:
; SI: v_mov_b32_e32 [[PTR:v[0-9]+]], g_lds@abs32@lo
; SI: v_mov_b32_e32 [[PTR:v[0-9]+]], 0{{$}}
; SI: ds_read_b32 v{{[0-9]+}}, [[PTR]]
define amdgpu_kernel void @infer_ptr_alignment_global_offset(float addrspace(1)* %out, i32 %tid) {
%val = load float, float addrspace(3)* @g_lds
@@ -9,9 +9,9 @@ define amdgpu_kernel void @use_lds_globals(i32 addrspace(1)* %out, i32 addrspace
; CHECK-LABEL: use_lds_globals:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; CHECK-NEXT: v_mov_b32_e32 v0, 4
; CHECK-NEXT: v_mov_b32_e32 v0, 8
; CHECK-NEXT: s_mov_b32 m0, -1
; CHECK-NEXT: ds_read_b32 v3, v0 offset:4
; CHECK-NEXT: ds_read_b32 v3, v0
; CHECK-NEXT: v_mov_b32_e32 v2, 9
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_add_u32 s0, s0, 4
@@ -1,4 +1,4 @@
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GCN %s
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -amdgpu-enable-lower-module-lds=0 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GCN %s
; FIXME: Merge with DAG test

@lds.external = external unnamed_addr addrspace(3) global [0 x i32]
(File diff suppressed because it is too large)
@@ -1096,27 +1096,33 @@ define amdgpu_kernel void @sgemm_inner_loop_read2_sequence(float addrspace(1)* %
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; CI-NEXT: s_lshl_b32 s0, s2, 2
; CI-NEXT: s_add_i32 s1, s0, 0xc20
; CI-NEXT: s_addk_i32 s0, 0xc60
; CI-NEXT: v_mov_b32_e32 v0, s1
; CI-NEXT: v_mov_b32_e32 v4, s0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_read2_b32 v[2:3], v0 offset1:1
; CI-NEXT: ds_read2_b32 v[4:5], v4 offset1:1
; CI-NEXT: s_add_i32 s1, s0, 0x8c40
; CI-NEXT: s_add_i32 s0, s0, 0x8c80
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: v_lshlrev_b32_e32 v8, 2, v1
; CI-NEXT: ds_read2_b32 v[0:1], v8 offset1:1
; CI-NEXT: ds_read2_b32 v[6:7], v8 offset0:32 offset1:33
; CI-NEXT: ds_read2_b32 v[8:9], v8 offset0:64 offset1:65
; CI-NEXT: s_mov_b32 s7, 0xf000
; CI-NEXT: s_mov_b32 s0, 0x8020
; CI-NEXT: v_add_i32_e32 v4, vcc, s0, v8
; CI-NEXT: s_mov_b32 s0, 0x80a0
; CI-NEXT: v_add_i32_e32 v6, vcc, s0, v8
; CI-NEXT: v_mov_b32_e32 v0, s1
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_read2_b32 v[0:1], v0 offset1:1
; CI-NEXT: ds_read2_b32 v[2:3], v2 offset1:1
; CI-NEXT: ds_read2_b32 v[4:5], v4 offset1:1
; CI-NEXT: ds_read2_b32 v[6:7], v6 offset1:1
; CI-NEXT: v_add_i32_e32 v8, vcc, 0x8120, v8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_add_f32_e32 v2, v2, v3
; CI-NEXT: v_add_f32_e32 v2, v2, v4
; CI-NEXT: v_add_f32_e32 v2, v2, v5
; CI-NEXT: v_add_f32_e32 v0, v2, v0
; CI-NEXT: v_add_f32_e32 v0, v0, v1
; CI-NEXT: v_add_f32_e32 v0, v0, v2
; CI-NEXT: v_add_f32_e32 v0, v0, v3
; CI-NEXT: ds_read2_b32 v[8:9], v8 offset1:1
; CI-NEXT: v_add_f32_e32 v0, v0, v4
; CI-NEXT: v_add_f32_e32 v0, v0, v5
; CI-NEXT: v_add_f32_e32 v0, v0, v6
; CI-NEXT: v_add_f32_e32 v0, v0, v7
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_add_f32_e32 v0, v0, v8
; CI-NEXT: s_mov_b32 s7, 0xf000
; CI-NEXT: s_mov_b32 s6, -1
; CI-NEXT: v_add_f32_e32 v0, v0, v9
; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0
@@ -1125,23 +1131,29 @@ define amdgpu_kernel void @sgemm_inner_loop_read2_sequence(float addrspace(1)* %
; GFX9-LABEL: sgemm_inner_loop_read2_sequence:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_lshl_b32 s2, s2, 2
; GFX9-NEXT: s_add_i32 s3, s2, 0xc20
; GFX9-NEXT: s_addk_i32 s2, 0xc60
; GFX9-NEXT: v_mov_b32_e32 v0, s3
; GFX9-NEXT: v_mov_b32_e32 v4, s2
; GFX9-NEXT: ds_read2_b32 v[2:3], v0 offset1:1
; GFX9-NEXT: ds_read2_b32 v[4:5], v4 offset1:1
; GFX9-NEXT: s_add_i32 s3, s2, 0x8c40
; GFX9-NEXT: v_lshlrev_b32_e32 v8, 2, v1
; GFX9-NEXT: ds_read2_b32 v[0:1], v8 offset1:1
; GFX9-NEXT: ds_read2_b32 v[6:7], v8 offset0:32 offset1:33
; GFX9-NEXT: ds_read2_b32 v[8:9], v8 offset0:64 offset1:65
; GFX9-NEXT: s_add_i32 s2, s2, 0x8c80
; GFX9-NEXT: v_mov_b32_e32 v0, s3
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: v_add_u32_e32 v4, 0x8020, v8
; GFX9-NEXT: v_add_u32_e32 v6, 0x80a0, v8
; GFX9-NEXT: ds_read2_b32 v[0:1], v0 offset1:1
; GFX9-NEXT: ds_read2_b32 v[2:3], v2 offset1:1
; GFX9-NEXT: ds_read2_b32 v[4:5], v4 offset1:1
; GFX9-NEXT: ds_read2_b32 v[6:7], v6 offset1:1
; GFX9-NEXT: v_add_u32_e32 v8, 0x8120, v8
; GFX9-NEXT: s_waitcnt lgkmcnt(3)
; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-NEXT: s_waitcnt lgkmcnt(2)
; GFX9-NEXT: v_add_f32_e32 v0, v0, v2
; GFX9-NEXT: v_add_f32_e32 v0, v0, v3
; GFX9-NEXT: ds_read2_b32 v[8:9], v8 offset1:1
; GFX9-NEXT: s_waitcnt lgkmcnt(2)
; GFX9-NEXT: v_add_f32_e32 v0, v0, v4
; GFX9-NEXT: v_add_f32_e32 v0, v0, v5
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v2, v2, v3
; GFX9-NEXT: v_add_f32_e32 v2, v2, v4
; GFX9-NEXT: v_add_f32_e32 v2, v2, v5
; GFX9-NEXT: v_add_f32_e32 v0, v2, v0
; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-NEXT: v_add_f32_e32 v0, v0, v6
; GFX9-NEXT: v_add_f32_e32 v0, v0, v7
; GFX9-NEXT: v_add_f32_e32 v0, v0, v8
@@ -1462,11 +1474,11 @@ define amdgpu_kernel void @read2_v2i32_align1_odd_offset(<2 x i32> addrspace(1)*
; CI: ; %bb.0: ; %entry
; CI-NEXT: v_mov_b32_e32 v0, 0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_read_u8 v1, v0 offset:72
; CI-NEXT: ds_read_u8 v2, v0 offset:71
; CI-NEXT: ds_read_u8 v3, v0 offset:70
; CI-NEXT: ds_read_u8 v4, v0 offset:69
; CI-NEXT: ds_read_u8 v5, v0 offset:68
; CI-NEXT: ds_read_u8 v1, v0 offset:37032
; CI-NEXT: ds_read_u8 v2, v0 offset:37031
; CI-NEXT: ds_read_u8 v3, v0 offset:37030
; CI-NEXT: ds_read_u8 v4, v0 offset:37029
; CI-NEXT: ds_read_u8 v5, v0 offset:37028
; CI-NEXT: s_waitcnt lgkmcnt(4)
; CI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
; CI-NEXT: s_waitcnt lgkmcnt(3)
@@ -1477,9 +1489,9 @@ define amdgpu_kernel void @read2_v2i32_align1_odd_offset(<2 x i32> addrspace(1)*
; CI-NEXT: v_or_b32_e32 v3, v3, v4
; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; CI-NEXT: v_or_b32_e32 v1, v1, v3
; CI-NEXT: ds_read_u8 v2, v0 offset:67
; CI-NEXT: ds_read_u8 v3, v0 offset:66
; CI-NEXT: ds_read_u8 v0, v0 offset:65
; CI-NEXT: ds_read_u8 v2, v0 offset:37027
; CI-NEXT: ds_read_u8 v3, v0 offset:37026
; CI-NEXT: ds_read_u8 v0, v0 offset:37025
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, -1
@@ -1496,14 +1508,14 @@ define amdgpu_kernel void @read2_v2i32_align1_odd_offset(<2 x i32> addrspace(1)*
; GFX9-ALIGNED-LABEL: read2_v2i32_align1_odd_offset:
; GFX9-ALIGNED: ; %bb.0: ; %entry
; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v2, 0
; GFX9-ALIGNED-NEXT: ds_read_u8 v0, v2 offset:65
; GFX9-ALIGNED-NEXT: ds_read_u8 v3, v2 offset:66
; GFX9-ALIGNED-NEXT: ds_read_u8 v4, v2 offset:67
; GFX9-ALIGNED-NEXT: ds_read_u8 v5, v2 offset:68
; GFX9-ALIGNED-NEXT: ds_read_u8 v1, v2 offset:69
; GFX9-ALIGNED-NEXT: ds_read_u8 v6, v2 offset:70
; GFX9-ALIGNED-NEXT: ds_read_u8 v7, v2 offset:71
; GFX9-ALIGNED-NEXT: ds_read_u8 v8, v2 offset:72
; GFX9-ALIGNED-NEXT: ds_read_u8 v0, v2 offset:37025
; GFX9-ALIGNED-NEXT: ds_read_u8 v3, v2 offset:37026
; GFX9-ALIGNED-NEXT: ds_read_u8 v4, v2 offset:37027
; GFX9-ALIGNED-NEXT: ds_read_u8 v5, v2 offset:37028
; GFX9-ALIGNED-NEXT: ds_read_u8 v1, v2 offset:37029
; GFX9-ALIGNED-NEXT: ds_read_u8 v6, v2 offset:37030
; GFX9-ALIGNED-NEXT: ds_read_u8 v7, v2 offset:37031
; GFX9-ALIGNED-NEXT: ds_read_u8 v8, v2 offset:37032
; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v3, 8, v3
@@ -1521,7 +1533,7 @@ define amdgpu_kernel void @read2_v2i32_align1_odd_offset(<2 x i32> addrspace(1)*
;
; GFX9-UNALIGNED-LABEL: read2_v2i32_align1_odd_offset:
; GFX9-UNALIGNED: ; %bb.0: ; %entry
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v0, 0x41
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v0, 0x90a1
; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-UNALIGNED-NEXT: ds_read2_b32 v[0:1], v0 offset1:1
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v2, 0
@@ -1,4 +1,4 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --force-update
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -enable-var-scope --check-prefix=CI %s
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,-unaligned-access-mode < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9-ALIGNED %s
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,+unaligned-access-mode < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9-UNALIGNED %s
@@ -889,9 +889,9 @@ define amdgpu_kernel void @write2_sgemm_sequence(float addrspace(1)* %C, i32 %ld
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; CI-NEXT: s_lshl_b32 s2, s2, 2
; CI-NEXT: s_add_i32 s3, s2, 0xc20
; CI-NEXT: s_add_i32 s3, s2, 0x8c40
; CI-NEXT: v_mov_b32_e32 v0, s3
; CI-NEXT: s_addk_i32 s2, 0xc60
; CI-NEXT: s_add_i32 s2, s2, 0x8c80
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_load_dword s0, s[0:1], 0x0
; CI-NEXT: s_mov_b32 m0, -1
@@ -902,17 +902,20 @@ define amdgpu_kernel void @write2_sgemm_sequence(float addrspace(1)* %C, i32 %ld
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: ds_write2_b32 v0, v2, v3 offset1:1
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v1
; CI-NEXT: v_add_i32_e32 v1, vcc, 0x8020, v0
; CI-NEXT: ds_write2_b32 v1, v2, v3 offset1:1
; CI-NEXT: v_add_i32_e32 v1, vcc, 0x80a0, v0
; CI-NEXT: v_add_i32_e32 v0, vcc, 0x8120, v0
; CI-NEXT: ds_write2_b32 v1, v2, v3 offset1:1
; CI-NEXT: ds_write2_b32 v0, v2, v3 offset1:1
; CI-NEXT: ds_write2_b32 v0, v2, v3 offset0:32 offset1:33
; CI-NEXT: ds_write2_b32 v0, v2, v3 offset0:64 offset1:65
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: write2_sgemm_sequence:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT: s_lshl_b32 s2, s2, 2
; GFX9-NEXT: s_add_i32 s3, s2, 0xc20
; GFX9-NEXT: s_addk_i32 s2, 0xc60
; GFX9-NEXT: s_add_i32 s3, s2, 0x8c40
; GFX9-NEXT: s_add_i32 s2, s2, 0x8c80
; GFX9-NEXT: v_mov_b32_e32 v0, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s0, s[0:1], 0x0
@@ -923,9 +926,12 @@ define amdgpu_kernel void @write2_sgemm_sequence(float addrspace(1)* %C, i32 %ld
; GFX9-NEXT: ds_write2_b32 v0, v3, v4 offset1:1
; GFX9-NEXT: ds_write2_b32 v2, v3, v4 offset1:1
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v1
; GFX9-NEXT: v_add_u32_e32 v1, 0x8020, v0
; GFX9-NEXT: ds_write2_b32 v1, v3, v4 offset1:1
; GFX9-NEXT: v_add_u32_e32 v1, 0x80a0, v0
; GFX9-NEXT: v_add_u32_e32 v0, 0x8120, v0
; GFX9-NEXT: ds_write2_b32 v1, v3, v4 offset1:1
; GFX9-NEXT: ds_write2_b32 v0, v3, v4 offset1:1
; GFX9-NEXT: ds_write2_b32 v0, v3, v4 offset0:32 offset1:33
; GFX9-NEXT: ds_write2_b32 v0, v3, v4 offset0:64 offset1:65
; GFX9-NEXT: s_endpgm
%x.i = tail call i32 @llvm.amdgcn.workgroup.id.x() #1
%y.i = tail call i32 @llvm.amdgcn.workitem.id.y() #1
@@ -1026,37 +1032,37 @@ define amdgpu_kernel void @write2_v2i32_align1_odd_offset() {
; CI-NEXT: v_mov_b32_e32 v0, 0x7b
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_write_b8 v1, v0 offset:65
; CI-NEXT: ds_write_b8 v1, v0 offset:37025
; CI-NEXT: v_mov_b32_e32 v0, 1
; CI-NEXT: ds_write_b8 v1, v0 offset:70
; CI-NEXT: ds_write_b8 v1, v0 offset:37030
; CI-NEXT: v_mov_b32_e32 v0, 0xc8
; CI-NEXT: ds_write_b8 v1, v0 offset:69
; CI-NEXT: ds_write_b8 v1, v1 offset:68
; CI-NEXT: ds_write_b8 v1, v1 offset:67
; CI-NEXT: ds_write_b8 v1, v1 offset:66
; CI-NEXT: ds_write_b8 v1, v1 offset:72
; CI-NEXT: ds_write_b8 v1, v1 offset:71
; CI-NEXT: ds_write_b8 v1, v0 offset:37029
; CI-NEXT: ds_write_b8 v1, v1 offset:37028
; CI-NEXT: ds_write_b8 v1, v1 offset:37027
; CI-NEXT: ds_write_b8 v1, v1 offset:37026
; CI-NEXT: ds_write_b8 v1, v1 offset:37032
; CI-NEXT: ds_write_b8 v1, v1 offset:37031
; CI-NEXT: s_endpgm
;
; GFX9-ALIGNED-LABEL: write2_v2i32_align1_odd_offset:
; GFX9-ALIGNED: ; %bb.0: ; %entry
; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v0, 0x7b
; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v1, 0
; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v0 offset:65
; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v0 offset:37025
; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v0, 1
; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v0 offset:70
; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v0 offset:37030
; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v0, 0xc8
; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v0 offset:69
; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v1 offset:68
; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v1 offset:67
; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v1 offset:66
; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v1 offset:72
; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v1 offset:71
; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v0 offset:37029
; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v1 offset:37028
; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v1 offset:37027
; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v1 offset:37026
; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v1 offset:37032
; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v1 offset:37031
; GFX9-ALIGNED-NEXT: s_endpgm
;
; GFX9-UNALIGNED-LABEL: write2_v2i32_align1_odd_offset:
; GFX9-UNALIGNED: ; %bb.0: ; %entry
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v0, 0x41
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v0, 0x90a1
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v1, 0x7b
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v2, 0x1c8
; GFX9-UNALIGNED-NEXT: ds_write2_b32 v0, v1, v2 offset1:1
@@ -46,9 +46,9 @@ define amdgpu_kernel void @test_round_size_2(i8 addrspace(1)* %out, i8 addrspace
ret void
}

; 38 + (10 pad) + 38
; 38 + (2 pad) + 38
; HSA-LABEL: {{^}}test_round_size_2_align_8:
; HSA: workgroup_group_segment_byte_size = 86
; HSA: workgroup_group_segment_byte_size = 78
; HSA: group_segment_alignment = 4
define amdgpu_kernel void @test_round_size_2_align_8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
%lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
@@ -94,9 +94,10 @@ define amdgpu_kernel void @test_high_align_lds_arg(i8 addrspace(1)* %out, i8 add
ret void
}

; (7 * 8) + (39 * 4) = 212
; FIXME: missing alignment can be improved.
; (39 * 4) + (4 pad) + (7 * 8) = 216
; HSA-LABEL: {{^}}test_missing_alignment_size_2_order0:
; HSA: workgroup_group_segment_byte_size = 212
; HSA: workgroup_group_segment_byte_size = 216
; HSA: group_segment_alignment = 4
define amdgpu_kernel void @test_missing_alignment_size_2_order0(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
%lds.missing.align.0.bc = bitcast [39 x i32] addrspace(3)* @lds.missing.align.0 to i8 addrspace(3)*
@@ -125,22 +126,11 @@ define amdgpu_kernel void @test_missing_alignment_size_2_order1(i8 addrspace(1)*

ret void
}
; Test how the size needed for padding changes based on when the
; global is encountered during lowering. There should be a consistent
; order to minimize padding waste.
;
; The way global addresses are lowered now, this is in inverse of
; first use order which isn't great.
;
; This should be the optimal order for these globals. If sorted to
; minimize padding, the minimum possible size is: align 32, align 8,
; align 16


; align 32, 16, 8
; 38 + (10 pad) + 38 + (10 pad) + 38 = 134
; 38 + (10 pad) + 38 + (2 pad) + 38 = 126
; HSA-LABEL: {{^}}test_round_size_3_order0:
; HSA: workgroup_group_segment_byte_size = 134
; HSA: workgroup_group_segment_byte_size = 126
; HSA: group_segment_alignment = 4
define amdgpu_kernel void @test_round_size_3_order0(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
%lds.align32.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align32.0 to i8 addrspace(3)*
@@ -159,9 +149,9 @@ define amdgpu_kernel void @test_round_size_3_order0(i8 addrspace(1)* %out, i8 ad
}

; align 32, 8, 16
; 38 (+ 2 pad) + 38 + (18 pad) + 38 = 134
; 38 (+ 10 pad) + 38 + (2 pad) + 38 = 126
; HSA-LABEL: {{^}}test_round_size_3_order1:
; HSA: workgroup_group_segment_byte_size = 134
; HSA: workgroup_group_segment_byte_size = 126
; HSA: group_segment_alignment = 4
define amdgpu_kernel void @test_round_size_3_order1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
%lds.align32.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align32.0 to i8 addrspace(3)*
@@ -180,9 +170,9 @@ define amdgpu_kernel void @test_round_size_3_order1(i8 addrspace(1)* %out, i8 ad
}

; align 16, 32, 8
; 38 + (26 pad) + 38 + (10 pad) + 38 = 150
; 38 + (10 pad) + 38 + (2 pad) + 38 = 126
; HSA-LABEL: {{^}}test_round_size_3_order2:
; HSA: workgroup_group_segment_byte_size = 150
; HSA: workgroup_group_segment_byte_size = 126
; HSA: group_segment_alignment = 4
define amdgpu_kernel void @test_round_size_3_order2(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
%lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
@@ -200,10 +190,11 @@ define amdgpu_kernel void @test_round_size_3_order2(i8 addrspace(1)* %out, i8 ad
ret void
}

; FIXME: Improve alignment
; align 16, 8, 32
; 38 + (2 pad) + 38 + (2 pad) + 38
; 38 + (10 pad) + 38 + (2 pad) + 38
; HSA-LABEL: {{^}}test_round_size_3_order3:
; HSA: workgroup_group_segment_byte_size = 118
; HSA: workgroup_group_segment_byte_size = 126
; HSA: group_segment_alignment = 4
define amdgpu_kernel void @test_round_size_3_order3(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
%lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
@@ -222,9 +213,9 @@ define amdgpu_kernel void @test_round_size_3_order3(i8 addrspace(1)* %out, i8 ad
}

; align 8, 32, 16
; 38 + (26 pad) + 38 + (2 pad) + 38 = 142
; 38 + (10 pad) + 38 + (2 pad) + 38 = 126
; HSA-LABEL: {{^}}test_round_size_3_order4:
; HSA: workgroup_group_segment_byte_size = 142
; HSA: workgroup_group_segment_byte_size = 126
; HSA: group_segment_alignment = 4
define amdgpu_kernel void @test_round_size_3_order4(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
%lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)*
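The updated byte sizes in this file all follow from the pass allocating the three 38-byte arrays in decreasing alignment order, independent of the order in which the globals are encountered. The arithmetic behind the order* tests, as a sketch:

; layout: [38 x i8] align 32 at offset 0;  next offset rounds 38 up to 48 (10 pad)
;         [38 x i8] align 16 at offset 48; next offset rounds 86 up to 88 (2 pad)
;         [38 x i8] align 8  at offset 88; struct ends at offset 126
; hence: 38 + (10 pad) + 38 + (2 pad) + 38 = 126 bytes in every ordering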
@@ -1,5 +1,5 @@
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GCN %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -filetype=obj < %s | llvm-readobj -r -t - | FileCheck -check-prefixes=ELF %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -amdgpu-enable-lower-module-lds=0 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GCN %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -amdgpu-enable-lower-module-lds=0 -filetype=obj < %s | llvm-readobj -r -t - | FileCheck -check-prefixes=ELF %s

@lds.external = external unnamed_addr addrspace(3) global [0 x i32]
@lds.defined = unnamed_addr addrspace(3) global [8 x i32] undef, align 8
@@ -275,11 +275,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(i64* %ptr) #0
; CIVI-DAG: s_mov_b32 m0
; GFX9-NOT: m0

; CIVI-DAG: v_lshlrev_b32_e32 [[OFS:v[0-9]+]], 2, {{v[0-9]+}}
; CIVI-DAG: v_add_{{[ui]}}32_e32 [[PTR:v[0-9]+]], vcc, lds0@abs32@lo, [[OFS]]
; GFX9-DAG: s_mov_b32 [[BASE:s[0-9]+]], lds0@abs32@lo
; GFX9-DAG: v_lshl_add_u32 [[PTR:v[0-9]+]], {{v[0-9]+}}, 2, [[BASE]]

; GCN-DAG: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
; GCN: ds_dec_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
define amdgpu_kernel void @atomic_dec_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
@@ -428,11 +424,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_addr64(i64 addrspa
; CIVI-DAG: s_mov_b32 m0
; GFX9-NOT: m0

; CIVI-DAG: v_lshlrev_b32_e32 [[OFS:v[0-9]+]], 3, {{v[0-9]+}}
; CIVI-DAG: v_add_{{[ui]}}32_e32 [[PTR:v[0-9]+]], vcc, lds1@abs32@lo, [[OFS]]
; GFX9-DAG: v_mov_b32_e32 [[BASE:v[0-9]+]], lds1@abs32@lo
; GFX9-DAG: v_lshl_add_u32 [[PTR:v[0-9]+]], {{v[0-9]+}}, 3, [[BASE]]

; GCN-DAG: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 3, {{v[0-9]+}}
; GCN: ds_dec_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]], v{{\[[0-9]+:[0-9]+\]}} offset:16
define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
@@ -136,10 +136,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(i32 addrspa
@lds0 = addrspace(3) global [512 x i32] undef, align 4

; GCN-LABEL: {{^}}atomic_inc_shl_base_lds_0_i32:
; CIVI-DAG: v_lshlrev_b32_e32 [[OFS:v[0-9]+]], 2, {{v[0-9]+}}
; CIVI-DAG: v_add_{{[ui]}}32_e32 [[PTR:v[0-9]+]], vcc, lds0@abs32@lo, [[OFS]]
; GFX9-DAG: s_mov_b32 [[BASE:s[0-9]+]], lds0@abs32@lo
; GFX9-DAG: v_lshl_add_u32 [[PTR:v[0-9]+]], {{v[0-9]+}}, 2, [[BASE]]
; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
; GCN: ds_inc_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
@@ -337,10 +334,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(i32* %ptr) #0
@lds1 = addrspace(3) global [512 x i64] undef, align 8

; GCN-LABEL: {{^}}atomic_inc_shl_base_lds_0_i64:
; CIVI-DAG: v_lshlrev_b32_e32 [[OFS:v[0-9]+]], 3, {{v[0-9]+}}
; CIVI-DAG: v_add_{{[ui]}}32_e32 [[PTR:v[0-9]+]], vcc, lds1@abs32@lo, [[OFS]]
; GFX9-DAG: v_mov_b32_e32 [[BASE:v[0-9]+]], lds1@abs32@lo
; GFX9-DAG: v_lshl_add_u32 [[PTR:v[0-9]+]], {{v[0-9]+}}, 3, [[BASE]]
; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 3, {{v[0-9]+}}
; GCN: ds_inc_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]], v{{\[[0-9]+:[0-9]+\]}} offset:16
define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
@@ -38,17 +38,15 @@ entry:

; GCN-LABEL: {{^}}local_memory_two_objects:
; GCN: v_lshlrev_b32_e32 [[ADDRW:v[0-9]+]], 2, v0
; CI-DAG: v_sub_i32_e32 [[SUB:v[0-9]+]], vcc, 0, [[ADDRW]]
; CI-DAG: ds_write2_b32 [[ADDRW]], {{v[0-9]+}}, {{v[0-9]+}} offset1:4
; SI-DAG: ds_write2_b32 [[ADDRW]], {{v[0-9]+}}, {{v[0-9]+}} offset1:4
; SI-DAG: v_sub_i32_e32 [[SUB0:v[0-9]+]], vcc, 28, [[ADDRW]]
; CI-DAG: v_sub_i32_e32 [[SUB0:v[0-9]+]], vcc, 0, [[ADDRW]]
; SI-DAG: v_sub_i32_e32 [[SUB0:v[0-9]+]], vcc, 12, [[ADDRW]]
; GCN-DAG: v_sub_i32_e32 [[SUB1:v[0-9]+]], vcc, 16, [[ADDRW]]
; GCN-DAG: ds_write2_b32 [[ADDRW]], {{v[0-9]+}}, {{v[0-9]+}} offset1:4

; GCN: s_barrier

; SI-DAG: v_sub_i32_e32 [[SUB1:v[0-9]+]], vcc, 12, [[ADDRW]]
; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[SUB0]]
; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[SUB1]]
; CI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, [[SUB]] offset0:3 offset1:7
; GCN-DAG: ds_read_b32 v{{[0-9]+}}, [[SUB0]]
; GCN-DAG: ds_read_b32 v{{[0-9]+}}, [[SUB1]]

define amdgpu_kernel void @local_memory_two_objects(i32 addrspace(1)* %out) #0 {
entry:
@@ -10,7 +10,7 @@
; not an immediate.

; FUNC-LABEL: {{^}}load_i32_local_const_ptr:
; GCN: v_mov_b32_e32 v[[PTR:[0-9]+]], lds@abs32@lo
; GCN: v_mov_b32_e32 v[[PTR:[0-9]+]], 0{{$}}
; GCN: ds_read_b32 v{{[0-9]+}}, v[[PTR]] offset:4

; R600: LDS_READ_RET
@@ -1,5 +1,3 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: opt -mtriple=amdgcn-- -S -structurizecfg -si-annotate-control-flow %s | FileCheck -check-prefix=OPT %s
; RUN: llc -march=amdgcn -verify-machineinstrs -disable-block-placement < %s | FileCheck -check-prefix=GCN %s

@@ -49,7 +47,7 @@ define amdgpu_kernel void @break_loop(i32 %arg) #0 {
; GCN-NEXT: s_cbranch_scc1 BB0_3
; GCN-NEXT: ; %bb.2: ; %bb4
; GCN-NEXT: ; in Loop: Header=BB0_1 Depth=1
; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0
; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cmp_ge_i32_e32 vcc, v0, v1
; GCN-NEXT: s_andn2_b64 s[4:5], s[4:5], exec
@@ -128,7 +126,7 @@ define amdgpu_kernel void @undef_phi_cond_break_loop(i32 %arg) #0 {
; GCN-NEXT: s_cbranch_scc1 BB1_3
; GCN-NEXT: ; %bb.2: ; %bb4
; GCN-NEXT: ; in Loop: Header=BB1_1 Depth=1
; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0
; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cmp_ge_i32_e32 vcc, v0, v1
; GCN-NEXT: s_andn2_b64 s[6:7], s[6:7], exec
@@ -206,24 +204,20 @@ define amdgpu_kernel void @constexpr_phi_cond_break_loop(i32 %arg) #0 {
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dword s3, s[0:1], 0x9
; GCN-NEXT: s_mov_b64 s[0:1], 0
; GCN-NEXT: s_mov_b32 s2, lds@abs32@lo
; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s3, v0
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: ; implicit-def: $sgpr4_sgpr5
; GCN-NEXT: ; implicit-def: $sgpr3
; GCN-NEXT: ; implicit-def: $sgpr6
; GCN-NEXT: BB2_1: ; %bb1
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN-NEXT: v_cmp_ne_u32_e64 s[8:9], s2, 4
; GCN-NEXT: s_andn2_b64 s[4:5], s[4:5], exec
; GCN-NEXT: s_and_b64 s[8:9], s[8:9], exec
; GCN-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9]
; GCN-NEXT: s_cmp_gt_i32 s3, -1
; GCN-NEXT: s_or_b64 s[4:5], s[4:5], exec
; GCN-NEXT: s_cmp_gt_i32 s6, -1
; GCN-NEXT: s_cbranch_scc1 BB2_3
; GCN-NEXT: ; %bb.2: ; %bb4
; GCN-NEXT: ; in Loop: Header=BB2_1 Depth=1
; GCN-NEXT: buffer_load_dword v1, off, s[4:7], 0
; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cmp_ge_i32_e32 vcc, v0, v1
; GCN-NEXT: s_andn2_b64 s[4:5], s[4:5], exec
@@ -231,7 +225,7 @@ define amdgpu_kernel void @constexpr_phi_cond_break_loop(i32 %arg) #0 {
; GCN-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9]
; GCN-NEXT: BB2_3: ; %Flow
; GCN-NEXT: ; in Loop: Header=BB2_1 Depth=1
; GCN-NEXT: s_add_i32 s3, s3, 1
; GCN-NEXT: s_add_i32 s6, s6, 1
; GCN-NEXT: s_and_b64 s[8:9], exec, s[4:5]
; GCN-NEXT: s_or_b64 s[0:1], s[8:9], s[0:1]
; GCN-NEXT: s_andn2_b64 exec, exec, s[0:1]
@@ -311,7 +305,7 @@ define amdgpu_kernel void @true_phi_cond_break_loop(i32 %arg) #0 {
; GCN-NEXT: s_cbranch_scc1 BB3_3
; GCN-NEXT: ; %bb.2: ; %bb4
; GCN-NEXT: ; in Loop: Header=BB3_1 Depth=1
; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0
; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cmp_ge_i32_e32 vcc, v0, v1
; GCN-NEXT: s_andn2_b64 s[4:5], s[4:5], exec
@@ -399,7 +393,7 @@ define amdgpu_kernel void @false_phi_cond_break_loop(i32 %arg) #0 {
; GCN-NEXT: s_cbranch_scc1 BB4_3
; GCN-NEXT: ; %bb.2: ; %bb4
; GCN-NEXT: ; in Loop: Header=BB4_1 Depth=1
; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0
; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cmp_ge_i32_e32 vcc, v0, v1
; GCN-NEXT: s_andn2_b64 s[4:5], s[4:5], exec
@@ -491,7 +485,7 @@ define amdgpu_kernel void @invert_true_phi_cond_break_loop(i32 %arg) #0 {
; GCN-NEXT: s_cbranch_scc1 BB5_3
; GCN-NEXT: ; %bb.2: ; %bb4
; GCN-NEXT: ; in Loop: Header=BB5_1 Depth=1
; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0
; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cmp_ge_i32_e32 vcc, v0, v1
; GCN-NEXT: s_andn2_b64 s[4:5], s[4:5], exec
test/CodeGen/AMDGPU/lower-kernel-and-module-lds.ll (new file, 119 lines)
@@ -0,0 +1,119 @@
; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s | FileCheck %s
; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck %s

@lds.size.1.align.1 = internal unnamed_addr addrspace(3) global [1 x i8] undef, align 1
@lds.size.2.align.2 = internal unnamed_addr addrspace(3) global [2 x i8] undef, align 2
@lds.size.4.align.4 = internal unnamed_addr addrspace(3) global [4 x i8] undef, align 4
@lds.size.8.align.8 = internal unnamed_addr addrspace(3) global [8 x i8] undef, align 8
@lds.size.16.align.16 = internal unnamed_addr addrspace(3) global [16 x i8] undef, align 16

; CHECK: %llvm.amdgcn.module.lds.t = type { [8 x i8], [1 x i8] }
; CHECK: %llvm.amdgcn.kernel.k0.lds.t = type { [16 x i8], [4 x i8], [2 x i8] }
; CHECK: %llvm.amdgcn.kernel.k1.lds.t = type { [16 x i8], [4 x i8], [2 x i8] }
; CHECK: %llvm.amdgcn.kernel..lds.t = type { [2 x i8] }
; CHECK: %llvm.amdgcn.kernel..lds.t.0 = type { [4 x i8] }

;.
; CHECK: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t undef, align 8
; CHECK: @llvm.compiler.used = appending global [1 x i8*] [i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0, i32 0) to i8*)], section "llvm.metadata"
; CHECK: @llvm.amdgcn.kernel.k0.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k0.lds.t undef, align 16
; CHECK: @llvm.amdgcn.kernel.k1.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k1.lds.t undef, align 16
; CHECK: @llvm.amdgcn.kernel..lds = internal addrspace(3) global %llvm.amdgcn.kernel..lds.t undef, align 2
; CHECK: @llvm.amdgcn.kernel..lds.1 = internal addrspace(3) global %llvm.amdgcn.kernel..lds.t.0 undef, align 4
;.
define amdgpu_kernel void @k0() {
; CHECK-LABEL: @k0(
; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ]
; CHECK-NEXT: %lds.size.1.align.1.bc = bitcast [1 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i8 addrspace(3)*
; CHECK-NEXT: store i8 1, i8 addrspace(3)* %lds.size.1.align.1.bc, align 1
; CHECK-NEXT: %lds.size.2.align.2.bc = bitcast [2 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, %llvm.amdgcn.kernel.k0.lds.t addrspace(3)* @llvm.amdgcn.kernel.k0.lds, i32 0, i32 2) to i8 addrspace(3)*
; CHECK-NEXT: store i8 2, i8 addrspace(3)* %lds.size.2.align.2.bc, align 2
; CHECK-NEXT: %lds.size.4.align.4.bc = bitcast [4 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, %llvm.amdgcn.kernel.k0.lds.t addrspace(3)* @llvm.amdgcn.kernel.k0.lds, i32 0, i32 1) to i8 addrspace(3)*
; CHECK-NEXT: store i8 4, i8 addrspace(3)* %lds.size.4.align.4.bc, align 4
; CHECK-NEXT: %lds.size.16.align.16.bc = bitcast [16 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, %llvm.amdgcn.kernel.k0.lds.t addrspace(3)* @llvm.amdgcn.kernel.k0.lds, i32 0, i32 0) to i8 addrspace(3)*
; CHECK-NEXT: store i8 16, i8 addrspace(3)* %lds.size.16.align.16.bc, align 16
; CHECK-NEXT: ret void
;
%lds.size.1.align.1.bc = bitcast [1 x i8] addrspace(3)* @lds.size.1.align.1 to i8 addrspace(3)*
store i8 1, i8 addrspace(3)* %lds.size.1.align.1.bc, align 1

%lds.size.2.align.2.bc = bitcast [2 x i8] addrspace(3)* @lds.size.2.align.2 to i8 addrspace(3)*
store i8 2, i8 addrspace(3)* %lds.size.2.align.2.bc, align 2

%lds.size.4.align.4.bc = bitcast [4 x i8] addrspace(3)* @lds.size.4.align.4 to i8 addrspace(3)*
store i8 4, i8 addrspace(3)* %lds.size.4.align.4.bc, align 4

%lds.size.16.align.16.bc = bitcast [16 x i8] addrspace(3)* @lds.size.16.align.16 to i8 addrspace(3)*
store i8 16, i8 addrspace(3)* %lds.size.16.align.16.bc, align 16

ret void
}

define amdgpu_kernel void @k1() {
; CHECK-LABEL: @k1(
; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ]
; CHECK-NEXT: %lds.size.2.align.2.bc = bitcast [2 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, %llvm.amdgcn.kernel.k1.lds.t addrspace(3)* @llvm.amdgcn.kernel.k1.lds, i32 0, i32 2) to i8 addrspace(3)*
; CHECK-NEXT: store i8 2, i8 addrspace(3)* %lds.size.2.align.2.bc, align 2
; CHECK-NEXT: %lds.size.4.align.4.bc = bitcast [4 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, %llvm.amdgcn.kernel.k1.lds.t addrspace(3)* @llvm.amdgcn.kernel.k1.lds, i32 0, i32 1) to i8 addrspace(3)*
; CHECK-NEXT: store i8 4, i8 addrspace(3)* %lds.size.4.align.4.bc, align 4
; CHECK-NEXT: %lds.size.16.align.16.bc = bitcast [16 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, %llvm.amdgcn.kernel.k1.lds.t addrspace(3)* @llvm.amdgcn.kernel.k1.lds, i32 0, i32 0) to i8 addrspace(3)*
; CHECK-NEXT: store i8 16, i8 addrspace(3)* %lds.size.16.align.16.bc, align 16
; CHECK-NEXT: ret void
;
%lds.size.2.align.2.bc = bitcast [2 x i8] addrspace(3)* @lds.size.2.align.2 to i8 addrspace(3)*
store i8 2, i8 addrspace(3)* %lds.size.2.align.2.bc, align 2

%lds.size.4.align.4.bc = bitcast [4 x i8] addrspace(3)* @lds.size.4.align.4 to i8 addrspace(3)*
store i8 4, i8 addrspace(3)* %lds.size.4.align.4.bc, align 4

%lds.size.16.align.16.bc = bitcast [16 x i8] addrspace(3)* @lds.size.16.align.16 to i8 addrspace(3)*
store i8 16, i8 addrspace(3)* %lds.size.16.align.16.bc, align 16

ret void
}

define amdgpu_kernel void @0() {
; CHECK-LABEL: @0(
; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ]
; CHECK-NEXT: %lds.size.2.align.2.bc = bitcast [2 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel..lds.t, %llvm.amdgcn.kernel..lds.t addrspace(3)* @llvm.amdgcn.kernel..lds, i32 0, i32 0) to i8 addrspace(3)*
; CHECK-NEXT: store i8 2, i8 addrspace(3)* %lds.size.2.align.2.bc, align 2
; CHECK-NEXT: ret void
;
%lds.size.2.align.2.bc = bitcast [2 x i8] addrspace(3)* @lds.size.2.align.2 to i8 addrspace(3)*
store i8 2, i8 addrspace(3)* %lds.size.2.align.2.bc, align 2

ret void
}

define amdgpu_kernel void @1() {
; CHECK-LABEL: @1(
; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ]
; CHECK-NEXT: %lds.size.4.align.4.bc = bitcast [4 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel..lds.t.0, %llvm.amdgcn.kernel..lds.t.0 addrspace(3)* @llvm.amdgcn.kernel..lds.1, i32 0, i32 0) to i8 addrspace(3)*
; CHECK-NEXT: store i8 4, i8 addrspace(3)* %lds.size.4.align.4.bc, align 4
; CHECK-NEXT: ret void
;
%lds.size.4.align.4.bc = bitcast [4 x i8] addrspace(3)* @lds.size.4.align.4 to i8 addrspace(3)*
store i8 4, i8 addrspace(3)* %lds.size.4.align.4.bc, align 4

ret void
}

define void @f0() {
; CHECK-LABEL: @f0(
; CHECK-NEXT: %lds.size.1.align.1.bc = bitcast [1 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i8 addrspace(3)*
; CHECK-NEXT: store i8 1, i8 addrspace(3)* %lds.size.1.align.1.bc, align 1
; CHECK-NEXT: %lds.size.8.align.8.bc = bitcast [8 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0) to i8 addrspace(3)*
; CHECK-NEXT: store i8 8, i8 addrspace(3)* %lds.size.8.align.8.bc, align 4
; CHECK-NEXT: ret void
;
%lds.size.1.align.1.bc = bitcast [1 x i8] addrspace(3)* @lds.size.1.align.1 to i8 addrspace(3)*
store i8 1, i8 addrspace(3)* %lds.size.1.align.1.bc, align 1

%lds.size.8.align.8.bc = bitcast [8 x i8] addrspace(3)* @lds.size.8.align.8 to i8 addrspace(3)*
store i8 8, i8 addrspace(3)* %lds.size.8.align.8.bc, align 4

ret void
}
;.
; CHECK: attributes #0 = { nofree nosync nounwind readnone willreturn }
;.
test/CodeGen/AMDGPU/lower-kernel-lds-constexpr.ll (new file, 68 lines)
@@ -0,0 +1,68 @@
; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s | FileCheck %s
; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck %s

; CHECK: %llvm.amdgcn.kernel.k2.lds.t = type { i32 }

@lds.1 = internal unnamed_addr addrspace(3) global [2 x i8] undef, align 1

; Use constant from different kernels
;.
; CHECK: @lds.1 = internal unnamed_addr addrspace(3) global [2 x i8] undef, align 1
; CHECK: @llvm.amdgcn.kernel.k2.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k2.lds.t undef, align 4
;.
define amdgpu_kernel void @k0(i64 %x) {
; CHECK-LABEL: @k0(
; CHECK-NEXT: %ptr = getelementptr inbounds i8, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([2 x i8], [2 x i8] addrspace(3)* @lds.1, i32 0, i32 0) to i8*), i64 %x
; CHECK-NEXT: store i8 1, i8* %ptr, align 1
; CHECK-NEXT: ret void
;
%ptr = getelementptr inbounds i8, i8* addrspacecast ([2 x i8] addrspace(3)* @lds.1 to i8*), i64 %x
store i8 1, i8 addrspace(0)* %ptr, align 1
ret void
}

define amdgpu_kernel void @k1(i64 %x) {
; CHECK-LABEL: @k1(
; CHECK-NEXT: %ptr = getelementptr inbounds i8, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([2 x i8], [2 x i8] addrspace(3)* @lds.1, i32 0, i32 0) to i8*), i64 %x
; CHECK-NEXT: store i8 1, i8* %ptr, align 1
; CHECK-NEXT: ret void
;
%ptr = getelementptr inbounds i8, i8* addrspacecast ([2 x i8] addrspace(3)* @lds.1 to i8*), i64 %x
store i8 1, i8 addrspace(0)* %ptr, align 1
ret void
}

@lds.2 = internal unnamed_addr addrspace(3) global i32 undef, align 4

; Use constant twice from the same kernel
define amdgpu_kernel void @k2(i64 %x) {
; CHECK-LABEL: @k2(
; CHECK-NEXT: %ptr1 = bitcast i32 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k2.lds.t, %llvm.amdgcn.kernel.k2.lds.t addrspace(3)* @llvm.amdgcn.kernel.k2.lds, i32 0, i32 0) to i8 addrspace(3)*
; CHECK-NEXT: store i8 1, i8 addrspace(3)* %ptr1, align 4
; CHECK-NEXT: %ptr2 = bitcast i32 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k2.lds.t, %llvm.amdgcn.kernel.k2.lds.t addrspace(3)* @llvm.amdgcn.kernel.k2.lds, i32 0, i32 0) to i8 addrspace(3)*
; CHECK-NEXT: store i8 2, i8 addrspace(3)* %ptr2, align 4
; CHECK-NEXT: ret void
;
%ptr1 = bitcast i32 addrspace(3)* @lds.2 to i8 addrspace(3)*
store i8 1, i8 addrspace(3)* %ptr1, align 4
%ptr2 = bitcast i32 addrspace(3)* @lds.2 to i8 addrspace(3)*
store i8 2, i8 addrspace(3)* %ptr2, align 4
ret void
}

@lds.3 = internal unnamed_addr addrspace(3) global [32 x i8] undef, align 1

; Use constant twice from the same kernel but a different other constant.
define amdgpu_kernel void @k3(i64 %x) {
; CHECK-LABEL: @k3(
; CHECK-NEXT: %ptr1 = addrspacecast i64 addrspace(3)* bitcast (i8 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k3.lds.t, %llvm.amdgcn.kernel.k3.lds.t addrspace(3)* @llvm.amdgcn.kernel.k3.lds, i32 0, i32 0, i32 16) to i64 addrspace(3)*) to i64*
; CHECK-NEXT: store i64 1, i64* %ptr1, align 1
; CHECK-NEXT: %ptr2 = addrspacecast i64 addrspace(3)* bitcast (i8 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k3.lds.t, %llvm.amdgcn.kernel.k3.lds.t addrspace(3)* @llvm.amdgcn.kernel.k3.lds, i32 0, i32 0, i32 24) to i64 addrspace(3)*) to i64*
; CHECK-NEXT: store i64 2, i64* %ptr2, align 1
;
%ptr1 = addrspacecast i64 addrspace(3)* bitcast (i8 addrspace(3)* getelementptr inbounds ([32 x i8], [32 x i8] addrspace(3)* @lds.3, i32 0, i32 16) to i64 addrspace(3)*) to i64*
store i64 1, i64* %ptr1, align 1
%ptr2 = addrspacecast i64 addrspace(3)* bitcast (i8 addrspace(3)* getelementptr inbounds ([32 x i8], [32 x i8] addrspace(3)* @lds.3, i32 0, i32 24) to i64 addrspace(3)*) to i64*
store i64 2, i64* %ptr2, align 1
ret void
}
test/CodeGen/AMDGPU/lower-kernel-lds.ll (new file, 65 lines)
@@ -0,0 +1,65 @@
; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s | FileCheck %s
; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck %s

@lds.size.1.align.1 = internal unnamed_addr addrspace(3) global [1 x i8] undef, align 1
@lds.size.2.align.2 = internal unnamed_addr addrspace(3) global [2 x i8] undef, align 2
@lds.size.4.align.4 = internal unnamed_addr addrspace(3) global [4 x i8] undef, align 4
@lds.size.8.align.8 = internal unnamed_addr addrspace(3) global [8 x i8] undef, align 8
@lds.size.16.align.16 = internal unnamed_addr addrspace(3) global [16 x i8] undef, align 16

; CHECK: %llvm.amdgcn.kernel.k0.lds.t = type { [16 x i8], [4 x i8], [2 x i8], [1 x i8] }
; CHECK: %llvm.amdgcn.kernel.k1.lds.t = type { [16 x i8], [4 x i8], [2 x i8] }

;.
; CHECK: @lds.size.8.align.8 = internal unnamed_addr addrspace(3) global [8 x i8] undef, align 8
; CHECK: @llvm.amdgcn.kernel.k0.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k0.lds.t undef, align 16
; CHECK: @llvm.amdgcn.kernel.k1.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k1.lds.t undef, align 16
;.
define amdgpu_kernel void @k0() {
; CHECK-LABEL: @k0(
; CHECK-NEXT: %lds.size.1.align.1.bc = bitcast [1 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, %llvm.amdgcn.kernel.k0.lds.t addrspace(3)* @llvm.amdgcn.kernel.k0.lds, i32 0, i32 3) to i8 addrspace(3)*
; CHECK-NEXT: store i8 1, i8 addrspace(3)* %lds.size.1.align.1.bc, align 1
; CHECK-NEXT: %lds.size.2.align.2.bc = bitcast [2 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, %llvm.amdgcn.kernel.k0.lds.t addrspace(3)* @llvm.amdgcn.kernel.k0.lds, i32 0, i32 2) to i8 addrspace(3)*
; CHECK-NEXT: store i8 2, i8 addrspace(3)* %lds.size.2.align.2.bc, align 2
; CHECK-NEXT: %lds.size.4.align.4.bc = bitcast [4 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, %llvm.amdgcn.kernel.k0.lds.t addrspace(3)* @llvm.amdgcn.kernel.k0.lds, i32 0, i32 1) to i8 addrspace(3)*
; CHECK-NEXT: store i8 4, i8 addrspace(3)* %lds.size.4.align.4.bc, align 4
; CHECK-NEXT: %lds.size.16.align.16.bc = bitcast [16 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, %llvm.amdgcn.kernel.k0.lds.t addrspace(3)* @llvm.amdgcn.kernel.k0.lds, i32 0, i32 0) to i8 addrspace(3)*
; CHECK-NEXT: store i8 16, i8 addrspace(3)* %lds.size.16.align.16.bc, align 16
; CHECK-NEXT: ret void
;
%lds.size.1.align.1.bc = bitcast [1 x i8] addrspace(3)* @lds.size.1.align.1 to i8 addrspace(3)*
store i8 1, i8 addrspace(3)* %lds.size.1.align.1.bc, align 1

%lds.size.2.align.2.bc = bitcast [2 x i8] addrspace(3)* @lds.size.2.align.2 to i8 addrspace(3)*
store i8 2, i8 addrspace(3)* %lds.size.2.align.2.bc, align 2

%lds.size.4.align.4.bc = bitcast [4 x i8] addrspace(3)* @lds.size.4.align.4 to i8 addrspace(3)*
store i8 4, i8 addrspace(3)* %lds.size.4.align.4.bc, align 4

%lds.size.16.align.16.bc = bitcast [16 x i8] addrspace(3)* @lds.size.16.align.16 to i8 addrspace(3)*
store i8 16, i8 addrspace(3)* %lds.size.16.align.16.bc, align 16

ret void
}

define amdgpu_kernel void @k1() {
; CHECK-LABEL: @k1(
; CHECK-NEXT: %lds.size.2.align.2.bc = bitcast [2 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, %llvm.amdgcn.kernel.k1.lds.t addrspace(3)* @llvm.amdgcn.kernel.k1.lds, i32 0, i32 2) to i8 addrspace(3)*
; CHECK-NEXT: store i8 2, i8 addrspace(3)* %lds.size.2.align.2.bc, align 2
; CHECK-NEXT: %lds.size.4.align.4.bc = bitcast [4 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, %llvm.amdgcn.kernel.k1.lds.t addrspace(3)* @llvm.amdgcn.kernel.k1.lds, i32 0, i32 1) to i8 addrspace(3)*
; CHECK-NEXT: store i8 4, i8 addrspace(3)* %lds.size.4.align.4.bc, align 4
; CHECK-NEXT: %lds.size.16.align.16.bc = bitcast [16 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, %llvm.amdgcn.kernel.k1.lds.t addrspace(3)* @llvm.amdgcn.kernel.k1.lds, i32 0, i32 0) to i8 addrspace(3)*
; CHECK-NEXT: store i8 16, i8 addrspace(3)* %lds.size.16.align.16.bc, align 16
; CHECK-NEXT: ret void
;
%lds.size.2.align.2.bc = bitcast [2 x i8] addrspace(3)* @lds.size.2.align.2 to i8 addrspace(3)*
store i8 2, i8 addrspace(3)* %lds.size.2.align.2.bc, align 2

%lds.size.4.align.4.bc = bitcast [4 x i8] addrspace(3)* @lds.size.4.align.4 to i8 addrspace(3)*
store i8 4, i8 addrspace(3)* %lds.size.4.align.4.bc, align 4

%lds.size.16.align.16.bc = bitcast [16 x i8] addrspace(3)* @lds.size.16.align.16 to i8 addrspace(3)*
store i8 16, i8 addrspace(3)* %lds.size.16.align.16.bc, align 16

ret void
}
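The struct layouts checked above make the ordering visible: @k0's variables come out as { [16 x i8], [4 x i8], [2 x i8], [1 x i8] }, i.e. sorted by decreasing alignment and, on ties, decreasing size. A minimal C++ sketch of that ordering step follows; the helper name sortByAlignmentThenSize is illustrative, not the pass's actual API.

#include <algorithm>
#include <vector>

#include "llvm/IR/DataLayout.h"
#include "llvm/IR/GlobalVariable.h"

using namespace llvm;

// Hypothetical helper: order LDS variables so the most strictly aligned,
// and on ties the largest, come first. For these tests, packing fields in
// this order leaves no padding between them, and the struct as a whole
// takes the first (maximum) alignment, hence the "align 16" on the
// CHECK'd @llvm.amdgcn.kernel.*.lds globals.
static void sortByAlignmentThenSize(std::vector<GlobalVariable *> &Vars,
                                    const DataLayout &DL) {
  auto alignOf = [&DL](const GlobalVariable *GV) -> uint64_t {
    // Prefer the variable's explicit alignment (the tests set align N),
    // falling back to the ABI alignment of its value type.
    return DL.getValueOrABITypeAlignment(GV->getAlign(), GV->getValueType())
        .value();
  };
  std::stable_sort(Vars.begin(), Vars.end(),
                   [&](const GlobalVariable *A, const GlobalVariable *B) {
                     uint64_t AA = alignOf(A), BA = alignOf(B);
                     if (AA != BA)
                       return AA > BA; // higher alignment first
                     uint64_t ASz = DL.getTypeAllocSize(A->getValueType());
                     uint64_t BSz = DL.getTypeAllocSize(B->getValueType());
                     return ASz > BSz; // then larger first
                   });
}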
@ -5,8 +5,8 @@

@func = addrspace(3) global float undef, align 4

; @kern is only used from a kernel so it is left unchanged
; CHECK: @kern = addrspace(3) global float undef, align 4
; CHECK: %llvm.amdgcn.kernel.timestwo.lds.t = type { float }

@kern = addrspace(3) global float undef, align 4

; @func is only used from a non-kernel function so is rewritten
@ -17,6 +17,7 @@
@both = addrspace(3) global float undef, align 4

; CHECK: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t undef, align 4
; CHECK: @llvm.amdgcn.kernel.timestwo.lds = internal addrspace(3) global %llvm.amdgcn.kernel.timestwo.lds.t undef, align 4

; CHECK-LABEL: @get_func()
; CHECK: %0 = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
@ -36,9 +37,9 @@ entry:

; CHECK-LABEL: @timestwo()
; CHECK: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ]
; CHECK: %ld = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @kern to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
; CHECK: %ld = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (%llvm.amdgcn.kernel.timestwo.lds.t addrspace(3)* @llvm.amdgcn.kernel.timestwo.lds to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
; CHECK: %mul = mul i32 %ld, 2
; CHECK: store i32 %mul, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @kern to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
; CHECK: store i32 %mul, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (%llvm.amdgcn.kernel.timestwo.lds.t addrspace(3)* @llvm.amdgcn.kernel.timestwo.lds to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
define amdgpu_kernel void @timestwo() {
%ld = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @both to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @kern to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
%mul = mul i32 %ld, 2
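The @timestwo checks above show the two destinations: @both, also reachable from the non-kernel @get_func, stays in @llvm.amdgcn.module.lds, while the kernel-only @kern moves into @llvm.amdgcn.kernel.timestwo.lds. Below is a rough C++ sketch of that classification, under the assumption that it hinges on whether any use, possibly via constant expressions, reaches a non-kernel function; usedFromNonKernel is a hypothetical name, not the pass's API.

#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Instruction.h"

using namespace llvm;

// Hypothetical sketch: does any use of GV occur inside a function that is
// not an amdgpu_kernel? If so, GV must go in the module-wide struct;
// otherwise it can be lowered into the per-kernel struct of the kernel
// that uses it.
static bool usedFromNonKernel(const GlobalVariable &GV) {
  SmallVector<const User *, 16> Stack(GV.user_begin(), GV.user_end());
  SmallPtrSet<const User *, 16> Visited;
  while (!Stack.empty()) {
    const User *U = Stack.pop_back_val();
    if (!Visited.insert(U).second)
      continue;
    if (const auto *I = dyn_cast<Instruction>(U)) {
      if (I->getFunction()->getCallingConv() != CallingConv::AMDGPU_KERNEL)
        return true;
    } else {
      // Constant expressions (bitcasts, GEPs, ...) are not anchored in a
      // function; walk through them to the instructions that use them.
      Stack.append(U->user_begin(), U->user_end());
    }
  }
  return false;
}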
@ -7,7 +7,7 @@
; CHECK-NOT: llvm.amdgcn.module.lds.t

; var1, var2 would be transformed were they used from a non-kernel function
; CHECK: @var1 = addrspace(3) global i32 undef
; CHECK-NOT: @var1 =
; CHECK: @var2 = addrspace(3) global float undef
@var1 = addrspace(3) global i32 undef
@var2 = addrspace(3) global float undef
@ -36,7 +36,7 @@
@toself = addrspace(3) global float addrspace(3)* bitcast (float addrspace(3)* addrspace(3)* @toself to float addrspace(3)*), align 8

; Use by .used lists doesn't trigger lowering
; CHECK: @llvm.used = appending global [1 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (i32 addrspace(3)* @var1 to i8 addrspace(3)*) to i8*)], section "llvm.metadata"
; CHECK-NOT: @llvm.used =
@llvm.used = appending global [1 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (i32 addrspace(3)* @var1 to i8 addrspace(3)*) to i8*)], section "llvm.metadata"

; CHECK: @llvm.compiler.used = appending global [1 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (float addrspace(3)* @var2 to i8 addrspace(3)*) to i8*)], section "llvm.metadata"
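These @llvm.used / @llvm.compiler.used checks rely on the pass telling list membership apart from genuine uses. A small C++ sketch of gathering that set first, so a later use-scan can ignore it; collectUsedList is an illustrative name, and the casts stripped are the addrspacecast/bitcast wrappers visible in the CHECK lines above.

#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Module.h"

using namespace llvm;

// Hypothetical sketch: collect the globals named by @llvm.used and
// @llvm.compiler.used so that membership there is not mistaken for a
// real use when deciding which LDS variables to lower.
static SmallPtrSet<const GlobalValue *, 8> collectUsedList(const Module &M) {
  SmallPtrSet<const GlobalValue *, 8> Used;
  for (const char *Name : {"llvm.used", "llvm.compiler.used"}) {
    const GlobalVariable *GV = M.getNamedGlobal(Name);
    if (!GV || !GV->hasInitializer())
      continue;
    // Assumes a non-empty array initializer, as in this test.
    const auto *Init = dyn_cast<ConstantArray>(GV->getInitializer());
    if (!Init)
      continue;
    for (const Use &Op : Init->operands())
      if (const auto *G = dyn_cast<GlobalValue>(Op->stripPointerCasts()))
        Used.insert(G);
  }
  return Used;
}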
@ -58,9 +58,8 @@ define void @use_variables() {
ret void
}

; Use by kernel doesn't trigger lowering
; CHECK-LABEL: @kern_use()
; CHECK: %inc = atomicrmw add i32 addrspace(3)* @var1, i32 1 monotonic
; CHECK: %inc = atomicrmw add i32 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.kern_use.lds.t, %llvm.amdgcn.kernel.kern_use.lds.t addrspace(3)* @llvm.amdgcn.kernel.kern_use.lds, i32 0, i32 0), i32 1 monotonic, align 4
define amdgpu_kernel void @kern_use() {
%inc = atomicrmw add i32 addrspace(3)* @var1, i32 1 monotonic
call void @use_variables()
@ -20,7 +20,7 @@ define amdgpu_kernel void @k0() {
; OPT-NEXT: call void @llvm.donothing() [ "ExplicitUse"([[LLVM_AMDGCN_MODULE_LDS_T:%.*]] addrspace(3)* @llvm.amdgcn.module.lds) ]
; OPT-NEXT: [[LDS_SIZE_1_ALIGN_1_BC:%.*]] = bitcast [1 x i8] addrspace(3)* getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T]], [[LLVM_AMDGCN_MODULE_LDS_T]] addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0) to i8 addrspace(3)*
; OPT-NEXT: store i8 1, i8 addrspace(3)* [[LDS_SIZE_1_ALIGN_1_BC]], align 1
; OPT-NEXT: [[LDS_SIZE_16_ALIGN_16_BC:%.*]] = bitcast [16 x i8] addrspace(3)* @lds.size.16.align.16 to i8 addrspace(3)*
; OPT-NEXT: [[LDS_SIZE_16_ALIGN_16_BC:%.*]] = bitcast [16 x i8] addrspace(3)* getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_K0_LDS_T:%.*]], [[LLVM_AMDGCN_KERNEL_K0_LDS_T]] addrspace(3)* @llvm.amdgcn.kernel.k0.lds, i32 0, i32 0) to i8 addrspace(3)*
; OPT-NEXT: store i8 2, i8 addrspace(3)* [[LDS_SIZE_16_ALIGN_16_BC]], align 16
; OPT-NEXT: ret void
;
@ -8,8 +8,8 @@
; IR-LABEL: define amdgpu_kernel void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {
; IR: alloca [10 x i32]
; ASM-LABEL: {{^}}promote_alloca_size_256:
; ASM: .amdgpu_lds global_array0, 30000, 4
; ASM: .amdgpu_lds global_array1, 30000, 4
; ASM: .amdgpu_lds llvm.amdgcn.kernel.promote_alloca_size_256.lds, 60000, 4
; ASM-NOT: .amdgpu_lds

define amdgpu_kernel void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {
entry:
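Worked size check for the new directive: the two arrays previously emitted as separate 30000-byte symbols become consecutive fields of one per-kernel struct. With 4-byte alignment and 30000 divisible by 4, no inter-field padding is needed, so the struct occupies 30000 + 30000 = 60000 bytes, matching the single .amdgpu_lds line above.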
@ -1,4 +1,4 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri --amdhsa-code-object-version=2 -disable-promote-alloca-to-vector < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri --amdhsa-code-object-version=2 -disable-promote-alloca-to-vector -amdgpu-enable-lower-module-lds=0 < %s | FileCheck -check-prefix=GCN %s

; This shows that the LDS size estimate should try to not be
; sensitive to the order of the LDS globals. This should try to
@ -34,11 +34,7 @@ define amdgpu_kernel void @load_shl_base_lds_0(float addrspace(1)* %out, i32 add

; GCN-LABEL: {{^}}load_shl_base_lds_1:
; GCN: v_lshlrev_b32_e32 [[OFS:v[0-9]+]], 2, {{v[0-9]+}}

; TODO: integrate into the ds_read_b32 offset using a 16-bit relocation
; GCN: v_add_{{[iu]}}32_e32 [[PTR:v[0-9]+]], vcc, lds0@abs32@lo, [[OFS]]

; GCN: ds_read_b32 [[RESULT:v[0-9]+]], [[PTR]] offset:8
; GCN: ds_read_b32 [[RESULT:v[0-9]+]], [[OFS]] offset:8
; GCN: v_add_{{[iu]}}32_e32 [[ADDUSE:v[0-9]+]], vcc, 8, v{{[0-9]+}}
; GCN-DAG: buffer_store_dword [[RESULT]]
; GCN-DAG: buffer_store_dword [[ADDUSE]]
@ -72,18 +68,10 @@ define amdgpu_kernel void @load_shl_base_lds_max_offset(i8 addrspace(1)* %out, i
; The two globals are placed adjacent in memory, so the same base
; pointer can be used with an offset into the second one.

; TODO: Recover the optimization of using ds_read2st64_b32 using alignment hints

; GCN-LABEL: {{^}}load_shl_base_lds_2:
; GCN: v_lshlrev_b32_e32 [[OFS:v[0-9]+]], 2, {{v[0-9]+}}
; GCN-DAG: v_add_{{[iu]}}32_e32 [[PTR0:v[0-9]+]], vcc, lds0@abs32@lo, [[OFS]]
; GCN-DAG: v_add_{{[iu]}}32_e32 [[PTR1:v[0-9]+]], vcc, lds1@abs32@lo, [[OFS]]
; GCN-DAG: s_mov_b32 m0, -1

; GCN-DAG: ds_read_b32 {{v[0-9]+}}, [[PTR0]] offset:256
; GCN-DAG: ds_read_b32 {{v[0-9]+}}, [[PTR1]] offset:256
; TODO: ds_read2st64_b32 {{v\[[0-9]+:[0-9]+\]}}, [[PTR]] offset0:1 offset1:9

; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
; GCN: s_mov_b32 m0, -1
; GCN: ds_read2st64_b32 {{v\[[0-9]+:[0-9]+\]}}, [[PTR]] offset0:1 offset1:9
; GCN: s_endpgm
define amdgpu_kernel void @load_shl_base_lds_2(float addrspace(1)* %out) #0 {
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
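A quick offset check on the recovered ds_read2st64_b32: its offsets count in units of 64 dwords, i.e. 256 bytes, so offset0:1 reads base + 256 bytes and offset1:9 reads base + 2304 bytes. Both accesses sit 256 bytes into their array (the old per-pointer checks show offset:256 on [[PTR0]] and [[PTR1]] alike), so the 2304 - 256 = 2048-byte gap must equal @lds0's allocation size; that is, @lds1 is laid out immediately after @lds0. The pairing into one instruction is only legal because the struct lowering guarantees this adjacency.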
@ -10,7 +10,7 @@
; CHECK: machineFunctionInfo:
; CHECK-NEXT: explicitKernArgSize: 128
; CHECK-NEXT: maxKernArgAlign: 64
; CHECK-NEXT: ldsSize: 0
; CHECK-NEXT: ldsSize: 2048
; CHECK-NEXT: dynLDSAlign: 1
; CHECK-NEXT: isEntryFunction: true
; CHECK-NEXT: noSignedZerosFPMath: false
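The ldsSize change from 0 to 2048 is the same lowering seen from the MIR side: presumably the kernel's LDS variables are now gathered into a single struct that is allocated up front, so its 2048 bytes are already recorded in machineFunctionInfo rather than left at zero until allocation.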