From 756687dc6c36faadc4ce5ee4114db598bc44aaa1 Mon Sep 17 00:00:00 2001 From: hsmahesha Date: Tue, 8 Jun 2021 03:58:13 +0530 Subject: [PATCH] [AMDGPU] Introduce command line switch to control super aligning of LDS. Reviewed By: rampitec Differential Revision: https://reviews.llvm.org/D103817 --- .../AMDGPU/AMDGPULowerModuleLDSPass.cpp | 50 ++++++++++--------- .../AMDGPU/lower-kernel-lds-super-align.ll | 23 +++++++++ 2 files changed, 49 insertions(+), 24 deletions(-) create mode 100644 test/CodeGen/AMDGPU/lower-kernel-lds-super-align.ll diff --git a/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp index 8e3895ee141..8e3ce775b58 100644 --- a/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp +++ b/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp @@ -37,6 +37,7 @@ #include "llvm/IR/Instructions.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Transforms/Utils/ModuleUtils.h" #include <algorithm> @@ -46,6 +47,11 @@ using namespace llvm; +static cl::opt<bool> SuperAlignLDSGlobals( + "amdgpu-super-align-lds-globals", + cl::desc("Increase alignment of LDS if it is not on align boundary"), + cl::init(true), cl::Hidden); + namespace { class AMDGPULowerModuleLDS : public ModulePass { @@ -174,31 +180,27 @@ private: // Increase the alignment of LDS globals if necessary to maximise the chance // that we can use aligned LDS instructions to access them. 
- for (auto *GV : FoundLocalVars) { - unsigned AlignValue = GV->getAlignment(); - if (AlignValue == 0) { - GV->setAlignment(DL.getABITypeAlign(GV->getValueType())); - continue; + if (SuperAlignLDSGlobals) { + for (auto *GV : FoundLocalVars) { + Align Alignment = AMDGPU::getAlign(DL, GV); + TypeSize GVSize = DL.getTypeAllocSize(GV->getValueType()); + + if (GVSize > 8) { + // We might want to use a b96 or b128 load/store + Alignment = std::max(Alignment, Align(16)); + } else if (GVSize > 4) { + // We might want to use a b64 load/store + Alignment = std::max(Alignment, Align(8)); + } else if (GVSize > 2) { + // We might want to use a b32 load/store + Alignment = std::max(Alignment, Align(4)); + } else if (GVSize > 1) { + // We might want to use a b16 load/store + Alignment = std::max(Alignment, Align(2)); + } + + GV->setAlignment(Alignment); } - - Align Alignment(AlignValue); - TypeSize GVSize = DL.getTypeAllocSize(GV->getValueType()); - - if (GVSize > 8) { - // We might want to use a b96 or b128 load/store - Alignment = std::max(Alignment, Align(16)); - } else if (GVSize > 4) { - // We might want to use a b64 load/store - Alignment = std::max(Alignment, Align(8)); - } else if (GVSize > 2) { - // We might want to use a b32 load/store - Alignment = std::max(Alignment, Align(4)); - } else if (GVSize > 1) { - // We might want to use a b16 load/store - Alignment = std::max(Alignment, Align(2)); - } - - GV->setAlignment(Alignment); } // Sort by alignment, descending, to minimise padding. 
diff --git a/test/CodeGen/AMDGPU/lower-kernel-lds-super-align.ll b/test/CodeGen/AMDGPU/lower-kernel-lds-super-align.ll new file mode 100644 index 00000000000..206f51f5be3 --- /dev/null +++ b/test/CodeGen/AMDGPU/lower-kernel-lds-super-align.ll @@ -0,0 +1,23 @@ +; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s | FileCheck --check-prefix=SUPER-ALIGN_ON %s +; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck --check-prefix=SUPER-ALIGN_ON %s +; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds --amdgpu-super-align-lds-globals=false < %s | FileCheck --check-prefix=SUPER-ALIGN_OFF %s +; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds --amdgpu-super-align-lds-globals=false < %s | FileCheck --check-prefix=SUPER-ALIGN_OFF %s + +; CHECK: %llvm.amdgcn.kernel.k4.lds.t = type { [32 x i8] } + +; CHECK-NOT: @lds.1 +@lds.1 = internal unnamed_addr addrspace(3) global [32 x i8] undef, align 1 + +; SUPER-ALIGN_ON: @llvm.amdgcn.kernel.k4.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k4.lds.t undef, align 16 +; SUPER-ALIGN_OFF: @llvm.amdgcn.kernel.k4.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k4.lds.t undef, align 1 + +; CHECK-LABEL: @k4 +; CHECK: %ptr = getelementptr inbounds i8, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k4.lds.t, %llvm.amdgcn.kernel.k4.lds.t addrspace(3)* @llvm.amdgcn.kernel.k4.lds, i32 0, i32 +; CHECK: 0, i32 0) to i8*), i64 %x +; CHECK: store i8 1, i8* %ptr, align 1 +; CHECK: ret void +define amdgpu_kernel void @k4(i64 %x) { + %ptr = getelementptr inbounds i8, i8* addrspacecast ([32 x i8] addrspace(3)* @lds.1 to i8*), i64 %x + store i8 1, i8 addrspace(0)* %ptr, align 1 + ret void +}