[amdgpu] Implement lower function LDS pass
Local variables are allocated at kernel launch. This pass collects global
variables that are used from non-kernel functions, moves them into a new
struct type, and allocates an instance of that type in every kernel. Uses
are then replaced with a constantexpr offset.

Prior to this pass, accesses from a function are compiled to trap. With this
pass, most such accesses are removed before reaching codegen. The trap logic
is left unchanged by this pass. It is still reachable for the cases this pass
misses, notably the extern shared construct from hip and variables marked
constant which survive the optimizer.

This is of interest to the openmp project because the deviceRTL runtime
library uses cuda shared variables from functions that cannot be inlined.
Trunk llvm therefore cannot compile some openmp kernels for amdgpu. In
addition to the unit tests attached, this patch applied to ROCm llvm with
fixed-abi enabled and the function pointer hashing scheme deleted passes the
openmp suite.

This lowering will use more LDS than strictly necessary. It is intended to
be a functionally correct fallback for cases that are difficult to target
from future optimisation passes.

Reviewed By: arsenm

Differential Revision: https://reviews.llvm.org/D94648
This commit is contained in:
parent 2593dc40a4
commit c86d684da8
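In outline, the transform moves each LDS variable that is reachable from a non-kernel function into a field of one struct placed at LDS address zero, and rewrites every use as a constant offset from that address. A minimal before/after sketch (the variable and function names are illustrative, not taken from this patch; field 0 accesses constant-fold to null, later fields to a constant GEP, matching the tests below):

@var0 = addrspace(3) global float undef, align 4
@var1 = addrspace(3) global i32 undef, align 4

define void @f() {
  %ld = load i32, i32 addrspace(3)* @var1, align 4
  store float 0.0, float addrspace(3)* @var0, align 4
  ret void
}

; becomes, after the pass:

%llvm.amdgcn.module.lds.t = type { float, i32 }
@llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t undef, align 4

define void @f() {
  %ld = load i32, i32 addrspace(3)* getelementptr (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* null, i32 0, i32 1), align 4
  store float 0.0, float addrspace(3)* null, align 4
  ret void
}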
@@ -71,6 +71,7 @@ FunctionPass *createAMDGPUMachineCFGStructurizerPass();
FunctionPass *createAMDGPUPropagateAttributesEarlyPass(const TargetMachine *);
ModulePass *createAMDGPUPropagateAttributesLatePass(const TargetMachine *);
FunctionPass *createAMDGPURewriteOutArgumentsPass();
ModulePass *createAMDGPULowerModuleLDSPass();
FunctionPass *createSIModeRegisterPass();

struct AMDGPUSimplifyLibCallsPass : PassInfoMixin<AMDGPUSimplifyLibCallsPass> {
@@ -145,6 +146,13 @@ private:
  TargetMachine &TM;
};

void initializeAMDGPULowerModuleLDSPass(PassRegistry &);
extern char &AMDGPULowerModuleLDSID;

struct AMDGPULowerModuleLDSPass : PassInfoMixin<AMDGPULowerModuleLDSPass> {
  PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
};

void initializeAMDGPURewriteOutArgumentsPass(PassRegistry &);
extern char &AMDGPURewriteOutArgumentsID;
@@ -500,9 +500,10 @@ bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
  const SITargetLowering &TLI = *getTLI<SITargetLowering>();

  const DataLayout &DL = F.getParent()->getDataLayout();

  Info->allocateModuleLDSGlobal(F.getParent());

  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext());

@@ -591,6 +592,7 @@ bool AMDGPUCallLowering::lowerFormalArguments(
  const SIRegisterInfo *TRI = Subtarget.getRegisterInfo();
  const DataLayout &DL = F.getParent()->getDataLayout();

  Info->allocateModuleLDSGlobal(F.getParent());

  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CC, F.isVarArg(), MF, ArgLocs, F.getContext());
lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp (new file, 380 lines)
@@ -0,0 +1,380 @@
//===-- AMDGPULowerModuleLDSPass.cpp ------------------------------*- C++ -*-=//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass eliminates LDS uses from non-kernel functions.
//
// The strategy is to create a new struct with a field for each LDS variable
// and allocate that struct at the same address for every kernel. Uses of the
// original LDS variables are then replaced with compile time offsets from that
// known address. AMDGPUMachineFunction allocates the LDS global.
//
// Local variables with constant annotation or non-undef initializer are passed
// through unchanged for simplification or error diagnostics in later passes.
//
// To reduce the memory overhead, variables that are only used by kernels are
// excluded from this transform. The analysis to determine whether a variable
// is only used by a kernel is cheap and conservative, so this may allocate
// a variable in every kernel when it was not strictly necessary to do so.
//
// A possible future refinement is to specialise the structure per-kernel, so
// that fields can be elided based on more expensive analysis.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/Instructions.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"
#include <algorithm>
#include <vector>

#define DEBUG_TYPE "amdgpu-lower-module-lds"

using namespace llvm;

namespace {

class AMDGPULowerModuleLDS : public ModulePass {

  static bool isKernelCC(Function *Func) {
    return AMDGPU::isModuleEntryFunctionCC(Func->getCallingConv());
  }

  static Align getAlign(DataLayout const &DL, const GlobalVariable *GV) {
    return DL.getValueOrABITypeAlignment(GV->getPointerAlignment(DL),
                                         GV->getValueType());
  }

  static bool
  userRequiresLowering(const SmallPtrSetImpl<GlobalValue *> &UsedList,
                       User *InitialUser) {
    // Any LDS variable can be lowered by moving into the created struct.
    // Each variable so lowered is allocated in every kernel, so variables
    // whose users are all known to be safe to leave in place without the
    // transform are left unchanged.
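    // For example, a direct access from an amdgpu_kernel function, or an
    // appearance only in llvm.used / llvm.compiler.used, does not by itself
    // force lowering; a use from an ordinary function does.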
    SmallPtrSet<User *, 8> Visited;
    SmallVector<User *, 16> Stack;
    Stack.push_back(InitialUser);

    while (!Stack.empty()) {
      User *V = Stack.pop_back_val();
      Visited.insert(V);

      if (auto *G = dyn_cast<GlobalValue>(V->stripPointerCasts())) {
        if (UsedList.contains(G)) {
          continue;
        }
      }

      if (auto *I = dyn_cast<Instruction>(V)) {
        if (isKernelCC(I->getFunction())) {
          continue;
        }
      }

      if (auto *E = dyn_cast<ConstantExpr>(V)) {
        for (Value::user_iterator EU = E->user_begin(); EU != E->user_end();
             ++EU) {
          if (Visited.insert(*EU).second) {
            Stack.push_back(*EU);
          }
        }
        continue;
      }

      // Unknown user, conservatively lower the variable
      return true;
    }

    return false;
  }

  static std::vector<GlobalVariable *>
  findVariablesToLower(Module &M,
                       const SmallPtrSetImpl<GlobalValue *> &UsedList) {
    std::vector<llvm::GlobalVariable *> LocalVars;
    for (auto &GV : M.globals()) {
      if (GV.getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) {
        continue;
      }
      if (!GV.hasInitializer()) {
        // addrspace(3) without initializer implies cuda/hip extern __shared__.
        // The semantics for such a variable appear to be that all extern
        // __shared__ variables alias one another, in which case this transform
        // is not required.
        continue;
      }
      if (!isa<UndefValue>(GV.getInitializer())) {
        // Initializers are unimplemented for local address space.
        // Leave such variables in place for consistent error reporting.
        continue;
      }
      if (GV.isConstant()) {
        // A constant undef variable can't be written to, and any load is
        // undef, so it should be eliminated by the optimizer. It could be
        // dropped by the back end if not. This pass skips over it.
        continue;
      }
      if (std::none_of(GV.user_begin(), GV.user_end(), [&](User *U) {
            return userRequiresLowering(UsedList, U);
          })) {
        continue;
      }
      LocalVars.push_back(&GV);
    }
    return LocalVars;
  }

  static void removeFromUsedList(Module &M, StringRef Name,
                                 SmallPtrSetImpl<Constant *> &ToRemove) {
    GlobalVariable *GV = M.getGlobalVariable(Name);
    if (!GV || ToRemove.empty()) {
      return;
    }

    SmallVector<Constant *, 16> Init;
    auto *CA = cast<ConstantArray>(GV->getInitializer());
    for (auto &Op : CA->operands()) {
      // ModuleUtils::appendToUsed only inserts Constants
      Constant *C = cast<Constant>(Op);
      if (!ToRemove.contains(C->stripPointerCasts())) {
        Init.push_back(C);
      }
    }

    if (Init.size() == CA->getNumOperands()) {
      return; // none to remove
    }

    GV->eraseFromParent();

    if (!Init.empty()) {
      ArrayType *ATy =
          ArrayType::get(Type::getInt8PtrTy(M.getContext()), Init.size());
      GV =
          new llvm::GlobalVariable(M, ATy, false, GlobalValue::AppendingLinkage,
                                   ConstantArray::get(ATy, Init), Name);
      GV->setSection("llvm.metadata");
    }
  }

  static void
  removeFromUsedLists(Module &M,
                      const std::vector<GlobalVariable *> &LocalVars) {
    SmallPtrSet<Constant *, 32> LocalVarsSet;
    for (size_t I = 0; I < LocalVars.size(); I++) {
      if (Constant *C = dyn_cast<Constant>(LocalVars[I]->stripPointerCasts())) {
        LocalVarsSet.insert(C);
      }
    }
    removeFromUsedList(M, "llvm.used", LocalVarsSet);
    removeFromUsedList(M, "llvm.compiler.used", LocalVarsSet);
  }

  static void markUsedByKernel(IRBuilder<> &Builder, Function *Func,
                               GlobalVariable *SGV) {
    // The llvm.amdgcn.module.lds instance is implicitly used by all kernels
    // that might call a function which accesses a field within it. This is
    // presently approximated to 'all kernels' if there are any such functions
    // in the module. This implicit use is reified as an explicit use here so
    // that later passes, specifically PromoteAlloca, account for the required
    // memory without any knowledge of this transform.

    // An operand bundle on llvm.donothing works because the call instruction
    // survives until after the last pass that needs to account for LDS. It is
    // better than inline asm as the latter survives until the end of codegen. A
    // totally robust solution would be a function with the same semantics as
    // llvm.donothing that takes a pointer to the instance and is lowered to a
    // no-op after LDS is allocated, but that is not presently necessary.
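    // The resulting marker, as checked by the tests in this patch, looks like:
    //   call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t
    //       addrspace(3)* @llvm.amdgcn.module.lds) ]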

    LLVMContext &Ctx = Func->getContext();

    Builder.SetInsertPoint(Func->getEntryBlock().getFirstNonPHI());

    FunctionType *FTy = FunctionType::get(Type::getVoidTy(Ctx), {});

    Function *Decl =
        Intrinsic::getDeclaration(Func->getParent(), Intrinsic::donothing, {});

    Value *UseInstance[1] = {Builder.CreateInBoundsGEP(
        SGV->getValueType(), SGV, ConstantInt::get(Type::getInt32Ty(Ctx), 0))};

    Builder.CreateCall(FTy, Decl, {},
                       {OperandBundleDefT<Value *>("ExplicitUse", UseInstance)},
                       "");
  }

  static SmallPtrSet<GlobalValue *, 32> getUsedList(Module &M) {
    SmallPtrSet<GlobalValue *, 32> UsedList;

    SmallVector<GlobalValue *, 32> TmpVec;
    collectUsedGlobalVariables(M, TmpVec, true);
    UsedList.insert(TmpVec.begin(), TmpVec.end());

    TmpVec.clear();
    collectUsedGlobalVariables(M, TmpVec, false);
    UsedList.insert(TmpVec.begin(), TmpVec.end());

    return UsedList;
  }

public:
  static char ID;

  AMDGPULowerModuleLDS() : ModulePass(ID) {
    initializeAMDGPULowerModuleLDSPass(*PassRegistry::getPassRegistry());
  }

  bool runOnModule(Module &M) override {
    LLVMContext &Ctx = M.getContext();
    const DataLayout &DL = M.getDataLayout();
    SmallPtrSet<GlobalValue *, 32> UsedList = getUsedList(M);

    // Find variables to move into new struct instance
    std::vector<GlobalVariable *> FoundLocalVars =
        findVariablesToLower(M, UsedList);

    if (FoundLocalVars.empty()) {
      // No variables to rewrite, no changes made.
      return false;
    }

    // Sort by alignment, descending, to minimise padding.
    // On ties, sort by size, descending, then by name, lexicographical.
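    // For example, a double (align 8) sorts before a float (align 4); two
    // floats that tie on alignment and size fall back to name order.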
    llvm::stable_sort(
        FoundLocalVars,
        [&](const GlobalVariable *LHS, const GlobalVariable *RHS) -> bool {
          Align ALHS = getAlign(DL, LHS);
          Align ARHS = getAlign(DL, RHS);
          if (ALHS != ARHS) {
            return ALHS > ARHS;
          }

          TypeSize SLHS = DL.getTypeAllocSize(LHS->getValueType());
          TypeSize SRHS = DL.getTypeAllocSize(RHS->getValueType());
          if (SLHS != SRHS) {
            return SLHS > SRHS;
          }

          // By variable name on tie for predictable order in test cases.
          return LHS->getName() < RHS->getName();
        });

    std::vector<GlobalVariable *> LocalVars;
    LocalVars.reserve(FoundLocalVars.size()); // will be at least this large
    {
      // This usually won't need to insert any padding, perhaps avoid the alloc
      uint64_t CurrentOffset = 0;
      for (size_t I = 0; I < FoundLocalVars.size(); I++) {
        GlobalVariable *FGV = FoundLocalVars[I];
        Align DataAlign = getAlign(DL, FGV);

        uint64_t DataAlignV = DataAlign.value();
        if (uint64_t Rem = CurrentOffset % DataAlignV) {
          uint64_t Padding = DataAlignV - Rem;

          // Append an array of padding bytes to meet alignment requested
          // Note (o + (a - (o % a)) ) % a == 0
          //      (offset + Padding ) % align == 0
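          // e.g. CurrentOffset == 4 with DataAlignV == 8 gives Rem == 4 and
          // Padding == 4, so the next field starts at offset 8.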

          Type *ATy = ArrayType::get(Type::getInt8Ty(Ctx), Padding);
          LocalVars.push_back(new GlobalVariable(
              M, ATy, false, GlobalValue::InternalLinkage, UndefValue::get(ATy),
              "", nullptr, GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS,
              false));
          CurrentOffset += Padding;
        }

        LocalVars.push_back(FGV);
        CurrentOffset += DL.getTypeAllocSize(FGV->getValueType());
      }
    }

    std::vector<Type *> LocalVarTypes;
    LocalVarTypes.reserve(LocalVars.size());
    std::transform(
        LocalVars.cbegin(), LocalVars.cend(), std::back_inserter(LocalVarTypes),
        [](const GlobalVariable *V) -> Type * { return V->getValueType(); });

    StructType *LDSTy = StructType::create(
        Ctx, LocalVarTypes, llvm::StringRef("llvm.amdgcn.module.lds.t"));

    Align MaxAlign = getAlign(DL, LocalVars[0]); // was sorted on alignment
    Constant *InstanceAddress = Constant::getIntegerValue(
        PointerType::get(LDSTy, AMDGPUAS::LOCAL_ADDRESS), APInt(32, 0));

    GlobalVariable *SGV = new GlobalVariable(
        M, LDSTy, false, GlobalValue::InternalLinkage, UndefValue::get(LDSTy),
        "llvm.amdgcn.module.lds", nullptr, GlobalValue::NotThreadLocal,
        AMDGPUAS::LOCAL_ADDRESS, false);
    SGV->setAlignment(MaxAlign);
    appendToCompilerUsed(
        M, {static_cast<GlobalValue *>(
               ConstantExpr::getPointerBitCastOrAddrSpaceCast(
                   cast<Constant>(SGV), Type::getInt8PtrTy(Ctx)))});

    // The verifier rejects used lists containing an inttoptr of a constant
    // so remove the variables from these lists before replaceAllUsesWith
    removeFromUsedLists(M, LocalVars);

    // Replace uses of the ith variable with a constantexpr to the ith field of
    // the instance that will be allocated by AMDGPUMachineFunction
    Type *I32 = Type::getInt32Ty(Ctx);
    for (size_t I = 0; I < LocalVars.size(); I++) {
      GlobalVariable *GV = LocalVars[I];
      Constant *GEPIdx[] = {ConstantInt::get(I32, 0), ConstantInt::get(I32, I)};
      GV->replaceAllUsesWith(
          ConstantExpr::getGetElementPtr(LDSTy, InstanceAddress, GEPIdx));
      GV->eraseFromParent();
    }

    // Mark each kernel with a call that reads the address of the allocated
    // structure. This is not necessary for lowering. It lets other passes,
    // specifically PromoteAlloca, accurately calculate how much LDS will be
    // used by the kernel after lowering.
    {
      IRBuilder<> Builder(Ctx);
      SmallPtrSet<Function *, 32> Kernels;
      for (auto &I : M.functions()) {
        Function *Func = &I;
        if (isKernelCC(Func) && !Kernels.contains(Func)) {
          markUsedByKernel(Builder, Func, SGV);
          Kernels.insert(Func);
        }
      }
    }
    return true;
  }
};

} // namespace

char AMDGPULowerModuleLDS::ID = 0;

char &llvm::AMDGPULowerModuleLDSID = AMDGPULowerModuleLDS::ID;

INITIALIZE_PASS(AMDGPULowerModuleLDS, DEBUG_TYPE,
                "Lower uses of LDS variables from non-kernel functions", false,
                false)

ModulePass *llvm::createAMDGPULowerModuleLDSPass() {
  return new AMDGPULowerModuleLDS();
}

PreservedAnalyses AMDGPULowerModuleLDSPass::run(Module &M,
                                                ModuleAnalysisManager &) {
  return AMDGPULowerModuleLDS().runOnModule(M) ? PreservedAnalyses::none()
                                               : PreservedAnalyses::all();
}
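For reference, the pass is registered with both pass managers, so it can be exercised in isolation with opt, exactly as the RUN lines in the new tests below do (the input file name here is illustrative):

opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds module.ll
opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds module.ll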
@@ -64,6 +64,18 @@ unsigned AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL,
  return Offset;
}

void AMDGPUMachineFunction::allocateModuleLDSGlobal(const Module *M) {
  if (isModuleEntryFunction()) {
    GlobalVariable *GV = M->getGlobalVariable("llvm.amdgcn.module.lds");
    if (GV) {
      unsigned Offset = allocateLDSGlobal(M->getDataLayout(), *GV);
      (void)Offset;
      assert(Offset == 0 &&
             "Module LDS expected to be allocated before other LDS");
    }
  }
}

void AMDGPUMachineFunction::setDynLDSAlign(const DataLayout &DL,
                                           const GlobalVariable &GV) {
  assert(DL.getTypeAllocSize(GV.getValueType()).isZero());
@@ -94,6 +94,7 @@ public:
  }

  unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV);
  void allocateModuleLDSGlobal(const Module *M);

  Align getDynLDSAlign() const { return DynLDSAlign; }
@@ -126,8 +126,13 @@ public:
char AMDGPUPromoteAlloca::ID = 0;
char AMDGPUPromoteAllocaToVector::ID = 0;

INITIALIZE_PASS(AMDGPUPromoteAlloca, DEBUG_TYPE,
                "AMDGPU promote alloca to vector or LDS", false, false)
INITIALIZE_PASS_BEGIN(AMDGPUPromoteAlloca, DEBUG_TYPE,
                      "AMDGPU promote alloca to vector or LDS", false, false)
// Move LDS uses from functions to kernels before promote alloca for accurate
// estimation of LDS available
INITIALIZE_PASS_DEPENDENCY(AMDGPULowerModuleLDS)
INITIALIZE_PASS_END(AMDGPUPromoteAlloca, DEBUG_TYPE,
                    "AMDGPU promote alloca to vector or LDS", false, false)

INITIALIZE_PASS(AMDGPUPromoteAllocaToVector, DEBUG_TYPE "-to-vector",
                "AMDGPU promote alloca to vector", false, false)
@@ -193,6 +193,11 @@ static cl::opt<bool> EnableStructurizerWorkarounds(
    cl::desc("Enable workarounds for the StructurizeCFG pass"), cl::init(true),
    cl::Hidden);

static cl::opt<bool>
    DisableLowerModuleLDS("amdgpu-disable-lower-module-lds", cl::Hidden,
                          cl::desc("Disable lower module lds pass"),
                          cl::init(false));

extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
  // Register the target
  RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());

@@ -235,6 +240,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
  initializeAMDGPULateCodeGenPreparePass(*PR);
  initializeAMDGPUPropagateAttributesEarlyPass(*PR);
  initializeAMDGPUPropagateAttributesLatePass(*PR);
  initializeAMDGPULowerModuleLDSPass(*PR);
  initializeAMDGPURewriteOutArgumentsPass(*PR);
  initializeAMDGPUUnifyMetadataPass(*PR);
  initializeSIAnnotateControlFlowPass(*PR);

@@ -506,6 +512,10 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB,
          PM.addPass(AMDGPUAlwaysInlinePass());
          return true;
        }
        if (PassName == "amdgpu-lower-module-lds") {
          PM.addPass(AMDGPULowerModuleLDSPass());
          return true;
        }
        return false;
      });
  PB.registerPipelineParsingCallback(

@@ -535,7 +545,6 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB,
          PM.addPass(AMDGPUPropagateAttributesEarlyPass(*this));
          return true;
        }

        return false;
      });

@@ -884,6 +893,10 @@ void AMDGPUPassConfig::addIRPasses() {
  // Replace OpenCL enqueued block function pointers with global variables.
  addPass(createAMDGPUOpenCLEnqueuedBlockLoweringPass());

  // Can increase LDS used by kernel so runs before PromoteAlloca
  if (!DisableLowerModuleLDS)
    addPass(createAMDGPULowerModuleLDSPass());

  if (TM.getOptLevel() > CodeGenOpt::None) {
    addPass(createInferAddressSpacesPass());
    addPass(createAMDGPUPromoteAlloca());
@@ -67,6 +67,7 @@ add_llvm_target(AMDGPUCodeGen
  AMDGPULowerIntrinsics.cpp
  AMDGPULowerKernelArguments.cpp
  AMDGPULowerKernelAttributes.cpp
  AMDGPULowerModuleLDSPass.cpp
  AMDGPUMachineCFGStructurizer.cpp
  AMDGPUMachineFunction.cpp
  AMDGPUMachineModuleInfo.cpp
@@ -2263,6 +2263,8 @@ SDValue SITargetLowering::LowerFormalArguments(
    return DAG.getEntryNode();
  }

  Info->allocateModuleLDSGlobal(Fn.getParent());

  SmallVector<ISD::InputArg, 16> Splits;
  SmallVector<CCValAssign, 16> ArgLocs;
  BitVector Skipped(Ins.size());
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -o - %s 2> %t | FileCheck --check-prefix=GFX8 %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -amdgpu-disable-lower-module-lds=true -o - %s 2> %t | FileCheck --check-prefix=GFX8 %s
; RUN: FileCheck -check-prefix=ERR %s < %t

; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s 2> %t | FileCheck --check-prefix=GFX9 %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-disable-lower-module-lds=true -o - %s 2> %t | FileCheck --check-prefix=GFX9 %s
; RUN: FileCheck -check-prefix=ERR %s < %t

@lds = internal addrspace(3) global float undef, align 4
@@ -1,4 +1,4 @@
; RUN: not --crash llc -march=amdgcn -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=ERROR %s
; RUN: not --crash llc -march=amdgcn -verify-machineinstrs -amdgpu-disable-lower-module-lds=true < %s 2>&1 | FileCheck -check-prefix=ERROR %s

; ERROR: LLVM ERROR: Unsupported expression in static initializer: addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*)
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -o - %s 2> %t | FileCheck -check-prefixes=GCN,GFX8 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -o - -amdgpu-disable-lower-module-lds=true %s 2> %t | FileCheck -check-prefixes=GCN,GFX8 %s
; RUN: FileCheck -check-prefix=ERR %s < %t

; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s 2> %t | FileCheck -check-prefixes=GCN,GFX9 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - -amdgpu-disable-lower-module-lds=true %s 2> %t | FileCheck -check-prefixes=GCN,GFX9 %s
; RUN: FileCheck -check-prefix=ERR %s < %t

@lds = internal addrspace(3) global float undef, align 4
test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll (new file, 47 lines)
@@ -0,0 +1,47 @@
; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s | FileCheck %s
; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck %s

; CHECK: %llvm.amdgcn.module.lds.t = type { float, float }

@func = addrspace(3) global float undef, align 4

; @kern is only used from a kernel so it is left unchanged
; CHECK: @kern = addrspace(3) global float undef, align 4
@kern = addrspace(3) global float undef, align 4

; @func is only used from a non-kernel function so is rewritten
; CHECK-NOT: @func
; @both is used from a non-kernel function so is rewritten
; CHECK-NOT: @both
; sorted both < func, so @both at null and @func at 4
@both = addrspace(3) global float undef, align 4

; CHECK: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t undef, align 4

; CHECK-LABEL: @get_func()
; CHECK: %0 = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* getelementptr (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* null, i32 0, i32 1) to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* getelementptr (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* null, i32 0, i32 1) to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
define i32 @get_func() local_unnamed_addr #0 {
entry:
  %0 = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @func to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @func to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
  ret i32 %0
}

; CHECK-LABEL: @set_func(i32 %x)
; CHECK: store i32 %x, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* null to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* null to i32*) to i64)) to i32*), align 4
define void @set_func(i32 %x) local_unnamed_addr #1 {
entry:
  store i32 %x, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @both to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @both to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
  ret void
}

; CHECK-LABEL: @timestwo()
; CHECK: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ]
; CHECK: %ld = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* null to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @kern to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
; CHECK: %mul = mul i32 %ld, 2
; CHECK: store i32 %mul, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @kern to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* null to i32*) to i64)) to i32*), align 4
define amdgpu_kernel void @timestwo() {
  %ld = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @both to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @kern to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
  %mul = mul i32 %ld, 2
  store i32 %mul, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @kern to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @both to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
  ret void
}
test/CodeGen/AMDGPU/lower-module-lds-inactive.ll (new file, 68 lines)
@@ -0,0 +1,68 @@
; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s | FileCheck %s
; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck %s

; Variables that are not lowered by this pass are left unchanged
; CHECK-NOT: asm
; CHECK-NOT: llvm.amdgcn.module.lds
; CHECK-NOT: llvm.amdgcn.module.lds.t

; var1, var2 would be transformed were they used from a non-kernel function
; CHECK: @var1 = addrspace(3) global i32 undef
; CHECK: @var2 = addrspace(3) global float undef
@var1 = addrspace(3) global i32 undef
@var2 = addrspace(3) global float undef

; constant variables are left to the optimizer / error diagnostics
; CHECK: @const_undef = addrspace(3) constant i32 undef
; CHECK: @const_with_init = addrspace(3) constant i64 8
@const_undef = addrspace(3) constant i32 undef
@const_with_init = addrspace(3) constant i64 8

; External and constant are both left to the optimizer / error diagnostics
; CHECK: @extern = external addrspace(3) global i32
@extern = external addrspace(3) global i32

; Use of an addrspace(3) variable with an initializer is skipped,
; so as to preserve the unimplemented error from llc
; CHECK: @with_init = addrspace(3) global i64 0
@with_init = addrspace(3) global i64 0

; Only local addrspace variables are transformed
; CHECK: @addr4 = addrspace(4) global i64 undef
@addr4 = addrspace(4) global i64 undef

; Assign to self is treated as any other initializer, i.e. ignored by this pass
; CHECK: @toself = addrspace(3) global float addrspace(3)* bitcast (float addrspace(3)* addrspace(3)* @toself to float addrspace(3)*), align 8
@toself = addrspace(3) global float addrspace(3)* bitcast (float addrspace(3)* addrspace(3)* @toself to float addrspace(3)*), align 8

; Use by .used lists doesn't trigger lowering
; CHECK: @llvm.used = appending global [1 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (i32 addrspace(3)* @var1 to i8 addrspace(3)*) to i8*)], section "llvm.metadata"
@llvm.used = appending global [1 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (i32 addrspace(3)* @var1 to i8 addrspace(3)*) to i8*)], section "llvm.metadata"

; CHECK: @llvm.compiler.used = appending global [1 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (float addrspace(3)* @var2 to i8 addrspace(3)*) to i8*)], section "llvm.metadata"
@llvm.compiler.used = appending global [1 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (float addrspace(3)* @var2 to i8 addrspace(3)*) to i8*)], section "llvm.metadata"

; Access from a function would cause lowering for non-excluded cases
; CHECK-LABEL: @use_variables()
; CHECK: %c0 = load i32, i32 addrspace(3)* @const_undef, align 4
; CHECK: %c1 = load i64, i64 addrspace(3)* @const_with_init, align 4
; CHECK: %v0 = atomicrmw add i64 addrspace(3)* @with_init, i64 1 seq_cst
; CHECK: %v1 = cmpxchg i32 addrspace(3)* @extern, i32 4, i32 %c0 acq_rel monotonic
; CHECK: %v2 = atomicrmw add i64 addrspace(4)* @addr4, i64 %c1 monotonic
define void @use_variables() {
  %c0 = load i32, i32 addrspace(3)* @const_undef, align 4
  %c1 = load i64, i64 addrspace(3)* @const_with_init, align 4
  %v0 = atomicrmw add i64 addrspace(3)* @with_init, i64 1 seq_cst
  %v1 = cmpxchg i32 addrspace(3)* @extern, i32 4, i32 %c0 acq_rel monotonic
  %v2 = atomicrmw add i64 addrspace(4)* @addr4, i64 %c1 monotonic
  ret void
}

; Use by kernel doesn't trigger lowering
; CHECK-LABEL: @kern_use()
; CHECK: %inc = atomicrmw add i32 addrspace(3)* @var1, i32 1 monotonic
define amdgpu_kernel void @kern_use() {
  %inc = atomicrmw add i32 addrspace(3)* @var1, i32 1 monotonic
  call void @use_variables()
  ret void
}
test/CodeGen/AMDGPU/lower-module-lds-indirect.ll (new file, 39 lines)
@@ -0,0 +1,39 @@
; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s | FileCheck %s
; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck %s

; CHECK: %llvm.amdgcn.module.lds.t = type { double, float }

; CHECK: @function_indirect = addrspace(1) global float* addrspacecast (float addrspace(3)* getelementptr (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* null, i32 0, i32 1) to float*), align 8

; CHECK: @kernel_indirect = addrspace(1) global double* addrspacecast (double addrspace(3)* null to double*), align 8

; CHECK: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t undef, align 8

@function_target = addrspace(3) global float undef, align 4
@function_indirect = addrspace(1) global float* addrspacecast (float addrspace(3)* @function_target to float*), align 8

@kernel_target = addrspace(3) global double undef, align 8
@kernel_indirect = addrspace(1) global double* addrspacecast (double addrspace(3)* @kernel_target to double*), align 8

; CHECK-LABEL: @function(float %x)
; CHECK: %0 = load float*, float* addrspace(1)* @function_indirect, align 8
define void @function(float %x) local_unnamed_addr #5 {
entry:
  %0 = load float*, float* addrspace(1)* @function_indirect, align 8
  store float %x, float* %0, align 4
  ret void
}

; CHECK-LABEL: @kernel(double %x)
; CHECK: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ]
; CHECK: %0 = load double*, double* addrspace(1)* @kernel_indirect, align 8
define amdgpu_kernel void @kernel(double %x) local_unnamed_addr #5 {
entry:
  %0 = load double*, double* addrspace(1)* @kernel_indirect, align 8
  store double %x, double* %0, align 8
  ret void
}
test/CodeGen/AMDGPU/lower-module-lds-used-list.ll (new file, 37 lines)
@@ -0,0 +1,37 @@
; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s | FileCheck %s
; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck %s

; Check the new struct is added to compiler.used and that the replaced variable is removed

; CHECK: %llvm.amdgcn.module.lds.t = type { float }
; CHECK: @ignored = addrspace(1) global i64 0
; CHECK: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t undef, align 8

; CHECK-NOT: @tolower

@tolower = addrspace(3) global float undef, align 8

; A variable that is unchanged by the pass
@ignored = addrspace(1) global i64 0

; Start with one value to replace and one to ignore in the .use list

; @ignored still in list, @tolower removed
; CHECK: @llvm.used = appending global [1 x i8*] [i8* addrspacecast (i8 addrspace(1)* bitcast (i64 addrspace(1)* @ignored to i8 addrspace(1)*) to i8*)], section "llvm.metadata"
@llvm.used = appending global [2 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (float addrspace(3)* @tolower to i8 addrspace(3)*) to i8*), i8* addrspacecast (i8 addrspace(1)* bitcast (i64 addrspace(1)* @ignored to i8 addrspace(1)*) to i8*)], section "llvm.metadata"

; @ignored still in list, @tolower removed, llvm.amdgcn.module.lds appended
; CHECK: @llvm.compiler.used = appending global [2 x i8*] [i8* addrspacecast (i8 addrspace(1)* bitcast (i64 addrspace(1)* @ignored to i8 addrspace(1)*) to i8*), i8* addrspacecast (i8 addrspace(3)* bitcast (%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds to i8 addrspace(3)*) to i8*)], section "llvm.metadata"
@llvm.compiler.used = appending global [2 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (float addrspace(3)* @tolower to i8 addrspace(3)*) to i8*), i8* addrspacecast (i8 addrspace(1)* bitcast (i64 addrspace(1)* @ignored to i8 addrspace(1)*) to i8*)], section "llvm.metadata"

; CHECK-LABEL: @func()
; CHECK: %dec = atomicrmw fsub float addrspace(3)* null, float 1.0
define void @func() {
  %dec = atomicrmw fsub float addrspace(3)* @tolower, float 1.0 monotonic
  %unused0 = atomicrmw add i64 addrspace(1)* @ignored, i64 1 monotonic
  ret void
}
test/CodeGen/AMDGPU/lower-module-lds.ll (new file, 56 lines)
@@ -0,0 +1,56 @@
; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s | FileCheck %s
; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck %s

; Padding to meet alignment, so references to @var1 are replaced with gep ptr, 0, 2
; No i64 as addrspace(3) types with initializers are ignored. Likewise no addrspace(4).
; CHECK: %llvm.amdgcn.module.lds.t = type { float, [4 x i8], i32 }

; Variables removed by pass
; CHECK-NOT: @var0
; CHECK-NOT: @var1

@var0 = addrspace(3) global float undef, align 8
@var1 = addrspace(3) global i32 undef, align 8

@ptr = addrspace(1) global i32 addrspace(3)* @var1, align 4

; A variable that is unchanged by the pass
; CHECK: @with_init = addrspace(3) global i64 0
@with_init = addrspace(3) global i64 0

; Instance of new type, aligned to max of element alignment
; CHECK: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t undef, align 8

; Use in func rewritten to access struct at address zero, which prints as null
; CHECK-LABEL: @func()
; CHECK: %dec = atomicrmw fsub float addrspace(3)* null, float 1.0
; CHECK: %val0 = load i32, i32 addrspace(3)* getelementptr (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* null, i32 0, i32 2), align 4
; CHECK: %val1 = add i32 %val0, 4
; CHECK: store i32 %val1, i32 addrspace(3)* getelementptr (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* null, i32 0, i32 2), align 4
; CHECK: %unused0 = atomicrmw add i64 addrspace(3)* @with_init, i64 1 monotonic
define void @func() {
  %dec = atomicrmw fsub float addrspace(3)* @var0, float 1.0 monotonic
  %val0 = load i32, i32 addrspace(3)* @var1, align 4
  %val1 = add i32 %val0, 4
  store i32 %val1, i32 addrspace(3)* @var1, align 4
  %unused0 = atomicrmw add i64 addrspace(3)* @with_init, i64 1 monotonic
  ret void
}

; This kernel calls a function that uses LDS so needs the block
; CHECK-LABEL: @kern_call()
; CHECK: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ]
; CHECK: call void @func()
; CHECK: %dec = atomicrmw fsub float addrspace(3)* null, float 2.0
define amdgpu_kernel void @kern_call() {
  call void @func()
  %dec = atomicrmw fsub float addrspace(3)* @var0, float 2.0 monotonic
  ret void
}

; This kernel does not need to alloc the LDS block as it makes no calls
; CHECK-LABEL: @kern_empty()
; CHECK: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ]
define spir_kernel void @kern_empty() {
  ret void
}
@@ -1,5 +1,5 @@
; RUN: opt -S -disable-promote-alloca-to-vector -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-promote-alloca < %s | FileCheck -check-prefix=IR %s
; RUN: llc -disable-promote-alloca-to-vector -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=ASM %s
; RUN: llc -disable-promote-alloca-to-vector -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-disable-lower-module-lds=true < %s | FileCheck -check-prefix=ASM %s

target datalayout = "A5"