[AMDGPU] Lower kernel LDS into a sorted structure

Differential Revision: https://reviews.llvm.org/D102954

parent 831c97ae39
commit b8b00b1711
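In brief (a condensed sketch drawn from the lower-kernel-lds.ll test added below): LDS globals used from a kernel are replaced by members of a single per-kernel struct instance, sorted by decreasing alignment so that padding is minimized.

; Before the pass: independent LDS globals, each used from kernel @k0
@lds.size.4.align.4 = internal unnamed_addr addrspace(3) global [4 x i8] undef, align 4
@lds.size.16.align.16 = internal unnamed_addr addrspace(3) global [16 x i8] undef, align 16

; After the pass: one struct per kernel, members sorted by decreasing alignment
%llvm.amdgcn.kernel.k0.lds.t = type { [16 x i8], [4 x i8] }
@llvm.amdgcn.kernel.k0.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k0.lds.t undef, align 16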
@@ -73,6 +73,10 @@ class AMDGPULowerModuleLDS : public ModulePass {
      GV->eraseFromParent();

    for (Constant *C : ToRemove) {
      C->removeDeadConstantUsers();
    }

    if (!Init.empty()) {
      ArrayType *ATy =
          ArrayType::get(Type::getInt8PtrTy(M.getContext()), Init.size());
@@ -129,6 +133,9 @@ class AMDGPULowerModuleLDS : public ModulePass {
        "");
  }

private:
  SmallPtrSet<GlobalValue *, 32> UsedList;

public:
  static char ID;

@@ -137,13 +144,28 @@ public:
  }

  bool runOnModule(Module &M) override {
    UsedList = AMDGPU::getUsedList(M);

    bool Changed = processUsedLDS(M);

    for (Function &F : M.functions()) {
      if (!AMDGPU::isKernelCC(&F))
        continue;
      Changed |= processUsedLDS(M, &F);
    }

    UsedList.clear();
    return Changed;
  }

private:
  bool processUsedLDS(Module &M, Function *F = nullptr) {
    LLVMContext &Ctx = M.getContext();
    const DataLayout &DL = M.getDataLayout();
    SmallPtrSet<GlobalValue *, 32> UsedList = AMDGPU::getUsedList(M);

    // Find variables to move into new struct instance
    std::vector<GlobalVariable *> FoundLocalVars =
        AMDGPU::findVariablesToLower(M, UsedList);
        AMDGPU::findVariablesToLower(M, UsedList, F);

    if (FoundLocalVars.empty()) {
      // No variables to rewrite, no changes made.
@@ -207,21 +229,25 @@ public:
        LocalVars.cbegin(), LocalVars.cend(), std::back_inserter(LocalVarTypes),
        [](const GlobalVariable *V) -> Type * { return V->getValueType(); });

    StructType *LDSTy = StructType::create(
        Ctx, LocalVarTypes, llvm::StringRef("llvm.amdgcn.module.lds.t"));
    std::string VarName(
        F ? (Twine("llvm.amdgcn.kernel.") + F->getName() + ".lds").str()
          : "llvm.amdgcn.module.lds");
    StructType *LDSTy = StructType::create(Ctx, LocalVarTypes, VarName + ".t");

    Align MaxAlign =
        AMDGPU::getAlign(DL, LocalVars[0]); // was sorted on alignment

    GlobalVariable *SGV = new GlobalVariable(
        M, LDSTy, false, GlobalValue::InternalLinkage, UndefValue::get(LDSTy),
        "llvm.amdgcn.module.lds", nullptr, GlobalValue::NotThreadLocal,
        AMDGPUAS::LOCAL_ADDRESS, false);
        VarName, nullptr, GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS,
        false);
    SGV->setAlignment(MaxAlign);
    appendToCompilerUsed(
        M, {static_cast<GlobalValue *>(
               ConstantExpr::getPointerBitCastOrAddrSpaceCast(
                   cast<Constant>(SGV), Type::getInt8PtrTy(Ctx)))});
    if (!F) {
      appendToCompilerUsed(
          M, {static_cast<GlobalValue *>(
                 ConstantExpr::getPointerBitCastOrAddrSpaceCast(
                     cast<Constant>(SGV), Type::getInt8PtrTy(Ctx)))});
    }

    // The verifier rejects used lists containing an inttoptr of a constant
    // so remove the variables from these lists before replaceAllUsesWith
@@ -233,16 +259,25 @@ public:
    for (size_t I = 0; I < LocalVars.size(); I++) {
      GlobalVariable *GV = LocalVars[I];
      Constant *GEPIdx[] = {ConstantInt::get(I32, 0), ConstantInt::get(I32, I)};
      GV->replaceAllUsesWith(
          ConstantExpr::getGetElementPtr(LDSTy, SGV, GEPIdx));
      GV->eraseFromParent();
      Constant *GEP = ConstantExpr::getGetElementPtr(LDSTy, SGV, GEPIdx);
      if (F) {
        GV->replaceUsesWithIf(GEP, [F](Use &U) {
          return AMDGPU::isUsedOnlyFromFunction(U.getUser(), F);
        });
      } else {
        GV->replaceAllUsesWith(GEP);
      }
      if (GV->use_empty()) {
        UsedList.erase(GV);
        GV->eraseFromParent();
      }
    }

    // Mark kernels with asm that reads the address of the allocated structure
    // This is not necessary for lowering. This lets other passes, specifically
    // PromoteAlloca, accurately calculate how much LDS will be used by the
    // kernel after lowering.
    {
    if (!F) {
      IRBuilder<> Builder(Ctx);
      SmallPtrSet<Function *, 32> Kernels;
      for (auto &I : M.functions()) {
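The replaceUsesWithIf call in the hunks above rewrites only the uses reachable from the kernel being processed, and each rewritten use becomes a constant GEP into the new struct. A minimal sketch of the resulting IR, using names from the lower-kernel-lds-constexpr.ll test added below:

@lds.2 = internal unnamed_addr addrspace(3) global i32 undef, align 4
%llvm.amdgcn.kernel.k2.lds.t = type { i32 }
@llvm.amdgcn.kernel.k2.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k2.lds.t undef, align 4

define amdgpu_kernel void @k2() {
  ; the former use of @lds.2 now addresses member 0 of the kernel struct
  %ptr = bitcast i32 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k2.lds.t, %llvm.amdgcn.kernel.k2.lds.t addrspace(3)* @llvm.amdgcn.kernel.k2.lds, i32 0, i32 0) to i8 addrspace(3)*
  store i8 1, i8 addrspace(3)* %ptr, align 4
  ret void
}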
@@ -20,7 +20,7 @@ namespace llvm {

namespace AMDGPU {

bool isKernelCC(Function *Func) {
bool isKernelCC(const Function *Func) {
  return AMDGPU::isModuleEntryFunctionCC(Func->getCallingConv());
}

@@ -29,18 +29,33 @@ Align getAlign(DataLayout const &DL, const GlobalVariable *GV) {
                 GV->getValueType());
}

bool userRequiresLowering(const SmallPtrSetImpl<GlobalValue *> &UsedList,
                          User *InitialUser) {
bool isUsedOnlyFromFunction(const User *U, const Function *F) {
  if (auto *I = dyn_cast<Instruction>(U)) {
    return I->getFunction() == F;
  }

  if (auto *C = dyn_cast<ConstantExpr>(U)) {
    return all_of(U->users(),
                  [F](const User *U) { return isUsedOnlyFromFunction(U, F); });
  }

  return false;
}

bool shouldLowerLDSToStruct(const SmallPtrSetImpl<GlobalValue *> &UsedList,
                            const GlobalVariable &GV, const Function *F) {
  // Any LDS variable can be lowered by moving into the created struct
  // Each variable so lowered is allocated in every kernel, so variables
  // whose users are all known to be safe to lower without the transform
  // are left unchanged.
  SmallPtrSet<User *, 8> Visited;
  SmallVector<User *, 16> Stack;
  Stack.push_back(InitialUser);
  bool Ret = false;
  SmallPtrSet<const User *, 8> Visited;
  SmallVector<const User *, 16> Stack(GV.users());

  assert(!F || isKernelCC(F));

  while (!Stack.empty()) {
    User *V = Stack.pop_back_val();
    const User *V = Stack.pop_back_val();
    Visited.insert(V);

    if (auto *G = dyn_cast<GlobalValue>(V->stripPointerCasts())) {
@@ -50,31 +65,44 @@ bool userRequiresLowering(const SmallPtrSetImpl<GlobalValue *> &UsedList,
    }

    if (auto *I = dyn_cast<Instruction>(V)) {
      if (isKernelCC(I->getFunction())) {
        continue;
      const Function *UF = I->getFunction();
      if (UF == F) {
        // Used from this kernel, we want to put it into the structure.
        Ret = true;
      } else if (!F) {
        Ret |= !isKernelCC(UF);
      }
      continue;
    }

    if (auto *E = dyn_cast<ConstantExpr>(V)) {
      for (Value::user_iterator EU = E->user_begin(); EU != E->user_end();
           ++EU) {
        if (Visited.insert(*EU).second) {
          Stack.push_back(*EU);
      if (F) {
        // Any use which does not end up an instruction disqualifies a
        // variable to be put into a kernel's LDS structure because later
        // we will need to replace only this kernel's uses for which we
        // need to identify a using function.
        return isUsedOnlyFromFunction(E, F);
      }
      for (const User *U : E->users()) {
        if (Visited.insert(U).second) {
          Stack.push_back(U);
        }
      }
      continue;
    }

    // Unknown user, conservatively lower the variable
    return true;
    // Unknown user, conservatively lower the variable.
    // For module LDS conservatively means place it into the module LDS struct.
    // For kernel LDS it means lower as a standalone variable.
    return !F;
  }

  return false;
  return Ret;
}

std::vector<GlobalVariable *>
findVariablesToLower(Module &M,
                     const SmallPtrSetImpl<GlobalValue *> &UsedList) {
findVariablesToLower(Module &M, const SmallPtrSetImpl<GlobalValue *> &UsedList,
                     const Function *F) {
  std::vector<llvm::GlobalVariable *> LocalVars;
  for (auto &GV : M.globals()) {
    if (GV.getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) {
@@ -98,9 +126,7 @@ findVariablesToLower(Module &M,
      // dropped by the back end if not. This pass skips over it.
      continue;
    }
    if (std::none_of(GV.user_begin(), GV.user_end(), [&](User *U) {
          return userRequiresLowering(UsedList, U);
        })) {
    if (!shouldLowerLDSToStruct(UsedList, GV, F)) {
      continue;
    }
    LocalVars.push_back(&GV);
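The effect of this predicate, as exercised by the module-lds test at the end of this diff: with a kernel F given, a variable qualifies only when every use is (transitively) an instruction inside that kernel; with no F, any use from a non-kernel function forces lowering into the module struct. Roughly:

@kern = addrspace(3) global float undef, align 4  ; used only from kernel @timestwo
@func = addrspace(3) global float undef, align 4  ; used from a non-kernel function

; @kern ends up in the per-kernel struct @llvm.amdgcn.kernel.timestwo.lds,
; while @func ends up in the module struct @llvm.amdgcn.module.lds.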
@@ -19,18 +19,29 @@ namespace llvm {

namespace AMDGPU {

bool isKernelCC(Function *Func);
bool isKernelCC(const Function *Func);

Align getAlign(DataLayout const &DL, const GlobalVariable *GV);

bool userRequiresLowering(const SmallPtrSetImpl<GlobalValue *> &UsedList,
                          User *InitialUser);
/// \returns true if an LDS global requires lowering to a module LDS structure
/// if \p F is not given. If \p F is given it must be a kernel and function
/// \returns true if an LDS global is directly used from that kernel and it
/// is safe to replace its uses with a kernel LDS structure member.
/// \p UsedList contains a union of llvm.used and llvm.compiler.used variables
/// which do not count as a use.
bool shouldLowerLDSToStruct(const SmallPtrSetImpl<GlobalValue *> &UsedList,
                            const GlobalVariable &GV,
                            const Function *F = nullptr);

std::vector<GlobalVariable *>
findVariablesToLower(Module &M, const SmallPtrSetImpl<GlobalValue *> &UsedList);
findVariablesToLower(Module &M, const SmallPtrSetImpl<GlobalValue *> &UsedList,
                     const Function *F = nullptr);

SmallPtrSet<GlobalValue *, 32> getUsedList(Module &M);

/// \returns true if all uses of \p U end up in a function \p F.
bool isUsedOnlyFromFunction(const User *U, const Function *F);

} // end namespace AMDGPU

} // end namespace llvm
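One subtlety the doc comment calls out: being listed in llvm.used or llvm.compiler.used is not itself a use. A hedged sketch, with a hypothetical variable name:

@only.listed = internal addrspace(3) global i32 undef, align 4   ; hypothetical
@llvm.compiler.used = appending global [1 x i8*] [i8* addrspacecast (i32 addrspace(3)* @only.listed to i8*)], section "llvm.metadata"
; getUsedList(M) collects @only.listed, so shouldLowerLDSToStruct sees no
; lowering-relevant use from the used-list entry itself.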
@@ -83,7 +83,7 @@ define amdgpu_kernel void @mul_32bit_ptr(float addrspace(1)* %out, [3 x float] a
@g_lds = addrspace(3) global float undef, align 4

; FUNC-LABEL: {{^}}infer_ptr_alignment_global_offset:
; SI: v_mov_b32_e32 [[PTR:v[0-9]+]], g_lds@abs32@lo
; SI: v_mov_b32_e32 [[PTR:v[0-9]+]], 0{{$}}
; SI: ds_read_b32 v{{[0-9]+}}, [[PTR]]
define amdgpu_kernel void @infer_ptr_alignment_global_offset(float addrspace(1)* %out, i32 %tid) {
%val = load float, float addrspace(3)* @g_lds
@@ -9,9 +9,9 @@ define amdgpu_kernel void @use_lds_globals(i32 addrspace(1)* %out, i32 addrspace
; CHECK-LABEL: use_lds_globals:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; CHECK-NEXT: v_mov_b32_e32 v0, 4
; CHECK-NEXT: v_mov_b32_e32 v0, 8
; CHECK-NEXT: s_mov_b32 m0, -1
; CHECK-NEXT: ds_read_b32 v3, v0 offset:4
; CHECK-NEXT: ds_read_b32 v3, v0
; CHECK-NEXT: v_mov_b32_e32 v2, 9
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_add_u32 s0, s0, 4
@@ -1,4 +1,4 @@
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GCN %s
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -amdgpu-enable-lower-module-lds=0 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GCN %s
; FIXME: Merge with DAG test

@lds.external = external unnamed_addr addrspace(3) global [0 x i32]
(File diff suppressed because it is too large)
@@ -1096,27 +1096,33 @@ define amdgpu_kernel void @sgemm_inner_loop_read2_sequence(float addrspace(1)* %
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; CI-NEXT: s_lshl_b32 s0, s2, 2
; CI-NEXT: s_add_i32 s1, s0, 0xc20
; CI-NEXT: s_addk_i32 s0, 0xc60
; CI-NEXT: v_mov_b32_e32 v0, s1
; CI-NEXT: v_mov_b32_e32 v4, s0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_read2_b32 v[2:3], v0 offset1:1
; CI-NEXT: ds_read2_b32 v[4:5], v4 offset1:1
; CI-NEXT: s_add_i32 s1, s0, 0x8c40
; CI-NEXT: s_add_i32 s0, s0, 0x8c80
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: v_lshlrev_b32_e32 v8, 2, v1
; CI-NEXT: ds_read2_b32 v[0:1], v8 offset1:1
; CI-NEXT: ds_read2_b32 v[6:7], v8 offset0:32 offset1:33
; CI-NEXT: ds_read2_b32 v[8:9], v8 offset0:64 offset1:65
; CI-NEXT: s_mov_b32 s7, 0xf000
; CI-NEXT: s_mov_b32 s0, 0x8020
; CI-NEXT: v_add_i32_e32 v4, vcc, s0, v8
; CI-NEXT: s_mov_b32 s0, 0x80a0
; CI-NEXT: v_add_i32_e32 v6, vcc, s0, v8
; CI-NEXT: v_mov_b32_e32 v0, s1
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_read2_b32 v[0:1], v0 offset1:1
; CI-NEXT: ds_read2_b32 v[2:3], v2 offset1:1
; CI-NEXT: ds_read2_b32 v[4:5], v4 offset1:1
; CI-NEXT: ds_read2_b32 v[6:7], v6 offset1:1
; CI-NEXT: v_add_i32_e32 v8, vcc, 0x8120, v8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_add_f32_e32 v2, v2, v3
; CI-NEXT: v_add_f32_e32 v2, v2, v4
; CI-NEXT: v_add_f32_e32 v2, v2, v5
; CI-NEXT: v_add_f32_e32 v0, v2, v0
; CI-NEXT: v_add_f32_e32 v0, v0, v1
; CI-NEXT: v_add_f32_e32 v0, v0, v2
; CI-NEXT: v_add_f32_e32 v0, v0, v3
; CI-NEXT: ds_read2_b32 v[8:9], v8 offset1:1
; CI-NEXT: v_add_f32_e32 v0, v0, v4
; CI-NEXT: v_add_f32_e32 v0, v0, v5
; CI-NEXT: v_add_f32_e32 v0, v0, v6
; CI-NEXT: v_add_f32_e32 v0, v0, v7
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_add_f32_e32 v0, v0, v8
; CI-NEXT: s_mov_b32 s7, 0xf000
; CI-NEXT: s_mov_b32 s6, -1
; CI-NEXT: v_add_f32_e32 v0, v0, v9
; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0
@@ -1125,23 +1131,29 @@ define amdgpu_kernel void @sgemm_inner_loop_read2_sequence(float addrspace(1)* %
; GFX9-LABEL: sgemm_inner_loop_read2_sequence:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_lshl_b32 s2, s2, 2
; GFX9-NEXT: s_add_i32 s3, s2, 0xc20
; GFX9-NEXT: s_addk_i32 s2, 0xc60
; GFX9-NEXT: v_mov_b32_e32 v0, s3
; GFX9-NEXT: v_mov_b32_e32 v4, s2
; GFX9-NEXT: ds_read2_b32 v[2:3], v0 offset1:1
; GFX9-NEXT: ds_read2_b32 v[4:5], v4 offset1:1
; GFX9-NEXT: s_add_i32 s3, s2, 0x8c40
; GFX9-NEXT: v_lshlrev_b32_e32 v8, 2, v1
; GFX9-NEXT: ds_read2_b32 v[0:1], v8 offset1:1
; GFX9-NEXT: ds_read2_b32 v[6:7], v8 offset0:32 offset1:33
; GFX9-NEXT: ds_read2_b32 v[8:9], v8 offset0:64 offset1:65
; GFX9-NEXT: s_add_i32 s2, s2, 0x8c80
; GFX9-NEXT: v_mov_b32_e32 v0, s3
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: v_add_u32_e32 v4, 0x8020, v8
; GFX9-NEXT: v_add_u32_e32 v6, 0x80a0, v8
; GFX9-NEXT: ds_read2_b32 v[0:1], v0 offset1:1
; GFX9-NEXT: ds_read2_b32 v[2:3], v2 offset1:1
; GFX9-NEXT: ds_read2_b32 v[4:5], v4 offset1:1
; GFX9-NEXT: ds_read2_b32 v[6:7], v6 offset1:1
; GFX9-NEXT: v_add_u32_e32 v8, 0x8120, v8
; GFX9-NEXT: s_waitcnt lgkmcnt(3)
; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-NEXT: s_waitcnt lgkmcnt(2)
; GFX9-NEXT: v_add_f32_e32 v0, v0, v2
; GFX9-NEXT: v_add_f32_e32 v0, v0, v3
; GFX9-NEXT: ds_read2_b32 v[8:9], v8 offset1:1
; GFX9-NEXT: s_waitcnt lgkmcnt(2)
; GFX9-NEXT: v_add_f32_e32 v0, v0, v4
; GFX9-NEXT: v_add_f32_e32 v0, v0, v5
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v2, v2, v3
; GFX9-NEXT: v_add_f32_e32 v2, v2, v4
; GFX9-NEXT: v_add_f32_e32 v2, v2, v5
; GFX9-NEXT: v_add_f32_e32 v0, v2, v0
; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-NEXT: v_add_f32_e32 v0, v0, v6
; GFX9-NEXT: v_add_f32_e32 v0, v0, v7
; GFX9-NEXT: v_add_f32_e32 v0, v0, v8
@@ -1462,11 +1474,11 @@ define amdgpu_kernel void @read2_v2i32_align1_odd_offset(<2 x i32> addrspace(1)*
; CI: ; %bb.0: ; %entry
; CI-NEXT: v_mov_b32_e32 v0, 0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_read_u8 v1, v0 offset:72
; CI-NEXT: ds_read_u8 v2, v0 offset:71
; CI-NEXT: ds_read_u8 v3, v0 offset:70
; CI-NEXT: ds_read_u8 v4, v0 offset:69
; CI-NEXT: ds_read_u8 v5, v0 offset:68
; CI-NEXT: ds_read_u8 v1, v0 offset:37032
; CI-NEXT: ds_read_u8 v2, v0 offset:37031
; CI-NEXT: ds_read_u8 v3, v0 offset:37030
; CI-NEXT: ds_read_u8 v4, v0 offset:37029
; CI-NEXT: ds_read_u8 v5, v0 offset:37028
; CI-NEXT: s_waitcnt lgkmcnt(4)
; CI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
; CI-NEXT: s_waitcnt lgkmcnt(3)
@@ -1477,9 +1489,9 @@ define amdgpu_kernel void @read2_v2i32_align1_odd_offset(<2 x i32> addrspace(1)*
; CI-NEXT: v_or_b32_e32 v3, v3, v4
; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; CI-NEXT: v_or_b32_e32 v1, v1, v3
; CI-NEXT: ds_read_u8 v2, v0 offset:67
; CI-NEXT: ds_read_u8 v3, v0 offset:66
; CI-NEXT: ds_read_u8 v0, v0 offset:65
; CI-NEXT: ds_read_u8 v2, v0 offset:37027
; CI-NEXT: ds_read_u8 v3, v0 offset:37026
; CI-NEXT: ds_read_u8 v0, v0 offset:37025
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, -1
@@ -1496,14 +1508,14 @@ define amdgpu_kernel void @read2_v2i32_align1_odd_offset(<2 x i32> addrspace(1)*
; GFX9-ALIGNED-LABEL: read2_v2i32_align1_odd_offset:
; GFX9-ALIGNED: ; %bb.0: ; %entry
; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v2, 0
; GFX9-ALIGNED-NEXT: ds_read_u8 v0, v2 offset:65
; GFX9-ALIGNED-NEXT: ds_read_u8 v3, v2 offset:66
; GFX9-ALIGNED-NEXT: ds_read_u8 v4, v2 offset:67
; GFX9-ALIGNED-NEXT: ds_read_u8 v5, v2 offset:68
; GFX9-ALIGNED-NEXT: ds_read_u8 v1, v2 offset:69
; GFX9-ALIGNED-NEXT: ds_read_u8 v6, v2 offset:70
; GFX9-ALIGNED-NEXT: ds_read_u8 v7, v2 offset:71
; GFX9-ALIGNED-NEXT: ds_read_u8 v8, v2 offset:72
; GFX9-ALIGNED-NEXT: ds_read_u8 v0, v2 offset:37025
; GFX9-ALIGNED-NEXT: ds_read_u8 v3, v2 offset:37026
; GFX9-ALIGNED-NEXT: ds_read_u8 v4, v2 offset:37027
; GFX9-ALIGNED-NEXT: ds_read_u8 v5, v2 offset:37028
; GFX9-ALIGNED-NEXT: ds_read_u8 v1, v2 offset:37029
; GFX9-ALIGNED-NEXT: ds_read_u8 v6, v2 offset:37030
; GFX9-ALIGNED-NEXT: ds_read_u8 v7, v2 offset:37031
; GFX9-ALIGNED-NEXT: ds_read_u8 v8, v2 offset:37032
; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v3, 8, v3
@@ -1521,7 +1533,7 @@ define amdgpu_kernel void @read2_v2i32_align1_odd_offset(<2 x i32> addrspace(1)*
;
; GFX9-UNALIGNED-LABEL: read2_v2i32_align1_odd_offset:
; GFX9-UNALIGNED: ; %bb.0: ; %entry
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v0, 0x41
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v0, 0x90a1
; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-UNALIGNED-NEXT: ds_read2_b32 v[0:1], v0 offset1:1
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v2, 0
@@ -1,4 +1,4 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --force-update
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -enable-var-scope --check-prefix=CI %s
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,-unaligned-access-mode < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9-ALIGNED %s
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,+unaligned-access-mode < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9-UNALIGNED %s
@@ -889,9 +889,9 @@ define amdgpu_kernel void @write2_sgemm_sequence(float addrspace(1)* %C, i32 %ld
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; CI-NEXT: s_lshl_b32 s2, s2, 2
; CI-NEXT: s_add_i32 s3, s2, 0xc20
; CI-NEXT: s_add_i32 s3, s2, 0x8c40
; CI-NEXT: v_mov_b32_e32 v0, s3
; CI-NEXT: s_addk_i32 s2, 0xc60
; CI-NEXT: s_add_i32 s2, s2, 0x8c80
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_load_dword s0, s[0:1], 0x0
; CI-NEXT: s_mov_b32 m0, -1
@@ -902,17 +902,20 @@ define amdgpu_kernel void @write2_sgemm_sequence(float addrspace(1)* %C, i32 %ld
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: ds_write2_b32 v0, v2, v3 offset1:1
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v1
; CI-NEXT: v_add_i32_e32 v1, vcc, 0x8020, v0
; CI-NEXT: ds_write2_b32 v1, v2, v3 offset1:1
; CI-NEXT: v_add_i32_e32 v1, vcc, 0x80a0, v0
; CI-NEXT: v_add_i32_e32 v0, vcc, 0x8120, v0
; CI-NEXT: ds_write2_b32 v1, v2, v3 offset1:1
; CI-NEXT: ds_write2_b32 v0, v2, v3 offset1:1
; CI-NEXT: ds_write2_b32 v0, v2, v3 offset0:32 offset1:33
; CI-NEXT: ds_write2_b32 v0, v2, v3 offset0:64 offset1:65
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: write2_sgemm_sequence:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT: s_lshl_b32 s2, s2, 2
; GFX9-NEXT: s_add_i32 s3, s2, 0xc20
; GFX9-NEXT: s_addk_i32 s2, 0xc60
; GFX9-NEXT: s_add_i32 s3, s2, 0x8c40
; GFX9-NEXT: s_add_i32 s2, s2, 0x8c80
; GFX9-NEXT: v_mov_b32_e32 v0, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s0, s[0:1], 0x0
@@ -923,9 +926,12 @@ define amdgpu_kernel void @write2_sgemm_sequence(float addrspace(1)* %C, i32 %ld
; GFX9-NEXT: ds_write2_b32 v0, v3, v4 offset1:1
; GFX9-NEXT: ds_write2_b32 v2, v3, v4 offset1:1
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v1
; GFX9-NEXT: v_add_u32_e32 v1, 0x8020, v0
; GFX9-NEXT: ds_write2_b32 v1, v3, v4 offset1:1
; GFX9-NEXT: v_add_u32_e32 v1, 0x80a0, v0
; GFX9-NEXT: v_add_u32_e32 v0, 0x8120, v0
; GFX9-NEXT: ds_write2_b32 v1, v3, v4 offset1:1
; GFX9-NEXT: ds_write2_b32 v0, v3, v4 offset1:1
; GFX9-NEXT: ds_write2_b32 v0, v3, v4 offset0:32 offset1:33
; GFX9-NEXT: ds_write2_b32 v0, v3, v4 offset0:64 offset1:65
; GFX9-NEXT: s_endpgm
%x.i = tail call i32 @llvm.amdgcn.workgroup.id.x() #1
%y.i = tail call i32 @llvm.amdgcn.workitem.id.y() #1
@@ -1026,37 +1032,37 @@ define amdgpu_kernel void @write2_v2i32_align1_odd_offset() {
; CI-NEXT: v_mov_b32_e32 v0, 0x7b
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_write_b8 v1, v0 offset:65
; CI-NEXT: ds_write_b8 v1, v0 offset:37025
; CI-NEXT: v_mov_b32_e32 v0, 1
; CI-NEXT: ds_write_b8 v1, v0 offset:70
; CI-NEXT: ds_write_b8 v1, v0 offset:37030
; CI-NEXT: v_mov_b32_e32 v0, 0xc8
; CI-NEXT: ds_write_b8 v1, v0 offset:69
; CI-NEXT: ds_write_b8 v1, v1 offset:68
; CI-NEXT: ds_write_b8 v1, v1 offset:67
; CI-NEXT: ds_write_b8 v1, v1 offset:66
; CI-NEXT: ds_write_b8 v1, v1 offset:72
; CI-NEXT: ds_write_b8 v1, v1 offset:71
; CI-NEXT: ds_write_b8 v1, v0 offset:37029
; CI-NEXT: ds_write_b8 v1, v1 offset:37028
; CI-NEXT: ds_write_b8 v1, v1 offset:37027
; CI-NEXT: ds_write_b8 v1, v1 offset:37026
; CI-NEXT: ds_write_b8 v1, v1 offset:37032
; CI-NEXT: ds_write_b8 v1, v1 offset:37031
; CI-NEXT: s_endpgm
;
; GFX9-ALIGNED-LABEL: write2_v2i32_align1_odd_offset:
; GFX9-ALIGNED: ; %bb.0: ; %entry
; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v0, 0x7b
; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v1, 0
; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v0 offset:65
; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v0 offset:37025
; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v0, 1
; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v0 offset:70
; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v0 offset:37030
; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v0, 0xc8
; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v0 offset:69
; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v1 offset:68
; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v1 offset:67
; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v1 offset:66
; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v1 offset:72
; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v1 offset:71
; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v0 offset:37029
; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v1 offset:37028
; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v1 offset:37027
; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v1 offset:37026
; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v1 offset:37032
; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v1 offset:37031
; GFX9-ALIGNED-NEXT: s_endpgm
;
; GFX9-UNALIGNED-LABEL: write2_v2i32_align1_odd_offset:
; GFX9-UNALIGNED: ; %bb.0: ; %entry
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v0, 0x41
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v0, 0x90a1
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v1, 0x7b
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v2, 0x1c8
; GFX9-UNALIGNED-NEXT: ds_write2_b32 v0, v1, v2 offset1:1
@@ -46,9 +46,9 @@ define amdgpu_kernel void @test_round_size_2(i8 addrspace(1)* %out, i8 addrspace
ret void
}

; 38 + (10 pad) + 38
; 38 + (2 pad) + 38
; HSA-LABEL: {{^}}test_round_size_2_align_8:
; HSA: workgroup_group_segment_byte_size = 86
; HSA: workgroup_group_segment_byte_size = 78
; HSA: group_segment_alignment = 4
define amdgpu_kernel void @test_round_size_2_align_8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
%lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
@@ -94,9 +94,10 @@ define amdgpu_kernel void @test_high_align_lds_arg(i8 addrspace(1)* %out, i8 add
ret void
}

; (7 * 8) + (39 * 4) = 212
; FIXME: missing alignment can be improved.
; (39 * 4) + (4 pad) + (7 * 8) = 216
; HSA-LABEL: {{^}}test_missing_alignment_size_2_order0:
; HSA: workgroup_group_segment_byte_size = 212
; HSA: workgroup_group_segment_byte_size = 216
; HSA: group_segment_alignment = 4
define amdgpu_kernel void @test_missing_alignment_size_2_order0(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
%lds.missing.align.0.bc = bitcast [39 x i32] addrspace(3)* @lds.missing.align.0 to i8 addrspace(3)*
@@ -125,22 +126,11 @@ define amdgpu_kernel void @test_missing_alignment_size_2_order1(i8 addrspace(1)*

ret void
}
; Test how the size needed for padding changes based on when the
; global is encountered during lowering. There should be a consistent
; order to minimize padding waste.
;
; The way global addresses are lowered now, this is in inverse of
; first use order which isn't great.
;
; This should be the optimal order for these globals. If sorted to
; minimize padding, the minimum possible size is: align 32, align 8,
; align 16


; align 32, 16, 8
; 38 + (10 pad) + 38 + (10 pad) + 38 = 134
; 38 + (10 pad) + 38 + (2 pad) + 38 = 126
; HSA-LABEL: {{^}}test_round_size_3_order0:
; HSA: workgroup_group_segment_byte_size = 134
; HSA: workgroup_group_segment_byte_size = 126
; HSA: group_segment_alignment = 4
define amdgpu_kernel void @test_round_size_3_order0(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
%lds.align32.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align32.0 to i8 addrspace(3)*
@@ -159,9 +149,9 @@ define amdgpu_kernel void @test_round_size_3_order0(i8 addrspace(1)* %out, i8 ad
}

; align 32, 8, 16
; 38 (+ 2 pad) + 38 + (18 pad) + 38 = 134
; 38 (+ 10 pad) + 38 + (2 pad) + 38 = 126
; HSA-LABEL: {{^}}test_round_size_3_order1:
; HSA: workgroup_group_segment_byte_size = 134
; HSA: workgroup_group_segment_byte_size = 126
; HSA: group_segment_alignment = 4
define amdgpu_kernel void @test_round_size_3_order1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
%lds.align32.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align32.0 to i8 addrspace(3)*
@@ -180,9 +170,9 @@ define amdgpu_kernel void @test_round_size_3_order1(i8 addrspace(1)* %out, i8 ad
}

; align 16, 32, 8
; 38 + (26 pad) + 38 + (10 pad) + 38 = 150
; 38 + (10 pad) + 38 + (2 pad) + 38 = 126
; HSA-LABEL: {{^}}test_round_size_3_order2:
; HSA: workgroup_group_segment_byte_size = 150
; HSA: workgroup_group_segment_byte_size = 126
; HSA: group_segment_alignment = 4
define amdgpu_kernel void @test_round_size_3_order2(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
%lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
@@ -200,10 +190,11 @@ define amdgpu_kernel void @test_round_size_3_order2(i8 addrspace(1)* %out, i8 ad
ret void
}

; FIXME: Improve alignment
; align 16, 8, 32
; 38 + (2 pad) + 38 + (2 pad) + 38
; 38 + (10 pad) + 38 + (2 pad) + 38
; HSA-LABEL: {{^}}test_round_size_3_order3:
; HSA: workgroup_group_segment_byte_size = 118
; HSA: workgroup_group_segment_byte_size = 126
; HSA: group_segment_alignment = 4
define amdgpu_kernel void @test_round_size_3_order3(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
%lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
@@ -222,9 +213,9 @@ define amdgpu_kernel void @test_round_size_3_order3(i8 addrspace(1)* %out, i8 ad
}

; align 8, 32, 16
; 38 + (26 pad) + 38 + (2 pad) + 38 = 142
; 38 + (10 pad) + 38 + (2 pad) + 38 = 126
; HSA-LABEL: {{^}}test_round_size_3_order4:
; HSA: workgroup_group_segment_byte_size = 142
; HSA: workgroup_group_segment_byte_size = 126
; HSA: group_segment_alignment = 4
define amdgpu_kernel void @test_round_size_3_order4(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
%lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)*
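The updated byte sizes in this file all follow from the pass allocating the three 38-byte arrays in decreasing alignment order, independent of the order in which the globals are encountered. The arithmetic behind the order* tests, as a sketch:

; layout: [38 x i8] align 32 at offset 0;  next offset rounds 38 up to 48 (10 pad)
;         [38 x i8] align 16 at offset 48; next offset rounds 86 up to 88 (2 pad)
;         [38 x i8] align 8  at offset 88; struct ends at offset 126
; hence: 38 + (10 pad) + 38 + (2 pad) + 38 = 126 bytes in every ordering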
@@ -1,5 +1,5 @@
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GCN %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -filetype=obj < %s | llvm-readobj -r -t - | FileCheck -check-prefixes=ELF %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -amdgpu-enable-lower-module-lds=0 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GCN %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -amdgpu-enable-lower-module-lds=0 -filetype=obj < %s | llvm-readobj -r -t - | FileCheck -check-prefixes=ELF %s

@lds.external = external unnamed_addr addrspace(3) global [0 x i32]
@lds.defined = unnamed_addr addrspace(3) global [8 x i32] undef, align 8
@@ -275,11 +275,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(i64* %ptr) #0
; CIVI-DAG: s_mov_b32 m0
; GFX9-NOT: m0

; CIVI-DAG: v_lshlrev_b32_e32 [[OFS:v[0-9]+]], 2, {{v[0-9]+}}
; CIVI-DAG: v_add_{{[ui]}}32_e32 [[PTR:v[0-9]+]], vcc, lds0@abs32@lo, [[OFS]]
; GFX9-DAG: s_mov_b32 [[BASE:s[0-9]+]], lds0@abs32@lo
; GFX9-DAG: v_lshl_add_u32 [[PTR:v[0-9]+]], {{v[0-9]+}}, 2, [[BASE]]

; GCN-DAG: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
; GCN: ds_dec_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
define amdgpu_kernel void @atomic_dec_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
@@ -428,11 +424,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_addr64(i64 addrspa
; CIVI-DAG: s_mov_b32 m0
; GFX9-NOT: m0

; CIVI-DAG: v_lshlrev_b32_e32 [[OFS:v[0-9]+]], 3, {{v[0-9]+}}
; CIVI-DAG: v_add_{{[ui]}}32_e32 [[PTR:v[0-9]+]], vcc, lds1@abs32@lo, [[OFS]]
; GFX9-DAG: v_mov_b32_e32 [[BASE:v[0-9]+]], lds1@abs32@lo
; GFX9-DAG: v_lshl_add_u32 [[PTR:v[0-9]+]], {{v[0-9]+}}, 3, [[BASE]]

; GCN-DAG: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 3, {{v[0-9]+}}
; GCN: ds_dec_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]], v{{\[[0-9]+:[0-9]+\]}} offset:16
define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
@@ -136,10 +136,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(i32 addrspa
@lds0 = addrspace(3) global [512 x i32] undef, align 4

; GCN-LABEL: {{^}}atomic_inc_shl_base_lds_0_i32:
; CIVI-DAG: v_lshlrev_b32_e32 [[OFS:v[0-9]+]], 2, {{v[0-9]+}}
; CIVI-DAG: v_add_{{[ui]}}32_e32 [[PTR:v[0-9]+]], vcc, lds0@abs32@lo, [[OFS]]
; GFX9-DAG: s_mov_b32 [[BASE:s[0-9]+]], lds0@abs32@lo
; GFX9-DAG: v_lshl_add_u32 [[PTR:v[0-9]+]], {{v[0-9]+}}, 2, [[BASE]]
; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
; GCN: ds_inc_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
@@ -337,10 +334,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(i32* %ptr) #0
@lds1 = addrspace(3) global [512 x i64] undef, align 8

; GCN-LABEL: {{^}}atomic_inc_shl_base_lds_0_i64:
; CIVI-DAG: v_lshlrev_b32_e32 [[OFS:v[0-9]+]], 3, {{v[0-9]+}}
; CIVI-DAG: v_add_{{[ui]}}32_e32 [[PTR:v[0-9]+]], vcc, lds1@abs32@lo, [[OFS]]
; GFX9-DAG: v_mov_b32_e32 [[BASE:v[0-9]+]], lds1@abs32@lo
; GFX9-DAG: v_lshl_add_u32 [[PTR:v[0-9]+]], {{v[0-9]+}}, 3, [[BASE]]
; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 3, {{v[0-9]+}}
; GCN: ds_inc_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]], v{{\[[0-9]+:[0-9]+\]}} offset:16
define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
@@ -38,17 +38,15 @@ entry:

; GCN-LABEL: {{^}}local_memory_two_objects:
; GCN: v_lshlrev_b32_e32 [[ADDRW:v[0-9]+]], 2, v0
; CI-DAG: v_sub_i32_e32 [[SUB:v[0-9]+]], vcc, 0, [[ADDRW]]
; CI-DAG: ds_write2_b32 [[ADDRW]], {{v[0-9]+}}, {{v[0-9]+}} offset1:4
; SI-DAG: ds_write2_b32 [[ADDRW]], {{v[0-9]+}}, {{v[0-9]+}} offset1:4
; SI-DAG: v_sub_i32_e32 [[SUB0:v[0-9]+]], vcc, 28, [[ADDRW]]
; CI-DAG: v_sub_i32_e32 [[SUB0:v[0-9]+]], vcc, 0, [[ADDRW]]
; SI-DAG: v_sub_i32_e32 [[SUB0:v[0-9]+]], vcc, 12, [[ADDRW]]
; GCN-DAG: v_sub_i32_e32 [[SUB1:v[0-9]+]], vcc, 16, [[ADDRW]]
; GCN-DAG: ds_write2_b32 [[ADDRW]], {{v[0-9]+}}, {{v[0-9]+}} offset1:4

; GCN: s_barrier

; SI-DAG: v_sub_i32_e32 [[SUB1:v[0-9]+]], vcc, 12, [[ADDRW]]
; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[SUB0]]
; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[SUB1]]
; CI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, [[SUB]] offset0:3 offset1:7
; GCN-DAG: ds_read_b32 v{{[0-9]+}}, [[SUB0]]
; GCN-DAG: ds_read_b32 v{{[0-9]+}}, [[SUB1]]

define amdgpu_kernel void @local_memory_two_objects(i32 addrspace(1)* %out) #0 {
entry:
@@ -10,7 +10,7 @@
; not an immediate.

; FUNC-LABEL: {{^}}load_i32_local_const_ptr:
; GCN: v_mov_b32_e32 v[[PTR:[0-9]+]], lds@abs32@lo
; GCN: v_mov_b32_e32 v[[PTR:[0-9]+]], 0{{$}}
; GCN: ds_read_b32 v{{[0-9]+}}, v[[PTR]] offset:4

; R600: LDS_READ_RET
@@ -1,5 +1,3 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: opt -mtriple=amdgcn-- -S -structurizecfg -si-annotate-control-flow %s | FileCheck -check-prefix=OPT %s
; RUN: llc -march=amdgcn -verify-machineinstrs -disable-block-placement < %s | FileCheck -check-prefix=GCN %s

@@ -49,7 +47,7 @@ define amdgpu_kernel void @break_loop(i32 %arg) #0 {
; GCN-NEXT: s_cbranch_scc1 BB0_3
; GCN-NEXT: ; %bb.2: ; %bb4
; GCN-NEXT: ; in Loop: Header=BB0_1 Depth=1
; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0
; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cmp_ge_i32_e32 vcc, v0, v1
; GCN-NEXT: s_andn2_b64 s[4:5], s[4:5], exec
@@ -128,7 +126,7 @@ define amdgpu_kernel void @undef_phi_cond_break_loop(i32 %arg) #0 {
; GCN-NEXT: s_cbranch_scc1 BB1_3
; GCN-NEXT: ; %bb.2: ; %bb4
; GCN-NEXT: ; in Loop: Header=BB1_1 Depth=1
; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0
; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cmp_ge_i32_e32 vcc, v0, v1
; GCN-NEXT: s_andn2_b64 s[6:7], s[6:7], exec
@@ -206,24 +204,20 @@ define amdgpu_kernel void @constexpr_phi_cond_break_loop(i32 %arg) #0 {
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dword s3, s[0:1], 0x9
; GCN-NEXT: s_mov_b64 s[0:1], 0
; GCN-NEXT: s_mov_b32 s2, lds@abs32@lo
; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s3, v0
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: ; implicit-def: $sgpr4_sgpr5
; GCN-NEXT: ; implicit-def: $sgpr3
; GCN-NEXT: ; implicit-def: $sgpr6
; GCN-NEXT: BB2_1: ; %bb1
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN-NEXT: v_cmp_ne_u32_e64 s[8:9], s2, 4
; GCN-NEXT: s_andn2_b64 s[4:5], s[4:5], exec
; GCN-NEXT: s_and_b64 s[8:9], s[8:9], exec
; GCN-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9]
; GCN-NEXT: s_cmp_gt_i32 s3, -1
; GCN-NEXT: s_or_b64 s[4:5], s[4:5], exec
; GCN-NEXT: s_cmp_gt_i32 s6, -1
; GCN-NEXT: s_cbranch_scc1 BB2_3
; GCN-NEXT: ; %bb.2: ; %bb4
; GCN-NEXT: ; in Loop: Header=BB2_1 Depth=1
; GCN-NEXT: buffer_load_dword v1, off, s[4:7], 0
; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cmp_ge_i32_e32 vcc, v0, v1
; GCN-NEXT: s_andn2_b64 s[4:5], s[4:5], exec
@@ -231,7 +225,7 @@ define amdgpu_kernel void @constexpr_phi_cond_break_loop(i32 %arg) #0 {
; GCN-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9]
; GCN-NEXT: BB2_3: ; %Flow
; GCN-NEXT: ; in Loop: Header=BB2_1 Depth=1
; GCN-NEXT: s_add_i32 s3, s3, 1
; GCN-NEXT: s_add_i32 s6, s6, 1
; GCN-NEXT: s_and_b64 s[8:9], exec, s[4:5]
; GCN-NEXT: s_or_b64 s[0:1], s[8:9], s[0:1]
; GCN-NEXT: s_andn2_b64 exec, exec, s[0:1]
@@ -311,7 +305,7 @@ define amdgpu_kernel void @true_phi_cond_break_loop(i32 %arg) #0 {
; GCN-NEXT: s_cbranch_scc1 BB3_3
; GCN-NEXT: ; %bb.2: ; %bb4
; GCN-NEXT: ; in Loop: Header=BB3_1 Depth=1
; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0
; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cmp_ge_i32_e32 vcc, v0, v1
; GCN-NEXT: s_andn2_b64 s[4:5], s[4:5], exec
@@ -399,7 +393,7 @@ define amdgpu_kernel void @false_phi_cond_break_loop(i32 %arg) #0 {
; GCN-NEXT: s_cbranch_scc1 BB4_3
; GCN-NEXT: ; %bb.2: ; %bb4
; GCN-NEXT: ; in Loop: Header=BB4_1 Depth=1
; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0
; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cmp_ge_i32_e32 vcc, v0, v1
; GCN-NEXT: s_andn2_b64 s[4:5], s[4:5], exec
@@ -491,7 +485,7 @@ define amdgpu_kernel void @invert_true_phi_cond_break_loop(i32 %arg) #0 {
; GCN-NEXT: s_cbranch_scc1 BB5_3
; GCN-NEXT: ; %bb.2: ; %bb4
; GCN-NEXT: ; in Loop: Header=BB5_1 Depth=1
; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0
; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cmp_ge_i32_e32 vcc, v0, v1
; GCN-NEXT: s_andn2_b64 s[4:5], s[4:5], exec
test/CodeGen/AMDGPU/lower-kernel-and-module-lds.ll (new file, 119 lines)
@@ -0,0 +1,119 @@
; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s | FileCheck %s
; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck %s

@lds.size.1.align.1 = internal unnamed_addr addrspace(3) global [1 x i8] undef, align 1
@lds.size.2.align.2 = internal unnamed_addr addrspace(3) global [2 x i8] undef, align 2
@lds.size.4.align.4 = internal unnamed_addr addrspace(3) global [4 x i8] undef, align 4
@lds.size.8.align.8 = internal unnamed_addr addrspace(3) global [8 x i8] undef, align 8
@lds.size.16.align.16 = internal unnamed_addr addrspace(3) global [16 x i8] undef, align 16

; CHECK: %llvm.amdgcn.module.lds.t = type { [8 x i8], [1 x i8] }
; CHECK: %llvm.amdgcn.kernel.k0.lds.t = type { [16 x i8], [4 x i8], [2 x i8] }
; CHECK: %llvm.amdgcn.kernel.k1.lds.t = type { [16 x i8], [4 x i8], [2 x i8] }
; CHECK: %llvm.amdgcn.kernel..lds.t = type { [2 x i8] }
; CHECK: %llvm.amdgcn.kernel..lds.t.0 = type { [4 x i8] }

;.
; CHECK: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t undef, align 8
; CHECK: @llvm.compiler.used = appending global [1 x i8*] [i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0, i32 0) to i8*)], section "llvm.metadata"
; CHECK: @llvm.amdgcn.kernel.k0.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k0.lds.t undef, align 16
; CHECK: @llvm.amdgcn.kernel.k1.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k1.lds.t undef, align 16
; CHECK: @llvm.amdgcn.kernel..lds = internal addrspace(3) global %llvm.amdgcn.kernel..lds.t undef, align 2
; CHECK: @llvm.amdgcn.kernel..lds.1 = internal addrspace(3) global %llvm.amdgcn.kernel..lds.t.0 undef, align 4
;.
define amdgpu_kernel void @k0() {
; CHECK-LABEL: @k0(
; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ]
; CHECK-NEXT: %lds.size.1.align.1.bc = bitcast [1 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i8 addrspace(3)*
; CHECK-NEXT: store i8 1, i8 addrspace(3)* %lds.size.1.align.1.bc, align 1
; CHECK-NEXT: %lds.size.2.align.2.bc = bitcast [2 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, %llvm.amdgcn.kernel.k0.lds.t addrspace(3)* @llvm.amdgcn.kernel.k0.lds, i32 0, i32 2) to i8 addrspace(3)*
; CHECK-NEXT: store i8 2, i8 addrspace(3)* %lds.size.2.align.2.bc, align 2
; CHECK-NEXT: %lds.size.4.align.4.bc = bitcast [4 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, %llvm.amdgcn.kernel.k0.lds.t addrspace(3)* @llvm.amdgcn.kernel.k0.lds, i32 0, i32 1) to i8 addrspace(3)*
; CHECK-NEXT: store i8 4, i8 addrspace(3)* %lds.size.4.align.4.bc, align 4
; CHECK-NEXT: %lds.size.16.align.16.bc = bitcast [16 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, %llvm.amdgcn.kernel.k0.lds.t addrspace(3)* @llvm.amdgcn.kernel.k0.lds, i32 0, i32 0) to i8 addrspace(3)*
; CHECK-NEXT: store i8 16, i8 addrspace(3)* %lds.size.16.align.16.bc, align 16
; CHECK-NEXT: ret void
;
%lds.size.1.align.1.bc = bitcast [1 x i8] addrspace(3)* @lds.size.1.align.1 to i8 addrspace(3)*
store i8 1, i8 addrspace(3)* %lds.size.1.align.1.bc, align 1

%lds.size.2.align.2.bc = bitcast [2 x i8] addrspace(3)* @lds.size.2.align.2 to i8 addrspace(3)*
store i8 2, i8 addrspace(3)* %lds.size.2.align.2.bc, align 2

%lds.size.4.align.4.bc = bitcast [4 x i8] addrspace(3)* @lds.size.4.align.4 to i8 addrspace(3)*
store i8 4, i8 addrspace(3)* %lds.size.4.align.4.bc, align 4

%lds.size.16.align.16.bc = bitcast [16 x i8] addrspace(3)* @lds.size.16.align.16 to i8 addrspace(3)*
store i8 16, i8 addrspace(3)* %lds.size.16.align.16.bc, align 16

ret void
}

define amdgpu_kernel void @k1() {
; CHECK-LABEL: @k1(
; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ]
; CHECK-NEXT: %lds.size.2.align.2.bc = bitcast [2 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, %llvm.amdgcn.kernel.k1.lds.t addrspace(3)* @llvm.amdgcn.kernel.k1.lds, i32 0, i32 2) to i8 addrspace(3)*
; CHECK-NEXT: store i8 2, i8 addrspace(3)* %lds.size.2.align.2.bc, align 2
; CHECK-NEXT: %lds.size.4.align.4.bc = bitcast [4 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, %llvm.amdgcn.kernel.k1.lds.t addrspace(3)* @llvm.amdgcn.kernel.k1.lds, i32 0, i32 1) to i8 addrspace(3)*
; CHECK-NEXT: store i8 4, i8 addrspace(3)* %lds.size.4.align.4.bc, align 4
; CHECK-NEXT: %lds.size.16.align.16.bc = bitcast [16 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, %llvm.amdgcn.kernel.k1.lds.t addrspace(3)* @llvm.amdgcn.kernel.k1.lds, i32 0, i32 0) to i8 addrspace(3)*
; CHECK-NEXT: store i8 16, i8 addrspace(3)* %lds.size.16.align.16.bc, align 16
; CHECK-NEXT: ret void
;
%lds.size.2.align.2.bc = bitcast [2 x i8] addrspace(3)* @lds.size.2.align.2 to i8 addrspace(3)*
store i8 2, i8 addrspace(3)* %lds.size.2.align.2.bc, align 2

%lds.size.4.align.4.bc = bitcast [4 x i8] addrspace(3)* @lds.size.4.align.4 to i8 addrspace(3)*
store i8 4, i8 addrspace(3)* %lds.size.4.align.4.bc, align 4

%lds.size.16.align.16.bc = bitcast [16 x i8] addrspace(3)* @lds.size.16.align.16 to i8 addrspace(3)*
store i8 16, i8 addrspace(3)* %lds.size.16.align.16.bc, align 16

ret void
}

define amdgpu_kernel void @0() {
; CHECK-LABEL: @0(
; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ]
; CHECK-NEXT: %lds.size.2.align.2.bc = bitcast [2 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel..lds.t, %llvm.amdgcn.kernel..lds.t addrspace(3)* @llvm.amdgcn.kernel..lds, i32 0, i32 0) to i8 addrspace(3)*
; CHECK-NEXT: store i8 2, i8 addrspace(3)* %lds.size.2.align.2.bc, align 2
; CHECK-NEXT: ret void
;
%lds.size.2.align.2.bc = bitcast [2 x i8] addrspace(3)* @lds.size.2.align.2 to i8 addrspace(3)*
store i8 2, i8 addrspace(3)* %lds.size.2.align.2.bc, align 2

ret void
}

define amdgpu_kernel void @1() {
; CHECK-LABEL: @1(
; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ]
; CHECK-NEXT: %lds.size.4.align.4.bc = bitcast [4 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel..lds.t.0, %llvm.amdgcn.kernel..lds.t.0 addrspace(3)* @llvm.amdgcn.kernel..lds.1, i32 0, i32 0) to i8 addrspace(3)*
; CHECK-NEXT: store i8 4, i8 addrspace(3)* %lds.size.4.align.4.bc, align 4
; CHECK-NEXT: ret void
;
%lds.size.4.align.4.bc = bitcast [4 x i8] addrspace(3)* @lds.size.4.align.4 to i8 addrspace(3)*
store i8 4, i8 addrspace(3)* %lds.size.4.align.4.bc, align 4

ret void
}

define void @f0() {
; CHECK-LABEL: @f0(
; CHECK-NEXT: %lds.size.1.align.1.bc = bitcast [1 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i8 addrspace(3)*
; CHECK-NEXT: store i8 1, i8 addrspace(3)* %lds.size.1.align.1.bc, align 1
; CHECK-NEXT: %lds.size.8.align.8.bc = bitcast [8 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0) to i8 addrspace(3)*
; CHECK-NEXT: store i8 8, i8 addrspace(3)* %lds.size.8.align.8.bc, align 4
; CHECK-NEXT: ret void
;
%lds.size.1.align.1.bc = bitcast [1 x i8] addrspace(3)* @lds.size.1.align.1 to i8 addrspace(3)*
store i8 1, i8 addrspace(3)* %lds.size.1.align.1.bc, align 1

%lds.size.8.align.8.bc = bitcast [8 x i8] addrspace(3)* @lds.size.8.align.8 to i8 addrspace(3)*
store i8 8, i8 addrspace(3)* %lds.size.8.align.8.bc, align 4

ret void
}
;.
; CHECK: attributes #0 = { nofree nosync nounwind readnone willreturn }
;.
test/CodeGen/AMDGPU/lower-kernel-lds-constexpr.ll (new file, 68 lines)
@@ -0,0 +1,68 @@
; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s | FileCheck %s
; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck %s

; CHECK: %llvm.amdgcn.kernel.k2.lds.t = type { i32 }

@lds.1 = internal unnamed_addr addrspace(3) global [2 x i8] undef, align 1

; Use constant from different kernels
;.
; CHECK: @lds.1 = internal unnamed_addr addrspace(3) global [2 x i8] undef, align 1
; CHECK: @llvm.amdgcn.kernel.k2.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k2.lds.t undef, align 4
;.
define amdgpu_kernel void @k0(i64 %x) {
; CHECK-LABEL: @k0(
; CHECK-NEXT: %ptr = getelementptr inbounds i8, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([2 x i8], [2 x i8] addrspace(3)* @lds.1, i32 0, i32 0) to i8*), i64 %x
; CHECK-NEXT: store i8 1, i8* %ptr, align 1
; CHECK-NEXT: ret void
;
%ptr = getelementptr inbounds i8, i8* addrspacecast ([2 x i8] addrspace(3)* @lds.1 to i8*), i64 %x
store i8 1, i8 addrspace(0)* %ptr, align 1
ret void
}

define amdgpu_kernel void @k1(i64 %x) {
; CHECK-LABEL: @k1(
; CHECK-NEXT: %ptr = getelementptr inbounds i8, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([2 x i8], [2 x i8] addrspace(3)* @lds.1, i32 0, i32 0) to i8*), i64 %x
; CHECK-NEXT: store i8 1, i8* %ptr, align 1
; CHECK-NEXT: ret void
;
%ptr = getelementptr inbounds i8, i8* addrspacecast ([2 x i8] addrspace(3)* @lds.1 to i8*), i64 %x
store i8 1, i8 addrspace(0)* %ptr, align 1
ret void
}

@lds.2 = internal unnamed_addr addrspace(3) global i32 undef, align 4

; Use constant twice from the same kernel
define amdgpu_kernel void @k2(i64 %x) {
; CHECK-LABEL: @k2(
; CHECK-NEXT: %ptr1 = bitcast i32 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k2.lds.t, %llvm.amdgcn.kernel.k2.lds.t addrspace(3)* @llvm.amdgcn.kernel.k2.lds, i32 0, i32 0) to i8 addrspace(3)*
; CHECK-NEXT: store i8 1, i8 addrspace(3)* %ptr1, align 4
; CHECK-NEXT: %ptr2 = bitcast i32 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k2.lds.t, %llvm.amdgcn.kernel.k2.lds.t addrspace(3)* @llvm.amdgcn.kernel.k2.lds, i32 0, i32 0) to i8 addrspace(3)*
; CHECK-NEXT: store i8 2, i8 addrspace(3)* %ptr2, align 4
; CHECK-NEXT: ret void
;
%ptr1 = bitcast i32 addrspace(3)* @lds.2 to i8 addrspace(3)*
store i8 1, i8 addrspace(3)* %ptr1, align 4
%ptr2 = bitcast i32 addrspace(3)* @lds.2 to i8 addrspace(3)*
store i8 2, i8 addrspace(3)* %ptr2, align 4
ret void
}

@lds.3 = internal unnamed_addr addrspace(3) global [32 x i8] undef, align 1

; Use constant twice from the same kernel but a different other constant.
define amdgpu_kernel void @k3(i64 %x) {
; CHECK-LABEL: @k3(
; CHECK-NEXT: %ptr1 = addrspacecast i64 addrspace(3)* bitcast (i8 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k3.lds.t, %llvm.amdgcn.kernel.k3.lds.t addrspace(3)* @llvm.amdgcn.kernel.k3.lds, i32 0, i32 0, i32 16) to i64 addrspace(3)*) to i64*
; CHECK-NEXT: store i64 1, i64* %ptr1, align 1
; CHECK-NEXT: %ptr2 = addrspacecast i64 addrspace(3)* bitcast (i8 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k3.lds.t, %llvm.amdgcn.kernel.k3.lds.t addrspace(3)* @llvm.amdgcn.kernel.k3.lds, i32 0, i32 0, i32 24) to i64 addrspace(3)*) to i64*
; CHECK-NEXT: store i64 2, i64* %ptr2, align 1
;
%ptr1 = addrspacecast i64 addrspace(3)* bitcast (i8 addrspace(3)* getelementptr inbounds ([32 x i8], [32 x i8] addrspace(3)* @lds.3, i32 0, i32 16) to i64 addrspace(3)*) to i64*
store i64 1, i64* %ptr1, align 1
%ptr2 = addrspacecast i64 addrspace(3)* bitcast (i8 addrspace(3)* getelementptr inbounds ([32 x i8], [32 x i8] addrspace(3)* @lds.3, i32 0, i32 24) to i64 addrspace(3)*) to i64*
store i64 2, i64* %ptr2, align 1
ret void
}
test/CodeGen/AMDGPU/lower-kernel-lds.ll (new file, 65 lines)
@@ -0,0 +1,65 @@
; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s | FileCheck %s
; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck %s

@lds.size.1.align.1 = internal unnamed_addr addrspace(3) global [1 x i8] undef, align 1
@lds.size.2.align.2 = internal unnamed_addr addrspace(3) global [2 x i8] undef, align 2
@lds.size.4.align.4 = internal unnamed_addr addrspace(3) global [4 x i8] undef, align 4
@lds.size.8.align.8 = internal unnamed_addr addrspace(3) global [8 x i8] undef, align 8
@lds.size.16.align.16 = internal unnamed_addr addrspace(3) global [16 x i8] undef, align 16

; CHECK: %llvm.amdgcn.kernel.k0.lds.t = type { [16 x i8], [4 x i8], [2 x i8], [1 x i8] }
; CHECK: %llvm.amdgcn.kernel.k1.lds.t = type { [16 x i8], [4 x i8], [2 x i8] }

;.
; CHECK: @lds.size.8.align.8 = internal unnamed_addr addrspace(3) global [8 x i8] undef, align 8
; CHECK: @llvm.amdgcn.kernel.k0.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k0.lds.t undef, align 16
; CHECK: @llvm.amdgcn.kernel.k1.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k1.lds.t undef, align 16
;.
define amdgpu_kernel void @k0() {
; CHECK-LABEL: @k0(
; CHECK-NEXT: %lds.size.1.align.1.bc = bitcast [1 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, %llvm.amdgcn.kernel.k0.lds.t addrspace(3)* @llvm.amdgcn.kernel.k0.lds, i32 0, i32 3) to i8 addrspace(3)*
; CHECK-NEXT: store i8 1, i8 addrspace(3)* %lds.size.1.align.1.bc, align 1
; CHECK-NEXT: %lds.size.2.align.2.bc = bitcast [2 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, %llvm.amdgcn.kernel.k0.lds.t addrspace(3)* @llvm.amdgcn.kernel.k0.lds, i32 0, i32 2) to i8 addrspace(3)*
; CHECK-NEXT: store i8 2, i8 addrspace(3)* %lds.size.2.align.2.bc, align 2
; CHECK-NEXT: %lds.size.4.align.4.bc = bitcast [4 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, %llvm.amdgcn.kernel.k0.lds.t addrspace(3)* @llvm.amdgcn.kernel.k0.lds, i32 0, i32 1) to i8 addrspace(3)*
; CHECK-NEXT: store i8 4, i8 addrspace(3)* %lds.size.4.align.4.bc, align 4
; CHECK-NEXT: %lds.size.16.align.16.bc = bitcast [16 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, %llvm.amdgcn.kernel.k0.lds.t addrspace(3)* @llvm.amdgcn.kernel.k0.lds, i32 0, i32 0) to i8 addrspace(3)*
; CHECK-NEXT: store i8 16, i8 addrspace(3)* %lds.size.16.align.16.bc, align 16
; CHECK-NEXT: ret void
;
%lds.size.1.align.1.bc = bitcast [1 x i8] addrspace(3)* @lds.size.1.align.1 to i8 addrspace(3)*
store i8 1, i8 addrspace(3)* %lds.size.1.align.1.bc, align 1

%lds.size.2.align.2.bc = bitcast [2 x i8] addrspace(3)* @lds.size.2.align.2 to i8 addrspace(3)*
store i8 2, i8 addrspace(3)* %lds.size.2.align.2.bc, align 2

%lds.size.4.align.4.bc = bitcast [4 x i8] addrspace(3)* @lds.size.4.align.4 to i8 addrspace(3)*
store i8 4, i8 addrspace(3)* %lds.size.4.align.4.bc, align 4

%lds.size.16.align.16.bc = bitcast [16 x i8] addrspace(3)* @lds.size.16.align.16 to i8 addrspace(3)*
store i8 16, i8 addrspace(3)* %lds.size.16.align.16.bc, align 16

ret void
}

define amdgpu_kernel void @k1() {
; CHECK-LABEL: @k1(
; CHECK-NEXT: %lds.size.2.align.2.bc = bitcast [2 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, %llvm.amdgcn.kernel.k1.lds.t addrspace(3)* @llvm.amdgcn.kernel.k1.lds, i32 0, i32 2) to i8 addrspace(3)*
; CHECK-NEXT: store i8 2, i8 addrspace(3)* %lds.size.2.align.2.bc, align 2
; CHECK-NEXT: %lds.size.4.align.4.bc = bitcast [4 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, %llvm.amdgcn.kernel.k1.lds.t addrspace(3)* @llvm.amdgcn.kernel.k1.lds, i32 0, i32 1) to i8 addrspace(3)*
; CHECK-NEXT: store i8 4, i8 addrspace(3)* %lds.size.4.align.4.bc, align 4
; CHECK-NEXT: %lds.size.16.align.16.bc = bitcast [16 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, %llvm.amdgcn.kernel.k1.lds.t addrspace(3)* @llvm.amdgcn.kernel.k1.lds, i32 0, i32 0) to i8 addrspace(3)*
; CHECK-NEXT: store i8 16, i8 addrspace(3)* %lds.size.16.align.16.bc, align 16
; CHECK-NEXT: ret void
;
%lds.size.2.align.2.bc = bitcast [2 x i8] addrspace(3)* @lds.size.2.align.2 to i8 addrspace(3)*
store i8 2, i8 addrspace(3)* %lds.size.2.align.2.bc, align 2

%lds.size.4.align.4.bc = bitcast [4 x i8] addrspace(3)* @lds.size.4.align.4 to i8 addrspace(3)*
store i8 4, i8 addrspace(3)* %lds.size.4.align.4.bc, align 4

%lds.size.16.align.16.bc = bitcast [16 x i8] addrspace(3)* @lds.size.16.align.16 to i8 addrspace(3)*
store i8 16, i8 addrspace(3)* %lds.size.16.align.16.bc, align 16

ret void
}
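The struct layouts checked above make the ordering visible: @k0's variables come out as { [16 x i8], [4 x i8], [2 x i8], [1 x i8] }, i.e. sorted by decreasing alignment and, on ties, decreasing size. A minimal C++ sketch of that ordering step follows; the helper name sortByAlignmentThenSize is illustrative, not the pass's actual API.

#include <algorithm>
#include <vector>

#include "llvm/IR/DataLayout.h"
#include "llvm/IR/GlobalVariable.h"

using namespace llvm;

// Hypothetical helper: order LDS variables so the most strictly aligned,
// and on ties the largest, come first. For these tests, packing fields in
// this order leaves no padding between them, and the struct as a whole
// takes the first (maximum) alignment, hence the "align 16" on the
// CHECK'd @llvm.amdgcn.kernel.*.lds globals.
static void sortByAlignmentThenSize(std::vector<GlobalVariable *> &Vars,
                                    const DataLayout &DL) {
  auto alignOf = [&DL](const GlobalVariable *GV) -> uint64_t {
    // Prefer the variable's explicit alignment (the tests set align N),
    // falling back to the ABI alignment of its value type.
    return DL.getValueOrABITypeAlignment(GV->getAlign(), GV->getValueType())
        .value();
  };
  std::stable_sort(Vars.begin(), Vars.end(),
                   [&](const GlobalVariable *A, const GlobalVariable *B) {
                     uint64_t AA = alignOf(A), BA = alignOf(B);
                     if (AA != BA)
                       return AA > BA; // higher alignment first
                     uint64_t ASz = DL.getTypeAllocSize(A->getValueType());
                     uint64_t BSz = DL.getTypeAllocSize(B->getValueType());
                     return ASz > BSz; // then larger first
                   });
}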
@ -5,8 +5,8 @@

@func = addrspace(3) global float undef, align 4

; @kern is only used from a kernel so it is left unchanged
; CHECK: @kern = addrspace(3) global float undef, align 4
; CHECK: %llvm.amdgcn.kernel.timestwo.lds.t = type { float }

@kern = addrspace(3) global float undef, align 4

; @func is only used from a non-kernel function so is rewritten
@ -17,6 +17,7 @@
@both = addrspace(3) global float undef, align 4

; CHECK: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t undef, align 4
; CHECK: @llvm.amdgcn.kernel.timestwo.lds = internal addrspace(3) global %llvm.amdgcn.kernel.timestwo.lds.t undef, align 4

; CHECK-LABEL: @get_func()
; CHECK: %0 = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
@ -36,9 +37,9 @@ entry:

; CHECK-LABEL: @timestwo()
; CHECK: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ]
; CHECK: %ld = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @kern to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
; CHECK: %ld = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (%llvm.amdgcn.kernel.timestwo.lds.t addrspace(3)* @llvm.amdgcn.kernel.timestwo.lds to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
; CHECK: %mul = mul i32 %ld, 2
; CHECK: store i32 %mul, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @kern to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
; CHECK: store i32 %mul, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (%llvm.amdgcn.kernel.timestwo.lds.t addrspace(3)* @llvm.amdgcn.kernel.timestwo.lds to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
define amdgpu_kernel void @timestwo() {
%ld = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @both to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @kern to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
%mul = mul i32 %ld, 2
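The @timestwo checks above show the two destinations: @both, also reachable from the non-kernel @get_func, stays in @llvm.amdgcn.module.lds, while the kernel-only @kern moves into @llvm.amdgcn.kernel.timestwo.lds. Below is a rough C++ sketch of that classification, under the assumption that it hinges on whether any use, possibly via constant expressions, reaches a non-kernel function; usedFromNonKernel is a hypothetical name, not the pass's API.

#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Instruction.h"

using namespace llvm;

// Hypothetical sketch: does any use of GV occur inside a function that is
// not an amdgpu_kernel? If so, GV must go in the module-wide struct;
// otherwise it can be lowered into the per-kernel struct of the kernel
// that uses it.
static bool usedFromNonKernel(const GlobalVariable &GV) {
  SmallVector<const User *, 16> Stack(GV.user_begin(), GV.user_end());
  SmallPtrSet<const User *, 16> Visited;
  while (!Stack.empty()) {
    const User *U = Stack.pop_back_val();
    if (!Visited.insert(U).second)
      continue;
    if (const auto *I = dyn_cast<Instruction>(U)) {
      if (I->getFunction()->getCallingConv() != CallingConv::AMDGPU_KERNEL)
        return true;
    } else {
      // Constant expressions (bitcasts, GEPs, ...) are not anchored in a
      // function; walk through them to the instructions that use them.
      Stack.append(U->user_begin(), U->user_end());
    }
  }
  return false;
}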
@ -7,7 +7,7 @@
; CHECK-NOT: llvm.amdgcn.module.lds.t

; var1, var2 would be transformed were they used from a non-kernel function
; CHECK: @var1 = addrspace(3) global i32 undef
; CHECK-NOT: @var1 =
; CHECK: @var2 = addrspace(3) global float undef
@var1 = addrspace(3) global i32 undef
@var2 = addrspace(3) global float undef
@ -36,7 +36,7 @@
@toself = addrspace(3) global float addrspace(3)* bitcast (float addrspace(3)* addrspace(3)* @toself to float addrspace(3)*), align 8

; Use by .used lists doesn't trigger lowering
; CHECK: @llvm.used = appending global [1 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (i32 addrspace(3)* @var1 to i8 addrspace(3)*) to i8*)], section "llvm.metadata"
; CHECK-NOT: @llvm.used =
@llvm.used = appending global [1 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (i32 addrspace(3)* @var1 to i8 addrspace(3)*) to i8*)], section "llvm.metadata"

; CHECK: @llvm.compiler.used = appending global [1 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (float addrspace(3)* @var2 to i8 addrspace(3)*) to i8*)], section "llvm.metadata"
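These @llvm.used / @llvm.compiler.used checks rely on the pass telling list membership apart from genuine uses. A small C++ sketch of gathering that set first, so a later use-scan can ignore it; collectUsedList is an illustrative name, and the casts stripped are the addrspacecast/bitcast wrappers visible in the CHECK lines above.

#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Module.h"

using namespace llvm;

// Hypothetical sketch: collect the globals named by @llvm.used and
// @llvm.compiler.used so that membership there is not mistaken for a
// real use when deciding which LDS variables to lower.
static SmallPtrSet<const GlobalValue *, 8> collectUsedList(const Module &M) {
  SmallPtrSet<const GlobalValue *, 8> Used;
  for (const char *Name : {"llvm.used", "llvm.compiler.used"}) {
    const GlobalVariable *GV = M.getNamedGlobal(Name);
    if (!GV || !GV->hasInitializer())
      continue;
    // Assumes a non-empty array initializer, as in this test.
    const auto *Init = dyn_cast<ConstantArray>(GV->getInitializer());
    if (!Init)
      continue;
    for (const Use &Op : Init->operands())
      if (const auto *G = dyn_cast<GlobalValue>(Op->stripPointerCasts()))
        Used.insert(G);
  }
  return Used;
}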
@ -58,9 +58,8 @@ define void @use_variables() {
ret void
}

; Use by kernel doesn't trigger lowering
; CHECK-LABEL: @kern_use()
; CHECK: %inc = atomicrmw add i32 addrspace(3)* @var1, i32 1 monotonic
; CHECK: %inc = atomicrmw add i32 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.kern_use.lds.t, %llvm.amdgcn.kernel.kern_use.lds.t addrspace(3)* @llvm.amdgcn.kernel.kern_use.lds, i32 0, i32 0), i32 1 monotonic, align 4
define amdgpu_kernel void @kern_use() {
%inc = atomicrmw add i32 addrspace(3)* @var1, i32 1 monotonic
call void @use_variables()
@ -20,7 +20,7 @@ define amdgpu_kernel void @k0() {
; OPT-NEXT: call void @llvm.donothing() [ "ExplicitUse"([[LLVM_AMDGCN_MODULE_LDS_T:%.*]] addrspace(3)* @llvm.amdgcn.module.lds) ]
; OPT-NEXT: [[LDS_SIZE_1_ALIGN_1_BC:%.*]] = bitcast [1 x i8] addrspace(3)* getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T]], [[LLVM_AMDGCN_MODULE_LDS_T]] addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0) to i8 addrspace(3)*
; OPT-NEXT: store i8 1, i8 addrspace(3)* [[LDS_SIZE_1_ALIGN_1_BC]], align 1
; OPT-NEXT: [[LDS_SIZE_16_ALIGN_16_BC:%.*]] = bitcast [16 x i8] addrspace(3)* @lds.size.16.align.16 to i8 addrspace(3)*
; OPT-NEXT: [[LDS_SIZE_16_ALIGN_16_BC:%.*]] = bitcast [16 x i8] addrspace(3)* getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_K0_LDS_T:%.*]], [[LLVM_AMDGCN_KERNEL_K0_LDS_T]] addrspace(3)* @llvm.amdgcn.kernel.k0.lds, i32 0, i32 0) to i8 addrspace(3)*
; OPT-NEXT: store i8 2, i8 addrspace(3)* [[LDS_SIZE_16_ALIGN_16_BC]], align 16
; OPT-NEXT: ret void
;
@ -8,8 +8,8 @@
; IR-LABEL: define amdgpu_kernel void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {
; IR: alloca [10 x i32]
; ASM-LABEL: {{^}}promote_alloca_size_256:
; ASM: .amdgpu_lds global_array0, 30000, 4
; ASM: .amdgpu_lds global_array1, 30000, 4
; ASM: .amdgpu_lds llvm.amdgcn.kernel.promote_alloca_size_256.lds, 60000, 4
; ASM-NOT: .amdgpu_lds

define amdgpu_kernel void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {
entry:
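Worked size check for the new directive: the two arrays previously emitted as separate 30000-byte symbols become consecutive fields of one per-kernel struct. With 4-byte alignment and 30000 divisible by 4, no inter-field padding is needed, so the struct occupies 30000 + 30000 = 60000 bytes, matching the single .amdgpu_lds line above.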
@ -1,4 +1,4 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri --amdhsa-code-object-version=2 -disable-promote-alloca-to-vector < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri --amdhsa-code-object-version=2 -disable-promote-alloca-to-vector -amdgpu-enable-lower-module-lds=0 < %s | FileCheck -check-prefix=GCN %s

; This shows that the LDS size estimate should try to not be
; sensitive to the order of the LDS globals. This should try to
@ -34,11 +34,7 @@ define amdgpu_kernel void @load_shl_base_lds_0(float addrspace(1)* %out, i32 add

; GCN-LABEL: {{^}}load_shl_base_lds_1:
; GCN: v_lshlrev_b32_e32 [[OFS:v[0-9]+]], 2, {{v[0-9]+}}

; TODO: integrate into the ds_read_b32 offset using a 16-bit relocation
; GCN: v_add_{{[iu]}}32_e32 [[PTR:v[0-9]+]], vcc, lds0@abs32@lo, [[OFS]]

; GCN: ds_read_b32 [[RESULT:v[0-9]+]], [[PTR]] offset:8
; GCN: ds_read_b32 [[RESULT:v[0-9]+]], [[OFS]] offset:8
; GCN: v_add_{{[iu]}}32_e32 [[ADDUSE:v[0-9]+]], vcc, 8, v{{[0-9]+}}
; GCN-DAG: buffer_store_dword [[RESULT]]
; GCN-DAG: buffer_store_dword [[ADDUSE]]
@ -72,18 +68,10 @@ define amdgpu_kernel void @load_shl_base_lds_max_offset(i8 addrspace(1)* %out, i
; The two globals are placed adjacent in memory, so the same base
; pointer can be used with an offset into the second one.

; TODO: Recover the optimization of using ds_read2st64_b32 using alignment hints

; GCN-LABEL: {{^}}load_shl_base_lds_2:
; GCN: v_lshlrev_b32_e32 [[OFS:v[0-9]+]], 2, {{v[0-9]+}}
; GCN-DAG: v_add_{{[iu]}}32_e32 [[PTR0:v[0-9]+]], vcc, lds0@abs32@lo, [[OFS]]
; GCN-DAG: v_add_{{[iu]}}32_e32 [[PTR1:v[0-9]+]], vcc, lds1@abs32@lo, [[OFS]]
; GCN-DAG: s_mov_b32 m0, -1

; GCN-DAG: ds_read_b32 {{v[0-9]+}}, [[PTR0]] offset:256
; GCN-DAG: ds_read_b32 {{v[0-9]+}}, [[PTR1]] offset:256
; TODO: ds_read2st64_b32 {{v\[[0-9]+:[0-9]+\]}}, [[PTR]] offset0:1 offset1:9

; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
; GCN: s_mov_b32 m0, -1
; GCN: ds_read2st64_b32 {{v\[[0-9]+:[0-9]+\]}}, [[PTR]] offset0:1 offset1:9
; GCN: s_endpgm
define amdgpu_kernel void @load_shl_base_lds_2(float addrspace(1)* %out) #0 {
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
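A quick offset check on the recovered ds_read2st64_b32: its offsets count in units of 64 dwords, i.e. 256 bytes, so offset0:1 reads base + 256 bytes and offset1:9 reads base + 2304 bytes. Both accesses sit 256 bytes into their array (the old per-pointer checks show offset:256 on [[PTR0]] and [[PTR1]] alike), so the 2304 - 256 = 2048-byte gap must equal @lds0's allocation size; that is, @lds1 is laid out immediately after @lds0. The pairing into one instruction is only legal because the struct lowering guarantees this adjacency.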
@ -10,7 +10,7 @@
; CHECK: machineFunctionInfo:
; CHECK-NEXT: explicitKernArgSize: 128
; CHECK-NEXT: maxKernArgAlign: 64
; CHECK-NEXT: ldsSize: 0
; CHECK-NEXT: ldsSize: 2048
; CHECK-NEXT: dynLDSAlign: 1
; CHECK-NEXT: isEntryFunction: true
; CHECK-NEXT: noSignedZerosFPMath: false
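The ldsSize change from 0 to 2048 is the same lowering seen from the MIR side: presumably the kernel's LDS variables are now gathered into a single struct that is allocated up front, so its 2048 bytes are already recorded in machineFunctionInfo rather than left at zero until allocation.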