[AMDGPU] Change constant addr space to 4

Differential Revision: https://reviews.llvm.org/D43170 llvm-svn: 325030
2025-01-31 12:41:49 +01:00 · 2018-02-13 18:00:25 +00:00 · 2018-02-13 18:00:25 +00:00 · c6e831c09d
commit c6e831c09d
parent 3f5aaeaf08
90 changed files with 1240 additions and 1249 deletions
--- a/docs/AMDGPUUsage.rst
+++ b/docs/AMDGPUUsage.rst
@ -270,27 +270,17 @@ LLVM Address Space number is used throughout LLVM (for example, in LLVM IR).
  .. table:: Address Space Mapping
     :name: amdgpu-address-space-mapping-table

-     ================== ================= =================
+     ================== =================
     LLVM Address Space Memory Space
-     ------------------ -----------------------------------
-     \                  Current Default   Future Default
-     ================== ================= =================
-     0                  Generic (Flat)    Generic (Flat)
-     1                  Global            Global
-     2                  Constant          Region (GDS)
-     3                  Local (group/LDS) Local (group/LDS)
-     4                  Region (GDS)      Constant
-     5                  Private (Scratch) Private (Scratch)
-     6                  Constant 32-bit   Constant 32-bit
-     ================== ================= =================
-
-Current Default
-  This is the current default address space mapping used for all languages.
-  This will shortly be deprecated.
-
-Future Default
-  This will shortly be the only address space mapping for all languages using
-  AMDGPU backend.
+     ================== =================
+     0                  Generic (Flat)
+     1                  Global
+     2                  Region (GDS)
+     3                  Local (group/LDS)
+     4                  Constant
+     5                  Private (Scratch)
+     6                  Constant 32-bit
+     ================== =================

 .. _amdgpu-memory-scopes:

--- a/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/include/llvm/IR/IntrinsicsAMDGPU.td
@ -83,22 +83,22 @@ defm int_amdgcn_workgroup_id : AMDGPUReadPreloadRegisterIntrinsic_xyz_named

 def int_amdgcn_dispatch_ptr :
  GCCBuiltin<"__builtin_amdgcn_dispatch_ptr">,
-  Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 2>], [],
+  Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 4>], [],
  [IntrNoMem, IntrSpeculatable]>;

 def int_amdgcn_queue_ptr :
  GCCBuiltin<"__builtin_amdgcn_queue_ptr">,
-  Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 2>], [],
+  Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 4>], [],
  [IntrNoMem, IntrSpeculatable]>;

 def int_amdgcn_kernarg_segment_ptr :
  GCCBuiltin<"__builtin_amdgcn_kernarg_segment_ptr">,
-  Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 2>], [],
+  Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 4>], [],
  [IntrNoMem, IntrSpeculatable]>;

 def int_amdgcn_implicitarg_ptr :
  GCCBuiltin<"__builtin_amdgcn_implicitarg_ptr">,
-  Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 2>], [],
+  Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 4>], [],
  [IntrNoMem, IntrSpeculatable]>;

 def int_amdgcn_groupstaticsize :
@ -111,7 +111,7 @@ def int_amdgcn_dispatch_id :

 def int_amdgcn_implicit_buffer_ptr :
  GCCBuiltin<"__builtin_amdgcn_implicit_buffer_ptr">,
-  Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 2>], [],
+  Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 4>], [],
  [IntrNoMem, IntrSpeculatable]>;

 // Set EXEC to the 64-bit value given.
--- a/lib/Target/AMDGPU/AMDGPU.h
+++ b/lib/Target/AMDGPU/AMDGPU.h
@ -222,7 +222,7 @@ struct AMDGPUAS {
    MAX_COMMON_ADDRESS = 5,

    GLOBAL_ADDRESS = 1,   ///< Address space for global memory (RAT0, VTX0).
-    CONSTANT_ADDRESS = 2, ///< Address space for constant memory (VTX2)
+    CONSTANT_ADDRESS = 4, ///< Address space for constant memory (VTX2)
    LOCAL_ADDRESS = 3,    ///< Address space for local memory.

    CONSTANT_ADDRESS_32BIT = 6, ///< Address space for 32-bit constant memory
--- a/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp
@ -61,7 +61,7 @@ AMDGPUAAResult::ASAliasRulesTy::ASAliasRulesTy(AMDGPUAS AS_, Triple::ArchType Ar
  /* Region   */ {NoAlias , NoAlias , NoAlias , NoAlias , MayAlias, MayAlias}
  };
  static const AliasResult ASAliasRulesGenIsZero[6][6] = {
-  /*             Flat       Global    Constant  Group     Region    Private */
+  /*             Flat       Global    Region    Group     Constant  Private */
  /* Flat     */ {MayAlias, MayAlias, MayAlias, MayAlias, MayAlias, MayAlias},
  /* Global   */ {MayAlias, MayAlias, NoAlias , NoAlias , NoAlias , NoAlias},
  /* Constant */ {MayAlias, NoAlias , MayAlias, NoAlias , NoAlias,  NoAlias},
@ -72,9 +72,9 @@ AMDGPUAAResult::ASAliasRulesTy::ASAliasRulesTy(AMDGPUAS AS_, Triple::ArchType Ar
  assert(AS.MAX_COMMON_ADDRESS <= 5);
  if (AS.FLAT_ADDRESS == 0) {
    assert(AS.GLOBAL_ADDRESS   == 1 &&
-           AS.REGION_ADDRESS   == 4 &&
+           AS.REGION_ADDRESS   == 2 &&
           AS.LOCAL_ADDRESS    == 3 &&
-           AS.CONSTANT_ADDRESS == 2 &&
+           AS.CONSTANT_ADDRESS == 4 &&
           AS.PRIVATE_ADDRESS  == 5);
    ASAliasRules = &ASAliasRulesGenIsZero;
  } else {
--- a/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@ -116,7 +116,7 @@ bool AMDGPUCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,

  if (Info->hasKernargSegmentPtr()) {
    unsigned InputPtrReg = Info->addKernargSegmentPtr(*TRI);
-    const LLT P2 = LLT::pointer(2, 64);
+    const LLT P2 = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
    unsigned VReg = MRI.createGenericVirtualRegister(P2);
    MRI.addLiveIn(InputPtrReg, VReg);
    MIRBuilder.getMBB().addLiveIn(InputPtrReg);
--- a/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@ -12,6 +12,7 @@
 /// \todo This should be generated by TableGen.
 //===----------------------------------------------------------------------===//

+#include "AMDGPU.h"
 #include "AMDGPULegalizerInfo.h"
 #include "llvm/CodeGen/TargetOpcodes.h"
 #include "llvm/CodeGen/ValueTypes.h"
@ -29,8 +30,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo() {
  const LLT V2S16 = LLT::vector(2, 16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
-  const LLT P1 = LLT::pointer(1, 64);
-  const LLT P2 = LLT::pointer(2, 64);
+  const LLT P1 = LLT::pointer(AMDGPUAS::GLOBAL_ADDRESS, 64);
+  const LLT P2 = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);

  setAction({G_ADD, S32}, Legal);
  setAction({G_AND, S32}, Legal);
--- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@ -266,7 +266,7 @@ static StringRef computeDataLayout(const Triple &TT) {

  // 32-bit private, local, and region pointers. 64-bit global, constant and
  // flat.
-    return "e-p:64:64-p1:64:64-p2:64:64-p3:32:32-p4:32:32-p5:32:32-p6:32:32"
+    return "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
         "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
         "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-A5";
 }
--- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@ -929,7 +929,7 @@ AMDGPUAS getAMDGPUAS(Triple T) {
  AMDGPUAS AS;
  AS.FLAT_ADDRESS = 0;
  AS.PRIVATE_ADDRESS = 5;
-  AS.REGION_ADDRESS = 4;
+  AS.REGION_ADDRESS = 2;
  return AS;
 }

--- a/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-smrd.mir
+++ b/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-smrd.mir
@ -5,7 +5,7 @@
 # REQUIRES: global-isel

 --- |
-  define amdgpu_kernel void @smrd_imm(i32 addrspace(2)* %const0) { ret void }
+  define amdgpu_kernel void @smrd_imm(i32 addrspace(4)* %const0) { ret void }
 ...
 ---

@ -91,50 +91,50 @@ body: |
  bb.0:
    liveins: $sgpr0_sgpr1

-    %0:sgpr(p2) = COPY $sgpr0_sgpr1
+    %0:sgpr(p4) = COPY $sgpr0_sgpr1

    %1:sgpr(s64) = G_CONSTANT i64 4
-    %2:sgpr(p2) = G_GEP %0, %1
+    %2:sgpr(p4) = G_GEP %0, %1
    %3:sgpr(s32) = G_LOAD %2 :: (load 4 from %ir.const0)
    $sgpr0 = COPY %3

    %4:sgpr(s64) = G_CONSTANT i64 1020
-    %5:sgpr(p2) = G_GEP %0, %4
+    %5:sgpr(p4) = G_GEP %0, %4
    %6:sgpr(s32) = G_LOAD %5 :: (load 4 from %ir.const0)
    $sgpr0 = COPY %6

    %7:sgpr(s64) = G_CONSTANT i64 1024
-    %8:sgpr(p2) = G_GEP %0, %7
+    %8:sgpr(p4) = G_GEP %0, %7
    %9:sgpr(s32) = G_LOAD %8 :: (load 4 from %ir.const0)
    $sgpr0 = COPY %9

    %10:sgpr(s64) = G_CONSTANT i64 1048572
-    %11:sgpr(p2) = G_GEP %0, %10
+    %11:sgpr(p4) = G_GEP %0, %10
    %12:sgpr(s32) = G_LOAD %11 :: (load 4 from %ir.const0)
    $sgpr0 = COPY %12

    %13:sgpr(s64) = G_CONSTANT i64 1048576
-    %14:sgpr(p2) = G_GEP %0, %13
+    %14:sgpr(p4) = G_GEP %0, %13
    %15:sgpr(s32) = G_LOAD %14 :: (load 4 from %ir.const0)
    $sgpr0 = COPY %15

    %16:sgpr(s64) = G_CONSTANT i64 17179869180
-    %17:sgpr(p2) = G_GEP %0, %16
+    %17:sgpr(p4) = G_GEP %0, %16
    %18:sgpr(s32) = G_LOAD %17 :: (load 4 from %ir.const0)
    $sgpr0 = COPY %18

    %19:sgpr(s64) = G_CONSTANT i64 17179869184
-    %20:sgpr(p2) = G_GEP %0, %19
+    %20:sgpr(p4) = G_GEP %0, %19
    %21:sgpr(s32) = G_LOAD %20 :: (load 4 from %ir.const0)
    $sgpr0 = COPY %21

    %22:sgpr(s64) = G_CONSTANT i64 4294967292
-    %23:sgpr(p2) = G_GEP %0, %22
+    %23:sgpr(p4) = G_GEP %0, %22
    %24:sgpr(s32) = G_LOAD %23 :: (load 4 from %ir.const0)
    $sgpr0 = COPY %24

    %25:sgpr(s64) = G_CONSTANT i64 4294967296
-    %26:sgpr(p2) = G_GEP %0, %25
+    %26:sgpr(p4) = G_GEP %0, %25
    %27:sgpr(s32) = G_LOAD %26 :: (load 4 from %ir.const0)
    $sgpr0 = COPY %27

--- a/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_vs.ll
+++ b/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_vs.ll
@ -18,28 +18,28 @@ define amdgpu_vs void @test_f32(float %arg0) {
 }

 ; CHECK-LABEL: name: test_ptr2_byval
-; CHECK: [[S01:%[0-9]+]]:_(p2) = COPY $sgpr0_sgpr1
+; CHECK: [[S01:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1
 ; CHECK: G_LOAD [[S01]]
-define amdgpu_vs void @test_ptr2_byval(i32 addrspace(2)* byval %arg0) {
-   %tmp0 = load volatile i32, i32 addrspace(2)* %arg0
+define amdgpu_vs void @test_ptr2_byval(i32 addrspace(4)* byval %arg0) {
+   %tmp0 = load volatile i32, i32 addrspace(4)* %arg0
  ret void
 }

 ; CHECK-LABEL: name: test_ptr2_inreg
-; CHECK: [[S01:%[0-9]+]]:_(p2) = COPY $sgpr0_sgpr1
+; CHECK: [[S01:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1
 ; CHECK: G_LOAD [[S01]]
-define amdgpu_vs void @test_ptr2_inreg(i32 addrspace(2)* inreg %arg0) {
-  %tmp0 = load volatile i32, i32 addrspace(2)* %arg0
+define amdgpu_vs void @test_ptr2_inreg(i32 addrspace(4)* inreg %arg0) {
+  %tmp0 = load volatile i32, i32 addrspace(4)* %arg0
  ret void
 }

 ; CHECK-LABEL: name: test_sgpr_alignment0
 ; CHECK: [[S0:%[0-9]+]]:_(s32) = COPY $sgpr0
-; CHECK: [[S23:%[0-9]+]]:_(p2) = COPY $sgpr2_sgpr3
+; CHECK: [[S23:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3
 ; CHECK: G_LOAD [[S23]]
 ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.exp), %{{[0-9]+}}(s32), %{{[0-9]+}}(s32), [[S0]]
-define amdgpu_vs void @test_sgpr_alignment0(float inreg %arg0, i32 addrspace(2)* inreg %arg1) {
-  %tmp0 = load volatile i32, i32 addrspace(2)* %arg1
+define amdgpu_vs void @test_sgpr_alignment0(float inreg %arg0, i32 addrspace(4)* inreg %arg1) {
+  %tmp0 = load volatile i32, i32 addrspace(4)* %arg1
  call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float %arg0, float undef, float undef, float undef, i1 false, i1 false) #0
  ret void
 }
--- a/test/CodeGen/AMDGPU/GlobalISel/regbankselect.mir
+++ b/test/CodeGen/AMDGPU/GlobalISel/regbankselect.mir
@ -3,7 +3,7 @@
 # REQUIRES: global-isel

 --- |
-  define amdgpu_kernel void @load_constant(i32 addrspace(2)* %ptr0) { ret void }
+  define amdgpu_kernel void @load_constant(i32 addrspace(4)* %ptr0) { ret void }
  define amdgpu_kernel void @load_global_uniform(i32 addrspace(1)* %ptr1) {
    %tmp0 = load i32, i32 addrspace(1)* %ptr1
    ret void
@ -30,7 +30,7 @@ legalized: true
 body: |
  bb.0:
    liveins: $sgpr0_sgpr1
-    %0:_(p2) = COPY $sgpr0_sgpr1
+    %0:_(p4) = COPY $sgpr0_sgpr1
    %1:_(s32) = G_LOAD %0 :: (load 4 from %ir.ptr0)
 ...

--- a/test/CodeGen/AMDGPU/GlobalISel/smrd.ll
+++ b/test/CodeGen/AMDGPU/GlobalISel/smrd.ll
@ -9,10 +9,10 @@
 ; GCN-LABEL: {{^}}smrd0:
 ; SICI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x1 ; encoding: [0x01
 ; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4
-define amdgpu_kernel void @smrd0(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
+define amdgpu_kernel void @smrd0(i32 addrspace(1)* %out, i32 addrspace(4)* %ptr) {
 entry:
-  %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 1
-  %1 = load i32, i32 addrspace(2)* %0
+  %0 = getelementptr i32, i32 addrspace(4)* %ptr, i64 1
+  %1 = load i32, i32 addrspace(4)* %0
  store i32 %1, i32 addrspace(1)* %out
  ret void
 }
@ -21,10 +21,10 @@ entry:
 ; GCN-LABEL: {{^}}smrd1:
 ; SICI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff,0x{{[0-9]+[137]}}
 ; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc
-define amdgpu_kernel void @smrd1(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
+define amdgpu_kernel void @smrd1(i32 addrspace(1)* %out, i32 addrspace(4)* %ptr) {
 entry:
-  %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 255
-  %1 = load i32, i32 addrspace(2)* %0
+  %0 = getelementptr i32, i32 addrspace(4)* %ptr, i64 255
+  %1 = load i32, i32 addrspace(4)* %0
  store i32 %1, i32 addrspace(1)* %out
  ret void
 }
@ -36,10 +36,10 @@ entry:
 ; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x100
 ; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400
 ; GCN: s_endpgm
-define amdgpu_kernel void @smrd2(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
+define amdgpu_kernel void @smrd2(i32 addrspace(1)* %out, i32 addrspace(4)* %ptr) {
 entry:
-  %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 256
-  %1 = load i32, i32 addrspace(2)* %0
+  %0 = getelementptr i32, i32 addrspace(4)* %ptr, i64 256
+  %1 = load i32, i32 addrspace(4)* %0
  store i32 %1, i32 addrspace(1)* %out
  ret void
 }
@ -51,10 +51,10 @@ entry:
 ; XSI: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[{{[0-9]:[0-9]}}], 0xb ; encoding: [0x0b
 ; TODO: Add VI checks
 ; XGCN: s_endpgm
-define amdgpu_kernel void @smrd3(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
+define amdgpu_kernel void @smrd3(i32 addrspace(1)* %out, i32 addrspace(4)* %ptr) {
 entry:
-  %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 4294967296 ; 2 ^ 32
-  %1 = load i32, i32 addrspace(2)* %0
+  %0 = getelementptr i32, i32 addrspace(4)* %ptr, i64 4294967296 ; 2 ^ 32
+  %1 = load i32, i32 addrspace(4)* %0
  store i32 %1, i32 addrspace(1)* %out
  ret void
 }
@ -65,10 +65,10 @@ entry:
 ; SI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
 ; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3ffff
 ; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xffffc
-define amdgpu_kernel void @smrd4(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
+define amdgpu_kernel void @smrd4(i32 addrspace(1)* %out, i32 addrspace(4)* %ptr) {
 entry:
-  %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 262143
-  %1 = load i32, i32 addrspace(2)* %0
+  %0 = getelementptr i32, i32 addrspace(4)* %ptr, i64 262143
+  %1 = load i32, i32 addrspace(4)* %0
  store i32 %1, i32 addrspace(1)* %out
  ret void
 }
@ -79,10 +79,10 @@ entry:
 ; SIVI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
 ; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x40000
 ; GCN: s_endpgm
-define amdgpu_kernel void @smrd5(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
+define amdgpu_kernel void @smrd5(i32 addrspace(1)* %out, i32 addrspace(4)* %ptr) {
 entry:
-  %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 262144
-  %1 = load i32, i32 addrspace(2)* %0
+  %0 = getelementptr i32, i32 addrspace(4)* %ptr, i64 262144
+  %1 = load i32, i32 addrspace(4)* %0
  store i32 %1, i32 addrspace(1)* %out
  ret void
 }
--- a/test/CodeGen/AMDGPU/add.v2i16.ll
+++ b/test/CodeGen/AMDGPU/add.v2i16.ll
@ -27,9 +27,9 @@ define amdgpu_kernel void @v_test_add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i

 ; VI: s_add_i32
 ; VI: s_add_i32
-define amdgpu_kernel void @s_test_add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %in0, <2 x i16> addrspace(2)* %in1) #1 {
-  %a = load <2 x i16>, <2 x i16> addrspace(2)* %in0
-  %b = load <2 x i16>, <2 x i16> addrspace(2)* %in1
+define amdgpu_kernel void @s_test_add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %in0, <2 x i16> addrspace(4)* %in1) #1 {
+  %a = load <2 x i16>, <2 x i16> addrspace(4)* %in0
+  %b = load <2 x i16>, <2 x i16> addrspace(4)* %in1
  %add = add <2 x i16> %a, %b
  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
  ret void
@ -41,8 +41,8 @@ define amdgpu_kernel void @s_test_add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i

 ; VI: s_add_i32
 ; VI: s_add_i32
-define amdgpu_kernel void @s_test_add_self_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %in0) #1 {
-  %a = load <2 x i16>, <2 x i16> addrspace(2)* %in0
+define amdgpu_kernel void @s_test_add_self_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %in0) #1 {
+  %a = load <2 x i16>, <2 x i16> addrspace(4)* %in0
  %add = add <2 x i16> %a, %a
  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
  ret void
--- a/test/CodeGen/AMDGPU/addrspacecast.ll
+++ b/test/CodeGen/AMDGPU/addrspacecast.ll
@ -100,8 +100,8 @@ define amdgpu_kernel void @use_global_to_flat_addrspacecast(i32 addrspace(1)* %p
 ; HSA-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]]
 ; HSA-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]]
 ; HSA: flat_load_dword v{{[0-9]+}}, v{{\[}}[[VPTRLO]]:[[VPTRHI]]{{\]}}
-define amdgpu_kernel void @use_constant_to_flat_addrspacecast(i32 addrspace(2)* %ptr) #0 {
-  %stof = addrspacecast i32 addrspace(2)* %ptr to i32*
+define amdgpu_kernel void @use_constant_to_flat_addrspacecast(i32 addrspace(4)* %ptr) #0 {
+  %stof = addrspacecast i32 addrspace(4)* %ptr to i32*
  %ld = load volatile i32, i32* %stof
  ret void
 }
@ -160,8 +160,8 @@ define amdgpu_kernel void @use_flat_to_global_addrspacecast(i32* %ptr) #0 {
 ; HSA: s_load_dwordx2 s{{\[}}[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]{{\]}}, s[4:5], 0x0
 ; HSA: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTRLO]]:[[PTRHI]]{{\]}}, 0x0
 define amdgpu_kernel void @use_flat_to_constant_addrspacecast(i32* %ptr) #0 {
-  %ftos = addrspacecast i32* %ptr to i32 addrspace(2)*
-  load volatile i32, i32 addrspace(2)* %ftos
+  %ftos = addrspacecast i32* %ptr to i32 addrspace(4)*
+  load volatile i32, i32 addrspace(4)* %ftos
  ret void
 }

--- a/test/CodeGen/AMDGPU/amdgcn.bitcast.ll
+++ b/test/CodeGen/AMDGPU/amdgcn.bitcast.ll
@ -4,9 +4,9 @@
 ; This test just checks that the compiler doesn't crash.

 ; FUNC-LABEL: {{^}}v32i8_to_v8i32:
-define amdgpu_ps float @v32i8_to_v8i32(<32 x i8> addrspace(2)* inreg) #0 {
+define amdgpu_ps float @v32i8_to_v8i32(<32 x i8> addrspace(4)* inreg) #0 {
 entry:
-  %1 = load <32 x i8>, <32 x i8> addrspace(2)* %0
+  %1 = load <32 x i8>, <32 x i8> addrspace(4)* %0
  %2 = bitcast <32 x i8> %1 to <8 x i32>
  %3 = extractelement <8 x i32> %2, i32 1
  %4 = icmp ne i32 %3, 0
--- a/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
+++ b/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
@ -48,12 +48,12 @@
 ; SI-ALLOCA: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; encoding: [0x00,0x10,0x70,0xe0


-; HSAOPT: [[DISPATCH_PTR:%[0-9]+]] = call noalias nonnull dereferenceable(64) i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr()
-; HSAOPT: [[CAST_DISPATCH_PTR:%[0-9]+]] = bitcast i8 addrspace(2)* [[DISPATCH_PTR]] to i32 addrspace(2)*
-; HSAOPT: [[GEP0:%[0-9]+]] = getelementptr inbounds i32, i32 addrspace(2)* [[CAST_DISPATCH_PTR]], i64 1
-; HSAOPT: [[LDXY:%[0-9]+]] = load i32, i32 addrspace(2)* [[GEP0]], align 4, !invariant.load !0
-; HSAOPT: [[GEP1:%[0-9]+]] = getelementptr inbounds i32, i32 addrspace(2)* [[CAST_DISPATCH_PTR]], i64 2
-; HSAOPT: [[LDZU:%[0-9]+]] = load i32, i32 addrspace(2)* [[GEP1]], align 4, !range !1, !invariant.load !0
+; HSAOPT: [[DISPATCH_PTR:%[0-9]+]] = call noalias nonnull dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
+; HSAOPT: [[CAST_DISPATCH_PTR:%[0-9]+]] = bitcast i8 addrspace(4)* [[DISPATCH_PTR]] to i32 addrspace(4)*
+; HSAOPT: [[GEP0:%[0-9]+]] = getelementptr inbounds i32, i32 addrspace(4)* [[CAST_DISPATCH_PTR]], i64 1
+; HSAOPT: [[LDXY:%[0-9]+]] = load i32, i32 addrspace(4)* [[GEP0]], align 4, !invariant.load !0
+; HSAOPT: [[GEP1:%[0-9]+]] = getelementptr inbounds i32, i32 addrspace(4)* [[CAST_DISPATCH_PTR]], i64 2
+; HSAOPT: [[LDZU:%[0-9]+]] = load i32, i32 addrspace(4)* [[GEP1]], align 4, !range !1, !invariant.load !0
 ; HSAOPT: [[EXTRACTY:%[0-9]+]] = lshr i32 [[LDXY]], 16

 ; HSAOPT: [[WORKITEM_ID_X:%[0-9]+]] = call i32 @llvm.amdgcn.workitem.id.x(), !range !2
--- a/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll
+++ b/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll
@ -8,10 +8,10 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0
 declare i32 @llvm.amdgcn.workitem.id.y() #0
 declare i32 @llvm.amdgcn.workitem.id.z() #0

-declare i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0
-declare i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #0
-declare i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() #0
-declare i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr() #0
+declare i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #0
+declare i8 addrspace(4)* @llvm.amdgcn.queue.ptr() #0
+declare i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() #0
+declare i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() #0
 declare i64 @llvm.amdgcn.dispatch.id() #0

 ; HSA: define void @use_workitem_id_x() #1 {
@ -58,15 +58,15 @@ define void @use_workgroup_id_z() #1 {

 ; HSA: define void @use_dispatch_ptr() #7 {
 define void @use_dispatch_ptr() #1 {
-  %dispatch.ptr = call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr()
-  store volatile i8 addrspace(2)* %dispatch.ptr, i8 addrspace(2)* addrspace(1)* undef
+  %dispatch.ptr = call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
+  store volatile i8 addrspace(4)* %dispatch.ptr, i8 addrspace(4)* addrspace(1)* undef
  ret void
 }

 ; HSA: define void @use_queue_ptr() #8 {
 define void @use_queue_ptr() #1 {
-  %queue.ptr = call i8 addrspace(2)* @llvm.amdgcn.queue.ptr()
-  store volatile i8 addrspace(2)* %queue.ptr, i8 addrspace(2)* addrspace(1)* undef
+  %queue.ptr = call i8 addrspace(4)* @llvm.amdgcn.queue.ptr()
+  store volatile i8 addrspace(4)* %queue.ptr, i8 addrspace(4)* addrspace(1)* undef
  ret void
 }

@ -186,22 +186,22 @@ define void @call_recursive_use_workitem_id_y() #1 {

 ; HSA: define void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #8 {
 define void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #1 {
-  %stof = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(4)*
-  store volatile i32 0, i32 addrspace(4)* %stof
+  %stof = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(2)*
+  store volatile i32 0, i32 addrspace(2)* %stof
  ret void
 }

 ; HSA: define void @use_group_to_flat_addrspacecast_gfx9(i32 addrspace(3)* %ptr) #12 {
 define void @use_group_to_flat_addrspacecast_gfx9(i32 addrspace(3)* %ptr) #2 {
-  %stof = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(4)*
-  store volatile i32 0, i32 addrspace(4)* %stof
+  %stof = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(2)*
+  store volatile i32 0, i32 addrspace(2)* %stof
  ret void
 }

 ; HSA: define void @use_group_to_flat_addrspacecast_queue_ptr_gfx9(i32 addrspace(3)* %ptr) #13 {
 define void @use_group_to_flat_addrspacecast_queue_ptr_gfx9(i32 addrspace(3)* %ptr) #2 {
-  %stof = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(4)*
-  store volatile i32 0, i32 addrspace(4)* %stof
+  %stof = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(2)*
+  store volatile i32 0, i32 addrspace(2)* %stof
  call void @func_indirect_use_queue_ptr()
  ret void
 }
@ -226,8 +226,8 @@ define void @indirect_use_group_to_flat_addrspacecast_queue_ptr_gfx9() #1 {

 ; HSA: define void @use_kernarg_segment_ptr() #14 {
 define void @use_kernarg_segment_ptr() #1 {
-  %kernarg.segment.ptr = call i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr()
-  store volatile i8 addrspace(2)* %kernarg.segment.ptr, i8 addrspace(2)* addrspace(1)* undef
+  %kernarg.segment.ptr = call i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+  store volatile i8 addrspace(4)* %kernarg.segment.ptr, i8 addrspace(4)* addrspace(1)* undef
  ret void
 }

@ -239,15 +239,15 @@ define void @func_indirect_use_kernarg_segment_ptr() #1 {

 ; HSA: define amdgpu_kernel void @kern_use_implicitarg_ptr() #15 {
 define amdgpu_kernel void @kern_use_implicitarg_ptr() #1 {
-  %implicitarg.ptr = call i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr()
-  store volatile i8 addrspace(2)* %implicitarg.ptr, i8 addrspace(2)* addrspace(1)* undef
+  %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
+  store volatile i8 addrspace(4)* %implicitarg.ptr, i8 addrspace(4)* addrspace(1)* undef
  ret void
 }

 ; HSA: define void @use_implicitarg_ptr() #15 {
 define void @use_implicitarg_ptr() #1 {
-  %implicitarg.ptr = call i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr()
-  store volatile i8 addrspace(2)* %implicitarg.ptr, i8 addrspace(2)* addrspace(1)* undef
+  %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
+  store volatile i8 addrspace(4)* %implicitarg.ptr, i8 addrspace(4)* addrspace(1)* undef
  ret void
 }

--- a/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll
+++ b/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll
@ -8,9 +8,9 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0
 declare i32 @llvm.amdgcn.workitem.id.y() #0
 declare i32 @llvm.amdgcn.workitem.id.z() #0

-declare i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0
-declare i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #0
-declare i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() #0
+declare i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #0
+declare i8 addrspace(4)* @llvm.amdgcn.queue.ptr() #0
+declare i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() #0

 ; HSA: define amdgpu_kernel void @use_tgid_x(i32 addrspace(1)* %ptr) #1 {
 define amdgpu_kernel void @use_tgid_x(i32 addrspace(1)* %ptr) #1 {
@ -149,27 +149,27 @@ define amdgpu_kernel void @use_all_workitems(i32 addrspace(1)* %ptr) #1 {

 ; HSA: define amdgpu_kernel void @use_dispatch_ptr(i32 addrspace(1)* %ptr) #10 {
 define amdgpu_kernel void @use_dispatch_ptr(i32 addrspace(1)* %ptr) #1 {
-  %dispatch.ptr = call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr()
-  %bc = bitcast i8 addrspace(2)* %dispatch.ptr to i32 addrspace(2)*
-  %val = load i32, i32 addrspace(2)* %bc
+  %dispatch.ptr = call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
+  %bc = bitcast i8 addrspace(4)* %dispatch.ptr to i32 addrspace(4)*
+  %val = load i32, i32 addrspace(4)* %bc
  store i32 %val, i32 addrspace(1)* %ptr
  ret void
 }

 ; HSA: define amdgpu_kernel void @use_queue_ptr(i32 addrspace(1)* %ptr) #11 {
 define amdgpu_kernel void @use_queue_ptr(i32 addrspace(1)* %ptr) #1 {
-  %dispatch.ptr = call i8 addrspace(2)* @llvm.amdgcn.queue.ptr()
-  %bc = bitcast i8 addrspace(2)* %dispatch.ptr to i32 addrspace(2)*
-  %val = load i32, i32 addrspace(2)* %bc
+  %dispatch.ptr = call i8 addrspace(4)* @llvm.amdgcn.queue.ptr()
+  %bc = bitcast i8 addrspace(4)* %dispatch.ptr to i32 addrspace(4)*
+  %val = load i32, i32 addrspace(4)* %bc
  store i32 %val, i32 addrspace(1)* %ptr
  ret void
 }

 ; HSA: define amdgpu_kernel void @use_kernarg_segment_ptr(i32 addrspace(1)* %ptr) #12 {
 define amdgpu_kernel void @use_kernarg_segment_ptr(i32 addrspace(1)* %ptr) #1 {
-  %dispatch.ptr = call i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr()
-  %bc = bitcast i8 addrspace(2)* %dispatch.ptr to i32 addrspace(2)*
-  %val = load i32, i32 addrspace(2)* %bc
+  %dispatch.ptr = call i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+  %bc = bitcast i8 addrspace(4)* %dispatch.ptr to i32 addrspace(4)*
+  %val = load i32, i32 addrspace(4)* %bc
  store i32 %val, i32 addrspace(1)* %ptr
  ret void
 }
@ -210,9 +210,9 @@ define amdgpu_kernel void @use_global_to_flat_addrspacecast(i32 addrspace(1)* %p
  ret void
 }

-; HSA: define amdgpu_kernel void @use_constant_to_flat_addrspacecast(i32 addrspace(2)* %ptr) #1 {
-define amdgpu_kernel void @use_constant_to_flat_addrspacecast(i32 addrspace(2)* %ptr) #1 {
-  %stof = addrspacecast i32 addrspace(2)* %ptr to i32*
+; HSA: define amdgpu_kernel void @use_constant_to_flat_addrspacecast(i32 addrspace(4)* %ptr) #1 {
+define amdgpu_kernel void @use_constant_to_flat_addrspacecast(i32 addrspace(4)* %ptr) #1 {
+  %stof = addrspacecast i32 addrspace(4)* %ptr to i32*
  %ld = load volatile i32, i32* %stof
  ret void
 }
@ -226,8 +226,8 @@ define amdgpu_kernel void @use_flat_to_global_addrspacecast(i32* %ptr) #1 {

 ; HSA: define amdgpu_kernel void @use_flat_to_constant_addrspacecast(i32* %ptr) #1 {
 define amdgpu_kernel void @use_flat_to_constant_addrspacecast(i32* %ptr) #1 {
-  %ftos = addrspacecast i32* %ptr to i32 addrspace(2)*
-  %ld = load volatile i32, i32 addrspace(2)* %ftos
+  %ftos = addrspacecast i32* %ptr to i32 addrspace(4)*
+  %ld = load volatile i32, i32 addrspace(4)* %ftos
  ret void
 }

--- a/test/CodeGen/AMDGPU/branch-relaxation.ll
+++ b/test/CodeGen/AMDGPU/branch-relaxation.ll
@ -358,7 +358,7 @@ bb0:
  br i1 %cmp0, label %bb2, label %bb1

 bb1:
-  %val = load volatile i32, i32 addrspace(2)* undef
+  %val = load volatile i32, i32 addrspace(4)* undef
  %cmp1 = icmp eq i32 %val, 3
  br i1 %cmp1, label %bb3, label %bb2

--- a/test/CodeGen/AMDGPU/call-argument-types.ll
+++ b/test/CodeGen/AMDGPU/call-argument-types.ll
@ -345,7 +345,7 @@ define amdgpu_kernel void @test_call_external_void_func_v4i32() #0 {
 ; GCN: s_waitcnt
 ; GCN-NEXT: s_swappc_b64
 define amdgpu_kernel void @test_call_external_void_func_v8i32() #0 {
-  %ptr = load <8 x i32> addrspace(1)*, <8 x i32> addrspace(1)* addrspace(2)* undef
+  %ptr = load <8 x i32> addrspace(1)*, <8 x i32> addrspace(1)* addrspace(4)* undef
  %val = load <8 x i32>, <8 x i32> addrspace(1)* %ptr
  call void @external_void_func_v8i32(<8 x i32> %val)
  ret void
@ -359,7 +359,7 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32() #0 {
 ; GCN: s_waitcnt
 ; GCN-NEXT: s_swappc_b64
 define amdgpu_kernel void @test_call_external_void_func_v16i32() #0 {
-  %ptr = load <16 x i32> addrspace(1)*, <16 x i32> addrspace(1)* addrspace(2)* undef
+  %ptr = load <16 x i32> addrspace(1)*, <16 x i32> addrspace(1)* addrspace(4)* undef
  %val = load <16 x i32>, <16 x i32> addrspace(1)* %ptr
  call void @external_void_func_v16i32(<16 x i32> %val)
  ret void
@ -377,7 +377,7 @@ define amdgpu_kernel void @test_call_external_void_func_v16i32() #0 {
 ; GCN: s_waitcnt
 ; GCN-NEXT: s_swappc_b64
 define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 {
-  %ptr = load <32 x i32> addrspace(1)*, <32 x i32> addrspace(1)* addrspace(2)* undef
+  %ptr = load <32 x i32> addrspace(1)*, <32 x i32> addrspace(1)* addrspace(4)* undef
  %val = load <32 x i32>, <32 x i32> addrspace(1)* %ptr
  call void @external_void_func_v32i32(<32 x i32> %val)
  ret void
@ -405,7 +405,7 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 {
 ; GCN: s_swappc_b64
 ; GCN-NEXT: s_endpgm
 define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 {
-  %ptr0 = load <32 x i32> addrspace(1)*, <32 x i32> addrspace(1)* addrspace(2)* undef
+  %ptr0 = load <32 x i32> addrspace(1)*, <32 x i32> addrspace(1)* addrspace(4)* undef
  %val0 = load <32 x i32>, <32 x i32> addrspace(1)* %ptr0
  %val1 = load i32, i32 addrspace(1)* undef
  call void @external_void_func_v32i32_i32(<32 x i32> %val0, i32 %val1)
@ -430,7 +430,7 @@ define amdgpu_kernel void @test_call_external_i32_func_i32_imm(i32 addrspace(1)*
 ; GCN: s_waitcnt vmcnt(0)
 ; GCN-NEXT: s_swappc_b64
 define amdgpu_kernel void @test_call_external_void_func_struct_i8_i32() #0 {
-  %ptr0 = load { i8, i32 } addrspace(1)*, { i8, i32 } addrspace(1)* addrspace(2)* undef
+  %ptr0 = load { i8, i32 } addrspace(1)*, { i8, i32 } addrspace(1)* addrspace(4)* undef
  %val = load { i8, i32 }, { i8, i32 } addrspace(1)* %ptr0
  call void @external_void_func_struct_i8_i32({ i8, i32 } %val)
  ret void
@ -516,7 +516,7 @@ define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval

 ; GCN-LABEL: {{^}}test_call_external_void_func_v16i8:
 define amdgpu_kernel void @test_call_external_void_func_v16i8() #0 {
-  %ptr = load <16 x i8> addrspace(1)*, <16 x i8> addrspace(1)* addrspace(2)* undef
+  %ptr = load <16 x i8> addrspace(1)*, <16 x i8> addrspace(1)* addrspace(4)* undef
  %val = load <16 x i8>, <16 x i8> addrspace(1)* %ptr
  call void @external_void_func_v16i8(<16 x i8> %val)
  ret void
--- a/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll
+++ b/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll
@ -4,9 +4,9 @@
 ; GCN-LABEL: {{^}}use_dispatch_ptr:
 ; GCN: s_load_dword s{{[0-9]+}}, s[6:7], 0x0
 define void @use_dispatch_ptr() #1 {
-  %dispatch_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0
-  %header_ptr = bitcast i8 addrspace(2)* %dispatch_ptr to i32 addrspace(2)*
-  %value = load volatile i32, i32 addrspace(2)* %header_ptr
+  %dispatch_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #0
+  %header_ptr = bitcast i8 addrspace(4)* %dispatch_ptr to i32 addrspace(4)*
+  %value = load volatile i32, i32 addrspace(4)* %header_ptr
  ret void
 }

@ -21,9 +21,9 @@ define amdgpu_kernel void @kern_indirect_use_dispatch_ptr(i32) #1 {
 ; GCN-LABEL: {{^}}use_queue_ptr:
 ; GCN: s_load_dword s{{[0-9]+}}, s[6:7], 0x0
 define void @use_queue_ptr() #1 {
-  %queue_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #0
-  %header_ptr = bitcast i8 addrspace(2)* %queue_ptr to i32 addrspace(2)*
-  %value = load volatile i32, i32 addrspace(2)* %header_ptr
+  %queue_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.queue.ptr() #0
+  %header_ptr = bitcast i8 addrspace(4)* %queue_ptr to i32 addrspace(4)*
+  %value = load volatile i32, i32 addrspace(4)* %header_ptr
  ret void
 }

@ -62,9 +62,9 @@ define amdgpu_kernel void @kern_indirect_use_queue_ptr_addrspacecast(i32) #1 {
 ; GCN-LABEL: {{^}}use_kernarg_segment_ptr:
 ; GCN: s_load_dword s{{[0-9]+}}, s[6:7], 0x0
 define void @use_kernarg_segment_ptr() #1 {
-  %kernarg_segment_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() #0
-  %header_ptr = bitcast i8 addrspace(2)* %kernarg_segment_ptr to i32 addrspace(2)*
-  %value = load volatile i32, i32 addrspace(2)* %header_ptr
+  %kernarg_segment_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() #0
+  %header_ptr = bitcast i8 addrspace(4)* %kernarg_segment_ptr to i32 addrspace(4)*
+  %value = load volatile i32, i32 addrspace(4)* %header_ptr
  ret void
 }

@ -435,17 +435,17 @@ define void @use_every_sgpr_input() #1 {
  %alloca = alloca i32, align 4, addrspace(5)
  store volatile i32 0, i32 addrspace(5)* %alloca

-  %dispatch_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0
-  %dispatch_ptr.bc = bitcast i8 addrspace(2)* %dispatch_ptr to i32 addrspace(2)*
-  %val0 = load volatile i32, i32 addrspace(2)* %dispatch_ptr.bc
+  %dispatch_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #0
+  %dispatch_ptr.bc = bitcast i8 addrspace(4)* %dispatch_ptr to i32 addrspace(4)*
+  %val0 = load volatile i32, i32 addrspace(4)* %dispatch_ptr.bc

-  %queue_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #0
-  %queue_ptr.bc = bitcast i8 addrspace(2)* %queue_ptr to i32 addrspace(2)*
-  %val1 = load volatile i32, i32 addrspace(2)* %queue_ptr.bc
+  %queue_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.queue.ptr() #0
+  %queue_ptr.bc = bitcast i8 addrspace(4)* %queue_ptr to i32 addrspace(4)*
+  %val1 = load volatile i32, i32 addrspace(4)* %queue_ptr.bc

-  %kernarg_segment_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() #0
-  %kernarg_segment_ptr.bc = bitcast i8 addrspace(2)* %kernarg_segment_ptr to i32 addrspace(2)*
-  %val2 = load volatile i32, i32 addrspace(2)* %kernarg_segment_ptr.bc
+  %kernarg_segment_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() #0
+  %kernarg_segment_ptr.bc = bitcast i8 addrspace(4)* %kernarg_segment_ptr to i32 addrspace(4)*
+  %val2 = load volatile i32, i32 addrspace(4)* %kernarg_segment_ptr.bc

  %val3 = call i64 @llvm.amdgcn.dispatch.id()
  call void asm sideeffect "; use $0", "s"(i64 %val3)
@ -515,17 +515,17 @@ define void @func_use_every_sgpr_input_call_use_workgroup_id_xyz() #1 {
  %alloca = alloca i32, align 4, addrspace(5)
  store volatile i32 0, i32 addrspace(5)* %alloca

-  %dispatch_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0
-  %dispatch_ptr.bc = bitcast i8 addrspace(2)* %dispatch_ptr to i32 addrspace(2)*
-  %val0 = load volatile i32, i32 addrspace(2)* %dispatch_ptr.bc
+  %dispatch_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #0
+  %dispatch_ptr.bc = bitcast i8 addrspace(4)* %dispatch_ptr to i32 addrspace(4)*
+  %val0 = load volatile i32, i32 addrspace(4)* %dispatch_ptr.bc

-  %queue_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #0
-  %queue_ptr.bc = bitcast i8 addrspace(2)* %queue_ptr to i32 addrspace(2)*
-  %val1 = load volatile i32, i32 addrspace(2)* %queue_ptr.bc
+  %queue_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.queue.ptr() #0
+  %queue_ptr.bc = bitcast i8 addrspace(4)* %queue_ptr to i32 addrspace(4)*
+  %val1 = load volatile i32, i32 addrspace(4)* %queue_ptr.bc

-  %kernarg_segment_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() #0
-  %kernarg_segment_ptr.bc = bitcast i8 addrspace(2)* %kernarg_segment_ptr to i32 addrspace(2)*
-  %val2 = load volatile i32, i32 addrspace(2)* %kernarg_segment_ptr.bc
+  %kernarg_segment_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() #0
+  %kernarg_segment_ptr.bc = bitcast i8 addrspace(4)* %kernarg_segment_ptr to i32 addrspace(4)*
+  %val2 = load volatile i32, i32 addrspace(4)* %kernarg_segment_ptr.bc

  %val3 = call i64 @llvm.amdgcn.dispatch.id()
  call void asm sideeffect "; use $0", "s"(i64 %val3)
@ -573,17 +573,17 @@ define void @func_use_every_sgpr_input_call_use_workgroup_id_xyz_spill() #1 {

  store volatile i32 0, i32 addrspace(5)* %alloca

-  %dispatch_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0
-  %dispatch_ptr.bc = bitcast i8 addrspace(2)* %dispatch_ptr to i32 addrspace(2)*
-  %val0 = load volatile i32, i32 addrspace(2)* %dispatch_ptr.bc
+  %dispatch_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #0
+  %dispatch_ptr.bc = bitcast i8 addrspace(4)* %dispatch_ptr to i32 addrspace(4)*
+  %val0 = load volatile i32, i32 addrspace(4)* %dispatch_ptr.bc

-  %queue_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #0
-  %queue_ptr.bc = bitcast i8 addrspace(2)* %queue_ptr to i32 addrspace(2)*
-  %val1 = load volatile i32, i32 addrspace(2)* %queue_ptr.bc
+  %queue_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.queue.ptr() #0
+  %queue_ptr.bc = bitcast i8 addrspace(4)* %queue_ptr to i32 addrspace(4)*
+  %val1 = load volatile i32, i32 addrspace(4)* %queue_ptr.bc

-  %kernarg_segment_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() #0
-  %kernarg_segment_ptr.bc = bitcast i8 addrspace(2)* %kernarg_segment_ptr to i32 addrspace(2)*
-  %val2 = load volatile i32, i32 addrspace(2)* %kernarg_segment_ptr.bc
+  %kernarg_segment_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() #0
+  %kernarg_segment_ptr.bc = bitcast i8 addrspace(4)* %kernarg_segment_ptr to i32 addrspace(4)*
+  %val2 = load volatile i32, i32 addrspace(4)* %kernarg_segment_ptr.bc

  %val3 = call i64 @llvm.amdgcn.dispatch.id()
  call void asm sideeffect "; use $0", "s"(i64 %val3)
@ -603,10 +603,10 @@ define void @func_use_every_sgpr_input_call_use_workgroup_id_xyz_spill() #1 {
 declare i32 @llvm.amdgcn.workgroup.id.x() #0
 declare i32 @llvm.amdgcn.workgroup.id.y() #0
 declare i32 @llvm.amdgcn.workgroup.id.z() #0
-declare noalias i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #0
-declare noalias i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() #0
+declare noalias i8 addrspace(4)* @llvm.amdgcn.queue.ptr() #0
+declare noalias i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() #0
 declare i64 @llvm.amdgcn.dispatch.id() #0
-declare noalias i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0
+declare noalias i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #0

 attributes #0 = { nounwind readnone speculatable }
 attributes #1 = { nounwind noinline }
--- a/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll
+++ b/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll
@ -87,12 +87,12 @@ define amdgpu_kernel void @test_sink_noop_addrspacecast_flat_to_constant_i32(i32
 entry:
  %out.gep = getelementptr i32, i32* %out, i64 999999
  %in.gep = getelementptr i32, i32* %in, i64 7
-  %cast = addrspacecast i32* %in.gep to i32 addrspace(2)*
+  %cast = addrspacecast i32* %in.gep to i32 addrspace(4)*
  %tmp0 = icmp eq i32 %cond, 0
  br i1 %tmp0, label %endif, label %if

 if:
-  %tmp1 = load i32, i32 addrspace(2)* %cast
+  %tmp1 = load i32, i32 addrspace(4)* %cast
  br label %endif

 endif:
--- a/test/CodeGen/AMDGPU/cgp-addressing-modes.ll
+++ b/test/CodeGen/AMDGPU/cgp-addressing-modes.ll
@ -268,23 +268,23 @@ done:
 }

 ; OPT-LABEL: @test_sink_constant_small_offset_i32
-; OPT-NOT:  getelementptr i32, i32 addrspace(2)*
+; OPT-NOT:  getelementptr i32, i32 addrspace(4)*
 ; OPT: br i1

 ; GCN-LABEL: {{^}}test_sink_constant_small_offset_i32:
 ; GCN: s_and_saveexec_b64
 ; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x7{{$}}
 ; GCN: s_or_b64 exec, exec
-define amdgpu_kernel void @test_sink_constant_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
+define amdgpu_kernel void @test_sink_constant_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(4)* %in) {
 entry:
  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
-  %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 7
+  %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 7
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp0 = icmp eq i32 %tid, 0
  br i1 %tmp0, label %endif, label %if

 if:
-  %tmp1 = load i32, i32 addrspace(2)* %in.gep
+  %tmp1 = load i32, i32 addrspace(4)* %in.gep
  br label %endif

 endif:
@ -297,23 +297,23 @@ done:
 }

 ; OPT-LABEL: @test_sink_constant_max_8_bit_offset_i32
-; OPT-NOT:  getelementptr i32, i32 addrspace(2)*
+; OPT-NOT:  getelementptr i32, i32 addrspace(4)*
 ; OPT: br i1

 ; GCN-LABEL: {{^}}test_sink_constant_max_8_bit_offset_i32:
 ; GCN: s_and_saveexec_b64
 ; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xff{{$}}
 ; GCN: s_or_b64 exec, exec
-define amdgpu_kernel void @test_sink_constant_max_8_bit_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
+define amdgpu_kernel void @test_sink_constant_max_8_bit_offset_i32(i32 addrspace(1)* %out, i32 addrspace(4)* %in) {
 entry:
  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
-  %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 255
+  %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 255
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp0 = icmp eq i32 %tid, 0
  br i1 %tmp0, label %endif, label %if

 if:
-  %tmp1 = load i32, i32 addrspace(2)* %in.gep
+  %tmp1 = load i32, i32 addrspace(4)* %in.gep
  br label %endif

 endif:
@ -326,9 +326,9 @@ done:
 }

 ; OPT-LABEL: @test_sink_constant_max_8_bit_offset_p1_i32
-; OPT-SI:  getelementptr i32, i32 addrspace(2)*
-; OPT-CI-NOT:  getelementptr i32, i32 addrspace(2)*
-; OPT-VI-NOT:  getelementptr i32, i32 addrspace(2)*
+; OPT-SI:  getelementptr i32, i32 addrspace(4)*
+; OPT-CI-NOT:  getelementptr i32, i32 addrspace(4)*
+; OPT-VI-NOT:  getelementptr i32, i32 addrspace(4)*
 ; OPT: br i1

 ; GCN-LABEL: {{^}}test_sink_constant_max_8_bit_offset_p1_i32:
@ -337,16 +337,16 @@ done:

 ; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[OFFSET]]{{$}}
 ; GCN: s_or_b64 exec, exec
-define amdgpu_kernel void @test_sink_constant_max_8_bit_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
+define amdgpu_kernel void @test_sink_constant_max_8_bit_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(4)* %in) {
 entry:
  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
-  %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 256
+  %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 256
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp0 = icmp eq i32 %tid, 0
  br i1 %tmp0, label %endif, label %if

 if:
-  %tmp1 = load i32, i32 addrspace(2)* %in.gep
+  %tmp1 = load i32, i32 addrspace(4)* %in.gep
  br label %endif

 endif:
@ -359,8 +359,8 @@ done:
 }

 ; OPT-LABEL: @test_sink_constant_max_32_bit_offset_i32
-; OPT-SI: getelementptr i32, i32 addrspace(2)*
-; OPT-CI-NOT: getelementptr i32, i32 addrspace(2)*
+; OPT-SI: getelementptr i32, i32 addrspace(4)*
+; OPT-CI-NOT: getelementptr i32, i32 addrspace(4)*
 ; OPT: br i1

 ; GCN-LABEL: {{^}}test_sink_constant_max_32_bit_offset_i32:
@ -369,16 +369,16 @@ done:
 ; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, 3{{$}}
 ; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
 ; GCN: s_or_b64 exec, exec
-define amdgpu_kernel void @test_sink_constant_max_32_bit_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
+define amdgpu_kernel void @test_sink_constant_max_32_bit_offset_i32(i32 addrspace(1)* %out, i32 addrspace(4)* %in) {
 entry:
  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
-  %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 4294967295
+  %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 4294967295
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp0 = icmp eq i32 %tid, 0
  br i1 %tmp0, label %endif, label %if

 if:
-  %tmp1 = load i32, i32 addrspace(2)* %in.gep
+  %tmp1 = load i32, i32 addrspace(4)* %in.gep
  br label %endif

 endif:
@ -391,7 +391,7 @@ done:
 }

 ; OPT-LABEL: @test_sink_constant_max_32_bit_offset_p1_i32
-; OPT: getelementptr i32, i32 addrspace(2)*
+; OPT: getelementptr i32, i32 addrspace(4)*
 ; OPT: br i1

 ; GCN-LABEL: {{^}}test_sink_constant_max_32_bit_offset_p1_i32:
@ -400,16 +400,16 @@ done:
 ; GCN: s_addc_u32
 ; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
 ; GCN: s_or_b64 exec, exec
-define amdgpu_kernel void @test_sink_constant_max_32_bit_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
+define amdgpu_kernel void @test_sink_constant_max_32_bit_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(4)* %in) {
 entry:
  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
-  %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 17179869181
+  %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 17179869181
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp0 = icmp eq i32 %tid, 0
  br i1 %tmp0, label %endif, label %if

 if:
-  %tmp1 = load i32, i32 addrspace(2)* %in.gep
+  %tmp1 = load i32, i32 addrspace(4)* %in.gep
  br label %endif

 endif:
@ -430,16 +430,16 @@ done:
 ; VI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xffffc{{$}}

 ; GCN: s_or_b64 exec, exec
-define amdgpu_kernel void @test_sink_constant_max_20_bit_byte_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
+define amdgpu_kernel void @test_sink_constant_max_20_bit_byte_offset_i32(i32 addrspace(1)* %out, i32 addrspace(4)* %in) {
 entry:
  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
-  %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 262143
+  %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 262143
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp0 = icmp eq i32 %tid, 0
  br i1 %tmp0, label %endif, label %if

 if:
-  %tmp1 = load i32, i32 addrspace(2)* %in.gep
+  %tmp1 = load i32, i32 addrspace(4)* %in.gep
  br label %endif

 endif:
@ -452,9 +452,9 @@ done:
 }

 ; OPT-LABEL: @test_sink_constant_max_20_bit_byte_offset_p1_i32
-; OPT-SI: getelementptr i32, i32 addrspace(2)*
-; OPT-CI-NOT: getelementptr i32, i32 addrspace(2)*
-; OPT-VI: getelementptr i32, i32 addrspace(2)*
+; OPT-SI: getelementptr i32, i32 addrspace(4)*
+; OPT-CI-NOT: getelementptr i32, i32 addrspace(4)*
+; OPT-VI: getelementptr i32, i32 addrspace(4)*
 ; OPT: br i1

 ; GCN-LABEL: {{^}}test_sink_constant_max_20_bit_byte_offset_p1_i32:
@ -468,16 +468,16 @@ done:
 ; VI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[OFFSET]]{{$}}

 ; GCN: s_or_b64 exec, exec
-define amdgpu_kernel void @test_sink_constant_max_20_bit_byte_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
+define amdgpu_kernel void @test_sink_constant_max_20_bit_byte_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(4)* %in) {
 entry:
  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
-  %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 262144
+  %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 262144
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp0 = icmp eq i32 %tid, 0
  br i1 %tmp0, label %endif, label %if

 if:
-  %tmp1 = load i32, i32 addrspace(2)* %in.gep
+  %tmp1 = load i32, i32 addrspace(4)* %in.gep
  br label %endif

 endif:
@ -524,17 +524,17 @@ bb34:
 ; OPT: br i1 %tmp0,
 ; OPT: if:
 ; OPT: getelementptr i8, {{.*}} 4095
-define amdgpu_kernel void @test_sink_constant_small_max_mubuf_offset_load_i32_align_1(i32 addrspace(1)* %out, i8 addrspace(2)* %in) {
+define amdgpu_kernel void @test_sink_constant_small_max_mubuf_offset_load_i32_align_1(i32 addrspace(1)* %out, i8 addrspace(4)* %in) {
 entry:
  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 1024
-  %in.gep = getelementptr i8, i8 addrspace(2)* %in, i64 4095
+  %in.gep = getelementptr i8, i8 addrspace(4)* %in, i64 4095
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp0 = icmp eq i32 %tid, 0
  br i1 %tmp0, label %endif, label %if

 if:
-  %bitcast = bitcast i8 addrspace(2)* %in.gep to i32 addrspace(2)*
-  %tmp1 = load i32, i32 addrspace(2)* %bitcast, align 1
+  %bitcast = bitcast i8 addrspace(4)* %in.gep to i32 addrspace(4)*
+  %tmp1 = load i32, i32 addrspace(4)* %bitcast, align 1
  br label %endif

 endif:
--- a/test/CodeGen/AMDGPU/early-if-convert-cost.ll
+++ b/test/CodeGen/AMDGPU/early-if-convert-cost.ll
@ -32,9 +32,9 @@ endif:
 ; GCN: v_add_f64
 ; GCN: v_cndmask_b32_e32
 ; GCN: v_cndmask_b32_e32
-define amdgpu_kernel void @test_vccnz_sgpr_ifcvt_triangle64(double addrspace(1)* %out, double addrspace(2)* %in) #0 {
+define amdgpu_kernel void @test_vccnz_sgpr_ifcvt_triangle64(double addrspace(1)* %out, double addrspace(4)* %in) #0 {
 entry:
-  %v = load double, double addrspace(2)* %in
+  %v = load double, double addrspace(4)* %in
  %cc = fcmp oeq double %v, 1.000000e+00
  br i1 %cc, label %if, label %endif

--- a/test/CodeGen/AMDGPU/early-if-convert.ll
+++ b/test/CodeGen/AMDGPU/early-if-convert.ll
@ -187,9 +187,9 @@ endif:

 ; GCN: [[ENDIF]]:
 ; GCN: buffer_store_dword
-define amdgpu_kernel void @test_vccnz_sgpr_ifcvt_triangle(i32 addrspace(1)* %out, i32 addrspace(2)* %in, float %cnd) #0 {
+define amdgpu_kernel void @test_vccnz_sgpr_ifcvt_triangle(i32 addrspace(1)* %out, i32 addrspace(4)* %in, float %cnd) #0 {
 entry:
-  %v = load i32, i32 addrspace(2)* %in
+  %v = load i32, i32 addrspace(4)* %in
  %cc = fcmp oeq float %cnd, 1.000000e+00
  br i1 %cc, label %if, label %endif

@ -206,9 +206,9 @@ endif:

 ; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_constant_load:
 ; GCN: v_cndmask_b32
-define amdgpu_kernel void @test_vccnz_ifcvt_triangle_constant_load(float addrspace(1)* %out, float addrspace(2)* %in) #0 {
+define amdgpu_kernel void @test_vccnz_ifcvt_triangle_constant_load(float addrspace(1)* %out, float addrspace(4)* %in) #0 {
 entry:
-  %v = load float, float addrspace(2)* %in
+  %v = load float, float addrspace(4)* %in
  %cc = fcmp oeq float %v, 1.000000e+00
  br i1 %cc, label %if, label %endif

@ -248,9 +248,9 @@ endif:
 ; GCN: s_add_i32 [[ADD:s[0-9]+]], [[VAL]], [[VAL]]
 ; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1
 ; GCN-NEXT: s_cselect_b32 [[SELECT:s[0-9]+]], [[ADD]], [[VAL]]
-define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle(i32 addrspace(2)* %in, i32 %cond) #0 {
+define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle(i32 addrspace(4)* %in, i32 %cond) #0 {
 entry:
-  %v = load i32, i32 addrspace(2)* %in
+  %v = load i32, i32 addrspace(4)* %in
  %cc = icmp eq i32 %cond, 1
  br i1 %cc, label %if, label %endif

@ -295,9 +295,9 @@ endif:
 ; GCN: s_addc_u32
 ; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1
 ; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}
-define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle64(i64 addrspace(2)* %in, i32 %cond) #0 {
+define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle64(i64 addrspace(4)* %in, i32 %cond) #0 {
 entry:
-  %v = load i64, i64 addrspace(2)* %in
+  %v = load i64, i64 addrspace(4)* %in
  %cc = icmp eq i32 %cond, 1
  br i1 %cc, label %if, label %endif

@ -320,9 +320,9 @@ endif:
 ; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1
 ; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}
 ; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}
-define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle96(<3 x i32> addrspace(2)* %in, i32 %cond) #0 {
+define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle96(<3 x i32> addrspace(4)* %in, i32 %cond) #0 {
 entry:
-  %v = load <3 x i32>, <3 x i32> addrspace(2)* %in
+  %v = load <3 x i32>, <3 x i32> addrspace(4)* %in
  %cc = icmp eq i32 %cond, 1
  br i1 %cc, label %if, label %endif

@ -345,9 +345,9 @@ endif:
 ; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1
 ; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}
 ; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}
-define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle128(<4 x i32> addrspace(2)* %in, i32 %cond) #0 {
+define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle128(<4 x i32> addrspace(4)* %in, i32 %cond) #0 {
 entry:
-  %v = load <4 x i32>, <4 x i32> addrspace(2)* %in
+  %v = load <4 x i32>, <4 x i32> addrspace(4)* %in
  %cc = icmp eq i32 %cond, 1
  br i1 %cc, label %if, label %endif

--- a/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll
+++ b/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll
@ -8,8 +8,8 @@
 ; GCN-DAG: v_mov_b32_e32 [[VELT1:v[0-9]+]], [[ELT1]]
 ; GCN-DAG: buffer_store_short [[VELT0]]
 ; GCN-DAG: buffer_store_short [[VELT1]]
-define amdgpu_kernel void @extract_vector_elt_v2f16(half addrspace(1)* %out, <2 x half> addrspace(2)* %vec.ptr) #0 {
-  %vec = load <2 x half>, <2 x half> addrspace(2)* %vec.ptr
+define amdgpu_kernel void @extract_vector_elt_v2f16(half addrspace(1)* %out, <2 x half> addrspace(4)* %vec.ptr) #0 {
+  %vec = load <2 x half>, <2 x half> addrspace(4)* %vec.ptr
  %p0 = extractelement <2 x half> %vec, i32 0
  %p1 = extractelement <2 x half> %vec, i32 1
  %out1 = getelementptr half, half addrspace(1)* %out, i32 10
@ -26,8 +26,8 @@ define amdgpu_kernel void @extract_vector_elt_v2f16(half addrspace(1)* %out, <2
 ; GCN: v_mov_b32_e32 [[VELT1:v[0-9]+]], [[ELT1]]
 ; GCN: buffer_store_short [[VELT1]]
 ; GCN: ScratchSize: 0
-define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_sgpr(half addrspace(1)* %out, <2 x half> addrspace(2)* %vec.ptr, i32 %idx) #0 {
-  %vec = load <2 x half>, <2 x half> addrspace(2)* %vec.ptr
+define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_sgpr(half addrspace(1)* %out, <2 x half> addrspace(4)* %vec.ptr, i32 %idx) #0 {
+  %vec = load <2 x half>, <2 x half> addrspace(4)* %vec.ptr
  %elt = extractelement <2 x half> %vec, i32 %idx
  store half %elt, half addrspace(1)* %out, align 2
  ret void
@ -45,12 +45,12 @@ define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_sgpr(half addrspace(
 ; SI: buffer_store_short [[ELT]]
 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[ELT]]
 ; GCN: ScratchSize: 0{{$}}
-define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_vgpr(half addrspace(1)* %out, <2 x half> addrspace(2)* %vec.ptr, i32 addrspace(1)* %idx.ptr) #0 {
+define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_vgpr(half addrspace(1)* %out, <2 x half> addrspace(4)* %vec.ptr, i32 addrspace(1)* %idx.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %gep = getelementptr inbounds i32, i32 addrspace(1)* %idx.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds half, half addrspace(1)* %out, i64 %tid.ext
-  %vec = load <2 x half>, <2 x half> addrspace(2)* %vec.ptr
+  %vec = load <2 x half>, <2 x half> addrspace(4)* %vec.ptr
  %idx = load i32, i32 addrspace(1)* %gep
  %elt = extractelement <2 x half> %vec, i32 %idx
  store half %elt, half addrspace(1)* %out.gep, align 2
--- a/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll
+++ b/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll
@ -9,8 +9,8 @@
 ; GCN-DAG: v_mov_b32_e32 [[VELT1:v[0-9]+]], [[ELT1]]
 ; GCN-DAG: buffer_store_short [[VELT0]]
 ; GCN-DAG: buffer_store_short [[VELT1]]
-define amdgpu_kernel void @extract_vector_elt_v2i16(i16 addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr) #0 {
-  %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr
+define amdgpu_kernel void @extract_vector_elt_v2i16(i16 addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr) #0 {
+  %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
  %p0 = extractelement <2 x i16> %vec, i32 0
  %p1 = extractelement <2 x i16> %vec, i32 1
  %out1 = getelementptr i16, i16 addrspace(1)* %out, i32 10
@ -27,8 +27,8 @@ define amdgpu_kernel void @extract_vector_elt_v2i16(i16 addrspace(1)* %out, <2 x
 ; GCN: v_mov_b32_e32 [[VELT1:v[0-9]+]], [[ELT1]]
 ; GCN: buffer_store_short [[VELT1]]
 ; GCN: ScratchSize: 0
-define amdgpu_kernel void @extract_vector_elt_v2i16_dynamic_sgpr(i16 addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i32 %idx) #0 {
-  %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr
+define amdgpu_kernel void @extract_vector_elt_v2i16_dynamic_sgpr(i16 addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, i32 %idx) #0 {
+  %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
  %elt = extractelement <2 x i16> %vec, i32 %idx
  store i16 %elt, i16 addrspace(1)* %out, align 2
  ret void
@ -45,13 +45,13 @@ define amdgpu_kernel void @extract_vector_elt_v2i16_dynamic_sgpr(i16 addrspace(1
 ; SI: buffer_store_short [[ELT]]
 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[ELT]]
 ; GCN: ScratchSize: 0{{$}}
-define amdgpu_kernel void @extract_vector_elt_v2i16_dynamic_vgpr(i16 addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i32 addrspace(1)* %idx.ptr) #0 {
+define amdgpu_kernel void @extract_vector_elt_v2i16_dynamic_vgpr(i16 addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, i32 addrspace(1)* %idx.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %gep = getelementptr inbounds i32, i32 addrspace(1)* %idx.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds i16, i16 addrspace(1)* %out, i64 %tid.ext
  %idx = load volatile i32, i32 addrspace(1)* %gep
-  %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr
+  %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
  %elt = extractelement <2 x i16> %vec, i32 %idx
  store i16 %elt, i16 addrspace(1)* %out.gep, align 2
  ret void
--- a/test/CodeGen/AMDGPU/fence-barrier.ll
+++ b/test/CodeGen/AMDGPU/fence-barrier.ll
@ -1,8 +1,8 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa-amdgiz -mcpu=gfx803 -enable-si-insert-waitcnts=1 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN %s
 ; RUN: llvm-as -data-layout=A5 < %s | llc -mtriple=amdgcn-amd-amdhsa-amdgiz -mcpu=gfx803 -enable-si-insert-waitcnts=1 -verify-machineinstrs | FileCheck --check-prefix=GCN %s

-declare i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr()
-declare i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr()
+declare i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
+declare i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
 declare i32 @llvm.amdgcn.workitem.id.x()
 declare i32 @llvm.amdgcn.workgroup.id.x()
 declare void @llvm.amdgcn.s.barrier()
@ -34,19 +34,19 @@ define amdgpu_kernel void @test_local(i32 addrspace(1)*) {
  fence syncscope("workgroup") acquire
  %8 = load i32, i32 addrspace(3)* getelementptr inbounds ([1 x i32], [1 x i32] addrspace(3)* @test_local.temp, i64 0, i64 0), align 4
  %9 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(5)* %2, align 4
-  %10 = call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr()
+  %10 = call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
  %11 = call i32 @llvm.amdgcn.workitem.id.x()
  %12 = call i32 @llvm.amdgcn.workgroup.id.x()
-  %13 = getelementptr inbounds i8, i8 addrspace(2)* %10, i64 4
-  %14 = bitcast i8 addrspace(2)* %13 to i16 addrspace(2)*
-  %15 = load i16, i16 addrspace(2)* %14, align 4
+  %13 = getelementptr inbounds i8, i8 addrspace(4)* %10, i64 4
+  %14 = bitcast i8 addrspace(4)* %13 to i16 addrspace(4)*
+  %15 = load i16, i16 addrspace(4)* %14, align 4
  %16 = zext i16 %15 to i32
  %17 = mul i32 %12, %16
  %18 = add i32 %17, %11
-  %19 = call i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr()
+  %19 = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
  %20 = zext i32 %18 to i64
-  %21 = bitcast i8 addrspace(2)* %19 to i64 addrspace(2)*
-  %22 = load i64, i64 addrspace(2)* %21, align 8
+  %21 = bitcast i8 addrspace(4)* %19 to i64 addrspace(4)*
+  %22 = load i64, i64 addrspace(4)* %21, align 8
  %23 = add i64 %22, %20
  %24 = getelementptr inbounds i32, i32 addrspace(1)* %9, i64 %23
  store i32 %8, i32 addrspace(1)* %24, align 4
@ -68,56 +68,56 @@ define amdgpu_kernel void @test_global(i32 addrspace(1)*) {
 ; <label>:4:                                      ; preds = %58, %1
  %5 = load i32, i32 addrspace(5)* %3, align 4
  %6 = sext i32 %5 to i64
-  %7 = call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr()
+  %7 = call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
  %8 = call i32 @llvm.amdgcn.workitem.id.x()
  %9 = call i32 @llvm.amdgcn.workgroup.id.x()
-  %10 = getelementptr inbounds i8, i8 addrspace(2)* %7, i64 4
-  %11 = bitcast i8 addrspace(2)* %10 to i16 addrspace(2)*
-  %12 = load i16, i16 addrspace(2)* %11, align 4
+  %10 = getelementptr inbounds i8, i8 addrspace(4)* %7, i64 4
+  %11 = bitcast i8 addrspace(4)* %10 to i16 addrspace(4)*
+  %12 = load i16, i16 addrspace(4)* %11, align 4
  %13 = zext i16 %12 to i32
  %14 = mul i32 %9, %13
  %15 = add i32 %14, %8
-  %16 = call i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr()
+  %16 = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
  %17 = zext i32 %15 to i64
-  %18 = bitcast i8 addrspace(2)* %16 to i64 addrspace(2)*
-  %19 = load i64, i64 addrspace(2)* %18, align 8
+  %18 = bitcast i8 addrspace(4)* %16 to i64 addrspace(4)*
+  %19 = load i64, i64 addrspace(4)* %18, align 8
  %20 = add i64 %19, %17
  %21 = icmp ult i64 %6, %20
  br i1 %21, label %22, label %61

 ; <label>:22:                                     ; preds = %4
-  %23 = call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr()
+  %23 = call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
  %24 = call i32 @llvm.amdgcn.workitem.id.x()
  %25 = call i32 @llvm.amdgcn.workgroup.id.x()
-  %26 = getelementptr inbounds i8, i8 addrspace(2)* %23, i64 4
-  %27 = bitcast i8 addrspace(2)* %26 to i16 addrspace(2)*
-  %28 = load i16, i16 addrspace(2)* %27, align 4
+  %26 = getelementptr inbounds i8, i8 addrspace(4)* %23, i64 4
+  %27 = bitcast i8 addrspace(4)* %26 to i16 addrspace(4)*
+  %28 = load i16, i16 addrspace(4)* %27, align 4
  %29 = zext i16 %28 to i32
  %30 = mul i32 %25, %29
  %31 = add i32 %30, %24
-  %32 = call i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr()
+  %32 = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
  %33 = zext i32 %31 to i64
-  %34 = bitcast i8 addrspace(2)* %32 to i64 addrspace(2)*
-  %35 = load i64, i64 addrspace(2)* %34, align 8
+  %34 = bitcast i8 addrspace(4)* %32 to i64 addrspace(4)*
+  %35 = load i64, i64 addrspace(4)* %34, align 8
  %36 = add i64 %35, %33
  %37 = add i64 %36, 2184
  %38 = trunc i64 %37 to i32
  %39 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(5)* %2, align 4
  %40 = load i32, i32 addrspace(5)* %3, align 4
  %41 = sext i32 %40 to i64
-  %42 = call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr()
+  %42 = call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
  %43 = call i32 @llvm.amdgcn.workitem.id.x()
  %44 = call i32 @llvm.amdgcn.workgroup.id.x()
-  %45 = getelementptr inbounds i8, i8 addrspace(2)* %42, i64 4
-  %46 = bitcast i8 addrspace(2)* %45 to i16 addrspace(2)*
-  %47 = load i16, i16 addrspace(2)* %46, align 4
+  %45 = getelementptr inbounds i8, i8 addrspace(4)* %42, i64 4
+  %46 = bitcast i8 addrspace(4)* %45 to i16 addrspace(4)*
+  %47 = load i16, i16 addrspace(4)* %46, align 4
  %48 = zext i16 %47 to i32
  %49 = mul i32 %44, %48
  %50 = add i32 %49, %43
-  %51 = call i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr()
+  %51 = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
  %52 = zext i32 %50 to i64
-  %53 = bitcast i8 addrspace(2)* %51 to i64 addrspace(2)*
-  %54 = load i64, i64 addrspace(2)* %53, align 8
+  %53 = bitcast i8 addrspace(4)* %51 to i64 addrspace(4)*
+  %54 = load i64, i64 addrspace(4)* %53, align 8
  %55 = add i64 %54, %52
  %56 = add i64 %41, %55
  %57 = getelementptr inbounds i32, i32 addrspace(1)* %39, i64 %56
@ -147,19 +147,19 @@ define amdgpu_kernel void @test_global_local(i32 addrspace(1)*) {
  %2 = alloca i32 addrspace(1)*, align 4, addrspace(5)
  store i32 addrspace(1)* %0, i32 addrspace(1)* addrspace(5)* %2, align 4
  %3 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(5)* %2, align 4
-  %4 = call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr()
+  %4 = call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
  %5 = call i32 @llvm.amdgcn.workitem.id.x()
  %6 = call i32 @llvm.amdgcn.workgroup.id.x()
-  %7 = getelementptr inbounds i8, i8 addrspace(2)* %4, i64 4
-  %8 = bitcast i8 addrspace(2)* %7 to i16 addrspace(2)*
-  %9 = load i16, i16 addrspace(2)* %8, align 4
+  %7 = getelementptr inbounds i8, i8 addrspace(4)* %4, i64 4
+  %8 = bitcast i8 addrspace(4)* %7 to i16 addrspace(4)*
+  %9 = load i16, i16 addrspace(4)* %8, align 4
  %10 = zext i16 %9 to i32
  %11 = mul i32 %6, %10
  %12 = add i32 %11, %5
-  %13 = call i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr()
+  %13 = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
  %14 = zext i32 %12 to i64
-  %15 = bitcast i8 addrspace(2)* %13 to i64 addrspace(2)*
-  %16 = load i64, i64 addrspace(2)* %15, align 8
+  %15 = bitcast i8 addrspace(4)* %13 to i64 addrspace(4)*
+  %16 = load i64, i64 addrspace(4)* %15, align 8
  %17 = add i64 %16, %14
  %18 = getelementptr inbounds i32, i32 addrspace(1)* %3, i64 %17
  store i32 1, i32 addrspace(1)* %18, align 4
@ -178,19 +178,19 @@ define amdgpu_kernel void @test_global_local(i32 addrspace(1)*) {
  fence syncscope("workgroup") acquire
  %24 = load i32, i32 addrspace(3)* getelementptr inbounds ([1 x i32], [1 x i32] addrspace(3)* @test_global_local.temp, i64 0, i64 0), align 4
  %25 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(5)* %2, align 4
-  %26 = call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr()
+  %26 = call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
  %27 = call i32 @llvm.amdgcn.workitem.id.x()
  %28 = call i32 @llvm.amdgcn.workgroup.id.x()
-  %29 = getelementptr inbounds i8, i8 addrspace(2)* %26, i64 4
-  %30 = bitcast i8 addrspace(2)* %29 to i16 addrspace(2)*
-  %31 = load i16, i16 addrspace(2)* %30, align 4
+  %29 = getelementptr inbounds i8, i8 addrspace(4)* %26, i64 4
+  %30 = bitcast i8 addrspace(4)* %29 to i16 addrspace(4)*
+  %31 = load i16, i16 addrspace(4)* %30, align 4
  %32 = zext i16 %31 to i32
  %33 = mul i32 %28, %32
  %34 = add i32 %33, %27
-  %35 = call i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr()
+  %35 = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
  %36 = zext i32 %34 to i64
-  %37 = bitcast i8 addrspace(2)* %35 to i64 addrspace(2)*
-  %38 = load i64, i64 addrspace(2)* %37, align 8
+  %37 = bitcast i8 addrspace(4)* %35 to i64 addrspace(4)*
+  %38 = load i64, i64 addrspace(4)* %37, align 8
  %39 = add i64 %38, %36
  %40 = getelementptr inbounds i32, i32 addrspace(1)* %25, i64 %39
  store i32 %24, i32 addrspace(1)* %40, align 4
--- a/test/CodeGen/AMDGPU/function-returns.ll
+++ b/test/CodeGen/AMDGPU/function-returns.ll
@ -164,7 +164,7 @@ define <5 x i32> @v5i32_func_void() #0 {
 ; GCN: s_waitcnt vmcnt(0)
 ; GCN-NEXT: s_setpc_b64
 define <8 x i32> @v8i32_func_void() #0 {
-  %ptr = load volatile <8 x i32> addrspace(1)*, <8 x i32> addrspace(1)* addrspace(2)* undef
+  %ptr = load volatile <8 x i32> addrspace(1)*, <8 x i32> addrspace(1)* addrspace(4)* undef
  %val = load <8 x i32>, <8 x i32> addrspace(1)* %ptr
  ret <8 x i32> %val
 }
@ -177,7 +177,7 @@ define <8 x i32> @v8i32_func_void() #0 {
 ; GCN: s_waitcnt vmcnt(0)
 ; GCN-NEXT: s_setpc_b64
 define <16 x i32> @v16i32_func_void() #0 {
-  %ptr = load volatile <16 x i32> addrspace(1)*, <16 x i32> addrspace(1)* addrspace(2)* undef
+  %ptr = load volatile <16 x i32> addrspace(1)*, <16 x i32> addrspace(1)* addrspace(4)* undef
  %val = load <16 x i32>, <16 x i32> addrspace(1)* %ptr
  ret <16 x i32> %val
 }
@ -194,7 +194,7 @@ define <16 x i32> @v16i32_func_void() #0 {
 ; GCN: s_waitcnt vmcnt(0)
 ; GCN-NEXT: s_setpc_b64
 define <32 x i32> @v32i32_func_void() #0 {
-  %ptr = load volatile <32 x i32> addrspace(1)*, <32 x i32> addrspace(1)* addrspace(2)* undef
+  %ptr = load volatile <32 x i32> addrspace(1)*, <32 x i32> addrspace(1)* addrspace(4)* undef
  %val = load <32 x i32>, <32 x i32> addrspace(1)* %ptr
  ret <32 x i32> %val
 }
@ -214,7 +214,7 @@ define <2 x i64> @v2i64_func_void() #0 {
 ; GCN: s_waitcnt vmcnt(0)
 ; GCN-NEXT: s_setpc_b64
 define <3 x i64> @v3i64_func_void() #0 {
-  %ptr = load volatile <3 x i64> addrspace(1)*, <3 x i64> addrspace(1)* addrspace(2)* undef
+  %ptr = load volatile <3 x i64> addrspace(1)*, <3 x i64> addrspace(1)* addrspace(4)* undef
  %val = load <3 x i64>, <3 x i64> addrspace(1)* %ptr
  ret <3 x i64> %val
 }
@ -225,7 +225,7 @@ define <3 x i64> @v3i64_func_void() #0 {
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NEXT: s_setpc_b64
 define <4 x i64> @v4i64_func_void() #0 {
-  %ptr = load volatile <4 x i64> addrspace(1)*, <4 x i64> addrspace(1)* addrspace(2)* undef
+  %ptr = load volatile <4 x i64> addrspace(1)*, <4 x i64> addrspace(1)* addrspace(4)* undef
  %val = load <4 x i64>, <4 x i64> addrspace(1)* %ptr
  ret <4 x i64> %val
 }
@ -237,7 +237,7 @@ define <4 x i64> @v4i64_func_void() #0 {
 ; GCN: s_waitcnt vmcnt(0)
 ; GCN-NEXT: s_setpc_b64
 define <5 x i64> @v5i64_func_void() #0 {
-  %ptr = load volatile <5 x i64> addrspace(1)*, <5 x i64> addrspace(1)* addrspace(2)* undef
+  %ptr = load volatile <5 x i64> addrspace(1)*, <5 x i64> addrspace(1)* addrspace(4)* undef
  %val = load <5 x i64>, <5 x i64> addrspace(1)* %ptr
  ret <5 x i64> %val
 }
@ -250,7 +250,7 @@ define <5 x i64> @v5i64_func_void() #0 {
 ; GCN: s_waitcnt vmcnt(0)
 ; GCN-NEXT: s_setpc_b64
 define <8 x i64> @v8i64_func_void() #0 {
-  %ptr = load volatile <8 x i64> addrspace(1)*, <8 x i64> addrspace(1)* addrspace(2)* undef
+  %ptr = load volatile <8 x i64> addrspace(1)*, <8 x i64> addrspace(1)* addrspace(4)* undef
  %val = load <8 x i64>, <8 x i64> addrspace(1)* %ptr
  ret <8 x i64> %val
 }
@ -267,7 +267,7 @@ define <8 x i64> @v8i64_func_void() #0 {
 ; GCN: s_waitcnt vmcnt(0)
 ; GCN-NEXT: s_setpc_b64
 define <16 x i64> @v16i64_func_void() #0 {
-  %ptr = load volatile <16 x i64> addrspace(1)*, <16 x i64> addrspace(1)* addrspace(2)* undef
+  %ptr = load volatile <16 x i64> addrspace(1)*, <16 x i64> addrspace(1)* addrspace(4)* undef
  %val = load <16 x i64>, <16 x i64> addrspace(1)* %ptr
  ret <16 x i64> %val
 }
@ -309,7 +309,7 @@ define <4 x i16> @v4i16_func_void() #0 {
 ; GFX9: v_lshrrev_b32_e32 v1, 16, v0
 ; GCN: s_setpc_b64
 define <5 x i16> @v5i16_func_void() #0 {
-  %ptr = load volatile <5 x i16> addrspace(1)*, <5 x i16> addrspace(1)* addrspace(2)* undef
+  %ptr = load volatile <5 x i16> addrspace(1)*, <5 x i16> addrspace(1)* addrspace(4)* undef
  %val = load <5 x i16>, <5 x i16> addrspace(1)* %ptr
  ret <5 x i16> %val
 }
@ -319,7 +319,7 @@ define <5 x i16> @v5i16_func_void() #0 {
 ; GFX9: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64
 define <8 x i16> @v8i16_func_void() #0 {
-  %ptr = load volatile <8 x i16> addrspace(1)*, <8 x i16> addrspace(1)* addrspace(2)* undef
+  %ptr = load volatile <8 x i16> addrspace(1)*, <8 x i16> addrspace(1)* addrspace(4)* undef
  %val = load <8 x i16>, <8 x i16> addrspace(1)* %ptr
  ret <8 x i16> %val
 }
@ -330,7 +330,7 @@ define <8 x i16> @v8i16_func_void() #0 {
 ; GFX9: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64
 define <16 x i16> @v16i16_func_void() #0 {
-  %ptr = load volatile <16 x i16> addrspace(1)*, <16 x i16> addrspace(1)* addrspace(2)* undef
+  %ptr = load volatile <16 x i16> addrspace(1)*, <16 x i16> addrspace(1)* addrspace(4)* undef
  %val = load <16 x i16>, <16 x i16> addrspace(1)* %ptr
  ret <16 x i16> %val
 }
@ -342,7 +342,7 @@ define <16 x i16> @v16i16_func_void() #0 {
 ; GCN-DAG: v14
 ; GCN-DAG: v15
 define <16 x i8> @v16i8_func_void() #0 {
-  %ptr = load volatile <16 x i8> addrspace(1)*, <16 x i8> addrspace(1)* addrspace(2)* undef
+  %ptr = load volatile <16 x i8> addrspace(1)*, <16 x i8> addrspace(1)* addrspace(4)* undef
  %val = load <16 x i8>, <16 x i8> addrspace(1)* %ptr
  ret <16 x i8> %val
 }
@ -356,7 +356,7 @@ define <16 x i8> @v16i8_func_void() #0 {
 ; GFX89-DAG: v_lshrrev_b16_e32 v1, 8, v0
 ; GCN: s_setpc_b64
 define <4  x i8> @v4i8_func_void() #0 {
-  %ptr = load volatile <4  x i8> addrspace(1)*, <4  x i8> addrspace(1)* addrspace(2)* undef
+  %ptr = load volatile <4  x i8> addrspace(1)*, <4  x i8> addrspace(1)* addrspace(4)* undef
  %val = load <4  x i8>, <4  x i8> addrspace(1)* %ptr
  ret <4  x i8> %val
 }
@ -427,7 +427,7 @@ define void @void_func_sret_struct_i8_i32({ i8, i32 } addrspace(5)* sret %arg0)
 ; GFX9: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64
 define <33 x i32> @v33i32_func_void() #0 {
-  %ptr = load volatile <33 x i32> addrspace(1)*, <33 x i32> addrspace(1)* addrspace(2)* undef
+  %ptr = load volatile <33 x i32> addrspace(1)*, <33 x i32> addrspace(1)* addrspace(4)* undef
  %val = load <33 x i32>, <33 x i32> addrspace(1)* %ptr
  ret <33 x i32> %val
 }
@ -469,7 +469,7 @@ define <33 x i32> @v33i32_func_void() #0 {
 ; GFX9: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64
 define { <32 x i32>, i32 } @struct_v32i32_i32_func_void() #0 {
-  %ptr = load volatile { <32 x i32>, i32 } addrspace(1)*, { <32 x i32>, i32 } addrspace(1)* addrspace(2)* undef
+  %ptr = load volatile { <32 x i32>, i32 } addrspace(1)*, { <32 x i32>, i32 } addrspace(1)* addrspace(4)* undef
  %val = load { <32 x i32>, i32 }, { <32 x i32>, i32 } addrspace(1)* %ptr
  ret { <32 x i32>, i32 }%val
 }
@ -511,7 +511,7 @@ define { <32 x i32>, i32 } @struct_v32i32_i32_func_void() #0 {
 ; GFX9: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64
 define { i32, <32 x i32> } @struct_i32_v32i32_func_void() #0 {
-  %ptr = load volatile { i32, <32 x i32> } addrspace(1)*, { i32, <32 x i32> } addrspace(1)* addrspace(2)* undef
+  %ptr = load volatile { i32, <32 x i32> } addrspace(1)*, { i32, <32 x i32> } addrspace(1)* addrspace(4)* undef
  %val = load { i32, <32 x i32> }, { i32, <32 x i32> } addrspace(1)* %ptr
  ret { i32, <32 x i32> }%val
 }
--- a/test/CodeGen/AMDGPU/global-constant.ll
+++ b/test/CodeGen/AMDGPU/global-constant.ll
@ -1,9 +1,9 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=NOHSA %s
 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=HSA %s

-@private1 = private unnamed_addr addrspace(2) constant [4 x float] [float 0.0, float 1.0, float 2.0, float 3.0]
-@private2 = private unnamed_addr addrspace(2) constant [4 x float] [float 4.0, float 5.0, float 6.0, float 7.0]
-@available_externally = available_externally addrspace(2) global [256 x i32] zeroinitializer
+@private1 = private unnamed_addr addrspace(4) constant [4 x float] [float 0.0, float 1.0, float 2.0, float 3.0]
+@private2 = private unnamed_addr addrspace(4) constant [4 x float] [float 4.0, float 5.0, float 6.0, float 7.0]
+@available_externally = available_externally addrspace(4) global [256 x i32] zeroinitializer

 ; GCN-LABEL: {{^}}private_test:
 ; GCN: s_getpc_b64 s{{\[}}[[PC0_LO:[0-9]+]]:[[PC0_HI:[0-9]+]]{{\]}}
@ -27,11 +27,11 @@
 ; HSA: s_addc_u32 s{{[0-9]+}}, s[[PC1_HI]], private2@rel32@hi+4

 define amdgpu_kernel void @private_test(i32 %index, float addrspace(1)* %out) {
-  %ptr = getelementptr [4 x float], [4 x float] addrspace(2) * @private1, i32 0, i32 %index
-  %val = load float, float addrspace(2)* %ptr
+  %ptr = getelementptr [4 x float], [4 x float] addrspace(4) * @private1, i32 0, i32 %index
+  %val = load float, float addrspace(4)* %ptr
  store volatile float %val, float addrspace(1)* %out
-  %ptr2 = getelementptr [4 x float], [4 x float] addrspace(2) * @private2, i32 0, i32 %index
-  %val2 = load float, float addrspace(2)* %ptr2
+  %ptr2 = getelementptr [4 x float], [4 x float] addrspace(4) * @private2, i32 0, i32 %index
+  %val2 = load float, float addrspace(4)* %ptr2
  store volatile float %val2, float addrspace(1)* %out
  ret void
 }
@ -41,8 +41,8 @@ define amdgpu_kernel void @private_test(i32 %index, float addrspace(1)* %out) {
 ; HSA: s_add_u32 s{{[0-9]+}}, s[[PC0_LO]], available_externally@gotpcrel32@lo+4
 ; HSA: s_addc_u32 s{{[0-9]+}}, s[[PC0_HI]], available_externally@gotpcrel32@hi+4
 define amdgpu_kernel void @available_externally_test(i32 addrspace(1)* %out) {
-  %ptr = getelementptr [256 x i32], [256 x i32] addrspace(2)* @available_externally, i32 0, i32 1
-  %val = load i32, i32 addrspace(2)* %ptr
+  %ptr = getelementptr [256 x i32], [256 x i32] addrspace(4)* @available_externally, i32 0, i32 1
+  %val = load i32, i32 addrspace(4)* %ptr
  store i32 %val, i32 addrspace(1)* %out
  ret void
 }
--- a/test/CodeGen/AMDGPU/gv-const-addrspace.ll
+++ b/test/CodeGen/AMDGPU/gv-const-addrspace.ll
@ -4,9 +4,9 @@
 ; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s


-@b = internal addrspace(2) constant [1 x i16] [ i16 7 ], align 2
+@b = internal addrspace(4) constant [1 x i16] [ i16 7 ], align 2

-@float_gv = internal unnamed_addr addrspace(2) constant [5 x float] [float 0.0, float 1.0, float 2.0, float 3.0, float 4.0], align 4
+@float_gv = internal unnamed_addr addrspace(4) constant [5 x float] [float 0.0, float 1.0, float 2.0, float 3.0, float 4.0], align 4

 ; FUNC-LABEL: {{^}}float:
 ; GCN: s_load_dword
@ -17,13 +17,13 @@
 ; EG-NOT: MOV
 define amdgpu_kernel void @float(float addrspace(1)* %out, i32 %index) {
 entry:
-  %0 = getelementptr inbounds [5 x float], [5 x float] addrspace(2)* @float_gv, i32 0, i32 %index
-  %1 = load float, float addrspace(2)* %0
+  %0 = getelementptr inbounds [5 x float], [5 x float] addrspace(4)* @float_gv, i32 0, i32 %index
+  %1 = load float, float addrspace(4)* %0
  store float %1, float addrspace(1)* %out
  ret void
 }

-@i32_gv = internal unnamed_addr addrspace(2) constant [5 x i32] [i32 0, i32 1, i32 2, i32 3, i32 4], align 4
+@i32_gv = internal unnamed_addr addrspace(4) constant [5 x i32] [i32 0, i32 1, i32 2, i32 3, i32 4], align 4

 ; FUNC-LABEL: {{^}}i32:

@ -35,8 +35,8 @@ entry:
 ; EG-NOT: MOV
 define amdgpu_kernel void @i32(i32 addrspace(1)* %out, i32 %index) {
 entry:
-  %0 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(2)* @i32_gv, i32 0, i32 %index
-  %1 = load i32, i32 addrspace(2)* %0
+  %0 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(4)* @i32_gv, i32 0, i32 %index
+  %1 = load i32, i32 addrspace(4)* %0
  store i32 %1, i32 addrspace(1)* %out
  ret void
 }
@ -44,7 +44,7 @@ entry:

 %struct.foo = type { float, [5 x i32] }

-@struct_foo_gv = internal unnamed_addr addrspace(2) constant [1 x %struct.foo] [ %struct.foo { float 16.0, [5 x i32] [i32 0, i32 1, i32 2, i32 3, i32 4] } ]
+@struct_foo_gv = internal unnamed_addr addrspace(4) constant [1 x %struct.foo] [ %struct.foo { float 16.0, [5 x i32] [i32 0, i32 1, i32 2, i32 3, i32 4] } ]

 ; FUNC-LABEL: {{^}}struct_foo_gv_load:
 ; GCN: s_load_dword
@ -54,13 +54,13 @@ entry:
 ; EG-NOT: MOVA_INT
 ; EG-NOT: MOV
 define amdgpu_kernel void @struct_foo_gv_load(i32 addrspace(1)* %out, i32 %index) {
-  %gep = getelementptr inbounds [1 x %struct.foo], [1 x %struct.foo] addrspace(2)* @struct_foo_gv, i32 0, i32 0, i32 1, i32 %index
-  %load = load i32, i32 addrspace(2)* %gep, align 4
+  %gep = getelementptr inbounds [1 x %struct.foo], [1 x %struct.foo] addrspace(4)* @struct_foo_gv, i32 0, i32 0, i32 1, i32 %index
+  %load = load i32, i32 addrspace(4)* %gep, align 4
  store i32 %load, i32 addrspace(1)* %out, align 4
  ret void
 }

-@array_v1_gv = internal addrspace(2) constant [4 x <1 x i32>] [ <1 x i32> <i32 1>,
+@array_v1_gv = internal addrspace(4) constant [4 x <1 x i32>] [ <1 x i32> <i32 1>,
                                                                <1 x i32> <i32 2>,
                                                                <1 x i32> <i32 3>,
                                                                <1 x i32> <i32 4> ]
@ -73,8 +73,8 @@ define amdgpu_kernel void @struct_foo_gv_load(i32 addrspace(1)* %out, i32 %index
 ; EG-NOT: MOVA_INT
 ; EG-NOT: MOV
 define amdgpu_kernel void @array_v1_gv_load(<1 x i32> addrspace(1)* %out, i32 %index) {
-  %gep = getelementptr inbounds [4 x <1 x i32>], [4 x <1 x i32>] addrspace(2)* @array_v1_gv, i32 0, i32 %index
-  %load = load <1 x i32>, <1 x i32> addrspace(2)* %gep, align 4
+  %gep = getelementptr inbounds [4 x <1 x i32>], [4 x <1 x i32>] addrspace(4)* @array_v1_gv, i32 0, i32 %index
+  %load = load <1 x i32>, <1 x i32> addrspace(4)* %gep, align 4
  store <1 x i32> %load, <1 x i32> addrspace(1)* %out, align 4
  ret void
 }
@ -90,8 +90,8 @@ entry:
  br i1 %0, label %if, label %else

 if:
-  %1 = getelementptr inbounds [5 x float], [5 x float] addrspace(2)* @float_gv, i32 0, i32 %index
-  %2 = load float, float addrspace(2)* %1
+  %1 = getelementptr inbounds [5 x float], [5 x float] addrspace(4)* @float_gv, i32 0, i32 %index
+  %2 = load float, float addrspace(4)* %1
  store float %2, float addrspace(1)* %out
  br label %endif

--- a/test/CodeGen/AMDGPU/hsa-func-align.ll
+++ b/test/CodeGen/AMDGPU/hsa-func-align.ll
@ -10,9 +10,9 @@

 ; HSA: .globl simple_align16
 ; HSA: .p2align 5
-define void @simple_align16(i32 addrspace(1)* addrspace(2)* %ptr.out) align 32 {
+define void @simple_align16(i32 addrspace(1)* addrspace(4)* %ptr.out) align 32 {
 entry:
-  %out = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(2)* %ptr.out
+  %out = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* %ptr.out
  store i32 0, i32 addrspace(1)* %out
  ret void
 }
--- a/test/CodeGen/AMDGPU/hsa-func.ll
+++ b/test/CodeGen/AMDGPU/hsa-func.ll
@ -51,9 +51,9 @@
 ; HSA: .size   simple, .Lfunc_end0-simple
 ; HSA: ; Function info:
 ; HSA-NOT: COMPUTE_PGM_RSRC2
-define void @simple(i32 addrspace(1)* addrspace(2)* %ptr.out) {
+define void @simple(i32 addrspace(1)* addrspace(4)* %ptr.out) {
 entry:
-  %out = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(2)* %ptr.out
+  %out = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* %ptr.out
  store i32 0, i32 addrspace(1)* %out
  ret void
 }
@ -61,9 +61,9 @@ entry:
 ; Ignore explicit alignment that is too low.
 ; HSA: .globl simple_align2
 ; HSA: .p2align 2
-define void @simple_align2(i32 addrspace(1)* addrspace(2)* %ptr.out) align 2 {
+define void @simple_align2(i32 addrspace(1)* addrspace(4)* %ptr.out) align 2 {
 entry:
-  %out = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(2)* %ptr.out
+  %out = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* %ptr.out
  store i32 0, i32 addrspace(1)* %out
  ret void
 }
--- a/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full.ll
+++ b/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full.ll
@ -581,7 +581,7 @@ define amdgpu_kernel void @test_multi_arg(i32 %a, <2 x i16> %b, <3 x i8> %c)
 ; CHECK-NEXT:       ValueType:     I8
 ; CHECK-NEXT:       AddrSpaceQual: Global
 define amdgpu_kernel void @test_addr_space(i32 addrspace(1)* %g,
-                                           i32 addrspace(2)* %c,
+                                           i32 addrspace(4)* %c,
                                           i32 addrspace(3)* %l)
    !kernel_arg_addr_space !50 !kernel_arg_access_qual !23 !kernel_arg_type !51
    !kernel_arg_base_type !51 !kernel_arg_type_qual !25 {
--- a/test/CodeGen/AMDGPU/image-schedule.ll
+++ b/test/CodeGen/AMDGPU/image-schedule.ll
@ -20,21 +20,21 @@ define dllexport amdgpu_cs void @_amdgpu_cs_main(i32 inreg %arg, i32 inreg %arg1
  %.0.vec.insert = insertelement <2 x i32> undef, i32 %arg2, i32 0
  %.4.vec.insert = shufflevector <2 x i32> %.0.vec.insert, <2 x i32> %tmp6, <2 x i32> <i32 0, i32 3>
  %tmp7 = bitcast <2 x i32> %.4.vec.insert to i64
-  %tmp8 = inttoptr i64 %tmp7 to [4294967295 x i8] addrspace(2)*
+  %tmp8 = inttoptr i64 %tmp7 to [4294967295 x i8] addrspace(4)*
  %tmp9 = add <3 x i32> %arg3, %arg5
-  %tmp10 = getelementptr [4294967295 x i8], [4294967295 x i8] addrspace(2)* %tmp8, i64 0, i64 32
-  %tmp11 = bitcast i8 addrspace(2)* %tmp10 to <8 x i32> addrspace(2)*, !amdgpu.uniform !0
-  %tmp12 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp11, align 16
+  %tmp10 = getelementptr [4294967295 x i8], [4294967295 x i8] addrspace(4)* %tmp8, i64 0, i64 32
+  %tmp11 = bitcast i8 addrspace(4)* %tmp10 to <8 x i32> addrspace(4)*, !amdgpu.uniform !0
+  %tmp12 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp11, align 16
  %tmp13 = shufflevector <3 x i32> %tmp9, <3 x i32> undef, <2 x i32> <i32 0, i32 1>
  %tmp14 = call <4 x float> @llvm.amdgcn.image.load.v4f32.v2i32.v8i32(<2 x i32> %tmp13, <8 x i32> %tmp12, i32 15, i1 false, i1 false, i1 false, i1 false) #0
-  %tmp15 = inttoptr i64 %tmp7 to <8 x i32> addrspace(2)*
-  %tmp16 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp15, align 16
+  %tmp15 = inttoptr i64 %tmp7 to <8 x i32> addrspace(4)*
+  %tmp16 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp15, align 16
  call void @llvm.amdgcn.image.store.v4f32.v2i32.v8i32(<4 x float> %tmp14, <2 x i32> %tmp13, <8 x i32> %tmp16, i32 15, i1 false, i1 false, i1 false, i1 false) #0
-  %tmp17 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp15, align 16
+  %tmp17 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp15, align 16
  %tmp18 = call <4 x float> @llvm.amdgcn.image.load.v4f32.v2i32.v8i32(<2 x i32> %tmp13, <8 x i32> %tmp17, i32 15, i1 false, i1 false, i1 false, i1 false) #0
-  %tmp19 = getelementptr [4294967295 x i8], [4294967295 x i8] addrspace(2)* %tmp8, i64 0, i64 64
-  %tmp20 = bitcast i8 addrspace(2)* %tmp19 to <8 x i32> addrspace(2)*, !amdgpu.uniform !0
-  %tmp21 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp20, align 16
+  %tmp19 = getelementptr [4294967295 x i8], [4294967295 x i8] addrspace(4)* %tmp8, i64 0, i64 64
+  %tmp20 = bitcast i8 addrspace(4)* %tmp19 to <8 x i32> addrspace(4)*, !amdgpu.uniform !0
+  %tmp21 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp20, align 16
  call void @llvm.amdgcn.image.store.v4f32.v2i32.v8i32(<4 x float> %tmp18, <2 x i32> %tmp13, <8 x i32> %tmp21, i32 15, i1 false, i1 false, i1 false, i1 false) #0
  ret void
 }
--- a/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
+++ b/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
@ -10,8 +10,8 @@

 ; GFX9-NOT: lshr
 ; GFX9: s_pack_lh_b32_b16 s{{[0-9]+}}, 0x3e7, [[VEC]]
-define amdgpu_kernel void @s_insertelement_v2i16_0(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr) #0 {
-  %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr
+define amdgpu_kernel void @s_insertelement_v2i16_0(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr) #0 {
+  %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
  %vecins = insertelement <2 x i16> %vec, i16 999, i32 0
  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
  ret void
@ -28,8 +28,8 @@ define amdgpu_kernel void @s_insertelement_v2i16_0(<2 x i16> addrspace(1)* %out,
 ; GFX9-NOT: [[ELT0]]
 ; GFX9-NOT: [[VEC]]
 ; GFX9: s_pack_lh_b32_b16 s{{[0-9]+}}, [[ELT0]], [[VEC]]
-define amdgpu_kernel void @s_insertelement_v2i16_0_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i16 %elt) #0 {
-  %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr
+define amdgpu_kernel void @s_insertelement_v2i16_0_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, i16 %elt) #0 {
+  %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
  %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
  ret void
@ -48,8 +48,8 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reg(<2 x i16> addrspace(1)* %
 ; GFX9: s_lshr_b32 [[ELT1:s[0-9]+]], [[VEC]], 16
 ; GFX9-DAG: s_pack_ll_b32_b16 s{{[0-9]+}}, [[ELT0]], [[ELT1]]
 ; GFX9-DAG: ; use [[ELT1]]
-define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i16 %elt) #0 {
-  %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr
+define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, i16 %elt) #0 {
+  %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
  %elt1 = extractelement <2 x i16> %vec, i32 1
  %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
@ -68,8 +68,8 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(<2 x i16> ad
 ; GFX9-NOT: [[ELT0]]
 ; GFX9-NOT: [[VEC]]
 ; GFX9: s_pack_hh_b32_b16 s{{[0-9]+}}, [[ELT_ARG]], [[VEC]]
-define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i32 %elt.arg) #0 {
-  %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr
+define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, i32 %elt.arg) #0 {
+  %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
  %elt.hi = lshr i32 %elt.arg, 16
  %elt = trunc i32 %elt.hi to i16
  %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
@ -88,8 +88,8 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)*
 ; GFX9: s_lshr_b32 [[ELT1:s[0-9]+]], [[ELT_ARG]], 16
 ; GFX9: s_pack_lh_b32_b16 s{{[0-9]+}}, [[ELT1]], [[VEC]]
 ; GFX9: ; use [[ELT1]]
-define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i32 %elt.arg) #0 {
-  %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr
+define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, i32 %elt.arg) #0 {
+  %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
  %elt.hi = lshr i32 %elt.arg, 16
  %elt = trunc i32 %elt.hi to i16
  %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
@ -113,8 +113,8 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(<2 x i16> a
 ; GFX9: s_pack_ll_b32_b16 s{{[0-9]+}}, [[ELT_HI]], [[VEC_HI]]
 ; GFX9: ; use [[ELT_HI]]
 ; GFX9: ; use [[VEC_HI]]
-define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i32 %elt.arg) #0 {
-  %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr
+define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, i32 %elt.arg) #0 {
+  %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
  %elt.hi = lshr i32 %elt.arg, 16
  %elt = trunc i32 %elt.hi to i16
  %vec.hi = extractelement <2 x i16> %vec, i32 1
@ -137,8 +137,8 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(<2 x i
 ; CIVI: s_or_b32 [[INS:s[0-9]+]], [[ELT0]], 0x3e70000

 ; GFX9: s_pack_ll_b32_b16 s{{[0-9]+}}, [[VEC]], 0x3e7
-define amdgpu_kernel void @s_insertelement_v2i16_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr) #0 {
-  %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr
+define amdgpu_kernel void @s_insertelement_v2i16_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr) #0 {
+  %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
  %vecins = insertelement <2 x i16> %vec, i16 999, i32 1
  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
  ret void
@ -153,8 +153,8 @@ define amdgpu_kernel void @s_insertelement_v2i16_1(<2 x i16> addrspace(1)* %out,

 ; GCN-NOT: shlr
 ; GFX9: s_pack_ll_b32_b16 s{{[0-9]+}}, [[VEC]], [[ELT1]]
-define amdgpu_kernel void @s_insertelement_v2i16_1_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i16 %elt) #0 {
-  %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr
+define amdgpu_kernel void @s_insertelement_v2i16_1_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, i16 %elt) #0 {
+  %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
  %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 1
  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
  ret void
@ -167,8 +167,8 @@ define amdgpu_kernel void @s_insertelement_v2i16_1_reg(<2 x i16> addrspace(1)* %

 ; GFX9: s_lshr_b32 [[ELT1:s[0-9]+]], [[VEC]], 16
 ; GFX9: s_pack_ll_b32_b16 s{{[0-9]+}}, 0x4500, [[ELT1]]
-define amdgpu_kernel void @s_insertelement_v2f16_0(<2 x half> addrspace(1)* %out, <2 x half> addrspace(2)* %vec.ptr) #0 {
-  %vec = load <2 x half>, <2 x half> addrspace(2)* %vec.ptr
+define amdgpu_kernel void @s_insertelement_v2f16_0(<2 x half> addrspace(1)* %out, <2 x half> addrspace(4)* %vec.ptr) #0 {
+  %vec = load <2 x half>, <2 x half> addrspace(4)* %vec.ptr
  %vecins = insertelement <2 x half> %vec, half 5.000000e+00, i32 0
  store <2 x half> %vecins, <2 x half> addrspace(1)* %out
  ret void
@ -182,8 +182,8 @@ define amdgpu_kernel void @s_insertelement_v2f16_0(<2 x half> addrspace(1)* %out
 ; CIVI: s_or_b32 [[INS:s[0-9]+]], [[ELT0]], 0x45000000

 ; GFX9: s_pack_ll_b32_b16 s{{[0-9]+}}, [[VEC]], 0x4500
-define amdgpu_kernel void @s_insertelement_v2f16_1(<2 x half> addrspace(1)* %out, <2 x half> addrspace(2)* %vec.ptr) #0 {
-  %vec = load <2 x half>, <2 x half> addrspace(2)* %vec.ptr
+define amdgpu_kernel void @s_insertelement_v2f16_1(<2 x half> addrspace(1)* %out, <2 x half> addrspace(4)* %vec.ptr) #0 {
+  %vec = load <2 x half>, <2 x half> addrspace(4)* %vec.ptr
  %vecins = insertelement <2 x half> %vec, half 5.000000e+00, i32 1
  store <2 x half> %vecins, <2 x half> addrspace(1)* %out
  ret void
@ -399,9 +399,9 @@ define amdgpu_kernel void @v_insertelement_v2f16_1_inlineimm(<2 x half> addrspac
 ; GCN-DAG: s_lshl_b32 [[MASK:s[0-9]+]], 0xffff, [[SCALED_IDX]]
 ; GCN: v_bfi_b32 [[RESULT:v[0-9]+]], [[MASK]], [[K]], [[VVEC]]
 ; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define amdgpu_kernel void @s_insertelement_v2i16_dynamic(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i32 addrspace(2)* %idx.ptr) #0 {
-  %idx = load volatile i32, i32 addrspace(2)* %idx.ptr
-  %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr
+define amdgpu_kernel void @s_insertelement_v2i16_dynamic(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, i32 addrspace(4)* %idx.ptr) #0 {
+  %idx = load volatile i32, i32 addrspace(4)* %idx.ptr
+  %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
  %vecins = insertelement <2 x i16> %vec, i16 999, i32 %idx
  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
  ret void
--- a/test/CodeGen/AMDGPU/invariant-load-no-alias-store.ll
+++ b/test/CodeGen/AMDGPU/invariant-load-no-alias-store.ll
@ -22,8 +22,8 @@ define amdgpu_kernel void @test_merge_store_constant_i16_invariant_global_pointe
 ; GCN: s_load_dwordx2 s{{\[}}[[SPTR_LO:[0-9]+]]:[[SPTR_HI:[0-9]+]]{{\]}}
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x1c8007b
 ; GCN: buffer_store_dword [[K]], off, s{{\[}}[[SPTR_LO]]:
-define amdgpu_kernel void @test_merge_store_constant_i16_invariant_constant_pointer_load(i16 addrspace(1)* addrspace(2)* dereferenceable(4096) nonnull %in) #0 {
-  %ptr = load i16 addrspace(1)*, i16 addrspace(1)* addrspace(2)* %in, !invariant.load !0
+define amdgpu_kernel void @test_merge_store_constant_i16_invariant_constant_pointer_load(i16 addrspace(1)* addrspace(4)* dereferenceable(4096) nonnull %in) #0 {
+  %ptr = load i16 addrspace(1)*, i16 addrspace(1)* addrspace(4)* %in, !invariant.load !0
  %ptr.1 = getelementptr i16, i16 addrspace(1)* %ptr, i64 1
  store i16 123, i16 addrspace(1)* %ptr, align 4
  store i16 456, i16 addrspace(1)* %ptr.1
--- a/test/CodeGen/AMDGPU/llvm.SI.load.dword.ll
+++ b/test/CodeGen/AMDGPU/llvm.SI.load.dword.ll
@ -14,10 +14,10 @@
 ; CHECK: s_movk_i32 [[K:s[0-9]+]], 0x4d2 ; encoding
 ; CHECK: buffer_load_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, [[K]] idxen offen offset:65535 glc slc

-define amdgpu_vs void @main([17 x <4 x i32>] addrspace(2)* byval %arg, [32 x <4 x i32>] addrspace(2)* byval %arg1, [16 x <32 x i8>] addrspace(2)* byval %arg2, [2 x <4 x i32>] addrspace(2)* byval %arg3, [17 x <4 x i32>] addrspace(2)* inreg %arg4, [17 x <4 x i32>] addrspace(2)* inreg %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9) {
+define amdgpu_vs void @main([17 x <4 x i32>] addrspace(4)* byval %arg, [32 x <4 x i32>] addrspace(4)* byval %arg1, [16 x <32 x i8>] addrspace(4)* byval %arg2, [2 x <4 x i32>] addrspace(4)* byval %arg3, [17 x <4 x i32>] addrspace(4)* inreg %arg4, [17 x <4 x i32>] addrspace(4)* inreg %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9) {
 main_body:
-  %tmp = getelementptr [2 x <4 x i32>], [2 x <4 x i32>] addrspace(2)* %arg3, i64 0, i32 1
-  %tmp10 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp, !tbaa !0
+  %tmp = getelementptr [2 x <4 x i32>], [2 x <4 x i32>] addrspace(4)* %arg3, i64 0, i32 1
+  %tmp10 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp, !tbaa !0
  %tmp11 = shl i32 %arg6, 2
  %tmp12 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<4 x i32> %tmp10, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 0)
  %tmp13 = bitcast i32 %tmp12 to float
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll
@ -7,13 +7,13 @@
 ; GCN: enable_sgpr_dispatch_ptr = 1
 ; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
 define amdgpu_kernel void @test(i32 addrspace(1)* %out) {
-  %dispatch_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0
-  %header_ptr = bitcast i8 addrspace(2)* %dispatch_ptr to i32 addrspace(2)*
-  %value = load i32, i32 addrspace(2)* %header_ptr
+  %dispatch_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #0
+  %header_ptr = bitcast i8 addrspace(4)* %dispatch_ptr to i32 addrspace(4)*
+  %value = load i32, i32 addrspace(4)* %header_ptr
  store i32 %value, i32 addrspace(1)* %out
  ret void
 }

-declare noalias i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0
+declare noalias i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #0

 attributes #0 = { readnone }
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.implicit.buffer.ptr.hsa.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.implicit.buffer.ptr.hsa.ll
@ -2,23 +2,23 @@

 ; ERROR: in function test_kernel{{.*}}: non-hsa intrinsic with hsa target
 define amdgpu_kernel void @test_kernel(i32 addrspace(1)* %out) #1 {
-  %implicit_buffer_ptr = call i8 addrspace(2)* @llvm.amdgcn.implicit.buffer.ptr()
-  %header_ptr = bitcast i8 addrspace(2)* %implicit_buffer_ptr to i32 addrspace(2)*
-  %value = load i32, i32 addrspace(2)* %header_ptr
+  %implicit_buffer_ptr = call i8 addrspace(4)* @llvm.amdgcn.implicit.buffer.ptr()
+  %header_ptr = bitcast i8 addrspace(4)* %implicit_buffer_ptr to i32 addrspace(4)*
+  %value = load i32, i32 addrspace(4)* %header_ptr
  store i32 %value, i32 addrspace(1)* %out
  ret void
 }

 ; ERROR: in function test_func{{.*}}: non-hsa intrinsic with hsa target
 define void @test_func(i32 addrspace(1)* %out) #1 {
-  %implicit_buffer_ptr = call i8 addrspace(2)* @llvm.amdgcn.implicit.buffer.ptr()
-  %header_ptr = bitcast i8 addrspace(2)* %implicit_buffer_ptr to i32 addrspace(2)*
-  %value = load i32, i32 addrspace(2)* %header_ptr
+  %implicit_buffer_ptr = call i8 addrspace(4)* @llvm.amdgcn.implicit.buffer.ptr()
+  %header_ptr = bitcast i8 addrspace(4)* %implicit_buffer_ptr to i32 addrspace(4)*
+  %value = load i32, i32 addrspace(4)* %header_ptr
  store i32 %value, i32 addrspace(1)* %out
  ret void
 }

-declare i8 addrspace(2)* @llvm.amdgcn.implicit.buffer.ptr() #0
+declare i8 addrspace(4)* @llvm.amdgcn.implicit.buffer.ptr() #0

 attributes #0 = { nounwind readnone speculatable }
 attributes #1 = { nounwind  }
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.implicit.buffer.ptr.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.implicit.buffer.ptr.ll
@ -10,9 +10,9 @@
 define amdgpu_ps i32 @test_ps() #1 {
  %alloca = alloca i32, addrspace(5)
  store volatile i32 0, i32 addrspace(5)* %alloca
-  %implicit_buffer_ptr = call i8 addrspace(2)* @llvm.amdgcn.implicit.buffer.ptr()
-  %buffer_ptr = bitcast i8 addrspace(2)* %implicit_buffer_ptr to i32 addrspace(2)*
-  %value = load volatile i32, i32 addrspace(2)* %buffer_ptr
+  %implicit_buffer_ptr = call i8 addrspace(4)* @llvm.amdgcn.implicit.buffer.ptr()
+  %buffer_ptr = bitcast i8 addrspace(4)* %implicit_buffer_ptr to i32 addrspace(4)*
+  %value = load volatile i32, i32 addrspace(4)* %buffer_ptr
  ret i32 %value
 }

@ -23,13 +23,13 @@ define amdgpu_ps i32 @test_ps() #1 {
 define amdgpu_cs i32 @test_cs() #1 {
  %alloca = alloca i32, addrspace(5)
  store volatile i32 0, i32 addrspace(5)* %alloca
-  %implicit_buffer_ptr = call i8 addrspace(2)* @llvm.amdgcn.implicit.buffer.ptr()
-  %buffer_ptr = bitcast i8 addrspace(2)* %implicit_buffer_ptr to i32 addrspace(2)*
-  %value = load volatile i32, i32 addrspace(2)* %buffer_ptr
+  %implicit_buffer_ptr = call i8 addrspace(4)* @llvm.amdgcn.implicit.buffer.ptr()
+  %buffer_ptr = bitcast i8 addrspace(4)* %implicit_buffer_ptr to i32 addrspace(4)*
+  %value = load volatile i32, i32 addrspace(4)* %buffer_ptr
  ret i32 %value
 }

-declare i8 addrspace(2)* @llvm.amdgcn.implicit.buffer.ptr() #0
+declare i8 addrspace(4)* @llvm.amdgcn.implicit.buffer.ptr() #0

 attributes #0 = { nounwind readnone speculatable }
 attributes #1 = { nounwind }
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll
@ -11,9 +11,9 @@

 ; HSA: s_load_dword s0, s[4:5], 0x0
 define amdgpu_kernel void @kernel_implicitarg_ptr_empty() #0 {
-  %implicitarg.ptr = call i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr()
-  %cast = bitcast i8 addrspace(2)* %implicitarg.ptr to i32 addrspace(2)*
-  %load = load volatile i32, i32 addrspace(2)* %cast
+  %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
+  %cast = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
+  %load = load volatile i32, i32 addrspace(4)* %cast
  ret void
 }

@ -26,9 +26,9 @@ define amdgpu_kernel void @kernel_implicitarg_ptr_empty() #0 {

 ; HSA: s_load_dword s0, s[4:5], 0x1c
 define amdgpu_kernel void @kernel_implicitarg_ptr([112 x i8]) #0 {
-  %implicitarg.ptr = call i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr()
-  %cast = bitcast i8 addrspace(2)* %implicitarg.ptr to i32 addrspace(2)*
-  %load = load volatile i32, i32 addrspace(2)* %cast
+  %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
+  %cast = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
+  %load = load volatile i32, i32 addrspace(4)* %cast
  ret void
 }

@ -38,9 +38,9 @@ define amdgpu_kernel void @kernel_implicitarg_ptr([112 x i8]) #0 {
 ; GCN-NEXT: s_waitcnt
 ; GCN-NEXT: s_setpc_b64
 define void @func_implicitarg_ptr() #1 {
-  %implicitarg.ptr = call i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr()
-  %cast = bitcast i8 addrspace(2)* %implicitarg.ptr to i32 addrspace(2)*
-  %load = load volatile i32, i32 addrspace(2)* %cast
+  %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
+  %cast = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
+  %load = load volatile i32, i32 addrspace(4)* %cast
  ret void
 }

@ -86,12 +86,12 @@ define void @func_call_implicitarg_ptr_func() #1 {
 ; GCN: s_load_dword s{{[0-9]+}}, s[6:7], 0x0{{$}}
 ; GCN: s_load_dword s{{[0-9]+}}, s[8:9], 0x0{{$}}
 define void @func_kernarg_implicitarg_ptr() #1 {
-  %kernarg.segment.ptr = call i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr()
-  %implicitarg.ptr = call i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr()
-  %cast.kernarg.segment.ptr = bitcast i8 addrspace(2)* %kernarg.segment.ptr to i32 addrspace(2)*
-  %cast.implicitarg = bitcast i8 addrspace(2)* %implicitarg.ptr to i32 addrspace(2)*
-  %load0 = load volatile i32, i32 addrspace(2)* %cast.kernarg.segment.ptr
-  %load1 = load volatile i32, i32 addrspace(2)* %cast.implicitarg
+  %kernarg.segment.ptr = call i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+  %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
+  %cast.kernarg.segment.ptr = bitcast i8 addrspace(4)* %kernarg.segment.ptr to i32 addrspace(4)*
+  %cast.implicitarg = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
+  %load0 = load volatile i32, i32 addrspace(4)* %cast.kernarg.segment.ptr
+  %load1 = load volatile i32, i32 addrspace(4)* %cast.implicitarg
  ret void
 }

@ -106,8 +106,8 @@ define amdgpu_kernel void @kernel_call_kernarg_implicitarg_ptr_func([112 x i8])
  ret void
 }

-declare i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr() #2
-declare i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() #2
+declare i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() #2
+declare i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() #2

 attributes #0 = { nounwind noinline }
 attributes #1 = { nounwind noinline }
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll
@ -11,10 +11,10 @@

 ; OS-UNKNOWN: s_load_dword s{{[0-9]+}}, s[0:1], 0xa
 define amdgpu_kernel void @test(i32 addrspace(1)* %out) #1 {
-  %kernarg.segment.ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr()
-  %header.ptr = bitcast i8 addrspace(2)* %kernarg.segment.ptr to i32 addrspace(2)*
-  %gep = getelementptr i32, i32 addrspace(2)* %header.ptr, i64 10
-  %value = load i32, i32 addrspace(2)* %gep
+  %kernarg.segment.ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+  %header.ptr = bitcast i8 addrspace(4)* %kernarg.segment.ptr to i32 addrspace(4)*
+  %gep = getelementptr i32, i32 addrspace(4)* %header.ptr, i64 10
+  %value = load i32, i32 addrspace(4)* %gep
  store i32 %value, i32 addrspace(1)* %out
  ret void
 }
@ -23,10 +23,10 @@ define amdgpu_kernel void @test(i32 addrspace(1)* %out) #1 {
 ; 10 + 9 (36 prepended implicit bytes) + 2(out pointer) = 21 = 0x15
 ; OS-UNKNOWN: s_load_dword s{{[0-9]+}}, s[0:1], 0x15
 define amdgpu_kernel void @test_implicit(i32 addrspace(1)* %out) #1 {
-  %implicitarg.ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr()
-  %header.ptr = bitcast i8 addrspace(2)* %implicitarg.ptr to i32 addrspace(2)*
-  %gep = getelementptr i32, i32 addrspace(2)* %header.ptr, i64 10
-  %value = load i32, i32 addrspace(2)* %gep
+  %implicitarg.ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
+  %header.ptr = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
+  %gep = getelementptr i32, i32 addrspace(4)* %header.ptr, i64 10
+  %value = load i32, i32 addrspace(4)* %gep
  store i32 %value, i32 addrspace(1)* %out
  ret void
 }
@ -42,9 +42,9 @@ define amdgpu_kernel void @test_implicit(i32 addrspace(1)* %out) #1 {
 ; MESA: buffer_store_dword [[V_VAL]]
 ; HSA: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[V_VAL]]
 define amdgpu_kernel void @test_implicit_alignment(i32 addrspace(1)* %out, <2 x i8> %in) #1 {
-  %implicitarg.ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr()
-  %arg.ptr = bitcast i8 addrspace(2)* %implicitarg.ptr to i32 addrspace(2)*
-  %val = load i32, i32 addrspace(2)* %arg.ptr
+  %implicitarg.ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
+  %arg.ptr = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
+  %val = load i32, i32 addrspace(4)* %arg.ptr
  store i32 %val, i32 addrspace(1)* %out
  ret void
 }
@ -53,16 +53,16 @@ define amdgpu_kernel void @test_implicit_alignment(i32 addrspace(1)* %out, <2 x
 ; HSA: enable_sgpr_kernarg_segment_ptr = 1
 ; HSA: s_load_dword s{{[0-9]+}}, s[4:5]
 define amdgpu_kernel void @test_no_kernargs() #1 {
-  %kernarg.segment.ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr()
-  %header.ptr = bitcast i8 addrspace(2)* %kernarg.segment.ptr to i32 addrspace(2)*
-  %gep = getelementptr i32, i32 addrspace(2)* %header.ptr, i64 10
-  %value = load i32, i32 addrspace(2)* %gep
+  %kernarg.segment.ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+  %header.ptr = bitcast i8 addrspace(4)* %kernarg.segment.ptr to i32 addrspace(4)*
+  %gep = getelementptr i32, i32 addrspace(4)* %header.ptr, i64 10
+  %value = load i32, i32 addrspace(4)* %gep
  store volatile i32 %value, i32 addrspace(1)* undef
  ret void
 }

-declare i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() #0
-declare i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr() #0
+declare i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() #0
+declare i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() #0

 attributes #0 = { nounwind readnone }
 attributes #1 = { nounwind }
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.queue.ptr.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.queue.ptr.ll
@ -7,13 +7,13 @@
 ; GCN: enable_sgpr_queue_ptr = 1
 ; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
 define amdgpu_kernel void @test(i32 addrspace(1)* %out) {
-  %queue_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #0
-  %header_ptr = bitcast i8 addrspace(2)* %queue_ptr to i32 addrspace(2)*
-  %value = load i32, i32 addrspace(2)* %header_ptr
+  %queue_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.queue.ptr() #0
+  %header_ptr = bitcast i8 addrspace(4)* %queue_ptr to i32 addrspace(4)*
+  %value = load i32, i32 addrspace(4)* %header_ptr
  store i32 %value, i32 addrspace(1)* %out
  ret void
 }

-declare noalias i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #0
+declare noalias i8 addrspace(4)* @llvm.amdgcn.queue.ptr() #0

 attributes #0 = { nounwind readnone }
--- a/test/CodeGen/AMDGPU/llvm.memcpy.ll
+++ b/test/CodeGen/AMDGPU/llvm.memcpy.ll
@ -3,7 +3,7 @@

 declare void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* nocapture, i8 addrspace(3)* nocapture, i32, i1) nounwind
 declare void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* nocapture, i8 addrspace(1)* nocapture, i64, i1) nounwind
-declare void @llvm.memcpy.p1i8.p2i8.i64(i8 addrspace(1)* nocapture, i8 addrspace(2)* nocapture, i64, i1) nounwind
+declare void @llvm.memcpy.p1i8.p2i8.i64(i8 addrspace(1)* nocapture, i8 addrspace(4)* nocapture, i64, i1) nounwind


 ; FUNC-LABEL: {{^}}test_small_memcpy_i64_lds_to_lds_align1:
@ -328,8 +328,8 @@ define amdgpu_kernel void @test_small_memcpy_i64_global_to_global_align16(i64 ad
 }

 ; Test shouldConvertConstantLoadToIntImm
-@hello.align4 = private unnamed_addr addrspace(2) constant [16 x i8] c"constant string\00", align 4
-@hello.align1 = private unnamed_addr addrspace(2) constant [16 x i8] c"constant string\00", align 1
+@hello.align4 = private unnamed_addr addrspace(4) constant [16 x i8] c"constant string\00", align 4
+@hello.align1 = private unnamed_addr addrspace(4) constant [16 x i8] c"constant string\00", align 1

 ; FUNC-LABEL: {{^}}test_memcpy_const_string_align4:
 ; SI: s_getpc_b64
@ -341,8 +341,8 @@ define amdgpu_kernel void @test_small_memcpy_i64_global_to_global_align16(i64 ad
 ; SI-DAG: buffer_store_dwordx4
 ; SI-DAG: buffer_store_dwordx4
 define amdgpu_kernel void @test_memcpy_const_string_align4(i8 addrspace(1)* noalias %out) nounwind {
-  %str = bitcast [16 x i8] addrspace(2)* @hello.align4 to i8 addrspace(2)*
-  call void @llvm.memcpy.p1i8.p2i8.i64(i8 addrspace(1)* align 4 %out, i8 addrspace(2)* align 4 %str, i64 32, i1 false)
+  %str = bitcast [16 x i8] addrspace(4)* @hello.align4 to i8 addrspace(4)*
+  call void @llvm.memcpy.p1i8.p2i8.i64(i8 addrspace(1)* align 4 %out, i8 addrspace(4)* align 4 %str, i64 32, i1 false)
  ret void
 }

@ -366,7 +366,7 @@ define amdgpu_kernel void @test_memcpy_const_string_align4(i8 addrspace(1)* noal
 ; SI: buffer_store_byte
 ; SI: buffer_store_byte
 define amdgpu_kernel void @test_memcpy_const_string_align1(i8 addrspace(1)* noalias %out) nounwind {
-  %str = bitcast [16 x i8] addrspace(2)* @hello.align1 to i8 addrspace(2)*
-  call void @llvm.memcpy.p1i8.p2i8.i64(i8 addrspace(1)* %out, i8 addrspace(2)* %str, i64 32, i1 false)
+  %str = bitcast [16 x i8] addrspace(4)* @hello.align1 to i8 addrspace(4)*
+  call void @llvm.memcpy.p1i8.p2i8.i64(i8 addrspace(1)* %out, i8 addrspace(4)* %str, i64 32, i1 false)
  ret void
 }
--- a/test/CodeGen/AMDGPU/load-constant-f64.ll
+++ b/test/CodeGen/AMDGPU/load-constant-f64.ll
@ -6,8 +6,8 @@
 ; GCN: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}]
 ; GCN-NOHSA: buffer_store_dwordx2
 ; GCN-HSA: flat_store_dwordx2
-define amdgpu_kernel void @constant_load_f64(double addrspace(1)* %out, double addrspace(2)* %in) #0 {
-  %ld = load double, double addrspace(2)* %in
+define amdgpu_kernel void @constant_load_f64(double addrspace(1)* %out, double addrspace(4)* %in) #0 {
+  %ld = load double, double addrspace(4)* %in
  store double %ld, double addrspace(1)* %out
  ret void
 }
--- a/test/CodeGen/AMDGPU/load-constant-i1.ll
+++ b/test/CodeGen/AMDGPU/load-constant-i1.ll
@ -9,57 +9,57 @@

 ; EG: VTX_READ_8
 ; EG: AND_INT
-define amdgpu_kernel void @constant_load_i1(i1 addrspace(1)* %out, i1 addrspace(2)* nocapture %in) #0 {
-  %load = load i1, i1 addrspace(2)* %in
+define amdgpu_kernel void @constant_load_i1(i1 addrspace(1)* %out, i1 addrspace(4)* nocapture %in) #0 {
+  %load = load i1, i1 addrspace(4)* %in
  store i1 %load, i1 addrspace(1)* %out
  ret void
 }

 ; FUNC-LABEL: {{^}}constant_load_v2i1:
-define amdgpu_kernel void @constant_load_v2i1(<2 x i1> addrspace(1)* %out, <2 x i1> addrspace(2)* nocapture %in) #0 {
-  %load = load <2 x i1>, <2 x i1> addrspace(2)* %in
+define amdgpu_kernel void @constant_load_v2i1(<2 x i1> addrspace(1)* %out, <2 x i1> addrspace(4)* nocapture %in) #0 {
+  %load = load <2 x i1>, <2 x i1> addrspace(4)* %in
  store <2 x i1> %load, <2 x i1> addrspace(1)* %out
  ret void
 }

 ; FUNC-LABEL: {{^}}constant_load_v3i1:
-define amdgpu_kernel void @constant_load_v3i1(<3 x i1> addrspace(1)* %out, <3 x i1> addrspace(2)* nocapture %in) #0 {
-  %load = load <3 x i1>, <3 x i1> addrspace(2)* %in
+define amdgpu_kernel void @constant_load_v3i1(<3 x i1> addrspace(1)* %out, <3 x i1> addrspace(4)* nocapture %in) #0 {
+  %load = load <3 x i1>, <3 x i1> addrspace(4)* %in
  store <3 x i1> %load, <3 x i1> addrspace(1)* %out
  ret void
 }

 ; FUNC-LABEL: {{^}}constant_load_v4i1:
-define amdgpu_kernel void @constant_load_v4i1(<4 x i1> addrspace(1)* %out, <4 x i1> addrspace(2)* nocapture %in) #0 {
-  %load = load <4 x i1>, <4 x i1> addrspace(2)* %in
+define amdgpu_kernel void @constant_load_v4i1(<4 x i1> addrspace(1)* %out, <4 x i1> addrspace(4)* nocapture %in) #0 {
+  %load = load <4 x i1>, <4 x i1> addrspace(4)* %in
  store <4 x i1> %load, <4 x i1> addrspace(1)* %out
  ret void
 }

 ; FUNC-LABEL: {{^}}constant_load_v8i1:
-define amdgpu_kernel void @constant_load_v8i1(<8 x i1> addrspace(1)* %out, <8 x i1> addrspace(2)* nocapture %in) #0 {
-  %load = load <8 x i1>, <8 x i1> addrspace(2)* %in
+define amdgpu_kernel void @constant_load_v8i1(<8 x i1> addrspace(1)* %out, <8 x i1> addrspace(4)* nocapture %in) #0 {
+  %load = load <8 x i1>, <8 x i1> addrspace(4)* %in
  store <8 x i1> %load, <8 x i1> addrspace(1)* %out
  ret void
 }

 ; FUNC-LABEL: {{^}}constant_load_v16i1:
-define amdgpu_kernel void @constant_load_v16i1(<16 x i1> addrspace(1)* %out, <16 x i1> addrspace(2)* nocapture %in) #0 {
-  %load = load <16 x i1>, <16 x i1> addrspace(2)* %in
+define amdgpu_kernel void @constant_load_v16i1(<16 x i1> addrspace(1)* %out, <16 x i1> addrspace(4)* nocapture %in) #0 {
+  %load = load <16 x i1>, <16 x i1> addrspace(4)* %in
  store <16 x i1> %load, <16 x i1> addrspace(1)* %out
  ret void
 }

 ; FUNC-LABEL: {{^}}constant_load_v32i1:
-define amdgpu_kernel void @constant_load_v32i1(<32 x i1> addrspace(1)* %out, <32 x i1> addrspace(2)* nocapture %in) #0 {
-  %load = load <32 x i1>, <32 x i1> addrspace(2)* %in
+define amdgpu_kernel void @constant_load_v32i1(<32 x i1> addrspace(1)* %out, <32 x i1> addrspace(4)* nocapture %in) #0 {
+  %load = load <32 x i1>, <32 x i1> addrspace(4)* %in
  store <32 x i1> %load, <32 x i1> addrspace(1)* %out
  ret void
 }

 ; FUNC-LABEL: {{^}}constant_load_v64i1:
-define amdgpu_kernel void @constant_load_v64i1(<64 x i1> addrspace(1)* %out, <64 x i1> addrspace(2)* nocapture %in) #0 {
-  %load = load <64 x i1>, <64 x i1> addrspace(2)* %in
+define amdgpu_kernel void @constant_load_v64i1(<64 x i1> addrspace(1)* %out, <64 x i1> addrspace(4)* nocapture %in) #0 {
+  %load = load <64 x i1>, <64 x i1> addrspace(4)* %in
  store <64 x i1> %load, <64 x i1> addrspace(1)* %out
  ret void
 }
@ -67,8 +67,8 @@ define amdgpu_kernel void @constant_load_v64i1(<64 x i1> addrspace(1)* %out, <64
 ; FUNC-LABEL: {{^}}constant_zextload_i1_to_i32:
 ; GCN: buffer_load_ubyte
 ; GCN: buffer_store_dword
-define amdgpu_kernel void @constant_zextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(2)* nocapture %in) #0 {
-  %a = load i1, i1 addrspace(2)* %in
+define amdgpu_kernel void @constant_zextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(4)* nocapture %in) #0 {
+  %a = load i1, i1 addrspace(4)* %in
  %ext = zext i1 %a to i32
  store i32 %ext, i32 addrspace(1)* %out
  ret void
@ -81,136 +81,136 @@ define amdgpu_kernel void @constant_zextload_i1_to_i32(i32 addrspace(1)* %out, i

 ; EG: VTX_READ_8
 ; EG: BFE_INT
-define amdgpu_kernel void @constant_sextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(2)* nocapture %in) #0 {
-  %a = load i1, i1 addrspace(2)* %in
+define amdgpu_kernel void @constant_sextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(4)* nocapture %in) #0 {
+  %a = load i1, i1 addrspace(4)* %in
  %ext = sext i1 %a to i32
  store i32 %ext, i32 addrspace(1)* %out
  ret void
 }

 ; FUNC-LABEL: {{^}}constant_zextload_v1i1_to_v1i32:
-define amdgpu_kernel void @constant_zextload_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1> addrspace(2)* nocapture %in) #0 {
-  %load = load <1 x i1>, <1 x i1> addrspace(2)* %in
+define amdgpu_kernel void @constant_zextload_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1> addrspace(4)* nocapture %in) #0 {
+  %load = load <1 x i1>, <1 x i1> addrspace(4)* %in
  %ext = zext <1 x i1> %load to <1 x i32>
  store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
  ret void
 }

 ; FUNC-LABEL: {{^}}constant_sextload_v1i1_to_v1i32:
-define amdgpu_kernel void @constant_sextload_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1> addrspace(2)* nocapture %in) #0 {
-  %load = load <1 x i1>, <1 x i1> addrspace(2)* %in
+define amdgpu_kernel void @constant_sextload_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1> addrspace(4)* nocapture %in) #0 {
+  %load = load <1 x i1>, <1 x i1> addrspace(4)* %in
  %ext = sext <1 x i1> %load to <1 x i32>
  store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
  ret void
 }

 ; FUNC-LABEL: {{^}}constant_zextload_v2i1_to_v2i32:
-define amdgpu_kernel void @constant_zextload_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1> addrspace(2)* nocapture %in) #0 {
-  %load = load <2 x i1>, <2 x i1> addrspace(2)* %in
+define amdgpu_kernel void @constant_zextload_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1> addrspace(4)* nocapture %in) #0 {
+  %load = load <2 x i1>, <2 x i1> addrspace(4)* %in
  %ext = zext <2 x i1> %load to <2 x i32>
  store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
  ret void
 }

 ; FUNC-LABEL: {{^}}constant_sextload_v2i1_to_v2i32:
-define amdgpu_kernel void @constant_sextload_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1> addrspace(2)* nocapture %in) #0 {
-  %load = load <2 x i1>, <2 x i1> addrspace(2)* %in
+define amdgpu_kernel void @constant_sextload_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1> addrspace(4)* nocapture %in) #0 {
+  %load = load <2 x i1>, <2 x i1> addrspace(4)* %in
  %ext = sext <2 x i1> %load to <2 x i32>
  store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
  ret void
 }

 ; FUNC-LABEL: {{^}}constant_zextload_v3i1_to_v3i32:
-define amdgpu_kernel void @constant_zextload_v3i1_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i1> addrspace(2)* nocapture %in) #0 {
-  %load = load <3 x i1>, <3 x i1> addrspace(2)* %in
+define amdgpu_kernel void @constant_zextload_v3i1_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i1> addrspace(4)* nocapture %in) #0 {
+  %load = load <3 x i1>, <3 x i1> addrspace(4)* %in
  %ext = zext <3 x i1> %load to <3 x i32>
  store <3 x i32> %ext, <3 x i32> addrspace(1)* %out
  ret void
 }

 ; FUNC-LABEL: {{^}}constant_sextload_v3i1_to_v3i32:
-define amdgpu_kernel void @constant_sextload_v3i1_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i1> addrspace(2)* nocapture %in) #0 {
-  %load = load <3 x i1>, <3 x i1> addrspace(2)* %in
+define amdgpu_kernel void @constant_sextload_v3i1_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i1> addrspace(4)* nocapture %in) #0 {
+  %load = load <3 x i1>, <3 x i1> addrspace(4)* %in
  %ext = sext <3 x i1> %load to <3 x i32>
  store <3 x i32> %ext, <3 x i32> addrspace(1)* %out
  ret void
 }

 ; FUNC-LABEL: {{^}}constant_zextload_v4i1_to_v4i32:
-define amdgpu_kernel void @constant_zextload_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1> addrspace(2)* nocapture %in) #0 {
-  %load = load <4 x i1>, <4 x i1> addrspace(2)* %in
+define amdgpu_kernel void @constant_zextload_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1> addrspace(4)* nocapture %in) #0 {
+  %load = load <4 x i1>, <4 x i1> addrspace(4)* %in
  %ext = zext <4 x i1> %load to <4 x i32>
  store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
  ret void
 }

 ; FUNC-LABEL: {{^}}constant_sextload_v4i1_to_v4i32:
-define amdgpu_kernel void @constant_sextload_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1> addrspace(2)* nocapture %in) #0 {
-  %load = load <4 x i1>, <4 x i1> addrspace(2)* %in
+define amdgpu_kernel void @constant_sextload_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1> addrspace(4)* nocapture %in) #0 {
+  %load = load <4 x i1>, <4 x i1> addrspace(4)* %in
  %ext = sext <4 x i1> %load to <4 x i32>
  store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
  ret void
 }

 ; FUNC-LABEL: {{^}}constant_zextload_v8i1_to_v8i32:
-define amdgpu_kernel void @constant_zextload_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1> addrspace(2)* nocapture %in) #0 {
-  %load = load <8 x i1>, <8 x i1> addrspace(2)* %in
+define amdgpu_kernel void @constant_zextload_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1> addrspace(4)* nocapture %in) #0 {
+  %load = load <8 x i1>, <8 x i1> addrspace(4)* %in
  %ext = zext <8 x i1> %load to <8 x i32>
  store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
  ret void
 }

 ; FUNC-LABEL: {{^}}constant_sextload_v8i1_to_v8i32:
-define amdgpu_kernel void @constant_sextload_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1> addrspace(2)* nocapture %in) #0 {
-  %load = load <8 x i1>, <8 x i1> addrspace(2)* %in
+define amdgpu_kernel void @constant_sextload_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1> addrspace(4)* nocapture %in) #0 {
+  %load = load <8 x i1>, <8 x i1> addrspace(4)* %in
  %ext = sext <8 x i1> %load to <8 x i32>
  store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
  ret void
 }

 ; FUNC-LABEL: {{^}}constant_zextload_v16i1_to_v16i32:
-define amdgpu_kernel void @constant_zextload_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i1> addrspace(2)* nocapture %in) #0 {
-  %load = load <16 x i1>, <16 x i1> addrspace(2)* %in
+define amdgpu_kernel void @constant_zextload_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i1> addrspace(4)* nocapture %in) #0 {
+  %load = load <16 x i1>, <16 x i1> addrspace(4)* %in
  %ext = zext <16 x i1> %load to <16 x i32>
  store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
  ret void
 }

 ; FUNC-LABEL: {{^}}constant_sextload_v16i1_to_v16i32:
-define amdgpu_kernel void @constant_sextload_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i1> addrspace(2)* nocapture %in) #0 {
-  %load = load <16 x i1>, <16 x i1> addrspace(2)* %in
+define amdgpu_kernel void @constant_sextload_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i1> addrspace(4)* nocapture %in) #0 {
+  %load = load <16 x i1>, <16 x i1> addrspace(4)* %in
  %ext = sext <16 x i1> %load to <16 x i32>
  store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
  ret void
 }

 ; FUNC-LABEL: {{^}}constant_zextload_v32i1_to_v32i32:
-define amdgpu_kernel void @constant_zextload_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i1> addrspace(2)* nocapture %in) #0 {
-  %load = load <32 x i1>, <32 x i1> addrspace(2)* %in
+define amdgpu_kernel void @constant_zextload_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i1> addrspace(4)* nocapture %in) #0 {
+  %load = load <32 x i1>, <32 x i1> addrspace(4)* %in
  %ext = zext <32 x i1> %load to <32 x i32>
  store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
  ret void
 }

 ; FUNC-LABEL: {{^}}constant_sextload_v32i1_to_v32i32:
-define amdgpu_kernel void @constant_sextload_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i1> addrspace(2)* nocapture %in) #0 {
-  %load = load <32 x i1>, <32 x i1> addrspace(2)* %in
+define amdgpu_kernel void @constant_sextload_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i1> addrspace(4)* nocapture %in) #0 {
+  %load = load <32 x i1>, <32 x i1> addrspace(4)* %in
  %ext = sext <32 x i1> %load to <32 x i32>
  store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
  ret void
 }

 ; FUNC-LABEL: {{^}}constant_zextload_v64i1_to_v64i32:
-define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i1> addrspace(2)* nocapture %in) #0 {
-  %load = load <64 x i1>, <64 x i1> addrspace(2)* %in
+define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i1> addrspace(4)* nocapture %in) #0 {
+  %load = load <64 x i1>, <64 x i1> addrspace(4)* %in
  %ext = zext <64 x i1> %load to <64 x i32>
  store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
  ret void
 }

 ; FUNC-LABEL: {{^}}constant_sextload_v64i1_to_v64i32:
-define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i1> addrspace(2)* nocapture %in) #0 {
-  %load = load <64 x i1>, <64 x i1> addrspace(2)* %in
+define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i1> addrspace(4)* nocapture %in) #0 {
+  %load = load <64 x i1>, <64 x i1> addrspace(4)* %in
  %ext = sext <64 x i1> %load to <64 x i32>
  store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
  ret void
@ -221,8 +221,8 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(<64 x i32> addrspac
 ; GCN-DAG: v_mov_b32_e32 {{v[0-9]+}}, 0{{$}}
 ; GCN-DAG: v_and_b32_e32 {{v[0-9]+}}, 1, [[LOAD]]
 ; GCN: buffer_store_dwordx2
-define amdgpu_kernel void @constant_zextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(2)* nocapture %in) #0 {
-  %a = load i1, i1 addrspace(2)* %in
+define amdgpu_kernel void @constant_zextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(4)* nocapture %in) #0 {
+  %a = load i1, i1 addrspace(4)* %in
  %ext = zext i1 %a to i64
  store i64 %ext, i64 addrspace(1)* %out
  ret void
@ -233,136 +233,136 @@ define amdgpu_kernel void @constant_zextload_i1_to_i64(i64 addrspace(1)* %out, i
 ; GCN: v_bfe_i32 [[BFE:v[0-9]+]], {{v[0-9]+}}, 0, 1{{$}}
 ; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, [[BFE]]
 ; GCN: buffer_store_dwordx2
-define amdgpu_kernel void @constant_sextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(2)* nocapture %in) #0 {
-  %a = load i1, i1 addrspace(2)* %in
+define amdgpu_kernel void @constant_sextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(4)* nocapture %in) #0 {
+  %a = load i1, i1 addrspace(4)* %in
  %ext = sext i1 %a to i64
  store i64 %ext, i64 addrspace(1)* %out
  ret void
 }

 ; FUNC-LABEL: {{^}}constant_zextload_v1i1_to_v1i64:
-define amdgpu_kernel void @constant_zextload_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1> addrspace(2)* nocapture %in) #0 {
-  %load = load <1 x i1>, <1 x i1> addrspace(2)* %in
+define amdgpu_kernel void @constant_zextload_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1> addrspace(4)* nocapture %in) #0 {
+  %load = load <1 x i1>, <1 x i1> addrspace(4)* %in
  %ext = zext <1 x i1> %load to <1 x i64>
  store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
  ret void
 }

 ; FUNC-LABEL: {{^}}constant_sextload_v1i1_to_v1i64:
-define amdgpu_kernel void @constant_sextload_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1> addrspace(2)* nocapture %in) #0 {
-  %load = load <1 x i1>, <1 x i1> addrspace(2)* %in
+define amdgpu_kernel void @constant_sextload_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1> addrspace(4)* nocapture %in) #0 {
+  %load = load <1 x i1>, <1 x i1> addrspace(4)* %in
  %ext = sext <1 x i1> %load to <1 x i64>
  store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
  ret void
 }

 ; FUNC-LABEL: {{^}}constant_zextload_v2i1_to_v2i64:
-define amdgpu_kernel void @constant_zextload_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1> addrspace(2)* nocapture %in) #0 {
-  %load = load <2 x i1>, <2 x i1> addrspace(2)* %in
+define amdgpu_kernel void @constant_zextload_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1> addrspace(4)* nocapture %in) #0 {
+  %load = load <2 x i1>, <2 x i1> addrspace(4)* %in
  %ext = zext <2 x i1> %load to <2 x i64>
  store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
  ret void
 }

 ; FUNC-LABEL: {{^}}constant_sextload_v2i1_to_v2i64:
-define amdgpu_kernel void @constant_sextload_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1> addrspace(2)* nocapture %in) #0 {
-  %load = load <2 x i1>, <2 x i1> addrspace(2)* %in
+define amdgpu_kernel void @constant_sextload_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1> addrspace(4)* nocapture %in) #0 {
+  %load = load <2 x i1>, <2 x i1> addrspace(4)* %in
  %ext = sext <2 x i1> %load to <2 x i64>
  store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
  ret void
 }

 ; FUNC-LABEL: {{^}}constant_zextload_v3i1_to_v3i64:
-define amdgpu_kernel void @constant_zextload_v3i1_to_v3i64(<3 x i64> addrspace(1)* %out, <3 x i1> addrspace(2)* nocapture %in) #0 {
-  %load = load <3 x i1>, <3 x i1> addrspace(2)* %in
+define amdgpu_kernel void @constant_zextload_v3i1_to_v3i64(<3 x i64> addrspace(1)* %out, <3 x i1> addrspace(4)* nocapture %in) #0 {
+  %load = load <3 x i1>, <3 x i1> addrspace(4)* %in
  %ext = zext <3 x i1> %load to <3 x i64>
  store <3 x i64> %ext, <3 x i64> addrspace(1)* %out
  ret void
 }

 ; FUNC-LABEL: {{^}}constant_sextload_v3i1_to_v3i64:
-define amdgpu_kernel void @constant_sextload_v3i1_to_v3i64(<3 x i64> addrspace(1)* %out, <3 x i1> addrspace(2)* nocapture %in) #0 {
-  %load = load <3 x i1>, <3 x i1> addrspace(2)* %in
+define amdgpu_kernel void @constant_sextload_v3i1_to_v3i64(<3 x i64> addrspace(1)* %out, <3 x i1> addrspace(4)* nocapture %in) #0 {
+  %load = load <3 x i1>, <3 x i1> addrspace(4)* %in
  %ext = sext <3 x i1> %load to <3 x i64>
  store <3 x i64> %ext, <3 x i64> addrspace(1)* %out
  ret void
 }

 ; FUNC-LABEL: {{^}}constant_zextload_v4i1_to_v4i64:
-define amdgpu_kernel void @constant_zextload_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1> addrspace(2)* nocapture %in) #0 {
-  %load = load <4 x i1>, <4 x i1> addrspace(2)* %in
+define amdgpu_kernel void @constant_zextload_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1> addrspace(4)* nocapture %in) #0 {
+  %load = load <4 x i1>, <4 x i1> addrspace(4)* %in
  %ext = zext <4 x i1> %load to <4 x i64>
  store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
  ret void
 }

 ; FUNC-LABEL: {{^}}constant_sextload_v4i1_to_v4i64:
-define amdgpu_kernel void @constant_sextload_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1> addrspace(2)* nocapture %in) #0 {
-  %load = load <4 x i1>, <4 x i1> addrspace(2)* %in
+define amdgpu_kernel void @constant_sextload_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1> addrspace(4)* nocapture %in) #0 {
+  %load = load <4 x i1>, <4 x i1> addrspace(4)* %in
  %ext = sext <4 x i1> %load to <4 x i64>
  store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
  ret void
 }

 ; FUNC-LABEL: {{^}}constant_zextload_v8i1_to_v8i64:
-define amdgpu_kernel void @constant_zextload_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1> addrspace(2)* nocapture %in) #0 {
-  %load = load <8 x i1>, <8 x i1> addrspace(2)* %in
+define amdgpu_kernel void @constant_zextload_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1> addrspace(4)* nocapture %in) #0 {
+  %load = load <8 x i1>, <8 x i1> addrspace(4)* %in
  %ext = zext <8 x i1> %load to <8 x i64>
  store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
  ret void
 }

 ; FUNC-LABEL: {{^}}constant_sextload_v8i1_to_v8i64:
-define amdgpu_kernel void @constant_sextload_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1> addrspace(2)* nocapture %in) #0 {
-  %load = load <8 x i1>, <8 x i1> addrspace(2)* %in
+define amdgpu_kernel void @constant_sextload_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1> addrspace(4)* nocapture %in) #0 {
+  %load = load <8 x i1>, <8 x i1> addrspace(4)* %in
  %ext = sext <8 x i1> %load to <8 x i64>
  store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
  ret void
 }

 ; FUNC-LABEL: {{^}}constant_zextload_v16i1_to_v16i64:
-define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i1> addrspace(2)* nocapture %in) #0 {
-  %load = load <16 x i1>, <16 x i1> addrspace(2)* %in
+define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i1> addrspace(4)* nocapture %in) #0 {
+  %load = load <16 x i1>, <16 x i1> addrspace(4)* %in
  %ext = zext <16 x i1> %load to <16 x i64>
  store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
  ret void
 }

 ; FUNC-LABEL: {{^}}constant_sextload_v16i1_to_v16i64:
-define amdgpu_kernel void @constant_sextload_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i1> addrspace(2)* nocapture %in) #0 {
-  %load = load <16 x i1>, <16 x i1> addrspace(2)* %in
+define amdgpu_kernel void @constant_sextload_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i1> addrspace(4)* nocapture %in) #0 {
+  %load = load <16 x i1>, <16 x i1> addrspace(4)* %in
  %ext = sext <16 x i1> %load to <16 x i64>
  store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
  ret void
 }

 ; FUNC-LABEL: {{^}}constant_zextload_v32i1_to_v32i64:
-define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i1> addrspace(2)* nocapture %in) #0 {
-  %load = load <32 x i1>, <32 x i1> addrspace(2)* %in
+define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i1> addrspace(4)* nocapture %in) #0 {
+  %load = load <32 x i1>, <32 x i1> addrspace(4)* %in
  %ext = zext <32 x i1> %load to <32 x i64>
  store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
  ret void
 }

 ; FUNC-LABEL: {{^}}constant_sextload_v32i1_to_v32i64:
-define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i1> addrspace(2)* nocapture %in) #0 {
-  %load = load <32 x i1>, <32 x i1> addrspace(2)* %in
+define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i1> addrspace(4)* nocapture %in) #0 {
+  %load = load <32 x i1>, <32 x i1> addrspace(4)* %in
  %ext = sext <32 x i1> %load to <32 x i64>
  store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
  ret void
 }

 ; FUNC-LABEL: {{^}}constant_zextload_v64i1_to_v64i64:
-define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i1> addrspace(2)* nocapture %in) #0 {
-  %load = load <64 x i1>, <64 x i1> addrspace(2)* %in
+define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i1> addrspace(4)* nocapture %in) #0 {
+  %load = load <64 x i1>, <64 x i1> addrspace(4)* %in
  %ext = zext <64 x i1> %load to <64 x i64>
  store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
  ret void
 }

 ; FUNC-LABEL: {{^}}constant_sextload_v64i1_to_v64i64:
-define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i1> addrspace(2)* nocapture %in) #0 {
-  %load = load <64 x i1>, <64 x i1> addrspace(2)* %in
+define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i1> addrspace(4)* nocapture %in) #0 {
+  %load = load <64 x i1>, <64 x i1> addrspace(4)* %in
  %ext = sext <64 x i1> %load to <64 x i64>
  store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
  ret void
--- a/test/CodeGen/AMDGPU/load-constant-i16.ll
+++ b/test/CodeGen/AMDGPU/load-constant-i16.ll
@ -8,9 +8,9 @@
 ; GCN-HSA: flat_load_ushort

 ; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define amdgpu_kernel void @constant_load_i16(i16 addrspace(1)* %out, i16 addrspace(2)* %in) {
+define amdgpu_kernel void @constant_load_i16(i16 addrspace(1)* %out, i16 addrspace(4)* %in) {
 entry:
-  %ld = load i16, i16 addrspace(2)* %in
+  %ld = load i16, i16 addrspace(4)* %in
  store i16 %ld, i16 addrspace(1)* %out
  ret void
 }
@ -19,9 +19,9 @@ entry:
 ; GCN: s_load_dword s

 ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define amdgpu_kernel void @constant_load_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %in) {
+define amdgpu_kernel void @constant_load_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %in) {
 entry:
-  %ld = load <2 x i16>, <2 x i16> addrspace(2)* %in
+  %ld = load <2 x i16>, <2 x i16> addrspace(4)* %in
  store <2 x i16> %ld, <2 x i16> addrspace(1)* %out
  ret void
 }
@ -31,9 +31,9 @@ entry:

 ; EG-DAG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
 ; EG-DAG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 4, #1
-define amdgpu_kernel void @constant_load_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(2)* %in) {
+define amdgpu_kernel void @constant_load_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(4)* %in) {
 entry:
-  %ld = load <3 x i16>, <3 x i16> addrspace(2)* %in
+  %ld = load <3 x i16>, <3 x i16> addrspace(4)* %in
  store <3 x i16> %ld, <3 x i16> addrspace(1)* %out
  ret void
 }
@ -42,9 +42,9 @@ entry:
 ; GCN: s_load_dwordx2

 ; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1
-define amdgpu_kernel void @constant_load_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(2)* %in) {
+define amdgpu_kernel void @constant_load_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(4)* %in) {
 entry:
-  %ld = load <4 x i16>, <4 x i16> addrspace(2)* %in
+  %ld = load <4 x i16>, <4 x i16> addrspace(4)* %in
  store <4 x i16> %ld, <4 x i16> addrspace(1)* %out
  ret void
 }
@ -53,9 +53,9 @@ entry:
 ; GCN: s_load_dwordx4

 ; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
-define amdgpu_kernel void @constant_load_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> addrspace(2)* %in) {
+define amdgpu_kernel void @constant_load_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> addrspace(4)* %in) {
 entry:
-  %ld = load <8 x i16>, <8 x i16> addrspace(2)* %in
+  %ld = load <8 x i16>, <8 x i16> addrspace(4)* %in
  store <8 x i16> %ld, <8 x i16> addrspace(1)* %out
  ret void
 }
@ -65,9 +65,9 @@ entry:

 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1
-define amdgpu_kernel void @constant_load_v16i16(<16 x i16> addrspace(1)* %out, <16 x i16> addrspace(2)* %in) {
+define amdgpu_kernel void @constant_load_v16i16(<16 x i16> addrspace(1)* %out, <16 x i16> addrspace(4)* %in) {
 entry:
-  %ld = load <16 x i16>, <16 x i16> addrspace(2)* %in
+  %ld = load <16 x i16>, <16 x i16> addrspace(4)* %in
  store <16 x i16> %ld, <16 x i16> addrspace(1)* %out
  ret void
 }
@ -80,8 +80,8 @@ entry:
 ; GCN-HSA: flat_store_dword

 ; EG: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}}, 0, #1
-define amdgpu_kernel void @constant_zextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(2)* %in) #0 {
-  %a = load i16, i16 addrspace(2)* %in
+define amdgpu_kernel void @constant_zextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(4)* %in) #0 {
+  %a = load i16, i16 addrspace(4)* %in
  %ext = zext i16 %a to i32
  store i32 %ext, i32 addrspace(1)* %out
  ret void
@ -97,8 +97,8 @@ define amdgpu_kernel void @constant_zextload_i16_to_i32(i32 addrspace(1)* %out,
 ; EG: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], [[DST]], 0, #1
 ; EG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal
 ; EG: 16
-define amdgpu_kernel void @constant_sextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(2)* %in) #0 {
-  %a = load i16, i16 addrspace(2)* %in
+define amdgpu_kernel void @constant_sextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(4)* %in) #0 {
+  %a = load i16, i16 addrspace(4)* %in
  %ext = sext i16 %a to i32
  store i32 %ext, i32 addrspace(1)* %out
  ret void
@ -109,8 +109,8 @@ define amdgpu_kernel void @constant_sextload_i16_to_i32(i32 addrspace(1)* %out,
 ; GCN-HSA: flat_load_ushort

 ; EG: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}}, 0, #1
-define amdgpu_kernel void @constant_zextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(2)* %in) #0 {
-  %load = load <1 x i16>, <1 x i16> addrspace(2)* %in
+define amdgpu_kernel void @constant_zextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(4)* %in) #0 {
+  %load = load <1 x i16>, <1 x i16> addrspace(4)* %in
  %ext = zext <1 x i16> %load to <1 x i32>
  store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
  ret void
@ -123,8 +123,8 @@ define amdgpu_kernel void @constant_zextload_v1i16_to_v1i32(<1 x i32> addrspace(
 ; EG: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], [[DST]], 0, #1
 ; EG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal
 ; EG: 16
-define amdgpu_kernel void @constant_sextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(2)* %in) #0 {
-  %load = load <1 x i16>, <1 x i16> addrspace(2)* %in
+define amdgpu_kernel void @constant_sextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(4)* %in) #0 {
+  %load = load <1 x i16>, <1 x i16> addrspace(4)* %in
  %ext = sext <1 x i16> %load to <1 x i32>
  store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
  ret void
@ -140,8 +140,8 @@ define amdgpu_kernel void @constant_sextload_v1i16_to_v1i32(<1 x i32> addrspace(
 ; EG: BFE_UINT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], literal
 ; EG: 16
 ; EG: 16
-define amdgpu_kernel void @constant_zextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(2)* %in) #0 {
-  %load = load <2 x i16>, <2 x i16> addrspace(2)* %in
+define amdgpu_kernel void @constant_zextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(4)* %in) #0 {
+  %load = load <2 x i16>, <2 x i16> addrspace(4)* %in
  %ext = zext <2 x i16> %load to <2 x i32>
  store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
  ret void
@ -160,8 +160,8 @@ define amdgpu_kernel void @constant_zextload_v2i16_to_v2i32(<2 x i32> addrspace(
 ; EG-DAG: BFE_INT {{[* ]*}}[[ST]].Y, {{PV\.[XYZW]}}, 0.0, literal
 ; EG-DAG: 16
 ; EG-DAG: 16
-define amdgpu_kernel void @constant_sextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(2)* %in) #0 {
-  %load = load <2 x i16>, <2 x i16> addrspace(2)* %in
+define amdgpu_kernel void @constant_sextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(4)* %in) #0 {
+  %load = load <2 x i16>, <2 x i16> addrspace(4)* %in
  %ext = sext <2 x i16> %load to <2 x i32>
  store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
  ret void
@ -183,9 +183,9 @@ define amdgpu_kernel void @constant_sextload_v2i16_to_v2i32(<2 x i32> addrspace(
 ; EG-DAG: AND_INT {{[* ]*}}[[ST_HI]].X, {{T[0-9]\.[XYZW]}}, literal
 ; EG-DAG: 65535
 ; EG-DAG: 65535
-define amdgpu_kernel void @constant_zextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(2)* %in) {
+define amdgpu_kernel void @constant_zextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(4)* %in) {
 entry:
-  %ld = load <3 x i16>, <3 x i16> addrspace(2)* %in
+  %ld = load <3 x i16>, <3 x i16> addrspace(4)* %in
  %ext = zext <3 x i16> %ld to <3 x i32>
  store <3 x i32> %ext, <3 x i32> addrspace(1)* %out
  ret void
@ -204,9 +204,9 @@ entry:
 ; EG-DAG: BFE_INT {{[* ]*}}[[ST_HI]].X, {{T[0-9]\.[XYZW]}}, 0.0, literal
 ; EG-DAG: 16
 ; EG-DAG: 16
-define amdgpu_kernel void @constant_sextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(2)* %in) {
+define amdgpu_kernel void @constant_sextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(4)* %in) {
 entry:
-  %ld = load <3 x i16>, <3 x i16> addrspace(2)* %in
+  %ld = load <3 x i16>, <3 x i16> addrspace(4)* %in
  %ext = sext <3 x i16> %ld to <3 x i32>
  store <3 x i32> %ext, <3 x i32> addrspace(1)* %out
  ret void
@ -229,8 +229,8 @@ entry:
 ; EG-DAG: AND_INT {{[* ]*}}[[ST]].Z, {{T[0-9]\.[XYZW]}}, literal
 ; EG-DAG: 65535
 ; EG-DAG: 65535
-define amdgpu_kernel void @constant_zextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(2)* %in) #0 {
-  %load = load <4 x i16>, <4 x i16> addrspace(2)* %in
+define amdgpu_kernel void @constant_zextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(4)* %in) #0 {
+  %load = load <4 x i16>, <4 x i16> addrspace(4)* %in
  %ext = zext <4 x i16> %load to <4 x i32>
  store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
  ret void
@ -254,8 +254,8 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i32(<4 x i32> addrspace(
 ; EG-DAG: 16
 ; EG-DAG: 16
 ; EG-DAG: 16
-define amdgpu_kernel void @constant_sextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(2)* %in) #0 {
-  %load = load <4 x i16>, <4 x i16> addrspace(2)* %in
+define amdgpu_kernel void @constant_sextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(4)* %in) #0 {
+  %load = load <4 x i16>, <4 x i16> addrspace(4)* %in
  %ext = sext <4 x i16> %load to <4 x i32>
  store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
  ret void
@ -288,8 +288,8 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i32(<4 x i32> addrspace(
 ; EG-DAG: 65535
 ; EG-DAG: 65535
 ; EG-DAG: 65535
-define amdgpu_kernel void @constant_zextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(2)* %in) #0 {
-  %load = load <8 x i16>, <8 x i16> addrspace(2)* %in
+define amdgpu_kernel void @constant_zextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(4)* %in) #0 {
+  %load = load <8 x i16>, <8 x i16> addrspace(4)* %in
  %ext = zext <8 x i16> %load to <8 x i32>
  store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
  ret void
@ -322,8 +322,8 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i32(<8 x i32> addrspace(
 ; EG-DAG: 16
 ; EG-DAG: 16
 ; EG-DAG: 16
-define amdgpu_kernel void @constant_sextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(2)* %in) #0 {
-  %load = load <8 x i16>, <8 x i16> addrspace(2)* %in
+define amdgpu_kernel void @constant_sextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(4)* %in) #0 {
+  %load = load <8 x i16>, <8 x i16> addrspace(4)* %in
  %ext = sext <8 x i16> %load to <8 x i32>
  store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
  ret void
@ -337,8 +337,8 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i32(<8 x i32> addrspace(
 ; v16i16 is naturally 32 byte aligned
 ; EG-DAG: VTX_READ_128 [[DST_HI:T[0-9]+\.XYZW]], {{T[0-9]+.[XYZW]}}, 0, #1
 ; EG-DAG: VTX_READ_128 [[DST_LO:T[0-9]+\.XYZW]], {{T[0-9]+.[XYZW]}}, 16, #1
-define amdgpu_kernel void @constant_zextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(2)* %in) #0 {
-  %load = load <16 x i16>, <16 x i16> addrspace(2)* %in
+define amdgpu_kernel void @constant_zextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(4)* %in) #0 {
+  %load = load <16 x i16>, <16 x i16> addrspace(4)* %in
  %ext = zext <16 x i16> %load to <16 x i32>
  store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
  ret void
@ -352,8 +352,8 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i32(<16 x i32> addrspa
 ; v16i16 is naturally 32 byte aligned
 ; EG-DAG: VTX_READ_128 [[DST_HI:T[0-9]+\.XYZW]], {{T[0-9]+\.[XYZW]}}, 0, #1
 ; EG-DAG: VTX_READ_128 [[DST_LO:T[0-9]+\.XYZW]], {{T[0-9]+\.[XYZW]}}, 16, #1
-define amdgpu_kernel void @constant_sextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(2)* %in) #0 {
-  %load = load <16 x i16>, <16 x i16> addrspace(2)* %in
+define amdgpu_kernel void @constant_sextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(4)* %in) #0 {
+  %load = load <16 x i16>, <16 x i16> addrspace(4)* %in
  %ext = sext <16 x i16> %load to <16 x i32>
  store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
  ret void
@ -369,8 +369,8 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i32(<16 x i32> addrspa
 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 16, #1
 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 32, #1
 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 48, #1
-define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(2)* %in) #0 {
-  %load = load <32 x i16>, <32 x i16> addrspace(2)* %in
+define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(4)* %in) #0 {
+  %load = load <32 x i16>, <32 x i16> addrspace(4)* %in
  %ext = zext <32 x i16> %load to <32 x i32>
  store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
  ret void
@ -385,8 +385,8 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(<32 x i32> addrspa
 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 16, #1
 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 32, #1
 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 48, #1
-define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(2)* %in) #0 {
-  %load = load <32 x i16>, <32 x i16> addrspace(2)* %in
+define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(4)* %in) #0 {
+  %load = load <32 x i16>, <32 x i16> addrspace(4)* %in
  %ext = sext <32 x i16> %load to <32 x i32>
  store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
  ret void
@ -404,8 +404,8 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(<32 x i32> addrspa
 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 80, #1
 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 96, #1
 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 112, #1
-define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(2)* %in) #0 {
-  %load = load <64 x i16>, <64 x i16> addrspace(2)* %in
+define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(4)* %in) #0 {
+  %load = load <64 x i16>, <64 x i16> addrspace(4)* %in
  %ext = zext <64 x i16> %load to <64 x i32>
  store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
  ret void
@ -421,8 +421,8 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(<64 x i32> addrspa
 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 80, #1
 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 96, #1
 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 112, #1
-define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(2)* %in) #0 {
-  %load = load <64 x i16>, <64 x i16> addrspace(2)* %in
+define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(4)* %in) #0 {
+  %load = load <64 x i16>, <64 x i16> addrspace(4)* %in
  %ext = sext <64 x i16> %load to <64 x i32>
  store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
  ret void
@ -438,8 +438,8 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(<64 x i32> addrspa

 ; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
 ; EG: MOV {{.*}}, 0.0
-define amdgpu_kernel void @constant_zextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(2)* %in) #0 {
-  %a = load i16, i16 addrspace(2)* %in
+define amdgpu_kernel void @constant_zextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(4)* %in) #0 {
+  %a = load i16, i16 addrspace(4)* %in
  %ext = zext i16 %a to i64
  store i64 %ext, i64 addrspace(1)* %out
  ret void
@ -464,8 +464,8 @@ define amdgpu_kernel void @constant_zextload_i16_to_i64(i64 addrspace(1)* %out,
 ; EG: ASHR {{\**}} {{T[0-9]\.[XYZW]}}, {{.*}}, literal
 ; TODO: These could be expanded earlier using ASHR 15
 ; EG: 31
-define amdgpu_kernel void @constant_sextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(2)* %in) #0 {
-  %a = load i16, i16 addrspace(2)* %in
+define amdgpu_kernel void @constant_sextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(4)* %in) #0 {
+  %a = load i16, i16 addrspace(4)* %in
  %ext = sext i16 %a to i64
  store i64 %ext, i64 addrspace(1)* %out
  ret void
@ -475,8 +475,8 @@ define amdgpu_kernel void @constant_sextload_i16_to_i64(i64 addrspace(1)* %out,

 ; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
 ; EG: MOV {{.*}}, 0.0
-define amdgpu_kernel void @constant_zextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(2)* %in) #0 {
-  %load = load <1 x i16>, <1 x i16> addrspace(2)* %in
+define amdgpu_kernel void @constant_zextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(4)* %in) #0 {
+  %load = load <1 x i16>, <1 x i16> addrspace(4)* %in
  %ext = zext <1 x i16> %load to <1 x i64>
  store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
  ret void
@ -488,8 +488,8 @@ define amdgpu_kernel void @constant_zextload_v1i16_to_v1i64(<1 x i64> addrspace(
 ; EG: ASHR {{\**}} {{T[0-9]\.[XYZW]}}, {{.*}}, literal
 ; TODO: These could be expanded earlier using ASHR 15
 ; EG: 31
-define amdgpu_kernel void @constant_sextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(2)* %in) #0 {
-  %load = load <1 x i16>, <1 x i16> addrspace(2)* %in
+define amdgpu_kernel void @constant_sextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(4)* %in) #0 {
+  %load = load <1 x i16>, <1 x i16> addrspace(4)* %in
  %ext = sext <1 x i16> %load to <1 x i64>
  store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
  ret void
@ -498,8 +498,8 @@ define amdgpu_kernel void @constant_sextload_v1i16_to_v1i64(<1 x i64> addrspace(
 ; FUNC-LABEL: {{^}}constant_zextload_v2i16_to_v2i64:

 ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define amdgpu_kernel void @constant_zextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(2)* %in) #0 {
-  %load = load <2 x i16>, <2 x i16> addrspace(2)* %in
+define amdgpu_kernel void @constant_zextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(4)* %in) #0 {
+  %load = load <2 x i16>, <2 x i16> addrspace(4)* %in
  %ext = zext <2 x i16> %load to <2 x i64>
  store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
  ret void
@ -508,8 +508,8 @@ define amdgpu_kernel void @constant_zextload_v2i16_to_v2i64(<2 x i64> addrspace(
 ; FUNC-LABEL: {{^}}constant_sextload_v2i16_to_v2i64:

 ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define amdgpu_kernel void @constant_sextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(2)* %in) #0 {
-  %load = load <2 x i16>, <2 x i16> addrspace(2)* %in
+define amdgpu_kernel void @constant_sextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(4)* %in) #0 {
+  %load = load <2 x i16>, <2 x i16> addrspace(4)* %in
  %ext = sext <2 x i16> %load to <2 x i64>
  store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
  ret void
@ -518,8 +518,8 @@ define amdgpu_kernel void @constant_sextload_v2i16_to_v2i64(<2 x i64> addrspace(
 ; FUNC-LABEL: {{^}}constant_zextload_v4i16_to_v4i64:

 ; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1
-define amdgpu_kernel void @constant_zextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(2)* %in) #0 {
-  %load = load <4 x i16>, <4 x i16> addrspace(2)* %in
+define amdgpu_kernel void @constant_zextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(4)* %in) #0 {
+  %load = load <4 x i16>, <4 x i16> addrspace(4)* %in
  %ext = zext <4 x i16> %load to <4 x i64>
  store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
  ret void
@ -528,8 +528,8 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i64(<4 x i64> addrspace(
 ; FUNC-LABEL: {{^}}constant_sextload_v4i16_to_v4i64:

 ; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1
-define amdgpu_kernel void @constant_sextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(2)* %in) #0 {
-  %load = load <4 x i16>, <4 x i16> addrspace(2)* %in
+define amdgpu_kernel void @constant_sextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(4)* %in) #0 {
+  %load = load <4 x i16>, <4 x i16> addrspace(4)* %in
  %ext = sext <4 x i16> %load to <4 x i64>
  store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
  ret void
@ -538,8 +538,8 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i64(<4 x i64> addrspace(
 ; FUNC-LABEL: {{^}}constant_zextload_v8i16_to_v8i64:

 ; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
-define amdgpu_kernel void @constant_zextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(2)* %in) #0 {
-  %load = load <8 x i16>, <8 x i16> addrspace(2)* %in
+define amdgpu_kernel void @constant_zextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(4)* %in) #0 {
+  %load = load <8 x i16>, <8 x i16> addrspace(4)* %in
  %ext = zext <8 x i16> %load to <8 x i64>
  store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
  ret void
@ -548,8 +548,8 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i64(<8 x i64> addrspace(
 ; FUNC-LABEL: {{^}}constant_sextload_v8i16_to_v8i64:

 ; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
-define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(2)* %in) #0 {
-  %load = load <8 x i16>, <8 x i16> addrspace(2)* %in
+define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(4)* %in) #0 {
+  %load = load <8 x i16>, <8 x i16> addrspace(4)* %in
  %ext = sext <8 x i16> %load to <8 x i64>
  store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
  ret void
@ -559,8 +559,8 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(<8 x i64> addrspace(

 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1
-define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(2)* %in) #0 {
-  %load = load <16 x i16>, <16 x i16> addrspace(2)* %in
+define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(4)* %in) #0 {
+  %load = load <16 x i16>, <16 x i16> addrspace(4)* %in
  %ext = zext <16 x i16> %load to <16 x i64>
  store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
  ret void
@ -570,8 +570,8 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(<16 x i64> addrspa

 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1
-define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(2)* %in) #0 {
-  %load = load <16 x i16>, <16 x i16> addrspace(2)* %in
+define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(4)* %in) #0 {
+  %load = load <16 x i16>, <16 x i16> addrspace(4)* %in
  %ext = sext <16 x i16> %load to <16 x i64>
  store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
  ret void
@ -583,8 +583,8 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(<16 x i64> addrspa
 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1
 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 32, #1
 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 48, #1
-define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(2)* %in) #0 {
-  %load = load <32 x i16>, <32 x i16> addrspace(2)* %in
+define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(4)* %in) #0 {
+  %load = load <32 x i16>, <32 x i16> addrspace(4)* %in
  %ext = zext <32 x i16> %load to <32 x i64>
  store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
  ret void
@ -596,8 +596,8 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(<32 x i64> addrspa
 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1
 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 32, #1
 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 48, #1
-define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(2)* %in) #0 {
-  %load = load <32 x i16>, <32 x i16> addrspace(2)* %in
+define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(4)* %in) #0 {
+  %load = load <32 x i16>, <32 x i16> addrspace(4)* %in
  %ext = sext <32 x i16> %load to <32 x i64>
  store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
  ret void
@ -606,16 +606,16 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(<32 x i64> addrspa
 ; These trigger undefined register machine verifier errors

 ; ; XFUNC-LABEL: {{^}}constant_zextload_v64i16_to_v64i64:
-; define amdgpu_kernel void @constant_zextload_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(2)* %in) #0 {
-;   %load = load <64 x i16>, <64 x i16> addrspace(2)* %in
+; define amdgpu_kernel void @constant_zextload_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(4)* %in) #0 {
+;   %load = load <64 x i16>, <64 x i16> addrspace(4)* %in
 ;   %ext = zext <64 x i16> %load to <64 x i64>
 ;   store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
 ;   ret void
 ; }

 ; ; XFUNC-LABEL: {{^}}constant_sextload_v64i16_to_v64i64:
-; define amdgpu_kernel void @constant_sextload_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(2)* %in) #0 {
-;   %load = load <64 x i16>, <64 x i16> addrspace(2)* %in
+; define amdgpu_kernel void @constant_sextload_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(4)* %in) #0 {
+;   %load = load <64 x i16>, <64 x i16> addrspace(4)* %in
 ;   %ext = sext <64 x i16> %load to <64 x i64>
 ;   store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
 ;   ret void
--- a/test/CodeGen/AMDGPU/load-constant-i32.ll
+++ b/test/CodeGen/AMDGPU/load-constant-i32.ll
@ -7,9 +7,9 @@
 ; GCN: s_load_dword s{{[0-9]+}}

 ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0
-define amdgpu_kernel void @constant_load_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_load_i32(i32 addrspace(1)* %out, i32 addrspace(4)* %in) #0 {
 entry:
-  %ld = load i32, i32 addrspace(2)* %in
+  %ld = load i32, i32 addrspace(4)* %in
  store i32 %ld, i32 addrspace(1)* %out
  ret void
 }
@ -18,9 +18,9 @@ entry:
 ; GCN: s_load_dwordx2

 ; EG: VTX_READ_64
-define amdgpu_kernel void @constant_load_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_load_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(4)* %in) #0 {
 entry:
-  %ld = load <2 x i32>, <2 x i32> addrspace(2)* %in
+  %ld = load <2 x i32>, <2 x i32> addrspace(4)* %in
  store <2 x i32> %ld, <2 x i32> addrspace(1)* %out
  ret void
 }
@ -29,9 +29,9 @@ entry:
 ; GCN: s_load_dwordx4

 ; EG: VTX_READ_128
-define amdgpu_kernel void @constant_load_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_load_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(4)* %in) #0 {
 entry:
-  %ld = load <3 x i32>, <3 x i32> addrspace(2)* %in
+  %ld = load <3 x i32>, <3 x i32> addrspace(4)* %in
  store <3 x i32> %ld, <3 x i32> addrspace(1)* %out
  ret void
 }
@ -40,9 +40,9 @@ entry:
 ; GCN: s_load_dwordx4

 ; EG: VTX_READ_128
-define amdgpu_kernel void @constant_load_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_load_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(4)* %in) #0 {
 entry:
-  %ld = load <4 x i32>, <4 x i32> addrspace(2)* %in
+  %ld = load <4 x i32>, <4 x i32> addrspace(4)* %in
  store <4 x i32> %ld, <4 x i32> addrspace(1)* %out
  ret void
 }
@ -52,9 +52,9 @@ entry:

 ; EG: VTX_READ_128
 ; EG: VTX_READ_128
-define amdgpu_kernel void @constant_load_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_load_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(4)* %in) #0 {
 entry:
-  %ld = load <8 x i32>, <8 x i32> addrspace(2)* %in
+  %ld = load <8 x i32>, <8 x i32> addrspace(4)* %in
  store <8 x i32> %ld, <8 x i32> addrspace(1)* %out
  ret void
 }
@ -66,9 +66,9 @@ entry:
 ; EG: VTX_READ_128
 ; EG: VTX_READ_128
 ; EG: VTX_READ_128
-define amdgpu_kernel void @constant_load_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_load_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(4)* %in) #0 {
 entry:
-  %ld = load <16 x i32>, <16 x i32> addrspace(2)* %in
+  %ld = load <16 x i32>, <16 x i32> addrspace(4)* %in
  store <16 x i32> %ld, <16 x i32> addrspace(1)* %out
  ret void
 }
@ -81,8 +81,8 @@ entry:
 ; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY
 ; EG: CF_END
 ; EG: VTX_READ_32
-define amdgpu_kernel void @constant_zextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(2)* %in) #0 {
-  %ld = load i32, i32 addrspace(2)* %in
+define amdgpu_kernel void @constant_zextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(4)* %in) #0 {
+  %ld = load i32, i32 addrspace(4)* %in
  %ext = zext i32 %ld to i64
  store i64 %ext, i64 addrspace(1)* %out
  ret void
@ -98,8 +98,8 @@ define amdgpu_kernel void @constant_zextload_i32_to_i64(i64 addrspace(1)* %out,
 ; EG: VTX_READ_32
 ; EG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}},  literal.
 ; EG: 31
-define amdgpu_kernel void @constant_sextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(2)* %in) #0 {
-  %ld = load i32, i32 addrspace(2)* %in
+define amdgpu_kernel void @constant_sextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(4)* %in) #0 {
+  %ld = load i32, i32 addrspace(4)* %in
  %ext = sext i32 %ld to i64
  store i64 %ext, i64 addrspace(1)* %out
  ret void
@ -108,8 +108,8 @@ define amdgpu_kernel void @constant_sextload_i32_to_i64(i64 addrspace(1)* %out,
 ; FUNC-LABEL: {{^}}constant_zextload_v1i32_to_v1i64:
 ; GCN: s_load_dword
 ; GCN: store_dwordx2
-define amdgpu_kernel void @constant_zextload_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(2)* %in) #0 {
-  %ld = load <1 x i32>, <1 x i32> addrspace(2)* %in
+define amdgpu_kernel void @constant_zextload_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(4)* %in) #0 {
+  %ld = load <1 x i32>, <1 x i32> addrspace(4)* %in
  %ext = zext <1 x i32> %ld to <1 x i64>
  store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
  ret void
@ -119,8 +119,8 @@ define amdgpu_kernel void @constant_zextload_v1i32_to_v1i64(<1 x i64> addrspace(
 ; GCN: s_load_dword s[[LO:[0-9]+]]
 ; GCN: s_ashr_i32 s[[HI:[0-9]+]], s[[LO]], 31
 ; GCN: store_dwordx2
-define amdgpu_kernel void @constant_sextload_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(2)* %in) #0 {
-  %ld = load <1 x i32>, <1 x i32> addrspace(2)* %in
+define amdgpu_kernel void @constant_sextload_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(4)* %in) #0 {
+  %ld = load <1 x i32>, <1 x i32> addrspace(4)* %in
  %ext = sext <1 x i32> %ld to <1 x i64>
  store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
  ret void
@ -129,8 +129,8 @@ define amdgpu_kernel void @constant_sextload_v1i32_to_v1i64(<1 x i64> addrspace(
 ; FUNC-LABEL: {{^}}constant_zextload_v2i32_to_v2i64:
 ; GCN: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
 ; GCN: store_dwordx4
-define amdgpu_kernel void @constant_zextload_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(2)* %in) #0 {
-  %ld = load <2 x i32>, <2 x i32> addrspace(2)* %in
+define amdgpu_kernel void @constant_zextload_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(4)* %in) #0 {
+  %ld = load <2 x i32>, <2 x i32> addrspace(4)* %in
  %ext = zext <2 x i32> %ld to <2 x i64>
  store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
  ret void
@ -143,8 +143,8 @@ define amdgpu_kernel void @constant_zextload_v2i32_to_v2i64(<2 x i64> addrspace(
 ; GCN-DAG: s_ashr_i32

 ; GCN: store_dwordx4
-define amdgpu_kernel void @constant_sextload_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(2)* %in) #0 {
-  %ld = load <2 x i32>, <2 x i32> addrspace(2)* %in
+define amdgpu_kernel void @constant_sextload_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(4)* %in) #0 {
+  %ld = load <2 x i32>, <2 x i32> addrspace(4)* %in
  %ext = sext <2 x i32> %ld to <2 x i64>
  store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
  ret void
@ -155,8 +155,8 @@ define amdgpu_kernel void @constant_sextload_v2i32_to_v2i64(<2 x i64> addrspace(

 ; GCN: store_dwordx4
 ; GCN: store_dwordx4
-define amdgpu_kernel void @constant_zextload_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(2)* %in) #0 {
-  %ld = load <4 x i32>, <4 x i32> addrspace(2)* %in
+define amdgpu_kernel void @constant_zextload_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(4)* %in) #0 {
+  %ld = load <4 x i32>, <4 x i32> addrspace(4)* %in
  %ext = zext <4 x i32> %ld to <4 x i64>
  store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
  ret void
@ -172,8 +172,8 @@ define amdgpu_kernel void @constant_zextload_v4i32_to_v4i64(<4 x i64> addrspace(

 ; GCN: store_dwordx4
 ; GCN: store_dwordx4
-define amdgpu_kernel void @constant_sextload_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(2)* %in) #0 {
-  %ld = load <4 x i32>, <4 x i32> addrspace(2)* %in
+define amdgpu_kernel void @constant_sextload_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(4)* %in) #0 {
+  %ld = load <4 x i32>, <4 x i32> addrspace(4)* %in
  %ext = sext <4 x i32> %ld to <4 x i64>
  store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
  ret void
@ -191,8 +191,8 @@ define amdgpu_kernel void @constant_sextload_v4i32_to_v4i64(<4 x i64> addrspace(
 ; GCN-HSA-DAG: flat_store_dwordx4
 ; GCN-SA-DAG: flat_store_dwordx4
 ; GCN-HSA-DAG: flat_store_dwordx4
-define amdgpu_kernel void @constant_zextload_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(2)* %in) #0 {
-  %ld = load <8 x i32>, <8 x i32> addrspace(2)* %in
+define amdgpu_kernel void @constant_zextload_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(4)* %in) #0 {
+  %ld = load <8 x i32>, <8 x i32> addrspace(4)* %in
  %ext = zext <8 x i32> %ld to <8 x i64>
  store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
  ret void
@ -219,8 +219,8 @@ define amdgpu_kernel void @constant_zextload_v8i32_to_v8i64(<8 x i64> addrspace(
 ; GCN-HSA-DAG: flat_store_dwordx4
 ; GCN-HSA-DAG: flat_store_dwordx4
 ; GCN-HSA-DAG: flat_store_dwordx4
-define amdgpu_kernel void @constant_sextload_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(2)* %in) #0 {
-  %ld = load <8 x i32>, <8 x i32> addrspace(2)* %in
+define amdgpu_kernel void @constant_sextload_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(4)* %in) #0 {
+  %ld = load <8 x i32>, <8 x i32> addrspace(4)* %in
  %ext = sext <8 x i32> %ld to <8 x i64>
  store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
  ret void
@ -240,8 +240,8 @@ define amdgpu_kernel void @constant_sextload_v8i32_to_v8i64(<8 x i64> addrspace(
 ; GCN: store_dwordx4
 ; GCN: store_dwordx4
 ; GCN: store_dwordx4
-define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(2)* %in) #0 {
-  %ld = load <16 x i32>, <16 x i32> addrspace(2)* %in
+define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(4)* %in) #0 {
+  %ld = load <16 x i32>, <16 x i32> addrspace(4)* %in
  %ext = sext <16 x i32> %ld to <16 x i64>
  store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
  ret void
@ -267,8 +267,8 @@ define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(<16 x i64> addrspa
 ; GCN-HSA: flat_store_dwordx4
 ; GCN-HSA: flat_store_dwordx4
 ; GCN-HSA: flat_store_dwordx4
-define amdgpu_kernel void @constant_zextload_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(2)* %in) #0 {
-  %ld = load <16 x i32>, <16 x i32> addrspace(2)* %in
+define amdgpu_kernel void @constant_zextload_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(4)* %in) #0 {
+  %ld = load <16 x i32>, <16 x i32> addrspace(4)* %in
  %ext = zext <16 x i32> %ld to <16 x i64>
  store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
  ret void
@ -319,8 +319,8 @@ define amdgpu_kernel void @constant_zextload_v16i32_to_v16i64(<16 x i64> addrspa
 ; GCN-HSA-DAG: flat_store_dwordx4
 ; GCN-HSA-DAG: flat_store_dwordx4

-define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(2)* %in) #0 {
-  %ld = load <32 x i32>, <32 x i32> addrspace(2)* %in
+define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(4)* %in) #0 {
+  %ld = load <32 x i32>, <32 x i32> addrspace(4)* %in
  %ext = sext <32 x i32> %ld to <32 x i64>
  store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
  ret void
@ -370,8 +370,8 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(<32 x i64> addrspa
 ; GCN-HSA-DAG: flat_store_dwordx4
 ; GCN-HSA-DAG: flat_store_dwordx4
 ; GCN-HSA-DAG: flat_store_dwordx4
-define amdgpu_kernel void @constant_zextload_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(2)* %in) #0 {
-  %ld = load <32 x i32>, <32 x i32> addrspace(2)* %in
+define amdgpu_kernel void @constant_zextload_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(4)* %in) #0 {
+  %ld = load <32 x i32>, <32 x i32> addrspace(4)* %in
  %ext = zext <32 x i32> %ld to <32 x i64>
  store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
  ret void
--- a/test/CodeGen/AMDGPU/load-constant-i64.ll
+++ b/test/CodeGen/AMDGPU/load-constant-i64.ll
@ -7,8 +7,8 @@
 ; FUNC-LABEL: {{^}}constant_load_i64:
 ; GCN: s_load_dwordx2 {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
 ; EG: VTX_READ_64
-define amdgpu_kernel void @constant_load_i64(i64 addrspace(1)* %out, i64 addrspace(2)* %in) #0 {
-  %ld = load i64, i64 addrspace(2)* %in
+define amdgpu_kernel void @constant_load_i64(i64 addrspace(1)* %out, i64 addrspace(4)* %in) #0 {
+  %ld = load i64, i64 addrspace(4)* %in
  store i64 %ld, i64 addrspace(1)* %out
  ret void
 }
@ -17,9 +17,9 @@ define amdgpu_kernel void @constant_load_i64(i64 addrspace(1)* %out, i64 addrspa
 ; GCN: s_load_dwordx4

 ; EG: VTX_READ_128
-define amdgpu_kernel void @constant_load_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_load_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(4)* %in) #0 {
 entry:
-  %ld = load <2 x i64>, <2 x i64> addrspace(2)* %in
+  %ld = load <2 x i64>, <2 x i64> addrspace(4)* %in
  store <2 x i64> %ld, <2 x i64> addrspace(1)* %out
  ret void
 }
@ -29,9 +29,9 @@ entry:

 ; EG-DAG: VTX_READ_128
 ; EG-DAG: VTX_READ_128
-define amdgpu_kernel void @constant_load_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_load_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> addrspace(4)* %in) #0 {
 entry:
-  %ld = load <3 x i64>, <3 x i64> addrspace(2)* %in
+  %ld = load <3 x i64>, <3 x i64> addrspace(4)* %in
  store <3 x i64> %ld, <3 x i64> addrspace(1)* %out
  ret void
 }
@ -41,9 +41,9 @@ entry:

 ; EG: VTX_READ_128
 ; EG: VTX_READ_128
-define amdgpu_kernel void @constant_load_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_load_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(4)* %in) #0 {
 entry:
-  %ld = load <4 x i64>, <4 x i64> addrspace(2)* %in
+  %ld = load <4 x i64>, <4 x i64> addrspace(4)* %in
  store <4 x i64> %ld, <4 x i64> addrspace(1)* %out
  ret void
 }
@ -55,9 +55,9 @@ entry:
 ; EG: VTX_READ_128
 ; EG: VTX_READ_128
 ; EG: VTX_READ_128
-define amdgpu_kernel void @constant_load_v8i64(<8 x i64> addrspace(1)* %out, <8 x i64> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_load_v8i64(<8 x i64> addrspace(1)* %out, <8 x i64> addrspace(4)* %in) #0 {
 entry:
-  %ld = load <8 x i64>, <8 x i64> addrspace(2)* %in
+  %ld = load <8 x i64>, <8 x i64> addrspace(4)* %in
  store <8 x i64> %ld, <8 x i64> addrspace(1)* %out
  ret void
 }
@ -74,9 +74,9 @@ entry:
 ; EG: VTX_READ_128
 ; EG: VTX_READ_128
 ; EG: VTX_READ_128
-define amdgpu_kernel void @constant_load_v16i64(<16 x i64> addrspace(1)* %out, <16 x i64> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_load_v16i64(<16 x i64> addrspace(1)* %out, <16 x i64> addrspace(4)* %in) #0 {
 entry:
-  %ld = load <16 x i64>, <16 x i64> addrspace(2)* %in
+  %ld = load <16 x i64>, <16 x i64> addrspace(4)* %in
  store <16 x i64> %ld, <16 x i64> addrspace(1)* %out
  ret void
 }
--- a/test/CodeGen/AMDGPU/load-constant-i8.ll
+++ b/test/CodeGen/AMDGPU/load-constant-i8.ll
@ -10,9 +10,9 @@

 ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
 ; TODO: NOT AND
-define amdgpu_kernel void @constant_load_i8(i8 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_load_i8(i8 addrspace(1)* %out, i8 addrspace(4)* %in) #0 {
 entry:
-  %ld = load i8, i8 addrspace(2)* %in
+  %ld = load i8, i8 addrspace(4)* %in
  store i8 %ld, i8 addrspace(1)* %out
  ret void
 }
@ -22,9 +22,9 @@ entry:
 ; GCN-HSA: flat_load_ushort v

 ; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define amdgpu_kernel void @constant_load_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_load_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(4)* %in) #0 {
 entry:
-  %ld = load <2 x i8>, <2 x i8> addrspace(2)* %in
+  %ld = load <2 x i8>, <2 x i8> addrspace(4)* %in
  store <2 x i8> %ld, <2 x i8> addrspace(1)* %out
  ret void
 }
@ -33,9 +33,9 @@ entry:
 ; GCN: s_load_dword s

 ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define amdgpu_kernel void @constant_load_v3i8(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_load_v3i8(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(4)* %in) #0 {
 entry:
-  %ld = load <3 x i8>, <3 x i8> addrspace(2)* %in
+  %ld = load <3 x i8>, <3 x i8> addrspace(4)* %in
  store <3 x i8> %ld, <3 x i8> addrspace(1)* %out
  ret void
 }
@ -44,9 +44,9 @@ entry:
 ; GCN: s_load_dword s

 ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define amdgpu_kernel void @constant_load_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_load_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(4)* %in) #0 {
 entry:
-  %ld = load <4 x i8>, <4 x i8> addrspace(2)* %in
+  %ld = load <4 x i8>, <4 x i8> addrspace(4)* %in
  store <4 x i8> %ld, <4 x i8> addrspace(1)* %out
  ret void
 }
@ -55,9 +55,9 @@ entry:
 ; GCN: s_load_dwordx2

 ; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1
-define amdgpu_kernel void @constant_load_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_load_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> addrspace(4)* %in) #0 {
 entry:
-  %ld = load <8 x i8>, <8 x i8> addrspace(2)* %in
+  %ld = load <8 x i8>, <8 x i8> addrspace(4)* %in
  store <8 x i8> %ld, <8 x i8> addrspace(1)* %out
  ret void
 }
@ -66,9 +66,9 @@ entry:
 ; GCN: s_load_dwordx4

 ; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
-define amdgpu_kernel void @constant_load_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_load_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> addrspace(4)* %in) #0 {
 entry:
-  %ld = load <16 x i8>, <16 x i8> addrspace(2)* %in
+  %ld = load <16 x i8>, <16 x i8> addrspace(4)* %in
  store <16 x i8> %ld, <16 x i8> addrspace(1)* %out
  ret void
 }
@ -78,8 +78,8 @@ entry:
 ; GCN-HSA: flat_load_ubyte

 ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define amdgpu_kernel void @constant_zextload_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {
-  %a = load i8, i8 addrspace(2)* %in
+define amdgpu_kernel void @constant_zextload_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(4)* %in) #0 {
+  %a = load i8, i8 addrspace(4)* %in
  %ext = zext i8 %a to i32
  store i32 %ext, i32 addrspace(1)* %out
  ret void
@ -92,8 +92,8 @@ define amdgpu_kernel void @constant_zextload_i8_to_i32(i32 addrspace(1)* %out, i
 ; EG: VTX_READ_8 [[DST:T[0-9]+\.X]], T{{[0-9]+}}.X, 0, #1
 ; EG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal
 ; EG: 8
-define amdgpu_kernel void @constant_sextload_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {
-  %ld = load i8, i8 addrspace(2)* %in
+define amdgpu_kernel void @constant_sextload_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(4)* %in) #0 {
+  %ld = load i8, i8 addrspace(4)* %in
  %ext = sext i8 %ld to i32
  store i32 %ext, i32 addrspace(1)* %out
  ret void
@ -102,8 +102,8 @@ define amdgpu_kernel void @constant_sextload_i8_to_i32(i32 addrspace(1)* %out, i
 ; FUNC-LABEL: {{^}}constant_zextload_v1i8_to_v1i32:

 ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define amdgpu_kernel void @constant_zextload_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8> addrspace(2)* %in) #0 {
-  %load = load <1 x i8>, <1 x i8> addrspace(2)* %in
+define amdgpu_kernel void @constant_zextload_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8> addrspace(4)* %in) #0 {
+  %load = load <1 x i8>, <1 x i8> addrspace(4)* %in
  %ext = zext <1 x i8> %load to <1 x i32>
  store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
  ret void
@ -114,8 +114,8 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i32(<1 x i32> addrspace(1
 ; EG: VTX_READ_8 [[DST:T[0-9]+\.X]], T{{[0-9]+}}.X, 0, #1
 ; EG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal
 ; EG: 8
-define amdgpu_kernel void @constant_sextload_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8> addrspace(2)* %in) #0 {
-  %load = load <1 x i8>, <1 x i8> addrspace(2)* %in
+define amdgpu_kernel void @constant_sextload_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8> addrspace(4)* %in) #0 {
+  %load = load <1 x i8>, <1 x i8> addrspace(4)* %in
  %ext = sext <1 x i8> %load to <1 x i32>
  store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
  ret void
@ -129,8 +129,8 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i32(<1 x i32> addrspace(1
 ; TODO: This should use DST, but for some there are redundant MOVs
 ; EG: BFE_UINT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, {{.*}}literal
 ; EG: 8
-define amdgpu_kernel void @constant_zextload_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 {
-  %load = load <2 x i8>, <2 x i8> addrspace(2)* %in
+define amdgpu_kernel void @constant_zextload_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(4)* %in) #0 {
+  %load = load <2 x i8>, <2 x i8> addrspace(4)* %in
  %ext = zext <2 x i8> %load to <2 x i32>
  store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
  ret void
@ -150,8 +150,8 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i32(<2 x i32> addrspace(1
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
 ; EG-DAG: 8
 ; EG-DAG: 8
-define amdgpu_kernel void @constant_sextload_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 {
-  %load = load <2 x i8>, <2 x i8> addrspace(2)* %in
+define amdgpu_kernel void @constant_sextload_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(4)* %in) #0 {
+  %load = load <2 x i8>, <2 x i8> addrspace(4)* %in
  %ext = sext <2 x i8> %load to <2 x i32>
  store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
  ret void
@ -170,9 +170,9 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i32(<2 x i32> addrspace(1
 ; EG-DAG: BFE_UINT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, {{.*}}literal
 ; EG-DAG: 8
 ; EG-DAG: 8
-define amdgpu_kernel void @constant_zextload_v3i8_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v3i8_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i8> addrspace(4)* %in) #0 {
 entry:
-  %ld = load <3 x i8>, <3 x i8> addrspace(2)* %in
+  %ld = load <3 x i8>, <3 x i8> addrspace(4)* %in
  %ext = zext <3 x i8> %ld to <3 x i32>
  store <3 x i32> %ext, <3 x i32> addrspace(1)* %out
  ret void
@ -193,9 +193,9 @@ entry:
 ; EG-DAG: 8
 ; EG-DAG: 8
 ; EG-DAG: 8
-define amdgpu_kernel void @constant_sextload_v3i8_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v3i8_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i8> addrspace(4)* %in) #0 {
 entry:
-  %ld = load <3 x i8>, <3 x i8> addrspace(2)* %in
+  %ld = load <3 x i8>, <3 x i8> addrspace(4)* %in
  %ext = sext <3 x i8> %ld to <3 x i32>
  store <3 x i32> %ext, <3 x i32> addrspace(1)* %out
  ret void
@ -214,8 +214,8 @@ entry:
 ; EG-DAG: 8
 ; EG-DAG: 8
 ; EG-DAG: 8
-define amdgpu_kernel void @constant_zextload_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 {
-  %load = load <4 x i8>, <4 x i8> addrspace(2)* %in
+define amdgpu_kernel void @constant_zextload_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(4)* %in) #0 {
+  %load = load <4 x i8>, <4 x i8> addrspace(4)* %in
  %ext = zext <4 x i8> %load to <4 x i32>
  store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
  ret void
@ -236,8 +236,8 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i32(<4 x i32> addrspace(1
 ; EG-DAG: 8
 ; EG-DAG: 8
 ; EG-DAG: 8
-define amdgpu_kernel void @constant_sextload_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 {
-  %load = load <4 x i8>, <4 x i8> addrspace(2)* %in
+define amdgpu_kernel void @constant_sextload_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(4)* %in) #0 {
+  %load = load <4 x i8>, <4 x i8> addrspace(4)* %in
  %ext = sext <4 x i8> %load to <4 x i32>
  store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
  ret void
@ -264,8 +264,8 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i32(<4 x i32> addrspace(1
 ; EG-DAG: 8
 ; EG-DAG: 8
 ; EG-DAG: 8
-define amdgpu_kernel void @constant_zextload_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 {
-  %load = load <8 x i8>, <8 x i8> addrspace(2)* %in
+define amdgpu_kernel void @constant_zextload_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i8> addrspace(4)* %in) #0 {
+  %load = load <8 x i8>, <8 x i8> addrspace(4)* %in
  %ext = zext <8 x i8> %load to <8 x i32>
  store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
  ret void
@ -294,8 +294,8 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i32(<8 x i32> addrspace(1
 ; EG-DAG: 8
 ; EG-DAG: 8
 ; EG-DAG: 8
-define amdgpu_kernel void @constant_sextload_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 {
-  %load = load <8 x i8>, <8 x i8> addrspace(2)* %in
+define amdgpu_kernel void @constant_sextload_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i8> addrspace(4)* %in) #0 {
+  %load = load <8 x i8>, <8 x i8> addrspace(4)* %in
  %ext = sext <8 x i8> %load to <8 x i32>
  store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
  ret void
@ -335,8 +335,8 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i32(<8 x i32> addrspace(1
 ; EG-DAG: 8
 ; EG-DAG: 8
 ; EG-DAG: 8
-define amdgpu_kernel void @constant_zextload_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 {
-  %load = load <16 x i8>, <16 x i8> addrspace(2)* %in
+define amdgpu_kernel void @constant_zextload_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i8> addrspace(4)* %in) #0 {
+  %load = load <16 x i8>, <16 x i8> addrspace(4)* %in
  %ext = zext <16 x i8> %load to <16 x i32>
  store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
  ret void
@ -378,8 +378,8 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i32(<16 x i32> addrspac
 ; EG-DAG: 8
 ; EG-DAG: 8
 ; EG-DAG: 8
-define amdgpu_kernel void @constant_sextload_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 {
-  %load = load <16 x i8>, <16 x i8> addrspace(2)* %in
+define amdgpu_kernel void @constant_sextload_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i8> addrspace(4)* %in) #0 {
+  %load = load <16 x i8>, <16 x i8> addrspace(4)* %in
  %ext = sext <16 x i8> %load to <16 x i32>
  store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
  ret void
@ -450,8 +450,8 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i32(<16 x i32> addrspac
 ; EG-DAG: 8
 ; EG-DAG: 8
 ; EG-DAG: 8
-define amdgpu_kernel void @constant_zextload_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i8> addrspace(2)* %in) #0 {
-  %load = load <32 x i8>, <32 x i8> addrspace(2)* %in
+define amdgpu_kernel void @constant_zextload_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i8> addrspace(4)* %in) #0 {
+  %load = load <32 x i8>, <32 x i8> addrspace(4)* %in
  %ext = zext <32 x i8> %load to <32 x i32>
  store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
  ret void
@ -526,8 +526,8 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i32(<32 x i32> addrspac
 ; EG-DAG: 8
 ; EG-DAG: 8
 ; EG-DAG: 8
-define amdgpu_kernel void @constant_sextload_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i8> addrspace(2)* %in) #0 {
-  %load = load <32 x i8>, <32 x i8> addrspace(2)* %in
+define amdgpu_kernel void @constant_sextload_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i8> addrspace(4)* %in) #0 {
+  %load = load <32 x i8>, <32 x i8> addrspace(4)* %in
  %ext = sext <32 x i8> %load to <32 x i32>
  store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
  ret void
@ -539,8 +539,8 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i32(<32 x i32> addrspac
 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, T{{[0-9]+}}.X, 16, #1
 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, T{{[0-9]+}}.X, 32, #1
 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, T{{[0-9]+}}.X, 48, #1
-define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i8> addrspace(2)* %in) #0 {
-  %load = load <64 x i8>, <64 x i8> addrspace(2)* %in
+define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i8> addrspace(4)* %in) #0 {
+  %load = load <64 x i8>, <64 x i8> addrspace(4)* %in
  %ext = zext <64 x i8> %load to <64 x i32>
  store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
  ret void
@ -552,8 +552,8 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(<64 x i32> addrspac
 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, T{{[0-9]+}}.X, 16, #1
 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, T{{[0-9]+}}.X, 32, #1
 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, T{{[0-9]+}}.X, 48, #1
-define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i8> addrspace(2)* %in) #0 {
-  %load = load <64 x i8>, <64 x i8> addrspace(2)* %in
+define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i8> addrspace(4)* %in) #0 {
+  %load = load <64 x i8>, <64 x i8> addrspace(4)* %in
  %ext = sext <64 x i8> %load to <64 x i32>
  store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
  ret void
@ -570,8 +570,8 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(<64 x i32> addrspac

 ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
 ; EG: MOV {{.*}}, 0.0
-define amdgpu_kernel void @constant_zextload_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {
-  %a = load i8, i8 addrspace(2)* %in
+define amdgpu_kernel void @constant_zextload_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(4)* %in) #0 {
+  %a = load i8, i8 addrspace(4)* %in
  %ext = zext i8 %a to i64
  store i64 %ext, i64 addrspace(1)* %out
  ret void
@ -589,8 +589,8 @@ define amdgpu_kernel void @constant_zextload_i8_to_i64(i64 addrspace(1)* %out, i
 ; EG: ASHR {{\**}} {{T[0-9]\.[XYZW]}}, {{.*}}, literal
 ; TODO: Why not 7 ?
 ; EG: 31
-define amdgpu_kernel void @constant_sextload_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {
-  %a = load i8, i8 addrspace(2)* %in
+define amdgpu_kernel void @constant_sextload_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(4)* %in) #0 {
+  %a = load i8, i8 addrspace(4)* %in
  %ext = sext i8 %a to i64
  store i64 %ext, i64 addrspace(1)* %out
  ret void
@ -600,8 +600,8 @@ define amdgpu_kernel void @constant_sextload_i8_to_i64(i64 addrspace(1)* %out, i

 ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
 ; EG: MOV {{.*}}, 0.0
-define amdgpu_kernel void @constant_zextload_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i8> addrspace(2)* %in) #0 {
-  %load = load <1 x i8>, <1 x i8> addrspace(2)* %in
+define amdgpu_kernel void @constant_zextload_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i8> addrspace(4)* %in) #0 {
+  %load = load <1 x i8>, <1 x i8> addrspace(4)* %in
  %ext = zext <1 x i8> %load to <1 x i64>
  store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
  ret void
@ -613,8 +613,8 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i64(<1 x i64> addrspace(1
 ; EG: ASHR {{\**}} {{T[0-9]\.[XYZW]}}, {{.*}}, literal
 ; TODO: Why not 7 ?
 ; EG: 31
-define amdgpu_kernel void @constant_sextload_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i8> addrspace(2)* %in) #0 {
-  %load = load <1 x i8>, <1 x i8> addrspace(2)* %in
+define amdgpu_kernel void @constant_sextload_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i8> addrspace(4)* %in) #0 {
+  %load = load <1 x i8>, <1 x i8> addrspace(4)* %in
  %ext = sext <1 x i8> %load to <1 x i64>
  store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
  ret void
@ -623,8 +623,8 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i64(<1 x i64> addrspace(1
 ; FUNC-LABEL: {{^}}constant_zextload_v2i8_to_v2i64:

 ; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define amdgpu_kernel void @constant_zextload_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 {
-  %load = load <2 x i8>, <2 x i8> addrspace(2)* %in
+define amdgpu_kernel void @constant_zextload_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i8> addrspace(4)* %in) #0 {
+  %load = load <2 x i8>, <2 x i8> addrspace(4)* %in
  %ext = zext <2 x i8> %load to <2 x i64>
  store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
  ret void
@ -633,8 +633,8 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i64(<2 x i64> addrspace(1
 ; FUNC-LABEL: {{^}}constant_sextload_v2i8_to_v2i64:

 ; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define amdgpu_kernel void @constant_sextload_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 {
-  %load = load <2 x i8>, <2 x i8> addrspace(2)* %in
+define amdgpu_kernel void @constant_sextload_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i8> addrspace(4)* %in) #0 {
+  %load = load <2 x i8>, <2 x i8> addrspace(4)* %in
  %ext = sext <2 x i8> %load to <2 x i64>
  store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
  ret void
@ -643,8 +643,8 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i64(<2 x i64> addrspace(1
 ; FUNC-LABEL: {{^}}constant_zextload_v4i8_to_v4i64:

 ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define amdgpu_kernel void @constant_zextload_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 {
-  %load = load <4 x i8>, <4 x i8> addrspace(2)* %in
+define amdgpu_kernel void @constant_zextload_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i8> addrspace(4)* %in) #0 {
+  %load = load <4 x i8>, <4 x i8> addrspace(4)* %in
  %ext = zext <4 x i8> %load to <4 x i64>
  store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
  ret void
@ -653,8 +653,8 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i64(<4 x i64> addrspace(1
 ; FUNC-LABEL: {{^}}constant_sextload_v4i8_to_v4i64:

 ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define amdgpu_kernel void @constant_sextload_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 {
-  %load = load <4 x i8>, <4 x i8> addrspace(2)* %in
+define amdgpu_kernel void @constant_sextload_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i8> addrspace(4)* %in) #0 {
+  %load = load <4 x i8>, <4 x i8> addrspace(4)* %in
  %ext = sext <4 x i8> %load to <4 x i64>
  store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
  ret void
@ -663,8 +663,8 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i64(<4 x i64> addrspace(1
 ; FUNC-LABEL: {{^}}constant_zextload_v8i8_to_v8i64:

 ; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1
-define amdgpu_kernel void @constant_zextload_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 {
-  %load = load <8 x i8>, <8 x i8> addrspace(2)* %in
+define amdgpu_kernel void @constant_zextload_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i8> addrspace(4)* %in) #0 {
+  %load = load <8 x i8>, <8 x i8> addrspace(4)* %in
  %ext = zext <8 x i8> %load to <8 x i64>
  store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
  ret void
@ -673,8 +673,8 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i64(<8 x i64> addrspace(1
 ; FUNC-LABEL: {{^}}constant_sextload_v8i8_to_v8i64:

 ; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1
-define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 {
-  %load = load <8 x i8>, <8 x i8> addrspace(2)* %in
+define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i8> addrspace(4)* %in) #0 {
+  %load = load <8 x i8>, <8 x i8> addrspace(4)* %in
  %ext = sext <8 x i8> %load to <8 x i64>
  store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
  ret void
@ -683,8 +683,8 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(<8 x i64> addrspace(1
 ; FUNC-LABEL: {{^}}constant_zextload_v16i8_to_v16i64:

 ; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
-define amdgpu_kernel void @constant_zextload_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 {
-  %load = load <16 x i8>, <16 x i8> addrspace(2)* %in
+define amdgpu_kernel void @constant_zextload_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i8> addrspace(4)* %in) #0 {
+  %load = load <16 x i8>, <16 x i8> addrspace(4)* %in
  %ext = zext <16 x i8> %load to <16 x i64>
  store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
  ret void
@ -693,8 +693,8 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i64(<16 x i64> addrspac
 ; FUNC-LABEL: {{^}}constant_sextload_v16i8_to_v16i64:

 ; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
-define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 {
-  %load = load <16 x i8>, <16 x i8> addrspace(2)* %in
+define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i8> addrspace(4)* %in) #0 {
+  %load = load <16 x i8>, <16 x i8> addrspace(4)* %in
  %ext = sext <16 x i8> %load to <16 x i64>
  store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
  ret void
@ -704,8 +704,8 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(<16 x i64> addrspac

 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1
-define amdgpu_kernel void @constant_zextload_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i8> addrspace(2)* %in) #0 {
-  %load = load <32 x i8>, <32 x i8> addrspace(2)* %in
+define amdgpu_kernel void @constant_zextload_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i8> addrspace(4)* %in) #0 {
+  %load = load <32 x i8>, <32 x i8> addrspace(4)* %in
  %ext = zext <32 x i8> %load to <32 x i64>
  store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
  ret void
@ -715,24 +715,24 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i64(<32 x i64> addrspac

 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1
-define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i8> addrspace(2)* %in) #0 {
-  %load = load <32 x i8>, <32 x i8> addrspace(2)* %in
+define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i8> addrspace(4)* %in) #0 {
+  %load = load <32 x i8>, <32 x i8> addrspace(4)* %in
  %ext = sext <32 x i8> %load to <32 x i64>
  store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
  ret void
 }

 ; XFUNC-LABEL: {{^}}constant_zextload_v64i8_to_v64i64:
-; define amdgpu_kernel void @constant_zextload_v64i8_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i8> addrspace(2)* %in) #0 {
-;   %load = load <64 x i8>, <64 x i8> addrspace(2)* %in
+; define amdgpu_kernel void @constant_zextload_v64i8_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i8> addrspace(4)* %in) #0 {
+;   %load = load <64 x i8>, <64 x i8> addrspace(4)* %in
 ;   %ext = zext <64 x i8> %load to <64 x i64>
 ;   store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
 ;   ret void
 ; }

 ; XFUNC-LABEL: {{^}}constant_sextload_v64i8_to_v64i64:
-; define amdgpu_kernel void @constant_sextload_v64i8_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i8> addrspace(2)* %in) #0 {
-;   %load = load <64 x i8>, <64 x i8> addrspace(2)* %in
+; define amdgpu_kernel void @constant_sextload_v64i8_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i8> addrspace(4)* %in) #0 {
+;   %load = load <64 x i8>, <64 x i8> addrspace(4)* %in
 ;   %ext = sext <64 x i8> %load to <64 x i64>
 ;   store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
 ;   ret void
@ -744,8 +744,8 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(<32 x i64> addrspac

 ; GCN-HSA: flat_load_ubyte v[[VAL:[0-9]+]],
 ; GCN-HSA: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, v[[VAL]]
-define amdgpu_kernel void @constant_zextload_i8_to_i16(i16 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {
-  %a = load i8, i8 addrspace(2)* %in
+define amdgpu_kernel void @constant_zextload_i8_to_i16(i16 addrspace(1)* %out, i8 addrspace(4)* %in) #0 {
+  %a = load i8, i8 addrspace(4)* %in
  %ext = zext i8 %a to i16
  store i16 %ext, i16 addrspace(1)* %out
  ret void
@ -759,16 +759,16 @@ define amdgpu_kernel void @constant_zextload_i8_to_i16(i16 addrspace(1)* %out, i
 ; GCN-HSA: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, v[[VAL]]

 ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define amdgpu_kernel void @constant_sextload_i8_to_i16(i16 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {
-  %a = load i8, i8 addrspace(2)* %in
+define amdgpu_kernel void @constant_sextload_i8_to_i16(i16 addrspace(1)* %out, i8 addrspace(4)* %in) #0 {
+  %a = load i8, i8 addrspace(4)* %in
  %ext = sext i8 %a to i16
  store i16 %ext, i16 addrspace(1)* %out
  ret void
 }

 ; FUNC-LABEL: {{^}}constant_zextload_v1i8_to_v1i16:
-define amdgpu_kernel void @constant_zextload_v1i8_to_v1i16(<1 x i16> addrspace(1)* %out, <1 x i8> addrspace(2)* %in) #0 {
-  %load = load <1 x i8>, <1 x i8> addrspace(2)* %in
+define amdgpu_kernel void @constant_zextload_v1i8_to_v1i16(<1 x i16> addrspace(1)* %out, <1 x i8> addrspace(4)* %in) #0 {
+  %load = load <1 x i8>, <1 x i8> addrspace(4)* %in
  %ext = zext <1 x i8> %load to <1 x i16>
  store <1 x i16> %ext, <1 x i16> addrspace(1)* %out
  ret void
@ -778,8 +778,8 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i16(<1 x i16> addrspace(1

 ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-define amdgpu_kernel void @constant_sextload_v1i8_to_v1i16(<1 x i16> addrspace(1)* %out, <1 x i8> addrspace(2)* %in) #0 {
-  %load = load <1 x i8>, <1 x i8> addrspace(2)* %in
+define amdgpu_kernel void @constant_sextload_v1i8_to_v1i16(<1 x i16> addrspace(1)* %out, <1 x i8> addrspace(4)* %in) #0 {
+  %load = load <1 x i8>, <1 x i8> addrspace(4)* %in
  %ext = sext <1 x i8> %load to <1 x i16>
  store <1 x i16> %ext, <1 x i16> addrspace(1)* %out
  ret void
@ -788,8 +788,8 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i16(<1 x i16> addrspace(1
 ; FUNC-LABEL: {{^}}constant_zextload_v2i8_to_v2i16:

 ; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define amdgpu_kernel void @constant_zextload_v2i8_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 {
-  %load = load <2 x i8>, <2 x i8> addrspace(2)* %in
+define amdgpu_kernel void @constant_zextload_v2i8_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i8> addrspace(4)* %in) #0 {
+  %load = load <2 x i8>, <2 x i8> addrspace(4)* %in
  %ext = zext <2 x i8> %load to <2 x i16>
  store <2 x i16> %ext, <2 x i16> addrspace(1)* %out
  ret void
@ -800,8 +800,8 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i16(<2 x i16> addrspace(1
 ; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-define amdgpu_kernel void @constant_sextload_v2i8_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 {
-  %load = load <2 x i8>, <2 x i8> addrspace(2)* %in
+define amdgpu_kernel void @constant_sextload_v2i8_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i8> addrspace(4)* %in) #0 {
+  %load = load <2 x i8>, <2 x i8> addrspace(4)* %in
  %ext = sext <2 x i8> %load to <2 x i16>
  store <2 x i16> %ext, <2 x i16> addrspace(1)* %out
  ret void
@ -810,8 +810,8 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i16(<2 x i16> addrspace(1
 ; FUNC-LABEL: {{^}}constant_zextload_v4i8_to_v4i16:

 ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define amdgpu_kernel void @constant_zextload_v4i8_to_v4i16(<4 x i16> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 {
-  %load = load <4 x i8>, <4 x i8> addrspace(2)* %in
+define amdgpu_kernel void @constant_zextload_v4i8_to_v4i16(<4 x i16> addrspace(1)* %out, <4 x i8> addrspace(4)* %in) #0 {
+  %load = load <4 x i8>, <4 x i8> addrspace(4)* %in
  %ext = zext <4 x i8> %load to <4 x i16>
  store <4 x i16> %ext, <4 x i16> addrspace(1)* %out
  ret void
@ -824,8 +824,8 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i16(<4 x i16> addrspace(1
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-define amdgpu_kernel void @constant_sextload_v4i8_to_v4i16(<4 x i16> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 {
-  %load = load <4 x i8>, <4 x i8> addrspace(2)* %in
+define amdgpu_kernel void @constant_sextload_v4i8_to_v4i16(<4 x i16> addrspace(1)* %out, <4 x i8> addrspace(4)* %in) #0 {
+  %load = load <4 x i8>, <4 x i8> addrspace(4)* %in
  %ext = sext <4 x i8> %load to <4 x i16>
  store <4 x i16> %ext, <4 x i16> addrspace(1)* %out
  ret void
@ -834,8 +834,8 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i16(<4 x i16> addrspace(1
 ; FUNC-LABEL: {{^}}constant_zextload_v8i8_to_v8i16:

 ; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1
-define amdgpu_kernel void @constant_zextload_v8i8_to_v8i16(<8 x i16> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 {
-  %load = load <8 x i8>, <8 x i8> addrspace(2)* %in
+define amdgpu_kernel void @constant_zextload_v8i8_to_v8i16(<8 x i16> addrspace(1)* %out, <8 x i8> addrspace(4)* %in) #0 {
+  %load = load <8 x i8>, <8 x i8> addrspace(4)* %in
  %ext = zext <8 x i8> %load to <8 x i16>
  store <8 x i16> %ext, <8 x i16> addrspace(1)* %out
  ret void
@ -853,8 +853,8 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i16(<8 x i16> addrspace(1
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal

-define amdgpu_kernel void @constant_sextload_v8i8_to_v8i16(<8 x i16> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 {
-  %load = load <8 x i8>, <8 x i8> addrspace(2)* %in
+define amdgpu_kernel void @constant_sextload_v8i8_to_v8i16(<8 x i16> addrspace(1)* %out, <8 x i8> addrspace(4)* %in) #0 {
+  %load = load <8 x i8>, <8 x i8> addrspace(4)* %in
  %ext = sext <8 x i8> %load to <8 x i16>
  store <8 x i16> %ext, <8 x i16> addrspace(1)* %out
  ret void
@ -863,8 +863,8 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i16(<8 x i16> addrspace(1
 ; FUNC-LABEL: {{^}}constant_zextload_v16i8_to_v16i16:

 ; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
-define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(<16 x i16> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 {
-  %load = load <16 x i8>, <16 x i8> addrspace(2)* %in
+define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(<16 x i16> addrspace(1)* %out, <16 x i8> addrspace(4)* %in) #0 {
+  %load = load <16 x i8>, <16 x i8> addrspace(4)* %in
  %ext = zext <16 x i8> %load to <16 x i16>
  store <16 x i16> %ext, <16 x i16> addrspace(1)* %out
  ret void
@ -889,8 +889,8 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(<16 x i16> addrspac
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(<16 x i16> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 {
-  %load = load <16 x i8>, <16 x i8> addrspace(2)* %in
+define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(<16 x i16> addrspace(1)* %out, <16 x i8> addrspace(4)* %in) #0 {
+  %load = load <16 x i8>, <16 x i8> addrspace(4)* %in
  %ext = sext <16 x i8> %load to <16 x i16>
  store <16 x i16> %ext, <16 x i16> addrspace(1)* %out
  ret void
@ -900,8 +900,8 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(<16 x i16> addrspac

 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1
-define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(<32 x i16> addrspace(1)* %out, <32 x i8> addrspace(2)* %in) #0 {
-  %load = load <32 x i8>, <32 x i8> addrspace(2)* %in
+define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(<32 x i16> addrspace(1)* %out, <32 x i8> addrspace(4)* %in) #0 {
+  %load = load <32 x i8>, <32 x i8> addrspace(4)* %in
  %ext = zext <32 x i8> %load to <32 x i16>
  store <32 x i16> %ext, <32 x i16> addrspace(1)* %out
  ret void
@ -943,24 +943,24 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(<32 x i16> addrspac
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(<32 x i16> addrspace(1)* %out, <32 x i8> addrspace(2)* %in) #0 {
-  %load = load <32 x i8>, <32 x i8> addrspace(2)* %in
+define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(<32 x i16> addrspace(1)* %out, <32 x i8> addrspace(4)* %in) #0 {
+  %load = load <32 x i8>, <32 x i8> addrspace(4)* %in
  %ext = sext <32 x i8> %load to <32 x i16>
  store <32 x i16> %ext, <32 x i16> addrspace(1)* %out
  ret void
 }

 ; XFUNC-LABEL: {{^}}constant_zextload_v64i8_to_v64i16:
-; define amdgpu_kernel void @constant_zextload_v64i8_to_v64i16(<64 x i16> addrspace(1)* %out, <64 x i8> addrspace(2)* %in) #0 {
-;   %load = load <64 x i8>, <64 x i8> addrspace(2)* %in
+; define amdgpu_kernel void @constant_zextload_v64i8_to_v64i16(<64 x i16> addrspace(1)* %out, <64 x i8> addrspace(4)* %in) #0 {
+;   %load = load <64 x i8>, <64 x i8> addrspace(4)* %in
 ;   %ext = zext <64 x i8> %load to <64 x i16>
 ;   store <64 x i16> %ext, <64 x i16> addrspace(1)* %out
 ;   ret void
 ; }

 ; XFUNC-LABEL: {{^}}constant_sextload_v64i8_to_v64i16:
-; define amdgpu_kernel void @constant_sextload_v64i8_to_v64i16(<64 x i16> addrspace(1)* %out, <64 x i8> addrspace(2)* %in) #0 {
-;   %load = load <64 x i8>, <64 x i8> addrspace(2)* %in
+; define amdgpu_kernel void @constant_sextload_v64i8_to_v64i16(<64 x i16> addrspace(1)* %out, <64 x i8> addrspace(4)* %in) #0 {
+;   %load = load <64 x i8>, <64 x i8> addrspace(4)* %in
 ;   %ext = sext <64 x i8> %load to <64 x i16>
 ;   store <64 x i16> %ext, <64 x i16> addrspace(1)* %out
 ;   ret void
--- a/test/CodeGen/AMDGPU/load-hi16.ll
+++ b/test/CodeGen/AMDGPU/load-hi16.ll
@ -473,10 +473,10 @@ entry:
 ; GFX9-NEXT: s_setpc_b64

 ; VI: flat_load_ushort
-define void @load_constant_hi_v2i16_reglo_vreg(i16 addrspace(2)* %in, i16 %reg) #0 {
+define void @load_constant_hi_v2i16_reglo_vreg(i16 addrspace(4)* %in, i16 %reg) #0 {
 entry:
-  %gep = getelementptr inbounds i16, i16 addrspace(2)* %in, i64 -2047
-  %load = load i16, i16 addrspace(2)* %gep
+  %gep = getelementptr inbounds i16, i16 addrspace(4)* %in, i64 -2047
+  %load = load i16, i16 addrspace(4)* %gep
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
@ -492,10 +492,10 @@ entry:
 ; GFX9-NEXT: s_setpc_b64

 ; VI: flat_load_ushort
-define void @load_constant_hi_v2f16_reglo_vreg(half addrspace(2)* %in, half %reg) #0 {
+define void @load_constant_hi_v2f16_reglo_vreg(half addrspace(4)* %in, half %reg) #0 {
 entry:
-  %gep = getelementptr inbounds half, half addrspace(2)* %in, i64 -2047
-  %load = load half, half addrspace(2)* %gep
+  %gep = getelementptr inbounds half, half addrspace(4)* %in, i64 -2047
+  %load = load half, half addrspace(4)* %gep
  %build0 = insertelement <2 x half> undef, half %reg, i32 0
  %build1 = insertelement <2 x half> %build0, half %load, i32 1
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
@ -625,11 +625,11 @@ entry:
 ; GFX9-NEXT: s_waitcnt
 ; GFX9-NEXT: v_mov_b32_e32 v0, v2
 ; GFX9-NEXT: s_setpc_b64
-define <2 x i16> @load_constant_v2i16_split(i16 addrspace(2)* %in) #0 {
+define <2 x i16> @load_constant_v2i16_split(i16 addrspace(4)* %in) #0 {
 entry:
-  %gep = getelementptr inbounds i16, i16 addrspace(2)* %in, i64 1
-  %load0 = load volatile i16, i16 addrspace(2)* %in
-  %load1 = load volatile i16, i16 addrspace(2)* %gep
+  %gep = getelementptr inbounds i16, i16 addrspace(4)* %in, i64 1
+  %load0 = load volatile i16, i16 addrspace(4)* %in
+  %load1 = load volatile i16, i16 addrspace(4)* %gep
  %build0 = insertelement <2 x i16> undef, i16 %load0, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %load1, i32 1
  ret <2 x i16> %build1
--- a/test/CodeGen/AMDGPU/load-lo16.ll
+++ b/test/CodeGen/AMDGPU/load-lo16.ll
@ -559,11 +559,11 @@ entry:
 ; GFX9-NEXT: s_setpc_b64

 ; VI: flat_load_ushort
-define void @load_constant_lo_v2i16_reglo_vreg(i16 addrspace(2)* %in, i32 %reg) #0 {
+define void @load_constant_lo_v2i16_reglo_vreg(i16 addrspace(4)* %in, i32 %reg) #0 {
 entry:
  %reg.bc = bitcast i32 %reg to <2 x i16>
-  %gep = getelementptr inbounds i16, i16 addrspace(2)* %in, i64 -2047
-  %load = load i16, i16 addrspace(2)* %gep
+  %gep = getelementptr inbounds i16, i16 addrspace(4)* %in, i64 -2047
+  %load = load i16, i16 addrspace(4)* %gep
  %build1 = insertelement <2 x i16> %reg.bc, i16 %load, i32 0
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
@ -578,11 +578,11 @@ entry:
 ; GFX9-NEXT: s_setpc_b64

 ; VI: flat_load_ushort
-define void @load_constant_lo_v2f16_reglo_vreg(half addrspace(2)* %in, i32 %reg) #0 {
+define void @load_constant_lo_v2f16_reglo_vreg(half addrspace(4)* %in, i32 %reg) #0 {
 entry:
  %reg.bc = bitcast i32 %reg to <2 x half>
-  %gep = getelementptr inbounds half, half addrspace(2)* %in, i64 -2047
-  %load = load half, half addrspace(2)* %gep
+  %gep = getelementptr inbounds half, half addrspace(4)* %in, i64 -2047
+  %load = load half, half addrspace(4)* %gep
  %build1 = insertelement <2 x half> %reg.bc, half %load, i32 0
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
  ret void
--- a/test/CodeGen/AMDGPU/mad24-get-global-id.ll
+++ b/test/CodeGen/AMDGPU/mad24-get-global-id.ll
@ -5,17 +5,17 @@

 declare i32 @llvm.amdgcn.workgroup.id.x() #0
 declare i32 @llvm.amdgcn.workitem.id.x() #0
-declare i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0
+declare i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #0

 ; GCN-LABEL: {{^}}get_global_id_0:
 ; GCN: s_and_b32 [[WGSIZEX:s[0-9]+]], {{s[0-9]+}}, 0xffff
 ; GCN: v_mov_b32_e32 [[VWGSIZEX:v[0-9]+]], [[WGSIZEX]]
 ; GCN: v_mad_u32_u24 v{{[0-9]+}}, s8, [[VWGSIZEX]], v0
 define amdgpu_kernel void @get_global_id_0(i32 addrspace(1)* %out) #1 {
-  %dispatch.ptr = call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr()
-  %cast.dispatch.ptr = bitcast i8 addrspace(2)* %dispatch.ptr to i32 addrspace(2)*
-  %gep = getelementptr inbounds i32, i32 addrspace(2)* %cast.dispatch.ptr, i64 1
-  %workgroup.size.xy = load i32, i32 addrspace(2)* %gep, align 4, !invariant.load !0
+  %dispatch.ptr = call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
+  %cast.dispatch.ptr = bitcast i8 addrspace(4)* %dispatch.ptr to i32 addrspace(4)*
+  %gep = getelementptr inbounds i32, i32 addrspace(4)* %cast.dispatch.ptr, i64 1
+  %workgroup.size.xy = load i32, i32 addrspace(4)* %gep, align 4, !invariant.load !0
  %workgroup.size.x = and i32 %workgroup.size.xy, 65535

  %workitem.id.x = call i32 @llvm.amdgcn.workitem.id.x(), !range !1
--- a/test/CodeGen/AMDGPU/missing-store.ll
+++ b/test/CodeGen/AMDGPU/missing-store.ll
@ -1,6 +1,6 @@
 ; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s

-@ptr_load = addrspace(3) global i32 addrspace(2)* undef, align 8
+@ptr_load = addrspace(3) global i32 addrspace(4)* undef, align 8

 ; Make sure when the load from %ptr2 is folded the chain isn't lost,
 ; resulting in losing the store to gptr
@ -16,11 +16,11 @@
 ; SI: buffer_store_dword
 ; SI: s_endpgm
 define amdgpu_kernel void @missing_store_reduced(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
-  %ptr0 = load i32 addrspace(2)*, i32 addrspace(2)* addrspace(3)* @ptr_load, align 8
-  %ptr2 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 2
+  %ptr0 = load i32 addrspace(4)*, i32 addrspace(4)* addrspace(3)* @ptr_load, align 8
+  %ptr2 = getelementptr inbounds i32, i32 addrspace(4)* %ptr0, i64 2

  store i32 99, i32 addrspace(1)* %gptr, align 4
-  %tmp2 = load i32, i32 addrspace(2)* %ptr2, align 4
+  %tmp2 = load i32, i32 addrspace(4)* %ptr2, align 4

  store i32 %tmp2, i32 addrspace(1)* %out, align 4
  ret void
--- a/test/CodeGen/AMDGPU/mubuf-shader-vgpr.ll
+++ b/test/CodeGen/AMDGPU/mubuf-shader-vgpr.ll
@ -5,40 +5,40 @@

 ; CHECK-LABEL: {{^}}test_none:
 ; CHECK: buffer_load_format_x v0, off, {{s\[[0-9]+:[0-9]+\]}}, 0{{$}}
-define amdgpu_vs float @test_none(<4 x i32> addrspace(2)* inreg %base, i32 %i) {
+define amdgpu_vs float @test_none(<4 x i32> addrspace(4)* inreg %base, i32 %i) {
 main_body:
-  %ptr = getelementptr <4 x i32>, <4 x i32> addrspace(2)* %base, i32 %i
-  %tmp2 = load <4 x i32>, <4 x i32> addrspace(2)* %ptr, align 32
+  %ptr = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %base, i32 %i
+  %tmp2 = load <4 x i32>, <4 x i32> addrspace(4)* %ptr, align 32
  %tmp7 = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> %tmp2, i32 0, i32 0, i1 0, i1 0)
  ret float %tmp7
 }

 ; CHECK-LABEL: {{^}}test_idxen:
 ; CHECK: buffer_load_format_x v0, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 idxen{{$}}
-define amdgpu_vs float @test_idxen(<4 x i32> addrspace(2)* inreg %base, i32 %i) {
+define amdgpu_vs float @test_idxen(<4 x i32> addrspace(4)* inreg %base, i32 %i) {
 main_body:
-  %ptr = getelementptr <4 x i32>, <4 x i32> addrspace(2)* %base, i32 %i
-  %tmp2 = load <4 x i32>, <4 x i32> addrspace(2)* %ptr, align 32
+  %ptr = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %base, i32 %i
+  %tmp2 = load <4 x i32>, <4 x i32> addrspace(4)* %ptr, align 32
  %tmp7 = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> %tmp2, i32 undef, i32 0, i1 0, i1 0)
  ret float %tmp7
 }

 ; CHECK-LABEL: {{^}}test_offen:
 ; CHECK: buffer_load_format_x v0, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen{{$}}
-define amdgpu_vs float @test_offen(<4 x i32> addrspace(2)* inreg %base, i32 %i) {
+define amdgpu_vs float @test_offen(<4 x i32> addrspace(4)* inreg %base, i32 %i) {
 main_body:
-  %ptr = getelementptr <4 x i32>, <4 x i32> addrspace(2)* %base, i32 %i
-  %tmp2 = load <4 x i32>, <4 x i32> addrspace(2)* %ptr, align 32
+  %ptr = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %base, i32 %i
+  %tmp2 = load <4 x i32>, <4 x i32> addrspace(4)* %ptr, align 32
  %tmp7 = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> %tmp2, i32 0, i32 undef, i1 0, i1 0)
  ret float %tmp7
 }

 ; CHECK-LABEL: {{^}}test_both:
 ; CHECK: buffer_load_format_x v0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 idxen offen{{$}}
-define amdgpu_vs float @test_both(<4 x i32> addrspace(2)* inreg %base, i32 %i) {
+define amdgpu_vs float @test_both(<4 x i32> addrspace(4)* inreg %base, i32 %i) {
 main_body:
-  %ptr = getelementptr <4 x i32>, <4 x i32> addrspace(2)* %base, i32 %i
-  %tmp2 = load <4 x i32>, <4 x i32> addrspace(2)* %ptr, align 32
+  %ptr = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %base, i32 %i
+  %tmp2 = load <4 x i32>, <4 x i32> addrspace(4)* %ptr, align 32
  %tmp7 = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> %tmp2, i32 undef, i32 undef, i1 0, i1 0)
  ret float %tmp7
 }
--- a/test/CodeGen/AMDGPU/mubuf.ll
+++ b/test/CodeGen/AMDGPU/mubuf.ll
@ -55,10 +55,10 @@ entry:

 ; CHECK-LABEL: {{^}}soffset_max_imm:
 ; CHECK: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 64 offen glc
-define amdgpu_gs void @soffset_max_imm([6 x <4 x i32>] addrspace(2)* byval, [17 x <4 x i32>] addrspace(2)* byval, [16 x <4 x i32>] addrspace(2)* byval, [32 x <8 x i32>] addrspace(2)* byval, i32 inreg, i32 inreg, i32, i32, i32, i32, i32, i32, i32, i32) {
+define amdgpu_gs void @soffset_max_imm([6 x <4 x i32>] addrspace(4)* byval, [17 x <4 x i32>] addrspace(4)* byval, [16 x <4 x i32>] addrspace(4)* byval, [32 x <8 x i32>] addrspace(4)* byval, i32 inreg, i32 inreg, i32, i32, i32, i32, i32, i32, i32, i32) {
 main_body:
-  %tmp0 = getelementptr [6 x <4 x i32>], [6 x <4 x i32>] addrspace(2)* %0, i32 0, i32 0
-  %tmp1 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp0
+  %tmp0 = getelementptr [6 x <4 x i32>], [6 x <4 x i32>] addrspace(4)* %0, i32 0, i32 0
+  %tmp1 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp0
  %tmp2 = shl i32 %6, 2
  %tmp3 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<4 x i32> %tmp1, i32 %tmp2, i32 64, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0)
  %tmp4 = add i32 %6, 16
@ -74,10 +74,10 @@ main_body:
 ; CHECK-LABEL: {{^}}soffset_no_fold:
 ; CHECK: s_movk_i32 [[SOFFSET:s[0-9]+]], 0x41
 ; CHECK: buffer_load_dword v{{[0-9+]}}, v{{[0-9+]}}, s[{{[0-9]+}}:{{[0-9]+}}], [[SOFFSET]] offen glc
-define amdgpu_gs void @soffset_no_fold([6 x <4 x i32>] addrspace(2)* byval, [17 x <4 x i32>] addrspace(2)* byval, [16 x <4 x i32>] addrspace(2)* byval, [32 x <8 x i32>] addrspace(2)* byval, i32 inreg, i32 inreg, i32, i32, i32, i32, i32, i32, i32, i32) {
+define amdgpu_gs void @soffset_no_fold([6 x <4 x i32>] addrspace(4)* byval, [17 x <4 x i32>] addrspace(4)* byval, [16 x <4 x i32>] addrspace(4)* byval, [32 x <8 x i32>] addrspace(4)* byval, i32 inreg, i32 inreg, i32, i32, i32, i32, i32, i32, i32, i32) {
 main_body:
-  %tmp0 = getelementptr [6 x <4 x i32>], [6 x <4 x i32>] addrspace(2)* %0, i32 0, i32 0
-  %tmp1 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp0
+  %tmp0 = getelementptr [6 x <4 x i32>], [6 x <4 x i32>] addrspace(4)* %0, i32 0, i32 0
+  %tmp1 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp0
  %tmp2 = shl i32 %6, 2
  %tmp3 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<4 x i32> %tmp1, i32 %tmp2, i32 65, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0)
  %tmp4 = add i32 %6, 16
--- a/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
+++ b/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
@ -642,12 +642,12 @@ uniform.multi.exit.region:
  br i1 %uniform.cond0, label %uniform.if, label %uniform.ret1

 uniform.if:
-  %sgpr0 = load volatile i32, i32 addrspace(2)* undef
+  %sgpr0 = load volatile i32, i32 addrspace(4)* undef
  %uniform.cond1 = icmp slt i32 %sgpr0, 1
  br i1 %uniform.cond1, label %uniform.then, label %uniform.endif

 uniform.then:
-  %sgpr1 = load volatile i32, i32 addrspace(2)* undef
+  %sgpr1 = load volatile i32, i32 addrspace(4)* undef
  %uniform.cond2 = icmp sge i32 %sgpr1, 4
  store volatile i32 33, i32 addrspace(1)* undef
  br i1 %uniform.cond2, label %uniform.ret0, label %uniform.endif
--- a/test/CodeGen/AMDGPU/no-initializer-constant-addrspace.ll
+++ b/test/CodeGen/AMDGPU/no-initializer-constant-addrspace.ll
@ -6,21 +6,21 @@
 ; EG: R_AMDGPU_ABS32 extern_const_addrspace

 ; CHECK-DAG: Name: extern_const_addrspace
-@extern_const_addrspace = external unnamed_addr addrspace(2) constant [5 x i32], align 4
+@extern_const_addrspace = external unnamed_addr addrspace(4) constant [5 x i32], align 4

 ; CHECK-DAG: Name: load_extern_const_init
 define amdgpu_kernel void @load_extern_const_init(i32 addrspace(1)* %out) nounwind {
-  %val = load i32, i32 addrspace(2)* getelementptr ([5 x i32], [5 x i32] addrspace(2)* @extern_const_addrspace, i64 0, i64 3), align 4
+  %val = load i32, i32 addrspace(4)* getelementptr ([5 x i32], [5 x i32] addrspace(4)* @extern_const_addrspace, i64 0, i64 3), align 4
  store i32 %val, i32 addrspace(1)* %out, align 4
  ret void
 }

 ; CHECK-DAG: Name: undef_const_addrspace
-@undef_const_addrspace = unnamed_addr addrspace(2) constant [5 x i32] undef, align 4
+@undef_const_addrspace = unnamed_addr addrspace(4) constant [5 x i32] undef, align 4

 ; CHECK-DAG: Name: undef_const_addrspace
 define amdgpu_kernel void @load_undef_const_init(i32 addrspace(1)* %out) nounwind {
-  %val = load i32, i32 addrspace(2)* getelementptr ([5 x i32], [5 x i32] addrspace(2)* @undef_const_addrspace, i64 0, i64 3), align 4
+  %val = load i32, i32 addrspace(4)* getelementptr ([5 x i32], [5 x i32] addrspace(4)* @undef_const_addrspace, i64 0, i64 3), align 4
  store i32 %val, i32 addrspace(1)* %out, align 4
  ret void
 }
--- a/test/CodeGen/AMDGPU/no-shrink-extloads.ll
+++ b/test/CodeGen/AMDGPU/no-shrink-extloads.ll
@ -194,9 +194,9 @@ define amdgpu_kernel void @truncate_buffer_load_i64_to_i8(i8 addrspace(1)* %out,
 ; SI: s_load_dword [[LOAD:s[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0x0
 ; SI: s_waitcnt lgkmcnt(0)
 ; SI: s_and_b32 s{{[0-9]+}}, [[LOAD]], 0xffff
-define amdgpu_kernel void @smrd_mask_i32_to_i16(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
+define amdgpu_kernel void @smrd_mask_i32_to_i16(i32 addrspace(1)* %out, i32 addrspace(4)* %in) {
 entry:
-  %val = load i32, i32 addrspace(2)* %in
+  %val = load i32, i32 addrspace(4)* %in
  %mask = and i32 %val, 65535
  store i32 %mask, i32 addrspace(1)* %out
  ret void
--- a/test/CodeGen/AMDGPU/nullptr.ll
+++ b/test/CodeGen/AMDGPU/nullptr.ll
@ -1,7 +1,7 @@
 ;RUN: llc < %s -march=amdgcn -mtriple=amdgcn---amdgiz -verify-machineinstrs | FileCheck -check-prefixes=CHECK,GCN %s
 ;RUN: llc < %s -march=r600 -mtriple=r600---amdgiz -verify-machineinstrs | FileCheck -check-prefixes=CHECK,R600 %s

-%struct.S = type { i32 addrspace(5)*, i32 addrspace(1)*, i32 addrspace(2)*, i32 addrspace(3)*, i32*, i32 addrspace(4)*}
+%struct.S = type { i32 addrspace(5)*, i32 addrspace(1)*, i32 addrspace(4)*, i32 addrspace(3)*, i32*, i32 addrspace(2)*}

 ; CHECK-LABEL: nullptr_priv:
 ; CHECK-NEXT: .long 0
@ -15,7 +15,7 @@
 ; CHECK-LABEL: nullptr_const:
 ; GCN-NEXT: .quad 0
 ; R600-NEXT: .long 0
-@nullptr_const = global i32 addrspace(2)* addrspacecast (i32* null to i32 addrspace(2)*)
+@nullptr_const = global i32 addrspace(4)* addrspacecast (i32* null to i32 addrspace(4)*)

 ; CHECK-LABEL: nullptr_local:
 ; CHECK-NEXT: .long -1
@ -23,7 +23,7 @@

 ; CHECK-LABEL: nullptr_region:
 ; CHECK-NEXT: .long -1
-@nullptr_region = global i32 addrspace(4)* addrspacecast (i32* null to i32 addrspace(4)*)
+@nullptr_region = global i32 addrspace(2)* addrspacecast (i32* null to i32 addrspace(2)*)

 ; CHECK-LABEL: nullptr6:
 ; R600-NEXT: .long 0
@ -113,7 +113,7 @@
@structWithPointers = addrspace(1) global %struct.S {
  i32 addrspace(5)* addrspacecast (i32* null to i32 addrspace(5)*),
  i32 addrspace(1)* addrspacecast (i32* null to i32 addrspace(1)*),
-  i32 addrspace(2)* addrspacecast (i32* null to i32 addrspace(2)*),
+  i32 addrspace(4)* addrspacecast (i32* null to i32 addrspace(4)*),
  i32 addrspace(3)* addrspacecast (i32* null to i32 addrspace(3)*),
  i32* null,
-  i32 addrspace(4)* addrspacecast (i32* null to i32 addrspace(4)*)}, align 4
+  i32 addrspace(2)* addrspacecast (i32* null to i32 addrspace(2)*)}, align 4
--- a/test/CodeGen/AMDGPU/pack.v2f16.ll
+++ b/test/CodeGen/AMDGPU/pack.v2f16.ll
@ -8,9 +8,9 @@
 ; GFX9: s_load_dword [[VAL1:s[0-9]+]]
 ; GFX9: s_pack_ll_b32_b16 [[PACKED:s[0-9]+]], [[VAL0]], [[VAL1]]
 ; GFX9: ; use [[PACKED]]
-define amdgpu_kernel void @s_pack_v2f16(i32 addrspace(2)* %in0, i32 addrspace(2)* %in1) #0 {
-  %val0 = load volatile i32, i32 addrspace(2)* %in0
-  %val1 = load volatile i32, i32 addrspace(2)* %in1
+define amdgpu_kernel void @s_pack_v2f16(i32 addrspace(4)* %in0, i32 addrspace(4)* %in1) #0 {
+  %val0 = load volatile i32, i32 addrspace(4)* %in0
+  %val1 = load volatile i32, i32 addrspace(4)* %in1
  %lo.i = trunc i32 %val0 to i16
  %hi.i = trunc i32 %val1 to i16
  %lo = bitcast i16 %lo.i to half
@ -27,8 +27,8 @@ define amdgpu_kernel void @s_pack_v2f16(i32 addrspace(2)* %in0, i32 addrspace(2)
 ; GFX9: s_load_dword [[VAL1:s[0-9]+]]
 ; GFX9: s_pack_ll_b32_b16 [[PACKED:s[0-9]+]], 0x1234, [[VAL1]]
 ; GFX9: ; use [[PACKED]]
-define amdgpu_kernel void @s_pack_v2f16_imm_lo(i32 addrspace(2)* %in1) #0 {
-  %val1 = load i32, i32 addrspace(2)* %in1
+define amdgpu_kernel void @s_pack_v2f16_imm_lo(i32 addrspace(4)* %in1) #0 {
+  %val1 = load i32, i32 addrspace(4)* %in1
  %hi.i = trunc i32 %val1 to i16
  %hi = bitcast i16 %hi.i to half
  %vec.0 = insertelement <2 x half> undef, half 0xH1234, i32 0
@ -43,8 +43,8 @@ define amdgpu_kernel void @s_pack_v2f16_imm_lo(i32 addrspace(2)* %in1) #0 {
 ; GFX9: s_load_dword [[VAL0:s[0-9]+]]
 ; GFX9: s_pack_ll_b32_b16 [[PACKED:s[0-9]+]], [[VAL0]], 0x1234
 ; GFX9: ; use [[PACKED]]
-define amdgpu_kernel void @s_pack_v2f16_imm_hi(i32 addrspace(2)* %in0) #0 {
-  %val0 = load i32, i32 addrspace(2)* %in0
+define amdgpu_kernel void @s_pack_v2f16_imm_hi(i32 addrspace(4)* %in0) #0 {
+  %val0 = load i32, i32 addrspace(4)* %in0
  %lo.i = trunc i32 %val0 to i16
  %lo = bitcast i16 %lo.i to half
  %vec.0 = insertelement <2 x half> undef, half %lo, i32 0
--- a/test/CodeGen/AMDGPU/pack.v2i16.ll
+++ b/test/CodeGen/AMDGPU/pack.v2i16.ll
@ -8,9 +8,9 @@
 ; GFX9: s_load_dword [[VAL1:s[0-9]+]]
 ; GFX9: s_pack_ll_b32_b16 [[PACKED:s[0-9]+]], [[VAL0]], [[VAL1]]
 ; GFX9: ; use [[PACKED]]
-define amdgpu_kernel void @s_pack_v2i16(i32 addrspace(2)* %in0, i32 addrspace(2)* %in1) #0 {
-  %val0 = load volatile i32, i32 addrspace(2)* %in0
-  %val1 = load volatile i32, i32 addrspace(2)* %in1
+define amdgpu_kernel void @s_pack_v2i16(i32 addrspace(4)* %in0, i32 addrspace(4)* %in1) #0 {
+  %val0 = load volatile i32, i32 addrspace(4)* %in0
+  %val1 = load volatile i32, i32 addrspace(4)* %in1
  %lo = trunc i32 %val0 to i16
  %hi = trunc i32 %val1 to i16
  %vec.0 = insertelement <2 x i16> undef, i16 %lo, i32 0
@ -25,8 +25,8 @@ define amdgpu_kernel void @s_pack_v2i16(i32 addrspace(2)* %in0, i32 addrspace(2)
 ; GFX9: s_load_dword [[VAL1:s[0-9]+]]
 ; GFX9: s_pack_ll_b32_b16 [[PACKED:s[0-9]+]], 0x1c8, [[VAL1]]
 ; GFX9: ; use [[PACKED]]
-define amdgpu_kernel void @s_pack_v2i16_imm_lo(i32 addrspace(2)* %in1) #0 {
-  %val1 = load i32, i32 addrspace(2)* %in1
+define amdgpu_kernel void @s_pack_v2i16_imm_lo(i32 addrspace(4)* %in1) #0 {
+  %val1 = load i32, i32 addrspace(4)* %in1
  %hi = trunc i32 %val1 to i16
  %vec.0 = insertelement <2 x i16> undef, i16 456, i32 0
  %vec.1 = insertelement <2 x i16> %vec.0, i16 %hi, i32 1
@ -40,8 +40,8 @@ define amdgpu_kernel void @s_pack_v2i16_imm_lo(i32 addrspace(2)* %in1) #0 {
 ; GFX9: s_load_dword [[VAL0:s[0-9]+]]
 ; GFX9: s_pack_ll_b32_b16 [[PACKED:s[0-9]+]], [[VAL0]], 0x1c8
 ; GFX9: ; use [[PACKED]]
-define amdgpu_kernel void @s_pack_v2i16_imm_hi(i32 addrspace(2)* %in0) #0 {
-  %val0 = load i32, i32 addrspace(2)* %in0
+define amdgpu_kernel void @s_pack_v2i16_imm_hi(i32 addrspace(4)* %in0) #0 {
+  %val0 = load i32, i32 addrspace(4)* %in0
  %lo = trunc i32 %val0 to i16
  %vec.0 = insertelement <2 x i16> undef, i16 %lo, i32 0
  %vec.1 = insertelement <2 x i16> %vec.0, i16 456, i32 1
--- a/test/CodeGen/AMDGPU/r600-constant-array-fixup.ll
+++ b/test/CodeGen/AMDGPU/r600-constant-array-fixup.ll
@ -1,6 +1,6 @@
 ; RUN: llc -filetype=obj -march=r600 -mcpu=cypress -verify-machineinstrs < %s | llvm-readobj -relocations -symbols | FileCheck %s

-@arr = internal unnamed_addr addrspace(2) constant [4 x i32] [i32 4, i32 5, i32 6, i32 7], align 4
+@arr = internal unnamed_addr addrspace(4) constant [4 x i32] [i32 4, i32 5, i32 6, i32 7], align 4

 ; CHECK: Relocations [
 ; CHECK: Section (3) .rel.text {
@ -19,8 +19,8 @@
 ; CHECK: }
 define amdgpu_kernel void @test_constant_array_fixup(i32 addrspace(1)* nocapture %out, i32 %idx) #0 {
 entry:
-  %arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(2)* @arr, i32 0, i32 %idx
-  %val = load i32, i32 addrspace(2)* %arrayidx
+  %arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(4)* @arr, i32 0, i32 %idx
+  %val = load i32, i32 addrspace(4)* %arrayidx
  store i32 %val, i32 addrspace(1)* %out, align 4
  ret void
 }
--- a/test/CodeGen/AMDGPU/readcyclecounter.ll
+++ b/test/CodeGen/AMDGPU/readcyclecounter.ll
@ -28,9 +28,9 @@ define amdgpu_kernel void @test_readcyclecounter(i64 addrspace(1)* %out) #0 {
 ; SI-DAG: s_memtime
 ; VI-DAG: s_memrealtime
 ; GCN-DAG: s_load_dword
-define amdgpu_cs i32 @test_readcyclecounter_smem(i64 addrspace(2)* inreg %in) #0 {
+define amdgpu_cs i32 @test_readcyclecounter_smem(i64 addrspace(4)* inreg %in) #0 {
  %cycle0 = call i64 @llvm.readcyclecounter()
-  %in.v = load i64, i64 addrspace(2)* %in
+  %in.v = load i64, i64 addrspace(4)* %in
  %r.64 = add i64 %cycle0, %in.v
  %r.32 = trunc i64 %r.64 to i32
  ret i32 %r.32
--- a/test/CodeGen/AMDGPU/ret.ll
+++ b/test/CodeGen/AMDGPU/ret.ll
@ -7,7 +7,7 @@
 ; GCN: s_waitcnt expcnt(0)
 ; GCN: v_add_f32_e32 v0, 1.0, v0
 ; GCN-NOT: s_endpgm
-define amdgpu_vs { float, float } @vgpr([9 x <16 x i8>] addrspace(2)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, float %arg3) #0 {
+define amdgpu_vs { float, float } @vgpr([9 x <16 x i8>] addrspace(4)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, float %arg3) #0 {
 bb:
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %arg3, float %arg3, float %arg3, float %arg3, i1 true, i1 true) #0
  %x = fadd float %arg3, 1.000000e+00
@ -26,7 +26,7 @@ bb:
 ; GCN-DAG: v_mov_b32_e32 v3, -1.0
 ; GCN: s_waitcnt expcnt(0)
 ; GCN-NOT: s_endpgm
-define amdgpu_vs { float, float, float, float } @vgpr_literal([9 x <16 x i8>] addrspace(2)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, float %arg3) #0 {
+define amdgpu_vs { float, float, float, float } @vgpr_literal([9 x <16 x i8>] addrspace(4)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, float %arg3) #0 {
 bb:
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %arg3, float %arg3, float %arg3, float %arg3, i1 true, i1 true) #0
  ret { float, float, float, float } { float 1.000000e+00, float 2.000000e+00, float 4.000000e+00, float -1.000000e+00 }
@ -43,7 +43,7 @@ bb:
 ; GCN: v_mov_b32_e32 v3, v4
 ; GCN: v_mov_b32_e32 v4, v6
 ; GCN-NOT: s_endpgm
-define amdgpu_ps { float, float, float, float, float } @vgpr_ps_addr0([9 x <16 x i8>] addrspace(2)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x i32> %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <3 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18) #1 {
+define amdgpu_ps { float, float, float, float, float } @vgpr_ps_addr0([9 x <16 x i8>] addrspace(4)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x i32> %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <3 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18) #1 {
 bb:
  %i0 = extractelement <2 x i32> %arg4, i32 0
  %i1 = extractelement <2 x i32> %arg4, i32 1
@ -68,7 +68,7 @@ bb:
 ; GCN-LABEL: {{^}}ps_input_ena_no_inputs:
 ; GCN: v_mov_b32_e32 v0, 1.0
 ; GCN-NOT: s_endpgm
-define amdgpu_ps float @ps_input_ena_no_inputs([9 x <16 x i8>] addrspace(2)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x i32> %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <3 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18) #1 {
+define amdgpu_ps float @ps_input_ena_no_inputs([9 x <16 x i8>] addrspace(4)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x i32> %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <3 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18) #1 {
 bb:
  ret float 1.000000e+00
 }
@ -82,7 +82,7 @@ bb:
 ; GCN-DAG: v_mov_b32_e32 v1, v2
 ; GCN: v_mov_b32_e32 v2, v3
 ; GCN-NOT: s_endpgm
-define amdgpu_ps { float, <2 x float> } @ps_input_ena_pos_w([9 x <16 x i8>] addrspace(2)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x i32> %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <3 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18) #1 {
+define amdgpu_ps { float, <2 x float> } @ps_input_ena_pos_w([9 x <16 x i8>] addrspace(4)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x i32> %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <3 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18) #1 {
 bb:
  %f = bitcast <2 x i32> %arg8 to <2 x float>
  %s = insertvalue { float, <2 x float> } undef, float %arg14, 0
@ -101,7 +101,7 @@ bb:
 ; GCN-DAG: v_mov_b32_e32 v3, v6
 ; GCN-DAG: v_mov_b32_e32 v4, v8
 ; GCN-NOT: s_endpgm
-define amdgpu_ps { float, float, float, float, float } @vgpr_ps_addr1([9 x <16 x i8>] addrspace(2)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x i32> %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <3 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18) #2 {
+define amdgpu_ps { float, float, float, float, float } @vgpr_ps_addr1([9 x <16 x i8>] addrspace(4)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x i32> %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <3 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18) #2 {
 bb:
  %i0 = extractelement <2 x i32> %arg4, i32 0
  %i1 = extractelement <2 x i32> %arg4, i32 1
@ -130,7 +130,7 @@ bb:
 ; GCN: v_mov_b32_e32 v3, v8
 ; GCN: v_mov_b32_e32 v4, v12
 ; GCN-NOT: s_endpgm
-define amdgpu_ps { float, float, float, float, float } @vgpr_ps_addr119([9 x <16 x i8>] addrspace(2)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x i32> %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <3 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18) #3 {
+define amdgpu_ps { float, float, float, float, float } @vgpr_ps_addr119([9 x <16 x i8>] addrspace(4)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x i32> %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <3 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18) #3 {
 bb:
  %i0 = extractelement <2 x i32> %arg4, i32 0
  %i1 = extractelement <2 x i32> %arg4, i32 1
@ -159,7 +159,7 @@ bb:
 ; GCN: v_mov_b32_e32 v3, v4
 ; GCN: v_mov_b32_e32 v4, v8
 ; GCN-NOT: s_endpgm
-define amdgpu_ps { float, float, float, float, float } @vgpr_ps_addr418([9 x <16 x i8>] addrspace(2)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x i32> %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <3 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18) #4 {
+define amdgpu_ps { float, float, float, float, float } @vgpr_ps_addr418([9 x <16 x i8>] addrspace(4)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x i32> %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <3 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18) #4 {
 bb:
  %i0 = extractelement <2 x i32> %arg4, i32 0
  %i1 = extractelement <2 x i32> %arg4, i32 1
@ -181,7 +181,7 @@ bb:
 ; GCN: s_add_i32 s0, s3, 2
 ; GCN: s_mov_b32 s2, s3
 ; GCN-NOT: s_endpgm
-define amdgpu_vs { i32, i32, i32 } @sgpr([9 x <16 x i8>] addrspace(2)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, float %arg3) #0 {
+define amdgpu_vs { i32, i32, i32 } @sgpr([9 x <16 x i8>] addrspace(4)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, float %arg3) #0 {
 bb:
  %x = add i32 %arg2, 2
  %a = insertvalue { i32, i32, i32 } undef, i32 %x, 0
@ -197,7 +197,7 @@ bb:
 ; GCN-DAG: s_mov_b32 s2, 7
 ; GCN-DAG: s_mov_b32 s3, 8
 ; GCN-NOT: s_endpgm
-define amdgpu_vs { i32, i32, i32, i32 } @sgpr_literal([9 x <16 x i8>] addrspace(2)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, float %arg3) #0 {
+define amdgpu_vs { i32, i32, i32, i32 } @sgpr_literal([9 x <16 x i8>] addrspace(4)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, float %arg3) #0 {
 bb:
  %x = add i32 %arg2, 2
  ret { i32, i32, i32, i32 } { i32 5, i32 6, i32 7, i32 8 }
@ -212,7 +212,7 @@ bb:
 ; GCN-DAG: s_add_i32 s0, s3, 2
 ; GCN-DAG: s_mov_b32 s2, s3
 ; GCN-NOT: s_endpgm
-define amdgpu_vs { float, i32, float, i32, i32 } @both([9 x <16 x i8>] addrspace(2)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, float %arg3) #0 {
+define amdgpu_vs { float, i32, float, i32, i32 } @both([9 x <16 x i8>] addrspace(4)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, float %arg3) #0 {
 bb:
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %arg3, float %arg3, float %arg3, float %arg3, i1 true, i1 true) #0
  %v = fadd float %arg3, 1.000000e+00
@ -235,7 +235,7 @@ bb:
 ; GCN-DAG: v_mov_b32_e32 v1, 2.0
 ; GCN-DAG: v_mov_b32_e32 v2, 4.0
 ; GCN: s_waitcnt expcnt(0)
-define amdgpu_vs { { float, i32 }, { i32, <2 x float> } } @structure_literal([9 x <16 x i8>] addrspace(2)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, float %arg3) #0 {
+define amdgpu_vs { { float, i32 }, { i32, <2 x float> } } @structure_literal([9 x <16 x i8>] addrspace(4)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, float %arg3) #0 {
 bb:
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %arg3, float %arg3, float %arg3, float %arg3, i1 true, i1 true) #0
  ret { { float, i32 }, { i32, <2 x float> } } { { float, i32 } { float 1.000000e+00, i32 2 }, { i32, <2 x float> } { i32 3, <2 x float> <float 2.000000e+00, float 4.000000e+00> } }
--- a/test/CodeGen/AMDGPU/salu-to-valu.ll
+++ b/test/CodeGen/AMDGPU/salu-to-valu.ll
@ -65,24 +65,24 @@ done:                                             ; preds = %loop
 ; GCN: v_mov_b32_e32 [[V_OUT:v[0-9]+]], [[OUT]]
 ; GCN-NOHSA: buffer_store_dword [[V_OUT]]
 ; GCN-HSA: flat_store_dword {{.*}}, [[V_OUT]]
-define amdgpu_kernel void @smrd_valu(i32 addrspace(2)* addrspace(1)* %in, i32 %a, i32 %b, i32 addrspace(1)* %out) #1 {
+define amdgpu_kernel void @smrd_valu(i32 addrspace(4)* addrspace(1)* %in, i32 %a, i32 %b, i32 addrspace(1)* %out) #1 {
 entry:
  %tmp = icmp ne i32 %a, 0
  br i1 %tmp, label %if, label %else

 if:                                               ; preds = %entry
-  %tmp1 = load i32 addrspace(2)*, i32 addrspace(2)* addrspace(1)* %in
+  %tmp1 = load i32 addrspace(4)*, i32 addrspace(4)* addrspace(1)* %in
  br label %endif

 else:                                             ; preds = %entry
-  %tmp2 = getelementptr i32 addrspace(2)*, i32 addrspace(2)* addrspace(1)* %in
-  %tmp3 = load i32 addrspace(2)*, i32 addrspace(2)* addrspace(1)* %tmp2
+  %tmp2 = getelementptr i32 addrspace(4)*, i32 addrspace(4)* addrspace(1)* %in
+  %tmp3 = load i32 addrspace(4)*, i32 addrspace(4)* addrspace(1)* %tmp2
  br label %endif

 endif:                                            ; preds = %else, %if
-  %tmp4 = phi i32 addrspace(2)* [ %tmp1, %if ], [ %tmp3, %else ]
-  %tmp5 = getelementptr i32, i32 addrspace(2)* %tmp4, i32 3000
-  %tmp6 = load i32, i32 addrspace(2)* %tmp5
+  %tmp4 = phi i32 addrspace(4)* [ %tmp1, %if ], [ %tmp3, %else ]
+  %tmp5 = getelementptr i32, i32 addrspace(4)* %tmp4, i32 3000
+  %tmp6 = load i32, i32 addrspace(4)* %tmp5
  store i32 %tmp6, i32 addrspace(1)* %out
  ret void
 }
@ -93,12 +93,12 @@ endif:                                            ; preds = %else, %if
 ; GCN-NOHSA-NOT: v_add
 ; GCN-NOHSA: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:16{{$}}
 ; GCN-HSA: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
-define amdgpu_kernel void @smrd_valu2(i32 addrspace(1)* %out, [8 x i32] addrspace(2)* %in) #1 {
+define amdgpu_kernel void @smrd_valu2(i32 addrspace(1)* %out, [8 x i32] addrspace(4)* %in) #1 {
 entry:
  %tmp = call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = add i32 %tmp, 4
-  %tmp2 = getelementptr [8 x i32], [8 x i32] addrspace(2)* %in, i32 %tmp, i32 4
-  %tmp3 = load i32, i32 addrspace(2)* %tmp2
+  %tmp2 = getelementptr [8 x i32], [8 x i32] addrspace(4)* %in, i32 %tmp, i32 4
+  %tmp3 = load i32, i32 addrspace(4)* %tmp2
  store i32 %tmp3, i32 addrspace(1)* %out
  ret void
 }
@ -113,12 +113,12 @@ entry:
 ; GCN-NOHSA: buffer_store_dword
 ; GCN-HSA: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
 ; GCN-HSA: flat_store_dword v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}
-define amdgpu_kernel void @smrd_valu_ci_offset(i32 addrspace(1)* %out, i32 addrspace(2)* %in, i32 %c) #1 {
+define amdgpu_kernel void @smrd_valu_ci_offset(i32 addrspace(1)* %out, i32 addrspace(4)* %in, i32 %c) #1 {
 entry:
  %tmp = call i32 @llvm.amdgcn.workitem.id.x()
-  %tmp2 = getelementptr i32, i32 addrspace(2)* %in, i32 %tmp
-  %tmp3 = getelementptr i32, i32 addrspace(2)* %tmp2, i32 5000
-  %tmp4 = load i32, i32 addrspace(2)* %tmp3
+  %tmp2 = getelementptr i32, i32 addrspace(4)* %in, i32 %tmp
+  %tmp3 = getelementptr i32, i32 addrspace(4)* %tmp2, i32 5000
+  %tmp4 = load i32, i32 addrspace(4)* %tmp3
  %tmp5 = add i32 %tmp4, %c
  store i32 %tmp5, i32 addrspace(1)* %out
  ret void
@ -133,12 +133,12 @@ entry:
 ; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
 ; GCN-NOHSA: buffer_store_dwordx2
 ; GCN-HSA: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
-define amdgpu_kernel void @smrd_valu_ci_offset_x2(i64 addrspace(1)* %out, i64 addrspace(2)* %in, i64 %c) #1 {
+define amdgpu_kernel void @smrd_valu_ci_offset_x2(i64 addrspace(1)* %out, i64 addrspace(4)* %in, i64 %c) #1 {
 entry:
  %tmp = call i32 @llvm.amdgcn.workitem.id.x()
-  %tmp2 = getelementptr i64, i64 addrspace(2)* %in, i32 %tmp
-  %tmp3 = getelementptr i64, i64 addrspace(2)* %tmp2, i32 5000
-  %tmp4 = load i64, i64 addrspace(2)* %tmp3
+  %tmp2 = getelementptr i64, i64 addrspace(4)* %in, i32 %tmp
+  %tmp3 = getelementptr i64, i64 addrspace(4)* %tmp2, i32 5000
+  %tmp4 = load i64, i64 addrspace(4)* %tmp3
  %tmp5 = or i64 %tmp4, %c
  store i64 %tmp5, i64 addrspace(1)* %out
  ret void
@ -155,12 +155,12 @@ entry:
 ; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
 ; GCN-NOHSA: buffer_store_dwordx4
 ; GCN-HSA: flat_load_dwordx4 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
-define amdgpu_kernel void @smrd_valu_ci_offset_x4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(2)* %in, <4 x i32> %c) #1 {
+define amdgpu_kernel void @smrd_valu_ci_offset_x4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(4)* %in, <4 x i32> %c) #1 {
 entry:
  %tmp = call i32 @llvm.amdgcn.workitem.id.x()
-  %tmp2 = getelementptr <4 x i32>, <4 x i32> addrspace(2)* %in, i32 %tmp
-  %tmp3 = getelementptr <4 x i32>, <4 x i32> addrspace(2)* %tmp2, i32 1234
-  %tmp4 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp3
+  %tmp2 = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %in, i32 %tmp
+  %tmp3 = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %tmp2, i32 1234
+  %tmp4 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp3
  %tmp5 = or <4 x i32> %tmp4, %c
  store <4 x i32> %tmp5, <4 x i32> addrspace(1)* %out
  ret void
@ -189,12 +189,12 @@ entry:
 ; GCN-NOHSA: buffer_store_dwordx4
 ; GCN-HSA: flat_load_dwordx4
 ; GCN-HSA: flat_load_dwordx4
-define amdgpu_kernel void @smrd_valu_ci_offset_x8(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(2)* %in, <8 x i32> %c) #1 {
+define amdgpu_kernel void @smrd_valu_ci_offset_x8(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(4)* %in, <8 x i32> %c) #1 {
 entry:
  %tmp = call i32 @llvm.amdgcn.workitem.id.x()
-  %tmp2 = getelementptr <8 x i32>, <8 x i32> addrspace(2)* %in, i32 %tmp
-  %tmp3 = getelementptr <8 x i32>, <8 x i32> addrspace(2)* %tmp2, i32 1234
-  %tmp4 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp3
+  %tmp2 = getelementptr <8 x i32>, <8 x i32> addrspace(4)* %in, i32 %tmp
+  %tmp3 = getelementptr <8 x i32>, <8 x i32> addrspace(4)* %tmp2, i32 1234
+  %tmp4 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp3
  %tmp5 = or <8 x i32> %tmp4, %c
  store <8 x i32> %tmp5, <8 x i32> addrspace(1)* %out
  ret void
@ -230,12 +230,12 @@ entry:
 ; GCN-HSA: flat_load_dwordx4

 ; GCN: s_endpgm
-define amdgpu_kernel void @smrd_valu_ci_offset_x16(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(2)* %in, <16 x i32> %c) #1 {
+define amdgpu_kernel void @smrd_valu_ci_offset_x16(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(4)* %in, <16 x i32> %c) #1 {
 entry:
  %tmp = call i32 @llvm.amdgcn.workitem.id.x()
-  %tmp2 = getelementptr <16 x i32>, <16 x i32> addrspace(2)* %in, i32 %tmp
-  %tmp3 = getelementptr <16 x i32>, <16 x i32> addrspace(2)* %tmp2, i32 1234
-  %tmp4 = load <16 x i32>, <16 x i32> addrspace(2)* %tmp3
+  %tmp2 = getelementptr <16 x i32>, <16 x i32> addrspace(4)* %in, i32 %tmp
+  %tmp3 = getelementptr <16 x i32>, <16 x i32> addrspace(4)* %tmp2, i32 1234
+  %tmp4 = load <16 x i32>, <16 x i32> addrspace(4)* %tmp3
  %tmp5 = or <16 x i32> %tmp4, %c
  store <16 x i32> %tmp5, <16 x i32> addrspace(1)* %out
  ret void
@ -247,12 +247,12 @@ entry:
 ; GCN: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, s{{[0-9]+}}, [[MOVED]]
 ; GCN-NOHSA: buffer_store_dword [[ADD]]
 ; GCN-HSA: flat_store_dword {{.*}}, [[ADD]]
-define amdgpu_kernel void @smrd_valu2_salu_user(i32 addrspace(1)* %out, [8 x i32] addrspace(2)* %in, i32 %a) #1 {
+define amdgpu_kernel void @smrd_valu2_salu_user(i32 addrspace(1)* %out, [8 x i32] addrspace(4)* %in, i32 %a) #1 {
 entry:
  %tmp = call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = add i32 %tmp, 4
-  %tmp2 = getelementptr [8 x i32], [8 x i32] addrspace(2)* %in, i32 %tmp, i32 4
-  %tmp3 = load i32, i32 addrspace(2)* %tmp2
+  %tmp2 = getelementptr [8 x i32], [8 x i32] addrspace(4)* %in, i32 %tmp, i32 4
+  %tmp3 = load i32, i32 addrspace(4)* %tmp2
  %tmp4 = add i32 %tmp3, %a
  store i32 %tmp4, i32 addrspace(1)* %out
  ret void
@ -261,12 +261,12 @@ entry:
 ; GCN-LABEL: {{^}}smrd_valu2_max_smrd_offset:
 ; GCN-NOHSA: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:1020{{$}}
 ; GCN-HSA flat_load_dword v{{[0-9]}}, v{{[0-9]+:[0-9]+}}
-define amdgpu_kernel void @smrd_valu2_max_smrd_offset(i32 addrspace(1)* %out, [1024 x i32] addrspace(2)* %in) #1 {
+define amdgpu_kernel void @smrd_valu2_max_smrd_offset(i32 addrspace(1)* %out, [1024 x i32] addrspace(4)* %in) #1 {
 entry:
  %tmp = call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = add i32 %tmp, 4
-  %tmp2 = getelementptr [1024 x i32], [1024 x i32] addrspace(2)* %in, i32 %tmp, i32 255
-  %tmp3 = load i32, i32 addrspace(2)* %tmp2
+  %tmp2 = getelementptr [1024 x i32], [1024 x i32] addrspace(4)* %in, i32 %tmp, i32 255
+  %tmp3 = load i32, i32 addrspace(4)* %tmp2
  store i32 %tmp3, i32 addrspace(1)* %out
  ret void
 }
@ -275,12 +275,12 @@ entry:
 ; GCN-NOHSA-NOT: v_add
 ; GCN-NOHSA: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:1024{{$}}
 ; GCN-HSA: flat_load_dword v{{[0-9]}}, v[{{[0-9]+:[0-9]+}}]
-define amdgpu_kernel void @smrd_valu2_mubuf_offset(i32 addrspace(1)* %out, [1024 x i32] addrspace(2)* %in) #1 {
+define amdgpu_kernel void @smrd_valu2_mubuf_offset(i32 addrspace(1)* %out, [1024 x i32] addrspace(4)* %in) #1 {
 entry:
  %tmp = call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = add i32 %tmp, 4
-  %tmp2 = getelementptr [1024 x i32], [1024 x i32] addrspace(2)* %in, i32 %tmp, i32 256
-  %tmp3 = load i32, i32 addrspace(2)* %tmp2
+  %tmp2 = getelementptr [1024 x i32], [1024 x i32] addrspace(4)* %in, i32 %tmp, i32 256
+  %tmp3 = load i32, i32 addrspace(4)* %tmp2
  store i32 %tmp3, i32 addrspace(1)* %out
  ret void
 }
@ -290,12 +290,12 @@ entry:
 ; GCN-NOHSA: buffer_load_dwordx4
 ; GCN-HSA: flat_load_dwordx4
 ; GCN-HSA: flat_load_dwordx4
-define amdgpu_kernel void @s_load_imm_v8i32(<8 x i32> addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) #1 {
+define amdgpu_kernel void @s_load_imm_v8i32(<8 x i32> addrspace(1)* %out, i32 addrspace(4)* nocapture readonly %in) #1 {
 entry:
  %tmp0 = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %tmp1 = getelementptr inbounds i32, i32 addrspace(2)* %in, i32 %tmp0
-  %tmp2 = bitcast i32 addrspace(2)* %tmp1 to <8 x i32> addrspace(2)*
-  %tmp3 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp2, align 4
+  %tmp1 = getelementptr inbounds i32, i32 addrspace(4)* %in, i32 %tmp0
+  %tmp2 = bitcast i32 addrspace(4)* %tmp1 to <8 x i32> addrspace(4)*
+  %tmp3 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp2, align 4
  store <8 x i32> %tmp3, <8 x i32> addrspace(1)* %out, align 32
  ret void
 }
@ -313,12 +313,12 @@ entry:
 ; GCN-NOHSA: buffer_store_dword
 ; GCN-HSA: flat_load_dwordx4
 ; GCN-HSA: flat_load_dwordx4
-define amdgpu_kernel void @s_load_imm_v8i32_salu_user(i32 addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) #1 {
+define amdgpu_kernel void @s_load_imm_v8i32_salu_user(i32 addrspace(1)* %out, i32 addrspace(4)* nocapture readonly %in) #1 {
 entry:
  %tmp0 = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %tmp1 = getelementptr inbounds i32, i32 addrspace(2)* %in, i32 %tmp0
-  %tmp2 = bitcast i32 addrspace(2)* %tmp1 to <8 x i32> addrspace(2)*
-  %tmp3 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp2, align 4
+  %tmp1 = getelementptr inbounds i32, i32 addrspace(4)* %in, i32 %tmp0
+  %tmp2 = bitcast i32 addrspace(4)* %tmp1 to <8 x i32> addrspace(4)*
+  %tmp3 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp2, align 4

  %elt0 = extractelement <8 x i32> %tmp3, i32 0
  %elt1 = extractelement <8 x i32> %tmp3, i32 1
@ -350,12 +350,12 @@ entry:
 ; GCN-HSA: flat_load_dwordx4
 ; GCN-HSA: flat_load_dwordx4
 ; GCN-HSA: flat_load_dwordx4
-define amdgpu_kernel void @s_load_imm_v16i32(<16 x i32> addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) #1 {
+define amdgpu_kernel void @s_load_imm_v16i32(<16 x i32> addrspace(1)* %out, i32 addrspace(4)* nocapture readonly %in) #1 {
 entry:
  %tmp0 = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %tmp1 = getelementptr inbounds i32, i32 addrspace(2)* %in, i32 %tmp0
-  %tmp2 = bitcast i32 addrspace(2)* %tmp1 to <16 x i32> addrspace(2)*
-  %tmp3 = load <16 x i32>, <16 x i32> addrspace(2)* %tmp2, align 4
+  %tmp1 = getelementptr inbounds i32, i32 addrspace(4)* %in, i32 %tmp0
+  %tmp2 = bitcast i32 addrspace(4)* %tmp1 to <16 x i32> addrspace(4)*
+  %tmp3 = load <16 x i32>, <16 x i32> addrspace(4)* %tmp2, align 4
  store <16 x i32> %tmp3, <16 x i32> addrspace(1)* %out, align 32
  ret void
 }
@ -385,12 +385,12 @@ entry:
 ; GCN-HSA: flat_load_dwordx4
 ; GCN-HSA: flat_load_dwordx4
 ; GCN-HSA: flat_load_dwordx4
-define amdgpu_kernel void @s_load_imm_v16i32_salu_user(i32 addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) #1 {
+define amdgpu_kernel void @s_load_imm_v16i32_salu_user(i32 addrspace(1)* %out, i32 addrspace(4)* nocapture readonly %in) #1 {
 entry:
  %tmp0 = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %tmp1 = getelementptr inbounds i32, i32 addrspace(2)* %in, i32 %tmp0
-  %tmp2 = bitcast i32 addrspace(2)* %tmp1 to <16 x i32> addrspace(2)*
-  %tmp3 = load <16 x i32>, <16 x i32> addrspace(2)* %tmp2, align 4
+  %tmp1 = getelementptr inbounds i32, i32 addrspace(4)* %in, i32 %tmp0
+  %tmp2 = bitcast i32 addrspace(4)* %tmp1 to <16 x i32> addrspace(4)*
+  %tmp3 = load <16 x i32>, <16 x i32> addrspace(4)* %tmp2, align 4

  %elt0 = extractelement <16 x i32> %tmp3, i32 0
  %elt1 = extractelement <16 x i32> %tmp3, i32 1
--- a/test/CodeGen/AMDGPU/sched-crash-dbg-value.mir
+++ b/test/CodeGen/AMDGPU/sched-crash-dbg-value.mir
@ -15,11 +15,11 @@
  bb:
    %0 = getelementptr i32, i32 addrspace(1)* %arg1, i64 0, !amdgpu.uniform !3, !amdgpu.noclobber !3
    %tmp5 = alloca %struct.wombat, align 16, addrspace(5)
-    %1 = call noalias nonnull dereferenceable(64) i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr()
-    %2 = bitcast i8 addrspace(2)* %1 to i32 addrspace(2)*
-    %3 = getelementptr inbounds i32, i32 addrspace(2)* %2, i64 1
-    %4 = bitcast i32 addrspace(2)* %3 to <2 x i32> addrspace(2)*, !amdgpu.uniform !3, !amdgpu.noclobber !3
-    %5 = load <2 x i32>, <2 x i32> addrspace(2)* %4, align 4, !invariant.load !3
+    %1 = call noalias nonnull dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
+    %2 = bitcast i8 addrspace(4)* %1 to i32 addrspace(4)*
+    %3 = getelementptr inbounds i32, i32 addrspace(4)* %2, i64 1
+    %4 = bitcast i32 addrspace(4)* %3 to <2 x i32> addrspace(4)*, !amdgpu.uniform !3, !amdgpu.noclobber !3
+    %5 = load <2 x i32>, <2 x i32> addrspace(4)* %4, align 4, !invariant.load !3
    %6 = extractelement <2 x i32> %5, i32 0
    %7 = extractelement <2 x i32> %5, i32 1
    %8 = lshr i32 %6, 16
@ -32,7 +32,7 @@
    %15 = add i32 %13, %14
    %16 = add i32 %15, %11
    %17 = getelementptr inbounds [256 x [16 x i8]], [256 x [16 x i8]] addrspace(3)* @sched_dbg_value_crash.tmp6, i32 0, i32 %16
-    %tmp7 = load i64, i64 addrspace(2)* null, align 536870912
+    %tmp7 = load i64, i64 addrspace(4)* null, align 536870912
    %tmp8 = tail call i32 @llvm.amdgcn.workitem.id.x() #3, !range !4
    %tmp9 = zext i32 %tmp8 to i64
    %tmp10 = add i64 %tmp7, %tmp9
@ -141,7 +141,7 @@
  declare <2 x float> @llvm.fmuladd.v2f32(<2 x float>, <2 x float>, <2 x float>) #1
  declare i32 @llvm.amdgcn.workitem.id.x() #1
  declare void @llvm.dbg.value(metadata, metadata, metadata) #1
-  declare i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #1
+  declare i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #1
  declare i32 @llvm.amdgcn.workitem.id.y() #1
  declare i32 @llvm.amdgcn.workitem.id.z() #1
  declare void @llvm.memcpy.p1i8.p5i8.i64(i8 addrspace(1)* nocapture writeonly, i8 addrspace(5)* nocapture readonly, i64, i32, i1) #0
@ -199,9 +199,9 @@ body:             |
    %2:vgpr_32 = COPY $vgpr2
    %1:vgpr_32 = COPY $vgpr1
    %0:vgpr_32 = COPY $vgpr0
-    %5:sreg_64_xexec = S_LOAD_DWORDX2_IMM %4, 0, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
-    %6:sreg_64_xexec = S_LOAD_DWORDX2_IMM %4, 8, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
-    %7:sreg_64_xexec = S_LOAD_DWORDX2_IMM %4, 16, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+    %5:sreg_64_xexec = S_LOAD_DWORDX2_IMM %4, 0, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(4)* undef`)
+    %6:sreg_64_xexec = S_LOAD_DWORDX2_IMM %4, 8, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(4)* undef`)
+    %7:sreg_64_xexec = S_LOAD_DWORDX2_IMM %4, 16, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(4)* undef`)
    %8:sreg_64_xexec = S_LOAD_DWORDX2_IMM %4, 24, 0
    %9:sreg_64_xexec = S_LOAD_DWORDX2_IMM %4, 32, 0
    %10:sreg_64_xexec = S_LOAD_DWORDX2_IMM %3, 4, 0
--- a/test/CodeGen/AMDGPU/sext-in-reg.ll
+++ b/test/CodeGen/AMDGPU/sext-in-reg.ll
@ -528,8 +528,8 @@ define amdgpu_kernel void @v_sext_in_reg_i32_to_i64_move_use(i64 addrspace(1)* %
 ; GFX89: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 15
 ; GFX89: s_sext_i32_i16 s{{[0-9]+}}, s{{[0-9]+}}
 ; GFX89: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 15
-define amdgpu_kernel void @s_sext_in_reg_i1_i16(i16 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 {
-  %ld = load i32, i32 addrspace(2)* %ptr
+define amdgpu_kernel void @s_sext_in_reg_i1_i16(i16 addrspace(1)* %out, i32 addrspace(4)* %ptr) #0 {
+  %ld = load i32, i32 addrspace(4)* %ptr
  %in = trunc i32 %ld to i16
  %shl = shl i16 %in, 15
  %sext = ashr i16 %shl, 15
@ -547,8 +547,8 @@ define amdgpu_kernel void @s_sext_in_reg_i1_i16(i16 addrspace(1)* %out, i32 addr
 ; GFX89: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 14
 ; GFX89: s_sext_i32_i16 s{{[0-9]+}}, s{{[0-9]+}}
 ; GFX89: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 14
-define amdgpu_kernel void @s_sext_in_reg_i2_i16(i16 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 {
-  %ld = load i32, i32 addrspace(2)* %ptr
+define amdgpu_kernel void @s_sext_in_reg_i2_i16(i16 addrspace(1)* %out, i32 addrspace(4)* %ptr) #0 {
+  %ld = load i32, i32 addrspace(4)* %ptr
  %in = trunc i32 %ld to i16
  %shl = shl i16 %in, 14
  %sext = ashr i16 %shl, 14
--- a/test/CodeGen/AMDGPU/sgpr-copy.ll
+++ b/test/CodeGen/AMDGPU/sgpr-copy.ll
@ -4,10 +4,10 @@
 ; CHECK-LABEL: {{^}}phi1:
 ; CHECK: s_buffer_load_dword [[DST:s[0-9]]], {{s\[[0-9]+:[0-9]+\]}}, 0x0
 ; CHECK: v_mov_b32_e32 v{{[0-9]}}, [[DST]]
-define amdgpu_ps void @phi1(<4 x i32> addrspace(2)* inreg %arg, <4 x i32> addrspace(2)* inreg %arg1, <8 x i32> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 {
+define amdgpu_ps void @phi1(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <8 x i32> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 {
 main_body:
-  %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(2)* %arg, i32 0
-  %tmp20 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp, !tbaa !0
+  %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg, i32 0
+  %tmp20 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp, !tbaa !0
  %tmp21 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 0)
  %tmp22 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 16)
  %tmp23 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 32)
@ -28,10 +28,10 @@ ENDIF:                                            ; preds = %ELSE, %main_body

 ; Make sure this program doesn't crash
 ; CHECK-LABEL: {{^}}phi2:
-define amdgpu_ps void @phi2(<4 x i32> addrspace(2)* inreg %arg, <4 x i32> addrspace(2)* inreg %arg1, <8 x i32> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #1 {
+define amdgpu_ps void @phi2(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <8 x i32> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #1 {
 main_body:
-  %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(2)* %arg, i32 0
-  %tmp20 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp, !tbaa !0
+  %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg, i32 0
+  %tmp20 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp, !tbaa !0
  %tmp21 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 16)
  %tmp22 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 32)
  %tmp23 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 36)
@ -47,10 +47,10 @@ main_body:
  %tmp33 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 84)
  %tmp34 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 88)
  %tmp35 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 92)
-  %tmp36 = getelementptr <8 x i32>, <8 x i32> addrspace(2)* %arg2, i32 0
-  %tmp37 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp36, !tbaa !0
-  %tmp38 = getelementptr <4 x i32>, <4 x i32> addrspace(2)* %arg1, i32 0
-  %tmp39 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp38, !tbaa !0
+  %tmp36 = getelementptr <8 x i32>, <8 x i32> addrspace(4)* %arg2, i32 0
+  %tmp37 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp36, !tbaa !0
+  %tmp38 = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg1, i32 0
+  %tmp39 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp38, !tbaa !0
  %i.i = extractelement <2 x i32> %arg5, i32 0
  %j.i = extractelement <2 x i32> %arg5, i32 1
  %i.f.i = bitcast i32 %i.i to float
@ -173,10 +173,10 @@ ENDIF24:                                          ; preds = %IF25, %ENDIF

 ; We just want ot make sure the program doesn't crash
 ; CHECK-LABEL: {{^}}loop:
-define amdgpu_ps void @loop(<4 x i32> addrspace(2)* inreg %arg, <4 x i32> addrspace(2)* inreg %arg1, <8 x i32> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 {
+define amdgpu_ps void @loop(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <8 x i32> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 {
 main_body:
-  %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(2)* %arg, i32 0
-  %tmp20 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp, !tbaa !0
+  %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg, i32 0
+  %tmp20 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp, !tbaa !0
  %tmp21 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 0)
  %tmp22 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 4)
  %tmp23 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 8)
@ -226,15 +226,15 @@ ENDIF:                                            ; preds = %LOOP
 ; CHECK: image_sample v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[SAMPLE_LO]]:[[SAMPLE_HI]]{{\]}}
 ; CHECK: exp
 ; CHECK: s_endpgm
-define amdgpu_ps void @sample_v3([17 x <4 x i32>] addrspace(2)* byval %arg, [32 x <4 x i32>] addrspace(2)* byval %arg1, [16 x <8 x i32>] addrspace(2)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) #0 {
+define amdgpu_ps void @sample_v3([17 x <4 x i32>] addrspace(4)* byval %arg, [32 x <4 x i32>] addrspace(4)* byval %arg1, [16 x <8 x i32>] addrspace(4)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) #0 {
 entry:
-  %tmp = getelementptr [17 x <4 x i32>], [17 x <4 x i32>] addrspace(2)* %arg, i64 0, i32 0
-  %tmp21 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp, !tbaa !0
+  %tmp = getelementptr [17 x <4 x i32>], [17 x <4 x i32>] addrspace(4)* %arg, i64 0, i32 0
+  %tmp21 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp, !tbaa !0
  %tmp22 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 16)
-  %tmp23 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 0
-  %tmp24 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp23, !tbaa !0
-  %tmp25 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(2)* %arg1, i64 0, i32 0
-  %tmp26 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp25, !tbaa !0
+  %tmp23 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(4)* %arg2, i64 0, i32 0
+  %tmp24 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp23, !tbaa !0
+  %tmp25 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(4)* %arg1, i64 0, i32 0
+  %tmp26 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp25, !tbaa !0
  %tmp27 = fcmp oeq float %tmp22, 0.000000e+00
  %tmp26.bc = bitcast <4 x i32> %tmp26 to <4 x i32>
  br i1 %tmp27, label %if, label %else
@ -290,7 +290,7 @@ endif:                                            ; preds = %if1, %if0, %entry
 ; This test is just checking that we don't crash / assertion fail.
 ; CHECK-LABEL: {{^}}copy2:
 ; CHECK: s_endpgm
-define amdgpu_ps void @copy2([17 x <4 x i32>] addrspace(2)* byval %arg, [32 x <4 x i32>] addrspace(2)* byval %arg1, [16 x <8 x i32>] addrspace(2)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) #0 {
+define amdgpu_ps void @copy2([17 x <4 x i32>] addrspace(4)* byval %arg, [32 x <4 x i32>] addrspace(4)* byval %arg1, [16 x <8 x i32>] addrspace(4)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) #0 {
 entry:
  br label %LOOP68

@ -326,15 +326,15 @@ ENDIF69:                                          ; preds = %LOOP68
 ; [[END]]:
 ; CHECK: image_sample v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+}}:[[ADD]]{{\]}}
 ; CHECK: s_endpgm
-define amdgpu_ps void @sample_rsrc([6 x <4 x i32>] addrspace(2)* byval %arg, [17 x <4 x i32>] addrspace(2)* byval %arg1, [16 x <4 x i32>] addrspace(2)* byval %arg2, [32 x <8 x i32>] addrspace(2)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, i32 %arg19, float %arg20, float %arg21) #0 {
+define amdgpu_ps void @sample_rsrc([6 x <4 x i32>] addrspace(4)* byval %arg, [17 x <4 x i32>] addrspace(4)* byval %arg1, [16 x <4 x i32>] addrspace(4)* byval %arg2, [32 x <8 x i32>] addrspace(4)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, i32 %arg19, float %arg20, float %arg21) #0 {
 bb:
-  %tmp = getelementptr [17 x <4 x i32>], [17 x <4 x i32>] addrspace(2)* %arg1, i32 0, i32 0
-  %tmp22 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp, !tbaa !3
+  %tmp = getelementptr [17 x <4 x i32>], [17 x <4 x i32>] addrspace(4)* %arg1, i32 0, i32 0
+  %tmp22 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp, !tbaa !3
  %tmp23 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp22, i32 16)
-  %tmp25 = getelementptr [32 x <8 x i32>], [32 x <8 x i32>] addrspace(2)* %arg3, i32 0, i32 0
-  %tmp26 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp25, !tbaa !3
-  %tmp27 = getelementptr [16 x <4 x i32>], [16 x <4 x i32>] addrspace(2)* %arg2, i32 0, i32 0
-  %tmp28 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp27, !tbaa !3
+  %tmp25 = getelementptr [32 x <8 x i32>], [32 x <8 x i32>] addrspace(4)* %arg3, i32 0, i32 0
+  %tmp26 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp25, !tbaa !3
+  %tmp27 = getelementptr [16 x <4 x i32>], [16 x <4 x i32>] addrspace(4)* %arg2, i32 0, i32 0
+  %tmp28 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp27, !tbaa !3
  %i.i = extractelement <2 x i32> %arg7, i32 0
  %j.i = extractelement <2 x i32> %arg7, i32 1
  %i.f.i = bitcast i32 %i.i to float
@ -382,11 +382,11 @@ bb71:                                             ; preds = %bb80, %bb38
 ; Check the resource descriptor is stored in an sgpr.
 ; CHECK-LABEL: {{^}}mimg_srsrc_sgpr:
 ; CHECK: image_sample v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x1
-define amdgpu_ps void @mimg_srsrc_sgpr([34 x <8 x i32>] addrspace(2)* byval %arg) #0 {
+define amdgpu_ps void @mimg_srsrc_sgpr([34 x <8 x i32>] addrspace(4)* byval %arg) #0 {
 bb:
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
-  %tmp7 = getelementptr [34 x <8 x i32>], [34 x <8 x i32>] addrspace(2)* %arg, i32 0, i32 %tid
-  %tmp8 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp7, align 32, !tbaa !0
+  %tmp7 = getelementptr [34 x <8 x i32>], [34 x <8 x i32>] addrspace(4)* %arg, i32 0, i32 %tid
+  %tmp8 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp7, align 32, !tbaa !0
  %tmp = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> <float 7.500000e-01, float 2.500000e-01>, <8 x i32> %tmp8, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false)
  %tmp10 = extractelement <4 x float> %tmp, i32 0
  %tmp12 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float undef, float %tmp10)
@ -397,11 +397,11 @@ bb:
 ; Check the sampler is stored in an sgpr.
 ; CHECK-LABEL: {{^}}mimg_ssamp_sgpr:
 ; CHECK: image_sample v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x1
-define amdgpu_ps void @mimg_ssamp_sgpr([17 x <4 x i32>] addrspace(2)* byval %arg) #0 {
+define amdgpu_ps void @mimg_ssamp_sgpr([17 x <4 x i32>] addrspace(4)* byval %arg) #0 {
 bb:
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
-  %tmp7 = getelementptr [17 x <4 x i32>], [17 x <4 x i32>] addrspace(2)* %arg, i32 0, i32 %tid
-  %tmp8 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp7, align 16, !tbaa !0
+  %tmp7 = getelementptr [17 x <4 x i32>], [17 x <4 x i32>] addrspace(4)* %arg, i32 0, i32 %tid
+  %tmp8 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp7, align 16, !tbaa !0
  %tmp = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> <float 7.500000e-01, float 2.500000e-01>, <8 x i32> undef, <4 x i32> %tmp8, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false)
  %tmp10 = extractelement <4 x float> %tmp, i32 0
  %tmp12 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %tmp10, float undef)
--- a/test/CodeGen/AMDGPU/si-lod-bias.ll
+++ b/test/CodeGen/AMDGPU/si-lod-bias.ll
@ -6,15 +6,15 @@

 ; GCN-LABEL: {{^}}main:
 ; GCN: image_sample_b v{{\[[0-9]:[0-9]\]}}, v{{\[[0-9]:[0-9]\]}}, s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf
-define amdgpu_ps void @main(<4 x i32> addrspace(2)* inreg %arg, <4 x i32> addrspace(2)* inreg %arg1, <8 x i32> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 {
+define amdgpu_ps void @main(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <8 x i32> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 {
 main_body:
-  %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(2)* %arg, i32 0
-  %tmp20 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp, !tbaa !0
+  %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg, i32 0
+  %tmp20 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp, !tbaa !0
  %tmp21 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 16)
-  %tmp22 = getelementptr <8 x i32>, <8 x i32> addrspace(2)* %arg2, i32 0
-  %tmp23 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp22, !tbaa !0
-  %tmp24 = getelementptr <4 x i32>, <4 x i32> addrspace(2)* %arg1, i32 0
-  %tmp25 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp24, !tbaa !0
+  %tmp22 = getelementptr <8 x i32>, <8 x i32> addrspace(4)* %arg2, i32 0
+  %tmp23 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp22, !tbaa !0
+  %tmp24 = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg1, i32 0
+  %tmp25 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp24, !tbaa !0
  %i.i = extractelement <2 x i32> %arg5, i32 0
  %j.i = extractelement <2 x i32> %arg5, i32 1
  %i.f.i = bitcast i32 %i.i to float
--- a/test/CodeGen/AMDGPU/si-scheduler.ll
+++ b/test/CodeGen/AMDGPU/si-scheduler.ll
@ -16,12 +16,12 @@
 ; CHECK: s_waitcnt vmcnt(0)
 ; CHECK: exp
 ; CHECK: s_endpgm
-define amdgpu_ps void @main([6 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [17 x <4 x i32>] addrspace(2)* byval %arg2, [34 x <8 x i32>] addrspace(2)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, i32 %arg19, float %arg20, float %arg21) #0 {
+define amdgpu_ps void @main([6 x <16 x i8>] addrspace(4)* byval %arg, [17 x <16 x i8>] addrspace(4)* byval %arg1, [17 x <4 x i32>] addrspace(4)* byval %arg2, [34 x <8 x i32>] addrspace(4)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, i32 %arg19, float %arg20, float %arg21) #0 {
 main_body:
-  %tmp = bitcast [34 x <8 x i32>] addrspace(2)* %arg3 to <32 x i8> addrspace(2)*
-  %tmp22 = load <32 x i8>, <32 x i8> addrspace(2)* %tmp, align 32, !tbaa !0
-  %tmp23 = bitcast [17 x <4 x i32>] addrspace(2)* %arg2 to <16 x i8> addrspace(2)*
-  %tmp24 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp23, align 16, !tbaa !0
+  %tmp = bitcast [34 x <8 x i32>] addrspace(4)* %arg3 to <32 x i8> addrspace(4)*
+  %tmp22 = load <32 x i8>, <32 x i8> addrspace(4)* %tmp, align 32, !tbaa !0
+  %tmp23 = bitcast [17 x <4 x i32>] addrspace(4)* %arg2 to <16 x i8> addrspace(4)*
+  %tmp24 = load <16 x i8>, <16 x i8> addrspace(4)* %tmp23, align 16, !tbaa !0
  %i.i = extractelement <2 x i32> %arg11, i32 0
  %j.i = extractelement <2 x i32> %arg11, i32 1
  %i.f.i = bitcast i32 %i.i to float
--- a/test/CodeGen/AMDGPU/si-sgpr-spill.ll
+++ b/test/CodeGen/AMDGPU/si-sgpr-spill.ll
@ -24,10 +24,10 @@
 ; GCN: s_endpgm

 ; TOVGPR: ScratchSize: 0{{$}}
-define amdgpu_ps void @main([17 x <4 x i32>] addrspace(2)* byval %arg, [32 x <4 x i32>] addrspace(2)* byval %arg1, [16 x <8 x i32>] addrspace(2)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) {
+define amdgpu_ps void @main([17 x <4 x i32>] addrspace(4)* byval %arg, [32 x <4 x i32>] addrspace(4)* byval %arg1, [16 x <8 x i32>] addrspace(4)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) {
 main_body:
-  %tmp = getelementptr [17 x <4 x i32>], [17 x <4 x i32>] addrspace(2)* %arg, i64 0, i32 0
-  %tmp21 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp, !tbaa !0
+  %tmp = getelementptr [17 x <4 x i32>], [17 x <4 x i32>] addrspace(4)* %arg, i64 0, i32 0
+  %tmp21 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp, !tbaa !0
  %tmp22 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 96)
  %tmp23 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 100)
  %tmp24 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 104)
@ -66,39 +66,39 @@ main_body:
  %tmp57 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 372)
  %tmp58 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 376)
  %tmp59 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 384)
-  %tmp60 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 0
-  %tmp61 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp60, !tbaa !0
-  %tmp62 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(2)* %arg1, i64 0, i32 0
-  %tmp63 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp62, !tbaa !0
+  %tmp60 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(4)* %arg2, i64 0, i32 0
+  %tmp61 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp60, !tbaa !0
+  %tmp62 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(4)* %arg1, i64 0, i32 0
+  %tmp63 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp62, !tbaa !0
  %tmp63.bc = bitcast <4 x i32> %tmp63 to <4 x i32>
-  %tmp64 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 1
-  %tmp65 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp64, !tbaa !0
-  %tmp66 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(2)* %arg1, i64 0, i32 1
-  %tmp67 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp66, !tbaa !0
-  %tmp68 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 2
-  %tmp69 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp68, !tbaa !0
-  %tmp70 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(2)* %arg1, i64 0, i32 2
-  %tmp71 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp70, !tbaa !0
-  %tmp72 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 3
-  %tmp73 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp72, !tbaa !0
-  %tmp74 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(2)* %arg1, i64 0, i32 3
-  %tmp75 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp74, !tbaa !0
-  %tmp76 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 4
-  %tmp77 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp76, !tbaa !0
-  %tmp78 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(2)* %arg1, i64 0, i32 4
-  %tmp79 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp78, !tbaa !0
-  %tmp80 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 5
-  %tmp81 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp80, !tbaa !0
-  %tmp82 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(2)* %arg1, i64 0, i32 5
-  %tmp83 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp82, !tbaa !0
-  %tmp84 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 6
-  %tmp85 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp84, !tbaa !0
-  %tmp86 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(2)* %arg1, i64 0, i32 6
-  %tmp87 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp86, !tbaa !0
-  %tmp88 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 7
-  %tmp89 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp88, !tbaa !0
-  %tmp90 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(2)* %arg1, i64 0, i32 7
-  %tmp91 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp90, !tbaa !0
+  %tmp64 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(4)* %arg2, i64 0, i32 1
+  %tmp65 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp64, !tbaa !0
+  %tmp66 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(4)* %arg1, i64 0, i32 1
+  %tmp67 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp66, !tbaa !0
+  %tmp68 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(4)* %arg2, i64 0, i32 2
+  %tmp69 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp68, !tbaa !0
+  %tmp70 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(4)* %arg1, i64 0, i32 2
+  %tmp71 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp70, !tbaa !0
+  %tmp72 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(4)* %arg2, i64 0, i32 3
+  %tmp73 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp72, !tbaa !0
+  %tmp74 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(4)* %arg1, i64 0, i32 3
+  %tmp75 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp74, !tbaa !0
+  %tmp76 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(4)* %arg2, i64 0, i32 4
+  %tmp77 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp76, !tbaa !0
+  %tmp78 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(4)* %arg1, i64 0, i32 4
+  %tmp79 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp78, !tbaa !0
+  %tmp80 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(4)* %arg2, i64 0, i32 5
+  %tmp81 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp80, !tbaa !0
+  %tmp82 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(4)* %arg1, i64 0, i32 5
+  %tmp83 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp82, !tbaa !0
+  %tmp84 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(4)* %arg2, i64 0, i32 6
+  %tmp85 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp84, !tbaa !0
+  %tmp86 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(4)* %arg1, i64 0, i32 6
+  %tmp87 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp86, !tbaa !0
+  %tmp88 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(4)* %arg2, i64 0, i32 7
+  %tmp89 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp88, !tbaa !0
+  %tmp90 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(4)* %arg1, i64 0, i32 7
+  %tmp91 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp90, !tbaa !0
  %i.i = extractelement <2 x i32> %arg6, i32 0
  %j.i = extractelement <2 x i32> %arg6, i32 1
  %i.f.i = bitcast i32 %i.i to float
@ -778,10 +778,10 @@ ENDIF66:                                          ; preds = %LOOP65
 ; GCN-LABEL: {{^}}main1:
 ; GCN: s_endpgm
 ; TOVGPR: ScratchSize: 0{{$}}
-define amdgpu_ps void @main1([17 x <4 x i32>] addrspace(2)* byval %arg, [32 x <4 x i32>] addrspace(2)* byval %arg1, [16 x <8 x i32>] addrspace(2)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) #0 {
+define amdgpu_ps void @main1([17 x <4 x i32>] addrspace(4)* byval %arg, [32 x <4 x i32>] addrspace(4)* byval %arg1, [16 x <8 x i32>] addrspace(4)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) #0 {
 main_body:
-  %tmp = getelementptr [17 x <4 x i32>], [17 x <4 x i32>] addrspace(2)* %arg, i64 0, i32 0
-  %tmp21 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp, !tbaa !0
+  %tmp = getelementptr [17 x <4 x i32>], [17 x <4 x i32>] addrspace(4)* %arg, i64 0, i32 0
+  %tmp21 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp, !tbaa !0
  %tmp22 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 0)
  %tmp23 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 4)
  %tmp24 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 8)
@ -885,42 +885,42 @@ main_body:
  %tmp122 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 716)
  %tmp123 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 864)
  %tmp124 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 868)
-  %tmp125 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 0
-  %tmp126 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp125, !tbaa !0
-  %tmp127 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(2)* %arg1, i64 0, i32 0
-  %tmp128 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp127, !tbaa !0
-  %tmp129 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 1
-  %tmp130 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp129, !tbaa !0
-  %tmp131 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(2)* %arg1, i64 0, i32 1
-  %tmp132 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp131, !tbaa !0
-  %tmp133 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 2
-  %tmp134 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp133, !tbaa !0
-  %tmp135 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(2)* %arg1, i64 0, i32 2
-  %tmp136 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp135, !tbaa !0
-  %tmp137 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 3
-  %tmp138 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp137, !tbaa !0
-  %tmp139 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(2)* %arg1, i64 0, i32 3
-  %tmp140 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp139, !tbaa !0
-  %tmp141 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 4
-  %tmp142 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp141, !tbaa !0
-  %tmp143 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(2)* %arg1, i64 0, i32 4
-  %tmp144 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp143, !tbaa !0
-  %tmp145 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 5
-  %tmp146 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp145, !tbaa !0
-  %tmp147 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(2)* %arg1, i64 0, i32 5
-  %tmp148 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp147, !tbaa !0
-  %tmp149 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 6
-  %tmp150 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp149, !tbaa !0
-  %tmp151 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(2)* %arg1, i64 0, i32 6
-  %tmp152 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp151, !tbaa !0
-  %tmp153 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 7
-  %tmp154 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp153, !tbaa !0
-  %tmp155 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(2)* %arg1, i64 0, i32 7
-  %tmp156 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp155, !tbaa !0
-  %tmp157 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 8
-  %tmp158 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp157, !tbaa !0
-  %tmp159 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(2)* %arg1, i64 0, i32 8
-  %tmp160 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp159, !tbaa !0
+  %tmp125 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(4)* %arg2, i64 0, i32 0
+  %tmp126 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp125, !tbaa !0
+  %tmp127 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(4)* %arg1, i64 0, i32 0
+  %tmp128 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp127, !tbaa !0
+  %tmp129 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(4)* %arg2, i64 0, i32 1
+  %tmp130 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp129, !tbaa !0
+  %tmp131 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(4)* %arg1, i64 0, i32 1
+  %tmp132 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp131, !tbaa !0
+  %tmp133 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(4)* %arg2, i64 0, i32 2
+  %tmp134 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp133, !tbaa !0
+  %tmp135 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(4)* %arg1, i64 0, i32 2
+  %tmp136 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp135, !tbaa !0
+  %tmp137 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(4)* %arg2, i64 0, i32 3
+  %tmp138 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp137, !tbaa !0
+  %tmp139 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(4)* %arg1, i64 0, i32 3
+  %tmp140 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp139, !tbaa !0
+  %tmp141 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(4)* %arg2, i64 0, i32 4
+  %tmp142 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp141, !tbaa !0
+  %tmp143 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(4)* %arg1, i64 0, i32 4
+  %tmp144 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp143, !tbaa !0
+  %tmp145 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(4)* %arg2, i64 0, i32 5
+  %tmp146 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp145, !tbaa !0
+  %tmp147 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(4)* %arg1, i64 0, i32 5
+  %tmp148 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp147, !tbaa !0
+  %tmp149 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(4)* %arg2, i64 0, i32 6
+  %tmp150 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp149, !tbaa !0
+  %tmp151 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(4)* %arg1, i64 0, i32 6
+  %tmp152 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp151, !tbaa !0
+  %tmp153 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(4)* %arg2, i64 0, i32 7
+  %tmp154 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp153, !tbaa !0
+  %tmp155 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(4)* %arg1, i64 0, i32 7
+  %tmp156 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp155, !tbaa !0
+  %tmp157 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(4)* %arg2, i64 0, i32 8
+  %tmp158 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp157, !tbaa !0
+  %tmp159 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(4)* %arg1, i64 0, i32 8
+  %tmp160 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp159, !tbaa !0
  %tmp161 = fcmp ugt float %arg17, 0.000000e+00
  %tmp162 = select i1 %tmp161, float 1.000000e+00, float 0.000000e+00
  %i.i = extractelement <2 x i32> %arg6, i32 0
--- a/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
+++ b/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
@ -8,7 +8,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #2


@stored_lds_ptr = addrspace(3) global i32 addrspace(3)* undef, align 4
-@stored_constant_ptr = addrspace(3) global i32 addrspace(2)* undef, align 8
+@stored_constant_ptr = addrspace(3) global i32 addrspace(4)* undef, align 8
@stored_global_ptr = addrspace(3) global i32 addrspace(1)* undef, align 8

 ; GCN-LABEL: {{^}}reorder_local_load_global_store_local_load:
@ -100,14 +100,14 @@ define amdgpu_kernel void @no_reorder_barrier_local_load_global_store_local_load
 ; CI: buffer_store_dword
 ; GFX9: global_store_dword
 define amdgpu_kernel void @reorder_constant_load_global_store_constant_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
-  %ptr0 = load i32 addrspace(2)*, i32 addrspace(2)* addrspace(3)* @stored_constant_ptr, align 8
+  %ptr0 = load i32 addrspace(4)*, i32 addrspace(4)* addrspace(3)* @stored_constant_ptr, align 8

-  %ptr1 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 1
-  %ptr2 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 3
+  %ptr1 = getelementptr inbounds i32, i32 addrspace(4)* %ptr0, i64 1
+  %ptr2 = getelementptr inbounds i32, i32 addrspace(4)* %ptr0, i64 3

-  %tmp1 = load i32, i32 addrspace(2)* %ptr1, align 4
+  %tmp1 = load i32, i32 addrspace(4)* %ptr1, align 4
  store i32 99, i32 addrspace(1)* %gptr, align 4
-  %tmp2 = load i32, i32 addrspace(2)* %ptr2, align 4
+  %tmp2 = load i32, i32 addrspace(4)* %ptr2, align 4

  %add = add nsw i32 %tmp1, %tmp2

@ -129,14 +129,14 @@ define amdgpu_kernel void @reorder_constant_load_global_store_constant_load(i32
 ; CI: buffer_store_dword
 ; GFX9: global_store_dword
 define amdgpu_kernel void @reorder_constant_load_local_store_constant_load(i32 addrspace(1)* %out, i32 addrspace(3)* %lptr) #0 {
-  %ptr0 = load i32 addrspace(2)*, i32 addrspace(2)* addrspace(3)* @stored_constant_ptr, align 8
+  %ptr0 = load i32 addrspace(4)*, i32 addrspace(4)* addrspace(3)* @stored_constant_ptr, align 8

-  %ptr1 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 1
-  %ptr2 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 3
+  %ptr1 = getelementptr inbounds i32, i32 addrspace(4)* %ptr0, i64 1
+  %ptr2 = getelementptr inbounds i32, i32 addrspace(4)* %ptr0, i64 3

-  %tmp1 = load i32, i32 addrspace(2)* %ptr1, align 4
+  %tmp1 = load i32, i32 addrspace(4)* %ptr1, align 4
  store i32 99, i32 addrspace(3)* %lptr, align 4
-  %tmp2 = load i32, i32 addrspace(2)* %ptr2, align 4
+  %tmp2 = load i32, i32 addrspace(4)* %ptr2, align 4

  %add = add nsw i32 %tmp1, %tmp2

@ -151,13 +151,13 @@ define amdgpu_kernel void @reorder_constant_load_local_store_constant_load(i32 a
 ; GCN: ds_write_b32
 ; CI: buffer_store_dword
 ; GFX9: global_store_dword
-define amdgpu_kernel void @reorder_smrd_load_local_store_smrd_load(i32 addrspace(1)* %out, i32 addrspace(3)* noalias %lptr, i32 addrspace(2)* %ptr0) #0 {
-  %ptr1 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 1
-  %ptr2 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 2
+define amdgpu_kernel void @reorder_smrd_load_local_store_smrd_load(i32 addrspace(1)* %out, i32 addrspace(3)* noalias %lptr, i32 addrspace(4)* %ptr0) #0 {
+  %ptr1 = getelementptr inbounds i32, i32 addrspace(4)* %ptr0, i64 1
+  %ptr2 = getelementptr inbounds i32, i32 addrspace(4)* %ptr0, i64 2

-  %tmp1 = load i32, i32 addrspace(2)* %ptr1, align 4
+  %tmp1 = load i32, i32 addrspace(4)* %ptr1, align 4
  store i32 99, i32 addrspace(3)* %lptr, align 4
-  %tmp2 = load i32, i32 addrspace(2)* %ptr2, align 4
+  %tmp2 = load i32, i32 addrspace(4)* %ptr2, align 4

  %add = add nsw i32 %tmp1, %tmp2

--- a/test/CodeGen/AMDGPU/smrd-vccz-bug.ll
+++ b/test/CodeGen/AMDGPU/smrd-vccz-bug.ll
@ -12,10 +12,10 @@
 ; GCN: buffer_store_dword
 ; GCN: [[EXIT]]:
 ; GCN: s_endpgm
-define amdgpu_kernel void @vccz_workaround(i32 addrspace(2)* %in, i32 addrspace(1)* %out, float %cond) {
+define amdgpu_kernel void @vccz_workaround(i32 addrspace(4)* %in, i32 addrspace(1)* %out, float %cond) {
 entry:
  %cnd = fcmp oeq float 0.0, %cond
-  %sgpr = load volatile i32, i32 addrspace(2)* %in
+  %sgpr = load volatile i32, i32 addrspace(4)* %in
  br i1 %cnd, label %if, label %endif

 if:
--- a/test/CodeGen/AMDGPU/smrd.ll
+++ b/test/CodeGen/AMDGPU/smrd.ll
@ -7,10 +7,10 @@
 ; GCN-LABEL: {{^}}smrd0:
 ; SICI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x1 ; encoding: [0x01
 ; VIGFX9: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4
-define amdgpu_kernel void @smrd0(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 {
+define amdgpu_kernel void @smrd0(i32 addrspace(1)* %out, i32 addrspace(4)* %ptr) #0 {
 entry:
-  %tmp = getelementptr i32, i32 addrspace(2)* %ptr, i64 1
-  %tmp1 = load i32, i32 addrspace(2)* %tmp
+  %tmp = getelementptr i32, i32 addrspace(4)* %ptr, i64 1
+  %tmp1 = load i32, i32 addrspace(4)* %tmp
  store i32 %tmp1, i32 addrspace(1)* %out
  ret void
 }
@ -19,10 +19,10 @@ entry:
 ; GCN-LABEL: {{^}}smrd1:
 ; SICI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff,0x{{[0-9]+[137]}}
 ; VIGFX9: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc
-define amdgpu_kernel void @smrd1(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 {
+define amdgpu_kernel void @smrd1(i32 addrspace(1)* %out, i32 addrspace(4)* %ptr) #0 {
 entry:
-  %tmp = getelementptr i32, i32 addrspace(2)* %ptr, i64 255
-  %tmp1 = load i32, i32 addrspace(2)* %tmp
+  %tmp = getelementptr i32, i32 addrspace(4)* %ptr, i64 255
+  %tmp1 = load i32, i32 addrspace(4)* %tmp
  store i32 %tmp1, i32 addrspace(1)* %out
  ret void
 }
@ -34,10 +34,10 @@ entry:
 ; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x100
 ; VIGFX9: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400
 ; GCN: s_endpgm
-define amdgpu_kernel void @smrd2(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 {
+define amdgpu_kernel void @smrd2(i32 addrspace(1)* %out, i32 addrspace(4)* %ptr) #0 {
 entry:
-  %tmp = getelementptr i32, i32 addrspace(2)* %ptr, i64 256
-  %tmp1 = load i32, i32 addrspace(2)* %tmp
+  %tmp = getelementptr i32, i32 addrspace(4)* %ptr, i64 256
+  %tmp1 = load i32, i32 addrspace(4)* %tmp
  store i32 %tmp1, i32 addrspace(1)* %out
  ret void
 }
@ -49,10 +49,10 @@ entry:
 ; SI: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[{{[0-9]:[0-9]}}], 0xb ; encoding: [0x0b
 ; TODO: Add VI checks
 ; GCN: s_endpgm
-define amdgpu_kernel void @smrd3(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 {
+define amdgpu_kernel void @smrd3(i32 addrspace(1)* %out, i32 addrspace(4)* %ptr) #0 {
 entry:
-  %tmp = getelementptr i32, i32 addrspace(2)* %ptr, i64 4294967296
-  %tmp1 = load i32, i32 addrspace(2)* %tmp
+  %tmp = getelementptr i32, i32 addrspace(4)* %ptr, i64 4294967296
+  %tmp1 = load i32, i32 addrspace(4)* %tmp
  store i32 %tmp1, i32 addrspace(1)* %out
  ret void
 }
@ -63,10 +63,10 @@ entry:
 ; SI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
 ; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3ffff
 ; VIGFX9: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xffffc
-define amdgpu_kernel void @smrd4(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 {
+define amdgpu_kernel void @smrd4(i32 addrspace(1)* %out, i32 addrspace(4)* %ptr) #0 {
 entry:
-  %tmp = getelementptr i32, i32 addrspace(2)* %ptr, i64 262143
-  %tmp1 = load i32, i32 addrspace(2)* %tmp
+  %tmp = getelementptr i32, i32 addrspace(4)* %ptr, i64 262143
+  %tmp1 = load i32, i32 addrspace(4)* %tmp
  store i32 %tmp1, i32 addrspace(1)* %out
  ret void
 }
@ -77,10 +77,10 @@ entry:
 ; SIVIGFX9: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
 ; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x40000
 ; GCN: s_endpgm
-define amdgpu_kernel void @smrd5(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 {
+define amdgpu_kernel void @smrd5(i32 addrspace(1)* %out, i32 addrspace(4)* %ptr) #0 {
 entry:
-  %tmp = getelementptr i32, i32 addrspace(2)* %ptr, i64 262144
-  %tmp1 = load i32, i32 addrspace(2)* %tmp
+  %tmp = getelementptr i32, i32 addrspace(4)* %ptr, i64 262144
+  %tmp1 = load i32, i32 addrspace(4)* %tmp
  store i32 %tmp1, i32 addrspace(1)* %out
  ret void
 }
@ -106,10 +106,10 @@ main_body:
 ; GCN-LABEL: {{^}}smrd_load_const0:
 ; SICI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4 ; encoding: [0x04
 ; VIGFX9: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x10
-define amdgpu_ps void @smrd_load_const0(<4 x i32> addrspace(2)* inreg %arg, <4 x i32> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 {
+define amdgpu_ps void @smrd_load_const0(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 {
 main_body:
-  %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(2)* %arg, i32 0
-  %tmp20 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp
+  %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg, i32 0
+  %tmp20 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp
  %tmp21 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 16)
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %tmp21, i1 true, i1 true) #0
  ret void
@ -120,10 +120,10 @@ main_body:
 ; GCN-LABEL: {{^}}smrd_load_const1:
 ; SICI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff
 ; VIGFX9: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc
-define amdgpu_ps void @smrd_load_const1(<4 x i32> addrspace(2)* inreg %arg, <4 x i32> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 {
+define amdgpu_ps void @smrd_load_const1(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 {
 main_body:
-  %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(2)* %arg, i32 0
-  %tmp20 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp
+  %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg, i32 0
+  %tmp20 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp
  %tmp21 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 1020)
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %tmp21, i1 true, i1 true) #0
  ret void
@ -137,10 +137,10 @@ main_body:
 ; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], s[[OFFSET]] ; encoding: [0x0[[OFFSET]]
 ; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x100
 ; VIGFX9: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400
-define amdgpu_ps void @smrd_load_const2(<4 x i32> addrspace(2)* inreg %arg, <4 x i32> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 {
+define amdgpu_ps void @smrd_load_const2(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 {
 main_body:
-  %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(2)* %arg, i32 0
-  %tmp20 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp
+  %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg, i32 0
+  %tmp20 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp
  %tmp21 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 1024)
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %tmp21, i1 true, i1 true) #0
  ret void
@ -152,10 +152,10 @@ main_body:
 ; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
 ; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3ffff
 ; VIGFX9: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xffffc
-define amdgpu_ps void @smrd_load_const3(<4 x i32> addrspace(2)* inreg %arg, <4 x i32> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 {
+define amdgpu_ps void @smrd_load_const3(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 {
 main_body:
-  %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(2)* %arg, i32 0
-  %tmp20 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp
+  %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg, i32 0
+  %tmp20 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp
  %tmp21 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 1048572)
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %tmp21, i1 true, i1 true) #0
  ret void
@ -167,10 +167,10 @@ main_body:
 ; SIVIGFX9: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
 ; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x40000
 ; GCN: s_endpgm
-define amdgpu_ps void @smrd_load_const4(<4 x i32> addrspace(2)* inreg %arg, <4 x i32> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 {
+define amdgpu_ps void @smrd_load_const4(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 {
 main_body:
-  %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(2)* %arg, i32 0
-  %tmp20 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp
+  %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg, i32 0
+  %tmp20 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp
  %tmp21 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 1048576)
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %tmp21, i1 true, i1 true) #0
  ret void
@ -257,9 +257,9 @@ main_body:

 ; GCN-LABEL: {{^}}smrd_sgpr_descriptor_promoted
 ; GCN: v_readfirstlane
-define amdgpu_cs void @smrd_sgpr_descriptor_promoted([0 x i8] addrspace(2)* inreg noalias dereferenceable(18446744073709551615), i32) #0 {
+define amdgpu_cs void @smrd_sgpr_descriptor_promoted([0 x i8] addrspace(4)* inreg noalias dereferenceable(18446744073709551615), i32) #0 {
 main_body:
-  %descptr = bitcast [0 x i8] addrspace(2)* %0 to <4 x i32> addrspace(2)*, !amdgpu.uniform !0
+  %descptr = bitcast [0 x i8] addrspace(4)* %0 to <4 x i32> addrspace(4)*, !amdgpu.uniform !0
  br label %.outer_loop_header

 ret_block:                                       ; preds = %.outer, %.label22, %main_body
@ -275,7 +275,7 @@ ret_block:                                       ; preds = %.outer, %.label22, %
  br i1 %inner_br1, label %.inner_loop_body, label %ret_block

 .inner_loop_body:
-  %descriptor = load <4 x i32>, <4 x i32> addrspace(2)* %descptr, align 16, !invariant.load !0
+  %descriptor = load <4 x i32>, <4 x i32> addrspace(4)* %descptr, align 16, !invariant.load !0
  %load1result = call float @llvm.SI.load.const.v4i32(<4 x i32> %descriptor, i32 0)
  %inner_br2 = icmp uge i32 %1, 10
  br i1 %inner_br2, label %.inner_loop_header, label %.outer_loop_body
--- a/test/CodeGen/AMDGPU/spill-m0.ll
+++ b/test/CodeGen/AMDGPU/spill-m0.ll
@ -87,7 +87,7 @@ endif:
 ; GCN-NOT: v_readlane_b32 m0
 ; GCN-NOT: s_buffer_store_dword m0
 ; GCN-NOT: s_buffer_load_dword m0
-define amdgpu_ps void @spill_kill_m0_lds(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, i32 inreg %m0) #0 {
+define amdgpu_ps void @spill_kill_m0_lds(<16 x i8> addrspace(4)* inreg %arg, <16 x i8> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %m0) #0 {
 main_body:
  %tmp = call float @llvm.amdgcn.interp.mov(i32 2, i32 0, i32 0, i32 %m0)
  %cmp = fcmp ueq float 0.000000e+00, %tmp
@ -191,7 +191,7 @@ endif:
 ; TOSMEM: s_endpgm
 define amdgpu_kernel void @restore_m0_lds(i32 %arg) {
  %m0 = call i32 asm sideeffect "s_mov_b32 m0, 0", "={M0}"() #0
-  %sval = load volatile i64, i64 addrspace(2)* undef
+  %sval = load volatile i64, i64 addrspace(4)* undef
  %cmp = icmp eq i32 %arg, 0
  br i1 %cmp, label %ret, label %bb

--- a/test/CodeGen/AMDGPU/split-smrd.ll
+++ b/test/CodeGen/AMDGPU/split-smrd.ll
@ -6,7 +6,7 @@

 ; GCN-LABEL: {{^}}split_smrd_add_worklist:
 ; GCN: image_sample v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x1
-define amdgpu_ps void @split_smrd_add_worklist([34 x <8 x i32>] addrspace(2)* byval %arg) #0 {
+define amdgpu_ps void @split_smrd_add_worklist([34 x <8 x i32>] addrspace(4)* byval %arg) #0 {
 bb:
  %tmp = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 96)
  %tmp1 = bitcast float %tmp to i32
@ -19,8 +19,8 @@ bb3:                                              ; preds = %bb
  %tmp4 = bitcast float %tmp to i32
  %tmp5 = add i32 %tmp4, 4
  %tmp6 = sext i32 %tmp5 to i64
-  %tmp7 = getelementptr [34 x <8 x i32>], [34 x <8 x i32>] addrspace(2)* %arg, i64 0, i64 %tmp6
-  %tmp8 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp7, align 32, !tbaa !0
+  %tmp7 = getelementptr [34 x <8 x i32>], [34 x <8 x i32>] addrspace(4)* %arg, i64 0, i64 %tmp6
+  %tmp8 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp7, align 32, !tbaa !0
  %tmp9 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> <float bitcast (i32 1061158912 to float), float bitcast (i32 1048576000 to float)>, <8 x i32> %tmp8, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false)
  %tmp10 = extractelement <4 x float> %tmp9, i32 0
  %tmp12 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %tmp10, float undef)
--- a/test/CodeGen/AMDGPU/store-global.ll
+++ b/test/CodeGen/AMDGPU/store-global.ll
@ -394,11 +394,11 @@ entry:

 ; SIVI: buffer_store_dwordx2
 ; GFX9: global_store_dwordx2
-define amdgpu_kernel void @vecload2(i32 addrspace(1)* nocapture %out, i32 addrspace(2)* nocapture %mem) #0 {
+define amdgpu_kernel void @vecload2(i32 addrspace(1)* nocapture %out, i32 addrspace(4)* nocapture %mem) #0 {
 entry:
-  %0 = load i32, i32 addrspace(2)* %mem, align 4
-  %arrayidx1.i = getelementptr inbounds i32, i32 addrspace(2)* %mem, i64 1
-  %1 = load i32, i32 addrspace(2)* %arrayidx1.i, align 4
+  %0 = load i32, i32 addrspace(4)* %mem, align 4
+  %arrayidx1.i = getelementptr inbounds i32, i32 addrspace(4)* %mem, i64 1
+  %1 = load i32, i32 addrspace(4)* %arrayidx1.i, align 4
  store i32 %0, i32 addrspace(1)* %out, align 4
  %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
  store i32 %1, i32 addrspace(1)* %arrayidx1, align 4
--- a/test/CodeGen/AMDGPU/store-private.ll
+++ b/test/CodeGen/AMDGPU/store-private.ll
@ -689,11 +689,11 @@ entry:
 ; XSI: buffer_store_dwordx2
 ; SI: buffer_store_dword
 ; SI: buffer_store_dword
-define amdgpu_kernel void @vecload2(i32 addrspace(5)* nocapture %out, i32 addrspace(2)* nocapture %mem) #0 {
+define amdgpu_kernel void @vecload2(i32 addrspace(5)* nocapture %out, i32 addrspace(4)* nocapture %mem) #0 {
 entry:
-  %0 = load i32, i32 addrspace(2)* %mem, align 4
-  %arrayidx1.i = getelementptr inbounds i32, i32 addrspace(2)* %mem, i64 1
-  %1 = load i32, i32 addrspace(2)* %arrayidx1.i, align 4
+  %0 = load i32, i32 addrspace(4)* %mem, align 4
+  %arrayidx1.i = getelementptr inbounds i32, i32 addrspace(4)* %mem, i64 1
+  %1 = load i32, i32 addrspace(4)* %arrayidx1.i, align 4
  store i32 %0, i32 addrspace(5)* %out, align 4
  %arrayidx1 = getelementptr inbounds i32, i32 addrspace(5)* %out, i64 1
  store i32 %1, i32 addrspace(5)* %arrayidx1, align 4
--- a/test/CodeGen/AMDGPU/sub.v2i16.ll
+++ b/test/CodeGen/AMDGPU/sub.v2i16.ll
@ -27,9 +27,9 @@ define amdgpu_kernel void @v_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i

 ; VI: s_sub_i32
 ; VI: s_sub_i32
-define amdgpu_kernel void @s_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %in0, <2 x i16> addrspace(2)* %in1) #1 {
-  %a = load <2 x i16>, <2 x i16> addrspace(2)* %in0
-  %b = load <2 x i16>, <2 x i16> addrspace(2)* %in1
+define amdgpu_kernel void @s_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %in0, <2 x i16> addrspace(4)* %in1) #1 {
+  %a = load <2 x i16>, <2 x i16> addrspace(4)* %in0
+  %b = load <2 x i16>, <2 x i16> addrspace(4)* %in1
  %add = sub <2 x i16> %a, %b
  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
  ret void
@ -38,8 +38,8 @@ define amdgpu_kernel void @s_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i
 ; GCN-LABEL: {{^}}s_test_sub_self_v2i16:
 ; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]]
 ; GCN: buffer_store_dword [[ZERO]]
-define amdgpu_kernel void @s_test_sub_self_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %in0) #1 {
-  %a = load <2 x i16>, <2 x i16> addrspace(2)* %in0
+define amdgpu_kernel void @s_test_sub_self_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %in0) #1 {
+  %a = load <2 x i16>, <2 x i16> addrspace(4)* %in0
  %add = sub <2 x i16> %a, %a
  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
  ret void
--- a/test/CodeGen/AMDGPU/target-cpu.ll
+++ b/test/CodeGen/AMDGPU/target-cpu.ll
@ -1,6 +1,6 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s

-declare i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() #1
+declare i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() #1

 declare i32 @llvm.amdgcn.workitem.id.x() #1

@ -15,10 +15,10 @@ declare void @llvm.amdgcn.s.dcache.wb() #0
 ; CHECK: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, [[OFFSETREG]]
 ; CHECK: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64
 define amdgpu_kernel void @target_none() #0 {
-  %kernargs = call i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr()
-  %kernargs.gep = getelementptr inbounds i8, i8 addrspace(2)* %kernargs, i64 1024
-  %kernargs.gep.cast = bitcast i8 addrspace(2)* %kernargs.gep to i32 addrspace(1)* addrspace(2)*
-  %ptr = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(2)* %kernargs.gep.cast
+  %kernargs = call i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+  %kernargs.gep = getelementptr inbounds i8, i8 addrspace(4)* %kernargs, i64 1024
+  %kernargs.gep.cast = bitcast i8 addrspace(4)* %kernargs.gep to i32 addrspace(1)* addrspace(4)*
+  %ptr = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* %kernargs.gep.cast
  %id = call i32 @llvm.amdgcn.workitem.id.x()
  %id.ext = sext i32 %id to i64
  %gep = getelementptr inbounds i32, i32 addrspace(1)* %ptr, i64 %id.ext
@ -31,10 +31,10 @@ define amdgpu_kernel void @target_none() #0 {
 ; CHECK: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, [[OFFSETREG]]
 ; CHECK: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64
 define amdgpu_kernel void @target_tahiti() #1 {
-  %kernargs = call i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr()
-  %kernargs.gep = getelementptr inbounds i8, i8 addrspace(2)* %kernargs, i64 1024
-  %kernargs.gep.cast = bitcast i8 addrspace(2)* %kernargs.gep to i32 addrspace(1)* addrspace(2)*
-  %ptr = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(2)* %kernargs.gep.cast
+  %kernargs = call i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+  %kernargs.gep = getelementptr inbounds i8, i8 addrspace(4)* %kernargs, i64 1024
+  %kernargs.gep.cast = bitcast i8 addrspace(4)* %kernargs.gep to i32 addrspace(1)* addrspace(4)*
+  %ptr = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* %kernargs.gep.cast
  %id = call i32 @llvm.amdgcn.workitem.id.x()
  %id.ext = sext i32 %id to i64
  %gep = getelementptr inbounds i32, i32 addrspace(1)* %ptr, i64 %id.ext
@ -47,10 +47,10 @@ define amdgpu_kernel void @target_tahiti() #1 {
 ; CHECK: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64
 ; CHECK: s_dcache_inv_vol
 define amdgpu_kernel void @target_bonaire() #3 {
-  %kernargs = call i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr()
-  %kernargs.gep = getelementptr inbounds i8, i8 addrspace(2)* %kernargs, i64 1024
-  %kernargs.gep.cast = bitcast i8 addrspace(2)* %kernargs.gep to i32 addrspace(1)* addrspace(2)*
-  %ptr = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(2)* %kernargs.gep.cast
+  %kernargs = call i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+  %kernargs.gep = getelementptr inbounds i8, i8 addrspace(4)* %kernargs, i64 1024
+  %kernargs.gep.cast = bitcast i8 addrspace(4)* %kernargs.gep to i32 addrspace(1)* addrspace(4)*
+  %ptr = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* %kernargs.gep.cast
  %id = call i32 @llvm.amdgcn.workitem.id.x()
  %id.ext = sext i32 %id to i64
  %gep = getelementptr inbounds i32, i32 addrspace(1)* %ptr, i64 %id.ext
@ -64,10 +64,10 @@ define amdgpu_kernel void @target_bonaire() #3 {
 ; CHECK: flat_store_dword
 ; CHECK: s_dcache_wb{{$}}
 define amdgpu_kernel void @target_fiji() #4 {
-  %kernargs = call i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr()
-  %kernargs.gep = getelementptr inbounds i8, i8 addrspace(2)* %kernargs, i64 1024
-  %kernargs.gep.cast = bitcast i8 addrspace(2)* %kernargs.gep to i32 addrspace(1)* addrspace(2)*
-  %ptr = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(2)* %kernargs.gep.cast
+  %kernargs = call i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+  %kernargs.gep = getelementptr inbounds i8, i8 addrspace(4)* %kernargs, i64 1024
+  %kernargs.gep.cast = bitcast i8 addrspace(4)* %kernargs.gep to i32 addrspace(1)* addrspace(4)*
+  %ptr = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* %kernargs.gep.cast
  %id = call i32 @llvm.amdgcn.workitem.id.x()
  %id.ext = sext i32 %id to i64
  %gep = getelementptr inbounds i32, i32 addrspace(1)* %ptr, i64 %id.ext
--- a/test/CodeGen/AMDGPU/unaligned-load-store.ll
+++ b/test/CodeGen/AMDGPU/unaligned-load-store.ll
@ -418,8 +418,8 @@ define amdgpu_kernel void @local_store_i64_align_4_with_split_offset(i64 addrspa
 ; UNALIGNED: s_load_dword

 ; SI: buffer_store_dword
-define amdgpu_kernel void @constant_unaligned_load_i32(i32 addrspace(2)* %p, i32 addrspace(1)* %r) #0 {
-  %v = load i32, i32 addrspace(2)* %p, align 1
+define amdgpu_kernel void @constant_unaligned_load_i32(i32 addrspace(4)* %p, i32 addrspace(1)* %r) #0 {
+  %v = load i32, i32 addrspace(4)* %p, align 1
  store i32 %v, i32 addrspace(1)* %r, align 4
  ret void
 }
@ -430,8 +430,8 @@ define amdgpu_kernel void @constant_unaligned_load_i32(i32 addrspace(2)* %p, i32

 ; UNALIGNED: s_load_dword
 ; UNALIGNED: buffer_store_dword
-define amdgpu_kernel void @constant_align2_load_i32(i32 addrspace(2)* %p, i32 addrspace(1)* %r) #0 {
-  %v = load i32, i32 addrspace(2)* %p, align 2
+define amdgpu_kernel void @constant_align2_load_i32(i32 addrspace(4)* %p, i32 addrspace(1)* %r) #0 {
+  %v = load i32, i32 addrspace(4)* %p, align 2
  store i32 %v, i32 addrspace(1)* %r, align 4
  ret void
 }
@ -444,8 +444,8 @@ define amdgpu_kernel void @constant_align2_load_i32(i32 addrspace(2)* %p, i32 ad

 ; UNALIGNED: s_load_dwordx2
 ; UNALIGNED: buffer_store_dwordx2
-define amdgpu_kernel void @constant_align2_load_i64(i64 addrspace(2)* %p, i64 addrspace(1)* %r) #0 {
-  %v = load i64, i64 addrspace(2)* %p, align 2
+define amdgpu_kernel void @constant_align2_load_i64(i64 addrspace(4)* %p, i64 addrspace(1)* %r) #0 {
+  %v = load i64, i64 addrspace(4)* %p, align 2
  store i64 %v, i64 addrspace(1)* %r, align 4
  ret void
 }
@ -453,8 +453,8 @@ define amdgpu_kernel void @constant_align2_load_i64(i64 addrspace(2)* %p, i64 ad
 ; SI-LABEL: {{^}}constant_align4_load_i64:
 ; SI: s_load_dwordx2
 ; SI: buffer_store_dwordx2
-define amdgpu_kernel void @constant_align4_load_i64(i64 addrspace(2)* %p, i64 addrspace(1)* %r) #0 {
-  %v = load i64, i64 addrspace(2)* %p, align 4
+define amdgpu_kernel void @constant_align4_load_i64(i64 addrspace(4)* %p, i64 addrspace(1)* %r) #0 {
+  %v = load i64, i64 addrspace(4)* %p, align 4
  store i64 %v, i64 addrspace(1)* %r, align 4
  ret void
 }
@ -462,8 +462,8 @@ define amdgpu_kernel void @constant_align4_load_i64(i64 addrspace(2)* %p, i64 ad
 ; SI-LABEL: {{^}}constant_align4_load_v4i32:
 ; SI: s_load_dwordx4
 ; SI: buffer_store_dwordx4
-define amdgpu_kernel void @constant_align4_load_v4i32(<4 x i32> addrspace(2)* %p, <4 x i32> addrspace(1)* %r) #0 {
-  %v = load <4 x i32>, <4 x i32> addrspace(2)* %p, align 4
+define amdgpu_kernel void @constant_align4_load_v4i32(<4 x i32> addrspace(4)* %p, <4 x i32> addrspace(1)* %r) #0 {
+  %v = load <4 x i32>, <4 x i32> addrspace(4)* %p, align 4
  store <4 x i32> %v, <4 x i32> addrspace(1)* %r, align 4
  ret void
 }
@ -482,8 +482,8 @@ define amdgpu_kernel void @constant_align4_load_v4i32(<4 x i32> addrspace(2)* %p
 ; UNALIGNED: buffer_load_dwordx2

 ; SI: buffer_store_dwordx2
-define amdgpu_kernel void @constant_unaligned_load_v2i32(<2 x i32> addrspace(2)* %p, <2 x i32> addrspace(1)* %r) #0 {
-  %v = load <2 x i32>, <2 x i32> addrspace(2)* %p, align 1
+define amdgpu_kernel void @constant_unaligned_load_v2i32(<2 x i32> addrspace(4)* %p, <2 x i32> addrspace(1)* %r) #0 {
+  %v = load <2 x i32>, <2 x i32> addrspace(4)* %p, align 1
  store <2 x i32> %v, <2 x i32> addrspace(1)* %r, align 4
  ret void
 }
@ -512,8 +512,8 @@ define amdgpu_kernel void @constant_unaligned_load_v2i32(<2 x i32> addrspace(2)*
 ; UNALIGNED: buffer_load_dwordx4

 ; SI: buffer_store_dwordx4
-define amdgpu_kernel void @constant_unaligned_load_v4i32(<4 x i32> addrspace(2)* %p, <4 x i32> addrspace(1)* %r) #0 {
-  %v = load <4 x i32>, <4 x i32> addrspace(2)* %p, align 1
+define amdgpu_kernel void @constant_unaligned_load_v4i32(<4 x i32> addrspace(4)* %p, <4 x i32> addrspace(1)* %r) #0 {
+  %v = load <4 x i32>, <4 x i32> addrspace(4)* %p, align 1
  store <4 x i32> %v, <4 x i32> addrspace(1)* %r, align 4
  ret void
 }
@ -521,8 +521,8 @@ define amdgpu_kernel void @constant_unaligned_load_v4i32(<4 x i32> addrspace(2)*
 ; SI-LABEL: {{^}}constant_align4_load_i8:
 ; SI: s_load_dword
 ; SI: buffer_store_byte
-define amdgpu_kernel void @constant_align4_load_i8(i8 addrspace(2)* %p, i8 addrspace(1)* %r) #0 {
-  %v = load i8, i8 addrspace(2)* %p, align 4
+define amdgpu_kernel void @constant_align4_load_i8(i8 addrspace(4)* %p, i8 addrspace(1)* %r) #0 {
+  %v = load i8, i8 addrspace(4)* %p, align 4
  store i8 %v, i8 addrspace(1)* %r, align 4
  ret void
 }
@ -530,8 +530,8 @@ define amdgpu_kernel void @constant_align4_load_i8(i8 addrspace(2)* %p, i8 addrs
 ; SI-LABEL: {{^}}constant_align2_load_i8:
 ; SI: buffer_load_ubyte
 ; SI: buffer_store_byte
-define amdgpu_kernel void @constant_align2_load_i8(i8 addrspace(2)* %p, i8 addrspace(1)* %r) #0 {
-  %v = load i8, i8 addrspace(2)* %p, align 2
+define amdgpu_kernel void @constant_align2_load_i8(i8 addrspace(4)* %p, i8 addrspace(1)* %r) #0 {
+  %v = load i8, i8 addrspace(4)* %p, align 2
  store i8 %v, i8 addrspace(1)* %r, align 2
  ret void
 }
@ -541,10 +541,10 @@ define amdgpu_kernel void @constant_align2_load_i8(i8 addrspace(2)* %p, i8 addrs
 ; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[LO]]
 ; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[HI]]
 ; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
-define amdgpu_kernel void @constant_align4_merge_load_2_i32(i32 addrspace(2)* %p, i32 addrspace(1)* %r) #0 {
-  %gep0 = getelementptr i32, i32 addrspace(2)* %p, i64 1
-  %v0 = load i32, i32 addrspace(2)* %p, align 4
-  %v1 = load i32, i32 addrspace(2)* %gep0, align 4
+define amdgpu_kernel void @constant_align4_merge_load_2_i32(i32 addrspace(4)* %p, i32 addrspace(1)* %r) #0 {
+  %gep0 = getelementptr i32, i32 addrspace(4)* %p, i64 1
+  %v0 = load i32, i32 addrspace(4)* %p, align 4
+  %v1 = load i32, i32 addrspace(4)* %gep0, align 4

  %gep1 = getelementptr i32, i32 addrspace(1)* %r, i64 1
  store i32 %v0, i32 addrspace(1)* %r, align 4
--- a/test/CodeGen/AMDGPU/uniform-crash.ll
+++ b/test/CodeGen/AMDGPU/uniform-crash.ll
@ -35,7 +35,7 @@ bb2:                                              ; preds = %bb
  br label %bb3

 bb3:                                              ; preds = %bb3, %bb2
-  %val = load volatile i32, i32 addrspace(2)* undef
+  %val = load volatile i32, i32 addrspace(4)* undef
  %tmp4 = icmp eq i32 %val, %arg1
  br i1 %tmp4, label %bb5, label %bb3

--- a/test/CodeGen/AMDGPU/vertex-fetch-encoding.ll
+++ b/test/CodeGen/AMDGPU/vertex-fetch-encoding.ll
@ -36,11 +36,11 @@ define amdgpu_kernel void @vtx_fetch32_id3(i32 addrspace(1)* %out, i32 addrspace
 ; EG: VTX_READ_32 T[[GPR:[0-9]]].X, T[[GPR]].X, 0, #2 ; encoding: [0x40,0x02,0x0[[GPR]],0x10,0x0[[GPR]],0xf0,0x5f,0x13,0x00,0x00,0x08,0x00
 ; CM: VTX_READ_32 T[[GPR:[0-9]]].X, T[[GPR]].X, 0, #2 ; encoding: [0x40,0x02,0x0[[GPR]],0x00,0x0[[GPR]],0xf0,0x5f,0x13,0x00,0x00,0x00,0x00

-@t = internal addrspace(2) constant [4 x i32] [i32 0, i32 1, i32 2, i32 3]
+@t = internal addrspace(4) constant [4 x i32] [i32 0, i32 1, i32 2, i32 3]

 define amdgpu_kernel void @vtx_fetch32_id2(i32 addrspace(1)* %out, i32 %in) {
-  %a = getelementptr inbounds [4 x i32], [4 x i32] addrspace(2)* @t, i32 0, i32 %in
-  %v = load i32, i32 addrspace(2)* %a
+  %a = getelementptr inbounds [4 x i32], [4 x i32] addrspace(4)* @t, i32 0, i32 %in
+  %v = load i32, i32 addrspace(4)* %a
  store i32 %v, i32 addrspace(1)* %out
  ret void
 }
--- a/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
+++ b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
@ -27,15 +27,15 @@
 ; GCN: NumVgprs: 256
 ; GCN: ScratchSize: 1536

-define amdgpu_vs void @main([9 x <4 x i32>] addrspace(2)* byval %arg, [17 x <4 x i32>] addrspace(2)* byval %arg1, [17 x <4 x i32>] addrspace(2)* byval %arg2, [34 x <8 x i32>] addrspace(2)* byval %arg3, [16 x <4 x i32>] addrspace(2)* byval %arg4, i32 inreg %arg5, i32 inreg %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10) #0 {
+define amdgpu_vs void @main([9 x <4 x i32>] addrspace(4)* byval %arg, [17 x <4 x i32>] addrspace(4)* byval %arg1, [17 x <4 x i32>] addrspace(4)* byval %arg2, [34 x <8 x i32>] addrspace(4)* byval %arg3, [16 x <4 x i32>] addrspace(4)* byval %arg4, i32 inreg %arg5, i32 inreg %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10) #0 {
 bb:
-  %tmp = getelementptr [17 x <4 x i32>], [17 x <4 x i32>] addrspace(2)* %arg1, i64 0, i64 0
-  %tmp11 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp, align 16, !tbaa !0
+  %tmp = getelementptr [17 x <4 x i32>], [17 x <4 x i32>] addrspace(4)* %arg1, i64 0, i64 0
+  %tmp11 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp, align 16, !tbaa !0
  %tmp12 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp11, i32 0)
  %tmp13 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp11, i32 16)
  %tmp14 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp11, i32 32)
-  %tmp15 = getelementptr [16 x <4 x i32>], [16 x <4 x i32>] addrspace(2)* %arg4, i64 0, i64 0
-  %tmp16 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp15, align 16, !tbaa !0
+  %tmp15 = getelementptr [16 x <4 x i32>], [16 x <4 x i32>] addrspace(4)* %arg4, i64 0, i64 0
+  %tmp16 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp15, align 16, !tbaa !0
  %tmp17 = add i32 %arg5, %arg7
  %tmp16.cast = bitcast <4 x i32> %tmp16 to <4 x i32>
  %tmp18 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %tmp16.cast, i32 %tmp17, i32 0, i1 false, i1 false)
--- a/test/CodeGen/AMDGPU/wait.ll
+++ b/test/CodeGen/AMDGPU/wait.ll
@ -11,19 +11,19 @@
 ; DEFAULT: exp
 ; DEFAULT: s_waitcnt lgkmcnt(0)
 ; DEFAULT: s_endpgm
-define amdgpu_vs void @main(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, <16 x i8> addrspace(2)* inreg %arg3, <16 x i8> addrspace(2)* inreg %arg4, i32 inreg %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9, float addrspace(2)* inreg %constptr) #0 {
+define amdgpu_vs void @main(<16 x i8> addrspace(4)* inreg %arg, <16 x i8> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, <16 x i8> addrspace(4)* inreg %arg3, <16 x i8> addrspace(4)* inreg %arg4, i32 inreg %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9, float addrspace(4)* inreg %constptr) #0 {
 main_body:
-  %tmp = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg3, i32 0
-  %tmp10 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0
+  %tmp = getelementptr <16 x i8>, <16 x i8> addrspace(4)* %arg3, i32 0
+  %tmp10 = load <16 x i8>, <16 x i8> addrspace(4)* %tmp, !tbaa !0
  %tmp10.cast = bitcast <16 x i8> %tmp10 to <4 x i32>
  %tmp11 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %tmp10.cast, i32 %arg6, i32 0, i1 false, i1 false)
  %tmp12 = extractelement <4 x float> %tmp11, i32 0
  %tmp13 = extractelement <4 x float> %tmp11, i32 1
  call void @llvm.amdgcn.s.barrier() #1
  %tmp14 = extractelement <4 x float> %tmp11, i32 2
-  %tmp15 = load float, float addrspace(2)* %constptr, align 4
-  %tmp16 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg3, i32 1
-  %tmp17 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp16, !tbaa !0
+  %tmp15 = load float, float addrspace(4)* %constptr, align 4
+  %tmp16 = getelementptr <16 x i8>, <16 x i8> addrspace(4)* %arg3, i32 1
+  %tmp17 = load <16 x i8>, <16 x i8> addrspace(4)* %tmp16, !tbaa !0
  %tmp17.cast = bitcast <16 x i8> %tmp17 to <4 x i32>
  %tmp18 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %tmp17.cast, i32 %arg6, i32 0, i1 false, i1 false)
  %tmp19 = extractelement <4 x float> %tmp18, i32 0
@ -46,10 +46,10 @@ main_body:
 ; ILPMAX: exp pos0
 ; ILPMAX-NEXT: exp param0
 ; ILPMAX: s_endpgm
-define amdgpu_vs void @main2([6 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [17 x <4 x i32>] addrspace(2)* byval %arg2, [34 x <8 x i32>] addrspace(2)* byval %arg3, [16 x <16 x i8>] addrspace(2)* byval %arg4, i32 inreg %arg5, i32 inreg %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10) #0 {
+define amdgpu_vs void @main2([6 x <16 x i8>] addrspace(4)* byval %arg, [17 x <16 x i8>] addrspace(4)* byval %arg1, [17 x <4 x i32>] addrspace(4)* byval %arg2, [34 x <8 x i32>] addrspace(4)* byval %arg3, [16 x <16 x i8>] addrspace(4)* byval %arg4, i32 inreg %arg5, i32 inreg %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10) #0 {
 main_body:
-  %tmp = getelementptr [16 x <16 x i8>], [16 x <16 x i8>] addrspace(2)* %arg4, i64 0, i64 0
-  %tmp11 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, align 16, !tbaa !0
+  %tmp = getelementptr [16 x <16 x i8>], [16 x <16 x i8>] addrspace(4)* %arg4, i64 0, i64 0
+  %tmp11 = load <16 x i8>, <16 x i8> addrspace(4)* %tmp, align 16, !tbaa !0
  %tmp12 = add i32 %arg5, %arg7
  %tmp11.cast = bitcast <16 x i8> %tmp11 to <4 x i32>
  %tmp13 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %tmp11.cast, i32 %tmp12, i32 0, i1 false, i1 false)
@ -57,8 +57,8 @@ main_body:
  %tmp15 = extractelement <4 x float> %tmp13, i32 1
  %tmp16 = extractelement <4 x float> %tmp13, i32 2
  %tmp17 = extractelement <4 x float> %tmp13, i32 3
-  %tmp18 = getelementptr [16 x <16 x i8>], [16 x <16 x i8>] addrspace(2)* %arg4, i64 0, i64 1
-  %tmp19 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp18, align 16, !tbaa !0
+  %tmp18 = getelementptr [16 x <16 x i8>], [16 x <16 x i8>] addrspace(4)* %arg4, i64 0, i64 1
+  %tmp19 = load <16 x i8>, <16 x i8> addrspace(4)* %tmp18, align 16, !tbaa !0
  %tmp20 = add i32 %arg5, %arg7
  %tmp19.cast = bitcast <16 x i8> %tmp19 to <4 x i32>
  %tmp21 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %tmp19.cast, i32 %tmp20, i32 0, i1 false, i1 false)
--- a/test/CodeGen/AMDGPU/waitcnt-looptest.ll
+++ b/test/CodeGen/AMDGPU/waitcnt-looptest.ll
@ -22,19 +22,19 @@ bb:
  br label %bb18

 bb1:                                              ; preds = %bb18
-  %tmp = tail call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr()
+  %tmp = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
  %tmp2 = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp3 = tail call i32 @llvm.amdgcn.workgroup.id.x()
-  %tmp4 = getelementptr inbounds i8, i8 addrspace(2)* %tmp, i64 4
-  %tmp5 = bitcast i8 addrspace(2)* %tmp4 to i16 addrspace(2)*
-  %tmp6 = load i16, i16 addrspace(2)* %tmp5, align 4
+  %tmp4 = getelementptr inbounds i8, i8 addrspace(4)* %tmp, i64 4
+  %tmp5 = bitcast i8 addrspace(4)* %tmp4 to i16 addrspace(4)*
+  %tmp6 = load i16, i16 addrspace(4)* %tmp5, align 4
  %tmp7 = zext i16 %tmp6 to i32
  %tmp8 = mul i32 %tmp3, %tmp7
  %tmp9 = add i32 %tmp8, %tmp2
-  %tmp10 = tail call i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr()
+  %tmp10 = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
  %tmp11 = zext i32 %tmp9 to i64
-  %tmp12 = bitcast i8 addrspace(2)* %tmp10 to i64 addrspace(2)*
-  %tmp13 = load i64, i64 addrspace(2)* %tmp12, align 8
+  %tmp12 = bitcast i8 addrspace(4)* %tmp10 to i64 addrspace(4)*
+  %tmp13 = load i64, i64 addrspace(4)* %tmp12, align 8
  %tmp14 = add i64 %tmp13, %tmp11
  %tmp15 = zext i1 %tmp99 to i32
  %tmp16 = and i64 %tmp14, 4294967295
@ -131,7 +131,7 @@ bb18:                                             ; preds = %bb18, %bb
 }

 ; Function Attrs: nounwind readnone speculatable
-declare i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #1
+declare i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #1

 ; Function Attrs: nounwind readnone speculatable
 declare i32 @llvm.amdgcn.workitem.id.x() #1
@ -140,7 +140,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1
 declare i32 @llvm.amdgcn.workgroup.id.x() #1

 ; Function Attrs: nounwind readnone speculatable
-declare i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr() #1
+declare i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() #1

 attributes #0 = { "target-cpu"="fiji" "target-features"="-flat-for-global" }
 attributes #1 = { nounwind readnone speculatable }
--- a/test/CodeGen/AMDGPU/widen_extending_scalar_loads.ll
+++ b/test/CodeGen/AMDGPU/widen_extending_scalar_loads.ll
@ -1,12 +1,12 @@
 ; RUN: opt -S -mtriple=amdgcn-- -amdgpu-codegenprepare < %s | FileCheck -check-prefix=OPT %s

-declare i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0
+declare i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #0

 ; OPT-LABEL: @constant_load_i1
 ; OPT: load i1
 ; OPT-NEXT: store i1
-define amdgpu_kernel void @constant_load_i1(i1 addrspace(1)* %out, i1 addrspace(2)* %in) #0 {
-  %val = load i1, i1 addrspace(2)* %in
+define amdgpu_kernel void @constant_load_i1(i1 addrspace(1)* %out, i1 addrspace(4)* %in) #0 {
+  %val = load i1, i1 addrspace(4)* %in
  store i1 %val, i1 addrspace(1)* %out
  ret void
 }
@ -14,8 +14,8 @@ define amdgpu_kernel void @constant_load_i1(i1 addrspace(1)* %out, i1 addrspace(
 ; OPT-LABEL: @constant_load_i1_align2
 ; OPT: load i1
 ; OPT-NEXT: store
-define amdgpu_kernel void @constant_load_i1_align2(i1 addrspace(1)* %out, i1 addrspace(2)* %in) #0 {
-  %val = load i1, i1 addrspace(2)* %in, align 2
+define amdgpu_kernel void @constant_load_i1_align2(i1 addrspace(1)* %out, i1 addrspace(4)* %in) #0 {
+  %val = load i1, i1 addrspace(4)* %in, align 2
  store i1 %val, i1 addrspace(1)* %out, align 2
  ret void
 }
@ -25,8 +25,8 @@ define amdgpu_kernel void @constant_load_i1_align2(i1 addrspace(1)* %out, i1 add
 ; OPT-NEXT: load i32
 ; OPT-NEXT: trunc
 ; OPT-NEXT: store
-define amdgpu_kernel void @constant_load_i1_align4(i1 addrspace(1)* %out, i1 addrspace(2)* %in) #0 {
-  %val = load i1, i1 addrspace(2)* %in, align 4
+define amdgpu_kernel void @constant_load_i1_align4(i1 addrspace(1)* %out, i1 addrspace(4)* %in) #0 {
+  %val = load i1, i1 addrspace(4)* %in, align 4
  store i1 %val, i1 addrspace(1)* %out, align 4
  ret void
 }
@ -34,8 +34,8 @@ define amdgpu_kernel void @constant_load_i1_align4(i1 addrspace(1)* %out, i1 add
 ; OPT-LABEL: @constant_load_i8
 ; OPT: load i8
 ; OPT-NEXT: store
-define amdgpu_kernel void @constant_load_i8(i8 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {
-  %val = load i8, i8 addrspace(2)* %in
+define amdgpu_kernel void @constant_load_i8(i8 addrspace(1)* %out, i8 addrspace(4)* %in) #0 {
+  %val = load i8, i8 addrspace(4)* %in
  store i8 %val, i8 addrspace(1)* %out
  ret void
 }
@ -43,8 +43,8 @@ define amdgpu_kernel void @constant_load_i8(i8 addrspace(1)* %out, i8 addrspace(
 ; OPT-LABEL: @constant_load_i8_align2
 ; OPT: load i8
 ; OPT-NEXT: store
-define amdgpu_kernel void @constant_load_i8_align2(i8 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {
-  %val = load i8, i8 addrspace(2)* %in, align 2
+define amdgpu_kernel void @constant_load_i8_align2(i8 addrspace(1)* %out, i8 addrspace(4)* %in) #0 {
+  %val = load i8, i8 addrspace(4)* %in, align 2
  store i8 %val, i8 addrspace(1)* %out, align 2
  ret void
 }
@ -54,8 +54,8 @@ define amdgpu_kernel void @constant_load_i8_align2(i8 addrspace(1)* %out, i8 add
 ; OPT-NEXT: load i32
 ; OPT-NEXT: trunc
 ; OPT-NEXT: store
-define amdgpu_kernel void @constant_load_i8align4(i8 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {
-  %val = load i8, i8 addrspace(2)* %in, align 4
+define amdgpu_kernel void @constant_load_i8align4(i8 addrspace(1)* %out, i8 addrspace(4)* %in) #0 {
+  %val = load i8, i8 addrspace(4)* %in, align 4
  store i8 %val, i8 addrspace(1)* %out, align 4
  ret void
 }
@ -64,8 +64,8 @@ define amdgpu_kernel void @constant_load_i8align4(i8 addrspace(1)* %out, i8 addr
 ; OPT-LABEL: @constant_load_v2i8
 ; OPT: load <2 x i8>
 ; OPT-NEXT: store
-define amdgpu_kernel void @constant_load_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 {
-  %ld = load <2 x i8>, <2 x i8> addrspace(2)* %in
+define amdgpu_kernel void @constant_load_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(4)* %in) #0 {
+  %ld = load <2 x i8>, <2 x i8> addrspace(4)* %in
  store <2 x i8> %ld, <2 x i8> addrspace(1)* %out
  ret void
 }
@ -76,32 +76,32 @@ define amdgpu_kernel void @constant_load_v2i8(<2 x i8> addrspace(1)* %out, <2 x
 ; OPT-NEXT: trunc
 ; OPT-NEXT: bitcast
 ; OPT-NEXT: store
-define amdgpu_kernel void @constant_load_v2i8_align4(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 {
-  %ld = load <2 x i8>, <2 x i8> addrspace(2)* %in, align 4
+define amdgpu_kernel void @constant_load_v2i8_align4(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(4)* %in) #0 {
+  %ld = load <2 x i8>, <2 x i8> addrspace(4)* %in, align 4
  store <2 x i8> %ld, <2 x i8> addrspace(1)* %out, align 4
  ret void
 }

 ; OPT-LABEL: @constant_load_v3i8
 ; OPT: bitcast <3 x i8>
-; OPT-NEXT: load i32, i32 addrspace(2)
+; OPT-NEXT: load i32, i32 addrspace(4)
 ; OPT-NEXT: trunc i32
 ; OPT-NEXT: bitcast i24
 ; OPT-NEXT: store <3 x i8>
-define amdgpu_kernel void @constant_load_v3i8(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(2)* %in) #0 {
-  %ld = load <3 x i8>, <3 x i8> addrspace(2)* %in
+define amdgpu_kernel void @constant_load_v3i8(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(4)* %in) #0 {
+  %ld = load <3 x i8>, <3 x i8> addrspace(4)* %in
  store <3 x i8> %ld, <3 x i8> addrspace(1)* %out
  ret void
 }

 ; OPT-LABEL: @constant_load_v3i8_align4
 ; OPT: bitcast <3 x i8>
-; OPT-NEXT: load i32, i32 addrspace(2)
+; OPT-NEXT: load i32, i32 addrspace(4)
 ; OPT-NEXT: trunc i32
 ; OPT-NEXT: bitcast i24
 ; OPT-NEXT: store <3 x i8>
-define amdgpu_kernel void @constant_load_v3i8_align4(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(2)* %in) #0 {
-  %ld = load <3 x i8>, <3 x i8> addrspace(2)* %in, align 4
+define amdgpu_kernel void @constant_load_v3i8_align4(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(4)* %in) #0 {
+  %ld = load <3 x i8>, <3 x i8> addrspace(4)* %in, align 4
  store <3 x i8> %ld, <3 x i8> addrspace(1)* %out, align 4
  ret void
 }
@ -110,8 +110,8 @@ define amdgpu_kernel void @constant_load_v3i8_align4(<3 x i8> addrspace(1)* %out
 ; OPT: load i16
 ; OPT: sext
 ; OPT-NEXT: store
-define amdgpu_kernel void @constant_load_i16(i32 addrspace(1)* %out, i16 addrspace(2)* %in) #0 {
-  %ld = load i16, i16 addrspace(2)* %in
+define amdgpu_kernel void @constant_load_i16(i32 addrspace(1)* %out, i16 addrspace(4)* %in) #0 {
+  %ld = load i16, i16 addrspace(4)* %in
  %ext = sext i16 %ld to i32
  store i32 %ext, i32 addrspace(1)* %out
  ret void
@ -123,8 +123,8 @@ define amdgpu_kernel void @constant_load_i16(i32 addrspace(1)* %out, i16 addrspa
 ; OPT-NEXT: trunc
 ; OPT-NEXT: sext
 ; OPT-NEXT: store
-define amdgpu_kernel void @constant_load_i16_align4(i32 addrspace(1)* %out, i16 addrspace(2)* %in) #0 {
-  %ld = load i16, i16 addrspace(2)* %in, align 4
+define amdgpu_kernel void @constant_load_i16_align4(i32 addrspace(1)* %out, i16 addrspace(4)* %in) #0 {
+  %ld = load i16, i16 addrspace(4)* %in, align 4
  %ext = sext i16 %ld to i32
  store i32 %ext, i32 addrspace(1)* %out, align 4
  ret void
@ -133,8 +133,8 @@ define amdgpu_kernel void @constant_load_i16_align4(i32 addrspace(1)* %out, i16
 ; OPT-LABEL: @constant_load_f16
 ; OPT: load half
 ; OPT-NEXT: store
-define amdgpu_kernel void @constant_load_f16(half addrspace(1)* %out, half addrspace(2)* %in) #0 {
-  %ld = load half, half addrspace(2)* %in
+define amdgpu_kernel void @constant_load_f16(half addrspace(1)* %out, half addrspace(4)* %in) #0 {
+  %ld = load half, half addrspace(4)* %in
  store half %ld, half addrspace(1)* %out
  ret void
 }
@ -142,8 +142,8 @@ define amdgpu_kernel void @constant_load_f16(half addrspace(1)* %out, half addrs
 ; OPT-LABEL: @constant_load_v2f16
 ; OPT: load <2 x half>
 ; OPT-NEXT: store
-define amdgpu_kernel void @constant_load_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(2)* %in) #0 {
-  %ld = load <2 x half>, <2 x half> addrspace(2)* %in
+define amdgpu_kernel void @constant_load_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(4)* %in) #0 {
+  %ld = load <2 x half>, <2 x half> addrspace(4)* %in
  store <2 x half> %ld, <2 x half> addrspace(1)* %out
  ret void
 }
@ -151,8 +151,8 @@ define amdgpu_kernel void @constant_load_v2f16(<2 x half> addrspace(1)* %out, <2
 ; OPT-LABEL: @load_volatile
 ; OPT: load volatile i16
 ; OPT-NEXT: store
-define amdgpu_kernel void @load_volatile(i16 addrspace(1)* %out, i16 addrspace(2)* %in) {
-  %a = load volatile i16, i16 addrspace(2)* %in
+define amdgpu_kernel void @load_volatile(i16 addrspace(1)* %out, i16 addrspace(4)* %in) {
+  %a = load volatile i16, i16 addrspace(4)* %in
  store i16 %a, i16 addrspace(1)* %out
  ret void
 }
@ -160,8 +160,8 @@ define amdgpu_kernel void @load_volatile(i16 addrspace(1)* %out, i16 addrspace(2
 ; OPT-LABEL: @constant_load_v2i8_volatile
 ; OPT: load volatile <2 x i8>
 ; OPT-NEXT: store
-define amdgpu_kernel void @constant_load_v2i8_volatile(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 {
-  %ld = load volatile <2 x i8>, <2 x i8> addrspace(2)* %in
+define amdgpu_kernel void @constant_load_v2i8_volatile(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(4)* %in) #0 {
+  %ld = load volatile <2 x i8>, <2 x i8> addrspace(4)* %in
  store <2 x i8> %ld, <2 x i8> addrspace(1)* %out
  ret void
 }
@ -182,8 +182,8 @@ define amdgpu_kernel void @constant_load_v2i8_addrspace1(<2 x i8> addrspace(1)*
 ; OPT-NEXT: zext
 ; OPT-NEXT: store
 define amdgpu_kernel void @use_dispatch_ptr(i32 addrspace(1)* %ptr) #1 {
-  %dispatch.ptr = call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr()
-  %val = load i8, i8 addrspace(2)* %dispatch.ptr, align 4
+  %dispatch.ptr = call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
+  %val = load i8, i8 addrspace(4)* %dispatch.ptr, align 4
  %ld = zext i8 %val to i32
  store i32 %ld, i32 addrspace(1)* %ptr
  ret void
--- a/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/split-gep-and-gvn-addrspace-addressing-modes.ll
+++ b/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/split-gep-and-gvn-addrspace-addressing-modes.ll
@ -2,64 +2,64 @@

 target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"

-@array = internal addrspace(2) constant [4096 x [32 x float]] zeroinitializer, align 4
+@array = internal addrspace(4) constant [4096 x [32 x float]] zeroinitializer, align 4

 ; IR-LABEL: @sum_of_array(
-; IR: [[BASE_PTR:%[a-zA-Z0-9]+]] = getelementptr [4096 x [32 x float]], [4096 x [32 x float]] addrspace(2)* @array, i64 0, i64 %{{[a-zA-Z0-9]+}}, i64 %{{[a-zA-Z0-9]+}}
-; IR: getelementptr inbounds float, float addrspace(2)* [[BASE_PTR]], i64 1
-; IR: getelementptr inbounds float, float addrspace(2)* [[BASE_PTR]], i64 32
-; IR: getelementptr inbounds float, float addrspace(2)* [[BASE_PTR]], i64 33
+; IR: [[BASE_PTR:%[a-zA-Z0-9]+]] = getelementptr [4096 x [32 x float]], [4096 x [32 x float]] addrspace(4)* @array, i64 0, i64 %{{[a-zA-Z0-9]+}}, i64 %{{[a-zA-Z0-9]+}}
+; IR: getelementptr inbounds float, float addrspace(4)* [[BASE_PTR]], i64 1
+; IR: getelementptr inbounds float, float addrspace(4)* [[BASE_PTR]], i64 32
+; IR: getelementptr inbounds float, float addrspace(4)* [[BASE_PTR]], i64 33
 define amdgpu_kernel void @sum_of_array(i32 %x, i32 %y, float addrspace(1)* nocapture %output) {
  %tmp = sext i32 %y to i64
  %tmp1 = sext i32 %x to i64
-  %tmp2 = getelementptr inbounds [4096 x [32 x float]], [4096 x [32 x float]] addrspace(2)* @array, i64 0, i64 %tmp1, i64 %tmp
-  %tmp4 = load float, float addrspace(2)* %tmp2, align 4
+  %tmp2 = getelementptr inbounds [4096 x [32 x float]], [4096 x [32 x float]] addrspace(4)* @array, i64 0, i64 %tmp1, i64 %tmp
+  %tmp4 = load float, float addrspace(4)* %tmp2, align 4
  %tmp5 = fadd float %tmp4, 0.000000e+00
  %tmp6 = add i32 %y, 1
  %tmp7 = sext i32 %tmp6 to i64
-  %tmp8 = getelementptr inbounds [4096 x [32 x float]], [4096 x [32 x float]] addrspace(2)* @array, i64 0, i64 %tmp1, i64 %tmp7
-  %tmp10 = load float, float addrspace(2)* %tmp8, align 4
+  %tmp8 = getelementptr inbounds [4096 x [32 x float]], [4096 x [32 x float]] addrspace(4)* @array, i64 0, i64 %tmp1, i64 %tmp7
+  %tmp10 = load float, float addrspace(4)* %tmp8, align 4
  %tmp11 = fadd float %tmp5, %tmp10
  %tmp12 = add i32 %x, 1
  %tmp13 = sext i32 %tmp12 to i64
-  %tmp14 = getelementptr inbounds [4096 x [32 x float]], [4096 x [32 x float]] addrspace(2)* @array, i64 0, i64 %tmp13, i64 %tmp
-  %tmp16 = load float, float addrspace(2)* %tmp14, align 4
+  %tmp14 = getelementptr inbounds [4096 x [32 x float]], [4096 x [32 x float]] addrspace(4)* @array, i64 0, i64 %tmp13, i64 %tmp
+  %tmp16 = load float, float addrspace(4)* %tmp14, align 4
  %tmp17 = fadd float %tmp11, %tmp16
-  %tmp18 = getelementptr inbounds [4096 x [32 x float]], [4096 x [32 x float]] addrspace(2)* @array, i64 0, i64 %tmp13, i64 %tmp7
-  %tmp20 = load float, float addrspace(2)* %tmp18, align 4
+  %tmp18 = getelementptr inbounds [4096 x [32 x float]], [4096 x [32 x float]] addrspace(4)* @array, i64 0, i64 %tmp13, i64 %tmp7
+  %tmp20 = load float, float addrspace(4)* %tmp18, align 4
  %tmp21 = fadd float %tmp17, %tmp20
  store float %tmp21, float addrspace(1)* %output, align 4
  ret void
 }

-@array2 = internal addrspace(2) constant [4096 x [4 x float]] zeroinitializer, align 4
+@array2 = internal addrspace(4) constant [4096 x [4 x float]] zeroinitializer, align 4

 ; Some of the indices go over the maximum mubuf offset, so don't split them.

 ; IR-LABEL: @sum_of_array_over_max_mubuf_offset(
-; IR: [[BASE_PTR:%[a-zA-Z0-9]+]] = getelementptr [4096 x [4 x float]], [4096 x [4 x float]] addrspace(2)* @array2, i64 0, i64 %{{[a-zA-Z0-9]+}}, i64 %{{[a-zA-Z0-9]+}}
-; IR: getelementptr inbounds float, float addrspace(2)* [[BASE_PTR]], i64 255
+; IR: [[BASE_PTR:%[a-zA-Z0-9]+]] = getelementptr [4096 x [4 x float]], [4096 x [4 x float]] addrspace(4)* @array2, i64 0, i64 %{{[a-zA-Z0-9]+}}, i64 %{{[a-zA-Z0-9]+}}
+; IR: getelementptr inbounds float, float addrspace(4)* [[BASE_PTR]], i64 255
 ; IR: add i32 %x, 256
-; IR: getelementptr inbounds [4096 x [4 x float]], [4096 x [4 x float]] addrspace(2)* @array2, i64 0, i64 %{{[a-zA-Z0-9]+}}, i64 %{{[a-zA-Z0-9]+}}
-; IR: getelementptr inbounds [4096 x [4 x float]], [4096 x [4 x float]] addrspace(2)* @array2, i64 0, i64 %{{[a-zA-Z0-9]+}}, i64 %{{[a-zA-Z0-9]+}}
+; IR: getelementptr inbounds [4096 x [4 x float]], [4096 x [4 x float]] addrspace(4)* @array2, i64 0, i64 %{{[a-zA-Z0-9]+}}, i64 %{{[a-zA-Z0-9]+}}
+; IR: getelementptr inbounds [4096 x [4 x float]], [4096 x [4 x float]] addrspace(4)* @array2, i64 0, i64 %{{[a-zA-Z0-9]+}}, i64 %{{[a-zA-Z0-9]+}}
 define amdgpu_kernel void @sum_of_array_over_max_mubuf_offset(i32 %x, i32 %y, float addrspace(1)* nocapture %output) {
  %tmp = sext i32 %y to i64
  %tmp1 = sext i32 %x to i64
-  %tmp2 = getelementptr inbounds [4096 x [4 x float]], [4096 x [4 x float]] addrspace(2)* @array2, i64 0, i64 %tmp1, i64 %tmp
-  %tmp4 = load float, float addrspace(2)* %tmp2, align 4
+  %tmp2 = getelementptr inbounds [4096 x [4 x float]], [4096 x [4 x float]] addrspace(4)* @array2, i64 0, i64 %tmp1, i64 %tmp
+  %tmp4 = load float, float addrspace(4)* %tmp2, align 4
  %tmp5 = fadd float %tmp4, 0.000000e+00
  %tmp6 = add i32 %y, 255
  %tmp7 = sext i32 %tmp6 to i64
-  %tmp8 = getelementptr inbounds [4096 x [4 x float]], [4096 x [4 x float]] addrspace(2)* @array2, i64 0, i64 %tmp1, i64 %tmp7
-  %tmp10 = load float, float addrspace(2)* %tmp8, align 4
+  %tmp8 = getelementptr inbounds [4096 x [4 x float]], [4096 x [4 x float]] addrspace(4)* @array2, i64 0, i64 %tmp1, i64 %tmp7
+  %tmp10 = load float, float addrspace(4)* %tmp8, align 4
  %tmp11 = fadd float %tmp5, %tmp10
  %tmp12 = add i32 %x, 256
  %tmp13 = sext i32 %tmp12 to i64
-  %tmp14 = getelementptr inbounds [4096 x [4 x float]], [4096 x [4 x float]] addrspace(2)* @array2, i64 0, i64 %tmp13, i64 %tmp
-  %tmp16 = load float, float addrspace(2)* %tmp14, align 4
+  %tmp14 = getelementptr inbounds [4096 x [4 x float]], [4096 x [4 x float]] addrspace(4)* @array2, i64 0, i64 %tmp13, i64 %tmp
+  %tmp16 = load float, float addrspace(4)* %tmp14, align 4
  %tmp17 = fadd float %tmp11, %tmp16
-  %tmp18 = getelementptr inbounds [4096 x [4 x float]], [4096 x [4 x float]] addrspace(2)* @array2, i64 0, i64 %tmp13, i64 %tmp7
-  %tmp20 = load float, float addrspace(2)* %tmp18, align 4
+  %tmp18 = getelementptr inbounds [4096 x [4 x float]], [4096 x [4 x float]] addrspace(4)* @array2, i64 0, i64 %tmp13, i64 %tmp7
+  %tmp20 = load float, float addrspace(4)* %tmp18, align 4
  %tmp21 = fadd float %tmp17, %tmp20
  store float %tmp21, float addrspace(1)* %output, align 4
  ret void
@ -97,18 +97,18 @@ define amdgpu_kernel void @sum_of_lds_array_over_max_mubuf_offset(i32 %x, i32 %y
 ; IR: getelementptr {{.*}} !amdgpu.uniform
 ; IR: getelementptr {{.*}} !amdgpu.uniform
 ; IR: getelementptr {{.*}} !amdgpu.uniform
-define amdgpu_ps <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @keep_metadata([0 x <4 x i32>] addrspace(2)* inreg noalias dereferenceable(18446744073709551615), [0 x <8 x i32>] addrspace(2)* inreg noalias dereferenceable(18446744073709551615), [0 x <4 x i32>] addrspace(2)* inreg noalias dereferenceable(18446744073709551615), [0 x <8 x i32>] addrspace(2)* inreg noalias dereferenceable(18446744073709551615), float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, i32, i32, float, i32) #5 {
+define amdgpu_ps <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @keep_metadata([0 x <4 x i32>] addrspace(4)* inreg noalias dereferenceable(18446744073709551615), [0 x <8 x i32>] addrspace(4)* inreg noalias dereferenceable(18446744073709551615), [0 x <4 x i32>] addrspace(4)* inreg noalias dereferenceable(18446744073709551615), [0 x <8 x i32>] addrspace(4)* inreg noalias dereferenceable(18446744073709551615), float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, i32, i32, float, i32) #5 {
 main_body:
  %22 = call nsz float @llvm.amdgcn.interp.mov(i32 2, i32 0, i32 0, i32 %5) #8
  %23 = bitcast float %22 to i32
  %24 = shl i32 %23, 1
-  %25 = getelementptr [0 x <8 x i32>], [0 x <8 x i32>] addrspace(2)* %1, i32 0, i32 %24, !amdgpu.uniform !0
-  %26 = load <8 x i32>, <8 x i32> addrspace(2)* %25, align 32, !invariant.load !0
+  %25 = getelementptr [0 x <8 x i32>], [0 x <8 x i32>] addrspace(4)* %1, i32 0, i32 %24, !amdgpu.uniform !0
+  %26 = load <8 x i32>, <8 x i32> addrspace(4)* %25, align 32, !invariant.load !0
  %27 = shl i32 %23, 2
  %28 = or i32 %27, 3
-  %29 = bitcast [0 x <8 x i32>] addrspace(2)* %1 to [0 x <4 x i32>] addrspace(2)*
-  %30 = getelementptr [0 x <4 x i32>], [0 x <4 x i32>] addrspace(2)* %29, i32 0, i32 %28, !amdgpu.uniform !0
-  %31 = load <4 x i32>, <4 x i32> addrspace(2)* %30, align 16, !invariant.load !0
+  %29 = bitcast [0 x <8 x i32>] addrspace(4)* %1 to [0 x <4 x i32>] addrspace(4)*
+  %30 = getelementptr [0 x <4 x i32>], [0 x <4 x i32>] addrspace(4)* %29, i32 0, i32 %28, !amdgpu.uniform !0
+  %31 = load <4 x i32>, <4 x i32> addrspace(4)* %30, align 16, !invariant.load !0
  %32 = call nsz <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> zeroinitializer, <8 x i32> %26, <4 x i32> %31, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #8
  %33 = extractelement <4 x float> %32, i32 0
  %34 = extractelement <4 x float> %32, i32 1