[AMDGPU] Add maximum NSA size limit ISA feature

Add maximum NSA size limit as an ISA feature. Use this to reduce NSA usage on GFX10.1 to avoid stability issues with 4 and 5 dwords NSA instructions. Maintain use of longer NSA instructions on GFX10.3. Note: this also contains some minor fixes for GlobalISel which did not work correctly with non-NSA form instructions on GFX10. Reviewed By: foad Differential Revision: https://reviews.llvm.org/D103348
2024-11-22 02:33:06 +01:00 · 2021-07-22 10:43:25 +09:00 · 2021-07-22 10:43:25 +09:00 · 924e1a4716
commit 924e1a4716
parent 9458751292
17 changed files with 475 additions and 268 deletions
--- a/lib/Target/AMDGPU/AMDGPU.td
+++ b/lib/Target/AMDGPU/AMDGPU.td
@ -629,6 +629,16 @@ def FeatureNoDataDepHazard : SubtargetFeature<"no-data-dep-hazard",
  "Does not need SW waitstates"
 >;

+class SubtargetFeatureNSAMaxSize <int Value> : SubtargetFeature <
+  "nsa-max-size-"#Value,
+  "NSAMaxSize",
+  !cast<string>(Value),
+  "The maximum non-sequential address size in VGPRs."
+>;
+
+def FeatureNSAMaxSize5 : SubtargetFeatureNSAMaxSize<5>;
+def FeatureNSAMaxSize13 : SubtargetFeatureNSAMaxSize<13>;
+
 //===------------------------------------------------------------===//
 // Subtarget Features (options and debugging)
 //===------------------------------------------------------------===//
@ -1031,6 +1041,7 @@ def FeatureISAVersion10_1_0 : FeatureSet<
     FeatureLDSBankCount32,
     FeatureDLInsts,
     FeatureNSAEncoding,
+     FeatureNSAMaxSize5,
     FeatureWavefrontSize32,
     FeatureScalarStores,
     FeatureScalarAtomics,
@ -1052,6 +1063,7 @@ def FeatureISAVersion10_1_1 : FeatureSet<
     FeatureDot6Insts,
     FeatureDot7Insts,
     FeatureNSAEncoding,
+     FeatureNSAMaxSize5,
     FeatureWavefrontSize32,
     FeatureScalarStores,
     FeatureScalarAtomics,
@ -1073,6 +1085,7 @@ def FeatureISAVersion10_1_2 : FeatureSet<
     FeatureDot6Insts,
     FeatureDot7Insts,
     FeatureNSAEncoding,
+     FeatureNSAMaxSize5,
     FeatureWavefrontSize32,
     FeatureScalarStores,
     FeatureScalarAtomics,
@ -1090,6 +1103,7 @@ def FeatureISAVersion10_1_3 : FeatureSet<
     FeatureLDSBankCount32,
     FeatureDLInsts,
     FeatureNSAEncoding,
+     FeatureNSAMaxSize5,
     FeatureWavefrontSize32,
     FeatureScalarStores,
     FeatureScalarAtomics,
@ -1113,6 +1127,7 @@ def FeatureISAVersion10_3_0 : FeatureSet<
   FeatureDot6Insts,
   FeatureDot7Insts,
   FeatureNSAEncoding,
+   FeatureNSAMaxSize13,
   FeatureWavefrontSize32,
   FeatureShaderCyclesRegister]>;

--- a/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@ -4153,12 +4153,11 @@ static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,

    Register AddrReg = SrcOp.getReg();

-    if (I < Intr->GradientStart) {
-      AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
-      PackedAddrs.push_back(AddrReg);
-    } else if ((I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
-               (I >= Intr->CoordStart && !IsA16)) {
+    if ((I < Intr->GradientStart) ||
+        (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
+        (I >= Intr->CoordStart && !IsA16)) {
      // Handle any gradient or coordinate operands that should not be packed
+      AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
      PackedAddrs.push_back(AddrReg);
    } else {
      // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
@ -4202,9 +4201,8 @@ static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,

  int NumAddrRegs = AddrRegs.size();
  if (NumAddrRegs != 1) {
-    // Round up to 8 elements for v5-v7
-    // FIXME: Missing intermediate sized register classes and instructions.
-    if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
+    // Above 8 elements round up to next power of 2 (i.e. 16).
+    if (NumAddrRegs > 8 && !isPowerOf2_32(NumAddrRegs)) {
      const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
      auto Undef = B.buildUndef(S32);
      AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
@ -4375,7 +4373,8 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
                                IsG16);

      // See also below in the non-a16 branch
-      const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding();
+      const bool UseNSA = ST.hasNSAEncoding() && PackedRegs.size() >= 3 &&
+                          PackedRegs.size() <= ST.getNSAMaxSize();

      if (!UseNSA && PackedRegs.size() > 1) {
        LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
@ -4412,7 +4411,8 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
    //
    // SIShrinkInstructions will convert NSA encodings to non-NSA after register
    // allocation when possible.
-    const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();
+    const bool UseNSA = ST.hasNSAEncoding() && CorrectedNumVAddrs >= 3 &&
+                        CorrectedNumVAddrs <= ST.getNSAMaxSize();

    if (!UseNSA && Intr->NumVAddrs > 1)
      convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@ -264,6 +264,7 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
    HasGFX10A16(false),
    HasG16(false),
    HasNSAEncoding(false),
+    NSAMaxSize(0),
    GFX10_AEncoding(false),
    GFX10_BEncoding(false),
    HasDLInsts(false),
--- a/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/lib/Target/AMDGPU/GCNSubtarget.h
@ -136,6 +136,7 @@ protected:
  bool HasGFX10A16;
  bool HasG16;
  bool HasNSAEncoding;
+  unsigned NSAMaxSize;
  bool GFX10_AEncoding;
  bool GFX10_BEncoding;
  bool HasDLInsts;
@ -878,6 +879,8 @@ public:

  bool hasNSAEncoding() const { return HasNSAEncoding; }

+  unsigned getNSAMaxSize() const { return NSAMaxSize; }
+
  bool hasGFX10_AEncoding() const {
    return GFX10_AEncoding;
  }
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@ -6211,8 +6211,9 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
  //
  // SIShrinkInstructions will convert NSA encodings to non-NSA after register
  // allocation when possible.
-  bool UseNSA =
-      ST->hasFeature(AMDGPU::FeatureNSAEncoding) && VAddrs.size() >= 3;
+  bool UseNSA = ST->hasFeature(AMDGPU::FeatureNSAEncoding) &&
+                VAddrs.size() >= 3 &&
+                VAddrs.size() <= (unsigned)ST->getNSAMaxSize();
  SDValue VAddr;
  if (!UseNSA)
    VAddr = getBuildDwordsVector(DAG, DL, VAddrs);
--- a/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.a16.ll
+++ b/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.a16.ll
@ -1669,7 +1669,8 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg
  ; GFX10:   [[BUILD_VECTOR_TRUNC4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY27]](s32), [[COPY28]](s32)
  ; GFX10:   [[COPY29:%[0-9]+]]:_(s32) = COPY [[COPY20]](s32)
  ; GFX10:   [[BUILD_VECTOR_TRUNC5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY29]](s32), [[DEF]](s32)
-  ; GFX10:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.3d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>), [[BUILD_VECTOR_TRUNC3]](<2 x s16>), [[BUILD_VECTOR_TRUNC4]](<2 x s16>), [[BUILD_VECTOR_TRUNC5]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
+  ; GFX10:   [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>), [[BUILD_VECTOR_TRUNC3]](<2 x s16>), [[BUILD_VECTOR_TRUNC4]](<2 x s16>), [[BUILD_VECTOR_TRUNC5]](<2 x s16>)
+  ; GFX10:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.3d), 15, [[CONCAT_VECTORS]](<12 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
  ; GFX10:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
  ; GFX10:   $vgpr0 = COPY [[UV]](s32)
  ; GFX10:   $vgpr1 = COPY [[UV1]](s32)
@ -3511,7 +3512,8 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32>
  ; GFX10:   [[COPY27:%[0-9]+]]:_(s32) = COPY [[COPY20]](s32)
  ; GFX10:   [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
  ; GFX10:   [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY27]](s32), [[DEF]](s32)
-  ; GFX10:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.o.2darray), 4, [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>), [[BUILD_VECTOR_TRUNC3]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (s32) from custom "ImageResource")
+  ; GFX10:   [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>), [[BUILD_VECTOR_TRUNC3]](<2 x s16>)
+  ; GFX10:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.o.2darray), 4, [[CONCAT_VECTORS]](<12 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (s32) from custom "ImageResource")
  ; GFX10:   $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
  ; GFX10:   SI_RETURN_TO_EPILOG implicit $vgpr0
 main_body:
@ -3606,7 +3608,8 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4
  ; GFX10:   [[COPY27:%[0-9]+]]:_(s32) = COPY [[COPY20]](s32)
  ; GFX10:   [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
  ; GFX10:   [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY27]](s32), [[DEF]](s32)
-  ; GFX10:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.o.2darray), 6, [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>), [[BUILD_VECTOR_TRUNC3]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<2 x s32>) from custom "ImageResource")
+  ; GFX10:   [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>), [[BUILD_VECTOR_TRUNC3]](<2 x s16>)
+  ; GFX10:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.o.2darray), 6, [[CONCAT_VECTORS]](<12 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<2 x s32>) from custom "ImageResource")
  ; GFX10:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>)
  ; GFX10:   $vgpr0 = COPY [[UV]](s32)
  ; GFX10:   $vgpr1 = COPY [[UV1]](s32)
--- a/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.g16.ll
+++ b/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.g16.ll
@ -27,7 +27,8 @@ define amdgpu_ps <4 x float> @sample_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg
  ; GFX10:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[DEF]](s32)
  ; GFX10:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
  ; GFX10:   [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[DEF]](s32)
-  ; GFX10:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.1d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[COPY14]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
+  ; GFX10:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY14]](s32)
+  ; GFX10:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.1d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
  ; GFX10:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
  ; GFX10:   $vgpr0 = COPY [[UV]](s32)
  ; GFX10:   $vgpr1 = COPY [[UV1]](s32)
@ -69,7 +70,9 @@ define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
  ; GFX10:   [[COPY20:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32)
  ; GFX10:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[COPY15]](s32)
  ; GFX10:   [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY20]](s32), [[COPY21]](s32)
-  ; GFX10:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.2d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[COPY16]](s32), [[COPY17]](s32), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
+  ; GFX10:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY16]](s32)
+  ; GFX10:   [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY17]](s32)
+  ; GFX10:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.2d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
  ; GFX10:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
  ; GFX10:   $vgpr0 = COPY [[UV]](s32)
  ; GFX10:   $vgpr1 = COPY [[UV1]](s32)
@ -119,7 +122,11 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg
  ; GFX10:   [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY24]](s32), [[COPY25]](s32)
  ; GFX10:   [[COPY26:%[0-9]+]]:_(s32) = COPY [[COPY17]](s32)
  ; GFX10:   [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY26]](s32), [[DEF]](s32)
-  ; GFX10:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.3d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>), [[BUILD_VECTOR_TRUNC3]](<2 x s16>), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
+  ; GFX10:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY18]](s32)
+  ; GFX10:   [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY19]](s32)
+  ; GFX10:   [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY20]](s32)
+  ; GFX10:   [[CONCAT_VECTORS:%[0-9]+]]:_(<14 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>), [[BUILD_VECTOR_TRUNC3]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>)
+  ; GFX10:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.3d), 15, [[CONCAT_VECTORS]](<14 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
  ; GFX10:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
  ; GFX10:   $vgpr0 = COPY [[UV]](s32)
  ; GFX10:   $vgpr1 = COPY [[UV1]](s32)
@ -159,7 +166,8 @@ define amdgpu_ps <4 x float> @sample_c_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inr
  ; GFX10:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[DEF]](s32)
  ; GFX10:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32)
  ; GFX10:   [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY17]](s32), [[DEF]](s32)
-  ; GFX10:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[COPY15]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
+  ; GFX10:   [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY15]](s32)
+  ; GFX10:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
  ; GFX10:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
  ; GFX10:   $vgpr0 = COPY [[UV]](s32)
  ; GFX10:   $vgpr1 = COPY [[UV1]](s32)
@ -203,7 +211,9 @@ define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
  ; GFX10:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[COPY15]](s32)
  ; GFX10:   [[COPY22:%[0-9]+]]:_(s32) = COPY [[COPY16]](s32)
  ; GFX10:   [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY21]](s32), [[COPY22]](s32)
-  ; GFX10:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[COPY17]](s32), [[COPY18]](s32), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
+  ; GFX10:   [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY17]](s32)
+  ; GFX10:   [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY18]](s32)
+  ; GFX10:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
  ; GFX10:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
  ; GFX10:   $vgpr0 = COPY [[UV]](s32)
  ; GFX10:   $vgpr1 = COPY [[UV1]](s32)
@ -242,7 +252,9 @@ define amdgpu_ps <4 x float> @sample_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> in
  ; GFX10:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[DEF]](s32)
  ; GFX10:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
  ; GFX10:   [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY17]](s32), [[DEF]](s32)
-  ; GFX10:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.cl.1d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[COPY14]](s32), [[COPY15]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
+  ; GFX10:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY14]](s32)
+  ; GFX10:   [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY15]](s32)
+  ; GFX10:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.cl.1d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
  ; GFX10:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
  ; GFX10:   $vgpr0 = COPY [[UV]](s32)
  ; GFX10:   $vgpr1 = COPY [[UV1]](s32)
@ -285,7 +297,10 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in
  ; GFX10:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32)
  ; GFX10:   [[COPY22:%[0-9]+]]:_(s32) = COPY [[COPY15]](s32)
  ; GFX10:   [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY21]](s32), [[COPY22]](s32)
-  ; GFX10:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.cl.2d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
+  ; GFX10:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY16]](s32)
+  ; GFX10:   [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY17]](s32)
+  ; GFX10:   [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY18]](s32)
+  ; GFX10:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.cl.2d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
  ; GFX10:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
  ; GFX10:   $vgpr0 = COPY [[UV]](s32)
  ; GFX10:   $vgpr1 = COPY [[UV1]](s32)
@ -326,7 +341,9 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32>
  ; GFX10:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY17]](s32), [[DEF]](s32)
  ; GFX10:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32)
  ; GFX10:   [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY18]](s32), [[DEF]](s32)
-  ; GFX10:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.cl.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[COPY15]](s32), [[COPY16]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
+  ; GFX10:   [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY15]](s32)
+  ; GFX10:   [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY16]](s32)
+  ; GFX10:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.cl.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
  ; GFX10:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
  ; GFX10:   $vgpr0 = COPY [[UV]](s32)
  ; GFX10:   $vgpr1 = COPY [[UV1]](s32)
@ -371,7 +388,11 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
  ; GFX10:   [[COPY22:%[0-9]+]]:_(s32) = COPY [[COPY15]](s32)
  ; GFX10:   [[COPY23:%[0-9]+]]:_(s32) = COPY [[COPY16]](s32)
  ; GFX10:   [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY22]](s32), [[COPY23]](s32)
-  ; GFX10:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.cl.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
+  ; GFX10:   [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY17]](s32)
+  ; GFX10:   [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY18]](s32)
+  ; GFX10:   [[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY19]](s32)
+  ; GFX10:   [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>)
+  ; GFX10:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.cl.2d), 15, [[CONCAT_VECTORS]](<12 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
  ; GFX10:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
  ; GFX10:   $vgpr0 = COPY [[UV]](s32)
  ; GFX10:   $vgpr1 = COPY [[UV1]](s32)
@ -409,7 +430,8 @@ define amdgpu_ps <4 x float> @sample_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inre
  ; GFX10:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[DEF]](s32)
  ; GFX10:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
  ; GFX10:   [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[DEF]](s32)
-  ; GFX10:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.1d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[COPY14]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
+  ; GFX10:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY14]](s32)
+  ; GFX10:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.1d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
  ; GFX10:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
  ; GFX10:   $vgpr0 = COPY [[UV]](s32)
  ; GFX10:   $vgpr1 = COPY [[UV1]](s32)
@ -451,7 +473,9 @@ define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
  ; GFX10:   [[COPY20:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32)
  ; GFX10:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[COPY15]](s32)
  ; GFX10:   [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY20]](s32), [[COPY21]](s32)
-  ; GFX10:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.2d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[COPY16]](s32), [[COPY17]](s32), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
+  ; GFX10:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY16]](s32)
+  ; GFX10:   [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY17]](s32)
+  ; GFX10:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.2d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
  ; GFX10:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
  ; GFX10:   $vgpr0 = COPY [[UV]](s32)
  ; GFX10:   $vgpr1 = COPY [[UV1]](s32)
@ -491,7 +515,8 @@ define amdgpu_ps <4 x float> @sample_c_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> in
  ; GFX10:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[DEF]](s32)
  ; GFX10:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32)
  ; GFX10:   [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY17]](s32), [[DEF]](s32)
-  ; GFX10:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[COPY15]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
+  ; GFX10:   [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY15]](s32)
+  ; GFX10:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
  ; GFX10:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
  ; GFX10:   $vgpr0 = COPY [[UV]](s32)
  ; GFX10:   $vgpr1 = COPY [[UV1]](s32)
@ -535,7 +560,9 @@ define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> in
  ; GFX10:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[COPY15]](s32)
  ; GFX10:   [[COPY22:%[0-9]+]]:_(s32) = COPY [[COPY16]](s32)
  ; GFX10:   [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY21]](s32), [[COPY22]](s32)
-  ; GFX10:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[COPY17]](s32), [[COPY18]](s32), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
+  ; GFX10:   [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY17]](s32)
+  ; GFX10:   [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY18]](s32)
+  ; GFX10:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
  ; GFX10:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
  ; GFX10:   $vgpr0 = COPY [[UV]](s32)
  ; GFX10:   $vgpr1 = COPY [[UV1]](s32)
@ -574,7 +601,9 @@ define amdgpu_ps <4 x float> @sample_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> i
  ; GFX10:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[DEF]](s32)
  ; GFX10:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
  ; GFX10:   [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY17]](s32), [[DEF]](s32)
-  ; GFX10:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.cl.1d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[COPY14]](s32), [[COPY15]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
+  ; GFX10:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY14]](s32)
+  ; GFX10:   [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY15]](s32)
+  ; GFX10:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.cl.1d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
  ; GFX10:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
  ; GFX10:   $vgpr0 = COPY [[UV]](s32)
  ; GFX10:   $vgpr1 = COPY [[UV1]](s32)
@ -617,7 +646,10 @@ define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
  ; GFX10:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32)
  ; GFX10:   [[COPY22:%[0-9]+]]:_(s32) = COPY [[COPY15]](s32)
  ; GFX10:   [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY21]](s32), [[COPY22]](s32)
-  ; GFX10:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.cl.2d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
+  ; GFX10:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY16]](s32)
+  ; GFX10:   [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY17]](s32)
+  ; GFX10:   [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY18]](s32)
+  ; GFX10:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.cl.2d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
  ; GFX10:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
  ; GFX10:   $vgpr0 = COPY [[UV]](s32)
  ; GFX10:   $vgpr1 = COPY [[UV1]](s32)
@ -658,7 +690,9 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32>
  ; GFX10:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY17]](s32), [[DEF]](s32)
  ; GFX10:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32)
  ; GFX10:   [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY18]](s32), [[DEF]](s32)
-  ; GFX10:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.cl.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[COPY15]](s32), [[COPY16]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
+  ; GFX10:   [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY15]](s32)
+  ; GFX10:   [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY16]](s32)
+  ; GFX10:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.cl.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
  ; GFX10:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
  ; GFX10:   $vgpr0 = COPY [[UV]](s32)
  ; GFX10:   $vgpr1 = COPY [[UV1]](s32)
@ -703,7 +737,11 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
  ; GFX10:   [[COPY22:%[0-9]+]]:_(s32) = COPY [[COPY15]](s32)
  ; GFX10:   [[COPY23:%[0-9]+]]:_(s32) = COPY [[COPY16]](s32)
  ; GFX10:   [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY22]](s32), [[COPY23]](s32)
-  ; GFX10:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.cl.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
+  ; GFX10:   [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY17]](s32)
+  ; GFX10:   [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY18]](s32)
+  ; GFX10:   [[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY19]](s32)
+  ; GFX10:   [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>)
+  ; GFX10:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.cl.2d), 15, [[CONCAT_VECTORS]](<12 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
  ; GFX10:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
  ; GFX10:   $vgpr0 = COPY [[UV]](s32)
  ; GFX10:   $vgpr1 = COPY [[UV1]](s32)
@ -750,7 +788,11 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32>
  ; GFX10:   [[COPY23:%[0-9]+]]:_(s32) = COPY [[COPY16]](s32)
  ; GFX10:   [[COPY24:%[0-9]+]]:_(s32) = COPY [[COPY17]](s32)
  ; GFX10:   [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY23]](s32), [[COPY24]](s32)
-  ; GFX10:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.o.2darray), 4, [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (s32) from custom "ImageResource")
+  ; GFX10:   [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY18]](s32)
+  ; GFX10:   [[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY19]](s32)
+  ; GFX10:   [[BITCAST4:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY20]](s32)
+  ; GFX10:   [[CONCAT_VECTORS:%[0-9]+]]:_(<14 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>), [[BITCAST4]](<2 x s16>)
+  ; GFX10:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.o.2darray), 4, [[CONCAT_VECTORS]](<14 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (s32) from custom "ImageResource")
  ; GFX10:   $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
  ; GFX10:   SI_RETURN_TO_EPILOG implicit $vgpr0
 main_body:
@ -793,7 +835,11 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4
  ; GFX10:   [[COPY23:%[0-9]+]]:_(s32) = COPY [[COPY16]](s32)
  ; GFX10:   [[COPY24:%[0-9]+]]:_(s32) = COPY [[COPY17]](s32)
  ; GFX10:   [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY23]](s32), [[COPY24]](s32)
-  ; GFX10:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.o.2darray), 6, [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<2 x s32>) from custom "ImageResource")
+  ; GFX10:   [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY18]](s32)
+  ; GFX10:   [[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY19]](s32)
+  ; GFX10:   [[BITCAST4:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY20]](s32)
+  ; GFX10:   [[CONCAT_VECTORS:%[0-9]+]]:_(<14 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>), [[BITCAST4]](<2 x s16>)
+  ; GFX10:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.o.2darray), 6, [[CONCAT_VECTORS]](<14 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<2 x s32>) from custom "ImageResource")
  ; GFX10:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>)
  ; GFX10:   $vgpr0 = COPY [[UV]](s32)
  ; GFX10:   $vgpr1 = COPY [[UV1]](s32)
--- a/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.dim.ll
+++ b/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.dim.ll
@ -505,7 +505,7 @@ define amdgpu_ps <4 x float> @gather4_c_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
 ; GFX6-NEXT:    s_mov_b32 s11, s13
 ; GFX6-NEXT:    s_wqm_b64 exec, exec
 ; GFX6-NEXT:    s_and_b64 exec, exec, s[14:15]
-; GFX6-NEXT:    image_gather4_c_b_cl v[0:3], v[0:7], s[0:7], s[8:11] dmask:0x1
+; GFX6-NEXT:    image_gather4_c_b_cl v[0:3], v[0:4], s[0:7], s[8:11] dmask:0x1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
--- a/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.o.dim.ll
+++ b/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.o.dim.ll
@ -161,7 +161,7 @@ define amdgpu_ps <4 x float> @gather4_c_cl_o_2d(<8 x i32> inreg %rsrc, <4 x i32>
 ; GFX6-NEXT:    s_mov_b32 s11, s13
 ; GFX6-NEXT:    s_wqm_b64 exec, exec
 ; GFX6-NEXT:    s_and_b64 exec, exec, s[14:15]
-; GFX6-NEXT:    image_gather4_c_cl_o v[0:3], v[0:7], s[0:7], s[8:11] dmask:0x1
+; GFX6-NEXT:    image_gather4_c_cl_o v[0:3], v[0:4], s[0:7], s[8:11] dmask:0x1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
@ -255,7 +255,7 @@ define amdgpu_ps <4 x float> @gather4_c_b_o_2d(<8 x i32> inreg %rsrc, <4 x i32>
 ; GFX6-NEXT:    s_mov_b32 s11, s13
 ; GFX6-NEXT:    s_wqm_b64 exec, exec
 ; GFX6-NEXT:    s_and_b64 exec, exec, s[14:15]
-; GFX6-NEXT:    image_gather4_c_b_o v[0:3], v[0:7], s[0:7], s[8:11] dmask:0x1
+; GFX6-NEXT:    image_gather4_c_b_o v[0:3], v[0:4], s[0:7], s[8:11] dmask:0x1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
@ -299,7 +299,7 @@ define amdgpu_ps <4 x float> @gather4_b_cl_o_2d(<8 x i32> inreg %rsrc, <4 x i32>
 ; GFX6-NEXT:    s_mov_b32 s9, s11
 ; GFX6-NEXT:    s_mov_b32 s10, s12
 ; GFX6-NEXT:    s_mov_b32 s11, s13
-; GFX6-NEXT:    image_gather4_b_cl_o v[0:3], v[0:7], s[0:7], s[8:11] dmask:0x1
+; GFX6-NEXT:    image_gather4_b_cl_o v[0:3], v[0:4], s[0:7], s[8:11] dmask:0x1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
@ -343,7 +343,7 @@ define amdgpu_ps <4 x float> @gather4_c_b_cl_o_2d(<8 x i32> inreg %rsrc, <4 x i3
 ; GFX6-NEXT:    s_mov_b32 s11, s13
 ; GFX6-NEXT:    s_wqm_b64 exec, exec
 ; GFX6-NEXT:    s_and_b64 exec, exec, s[14:15]
-; GFX6-NEXT:    image_gather4_c_b_cl_o v[0:3], v[0:7], s[0:7], s[8:11] dmask:0x1
+; GFX6-NEXT:    image_gather4_c_b_cl_o v[0:3], v[0:5], s[0:7], s[8:11] dmask:0x1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
@ -428,7 +428,7 @@ define amdgpu_ps <4 x float> @gather4_c_l_o_2d(<8 x i32> inreg %rsrc, <4 x i32>
 ; GFX6-NEXT:    s_mov_b32 s9, s11
 ; GFX6-NEXT:    s_mov_b32 s10, s12
 ; GFX6-NEXT:    s_mov_b32 s11, s13
-; GFX6-NEXT:    image_gather4_c_l_o v[0:3], v[0:7], s[0:7], s[8:11] dmask:0x1
+; GFX6-NEXT:    image_gather4_c_l_o v[0:3], v[0:4], s[0:7], s[8:11] dmask:0x1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
--- a/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll
+++ b/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll
@ -35,15 +35,17 @@ main_body:
 define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %drdh, half %dsdv, half %dtdv, half %drdv, float %s, float %t, float %r) {
 ; GFX10-LABEL: sample_d_3d:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v9, 0xffff
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_mov_b32_e32 v9, v2
+; GFX10-NEXT:    v_mov_b32_e32 v10, v3
+; GFX10-NEXT:    v_mov_b32_e32 v11, 0xffff
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    s_lshl_b32 s12, s0, 16
-; GFX10-NEXT:    v_and_or_b32 v0, v0, v9, v1
-; GFX10-NEXT:    v_and_or_b32 v1, v2, v9, s12
-; GFX10-NEXT:    v_and_or_b32 v2, v3, v9, v4
-; GFX10-NEXT:    v_and_or_b32 v3, v5, v9, s12
-; GFX10-NEXT:    image_sample_d_g16 v[0:3], [v0, v1, v2, v3, v6, v7, v8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D
+; GFX10-NEXT:    v_and_or_b32 v3, v9, v11, s12
+; GFX10-NEXT:    v_and_or_b32 v4, v10, v11, v4
+; GFX10-NEXT:    v_and_or_b32 v2, v0, v11, v1
+; GFX10-NEXT:    v_and_or_b32 v5, v5, v11, s12
+; GFX10-NEXT:    image_sample_d_g16 v[0:3], v[2:8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 main_body:
@ -131,12 +133,15 @@ main_body:
 define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) {
 ; GFX10-LABEL: sample_c_d_cl_2d:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v8, 0xffff
-; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT:    v_mov_b32_e32 v8, v2
+; GFX10-NEXT:    v_mov_b32_e32 v9, v3
+; GFX10-NEXT:    v_mov_b32_e32 v2, v0
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0xffff
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX10-NEXT:    v_and_or_b32 v1, v1, v8, v2
-; GFX10-NEXT:    v_and_or_b32 v2, v3, v8, v4
-; GFX10-NEXT:    image_sample_c_d_cl_g16 v[0:3], [v0, v1, v2, v5, v6, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v8
+; GFX10-NEXT:    v_and_or_b32 v4, v9, v0, v4
+; GFX10-NEXT:    v_and_or_b32 v3, v1, v0, v3
+; GFX10-NEXT:    image_sample_c_d_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 main_body:
@ -255,12 +260,15 @@ main_body:
 define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) {
 ; GFX10-LABEL: sample_c_cd_cl_2d:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v8, 0xffff
-; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT:    v_mov_b32_e32 v8, v2
+; GFX10-NEXT:    v_mov_b32_e32 v9, v3
+; GFX10-NEXT:    v_mov_b32_e32 v2, v0
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0xffff
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX10-NEXT:    v_and_or_b32 v1, v1, v8, v2
-; GFX10-NEXT:    v_and_or_b32 v2, v3, v8, v4
-; GFX10-NEXT:    image_sample_c_cd_cl_g16 v[0:3], [v0, v1, v2, v5, v6, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v8
+; GFX10-NEXT:    v_and_or_b32 v4, v9, v0, v4
+; GFX10-NEXT:    v_and_or_b32 v3, v1, v0, v3
+; GFX10-NEXT:    image_sample_c_cd_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 main_body:
@ -271,12 +279,17 @@ main_body:
 define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice) {
 ; GFX10-LABEL: sample_c_d_o_2darray_V1:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v9, 0xffff
-; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT:    v_mov_b32_e32 v9, v3
+; GFX10-NEXT:    v_mov_b32_e32 v10, v2
+; GFX10-NEXT:    v_mov_b32_e32 v2, v0
+; GFX10-NEXT:    v_mov_b32_e32 v3, v1
+; GFX10-NEXT:    v_mov_b32_e32 v11, v4
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0xffff
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX10-NEXT:    v_and_or_b32 v2, v2, v9, v3
-; GFX10-NEXT:    v_and_or_b32 v3, v4, v9, v5
-; GFX10-NEXT:    image_sample_c_d_o_g16 v0, [v0, v1, v2, v3, v6, v7, v8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v9
+; GFX10-NEXT:    v_and_or_b32 v5, v11, v0, v5
+; GFX10-NEXT:    v_and_or_b32 v4, v10, v0, v1
+; GFX10-NEXT:    image_sample_c_d_o_g16 v0, v[2:8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 main_body:
@ -287,12 +300,17 @@ main_body:
 define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice) {
 ; GFX10-LABEL: sample_c_d_o_2darray_V2:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v9, 0xffff
-; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT:    v_mov_b32_e32 v9, v3
+; GFX10-NEXT:    v_mov_b32_e32 v10, v2
+; GFX10-NEXT:    v_mov_b32_e32 v2, v0
+; GFX10-NEXT:    v_mov_b32_e32 v3, v1
+; GFX10-NEXT:    v_mov_b32_e32 v11, v4
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0xffff
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX10-NEXT:    v_and_or_b32 v2, v2, v9, v3
-; GFX10-NEXT:    v_and_or_b32 v3, v4, v9, v5
-; GFX10-NEXT:    image_sample_c_d_o_g16 v[0:1], [v0, v1, v2, v3, v6, v7, v8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v9
+; GFX10-NEXT:    v_and_or_b32 v5, v11, v0, v5
+; GFX10-NEXT:    v_and_or_b32 v4, v10, v0, v1
+; GFX10-NEXT:    image_sample_c_d_o_g16 v[0:1], v[2:8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 main_body:
--- a/test/CodeGen/AMDGPU/cluster_stores.ll
+++ b/test/CodeGen/AMDGPU/cluster_stores.ll
@ -351,21 +351,28 @@ define amdgpu_ps void @cluster_image_sample(<8 x i32> inreg %src, <4 x i32> inre
 ;
 ; GFX10-LABEL: cluster_image_sample:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    v_cvt_f32_i32_e32 v2, v0
-; GFX10-NEXT:    v_cvt_f32_i32_e32 v3, v1
-; GFX10-NEXT:    v_mov_b32_e32 v13, 0
+; GFX10-NEXT:    v_mov_b32_e32 v4, 0
+; GFX10-NEXT:    v_cvt_f32_i32_e32 v8, v0
+; GFX10-NEXT:    v_cvt_f32_i32_e32 v9, v1
 ; GFX10-NEXT:    v_mov_b32_e32 v10, 1.0
-; GFX10-NEXT:    v_add_f32_e32 v11, 1.0, v2
-; GFX10-NEXT:    v_add_f32_e32 v12, 1.0, v3
-; GFX10-NEXT:    v_add_f32_e32 v14, 2.0, v2
-; GFX10-NEXT:    v_add_f32_e32 v15, 2.0, v3
-; GFX10-NEXT:    image_sample_d v[2:5], [v11, v12, v13, v13, v13, v13], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
-; GFX10-NEXT:    image_sample_d v[6:9], [v14, v15, v10, v10, v10, v10], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX10-NEXT:    v_mov_b32_e32 v5, v4
+; GFX10-NEXT:    v_add_f32_e32 v2, 1.0, v8
+; GFX10-NEXT:    v_add_f32_e32 v3, 1.0, v9
+; GFX10-NEXT:    v_mov_b32_e32 v6, v4
+; GFX10-NEXT:    v_mov_b32_e32 v7, v4
+; GFX10-NEXT:    v_add_f32_e32 v8, 2.0, v8
+; GFX10-NEXT:    v_add_f32_e32 v9, 2.0, v9
+; GFX10-NEXT:    v_mov_b32_e32 v11, v10
+; GFX10-NEXT:    v_mov_b32_e32 v12, v10
+; GFX10-NEXT:    v_mov_b32_e32 v13, v10
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    image_sample_d v[14:17], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX10-NEXT:    image_sample_d v[18:21], v[8:13], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_add_f32_e32 v5, v5, v9
-; GFX10-NEXT:    v_add_f32_e32 v4, v4, v8
-; GFX10-NEXT:    v_add_f32_e32 v3, v3, v7
-; GFX10-NEXT:    v_add_f32_e32 v2, v2, v6
+; GFX10-NEXT:    v_add_f32_e32 v5, v17, v21
+; GFX10-NEXT:    v_add_f32_e32 v4, v16, v20
+; GFX10-NEXT:    v_add_f32_e32 v3, v15, v19
+; GFX10-NEXT:    v_add_f32_e32 v2, v14, v18
 ; GFX10-NEXT:    image_store v[2:5], v[0:1], s[12:19] dmask:0xf dim:SQ_RSRC_IMG_2D unorm
 ; GFX10-NEXT:    s_endpgm
 entry:
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.image.nsa.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.image.nsa.ll
@ -1,5 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-nsa-encoding -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GCN,NONSA %s
-; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GCN,NSA %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-nsa-encoding -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NONSA %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1010,NSA %s
+; RUN: llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1030,NSA %s

 ; GCN-LABEL: {{^}}sample_2d:
 ;
@ -24,7 +25,8 @@ main_body:
 }

 ; GCN-LABEL: {{^}}sample_d_3d:
-; NSA: image_sample_d v[0:3], [v3, v8, v7, v5, v4, v6, v0, v2, v1],
+; GFX1010: image_sample_d v[0:3], v[7:22],
+; GFX1030: image_sample_d v[0:3], [v3, v8, v7, v5, v4, v6, v0, v2, v1],
 define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %r, float %t, float %dsdh, float %dtdv, float %dsdv, float %drdv, float %drdh, float %dtdh) {
 main_body:
  %v = call <4 x float> @llvm.amdgcn.image.sample.d.3d.v4f32.f32(i32 15, float %dsdh, float %dtdh, float %drdh, float %dsdv, float %dtdv, float %drdv, float %s, float %t, float %r, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@ -33,8 +35,8 @@ main_body:

 ; GCN-LABEL: {{^}}sample_contig_nsa:
 ; NONSA: image_sample_c_l v5, v[0:4],
-; NSA: image_sample_c_l v8, v[0:4],
-; NSA: image_sample v9, [v6, v7, v5],
+; NSA: image_sample_c_l v{{[0-9]+}}, v[0:4],
+; NSA: image_sample v{{[0-9]+}}, [v6, v7, v5],
 define amdgpu_ps <2 x float> @sample_contig_nsa(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s1, float %t1, float %r1, float %lod, float %r2, float %s2, float %t2) {
 main_body:
  %v1 = call float @llvm.amdgcn.image.sample.c.l.3d.f32.f32(i32 1, float %zcompare, float %s1, float %t1, float %r1, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@ -45,8 +47,8 @@ main_body:
 }

 ; GCN-LABEL: {{^}}sample_nsa_nsa:
-; NSA: image_sample_c_l v8, [v1, v2, v3, v4, v0],
-; NSA: image_sample v9, [v6, v7, v5],
+; NSA: image_sample_c_l v{{[0-9]+}}, [v1, v2, v3, v4, v0],
+; NSA: image_sample v{{[0-9]+}}, [v6, v7, v5],
 define amdgpu_ps <2 x float> @sample_nsa_nsa(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %lod, float %zcompare, float %s1, float %t1, float %r1, float %r2, float %s2, float %t2) {
 main_body:
  %v1 = call float @llvm.amdgcn.image.sample.c.l.3d.f32.f32(i32 1, float %zcompare, float %s1, float %t1, float %r1, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@ -57,8 +59,8 @@ main_body:
 }

 ; GCN-LABEL: {{^}}sample_nsa_contig:
-; NSA: image_sample_c_l v8, [v1, v2, v3, v4, v0],
-; NSA: image_sample v9, v[5:7],
+; NSA: image_sample_c_l v{{[0-9]+}}, [v1, v2, v3, v4, v0],
+; NSA: image_sample v{{[0-9]+}}, v[5:7],
 define amdgpu_ps <2 x float> @sample_nsa_contig(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %lod, float %zcompare, float %s1, float %t1, float %r1, float %s2, float %t2, float %r2) {
 main_body:
  %v1 = call float @llvm.amdgcn.image.sample.c.l.3d.f32.f32(i32 1, float %zcompare, float %s1, float %t1, float %r1, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@ -69,10 +71,10 @@ main_body:
 }

 ; GCN-LABEL: {{^}}sample_contig_contig:
-; NSA: image_sample_c_l v8, v[0:4],
-; NSA: image_sample v9, v[5:7],
-; NONSA: image_sample_c_l v8, v[0:4],
-; NONSA: image_sample v9, v[5:7],
+; NSA: image_sample_c_l v{{[0-9]+}}, v[0:4],
+; NSA: image_sample v{{[0-9]+}}, v[5:7],
+; NONSA: image_sample_c_l v{{[0-9]+}}, v[0:4],
+; NONSA: image_sample v{{[0-9]+}}, v[5:7],
 define amdgpu_ps <2 x float> @sample_contig_contig(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s1, float %t1, float %r1, float %lod, float %s2, float %t2, float %r2) {
 main_body:
  %v1 = call float @llvm.amdgcn.image.sample.c.l.3d.f32.f32(i32 1, float %zcompare, float %s1, float %t1, float %r1, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll
@ -600,14 +600,17 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg
 ;
 ; GFX10-LABEL: sample_d_3d:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v9, 0xffff
-; GFX10-NEXT:    v_and_b32_e32 v6, v9, v6
-; GFX10-NEXT:    v_and_b32_e32 v3, v9, v3
-; GFX10-NEXT:    v_and_b32_e32 v0, v9, v0
-; GFX10-NEXT:    v_lshl_or_b32 v6, v7, 16, v6
-; GFX10-NEXT:    v_lshl_or_b32 v3, v4, 16, v3
-; GFX10-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX10-NEXT:    image_sample_d_g16 v[0:3], [v0, v2, v3, v5, v6, v8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D a16
+; GFX10-NEXT:    v_mov_b32_e32 v12, v8
+; GFX10-NEXT:    v_mov_b32_e32 v8, v2
+; GFX10-NEXT:    v_mov_b32_e32 v2, 0xffff
+; GFX10-NEXT:    v_mov_b32_e32 v10, v5
+; GFX10-NEXT:    v_and_b32_e32 v5, v2, v6
+; GFX10-NEXT:    v_and_b32_e32 v3, v2, v3
+; GFX10-NEXT:    v_and_b32_e32 v0, v2, v0
+; GFX10-NEXT:    v_lshl_or_b32 v11, v7, 16, v5
+; GFX10-NEXT:    v_lshl_or_b32 v9, v4, 16, v3
+; GFX10-NEXT:    v_lshl_or_b32 v7, v1, 16, v0
+; GFX10-NEXT:    image_sample_d_g16 v[0:3], v[7:12], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D a16
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 main_body:
@ -1156,14 +1159,17 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32>
 ;
 ; GFX10-LABEL: sample_c_d_o_2darray_V1:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v9, 0xffff
-; GFX10-NEXT:    v_and_b32_e32 v6, v9, v6
-; GFX10-NEXT:    v_and_b32_e32 v4, v9, v4
-; GFX10-NEXT:    v_and_b32_e32 v2, v9, v2
-; GFX10-NEXT:    v_lshl_or_b32 v6, v7, 16, v6
-; GFX10-NEXT:    v_lshl_or_b32 v4, v5, 16, v4
-; GFX10-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
-; GFX10-NEXT:    image_sample_c_d_o_g16 v0, [v0, v1, v2, v4, v6, v8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY a16
+; GFX10-NEXT:    v_mov_b32_e32 v13, v8
+; GFX10-NEXT:    v_mov_b32_e32 v8, v0
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0xffff
+; GFX10-NEXT:    v_mov_b32_e32 v9, v1
+; GFX10-NEXT:    v_and_b32_e32 v1, v0, v6
+; GFX10-NEXT:    v_and_b32_e32 v4, v0, v4
+; GFX10-NEXT:    v_and_b32_e32 v0, v0, v2
+; GFX10-NEXT:    v_lshl_or_b32 v12, v7, 16, v1
+; GFX10-NEXT:    v_lshl_or_b32 v11, v5, 16, v4
+; GFX10-NEXT:    v_lshl_or_b32 v10, v3, 16, v0
+; GFX10-NEXT:    image_sample_c_d_o_g16 v0, v[8:13], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY a16
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 main_body:
@ -1190,14 +1196,17 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4
 ;
 ; GFX10-LABEL: sample_c_d_o_2darray_V2:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v9, 0xffff
-; GFX10-NEXT:    v_and_b32_e32 v6, v9, v6
-; GFX10-NEXT:    v_and_b32_e32 v4, v9, v4
-; GFX10-NEXT:    v_and_b32_e32 v2, v9, v2
-; GFX10-NEXT:    v_lshl_or_b32 v6, v7, 16, v6
-; GFX10-NEXT:    v_lshl_or_b32 v4, v5, 16, v4
-; GFX10-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
-; GFX10-NEXT:    image_sample_c_d_o_g16 v[0:1], [v0, v1, v2, v4, v6, v8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY a16
+; GFX10-NEXT:    v_mov_b32_e32 v13, v8
+; GFX10-NEXT:    v_mov_b32_e32 v8, v0
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0xffff
+; GFX10-NEXT:    v_mov_b32_e32 v9, v1
+; GFX10-NEXT:    v_and_b32_e32 v1, v0, v6
+; GFX10-NEXT:    v_and_b32_e32 v4, v0, v4
+; GFX10-NEXT:    v_and_b32_e32 v0, v0, v2
+; GFX10-NEXT:    v_lshl_or_b32 v12, v7, 16, v1
+; GFX10-NEXT:    v_lshl_or_b32 v11, v5, 16, v4
+; GFX10-NEXT:    v_lshl_or_b32 v10, v3, 16, v0
+; GFX10-NEXT:    image_sample_c_d_o_g16 v[0:1], v[8:13], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY a16
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 main_body:
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll
@ -1739,14 +1739,13 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1_tfe(<8 x i32> inreg %rsrc, <4 x
 ; GFX10-LABEL: sample_c_d_o_2darray_V1_tfe:
 ; GFX10:       ; %bb.0: ; %main_body
 ; GFX10-NEXT:    v_mov_b32_e32 v11, 0 ; encoding: [0x80,0x02,0x16,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v9, v1 ; encoding: [0x01,0x03,0x12,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v10, v0 ; encoding: [0x00,0x03,0x14,0x7e]
 ; GFX10-NEXT:    v_mov_b32_e32 v12, v11 ; encoding: [0x0b,0x03,0x18,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v0, v11 ; encoding: [0x0b,0x03,0x00,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v1, v12 ; encoding: [0x0c,0x03,0x02,0x7e]
-; GFX10-NEXT:    image_sample_c_d_o v[0:1], [v10, v9, v2, v3, v4, v5, v6, v7, v8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY tfe ; encoding: [0x2c,0x04,0xe9,0xf0,0x0a,0x00,0x40,0x00,0x09,0x02,0x03,0x04,0x05,0x06,0x07,0x08]
+; GFX10-NEXT:    v_mov_b32_e32 v9, v11 ; encoding: [0x0b,0x03,0x12,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v10, v12 ; encoding: [0x0c,0x03,0x14,0x7e]
+; GFX10-NEXT:    image_sample_c_d_o v[9:10], v[0:15], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY tfe ; encoding: [0x28,0x04,0xe9,0xf0,0x00,0x09,0x40,0x00]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
-; GFX10-NEXT:    global_store_dword v11, v1, s[12:13] ; encoding: [0x00,0x80,0x70,0xdc,0x0b,0x01,0x0c,0x00]
+; GFX10-NEXT:    v_mov_b32_e32 v0, v9 ; encoding: [0x09,0x03,0x00,0x7e]
+; GFX10-NEXT:    global_store_dword v11, v10, s[12:13] ; encoding: [0x00,0x80,0x70,0xdc,0x0b,0x0a,0x0c,0x00]
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
 ; GFX10-NEXT:    ; return to shader part epilog
 main_body:
@ -1807,14 +1806,14 @@ define amdgpu_ps <4 x float> @sample_c_d_o_2darray_V2_tfe(<8 x i32> inreg %rsrc,
 ;
 ; GFX10-LABEL: sample_c_d_o_2darray_V2_tfe:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v11, v0 ; encoding: [0x00,0x03,0x16,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v9, v2 ; encoding: [0x02,0x03,0x12,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v10, v1 ; encoding: [0x01,0x03,0x14,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e]
-; GFX10-NEXT:    image_sample_c_d_o v[0:2], [v11, v10, v9, v3, v4, v5, v6, v7, v8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY tfe ; encoding: [0x2c,0x06,0xe9,0xf0,0x0b,0x00,0x40,0x00,0x0a,0x09,0x03,0x04,0x05,0x06,0x07,0x08]
+; GFX10-NEXT:    v_mov_b32_e32 v9, 0 ; encoding: [0x80,0x02,0x12,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v10, v9 ; encoding: [0x09,0x03,0x14,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v11, v9 ; encoding: [0x09,0x03,0x16,0x7e]
+; GFX10-NEXT:    image_sample_c_d_o v[9:11], v[0:15], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY tfe ; encoding: [0x28,0x06,0xe9,0xf0,0x00,0x09,0x40,0x00]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
+; GFX10-NEXT:    v_mov_b32_e32 v0, v9 ; encoding: [0x09,0x03,0x00,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v1, v10 ; encoding: [0x0a,0x03,0x02,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v2, v11 ; encoding: [0x0b,0x03,0x04,0x7e]
 ; GFX10-NEXT:    ; return to shader part epilog
 main_body:
  %v = call {<2 x float>, i32} @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32i32.f32.f32(i32 6, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0)
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.a16.dim.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.a16.dim.ll
@ -47,9 +47,16 @@ main_body:
 define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %dsdh, float %dtdh, float %drdh, float %dsdv, float %dtdv, float %drdv, half %s, half %t, half %r) {
 ; GFX10-LABEL: sample_d_3d:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GFX10-NEXT:    v_lshl_or_b32 v6, v7, 16, v6
-; GFX10-NEXT:    image_sample_d v[0:3], [v0, v1, v2, v3, v4, v5, v6, v8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D a16
+; GFX10-NEXT:    v_mov_b32_e32 v15, v8
+; GFX10-NEXT:    v_mov_b32_e32 v8, v0
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v6
+; GFX10-NEXT:    v_mov_b32_e32 v13, v5
+; GFX10-NEXT:    v_mov_b32_e32 v12, v4
+; GFX10-NEXT:    v_mov_b32_e32 v11, v3
+; GFX10-NEXT:    v_mov_b32_e32 v10, v2
+; GFX10-NEXT:    v_mov_b32_e32 v9, v1
+; GFX10-NEXT:    v_lshl_or_b32 v14, v7, 16, v0
+; GFX10-NEXT:    image_sample_d v[0:3], v[8:15], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D a16
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
@ -132,9 +139,14 @@ main_body:
 define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %dsdh, float %dtdh, float %dsdv, float %dtdv, half %s, half %t, half %clamp) {
 ; GFX10-LABEL: sample_d_cl_2d:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX10-NEXT:    v_lshl_or_b32 v4, v5, 16, v4
-; GFX10-NEXT:    image_sample_d_cl v[0:3], [v0, v1, v2, v3, v4, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
+; GFX10-NEXT:    v_mov_b32_e32 v11, v6
+; GFX10-NEXT:    v_mov_b32_e32 v6, v0
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v4
+; GFX10-NEXT:    v_mov_b32_e32 v9, v3
+; GFX10-NEXT:    v_mov_b32_e32 v8, v2
+; GFX10-NEXT:    v_mov_b32_e32 v7, v1
+; GFX10-NEXT:    v_lshl_or_b32 v10, v5, 16, v0
+; GFX10-NEXT:    image_sample_d_cl v[0:3], v[6:11], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
@ -177,9 +189,15 @@ main_body:
 define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, half %s, half %t, half %clamp) {
 ; GFX10-LABEL: sample_c_d_cl_2d:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; GFX10-NEXT:    v_lshl_or_b32 v5, v6, 16, v5
-; GFX10-NEXT:    image_sample_c_d_cl v[0:3], [v0, v1, v2, v3, v4, v5, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
+; GFX10-NEXT:    v_mov_b32_e32 v13, v7
+; GFX10-NEXT:    v_mov_b32_e32 v7, v0
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v5
+; GFX10-NEXT:    v_mov_b32_e32 v11, v4
+; GFX10-NEXT:    v_mov_b32_e32 v10, v3
+; GFX10-NEXT:    v_mov_b32_e32 v9, v2
+; GFX10-NEXT:    v_mov_b32_e32 v8, v1
+; GFX10-NEXT:    v_lshl_or_b32 v12, v6, 16, v0
+; GFX10-NEXT:    image_sample_c_d_cl v[0:3], v[7:13], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
@ -302,9 +320,14 @@ main_body:
 define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %dsdh, float %dtdh, float %dsdv, float %dtdv, half %s, half %t, half %clamp) {
 ; GFX10-LABEL: sample_cd_cl_2d:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX10-NEXT:    v_lshl_or_b32 v4, v5, 16, v4
-; GFX10-NEXT:    image_sample_cd_cl v[0:3], [v0, v1, v2, v3, v4, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
+; GFX10-NEXT:    v_mov_b32_e32 v11, v6
+; GFX10-NEXT:    v_mov_b32_e32 v6, v0
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v4
+; GFX10-NEXT:    v_mov_b32_e32 v9, v3
+; GFX10-NEXT:    v_mov_b32_e32 v8, v2
+; GFX10-NEXT:    v_mov_b32_e32 v7, v1
+; GFX10-NEXT:    v_lshl_or_b32 v10, v5, 16, v0
+; GFX10-NEXT:    image_sample_cd_cl v[0:3], v[6:11], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
@ -347,9 +370,15 @@ main_body:
 define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, half %s, half %t, half %clamp) {
 ; GFX10-LABEL: sample_c_cd_cl_2d:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; GFX10-NEXT:    v_lshl_or_b32 v5, v6, 16, v5
-; GFX10-NEXT:    image_sample_c_cd_cl v[0:3], [v0, v1, v2, v3, v4, v5, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
+; GFX10-NEXT:    v_mov_b32_e32 v13, v7
+; GFX10-NEXT:    v_mov_b32_e32 v7, v0
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v5
+; GFX10-NEXT:    v_mov_b32_e32 v11, v4
+; GFX10-NEXT:    v_mov_b32_e32 v10, v3
+; GFX10-NEXT:    v_mov_b32_e32 v9, v2
+; GFX10-NEXT:    v_mov_b32_e32 v8, v1
+; GFX10-NEXT:    v_lshl_or_b32 v12, v6, 16, v0
+; GFX10-NEXT:    image_sample_c_cd_cl v[0:3], v[7:13], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
@ -371,9 +400,16 @@ main_body:
 define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, half %s, half %t, half %slice) {
 ; GFX10-LABEL: sample_c_d_o_2darray_V1:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GFX10-NEXT:    v_lshl_or_b32 v6, v7, 16, v6
-; GFX10-NEXT:    image_sample_c_d_o v0, [v0, v1, v2, v3, v4, v5, v6, v8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY a16
+; GFX10-NEXT:    v_mov_b32_e32 v15, v8
+; GFX10-NEXT:    v_mov_b32_e32 v8, v0
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v6
+; GFX10-NEXT:    v_mov_b32_e32 v13, v5
+; GFX10-NEXT:    v_mov_b32_e32 v12, v4
+; GFX10-NEXT:    v_mov_b32_e32 v11, v3
+; GFX10-NEXT:    v_mov_b32_e32 v10, v2
+; GFX10-NEXT:    v_mov_b32_e32 v9, v1
+; GFX10-NEXT:    v_lshl_or_b32 v14, v7, 16, v0
+; GFX10-NEXT:    image_sample_c_d_o v0, v[8:15], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY a16
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
@ -395,9 +431,16 @@ main_body:
 define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, half %s, half %t, half %slice) {
 ; GFX10-LABEL: sample_c_d_o_2darray_V2:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GFX10-NEXT:    v_lshl_or_b32 v6, v7, 16, v6
-; GFX10-NEXT:    image_sample_c_d_o v[0:1], [v0, v1, v2, v3, v4, v5, v6, v8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY a16
+; GFX10-NEXT:    v_mov_b32_e32 v15, v8
+; GFX10-NEXT:    v_mov_b32_e32 v8, v0
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v6
+; GFX10-NEXT:    v_mov_b32_e32 v13, v5
+; GFX10-NEXT:    v_mov_b32_e32 v12, v4
+; GFX10-NEXT:    v_mov_b32_e32 v11, v3
+; GFX10-NEXT:    v_mov_b32_e32 v10, v2
+; GFX10-NEXT:    v_mov_b32_e32 v9, v1
+; GFX10-NEXT:    v_lshl_or_b32 v14, v7, 16, v0
+; GFX10-NEXT:    image_sample_c_d_o v[0:1], v[8:15], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY a16
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
@ -489,26 +532,30 @@ main_body:
 define amdgpu_ps <4 x float> @sample_g16_noa16_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %drdh, half %dsdv, half %dtdv, half %drdv, float %s, float %t, float %r) {
 ; GFX10-LABEL: sample_g16_noa16_d_3d:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v9, 0xffff
-; GFX10-NEXT:    v_and_b32_e32 v3, v9, v3
-; GFX10-NEXT:    v_and_b32_e32 v0, v9, v0
-; GFX10-NEXT:    v_lshl_or_b32 v3, v4, 16, v3
-; GFX10-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX10-NEXT:    image_sample_d_g16 v[0:3], [v0, v2, v3, v5, v6, v7, v8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D
+; GFX10-NEXT:    v_mov_b32_e32 v9, v3
+; GFX10-NEXT:    v_mov_b32_e32 v3, v2
+; GFX10-NEXT:    v_mov_b32_e32 v2, 0xffff
+; GFX10-NEXT:    v_and_b32_e32 v9, v2, v9
+; GFX10-NEXT:    v_and_b32_e32 v0, v2, v0
+; GFX10-NEXT:    v_lshl_or_b32 v4, v4, 16, v9
+; GFX10-NEXT:    v_lshl_or_b32 v2, v1, 16, v0
+; GFX10-NEXT:    image_sample_d_g16 v[0:3], v[2:8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX10GISEL-LABEL: sample_g16_noa16_d_3d:
 ; GFX10GISEL:       ; %bb.0: ; %main_body
-; GFX10GISEL-NEXT:    v_mov_b32_e32 v9, 0xffff
-; GFX10GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v9, v2
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v10, v3
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v11, 0xffff
 ; GFX10GISEL-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX10GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10GISEL-NEXT:    s_lshl_b32 s12, s0, 16
-; GFX10GISEL-NEXT:    v_and_or_b32 v0, v0, v9, v1
-; GFX10GISEL-NEXT:    v_and_or_b32 v1, v2, v9, s12
-; GFX10GISEL-NEXT:    v_and_or_b32 v2, v3, v9, v4
-; GFX10GISEL-NEXT:    v_and_or_b32 v3, v5, v9, s12
-; GFX10GISEL-NEXT:    image_sample_d_g16 v[0:3], [v0, v1, v2, v3, v6, v7, v8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D
+; GFX10GISEL-NEXT:    v_and_or_b32 v3, v9, v11, s12
+; GFX10GISEL-NEXT:    v_and_or_b32 v4, v10, v11, v4
+; GFX10GISEL-NEXT:    v_and_or_b32 v2, v0, v11, v1
+; GFX10GISEL-NEXT:    v_and_or_b32 v5, v5, v11, s12
+; GFX10GISEL-NEXT:    image_sample_d_g16 v[0:3], v[2:8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D
 ; GFX10GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10GISEL-NEXT:    ; return to shader part epilog
 main_body:
@ -636,23 +683,28 @@ main_body:
 define amdgpu_ps <4 x float> @sample_g16_noa16_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) {
 ; GFX10-LABEL: sample_g16_noa16_c_d_cl_2d:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v8, 0xffff
-; GFX10-NEXT:    v_and_b32_e32 v3, v8, v3
-; GFX10-NEXT:    v_and_b32_e32 v1, v8, v1
-; GFX10-NEXT:    v_lshl_or_b32 v3, v4, 16, v3
-; GFX10-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
-; GFX10-NEXT:    image_sample_c_d_cl_g16 v[0:3], [v0, v1, v3, v5, v6, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX10-NEXT:    v_mov_b32_e32 v8, v2
+; GFX10-NEXT:    v_mov_b32_e32 v2, v0
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0xffff
+; GFX10-NEXT:    v_and_b32_e32 v3, v0, v3
+; GFX10-NEXT:    v_and_b32_e32 v0, v0, v1
+; GFX10-NEXT:    v_lshl_or_b32 v4, v4, 16, v3
+; GFX10-NEXT:    v_lshl_or_b32 v3, v8, 16, v0
+; GFX10-NEXT:    image_sample_c_d_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX10GISEL-LABEL: sample_g16_noa16_c_d_cl_2d:
 ; GFX10GISEL:       ; %bb.0: ; %main_body
-; GFX10GISEL-NEXT:    v_mov_b32_e32 v8, 0xffff
-; GFX10GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v8, v2
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v9, v3
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v2, v0
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v0, 0xffff
 ; GFX10GISEL-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX10GISEL-NEXT:    v_and_or_b32 v1, v1, v8, v2
-; GFX10GISEL-NEXT:    v_and_or_b32 v2, v3, v8, v4
-; GFX10GISEL-NEXT:    image_sample_c_d_cl_g16 v[0:3], [v0, v1, v2, v5, v6, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX10GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v8
+; GFX10GISEL-NEXT:    v_and_or_b32 v4, v9, v0, v4
+; GFX10GISEL-NEXT:    v_and_or_b32 v3, v1, v0, v3
+; GFX10GISEL-NEXT:    image_sample_c_d_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
 ; GFX10GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10GISEL-NEXT:    ; return to shader part epilog
 main_body:
@ -828,23 +880,28 @@ main_body:
 define amdgpu_ps <4 x float> @sample_g16_noa16_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) {
 ; GFX10-LABEL: sample_g16_noa16_c_cd_cl_2d:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v8, 0xffff
-; GFX10-NEXT:    v_and_b32_e32 v3, v8, v3
-; GFX10-NEXT:    v_and_b32_e32 v1, v8, v1
-; GFX10-NEXT:    v_lshl_or_b32 v3, v4, 16, v3
-; GFX10-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
-; GFX10-NEXT:    image_sample_c_cd_cl_g16 v[0:3], [v0, v1, v3, v5, v6, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX10-NEXT:    v_mov_b32_e32 v8, v2
+; GFX10-NEXT:    v_mov_b32_e32 v2, v0
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0xffff
+; GFX10-NEXT:    v_and_b32_e32 v3, v0, v3
+; GFX10-NEXT:    v_and_b32_e32 v0, v0, v1
+; GFX10-NEXT:    v_lshl_or_b32 v4, v4, 16, v3
+; GFX10-NEXT:    v_lshl_or_b32 v3, v8, 16, v0
+; GFX10-NEXT:    image_sample_c_cd_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX10GISEL-LABEL: sample_g16_noa16_c_cd_cl_2d:
 ; GFX10GISEL:       ; %bb.0: ; %main_body
-; GFX10GISEL-NEXT:    v_mov_b32_e32 v8, 0xffff
-; GFX10GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v8, v2
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v9, v3
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v2, v0
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v0, 0xffff
 ; GFX10GISEL-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX10GISEL-NEXT:    v_and_or_b32 v1, v1, v8, v2
-; GFX10GISEL-NEXT:    v_and_or_b32 v2, v3, v8, v4
-; GFX10GISEL-NEXT:    image_sample_c_cd_cl_g16 v[0:3], [v0, v1, v2, v5, v6, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX10GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v8
+; GFX10GISEL-NEXT:    v_and_or_b32 v4, v9, v0, v4
+; GFX10GISEL-NEXT:    v_and_or_b32 v3, v1, v0, v3
+; GFX10GISEL-NEXT:    image_sample_c_cd_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
 ; GFX10GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10GISEL-NEXT:    ; return to shader part epilog
 main_body:
@ -855,23 +912,32 @@ main_body:
 define amdgpu_ps float @sample_g16_noa16_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice) {
 ; GFX10-LABEL: sample_g16_noa16_c_d_o_2darray_V1:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v9, 0xffff
-; GFX10-NEXT:    v_and_b32_e32 v4, v9, v4
-; GFX10-NEXT:    v_and_b32_e32 v2, v9, v2
-; GFX10-NEXT:    v_lshl_or_b32 v4, v5, 16, v4
-; GFX10-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
-; GFX10-NEXT:    image_sample_c_d_o_g16 v0, [v0, v1, v2, v4, v6, v7, v8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY
+; GFX10-NEXT:    v_mov_b32_e32 v9, v2
+; GFX10-NEXT:    v_mov_b32_e32 v2, v0
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0xffff
+; GFX10-NEXT:    v_mov_b32_e32 v10, v3
+; GFX10-NEXT:    v_mov_b32_e32 v3, v1
+; GFX10-NEXT:    v_and_b32_e32 v1, v0, v4
+; GFX10-NEXT:    v_and_b32_e32 v0, v0, v9
+; GFX10-NEXT:    v_lshl_or_b32 v5, v5, 16, v1
+; GFX10-NEXT:    v_lshl_or_b32 v4, v10, 16, v0
+; GFX10-NEXT:    image_sample_c_d_o_g16 v0, v[2:8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX10GISEL-LABEL: sample_g16_noa16_c_d_o_2darray_V1:
 ; GFX10GISEL:       ; %bb.0: ; %main_body
-; GFX10GISEL-NEXT:    v_mov_b32_e32 v9, 0xffff
-; GFX10GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v9, v3
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v10, v2
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v2, v0
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v3, v1
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v11, v4
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v0, 0xffff
 ; GFX10GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX10GISEL-NEXT:    v_and_or_b32 v2, v2, v9, v3
-; GFX10GISEL-NEXT:    v_and_or_b32 v3, v4, v9, v5
-; GFX10GISEL-NEXT:    image_sample_c_d_o_g16 v0, [v0, v1, v2, v3, v6, v7, v8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY
+; GFX10GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v9
+; GFX10GISEL-NEXT:    v_and_or_b32 v5, v11, v0, v5
+; GFX10GISEL-NEXT:    v_and_or_b32 v4, v10, v0, v1
+; GFX10GISEL-NEXT:    image_sample_c_d_o_g16 v0, v[2:8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY
 ; GFX10GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10GISEL-NEXT:    ; return to shader part epilog
 main_body:
@ -882,23 +948,32 @@ main_body:
 define amdgpu_ps <2 x float> @sample_g16_noa16_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice) {
 ; GFX10-LABEL: sample_g16_noa16_c_d_o_2darray_V2:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v9, 0xffff
-; GFX10-NEXT:    v_and_b32_e32 v4, v9, v4
-; GFX10-NEXT:    v_and_b32_e32 v2, v9, v2
-; GFX10-NEXT:    v_lshl_or_b32 v4, v5, 16, v4
-; GFX10-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
-; GFX10-NEXT:    image_sample_c_d_o_g16 v[0:1], [v0, v1, v2, v4, v6, v7, v8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY
+; GFX10-NEXT:    v_mov_b32_e32 v9, v2
+; GFX10-NEXT:    v_mov_b32_e32 v2, v0
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0xffff
+; GFX10-NEXT:    v_mov_b32_e32 v10, v3
+; GFX10-NEXT:    v_mov_b32_e32 v3, v1
+; GFX10-NEXT:    v_and_b32_e32 v1, v0, v4
+; GFX10-NEXT:    v_and_b32_e32 v0, v0, v9
+; GFX10-NEXT:    v_lshl_or_b32 v5, v5, 16, v1
+; GFX10-NEXT:    v_lshl_or_b32 v4, v10, 16, v0
+; GFX10-NEXT:    image_sample_c_d_o_g16 v[0:1], v[2:8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX10GISEL-LABEL: sample_g16_noa16_c_d_o_2darray_V2:
 ; GFX10GISEL:       ; %bb.0: ; %main_body
-; GFX10GISEL-NEXT:    v_mov_b32_e32 v9, 0xffff
-; GFX10GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v9, v3
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v10, v2
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v2, v0
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v3, v1
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v11, v4
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v0, 0xffff
 ; GFX10GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX10GISEL-NEXT:    v_and_or_b32 v2, v2, v9, v3
-; GFX10GISEL-NEXT:    v_and_or_b32 v3, v4, v9, v5
-; GFX10GISEL-NEXT:    image_sample_c_d_o_g16 v[0:1], [v0, v1, v2, v3, v6, v7, v8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY
+; GFX10GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v9
+; GFX10GISEL-NEXT:    v_and_or_b32 v5, v11, v0, v5
+; GFX10GISEL-NEXT:    v_and_or_b32 v4, v10, v0, v1
+; GFX10GISEL-NEXT:    image_sample_c_d_o_g16 v[0:1], v[2:8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY
 ; GFX10GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10GISEL-NEXT:    ; return to shader part epilog
 main_body:
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll
@ -31,12 +31,14 @@ main_body:
 define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %drdh, half %dsdv, half %dtdv, half %drdv, float %s, float %t, float %r) {
 ; GFX10-LABEL: sample_d_3d:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v9, 0xffff ; encoding: [0xff,0x02,0x12,0x7e,0xff,0xff,0x00,0x00]
-; GFX10-NEXT:    v_and_b32_e32 v3, v9, v3 ; encoding: [0x09,0x07,0x06,0x36]
-; GFX10-NEXT:    v_and_b32_e32 v0, v9, v0 ; encoding: [0x09,0x01,0x00,0x36]
-; GFX10-NEXT:    v_lshl_or_b32 v3, v4, 16, v3 ; encoding: [0x03,0x00,0x6f,0xd7,0x04,0x21,0x0d,0x04]
-; GFX10-NEXT:    v_lshl_or_b32 v0, v1, 16, v0 ; encoding: [0x00,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04]
-; GFX10-NEXT:    image_sample_d_g16 v[0:3], [v0, v2, v3, v5, v6, v7, v8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D ; encoding: [0x15,0x0f,0x88,0xf0,0x00,0x00,0x40,0x00,0x02,0x03,0x05,0x06,0x07,0x08,0x00,0x00]
+; GFX10-NEXT:    v_mov_b32_e32 v9, v3 ; encoding: [0x03,0x03,0x12,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v3, v2 ; encoding: [0x02,0x03,0x06,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v2, 0xffff ; encoding: [0xff,0x02,0x04,0x7e,0xff,0xff,0x00,0x00]
+; GFX10-NEXT:    v_and_b32_e32 v9, v2, v9 ; encoding: [0x02,0x13,0x12,0x36]
+; GFX10-NEXT:    v_and_b32_e32 v0, v2, v0 ; encoding: [0x02,0x01,0x00,0x36]
+; GFX10-NEXT:    v_lshl_or_b32 v4, v4, 16, v9 ; encoding: [0x04,0x00,0x6f,0xd7,0x04,0x21,0x25,0x04]
+; GFX10-NEXT:    v_lshl_or_b32 v2, v1, 16, v0 ; encoding: [0x02,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04]
+; GFX10-NEXT:    image_sample_d_g16 v[0:3], v[2:8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D ; encoding: [0x11,0x0f,0x88,0xf0,0x02,0x00,0x40,0x00]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
 ; GFX10-NEXT:    ; return to shader part epilog
 main_body:
@ -112,12 +114,14 @@ main_body:
 define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) {
 ; GFX10-LABEL: sample_c_d_cl_2d:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v8, 0xffff ; encoding: [0xff,0x02,0x10,0x7e,0xff,0xff,0x00,0x00]
-; GFX10-NEXT:    v_and_b32_e32 v3, v8, v3 ; encoding: [0x08,0x07,0x06,0x36]
-; GFX10-NEXT:    v_and_b32_e32 v1, v8, v1 ; encoding: [0x08,0x03,0x02,0x36]
-; GFX10-NEXT:    v_lshl_or_b32 v3, v4, 16, v3 ; encoding: [0x03,0x00,0x6f,0xd7,0x04,0x21,0x0d,0x04]
-; GFX10-NEXT:    v_lshl_or_b32 v1, v2, 16, v1 ; encoding: [0x01,0x00,0x6f,0xd7,0x02,0x21,0x05,0x04]
-; GFX10-NEXT:    image_sample_c_d_cl_g16 v[0:3], [v0, v1, v3, v5, v6, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0d,0x0f,0xac,0xf0,0x00,0x00,0x40,0x00,0x01,0x03,0x05,0x06,0x07,0x00,0x00,0x00]
+; GFX10-NEXT:    v_mov_b32_e32 v8, v2 ; encoding: [0x02,0x03,0x10,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0xffff ; encoding: [0xff,0x02,0x00,0x7e,0xff,0xff,0x00,0x00]
+; GFX10-NEXT:    v_and_b32_e32 v3, v0, v3 ; encoding: [0x00,0x07,0x06,0x36]
+; GFX10-NEXT:    v_and_b32_e32 v0, v0, v1 ; encoding: [0x00,0x03,0x00,0x36]
+; GFX10-NEXT:    v_lshl_or_b32 v4, v4, 16, v3 ; encoding: [0x04,0x00,0x6f,0xd7,0x04,0x21,0x0d,0x04]
+; GFX10-NEXT:    v_lshl_or_b32 v3, v8, 16, v0 ; encoding: [0x03,0x00,0x6f,0xd7,0x08,0x21,0x01,0x04]
+; GFX10-NEXT:    image_sample_c_d_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x09,0x0f,0xac,0xf0,0x02,0x00,0x40,0x00]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
 ; GFX10-NEXT:    ; return to shader part epilog
 main_body:
@ -220,12 +224,14 @@ main_body:
 define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) {
 ; GFX10-LABEL: sample_c_cd_cl_2d:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v8, 0xffff ; encoding: [0xff,0x02,0x10,0x7e,0xff,0xff,0x00,0x00]
-; GFX10-NEXT:    v_and_b32_e32 v3, v8, v3 ; encoding: [0x08,0x07,0x06,0x36]
-; GFX10-NEXT:    v_and_b32_e32 v1, v8, v1 ; encoding: [0x08,0x03,0x02,0x36]
-; GFX10-NEXT:    v_lshl_or_b32 v3, v4, 16, v3 ; encoding: [0x03,0x00,0x6f,0xd7,0x04,0x21,0x0d,0x04]
-; GFX10-NEXT:    v_lshl_or_b32 v1, v2, 16, v1 ; encoding: [0x01,0x00,0x6f,0xd7,0x02,0x21,0x05,0x04]
-; GFX10-NEXT:    image_sample_c_cd_cl_g16 v[0:3], [v0, v1, v3, v5, v6, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0d,0x0f,0xac,0xf1,0x00,0x00,0x40,0x00,0x01,0x03,0x05,0x06,0x07,0x00,0x00,0x00]
+; GFX10-NEXT:    v_mov_b32_e32 v8, v2 ; encoding: [0x02,0x03,0x10,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0xffff ; encoding: [0xff,0x02,0x00,0x7e,0xff,0xff,0x00,0x00]
+; GFX10-NEXT:    v_and_b32_e32 v3, v0, v3 ; encoding: [0x00,0x07,0x06,0x36]
+; GFX10-NEXT:    v_and_b32_e32 v0, v0, v1 ; encoding: [0x00,0x03,0x00,0x36]
+; GFX10-NEXT:    v_lshl_or_b32 v4, v4, 16, v3 ; encoding: [0x04,0x00,0x6f,0xd7,0x04,0x21,0x0d,0x04]
+; GFX10-NEXT:    v_lshl_or_b32 v3, v8, 16, v0 ; encoding: [0x03,0x00,0x6f,0xd7,0x08,0x21,0x01,0x04]
+; GFX10-NEXT:    image_sample_c_cd_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x09,0x0f,0xac,0xf1,0x02,0x00,0x40,0x00]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
 ; GFX10-NEXT:    ; return to shader part epilog
 main_body:
@ -236,12 +242,16 @@ main_body:
 define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice) {
 ; GFX10-LABEL: sample_c_d_o_2darray_V1:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v9, 0xffff ; encoding: [0xff,0x02,0x12,0x7e,0xff,0xff,0x00,0x00]
-; GFX10-NEXT:    v_and_b32_e32 v4, v9, v4 ; encoding: [0x09,0x09,0x08,0x36]
-; GFX10-NEXT:    v_and_b32_e32 v2, v9, v2 ; encoding: [0x09,0x05,0x04,0x36]
-; GFX10-NEXT:    v_lshl_or_b32 v4, v5, 16, v4 ; encoding: [0x04,0x00,0x6f,0xd7,0x05,0x21,0x11,0x04]
-; GFX10-NEXT:    v_lshl_or_b32 v2, v3, 16, v2 ; encoding: [0x02,0x00,0x6f,0xd7,0x03,0x21,0x09,0x04]
-; GFX10-NEXT:    image_sample_c_d_o_g16 v0, [v0, v1, v2, v4, v6, v7, v8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY ; encoding: [0x2d,0x04,0xe8,0xf0,0x00,0x00,0x40,0x00,0x01,0x02,0x04,0x06,0x07,0x08,0x00,0x00]
+; GFX10-NEXT:    v_mov_b32_e32 v9, v2 ; encoding: [0x02,0x03,0x12,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0xffff ; encoding: [0xff,0x02,0x00,0x7e,0xff,0xff,0x00,0x00]
+; GFX10-NEXT:    v_mov_b32_e32 v10, v3 ; encoding: [0x03,0x03,0x14,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v3, v1 ; encoding: [0x01,0x03,0x06,0x7e]
+; GFX10-NEXT:    v_and_b32_e32 v1, v0, v4 ; encoding: [0x00,0x09,0x02,0x36]
+; GFX10-NEXT:    v_and_b32_e32 v0, v0, v9 ; encoding: [0x00,0x13,0x00,0x36]
+; GFX10-NEXT:    v_lshl_or_b32 v5, v5, 16, v1 ; encoding: [0x05,0x00,0x6f,0xd7,0x05,0x21,0x05,0x04]
+; GFX10-NEXT:    v_lshl_or_b32 v4, v10, 16, v0 ; encoding: [0x04,0x00,0x6f,0xd7,0x0a,0x21,0x01,0x04]
+; GFX10-NEXT:    image_sample_c_d_o_g16 v0, v[2:8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY ; encoding: [0x29,0x04,0xe8,0xf0,0x02,0x00,0x40,0x00]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
 ; GFX10-NEXT:    ; return to shader part epilog
 main_body:
@ -252,12 +262,16 @@ main_body:
 define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice) {
 ; GFX10-LABEL: sample_c_d_o_2darray_V2:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v9, 0xffff ; encoding: [0xff,0x02,0x12,0x7e,0xff,0xff,0x00,0x00]
-; GFX10-NEXT:    v_and_b32_e32 v4, v9, v4 ; encoding: [0x09,0x09,0x08,0x36]
-; GFX10-NEXT:    v_and_b32_e32 v2, v9, v2 ; encoding: [0x09,0x05,0x04,0x36]
-; GFX10-NEXT:    v_lshl_or_b32 v4, v5, 16, v4 ; encoding: [0x04,0x00,0x6f,0xd7,0x05,0x21,0x11,0x04]
-; GFX10-NEXT:    v_lshl_or_b32 v2, v3, 16, v2 ; encoding: [0x02,0x00,0x6f,0xd7,0x03,0x21,0x09,0x04]
-; GFX10-NEXT:    image_sample_c_d_o_g16 v[0:1], [v0, v1, v2, v4, v6, v7, v8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY ; encoding: [0x2d,0x06,0xe8,0xf0,0x00,0x00,0x40,0x00,0x01,0x02,0x04,0x06,0x07,0x08,0x00,0x00]
+; GFX10-NEXT:    v_mov_b32_e32 v9, v2 ; encoding: [0x02,0x03,0x12,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0xffff ; encoding: [0xff,0x02,0x00,0x7e,0xff,0xff,0x00,0x00]
+; GFX10-NEXT:    v_mov_b32_e32 v10, v3 ; encoding: [0x03,0x03,0x14,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v3, v1 ; encoding: [0x01,0x03,0x06,0x7e]
+; GFX10-NEXT:    v_and_b32_e32 v1, v0, v4 ; encoding: [0x00,0x09,0x02,0x36]
+; GFX10-NEXT:    v_and_b32_e32 v0, v0, v9 ; encoding: [0x00,0x13,0x00,0x36]
+; GFX10-NEXT:    v_lshl_or_b32 v5, v5, 16, v1 ; encoding: [0x05,0x00,0x6f,0xd7,0x05,0x21,0x05,0x04]
+; GFX10-NEXT:    v_lshl_or_b32 v4, v10, 16, v0 ; encoding: [0x04,0x00,0x6f,0xd7,0x0a,0x21,0x01,0x04]
+; GFX10-NEXT:    image_sample_c_d_o_g16 v[0:1], v[2:8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY ; encoding: [0x29,0x06,0xe8,0xf0,0x02,0x00,0x40,0x00]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
 ; GFX10-NEXT:    ; return to shader part epilog
 main_body:
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll
@ -31,12 +31,14 @@ main_body:
 define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %drdh, half %dsdv, half %dtdv, half %drdv, float %s, float %t, float %r) {
 ; GFX10-LABEL: sample_d_3d:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v9, 0xffff
-; GFX10-NEXT:    v_and_b32_e32 v3, v9, v3
-; GFX10-NEXT:    v_and_b32_e32 v0, v9, v0
-; GFX10-NEXT:    v_lshl_or_b32 v3, v4, 16, v3
-; GFX10-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX10-NEXT:    image_sample_d_g16 v[0:3], [v0, v2, v3, v5, v6, v7, v8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D
+; GFX10-NEXT:    v_mov_b32_e32 v9, v3
+; GFX10-NEXT:    v_mov_b32_e32 v3, v2
+; GFX10-NEXT:    v_mov_b32_e32 v2, 0xffff
+; GFX10-NEXT:    v_and_b32_e32 v9, v2, v9
+; GFX10-NEXT:    v_and_b32_e32 v0, v2, v0
+; GFX10-NEXT:    v_lshl_or_b32 v4, v4, 16, v9
+; GFX10-NEXT:    v_lshl_or_b32 v2, v1, 16, v0
+; GFX10-NEXT:    image_sample_d_g16 v[0:3], v[2:8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 main_body:
@ -112,12 +114,14 @@ main_body:
 define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) {
 ; GFX10-LABEL: sample_c_d_cl_2d:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v8, 0xffff
-; GFX10-NEXT:    v_and_b32_e32 v3, v8, v3
-; GFX10-NEXT:    v_and_b32_e32 v1, v8, v1
-; GFX10-NEXT:    v_lshl_or_b32 v3, v4, 16, v3
-; GFX10-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
-; GFX10-NEXT:    image_sample_c_d_cl_g16 v[0:3], [v0, v1, v3, v5, v6, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX10-NEXT:    v_mov_b32_e32 v8, v2
+; GFX10-NEXT:    v_mov_b32_e32 v2, v0
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0xffff
+; GFX10-NEXT:    v_and_b32_e32 v3, v0, v3
+; GFX10-NEXT:    v_and_b32_e32 v0, v0, v1
+; GFX10-NEXT:    v_lshl_or_b32 v4, v4, 16, v3
+; GFX10-NEXT:    v_lshl_or_b32 v3, v8, 16, v0
+; GFX10-NEXT:    image_sample_c_d_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 main_body:
@ -220,12 +224,14 @@ main_body:
 define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) {
 ; GFX10-LABEL: sample_c_cd_cl_2d:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v8, 0xffff
-; GFX10-NEXT:    v_and_b32_e32 v3, v8, v3
-; GFX10-NEXT:    v_and_b32_e32 v1, v8, v1
-; GFX10-NEXT:    v_lshl_or_b32 v3, v4, 16, v3
-; GFX10-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
-; GFX10-NEXT:    image_sample_c_cd_cl_g16 v[0:3], [v0, v1, v3, v5, v6, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX10-NEXT:    v_mov_b32_e32 v8, v2
+; GFX10-NEXT:    v_mov_b32_e32 v2, v0
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0xffff
+; GFX10-NEXT:    v_and_b32_e32 v3, v0, v3
+; GFX10-NEXT:    v_and_b32_e32 v0, v0, v1
+; GFX10-NEXT:    v_lshl_or_b32 v4, v4, 16, v3
+; GFX10-NEXT:    v_lshl_or_b32 v3, v8, 16, v0
+; GFX10-NEXT:    image_sample_c_cd_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 main_body:
@ -236,12 +242,16 @@ main_body:
 define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice) {
 ; GFX10-LABEL: sample_c_d_o_2darray_V1:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v9, 0xffff
-; GFX10-NEXT:    v_and_b32_e32 v4, v9, v4
-; GFX10-NEXT:    v_and_b32_e32 v2, v9, v2
-; GFX10-NEXT:    v_lshl_or_b32 v4, v5, 16, v4
-; GFX10-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
-; GFX10-NEXT:    image_sample_c_d_o_g16 v0, [v0, v1, v2, v4, v6, v7, v8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY
+; GFX10-NEXT:    v_mov_b32_e32 v9, v2
+; GFX10-NEXT:    v_mov_b32_e32 v2, v0
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0xffff
+; GFX10-NEXT:    v_mov_b32_e32 v10, v3
+; GFX10-NEXT:    v_mov_b32_e32 v3, v1
+; GFX10-NEXT:    v_and_b32_e32 v1, v0, v4
+; GFX10-NEXT:    v_and_b32_e32 v0, v0, v9
+; GFX10-NEXT:    v_lshl_or_b32 v5, v5, 16, v1
+; GFX10-NEXT:    v_lshl_or_b32 v4, v10, 16, v0
+; GFX10-NEXT:    image_sample_c_d_o_g16 v0, v[2:8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 main_body:
@ -252,12 +262,16 @@ main_body:
 define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice) {
 ; GFX10-LABEL: sample_c_d_o_2darray_V2:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v9, 0xffff
-; GFX10-NEXT:    v_and_b32_e32 v4, v9, v4
-; GFX10-NEXT:    v_and_b32_e32 v2, v9, v2
-; GFX10-NEXT:    v_lshl_or_b32 v4, v5, 16, v4
-; GFX10-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
-; GFX10-NEXT:    image_sample_c_d_o_g16 v[0:1], [v0, v1, v2, v4, v6, v7, v8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY
+; GFX10-NEXT:    v_mov_b32_e32 v9, v2
+; GFX10-NEXT:    v_mov_b32_e32 v2, v0
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0xffff
+; GFX10-NEXT:    v_mov_b32_e32 v10, v3
+; GFX10-NEXT:    v_mov_b32_e32 v3, v1
+; GFX10-NEXT:    v_and_b32_e32 v1, v0, v4
+; GFX10-NEXT:    v_and_b32_e32 v0, v0, v9
+; GFX10-NEXT:    v_lshl_or_b32 v5, v5, 16, v1
+; GFX10-NEXT:    v_lshl_or_b32 v4, v10, 16, v0
+; GFX10-NEXT:    image_sample_c_d_o_g16 v[0:1], v[2:8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 main_body: