
[GlobalISel] Add X,Y<dead> = G_UNMERGE Z -> X = G_TRUNC Z

Add a combiner helper that rewrites a G_UNMERGE, where every destination lane
except the first is dead, into a G_TRUNC of the source.
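For illustration, a minimal before/after sketch in generic MIR (the register
names are invented for this sketch, not taken from the patch):

# Before: only %1 is used; %2 is dead.
%1:_(s32), %2:_(s32) = G_UNMERGE_VALUES %0(s64)
# After the combine:
%1:_(s32) = G_TRUNC %0(s64)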

Differential Revision: https://reviews.llvm.org/D87174
Quentin Colombet 2020-09-04 17:09:38 -07:00
parent a1f6a63fbc
commit b9fe935943
9 changed files with 1581 additions and 100 deletions

@@ -258,6 +258,10 @@ public:
bool applyCombineUnmergeConstant(MachineInstr &MI,
SmallVectorImpl<APInt> &Csts);
/// Transform X, Y<dead> = G_UNMERGE Z -> X = G_TRUNC Z.
bool matchCombineUnmergeWithDeadLanesToTrunc(MachineInstr &MI);
bool applyCombineUnmergeWithDeadLanesToTrunc(MachineInstr &MI);
/// Transform IntToPtr(PtrToInt(x)) to x if cast is in the same address space.
bool matchCombineI2PToP2I(MachineInstr &MI, Register &Reg);
bool applyCombineI2PToP2I(MachineInstr &MI, Register &Reg);

@@ -421,6 +421,14 @@ def unmerge_cst : GICombineRule<
(apply [{ return Helper.applyCombineUnmergeConstant(*${d}, ${info}); }])
>;
// Transform x,y<dead> = unmerge z -> x = trunc z.
def unmerge_dead_to_trunc : GICombineRule<
(defs root:$d),
(match (wip_match_opcode G_UNMERGE_VALUES): $d,
[{ return Helper.matchCombineUnmergeWithDeadLanesToTrunc(*${d}); }]),
(apply [{ return Helper.applyCombineUnmergeWithDeadLanesToTrunc(*${d}); }])
>;
// FIXME: These should use the custom predicate feature once it lands.
def undef_combines : GICombineGroup<[undef_to_fp_zero, undef_to_int_zero,
undef_to_negative_one,
@@ -452,4 +460,4 @@ def all_combines : GICombineGroup<[trivial_combines, ptr_add_immed_chain,
width_reduction_combines, select_combines,
known_bits_simplifications, ext_ext_fold,
not_cmp_fold, opt_brcond_by_inverting_cond,
unmerge_merge, fabs_fabs_fold, unmerge_cst]>;
unmerge_merge, fabs_fabs_fold, unmerge_cst, unmerge_dead_to_trunc]>;

@@ -1654,6 +1654,39 @@ bool CombinerHelper::applyCombineUnmergeConstant(MachineInstr &MI,
return true;
}
bool CombinerHelper::matchCombineUnmergeWithDeadLanesToTrunc(MachineInstr &MI) {
  assert(MI.getOpcode() == TargetOpcode::G_UNMERGE_VALUES &&
         "Expected an unmerge");
  // Check that all the lanes are dead except the first one.
  for (unsigned Idx = 1, EndIdx = MI.getNumDefs(); Idx != EndIdx; ++Idx) {
    if (!MRI.use_nodbg_empty(MI.getOperand(Idx).getReg()))
      return false;
  }
  return true;
}

bool CombinerHelper::applyCombineUnmergeWithDeadLanesToTrunc(MachineInstr &MI) {
  Builder.setInstrAndDebugLoc(MI);
  Register SrcReg = MI.getOperand(MI.getNumDefs()).getReg();
  // Truncating a vector is going to truncate every single lane,
  // whereas we want the full lowbits.
  // Do the operation on a scalar instead.
  LLT SrcTy = MRI.getType(SrcReg);
  if (SrcTy.isVector())
    SrcReg =
        Builder.buildCast(LLT::scalar(SrcTy.getSizeInBits()), SrcReg).getReg(0);
  Register Dst0Reg = MI.getOperand(0).getReg();
  LLT Dst0Ty = MRI.getType(Dst0Reg);
  if (Dst0Ty.isVector()) {
    auto MIB = Builder.buildTrunc(LLT::scalar(Dst0Ty.getSizeInBits()), SrcReg);
    Builder.buildCast(Dst0Reg, MIB);
  } else
    Builder.buildTrunc(Dst0Reg, SrcReg);
  MI.eraseFromParent();
  return true;
}
bool CombinerHelper::matchCombineShiftToUnmerge(MachineInstr &MI,
unsigned TargetShiftSize,
unsigned &ShiftVal) {
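To make the scalar detour above concrete, here is a sketch of the vector-source
case (invented register names, matching the pattern exercised by the MIR tests
below): an unmerge of a <2 x s32> value whose only live piece is the first s16
becomes a bitcast to s64 followed by a trunc.

# Before: only %1 is live.
%1:_(s16), %2:_(s16), %3:_(s16), %4:_(s16) = G_UNMERGE_VALUES %0(<2 x s32>)
# After: cast the source to a scalar, then truncate.
%5:_(s64) = G_BITCAST %0(<2 x s32>)
%1:_(s16) = G_TRUNC %5(s64)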

@@ -292,3 +292,80 @@ body: |
$h2 = COPY %3(s16)
$h3 = COPY %4(s16)
...
# Transform unmerge into trunc when only the first definition is live.
---
name: test_combine_unmerge_dead_to_trunc
body: |
bb.1:
; CHECK-LABEL: name: test_combine_unmerge_dead_to_trunc
; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s64)
; CHECK: $h0 = COPY [[TRUNC]](s16)
%0:_(s64) = COPY $x0
%1:_(s16),%2:_(s16),%3:_(s16),%4:_(s16) = G_UNMERGE_VALUES %0(s64)
$h0 = COPY %1(s16)
...
# Don't transform unmerge into trunc when middle lanes are live.
---
name: test_dont_combine_unmerge_dead_to_trunc
body: |
bb.1:
; CHECK-LABEL: name: test_dont_combine_unmerge_dead_to_trunc
; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
; CHECK: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16), [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY]](s64)
; CHECK: $h0 = COPY [[UV2]](s16)
%0:_(s64) = COPY $x0
%1:_(s16),%2:_(s16),%3:_(s16),%4:_(s16) = G_UNMERGE_VALUES %0(s64)
$h0 = COPY %3(s16)
...
# Transform unmerge into trunc when only the first definition is live, even
# if the input and output types are vectors.
---
name: test_combine_unmerge_dead_to_trunc_vec_in_n_out
body: |
bb.1:
; CHECK-LABEL: name: test_combine_unmerge_dead_to_trunc_vec_in_n_out
; CHECK: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $x0
; CHECK: [[BITCAST:%[0-9]+]]:_(s64) = G_BITCAST [[COPY]](<2 x s32>)
; CHECK: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[BITCAST]](s64)
; CHECK: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[TRUNC]](s32)
; CHECK: $w0 = COPY [[BITCAST1]](<2 x s16>)
%0:_(<2 x s32>) = COPY $x0
%1:_(<2 x s16>),%2:_(<2 x s16>) = G_UNMERGE_VALUES %0(<2 x s32>)
$w0 = COPY %1(<2 x s16>)
...
# Transform unmerge into trunc when only the first definition is live, even
# if the input type is a vector.
---
name: test_combine_unmerge_dead_to_trunc_vec_in
body: |
bb.1:
; CHECK-LABEL: name: test_combine_unmerge_dead_to_trunc_vec_in
; CHECK: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $x0
; CHECK: [[BITCAST:%[0-9]+]]:_(s64) = G_BITCAST [[COPY]](<2 x s32>)
; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s64)
; CHECK: $h0 = COPY [[TRUNC]](s16)
%0:_(<2 x s32>) = COPY $x0
%1:_(s16),%2:_(s16),%3:_(s16),%4:_(s16) = G_UNMERGE_VALUES %0(<2 x s32>)
$h0 = COPY %1(s16)
...
# Transform unmerge into trunc when only the first definition is live, even
# if the output type is a vector.
---
name: test_combine_unmerge_dead_to_trunc_vec_out
body: |
bb.1:
; CHECK-LABEL: name: test_combine_unmerge_dead_to_trunc_vec_out
; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
; CHECK: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
; CHECK: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[TRUNC]](s32)
; CHECK: $w0 = COPY [[BITCAST]](<2 x s16>)
%0:_(s64) = COPY $x0
%1:_(<2 x s16>),%2:_(<2 x s16>) = G_UNMERGE_VALUES %0(s64)
$w0 = COPY %1(<2 x s16>)
...

@@ -12,9 +12,9 @@ body: |
; CHECK-LABEL: name: narrow_shl_s64_32_s64amt
; CHECK: liveins: $vgpr0_vgpr1
; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
; CHECK: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
; CHECK: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C]](s32), [[UV]](s32)
; CHECK: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C]](s32), [[TRUNC]](s32)
; CHECK: $vgpr0_vgpr1 = COPY [[MV]](s64)
%0:_(s64) = COPY $vgpr0_vgpr1
%1:_(s64) = G_CONSTANT i64 32
@@ -32,9 +32,9 @@ body: |
; CHECK-LABEL: name: narrow_shl_s64_32
; CHECK: liveins: $vgpr0_vgpr1
; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
; CHECK: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
; CHECK: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C]](s32), [[UV]](s32)
; CHECK: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C]](s32), [[TRUNC]](s32)
; CHECK: $vgpr0_vgpr1 = COPY [[MV]](s64)
%0:_(s64) = COPY $vgpr0_vgpr1
%1:_(s32) = G_CONSTANT i32 32
@@ -52,9 +52,9 @@ body: |
; CHECK-LABEL: name: narrow_shl_s64_33
; CHECK: liveins: $vgpr0_vgpr1
; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
; CHECK: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[UV]], [[C]](s32)
; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[TRUNC]], [[C]](s32)
; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
; CHECK: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C1]](s32), [[SHL]](s32)
; CHECK: $vgpr0_vgpr1 = COPY [[MV]](s64)
@@ -93,9 +93,9 @@ body: |
; CHECK-LABEL: name: narrow_shl_s64_63
; CHECK: liveins: $vgpr0_vgpr1
; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
; CHECK: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 31
; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[UV]], [[C]](s32)
; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[TRUNC]], [[C]](s32)
; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
; CHECK: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C1]](s32), [[SHL]](s32)
; CHECK: $vgpr0_vgpr1 = COPY [[MV]](s64)

@@ -110,15 +110,16 @@ define amdgpu_ps void @image_store_v3f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t,
; UNPACKED: [[COPY11:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
; UNPACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; UNPACKED: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF
; UNPACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
; UNPACKED: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY10]](<2 x s16>)
; UNPACKED: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY10]](<2 x s16>), [[COPY11]](<2 x s16>), [[DEF]](<2 x s16>)
; UNPACKED: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[CONCAT_VECTORS]](<6 x s16>)
; UNPACKED: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST]](s96)
; UNPACKED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
; UNPACKED: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
; UNPACKED: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY11]](<2 x s16>)
; UNPACKED: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
; UNPACKED: [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
; UNPACKED: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C]](s32)
; UNPACKED: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C]](s32)
; UNPACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
; UNPACKED: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
; UNPACKED: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
; UNPACKED: [[COPY14:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32)
; UNPACKED: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
; UNPACKED: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32)
; UNPACKED: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[BUILD_VECTOR2]](<3 x s32>), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 6 into custom "TargetCustom8", align 8)
; UNPACKED: S_ENDPGM 0
@@ -140,9 +141,29 @@ define amdgpu_ps void @image_store_v3f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t,
; PACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; PACKED: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF
; PACKED: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY10]](<2 x s16>), [[COPY11]](<2 x s16>), [[DEF]](<2 x s16>)
; PACKED: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>)
; PACKED: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[CONCAT_VECTORS]](<6 x s16>)
; PACKED: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST]](s96)
; PACKED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
; PACKED: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C]](s32)
; PACKED: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C]](s32)
; PACKED: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
; PACKED: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
; PACKED: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C1]]
; PACKED: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
; PACKED: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C1]]
; PACKED: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32)
; PACKED: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
; PACKED: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
; PACKED: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
; PACKED: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY14]], [[C1]]
; PACKED: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
; PACKED: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[C2]], [[C]](s32)
; PACKED: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]]
; PACKED: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32)
; PACKED: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), [[DEF]](<2 x s16>)
; PACKED: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS1]](<6 x s16>), 0
; PACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
; PACKED: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[UV]](<3 x s16>), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 6 into custom "TargetCustom8", align 8)
; PACKED: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[EXTRACT]](<3 x s16>), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 6 into custom "TargetCustom8", align 8)
; PACKED: S_ENDPGM 0
call void @llvm.amdgcn.image.store.2d.v3f16.i32(<3 x half> %in, i32 7, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
ret void

@@ -174,22 +174,20 @@ define amdgpu_ps <3 x i32> @s_buffer_load_v3i32(<4 x i32> inreg %rsrc, i32 inreg
; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX6: [[S_BUFFER_LOAD_DWORDX4_SGPR:%[0-9]+]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_SGPR [[REG_SEQUENCE]], [[COPY4]], 0, 0 :: (dereferenceable invariant load 12, align 4)
; GFX6: [[DEF:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_512_with_sub0_sub1_sub2 = REG_SEQUENCE [[S_BUFFER_LOAD_DWORDX4_SGPR]], %subreg.sub0_sub1_sub2_sub3, [[DEF]], %subreg.sub4_sub5_sub6_sub7, [[DEF]], %subreg.sub8_sub9_sub10_sub11
; GFX6: [[COPY5:%[0-9]+]]:sgpr_96 = COPY [[REG_SEQUENCE1]].sub0_sub1_sub2
; GFX6: [[COPY6:%[0-9]+]]:sgpr_96 = COPY [[REG_SEQUENCE1]].sub3_sub4_sub5
; GFX6: [[COPY7:%[0-9]+]]:sgpr_96 = COPY [[REG_SEQUENCE1]].sub6_sub7_sub8
; GFX6: [[COPY8:%[0-9]+]]:sgpr_96 = COPY [[REG_SEQUENCE1]].sub9_sub10_sub11
; GFX6: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[COPY5]].sub0
; GFX6: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[COPY5]].sub1
; GFX6: [[COPY11:%[0-9]+]]:sreg_32 = COPY [[COPY5]].sub2
; GFX6: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY9]]
; GFX6: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec
; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_512 = REG_SEQUENCE [[S_BUFFER_LOAD_DWORDX4_SGPR]], %subreg.sub0_sub1_sub2_sub3, [[DEF]], %subreg.sub4_sub5_sub6_sub7, [[DEF]], %subreg.sub8_sub9_sub10_sub11
; GFX6: [[COPY5:%[0-9]+]]:sgpr_512_with_sub0_sub1_sub2 = COPY [[REG_SEQUENCE1]]
; GFX6: [[COPY6:%[0-9]+]]:sgpr_96 = COPY [[COPY5]].sub0_sub1_sub2
; GFX6: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[COPY6]].sub0
; GFX6: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[COPY6]].sub1
; GFX6: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[COPY6]].sub2
; GFX6: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY7]]
; GFX6: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec
; GFX6: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
; GFX6: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY10]]
; GFX6: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY13]], implicit $exec
; GFX6: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY8]]
; GFX6: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY11]], implicit $exec
; GFX6: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
; GFX6: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY11]]
; GFX6: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY14]], implicit $exec
; GFX6: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY9]]
; GFX6: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec
; GFX6: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]]
; GFX6: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2
; GFX7-LABEL: name: s_buffer_load_v3i32
@@ -203,22 +201,20 @@ define amdgpu_ps <3 x i32> @s_buffer_load_v3i32(<4 x i32> inreg %rsrc, i32 inreg
; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX7: [[S_BUFFER_LOAD_DWORDX4_SGPR:%[0-9]+]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_SGPR [[REG_SEQUENCE]], [[COPY4]], 0, 0 :: (dereferenceable invariant load 12, align 4)
; GFX7: [[DEF:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_512_with_sub0_sub1_sub2 = REG_SEQUENCE [[S_BUFFER_LOAD_DWORDX4_SGPR]], %subreg.sub0_sub1_sub2_sub3, [[DEF]], %subreg.sub4_sub5_sub6_sub7, [[DEF]], %subreg.sub8_sub9_sub10_sub11
; GFX7: [[COPY5:%[0-9]+]]:sgpr_96 = COPY [[REG_SEQUENCE1]].sub0_sub1_sub2
; GFX7: [[COPY6:%[0-9]+]]:sgpr_96 = COPY [[REG_SEQUENCE1]].sub3_sub4_sub5
; GFX7: [[COPY7:%[0-9]+]]:sgpr_96 = COPY [[REG_SEQUENCE1]].sub6_sub7_sub8
; GFX7: [[COPY8:%[0-9]+]]:sgpr_96 = COPY [[REG_SEQUENCE1]].sub9_sub10_sub11
; GFX7: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[COPY5]].sub0
; GFX7: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[COPY5]].sub1
; GFX7: [[COPY11:%[0-9]+]]:sreg_32 = COPY [[COPY5]].sub2
; GFX7: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY9]]
; GFX7: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec
; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_512 = REG_SEQUENCE [[S_BUFFER_LOAD_DWORDX4_SGPR]], %subreg.sub0_sub1_sub2_sub3, [[DEF]], %subreg.sub4_sub5_sub6_sub7, [[DEF]], %subreg.sub8_sub9_sub10_sub11
; GFX7: [[COPY5:%[0-9]+]]:sgpr_512_with_sub0_sub1_sub2 = COPY [[REG_SEQUENCE1]]
; GFX7: [[COPY6:%[0-9]+]]:sgpr_96 = COPY [[COPY5]].sub0_sub1_sub2
; GFX7: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[COPY6]].sub0
; GFX7: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[COPY6]].sub1
; GFX7: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[COPY6]].sub2
; GFX7: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY7]]
; GFX7: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec
; GFX7: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
; GFX7: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY10]]
; GFX7: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY13]], implicit $exec
; GFX7: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY8]]
; GFX7: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY11]], implicit $exec
; GFX7: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
; GFX7: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY11]]
; GFX7: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY14]], implicit $exec
; GFX7: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY9]]
; GFX7: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec
; GFX7: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]]
; GFX7: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2
; GFX8-LABEL: name: s_buffer_load_v3i32
@@ -232,22 +228,20 @@ define amdgpu_ps <3 x i32> @s_buffer_load_v3i32(<4 x i32> inreg %rsrc, i32 inreg
; GFX8: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX8: [[S_BUFFER_LOAD_DWORDX4_SGPR:%[0-9]+]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_SGPR [[REG_SEQUENCE]], [[COPY4]], 0, 0 :: (dereferenceable invariant load 12, align 4)
; GFX8: [[DEF:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
; GFX8: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_512_with_sub0_sub1_sub2 = REG_SEQUENCE [[S_BUFFER_LOAD_DWORDX4_SGPR]], %subreg.sub0_sub1_sub2_sub3, [[DEF]], %subreg.sub4_sub5_sub6_sub7, [[DEF]], %subreg.sub8_sub9_sub10_sub11
; GFX8: [[COPY5:%[0-9]+]]:sgpr_96 = COPY [[REG_SEQUENCE1]].sub0_sub1_sub2
; GFX8: [[COPY6:%[0-9]+]]:sgpr_96 = COPY [[REG_SEQUENCE1]].sub3_sub4_sub5
; GFX8: [[COPY7:%[0-9]+]]:sgpr_96 = COPY [[REG_SEQUENCE1]].sub6_sub7_sub8
; GFX8: [[COPY8:%[0-9]+]]:sgpr_96 = COPY [[REG_SEQUENCE1]].sub9_sub10_sub11
; GFX8: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[COPY5]].sub0
; GFX8: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[COPY5]].sub1
; GFX8: [[COPY11:%[0-9]+]]:sreg_32 = COPY [[COPY5]].sub2
; GFX8: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY9]]
; GFX8: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec
; GFX8: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_512 = REG_SEQUENCE [[S_BUFFER_LOAD_DWORDX4_SGPR]], %subreg.sub0_sub1_sub2_sub3, [[DEF]], %subreg.sub4_sub5_sub6_sub7, [[DEF]], %subreg.sub8_sub9_sub10_sub11
; GFX8: [[COPY5:%[0-9]+]]:sgpr_512_with_sub0_sub1_sub2 = COPY [[REG_SEQUENCE1]]
; GFX8: [[COPY6:%[0-9]+]]:sgpr_96 = COPY [[COPY5]].sub0_sub1_sub2
; GFX8: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[COPY6]].sub0
; GFX8: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[COPY6]].sub1
; GFX8: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[COPY6]].sub2
; GFX8: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY7]]
; GFX8: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec
; GFX8: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
; GFX8: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY10]]
; GFX8: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY13]], implicit $exec
; GFX8: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY8]]
; GFX8: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY11]], implicit $exec
; GFX8: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
; GFX8: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY11]]
; GFX8: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY14]], implicit $exec
; GFX8: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY9]]
; GFX8: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec
; GFX8: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]]
; GFX8: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2
%val = call <3 x i32> @llvm.amdgcn.s.buffer.load.v3i32(<4 x i32> %rsrc, i32 %soffset, i32 0)
@@ -1600,15 +1594,12 @@ define amdgpu_ps <3 x float> @s_buffer_load_v3f32_vgpr_offset(<4 x i32> inreg %r
; GFX6: [[COPY6:%[0-9]+]]:vreg_128 = COPY [[DEF]]
; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:vreg_512 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[COPY5]], %subreg.sub4_sub5_sub6_sub7, [[COPY6]], %subreg.sub8_sub9_sub10_sub11
; GFX6: [[COPY7:%[0-9]+]]:vreg_96 = COPY [[REG_SEQUENCE1]].sub0_sub1_sub2
; GFX6: [[COPY8:%[0-9]+]]:vreg_96 = COPY [[REG_SEQUENCE1]].sub3_sub4_sub5
; GFX6: [[COPY9:%[0-9]+]]:vreg_96 = COPY [[REG_SEQUENCE1]].sub6_sub7_sub8
; GFX6: [[COPY10:%[0-9]+]]:vreg_96 = COPY [[REG_SEQUENCE1]].sub9_sub10_sub11
; GFX6: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY7]].sub0
; GFX6: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY7]].sub1
; GFX6: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY7]].sub2
; GFX6: $vgpr0 = COPY [[COPY11]]
; GFX6: $vgpr1 = COPY [[COPY12]]
; GFX6: $vgpr2 = COPY [[COPY13]]
; GFX6: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY7]].sub0
; GFX6: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY7]].sub1
; GFX6: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY7]].sub2
; GFX6: $vgpr0 = COPY [[COPY8]]
; GFX6: $vgpr1 = COPY [[COPY9]]
; GFX6: $vgpr2 = COPY [[COPY10]]
; GFX6: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
; GFX7-LABEL: name: s_buffer_load_v3f32_vgpr_offset
; GFX7: bb.1 (%ir-block.0):
@@ -1626,15 +1617,12 @@ define amdgpu_ps <3 x float> @s_buffer_load_v3f32_vgpr_offset(<4 x i32> inreg %r
; GFX7: [[COPY6:%[0-9]+]]:vreg_128 = COPY [[DEF]]
; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_512 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[COPY5]], %subreg.sub4_sub5_sub6_sub7, [[COPY6]], %subreg.sub8_sub9_sub10_sub11
; GFX7: [[COPY7:%[0-9]+]]:vreg_96 = COPY [[REG_SEQUENCE1]].sub0_sub1_sub2
; GFX7: [[COPY8:%[0-9]+]]:vreg_96 = COPY [[REG_SEQUENCE1]].sub3_sub4_sub5
; GFX7: [[COPY9:%[0-9]+]]:vreg_96 = COPY [[REG_SEQUENCE1]].sub6_sub7_sub8
; GFX7: [[COPY10:%[0-9]+]]:vreg_96 = COPY [[REG_SEQUENCE1]].sub9_sub10_sub11
; GFX7: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY7]].sub0
; GFX7: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY7]].sub1
; GFX7: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY7]].sub2
; GFX7: $vgpr0 = COPY [[COPY11]]
; GFX7: $vgpr1 = COPY [[COPY12]]
; GFX7: $vgpr2 = COPY [[COPY13]]
; GFX7: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY7]].sub0
; GFX7: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY7]].sub1
; GFX7: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY7]].sub2
; GFX7: $vgpr0 = COPY [[COPY8]]
; GFX7: $vgpr1 = COPY [[COPY9]]
; GFX7: $vgpr2 = COPY [[COPY10]]
; GFX7: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
; GFX8-LABEL: name: s_buffer_load_v3f32_vgpr_offset
; GFX8: bb.1 (%ir-block.0):
@@ -1652,15 +1640,12 @@ define amdgpu_ps <3 x float> @s_buffer_load_v3f32_vgpr_offset(<4 x i32> inreg %r
; GFX8: [[COPY6:%[0-9]+]]:vreg_128 = COPY [[DEF]]
; GFX8: [[REG_SEQUENCE1:%[0-9]+]]:vreg_512 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[COPY5]], %subreg.sub4_sub5_sub6_sub7, [[COPY6]], %subreg.sub8_sub9_sub10_sub11
; GFX8: [[COPY7:%[0-9]+]]:vreg_96 = COPY [[REG_SEQUENCE1]].sub0_sub1_sub2
; GFX8: [[COPY8:%[0-9]+]]:vreg_96 = COPY [[REG_SEQUENCE1]].sub3_sub4_sub5
; GFX8: [[COPY9:%[0-9]+]]:vreg_96 = COPY [[REG_SEQUENCE1]].sub6_sub7_sub8
; GFX8: [[COPY10:%[0-9]+]]:vreg_96 = COPY [[REG_SEQUENCE1]].sub9_sub10_sub11
; GFX8: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY7]].sub0
; GFX8: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY7]].sub1
; GFX8: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY7]].sub2
; GFX8: $vgpr0 = COPY [[COPY11]]
; GFX8: $vgpr1 = COPY [[COPY12]]
; GFX8: $vgpr2 = COPY [[COPY13]]
; GFX8: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY7]].sub0
; GFX8: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY7]].sub1
; GFX8: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY7]].sub2
; GFX8: $vgpr0 = COPY [[COPY8]]
; GFX8: $vgpr1 = COPY [[COPY9]]
; GFX8: $vgpr2 = COPY [[COPY10]]
; GFX8: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
%val = call <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
ret <3 x float> %val

@@ -37,8 +37,9 @@ body: |
; GCN-LABEL: name: select_from_same_results_of_unmerge_values
; GCN: liveins: $vgpr0
; GCN: [[DEF:%[0-9]+]]:_(<2 x s32>) = G_IMPLICIT_DEF
; GCN: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[DEF]](<2 x s32>)
; GCN: $vgpr0 = COPY [[UV]](s32)
; GCN: [[BITCAST:%[0-9]+]]:_(s64) = G_BITCAST [[DEF]](<2 x s32>)
; GCN: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[BITCAST]](s64)
; GCN: $vgpr0 = COPY [[TRUNC]](s32)
; GCN: SI_RETURN_TO_EPILOG $vgpr0
%0:_(<2 x s32>) = G_IMPLICIT_DEF
%1:_(s32) = COPY $vgpr0