
AMDGPU/GlobalISel: Fix RegBankSelect for G_INSERT_VECTOR_ELT

The result and source vector operands are tied, so they need to be
assigned the same register bank.

The inserted value also needs to be broken down based on the result
bank, not the bank of the inserted value itself.
Matt Arsenault, 2020-01-03 20:22:01 -05:00 (committed by Matt Arsenault)
parent 45831d88f1
commit 5341a3bed9
3 changed files with 104 additions and 34 deletions
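
Concretely, the commit message describes two bank-assignment rules. A minimal stand-in sketch of them (RegBank and InsertVecEltBanks are invented for illustration and are not LLVM's types):

// Stand-in model of the operand-bank rules for G_INSERT_VECTOR_ELT.
enum class RegBank { SGPR, VGPR };

struct InsertVecEltBanks {
  RegBank Dst, SrcVec, Val, Idx;
};

InsertVecEltBanks pickBanks(RegBank DstBank, RegBank ValBank, RegBank IdxBank) {
  // Rule 1: the result and the source vector are tied, so both get DstBank;
  // RegBankSelect repairs a mismatched source with a cross-bank COPY.
  // Rule 2: a 64-bit inserted value is split into 32-bit pieces whenever the
  // *result* is VGPR, regardless of the value's own bank (see below).
  return {DstBank, /*SrcVec=*/DstBank, ValBank, IdxBank};
}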


@@ -132,7 +132,8 @@ const RegisterBankInfo::PartialMapping SGPROnly64BreakDown[] {
};
// For some instructions which can operate 64-bit only for the scalar version.
// For some instructions which can operate 64-bit only for the scalar
// version. Otherwise, these need to be split into 2 32-bit operations.
const RegisterBankInfo::ValueMapping ValMappingsSGPR64OnlyVGPR32[] {
/*32-bit sgpr*/ {&SGPROnly64BreakDown[0], 1},
/*2 x 32-bit sgpr*/ {&SGPROnly64BreakDown[1], 2},
@@ -207,6 +208,18 @@ const RegisterBankInfo::ValueMapping *getValueMappingSGPR64Only(unsigned BankID,
return &ValMappingsSGPR64OnlyVGPR32[2];
}
/// Split any 64-bit value into 2 32-bit pieces. Unlike
/// getValueMappingSGPR64Only, this splits both VGPRs and SGPRs.
const RegisterBankInfo::ValueMapping *getValueMappingSplit64(unsigned BankID,
unsigned Size) {
assert(Size == 64);
if (BankID == AMDGPU::VGPRRegBankID)
return &ValMappingsSGPR64OnlyVGPR32[4];
assert(BankID == AMDGPU::SGPRRegBankID);
return &ValMappingsSGPR64OnlyVGPR32[1];
}
const RegisterBankInfo::PartialMapping LoadSGPROnlyBreakDown[] {
/* 256-bit load */ {0, 256, SGPRRegBank},
/* 512-bit load */ {0, 512, SGPRRegBank},
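
The new helper just selects a pre-built two-piece breakdown from the table above. A simplified standalone model of that behavior (Piece and Mapping are stand-ins for LLVM's PartialMapping and ValueMapping):

#include <cassert>

struct Piece { unsigned StartIdx, Length; const char *Bank; };
struct Mapping { const Piece *Parts; unsigned NumParts; };

const Piece TwoSgprHalves[] = {{0, 32, "sgpr"}, {32, 32, "sgpr"}};
const Piece TwoVgprHalves[] = {{0, 32, "vgpr"}, {32, 32, "vgpr"}};

// Unlike getValueMappingSGPR64Only, this splits 64-bit values into two
// 32-bit pieces for *both* banks.
Mapping getSplit64(bool IsVGPR, unsigned Size) {
  assert(Size == 64 && "only 64-bit values are split here");
  return IsVGPR ? Mapping{TwoVgprHalves, 2} : Mapping{TwoSgprHalves, 2};
}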


@@ -1288,13 +1288,16 @@ void AMDGPURegisterBankInfo::lowerScalarMinMax(MachineIRBuilder &B,
// For cases where only a single copy is inserted for matching register banks.
// Replace the register in the instruction operand
static void substituteSimpleCopyRegs(
static bool substituteSimpleCopyRegs(
const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, unsigned OpIdx) {
SmallVector<unsigned, 1> SrcReg(OpdMapper.getVRegs(OpIdx));
if (!SrcReg.empty()) {
assert(SrcReg.size() == 1);
OpdMapper.getMI().getOperand(OpIdx).setReg(SrcReg[0]);
return true;
}
return false;
}
/// Handle register layout difference for f16 images for some subtargets.
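
The void-to-bool change lets the caller learn whether the operand was actually rewritten; the G_INSERT_VECTOR_ELT case below uses this to reattach the vector's LLT to the repaired register. A hedged sketch of the pattern with plain containers (substituteIfRepaired is a hypothetical stand-in, not the LLVM function):

#include <vector>

// "Substitute and report": install the single repaired register if one
// exists, and tell the caller so it can fix up type information.
bool substituteIfRepaired(std::vector<unsigned> &Operands, unsigned OpIdx,
                          const std::vector<unsigned> &Repaired) {
  if (Repaired.empty())
    return false; // nothing changed; the caller skips the type fix-up
  // The simple-copy case produces exactly one replacement register.
  Operands[OpIdx] = Repaired.front();
  return true;
}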
@@ -2117,17 +2120,21 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
case AMDGPU::G_INSERT_VECTOR_ELT: {
SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2));
Register DstReg = MI.getOperand(0).getReg();
LLT VecTy = MRI.getType(DstReg);
assert(OpdMapper.getVRegs(0).empty());
assert(OpdMapper.getVRegs(1).empty());
assert(OpdMapper.getVRegs(3).empty());
if (substituteSimpleCopyRegs(OpdMapper, 1))
MRI.setType(MI.getOperand(1).getReg(), VecTy);
if (InsRegs.empty()) {
applyDefaultMapping(OpdMapper);
executeInWaterfallLoop(MI, MRI, { 3 });
return;
}
Register DstReg = MI.getOperand(0).getReg();
Register SrcReg = MI.getOperand(1).getReg();
Register InsReg = MI.getOperand(2).getReg();
Register IdxReg = MI.getOperand(3).getReg();
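
On the divergent-index path, executeInWaterfallLoop(MI, MRI, {3}) satisfies the hardware's need for a uniform index by looping: read one lane's index with V_READFIRSTLANE, run the insert for every lane that matches it, and repeat, exactly the structure the test CHECK lines below expect. A scalar simulation of that loop, illustrative only and using no LLVM types:

#include <cstdio>
#include <vector>

int main() {
  std::vector<int> LaneIdx = {2, 0, 2, 1}; // divergent per-lane indices
  std::vector<bool> Done(LaneIdx.size(), false);
  for (;;) {
    int Uniform = -1;
    for (size_t L = 0; L < Done.size(); ++L)
      if (!Done[L]) { Uniform = LaneIdx[L]; break; } // V_READFIRSTLANE
    if (Uniform < 0)
      break; // all lanes handled; the real loop exits via S_CBRANCH
    for (size_t L = 0; L < Done.size(); ++L)
      if (!Done[L] && LaneIdx[L] == Uniform) { // V_CMP_EQ + exec masking
        std::printf("lane %zu inserts at uniform index %d\n", L, Uniform);
        Done[L] = true;
      }
  }
}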
@@ -2951,15 +2958,22 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
unsigned VecSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
unsigned InsertSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
unsigned IdxSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI);
unsigned InsertEltBankID = getRegBankID(MI.getOperand(2).getReg(),
MRI, *TRI);
unsigned IdxBankID = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI);
OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, VecSize);
OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, VecSize);
OpdsMapping[2] = AMDGPU::getValueMappingSGPR64Only(InsertEltBankID,
InsertSize);
OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, VecSize);
// This is a weird case, because we need to break down the mapping based on
// the register bank of a different operand.
if (InsertSize == 64 && OutputBankID == AMDGPU::VGPRRegBankID) {
OpdsMapping[2] = AMDGPU::getValueMappingSplit64(InsertEltBankID,
InsertSize);
} else {
assert(InsertSize == 32 || InsertSize == 64);
OpdsMapping[2] = AMDGPU::getValueMapping(InsertEltBankID, InsertSize);
}
// The index can be in either bank if the source vector is VGPR.
OpdsMapping[3] = AMDGPU::getValueMapping(IdxBankID, IdxSize);
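
When operand 2 gets the split mapping, the apply step (further down in applyMappingImpl than the hunk above shows) bitcasts the vector to 32-bit elements and emits two G_INSERT_VECTOR_ELTs at indices 2*idx and 2*idx+1; the updated test CHECK lines below (G_BITCAST, G_SHL by 1, G_ADD 1) exercise exactly this. A scalar model of the index arithmetic, assuming the low half lands in the even lane:

#include <cstdint>
#include <vector>

// Insert a 64-bit value into a vector viewed as 32-bit lanes, mirroring
// the G_BITCAST + G_SHL/G_ADD index math in the lowering.
void insert64As32(std::vector<uint32_t> &Lanes, uint64_t Val, unsigned Idx) {
  unsigned Base = Idx * 2;                  // G_SHL %idx, 1
  Lanes.at(Base) = uint32_t(Val);           // low half at 2*idx
  Lanes.at(Base + 1) = uint32_t(Val >> 32); // high half at 2*idx + 1
}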


@@ -56,7 +56,8 @@ body: |
; CHECK: [[COPY:%[0-9]+]]:sgpr(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
; CHECK: [[IVEC:%[0-9]+]]:vgpr(<4 x s32>) = G_INSERT_VECTOR_ELT [[COPY]], [[COPY1]](s32), [[COPY2]](s32)
; CHECK: [[COPY3:%[0-9]+]]:vgpr(<4 x s32>) = COPY [[COPY]](<4 x s32>)
; CHECK: [[IVEC:%[0-9]+]]:vgpr(<4 x s32>) = G_INSERT_VECTOR_ELT [[COPY3]], [[COPY1]](s32), [[COPY2]](s32)
; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[IVEC]](<4 x s32>)
%0:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
%1:_(s32) = COPY $vgpr0
@@ -80,16 +81,17 @@ body: |
; CHECK: [[COPY:%[0-9]+]]:sgpr(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
; CHECK: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
; CHECK: [[COPY3:%[0-9]+]]:vgpr(<4 x s32>) = COPY [[COPY]](<4 x s32>)
; CHECK: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF
; CHECK: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
; CHECK: .1:
; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000)
; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.0, %9, %bb.1
; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.0, %10, %bb.1
; CHECK: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.0, %3(<4 x s32>), %bb.1
; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY2]](s32), implicit $exec
; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY2]](s32), implicit $exec
; CHECK: [[IVEC:%[0-9]+]]:vgpr(<4 x s32>) = G_INSERT_VECTOR_ELT [[COPY]], [[COPY1]](s32), [[V_READFIRSTLANE_B32_]](s32)
; CHECK: [[IVEC:%[0-9]+]]:vgpr(<4 x s32>) = G_INSERT_VECTOR_ELT [[COPY3]], [[COPY1]](s32), [[V_READFIRSTLANE_B32_]](s32)
; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec
; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
; CHECK: S_CBRANCH_EXECNZ %bb.1, implicit $exec
@@ -120,16 +122,17 @@ body: |
; CHECK: [[COPY:%[0-9]+]]:sgpr(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
; CHECK: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1
; CHECK: [[COPY3:%[0-9]+]]:vgpr(<4 x s32>) = COPY [[COPY]](<4 x s32>)
; CHECK: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF
; CHECK: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
; CHECK: .1:
; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000)
; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.0, %9, %bb.1
; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.0, %10, %bb.1
; CHECK: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.0, %3(<4 x s32>), %bb.1
; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY2]](s32), implicit $exec
; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY2]](s32), implicit $exec
; CHECK: [[IVEC:%[0-9]+]]:vgpr(<4 x s32>) = G_INSERT_VECTOR_ELT [[COPY]], [[COPY1]](s32), [[V_READFIRSTLANE_B32_]](s32)
; CHECK: [[IVEC:%[0-9]+]]:vgpr(<4 x s32>) = G_INSERT_VECTOR_ELT [[COPY3]], [[COPY1]](s32), [[V_READFIRSTLANE_B32_]](s32)
; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec
; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
; CHECK: S_CBRANCH_EXECNZ %bb.1, implicit $exec
@@ -285,8 +288,15 @@ body: |
; CHECK: [[COPY:%[0-9]+]]:vgpr(<8 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; CHECK: [[COPY1:%[0-9]+]]:sgpr(s64) = COPY $sgpr16_sgpr17
; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr18
; CHECK: [[IVEC:%[0-9]+]]:vgpr(<8 x s64>) = G_INSERT_VECTOR_ELT [[COPY]], [[COPY1]](s64), [[COPY2]](s32)
; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY [[IVEC]](<8 x s64>)
; CHECK: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[COPY1]](s64)
; CHECK: [[BITCAST:%[0-9]+]]:vgpr(<16 x s32>) = G_BITCAST [[COPY]](<8 x s64>)
; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
; CHECK: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[COPY2]], [[C]](s32)
; CHECK: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[SHL]], [[C]]
; CHECK: [[IVEC:%[0-9]+]]:vgpr(<16 x s32>) = G_INSERT_VECTOR_ELT [[BITCAST]], [[UV]](s32), [[SHL]](s32)
; CHECK: [[IVEC1:%[0-9]+]]:vgpr(<16 x s32>) = G_INSERT_VECTOR_ELT [[IVEC]], [[UV1]](s32), [[ADD]](s32)
; CHECK: [[BITCAST1:%[0-9]+]]:vgpr(<8 x s64>) = G_BITCAST [[IVEC1]](<16 x s32>)
; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY [[BITCAST1]](<8 x s64>)
%0:_(<8 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
%1:_(s64) = COPY $sgpr16_sgpr17
%2:_(s32) = COPY $sgpr18
@@ -308,8 +318,9 @@ body: |
; CHECK: [[COPY:%[0-9]+]]:sgpr(<8 x s64>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
; CHECK: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1
; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr16
; CHECK: [[COPY3:%[0-9]+]]:vgpr(<8 x s64>) = COPY [[COPY]](<8 x s64>)
; CHECK: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY1]](s64)
; CHECK: [[BITCAST:%[0-9]+]]:sgpr(<16 x s32>) = G_BITCAST [[COPY]](<8 x s64>)
; CHECK: [[BITCAST:%[0-9]+]]:vgpr(<16 x s32>) = G_BITCAST [[COPY3]](<8 x s64>)
; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
; CHECK: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[COPY2]], [[C]](s32)
; CHECK: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[SHL]], [[C]]
@@ -339,16 +350,32 @@ body: |
; CHECK: [[COPY:%[0-9]+]]:sgpr(<8 x s64>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
; CHECK: [[COPY1:%[0-9]+]]:sgpr(s64) = COPY $sgpr16_sgpr17
; CHECK: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
; CHECK: [[DEF:%[0-9]+]]:vgpr(<8 x s64>) = G_IMPLICIT_DEF
; CHECK: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
; CHECK: [[COPY3:%[0-9]+]]:vgpr(<8 x s64>) = COPY [[COPY]](<8 x s64>)
; CHECK: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[COPY1]](s64)
; CHECK: [[BITCAST:%[0-9]+]]:vgpr(<16 x s32>) = G_BITCAST [[COPY3]](<8 x s64>)
; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
; CHECK: [[DEF:%[0-9]+]]:sgpr(s32) = G_IMPLICIT_DEF
; CHECK: [[DEF1:%[0-9]+]]:sgpr(s32) = G_IMPLICIT_DEF
; CHECK: [[DEF2:%[0-9]+]]:vgpr(<16 x s32>) = G_IMPLICIT_DEF
; CHECK: [[DEF3:%[0-9]+]]:vgpr(<16 x s32>) = G_IMPLICIT_DEF
; CHECK: [[DEF4:%[0-9]+]]:vgpr(<8 x s64>) = G_IMPLICIT_DEF
; CHECK: [[DEF5:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
; CHECK: .1:
; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000)
; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.0, %9, %bb.1
; CHECK: [[PHI1:%[0-9]+]]:vgpr(<8 x s64>) = G_PHI [[DEF]](<8 x s64>), %bb.0, %3(<8 x s64>), %bb.1
; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF5]], %bb.0, %26, %bb.1
; CHECK: [[PHI1:%[0-9]+]]:sgpr(s32) = G_PHI [[DEF]](s32), %bb.0, %9(s32), %bb.1
; CHECK: [[PHI2:%[0-9]+]]:sgpr(s32) = G_PHI [[DEF1]](s32), %bb.0, %10(s32), %bb.1
; CHECK: [[PHI3:%[0-9]+]]:vgpr(<16 x s32>) = G_PHI [[DEF2]](<16 x s32>), %bb.0, %11(<16 x s32>), %bb.1
; CHECK: [[PHI4:%[0-9]+]]:vgpr(<16 x s32>) = G_PHI [[DEF3]](<16 x s32>), %bb.0, %12(<16 x s32>), %bb.1
; CHECK: [[PHI5:%[0-9]+]]:vgpr(<8 x s64>) = G_PHI [[DEF4]](<8 x s64>), %bb.0, %3(<8 x s64>), %bb.1
; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY2]](s32), implicit $exec
; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY2]](s32), implicit $exec
; CHECK: [[IVEC:%[0-9]+]]:vgpr(<8 x s64>) = G_INSERT_VECTOR_ELT [[COPY]], [[COPY1]](s64), [[V_READFIRSTLANE_B32_]](s32)
; CHECK: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[V_READFIRSTLANE_B32_]], [[C]](s32)
; CHECK: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[SHL]], [[C]]
; CHECK: [[IVEC:%[0-9]+]]:vgpr(<16 x s32>) = G_INSERT_VECTOR_ELT [[BITCAST]], [[UV]](s32), [[SHL]](s32)
; CHECK: [[IVEC1:%[0-9]+]]:vgpr(<16 x s32>) = G_INSERT_VECTOR_ELT [[IVEC]], [[UV1]](s32), [[ADD]](s32)
; CHECK: [[BITCAST1:%[0-9]+]]:vgpr(<8 x s64>) = G_BITCAST [[IVEC1]](<16 x s32>)
; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec
; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
; CHECK: S_CBRANCH_EXECNZ %bb.1, implicit $exec
@@ -356,7 +383,7 @@ body: |
; CHECK: successors: %bb.3(0x80000000)
; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]]
; CHECK: .3:
; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY [[IVEC]](<8 x s64>)
; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY [[BITCAST1]](<8 x s64>)
%0:_(<8 x s64>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
%1:_(s64) = COPY $sgpr16_sgpr17
%2:_(s32) = COPY $vgpr0
@@ -379,8 +406,9 @@ body: |
; CHECK: [[COPY:%[0-9]+]]:sgpr(<8 x s64>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
; CHECK: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1
; CHECK: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2
; CHECK: [[COPY3:%[0-9]+]]:vgpr(<8 x s64>) = COPY [[COPY]](<8 x s64>)
; CHECK: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY1]](s64)
; CHECK: [[BITCAST:%[0-9]+]]:sgpr(<16 x s32>) = G_BITCAST [[COPY]](<8 x s64>)
; CHECK: [[BITCAST:%[0-9]+]]:vgpr(<16 x s32>) = G_BITCAST [[COPY3]](<8 x s64>)
; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
; CHECK: [[DEF:%[0-9]+]]:sgpr(s32) = G_IMPLICIT_DEF
; CHECK: [[DEF1:%[0-9]+]]:sgpr(s32) = G_IMPLICIT_DEF
@@ -391,11 +419,11 @@ body: |
; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
; CHECK: .1:
; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000)
; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF5]], %bb.0, %25, %bb.1
; CHECK: [[PHI1:%[0-9]+]]:sgpr(s32) = G_PHI [[DEF]](s32), %bb.0, %8(s32), %bb.1
; CHECK: [[PHI2:%[0-9]+]]:sgpr(s32) = G_PHI [[DEF1]](s32), %bb.0, %9(s32), %bb.1
; CHECK: [[PHI3:%[0-9]+]]:vgpr(<16 x s32>) = G_PHI [[DEF2]](<16 x s32>), %bb.0, %10(<16 x s32>), %bb.1
; CHECK: [[PHI4:%[0-9]+]]:vgpr(<16 x s32>) = G_PHI [[DEF3]](<16 x s32>), %bb.0, %11(<16 x s32>), %bb.1
; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF5]], %bb.0, %26, %bb.1
; CHECK: [[PHI1:%[0-9]+]]:sgpr(s32) = G_PHI [[DEF]](s32), %bb.0, %9(s32), %bb.1
; CHECK: [[PHI2:%[0-9]+]]:sgpr(s32) = G_PHI [[DEF1]](s32), %bb.0, %10(s32), %bb.1
; CHECK: [[PHI3:%[0-9]+]]:vgpr(<16 x s32>) = G_PHI [[DEF2]](<16 x s32>), %bb.0, %11(<16 x s32>), %bb.1
; CHECK: [[PHI4:%[0-9]+]]:vgpr(<16 x s32>) = G_PHI [[DEF3]](<16 x s32>), %bb.0, %12(<16 x s32>), %bb.1
; CHECK: [[PHI5:%[0-9]+]]:vgpr(<8 x s64>) = G_PHI [[DEF4]](<8 x s64>), %bb.0, %3(<8 x s64>), %bb.1
; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY2]](s32), implicit $exec
; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY2]](s32), implicit $exec
@@ -464,16 +492,31 @@ body: |
; CHECK: [[COPY:%[0-9]+]]:vgpr(<8 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; CHECK: [[COPY1:%[0-9]+]]:sgpr(s64) = COPY $sgpr0_sgpr1
; CHECK: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr16
; CHECK: [[DEF:%[0-9]+]]:vgpr(<8 x s64>) = G_IMPLICIT_DEF
; CHECK: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
; CHECK: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[COPY1]](s64)
; CHECK: [[BITCAST:%[0-9]+]]:vgpr(<16 x s32>) = G_BITCAST [[COPY]](<8 x s64>)
; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
; CHECK: [[DEF:%[0-9]+]]:sgpr(s32) = G_IMPLICIT_DEF
; CHECK: [[DEF1:%[0-9]+]]:sgpr(s32) = G_IMPLICIT_DEF
; CHECK: [[DEF2:%[0-9]+]]:vgpr(<16 x s32>) = G_IMPLICIT_DEF
; CHECK: [[DEF3:%[0-9]+]]:vgpr(<16 x s32>) = G_IMPLICIT_DEF
; CHECK: [[DEF4:%[0-9]+]]:vgpr(<8 x s64>) = G_IMPLICIT_DEF
; CHECK: [[DEF5:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
; CHECK: .1:
; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000)
; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.0, %9, %bb.1
; CHECK: [[PHI1:%[0-9]+]]:vgpr(<8 x s64>) = G_PHI [[DEF]](<8 x s64>), %bb.0, %3(<8 x s64>), %bb.1
; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF5]], %bb.0, %25, %bb.1
; CHECK: [[PHI1:%[0-9]+]]:sgpr(s32) = G_PHI [[DEF]](s32), %bb.0, %8(s32), %bb.1
; CHECK: [[PHI2:%[0-9]+]]:sgpr(s32) = G_PHI [[DEF1]](s32), %bb.0, %9(s32), %bb.1
; CHECK: [[PHI3:%[0-9]+]]:vgpr(<16 x s32>) = G_PHI [[DEF2]](<16 x s32>), %bb.0, %10(<16 x s32>), %bb.1
; CHECK: [[PHI4:%[0-9]+]]:vgpr(<16 x s32>) = G_PHI [[DEF3]](<16 x s32>), %bb.0, %11(<16 x s32>), %bb.1
; CHECK: [[PHI5:%[0-9]+]]:vgpr(<8 x s64>) = G_PHI [[DEF4]](<8 x s64>), %bb.0, %3(<8 x s64>), %bb.1
; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY2]](s32), implicit $exec
; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY2]](s32), implicit $exec
; CHECK: [[IVEC:%[0-9]+]]:vgpr(<8 x s64>) = G_INSERT_VECTOR_ELT [[COPY]], [[COPY1]](s64), [[V_READFIRSTLANE_B32_]](s32)
; CHECK: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[V_READFIRSTLANE_B32_]], [[C]](s32)
; CHECK: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[SHL]], [[C]]
; CHECK: [[IVEC:%[0-9]+]]:vgpr(<16 x s32>) = G_INSERT_VECTOR_ELT [[BITCAST]], [[UV]](s32), [[SHL]](s32)
; CHECK: [[IVEC1:%[0-9]+]]:vgpr(<16 x s32>) = G_INSERT_VECTOR_ELT [[IVEC]], [[UV1]](s32), [[ADD]](s32)
; CHECK: [[BITCAST1:%[0-9]+]]:vgpr(<8 x s64>) = G_BITCAST [[IVEC1]](<16 x s32>)
; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec
; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
; CHECK: S_CBRANCH_EXECNZ %bb.1, implicit $exec
@@ -481,7 +524,7 @@ body: |
; CHECK: successors: %bb.3(0x80000000)
; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]]
; CHECK: .3:
; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY [[IVEC]](<8 x s64>)
; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY [[BITCAST1]](<8 x s64>)
%0:_(<8 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
%1:_(s64) = COPY $sgpr0_sgpr1
%2:_(s32) = COPY $vgpr16