
[AMDGPU][GlobalISel] Use scalar min/max instructions

SALU min/max s32 instructions exist, so use them. This means that
regbankselect can handle min/max much like add/sub/mul/shifts.

Differential Revision: https://reviews.llvm.org/D96047
Jay Foad 2021-02-04 16:08:39 +00:00
parent 425d60e18c
commit 414015c7e8
10 changed files with 1560 additions and 2824 deletions
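For illustration, here is a minimal end-to-end sketch of the effect (the function name is hypothetical and the exact registers depend on allocation): a uniform 32-bit max reaches regbankselect as a G_SMAX whose operands are already on the SGPR bank, so it can now stay as G_SMAX and select to the SALU instruction (roughly s_max_i32) instead of being expanded to a compare plus select (s_cmp_gt_i32 followed by s_cselect_b32), as the test diffs below show.

; Hypothetical uniform smax; both operands arrive in SGPRs.
declare i32 @llvm.smax.i32(i32, i32)

define amdgpu_ps i32 @smax_uniform_sketch(i32 inreg %a, i32 inreg %b) {
  %r = call i32 @llvm.smax.i32(i32 %a, i32 %b)
  ret i32 %r
}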


@ -591,21 +591,6 @@ AMDGPURegisterBankInfo::getInstrAlternativeMappings(
return AltMappings;
}
case TargetOpcode::G_SMIN:
case TargetOpcode::G_SMAX:
case TargetOpcode::G_UMIN:
case TargetOpcode::G_UMAX: {
static const OpRegBankEntry<3> Table[2] = {
{ { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
// Scalar requires cmp+select, and extends if 16-bit.
// FIXME: Should there be separate costs for 32 and 16-bit
{ { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 3 }
};
const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 1, 2 } };
return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
}
case TargetOpcode::G_UADDE:
case TargetOpcode::G_USUBE:
case TargetOpcode::G_SADDE:
@ -1576,23 +1561,8 @@ bool AMDGPURegisterBankInfo::applyMappingBFEIntrinsic(
return true;
}
// FIXME: Duplicated from LegalizerHelper
static CmpInst::Predicate minMaxToCompare(unsigned Opc) {
switch (Opc) {
case TargetOpcode::G_SMIN:
return CmpInst::ICMP_SLT;
case TargetOpcode::G_SMAX:
return CmpInst::ICMP_SGT;
case TargetOpcode::G_UMIN:
return CmpInst::ICMP_ULT;
case TargetOpcode::G_UMAX:
return CmpInst::ICMP_UGT;
default:
llvm_unreachable("not in integer min/max");
}
}
static unsigned minMaxToExtend(unsigned Opc) {
// Return a suitable opcode for extending the operands of Opc when widening.
static unsigned getExtendOp(unsigned Opc) {
switch (Opc) {
case TargetOpcode::G_SMIN:
case TargetOpcode::G_SMAX:
@ -1601,7 +1571,7 @@ static unsigned minMaxToExtend(unsigned Opc) {
case TargetOpcode::G_UMAX:
return TargetOpcode::G_ZEXT;
default:
llvm_unreachable("not in integer min/max");
return TargetOpcode::G_ANYEXT;
}
}
@ -1628,30 +1598,6 @@ unpackV2S16ToS32(MachineIRBuilder &B, Register Src, unsigned ExtOpcode) {
return std::make_pair(Bitcast.getReg(0), ShiftHi.getReg(0));
}
static MachineInstr *buildExpandedScalarMinMax(MachineIRBuilder &B,
CmpInst::Predicate Pred,
Register Dst, Register Src0,
Register Src1) {
const LLT CmpType = LLT::scalar(32);
auto Cmp = B.buildICmp(Pred, CmpType, Src0, Src1);
return B.buildSelect(Dst, Cmp, Src0, Src1);
}
// FIXME: Duplicated from LegalizerHelper, except changing the boolean type.
void AMDGPURegisterBankInfo::lowerScalarMinMax(MachineIRBuilder &B,
MachineInstr &MI) const {
Register Dst = MI.getOperand(0).getReg();
Register Src0 = MI.getOperand(1).getReg();
Register Src1 = MI.getOperand(2).getReg();
const CmpInst::Predicate Pred = minMaxToCompare(MI.getOpcode());
MachineInstr *Sel = buildExpandedScalarMinMax(B, Pred, Dst, Src0, Src1);
Register CmpReg = Sel->getOperand(1).getReg();
B.getMRI()->setRegBank(CmpReg, AMDGPU::SGPRRegBank);
MI.eraseFromParent();
}
// For cases where only a single copy is inserted for matching register banks.
// Replace the register in the instruction operand
static bool substituteSimpleCopyRegs(
@ -2341,7 +2287,11 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
case AMDGPU::G_MUL:
case AMDGPU::G_SHL:
case AMDGPU::G_LSHR:
case AMDGPU::G_ASHR: {
case AMDGPU::G_ASHR:
case AMDGPU::G_SMIN:
case AMDGPU::G_SMAX:
case AMDGPU::G_UMIN:
case AMDGPU::G_UMAX: {
Register DstReg = MI.getOperand(0).getReg();
LLT DstTy = MRI.getType(DstReg);
@ -2365,10 +2315,11 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
Register WideSrc0Lo, WideSrc0Hi;
Register WideSrc1Lo, WideSrc1Hi;
unsigned ExtendOp = getExtendOp(MI.getOpcode());
std::tie(WideSrc0Lo, WideSrc0Hi)
= unpackV2S16ToS32(B, MI.getOperand(1).getReg(), AMDGPU::G_ANYEXT);
= unpackV2S16ToS32(B, MI.getOperand(1).getReg(), ExtendOp);
std::tie(WideSrc1Lo, WideSrc1Hi)
= unpackV2S16ToS32(B, MI.getOperand(2).getReg(), AMDGPU::G_ANYEXT);
= unpackV2S16ToS32(B, MI.getOperand(2).getReg(), ExtendOp);
auto Lo = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Lo, WideSrc1Lo});
auto Hi = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Hi, WideSrc1Hi});
B.buildBuildVectorTrunc(DstReg, {Lo.getReg(0), Hi.getReg(0)});
@ -2390,73 +2341,6 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
return;
}
case AMDGPU::G_SMIN:
case AMDGPU::G_SMAX:
case AMDGPU::G_UMIN:
case AMDGPU::G_UMAX: {
Register DstReg = MI.getOperand(0).getReg();
const RegisterBank *DstBank =
OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
if (DstBank == &AMDGPU::VGPRRegBank)
break;
MachineFunction *MF = MI.getParent()->getParent();
MachineIRBuilder B(MI);
// Turn scalar min/max into a compare and select.
LLT Ty = MRI.getType(DstReg);
const LLT S32 = LLT::scalar(32);
const LLT S16 = LLT::scalar(16);
const LLT V2S16 = LLT::vector(2, 16);
if (Ty == V2S16) {
ApplyRegBankMapping ApplySALU(*this, MRI, &AMDGPU::SGPRRegBank);
B.setChangeObserver(ApplySALU);
// Need to widen to s32, and expand as cmp + select, and avoid producing
// illegal vector extends or unmerges that would need further
// legalization.
//
// TODO: Should we just readfirstlane? That should probably be handled
// with a UniformVGPR register bank that wouldn't need special
// consideration here.
Register Dst = MI.getOperand(0).getReg();
Register Src0 = MI.getOperand(1).getReg();
Register Src1 = MI.getOperand(2).getReg();
Register WideSrc0Lo, WideSrc0Hi;
Register WideSrc1Lo, WideSrc1Hi;
unsigned ExtendOp = minMaxToExtend(MI.getOpcode());
std::tie(WideSrc0Lo, WideSrc0Hi) = unpackV2S16ToS32(B, Src0, ExtendOp);
std::tie(WideSrc1Lo, WideSrc1Hi) = unpackV2S16ToS32(B, Src1, ExtendOp);
Register Lo = MRI.createGenericVirtualRegister(S32);
Register Hi = MRI.createGenericVirtualRegister(S32);
const CmpInst::Predicate Pred = minMaxToCompare(MI.getOpcode());
buildExpandedScalarMinMax(B, Pred, Lo, WideSrc0Lo, WideSrc1Lo);
buildExpandedScalarMinMax(B, Pred, Hi, WideSrc0Hi, WideSrc1Hi);
B.buildBuildVectorTrunc(Dst, {Lo, Hi});
MI.eraseFromParent();
} else if (Ty == S16) {
ApplyRegBankMapping ApplySALU(*this, MRI, &AMDGPU::SGPRRegBank);
B.setChangeObserver(ApplySALU);
LegalizerHelper Helper(*MF, ApplySALU, B);
// Need to widen to s32, and expand as cmp + select.
if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
llvm_unreachable("widenScalar should have succeeded");
// FIXME: This is relying on widenScalar leaving MI in place.
lowerScalarMinMax(B, MI);
} else
lowerScalarMinMax(B, MI);
return;
}
case AMDGPU::G_SEXT_INREG: {
SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1));
if (SrcRegs.empty())


@ -84,8 +84,6 @@ public:
bool applyMappingBFEIntrinsic(const OperandsMapper &OpdMapper,
bool Signed) const;
void lowerScalarMinMax(MachineIRBuilder &B, MachineInstr &MI) const;
Register handleD16VData(MachineIRBuilder &B, MachineRegisterInfo &MRI,
Register Reg) const;
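A similar sketch for the uniform <2 x i16> case handled by the applyMappingImpl change above (again, the function name is hypothetical): regbankselect now widens the two halves with the extend opcode returned by getExtendOp, emits two scalar s32 G_SMIN instructions, and repacks the result with G_BUILD_VECTOR_TRUNC instead of expanding each half to G_ICMP + G_SELECT; the regbankselect MIR tests that follow check exactly that expansion.

; Hypothetical uniform <2 x i16> smin; the result is returned packed in an i32.
declare <2 x i16> @llvm.smin.v2i16(<2 x i16>, <2 x i16>)

define amdgpu_ps i32 @smin_v2i16_uniform_sketch(<2 x i16> inreg %a, <2 x i16> inreg %b) {
  %m = call <2 x i16> @llvm.smin.v2i16(<2 x i16> %a, <2 x i16> %b)
  %r = bitcast <2 x i16> %m to i32
  ret i32 %r
}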


@ -13,8 +13,7 @@ body: |
; CHECK-LABEL: name: smax_s32_ss
; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
; CHECK: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(sgt), [[COPY]](s32), [[COPY1]]
; CHECK: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[ICMP]](s32), [[COPY]], [[COPY1]]
; CHECK: [[SMAX:%[0-9]+]]:sgpr(s32) = G_SMAX [[COPY]], [[COPY1]]
%0:_(s32) = COPY $sgpr0
%1:_(s32) = COPY $sgpr1
%2:_(s32) = G_SMAX %0, %1
@ -90,9 +89,8 @@ body: |
; CHECK-LABEL: name: smax_s32_ss_vgpr_use
; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
; CHECK: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(sgt), [[COPY]](s32), [[COPY1]]
; CHECK: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[ICMP]](s32), [[COPY]], [[COPY1]]
; CHECK: $vgpr0 = COPY [[SELECT]](s32)
; CHECK: [[SMAX:%[0-9]+]]:sgpr(s32) = G_SMAX [[COPY]], [[COPY1]]
; CHECK: $vgpr0 = COPY [[SMAX]](s32)
%0:_(s32) = COPY $sgpr0
%1:_(s32) = COPY $sgpr1
%2:_(s32) = G_SMAX %0, %1
@ -114,9 +112,8 @@ body: |
; CHECK: [[TRUNC1:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY1]](s32)
; CHECK: [[SEXT:%[0-9]+]]:sgpr(s32) = G_SEXT [[TRUNC]](s16)
; CHECK: [[SEXT1:%[0-9]+]]:sgpr(s32) = G_SEXT [[TRUNC1]](s16)
; CHECK: [[TRUNC2:%[0-9]+]]:sgpr(s16) = G_TRUNC %8(s32)
; CHECK: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(sgt), [[SEXT]](s32), [[SEXT1]]
; CHECK: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[ICMP]](s32), [[SEXT]], [[SEXT1]]
; CHECK: [[SMAX:%[0-9]+]]:sgpr(s32) = G_SMAX [[SEXT]], [[SEXT1]]
; CHECK: [[TRUNC2:%[0-9]+]]:sgpr(s16) = G_TRUNC [[SMAX]](s32)
; CHECK: [[ANYEXT:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[TRUNC2]](s16)
; CHECK: $sgpr0 = COPY [[ANYEXT]](s32)
%0:_(s32) = COPY $sgpr0
@ -144,9 +141,8 @@ body: |
; CHECK: [[TRUNC1:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY1]](s32)
; CHECK: [[SEXT:%[0-9]+]]:sgpr(s32) = G_SEXT [[TRUNC]](s16)
; CHECK: [[SEXT1:%[0-9]+]]:sgpr(s32) = G_SEXT [[TRUNC1]](s16)
; CHECK: [[TRUNC2:%[0-9]+]]:sgpr(s16) = G_TRUNC %8(s32)
; CHECK: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(sgt), [[SEXT]](s32), [[SEXT1]]
; CHECK: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[ICMP]](s32), [[SEXT]], [[SEXT1]]
; CHECK: [[SMAX:%[0-9]+]]:sgpr(s32) = G_SMAX [[SEXT]], [[SEXT1]]
; CHECK: [[TRUNC2:%[0-9]+]]:sgpr(s16) = G_TRUNC [[SMAX]](s32)
; CHECK: [[ANYEXT:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[TRUNC2]](s16)
; CHECK: $vgpr0 = COPY [[ANYEXT]](s32)
%0:_(s32) = COPY $sgpr0
@ -178,11 +174,9 @@ body: |
; CHECK: [[SEXT_INREG1:%[0-9]+]]:sgpr(s32) = G_SEXT_INREG [[BITCAST1]], 16
; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 16
; CHECK: [[ASHR1:%[0-9]+]]:sgpr(s32) = G_ASHR [[BITCAST1]], [[C1]](s32)
; CHECK: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(sgt), [[SEXT_INREG]](s32), [[SEXT_INREG1]]
; CHECK: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[ICMP]](s32), [[SEXT_INREG]], [[SEXT_INREG1]]
; CHECK: [[ICMP1:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(sgt), [[ASHR]](s32), [[ASHR1]]
; CHECK: [[SELECT1:%[0-9]+]]:sgpr(s32) = G_SELECT [[ICMP1]](s32), [[ASHR]], [[ASHR1]]
; CHECK: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[SELECT]](s32), [[SELECT1]](s32)
; CHECK: [[SMAX:%[0-9]+]]:sgpr(s32) = G_SMAX [[SEXT_INREG]], [[SEXT_INREG1]]
; CHECK: [[SMAX1:%[0-9]+]]:sgpr(s32) = G_SMAX [[ASHR]], [[ASHR1]]
; CHECK: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[SMAX]](s32), [[SMAX1]](s32)
; CHECK: $sgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>)
%0:_(<2 x s16>) = COPY $sgpr0
%1:_(<2 x s16>) = COPY $sgpr1


@ -1,6 +1,6 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=regbankselect -verify-machineinstrs -regbankselect-fast -o - %s | FileCheck %s
# XUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=regbankselect -verify-machineinstrs -regbankselect-greedy -o - %s | FileCheck %s
# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=regbankselect -verify-machineinstrs -regbankselect-greedy -o - %s | FileCheck %s
---
name: smin_s32_ss
@ -13,9 +13,8 @@ body: |
; CHECK-LABEL: name: smin_s32_ss
; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
; CHECK: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(slt), [[COPY]](s32), [[COPY1]]
; CHECK: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[ICMP]](s32), [[COPY]], [[COPY1]]
; CHECK: $sgpr0 = COPY [[SELECT]](s32)
; CHECK: [[SMIN:%[0-9]+]]:sgpr(s32) = G_SMIN [[COPY]], [[COPY1]]
; CHECK: $sgpr0 = COPY [[SMIN]](s32)
%0:_(s32) = COPY $sgpr0
%1:_(s32) = COPY $sgpr1
%2:_(s32) = G_SMIN %0, %1
@ -93,9 +92,8 @@ body: |
; CHECK-LABEL: name: smin_s32_ss_vgpr_use
; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
; CHECK: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(slt), [[COPY]](s32), [[COPY1]]
; CHECK: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[ICMP]](s32), [[COPY]], [[COPY1]]
; CHECK: $vgpr0 = COPY [[SELECT]](s32)
; CHECK: [[SMIN:%[0-9]+]]:sgpr(s32) = G_SMIN [[COPY]], [[COPY1]]
; CHECK: $vgpr0 = COPY [[SMIN]](s32)
%0:_(s32) = COPY $sgpr0
%1:_(s32) = COPY $sgpr1
%2:_(s32) = G_SMIN %0, %1
@ -117,9 +115,8 @@ body: |
; CHECK: [[TRUNC1:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY1]](s32)
; CHECK: [[SEXT:%[0-9]+]]:sgpr(s32) = G_SEXT [[TRUNC]](s16)
; CHECK: [[SEXT1:%[0-9]+]]:sgpr(s32) = G_SEXT [[TRUNC1]](s16)
; CHECK: [[TRUNC2:%[0-9]+]]:sgpr(s16) = G_TRUNC %8(s32)
; CHECK: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(slt), [[SEXT]](s32), [[SEXT1]]
; CHECK: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[ICMP]](s32), [[SEXT]], [[SEXT1]]
; CHECK: [[SMIN:%[0-9]+]]:sgpr(s32) = G_SMIN [[SEXT]], [[SEXT1]]
; CHECK: [[TRUNC2:%[0-9]+]]:sgpr(s16) = G_TRUNC [[SMIN]](s32)
; CHECK: [[ANYEXT:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[TRUNC2]](s16)
; CHECK: $sgpr0 = COPY [[ANYEXT]](s32)
%0:_(s32) = COPY $sgpr0
@ -147,9 +144,8 @@ body: |
; CHECK: [[TRUNC1:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY1]](s32)
; CHECK: [[SEXT:%[0-9]+]]:sgpr(s32) = G_SEXT [[TRUNC]](s16)
; CHECK: [[SEXT1:%[0-9]+]]:sgpr(s32) = G_SEXT [[TRUNC1]](s16)
; CHECK: [[TRUNC2:%[0-9]+]]:sgpr(s16) = G_TRUNC %8(s32)
; CHECK: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(slt), [[SEXT]](s32), [[SEXT1]]
; CHECK: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[ICMP]](s32), [[SEXT]], [[SEXT1]]
; CHECK: [[SMIN:%[0-9]+]]:sgpr(s32) = G_SMIN [[SEXT]], [[SEXT1]]
; CHECK: [[TRUNC2:%[0-9]+]]:sgpr(s16) = G_TRUNC [[SMIN]](s32)
; CHECK: [[ANYEXT:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[TRUNC2]](s16)
; CHECK: $vgpr0 = COPY [[ANYEXT]](s32)
%0:_(s32) = COPY $sgpr0
@ -181,11 +177,9 @@ body: |
; CHECK: [[SEXT_INREG1:%[0-9]+]]:sgpr(s32) = G_SEXT_INREG [[BITCAST1]], 16
; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 16
; CHECK: [[ASHR1:%[0-9]+]]:sgpr(s32) = G_ASHR [[BITCAST1]], [[C1]](s32)
; CHECK: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(slt), [[SEXT_INREG]](s32), [[SEXT_INREG1]]
; CHECK: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[ICMP]](s32), [[SEXT_INREG]], [[SEXT_INREG1]]
; CHECK: [[ICMP1:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(slt), [[ASHR]](s32), [[ASHR1]]
; CHECK: [[SELECT1:%[0-9]+]]:sgpr(s32) = G_SELECT [[ICMP1]](s32), [[ASHR]], [[ASHR1]]
; CHECK: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[SELECT]](s32), [[SELECT1]](s32)
; CHECK: [[SMIN:%[0-9]+]]:sgpr(s32) = G_SMIN [[SEXT_INREG]], [[SEXT_INREG1]]
; CHECK: [[SMIN1:%[0-9]+]]:sgpr(s32) = G_SMIN [[ASHR]], [[ASHR1]]
; CHECK: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[SMIN]](s32), [[SMIN1]](s32)
; CHECK: $sgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>)
%0:_(<2 x s16>) = COPY $sgpr0
%1:_(<2 x s16>) = COPY $sgpr1


@ -13,9 +13,8 @@ body: |
; CHECK-LABEL: name: umax_s32_ss
; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
; CHECK: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(ugt), [[COPY]](s32), [[COPY1]]
; CHECK: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[ICMP]](s32), [[COPY]], [[COPY1]]
; CHECK: $sgpr0 = COPY [[SELECT]](s32)
; CHECK: [[UMAX:%[0-9]+]]:sgpr(s32) = G_UMAX [[COPY]], [[COPY1]]
; CHECK: $sgpr0 = COPY [[UMAX]](s32)
%0:_(s32) = COPY $sgpr0
%1:_(s32) = COPY $sgpr1
%2:_(s32) = G_UMAX %0, %1
@ -93,9 +92,8 @@ body: |
; CHECK-LABEL: name: umax_s32_ss_vgpr_use
; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
; CHECK: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(ugt), [[COPY]](s32), [[COPY1]]
; CHECK: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[ICMP]](s32), [[COPY]], [[COPY1]]
; CHECK: $vgpr0 = COPY [[SELECT]](s32)
; CHECK: [[UMAX:%[0-9]+]]:sgpr(s32) = G_UMAX [[COPY]], [[COPY1]]
; CHECK: $vgpr0 = COPY [[UMAX]](s32)
%0:_(s32) = COPY $sgpr0
%1:_(s32) = COPY $sgpr1
%2:_(s32) = G_UMAX %0, %1
@ -117,9 +115,8 @@ body: |
; CHECK: [[TRUNC1:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY1]](s32)
; CHECK: [[ZEXT:%[0-9]+]]:sgpr(s32) = G_ZEXT [[TRUNC]](s16)
; CHECK: [[ZEXT1:%[0-9]+]]:sgpr(s32) = G_ZEXT [[TRUNC1]](s16)
; CHECK: [[TRUNC2:%[0-9]+]]:sgpr(s16) = G_TRUNC %8(s32)
; CHECK: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(ugt), [[ZEXT]](s32), [[ZEXT1]]
; CHECK: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[ICMP]](s32), [[ZEXT]], [[ZEXT1]]
; CHECK: [[UMAX:%[0-9]+]]:sgpr(s32) = G_UMAX [[ZEXT]], [[ZEXT1]]
; CHECK: [[TRUNC2:%[0-9]+]]:sgpr(s16) = G_TRUNC [[UMAX]](s32)
; CHECK: [[ANYEXT:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[TRUNC2]](s16)
; CHECK: $sgpr0 = COPY [[ANYEXT]](s32)
%0:_(s32) = COPY $sgpr0
@ -147,9 +144,8 @@ body: |
; CHECK: [[TRUNC1:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY1]](s32)
; CHECK: [[ZEXT:%[0-9]+]]:sgpr(s32) = G_ZEXT [[TRUNC]](s16)
; CHECK: [[ZEXT1:%[0-9]+]]:sgpr(s32) = G_ZEXT [[TRUNC1]](s16)
; CHECK: [[TRUNC2:%[0-9]+]]:sgpr(s16) = G_TRUNC %8(s32)
; CHECK: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(ugt), [[ZEXT]](s32), [[ZEXT1]]
; CHECK: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[ICMP]](s32), [[ZEXT]], [[ZEXT1]]
; CHECK: [[UMAX:%[0-9]+]]:sgpr(s32) = G_UMAX [[ZEXT]], [[ZEXT1]]
; CHECK: [[TRUNC2:%[0-9]+]]:sgpr(s16) = G_TRUNC [[UMAX]](s32)
; CHECK: [[ANYEXT:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[TRUNC2]](s16)
; CHECK: $vgpr0 = COPY [[ANYEXT]](s32)
%0:_(s32) = COPY $sgpr0
@ -183,11 +179,9 @@ body: |
; CHECK: [[LSHR1:%[0-9]+]]:sgpr(s32) = G_LSHR [[BITCAST1]], [[C2]](s32)
; CHECK: [[C3:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 65535
; CHECK: [[AND1:%[0-9]+]]:sgpr(s32) = G_AND [[BITCAST1]], [[C3]]
; CHECK: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(ugt), [[AND]](s32), [[AND1]]
; CHECK: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[ICMP]](s32), [[AND]], [[AND1]]
; CHECK: [[ICMP1:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(ugt), [[LSHR]](s32), [[LSHR1]]
; CHECK: [[SELECT1:%[0-9]+]]:sgpr(s32) = G_SELECT [[ICMP1]](s32), [[LSHR]], [[LSHR1]]
; CHECK: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[SELECT]](s32), [[SELECT1]](s32)
; CHECK: [[UMAX:%[0-9]+]]:sgpr(s32) = G_UMAX [[AND]], [[AND1]]
; CHECK: [[UMAX1:%[0-9]+]]:sgpr(s32) = G_UMAX [[LSHR]], [[LSHR1]]
; CHECK: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[UMAX]](s32), [[UMAX1]](s32)
; CHECK: $sgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>)
%0:_(<2 x s16>) = COPY $sgpr0
%1:_(<2 x s16>) = COPY $sgpr1


@ -13,9 +13,8 @@ body: |
; CHECK-LABEL: name: umin_s32_ss
; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
; CHECK: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(ult), [[COPY]](s32), [[COPY1]]
; CHECK: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[ICMP]](s32), [[COPY]], [[COPY1]]
; CHECK: $sgpr0 = COPY [[SELECT]](s32)
; CHECK: [[UMIN:%[0-9]+]]:sgpr(s32) = G_UMIN [[COPY]], [[COPY1]]
; CHECK: $sgpr0 = COPY [[UMIN]](s32)
%0:_(s32) = COPY $sgpr0
%1:_(s32) = COPY $sgpr1
%2:_(s32) = G_UMIN %0, %1
@ -97,9 +96,8 @@ body: |
; CHECK-LABEL: name: umin_s32_ss_vgpr_use
; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
; CHECK: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(ult), [[COPY]](s32), [[COPY1]]
; CHECK: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[ICMP]](s32), [[COPY]], [[COPY1]]
; CHECK: $vgpr0 = COPY [[SELECT]](s32)
; CHECK: [[UMIN:%[0-9]+]]:sgpr(s32) = G_UMIN [[COPY]], [[COPY1]]
; CHECK: $vgpr0 = COPY [[UMIN]](s32)
%0:_(s32) = COPY $sgpr0
%1:_(s32) = COPY $sgpr1
%2:_(s32) = G_UMIN %0, %1
@ -121,9 +119,8 @@ body: |
; CHECK: [[TRUNC1:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY1]](s32)
; CHECK: [[ZEXT:%[0-9]+]]:sgpr(s32) = G_ZEXT [[TRUNC]](s16)
; CHECK: [[ZEXT1:%[0-9]+]]:sgpr(s32) = G_ZEXT [[TRUNC1]](s16)
; CHECK: [[TRUNC2:%[0-9]+]]:sgpr(s16) = G_TRUNC %8(s32)
; CHECK: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(ult), [[ZEXT]](s32), [[ZEXT1]]
; CHECK: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[ICMP]](s32), [[ZEXT]], [[ZEXT1]]
; CHECK: [[UMIN:%[0-9]+]]:sgpr(s32) = G_UMIN [[ZEXT]], [[ZEXT1]]
; CHECK: [[TRUNC2:%[0-9]+]]:sgpr(s16) = G_TRUNC [[UMIN]](s32)
; CHECK: [[ANYEXT:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[TRUNC2]](s16)
; CHECK: $sgpr0 = COPY [[ANYEXT]](s32)
%0:_(s32) = COPY $sgpr0
@ -151,9 +148,8 @@ body: |
; CHECK: [[TRUNC1:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY1]](s32)
; CHECK: [[ZEXT:%[0-9]+]]:sgpr(s32) = G_ZEXT [[TRUNC]](s16)
; CHECK: [[ZEXT1:%[0-9]+]]:sgpr(s32) = G_ZEXT [[TRUNC1]](s16)
; CHECK: [[TRUNC2:%[0-9]+]]:sgpr(s16) = G_TRUNC %8(s32)
; CHECK: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(ult), [[ZEXT]](s32), [[ZEXT1]]
; CHECK: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[ICMP]](s32), [[ZEXT]], [[ZEXT1]]
; CHECK: [[UMIN:%[0-9]+]]:sgpr(s32) = G_UMIN [[ZEXT]], [[ZEXT1]]
; CHECK: [[TRUNC2:%[0-9]+]]:sgpr(s16) = G_TRUNC [[UMIN]](s32)
; CHECK: [[ANYEXT:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[TRUNC2]](s16)
; CHECK: $vgpr0 = COPY [[ANYEXT]](s32)
%0:_(s32) = COPY $sgpr0
@ -187,11 +183,9 @@ body: |
; CHECK: [[LSHR1:%[0-9]+]]:sgpr(s32) = G_LSHR [[BITCAST1]], [[C2]](s32)
; CHECK: [[C3:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 65535
; CHECK: [[AND1:%[0-9]+]]:sgpr(s32) = G_AND [[BITCAST1]], [[C3]]
; CHECK: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(ult), [[AND]](s32), [[AND1]]
; CHECK: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[ICMP]](s32), [[AND]], [[AND1]]
; CHECK: [[ICMP1:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(ult), [[LSHR]](s32), [[LSHR1]]
; CHECK: [[SELECT1:%[0-9]+]]:sgpr(s32) = G_SELECT [[ICMP1]](s32), [[LSHR]], [[LSHR1]]
; CHECK: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[SELECT]](s32), [[SELECT1]](s32)
; CHECK: [[UMIN:%[0-9]+]]:sgpr(s32) = G_UMIN [[AND]], [[AND1]]
; CHECK: [[UMIN1:%[0-9]+]]:sgpr(s32) = G_UMIN [[LSHR]], [[LSHR1]]
; CHECK: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[UMIN]](s32), [[UMIN1]](s32)
; CHECK: $sgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>)
%0:_(<2 x s16>) = COPY $sgpr0
%1:_(<2 x s16>) = COPY $sgpr1

File diff suppressed because it is too large.

File diff suppressed because it is too large.


@ -53,8 +53,7 @@ define amdgpu_ps i7 @s_uaddsat_i7(i7 inreg %lhs, i7 inreg %rhs) {
; GFX6-NEXT: s_lshl_b32 s0, s0, 25
; GFX6-NEXT: s_lshl_b32 s1, s1, 25
; GFX6-NEXT: s_not_b32 s2, s0
; GFX6-NEXT: s_cmp_lt_u32 s2, s1
; GFX6-NEXT: s_cselect_b32 s1, s2, s1
; GFX6-NEXT: s_min_u32 s1, s2, s1
; GFX6-NEXT: s_add_i32 s0, s0, s1
; GFX6-NEXT: s_lshr_b32 s0, s0, 25
; GFX6-NEXT: ; return to shader part epilog
@ -143,8 +142,7 @@ define amdgpu_ps i8 @s_uaddsat_i8(i8 inreg %lhs, i8 inreg %rhs) {
; GFX6-NEXT: s_lshl_b32 s0, s0, 24
; GFX6-NEXT: s_lshl_b32 s1, s1, 24
; GFX6-NEXT: s_not_b32 s2, s0
; GFX6-NEXT: s_cmp_lt_u32 s2, s1
; GFX6-NEXT: s_cselect_b32 s1, s2, s1
; GFX6-NEXT: s_min_u32 s1, s2, s1
; GFX6-NEXT: s_add_i32 s0, s0, s1
; GFX6-NEXT: s_lshr_b32 s0, s0, 24
; GFX6-NEXT: ; return to shader part epilog
@ -272,17 +270,15 @@ define amdgpu_ps i16 @s_uaddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
; GFX6-NEXT: s_lshr_b32 s3, s1, 8
; GFX6-NEXT: s_lshl_b32 s1, s1, 24
; GFX6-NEXT: s_not_b32 s4, s0
; GFX6-NEXT: s_cmp_lt_u32 s4, s1
; GFX6-NEXT: s_cselect_b32 s1, s4, s1
; GFX6-NEXT: s_min_u32 s1, s4, s1
; GFX6-NEXT: s_add_i32 s0, s0, s1
; GFX6-NEXT: s_lshl_b32 s1, s2, 24
; GFX6-NEXT: s_lshl_b32 s2, s3, 24
; GFX6-NEXT: s_lshr_b32 s0, s0, 24
; GFX6-NEXT: s_not_b32 s3, s1
; GFX6-NEXT: s_cmp_lt_u32 s3, s2
; GFX6-NEXT: s_cselect_b32 s2, s3, s2
; GFX6-NEXT: s_min_u32 s2, s3, s2
; GFX6-NEXT: s_add_i32 s1, s1, s2
; GFX6-NEXT: s_lshr_b32 s1, s1, 24
; GFX6-NEXT: s_lshr_b32 s0, s0, 24
; GFX6-NEXT: s_lshl_b32 s1, s1, 8
; GFX6-NEXT: s_or_b32 s0, s0, s1
; GFX6-NEXT: ; return to shader part epilog
@ -521,31 +517,27 @@ define amdgpu_ps i32 @s_uaddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
; GFX6-NEXT: s_lshr_b32 s7, s1, 24
; GFX6-NEXT: s_lshl_b32 s1, s1, 24
; GFX6-NEXT: s_not_b32 s8, s0
; GFX6-NEXT: s_cmp_lt_u32 s8, s1
; GFX6-NEXT: s_cselect_b32 s1, s8, s1
; GFX6-NEXT: s_min_u32 s1, s8, s1
; GFX6-NEXT: s_add_i32 s0, s0, s1
; GFX6-NEXT: s_lshl_b32 s1, s2, 24
; GFX6-NEXT: s_lshl_b32 s2, s5, 24
; GFX6-NEXT: s_lshr_b32 s0, s0, 24
; GFX6-NEXT: s_not_b32 s5, s1
; GFX6-NEXT: s_cmp_lt_u32 s5, s2
; GFX6-NEXT: s_cselect_b32 s2, s5, s2
; GFX6-NEXT: s_min_u32 s2, s5, s2
; GFX6-NEXT: s_add_i32 s1, s1, s2
; GFX6-NEXT: s_lshl_b32 s2, s3, 24
; GFX6-NEXT: s_lshr_b32 s1, s1, 24
; GFX6-NEXT: s_lshl_b32 s3, s6, 24
; GFX6-NEXT: s_not_b32 s5, s2
; GFX6-NEXT: s_cmp_lt_u32 s5, s3
; GFX6-NEXT: s_cselect_b32 s3, s5, s3
; GFX6-NEXT: s_min_u32 s3, s5, s3
; GFX6-NEXT: s_add_i32 s2, s2, s3
; GFX6-NEXT: s_lshl_b32 s3, s4, 24
; GFX6-NEXT: s_lshr_b32 s2, s2, 24
; GFX6-NEXT: s_lshl_b32 s4, s7, 24
; GFX6-NEXT: s_not_b32 s5, s3
; GFX6-NEXT: s_cmp_lt_u32 s5, s4
; GFX6-NEXT: s_cselect_b32 s4, s5, s4
; GFX6-NEXT: s_lshr_b32 s1, s1, 24
; GFX6-NEXT: s_min_u32 s4, s5, s4
; GFX6-NEXT: s_add_i32 s3, s3, s4
; GFX6-NEXT: s_lshr_b32 s0, s0, 24
; GFX6-NEXT: s_lshl_b32 s1, s1, 8
; GFX6-NEXT: s_lshr_b32 s2, s2, 24
; GFX6-NEXT: s_or_b32 s0, s0, s1
; GFX6-NEXT: s_lshl_b32 s1, s2, 16
; GFX6-NEXT: s_lshr_b32 s3, s3, 24
@ -736,8 +728,7 @@ define amdgpu_ps i24 @s_uaddsat_i24(i24 inreg %lhs, i24 inreg %rhs) {
; GFX6-NEXT: s_lshl_b32 s0, s0, 8
; GFX6-NEXT: s_lshl_b32 s1, s1, 8
; GFX6-NEXT: s_not_b32 s2, s0
; GFX6-NEXT: s_cmp_lt_u32 s2, s1
; GFX6-NEXT: s_cselect_b32 s1, s2, s1
; GFX6-NEXT: s_min_u32 s1, s2, s1
; GFX6-NEXT: s_add_i32 s0, s0, s1
; GFX6-NEXT: s_lshr_b32 s0, s0, 8
; GFX6-NEXT: ; return to shader part epilog
@ -809,8 +800,7 @@ define amdgpu_ps i32 @s_uaddsat_i32(i32 inreg %lhs, i32 inreg %rhs) {
; GFX6-LABEL: s_uaddsat_i32:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_not_b32 s2, s0
; GFX6-NEXT: s_cmp_lt_u32 s2, s1
; GFX6-NEXT: s_cselect_b32 s1, s2, s1
; GFX6-NEXT: s_min_u32 s1, s2, s1
; GFX6-NEXT: s_add_i32 s0, s0, s1
; GFX6-NEXT: ; return to shader part epilog
;
@ -932,12 +922,10 @@ define amdgpu_ps <2 x i32> @s_uaddsat_v2i32(<2 x i32> inreg %lhs, <2 x i32> inre
; GFX6-LABEL: s_uaddsat_v2i32:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_not_b32 s4, s0
; GFX6-NEXT: s_cmp_lt_u32 s4, s2
; GFX6-NEXT: s_cselect_b32 s2, s4, s2
; GFX6-NEXT: s_min_u32 s2, s4, s2
; GFX6-NEXT: s_add_i32 s0, s0, s2
; GFX6-NEXT: s_not_b32 s2, s1
; GFX6-NEXT: s_cmp_lt_u32 s2, s3
; GFX6-NEXT: s_cselect_b32 s2, s2, s3
; GFX6-NEXT: s_min_u32 s2, s2, s3
; GFX6-NEXT: s_add_i32 s1, s1, s2
; GFX6-NEXT: ; return to shader part epilog
;
@ -1019,16 +1007,13 @@ define amdgpu_ps <3 x i32> @s_uaddsat_v3i32(<3 x i32> inreg %lhs, <3 x i32> inre
; GFX6-LABEL: s_uaddsat_v3i32:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_not_b32 s6, s0
; GFX6-NEXT: s_cmp_lt_u32 s6, s3
; GFX6-NEXT: s_cselect_b32 s3, s6, s3
; GFX6-NEXT: s_min_u32 s3, s6, s3
; GFX6-NEXT: s_add_i32 s0, s0, s3
; GFX6-NEXT: s_not_b32 s3, s1
; GFX6-NEXT: s_cmp_lt_u32 s3, s4
; GFX6-NEXT: s_cselect_b32 s3, s3, s4
; GFX6-NEXT: s_min_u32 s3, s3, s4
; GFX6-NEXT: s_add_i32 s1, s1, s3
; GFX6-NEXT: s_not_b32 s3, s2
; GFX6-NEXT: s_cmp_lt_u32 s3, s5
; GFX6-NEXT: s_cselect_b32 s3, s3, s5
; GFX6-NEXT: s_min_u32 s3, s3, s5
; GFX6-NEXT: s_add_i32 s2, s2, s3
; GFX6-NEXT: ; return to shader part epilog
;
@ -1124,20 +1109,16 @@ define amdgpu_ps <4 x i32> @s_uaddsat_v4i32(<4 x i32> inreg %lhs, <4 x i32> inre
; GFX6-LABEL: s_uaddsat_v4i32:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_not_b32 s8, s0
; GFX6-NEXT: s_cmp_lt_u32 s8, s4
; GFX6-NEXT: s_cselect_b32 s4, s8, s4
; GFX6-NEXT: s_min_u32 s4, s8, s4
; GFX6-NEXT: s_add_i32 s0, s0, s4
; GFX6-NEXT: s_not_b32 s4, s1
; GFX6-NEXT: s_cmp_lt_u32 s4, s5
; GFX6-NEXT: s_cselect_b32 s4, s4, s5
; GFX6-NEXT: s_min_u32 s4, s4, s5
; GFX6-NEXT: s_add_i32 s1, s1, s4
; GFX6-NEXT: s_not_b32 s4, s2
; GFX6-NEXT: s_cmp_lt_u32 s4, s6
; GFX6-NEXT: s_cselect_b32 s4, s4, s6
; GFX6-NEXT: s_min_u32 s4, s4, s6
; GFX6-NEXT: s_add_i32 s2, s2, s4
; GFX6-NEXT: s_not_b32 s4, s3
; GFX6-NEXT: s_cmp_lt_u32 s4, s7
; GFX6-NEXT: s_cselect_b32 s4, s4, s7
; GFX6-NEXT: s_min_u32 s4, s4, s7
; GFX6-NEXT: s_add_i32 s3, s3, s4
; GFX6-NEXT: ; return to shader part epilog
;
@ -1247,24 +1228,19 @@ define amdgpu_ps <5 x i32> @s_uaddsat_v5i32(<5 x i32> inreg %lhs, <5 x i32> inre
; GFX6-LABEL: s_uaddsat_v5i32:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_not_b32 s10, s0
; GFX6-NEXT: s_cmp_lt_u32 s10, s5
; GFX6-NEXT: s_cselect_b32 s5, s10, s5
; GFX6-NEXT: s_min_u32 s5, s10, s5
; GFX6-NEXT: s_add_i32 s0, s0, s5
; GFX6-NEXT: s_not_b32 s5, s1
; GFX6-NEXT: s_cmp_lt_u32 s5, s6
; GFX6-NEXT: s_cselect_b32 s5, s5, s6
; GFX6-NEXT: s_min_u32 s5, s5, s6
; GFX6-NEXT: s_add_i32 s1, s1, s5
; GFX6-NEXT: s_not_b32 s5, s2
; GFX6-NEXT: s_cmp_lt_u32 s5, s7
; GFX6-NEXT: s_cselect_b32 s5, s5, s7
; GFX6-NEXT: s_min_u32 s5, s5, s7
; GFX6-NEXT: s_add_i32 s2, s2, s5
; GFX6-NEXT: s_not_b32 s5, s3
; GFX6-NEXT: s_cmp_lt_u32 s5, s8
; GFX6-NEXT: s_cselect_b32 s5, s5, s8
; GFX6-NEXT: s_min_u32 s5, s5, s8
; GFX6-NEXT: s_add_i32 s3, s3, s5
; GFX6-NEXT: s_not_b32 s5, s4
; GFX6-NEXT: s_cmp_lt_u32 s5, s9
; GFX6-NEXT: s_cselect_b32 s5, s5, s9
; GFX6-NEXT: s_min_u32 s5, s5, s9
; GFX6-NEXT: s_add_i32 s4, s4, s5
; GFX6-NEXT: ; return to shader part epilog
;
@ -1448,68 +1424,52 @@ define amdgpu_ps <16 x i32> @s_uaddsat_v16i32(<16 x i32> inreg %lhs, <16 x i32>
; GFX6-LABEL: s_uaddsat_v16i32:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_not_b32 s32, s0
; GFX6-NEXT: s_cmp_lt_u32 s32, s16
; GFX6-NEXT: s_cselect_b32 s16, s32, s16
; GFX6-NEXT: s_min_u32 s16, s32, s16
; GFX6-NEXT: s_add_i32 s0, s0, s16
; GFX6-NEXT: s_not_b32 s16, s1
; GFX6-NEXT: s_cmp_lt_u32 s16, s17
; GFX6-NEXT: s_cselect_b32 s16, s16, s17
; GFX6-NEXT: s_min_u32 s16, s16, s17
; GFX6-NEXT: s_add_i32 s1, s1, s16
; GFX6-NEXT: s_not_b32 s16, s2
; GFX6-NEXT: s_cmp_lt_u32 s16, s18
; GFX6-NEXT: s_cselect_b32 s16, s16, s18
; GFX6-NEXT: s_min_u32 s16, s16, s18
; GFX6-NEXT: s_add_i32 s2, s2, s16
; GFX6-NEXT: s_not_b32 s16, s3
; GFX6-NEXT: s_cmp_lt_u32 s16, s19
; GFX6-NEXT: s_cselect_b32 s16, s16, s19
; GFX6-NEXT: s_min_u32 s16, s16, s19
; GFX6-NEXT: s_add_i32 s3, s3, s16
; GFX6-NEXT: s_not_b32 s16, s4
; GFX6-NEXT: s_cmp_lt_u32 s16, s20
; GFX6-NEXT: s_cselect_b32 s16, s16, s20
; GFX6-NEXT: s_min_u32 s16, s16, s20
; GFX6-NEXT: s_add_i32 s4, s4, s16
; GFX6-NEXT: s_not_b32 s16, s5
; GFX6-NEXT: s_cmp_lt_u32 s16, s21
; GFX6-NEXT: s_cselect_b32 s16, s16, s21
; GFX6-NEXT: s_min_u32 s16, s16, s21
; GFX6-NEXT: s_add_i32 s5, s5, s16
; GFX6-NEXT: s_not_b32 s16, s6
; GFX6-NEXT: s_cmp_lt_u32 s16, s22
; GFX6-NEXT: s_cselect_b32 s16, s16, s22
; GFX6-NEXT: s_min_u32 s16, s16, s22
; GFX6-NEXT: s_add_i32 s6, s6, s16
; GFX6-NEXT: s_not_b32 s16, s7
; GFX6-NEXT: s_cmp_lt_u32 s16, s23
; GFX6-NEXT: s_cselect_b32 s16, s16, s23
; GFX6-NEXT: s_min_u32 s16, s16, s23
; GFX6-NEXT: s_add_i32 s7, s7, s16
; GFX6-NEXT: s_not_b32 s16, s8
; GFX6-NEXT: s_cmp_lt_u32 s16, s24
; GFX6-NEXT: s_cselect_b32 s16, s16, s24
; GFX6-NEXT: s_min_u32 s16, s16, s24
; GFX6-NEXT: s_add_i32 s8, s8, s16
; GFX6-NEXT: s_not_b32 s16, s9
; GFX6-NEXT: s_cmp_lt_u32 s16, s25
; GFX6-NEXT: s_cselect_b32 s16, s16, s25
; GFX6-NEXT: s_min_u32 s16, s16, s25
; GFX6-NEXT: s_add_i32 s9, s9, s16
; GFX6-NEXT: s_not_b32 s16, s10
; GFX6-NEXT: s_cmp_lt_u32 s16, s26
; GFX6-NEXT: s_cselect_b32 s16, s16, s26
; GFX6-NEXT: s_min_u32 s16, s16, s26
; GFX6-NEXT: s_add_i32 s10, s10, s16
; GFX6-NEXT: s_not_b32 s16, s11
; GFX6-NEXT: s_cmp_lt_u32 s16, s27
; GFX6-NEXT: s_cselect_b32 s16, s16, s27
; GFX6-NEXT: s_min_u32 s16, s16, s27
; GFX6-NEXT: s_add_i32 s11, s11, s16
; GFX6-NEXT: s_not_b32 s16, s12
; GFX6-NEXT: s_cmp_lt_u32 s16, s28
; GFX6-NEXT: s_cselect_b32 s16, s16, s28
; GFX6-NEXT: s_min_u32 s16, s16, s28
; GFX6-NEXT: s_add_i32 s12, s12, s16
; GFX6-NEXT: s_not_b32 s16, s13
; GFX6-NEXT: s_cmp_lt_u32 s16, s29
; GFX6-NEXT: s_cselect_b32 s16, s16, s29
; GFX6-NEXT: s_min_u32 s16, s16, s29
; GFX6-NEXT: s_add_i32 s13, s13, s16
; GFX6-NEXT: s_not_b32 s16, s14
; GFX6-NEXT: s_cmp_lt_u32 s16, s30
; GFX6-NEXT: s_cselect_b32 s16, s16, s30
; GFX6-NEXT: s_min_u32 s16, s16, s30
; GFX6-NEXT: s_add_i32 s14, s14, s16
; GFX6-NEXT: s_not_b32 s16, s15
; GFX6-NEXT: s_cmp_lt_u32 s16, s31
; GFX6-NEXT: s_cselect_b32 s16, s16, s31
; GFX6-NEXT: s_min_u32 s16, s16, s31
; GFX6-NEXT: s_add_i32 s15, s15, s16
; GFX6-NEXT: ; return to shader part epilog
;
@ -1696,8 +1656,7 @@ define amdgpu_ps i16 @s_uaddsat_i16(i16 inreg %lhs, i16 inreg %rhs) {
; GFX6-NEXT: s_lshl_b32 s0, s0, 16
; GFX6-NEXT: s_lshl_b32 s1, s1, 16
; GFX6-NEXT: s_not_b32 s2, s0
; GFX6-NEXT: s_cmp_lt_u32 s2, s1
; GFX6-NEXT: s_cselect_b32 s1, s2, s1
; GFX6-NEXT: s_min_u32 s1, s2, s1
; GFX6-NEXT: s_add_i32 s0, s0, s1
; GFX6-NEXT: s_lshr_b32 s0, s0, 16
; GFX6-NEXT: ; return to shader part epilog
@ -1835,17 +1794,15 @@ define amdgpu_ps i32 @s_uaddsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs
; GFX6-NEXT: s_lshl_b32 s0, s0, 16
; GFX6-NEXT: s_lshl_b32 s2, s2, 16
; GFX6-NEXT: s_not_b32 s4, s0
; GFX6-NEXT: s_cmp_lt_u32 s4, s2
; GFX6-NEXT: s_cselect_b32 s2, s4, s2
; GFX6-NEXT: s_add_i32 s0, s0, s2
; GFX6-NEXT: s_min_u32 s2, s4, s2
; GFX6-NEXT: s_lshl_b32 s1, s1, 16
; GFX6-NEXT: s_add_i32 s0, s0, s2
; GFX6-NEXT: s_lshl_b32 s2, s3, 16
; GFX6-NEXT: s_lshr_b32 s0, s0, 16
; GFX6-NEXT: s_not_b32 s3, s1
; GFX6-NEXT: s_cmp_lt_u32 s3, s2
; GFX6-NEXT: s_cselect_b32 s2, s3, s2
; GFX6-NEXT: s_min_u32 s2, s3, s2
; GFX6-NEXT: s_add_i32 s1, s1, s2
; GFX6-NEXT: s_lshr_b32 s1, s1, 16
; GFX6-NEXT: s_lshr_b32 s0, s0, 16
; GFX6-NEXT: s_lshl_b32 s1, s1, 16
; GFX6-NEXT: s_or_b32 s0, s0, s1
; GFX6-NEXT: ; return to shader part epilog
@ -2053,33 +2010,29 @@ define amdgpu_ps <2 x i32> @s_uaddsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre
; GFX6-NEXT: s_lshl_b32 s0, s0, 16
; GFX6-NEXT: s_lshl_b32 s4, s4, 16
; GFX6-NEXT: s_not_b32 s8, s0
; GFX6-NEXT: s_cmp_lt_u32 s8, s4
; GFX6-NEXT: s_cselect_b32 s4, s8, s4
; GFX6-NEXT: s_add_i32 s0, s0, s4
; GFX6-NEXT: s_min_u32 s4, s8, s4
; GFX6-NEXT: s_lshl_b32 s1, s1, 16
; GFX6-NEXT: s_add_i32 s0, s0, s4
; GFX6-NEXT: s_lshl_b32 s4, s5, 16
; GFX6-NEXT: s_lshr_b32 s0, s0, 16
; GFX6-NEXT: s_not_b32 s5, s1
; GFX6-NEXT: s_cmp_lt_u32 s5, s4
; GFX6-NEXT: s_cselect_b32 s4, s5, s4
; GFX6-NEXT: s_add_i32 s1, s1, s4
; GFX6-NEXT: s_min_u32 s4, s5, s4
; GFX6-NEXT: s_lshl_b32 s2, s2, 16
; GFX6-NEXT: s_lshr_b32 s1, s1, 16
; GFX6-NEXT: s_add_i32 s1, s1, s4
; GFX6-NEXT: s_lshl_b32 s4, s6, 16
; GFX6-NEXT: s_not_b32 s5, s2
; GFX6-NEXT: s_cmp_lt_u32 s5, s4
; GFX6-NEXT: s_cselect_b32 s4, s5, s4
; GFX6-NEXT: s_add_i32 s2, s2, s4
; GFX6-NEXT: s_min_u32 s4, s5, s4
; GFX6-NEXT: s_lshl_b32 s3, s3, 16
; GFX6-NEXT: s_lshr_b32 s2, s2, 16
; GFX6-NEXT: s_add_i32 s2, s2, s4
; GFX6-NEXT: s_lshl_b32 s4, s7, 16
; GFX6-NEXT: s_not_b32 s5, s3
; GFX6-NEXT: s_cmp_lt_u32 s5, s4
; GFX6-NEXT: s_cselect_b32 s4, s5, s4
; GFX6-NEXT: s_min_u32 s4, s5, s4
; GFX6-NEXT: s_lshr_b32 s1, s1, 16
; GFX6-NEXT: s_add_i32 s3, s3, s4
; GFX6-NEXT: s_lshr_b32 s0, s0, 16
; GFX6-NEXT: s_lshl_b32 s1, s1, 16
; GFX6-NEXT: s_lshr_b32 s3, s3, 16
; GFX6-NEXT: s_or_b32 s0, s0, s1
; GFX6-NEXT: s_lshr_b32 s2, s2, 16
; GFX6-NEXT: s_lshl_b32 s1, s3, 16
; GFX6-NEXT: s_or_b32 s1, s2, s1
; GFX6-NEXT: ; return to shader part epilog
@ -2234,49 +2187,43 @@ define amdgpu_ps <3 x i32> @s_uaddsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre
; GFX6-NEXT: s_lshl_b32 s0, s0, 16
; GFX6-NEXT: s_lshl_b32 s6, s6, 16
; GFX6-NEXT: s_not_b32 s12, s0
; GFX6-NEXT: s_cmp_lt_u32 s12, s6
; GFX6-NEXT: s_cselect_b32 s6, s12, s6
; GFX6-NEXT: s_add_i32 s0, s0, s6
; GFX6-NEXT: s_min_u32 s6, s12, s6
; GFX6-NEXT: s_lshl_b32 s1, s1, 16
; GFX6-NEXT: s_add_i32 s0, s0, s6
; GFX6-NEXT: s_lshl_b32 s6, s7, 16
; GFX6-NEXT: s_lshr_b32 s0, s0, 16
; GFX6-NEXT: s_not_b32 s7, s1
; GFX6-NEXT: s_cmp_lt_u32 s7, s6
; GFX6-NEXT: s_cselect_b32 s6, s7, s6
; GFX6-NEXT: s_add_i32 s1, s1, s6
; GFX6-NEXT: s_min_u32 s6, s7, s6
; GFX6-NEXT: s_lshl_b32 s2, s2, 16
; GFX6-NEXT: s_lshr_b32 s1, s1, 16
; GFX6-NEXT: s_add_i32 s1, s1, s6
; GFX6-NEXT: s_lshl_b32 s6, s8, 16
; GFX6-NEXT: s_not_b32 s7, s2
; GFX6-NEXT: s_cmp_lt_u32 s7, s6
; GFX6-NEXT: s_cselect_b32 s6, s7, s6
; GFX6-NEXT: s_add_i32 s2, s2, s6
; GFX6-NEXT: s_min_u32 s6, s7, s6
; GFX6-NEXT: s_lshl_b32 s3, s3, 16
; GFX6-NEXT: s_lshr_b32 s2, s2, 16
; GFX6-NEXT: s_add_i32 s2, s2, s6
; GFX6-NEXT: s_lshl_b32 s6, s9, 16
; GFX6-NEXT: s_not_b32 s7, s3
; GFX6-NEXT: s_cmp_lt_u32 s7, s6
; GFX6-NEXT: s_cselect_b32 s6, s7, s6
; GFX6-NEXT: s_add_i32 s3, s3, s6
; GFX6-NEXT: s_min_u32 s6, s7, s6
; GFX6-NEXT: s_lshl_b32 s4, s4, 16
; GFX6-NEXT: s_lshr_b32 s3, s3, 16
; GFX6-NEXT: s_add_i32 s3, s3, s6
; GFX6-NEXT: s_lshl_b32 s6, s10, 16
; GFX6-NEXT: s_not_b32 s7, s4
; GFX6-NEXT: s_cmp_lt_u32 s7, s6
; GFX6-NEXT: s_cselect_b32 s6, s7, s6
; GFX6-NEXT: s_add_i32 s4, s4, s6
; GFX6-NEXT: s_min_u32 s6, s7, s6
; GFX6-NEXT: s_lshl_b32 s5, s5, 16
; GFX6-NEXT: s_lshr_b32 s4, s4, 16
; GFX6-NEXT: s_add_i32 s4, s4, s6
; GFX6-NEXT: s_lshl_b32 s6, s11, 16
; GFX6-NEXT: s_not_b32 s7, s5
; GFX6-NEXT: s_cmp_lt_u32 s7, s6
; GFX6-NEXT: s_cselect_b32 s6, s7, s6
; GFX6-NEXT: s_lshr_b32 s1, s1, 16
; GFX6-NEXT: s_min_u32 s6, s7, s6
; GFX6-NEXT: s_add_i32 s5, s5, s6
; GFX6-NEXT: s_lshr_b32 s0, s0, 16
; GFX6-NEXT: s_lshl_b32 s1, s1, 16
; GFX6-NEXT: s_lshr_b32 s3, s3, 16
; GFX6-NEXT: s_or_b32 s0, s0, s1
; GFX6-NEXT: s_lshr_b32 s2, s2, 16
; GFX6-NEXT: s_lshl_b32 s1, s3, 16
; GFX6-NEXT: s_lshr_b32 s5, s5, 16
; GFX6-NEXT: s_or_b32 s1, s2, s1
; GFX6-NEXT: s_lshr_b32 s4, s4, 16
; GFX6-NEXT: s_lshl_b32 s2, s5, 16
; GFX6-NEXT: s_or_b32 s2, s4, s2
; GFX6-NEXT: ; return to shader part epilog
@ -2454,65 +2401,57 @@ define amdgpu_ps <4 x i32> @s_uaddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
; GFX6-NEXT: s_lshl_b32 s0, s0, 16
; GFX6-NEXT: s_lshl_b32 s8, s8, 16
; GFX6-NEXT: s_not_b32 s16, s0
; GFX6-NEXT: s_cmp_lt_u32 s16, s8
; GFX6-NEXT: s_cselect_b32 s8, s16, s8
; GFX6-NEXT: s_add_i32 s0, s0, s8
; GFX6-NEXT: s_min_u32 s8, s16, s8
; GFX6-NEXT: s_lshl_b32 s1, s1, 16
; GFX6-NEXT: s_add_i32 s0, s0, s8
; GFX6-NEXT: s_lshl_b32 s8, s9, 16
; GFX6-NEXT: s_lshr_b32 s0, s0, 16
; GFX6-NEXT: s_not_b32 s9, s1
; GFX6-NEXT: s_cmp_lt_u32 s9, s8
; GFX6-NEXT: s_cselect_b32 s8, s9, s8
; GFX6-NEXT: s_add_i32 s1, s1, s8
; GFX6-NEXT: s_min_u32 s8, s9, s8
; GFX6-NEXT: s_lshl_b32 s2, s2, 16
; GFX6-NEXT: s_lshr_b32 s1, s1, 16
; GFX6-NEXT: s_add_i32 s1, s1, s8
; GFX6-NEXT: s_lshl_b32 s8, s10, 16
; GFX6-NEXT: s_not_b32 s9, s2
; GFX6-NEXT: s_cmp_lt_u32 s9, s8
; GFX6-NEXT: s_cselect_b32 s8, s9, s8
; GFX6-NEXT: s_add_i32 s2, s2, s8
; GFX6-NEXT: s_min_u32 s8, s9, s8
; GFX6-NEXT: s_lshl_b32 s3, s3, 16
; GFX6-NEXT: s_lshr_b32 s2, s2, 16
; GFX6-NEXT: s_add_i32 s2, s2, s8
; GFX6-NEXT: s_lshl_b32 s8, s11, 16
; GFX6-NEXT: s_not_b32 s9, s3
; GFX6-NEXT: s_cmp_lt_u32 s9, s8
; GFX6-NEXT: s_cselect_b32 s8, s9, s8
; GFX6-NEXT: s_add_i32 s3, s3, s8
; GFX6-NEXT: s_min_u32 s8, s9, s8
; GFX6-NEXT: s_lshl_b32 s4, s4, 16
; GFX6-NEXT: s_lshr_b32 s3, s3, 16
; GFX6-NEXT: s_add_i32 s3, s3, s8
; GFX6-NEXT: s_lshl_b32 s8, s12, 16
; GFX6-NEXT: s_not_b32 s9, s4
; GFX6-NEXT: s_cmp_lt_u32 s9, s8
; GFX6-NEXT: s_cselect_b32 s8, s9, s8
; GFX6-NEXT: s_add_i32 s4, s4, s8
; GFX6-NEXT: s_min_u32 s8, s9, s8
; GFX6-NEXT: s_lshl_b32 s5, s5, 16
; GFX6-NEXT: s_lshr_b32 s4, s4, 16
; GFX6-NEXT: s_add_i32 s4, s4, s8
; GFX6-NEXT: s_lshl_b32 s8, s13, 16
; GFX6-NEXT: s_not_b32 s9, s5
; GFX6-NEXT: s_cmp_lt_u32 s9, s8
; GFX6-NEXT: s_cselect_b32 s8, s9, s8
; GFX6-NEXT: s_add_i32 s5, s5, s8
; GFX6-NEXT: s_min_u32 s8, s9, s8
; GFX6-NEXT: s_lshl_b32 s6, s6, 16
; GFX6-NEXT: s_lshr_b32 s5, s5, 16
; GFX6-NEXT: s_add_i32 s5, s5, s8
; GFX6-NEXT: s_lshl_b32 s8, s14, 16
; GFX6-NEXT: s_not_b32 s9, s6
; GFX6-NEXT: s_cmp_lt_u32 s9, s8
; GFX6-NEXT: s_cselect_b32 s8, s9, s8
; GFX6-NEXT: s_add_i32 s6, s6, s8
; GFX6-NEXT: s_min_u32 s8, s9, s8
; GFX6-NEXT: s_lshl_b32 s7, s7, 16
; GFX6-NEXT: s_lshr_b32 s6, s6, 16
; GFX6-NEXT: s_add_i32 s6, s6, s8
; GFX6-NEXT: s_lshl_b32 s8, s15, 16
; GFX6-NEXT: s_not_b32 s9, s7
; GFX6-NEXT: s_cmp_lt_u32 s9, s8
; GFX6-NEXT: s_cselect_b32 s8, s9, s8
; GFX6-NEXT: s_lshr_b32 s1, s1, 16
; GFX6-NEXT: s_min_u32 s8, s9, s8
; GFX6-NEXT: s_add_i32 s7, s7, s8
; GFX6-NEXT: s_lshr_b32 s0, s0, 16
; GFX6-NEXT: s_lshl_b32 s1, s1, 16
; GFX6-NEXT: s_lshr_b32 s3, s3, 16
; GFX6-NEXT: s_or_b32 s0, s0, s1
; GFX6-NEXT: s_lshr_b32 s2, s2, 16
; GFX6-NEXT: s_lshl_b32 s1, s3, 16
; GFX6-NEXT: s_lshr_b32 s7, s7, 16
; GFX6-NEXT: s_lshl_b32 s3, s7, 16
; GFX6-NEXT: s_lshr_b32 s5, s5, 16
; GFX6-NEXT: s_or_b32 s1, s2, s1
; GFX6-NEXT: s_lshr_b32 s4, s4, 16
; GFX6-NEXT: s_lshl_b32 s2, s5, 16
; GFX6-NEXT: s_lshr_b32 s6, s6, 16
; GFX6-NEXT: s_lshl_b32 s3, s7, 16
; GFX6-NEXT: s_or_b32 s2, s4, s2
; GFX6-NEXT: s_or_b32 s3, s6, s3
; GFX6-NEXT: ; return to shader part epilog


@ -51,8 +51,7 @@ define amdgpu_ps i7 @s_usubsat_i7(i7 inreg %lhs, i7 inreg %rhs) {
; GFX6: ; %bb.0:
; GFX6-NEXT: s_lshl_b32 s0, s0, 25
; GFX6-NEXT: s_lshl_b32 s1, s1, 25
; GFX6-NEXT: s_cmp_lt_u32 s0, s1
; GFX6-NEXT: s_cselect_b32 s1, s0, s1
; GFX6-NEXT: s_min_u32 s1, s0, s1
; GFX6-NEXT: s_sub_i32 s0, s0, s1
; GFX6-NEXT: s_lshr_b32 s0, s0, 25
; GFX6-NEXT: ; return to shader part epilog
@ -139,8 +138,7 @@ define amdgpu_ps i8 @s_usubsat_i8(i8 inreg %lhs, i8 inreg %rhs) {
; GFX6: ; %bb.0:
; GFX6-NEXT: s_lshl_b32 s0, s0, 24
; GFX6-NEXT: s_lshl_b32 s1, s1, 24
; GFX6-NEXT: s_cmp_lt_u32 s0, s1
; GFX6-NEXT: s_cselect_b32 s1, s0, s1
; GFX6-NEXT: s_min_u32 s1, s0, s1
; GFX6-NEXT: s_sub_i32 s0, s0, s1
; GFX6-NEXT: s_lshr_b32 s0, s0, 24
; GFX6-NEXT: ; return to shader part epilog
@ -265,16 +263,14 @@ define amdgpu_ps i16 @s_usubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
; GFX6-NEXT: s_lshr_b32 s3, s1, 8
; GFX6-NEXT: s_lshl_b32 s0, s0, 24
; GFX6-NEXT: s_lshl_b32 s1, s1, 24
; GFX6-NEXT: s_cmp_lt_u32 s0, s1
; GFX6-NEXT: s_cselect_b32 s1, s0, s1
; GFX6-NEXT: s_min_u32 s1, s0, s1
; GFX6-NEXT: s_sub_i32 s0, s0, s1
; GFX6-NEXT: s_lshl_b32 s1, s2, 24
; GFX6-NEXT: s_lshr_b32 s0, s0, 24
; GFX6-NEXT: s_lshl_b32 s2, s3, 24
; GFX6-NEXT: s_cmp_lt_u32 s1, s2
; GFX6-NEXT: s_cselect_b32 s2, s1, s2
; GFX6-NEXT: s_min_u32 s2, s1, s2
; GFX6-NEXT: s_sub_i32 s1, s1, s2
; GFX6-NEXT: s_lshr_b32 s1, s1, 24
; GFX6-NEXT: s_lshr_b32 s0, s0, 24
; GFX6-NEXT: s_lshl_b32 s1, s1, 8
; GFX6-NEXT: s_or_b32 s0, s0, s1
; GFX6-NEXT: ; return to shader part epilog
@ -508,28 +504,24 @@ define amdgpu_ps i32 @s_usubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
; GFX6-NEXT: s_lshr_b32 s7, s1, 24
; GFX6-NEXT: s_lshl_b32 s0, s0, 24
; GFX6-NEXT: s_lshl_b32 s1, s1, 24
; GFX6-NEXT: s_cmp_lt_u32 s0, s1
; GFX6-NEXT: s_cselect_b32 s1, s0, s1
; GFX6-NEXT: s_min_u32 s1, s0, s1
; GFX6-NEXT: s_sub_i32 s0, s0, s1
; GFX6-NEXT: s_lshl_b32 s1, s2, 24
; GFX6-NEXT: s_lshr_b32 s0, s0, 24
; GFX6-NEXT: s_lshl_b32 s2, s5, 24
; GFX6-NEXT: s_cmp_lt_u32 s1, s2
; GFX6-NEXT: s_cselect_b32 s2, s1, s2
; GFX6-NEXT: s_min_u32 s2, s1, s2
; GFX6-NEXT: s_sub_i32 s1, s1, s2
; GFX6-NEXT: s_lshl_b32 s2, s3, 24
; GFX6-NEXT: s_lshr_b32 s1, s1, 24
; GFX6-NEXT: s_lshl_b32 s3, s6, 24
; GFX6-NEXT: s_cmp_lt_u32 s2, s3
; GFX6-NEXT: s_cselect_b32 s3, s2, s3
; GFX6-NEXT: s_min_u32 s3, s2, s3
; GFX6-NEXT: s_sub_i32 s2, s2, s3
; GFX6-NEXT: s_lshl_b32 s3, s4, 24
; GFX6-NEXT: s_lshr_b32 s2, s2, 24
; GFX6-NEXT: s_lshl_b32 s4, s7, 24
; GFX6-NEXT: s_cmp_lt_u32 s3, s4
; GFX6-NEXT: s_cselect_b32 s4, s3, s4
; GFX6-NEXT: s_lshr_b32 s1, s1, 24
; GFX6-NEXT: s_min_u32 s4, s3, s4
; GFX6-NEXT: s_sub_i32 s3, s3, s4
; GFX6-NEXT: s_lshr_b32 s0, s0, 24
; GFX6-NEXT: s_lshl_b32 s1, s1, 8
; GFX6-NEXT: s_lshr_b32 s2, s2, 24
; GFX6-NEXT: s_or_b32 s0, s0, s1
; GFX6-NEXT: s_lshl_b32 s1, s2, 16
; GFX6-NEXT: s_lshr_b32 s3, s3, 24
@ -718,8 +710,7 @@ define amdgpu_ps i24 @s_usubsat_i24(i24 inreg %lhs, i24 inreg %rhs) {
; GFX6: ; %bb.0:
; GFX6-NEXT: s_lshl_b32 s0, s0, 8
; GFX6-NEXT: s_lshl_b32 s1, s1, 8
; GFX6-NEXT: s_cmp_lt_u32 s0, s1
; GFX6-NEXT: s_cselect_b32 s1, s0, s1
; GFX6-NEXT: s_min_u32 s1, s0, s1
; GFX6-NEXT: s_sub_i32 s0, s0, s1
; GFX6-NEXT: s_lshr_b32 s0, s0, 8
; GFX6-NEXT: ; return to shader part epilog
@ -789,8 +780,7 @@ define i32 @v_usubsat_i32(i32 %lhs, i32 %rhs) {
define amdgpu_ps i32 @s_usubsat_i32(i32 inreg %lhs, i32 inreg %rhs) {
; GFX6-LABEL: s_usubsat_i32:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_cmp_lt_u32 s0, s1
; GFX6-NEXT: s_cselect_b32 s1, s0, s1
; GFX6-NEXT: s_min_u32 s1, s0, s1
; GFX6-NEXT: s_sub_i32 s0, s0, s1
; GFX6-NEXT: ; return to shader part epilog
;
@ -907,11 +897,9 @@ define <2 x i32> @v_usubsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
define amdgpu_ps <2 x i32> @s_usubsat_v2i32(<2 x i32> inreg %lhs, <2 x i32> inreg %rhs) {
; GFX6-LABEL: s_usubsat_v2i32:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_cmp_lt_u32 s0, s2
; GFX6-NEXT: s_cselect_b32 s2, s0, s2
; GFX6-NEXT: s_min_u32 s2, s0, s2
; GFX6-NEXT: s_sub_i32 s0, s0, s2
; GFX6-NEXT: s_cmp_lt_u32 s1, s3
; GFX6-NEXT: s_cselect_b32 s2, s1, s3
; GFX6-NEXT: s_min_u32 s2, s1, s3
; GFX6-NEXT: s_sub_i32 s1, s1, s2
; GFX6-NEXT: ; return to shader part epilog
;
@ -989,14 +977,11 @@ define <3 x i32> @v_usubsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) {
define amdgpu_ps <3 x i32> @s_usubsat_v3i32(<3 x i32> inreg %lhs, <3 x i32> inreg %rhs) {
; GFX6-LABEL: s_usubsat_v3i32:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_cmp_lt_u32 s0, s3
; GFX6-NEXT: s_cselect_b32 s3, s0, s3
; GFX6-NEXT: s_min_u32 s3, s0, s3
; GFX6-NEXT: s_sub_i32 s0, s0, s3
; GFX6-NEXT: s_cmp_lt_u32 s1, s4
; GFX6-NEXT: s_cselect_b32 s3, s1, s4
; GFX6-NEXT: s_min_u32 s3, s1, s4
; GFX6-NEXT: s_sub_i32 s1, s1, s3
; GFX6-NEXT: s_cmp_lt_u32 s2, s5
; GFX6-NEXT: s_cselect_b32 s3, s2, s5
; GFX6-NEXT: s_min_u32 s3, s2, s5
; GFX6-NEXT: s_sub_i32 s2, s2, s3
; GFX6-NEXT: ; return to shader part epilog
;
@ -1087,17 +1072,13 @@ define <4 x i32> @v_usubsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
define amdgpu_ps <4 x i32> @s_usubsat_v4i32(<4 x i32> inreg %lhs, <4 x i32> inreg %rhs) {
; GFX6-LABEL: s_usubsat_v4i32:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_cmp_lt_u32 s0, s4
; GFX6-NEXT: s_cselect_b32 s4, s0, s4
; GFX6-NEXT: s_min_u32 s4, s0, s4
; GFX6-NEXT: s_sub_i32 s0, s0, s4
; GFX6-NEXT: s_cmp_lt_u32 s1, s5
; GFX6-NEXT: s_cselect_b32 s4, s1, s5
; GFX6-NEXT: s_min_u32 s4, s1, s5
; GFX6-NEXT: s_sub_i32 s1, s1, s4
; GFX6-NEXT: s_cmp_lt_u32 s2, s6
; GFX6-NEXT: s_cselect_b32 s4, s2, s6
; GFX6-NEXT: s_min_u32 s4, s2, s6
; GFX6-NEXT: s_sub_i32 s2, s2, s4
; GFX6-NEXT: s_cmp_lt_u32 s3, s7
; GFX6-NEXT: s_cselect_b32 s4, s3, s7
; GFX6-NEXT: s_min_u32 s4, s3, s7
; GFX6-NEXT: s_sub_i32 s3, s3, s4
; GFX6-NEXT: ; return to shader part epilog
;
@ -1201,20 +1182,15 @@ define <5 x i32> @v_usubsat_v5i32(<5 x i32> %lhs, <5 x i32> %rhs) {
define amdgpu_ps <5 x i32> @s_usubsat_v5i32(<5 x i32> inreg %lhs, <5 x i32> inreg %rhs) {
; GFX6-LABEL: s_usubsat_v5i32:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_cmp_lt_u32 s0, s5
; GFX6-NEXT: s_cselect_b32 s5, s0, s5
; GFX6-NEXT: s_min_u32 s5, s0, s5
; GFX6-NEXT: s_sub_i32 s0, s0, s5
; GFX6-NEXT: s_cmp_lt_u32 s1, s6
; GFX6-NEXT: s_cselect_b32 s5, s1, s6
; GFX6-NEXT: s_min_u32 s5, s1, s6
; GFX6-NEXT: s_sub_i32 s1, s1, s5
; GFX6-NEXT: s_cmp_lt_u32 s2, s7
; GFX6-NEXT: s_cselect_b32 s5, s2, s7
; GFX6-NEXT: s_min_u32 s5, s2, s7
; GFX6-NEXT: s_sub_i32 s2, s2, s5
; GFX6-NEXT: s_cmp_lt_u32 s3, s8
; GFX6-NEXT: s_cselect_b32 s5, s3, s8
; GFX6-NEXT: s_min_u32 s5, s3, s8
; GFX6-NEXT: s_sub_i32 s3, s3, s5
; GFX6-NEXT: s_cmp_lt_u32 s4, s9
; GFX6-NEXT: s_cselect_b32 s5, s4, s9
; GFX6-NEXT: s_min_u32 s5, s4, s9
; GFX6-NEXT: s_sub_i32 s4, s4, s5
; GFX6-NEXT: ; return to shader part epilog
;
@ -1381,53 +1357,37 @@ define <16 x i32> @v_usubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
define amdgpu_ps <16 x i32> @s_usubsat_v16i32(<16 x i32> inreg %lhs, <16 x i32> inreg %rhs) {
; GFX6-LABEL: s_usubsat_v16i32:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_cmp_lt_u32 s0, s16
; GFX6-NEXT: s_cselect_b32 s16, s0, s16
; GFX6-NEXT: s_min_u32 s16, s0, s16
; GFX6-NEXT: s_sub_i32 s0, s0, s16
; GFX6-NEXT: s_cmp_lt_u32 s1, s17
; GFX6-NEXT: s_cselect_b32 s16, s1, s17
; GFX6-NEXT: s_min_u32 s16, s1, s17
; GFX6-NEXT: s_sub_i32 s1, s1, s16
; GFX6-NEXT: s_cmp_lt_u32 s2, s18
; GFX6-NEXT: s_cselect_b32 s16, s2, s18
; GFX6-NEXT: s_min_u32 s16, s2, s18
; GFX6-NEXT: s_sub_i32 s2, s2, s16
; GFX6-NEXT: s_cmp_lt_u32 s3, s19
; GFX6-NEXT: s_cselect_b32 s16, s3, s19
; GFX6-NEXT: s_min_u32 s16, s3, s19
; GFX6-NEXT: s_sub_i32 s3, s3, s16
; GFX6-NEXT: s_cmp_lt_u32 s4, s20
; GFX6-NEXT: s_cselect_b32 s16, s4, s20
; GFX6-NEXT: s_min_u32 s16, s4, s20
; GFX6-NEXT: s_sub_i32 s4, s4, s16
; GFX6-NEXT: s_cmp_lt_u32 s5, s21
; GFX6-NEXT: s_cselect_b32 s16, s5, s21
; GFX6-NEXT: s_min_u32 s16, s5, s21
; GFX6-NEXT: s_sub_i32 s5, s5, s16
; GFX6-NEXT: s_cmp_lt_u32 s6, s22
; GFX6-NEXT: s_cselect_b32 s16, s6, s22
; GFX6-NEXT: s_min_u32 s16, s6, s22
; GFX6-NEXT: s_sub_i32 s6, s6, s16
; GFX6-NEXT: s_cmp_lt_u32 s7, s23
; GFX6-NEXT: s_cselect_b32 s16, s7, s23
; GFX6-NEXT: s_min_u32 s16, s7, s23
; GFX6-NEXT: s_sub_i32 s7, s7, s16
; GFX6-NEXT: s_cmp_lt_u32 s8, s24
; GFX6-NEXT: s_cselect_b32 s16, s8, s24
; GFX6-NEXT: s_min_u32 s16, s8, s24
; GFX6-NEXT: s_sub_i32 s8, s8, s16
; GFX6-NEXT: s_cmp_lt_u32 s9, s25
; GFX6-NEXT: s_cselect_b32 s16, s9, s25
; GFX6-NEXT: s_min_u32 s16, s9, s25
; GFX6-NEXT: s_sub_i32 s9, s9, s16
; GFX6-NEXT: s_cmp_lt_u32 s10, s26
; GFX6-NEXT: s_cselect_b32 s16, s10, s26
; GFX6-NEXT: s_min_u32 s16, s10, s26
; GFX6-NEXT: s_sub_i32 s10, s10, s16
; GFX6-NEXT: s_cmp_lt_u32 s11, s27
; GFX6-NEXT: s_cselect_b32 s16, s11, s27
; GFX6-NEXT: s_min_u32 s16, s11, s27
; GFX6-NEXT: s_sub_i32 s11, s11, s16
; GFX6-NEXT: s_cmp_lt_u32 s12, s28
; GFX6-NEXT: s_cselect_b32 s16, s12, s28
; GFX6-NEXT: s_min_u32 s16, s12, s28
; GFX6-NEXT: s_sub_i32 s12, s12, s16
; GFX6-NEXT: s_cmp_lt_u32 s13, s29
; GFX6-NEXT: s_cselect_b32 s16, s13, s29
; GFX6-NEXT: s_min_u32 s16, s13, s29
; GFX6-NEXT: s_sub_i32 s13, s13, s16
; GFX6-NEXT: s_cmp_lt_u32 s14, s30
; GFX6-NEXT: s_cselect_b32 s16, s14, s30
; GFX6-NEXT: s_min_u32 s16, s14, s30
; GFX6-NEXT: s_sub_i32 s14, s14, s16
; GFX6-NEXT: s_cmp_lt_u32 s15, s31
; GFX6-NEXT: s_cselect_b32 s16, s15, s31
; GFX6-NEXT: s_min_u32 s16, s15, s31
; GFX6-NEXT: s_sub_i32 s15, s15, s16
; GFX6-NEXT: ; return to shader part epilog
;
@ -1612,8 +1572,7 @@ define amdgpu_ps i16 @s_usubsat_i16(i16 inreg %lhs, i16 inreg %rhs) {
; GFX6: ; %bb.0:
; GFX6-NEXT: s_lshl_b32 s0, s0, 16
; GFX6-NEXT: s_lshl_b32 s1, s1, 16
; GFX6-NEXT: s_cmp_lt_u32 s0, s1
; GFX6-NEXT: s_cselect_b32 s1, s0, s1
; GFX6-NEXT: s_min_u32 s1, s0, s1
; GFX6-NEXT: s_sub_i32 s0, s0, s1
; GFX6-NEXT: s_lshr_b32 s0, s0, 16
; GFX6-NEXT: ; return to shader part epilog
@ -1746,16 +1705,14 @@ define amdgpu_ps i32 @s_usubsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs
; GFX6: ; %bb.0:
; GFX6-NEXT: s_lshl_b32 s0, s0, 16
; GFX6-NEXT: s_lshl_b32 s2, s2, 16
; GFX6-NEXT: s_cmp_lt_u32 s0, s2
; GFX6-NEXT: s_cselect_b32 s2, s0, s2
; GFX6-NEXT: s_min_u32 s2, s0, s2
; GFX6-NEXT: s_sub_i32 s0, s0, s2
; GFX6-NEXT: s_lshr_b32 s0, s0, 16
; GFX6-NEXT: s_lshl_b32 s1, s1, 16
; GFX6-NEXT: s_lshl_b32 s2, s3, 16
; GFX6-NEXT: s_cmp_lt_u32 s1, s2
; GFX6-NEXT: s_cselect_b32 s2, s1, s2
; GFX6-NEXT: s_min_u32 s2, s1, s2
; GFX6-NEXT: s_sub_i32 s1, s1, s2
; GFX6-NEXT: s_lshr_b32 s1, s1, 16
; GFX6-NEXT: s_lshr_b32 s0, s0, 16
; GFX6-NEXT: s_lshl_b32 s1, s1, 16
; GFX6-NEXT: s_or_b32 s0, s0, s1
; GFX6-NEXT: ; return to shader part epilog
@ -1954,30 +1911,26 @@ define amdgpu_ps <2 x i32> @s_usubsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre
; GFX6: ; %bb.0:
; GFX6-NEXT: s_lshl_b32 s0, s0, 16
; GFX6-NEXT: s_lshl_b32 s4, s4, 16
; GFX6-NEXT: s_cmp_lt_u32 s0, s4
; GFX6-NEXT: s_cselect_b32 s4, s0, s4
; GFX6-NEXT: s_min_u32 s4, s0, s4
; GFX6-NEXT: s_sub_i32 s0, s0, s4
; GFX6-NEXT: s_lshr_b32 s0, s0, 16
; GFX6-NEXT: s_lshl_b32 s1, s1, 16
; GFX6-NEXT: s_lshl_b32 s4, s5, 16
; GFX6-NEXT: s_cmp_lt_u32 s1, s4
; GFX6-NEXT: s_cselect_b32 s4, s1, s4
; GFX6-NEXT: s_min_u32 s4, s1, s4
; GFX6-NEXT: s_sub_i32 s1, s1, s4
; GFX6-NEXT: s_lshr_b32 s1, s1, 16
; GFX6-NEXT: s_lshl_b32 s2, s2, 16
; GFX6-NEXT: s_lshl_b32 s4, s6, 16
; GFX6-NEXT: s_cmp_lt_u32 s2, s4
; GFX6-NEXT: s_cselect_b32 s4, s2, s4
; GFX6-NEXT: s_min_u32 s4, s2, s4
; GFX6-NEXT: s_sub_i32 s2, s2, s4
; GFX6-NEXT: s_lshr_b32 s2, s2, 16
; GFX6-NEXT: s_lshl_b32 s3, s3, 16
; GFX6-NEXT: s_lshl_b32 s4, s7, 16
; GFX6-NEXT: s_cmp_lt_u32 s3, s4
; GFX6-NEXT: s_cselect_b32 s4, s3, s4
; GFX6-NEXT: s_min_u32 s4, s3, s4
; GFX6-NEXT: s_lshr_b32 s1, s1, 16
; GFX6-NEXT: s_sub_i32 s3, s3, s4
; GFX6-NEXT: s_lshr_b32 s0, s0, 16
; GFX6-NEXT: s_lshl_b32 s1, s1, 16
; GFX6-NEXT: s_lshr_b32 s3, s3, 16
; GFX6-NEXT: s_or_b32 s0, s0, s1
; GFX6-NEXT: s_lshr_b32 s2, s2, 16
; GFX6-NEXT: s_lshl_b32 s1, s3, 16
; GFX6-NEXT: s_or_b32 s1, s2, s1
; GFX6-NEXT: ; return to shader part epilog
@ -2125,44 +2078,38 @@ define amdgpu_ps <3 x i32> @s_usubsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre
; GFX6: ; %bb.0:
; GFX6-NEXT: s_lshl_b32 s0, s0, 16
; GFX6-NEXT: s_lshl_b32 s6, s6, 16
; GFX6-NEXT: s_cmp_lt_u32 s0, s6
; GFX6-NEXT: s_cselect_b32 s6, s0, s6
; GFX6-NEXT: s_min_u32 s6, s0, s6
; GFX6-NEXT: s_sub_i32 s0, s0, s6
; GFX6-NEXT: s_lshr_b32 s0, s0, 16
; GFX6-NEXT: s_lshl_b32 s1, s1, 16
; GFX6-NEXT: s_lshl_b32 s6, s7, 16
; GFX6-NEXT: s_cmp_lt_u32 s1, s6
; GFX6-NEXT: s_cselect_b32 s6, s1, s6
; GFX6-NEXT: s_min_u32 s6, s1, s6
; GFX6-NEXT: s_sub_i32 s1, s1, s6
; GFX6-NEXT: s_lshr_b32 s1, s1, 16
; GFX6-NEXT: s_lshl_b32 s2, s2, 16
; GFX6-NEXT: s_lshl_b32 s6, s8, 16
; GFX6-NEXT: s_cmp_lt_u32 s2, s6
; GFX6-NEXT: s_cselect_b32 s6, s2, s6
; GFX6-NEXT: s_min_u32 s6, s2, s6
; GFX6-NEXT: s_sub_i32 s2, s2, s6
; GFX6-NEXT: s_lshr_b32 s2, s2, 16
; GFX6-NEXT: s_lshl_b32 s3, s3, 16
; GFX6-NEXT: s_lshl_b32 s6, s9, 16
; GFX6-NEXT: s_cmp_lt_u32 s3, s6
; GFX6-NEXT: s_cselect_b32 s6, s3, s6
; GFX6-NEXT: s_min_u32 s6, s3, s6
; GFX6-NEXT: s_sub_i32 s3, s3, s6
; GFX6-NEXT: s_lshr_b32 s3, s3, 16
; GFX6-NEXT: s_lshl_b32 s4, s4, 16
; GFX6-NEXT: s_lshl_b32 s6, s10, 16
; GFX6-NEXT: s_cmp_lt_u32 s4, s6
; GFX6-NEXT: s_cselect_b32 s6, s4, s6
; GFX6-NEXT: s_min_u32 s6, s4, s6
; GFX6-NEXT: s_sub_i32 s4, s4, s6
; GFX6-NEXT: s_lshr_b32 s4, s4, 16
; GFX6-NEXT: s_lshl_b32 s5, s5, 16
; GFX6-NEXT: s_lshl_b32 s6, s11, 16
; GFX6-NEXT: s_cmp_lt_u32 s5, s6
; GFX6-NEXT: s_cselect_b32 s6, s5, s6
; GFX6-NEXT: s_lshr_b32 s1, s1, 16
; GFX6-NEXT: s_min_u32 s6, s5, s6
; GFX6-NEXT: s_sub_i32 s5, s5, s6
; GFX6-NEXT: s_lshr_b32 s0, s0, 16
; GFX6-NEXT: s_lshl_b32 s1, s1, 16
; GFX6-NEXT: s_lshr_b32 s3, s3, 16
; GFX6-NEXT: s_or_b32 s0, s0, s1
; GFX6-NEXT: s_lshr_b32 s2, s2, 16
; GFX6-NEXT: s_lshl_b32 s1, s3, 16
; GFX6-NEXT: s_lshr_b32 s5, s5, 16
; GFX6-NEXT: s_or_b32 s1, s2, s1
; GFX6-NEXT: s_lshr_b32 s4, s4, 16
; GFX6-NEXT: s_lshl_b32 s2, s5, 16
; GFX6-NEXT: s_or_b32 s2, s4, s2
; GFX6-NEXT: ; return to shader part epilog
@ -2331,58 +2278,50 @@ define amdgpu_ps <4 x i32> @s_usubsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
; GFX6: ; %bb.0:
; GFX6-NEXT: s_lshl_b32 s0, s0, 16
; GFX6-NEXT: s_lshl_b32 s8, s8, 16
; GFX6-NEXT: s_cmp_lt_u32 s0, s8
; GFX6-NEXT: s_cselect_b32 s8, s0, s8
; GFX6-NEXT: s_min_u32 s8, s0, s8
; GFX6-NEXT: s_sub_i32 s0, s0, s8
; GFX6-NEXT: s_lshr_b32 s0, s0, 16
; GFX6-NEXT: s_lshl_b32 s1, s1, 16
; GFX6-NEXT: s_lshl_b32 s8, s9, 16
; GFX6-NEXT: s_cmp_lt_u32 s1, s8
; GFX6-NEXT: s_cselect_b32 s8, s1, s8
; GFX6-NEXT: s_min_u32 s8, s1, s8
; GFX6-NEXT: s_sub_i32 s1, s1, s8
; GFX6-NEXT: s_lshr_b32 s1, s1, 16
; GFX6-NEXT: s_lshl_b32 s2, s2, 16
; GFX6-NEXT: s_lshl_b32 s8, s10, 16
; GFX6-NEXT: s_cmp_lt_u32 s2, s8
; GFX6-NEXT: s_cselect_b32 s8, s2, s8
; GFX6-NEXT: s_min_u32 s8, s2, s8
; GFX6-NEXT: s_sub_i32 s2, s2, s8
; GFX6-NEXT: s_lshr_b32 s2, s2, 16
; GFX6-NEXT: s_lshl_b32 s3, s3, 16
; GFX6-NEXT: s_lshl_b32 s8, s11, 16
; GFX6-NEXT: s_cmp_lt_u32 s3, s8
; GFX6-NEXT: s_cselect_b32 s8, s3, s8
; GFX6-NEXT: s_min_u32 s8, s3, s8
; GFX6-NEXT: s_sub_i32 s3, s3, s8
; GFX6-NEXT: s_lshr_b32 s3, s3, 16
; GFX6-NEXT: s_lshl_b32 s4, s4, 16
; GFX6-NEXT: s_lshl_b32 s8, s12, 16
; GFX6-NEXT: s_cmp_lt_u32 s4, s8
; GFX6-NEXT: s_cselect_b32 s8, s4, s8
; GFX6-NEXT: s_min_u32 s8, s4, s8
; GFX6-NEXT: s_sub_i32 s4, s4, s8
; GFX6-NEXT: s_lshr_b32 s4, s4, 16
; GFX6-NEXT: s_lshl_b32 s5, s5, 16
; GFX6-NEXT: s_lshl_b32 s8, s13, 16
; GFX6-NEXT: s_cmp_lt_u32 s5, s8
; GFX6-NEXT: s_cselect_b32 s8, s5, s8
; GFX6-NEXT: s_min_u32 s8, s5, s8
; GFX6-NEXT: s_sub_i32 s5, s5, s8
; GFX6-NEXT: s_lshr_b32 s5, s5, 16
; GFX6-NEXT: s_lshl_b32 s6, s6, 16
; GFX6-NEXT: s_lshl_b32 s8, s14, 16
; GFX6-NEXT: s_cmp_lt_u32 s6, s8
; GFX6-NEXT: s_cselect_b32 s8, s6, s8
; GFX6-NEXT: s_min_u32 s8, s6, s8
; GFX6-NEXT: s_sub_i32 s6, s6, s8
; GFX6-NEXT: s_lshr_b32 s6, s6, 16
; GFX6-NEXT: s_lshl_b32 s7, s7, 16
; GFX6-NEXT: s_lshl_b32 s8, s15, 16
; GFX6-NEXT: s_cmp_lt_u32 s7, s8
; GFX6-NEXT: s_cselect_b32 s8, s7, s8
; GFX6-NEXT: s_lshr_b32 s1, s1, 16
; GFX6-NEXT: s_min_u32 s8, s7, s8
; GFX6-NEXT: s_sub_i32 s7, s7, s8
; GFX6-NEXT: s_lshr_b32 s0, s0, 16
; GFX6-NEXT: s_lshl_b32 s1, s1, 16
; GFX6-NEXT: s_lshr_b32 s3, s3, 16
; GFX6-NEXT: s_or_b32 s0, s0, s1
; GFX6-NEXT: s_lshr_b32 s2, s2, 16
; GFX6-NEXT: s_lshl_b32 s1, s3, 16
; GFX6-NEXT: s_lshr_b32 s7, s7, 16
; GFX6-NEXT: s_lshl_b32 s3, s7, 16
; GFX6-NEXT: s_lshr_b32 s5, s5, 16
; GFX6-NEXT: s_or_b32 s1, s2, s1
; GFX6-NEXT: s_lshr_b32 s4, s4, 16
; GFX6-NEXT: s_lshl_b32 s2, s5, 16
; GFX6-NEXT: s_lshr_b32 s6, s6, 16
; GFX6-NEXT: s_lshl_b32 s3, s7, 16
; GFX6-NEXT: s_or_b32 s2, s4, s2
; GFX6-NEXT: s_or_b32 s3, s6, s3
; GFX6-NEXT: ; return to shader part epilog