
GlobalISel: Merge and cleanup more AMDGPU call lowering code

This merges more AMDGPU ABI lowering code into the generic call
lowering. Start cleaning up by factoring away more of the pack/unpack
logic into the buildCopy{To|From}Parts functions. These could use further
improvement; the SelectionDAG versions are significantly more complex, and
we'll eventually have to emulate all of those cases too.

This is mostly NFC, but does result in some minor instruction
reordering. It also removes some of the limitations on mismatched sizes
that the old code had. However, as with the earlier merge on the incoming
argument side, this forces gfx6/gfx7 to use the gfx8+ ABI (which is what we
actually want, but SelectionDAG is stuck using the weird emergent ABI).

This also changes the load/store size for stack-passed EVTs on AArch64,
making it consistent with the DAG behavior.
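
As a rough illustration of the pack/unpack direction those helpers cover, here
is a minimal standalone sketch (plain C++ modelling the idea rather than the
real MachineIRBuilder API; the s64-into-two-s32 split is just an example size):

#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  // Outgoing direction (buildCopyToRegs below): an s64 value is unmerged
  // into two s32 ABI part registers.
  uint64_t Orig = 0x0123456789abcdefULL;
  const unsigned PartBits = 32;
  std::vector<uint32_t> Parts;
  for (unsigned Off = 0; Off < 64; Off += PartBits)
    Parts.push_back(static_cast<uint32_t>(Orig >> Off));

  // Incoming direction (buildCopyFromRegs below): the parts are merged back
  // into the original wide virtual register.
  uint64_t Rebuilt = 0;
  for (unsigned I = 0; I != Parts.size(); ++I)
    Rebuilt |= static_cast<uint64_t>(Parts[I]) << (I * PartBits);

  std::printf("orig=%016llx rebuilt=%016llx\n",
              static_cast<unsigned long long>(Orig),
              static_cast<unsigned long long>(Rebuilt));
  return 0;
}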
Matt Arsenault 2021-02-09 12:09:20 -05:00
parent 77745abf15
commit 7cd8d213c3
30 changed files with 765 additions and 668 deletions

View File

@ -256,16 +256,32 @@ mergeVectorRegsToResultRegs(MachineIRBuilder &B, ArrayRef<Register> DstRegs,
return B.buildConcatVectors(DstRegs[0], SrcRegs);
}
// We need to create an unmerge to the result registers, which may require
// widening the original value.
Register UnmergeSrcReg;
if (LCMTy != PartLLT) {
// e.g. A <3 x s16> value was split to <2 x s16>
// %register_value0:_(<2 x s16>)
// %register_value1:_(<2 x s16>)
// %undef:_(<2 x s16>) = G_IMPLICIT_DEF
// %concat:_(<6 x s16>) = G_CONCAT_VECTORS %reg_value0, %reg_value1, %undef
// %dst_reg:_(<3 x s16>), %dead:_(<3 x s16>) = G_UNMERGE_VALUES %concat
const int NumWide = LCMTy.getSizeInBits() / PartLLT.getSizeInBits();
Register Undef = B.buildUndef(PartLLT).getReg(0);
// Build vector of undefs.
SmallVector<Register, 8> WidenedSrcs(NumWide, Undef);
// Replace the first sources with the real registers.
std::copy(SrcRegs.begin(), SrcRegs.end(), WidenedSrcs.begin());
UnmergeSrcReg = B.buildConcatVectors(LCMTy, WidenedSrcs).getReg(0);
} else {
// We don't need to widen anything if we're extracting a scalar which was
// promoted to a vector e.g. s8 -> v4s8 -> s8
assert(SrcRegs.size() == 1);
UnmergeSrcReg = SrcRegs[0];
}
int NumDst = LCMTy.getSizeInBits() / LLTy.getSizeInBits();
SmallVector<Register, 8> PadDstRegs(NumDst);
@ -275,17 +291,27 @@ mergeVectorRegsToResultRegs(MachineIRBuilder &B, ArrayRef<Register> DstRegs,
for (int I = DstRegs.size(); I != NumDst; ++I)
PadDstRegs[I] = MRI.createGenericVirtualRegister(LLTy);
return B.buildUnmerge(PadDstRegs, UnmergeSrcReg);
}
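To make the widening arithmetic above concrete, here is a small self-contained
sketch of the <3 x s16>-via-<2 x s16> example from the comment (plain integer
math standing in for getLCMType and the G_CONCAT_VECTORS/G_UNMERGE_VALUES
operand counts):

#include <cstdio>
#include <numeric>

int main() {
  // Bit sizes for the example in the comment above.
  const unsigned LLTyBits = 3 * 16;  // destination type <3 x s16>
  const unsigned PartBits = 2 * 16;  // register part type <2 x s16>
  const unsigned LCMBits = std::lcm(LLTyBits, PartBits); // <6 x s16> = 96

  const unsigned NumSrc = 2;                   // parts actually holding data
  const unsigned NumWide = LCMBits / PartBits; // concat inputs incl. undef
  const unsigned NumDst = LCMBits / LLTyBits;  // unmerge results incl. dead

  std::printf("concat takes %u x <2 x s16> (%u undef), unmerge yields "
              "%u x <3 x s16> (%u dead)\n",
              NumWide, NumWide - NumSrc, NumDst, NumDst - 1);
  return 0;
}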
/// Create a sequence of instructions to combine pieces split into register
/// typed values to the original IR value. \p OrigRegs contains the destination
/// value registers of type \p LLTy, and \p Regs contains the legalized pieces
/// with type \p PartLLT. This is used for incoming values (physregs to vregs).
static void buildCopyFromRegs(MachineIRBuilder &B, ArrayRef<Register> OrigRegs,
ArrayRef<Register> Regs, LLT LLTy, LLT PartLLT) {
MachineRegisterInfo &MRI = *B.getMRI();
// We could just insert a regular copy, but this is unreachable at the moment.
assert(LLTy != PartLLT && "identical part types shouldn't reach here");
if (PartLLT.isVector() == LLTy.isVector() &&
PartLLT.getScalarSizeInBits() > LLTy.getScalarSizeInBits()) {
assert(OrigRegs.size() == 1 && Regs.size() == 1);
B.buildTrunc(OrigRegs[0], Regs[0]);
return;
}
if (!LLTy.isVector() && !PartLLT.isVector()) {
assert(OrigRegs.size() == 1);
LLT OrigTy = MRI.getType(OrigRegs[0]);
@ -301,9 +327,9 @@ static void buildCopyToParts(MachineIRBuilder &B, ArrayRef<Register> OrigRegs,
return;
}
if (PartLLT.isVector()) {
assert(OrigRegs.size() == 1 &&
LLTy.getScalarType() == PartLLT.getElementType());
mergeVectorRegsToResultRegs(B, OrigRegs, Regs);
return;
}
@ -353,6 +379,71 @@ static void buildCopyToParts(MachineIRBuilder &B, ArrayRef<Register> OrigRegs,
}
}
/// Create a sequence of instructions to expand the value in \p SrcReg (of type
/// \p SrcTy) to the types in \p DstRegs (of type \p PartTy). \p ExtendOp should
/// contain the type of scalar value extension if necessary.
///
/// This is used for outgoing values (vregs to physregs)
static void buildCopyToRegs(MachineIRBuilder &B, ArrayRef<Register> DstRegs,
Register SrcReg, LLT SrcTy, LLT PartTy,
unsigned ExtendOp = TargetOpcode::G_ANYEXT) {
// We could just insert a regular copy, but this is unreachable at the moment.
assert(SrcTy != PartTy && "identical part types shouldn't reach here");
const unsigned PartSize = PartTy.getSizeInBits();
if (PartTy.isVector() == SrcTy.isVector() &&
PartTy.getScalarSizeInBits() > SrcTy.getScalarSizeInBits()) {
assert(DstRegs.size() == 1);
B.buildInstr(ExtendOp, {DstRegs[0]}, {SrcReg});
return;
}
if (SrcTy.isVector() && !PartTy.isVector() &&
PartSize > SrcTy.getElementType().getSizeInBits()) {
// Vector was scalarized, and the elements extended.
auto UnmergeToEltTy = B.buildUnmerge(SrcTy.getElementType(), SrcReg);
for (int i = 0, e = DstRegs.size(); i != e; ++i)
B.buildAnyExt(DstRegs[i], UnmergeToEltTy.getReg(i));
return;
}
LLT GCDTy = getGCDType(SrcTy, PartTy);
if (GCDTy == PartTy) {
// If this is already evenly divisible, we can create a simple unmerge.
B.buildUnmerge(DstRegs, SrcReg);
return;
}
MachineRegisterInfo &MRI = *B.getMRI();
LLT DstTy = MRI.getType(DstRegs[0]);
LLT LCMTy = getLCMType(SrcTy, PartTy);
const unsigned LCMSize = LCMTy.getSizeInBits();
const unsigned DstSize = DstTy.getSizeInBits();
const unsigned SrcSize = SrcTy.getSizeInBits();
Register UnmergeSrc = SrcReg;
if (LCMSize != SrcSize) {
// Widen to the common type.
Register Undef = B.buildUndef(SrcTy).getReg(0);
SmallVector<Register, 8> MergeParts(1, SrcReg);
for (unsigned Size = SrcSize; Size != LCMSize; Size += SrcSize)
MergeParts.push_back(Undef);
UnmergeSrc = B.buildMerge(LCMTy, MergeParts).getReg(0);
}
// Unmerge to the original registers and pad with dead defs.
SmallVector<Register, 8> UnmergeResults(DstRegs.begin(), DstRegs.end());
for (unsigned Size = DstSize * DstRegs.size(); Size != LCMSize;
Size += DstSize) {
UnmergeResults.push_back(MRI.createGenericVirtualRegister(DstTy));
}
B.buildUnmerge(UnmergeResults, UnmergeSrc);
}
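A rough standalone model of the size bookkeeping buildCopyToRegs performs,
assuming plain scalar sizes (the real code works on LLTs through
getGCDType/getLCMType, and the destination register count is whatever the
caller requested; the sizes below are made-up examples):

#include <cstdio>
#include <numeric>

// How a source value of SrcBits is split into NumDstRegs pieces of PartBits,
// mirroring the plain-unmerge vs. LCM-widening decision above.
static void splitPlan(unsigned SrcBits, unsigned PartBits,
                      unsigned NumDstRegs) {
  if (std::gcd(SrcBits, PartBits) == PartBits) {
    // Evenly divisible: a single G_UNMERGE_VALUES is enough.
    std::printf("s%u -> %u x s%u via a plain unmerge\n", SrcBits,
                SrcBits / PartBits, PartBits);
    return;
  }
  const unsigned LCMBits = std::lcm(SrcBits, PartBits);
  const unsigned UndefMergeOps = LCMBits / SrcBits - 1; // undefs merged in
  const unsigned TotalPieces = LCMBits / PartBits;      // unmerge results
  const unsigned DeadDefs = TotalPieces - NumDstRegs;   // padding dead defs
  std::printf("s%u widened to s%u with %u undef(s), unmerged into %u x s%u "
              "(%u dead)\n",
              SrcBits, LCMBits, UndefMergeOps, TotalPieces, PartBits, DeadDefs);
}

int main() {
  splitPlan(64, 32, 2); // evenly divisible case
  splitPlan(96, 64, 2); // needs widening: s96 -> s192, one dead s64 def
  return 0;
}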
bool CallLowering::handleAssignments(MachineIRBuilder &MIRBuilder,
SmallVectorImpl<ArgInfo> &Args,
ValueHandler &Handler,
@ -367,6 +458,14 @@ bool CallLowering::handleAssignments(MachineIRBuilder &MIRBuilder,
ThisReturnReg);
}
static unsigned extendOpFromFlags(llvm::ISD::ArgFlagsTy Flags) {
if (Flags.isSExt())
return TargetOpcode::G_SEXT;
if (Flags.isZExt())
return TargetOpcode::G_ZEXT;
return TargetOpcode::G_ANYEXT;
}
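For illustration, a tiny standalone sketch of what the chosen extend opcode
means for an s8 value widened into an s32 part register (plain integers
standing in for the MIR; for the any-extend case the upper bits are
unspecified, so zero is just one legal choice):

#include <cstdint>
#include <cstdio>

// Models the effect of the opcode returned by extendOpFromFlags when an s8
// value is widened to an s32 part.
static uint32_t widen8To32(uint8_t V, bool IsSExt, bool IsZExt) {
  if (IsSExt) // G_SEXT
    return static_cast<uint32_t>(static_cast<int32_t>(static_cast<int8_t>(V)));
  if (IsZExt) // G_ZEXT
    return static_cast<uint32_t>(V);
  return static_cast<uint32_t>(V); // G_ANYEXT (upper bits arbitrary)
}

int main() {
  std::printf("sext:   %08x\n",
              static_cast<unsigned>(widen8To32(0x80, true, false)));
  std::printf("zext:   %08x\n",
              static_cast<unsigned>(widen8To32(0x80, false, true)));
  std::printf("anyext: %08x\n",
              static_cast<unsigned>(widen8To32(0x80, false, false)));
  return 0;
}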
bool CallLowering::handleAssignments(CCState &CCInfo,
SmallVectorImpl<CCValAssign> &ArgLocs,
MachineIRBuilder &MIRBuilder,
@ -374,6 +473,7 @@ bool CallLowering::handleAssignments(CCState &CCInfo,
ValueHandler &Handler,
Register ThisReturnReg) const {
MachineFunction &MF = MIRBuilder.getMF();
MachineRegisterInfo &MRI = MF.getRegInfo();
const Function &F = MF.getFunction();
const DataLayout &DL = F.getParent()->getDataLayout();
@ -399,10 +499,20 @@ bool CallLowering::handleAssignments(CCState &CCInfo,
if (Handler.assignArg(i, NewVT, NewVT, CCValAssign::Full, Args[i],
Args[i].Flags[0], CCInfo))
return false;
// If we couldn't directly assign this part, some casting may be
// necessary. Create the new register, but defer inserting the conversion
// instructions.
assert(Args[i].OrigRegs.empty());
Args[i].OrigRegs.push_back(Args[i].Regs[0]);
assert(Args[i].Regs.size() == 1);
const LLT VATy(NewVT);
Args[i].Regs[0] = MRI.createGenericVirtualRegister(VATy);
continue;
}
assert(NumParts > 1);
const LLT NewLLT(NewVT);
// For incoming arguments (physregs to vregs), we could have values in
// physregs (or memlocs) which we want to extract and copy to vregs.
@ -419,13 +529,11 @@ bool CallLowering::handleAssignments(CCState &CCInfo,
Args[i].OrigRegs.push_back(Args[i].Regs[0]);
Args[i].Regs.clear();
Args[i].Flags.clear();
// For each split register, create and assign a vreg that will store
// the incoming component of the larger value. These will later be
// merged to form the final vreg.
for (unsigned Part = 0; Part < NumParts; ++Part) {
Register Reg = MRI.createGenericVirtualRegister(NewLLT);
ISD::ArgFlagsTy Flags = OrigFlags;
if (Part == 0) {
Flags.setSplit();
@ -443,12 +551,13 @@ bool CallLowering::handleAssignments(CCState &CCInfo,
}
}
} else {
assert(Args[i].Regs.size() == 1);
// This type is passed via multiple registers in the calling convention.
// We need to extract the individual parts.
assert(Args[i].OrigRegs.empty());
Args[i].OrigRegs.push_back(Args[i].Regs[0]);
ISD::ArgFlagsTy OrigFlags = Args[i].Flags[0];
// We're going to replace the regs and flags with the split ones.
Args[i].Regs.clear();
@ -471,7 +580,9 @@ bool CallLowering::handleAssignments(CCState &CCInfo,
Flags.setReturned(false);
}
Register NewReg = MRI.createGenericVirtualRegister(NewLLT);
Args[i].Regs.push_back(NewReg);
Args[i].Flags.push_back(Flags);
if (Handler.assignArg(i, NewVT, NewVT, CCValAssign::Full,
Args[i], Args[i].Flags[PartIdx], CCInfo))
@ -495,7 +606,6 @@ bool CallLowering::handleAssignments(CCState &CCInfo,
continue;
}
EVT OrigVT = EVT::getEVT(Args[i].Ty);
EVT VAVT = VA.getValVT();
const LLT OrigTy = getLLTForType(*Args[i].Ty, DL);
const LLT VATy(VAVT.getSimpleVT());
@ -503,12 +613,18 @@ bool CallLowering::handleAssignments(CCState &CCInfo,
// Expected to be multiple regs for a single incoming arg.
// There should be Regs.size() ArgLocs per argument.
unsigned NumArgRegs = Args[i].Regs.size();
assert((j + (NumArgRegs - 1)) < ArgLocs.size() &&
"Too many regs for number of args");
// Coerce into outgoing value types before register assignment.
if (!Handler.isIncomingArgumentHandler() && OrigTy != VATy) {
assert(Args[i].OrigRegs.size() == 1);
buildCopyToRegs(MIRBuilder, Args[i].Regs, Args[i].OrigRegs[0], OrigTy,
VATy, extendOpFromFlags(Args[i].Flags[0]));
}
for (unsigned Part = 0; Part < NumArgRegs; ++Part) {
Register ArgReg = Args[i].Regs[Part];
LLT ArgRegTy = MRI.getType(ArgReg);
// There should be Regs.size() ArgLocs per argument.
VA = ArgLocs[j + Part];
if (VA.isMemLoc()) {
@ -536,57 +652,16 @@ bool CallLowering::handleAssignments(CCState &CCInfo,
continue;
}
// GlobalISel does not currently work for scalable vectors.
if (OrigVT.getFixedSizeInBits() >= VAVT.getFixedSizeInBits() ||
!Handler.isIncomingArgumentHandler()) {
// This is an argument that might have been split. There should be
// Regs.size() ArgLocs per argument.
// Insert the argument copies. If VAVT < OrigVT, we'll insert the merge
// to the original register after handling all of the parts.
Handler.assignValueToReg(Args[i].Regs[Part], VA.getLocReg(), VA);
continue;
}
// This ArgLoc covers multiple pieces, so we need to split it.
Register NewReg = MRI.createGenericVirtualRegister(VATy);
Handler.assignValueToReg(NewReg, VA.getLocReg(), VA);
// If it's a vector type, we either need to truncate the elements
// or do an unmerge to get the lower block of elements.
if (VATy.isVector() &&
VATy.getNumElements() > OrigVT.getVectorNumElements()) {
// Just handle the case where the VA type is a multiple of original
// type.
if (VATy.getNumElements() % OrigVT.getVectorNumElements() != 0) {
LLVM_DEBUG(dbgs() << "Incoming promoted vector arg elts is not a "
"multiple of orig type elt: "
<< VATy << " vs " << OrigTy);
return false;
}
SmallVector<Register, 4> DstRegs = {ArgReg};
unsigned NumParts =
VATy.getNumElements() / OrigVT.getVectorNumElements() - 1;
for (unsigned Idx = 0; Idx < NumParts; ++Idx)
DstRegs.push_back(
MIRBuilder.getMRI()->createGenericVirtualRegister(OrigTy));
MIRBuilder.buildUnmerge(DstRegs, {NewReg});
} else if (VATy.getScalarSizeInBits() > ArgRegTy.getScalarSizeInBits()) {
MIRBuilder.buildTrunc(ArgReg, {NewReg}).getReg(0);
} else {
MIRBuilder.buildCopy(ArgReg, NewReg);
}
Handler.assignValueToReg(ArgReg, VA.getLocReg(), VA);
}
// Now that all pieces have been assigned, re-pack the register typed values
// into the original value typed registers.
if (Handler.isIncomingArgumentHandler() && OrigTy != VATy) {
// Merge the split registers into the expected larger result vregs of
// the original call.
buildCopyFromRegs(MIRBuilder, Args[i].OrigRegs, Args[i].Regs, OrigTy,
VATy);
}
j += NumArgRegs - 1;

View File

@ -278,108 +278,6 @@ static ISD::NodeType extOpcodeToISDExtOpcode(unsigned MIOpc) {
}
}
void AMDGPUCallLowering::processSplitArgs(
MachineIRBuilder &B, const ArgInfo &OrigArg,
const SmallVectorImpl<ArgInfo> &SplitArg,
SmallVectorImpl<ArgInfo> &SplitArgs, const DataLayout &DL,
CallingConv::ID CallConv, bool IsOutgoing,
SplitArgTy PerformArgSplit) const {
LLVMContext &Ctx = OrigArg.Ty->getContext();
const SITargetLowering &TLI = *getTLI<SITargetLowering>();
// FIXME: This is mostly nasty pre-processing before handleAssignments. Most
// of this should be performed by handleAssignments.
for (int SplitIdx = 0, e = SplitArg.size(); SplitIdx != e; ++SplitIdx) {
const ArgInfo &CurSplitArg = SplitArg[SplitIdx];
Register Reg = OrigArg.Regs[SplitIdx];
EVT VT = EVT::getEVT(CurSplitArg.Ty);
LLT LLTy = getLLTForType(*CurSplitArg.Ty, DL);
unsigned NumParts = TLI.getNumRegistersForCallingConv(Ctx, CallConv, VT);
MVT RegVT = TLI.getRegisterTypeForCallingConv(Ctx, CallConv, VT);
if (NumParts == 1) {
// No splitting to do, but we want to replace the original type (e.g. [1 x
// double] -> double).
SplitArgs.emplace_back(Reg, CurSplitArg.Ty, OrigArg.Flags,
OrigArg.IsFixed);
continue;
}
SmallVector<Register, 8> SplitRegs;
Type *PartTy = EVT(RegVT).getTypeForEVT(Ctx);
LLT PartLLT = getLLTForType(*PartTy, DL);
MachineRegisterInfo &MRI = *B.getMRI();
// FIXME: Should we be reporting all of the part registers for a single
// argument, and let handleAssignments take care of the repacking?
for (unsigned i = 0; i < NumParts; ++i) {
Register PartReg = MRI.createGenericVirtualRegister(PartLLT);
SplitRegs.push_back(PartReg);
SplitArgs.emplace_back(ArrayRef<Register>(PartReg), PartTy, OrigArg.Flags);
}
PerformArgSplit(SplitRegs, Reg, LLTy, PartLLT, SplitIdx);
}
}
// TODO: Move to generic code
static void unpackRegsToOrigType(MachineIRBuilder &B,
ArrayRef<Register> DstRegs,
Register SrcReg,
const CallLowering::ArgInfo &Info,
LLT SrcTy,
LLT PartTy) {
assert(DstRegs.size() > 1 && "Nothing to unpack");
const unsigned PartSize = PartTy.getSizeInBits();
if (SrcTy.isVector() && !PartTy.isVector() &&
PartSize > SrcTy.getElementType().getSizeInBits()) {
// Vector was scalarized, and the elements extended.
auto UnmergeToEltTy = B.buildUnmerge(SrcTy.getElementType(), SrcReg);
for (int i = 0, e = DstRegs.size(); i != e; ++i)
B.buildAnyExt(DstRegs[i], UnmergeToEltTy.getReg(i));
return;
}
LLT GCDTy = getGCDType(SrcTy, PartTy);
if (GCDTy == PartTy) {
// If this already evenly divisible, we can create a simple unmerge.
B.buildUnmerge(DstRegs, SrcReg);
return;
}
MachineRegisterInfo &MRI = *B.getMRI();
LLT DstTy = MRI.getType(DstRegs[0]);
LLT LCMTy = getLCMType(SrcTy, PartTy);
const unsigned LCMSize = LCMTy.getSizeInBits();
const unsigned DstSize = DstTy.getSizeInBits();
const unsigned SrcSize = SrcTy.getSizeInBits();
Register UnmergeSrc = SrcReg;
if (LCMSize != SrcSize) {
// Widen to the common type.
Register Undef = B.buildUndef(SrcTy).getReg(0);
SmallVector<Register, 8> MergeParts(1, SrcReg);
for (unsigned Size = SrcSize; Size != LCMSize; Size += SrcSize)
MergeParts.push_back(Undef);
UnmergeSrc = B.buildMerge(LCMTy, MergeParts).getReg(0);
}
// Unmerge to the original registers and pad with dead defs.
SmallVector<Register, 8> UnmergeResults(DstRegs.begin(), DstRegs.end());
for (unsigned Size = DstSize * DstRegs.size(); Size != LCMSize;
Size += DstSize) {
UnmergeResults.push_back(MRI.createGenericVirtualRegister(DstTy));
}
B.buildUnmerge(UnmergeResults, UnmergeSrc);
}
bool AMDGPUCallLowering::canLowerReturn(MachineFunction &MF,
CallingConv::ID CallConv,
SmallVectorImpl<BaseArgInfo> &Outs,
@ -418,12 +316,6 @@ bool AMDGPUCallLowering::lowerReturnVal(MachineIRBuilder &B,
assert(VRegs.size() == SplitEVTs.size() &&
"For each split Type there should be exactly one VReg.");
// We pre-process the return value decomposed into EVTs.
SmallVector<ArgInfo, 8> PreSplitRetInfos;
// Further processing is applied to split the arguments from PreSplitRetInfos
// into 32-bit pieces in SplitRetInfos before passing off to
// handleAssignments.
SmallVector<ArgInfo, 8> SplitRetInfos;
for (unsigned i = 0; i < SplitEVTs.size(); ++i) {
@ -457,18 +349,7 @@ bool AMDGPUCallLowering::lowerReturnVal(MachineIRBuilder &B,
setArgFlags(RetInfo, AttributeList::ReturnIndex, DL, F);
}
splitToValueTypes(RetInfo, PreSplitRetInfos, DL, CC);
// FIXME: This splitting should mostly be done by handleAssignments
processSplitArgs(B, RetInfo,
PreSplitRetInfos, SplitRetInfos, DL, CC, true,
[&](ArrayRef<Register> Regs, Register SrcReg, LLT LLTy,
LLT PartLLT, int VTSplitIdx) {
unpackRegsToOrigType(B, Regs, SrcReg,
PreSplitRetInfos[VTSplitIdx], LLTy,
PartLLT);
});
PreSplitRetInfos.clear();
splitToValueTypes(RetInfo, SplitRetInfos, DL, CC);
}
CCAssignFn *AssignFn = TLI.CCAssignFnForReturn(CC, F.isVarArg());
@ -1073,22 +954,8 @@ bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
}
SmallVector<ArgInfo, 8> OutArgs;
SmallVector<ArgInfo, 8> SplitArg;
for (auto &OrigArg : Info.OrigArgs) {
splitToValueTypes(OrigArg, SplitArg, DL, Info.CallConv);
processSplitArgs(
MIRBuilder, OrigArg, SplitArg, OutArgs, DL, Info.CallConv, true,
// FIXME: We should probably be passing multiple registers to
// handleAssignments to do this
[&](ArrayRef<Register> Regs, Register SrcReg, LLT LLTy, LLT PartLLT,
int VTSplitIdx) {
unpackRegsToOrigType(MIRBuilder, Regs, SrcReg, OrigArg, LLTy, PartLLT);
});
SplitArg.clear();
}
for (auto &OrigArg : Info.OrigArgs)
splitToValueTypes(OrigArg, OutArgs, DL, Info.CallConv);
// If we can lower as a tail call, do that instead.
bool CanTailCallOpt = false;

View File

@ -28,16 +28,6 @@ class AMDGPUCallLowering final : public CallLowering {
void lowerParameter(MachineIRBuilder &B, Type *ParamTy, uint64_t Offset,
Align Alignment, Register DstReg) const;
/// A function of this type is used to perform value split action.
using SplitArgTy = std::function<void(ArrayRef<Register>, Register, LLT, LLT, int)>;
void processSplitArgs(MachineIRBuilder &B, const ArgInfo &OrigArgInfo,
const SmallVectorImpl<ArgInfo> &SplitArg,
SmallVectorImpl<ArgInfo> &SplitArgs,
const DataLayout &DL, CallingConv::ID CallConv,
bool IsOutgoing,
SplitArgTy PerformArgSplit) const;
bool canLowerReturn(MachineFunction &MF, CallingConv::ID CallConv,
SmallVectorImpl<BaseArgInfo> &Outs,
bool IsVarArg) const override;

View File

@ -156,9 +156,9 @@ define void @caller_s128(i128 *%ptr) {
; CHECK: [[LOAD:%[0-9]+]]:_(s128) = G_LOAD [[COPY]](p0) :: (load 16 from %ir.ptr)
; CHECK: ADJCALLSTACKDOWN 0, 0, implicit-def $sp, implicit $sp
; CHECK: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[LOAD]](s128)
; CHECK: $x0 = COPY [[UV]](s64)
; CHECK: $x1 = COPY [[UV1]](s64)
; CHECK: [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[LOAD]](s128)
; CHECK: $x2 = COPY [[UV2]](s64)
; CHECK: $x3 = COPY [[UV3]](s64)
; CHECK: $x4 = COPY [[COPY]](p0)

View File

@ -19,12 +19,9 @@ define i24 @test_v3i8(<3 x i8> %a) {
; CHECK: bb.1 (%ir-block.0):
; CHECK: liveins: $w0, $w1, $w2
; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY $w1
; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32)
; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY $w2
; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY [[COPY4]](s32)
; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY1]](s32), [[COPY3]](s32), [[COPY5]](s32)
; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $w1
; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY $w2
; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32)
; CHECK: [[TRUNC:%[0-9]+]]:_(<3 x s8>) = G_TRUNC [[BUILD_VECTOR]](<3 x s32>)
; CHECK: [[BITCAST:%[0-9]+]]:_(s24) = G_BITCAST [[TRUNC]](<3 x s8>)
; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[BITCAST]](s24)

View File

@ -27,9 +27,9 @@ declare void @use_s128(i128 %a, i128 %b)
; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
; CHECK: ADJCALLSTACKDOWN 0, 0, implicit-def $sp, implicit $sp
; CHECK: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[MV]](s128)
; CHECK: $x0 = COPY [[UV]](s64)
; CHECK: $x1 = COPY [[UV1]](s64)
; CHECK: [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[MV1]](s128)
; CHECK: $x2 = COPY [[UV2]](s64)
; CHECK: $x3 = COPY [[UV3]](s64)
; CHECK: BL @use_s128, csr_aarch64_aapcs, implicit-def $lr, implicit $sp, implicit $x0, implicit $x1, implicit $x2, implicit $x3

View File

@ -17,8 +17,9 @@ define i3 @bug47619(i64 %arg, i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, i64 %a
; CHECK: [[COPY6:%[0-9]+]]:_(s64) = COPY $x6
; CHECK: [[COPY7:%[0-9]+]]:_(s64) = COPY $x7
; CHECK: [[FRAME_INDEX:%[0-9]+]]:_(p0) = G_FRAME_INDEX %fixed-stack.0
; CHECK: [[LOAD:%[0-9]+]]:_(s3) = G_LOAD [[FRAME_INDEX]](p0) :: (invariant load 1 from %fixed-stack.0, align 16)
; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s3)
; CHECK: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX]](p0) :: (invariant load 4 from %fixed-stack.0, align 16)
; CHECK: [[TRUNC:%[0-9]+]]:_(s3) = G_TRUNC [[LOAD]](s32)
; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC]](s3)
; CHECK: $w0 = COPY [[ANYEXT]](s32)
; CHECK: RET_ReallyLR implicit $w0
bb:

View File

@ -29,9 +29,9 @@ body: |
; CHECK: [[LOAD1:%[0-9]+]]:_(s128) = G_LOAD [[COPY1]](p0) :: (load 16 from %ir.v2ptr)
; CHECK: ADJCALLSTACKDOWN 0, 0, implicit-def $sp, implicit $sp
; CHECK: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[LOAD]](s128)
; CHECK: $x0 = COPY [[UV]](s64)
; CHECK: $x1 = COPY [[UV1]](s64)
; CHECK: [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[LOAD1]](s128)
; CHECK: $x2 = COPY [[UV2]](s64)
; CHECK: $x3 = COPY [[UV3]](s64)
; CHECK: BL &__udivti3, csr_darwin_aarch64_aapcs, implicit-def $lr, implicit $sp, implicit $x0, implicit $x1, implicit $x2, implicit $x3, implicit-def $x0, implicit-def $x1
@ -70,9 +70,9 @@ body: |
; CHECK: [[LOAD1:%[0-9]+]]:_(s128) = G_LOAD [[COPY1]](p0) :: (load 16 from %ir.v2ptr)
; CHECK: ADJCALLSTACKDOWN 0, 0, implicit-def $sp, implicit $sp
; CHECK: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[LOAD]](s128)
; CHECK: $x0 = COPY [[UV]](s64)
; CHECK: $x1 = COPY [[UV1]](s64)
; CHECK: [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[LOAD1]](s128)
; CHECK: $x2 = COPY [[UV2]](s64)
; CHECK: $x3 = COPY [[UV3]](s64)
; CHECK: BL &__divti3, csr_darwin_aarch64_aapcs, implicit-def $lr, implicit $sp, implicit $x0, implicit $x1, implicit $x2, implicit $x3, implicit-def $x0, implicit-def $x1

View File

@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
; RUN: llc -mtriple=aarch64-linux-gnu -O0 -global-isel -stop-after=irtranslator -o - %s | FileCheck %s
define <1 x float> @foo(<1 x float> %v) {
; CHECK-LABEL: name: foo
define <1 x float> @ret_v1f32(<1 x float> %v) {
; CHECK-LABEL: name: ret_v1f32
; CHECK: bb.1 (%ir-block.0):
; CHECK: liveins: $d0
; CHECK: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $d0
@ -13,3 +13,23 @@ define <1 x float> @foo(<1 x float> %v) {
; CHECK: RET_ReallyLR implicit $d0
ret <1 x float> %v
}
define <1 x i8*> @ret_v1p0(<1 x i8*> %v) {
; CHECK-LABEL: name: ret_v1p0
; CHECK: bb.1 (%ir-block.0):
; CHECK: liveins: $d0
; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $d0
; CHECK: $d0 = COPY [[COPY]](p0)
; CHECK: RET_ReallyLR implicit $d0
ret <1 x i8*> %v
}
define <1 x i8 addrspace(1)*> @ret_v1p1(<1 x i8 addrspace(1)*> %v) {
; CHECK-LABEL: name: ret_v1p1
; CHECK: bb.1 (%ir-block.0):
; CHECK: liveins: $d0
; CHECK: [[COPY:%[0-9]+]]:_(p1) = COPY $d0
; CHECK: $d0 = COPY [[COPY]](p1)
; CHECK: RET_ReallyLR implicit $d0
ret <1 x i8 addrspace(1)*> %v
}

View File

@ -363,20 +363,12 @@ define amdgpu_ps { i32, i32 } @s_andn2_v2i16_multi_foldable_use(<2 x i16> inreg
}
define <2 x i16> @v_andn2_v2i16(<2 x i16> %src0, <2 x i16> %src1) {
; GFX6-LABEL: v_andn2_v2i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_xor_b32_e32 v1, -1, v1
; GFX6-NEXT: v_and_b32_e32 v0, v0, v1
; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_andn2_v2i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1
; GFX9-NEXT: v_and_b32_e32 v0, v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GCN-LABEL: v_andn2_v2i16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_xor_b32_e32 v1, -1, v1
; GCN-NEXT: v_and_b32_e32 v0, v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
%not.src1 = xor <2 x i16> %src1, <i16 -1, i16 -1>
%and = and <2 x i16> %src0, %not.src1
ret <2 x i16> %and

View File

@ -473,13 +473,18 @@ define <2 x i16> @v_ashr_v2i16(<2 x i16> %value, <2 x i16> %amount) {
; GFX6-LABEL: v_ashr_v2i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s4, 0xffff
; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX6-NEXT: v_and_b32_e32 v1, s4, v1
; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX6-NEXT: v_ashrrev_i32_e32 v0, v1, v0
; GFX6-NEXT: v_bfe_i32 v1, v2, 0, 16
; GFX6-NEXT: v_ashrrev_i32_e32 v1, v3, v1
; GFX6-NEXT: v_and_b32_e32 v1, s4, v1
; GFX6-NEXT: v_and_b32_e32 v0, s4, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_ashr_v2i16:
@ -504,10 +509,15 @@ define <2 x i16> @v_ashr_v2i16_15(<2 x i16> %value) {
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
; GFX6-NEXT: v_ashrrev_i32_e32 v0, 15, v0
; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX6-NEXT: v_ashrrev_i32_e32 v1, 15, v1
; GFX6-NEXT: s_mov_b32 s4, 0xffff
; GFX6-NEXT: v_ashrrev_i32_e32 v0, 15, v0
; GFX6-NEXT: v_and_b32_e32 v1, s4, v1
; GFX6-NEXT: v_and_b32_e32 v0, s4, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_ashr_v2i16_15:

View File

@ -467,15 +467,18 @@ define <2 x i16> @v_bswap_v2i16(<2 x i16> %src) {
; GFX7-LABEL: v_bswap_v2i16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v0
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v0
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v0
; GFX7-NEXT: v_lshrrev_b32_e32 v3, 8, v3
; GFX7-NEXT: v_or_b32_e32 v2, v3, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0
; GFX7-NEXT: v_or_b32_e32 v1, v0, v1
; GFX7-NEXT: v_mov_b32_e32 v0, v2
; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
; GFX7-NEXT: v_lshrrev_b32_e32 v3, 8, v3
; GFX7-NEXT: v_bfe_u32 v0, v0, 0, 16
; GFX7-NEXT: v_or_b32_e32 v2, v3, v2
; GFX7-NEXT: v_bfe_u32 v1, v2, 0, 16
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_bswap_v2i16:

View File

@ -81,12 +81,17 @@ define <2 x i16> @halfinsts_add_v2i16(<2 x i16> %arg0) #1 {
; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
; CHECK: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[COPY4]], [[COPY5]]
; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
; CHECK: [[COPY6:%[0-9]+]]:_(s32) = COPY [[ADD]](s32)
; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C1]]
; CHECK: [[COPY7:%[0-9]+]]:_(s32) = COPY [[ADD1]](s32)
; CHECK: $vgpr0 = COPY [[COPY6]](s32)
; CHECK: $vgpr1 = COPY [[COPY7]](s32)
; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C1]]
; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32)
; CHECK: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
; CHECK: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
; CHECK: $vgpr0 = COPY [[BITCAST2]](<2 x s16>)
; CHECK: [[COPY8:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]]
; CHECK: S_SETPC_B64_return [[COPY8]], implicit $vgpr0, implicit $vgpr1
; CHECK: S_SETPC_B64_return [[COPY8]], implicit $vgpr0
%add = add <2 x i16> %arg0, %arg0
ret <2 x i16> %add
}

View File

@ -408,34 +408,38 @@ define <2 x half> @v_fdiv_v2f16(<2 x half> %a, <2 x half> %b) {
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX6-IEEE-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX6-IEEE-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX6-IEEE-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-IEEE-NEXT: v_div_scale_f32 v4, s[4:5], v3, v3, v2
; GFX6-IEEE-NEXT: v_rcp_f32_e32 v5, v4
; GFX6-IEEE-NEXT: v_div_scale_f32 v6, vcc, v2, v3, v2
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX6-IEEE-NEXT: v_fma_f32 v7, -v4, v5, 1.0
; GFX6-IEEE-NEXT: v_fma_f32 v5, v7, v5, v5
; GFX6-IEEE-NEXT: v_mul_f32_e32 v7, v6, v5
; GFX6-IEEE-NEXT: v_fma_f32 v8, -v4, v7, v6
; GFX6-IEEE-NEXT: v_fma_f32 v7, v8, v5, v7
; GFX6-IEEE-NEXT: v_fma_f32 v4, -v4, v7, v6
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v6, v0
; GFX6-IEEE-NEXT: v_div_fmas_f32 v0, v4, v5, v7
; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v0, v3, v2
; GFX6-IEEE-NEXT: v_div_fmas_f32 v4, v4, v5, v7
; GFX6-IEEE-NEXT: v_div_fixup_f32 v2, v4, v3, v2
; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v0
; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v0, v1, v0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v1, v0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v6
; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v6, v1, v6
; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
; GFX6-IEEE-NEXT: v_fma_f32 v7, -v2, v5, v4
; GFX6-IEEE-NEXT: v_fma_f32 v5, v7, v3, v5
; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v1, v6
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-IEEE-NEXT: v_bfe_u32 v1, v2, 0, 16
; GFX6-IEEE-NEXT: v_bfe_u32 v0, v0, 0, 16
; GFX6-IEEE-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-IEEE-NEXT: v_or_b32_e32 v0, v1, v0
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-FLUSH-LABEL: v_fdiv_v2f16:
@ -456,26 +460,30 @@ define <2 x half> @v_fdiv_v2f16(<2 x half> %a, <2 x half> %b) {
; GFX6-FLUSH-NEXT: v_fma_f32 v7, v8, v5, v7
; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v4, v7, v6
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; GFX6-FLUSH-NEXT: v_div_fmas_f32 v4, v4, v5, v7
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v5, v0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v4, v3, v2
; GFX6-FLUSH-NEXT: v_div_fmas_f32 v4, v4, v5, v7
; GFX6-FLUSH-NEXT: v_div_fixup_f32 v2, v4, v3, v2
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v5
; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v5, v1, v5
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v0
; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3
; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v0, v1, v0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v3, 1.0
; GFX6-FLUSH-NEXT: v_fma_f32 v3, v6, v3, v3
; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v4, v3
; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v4
; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v3, v6
; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v4
; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0
; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4
; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4
; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5
; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6
; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6
; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, v5
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6
; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v1, v0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_bfe_u32 v1, v2, 0, 16
; GFX6-FLUSH-NEXT: v_bfe_u32 v0, v0, 0, 16
; GFX6-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-FLUSH-NEXT: v_or_b32_e32 v0, v1, v0
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fdiv_v2f16:
@ -533,14 +541,18 @@ define <2 x half> @v_fdiv_v2f16_afn(<2 x half> %a, <2 x half> %b) {
; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: v_rcp_f32_e32 v3, v3
; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: v_rcp_f32_e32 v1, v1
; GFX6-NEXT: v_mul_f32_e32 v0, v0, v3
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: v_rcp_f32_e32 v3, v3
; GFX6-NEXT: v_mul_f32_e32 v1, v2, v1
; GFX6-NEXT: v_mul_f32_e32 v0, v0, v3
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16
; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fdiv_v2f16_afn:
@ -575,34 +587,38 @@ define <2 x half> @v_fdiv_v2f16_ulp25(<2 x half> %a, <2 x half> %b) {
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX6-IEEE-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX6-IEEE-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX6-IEEE-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-IEEE-NEXT: v_div_scale_f32 v4, s[4:5], v3, v3, v2
; GFX6-IEEE-NEXT: v_rcp_f32_e32 v5, v4
; GFX6-IEEE-NEXT: v_div_scale_f32 v6, vcc, v2, v3, v2
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX6-IEEE-NEXT: v_fma_f32 v7, -v4, v5, 1.0
; GFX6-IEEE-NEXT: v_fma_f32 v5, v7, v5, v5
; GFX6-IEEE-NEXT: v_mul_f32_e32 v7, v6, v5
; GFX6-IEEE-NEXT: v_fma_f32 v8, -v4, v7, v6
; GFX6-IEEE-NEXT: v_fma_f32 v7, v8, v5, v7
; GFX6-IEEE-NEXT: v_fma_f32 v4, -v4, v7, v6
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v6, v0
; GFX6-IEEE-NEXT: v_div_fmas_f32 v0, v4, v5, v7
; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v0, v3, v2
; GFX6-IEEE-NEXT: v_div_fmas_f32 v4, v4, v5, v7
; GFX6-IEEE-NEXT: v_div_fixup_f32 v2, v4, v3, v2
; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v0
; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v0, v1, v0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v1, v0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v6
; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v6, v1, v6
; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
; GFX6-IEEE-NEXT: v_fma_f32 v7, -v2, v5, v4
; GFX6-IEEE-NEXT: v_fma_f32 v5, v7, v3, v5
; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v1, v6
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-IEEE-NEXT: v_bfe_u32 v1, v2, 0, 16
; GFX6-IEEE-NEXT: v_bfe_u32 v0, v0, 0, 16
; GFX6-IEEE-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-IEEE-NEXT: v_or_b32_e32 v0, v1, v0
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-FLUSH-LABEL: v_fdiv_v2f16_ulp25:
@ -623,26 +639,30 @@ define <2 x half> @v_fdiv_v2f16_ulp25(<2 x half> %a, <2 x half> %b) {
; GFX6-FLUSH-NEXT: v_fma_f32 v7, v8, v5, v7
; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v4, v7, v6
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; GFX6-FLUSH-NEXT: v_div_fmas_f32 v4, v4, v5, v7
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v5, v0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v4, v3, v2
; GFX6-FLUSH-NEXT: v_div_fmas_f32 v4, v4, v5, v7
; GFX6-FLUSH-NEXT: v_div_fixup_f32 v2, v4, v3, v2
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v5
; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v5, v1, v5
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v0
; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3
; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v0, v1, v0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v3, 1.0
; GFX6-FLUSH-NEXT: v_fma_f32 v3, v6, v3, v3
; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v4, v3
; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v4
; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v3, v6
; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v4
; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0
; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4
; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4
; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5
; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6
; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6
; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, v5
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6
; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v1, v0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_bfe_u32 v1, v2, 0, 16
; GFX6-FLUSH-NEXT: v_bfe_u32 v0, v0, 0, 16
; GFX6-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-FLUSH-NEXT: v_or_b32_e32 v0, v1, v0
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fdiv_v2f16_ulp25:
@ -699,6 +719,7 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) {
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-IEEE-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v2, v2, v1
; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v1, v2, v1
@ -708,22 +729,25 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) {
; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v5, v0
; GFX6-IEEE-NEXT: v_div_fmas_f32 v0, v3, v4, v6
; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v0, v2, v1
; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
; GFX6-IEEE-NEXT: v_div_fixup_f32 v2, v3, v2, v1
; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v1
; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v1, v0, v1
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v1
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v5, v5, v1
; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v5, v1
; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v3, 1.0
; GFX6-IEEE-NEXT: v_fma_f32 v3, v6, v3, v3
; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v4, v3
; GFX6-IEEE-NEXT: v_fma_f32 v7, -v2, v6, v4
; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v3, v6
; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v6, v4
; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v6
; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v5, v1
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-IEEE-NEXT: v_bfe_u32 v1, v2, 0, 16
; GFX6-IEEE-NEXT: v_bfe_u32 v0, v0, 0, 16
; GFX6-IEEE-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-IEEE-NEXT: v_or_b32_e32 v0, v1, v0
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-FLUSH-LABEL: v_rcp_v2f16:
@ -745,25 +769,29 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) {
; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v5, v0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, s6
; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v2, v1
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v3, v2, v1
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v5, v5, v4
; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, v4, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v1, v2, 1.0
; GFX6-FLUSH-NEXT: v_fma_f32 v2, v6, v2, v2
; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v3, v2
; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v1, v6, v3
; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v2, v6
; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v6, v3
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v6
; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v1, v5, v4
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v4
; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v0, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v3, 1.0
; GFX6-FLUSH-NEXT: v_fma_f32 v3, v6, v3, v3
; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v3
; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5
; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v3, v6
; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6
; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v4
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_bfe_u32 v1, v1, 0, 16
; GFX6-FLUSH-NEXT: v_bfe_u32 v0, v0, 0, 16
; GFX6-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-FLUSH-NEXT: v_or_b32_e32 v0, v1, v0
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_rcp_v2f16:
@ -816,6 +844,7 @@ define <2 x half> @v_rcp_v2f16_arcp(<2 x half> %x) {
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-IEEE-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v2, v2, v1
; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v1, v2, v1
@ -825,22 +854,25 @@ define <2 x half> @v_rcp_v2f16_arcp(<2 x half> %x) {
; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v5, v0
; GFX6-IEEE-NEXT: v_div_fmas_f32 v0, v3, v4, v6
; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v0, v2, v1
; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
; GFX6-IEEE-NEXT: v_div_fixup_f32 v2, v3, v2, v1
; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v1
; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v1, v0, v1
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v1
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v5, v5, v1
; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v5, v1
; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v3, 1.0
; GFX6-IEEE-NEXT: v_fma_f32 v3, v6, v3, v3
; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v4, v3
; GFX6-IEEE-NEXT: v_fma_f32 v7, -v2, v6, v4
; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v3, v6
; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v6, v4
; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v6
; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v5, v1
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-IEEE-NEXT: v_bfe_u32 v1, v2, 0, 16
; GFX6-IEEE-NEXT: v_bfe_u32 v0, v0, 0, 16
; GFX6-IEEE-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-IEEE-NEXT: v_or_b32_e32 v0, v1, v0
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-FLUSH-LABEL: v_rcp_v2f16_arcp:
@ -862,25 +894,29 @@ define <2 x half> @v_rcp_v2f16_arcp(<2 x half> %x) {
; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v5, v0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, s6
; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v2, v1
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v3, v2, v1
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v5, v5, v4
; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, v4, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v1, v2, 1.0
; GFX6-FLUSH-NEXT: v_fma_f32 v2, v6, v2, v2
; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v3, v2
; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v1, v6, v3
; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v2, v6
; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v6, v3
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v6
; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v1, v5, v4
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v4
; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v0, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v3, 1.0
; GFX6-FLUSH-NEXT: v_fma_f32 v3, v6, v3, v3
; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v3
; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5
; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v3, v6
; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6
; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v4
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_bfe_u32 v1, v1, 0, 16
; GFX6-FLUSH-NEXT: v_bfe_u32 v0, v0, 0, 16
; GFX6-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-FLUSH-NEXT: v_or_b32_e32 v0, v1, v0
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_rcp_v2f16_arcp:
@ -935,11 +971,15 @@ define <2 x half> @v_rcp_v2f16_arcp_afn(<2 x half> %x) {
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: v_cvt_f32_f16_e32 v2, 1.0
; GFX6-NEXT: v_rcp_f32_e32 v1, v1
; GFX6-NEXT: v_rcp_f32_e32 v3, v0
; GFX6-NEXT: v_mul_f32_e32 v0, v2, v1
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: v_mul_f32_e32 v1, v2, v3
; GFX6-NEXT: v_rcp_f32_e32 v0, v0
; GFX6-NEXT: v_mul_f32_e32 v1, v2, v1
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-NEXT: v_mul_f32_e32 v0, v2, v0
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16
; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: v_or_b32_e32 v0, v1, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_rcp_v2f16_arcp_afn:
@ -971,6 +1011,7 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) {
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-IEEE-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v2, v2, v1
; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v1, v2, v1
@ -980,22 +1021,25 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) {
; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v5, v0
; GFX6-IEEE-NEXT: v_div_fmas_f32 v0, v3, v4, v6
; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v0, v2, v1
; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
; GFX6-IEEE-NEXT: v_div_fixup_f32 v2, v3, v2, v1
; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v1
; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v1, v0, v1
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v1
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v5, v5, v1
; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v5, v1
; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v3, 1.0
; GFX6-IEEE-NEXT: v_fma_f32 v3, v6, v3, v3
; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v4, v3
; GFX6-IEEE-NEXT: v_fma_f32 v7, -v2, v6, v4
; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v3, v6
; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v6, v4
; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v6
; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v5, v1
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-IEEE-NEXT: v_bfe_u32 v1, v2, 0, 16
; GFX6-IEEE-NEXT: v_bfe_u32 v0, v0, 0, 16
; GFX6-IEEE-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-IEEE-NEXT: v_or_b32_e32 v0, v1, v0
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-FLUSH-LABEL: v_rcp_v2f16_ulp25:
@ -1017,25 +1061,29 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) {
; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v5, v0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, s6
; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v2, v1
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v3, v2, v1
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v5, v5, v4
; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, v4, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v1, v2, 1.0
; GFX6-FLUSH-NEXT: v_fma_f32 v2, v6, v2, v2
; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v3, v2
; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v1, v6, v3
; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v2, v6
; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v6, v3
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v6
; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v1, v5, v4
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v4
; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v0, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v3, 1.0
; GFX6-FLUSH-NEXT: v_fma_f32 v3, v6, v3, v3
; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v3
; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5
; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v3, v6
; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6
; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v4
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_bfe_u32 v1, v1, 0, 16
; GFX6-FLUSH-NEXT: v_bfe_u32 v0, v0, 0, 16
; GFX6-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-FLUSH-NEXT: v_or_b32_e32 v0, v1, v0
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_rcp_v2f16_ulp25:
@ -1068,14 +1116,18 @@ define <2 x half> @v_fdiv_v2f16_afn_ulp25(<2 x half> %a, <2 x half> %b) {
; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: v_rcp_f32_e32 v3, v3
; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: v_rcp_f32_e32 v1, v1
; GFX6-NEXT: v_mul_f32_e32 v0, v0, v3
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: v_rcp_f32_e32 v3, v3
; GFX6-NEXT: v_mul_f32_e32 v1, v2, v1
; GFX6-NEXT: v_mul_f32_e32 v0, v0, v3
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16
; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fdiv_v2f16_afn_ulp25:
@ -1110,34 +1162,38 @@ define <2 x half> @v_fdiv_v2f16_arcp_ulp25(<2 x half> %a, <2 x half> %b) {
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX6-IEEE-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX6-IEEE-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX6-IEEE-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-IEEE-NEXT: v_div_scale_f32 v4, s[4:5], v3, v3, v2
; GFX6-IEEE-NEXT: v_rcp_f32_e32 v5, v4
; GFX6-IEEE-NEXT: v_div_scale_f32 v6, vcc, v2, v3, v2
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX6-IEEE-NEXT: v_fma_f32 v7, -v4, v5, 1.0
; GFX6-IEEE-NEXT: v_fma_f32 v5, v7, v5, v5
; GFX6-IEEE-NEXT: v_mul_f32_e32 v7, v6, v5
; GFX6-IEEE-NEXT: v_fma_f32 v8, -v4, v7, v6
; GFX6-IEEE-NEXT: v_fma_f32 v7, v8, v5, v7
; GFX6-IEEE-NEXT: v_fma_f32 v4, -v4, v7, v6
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v6, v0
; GFX6-IEEE-NEXT: v_div_fmas_f32 v0, v4, v5, v7
; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v0, v3, v2
; GFX6-IEEE-NEXT: v_div_fmas_f32 v4, v4, v5, v7
; GFX6-IEEE-NEXT: v_div_fixup_f32 v2, v4, v3, v2
; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v0
; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v0, v1, v0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v1, v0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v6
; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v6, v1, v6
; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
; GFX6-IEEE-NEXT: v_fma_f32 v7, -v2, v5, v4
; GFX6-IEEE-NEXT: v_fma_f32 v5, v7, v3, v5
; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v1, v6
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-IEEE-NEXT: v_bfe_u32 v1, v2, 0, 16
; GFX6-IEEE-NEXT: v_bfe_u32 v0, v0, 0, 16
; GFX6-IEEE-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-IEEE-NEXT: v_or_b32_e32 v0, v1, v0
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-FLUSH-LABEL: v_fdiv_v2f16_arcp_ulp25:
@ -1158,26 +1214,30 @@ define <2 x half> @v_fdiv_v2f16_arcp_ulp25(<2 x half> %a, <2 x half> %b) {
; GFX6-FLUSH-NEXT: v_fma_f32 v7, v8, v5, v7
; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v4, v7, v6
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; GFX6-FLUSH-NEXT: v_div_fmas_f32 v4, v4, v5, v7
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v5, v0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v4, v3, v2
; GFX6-FLUSH-NEXT: v_div_fmas_f32 v4, v4, v5, v7
; GFX6-FLUSH-NEXT: v_div_fixup_f32 v2, v4, v3, v2
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v5
; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v5, v1, v5
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v0
; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3
; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v0, v1, v0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v3, 1.0
; GFX6-FLUSH-NEXT: v_fma_f32 v3, v6, v3, v3
; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v4, v3
; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v4
; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v3, v6
; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v4
; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0
; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4
; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4
; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5
; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6
; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6
; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, v5
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6
; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v1, v0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_bfe_u32 v1, v2, 0, 16
; GFX6-FLUSH-NEXT: v_bfe_u32 v0, v0, 0, 16
; GFX6-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-FLUSH-NEXT: v_or_b32_e32 v0, v1, v0
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fdiv_v2f16_arcp_ulp25:
@ -1235,14 +1295,18 @@ define <2 x half> @v_fdiv_v2f16_arcp_afn_ulp25(<2 x half> %a, <2 x half> %b) {
; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: v_rcp_f32_e32 v3, v3
; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: v_rcp_f32_e32 v1, v1
; GFX6-NEXT: v_mul_f32_e32 v0, v0, v3
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: v_rcp_f32_e32 v3, v3
; GFX6-NEXT: v_mul_f32_e32 v1, v2, v1
; GFX6-NEXT: v_mul_f32_e32 v0, v0, v3
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16
; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fdiv_v2f16_arcp_afn_ulp25:


@ -93,6 +93,10 @@ define <2 x half> @v_fma_v2f16(<2 x half> %x, <2 x half> %y, <2 x half> %z) {
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: v_fma_f32 v1, v3, v4, v5
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16
; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fma_v2f16:
@ -135,6 +139,10 @@ define <2 x half> @v_fma_v2f16_fneg_lhs(<2 x half> %x, <2 x half> %y, <2 x half>
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: v_fma_f32 v1, v3, v4, v5
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16
; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fma_v2f16_fneg_lhs:
@ -179,6 +187,10 @@ define <2 x half> @v_fma_v2f16_fneg_rhs(<2 x half> %x, <2 x half> %y, <2 x half>
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: v_fma_f32 v1, v3, v4, v5
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16
; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fma_v2f16_fneg_rhs:
@ -225,6 +237,10 @@ define <2 x half> @v_fma_v2f16_fneg_lhs_rhs(<2 x half> %x, <2 x half> %y, <2 x h
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: v_fma_f32 v1, v3, v4, v5
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16
; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fma_v2f16_fneg_lhs_rhs:


@ -123,6 +123,10 @@ define <2 x half> @v_pow_v2f16(<2 x half> %x, <2 x half> %y) {
; GFX6-NEXT: v_exp_f32_e32 v1, v1
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16
; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_pow_v2f16:
@ -186,6 +190,10 @@ define <2 x half> @v_pow_v2f16_fneg_lhs(<2 x half> %x, <2 x half> %y) {
; GFX6-NEXT: v_exp_f32_e32 v1, v1
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16
; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_pow_v2f16_fneg_lhs:
@ -252,6 +260,10 @@ define <2 x half> @v_pow_v2f16_fneg_rhs(<2 x half> %x, <2 x half> %y) {
; GFX6-NEXT: v_exp_f32_e32 v1, v1
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16
; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_pow_v2f16_fneg_rhs:
@ -320,6 +332,10 @@ define <2 x half> @v_pow_v2f16_fneg_lhs_rhs(<2 x half> %x, <2 x half> %y) {
; GFX6-NEXT: v_exp_f32_e32 v1, v1
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16
; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_pow_v2f16_fneg_lhs_rhs:


@ -211,7 +211,6 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32([17 x i8]) #0 {
; GFX900: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
; GFX900: [[BUILD_VECTOR:%[0-9]+]]:_(<32 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32)
; GFX900: [[INT:%[0-9]+]]:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.kernarg.segment.ptr)
; GFX900: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32), [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32), [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32), [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32), [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32), [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32), [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32), [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32), [[UV22:%[0-9]+]]:_(s32), [[UV23:%[0-9]+]]:_(s32), [[UV24:%[0-9]+]]:_(s32), [[UV25:%[0-9]+]]:_(s32), [[UV26:%[0-9]+]]:_(s32), [[UV27:%[0-9]+]]:_(s32), [[UV28:%[0-9]+]]:_(s32), [[UV29:%[0-9]+]]:_(s32), [[UV30:%[0-9]+]]:_(s32), [[UV31:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<32 x s32>)
; GFX900: ADJCALLSTACKUP 0, 0, implicit-def $scc
; GFX900: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_void_func_v32i32
; GFX900: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]]
@ -232,6 +231,7 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32([17 x i8]) #0 {
; GFX900: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 20
; GFX900: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C3]](s32)
; GFX900: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]]
; GFX900: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32), [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32), [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32), [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32), [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32), [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32), [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32), [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32), [[UV22:%[0-9]+]]:_(s32), [[UV23:%[0-9]+]]:_(s32), [[UV24:%[0-9]+]]:_(s32), [[UV25:%[0-9]+]]:_(s32), [[UV26:%[0-9]+]]:_(s32), [[UV27:%[0-9]+]]:_(s32), [[UV28:%[0-9]+]]:_(s32), [[UV29:%[0-9]+]]:_(s32), [[UV30:%[0-9]+]]:_(s32), [[UV31:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<32 x s32>)
; GFX900: $vgpr0 = COPY [[UV]](s32)
; GFX900: $vgpr1 = COPY [[UV1]](s32)
; GFX900: $vgpr2 = COPY [[UV2]](s32)
@ -296,7 +296,6 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32([17 x i8]) #0 {
; GFX908: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
; GFX908: [[BUILD_VECTOR:%[0-9]+]]:_(<32 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32)
; GFX908: [[INT:%[0-9]+]]:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.kernarg.segment.ptr)
; GFX908: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32), [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32), [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32), [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32), [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32), [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32), [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32), [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32), [[UV22:%[0-9]+]]:_(s32), [[UV23:%[0-9]+]]:_(s32), [[UV24:%[0-9]+]]:_(s32), [[UV25:%[0-9]+]]:_(s32), [[UV26:%[0-9]+]]:_(s32), [[UV27:%[0-9]+]]:_(s32), [[UV28:%[0-9]+]]:_(s32), [[UV29:%[0-9]+]]:_(s32), [[UV30:%[0-9]+]]:_(s32), [[UV31:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<32 x s32>)
; GFX908: ADJCALLSTACKUP 0, 0, implicit-def $scc
; GFX908: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_void_func_v32i32
; GFX908: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]]
@ -317,6 +316,7 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32([17 x i8]) #0 {
; GFX908: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 20
; GFX908: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C3]](s32)
; GFX908: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]]
; GFX908: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32), [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32), [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32), [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32), [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32), [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32), [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32), [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32), [[UV22:%[0-9]+]]:_(s32), [[UV23:%[0-9]+]]:_(s32), [[UV24:%[0-9]+]]:_(s32), [[UV25:%[0-9]+]]:_(s32), [[UV26:%[0-9]+]]:_(s32), [[UV27:%[0-9]+]]:_(s32), [[UV28:%[0-9]+]]:_(s32), [[UV29:%[0-9]+]]:_(s32), [[UV30:%[0-9]+]]:_(s32), [[UV31:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<32 x s32>)
; GFX908: $vgpr0 = COPY [[UV]](s32)
; GFX908: $vgpr1 = COPY [[UV1]](s32)
; GFX908: $vgpr2 = COPY [[UV2]](s32)
@ -435,7 +435,6 @@ define void @test_func_call_external_void_func_v32i32([17 x i8]) #0 {
; GFX900: [[COPY25:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
; GFX900: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
; GFX900: [[BUILD_VECTOR:%[0-9]+]]:_(<32 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32)
; GFX900: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32), [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32), [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32), [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32), [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32), [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32), [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32), [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32), [[UV22:%[0-9]+]]:_(s32), [[UV23:%[0-9]+]]:_(s32), [[UV24:%[0-9]+]]:_(s32), [[UV25:%[0-9]+]]:_(s32), [[UV26:%[0-9]+]]:_(s32), [[UV27:%[0-9]+]]:_(s32), [[UV28:%[0-9]+]]:_(s32), [[UV29:%[0-9]+]]:_(s32), [[UV30:%[0-9]+]]:_(s32), [[UV31:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<32 x s32>)
; GFX900: ADJCALLSTACKUP 0, 0, implicit-def $scc
; GFX900: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_void_func_v32i32
; GFX900: [[COPY26:%[0-9]+]]:_(p4) = COPY [[COPY7]]
@ -446,6 +445,7 @@ define void @test_func_call_external_void_func_v32i32([17 x i8]) #0 {
; GFX900: [[COPY31:%[0-9]+]]:_(s32) = COPY [[COPY2]]
; GFX900: [[COPY32:%[0-9]+]]:_(s32) = COPY [[COPY1]]
; GFX900: [[COPY33:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
; GFX900: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32), [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32), [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32), [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32), [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32), [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32), [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32), [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32), [[UV22:%[0-9]+]]:_(s32), [[UV23:%[0-9]+]]:_(s32), [[UV24:%[0-9]+]]:_(s32), [[UV25:%[0-9]+]]:_(s32), [[UV26:%[0-9]+]]:_(s32), [[UV27:%[0-9]+]]:_(s32), [[UV28:%[0-9]+]]:_(s32), [[UV29:%[0-9]+]]:_(s32), [[UV30:%[0-9]+]]:_(s32), [[UV31:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<32 x s32>)
; GFX900: $vgpr0 = COPY [[UV]](s32)
; GFX900: $vgpr1 = COPY [[UV1]](s32)
; GFX900: $vgpr2 = COPY [[UV2]](s32)
@ -560,7 +560,6 @@ define void @test_func_call_external_void_func_v32i32([17 x i8]) #0 {
; GFX908: [[COPY25:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
; GFX908: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
; GFX908: [[BUILD_VECTOR:%[0-9]+]]:_(<32 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32)
; GFX908: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32), [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32), [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32), [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32), [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32), [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32), [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32), [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32), [[UV22:%[0-9]+]]:_(s32), [[UV23:%[0-9]+]]:_(s32), [[UV24:%[0-9]+]]:_(s32), [[UV25:%[0-9]+]]:_(s32), [[UV26:%[0-9]+]]:_(s32), [[UV27:%[0-9]+]]:_(s32), [[UV28:%[0-9]+]]:_(s32), [[UV29:%[0-9]+]]:_(s32), [[UV30:%[0-9]+]]:_(s32), [[UV31:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<32 x s32>)
; GFX908: ADJCALLSTACKUP 0, 0, implicit-def $scc
; GFX908: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_void_func_v32i32
; GFX908: [[COPY26:%[0-9]+]]:_(p4) = COPY [[COPY7]]
@ -571,6 +570,7 @@ define void @test_func_call_external_void_func_v32i32([17 x i8]) #0 {
; GFX908: [[COPY31:%[0-9]+]]:_(s32) = COPY [[COPY2]]
; GFX908: [[COPY32:%[0-9]+]]:_(s32) = COPY [[COPY1]]
; GFX908: [[COPY33:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
; GFX908: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32), [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32), [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32), [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32), [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32), [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32), [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32), [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32), [[UV22:%[0-9]+]]:_(s32), [[UV23:%[0-9]+]]:_(s32), [[UV24:%[0-9]+]]:_(s32), [[UV25:%[0-9]+]]:_(s32), [[UV26:%[0-9]+]]:_(s32), [[UV27:%[0-9]+]]:_(s32), [[UV28:%[0-9]+]]:_(s32), [[UV29:%[0-9]+]]:_(s32), [[UV30:%[0-9]+]]:_(s32), [[UV31:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<32 x s32>)
; GFX908: $vgpr0 = COPY [[UV]](s32)
; GFX908: $vgpr1 = COPY [[UV1]](s32)
; GFX908: $vgpr2 = COPY [[UV2]](s32)


@ -78,8 +78,9 @@ define amdgpu_gfx void @test_gfx_call_external_void_func_struct_i8_i32() #0 {
; CHECK: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load 4 from %ir.ptr0 + 4, addrspace 1)
; CHECK: ADJCALLSTACKUP 0, 0, implicit-def $scc
; CHECK: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_gfx_void_func_struct_i8_i32
; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD1]](s8)
; CHECK: $vgpr0 = COPY [[ANYEXT]](s32)
; CHECK: [[ANYEXT:%[0-9]+]]:_(s16) = G_ANYEXT [[LOAD1]](s8)
; CHECK: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[ANYEXT]](s16)
; CHECK: $vgpr0 = COPY [[ANYEXT1]](s32)
; CHECK: $vgpr1 = COPY [[LOAD2]](s32)
; CHECK: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
; CHECK: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY1]](<4 x s32>)
@ -106,8 +107,9 @@ define amdgpu_gfx void @test_gfx_call_external_void_func_struct_i8_i32_inreg() #
; CHECK: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load 4 from %ir.ptr0 + 4, addrspace 1)
; CHECK: ADJCALLSTACKUP 0, 0, implicit-def $scc
; CHECK: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_gfx_void_func_struct_i8_i32_inreg
; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD1]](s8)
; CHECK: $sgpr4 = COPY [[ANYEXT]](s32)
; CHECK: [[ANYEXT:%[0-9]+]]:_(s16) = G_ANYEXT [[LOAD1]](s8)
; CHECK: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[ANYEXT]](s16)
; CHECK: $sgpr4 = COPY [[ANYEXT1]](s32)
; CHECK: $sgpr5 = COPY [[LOAD2]](s32)
; CHECK: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
; CHECK: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY1]](<4 x s32>)


@ -2842,7 +2842,6 @@ define amdgpu_kernel void @test_call_external_v33i32_func_v33i32_i32(<33 x i32>
; GCN: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[INT]], [[C]](s64)
; GCN: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load 4 from %ir.idx.kernarg.offset.cast, align 8, addrspace 4)
; GCN: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %stack.0
; GCN: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](p1)
; GCN: ADJCALLSTACKUP 0, 0, implicit-def $scc
; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_v33i32_func_v33i32_i32
; GCN: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]]
@ -2864,6 +2863,7 @@ define amdgpu_kernel void @test_call_external_v33i32_func_v33i32_i32(<33 x i32>
; GCN: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C3]](s32)
; GCN: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]]
; GCN: $vgpr0 = COPY [[FRAME_INDEX]](p5)
; GCN: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](p1)
; GCN: $vgpr1 = COPY [[UV]](s32)
; GCN: $vgpr2 = COPY [[UV1]](s32)
; GCN: $vgpr3 = COPY [[LOAD1]](s32)

File diff suppressed because it is too large


@ -1867,7 +1867,8 @@ define void @void_func_v32i32_i1_i8_i16(<32 x i32> %arg0, i1 %arg1, i8 %arg2, i1
; CHECK: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.3
; CHECK: [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[FRAME_INDEX]](p5) :: (invariant load 1 from %fixed-stack.3, align 16, addrspace 5)
; CHECK: [[FRAME_INDEX1:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.2
; CHECK: [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[FRAME_INDEX1]](p5) :: (invariant load 1 from %fixed-stack.2, align 4, addrspace 5)
; CHECK: [[LOAD1:%[0-9]+]]:_(s16) = G_LOAD [[FRAME_INDEX1]](p5) :: (invariant load 2 from %fixed-stack.2, align 4, addrspace 5)
; CHECK: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[LOAD1]](s16)
; CHECK: [[FRAME_INDEX2:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.1
; CHECK: [[LOAD2:%[0-9]+]]:_(s16) = G_LOAD [[FRAME_INDEX2]](p5) :: (invariant load 2 from %fixed-stack.1, align 8, addrspace 5)
; CHECK: [[FRAME_INDEX3:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.0
@ -1880,7 +1881,7 @@ define void @void_func_v32i32_i1_i8_i16(<32 x i32> %arg0, i1 %arg1, i8 %arg2, i1
; CHECK: [[COPY36:%[0-9]+]]:_(p1) = COPY [[DEF]](p1)
; CHECK: G_STORE [[BUILD_VECTOR]](<32 x s32>), [[DEF]](p1) :: (volatile store 128 into `<32 x i32> addrspace(1)* undef`, addrspace 1)
; CHECK: G_STORE [[LOAD]](s1), [[COPY33]](p1) :: (volatile store 1 into `i1 addrspace(1)* undef`, addrspace 1)
; CHECK: G_STORE [[LOAD1]](s8), [[COPY34]](p1) :: (volatile store 1 into `i8 addrspace(1)* undef`, addrspace 1)
; CHECK: G_STORE [[TRUNC]](s8), [[COPY34]](p1) :: (volatile store 1 into `i8 addrspace(1)* undef`, addrspace 1)
; CHECK: G_STORE [[LOAD2]](s16), [[COPY35]](p1) :: (volatile store 2 into `i16 addrspace(1)* undef`, addrspace 1)
; CHECK: G_STORE [[LOAD3]](s16), [[COPY36]](p1) :: (volatile store 2 into `half addrspace(1)* undef`, addrspace 1)
; CHECK: [[COPY37:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY32]]


@ -491,6 +491,8 @@ define <2 x i16> @v_lshr_v2i16(<2 x i16> %value, <2 x i16> %amount) {
; GFX6-NEXT: v_and_b32_e32 v0, s4, v0
; GFX6-NEXT: v_lshrrev_b32_e32 v0, v1, v0
; GFX6-NEXT: v_lshrrev_b32_e32 v1, v3, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_lshr_v2i16:
@ -515,9 +517,10 @@ define <2 x i16> @v_lshr_v2i16_15(<2 x i16> %value) {
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v0
; GFX6-NEXT: v_lshrrev_b32_e32 v2, 15, v1
; GFX6-NEXT: v_lshrrev_b32_e32 v1, 31, v0
; GFX6-NEXT: v_mov_b32_e32 v0, v2
; GFX6-NEXT: v_lshrrev_b32_e32 v0, 31, v0
; GFX6-NEXT: v_lshrrev_b32_e32 v1, 15, v1
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: v_or_b32_e32 v0, v1, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_lshr_v2i16_15:


@ -363,20 +363,12 @@ define amdgpu_ps { i32, i32 } @s_orn2_v2i16_multi_foldable_use(<2 x i16> inreg %
}
define <2 x i16> @v_orn2_v2i16(<2 x i16> %src0, <2 x i16> %src1) {
; GFX6-LABEL: v_orn2_v2i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_xor_b32_e32 v1, -1, v1
; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_orn2_v2i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1
; GFX9-NEXT: v_or_b32_e32 v0, v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GCN-LABEL: v_orn2_v2i16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_xor_b32_e32 v1, -1, v1
; GCN-NEXT: v_or_b32_e32 v0, v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
%not.src1 = xor <2 x i16> %src1, <i16 -1, i16 -1>
%or = or <2 x i16> %src0, %not.src1
ret <2 x i16> %or


@ -178,11 +178,15 @@ define <2 x half> @v_roundeven_v2f16(<2 x half> %x) {
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX6-NEXT: v_rndne_f32_e32 v0, v1
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: v_rndne_f32_e32 v1, v2
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: v_rndne_f32_e32 v1, v1
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-NEXT: v_rndne_f32_e32 v0, v0
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16
; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: v_or_b32_e32 v0, v1, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_roundeven_v2f16:
@ -190,11 +194,15 @@ define <2 x half> @v_roundeven_v2f16(<2 x half> %x) {
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX7-NEXT: v_rndne_f32_e32 v0, v1
; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT: v_rndne_f32_e32 v1, v2
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: v_rndne_f32_e32 v1, v1
; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX7-NEXT: v_rndne_f32_e32 v0, v0
; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT: v_bfe_u32 v1, v1, 0, 16
; GFX7-NEXT: v_bfe_u32 v0, v0, 0, 16
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_roundeven_v2f16:
@ -226,11 +234,15 @@ define <2 x half> @v_roundeven_v2f16_fneg(<2 x half> %x) {
; GFX6-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX6-NEXT: v_rndne_f32_e32 v0, v1
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: v_rndne_f32_e32 v1, v2
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: v_rndne_f32_e32 v1, v1
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-NEXT: v_rndne_f32_e32 v0, v0
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16
; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: v_or_b32_e32 v0, v1, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_roundeven_v2f16_fneg:
@ -239,11 +251,15 @@ define <2 x half> @v_roundeven_v2f16_fneg(<2 x half> %x) {
; GFX7-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX7-NEXT: v_rndne_f32_e32 v0, v1
; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT: v_rndne_f32_e32 v1, v2
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: v_rndne_f32_e32 v1, v1
; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX7-NEXT: v_rndne_f32_e32 v0, v0
; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT: v_bfe_u32 v1, v1, 0, 16
; GFX7-NEXT: v_bfe_u32 v0, v0, 0, 16
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_roundeven_v2f16_fneg:


@ -2724,8 +2724,13 @@ define <2 x i16> @v_saddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
; GFX6-NEXT: v_max_i32_e32 v2, v4, v2
; GFX6-NEXT: v_min_i32_e32 v2, v2, v3
; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2
; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0
; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1
; GFX6-NEXT: s_mov_b32 s4, 0xffff
; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0
; GFX6-NEXT: v_and_b32_e32 v1, s4, v1
; GFX6-NEXT: v_and_b32_e32 v0, s4, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_saddsat_v2i16:


@ -485,11 +485,16 @@ define <2 x i16> @v_shl_v2i16(<2 x i16> %value, <2 x i16> %amount) {
; GFX6-LABEL: v_shl_v2i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s4, 0xffff
; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX6-NEXT: v_and_b32_e32 v1, s4, v1
; GFX6-NEXT: v_lshlrev_b32_e32 v0, v1, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v1, v3, v2
; GFX6-NEXT: v_and_b32_e32 v1, s4, v1
; GFX6-NEXT: v_and_b32_e32 v0, s4, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_shl_v2i16:
@ -515,7 +520,9 @@ define <2 x i16> @v_shl_v2i16_15(<2 x i16> %value) {
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 15, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 15, v1
; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 31, v1
; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_shl_v2i16_15:


@ -2704,14 +2704,19 @@ define <2 x i16> @v_ssubsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX6-NEXT: v_max_i32_e32 v3, -1, v1
; GFX6-NEXT: v_min_i32_e32 v4, -1, v1
; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s4, v3
; GFX6-NEXT: v_min_i32_e32 v4, -1, v1
; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s5, v4
; GFX6-NEXT: v_max_i32_e32 v2, v3, v2
; GFX6-NEXT: v_min_i32_e32 v2, v2, v4
; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2
; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0
; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1
; GFX6-NEXT: s_mov_b32 s4, 0xffff
; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0
; GFX6-NEXT: v_and_b32_e32 v1, s4, v1
; GFX6-NEXT: v_and_b32_e32 v0, s4, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_ssubsat_v2i16:


@ -1760,8 +1760,10 @@ define <2 x i16> @v_uaddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
; GFX6-NEXT: v_xor_b32_e32 v3, -1, v1
; GFX6-NEXT: v_min_u32_e32 v2, v3, v2
; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2
; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_uaddsat_v2i16:


@ -1672,8 +1672,10 @@ define <2 x i16> @v_usubsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX6-NEXT: v_min_u32_e32 v2, v1, v2
; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2
; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_usubsat_v2i16:


@ -1988,9 +1988,9 @@ TEST_F(AArch64GISelMITest, LibcallSRem) {
CHECK: $x1 = COPY [[COPY]]
CHECK: BL &__moddi3
CHECK: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[ANYEXT]]
CHECK: [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[ANYEXT]]
CHECK: $x0 = COPY [[UV]]
CHECK: $x1 = COPY [[UV1]]
CHECK: [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[ANYEXT]]
CHECK: $x2 = COPY [[UV2]]
CHECK: $x3 = COPY [[UV3]]
CHECK: BL &__modti3
@ -2045,9 +2045,9 @@ TEST_F(AArch64GISelMITest, LibcallURem) {
CHECK: $x1 = COPY [[COPY]]
CHECK: BL &__umoddi3
CHECK: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[ANYEXT]]
CHECK: [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[ANYEXT]]
CHECK: $x0 = COPY [[UV]]
CHECK: $x1 = COPY [[UV1]]
CHECK: [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[ANYEXT]]
CHECK: $x2 = COPY [[UV2]]
CHECK: $x3 = COPY [[UV3]]
CHECK: BL &__umodti3