[SelectionDAG] Optimize VSELECT->SETCC of incompatible or illegal types.
Don't scalarize VSELECT->SETCC when the operands/results need to be widened, or when the types of the SETCC operands differ from those of the VSELECT. Both (VSELECT (SETCC)) and (VSELECT (AND/OR/XOR (SETCC, SETCC))) are handled. The previous splitting of VSELECT->SETCC in DAGCombiner::visitVSELECT() is no longer needed and has been removed.

Updated tests:

test/CodeGen/ARM/vuzp.ll
test/CodeGen/NVPTX/f16x2-instructions.ll
test/CodeGen/X86/2011-10-19-widen_vselect.ll
test/CodeGen/X86/2011-10-21-widen-cmp.ll
test/CodeGen/X86/psubus.ll
test/CodeGen/X86/vselect-pcmp.ll

Review: Eli Friedman, Simon Pilgrim
https://reviews.llvm.org/D29489

llvm-svn: 297930
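For illustration, a minimal IR sketch of the kind of pattern this targets (modeled on the cmpsel_trunc test updated below; the function name is hypothetical). The compare operates on <8 x i32> operands while the select produces <8 x i8>, so before this change the SETCC could end up scalarized during type legalization:

define <8 x i8> @cmpsel_example(<8 x i8> %in0, <8 x i8> %in1, <8 x i32> %cmp0, <8 x i32> %cmp1) {
  ; The mask comes from a compare of wider elements (i32) than the selected
  ; values (i8); the widened VSELECT handling truncates the mask to the
  ; select's element size instead of unrolling the SETCC.
  %c = icmp ult <8 x i32> %cmp0, %cmp1
  %sel = select <8 x i1> %c, <8 x i8> %in0, <8 x i8> %in1
  ret <8 x i8> %sel
}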
This commit is contained in:
parent ced11aa6c7
commit cbcaf13b31
lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -6557,34 +6557,6 @@ SDValue DAGCombiner::visitVSELECT(SDNode *N) {
  if (SimplifySelectOps(N, N1, N2))
    return SDValue(N, 0);  // Don't revisit N.

  // If the VSELECT result requires splitting and the mask is provided by a
  // SETCC, then split both nodes and its operands before legalization. This
  // prevents the type legalizer from unrolling SETCC into scalar comparisons
  // and enables future optimizations (e.g. min/max pattern matching on X86).
  if (N0.getOpcode() == ISD::SETCC) {
    EVT VT = N->getValueType(0);

    // Check if any splitting is required.
    if (TLI.getTypeAction(*DAG.getContext(), VT) !=
        TargetLowering::TypeSplitVector)
      return SDValue();

    SDValue Lo, Hi, CCLo, CCHi, LL, LH, RL, RH;
    std::tie(CCLo, CCHi) = SplitVSETCC(N0.getNode(), DAG);
    std::tie(LL, LH) = DAG.SplitVectorOperand(N, 1);
    std::tie(RL, RH) = DAG.SplitVectorOperand(N, 2);

    Lo = DAG.getNode(N->getOpcode(), DL, LL.getValueType(), CCLo, LL, RL);
    Hi = DAG.getNode(N->getOpcode(), DL, LH.getValueType(), CCHi, LH, RH);

    // Add the new VSELECT nodes to the work list in case they need to be split
    // again.
    AddToWorklist(Lo.getNode());
    AddToWorklist(Hi.getNode());

    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
  }

  // Fold (vselect (build_vector all_ones), N1, N2) -> N1
  if (ISD::isBuildVectorAllOnes(N0.getNode()))
    return N1;
lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -1089,6 +1089,10 @@ SDValue DAGTypeLegalizer::PromoteIntOp_SELECT(SDNode *N, unsigned OpNo) {
  SDValue Cond = N->getOperand(0);
  EVT OpTy = N->getOperand(1).getValueType();

  if (N->getOpcode() == ISD::VSELECT)
    if (SDValue Res = WidenVSELECTAndMask(N))
      return Res;

  // Promote all the way up to the canonical SetCC type.
  EVT OpVT = N->getOpcode() == ISD::SELECT ? OpTy.getScalarType() : OpTy;
  Cond = PromoteTargetBoolean(Cond, OpVT);
lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -719,6 +719,7 @@ private:
  SDValue WidenVecRes_MGATHER(MaskedGatherSDNode* N);
  SDValue WidenVecRes_SCALAR_TO_VECTOR(SDNode* N);
  SDValue WidenVecRes_SELECT(SDNode* N);
  SDValue WidenVSELECTAndMask(SDNode *N);
  SDValue WidenVecRes_SELECT_CC(SDNode* N);
  SDValue WidenVecRes_SETCC(SDNode* N);
  SDValue WidenVecRes_UNDEF(SDNode *N);
@@ -788,6 +789,13 @@ private:
  /// By default, the vector will be widened with undefined values.
  SDValue ModifyToType(SDValue InOp, EVT NVT, bool FillWithZeroes = false);

  /// Return a mask of vector type MaskVT to replace InMask. Also adjust
  /// MaskVT to ToMaskVT if needed with vector extension or truncation.
  SDValue convertMask(SDValue InMask, EVT MaskVT, EVT ToMaskVT);

  /// Get the target mask VT, and widen if needed.
  EVT getSETCCWidenedResultTy(SDValue SetCC);

  //===--------------------------------------------------------------------===//
  // Generic Splitting: LegalizeTypesGeneric.cpp
  //===--------------------------------------------------------------------===//
lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
@@ -512,8 +512,24 @@ void DAGTypeLegalizer::SplitRes_MERGE_VALUES(SDNode *N, unsigned ResNo,
  GetSplitOp(Op, Lo, Hi);
}

void DAGTypeLegalizer::SplitRes_SELECT(SDNode *N, SDValue &Lo,
                                       SDValue &Hi) {
static std::pair<SDValue, SDValue> SplitVSETCC(const SDNode *N,
                                               SelectionDAG &DAG) {
  SDLoc DL(N);
  EVT LoVT, HiVT;
  std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));

  // Split the inputs.
  SDValue Lo, Hi, LL, LH, RL, RH;
  std::tie(LL, LH) = DAG.SplitVectorOperand(N, 0);
  std::tie(RL, RH) = DAG.SplitVectorOperand(N, 1);

  Lo = DAG.getNode(N->getOpcode(), DL, LoVT, LL, RL, N->getOperand(2));
  Hi = DAG.getNode(N->getOpcode(), DL, HiVT, LH, RH, N->getOperand(2));

  return std::make_pair(Lo, Hi);
}

void DAGTypeLegalizer::SplitRes_SELECT(SDNode *N, SDValue &Lo, SDValue &Hi) {
  SDValue LL, LH, RL, RH, CL, CH;
  SDLoc dl(N);
  GetSplitOp(N->getOperand(1), LL, LH);
@@ -522,9 +538,16 @@ void DAGTypeLegalizer::SplitRes_SELECT(SDNode *N, SDValue &Lo,
  SDValue Cond = N->getOperand(0);
  CL = CH = Cond;
  if (Cond.getValueType().isVector()) {
    if (SDValue Res = WidenVSELECTAndMask(N))
      std::tie(CL, CH) = DAG.SplitVector(Res->getOperand(0), dl);
    // It seems to improve code to generate two narrow SETCCs as opposed to
    // splitting a wide result vector.
    else if (Cond.getOpcode() == ISD::SETCC)
      std::tie(CL, CH) = SplitVSETCC(Cond.getNode(), DAG);
    // Check if there are already split versions of the vector available and
    // use those instead of splitting the mask operand again.
    if (getTypeAction(Cond.getValueType()) == TargetLowering::TypeSplitVector)
    else if (getTypeAction(Cond.getValueType()) ==
             TargetLowering::TypeSplitVector)
      GetSplitVector(Cond, CL, CH);
    else
      std::tie(CL, CH) = DAG.SplitVector(Cond, dl);
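As a rough sketch of the case SplitVSETCC covers (assuming a target where <8 x i32> is split into two <4 x i32> halves; names are hypothetical), the mask compare is re-emitted as two narrow SETCCs feeding the two half-selects, rather than one wide SETCC whose result vector would then be split:

define <8 x i32> @split_sel_example(<8 x i32> %a, <8 x i32> %b) {
  ; SplitRes_SELECT splits the select; SplitVSETCC turns the single wide
  ; compare into two <4 x i32> compares, one per half.
  %c = icmp slt <8 x i32> %a, %b
  %r = select <8 x i1> %c, <8 x i32> %a, <8 x i32> %b
  ret <8 x i32> %r
}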
lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -2864,6 +2864,212 @@ SDValue DAGTypeLegalizer::WidenVecRes_SCALAR_TO_VECTOR(SDNode *N) {
                     WidenVT, N->getOperand(0));
}

// Return true if this is a node that could have two SETCCs as operands.
static inline bool isLogicalMaskOp(unsigned Opcode) {
  switch (Opcode) {
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
    return true;
  }
  return false;
}

// This is used just for the assert in convertMask(). Check that this is
// either a SETCC or a SETCC previously handled by convertMask().
#ifndef NDEBUG
static inline bool isSETCCorConvertedSETCC(SDValue N) {
  if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR)
    N = N.getOperand(0);
  else if (N.getOpcode() == ISD::CONCAT_VECTORS) {
    for (unsigned i = 1; i < N->getNumOperands(); ++i)
      if (!N->getOperand(i)->isUndef())
        return false;
    N = N.getOperand(0);
  }

  if (N.getOpcode() == ISD::TRUNCATE)
    N = N.getOperand(0);
  else if (N.getOpcode() == ISD::SIGN_EXTEND)
    N = N.getOperand(0);

  return (N.getOpcode() == ISD::SETCC);
}
#endif

// Return a mask of vector type MaskVT to replace InMask. Also adjust MaskVT
// to ToMaskVT if needed with vector extension or truncation.
SDValue DAGTypeLegalizer::convertMask(SDValue InMask, EVT MaskVT,
                                      EVT ToMaskVT) {
  LLVMContext &Ctx = *DAG.getContext();

  // Currently a SETCC or an AND/OR/XOR with two SETCCs are handled.
  unsigned InMaskOpc = InMask->getOpcode();
  assert((InMaskOpc == ISD::SETCC ||
          (isLogicalMaskOp(InMaskOpc) &&
           isSETCCorConvertedSETCC(InMask->getOperand(0)) &&
           isSETCCorConvertedSETCC(InMask->getOperand(1)))) &&
         "Unexpected mask argument.");

  // Make a new Mask node, with a legal result VT.
  SmallVector<SDValue, 4> Ops;
  for (unsigned i = 0; i < InMask->getNumOperands(); ++i)
    Ops.push_back(InMask->getOperand(i));
  SDValue Mask = DAG.getNode(InMaskOpc, SDLoc(InMask), MaskVT, Ops);

  // If MaskVT has smaller or bigger elements than ToMaskVT, a vector sign
  // extend or truncate is needed.
  unsigned MaskScalarBits = MaskVT.getScalarSizeInBits();
  unsigned ToMaskScalBits = ToMaskVT.getScalarSizeInBits();
  if (MaskScalarBits < ToMaskScalBits) {
    EVT ExtVT = EVT::getVectorVT(Ctx, ToMaskVT.getVectorElementType(),
                                 MaskVT.getVectorNumElements());
    Mask = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(Mask), ExtVT, Mask);
  } else if (MaskScalarBits > ToMaskScalBits) {
    EVT TruncVT = EVT::getVectorVT(Ctx, ToMaskVT.getVectorElementType(),
                                   MaskVT.getVectorNumElements());
    Mask = DAG.getNode(ISD::TRUNCATE, SDLoc(Mask), TruncVT, Mask);
  }

  assert(Mask->getValueType(0).getScalarSizeInBits() ==
             ToMaskVT.getScalarSizeInBits() &&
         "Mask should have the right element size by now.");

  // Adjust Mask to the right number of elements.
  unsigned CurrMaskNumEls = Mask->getValueType(0).getVectorNumElements();
  if (CurrMaskNumEls > ToMaskVT.getVectorNumElements()) {
    MVT IdxTy = TLI.getVectorIdxTy(DAG.getDataLayout());
    SDValue ZeroIdx = DAG.getConstant(0, SDLoc(Mask), IdxTy);
    Mask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Mask), ToMaskVT, Mask,
                       ZeroIdx);
  } else if (CurrMaskNumEls < ToMaskVT.getVectorNumElements()) {
    unsigned NumSubVecs = (ToMaskVT.getVectorNumElements() / CurrMaskNumEls);
    EVT SubVT = Mask->getValueType(0);
    SmallVector<SDValue, 16> SubConcatOps(NumSubVecs);
    SubConcatOps[0] = Mask;
    for (unsigned i = 1; i < NumSubVecs; ++i)
      SubConcatOps[i] = DAG.getUNDEF(SubVT);
    Mask =
        DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Mask), ToMaskVT, SubConcatOps);
  }

  assert((Mask->getValueType(0) == ToMaskVT) &&
         "A mask of ToMaskVT should have been produced by now.");

  return Mask;
}

// Get the target mask VT, and widen if needed.
EVT DAGTypeLegalizer::getSETCCWidenedResultTy(SDValue SetCC) {
  assert(SetCC->getOpcode() == ISD::SETCC);
  LLVMContext &Ctx = *DAG.getContext();
  EVT MaskVT = getSetCCResultType(SetCC->getOperand(0).getValueType());
  if (getTypeAction(MaskVT) == TargetLowering::TypeWidenVector)
    MaskVT = TLI.getTypeToTransformTo(Ctx, MaskVT);
  return MaskVT;
}

// This method tries to handle VSELECT and its mask by legalizing operands
// (which may require widening) and if needed adjusting the mask vector type
// to match that of the VSELECT. Without it, many cases end up with
// scalarization of the SETCC, with many unnecessary instructions.
SDValue DAGTypeLegalizer::WidenVSELECTAndMask(SDNode *N) {
  LLVMContext &Ctx = *DAG.getContext();
  SDValue Cond = N->getOperand(0);

  if (N->getOpcode() != ISD::VSELECT)
    return SDValue();

  if (Cond->getOpcode() != ISD::SETCC && !isLogicalMaskOp(Cond->getOpcode()))
    return SDValue();

  // If this is a split VSELECT that was previously already handled, do
  // nothing.
  if (Cond->getValueType(0).getScalarSizeInBits() != 1)
    return SDValue();

  EVT VSelVT = N->getValueType(0);
  // Only handle vector types which are a power of 2.
  if (!isPowerOf2_64(VSelVT.getSizeInBits()))
    return SDValue();

  // Don't touch if this will be scalarized.
  EVT FinalVT = VSelVT;
  while (getTypeAction(FinalVT) == TargetLowering::TypeSplitVector)
    FinalVT = EVT::getVectorVT(Ctx, FinalVT.getVectorElementType(),
                               FinalVT.getVectorNumElements() / 2);
  if (FinalVT.getVectorNumElements() == 1)
    return SDValue();

  // If there is support for an i1 vector mask, don't touch.
  if (Cond.getOpcode() == ISD::SETCC) {
    EVT SetCCOpVT = Cond->getOperand(0).getValueType();
    while (TLI.getTypeAction(Ctx, SetCCOpVT) != TargetLowering::TypeLegal)
      SetCCOpVT = TLI.getTypeToTransformTo(Ctx, SetCCOpVT);
    EVT SetCCResVT = getSetCCResultType(SetCCOpVT);
    if (SetCCResVT.getScalarSizeInBits() == 1)
      return SDValue();
  }

  // Get the VT and operands for VSELECT, and widen if needed.
  SDValue VSelOp1 = N->getOperand(1);
  SDValue VSelOp2 = N->getOperand(2);
  if (getTypeAction(VSelVT) == TargetLowering::TypeWidenVector) {
    VSelVT = TLI.getTypeToTransformTo(Ctx, VSelVT);
    VSelOp1 = GetWidenedVector(VSelOp1);
    VSelOp2 = GetWidenedVector(VSelOp2);
  }

  // The mask of the VSELECT should have integer elements.
  EVT ToMaskVT = VSelVT;
  if (!ToMaskVT.getScalarType().isInteger())
    ToMaskVT = ToMaskVT.changeVectorElementTypeToInteger();

  SDValue Mask;
  if (Cond->getOpcode() == ISD::SETCC) {
    EVT MaskVT = getSETCCWidenedResultTy(Cond);
    Mask = convertMask(Cond, MaskVT, ToMaskVT);
  } else if (isLogicalMaskOp(Cond->getOpcode()) &&
             Cond->getOperand(0).getOpcode() == ISD::SETCC &&
             Cond->getOperand(1).getOpcode() == ISD::SETCC) {
    // Cond is (AND/OR/XOR (SETCC, SETCC))
    SDValue SETCC0 = Cond->getOperand(0);
    SDValue SETCC1 = Cond->getOperand(1);
    EVT VT0 = getSETCCWidenedResultTy(SETCC0);
    EVT VT1 = getSETCCWidenedResultTy(SETCC1);
    unsigned ScalarBits0 = VT0.getScalarSizeInBits();
    unsigned ScalarBits1 = VT1.getScalarSizeInBits();
    unsigned ScalarBits_ToMask = ToMaskVT.getScalarSizeInBits();
    EVT MaskVT;
    // If the two SETCCs have different VTs, either extend/truncate one of
    // them to the other "towards" ToMaskVT, or truncate one and extend the
    // other to ToMaskVT.
    if (ScalarBits0 != ScalarBits1) {
      EVT NarrowVT = ((ScalarBits0 < ScalarBits1) ? VT0 : VT1);
      EVT WideVT = ((NarrowVT == VT0) ? VT1 : VT0);
      if (ScalarBits_ToMask >= WideVT.getScalarSizeInBits())
        MaskVT = WideVT;
      else if (ScalarBits_ToMask <= NarrowVT.getScalarSizeInBits())
        MaskVT = NarrowVT;
      else
        MaskVT = ToMaskVT;
    } else
      // If the two SETCCs have the same VT, don't change it.
      MaskVT = VT0;

    // Make new SETCCs and logical nodes.
    SETCC0 = convertMask(SETCC0, VT0, MaskVT);
    SETCC1 = convertMask(SETCC1, VT1, MaskVT);
    Cond = DAG.getNode(Cond->getOpcode(), SDLoc(Cond), MaskVT, SETCC0, SETCC1);

    // Convert the logical op for VSELECT if needed.
    Mask = convertMask(Cond, MaskVT, ToMaskVT);
  } else
    return SDValue();

  return DAG.getNode(ISD::VSELECT, SDLoc(N), VSelVT, Mask, VSelOp1, VSelOp2);
}

SDValue DAGTypeLegalizer::WidenVecRes_SELECT(SDNode *N) {
  EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
  unsigned WidenNumElts = WidenVT.getVectorNumElements();
@@ -2871,6 +3077,9 @@ SDValue DAGTypeLegalizer::WidenVecRes_SELECT(SDNode *N) {
  SDValue Cond1 = N->getOperand(0);
  EVT CondVT = Cond1.getValueType();
  if (CondVT.isVector()) {
    if (SDValue Res = WidenVSELECTAndMask(N))
      return Res;

    EVT CondEltVT = CondVT.getVectorElementType();
    EVT CondWidenVT = EVT::getVectorVT(*DAG.getContext(),
                                       CondEltVT, WidenNumElts);
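A hedged IR-level illustration of the (VSELECT (AND/OR/XOR (SETCC, SETCC))) case handled above (names are hypothetical): the two compares legalize to masks with different element sizes, so convertMask() brings both to a common MaskVT before the logical op, and the combined mask is then extended or truncated to match the select type:

define <8 x i8> @logical_mask_example(<8 x i8> %x, <8 x i8> %y, <8 x i16> %a, <8 x i16> %b, <8 x i32> %c, <8 x i32> %d) {
  ; One mask legalizes with i16 elements and the other with i32 elements;
  ; both are converted to one MaskVT, combined with AND, then adjusted
  ; to the <8 x i8> select's integer element type.
  %m0 = icmp eq <8 x i16> %a, %b
  %m1 = icmp ugt <8 x i32> %c, %d
  %m = and <8 x i1> %m0, %m1
  %r = select <8 x i1> %m, <8 x i8> %x, <8 x i8> %y
  ret <8 x i8> %r
}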
test/CodeGen/ARM/vuzp.ll
@@ -318,33 +318,29 @@ entry:
  ret void
}

define <8 x i8> @vuzp_trunc(<8 x i8> %in0, <8 x i8> %in1, <8 x i32> %cmp0, <8 x i32> %cmp1) {
define <8 x i8> @cmpsel_trunc(<8 x i8> %in0, <8 x i8> %in1, <8 x i32> %cmp0, <8 x i32> %cmp1) {
; In order to create the select we need to truncate the vcgt result from a vector of i32 to a vector of i8.
; This results in a build_vector with mismatched types. We will generate two vmovn.i32 instructions to
; truncate from i32 to i16 and one vuzp to perform the final truncation for i8.
; CHECK-LABEL: vuzp_trunc:
; truncate from i32 to i16 and one vmovn.i16 to perform the final truncation for i8.
; CHECK-LABEL: cmpsel_trunc:
; CHECK: @ BB#0:
; CHECK-NEXT: .save {r4, r5, r11, lr}
; CHECK-NEXT: push {r4, r5, r11, lr}
; CHECK-NEXT: add r12, sp, #48
; CHECK-NEXT: add lr, sp, #16
; CHECK-NEXT: add r4, sp, #64
; CHECK-NEXT: add r5, sp, #32
; CHECK-NEXT: add r12, sp, #48
; CHECK-NEXT: add lr, sp, #16
; CHECK-NEXT: vld1.64 {d16, d17}, [r5]
; CHECK-NEXT: vld1.64 {d18, d19}, [r4]
; CHECK-NEXT: vld1.64 {d20, d21}, [lr]
; CHECK-NEXT: vld1.64 {d22, d23}, [r12]
; CHECK-NEXT: vcgt.u32 q8, q9, q8
; CHECK-NEXT: vcgt.u32 q9, q11, q10
; CHECK-NEXT: vmovn.i32 d16, q8
; CHECK-NEXT: vmovn.i32 d17, q9
; CHECK-NEXT: vmov.i8 d18, #0x7
; CHECK-NEXT: vmov d19, r0, r1
; CHECK-NEXT: vuzp.8 d17, d16
; CHECK-NEXT: vneg.s8 d16, d18
; CHECK-NEXT: vshl.i8 d17, d17, #7
; CHECK-NEXT: vmovn.i32 d17, q8
; CHECK-NEXT: vmovn.i32 d16, q9
; CHECK-NEXT: vmov d18, r2, r3
; CHECK-NEXT: vshl.s8 d16, d17, d16
; CHECK-NEXT: vmov d19, r0, r1
; CHECK-NEXT: vmovn.i16 d16, q8
; CHECK-NEXT: vbsl d16, d19, d18
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: pop {r4, r5, r11, lr}
test/CodeGen/NVPTX/f16x2-instructions.ll
@@ -422,17 +422,10 @@ define <2 x half> @test_select_cc(<2 x half> %a, <2 x half> %b, <2 x half> %c, <
; CHECK-DAG: ld.param.v2.f32 {[[B0:%f[0-9]+]], [[B1:%f[0-9]+]]}, [test_select_cc_f32_f16_param_1];
; CHECK-DAG: ld.param.b32 [[C:%hh[0-9]+]], [test_select_cc_f32_f16_param_2];
; CHECK-DAG: ld.param.b32 [[D:%hh[0-9]+]], [test_select_cc_f32_f16_param_3];
; CHECK-DAG: mov.b32 {[[C0:%h[0-9]+]], [[C1:%h[0-9]+]]}, [[C]]
; CHECK-DAG: mov.b32 {[[D0:%h[0-9]+]], [[D1:%h[0-9]+]]}, [[D]]
;
; TODO: Currently DAG combiner scalarizes setcc before we can lower it to setp.f16x2.
; We'd like to see this instruction:
; CHECK-F16-NOTYET: setp.neu.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[C]], [[D]]
; But we end up with a pair of scalar instances of it instead:
; CHECK-F16-DAG: setp.neu.f16 [[P0:%p[0-9]+]], [[C0]], [[D0]]
; CHECK-F16-DAG: setp.neu.f16 [[P1:%p[0-9]+]], [[C1]], [[D1]]

; CHECK-F16: setp.neu.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[C]], [[D]]
; CHECK-NOF16-DAG: mov.b32 {[[C0:%h[0-9]+]], [[C1:%h[0-9]+]]}, [[C]]
; CHECK-NOF16-DAG: mov.b32 {[[D0:%h[0-9]+]], [[D1:%h[0-9]+]]}, [[D]]
; CHECK-NOF16-DAG: cvt.f32.f16 [[DF0:%f[0-9]+]], [[D0]];
; CHECK-NOF16-DAG: cvt.f32.f16 [[CF0:%f[0-9]+]], [[C0]];
; CHECK-NOF16-DAG: cvt.f32.f16 [[DF1:%f[0-9]+]], [[D1]];
test/CodeGen/X86/2011-10-19-widen_vselect.ll
@@ -27,7 +27,6 @@ define void @complex_inreg_work(<2 x float> %a, <2 x float> %b) {
; X32: # BB#0: # %entry
; X32-NEXT: movaps %xmm0, %xmm2
; X32-NEXT: cmpordps %xmm0, %xmm0
; X32-NEXT: pslld $31, %xmm0
; X32-NEXT: blendvps %xmm0, %xmm2, %xmm1
; X32-NEXT: extractps $1, %xmm1, (%eax)
; X32-NEXT: movss %xmm1, (%eax)
@@ -37,7 +36,6 @@ define void @complex_inreg_work(<2 x float> %a, <2 x float> %b) {
; X64: # BB#0: # %entry
; X64-NEXT: movaps %xmm0, %xmm2
; X64-NEXT: cmpordps %xmm0, %xmm0
; X64-NEXT: pslld $31, %xmm0
; X64-NEXT: blendvps %xmm0, %xmm2, %xmm1
; X64-NEXT: movlps %xmm1, (%rax)
; X64-NEXT: retq
@@ -78,7 +76,6 @@ define void @full_test() {
; X32-NEXT: cvtdq2ps %xmm0, %xmm1
; X32-NEXT: xorps %xmm0, %xmm0
; X32-NEXT: cmpltps %xmm2, %xmm0
; X32-NEXT: pslld $31, %xmm0
; X32-NEXT: movaps {{.*#+}} xmm3 = <1,1,u,u>
; X32-NEXT: addps %xmm1, %xmm3
; X32-NEXT: movaps %xmm1, %xmm4
test/CodeGen/X86/2011-10-21-widen-cmp.ll
@@ -9,7 +9,6 @@ define void @cmp_2_floats(<2 x float> %a, <2 x float> %b) {
; CHECK: # BB#0: # %entry
; CHECK-NEXT: movaps %xmm0, %xmm2
; CHECK-NEXT: cmpordps %xmm0, %xmm0
; CHECK-NEXT: pslld $31, %xmm0
; CHECK-NEXT: blendvps %xmm0, %xmm2, %xmm1
; CHECK-NEXT: movlps %xmm1, (%rax)
; CHECK-NEXT: retq
test/CodeGen/X86/psubus.ll
@@ -542,8 +542,6 @@ define void @test13(i16* nocapture %head, i32* nocapture %w) nounwind {
; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0]
; SSE2-NEXT: psllw $15, %xmm4
; SSE2-NEXT: psraw $15, %xmm4
; SSE2-NEXT: psubd %xmm2, %xmm1
; SSE2-NEXT: pslld $16, %xmm0
; SSE2-NEXT: psrad $16, %xmm0
@@ -577,8 +575,6 @@ define void @test13(i16* nocapture %head, i32* nocapture %w) nounwind {
; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6
; SSSE3-NEXT: pshufb %xmm5, %xmm6
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm3[0]
; SSSE3-NEXT: psllw $15, %xmm6
; SSSE3-NEXT: psraw $15, %xmm6
; SSSE3-NEXT: psubd %xmm2, %xmm1
; SSSE3-NEXT: pshufb %xmm5, %xmm0
; SSSE3-NEXT: pshufb %xmm5, %xmm1
@@ -648,145 +644,118 @@ vector.ph:
define void @test14(i8* nocapture %head, i32* nocapture %w) nounwind {
; SSE2-LABEL: test14:
; SSE2: ## BB#0: ## %vector.ph
; SSE2-NEXT: movdqu (%rdi), %xmm1
; SSE2-NEXT: movdqu (%rdi), %xmm0
; SSE2-NEXT: movdqu (%rsi), %xmm8
; SSE2-NEXT: movdqu 16(%rsi), %xmm9
; SSE2-NEXT: movdqu 32(%rsi), %xmm10
; SSE2-NEXT: movdqu 48(%rsi), %xmm6
; SSE2-NEXT: pxor %xmm11, %xmm11
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3],xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7]
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1],xmm3[2],xmm11[2],xmm3[3],xmm11[3]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7]
; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm11[8],xmm1[9],xmm11[9],xmm1[10],xmm11[10],xmm1[11],xmm11[11],xmm1[12],xmm11[12],xmm1[13],xmm11[13],xmm1[14],xmm11[14],xmm1[15],xmm11[15]
; SSE2-NEXT: movdqa %xmm1, %xmm5
; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7]
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm1, %xmm7
; SSE2-NEXT: psubd %xmm6, %xmm1
; SSE2-NEXT: pxor %xmm4, %xmm6
; SSE2-NEXT: pxor %xmm4, %xmm7
; SSE2-NEXT: pcmpgtd %xmm7, %xmm6
; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
; SSE2-NEXT: movdqa %xmm5, %xmm7
; SSE2-NEXT: psubd %xmm10, %xmm5
; SSE2-NEXT: pxor %xmm4, %xmm10
; SSE2-NEXT: pxor %xmm4, %xmm7
; SSE2-NEXT: pcmpgtd %xmm7, %xmm10
; SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm10[0,2,2,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,6,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3]
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm6[0]
; SSE2-NEXT: psllw $15, %xmm7
; SSE2-NEXT: psraw $15, %xmm7
; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm10, %xmm7
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psubd %xmm9, %xmm0
; SSE2-NEXT: pxor %xmm4, %xmm9
; SSE2-NEXT: pxor %xmm4, %xmm2
; SSE2-NEXT: pcmpgtd %xmm2, %xmm9
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm9[0,2,2,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE2-NEXT: movdqa %xmm8, %xmm6
; SSE2-NEXT: pxor %xmm4, %xmm6
; SSE2-NEXT: movdqu 48(%rsi), %xmm7
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15]
; SSE2-NEXT: movdqa %xmm0, %xmm6
; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm0, %xmm5
; SSE2-NEXT: psubd %xmm7, %xmm0
; SSE2-NEXT: pxor %xmm3, %xmm7
; SSE2-NEXT: pxor %xmm3, %xmm5
; SSE2-NEXT: pcmpgtd %xmm5, %xmm7
; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255]
; SSE2-NEXT: pand %xmm5, %xmm7
; SSE2-NEXT: movdqa %xmm6, %xmm4
; SSE2-NEXT: psubd %xmm10, %xmm6
; SSE2-NEXT: pxor %xmm3, %xmm10
; SSE2-NEXT: pxor %xmm3, %xmm4
; SSE2-NEXT: pcmpgtd %xmm4, %xmm6
; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm6[0,2,2,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0]
; SSE2-NEXT: psllw $15, %xmm4
; SSE2-NEXT: psraw $15, %xmm4
; SSE2-NEXT: pand %xmm10, %xmm4
; SSE2-NEXT: packuswb %xmm7, %xmm4
; SSE2-NEXT: psllw $7, %xmm4
; SSE2-NEXT: pand {{.*}}(%rip), %xmm4
; SSE2-NEXT: pcmpgtb %xmm4, %xmm11
; SSE2-NEXT: psubd %xmm8, %xmm3
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pand %xmm2, %xmm5
; SSE2-NEXT: packuswb %xmm1, %xmm5
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: pand %xmm2, %xmm3
; SSE2-NEXT: packuswb %xmm0, %xmm3
; SSE2-NEXT: packuswb %xmm5, %xmm3
; SSE2-NEXT: pandn %xmm3, %xmm11
; SSE2-NEXT: movdqu %xmm11, (%rdi)
; SSE2-NEXT: pcmpgtd %xmm4, %xmm10
; SSE2-NEXT: pand %xmm5, %xmm10
; SSE2-NEXT: packuswb %xmm7, %xmm10
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: psubd %xmm9, %xmm1
; SSE2-NEXT: pxor %xmm3, %xmm9
; SSE2-NEXT: pxor %xmm3, %xmm4
; SSE2-NEXT: pcmpgtd %xmm4, %xmm9
; SSE2-NEXT: pand %xmm5, %xmm9
; SSE2-NEXT: movdqa %xmm8, %xmm4
; SSE2-NEXT: pxor %xmm3, %xmm4
; SSE2-NEXT: pxor %xmm2, %xmm3
; SSE2-NEXT: pcmpgtd %xmm3, %xmm4
; SSE2-NEXT: pand %xmm5, %xmm4
; SSE2-NEXT: packuswb %xmm9, %xmm4
; SSE2-NEXT: packuswb %xmm10, %xmm4
; SSE2-NEXT: psubd %xmm8, %xmm2
; SSE2-NEXT: pand %xmm5, %xmm0
; SSE2-NEXT: pand %xmm5, %xmm6
; SSE2-NEXT: packuswb %xmm0, %xmm6
; SSE2-NEXT: pand %xmm5, %xmm1
; SSE2-NEXT: pand %xmm5, %xmm2
; SSE2-NEXT: packuswb %xmm1, %xmm2
; SSE2-NEXT: packuswb %xmm6, %xmm2
; SSE2-NEXT: pandn %xmm2, %xmm4
; SSE2-NEXT: movdqu %xmm4, (%rdi)
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test14:
; SSSE3: ## BB#0: ## %vector.ph
; SSSE3-NEXT: movdqu (%rdi), %xmm1
; SSSE3-NEXT: movdqu (%rdi), %xmm0
; SSSE3-NEXT: movdqu (%rsi), %xmm8
; SSSE3-NEXT: movdqu 16(%rsi), %xmm9
; SSSE3-NEXT: movdqu 32(%rsi), %xmm10
; SSSE3-NEXT: movdqu 48(%rsi), %xmm4
; SSSE3-NEXT: pxor %xmm0, %xmm0
; SSSE3-NEXT: movdqu 48(%rsi), %xmm7
; SSSE3-NEXT: pxor %xmm3, %xmm3
; SSSE3-NEXT: movdqa %xmm0, %xmm1
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
; SSSE3-NEXT: movdqa %xmm1, %xmm2
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; SSSE3-NEXT: movdqa %xmm2, %xmm3
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
; SSSE3-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; SSSE3-NEXT: movdqa %xmm1, %xmm7
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3]
; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648]
; SSSE3-NEXT: movdqa %xmm1, %xmm5
; SSSE3-NEXT: psubd %xmm4, %xmm1
; SSSE3-NEXT: pxor %xmm6, %xmm4
; SSSE3-NEXT: pxor %xmm6, %xmm5
; SSSE3-NEXT: pcmpgtd %xmm5, %xmm4
; SSSE3-NEXT: movdqa {{.*#+}} xmm11 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; SSSE3-NEXT: pshufb %xmm11, %xmm4
; SSSE3-NEXT: movdqa %xmm7, %xmm5
; SSSE3-NEXT: psubd %xmm10, %xmm7
; SSSE3-NEXT: pxor %xmm6, %xmm10
; SSSE3-NEXT: pxor %xmm6, %xmm5
; SSSE3-NEXT: pcmpgtd %xmm5, %xmm10
; SSSE3-NEXT: pshufb %xmm11, %xmm10
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm10 = xmm10[0],xmm4[0]
; SSSE3-NEXT: psllw $15, %xmm10
; SSSE3-NEXT: psraw $15, %xmm10
; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; SSSE3-NEXT: pshufb %xmm4, %xmm10
; SSSE3-NEXT: movdqa %xmm2, %xmm5
; SSSE3-NEXT: psubd %xmm9, %xmm2
; SSSE3-NEXT: pxor %xmm6, %xmm9
; SSSE3-NEXT: pxor %xmm6, %xmm5
; SSSE3-NEXT: pcmpgtd %xmm5, %xmm9
; SSSE3-NEXT: pshufb %xmm11, %xmm9
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
; SSSE3-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15]
; SSSE3-NEXT: movdqa %xmm0, %xmm6
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3]
; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
; SSSE3-NEXT: movdqa %xmm0, %xmm5
; SSSE3-NEXT: psubd %xmm7, %xmm0
; SSSE3-NEXT: pxor %xmm3, %xmm7
; SSSE3-NEXT: pxor %xmm3, %xmm5
; SSSE3-NEXT: pcmpgtd %xmm5, %xmm7
; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
; SSSE3-NEXT: pshufb %xmm5, %xmm7
; SSSE3-NEXT: movdqa %xmm6, %xmm4
; SSSE3-NEXT: psubd %xmm10, %xmm6
; SSSE3-NEXT: pxor %xmm3, %xmm10
; SSSE3-NEXT: pxor %xmm3, %xmm4
; SSSE3-NEXT: pcmpgtd %xmm4, %xmm10
; SSSE3-NEXT: pshufb %xmm5, %xmm10
; SSSE3-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm7[0],xmm10[1],xmm7[1]
; SSSE3-NEXT: movdqa %xmm1, %xmm4
; SSSE3-NEXT: psubd %xmm9, %xmm1
; SSSE3-NEXT: pxor %xmm3, %xmm9
; SSSE3-NEXT: pxor %xmm3, %xmm4
; SSSE3-NEXT: pcmpgtd %xmm4, %xmm9
; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; SSSE3-NEXT: pshufb %xmm4, %xmm9
; SSSE3-NEXT: movdqa %xmm8, %xmm5
; SSSE3-NEXT: pxor %xmm6, %xmm5
; SSSE3-NEXT: pxor %xmm3, %xmm6
; SSSE3-NEXT: pcmpgtd %xmm6, %xmm5
; SSSE3-NEXT: pshufb %xmm11, %xmm5
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm9[0]
; SSSE3-NEXT: psllw $15, %xmm5
; SSSE3-NEXT: psraw $15, %xmm5
; SSSE3-NEXT: pxor %xmm3, %xmm5
; SSSE3-NEXT: pxor %xmm2, %xmm3
; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5
; SSSE3-NEXT: pshufb %xmm4, %xmm5
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm10[0]
; SSSE3-NEXT: psllw $7, %xmm5
; SSSE3-NEXT: pand {{.*}}(%rip), %xmm5
; SSSE3-NEXT: pcmpgtb %xmm5, %xmm0
; SSSE3-NEXT: psubd %xmm8, %xmm3
; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSSE3-NEXT: pand %xmm4, %xmm1
; SSSE3-NEXT: pand %xmm4, %xmm7
; SSSE3-NEXT: packuswb %xmm1, %xmm7
; SSSE3-NEXT: pand %xmm4, %xmm2
; SSSE3-NEXT: pand %xmm4, %xmm3
; SSSE3-NEXT: packuswb %xmm2, %xmm3
; SSSE3-NEXT: packuswb %xmm7, %xmm3
; SSSE3-NEXT: pandn %xmm3, %xmm0
; SSSE3-NEXT: movdqu %xmm0, (%rdi)
; SSSE3-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1]
; SSSE3-NEXT: movsd {{.*#+}} xmm10 = xmm5[0],xmm10[1]
; SSSE3-NEXT: psubd %xmm8, %xmm2
; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSSE3-NEXT: pand %xmm3, %xmm0
; SSSE3-NEXT: pand %xmm3, %xmm6
; SSSE3-NEXT: packuswb %xmm0, %xmm6
; SSSE3-NEXT: pand %xmm3, %xmm1
; SSSE3-NEXT: pand %xmm3, %xmm2
; SSSE3-NEXT: packuswb %xmm1, %xmm2
; SSSE3-NEXT: packuswb %xmm6, %xmm2
; SSSE3-NEXT: andnpd %xmm2, %xmm10
; SSSE3-NEXT: movupd %xmm10, (%rdi)
; SSSE3-NEXT: retq
;
; AVX1-LABEL: test14:
@@ -805,23 +774,18 @@ define void @test14(i8* nocapture %head, i32* nocapture %w) nounwind {
; AVX1-NEXT: vpxor %xmm6, %xmm10, %xmm7
; AVX1-NEXT: vpxor %xmm6, %xmm1, %xmm4
; AVX1-NEXT: vpcmpgtd %xmm7, %xmm4, %xmm4
; AVX1-NEXT: vpacksswb %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm11 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm11, %xmm3, %xmm12
; AVX1-NEXT: vpxor %xmm6, %xmm9, %xmm7
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT: vpxor %xmm6, %xmm4, %xmm3
; AVX1-NEXT: vpcmpgtd %xmm7, %xmm3, %xmm3
; AVX1-NEXT: vpxor %xmm6, %xmm8, %xmm7
; AVX1-NEXT: vpacksswb %xmm3, %xmm4, %xmm11
; AVX1-NEXT: vpxor %xmm6, %xmm9, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7
; AVX1-NEXT: vpxor %xmm6, %xmm7, %xmm3
; AVX1-NEXT: vpcmpgtd %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpxor %xmm6, %xmm8, %xmm4
; AVX1-NEXT: vpxor %xmm6, %xmm0, %xmm6
; AVX1-NEXT: vpcmpgtd %xmm7, %xmm6, %xmm6
; AVX1-NEXT: vpacksswb %xmm3, %xmm6, %xmm3
; AVX1-NEXT: vpshufb %xmm11, %xmm3, %xmm3
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm12[0]
; AVX1-NEXT: vpsllw $7, %xmm3, %xmm3
; AVX1-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3
; AVX1-NEXT: vpcmpgtd %xmm4, %xmm6, %xmm4
; AVX1-NEXT: vpacksswb %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpacksswb %xmm11, %xmm3, %xmm3
; AVX1-NEXT: vpsubd %xmm0, %xmm8, %xmm0
; AVX1-NEXT: vpsubd %xmm4, %xmm9, %xmm4
; AVX1-NEXT: vpsubd %xmm7, %xmm9, %xmm4
; AVX1-NEXT: vpsubd %xmm1, %xmm10, %xmm1
; AVX1-NEXT: vpsubd %xmm2, %xmm5, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
@@ -850,26 +814,22 @@ define void @test14(i8* nocapture %head, i32* nocapture %w) nounwind {
; AVX2-NEXT: vpcmpgtd %ymm5, %ymm6, %ymm5
; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6
; AVX2-NEXT: vpacksswb %xmm6, %xmm5, %xmm5
; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm6, %xmm5, %xmm5
; AVX2-NEXT: vpxor %ymm4, %ymm2, %ymm7
; AVX2-NEXT: vpxor %ymm4, %ymm2, %ymm6
; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm4
; AVX2-NEXT: vpcmpgtd %ymm7, %ymm4, %ymm4
; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm7
; AVX2-NEXT: vpacksswb %xmm7, %xmm4, %xmm4
; AVX2-NEXT: vpshufb %xmm6, %xmm4, %xmm4
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0]
; AVX2-NEXT: vpsllw $7, %xmm4, %xmm4
; AVX2-NEXT: vpand {{.*}}(%rip), %xmm4, %xmm4
; AVX2-NEXT: vpcmpgtd %ymm6, %ymm4, %ymm4
; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm6
; AVX2-NEXT: vpacksswb %xmm6, %xmm4, %xmm4
; AVX2-NEXT: vpacksswb %xmm5, %xmm4, %xmm4
; AVX2-NEXT: vpsubd %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vpsubd %ymm1, %ymm3, %ymm1
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-NEXT: vpshufb %xmm6, %xmm1, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpshufb %xmm6, %xmm0, %xmm0
; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpblendvb %xmm4, %xmm1, %xmm0, %xmm0
@@ -919,8 +879,6 @@ define void @test15(i16* nocapture %head, i32* nocapture %w) nounwind {
; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0]
; SSE2-NEXT: psllw $15, %xmm4
; SSE2-NEXT: psraw $15, %xmm4
; SSE2-NEXT: psubd %xmm2, %xmm1
; SSE2-NEXT: pslld $16, %xmm0
; SSE2-NEXT: psrad $16, %xmm0
@@ -954,8 +912,6 @@ define void @test15(i16* nocapture %head, i32* nocapture %w) nounwind {
; SSSE3-NEXT: pcmpgtd %xmm6, %xmm3
; SSSE3-NEXT: pshufb %xmm4, %xmm3
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0]
; SSSE3-NEXT: psllw $15, %xmm3
; SSSE3-NEXT: psraw $15, %xmm3
; SSSE3-NEXT: psubd %xmm2, %xmm1
; SSSE3-NEXT: pshufb %xmm4, %xmm0
; SSSE3-NEXT: pshufb %xmm4, %xmm1
@@ -1049,8 +1005,6 @@ define void @test16(i16* nocapture %head, i32* nocapture %w) nounwind {
; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0]
; SSE2-NEXT: psllw $15, %xmm4
; SSE2-NEXT: psraw $15, %xmm4
; SSE2-NEXT: psubd %xmm2, %xmm1
; SSE2-NEXT: pslld $16, %xmm0
; SSE2-NEXT: psrad $16, %xmm0
@@ -1084,8 +1038,6 @@ define void @test16(i16* nocapture %head, i32* nocapture %w) nounwind {
; SSSE3-NEXT: pcmpgtd %xmm6, %xmm3
; SSSE3-NEXT: pshufb %xmm4, %xmm3
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0]
; SSSE3-NEXT: psllw $15, %xmm3
; SSSE3-NEXT: psraw $15, %xmm3
; SSSE3-NEXT: psubd %xmm2, %xmm1
; SSSE3-NEXT: pshufb %xmm4, %xmm0
; SSSE3-NEXT: pshufb %xmm4, %xmm1
test/CodeGen/X86/vselect-pcmp.ll
@@ -247,8 +247,6 @@ define <4 x double> @signbit_sel_v4f64(<4 x double> %x, <4 x double> %y, <4 x i6
define <4 x double> @signbit_sel_v4f64_small_mask(<4 x double> %x, <4 x double> %y, <4 x i32> %mask) {
; AVX1-LABEL: signbit_sel_v4f64_small_mask:
; AVX1: # BB#0:
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpmovsxdq %xmm2, %xmm3
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
; AVX1-NEXT: vpmovsxdq %xmm2, %xmm2