
[SelectionDAG][X86] Move setcc mask splitting for mload/mstore/mgather/mscatter from DAGCombiner to the type legalizer.

We may be able to look at how VSELECT is handled to further
improve this, but this appears to be neutral or an improvement
on the test cases we have.

llvm-svn: 368344
Craig Topper 2019-08-08 21:14:08 +00:00
parent 91643844c2
commit 14499088f1
5 changed files with 78 additions and 326 deletions
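
For readers skimming the diffs below, here is a minimal stand-alone C++ sketch of what the splitting is about. It is purely illustrative: none of the names or helpers in it (kWide, MaskHalf, maskedGatherHalf, and so on) are LLVM API, and it models the observable behavior rather than the SelectionDAG nodes. The point is that when a 16-wide compare feeds a masked gather whose type must be split, the compare mask is split into lo/hi halves that each drive a vector-width masked operation, instead of the compare being unrolled into scalar tests.

// Toy model, not LLVM code: a 16-wide compare produces the mask for a
// masked gather.  When the wide type has to be split, the mask is split
// into lo/hi halves that still drive vector-width masked operations,
// instead of being unrolled into 16 scalar compares.
#include <array>
#include <cstdio>

constexpr int kWide = 16;
constexpr int kHalf = kWide / 2;

using MaskHalf = std::array<bool, kHalf>;

// Model of a legal-width masked gather: lanes with a set mask bit load
// base[index[i]]; the other lanes keep the pass-through value.
static std::array<double, kHalf>
maskedGatherHalf(const double *base, const std::array<int, kHalf> &index,
                 const MaskHalf &mask,
                 const std::array<double, kHalf> &passthru) {
  std::array<double, kHalf> result = passthru;
  for (int i = 0; i < kHalf; ++i)
    if (mask[i])
      result[i] = base[index[i]];
  return result;
}

int main() {
  double table[2 * kWide];
  for (int i = 0; i < 2 * kWide; ++i)
    table[i] = 0.5 * i;

  std::array<int, kWide> ind{}, cmp{};
  for (int i = 0; i < kWide; ++i) {
    ind[i] = 2 * i;  // gather indices
    cmp[i] = i % 3;  // compared against zero to form the mask
  }

  // Split the compare (cmp == 0) directly into two half-width masks, and
  // split the indices and pass-through values the same way.
  MaskHalf maskLo, maskHi;
  std::array<int, kHalf> indLo, indHi;
  std::array<double, kHalf> passLo, passHi;
  for (int i = 0; i < kHalf; ++i) {
    maskLo[i] = (cmp[i] == 0);
    maskHi[i] = (cmp[kHalf + i] == 0);
    indLo[i] = ind[i];
    indHi[i] = ind[kHalf + i];
    passLo[i] = passHi[i] = -1.0;
  }

  // Two legal-width masked gathers, each driven by its half of the mask.
  std::array<double, kHalf> lo = maskedGatherHalf(table, indLo, maskLo, passLo);
  std::array<double, kHalf> hi = maskedGatherHalf(table, indHi, maskHi, passHi);

  for (double v : lo) printf("%g ", v);
  for (double v : hi) printf("%g ", v);
  printf("\n");
  return 0;
}

With this patch, that mask handling conceptually moves from a pre-legalization DAG combine into the type legalizer's vector-splitting routines, which now recognize a SETCC mask directly (see the LegalizeVectorTypes.cpp hunks).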

View File

@@ -8431,23 +8431,6 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) {
return SDValue();
}
static
std::pair<SDValue, SDValue> SplitVSETCC(const SDNode *N, SelectionDAG &DAG) {
SDLoc DL(N);
EVT LoVT, HiVT;
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
// Split the inputs.
SDValue Lo, Hi, LL, LH, RL, RH;
std::tie(LL, LH) = DAG.SplitVectorOperand(N, 0);
std::tie(RL, RH) = DAG.SplitVectorOperand(N, 1);
Lo = DAG.getNode(N->getOpcode(), DL, LoVT, LL, RL, N->getOperand(2));
Hi = DAG.getNode(N->getOpcode(), DL, HiVT, LH, RH, N->getOperand(2));
return std::make_pair(Lo, Hi);
}
// This function assumes all the vselect's arguments are CONCAT_VECTOR
// nodes and that the condition is a BV of ConstantSDNodes (or undefs).
static SDValue ConvertSelectToConcatVector(SDNode *N, SelectionDAG &DAG) {
@@ -8506,7 +8489,6 @@ static SDValue ConvertSelectToConcatVector(SDNode *N, SelectionDAG &DAG) {
SDValue DAGCombiner::visitMSCATTER(SDNode *N) {
MaskedScatterSDNode *MSC = cast<MaskedScatterSDNode>(N);
SDValue Mask = MSC->getMask();
SDValue Data = MSC->getValue();
SDValue Chain = MSC->getChain();
SDLoc DL(N);
@@ -8514,124 +8496,19 @@ SDValue DAGCombiner::visitMSCATTER(SDNode *N) {
if (ISD::isBuildVectorAllZeros(Mask.getNode()))
return Chain;
if (Level >= AfterLegalizeTypes)
return SDValue();
// If the MSCATTER data type requires splitting and the mask is provided by a
// SETCC, then split both nodes and its operands before legalization. This
// prevents the type legalizer from unrolling SETCC into scalar comparisons
// and enables future optimizations (e.g. min/max pattern matching on X86).
if (Mask.getOpcode() != ISD::SETCC)
return SDValue();
// Check if any splitting is required.
if (TLI.getTypeAction(*DAG.getContext(), Data.getValueType()) !=
TargetLowering::TypeSplitVector)
return SDValue();
SDValue MaskLo, MaskHi;
std::tie(MaskLo, MaskHi) = SplitVSETCC(Mask.getNode(), DAG);
EVT LoVT, HiVT;
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(MSC->getValueType(0));
EVT MemoryVT = MSC->getMemoryVT();
unsigned Alignment = MSC->getOriginalAlignment();
EVT LoMemVT, HiMemVT;
std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT);
SDValue DataLo, DataHi;
std::tie(DataLo, DataHi) = DAG.SplitVector(Data, DL);
SDValue Scale = MSC->getScale();
SDValue BasePtr = MSC->getBasePtr();
SDValue IndexLo, IndexHi;
std::tie(IndexLo, IndexHi) = DAG.SplitVector(MSC->getIndex(), DL);
MachineMemOperand *MMO = DAG.getMachineFunction().
getMachineMemOperand(MSC->getPointerInfo(),
MachineMemOperand::MOStore, LoMemVT.getStoreSize(),
Alignment, MSC->getAAInfo(), MSC->getRanges());
SDValue OpsLo[] = { Chain, DataLo, MaskLo, BasePtr, IndexLo, Scale };
SDValue Lo = DAG.getMaskedScatter(DAG.getVTList(MVT::Other),
DataLo.getValueType(), DL, OpsLo, MMO,
MSC->getIndexType());
// The order of the Scatter operation after split is well defined. The "Hi"
// part comes after the "Lo". So these two operations should be chained one
// after another.
SDValue OpsHi[] = { Lo, DataHi, MaskHi, BasePtr, IndexHi, Scale };
return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), DataHi.getValueType(),
DL, OpsHi, MMO, MSC->getIndexType());
return SDValue();
}
SDValue DAGCombiner::visitMSTORE(SDNode *N) {
MaskedStoreSDNode *MST = cast<MaskedStoreSDNode>(N);
SDValue Mask = MST->getMask();
SDValue Data = MST->getValue();
SDValue Chain = MST->getChain();
EVT VT = Data.getValueType();
SDLoc DL(N);
// Zap masked stores with a zero mask.
if (ISD::isBuildVectorAllZeros(Mask.getNode()))
return Chain;
if (Level >= AfterLegalizeTypes)
return SDValue();
// If the MSTORE data type requires splitting and the mask is provided by a
// SETCC, then split both nodes and its operands before legalization. This
// prevents the type legalizer from unrolling SETCC into scalar comparisons
// and enables future optimizations (e.g. min/max pattern matching on X86).
if (Mask.getOpcode() == ISD::SETCC) {
// Check if any splitting is required.
if (TLI.getTypeAction(*DAG.getContext(), VT) !=
TargetLowering::TypeSplitVector)
return SDValue();
SDValue MaskLo, MaskHi, Lo, Hi;
std::tie(MaskLo, MaskHi) = SplitVSETCC(Mask.getNode(), DAG);
SDValue Ptr = MST->getBasePtr();
EVT MemoryVT = MST->getMemoryVT();
unsigned Alignment = MST->getOriginalAlignment();
EVT LoMemVT, HiMemVT;
std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT);
SDValue DataLo, DataHi;
std::tie(DataLo, DataHi) = DAG.SplitVector(Data, DL);
MachineMemOperand *MMO = DAG.getMachineFunction().
getMachineMemOperand(MST->getPointerInfo(),
MachineMemOperand::MOStore, LoMemVT.getStoreSize(),
Alignment, MST->getAAInfo(), MST->getRanges());
Lo = DAG.getMaskedStore(Chain, DL, DataLo, Ptr, MaskLo, LoMemVT, MMO,
MST->isTruncatingStore(),
MST->isCompressingStore());
Ptr = TLI.IncrementMemoryAddress(Ptr, MaskLo, DL, LoMemVT, DAG,
MST->isCompressingStore());
unsigned HiOffset = LoMemVT.getStoreSize();
MMO = DAG.getMachineFunction().getMachineMemOperand(
MST->getPointerInfo().getWithOffset(HiOffset),
MachineMemOperand::MOStore, HiMemVT.getStoreSize(), Alignment,
MST->getAAInfo(), MST->getRanges());
Hi = DAG.getMaskedStore(Chain, DL, DataHi, Ptr, MaskHi, HiMemVT, MMO,
MST->isTruncatingStore(),
MST->isCompressingStore());
AddToWorklist(Lo.getNode());
AddToWorklist(Hi.getNode());
return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo, Hi);
}
return SDValue();
}
@@ -8644,76 +8521,7 @@ SDValue DAGCombiner::visitMGATHER(SDNode *N) {
if (ISD::isBuildVectorAllZeros(Mask.getNode()))
return CombineTo(N, MGT->getPassThru(), MGT->getChain());
if (Level >= AfterLegalizeTypes)
return SDValue();
// If the MGATHER result requires splitting and the mask is provided by a
// SETCC, then split both nodes and its operands before legalization. This
// prevents the type legalizer from unrolling SETCC into scalar comparisons
// and enables future optimizations (e.g. min/max pattern matching on X86).
if (Mask.getOpcode() != ISD::SETCC)
return SDValue();
EVT VT = N->getValueType(0);
// Check if any splitting is required.
if (TLI.getTypeAction(*DAG.getContext(), VT) !=
TargetLowering::TypeSplitVector)
return SDValue();
SDValue MaskLo, MaskHi, Lo, Hi;
std::tie(MaskLo, MaskHi) = SplitVSETCC(Mask.getNode(), DAG);
SDValue PassThru = MGT->getPassThru();
SDValue PassThruLo, PassThruHi;
std::tie(PassThruLo, PassThruHi) = DAG.SplitVector(PassThru, DL);
EVT LoVT, HiVT;
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
SDValue Chain = MGT->getChain();
EVT MemoryVT = MGT->getMemoryVT();
unsigned Alignment = MGT->getOriginalAlignment();
EVT LoMemVT, HiMemVT;
std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT);
SDValue Scale = MGT->getScale();
SDValue BasePtr = MGT->getBasePtr();
SDValue Index = MGT->getIndex();
SDValue IndexLo, IndexHi;
std::tie(IndexLo, IndexHi) = DAG.SplitVector(Index, DL);
MachineMemOperand *MMO = DAG.getMachineFunction().
getMachineMemOperand(MGT->getPointerInfo(),
MachineMemOperand::MOLoad, LoMemVT.getStoreSize(),
Alignment, MGT->getAAInfo(), MGT->getRanges());
SDValue OpsLo[] = { Chain, PassThruLo, MaskLo, BasePtr, IndexLo, Scale };
Lo = DAG.getMaskedGather(DAG.getVTList(LoVT, MVT::Other), LoVT, DL, OpsLo,
MMO, MGT->getIndexType());
SDValue OpsHi[] = { Chain, PassThruHi, MaskHi, BasePtr, IndexHi, Scale };
Hi = DAG.getMaskedGather(DAG.getVTList(HiVT, MVT::Other), HiVT, DL, OpsHi,
MMO, MGT->getIndexType());
AddToWorklist(Lo.getNode());
AddToWorklist(Hi.getNode());
// Build a factor node to remember that this load is independent of the
// other one.
Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo.getValue(1),
Hi.getValue(1));
// Legalized the chain result - switch anything that used the old chain to
// use the new one.
DAG.ReplaceAllUsesOfValueWith(SDValue(MGT, 1), Chain);
SDValue GatherRes = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
SDValue RetOps[] = { GatherRes, Chain };
return DAG.getMergeValues(RetOps, DL);
return SDValue();
}
SDValue DAGCombiner::visitMLOAD(SDNode *N) {
@@ -8725,76 +8533,6 @@ SDValue DAGCombiner::visitMLOAD(SDNode *N) {
if (ISD::isBuildVectorAllZeros(Mask.getNode()))
return CombineTo(N, MLD->getPassThru(), MLD->getChain());
if (Level >= AfterLegalizeTypes)
return SDValue();
// If the MLOAD result requires splitting and the mask is provided by a
// SETCC, then split both nodes and its operands before legalization. This
// prevents the type legalizer from unrolling SETCC into scalar comparisons
// and enables future optimizations (e.g. min/max pattern matching on X86).
if (Mask.getOpcode() == ISD::SETCC) {
EVT VT = N->getValueType(0);
// Check if any splitting is required.
if (TLI.getTypeAction(*DAG.getContext(), VT) !=
TargetLowering::TypeSplitVector)
return SDValue();
SDValue MaskLo, MaskHi, Lo, Hi;
std::tie(MaskLo, MaskHi) = SplitVSETCC(Mask.getNode(), DAG);
SDValue PassThru = MLD->getPassThru();
SDValue PassThruLo, PassThruHi;
std::tie(PassThruLo, PassThruHi) = DAG.SplitVector(PassThru, DL);
EVT LoVT, HiVT;
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(MLD->getValueType(0));
SDValue Chain = MLD->getChain();
SDValue Ptr = MLD->getBasePtr();
EVT MemoryVT = MLD->getMemoryVT();
unsigned Alignment = MLD->getOriginalAlignment();
EVT LoMemVT, HiMemVT;
std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT);
MachineMemOperand *MMO = DAG.getMachineFunction().
getMachineMemOperand(MLD->getPointerInfo(),
MachineMemOperand::MOLoad, LoMemVT.getStoreSize(),
Alignment, MLD->getAAInfo(), MLD->getRanges());
Lo = DAG.getMaskedLoad(LoVT, DL, Chain, Ptr, MaskLo, PassThruLo, LoMemVT,
MMO, ISD::NON_EXTLOAD, MLD->isExpandingLoad());
Ptr = TLI.IncrementMemoryAddress(Ptr, MaskLo, DL, LoMemVT, DAG,
MLD->isExpandingLoad());
unsigned HiOffset = LoMemVT.getStoreSize();
MMO = DAG.getMachineFunction().getMachineMemOperand(
MLD->getPointerInfo().getWithOffset(HiOffset),
MachineMemOperand::MOLoad, HiMemVT.getStoreSize(), Alignment,
MLD->getAAInfo(), MLD->getRanges());
Hi = DAG.getMaskedLoad(HiVT, DL, Chain, Ptr, MaskHi, PassThruHi, HiMemVT,
MMO, ISD::NON_EXTLOAD, MLD->isExpandingLoad());
AddToWorklist(Lo.getNode());
AddToWorklist(Hi.getNode());
// Build a factor node to remember that this load is independent of the
// other one.
Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo.getValue(1),
Hi.getValue(1));
// Legalized the chain result - switch anything that used the old chain to
// use the new one.
DAG.ReplaceAllUsesOfValueWith(SDValue(MLD, 1), Chain);
SDValue LoadRes = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
SDValue RetOps[] = { LoadRes, Chain };
return DAG.getMergeValues(RetOps, DL);
}
return SDValue();
}

View File

@@ -933,6 +933,8 @@ private:
void SplitRes_SELECT_CC (SDNode *N, SDValue &Lo, SDValue &Hi);
void SplitRes_UNDEF (SDNode *N, SDValue &Lo, SDValue &Hi);
void SplitVSETCC(const SDNode *N);
//===--------------------------------------------------------------------===//
// Generic Expansion: LegalizeTypesGeneric.cpp
//===--------------------------------------------------------------------===//

View File

@@ -1560,10 +1560,14 @@ void DAGTypeLegalizer::SplitVecRes_MLOAD(MaskedLoadSDNode *MLD,
// Split Mask operand
SDValue MaskLo, MaskHi;
if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector)
GetSplitVector(Mask, MaskLo, MaskHi);
else
std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, dl);
if (Mask.getOpcode() == ISD::SETCC) {
SplitVecRes_SETCC(Mask.getNode(), MaskLo, MaskHi);
} else {
if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector)
GetSplitVector(Mask, MaskLo, MaskHi);
else
std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, dl);
}
EVT MemoryVT = MLD->getMemoryVT();
EVT LoMemVT, HiMemVT;
@@ -1622,10 +1626,14 @@ void DAGTypeLegalizer::SplitVecRes_MGATHER(MaskedGatherSDNode *MGT,
// Split Mask operand
SDValue MaskLo, MaskHi;
if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector)
GetSplitVector(Mask, MaskLo, MaskHi);
else
std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, dl);
if (Mask.getOpcode() == ISD::SETCC) {
SplitVecRes_SETCC(Mask.getNode(), MaskLo, MaskHi);
} else {
if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector)
GetSplitVector(Mask, MaskLo, MaskHi);
else
std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, dl);
}
EVT MemoryVT = MGT->getMemoryVT();
EVT LoMemVT, HiMemVT;
@@ -2340,12 +2348,16 @@ SDValue DAGTypeLegalizer::SplitVecOp_MSTORE(MaskedStoreSDNode *N,
else
std::tie(DataLo, DataHi) = DAG.SplitVector(Data, DL);
// Split Mask operand
SDValue MaskLo, MaskHi;
if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector)
// Split Mask operand
GetSplitVector(Mask, MaskLo, MaskHi);
else
std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, DL);
if (OpNo == 1 && Mask.getOpcode() == ISD::SETCC) {
SplitVecRes_SETCC(Mask.getNode(), MaskLo, MaskHi);
} else {
if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector)
GetSplitVector(Mask, MaskLo, MaskHi);
else
std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, DL);
}
SDValue Lo, Hi;
MachineMemOperand *MMO = DAG.getMachineFunction().
@@ -2397,12 +2409,16 @@ SDValue DAGTypeLegalizer::SplitVecOp_MSCATTER(MaskedScatterSDNode *N,
else
std::tie(DataLo, DataHi) = DAG.SplitVector(Data, DL);
// Split Mask operand
SDValue MaskLo, MaskHi;
if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector)
// Split Mask operand
GetSplitVector(Mask, MaskLo, MaskHi);
else
std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, DL);
if (OpNo == 1 && Mask.getOpcode() == ISD::SETCC) {
SplitVecRes_SETCC(Mask.getNode(), MaskLo, MaskHi);
} else {
if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector)
GetSplitVector(Mask, MaskLo, MaskHi);
else
std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, DL);
}
SDValue IndexHi, IndexLo;
if (getTypeAction(Index.getValueType()) == TargetLowering::TypeSplitVector)

View File

@@ -2774,12 +2774,12 @@ define <16 x float> @zext_index(float* %base, <16 x i32> %ind) {
define <16 x double> @test_gather_setcc_split(double* %base, <16 x i32> %ind, <16 x i32> %cmp, <16 x double> %passthru) {
; KNL_64-LABEL: test_gather_setcc_split:
; KNL_64: # %bb.0:
; KNL_64-NEXT: vextractf64x4 $1, %zmm0, %ymm4
; KNL_64-NEXT: vextracti64x4 $1, %zmm1, %ymm5
; KNL_64-NEXT: vptestnmd %zmm5, %zmm5, %k1
; KNL_64-NEXT: vextracti64x4 $1, %zmm1, %ymm4
; KNL_64-NEXT: vptestnmd %zmm4, %zmm4, %k1
; KNL_64-NEXT: vptestnmd %zmm1, %zmm1, %k2
; KNL_64-NEXT: vgatherdpd (%rdi,%ymm0,8), %zmm2 {%k2}
; KNL_64-NEXT: vgatherdpd (%rdi,%ymm4,8), %zmm3 {%k1}
; KNL_64-NEXT: vextractf64x4 $1, %zmm0, %ymm0
; KNL_64-NEXT: vgatherdpd (%rdi,%ymm0,8), %zmm3 {%k1}
; KNL_64-NEXT: vmovapd %zmm2, %zmm0
; KNL_64-NEXT: vmovapd %zmm3, %zmm1
; KNL_64-NEXT: retq
@@ -2795,12 +2795,12 @@ define <16 x double> @test_gather_setcc_split(double* %base, <16 x i32> %ind, <1
; KNL_32-NEXT: subl $64, %esp
; KNL_32-NEXT: vmovapd 72(%ebp), %zmm3
; KNL_32-NEXT: movl 8(%ebp), %eax
; KNL_32-NEXT: vextractf64x4 $1, %zmm0, %ymm4
; KNL_32-NEXT: vextracti64x4 $1, %zmm1, %ymm5
; KNL_32-NEXT: vptestnmd %zmm5, %zmm5, %k1
; KNL_32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
; KNL_32-NEXT: vptestnmd %zmm4, %zmm4, %k1
; KNL_32-NEXT: vptestnmd %zmm1, %zmm1, %k2
; KNL_32-NEXT: vgatherdpd (%eax,%ymm0,8), %zmm2 {%k2}
; KNL_32-NEXT: vgatherdpd (%eax,%ymm4,8), %zmm3 {%k1}
; KNL_32-NEXT: vextractf64x4 $1, %zmm0, %ymm0
; KNL_32-NEXT: vgatherdpd (%eax,%ymm0,8), %zmm3 {%k1}
; KNL_32-NEXT: vmovapd %zmm2, %zmm0
; KNL_32-NEXT: vmovapd %zmm3, %zmm1
; KNL_32-NEXT: movl %ebp, %esp
@@ -2810,12 +2810,12 @@ define <16 x double> @test_gather_setcc_split(double* %base, <16 x i32> %ind, <1
;
; SKX-LABEL: test_gather_setcc_split:
; SKX: # %bb.0:
; SKX-NEXT: vextractf64x4 $1, %zmm0, %ymm4
; SKX-NEXT: vextracti64x4 $1, %zmm1, %ymm5
; SKX-NEXT: vptestnmd %ymm5, %ymm5, %k1
; SKX-NEXT: vextracti64x4 $1, %zmm1, %ymm4
; SKX-NEXT: vptestnmd %ymm4, %ymm4, %k1
; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k2
; SKX-NEXT: vgatherdpd (%rdi,%ymm0,8), %zmm2 {%k2}
; SKX-NEXT: vgatherdpd (%rdi,%ymm4,8), %zmm3 {%k1}
; SKX-NEXT: vextractf64x4 $1, %zmm0, %ymm0
; SKX-NEXT: vgatherdpd (%rdi,%ymm0,8), %zmm3 {%k1}
; SKX-NEXT: vmovapd %zmm2, %zmm0
; SKX-NEXT: vmovapd %zmm3, %zmm1
; SKX-NEXT: retq
@ -2831,12 +2831,12 @@ define <16 x double> @test_gather_setcc_split(double* %base, <16 x i32> %ind, <1
; SKX_32-NEXT: subl $64, %esp
; SKX_32-NEXT: vmovapd 72(%ebp), %zmm3
; SKX_32-NEXT: movl 8(%ebp), %eax
; SKX_32-NEXT: vextractf64x4 $1, %zmm0, %ymm4
; SKX_32-NEXT: vextracti64x4 $1, %zmm1, %ymm5
; SKX_32-NEXT: vptestnmd %ymm5, %ymm5, %k1
; SKX_32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
; SKX_32-NEXT: vptestnmd %ymm4, %ymm4, %k1
; SKX_32-NEXT: vptestnmd %ymm1, %ymm1, %k2
; SKX_32-NEXT: vgatherdpd (%eax,%ymm0,8), %zmm2 {%k2}
; SKX_32-NEXT: vgatherdpd (%eax,%ymm4,8), %zmm3 {%k1}
; SKX_32-NEXT: vextractf64x4 $1, %zmm0, %ymm0
; SKX_32-NEXT: vgatherdpd (%eax,%ymm0,8), %zmm3 {%k1}
; SKX_32-NEXT: vmovapd %zmm2, %zmm0
; SKX_32-NEXT: vmovapd %zmm3, %zmm1
; SKX_32-NEXT: movl %ebp, %esp
@@ -2854,12 +2854,12 @@ define <16 x double> @test_gather_setcc_split(double* %base, <16 x i32> %ind, <1
define void @test_scatter_setcc_split(double* %base, <16 x i32> %ind, <16 x i32> %cmp, <16 x double> %src0) {
; KNL_64-LABEL: test_scatter_setcc_split:
; KNL_64: # %bb.0:
; KNL_64-NEXT: vextractf64x4 $1, %zmm0, %ymm4
; KNL_64-NEXT: vextracti64x4 $1, %zmm1, %ymm5
; KNL_64-NEXT: vptestnmd %zmm5, %zmm5, %k1
; KNL_64-NEXT: vextracti64x4 $1, %zmm1, %ymm4
; KNL_64-NEXT: vptestnmd %zmm4, %zmm4, %k1
; KNL_64-NEXT: vptestnmd %zmm1, %zmm1, %k2
; KNL_64-NEXT: vscatterdpd %zmm2, (%rdi,%ymm0,8) {%k2}
; KNL_64-NEXT: vscatterdpd %zmm3, (%rdi,%ymm4,8) {%k1}
; KNL_64-NEXT: vextractf64x4 $1, %zmm0, %ymm0
; KNL_64-NEXT: vscatterdpd %zmm3, (%rdi,%ymm0,8) {%k1}
; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
@@ -2874,12 +2874,12 @@ define void @test_scatter_setcc_split(double* %base, <16 x i32> %ind, <16 x i32>
; KNL_32-NEXT: subl $64, %esp
; KNL_32-NEXT: vmovapd 72(%ebp), %zmm3
; KNL_32-NEXT: movl 8(%ebp), %eax
; KNL_32-NEXT: vextractf64x4 $1, %zmm0, %ymm4
; KNL_32-NEXT: vextracti64x4 $1, %zmm1, %ymm5
; KNL_32-NEXT: vptestnmd %zmm5, %zmm5, %k1
; KNL_32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
; KNL_32-NEXT: vptestnmd %zmm4, %zmm4, %k1
; KNL_32-NEXT: vptestnmd %zmm1, %zmm1, %k2
; KNL_32-NEXT: vscatterdpd %zmm2, (%eax,%ymm0,8) {%k2}
; KNL_32-NEXT: vscatterdpd %zmm3, (%eax,%ymm4,8) {%k1}
; KNL_32-NEXT: vextractf64x4 $1, %zmm0, %ymm0
; KNL_32-NEXT: vscatterdpd %zmm3, (%eax,%ymm0,8) {%k1}
; KNL_32-NEXT: movl %ebp, %esp
; KNL_32-NEXT: popl %ebp
; KNL_32-NEXT: .cfi_def_cfa %esp, 4
@@ -2888,12 +2888,12 @@ define void @test_scatter_setcc_split(double* %base, <16 x i32> %ind, <16 x i32>
;
; SKX-LABEL: test_scatter_setcc_split:
; SKX: # %bb.0:
; SKX-NEXT: vextractf64x4 $1, %zmm0, %ymm4
; SKX-NEXT: vextracti64x4 $1, %zmm1, %ymm5
; SKX-NEXT: vptestnmd %ymm5, %ymm5, %k1
; SKX-NEXT: vextracti64x4 $1, %zmm1, %ymm4
; SKX-NEXT: vptestnmd %ymm4, %ymm4, %k1
; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k2
; SKX-NEXT: vscatterdpd %zmm2, (%rdi,%ymm0,8) {%k2}
; SKX-NEXT: vscatterdpd %zmm3, (%rdi,%ymm4,8) {%k1}
; SKX-NEXT: vextractf64x4 $1, %zmm0, %ymm0
; SKX-NEXT: vscatterdpd %zmm3, (%rdi,%ymm0,8) {%k1}
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
;
@@ -2908,12 +2908,12 @@ define void @test_scatter_setcc_split(double* %base, <16 x i32> %ind, <16 x i32>
; SKX_32-NEXT: subl $64, %esp
; SKX_32-NEXT: vmovapd 72(%ebp), %zmm3
; SKX_32-NEXT: movl 8(%ebp), %eax
; SKX_32-NEXT: vextractf64x4 $1, %zmm0, %ymm4
; SKX_32-NEXT: vextracti64x4 $1, %zmm1, %ymm5
; SKX_32-NEXT: vptestnmd %ymm5, %ymm5, %k1
; SKX_32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
; SKX_32-NEXT: vptestnmd %ymm4, %ymm4, %k1
; SKX_32-NEXT: vptestnmd %ymm1, %ymm1, %k2
; SKX_32-NEXT: vscatterdpd %zmm2, (%eax,%ymm0,8) {%k2}
; SKX_32-NEXT: vscatterdpd %zmm3, (%eax,%ymm4,8) {%k1}
; SKX_32-NEXT: vextractf64x4 $1, %zmm0, %ymm0
; SKX_32-NEXT: vscatterdpd %zmm3, (%eax,%ymm0,8) {%k1}
; SKX_32-NEXT: movl %ebp, %esp
; SKX_32-NEXT: popl %ebp
; SKX_32-NEXT: .cfi_def_cfa %esp, 4

View File

@@ -467,10 +467,9 @@ define <8 x double> @load_v8f64_v8i16(<8 x i16> %trigger, <8 x double>* %addr, <
; AVX1-NEXT: vpmovsxdq %xmm3, %xmm3
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3
; AVX1-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
; AVX1-NEXT: vpmovsxdq %xmm0, %xmm4
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
; AVX1-NEXT: vpmovsxwq %xmm0, %xmm4
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; AVX1-NEXT: vpmovsxwq %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0
; AVX1-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm4
; AVX1-NEXT: vblendvpd %ymm0, %ymm4, %ymm1, %ymm0
@@ -486,8 +485,7 @@ define <8 x double> @load_v8f64_v8i16(<8 x i16> %trigger, <8 x double>* %addr, <
; AVX2-NEXT: vpmovsxwd %xmm3, %xmm3
; AVX2-NEXT: vpmovsxdq %xmm3, %ymm3
; AVX2-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm0
; AVX2-NEXT: vpmovsxwd %xmm0, %xmm0
; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
; AVX2-NEXT: vpmovsxwq %xmm0, %ymm0
; AVX2-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm4
; AVX2-NEXT: vblendvpd %ymm0, %ymm4, %ymm1, %ymm0
; AVX2-NEXT: vmaskmovpd 32(%rdi), %ymm3, %ymm1
@@ -1792,10 +1790,9 @@ define <8 x i64> @load_v8i64_v8i16(<8 x i16> %trigger, <8 x i64>* %addr, <8 x i6
; AVX1-NEXT: vpmovsxdq %xmm3, %xmm3
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3
; AVX1-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
; AVX1-NEXT: vpmovsxdq %xmm0, %xmm4
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
; AVX1-NEXT: vpmovsxwq %xmm0, %xmm4
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; AVX1-NEXT: vpmovsxwq %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0
; AVX1-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm4
; AVX1-NEXT: vblendvpd %ymm0, %ymm4, %ymm1, %ymm0
@@ -1811,8 +1808,7 @@ define <8 x i64> @load_v8i64_v8i16(<8 x i16> %trigger, <8 x i64>* %addr, <8 x i6
; AVX2-NEXT: vpmovsxwd %xmm3, %xmm3
; AVX2-NEXT: vpmovsxdq %xmm3, %ymm3
; AVX2-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm0
; AVX2-NEXT: vpmovsxwd %xmm0, %xmm0
; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
; AVX2-NEXT: vpmovsxwq %xmm0, %ymm0
; AVX2-NEXT: vpmaskmovq (%rdi), %ymm0, %ymm4
; AVX2-NEXT: vblendvpd %ymm0, %ymm4, %ymm1, %ymm0
; AVX2-NEXT: vpmaskmovq 32(%rdi), %ymm3, %ymm1