[X86][FPEnv] Teach X86 mask compare intrinsics to respect strict FP semantics.
When we use the mask compare intrinsics under the strict FP option, the masked elements shouldn't raise any exception, so we can't replace the intrinsic with a full compare plus an "and" operation.

Reviewed By: craig.topper

Differential Revision: https://reviews.llvm.org/D85385
parent 99ed94570d
commit 72838e8fb2
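To make the constraint concrete, here is a small hand-written LLVM IR sketch (illustrative only, not part of the diff; the function name @masked_cmp and the value %m are invented, and the signature follows the new 512-bit intrinsic defined below). Under strict FP a masked-off lane must not be compared at all, for example because it may hold a signaling NaN, so the call cannot legally be expanded into a full 16-lane compare whose result is and'ed with %m:

declare <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float>, <16 x float>, i32, <16 x i1>, i32)

define <16 x i1> @masked_cmp(<16 x float> %a, <16 x float> %b, <16 x i1> %m) strictfp {
  ; Third operand is the compare predicate (1 = LT_OS), last operand the
  ; rounding mode (4 = current direction). Lanes where %m is false must
  ; not raise an FP exception.
  %r = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %a, <16 x float> %b, i32 1, <16 x i1> %m, i32 4) strictfp
  ret <16 x i1> %r
}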
@@ -4749,26 +4749,26 @@ let TargetPrefix = "x86" in {
 let TargetPrefix = "x86" in {
   // NOTE: These comparison intrinsics are not used by clang as long as the
   //       distinction in signaling behaviour is not implemented.
-  def int_x86_avx512_cmp_ps_512 :
+  def int_x86_avx512_mask_cmp_ps_512 :
       Intrinsic<[llvm_v16i1_ty], [llvm_v16f32_ty, llvm_v16f32_ty,
-                 llvm_i32_ty, llvm_i32_ty],
-                [IntrNoMem, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>]>;
-  def int_x86_avx512_cmp_pd_512 :
+                 llvm_i32_ty, llvm_v16i1_ty, llvm_i32_ty],
+                [IntrNoMem, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<4>>]>;
+  def int_x86_avx512_mask_cmp_pd_512 :
       Intrinsic<[llvm_v8i1_ty], [llvm_v8f64_ty, llvm_v8f64_ty,
-                 llvm_i32_ty, llvm_i32_ty],
-                [IntrNoMem, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>]>;
-  def int_x86_avx512_cmp_ps_256 :
+                 llvm_i32_ty, llvm_v8i1_ty, llvm_i32_ty],
+                [IntrNoMem, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<4>>]>;
+  def int_x86_avx512_mask_cmp_ps_256 :
       Intrinsic<[llvm_v8i1_ty], [llvm_v8f32_ty, llvm_v8f32_ty,
-                 llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<2>>]>;
-  def int_x86_avx512_cmp_pd_256 :
+                 llvm_i32_ty, llvm_v8i1_ty], [IntrNoMem, ImmArg<ArgIndex<2>>]>;
+  def int_x86_avx512_mask_cmp_pd_256 :
       Intrinsic<[llvm_v4i1_ty], [llvm_v4f64_ty, llvm_v4f64_ty,
-                 llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<2>>]>;
-  def int_x86_avx512_cmp_ps_128 :
+                 llvm_i32_ty, llvm_v4i1_ty], [IntrNoMem, ImmArg<ArgIndex<2>>]>;
+  def int_x86_avx512_mask_cmp_ps_128 :
       Intrinsic<[llvm_v4i1_ty], [llvm_v4f32_ty, llvm_v4f32_ty,
-                 llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<2>>]>;
-  def int_x86_avx512_cmp_pd_128 :
+                 llvm_i32_ty, llvm_v4i1_ty], [IntrNoMem, ImmArg<ArgIndex<2>>]>;
+  def int_x86_avx512_mask_cmp_pd_128 :
       Intrinsic<[llvm_v2i1_ty], [llvm_v2f64_ty, llvm_v2f64_ty,
-                 llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<2>>]>;
+                 llvm_i32_ty, llvm_v2i1_ty], [IntrNoMem, ImmArg<ArgIndex<2>>]>;

   def int_x86_avx512_mask_cmp_ss :
       GCCBuiltin<"__builtin_ia32_cmpss_mask">,
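For reference, the signature change on the 512-bit pd variant written as plain IR declarations (both forms appear verbatim in the updated tests further down): the vXi1 mask operand is inserted between the predicate immediate and the trailing SAE/rounding immediate, which is why the second ImmArg index moves from 3 to 4.

; Old (unmasked) form that is now autoupgraded:
declare <8 x i1> @llvm.x86.avx512.cmp.pd.512(<8 x double>, <8 x double>, i32, i32)
; New masked form:
declare <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double>, <8 x double>, i32, <8 x i1>, i32)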
@@ -68,6 +68,19 @@ static bool UpgradeX86IntrinsicsWith8BitMask(Function *F, Intrinsic::ID IID,
   return true;
 }

+// Upgrade the declaration of fp compare intrinsics that change return type
+// from scalar to vXi1 mask.
+static bool UpgradeX86MaskedFPCompare(Function *F, Intrinsic::ID IID,
+                                      Function *&NewFn) {
+  // Check if the return type is a vector.
+  if (F->getReturnType()->isVectorTy())
+    return false;
+
+  rename(F);
+  NewFn = Intrinsic::getDeclaration(F->getParent(), IID);
+  return true;
+}
+
 static bool ShouldUpgradeX86Intrinsic(Function *F, StringRef Name) {
   // All of the intrinsics matches below should be marked with which llvm
   // version started autoupgrading them. At some point in the future we would
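A concrete instance of what this helper handles, taken from the test declarations further down: the LLVM 7-era masked compares returned a scalar integer mask, and the declaration upgrade rewrites them to the vXi1 form (the call sites are then fixed up separately in UpgradeIntrinsicCall):

; 7.0-era declaration: scalar i8 result, i8 mask operand.
declare i8 @llvm.x86.avx512.mask.cmp.ps.256(<8 x float>, <8 x float>, i32, i8)
; Upgraded declaration: <8 x i1> result and mask.
declare <8 x i1> @llvm.x86.avx512.mask.cmp.ps.256(<8 x float>, <8 x float>, i32, <8 x i1>)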
@@ -241,7 +254,7 @@ static bool ShouldUpgradeX86Intrinsic(Function *F, StringRef Name) {
       Name.startswith("avx512.mask.cmp.d") || // Added in 5.0
       Name.startswith("avx512.mask.cmp.q") || // Added in 5.0
       Name.startswith("avx512.mask.cmp.w") || // Added in 5.0
-      Name.startswith("avx512.mask.cmp.p") || // Added in 7.0
+      Name.startswith("avx512.cmp.p") || // Added in 12.0
       Name.startswith("avx512.mask.ucmp.") || // Added in 5.0
       Name.startswith("avx512.cvtb2mask.") || // Added in 7.0
       Name.startswith("avx512.cvtw2mask.") || // Added in 7.0
@@ -456,6 +469,24 @@ static bool UpgradeX86IntrinsicFunction(Function *F, StringRef Name,
   if (Name == "avx2.mpsadbw") // Added in 3.6
     return UpgradeX86IntrinsicsWith8BitMask(F, Intrinsic::x86_avx2_mpsadbw,
                                             NewFn);
+  if (Name == "avx512.mask.cmp.pd.128") // Added in 7.0
+    return UpgradeX86MaskedFPCompare(F, Intrinsic::x86_avx512_mask_cmp_pd_128,
+                                     NewFn);
+  if (Name == "avx512.mask.cmp.pd.256") // Added in 7.0
+    return UpgradeX86MaskedFPCompare(F, Intrinsic::x86_avx512_mask_cmp_pd_256,
+                                     NewFn);
+  if (Name == "avx512.mask.cmp.pd.512") // Added in 7.0
+    return UpgradeX86MaskedFPCompare(F, Intrinsic::x86_avx512_mask_cmp_pd_512,
+                                     NewFn);
+  if (Name == "avx512.mask.cmp.ps.128") // Added in 7.0
+    return UpgradeX86MaskedFPCompare(F, Intrinsic::x86_avx512_mask_cmp_ps_128,
+                                     NewFn);
+  if (Name == "avx512.mask.cmp.ps.256") // Added in 7.0
+    return UpgradeX86MaskedFPCompare(F, Intrinsic::x86_avx512_mask_cmp_ps_256,
+                                     NewFn);
+  if (Name == "avx512.mask.cmp.ps.512") // Added in 7.0
+    return UpgradeX86MaskedFPCompare(F, Intrinsic::x86_avx512_mask_cmp_ps_512,
+                                     NewFn);

   // frcz.ss/sd may need to have an argument dropped. Added in 3.2
   if (Name.startswith("xop.vfrcz.ss") && F->arg_size() == 2) {
@@ -2000,38 +2031,36 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
       Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(), IID),
                                { CI->getOperand(0), CI->getArgOperand(1) });
       Rep = ApplyX86MaskOn1BitsVec(Builder, Rep, CI->getArgOperand(2));
-    } else if (IsX86 && Name.startswith("avx512.mask.cmp.p")) {
-      Type *OpTy = CI->getArgOperand(0)->getType();
+    } else if (IsX86 && Name.startswith("avx512.cmp.p")) {
+      SmallVector<Value *, 4> Args(CI->arg_operands().begin(),
+                                   CI->arg_operands().end());
+      Type *OpTy = Args[0]->getType();
       unsigned VecWidth = OpTy->getPrimitiveSizeInBits();
       unsigned EltWidth = OpTy->getScalarSizeInBits();
       Intrinsic::ID IID;
       if (VecWidth == 128 && EltWidth == 32)
-        IID = Intrinsic::x86_avx512_cmp_ps_128;
+        IID = Intrinsic::x86_avx512_mask_cmp_ps_128;
       else if (VecWidth == 256 && EltWidth == 32)
-        IID = Intrinsic::x86_avx512_cmp_ps_256;
+        IID = Intrinsic::x86_avx512_mask_cmp_ps_256;
       else if (VecWidth == 512 && EltWidth == 32)
-        IID = Intrinsic::x86_avx512_cmp_ps_512;
+        IID = Intrinsic::x86_avx512_mask_cmp_ps_512;
       else if (VecWidth == 128 && EltWidth == 64)
-        IID = Intrinsic::x86_avx512_cmp_pd_128;
+        IID = Intrinsic::x86_avx512_mask_cmp_pd_128;
       else if (VecWidth == 256 && EltWidth == 64)
-        IID = Intrinsic::x86_avx512_cmp_pd_256;
+        IID = Intrinsic::x86_avx512_mask_cmp_pd_256;
       else if (VecWidth == 512 && EltWidth == 64)
-        IID = Intrinsic::x86_avx512_cmp_pd_512;
+        IID = Intrinsic::x86_avx512_mask_cmp_pd_512;
       else
         llvm_unreachable("Unexpected intrinsic");

-      SmallVector<Value *, 4> Args;
-      Args.push_back(CI->getArgOperand(0));
-      Args.push_back(CI->getArgOperand(1));
-      Args.push_back(CI->getArgOperand(2));
-      if (CI->getNumArgOperands() == 5)
-        Args.push_back(CI->getArgOperand(4));
+      Value *Mask = Constant::getAllOnesValue(CI->getType());
+      if (VecWidth == 512)
+        std::swap(Mask, Args.back());
+      Args.push_back(Mask);

       Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(), IID),
                                Args);
-      Rep = ApplyX86MaskOn1BitsVec(Builder, Rep, CI->getArgOperand(3));
-    } else if (IsX86 && Name.startswith("avx512.mask.cmp.") &&
-               Name[16] != 'p') {
+    } else if (IsX86 && Name.startswith("avx512.mask.cmp.")) {
      // Integer compare intrinsics.
      unsigned Imm = cast<ConstantInt>(CI->getArgOperand(2))->getZExtValue();
      Rep = upgradeMaskedCompare(Builder, *CI, Imm, true);
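The effect of this branch on a 512-bit call site, shown as a hedged IR sketch that mirrors the test updates below: an all-ones mask is appended, and for the 512-bit variants the std::swap above moves it in front of the trailing SAE operand.

; Before upgrade (no mask operand):
  %0 = call <8 x i1> @llvm.x86.avx512.cmp.pd.512(<8 x double> %a, <8 x double> %b, i32 2, i32 8)
; After upgrade (all-ones mask inserted before the SAE operand):
  %0 = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %a, <8 x double> %b, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 8)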
@@ -3718,6 +3747,30 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
     break;
   }

+  case Intrinsic::x86_avx512_mask_cmp_pd_128:
+  case Intrinsic::x86_avx512_mask_cmp_pd_256:
+  case Intrinsic::x86_avx512_mask_cmp_pd_512:
+  case Intrinsic::x86_avx512_mask_cmp_ps_128:
+  case Intrinsic::x86_avx512_mask_cmp_ps_256:
+  case Intrinsic::x86_avx512_mask_cmp_ps_512: {
+    SmallVector<Value *, 4> Args(CI->arg_operands().begin(),
+                                 CI->arg_operands().end());
+    unsigned NumElts = cast<VectorType>(Args[0]->getType())->getNumElements();
+    Args[3] = getX86MaskVec(Builder, Args[3], NumElts);
+
+    NewCall = Builder.CreateCall(NewFn, Args);
+    Value *Res = ApplyX86MaskOn1BitsVec(Builder, NewCall, nullptr);
+
+    StringRef Name = CI->getName();
+    if (!Name.empty()) {
+      CI->setName(Name + ".old");
+      NewCall->setName(Name);
+    }
+    CI->replaceAllUsesWith(Res);
+    CI->eraseFromParent();
+    return;
+  }
+
   case Intrinsic::thread_pointer: {
     NewCall = Builder.CreateCall(NewFn, {});
     break;
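A sketch of what this case does to an old-style call site (hand-written IR; getX86MaskVec and ApplyX86MaskOn1BitsVec emit the equivalent of the bitcasts shown, and the value names are invented): the scalar mask operand becomes a vXi1 vector, and the vXi1 result is converted back so existing users still see the scalar type.

; Before: scalar i8 mask and result.
  %res = call i8 @llvm.x86.avx512.mask.cmp.ps.256(<8 x float> %a, <8 x float> %b, i32 1, i8 %m)
; After (conceptually; users of %res are redirected to %res.scalar):
  %mask = bitcast i8 %m to <8 x i1>
  %res.vec = call <8 x i1> @llvm.x86.avx512.mask.cmp.ps.256(<8 x float> %a, <8 x float> %b, i32 1, <8 x i1> %mask)
  %res.scalar = bitcast <8 x i1> %res.vec to i8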
@@ -521,9 +521,9 @@ namespace {
   // type.
   static bool isLegalMaskCompare(SDNode *N, const X86Subtarget *Subtarget) {
     unsigned Opcode = N->getOpcode();
-    if (Opcode == X86ISD::CMPM || Opcode == X86ISD::STRICT_CMPM ||
-        Opcode == ISD::SETCC || Opcode == X86ISD::CMPM_SAE ||
-        Opcode == X86ISD::VFPCLASS) {
+    if (Opcode == X86ISD::CMPM || Opcode == X86ISD::CMPMM ||
+        Opcode == X86ISD::STRICT_CMPM || Opcode == ISD::SETCC ||
+        Opcode == X86ISD::CMPMM_SAE || Opcode == X86ISD::VFPCLASS) {
       // We can get 256-bit 8 element types here without VLX being enabled. When
       // this happens we will use 512-bit operations and the mask will not be
       // zero extended.
@@ -24709,20 +24709,21 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
     case CMP_MASK_CC: {
       MVT MaskVT = Op.getSimpleValueType();
       SDValue CC = Op.getOperand(3);
+      SDValue Mask = Op.getOperand(4);
       // We specify 2 possible opcodes for intrinsics with rounding modes.
       // First, we check if the intrinsic may have non-default rounding mode,
       // (IntrData->Opc1 != 0), then we check the rounding mode operand.
       if (IntrData->Opc1 != 0) {
-        SDValue Sae = Op.getOperand(4);
+        SDValue Sae = Op.getOperand(5);
         if (isRoundModeSAE(Sae))
           return DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
-                             Op.getOperand(2), CC, Sae);
+                             Op.getOperand(2), CC, Mask, Sae);
         if (!isRoundModeCurDirection(Sae))
           return SDValue();
       }
       //default rounding mode
       return DAG.getNode(IntrData->Opc0, dl, MaskVT,
-                         {Op.getOperand(1), Op.getOperand(2), CC});
+                         {Op.getOperand(1), Op.getOperand(2), CC, Mask});
     }
     case CMP_MASK_SCALAR_CC: {
       SDValue Src1 = Op.getOperand(1);
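For orientation, the operand layout this lowering now expects, annotated on the 512-bit form used by the tests (a hand-written sketch; the operand numbers are the 1-based positions read by the code above, and 8 is the "no exceptions"/SAE rounding encoding):

  %r = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(
         <16 x float> %a,  ; operand 1
         <16 x float> %b,  ; operand 2
         i32 2,            ; operand 3: compare predicate (2 = LE_OS)
         <16 x i1> %m,     ; operand 4: mask, new in this patch
         i32 8)            ; operand 5: SAE/rounding immediate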
@@ -30302,8 +30303,9 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   NODE_NAME_CASE(COMI)
   NODE_NAME_CASE(UCOMI)
   NODE_NAME_CASE(CMPM)
+  NODE_NAME_CASE(CMPMM)
   NODE_NAME_CASE(STRICT_CMPM)
-  NODE_NAME_CASE(CMPM_SAE)
+  NODE_NAME_CASE(CMPMM_SAE)
   NODE_NAME_CASE(SETCC)
   NODE_NAME_CASE(SETCC_CARRY)
   NODE_NAME_CASE(FSETCC)
@@ -384,8 +384,10 @@ namespace llvm {
     /// Vector comparison generating mask bits for fp and
     /// integer signed and unsigned data types.
     CMPM,
-    // Vector comparison with SAE for FP values
-    CMPM_SAE,
+    // Vector mask comparison generating mask bits for FP values.
+    CMPMM,
+    // Vector mask comparison with SAE for FP values.
+    CMPMM_SAE,

     // Arithmetic operations with FLAGS results.
     ADD,
@@ -1150,39 +1150,6 @@ X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
     }
     break;
   }
-  case Intrinsic::x86_avx512_cmp_pd_128:
-  case Intrinsic::x86_avx512_cmp_pd_256:
-  case Intrinsic::x86_avx512_cmp_pd_512:
-  case Intrinsic::x86_avx512_cmp_ps_128:
-  case Intrinsic::x86_avx512_cmp_ps_256:
-  case Intrinsic::x86_avx512_cmp_ps_512: {
-    // Folding cmp(sub(a,b),0) -> cmp(a,b) and cmp(0,sub(a,b)) -> cmp(b,a)
-    Value *Arg0 = II.getArgOperand(0);
-    Value *Arg1 = II.getArgOperand(1);
-    bool Arg0IsZero = match(Arg0, PatternMatch::m_PosZeroFP());
-    if (Arg0IsZero)
-      std::swap(Arg0, Arg1);
-    Value *A, *B;
-    // This fold requires only the NINF(not +/- inf) since inf minus
-    // inf is nan.
-    // NSZ(No Signed Zeros) is not needed because zeros of any sign are
-    // equal for both compares.
-    // NNAN is not needed because nans compare the same for both compares.
-    // The compare intrinsic uses the above assumptions and therefore
-    // doesn't require additional flags.
-    if ((match(Arg0,
-               PatternMatch::m_OneUse(PatternMatch::m_FSub(
-                   PatternMatch::m_Value(A), PatternMatch::m_Value(B)))) &&
-         match(Arg1, PatternMatch::m_PosZeroFP()) && isa<Instruction>(Arg0) &&
-         cast<Instruction>(Arg0)->getFastMathFlags().noInfs())) {
-      if (Arg0IsZero)
-        std::swap(A, B);
-      IC.replaceOperand(II, 0, A);
-      IC.replaceOperand(II, 1, B);
-      return &II;
-    }
-    break;
-  }

   case Intrinsic::x86_avx512_add_ps_512:
   case Intrinsic::x86_avx512_div_ps_512:
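In IR terms the dropped combine performed roughly the following rewrite (a sketch using the old intrinsic names it matched; value names are invented, and only the ninf flag on the fsub was required, for the reasons given in the deleted comments):

; Matched: compare of (a - b) against +0.0 ...
  %sub = fsub ninf <16 x float> %a, %b
  %cmp = call <16 x i1> @llvm.x86.avx512.cmp.ps.512(<16 x float> %sub, <16 x float> zeroinitializer, i32 2, i32 4)
; ... was rewritten to compare a and b directly:
  %cmp.direct = call <16 x i1> @llvm.x86.avx512.cmp.ps.512(<16 x float> %a, <16 x float> %b, i32 2, i32 4)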
@@ -2494,10 +2494,6 @@ def X86cmpm_su : PatFrag<(ops node:$src1, node:$src2, node:$cc),
                          (X86cmpm node:$src1, node:$src2, node:$cc), [{
   return N->hasOneUse();
 }]>;
-def X86cmpmSAE_su : PatFrag<(ops node:$src1, node:$src2, node:$cc),
-                            (X86cmpmSAE node:$src1, node:$src2, node:$cc), [{
-  return N->hasOneUse();
-}]>;

 def X86cmpm_imm_commute : SDNodeXForm<timm, [{
   uint8_t Imm = X86::getSwappedVCMPImm(N->getZExtValue() & 0x1f);
@@ -2564,19 +2560,71 @@ let Uses = [MXCSR], mayRaiseFPException = 1 in {
             (!cast<Instruction>(Name#_.ZSuffix#"rmbik") _.KRCWM:$mask,
                                                         _.RC:$src1, addr:$src2,
                                                         (X86cmpm_imm_commute timm:$cc))>;

+  // Patterns for mask intrinsics.
+  def : Pat<(X86cmpmm (_.VT _.RC:$src1), (_.VT _.RC:$src2), timm:$cc,
+                      (_.KVT immAllOnesV)),
+            (!cast<Instruction>(Name#_.ZSuffix#"rri") _.RC:$src1, _.RC:$src2, timm:$cc)>;
+
+  def : Pat<(X86cmpmm (_.VT _.RC:$src1), (_.VT _.RC:$src2), timm:$cc, _.KRCWM:$mask),
+            (!cast<Instruction>(Name#_.ZSuffix#"rrik") _.KRCWM:$mask, _.RC:$src1,
+                                                       _.RC:$src2, timm:$cc)>;
+
+  def : Pat<(X86cmpmm (_.VT _.RC:$src1), (_.VT (_.LdFrag addr:$src2)), timm:$cc,
+                      (_.KVT immAllOnesV)),
+            (!cast<Instruction>(Name#_.ZSuffix#"rmi") _.RC:$src1, addr:$src2, timm:$cc)>;
+
+  def : Pat<(X86cmpmm (_.VT _.RC:$src1), (_.VT (_.LdFrag addr:$src2)), timm:$cc,
+                      _.KRCWM:$mask),
+            (!cast<Instruction>(Name#_.ZSuffix#"rmik") _.KRCWM:$mask, _.RC:$src1,
+                                                       addr:$src2, timm:$cc)>;
+
+  def : Pat<(X86cmpmm (_.VT _.RC:$src1), (_.VT (_.BroadcastLdFrag addr:$src2)), timm:$cc,
+                      (_.KVT immAllOnesV)),
+            (!cast<Instruction>(Name#_.ZSuffix#"rmbi") _.RC:$src1, addr:$src2, timm:$cc)>;
+
+  def : Pat<(X86cmpmm (_.VT _.RC:$src1), (_.VT (_.BroadcastLdFrag addr:$src2)), timm:$cc,
+                      _.KRCWM:$mask),
+            (!cast<Instruction>(Name#_.ZSuffix#"rmbik") _.KRCWM:$mask, _.RC:$src1,
+                                                        addr:$src2, timm:$cc)>;
+
+  // Patterns for mask intrinsics with loads in other operand.
+  def : Pat<(X86cmpmm (_.VT (_.LdFrag addr:$src2)), (_.VT _.RC:$src1), timm:$cc,
+                      (_.KVT immAllOnesV)),
+            (!cast<Instruction>(Name#_.ZSuffix#"rmi") _.RC:$src1, addr:$src2,
+                                                      (X86cmpm_imm_commute timm:$cc))>;
+
+  def : Pat<(X86cmpmm (_.VT (_.LdFrag addr:$src2)), (_.VT _.RC:$src1), timm:$cc,
+                      _.KRCWM:$mask),
+            (!cast<Instruction>(Name#_.ZSuffix#"rmik") _.KRCWM:$mask,
+                                                       _.RC:$src1, addr:$src2,
+                                                       (X86cmpm_imm_commute timm:$cc))>;
+
+  def : Pat<(X86cmpmm (_.VT (_.BroadcastLdFrag addr:$src2)), (_.VT _.RC:$src1), timm:$cc,
+                      (_.KVT immAllOnesV)),
+            (!cast<Instruction>(Name#_.ZSuffix#"rmbi") _.RC:$src1, addr:$src2,
+                                                       (X86cmpm_imm_commute timm:$cc))>;
+
+  def : Pat<(X86cmpmm (_.VT (_.BroadcastLdFrag addr:$src2)), (_.VT _.RC:$src1), timm:$cc,
+                      _.KRCWM:$mask),
+            (!cast<Instruction>(Name#_.ZSuffix#"rmbik") _.KRCWM:$mask,
+                                                        _.RC:$src1, addr:$src2,
+                                                        (X86cmpm_imm_commute timm:$cc))>;
 }

 multiclass avx512_vcmp_sae<X86FoldableSchedWrite sched, X86VectorVTInfo _> {
   // comparison code form (VCMP[EQ/LT/LE/...]
   let Uses = [MXCSR] in
-  defm  rrib  : AVX512_maskable_cmp<0xC2, MRMSrcReg, _,
-                     (outs _.KRC:$dst),(ins _.RC:$src1, _.RC:$src2, u8imm:$cc),
+  defm  rrib  : AVX512_maskable_custom_cmp<0xC2, MRMSrcReg, (outs _.KRC:$dst),
+                     (ins _.RC:$src1, _.RC:$src2, u8imm:$cc),
+                     (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2, u8imm:$cc),
                      "vcmp"#_.Suffix,
                      "$cc, {sae}, $src2, $src1",
                      "$src1, $src2, {sae}, $cc",
-                     (X86cmpmSAE (_.VT _.RC:$src1), (_.VT _.RC:$src2), timm:$cc),
-                     (X86cmpmSAE_su (_.VT _.RC:$src1), (_.VT _.RC:$src2),
-                                    timm:$cc)>,
+                     [(set _.KRC:$dst, (X86cmpmmSAE (_.VT _.RC:$src1),
+                                        (_.VT _.RC:$src2), timm:$cc, (_.KVT immAllOnesV)))],
+                     [(set _.KRC:$dst, (X86cmpmmSAE (_.VT _.RC:$src1),
+                                        (_.VT _.RC:$src2), timm:$cc, _.KRCWM:$mask))]>,
                      EVEX_B, Sched<[sched]>;
 }
@@ -207,16 +207,21 @@ def X86CmpMaskCC :
       SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCVecEltisVT<0, i1>,
                        SDTCisVec<1>, SDTCisSameAs<2, 1>,
                        SDTCisSameNumEltsAs<0, 1>, SDTCisVT<3, i8>]>;
+def X86MaskCmpMaskCC :
+      SDTypeProfile<1, 4, [SDTCisVec<0>, SDTCVecEltisVT<0, i1>,
+                       SDTCisVec<1>, SDTCisSameAs<2, 1>,
+                       SDTCisSameNumEltsAs<0, 1>, SDTCisVT<3, i8>, SDTCisSameAs<4, 0>]>;
 def X86CmpMaskCCScalar :
       SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisFP<1>, SDTCisSameAs<1, 2>,
                        SDTCisVT<3, i8>]>;

 def X86cmpm     : SDNode<"X86ISD::CMPM", X86CmpMaskCC>;
+def X86cmpmm    : SDNode<"X86ISD::CMPMM", X86MaskCmpMaskCC>;
 def X86strict_cmpm : SDNode<"X86ISD::STRICT_CMPM", X86CmpMaskCC, [SDNPHasChain]>;
 def X86any_cmpm : PatFrags<(ops node:$src1, node:$src2, node:$src3),
                            [(X86strict_cmpm node:$src1, node:$src2, node:$src3),
                             (X86cmpm node:$src1, node:$src2, node:$src3)]>;
-def X86cmpmSAE  : SDNode<"X86ISD::CMPM_SAE", X86CmpMaskCC>;
+def X86cmpmmSAE : SDNode<"X86ISD::CMPMM_SAE", X86MaskCmpMaskCC>;
 def X86cmpms    : SDNode<"X86ISD::FSETCCM", X86CmpMaskCCScalar>;
 def X86cmpmsSAE : SDNode<"X86ISD::FSETCCM_SAE", X86CmpMaskCCScalar>;
@@ -417,12 +417,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
   X86_INTRINSIC_DATA(avx2_psrlv_q_256, INTR_TYPE_2OP, X86ISD::VSRLV, 0),
   X86_INTRINSIC_DATA(avx512_add_pd_512, INTR_TYPE_2OP, ISD::FADD, X86ISD::FADD_RND),
   X86_INTRINSIC_DATA(avx512_add_ps_512, INTR_TYPE_2OP, ISD::FADD, X86ISD::FADD_RND),
-  X86_INTRINSIC_DATA(avx512_cmp_pd_128, CMP_MASK_CC, X86ISD::CMPM, 0),
-  X86_INTRINSIC_DATA(avx512_cmp_pd_256, CMP_MASK_CC, X86ISD::CMPM, 0),
-  X86_INTRINSIC_DATA(avx512_cmp_pd_512, CMP_MASK_CC, X86ISD::CMPM, X86ISD::CMPM_SAE),
-  X86_INTRINSIC_DATA(avx512_cmp_ps_128, CMP_MASK_CC, X86ISD::CMPM, 0),
-  X86_INTRINSIC_DATA(avx512_cmp_ps_256, CMP_MASK_CC, X86ISD::CMPM, 0),
-  X86_INTRINSIC_DATA(avx512_cmp_ps_512, CMP_MASK_CC, X86ISD::CMPM, X86ISD::CMPM_SAE),
   X86_INTRINSIC_DATA(avx512_conflict_d_128, INTR_TYPE_1OP, X86ISD::CONFLICT, 0),
   X86_INTRINSIC_DATA(avx512_conflict_d_256, INTR_TYPE_1OP, X86ISD::CONFLICT, 0),
   X86_INTRINSIC_DATA(avx512_conflict_d_512, INTR_TYPE_1OP, X86ISD::CONFLICT, 0),
@@ -464,6 +458,12 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
                      X86ISD::FADDS, X86ISD::FADDS_RND),
   X86_INTRINSIC_DATA(avx512_mask_add_ss_round, INTR_TYPE_SCALAR_MASK,
                      X86ISD::FADDS, X86ISD::FADDS_RND),
+  X86_INTRINSIC_DATA(avx512_mask_cmp_pd_128, CMP_MASK_CC, X86ISD::CMPMM, 0),
+  X86_INTRINSIC_DATA(avx512_mask_cmp_pd_256, CMP_MASK_CC, X86ISD::CMPMM, 0),
+  X86_INTRINSIC_DATA(avx512_mask_cmp_pd_512, CMP_MASK_CC, X86ISD::CMPMM, X86ISD::CMPMM_SAE),
+  X86_INTRINSIC_DATA(avx512_mask_cmp_ps_128, CMP_MASK_CC, X86ISD::CMPMM, 0),
+  X86_INTRINSIC_DATA(avx512_mask_cmp_ps_256, CMP_MASK_CC, X86ISD::CMPMM, 0),
+  X86_INTRINSIC_DATA(avx512_mask_cmp_ps_512, CMP_MASK_CC, X86ISD::CMPMM, X86ISD::CMPMM_SAE),
   X86_INTRINSIC_DATA(avx512_mask_cmp_sd, CMP_MASK_SCALAR_CC,
                      X86ISD::FSETCCM, X86ISD::FSETCCM_SAE),
   X86_INTRINSIC_DATA(avx512_mask_cmp_ss, CMP_MASK_SCALAR_CC,
@@ -22,13 +22,13 @@ define zeroext i16 @cmp_kor_seq_16(<16 x float> %a, <16 x float> %b, <16 x float
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq
 entry:
-  %0 = call <16 x i1> @llvm.x86.avx512.cmp.ps.512(<16 x float> %a, <16 x float> %x, i32 13, i32 4)
+  %0 = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %a, <16 x float> %x, i32 13, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 4)
   %1 = bitcast <16 x i1> %0 to i16
-  %2 = call <16 x i1> @llvm.x86.avx512.cmp.ps.512(<16 x float> %b, <16 x float> %x, i32 13, i32 4)
+  %2 = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %b, <16 x float> %x, i32 13, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 4)
   %3 = bitcast <16 x i1> %2 to i16
-  %4 = call <16 x i1> @llvm.x86.avx512.cmp.ps.512(<16 x float> %c, <16 x float> %x, i32 13, i32 4)
+  %4 = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %c, <16 x float> %x, i32 13, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 4)
   %5 = bitcast <16 x i1> %4 to i16
-  %6 = call <16 x i1> @llvm.x86.avx512.cmp.ps.512(<16 x float> %d, <16 x float> %x, i32 13, i32 4)
+  %6 = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %d, <16 x float> %x, i32 13, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 4)
   %7 = bitcast <16 x i1> %6 to i16
   %8 = bitcast i16 %1 to <16 x i1>
   %9 = bitcast i16 %3 to <16 x i1>
@@ -46,7 +46,7 @@ entry:
 }

 ; Function Attrs: nounwind readnone
-declare <16 x i1> @llvm.x86.avx512.cmp.ps.512(<16 x float>, <16 x float>, i32, i32) #1
+declare <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float>, <16 x float>, i32, <16 x i1>, i32) #1

 attributes #0 = { nounwind readnone uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="knl" "target-features"="+adx,+aes,+avx,+avx2,+avx512cd,+avx512er,+avx512f,+avx512pf,+bmi,+bmi2,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+prefetchwt1,+rdrnd,+rdseed,+rtm,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { nounwind readnone }
@@ -1,40 +1,23 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512F
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512DQ
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512vl | FileCheck %s --check-prefixes=CHECK,AVX512VL
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512vl,+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512DQVL

define <4 x i64> @PR32546(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x float> %d) {
; AVX512F-LABEL: PR32546:
; AVX512F:       ## %bb.0: ## %entry
; AVX512F-NEXT:    ## kill: def $ymm3 killed $ymm3 def $zmm3
; AVX512F-NEXT:    ## kill: def $ymm2 killed $ymm2 def $zmm2
; AVX512F-NEXT:    ## kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT:    vcmpltps %zmm3, %zmm2, %k1
; AVX512F-NEXT:    vcmpltps %zmm1, %zmm0, %k0 {%k1}
; AVX512F-NEXT:    kmovw %k0, %eax
; AVX512F-NEXT:    movzbl %al, %eax
; AVX512F-NEXT:    vmovd %eax, %xmm0
; AVX512F-NEXT:    vpbroadcastd %xmm0, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512DQ-LABEL: PR32546:
; AVX512DQ:       ## %bb.0: ## %entry
; AVX512DQ-NEXT:    ## kill: def $ymm3 killed $ymm3 def $zmm3
; AVX512DQ-NEXT:    ## kill: def $ymm2 killed $ymm2 def $zmm2
; AVX512DQ-NEXT:    ## kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512DQ-NEXT:    ## kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512DQ-NEXT:    vcmpltps %zmm3, %zmm2, %k1
; AVX512DQ-NEXT:    vcmpltps %zmm1, %zmm0, %k0 {%k1}
; AVX512DQ-NEXT:    kmovb %k0, %eax
; AVX512DQ-NEXT:    vmovd %eax, %xmm0
; AVX512DQ-NEXT:    vpbroadcastd %xmm0, %ymm0
; AVX512DQ-NEXT:    retq
; AVX512VL-LABEL: PR32546:
; AVX512VL:       ## %bb.0: ## %entry
; AVX512VL-NEXT:    vcmpltps %ymm1, %ymm0, %k0
; AVX512VL-NEXT:    vcmpltps %ymm3, %ymm2, %k1
; AVX512VL-NEXT:    kandw %k0, %k1, %k0
; AVX512VL-NEXT:    kmovw %k0, %eax
; AVX512VL-NEXT:    movzbl %al, %eax
; AVX512VL-NEXT:    vpbroadcastd %eax, %ymm0
; AVX512VL-NEXT:    retq
;
; AVX512DQVL-LABEL: PR32546:
; AVX512DQVL:       ## %bb.0: ## %entry
; AVX512DQVL-NEXT:    vcmpltps %ymm1, %ymm0, %k0
; AVX512DQVL-NEXT:    vcmpltps %ymm3, %ymm2, %k1
; AVX512DQVL-NEXT:    vcmpltps %ymm1, %ymm0, %k0 {%k1}
; AVX512DQVL-NEXT:    kandb %k0, %k1, %k0
; AVX512DQVL-NEXT:    kmovb %k0, %eax
; AVX512DQVL-NEXT:    vpbroadcastd %eax, %ymm0
; AVX512DQVL-NEXT:    retq
@@ -48,4 +31,108 @@ entry:
   %3 = bitcast <8 x i32> %vecinit7.i to <4 x i64>
   ret <4 x i64> %3
 }
-declare i8 @llvm.x86.avx512.mask.cmp.ps.256(<8 x float>, <8 x float>, i32, i8)
+
+define void @PR32547(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x float> %d, float* %p) {
+; CHECK-LABEL: PR32547:
+; CHECK:       ## %bb.0: ## %entry
+; CHECK-NEXT:    vcmpltps %ymm1, %ymm0, %k0
+; CHECK-NEXT:    vcmpltps %ymm3, %ymm2, %k1
+; CHECK-NEXT:    kunpckbw %k1, %k0, %k1
+; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vmovaps %zmm0, (%rdi) {%k1}
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
+entry:
+  %0 = tail call i8 @llvm.x86.avx512.mask.cmp.ps.256(<8 x float> %a, <8 x float> %b, i32 1, i8 -1)
+  %1 = tail call i8 @llvm.x86.avx512.mask.cmp.ps.256(<8 x float> %c, <8 x float> %d, i32 1, i8 -1)
+  %conv.i = zext i8 %0 to i16
+  %conv.i18 = zext i8 %1 to i16
+  %shl = shl nuw i16 %conv.i, 8
+  %or = or i16 %shl, %conv.i18
+  %2 = bitcast float* %p to <16 x float>*
+  %3 = bitcast i16 %or to <16 x i1>
+  tail call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> zeroinitializer, <16 x float>* %2, i32 64, <16 x i1> %3)
+  ret void
+}
+
+define void @PR32547_swap(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x float> %d, float* %p) {
+; CHECK-LABEL: PR32547_swap:
+; CHECK:       ## %bb.0: ## %entry
+; CHECK-NEXT:    vcmpltps %ymm1, %ymm0, %k0
+; CHECK-NEXT:    vcmpltps %ymm3, %ymm2, %k1
+; CHECK-NEXT:    kunpckbw %k1, %k0, %k1
+; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vmovaps %zmm0, (%rdi) {%k1}
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
+entry:
+  %0 = tail call i8 @llvm.x86.avx512.mask.cmp.ps.256(<8 x float> %a, <8 x float> %b, i32 1, i8 -1)
+  %1 = tail call i8 @llvm.x86.avx512.mask.cmp.ps.256(<8 x float> %c, <8 x float> %d, i32 1, i8 -1)
+  %conv.i = zext i8 %0 to i16
+  %conv.i18 = zext i8 %1 to i16
+  %shl = shl nuw i16 %conv.i, 8
+  %or = or i16 %conv.i18, %shl
+  %2 = bitcast float* %p to <16 x float>*
+  %3 = bitcast i16 %or to <16 x i1>
+  tail call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> zeroinitializer, <16 x float>* %2, i32 64, <16 x i1> %3)
+  ret void
+}
+
+define void @mask_cmp_128(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, float* %p) {
+; AVX512VL-LABEL: mask_cmp_128:
+; AVX512VL:       ## %bb.0: ## %entry
+; AVX512VL-NEXT:    vcmpltps %xmm1, %xmm0, %k0
+; AVX512VL-NEXT:    kmovw %k0, %eax
+; AVX512VL-NEXT:    vcmpltps %xmm3, %xmm2, %k0
+; AVX512VL-NEXT:    shlb $4, %al
+; AVX512VL-NEXT:    kmovw %eax, %k1
+; AVX512VL-NEXT:    korw %k1, %k0, %k1
+; AVX512VL-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX512VL-NEXT:    vmovaps %ymm0, (%rdi) {%k1}
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512DQVL-LABEL: mask_cmp_128:
+; AVX512DQVL:       ## %bb.0: ## %entry
+; AVX512DQVL-NEXT:    vcmpltps %xmm1, %xmm0, %k0
+; AVX512DQVL-NEXT:    vcmpltps %xmm3, %xmm2, %k1
+; AVX512DQVL-NEXT:    kshiftlb $4, %k0, %k0
+; AVX512DQVL-NEXT:    korb %k0, %k1, %k1
+; AVX512DQVL-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX512DQVL-NEXT:    vmovaps %ymm0, (%rdi) {%k1}
+; AVX512DQVL-NEXT:    vzeroupper
+; AVX512DQVL-NEXT:    retq
+entry:
+  %0 = tail call i8 @llvm.x86.avx512.mask.cmp.ps.128(<4 x float> %a, <4 x float> %b, i32 1, i8 -1)
+  %1 = tail call i8 @llvm.x86.avx512.mask.cmp.ps.128(<4 x float> %c, <4 x float> %d, i32 1, i8 -1)
+  %shl = shl nuw i8 %0, 4
+  %or = or i8 %1, %shl
+  %2 = bitcast float* %p to <8 x float>*
+  %3 = bitcast i8 %or to <8 x i1>
+  tail call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> zeroinitializer, <8 x float>* %2, i32 64, <8 x i1> %3)
+  ret void
+}
+
+define <16 x float> @mask_cmp_512(<16 x float> %a, <16 x float> %b, <16 x float> %c, <16 x float> %d, float* %p) {
+; CHECK-LABEL: mask_cmp_512:
+; CHECK:       ## %bb.0: ## %entry
+; CHECK-NEXT:    vcmpltps {sae}, %zmm1, %zmm0, %k0
+; CHECK-NEXT:    vcmpltps %zmm3, %zmm2, %k1
+; CHECK-NEXT:    kxnorw %k1, %k0, %k1
+; CHECK-NEXT:    vmovaps (%rdi), %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq
+entry:
+  %0 = tail call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %a, <16 x float> %b, i32 1, i16 -1, i32 8)
+  %1 = tail call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %c, <16 x float> %d, i32 1, i16 -1, i32 4)
+  %2 = bitcast float* %p to <16 x float>*
+  %3 = load <16 x float>, <16 x float>* %2
+  %4 = xor i16 %0, %1
+  %5 = bitcast i16 %4 to <16 x i1>
+  %6 = select <16 x i1> %5, <16 x float> zeroinitializer, <16 x float> %3
+  ret <16 x float> %6
+}
+declare i8 @llvm.x86.avx512.mask.cmp.ps.128(<4 x float>, <4 x float>, i32, i8)
+declare i8 @llvm.x86.avx512.mask.cmp.ps.256(<8 x float>, <8 x float>, i32, i8)
+declare i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float>, <16 x float>, i32, i16, i32)
+declare void @llvm.masked.store.v8f32.p0v8f32(<8 x float>, <8 x float>*, i32, <8 x i1>)
+declare void @llvm.masked.store.v16f32.p0v16f32(<16 x float>, <16 x float>*, i32, <16 x i1>)
@@ -10870,3 +10870,32 @@ define <16 x i32> @test_maskz_expand_d_512(<16 x i32> %data, i16 %mask) {
 }

 declare <16 x i32> @llvm.x86.avx512.mask.expand.d.512(<16 x i32> %data, <16 x i32> %src0, i16 %mask)
+
+define <16 x float> @test_cmp_512(<16 x float> %a, <16 x float> %b, <16 x float> %c, <16 x float> %d, float* %p) {
+; X86-LABEL: test_cmp_512:
+; X86:       ## %bb.0: ## %entry
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vcmpltps {sae}, %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf1,0x7c,0x18,0xc2,0xc1,0x01]
+; X86-NEXT:    vcmpltps %zmm3, %zmm2, %k1 ## encoding: [0x62,0xf1,0x6c,0x48,0xc2,0xcb,0x01]
+; X86-NEXT:    kxnorw %k1, %k0, %k1 ## encoding: [0xc5,0xfc,0x46,0xc9]
+; X86-NEXT:    vmovaps (%eax), %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xc9,0x28,0x00]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_cmp_512:
+; X64:       ## %bb.0: ## %entry
+; X64-NEXT:    vcmpltps {sae}, %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf1,0x7c,0x18,0xc2,0xc1,0x01]
+; X64-NEXT:    vcmpltps %zmm3, %zmm2, %k1 ## encoding: [0x62,0xf1,0x6c,0x48,0xc2,0xcb,0x01]
+; X64-NEXT:    kxnorw %k1, %k0, %k1 ## encoding: [0xc5,0xfc,0x46,0xc9]
+; X64-NEXT:    vmovaps (%rdi), %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xc9,0x28,0x07]
+; X64-NEXT:    retq ## encoding: [0xc3]
+entry:
+  %0 = tail call <16 x i1> @llvm.x86.avx512.cmp.ps.512(<16 x float> %a, <16 x float> %b, i32 1, i32 8)
+  %1 = tail call <16 x i1> @llvm.x86.avx512.cmp.ps.512(<16 x float> %c, <16 x float> %d, i32 1, i32 4)
+  %2 = bitcast float* %p to <16 x float>*
+  %3 = load <16 x float>, <16 x float>* %2
+  %4 = xor <16 x i1> %0, %1
+  %5 = select <16 x i1> %4, <16 x float> zeroinitializer, <16 x float> %3
+  ret <16 x float> %5
+}
+
+declare <16 x i1> @llvm.x86.avx512.cmp.ps.512(<16 x float>, <16 x float>, i32, i32)
@@ -1046,11 +1046,11 @@ define i16 @test_cmpps(<16 x float> %a, <16 x float> %b) {
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    ret{{[l|q]}}
-  %res = call <16 x i1> @llvm.x86.avx512.cmp.ps.512(<16 x float> %a, <16 x float> %b, i32 2, i32 8)
+  %res = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %a, <16 x float> %b, i32 2, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 8)
   %1 = bitcast <16 x i1> %res to i16
   ret i16 %1
 }
-declare <16 x i1> @llvm.x86.avx512.cmp.ps.512(<16 x float>, <16 x float>, i32, i32)
+declare <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float>, <16 x float>, i32, <16 x i1>, i32)

 define i8 @test_cmppd(<8 x double> %a, <8 x double> %b) {
 ; CHECK-LABEL: test_cmppd:
@@ -1060,11 +1060,11 @@ define i8 @test_cmppd(<8 x double> %a, <8 x double> %b) {
 ; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    ret{{[l|q]}}
-  %res = call <8 x i1> @llvm.x86.avx512.cmp.pd.512(<8 x double> %a, <8 x double> %b, i32 4, i32 4)
+  %res = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %a, <8 x double> %b, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 4)
   %1 = bitcast <8 x i1> %res to i8
   ret i8 %1
 }
-declare <8 x i1> @llvm.x86.avx512.cmp.pd.512(<8 x double>, <8 x double>, i32, i32)
+declare <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double>, <8 x double>, i32, <8 x i1>, i32)

 ; Function Attrs: nounwind readnone
@@ -7521,9 +7521,9 @@ define <16 x float> @bad_mask_transition(<8 x double> %a, <8 x double> %b, <8 x
 ; X86-NEXT:    .cfi_def_cfa %esp, 4
 ; X86-NEXT:    retl
 entry:
-  %0 = call <8 x i1> @llvm.x86.avx512.cmp.pd.512(<8 x double> %a, <8 x double> %b, i32 17, i32 4)
+  %0 = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %a, <8 x double> %b, i32 17, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 4)
   %1 = bitcast <8 x i1> %0 to i8
-  %2 = call <8 x i1> @llvm.x86.avx512.cmp.pd.512(<8 x double> %c, <8 x double> %d, i32 17, i32 4)
+  %2 = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %c, <8 x double> %d, i32 17, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 4)
   %3 = bitcast <8 x i1> %2 to i8
   %conv = zext i8 %1 to i16
   %conv2 = zext i8 %3 to i16
@@ -7561,7 +7561,7 @@ define <16 x float> @bad_mask_transition_2(<8 x double> %a, <8 x double> %b, <8
 ; X86-NEXT:    .cfi_def_cfa %esp, 4
 ; X86-NEXT:    retl
 entry:
-  %0 = call <8 x i1> @llvm.x86.avx512.cmp.pd.512(<8 x double> %a, <8 x double> %b, i32 17, i32 4)
+  %0 = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %a, <8 x double> %b, i32 17, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 4)
   %1 = bitcast <8 x i1> %0 to i8
   %conv = zext i8 %1 to i16
   %2 = bitcast i16 %conv to <16 x i1>
@@ -17233,3 +17233,90 @@ define <8 x i32> @test_maskz_expand_d_256(<8 x i32> %data, i8 %mask) {
 }

 declare <8 x i32> @llvm.x86.avx512.mask.expand.d.256(<8 x i32> %data, <8 x i32> %src0, i8 %mask)
+
+define void @test_cmp_128(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, float* %p) {
+; X86-LABEL: test_cmp_128:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %ebp # encoding: [0x55]
+; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:    .cfi_offset %ebp, -8
+; X86-NEXT:    movl %esp, %ebp # encoding: [0x89,0xe5]
+; X86-NEXT:    .cfi_def_cfa_register %ebp
+; X86-NEXT:    andl $-16, %esp # encoding: [0x83,0xe4,0xf0]
+; X86-NEXT:    subl $16, %esp # encoding: [0x83,0xec,0x10]
+; X86-NEXT:    movl 24(%ebp), %eax # encoding: [0x8b,0x45,0x18]
+; X86-NEXT:    vcmpltps %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7c,0x08,0xc2,0xc1,0x01]
+; X86-NEXT:    vcmpltps 8(%ebp), %xmm2, %k1 # encoding: [0x62,0xf1,0x6c,0x08,0xc2,0x8d,0x08,0x00,0x00,0x00,0x01]
+; X86-NEXT:    kshiftlw $4, %k1, %k1 # encoding: [0xc4,0xe3,0xf9,0x32,0xc9,0x04]
+; X86-NEXT:    korw %k1, %k0, %k1 # encoding: [0xc5,0xfc,0x45,0xc9]
+; X86-NEXT:    vxorps %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x57,0xc0]
+; X86-NEXT:    vmovaps %ymm0, (%eax) {%k1} # encoding: [0x62,0xf1,0x7c,0x29,0x29,0x00]
+; X86-NEXT:    movl %ebp, %esp # encoding: [0x89,0xec]
+; X86-NEXT:    popl %ebp # encoding: [0x5d]
+; X86-NEXT:    .cfi_def_cfa %esp, 4
+; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_cmp_128:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    vcmpltps %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7c,0x08,0xc2,0xc1,0x01]
+; X64-NEXT:    vcmpltps %xmm3, %xmm2, %k1 # encoding: [0x62,0xf1,0x6c,0x08,0xc2,0xcb,0x01]
+; X64-NEXT:    kshiftlw $4, %k1, %k1 # encoding: [0xc4,0xe3,0xf9,0x32,0xc9,0x04]
+; X64-NEXT:    korw %k1, %k0, %k1 # encoding: [0xc5,0xfc,0x45,0xc9]
+; X64-NEXT:    vxorps %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x57,0xc0]
+; X64-NEXT:    vmovaps %ymm0, (%rdi) {%k1} # encoding: [0x62,0xf1,0x7c,0x29,0x29,0x07]
+; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X64-NEXT:    retq # encoding: [0xc3]
+entry:
+  %0 = tail call <4 x i1> @llvm.x86.avx512.cmp.ps.128(<4 x float> %a, <4 x float> %b, i32 1)
+  %1 = tail call <4 x i1> @llvm.x86.avx512.cmp.ps.128(<4 x float> %c, <4 x float> %d, i32 1)
+  %2 = bitcast float* %p to <8 x float>*
+  %3 = shufflevector <4 x i1> %0, <4 x i1> %1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  tail call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> zeroinitializer, <8 x float>* %2, i32 64, <8 x i1> %3)
+  ret void
+}
+
+define void @test_cmp_256(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x float> %d, float* %p) {
+; X86-LABEL: test_cmp_256:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %ebp # encoding: [0x55]
+; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:    .cfi_offset %ebp, -8
+; X86-NEXT:    movl %esp, %ebp # encoding: [0x89,0xe5]
+; X86-NEXT:    .cfi_def_cfa_register %ebp
+; X86-NEXT:    andl $-32, %esp # encoding: [0x83,0xe4,0xe0]
+; X86-NEXT:    subl $32, %esp # encoding: [0x83,0xec,0x20]
+; X86-NEXT:    movl 40(%ebp), %eax # encoding: [0x8b,0x45,0x28]
+; X86-NEXT:    vcmpltps %ymm1, %ymm0, %k0 # encoding: [0x62,0xf1,0x7c,0x28,0xc2,0xc1,0x01]
+; X86-NEXT:    vcmpltps 8(%ebp), %ymm2, %k1 # encoding: [0x62,0xf1,0x6c,0x28,0xc2,0x8d,0x08,0x00,0x00,0x00,0x01]
+; X86-NEXT:    kunpckbw %k0, %k1, %k1 # encoding: [0xc5,0xf5,0x4b,0xc8]
+; X86-NEXT:    vxorps %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x57,0xc0]
+; X86-NEXT:    vmovaps %zmm0, (%eax) {%k1} # encoding: [0x62,0xf1,0x7c,0x49,0x29,0x00]
+; X86-NEXT:    movl %ebp, %esp # encoding: [0x89,0xec]
+; X86-NEXT:    popl %ebp # encoding: [0x5d]
+; X86-NEXT:    .cfi_def_cfa %esp, 4
+; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_cmp_256:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    vcmpltps %ymm1, %ymm0, %k0 # encoding: [0x62,0xf1,0x7c,0x28,0xc2,0xc1,0x01]
+; X64-NEXT:    vcmpltps %ymm3, %ymm2, %k1 # encoding: [0x62,0xf1,0x6c,0x28,0xc2,0xcb,0x01]
+; X64-NEXT:    kunpckbw %k0, %k1, %k1 # encoding: [0xc5,0xf5,0x4b,0xc8]
+; X64-NEXT:    vxorps %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x57,0xc0]
+; X64-NEXT:    vmovaps %zmm0, (%rdi) {%k1} # encoding: [0x62,0xf1,0x7c,0x49,0x29,0x07]
+; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X64-NEXT:    retq # encoding: [0xc3]
+entry:
+  %0 = tail call <8 x i1> @llvm.x86.avx512.cmp.ps.256(<8 x float> %a, <8 x float> %b, i32 1)
+  %1 = tail call <8 x i1> @llvm.x86.avx512.cmp.ps.256(<8 x float> %c, <8 x float> %d, i32 1)
+  %2 = bitcast float* %p to <16 x float>*
+  %3 = shufflevector <8 x i1> %0, <8 x i1> %1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  tail call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> zeroinitializer, <16 x float>* %2, i32 64, <16 x i1> %3)
+  ret void
+}
+
+declare <4 x i1> @llvm.x86.avx512.cmp.ps.128(<4 x float>, <4 x float>, i32)
+declare <8 x i1> @llvm.x86.avx512.cmp.ps.256(<8 x float>, <8 x float>, i32)
+declare void @llvm.masked.store.v8f32.p0v8f32(<8 x float>, <8 x float>*, i32, <8 x i1>)
+declare void @llvm.masked.store.v16f32.p0v16f32(<16 x float>, <16 x float>*, i32, <16 x i1>)
@@ -770,11 +770,11 @@ define i8 @test_cmpps_256(<8 x float> %a, <8 x float> %b) {
 ; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
-  %res = call <8 x i1> @llvm.x86.avx512.cmp.ps.256(<8 x float> %a, <8 x float> %b, i32 2)
+  %res = call <8 x i1> @llvm.x86.avx512.mask.cmp.ps.256(<8 x float> %a, <8 x float> %b, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
   %1 = bitcast <8 x i1> %res to i8
   ret i8 %1
 }
-declare <8 x i1> @llvm.x86.avx512.cmp.ps.256(<8 x float>, <8 x float>, i32)
+declare <8 x i1> @llvm.x86.avx512.mask.cmp.ps.256(<8 x float>, <8 x float>, i32, <8 x i1>)

 define i8 @test_cmpps_128(<4 x float> %a, <4 x float> %b) {
 ; CHECK-LABEL: test_cmpps_128:
@@ -783,12 +783,12 @@ define i8 @test_cmpps_128(<4 x float> %a, <4 x float> %b) {
 ; CHECK-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
 ; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
-  %res = call <4 x i1> @llvm.x86.avx512.cmp.ps.128(<4 x float> %a, <4 x float> %b, i32 2)
+  %res = call <4 x i1> @llvm.x86.avx512.mask.cmp.ps.128(<4 x float> %a, <4 x float> %b, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
   %1 = shufflevector <4 x i1> %res, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %2 = bitcast <8 x i1> %1 to i8
   ret i8 %2
 }
-declare <4 x i1> @llvm.x86.avx512.cmp.ps.128(<4 x float>, <4 x float>, i32)
+declare <4 x i1> @llvm.x86.avx512.mask.cmp.ps.128(<4 x float>, <4 x float>, i32, <4 x i1>)

 define i8 @test_cmppd_256(<4 x double> %a, <4 x double> %b) {
 ; CHECK-LABEL: test_cmppd_256:
@@ -798,12 +798,12 @@ define i8 @test_cmppd_256(<4 x double> %a, <4 x double> %b) {
 ; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
-  %res = call <4 x i1> @llvm.x86.avx512.cmp.pd.256(<4 x double> %a, <4 x double> %b, i32 2)
+  %res = call <4 x i1> @llvm.x86.avx512.mask.cmp.pd.256(<4 x double> %a, <4 x double> %b, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
   %1 = shufflevector <4 x i1> %res, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %2 = bitcast <8 x i1> %1 to i8
   ret i8 %2
 }
-declare <4 x i1> @llvm.x86.avx512.cmp.pd.256(<4 x double>, <4 x double>, i32)
+declare <4 x i1> @llvm.x86.avx512.mask.cmp.pd.256(<4 x double>, <4 x double>, i32, <4 x i1>)

 define i8 @test_cmppd_128(<2 x double> %a, <2 x double> %b) {
 ; CHECK-LABEL: test_cmppd_128:
@@ -812,12 +812,12 @@ define i8 @test_cmppd_128(<2 x double> %a, <2 x double> %b) {
 ; CHECK-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
 ; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
-  %res = call <2 x i1> @llvm.x86.avx512.cmp.pd.128(<2 x double> %a, <2 x double> %b, i32 2)
+  %res = call <2 x i1> @llvm.x86.avx512.mask.cmp.pd.128(<2 x double> %a, <2 x double> %b, i32 2, <2 x i1> <i1 true, i1 true>)
   %1 = shufflevector <2 x i1> %res, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
   %2 = bitcast <8 x i1> %1 to i8
   ret i8 %2
 }
-declare <2 x i1> @llvm.x86.avx512.cmp.pd.128(<2 x double>, <2 x double>, i32)
+declare <2 x i1> @llvm.x86.avx512.mask.cmp.pd.128(<2 x double>, <2 x double>, i32, <2 x i1>)

 define <8 x float> @test_mm512_maskz_max_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) {
 ; X86-LABEL: test_mm512_maskz_max_ps_256:
@@ -19358,7 +19358,7 @@ entry:
 }


-declare <16 x i1> @llvm.x86.avx512.cmp.ps.512(<16 x float>, <16 x float>, i32, i32)
+declare <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float>, <16 x float>, i32, <16 x i1>, i32)
 define zeroext i8 @test_vcmpoeqps_v4i1_v8i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
 ; VLX-LABEL: test_vcmpoeqps_v4i1_v8i1_mask:
 ; VLX:       # %bb.0: # %entry
@@ -20799,7 +20799,7 @@ define zeroext i32 @test_vcmpoeqps_v16i1_v32i1_sae_mask(<8 x i64> %__a, <8 x i64
 entry:
   %0 = bitcast <8 x i64> %__a to <16 x float>
   %1 = bitcast <8 x i64> %__b to <16 x float>
-  %2 = call <16 x i1> @llvm.x86.avx512.cmp.ps.512(<16 x float> %0, <16 x float> %1, i32 2, i32 8)
+  %2 = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %0, <16 x float> %1, i32 2, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 8)
   %3 = bitcast <16 x i1> %2 to i16
   %4 = zext i16 %3 to i32
   ret i32 %4
@@ -20824,7 +20824,7 @@ define zeroext i32 @test_masked_vcmpoeqps_v16i1_v32i1_sae_mask(i16 zeroext %__u,
 entry:
   %0 = bitcast <8 x i64> %__a to <16 x float>
   %1 = bitcast <8 x i64> %__b to <16 x float>
-  %2 = call <16 x i1> @llvm.x86.avx512.cmp.ps.512(<16 x float> %0, <16 x float> %1, i32 2, i32 8)
+  %2 = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %0, <16 x float> %1, i32 2, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 8)
   %3 = bitcast i16 %__u to <16 x i1>
   %4 = and <16 x i1> %2, %3
   %5 = bitcast <16 x i1> %4 to i16
@@ -21002,7 +21002,7 @@ define zeroext i64 @test_vcmpoeqps_v16i1_v64i1_sae_mask(<8 x i64> %__a, <8 x i64
 entry:
   %0 = bitcast <8 x i64> %__a to <16 x float>
   %1 = bitcast <8 x i64> %__b to <16 x float>
-  %2 = call <16 x i1> @llvm.x86.avx512.cmp.ps.512(<16 x float> %0, <16 x float> %1, i32 2, i32 8)
+  %2 = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %0, <16 x float> %1, i32 2, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 8)
   %3 = bitcast <16 x i1> %2 to i16
   %4 = zext i16 %3 to i64
   ret i64 %4
@@ -21027,7 +21027,7 @@ define zeroext i64 @test_masked_vcmpoeqps_v16i1_v64i1_sae_mask(i16 zeroext %__u,
 entry:
   %0 = bitcast <8 x i64> %__a to <16 x float>
   %1 = bitcast <8 x i64> %__b to <16 x float>
-  %2 = call <16 x i1> @llvm.x86.avx512.cmp.ps.512(<16 x float> %0, <16 x float> %1, i32 2, i32 8)
+  %2 = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %0, <16 x float> %1, i32 2, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 8)
   %3 = bitcast i16 %__u to <16 x i1>
   %4 = and <16 x i1> %2, %3
   %5 = bitcast <16 x i1> %4 to i16
@@ -21037,7 +21037,7 @@ entry:



-declare <8 x i1> @llvm.x86.avx512.cmp.pd.512(<8 x double>, <8 x double>, i32, i32)
+declare <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double>, <8 x double>, i32, <8 x i1>, i32)
 define zeroext i4 @test_vcmpoeqpd_v2i1_v4i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
 ; VLX-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask:
 ; VLX:       # %bb.0: # %entry
@@ -22867,7 +22867,7 @@ define zeroext i16 @test_vcmpoeqpd_v8i1_v16i1_sae_mask(<8 x i64> %__a, <8 x i64>
 entry:
   %0 = bitcast <8 x i64> %__a to <8 x double>
   %1 = bitcast <8 x i64> %__b to <8 x double>
-  %2 = call <8 x i1> @llvm.x86.avx512.cmp.pd.512(<8 x double> %0, <8 x double> %1, i32 2, i32 8)
+  %2 = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %0, <8 x double> %1, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 8)
   %3 = bitcast <8 x i1> %2 to i8
   %4 = zext i8 %3 to i16
   ret i16 %4
@@ -22896,7 +22896,7 @@ define zeroext i16 @test_masked_vcmpoeqpd_v8i1_v16i1_sae_mask(i8 zeroext %__u, <
 entry:
   %0 = bitcast <8 x i64> %__a to <8 x double>
   %1 = bitcast <8 x i64> %__b to <8 x double>
-  %2 = call <8 x i1> @llvm.x86.avx512.cmp.pd.512(<8 x double> %0, <8 x double> %1, i32 2, i32 8)
+  %2 = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %0, <8 x double> %1, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 8)
   %3 = bitcast i8 %__u to <8 x i1>
   %4 = and <8 x i1> %2, %3
   %5 = bitcast <8 x i1> %4 to i8
@@ -23082,7 +23082,7 @@ define zeroext i32 @test_vcmpoeqpd_v8i1_v32i1_sae_mask(<8 x i64> %__a, <8 x i64>
 entry:
   %0 = bitcast <8 x i64> %__a to <8 x double>
   %1 = bitcast <8 x i64> %__b to <8 x double>
-  %2 = call <8 x i1> @llvm.x86.avx512.cmp.pd.512(<8 x double> %0, <8 x double> %1, i32 2, i32 8)
+  %2 = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %0, <8 x double> %1, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 8)
   %3 = bitcast <8 x i1> %2 to i8
   %4 = zext i8 %3 to i32
   ret i32 %4
@@ -23109,7 +23109,7 @@ define zeroext i32 @test_masked_vcmpoeqpd_v8i1_v32i1_sae_mask(i8 zeroext %__u, <
 entry:
   %0 = bitcast <8 x i64> %__a to <8 x double>
   %1 = bitcast <8 x i64> %__b to <8 x double>
-  %2 = call <8 x i1> @llvm.x86.avx512.cmp.pd.512(<8 x double> %0, <8 x double> %1, i32 2, i32 8)
+  %2 = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %0, <8 x double> %1, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 8)
   %3 = bitcast i8 %__u to <8 x i1>
   %4 = and <8 x i1> %2, %3
   %5 = bitcast <8 x i1> %4 to i8
@@ -23295,7 +23295,7 @@ define zeroext i64 @test_vcmpoeqpd_v8i1_v64i1_sae_mask(<8 x i64> %__a, <8 x i64>
 entry:
   %0 = bitcast <8 x i64> %__a to <8 x double>
   %1 = bitcast <8 x i64> %__b to <8 x double>
-  %2 = call <8 x i1> @llvm.x86.avx512.cmp.pd.512(<8 x double> %0, <8 x double> %1, i32 2, i32 8)
+  %2 = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %0, <8 x double> %1, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 8)
   %3 = bitcast <8 x i1> %2 to i8
   %4 = zext i8 %3 to i64
   ret i64 %4
@@ -23322,7 +23322,7 @@ define zeroext i64 @test_masked_vcmpoeqpd_v8i1_v64i1_sae_mask(i8 zeroext %__u, <
 entry:
   %0 = bitcast <8 x i64> %__a to <8 x double>
   %1 = bitcast <8 x i64> %__b to <8 x double>
-  %2 = call <8 x i1> @llvm.x86.avx512.cmp.pd.512(<8 x double> %0, <8 x double> %1, i32 2, i32 8)
+  %2 = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %0, <8 x double> %1, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 8)
   %3 = bitcast i8 %__u to <8 x i1>
   %4 = and <8 x i1> %2, %3
   %5 = bitcast <8 x i1> %4 to i8
@@ -23345,7 +23345,7 @@ define i32 @test_cmpm_rnd_zero(<16 x float> %a, <16 x float> %b) {
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
-  %res = call <16 x i1> @llvm.x86.avx512.cmp.ps.512(<16 x float> %a, <16 x float> %b, i32 2, i32 8)
+  %res = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %a, <16 x float> %b, i32 2, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 8)
   %1 = bitcast <16 x i1> %res to i16
   %cast = bitcast i16 %1 to <16 x i1>
   %shuffle = shufflevector <16 x i1> %cast, <16 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
@@ -312,11 +312,11 @@ define i8 @stack_fold_cmppd(<8 x double> %a0, <8 x double> %a1) {
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %res = call <8 x i1> @llvm.x86.avx512.cmp.pd.512(<8 x double> %a0, <8 x double> %a1, i32 0, i32 4)
+  %res = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %a0, <8 x double> %a1, i32 0, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 4)
   %2 = bitcast <8 x i1> %res to i8
   ret i8 %2
 }
-declare <8 x i1> @llvm.x86.avx512.cmp.pd.512(<8 x double>, <8 x double>, i32, i32)
+declare <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double>, <8 x double>, i32, <8 x i1>, i32)

 define <8 x double> @stack_fold_cmppd_mask(<8 x double> %a0, <8 x double> %a1, <8 x double>* %a2, i8 %mask, <8 x double> %b0, <8 x double> %b1) {
 ; CHECK-LABEL: stack_fold_cmppd_mask:
@@ -332,8 +332,9 @@ define <8 x double> @stack_fold_cmppd_mask(<8 x double> %a0, <8 x double> %a1, <
 ; CHECK-NEXT:    #NO_APP
 ; CHECK-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; CHECK-NEXT:    vaddpd (%rdi), %zmm0, %zmm0
+; CHECK-NEXT:    vcmpeqpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k0 # 64-byte Folded Reload
 ; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    vcmpeqpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k1 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT:    kandb %k0, %k1, %k1
 ; CHECK-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; CHECK-NEXT:    vblendmpd (%rsp), %zmm0, %zmm0 {%k1} # 64-byte Folded Reload
 ; CHECK-NEXT:    addq $184, %rsp
@ -344,7 +345,7 @@ define <8 x double> @stack_fold_cmppd_mask(<8 x double> %a0, <8 x double> %a1, <
|
||||
%2 = load <8 x double>, <8 x double>* %a2
|
||||
%3 = fadd <8 x double> %a1, %2
|
||||
%4 = bitcast i8 %mask to <8 x i1>
|
||||
%5 = call <8 x i1> @llvm.x86.avx512.cmp.pd.512(<8 x double> %3, <8 x double> %a0, i32 0, i32 4)
|
||||
%5 = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %3, <8 x double> %a0, i32 0, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 4)
|
||||
%6 = and <8 x i1> %4, %5
|
||||
%7 = select <8 x i1> %6, <8 x double> %b0, <8 x double> %b1
|
||||
ret <8 x double> %7
|
||||
@ -364,8 +365,9 @@ define <8 x double> @stack_fold_cmppd_mask_commuted(<8 x double> %a0, <8 x doubl
|
||||
; CHECK-NEXT: #NO_APP
|
||||
; CHECK-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
|
||||
; CHECK-NEXT: vaddpd (%rdi), %zmm0, %zmm0
|
||||
; CHECK-NEXT: vcmpeqpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k0 # 64-byte Folded Reload
|
||||
; CHECK-NEXT: kmovw %esi, %k1
|
||||
; CHECK-NEXT: vcmpeqpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k1 {%k1} # 64-byte Folded Reload
|
||||
; CHECK-NEXT: kandb %k0, %k1, %k1
|
||||
; CHECK-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
|
||||
; CHECK-NEXT: vblendmpd (%rsp), %zmm0, %zmm0 {%k1} # 64-byte Folded Reload
|
||||
; CHECK-NEXT: addq $184, %rsp
|
||||
@ -376,7 +378,7 @@ define <8 x double> @stack_fold_cmppd_mask_commuted(<8 x double> %a0, <8 x doubl
|
||||
%2 = load <8 x double>, <8 x double>* %a2
|
||||
%3 = fadd <8 x double> %a1, %2
|
||||
%4 = bitcast i8 %mask to <8 x i1>
|
||||
%5 = call <8 x i1> @llvm.x86.avx512.cmp.pd.512(<8 x double> %a0, <8 x double> %3, i32 0, i32 4)
|
||||
%5 = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %a0, <8 x double> %3, i32 0, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 4)
|
||||
%6 = and <8 x i1> %4, %5
|
||||
%7 = select <8 x i1> %6, <8 x double> %b0, <8 x double> %b1
|
||||
ret <8 x double> %7
|
||||
@ -395,11 +397,11 @@ define i16 @stack_fold_cmpps(<16 x float> %a0, <16 x float> %a1) {
|
||||
; CHECK-NEXT: vzeroupper
|
||||
; CHECK-NEXT: retq
|
||||
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
|
||||
%res = call <16 x i1> @llvm.x86.avx512.cmp.ps.512(<16 x float> %a0, <16 x float> %a1, i32 0, i32 4)
|
||||
%res = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %a0, <16 x float> %a1, i32 0, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 4)
|
||||
%2 = bitcast <16 x i1> %res to i16
|
||||
ret i16 %2
|
||||
}
|
||||
declare <16 x i1> @llvm.x86.avx512.cmp.ps.512(<16 x float>, <16 x float>, i32, i32)
|
||||
declare <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float>, <16 x float>, i32, <16 x i1>, i32)
|
||||
|
||||
define <16 x float> @stack_fold_cmpps_mask(<16 x float> %a0, <16 x float> %a1, <16 x float>* %a2, i16 %mask, <16 x float> %b0, <16 x float> %b1) {
|
||||
; CHECK-LABEL: stack_fold_cmpps_mask:
|
||||
@ -415,8 +417,9 @@ define <16 x float> @stack_fold_cmpps_mask(<16 x float> %a0, <16 x float> %a1, <
|
||||
; CHECK-NEXT: #NO_APP
|
||||
; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
|
||||
; CHECK-NEXT: vaddps (%rdi), %zmm0, %zmm0
|
||||
; CHECK-NEXT: vcmpeqps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k0 # 64-byte Folded Reload
|
||||
; CHECK-NEXT: kmovw %esi, %k1
|
||||
; CHECK-NEXT: vcmpeqps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k1 {%k1} # 64-byte Folded Reload
|
||||
; CHECK-NEXT: kandw %k0, %k1, %k1
|
||||
; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
|
||||
; CHECK-NEXT: vblendmps (%rsp), %zmm0, %zmm0 {%k1} # 64-byte Folded Reload
|
||||
; CHECK-NEXT: addq $184, %rsp
|
||||
@ -427,7 +430,7 @@ define <16 x float> @stack_fold_cmpps_mask(<16 x float> %a0, <16 x float> %a1, <
|
||||
%2 = load <16 x float>, <16 x float>* %a2
|
||||
%3 = fadd <16 x float> %a1, %2
|
||||
%4 = bitcast i16 %mask to <16 x i1>
|
||||
%5 = call <16 x i1> @llvm.x86.avx512.cmp.ps.512(<16 x float> %3, <16 x float> %a0, i32 0, i32 4)
|
||||
%5 = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %3, <16 x float> %a0, i32 0, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 4)
|
||||
%6 = and <16 x i1> %4, %5
|
||||
%7 = select <16 x i1> %6, <16 x float> %b0, <16 x float> %b1
|
||||
ret <16 x float> %7
|
||||
@ -447,8 +450,9 @@ define <16 x float> @stack_fold_cmpps_mask_commuted(<16 x float> %a0, <16 x floa
|
||||
; CHECK-NEXT: #NO_APP
|
||||
; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
|
||||
; CHECK-NEXT: vaddps (%rdi), %zmm0, %zmm0
|
||||
; CHECK-NEXT: vcmpeqps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k0 # 64-byte Folded Reload
|
||||
; CHECK-NEXT: kmovw %esi, %k1
|
||||
; CHECK-NEXT: vcmpeqps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k1 {%k1} # 64-byte Folded Reload
|
||||
; CHECK-NEXT: kandw %k0, %k1, %k1
|
||||
; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
|
||||
; CHECK-NEXT: vblendmps (%rsp), %zmm0, %zmm0 {%k1} # 64-byte Folded Reload
|
||||
; CHECK-NEXT: addq $184, %rsp
|
||||
@ -459,7 +463,7 @@ define <16 x float> @stack_fold_cmpps_mask_commuted(<16 x float> %a0, <16 x floa
|
||||
%2 = load <16 x float>, <16 x float>* %a2
|
||||
%3 = fadd <16 x float> %a1, %2
|
||||
%4 = bitcast i16 %mask to <16 x i1>
|
||||
%5 = call <16 x i1> @llvm.x86.avx512.cmp.ps.512(<16 x float> %a0, <16 x float> %3, i32 0, i32 4)
|
||||
%5 = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %a0, <16 x float> %3, i32 0, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 4)
|
||||
%6 = and <16 x i1> %4, %5
|
||||
%7 = select <16 x i1> %6, <16 x float> %b0, <16 x float> %b1
|
||||
ret <16 x float> %7
|
||||
|
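Only the 512-bit forms carry that trailing rounding operand; as the declarations below show, the upgraded 128- and 256-bit forms end with just the predicate immediate plus the mask vector. A sketch with hypothetical operands:

%k = call <4 x i1> @llvm.x86.avx512.mask.cmp.pd.256(<4 x double> %x, <4 x double> %y, i32 0, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)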
@ -249,12 +249,12 @@ define i8 @stack_fold_cmppd(<2 x double> %a0, <2 x double> %a1) {
; CHECK-NEXT: # kill: def $al killed $al killed $eax
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%res = call <2 x i1> @llvm.x86.avx512.cmp.pd.128(<2 x double> %a0, <2 x double> %a1, i32 0)
%res = call <2 x i1> @llvm.x86.avx512.mask.cmp.pd.128(<2 x double> %a0, <2 x double> %a1, i32 0, <2 x i1> <i1 true, i1 true>)
%2 = shufflevector <2 x i1> %res, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
%3 = bitcast <8 x i1> %2 to i8
ret i8 %3
}
declare <2 x i1> @llvm.x86.avx512.cmp.pd.128(<2 x double>, <2 x double>, i32)
declare <2 x i1> @llvm.x86.avx512.mask.cmp.pd.128(<2 x double>, <2 x double>, i32, <2 x i1>)

define i8 @stack_fold_cmppd_ymm(<4 x double> %a0, <4 x double> %a1) {
; CHECK-LABEL: stack_fold_cmppd_ymm:
@ -269,12 +269,12 @@ define i8 @stack_fold_cmppd_ymm(<4 x double> %a0, <4 x double> %a1) {
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%res = call <4 x i1> @llvm.x86.avx512.cmp.pd.256(<4 x double> %a0, <4 x double> %a1, i32 0)
%res = call <4 x i1> @llvm.x86.avx512.mask.cmp.pd.256(<4 x double> %a0, <4 x double> %a1, i32 0, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
%2 = shufflevector <4 x i1> %res, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%3 = bitcast <8 x i1> %2 to i8
ret i8 %3
}
declare <4 x i1> @llvm.x86.avx512.cmp.pd.256(<4 x double>, <4 x double>, i32)
declare <4 x i1> @llvm.x86.avx512.mask.cmp.pd.256(<4 x double>, <4 x double>, i32, <4 x i1>)

define i8 @stack_fold_cmpps(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: stack_fold_cmpps:
@ -288,12 +288,12 @@ define i8 @stack_fold_cmpps(<4 x float> %a0, <4 x float> %a1) {
; CHECK-NEXT: # kill: def $al killed $al killed $eax
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%res = call <4 x i1> @llvm.x86.avx512.cmp.ps.128(<4 x float> %a0, <4 x float> %a1, i32 0)
%res = call <4 x i1> @llvm.x86.avx512.mask.cmp.ps.128(<4 x float> %a0, <4 x float> %a1, i32 0, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
%2 = shufflevector <4 x i1> %res, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%3 = bitcast <8 x i1> %2 to i8
ret i8 %3
}
declare <4 x i1> @llvm.x86.avx512.cmp.ps.128(<4 x float>, <4 x float>, i32)
declare <4 x i1> @llvm.x86.avx512.mask.cmp.ps.128(<4 x float>, <4 x float>, i32, <4 x i1>)

define i8 @stack_fold_cmpps_ymm(<8 x float> %a0, <8 x float> %a1) {
; CHECK-LABEL: stack_fold_cmpps_ymm:
@ -308,11 +308,11 @@ define i8 @stack_fold_cmpps_ymm(<8 x float> %a0, <8 x float> %a1) {
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%res = call <8 x i1> @llvm.x86.avx512.cmp.ps.256(<8 x float> %a0, <8 x float> %a1, i32 0)
%res = call <8 x i1> @llvm.x86.avx512.mask.cmp.ps.256(<8 x float> %a0, <8 x float> %a1, i32 0, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
%2 = bitcast <8 x i1> %res to i8
ret i8 %2
}
declare <8 x i1> @llvm.x86.avx512.cmp.ps.256(<8 x float>, <8 x float>, i32)
declare <8 x i1> @llvm.x86.avx512.mask.cmp.ps.256(<8 x float>, <8 x float>, i32, <8 x i1>)

define <2 x double> @stack_fold_divpd(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: stack_fold_divpd:

@ -881,159 +881,3 @@ define i64 @shuf64i1_zero(i64 %a) {
%d = bitcast <64 x i1> %c to i64
ret i64 %d
}

; OR(KSHIFTL(X,8),Y) -> KUNPCKBW
define void @PR32547(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x float> %d, float* %p) {
; AVX512F-LABEL: PR32547:
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: # kill: def $ymm3 killed $ymm3 def $zmm3
; AVX512F-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2
; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: vcmpltps %zmm1, %zmm0, %k0
; AVX512F-NEXT: vcmpltps %zmm3, %zmm2, %k1
; AVX512F-NEXT: kunpckbw %k1, %k0, %k1
; AVX512F-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX512F-NEXT: vmovaps %zmm0, (%rdi) {%k1}
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: PR32547:
; AVX512VL: # %bb.0: # %entry
; AVX512VL-NEXT: vcmpltps %ymm1, %ymm0, %k0
; AVX512VL-NEXT: vcmpltps %ymm3, %ymm2, %k1
; AVX512VL-NEXT: kunpckbw %k1, %k0, %k1
; AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX512VL-NEXT: vmovaps %zmm0, (%rdi) {%k1}
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; VL_BW_DQ-LABEL: PR32547:
; VL_BW_DQ: # %bb.0: # %entry
; VL_BW_DQ-NEXT: vcmpltps %ymm1, %ymm0, %k0
; VL_BW_DQ-NEXT: vcmpltps %ymm3, %ymm2, %k1
; VL_BW_DQ-NEXT: kunpckbw %k1, %k0, %k1
; VL_BW_DQ-NEXT: vxorps %xmm0, %xmm0, %xmm0
; VL_BW_DQ-NEXT: vmovaps %zmm0, (%rdi) {%k1}
; VL_BW_DQ-NEXT: vzeroupper
; VL_BW_DQ-NEXT: retq
entry:
%0 = tail call i8 @llvm.x86.avx512.mask.cmp.ps.256(<8 x float> %a, <8 x float> %b, i32 1, i8 -1)
%1 = tail call i8 @llvm.x86.avx512.mask.cmp.ps.256(<8 x float> %c, <8 x float> %d, i32 1, i8 -1)
%conv.i = zext i8 %0 to i16
%conv.i18 = zext i8 %1 to i16
%shl = shl nuw i16 %conv.i, 8
%or = or i16 %shl, %conv.i18
%2 = bitcast float* %p to <16 x float>*
%3 = bitcast i16 %or to <16 x i1>
tail call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> zeroinitializer, <16 x float>* %2, i32 64, <16 x i1> %3) #4
ret void
}

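kunpckbw concatenates the low bytes of two mask registers, so a 16-bit mask assembled from two 8-bit compare results by a shift-and-or is matched directly, as the CHECK lines above expect. A sketch of the IR pattern being combined (%lo and %hi are hypothetical i8 compare masks):

%hi16 = zext i8 %hi to i16
%lo16 = zext i8 %lo to i16
%shl = shl nuw i16 %hi16, 8
%cat = or i16 %shl, %lo16 ; selected as a single kunpckbw of the two k-registers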
; OR(X, KSHIFTL(Y,8)) -> KUNPCKBW
define void @PR32547_swap(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x float> %d, float* %p) {
; AVX512F-LABEL: PR32547_swap:
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: # kill: def $ymm3 killed $ymm3 def $zmm3
; AVX512F-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2
; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: vcmpltps %zmm1, %zmm0, %k0
; AVX512F-NEXT: vcmpltps %zmm3, %zmm2, %k1
; AVX512F-NEXT: kunpckbw %k1, %k0, %k1
; AVX512F-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX512F-NEXT: vmovaps %zmm0, (%rdi) {%k1}
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: PR32547_swap:
; AVX512VL: # %bb.0: # %entry
; AVX512VL-NEXT: vcmpltps %ymm1, %ymm0, %k0
; AVX512VL-NEXT: vcmpltps %ymm3, %ymm2, %k1
; AVX512VL-NEXT: kunpckbw %k1, %k0, %k1
; AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX512VL-NEXT: vmovaps %zmm0, (%rdi) {%k1}
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; VL_BW_DQ-LABEL: PR32547_swap:
; VL_BW_DQ: # %bb.0: # %entry
; VL_BW_DQ-NEXT: vcmpltps %ymm1, %ymm0, %k0
; VL_BW_DQ-NEXT: vcmpltps %ymm3, %ymm2, %k1
; VL_BW_DQ-NEXT: kunpckbw %k1, %k0, %k1
; VL_BW_DQ-NEXT: vxorps %xmm0, %xmm0, %xmm0
; VL_BW_DQ-NEXT: vmovaps %zmm0, (%rdi) {%k1}
; VL_BW_DQ-NEXT: vzeroupper
; VL_BW_DQ-NEXT: retq
entry:
%0 = tail call i8 @llvm.x86.avx512.mask.cmp.ps.256(<8 x float> %a, <8 x float> %b, i32 1, i8 -1)
%1 = tail call i8 @llvm.x86.avx512.mask.cmp.ps.256(<8 x float> %c, <8 x float> %d, i32 1, i8 -1)
%conv.i = zext i8 %0 to i16
%conv.i18 = zext i8 %1 to i16
%shl = shl nuw i16 %conv.i, 8
%or = or i16 %conv.i18, %shl
%2 = bitcast float* %p to <16 x float>*
%3 = bitcast i16 %or to <16 x i1>
tail call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> zeroinitializer, <16 x float>* %2, i32 64, <16 x i1> %3) #4
ret void
}

define void @mask_cmp_128(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, float* %p) {
; AVX512F-LABEL: mask_cmp_128:
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: # kill: def $xmm3 killed $xmm3 def $zmm3
; AVX512F-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT: vcmpltps %zmm1, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: vcmpltps %zmm3, %zmm2, %k0
; AVX512F-NEXT: kshiftlw $12, %k0, %k0
; AVX512F-NEXT: kshiftrw $12, %k0, %k0
; AVX512F-NEXT: shlb $4, %al
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: korw %k1, %k0, %k0
; AVX512F-NEXT: kshiftlw $8, %k0, %k0
; AVX512F-NEXT: kshiftrw $8, %k0, %k1
; AVX512F-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX512F-NEXT: vmovaps %zmm0, (%rdi) {%k1}
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: mask_cmp_128:
; AVX512VL: # %bb.0: # %entry
; AVX512VL-NEXT: vcmpltps %xmm1, %xmm0, %k0
; AVX512VL-NEXT: kmovw %k0, %eax
; AVX512VL-NEXT: vcmpltps %xmm3, %xmm2, %k0
; AVX512VL-NEXT: shlb $4, %al
; AVX512VL-NEXT: kmovw %eax, %k1
; AVX512VL-NEXT: korw %k1, %k0, %k1
; AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX512VL-NEXT: vmovaps %ymm0, (%rdi) {%k1}
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; VL_BW_DQ-LABEL: mask_cmp_128:
; VL_BW_DQ: # %bb.0: # %entry
; VL_BW_DQ-NEXT: vcmpltps %xmm1, %xmm0, %k0
; VL_BW_DQ-NEXT: vcmpltps %xmm3, %xmm2, %k1
; VL_BW_DQ-NEXT: kshiftlb $4, %k0, %k0
; VL_BW_DQ-NEXT: korb %k0, %k1, %k1
; VL_BW_DQ-NEXT: vxorps %xmm0, %xmm0, %xmm0
; VL_BW_DQ-NEXT: vmovaps %ymm0, (%rdi) {%k1}
; VL_BW_DQ-NEXT: vzeroupper
; VL_BW_DQ-NEXT: retq
entry:
%0 = tail call i8 @llvm.x86.avx512.mask.cmp.ps.128(<4 x float> %a, <4 x float> %b, i32 1, i8 -1)
%1 = tail call i8 @llvm.x86.avx512.mask.cmp.ps.128(<4 x float> %c, <4 x float> %d, i32 1, i8 -1)
%shl = shl nuw i8 %0, 4
%or = or i8 %1, %shl
%2 = bitcast float* %p to <8 x float>*
%3 = bitcast i8 %or to <8 x i1>
tail call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> zeroinitializer, <8 x float>* %2, i32 64, <8 x i1> %3)
ret void
}
declare i8 @llvm.x86.avx512.mask.cmp.ps.128(<4 x float>, <4 x float>, i32, i8)
declare i8 @llvm.x86.avx512.mask.cmp.ps.256(<8 x float>, <8 x float>, i32, i8)
declare void @llvm.masked.store.v8f32.p0v8f32(<8 x float>, <8 x float>*, i32, <8 x i1>)
declare void @llvm.masked.store.v16f32.p0v16f32(<16 x float>, <16 x float>*, i32, <16 x i1>)

@ -1,210 +0,0 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -instcombine -mtriple=x86_64-unknown-unknown -S | FileCheck %s

; The test checks the folding of cmp(sub(a,b),0) into cmp(a,b).

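The fold is only legal when the subtraction cannot produce infinities: inf - inf is NaN, and a NaN input changes the result of ordered and unordered predicates, so instcombine performs it only when the fsub carries the ninf flag; the _safe variant below, whose fsub has no fast-math flags, is expected to stay unfolded. In sketch form (hypothetical values):

%d = fsub ninf <2 x double> %a, %b
%k = call <2 x i1> @llvm.x86.avx512.cmp.pd.128(<2 x double> %d, <2 x double> zeroinitializer, i32 5)
; folds to:
%k = call <2 x i1> @llvm.x86.avx512.cmp.pd.128(<2 x double> %a, <2 x double> %b, i32 5)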
define i8 @sub_compare_foldingPD128_safe(<2 x double> %a, <2 x double> %b){
; CHECK-LABEL: @sub_compare_foldingPD128_safe(
; CHECK-NEXT: [[SUB_SAFE:%.*]] = fsub <2 x double> [[A:%.*]], [[B:%.*]]
; CHECK-NEXT: [[T0:%.*]] = call <2 x i1> @llvm.x86.avx512.cmp.pd.128(<2 x double> [[SUB_SAFE]], <2 x double> zeroinitializer, i32 5)
; CHECK-NEXT: [[T1:%.*]] = shufflevector <2 x i1> [[T0]], <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
; CHECK-NEXT: [[T2:%.*]] = bitcast <8 x i1> [[T1]] to i8
; CHECK-NEXT: ret i8 [[T2]]
;
%sub.safe = fsub <2 x double> %a, %b
%t0 = call <2 x i1> @llvm.x86.avx512.cmp.pd.128(<2 x double> %sub.safe, <2 x double> zeroinitializer, i32 5)
%t1 = shufflevector <2 x i1> %t0, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
%t2 = bitcast <8 x i1> %t1 to i8
ret i8 %t2
}

define i8 @sub_compare_foldingPD128(<2 x double> %a, <2 x double> %b){
; CHECK-LABEL: @sub_compare_foldingPD128(
; CHECK-NEXT: [[T0:%.*]] = call <2 x i1> @llvm.x86.avx512.cmp.pd.128(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], i32 5)
; CHECK-NEXT: [[T1:%.*]] = shufflevector <2 x i1> [[T0]], <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
; CHECK-NEXT: [[T2:%.*]] = bitcast <8 x i1> [[T1]] to i8
; CHECK-NEXT: ret i8 [[T2]]
;
%sub.i = fsub ninf <2 x double> %a, %b
%t0 = call <2 x i1> @llvm.x86.avx512.cmp.pd.128(<2 x double> %sub.i, <2 x double> zeroinitializer, i32 5)
%t1 = shufflevector <2 x i1> %t0, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
%t2 = bitcast <8 x i1> %t1 to i8
ret i8 %t2
}

define i8 @sub_compare_foldingPD128_undef_elt(<2 x double> %a, <2 x double> %b){
; CHECK-LABEL: @sub_compare_foldingPD128_undef_elt(
; CHECK-NEXT: [[T0:%.*]] = call <2 x i1> @llvm.x86.avx512.cmp.pd.128(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], i32 5)
; CHECK-NEXT: [[T1:%.*]] = shufflevector <2 x i1> [[T0]], <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
; CHECK-NEXT: [[T2:%.*]] = bitcast <8 x i1> [[T1]] to i8
; CHECK-NEXT: ret i8 [[T2]]
;
%sub.i = fsub ninf <2 x double> %a, %b
%t0 = call <2 x i1> @llvm.x86.avx512.cmp.pd.128(<2 x double> %sub.i, <2 x double> <double 0.0, double undef>, i32 5)
%t1 = shufflevector <2 x i1> %t0, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
%t2 = bitcast <8 x i1> %t1 to i8
ret i8 %t2
}

define i8 @sub_compare_foldingPD256(<4 x double> %a, <4 x double> %b){
; CHECK-LABEL: @sub_compare_foldingPD256(
; CHECK-NEXT: [[T0:%.*]] = call <4 x i1> @llvm.x86.avx512.cmp.pd.256(<4 x double> [[A:%.*]], <4 x double> [[B:%.*]], i32 5)
; CHECK-NEXT: [[T1:%.*]] = shufflevector <4 x i1> [[T0]], <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: [[T2:%.*]] = bitcast <8 x i1> [[T1]] to i8
; CHECK-NEXT: ret i8 [[T2]]
;
%sub.i1 = fsub ninf <4 x double> %a, %b
%t0 = call <4 x i1> @llvm.x86.avx512.cmp.pd.256(<4 x double> %sub.i1, <4 x double> zeroinitializer, i32 5)
%t1 = shufflevector <4 x i1> %t0, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%t2 = bitcast <8 x i1> %t1 to i8
ret i8 %t2
}

define i8 @sub_compare_foldingPD512(<8 x double> %a, <8 x double> %b){
; CHECK-LABEL: @sub_compare_foldingPD512(
; CHECK-NEXT: [[T0:%.*]] = call <8 x i1> @llvm.x86.avx512.cmp.pd.512(<8 x double> [[A:%.*]], <8 x double> [[B:%.*]], i32 11, i32 4)
; CHECK-NEXT: [[T1:%.*]] = bitcast <8 x i1> [[T0]] to i8
; CHECK-NEXT: ret i8 [[T1]]
;
%sub.i2 = fsub ninf <8 x double> %a, %b
%t0 = call <8 x i1> @llvm.x86.avx512.cmp.pd.512(<8 x double> %sub.i2, <8 x double> zeroinitializer, i32 11, i32 4)
%t1 = bitcast <8 x i1> %t0 to i8
ret i8 %t1
}

define i8 @sub_compare_foldingPS128(<4 x float> %a, <4 x float> %b){
; CHECK-LABEL: @sub_compare_foldingPS128(
; CHECK-NEXT: [[T0:%.*]] = call <4 x i1> @llvm.x86.avx512.cmp.ps.128(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], i32 12)
; CHECK-NEXT: [[T1:%.*]] = shufflevector <4 x i1> [[T0]], <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: [[T2:%.*]] = bitcast <8 x i1> [[T1]] to i8
; CHECK-NEXT: ret i8 [[T2]]
;
%sub.i3 = fsub ninf <4 x float> %a, %b
%t0 = call <4 x i1> @llvm.x86.avx512.cmp.ps.128(<4 x float> %sub.i3, <4 x float> zeroinitializer, i32 12)
%t1 = shufflevector <4 x i1> %t0, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%t2 = bitcast <8 x i1> %t1 to i8
ret i8 %t2
}

define i8 @sub_compare_foldingPS256(<8 x float> %a, <8 x float> %b){
; CHECK-LABEL: @sub_compare_foldingPS256(
; CHECK-NEXT: [[T0:%.*]] = call <8 x i1> @llvm.x86.avx512.cmp.ps.256(<8 x float> [[A:%.*]], <8 x float> [[B:%.*]], i32 5)
; CHECK-NEXT: [[T1:%.*]] = bitcast <8 x i1> [[T0]] to i8
; CHECK-NEXT: ret i8 [[T1]]
;
%sub.i4 = fsub ninf <8 x float> %a, %b
%t0 = call <8 x i1> @llvm.x86.avx512.cmp.ps.256(<8 x float> %sub.i4, <8 x float> zeroinitializer, i32 5)
%t1 = bitcast <8 x i1> %t0 to i8
ret i8 %t1
}

define i16 @sub_compare_foldingPS512(<16 x float> %a, <16 x float> %b){
; CHECK-LABEL: @sub_compare_foldingPS512(
; CHECK-NEXT: [[T0:%.*]] = call <16 x i1> @llvm.x86.avx512.cmp.ps.512(<16 x float> [[A:%.*]], <16 x float> [[B:%.*]], i32 11, i32 4)
; CHECK-NEXT: [[T1:%.*]] = bitcast <16 x i1> [[T0]] to i16
; CHECK-NEXT: ret i16 [[T1]]
;
%sub.i5 = fsub ninf <16 x float> %a, %b
%t0 = call <16 x i1> @llvm.x86.avx512.cmp.ps.512(<16 x float> %sub.i5, <16 x float> zeroinitializer, i32 11, i32 4)
%t1 = bitcast <16 x i1> %t0 to i16
ret i16 %t1
}

define i8 @sub_compare_folding_swapPD128(<2 x double> %a, <2 x double> %b){
; CHECK-LABEL: @sub_compare_folding_swapPD128(
; CHECK-NEXT: [[T0:%.*]] = call <2 x i1> @llvm.x86.avx512.cmp.pd.128(<2 x double> [[B:%.*]], <2 x double> [[A:%.*]], i32 5)
; CHECK-NEXT: [[T1:%.*]] = shufflevector <2 x i1> [[T0]], <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
; CHECK-NEXT: [[T2:%.*]] = bitcast <8 x i1> [[T1]] to i8
; CHECK-NEXT: ret i8 [[T2]]
;
%sub.i = fsub ninf <2 x double> %a, %b
%t0 = call <2 x i1> @llvm.x86.avx512.cmp.pd.128(<2 x double> zeroinitializer, <2 x double> %sub.i, i32 5)
%t1 = shufflevector <2 x i1> %t0, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
%t2 = bitcast <8 x i1> %t1 to i8
ret i8 %t2
}

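When the subtraction sits in the second operand, the same fold applies with the operands swapped rather than the predicate rewritten: cmp(0, a - b) becomes cmp(b, a), which is what the swapped [[B]]/[[A]] captures above check. A sketch with hypothetical values:

%d = fsub ninf <2 x double> %a, %b
%k = call <2 x i1> @llvm.x86.avx512.cmp.pd.128(<2 x double> zeroinitializer, <2 x double> %d, i32 5)
; folds to:
%k = call <2 x i1> @llvm.x86.avx512.cmp.pd.128(<2 x double> %b, <2 x double> %a, i32 5)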
define i8 @sub_compare_folding_swapPD256(<4 x double> %a, <4 x double> %b){
; CHECK-LABEL: @sub_compare_folding_swapPD256(
; CHECK-NEXT: [[T0:%.*]] = call <4 x i1> @llvm.x86.avx512.cmp.pd.256(<4 x double> [[B:%.*]], <4 x double> [[A:%.*]], i32 5)
; CHECK-NEXT: [[T1:%.*]] = shufflevector <4 x i1> [[T0]], <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: [[T2:%.*]] = bitcast <8 x i1> [[T1]] to i8
; CHECK-NEXT: ret i8 [[T2]]
;
%sub.i = fsub ninf <4 x double> %a, %b
%t0 = call <4 x i1> @llvm.x86.avx512.cmp.pd.256(<4 x double> zeroinitializer, <4 x double> %sub.i, i32 5)
%t1 = shufflevector <4 x i1> %t0, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%t2 = bitcast <8 x i1> %t1 to i8
ret i8 %t2
}

define i8 @sub_compare_folding_swapPD256_undef(<4 x double> %a, <4 x double> %b) {
; CHECK-LABEL: @sub_compare_folding_swapPD256_undef(
; CHECK-NEXT: [[TMP:%.*]] = call <4 x i1> @llvm.x86.avx512.cmp.pd.256(<4 x double> undef, <4 x double> zeroinitializer, i32 5)
; CHECK-NEXT: [[T0:%.*]] = shufflevector <4 x i1> [[TMP]], <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: [[T1:%.*]] = bitcast <8 x i1> [[T0]] to i8
; CHECK-NEXT: ret i8 [[T1]]
;
%sub.i1 = fsub ninf <4 x double> undef, undef
%tmp = call <4 x i1> @llvm.x86.avx512.cmp.pd.256(<4 x double> %sub.i1, <4 x double> zeroinitializer, i32 5)
%t0 = shufflevector <4 x i1> %tmp, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%t1 = bitcast <8 x i1> %t0 to i8
ret i8 %t1
}

define i8 @sub_compare_folding_swapPD512(<8 x double> %a, <8 x double> %b){
; CHECK-LABEL: @sub_compare_folding_swapPD512(
; CHECK-NEXT: [[T0:%.*]] = call <8 x i1> @llvm.x86.avx512.cmp.pd.512(<8 x double> [[B:%.*]], <8 x double> [[A:%.*]], i32 11, i32 4)
; CHECK-NEXT: [[T1:%.*]] = bitcast <8 x i1> [[T0]] to i8
; CHECK-NEXT: ret i8 [[T1]]
;
%sub.i = fsub ninf <8 x double> %a, %b
%t0 = call <8 x i1> @llvm.x86.avx512.cmp.pd.512(<8 x double> zeroinitializer, <8 x double> %sub.i, i32 11, i32 4)
%t1 = bitcast <8 x i1> %t0 to i8
ret i8 %t1
}

define i8 @sub_compare_folding_swapPS128(<4 x float> %a, <4 x float> %b){
; CHECK-LABEL: @sub_compare_folding_swapPS128(
; CHECK-NEXT: [[T0:%.*]] = call <4 x i1> @llvm.x86.avx512.cmp.ps.128(<4 x float> [[B:%.*]], <4 x float> [[A:%.*]], i32 12)
; CHECK-NEXT: [[T1:%.*]] = shufflevector <4 x i1> [[T0]], <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: [[T2:%.*]] = bitcast <8 x i1> [[T1]] to i8
; CHECK-NEXT: ret i8 [[T2]]
;
%sub.i = fsub ninf <4 x float> %a, %b
%t0 = call <4 x i1> @llvm.x86.avx512.cmp.ps.128(<4 x float> zeroinitializer, <4 x float> %sub.i, i32 12)
%t1 = shufflevector <4 x i1> %t0, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%t2 = bitcast <8 x i1> %t1 to i8
ret i8 %t2
}

define i8 @sub_compare_folding_swapPS256(<8 x float> %a, <8 x float> %b){
; CHECK-LABEL: @sub_compare_folding_swapPS256(
; CHECK-NEXT: [[T0:%.*]] = call <8 x i1> @llvm.x86.avx512.cmp.ps.256(<8 x float> [[B:%.*]], <8 x float> [[A:%.*]], i32 5)
; CHECK-NEXT: [[T1:%.*]] = bitcast <8 x i1> [[T0]] to i8
; CHECK-NEXT: ret i8 [[T1]]
;
%sub.i = fsub ninf <8 x float> %a, %b
%t0 = call <8 x i1> @llvm.x86.avx512.cmp.ps.256(<8 x float> zeroinitializer, <8 x float> %sub.i, i32 5)
%t1 = bitcast <8 x i1> %t0 to i8
ret i8 %t1
}

define i16 @sub_compare_folding_swapPS512(<16 x float> %a, <16 x float> %b){
; CHECK-LABEL: @sub_compare_folding_swapPS512(
; CHECK-NEXT: [[T0:%.*]] = call <16 x i1> @llvm.x86.avx512.cmp.ps.512(<16 x float> [[B:%.*]], <16 x float> [[A:%.*]], i32 11, i32 4)
; CHECK-NEXT: [[T1:%.*]] = bitcast <16 x i1> [[T0]] to i16
; CHECK-NEXT: ret i16 [[T1]]
;
%sub.i = fsub ninf <16 x float> %a, %b
%t0 = call <16 x i1> @llvm.x86.avx512.cmp.ps.512(<16 x float> zeroinitializer, <16 x float> %sub.i, i32 11, i32 4)
%t1 = bitcast <16 x i1> %t0 to i16
ret i16 %t1
}

declare <2 x i1> @llvm.x86.avx512.cmp.pd.128(<2 x double>, <2 x double>, i32)
declare <4 x i1> @llvm.x86.avx512.cmp.pd.256(<4 x double>, <4 x double>, i32)
declare <8 x i1> @llvm.x86.avx512.cmp.pd.512(<8 x double>, <8 x double>, i32, i32)
declare <4 x i1> @llvm.x86.avx512.cmp.ps.128(<4 x float>, <4 x float>, i32)
declare <8 x i1> @llvm.x86.avx512.cmp.ps.256(<8 x float>, <8 x float>, i32)
declare <16 x i1> @llvm.x86.avx512.cmp.ps.512(<16 x float>, <16 x float>, i32, i32)