From 21a343961083bc31b4718bdc78c3ad12b7f84bce Mon Sep 17 00:00:00 2001
From: Chandler Carruth
Date: Thu, 26 Apr 2018 21:46:01 +0000
Subject: [PATCH] [x86] Revert r330322 (& r330323): Lowering x86
 adds/addus/subs/subus intrinsics

The LLVM commit introduces a crash in LLVM's instruction selection.

I filed http://llvm.org/PR37260 with the test case.

llvm-svn: 330997
---
 include/llvm/IR/IntrinsicsX86.td              |  120 +
 lib/IR/AutoUpgrade.cpp                        |  106 +-
 lib/Target/X86/X86ISelLowering.cpp            |   89 -
 lib/Target/X86/X86IntrinsicsInfo.h            |   40 +
 test/CodeGen/X86/avx2-intrinsics-fast-isel.ll |   88 +-
 .../X86/avx2-intrinsics-x86-upgrade.ll        |  127 -
 test/CodeGen/X86/avx2-intrinsics-x86.ll       |  285 +-
 .../X86/avx512bw-intrinsics-upgrade.ll        |  416 ---
 test/CodeGen/X86/avx512bw-intrinsics.ll       |  416 +++
 .../X86/avx512bwvl-intrinsics-upgrade.ll      | 1040 ------
 test/CodeGen/X86/avx512bwvl-intrinsics.ll     | 1040 ++++++
 test/CodeGen/X86/sse2-intrinsics-fast-isel.ll |  104 +-
 .../X86/sse2-intrinsics-x86-upgrade.ll        |  166 -
 test/CodeGen/X86/sse2-intrinsics-x86.ll       |  168 +
 test/CodeGen/X86/sse2-schedule.ll             |  111 +-
 test/CodeGen/X86/vector-arith-sat.ll          | 3025 -----------------
 .../MemorySanitizer/msan_x86intrinsics.ll     |   10 +-
 17 files changed, 2121 insertions(+), 5230 deletions(-)
 delete mode 100644 test/CodeGen/X86/vector-arith-sat.ll

diff --git a/include/llvm/IR/IntrinsicsX86.td b/include/llvm/IR/IntrinsicsX86.td
index 22d4d5d16ea..db4cb46b308 100644
--- a/include/llvm/IR/IntrinsicsX86.td
+++ b/include/llvm/IR/IntrinsicsX86.td
@@ -378,6 +378,30 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
 
 // Integer arithmetic ops.
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
+  def int_x86_sse2_padds_b : GCCBuiltin<"__builtin_ia32_paddsb128">,
+          Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty,
+                     llvm_v16i8_ty], [IntrNoMem, Commutative]>;
+  def int_x86_sse2_padds_w : GCCBuiltin<"__builtin_ia32_paddsw128">,
+          Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty,
+                     llvm_v8i16_ty], [IntrNoMem, Commutative]>;
+  def int_x86_sse2_paddus_b : GCCBuiltin<"__builtin_ia32_paddusb128">,
+          Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty,
+                     llvm_v16i8_ty], [IntrNoMem, Commutative]>;
+  def int_x86_sse2_paddus_w : GCCBuiltin<"__builtin_ia32_paddusw128">,
+          Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty,
+                     llvm_v8i16_ty], [IntrNoMem, Commutative]>;
+  def int_x86_sse2_psubs_b : GCCBuiltin<"__builtin_ia32_psubsb128">,
+          Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty,
+                     llvm_v16i8_ty], [IntrNoMem]>;
+  def int_x86_sse2_psubs_w : GCCBuiltin<"__builtin_ia32_psubsw128">,
+          Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty,
+                     llvm_v8i16_ty], [IntrNoMem]>;
+  def int_x86_sse2_psubus_b : GCCBuiltin<"__builtin_ia32_psubusb128">,
+          Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty,
+                     llvm_v16i8_ty], [IntrNoMem]>;
+  def int_x86_sse2_psubus_w : GCCBuiltin<"__builtin_ia32_psubusw128">,
+          Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty,
+                     llvm_v8i16_ty], [IntrNoMem]>;
   def int_x86_sse2_pmulhu_w : GCCBuiltin<"__builtin_ia32_pmulhuw128">,
           Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty,
                      llvm_v8i16_ty], [IntrNoMem, Commutative]>;
@@ -1603,6 +1627,30 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
 
 // Integer arithmetic ops.
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
+  def int_x86_avx2_padds_b : GCCBuiltin<"__builtin_ia32_paddsb256">,
+          Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty,
+                     llvm_v32i8_ty], [IntrNoMem, Commutative]>;
+  def int_x86_avx2_padds_w : GCCBuiltin<"__builtin_ia32_paddsw256">,
+          Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty,
+                     llvm_v16i16_ty], [IntrNoMem, Commutative]>;
+  def int_x86_avx2_paddus_b : GCCBuiltin<"__builtin_ia32_paddusb256">,
+          Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty,
+                     llvm_v32i8_ty], [IntrNoMem, Commutative]>;
+  def int_x86_avx2_paddus_w : GCCBuiltin<"__builtin_ia32_paddusw256">,
+          Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty,
+                     llvm_v16i16_ty], [IntrNoMem, Commutative]>;
+  def int_x86_avx2_psubs_b : GCCBuiltin<"__builtin_ia32_psubsb256">,
+          Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty,
+                     llvm_v32i8_ty], [IntrNoMem]>;
+  def int_x86_avx2_psubs_w : GCCBuiltin<"__builtin_ia32_psubsw256">,
+          Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty,
+                     llvm_v16i16_ty], [IntrNoMem]>;
+  def int_x86_avx2_psubus_b : GCCBuiltin<"__builtin_ia32_psubusb256">,
+          Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty,
+                     llvm_v32i8_ty], [IntrNoMem]>;
+  def int_x86_avx2_psubus_w : GCCBuiltin<"__builtin_ia32_psubusw256">,
+          Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty,
+                     llvm_v16i16_ty], [IntrNoMem]>;
   def int_x86_avx2_pmulhu_w : GCCBuiltin<"__builtin_ia32_pmulhuw256">,
           Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty,
                      llvm_v16i16_ty], [IntrNoMem, Commutative]>;
@@ -4647,6 +4695,78 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
 }
 
 // Integer arithmetic ops
 let TargetPrefix = "x86" in {
+  def int_x86_avx512_mask_padds_b_128 : // FIXME: remove this intrinsic
+          Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty,
+                     llvm_v16i8_ty, llvm_i16_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_padds_b_256 : // FIXME: remove this intrinsic
+          Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty, llvm_v32i8_ty,
+                     llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_padds_b_512 : GCCBuiltin<"__builtin_ia32_paddsb512_mask">,
+          Intrinsic<[llvm_v64i8_ty], [llvm_v64i8_ty, llvm_v64i8_ty,
+                     llvm_v64i8_ty, llvm_i64_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_padds_w_128 : // FIXME: remove this intrinsic
+          Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty,
+                     llvm_v8i16_ty, llvm_i8_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_padds_w_256 : // FIXME: remove this intrinsic
+          Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty,
+                     llvm_v16i16_ty, llvm_i16_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_padds_w_512 : GCCBuiltin<"__builtin_ia32_paddsw512_mask">,
+          Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_v32i16_ty,
+                     llvm_v32i16_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_paddus_b_128 : // FIXME: remove this intrinsic
+          Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty,
+                     llvm_v16i8_ty, llvm_i16_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_paddus_b_256 : // FIXME: remove this intrinsic
+          Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty, llvm_v32i8_ty,
+                     llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_paddus_b_512 : GCCBuiltin<"__builtin_ia32_paddusb512_mask">,
+          Intrinsic<[llvm_v64i8_ty], [llvm_v64i8_ty, llvm_v64i8_ty,
+                     llvm_v64i8_ty, llvm_i64_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_paddus_w_128 : // FIXME: remove this intrinsic
+          Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty,
+                     llvm_v8i16_ty, llvm_i8_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_paddus_w_256 : // FIXME: remove this intrinsic
+          Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty,
+                     llvm_v16i16_ty, llvm_i16_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_paddus_w_512 : GCCBuiltin<"__builtin_ia32_paddusw512_mask">,
+          Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_v32i16_ty,
+                     llvm_v32i16_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_psubs_b_128 : // FIXME: remove this intrinsic
+          Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty,
+                     llvm_v16i8_ty, llvm_i16_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_psubs_b_256 : // FIXME: remove this intrinsic
+          Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty, llvm_v32i8_ty,
+                     llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_psubs_b_512 : GCCBuiltin<"__builtin_ia32_psubsb512_mask">,
+          Intrinsic<[llvm_v64i8_ty], [llvm_v64i8_ty, llvm_v64i8_ty,
+                     llvm_v64i8_ty, llvm_i64_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_psubs_w_128 : // FIXME: remove this intrinsic
+          Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty,
+                     llvm_v8i16_ty, llvm_i8_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_psubs_w_256 : // FIXME: remove this intrinsic
+          Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty,
+                     llvm_v16i16_ty, llvm_i16_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_psubs_w_512 : GCCBuiltin<"__builtin_ia32_psubsw512_mask">,
+          Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_v32i16_ty,
+                     llvm_v32i16_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_psubus_b_128 : // FIXME: remove this intrinsic
+          Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty,
+                     llvm_v16i8_ty, llvm_i16_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_psubus_b_256 : // FIXME: remove this intrinsic
+          Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty, llvm_v32i8_ty,
+                     llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_psubus_b_512 : GCCBuiltin<"__builtin_ia32_psubusb512_mask">,
+          Intrinsic<[llvm_v64i8_ty], [llvm_v64i8_ty, llvm_v64i8_ty,
+                     llvm_v64i8_ty, llvm_i64_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_psubus_w_128 : // FIXME: remove this intrinsic
+          Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty,
+                     llvm_v8i16_ty, llvm_i8_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_psubus_w_256 : // FIXME: remove this intrinsic
+          Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty,
+                     llvm_v16i16_ty, llvm_i16_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_psubus_w_512 : GCCBuiltin<"__builtin_ia32_psubusw512_mask">,
+          Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_v32i16_ty,
+                     llvm_v32i16_ty, llvm_i32_ty], [IntrNoMem]>;
   def int_x86_avx512_pmulhu_w_512 : GCCBuiltin<"__builtin_ia32_pmulhuw512">,
           Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty,
                      llvm_v32i16_ty], [IntrNoMem, Commutative]>;
diff --git a/lib/IR/AutoUpgrade.cpp b/lib/IR/AutoUpgrade.cpp
index 8f602035f78..13fb2782472 100644
--- a/lib/IR/AutoUpgrade.cpp
+++ b/lib/IR/AutoUpgrade.cpp
@@ -84,19 +84,7 @@ static bool ShouldUpgradeX86Intrinsic(Function *F, StringRef Name) {
   // like to use this information to remove upgrade code for some older
   // intrinsics. It is currently undecided how we will determine that future
   // point.
- if (Name.startswith("sse2.padds") || // Added in 7.0 - Name.startswith("sse2.paddus") || // Added in 7.0 - Name.startswith("sse2.psubs") || // Added in 7.0 - Name.startswith("sse2.psubus") || // Added in 7.0 - Name.startswith("avx2.padds") || // Added in 7.0 - Name.startswith("avx2.paddus") || // Added in 7.0 - Name.startswith("avx2.psubs") || // Added in 7.0 - Name.startswith("avx2.psubus") || // Added in 7.0 - Name.startswith("avx512.mask.padds") || // Added in 7.0 - Name.startswith("avx512.mask.paddus") || // Added in 7.0 - Name.startswith("avx512.mask.psubs") || // Added in 7.0 - Name.startswith("avx512.mask.psubus") || // Added in 7.0 - Name=="ssse3.pabs.b.128" || // Added in 6.0 + if (Name=="ssse3.pabs.b.128" || // Added in 6.0 Name=="ssse3.pabs.w.128" || // Added in 6.0 Name=="ssse3.pabs.d.128" || // Added in 6.0 Name.startswith("avx512.mask.shuf.i") || // Added in 6.0 @@ -857,77 +845,6 @@ static Value *UpgradeX86ALIGNIntrinsics(IRBuilder<> &Builder, Value *Op0, return EmitX86Select(Builder, Mask, Align, Passthru); } -static Value *UpgradeX86AddSubSatIntrinsics(IRBuilder<> &Builder, CallInst &CI, - bool IsSigned, bool IsAddition) { - // Get elements. - Value *Op0 = CI.getArgOperand(0); - Value *Op1 = CI.getArgOperand(1); - - // Extend elements. - Type *ResultType = CI.getType(); - unsigned NumElts = ResultType->getVectorNumElements(); - - Value *Res; - if (!IsAddition && !IsSigned) { - Value *ICmp = Builder.CreateICmp(ICmpInst::ICMP_UGT, Op0, Op1); - Value *Select = Builder.CreateSelect(ICmp, Op0, Op1); - Res = Builder.CreateSub(Select, Op1); - } else { - Type *EltType = ResultType->getVectorElementType(); - Type *ExtEltType = EltType == Builder.getInt8Ty() ? Builder.getInt16Ty() - : Builder.getInt32Ty(); - Type *ExtVT = VectorType::get(ExtEltType, NumElts); - Op0 = IsSigned ? Builder.CreateSExt(Op0, ExtVT) - : Builder.CreateZExt(Op0, ExtVT); - Op1 = IsSigned ? Builder.CreateSExt(Op1, ExtVT) - : Builder.CreateZExt(Op1, ExtVT); - - // Perform addition/substraction. - Res = IsAddition ? Builder.CreateAdd(Op0, Op1) - : Builder.CreateSub(Op0, Op1); - - // Create a vector of maximum values of not extended type - // (if overflow occurs, it will be saturated to that value). - unsigned EltSizeInBits = EltType->getPrimitiveSizeInBits(); - APInt MaxInt = IsSigned ? APInt::getSignedMaxValue(EltSizeInBits) - : APInt::getMaxValue(EltSizeInBits); - Value *MaxVec = ConstantInt::get(ResultType, MaxInt); - // Extend so that it can be compared to result of add/sub. - MaxVec = IsSigned ? Builder.CreateSExt(MaxVec, ExtVT) - : Builder.CreateZExt(MaxVec, ExtVT); - - // Saturate overflow. - ICmpInst::Predicate Pred = IsSigned ? ICmpInst::ICMP_SLE - : ICmpInst::ICMP_ULE; - Value *Cmp = Builder.CreateICmp(Pred, Res, - MaxVec); // 1 if no overflow. - Res = Builder.CreateSelect(Cmp, Res, - MaxVec); // If overflowed, copy from max vec. - - // Saturate underflow. - if (IsSigned) { - APInt MinInt = APInt::getSignedMinValue(EltSizeInBits); - Value *MinVec = ConstantInt::get(ResultType, MinInt); - // Extend so that it can be compared to result of add/sub. - MinVec = Builder.CreateSExt(MinVec, ExtVT); - Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_SGT, Res, - MinVec); // 1 if no underflow. - Res = Builder.CreateSelect(Cmp, Res, - MinVec); // If underflowed, copy from min vec. - } - - // Truncate to original type. - Res = Builder.CreateTrunc(Res, ResultType); - } - - if (CI.getNumArgOperands() == 4) { // For masked intrinsics. 
-    Value *VecSRC = CI.getArgOperand(2);
-    Value *Mask = CI.getArgOperand(3);
-    Res = EmitX86Select(Builder, Mask, Res, VecSRC);
-  }
-  return Res;
-}
-
 static Value *UpgradeMaskedStore(IRBuilder<> &Builder,
                                  Value *Ptr, Value *Data, Value *Mask,
                                  bool Aligned) {
@@ -1766,26 +1683,6 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
                                         ShuffleMask);
     Rep = EmitX86Select(Builder, CI->getArgOperand(2), Rep,
                         CI->getArgOperand(1));
-  } else if (IsX86 && (Name.startswith("sse2.padds") ||
-                       Name.startswith("avx2.padds") ||
-                       Name.startswith("avx512.mask.padds"))) {
-    Rep = UpgradeX86AddSubSatIntrinsics(Builder, *CI,
-                                        true, true); // Signed add.
-  } else if (IsX86 && (Name.startswith("sse2.paddus") ||
-                       Name.startswith("avx2.paddus") ||
-                       Name.startswith("avx512.mask.paddus"))) {
-    Rep = UpgradeX86AddSubSatIntrinsics(Builder, *CI,
-                                        false, true); // Unsigned add.
-  } else if (IsX86 && (Name.startswith("sse2.psubs") ||
-                       Name.startswith("avx2.psubs") ||
-                       Name.startswith("avx512.mask.psubs"))) {
-    Rep = UpgradeX86AddSubSatIntrinsics(Builder, *CI,
-                                        true, false); // Signed sub.
-  } else if (IsX86 && (Name.startswith("sse2.psubus") ||
-                       Name.startswith("avx2.psubus") ||
-                       Name.startswith("avx512.mask.psubus"))) {
-    Rep = UpgradeX86AddSubSatIntrinsics(Builder, *CI,
-                                        false, false); // Unsigned sub.
   } else if (IsX86 && (Name.startswith("avx2.pbroadcast") ||
                        Name.startswith("avx2.vbroadcast") ||
                        Name.startswith("avx512.pbroadcast") ||
@@ -1796,6 +1693,7 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
       Type *MaskTy = VectorType::get(Type::getInt32Ty(C), NumElts);
       Rep = Builder.CreateShuffleVector(Op, UndefValue::get(Op->getType()),
                                         Constant::getNullValue(MaskTy));
+
       if (CI->getNumArgOperands() == 3)
         Rep = EmitX86Select(Builder, CI->getArgOperand(2), Rep,
                             CI->getArgOperand(1));
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 4d95a2c4987..ef08b90122d 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -36023,91 +36023,6 @@ static SDValue combineVectorSignBitsTruncation(SDNode *N, const SDLoc &DL,
   return SDValue();
 }
 
-/// This function detects the addition or substraction with saturation pattern
-/// between 2 unsigned i8/i16 vectors and replace this operation with the
-/// efficient X86ISD::ADDUS/X86ISD::ADDS/X86ISD::SUBUS/x86ISD::SUBS instruction.
-static SDValue detectAddSubSatPattern(SDValue In, EVT VT, SelectionDAG &DAG,
-                                      const X86Subtarget &Subtarget,
-                                      const SDLoc &DL) {
-  if (!VT.isVector() || !VT.isSimple())
-    return SDValue();
-  EVT InVT = In.getValueType();
-  unsigned NumElems = VT.getVectorNumElements();
-
-  EVT ScalarVT = VT.getVectorElementType();
-  if ((ScalarVT != MVT::i8 && ScalarVT != MVT::i16) ||
-      InVT.getSizeInBits() % 128 != 0 || !isPowerOf2_32(NumElems))
-    return SDValue();
-
-  // InScalarVT is the intermediate type in AddSubSat pattern
-  // and it should be greater than the original input type (i8/i16).
-  EVT InScalarVT = InVT.getVectorElementType();
-  if (InScalarVT.getSizeInBits() <= ScalarVT.getSizeInBits())
-    return SDValue();
-
-  if (!Subtarget.hasSSE2())
-    return SDValue();
-
-  // Detect the following pattern:
-  // %2 = zext <16 x i8> %0 to <16 x i16>
-  // %3 = zext <16 x i8> %1 to <16 x i16>
-  // %4 = add nuw nsw <16 x i16> %3, %2
-  // %5 = icmp ult <16 x i16> %4, <16 x i16> (vector of max InScalarVT values)
-  // %6 = select <16 x i1> %5, <16 x i16> %4, <16 x i16> (vector of max InScalarVT values)
-  // %7 = trunc <16 x i16> %6 to <16 x i8>
-
-  // Detect a Sat Pattern
-  bool Signed = true;
-  SDValue Sat = detectSSatPattern(In, VT, false);
-  if (!Sat) {
-    Sat = detectUSatPattern(In, VT);
-    Signed = false;
-  }
-  if (!Sat)
-    return SDValue();
-  if (Sat.getOpcode() != ISD::ADD && Sat.getOpcode() != ISD::SUB)
-    return SDValue();
-
-  unsigned Opcode = Sat.getOpcode() == ISD::ADD ? Signed ? X86ISD::ADDS
-                                                         : X86ISD::ADDUS
-                                                : Signed ? X86ISD::SUBS
-                                                         : X86ISD::SUBUS;
-
-  // Get addition elements.
-  SDValue LHS = Sat.getOperand(0);
-  SDValue RHS = Sat.getOperand(1);
-
-  // Check if LHS and RHS are results of type promotion or
-  // one of them is and the other one is constant.
-  unsigned ExtendOpcode = Signed ? ISD::SIGN_EXTEND :
-                                   ISD::ZERO_EXTEND;
-  unsigned LHSOpcode = LHS.getOpcode();
-  unsigned RHSOpcode = RHS.getOpcode();
-
-  if (LHSOpcode == ExtendOpcode && RHSOpcode == ExtendOpcode) {
-    LHS = LHS.getOperand(0);
-    RHS = RHS.getOperand(0);
-  } else if (LHSOpcode == ExtendOpcode &&
-             ISD::isBuildVectorOfConstantSDNodes(RHS.getNode())) {
-    LHS = LHS.getOperand(0);
-    RHS = DAG.getNode(ISD::TRUNCATE, DL, VT, RHS);
-  } else if (RHSOpcode == ExtendOpcode &&
-             ISD::isBuildVectorOfConstantSDNodes(LHS.getNode())) {
-    RHS = RHS.getOperand(0);
-    LHS = DAG.getNode(ISD::TRUNCATE, DL, VT, LHS);
-  } else
-    return SDValue();
-
-  // The pattern is detected, emit ADDS/ADDUS/SUBS/SUBUS instruction.
-  auto AddSubSatBuilder = [Opcode](SelectionDAG &DAG, const SDLoc &DL,
-                                   ArrayRef<SDValue> Ops) {
-    EVT VT = Ops[0].getValueType();
-    return DAG.getNode(Opcode, DL, VT, Ops);
-  };
-  return SplitOpsAndApply(DAG, Subtarget, DL, VT, { LHS, RHS },
-                          AddSubSatBuilder);
-}
-
 // Try to form a MULHU or MULHS node by looking for
 // (trunc (srl (mul ext, ext), 16))
 // TODO: This is X86 specific because we want to be able to handle wide types
@@ -36175,10 +36090,6 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
   if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
     return Avg;
 
-  // Try to detect addition or substraction with saturation.
-  if (SDValue AddSubSat = detectAddSubSatPattern(Src, VT, DAG, Subtarget, DL))
-    return AddSubSat;
-
   // Try to combine truncation with signed/unsigned saturation.
   if (SDValue Val = combineTruncateWithSat(Src, VT, DL, DAG, Subtarget))
     return Val;
 
diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h
index 66f0a675cf1..ad95d4e9aff 100644
--- a/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/lib/Target/X86/X86IntrinsicsInfo.h
@@ -402,6 +402,10 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
   X86_INTRINSIC_DATA(avx2_packsswb, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
   X86_INTRINSIC_DATA(avx2_packusdw, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
   X86_INTRINSIC_DATA(avx2_packuswb, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
+  X86_INTRINSIC_DATA(avx2_padds_b, INTR_TYPE_2OP, X86ISD::ADDS, 0),
+  X86_INTRINSIC_DATA(avx2_padds_w, INTR_TYPE_2OP, X86ISD::ADDS, 0),
+  X86_INTRINSIC_DATA(avx2_paddus_b, INTR_TYPE_2OP, X86ISD::ADDUS, 0),
+  X86_INTRINSIC_DATA(avx2_paddus_w, INTR_TYPE_2OP, X86ISD::ADDUS, 0),
   X86_INTRINSIC_DATA(avx2_phadd_d, INTR_TYPE_2OP, X86ISD::HADD, 0),
   X86_INTRINSIC_DATA(avx2_phadd_w, INTR_TYPE_2OP, X86ISD::HADD, 0),
   X86_INTRINSIC_DATA(avx2_phsub_d, INTR_TYPE_2OP, X86ISD::HSUB, 0),
   X86_INTRINSIC_DATA(avx2_phsub_w, INTR_TYPE_2OP, X86ISD::HSUB, 0),
@@ -440,6 +444,10 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
   X86_INTRINSIC_DATA(avx2_psrlv_d_256, INTR_TYPE_2OP, ISD::SRL, 0),
   X86_INTRINSIC_DATA(avx2_psrlv_q, INTR_TYPE_2OP, ISD::SRL, 0),
   X86_INTRINSIC_DATA(avx2_psrlv_q_256, INTR_TYPE_2OP, ISD::SRL, 0),
+  X86_INTRINSIC_DATA(avx2_psubs_b, INTR_TYPE_2OP, X86ISD::SUBS, 0),
+  X86_INTRINSIC_DATA(avx2_psubs_w, INTR_TYPE_2OP, X86ISD::SUBS, 0),
+  X86_INTRINSIC_DATA(avx2_psubus_b, INTR_TYPE_2OP, X86ISD::SUBUS, 0),
+  X86_INTRINSIC_DATA(avx2_psubus_w, INTR_TYPE_2OP, X86ISD::SUBUS, 0),
   X86_INTRINSIC_DATA(avx512_cvtsi2sd64, INTR_TYPE_3OP, X86ISD::SCALAR_SINT_TO_FP_RND, 0),
   X86_INTRINSIC_DATA(avx512_cvtsi2ss32, INTR_TYPE_3OP, X86ISD::SCALAR_SINT_TO_FP_RND, 0),
   X86_INTRINSIC_DATA(avx512_cvtsi2ss64, INTR_TYPE_3OP, X86ISD::SCALAR_SINT_TO_FP_RND, 0),
@@ -795,6 +803,18 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
                      X86ISD::FMULS_RND, 0),
   X86_INTRINSIC_DATA(avx512_mask_mul_ss_round, INTR_TYPE_SCALAR_MASK_RM,
                      X86ISD::FMULS_RND, 0),
+  X86_INTRINSIC_DATA(avx512_mask_padds_b_128, INTR_TYPE_2OP_MASK, X86ISD::ADDS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_padds_b_256, INTR_TYPE_2OP_MASK, X86ISD::ADDS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_padds_b_512, INTR_TYPE_2OP_MASK, X86ISD::ADDS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_padds_w_128, INTR_TYPE_2OP_MASK, X86ISD::ADDS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_padds_w_256, INTR_TYPE_2OP_MASK, X86ISD::ADDS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_padds_w_512, INTR_TYPE_2OP_MASK, X86ISD::ADDS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_paddus_b_128, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_paddus_b_256, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_paddus_b_512, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_paddus_w_128, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_paddus_w_256, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_paddus_w_512, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0),
   X86_INTRINSIC_DATA(avx512_mask_permvar_df_256, VPERM_2OP_MASK,
                      X86ISD::VPERMV, 0),
   X86_INTRINSIC_DATA(avx512_mask_permvar_df_512, VPERM_2OP_MASK,
@@ -961,6 +981,18 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
   X86_INTRINSIC_DATA(avx512_mask_prorv_q_128, INTR_TYPE_2OP_MASK, ISD::ROTR, 0),
   X86_INTRINSIC_DATA(avx512_mask_prorv_q_256, INTR_TYPE_2OP_MASK, ISD::ROTR, 0),
   X86_INTRINSIC_DATA(avx512_mask_prorv_q_512, INTR_TYPE_2OP_MASK, ISD::ROTR, 0),
+  X86_INTRINSIC_DATA(avx512_mask_psubs_b_128, INTR_TYPE_2OP_MASK, X86ISD::SUBS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_psubs_b_256, INTR_TYPE_2OP_MASK, X86ISD::SUBS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_psubs_b_512, INTR_TYPE_2OP_MASK, X86ISD::SUBS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_psubs_w_128, INTR_TYPE_2OP_MASK, X86ISD::SUBS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_psubs_w_256, INTR_TYPE_2OP_MASK, X86ISD::SUBS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_psubs_w_512, INTR_TYPE_2OP_MASK, X86ISD::SUBS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_psubus_b_128, INTR_TYPE_2OP_MASK, X86ISD::SUBUS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_psubus_b_256, INTR_TYPE_2OP_MASK, X86ISD::SUBUS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_psubus_b_512, INTR_TYPE_2OP_MASK, X86ISD::SUBUS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_psubus_w_128, INTR_TYPE_2OP_MASK, X86ISD::SUBUS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_psubus_w_256, INTR_TYPE_2OP_MASK, X86ISD::SUBUS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_psubus_w_512, INTR_TYPE_2OP_MASK, X86ISD::SUBUS, 0),
   X86_INTRINSIC_DATA(avx512_mask_pternlog_d_128, TERLOG_OP_MASK,
                      X86ISD::VPTERNLOG, 0),
   X86_INTRINSIC_DATA(avx512_mask_pternlog_d_256, TERLOG_OP_MASK,
@@ -1570,6 +1602,10 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
   X86_INTRINSIC_DATA(sse2_packssdw_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
   X86_INTRINSIC_DATA(sse2_packsswb_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
   X86_INTRINSIC_DATA(sse2_packuswb_128, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
+  X86_INTRINSIC_DATA(sse2_padds_b, INTR_TYPE_2OP, X86ISD::ADDS, 0),
+  X86_INTRINSIC_DATA(sse2_padds_w, INTR_TYPE_2OP, X86ISD::ADDS, 0),
+  X86_INTRINSIC_DATA(sse2_paddus_b, INTR_TYPE_2OP, X86ISD::ADDUS, 0),
+  X86_INTRINSIC_DATA(sse2_paddus_w, INTR_TYPE_2OP, X86ISD::ADDUS, 0),
   X86_INTRINSIC_DATA(sse2_pmadd_wd, INTR_TYPE_2OP, X86ISD::VPMADDWD, 0),
   X86_INTRINSIC_DATA(sse2_pmovmskb_128, INTR_TYPE_1OP, X86ISD::MOVMSK, 0),
   X86_INTRINSIC_DATA(sse2_pmulh_w, INTR_TYPE_2OP, ISD::MULHS, 0),
@@ -1591,6 +1627,10 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
   X86_INTRINSIC_DATA(sse2_psrli_d, VSHIFT, X86ISD::VSRLI, 0),
   X86_INTRINSIC_DATA(sse2_psrli_q, VSHIFT, X86ISD::VSRLI, 0),
   X86_INTRINSIC_DATA(sse2_psrli_w, VSHIFT, X86ISD::VSRLI, 0),
+  X86_INTRINSIC_DATA(sse2_psubs_b, INTR_TYPE_2OP, X86ISD::SUBS, 0),
+  X86_INTRINSIC_DATA(sse2_psubs_w, INTR_TYPE_2OP, X86ISD::SUBS, 0),
+  X86_INTRINSIC_DATA(sse2_psubus_b, INTR_TYPE_2OP, X86ISD::SUBUS, 0),
+  X86_INTRINSIC_DATA(sse2_psubus_w, INTR_TYPE_2OP, X86ISD::SUBUS, 0),
   X86_INTRINSIC_DATA(sse2_sqrt_pd, INTR_TYPE_1OP, ISD::FSQRT, 0),
   X86_INTRINSIC_DATA(sse2_ucomieq_sd, COMI, X86ISD::UCOMI, ISD::SETEQ),
   X86_INTRINSIC_DATA(sse2_ucomige_sd, COMI, X86ISD::UCOMI, ISD::SETGE),
diff --git a/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll b/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll
index 47b681f4fa3..0a61f21c90c 100644
--- a/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll
+++ b/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll
@@ -98,17 +98,11 @@ define <4 x i64> @test_mm256_adds_epi8(<4 x i64> %a0, <4 x i64> %a1) {
 ; CHECK-NEXT:    ret{{[l|q]}}
   %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
   %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
-  %1 = sext <32 x i8> %arg0 to <32 x i16>
-  %2 = sext <32 x i8> %arg1 to <32 x i16>
-  %3 = add nsw <32 x i16> %1, %2
-  %4 = icmp slt <32 x i16> %3,
-  %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16>
-  %6 = icmp sgt <32 x i16> %5,
-  %7 = select <32 x i1> %6, <32 x i16> %5, <32 x i16>
-  %8 = trunc <32 x i16> %7 to <32 x i8>
-  %bc = bitcast <32 x i8> %8 to <4 x i64>
+  %res = call <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8> %arg0, <32 x i8> %arg1)
+  %bc = bitcast <32 x i8> %res to <4 x i64>
   ret <4 x i64> %bc
 }
+declare <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8>, <32 x i8>) nounwind readnone
 
 define <4 x i64> @test_mm256_adds_epi16(<4 x i64> %a0, <4 x i64> %a1) {
 ; CHECK-LABEL: test_mm256_adds_epi16:
@@ -117,17 +111,11 @@ define <4 x i64> @test_mm256_adds_epi16(<4 x i64> %a0, <4 x i64> %a1) {
 ; CHECK-NEXT:    ret{{[l|q]}}
   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
-  %1 = sext <16 x i16> %arg0 to <16 x i32>
-  %2 = sext <16 x i16> %arg1 to <16 x i32>
-  %3 = add nsw <16 x i32> %1, %2
-  %4 = icmp slt <16 x i32> %3,
-  %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32>
-  %6 = icmp sgt <16 x i32> %5,
-  %7 = select <16 x i1> %6, <16 x i32> %5, <16 x i32>
-  %8 = trunc <16 x i32> %7 to <16 x i16>
-  %bc = bitcast <16 x i16> %8 to <4 x i64>
+  %res = call <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16> %arg0, <16 x i16> %arg1)
+  %bc = bitcast <16 x i16> %res to <4 x i64>
   ret <4 x i64> %bc
 }
+declare <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16>, <16 x i16>) nounwind readnone
 
 define <4 x i64> @test_mm256_adds_epu8(<4 x i64> %a0, <4 x i64> %a1) {
 ; CHECK-LABEL: test_mm256_adds_epu8:
@@ -136,15 +124,11 @@ define <4 x i64> @test_mm256_adds_epu8(<4 x i64> %a0, <4 x i64> %a1) {
 ; CHECK-NEXT:    ret{{[l|q]}}
   %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
   %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
-  %1 = zext <32 x i8> %arg0 to <32 x i16>
-  %2 = zext <32 x i8> %arg1 to <32 x i16>
-  %3 = add nsw <32 x i16> %1, %2
-  %4 = icmp ult <32 x i16> %3,
-  %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16>
-  %6 = trunc <32 x i16> %5 to <32 x i8>
-  %bc = bitcast <32 x i8> %6 to <4 x i64>
+  %res = call <32 x i8> @llvm.x86.avx2.paddus.b(<32 x i8> %arg0, <32 x i8> %arg1)
+  %bc = bitcast <32 x i8> %res to <4 x i64>
   ret <4 x i64> %bc
 }
+declare <32 x i8> @llvm.x86.avx2.paddus.b(<32 x i8>, <32 x i8>) nounwind readnone
 
 define <4 x i64> @test_mm256_adds_epu16(<4 x i64> %a0, <4 x i64> %a1) {
 ; CHECK-LABEL: test_mm256_adds_epu16:
@@ -153,15 +137,11 @@ define <4 x i64> @test_mm256_adds_epu16(<4 x i64> %a0, <4 x i64> %a1) {
 ; CHECK-NEXT:    ret{{[l|q]}}
   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
-  %1 = zext <16 x i16> %arg0 to <16 x i32>
-  %2 = zext <16 x i16> %arg1 to <16 x i32>
-  %3 = add nsw <16 x i32> %1, %2
-  %4 = icmp ult <16 x i32> %3,
-  %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32>
-  %6 = trunc <16 x i32> %5 to <16 x i16>
-  %bc = bitcast <16 x i16> %6 to <4 x i64>
+  %res = call <16 x i16> @llvm.x86.avx2.paddus.w(<16 x i16> %arg0, <16 x i16> %arg1)
+  %bc = bitcast <16 x i16> %res to <4 x i64>
   ret <4 x i64> %bc
 }
+declare <16 x i16> @llvm.x86.avx2.paddus.w(<16 x i16>, <16 x i16>) nounwind readnone
 
 define <4 x i64> @test_mm256_alignr_epi8(<4 x i64> %a0, <4 x i64> %a1) {
 ; CHECK-LABEL: test_mm256_alignr_epi8:
@@ -2549,17 +2529,11 @@ define <4 x i64> @test_mm256_subs_epi8(<4 x i64> %a0, <4 x i64> %a1) {
 ; CHECK-NEXT:    ret{{[l|q]}}
   %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
   %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
-  %1 = sext <32 x i8> %arg0 to <32 x i16>
-  %2 = sext <32 x i8> %arg1 to <32 x i16>
-  %3 = sub nsw <32 x i16> %1, %2
-  %4 = icmp slt <32 x i16> %3,
-  %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16>
-  %6 = icmp sgt <32 x i16> %5,
-  %7 = select <32 x i1> %6, <32 x i16> %5, <32 x i16>
-  %8 = trunc <32 x i16> %7 to <32 x i8>
-  %bc = bitcast <32 x i8> %8 to <4 x i64>
+  %res = call <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8> %arg0, <32 x i8> %arg1)
+  %bc = bitcast <32 x i8> %res to <4 x i64>
   ret <4 x i64> %bc
 }
+declare <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8>, <32 x i8>) nounwind readnone
 
 define <4 x i64> @test_mm256_subs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
 ; CHECK-LABEL: test_mm256_subs_epi16:
@@ -2568,47 +2542,37 @@ define <4 x i64> @test_mm256_subs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
 ; CHECK-NEXT:    ret{{[l|q]}}
   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
-  %1 = sext <16 x i16> %arg0 to <16 x i32>
-  %2 = sext <16 x i16> %arg1 to <16 x i32>
-  %3 = sub nsw <16 x i32> %1, %2
-  %4 = icmp slt <16 x i32> %3,
-  %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32>
-  %6 = icmp sgt <16 x i32> %5,
-  %7 = select <16 x i1> %6, <16 x i32> %5, <16 x i32>
-  %8 = trunc <16 x i32> %7 to <16 x i16>
-  %bc = bitcast <16 x i16> %8 to <4 x i64>
+  %res = call <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16> %arg0, <16 x i16> %arg1)
+  %bc = bitcast <16 x i16> %res to <4 x i64>
   ret <4 x i64> %bc
 }
+declare <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16>, <16 x i16>) nounwind readnone
 
 define <4 x i64> @test_mm256_subs_epu8(<4 x i64> %a0, <4 x i64> %a1) {
 ; CHECK-LABEL: test_mm256_subs_epu8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
-; CHECK-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    vpsubusb %ymm1, %ymm0, %ymm0
 ; CHECK-NEXT:    ret{{[l|q]}}
   %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
   %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
-  %cmp = icmp ugt <32 x i8> %arg0, %arg1
-  %sel = select <32 x i1> %cmp, <32 x i8> %arg0, <32 x i8> %arg1
-  %sub = sub <32 x i8> %sel, %arg1
-  %bc = bitcast <32 x i8> %sub to <4 x i64>
+  %res = call <32 x i8> @llvm.x86.avx2.psubus.b(<32 x i8> %arg0, <32 x i8> %arg1)
+  %bc = bitcast <32 x i8> %res to <4 x i64>
   ret <4 x i64> %bc
 }
+declare <32 x i8> @llvm.x86.avx2.psubus.b(<32 x i8>, <32 x i8>) nounwind readnone
 
 define <4 x i64> @test_mm256_subs_epu16(<4 x i64> %a0, <4 x i64> %a1) {
 ; CHECK-LABEL: test_mm256_subs_epu16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
-; CHECK-NEXT:    vpsubw %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    vpsubusw %ymm1, %ymm0, %ymm0
 ; CHECK-NEXT:    ret{{[l|q]}}
   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
-  %cmp = icmp ugt <16 x i16> %arg0, %arg1
-  %sel = select <16 x i1> %cmp, <16 x i16> %arg0, <16 x i16> %arg1
-  %sub = sub <16 x i16> %sel, %arg1
-  %bc = bitcast <16 x i16> %sub to <4 x i64>
+  %res = call <16 x i16> @llvm.x86.avx2.psubus.w(<16 x i16> %arg0, <16 x i16> %arg1)
+  %bc = bitcast <16 x i16> %res to <4 x i64>
   ret <4 x i64> %bc
 }
+declare <16 x i16> @llvm.x86.avx2.psubus.w(<16 x i16>, <16 x i16>) nounwind readnone
 
 define <4 x i64> @test_mm256_unpackhi_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
 ; CHECK-LABEL: test_mm256_unpackhi_epi8:
diff --git a/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll b/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll
index 4073dae58e9..4a3e98a45fc 100644
--- a/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll
+++ b/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll
@@ -848,133 +848,6 @@ define <4 x i64> @test_x86_avx2_vperm2i128(<4 x i64> %a0, <4 x i64> %a1) {
 declare <4 x i64> @llvm.x86.avx2.vperm2i128(<4 x i64>, <4 x i64>, i8) nounwind readonly
 
 
-define <32 x i8> @test_x86_avx2_padds_b(<32 x i8> %a0, <32 x i8> %a1) {
-; AVX2-LABEL: test_x86_avx2_padds_b:
-; AVX2:       ## %bb.0:
-; AVX2-NEXT:    vpaddsb %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    ret{{[l|q]}}
-;
-; AVX512VL-LABEL: test_x86_avx2_padds_b:
-; AVX512VL:       ## %bb.0:
-; AVX512VL-NEXT:    vpaddsb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xec,0xc1]
-; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
-  %res = call <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1]
-  ret <32 x i8> %res
-}
-declare <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8>, <32 x i8>) nounwind readnone
-
-
-define <16 x i16> @test_x86_avx2_padds_w(<16 x i16> %a0, <16 x i16> %a1) {
-; AVX2-LABEL: test_x86_avx2_padds_w:
-; AVX2:       ## %bb.0:
-; AVX2-NEXT:    vpaddsw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    ret{{[l|q]}}
-;
-; AVX512VL-LABEL: test_x86_avx2_padds_w:
-; AVX512VL:       ## %bb.0:
-; AVX512VL-NEXT:    vpaddsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xed,0xc1]
-; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
-  %res = call <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
-  ret <16 x i16> %res
-}
-declare <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16>, <16 x i16>) nounwind readnone
-
-
-define <32 x i8> @test_x86_avx2_paddus_b(<32 x i8> %a0, <32 x i8> %a1) {
-; AVX2-LABEL: test_x86_avx2_paddus_b:
-; AVX2:       ## %bb.0:
-; AVX2-NEXT:    vpaddusb %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    ret{{[l|q]}}
-;
-; AVX512VL-LABEL: test_x86_avx2_paddus_b:
-; AVX512VL:       ## %bb.0:
-; AVX512VL-NEXT:    vpaddusb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdc,0xc1]
-; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
-  %res = call <32 x i8> @llvm.x86.avx2.paddus.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1]
-  ret <32 x i8> %res
-}
-declare <32 x i8> @llvm.x86.avx2.paddus.b(<32 x i8>, <32 x i8>) nounwind readnone
-
-
-define <16 x i16> @test_x86_avx2_paddus_w(<16 x i16> %a0, <16 x i16> %a1) {
-; AVX2-LABEL: test_x86_avx2_paddus_w:
-; AVX2:       ## %bb.0:
-; AVX2-NEXT:    vpaddusw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    ret{{[l|q]}}
-;
-; AVX512VL-LABEL: test_x86_avx2_paddus_w:
-; AVX512VL:       ## %bb.0:
-; AVX512VL-NEXT:    vpaddusw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdd,0xc1]
-; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
-  %res = call <16 x i16> @llvm.x86.avx2.paddus.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
-  ret <16 x i16> %res
-}
-declare <16 x i16> @llvm.x86.avx2.paddus.w(<16 x i16>, <16 x i16>) nounwind readnone
-
-
-define <32 x i8> @test_x86_avx2_psubs_b(<32 x i8> %a0, <32 x i8> %a1) {
-; AVX2-LABEL: test_x86_avx2_psubs_b:
-; AVX2:       ## %bb.0:
-; AVX2-NEXT:    vpsubsb %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    ret{{[l|q]}}
-;
-; AVX512VL-LABEL: test_x86_avx2_psubs_b:
-; AVX512VL:       ## %bb.0:
-; AVX512VL-NEXT:    vpsubsb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe8,0xc1]
-; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
-  %res = call <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1]
-  ret <32 x i8> %res
-}
-declare <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8>, <32 x i8>) nounwind readnone
-
-
-define <16 x i16> @test_x86_avx2_psubs_w(<16 x i16> %a0, <16 x i16> %a1) {
-; AVX2-LABEL: test_x86_avx2_psubs_w:
-; AVX2:       ## %bb.0:
-; AVX2-NEXT:    vpsubsw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    ret{{[l|q]}}
-;
-; AVX512VL-LABEL: test_x86_avx2_psubs_w:
-; AVX512VL:       ## %bb.0:
-; AVX512VL-NEXT:    vpsubsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe9,0xc1]
-; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
-  %res = call <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
-  ret <16 x i16> %res
-}
-declare <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16>, <16 x i16>) nounwind readnone
-
-
-define <32 x i8> @test_x86_avx2_psubus_b(<32 x i8> %a0, <32 x i8> %a1) {
-; AVX2-LABEL: test_x86_avx2_psubus_b:
-; AVX2:       ## %bb.0:
-; AVX2-NEXT:    vpsubusb %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    ret{{[l|q]}}
-;
-; AVX512VL-LABEL: test_x86_avx2_psubus_b:
-; AVX512VL:       ## %bb.0:
-; AVX512VL-NEXT:    vpsubusb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd8,0xc1]
-; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
-  %res = call <32 x i8> @llvm.x86.avx2.psubus.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1]
-  ret <32 x i8> %res
-}
-declare <32 x i8> @llvm.x86.avx2.psubus.b(<32 x i8>, <32 x i8>) nounwind readnone
-
-
-define <16 x i16> @test_x86_avx2_psubus_w(<16 x i16> %a0, <16 x i16> %a1) {
-; AVX2-LABEL: test_x86_avx2_psubus_w:
-; AVX2:       ## %bb.0:
-; AVX2-NEXT:    vpsubusw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    ret{{[l|q]}}
-;
-; AVX512VL-LABEL: test_x86_avx2_psubus_w:
-; AVX512VL:       ## %bb.0:
-; AVX512VL-NEXT:    vpsubusw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd9,0xc1]
-; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
-  %res = call <16 x i16> @llvm.x86.avx2.psubus.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
-  ret <16 x i16> %res
-}
-declare <16 x i16> @llvm.x86.avx2.psubus.w(<16 x i16>, <16 x i16>) nounwind readnone
-
 define <4 x i64> @test_x86_avx2_pmulu_dq(<8 x i32> %a0, <8 x i32> %a1) {
 ; X86-LABEL: test_x86_avx2_pmulu_dq:
 ; X86:       ## %bb.0:
diff --git a/test/CodeGen/X86/avx2-intrinsics-x86.ll b/test/CodeGen/X86/avx2-intrinsics-x86.ll
index 01ed7994b2e..4a7f70b9335 100644
--- a/test/CodeGen/X86/avx2-intrinsics-x86.ll
+++ b/test/CodeGen/X86/avx2-intrinsics-x86.ll
@@ -181,6 +181,110 @@ define <32 x i8> @test_x86_avx2_packuswb_fold() {
 }
 
 
+define <32 x i8> @test_x86_avx2_padds_b(<32 x i8> %a0, <32 x i8> %a1) {
+; X86-AVX-LABEL: test_x86_avx2_padds_b:
+; X86-AVX:       ## %bb.0:
+; X86-AVX-NEXT:    vpaddsb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xec,0xc1]
+; X86-AVX-NEXT:    retl ## encoding: [0xc3]
+;
+; X86-AVX512VL-LABEL: test_x86_avx2_padds_b:
+; X86-AVX512VL:       ## %bb.0:
+; X86-AVX512VL-NEXT:    vpaddsb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xec,0xc1]
+; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-AVX-LABEL: test_x86_avx2_padds_b:
+; X64-AVX:       ## %bb.0:
+; X64-AVX-NEXT:    vpaddsb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xec,0xc1]
+; X64-AVX-NEXT:    retq ## encoding: [0xc3]
+;
+; X64-AVX512VL-LABEL: test_x86_avx2_padds_b:
+; X64-AVX512VL:       ## %bb.0:
+; X64-AVX512VL-NEXT:    vpaddsb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xec,0xc1]
+; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+  %res = call <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1]
+  ret <32 x i8> %res
+}
+declare <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8>, <32 x i8>) nounwind readnone
+
+
+define <16 x i16> @test_x86_avx2_padds_w(<16 x i16> %a0, <16 x i16> %a1) {
+; X86-AVX-LABEL: test_x86_avx2_padds_w:
+; X86-AVX:       ## %bb.0:
+; X86-AVX-NEXT:    vpaddsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xed,0xc1]
+; X86-AVX-NEXT:    retl ## encoding: [0xc3]
+;
+; X86-AVX512VL-LABEL: test_x86_avx2_padds_w:
+; X86-AVX512VL:       ## %bb.0:
+; X86-AVX512VL-NEXT:    vpaddsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xed,0xc1]
+; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-AVX-LABEL: test_x86_avx2_padds_w:
+; X64-AVX:       ## %bb.0:
+; X64-AVX-NEXT:    vpaddsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xed,0xc1]
+; X64-AVX-NEXT:    retq ## encoding: [0xc3]
+;
+; X64-AVX512VL-LABEL: test_x86_avx2_padds_w:
+; X64-AVX512VL:       ## %bb.0:
+; X64-AVX512VL-NEXT:    vpaddsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xed,0xc1]
+; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+  %res = call <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
+  ret <16 x i16> %res
+}
+declare <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16>, <16 x i16>) nounwind readnone
+
+
+define <32 x i8> @test_x86_avx2_paddus_b(<32 x i8> %a0, <32 x i8> %a1) {
+; X86-AVX-LABEL: test_x86_avx2_paddus_b:
+; X86-AVX:       ## %bb.0:
+; X86-AVX-NEXT:    vpaddusb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xdc,0xc1]
+; X86-AVX-NEXT:    retl ## encoding: [0xc3]
+;
+; X86-AVX512VL-LABEL: test_x86_avx2_paddus_b:
+; X86-AVX512VL:       ## %bb.0:
+; X86-AVX512VL-NEXT:    vpaddusb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdc,0xc1]
+; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-AVX-LABEL: test_x86_avx2_paddus_b:
+; X64-AVX:       ## %bb.0:
+; X64-AVX-NEXT:    vpaddusb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xdc,0xc1]
+; X64-AVX-NEXT:    retq ## encoding: [0xc3]
+;
+; X64-AVX512VL-LABEL: test_x86_avx2_paddus_b:
+; X64-AVX512VL:       ## %bb.0:
+; X64-AVX512VL-NEXT:    vpaddusb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdc,0xc1]
+; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+  %res = call <32 x i8> @llvm.x86.avx2.paddus.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1]
+  ret <32 x i8> %res
+}
+declare <32 x i8> @llvm.x86.avx2.paddus.b(<32 x i8>, <32 x i8>) nounwind readnone
+
+
+define <16 x i16> @test_x86_avx2_paddus_w(<16 x i16> %a0, <16 x i16> %a1) {
+; X86-AVX-LABEL: test_x86_avx2_paddus_w:
+; X86-AVX:       ## %bb.0:
+; X86-AVX-NEXT:    vpaddusw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xdd,0xc1]
+; X86-AVX-NEXT:    retl ## encoding: [0xc3]
+;
+; X86-AVX512VL-LABEL: test_x86_avx2_paddus_w:
+; X86-AVX512VL:       ## %bb.0:
+; X86-AVX512VL-NEXT:    vpaddusw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdd,0xc1]
+; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-AVX-LABEL: test_x86_avx2_paddus_w:
+; X64-AVX:       ## %bb.0:
+; X64-AVX-NEXT:    vpaddusw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xdd,0xc1]
+; X64-AVX-NEXT:    retq ## encoding: [0xc3]
+;
+; X64-AVX512VL-LABEL: test_x86_avx2_paddus_w:
+; X64-AVX512VL:       ## %bb.0:
+; X64-AVX512VL-NEXT:    vpaddusw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdd,0xc1]
+; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+  %res = call <16 x i16> @llvm.x86.avx2.paddus.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
+  ret <16 x i16> %res
+}
+declare <16 x i16> @llvm.x86.avx2.paddus.w(<16 x i16>, <16 x i16>) nounwind readnone
+
+
 define <8 x i32> @test_x86_avx2_pmadd_wd(<16 x i16> %a0, <16 x i16> %a1) {
 ; X86-AVX-LABEL: test_x86_avx2_pmadd_wd:
 ; X86-AVX:       ## %bb.0:
@@ -823,6 +927,109 @@ define <16 x i16> @test_x86_avx2_psrli_w(<16 x i16> %a0) {
 declare <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16>, i32) nounwind readnone
 
 
+define <32 x i8> @test_x86_avx2_psubs_b(<32 x i8> %a0, <32 x i8> %a1) {
+; X86-AVX-LABEL: test_x86_avx2_psubs_b:
+; X86-AVX:       ## %bb.0:
+; X86-AVX-NEXT:    vpsubsb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xe8,0xc1]
+; X86-AVX-NEXT:    retl ## encoding: [0xc3]
+;
+; X86-AVX512VL-LABEL: test_x86_avx2_psubs_b:
+; X86-AVX512VL:       ## %bb.0:
+; X86-AVX512VL-NEXT:    vpsubsb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe8,0xc1]
+; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-AVX-LABEL: test_x86_avx2_psubs_b:
+; X64-AVX:       ## %bb.0:
+; X64-AVX-NEXT:    vpsubsb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xe8,0xc1]
+; X64-AVX-NEXT:    retq ## encoding: [0xc3]
+;
+; X64-AVX512VL-LABEL: test_x86_avx2_psubs_b:
+; X64-AVX512VL:       ## %bb.0:
+; X64-AVX512VL-NEXT:    vpsubsb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe8,0xc1]
+; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+  %res = call <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1]
+  ret <32 x i8> %res
+}
+declare <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8>, <32 x i8>) nounwind readnone
+
+
+define <16 x i16> @test_x86_avx2_psubs_w(<16 x i16> %a0, <16 x i16> %a1) {
+; X86-AVX-LABEL: test_x86_avx2_psubs_w:
+; X86-AVX:       ## %bb.0:
+; X86-AVX-NEXT:    vpsubsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xe9,0xc1]
+; X86-AVX-NEXT:    retl ## encoding: [0xc3]
+;
+; X86-AVX512VL-LABEL: test_x86_avx2_psubs_w:
+; X86-AVX512VL:       ## %bb.0:
+; X86-AVX512VL-NEXT:    vpsubsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe9,0xc1]
+; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-AVX-LABEL: test_x86_avx2_psubs_w:
+; X64-AVX:       ## %bb.0:
+; X64-AVX-NEXT:    vpsubsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xe9,0xc1]
+; X64-AVX-NEXT:    retq ## encoding: [0xc3]
+;
+; X64-AVX512VL-LABEL: test_x86_avx2_psubs_w:
+; X64-AVX512VL:       ## %bb.0:
+; X64-AVX512VL-NEXT:    vpsubsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe9,0xc1]
+; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+  %res = call <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
+  ret <16 x i16> %res
+}
+declare <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16>, <16 x i16>) nounwind readnone
+
+
+define <32 x i8> @test_x86_avx2_psubus_b(<32 x i8> %a0, <32 x i8> %a1) {
+; X86-AVX-LABEL: test_x86_avx2_psubus_b:
+; X86-AVX:       ## %bb.0:
+; X86-AVX-NEXT:    vpsubusb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xd8,0xc1]
+; X86-AVX-NEXT:    retl ## encoding: [0xc3]
+;
+; X86-AVX512VL-LABEL: test_x86_avx2_psubus_b:
+; X86-AVX512VL:       ## %bb.0:
+; X86-AVX512VL-NEXT:    vpsubusb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd8,0xc1]
+; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-AVX-LABEL: test_x86_avx2_psubus_b:
+; X64-AVX:       ## %bb.0:
+; X64-AVX-NEXT:    vpsubusb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xd8,0xc1]
+; X64-AVX-NEXT:    retq ## encoding: [0xc3]
+;
+; X64-AVX512VL-LABEL: test_x86_avx2_psubus_b:
+; X64-AVX512VL:       ## %bb.0:
+; X64-AVX512VL-NEXT:    vpsubusb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd8,0xc1]
+; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+  %res = call <32 x i8> @llvm.x86.avx2.psubus.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1]
+  ret <32 x i8> %res
+}
+declare <32 x i8> @llvm.x86.avx2.psubus.b(<32 x i8>, <32 x i8>) nounwind readnone
+
+
+define <16 x i16> @test_x86_avx2_psubus_w(<16 x i16> %a0, <16 x i16> %a1) {
+; X86-AVX-LABEL: test_x86_avx2_psubus_w:
+; X86-AVX:       ## %bb.0:
+; X86-AVX-NEXT:    vpsubusw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xd9,0xc1]
+; X86-AVX-NEXT:    retl ## encoding: [0xc3]
+;
+; X86-AVX512VL-LABEL: test_x86_avx2_psubus_w:
+; X86-AVX512VL:       ## %bb.0:
+; X86-AVX512VL-NEXT:    vpsubusw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd9,0xc1]
+; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-AVX-LABEL: test_x86_avx2_psubus_w:
+; X64-AVX:       ## %bb.0:
+; X64-AVX-NEXT:    vpsubusw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xd9,0xc1]
+; X64-AVX-NEXT:    retq ## encoding: [0xc3]
+;
+; X64-AVX512VL-LABEL: test_x86_avx2_psubus_w:
+; X64-AVX512VL:       ## %bb.0:
+; X64-AVX512VL-NEXT:    vpsubusw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd9,0xc1]
+; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+  %res = call <16 x i16> @llvm.x86.avx2.psubus.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
+  ret <16 x i16> %res
+}
+declare <16 x i16> @llvm.x86.avx2.psubus.w(<16 x i16>, <16 x i16>) nounwind readnone
+
 define <8 x i32> @test_x86_avx2_phadd_d(<8 x i32> %a0, <8 x i32> %a1) {
 ; X86-LABEL: test_x86_avx2_phadd_d:
 ; X86:       ## %bb.0:
@@ -1123,29 +1330,29 @@ define <16 x i16> @test_x86_avx2_packusdw_fold() {
 ; X86-AVX:       ## %bb.0:
 ; X86-AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [0,0,0,0,255,32767,65535,0,0,0,0,0,0,0,0,0]
 ; X86-AVX-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A]
-; X86-AVX-NEXT:    ## fixup A - offset: 4, value: LCPI46_0, kind: FK_Data_4
-; X86-AVX-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+; X86-AVX-NEXT:    ## fixup A - offset: 4, value: LCPI54_0, kind: FK_Data_4
+; X86-AVX-NEXT:    retl ## encoding: [0xc3]
 ;
 ; X86-AVX512VL-LABEL: test_x86_avx2_packusdw_fold:
 ; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vmovaps LCPI46_0, %ymm0 ## EVEX TO VEX Compression ymm0 = [0,0,0,0,255,32767,65535,0,0,0,0,0,0,0,0,0]
+; X86-AVX512VL-NEXT:    vmovaps LCPI54_0, %ymm0 ## EVEX TO VEX Compression ymm0 = [0,0,0,0,255,32767,65535,0,0,0,0,0,0,0,0,0]
 ; X86-AVX512VL-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A]
-; X86-AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI46_0, kind: FK_Data_4
-; X86-AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+; X86-AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI54_0, kind: FK_Data_4
+; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
 ;
 ; X64-AVX-LABEL: test_x86_avx2_packusdw_fold:
 ; X64-AVX:       ## %bb.0:
 ; X64-AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [0,0,0,0,255,32767,65535,0,0,0,0,0,0,0,0,0]
 ; X64-AVX-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A]
-; X64-AVX-NEXT:    ## fixup A - offset: 4, value: LCPI46_0-4, kind: reloc_riprel_4byte
-; X64-AVX-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+; X64-AVX-NEXT:    ## fixup A - offset: 4, value: LCPI54_0-4, kind: reloc_riprel_4byte
+; X64-AVX-NEXT:    retq ## encoding: [0xc3]
 ;
 ; X64-AVX512VL-LABEL: test_x86_avx2_packusdw_fold:
 ; X64-AVX512VL:       ## %bb.0:
 ; X64-AVX512VL-NEXT:    vmovaps {{.*}}(%rip), %ymm0 ## EVEX TO VEX Compression ymm0 = [0,0,0,0,255,32767,65535,0,0,0,0,0,0,0,0,0]
 ; X64-AVX512VL-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A]
-; X64-AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI46_0-4, kind: reloc_riprel_4byte
-; X64-AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+; X64-AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI54_0-4, kind: reloc_riprel_4byte
+; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> zeroinitializer, <8 x i32> )
   ret <16 x i16> %res
 }
@@ -1864,37 +2071,37 @@ define <4 x i32> @test_x86_avx2_psrav_d_const(<4 x i32> %a0, <4 x i32> %a1) {
 ; X86-AVX:       ## %bb.0:
 ; X86-AVX-NEXT:    vmovdqa {{.*#+}} xmm0 = [2,9,4294967284,23]
 ; X86-AVX-NEXT:    ## encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A]
-; X86-AVX-NEXT:    ## fixup A - offset: 4, value: LCPI78_0, kind: FK_Data_4
-; X86-AVX-NEXT:    vpsravd LCPI78_1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A]
-; X86-AVX-NEXT:    ## fixup A - offset: 5, value: LCPI78_1, kind: FK_Data_4
-; X86-AVX-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+; X86-AVX-NEXT:    ## fixup A - offset: 4, value: LCPI86_0, kind: FK_Data_4
+; X86-AVX-NEXT:    vpsravd LCPI86_1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A]
+; X86-AVX-NEXT:    ## fixup A - offset: 5, value: LCPI86_1, kind: FK_Data_4
+; X86-AVX-NEXT:    retl ## encoding: [0xc3]
 ;
 ; X86-AVX512VL-LABEL: test_x86_avx2_psrav_d_const:
 ; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vmovdqa LCPI78_0, %xmm0 ## EVEX TO VEX Compression xmm0 = [2,9,4294967284,23]
+; X86-AVX512VL-NEXT:    vmovdqa LCPI86_0, %xmm0 ## EVEX TO VEX Compression xmm0 = [2,9,4294967284,23]
 ; X86-AVX512VL-NEXT:    ## encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A]
-; X86-AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI78_0, kind: FK_Data_4
-; X86-AVX512VL-NEXT:    vpsravd LCPI78_1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A]
-; X86-AVX512VL-NEXT:    ## fixup A - offset: 5, value: LCPI78_1, kind: FK_Data_4
-; X86-AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+; X86-AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI86_0, kind: FK_Data_4
+; X86-AVX512VL-NEXT:    vpsravd LCPI86_1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A]
+; X86-AVX512VL-NEXT:    ## fixup A - offset: 5, value: LCPI86_1, kind: FK_Data_4
+; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
 ;
 ; X64-AVX-LABEL: test_x86_avx2_psrav_d_const:
 ; X64-AVX:       ## %bb.0:
 ; X64-AVX-NEXT:    vmovdqa {{.*#+}} xmm0 = [2,9,4294967284,23]
 ; X64-AVX-NEXT:    ## encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A]
-; X64-AVX-NEXT:    ## fixup A - offset: 4, value: LCPI78_0-4, kind: reloc_riprel_4byte
+; X64-AVX-NEXT:    ## fixup A - offset: 4, value: LCPI86_0-4, kind: reloc_riprel_4byte
 ; X64-AVX-NEXT:    vpsravd {{.*}}(%rip), %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A]
-; X64-AVX-NEXT:    ## fixup A - offset: 5, value: LCPI78_1-4, kind: reloc_riprel_4byte
-; X64-AVX-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+; X64-AVX-NEXT:    ## fixup A - offset: 5, value: LCPI86_1-4, kind: reloc_riprel_4byte
+; X64-AVX-NEXT:    retq ## encoding: [0xc3]
 ;
 ; X64-AVX512VL-LABEL: test_x86_avx2_psrav_d_const:
 ; X64-AVX512VL:       ## %bb.0:
 ; X64-AVX512VL-NEXT:    vmovdqa {{.*}}(%rip), %xmm0 ## EVEX TO VEX Compression xmm0 = [2,9,4294967284,23]
 ; X64-AVX512VL-NEXT:    ## encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A]
-; X64-AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI78_0-4, kind: reloc_riprel_4byte
+; X64-AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI86_0-4, kind: reloc_riprel_4byte
 ; X64-AVX512VL-NEXT:    vpsravd {{.*}}(%rip), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A]
-; X64-AVX512VL-NEXT:    ## fixup A - offset: 5, value: LCPI78_1-4, kind: reloc_riprel_4byte
-; X64-AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+; X64-AVX512VL-NEXT:    ## fixup A - offset: 5, value: LCPI86_1-4, kind: reloc_riprel_4byte
+; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
   %res = call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> , <4 x i32> )
   ret <4 x i32> %res
 }
@@ -1929,37 +2136,37 @@ define <8 x i32> @test_x86_avx2_psrav_d_256_const(<8 x i32> %a0, <8 x i32> %a1)
 ; X86-AVX:       ## %bb.0:
 ; X86-AVX-NEXT:    vmovdqa {{.*#+}} ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51]
 ; X86-AVX-NEXT:    ## encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A]
-; X86-AVX-NEXT:    ## fixup A - offset: 4, value: LCPI80_0, kind: FK_Data_4
-; X86-AVX-NEXT:    vpsravd LCPI80_1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A]
-; X86-AVX-NEXT:    ## fixup A - offset: 5, value: LCPI80_1, kind: FK_Data_4
-; X86-AVX-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+; X86-AVX-NEXT:    ## fixup A - offset: 4, value: LCPI88_0, kind: FK_Data_4
+; X86-AVX-NEXT:    vpsravd LCPI88_1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A]
+; X86-AVX-NEXT:    ## fixup A - offset: 5, value: LCPI88_1, kind: FK_Data_4
+; X86-AVX-NEXT:    retl ## encoding: [0xc3]
 ;
 ; X86-AVX512VL-LABEL: test_x86_avx2_psrav_d_256_const:
 ; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vmovdqa LCPI80_0, %ymm0 ## EVEX TO VEX Compression ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51]
+; X86-AVX512VL-NEXT:    vmovdqa LCPI88_0, %ymm0 ## EVEX TO VEX Compression ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51]
 ; X86-AVX512VL-NEXT:    ## encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A]
-; X86-AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI80_0, kind: FK_Data_4
-; X86-AVX512VL-NEXT:    vpsravd LCPI80_1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A]
-; X86-AVX512VL-NEXT:    ## fixup A - offset: 5, value: LCPI80_1, kind: FK_Data_4
-; X86-AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+; X86-AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI88_0, kind: FK_Data_4
+; X86-AVX512VL-NEXT:    vpsravd LCPI88_1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A]
+; X86-AVX512VL-NEXT:    ## fixup A - offset: 5, value: LCPI88_1, kind: FK_Data_4
+; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
 ;
 ; X64-AVX-LABEL: test_x86_avx2_psrav_d_256_const:
 ; X64-AVX:       ## %bb.0:
 ; X64-AVX-NEXT:    vmovdqa {{.*#+}} ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51]
 ; X64-AVX-NEXT:    ## encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A]
-; X64-AVX-NEXT:    ## fixup A - offset: 4, value: LCPI80_0-4, kind: reloc_riprel_4byte
+; X64-AVX-NEXT:    ## fixup A - offset: 4, value: LCPI88_0-4, kind: reloc_riprel_4byte
 ; X64-AVX-NEXT:    vpsravd {{.*}}(%rip), %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A]
-; X64-AVX-NEXT:    ## fixup A - offset: 5, value: LCPI80_1-4, kind: reloc_riprel_4byte
-; X64-AVX-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+; X64-AVX-NEXT:    ## fixup A - offset: 5, value: LCPI88_1-4, kind: reloc_riprel_4byte
+; X64-AVX-NEXT:    retq ## encoding: [0xc3]
 ;
 ; X64-AVX512VL-LABEL: test_x86_avx2_psrav_d_256_const:
 ; X64-AVX512VL:       ## %bb.0:
 ; X64-AVX512VL-NEXT:    vmovdqa {{.*}}(%rip), %ymm0 ## EVEX TO VEX Compression ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51]
 ; X64-AVX512VL-NEXT:    ## encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A]
-; X64-AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI80_0-4, kind: reloc_riprel_4byte
+; X64-AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI88_0-4, kind: reloc_riprel_4byte
 ; X64-AVX512VL-NEXT:    vpsravd {{.*}}(%rip), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A]
-; X64-AVX512VL-NEXT:    ## fixup A - offset: 5, value: LCPI80_1-4, kind: reloc_riprel_4byte
-; X64-AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+; X64-AVX512VL-NEXT:    ## fixup A - offset: 5, value: LCPI88_1-4, kind: reloc_riprel_4byte
+; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> , <8 x i32> )
   ret <8 x i32> %res
 }
diff --git a/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
index 2b0f67cfb07..1e65819f918 100644
--- a/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
+++ b/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
@@ -2710,422 +2710,6 @@ define <32 x i16>@test_int_x86_avx512_mask_pmulhr_sw_512(<32 x i16> %x0, <32 x i
   ret <32 x i16> %res2
 }
 
-define <32 x i16> @test_mask_adds_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) {
-; AVX512BW-LABEL: test_mask_adds_epi16_rr_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpaddsw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_mask_adds_epi16_rr_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    vpaddsw %zmm1, %zmm0, %zmm0
-; AVX512F-32-NEXT:    retl
retl - %res = call <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1) - ret <32 x i16> %res -} - -define <32 x i16> @test_mask_adds_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) { -; AVX512BW-LABEL: test_mask_adds_epi16_rrk_512: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: kmovd %edi, %k1 -; AVX512BW-NEXT: vpaddsw %zmm1, %zmm0, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_mask_adds_epi16_rrk_512: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 -; AVX512F-32-NEXT: vpaddsw %zmm1, %zmm0, %zmm2 {%k1} -; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-32-NEXT: retl - %res = call <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) - ret <32 x i16> %res -} - -define <32 x i16> @test_mask_adds_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) { -; AVX512BW-LABEL: test_mask_adds_epi16_rrkz_512: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: kmovd %edi, %k1 -; AVX512BW-NEXT: vpaddsw %zmm1, %zmm0, %zmm0 {%k1} {z} -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_mask_adds_epi16_rrkz_512: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 -; AVX512F-32-NEXT: vpaddsw %zmm1, %zmm0, %zmm0 {%k1} {z} -; AVX512F-32-NEXT: retl - %res = call <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask) - ret <32 x i16> %res -} - -define <32 x i16> @test_mask_adds_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) { -; AVX512BW-LABEL: test_mask_adds_epi16_rm_512: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: vpaddsw (%rdi), %zmm0, %zmm0 -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_mask_adds_epi16_rm_512: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: vpaddsw (%eax), %zmm0, %zmm0 -; AVX512F-32-NEXT: retl - %b = load <32 x i16>, <32 x i16>* %ptr_b - %res = call <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1) - ret <32 x i16> %res -} - -define <32 x i16> @test_mask_adds_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) { -; AVX512BW-LABEL: test_mask_adds_epi16_rmk_512: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: kmovd %esi, %k1 -; AVX512BW-NEXT: vpaddsw (%rdi), %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_mask_adds_epi16_rmk_512: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 -; AVX512F-32-NEXT: vpaddsw (%eax), %zmm0, %zmm1 {%k1} -; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512F-32-NEXT: retl - %b = load <32 x i16>, <32 x i16>* %ptr_b - %res = call <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) - ret <32 x i16> %res -} - -define <32 x i16> @test_mask_adds_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) { -; AVX512BW-LABEL: test_mask_adds_epi16_rmkz_512: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: kmovd %esi, %k1 -; AVX512BW-NEXT: vpaddsw (%rdi), %zmm0, %zmm0 {%k1} {z} -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_mask_adds_epi16_rmkz_512: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 -; AVX512F-32-NEXT: vpaddsw (%eax), %zmm0, %zmm0 {%k1} {z} -; AVX512F-32-NEXT: retl - %b = load <32 x i16>, 
<32 x i16>* %ptr_b - %res = call <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask) - ret <32 x i16> %res -} - -declare <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) - -define <32 x i16> @test_mask_subs_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) { -; AVX512BW-LABEL: test_mask_subs_epi16_rr_512: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: vpsubsw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_mask_subs_epi16_rr_512: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vpsubsw %zmm1, %zmm0, %zmm0 -; AVX512F-32-NEXT: retl - %res = call <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1) - ret <32 x i16> %res -} - -define <32 x i16> @test_mask_subs_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) { -; AVX512BW-LABEL: test_mask_subs_epi16_rrk_512: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: kmovd %edi, %k1 -; AVX512BW-NEXT: vpsubsw %zmm1, %zmm0, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_mask_subs_epi16_rrk_512: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 -; AVX512F-32-NEXT: vpsubsw %zmm1, %zmm0, %zmm2 {%k1} -; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-32-NEXT: retl - %res = call <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) - ret <32 x i16> %res -} - -define <32 x i16> @test_mask_subs_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) { -; AVX512BW-LABEL: test_mask_subs_epi16_rrkz_512: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: kmovd %edi, %k1 -; AVX512BW-NEXT: vpsubsw %zmm1, %zmm0, %zmm0 {%k1} {z} -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_mask_subs_epi16_rrkz_512: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 -; AVX512F-32-NEXT: vpsubsw %zmm1, %zmm0, %zmm0 {%k1} {z} -; AVX512F-32-NEXT: retl - %res = call <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask) - ret <32 x i16> %res -} - -define <32 x i16> @test_mask_subs_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) { -; AVX512BW-LABEL: test_mask_subs_epi16_rm_512: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: vpsubsw (%rdi), %zmm0, %zmm0 -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_mask_subs_epi16_rm_512: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: vpsubsw (%eax), %zmm0, %zmm0 -; AVX512F-32-NEXT: retl - %b = load <32 x i16>, <32 x i16>* %ptr_b - %res = call <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1) - ret <32 x i16> %res -} - -define <32 x i16> @test_mask_subs_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) { -; AVX512BW-LABEL: test_mask_subs_epi16_rmk_512: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: kmovd %esi, %k1 -; AVX512BW-NEXT: vpsubsw (%rdi), %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_mask_subs_epi16_rmk_512: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 -; AVX512F-32-NEXT: vpsubsw (%eax), %zmm0, %zmm1 {%k1} -; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512F-32-NEXT: retl - %b = load <32 x i16>, <32 x i16>* %ptr_b - %res = call <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16> %a, <32 x 
i16> %b, <32 x i16> %passThru, i32 %mask) - ret <32 x i16> %res -} - -define <32 x i16> @test_mask_subs_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) { -; AVX512BW-LABEL: test_mask_subs_epi16_rmkz_512: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: kmovd %esi, %k1 -; AVX512BW-NEXT: vpsubsw (%rdi), %zmm0, %zmm0 {%k1} {z} -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_mask_subs_epi16_rmkz_512: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 -; AVX512F-32-NEXT: vpsubsw (%eax), %zmm0, %zmm0 {%k1} {z} -; AVX512F-32-NEXT: retl - %b = load <32 x i16>, <32 x i16>* %ptr_b - %res = call <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask) - ret <32 x i16> %res -} - -declare <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) - -define <32 x i16> @test_mask_adds_epu16_rr_512(<32 x i16> %a, <32 x i16> %b) { -; AVX512BW-LABEL: test_mask_adds_epu16_rr_512: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: vpaddusw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_mask_adds_epu16_rr_512: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vpaddusw %zmm1, %zmm0, %zmm0 -; AVX512F-32-NEXT: retl - %res = call <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1) - ret <32 x i16> %res -} - -define <32 x i16> @test_mask_adds_epu16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) { -; AVX512BW-LABEL: test_mask_adds_epu16_rrk_512: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: kmovd %edi, %k1 -; AVX512BW-NEXT: vpaddusw %zmm1, %zmm0, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_mask_adds_epu16_rrk_512: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 -; AVX512F-32-NEXT: vpaddusw %zmm1, %zmm0, %zmm2 {%k1} -; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-32-NEXT: retl - %res = call <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) - ret <32 x i16> %res -} - -define <32 x i16> @test_mask_adds_epu16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) { -; AVX512BW-LABEL: test_mask_adds_epu16_rrkz_512: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: kmovd %edi, %k1 -; AVX512BW-NEXT: vpaddusw %zmm1, %zmm0, %zmm0 {%k1} {z} -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_mask_adds_epu16_rrkz_512: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 -; AVX512F-32-NEXT: vpaddusw %zmm1, %zmm0, %zmm0 {%k1} {z} -; AVX512F-32-NEXT: retl - %res = call <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask) - ret <32 x i16> %res -} - -define <32 x i16> @test_mask_adds_epu16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) { -; AVX512BW-LABEL: test_mask_adds_epu16_rm_512: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: vpaddusw (%rdi), %zmm0, %zmm0 -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_mask_adds_epu16_rm_512: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: vpaddusw (%eax), %zmm0, %zmm0 -; AVX512F-32-NEXT: retl - %b = load <32 x i16>, <32 x i16>* %ptr_b - %res = call <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1) - ret <32 x i16> %res -} - -define <32 x i16> @test_mask_adds_epu16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 
%mask) { -; AVX512BW-LABEL: test_mask_adds_epu16_rmk_512: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: kmovd %esi, %k1 -; AVX512BW-NEXT: vpaddusw (%rdi), %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_mask_adds_epu16_rmk_512: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 -; AVX512F-32-NEXT: vpaddusw (%eax), %zmm0, %zmm1 {%k1} -; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512F-32-NEXT: retl - %b = load <32 x i16>, <32 x i16>* %ptr_b - %res = call <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) - ret <32 x i16> %res -} - -define <32 x i16> @test_mask_adds_epu16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) { -; AVX512BW-LABEL: test_mask_adds_epu16_rmkz_512: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: kmovd %esi, %k1 -; AVX512BW-NEXT: vpaddusw (%rdi), %zmm0, %zmm0 {%k1} {z} -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_mask_adds_epu16_rmkz_512: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 -; AVX512F-32-NEXT: vpaddusw (%eax), %zmm0, %zmm0 {%k1} {z} -; AVX512F-32-NEXT: retl - %b = load <32 x i16>, <32 x i16>* %ptr_b - %res = call <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask) - ret <32 x i16> %res -} - -declare <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) - -define <32 x i16> @test_mask_subs_epu16_rr_512(<32 x i16> %a, <32 x i16> %b) { -; AVX512BW-LABEL: test_mask_subs_epu16_rr_512: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: vpsubusw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_mask_subs_epu16_rr_512: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vpsubusw %zmm1, %zmm0, %zmm0 -; AVX512F-32-NEXT: retl - %res = call <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1) - ret <32 x i16> %res -} - -define <32 x i16> @test_mask_subs_epu16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) { -; AVX512BW-LABEL: test_mask_subs_epu16_rrk_512: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: kmovd %edi, %k1 -; AVX512BW-NEXT: vpsubusw %zmm1, %zmm0, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_mask_subs_epu16_rrk_512: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 -; AVX512F-32-NEXT: vpsubusw %zmm1, %zmm0, %zmm2 {%k1} -; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-32-NEXT: retl - %res = call <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) - ret <32 x i16> %res -} - -define <32 x i16> @test_mask_subs_epu16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) { -; AVX512BW-LABEL: test_mask_subs_epu16_rrkz_512: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: kmovd %edi, %k1 -; AVX512BW-NEXT: vpsubusw %zmm1, %zmm0, %zmm0 {%k1} {z} -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_mask_subs_epu16_rrkz_512: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 -; AVX512F-32-NEXT: vpsubusw %zmm1, %zmm0, %zmm0 {%k1} {z} -; AVX512F-32-NEXT: retl - %res = call <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask) - ret <32 x i16> %res -} - -define <32 x i16> @test_mask_subs_epu16_rm_512(<32 x i16> %a, <32 x i16>* 
%ptr_b) { -; AVX512BW-LABEL: test_mask_subs_epu16_rm_512: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: vpsubusw (%rdi), %zmm0, %zmm0 -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_mask_subs_epu16_rm_512: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: vpsubusw (%eax), %zmm0, %zmm0 -; AVX512F-32-NEXT: retl - %b = load <32 x i16>, <32 x i16>* %ptr_b - %res = call <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1) - ret <32 x i16> %res -} - -define <32 x i16> @test_mask_subs_epu16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) { -; AVX512BW-LABEL: test_mask_subs_epu16_rmk_512: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: kmovd %esi, %k1 -; AVX512BW-NEXT: vpsubusw (%rdi), %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_mask_subs_epu16_rmk_512: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 -; AVX512F-32-NEXT: vpsubusw (%eax), %zmm0, %zmm1 {%k1} -; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512F-32-NEXT: retl - %b = load <32 x i16>, <32 x i16>* %ptr_b - %res = call <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) - ret <32 x i16> %res -} - -define <32 x i16> @test_mask_subs_epu16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) { -; AVX512BW-LABEL: test_mask_subs_epu16_rmkz_512: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: kmovd %esi, %k1 -; AVX512BW-NEXT: vpsubusw (%rdi), %zmm0, %zmm0 {%k1} {z} -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_mask_subs_epu16_rmkz_512: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 -; AVX512F-32-NEXT: vpsubusw (%eax), %zmm0, %zmm0 {%k1} {z} -; AVX512F-32-NEXT: retl - %b = load <32 x i16>, <32 x i16>* %ptr_b - %res = call <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask) - ret <32 x i16> %res -} - -declare <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) - declare <32 x i16> @llvm.x86.avx512.mask.pmaddubs.w.512(<64 x i8>, <64 x i8>, <32 x i16>, i32) define <32 x i16>@test_int_x86_avx512_mask_pmaddubs_w_512(<64 x i8> %x0, <64 x i8> %x1, <32 x i16> %x2, i32 %x3) { diff --git a/test/CodeGen/X86/avx512bw-intrinsics.ll b/test/CodeGen/X86/avx512bw-intrinsics.ll index 5b745f9a872..a2ada649610 100644 --- a/test/CodeGen/X86/avx512bw-intrinsics.ll +++ b/test/CodeGen/X86/avx512bw-intrinsics.ll @@ -579,6 +579,422 @@ define <64 x i8> @test_mask_packus_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %pt declare <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16>, <32 x i16>) +define <32 x i16> @test_mask_adds_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) { +; AVX512BW-LABEL: test_mask_adds_epi16_rr_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: vpaddsw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_adds_epi16_rr_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: vpaddsw %zmm1, %zmm0, %zmm0 +; AVX512F-32-NEXT: retl + %res = call <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1) + ret <32 x i16> %res +} + +define <32 x i16> @test_mask_adds_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) { +; AVX512BW-LABEL: test_mask_adds_epi16_rrk_512: +; AVX512BW: ## %bb.0: +; 
AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vpaddsw %zmm1, %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_adds_epi16_rrk_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpaddsw %zmm1, %zmm0, %zmm2 {%k1} +; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-32-NEXT: retl + %res = call <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) + ret <32 x i16> %res +} + +define <32 x i16> @test_mask_adds_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) { +; AVX512BW-LABEL: test_mask_adds_epi16_rrkz_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vpaddsw %zmm1, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_adds_epi16_rrkz_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpaddsw %zmm1, %zmm0, %zmm0 {%k1} {z} +; AVX512F-32-NEXT: retl + %res = call <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask) + ret <32 x i16> %res +} + +define <32 x i16> @test_mask_adds_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) { +; AVX512BW-LABEL: test_mask_adds_epi16_rm_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: vpaddsw (%rdi), %zmm0, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_adds_epi16_rm_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: vpaddsw (%eax), %zmm0, %zmm0 +; AVX512F-32-NEXT: retl + %b = load <32 x i16>, <32 x i16>* %ptr_b + %res = call <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1) + ret <32 x i16> %res +} + +define <32 x i16> @test_mask_adds_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) { +; AVX512BW-LABEL: test_mask_adds_epi16_rmk_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: kmovd %esi, %k1 +; AVX512BW-NEXT: vpaddsw (%rdi), %zmm0, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_adds_epi16_rmk_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpaddsw (%eax), %zmm0, %zmm1 {%k1} +; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512F-32-NEXT: retl + %b = load <32 x i16>, <32 x i16>* %ptr_b + %res = call <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) + ret <32 x i16> %res +} + +define <32 x i16> @test_mask_adds_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) { +; AVX512BW-LABEL: test_mask_adds_epi16_rmkz_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: kmovd %esi, %k1 +; AVX512BW-NEXT: vpaddsw (%rdi), %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_adds_epi16_rmkz_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpaddsw (%eax), %zmm0, %zmm0 {%k1} {z} +; AVX512F-32-NEXT: retl + %b = load <32 x i16>, <32 x i16>* %ptr_b + %res = call <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask) + ret <32 x i16> %res +} + +declare <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) + +define <32 x i16> @test_mask_subs_epi16_rr_512(<32 x i16> %a, <32 x i16> 
%b) { +; AVX512BW-LABEL: test_mask_subs_epi16_rr_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: vpsubsw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_subs_epi16_rr_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: vpsubsw %zmm1, %zmm0, %zmm0 +; AVX512F-32-NEXT: retl + %res = call <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1) + ret <32 x i16> %res +} + +define <32 x i16> @test_mask_subs_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) { +; AVX512BW-LABEL: test_mask_subs_epi16_rrk_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vpsubsw %zmm1, %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_subs_epi16_rrk_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpsubsw %zmm1, %zmm0, %zmm2 {%k1} +; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-32-NEXT: retl + %res = call <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) + ret <32 x i16> %res +} + +define <32 x i16> @test_mask_subs_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) { +; AVX512BW-LABEL: test_mask_subs_epi16_rrkz_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vpsubsw %zmm1, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_subs_epi16_rrkz_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpsubsw %zmm1, %zmm0, %zmm0 {%k1} {z} +; AVX512F-32-NEXT: retl + %res = call <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask) + ret <32 x i16> %res +} + +define <32 x i16> @test_mask_subs_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) { +; AVX512BW-LABEL: test_mask_subs_epi16_rm_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: vpsubsw (%rdi), %zmm0, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_subs_epi16_rm_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: vpsubsw (%eax), %zmm0, %zmm0 +; AVX512F-32-NEXT: retl + %b = load <32 x i16>, <32 x i16>* %ptr_b + %res = call <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1) + ret <32 x i16> %res +} + +define <32 x i16> @test_mask_subs_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) { +; AVX512BW-LABEL: test_mask_subs_epi16_rmk_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: kmovd %esi, %k1 +; AVX512BW-NEXT: vpsubsw (%rdi), %zmm0, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_subs_epi16_rmk_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpsubsw (%eax), %zmm0, %zmm1 {%k1} +; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512F-32-NEXT: retl + %b = load <32 x i16>, <32 x i16>* %ptr_b + %res = call <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) + ret <32 x i16> %res +} + +define <32 x i16> @test_mask_subs_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) { +; AVX512BW-LABEL: test_mask_subs_epi16_rmkz_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: kmovd %esi, %k1 +; AVX512BW-NEXT: vpsubsw (%rdi), %zmm0, %zmm0 {%k1} {z} +; 
AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_subs_epi16_rmkz_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpsubsw (%eax), %zmm0, %zmm0 {%k1} {z} +; AVX512F-32-NEXT: retl + %b = load <32 x i16>, <32 x i16>* %ptr_b + %res = call <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask) + ret <32 x i16> %res +} + +declare <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) + +define <32 x i16> @test_mask_adds_epu16_rr_512(<32 x i16> %a, <32 x i16> %b) { +; AVX512BW-LABEL: test_mask_adds_epu16_rr_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: vpaddusw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_adds_epu16_rr_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: vpaddusw %zmm1, %zmm0, %zmm0 +; AVX512F-32-NEXT: retl + %res = call <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1) + ret <32 x i16> %res +} + +define <32 x i16> @test_mask_adds_epu16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) { +; AVX512BW-LABEL: test_mask_adds_epu16_rrk_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vpaddusw %zmm1, %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_adds_epu16_rrk_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpaddusw %zmm1, %zmm0, %zmm2 {%k1} +; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-32-NEXT: retl + %res = call <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) + ret <32 x i16> %res +} + +define <32 x i16> @test_mask_adds_epu16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) { +; AVX512BW-LABEL: test_mask_adds_epu16_rrkz_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vpaddusw %zmm1, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_adds_epu16_rrkz_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpaddusw %zmm1, %zmm0, %zmm0 {%k1} {z} +; AVX512F-32-NEXT: retl + %res = call <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask) + ret <32 x i16> %res +} + +define <32 x i16> @test_mask_adds_epu16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) { +; AVX512BW-LABEL: test_mask_adds_epu16_rm_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: vpaddusw (%rdi), %zmm0, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_adds_epu16_rm_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: vpaddusw (%eax), %zmm0, %zmm0 +; AVX512F-32-NEXT: retl + %b = load <32 x i16>, <32 x i16>* %ptr_b + %res = call <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1) + ret <32 x i16> %res +} + +define <32 x i16> @test_mask_adds_epu16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) { +; AVX512BW-LABEL: test_mask_adds_epu16_rmk_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: kmovd %esi, %k1 +; AVX512BW-NEXT: vpaddusw (%rdi), %zmm0, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_adds_epu16_rmk_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: movl 
{{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpaddusw (%eax), %zmm0, %zmm1 {%k1} +; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512F-32-NEXT: retl + %b = load <32 x i16>, <32 x i16>* %ptr_b + %res = call <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) + ret <32 x i16> %res +} + +define <32 x i16> @test_mask_adds_epu16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) { +; AVX512BW-LABEL: test_mask_adds_epu16_rmkz_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: kmovd %esi, %k1 +; AVX512BW-NEXT: vpaddusw (%rdi), %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_adds_epu16_rmkz_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpaddusw (%eax), %zmm0, %zmm0 {%k1} {z} +; AVX512F-32-NEXT: retl + %b = load <32 x i16>, <32 x i16>* %ptr_b + %res = call <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask) + ret <32 x i16> %res +} + +declare <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) + +define <32 x i16> @test_mask_subs_epu16_rr_512(<32 x i16> %a, <32 x i16> %b) { +; AVX512BW-LABEL: test_mask_subs_epu16_rr_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: vpsubusw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_subs_epu16_rr_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: vpsubusw %zmm1, %zmm0, %zmm0 +; AVX512F-32-NEXT: retl + %res = call <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1) + ret <32 x i16> %res +} + +define <32 x i16> @test_mask_subs_epu16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) { +; AVX512BW-LABEL: test_mask_subs_epu16_rrk_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vpsubusw %zmm1, %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_subs_epu16_rrk_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpsubusw %zmm1, %zmm0, %zmm2 {%k1} +; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-32-NEXT: retl + %res = call <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) + ret <32 x i16> %res +} + +define <32 x i16> @test_mask_subs_epu16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) { +; AVX512BW-LABEL: test_mask_subs_epu16_rrkz_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vpsubusw %zmm1, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_subs_epu16_rrkz_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpsubusw %zmm1, %zmm0, %zmm0 {%k1} {z} +; AVX512F-32-NEXT: retl + %res = call <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask) + ret <32 x i16> %res +} + +define <32 x i16> @test_mask_subs_epu16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) { +; AVX512BW-LABEL: test_mask_subs_epu16_rm_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: vpsubusw (%rdi), %zmm0, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_subs_epu16_rm_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: vpsubusw (%eax), %zmm0, %zmm0 +; 
AVX512F-32-NEXT: retl + %b = load <32 x i16>, <32 x i16>* %ptr_b + %res = call <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1) + ret <32 x i16> %res +} + +define <32 x i16> @test_mask_subs_epu16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) { +; AVX512BW-LABEL: test_mask_subs_epu16_rmk_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: kmovd %esi, %k1 +; AVX512BW-NEXT: vpsubusw (%rdi), %zmm0, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_subs_epu16_rmk_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpsubusw (%eax), %zmm0, %zmm1 {%k1} +; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512F-32-NEXT: retl + %b = load <32 x i16>, <32 x i16>* %ptr_b + %res = call <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) + ret <32 x i16> %res +} + +define <32 x i16> @test_mask_subs_epu16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) { +; AVX512BW-LABEL: test_mask_subs_epu16_rmkz_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: kmovd %esi, %k1 +; AVX512BW-NEXT: vpsubusw (%rdi), %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_subs_epu16_rmkz_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpsubusw (%eax), %zmm0, %zmm0 {%k1} {z} +; AVX512F-32-NEXT: retl + %b = load <32 x i16>, <32 x i16>* %ptr_b + %res = call <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask) + ret <32 x i16> %res +} + +declare <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) + declare <32 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) define <32 x i16>@test_int_x86_avx512_mask_vpermt2var_hi_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) { diff --git a/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll index 01a5722ee07..1028e815635 100644 --- a/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll +++ b/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll @@ -3947,1046 +3947,6 @@ define <16 x i16>@test_int_x86_avx512_mask_pmulhr_sw_256(<16 x i16> %x0, <16 x i ret <16 x i16> %res2 } -define <8 x i16> @test_mask_adds_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_mask_adds_epi16_rr_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xed,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <8 x i16> @llvm.x86.avx512.mask.padds.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1) - ret <8 x i16> %res -} - -define <8 x i16> @test_mask_adds_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) { -; CHECK-LABEL: test_mask_adds_epi16_rrk_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] -; CHECK-NEXT: vpaddsw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xed,0xd1] -; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <8 x i16> @llvm.x86.avx512.mask.padds.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) - ret <8 x i16> %res -} - 
-define <8 x i16> @test_mask_adds_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) {
-; CHECK-LABEL: test_mask_adds_epi16_rrkz_128:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
-; CHECK-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xed,0xc1]
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %res = call <8 x i16> @llvm.x86.avx512.mask.padds.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
- ret <8 x i16> %res
-}
-
-define <8 x i16> @test_mask_adds_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) {
-; CHECK-LABEL: test_mask_adds_epi16_rm_128:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: vpaddsw (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xed,0x07]
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %b = load <8 x i16>, <8 x i16>* %ptr_b
- %res = call <8 x i16> @llvm.x86.avx512.mask.padds.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1)
- ret <8 x i16> %res
-}
-
-define <8 x i16> @test_mask_adds_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) {
-; CHECK-LABEL: test_mask_adds_epi16_rmk_128:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
-; CHECK-NEXT: vpaddsw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xed,0x0f]
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %b = load <8 x i16>, <8 x i16>* %ptr_b
- %res = call <8 x i16> @llvm.x86.avx512.mask.padds.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask)
- ret <8 x i16> %res
-}
-
-define <8 x i16> @test_mask_adds_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) {
-; CHECK-LABEL: test_mask_adds_epi16_rmkz_128:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
-; CHECK-NEXT: vpaddsw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xed,0x07]
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %b = load <8 x i16>, <8 x i16>* %ptr_b
- %res = call <8 x i16> @llvm.x86.avx512.mask.padds.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
- ret <8 x i16> %res
-}
-
-declare <8 x i16> @llvm.x86.avx512.mask.padds.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
-
-define <16 x i16> @test_mask_adds_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) {
-; CHECK-LABEL: test_mask_adds_epi16_rr_256:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: vpaddsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xed,0xc1]
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %res = call <16 x i16> @llvm.x86.avx512.mask.padds.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
- ret <16 x i16> %res
-}
-
-define <16 x i16> @test_mask_adds_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) {
-; CHECK-LABEL: test_mask_adds_epi16_rrk_256:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
-; CHECK-NEXT: vpaddsw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xed,0xd1]
-; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %res = call <16 x i16> @llvm.x86.avx512.mask.padds.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask)
- ret <16 x i16> %res
-}
-
-define <16 x i16> @test_mask_adds_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) {
-; CHECK-LABEL: test_mask_adds_epi16_rrkz_256:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
-; CHECK-NEXT: vpaddsw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xed,0xc1]
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %res = call <16 x i16> @llvm.x86.avx512.mask.padds.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
- ret <16 x i16> %res
-}
-
-define <16 x i16> @test_mask_adds_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) {
-; CHECK-LABEL: test_mask_adds_epi16_rm_256:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: vpaddsw (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xed,0x07]
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %b = load <16 x i16>, <16 x i16>* %ptr_b
- %res = call <16 x i16> @llvm.x86.avx512.mask.padds.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
- ret <16 x i16> %res
-}
-
-define <16 x i16> @test_mask_adds_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) {
-; CHECK-LABEL: test_mask_adds_epi16_rmk_256:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
-; CHECK-NEXT: vpaddsw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xed,0x0f]
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %b = load <16 x i16>, <16 x i16>* %ptr_b
- %res = call <16 x i16> @llvm.x86.avx512.mask.padds.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask)
- ret <16 x i16> %res
-}
-
-define <16 x i16> @test_mask_adds_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) {
-; CHECK-LABEL: test_mask_adds_epi16_rmkz_256:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
-; CHECK-NEXT: vpaddsw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xed,0x07]
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %b = load <16 x i16>, <16 x i16>* %ptr_b
- %res = call <16 x i16> @llvm.x86.avx512.mask.padds.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
- ret <16 x i16> %res
-}
-
-declare <16 x i16> @llvm.x86.avx512.mask.padds.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
-
-define <8 x i16> @test_mask_subs_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-LABEL: test_mask_subs_epi16_rr_128:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: vpsubsw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe9,0xc1]
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %res = call <8 x i16> @llvm.x86.avx512.mask.psubs.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1)
- ret <8 x i16> %res
-}
-
-define <8 x i16> @test_mask_subs_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) {
-; CHECK-LABEL: test_mask_subs_epi16_rrk_128:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
-; CHECK-NEXT: vpsubsw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xe9,0xd1]
-; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %res = call <8 x i16> @llvm.x86.avx512.mask.psubs.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask)
- ret <8 x i16> %res
-}
-
-define <8 x i16> @test_mask_subs_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) {
-; CHECK-LABEL: test_mask_subs_epi16_rrkz_128:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
-; CHECK-NEXT: vpsubsw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xe9,0xc1]
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %res = call <8 x i16> @llvm.x86.avx512.mask.psubs.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
- ret <8 x i16> %res
-}
-
-define <8 x i16> @test_mask_subs_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) {
-; CHECK-LABEL: test_mask_subs_epi16_rm_128:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: vpsubsw (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe9,0x07]
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %b = load <8 x i16>, <8 x i16>* %ptr_b
- %res = call <8 x i16> @llvm.x86.avx512.mask.psubs.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1)
- ret <8 x i16> %res
-}
-
-define <8 x i16> @test_mask_subs_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) {
-; CHECK-LABEL: test_mask_subs_epi16_rmk_128:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
-; CHECK-NEXT: vpsubsw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xe9,0x0f]
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %b = load <8 x i16>, <8 x i16>* %ptr_b
- %res = call <8 x i16> @llvm.x86.avx512.mask.psubs.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask)
- ret <8 x i16> %res
-}
-
-define <8 x i16> @test_mask_subs_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) {
-; CHECK-LABEL: test_mask_subs_epi16_rmkz_128:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
-; CHECK-NEXT: vpsubsw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xe9,0x07]
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %b = load <8 x i16>, <8 x i16>* %ptr_b
- %res = call <8 x i16> @llvm.x86.avx512.mask.psubs.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
- ret <8 x i16> %res
-}
-
-declare <8 x i16> @llvm.x86.avx512.mask.psubs.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
-
-define <16 x i16> @test_mask_subs_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) {
-; CHECK-LABEL: test_mask_subs_epi16_rr_256:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: vpsubsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe9,0xc1]
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %res = call <16 x i16> @llvm.x86.avx512.mask.psubs.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
- ret <16 x i16> %res
-}
-
-define <16 x i16> @test_mask_subs_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) {
-; CHECK-LABEL: test_mask_subs_epi16_rrk_256:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
-; CHECK-NEXT: vpsubsw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xe9,0xd1]
-; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %res = call <16 x i16> @llvm.x86.avx512.mask.psubs.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask)
- ret <16 x i16> %res
-}
-
-define <16 x i16> @test_mask_subs_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) {
-; CHECK-LABEL: test_mask_subs_epi16_rrkz_256:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
-; CHECK-NEXT: vpsubsw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xe9,0xc1]
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %res = call <16 x i16> @llvm.x86.avx512.mask.psubs.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
- ret <16 x i16> %res
-}
-
-define <16 x i16> @test_mask_subs_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) {
-; CHECK-LABEL: test_mask_subs_epi16_rm_256:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: vpsubsw (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe9,0x07]
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %b = load <16 x i16>, <16 x i16>* %ptr_b
- %res = call <16 x i16> @llvm.x86.avx512.mask.psubs.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
- ret <16 x i16> %res
-}
-
-define <16 x i16> @test_mask_subs_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) {
-; CHECK-LABEL: test_mask_subs_epi16_rmk_256:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
-; CHECK-NEXT: vpsubsw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xe9,0x0f]
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %b = load <16 x i16>, <16 x i16>* %ptr_b
- %res = call <16 x i16> @llvm.x86.avx512.mask.psubs.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask)
- ret <16 x i16> %res
-}
-
-define <16 x i16> @test_mask_subs_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) {
-; CHECK-LABEL: test_mask_subs_epi16_rmkz_256:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
-; CHECK-NEXT: vpsubsw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xe9,0x07]
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %b = load <16 x i16>, <16 x i16>* %ptr_b
- %res = call <16 x i16> @llvm.x86.avx512.mask.psubs.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
- ret <16 x i16> %res
-}
-
-declare <16 x i16> @llvm.x86.avx512.mask.psubs.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
-
-define <8 x i16> @test_mask_adds_epu16_rr_128(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-LABEL: test_mask_adds_epu16_rr_128:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdd,0xc1]
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %res = call <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1)
- ret <8 x i16> %res
-}
-
-define <8 x i16> @test_mask_adds_epu16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) {
-; CHECK-LABEL: test_mask_adds_epu16_rrk_128:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
-; CHECK-NEXT: vpaddusw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xdd,0xd1]
-; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %res = call <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask)
- ret <8 x i16> %res
-}
-
-define <8 x i16> @test_mask_adds_epu16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) {
-; CHECK-LABEL: test_mask_adds_epu16_rrkz_128:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
-; CHECK-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xdd,0xc1]
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %res = call <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
- ret <8 x i16> %res
-}
-
-define <8 x i16> @test_mask_adds_epu16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) {
-; CHECK-LABEL: test_mask_adds_epu16_rm_128:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: vpaddusw (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdd,0x07]
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %b = load <8 x i16>, <8 x i16>* %ptr_b
- %res = call <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1)
- ret <8 x i16> %res
-}
-
-define <8 x i16> @test_mask_adds_epu16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) {
-; CHECK-LABEL: test_mask_adds_epu16_rmk_128:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
-; CHECK-NEXT: vpaddusw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xdd,0x0f]
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %b = load <8 x i16>, <8 x i16>* %ptr_b
- %res = call <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask)
- ret <8 x i16> %res
-}
-
-define <8 x i16> @test_mask_adds_epu16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) {
-; CHECK-LABEL: test_mask_adds_epu16_rmkz_128:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
-; CHECK-NEXT: vpaddusw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xdd,0x07]
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %b = load <8 x i16>, <8 x i16>* %ptr_b
- %res = call <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
- ret <8 x i16> %res
-}
-
-declare <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
-
-define <16 x i16> @test_mask_adds_epu16_rr_256(<16 x i16> %a, <16 x i16> %b) {
-; CHECK-LABEL: test_mask_adds_epu16_rr_256:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: vpaddusw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdd,0xc1]
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %res = call <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
- ret <16 x i16> %res
-}
-
-define <16 x i16> @test_mask_adds_epu16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) {
-; CHECK-LABEL: test_mask_adds_epu16_rrk_256:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
-; CHECK-NEXT: vpaddusw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xdd,0xd1]
-; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %res = call <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask)
- ret <16 x i16> %res
-}
-
-define <16 x i16> @test_mask_adds_epu16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) {
-; CHECK-LABEL: test_mask_adds_epu16_rrkz_256:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
-; CHECK-NEXT: vpaddusw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xdd,0xc1]
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %res = call <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
- ret <16 x i16> %res
-}
-
-define <16 x i16> @test_mask_adds_epu16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) {
-; CHECK-LABEL: test_mask_adds_epu16_rm_256:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: vpaddusw (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdd,0x07]
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %b = load <16 x i16>, <16 x i16>* %ptr_b
- %res = call <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
- ret <16 x i16> %res
-}
-
-define <16 x i16> @test_mask_adds_epu16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) {
-; CHECK-LABEL: test_mask_adds_epu16_rmk_256:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
-; CHECK-NEXT: vpaddusw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xdd,0x0f]
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %b = load <16 x i16>, <16 x i16>* %ptr_b
- %res = call <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask)
- ret <16 x i16> %res
-}
-
-define <16 x i16> @test_mask_adds_epu16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) {
-; CHECK-LABEL: test_mask_adds_epu16_rmkz_256:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
-; CHECK-NEXT: vpaddusw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xdd,0x07]
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %b = load <16 x i16>, <16 x i16>* %ptr_b
- %res = call <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
- ret <16 x i16> %res
-}
-
-declare <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
-
-define <8 x i16> @test_mask_subs_epu16_rr_128(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-LABEL: test_mask_subs_epu16_rr_128:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd9,0xc1]
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %res = call <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1)
- ret <8 x i16> %res
-}
-
-define <8 x i16> @test_mask_subs_epu16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) {
-; CHECK-LABEL: test_mask_subs_epu16_rrk_128:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
-; CHECK-NEXT: vpsubusw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd9,0xd1]
-; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %res = call <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask)
- ret <8 x i16> %res
-}
-
-define <8 x i16> @test_mask_subs_epu16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) {
-; CHECK-LABEL: test_mask_subs_epu16_rrkz_128:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
-; CHECK-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd9,0xc1]
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %res = call <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
- ret <8 x i16> %res
-}
-
-define <8 x i16> @test_mask_subs_epu16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) {
-; CHECK-LABEL: test_mask_subs_epu16_rm_128:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: vpsubusw (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd9,0x07]
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %b = load <8 x i16>, <8 x i16>* %ptr_b
- %res = call <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1)
- ret <8 x i16> %res
-}
-
-define <8 x i16> @test_mask_subs_epu16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) {
-; CHECK-LABEL: test_mask_subs_epu16_rmk_128:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
-; CHECK-NEXT: vpsubusw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd9,0x0f]
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %b = load <8 x i16>, <8 x i16>* %ptr_b
- %res = call <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask)
- ret <8 x i16> %res
-}
-
-define <8 x i16> @test_mask_subs_epu16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) {
-; CHECK-LABEL: test_mask_subs_epu16_rmkz_128:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
-; CHECK-NEXT: vpsubusw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd9,0x07]
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %b = load <8 x i16>, <8 x i16>* %ptr_b
- %res = call <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
- ret <8 x i16> %res
-}
-
-declare <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
-
-define <16 x i16> @test_mask_subs_epu16_rr_256(<16 x i16> %a, <16 x i16> %b) {
-; CHECK-LABEL: test_mask_subs_epu16_rr_256:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: vpsubusw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd9,0xc1]
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %res = call <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
- ret <16 x i16> %res
-}
-
-define <16 x i16> @test_mask_subs_epu16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) {
-; CHECK-LABEL: test_mask_subs_epu16_rrk_256:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
-; CHECK-NEXT: vpsubusw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd9,0xd1]
-; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %res = call <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask)
- ret <16 x i16> %res
-}
-
-define <16 x i16> @test_mask_subs_epu16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) {
-; CHECK-LABEL: test_mask_subs_epu16_rrkz_256:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
-; CHECK-NEXT: vpsubusw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd9,0xc1]
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %res = call <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
- ret <16 x i16> %res
-}
-
-define <16 x i16> @test_mask_subs_epu16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) {
-; CHECK-LABEL: test_mask_subs_epu16_rm_256:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: vpsubusw (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd9,0x07]
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %b = load <16 x i16>, <16 x i16>* %ptr_b
- %res = call <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
- ret <16 x i16> %res
-}
-
-define <16 x i16> @test_mask_subs_epu16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) {
-; CHECK-LABEL: test_mask_subs_epu16_rmk_256:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
-; CHECK-NEXT: vpsubusw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd9,0x0f]
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %b = load <16 x i16>, <16 x i16>* %ptr_b
- %res = call <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask)
- ret <16 x i16> %res
-}
-
-define <16 x i16> @test_mask_subs_epu16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) {
-; CHECK-LABEL: test_mask_subs_epu16_rmkz_256:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
-; CHECK-NEXT: vpsubusw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd9,0x07]
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %b = load <16 x i16>, <16 x i16>* %ptr_b
- %res = call <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
- ret <16 x i16> %res
-}
-
-declare <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
-
-define <16 x i8> @test_mask_adds_epi8_rr_128(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-LABEL: test_mask_adds_epi8_rr_128:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xec,0xc1]
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %res = call <16 x i8> @llvm.x86.avx512.mask.padds.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1)
- ret <16 x i8> %res
-}
-
-define <16 x i8> @test_mask_adds_epi8_rrk_128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) {
-; CHECK-LABEL: test_mask_adds_epi8_rrk_128:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
-; CHECK-NEXT: vpaddsb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xec,0xd1]
-; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %res = call <16 x i8> @llvm.x86.avx512.mask.padds.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask)
- ret <16 x i8> %res
-}
-
-define <16 x i8> @test_mask_adds_epi8_rrkz_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) {
-; CHECK-LABEL: test_mask_adds_epi8_rrkz_128:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
-; CHECK-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xec,0xc1]
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %res = call <16 x i8> @llvm.x86.avx512.mask.padds.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask)
- ret <16 x i8> %res
-}
-
-define <16 x i8> @test_mask_adds_epi8_rm_128(<16 x i8> %a, <16 x i8>* %ptr_b) {
-; CHECK-LABEL: test_mask_adds_epi8_rm_128:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: vpaddsb (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xec,0x07]
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %b = load <16 x i8>, <16 x i8>* %ptr_b
- %res = call <16 x i8> @llvm.x86.avx512.mask.padds.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1)
- ret <16 x i8> %res
-}
-
-define <16 x i8> @test_mask_adds_epi8_rmk_128(<16 x i8> %a, <16 x i8>* %ptr_b, <16 x i8> %passThru, i16 %mask) {
-; CHECK-LABEL: test_mask_adds_epi8_rmk_128:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
-; CHECK-NEXT: vpaddsb (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xec,0x0f]
-; CHECK-NEXT: vmovdqa 
%xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] - %b = load <16 x i8>, <16 x i8>* %ptr_b - %res = call <16 x i8> @llvm.x86.avx512.mask.padds.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) - ret <16 x i8> %res -} - -define <16 x i8> @test_mask_adds_epi8_rmkz_128(<16 x i8> %a, <16 x i8>* %ptr_b, i16 %mask) { -; CHECK-LABEL: test_mask_adds_epi8_rmkz_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] -; CHECK-NEXT: vpaddsb (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xec,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] - %b = load <16 x i8>, <16 x i8>* %ptr_b - %res = call <16 x i8> @llvm.x86.avx512.mask.padds.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask) - ret <16 x i8> %res -} - -declare <16 x i8> @llvm.x86.avx512.mask.padds.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16) - -define <32 x i8> @test_mask_adds_epi8_rr_256(<32 x i8> %a, <32 x i8> %b) { -; CHECK-LABEL: test_mask_adds_epi8_rr_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpaddsb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xec,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <32 x i8> @llvm.x86.avx512.mask.padds.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1) - ret <32 x i8> %res -} - -define <32 x i8> @test_mask_adds_epi8_rrk_256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) { -; CHECK-LABEL: test_mask_adds_epi8_rrk_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] -; CHECK-NEXT: vpaddsb %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xec,0xd1] -; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <32 x i8> @llvm.x86.avx512.mask.padds.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) - ret <32 x i8> %res -} - -define <32 x i8> @test_mask_adds_epi8_rrkz_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) { -; CHECK-LABEL: test_mask_adds_epi8_rrkz_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] -; CHECK-NEXT: vpaddsb %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xec,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <32 x i8> @llvm.x86.avx512.mask.padds.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask) - ret <32 x i8> %res -} - -define <32 x i8> @test_mask_adds_epi8_rm_256(<32 x i8> %a, <32 x i8>* %ptr_b) { -; CHECK-LABEL: test_mask_adds_epi8_rm_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpaddsb (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xec,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] - %b = load <32 x i8>, <32 x i8>* %ptr_b - %res = call <32 x i8> @llvm.x86.avx512.mask.padds.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1) - ret <32 x i8> %res -} - -define <32 x i8> @test_mask_adds_epi8_rmk_256(<32 x i8> %a, <32 x i8>* %ptr_b, <32 x i8> %passThru, i32 %mask) { -; CHECK-LABEL: test_mask_adds_epi8_rmk_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] -; CHECK-NEXT: vpaddsb (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xec,0x0f] -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] - %b = load <32 x i8>, <32 x i8>* %ptr_b - %res = call <32 x i8> @llvm.x86.avx512.mask.padds.b.256(<32 x 
i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) - ret <32 x i8> %res -} - -define <32 x i8> @test_mask_adds_epi8_rmkz_256(<32 x i8> %a, <32 x i8>* %ptr_b, i32 %mask) { -; CHECK-LABEL: test_mask_adds_epi8_rmkz_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] -; CHECK-NEXT: vpaddsb (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xec,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] - %b = load <32 x i8>, <32 x i8>* %ptr_b - %res = call <32 x i8> @llvm.x86.avx512.mask.padds.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask) - ret <32 x i8> %res -} - -declare <32 x i8> @llvm.x86.avx512.mask.padds.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32) - -define <16 x i8> @test_mask_subs_epi8_rr_128(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_mask_subs_epi8_rr_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe8,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <16 x i8> @llvm.x86.avx512.mask.psubs.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1) - ret <16 x i8> %res -} - -define <16 x i8> @test_mask_subs_epi8_rrk_128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) { -; CHECK-LABEL: test_mask_subs_epi8_rrk_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] -; CHECK-NEXT: vpsubsb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xe8,0xd1] -; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <16 x i8> @llvm.x86.avx512.mask.psubs.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) - ret <16 x i8> %res -} - -define <16 x i8> @test_mask_subs_epi8_rrkz_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) { -; CHECK-LABEL: test_mask_subs_epi8_rrkz_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] -; CHECK-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xe8,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <16 x i8> @llvm.x86.avx512.mask.psubs.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask) - ret <16 x i8> %res -} - -define <16 x i8> @test_mask_subs_epi8_rm_128(<16 x i8> %a, <16 x i8>* %ptr_b) { -; CHECK-LABEL: test_mask_subs_epi8_rm_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpsubsb (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe8,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] - %b = load <16 x i8>, <16 x i8>* %ptr_b - %res = call <16 x i8> @llvm.x86.avx512.mask.psubs.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1) - ret <16 x i8> %res -} - -define <16 x i8> @test_mask_subs_epi8_rmk_128(<16 x i8> %a, <16 x i8>* %ptr_b, <16 x i8> %passThru, i16 %mask) { -; CHECK-LABEL: test_mask_subs_epi8_rmk_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] -; CHECK-NEXT: vpsubsb (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xe8,0x0f] -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] - %b = load <16 x i8>, <16 x i8>* %ptr_b - %res = call <16 x i8> @llvm.x86.avx512.mask.psubs.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) - ret <16 x i8> %res -} - -define <16 x i8> @test_mask_subs_epi8_rmkz_128(<16 x i8> %a, <16 x i8>* %ptr_b, i16 %mask) { -; CHECK-LABEL: 
test_mask_subs_epi8_rmkz_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] -; CHECK-NEXT: vpsubsb (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xe8,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] - %b = load <16 x i8>, <16 x i8>* %ptr_b - %res = call <16 x i8> @llvm.x86.avx512.mask.psubs.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask) - ret <16 x i8> %res -} - -declare <16 x i8> @llvm.x86.avx512.mask.psubs.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16) - -define <32 x i8> @test_mask_subs_epi8_rr_256(<32 x i8> %a, <32 x i8> %b) { -; CHECK-LABEL: test_mask_subs_epi8_rr_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpsubsb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe8,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <32 x i8> @llvm.x86.avx512.mask.psubs.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1) - ret <32 x i8> %res -} - -define <32 x i8> @test_mask_subs_epi8_rrk_256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) { -; CHECK-LABEL: test_mask_subs_epi8_rrk_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] -; CHECK-NEXT: vpsubsb %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xe8,0xd1] -; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <32 x i8> @llvm.x86.avx512.mask.psubs.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) - ret <32 x i8> %res -} - -define <32 x i8> @test_mask_subs_epi8_rrkz_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) { -; CHECK-LABEL: test_mask_subs_epi8_rrkz_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] -; CHECK-NEXT: vpsubsb %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xe8,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <32 x i8> @llvm.x86.avx512.mask.psubs.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask) - ret <32 x i8> %res -} - -define <32 x i8> @test_mask_subs_epi8_rm_256(<32 x i8> %a, <32 x i8>* %ptr_b) { -; CHECK-LABEL: test_mask_subs_epi8_rm_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpsubsb (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe8,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] - %b = load <32 x i8>, <32 x i8>* %ptr_b - %res = call <32 x i8> @llvm.x86.avx512.mask.psubs.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1) - ret <32 x i8> %res -} - -define <32 x i8> @test_mask_subs_epi8_rmk_256(<32 x i8> %a, <32 x i8>* %ptr_b, <32 x i8> %passThru, i32 %mask) { -; CHECK-LABEL: test_mask_subs_epi8_rmk_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] -; CHECK-NEXT: vpsubsb (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xe8,0x0f] -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] - %b = load <32 x i8>, <32 x i8>* %ptr_b - %res = call <32 x i8> @llvm.x86.avx512.mask.psubs.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) - ret <32 x i8> %res -} - -define <32 x i8> @test_mask_subs_epi8_rmkz_256(<32 x i8> %a, <32 x i8>* %ptr_b, i32 %mask) { -; CHECK-LABEL: test_mask_subs_epi8_rmkz_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] -; CHECK-NEXT: vpsubsb (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xe8,0x07] -; 
CHECK-NEXT: retq ## encoding: [0xc3] - %b = load <32 x i8>, <32 x i8>* %ptr_b - %res = call <32 x i8> @llvm.x86.avx512.mask.psubs.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask) - ret <32 x i8> %res -} - -declare <32 x i8> @llvm.x86.avx512.mask.psubs.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32) - -define <16 x i8> @test_mask_adds_epu8_rr_128(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_mask_adds_epu8_rr_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdc,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1) - ret <16 x i8> %res -} - -define <16 x i8> @test_mask_adds_epu8_rrk_128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) { -; CHECK-LABEL: test_mask_adds_epu8_rrk_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] -; CHECK-NEXT: vpaddusb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xdc,0xd1] -; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) - ret <16 x i8> %res -} - -define <16 x i8> @test_mask_adds_epu8_rrkz_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) { -; CHECK-LABEL: test_mask_adds_epu8_rrkz_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] -; CHECK-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xdc,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask) - ret <16 x i8> %res -} - -define <16 x i8> @test_mask_adds_epu8_rm_128(<16 x i8> %a, <16 x i8>* %ptr_b) { -; CHECK-LABEL: test_mask_adds_epu8_rm_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpaddusb (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdc,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] - %b = load <16 x i8>, <16 x i8>* %ptr_b - %res = call <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1) - ret <16 x i8> %res -} - -define <16 x i8> @test_mask_adds_epu8_rmk_128(<16 x i8> %a, <16 x i8>* %ptr_b, <16 x i8> %passThru, i16 %mask) { -; CHECK-LABEL: test_mask_adds_epu8_rmk_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] -; CHECK-NEXT: vpaddusb (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xdc,0x0f] -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] - %b = load <16 x i8>, <16 x i8>* %ptr_b - %res = call <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) - ret <16 x i8> %res -} - -define <16 x i8> @test_mask_adds_epu8_rmkz_128(<16 x i8> %a, <16 x i8>* %ptr_b, i16 %mask) { -; CHECK-LABEL: test_mask_adds_epu8_rmkz_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] -; CHECK-NEXT: vpaddusb (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xdc,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] - %b = load <16 x i8>, <16 x i8>* %ptr_b - %res = call <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask) - 
ret <16 x i8> %res -} - -declare <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16) - -define <32 x i8> @test_mask_adds_epu8_rr_256(<32 x i8> %a, <32 x i8> %b) { -; CHECK-LABEL: test_mask_adds_epu8_rr_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpaddusb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdc,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1) - ret <32 x i8> %res -} - -define <32 x i8> @test_mask_adds_epu8_rrk_256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) { -; CHECK-LABEL: test_mask_adds_epu8_rrk_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] -; CHECK-NEXT: vpaddusb %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xdc,0xd1] -; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) - ret <32 x i8> %res -} - -define <32 x i8> @test_mask_adds_epu8_rrkz_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) { -; CHECK-LABEL: test_mask_adds_epu8_rrkz_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] -; CHECK-NEXT: vpaddusb %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xdc,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask) - ret <32 x i8> %res -} - -define <32 x i8> @test_mask_adds_epu8_rm_256(<32 x i8> %a, <32 x i8>* %ptr_b) { -; CHECK-LABEL: test_mask_adds_epu8_rm_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpaddusb (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdc,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] - %b = load <32 x i8>, <32 x i8>* %ptr_b - %res = call <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1) - ret <32 x i8> %res -} - -define <32 x i8> @test_mask_adds_epu8_rmk_256(<32 x i8> %a, <32 x i8>* %ptr_b, <32 x i8> %passThru, i32 %mask) { -; CHECK-LABEL: test_mask_adds_epu8_rmk_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] -; CHECK-NEXT: vpaddusb (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xdc,0x0f] -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] - %b = load <32 x i8>, <32 x i8>* %ptr_b - %res = call <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) - ret <32 x i8> %res -} - -define <32 x i8> @test_mask_adds_epu8_rmkz_256(<32 x i8> %a, <32 x i8>* %ptr_b, i32 %mask) { -; CHECK-LABEL: test_mask_adds_epu8_rmkz_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] -; CHECK-NEXT: vpaddusb (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xdc,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] - %b = load <32 x i8>, <32 x i8>* %ptr_b - %res = call <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask) - ret <32 x i8> %res -} - -declare <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32) - -define <16 x i8> @test_mask_subs_epu8_rr_128(<16 x i8> %a, <16 x i8> %b) { -; 
CHECK-LABEL: test_mask_subs_epu8_rr_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd8,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1) - ret <16 x i8> %res -} - -define <16 x i8> @test_mask_subs_epu8_rrk_128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) { -; CHECK-LABEL: test_mask_subs_epu8_rrk_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] -; CHECK-NEXT: vpsubusb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd8,0xd1] -; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) - ret <16 x i8> %res -} - -define <16 x i8> @test_mask_subs_epu8_rrkz_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) { -; CHECK-LABEL: test_mask_subs_epu8_rrkz_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] -; CHECK-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd8,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask) - ret <16 x i8> %res -} - -define <16 x i8> @test_mask_subs_epu8_rm_128(<16 x i8> %a, <16 x i8>* %ptr_b) { -; CHECK-LABEL: test_mask_subs_epu8_rm_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpsubusb (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd8,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] - %b = load <16 x i8>, <16 x i8>* %ptr_b - %res = call <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1) - ret <16 x i8> %res -} - -define <16 x i8> @test_mask_subs_epu8_rmk_128(<16 x i8> %a, <16 x i8>* %ptr_b, <16 x i8> %passThru, i16 %mask) { -; CHECK-LABEL: test_mask_subs_epu8_rmk_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] -; CHECK-NEXT: vpsubusb (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd8,0x0f] -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] - %b = load <16 x i8>, <16 x i8>* %ptr_b - %res = call <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) - ret <16 x i8> %res -} - -define <16 x i8> @test_mask_subs_epu8_rmkz_128(<16 x i8> %a, <16 x i8>* %ptr_b, i16 %mask) { -; CHECK-LABEL: test_mask_subs_epu8_rmkz_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] -; CHECK-NEXT: vpsubusb (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd8,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] - %b = load <16 x i8>, <16 x i8>* %ptr_b - %res = call <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask) - ret <16 x i8> %res -} - -declare <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16) - -define <32 x i8> @test_mask_subs_epu8_rr_256(<32 x i8> %a, <32 x i8> %b) { -; CHECK-LABEL: test_mask_subs_epu8_rr_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpsubusb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd8,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] - 
%res = call <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1) - ret <32 x i8> %res -} - -define <32 x i8> @test_mask_subs_epu8_rrk_256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) { -; CHECK-LABEL: test_mask_subs_epu8_rrk_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] -; CHECK-NEXT: vpsubusb %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd8,0xd1] -; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) - ret <32 x i8> %res -} - -define <32 x i8> @test_mask_subs_epu8_rrkz_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) { -; CHECK-LABEL: test_mask_subs_epu8_rrkz_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] -; CHECK-NEXT: vpsubusb %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd8,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask) - ret <32 x i8> %res -} - -define <32 x i8> @test_mask_subs_epu8_rm_256(<32 x i8> %a, <32 x i8>* %ptr_b) { -; CHECK-LABEL: test_mask_subs_epu8_rm_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpsubusb (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd8,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] - %b = load <32 x i8>, <32 x i8>* %ptr_b - %res = call <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1) - ret <32 x i8> %res -} - -define <32 x i8> @test_mask_subs_epu8_rmk_256(<32 x i8> %a, <32 x i8>* %ptr_b, <32 x i8> %passThru, i32 %mask) { -; CHECK-LABEL: test_mask_subs_epu8_rmk_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] -; CHECK-NEXT: vpsubusb (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd8,0x0f] -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] - %b = load <32 x i8>, <32 x i8>* %ptr_b - %res = call <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) - ret <32 x i8> %res -} - -define <32 x i8> @test_mask_subs_epu8_rmkz_256(<32 x i8> %a, <32 x i8>* %ptr_b, i32 %mask) { -; CHECK-LABEL: test_mask_subs_epu8_rmkz_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] -; CHECK-NEXT: vpsubusb (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd8,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] - %b = load <32 x i8>, <32 x i8>* %ptr_b - %res = call <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask) - ret <32 x i8> %res -} - -declare <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32) - declare <8 x i16> @llvm.x86.avx512.mask.pmaddubs.w.128(<16 x i8>, <16 x i8>, <8 x i16>, i8) define <8 x i16>@test_int_x86_avx512_mask_pmaddubs_w_128(<16 x i8> %x0, <16 x i8> %x1, <8 x i16> %x2, i8 %x3) { diff --git a/test/CodeGen/X86/avx512bwvl-intrinsics.ll b/test/CodeGen/X86/avx512bwvl-intrinsics.ll index 4bf48e970b4..221d0388c2b 100644 --- a/test/CodeGen/X86/avx512bwvl-intrinsics.ll +++ b/test/CodeGen/X86/avx512bwvl-intrinsics.ll @@ -758,6 +758,1046 @@ define <32 x i8> 
@test_mask_packus_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %pt declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16>, <16 x i16>) +define <8 x i16> @test_mask_adds_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_mask_adds_epi16_rr_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xed,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <8 x i16> @llvm.x86.avx512.mask.padds.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1) + ret <8 x i16> %res +} + +define <8 x i16> @test_mask_adds_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) { +; CHECK-LABEL: test_mask_adds_epi16_rrk_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpaddsw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xed,0xd1] +; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <8 x i16> @llvm.x86.avx512.mask.padds.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) + ret <8 x i16> %res +} + +define <8 x i16> @test_mask_adds_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) { +; CHECK-LABEL: test_mask_adds_epi16_rrkz_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xed,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <8 x i16> @llvm.x86.avx512.mask.padds.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask) + ret <8 x i16> %res +} + +define <8 x i16> @test_mask_adds_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) { +; CHECK-LABEL: test_mask_adds_epi16_rm_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpaddsw (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xed,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <8 x i16>, <8 x i16>* %ptr_b + %res = call <8 x i16> @llvm.x86.avx512.mask.padds.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1) + ret <8 x i16> %res +} + +define <8 x i16> @test_mask_adds_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) { +; CHECK-LABEL: test_mask_adds_epi16_rmk_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpaddsw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xed,0x0f] +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <8 x i16>, <8 x i16>* %ptr_b + %res = call <8 x i16> @llvm.x86.avx512.mask.padds.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) + ret <8 x i16> %res +} + +define <8 x i16> @test_mask_adds_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) { +; CHECK-LABEL: test_mask_adds_epi16_rmkz_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpaddsw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xed,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <8 x i16>, <8 x i16>* %ptr_b + %res = call <8 x i16> @llvm.x86.avx512.mask.padds.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask) + ret <8 x i16> %res +} + +declare <8 x i16> @llvm.x86.avx512.mask.padds.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8) + +define <16 x i16> @test_mask_adds_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) { +; 
CHECK-LABEL: test_mask_adds_epi16_rr_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpaddsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xed,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <16 x i16> @llvm.x86.avx512.mask.padds.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1) + ret <16 x i16> %res +} + +define <16 x i16> @test_mask_adds_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) { +; CHECK-LABEL: test_mask_adds_epi16_rrk_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpaddsw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xed,0xd1] +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <16 x i16> @llvm.x86.avx512.mask.padds.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) + ret <16 x i16> %res +} + +define <16 x i16> @test_mask_adds_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) { +; CHECK-LABEL: test_mask_adds_epi16_rrkz_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpaddsw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xed,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <16 x i16> @llvm.x86.avx512.mask.padds.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask) + ret <16 x i16> %res +} + +define <16 x i16> @test_mask_adds_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) { +; CHECK-LABEL: test_mask_adds_epi16_rm_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpaddsw (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xed,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <16 x i16>, <16 x i16>* %ptr_b + %res = call <16 x i16> @llvm.x86.avx512.mask.padds.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1) + ret <16 x i16> %res +} + +define <16 x i16> @test_mask_adds_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) { +; CHECK-LABEL: test_mask_adds_epi16_rmk_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpaddsw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xed,0x0f] +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <16 x i16>, <16 x i16>* %ptr_b + %res = call <16 x i16> @llvm.x86.avx512.mask.padds.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) + ret <16 x i16> %res +} + +define <16 x i16> @test_mask_adds_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) { +; CHECK-LABEL: test_mask_adds_epi16_rmkz_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpaddsw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xed,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <16 x i16>, <16 x i16>* %ptr_b + %res = call <16 x i16> @llvm.x86.avx512.mask.padds.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask) + ret <16 x i16> %res +} + +declare <16 x i16> @llvm.x86.avx512.mask.padds.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16) + +define <8 x i16> @test_mask_subs_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_mask_subs_epi16_rr_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpsubsw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: 
[0xc5,0xf9,0xe9,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <8 x i16> @llvm.x86.avx512.mask.psubs.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1) + ret <8 x i16> %res +} + +define <8 x i16> @test_mask_subs_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) { +; CHECK-LABEL: test_mask_subs_epi16_rrk_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpsubsw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xe9,0xd1] +; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <8 x i16> @llvm.x86.avx512.mask.psubs.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) + ret <8 x i16> %res +} + +define <8 x i16> @test_mask_subs_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) { +; CHECK-LABEL: test_mask_subs_epi16_rrkz_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpsubsw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xe9,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <8 x i16> @llvm.x86.avx512.mask.psubs.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask) + ret <8 x i16> %res +} + +define <8 x i16> @test_mask_subs_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) { +; CHECK-LABEL: test_mask_subs_epi16_rm_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpsubsw (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe9,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <8 x i16>, <8 x i16>* %ptr_b + %res = call <8 x i16> @llvm.x86.avx512.mask.psubs.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1) + ret <8 x i16> %res +} + +define <8 x i16> @test_mask_subs_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) { +; CHECK-LABEL: test_mask_subs_epi16_rmk_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpsubsw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xe9,0x0f] +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <8 x i16>, <8 x i16>* %ptr_b + %res = call <8 x i16> @llvm.x86.avx512.mask.psubs.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) + ret <8 x i16> %res +} + +define <8 x i16> @test_mask_subs_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) { +; CHECK-LABEL: test_mask_subs_epi16_rmkz_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpsubsw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xe9,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <8 x i16>, <8 x i16>* %ptr_b + %res = call <8 x i16> @llvm.x86.avx512.mask.psubs.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask) + ret <8 x i16> %res +} + +declare <8 x i16> @llvm.x86.avx512.mask.psubs.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8) + +define <16 x i16> @test_mask_subs_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) { +; CHECK-LABEL: test_mask_subs_epi16_rr_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpsubsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe9,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <16 x i16> @llvm.x86.avx512.mask.psubs.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1) + ret <16 x i16> %res +} + 
+define <16 x i16> @test_mask_subs_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) { +; CHECK-LABEL: test_mask_subs_epi16_rrk_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpsubsw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xe9,0xd1] +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <16 x i16> @llvm.x86.avx512.mask.psubs.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) + ret <16 x i16> %res +} + +define <16 x i16> @test_mask_subs_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) { +; CHECK-LABEL: test_mask_subs_epi16_rrkz_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpsubsw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xe9,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <16 x i16> @llvm.x86.avx512.mask.psubs.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask) + ret <16 x i16> %res +} + +define <16 x i16> @test_mask_subs_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) { +; CHECK-LABEL: test_mask_subs_epi16_rm_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpsubsw (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe9,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <16 x i16>, <16 x i16>* %ptr_b + %res = call <16 x i16> @llvm.x86.avx512.mask.psubs.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1) + ret <16 x i16> %res +} + +define <16 x i16> @test_mask_subs_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) { +; CHECK-LABEL: test_mask_subs_epi16_rmk_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpsubsw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xe9,0x0f] +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <16 x i16>, <16 x i16>* %ptr_b + %res = call <16 x i16> @llvm.x86.avx512.mask.psubs.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) + ret <16 x i16> %res +} + +define <16 x i16> @test_mask_subs_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) { +; CHECK-LABEL: test_mask_subs_epi16_rmkz_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpsubsw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xe9,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <16 x i16>, <16 x i16>* %ptr_b + %res = call <16 x i16> @llvm.x86.avx512.mask.psubs.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask) + ret <16 x i16> %res +} + +declare <16 x i16> @llvm.x86.avx512.mask.psubs.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16) + +define <8 x i16> @test_mask_adds_epu16_rr_128(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_mask_adds_epu16_rr_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdd,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1) + ret <8 x i16> %res +} + +define <8 x i16> @test_mask_adds_epu16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) { +; CHECK-LABEL: 
test_mask_adds_epu16_rrk_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpaddusw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xdd,0xd1] +; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) + ret <8 x i16> %res +} + +define <8 x i16> @test_mask_adds_epu16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) { +; CHECK-LABEL: test_mask_adds_epu16_rrkz_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xdd,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask) + ret <8 x i16> %res +} + +define <8 x i16> @test_mask_adds_epu16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) { +; CHECK-LABEL: test_mask_adds_epu16_rm_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpaddusw (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdd,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <8 x i16>, <8 x i16>* %ptr_b + %res = call <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1) + ret <8 x i16> %res +} + +define <8 x i16> @test_mask_adds_epu16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) { +; CHECK-LABEL: test_mask_adds_epu16_rmk_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpaddusw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xdd,0x0f] +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <8 x i16>, <8 x i16>* %ptr_b + %res = call <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) + ret <8 x i16> %res +} + +define <8 x i16> @test_mask_adds_epu16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) { +; CHECK-LABEL: test_mask_adds_epu16_rmkz_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpaddusw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xdd,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <8 x i16>, <8 x i16>* %ptr_b + %res = call <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask) + ret <8 x i16> %res +} + +declare <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8) + +define <16 x i16> @test_mask_adds_epu16_rr_256(<16 x i16> %a, <16 x i16> %b) { +; CHECK-LABEL: test_mask_adds_epu16_rr_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpaddusw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdd,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1) + ret <16 x i16> %res +} + +define <16 x i16> @test_mask_adds_epu16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) { +; CHECK-LABEL: test_mask_adds_epu16_rrk_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpaddusw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: 
[0x62,0xf1,0x7d,0x29,0xdd,0xd1] +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) + ret <16 x i16> %res +} + +define <16 x i16> @test_mask_adds_epu16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) { +; CHECK-LABEL: test_mask_adds_epu16_rrkz_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpaddusw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xdd,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask) + ret <16 x i16> %res +} + +define <16 x i16> @test_mask_adds_epu16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) { +; CHECK-LABEL: test_mask_adds_epu16_rm_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpaddusw (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdd,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <16 x i16>, <16 x i16>* %ptr_b + %res = call <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1) + ret <16 x i16> %res +} + +define <16 x i16> @test_mask_adds_epu16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) { +; CHECK-LABEL: test_mask_adds_epu16_rmk_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpaddusw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xdd,0x0f] +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <16 x i16>, <16 x i16>* %ptr_b + %res = call <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) + ret <16 x i16> %res +} + +define <16 x i16> @test_mask_adds_epu16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) { +; CHECK-LABEL: test_mask_adds_epu16_rmkz_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpaddusw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xdd,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <16 x i16>, <16 x i16>* %ptr_b + %res = call <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask) + ret <16 x i16> %res +} + +declare <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16) + +define <8 x i16> @test_mask_subs_epu16_rr_128(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_mask_subs_epu16_rr_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd9,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1) + ret <8 x i16> %res +} + +define <8 x i16> @test_mask_subs_epu16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) { +; CHECK-LABEL: test_mask_subs_epu16_rrk_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpsubusw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd9,0xd1] +; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] +; CHECK-NEXT: retq 
## encoding: [0xc3] + %res = call <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) + ret <8 x i16> %res +} + +define <8 x i16> @test_mask_subs_epu16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) { +; CHECK-LABEL: test_mask_subs_epu16_rrkz_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd9,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask) + ret <8 x i16> %res +} + +define <8 x i16> @test_mask_subs_epu16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) { +; CHECK-LABEL: test_mask_subs_epu16_rm_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpsubusw (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd9,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <8 x i16>, <8 x i16>* %ptr_b + %res = call <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1) + ret <8 x i16> %res +} + +define <8 x i16> @test_mask_subs_epu16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) { +; CHECK-LABEL: test_mask_subs_epu16_rmk_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpsubusw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd9,0x0f] +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <8 x i16>, <8 x i16>* %ptr_b + %res = call <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) + ret <8 x i16> %res +} + +define <8 x i16> @test_mask_subs_epu16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) { +; CHECK-LABEL: test_mask_subs_epu16_rmkz_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpsubusw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd9,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <8 x i16>, <8 x i16>* %ptr_b + %res = call <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask) + ret <8 x i16> %res +} + +declare <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8) + +define <16 x i16> @test_mask_subs_epu16_rr_256(<16 x i16> %a, <16 x i16> %b) { +; CHECK-LABEL: test_mask_subs_epu16_rr_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpsubusw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd9,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1) + ret <16 x i16> %res +} + +define <16 x i16> @test_mask_subs_epu16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) { +; CHECK-LABEL: test_mask_subs_epu16_rrk_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpsubusw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd9,0xd1] +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) + ret <16 x i16> %res +} + +define <16 x i16> 
@test_mask_subs_epu16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) {
+; CHECK-LABEL: test_mask_subs_epu16_rrkz_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpsubusw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd9,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
+ ret <16 x i16> %res
+}
+
+define <16 x i16> @test_mask_subs_epu16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) {
+; CHECK-LABEL: test_mask_subs_epu16_rm_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpsubusw (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd9,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %b = load <16 x i16>, <16 x i16>* %ptr_b
+ %res = call <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
+ ret <16 x i16> %res
+}
+
+define <16 x i16> @test_mask_subs_epu16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) {
+; CHECK-LABEL: test_mask_subs_epu16_rmk_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT: vpsubusw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd9,0x0f]
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %b = load <16 x i16>, <16 x i16>* %ptr_b
+ %res = call <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask)
+ ret <16 x i16> %res
+}
+
+define <16 x i16> @test_mask_subs_epu16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) {
+; CHECK-LABEL: test_mask_subs_epu16_rmkz_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT: vpsubusw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd9,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %b = load <16 x i16>, <16 x i16>* %ptr_b
+ %res = call <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
+ ret <16 x i16> %res
+}
+
+declare <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
+
+define <16 x i8> @test_mask_adds_epi8_rr_128(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: test_mask_adds_epi8_rr_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xec,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <16 x i8> @llvm.x86.avx512.mask.padds.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1)
+ ret <16 x i8> %res
+}
+
+define <16 x i8> @test_mask_adds_epi8_rrk_128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) {
+; CHECK-LABEL: test_mask_adds_epi8_rrk_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpaddsb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xec,0xd1]
+; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <16 x i8> @llvm.x86.avx512.mask.padds.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask)
+ ret <16 x i8> %res
+}
+
+define <16 x i8> @test_mask_adds_epi8_rrkz_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) {
+; CHECK-LABEL: test_mask_adds_epi8_rrkz_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xec,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <16 x i8> @llvm.x86.avx512.mask.padds.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask)
+ ret <16 x i8> %res
+}
+
+define <16 x i8> @test_mask_adds_epi8_rm_128(<16 x i8> %a, <16 x i8>* %ptr_b) {
+; CHECK-LABEL: test_mask_adds_epi8_rm_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpaddsb (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xec,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %b = load <16 x i8>, <16 x i8>* %ptr_b
+ %res = call <16 x i8> @llvm.x86.avx512.mask.padds.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1)
+ ret <16 x i8> %res
+}
+
+define <16 x i8> @test_mask_adds_epi8_rmk_128(<16 x i8> %a, <16 x i8>* %ptr_b, <16 x i8> %passThru, i16 %mask) {
+; CHECK-LABEL: test_mask_adds_epi8_rmk_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT: vpaddsb (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xec,0x0f]
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %b = load <16 x i8>, <16 x i8>* %ptr_b
+ %res = call <16 x i8> @llvm.x86.avx512.mask.padds.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask)
+ ret <16 x i8> %res
+}
+
+define <16 x i8> @test_mask_adds_epi8_rmkz_128(<16 x i8> %a, <16 x i8>* %ptr_b, i16 %mask) {
+; CHECK-LABEL: test_mask_adds_epi8_rmkz_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT: vpaddsb (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xec,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %b = load <16 x i8>, <16 x i8>* %ptr_b
+ %res = call <16 x i8> @llvm.x86.avx512.mask.padds.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask)
+ ret <16 x i8> %res
+}
+
+declare <16 x i8> @llvm.x86.avx512.mask.padds.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)
+
+define <32 x i8> @test_mask_adds_epi8_rr_256(<32 x i8> %a, <32 x i8> %b) {
+; CHECK-LABEL: test_mask_adds_epi8_rr_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpaddsb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xec,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <32 x i8> @llvm.x86.avx512.mask.padds.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1)
+ ret <32 x i8> %res
+}
+
+define <32 x i8> @test_mask_adds_epi8_rrk_256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) {
+; CHECK-LABEL: test_mask_adds_epi8_rrk_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpaddsb %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xec,0xd1]
+; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <32 x i8> @llvm.x86.avx512.mask.padds.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask)
+ ret <32 x i8> %res
+}
+
+define <32 x i8> @test_mask_adds_epi8_rrkz_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) {
+; CHECK-LABEL: test_mask_adds_epi8_rrkz_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpaddsb %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xec,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <32 x i8> @llvm.x86.avx512.mask.padds.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask)
+ ret <32 x i8> %res
+}
+
+define <32 x i8> @test_mask_adds_epi8_rm_256(<32 x i8> %a, <32 x i8>* %ptr_b) {
+; CHECK-LABEL: test_mask_adds_epi8_rm_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpaddsb (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xec,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %b = load <32 x i8>, <32 x i8>* %ptr_b
+ %res = call <32 x i8> @llvm.x86.avx512.mask.padds.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1)
+ ret <32 x i8> %res
+}
+
+define <32 x i8> @test_mask_adds_epi8_rmk_256(<32 x i8> %a, <32 x i8>* %ptr_b, <32 x i8> %passThru, i32 %mask) {
+; CHECK-LABEL: test_mask_adds_epi8_rmk_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT: vpaddsb (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xec,0x0f]
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %b = load <32 x i8>, <32 x i8>* %ptr_b
+ %res = call <32 x i8> @llvm.x86.avx512.mask.padds.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask)
+ ret <32 x i8> %res
+}
+
+define <32 x i8> @test_mask_adds_epi8_rmkz_256(<32 x i8> %a, <32 x i8>* %ptr_b, i32 %mask) {
+; CHECK-LABEL: test_mask_adds_epi8_rmkz_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT: vpaddsb (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xec,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %b = load <32 x i8>, <32 x i8>* %ptr_b
+ %res = call <32 x i8> @llvm.x86.avx512.mask.padds.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask)
+ ret <32 x i8> %res
+}
+
+declare <32 x i8> @llvm.x86.avx512.mask.padds.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32)
+
+define <16 x i8> @test_mask_subs_epi8_rr_128(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: test_mask_subs_epi8_rr_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe8,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <16 x i8> @llvm.x86.avx512.mask.psubs.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1)
+ ret <16 x i8> %res
+}
+
+define <16 x i8> @test_mask_subs_epi8_rrk_128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) {
+; CHECK-LABEL: test_mask_subs_epi8_rrk_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpsubsb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xe8,0xd1]
+; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <16 x i8> @llvm.x86.avx512.mask.psubs.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask)
+ ret <16 x i8> %res
+}
+
+define <16 x i8> @test_mask_subs_epi8_rrkz_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) {
+; CHECK-LABEL: test_mask_subs_epi8_rrkz_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xe8,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <16 x i8> @llvm.x86.avx512.mask.psubs.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask)
+ ret <16 x i8> %res
+}
+
+define <16 x i8> @test_mask_subs_epi8_rm_128(<16 x i8> %a, <16 x i8>* %ptr_b) {
+; CHECK-LABEL: test_mask_subs_epi8_rm_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpsubsb (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe8,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %b = load <16 x i8>, <16 x i8>* %ptr_b
+ %res = call <16 x i8> @llvm.x86.avx512.mask.psubs.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1)
+ ret <16 x i8> %res
+}
+
+define <16 x i8> @test_mask_subs_epi8_rmk_128(<16 x i8> %a, <16 x i8>* %ptr_b, <16 x i8> %passThru, i16 %mask) {
+; CHECK-LABEL: test_mask_subs_epi8_rmk_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT: vpsubsb (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xe8,0x0f]
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %b = load <16 x i8>, <16 x i8>* %ptr_b
+ %res = call <16 x i8> @llvm.x86.avx512.mask.psubs.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask)
+ ret <16 x i8> %res
+}
+
+define <16 x i8> @test_mask_subs_epi8_rmkz_128(<16 x i8> %a, <16 x i8>* %ptr_b, i16 %mask) {
+; CHECK-LABEL: test_mask_subs_epi8_rmkz_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT: vpsubsb (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xe8,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %b = load <16 x i8>, <16 x i8>* %ptr_b
+ %res = call <16 x i8> @llvm.x86.avx512.mask.psubs.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask)
+ ret <16 x i8> %res
+}
+
+declare <16 x i8> @llvm.x86.avx512.mask.psubs.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)
+
+define <32 x i8> @test_mask_subs_epi8_rr_256(<32 x i8> %a, <32 x i8> %b) {
+; CHECK-LABEL: test_mask_subs_epi8_rr_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpsubsb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe8,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <32 x i8> @llvm.x86.avx512.mask.psubs.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1)
+ ret <32 x i8> %res
+}
+
+define <32 x i8> @test_mask_subs_epi8_rrk_256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) {
+; CHECK-LABEL: test_mask_subs_epi8_rrk_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpsubsb %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xe8,0xd1]
+; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <32 x i8> @llvm.x86.avx512.mask.psubs.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask)
+ ret <32 x i8> %res
+}
+
+define <32 x i8> @test_mask_subs_epi8_rrkz_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) {
+; CHECK-LABEL: test_mask_subs_epi8_rrkz_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpsubsb %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xe8,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <32 x i8> @llvm.x86.avx512.mask.psubs.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask)
+ ret <32 x i8> %res
+}
+
+define <32 x i8> @test_mask_subs_epi8_rm_256(<32 x i8> %a, <32 x i8>* %ptr_b) {
+; CHECK-LABEL: test_mask_subs_epi8_rm_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpsubsb (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe8,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %b = load <32 x i8>, <32 x i8>* %ptr_b
+ %res = call <32 x i8> @llvm.x86.avx512.mask.psubs.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1)
+ ret <32 x i8> %res
+}
+
+define <32 x i8> @test_mask_subs_epi8_rmk_256(<32 x i8> %a, <32 x i8>* %ptr_b, <32 x i8> %passThru, i32 %mask) {
+; CHECK-LABEL: test_mask_subs_epi8_rmk_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT: vpsubsb (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xe8,0x0f]
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %b = load <32 x i8>, <32 x i8>* %ptr_b
+ %res = call <32 x i8> @llvm.x86.avx512.mask.psubs.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask)
+ ret <32 x i8> %res
+}
+
+define <32 x i8> @test_mask_subs_epi8_rmkz_256(<32 x i8> %a, <32 x i8>* %ptr_b, i32 %mask) {
+; CHECK-LABEL: test_mask_subs_epi8_rmkz_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT: vpsubsb (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xe8,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %b = load <32 x i8>, <32 x i8>* %ptr_b
+ %res = call <32 x i8> @llvm.x86.avx512.mask.psubs.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask)
+ ret <32 x i8> %res
+}
+
+declare <32 x i8> @llvm.x86.avx512.mask.psubs.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32)
+
+define <16 x i8> @test_mask_adds_epu8_rr_128(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: test_mask_adds_epu8_rr_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdc,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1)
+ ret <16 x i8> %res
+}
+
+define <16 x i8> @test_mask_adds_epu8_rrk_128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) {
+; CHECK-LABEL: test_mask_adds_epu8_rrk_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpaddusb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xdc,0xd1]
+; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask)
+ ret <16 x i8> %res
+}
+
+define <16 x i8> @test_mask_adds_epu8_rrkz_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) {
+; CHECK-LABEL: test_mask_adds_epu8_rrkz_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xdc,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask)
+ ret <16 x i8> %res
+}
+
+define <16 x i8> @test_mask_adds_epu8_rm_128(<16 x i8> %a, <16 x i8>* %ptr_b) {
+; CHECK-LABEL: test_mask_adds_epu8_rm_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpaddusb (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdc,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %b = load <16 x i8>, <16 x i8>* %ptr_b
+ %res = call <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1)
+ ret <16 x i8> %res
+}
+
+define <16 x i8> @test_mask_adds_epu8_rmk_128(<16 x i8> %a, <16 x i8>* %ptr_b, <16 x i8> %passThru, i16 %mask) {
+; CHECK-LABEL: test_mask_adds_epu8_rmk_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT: vpaddusb (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xdc,0x0f]
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %b = load <16 x i8>, <16 x i8>* %ptr_b
+ %res = call <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask)
+ ret <16 x i8> %res
+}
+
+define <16 x i8> @test_mask_adds_epu8_rmkz_128(<16 x i8> %a, <16 x i8>* %ptr_b, i16 %mask) {
+; CHECK-LABEL: test_mask_adds_epu8_rmkz_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT: vpaddusb (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xdc,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %b = load <16 x i8>, <16 x i8>* %ptr_b
+ %res = call <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask)
+ ret <16 x i8> %res
+}
+
+declare <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)
+
+define <32 x i8> @test_mask_adds_epu8_rr_256(<32 x i8> %a, <32 x i8> %b) {
+; CHECK-LABEL: test_mask_adds_epu8_rr_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpaddusb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdc,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1)
+ ret <32 x i8> %res
+}
+
+define <32 x i8> @test_mask_adds_epu8_rrk_256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) {
+; CHECK-LABEL: test_mask_adds_epu8_rrk_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpaddusb %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xdc,0xd1]
+; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask)
+ ret <32 x i8> %res
+}
+
+define <32 x i8> @test_mask_adds_epu8_rrkz_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) {
+; CHECK-LABEL: test_mask_adds_epu8_rrkz_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpaddusb %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xdc,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask)
+ ret <32 x i8> %res
+}
+
+define <32 x i8> @test_mask_adds_epu8_rm_256(<32 x i8> %a, <32 x i8>* %ptr_b) {
+; CHECK-LABEL: test_mask_adds_epu8_rm_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpaddusb (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdc,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %b = load <32 x i8>, <32 x i8>* %ptr_b
+ %res = call <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1)
+ ret <32 x i8> %res
+}
+
+define <32 x i8> @test_mask_adds_epu8_rmk_256(<32 x i8> %a, <32 x i8>* %ptr_b, <32 x i8> %passThru, i32 %mask) {
+; CHECK-LABEL: test_mask_adds_epu8_rmk_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT: vpaddusb (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xdc,0x0f]
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %b = load <32 x i8>, <32 x i8>* %ptr_b
+ %res = call <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask)
+ ret <32 x i8> %res
+}
+
+define <32 x i8> @test_mask_adds_epu8_rmkz_256(<32 x i8> %a, <32 x i8>* %ptr_b, i32 %mask) {
+; CHECK-LABEL: test_mask_adds_epu8_rmkz_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT: vpaddusb (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xdc,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %b = load <32 x i8>, <32 x i8>* %ptr_b
+ %res = call <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask)
+ ret <32 x i8> %res
+}
+
+declare <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32)
+
+define <16 x i8> @test_mask_subs_epu8_rr_128(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: test_mask_subs_epu8_rr_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd8,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1)
+ ret <16 x i8> %res
+}
+
+define <16 x i8> @test_mask_subs_epu8_rrk_128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) {
+; CHECK-LABEL: test_mask_subs_epu8_rrk_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpsubusb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd8,0xd1]
+; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask)
+ ret <16 x i8> %res
+}
+
+define <16 x i8> @test_mask_subs_epu8_rrkz_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) {
+; CHECK-LABEL: test_mask_subs_epu8_rrkz_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd8,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask)
+ ret <16 x i8> %res
+}
+
+define <16 x i8> @test_mask_subs_epu8_rm_128(<16 x i8> %a, <16 x i8>* %ptr_b) {
+; CHECK-LABEL: test_mask_subs_epu8_rm_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpsubusb (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd8,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %b = load <16 x i8>, <16 x i8>* %ptr_b
+ %res = call <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1)
+ ret <16 x i8> %res
+}
+
+define <16 x i8> @test_mask_subs_epu8_rmk_128(<16 x i8> %a, <16 x i8>* %ptr_b, <16 x i8> %passThru, i16 %mask) {
+; CHECK-LABEL: test_mask_subs_epu8_rmk_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT: vpsubusb (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd8,0x0f]
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %b = load <16 x i8>, <16 x i8>* %ptr_b
+ %res = call <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask)
+ ret <16 x i8> %res
+}
+
+define <16 x i8> @test_mask_subs_epu8_rmkz_128(<16 x i8> %a, <16 x i8>* %ptr_b, i16 %mask) {
+; CHECK-LABEL: test_mask_subs_epu8_rmkz_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT: vpsubusb (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd8,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %b = load <16 x i8>, <16 x i8>* %ptr_b
+ %res = call <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask)
+ ret <16 x i8> %res
+}
+
+declare <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)
+
+define <32 x i8> @test_mask_subs_epu8_rr_256(<32 x i8> %a, <32 x i8> %b) {
+; CHECK-LABEL: test_mask_subs_epu8_rr_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpsubusb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd8,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1)
+ ret <32 x i8> %res
+}
+
+define <32 x i8> @test_mask_subs_epu8_rrk_256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) {
+; CHECK-LABEL: test_mask_subs_epu8_rrk_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpsubusb %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd8,0xd1]
+; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask)
+ ret <32 x i8> %res
+}
+
+define <32 x i8> @test_mask_subs_epu8_rrkz_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) {
+; CHECK-LABEL: test_mask_subs_epu8_rrkz_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpsubusb %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd8,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask)
+ ret <32 x i8> %res
+}
+
+define <32 x i8> @test_mask_subs_epu8_rm_256(<32 x i8> %a, <32 x i8>* %ptr_b) {
+; CHECK-LABEL: test_mask_subs_epu8_rm_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpsubusb (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd8,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %b = load <32 x i8>, <32 x i8>* %ptr_b
+ %res = call <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1)
+ ret <32 x i8> %res
+}
+
+define <32 x i8> @test_mask_subs_epu8_rmk_256(<32 x i8> %a, <32 x i8>* %ptr_b, <32 x i8> %passThru, i32 %mask) {
+; CHECK-LABEL: test_mask_subs_epu8_rmk_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT: vpsubusb (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd8,0x0f]
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %b = load <32 x i8>, <32 x i8>* %ptr_b
+ %res = call <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask)
+ ret <32 x i8> %res
+}
+
+define <32 x i8> @test_mask_subs_epu8_rmkz_256(<32 x i8> %a, <32 x i8>* %ptr_b, i32 %mask) {
+; CHECK-LABEL: test_mask_subs_epu8_rmkz_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT: vpsubusb (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd8,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %b = load <32 x i8>, <32 x i8>* %ptr_b
+ %res = call <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask)
+ ret <32 x i8> %res
+}
+
+declare <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32)
+
 declare <8 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
 
 define <8 x i16>@test_int_x86_avx512_mask_vpermt2var_hi_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
diff --git a/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll b/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
index 54ad815bda6..df92c47a7b7 100644
--- a/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
+++ b/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
@@ -112,17 +112,11 @@ define <2 x i64> @test_mm_adds_epi8(<2 x i64> %a0, <2 x i64> %a1) nounwind {
 ; X64-NEXT: retq
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
- %1 = sext <16 x i8> %arg0 to <16 x i16>
- %2 = sext <16 x i8> %arg1 to <16 x i16>
- %3 = add nsw <16 x i16> %1, %2
- %4 = icmp slt <16 x i16> %3,
- %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16>
- %6 = icmp sgt <16 x i16> %5,
- %7 = select <16 x i1> %6, <16 x i16> %5, <16 x i16>
- %8 = trunc <16 x i16> %7 to <16 x i8>
- %bc = bitcast <16 x i8> %8 to <2 x i64>
+ %res = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %arg0, <16 x i8> %arg1)
+ %bc = bitcast <16 x i8> %res to <2 x i64>
  ret <2 x i64> %bc
 }
+declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) nounwind readnone
 
 define <2 x i64> @test_mm_adds_epi16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
 ; X32-LABEL: test_mm_adds_epi16:
@@ -136,17 +130,11 @@ define <2 x i64> @test_mm_adds_epi16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
 ; X64-NEXT: retq
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
- %1 = sext <8 x i16> %arg0 to <8 x i32>
- %2 = sext <8 x i16> %arg1 to <8 x i32>
- %3 = add nsw <8 x i32> %1, %2
- %4 = icmp slt <8 x i32> %3,
- %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32>
- %6 = icmp sgt <8 x i32> %5,
- %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32>
- %8 = trunc <8 x i32> %7 to <8 x i16>
- %bc = bitcast <8 x i16> %8 to <2 x i64>
+ %res = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %arg0, <8 x i16> %arg1)
+ %bc = bitcast <8 x i16> %res to <2 x i64>
  ret <2 x i64> %bc
 }
+declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16>, <8 x i16>) nounwind readnone
 
 define <2 x i64> @test_mm_adds_epu8(<2 x i64> %a0, <2 x i64> %a1) nounwind {
 ; X32-LABEL: test_mm_adds_epu8:
@@ -160,15 +148,11 @@ define <2 x i64> @test_mm_adds_epu8(<2 x i64> %a0, <2 x i64> %a1) nounwind {
 ; X64-NEXT: retq
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
- %1 = zext <16 x i8> %arg0 to <16 x i16>
- %2 = zext <16 x i8> %arg1 to <16 x i16>
- %3 = add nsw <16 x i16> %1, %2
- %4 = icmp ult <16 x i16> %3,
- %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16>
- %6 = trunc <16 x i16> %5 to <16 x i8>
- %bc = bitcast <16 x i8> %6 to <2 x i64>
+ %res = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %arg0, <16 x i8> %arg1)
+ %bc = bitcast <16 x i8> %res to <2 x i64>
  ret <2 x i64> %bc
 }
+declare <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8>, <16 x i8>) nounwind readnone
 
 define <2 x i64> @test_mm_adds_epu16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
 ; X32-LABEL: test_mm_adds_epu16:
@@ -182,15 +166,11 @@ define <2 x i64> @test_mm_adds_epu16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
 ; X64-NEXT: retq
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
- %1 = zext <8 x i16> %arg0 to <8 x i32>
- %2 = zext <8 x i16> %arg1 to <8 x i32>
- %3 = add nsw <8 x i32> %1, %2
- %4 = icmp ult <8 x i32> %3,
- %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32>
- %6 = trunc <8 x i32> %5 to <8 x i16>
- %bc = bitcast <8 x i16> %6 to <2 x i64>
+ %res = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %arg0, <8 x i16> %arg1)
+ %bc = bitcast <8 x i16> %res to <2 x i64>
  ret <2 x i64> %bc
 }
+declare <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16>, <8 x i16>) nounwind readnone
 
 define <2 x double> @test_mm_and_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
 ; X32-LABEL: test_mm_and_pd:
@@ -3527,17 +3507,11 @@ define <2 x i64> @test_mm_subs_epi8(<2 x i64> %a0, <2 x i64> %a1) nounwind {
 ; X64-NEXT: retq
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
- %1 = sext <16 x i8> %arg0 to <16 x i16>
- %2 = sext <16 x i8> %arg1 to <16 x i16>
- %3 = sub nsw <16 x i16> %1, %2
- %4 = icmp slt <16 x i16> %3,
- %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16>
- %6 = icmp sgt <16 x i16> %5,
- %7 = select <16 x i1> %6, <16 x i16> %5, <16 x i16>
- %8 = trunc <16 x i16> %7 to <16 x i8>
- %bc = bitcast <16 x i8> %8 to <2 x i64>
+ %res = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %arg0, <16 x i8> %arg1)
+ %bc = bitcast <16 x i8> %res to <2 x i64>
  ret <2 x i64> %bc
 }
+declare <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8>, <16 x i8>) nounwind readnone
 
 define <2 x i64> @test_mm_subs_epi16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
 ; X32-LABEL: test_mm_subs_epi16:
@@ -3551,69 +3525,47 @@ define <2 x i64> @test_mm_subs_epi16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
 ; X64-NEXT: retq
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
- %1 = sext <8 x i16> %arg0 to <8 x i32>
- %2 = sext <8 x i16> %arg1 to <8 x i32>
- %3 = sub nsw <8 x i32> %1, %2
- %4 = icmp slt <8 x i32> %3,
- %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32>
- %6 = icmp sgt <8 x i32> %5,
- %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32>
- %8 = trunc <8 x i32> %7 to <8 x i16>
- %bc = bitcast <8 x i16> %8 to <2 x i64>
+ %res = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %arg0, <8 x i16> %arg1)
+ %bc = bitcast <8 x i16> %res to <2 x i64>
  ret <2 x i64> %bc
 }
+declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone
 
 define <2 x i64> @test_mm_subs_epu8(<2 x i64> %a0, <2 x i64> %a1) nounwind {
 ; X32-LABEL: test_mm_subs_epu8:
 ; X32: # %bb.0:
-; X32-NEXT: pmaxub %xmm1, %xmm0
-; X32-NEXT: psubb %xmm1, %xmm0
+; X32-NEXT: psubusb %xmm1, %xmm0
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: test_mm_subs_epu8:
 ; X64: # %bb.0:
-; X64-NEXT: pmaxub %xmm1, %xmm0
-; X64-NEXT: psubb %xmm1, %xmm0
+; X64-NEXT: psubusb %xmm1, %xmm0
 ; X64-NEXT: retq
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
- %cmp = icmp ugt <16 x i8> %arg0, %arg1
- %sel = select <16 x i1> %cmp, <16 x i8> %arg0, <16 x i8> %arg1
- %sub = sub <16 x i8> %sel, %arg1
- %bc = bitcast <16 x i8> %sub to <2 x i64>
+ %res = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %arg0, <16 x i8> %arg1)
+ %bc = bitcast <16 x i8> %res to <2 x i64>
  ret <2 x i64> %bc
 }
+declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnone
 
 define <2 x i64> @test_mm_subs_epu16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
 ; X32-LABEL: test_mm_subs_epu16:
 ; X32: # %bb.0:
-; X32-NEXT: movdqa .LCPI190_0, %xmm2 # xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; X32-NEXT: movdqa %xmm1, %xmm3
-; X32-NEXT: pxor %xmm2, %xmm3
-; X32-NEXT: pxor %xmm2, %xmm0
-; X32-NEXT: pmaxsw %xmm3, %xmm0
-; X32-NEXT: pxor %xmm2, %xmm0
-; X32-NEXT: psubw %xmm1, %xmm0
+; X32-NEXT: psubusw %xmm1, %xmm0
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: test_mm_subs_epu16:
 ; X64: # %bb.0:
-; X64-NEXT: movdqa .LCPI190_0(%rip), %xmm2 # xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; X64-NEXT: movdqa %xmm1, %xmm3
-; X64-NEXT: pxor %xmm2, %xmm3
-; X64-NEXT: pxor %xmm2, %xmm0
-; X64-NEXT: pmaxsw %xmm3, %xmm0
-; X64-NEXT: pxor %xmm2, %xmm0
-; X64-NEXT: psubw %xmm1, %xmm0
+; X64-NEXT: psubusw %xmm1, %xmm0
 ; X64-NEXT: retq
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
- %cmp = icmp ugt <8 x i16> %arg0, %arg1
- %sel = select <8 x i1> %cmp, <8 x i16> %arg0, <8 x i16> %arg1
- %sub = sub <8 x i16> %sel, %arg1
- %bc = bitcast <8 x i16> %sub to <2 x i64>
+ %res = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %arg0, <8 x i16> %arg1)
+ %bc = bitcast <8 x i16> %res to <2 x i64>
  ret <2 x i64> %bc
 }
+declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone
 
 define i32 @test_mm_ucomieq_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
 ; X32-LABEL: test_mm_ucomieq_sd:
diff --git a/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll b/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll
index d8d4e95791d..ccc4b1208e0 100644
--- a/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll
+++ b/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll
@@ -246,172 +246,6 @@ define <8 x i16> @mm_avg_epu16(<8 x i16> %a0, <8 x i16> %a1) {
 }
 declare <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16>, <8 x i16>) nounwind readnone
 
-define <16 x i8> @test_x86_sse2_padds_b(<16 x i8> %a0, <16 x i8> %a1) {
-; SSE-LABEL: test_x86_sse2_padds_b:
-; SSE: ## %bb.0:
-; SSE-NEXT: paddsb %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xec,0xc1]
-; SSE-NEXT: retl ## encoding: [0xc3]
-;
-; AVX2-LABEL: test_x86_sse2_padds_b:
-; AVX2: ## %bb.0:
-; AVX2-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xec,0xc1]
-; AVX2-NEXT: retl ## encoding: [0xc3]
-;
-; SKX-LABEL: test_x86_sse2_padds_b:
-; SKX: ## %bb.0:
-; SKX-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xec,0xc1]
-; SKX-NEXT: retl ## encoding: [0xc3]
- %res = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
- ret <16 x i8> %res
-}
-declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) nounwind readnone
-
-
-define <8 x i16> @test_x86_sse2_padds_w(<8 x i16> %a0, <8 x i16> %a1) {
-; SSE-LABEL: test_x86_sse2_padds_w:
-; SSE: ## %bb.0:
-; SSE-NEXT: paddsw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xed,0xc1]
-; SSE-NEXT: retl ## encoding: [0xc3]
-;
-; AVX2-LABEL: test_x86_sse2_padds_w:
-; AVX2: ## %bb.0:
-; AVX2-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xed,0xc1]
-; AVX2-NEXT: retl ## encoding: [0xc3]
-;
-; SKX-LABEL: test_x86_sse2_padds_w:
-; SKX: ## %bb.0:
-; SKX-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xed,0xc1]
-; SKX-NEXT: retl ## encoding: [0xc3]
- %res = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
- ret <8 x i16> %res
-}
-declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16>, <8 x i16>) nounwind readnone
-
-
-define <16 x i8> @test_x86_sse2_paddus_b(<16 x i8> %a0, <16 x i8> %a1) {
-; SSE-LABEL: test_x86_sse2_paddus_b:
-; SSE: ## %bb.0:
-; SSE-NEXT: paddusb %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xdc,0xc1]
-; SSE-NEXT: retl ## encoding: [0xc3]
-;
-; AVX2-LABEL: test_x86_sse2_paddus_b:
-; AVX2: ## %bb.0:
-; AVX2-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xdc,0xc1]
-; AVX2-NEXT: retl ## encoding: [0xc3]
-;
-; SKX-LABEL: test_x86_sse2_paddus_b:
-; SKX: ## %bb.0:
-; SKX-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdc,0xc1]
-; SKX-NEXT: retl ## encoding: [0xc3]
- %res = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
- ret <16 x i8> %res
-}
-declare <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8>, <16 x i8>) nounwind readnone
-
-
-define <8 x i16> @test_x86_sse2_paddus_w(<8 x i16> %a0, <8 x i16> %a1) {
-; SSE-LABEL: test_x86_sse2_paddus_w:
-; SSE: ## %bb.0:
-; SSE-NEXT: paddusw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xdd,0xc1]
-; SSE-NEXT: retl ## encoding: [0xc3]
-;
-; AVX2-LABEL: test_x86_sse2_paddus_w:
-; AVX2: ## %bb.0:
-; AVX2-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xdd,0xc1]
-; AVX2-NEXT: retl ## encoding: [0xc3]
-;
-; SKX-LABEL: test_x86_sse2_paddus_w:
-; SKX: ## %bb.0:
-; SKX-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdd,0xc1]
-; SKX-NEXT: retl ## encoding: [0xc3]
- %res = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
- ret <8 x i16> %res
-}
-declare <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16>, <8 x i16>) nounwind readnone
-
-
-define <16 x i8> @test_x86_sse2_psubs_b(<16 x i8> %a0, <16 x i8> %a1) {
-; SSE-LABEL: test_x86_sse2_psubs_b:
-; SSE: ## %bb.0:
-; SSE-NEXT: psubsb %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xe8,0xc1]
-; SSE-NEXT: retl ## encoding: [0xc3]
-;
-; AVX2-LABEL: test_x86_sse2_psubs_b:
-; AVX2: ## %bb.0:
-; AVX2-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xe8,0xc1]
-; AVX2-NEXT: retl ## encoding: [0xc3]
-;
-; SKX-LABEL: test_x86_sse2_psubs_b:
-; SKX: ## %bb.0:
-; SKX-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe8,0xc1]
-; SKX-NEXT: retl ## encoding: [0xc3]
- %res = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
- ret <16 x i8> %res
-}
-declare <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8>, <16 x i8>) nounwind readnone
-
-
-define <8 x i16> @test_x86_sse2_psubs_w(<8 x i16> %a0, <8 x i16> %a1) {
-; SSE-LABEL: test_x86_sse2_psubs_w:
-; SSE: ## %bb.0:
-; SSE-NEXT: psubsw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xe9,0xc1]
-; SSE-NEXT: retl ## encoding: [0xc3]
-;
-; AVX2-LABEL: test_x86_sse2_psubs_w:
-; AVX2: ## %bb.0:
-; AVX2-NEXT: vpsubsw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xe9,0xc1]
-; AVX2-NEXT: retl ## encoding: [0xc3]
-;
-; SKX-LABEL: test_x86_sse2_psubs_w:
-; SKX: ## %bb.0:
-; SKX-NEXT: vpsubsw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe9,0xc1]
-; SKX-NEXT: retl ## encoding: [0xc3]
- %res = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
- ret <8 x i16> %res
-}
-declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone
-
-
-define <16 x i8> @test_x86_sse2_psubus_b(<16 x i8> %a0, <16 x i8> %a1) {
-; SSE-LABEL: test_x86_sse2_psubus_b:
-; SSE: ## %bb.0:
-; SSE-NEXT: psubusb %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xd8,0xc1]
-; SSE-NEXT: retl ## encoding: [0xc3]
-;
-; AVX2-LABEL: test_x86_sse2_psubus_b:
-; AVX2: ## %bb.0:
-; AVX2-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xd8,0xc1]
-; AVX2-NEXT: retl ## encoding: [0xc3]
-;
-; SKX-LABEL: test_x86_sse2_psubus_b:
-; SKX: ## %bb.0:
-; SKX-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd8,0xc1]
-; SKX-NEXT: retl ## encoding: [0xc3]
- %res = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
- ret <16 x i8> %res
-}
-declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnone
-
-
-define <8 x i16> @test_x86_sse2_psubus_w(<8 x i16> %a0, <8 x i16> %a1) {
-; SSE-LABEL: test_x86_sse2_psubus_w:
-; SSE: ## %bb.0:
-; SSE-NEXT: psubusw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xd9,0xc1]
-; SSE-NEXT: retl ## encoding: [0xc3]
-;
-; AVX2-LABEL: test_x86_sse2_psubus_w:
-; AVX2: ## %bb.0:
-; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xd9,0xc1]
-; AVX2-NEXT: retl ## encoding: [0xc3]
-;
-; SKX-LABEL: test_x86_sse2_psubus_w:
-; SKX: ## %bb.0:
-; SKX-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd9,0xc1]
-; SKX-NEXT: retl ## encoding: [0xc3]
- %res = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
- ret <8 x i16> %res
-}
-declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone
 
 define <2 x i64> @test_x86_sse2_pmulu_dq(<4 x i32> %a0, <4 x i32> %a1) {
 ; CHECK-LABEL: test_x86_sse2_pmulu_dq:
diff --git a/test/CodeGen/X86/sse2-intrinsics-x86.ll b/test/CodeGen/X86/sse2-intrinsics-x86.ll
index 25b827076f7..ba787cb58b3 100644
--- a/test/CodeGen/X86/sse2-intrinsics-x86.ll
+++ b/test/CodeGen/X86/sse2-intrinsics-x86.ll
@@ -882,6 +882,90 @@ define <16 x i8> @test_x86_sse2_packuswb_128_fold() {
 }
 
 
+define <16 x i8> @test_x86_sse2_padds_b(<16 x i8> %a0, <16 x i8> %a1) {
+; SSE-LABEL: test_x86_sse2_padds_b:
+; SSE: ## %bb.0:
+; SSE-NEXT: paddsb %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xec,0xc1]
+; SSE-NEXT: retl ## encoding: [0xc3]
+;
+; AVX2-LABEL: test_x86_sse2_padds_b:
+; AVX2: ## %bb.0:
+; AVX2-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xec,0xc1]
+; AVX2-NEXT: retl ## encoding: [0xc3]
+;
+; SKX-LABEL: test_x86_sse2_padds_b:
+; SKX: ## %bb.0:
+; SKX-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xec,0xc1]
+; SKX-NEXT: retl ## encoding: [0xc3]
+ %res = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
+ ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) nounwind readnone
+
+
+define <8 x i16> @test_x86_sse2_padds_w(<8 x i16> %a0, <8 x i16> %a1) {
+; SSE-LABEL: test_x86_sse2_padds_w:
+; SSE: ## %bb.0:
+; SSE-NEXT: paddsw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xed,0xc1]
+; SSE-NEXT: retl ## encoding: [0xc3]
+;
+; AVX2-LABEL: test_x86_sse2_padds_w:
+; AVX2: ## %bb.0:
+; AVX2-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xed,0xc1]
+; AVX2-NEXT: retl ## encoding: [0xc3]
+;
+; SKX-LABEL: test_x86_sse2_padds_w:
+; SKX: ## %bb.0:
+; SKX-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xed,0xc1]
+; SKX-NEXT: retl ## encoding: [0xc3]
+ %res = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
+ ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16>, <8 x i16>) nounwind readnone
+
+
+define <16 x i8> @test_x86_sse2_paddus_b(<16 x i8> %a0, <16 x i8> %a1) {
+; SSE-LABEL: test_x86_sse2_paddus_b:
+; SSE: ## %bb.0:
+; SSE-NEXT: paddusb %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xdc,0xc1]
+; SSE-NEXT: retl ## encoding: [0xc3]
+;
+; AVX2-LABEL: test_x86_sse2_paddus_b:
+; AVX2: ## %bb.0:
+; AVX2-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xdc,0xc1]
+; AVX2-NEXT: retl ## encoding: [0xc3]
+;
+; SKX-LABEL: test_x86_sse2_paddus_b:
+; SKX: ## %bb.0:
+; SKX-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdc,0xc1]
+; SKX-NEXT: retl ## encoding: [0xc3]
+ %res = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
+ ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8>, <16 x i8>) nounwind readnone
+
+
+define <8 x i16> @test_x86_sse2_paddus_w(<8 x i16> %a0, <8 x i16> %a1) {
+; SSE-LABEL: test_x86_sse2_paddus_w:
+; SSE: ## %bb.0:
+; SSE-NEXT: paddusw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xdd,0xc1]
+; SSE-NEXT: retl ## encoding: [0xc3]
+;
+; AVX2-LABEL: test_x86_sse2_paddus_w:
+; AVX2: ## %bb.0:
+; AVX2-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xdd,0xc1]
+; AVX2-NEXT: retl ## encoding: [0xc3]
+;
+; SKX-LABEL: test_x86_sse2_paddus_w:
+; SKX: ## %bb.0:
+; SKX-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdd,0xc1]
+; SKX-NEXT: retl ## encoding: [0xc3]
+ %res = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
+ ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16>, <8 x i16>) nounwind readnone
+
+
 define <4 x i32> @test_x86_sse2_pmadd_wd(<8 x i16> %a0, <8 x i16> %a1) {
 ; SSE-LABEL: test_x86_sse2_pmadd_wd:
 ; SSE: ## %bb.0:
@@ -1402,6 +1486,90 @@ define <8 x i16> @test_x86_sse2_psrli_w(<8 x i16> %a0) {
 declare <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16>, i32) nounwind readnone
 
 
+define <16 x i8> @test_x86_sse2_psubs_b(<16 x i8> %a0, <16 x i8> %a1) {
+; SSE-LABEL: test_x86_sse2_psubs_b:
+; SSE: ## %bb.0:
+; SSE-NEXT: psubsb %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xe8,0xc1]
+; SSE-NEXT: retl ## encoding: [0xc3]
+;
+; AVX2-LABEL: test_x86_sse2_psubs_b:
+; AVX2: ## %bb.0:
+; AVX2-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xe8,0xc1]
+; AVX2-NEXT: retl ## encoding: [0xc3]
+;
+; SKX-LABEL: test_x86_sse2_psubs_b:
+; SKX: ## %bb.0:
+; SKX-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe8,0xc1]
+; SKX-NEXT: retl ## encoding: [0xc3]
+ %res = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
+ ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8>, <16 x i8>) nounwind readnone
+
+
+define <8 x i16> @test_x86_sse2_psubs_w(<8 x i16> %a0, <8 x i16> %a1) {
+; SSE-LABEL: test_x86_sse2_psubs_w:
+; SSE: ## %bb.0:
+; SSE-NEXT: psubsw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xe9,0xc1]
+; SSE-NEXT: retl ## encoding: [0xc3]
+;
+; AVX2-LABEL: test_x86_sse2_psubs_w:
+; AVX2: ## %bb.0:
+; AVX2-NEXT: vpsubsw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xe9,0xc1]
+; AVX2-NEXT: retl ## encoding: [0xc3]
+;
+; SKX-LABEL: test_x86_sse2_psubs_w:
+; SKX: ## %bb.0:
+; SKX-NEXT: vpsubsw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe9,0xc1]
+; SKX-NEXT: retl ## encoding: [0xc3]
+ %res = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
+ ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone
+
+
+define <16 x i8> @test_x86_sse2_psubus_b(<16 x i8> %a0, <16 x i8> %a1) {
+; SSE-LABEL: test_x86_sse2_psubus_b:
+; SSE: ## %bb.0:
+; SSE-NEXT: psubusb %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xd8,0xc1]
+; SSE-NEXT: retl ## encoding: [0xc3]
+;
+; AVX2-LABEL: test_x86_sse2_psubus_b:
+; AVX2: ## %bb.0:
+; AVX2-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xd8,0xc1]
+; AVX2-NEXT: retl ## encoding: [0xc3]
+;
+; SKX-LABEL: test_x86_sse2_psubus_b:
+; SKX: ## %bb.0:
+; SKX-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd8,0xc1]
+; SKX-NEXT: retl ## encoding: [0xc3]
+ %res = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
+ ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnone
+
+
+define <8 x i16> @test_x86_sse2_psubus_w(<8 x i16> %a0, <8 x i16> %a1) {
+; SSE-LABEL: test_x86_sse2_psubus_w:
+; SSE: ## %bb.0:
+; SSE-NEXT: psubusw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xd9,0xc1]
+; SSE-NEXT: retl ## encoding: [0xc3]
+;
+; AVX2-LABEL: test_x86_sse2_psubus_w:
+; AVX2: ## %bb.0:
+; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xd9,0xc1]
+; AVX2-NEXT: retl ## encoding: [0xc3]
+;
+; SKX-LABEL: test_x86_sse2_psubus_w:
+; SKX: ## %bb.0:
+; SKX-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd9,0xc1]
+; SKX-NEXT: retl ## encoding: [0xc3]
+ %res = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
+ ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone
+
+
 define <2 x double> @test_x86_sse2_sqrt_pd(<2 x double> %a0) {
 ; SSE-LABEL: test_x86_sse2_sqrt_pd:
 ; SSE: ## %bb.0:
diff --git a/test/CodeGen/X86/sse2-schedule.ll b/test/CodeGen/X86/sse2-schedule.ll
index b1bd9a83726..e6e2cd94f30 100644
--- a/test/CodeGen/X86/sse2-schedule.ll
+++ b/test/CodeGen/X86/sse2-schedule.ll
@@ -7122,7 +7122,6 @@ define <16 x i8> @test_paddsb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
 }
 declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) nounwind readnone
 
-
 define <8 x i16> @test_paddsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ; GENERIC-LABEL: test_paddsw:
 ; GENERIC: # %bb.0:
@@ -7229,25 +7228,12 @@ define <8 x i16> @test_paddsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ; ZNVER1-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
 ; ZNVER1-NEXT: vpaddsw (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
 ; ZNVER1-NEXT: retq # sched: [1:0.50]
- %1 = sext <8 x i16> %a0 to <8 x i32>
- %2 = sext <8 x i16> %a1 to <8 x i32>
- %3 = add nsw <8 x i32> %1, %2
- %4 = icmp slt <8 x i32> %3,
- %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32>
- %6 = icmp sgt <8 x i32> %5,
- %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32>
- %8 = trunc <8 x i32> %7 to <8 x i16>
- %9 = load <8 x i16>, <8 x i16> *%a2, align 16
- %10 = sext <8 x i16> %8 to <8 x i32>
- %11 = sext <8 x i16> %9 to <8 x i32>
- %12 = add nsw <8 x i32> %10, %11
- %13 = icmp slt <8 x i32> %12,
- %14 = select <8 x i1> %13, <8 x i32> %12, <8 x i32>
- %15 = icmp sgt <8 x i32> %14,
- %16 = select <8 x i1> %15, <8 x i32> %14, <8 x i32>
- %17 = trunc <8 x i32> %16 to <8 x i16>
- ret <8 x i16> %17
+ %1 = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %a0, <8 x i16> %a1)
+ %2 = load <8 x i16>, <8 x i16> *%a2, align 16
+ %3 = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %1, <8 x i16> %2)
+ ret <8 x i16> %3
 }
+declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16>, <8 x i16>) nounwind readnone
define <16 x i8> @test_paddusb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { ; GENERIC-LABEL: test_paddusb: @@ -7355,21 +7341,12 @@ define <16 x i8> @test_paddusb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { ; ZNVER1-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 # sched: [1:0.25] ; ZNVER1-NEXT: vpaddusb (%rdi), %xmm0, %xmm0 # sched: [8:0.50] ; ZNVER1-NEXT: retq # sched: [1:0.50] - %1 = zext <16 x i8> %a0 to <16 x i16> - %2 = zext <16 x i8> %a1 to <16 x i16> - %3 = add nsw <16 x i16> %1, %2 - %4 = icmp ult <16 x i16> %3, - %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> - %6 = trunc <16 x i16> %5 to <16 x i8> - %7 = load <16 x i8>, <16 x i8> *%a2, align 16 - %8 = zext <16 x i8> %6 to <16 x i16> - %9 = zext <16 x i8> %7 to <16 x i16> - %10 = add nsw <16 x i16> %8, %9 - %11 = icmp ult <16 x i16> %10, - %12 = select <16 x i1> %11, <16 x i16> %10, <16 x i16> - %13 = trunc <16 x i16> %12 to <16 x i8> - ret <16 x i8> %13 + %1 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %a0, <16 x i8> %a1) + %2 = load <16 x i8>, <16 x i8> *%a2, align 16 + %3 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %1, <16 x i8> %2) + ret <16 x i8> %3 } +declare <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8>, <16 x i8>) nounwind readnone define <8 x i16> @test_paddusw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; GENERIC-LABEL: test_paddusw: @@ -7477,21 +7454,12 @@ define <8 x i16> @test_paddusw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; ZNVER1-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] ; ZNVER1-NEXT: vpaddusw (%rdi), %xmm0, %xmm0 # sched: [8:0.50] ; ZNVER1-NEXT: retq # sched: [1:0.50] - %1 = zext <8 x i16> %a0 to <8 x i32> - %2 = zext <8 x i16> %a1 to <8 x i32> - %3 = add nsw <8 x i32> %1, %2 - %4 = icmp ult <8 x i32> %3, - %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> - %6 = trunc <8 x i32> %5 to <8 x i16> - %7 = load <8 x i16>, <8 x i16> *%a2, align 16 - %8 = zext <8 x i16> %6 to <8 x i32> - %9 = zext <8 x i16> %7 to <8 x i32> - %10 = add nsw <8 x i32> %8, %9 - %11 = icmp ult <8 x i32> %10, - %12 = select <8 x i1> %11, <8 x i32> %10, <8 x i32> - %13 = trunc <8 x i32> %12 to <8 x i16> - ret <8 x i16> %13 + %1 = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %a0, <8 x i16> %a1) + %2 = load <8 x i16>, <8 x i16> *%a2, align 16 + %3 = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %1, <8 x i16> %2) + ret <8 x i16> %3 } +declare <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16>, <8 x i16>) nounwind readnone define <8 x i16> @test_paddw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; GENERIC-LABEL: test_paddw: @@ -12617,25 +12585,12 @@ define <8 x i16> @test_psubsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; ZNVER1-NEXT: vpsubsw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] ; ZNVER1-NEXT: vpsubsw (%rdi), %xmm0, %xmm0 # sched: [8:0.50] ; ZNVER1-NEXT: retq # sched: [1:0.50] - %1 = sext <8 x i16> %a0 to <8 x i32> - %2 = sext <8 x i16> %a1 to <8 x i32> - %3 = sub nsw <8 x i32> %1, %2 - %4 = icmp slt <8 x i32> %3, - %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> - %6 = icmp sgt <8 x i32> %5, - %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32> - %8 = trunc <8 x i32> %7 to <8 x i16> - %9 = load <8 x i16>, <8 x i16> *%a2, align 16 - %10 = sext <8 x i16> %8 to <8 x i32> - %11 = sext <8 x i16> %9 to <8 x i32> - %12 = sub nsw <8 x i32> %10, %11 - %13 = icmp slt <8 x i32> %12, - %14 = select <8 x i1> %13, <8 x i32> %12, <8 x i32> - %15 = icmp sgt <8 x i32> %14, - %16 = select <8 x i1> %15, <8 x i32> %14, <8 x i32> - %17 = trunc <8 x i32> %16 to <8 x i16> - ret <8 x i16> %17 + %1 = call <8 x i16> 
@llvm.x86.sse2.psubs.w(<8 x i16> %a0, <8 x i16> %a1) + %2 = load <8 x i16>, <8 x i16> *%a2, align 16 + %3 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %1, <8 x i16> %2) + ret <8 x i16> %3 } +declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone define <16 x i8> @test_psubusb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { ; GENERIC-LABEL: test_psubusb: @@ -12743,15 +12698,12 @@ define <16 x i8> @test_psubusb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { ; ZNVER1-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 # sched: [1:0.25] ; ZNVER1-NEXT: vpsubusb (%rdi), %xmm0, %xmm0 # sched: [8:0.50] ; ZNVER1-NEXT: retq # sched: [1:0.50] - %1 = icmp ugt <16 x i8> %a0, %a1 - %2 = select <16 x i1> %1, <16 x i8> %a0, <16 x i8> %a1 - %3 = sub <16 x i8> %2, %a1 - %4 = load <16 x i8>, <16 x i8> *%a2, align 16 - %5 = icmp ugt <16 x i8> %3, %4 - %6 = select <16 x i1> %5, <16 x i8> %3, <16 x i8> %4 - %7 = sub <16 x i8> %6, %4 - ret <16 x i8> %7 + %1 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %a0, <16 x i8> %a1) + %2 = load <16 x i8>, <16 x i8> *%a2, align 16 + %3 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %1, <16 x i8> %2) + ret <16 x i8> %3 } +declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnone define <8 x i16> @test_psubusw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; GENERIC-LABEL: test_psubusw: @@ -12859,15 +12811,12 @@ define <8 x i16> @test_psubusw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; ZNVER1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] ; ZNVER1-NEXT: vpsubusw (%rdi), %xmm0, %xmm0 # sched: [8:0.50] ; ZNVER1-NEXT: retq # sched: [1:0.50] - %1 = icmp ugt <8 x i16> %a0, %a1 - %2 = select <8 x i1> %1, <8 x i16> %a0, <8 x i16> %a1 - %3 = sub <8 x i16> %2, %a1 - %4 = load <8 x i16>, <8 x i16> *%a2, align 16 - %5 = icmp ugt <8 x i16> %3, %4 - %6 = select <8 x i1> %5, <8 x i16> %3, <8 x i16> %4 - %7 = sub <8 x i16> %6, %4 - ret <8 x i16> %7 + %1 = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %a0, <8 x i16> %a1) + %2 = load <8 x i16>, <8 x i16> *%a2, align 16 + %3 = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %1, <8 x i16> %2) + ret <8 x i16> %3 } +declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone define <8 x i16> @test_psubw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; GENERIC-LABEL: test_psubw: diff --git a/test/CodeGen/X86/vector-arith-sat.ll b/test/CodeGen/X86/vector-arith-sat.ll deleted file mode 100644 index ef34843f969..00000000000 --- a/test/CodeGen/X86/vector-arith-sat.ll +++ /dev/null @@ -1,3025 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=avx2 | FileCheck %s --check-prefix=AVX2 --check-prefix=X86 --check-prefix=X86-AVX -; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq -show-mc-encoding | FileCheck %s --check-prefix=AVX512VL --check-prefix=X86 --check-prefix=X86-AVX512VL -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx2 | FileCheck %s --check-prefix=AVX2 --check-prefix=X64 --check-prefix=X64-AVX -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq -show-mc-encoding | FileCheck %s --check-prefix=AVX512VL --check-prefix=X64 --check-prefix=X64-AVX512VL -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512BW -; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL 
--check-prefix=AVX512F-32 -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw -mattr=+avx512vl --show-mc-encoding | FileCheck %s -; RUN: llc < %s -disable-peephole -mtriple=i386-apple-darwin -mattr=-avx,+sse2 | FileCheck %s --check-prefix=SSE - -define <32 x i8> @test_x86_avx2_padds_b(<32 x i8> %a0, <32 x i8> %a1) { -; AVX2-LABEL: test_x86_avx2_padds_b: -; AVX2: ## %bb.0: -; AVX2-NEXT: vpaddsb %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: ret{{[l|q]}} -; -; AVX512VL-LABEL: test_x86_avx2_padds_b: -; AVX512VL: ## %bb.0: -; AVX512VL-NEXT: vpaddsb %ymm1, %ymm0, %ymm0 -; AVX512VL-NEXT: ret{{[l|q]}} - %1 = sext <32 x i8> %a0 to <32 x i16> - %2 = sext <32 x i8> %a1 to <32 x i16> - %3 = add nsw <32 x i16> %1, %2 - %4 = icmp slt <32 x i16> %3, - %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> - %6 = icmp sgt <32 x i16> %5, - %7 = select <32 x i1> %6, <32 x i16> %5, <32 x i16> - %8 = trunc <32 x i16> %7 to <32 x i8> - ret <32 x i8> %8 -} - - -define <16 x i16> @test_x86_avx2_padds_w(<16 x i16> %a0, <16 x i16> %a1) { -; AVX2-LABEL: test_x86_avx2_padds_w: -; AVX2: ## %bb.0: -; AVX2-NEXT: vpaddsw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: ret{{[l|q]}} -; -; AVX512VL-LABEL: test_x86_avx2_padds_w: -; AVX512VL: ## %bb.0: -; AVX512VL-NEXT: vpaddsw %ymm1, %ymm0, %ymm0 -; AVX512VL-NEXT: ret{{[l|q]}} - %1 = sext <16 x i16> %a0 to <16 x i32> - %2 = sext <16 x i16> %a1 to <16 x i32> - %3 = add nsw <16 x i32> %1, %2 - %4 = icmp slt <16 x i32> %3, - %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> - %6 = icmp sgt <16 x i32> %5, - %7 = select <16 x i1> %6, <16 x i32> %5, <16 x i32> - %8 = trunc <16 x i32> %7 to <16 x i16> - ret <16 x i16> %8 -} - - -define <32 x i16> @test_mask_adds_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) { -; X86-AVX-LABEL: test_mask_adds_epi16_rr_512: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpaddsw %ymm2, %ymm0, %ymm0 -; X86-AVX-NEXT: vpaddsw %ymm3, %ymm1, %ymm1 -; X86-AVX-NEXT: retl -; -; X86-AVX512VL-LABEL: test_mask_adds_epi16_rr_512: -; X86-AVX512VL: ## %bb.0: -; X86-AVX512VL-NEXT: vpaddsw %zmm1, %zmm0, %zmm0 -; X86-AVX512VL-NEXT: retl -; -; X64-AVX-LABEL: test_mask_adds_epi16_rr_512: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpaddsw %ymm2, %ymm0, %ymm0 -; X64-AVX-NEXT: vpaddsw %ymm3, %ymm1, %ymm1 -; X64-AVX-NEXT: retq -; -; X64-AVX512VL-LABEL: test_mask_adds_epi16_rr_512: -; X64-AVX512VL: ## %bb.0: -; X64-AVX512VL-NEXT: vpaddsw %zmm1, %zmm0, %zmm0 -; X64-AVX512VL-NEXT: retq - %1 = sext <32 x i16> %a to <32 x i32> - %2 = sext <32 x i16> %b to <32 x i32> - %3 = add nsw <32 x i32> %1, %2 - %4 = icmp slt <32 x i32> %3, - %5 = select <32 x i1> %4, <32 x i32> %3, <32 x i32> - %6 = icmp sgt <32 x i32> %5, - %7 = select <32 x i1> %6, <32 x i32> %5, <32 x i32> - %8 = trunc <32 x i32> %7 to <32 x i16> - ret <32 x i16> %8 -} - - -define <32 x i8> @test_x86_avx2_paddus_b(<32 x i8> %a0, <32 x i8> %a1) { -; AVX2-LABEL: test_x86_avx2_paddus_b: -; AVX2: ## %bb.0: -; AVX2-NEXT: vpaddusb %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: ret{{[l|q]}} -; -; AVX512VL-LABEL: test_x86_avx2_paddus_b: -; AVX512VL: ## %bb.0: -; AVX512VL-NEXT: vpaddusb %ymm1, %ymm0, %ymm0 -; AVX512VL-NEXT: ret{{[l|q]}} - %1 = zext <32 x i8> %a0 to <32 x i16> - %2 = zext <32 x i8> %a1 to <32 x i16> - %3 = add nsw <32 x i16> %1, %2 - %4 = icmp ult <32 x i16> %3, - %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> - %6 = trunc <32 x i16> %5 to <32 x i8> - ret <32 x i8> %6 -} - - -define <16 x i16> @test_x86_avx2_paddus_w(<16 x i16> %a0, <16 x i16> %a1) { -; AVX2-LABEL: test_x86_avx2_paddus_w: -; AVX2: ## %bb.0: -; AVX2-NEXT: vpaddusw %ymm1, 
%ymm0, %ymm0 -; AVX2-NEXT: ret{{[l|q]}} -; -; AVX512VL-LABEL: test_x86_avx2_paddus_w: -; AVX512VL: ## %bb.0: -; AVX512VL-NEXT: vpaddusw %ymm1, %ymm0, %ymm0 -; AVX512VL-NEXT: ret{{[l|q]}} - %1 = zext <16 x i16> %a0 to <16 x i32> - %2 = zext <16 x i16> %a1 to <16 x i32> - %3 = add nsw <16 x i32> %1, %2 - %4 = icmp ult <16 x i32> %3, - %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> - %6 = trunc <16 x i32> %5 to <16 x i16> - ret <16 x i16> %6 -} - - -define <32 x i16> @test_mask_adds_epu16_rr_512(<32 x i16> %a, <32 x i16> %b) { -; X86-AVX-LABEL: test_mask_adds_epu16_rr_512: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpaddusw %ymm2, %ymm0, %ymm0 -; X86-AVX-NEXT: vpaddusw %ymm3, %ymm1, %ymm1 -; X86-AVX-NEXT: retl -; -; X86-AVX512VL-LABEL: test_mask_adds_epu16_rr_512: -; X86-AVX512VL: ## %bb.0: -; X86-AVX512VL-NEXT: vpaddusw %zmm1, %zmm0, %zmm0 -; X86-AVX512VL-NEXT: retl -; -; X64-AVX-LABEL: test_mask_adds_epu16_rr_512: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpaddusw %ymm2, %ymm0, %ymm0 -; X64-AVX-NEXT: vpaddusw %ymm3, %ymm1, %ymm1 -; X64-AVX-NEXT: retq -; -; X64-AVX512VL-LABEL: test_mask_adds_epu16_rr_512: -; X64-AVX512VL: ## %bb.0: -; X64-AVX512VL-NEXT: vpaddusw %zmm1, %zmm0, %zmm0 -; X64-AVX512VL-NEXT: retq - %1 = zext <32 x i16> %a to <32 x i32> - %2 = zext <32 x i16> %b to <32 x i32> - %3 = add nsw <32 x i32> %1, %2 - %4 = icmp ult <32 x i32> %3, - %5 = select <32 x i1> %4, <32 x i32> %3, <32 x i32> - %6 = trunc <32 x i32> %5 to <32 x i16> - ret <32 x i16> %6 -} - -define <32 x i8> @test_x86_avx2_psubs_b(<32 x i8> %a0, <32 x i8> %a1) { -; AVX2-LABEL: test_x86_avx2_psubs_b: -; AVX2: ## %bb.0: -; AVX2-NEXT: vpsubsb %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: ret{{[l|q]}} -; -; AVX512VL-LABEL: test_x86_avx2_psubs_b: -; AVX512VL: ## %bb.0: -; AVX512VL-NEXT: vpsubsb %ymm1, %ymm0, %ymm0 -; AVX512VL-NEXT: ret{{[l|q]}} - %1 = sext <32 x i8> %a0 to <32 x i16> - %2 = sext <32 x i8> %a1 to <32 x i16> - %3 = sub nsw <32 x i16> %1, %2 - %4 = icmp slt <32 x i16> %3, - %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> - %6 = icmp sgt <32 x i16> %5, - %7 = select <32 x i1> %6, <32 x i16> %5, <32 x i16> - %8 = trunc <32 x i16> %7 to <32 x i8> - ret <32 x i8> %8 -} - - -define <16 x i16> @test_x86_avx2_psubs_w(<16 x i16> %a0, <16 x i16> %a1) { -; AVX2-LABEL: test_x86_avx2_psubs_w: -; AVX2: ## %bb.0: -; AVX2-NEXT: vpsubsw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: ret{{[l|q]}} -; -; AVX512VL-LABEL: test_x86_avx2_psubs_w: -; AVX512VL: ## %bb.0: -; AVX512VL-NEXT: vpsubsw %ymm1, %ymm0, %ymm0 -; AVX512VL-NEXT: ret{{[l|q]}} - %1 = sext <16 x i16> %a0 to <16 x i32> - %2 = sext <16 x i16> %a1 to <16 x i32> - %3 = sub nsw <16 x i32> %1, %2 - %4 = icmp slt <16 x i32> %3, - %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> - %6 = icmp sgt <16 x i32> %5, - %7 = select <16 x i1> %6, <16 x i32> %5, <16 x i32> - %8 = trunc <16 x i32> %7 to <16 x i16> - ret <16 x i16> %8 -} - - -define <32 x i16> @test_mask_subs_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) { -; X86-AVX-LABEL: test_mask_subs_epi16_rr_512: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpsubsw %ymm2, %ymm0, %ymm0 -; X86-AVX-NEXT: vpsubsw %ymm3, %ymm1, %ymm1 -; X86-AVX-NEXT: retl -; -; X86-AVX512VL-LABEL: test_mask_subs_epi16_rr_512: -; X86-AVX512VL: ## %bb.0: -; X86-AVX512VL-NEXT: vpsubsw %zmm1, %zmm0, %zmm0 -; X86-AVX512VL-NEXT: retl -; -; X64-AVX-LABEL: test_mask_subs_epi16_rr_512: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpsubsw %ymm2, %ymm0, %ymm0 -; X64-AVX-NEXT: vpsubsw %ymm3, %ymm1, %ymm1 -; X64-AVX-NEXT: retq -; -; X64-AVX512VL-LABEL: test_mask_subs_epi16_rr_512: -; 
-; X64-AVX512VL-NEXT: vpsubsw %zmm1, %zmm0, %zmm0
-; X64-AVX512VL-NEXT: retq
-  %1 = sext <32 x i16> %a to <32 x i32>
-  %2 = sext <32 x i16> %b to <32 x i32>
-  %3 = sub nsw <32 x i32> %1, %2
-  %4 = icmp slt <32 x i32> %3, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
-  %5 = select <32 x i1> %4, <32 x i32> %3, <32 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
-  %6 = icmp sgt <32 x i32> %5, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
-  %7 = select <32 x i1> %6, <32 x i32> %5, <32 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
-  %8 = trunc <32 x i32> %7 to <32 x i16>
-  ret <32 x i16> %8
-}
-
-
-define <32 x i8> @test_x86_avx2_psubus_b(<32 x i8> %a0, <32 x i8> %a1) {
-; AVX2-LABEL: test_x86_avx2_psubus_b:
-; AVX2: ## %bb.0:
-; AVX2-NEXT: vpsubusb %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: ret{{[l|q]}}
-;
-; AVX512VL-LABEL: test_x86_avx2_psubus_b:
-; AVX512VL: ## %bb.0:
-; AVX512VL-NEXT: vpsubusb %ymm1, %ymm0, %ymm0
-; AVX512VL-NEXT: ret{{[l|q]}}
-  %cmp = icmp ugt <32 x i8> %a0, %a1
-  %sel = select <32 x i1> %cmp, <32 x i8> %a0, <32 x i8> %a1
-  %sub = sub <32 x i8> %sel, %a1
-  ret <32 x i8> %sub
-}
-
-
-define <16 x i16> @test_x86_avx2_psubus_w(<16 x i16> %a0, <16 x i16> %a1) {
-; AVX2-LABEL: test_x86_avx2_psubus_w:
-; AVX2: ## %bb.0:
-; AVX2-NEXT: vpsubusw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: ret{{[l|q]}}
-;
-; AVX512VL-LABEL: test_x86_avx2_psubus_w:
-; AVX512VL: ## %bb.0:
-; AVX512VL-NEXT: vpsubusw %ymm1, %ymm0, %ymm0
-; AVX512VL-NEXT: ret{{[l|q]}}
-  %cmp = icmp ugt <16 x i16> %a0, %a1
-  %sel = select <16 x i1> %cmp, <16 x i16> %a0, <16 x i16> %a1
-  %sub = sub <16 x i16> %sel, %a1
-  ret <16 x i16> %sub
-}
-
-
-define <32 x i16> @test_mask_subs_epu16_rr_512(<32 x i16> %a, <32 x i16> %b) {
-; X86-AVX-LABEL: test_mask_subs_epu16_rr_512:
-; X86-AVX: ## %bb.0:
-; X86-AVX-NEXT: vpsubusw %ymm2, %ymm0, %ymm0
-; X86-AVX-NEXT: vpsubusw %ymm3, %ymm1, %ymm1
-; X86-AVX-NEXT: retl
-;
-; X86-AVX512VL-LABEL: test_mask_subs_epu16_rr_512:
-; X86-AVX512VL: ## %bb.0:
-; X86-AVX512VL-NEXT: vpsubusw %zmm1, %zmm0, %zmm0
-; X86-AVX512VL-NEXT: retl
-;
-; X64-AVX-LABEL: test_mask_subs_epu16_rr_512:
-; X64-AVX: ## %bb.0:
-; X64-AVX-NEXT: vpsubusw %ymm2, %ymm0, %ymm0
-; X64-AVX-NEXT: vpsubusw %ymm3, %ymm1, %ymm1
-; X64-AVX-NEXT: retq
-;
-; X64-AVX512VL-LABEL: test_mask_subs_epu16_rr_512:
-; X64-AVX512VL: ## %bb.0:
-; X64-AVX512VL-NEXT: vpsubusw %zmm1, %zmm0, %zmm0
-; X64-AVX512VL-NEXT: retq
-  %cmp = icmp ugt <32 x i16> %a, %b
-  %sel = select <32 x i1> %cmp, <32 x i16> %a, <32 x i16> %b
-  %sub = sub <32 x i16> %sel, %b
-  ret <32 x i16> %sub
-}
-
-define <32 x i16> @test_mask_adds_epi16_rr_512_avx512(<32 x i16> %a, <32 x i16> %b) {
-; AVX512BW-LABEL: test_mask_adds_epi16_rr_512_avx512:
-; AVX512BW: ## %bb.0:
-; AVX512BW-NEXT: vpaddsw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: retq
-;
-; AVX512F-32-LABEL: test_mask_adds_epi16_rr_512_avx512:
-; AVX512F-32: # %bb.0:
-; AVX512F-32-NEXT: vpaddsw %zmm1, %zmm0, %zmm0
-; AVX512F-32-NEXT: retl
-  %1 = sext <32 x i16> %a to <32 x i32>
-  %2 = sext <32 x i16> %b to <32 x i32>
-  %3 = add nsw <32 x i32> %1, %2
-  %4 = icmp slt <32 x i32> %3, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
-  %5 = select <32 x i1> %4, <32 x i32> %3, <32 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
-  %6 = icmp sgt <32 x i32> %5, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
-  %7 = select <32 x i1> %6, <32 x i32> %5, <32 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
-  %8 = trunc <32 x i32> %7 to <32 x i16>
-  ret <32 x i16> %8
-}
-
-define <32 x i16> @test_mask_adds_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) {
-; AVX512BW-LABEL: test_mask_adds_epi16_rrk_512:
-; AVX512BW: ## %bb.0:
-; AVX512BW-NEXT: kmovd %edi, %k1
-; AVX512BW-NEXT: vpaddsw %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0
-; AVX512BW-NEXT: retq
-;
-; AVX512F-32-LABEL: test_mask_adds_epi16_rrk_512:
-; AVX512F-32: #
%bb.0: -; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 -; AVX512F-32-NEXT: vpaddsw %zmm1, %zmm0, %zmm2 {%k1} -; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-32-NEXT: retl - %1 = sext <32 x i16> %a to <32 x i32> - %2 = sext <32 x i16> %b to <32 x i32> - %3 = add nsw <32 x i32> %1, %2 - %4 = icmp slt <32 x i32> %3, - %5 = select <32 x i1> %4, <32 x i32> %3, <32 x i32> - %6 = icmp sgt <32 x i32> %5, - %7 = select <32 x i1> %6, <32 x i32> %5, <32 x i32> - %8 = trunc <32 x i32> %7 to <32 x i16> - %9 = bitcast i32 %mask to <32 x i1> - %10 = select <32 x i1> %9, <32 x i16> %8, <32 x i16> %passThru - ret <32 x i16> %10 -} - -define <32 x i16> @test_mask_adds_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) { -; AVX512BW-LABEL: test_mask_adds_epi16_rrkz_512: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: kmovd %edi, %k1 -; AVX512BW-NEXT: vpaddsw %zmm1, %zmm0, %zmm0 {%k1} {z} -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_mask_adds_epi16_rrkz_512: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 -; AVX512F-32-NEXT: vpaddsw %zmm1, %zmm0, %zmm0 {%k1} {z} -; AVX512F-32-NEXT: retl - %1 = sext <32 x i16> %a to <32 x i32> - %2 = sext <32 x i16> %b to <32 x i32> - %3 = add nsw <32 x i32> %1, %2 - %4 = icmp slt <32 x i32> %3, - %5 = select <32 x i1> %4, <32 x i32> %3, <32 x i32> - %6 = icmp sgt <32 x i32> %5, - %7 = select <32 x i1> %6, <32 x i32> %5, <32 x i32> - %8 = trunc <32 x i32> %7 to <32 x i16> - %9 = bitcast i32 %mask to <32 x i1> - %10 = select <32 x i1> %9, <32 x i16> %8, <32 x i16> zeroinitializer - ret <32 x i16> %10 -} - -define <32 x i16> @test_mask_adds_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) { -; AVX512BW-LABEL: test_mask_adds_epi16_rm_512: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: vpaddsw (%rdi), %zmm0, %zmm0 -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_mask_adds_epi16_rm_512: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: vpaddsw (%eax), %zmm0, %zmm0 -; AVX512F-32-NEXT: retl - %b = load <32 x i16>, <32 x i16>* %ptr_b - %1 = sext <32 x i16> %a to <32 x i32> - %2 = sext <32 x i16> %b to <32 x i32> - %3 = add nsw <32 x i32> %1, %2 - %4 = icmp slt <32 x i32> %3, - %5 = select <32 x i1> %4, <32 x i32> %3, <32 x i32> - %6 = icmp sgt <32 x i32> %5, - %7 = select <32 x i1> %6, <32 x i32> %5, <32 x i32> - %8 = trunc <32 x i32> %7 to <32 x i16> - ret <32 x i16> %8 -} - -define <32 x i16> @test_mask_adds_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) { -; AVX512BW-LABEL: test_mask_adds_epi16_rmk_512: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: kmovd %esi, %k1 -; AVX512BW-NEXT: vpaddsw (%rdi), %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_mask_adds_epi16_rmk_512: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 -; AVX512F-32-NEXT: vpaddsw (%eax), %zmm0, %zmm1 {%k1} -; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512F-32-NEXT: retl - %b = load <32 x i16>, <32 x i16>* %ptr_b - %1 = sext <32 x i16> %a to <32 x i32> - %2 = sext <32 x i16> %b to <32 x i32> - %3 = add nsw <32 x i32> %1, %2 - %4 = icmp slt <32 x i32> %3, - %5 = select <32 x i1> %4, <32 x i32> %3, <32 x i32> - %6 = icmp sgt <32 x i32> %5, - %7 = select <32 x i1> %6, <32 x i32> %5, <32 x i32> - %8 = trunc <32 x i32> %7 to <32 x i16> - %9 = bitcast i32 %mask to <32 x i1> - %10 = select <32 x i1> %9, <32 x i16> %8, <32 x i16> %passThru - ret <32 x i16> %10 -} - -define <32 x i16> 
@test_mask_adds_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) { -; AVX512BW-LABEL: test_mask_adds_epi16_rmkz_512: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: kmovd %esi, %k1 -; AVX512BW-NEXT: vpaddsw (%rdi), %zmm0, %zmm0 {%k1} {z} -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_mask_adds_epi16_rmkz_512: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 -; AVX512F-32-NEXT: vpaddsw (%eax), %zmm0, %zmm0 {%k1} {z} -; AVX512F-32-NEXT: retl - %b = load <32 x i16>, <32 x i16>* %ptr_b - %1 = sext <32 x i16> %a to <32 x i32> - %2 = sext <32 x i16> %b to <32 x i32> - %3 = add nsw <32 x i32> %1, %2 - %4 = icmp slt <32 x i32> %3, - %5 = select <32 x i1> %4, <32 x i32> %3, <32 x i32> - %6 = icmp sgt <32 x i32> %5, - %7 = select <32 x i1> %6, <32 x i32> %5, <32 x i32> - %8 = trunc <32 x i32> %7 to <32 x i16> - %9 = bitcast i32 %mask to <32 x i1> - %10 = select <32 x i1> %9, <32 x i16> %8, <32 x i16> zeroinitializer - ret <32 x i16> %10 -} - -define <64 x i16> @test_mask_adds_epi16_rr_1024(<64 x i16> %a, <64 x i16> %b) { -; AVX512BW-LABEL: test_mask_adds_epi16_rr_1024: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: vpaddsw %zmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vpaddsw %zmm3, %zmm1, %zmm1 -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_mask_adds_epi16_rr_1024: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: pushl %ebp -; AVX512F-32-NEXT: .cfi_def_cfa_offset 8 -; AVX512F-32-NEXT: .cfi_offset %ebp, -8 -; AVX512F-32-NEXT: movl %esp, %ebp -; AVX512F-32-NEXT: .cfi_def_cfa_register %ebp -; AVX512F-32-NEXT: andl $-64, %esp -; AVX512F-32-NEXT: subl $64, %esp -; AVX512F-32-NEXT: vpaddsw %zmm2, %zmm0, %zmm0 -; AVX512F-32-NEXT: vpaddsw 8(%ebp), %zmm1, %zmm1 -; AVX512F-32-NEXT: movl %ebp, %esp -; AVX512F-32-NEXT: popl %ebp -; AVX512F-32-NEXT: .cfi_def_cfa %esp, 4 -; AVX512F-32-NEXT: retl - %1 = sext <64 x i16> %a to <64 x i32> - %2 = sext <64 x i16> %b to <64 x i32> - %3 = add nsw <64 x i32> %1, %2 - %4 = icmp slt <64 x i32> %3, - %5 = select <64 x i1> %4, <64 x i32> %3, <64 x i32> - %6 = icmp sgt <64 x i32> %5, - %7 = select <64 x i1> %6, <64 x i32> %5, <64 x i32> - %8 = trunc <64 x i32> %7 to <64 x i16> - ret <64 x i16> %8 -} - -define <32 x i16> @test_mask_subs_epi16_rr_512_avx512(<32 x i16> %a, <32 x i16> %b) { -; AVX512BW-LABEL: test_mask_subs_epi16_rr_512_avx512: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: vpsubsw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_mask_subs_epi16_rr_512_avx512: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vpsubsw %zmm1, %zmm0, %zmm0 -; AVX512F-32-NEXT: retl - %1 = sext <32 x i16> %a to <32 x i32> - %2 = sext <32 x i16> %b to <32 x i32> - %3 = sub nsw <32 x i32> %1, %2 - %4 = icmp slt <32 x i32> %3, - %5 = select <32 x i1> %4, <32 x i32> %3, <32 x i32> - %6 = icmp sgt <32 x i32> %5, - %7 = select <32 x i1> %6, <32 x i32> %5, <32 x i32> - %8 = trunc <32 x i32> %7 to <32 x i16> - ret <32 x i16> %8 -} - -define <32 x i16> @test_mask_subs_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) { -; AVX512BW-LABEL: test_mask_subs_epi16_rrk_512: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: kmovd %edi, %k1 -; AVX512BW-NEXT: vpsubsw %zmm1, %zmm0, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_mask_subs_epi16_rrk_512: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 -; AVX512F-32-NEXT: vpsubsw %zmm1, %zmm0, %zmm2 {%k1} -; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-32-NEXT: retl - %1 = 
sext <32 x i16> %a to <32 x i32> - %2 = sext <32 x i16> %b to <32 x i32> - %3 = sub nsw <32 x i32> %1, %2 - %4 = icmp slt <32 x i32> %3, - %5 = select <32 x i1> %4, <32 x i32> %3, <32 x i32> - %6 = icmp sgt <32 x i32> %5, - %7 = select <32 x i1> %6, <32 x i32> %5, <32 x i32> - %8 = trunc <32 x i32> %7 to <32 x i16> - %9 = bitcast i32 %mask to <32 x i1> - %10 = select <32 x i1> %9, <32 x i16> %8, <32 x i16> %passThru - ret <32 x i16> %10 -} - -define <32 x i16> @test_mask_subs_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) { -; AVX512BW-LABEL: test_mask_subs_epi16_rrkz_512: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: kmovd %edi, %k1 -; AVX512BW-NEXT: vpsubsw %zmm1, %zmm0, %zmm0 {%k1} {z} -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_mask_subs_epi16_rrkz_512: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 -; AVX512F-32-NEXT: vpsubsw %zmm1, %zmm0, %zmm0 {%k1} {z} -; AVX512F-32-NEXT: retl - %1 = sext <32 x i16> %a to <32 x i32> - %2 = sext <32 x i16> %b to <32 x i32> - %3 = sub nsw <32 x i32> %1, %2 - %4 = icmp slt <32 x i32> %3, - %5 = select <32 x i1> %4, <32 x i32> %3, <32 x i32> - %6 = icmp sgt <32 x i32> %5, - %7 = select <32 x i1> %6, <32 x i32> %5, <32 x i32> - %8 = trunc <32 x i32> %7 to <32 x i16> - %9 = bitcast i32 %mask to <32 x i1> - %10 = select <32 x i1> %9, <32 x i16> %8, <32 x i16> zeroinitializer - ret <32 x i16> %10 -} - -define <32 x i16> @test_mask_subs_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) { -; AVX512BW-LABEL: test_mask_subs_epi16_rm_512: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: vpsubsw (%rdi), %zmm0, %zmm0 -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_mask_subs_epi16_rm_512: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: vpsubsw (%eax), %zmm0, %zmm0 -; AVX512F-32-NEXT: retl - %b = load <32 x i16>, <32 x i16>* %ptr_b - %1 = sext <32 x i16> %a to <32 x i32> - %2 = sext <32 x i16> %b to <32 x i32> - %3 = sub nsw <32 x i32> %1, %2 - %4 = icmp slt <32 x i32> %3, - %5 = select <32 x i1> %4, <32 x i32> %3, <32 x i32> - %6 = icmp sgt <32 x i32> %5, - %7 = select <32 x i1> %6, <32 x i32> %5, <32 x i32> - %8 = trunc <32 x i32> %7 to <32 x i16> - ret <32 x i16> %8 -} - -define <32 x i16> @test_mask_subs_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) { -; AVX512BW-LABEL: test_mask_subs_epi16_rmk_512: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: kmovd %esi, %k1 -; AVX512BW-NEXT: vpsubsw (%rdi), %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_mask_subs_epi16_rmk_512: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 -; AVX512F-32-NEXT: vpsubsw (%eax), %zmm0, %zmm1 {%k1} -; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512F-32-NEXT: retl - %b = load <32 x i16>, <32 x i16>* %ptr_b - %1 = sext <32 x i16> %a to <32 x i32> - %2 = sext <32 x i16> %b to <32 x i32> - %3 = sub nsw <32 x i32> %1, %2 - %4 = icmp slt <32 x i32> %3, - %5 = select <32 x i1> %4, <32 x i32> %3, <32 x i32> - %6 = icmp sgt <32 x i32> %5, - %7 = select <32 x i1> %6, <32 x i32> %5, <32 x i32> - %8 = trunc <32 x i32> %7 to <32 x i16> - %9 = bitcast i32 %mask to <32 x i1> - %10 = select <32 x i1> %9, <32 x i16> %8, <32 x i16> %passThru - ret <32 x i16> %10 -} - -define <32 x i16> @test_mask_subs_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) { -; AVX512BW-LABEL: test_mask_subs_epi16_rmkz_512: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: kmovd %esi, %k1 -; 
AVX512BW-NEXT: vpsubsw (%rdi), %zmm0, %zmm0 {%k1} {z} -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_mask_subs_epi16_rmkz_512: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 -; AVX512F-32-NEXT: vpsubsw (%eax), %zmm0, %zmm0 {%k1} {z} -; AVX512F-32-NEXT: retl - %b = load <32 x i16>, <32 x i16>* %ptr_b - %1 = sext <32 x i16> %a to <32 x i32> - %2 = sext <32 x i16> %b to <32 x i32> - %3 = sub nsw <32 x i32> %1, %2 - %4 = icmp slt <32 x i32> %3, - %5 = select <32 x i1> %4, <32 x i32> %3, <32 x i32> - %6 = icmp sgt <32 x i32> %5, - %7 = select <32 x i1> %6, <32 x i32> %5, <32 x i32> - %8 = trunc <32 x i32> %7 to <32 x i16> - %9 = bitcast i32 %mask to <32 x i1> - %10 = select <32 x i1> %9, <32 x i16> %8, <32 x i16> zeroinitializer - ret <32 x i16> %10 -} - -define <64 x i16> @test_mask_subs_epi16_rr_1024(<64 x i16> %a, <64 x i16> %b) { -; AVX512BW-LABEL: test_mask_subs_epi16_rr_1024: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: vpsubsw %zmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vpsubsw %zmm3, %zmm1, %zmm1 -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_mask_subs_epi16_rr_1024: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: pushl %ebp -; AVX512F-32-NEXT: .cfi_def_cfa_offset 8 -; AVX512F-32-NEXT: .cfi_offset %ebp, -8 -; AVX512F-32-NEXT: movl %esp, %ebp -; AVX512F-32-NEXT: .cfi_def_cfa_register %ebp -; AVX512F-32-NEXT: andl $-64, %esp -; AVX512F-32-NEXT: subl $64, %esp -; AVX512F-32-NEXT: vpsubsw %zmm2, %zmm0, %zmm0 -; AVX512F-32-NEXT: vpsubsw 8(%ebp), %zmm1, %zmm1 -; AVX512F-32-NEXT: movl %ebp, %esp -; AVX512F-32-NEXT: popl %ebp -; AVX512F-32-NEXT: .cfi_def_cfa %esp, 4 -; AVX512F-32-NEXT: retl - %1 = sext <64 x i16> %a to <64 x i32> - %2 = sext <64 x i16> %b to <64 x i32> - %3 = sub nsw <64 x i32> %1, %2 - %4 = icmp slt <64 x i32> %3, - %5 = select <64 x i1> %4, <64 x i32> %3, <64 x i32> - %6 = icmp sgt <64 x i32> %5, - %7 = select <64 x i1> %6, <64 x i32> %5, <64 x i32> - %8 = trunc <64 x i32> %7 to <64 x i16> - ret <64 x i16> %8 -} - -define <32 x i16> @test_mask_adds_epu16_rr_512_avx512(<32 x i16> %a, <32 x i16> %b) { -; AVX512BW-LABEL: test_mask_adds_epu16_rr_512_avx512: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: vpaddusw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_mask_adds_epu16_rr_512_avx512: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vpaddusw %zmm1, %zmm0, %zmm0 -; AVX512F-32-NEXT: retl - %1 = zext <32 x i16> %a to <32 x i32> - %2 = zext <32 x i16> %b to <32 x i32> - %3 = add nsw <32 x i32> %1, %2 - %4 = icmp ult <32 x i32> %3, - %5 = select <32 x i1> %4, <32 x i32> %3, <32 x i32> - %6 = trunc <32 x i32> %5 to <32 x i16> - ret <32 x i16> %6 -} - -define <32 x i16> @test_mask_adds_epu16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) { -; AVX512BW-LABEL: test_mask_adds_epu16_rrk_512: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: kmovd %edi, %k1 -; AVX512BW-NEXT: vpaddusw %zmm1, %zmm0, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_mask_adds_epu16_rrk_512: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 -; AVX512F-32-NEXT: vpaddusw %zmm1, %zmm0, %zmm2 {%k1} -; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-32-NEXT: retl - %1 = zext <32 x i16> %a to <32 x i32> - %2 = zext <32 x i16> %b to <32 x i32> - %3 = add nsw <32 x i32> %1, %2 - %4 = icmp ult <32 x i32> %3, - %5 = select <32 x i1> %4, <32 x i32> %3, <32 x i32> - %6 = trunc <32 x i32> %5 to <32 x i16> - %7 = bitcast i32 %mask to <32 x i1> - 
%8 = select <32 x i1> %7, <32 x i16> %6, <32 x i16> %passThru - ret <32 x i16> %8 -} - -define <32 x i16> @test_mask_adds_epu16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) { -; AVX512BW-LABEL: test_mask_adds_epu16_rrkz_512: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: kmovd %edi, %k1 -; AVX512BW-NEXT: vpaddusw %zmm1, %zmm0, %zmm0 {%k1} {z} -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_mask_adds_epu16_rrkz_512: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 -; AVX512F-32-NEXT: vpaddusw %zmm1, %zmm0, %zmm0 {%k1} {z} -; AVX512F-32-NEXT: retl - %1 = zext <32 x i16> %a to <32 x i32> - %2 = zext <32 x i16> %b to <32 x i32> - %3 = add nsw <32 x i32> %1, %2 - %4 = icmp ult <32 x i32> %3, - %5 = select <32 x i1> %4, <32 x i32> %3, <32 x i32> - %6 = trunc <32 x i32> %5 to <32 x i16> - %7 = bitcast i32 %mask to <32 x i1> - %8 = select <32 x i1> %7, <32 x i16> %6, <32 x i16> zeroinitializer - ret <32 x i16> %8 -} - -define <32 x i16> @test_mask_adds_epu16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) { -; AVX512BW-LABEL: test_mask_adds_epu16_rm_512: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: vpaddusw (%rdi), %zmm0, %zmm0 -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_mask_adds_epu16_rm_512: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: vpaddusw (%eax), %zmm0, %zmm0 -; AVX512F-32-NEXT: retl - %b = load <32 x i16>, <32 x i16>* %ptr_b - %1 = zext <32 x i16> %a to <32 x i32> - %2 = zext <32 x i16> %b to <32 x i32> - %3 = add nsw <32 x i32> %1, %2 - %4 = icmp ult <32 x i32> %3, - %5 = select <32 x i1> %4, <32 x i32> %3, <32 x i32> - %6 = trunc <32 x i32> %5 to <32 x i16> - ret <32 x i16> %6 -} - -define <32 x i16> @test_mask_adds_epu16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) { -; AVX512BW-LABEL: test_mask_adds_epu16_rmk_512: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: kmovd %esi, %k1 -; AVX512BW-NEXT: vpaddusw (%rdi), %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_mask_adds_epu16_rmk_512: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 -; AVX512F-32-NEXT: vpaddusw (%eax), %zmm0, %zmm1 {%k1} -; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512F-32-NEXT: retl - %b = load <32 x i16>, <32 x i16>* %ptr_b - %1 = zext <32 x i16> %a to <32 x i32> - %2 = zext <32 x i16> %b to <32 x i32> - %3 = add nsw <32 x i32> %1, %2 - %4 = icmp ult <32 x i32> %3, - %5 = select <32 x i1> %4, <32 x i32> %3, <32 x i32> - %6 = trunc <32 x i32> %5 to <32 x i16> - %7 = bitcast i32 %mask to <32 x i1> - %8 = select <32 x i1> %7, <32 x i16> %6, <32 x i16> %passThru - ret <32 x i16> %8 -} - -define <32 x i16> @test_mask_adds_epu16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) { -; AVX512BW-LABEL: test_mask_adds_epu16_rmkz_512: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: kmovd %esi, %k1 -; AVX512BW-NEXT: vpaddusw (%rdi), %zmm0, %zmm0 {%k1} {z} -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_mask_adds_epu16_rmkz_512: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 -; AVX512F-32-NEXT: vpaddusw (%eax), %zmm0, %zmm0 {%k1} {z} -; AVX512F-32-NEXT: retl - %b = load <32 x i16>, <32 x i16>* %ptr_b - %1 = zext <32 x i16> %a to <32 x i32> - %2 = zext <32 x i16> %b to <32 x i32> - %3 = add nsw <32 x i32> %1, %2 - %4 = icmp ult <32 x i32> %3, - %5 = select <32 x i1> %4, <32 x i32> %3, <32 x i32> - %6 = trunc <32 x i32> %5 to <32 
x i16> - %7 = bitcast i32 %mask to <32 x i1> - %8 = select <32 x i1> %7, <32 x i16> %6, <32 x i16> zeroinitializer - ret <32 x i16> %8 -} - -define <64 x i16> @test_mask_adds_epu16_rr_1024(<64 x i16> %a, <64 x i16> %b) { -; AVX512BW-LABEL: test_mask_adds_epu16_rr_1024: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: vpaddusw %zmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vpaddusw %zmm3, %zmm1, %zmm1 -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_mask_adds_epu16_rr_1024: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: pushl %ebp -; AVX512F-32-NEXT: .cfi_def_cfa_offset 8 -; AVX512F-32-NEXT: .cfi_offset %ebp, -8 -; AVX512F-32-NEXT: movl %esp, %ebp -; AVX512F-32-NEXT: .cfi_def_cfa_register %ebp -; AVX512F-32-NEXT: andl $-64, %esp -; AVX512F-32-NEXT: subl $64, %esp -; AVX512F-32-NEXT: vpaddusw %zmm2, %zmm0, %zmm0 -; AVX512F-32-NEXT: vpaddusw 8(%ebp), %zmm1, %zmm1 -; AVX512F-32-NEXT: movl %ebp, %esp -; AVX512F-32-NEXT: popl %ebp -; AVX512F-32-NEXT: .cfi_def_cfa %esp, 4 -; AVX512F-32-NEXT: retl - %1 = zext <64 x i16> %a to <64 x i32> - %2 = zext <64 x i16> %b to <64 x i32> - %3 = add nsw <64 x i32> %1, %2 - %4 = icmp ult <64 x i32> %3, - %5 = select <64 x i1> %4, <64 x i32> %3, <64 x i32> - %6 = trunc <64 x i32> %5 to <64 x i16> - ret <64 x i16> %6 -} - -define <32 x i16> @test_mask_subs_epu16_rr_512_avx512(<32 x i16> %a, <32 x i16> %b) { -; AVX512BW-LABEL: test_mask_subs_epu16_rr_512_avx512: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: vpsubusw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_mask_subs_epu16_rr_512_avx512: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vpsubusw %zmm1, %zmm0, %zmm0 -; AVX512F-32-NEXT: retl - %cmp = icmp ugt <32 x i16> %a, %b - %sel = select <32 x i1> %cmp, <32 x i16> %a, <32 x i16> %b - %sub = sub <32 x i16> %sel, %b - ret <32 x i16> %sub -} - -define <32 x i16> @test_mask_subs_epu16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) { -; AVX512BW-LABEL: test_mask_subs_epu16_rrk_512: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: kmovd %edi, %k1 -; AVX512BW-NEXT: vpsubusw %zmm1, %zmm0, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_mask_subs_epu16_rrk_512: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 -; AVX512F-32-NEXT: vpsubusw %zmm1, %zmm0, %zmm2 {%k1} -; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-32-NEXT: retl - %cmp = icmp ugt <32 x i16> %a, %b - %sel = select <32 x i1> %cmp, <32 x i16> %a, <32 x i16> %b - %sub = sub <32 x i16> %sel, %b - %bc = bitcast i32 %mask to <32 x i1> - %res = select <32 x i1> %bc, <32 x i16> %sub, <32 x i16> %passThru - ret <32 x i16> %res -} - -define <32 x i16> @test_mask_subs_epu16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) { -; AVX512BW-LABEL: test_mask_subs_epu16_rrkz_512: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: kmovd %edi, %k1 -; AVX512BW-NEXT: vpsubusw %zmm1, %zmm0, %zmm0 {%k1} {z} -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_mask_subs_epu16_rrkz_512: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 -; AVX512F-32-NEXT: vpsubusw %zmm1, %zmm0, %zmm0 {%k1} {z} -; AVX512F-32-NEXT: retl - %cmp = icmp ugt <32 x i16> %a, %b - %sel = select <32 x i1> %cmp, <32 x i16> %a, <32 x i16> %b - %sub = sub <32 x i16> %sel, %b - %bc = bitcast i32 %mask to <32 x i1> - %res = select <32 x i1> %bc, <32 x i16> %sub, <32 x i16> zeroinitializer - ret <32 x i16> %res -} - -define <32 x i16> @test_mask_subs_epu16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) { -; AVX512BW-LABEL: test_mask_subs_epu16_rm_512: -; 
AVX512BW: ## %bb.0: -; AVX512BW-NEXT: vpsubusw (%rdi), %zmm0, %zmm0 -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_mask_subs_epu16_rm_512: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: vpsubusw (%eax), %zmm0, %zmm0 -; AVX512F-32-NEXT: retl - %b = load <32 x i16>, <32 x i16>* %ptr_b - %cmp = icmp ugt <32 x i16> %a, %b - %sel = select <32 x i1> %cmp, <32 x i16> %a, <32 x i16> %b - %sub = sub <32 x i16> %sel, %b - ret <32 x i16> %sub -} - -define <32 x i16> @test_mask_subs_epu16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) { -; AVX512BW-LABEL: test_mask_subs_epu16_rmk_512: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: kmovd %esi, %k1 -; AVX512BW-NEXT: vpsubusw (%rdi), %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_mask_subs_epu16_rmk_512: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 -; AVX512F-32-NEXT: vpsubusw (%eax), %zmm0, %zmm1 {%k1} -; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512F-32-NEXT: retl - %b = load <32 x i16>, <32 x i16>* %ptr_b - %cmp = icmp ugt <32 x i16> %a, %b - %sel = select <32 x i1> %cmp, <32 x i16> %a, <32 x i16> %b - %sub = sub <32 x i16> %sel, %b - %bc = bitcast i32 %mask to <32 x i1> - %res = select <32 x i1> %bc, <32 x i16> %sub, <32 x i16> %passThru - ret <32 x i16> %res -} - -define <32 x i16> @test_mask_subs_epu16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) { -; AVX512BW-LABEL: test_mask_subs_epu16_rmkz_512: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: kmovd %esi, %k1 -; AVX512BW-NEXT: vpsubusw (%rdi), %zmm0, %zmm0 {%k1} {z} -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_mask_subs_epu16_rmkz_512: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 -; AVX512F-32-NEXT: vpsubusw (%eax), %zmm0, %zmm0 {%k1} {z} -; AVX512F-32-NEXT: retl - %b = load <32 x i16>, <32 x i16>* %ptr_b - %cmp = icmp ugt <32 x i16> %a, %b - %sel = select <32 x i1> %cmp, <32 x i16> %a, <32 x i16> %b - %sub = sub <32 x i16> %sel, %b - %bc = bitcast i32 %mask to <32 x i1> - %res = select <32 x i1> %bc, <32 x i16> %sub, <32 x i16> zeroinitializer - ret <32 x i16> %res -} - -define <64 x i16> @test_mask_subs_epu16_rr_1024(<64 x i16> %a, <64 x i16> %b) { -; AVX512BW-LABEL: test_mask_subs_epu16_rr_1024: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: vpsubusw %zmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vpsubusw %zmm3, %zmm1, %zmm1 -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_mask_subs_epu16_rr_1024: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: pushl %ebp -; AVX512F-32-NEXT: .cfi_def_cfa_offset 8 -; AVX512F-32-NEXT: .cfi_offset %ebp, -8 -; AVX512F-32-NEXT: movl %esp, %ebp -; AVX512F-32-NEXT: .cfi_def_cfa_register %ebp -; AVX512F-32-NEXT: andl $-64, %esp -; AVX512F-32-NEXT: subl $64, %esp -; AVX512F-32-NEXT: vpsubusw %zmm2, %zmm0, %zmm0 -; AVX512F-32-NEXT: vpsubusw 8(%ebp), %zmm1, %zmm1 -; AVX512F-32-NEXT: movl %ebp, %esp -; AVX512F-32-NEXT: popl %ebp -; AVX512F-32-NEXT: .cfi_def_cfa %esp, 4 -; AVX512F-32-NEXT: retl - %cmp = icmp ugt <64 x i16> %a, %b - %sel = select <64 x i1> %cmp, <64 x i16> %a, <64 x i16> %b - %sub = sub <64 x i16> %sel, %b - ret <64 x i16> %sub -} - -define <8 x i16> @test_mask_adds_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_mask_adds_epi16_rr_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: retq - %1 = sext <8 x i16> %a to <8 x i32> - %2 = 
sext <8 x i16> %b to <8 x i32> - %3 = add nsw <8 x i32> %1, %2 - %4 = icmp slt <8 x i32> %3, - %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> - %6 = icmp sgt <8 x i32> %5, - %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32> - %8 = trunc <8 x i32> %7 to <8 x i16> - ret <8 x i16> %8 -} - -define <8 x i16> @test_mask_adds_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) { -; CHECK-LABEL: test_mask_adds_epi16_rrk_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpaddsw %xmm1, %xmm0, %xmm2 {%k1} -; CHECK-NEXT: vmovdqa %xmm2, %xmm0 -; CHECK-NEXT: retq - %1 = sext <8 x i16> %a to <8 x i32> - %2 = sext <8 x i16> %b to <8 x i32> - %3 = add nsw <8 x i32> %1, %2 - %4 = icmp slt <8 x i32> %3, - %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> - %6 = icmp sgt <8 x i32> %5, - %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32> - %8 = trunc <8 x i32> %7 to <8 x i16> - %9 = bitcast i8 %mask to <8 x i1> - %10 = select <8 x i1> %9, <8 x i16> %8, <8 x i16> %passThru - ret <8 x i16> %10 -} - -define <8 x i16> @test_mask_adds_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) { -; CHECK-LABEL: test_mask_adds_epi16_rrkz_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 {%k1} {z} -; CHECK-NEXT: retq - %1 = sext <8 x i16> %a to <8 x i32> - %2 = sext <8 x i16> %b to <8 x i32> - %3 = add nsw <8 x i32> %1, %2 - %4 = icmp slt <8 x i32> %3, - %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> - %6 = icmp sgt <8 x i32> %5, - %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32> - %8 = trunc <8 x i32> %7 to <8 x i16> - %9 = bitcast i8 %mask to <8 x i1> - %10 = select <8 x i1> %9, <8 x i16> %8, <8 x i16> zeroinitializer - ret <8 x i16> %10 -} - -define <8 x i16> @test_mask_adds_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) { -; CHECK-LABEL: test_mask_adds_epi16_rm_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpaddsw (%rdi), %xmm0, %xmm0 -; CHECK-NEXT: retq - %b = load <8 x i16>, <8 x i16>* %ptr_b - %1 = sext <8 x i16> %a to <8 x i32> - %2 = sext <8 x i16> %b to <8 x i32> - %3 = add nsw <8 x i32> %1, %2 - %4 = icmp slt <8 x i32> %3, - %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> - %6 = icmp sgt <8 x i32> %5, - %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32> - %8 = trunc <8 x i32> %7 to <8 x i16> - ret <8 x i16> %8 -} - -define <8 x i16> @test_mask_adds_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) { -; CHECK-LABEL: test_mask_adds_epi16_rmk_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %esi, %k1 -; CHECK-NEXT: vpaddsw (%rdi), %xmm0, %xmm1 {%k1} -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 -; CHECK-NEXT: retq - %b = load <8 x i16>, <8 x i16>* %ptr_b - %1 = sext <8 x i16> %a to <8 x i32> - %2 = sext <8 x i16> %b to <8 x i32> - %3 = add nsw <8 x i32> %1, %2 - %4 = icmp slt <8 x i32> %3, - %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> - %6 = icmp sgt <8 x i32> %5, - %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32> - %8 = trunc <8 x i32> %7 to <8 x i16> - %9 = bitcast i8 %mask to <8 x i1> - %10 = select <8 x i1> %9, <8 x i16> %8, <8 x i16> %passThru - ret <8 x i16> %10 -} - -define <8 x i16> @test_mask_adds_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) { -; CHECK-LABEL: test_mask_adds_epi16_rmkz_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %esi, %k1 -; CHECK-NEXT: vpaddsw (%rdi), %xmm0, %xmm0 {%k1} {z} -; CHECK-NEXT: retq - %b = load <8 x i16>, <8 x i16>* %ptr_b - %1 = sext <8 x i16> %a to <8 x i32> - %2 = sext <8 x i16> %b to <8 x i32> - %3 = add nsw <8 x i32> %1, %2 - %4 = icmp slt <8 x i32> %3, - %5 
= select <8 x i1> %4, <8 x i32> %3, <8 x i32> - %6 = icmp sgt <8 x i32> %5, - %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32> - %8 = trunc <8 x i32> %7 to <8 x i16> - %9 = bitcast i8 %mask to <8 x i1> - %10 = select <8 x i1> %9, <8 x i16> %8, <8 x i16> zeroinitializer - ret <8 x i16> %10 -} - - -define <16 x i16> @test_mask_adds_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) { -; CHECK-LABEL: test_mask_adds_epi16_rr_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpaddsw %ymm1, %ymm0, %ymm0 -; CHECK-NEXT: retq - %1 = sext <16 x i16> %a to <16 x i32> - %2 = sext <16 x i16> %b to <16 x i32> - %3 = add nsw <16 x i32> %1, %2 - %4 = icmp slt <16 x i32> %3, - %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> - %6 = icmp sgt <16 x i32> %5, - %7 = select <16 x i1> %6, <16 x i32> %5, <16 x i32> - %8 = trunc <16 x i32> %7 to <16 x i16> - ret <16 x i16> %8 -} - -define <16 x i16> @test_mask_adds_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) { -; CHECK-LABEL: test_mask_adds_epi16_rrk_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpaddsw %ymm1, %ymm0, %ymm2 {%k1} -; CHECK-NEXT: vmovdqa %ymm2, %ymm0 -; CHECK-NEXT: retq - %1 = sext <16 x i16> %a to <16 x i32> - %2 = sext <16 x i16> %b to <16 x i32> - %3 = add nsw <16 x i32> %1, %2 - %4 = icmp slt <16 x i32> %3, - %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> - %6 = icmp sgt <16 x i32> %5, - %7 = select <16 x i1> %6, <16 x i32> %5, <16 x i32> - %8 = trunc <16 x i32> %7 to <16 x i16> - %9 = bitcast i16 %mask to <16 x i1> - %10 = select <16 x i1> %9, <16 x i16> %8, <16 x i16> %passThru - ret <16 x i16> %10 -} - -define <16 x i16> @test_mask_adds_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) { -; CHECK-LABEL: test_mask_adds_epi16_rrkz_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpaddsw %ymm1, %ymm0, %ymm0 {%k1} {z} -; CHECK-NEXT: retq - %1 = sext <16 x i16> %a to <16 x i32> - %2 = sext <16 x i16> %b to <16 x i32> - %3 = add nsw <16 x i32> %1, %2 - %4 = icmp slt <16 x i32> %3, - %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> - %6 = icmp sgt <16 x i32> %5, - %7 = select <16 x i1> %6, <16 x i32> %5, <16 x i32> - %8 = trunc <16 x i32> %7 to <16 x i16> - %9 = bitcast i16 %mask to <16 x i1> - %10 = select <16 x i1> %9, <16 x i16> %8, <16 x i16> zeroinitializer - ret <16 x i16> %10 -} - -define <16 x i16> @test_mask_adds_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) { -; CHECK-LABEL: test_mask_adds_epi16_rm_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpaddsw (%rdi), %ymm0, %ymm0 -; CHECK-NEXT: retq - %b = load <16 x i16>, <16 x i16>* %ptr_b - %1 = sext <16 x i16> %a to <16 x i32> - %2 = sext <16 x i16> %b to <16 x i32> - %3 = add nsw <16 x i32> %1, %2 - %4 = icmp slt <16 x i32> %3, - %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> - %6 = icmp sgt <16 x i32> %5, - %7 = select <16 x i1> %6, <16 x i32> %5, <16 x i32> - %8 = trunc <16 x i32> %7 to <16 x i16> - ret <16 x i16> %8 -} - -define <16 x i16> @test_mask_adds_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) { -; CHECK-LABEL: test_mask_adds_epi16_rmk_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %esi, %k1 -; CHECK-NEXT: vpaddsw (%rdi), %ymm0, %ymm1 {%k1} -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 -; CHECK-NEXT: retq - %b = load <16 x i16>, <16 x i16>* %ptr_b - %1 = sext <16 x i16> %a to <16 x i32> - %2 = sext <16 x i16> %b to <16 x i32> - %3 = add nsw <16 x i32> %1, %2 - %4 = icmp slt <16 x i32> %3, - %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> - %6 = icmp sgt <16 x i32> %5, - %7 = 
select <16 x i1> %6, <16 x i32> %5, <16 x i32> - %8 = trunc <16 x i32> %7 to <16 x i16> - %9 = bitcast i16 %mask to <16 x i1> - %10 = select <16 x i1> %9, <16 x i16> %8, <16 x i16> %passThru - ret <16 x i16> %10 -} - -define <16 x i16> @test_mask_adds_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) { -; CHECK-LABEL: test_mask_adds_epi16_rmkz_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %esi, %k1 -; CHECK-NEXT: vpaddsw (%rdi), %ymm0, %ymm0 {%k1} {z} -; CHECK-NEXT: retq - %b = load <16 x i16>, <16 x i16>* %ptr_b - %1 = sext <16 x i16> %a to <16 x i32> - %2 = sext <16 x i16> %b to <16 x i32> - %3 = add nsw <16 x i32> %1, %2 - %4 = icmp slt <16 x i32> %3, - %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> - %6 = icmp sgt <16 x i32> %5, - %7 = select <16 x i1> %6, <16 x i32> %5, <16 x i32> - %8 = trunc <16 x i32> %7 to <16 x i16> - %9 = bitcast i16 %mask to <16 x i1> - %10 = select <16 x i1> %9, <16 x i16> %8, <16 x i16> zeroinitializer - ret <16 x i16> %10 -} - -define <8 x i16> @test_mask_subs_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_mask_subs_epi16_rr_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpsubsw %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: retq - %1 = sext <8 x i16> %a to <8 x i32> - %2 = sext <8 x i16> %b to <8 x i32> - %3 = sub nsw <8 x i32> %1, %2 - %4 = icmp slt <8 x i32> %3, - %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> - %6 = icmp sgt <8 x i32> %5, - %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32> - %8 = trunc <8 x i32> %7 to <8 x i16> - ret <8 x i16> %8 -} - -define <8 x i16> @test_mask_subs_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) { -; CHECK-LABEL: test_mask_subs_epi16_rrk_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpsubsw %xmm1, %xmm0, %xmm2 {%k1} -; CHECK-NEXT: vmovdqa %xmm2, %xmm0 -; CHECK-NEXT: retq - %1 = sext <8 x i16> %a to <8 x i32> - %2 = sext <8 x i16> %b to <8 x i32> - %3 = sub nsw <8 x i32> %1, %2 - %4 = icmp slt <8 x i32> %3, - %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> - %6 = icmp sgt <8 x i32> %5, - %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32> - %8 = trunc <8 x i32> %7 to <8 x i16> - %9 = bitcast i8 %mask to <8 x i1> - %10 = select <8 x i1> %9, <8 x i16> %8, <8 x i16> %passThru - ret <8 x i16> %10 -} - -define <8 x i16> @test_mask_subs_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) { -; CHECK-LABEL: test_mask_subs_epi16_rrkz_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpsubsw %xmm1, %xmm0, %xmm0 {%k1} {z} -; CHECK-NEXT: retq - %1 = sext <8 x i16> %a to <8 x i32> - %2 = sext <8 x i16> %b to <8 x i32> - %3 = sub nsw <8 x i32> %1, %2 - %4 = icmp slt <8 x i32> %3, - %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> - %6 = icmp sgt <8 x i32> %5, - %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32> - %8 = trunc <8 x i32> %7 to <8 x i16> - %9 = bitcast i8 %mask to <8 x i1> - %10 = select <8 x i1> %9, <8 x i16> %8, <8 x i16> zeroinitializer - ret <8 x i16> %10 -} - -define <8 x i16> @test_mask_subs_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) { -; CHECK-LABEL: test_mask_subs_epi16_rm_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpsubsw (%rdi), %xmm0, %xmm0 -; CHECK-NEXT: retq - %b = load <8 x i16>, <8 x i16>* %ptr_b - %1 = sext <8 x i16> %a to <8 x i32> - %2 = sext <8 x i16> %b to <8 x i32> - %3 = sub nsw <8 x i32> %1, %2 - %4 = icmp slt <8 x i32> %3, - %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> - %6 = icmp sgt <8 x i32> %5, - %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32> - %8 = trunc <8 x i32> %7 to <8 x i16> - ret <8 x i16> %8 -} - 
-define <8 x i16> @test_mask_subs_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) { -; CHECK-LABEL: test_mask_subs_epi16_rmk_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %esi, %k1 -; CHECK-NEXT: vpsubsw (%rdi), %xmm0, %xmm1 {%k1} -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 -; CHECK-NEXT: retq - %b = load <8 x i16>, <8 x i16>* %ptr_b - %1 = sext <8 x i16> %a to <8 x i32> - %2 = sext <8 x i16> %b to <8 x i32> - %3 = sub nsw <8 x i32> %1, %2 - %4 = icmp slt <8 x i32> %3, - %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> - %6 = icmp sgt <8 x i32> %5, - %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32> - %8 = trunc <8 x i32> %7 to <8 x i16> - %9 = bitcast i8 %mask to <8 x i1> - %10 = select <8 x i1> %9, <8 x i16> %8, <8 x i16> %passThru - ret <8 x i16> %10 -} - -define <8 x i16> @test_mask_subs_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) { -; CHECK-LABEL: test_mask_subs_epi16_rmkz_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %esi, %k1 -; CHECK-NEXT: vpsubsw (%rdi), %xmm0, %xmm0 {%k1} {z} -; CHECK-NEXT: retq - %b = load <8 x i16>, <8 x i16>* %ptr_b - %1 = sext <8 x i16> %a to <8 x i32> - %2 = sext <8 x i16> %b to <8 x i32> - %3 = sub nsw <8 x i32> %1, %2 - %4 = icmp slt <8 x i32> %3, - %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> - %6 = icmp sgt <8 x i32> %5, - %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32> - %8 = trunc <8 x i32> %7 to <8 x i16> - %9 = bitcast i8 %mask to <8 x i1> - %10 = select <8 x i1> %9, <8 x i16> %8, <8 x i16> zeroinitializer - ret <8 x i16> %10 -} - -define <16 x i16> @test_mask_subs_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) { -; CHECK-LABEL: test_mask_subs_epi16_rr_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpsubsw %ymm1, %ymm0, %ymm0 -; CHECK-NEXT: retq - %1 = sext <16 x i16> %a to <16 x i32> - %2 = sext <16 x i16> %b to <16 x i32> - %3 = sub nsw <16 x i32> %1, %2 - %4 = icmp slt <16 x i32> %3, - %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> - %6 = icmp sgt <16 x i32> %5, - %7 = select <16 x i1> %6, <16 x i32> %5, <16 x i32> - %8 = trunc <16 x i32> %7 to <16 x i16> - ret <16 x i16> %8 -} - -define <16 x i16> @test_mask_subs_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) { -; CHECK-LABEL: test_mask_subs_epi16_rrk_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpsubsw %ymm1, %ymm0, %ymm2 {%k1} -; CHECK-NEXT: vmovdqa %ymm2, %ymm0 -; CHECK-NEXT: retq - %1 = sext <16 x i16> %a to <16 x i32> - %2 = sext <16 x i16> %b to <16 x i32> - %3 = sub nsw <16 x i32> %1, %2 - %4 = icmp slt <16 x i32> %3, - %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> - %6 = icmp sgt <16 x i32> %5, - %7 = select <16 x i1> %6, <16 x i32> %5, <16 x i32> - %8 = trunc <16 x i32> %7 to <16 x i16> - %9 = bitcast i16 %mask to <16 x i1> - %10 = select <16 x i1> %9, <16 x i16> %8, <16 x i16> %passThru - ret <16 x i16> %10 -} - -define <16 x i16> @test_mask_subs_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) { -; CHECK-LABEL: test_mask_subs_epi16_rrkz_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpsubsw %ymm1, %ymm0, %ymm0 {%k1} {z} -; CHECK-NEXT: retq - %1 = sext <16 x i16> %a to <16 x i32> - %2 = sext <16 x i16> %b to <16 x i32> - %3 = sub nsw <16 x i32> %1, %2 - %4 = icmp slt <16 x i32> %3, - %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> - %6 = icmp sgt <16 x i32> %5, - %7 = select <16 x i1> %6, <16 x i32> %5, <16 x i32> - %8 = trunc <16 x i32> %7 to <16 x i16> - %9 = bitcast i16 %mask to <16 x i1> - %10 = select <16 x i1> %9, <16 x i16> %8, <16 x i16> 
zeroinitializer - ret <16 x i16> %10 -} - -define <16 x i16> @test_mask_subs_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) { -; CHECK-LABEL: test_mask_subs_epi16_rm_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpsubsw (%rdi), %ymm0, %ymm0 -; CHECK-NEXT: retq - %b = load <16 x i16>, <16 x i16>* %ptr_b - %1 = sext <16 x i16> %a to <16 x i32> - %2 = sext <16 x i16> %b to <16 x i32> - %3 = sub nsw <16 x i32> %1, %2 - %4 = icmp slt <16 x i32> %3, - %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> - %6 = icmp sgt <16 x i32> %5, - %7 = select <16 x i1> %6, <16 x i32> %5, <16 x i32> - %8 = trunc <16 x i32> %7 to <16 x i16> - ret <16 x i16> %8 -} - -define <16 x i16> @test_mask_subs_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) { -; CHECK-LABEL: test_mask_subs_epi16_rmk_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %esi, %k1 -; CHECK-NEXT: vpsubsw (%rdi), %ymm0, %ymm1 {%k1} -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 -; CHECK-NEXT: retq - %b = load <16 x i16>, <16 x i16>* %ptr_b - %1 = sext <16 x i16> %a to <16 x i32> - %2 = sext <16 x i16> %b to <16 x i32> - %3 = sub nsw <16 x i32> %1, %2 - %4 = icmp slt <16 x i32> %3, - %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> - %6 = icmp sgt <16 x i32> %5, - %7 = select <16 x i1> %6, <16 x i32> %5, <16 x i32> - %8 = trunc <16 x i32> %7 to <16 x i16> - %9 = bitcast i16 %mask to <16 x i1> - %10 = select <16 x i1> %9, <16 x i16> %8, <16 x i16> %passThru - ret <16 x i16> %10 -} - -define <16 x i16> @test_mask_subs_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) { -; CHECK-LABEL: test_mask_subs_epi16_rmkz_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %esi, %k1 -; CHECK-NEXT: vpsubsw (%rdi), %ymm0, %ymm0 {%k1} {z} -; CHECK-NEXT: retq - %b = load <16 x i16>, <16 x i16>* %ptr_b - %1 = sext <16 x i16> %a to <16 x i32> - %2 = sext <16 x i16> %b to <16 x i32> - %3 = sub nsw <16 x i32> %1, %2 - %4 = icmp slt <16 x i32> %3, - %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> - %6 = icmp sgt <16 x i32> %5, - %7 = select <16 x i1> %6, <16 x i32> %5, <16 x i32> - %8 = trunc <16 x i32> %7 to <16 x i16> - %9 = bitcast i16 %mask to <16 x i1> - %10 = select <16 x i1> %9, <16 x i16> %8, <16 x i16> zeroinitializer - ret <16 x i16> %10 -} - -define <8 x i16> @test_mask_adds_epu16_rr_128(<8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_mask_adds_epu16_rr_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: retq - %1 = zext <8 x i16> %a to <8 x i32> - %2 = zext <8 x i16> %b to <8 x i32> - %3 = add nsw <8 x i32> %1, %2 - %4 = icmp ult <8 x i32> %3, - %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> - %6 = trunc <8 x i32> %5 to <8 x i16> - ret <8 x i16> %6 -} - -define <8 x i16> @test_mask_adds_epu16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) { -; CHECK-LABEL: test_mask_adds_epu16_rrk_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpaddusw %xmm1, %xmm0, %xmm2 {%k1} -; CHECK-NEXT: vmovdqa %xmm2, %xmm0 -; CHECK-NEXT: retq - %1 = zext <8 x i16> %a to <8 x i32> - %2 = zext <8 x i16> %b to <8 x i32> - %3 = add nsw <8 x i32> %1, %2 - %4 = icmp ult <8 x i32> %3, - %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> - %6 = trunc <8 x i32> %5 to <8 x i16> - %7 = bitcast i8 %mask to <8 x i1> - %8 = select <8 x i1> %7, <8 x i16> %6, <8 x i16> %passThru - ret <8 x i16> %8 -} - -define <8 x i16> @test_mask_adds_epu16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) { -; CHECK-LABEL: test_mask_adds_epu16_rrkz_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %edi, %k1 -; 
CHECK-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 {%k1} {z} -; CHECK-NEXT: retq - %1 = zext <8 x i16> %a to <8 x i32> - %2 = zext <8 x i16> %b to <8 x i32> - %3 = add nsw <8 x i32> %1, %2 - %4 = icmp ult <8 x i32> %3, - %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> - %6 = trunc <8 x i32> %5 to <8 x i16> - %7 = bitcast i8 %mask to <8 x i1> - %8 = select <8 x i1> %7, <8 x i16> %6, <8 x i16> zeroinitializer - ret <8 x i16> %8 -} - -define <8 x i16> @test_mask_adds_epu16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) { -; CHECK-LABEL: test_mask_adds_epu16_rm_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpaddusw (%rdi), %xmm0, %xmm0 -; CHECK-NEXT: retq - %b = load <8 x i16>, <8 x i16>* %ptr_b - %1 = zext <8 x i16> %a to <8 x i32> - %2 = zext <8 x i16> %b to <8 x i32> - %3 = add nsw <8 x i32> %1, %2 - %4 = icmp ult <8 x i32> %3, - %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> - %6 = trunc <8 x i32> %5 to <8 x i16> - ret <8 x i16> %6 -} - -define <8 x i16> @test_mask_adds_epu16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) { -; CHECK-LABEL: test_mask_adds_epu16_rmk_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %esi, %k1 -; CHECK-NEXT: vpaddusw (%rdi), %xmm0, %xmm1 {%k1} -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 -; CHECK-NEXT: retq - %b = load <8 x i16>, <8 x i16>* %ptr_b - %1 = zext <8 x i16> %a to <8 x i32> - %2 = zext <8 x i16> %b to <8 x i32> - %3 = add nsw <8 x i32> %1, %2 - %4 = icmp ult <8 x i32> %3, - %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> - %6 = trunc <8 x i32> %5 to <8 x i16> - %7 = bitcast i8 %mask to <8 x i1> - %8 = select <8 x i1> %7, <8 x i16> %6, <8 x i16> %passThru - ret <8 x i16> %8 -} - -define <8 x i16> @test_mask_adds_epu16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) { -; CHECK-LABEL: test_mask_adds_epu16_rmkz_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %esi, %k1 -; CHECK-NEXT: vpaddusw (%rdi), %xmm0, %xmm0 {%k1} {z} -; CHECK-NEXT: retq - %b = load <8 x i16>, <8 x i16>* %ptr_b - %1 = zext <8 x i16> %a to <8 x i32> - %2 = zext <8 x i16> %b to <8 x i32> - %3 = add nsw <8 x i32> %1, %2 - %4 = icmp ult <8 x i32> %3, - %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> - %6 = trunc <8 x i32> %5 to <8 x i16> - %7 = bitcast i8 %mask to <8 x i1> - %8 = select <8 x i1> %7, <8 x i16> %6, <8 x i16> zeroinitializer - ret <8 x i16> %8 -} - -define <16 x i16> @test_mask_adds_epu16_rr_256(<16 x i16> %a, <16 x i16> %b) { -; CHECK-LABEL: test_mask_adds_epu16_rr_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpaddusw %ymm1, %ymm0, %ymm0 -; CHECK-NEXT: retq - %1 = zext <16 x i16> %a to <16 x i32> - %2 = zext <16 x i16> %b to <16 x i32> - %3 = add nsw <16 x i32> %1, %2 - %4 = icmp ult <16 x i32> %3, - %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> - %6 = trunc <16 x i32> %5 to <16 x i16> - ret <16 x i16> %6 -} - -define <16 x i16> @test_mask_adds_epu16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) { -; CHECK-LABEL: test_mask_adds_epu16_rrk_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpaddusw %ymm1, %ymm0, %ymm2 {%k1} -; CHECK-NEXT: vmovdqa %ymm2, %ymm0 -; CHECK-NEXT: retq - %1 = zext <16 x i16> %a to <16 x i32> - %2 = zext <16 x i16> %b to <16 x i32> - %3 = add nsw <16 x i32> %1, %2 - %4 = icmp ult <16 x i32> %3, - %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> - %6 = trunc <16 x i32> %5 to <16 x i16> - %7 = bitcast i16 %mask to <16 x i1> - %8 = select <16 x i1> %7, <16 x i16> %6, <16 x i16> %passThru - ret <16 x i16> %8 -} - -define <16 x i16> @test_mask_adds_epu16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) { -; 
CHECK-LABEL: test_mask_adds_epu16_rrkz_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpaddusw %ymm1, %ymm0, %ymm0 {%k1} {z} -; CHECK-NEXT: retq - %1 = zext <16 x i16> %a to <16 x i32> - %2 = zext <16 x i16> %b to <16 x i32> - %3 = add nsw <16 x i32> %1, %2 - %4 = icmp ult <16 x i32> %3, - %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> - %6 = trunc <16 x i32> %5 to <16 x i16> - %7 = bitcast i16 %mask to <16 x i1> - %8 = select <16 x i1> %7, <16 x i16> %6, <16 x i16> zeroinitializer - ret <16 x i16> %8 -} - -define <16 x i16> @test_mask_adds_epu16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) { -; CHECK-LABEL: test_mask_adds_epu16_rm_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpaddusw (%rdi), %ymm0, %ymm0 -; CHECK-NEXT: retq - %b = load <16 x i16>, <16 x i16>* %ptr_b - %1 = zext <16 x i16> %a to <16 x i32> - %2 = zext <16 x i16> %b to <16 x i32> - %3 = add nsw <16 x i32> %1, %2 - %4 = icmp ult <16 x i32> %3, - %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> - %6 = trunc <16 x i32> %5 to <16 x i16> - ret <16 x i16> %6 -} - -define <16 x i16> @test_mask_adds_epu16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) { -; CHECK-LABEL: test_mask_adds_epu16_rmk_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %esi, %k1 -; CHECK-NEXT: vpaddusw (%rdi), %ymm0, %ymm1 {%k1} -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 -; CHECK-NEXT: retq - %b = load <16 x i16>, <16 x i16>* %ptr_b - %1 = zext <16 x i16> %a to <16 x i32> - %2 = zext <16 x i16> %b to <16 x i32> - %3 = add nsw <16 x i32> %1, %2 - %4 = icmp ult <16 x i32> %3, - %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> - %6 = trunc <16 x i32> %5 to <16 x i16> - %7 = bitcast i16 %mask to <16 x i1> - %8 = select <16 x i1> %7, <16 x i16> %6, <16 x i16> %passThru - ret <16 x i16> %8 -} - -define <16 x i16> @test_mask_adds_epu16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) { -; CHECK-LABEL: test_mask_adds_epu16_rmkz_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %esi, %k1 -; CHECK-NEXT: vpaddusw (%rdi), %ymm0, %ymm0 {%k1} {z} -; CHECK-NEXT: retq - %b = load <16 x i16>, <16 x i16>* %ptr_b - %1 = zext <16 x i16> %a to <16 x i32> - %2 = zext <16 x i16> %b to <16 x i32> - %3 = add nsw <16 x i32> %1, %2 - %4 = icmp ult <16 x i32> %3, - %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> - %6 = trunc <16 x i32> %5 to <16 x i16> - %7 = bitcast i16 %mask to <16 x i1> - %8 = select <16 x i1> %7, <16 x i16> %6, <16 x i16> zeroinitializer - ret <16 x i16> %8 -} - -define <8 x i16> @test_mask_subs_epu16_rr_128(<8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_mask_subs_epu16_rr_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: retq - %cmp = icmp ugt <8 x i16> %a, %b - %sel = select <8 x i1> %cmp, <8 x i16> %a, <8 x i16> %b - %sub = sub <8 x i16> %sel, %b - ret <8 x i16> %sub -} - -define <8 x i16> @test_mask_subs_epu16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) { -; CHECK-LABEL: test_mask_subs_epu16_rrk_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpsubusw %xmm1, %xmm0, %xmm2 {%k1} -; CHECK-NEXT: vmovdqa %xmm2, %xmm0 -; CHECK-NEXT: retq - %cmp = icmp ugt <8 x i16> %a, %b - %sel = select <8 x i1> %cmp, <8 x i16> %a, <8 x i16> %b - %sub = sub <8 x i16> %sel, %b - %bc = bitcast i8 %mask to <8 x i1> - %res = select <8 x i1> %bc, <8 x i16> %sub, <8 x i16> %passThru - ret <8 x i16> %res -} - -define <8 x i16> @test_mask_subs_epu16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) { -; CHECK-LABEL: test_mask_subs_epu16_rrkz_128: -; CHECK: ## 
%bb.0: -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 {%k1} {z} -; CHECK-NEXT: retq - %cmp = icmp ugt <8 x i16> %a, %b - %sel = select <8 x i1> %cmp, <8 x i16> %a, <8 x i16> %b - %sub = sub <8 x i16> %sel, %b - %bc = bitcast i8 %mask to <8 x i1> - %res = select <8 x i1> %bc, <8 x i16> %sub, <8 x i16> zeroinitializer - ret <8 x i16> %res -} - -define <8 x i16> @test_mask_subs_epu16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) { -; CHECK-LABEL: test_mask_subs_epu16_rm_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpsubusw (%rdi), %xmm0, %xmm0 -; CHECK-NEXT: retq - %b = load <8 x i16>, <8 x i16>* %ptr_b - %cmp = icmp ugt <8 x i16> %a, %b - %sel = select <8 x i1> %cmp, <8 x i16> %a, <8 x i16> %b - %sub = sub <8 x i16> %sel, %b - ret <8 x i16> %sub -} - -define <8 x i16> @test_mask_subs_epu16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) { -; CHECK-LABEL: test_mask_subs_epu16_rmk_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %esi, %k1 -; CHECK-NEXT: vpsubusw (%rdi), %xmm0, %xmm1 {%k1} -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 -; CHECK-NEXT: retq - %b = load <8 x i16>, <8 x i16>* %ptr_b - %cmp = icmp ugt <8 x i16> %a, %b - %sel = select <8 x i1> %cmp, <8 x i16> %a, <8 x i16> %b - %sub = sub <8 x i16> %sel, %b - %bc = bitcast i8 %mask to <8 x i1> - %res = select <8 x i1> %bc, <8 x i16> %sub, <8 x i16> %passThru - ret <8 x i16> %res -} - -define <8 x i16> @test_mask_subs_epu16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) { -; CHECK-LABEL: test_mask_subs_epu16_rmkz_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %esi, %k1 -; CHECK-NEXT: vpsubusw (%rdi), %xmm0, %xmm0 {%k1} {z} -; CHECK-NEXT: retq - %b = load <8 x i16>, <8 x i16>* %ptr_b - %cmp = icmp ugt <8 x i16> %a, %b - %sel = select <8 x i1> %cmp, <8 x i16> %a, <8 x i16> %b - %sub = sub <8 x i16> %sel, %b - %bc = bitcast i8 %mask to <8 x i1> - %res = select <8 x i1> %bc, <8 x i16> %sub, <8 x i16> zeroinitializer - ret <8 x i16> %res -} - -define <16 x i16> @test_mask_subs_epu16_rr_256(<16 x i16> %a, <16 x i16> %b) { -; CHECK-LABEL: test_mask_subs_epu16_rr_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpsubusw %ymm1, %ymm0, %ymm0 -; CHECK-NEXT: retq - %cmp = icmp ugt <16 x i16> %a, %b - %sel = select <16 x i1> %cmp, <16 x i16> %a, <16 x i16> %b - %sub = sub <16 x i16> %sel, %b - ret <16 x i16> %sub -} - -define <16 x i16> @test_mask_subs_epu16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) { -; CHECK-LABEL: test_mask_subs_epu16_rrk_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpsubusw %ymm1, %ymm0, %ymm2 {%k1} -; CHECK-NEXT: vmovdqa %ymm2, %ymm0 -; CHECK-NEXT: retq - %cmp = icmp ugt <16 x i16> %a, %b - %sel = select <16 x i1> %cmp, <16 x i16> %a, <16 x i16> %b - %sub = sub <16 x i16> %sel, %b - %bc = bitcast i16 %mask to <16 x i1> - %res = select <16 x i1> %bc, <16 x i16> %sub, <16 x i16> %passThru - ret <16 x i16> %res -} - -define <16 x i16> @test_mask_subs_epu16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) { -; CHECK-LABEL: test_mask_subs_epu16_rrkz_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpsubusw %ymm1, %ymm0, %ymm0 {%k1} {z} -; CHECK-NEXT: retq - %cmp = icmp ugt <16 x i16> %a, %b - %sel = select <16 x i1> %cmp, <16 x i16> %a, <16 x i16> %b - %sub = sub <16 x i16> %sel, %b - %bc = bitcast i16 %mask to <16 x i1> - %res = select <16 x i1> %bc, <16 x i16> %sub, <16 x i16> zeroinitializer - ret <16 x i16> %res -} - -define <16 x i16> @test_mask_subs_epu16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) { -; CHECK-LABEL: 
test_mask_subs_epu16_rm_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpsubusw (%rdi), %ymm0, %ymm0 -; CHECK-NEXT: retq - %b = load <16 x i16>, <16 x i16>* %ptr_b - %cmp = icmp ugt <16 x i16> %a, %b - %sel = select <16 x i1> %cmp, <16 x i16> %a, <16 x i16> %b - %sub = sub <16 x i16> %sel, %b - ret <16 x i16> %sub -} - -define <16 x i16> @test_mask_subs_epu16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) { -; CHECK-LABEL: test_mask_subs_epu16_rmk_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %esi, %k1 -; CHECK-NEXT: vpsubusw (%rdi), %ymm0, %ymm1 {%k1} -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 -; CHECK-NEXT: retq - %b = load <16 x i16>, <16 x i16>* %ptr_b - %cmp = icmp ugt <16 x i16> %a, %b - %sel = select <16 x i1> %cmp, <16 x i16> %a, <16 x i16> %b - %sub = sub <16 x i16> %sel, %b - %bc = bitcast i16 %mask to <16 x i1> - %res = select <16 x i1> %bc, <16 x i16> %sub, <16 x i16> %passThru - ret <16 x i16> %res -} - -define <16 x i16> @test_mask_subs_epu16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) { -; CHECK-LABEL: test_mask_subs_epu16_rmkz_256: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %esi, %k1 -; CHECK-NEXT: vpsubusw (%rdi), %ymm0, %ymm0 {%k1} {z} -; CHECK-NEXT: retq - %b = load <16 x i16>, <16 x i16>* %ptr_b - %cmp = icmp ugt <16 x i16> %a, %b - %sel = select <16 x i1> %cmp, <16 x i16> %a, <16 x i16> %b - %sub = sub <16 x i16> %sel, %b - %bc = bitcast i16 %mask to <16 x i1> - %res = select <16 x i1> %bc, <16 x i16> %sub, <16 x i16> zeroinitializer - ret <16 x i16> %res -} - -define <16 x i8> @test_mask_adds_epi8_rr_128(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_mask_adds_epi8_rr_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: retq - %1 = sext <16 x i8> %a to <16 x i16> - %2 = sext <16 x i8> %b to <16 x i16> - %3 = add nsw <16 x i16> %1, %2 - %4 = icmp slt <16 x i16> %3, - %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> - %6 = icmp sgt <16 x i16> %5, - %7 = select <16 x i1> %6, <16 x i16> %5, <16 x i16> - %8 = trunc <16 x i16> %7 to <16 x i8> - ret <16 x i8> %8 -} - -define <16 x i8> @test_mask_adds_epi8_rrk_128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) { -; CHECK-LABEL: test_mask_adds_epi8_rrk_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpaddsb %xmm1, %xmm0, %xmm2 {%k1} -; CHECK-NEXT: vmovdqa %xmm2, %xmm0 -; CHECK-NEXT: retq - %1 = sext <16 x i8> %a to <16 x i16> - %2 = sext <16 x i8> %b to <16 x i16> - %3 = add nsw <16 x i16> %1, %2 - %4 = icmp slt <16 x i16> %3, - %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> - %6 = icmp sgt <16 x i16> %5, - %7 = select <16 x i1> %6, <16 x i16> %5, <16 x i16> - %8 = trunc <16 x i16> %7 to <16 x i8> - %9 = bitcast i16 %mask to <16 x i1> - %10 = select <16 x i1> %9, <16 x i8> %8, <16 x i8> %passThru - ret <16 x i8> %10 -} - -define <16 x i8> @test_mask_adds_epi8_rrkz_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) { -; CHECK-LABEL: test_mask_adds_epi8_rrkz_128: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 {%k1} {z} -; CHECK-NEXT: retq - %1 = sext <16 x i8> %a to <16 x i16> - %2 = sext <16 x i8> %b to <16 x i16> - %3 = add nsw <16 x i16> %1, %2 - %4 = icmp slt <16 x i16> %3, - %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> - %6 = icmp sgt <16 x i16> %5, - %7 = select <16 x i1> %6, <16 x i16> %5, <16 x i16> - %8 = trunc <16 x i16> %7 to <16 x i8> - %9 = bitcast i16 %mask to <16 x i1> - %10 = select <16 x i1> %9, <16 x i8> %8, <16 x i8> zeroinitializer - ret <16 x i8> %10 -} - -define 
-define <16 x i8> @test_mask_adds_epi8_rm_128(<16 x i8> %a, <16 x i8>* %ptr_b) {
-; CHECK-LABEL: test_mask_adds_epi8_rm_128:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: vpaddsb (%rdi), %xmm0, %xmm0
-; CHECK-NEXT: retq
- %b = load <16 x i8>, <16 x i8>* %ptr_b
- %1 = sext <16 x i8> %a to <16 x i16>
- %2 = sext <16 x i8> %b to <16 x i16>
- %3 = add nsw <16 x i16> %1, %2
- %4 = icmp slt <16 x i16> %3, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
- %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
- %6 = icmp sgt <16 x i16> %5, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
- %7 = select <16 x i1> %6, <16 x i16> %5, <16 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
- %8 = trunc <16 x i16> %7 to <16 x i8>
- ret <16 x i8> %8
-}
-
-define <16 x i8> @test_mask_adds_epi8_rmk_128(<16 x i8> %a, <16 x i8>* %ptr_b, <16 x i8> %passThru, i16 %mask) {
-; CHECK-LABEL: test_mask_adds_epi8_rmk_128:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovd %esi, %k1
-; CHECK-NEXT: vpaddsb (%rdi), %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
-; CHECK-NEXT: retq
- %b = load <16 x i8>, <16 x i8>* %ptr_b
- %1 = sext <16 x i8> %a to <16 x i16>
- %2 = sext <16 x i8> %b to <16 x i16>
- %3 = add nsw <16 x i16> %1, %2
- %4 = icmp slt <16 x i16> %3, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
- %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
- %6 = icmp sgt <16 x i16> %5, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
- %7 = select <16 x i1> %6, <16 x i16> %5, <16 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
- %8 = trunc <16 x i16> %7 to <16 x i8>
- %9 = bitcast i16 %mask to <16 x i1>
- %10 = select <16 x i1> %9, <16 x i8> %8, <16 x i8> %passThru
- ret <16 x i8> %10
-}
-
-define <16 x i8> @test_mask_adds_epi8_rmkz_128(<16 x i8> %a, <16 x i8>* %ptr_b, i16 %mask) {
-; CHECK-LABEL: test_mask_adds_epi8_rmkz_128:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovd %esi, %k1
-; CHECK-NEXT: vpaddsb (%rdi), %xmm0, %xmm0 {%k1} {z}
-; CHECK-NEXT: retq
- %b = load <16 x i8>, <16 x i8>* %ptr_b
- %1 = sext <16 x i8> %a to <16 x i16>
- %2 = sext <16 x i8> %b to <16 x i16>
- %3 = add nsw <16 x i16> %1, %2
- %4 = icmp slt <16 x i16> %3, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
- %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
- %6 = icmp sgt <16 x i16> %5, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
- %7 = select <16 x i1> %6, <16 x i16> %5, <16 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
- %8 = trunc <16 x i16> %7 to <16 x i8>
- %9 = bitcast i16 %mask to <16 x i1>
- %10 = select <16 x i1> %9, <16 x i8> %8, <16 x i8> zeroinitializer
- ret <16 x i8> %10
-}
-
-define <32 x i8> @test_mask_adds_epi8_rr_256(<32 x i8> %a, <32 x i8> %b) {
-; CHECK-LABEL: test_mask_adds_epi8_rr_256:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: vpaddsb %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: retq
- %1 = sext <32 x i8> %a to <32 x i16>
- %2 = sext <32 x i8> %b to <32 x i16>
- %3 = add nsw <32 x i16> %1, %2
- %4 = icmp slt <32 x i16> %3, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
- %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
- %6 = icmp sgt <32 x i16> %5, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
- %7 = select <32 x i1> %6, <32 x i16> %5, <32 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
- %8 = trunc <32 x i16> %7 to <32 x i8>
- ret <32 x i8> %8
-}
-
-define <32 x i8> @test_mask_adds_epi8_rrk_256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) {
-; CHECK-LABEL: test_mask_adds_epi8_rrk_256:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpaddsb %ymm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: vmovdqa %ymm2, %ymm0
-; CHECK-NEXT: retq
- %1 = sext <32 x i8> %a to <32 x i16>
- %2 = sext <32 x i8> %b to <32 x i16>
- %3 = add nsw <32 x i16> %1, %2
- %4 = icmp slt <32 x i16> %3, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
- %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
- %6 = icmp sgt <32 x i16> %5, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
- %7 = select <32 x i1> %6, <32 x i16> %5, <32 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
- %8 = trunc <32 x i16> %7 to <32 x i8>
- %9 = bitcast i32 %mask to <32 x i1>
- %10 = select <32 x i1> %9, <32 x i8> %8, <32 x i8> %passThru
- ret <32 x i8> %10
-}
-
-define <32 x i8> @test_mask_adds_epi8_rrkz_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) {
-; CHECK-LABEL: test_mask_adds_epi8_rrkz_256:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpaddsb %ymm1, %ymm0, %ymm0 {%k1} {z}
-; CHECK-NEXT: retq
- %1 = sext <32 x i8> %a to <32 x i16>
- %2 = sext <32 x i8> %b to <32 x i16>
- %3 = add nsw <32 x i16> %1, %2
- %4 = icmp slt <32 x i16> %3, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
- %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
- %6 = icmp sgt <32 x i16> %5, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
- %7 = select <32 x i1> %6, <32 x i16> %5, <32 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
- %8 = trunc <32 x i16> %7 to <32 x i8>
- %9 = bitcast i32 %mask to <32 x i1>
- %10 = select <32 x i1> %9, <32 x i8> %8, <32 x i8> zeroinitializer
- ret <32 x i8> %10
-}
-
-define <32 x i8> @test_mask_adds_epi8_rm_256(<32 x i8> %a, <32 x i8>* %ptr_b) {
-; CHECK-LABEL: test_mask_adds_epi8_rm_256:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: vpaddsb (%rdi), %ymm0, %ymm0
-; CHECK-NEXT: retq
- %b = load <32 x i8>, <32 x i8>* %ptr_b
- %1 = sext <32 x i8> %a to <32 x i16>
- %2 = sext <32 x i8> %b to <32 x i16>
- %3 = add nsw <32 x i16> %1, %2
- %4 = icmp slt <32 x i16> %3, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
- %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
- %6 = icmp sgt <32 x i16> %5, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
- %7 = select <32 x i1> %6, <32 x i16> %5, <32 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
- %8 = trunc <32 x i16> %7 to <32 x i8>
- ret <32 x i8> %8
-}
-
-define <32 x i8> @test_mask_adds_epi8_rmk_256(<32 x i8> %a, <32 x i8>* %ptr_b, <32 x i8> %passThru, i32 %mask) {
-; CHECK-LABEL: test_mask_adds_epi8_rmk_256:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovd %esi, %k1
-; CHECK-NEXT: vpaddsb (%rdi), %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
-; CHECK-NEXT: retq
- %b = load <32 x i8>, <32 x i8>* %ptr_b
- %1 = sext <32 x i8> %a to <32 x i16>
- %2 = sext <32 x i8> %b to <32 x i16>
- %3 = add nsw <32 x i16> %1, %2
- %4 = icmp slt <32 x i16> %3, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
- %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
- %6 = icmp sgt <32 x i16> %5, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
- %7 = select <32 x i1> %6, <32 x i16> %5, <32 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
- %8 = trunc <32 x i16> %7 to <32 x i8>
- %9 = bitcast i32 %mask to <32 x i1>
- %10 = select <32 x i1> %9, <32 x i8> %8, <32 x i8> %passThru
- ret <32 x i8> %10
-}
-
-define <32 x i8> @test_mask_adds_epi8_rmkz_256(<32 x i8> %a, <32 x i8>* %ptr_b, i32 %mask) {
-; CHECK-LABEL: test_mask_adds_epi8_rmkz_256:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovd %esi, %k1
-; CHECK-NEXT: vpaddsb (%rdi), %ymm0, %ymm0 {%k1} {z}
-; CHECK-NEXT: retq
- %b = load <32 x i8>, <32 x i8>* %ptr_b
- %1 = sext <32 x i8> %a to <32 x i16>
- %2 = sext <32 x i8> %b to <32 x i16>
- %3 = add nsw <32 x i16> %1, %2
- %4 = icmp slt <32 x i16> %3, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
- %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
- %6 = icmp sgt <32 x i16> %5, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
- %7 = select <32 x i1> %6, <32 x i16> %5, <32 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
- %8 = trunc <32 x i16> %7 to <32 x i8>
- %9 = bitcast i32 %mask to <32 x i1>
- %10 = select <32 x i1> %9, <32 x i8> %8, <32 x i8> zeroinitializer
- ret <32 x i8> %10
-}
-
-define <16 x i8> @test_mask_subs_epi8_rr_128(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-LABEL: test_mask_subs_epi8_rr_128:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: vpsubsb %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retq
- %1 = sext <16 x i8> %a to <16 x i16>
- %2 = sext <16 x i8> %b to <16 x i16>
- %3 = sub nsw <16 x i16> %1, %2
- %4 = icmp slt <16 x i16> %3, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
- %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
- %6 = icmp sgt <16 x i16> %5, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
- %7 = select <16 x i1> %6, <16 x i16> %5, <16 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
- %8 = trunc <16 x i16> %7 to <16 x i8>
- ret <16 x i8> %8
-}
-
-define <16 x i8> @test_mask_subs_epi8_rrk_128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) {
-; CHECK-LABEL: test_mask_subs_epi8_rrk_128:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpsubsb %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vmovdqa %xmm2, %xmm0
-; CHECK-NEXT: retq
- %1 = sext <16 x i8> %a to <16 x i16>
- %2 = sext <16 x i8> %b to <16 x i16>
- %3 = sub nsw <16 x i16> %1, %2
- %4 = icmp slt <16 x i16> %3, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
- %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
- %6 = icmp sgt <16 x i16> %5, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
- %7 = select <16 x i1> %6, <16 x i16> %5, <16 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
- %8 = trunc <16 x i16> %7 to <16 x i8>
- %9 = bitcast i16 %mask to <16 x i1>
- %10 = select <16 x i1> %9, <16 x i8> %8, <16 x i8> %passThru
- ret <16 x i8> %10
-}
-
-define <16 x i8> @test_mask_subs_epi8_rrkz_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) {
-; CHECK-LABEL: test_mask_subs_epi8_rrkz_128:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 {%k1} {z}
-; CHECK-NEXT: retq
- %1 = sext <16 x i8> %a to <16 x i16>
- %2 = sext <16 x i8> %b to <16 x i16>
- %3 = sub nsw <16 x i16> %1, %2
- %4 = icmp slt <16 x i16> %3, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
- %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
- %6 = icmp sgt <16 x i16> %5, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
- %7 = select <16 x i1> %6, <16 x i16> %5, <16 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
- %8 = trunc <16 x i16> %7 to <16 x i8>
- %9 = bitcast i16 %mask to <16 x i1>
- %10 = select <16 x i1> %9, <16 x i8> %8, <16 x i8> zeroinitializer
- ret <16 x i8> %10
-}
-
-define <16 x i8> @test_mask_subs_epi8_rm_128(<16 x i8> %a, <16 x i8>* %ptr_b) {
-; CHECK-LABEL: test_mask_subs_epi8_rm_128:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: vpsubsb (%rdi), %xmm0, %xmm0
-; CHECK-NEXT: retq
- %b = load <16 x i8>, <16 x i8>* %ptr_b
- %1 = sext <16 x i8> %a to <16 x i16>
- %2 = sext <16 x i8> %b to <16 x i16>
- %3 = sub nsw <16 x i16> %1, %2
- %4 = icmp slt <16 x i16> %3, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
- %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
- %6 = icmp sgt <16 x i16> %5, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
- %7 = select <16 x i1> %6, <16 x i16> %5, <16 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
- %8 = trunc <16 x i16> %7 to <16 x i8>
- ret <16 x i8> %8
-}
-
-define <16 x i8> @test_mask_subs_epi8_rmk_128(<16 x i8> %a, <16 x i8>* %ptr_b, <16 x i8> %passThru, i16 %mask) {
-; CHECK-LABEL: test_mask_subs_epi8_rmk_128:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovd %esi, %k1
-; CHECK-NEXT: vpsubsb (%rdi), %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
-; CHECK-NEXT: retq
- %b = load <16 x i8>, <16 x i8>* %ptr_b
- %1 = sext <16 x i8> %a to <16 x i16>
- %2 = sext <16 x i8> %b to <16 x i16>
- %3 = sub nsw <16 x i16> %1, %2
- %4 = icmp slt <16 x i16> %3, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
- %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
- %6 = icmp sgt <16 x i16> %5, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
- %7 = select <16 x i1> %6, <16 x i16> %5, <16 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
- %8 = trunc <16 x i16> %7 to <16 x i8>
- %9 = bitcast i16 %mask to <16 x i1>
- %10 = select <16 x i1> %9, <16 x i8> %8, <16 x i8> %passThru
- ret <16 x i8> %10
-}
-
-define <16 x i8> @test_mask_subs_epi8_rmkz_128(<16 x i8> %a, <16 x i8>* %ptr_b, i16 %mask) {
-; CHECK-LABEL: test_mask_subs_epi8_rmkz_128:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovd %esi, %k1
-; CHECK-NEXT: vpsubsb (%rdi), %xmm0, %xmm0 {%k1} {z}
-; CHECK-NEXT: retq
- %b = load <16 x i8>, <16 x i8>* %ptr_b
- %1 = sext <16 x i8> %a to <16 x i16>
- %2 = sext <16 x i8> %b to <16 x i16>
- %3 = sub nsw <16 x i16> %1, %2
- %4 = icmp slt <16 x i16> %3, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
- %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
- %6 = icmp sgt <16 x i16> %5, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
- %7 = select <16 x i1> %6, <16 x i16> %5, <16 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
- %8 = trunc <16 x i16> %7 to <16 x i8>
- %9 = bitcast i16 %mask to <16 x i1>
- %10 = select <16 x i1> %9, <16 x i8> %8, <16 x i8> zeroinitializer
- ret <16 x i8> %10
-}
-
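The k/kz flavors above differ from the plain ones only in their tail: the scalar GPR mask is bitcast to one i1 per lane and folded in with a select. A minimal sketch of that tail, with hypothetical names (not lines from this patch):

define <16 x i8> @mask_tail_sketch(<16 x i8> %sat, <16 x i8> %passThru, i16 %mask) {
  %m   = bitcast i16 %mask to <16 x i1>                  ; bit i of %mask governs lane i
  %res = select <16 x i1> %m, <16 x i8> %sat, <16 x i8> %passThru
  ret <16 x i8> %res                                     ; the kz variants select against zeroinitializer instead
}

This select is what becomes the {%k1} predication (and {z} zero-masking) on the AVX-512 instruction in the CHECK lines.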
-define <32 x i8> @test_mask_subs_epi8_rr_256(<32 x i8> %a, <32 x i8> %b) {
-; CHECK-LABEL: test_mask_subs_epi8_rr_256:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: vpsubsb %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: retq
- %1 = sext <32 x i8> %a to <32 x i16>
- %2 = sext <32 x i8> %b to <32 x i16>
- %3 = sub nsw <32 x i16> %1, %2
- %4 = icmp slt <32 x i16> %3, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
- %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
- %6 = icmp sgt <32 x i16> %5, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
- %7 = select <32 x i1> %6, <32 x i16> %5, <32 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
- %8 = trunc <32 x i16> %7 to <32 x i8>
- ret <32 x i8> %8
-}
-
-define <32 x i8> @test_mask_subs_epi8_rrk_256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) {
-; CHECK-LABEL: test_mask_subs_epi8_rrk_256:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpsubsb %ymm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: vmovdqa %ymm2, %ymm0
-; CHECK-NEXT: retq
- %1 = sext <32 x i8> %a to <32 x i16>
- %2 = sext <32 x i8> %b to <32 x i16>
- %3 = sub nsw <32 x i16> %1, %2
- %4 = icmp slt <32 x i16> %3, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
- %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
- %6 = icmp sgt <32 x i16> %5, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
- %7 = select <32 x i1> %6, <32 x i16> %5, <32 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
- %8 = trunc <32 x i16> %7 to <32 x i8>
- %9 = bitcast i32 %mask to <32 x i1>
- %10 = select <32 x i1> %9, <32 x i8> %8, <32 x i8> %passThru
- ret <32 x i8> %10
-}
-
-define <32 x i8> @test_mask_subs_epi8_rrkz_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) {
-; CHECK-LABEL: test_mask_subs_epi8_rrkz_256:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpsubsb %ymm1, %ymm0, %ymm0 {%k1} {z}
-; CHECK-NEXT: retq
- %1 = sext <32 x i8> %a to <32 x i16>
- %2 = sext <32 x i8> %b to <32 x i16>
- %3 = sub nsw <32 x i16> %1, %2
- %4 = icmp slt <32 x i16> %3, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
- %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
- %6 = icmp sgt <32 x i16> %5, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
- %7 = select <32 x i1> %6, <32 x i16> %5, <32 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
- %8 = trunc <32 x i16> %7 to <32 x i8>
- %9 = bitcast i32 %mask to <32 x i1>
- %10 = select <32 x i1> %9, <32 x i8> %8, <32 x i8> zeroinitializer
- ret <32 x i8> %10
-}
-
-define <32 x i8> @test_mask_subs_epi8_rm_256(<32 x i8> %a, <32 x i8>* %ptr_b) {
-; CHECK-LABEL: test_mask_subs_epi8_rm_256:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: vpsubsb (%rdi), %ymm0, %ymm0
-; CHECK-NEXT: retq
- %b = load <32 x i8>, <32 x i8>* %ptr_b
- %1 = sext <32 x i8> %a to <32 x i16>
- %2 = sext <32 x i8> %b to <32 x i16>
- %3 = sub nsw <32 x i16> %1, %2
- %4 = icmp slt <32 x i16> %3, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
- %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
- %6 = icmp sgt <32 x i16> %5, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
- %7 = select <32 x i1> %6, <32 x i16> %5, <32 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
- %8 = trunc <32 x i16> %7 to <32 x i8>
- ret <32 x i8> %8
-}
-
-define <32 x i8> @test_mask_subs_epi8_rmk_256(<32 x i8> %a, <32 x i8>* %ptr_b, <32 x i8> %passThru, i32 %mask) {
-; CHECK-LABEL: test_mask_subs_epi8_rmk_256:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovd %esi, %k1
-; CHECK-NEXT: vpsubsb (%rdi), %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
-; CHECK-NEXT: retq
- %b = load <32 x i8>, <32 x i8>* %ptr_b
- %1 = sext <32 x i8> %a to <32 x i16>
- %2 = sext <32 x i8> %b to <32 x i16>
- %3 = sub nsw <32 x i16> %1, %2
- %4 = icmp slt <32 x i16> %3, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
- %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
- %6 = icmp sgt <32 x i16> %5, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
- %7 = select <32 x i1> %6, <32 x i16> %5, <32 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
- %8 = trunc <32 x i16> %7 to <32 x i8>
- %9 = bitcast i32 %mask to <32 x i1>
- %10 = select <32 x i1> %9, <32 x i8> %8, <32 x i8> %passThru
- ret <32 x i8> %10
-}
-
-define <32 x i8> @test_mask_subs_epi8_rmkz_256(<32 x i8> %a, <32 x i8>* %ptr_b, i32 %mask) {
-; CHECK-LABEL: test_mask_subs_epi8_rmkz_256:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovd %esi, %k1
-; CHECK-NEXT: vpsubsb (%rdi), %ymm0, %ymm0 {%k1} {z}
-; CHECK-NEXT: retq
- %b = load <32 x i8>, <32 x i8>* %ptr_b
- %1 = sext <32 x i8> %a to <32 x i16>
- %2 = sext <32 x i8> %b to <32 x i16>
- %3 = sub nsw <32 x i16> %1, %2
- %4 = icmp slt <32 x i16> %3, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
- %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
- %6 = icmp sgt <32 x i16> %5, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
- %7 = select <32 x i1> %6, <32 x i16> %5, <32 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
- %8 = trunc <32 x i16> %7 to <32 x i8>
- %9 = bitcast i32 %mask to <32 x i1>
- %10 = select <32 x i1> %9, <32 x i8> %8, <32 x i8> zeroinitializer
- ret <32 x i8> %10
-}
-
-define <16 x i8> @test_mask_adds_epu8_rr_128(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-LABEL: test_mask_adds_epu8_rr_128:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: vpaddusb %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retq
- %1 = zext <16 x i8> %a to <16 x i16>
- %2 = zext <16 x i8> %b to <16 x i16>
- %3 = add nsw <16 x i16> %1, %2
- %4 = icmp ult <16 x i16> %3, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
- %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
- %6 = trunc <16 x i16> %5 to <16 x i8>
- ret <16 x i8> %6
-}
-
-define <16 x i8> @test_mask_adds_epu8_rrk_128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) {
-; CHECK-LABEL: test_mask_adds_epu8_rrk_128:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpaddusb %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vmovdqa %xmm2, %xmm0
-; CHECK-NEXT: retq
- %1 = zext <16 x i8> %a to <16 x i16>
- %2 = zext <16 x i8> %b to <16 x i16>
- %3 = add nsw <16 x i16> %1, %2
- %4 = icmp ult <16 x i16> %3, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
- %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
- %6 = trunc <16 x i16> %5 to <16 x i8>
- %7 = bitcast i16 %mask to <16 x i1>
- %8 = select <16 x i1> %7, <16 x i8> %6, <16 x i8> %passThru
- ret <16 x i8> %8
-}
-
-define <16 x i8> @test_mask_adds_epu8_rrkz_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) {
-; CHECK-LABEL: test_mask_adds_epu8_rrkz_128:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 {%k1} {z}
-; CHECK-NEXT: retq
- %1 = zext <16 x i8> %a to <16 x i16>
- %2 = zext <16 x i8> %b to <16 x i16>
- %3 = add nsw <16 x i16> %1, %2
- %4 = icmp ult <16 x i16> %3, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
- %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
- %6 = trunc <16 x i16> %5 to <16 x i8>
- %7 = bitcast i16 %mask to <16 x i1>
- %8 = select <16 x i1> %7, <16 x i8> %6, <16 x i8> zeroinitializer
- ret <16 x i8> %8
-}
-
-define <16 x i8> @test_mask_adds_epu8_rm_128(<16 x i8> %a, <16 x i8>* %ptr_b) {
-; CHECK-LABEL: test_mask_adds_epu8_rm_128:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: vpaddusb (%rdi), %xmm0, %xmm0
-; CHECK-NEXT: retq
- %b = load <16 x i8>, <16 x i8>* %ptr_b
- %1 = zext <16 x i8> %a to <16 x i16>
- %2 = zext <16 x i8> %b to <16 x i16>
- %3 = add nsw <16 x i16> %1, %2
- %4 = icmp ult <16 x i16> %3, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
- %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
- %6 = trunc <16 x i16> %5 to <16 x i8>
- ret <16 x i8> %6
-}
-
-define <16 x i8> @test_mask_adds_epu8_rmk_128(<16 x i8> %a, <16 x i8>* %ptr_b, <16 x i8> %passThru, i16 %mask) {
-; CHECK-LABEL: test_mask_adds_epu8_rmk_128:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovd %esi, %k1
-; CHECK-NEXT: vpaddusb (%rdi), %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
-; CHECK-NEXT: retq
- %b = load <16 x i8>, <16 x i8>* %ptr_b
- %1 = zext <16 x i8> %a to <16 x i16>
- %2 = zext <16 x i8> %b to <16 x i16>
- %3 = add nsw <16 x i16> %1, %2
- %4 = icmp ult <16 x i16> %3, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
- %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
- %6 = trunc <16 x i16> %5 to <16 x i8>
- %7 = bitcast i16 %mask to <16 x i1>
- %8 = select <16 x i1> %7, <16 x i8> %6, <16 x i8> %passThru
- ret <16 x i8> %8
-}
-
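The unsigned flavor needs only an upper clamp: after zero-extension the sum cannot go below zero, so a single icmp ult/select against the type maximum suffices. Sketched on two lanes, mirroring the tests above, with hypothetical names (not lines from this patch):

define <2 x i8> @uadd_sat_sketch(<2 x i8> %a, <2 x i8> %b) {
  %wa  = zext <2 x i8> %a to <2 x i16>
  %wb  = zext <2 x i8> %b to <2 x i16>
  %sum = add nsw <2 x i16> %wa, %wb        ; at most 255 + 255 = 510, well inside i16
  %c   = icmp ult <2 x i16> %sum, <i16 255, i16 255>
  %min = select <2 x i1> %c, <2 x i16> %sum, <2 x i16> <i16 255, i16 255>
  %res = trunc <2 x i16> %min to <2 x i8>  ; umin against 255, then a lossless narrow
  ret <2 x i8> %res
}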
-define <16 x i8> @test_mask_adds_epu8_rmkz_128(<16 x i8> %a, <16 x i8>* %ptr_b, i16 %mask) {
-; CHECK-LABEL: test_mask_adds_epu8_rmkz_128:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovd %esi, %k1
-; CHECK-NEXT: vpaddusb (%rdi), %xmm0, %xmm0 {%k1} {z}
-; CHECK-NEXT: retq
- %b = load <16 x i8>, <16 x i8>* %ptr_b
- %1 = zext <16 x i8> %a to <16 x i16>
- %2 = zext <16 x i8> %b to <16 x i16>
- %3 = add nsw <16 x i16> %1, %2
- %4 = icmp ult <16 x i16> %3, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
- %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
- %6 = trunc <16 x i16> %5 to <16 x i8>
- %7 = bitcast i16 %mask to <16 x i1>
- %8 = select <16 x i1> %7, <16 x i8> %6, <16 x i8> zeroinitializer
- ret <16 x i8> %8
-}
-
-define <32 x i8> @test_mask_adds_epu8_rr_256(<32 x i8> %a, <32 x i8> %b) {
-; CHECK-LABEL: test_mask_adds_epu8_rr_256:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: vpaddusb %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: retq
- %1 = zext <32 x i8> %a to <32 x i16>
- %2 = zext <32 x i8> %b to <32 x i16>
- %3 = add nsw <32 x i16> %1, %2
- %4 = icmp ult <32 x i16> %3, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
- %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
- %6 = trunc <32 x i16> %5 to <32 x i8>
- ret <32 x i8> %6
-}
-
-define <32 x i8> @test_mask_adds_epu8_rrk_256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) {
-; CHECK-LABEL: test_mask_adds_epu8_rrk_256:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpaddusb %ymm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: vmovdqa %ymm2, %ymm0
-; CHECK-NEXT: retq
- %1 = zext <32 x i8> %a to <32 x i16>
- %2 = zext <32 x i8> %b to <32 x i16>
- %3 = add nsw <32 x i16> %1, %2
- %4 = icmp ult <32 x i16> %3, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
- %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
- %6 = trunc <32 x i16> %5 to <32 x i8>
- %7 = bitcast i32 %mask to <32 x i1>
- %8 = select <32 x i1> %7, <32 x i8> %6, <32 x i8> %passThru
- ret <32 x i8> %8
-}
-
-define <32 x i8> @test_mask_adds_epu8_rrkz_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) {
-; CHECK-LABEL: test_mask_adds_epu8_rrkz_256:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpaddusb %ymm1, %ymm0, %ymm0 {%k1} {z}
-; CHECK-NEXT: retq
- %1 = zext <32 x i8> %a to <32 x i16>
- %2 = zext <32 x i8> %b to <32 x i16>
- %3 = add nsw <32 x i16> %1, %2
- %4 = icmp ult <32 x i16> %3, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
- %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
- %6 = trunc <32 x i16> %5 to <32 x i8>
- %7 = bitcast i32 %mask to <32 x i1>
- %8 = select <32 x i1> %7, <32 x i8> %6, <32 x i8> zeroinitializer
- ret <32 x i8> %8
-}
-
-define <32 x i8> @test_mask_adds_epu8_rm_256(<32 x i8> %a, <32 x i8>* %ptr_b) {
-; CHECK-LABEL: test_mask_adds_epu8_rm_256:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: vpaddusb (%rdi), %ymm0, %ymm0
-; CHECK-NEXT: retq
- %b = load <32 x i8>, <32 x i8>* %ptr_b
- %1 = zext <32 x i8> %a to <32 x i16>
- %2 = zext <32 x i8> %b to <32 x i16>
- %3 = add nsw <32 x i16> %1, %2
- %4 = icmp ult <32 x i16> %3, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
- %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
- %6 = trunc <32 x i16> %5 to <32 x i8>
- ret <32 x i8> %6
-}
-
-define <32 x i8> @test_mask_adds_epu8_rmk_256(<32 x i8> %a, <32 x i8>* %ptr_b, <32 x i8> %passThru, i32 %mask) {
-; CHECK-LABEL: test_mask_adds_epu8_rmk_256:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovd %esi, %k1
-; CHECK-NEXT: vpaddusb (%rdi), %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
-; CHECK-NEXT: retq
- %b = load <32 x i8>, <32 x i8>* %ptr_b
- %1 = zext <32 x i8> %a to <32 x i16>
- %2 = zext <32 x i8> %b to <32 x i16>
- %3 = add nsw <32 x i16> %1, %2
- %4 = icmp ult <32 x i16> %3, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
- %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
- %6 = trunc <32 x i16> %5 to <32 x i8>
- %7 = bitcast i32 %mask to <32 x i1>
- %8 = select <32 x i1> %7, <32 x i8> %6, <32 x i8> %passThru
- ret <32 x i8> %8
-}
-
-define <32 x i8> @test_mask_adds_epu8_rmkz_256(<32 x i8> %a, <32 x i8>* %ptr_b, i32 %mask) {
-; CHECK-LABEL: test_mask_adds_epu8_rmkz_256:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovd %esi, %k1
-; CHECK-NEXT: vpaddusb (%rdi), %ymm0, %ymm0 {%k1} {z}
-; CHECK-NEXT: retq
- %b = load <32 x i8>, <32 x i8>* %ptr_b
- %1 = zext <32 x i8> %a to <32 x i16>
- %2 = zext <32 x i8> %b to <32 x i16>
- %3 = add nsw <32 x i16> %1, %2
- %4 = icmp ult <32 x i16> %3, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
- %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
- %6 = trunc <32 x i16> %5 to <32 x i8>
- %7 = bitcast i32 %mask to <32 x i1>
- %8 = select <32 x i1> %7, <32 x i8> %6, <32 x i8> zeroinitializer
- ret <32 x i8> %8
-}
-
-define <16 x i8> @test_mask_subs_epu8_rr_128(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-LABEL: test_mask_subs_epu8_rr_128:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: vpsubusb %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retq
- %cmp = icmp ugt <16 x i8> %a, %b
- %sel = select <16 x i1> %cmp, <16 x i8> %a, <16 x i8> %b
- %sub = sub <16 x i8> %sel, %b
- ret <16 x i8> %sub
-}
-
-define <16 x i8> @test_mask_subs_epu8_rrk_128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) {
-; CHECK-LABEL: test_mask_subs_epu8_rrk_128:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpsubusb %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vmovdqa %xmm2, %xmm0
-; CHECK-NEXT: retq
- %cmp = icmp ugt <16 x i8> %a, %b
- %sel = select <16 x i1> %cmp, <16 x i8> %a, <16 x i8> %b
- %sub = sub <16 x i8> %sel, %b
- %bc = bitcast i16 %mask to <16 x i1>
- %res = select <16 x i1> %bc, <16 x i8> %sub, <16 x i8> %passThru
- ret <16 x i8> %res
-}
-
-define <16 x i8> @test_mask_subs_epu8_rrkz_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) {
-; CHECK-LABEL: test_mask_subs_epu8_rrkz_128:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 {%k1} {z}
-; CHECK-NEXT: retq
- %cmp = icmp ugt <16 x i8> %a, %b
- %sel = select <16 x i1> %cmp, <16 x i8> %a, <16 x i8> %b
- %sub = sub <16 x i8> %sel, %b
- %bc = bitcast i16 %mask to <16 x i1>
- %res = select <16 x i1> %bc, <16 x i8> %sub, <16 x i8> zeroinitializer
- ret <16 x i8> %res
-}
-
-define <16 x i8> @test_mask_subs_epu8_rm_128(<16 x i8> %a, <16 x i8>* %ptr_b) {
-; CHECK-LABEL: test_mask_subs_epu8_rm_128:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: vpsubusb (%rdi), %xmm0, %xmm0
-; CHECK-NEXT: retq
- %b = load <16 x i8>, <16 x i8>* %ptr_b
- %cmp = icmp ugt <16 x i8> %a, %b
- %sel = select <16 x i1> %cmp, <16 x i8> %a, <16 x i8> %b
- %sub = sub <16 x i8> %sel, %b
- ret <16 x i8> %sub
-}
-
-define <16 x i8> @test_mask_subs_epu8_rmk_128(<16 x i8> %a, <16 x i8>* %ptr_b, <16 x i8> %passThru, i16 %mask) {
-; CHECK-LABEL: test_mask_subs_epu8_rmk_128:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovd %esi, %k1
-; CHECK-NEXT: vpsubusb (%rdi), %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
-; CHECK-NEXT: retq
- %b = load <16 x i8>, <16 x i8>* %ptr_b
- %cmp = icmp ugt <16 x i8> %a, %b
- %sel = select <16 x i1> %cmp, <16 x i8> %a, <16 x i8> %b
- %sub = sub <16 x i8> %sel, %b
- %bc = bitcast i16 %mask to <16 x i1>
- %res = select <16 x i1> %bc, <16 x i8> %sub, <16 x i8> %passThru
- ret <16 x i8> %res
-}
-
-define <16 x i8> @test_mask_subs_epu8_rmkz_128(<16 x i8> %a, <16 x i8>* %ptr_b, i16 %mask) {
-; CHECK-LABEL: test_mask_subs_epu8_rmkz_128:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovd %esi, %k1
-; CHECK-NEXT: vpsubusb (%rdi), %xmm0, %xmm0 {%k1} {z}
-; CHECK-NEXT: retq
- %b = load <16 x i8>, <16 x i8>* %ptr_b
- %cmp = icmp ugt <16 x i8> %a, %b
- %sel = select <16 x i1> %cmp, <16 x i8> %a, <16 x i8> %b
- %sub = sub <16 x i8> %sel, %b
- %bc = bitcast i16 %mask to <16 x i1>
- %res = select <16 x i1> %bc, <16 x i8> %sub, <16 x i8> zeroinitializer
- ret <16 x i8> %res
-}
-
-define <32 x i8> @test_mask_subs_epu8_rr_256(<32 x i8> %a, <32 x i8> %b) {
-; CHECK-LABEL: test_mask_subs_epu8_rr_256:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: vpsubusb %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: retq
- %cmp = icmp ugt <32 x i8> %a, %b
- %sel = select <32 x i1> %cmp, <32 x i8> %a, <32 x i8> %b
- %sub = sub <32 x i8> %sel, %b
- ret <32 x i8> %sub
-}
-
-define <32 x i8> @test_mask_subs_epu8_rrk_256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) {
-; CHECK-LABEL: test_mask_subs_epu8_rrk_256:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpsubusb %ymm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: vmovdqa %ymm2, %ymm0
-; CHECK-NEXT: retq
- %cmp = icmp ugt <32 x i8> %a, %b
- %sel = select <32 x i1> %cmp, <32 x i8> %a, <32 x i8> %b
- %sub = sub <32 x i8> %sel, %b
- %bc = bitcast i32 %mask to <32 x i1>
- %res = select <32 x i1> %bc, <32 x i8> %sub, <32 x i8> %passThru
- ret <32 x i8> %res
-}
-
-define <32 x i8> @test_mask_subs_epu8_rrkz_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) {
-; CHECK-LABEL: test_mask_subs_epu8_rrkz_256:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpsubusb %ymm1, %ymm0, %ymm0 {%k1} {z}
-; CHECK-NEXT: retq
- %cmp = icmp ugt <32 x i8> %a, %b
- %sel = select <32 x i1> %cmp, <32 x i8> %a, <32 x i8> %b
- %sub = sub <32 x i8> %sel, %b
- %bc = bitcast i32 %mask to <32 x i1>
- %res = select <32 x i1> %bc, <32 x i8> %sub, <32 x i8> zeroinitializer
- ret <32 x i8> %res
-}
-
-define <32 x i8> @test_mask_subs_epu8_rm_256(<32 x i8> %a, <32 x i8>* %ptr_b) {
-; CHECK-LABEL: test_mask_subs_epu8_rm_256:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: vpsubusb (%rdi), %ymm0, %ymm0
-; CHECK-NEXT: retq
- %b = load <32 x i8>, <32 x i8>* %ptr_b
- %cmp = icmp ugt <32 x i8> %a, %b
- %sel = select <32 x i1> %cmp, <32 x i8> %a, <32 x i8> %b
- %sub = sub <32 x i8> %sel, %b
- ret <32 x i8> %sub
-}
-
-define <32 x i8> @test_mask_subs_epu8_rmk_256(<32 x i8> %a, <32 x i8>* %ptr_b, <32 x i8> %passThru, i32 %mask) {
-; CHECK-LABEL: test_mask_subs_epu8_rmk_256:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovd %esi, %k1
-; CHECK-NEXT: vpsubusb (%rdi), %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
-; CHECK-NEXT: retq
- %b = load <32 x i8>, <32 x i8>* %ptr_b
- %cmp = icmp ugt <32 x i8> %a, %b
- %sel = select <32 x i1> %cmp, <32 x i8> %a, <32 x i8> %b
- %sub = sub <32 x i8> %sel, %b
- %bc = bitcast i32 %mask to <32 x i1>
- %res = select <32 x i1> %bc, <32 x i8> %sub, <32 x i8> %passThru
- ret <32 x i8> %res
-}
-
-define <32 x i8> @test_mask_subs_epu8_rmkz_256(<32 x i8> %a, <32 x i8>* %ptr_b, i32 %mask) {
-; CHECK-LABEL: test_mask_subs_epu8_rmkz_256:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovd %esi, %k1
-; CHECK-NEXT: vpsubusb (%rdi), %ymm0, %ymm0 {%k1} {z}
-; CHECK-NEXT: retq
- %b = load <32 x i8>, <32 x i8>* %ptr_b
- %cmp = icmp ugt <32 x i8> %a, %b
- %sel = select <32 x i1> %cmp, <32 x i8> %a, <32 x i8> %b
- %sub = sub <32 x i8> %sel, %b
- %bc = bitcast i32 %mask to <32 x i1>
- %res = select <32 x i1> %bc, <32 x i8> %sub, <32 x i8> zeroinitializer
- ret <32 x i8> %res
-}
-
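Unsigned saturating subtract is the one pattern here that needs no widening at all: max(a, b) - b yields a - b when a u> b and 0 otherwise, which is exactly psubus semantics. The deleted tests spell it out per lane as in this sketch (hypothetical name, not a line from the patch):

define <2 x i8> @usub_sat_sketch(<2 x i8> %a, <2 x i8> %b) {
  %c   = icmp ugt <2 x i8> %a, %b
  %max = select <2 x i1> %c, <2 x i8> %a, <2 x i8> %b   ; umax(a, b)
  %res = sub <2 x i8> %max, %b                          ; a u> b ? a - b : b - b = 0
  ret <2 x i8> %res
}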
-define <16 x i8> @test_x86_sse2_padds_b(<16 x i8> %a0, <16 x i8> %a1) {
-; SSE-LABEL: test_x86_sse2_padds_b:
-; SSE: ## %bb.0:
-; SSE-NEXT: paddsb %xmm1, %xmm0
-; SSE-NEXT: retl
- %1 = sext <16 x i8> %a0 to <16 x i16>
- %2 = sext <16 x i8> %a1 to <16 x i16>
- %3 = add nsw <16 x i16> %1, %2
- %4 = icmp slt <16 x i16> %3, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
- %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
- %6 = icmp sgt <16 x i16> %5, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
- %7 = select <16 x i1> %6, <16 x i16> %5, <16 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
- %8 = trunc <16 x i16> %7 to <16 x i8>
- ret <16 x i8> %8
-}
-
-
-define <8 x i16> @test_x86_sse2_padds_w(<8 x i16> %a0, <8 x i16> %a1) {
-; SSE-LABEL: test_x86_sse2_padds_w:
-; SSE: ## %bb.0:
-; SSE-NEXT: paddsw %xmm1, %xmm0
-; SSE-NEXT: retl
- %1 = sext <8 x i16> %a0 to <8 x i32>
- %2 = sext <8 x i16> %a1 to <8 x i32>
- %3 = add nsw <8 x i32> %1, %2
- %4 = icmp slt <8 x i32> %3, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
- %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
- %6 = icmp sgt <8 x i32> %5, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
- %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
- %8 = trunc <8 x i32> %7 to <8 x i16>
- ret <8 x i16> %8
-}
-
-
-define <16 x i8> @test_x86_sse2_paddus_b(<16 x i8> %a0, <16 x i8> %a1) {
-; SSE-LABEL: test_x86_sse2_paddus_b:
-; SSE: ## %bb.0:
-; SSE-NEXT: paddusb %xmm1, %xmm0
-; SSE-NEXT: retl
- %1 = zext <16 x i8> %a0 to <16 x i16>
- %2 = zext <16 x i8> %a1 to <16 x i16>
- %3 = add nsw <16 x i16> %1, %2
- %4 = icmp ult <16 x i16> %3, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
- %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
- %6 = trunc <16 x i16> %5 to <16 x i8>
- ret <16 x i8> %6
-}
-
-
-define <8 x i16> @test_x86_sse2_paddus_w(<8 x i16> %a0, <8 x i16> %a1) {
-; SSE-LABEL: test_x86_sse2_paddus_w:
-; SSE: ## %bb.0:
-; SSE-NEXT: paddusw %xmm1, %xmm0
-; SSE-NEXT: retl
- %1 = zext <8 x i16> %a0 to <8 x i32>
- %2 = zext <8 x i16> %a1 to <8 x i32>
- %3 = add nsw <8 x i32> %1, %2
- %4 = icmp ult <8 x i32> %3, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
- %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
- %6 = trunc <8 x i32> %5 to <8 x i16>
- ret <8 x i16> %6
-}
-
-define <16 x i8> @test_x86_sse2_psubs_b(<16 x i8> %a0, <16 x i8> %a1) {
-; SSE-LABEL: test_x86_sse2_psubs_b:
-; SSE: ## %bb.0:
-; SSE-NEXT: psubsb %xmm1, %xmm0
-; SSE-NEXT: retl
- %1 = sext <16 x i8> %a0 to <16 x i16>
- %2 = sext <16 x i8> %a1 to <16 x i16>
- %3 = sub nsw <16 x i16> %1, %2
- %4 = icmp slt <16 x i16> %3, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
- %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
- %6 = icmp sgt <16 x i16> %5, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
- %7 = select <16 x i1> %6, <16 x i16> %5, <16 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
- %8 = trunc <16 x i16> %7 to <16 x i8>
- ret <16 x i8> %8
-}
-
-
-define <8 x i16> @test_x86_sse2_psubs_w(<8 x i16> %a0, <8 x i16> %a1) {
-; SSE-LABEL: test_x86_sse2_psubs_w:
-; SSE: ## %bb.0:
-; SSE-NEXT: psubsw %xmm1, %xmm0
-; SSE-NEXT: retl
- %1 = sext <8 x i16> %a0 to <8 x i32>
- %2 = sext <8 x i16> %a1 to <8 x i32>
- %3 = sub nsw <8 x i32> %1, %2
- %4 = icmp slt <8 x i32> %3, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
- %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
- %6 = icmp sgt <8 x i32> %5, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
- %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
- %8 = trunc <8 x i32> %7 to <8 x i16>
- ret <8 x i16> %8
-}
-
-
-define <16 x i8> @test_x86_sse2_psubus_b(<16 x i8> %a0, <16 x i8> %a1) {
-; SSE-LABEL: test_x86_sse2_psubus_b:
-; SSE: ## %bb.0:
-; SSE-NEXT: psubusb %xmm1, %xmm0
-; SSE-NEXT: retl
- %cmp = icmp ugt <16 x i8> %a0, %a1
- %sel = select <16 x i1> %cmp, <16 x i8> %a0, <16 x i8> %a1
- %sub = sub <16 x i8> %sel, %a1
- ret <16 x i8> %sub
-}
-
-
-define <8 x i16> @test_x86_sse2_psubus_w(<8 x i16> %a0, <8 x i16> %a1) {
-; SSE-LABEL: test_x86_sse2_psubus_w:
-; SSE: ## %bb.0:
-; SSE-NEXT: psubusw %xmm1, %xmm0
-; SSE-NEXT: retl
- %cmp = icmp ugt <8 x i16> %a0, %a1
- %sel = select <8 x i1> %cmp, <8 x i16> %a0, <8 x i16> %a1
- %sub = sub <8 x i16> %sel, %a1
- ret <8 x i16> %sub
-}
-
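The *_64 tests that follow operate on <8 x i8> and <4 x i16>, which are not legal x86 vector types: legalization widens each element, and because the high bits of a widened lane are undefined, codegen has to re-extend them first. That is what the psllw $8/psraw $8 pairs (in-lane sign extension) and the pand-with-[255,0,...] masks (in-lane zero extension) in the CHECK lines below are doing. Roughly, for one widened lane (a sketch with a hypothetical name, not the backend's actual output):

define <8 x i16> @relegalize_sketch(<8 x i16> %widened) {
  ; assume each i16 lane holds an i8 value in its low byte, garbage above
  %shl = shl <8 x i16> %widened, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %res = ashr <8 x i16> %shl, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>  ; == sext of the low byte
  ret <8 x i16> %res
}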
-define <8 x i8> @test_x86_sse2_padds_b_64(<8 x i8> %a0, <8 x i8> %a1) {
-; AVX512BW-LABEL: test_x86_sse2_padds_b_64:
-; AVX512BW: ## %bb.0:
-; AVX512BW-NEXT: vpsllw $8, %xmm0, %xmm0
-; AVX512BW-NEXT: vpsraw $8, %xmm0, %xmm0
-; AVX512BW-NEXT: vpsllw $8, %xmm1, %xmm1
-; AVX512BW-NEXT: vpsraw $8, %xmm1, %xmm1
-; AVX512BW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT: vpminsw {{.*}}(%rip), %xmm0, %xmm0
-; AVX512BW-NEXT: vpmaxsw {{.*}}(%rip), %xmm0, %xmm0
-; AVX512BW-NEXT: retq
-;
-; SSE-LABEL: test_x86_sse2_padds_b_64:
-; SSE: ## %bb.0:
-; SSE-NEXT: psllw $8, %xmm0
-; SSE-NEXT: psraw $8, %xmm0
-; SSE-NEXT: psllw $8, %xmm1
-; SSE-NEXT: psraw $8, %xmm1
-; SSE-NEXT: paddw %xmm1, %xmm0
-; SSE-NEXT: pminsw LCPI144_0, %xmm0
-; SSE-NEXT: pmaxsw LCPI144_1, %xmm0
-; SSE-NEXT: retl
- %1 = sext <8 x i8> %a0 to <8 x i16>
- %2 = sext <8 x i8> %a1 to <8 x i16>
- %3 = add nsw <8 x i16> %1, %2
- %4 = icmp slt <8 x i16> %3, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
- %5 = select <8 x i1> %4, <8 x i16> %3, <8 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
- %6 = icmp sgt <8 x i16> %5, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
- %7 = select <8 x i1> %6, <8 x i16> %5, <8 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
- %8 = trunc <8 x i16> %7 to <8 x i8>
- ret <8 x i8> %8
-}
-
-define <4 x i16> @test_x86_sse2_padds_w_64(<4 x i16> %a0, <4 x i16> %a1) {
-; AVX512BW-LABEL: test_x86_sse2_padds_w_64:
-; AVX512BW: ## %bb.0:
-; AVX512BW-NEXT: vpslld $16, %xmm0, %xmm0
-; AVX512BW-NEXT: vpsrad $16, %xmm0, %xmm0
-; AVX512BW-NEXT: vpslld $16, %xmm1, %xmm1
-; AVX512BW-NEXT: vpsrad $16, %xmm1, %xmm1
-; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [32767,32767,32767,32767]
-; AVX512BW-NEXT: vpminsd %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294934528,4294934528,4294934528,4294934528]
-; AVX512BW-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT: retq
-;
-; SSE-LABEL: test_x86_sse2_padds_w_64:
-; SSE: ## %bb.0:
-; SSE-NEXT: pslld $16, %xmm0
-; SSE-NEXT: psrad $16, %xmm0
-; SSE-NEXT: pslld $16, %xmm1
-; SSE-NEXT: psrad $16, %xmm1
-; SSE-NEXT: paddd %xmm0, %xmm1
-; SSE-NEXT: movdqa {{.*#+}} xmm0 = [32767,32767,32767,32767]
-; SSE-NEXT: movdqa %xmm0, %xmm2
-; SSE-NEXT: pcmpgtd %xmm1, %xmm2
-; SSE-NEXT: pand %xmm2, %xmm1
-; SSE-NEXT: pandn %xmm0, %xmm2
-; SSE-NEXT: por %xmm1, %xmm2
-; SSE-NEXT: movdqa {{.*#+}} xmm1 = [4294934528,4294934528,4294934528,4294934528]
-; SSE-NEXT: movdqa %xmm2, %xmm0
-; SSE-NEXT: pcmpgtd %xmm1, %xmm0
-; SSE-NEXT: pand %xmm0, %xmm2
-; SSE-NEXT: pandn %xmm1, %xmm0
-; SSE-NEXT: por %xmm2, %xmm0
-; SSE-NEXT: retl
- %1 = sext <4 x i16> %a0 to <4 x i32>
- %2 = sext <4 x i16> %a1 to <4 x i32>
- %3 = add nsw <4 x i32> %1, %2
- %4 = icmp slt <4 x i32> %3, <i32 32767, i32 32767, i32 32767, i32 32767>
- %5 = select <4 x i1> %4, <4 x i32> %3, <4 x i32> <i32 32767, i32 32767, i32 32767, i32 32767>
- %6 = icmp sgt <4 x i32> %5, <i32 -32768, i32 -32768, i32 -32768, i32 -32768>
- %7 = select <4 x i1> %6, <4 x i32> %5, <4 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768>
- %8 = trunc <4 x i32> %7 to <4 x i16>
- ret <4 x i16> %8
-}
-
-
-define <8 x i8> @test_x86_sse2_paddus_b_64(<8 x i8> %a0, <8 x i8> %a1) {
-; AVX512BW-LABEL: test_x86_sse2_paddus_b_64:
-; AVX512BW: ## %bb.0:
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; AVX512BW-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm1
-; AVX512BW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT: vpminuw {{.*}}(%rip), %xmm0, %xmm0
-; AVX512BW-NEXT: retq
-;
-; SSE-LABEL: test_x86_sse2_paddus_b_64:
-; SSE: ## %bb.0:
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; SSE-NEXT: pand %xmm2, %xmm0
-; SSE-NEXT: pand %xmm2, %xmm1
-; SSE-NEXT: paddw %xmm1, %xmm0
-; SSE-NEXT: pminsw LCPI146_0, %xmm0
-; SSE-NEXT: retl
- %1 = zext <8 x i8> %a0 to <8 x i16>
- %2 = zext <8 x i8> %a1 to <8 x i16>
- %3 = add nsw <8 x i16> %1, %2
- %4 = icmp ult <8 x i16> %3, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
- %5 = select <8 x i1> %4, <8 x i16> %3, <8 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
- %6 = trunc <8 x i16> %5 to <8 x i8>
- ret <8 x i8> %6
-}
-
-
-define <4 x i16> @test_x86_sse2_paddus_w_64(<4 x i16> %a0, <4 x i16> %a1) {
-; AVX512BW-LABEL: test_x86_sse2_paddus_w_64:
-; AVX512BW: ## %bb.0:
-; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
-; AVX512BW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
-; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [65535,65535,65535,65535]
-; AVX512BW-NEXT: vpminud %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT: retq
-;
-; SSE-LABEL: test_x86_sse2_paddus_w_64:
-; SSE: ## %bb.0:
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,0,65535,0,65535,0]
-; SSE-NEXT: pand %xmm2, %xmm0
-; SSE-NEXT: pand %xmm2, %xmm1
-; SSE-NEXT: paddd %xmm0, %xmm1
-; SSE-NEXT: movdqa %xmm2, %xmm0
-; SSE-NEXT: pcmpgtd %xmm1, %xmm0
-; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: pandn %xmm2, %xmm0
-; SSE-NEXT: por %xmm1, %xmm0
-; SSE-NEXT: retl
- %1 = zext <4 x i16> %a0 to <4 x i32>
- %2 = zext <4 x i16> %a1 to <4 x i32>
- %3 = add nsw <4 x i32> %1, %2
- %4 = icmp ult <4 x i32> %3, <i32 65535, i32 65535, i32 65535, i32 65535>
- %5 = select <4 x i1> %4, <4 x i32> %3, <4 x i32> <i32 65535, i32 65535, i32 65535, i32 65535>
- %6 = trunc <4 x i32> %5 to <4 x i16>
- ret <4 x i16> %6
-}
-
-define <8 x i8> @test_x86_sse2_psubs_b_64(<8 x i8> %a0, <8 x i8> %a1) {
-; AVX512BW-LABEL: test_x86_sse2_psubs_b_64:
-; AVX512BW: ## %bb.0:
-; AVX512BW-NEXT: vpsllw $8, %xmm0, %xmm0
-; AVX512BW-NEXT: vpsraw $8, %xmm0, %xmm0
-; AVX512BW-NEXT: vpsllw $8, %xmm1, %xmm1
-; AVX512BW-NEXT: vpsraw $8, %xmm1, %xmm1
-; AVX512BW-NEXT: vpsubw %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT: vpminsw {{.*}}(%rip), %xmm0, %xmm0
-; AVX512BW-NEXT: vpmaxsw {{.*}}(%rip), %xmm0, %xmm0
-; AVX512BW-NEXT: retq
-;
-; SSE-LABEL: test_x86_sse2_psubs_b_64:
-; SSE: ## %bb.0:
-; SSE-NEXT: psllw $8, %xmm0
-; SSE-NEXT: psraw $8, %xmm0
-; SSE-NEXT: psllw $8, %xmm1
-; SSE-NEXT: psraw $8, %xmm1
-; SSE-NEXT: psubw %xmm1, %xmm0
-; SSE-NEXT: pminsw LCPI148_0, %xmm0
-; SSE-NEXT: pmaxsw LCPI148_1, %xmm0
-; SSE-NEXT: retl
- %1 = sext <8 x i8> %a0 to <8 x i16>
- %2 = sext <8 x i8> %a1 to <8 x i16>
- %3 = sub nsw <8 x i16> %1, %2
- %4 = icmp slt <8 x i16> %3, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
- %5 = select <8 x i1> %4, <8 x i16> %3, <8 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
- %6 = icmp sgt <8 x i16> %5, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
- %7 = select <8 x i1> %6, <8 x i16> %5, <8 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
- %8 = trunc <8 x i16> %7 to <8 x i8>
- ret <8 x i8> %8
-}
-
-
-define <4 x i16> @test_x86_sse2_psubs_w_64(<4 x i16> %a0, <4 x i16> %a1) {
-; AVX512BW-LABEL: test_x86_sse2_psubs_w_64:
-; AVX512BW: ## %bb.0:
-; AVX512BW-NEXT: vpslld $16, %xmm0, %xmm0
-; AVX512BW-NEXT: vpsrad $16, %xmm0, %xmm0
-; AVX512BW-NEXT: vpslld $16, %xmm1, %xmm1
-; AVX512BW-NEXT: vpsrad $16, %xmm1, %xmm1
-; AVX512BW-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [32767,32767,32767,32767]
-; AVX512BW-NEXT: vpminsd %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294934528,4294934528,4294934528,4294934528]
-; AVX512BW-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT: retq
-;
-; SSE-LABEL: test_x86_sse2_psubs_w_64:
-; SSE: ## %bb.0:
-; SSE-NEXT: pslld $16, %xmm0
-; SSE-NEXT: psrad $16, %xmm0
-; SSE-NEXT: pslld $16, %xmm1
-; SSE-NEXT: psrad $16, %xmm1
-; SSE-NEXT: psubd %xmm1, %xmm0
-; SSE-NEXT: movdqa {{.*#+}} xmm1 = [32767,32767,32767,32767]
-; SSE-NEXT: movdqa %xmm1, %xmm2
-; SSE-NEXT: pcmpgtd %xmm0, %xmm2
-; SSE-NEXT: pand %xmm2, %xmm0
-; SSE-NEXT: pandn %xmm1, %xmm2
-; SSE-NEXT: por %xmm0, %xmm2
-; SSE-NEXT: movdqa {{.*#+}} xmm1 = [4294934528,4294934528,4294934528,4294934528]
-; SSE-NEXT: movdqa %xmm2, %xmm0
-; SSE-NEXT: pcmpgtd %xmm1, %xmm0
-; SSE-NEXT: pand %xmm0, %xmm2
-; SSE-NEXT: pandn %xmm1, %xmm0
-; SSE-NEXT: por %xmm2, %xmm0
-; SSE-NEXT: retl
- %1 = sext <4 x i16> %a0 to <4 x i32>
- %2 = sext <4 x i16> %a1 to <4 x i32>
- %3 = sub nsw <4 x i32> %1, %2
- %4 = icmp slt <4 x i32> %3, <i32 32767, i32 32767, i32 32767, i32 32767>
- %5 = select <4 x i1> %4, <4 x i32> %3, <4 x i32> <i32 32767, i32 32767, i32 32767, i32 32767>
- %6 = icmp sgt <4 x i32> %5, <i32 -32768, i32 -32768, i32 -32768, i32 -32768>
- %7 = select <4 x i1> %6, <4 x i32> %5, <4 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768>
- %8 = trunc <4 x i32> %7 to <4 x i16>
- ret <4 x i16> %8
-}
-
-
-define <8 x i8> @test_x86_sse2_psubus_b_64(<8 x i8> %a0, <8 x i8> %a1) {
-; AVX512BW-LABEL: test_x86_sse2_psubus_b_64:
-; AVX512BW: ## %bb.0:
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm3
-; AVX512BW-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX512BW-NEXT: vpmaxuw %xmm3, %xmm0, %xmm0
-; AVX512BW-NEXT: vpsubw %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT: retq
-;
-; SSE-LABEL: test_x86_sse2_psubus_b_64:
-; SSE: ## %bb.0:
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; SSE-NEXT: movdqa %xmm1, %xmm3
-; SSE-NEXT: pand %xmm2, %xmm3
-; SSE-NEXT: pand %xmm2, %xmm0
-; SSE-NEXT: pmaxsw %xmm3, %xmm0
-; SSE-NEXT: psubw %xmm1, %xmm0
-; SSE-NEXT: retl
- %cmp = icmp ugt <8 x i8> %a0, %a1
- %sel = select <8 x i1> %cmp, <8 x i8> %a0, <8 x i8> %a1
- %sub = sub <8 x i8> %sel, %a1
- ret <8 x i8> %sub
-}
-
-
-define <4 x i16> @test_x86_sse2_psubus_w_64(<4 x i16> %a0, <4 x i16> %a1) {
-; AVX512BW-LABEL: test_x86_sse2_psubus_w_64:
-; AVX512BW: ## %bb.0:
-; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512BW-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
-; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
-; AVX512BW-NEXT: vpmaxud %xmm3, %xmm0, %xmm0
-; AVX512BW-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT: retq
-;
-; SSE-LABEL: test_x86_sse2_psubus_w_64:
-; SSE: ## %bb.0:
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,0,65535,0,65535,0]
-; SSE-NEXT: movdqa %xmm1, %xmm3
-; SSE-NEXT: pand %xmm2, %xmm3
-; SSE-NEXT: pand %xmm2, %xmm0
-; SSE-NEXT: movdqa %xmm0, %xmm2
-; SSE-NEXT: pcmpgtd %xmm3, %xmm2
-; SSE-NEXT: pand %xmm2, %xmm0
-; SSE-NEXT: pandn %xmm3, %xmm2
-; SSE-NEXT: por %xmm0, %xmm2
-; SSE-NEXT: psubd %xmm1, %xmm2
-; SSE-NEXT: movdqa %xmm2, %xmm0
-; SSE-NEXT: retl
- %cmp = icmp ugt <4 x i16> %a0, %a1
- %sel = select <4 x i1> %cmp, <4 x i16> %a0, <4 x i16> %a1
- %sub = sub <4 x i16> %sel, %a1
- ret <4 x i16> %sub
-}
diff --git a/test/Instrumentation/MemorySanitizer/msan_x86intrinsics.ll b/test/Instrumentation/MemorySanitizer/msan_x86intrinsics.ll
index 14aaec92388..c4ec7fa2919 100644
--- a/test/Instrumentation/MemorySanitizer/msan_x86intrinsics.ll
+++ b/test/Instrumentation/MemorySanitizer/msan_x86intrinsics.ll
@@ -46,14 +46,14 @@ declare <16 x i8> @llvm.x86.sse3.ldu.dq(i8* %p) nounwind
 ; Check that shadow is OR'ed, and origin is Select'ed
 ; And no shadow checks!
-define <8 x i16> @Pmulhuw128(<8 x i16> %a, <8 x i16> %b) nounwind uwtable sanitize_memory {
-  %call = call <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16> %a, <8 x i16> %b)
+define <8 x i16> @Paddsw128(<8 x i16> %a, <8 x i16> %b) nounwind uwtable sanitize_memory {
+  %call = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %a, <8 x i16> %b)
   ret <8 x i16> %call
 }
 
-declare <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16> %a, <8 x i16> %b) nounwind
+declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %a, <8 x i16> %b) nounwind
 
-; CHECK-LABEL: @Pmulhuw128
+; CHECK-LABEL: @Paddsw128
 ; CHECK-NEXT: load <8 x i16>, <8 x i16>* {{.*}} @__msan_param_tls
 ; CHECK-ORIGINS: load i32, i32* {{.*}} @__msan_param_origin_tls
 ; CHECK-NEXT: load <8 x i16>, <8 x i16>* {{.*}} @__msan_param_tls
@@ -62,7 +62,7 @@ declare <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16> %a, <8 x i16> %b) nounwind
 ; CHECK-ORIGINS: = bitcast <8 x i16> {{.*}} to i128
 ; CHECK-ORIGINS-NEXT: = icmp ne i128 {{.*}}, 0
 ; CHECK-ORIGINS-NEXT: = select i1 {{.*}}, i32 {{.*}}, i32
-; CHECK-NEXT: call <8 x i16> @llvm.x86.sse2.pmulhu.w
+; CHECK-NEXT: call <8 x i16> @llvm.x86.sse2.padds.w
 ; CHECK-NEXT: store <8 x i16> {{.*}} @__msan_retval_tls
 ; CHECK-ORIGINS: store i32 {{.*}} @__msan_retval_origin_tls
 ; CHECK-NEXT: ret <8 x i16>
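For a pure vector arithmetic intrinsic like padds.w, the MemorySanitizer behavior this test pins down is the cheap, conservative propagation named in the comment above: a result lane is defined only if both input lanes are, so the operand shadows are OR'ed (and the origin chosen by a select on whether the second shadow is non-zero), with no eager check of the inputs. A sketch of the instrumented shape, with hypothetical shadow names standing in for the values loaded from __msan_param_tls:

define <8 x i16> @shadow_prop_sketch(<8 x i16> %sa, <8 x i16> %sb) {
  ; %sa/%sb stand for the operand shadows; the OR below is the
  ; "shadow is OR'ed" step the CHECK lines verify, and its result
  ; is what gets stored to __msan_retval_tls.
  %ss = or <8 x i16> %sa, %sb
  ret <8 x i16> %ss
}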