[X86][FPEnv] Teach X86 mask compare intrinsics to respect strict FP semantics.
When we use the mask compare intrinsics under the strict FP option, the masked elements shouldn't raise any exception, so we can't replace the intrinsic with a full compare plus an "and" operation.

Reviewed By: craig.topper

Differential Revision: https://reviews.llvm.org/D85385
parent 99ed94570d
commit 72838e8fb2
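To make the constraint concrete, here is a small hand-written LLVM IR sketch (illustrative only, not part of the diff; the function name @masked_cmp and the value %m are invented, and the signature follows the new 512-bit intrinsic defined below). Under strict FP a masked-off lane must not be compared at all, for example because it may hold a signaling NaN, so the call cannot legally be expanded into a full 16-lane compare whose result is and'ed with %m:

declare <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float>, <16 x float>, i32, <16 x i1>, i32)

define <16 x i1> @masked_cmp(<16 x float> %a, <16 x float> %b, <16 x i1> %m) strictfp {
  ; Third operand is the compare predicate (1 = LT_OS), last operand the
  ; rounding mode (4 = current direction). Lanes where %m is false must
  ; not raise an FP exception.
  %r = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %a, <16 x float> %b, i32 1, <16 x i1> %m, i32 4) strictfp
  ret <16 x i1> %r
}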
@@ -4749,26 +4749,26 @@ let TargetPrefix = "x86" in {
 let TargetPrefix = "x86" in {
   // NOTE: These comparison intrinsics are not used by clang as long as the
   //       distinction in signaling behaviour is not implemented.
-  def int_x86_avx512_cmp_ps_512 :
+  def int_x86_avx512_mask_cmp_ps_512 :
       Intrinsic<[llvm_v16i1_ty], [llvm_v16f32_ty, llvm_v16f32_ty,
-                 llvm_i32_ty, llvm_i32_ty],
-                [IntrNoMem, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>]>;
-  def int_x86_avx512_cmp_pd_512 :
+                 llvm_i32_ty, llvm_v16i1_ty, llvm_i32_ty],
+                [IntrNoMem, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<4>>]>;
+  def int_x86_avx512_mask_cmp_pd_512 :
       Intrinsic<[llvm_v8i1_ty], [llvm_v8f64_ty, llvm_v8f64_ty,
-                 llvm_i32_ty, llvm_i32_ty],
-                [IntrNoMem, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>]>;
-  def int_x86_avx512_cmp_ps_256 :
+                 llvm_i32_ty, llvm_v8i1_ty, llvm_i32_ty],
+                [IntrNoMem, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<4>>]>;
+  def int_x86_avx512_mask_cmp_ps_256 :
       Intrinsic<[llvm_v8i1_ty], [llvm_v8f32_ty, llvm_v8f32_ty,
-                 llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<2>>]>;
-  def int_x86_avx512_cmp_pd_256 :
+                 llvm_i32_ty, llvm_v8i1_ty], [IntrNoMem, ImmArg<ArgIndex<2>>]>;
+  def int_x86_avx512_mask_cmp_pd_256 :
       Intrinsic<[llvm_v4i1_ty], [llvm_v4f64_ty, llvm_v4f64_ty,
-                 llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<2>>]>;
-  def int_x86_avx512_cmp_ps_128 :
+                 llvm_i32_ty, llvm_v4i1_ty], [IntrNoMem, ImmArg<ArgIndex<2>>]>;
+  def int_x86_avx512_mask_cmp_ps_128 :
       Intrinsic<[llvm_v4i1_ty], [llvm_v4f32_ty, llvm_v4f32_ty,
-                 llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<2>>]>;
-  def int_x86_avx512_cmp_pd_128 :
+                 llvm_i32_ty, llvm_v4i1_ty], [IntrNoMem, ImmArg<ArgIndex<2>>]>;
+  def int_x86_avx512_mask_cmp_pd_128 :
       Intrinsic<[llvm_v2i1_ty], [llvm_v2f64_ty, llvm_v2f64_ty,
-                 llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<2>>]>;
+                 llvm_i32_ty, llvm_v2i1_ty], [IntrNoMem, ImmArg<ArgIndex<2>>]>;

   def int_x86_avx512_mask_cmp_ss :
       GCCBuiltin<"__builtin_ia32_cmpss_mask">,
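For reference, the signature change on the 512-bit pd variant written as plain IR declarations (both forms appear verbatim in the updated tests further down): the vXi1 mask operand is inserted between the predicate immediate and the trailing SAE/rounding immediate, which is why the second ImmArg index moves from 3 to 4.

; Old (unmasked) form that is now autoupgraded:
declare <8 x i1> @llvm.x86.avx512.cmp.pd.512(<8 x double>, <8 x double>, i32, i32)
; New masked form:
declare <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double>, <8 x double>, i32, <8 x i1>, i32)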
@@ -68,6 +68,19 @@ static bool UpgradeX86IntrinsicsWith8BitMask(Function *F, Intrinsic::ID IID,
   return true;
 }

+// Upgrade the declaration of fp compare intrinsics that change return type
+// from scalar to vXi1 mask.
+static bool UpgradeX86MaskedFPCompare(Function *F, Intrinsic::ID IID,
+                                      Function *&NewFn) {
+  // Check if the return type is a vector.
+  if (F->getReturnType()->isVectorTy())
+    return false;
+
+  rename(F);
+  NewFn = Intrinsic::getDeclaration(F->getParent(), IID);
+  return true;
+}
+
 static bool ShouldUpgradeX86Intrinsic(Function *F, StringRef Name) {
   // All of the intrinsics matches below should be marked with which llvm
   // version started autoupgrading them. At some point in the future we would
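A concrete instance of what this helper handles, taken from the test declarations further down: the LLVM 7-era masked compares returned a scalar integer mask, and the declaration upgrade rewrites them to the vXi1 form (the call sites are then fixed up separately in UpgradeIntrinsicCall):

; 7.0-era declaration: scalar i8 result, i8 mask operand.
declare i8 @llvm.x86.avx512.mask.cmp.ps.256(<8 x float>, <8 x float>, i32, i8)
; Upgraded declaration: <8 x i1> result and mask.
declare <8 x i1> @llvm.x86.avx512.mask.cmp.ps.256(<8 x float>, <8 x float>, i32, <8 x i1>)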
@@ -241,7 +254,7 @@ static bool ShouldUpgradeX86Intrinsic(Function *F, StringRef Name) {
       Name.startswith("avx512.mask.cmp.d") || // Added in 5.0
       Name.startswith("avx512.mask.cmp.q") || // Added in 5.0
       Name.startswith("avx512.mask.cmp.w") || // Added in 5.0
-      Name.startswith("avx512.mask.cmp.p") || // Added in 7.0
+      Name.startswith("avx512.cmp.p") || // Added in 12.0
       Name.startswith("avx512.mask.ucmp.") || // Added in 5.0
       Name.startswith("avx512.cvtb2mask.") || // Added in 7.0
       Name.startswith("avx512.cvtw2mask.") || // Added in 7.0
@@ -456,6 +469,24 @@ static bool UpgradeX86IntrinsicFunction(Function *F, StringRef Name,
   if (Name == "avx2.mpsadbw") // Added in 3.6
     return UpgradeX86IntrinsicsWith8BitMask(F, Intrinsic::x86_avx2_mpsadbw,
                                             NewFn);
+  if (Name == "avx512.mask.cmp.pd.128") // Added in 7.0
+    return UpgradeX86MaskedFPCompare(F, Intrinsic::x86_avx512_mask_cmp_pd_128,
+                                     NewFn);
+  if (Name == "avx512.mask.cmp.pd.256") // Added in 7.0
+    return UpgradeX86MaskedFPCompare(F, Intrinsic::x86_avx512_mask_cmp_pd_256,
+                                     NewFn);
+  if (Name == "avx512.mask.cmp.pd.512") // Added in 7.0
+    return UpgradeX86MaskedFPCompare(F, Intrinsic::x86_avx512_mask_cmp_pd_512,
+                                     NewFn);
+  if (Name == "avx512.mask.cmp.ps.128") // Added in 7.0
+    return UpgradeX86MaskedFPCompare(F, Intrinsic::x86_avx512_mask_cmp_ps_128,
+                                     NewFn);
+  if (Name == "avx512.mask.cmp.ps.256") // Added in 7.0
+    return UpgradeX86MaskedFPCompare(F, Intrinsic::x86_avx512_mask_cmp_ps_256,
+                                     NewFn);
+  if (Name == "avx512.mask.cmp.ps.512") // Added in 7.0
+    return UpgradeX86MaskedFPCompare(F, Intrinsic::x86_avx512_mask_cmp_ps_512,
+                                     NewFn);

   // frcz.ss/sd may need to have an argument dropped. Added in 3.2
   if (Name.startswith("xop.vfrcz.ss") && F->arg_size() == 2) {
@@ -2000,38 +2031,36 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
       Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(), IID),
                                { CI->getOperand(0), CI->getArgOperand(1) });
       Rep = ApplyX86MaskOn1BitsVec(Builder, Rep, CI->getArgOperand(2));
-    } else if (IsX86 && Name.startswith("avx512.mask.cmp.p")) {
-      Type *OpTy = CI->getArgOperand(0)->getType();
+    } else if (IsX86 && Name.startswith("avx512.cmp.p")) {
+      SmallVector<Value *, 4> Args(CI->arg_operands().begin(),
+                                   CI->arg_operands().end());
+      Type *OpTy = Args[0]->getType();
       unsigned VecWidth = OpTy->getPrimitiveSizeInBits();
       unsigned EltWidth = OpTy->getScalarSizeInBits();
       Intrinsic::ID IID;
       if (VecWidth == 128 && EltWidth == 32)
-        IID = Intrinsic::x86_avx512_cmp_ps_128;
+        IID = Intrinsic::x86_avx512_mask_cmp_ps_128;
       else if (VecWidth == 256 && EltWidth == 32)
-        IID = Intrinsic::x86_avx512_cmp_ps_256;
+        IID = Intrinsic::x86_avx512_mask_cmp_ps_256;
       else if (VecWidth == 512 && EltWidth == 32)
-        IID = Intrinsic::x86_avx512_cmp_ps_512;
+        IID = Intrinsic::x86_avx512_mask_cmp_ps_512;
       else if (VecWidth == 128 && EltWidth == 64)
-        IID = Intrinsic::x86_avx512_cmp_pd_128;
+        IID = Intrinsic::x86_avx512_mask_cmp_pd_128;
       else if (VecWidth == 256 && EltWidth == 64)
-        IID = Intrinsic::x86_avx512_cmp_pd_256;
+        IID = Intrinsic::x86_avx512_mask_cmp_pd_256;
       else if (VecWidth == 512 && EltWidth == 64)
-        IID = Intrinsic::x86_avx512_cmp_pd_512;
+        IID = Intrinsic::x86_avx512_mask_cmp_pd_512;
       else
         llvm_unreachable("Unexpected intrinsic");

-      SmallVector<Value *, 4> Args;
-      Args.push_back(CI->getArgOperand(0));
-      Args.push_back(CI->getArgOperand(1));
-      Args.push_back(CI->getArgOperand(2));
-      if (CI->getNumArgOperands() == 5)
-        Args.push_back(CI->getArgOperand(4));
+      Value *Mask = Constant::getAllOnesValue(CI->getType());
+      if (VecWidth == 512)
+        std::swap(Mask, Args.back());
+      Args.push_back(Mask);

       Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(), IID),
                                Args);
-      Rep = ApplyX86MaskOn1BitsVec(Builder, Rep, CI->getArgOperand(3));
-    } else if (IsX86 && Name.startswith("avx512.mask.cmp.") &&
-               Name[16] != 'p') {
+    } else if (IsX86 && Name.startswith("avx512.mask.cmp.")) {
      // Integer compare intrinsics.
      unsigned Imm = cast<ConstantInt>(CI->getArgOperand(2))->getZExtValue();
      Rep = upgradeMaskedCompare(Builder, *CI, Imm, true);
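The effect of this branch on a 512-bit call site, shown as a hedged IR sketch that mirrors the test updates below: an all-ones mask is appended, and for the 512-bit variants the std::swap above moves it in front of the trailing SAE operand.

; Before upgrade (no mask operand):
  %0 = call <8 x i1> @llvm.x86.avx512.cmp.pd.512(<8 x double> %a, <8 x double> %b, i32 2, i32 8)
; After upgrade (all-ones mask inserted before the SAE operand):
  %0 = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %a, <8 x double> %b, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 8)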
@@ -3718,6 +3747,30 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
     break;
   }

+  case Intrinsic::x86_avx512_mask_cmp_pd_128:
+  case Intrinsic::x86_avx512_mask_cmp_pd_256:
+  case Intrinsic::x86_avx512_mask_cmp_pd_512:
+  case Intrinsic::x86_avx512_mask_cmp_ps_128:
+  case Intrinsic::x86_avx512_mask_cmp_ps_256:
+  case Intrinsic::x86_avx512_mask_cmp_ps_512: {
+    SmallVector<Value *, 4> Args(CI->arg_operands().begin(),
+                                 CI->arg_operands().end());
+    unsigned NumElts = cast<VectorType>(Args[0]->getType())->getNumElements();
+    Args[3] = getX86MaskVec(Builder, Args[3], NumElts);
+
+    NewCall = Builder.CreateCall(NewFn, Args);
+    Value *Res = ApplyX86MaskOn1BitsVec(Builder, NewCall, nullptr);
+
+    StringRef Name = CI->getName();
+    if (!Name.empty()) {
+      CI->setName(Name + ".old");
+      NewCall->setName(Name);
+    }
+    CI->replaceAllUsesWith(Res);
+    CI->eraseFromParent();
+    return;
+  }
+
   case Intrinsic::thread_pointer: {
     NewCall = Builder.CreateCall(NewFn, {});
     break;
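A sketch of what this case does to an old-style call site (hand-written IR; getX86MaskVec and ApplyX86MaskOn1BitsVec emit the equivalent of the bitcasts shown, and the value names are invented): the scalar mask operand becomes a vXi1 vector, and the vXi1 result is converted back so existing users still see the scalar type.

; Before: scalar i8 mask and result.
  %res = call i8 @llvm.x86.avx512.mask.cmp.ps.256(<8 x float> %a, <8 x float> %b, i32 1, i8 %m)
; After (conceptually; users of %res are redirected to %res.scalar):
  %mask = bitcast i8 %m to <8 x i1>
  %res.vec = call <8 x i1> @llvm.x86.avx512.mask.cmp.ps.256(<8 x float> %a, <8 x float> %b, i32 1, <8 x i1> %mask)
  %res.scalar = bitcast <8 x i1> %res.vec to i8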
@@ -521,9 +521,9 @@ namespace {
   // type.
   static bool isLegalMaskCompare(SDNode *N, const X86Subtarget *Subtarget) {
     unsigned Opcode = N->getOpcode();
-    if (Opcode == X86ISD::CMPM || Opcode == X86ISD::STRICT_CMPM ||
-        Opcode == ISD::SETCC || Opcode == X86ISD::CMPM_SAE ||
-        Opcode == X86ISD::VFPCLASS) {
+    if (Opcode == X86ISD::CMPM || Opcode == X86ISD::CMPMM ||
+        Opcode == X86ISD::STRICT_CMPM || Opcode == ISD::SETCC ||
+        Opcode == X86ISD::CMPMM_SAE || Opcode == X86ISD::VFPCLASS) {
       // We can get 256-bit 8 element types here without VLX being enabled. When
       // this happens we will use 512-bit operations and the mask will not be
       // zero extended.
@@ -24709,20 +24709,21 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
     case CMP_MASK_CC: {
       MVT MaskVT = Op.getSimpleValueType();
       SDValue CC = Op.getOperand(3);
+      SDValue Mask = Op.getOperand(4);
       // We specify 2 possible opcodes for intrinsics with rounding modes.
       // First, we check if the intrinsic may have non-default rounding mode,
       // (IntrData->Opc1 != 0), then we check the rounding mode operand.
       if (IntrData->Opc1 != 0) {
-        SDValue Sae = Op.getOperand(4);
+        SDValue Sae = Op.getOperand(5);
         if (isRoundModeSAE(Sae))
           return DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
-                             Op.getOperand(2), CC, Sae);
+                             Op.getOperand(2), CC, Mask, Sae);
         if (!isRoundModeCurDirection(Sae))
           return SDValue();
       }
       //default rounding mode
       return DAG.getNode(IntrData->Opc0, dl, MaskVT,
-                         {Op.getOperand(1), Op.getOperand(2), CC});
+                         {Op.getOperand(1), Op.getOperand(2), CC, Mask});
     }
     case CMP_MASK_SCALAR_CC: {
       SDValue Src1 = Op.getOperand(1);
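For orientation, the operand layout this lowering now expects, annotated on the 512-bit form used by the tests (a hand-written sketch; the operand numbers are the 1-based positions read by the code above, and 8 is the "no exceptions"/SAE rounding encoding):

  %r = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(
         <16 x float> %a,  ; operand 1
         <16 x float> %b,  ; operand 2
         i32 2,            ; operand 3: compare predicate (2 = LE_OS)
         <16 x i1> %m,     ; operand 4: mask, new in this patch
         i32 8)            ; operand 5: SAE/rounding immediate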
@@ -30302,8 +30303,9 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   NODE_NAME_CASE(COMI)
   NODE_NAME_CASE(UCOMI)
   NODE_NAME_CASE(CMPM)
+  NODE_NAME_CASE(CMPMM)
   NODE_NAME_CASE(STRICT_CMPM)
-  NODE_NAME_CASE(CMPM_SAE)
+  NODE_NAME_CASE(CMPMM_SAE)
   NODE_NAME_CASE(SETCC)
   NODE_NAME_CASE(SETCC_CARRY)
   NODE_NAME_CASE(FSETCC)
@@ -384,8 +384,10 @@ namespace llvm {
     /// Vector comparison generating mask bits for fp and
     /// integer signed and unsigned data types.
     CMPM,
-    // Vector comparison with SAE for FP values
-    CMPM_SAE,
+    // Vector mask comparison generating mask bits for FP values.
+    CMPMM,
+    // Vector mask comparison with SAE for FP values.
+    CMPMM_SAE,

     // Arithmetic operations with FLAGS results.
     ADD,
@@ -1150,39 +1150,6 @@ X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
     }
     break;
   }
-  case Intrinsic::x86_avx512_cmp_pd_128:
-  case Intrinsic::x86_avx512_cmp_pd_256:
-  case Intrinsic::x86_avx512_cmp_pd_512:
-  case Intrinsic::x86_avx512_cmp_ps_128:
-  case Intrinsic::x86_avx512_cmp_ps_256:
-  case Intrinsic::x86_avx512_cmp_ps_512: {
-    // Folding cmp(sub(a,b),0) -> cmp(a,b) and cmp(0,sub(a,b)) -> cmp(b,a)
-    Value *Arg0 = II.getArgOperand(0);
-    Value *Arg1 = II.getArgOperand(1);
-    bool Arg0IsZero = match(Arg0, PatternMatch::m_PosZeroFP());
-    if (Arg0IsZero)
-      std::swap(Arg0, Arg1);
-    Value *A, *B;
-    // This fold requires only the NINF(not +/- inf) since inf minus
-    // inf is nan.
-    // NSZ(No Signed Zeros) is not needed because zeros of any sign are
-    // equal for both compares.
-    // NNAN is not needed because nans compare the same for both compares.
-    // The compare intrinsic uses the above assumptions and therefore
-    // doesn't require additional flags.
-    if ((match(Arg0,
-               PatternMatch::m_OneUse(PatternMatch::m_FSub(
-                   PatternMatch::m_Value(A), PatternMatch::m_Value(B)))) &&
-         match(Arg1, PatternMatch::m_PosZeroFP()) && isa<Instruction>(Arg0) &&
-         cast<Instruction>(Arg0)->getFastMathFlags().noInfs())) {
-      if (Arg0IsZero)
-        std::swap(A, B);
-      IC.replaceOperand(II, 0, A);
-      IC.replaceOperand(II, 1, B);
-      return &II;
-    }
-    break;
-  }

   case Intrinsic::x86_avx512_add_ps_512:
   case Intrinsic::x86_avx512_div_ps_512:
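In IR terms the dropped combine performed roughly the following rewrite (a sketch using the old intrinsic names it matched; value names are invented, and only the ninf flag on the fsub was required, for the reasons given in the deleted comments):

; Matched: compare of (a - b) against +0.0 ...
  %sub = fsub ninf <16 x float> %a, %b
  %cmp = call <16 x i1> @llvm.x86.avx512.cmp.ps.512(<16 x float> %sub, <16 x float> zeroinitializer, i32 2, i32 4)
; ... was rewritten to compare a and b directly:
  %cmp.direct = call <16 x i1> @llvm.x86.avx512.cmp.ps.512(<16 x float> %a, <16 x float> %b, i32 2, i32 4)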
@@ -2494,10 +2494,6 @@ def X86cmpm_su : PatFrag<(ops node:$src1, node:$src2, node:$cc),
                          (X86cmpm node:$src1, node:$src2, node:$cc), [{
   return N->hasOneUse();
 }]>;
-def X86cmpmSAE_su : PatFrag<(ops node:$src1, node:$src2, node:$cc),
-                            (X86cmpmSAE node:$src1, node:$src2, node:$cc), [{
-  return N->hasOneUse();
-}]>;

 def X86cmpm_imm_commute : SDNodeXForm<timm, [{
   uint8_t Imm = X86::getSwappedVCMPImm(N->getZExtValue() & 0x1f);
@@ -2564,19 +2560,71 @@ let Uses = [MXCSR], mayRaiseFPException = 1 in {
             (!cast<Instruction>(Name#_.ZSuffix#"rmbik") _.KRCWM:$mask,
                                                         _.RC:$src1, addr:$src2,
                                                         (X86cmpm_imm_commute timm:$cc))>;

+  // Patterns for mask intrinsics.
+  def : Pat<(X86cmpmm (_.VT _.RC:$src1), (_.VT _.RC:$src2), timm:$cc,
+                      (_.KVT immAllOnesV)),
+            (!cast<Instruction>(Name#_.ZSuffix#"rri") _.RC:$src1, _.RC:$src2, timm:$cc)>;
+
+  def : Pat<(X86cmpmm (_.VT _.RC:$src1), (_.VT _.RC:$src2), timm:$cc, _.KRCWM:$mask),
+            (!cast<Instruction>(Name#_.ZSuffix#"rrik") _.KRCWM:$mask, _.RC:$src1,
+                                                       _.RC:$src2, timm:$cc)>;
+
+  def : Pat<(X86cmpmm (_.VT _.RC:$src1), (_.VT (_.LdFrag addr:$src2)), timm:$cc,
+                      (_.KVT immAllOnesV)),
+            (!cast<Instruction>(Name#_.ZSuffix#"rmi") _.RC:$src1, addr:$src2, timm:$cc)>;
+
+  def : Pat<(X86cmpmm (_.VT _.RC:$src1), (_.VT (_.LdFrag addr:$src2)), timm:$cc,
+                      _.KRCWM:$mask),
+            (!cast<Instruction>(Name#_.ZSuffix#"rmik") _.KRCWM:$mask, _.RC:$src1,
+                                                       addr:$src2, timm:$cc)>;
+
+  def : Pat<(X86cmpmm (_.VT _.RC:$src1), (_.VT (_.BroadcastLdFrag addr:$src2)), timm:$cc,
+                      (_.KVT immAllOnesV)),
+            (!cast<Instruction>(Name#_.ZSuffix#"rmbi") _.RC:$src1, addr:$src2, timm:$cc)>;
+
+  def : Pat<(X86cmpmm (_.VT _.RC:$src1), (_.VT (_.BroadcastLdFrag addr:$src2)), timm:$cc,
+                      _.KRCWM:$mask),
+            (!cast<Instruction>(Name#_.ZSuffix#"rmbik") _.KRCWM:$mask, _.RC:$src1,
+                                                        addr:$src2, timm:$cc)>;
+
+  // Patterns for mask intrinsics with loads in other operand.
+  def : Pat<(X86cmpmm (_.VT (_.LdFrag addr:$src2)), (_.VT _.RC:$src1), timm:$cc,
+                      (_.KVT immAllOnesV)),
+            (!cast<Instruction>(Name#_.ZSuffix#"rmi") _.RC:$src1, addr:$src2,
+                                                      (X86cmpm_imm_commute timm:$cc))>;
+
+  def : Pat<(X86cmpmm (_.VT (_.LdFrag addr:$src2)), (_.VT _.RC:$src1), timm:$cc,
+                      _.KRCWM:$mask),
+            (!cast<Instruction>(Name#_.ZSuffix#"rmik") _.KRCWM:$mask,
+                                                       _.RC:$src1, addr:$src2,
+                                                       (X86cmpm_imm_commute timm:$cc))>;
+
+  def : Pat<(X86cmpmm (_.VT (_.BroadcastLdFrag addr:$src2)), (_.VT _.RC:$src1), timm:$cc,
+                      (_.KVT immAllOnesV)),
+            (!cast<Instruction>(Name#_.ZSuffix#"rmbi") _.RC:$src1, addr:$src2,
+                                                       (X86cmpm_imm_commute timm:$cc))>;
+
+  def : Pat<(X86cmpmm (_.VT (_.BroadcastLdFrag addr:$src2)), (_.VT _.RC:$src1), timm:$cc,
+                      _.KRCWM:$mask),
+            (!cast<Instruction>(Name#_.ZSuffix#"rmbik") _.KRCWM:$mask,
+                                                        _.RC:$src1, addr:$src2,
+                                                        (X86cmpm_imm_commute timm:$cc))>;
 }

 multiclass avx512_vcmp_sae<X86FoldableSchedWrite sched, X86VectorVTInfo _> {
   // comparison code form (VCMP[EQ/LT/LE/...]
   let Uses = [MXCSR] in
-  defm  rrib  : AVX512_maskable_cmp<0xC2, MRMSrcReg, _,
-                     (outs _.KRC:$dst),(ins _.RC:$src1, _.RC:$src2, u8imm:$cc),
+  defm  rrib  : AVX512_maskable_custom_cmp<0xC2, MRMSrcReg, (outs _.KRC:$dst),
+                     (ins _.RC:$src1, _.RC:$src2, u8imm:$cc),
+                     (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2, u8imm:$cc),
                      "vcmp"#_.Suffix,
                      "$cc, {sae}, $src2, $src1",
                      "$src1, $src2, {sae}, $cc",
-                     (X86cmpmSAE (_.VT _.RC:$src1), (_.VT _.RC:$src2), timm:$cc),
-                     (X86cmpmSAE_su (_.VT _.RC:$src1), (_.VT _.RC:$src2),
-                                    timm:$cc)>,
+                     [(set _.KRC:$dst, (X86cmpmmSAE (_.VT _.RC:$src1),
+                                        (_.VT _.RC:$src2), timm:$cc, (_.KVT immAllOnesV)))],
+                     [(set _.KRC:$dst, (X86cmpmmSAE (_.VT _.RC:$src1),
+                                        (_.VT _.RC:$src2), timm:$cc, _.KRCWM:$mask))]>,
                      EVEX_B, Sched<[sched]>;
 }
@@ -207,16 +207,21 @@ def X86CmpMaskCC :
       SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCVecEltisVT<0, i1>,
                        SDTCisVec<1>, SDTCisSameAs<2, 1>,
                        SDTCisSameNumEltsAs<0, 1>, SDTCisVT<3, i8>]>;
+def X86MaskCmpMaskCC :
+      SDTypeProfile<1, 4, [SDTCisVec<0>, SDTCVecEltisVT<0, i1>,
+                       SDTCisVec<1>, SDTCisSameAs<2, 1>,
+                       SDTCisSameNumEltsAs<0, 1>, SDTCisVT<3, i8>, SDTCisSameAs<4, 0>]>;
 def X86CmpMaskCCScalar :
       SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisFP<1>, SDTCisSameAs<1, 2>,
                        SDTCisVT<3, i8>]>;

 def X86cmpm     : SDNode<"X86ISD::CMPM", X86CmpMaskCC>;
+def X86cmpmm    : SDNode<"X86ISD::CMPMM", X86MaskCmpMaskCC>;
 def X86strict_cmpm : SDNode<"X86ISD::STRICT_CMPM", X86CmpMaskCC, [SDNPHasChain]>;
 def X86any_cmpm : PatFrags<(ops node:$src1, node:$src2, node:$src3),
                            [(X86strict_cmpm node:$src1, node:$src2, node:$src3),
                             (X86cmpm node:$src1, node:$src2, node:$src3)]>;
-def X86cmpmSAE  : SDNode<"X86ISD::CMPM_SAE", X86CmpMaskCC>;
+def X86cmpmmSAE : SDNode<"X86ISD::CMPMM_SAE", X86MaskCmpMaskCC>;
 def X86cmpms    : SDNode<"X86ISD::FSETCCM", X86CmpMaskCCScalar>;
 def X86cmpmsSAE : SDNode<"X86ISD::FSETCCM_SAE", X86CmpMaskCCScalar>;
@@ -417,12 +417,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
   X86_INTRINSIC_DATA(avx2_psrlv_q_256, INTR_TYPE_2OP, X86ISD::VSRLV, 0),
   X86_INTRINSIC_DATA(avx512_add_pd_512, INTR_TYPE_2OP, ISD::FADD, X86ISD::FADD_RND),
   X86_INTRINSIC_DATA(avx512_add_ps_512, INTR_TYPE_2OP, ISD::FADD, X86ISD::FADD_RND),
-  X86_INTRINSIC_DATA(avx512_cmp_pd_128, CMP_MASK_CC, X86ISD::CMPM, 0),
-  X86_INTRINSIC_DATA(avx512_cmp_pd_256, CMP_MASK_CC, X86ISD::CMPM, 0),
-  X86_INTRINSIC_DATA(avx512_cmp_pd_512, CMP_MASK_CC, X86ISD::CMPM, X86ISD::CMPM_SAE),
-  X86_INTRINSIC_DATA(avx512_cmp_ps_128, CMP_MASK_CC, X86ISD::CMPM, 0),
-  X86_INTRINSIC_DATA(avx512_cmp_ps_256, CMP_MASK_CC, X86ISD::CMPM, 0),
-  X86_INTRINSIC_DATA(avx512_cmp_ps_512, CMP_MASK_CC, X86ISD::CMPM, X86ISD::CMPM_SAE),
   X86_INTRINSIC_DATA(avx512_conflict_d_128, INTR_TYPE_1OP, X86ISD::CONFLICT, 0),
   X86_INTRINSIC_DATA(avx512_conflict_d_256, INTR_TYPE_1OP, X86ISD::CONFLICT, 0),
   X86_INTRINSIC_DATA(avx512_conflict_d_512, INTR_TYPE_1OP, X86ISD::CONFLICT, 0),
@@ -464,6 +458,12 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
                      X86ISD::FADDS, X86ISD::FADDS_RND),
   X86_INTRINSIC_DATA(avx512_mask_add_ss_round, INTR_TYPE_SCALAR_MASK,
                      X86ISD::FADDS, X86ISD::FADDS_RND),
+  X86_INTRINSIC_DATA(avx512_mask_cmp_pd_128, CMP_MASK_CC, X86ISD::CMPMM, 0),
+  X86_INTRINSIC_DATA(avx512_mask_cmp_pd_256, CMP_MASK_CC, X86ISD::CMPMM, 0),
+  X86_INTRINSIC_DATA(avx512_mask_cmp_pd_512, CMP_MASK_CC, X86ISD::CMPMM, X86ISD::CMPMM_SAE),
+  X86_INTRINSIC_DATA(avx512_mask_cmp_ps_128, CMP_MASK_CC, X86ISD::CMPMM, 0),
+  X86_INTRINSIC_DATA(avx512_mask_cmp_ps_256, CMP_MASK_CC, X86ISD::CMPMM, 0),
+  X86_INTRINSIC_DATA(avx512_mask_cmp_ps_512, CMP_MASK_CC, X86ISD::CMPMM, X86ISD::CMPMM_SAE),
   X86_INTRINSIC_DATA(avx512_mask_cmp_sd, CMP_MASK_SCALAR_CC,
                      X86ISD::FSETCCM, X86ISD::FSETCCM_SAE),
   X86_INTRINSIC_DATA(avx512_mask_cmp_ss, CMP_MASK_SCALAR_CC,
@@ -22,13 +22,13 @@ define zeroext i16 @cmp_kor_seq_16(<16 x float> %a, <16 x float> %b, <16 x float
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq
 entry:
-  %0 = call <16 x i1> @llvm.x86.avx512.cmp.ps.512(<16 x float> %a, <16 x float> %x, i32 13, i32 4)
+  %0 = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %a, <16 x float> %x, i32 13, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 4)
   %1 = bitcast <16 x i1> %0 to i16
-  %2 = call <16 x i1> @llvm.x86.avx512.cmp.ps.512(<16 x float> %b, <16 x float> %x, i32 13, i32 4)
+  %2 = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %b, <16 x float> %x, i32 13, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 4)
   %3 = bitcast <16 x i1> %2 to i16
-  %4 = call <16 x i1> @llvm.x86.avx512.cmp.ps.512(<16 x float> %c, <16 x float> %x, i32 13, i32 4)
+  %4 = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %c, <16 x float> %x, i32 13, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 4)
   %5 = bitcast <16 x i1> %4 to i16
-  %6 = call <16 x i1> @llvm.x86.avx512.cmp.ps.512(<16 x float> %d, <16 x float> %x, i32 13, i32 4)
+  %6 = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %d, <16 x float> %x, i32 13, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 4)
   %7 = bitcast <16 x i1> %6 to i16
   %8 = bitcast i16 %1 to <16 x i1>
   %9 = bitcast i16 %3 to <16 x i1>
@@ -46,7 +46,7 @@ entry:
 }

 ; Function Attrs: nounwind readnone
-declare <16 x i1> @llvm.x86.avx512.cmp.ps.512(<16 x float>, <16 x float>, i32, i32) #1
+declare <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float>, <16 x float>, i32, <16 x i1>, i32) #1

 attributes #0 = { nounwind readnone uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="knl" "target-features"="+adx,+aes,+avx,+avx2,+avx512cd,+avx512er,+avx512f,+avx512pf,+bmi,+bmi2,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+prefetchwt1,+rdrnd,+rdseed,+rtm,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { nounwind readnone }
@@ -1,40 +1,23 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512F
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512DQ
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512vl | FileCheck %s --check-prefixes=CHECK,AVX512VL
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512vl,+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512DQVL

define <4 x i64> @PR32546(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x float> %d) {
; AVX512F-LABEL: PR32546:
; AVX512F:       ## %bb.0: ## %entry
; AVX512F-NEXT:    ## kill: def $ymm3 killed $ymm3 def $zmm3
; AVX512F-NEXT:    ## kill: def $ymm2 killed $ymm2 def $zmm2
; AVX512F-NEXT:    ## kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT:    vcmpltps %zmm3, %zmm2, %k1
; AVX512F-NEXT:    vcmpltps %zmm1, %zmm0, %k0 {%k1}
; AVX512F-NEXT:    kmovw %k0, %eax
; AVX512F-NEXT:    movzbl %al, %eax
; AVX512F-NEXT:    vmovd %eax, %xmm0
; AVX512F-NEXT:    vpbroadcastd %xmm0, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512DQ-LABEL: PR32546:
; AVX512DQ:       ## %bb.0: ## %entry
; AVX512DQ-NEXT:    ## kill: def $ymm3 killed $ymm3 def $zmm3
; AVX512DQ-NEXT:    ## kill: def $ymm2 killed $ymm2 def $zmm2
; AVX512DQ-NEXT:    ## kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512DQ-NEXT:    ## kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512DQ-NEXT:    vcmpltps %zmm3, %zmm2, %k1
; AVX512DQ-NEXT:    vcmpltps %zmm1, %zmm0, %k0 {%k1}
; AVX512DQ-NEXT:    kmovb %k0, %eax
; AVX512DQ-NEXT:    vmovd %eax, %xmm0
; AVX512DQ-NEXT:    vpbroadcastd %xmm0, %ymm0
; AVX512DQ-NEXT:    retq
; AVX512VL-LABEL: PR32546:
; AVX512VL:       ## %bb.0: ## %entry
; AVX512VL-NEXT:    vcmpltps %ymm1, %ymm0, %k0
; AVX512VL-NEXT:    vcmpltps %ymm3, %ymm2, %k1
; AVX512VL-NEXT:    kandw %k0, %k1, %k0
; AVX512VL-NEXT:    kmovw %k0, %eax
; AVX512VL-NEXT:    movzbl %al, %eax
; AVX512VL-NEXT:    vpbroadcastd %eax, %ymm0
; AVX512VL-NEXT:    retq
;
; AVX512DQVL-LABEL: PR32546:
; AVX512DQVL:       ## %bb.0: ## %entry
; AVX512DQVL-NEXT:    vcmpltps %ymm1, %ymm0, %k0
; AVX512DQVL-NEXT:    vcmpltps %ymm3, %ymm2, %k1
; AVX512DQVL-NEXT:    vcmpltps %ymm1, %ymm0, %k0 {%k1}
; AVX512DQVL-NEXT:    kandb %k0, %k1, %k0
; AVX512DQVL-NEXT:    kmovb %k0, %eax
; AVX512DQVL-NEXT:    vpbroadcastd %eax, %ymm0
; AVX512DQVL-NEXT:    retq
@@ -48,4 +31,108 @@ entry:
   %3 = bitcast <8 x i32> %vecinit7.i to <4 x i64>
   ret <4 x i64> %3
 }
-declare i8 @llvm.x86.avx512.mask.cmp.ps.256(<8 x float>, <8 x float>, i32, i8)
+
+define void @PR32547(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x float> %d, float* %p) {
+; CHECK-LABEL: PR32547:
+; CHECK:       ## %bb.0: ## %entry
+; CHECK-NEXT:    vcmpltps %ymm1, %ymm0, %k0
+; CHECK-NEXT:    vcmpltps %ymm3, %ymm2, %k1
+; CHECK-NEXT:    kunpckbw %k1, %k0, %k1
+; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vmovaps %zmm0, (%rdi) {%k1}
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
+entry:
+  %0 = tail call i8 @llvm.x86.avx512.mask.cmp.ps.256(<8 x float> %a, <8 x float> %b, i32 1, i8 -1)
+  %1 = tail call i8 @llvm.x86.avx512.mask.cmp.ps.256(<8 x float> %c, <8 x float> %d, i32 1, i8 -1)
+  %conv.i = zext i8 %0 to i16
+  %conv.i18 = zext i8 %1 to i16
+  %shl = shl nuw i16 %conv.i, 8
+  %or = or i16 %shl, %conv.i18
+  %2 = bitcast float* %p to <16 x float>*
+  %3 = bitcast i16 %or to <16 x i1>
+  tail call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> zeroinitializer, <16 x float>* %2, i32 64, <16 x i1> %3)
+  ret void
+}
+
+define void @PR32547_swap(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x float> %d, float* %p) {
+; CHECK-LABEL: PR32547_swap:
+; CHECK:       ## %bb.0: ## %entry
+; CHECK-NEXT:    vcmpltps %ymm1, %ymm0, %k0
+; CHECK-NEXT:    vcmpltps %ymm3, %ymm2, %k1
+; CHECK-NEXT:    kunpckbw %k1, %k0, %k1
+; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vmovaps %zmm0, (%rdi) {%k1}
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
+entry:
+  %0 = tail call i8 @llvm.x86.avx512.mask.cmp.ps.256(<8 x float> %a, <8 x float> %b, i32 1, i8 -1)
+  %1 = tail call i8 @llvm.x86.avx512.mask.cmp.ps.256(<8 x float> %c, <8 x float> %d, i32 1, i8 -1)
+  %conv.i = zext i8 %0 to i16
+  %conv.i18 = zext i8 %1 to i16
+  %shl = shl nuw i16 %conv.i, 8
+  %or = or i16 %conv.i18, %shl
+  %2 = bitcast float* %p to <16 x float>*
+  %3 = bitcast i16 %or to <16 x i1>
+  tail call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> zeroinitializer, <16 x float>* %2, i32 64, <16 x i1> %3)
+  ret void
+}
+
+define void @mask_cmp_128(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, float* %p) {
+; AVX512VL-LABEL: mask_cmp_128:
+; AVX512VL:       ## %bb.0: ## %entry
+; AVX512VL-NEXT:    vcmpltps %xmm1, %xmm0, %k0
+; AVX512VL-NEXT:    kmovw %k0, %eax
+; AVX512VL-NEXT:    vcmpltps %xmm3, %xmm2, %k0
+; AVX512VL-NEXT:    shlb $4, %al
+; AVX512VL-NEXT:    kmovw %eax, %k1
+; AVX512VL-NEXT:    korw %k1, %k0, %k1
+; AVX512VL-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX512VL-NEXT:    vmovaps %ymm0, (%rdi) {%k1}
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512DQVL-LABEL: mask_cmp_128:
+; AVX512DQVL:       ## %bb.0: ## %entry
+; AVX512DQVL-NEXT:    vcmpltps %xmm1, %xmm0, %k0
+; AVX512DQVL-NEXT:    vcmpltps %xmm3, %xmm2, %k1
+; AVX512DQVL-NEXT:    kshiftlb $4, %k0, %k0
+; AVX512DQVL-NEXT:    korb %k0, %k1, %k1
+; AVX512DQVL-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX512DQVL-NEXT:    vmovaps %ymm0, (%rdi) {%k1}
+; AVX512DQVL-NEXT:    vzeroupper
+; AVX512DQVL-NEXT:    retq
+entry:
+  %0 = tail call i8 @llvm.x86.avx512.mask.cmp.ps.128(<4 x float> %a, <4 x float> %b, i32 1, i8 -1)
+  %1 = tail call i8 @llvm.x86.avx512.mask.cmp.ps.128(<4 x float> %c, <4 x float> %d, i32 1, i8 -1)
+  %shl = shl nuw i8 %0, 4
+  %or = or i8 %1, %shl
+  %2 = bitcast float* %p to <8 x float>*
+  %3 = bitcast i8 %or to <8 x i1>
+  tail call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> zeroinitializer, <8 x float>* %2, i32 64, <8 x i1> %3)
+  ret void
+}
+
+define <16 x float> @mask_cmp_512(<16 x float> %a, <16 x float> %b, <16 x float> %c, <16 x float> %d, float* %p) {
+; CHECK-LABEL: mask_cmp_512:
+; CHECK:       ## %bb.0: ## %entry
+; CHECK-NEXT:    vcmpltps {sae}, %zmm1, %zmm0, %k0
+; CHECK-NEXT:    vcmpltps %zmm3, %zmm2, %k1
+; CHECK-NEXT:    kxnorw %k1, %k0, %k1
+; CHECK-NEXT:    vmovaps (%rdi), %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq
+entry:
+  %0 = tail call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %a, <16 x float> %b, i32 1, i16 -1, i32 8)
+  %1 = tail call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %c, <16 x float> %d, i32 1, i16 -1, i32 4)
+  %2 = bitcast float* %p to <16 x float>*
+  %3 = load <16 x float>, <16 x float>* %2
+  %4 = xor i16 %0, %1
+  %5 = bitcast i16 %4 to <16 x i1>
+  %6 = select <16 x i1> %5, <16 x float> zeroinitializer, <16 x float> %3
+  ret <16 x float> %6
+}
+declare i8 @llvm.x86.avx512.mask.cmp.ps.128(<4 x float>, <4 x float>, i32, i8)
+declare i8 @llvm.x86.avx512.mask.cmp.ps.256(<8 x float>, <8 x float>, i32, i8)
+declare i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float>, <16 x float>, i32, i16, i32)
+declare void @llvm.masked.store.v8f32.p0v8f32(<8 x float>, <8 x float>*, i32, <8 x i1>)
+declare void @llvm.masked.store.v16f32.p0v16f32(<16 x float>, <16 x float>*, i32, <16 x i1>)
@@ -10870,3 +10870,32 @@ define <16 x i32> @test_maskz_expand_d_512(<16 x i32> %data, i16 %mask) {
 }

 declare <16 x i32> @llvm.x86.avx512.mask.expand.d.512(<16 x i32> %data, <16 x i32> %src0, i16 %mask)
+
+define <16 x float> @test_cmp_512(<16 x float> %a, <16 x float> %b, <16 x float> %c, <16 x float> %d, float* %p) {
+; X86-LABEL: test_cmp_512:
+; X86:       ## %bb.0: ## %entry
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vcmpltps {sae}, %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf1,0x7c,0x18,0xc2,0xc1,0x01]
+; X86-NEXT:    vcmpltps %zmm3, %zmm2, %k1 ## encoding: [0x62,0xf1,0x6c,0x48,0xc2,0xcb,0x01]
+; X86-NEXT:    kxnorw %k1, %k0, %k1 ## encoding: [0xc5,0xfc,0x46,0xc9]
+; X86-NEXT:    vmovaps (%eax), %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xc9,0x28,0x00]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_cmp_512:
+; X64:       ## %bb.0: ## %entry
+; X64-NEXT:    vcmpltps {sae}, %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf1,0x7c,0x18,0xc2,0xc1,0x01]
+; X64-NEXT:    vcmpltps %zmm3, %zmm2, %k1 ## encoding: [0x62,0xf1,0x6c,0x48,0xc2,0xcb,0x01]
+; X64-NEXT:    kxnorw %k1, %k0, %k1 ## encoding: [0xc5,0xfc,0x46,0xc9]
+; X64-NEXT:    vmovaps (%rdi), %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xc9,0x28,0x07]
+; X64-NEXT:    retq ## encoding: [0xc3]
+entry:
+  %0 = tail call <16 x i1> @llvm.x86.avx512.cmp.ps.512(<16 x float> %a, <16 x float> %b, i32 1, i32 8)
+  %1 = tail call <16 x i1> @llvm.x86.avx512.cmp.ps.512(<16 x float> %c, <16 x float> %d, i32 1, i32 4)
+  %2 = bitcast float* %p to <16 x float>*
+  %3 = load <16 x float>, <16 x float>* %2
+  %4 = xor <16 x i1> %0, %1
+  %5 = select <16 x i1> %4, <16 x float> zeroinitializer, <16 x float> %3
+  ret <16 x float> %5
+}
+
+declare <16 x i1> @llvm.x86.avx512.cmp.ps.512(<16 x float>, <16 x float>, i32, i32)
@@ -1046,11 +1046,11 @@ define i16 @test_cmpps(<16 x float> %a, <16 x float> %b) {
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    ret{{[l|q]}}
-  %res = call <16 x i1> @llvm.x86.avx512.cmp.ps.512(<16 x float> %a, <16 x float> %b, i32 2, i32 8)
+  %res = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %a, <16 x float> %b, i32 2, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 8)
   %1 = bitcast <16 x i1> %res to i16
   ret i16 %1
 }
-declare <16 x i1> @llvm.x86.avx512.cmp.ps.512(<16 x float>, <16 x float>, i32, i32)
+declare <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float>, <16 x float>, i32, <16 x i1>, i32)

 define i8 @test_cmppd(<8 x double> %a, <8 x double> %b) {
 ; CHECK-LABEL: test_cmppd:
@@ -1060,11 +1060,11 @@ define i8 @test_cmppd(<8 x double> %a, <8 x double> %b) {
 ; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    ret{{[l|q]}}
-  %res = call <8 x i1> @llvm.x86.avx512.cmp.pd.512(<8 x double> %a, <8 x double> %b, i32 4, i32 4)
+  %res = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %a, <8 x double> %b, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 4)
   %1 = bitcast <8 x i1> %res to i8
   ret i8 %1
 }
-declare <8 x i1> @llvm.x86.avx512.cmp.pd.512(<8 x double>, <8 x double>, i32, i32)
+declare <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double>, <8 x double>, i32, <8 x i1>, i32)

 ; Function Attrs: nounwind readnone
@@ -7521,9 +7521,9 @@ define <16 x float> @bad_mask_transition(<8 x double> %a, <8 x double> %b, <8 x
 ; X86-NEXT:    .cfi_def_cfa %esp, 4
 ; X86-NEXT:    retl
 entry:
-  %0 = call <8 x i1> @llvm.x86.avx512.cmp.pd.512(<8 x double> %a, <8 x double> %b, i32 17, i32 4)
+  %0 = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %a, <8 x double> %b, i32 17, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 4)
   %1 = bitcast <8 x i1> %0 to i8
-  %2 = call <8 x i1> @llvm.x86.avx512.cmp.pd.512(<8 x double> %c, <8 x double> %d, i32 17, i32 4)
+  %2 = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %c, <8 x double> %d, i32 17, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 4)
   %3 = bitcast <8 x i1> %2 to i8
   %conv = zext i8 %1 to i16
   %conv2 = zext i8 %3 to i16
@@ -7561,7 +7561,7 @@ define <16 x float> @bad_mask_transition_2(<8 x double> %a, <8 x double> %b, <8
 ; X86-NEXT:    .cfi_def_cfa %esp, 4
 ; X86-NEXT:    retl
 entry:
-  %0 = call <8 x i1> @llvm.x86.avx512.cmp.pd.512(<8 x double> %a, <8 x double> %b, i32 17, i32 4)
+  %0 = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %a, <8 x double> %b, i32 17, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 4)
   %1 = bitcast <8 x i1> %0 to i8
   %conv = zext i8 %1 to i16
   %2 = bitcast i16 %conv to <16 x i1>
@@ -17233,3 +17233,90 @@ define <8 x i32> @test_maskz_expand_d_256(<8 x i32> %data, i8 %mask) {
 }

 declare <8 x i32> @llvm.x86.avx512.mask.expand.d.256(<8 x i32> %data, <8 x i32> %src0, i8 %mask)
+
+define void @test_cmp_128(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, float* %p) {
+; X86-LABEL: test_cmp_128:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %ebp # encoding: [0x55]
+; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:    .cfi_offset %ebp, -8
+; X86-NEXT:    movl %esp, %ebp # encoding: [0x89,0xe5]
+; X86-NEXT:    .cfi_def_cfa_register %ebp
+; X86-NEXT:    andl $-16, %esp # encoding: [0x83,0xe4,0xf0]
+; X86-NEXT:    subl $16, %esp # encoding: [0x83,0xec,0x10]
+; X86-NEXT:    movl 24(%ebp), %eax # encoding: [0x8b,0x45,0x18]
+; X86-NEXT:    vcmpltps %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7c,0x08,0xc2,0xc1,0x01]
+; X86-NEXT:    vcmpltps 8(%ebp), %xmm2, %k1 # encoding: [0x62,0xf1,0x6c,0x08,0xc2,0x8d,0x08,0x00,0x00,0x00,0x01]
+; X86-NEXT:    kshiftlw $4, %k1, %k1 # encoding: [0xc4,0xe3,0xf9,0x32,0xc9,0x04]
+; X86-NEXT:    korw %k1, %k0, %k1 # encoding: [0xc5,0xfc,0x45,0xc9]
+; X86-NEXT:    vxorps %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x57,0xc0]
+; X86-NEXT:    vmovaps %ymm0, (%eax) {%k1} # encoding: [0x62,0xf1,0x7c,0x29,0x29,0x00]
+; X86-NEXT:    movl %ebp, %esp # encoding: [0x89,0xec]
+; X86-NEXT:    popl %ebp # encoding: [0x5d]
+; X86-NEXT:    .cfi_def_cfa %esp, 4
+; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_cmp_128:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    vcmpltps %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7c,0x08,0xc2,0xc1,0x01]
+; X64-NEXT:    vcmpltps %xmm3, %xmm2, %k1 # encoding: [0x62,0xf1,0x6c,0x08,0xc2,0xcb,0x01]
+; X64-NEXT:    kshiftlw $4, %k1, %k1 # encoding: [0xc4,0xe3,0xf9,0x32,0xc9,0x04]
+; X64-NEXT:    korw %k1, %k0, %k1 # encoding: [0xc5,0xfc,0x45,0xc9]
+; X64-NEXT:    vxorps %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x57,0xc0]
+; X64-NEXT:    vmovaps %ymm0, (%rdi) {%k1} # encoding: [0x62,0xf1,0x7c,0x29,0x29,0x07]
+; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X64-NEXT:    retq # encoding: [0xc3]
+entry:
+  %0 = tail call <4 x i1> @llvm.x86.avx512.cmp.ps.128(<4 x float> %a, <4 x float> %b, i32 1)
+  %1 = tail call <4 x i1> @llvm.x86.avx512.cmp.ps.128(<4 x float> %c, <4 x float> %d, i32 1)
+  %2 = bitcast float* %p to <8 x float>*
+  %3 = shufflevector <4 x i1> %0, <4 x i1> %1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  tail call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> zeroinitializer, <8 x float>* %2, i32 64, <8 x i1> %3)
+  ret void
+}
+
+define void @test_cmp_256(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x float> %d, float* %p) {
+; X86-LABEL: test_cmp_256:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %ebp # encoding: [0x55]
+; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:    .cfi_offset %ebp, -8
+; X86-NEXT:    movl %esp, %ebp # encoding: [0x89,0xe5]
+; X86-NEXT:    .cfi_def_cfa_register %ebp
+; X86-NEXT:    andl $-32, %esp # encoding: [0x83,0xe4,0xe0]
+; X86-NEXT:    subl $32, %esp # encoding: [0x83,0xec,0x20]
+; X86-NEXT:    movl 40(%ebp), %eax # encoding: [0x8b,0x45,0x28]
+; X86-NEXT:    vcmpltps %ymm1, %ymm0, %k0 # encoding: [0x62,0xf1,0x7c,0x28,0xc2,0xc1,0x01]
+; X86-NEXT:    vcmpltps 8(%ebp), %ymm2, %k1 # encoding: [0x62,0xf1,0x6c,0x28,0xc2,0x8d,0x08,0x00,0x00,0x00,0x01]
+; X86-NEXT:    kunpckbw %k0, %k1, %k1 # encoding: [0xc5,0xf5,0x4b,0xc8]
+; X86-NEXT:    vxorps %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x57,0xc0]
+; X86-NEXT:    vmovaps %zmm0, (%eax) {%k1} # encoding: [0x62,0xf1,0x7c,0x49,0x29,0x00]
+; X86-NEXT:    movl %ebp, %esp # encoding: [0x89,0xec]
+; X86-NEXT:    popl %ebp # encoding: [0x5d]
+; X86-NEXT:    .cfi_def_cfa %esp, 4
+; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_cmp_256:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    vcmpltps %ymm1, %ymm0, %k0 # encoding: [0x62,0xf1,0x7c,0x28,0xc2,0xc1,0x01]
+; X64-NEXT:    vcmpltps %ymm3, %ymm2, %k1 # encoding: [0x62,0xf1,0x6c,0x28,0xc2,0xcb,0x01]
+; X64-NEXT:    kunpckbw %k0, %k1, %k1 # encoding: [0xc5,0xf5,0x4b,0xc8]
+; X64-NEXT:    vxorps %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x57,0xc0]
+; X64-NEXT:    vmovaps %zmm0, (%rdi) {%k1} # encoding: [0x62,0xf1,0x7c,0x49,0x29,0x07]
+; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X64-NEXT:    retq # encoding: [0xc3]
+entry:
+  %0 = tail call <8 x i1> @llvm.x86.avx512.cmp.ps.256(<8 x float> %a, <8 x float> %b, i32 1)
+  %1 = tail call <8 x i1> @llvm.x86.avx512.cmp.ps.256(<8 x float> %c, <8 x float> %d, i32 1)
+  %2 = bitcast float* %p to <16 x float>*
+  %3 = shufflevector <8 x i1> %0, <8 x i1> %1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  tail call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> zeroinitializer, <16 x float>* %2, i32 64, <16 x i1> %3)
+  ret void
+}
+
+declare <4 x i1> @llvm.x86.avx512.cmp.ps.128(<4 x float>, <4 x float>, i32)
+declare <8 x i1> @llvm.x86.avx512.cmp.ps.256(<8 x float>, <8 x float>, i32)
+declare void @llvm.masked.store.v8f32.p0v8f32(<8 x float>, <8 x float>*, i32, <8 x i1>)
+declare void @llvm.masked.store.v16f32.p0v16f32(<16 x float>, <16 x float>*, i32, <16 x i1>)
@@ -770,11 +770,11 @@ define i8 @test_cmpps_256(<8 x float> %a, <8 x float> %b) {
 ; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
-  %res = call <8 x i1> @llvm.x86.avx512.cmp.ps.256(<8 x float> %a, <8 x float> %b, i32 2)
+  %res = call <8 x i1> @llvm.x86.avx512.mask.cmp.ps.256(<8 x float> %a, <8 x float> %b, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
   %1 = bitcast <8 x i1> %res to i8
   ret i8 %1
 }
-declare <8 x i1> @llvm.x86.avx512.cmp.ps.256(<8 x float>, <8 x float>, i32)
+declare <8 x i1> @llvm.x86.avx512.mask.cmp.ps.256(<8 x float>, <8 x float>, i32, <8 x i1>)

 define i8 @test_cmpps_128(<4 x float> %a, <4 x float> %b) {
 ; CHECK-LABEL: test_cmpps_128:
@@ -783,12 +783,12 @@ define i8 @test_cmpps_128(<4 x float> %a, <4 x float> %b) {
 ; CHECK-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
 ; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
-  %res = call <4 x i1> @llvm.x86.avx512.cmp.ps.128(<4 x float> %a, <4 x float> %b, i32 2)
+  %res = call <4 x i1> @llvm.x86.avx512.mask.cmp.ps.128(<4 x float> %a, <4 x float> %b, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
   %1 = shufflevector <4 x i1> %res, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %2 = bitcast <8 x i1> %1 to i8
   ret i8 %2
 }
-declare <4 x i1> @llvm.x86.avx512.cmp.ps.128(<4 x float>, <4 x float>, i32)
+declare <4 x i1> @llvm.x86.avx512.mask.cmp.ps.128(<4 x float>, <4 x float>, i32, <4 x i1>)

 define i8 @test_cmppd_256(<4 x double> %a, <4 x double> %b) {
 ; CHECK-LABEL: test_cmppd_256:
@@ -798,12 +798,12 @@ define i8 @test_cmppd_256(<4 x double> %a, <4 x double> %b) {
 ; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
-  %res = call <4 x i1> @llvm.x86.avx512.cmp.pd.256(<4 x double> %a, <4 x double> %b, i32 2)
+  %res = call <4 x i1> @llvm.x86.avx512.mask.cmp.pd.256(<4 x double> %a, <4 x double> %b, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
   %1 = shufflevector <4 x i1> %res, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %2 = bitcast <8 x i1> %1 to i8
   ret i8 %2
 }
-declare <4 x i1> @llvm.x86.avx512.cmp.pd.256(<4 x double>, <4 x double>, i32)
+declare <4 x i1> @llvm.x86.avx512.mask.cmp.pd.256(<4 x double>, <4 x double>, i32, <4 x i1>)

 define i8 @test_cmppd_128(<2 x double> %a, <2 x double> %b) {
 ; CHECK-LABEL: test_cmppd_128:
@@ -812,12 +812,12 @@ define i8 @test_cmppd_128(<2 x double> %a, <2 x double> %b) {
 ; CHECK-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
 ; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
-  %res = call <2 x i1> @llvm.x86.avx512.cmp.pd.128(<2 x double> %a, <2 x double> %b, i32 2)
+  %res = call <2 x i1> @llvm.x86.avx512.mask.cmp.pd.128(<2 x double> %a, <2 x double> %b, i32 2, <2 x i1> <i1 true, i1 true>)
   %1 = shufflevector <2 x i1> %res, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
   %2 = bitcast <8 x i1> %1 to i8
   ret i8 %2
 }
-declare <2 x i1> @llvm.x86.avx512.cmp.pd.128(<2 x double>, <2 x double>, i32)
+declare <2 x i1> @llvm.x86.avx512.mask.cmp.pd.128(<2 x double>, <2 x double>, i32, <2 x i1>)

 define <8 x float> @test_mm512_maskz_max_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) {
 ; X86-LABEL: test_mm512_maskz_max_ps_256:
@@ -19358,7 +19358,7 @@ entry:
 }


-declare <16 x i1> @llvm.x86.avx512.cmp.ps.512(<16 x float>, <16 x float>, i32, i32)
+declare <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float>, <16 x float>, i32, <16 x i1>, i32)
 define zeroext i8 @test_vcmpoeqps_v4i1_v8i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
 ; VLX-LABEL: test_vcmpoeqps_v4i1_v8i1_mask:
 ; VLX:       # %bb.0: # %entry
@@ -20799,7 +20799,7 @@ define zeroext i32 @test_vcmpoeqps_v16i1_v32i1_sae_mask(<8 x i64> %__a, <8 x i64
 entry:
   %0 = bitcast <8 x i64> %__a to <16 x float>
   %1 = bitcast <8 x i64> %__b to <16 x float>
-  %2 = call <16 x i1> @llvm.x86.avx512.cmp.ps.512(<16 x float> %0, <16 x float> %1, i32 2, i32 8)
+  %2 = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %0, <16 x float> %1, i32 2, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 8)
   %3 = bitcast <16 x i1> %2 to i16
   %4 = zext i16 %3 to i32
   ret i32 %4
@@ -20824,7 +20824,7 @@ define zeroext i32 @test_masked_vcmpoeqps_v16i1_v32i1_sae_mask(i16 zeroext %__u,
 entry:
   %0 = bitcast <8 x i64> %__a to <16 x float>
   %1 = bitcast <8 x i64> %__b to <16 x float>
-  %2 = call <16 x i1> @llvm.x86.avx512.cmp.ps.512(<16 x float> %0, <16 x float> %1, i32 2, i32 8)
+  %2 = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %0, <16 x float> %1, i32 2, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 8)
   %3 = bitcast i16 %__u to <16 x i1>
   %4 = and <16 x i1> %2, %3
   %5 = bitcast <16 x i1> %4 to i16
@@ -21002,7 +21002,7 @@ define zeroext i64 @test_vcmpoeqps_v16i1_v64i1_sae_mask(<8 x i64> %__a, <8 x i64
 entry:
   %0 = bitcast <8 x i64> %__a to <16 x float>
   %1 = bitcast <8 x i64> %__b to <16 x float>
-  %2 = call <16 x i1> @llvm.x86.avx512.cmp.ps.512(<16 x float> %0, <16 x float> %1, i32 2, i32 8)
+  %2 = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %0, <16 x float> %1, i32 2, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 8)
   %3 = bitcast <16 x i1> %2 to i16
   %4 = zext i16 %3 to i64
   ret i64 %4
@@ -21027,7 +21027,7 @@ define zeroext i64 @test_masked_vcmpoeqps_v16i1_v64i1_sae_mask(i16 zeroext %__u,
 entry:
   %0 = bitcast <8 x i64> %__a to <16 x float>
   %1 = bitcast <8 x i64> %__b to <16 x float>
-  %2 = call <16 x i1> @llvm.x86.avx512.cmp.ps.512(<16 x float> %0, <16 x float> %1, i32 2, i32 8)
+  %2 = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %0, <16 x float> %1, i32 2, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 8)
   %3 = bitcast i16 %__u to <16 x i1>
   %4 = and <16 x i1> %2, %3
   %5 = bitcast <16 x i1> %4 to i16
@@ -21037,7 +21037,7 @@ entry:



-declare <8 x i1> @llvm.x86.avx512.cmp.pd.512(<8 x double>, <8 x double>, i32, i32)
+declare <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double>, <8 x double>, i32, <8 x i1>, i32)
 define zeroext i4 @test_vcmpoeqpd_v2i1_v4i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
 ; VLX-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask:
 ; VLX:       # %bb.0: # %entry
@@ -22867,7 +22867,7 @@ define zeroext i16 @test_vcmpoeqpd_v8i1_v16i1_sae_mask(<8 x i64> %__a, <8 x i64>
 entry:
   %0 = bitcast <8 x i64> %__a to <8 x double>
   %1 = bitcast <8 x i64> %__b to <8 x double>
-  %2 = call <8 x i1> @llvm.x86.avx512.cmp.pd.512(<8 x double> %0, <8 x double> %1, i32 2, i32 8)
+  %2 = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %0, <8 x double> %1, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 8)
   %3 = bitcast <8 x i1> %2 to i8
   %4 = zext i8 %3 to i16
   ret i16 %4
@@ -22896,7 +22896,7 @@ define zeroext i16 @test_masked_vcmpoeqpd_v8i1_v16i1_sae_mask(i8 zeroext %__u, <
 entry:
   %0 = bitcast <8 x i64> %__a to <8 x double>
   %1 = bitcast <8 x i64> %__b to <8 x double>
-  %2 = call <8 x i1> @llvm.x86.avx512.cmp.pd.512(<8 x double> %0, <8 x double> %1, i32 2, i32 8)
+  %2 = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %0, <8 x double> %1, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 8)
   %3 = bitcast i8 %__u to <8 x i1>
   %4 = and <8 x i1> %2, %3
   %5 = bitcast <8 x i1> %4 to i8
@@ -23082,7 +23082,7 @@ define zeroext i32 @test_vcmpoeqpd_v8i1_v32i1_sae_mask(<8 x i64> %__a, <8 x i64>
 entry:
   %0 = bitcast <8 x i64> %__a to <8 x double>
   %1 = bitcast <8 x i64> %__b to <8 x double>
-  %2 = call <8 x i1> @llvm.x86.avx512.cmp.pd.512(<8 x double> %0, <8 x double> %1, i32 2, i32 8)
+  %2 = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %0, <8 x double> %1, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 8)
   %3 = bitcast <8 x i1> %2 to i8
   %4 = zext i8 %3 to i32
   ret i32 %4
@@ -23109,7 +23109,7 @@ define zeroext i32 @test_masked_vcmpoeqpd_v8i1_v32i1_sae_mask(i8 zeroext %__u, <
 entry:
   %0 = bitcast <8 x i64> %__a to <8 x double>
   %1 = bitcast <8 x i64> %__b to <8 x double>
-  %2 = call <8 x i1> @llvm.x86.avx512.cmp.pd.512(<8 x double> %0, <8 x double> %1, i32 2, i32 8)
+  %2 = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %0, <8 x double> %1, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 8)
   %3 = bitcast i8 %__u to <8 x i1>
   %4 = and <8 x i1> %2, %3
   %5 = bitcast <8 x i1> %4 to i8
@@ -23295,7 +23295,7 @@ define zeroext i64 @test_vcmpoeqpd_v8i1_v64i1_sae_mask(<8 x i64> %__a, <8 x i64>
 entry:
   %0 = bitcast <8 x i64> %__a to <8 x double>
   %1 = bitcast <8 x i64> %__b to <8 x double>
-  %2 = call <8 x i1> @llvm.x86.avx512.cmp.pd.512(<8 x double> %0, <8 x double> %1, i32 2, i32 8)
+  %2 = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %0, <8 x double> %1, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 8)
   %3 = bitcast <8 x i1> %2 to i8
   %4 = zext i8 %3 to i64
   ret i64 %4
@@ -23322,7 +23322,7 @@ define zeroext i64 @test_masked_vcmpoeqpd_v8i1_v64i1_sae_mask(i8 zeroext %__u, <
 entry:
   %0 = bitcast <8 x i64> %__a to <8 x double>
   %1 = bitcast <8 x i64> %__b to <8 x double>
-  %2 = call <8 x i1> @llvm.x86.avx512.cmp.pd.512(<8 x double> %0, <8 x double> %1, i32 2, i32 8)
+  %2 = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %0, <8 x double> %1, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 8)
   %3 = bitcast i8 %__u to <8 x i1>
   %4 = and <8 x i1> %2, %3
   %5 = bitcast <8 x i1> %4 to i8
@@ -23345,7 +23345,7 @@ define i32 @test_cmpm_rnd_zero(<16 x float> %a, <16 x float> %b) {
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
-  %res = call <16 x i1> @llvm.x86.avx512.cmp.ps.512(<16 x float> %a, <16 x float> %b, i32 2, i32 8)
+  %res = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %a, <16 x float> %b, i32 2, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 8)
   %1 = bitcast <16 x i1> %res to i16
   %cast = bitcast i16 %1 to <16 x i1>
   %shuffle = shufflevector <16 x i1> %cast, <16 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
@@ -312,11 +312,11 @@ define i8 @stack_fold_cmppd(<8 x double> %a0, <8 x double> %a1) {
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %res = call <8 x i1> @llvm.x86.avx512.cmp.pd.512(<8 x double> %a0, <8 x double> %a1, i32 0, i32 4)
+  %res = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %a0, <8 x double> %a1, i32 0, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 4)
   %2 = bitcast <8 x i1> %res to i8
   ret i8 %2
 }
-declare <8 x i1> @llvm.x86.avx512.cmp.pd.512(<8 x double>, <8 x double>, i32, i32)
+declare <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double>, <8 x double>, i32, <8 x i1>, i32)

 define <8 x double> @stack_fold_cmppd_mask(<8 x double> %a0, <8 x double> %a1, <8 x double>* %a2, i8 %mask, <8 x double> %b0, <8 x double> %b1) {
 ; CHECK-LABEL: stack_fold_cmppd_mask:
@@ -332,8 +332,9 @@ define <8 x double> @stack_fold_cmppd_mask(<8 x double> %a0, <8 x double> %a1, <
 ; CHECK-NEXT:    #NO_APP
 ; CHECK-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; CHECK-NEXT:    vaddpd (%rdi), %zmm0, %zmm0
+; CHECK-NEXT:    vcmpeqpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k0 # 64-byte Folded Reload
 ; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    vcmpeqpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k1 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT:    kandb %k0, %k1, %k1
 ; CHECK-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; CHECK-NEXT:    vblendmpd (%rsp), %zmm0, %zmm0 {%k1} # 64-byte Folded Reload
 ; CHECK-NEXT:    addq $184, %rsp
@ -344,7 +345,7 @@ define <8 x double> @stack_fold_cmppd_mask(<8 x double> %a0, <8 x double> %a1, <
|
||||
%2 = load <8 x double>, <8 x double>* %a2
|
||||
%3 = fadd <8 x double> %a1, %2
|
||||
%4 = bitcast i8 %mask to <8 x i1>
|
||||
%5 = call <8 x i1> @llvm.x86.avx512.cmp.pd.512(<8 x double> %3, <8 x double> %a0, i32 0, i32 4)
|
||||
%5 = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %3, <8 x double> %a0, i32 0, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 4)
|
||||
%6 = and <8 x i1> %4, %5
|
||||
%7 = select <8 x i1> %6, <8 x double> %b0, <8 x double> %b1
|
||||
ret <8 x double> %7
|
||||
@ -364,8 +365,9 @@ define <8 x double> @stack_fold_cmppd_mask_commuted(<8 x double> %a0, <8 x doubl
|
||||
; CHECK-NEXT: #NO_APP
|
||||
; CHECK-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
|
||||
; CHECK-NEXT: vaddpd (%rdi), %zmm0, %zmm0
|
||||
; CHECK-NEXT: vcmpeqpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k0 # 64-byte Folded Reload
|
||||
; CHECK-NEXT: kmovw %esi, %k1
|
||||
; CHECK-NEXT: vcmpeqpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k1 {%k1} # 64-byte Folded Reload
|
||||
; CHECK-NEXT: kandb %k0, %k1, %k1
|
||||
; CHECK-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
|
||||
; CHECK-NEXT: vblendmpd (%rsp), %zmm0, %zmm0 {%k1} # 64-byte Folded Reload
|
||||
; CHECK-NEXT: addq $184, %rsp
|
||||
@ -376,7 +378,7 @@ define <8 x double> @stack_fold_cmppd_mask_commuted(<8 x double> %a0, <8 x doubl
|
||||
%2 = load <8 x double>, <8 x double>* %a2
|
||||
%3 = fadd <8 x double> %a1, %2
|
||||
%4 = bitcast i8 %mask to <8 x i1>
|
||||
%5 = call <8 x i1> @llvm.x86.avx512.cmp.pd.512(<8 x double> %a0, <8 x double> %3, i32 0, i32 4)
|
||||
%5 = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %a0, <8 x double> %3, i32 0, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 4)
|
||||
%6 = and <8 x i1> %4, %5
|
||||
%7 = select <8 x i1> %6, <8 x double> %b0, <8 x double> %b1
|
||||
ret <8 x double> %7
|
||||
@ -395,11 +397,11 @@ define i16 @stack_fold_cmpps(<16 x float> %a0, <16 x float> %a1) {
|
||||
; CHECK-NEXT: vzeroupper
|
||||
; CHECK-NEXT: retq
|
||||
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
|
||||
%res = call <16 x i1> @llvm.x86.avx512.cmp.ps.512(<16 x float> %a0, <16 x float> %a1, i32 0, i32 4)
|
||||
%res = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %a0, <16 x float> %a1, i32 0, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 4)
|
||||
%2 = bitcast <16 x i1> %res to i16
|
||||
ret i16 %2
|
||||
}
|
||||
declare <16 x i1> @llvm.x86.avx512.cmp.ps.512(<16 x float>, <16 x float>, i32, i32)
|
||||
declare <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float>, <16 x float>, i32, <16 x i1>, i32)
|
||||
|
||||
define <16 x float> @stack_fold_cmpps_mask(<16 x float> %a0, <16 x float> %a1, <16 x float>* %a2, i16 %mask, <16 x float> %b0, <16 x float> %b1) {
|
||||
; CHECK-LABEL: stack_fold_cmpps_mask:
|
||||
@ -415,8 +417,9 @@ define <16 x float> @stack_fold_cmpps_mask(<16 x float> %a0, <16 x float> %a1, <
|
||||
; CHECK-NEXT: #NO_APP
|
||||
; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
|
||||
; CHECK-NEXT: vaddps (%rdi), %zmm0, %zmm0
|
||||
; CHECK-NEXT: vcmpeqps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k0 # 64-byte Folded Reload
|
||||
; CHECK-NEXT: kmovw %esi, %k1
|
||||
; CHECK-NEXT: vcmpeqps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k1 {%k1} # 64-byte Folded Reload
|
||||
; CHECK-NEXT: kandw %k0, %k1, %k1
|
||||
; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
|
||||
; CHECK-NEXT: vblendmps (%rsp), %zmm0, %zmm0 {%k1} # 64-byte Folded Reload
|
||||
; CHECK-NEXT: addq $184, %rsp
|
||||
@ -427,7 +430,7 @@ define <16 x float> @stack_fold_cmpps_mask(<16 x float> %a0, <16 x float> %a1, <
|
||||
%2 = load <16 x float>, <16 x float>* %a2
|
||||
%3 = fadd <16 x float> %a1, %2
|
||||
%4 = bitcast i16 %mask to <16 x i1>
|
||||
%5 = call <16 x i1> @llvm.x86.avx512.cmp.ps.512(<16 x float> %3, <16 x float> %a0, i32 0, i32 4)
|
||||
%5 = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %3, <16 x float> %a0, i32 0, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 4)
|
||||
%6 = and <16 x i1> %4, %5
|
||||
%7 = select <16 x i1> %6, <16 x float> %b0, <16 x float> %b1
|
||||
ret <16 x float> %7
|
||||
@ -447,8 +450,9 @@ define <16 x float> @stack_fold_cmpps_mask_commuted(<16 x float> %a0, <16 x floa
|
||||
; CHECK-NEXT: #NO_APP
|
||||
; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
|
||||
; CHECK-NEXT: vaddps (%rdi), %zmm0, %zmm0
|
||||
; CHECK-NEXT: vcmpeqps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k0 # 64-byte Folded Reload
|
||||
; CHECK-NEXT: kmovw %esi, %k1
|
||||
; CHECK-NEXT: vcmpeqps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k1 {%k1} # 64-byte Folded Reload
|
||||
; CHECK-NEXT: kandw %k0, %k1, %k1
|
||||
; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
|
||||
; CHECK-NEXT: vblendmps (%rsp), %zmm0, %zmm0 {%k1} # 64-byte Folded Reload
|
||||
; CHECK-NEXT: addq $184, %rsp
|
||||
@ -459,7 +463,7 @@ define <16 x float> @stack_fold_cmpps_mask_commuted(<16 x float> %a0, <16 x floa
|
||||
%2 = load <16 x float>, <16 x float>* %a2
|
||||
%3 = fadd <16 x float> %a1, %2
|
||||
%4 = bitcast i16 %mask to <16 x i1>
|
||||
%5 = call <16 x i1> @llvm.x86.avx512.cmp.ps.512(<16 x float> %a0, <16 x float> %3, i32 0, i32 4)
|
||||
%5 = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %a0, <16 x float> %3, i32 0, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 4)
|
||||
%6 = and <16 x i1> %4, %5
|
||||
%7 = select <16 x i1> %6, <16 x float> %b0, <16 x float> %b1
|
||||
ret <16 x float> %7
|
||||
|
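Only the 512-bit forms carry that trailing rounding operand; as the declarations below show, the upgraded 128- and 256-bit forms end with just the predicate immediate plus the mask vector. A sketch with hypothetical operands:

%k = call <4 x i1> @llvm.x86.avx512.mask.cmp.pd.256(<4 x double> %x, <4 x double> %y, i32 0, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)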
@ -249,12 +249,12 @@ define i8 @stack_fold_cmppd(<2 x double> %a0, <2 x double> %a1) {
; CHECK-NEXT: # kill: def $al killed $al killed $eax
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%res = call <2 x i1> @llvm.x86.avx512.cmp.pd.128(<2 x double> %a0, <2 x double> %a1, i32 0)
%res = call <2 x i1> @llvm.x86.avx512.mask.cmp.pd.128(<2 x double> %a0, <2 x double> %a1, i32 0, <2 x i1> <i1 true, i1 true>)
%2 = shufflevector <2 x i1> %res, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
%3 = bitcast <8 x i1> %2 to i8
ret i8 %3
}
declare <2 x i1> @llvm.x86.avx512.cmp.pd.128(<2 x double>, <2 x double>, i32)
declare <2 x i1> @llvm.x86.avx512.mask.cmp.pd.128(<2 x double>, <2 x double>, i32, <2 x i1>)

define i8 @stack_fold_cmppd_ymm(<4 x double> %a0, <4 x double> %a1) {
; CHECK-LABEL: stack_fold_cmppd_ymm:
@ -269,12 +269,12 @@ define i8 @stack_fold_cmppd_ymm(<4 x double> %a0, <4 x double> %a1) {
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%res = call <4 x i1> @llvm.x86.avx512.cmp.pd.256(<4 x double> %a0, <4 x double> %a1, i32 0)
%res = call <4 x i1> @llvm.x86.avx512.mask.cmp.pd.256(<4 x double> %a0, <4 x double> %a1, i32 0, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
%2 = shufflevector <4 x i1> %res, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%3 = bitcast <8 x i1> %2 to i8
ret i8 %3
}
declare <4 x i1> @llvm.x86.avx512.cmp.pd.256(<4 x double>, <4 x double>, i32)
declare <4 x i1> @llvm.x86.avx512.mask.cmp.pd.256(<4 x double>, <4 x double>, i32, <4 x i1>)

define i8 @stack_fold_cmpps(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: stack_fold_cmpps:
@ -288,12 +288,12 @@ define i8 @stack_fold_cmpps(<4 x float> %a0, <4 x float> %a1) {
; CHECK-NEXT: # kill: def $al killed $al killed $eax
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%res = call <4 x i1> @llvm.x86.avx512.cmp.ps.128(<4 x float> %a0, <4 x float> %a1, i32 0)
%res = call <4 x i1> @llvm.x86.avx512.mask.cmp.ps.128(<4 x float> %a0, <4 x float> %a1, i32 0, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
%2 = shufflevector <4 x i1> %res, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%3 = bitcast <8 x i1> %2 to i8
ret i8 %3
}
declare <4 x i1> @llvm.x86.avx512.cmp.ps.128(<4 x float>, <4 x float>, i32)
declare <4 x i1> @llvm.x86.avx512.mask.cmp.ps.128(<4 x float>, <4 x float>, i32, <4 x i1>)

define i8 @stack_fold_cmpps_ymm(<8 x float> %a0, <8 x float> %a1) {
; CHECK-LABEL: stack_fold_cmpps_ymm:
@ -308,11 +308,11 @@ define i8 @stack_fold_cmpps_ymm(<8 x float> %a0, <8 x float> %a1) {
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%res = call <8 x i1> @llvm.x86.avx512.cmp.ps.256(<8 x float> %a0, <8 x float> %a1, i32 0)
%res = call <8 x i1> @llvm.x86.avx512.mask.cmp.ps.256(<8 x float> %a0, <8 x float> %a1, i32 0, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
%2 = bitcast <8 x i1> %res to i8
ret i8 %2
}
declare <8 x i1> @llvm.x86.avx512.cmp.ps.256(<8 x float>, <8 x float>, i32)
declare <8 x i1> @llvm.x86.avx512.mask.cmp.ps.256(<8 x float>, <8 x float>, i32, <8 x i1>)

define <2 x double> @stack_fold_divpd(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: stack_fold_divpd:

@ -881,159 +881,3 @@ define i64 @shuf64i1_zero(i64 %a) {
%d = bitcast <64 x i1> %c to i64
ret i64 %d
}

; OR(KSHIFTL(X,8),Y) -> KUNPCKBW
define void @PR32547(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x float> %d, float* %p) {
; AVX512F-LABEL: PR32547:
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: # kill: def $ymm3 killed $ymm3 def $zmm3
; AVX512F-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2
; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: vcmpltps %zmm1, %zmm0, %k0
; AVX512F-NEXT: vcmpltps %zmm3, %zmm2, %k1
; AVX512F-NEXT: kunpckbw %k1, %k0, %k1
; AVX512F-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX512F-NEXT: vmovaps %zmm0, (%rdi) {%k1}
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: PR32547:
; AVX512VL: # %bb.0: # %entry
; AVX512VL-NEXT: vcmpltps %ymm1, %ymm0, %k0
; AVX512VL-NEXT: vcmpltps %ymm3, %ymm2, %k1
; AVX512VL-NEXT: kunpckbw %k1, %k0, %k1
; AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX512VL-NEXT: vmovaps %zmm0, (%rdi) {%k1}
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; VL_BW_DQ-LABEL: PR32547:
; VL_BW_DQ: # %bb.0: # %entry
; VL_BW_DQ-NEXT: vcmpltps %ymm1, %ymm0, %k0
; VL_BW_DQ-NEXT: vcmpltps %ymm3, %ymm2, %k1
; VL_BW_DQ-NEXT: kunpckbw %k1, %k0, %k1
; VL_BW_DQ-NEXT: vxorps %xmm0, %xmm0, %xmm0
; VL_BW_DQ-NEXT: vmovaps %zmm0, (%rdi) {%k1}
; VL_BW_DQ-NEXT: vzeroupper
; VL_BW_DQ-NEXT: retq
entry:
%0 = tail call i8 @llvm.x86.avx512.mask.cmp.ps.256(<8 x float> %a, <8 x float> %b, i32 1, i8 -1)
%1 = tail call i8 @llvm.x86.avx512.mask.cmp.ps.256(<8 x float> %c, <8 x float> %d, i32 1, i8 -1)
%conv.i = zext i8 %0 to i16
%conv.i18 = zext i8 %1 to i16
%shl = shl nuw i16 %conv.i, 8
%or = or i16 %shl, %conv.i18
%2 = bitcast float* %p to <16 x float>*
%3 = bitcast i16 %or to <16 x i1>
tail call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> zeroinitializer, <16 x float>* %2, i32 64, <16 x i1> %3) #4
ret void
}

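kunpckbw concatenates the low bytes of two mask registers, so a 16-bit mask assembled from two 8-bit compare results by a shift-and-or is matched directly, as the CHECK lines above expect. A sketch of the IR pattern being combined (%lo and %hi are hypothetical i8 compare masks):

%hi16 = zext i8 %hi to i16
%lo16 = zext i8 %lo to i16
%shl = shl nuw i16 %hi16, 8
%cat = or i16 %shl, %lo16 ; selected as a single kunpckbw of the two k-registers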
; OR(X, KSHIFTL(Y,8)) -> KUNPCKBW
define void @PR32547_swap(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x float> %d, float* %p) {
; AVX512F-LABEL: PR32547_swap:
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: # kill: def $ymm3 killed $ymm3 def $zmm3
; AVX512F-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2
; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: vcmpltps %zmm1, %zmm0, %k0
; AVX512F-NEXT: vcmpltps %zmm3, %zmm2, %k1
; AVX512F-NEXT: kunpckbw %k1, %k0, %k1
; AVX512F-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX512F-NEXT: vmovaps %zmm0, (%rdi) {%k1}
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: PR32547_swap:
; AVX512VL: # %bb.0: # %entry
; AVX512VL-NEXT: vcmpltps %ymm1, %ymm0, %k0
; AVX512VL-NEXT: vcmpltps %ymm3, %ymm2, %k1
; AVX512VL-NEXT: kunpckbw %k1, %k0, %k1
; AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX512VL-NEXT: vmovaps %zmm0, (%rdi) {%k1}
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; VL_BW_DQ-LABEL: PR32547_swap:
; VL_BW_DQ: # %bb.0: # %entry
; VL_BW_DQ-NEXT: vcmpltps %ymm1, %ymm0, %k0
; VL_BW_DQ-NEXT: vcmpltps %ymm3, %ymm2, %k1
; VL_BW_DQ-NEXT: kunpckbw %k1, %k0, %k1
; VL_BW_DQ-NEXT: vxorps %xmm0, %xmm0, %xmm0
; VL_BW_DQ-NEXT: vmovaps %zmm0, (%rdi) {%k1}
; VL_BW_DQ-NEXT: vzeroupper
; VL_BW_DQ-NEXT: retq
entry:
%0 = tail call i8 @llvm.x86.avx512.mask.cmp.ps.256(<8 x float> %a, <8 x float> %b, i32 1, i8 -1)
%1 = tail call i8 @llvm.x86.avx512.mask.cmp.ps.256(<8 x float> %c, <8 x float> %d, i32 1, i8 -1)
%conv.i = zext i8 %0 to i16
%conv.i18 = zext i8 %1 to i16
%shl = shl nuw i16 %conv.i, 8
%or = or i16 %conv.i18, %shl
%2 = bitcast float* %p to <16 x float>*
%3 = bitcast i16 %or to <16 x i1>
tail call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> zeroinitializer, <16 x float>* %2, i32 64, <16 x i1> %3) #4
ret void
}

define void @mask_cmp_128(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, float* %p) {
; AVX512F-LABEL: mask_cmp_128:
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: # kill: def $xmm3 killed $xmm3 def $zmm3
; AVX512F-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT: vcmpltps %zmm1, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: vcmpltps %zmm3, %zmm2, %k0
; AVX512F-NEXT: kshiftlw $12, %k0, %k0
; AVX512F-NEXT: kshiftrw $12, %k0, %k0
; AVX512F-NEXT: shlb $4, %al
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: korw %k1, %k0, %k0
; AVX512F-NEXT: kshiftlw $8, %k0, %k0
; AVX512F-NEXT: kshiftrw $8, %k0, %k1
; AVX512F-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX512F-NEXT: vmovaps %zmm0, (%rdi) {%k1}
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: mask_cmp_128:
; AVX512VL: # %bb.0: # %entry
; AVX512VL-NEXT: vcmpltps %xmm1, %xmm0, %k0
; AVX512VL-NEXT: kmovw %k0, %eax
; AVX512VL-NEXT: vcmpltps %xmm3, %xmm2, %k0
; AVX512VL-NEXT: shlb $4, %al
; AVX512VL-NEXT: kmovw %eax, %k1
; AVX512VL-NEXT: korw %k1, %k0, %k1
; AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX512VL-NEXT: vmovaps %ymm0, (%rdi) {%k1}
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; VL_BW_DQ-LABEL: mask_cmp_128:
; VL_BW_DQ: # %bb.0: # %entry
; VL_BW_DQ-NEXT: vcmpltps %xmm1, %xmm0, %k0
; VL_BW_DQ-NEXT: vcmpltps %xmm3, %xmm2, %k1
; VL_BW_DQ-NEXT: kshiftlb $4, %k0, %k0
; VL_BW_DQ-NEXT: korb %k0, %k1, %k1
; VL_BW_DQ-NEXT: vxorps %xmm0, %xmm0, %xmm0
; VL_BW_DQ-NEXT: vmovaps %ymm0, (%rdi) {%k1}
; VL_BW_DQ-NEXT: vzeroupper
; VL_BW_DQ-NEXT: retq
entry:
%0 = tail call i8 @llvm.x86.avx512.mask.cmp.ps.128(<4 x float> %a, <4 x float> %b, i32 1, i8 -1)
%1 = tail call i8 @llvm.x86.avx512.mask.cmp.ps.128(<4 x float> %c, <4 x float> %d, i32 1, i8 -1)
%shl = shl nuw i8 %0, 4
%or = or i8 %1, %shl
%2 = bitcast float* %p to <8 x float>*
%3 = bitcast i8 %or to <8 x i1>
tail call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> zeroinitializer, <8 x float>* %2, i32 64, <8 x i1> %3)
ret void
}
declare i8 @llvm.x86.avx512.mask.cmp.ps.128(<4 x float>, <4 x float>, i32, i8)
declare i8 @llvm.x86.avx512.mask.cmp.ps.256(<8 x float>, <8 x float>, i32, i8)
declare void @llvm.masked.store.v8f32.p0v8f32(<8 x float>, <8 x float>*, i32, <8 x i1>)
declare void @llvm.masked.store.v16f32.p0v16f32(<16 x float>, <16 x float>*, i32, <16 x i1>)

@ -1,210 +0,0 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -instcombine -mtriple=x86_64-unknown-unknown -S | FileCheck %s

; The test checks the folding of cmp(sub(a,b),0) into cmp(a,b).

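The fold is only legal when the subtraction cannot produce infinities: inf - inf is NaN, and a NaN input changes the result of ordered and unordered predicates, so instcombine performs it only when the fsub carries the ninf flag; the _safe variant below, whose fsub has no fast-math flags, is expected to stay unfolded. In sketch form (hypothetical values):

%d = fsub ninf <2 x double> %a, %b
%k = call <2 x i1> @llvm.x86.avx512.cmp.pd.128(<2 x double> %d, <2 x double> zeroinitializer, i32 5)
; folds to:
%k = call <2 x i1> @llvm.x86.avx512.cmp.pd.128(<2 x double> %a, <2 x double> %b, i32 5)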
define i8 @sub_compare_foldingPD128_safe(<2 x double> %a, <2 x double> %b){
; CHECK-LABEL: @sub_compare_foldingPD128_safe(
; CHECK-NEXT: [[SUB_SAFE:%.*]] = fsub <2 x double> [[A:%.*]], [[B:%.*]]
; CHECK-NEXT: [[T0:%.*]] = call <2 x i1> @llvm.x86.avx512.cmp.pd.128(<2 x double> [[SUB_SAFE]], <2 x double> zeroinitializer, i32 5)
; CHECK-NEXT: [[T1:%.*]] = shufflevector <2 x i1> [[T0]], <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
; CHECK-NEXT: [[T2:%.*]] = bitcast <8 x i1> [[T1]] to i8
; CHECK-NEXT: ret i8 [[T2]]
;
%sub.safe = fsub <2 x double> %a, %b
%t0 = call <2 x i1> @llvm.x86.avx512.cmp.pd.128(<2 x double> %sub.safe, <2 x double> zeroinitializer, i32 5)
%t1 = shufflevector <2 x i1> %t0, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
%t2 = bitcast <8 x i1> %t1 to i8
ret i8 %t2
}

define i8 @sub_compare_foldingPD128(<2 x double> %a, <2 x double> %b){
; CHECK-LABEL: @sub_compare_foldingPD128(
; CHECK-NEXT: [[T0:%.*]] = call <2 x i1> @llvm.x86.avx512.cmp.pd.128(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], i32 5)
; CHECK-NEXT: [[T1:%.*]] = shufflevector <2 x i1> [[T0]], <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
; CHECK-NEXT: [[T2:%.*]] = bitcast <8 x i1> [[T1]] to i8
; CHECK-NEXT: ret i8 [[T2]]
;
%sub.i = fsub ninf <2 x double> %a, %b
%t0 = call <2 x i1> @llvm.x86.avx512.cmp.pd.128(<2 x double> %sub.i, <2 x double> zeroinitializer, i32 5)
%t1 = shufflevector <2 x i1> %t0, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
%t2 = bitcast <8 x i1> %t1 to i8
ret i8 %t2
}

define i8 @sub_compare_foldingPD128_undef_elt(<2 x double> %a, <2 x double> %b){
; CHECK-LABEL: @sub_compare_foldingPD128_undef_elt(
; CHECK-NEXT: [[T0:%.*]] = call <2 x i1> @llvm.x86.avx512.cmp.pd.128(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], i32 5)
; CHECK-NEXT: [[T1:%.*]] = shufflevector <2 x i1> [[T0]], <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
; CHECK-NEXT: [[T2:%.*]] = bitcast <8 x i1> [[T1]] to i8
; CHECK-NEXT: ret i8 [[T2]]
;
%sub.i = fsub ninf <2 x double> %a, %b
%t0 = call <2 x i1> @llvm.x86.avx512.cmp.pd.128(<2 x double> %sub.i, <2 x double> <double 0.0, double undef>, i32 5)
%t1 = shufflevector <2 x i1> %t0, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
%t2 = bitcast <8 x i1> %t1 to i8
ret i8 %t2
}

define i8 @sub_compare_foldingPD256(<4 x double> %a, <4 x double> %b){
; CHECK-LABEL: @sub_compare_foldingPD256(
; CHECK-NEXT: [[T0:%.*]] = call <4 x i1> @llvm.x86.avx512.cmp.pd.256(<4 x double> [[A:%.*]], <4 x double> [[B:%.*]], i32 5)
; CHECK-NEXT: [[T1:%.*]] = shufflevector <4 x i1> [[T0]], <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: [[T2:%.*]] = bitcast <8 x i1> [[T1]] to i8
; CHECK-NEXT: ret i8 [[T2]]
;
%sub.i1 = fsub ninf <4 x double> %a, %b
%t0 = call <4 x i1> @llvm.x86.avx512.cmp.pd.256(<4 x double> %sub.i1, <4 x double> zeroinitializer, i32 5)
%t1 = shufflevector <4 x i1> %t0, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%t2 = bitcast <8 x i1> %t1 to i8
ret i8 %t2
}

define i8 @sub_compare_foldingPD512(<8 x double> %a, <8 x double> %b){
; CHECK-LABEL: @sub_compare_foldingPD512(
; CHECK-NEXT: [[T0:%.*]] = call <8 x i1> @llvm.x86.avx512.cmp.pd.512(<8 x double> [[A:%.*]], <8 x double> [[B:%.*]], i32 11, i32 4)
; CHECK-NEXT: [[T1:%.*]] = bitcast <8 x i1> [[T0]] to i8
; CHECK-NEXT: ret i8 [[T1]]
;
%sub.i2 = fsub ninf <8 x double> %a, %b
%t0 = call <8 x i1> @llvm.x86.avx512.cmp.pd.512(<8 x double> %sub.i2, <8 x double> zeroinitializer, i32 11, i32 4)
%t1 = bitcast <8 x i1> %t0 to i8
ret i8 %t1
}

define i8 @sub_compare_foldingPS128(<4 x float> %a, <4 x float> %b){
; CHECK-LABEL: @sub_compare_foldingPS128(
; CHECK-NEXT: [[T0:%.*]] = call <4 x i1> @llvm.x86.avx512.cmp.ps.128(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], i32 12)
; CHECK-NEXT: [[T1:%.*]] = shufflevector <4 x i1> [[T0]], <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: [[T2:%.*]] = bitcast <8 x i1> [[T1]] to i8
; CHECK-NEXT: ret i8 [[T2]]
;
%sub.i3 = fsub ninf <4 x float> %a, %b
%t0 = call <4 x i1> @llvm.x86.avx512.cmp.ps.128(<4 x float> %sub.i3, <4 x float> zeroinitializer, i32 12)
%t1 = shufflevector <4 x i1> %t0, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%t2 = bitcast <8 x i1> %t1 to i8
ret i8 %t2
}

define i8 @sub_compare_foldingPS256(<8 x float> %a, <8 x float> %b){
; CHECK-LABEL: @sub_compare_foldingPS256(
; CHECK-NEXT: [[T0:%.*]] = call <8 x i1> @llvm.x86.avx512.cmp.ps.256(<8 x float> [[A:%.*]], <8 x float> [[B:%.*]], i32 5)
; CHECK-NEXT: [[T1:%.*]] = bitcast <8 x i1> [[T0]] to i8
; CHECK-NEXT: ret i8 [[T1]]
;
%sub.i4 = fsub ninf <8 x float> %a, %b
%t0 = call <8 x i1> @llvm.x86.avx512.cmp.ps.256(<8 x float> %sub.i4, <8 x float> zeroinitializer, i32 5)
%t1 = bitcast <8 x i1> %t0 to i8
ret i8 %t1
}

define i16 @sub_compare_foldingPS512(<16 x float> %a, <16 x float> %b){
; CHECK-LABEL: @sub_compare_foldingPS512(
; CHECK-NEXT: [[T0:%.*]] = call <16 x i1> @llvm.x86.avx512.cmp.ps.512(<16 x float> [[A:%.*]], <16 x float> [[B:%.*]], i32 11, i32 4)
; CHECK-NEXT: [[T1:%.*]] = bitcast <16 x i1> [[T0]] to i16
; CHECK-NEXT: ret i16 [[T1]]
;
%sub.i5 = fsub ninf <16 x float> %a, %b
%t0 = call <16 x i1> @llvm.x86.avx512.cmp.ps.512(<16 x float> %sub.i5, <16 x float> zeroinitializer, i32 11, i32 4)
%t1 = bitcast <16 x i1> %t0 to i16
ret i16 %t1
}

define i8 @sub_compare_folding_swapPD128(<2 x double> %a, <2 x double> %b){
; CHECK-LABEL: @sub_compare_folding_swapPD128(
; CHECK-NEXT: [[T0:%.*]] = call <2 x i1> @llvm.x86.avx512.cmp.pd.128(<2 x double> [[B:%.*]], <2 x double> [[A:%.*]], i32 5)
; CHECK-NEXT: [[T1:%.*]] = shufflevector <2 x i1> [[T0]], <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
; CHECK-NEXT: [[T2:%.*]] = bitcast <8 x i1> [[T1]] to i8
; CHECK-NEXT: ret i8 [[T2]]
;
%sub.i = fsub ninf <2 x double> %a, %b
%t0 = call <2 x i1> @llvm.x86.avx512.cmp.pd.128(<2 x double> zeroinitializer, <2 x double> %sub.i, i32 5)
%t1 = shufflevector <2 x i1> %t0, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
%t2 = bitcast <8 x i1> %t1 to i8
ret i8 %t2
}

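When the subtraction sits in the second operand, the same fold applies with the operands swapped rather than the predicate rewritten: cmp(0, a - b) becomes cmp(b, a), which is what the swapped [[B]]/[[A]] captures above check. A sketch with hypothetical values:

%d = fsub ninf <2 x double> %a, %b
%k = call <2 x i1> @llvm.x86.avx512.cmp.pd.128(<2 x double> zeroinitializer, <2 x double> %d, i32 5)
; folds to:
%k = call <2 x i1> @llvm.x86.avx512.cmp.pd.128(<2 x double> %b, <2 x double> %a, i32 5)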
define i8 @sub_compare_folding_swapPD256(<4 x double> %a, <4 x double> %b){
; CHECK-LABEL: @sub_compare_folding_swapPD256(
; CHECK-NEXT: [[T0:%.*]] = call <4 x i1> @llvm.x86.avx512.cmp.pd.256(<4 x double> [[B:%.*]], <4 x double> [[A:%.*]], i32 5)
; CHECK-NEXT: [[T1:%.*]] = shufflevector <4 x i1> [[T0]], <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: [[T2:%.*]] = bitcast <8 x i1> [[T1]] to i8
; CHECK-NEXT: ret i8 [[T2]]
;
%sub.i = fsub ninf <4 x double> %a, %b
%t0 = call <4 x i1> @llvm.x86.avx512.cmp.pd.256(<4 x double> zeroinitializer, <4 x double> %sub.i, i32 5)
%t1 = shufflevector <4 x i1> %t0, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%t2 = bitcast <8 x i1> %t1 to i8
ret i8 %t2
}

define i8 @sub_compare_folding_swapPD256_undef(<4 x double> %a, <4 x double> %b) {
; CHECK-LABEL: @sub_compare_folding_swapPD256_undef(
; CHECK-NEXT: [[TMP:%.*]] = call <4 x i1> @llvm.x86.avx512.cmp.pd.256(<4 x double> undef, <4 x double> zeroinitializer, i32 5)
; CHECK-NEXT: [[T0:%.*]] = shufflevector <4 x i1> [[TMP]], <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: [[T1:%.*]] = bitcast <8 x i1> [[T0]] to i8
; CHECK-NEXT: ret i8 [[T1]]
;
%sub.i1 = fsub ninf <4 x double> undef, undef
%tmp = call <4 x i1> @llvm.x86.avx512.cmp.pd.256(<4 x double> %sub.i1, <4 x double> zeroinitializer, i32 5)
%t0 = shufflevector <4 x i1> %tmp, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%t1 = bitcast <8 x i1> %t0 to i8
ret i8 %t1
}

define i8 @sub_compare_folding_swapPD512(<8 x double> %a, <8 x double> %b){
; CHECK-LABEL: @sub_compare_folding_swapPD512(
; CHECK-NEXT: [[T0:%.*]] = call <8 x i1> @llvm.x86.avx512.cmp.pd.512(<8 x double> [[B:%.*]], <8 x double> [[A:%.*]], i32 11, i32 4)
; CHECK-NEXT: [[T1:%.*]] = bitcast <8 x i1> [[T0]] to i8
; CHECK-NEXT: ret i8 [[T1]]
;
%sub.i = fsub ninf <8 x double> %a, %b
%t0 = call <8 x i1> @llvm.x86.avx512.cmp.pd.512(<8 x double> zeroinitializer, <8 x double> %sub.i, i32 11, i32 4)
%t1 = bitcast <8 x i1> %t0 to i8
ret i8 %t1
}

define i8 @sub_compare_folding_swapPS128(<4 x float> %a, <4 x float> %b){
; CHECK-LABEL: @sub_compare_folding_swapPS128(
; CHECK-NEXT: [[T0:%.*]] = call <4 x i1> @llvm.x86.avx512.cmp.ps.128(<4 x float> [[B:%.*]], <4 x float> [[A:%.*]], i32 12)
; CHECK-NEXT: [[T1:%.*]] = shufflevector <4 x i1> [[T0]], <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: [[T2:%.*]] = bitcast <8 x i1> [[T1]] to i8
; CHECK-NEXT: ret i8 [[T2]]
;
%sub.i = fsub ninf <4 x float> %a, %b
%t0 = call <4 x i1> @llvm.x86.avx512.cmp.ps.128(<4 x float> zeroinitializer, <4 x float> %sub.i, i32 12)
%t1 = shufflevector <4 x i1> %t0, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%t2 = bitcast <8 x i1> %t1 to i8
ret i8 %t2
}

define i8 @sub_compare_folding_swapPS256(<8 x float> %a, <8 x float> %b){
; CHECK-LABEL: @sub_compare_folding_swapPS256(
; CHECK-NEXT: [[T0:%.*]] = call <8 x i1> @llvm.x86.avx512.cmp.ps.256(<8 x float> [[B:%.*]], <8 x float> [[A:%.*]], i32 5)
; CHECK-NEXT: [[T1:%.*]] = bitcast <8 x i1> [[T0]] to i8
; CHECK-NEXT: ret i8 [[T1]]
;
%sub.i = fsub ninf <8 x float> %a, %b
%t0 = call <8 x i1> @llvm.x86.avx512.cmp.ps.256(<8 x float> zeroinitializer, <8 x float> %sub.i, i32 5)
%t1 = bitcast <8 x i1> %t0 to i8
ret i8 %t1
}

define i16 @sub_compare_folding_swapPS512(<16 x float> %a, <16 x float> %b){
; CHECK-LABEL: @sub_compare_folding_swapPS512(
; CHECK-NEXT: [[T0:%.*]] = call <16 x i1> @llvm.x86.avx512.cmp.ps.512(<16 x float> [[B:%.*]], <16 x float> [[A:%.*]], i32 11, i32 4)
; CHECK-NEXT: [[T1:%.*]] = bitcast <16 x i1> [[T0]] to i16
; CHECK-NEXT: ret i16 [[T1]]
;
%sub.i = fsub ninf <16 x float> %a, %b
%t0 = call <16 x i1> @llvm.x86.avx512.cmp.ps.512(<16 x float> zeroinitializer, <16 x float> %sub.i, i32 11, i32 4)
%t1 = bitcast <16 x i1> %t0 to i16
ret i16 %t1
}

declare <2 x i1> @llvm.x86.avx512.cmp.pd.128(<2 x double>, <2 x double>, i32)
declare <4 x i1> @llvm.x86.avx512.cmp.pd.256(<4 x double>, <4 x double>, i32)
declare <8 x i1> @llvm.x86.avx512.cmp.pd.512(<8 x double>, <8 x double>, i32, i32)
declare <4 x i1> @llvm.x86.avx512.cmp.ps.128(<4 x float>, <4 x float>, i32)
declare <8 x i1> @llvm.x86.avx512.cmp.ps.256(<8 x float>, <8 x float>, i32)
declare <16 x i1> @llvm.x86.avx512.cmp.ps.512(<16 x float>, <16 x float>, i32, i32)