1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-10-19 19:12:56 +02:00

revert r344082: [InstCombine] reverse 'trunc X to <N x i1>' canonicalization

This commit accidentally included the diffs from D53057.

llvm-svn: 344178
This commit is contained in:
Sanjay Patel 2018-10-10 20:39:39 +00:00
parent 5db2d20226
commit 8a4850aac2
9 changed files with 212 additions and 245 deletions

View File

@ -706,35 +706,12 @@ Instruction *InstCombiner::visitTrunc(TruncInst &CI) {
if (SimplifyDemandedInstructionBits(CI))
return &CI;
// Canonicalize trunc x to i1 -> (icmp ne (and x, 1), 0), likewise for vector.
if (DestTy->getScalarSizeInBits() == 1) {
Constant *One = ConstantInt::get(SrcTy, 1);
Src = Builder.CreateAnd(Src, One);
Value *Zero = Constant::getNullValue(Src->getType());
if (DestTy->isIntegerTy()) {
// Canonicalize trunc x to i1 -> icmp ne (and x, 1), 0 (scalar only).
// TODO: We canonicalize to more instructions here because we are probably
// lacking equivalent analysis for trunc relative to icmp. There may also
// be codegen concerns. If those trunc limitations were removed, we could
// remove this transform.
Value *And = Builder.CreateAnd(Src, ConstantInt::get(SrcTy, 1));
return new ICmpInst(ICmpInst::ICMP_NE, And, Zero);
}
// For vectors, we do not canonicalize all truncs to icmp, so optimize
// patterns that would be covered within visitICmpInst.
Value *X;
const APInt *C;
if (match(Src, m_OneUse(m_LShr(m_Value(X), m_APInt(C))))) {
// trunc (lshr X, C) to i1 --> icmp ne (and X, C'), 0
APInt MaskC = APInt(SrcTy->getScalarSizeInBits(), 1).shl(*C);
Value *And = Builder.CreateAnd(X, ConstantInt::get(SrcTy, MaskC));
return new ICmpInst(ICmpInst::ICMP_NE, And, Zero);
}
if (match(Src, m_OneUse(m_c_Or(m_LShr(m_Value(X), m_APInt(C)),
m_Deferred(X))))) {
// trunc (or (lshr X, C), X) to i1 --> icmp ne (and X, C'), 0
APInt MaskC = APInt(SrcTy->getScalarSizeInBits(), 1).shl(*C) | 1;
Value *And = Builder.CreateAnd(X, ConstantInt::get(SrcTy, MaskC));
return new ICmpInst(ICmpInst::ICMP_NE, And, Zero);
}
return new ICmpInst(ICmpInst::ICMP_NE, Src, Zero);
}
// FIXME: Maybe combine the next two transforms to handle the no cast case

View File

@ -1609,13 +1609,6 @@ Instruction *InstCombiner::foldICmpAndShift(ICmpInst &Cmp, BinaryOperator *And,
Instruction *InstCombiner::foldICmpAndConstConst(ICmpInst &Cmp,
BinaryOperator *And,
const APInt &C1) {
// For vectors: icmp ne (and X, 1), 0 --> trunc X to N x i1
// TODO: We canonicalize to the longer form for scalars because we have
// better analysis/folds for icmp, and codegen may be better with icmp.
if (Cmp.getPredicate() == CmpInst::ICMP_NE && Cmp.getType()->isVectorTy() &&
C1.isNullValue() && match(And->getOperand(1), m_One()))
return new TruncInst(And->getOperand(0), Cmp.getType());
const APInt *C2;
if (!match(And->getOperand(1), m_APInt(C2)))
return nullptr;

View File

@ -1477,33 +1477,6 @@ static Instruction *narrowVectorSelect(ShuffleVectorInst &Shuf,
return SelectInst::Create(NarrowCond, NarrowX, NarrowY);
}
/// Try to combine 2 shuffles into 1 shuffle by concatenating a shuffle mask.
static Instruction *foldIdentityExtractShuffle(ShuffleVectorInst &Shuf) {
Value *Op0 = Shuf.getOperand(0), *Op1 = Shuf.getOperand(1);
if (!Shuf.isIdentityWithExtract() || !isa<UndefValue>(Op1))
return nullptr;
Value *X, *Y;
Constant *Mask;
if (!match(Op0, m_ShuffleVector(m_Value(X), m_Value(Y), m_Constant(Mask))))
return nullptr;
// We are extracting a subvector from a shuffle. Remove excess elements from
// the 1st shuffle mask to eliminate the extract.
// shuf (shuf X, Y, <C0, C1, C2, C3>), undef, <0, undef, 2> -->
// shuf X, Y, <C0, undef, C2>
unsigned NumElts = Shuf.getType()->getVectorNumElements();
SmallVector<Constant *, 16> NewMask(NumElts);
for (unsigned i = 0; i != NumElts; ++i) {
// If the extracting shuffle has an undef mask element, it transfers to the
// new shuffle mask. Otherwise, copy the original mask element.
Constant *ExtractMaskElt = Shuf.getMask()->getAggregateElement(i);
Constant *MaskElt = Mask->getAggregateElement(i);
NewMask[i] = isa<UndefValue>(ExtractMaskElt) ? ExtractMaskElt : MaskElt;
}
return new ShuffleVectorInst(X, Y, ConstantVector::get(NewMask));
}
Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
Value *LHS = SVI.getOperand(0);
Value *RHS = SVI.getOperand(1);
@ -1526,9 +1499,6 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
return &SVI;
}
if (Instruction *I = foldIdentityExtractShuffle(SVI))
return I;
SmallVector<int, 16> Mask = SVI.getShuffleMask();
Type *Int32Ty = Type::getInt32Ty(SVI.getContext());
unsigned LHSWidth = LHS->getType()->getVectorNumElements();

View File

@ -319,8 +319,8 @@ define i1 @test16(i84 %X) {
define <2 x i1> @test16vec(<2 x i84> %X) {
; CHECK-LABEL: @test16vec(
; CHECK-NEXT: [[TMP1:%.*]] = and <2 x i84> [[X:%.*]], <i84 16, i84 16>
; CHECK-NEXT: [[CMP:%.*]] = icmp ne <2 x i84> [[TMP1]], zeroinitializer
; CHECK-NEXT: [[AND:%.*]] = and <2 x i84> %X, <i84 16, i84 16>
; CHECK-NEXT: [[CMP:%.*]] = icmp ne <2 x i84> [[AND]], zeroinitializer
; CHECK-NEXT: ret <2 x i1> [[CMP]]
;
%shr = ashr <2 x i84> %X, <i84 4, i84 4>

View File

@ -27,8 +27,9 @@ define i1 @test1(i799 %X, i799 %A) {
define <2 x i1> @test0vec(<2 x i39> %X, <2 x i39> %A) {
; CHECK-LABEL: @test0vec(
; CHECK-NEXT: [[B:%.*]] = lshr <2 x i39> [[X:%.*]], [[A:%.*]]
; CHECK-NEXT: [[D:%.*]] = trunc <2 x i39> [[B]] to <2 x i1>
; CHECK-NEXT: [[TMP1:%.*]] = shl <2 x i39> <i39 1, i39 1>, [[A:%.*]]
; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i39> [[TMP1]], [[X:%.*]]
; CHECK-NEXT: [[D:%.*]] = icmp ne <2 x i39> [[TMP2]], zeroinitializer
; CHECK-NEXT: ret <2 x i1> [[D]]
;
%B = lshr <2 x i39> %X, %A

View File

@ -2427,9 +2427,10 @@ define i1 @icmp_and_or_lshr(i32 %x, i32 %y) {
define <2 x i1> @icmp_and_or_lshr_vec(<2 x i32> %x, <2 x i32> %y) {
; CHECK-LABEL: @icmp_and_or_lshr_vec(
; CHECK-NEXT: [[SHF:%.*]] = lshr <2 x i32> [[X:%.*]], [[Y:%.*]]
; CHECK-NEXT: [[OR:%.*]] = or <2 x i32> [[SHF]], [[X]]
; CHECK-NEXT: [[RET:%.*]] = trunc <2 x i32> [[OR]] to <2 x i1>
; CHECK-NEXT: [[SHF1:%.*]] = shl nuw <2 x i32> <i32 1, i32 1>, [[Y:%.*]]
; CHECK-NEXT: [[OR2:%.*]] = or <2 x i32> [[SHF1]], <i32 1, i32 1>
; CHECK-NEXT: [[AND3:%.*]] = and <2 x i32> [[OR2]], [[X:%.*]]
; CHECK-NEXT: [[RET:%.*]] = icmp ne <2 x i32> [[AND3]], zeroinitializer
; CHECK-NEXT: ret <2 x i1> [[RET]]
;
%shf = lshr <2 x i32> %x, %y
@ -2444,7 +2445,8 @@ define <2 x i1> @icmp_and_or_lshr_vec_commute(<2 x i32> %xp, <2 x i32> %y) {
; CHECK-NEXT: [[X:%.*]] = srem <2 x i32> [[XP:%.*]], <i32 42, i32 42>
; CHECK-NEXT: [[SHF:%.*]] = lshr <2 x i32> [[X]], [[Y:%.*]]
; CHECK-NEXT: [[OR:%.*]] = or <2 x i32> [[X]], [[SHF]]
; CHECK-NEXT: [[RET:%.*]] = trunc <2 x i32> [[OR]] to <2 x i1>
; CHECK-NEXT: [[AND:%.*]] = and <2 x i32> [[OR]], <i32 1, i32 1>
; CHECK-NEXT: [[RET:%.*]] = icmp ne <2 x i32> [[AND]], zeroinitializer
; CHECK-NEXT: ret <2 x i1> [[RET]]
;
%x = srem <2 x i32> %xp, <i32 42, i32 -42> ; prevent complexity-based canonicalization
@ -2470,8 +2472,8 @@ define i1 @icmp_and_or_lshr_cst(i32 %x) {
define <2 x i1> @icmp_and_or_lshr_cst_vec(<2 x i32> %x) {
; CHECK-LABEL: @icmp_and_or_lshr_cst_vec(
; CHECK-NEXT: [[TMP1:%.*]] = and <2 x i32> [[X:%.*]], <i32 3, i32 3>
; CHECK-NEXT: [[RET:%.*]] = icmp ne <2 x i32> [[TMP1]], zeroinitializer
; CHECK-NEXT: [[AND1:%.*]] = and <2 x i32> [[X:%.*]], <i32 3, i32 3>
; CHECK-NEXT: [[RET:%.*]] = icmp ne <2 x i32> [[AND1]], zeroinitializer
; CHECK-NEXT: ret <2 x i1> [[RET]]
;
%shf = lshr <2 x i32> %x, <i32 1, i32 1>
@ -2484,8 +2486,10 @@ define <2 x i1> @icmp_and_or_lshr_cst_vec(<2 x i32> %x) {
define <2 x i1> @icmp_and_or_lshr_cst_vec_commute(<2 x i32> %xp) {
; CHECK-LABEL: @icmp_and_or_lshr_cst_vec_commute(
; CHECK-NEXT: [[X:%.*]] = srem <2 x i32> [[XP:%.*]], <i32 42, i32 42>
; CHECK-NEXT: [[TMP1:%.*]] = and <2 x i32> [[X]], <i32 3, i32 3>
; CHECK-NEXT: [[RET:%.*]] = icmp ne <2 x i32> [[TMP1]], zeroinitializer
; CHECK-NEXT: [[SHF:%.*]] = lshr <2 x i32> [[X]], <i32 1, i32 1>
; CHECK-NEXT: [[OR:%.*]] = or <2 x i32> [[X]], [[SHF]]
; CHECK-NEXT: [[AND:%.*]] = and <2 x i32> [[OR]], <i32 1, i32 1>
; CHECK-NEXT: [[RET:%.*]] = icmp ne <2 x i32> [[AND]], zeroinitializer
; CHECK-NEXT: ret <2 x i1> [[RET]]
;
%x = srem <2 x i32> %xp, <i32 42, i32 -42> ; prevent complexity-based canonicalization

View File

@ -174,7 +174,8 @@ define <8 x i8> @test12a(<8 x i8> %t6, <8 x i8> %t2) {
define <2 x i8> @extract_subvector_of_shuffle(<2 x i8> %x, <2 x i8> %y) {
; CHECK-LABEL: @extract_subvector_of_shuffle(
; CHECK-NEXT: [[EXTRACT_SUBV:%.*]] = shufflevector <2 x i8> [[X:%.*]], <2 x i8> [[Y:%.*]], <2 x i32> <i32 0, i32 2>
; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <2 x i8> [[X:%.*]], <2 x i8> [[Y:%.*]], <3 x i32> <i32 0, i32 2, i32 undef>
; CHECK-NEXT: [[EXTRACT_SUBV:%.*]] = shufflevector <3 x i8> [[SHUF]], <3 x i8> undef, <2 x i32> <i32 0, i32 1>
; CHECK-NEXT: ret <2 x i8> [[EXTRACT_SUBV]]
;
%shuf = shufflevector <2 x i8> %x, <2 x i8> %y, <3 x i32> <i32 0, i32 2, i32 0>
@ -193,7 +194,7 @@ define <4 x i8> @extract_subvector_of_shuffle_extra_use(<2 x i8> %x, <2 x i8> %y
; CHECK-LABEL: @extract_subvector_of_shuffle_extra_use(
; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <2 x i8> [[X:%.*]], <2 x i8> [[Y:%.*]], <5 x i32> <i32 undef, i32 2, i32 0, i32 1, i32 0>
; CHECK-NEXT: call void @use_v5i8(<5 x i8> [[SHUF]])
; CHECK-NEXT: [[EXTRACT_SUBV:%.*]] = shufflevector <2 x i8> [[X]], <2 x i8> [[Y]], <4 x i32> <i32 undef, i32 2, i32 0, i32 undef>
; CHECK-NEXT: [[EXTRACT_SUBV:%.*]] = shufflevector <5 x i8> [[SHUF]], <5 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
; CHECK-NEXT: ret <4 x i8> [[EXTRACT_SUBV]]
;
%shuf = shufflevector <2 x i8> %x, <2 x i8> %y, <5 x i32> <i32 undef, i32 2, i32 0, i32 1, i32 0>

View File

@ -1,22 +1,26 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -instcombine -S | FileCheck %s
; Can't get smaller than this.
; This turns into a&1 != 0
; TODO: The bar for canonicalizing to something bigger than the minimal length IR is very high.
; This pattern does not appear to meet that standard.
define <2 x i1> @trunc(<2 x i64> %a) {
; CHECK-LABEL: @trunc(
; CHECK-NEXT: [[T:%.*]] = trunc <2 x i64> [[A:%.*]] to <2 x i1>
; CHECK-NEXT: [[TMP1:%.*]] = and <2 x i64> [[A:%.*]], <i64 1, i64 1>
; CHECK-NEXT: [[T:%.*]] = icmp ne <2 x i64> [[TMP1]], zeroinitializer
; CHECK-NEXT: ret <2 x i1> [[T]]
;
%t = trunc <2 x i64> %a to <2 x i1>
ret <2 x i1> %t
}
; This is trunc.
; TODO: This could be just 1 instruction (trunc).
define <2 x i1> @and_cmp_is_trunc(<2 x i64> %a) {
; CHECK-LABEL: @and_cmp_is_trunc(
; CHECK-NEXT: [[R:%.*]] = trunc <2 x i64> [[A:%.*]] to <2 x i1>
; CHECK-NEXT: [[T:%.*]] = and <2 x i64> [[A:%.*]], <i64 1, i64 1>
; CHECK-NEXT: [[R:%.*]] = icmp ne <2 x i64> [[T]], zeroinitializer
; CHECK-NEXT: ret <2 x i1> [[R]]
;
%t = and <2 x i64> %a, <i64 1, i64 1>
@ -24,11 +28,12 @@ define <2 x i1> @and_cmp_is_trunc(<2 x i64> %a) {
ret <2 x i1> %r
}
; This is trunc.
; TODO: This could be just 1 instruction (trunc).
define <2 x i1> @and_cmp_is_trunc_even_with_undef_elt(<2 x i64> %a) {
; CHECK-LABEL: @and_cmp_is_trunc_even_with_undef_elt(
; CHECK-NEXT: [[R:%.*]] = trunc <2 x i64> [[A:%.*]] to <2 x i1>
; CHECK-NEXT: [[T:%.*]] = and <2 x i64> [[A:%.*]], <i64 undef, i64 1>
; CHECK-NEXT: [[R:%.*]] = icmp ne <2 x i64> [[T]], zeroinitializer
; CHECK-NEXT: ret <2 x i1> [[R]]
;
%t = and <2 x i64> %a, <i64 undef, i64 1>
@ -36,7 +41,7 @@ define <2 x i1> @and_cmp_is_trunc_even_with_undef_elt(<2 x i64> %a) {
ret <2 x i1> %r
}
; TODO: This could be just 1 instruction (trunc), but our undef matching is incomplete.
; TODO: This could be just 1 instruction (trunc).
define <2 x i1> @and_cmp_is_trunc_even_with_undef_elts(<2 x i64> %a) {
; CHECK-LABEL: @and_cmp_is_trunc_even_with_undef_elts(

View File

@ -2901,45 +2901,49 @@ define void @foo7(double* noalias %out, double** noalias %in, i8* noalias %trigg
; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 12
; AVX-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <4 x i8>*
; AVX-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x i8>, <4 x i8>* [[TMP7]], align 1
; AVX-NEXT: [[TMP8:%.*]] = trunc <4 x i8> [[WIDE_LOAD]] to <4 x i1>
; AVX-NEXT: [[TMP9:%.*]] = trunc <4 x i8> [[WIDE_LOAD10]] to <4 x i1>
; AVX-NEXT: [[TMP10:%.*]] = trunc <4 x i8> [[WIDE_LOAD11]] to <4 x i1>
; AVX-NEXT: [[TMP11:%.*]] = trunc <4 x i8> [[WIDE_LOAD12]] to <4 x i1>
; AVX-NEXT: [[TMP12:%.*]] = getelementptr inbounds double*, double** [[IN:%.*]], i64 [[INDEX]]
; AVX-NEXT: [[TMP13:%.*]] = bitcast double** [[TMP12]] to <4 x double*>*
; AVX-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP13]], i32 8, <4 x i1> [[TMP8]], <4 x double*> undef)
; AVX-NEXT: [[TMP14:%.*]] = getelementptr inbounds double*, double** [[TMP12]], i64 4
; AVX-NEXT: [[TMP15:%.*]] = bitcast double** [[TMP14]] to <4 x double*>*
; AVX-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP15]], i32 8, <4 x i1> [[TMP9]], <4 x double*> undef)
; AVX-NEXT: [[TMP16:%.*]] = getelementptr inbounds double*, double** [[TMP12]], i64 8
; AVX-NEXT: [[TMP8:%.*]] = and <4 x i8> [[WIDE_LOAD]], <i8 1, i8 1, i8 1, i8 1>
; AVX-NEXT: [[TMP9:%.*]] = and <4 x i8> [[WIDE_LOAD10]], <i8 1, i8 1, i8 1, i8 1>
; AVX-NEXT: [[TMP10:%.*]] = and <4 x i8> [[WIDE_LOAD11]], <i8 1, i8 1, i8 1, i8 1>
; AVX-NEXT: [[TMP11:%.*]] = and <4 x i8> [[WIDE_LOAD12]], <i8 1, i8 1, i8 1, i8 1>
; AVX-NEXT: [[TMP12:%.*]] = icmp ne <4 x i8> [[TMP8]], zeroinitializer
; AVX-NEXT: [[TMP13:%.*]] = icmp ne <4 x i8> [[TMP9]], zeroinitializer
; AVX-NEXT: [[TMP14:%.*]] = icmp ne <4 x i8> [[TMP10]], zeroinitializer
; AVX-NEXT: [[TMP15:%.*]] = icmp ne <4 x i8> [[TMP11]], zeroinitializer
; AVX-NEXT: [[TMP16:%.*]] = getelementptr inbounds double*, double** [[IN:%.*]], i64 [[INDEX]]
; AVX-NEXT: [[TMP17:%.*]] = bitcast double** [[TMP16]] to <4 x double*>*
; AVX-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP17]], i32 8, <4 x i1> [[TMP10]], <4 x double*> undef)
; AVX-NEXT: [[TMP18:%.*]] = getelementptr inbounds double*, double** [[TMP12]], i64 12
; AVX-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP17]], i32 8, <4 x i1> [[TMP12]], <4 x double*> undef)
; AVX-NEXT: [[TMP18:%.*]] = getelementptr inbounds double*, double** [[TMP16]], i64 4
; AVX-NEXT: [[TMP19:%.*]] = bitcast double** [[TMP18]] to <4 x double*>*
; AVX-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP19]], i32 8, <4 x i1> [[TMP11]], <4 x double*> undef)
; AVX-NEXT: [[TMP20:%.*]] = icmp ne <4 x double*> [[WIDE_MASKED_LOAD]], zeroinitializer
; AVX-NEXT: [[TMP21:%.*]] = icmp ne <4 x double*> [[WIDE_MASKED_LOAD13]], zeroinitializer
; AVX-NEXT: [[TMP22:%.*]] = icmp ne <4 x double*> [[WIDE_MASKED_LOAD14]], zeroinitializer
; AVX-NEXT: [[TMP23:%.*]] = icmp ne <4 x double*> [[WIDE_MASKED_LOAD15]], zeroinitializer
; AVX-NEXT: [[TMP24:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[INDEX]]
; AVX-NEXT: [[TMP25:%.*]] = and <4 x i1> [[TMP20]], [[TMP8]]
; AVX-NEXT: [[TMP26:%.*]] = and <4 x i1> [[TMP21]], [[TMP9]]
; AVX-NEXT: [[TMP27:%.*]] = and <4 x i1> [[TMP22]], [[TMP10]]
; AVX-NEXT: [[TMP28:%.*]] = and <4 x i1> [[TMP23]], [[TMP11]]
; AVX-NEXT: [[TMP29:%.*]] = bitcast double* [[TMP24]] to <4 x double>*
; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP29]], i32 8, <4 x i1> [[TMP25]])
; AVX-NEXT: [[TMP30:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 4
; AVX-NEXT: [[TMP31:%.*]] = bitcast double* [[TMP30]] to <4 x double>*
; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP31]], i32 8, <4 x i1> [[TMP26]])
; AVX-NEXT: [[TMP32:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 8
; AVX-NEXT: [[TMP33:%.*]] = bitcast double* [[TMP32]] to <4 x double>*
; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP33]], i32 8, <4 x i1> [[TMP27]])
; AVX-NEXT: [[TMP34:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 12
; AVX-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP19]], i32 8, <4 x i1> [[TMP13]], <4 x double*> undef)
; AVX-NEXT: [[TMP20:%.*]] = getelementptr inbounds double*, double** [[TMP16]], i64 8
; AVX-NEXT: [[TMP21:%.*]] = bitcast double** [[TMP20]] to <4 x double*>*
; AVX-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP21]], i32 8, <4 x i1> [[TMP14]], <4 x double*> undef)
; AVX-NEXT: [[TMP22:%.*]] = getelementptr inbounds double*, double** [[TMP16]], i64 12
; AVX-NEXT: [[TMP23:%.*]] = bitcast double** [[TMP22]] to <4 x double*>*
; AVX-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP23]], i32 8, <4 x i1> [[TMP15]], <4 x double*> undef)
; AVX-NEXT: [[TMP24:%.*]] = icmp ne <4 x double*> [[WIDE_MASKED_LOAD]], zeroinitializer
; AVX-NEXT: [[TMP25:%.*]] = icmp ne <4 x double*> [[WIDE_MASKED_LOAD13]], zeroinitializer
; AVX-NEXT: [[TMP26:%.*]] = icmp ne <4 x double*> [[WIDE_MASKED_LOAD14]], zeroinitializer
; AVX-NEXT: [[TMP27:%.*]] = icmp ne <4 x double*> [[WIDE_MASKED_LOAD15]], zeroinitializer
; AVX-NEXT: [[TMP28:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[INDEX]]
; AVX-NEXT: [[TMP29:%.*]] = and <4 x i1> [[TMP24]], [[TMP12]]
; AVX-NEXT: [[TMP30:%.*]] = and <4 x i1> [[TMP25]], [[TMP13]]
; AVX-NEXT: [[TMP31:%.*]] = and <4 x i1> [[TMP26]], [[TMP14]]
; AVX-NEXT: [[TMP32:%.*]] = and <4 x i1> [[TMP27]], [[TMP15]]
; AVX-NEXT: [[TMP33:%.*]] = bitcast double* [[TMP28]] to <4 x double>*
; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP33]], i32 8, <4 x i1> [[TMP29]])
; AVX-NEXT: [[TMP34:%.*]] = getelementptr inbounds double, double* [[TMP28]], i64 4
; AVX-NEXT: [[TMP35:%.*]] = bitcast double* [[TMP34]] to <4 x double>*
; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP35]], i32 8, <4 x i1> [[TMP28]])
; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP35]], i32 8, <4 x i1> [[TMP30]])
; AVX-NEXT: [[TMP36:%.*]] = getelementptr inbounds double, double* [[TMP28]], i64 8
; AVX-NEXT: [[TMP37:%.*]] = bitcast double* [[TMP36]] to <4 x double>*
; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP37]], i32 8, <4 x i1> [[TMP31]])
; AVX-NEXT: [[TMP38:%.*]] = getelementptr inbounds double, double* [[TMP28]], i64 12
; AVX-NEXT: [[TMP39:%.*]] = bitcast double* [[TMP38]] to <4 x double>*
; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP39]], i32 8, <4 x i1> [[TMP32]])
; AVX-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16
; AVX-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; AVX-NEXT: br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !51
; AVX-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; AVX-NEXT: br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !51
; AVX: middle.block:
; AVX-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[WIDE_TRIP_COUNT]]
; AVX-NEXT: br i1 [[CMP_N]], label [[FOR_END]], label [[FOR_BODY_PREHEADER16]]
@ -2949,14 +2953,14 @@ define void @foo7(double* noalias %out, double** noalias %in, i8* noalias %trigg
; AVX: for.body:
; AVX-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER16]] ]
; AVX-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[INDVARS_IV]]
; AVX-NEXT: [[TMP37:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
; AVX-NEXT: [[TMP38:%.*]] = and i8 [[TMP37]], 1
; AVX-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP38]], 0
; AVX-NEXT: [[TMP41:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
; AVX-NEXT: [[TMP42:%.*]] = and i8 [[TMP41]], 1
; AVX-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP42]], 0
; AVX-NEXT: br i1 [[TOBOOL]], label [[FOR_INC]], label [[LAND_LHS_TRUE:%.*]]
; AVX: land.lhs.true:
; AVX-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double*, double** [[IN]], i64 [[INDVARS_IV]]
; AVX-NEXT: [[TMP39:%.*]] = load double*, double** [[ARRAYIDX2]], align 8
; AVX-NEXT: [[CMP3:%.*]] = icmp eq double* [[TMP39]], null
; AVX-NEXT: [[TMP43:%.*]] = load double*, double** [[ARRAYIDX2]], align 8
; AVX-NEXT: [[CMP3:%.*]] = icmp eq double* [[TMP43]], null
; AVX-NEXT: br i1 [[CMP3]], label [[FOR_INC]], label [[IF_THEN:%.*]]
; AVX: if.then:
; AVX-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]]
@ -2994,45 +2998,49 @@ define void @foo7(double* noalias %out, double** noalias %in, i8* noalias %trigg
; AVX512-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 24
; AVX512-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <8 x i8>*
; AVX512-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i8>, <8 x i8>* [[TMP7]], align 1
; AVX512-NEXT: [[TMP8:%.*]] = trunc <8 x i8> [[WIDE_LOAD]] to <8 x i1>
; AVX512-NEXT: [[TMP9:%.*]] = trunc <8 x i8> [[WIDE_LOAD10]] to <8 x i1>
; AVX512-NEXT: [[TMP10:%.*]] = trunc <8 x i8> [[WIDE_LOAD11]] to <8 x i1>
; AVX512-NEXT: [[TMP11:%.*]] = trunc <8 x i8> [[WIDE_LOAD12]] to <8 x i1>
; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds double*, double** [[IN:%.*]], i64 [[INDEX]]
; AVX512-NEXT: [[TMP13:%.*]] = bitcast double** [[TMP12]] to <8 x double*>*
; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP13]], i32 8, <8 x i1> [[TMP8]], <8 x double*> undef)
; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds double*, double** [[TMP12]], i64 8
; AVX512-NEXT: [[TMP15:%.*]] = bitcast double** [[TMP14]] to <8 x double*>*
; AVX512-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP15]], i32 8, <8 x i1> [[TMP9]], <8 x double*> undef)
; AVX512-NEXT: [[TMP16:%.*]] = getelementptr inbounds double*, double** [[TMP12]], i64 16
; AVX512-NEXT: [[TMP8:%.*]] = and <8 x i8> [[WIDE_LOAD]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
; AVX512-NEXT: [[TMP9:%.*]] = and <8 x i8> [[WIDE_LOAD10]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
; AVX512-NEXT: [[TMP10:%.*]] = and <8 x i8> [[WIDE_LOAD11]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
; AVX512-NEXT: [[TMP11:%.*]] = and <8 x i8> [[WIDE_LOAD12]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
; AVX512-NEXT: [[TMP12:%.*]] = icmp ne <8 x i8> [[TMP8]], zeroinitializer
; AVX512-NEXT: [[TMP13:%.*]] = icmp ne <8 x i8> [[TMP9]], zeroinitializer
; AVX512-NEXT: [[TMP14:%.*]] = icmp ne <8 x i8> [[TMP10]], zeroinitializer
; AVX512-NEXT: [[TMP15:%.*]] = icmp ne <8 x i8> [[TMP11]], zeroinitializer
; AVX512-NEXT: [[TMP16:%.*]] = getelementptr inbounds double*, double** [[IN:%.*]], i64 [[INDEX]]
; AVX512-NEXT: [[TMP17:%.*]] = bitcast double** [[TMP16]] to <8 x double*>*
; AVX512-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP17]], i32 8, <8 x i1> [[TMP10]], <8 x double*> undef)
; AVX512-NEXT: [[TMP18:%.*]] = getelementptr inbounds double*, double** [[TMP12]], i64 24
; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP17]], i32 8, <8 x i1> [[TMP12]], <8 x double*> undef)
; AVX512-NEXT: [[TMP18:%.*]] = getelementptr inbounds double*, double** [[TMP16]], i64 8
; AVX512-NEXT: [[TMP19:%.*]] = bitcast double** [[TMP18]] to <8 x double*>*
; AVX512-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP19]], i32 8, <8 x i1> [[TMP11]], <8 x double*> undef)
; AVX512-NEXT: [[TMP20:%.*]] = icmp ne <8 x double*> [[WIDE_MASKED_LOAD]], zeroinitializer
; AVX512-NEXT: [[TMP21:%.*]] = icmp ne <8 x double*> [[WIDE_MASKED_LOAD13]], zeroinitializer
; AVX512-NEXT: [[TMP22:%.*]] = icmp ne <8 x double*> [[WIDE_MASKED_LOAD14]], zeroinitializer
; AVX512-NEXT: [[TMP23:%.*]] = icmp ne <8 x double*> [[WIDE_MASKED_LOAD15]], zeroinitializer
; AVX512-NEXT: [[TMP24:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[INDEX]]
; AVX512-NEXT: [[TMP25:%.*]] = and <8 x i1> [[TMP20]], [[TMP8]]
; AVX512-NEXT: [[TMP26:%.*]] = and <8 x i1> [[TMP21]], [[TMP9]]
; AVX512-NEXT: [[TMP27:%.*]] = and <8 x i1> [[TMP22]], [[TMP10]]
; AVX512-NEXT: [[TMP28:%.*]] = and <8 x i1> [[TMP23]], [[TMP11]]
; AVX512-NEXT: [[TMP29:%.*]] = bitcast double* [[TMP24]] to <8 x double>*
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x double>* [[TMP29]], i32 8, <8 x i1> [[TMP25]])
; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 8
; AVX512-NEXT: [[TMP31:%.*]] = bitcast double* [[TMP30]] to <8 x double>*
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x double>* [[TMP31]], i32 8, <8 x i1> [[TMP26]])
; AVX512-NEXT: [[TMP32:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 16
; AVX512-NEXT: [[TMP33:%.*]] = bitcast double* [[TMP32]] to <8 x double>*
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x double>* [[TMP33]], i32 8, <8 x i1> [[TMP27]])
; AVX512-NEXT: [[TMP34:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 24
; AVX512-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP19]], i32 8, <8 x i1> [[TMP13]], <8 x double*> undef)
; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds double*, double** [[TMP16]], i64 16
; AVX512-NEXT: [[TMP21:%.*]] = bitcast double** [[TMP20]] to <8 x double*>*
; AVX512-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP21]], i32 8, <8 x i1> [[TMP14]], <8 x double*> undef)
; AVX512-NEXT: [[TMP22:%.*]] = getelementptr inbounds double*, double** [[TMP16]], i64 24
; AVX512-NEXT: [[TMP23:%.*]] = bitcast double** [[TMP22]] to <8 x double*>*
; AVX512-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP23]], i32 8, <8 x i1> [[TMP15]], <8 x double*> undef)
; AVX512-NEXT: [[TMP24:%.*]] = icmp ne <8 x double*> [[WIDE_MASKED_LOAD]], zeroinitializer
; AVX512-NEXT: [[TMP25:%.*]] = icmp ne <8 x double*> [[WIDE_MASKED_LOAD13]], zeroinitializer
; AVX512-NEXT: [[TMP26:%.*]] = icmp ne <8 x double*> [[WIDE_MASKED_LOAD14]], zeroinitializer
; AVX512-NEXT: [[TMP27:%.*]] = icmp ne <8 x double*> [[WIDE_MASKED_LOAD15]], zeroinitializer
; AVX512-NEXT: [[TMP28:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[INDEX]]
; AVX512-NEXT: [[TMP29:%.*]] = and <8 x i1> [[TMP24]], [[TMP12]]
; AVX512-NEXT: [[TMP30:%.*]] = and <8 x i1> [[TMP25]], [[TMP13]]
; AVX512-NEXT: [[TMP31:%.*]] = and <8 x i1> [[TMP26]], [[TMP14]]
; AVX512-NEXT: [[TMP32:%.*]] = and <8 x i1> [[TMP27]], [[TMP15]]
; AVX512-NEXT: [[TMP33:%.*]] = bitcast double* [[TMP28]] to <8 x double>*
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x double>* [[TMP33]], i32 8, <8 x i1> [[TMP29]])
; AVX512-NEXT: [[TMP34:%.*]] = getelementptr inbounds double, double* [[TMP28]], i64 8
; AVX512-NEXT: [[TMP35:%.*]] = bitcast double* [[TMP34]] to <8 x double>*
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x double>* [[TMP35]], i32 8, <8 x i1> [[TMP28]])
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x double>* [[TMP35]], i32 8, <8 x i1> [[TMP30]])
; AVX512-NEXT: [[TMP36:%.*]] = getelementptr inbounds double, double* [[TMP28]], i64 16
; AVX512-NEXT: [[TMP37:%.*]] = bitcast double* [[TMP36]] to <8 x double>*
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x double>* [[TMP37]], i32 8, <8 x i1> [[TMP31]])
; AVX512-NEXT: [[TMP38:%.*]] = getelementptr inbounds double, double* [[TMP28]], i64 24
; AVX512-NEXT: [[TMP39:%.*]] = bitcast double* [[TMP38]] to <8 x double>*
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x double>* [[TMP39]], i32 8, <8 x i1> [[TMP32]])
; AVX512-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 32
; AVX512-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; AVX512-NEXT: br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !63
; AVX512-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; AVX512-NEXT: br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !63
; AVX512: middle.block:
; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[WIDE_TRIP_COUNT]]
; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END]], label [[FOR_BODY_PREHEADER16]]
@ -3042,14 +3050,14 @@ define void @foo7(double* noalias %out, double** noalias %in, i8* noalias %trigg
; AVX512: for.body:
; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER16]] ]
; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[INDVARS_IV]]
; AVX512-NEXT: [[TMP37:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
; AVX512-NEXT: [[TMP38:%.*]] = and i8 [[TMP37]], 1
; AVX512-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP38]], 0
; AVX512-NEXT: [[TMP41:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
; AVX512-NEXT: [[TMP42:%.*]] = and i8 [[TMP41]], 1
; AVX512-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP42]], 0
; AVX512-NEXT: br i1 [[TOBOOL]], label [[FOR_INC]], label [[LAND_LHS_TRUE:%.*]]
; AVX512: land.lhs.true:
; AVX512-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double*, double** [[IN]], i64 [[INDVARS_IV]]
; AVX512-NEXT: [[TMP39:%.*]] = load double*, double** [[ARRAYIDX2]], align 8
; AVX512-NEXT: [[CMP3:%.*]] = icmp eq double* [[TMP39]], null
; AVX512-NEXT: [[TMP43:%.*]] = load double*, double** [[ARRAYIDX2]], align 8
; AVX512-NEXT: [[CMP3:%.*]] = icmp eq double* [[TMP43]], null
; AVX512-NEXT: br i1 [[CMP3]], label [[FOR_INC]], label [[IF_THEN:%.*]]
; AVX512: if.then:
; AVX512-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]]
@ -3154,45 +3162,49 @@ define void @foo8(double* noalias %out, i32 ()** noalias %in, i8* noalias %trigg
; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 12
; AVX-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <4 x i8>*
; AVX-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x i8>, <4 x i8>* [[TMP7]], align 1
; AVX-NEXT: [[TMP8:%.*]] = trunc <4 x i8> [[WIDE_LOAD]] to <4 x i1>
; AVX-NEXT: [[TMP9:%.*]] = trunc <4 x i8> [[WIDE_LOAD10]] to <4 x i1>
; AVX-NEXT: [[TMP10:%.*]] = trunc <4 x i8> [[WIDE_LOAD11]] to <4 x i1>
; AVX-NEXT: [[TMP11:%.*]] = trunc <4 x i8> [[WIDE_LOAD12]] to <4 x i1>
; AVX-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN:%.*]], i64 [[INDEX]]
; AVX-NEXT: [[TMP13:%.*]] = bitcast i32 ()** [[TMP12]] to <4 x i32 ()*>*
; AVX-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP13]], i32 8, <4 x i1> [[TMP8]], <4 x i32 ()*> undef)
; AVX-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP12]], i64 4
; AVX-NEXT: [[TMP15:%.*]] = bitcast i32 ()** [[TMP14]] to <4 x i32 ()*>*
; AVX-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP15]], i32 8, <4 x i1> [[TMP9]], <4 x i32 ()*> undef)
; AVX-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP12]], i64 8
; AVX-NEXT: [[TMP8:%.*]] = and <4 x i8> [[WIDE_LOAD]], <i8 1, i8 1, i8 1, i8 1>
; AVX-NEXT: [[TMP9:%.*]] = and <4 x i8> [[WIDE_LOAD10]], <i8 1, i8 1, i8 1, i8 1>
; AVX-NEXT: [[TMP10:%.*]] = and <4 x i8> [[WIDE_LOAD11]], <i8 1, i8 1, i8 1, i8 1>
; AVX-NEXT: [[TMP11:%.*]] = and <4 x i8> [[WIDE_LOAD12]], <i8 1, i8 1, i8 1, i8 1>
; AVX-NEXT: [[TMP12:%.*]] = icmp ne <4 x i8> [[TMP8]], zeroinitializer
; AVX-NEXT: [[TMP13:%.*]] = icmp ne <4 x i8> [[TMP9]], zeroinitializer
; AVX-NEXT: [[TMP14:%.*]] = icmp ne <4 x i8> [[TMP10]], zeroinitializer
; AVX-NEXT: [[TMP15:%.*]] = icmp ne <4 x i8> [[TMP11]], zeroinitializer
; AVX-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN:%.*]], i64 [[INDEX]]
; AVX-NEXT: [[TMP17:%.*]] = bitcast i32 ()** [[TMP16]] to <4 x i32 ()*>*
; AVX-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP17]], i32 8, <4 x i1> [[TMP10]], <4 x i32 ()*> undef)
; AVX-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP12]], i64 12
; AVX-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP17]], i32 8, <4 x i1> [[TMP12]], <4 x i32 ()*> undef)
; AVX-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP16]], i64 4
; AVX-NEXT: [[TMP19:%.*]] = bitcast i32 ()** [[TMP18]] to <4 x i32 ()*>*
; AVX-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP19]], i32 8, <4 x i1> [[TMP11]], <4 x i32 ()*> undef)
; AVX-NEXT: [[TMP20:%.*]] = icmp ne <4 x i32 ()*> [[WIDE_MASKED_LOAD]], zeroinitializer
; AVX-NEXT: [[TMP21:%.*]] = icmp ne <4 x i32 ()*> [[WIDE_MASKED_LOAD13]], zeroinitializer
; AVX-NEXT: [[TMP22:%.*]] = icmp ne <4 x i32 ()*> [[WIDE_MASKED_LOAD14]], zeroinitializer
; AVX-NEXT: [[TMP23:%.*]] = icmp ne <4 x i32 ()*> [[WIDE_MASKED_LOAD15]], zeroinitializer
; AVX-NEXT: [[TMP24:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[INDEX]]
; AVX-NEXT: [[TMP25:%.*]] = and <4 x i1> [[TMP20]], [[TMP8]]
; AVX-NEXT: [[TMP26:%.*]] = and <4 x i1> [[TMP21]], [[TMP9]]
; AVX-NEXT: [[TMP27:%.*]] = and <4 x i1> [[TMP22]], [[TMP10]]
; AVX-NEXT: [[TMP28:%.*]] = and <4 x i1> [[TMP23]], [[TMP11]]
; AVX-NEXT: [[TMP29:%.*]] = bitcast double* [[TMP24]] to <4 x double>*
; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP29]], i32 8, <4 x i1> [[TMP25]])
; AVX-NEXT: [[TMP30:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 4
; AVX-NEXT: [[TMP31:%.*]] = bitcast double* [[TMP30]] to <4 x double>*
; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP31]], i32 8, <4 x i1> [[TMP26]])
; AVX-NEXT: [[TMP32:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 8
; AVX-NEXT: [[TMP33:%.*]] = bitcast double* [[TMP32]] to <4 x double>*
; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP33]], i32 8, <4 x i1> [[TMP27]])
; AVX-NEXT: [[TMP34:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 12
; AVX-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP19]], i32 8, <4 x i1> [[TMP13]], <4 x i32 ()*> undef)
; AVX-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP16]], i64 8
; AVX-NEXT: [[TMP21:%.*]] = bitcast i32 ()** [[TMP20]] to <4 x i32 ()*>*
; AVX-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP21]], i32 8, <4 x i1> [[TMP14]], <4 x i32 ()*> undef)
; AVX-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP16]], i64 12
; AVX-NEXT: [[TMP23:%.*]] = bitcast i32 ()** [[TMP22]] to <4 x i32 ()*>*
; AVX-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP23]], i32 8, <4 x i1> [[TMP15]], <4 x i32 ()*> undef)
; AVX-NEXT: [[TMP24:%.*]] = icmp ne <4 x i32 ()*> [[WIDE_MASKED_LOAD]], zeroinitializer
; AVX-NEXT: [[TMP25:%.*]] = icmp ne <4 x i32 ()*> [[WIDE_MASKED_LOAD13]], zeroinitializer
; AVX-NEXT: [[TMP26:%.*]] = icmp ne <4 x i32 ()*> [[WIDE_MASKED_LOAD14]], zeroinitializer
; AVX-NEXT: [[TMP27:%.*]] = icmp ne <4 x i32 ()*> [[WIDE_MASKED_LOAD15]], zeroinitializer
; AVX-NEXT: [[TMP28:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[INDEX]]
; AVX-NEXT: [[TMP29:%.*]] = and <4 x i1> [[TMP24]], [[TMP12]]
; AVX-NEXT: [[TMP30:%.*]] = and <4 x i1> [[TMP25]], [[TMP13]]
; AVX-NEXT: [[TMP31:%.*]] = and <4 x i1> [[TMP26]], [[TMP14]]
; AVX-NEXT: [[TMP32:%.*]] = and <4 x i1> [[TMP27]], [[TMP15]]
; AVX-NEXT: [[TMP33:%.*]] = bitcast double* [[TMP28]] to <4 x double>*
; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP33]], i32 8, <4 x i1> [[TMP29]])
; AVX-NEXT: [[TMP34:%.*]] = getelementptr inbounds double, double* [[TMP28]], i64 4
; AVX-NEXT: [[TMP35:%.*]] = bitcast double* [[TMP34]] to <4 x double>*
; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP35]], i32 8, <4 x i1> [[TMP28]])
; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP35]], i32 8, <4 x i1> [[TMP30]])
; AVX-NEXT: [[TMP36:%.*]] = getelementptr inbounds double, double* [[TMP28]], i64 8
; AVX-NEXT: [[TMP37:%.*]] = bitcast double* [[TMP36]] to <4 x double>*
; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP37]], i32 8, <4 x i1> [[TMP31]])
; AVX-NEXT: [[TMP38:%.*]] = getelementptr inbounds double, double* [[TMP28]], i64 12
; AVX-NEXT: [[TMP39:%.*]] = bitcast double* [[TMP38]] to <4 x double>*
; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP39]], i32 8, <4 x i1> [[TMP32]])
; AVX-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16
; AVX-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; AVX-NEXT: br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !54
; AVX-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; AVX-NEXT: br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !54
; AVX: middle.block:
; AVX-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[WIDE_TRIP_COUNT]]
; AVX-NEXT: br i1 [[CMP_N]], label [[FOR_END]], label [[FOR_BODY_PREHEADER16]]
@ -3202,14 +3214,14 @@ define void @foo8(double* noalias %out, i32 ()** noalias %in, i8* noalias %trigg
; AVX: for.body:
; AVX-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER16]] ]
; AVX-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[INDVARS_IV]]
; AVX-NEXT: [[TMP37:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
; AVX-NEXT: [[TMP38:%.*]] = and i8 [[TMP37]], 1
; AVX-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP38]], 0
; AVX-NEXT: [[TMP41:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
; AVX-NEXT: [[TMP42:%.*]] = and i8 [[TMP41]], 1
; AVX-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP42]], 0
; AVX-NEXT: br i1 [[TOBOOL]], label [[FOR_INC]], label [[LAND_LHS_TRUE:%.*]]
; AVX: land.lhs.true:
; AVX-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN]], i64 [[INDVARS_IV]]
; AVX-NEXT: [[TMP39:%.*]] = load i32 ()*, i32 ()** [[ARRAYIDX2]], align 8
; AVX-NEXT: [[CMP3:%.*]] = icmp eq i32 ()* [[TMP39]], null
; AVX-NEXT: [[TMP43:%.*]] = load i32 ()*, i32 ()** [[ARRAYIDX2]], align 8
; AVX-NEXT: [[CMP3:%.*]] = icmp eq i32 ()* [[TMP43]], null
; AVX-NEXT: br i1 [[CMP3]], label [[FOR_INC]], label [[IF_THEN:%.*]]
; AVX: if.then:
; AVX-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]]
@ -3247,45 +3259,49 @@ define void @foo8(double* noalias %out, i32 ()** noalias %in, i8* noalias %trigg
; AVX512-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 24
; AVX512-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <8 x i8>*
; AVX512-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i8>, <8 x i8>* [[TMP7]], align 1
; AVX512-NEXT: [[TMP8:%.*]] = trunc <8 x i8> [[WIDE_LOAD]] to <8 x i1>
; AVX512-NEXT: [[TMP9:%.*]] = trunc <8 x i8> [[WIDE_LOAD10]] to <8 x i1>
; AVX512-NEXT: [[TMP10:%.*]] = trunc <8 x i8> [[WIDE_LOAD11]] to <8 x i1>
; AVX512-NEXT: [[TMP11:%.*]] = trunc <8 x i8> [[WIDE_LOAD12]] to <8 x i1>
; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN:%.*]], i64 [[INDEX]]
; AVX512-NEXT: [[TMP13:%.*]] = bitcast i32 ()** [[TMP12]] to <8 x i32 ()*>*
; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP13]], i32 8, <8 x i1> [[TMP8]], <8 x i32 ()*> undef)
; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP12]], i64 8
; AVX512-NEXT: [[TMP15:%.*]] = bitcast i32 ()** [[TMP14]] to <8 x i32 ()*>*
; AVX512-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP15]], i32 8, <8 x i1> [[TMP9]], <8 x i32 ()*> undef)
; AVX512-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP12]], i64 16
; AVX512-NEXT: [[TMP8:%.*]] = and <8 x i8> [[WIDE_LOAD]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
; AVX512-NEXT: [[TMP9:%.*]] = and <8 x i8> [[WIDE_LOAD10]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
; AVX512-NEXT: [[TMP10:%.*]] = and <8 x i8> [[WIDE_LOAD11]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
; AVX512-NEXT: [[TMP11:%.*]] = and <8 x i8> [[WIDE_LOAD12]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
; AVX512-NEXT: [[TMP12:%.*]] = icmp ne <8 x i8> [[TMP8]], zeroinitializer
; AVX512-NEXT: [[TMP13:%.*]] = icmp ne <8 x i8> [[TMP9]], zeroinitializer
; AVX512-NEXT: [[TMP14:%.*]] = icmp ne <8 x i8> [[TMP10]], zeroinitializer
; AVX512-NEXT: [[TMP15:%.*]] = icmp ne <8 x i8> [[TMP11]], zeroinitializer
; AVX512-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN:%.*]], i64 [[INDEX]]
; AVX512-NEXT: [[TMP17:%.*]] = bitcast i32 ()** [[TMP16]] to <8 x i32 ()*>*
; AVX512-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP17]], i32 8, <8 x i1> [[TMP10]], <8 x i32 ()*> undef)
; AVX512-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP12]], i64 24
; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP17]], i32 8, <8 x i1> [[TMP12]], <8 x i32 ()*> undef)
; AVX512-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP16]], i64 8
; AVX512-NEXT: [[TMP19:%.*]] = bitcast i32 ()** [[TMP18]] to <8 x i32 ()*>*
; AVX512-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP19]], i32 8, <8 x i1> [[TMP11]], <8 x i32 ()*> undef)
; AVX512-NEXT: [[TMP20:%.*]] = icmp ne <8 x i32 ()*> [[WIDE_MASKED_LOAD]], zeroinitializer
; AVX512-NEXT: [[TMP21:%.*]] = icmp ne <8 x i32 ()*> [[WIDE_MASKED_LOAD13]], zeroinitializer
; AVX512-NEXT: [[TMP22:%.*]] = icmp ne <8 x i32 ()*> [[WIDE_MASKED_LOAD14]], zeroinitializer
; AVX512-NEXT: [[TMP23:%.*]] = icmp ne <8 x i32 ()*> [[WIDE_MASKED_LOAD15]], zeroinitializer
; AVX512-NEXT: [[TMP24:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[INDEX]]
; AVX512-NEXT: [[TMP25:%.*]] = and <8 x i1> [[TMP20]], [[TMP8]]
; AVX512-NEXT: [[TMP26:%.*]] = and <8 x i1> [[TMP21]], [[TMP9]]
; AVX512-NEXT: [[TMP27:%.*]] = and <8 x i1> [[TMP22]], [[TMP10]]
; AVX512-NEXT: [[TMP28:%.*]] = and <8 x i1> [[TMP23]], [[TMP11]]
; AVX512-NEXT: [[TMP29:%.*]] = bitcast double* [[TMP24]] to <8 x double>*
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x double>* [[TMP29]], i32 8, <8 x i1> [[TMP25]])
; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 8
; AVX512-NEXT: [[TMP31:%.*]] = bitcast double* [[TMP30]] to <8 x double>*
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x double>* [[TMP31]], i32 8, <8 x i1> [[TMP26]])
; AVX512-NEXT: [[TMP32:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 16
; AVX512-NEXT: [[TMP33:%.*]] = bitcast double* [[TMP32]] to <8 x double>*
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x double>* [[TMP33]], i32 8, <8 x i1> [[TMP27]])
; AVX512-NEXT: [[TMP34:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 24
; AVX512-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP19]], i32 8, <8 x i1> [[TMP13]], <8 x i32 ()*> undef)
; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP16]], i64 16
; AVX512-NEXT: [[TMP21:%.*]] = bitcast i32 ()** [[TMP20]] to <8 x i32 ()*>*
; AVX512-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP21]], i32 8, <8 x i1> [[TMP14]], <8 x i32 ()*> undef)
; AVX512-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP16]], i64 24
; AVX512-NEXT: [[TMP23:%.*]] = bitcast i32 ()** [[TMP22]] to <8 x i32 ()*>*
; AVX512-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP23]], i32 8, <8 x i1> [[TMP15]], <8 x i32 ()*> undef)
; AVX512-NEXT: [[TMP24:%.*]] = icmp ne <8 x i32 ()*> [[WIDE_MASKED_LOAD]], zeroinitializer
; AVX512-NEXT: [[TMP25:%.*]] = icmp ne <8 x i32 ()*> [[WIDE_MASKED_LOAD13]], zeroinitializer
; AVX512-NEXT: [[TMP26:%.*]] = icmp ne <8 x i32 ()*> [[WIDE_MASKED_LOAD14]], zeroinitializer
; AVX512-NEXT: [[TMP27:%.*]] = icmp ne <8 x i32 ()*> [[WIDE_MASKED_LOAD15]], zeroinitializer
; AVX512-NEXT: [[TMP28:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[INDEX]]
; AVX512-NEXT: [[TMP29:%.*]] = and <8 x i1> [[TMP24]], [[TMP12]]
; AVX512-NEXT: [[TMP30:%.*]] = and <8 x i1> [[TMP25]], [[TMP13]]
; AVX512-NEXT: [[TMP31:%.*]] = and <8 x i1> [[TMP26]], [[TMP14]]
; AVX512-NEXT: [[TMP32:%.*]] = and <8 x i1> [[TMP27]], [[TMP15]]
; AVX512-NEXT: [[TMP33:%.*]] = bitcast double* [[TMP28]] to <8 x double>*
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x double>* [[TMP33]], i32 8, <8 x i1> [[TMP29]])
; AVX512-NEXT: [[TMP34:%.*]] = getelementptr inbounds double, double* [[TMP28]], i64 8
; AVX512-NEXT: [[TMP35:%.*]] = bitcast double* [[TMP34]] to <8 x double>*
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x double>* [[TMP35]], i32 8, <8 x i1> [[TMP28]])
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x double>* [[TMP35]], i32 8, <8 x i1> [[TMP30]])
; AVX512-NEXT: [[TMP36:%.*]] = getelementptr inbounds double, double* [[TMP28]], i64 16
; AVX512-NEXT: [[TMP37:%.*]] = bitcast double* [[TMP36]] to <8 x double>*
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x double>* [[TMP37]], i32 8, <8 x i1> [[TMP31]])
; AVX512-NEXT: [[TMP38:%.*]] = getelementptr inbounds double, double* [[TMP28]], i64 24
; AVX512-NEXT: [[TMP39:%.*]] = bitcast double* [[TMP38]] to <8 x double>*
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x double>* [[TMP39]], i32 8, <8 x i1> [[TMP32]])
; AVX512-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 32
; AVX512-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; AVX512-NEXT: br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !66
; AVX512-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; AVX512-NEXT: br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !66
; AVX512: middle.block:
; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[WIDE_TRIP_COUNT]]
; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END]], label [[FOR_BODY_PREHEADER16]]
@ -3295,14 +3311,14 @@ define void @foo8(double* noalias %out, i32 ()** noalias %in, i8* noalias %trigg
; AVX512: for.body:
; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER16]] ]
; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[INDVARS_IV]]
; AVX512-NEXT: [[TMP37:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
; AVX512-NEXT: [[TMP38:%.*]] = and i8 [[TMP37]], 1
; AVX512-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP38]], 0
; AVX512-NEXT: [[TMP41:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
; AVX512-NEXT: [[TMP42:%.*]] = and i8 [[TMP41]], 1
; AVX512-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP42]], 0
; AVX512-NEXT: br i1 [[TOBOOL]], label [[FOR_INC]], label [[LAND_LHS_TRUE:%.*]]
; AVX512: land.lhs.true:
; AVX512-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN]], i64 [[INDVARS_IV]]
; AVX512-NEXT: [[TMP39:%.*]] = load i32 ()*, i32 ()** [[ARRAYIDX2]], align 8
; AVX512-NEXT: [[CMP3:%.*]] = icmp eq i32 ()* [[TMP39]], null
; AVX512-NEXT: [[TMP43:%.*]] = load i32 ()*, i32 ()** [[ARRAYIDX2]], align 8
; AVX512-NEXT: [[CMP3:%.*]] = icmp eq i32 ()* [[TMP43]], null
; AVX512-NEXT: br i1 [[CMP3]], label [[FOR_INC]], label [[IF_THEN:%.*]]
; AVX512: if.then:
; AVX512-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]]