1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2025-01-31 20:51:52 +01:00

[ScalarizeMaskedMemIntrin] Bitcast the mask to the scalar domain and use scalar bit tests for the branches.

X86 at least is able to use movmsk or kmov to move the mask to the scalar
domain. Then we can just use test instructions to test individual bits.

This is more efficient than extracting each mask element
individually.

I special cased v1i1 to use the previous behavior. This avoids
poor type legalization of bitcast of v1i1 to i1.

I've skipped expandload/compressstore as I think we need to
handle constant masks for those better first.

Many tests end up with duplicate test instructions due to tail
duplication in the branch folding pass. But the same thing
happens when constructing similar code in C. So it's not unique
to the scalarization.

Not sure if this lowering code will also be good for other targets,
but we're only testing X86 today.

Differential Revision: https://reviews.llvm.org/D65319

llvm-svn: 367489
This commit is contained in:
Craig Topper 2019-07-31 22:58:15 +00:00
parent f234df8098
commit 4b86c5ad65
14 changed files with 23131 additions and 26591 deletions

View File

@ -173,15 +173,30 @@ static void scalarizeMaskedLoad(CallInst *CI, bool &ModifiedDT) {
return;
}
// If the mask is not v1i1, use scalar bit test operations. This generates
// better results on X86 at least.
Value *SclrMask;
if (VectorWidth != 1) {
Type *SclrMaskTy = Builder.getIntNTy(VectorWidth);
SclrMask = Builder.CreateBitCast(Mask, SclrMaskTy, "scalar_mask");
}
for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
// Fill the "else" block, created in the previous iteration
//
// %res.phi.else3 = phi <16 x i32> [ %11, %cond.load1 ], [ %res.phi.else, %else ]
// %mask_1 = extractelement <16 x i1> %mask, i32 Idx
// %mask_1 = and i16 %scalar_mask, i32 1 << Idx
// %cond = icmp ne i16 %mask_1, 0
// br i1 %mask_1, label %cond.load, label %else
//
Value *Predicate = Builder.CreateExtractElement(Mask, Idx);
Value *Predicate;
if (VectorWidth != 1) {
Value *Mask = Builder.getInt(APInt::getOneBitSet(VectorWidth, Idx));
Predicate = Builder.CreateICmpNE(Builder.CreateAnd(SclrMask, Mask),
Builder.getIntN(VectorWidth, 0));
} else {
Predicate = Builder.CreateExtractElement(Mask, Idx);
}
// Create "cond" block
//
@ -290,13 +305,29 @@ static void scalarizeMaskedStore(CallInst *CI, bool &ModifiedDT) {
return;
}
// If the mask is not v1i1, use scalar bit test operations. This generates
// better results on X86 at least.
Value *SclrMask;
if (VectorWidth != 1) {
Type *SclrMaskTy = Builder.getIntNTy(VectorWidth);
SclrMask = Builder.CreateBitCast(Mask, SclrMaskTy, "scalar_mask");
}
for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
// Fill the "else" block, created in the previous iteration
//
// %mask_1 = extractelement <16 x i1> %mask, i32 Idx
// %mask_1 = and i16 %scalar_mask, i32 1 << Idx
// %cond = icmp ne i16 %mask_1, 0
// br i1 %mask_1, label %cond.store, label %else
//
Value *Predicate = Builder.CreateExtractElement(Mask, Idx);
Value *Predicate;
if (VectorWidth != 1) {
Value *Mask = Builder.getInt(APInt::getOneBitSet(VectorWidth, Idx));
Predicate = Builder.CreateICmpNE(Builder.CreateAnd(SclrMask, Mask),
Builder.getIntN(VectorWidth, 0));
} else {
Predicate = Builder.CreateExtractElement(Mask, Idx);
}
// Create "cond" block
//
@ -392,15 +423,30 @@ static void scalarizeMaskedGather(CallInst *CI, bool &ModifiedDT) {
return;
}
// If the mask is not v1i1, use scalar bit test operations. This generates
// better results on X86 at least.
Value *SclrMask;
if (VectorWidth != 1) {
Type *SclrMaskTy = Builder.getIntNTy(VectorWidth);
SclrMask = Builder.CreateBitCast(Mask, SclrMaskTy, "scalar_mask");
}
for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
// Fill the "else" block, created in the previous iteration
//
// %Mask1 = extractelement <16 x i1> %Mask, i32 1
// %Mask1 = and i16 %scalar_mask, i32 1 << Idx
// %cond = icmp ne i16 %mask_1, 0
// br i1 %Mask1, label %cond.load, label %else
//
Value *Predicate =
Builder.CreateExtractElement(Mask, Idx, "Mask" + Twine(Idx));
Value *Predicate;
if (VectorWidth != 1) {
Value *Mask = Builder.getInt(APInt::getOneBitSet(VectorWidth, Idx));
Predicate = Builder.CreateICmpNE(Builder.CreateAnd(SclrMask, Mask),
Builder.getIntN(VectorWidth, 0));
} else {
Predicate = Builder.CreateExtractElement(Mask, Idx, "Mask" + Twine(Idx));
}
// Create "cond" block
//
@ -499,14 +545,29 @@ static void scalarizeMaskedScatter(CallInst *CI, bool &ModifiedDT) {
return;
}
// If the mask is not v1i1, use scalar bit test operations. This generates
// better results on X86 at least.
Value *SclrMask;
if (VectorWidth != 1) {
Type *SclrMaskTy = Builder.getIntNTy(VectorWidth);
SclrMask = Builder.CreateBitCast(Mask, SclrMaskTy, "scalar_mask");
}
for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
// Fill the "else" block, created in the previous iteration
//
// %Mask1 = extractelement <16 x i1> %Mask, i32 Idx
// %Mask1 = and i16 %scalar_mask, i32 1 << Idx
// %cond = icmp ne i16 %mask_1, 0
// br i1 %Mask1, label %cond.store, label %else
//
Value *Predicate =
Builder.CreateExtractElement(Mask, Idx, "Mask" + Twine(Idx));
Value *Predicate;
if (VectorWidth != 1) {
Value *Mask = Builder.getInt(APInt::getOneBitSet(VectorWidth, Idx));
Predicate = Builder.CreateICmpNE(Builder.CreateAnd(SclrMask, Mask),
Builder.getIntN(VectorWidth, 0));
} else {
Predicate = Builder.CreateExtractElement(Mask, Idx, "Mask" + Twine(Idx));
}
// Create "cond" block
//

View File

@ -31,22 +31,26 @@ define <2 x i32> @masked_gather_v2i32(<2 x i32*>* %ptr, <2 x i1> %masks, <2 x i3
; NOGATHER-LABEL: masked_gather_v2i32:
; NOGATHER: # %bb.0: # %entry
; NOGATHER-NEXT: vmovdqa (%rdi), %xmm2
; NOGATHER-NEXT: vpextrb $0, %xmm0, %eax
; NOGATHER-NEXT: testb $1, %al
; NOGATHER-NEXT: je .LBB0_2
; NOGATHER-NEXT: # %bb.1: # %cond.load
; NOGATHER-NEXT: vmovq %xmm2, %rax
; NOGATHER-NEXT: movl (%rax), %eax
; NOGATHER-NEXT: vpinsrq $0, %rax, %xmm1, %xmm1
; NOGATHER-NEXT: .LBB0_2: # %else
; NOGATHER-NEXT: vpextrb $8, %xmm0, %eax
; NOGATHER-NEXT: vpsllq $63, %xmm0, %xmm0
; NOGATHER-NEXT: vmovmskpd %xmm0, %eax
; NOGATHER-NEXT: testb $1, %al
; NOGATHER-NEXT: jne .LBB0_1
; NOGATHER-NEXT: # %bb.2: # %else
; NOGATHER-NEXT: testb $2, %al
; NOGATHER-NEXT: jne .LBB0_3
; NOGATHER-NEXT: .LBB0_4: # %else2
; NOGATHER-NEXT: vmovdqa %xmm1, %xmm0
; NOGATHER-NEXT: retq
; NOGATHER-NEXT: .LBB0_1: # %cond.load
; NOGATHER-NEXT: vmovq %xmm2, %rcx
; NOGATHER-NEXT: movl (%rcx), %ecx
; NOGATHER-NEXT: vpinsrq $0, %rcx, %xmm1, %xmm1
; NOGATHER-NEXT: testb $2, %al
; NOGATHER-NEXT: je .LBB0_4
; NOGATHER-NEXT: # %bb.3: # %cond.load1
; NOGATHER-NEXT: .LBB0_3: # %cond.load1
; NOGATHER-NEXT: vpextrq $1, %xmm2, %rax
; NOGATHER-NEXT: movl (%rax), %eax
; NOGATHER-NEXT: vpinsrq $1, %rax, %xmm1, %xmm1
; NOGATHER-NEXT: .LBB0_4: # %else2
; NOGATHER-NEXT: vmovdqa %xmm1, %xmm0
; NOGATHER-NEXT: retq
entry:
@ -80,22 +84,26 @@ define <4 x i32> @masked_gather_v2i32_concat(<2 x i32*>* %ptr, <2 x i1> %masks,
; NOGATHER-LABEL: masked_gather_v2i32_concat:
; NOGATHER: # %bb.0: # %entry
; NOGATHER-NEXT: vmovdqa (%rdi), %xmm2
; NOGATHER-NEXT: vpextrb $0, %xmm0, %eax
; NOGATHER-NEXT: testb $1, %al
; NOGATHER-NEXT: je .LBB1_2
; NOGATHER-NEXT: # %bb.1: # %cond.load
; NOGATHER-NEXT: vmovq %xmm2, %rax
; NOGATHER-NEXT: movl (%rax), %eax
; NOGATHER-NEXT: vpinsrq $0, %rax, %xmm1, %xmm1
; NOGATHER-NEXT: .LBB1_2: # %else
; NOGATHER-NEXT: vpextrb $8, %xmm0, %eax
; NOGATHER-NEXT: vpsllq $63, %xmm0, %xmm0
; NOGATHER-NEXT: vmovmskpd %xmm0, %eax
; NOGATHER-NEXT: testb $1, %al
; NOGATHER-NEXT: jne .LBB1_1
; NOGATHER-NEXT: # %bb.2: # %else
; NOGATHER-NEXT: testb $2, %al
; NOGATHER-NEXT: jne .LBB1_3
; NOGATHER-NEXT: .LBB1_4: # %else2
; NOGATHER-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
; NOGATHER-NEXT: retq
; NOGATHER-NEXT: .LBB1_1: # %cond.load
; NOGATHER-NEXT: vmovq %xmm2, %rcx
; NOGATHER-NEXT: movl (%rcx), %ecx
; NOGATHER-NEXT: vpinsrq $0, %rcx, %xmm1, %xmm1
; NOGATHER-NEXT: testb $2, %al
; NOGATHER-NEXT: je .LBB1_4
; NOGATHER-NEXT: # %bb.3: # %cond.load1
; NOGATHER-NEXT: .LBB1_3: # %cond.load1
; NOGATHER-NEXT: vpextrq $1, %xmm2, %rax
; NOGATHER-NEXT: movl (%rax), %eax
; NOGATHER-NEXT: vpinsrq $1, %rax, %xmm1, %xmm1
; NOGATHER-NEXT: .LBB1_4: # %else2
; NOGATHER-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
; NOGATHER-NEXT: retq
entry:
@ -130,21 +138,25 @@ define <2 x float> @masked_gather_v2float(<2 x float*>* %ptr, <2 x i1> %masks, <
; NOGATHER-LABEL: masked_gather_v2float:
; NOGATHER: # %bb.0: # %entry
; NOGATHER-NEXT: vmovdqa (%rdi), %xmm2
; NOGATHER-NEXT: vpextrb $0, %xmm0, %eax
; NOGATHER-NEXT: testb $1, %al
; NOGATHER-NEXT: je .LBB2_2
; NOGATHER-NEXT: # %bb.1: # %cond.load
; NOGATHER-NEXT: vmovq %xmm2, %rax
; NOGATHER-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; NOGATHER-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3]
; NOGATHER-NEXT: .LBB2_2: # %else
; NOGATHER-NEXT: vpextrb $8, %xmm0, %eax
; NOGATHER-NEXT: vpsllq $63, %xmm0, %xmm0
; NOGATHER-NEXT: vmovmskpd %xmm0, %eax
; NOGATHER-NEXT: testb $1, %al
; NOGATHER-NEXT: jne .LBB2_1
; NOGATHER-NEXT: # %bb.2: # %else
; NOGATHER-NEXT: testb $2, %al
; NOGATHER-NEXT: jne .LBB2_3
; NOGATHER-NEXT: .LBB2_4: # %else2
; NOGATHER-NEXT: vmovaps %xmm1, %xmm0
; NOGATHER-NEXT: retq
; NOGATHER-NEXT: .LBB2_1: # %cond.load
; NOGATHER-NEXT: vmovq %xmm2, %rcx
; NOGATHER-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; NOGATHER-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; NOGATHER-NEXT: testb $2, %al
; NOGATHER-NEXT: je .LBB2_4
; NOGATHER-NEXT: # %bb.3: # %cond.load1
; NOGATHER-NEXT: .LBB2_3: # %cond.load1
; NOGATHER-NEXT: vpextrq $1, %xmm2, %rax
; NOGATHER-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3]
; NOGATHER-NEXT: .LBB2_4: # %else2
; NOGATHER-NEXT: vmovaps %xmm1, %xmm0
; NOGATHER-NEXT: retq
entry:
@ -176,21 +188,25 @@ define <4 x float> @masked_gather_v2float_concat(<2 x float*>* %ptr, <2 x i1> %m
; NOGATHER-LABEL: masked_gather_v2float_concat:
; NOGATHER: # %bb.0: # %entry
; NOGATHER-NEXT: vmovdqa (%rdi), %xmm2
; NOGATHER-NEXT: vpextrb $0, %xmm0, %eax
; NOGATHER-NEXT: testb $1, %al
; NOGATHER-NEXT: je .LBB3_2
; NOGATHER-NEXT: # %bb.1: # %cond.load
; NOGATHER-NEXT: vmovq %xmm2, %rax
; NOGATHER-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; NOGATHER-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3]
; NOGATHER-NEXT: .LBB3_2: # %else
; NOGATHER-NEXT: vpextrb $8, %xmm0, %eax
; NOGATHER-NEXT: vpsllq $63, %xmm0, %xmm0
; NOGATHER-NEXT: vmovmskpd %xmm0, %eax
; NOGATHER-NEXT: testb $1, %al
; NOGATHER-NEXT: jne .LBB3_1
; NOGATHER-NEXT: # %bb.2: # %else
; NOGATHER-NEXT: testb $2, %al
; NOGATHER-NEXT: jne .LBB3_3
; NOGATHER-NEXT: .LBB3_4: # %else2
; NOGATHER-NEXT: vmovaps %xmm1, %xmm0
; NOGATHER-NEXT: retq
; NOGATHER-NEXT: .LBB3_1: # %cond.load
; NOGATHER-NEXT: vmovq %xmm2, %rcx
; NOGATHER-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; NOGATHER-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; NOGATHER-NEXT: testb $2, %al
; NOGATHER-NEXT: je .LBB3_4
; NOGATHER-NEXT: # %bb.3: # %cond.load1
; NOGATHER-NEXT: .LBB3_3: # %cond.load1
; NOGATHER-NEXT: vpextrq $1, %xmm2, %rax
; NOGATHER-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3]
; NOGATHER-NEXT: .LBB3_4: # %else2
; NOGATHER-NEXT: vmovaps %xmm1, %xmm0
; NOGATHER-NEXT: retq
entry:
@ -221,35 +237,38 @@ define <4 x i32> @masked_gather_v4i32(<4 x i32*> %ptrs, <4 x i1> %masks, <4 x i3
;
; NOGATHER-LABEL: masked_gather_v4i32:
; NOGATHER: # %bb.0: # %entry
; NOGATHER-NEXT: vpextrb $0, %xmm1, %eax
; NOGATHER-NEXT: vpslld $31, %xmm1, %xmm1
; NOGATHER-NEXT: vmovmskps %xmm1, %eax
; NOGATHER-NEXT: testb $1, %al
; NOGATHER-NEXT: je .LBB4_2
; NOGATHER-NEXT: # %bb.1: # %cond.load
; NOGATHER-NEXT: vmovq %xmm0, %rax
; NOGATHER-NEXT: vpinsrd $0, (%rax), %xmm2, %xmm2
; NOGATHER-NEXT: vmovq %xmm0, %rcx
; NOGATHER-NEXT: vpinsrd $0, (%rcx), %xmm2, %xmm2
; NOGATHER-NEXT: .LBB4_2: # %else
; NOGATHER-NEXT: vpextrb $4, %xmm1, %eax
; NOGATHER-NEXT: testb $1, %al
; NOGATHER-NEXT: testb $2, %al
; NOGATHER-NEXT: je .LBB4_4
; NOGATHER-NEXT: # %bb.3: # %cond.load1
; NOGATHER-NEXT: vpextrq $1, %xmm0, %rax
; NOGATHER-NEXT: vpinsrd $1, (%rax), %xmm2, %xmm2
; NOGATHER-NEXT: vpextrq $1, %xmm0, %rcx
; NOGATHER-NEXT: vpinsrd $1, (%rcx), %xmm2, %xmm2
; NOGATHER-NEXT: .LBB4_4: # %else2
; NOGATHER-NEXT: vpextrb $8, %xmm1, %eax
; NOGATHER-NEXT: vextractf128 $1, %ymm0, %xmm0
; NOGATHER-NEXT: testb $1, %al
; NOGATHER-NEXT: je .LBB4_6
; NOGATHER-NEXT: # %bb.5: # %cond.load4
; NOGATHER-NEXT: vmovq %xmm0, %rax
; NOGATHER-NEXT: vpinsrd $2, (%rax), %xmm2, %xmm2
; NOGATHER-NEXT: .LBB4_6: # %else5
; NOGATHER-NEXT: vpextrb $12, %xmm1, %eax
; NOGATHER-NEXT: testb $1, %al
; NOGATHER-NEXT: testb $4, %al
; NOGATHER-NEXT: jne .LBB4_5
; NOGATHER-NEXT: # %bb.6: # %else5
; NOGATHER-NEXT: testb $8, %al
; NOGATHER-NEXT: jne .LBB4_7
; NOGATHER-NEXT: .LBB4_8: # %else8
; NOGATHER-NEXT: vmovdqa %xmm2, %xmm0
; NOGATHER-NEXT: vzeroupper
; NOGATHER-NEXT: retq
; NOGATHER-NEXT: .LBB4_5: # %cond.load4
; NOGATHER-NEXT: vmovq %xmm0, %rcx
; NOGATHER-NEXT: vpinsrd $2, (%rcx), %xmm2, %xmm2
; NOGATHER-NEXT: testb $8, %al
; NOGATHER-NEXT: je .LBB4_8
; NOGATHER-NEXT: # %bb.7: # %cond.load7
; NOGATHER-NEXT: .LBB4_7: # %cond.load7
; NOGATHER-NEXT: vpextrq $1, %xmm0, %rax
; NOGATHER-NEXT: vpinsrd $3, (%rax), %xmm2, %xmm2
; NOGATHER-NEXT: .LBB4_8: # %else8
; NOGATHER-NEXT: vmovdqa %xmm2, %xmm0
; NOGATHER-NEXT: vzeroupper
; NOGATHER-NEXT: retq
@ -278,36 +297,39 @@ define <4 x float> @masked_gather_v4float(<4 x float*> %ptrs, <4 x i1> %masks, <
;
; NOGATHER-LABEL: masked_gather_v4float:
; NOGATHER: # %bb.0: # %entry
; NOGATHER-NEXT: vpextrb $0, %xmm1, %eax
; NOGATHER-NEXT: vpslld $31, %xmm1, %xmm1
; NOGATHER-NEXT: vmovmskps %xmm1, %eax
; NOGATHER-NEXT: testb $1, %al
; NOGATHER-NEXT: je .LBB5_2
; NOGATHER-NEXT: # %bb.1: # %cond.load
; NOGATHER-NEXT: vmovq %xmm0, %rax
; NOGATHER-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; NOGATHER-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3]
; NOGATHER-NEXT: vmovq %xmm0, %rcx
; NOGATHER-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; NOGATHER-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
; NOGATHER-NEXT: .LBB5_2: # %else
; NOGATHER-NEXT: vpextrb $4, %xmm1, %eax
; NOGATHER-NEXT: testb $1, %al
; NOGATHER-NEXT: testb $2, %al
; NOGATHER-NEXT: je .LBB5_4
; NOGATHER-NEXT: # %bb.3: # %cond.load1
; NOGATHER-NEXT: vpextrq $1, %xmm0, %rax
; NOGATHER-NEXT: vpextrq $1, %xmm0, %rcx
; NOGATHER-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
; NOGATHER-NEXT: .LBB5_4: # %else2
; NOGATHER-NEXT: vpextrb $8, %xmm1, %eax
; NOGATHER-NEXT: vextractf128 $1, %ymm0, %xmm0
; NOGATHER-NEXT: testb $1, %al
; NOGATHER-NEXT: je .LBB5_6
; NOGATHER-NEXT: # %bb.5: # %cond.load4
; NOGATHER-NEXT: vmovq %xmm0, %rax
; NOGATHER-NEXT: testb $4, %al
; NOGATHER-NEXT: jne .LBB5_5
; NOGATHER-NEXT: # %bb.6: # %else5
; NOGATHER-NEXT: testb $8, %al
; NOGATHER-NEXT: jne .LBB5_7
; NOGATHER-NEXT: .LBB5_8: # %else8
; NOGATHER-NEXT: vmovaps %xmm2, %xmm0
; NOGATHER-NEXT: vzeroupper
; NOGATHER-NEXT: retq
; NOGATHER-NEXT: .LBB5_5: # %cond.load4
; NOGATHER-NEXT: vmovq %xmm0, %rcx
; NOGATHER-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
; NOGATHER-NEXT: .LBB5_6: # %else5
; NOGATHER-NEXT: vpextrb $12, %xmm1, %eax
; NOGATHER-NEXT: testb $1, %al
; NOGATHER-NEXT: testb $8, %al
; NOGATHER-NEXT: je .LBB5_8
; NOGATHER-NEXT: # %bb.7: # %cond.load7
; NOGATHER-NEXT: .LBB5_7: # %cond.load7
; NOGATHER-NEXT: vpextrq $1, %xmm0, %rax
; NOGATHER-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0]
; NOGATHER-NEXT: .LBB5_8: # %else8
; NOGATHER-NEXT: vmovaps %xmm2, %xmm0
; NOGATHER-NEXT: vzeroupper
; NOGATHER-NEXT: retq
@ -347,76 +369,82 @@ define <8 x i32> @masked_gather_v8i32(<8 x i32*>* %ptr, <8 x i1> %masks, <8 x i3
; NOGATHER: # %bb.0: # %entry
; NOGATHER-NEXT: vmovdqa (%rdi), %ymm3
; NOGATHER-NEXT: vmovdqa 32(%rdi), %ymm2
; NOGATHER-NEXT: vpextrb $0, %xmm0, %eax
; NOGATHER-NEXT: vpsllw $15, %xmm0, %xmm0
; NOGATHER-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
; NOGATHER-NEXT: vpmovmskb %xmm0, %eax
; NOGATHER-NEXT: testb $1, %al
; NOGATHER-NEXT: je .LBB6_2
; NOGATHER-NEXT: # %bb.1: # %cond.load
; NOGATHER-NEXT: vmovq %xmm3, %rax
; NOGATHER-NEXT: vpinsrd $0, (%rax), %xmm1, %xmm4
; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
; NOGATHER-NEXT: vmovq %xmm3, %rcx
; NOGATHER-NEXT: vpinsrd $0, (%rcx), %xmm1, %xmm0
; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; NOGATHER-NEXT: .LBB6_2: # %else
; NOGATHER-NEXT: vpextrb $2, %xmm0, %eax
; NOGATHER-NEXT: testb $1, %al
; NOGATHER-NEXT: testb $2, %al
; NOGATHER-NEXT: je .LBB6_4
; NOGATHER-NEXT: # %bb.3: # %cond.load1
; NOGATHER-NEXT: vpextrq $1, %xmm3, %rax
; NOGATHER-NEXT: vpinsrd $1, (%rax), %xmm1, %xmm4
; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
; NOGATHER-NEXT: vpextrq $1, %xmm3, %rcx
; NOGATHER-NEXT: vpinsrd $1, (%rcx), %xmm1, %xmm0
; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; NOGATHER-NEXT: .LBB6_4: # %else2
; NOGATHER-NEXT: vpextrb $4, %xmm0, %eax
; NOGATHER-NEXT: vextractf128 $1, %ymm3, %xmm3
; NOGATHER-NEXT: testb $1, %al
; NOGATHER-NEXT: je .LBB6_6
; NOGATHER-NEXT: # %bb.5: # %cond.load4
; NOGATHER-NEXT: vmovq %xmm3, %rax
; NOGATHER-NEXT: vpinsrd $2, (%rax), %xmm1, %xmm4
; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
; NOGATHER-NEXT: .LBB6_6: # %else5
; NOGATHER-NEXT: vpextrb $6, %xmm0, %eax
; NOGATHER-NEXT: testb $1, %al
; NOGATHER-NEXT: je .LBB6_8
; NOGATHER-NEXT: # %bb.7: # %cond.load7
; NOGATHER-NEXT: vpextrq $1, %xmm3, %rax
; NOGATHER-NEXT: vpinsrd $3, (%rax), %xmm1, %xmm3
; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
; NOGATHER-NEXT: vextractf128 $1, %ymm3, %xmm0
; NOGATHER-NEXT: testb $4, %al
; NOGATHER-NEXT: jne .LBB6_5
; NOGATHER-NEXT: # %bb.6: # %else5
; NOGATHER-NEXT: testb $8, %al
; NOGATHER-NEXT: jne .LBB6_7
; NOGATHER-NEXT: .LBB6_8: # %else8
; NOGATHER-NEXT: vpextrb $8, %xmm0, %eax
; NOGATHER-NEXT: testb $1, %al
; NOGATHER-NEXT: je .LBB6_10
; NOGATHER-NEXT: # %bb.9: # %cond.load10
; NOGATHER-NEXT: vmovq %xmm2, %rax
; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm3
; NOGATHER-NEXT: vpinsrd $0, (%rax), %xmm3, %xmm3
; NOGATHER-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
; NOGATHER-NEXT: testb $16, %al
; NOGATHER-NEXT: jne .LBB6_9
; NOGATHER-NEXT: .LBB6_10: # %else11
; NOGATHER-NEXT: vpextrb $10, %xmm0, %eax
; NOGATHER-NEXT: testb $1, %al
; NOGATHER-NEXT: testb $32, %al
; NOGATHER-NEXT: je .LBB6_12
; NOGATHER-NEXT: # %bb.11: # %cond.load13
; NOGATHER-NEXT: vpextrq $1, %xmm2, %rax
; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm3
; NOGATHER-NEXT: vpinsrd $1, (%rax), %xmm3, %xmm3
; NOGATHER-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
; NOGATHER-NEXT: .LBB6_11: # %cond.load13
; NOGATHER-NEXT: vpextrq $1, %xmm2, %rcx
; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm0
; NOGATHER-NEXT: vpinsrd $1, (%rcx), %xmm0, %xmm0
; NOGATHER-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
; NOGATHER-NEXT: .LBB6_12: # %else14
; NOGATHER-NEXT: vpextrb $12, %xmm0, %eax
; NOGATHER-NEXT: vextractf128 $1, %ymm2, %xmm2
; NOGATHER-NEXT: testb $1, %al
; NOGATHER-NEXT: je .LBB6_14
; NOGATHER-NEXT: # %bb.13: # %cond.load16
; NOGATHER-NEXT: vmovq %xmm2, %rax
; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm3
; NOGATHER-NEXT: vpinsrd $2, (%rax), %xmm3, %xmm3
; NOGATHER-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
; NOGATHER-NEXT: .LBB6_14: # %else17
; NOGATHER-NEXT: vpextrb $14, %xmm0, %eax
; NOGATHER-NEXT: testb $1, %al
; NOGATHER-NEXT: vextractf128 $1, %ymm2, %xmm0
; NOGATHER-NEXT: testb $64, %al
; NOGATHER-NEXT: jne .LBB6_13
; NOGATHER-NEXT: # %bb.14: # %else17
; NOGATHER-NEXT: testb $-128, %al
; NOGATHER-NEXT: jne .LBB6_15
; NOGATHER-NEXT: .LBB6_16: # %else20
; NOGATHER-NEXT: vmovaps %ymm1, %ymm0
; NOGATHER-NEXT: retq
; NOGATHER-NEXT: .LBB6_5: # %cond.load4
; NOGATHER-NEXT: vmovq %xmm0, %rcx
; NOGATHER-NEXT: vpinsrd $2, (%rcx), %xmm1, %xmm3
; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
; NOGATHER-NEXT: testb $8, %al
; NOGATHER-NEXT: je .LBB6_8
; NOGATHER-NEXT: .LBB6_7: # %cond.load7
; NOGATHER-NEXT: vpextrq $1, %xmm0, %rcx
; NOGATHER-NEXT: vpinsrd $3, (%rcx), %xmm1, %xmm0
; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; NOGATHER-NEXT: testb $16, %al
; NOGATHER-NEXT: je .LBB6_10
; NOGATHER-NEXT: .LBB6_9: # %cond.load10
; NOGATHER-NEXT: vmovq %xmm2, %rcx
; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm0
; NOGATHER-NEXT: vpinsrd $0, (%rcx), %xmm0, %xmm0
; NOGATHER-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
; NOGATHER-NEXT: testb $32, %al
; NOGATHER-NEXT: jne .LBB6_11
; NOGATHER-NEXT: jmp .LBB6_12
; NOGATHER-NEXT: .LBB6_13: # %cond.load16
; NOGATHER-NEXT: vmovq %xmm0, %rcx
; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm2
; NOGATHER-NEXT: vpinsrd $2, (%rcx), %xmm2, %xmm2
; NOGATHER-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; NOGATHER-NEXT: testb $-128, %al
; NOGATHER-NEXT: je .LBB6_16
; NOGATHER-NEXT: # %bb.15: # %cond.load19
; NOGATHER-NEXT: vpextrq $1, %xmm2, %rax
; NOGATHER-NEXT: .LBB6_15: # %cond.load19
; NOGATHER-NEXT: vpextrq $1, %xmm0, %rax
; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm0
; NOGATHER-NEXT: vpinsrd $3, (%rax), %xmm0, %xmm0
; NOGATHER-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
; NOGATHER-NEXT: .LBB6_16: # %else20
; NOGATHER-NEXT: vmovaps %ymm1, %ymm0
; NOGATHER-NEXT: retq
entry:
@ -456,77 +484,83 @@ define <8 x float> @masked_gather_v8float(<8 x float*>* %ptr, <8 x i1> %masks, <
; NOGATHER: # %bb.0: # %entry
; NOGATHER-NEXT: vmovdqa (%rdi), %ymm3
; NOGATHER-NEXT: vmovdqa 32(%rdi), %ymm2
; NOGATHER-NEXT: vpextrb $0, %xmm0, %eax
; NOGATHER-NEXT: vpsllw $15, %xmm0, %xmm0
; NOGATHER-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
; NOGATHER-NEXT: vpmovmskb %xmm0, %eax
; NOGATHER-NEXT: testb $1, %al
; NOGATHER-NEXT: je .LBB7_2
; NOGATHER-NEXT: # %bb.1: # %cond.load
; NOGATHER-NEXT: vmovq %xmm3, %rax
; NOGATHER-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero
; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0],ymm1[1,2,3,4,5,6,7]
; NOGATHER-NEXT: vmovq %xmm3, %rcx
; NOGATHER-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3,4,5,6,7]
; NOGATHER-NEXT: .LBB7_2: # %else
; NOGATHER-NEXT: vpextrb $2, %xmm0, %eax
; NOGATHER-NEXT: testb $1, %al
; NOGATHER-NEXT: testb $2, %al
; NOGATHER-NEXT: je .LBB7_4
; NOGATHER-NEXT: # %bb.3: # %cond.load1
; NOGATHER-NEXT: vpextrq $1, %xmm3, %rax
; NOGATHER-NEXT: vinsertps {{.*#+}} xmm4 = xmm1[0],mem[0],xmm1[2,3]
; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
; NOGATHER-NEXT: vpextrq $1, %xmm3, %rcx
; NOGATHER-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],mem[0],xmm1[2,3]
; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; NOGATHER-NEXT: .LBB7_4: # %else2
; NOGATHER-NEXT: vpextrb $4, %xmm0, %eax
; NOGATHER-NEXT: vextractf128 $1, %ymm3, %xmm3
; NOGATHER-NEXT: testb $1, %al
; NOGATHER-NEXT: je .LBB7_6
; NOGATHER-NEXT: # %bb.5: # %cond.load4
; NOGATHER-NEXT: vmovq %xmm3, %rax
; NOGATHER-NEXT: vinsertps {{.*#+}} xmm4 = xmm1[0,1],mem[0],xmm1[3]
; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
; NOGATHER-NEXT: .LBB7_6: # %else5
; NOGATHER-NEXT: vpextrb $6, %xmm0, %eax
; NOGATHER-NEXT: testb $1, %al
; NOGATHER-NEXT: je .LBB7_8
; NOGATHER-NEXT: # %bb.7: # %cond.load7
; NOGATHER-NEXT: vpextrq $1, %xmm3, %rax
; NOGATHER-NEXT: vinsertps {{.*#+}} xmm3 = xmm1[0,1,2],mem[0]
; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
; NOGATHER-NEXT: vextractf128 $1, %ymm3, %xmm0
; NOGATHER-NEXT: testb $4, %al
; NOGATHER-NEXT: jne .LBB7_5
; NOGATHER-NEXT: # %bb.6: # %else5
; NOGATHER-NEXT: testb $8, %al
; NOGATHER-NEXT: jne .LBB7_7
; NOGATHER-NEXT: .LBB7_8: # %else8
; NOGATHER-NEXT: vpextrb $8, %xmm0, %eax
; NOGATHER-NEXT: testb $1, %al
; NOGATHER-NEXT: je .LBB7_10
; NOGATHER-NEXT: # %bb.9: # %cond.load10
; NOGATHER-NEXT: vmovq %xmm2, %rax
; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm3
; NOGATHER-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero
; NOGATHER-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3]
; NOGATHER-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
; NOGATHER-NEXT: testb $16, %al
; NOGATHER-NEXT: jne .LBB7_9
; NOGATHER-NEXT: .LBB7_10: # %else11
; NOGATHER-NEXT: vpextrb $10, %xmm0, %eax
; NOGATHER-NEXT: testb $1, %al
; NOGATHER-NEXT: testb $32, %al
; NOGATHER-NEXT: je .LBB7_12
; NOGATHER-NEXT: # %bb.11: # %cond.load13
; NOGATHER-NEXT: vpextrq $1, %xmm2, %rax
; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm3
; NOGATHER-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[2,3]
; NOGATHER-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
; NOGATHER-NEXT: .LBB7_11: # %cond.load13
; NOGATHER-NEXT: vpextrq $1, %xmm2, %rcx
; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm0
; NOGATHER-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
; NOGATHER-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
; NOGATHER-NEXT: .LBB7_12: # %else14
; NOGATHER-NEXT: vpextrb $12, %xmm0, %eax
; NOGATHER-NEXT: vextractf128 $1, %ymm2, %xmm2
; NOGATHER-NEXT: testb $1, %al
; NOGATHER-NEXT: je .LBB7_14
; NOGATHER-NEXT: # %bb.13: # %cond.load16
; NOGATHER-NEXT: vmovq %xmm2, %rax
; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm3
; NOGATHER-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],mem[0],xmm3[3]
; NOGATHER-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
; NOGATHER-NEXT: .LBB7_14: # %else17
; NOGATHER-NEXT: vpextrb $14, %xmm0, %eax
; NOGATHER-NEXT: testb $1, %al
; NOGATHER-NEXT: vextractf128 $1, %ymm2, %xmm0
; NOGATHER-NEXT: testb $64, %al
; NOGATHER-NEXT: jne .LBB7_13
; NOGATHER-NEXT: # %bb.14: # %else17
; NOGATHER-NEXT: testb $-128, %al
; NOGATHER-NEXT: jne .LBB7_15
; NOGATHER-NEXT: .LBB7_16: # %else20
; NOGATHER-NEXT: vmovaps %ymm1, %ymm0
; NOGATHER-NEXT: retq
; NOGATHER-NEXT: .LBB7_5: # %cond.load4
; NOGATHER-NEXT: vmovq %xmm0, %rcx
; NOGATHER-NEXT: vinsertps {{.*#+}} xmm3 = xmm1[0,1],mem[0],xmm1[3]
; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
; NOGATHER-NEXT: testb $8, %al
; NOGATHER-NEXT: je .LBB7_8
; NOGATHER-NEXT: .LBB7_7: # %cond.load7
; NOGATHER-NEXT: vpextrq $1, %xmm0, %rcx
; NOGATHER-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],mem[0]
; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; NOGATHER-NEXT: testb $16, %al
; NOGATHER-NEXT: je .LBB7_10
; NOGATHER-NEXT: .LBB7_9: # %cond.load10
; NOGATHER-NEXT: vmovq %xmm2, %rcx
; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm0
; NOGATHER-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero
; NOGATHER-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3,4,5,6,7]
; NOGATHER-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
; NOGATHER-NEXT: testb $32, %al
; NOGATHER-NEXT: jne .LBB7_11
; NOGATHER-NEXT: jmp .LBB7_12
; NOGATHER-NEXT: .LBB7_13: # %cond.load16
; NOGATHER-NEXT: vmovq %xmm0, %rcx
; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm2
; NOGATHER-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
; NOGATHER-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; NOGATHER-NEXT: testb $-128, %al
; NOGATHER-NEXT: je .LBB7_16
; NOGATHER-NEXT: # %bb.15: # %cond.load19
; NOGATHER-NEXT: vpextrq $1, %xmm2, %rax
; NOGATHER-NEXT: .LBB7_15: # %cond.load19
; NOGATHER-NEXT: vpextrq $1, %xmm0, %rax
; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm0
; NOGATHER-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
; NOGATHER-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
; NOGATHER-NEXT: .LBB7_16: # %else20
; NOGATHER-NEXT: vmovaps %ymm1, %ymm0
; NOGATHER-NEXT: retq
entry:
@ -560,41 +594,43 @@ define <4 x i64> @masked_gather_v4i64(<4 x i64*>* %ptr, <4 x i1> %masks, <4 x i6
; NOGATHER-LABEL: masked_gather_v4i64:
; NOGATHER: # %bb.0: # %entry
; NOGATHER-NEXT: vmovdqa (%rdi), %ymm2
; NOGATHER-NEXT: vpextrb $0, %xmm0, %eax
; NOGATHER-NEXT: vpslld $31, %xmm0, %xmm0
; NOGATHER-NEXT: vmovmskps %xmm0, %eax
; NOGATHER-NEXT: testb $1, %al
; NOGATHER-NEXT: je .LBB8_2
; NOGATHER-NEXT: # %bb.1: # %cond.load
; NOGATHER-NEXT: vmovq %xmm2, %rax
; NOGATHER-NEXT: vpinsrq $0, (%rax), %xmm1, %xmm3
; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
; NOGATHER-NEXT: vmovq %xmm2, %rcx
; NOGATHER-NEXT: vpinsrq $0, (%rcx), %xmm1, %xmm0
; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; NOGATHER-NEXT: .LBB8_2: # %else
; NOGATHER-NEXT: vpextrb $4, %xmm0, %eax
; NOGATHER-NEXT: testb $1, %al
; NOGATHER-NEXT: testb $2, %al
; NOGATHER-NEXT: je .LBB8_4
; NOGATHER-NEXT: # %bb.3: # %cond.load1
; NOGATHER-NEXT: vpextrq $1, %xmm2, %rax
; NOGATHER-NEXT: vpinsrq $1, (%rax), %xmm1, %xmm3
; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
; NOGATHER-NEXT: vpextrq $1, %xmm2, %rcx
; NOGATHER-NEXT: vpinsrq $1, (%rcx), %xmm1, %xmm0
; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; NOGATHER-NEXT: .LBB8_4: # %else2
; NOGATHER-NEXT: vpextrb $8, %xmm0, %eax
; NOGATHER-NEXT: vextractf128 $1, %ymm2, %xmm2
; NOGATHER-NEXT: testb $1, %al
; NOGATHER-NEXT: je .LBB8_6
; NOGATHER-NEXT: # %bb.5: # %cond.load4
; NOGATHER-NEXT: vmovq %xmm2, %rax
; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm3
; NOGATHER-NEXT: vpinsrq $0, (%rax), %xmm3, %xmm3
; NOGATHER-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
; NOGATHER-NEXT: .LBB8_6: # %else5
; NOGATHER-NEXT: vpextrb $12, %xmm0, %eax
; NOGATHER-NEXT: testb $1, %al
; NOGATHER-NEXT: vextractf128 $1, %ymm2, %xmm0
; NOGATHER-NEXT: testb $4, %al
; NOGATHER-NEXT: jne .LBB8_5
; NOGATHER-NEXT: # %bb.6: # %else5
; NOGATHER-NEXT: testb $8, %al
; NOGATHER-NEXT: jne .LBB8_7
; NOGATHER-NEXT: .LBB8_8: # %else8
; NOGATHER-NEXT: vmovaps %ymm1, %ymm0
; NOGATHER-NEXT: retq
; NOGATHER-NEXT: .LBB8_5: # %cond.load4
; NOGATHER-NEXT: vmovq %xmm0, %rcx
; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm2
; NOGATHER-NEXT: vpinsrq $0, (%rcx), %xmm2, %xmm2
; NOGATHER-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; NOGATHER-NEXT: testb $8, %al
; NOGATHER-NEXT: je .LBB8_8
; NOGATHER-NEXT: # %bb.7: # %cond.load7
; NOGATHER-NEXT: vpextrq $1, %xmm2, %rax
; NOGATHER-NEXT: .LBB8_7: # %cond.load7
; NOGATHER-NEXT: vpextrq $1, %xmm0, %rax
; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm0
; NOGATHER-NEXT: vpinsrq $1, (%rax), %xmm0, %xmm0
; NOGATHER-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
; NOGATHER-NEXT: .LBB8_8: # %else8
; NOGATHER-NEXT: vmovaps %ymm1, %ymm0
; NOGATHER-NEXT: retq
entry:
@ -628,41 +664,43 @@ define <4 x double> @masked_gather_v4double(<4 x double*>* %ptr, <4 x i1> %masks
; NOGATHER-LABEL: masked_gather_v4double:
; NOGATHER: # %bb.0: # %entry
; NOGATHER-NEXT: vmovdqa (%rdi), %ymm2
; NOGATHER-NEXT: vpextrb $0, %xmm0, %eax
; NOGATHER-NEXT: vpslld $31, %xmm0, %xmm0
; NOGATHER-NEXT: vmovmskps %xmm0, %eax
; NOGATHER-NEXT: testb $1, %al
; NOGATHER-NEXT: je .LBB9_2
; NOGATHER-NEXT: # %bb.1: # %cond.load
; NOGATHER-NEXT: vmovq %xmm2, %rax
; NOGATHER-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7]
; NOGATHER-NEXT: vmovq %xmm2, %rcx
; NOGATHER-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3,4,5,6,7]
; NOGATHER-NEXT: .LBB9_2: # %else
; NOGATHER-NEXT: vpextrb $4, %xmm0, %eax
; NOGATHER-NEXT: testb $1, %al
; NOGATHER-NEXT: testb $2, %al
; NOGATHER-NEXT: je .LBB9_4
; NOGATHER-NEXT: # %bb.3: # %cond.load1
; NOGATHER-NEXT: vpextrq $1, %xmm2, %rax
; NOGATHER-NEXT: vmovhps {{.*#+}} xmm3 = xmm1[0,1],mem[0,1]
; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
; NOGATHER-NEXT: vpextrq $1, %xmm2, %rcx
; NOGATHER-NEXT: vmovhps {{.*#+}} xmm0 = xmm1[0,1],mem[0,1]
; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; NOGATHER-NEXT: .LBB9_4: # %else2
; NOGATHER-NEXT: vpextrb $8, %xmm0, %eax
; NOGATHER-NEXT: vextractf128 $1, %ymm2, %xmm2
; NOGATHER-NEXT: testb $1, %al
; NOGATHER-NEXT: je .LBB9_6
; NOGATHER-NEXT: # %bb.5: # %cond.load4
; NOGATHER-NEXT: vmovq %xmm2, %rax
; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm3
; NOGATHER-NEXT: vmovlps {{.*#+}} xmm3 = mem[0,1],xmm3[2,3]
; NOGATHER-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
; NOGATHER-NEXT: .LBB9_6: # %else5
; NOGATHER-NEXT: vpextrb $12, %xmm0, %eax
; NOGATHER-NEXT: testb $1, %al
; NOGATHER-NEXT: vextractf128 $1, %ymm2, %xmm0
; NOGATHER-NEXT: testb $4, %al
; NOGATHER-NEXT: jne .LBB9_5
; NOGATHER-NEXT: # %bb.6: # %else5
; NOGATHER-NEXT: testb $8, %al
; NOGATHER-NEXT: jne .LBB9_7
; NOGATHER-NEXT: .LBB9_8: # %else8
; NOGATHER-NEXT: vmovaps %ymm1, %ymm0
; NOGATHER-NEXT: retq
; NOGATHER-NEXT: .LBB9_5: # %cond.load4
; NOGATHER-NEXT: vmovq %xmm0, %rcx
; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm2
; NOGATHER-NEXT: vmovlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3]
; NOGATHER-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; NOGATHER-NEXT: testb $8, %al
; NOGATHER-NEXT: je .LBB9_8
; NOGATHER-NEXT: # %bb.7: # %cond.load7
; NOGATHER-NEXT: vpextrq $1, %xmm2, %rax
; NOGATHER-NEXT: .LBB9_7: # %cond.load7
; NOGATHER-NEXT: vpextrq $1, %xmm0, %rax
; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm0
; NOGATHER-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
; NOGATHER-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
; NOGATHER-NEXT: .LBB9_8: # %else8
; NOGATHER-NEXT: vmovaps %ymm1, %ymm0
; NOGATHER-NEXT: retq
entry:
@ -694,20 +732,24 @@ define <2 x i64> @masked_gather_v2i64(<2 x i64*>* %ptr, <2 x i1> %masks, <2 x i6
; NOGATHER-LABEL: masked_gather_v2i64:
; NOGATHER: # %bb.0: # %entry
; NOGATHER-NEXT: vmovdqa (%rdi), %xmm2
; NOGATHER-NEXT: vpextrb $0, %xmm0, %eax
; NOGATHER-NEXT: testb $1, %al
; NOGATHER-NEXT: je .LBB10_2
; NOGATHER-NEXT: # %bb.1: # %cond.load
; NOGATHER-NEXT: vmovq %xmm2, %rax
; NOGATHER-NEXT: vpinsrq $0, (%rax), %xmm1, %xmm1
; NOGATHER-NEXT: .LBB10_2: # %else
; NOGATHER-NEXT: vpextrb $8, %xmm0, %eax
; NOGATHER-NEXT: vpsllq $63, %xmm0, %xmm0
; NOGATHER-NEXT: vmovmskpd %xmm0, %eax
; NOGATHER-NEXT: testb $1, %al
; NOGATHER-NEXT: jne .LBB10_1
; NOGATHER-NEXT: # %bb.2: # %else
; NOGATHER-NEXT: testb $2, %al
; NOGATHER-NEXT: jne .LBB10_3
; NOGATHER-NEXT: .LBB10_4: # %else2
; NOGATHER-NEXT: vmovdqa %xmm1, %xmm0
; NOGATHER-NEXT: retq
; NOGATHER-NEXT: .LBB10_1: # %cond.load
; NOGATHER-NEXT: vmovq %xmm2, %rcx
; NOGATHER-NEXT: vpinsrq $0, (%rcx), %xmm1, %xmm1
; NOGATHER-NEXT: testb $2, %al
; NOGATHER-NEXT: je .LBB10_4
; NOGATHER-NEXT: # %bb.3: # %cond.load1
; NOGATHER-NEXT: .LBB10_3: # %cond.load1
; NOGATHER-NEXT: vpextrq $1, %xmm2, %rax
; NOGATHER-NEXT: vpinsrq $1, (%rax), %xmm1, %xmm1
; NOGATHER-NEXT: .LBB10_4: # %else2
; NOGATHER-NEXT: vmovdqa %xmm1, %xmm0
; NOGATHER-NEXT: retq
entry:
@ -739,20 +781,24 @@ define <2 x double> @masked_gather_v2double(<2 x double*>* %ptr, <2 x i1> %masks
; NOGATHER-LABEL: masked_gather_v2double:
; NOGATHER: # %bb.0: # %entry
; NOGATHER-NEXT: vmovdqa (%rdi), %xmm2
; NOGATHER-NEXT: vpextrb $0, %xmm0, %eax
; NOGATHER-NEXT: vpsllq $63, %xmm0, %xmm0
; NOGATHER-NEXT: vmovmskpd %xmm0, %eax
; NOGATHER-NEXT: testb $1, %al
; NOGATHER-NEXT: je .LBB11_2
; NOGATHER-NEXT: # %bb.1: # %cond.load
; NOGATHER-NEXT: vmovq %xmm2, %rax
; NOGATHER-NEXT: jne .LBB11_1
; NOGATHER-NEXT: # %bb.2: # %else
; NOGATHER-NEXT: testb $2, %al
; NOGATHER-NEXT: jne .LBB11_3
; NOGATHER-NEXT: .LBB11_4: # %else2
; NOGATHER-NEXT: vmovaps %xmm1, %xmm0
; NOGATHER-NEXT: retq
; NOGATHER-NEXT: .LBB11_1: # %cond.load
; NOGATHER-NEXT: vmovq %xmm2, %rcx
; NOGATHER-NEXT: vmovlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
; NOGATHER-NEXT: .LBB11_2: # %else
; NOGATHER-NEXT: vpextrb $8, %xmm0, %eax
; NOGATHER-NEXT: testb $1, %al
; NOGATHER-NEXT: testb $2, %al
; NOGATHER-NEXT: je .LBB11_4
; NOGATHER-NEXT: # %bb.3: # %cond.load1
; NOGATHER-NEXT: .LBB11_3: # %cond.load1
; NOGATHER-NEXT: vpextrq $1, %xmm2, %rax
; NOGATHER-NEXT: vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
; NOGATHER-NEXT: .LBB11_4: # %else2
; NOGATHER-NEXT: vmovaps %xmm1, %xmm0
; NOGATHER-NEXT: retq
entry:

View File

@ -158,13 +158,12 @@ define <16 x half> @test_mask_load_16xf16(<16 x i1> %mask, <16 x half>* %addr, <
; CHECK: ## %bb.0:
; CHECK-NEXT: movq %rdi, %rax
; CHECK-NEXT: vpsllw $7, %xmm0, %xmm0
; CHECK-NEXT: vpmovb2m %xmm0, %k0
; CHECK-NEXT: kmovd %k0, %ecx
; CHECK-NEXT: vpmovmskb %xmm0, %ecx
; CHECK-NEXT: testb $1, %cl
; CHECK-NEXT: je LBB12_1
; CHECK-NEXT: ## %bb.2: ## %cond.load
; CHECK-NEXT: movswl (%rsi), %ecx
; CHECK-NEXT: vmovd %ecx, %xmm0
; CHECK-NEXT: movswl (%rsi), %edx
; CHECK-NEXT: vmovd %edx, %xmm0
; CHECK-NEXT: vcvtph2ps %xmm0, %xmm8
; CHECK-NEXT: jmp LBB12_3
; CHECK-NEXT: LBB12_1:
@ -172,13 +171,11 @@ define <16 x half> @test_mask_load_16xf16(<16 x i1> %mask, <16 x half>* %addr, <
; CHECK-NEXT: LBB12_3: ## %else
; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vxorps %xmm9, %xmm9, %xmm9
; CHECK-NEXT: kshiftrw $1, %k0, %k1
; CHECK-NEXT: kmovd %k1, %ecx
; CHECK-NEXT: testb $1, %cl
; CHECK-NEXT: testb $2, %cl
; CHECK-NEXT: je LBB12_4
; CHECK-NEXT: ## %bb.5: ## %cond.load1
; CHECK-NEXT: movswl 2(%rsi), %ecx
; CHECK-NEXT: vmovd %ecx, %xmm0
; CHECK-NEXT: movswl 2(%rsi), %edx
; CHECK-NEXT: vmovd %edx, %xmm0
; CHECK-NEXT: vmovaps %xmm2, %xmm1
; CHECK-NEXT: vmovaps %xmm2, %xmm7
; CHECK-NEXT: vmovaps %xmm2, %xmm6
@ -193,7 +190,9 @@ define <16 x half> @test_mask_load_16xf16(<16 x i1> %mask, <16 x half>* %addr, <
; CHECK-NEXT: vmovaps %xmm2, %xmm11
; CHECK-NEXT: vmovaps %xmm2, %xmm10
; CHECK-NEXT: vcvtph2ps %xmm0, %xmm2
; CHECK-NEXT: jmp LBB12_6
; CHECK-NEXT: testb $4, %cl
; CHECK-NEXT: jne LBB12_7
; CHECK-NEXT: jmp LBB12_8
; CHECK-NEXT: LBB12_4:
; CHECK-NEXT: vmovaps %xmm2, %xmm1
; CHECK-NEXT: vmovaps %xmm2, %xmm7
@ -208,129 +207,52 @@ define <16 x half> @test_mask_load_16xf16(<16 x i1> %mask, <16 x half>* %addr, <
; CHECK-NEXT: vmovaps %xmm2, %xmm12
; CHECK-NEXT: vmovaps %xmm2, %xmm11
; CHECK-NEXT: vmovaps %xmm2, %xmm10
; CHECK-NEXT: LBB12_6: ## %else2
; CHECK-NEXT: kshiftrw $2, %k0, %k1
; CHECK-NEXT: kmovd %k1, %ecx
; CHECK-NEXT: testb $1, %cl
; CHECK-NEXT: testb $4, %cl
; CHECK-NEXT: je LBB12_8
; CHECK-NEXT: ## %bb.7: ## %cond.load4
; CHECK-NEXT: movswl 4(%rsi), %ecx
; CHECK-NEXT: vmovd %ecx, %xmm0
; CHECK-NEXT: LBB12_7: ## %cond.load4
; CHECK-NEXT: movswl 4(%rsi), %edx
; CHECK-NEXT: vmovd %edx, %xmm0
; CHECK-NEXT: vcvtph2ps %xmm0, %xmm1
; CHECK-NEXT: LBB12_8: ## %else5
; CHECK-NEXT: kshiftrw $3, %k0, %k1
; CHECK-NEXT: kmovd %k1, %ecx
; CHECK-NEXT: testb $1, %cl
; CHECK-NEXT: je LBB12_10
; CHECK-NEXT: ## %bb.9: ## %cond.load7
; CHECK-NEXT: movswl 6(%rsi), %ecx
; CHECK-NEXT: vmovd %ecx, %xmm0
; CHECK-NEXT: vcvtph2ps %xmm0, %xmm7
; CHECK-NEXT: LBB12_10: ## %else8
; CHECK-NEXT: kshiftrw $4, %k0, %k1
; CHECK-NEXT: kmovd %k1, %ecx
; CHECK-NEXT: testb $1, %cl
; CHECK-NEXT: je LBB12_12
; CHECK-NEXT: ## %bb.11: ## %cond.load10
; CHECK-NEXT: movswl 8(%rsi), %ecx
; CHECK-NEXT: vmovd %ecx, %xmm0
; CHECK-NEXT: vcvtph2ps %xmm0, %xmm6
; CHECK-NEXT: testb $8, %cl
; CHECK-NEXT: jne LBB12_9
; CHECK-NEXT: ## %bb.10: ## %else8
; CHECK-NEXT: testb $16, %cl
; CHECK-NEXT: jne LBB12_11
; CHECK-NEXT: LBB12_12: ## %else11
; CHECK-NEXT: kshiftrw $5, %k0, %k1
; CHECK-NEXT: kmovd %k1, %ecx
; CHECK-NEXT: testb $1, %cl
; CHECK-NEXT: je LBB12_14
; CHECK-NEXT: ## %bb.13: ## %cond.load13
; CHECK-NEXT: movswl 10(%rsi), %ecx
; CHECK-NEXT: vmovd %ecx, %xmm0
; CHECK-NEXT: vcvtph2ps %xmm0, %xmm5
; CHECK-NEXT: testb $32, %cl
; CHECK-NEXT: jne LBB12_13
; CHECK-NEXT: LBB12_14: ## %else14
; CHECK-NEXT: kshiftrw $6, %k0, %k1
; CHECK-NEXT: kmovd %k1, %ecx
; CHECK-NEXT: testb $1, %cl
; CHECK-NEXT: je LBB12_16
; CHECK-NEXT: ## %bb.15: ## %cond.load16
; CHECK-NEXT: movswl 12(%rsi), %ecx
; CHECK-NEXT: vmovd %ecx, %xmm0
; CHECK-NEXT: vcvtph2ps %xmm0, %xmm4
; CHECK-NEXT: testb $64, %cl
; CHECK-NEXT: jne LBB12_15
; CHECK-NEXT: LBB12_16: ## %else17
; CHECK-NEXT: kshiftrw $7, %k0, %k1
; CHECK-NEXT: kmovd %k1, %ecx
; CHECK-NEXT: testb $1, %cl
; CHECK-NEXT: je LBB12_18
; CHECK-NEXT: ## %bb.17: ## %cond.load19
; CHECK-NEXT: movswl 14(%rsi), %ecx
; CHECK-NEXT: vmovd %ecx, %xmm0
; CHECK-NEXT: vcvtph2ps %xmm0, %xmm3
; CHECK-NEXT: testb $-128, %cl
; CHECK-NEXT: jne LBB12_17
; CHECK-NEXT: LBB12_18: ## %else20
; CHECK-NEXT: kshiftrw $8, %k0, %k1
; CHECK-NEXT: kmovd %k1, %ecx
; CHECK-NEXT: testb $1, %cl
; CHECK-NEXT: je LBB12_20
; CHECK-NEXT: ## %bb.19: ## %cond.load22
; CHECK-NEXT: movswl 16(%rsi), %ecx
; CHECK-NEXT: vmovd %ecx, %xmm0
; CHECK-NEXT: vcvtph2ps %xmm0, %xmm16
; CHECK-NEXT: testl $256, %ecx ## imm = 0x100
; CHECK-NEXT: jne LBB12_19
; CHECK-NEXT: LBB12_20: ## %else23
; CHECK-NEXT: kshiftrw $9, %k0, %k1
; CHECK-NEXT: kmovd %k1, %ecx
; CHECK-NEXT: testb $1, %cl
; CHECK-NEXT: je LBB12_22
; CHECK-NEXT: ## %bb.21: ## %cond.load25
; CHECK-NEXT: movswl 18(%rsi), %ecx
; CHECK-NEXT: vmovd %ecx, %xmm0
; CHECK-NEXT: vcvtph2ps %xmm0, %xmm15
; CHECK-NEXT: testl $512, %ecx ## imm = 0x200
; CHECK-NEXT: jne LBB12_21
; CHECK-NEXT: LBB12_22: ## %else26
; CHECK-NEXT: kshiftrw $10, %k0, %k1
; CHECK-NEXT: kmovd %k1, %ecx
; CHECK-NEXT: testb $1, %cl
; CHECK-NEXT: je LBB12_24
; CHECK-NEXT: ## %bb.23: ## %cond.load28
; CHECK-NEXT: movswl 20(%rsi), %ecx
; CHECK-NEXT: vmovd %ecx, %xmm0
; CHECK-NEXT: vcvtph2ps %xmm0, %xmm14
; CHECK-NEXT: testl $1024, %ecx ## imm = 0x400
; CHECK-NEXT: jne LBB12_23
; CHECK-NEXT: LBB12_24: ## %else29
; CHECK-NEXT: kshiftrw $11, %k0, %k1
; CHECK-NEXT: kmovd %k1, %ecx
; CHECK-NEXT: testb $1, %cl
; CHECK-NEXT: je LBB12_26
; CHECK-NEXT: ## %bb.25: ## %cond.load31
; CHECK-NEXT: movswl 22(%rsi), %ecx
; CHECK-NEXT: vmovd %ecx, %xmm0
; CHECK-NEXT: vcvtph2ps %xmm0, %xmm13
; CHECK-NEXT: testl $2048, %ecx ## imm = 0x800
; CHECK-NEXT: jne LBB12_25
; CHECK-NEXT: LBB12_26: ## %else32
; CHECK-NEXT: kshiftrw $12, %k0, %k1
; CHECK-NEXT: kmovd %k1, %ecx
; CHECK-NEXT: testb $1, %cl
; CHECK-NEXT: je LBB12_28
; CHECK-NEXT: ## %bb.27: ## %cond.load34
; CHECK-NEXT: movswl 24(%rsi), %ecx
; CHECK-NEXT: vmovd %ecx, %xmm0
; CHECK-NEXT: vcvtph2ps %xmm0, %xmm12
; CHECK-NEXT: testl $4096, %ecx ## imm = 0x1000
; CHECK-NEXT: jne LBB12_27
; CHECK-NEXT: LBB12_28: ## %else35
; CHECK-NEXT: kshiftrw $13, %k0, %k1
; CHECK-NEXT: kmovd %k1, %ecx
; CHECK-NEXT: testb $1, %cl
; CHECK-NEXT: je LBB12_30
; CHECK-NEXT: ## %bb.29: ## %cond.load37
; CHECK-NEXT: movswl 26(%rsi), %ecx
; CHECK-NEXT: vmovd %ecx, %xmm0
; CHECK-NEXT: vcvtph2ps %xmm0, %xmm11
; CHECK-NEXT: testl $8192, %ecx ## imm = 0x2000
; CHECK-NEXT: jne LBB12_29
; CHECK-NEXT: LBB12_30: ## %else38
; CHECK-NEXT: kshiftrw $14, %k0, %k1
; CHECK-NEXT: kmovd %k1, %ecx
; CHECK-NEXT: testb $1, %cl
; CHECK-NEXT: je LBB12_32
; CHECK-NEXT: ## %bb.31: ## %cond.load40
; CHECK-NEXT: movswl 28(%rsi), %ecx
; CHECK-NEXT: vmovd %ecx, %xmm0
; CHECK-NEXT: vcvtph2ps %xmm0, %xmm10
; CHECK-NEXT: testl $16384, %ecx ## imm = 0x4000
; CHECK-NEXT: jne LBB12_31
; CHECK-NEXT: LBB12_32: ## %else41
; CHECK-NEXT: kshiftrw $15, %k0, %k0
; CHECK-NEXT: kmovd %k0, %ecx
; CHECK-NEXT: testb $1, %cl
; CHECK-NEXT: testl $32768, %ecx ## imm = 0x8000
; CHECK-NEXT: je LBB12_34
; CHECK-NEXT: ## %bb.33: ## %cond.load43
; CHECK-NEXT: LBB12_33: ## %cond.load43
; CHECK-NEXT: movswl 30(%rsi), %ecx
; CHECK-NEXT: vmovd %ecx, %xmm0
; CHECK-NEXT: vcvtph2ps %xmm0, %xmm9
@ -384,6 +306,79 @@ define <16 x half> @test_mask_load_16xf16(<16 x i1> %mask, <16 x half>* %addr, <
; CHECK-NEXT: vmovd %xmm0, %ecx
; CHECK-NEXT: movw %cx, 30(%rax)
; CHECK-NEXT: retq
; CHECK-NEXT: LBB12_9: ## %cond.load7
; CHECK-NEXT: movswl 6(%rsi), %edx
; CHECK-NEXT: vmovd %edx, %xmm0
; CHECK-NEXT: vcvtph2ps %xmm0, %xmm7
; CHECK-NEXT: testb $16, %cl
; CHECK-NEXT: je LBB12_12
; CHECK-NEXT: LBB12_11: ## %cond.load10
; CHECK-NEXT: movswl 8(%rsi), %edx
; CHECK-NEXT: vmovd %edx, %xmm0
; CHECK-NEXT: vcvtph2ps %xmm0, %xmm6
; CHECK-NEXT: testb $32, %cl
; CHECK-NEXT: je LBB12_14
; CHECK-NEXT: LBB12_13: ## %cond.load13
; CHECK-NEXT: movswl 10(%rsi), %edx
; CHECK-NEXT: vmovd %edx, %xmm0
; CHECK-NEXT: vcvtph2ps %xmm0, %xmm5
; CHECK-NEXT: testb $64, %cl
; CHECK-NEXT: je LBB12_16
; CHECK-NEXT: LBB12_15: ## %cond.load16
; CHECK-NEXT: movswl 12(%rsi), %edx
; CHECK-NEXT: vmovd %edx, %xmm0
; CHECK-NEXT: vcvtph2ps %xmm0, %xmm4
; CHECK-NEXT: testb $-128, %cl
; CHECK-NEXT: je LBB12_18
; CHECK-NEXT: LBB12_17: ## %cond.load19
; CHECK-NEXT: movswl 14(%rsi), %edx
; CHECK-NEXT: vmovd %edx, %xmm0
; CHECK-NEXT: vcvtph2ps %xmm0, %xmm3
; CHECK-NEXT: testl $256, %ecx ## imm = 0x100
; CHECK-NEXT: je LBB12_20
; CHECK-NEXT: LBB12_19: ## %cond.load22
; CHECK-NEXT: movswl 16(%rsi), %edx
; CHECK-NEXT: vmovd %edx, %xmm0
; CHECK-NEXT: vcvtph2ps %xmm0, %xmm16
; CHECK-NEXT: testl $512, %ecx ## imm = 0x200
; CHECK-NEXT: je LBB12_22
; CHECK-NEXT: LBB12_21: ## %cond.load25
; CHECK-NEXT: movswl 18(%rsi), %edx
; CHECK-NEXT: vmovd %edx, %xmm0
; CHECK-NEXT: vcvtph2ps %xmm0, %xmm15
; CHECK-NEXT: testl $1024, %ecx ## imm = 0x400
; CHECK-NEXT: je LBB12_24
; CHECK-NEXT: LBB12_23: ## %cond.load28
; CHECK-NEXT: movswl 20(%rsi), %edx
; CHECK-NEXT: vmovd %edx, %xmm0
; CHECK-NEXT: vcvtph2ps %xmm0, %xmm14
; CHECK-NEXT: testl $2048, %ecx ## imm = 0x800
; CHECK-NEXT: je LBB12_26
; CHECK-NEXT: LBB12_25: ## %cond.load31
; CHECK-NEXT: movswl 22(%rsi), %edx
; CHECK-NEXT: vmovd %edx, %xmm0
; CHECK-NEXT: vcvtph2ps %xmm0, %xmm13
; CHECK-NEXT: testl $4096, %ecx ## imm = 0x1000
; CHECK-NEXT: je LBB12_28
; CHECK-NEXT: LBB12_27: ## %cond.load34
; CHECK-NEXT: movswl 24(%rsi), %edx
; CHECK-NEXT: vmovd %edx, %xmm0
; CHECK-NEXT: vcvtph2ps %xmm0, %xmm12
; CHECK-NEXT: testl $8192, %ecx ## imm = 0x2000
; CHECK-NEXT: je LBB12_30
; CHECK-NEXT: LBB12_29: ## %cond.load37
; CHECK-NEXT: movswl 26(%rsi), %edx
; CHECK-NEXT: vmovd %edx, %xmm0
; CHECK-NEXT: vcvtph2ps %xmm0, %xmm11
; CHECK-NEXT: testl $16384, %ecx ## imm = 0x4000
; CHECK-NEXT: je LBB12_32
; CHECK-NEXT: LBB12_31: ## %cond.load40
; CHECK-NEXT: movswl 28(%rsi), %edx
; CHECK-NEXT: vmovd %edx, %xmm0
; CHECK-NEXT: vcvtph2ps %xmm0, %xmm10
; CHECK-NEXT: testl $32768, %ecx ## imm = 0x8000
; CHECK-NEXT: jne LBB12_33
; CHECK-NEXT: jmp LBB12_34
%res = call <16 x half> @llvm.masked.load.v16f16(<16 x half>* %addr, i32 4, <16 x i1>%mask, <16 x half> zeroinitializer)
ret <16 x half> %res
}
@ -394,159 +389,159 @@ define void @test_mask_store_16xf16(<16 x i1> %mask, <16 x half>* %addr, <16 x h
; CHECK-LABEL: test_mask_store_16xf16:
; CHECK: ## %bb.0:
; CHECK-NEXT: vpsllw $7, %xmm0, %xmm0
; CHECK-NEXT: vpmovb2m %xmm0, %k0
; CHECK-NEXT: kmovd %k0, %eax
; CHECK-NEXT: vpmovmskb %xmm0, %eax
; CHECK-NEXT: testb $1, %al
; CHECK-NEXT: je LBB13_2
; CHECK-NEXT: ## %bb.1: ## %cond.store
; CHECK-NEXT: vcvtps2ph $4, %xmm1, %xmm0
; CHECK-NEXT: vmovd %xmm0, %eax
; CHECK-NEXT: movw %ax, (%rdi)
; CHECK-NEXT: LBB13_2: ## %else
; CHECK-NEXT: kshiftrw $1, %k0, %k1
; CHECK-NEXT: kmovd %k1, %eax
; CHECK-NEXT: testb $1, %al
; CHECK-NEXT: je LBB13_4
; CHECK-NEXT: ## %bb.3: ## %cond.store1
; CHECK-NEXT: vcvtps2ph $4, %xmm2, %xmm0
; CHECK-NEXT: vmovd %xmm0, %eax
; CHECK-NEXT: movw %ax, 2(%rdi)
; CHECK-NEXT: jne LBB13_1
; CHECK-NEXT: ## %bb.2: ## %else
; CHECK-NEXT: testb $2, %al
; CHECK-NEXT: jne LBB13_3
; CHECK-NEXT: LBB13_4: ## %else2
; CHECK-NEXT: kshiftrw $2, %k0, %k1
; CHECK-NEXT: kmovd %k1, %eax
; CHECK-NEXT: testb $1, %al
; CHECK-NEXT: je LBB13_6
; CHECK-NEXT: ## %bb.5: ## %cond.store3
; CHECK-NEXT: vcvtps2ph $4, %xmm3, %xmm0
; CHECK-NEXT: vmovd %xmm0, %eax
; CHECK-NEXT: movw %ax, 4(%rdi)
; CHECK-NEXT: testb $4, %al
; CHECK-NEXT: jne LBB13_5
; CHECK-NEXT: LBB13_6: ## %else4
; CHECK-NEXT: kshiftrw $3, %k0, %k1
; CHECK-NEXT: kmovd %k1, %eax
; CHECK-NEXT: testb $1, %al
; CHECK-NEXT: je LBB13_8
; CHECK-NEXT: ## %bb.7: ## %cond.store5
; CHECK-NEXT: vcvtps2ph $4, %xmm4, %xmm0
; CHECK-NEXT: vmovd %xmm0, %eax
; CHECK-NEXT: movw %ax, 6(%rdi)
; CHECK-NEXT: testb $8, %al
; CHECK-NEXT: jne LBB13_7
; CHECK-NEXT: LBB13_8: ## %else6
; CHECK-NEXT: kshiftrw $4, %k0, %k1
; CHECK-NEXT: kmovd %k1, %eax
; CHECK-NEXT: testb $1, %al
; CHECK-NEXT: je LBB13_10
; CHECK-NEXT: ## %bb.9: ## %cond.store7
; CHECK-NEXT: vcvtps2ph $4, %xmm5, %xmm0
; CHECK-NEXT: vmovd %xmm0, %eax
; CHECK-NEXT: movw %ax, 8(%rdi)
; CHECK-NEXT: testb $16, %al
; CHECK-NEXT: jne LBB13_9
; CHECK-NEXT: LBB13_10: ## %else8
; CHECK-NEXT: kshiftrw $5, %k0, %k1
; CHECK-NEXT: kmovd %k1, %eax
; CHECK-NEXT: testb $1, %al
; CHECK-NEXT: je LBB13_12
; CHECK-NEXT: ## %bb.11: ## %cond.store9
; CHECK-NEXT: vcvtps2ph $4, %xmm6, %xmm0
; CHECK-NEXT: vmovd %xmm0, %eax
; CHECK-NEXT: movw %ax, 10(%rdi)
; CHECK-NEXT: testb $32, %al
; CHECK-NEXT: jne LBB13_11
; CHECK-NEXT: LBB13_12: ## %else10
; CHECK-NEXT: kshiftrw $6, %k0, %k1
; CHECK-NEXT: kmovd %k1, %eax
; CHECK-NEXT: testb $1, %al
; CHECK-NEXT: je LBB13_14
; CHECK-NEXT: ## %bb.13: ## %cond.store11
; CHECK-NEXT: vcvtps2ph $4, %xmm7, %xmm0
; CHECK-NEXT: vmovd %xmm0, %eax
; CHECK-NEXT: movw %ax, 12(%rdi)
; CHECK-NEXT: testb $64, %al
; CHECK-NEXT: jne LBB13_13
; CHECK-NEXT: LBB13_14: ## %else12
; CHECK-NEXT: kshiftrw $7, %k0, %k1
; CHECK-NEXT: kmovd %k1, %eax
; CHECK-NEXT: testb $1, %al
; CHECK-NEXT: je LBB13_16
; CHECK-NEXT: ## %bb.15: ## %cond.store13
; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; CHECK-NEXT: vmovd %xmm0, %eax
; CHECK-NEXT: movw %ax, 14(%rdi)
; CHECK-NEXT: testb $-128, %al
; CHECK-NEXT: jne LBB13_15
; CHECK-NEXT: LBB13_16: ## %else14
; CHECK-NEXT: kshiftrw $8, %k0, %k1
; CHECK-NEXT: kmovd %k1, %eax
; CHECK-NEXT: testb $1, %al
; CHECK-NEXT: je LBB13_18
; CHECK-NEXT: ## %bb.17: ## %cond.store15
; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; CHECK-NEXT: vmovd %xmm0, %eax
; CHECK-NEXT: movw %ax, 16(%rdi)
; CHECK-NEXT: testl $256, %eax ## imm = 0x100
; CHECK-NEXT: jne LBB13_17
; CHECK-NEXT: LBB13_18: ## %else16
; CHECK-NEXT: kshiftrw $9, %k0, %k1
; CHECK-NEXT: kmovd %k1, %eax
; CHECK-NEXT: testb $1, %al
; CHECK-NEXT: je LBB13_20
; CHECK-NEXT: ## %bb.19: ## %cond.store17
; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; CHECK-NEXT: vmovd %xmm0, %eax
; CHECK-NEXT: movw %ax, 18(%rdi)
; CHECK-NEXT: testl $512, %eax ## imm = 0x200
; CHECK-NEXT: jne LBB13_19
; CHECK-NEXT: LBB13_20: ## %else18
; CHECK-NEXT: kshiftrw $10, %k0, %k1
; CHECK-NEXT: kmovd %k1, %eax
; CHECK-NEXT: testb $1, %al
; CHECK-NEXT: je LBB13_22
; CHECK-NEXT: ## %bb.21: ## %cond.store19
; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; CHECK-NEXT: vmovd %xmm0, %eax
; CHECK-NEXT: movw %ax, 20(%rdi)
; CHECK-NEXT: testl $1024, %eax ## imm = 0x400
; CHECK-NEXT: jne LBB13_21
; CHECK-NEXT: LBB13_22: ## %else20
; CHECK-NEXT: kshiftrw $11, %k0, %k1
; CHECK-NEXT: kmovd %k1, %eax
; CHECK-NEXT: testb $1, %al
; CHECK-NEXT: je LBB13_24
; CHECK-NEXT: ## %bb.23: ## %cond.store21
; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; CHECK-NEXT: vmovd %xmm0, %eax
; CHECK-NEXT: movw %ax, 22(%rdi)
; CHECK-NEXT: testl $2048, %eax ## imm = 0x800
; CHECK-NEXT: jne LBB13_23
; CHECK-NEXT: LBB13_24: ## %else22
; CHECK-NEXT: kshiftrw $12, %k0, %k1
; CHECK-NEXT: kmovd %k1, %eax
; CHECK-NEXT: testb $1, %al
; CHECK-NEXT: je LBB13_26
; CHECK-NEXT: ## %bb.25: ## %cond.store23
; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; CHECK-NEXT: vmovd %xmm0, %eax
; CHECK-NEXT: movw %ax, 24(%rdi)
; CHECK-NEXT: testl $4096, %eax ## imm = 0x1000
; CHECK-NEXT: jne LBB13_25
; CHECK-NEXT: LBB13_26: ## %else24
; CHECK-NEXT: kshiftrw $13, %k0, %k1
; CHECK-NEXT: kmovd %k1, %eax
; CHECK-NEXT: testb $1, %al
; CHECK-NEXT: je LBB13_28
; CHECK-NEXT: ## %bb.27: ## %cond.store25
; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; CHECK-NEXT: vmovd %xmm0, %eax
; CHECK-NEXT: movw %ax, 26(%rdi)
; CHECK-NEXT: testl $8192, %eax ## imm = 0x2000
; CHECK-NEXT: jne LBB13_27
; CHECK-NEXT: LBB13_28: ## %else26
; CHECK-NEXT: kshiftrw $14, %k0, %k1
; CHECK-NEXT: kmovd %k1, %eax
; CHECK-NEXT: testb $1, %al
; CHECK-NEXT: je LBB13_30
; CHECK-NEXT: ## %bb.29: ## %cond.store27
; CHECK-NEXT: testl $16384, %eax ## imm = 0x4000
; CHECK-NEXT: jne LBB13_29
; CHECK-NEXT: LBB13_30: ## %else28
; CHECK-NEXT: testl $32768, %eax ## imm = 0x8000
; CHECK-NEXT: jne LBB13_31
; CHECK-NEXT: LBB13_32: ## %else30
; CHECK-NEXT: retq
; CHECK-NEXT: LBB13_1: ## %cond.store
; CHECK-NEXT: vcvtps2ph $4, %xmm1, %xmm0
; CHECK-NEXT: vmovd %xmm0, %ecx
; CHECK-NEXT: movw %cx, (%rdi)
; CHECK-NEXT: testb $2, %al
; CHECK-NEXT: je LBB13_4
; CHECK-NEXT: LBB13_3: ## %cond.store1
; CHECK-NEXT: vcvtps2ph $4, %xmm2, %xmm0
; CHECK-NEXT: vmovd %xmm0, %ecx
; CHECK-NEXT: movw %cx, 2(%rdi)
; CHECK-NEXT: testb $4, %al
; CHECK-NEXT: je LBB13_6
; CHECK-NEXT: LBB13_5: ## %cond.store3
; CHECK-NEXT: vcvtps2ph $4, %xmm3, %xmm0
; CHECK-NEXT: vmovd %xmm0, %ecx
; CHECK-NEXT: movw %cx, 4(%rdi)
; CHECK-NEXT: testb $8, %al
; CHECK-NEXT: je LBB13_8
; CHECK-NEXT: LBB13_7: ## %cond.store5
; CHECK-NEXT: vcvtps2ph $4, %xmm4, %xmm0
; CHECK-NEXT: vmovd %xmm0, %ecx
; CHECK-NEXT: movw %cx, 6(%rdi)
; CHECK-NEXT: testb $16, %al
; CHECK-NEXT: je LBB13_10
; CHECK-NEXT: LBB13_9: ## %cond.store7
; CHECK-NEXT: vcvtps2ph $4, %xmm5, %xmm0
; CHECK-NEXT: vmovd %xmm0, %ecx
; CHECK-NEXT: movw %cx, 8(%rdi)
; CHECK-NEXT: testb $32, %al
; CHECK-NEXT: je LBB13_12
; CHECK-NEXT: LBB13_11: ## %cond.store9
; CHECK-NEXT: vcvtps2ph $4, %xmm6, %xmm0
; CHECK-NEXT: vmovd %xmm0, %ecx
; CHECK-NEXT: movw %cx, 10(%rdi)
; CHECK-NEXT: testb $64, %al
; CHECK-NEXT: je LBB13_14
; CHECK-NEXT: LBB13_13: ## %cond.store11
; CHECK-NEXT: vcvtps2ph $4, %xmm7, %xmm0
; CHECK-NEXT: vmovd %xmm0, %ecx
; CHECK-NEXT: movw %cx, 12(%rdi)
; CHECK-NEXT: testb $-128, %al
; CHECK-NEXT: je LBB13_16
; CHECK-NEXT: LBB13_15: ## %cond.store13
; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; CHECK-NEXT: vmovd %xmm0, %eax
; CHECK-NEXT: movw %ax, 28(%rdi)
; CHECK-NEXT: LBB13_30: ## %else28
; CHECK-NEXT: kshiftrw $15, %k0, %k0
; CHECK-NEXT: kmovd %k0, %eax
; CHECK-NEXT: testb $1, %al
; CHECK-NEXT: vmovd %xmm0, %ecx
; CHECK-NEXT: movw %cx, 14(%rdi)
; CHECK-NEXT: testl $256, %eax ## imm = 0x100
; CHECK-NEXT: je LBB13_18
; CHECK-NEXT: LBB13_17: ## %cond.store15
; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; CHECK-NEXT: vmovd %xmm0, %ecx
; CHECK-NEXT: movw %cx, 16(%rdi)
; CHECK-NEXT: testl $512, %eax ## imm = 0x200
; CHECK-NEXT: je LBB13_20
; CHECK-NEXT: LBB13_19: ## %cond.store17
; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; CHECK-NEXT: vmovd %xmm0, %ecx
; CHECK-NEXT: movw %cx, 18(%rdi)
; CHECK-NEXT: testl $1024, %eax ## imm = 0x400
; CHECK-NEXT: je LBB13_22
; CHECK-NEXT: LBB13_21: ## %cond.store19
; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; CHECK-NEXT: vmovd %xmm0, %ecx
; CHECK-NEXT: movw %cx, 20(%rdi)
; CHECK-NEXT: testl $2048, %eax ## imm = 0x800
; CHECK-NEXT: je LBB13_24
; CHECK-NEXT: LBB13_23: ## %cond.store21
; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; CHECK-NEXT: vmovd %xmm0, %ecx
; CHECK-NEXT: movw %cx, 22(%rdi)
; CHECK-NEXT: testl $4096, %eax ## imm = 0x1000
; CHECK-NEXT: je LBB13_26
; CHECK-NEXT: LBB13_25: ## %cond.store23
; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; CHECK-NEXT: vmovd %xmm0, %ecx
; CHECK-NEXT: movw %cx, 24(%rdi)
; CHECK-NEXT: testl $8192, %eax ## imm = 0x2000
; CHECK-NEXT: je LBB13_28
; CHECK-NEXT: LBB13_27: ## %cond.store25
; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; CHECK-NEXT: vmovd %xmm0, %ecx
; CHECK-NEXT: movw %cx, 26(%rdi)
; CHECK-NEXT: testl $16384, %eax ## imm = 0x4000
; CHECK-NEXT: je LBB13_30
; CHECK-NEXT: LBB13_29: ## %cond.store27
; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; CHECK-NEXT: vmovd %xmm0, %ecx
; CHECK-NEXT: movw %cx, 28(%rdi)
; CHECK-NEXT: testl $32768, %eax ## imm = 0x8000
; CHECK-NEXT: je LBB13_32
; CHECK-NEXT: ## %bb.31: ## %cond.store29
; CHECK-NEXT: LBB13_31: ## %cond.store29
; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; CHECK-NEXT: vmovd %xmm0, %eax
; CHECK-NEXT: movw %ax, 30(%rdi)
; CHECK-NEXT: LBB13_32: ## %else30
; CHECK-NEXT: retq
call void @llvm.masked.store.v16f16.p0v16f16(<16 x half> %val, <16 x half>* %addr, i32 4, <16 x i1>%mask)
ret void

File diff suppressed because it is too large Load Diff

View File

@ -69,8 +69,9 @@ declare <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> , i32, <8 x i1> ,
; SCALAR-NEXT: br label %else
; SCALAR: else:
; SCALAR-NEXT: %res.phi.else = phi
; SCALAR-NEXT: %Mask1 = extractelement <16 x i1> %imask, i64 1
; SCALAR-NEXT: br i1 %Mask1, label %cond.load1, label %else2
; SCALAR-NEXT: and i16 %{{.*}}, 2
; SCALAR-NEXT: icmp ne i16 %{{.*}}, 0
; SCALAR-NEXT: br i1 %{{.*}}, label %cond.load1, label %else2
define <16 x float> @test2(float* %base, <16 x i32> %ind, i16 %mask) {
; KNL_64-LABEL: test2:
@ -211,16 +212,18 @@ define <16 x i32> @test4(i32* %base, <16 x i32> %ind, i16 %mask) {
; SCALAR-LABEL: test5
; SCALAR: %Mask0 = extractelement <16 x i1> %imask, i64 0
; SCALAR-NEXT: br i1 %Mask0, label %cond.store, label %else
; SCALAR: and i16 %scalar_mask, 1
; SCALAR-NEXT: icmp ne i16 %{{.*}}, 0
; SCALAR-NEXT: br i1 %{{.*}}, label %cond.store, label %else
; SCALAR: cond.store:
; SCALAR-NEXT: %Elt0 = extractelement <16 x i32> %val, i64 0
; SCALAR-NEXT: %Ptr0 = extractelement <16 x i32*> %gep.random, i64 0
; SCALAR-NEXT: store i32 %Elt0, i32* %Ptr0, align 4
; SCALAR-NEXT: br label %else
; SCALAR: else:
; SCALAR-NEXT: %Mask1 = extractelement <16 x i1> %imask, i64 1
; SCALAR-NEXT: br i1 %Mask1, label %cond.store1, label %else2
; SCALAR-NEXT: and i16 %scalar_mask, 2
; SCALAR-NEXT: icmp ne i16 %{{.*}}, 0
; SCALAR-NEXT: br i1 %{{.*}}, label %cond.store1, label %else2
define void @test5(i32* %base, <16 x i32> %ind, i16 %mask, <16 x i32>%val) {
; KNL_64-LABEL: test5:
@ -1660,33 +1663,47 @@ define <3 x i32> @test30(<3 x i32*> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x
; KNL_64: # %bb.0:
; KNL_64-NEXT: vpslld $31, %xmm2, %xmm2
; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k0
; KNL_64-NEXT: kshiftrw $1, %k0, %k1
; KNL_64-NEXT: kshiftrw $2, %k0, %k2
; KNL_64-NEXT: kmovw %k0, %eax
; KNL_64-NEXT: andb $1, %al
; KNL_64-NEXT: kmovw %k1, %ecx
; KNL_64-NEXT: andb $1, %cl
; KNL_64-NEXT: addb %cl, %cl
; KNL_64-NEXT: orb %al, %cl
; KNL_64-NEXT: kmovw %k2, %eax
; KNL_64-NEXT: andb $1, %al
; KNL_64-NEXT: shlb $2, %al
; KNL_64-NEXT: orb %cl, %al
; KNL_64-NEXT: vpmovsxdq %xmm1, %ymm1
; KNL_64-NEXT: vpsllq $2, %ymm1, %ymm1
; KNL_64-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; KNL_64-NEXT: testb $1, %al
; KNL_64-NEXT: je .LBB31_2
; KNL_64-NEXT: # %bb.1: # %cond.load
; KNL_64-NEXT: vmovq %xmm0, %rax
; KNL_64-NEXT: vpinsrd $0, (%rax), %xmm3, %xmm3
; KNL_64-NEXT: .LBB31_2: # %else
; KNL_64-NEXT: kshiftrw $1, %k0, %k1
; KNL_64-NEXT: kmovw %k1, %eax
; KNL_64-NEXT: testb $1, %al
; KNL_64-NEXT: je .LBB31_4
; KNL_64-NEXT: # %bb.3: # %cond.load1
; KNL_64-NEXT: vpextrq $1, %xmm0, %rax
; KNL_64-NEXT: vpinsrd $1, (%rax), %xmm3, %xmm3
; KNL_64-NEXT: jne .LBB31_1
; KNL_64-NEXT: # %bb.2: # %else
; KNL_64-NEXT: testb $2, %al
; KNL_64-NEXT: jne .LBB31_3
; KNL_64-NEXT: .LBB31_4: # %else2
; KNL_64-NEXT: kshiftrw $2, %k0, %k0
; KNL_64-NEXT: kmovw %k0, %eax
; KNL_64-NEXT: testb $1, %al
; KNL_64-NEXT: testb $4, %al
; KNL_64-NEXT: jne .LBB31_5
; KNL_64-NEXT: .LBB31_6: # %else5
; KNL_64-NEXT: vmovdqa %xmm3, %xmm0
; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
; KNL_64-NEXT: .LBB31_1: # %cond.load
; KNL_64-NEXT: vmovq %xmm0, %rcx
; KNL_64-NEXT: vpinsrd $0, (%rcx), %xmm3, %xmm3
; KNL_64-NEXT: testb $2, %al
; KNL_64-NEXT: je .LBB31_4
; KNL_64-NEXT: .LBB31_3: # %cond.load1
; KNL_64-NEXT: vpextrq $1, %xmm0, %rcx
; KNL_64-NEXT: vpinsrd $1, (%rcx), %xmm3, %xmm3
; KNL_64-NEXT: testb $4, %al
; KNL_64-NEXT: je .LBB31_6
; KNL_64-NEXT: # %bb.5: # %cond.load4
; KNL_64-NEXT: .LBB31_5: # %cond.load4
; KNL_64-NEXT: vextracti128 $1, %ymm0, %xmm0
; KNL_64-NEXT: vmovq %xmm0, %rax
; KNL_64-NEXT: vpinsrd $2, (%rax), %xmm3, %xmm3
; KNL_64-NEXT: .LBB31_6: # %else5
; KNL_64-NEXT: vmovdqa %xmm3, %xmm0
; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
@ -1698,32 +1715,48 @@ define <3 x i32> @test30(<3 x i32*> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x
; KNL_32-NEXT: vmovdqa %xmm0, %xmm3
; KNL_32-NEXT: vpslld $31, %xmm2, %xmm0
; KNL_32-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL_32-NEXT: kshiftrw $1, %k0, %k1
; KNL_32-NEXT: kshiftrw $2, %k0, %k2
; KNL_32-NEXT: kmovw %k0, %eax
; KNL_32-NEXT: andb $1, %al
; KNL_32-NEXT: kmovw %k1, %ecx
; KNL_32-NEXT: andb $1, %cl
; KNL_32-NEXT: addb %cl, %cl
; KNL_32-NEXT: orb %al, %cl
; KNL_32-NEXT: kmovw %k2, %eax
; KNL_32-NEXT: andb $1, %al
; KNL_32-NEXT: shlb $2, %al
; KNL_32-NEXT: orb %cl, %al
; KNL_32-NEXT: vmovdqa {{[0-9]+}}(%esp), %xmm0
; KNL_32-NEXT: vpslld $2, %xmm1, %xmm1
; KNL_32-NEXT: vpaddd %xmm1, %xmm3, %xmm1
; KNL_32-NEXT: testb $1, %al
; KNL_32-NEXT: je .LBB31_2
; KNL_32-NEXT: # %bb.1: # %cond.load
; KNL_32-NEXT: vmovd %xmm1, %eax
; KNL_32-NEXT: vpinsrd $0, (%eax), %xmm0, %xmm0
; KNL_32-NEXT: .LBB31_2: # %else
; KNL_32-NEXT: kshiftrw $1, %k0, %k1
; KNL_32-NEXT: kmovw %k1, %eax
; KNL_32-NEXT: testb $1, %al
; KNL_32-NEXT: je .LBB31_4
; KNL_32-NEXT: # %bb.3: # %cond.load1
; KNL_32-NEXT: vpextrd $1, %xmm1, %eax
; KNL_32-NEXT: vpinsrd $1, (%eax), %xmm0, %xmm0
; KNL_32-NEXT: jne .LBB31_1
; KNL_32-NEXT: # %bb.2: # %else
; KNL_32-NEXT: testb $2, %al
; KNL_32-NEXT: jne .LBB31_3
; KNL_32-NEXT: .LBB31_4: # %else2
; KNL_32-NEXT: kshiftrw $2, %k0, %k0
; KNL_32-NEXT: kmovw %k0, %eax
; KNL_32-NEXT: testb $1, %al
; KNL_32-NEXT: testb $4, %al
; KNL_32-NEXT: jne .LBB31_5
; KNL_32-NEXT: .LBB31_6: # %else5
; KNL_32-NEXT: addl $12, %esp
; KNL_32-NEXT: .cfi_def_cfa_offset 4
; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
; KNL_32-NEXT: .LBB31_1: # %cond.load
; KNL_32-NEXT: .cfi_def_cfa_offset 16
; KNL_32-NEXT: vmovd %xmm1, %ecx
; KNL_32-NEXT: vpinsrd $0, (%ecx), %xmm0, %xmm0
; KNL_32-NEXT: testb $2, %al
; KNL_32-NEXT: je .LBB31_4
; KNL_32-NEXT: .LBB31_3: # %cond.load1
; KNL_32-NEXT: vpextrd $1, %xmm1, %ecx
; KNL_32-NEXT: vpinsrd $1, (%ecx), %xmm0, %xmm0
; KNL_32-NEXT: testb $4, %al
; KNL_32-NEXT: je .LBB31_6
; KNL_32-NEXT: # %bb.5: # %cond.load4
; KNL_32-NEXT: .LBB31_5: # %cond.load4
; KNL_32-NEXT: vpextrd $2, %xmm1, %eax
; KNL_32-NEXT: vpinsrd $2, (%eax), %xmm0, %xmm0
; KNL_32-NEXT: .LBB31_6: # %else5
; KNL_32-NEXT: addl $12, %esp
; KNL_32-NEXT: .cfi_def_cfa_offset 4
; KNL_32-NEXT: vzeroupper
@ -1733,33 +1766,47 @@ define <3 x i32> @test30(<3 x i32*> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x
; SKX: # %bb.0:
; SKX-NEXT: vpslld $31, %xmm2, %xmm2
; SKX-NEXT: vpmovd2m %xmm2, %k0
; SKX-NEXT: kshiftrb $1, %k0, %k1
; SKX-NEXT: kshiftrb $2, %k0, %k2
; SKX-NEXT: kmovw %k0, %eax
; SKX-NEXT: andb $1, %al
; SKX-NEXT: kmovw %k1, %ecx
; SKX-NEXT: andb $1, %cl
; SKX-NEXT: addb %cl, %cl
; SKX-NEXT: orb %al, %cl
; SKX-NEXT: kmovw %k2, %eax
; SKX-NEXT: andb $1, %al
; SKX-NEXT: shlb $2, %al
; SKX-NEXT: orb %cl, %al
; SKX-NEXT: vpmovsxdq %xmm1, %ymm1
; SKX-NEXT: vpsllq $2, %ymm1, %ymm1
; SKX-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; SKX-NEXT: testb $1, %al
; SKX-NEXT: je .LBB31_2
; SKX-NEXT: # %bb.1: # %cond.load
; SKX-NEXT: vmovq %xmm0, %rax
; SKX-NEXT: vpinsrd $0, (%rax), %xmm3, %xmm3
; SKX-NEXT: .LBB31_2: # %else
; SKX-NEXT: kshiftrb $1, %k0, %k1
; SKX-NEXT: kmovw %k1, %eax
; SKX-NEXT: testb $1, %al
; SKX-NEXT: je .LBB31_4
; SKX-NEXT: # %bb.3: # %cond.load1
; SKX-NEXT: vpextrq $1, %xmm0, %rax
; SKX-NEXT: vpinsrd $1, (%rax), %xmm3, %xmm3
; SKX-NEXT: jne .LBB31_1
; SKX-NEXT: # %bb.2: # %else
; SKX-NEXT: testb $2, %al
; SKX-NEXT: jne .LBB31_3
; SKX-NEXT: .LBB31_4: # %else2
; SKX-NEXT: kshiftrb $2, %k0, %k0
; SKX-NEXT: kmovw %k0, %eax
; SKX-NEXT: testb $1, %al
; SKX-NEXT: testb $4, %al
; SKX-NEXT: jne .LBB31_5
; SKX-NEXT: .LBB31_6: # %else5
; SKX-NEXT: vmovdqa %xmm3, %xmm0
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
; SKX-NEXT: .LBB31_1: # %cond.load
; SKX-NEXT: vmovq %xmm0, %rcx
; SKX-NEXT: vpinsrd $0, (%rcx), %xmm3, %xmm3
; SKX-NEXT: testb $2, %al
; SKX-NEXT: je .LBB31_4
; SKX-NEXT: .LBB31_3: # %cond.load1
; SKX-NEXT: vpextrq $1, %xmm0, %rcx
; SKX-NEXT: vpinsrd $1, (%rcx), %xmm3, %xmm3
; SKX-NEXT: testb $4, %al
; SKX-NEXT: je .LBB31_6
; SKX-NEXT: # %bb.5: # %cond.load4
; SKX-NEXT: .LBB31_5: # %cond.load4
; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0
; SKX-NEXT: vmovq %xmm0, %rax
; SKX-NEXT: vpinsrd $2, (%rax), %xmm3, %xmm3
; SKX-NEXT: .LBB31_6: # %else5
; SKX-NEXT: vmovdqa %xmm3, %xmm0
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
@ -1771,32 +1818,47 @@ define <3 x i32> @test30(<3 x i32*> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x
; SKX_32-NEXT: vmovdqa %xmm0, %xmm3
; SKX_32-NEXT: vpslld $31, %xmm2, %xmm0
; SKX_32-NEXT: vpmovd2m %xmm0, %k0
; SKX_32-NEXT: kshiftrb $1, %k0, %k1
; SKX_32-NEXT: kshiftrb $2, %k0, %k2
; SKX_32-NEXT: kmovw %k0, %eax
; SKX_32-NEXT: andb $1, %al
; SKX_32-NEXT: kmovw %k1, %ecx
; SKX_32-NEXT: andb $1, %cl
; SKX_32-NEXT: addb %cl, %cl
; SKX_32-NEXT: orb %al, %cl
; SKX_32-NEXT: kmovw %k2, %eax
; SKX_32-NEXT: andb $1, %al
; SKX_32-NEXT: shlb $2, %al
; SKX_32-NEXT: orb %cl, %al
; SKX_32-NEXT: vmovdqa {{[0-9]+}}(%esp), %xmm0
; SKX_32-NEXT: vpslld $2, %xmm1, %xmm1
; SKX_32-NEXT: vpaddd %xmm1, %xmm3, %xmm1
; SKX_32-NEXT: testb $1, %al
; SKX_32-NEXT: je .LBB31_2
; SKX_32-NEXT: # %bb.1: # %cond.load
; SKX_32-NEXT: vmovd %xmm1, %eax
; SKX_32-NEXT: vpinsrd $0, (%eax), %xmm0, %xmm0
; SKX_32-NEXT: .LBB31_2: # %else
; SKX_32-NEXT: kshiftrb $1, %k0, %k1
; SKX_32-NEXT: kmovw %k1, %eax
; SKX_32-NEXT: testb $1, %al
; SKX_32-NEXT: je .LBB31_4
; SKX_32-NEXT: # %bb.3: # %cond.load1
; SKX_32-NEXT: vpextrd $1, %xmm1, %eax
; SKX_32-NEXT: vpinsrd $1, (%eax), %xmm0, %xmm0
; SKX_32-NEXT: jne .LBB31_1
; SKX_32-NEXT: # %bb.2: # %else
; SKX_32-NEXT: testb $2, %al
; SKX_32-NEXT: jne .LBB31_3
; SKX_32-NEXT: .LBB31_4: # %else2
; SKX_32-NEXT: kshiftrb $2, %k0, %k0
; SKX_32-NEXT: kmovw %k0, %eax
; SKX_32-NEXT: testb $1, %al
; SKX_32-NEXT: testb $4, %al
; SKX_32-NEXT: jne .LBB31_5
; SKX_32-NEXT: .LBB31_6: # %else5
; SKX_32-NEXT: addl $12, %esp
; SKX_32-NEXT: .cfi_def_cfa_offset 4
; SKX_32-NEXT: retl
; SKX_32-NEXT: .LBB31_1: # %cond.load
; SKX_32-NEXT: .cfi_def_cfa_offset 16
; SKX_32-NEXT: vmovd %xmm1, %ecx
; SKX_32-NEXT: vpinsrd $0, (%ecx), %xmm0, %xmm0
; SKX_32-NEXT: testb $2, %al
; SKX_32-NEXT: je .LBB31_4
; SKX_32-NEXT: .LBB31_3: # %cond.load1
; SKX_32-NEXT: vpextrd $1, %xmm1, %ecx
; SKX_32-NEXT: vpinsrd $1, (%ecx), %xmm0, %xmm0
; SKX_32-NEXT: testb $4, %al
; SKX_32-NEXT: je .LBB31_6
; SKX_32-NEXT: # %bb.5: # %cond.load4
; SKX_32-NEXT: .LBB31_5: # %cond.load4
; SKX_32-NEXT: vpextrd $2, %xmm1, %eax
; SKX_32-NEXT: vpinsrd $2, (%eax), %xmm0, %xmm0
; SKX_32-NEXT: .LBB31_6: # %else5
; SKX_32-NEXT: addl $12, %esp
; SKX_32-NEXT: .cfi_def_cfa_offset 4
; SKX_32-NEXT: retl

View File

@ -122,20 +122,23 @@ define void @test_scatter_v2i32_index(<2 x double> %a1, double* %base, <2 x i32>
; WIDEN_AVX2-NEXT: vmovq %rdi, %xmm3
; WIDEN_AVX2-NEXT: vpbroadcastq %xmm3, %xmm3
; WIDEN_AVX2-NEXT: vpaddq %xmm1, %xmm3, %xmm1
; WIDEN_AVX2-NEXT: vpextrb $0, %xmm2, %eax
; WIDEN_AVX2-NEXT: testb $1, %al
; WIDEN_AVX2-NEXT: je .LBB1_2
; WIDEN_AVX2-NEXT: # %bb.1: # %cond.store
; WIDEN_AVX2-NEXT: vmovq %xmm1, %rax
; WIDEN_AVX2-NEXT: vmovlps %xmm0, (%rax)
; WIDEN_AVX2-NEXT: .LBB1_2: # %else
; WIDEN_AVX2-NEXT: vpextrb $8, %xmm2, %eax
; WIDEN_AVX2-NEXT: vpsllq $63, %xmm2, %xmm2
; WIDEN_AVX2-NEXT: vmovmskpd %xmm2, %eax
; WIDEN_AVX2-NEXT: testb $1, %al
; WIDEN_AVX2-NEXT: jne .LBB1_1
; WIDEN_AVX2-NEXT: # %bb.2: # %else
; WIDEN_AVX2-NEXT: testb $2, %al
; WIDEN_AVX2-NEXT: jne .LBB1_3
; WIDEN_AVX2-NEXT: .LBB1_4: # %else2
; WIDEN_AVX2-NEXT: retq
; WIDEN_AVX2-NEXT: .LBB1_1: # %cond.store
; WIDEN_AVX2-NEXT: vmovq %xmm1, %rcx
; WIDEN_AVX2-NEXT: vmovlps %xmm0, (%rcx)
; WIDEN_AVX2-NEXT: testb $2, %al
; WIDEN_AVX2-NEXT: je .LBB1_4
; WIDEN_AVX2-NEXT: # %bb.3: # %cond.store1
; WIDEN_AVX2-NEXT: .LBB1_3: # %cond.store1
; WIDEN_AVX2-NEXT: vpextrq $1, %xmm1, %rax
; WIDEN_AVX2-NEXT: vmovhps %xmm0, (%rax)
; WIDEN_AVX2-NEXT: .LBB1_4: # %else2
; WIDEN_AVX2-NEXT: retq
;
; PROMOTE_AVX2-LABEL: test_scatter_v2i32_index:
@ -147,20 +150,23 @@ define void @test_scatter_v2i32_index(<2 x double> %a1, double* %base, <2 x i32>
; PROMOTE_AVX2-NEXT: vmovq %rdi, %xmm3
; PROMOTE_AVX2-NEXT: vpbroadcastq %xmm3, %xmm3
; PROMOTE_AVX2-NEXT: vpaddq %xmm1, %xmm3, %xmm1
; PROMOTE_AVX2-NEXT: vpextrb $0, %xmm2, %eax
; PROMOTE_AVX2-NEXT: testb $1, %al
; PROMOTE_AVX2-NEXT: je .LBB1_2
; PROMOTE_AVX2-NEXT: # %bb.1: # %cond.store
; PROMOTE_AVX2-NEXT: vmovq %xmm1, %rax
; PROMOTE_AVX2-NEXT: vmovlps %xmm0, (%rax)
; PROMOTE_AVX2-NEXT: .LBB1_2: # %else
; PROMOTE_AVX2-NEXT: vpextrb $8, %xmm2, %eax
; PROMOTE_AVX2-NEXT: vpsllq $63, %xmm2, %xmm2
; PROMOTE_AVX2-NEXT: vmovmskpd %xmm2, %eax
; PROMOTE_AVX2-NEXT: testb $1, %al
; PROMOTE_AVX2-NEXT: jne .LBB1_1
; PROMOTE_AVX2-NEXT: # %bb.2: # %else
; PROMOTE_AVX2-NEXT: testb $2, %al
; PROMOTE_AVX2-NEXT: jne .LBB1_3
; PROMOTE_AVX2-NEXT: .LBB1_4: # %else2
; PROMOTE_AVX2-NEXT: retq
; PROMOTE_AVX2-NEXT: .LBB1_1: # %cond.store
; PROMOTE_AVX2-NEXT: vmovq %xmm1, %rcx
; PROMOTE_AVX2-NEXT: vmovlps %xmm0, (%rcx)
; PROMOTE_AVX2-NEXT: testb $2, %al
; PROMOTE_AVX2-NEXT: je .LBB1_4
; PROMOTE_AVX2-NEXT: # %bb.3: # %cond.store1
; PROMOTE_AVX2-NEXT: .LBB1_3: # %cond.store1
; PROMOTE_AVX2-NEXT: vpextrq $1, %xmm1, %rax
; PROMOTE_AVX2-NEXT: vmovhps %xmm0, (%rax)
; PROMOTE_AVX2-NEXT: .LBB1_4: # %else2
; PROMOTE_AVX2-NEXT: retq
%gep = getelementptr double, double *%base, <2 x i32> %ind
call void @llvm.masked.scatter.v2f64.v2p0f64(<2 x double> %a1, <2 x double*> %gep, i32 4, <2 x i1> %mask)
@ -273,38 +279,44 @@ define void @test_scatter_v2i32_data(<2 x i32>%a1, <2 x i32*> %ptr, <2 x i1>%mas
;
; WIDEN_AVX2-LABEL: test_scatter_v2i32_data:
; WIDEN_AVX2: # %bb.0:
; WIDEN_AVX2-NEXT: vpextrb $0, %xmm2, %eax
; WIDEN_AVX2-NEXT: testb $1, %al
; WIDEN_AVX2-NEXT: je .LBB3_2
; WIDEN_AVX2-NEXT: # %bb.1: # %cond.store
; WIDEN_AVX2-NEXT: vmovq %xmm1, %rax
; WIDEN_AVX2-NEXT: vmovss %xmm0, (%rax)
; WIDEN_AVX2-NEXT: .LBB3_2: # %else
; WIDEN_AVX2-NEXT: vpextrb $8, %xmm2, %eax
; WIDEN_AVX2-NEXT: vpsllq $63, %xmm2, %xmm2
; WIDEN_AVX2-NEXT: vmovmskpd %xmm2, %eax
; WIDEN_AVX2-NEXT: testb $1, %al
; WIDEN_AVX2-NEXT: jne .LBB3_1
; WIDEN_AVX2-NEXT: # %bb.2: # %else
; WIDEN_AVX2-NEXT: testb $2, %al
; WIDEN_AVX2-NEXT: jne .LBB3_3
; WIDEN_AVX2-NEXT: .LBB3_4: # %else2
; WIDEN_AVX2-NEXT: retq
; WIDEN_AVX2-NEXT: .LBB3_1: # %cond.store
; WIDEN_AVX2-NEXT: vmovq %xmm1, %rcx
; WIDEN_AVX2-NEXT: vmovss %xmm0, (%rcx)
; WIDEN_AVX2-NEXT: testb $2, %al
; WIDEN_AVX2-NEXT: je .LBB3_4
; WIDEN_AVX2-NEXT: # %bb.3: # %cond.store1
; WIDEN_AVX2-NEXT: .LBB3_3: # %cond.store1
; WIDEN_AVX2-NEXT: vpextrq $1, %xmm1, %rax
; WIDEN_AVX2-NEXT: vextractps $1, %xmm0, (%rax)
; WIDEN_AVX2-NEXT: .LBB3_4: # %else2
; WIDEN_AVX2-NEXT: retq
;
; PROMOTE_AVX2-LABEL: test_scatter_v2i32_data:
; PROMOTE_AVX2: # %bb.0:
; PROMOTE_AVX2-NEXT: vpextrb $0, %xmm2, %eax
; PROMOTE_AVX2-NEXT: testb $1, %al
; PROMOTE_AVX2-NEXT: je .LBB3_2
; PROMOTE_AVX2-NEXT: # %bb.1: # %cond.store
; PROMOTE_AVX2-NEXT: vmovq %xmm1, %rax
; PROMOTE_AVX2-NEXT: vmovss %xmm0, (%rax)
; PROMOTE_AVX2-NEXT: .LBB3_2: # %else
; PROMOTE_AVX2-NEXT: vpextrb $8, %xmm2, %eax
; PROMOTE_AVX2-NEXT: vpsllq $63, %xmm2, %xmm2
; PROMOTE_AVX2-NEXT: vmovmskpd %xmm2, %eax
; PROMOTE_AVX2-NEXT: testb $1, %al
; PROMOTE_AVX2-NEXT: jne .LBB3_1
; PROMOTE_AVX2-NEXT: # %bb.2: # %else
; PROMOTE_AVX2-NEXT: testb $2, %al
; PROMOTE_AVX2-NEXT: jne .LBB3_3
; PROMOTE_AVX2-NEXT: .LBB3_4: # %else2
; PROMOTE_AVX2-NEXT: retq
; PROMOTE_AVX2-NEXT: .LBB3_1: # %cond.store
; PROMOTE_AVX2-NEXT: vmovq %xmm1, %rcx
; PROMOTE_AVX2-NEXT: vmovss %xmm0, (%rcx)
; PROMOTE_AVX2-NEXT: testb $2, %al
; PROMOTE_AVX2-NEXT: je .LBB3_4
; PROMOTE_AVX2-NEXT: # %bb.3: # %cond.store1
; PROMOTE_AVX2-NEXT: .LBB3_3: # %cond.store1
; PROMOTE_AVX2-NEXT: vpextrq $1, %xmm1, %rax
; PROMOTE_AVX2-NEXT: vextractps $2, %xmm0, (%rax)
; PROMOTE_AVX2-NEXT: .LBB3_4: # %else2
; PROMOTE_AVX2-NEXT: retq
call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> %a1, <2 x i32*> %ptr, i32 4, <2 x i1> %mask)
ret void
@ -425,20 +437,23 @@ define void @test_scatter_v2i32_data_index(<2 x i32> %a1, i32* %base, <2 x i32>
; WIDEN_AVX2-NEXT: vmovq %rdi, %xmm3
; WIDEN_AVX2-NEXT: vpbroadcastq %xmm3, %xmm3
; WIDEN_AVX2-NEXT: vpaddq %xmm1, %xmm3, %xmm1
; WIDEN_AVX2-NEXT: vpextrb $0, %xmm2, %eax
; WIDEN_AVX2-NEXT: testb $1, %al
; WIDEN_AVX2-NEXT: je .LBB5_2
; WIDEN_AVX2-NEXT: # %bb.1: # %cond.store
; WIDEN_AVX2-NEXT: vmovq %xmm1, %rax
; WIDEN_AVX2-NEXT: vmovss %xmm0, (%rax)
; WIDEN_AVX2-NEXT: .LBB5_2: # %else
; WIDEN_AVX2-NEXT: vpextrb $8, %xmm2, %eax
; WIDEN_AVX2-NEXT: vpsllq $63, %xmm2, %xmm2
; WIDEN_AVX2-NEXT: vmovmskpd %xmm2, %eax
; WIDEN_AVX2-NEXT: testb $1, %al
; WIDEN_AVX2-NEXT: jne .LBB5_1
; WIDEN_AVX2-NEXT: # %bb.2: # %else
; WIDEN_AVX2-NEXT: testb $2, %al
; WIDEN_AVX2-NEXT: jne .LBB5_3
; WIDEN_AVX2-NEXT: .LBB5_4: # %else2
; WIDEN_AVX2-NEXT: retq
; WIDEN_AVX2-NEXT: .LBB5_1: # %cond.store
; WIDEN_AVX2-NEXT: vmovq %xmm1, %rcx
; WIDEN_AVX2-NEXT: vmovss %xmm0, (%rcx)
; WIDEN_AVX2-NEXT: testb $2, %al
; WIDEN_AVX2-NEXT: je .LBB5_4
; WIDEN_AVX2-NEXT: # %bb.3: # %cond.store1
; WIDEN_AVX2-NEXT: .LBB5_3: # %cond.store1
; WIDEN_AVX2-NEXT: vpextrq $1, %xmm1, %rax
; WIDEN_AVX2-NEXT: vextractps $1, %xmm0, (%rax)
; WIDEN_AVX2-NEXT: .LBB5_4: # %else2
; WIDEN_AVX2-NEXT: retq
;
; PROMOTE_AVX2-LABEL: test_scatter_v2i32_data_index:
@ -450,20 +465,23 @@ define void @test_scatter_v2i32_data_index(<2 x i32> %a1, i32* %base, <2 x i32>
; PROMOTE_AVX2-NEXT: vmovq %rdi, %xmm3
; PROMOTE_AVX2-NEXT: vpbroadcastq %xmm3, %xmm3
; PROMOTE_AVX2-NEXT: vpaddq %xmm1, %xmm3, %xmm1
; PROMOTE_AVX2-NEXT: vpextrb $0, %xmm2, %eax
; PROMOTE_AVX2-NEXT: testb $1, %al
; PROMOTE_AVX2-NEXT: je .LBB5_2
; PROMOTE_AVX2-NEXT: # %bb.1: # %cond.store
; PROMOTE_AVX2-NEXT: vmovq %xmm1, %rax
; PROMOTE_AVX2-NEXT: vmovss %xmm0, (%rax)
; PROMOTE_AVX2-NEXT: .LBB5_2: # %else
; PROMOTE_AVX2-NEXT: vpextrb $8, %xmm2, %eax
; PROMOTE_AVX2-NEXT: vpsllq $63, %xmm2, %xmm2
; PROMOTE_AVX2-NEXT: vmovmskpd %xmm2, %eax
; PROMOTE_AVX2-NEXT: testb $1, %al
; PROMOTE_AVX2-NEXT: jne .LBB5_1
; PROMOTE_AVX2-NEXT: # %bb.2: # %else
; PROMOTE_AVX2-NEXT: testb $2, %al
; PROMOTE_AVX2-NEXT: jne .LBB5_3
; PROMOTE_AVX2-NEXT: .LBB5_4: # %else2
; PROMOTE_AVX2-NEXT: retq
; PROMOTE_AVX2-NEXT: .LBB5_1: # %cond.store
; PROMOTE_AVX2-NEXT: vmovq %xmm1, %rcx
; PROMOTE_AVX2-NEXT: vmovss %xmm0, (%rcx)
; PROMOTE_AVX2-NEXT: testb $2, %al
; PROMOTE_AVX2-NEXT: je .LBB5_4
; PROMOTE_AVX2-NEXT: # %bb.3: # %cond.store1
; PROMOTE_AVX2-NEXT: .LBB5_3: # %cond.store1
; PROMOTE_AVX2-NEXT: vpextrq $1, %xmm1, %rax
; PROMOTE_AVX2-NEXT: vextractps $2, %xmm0, (%rax)
; PROMOTE_AVX2-NEXT: .LBB5_4: # %else2
; PROMOTE_AVX2-NEXT: retq
%gep = getelementptr i32, i32 *%base, <2 x i32> %ind
call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> %a1, <2 x i32*> %gep, i32 4, <2 x i1> %mask)

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -3,8 +3,10 @@
define <2 x i64> @scalarize_v2i64(<2 x i64*> %p, <2 x i1> %mask, <2 x i64> %passthru) {
; CHECK-LABEL: @scalarize_v2i64(
; CHECK-NEXT: [[MASK0:%.*]] = extractelement <2 x i1> [[MASK:%.*]], i64 0
; CHECK-NEXT: br i1 [[MASK0]], label [[COND_LOAD:%.*]], label [[ELSE:%.*]]
; CHECK-NEXT: [[SCALAR_MASK:%.*]] = bitcast <2 x i1> [[MASK:%.*]] to i2
; CHECK-NEXT: [[TMP1:%.*]] = and i2 [[SCALAR_MASK]], 1
; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i2 [[TMP1]], 0
; CHECK-NEXT: br i1 [[TMP2]], label [[COND_LOAD:%.*]], label [[ELSE:%.*]]
; CHECK: cond.load:
; CHECK-NEXT: [[PTR0:%.*]] = extractelement <2 x i64*> [[P:%.*]], i64 0
; CHECK-NEXT: [[LOAD0:%.*]] = load i64, i64* [[PTR0]], align 8
@ -12,8 +14,9 @@ define <2 x i64> @scalarize_v2i64(<2 x i64*> %p, <2 x i1> %mask, <2 x i64> %pass
; CHECK-NEXT: br label [[ELSE]]
; CHECK: else:
; CHECK-NEXT: [[RES_PHI_ELSE:%.*]] = phi <2 x i64> [ [[RES0]], [[COND_LOAD]] ], [ [[PASSTHRU]], [[TMP0:%.*]] ]
; CHECK-NEXT: [[MASK1:%.*]] = extractelement <2 x i1> [[MASK]], i64 1
; CHECK-NEXT: br i1 [[MASK1]], label [[COND_LOAD1:%.*]], label [[ELSE2:%.*]]
; CHECK-NEXT: [[TMP3:%.*]] = and i2 [[SCALAR_MASK]], -2
; CHECK-NEXT: [[TMP4:%.*]] = icmp ne i2 [[TMP3]], 0
; CHECK-NEXT: br i1 [[TMP4]], label [[COND_LOAD1:%.*]], label [[ELSE2:%.*]]
; CHECK: cond.load1:
; CHECK-NEXT: [[PTR1:%.*]] = extractelement <2 x i64*> [[P]], i64 1
; CHECK-NEXT: [[LOAD1:%.*]] = load i64, i64* [[PTR1]], align 8

View File

@ -4,24 +4,27 @@
define <2 x i64> @scalarize_v2i64(<2 x i64>* %p, <2 x i1> %mask, <2 x i64> %passthru) {
; CHECK-LABEL: @scalarize_v2i64(
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64>* [[P:%.*]] to i64*
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i1> [[MASK:%.*]], i64 0
; CHECK-NEXT: br i1 [[TMP2]], label [[COND_LOAD:%.*]], label [[ELSE:%.*]]
; CHECK-NEXT: [[SCALAR_MASK:%.*]] = bitcast <2 x i1> [[MASK:%.*]] to i2
; CHECK-NEXT: [[TMP2:%.*]] = and i2 [[SCALAR_MASK]], 1
; CHECK-NEXT: [[TMP3:%.*]] = icmp ne i2 [[TMP2]], 0
; CHECK-NEXT: br i1 [[TMP3]], label [[COND_LOAD:%.*]], label [[ELSE:%.*]]
; CHECK: cond.load:
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, i64* [[TMP1]], i32 0
; CHECK-NEXT: [[TMP4:%.*]] = load i64, i64* [[TMP3]], align 8
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i64> [[PASSTHRU:%.*]], i64 [[TMP4]], i64 0
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, i64* [[TMP1]], i32 0
; CHECK-NEXT: [[TMP5:%.*]] = load i64, i64* [[TMP4]], align 8
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[PASSTHRU:%.*]], i64 [[TMP5]], i64 0
; CHECK-NEXT: br label [[ELSE]]
; CHECK: else:
; CHECK-NEXT: [[RES_PHI_ELSE:%.*]] = phi <2 x i64> [ [[TMP5]], [[COND_LOAD]] ], [ [[PASSTHRU]], [[TMP0:%.*]] ]
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i1> [[MASK]], i64 1
; CHECK-NEXT: br i1 [[TMP6]], label [[COND_LOAD1:%.*]], label [[ELSE2:%.*]]
; CHECK-NEXT: [[RES_PHI_ELSE:%.*]] = phi <2 x i64> [ [[TMP6]], [[COND_LOAD]] ], [ [[PASSTHRU]], [[TMP0:%.*]] ]
; CHECK-NEXT: [[TMP7:%.*]] = and i2 [[SCALAR_MASK]], -2
; CHECK-NEXT: [[TMP8:%.*]] = icmp ne i2 [[TMP7]], 0
; CHECK-NEXT: br i1 [[TMP8]], label [[COND_LOAD1:%.*]], label [[ELSE2:%.*]]
; CHECK: cond.load1:
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, i64* [[TMP1]], i32 1
; CHECK-NEXT: [[TMP8:%.*]] = load i64, i64* [[TMP7]], align 8
; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i64> [[RES_PHI_ELSE]], i64 [[TMP8]], i64 1
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, i64* [[TMP1]], i32 1
; CHECK-NEXT: [[TMP10:%.*]] = load i64, i64* [[TMP9]], align 8
; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x i64> [[RES_PHI_ELSE]], i64 [[TMP10]], i64 1
; CHECK-NEXT: br label [[ELSE2]]
; CHECK: else2:
; CHECK-NEXT: [[RES_PHI_ELSE3:%.*]] = phi <2 x i64> [ [[TMP9]], [[COND_LOAD1]] ], [ [[RES_PHI_ELSE]], [[ELSE]] ]
; CHECK-NEXT: [[RES_PHI_ELSE3:%.*]] = phi <2 x i64> [ [[TMP11]], [[COND_LOAD1]] ], [ [[RES_PHI_ELSE]], [[ELSE]] ]
; CHECK-NEXT: ret <2 x i64> [[RES_PHI_ELSE3]]
;
%ret = call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* %p, i32 128, <2 x i1> %mask, <2 x i64> %passthru)
@ -62,24 +65,27 @@ define <2 x i64> @scalarize_v2i64_const_mask(<2 x i64>* %p, <2 x i64> %passthru)
define <2 x i24> @scalarize_v2i24(<2 x i24>* %p, <2 x i1> %mask, <2 x i24> %passthru) {
; CHECK-LABEL: @scalarize_v2i24(
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i24>* [[P:%.*]] to i24*
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i1> [[MASK:%.*]], i64 0
; CHECK-NEXT: br i1 [[TMP2]], label [[COND_LOAD:%.*]], label [[ELSE:%.*]]
; CHECK-NEXT: [[SCALAR_MASK:%.*]] = bitcast <2 x i1> [[MASK:%.*]] to i2
; CHECK-NEXT: [[TMP2:%.*]] = and i2 [[SCALAR_MASK]], 1
; CHECK-NEXT: [[TMP3:%.*]] = icmp ne i2 [[TMP2]], 0
; CHECK-NEXT: br i1 [[TMP3]], label [[COND_LOAD:%.*]], label [[ELSE:%.*]]
; CHECK: cond.load:
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i24, i24* [[TMP1]], i32 0
; CHECK-NEXT: [[TMP4:%.*]] = load i24, i24* [[TMP3]], align 1
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i24> [[PASSTHRU:%.*]], i24 [[TMP4]], i64 0
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i24, i24* [[TMP1]], i32 0
; CHECK-NEXT: [[TMP5:%.*]] = load i24, i24* [[TMP4]], align 1
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i24> [[PASSTHRU:%.*]], i24 [[TMP5]], i64 0
; CHECK-NEXT: br label [[ELSE]]
; CHECK: else:
; CHECK-NEXT: [[RES_PHI_ELSE:%.*]] = phi <2 x i24> [ [[TMP5]], [[COND_LOAD]] ], [ [[PASSTHRU]], [[TMP0:%.*]] ]
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i1> [[MASK]], i64 1
; CHECK-NEXT: br i1 [[TMP6]], label [[COND_LOAD1:%.*]], label [[ELSE2:%.*]]
; CHECK-NEXT: [[RES_PHI_ELSE:%.*]] = phi <2 x i24> [ [[TMP6]], [[COND_LOAD]] ], [ [[PASSTHRU]], [[TMP0:%.*]] ]
; CHECK-NEXT: [[TMP7:%.*]] = and i2 [[SCALAR_MASK]], -2
; CHECK-NEXT: [[TMP8:%.*]] = icmp ne i2 [[TMP7]], 0
; CHECK-NEXT: br i1 [[TMP8]], label [[COND_LOAD1:%.*]], label [[ELSE2:%.*]]
; CHECK: cond.load1:
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i24, i24* [[TMP1]], i32 1
; CHECK-NEXT: [[TMP8:%.*]] = load i24, i24* [[TMP7]], align 1
; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i24> [[RES_PHI_ELSE]], i24 [[TMP8]], i64 1
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i24, i24* [[TMP1]], i32 1
; CHECK-NEXT: [[TMP10:%.*]] = load i24, i24* [[TMP9]], align 1
; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x i24> [[RES_PHI_ELSE]], i24 [[TMP10]], i64 1
; CHECK-NEXT: br label [[ELSE2]]
; CHECK: else2:
; CHECK-NEXT: [[RES_PHI_ELSE3:%.*]] = phi <2 x i24> [ [[TMP9]], [[COND_LOAD1]] ], [ [[RES_PHI_ELSE]], [[ELSE]] ]
; CHECK-NEXT: [[RES_PHI_ELSE3:%.*]] = phi <2 x i24> [ [[TMP11]], [[COND_LOAD1]] ], [ [[RES_PHI_ELSE]], [[ELSE]] ]
; CHECK-NEXT: ret <2 x i24> [[RES_PHI_ELSE3]]
;
%ret = call <2 x i24> @llvm.masked.load.v2i24.p0v2i24(<2 x i24>* %p, i32 8, <2 x i1> %mask, <2 x i24> %passthru)
@ -90,24 +96,27 @@ define <2 x i24> @scalarize_v2i24(<2 x i24>* %p, <2 x i1> %mask, <2 x i24> %pass
define <2 x i48> @scalarize_v2i48(<2 x i48>* %p, <2 x i1> %mask, <2 x i48> %passthru) {
; CHECK-LABEL: @scalarize_v2i48(
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i48>* [[P:%.*]] to i48*
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i1> [[MASK:%.*]], i64 0
; CHECK-NEXT: br i1 [[TMP2]], label [[COND_LOAD:%.*]], label [[ELSE:%.*]]
; CHECK-NEXT: [[SCALAR_MASK:%.*]] = bitcast <2 x i1> [[MASK:%.*]] to i2
; CHECK-NEXT: [[TMP2:%.*]] = and i2 [[SCALAR_MASK]], 1
; CHECK-NEXT: [[TMP3:%.*]] = icmp ne i2 [[TMP2]], 0
; CHECK-NEXT: br i1 [[TMP3]], label [[COND_LOAD:%.*]], label [[ELSE:%.*]]
; CHECK: cond.load:
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i48, i48* [[TMP1]], i32 0
; CHECK-NEXT: [[TMP4:%.*]] = load i48, i48* [[TMP3]], align 2
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i48> [[PASSTHRU:%.*]], i48 [[TMP4]], i64 0
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i48, i48* [[TMP1]], i32 0
; CHECK-NEXT: [[TMP5:%.*]] = load i48, i48* [[TMP4]], align 2
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i48> [[PASSTHRU:%.*]], i48 [[TMP5]], i64 0
; CHECK-NEXT: br label [[ELSE]]
; CHECK: else:
; CHECK-NEXT: [[RES_PHI_ELSE:%.*]] = phi <2 x i48> [ [[TMP5]], [[COND_LOAD]] ], [ [[PASSTHRU]], [[TMP0:%.*]] ]
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i1> [[MASK]], i64 1
; CHECK-NEXT: br i1 [[TMP6]], label [[COND_LOAD1:%.*]], label [[ELSE2:%.*]]
; CHECK-NEXT: [[RES_PHI_ELSE:%.*]] = phi <2 x i48> [ [[TMP6]], [[COND_LOAD]] ], [ [[PASSTHRU]], [[TMP0:%.*]] ]
; CHECK-NEXT: [[TMP7:%.*]] = and i2 [[SCALAR_MASK]], -2
; CHECK-NEXT: [[TMP8:%.*]] = icmp ne i2 [[TMP7]], 0
; CHECK-NEXT: br i1 [[TMP8]], label [[COND_LOAD1:%.*]], label [[ELSE2:%.*]]
; CHECK: cond.load1:
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i48, i48* [[TMP1]], i32 1
; CHECK-NEXT: [[TMP8:%.*]] = load i48, i48* [[TMP7]], align 2
; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i48> [[RES_PHI_ELSE]], i48 [[TMP8]], i64 1
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i48, i48* [[TMP1]], i32 1
; CHECK-NEXT: [[TMP10:%.*]] = load i48, i48* [[TMP9]], align 2
; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x i48> [[RES_PHI_ELSE]], i48 [[TMP10]], i64 1
; CHECK-NEXT: br label [[ELSE2]]
; CHECK: else2:
; CHECK-NEXT: [[RES_PHI_ELSE3:%.*]] = phi <2 x i48> [ [[TMP9]], [[COND_LOAD1]] ], [ [[RES_PHI_ELSE]], [[ELSE]] ]
; CHECK-NEXT: [[RES_PHI_ELSE3:%.*]] = phi <2 x i48> [ [[TMP11]], [[COND_LOAD1]] ], [ [[RES_PHI_ELSE]], [[ELSE]] ]
; CHECK-NEXT: ret <2 x i48> [[RES_PHI_ELSE3]]
;
%ret = call <2 x i48> @llvm.masked.load.v2i48.p0v2i48(<2 x i48>* %p, i32 16, <2 x i1> %mask, <2 x i48> %passthru)

View File

@ -4,20 +4,23 @@
define void @scalarize_v2i64(<2 x i64>* %p, <2 x i1> %mask, <2 x i64> %data) {
; CHECK-LABEL: @scalarize_v2i64(
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64>* [[P:%.*]] to i64*
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i1> [[MASK:%.*]], i64 0
; CHECK-NEXT: br i1 [[TMP2]], label [[COND_STORE:%.*]], label [[ELSE:%.*]]
; CHECK-NEXT: [[SCALAR_MASK:%.*]] = bitcast <2 x i1> [[MASK:%.*]] to i2
; CHECK-NEXT: [[TMP2:%.*]] = and i2 [[SCALAR_MASK]], 1
; CHECK-NEXT: [[TMP3:%.*]] = icmp ne i2 [[TMP2]], 0
; CHECK-NEXT: br i1 [[TMP3]], label [[COND_STORE:%.*]], label [[ELSE:%.*]]
; CHECK: cond.store:
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[DATA:%.*]], i64 0
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, i64* [[TMP1]], i32 0
; CHECK-NEXT: store i64 [[TMP3]], i64* [[TMP4]], align 8
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[DATA:%.*]], i64 0
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, i64* [[TMP1]], i32 0
; CHECK-NEXT: store i64 [[TMP4]], i64* [[TMP5]], align 8
; CHECK-NEXT: br label [[ELSE]]
; CHECK: else:
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[MASK]], i64 1
; CHECK-NEXT: br i1 [[TMP5]], label [[COND_STORE1:%.*]], label [[ELSE2:%.*]]
; CHECK-NEXT: [[TMP6:%.*]] = and i2 [[SCALAR_MASK]], -2
; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i2 [[TMP6]], 0
; CHECK-NEXT: br i1 [[TMP7]], label [[COND_STORE1:%.*]], label [[ELSE2:%.*]]
; CHECK: cond.store1:
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[DATA]], i64 1
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, i64* [[TMP1]], i32 1
; CHECK-NEXT: store i64 [[TMP6]], i64* [[TMP7]], align 8
; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[DATA]], i64 1
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, i64* [[TMP1]], i32 1
; CHECK-NEXT: store i64 [[TMP8]], i64* [[TMP9]], align 8
; CHECK-NEXT: br label [[ELSE2]]
; CHECK: else2:
; CHECK-NEXT: ret void