
[X86] Break non-power of 2 vXi1 vectors into scalars for argument passing with avx512.

This generates worse code, but matches what is done for avx2 and
prevents crashes when more arguments are passed than we have
registers for.

llvm-svn: 372200
Craig Topper 2019-09-18 06:06:11 +00:00
parent 043fb9113b
commit a5385636e7
4 changed files with 2048 additions and 557 deletions
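
To make the new rule concrete, below is a small standalone C++ sketch of the classification this patch installs in X86TargetLowering::getRegisterTypeForCallingConv (see the hunks that follow). The MaskArgKind enum, classifyMaskArg, and the local isPowerOf2_32 helper are illustrative only, not LLVM's actual interfaces: a vXi1 argument whose element count is not a power of two, or is wider than the available mask registers, is broken into i8 scalars.

// Standalone sketch of the vXi1 argument-classification rule added here.
// NumElts is the element count of a vXi1 argument on an AVX-512 target;
// HasBWI models the AVX512BW feature. Names are hypothetical, not LLVM's.
#include <cstdio>

enum class MaskArgKind {
  PromoteToV32I8, // v32i1 without AVX512BW is promoted to v32i8 (matches AVX2)
  ScalarizeToI8,  // broken into one i8 per element for argument passing
  Default         // left to the generic legalization path
};

static bool isPowerOf2_32(unsigned V) { return V != 0 && (V & (V - 1)) == 0; }

static MaskArgKind classifyMaskArg(unsigned NumElts, bool HasBWI) {
  if (NumElts == 32 && !HasBWI)
    return MaskArgKind::PromoteToV32I8;
  // The !isPowerOf2_32 clause is what this commit adds; the width checks
  // mirror the updated conditions in the hunks below.
  if (!isPowerOf2_32(NumElts) || (NumElts > 16 && !HasBWI) ||
      (NumElts > 64 && HasBWI))
    return MaskArgKind::ScalarizeToI8;
  return MaskArgKind::Default;
}

int main() {
  // <3 x i1>: odd element count, now passed as three i8 scalars (visible as
  // the %dil/%sil/%dl handling in the updated test30 checks further down).
  std::printf("v3i1  (no BWI): %d\n", (int)classifyMaskArg(3, false));
  // <16 x i1>: power of two and narrow enough, so these checks leave it alone.
  std::printf("v16i1 (no BWI): %d\n", (int)classifyMaskArg(16, false));
  // <128 x i1>: wider than the 64-bit mask registers even with BWI.
  std::printf("v128i1 (BWI):   %d\n", (int)classifyMaskArg(128, true));
  return 0;
}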


@@ -1996,14 +1996,16 @@ X86TargetLowering::getPreferredVectorAction(MVT VT) const {
MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
CallingConv::ID CC,
EVT VT) const {
// Break wide vXi1 vectors into scalars to match avx2 behavior.
if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
Subtarget.hasAVX512() &&
((VT.getVectorNumElements() > 32 && !Subtarget.hasBWI()) ||
(VT.getVectorNumElements() > 64 && Subtarget.hasBWI())))
return MVT::i8;
// v32i1 vectors should be promoted to v32i8 to match avx2.
if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI())
return MVT::v32i8;
// Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
Subtarget.hasAVX512() &&
(!isPowerOf2_32(VT.getVectorNumElements()) ||
(VT.getVectorNumElements() > 16 && !Subtarget.hasBWI()) ||
(VT.getVectorNumElements() > 64 && Subtarget.hasBWI())))
return MVT::i8;
// FIXME: Should we just make these types legal and custom split operations?
if ((VT == MVT::v32i16 || VT == MVT::v64i8) &&
Subtarget.hasAVX512() && !Subtarget.hasBWI() && !EnableOldKNLABI)
@@ -2014,14 +2016,16 @@ MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
CallingConv::ID CC,
EVT VT) const {
// Break wide vXi1 vectors into scalars to match avx2 behavior.
if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
Subtarget.hasAVX512() &&
((VT.getVectorNumElements() > 32 && !Subtarget.hasBWI()) ||
(VT.getVectorNumElements() > 64 && Subtarget.hasBWI())))
return VT.getVectorNumElements();
// v32i1 vectors should be promoted to v32i8 to match avx2.
if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI())
return 1;
// Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
Subtarget.hasAVX512() &&
(!isPowerOf2_32(VT.getVectorNumElements()) ||
(VT.getVectorNumElements() > 16 && !Subtarget.hasBWI()) ||
(VT.getVectorNumElements() > 64 && Subtarget.hasBWI())))
return VT.getVectorNumElements();
// FIXME: Should we just make these types legal and custom split operations?
if ((VT == MVT::v32i16 || VT == MVT::v64i8) &&
Subtarget.hasAVX512() && !Subtarget.hasBWI() && !EnableOldKNLABI)
@@ -2032,10 +2036,11 @@ unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
unsigned X86TargetLowering::getVectorTypeBreakdownForCallingConv(
LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
unsigned &NumIntermediates, MVT &RegisterVT) const {
// Break wide vXi1 vectors into scalars to match avx2 behavior.
// Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
Subtarget.hasAVX512() &&
((VT.getVectorNumElements() > 32 && !Subtarget.hasBWI()) ||
(!isPowerOf2_32(VT.getVectorNumElements()) ||
(VT.getVectorNumElements() > 16 && !Subtarget.hasBWI()) ||
(VT.getVectorNumElements() > 64 && Subtarget.hasBWI()))) {
RegisterVT = MVT::i8;
IntermediateVT = MVT::i1;

File diff suppressed because it is too large.


@@ -1626,75 +1626,62 @@ declare <3 x i32> @llvm.masked.gather.v3i32.v3p0i32(<3 x i32*>, i32, <3 x i1>, <
define <3 x i32> @test30(<3 x i32*> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x i32> %src0) {
; KNL_64-LABEL: test30:
; KNL_64: # %bb.0:
; KNL_64-NEXT: vpslld $31, %xmm2, %xmm2
; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k0
; KNL_64-NEXT: kshiftrw $1, %k0, %k1
; KNL_64-NEXT: kshiftrw $2, %k0, %k2
; KNL_64-NEXT: kmovw %k0, %eax
; KNL_64-NEXT: andb $1, %al
; KNL_64-NEXT: kmovw %k1, %ecx
; KNL_64-NEXT: andb $1, %cl
; KNL_64-NEXT: addb %cl, %cl
; KNL_64-NEXT: orb %al, %cl
; KNL_64-NEXT: kmovw %k2, %eax
; KNL_64-NEXT: andb $1, %al
; KNL_64-NEXT: shlb $2, %al
; KNL_64-NEXT: orb %cl, %al
; KNL_64-NEXT: andb $1, %dil
; KNL_64-NEXT: andb $1, %sil
; KNL_64-NEXT: addb %sil, %sil
; KNL_64-NEXT: orb %dil, %sil
; KNL_64-NEXT: andb $1, %dl
; KNL_64-NEXT: shlb $2, %dl
; KNL_64-NEXT: orb %sil, %dl
; KNL_64-NEXT: vpmovsxdq %xmm1, %ymm1
; KNL_64-NEXT: vpsllq $2, %ymm1, %ymm1
; KNL_64-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; KNL_64-NEXT: testb $1, %al
; KNL_64-NEXT: testb $1, %dl
; KNL_64-NEXT: jne .LBB31_1
; KNL_64-NEXT: # %bb.2: # %else
; KNL_64-NEXT: testb $2, %al
; KNL_64-NEXT: testb $2, %dl
; KNL_64-NEXT: jne .LBB31_3
; KNL_64-NEXT: .LBB31_4: # %else2
; KNL_64-NEXT: testb $4, %al
; KNL_64-NEXT: testb $4, %dl
; KNL_64-NEXT: jne .LBB31_5
; KNL_64-NEXT: .LBB31_6: # %else5
; KNL_64-NEXT: vmovdqa %xmm3, %xmm0
; KNL_64-NEXT: vmovdqa %xmm2, %xmm0
; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
; KNL_64-NEXT: .LBB31_1: # %cond.load
; KNL_64-NEXT: vmovq %xmm0, %rcx
; KNL_64-NEXT: vpinsrd $0, (%rcx), %xmm3, %xmm3
; KNL_64-NEXT: testb $2, %al
; KNL_64-NEXT: vmovq %xmm0, %rax
; KNL_64-NEXT: vpinsrd $0, (%rax), %xmm2, %xmm2
; KNL_64-NEXT: testb $2, %dl
; KNL_64-NEXT: je .LBB31_4
; KNL_64-NEXT: .LBB31_3: # %cond.load1
; KNL_64-NEXT: vpextrq $1, %xmm0, %rcx
; KNL_64-NEXT: vpinsrd $1, (%rcx), %xmm3, %xmm3
; KNL_64-NEXT: testb $4, %al
; KNL_64-NEXT: vpextrq $1, %xmm0, %rax
; KNL_64-NEXT: vpinsrd $1, (%rax), %xmm2, %xmm2
; KNL_64-NEXT: testb $4, %dl
; KNL_64-NEXT: je .LBB31_6
; KNL_64-NEXT: .LBB31_5: # %cond.load4
; KNL_64-NEXT: vextracti128 $1, %ymm0, %xmm0
; KNL_64-NEXT: vmovq %xmm0, %rax
; KNL_64-NEXT: vpinsrd $2, (%rax), %xmm3, %xmm3
; KNL_64-NEXT: vmovdqa %xmm3, %xmm0
; KNL_64-NEXT: vpinsrd $2, (%rax), %xmm2, %xmm2
; KNL_64-NEXT: vmovdqa %xmm2, %xmm0
; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test30:
; KNL_32: # %bb.0:
; KNL_32-NEXT: subl $12, %esp
; KNL_32-NEXT: .cfi_def_cfa_offset 16
; KNL_32-NEXT: vmovdqa %xmm0, %xmm3
; KNL_32-NEXT: vpslld $31, %xmm2, %xmm0
; KNL_32-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL_32-NEXT: kshiftrw $1, %k0, %k1
; KNL_32-NEXT: kshiftrw $2, %k0, %k2
; KNL_32-NEXT: kmovw %k0, %eax
; KNL_32-NEXT: pushl %eax
; KNL_32-NEXT: .cfi_def_cfa_offset 8
; KNL_32-NEXT: movb {{[0-9]+}}(%esp), %al
; KNL_32-NEXT: andb $1, %al
; KNL_32-NEXT: kmovw %k1, %ecx
; KNL_32-NEXT: movb {{[0-9]+}}(%esp), %cl
; KNL_32-NEXT: andb $1, %cl
; KNL_32-NEXT: addb %cl, %cl
; KNL_32-NEXT: orb %al, %cl
; KNL_32-NEXT: kmovw %k2, %eax
; KNL_32-NEXT: movb {{[0-9]+}}(%esp), %al
; KNL_32-NEXT: andb $1, %al
; KNL_32-NEXT: shlb $2, %al
; KNL_32-NEXT: orb %cl, %al
; KNL_32-NEXT: vmovdqa {{[0-9]+}}(%esp), %xmm0
; KNL_32-NEXT: vpslld $2, %xmm1, %xmm1
; KNL_32-NEXT: vpaddd %xmm1, %xmm3, %xmm1
; KNL_32-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; KNL_32-NEXT: testb $1, %al
; KNL_32-NEXT: jne .LBB31_1
; KNL_32-NEXT: # %bb.2: # %else
@@ -1702,102 +1689,86 @@ define <3 x i32> @test30(<3 x i32*> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x
; KNL_32-NEXT: jne .LBB31_3
; KNL_32-NEXT: .LBB31_4: # %else2
; KNL_32-NEXT: testb $4, %al
; KNL_32-NEXT: jne .LBB31_5
; KNL_32-NEXT: je .LBB31_6
; KNL_32-NEXT: .LBB31_5: # %cond.load4
; KNL_32-NEXT: vpextrd $2, %xmm0, %eax
; KNL_32-NEXT: vpinsrd $2, (%eax), %xmm2, %xmm2
; KNL_32-NEXT: .LBB31_6: # %else5
; KNL_32-NEXT: addl $12, %esp
; KNL_32-NEXT: vmovdqa %xmm2, %xmm0
; KNL_32-NEXT: popl %eax
; KNL_32-NEXT: .cfi_def_cfa_offset 4
; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
; KNL_32-NEXT: .LBB31_1: # %cond.load
; KNL_32-NEXT: .cfi_def_cfa_offset 16
; KNL_32-NEXT: vmovd %xmm1, %ecx
; KNL_32-NEXT: vpinsrd $0, (%ecx), %xmm0, %xmm0
; KNL_32-NEXT: .cfi_def_cfa_offset 8
; KNL_32-NEXT: vmovd %xmm0, %ecx
; KNL_32-NEXT: vpinsrd $0, (%ecx), %xmm2, %xmm2
; KNL_32-NEXT: testb $2, %al
; KNL_32-NEXT: je .LBB31_4
; KNL_32-NEXT: .LBB31_3: # %cond.load1
; KNL_32-NEXT: vpextrd $1, %xmm1, %ecx
; KNL_32-NEXT: vpinsrd $1, (%ecx), %xmm0, %xmm0
; KNL_32-NEXT: vpextrd $1, %xmm0, %ecx
; KNL_32-NEXT: vpinsrd $1, (%ecx), %xmm2, %xmm2
; KNL_32-NEXT: testb $4, %al
; KNL_32-NEXT: je .LBB31_6
; KNL_32-NEXT: .LBB31_5: # %cond.load4
; KNL_32-NEXT: vpextrd $2, %xmm1, %eax
; KNL_32-NEXT: vpinsrd $2, (%eax), %xmm0, %xmm0
; KNL_32-NEXT: addl $12, %esp
; KNL_32-NEXT: .cfi_def_cfa_offset 4
; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
; KNL_32-NEXT: jne .LBB31_5
; KNL_32-NEXT: jmp .LBB31_6
;
; SKX-LABEL: test30:
; SKX: # %bb.0:
; SKX-NEXT: vpslld $31, %xmm2, %xmm2
; SKX-NEXT: vpmovd2m %xmm2, %k0
; SKX-NEXT: kshiftrb $1, %k0, %k1
; SKX-NEXT: kshiftrb $2, %k0, %k2
; SKX-NEXT: kmovw %k0, %eax
; SKX-NEXT: andb $1, %al
; SKX-NEXT: kmovw %k1, %ecx
; SKX-NEXT: andb $1, %cl
; SKX-NEXT: addb %cl, %cl
; SKX-NEXT: orb %al, %cl
; SKX-NEXT: kmovw %k2, %eax
; SKX-NEXT: andb $1, %al
; SKX-NEXT: shlb $2, %al
; SKX-NEXT: orb %cl, %al
; SKX-NEXT: andb $1, %dil
; SKX-NEXT: andb $1, %sil
; SKX-NEXT: addb %sil, %sil
; SKX-NEXT: orb %dil, %sil
; SKX-NEXT: andb $1, %dl
; SKX-NEXT: shlb $2, %dl
; SKX-NEXT: orb %sil, %dl
; SKX-NEXT: vpmovsxdq %xmm1, %ymm1
; SKX-NEXT: vpsllq $2, %ymm1, %ymm1
; SKX-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; SKX-NEXT: testb $1, %al
; SKX-NEXT: testb $1, %dl
; SKX-NEXT: jne .LBB31_1
; SKX-NEXT: # %bb.2: # %else
; SKX-NEXT: testb $2, %al
; SKX-NEXT: testb $2, %dl
; SKX-NEXT: jne .LBB31_3
; SKX-NEXT: .LBB31_4: # %else2
; SKX-NEXT: testb $4, %al
; SKX-NEXT: testb $4, %dl
; SKX-NEXT: jne .LBB31_5
; SKX-NEXT: .LBB31_6: # %else5
; SKX-NEXT: vmovdqa %xmm3, %xmm0
; SKX-NEXT: vmovdqa %xmm2, %xmm0
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
; SKX-NEXT: .LBB31_1: # %cond.load
; SKX-NEXT: vmovq %xmm0, %rcx
; SKX-NEXT: vpinsrd $0, (%rcx), %xmm3, %xmm3
; SKX-NEXT: testb $2, %al
; SKX-NEXT: vmovq %xmm0, %rax
; SKX-NEXT: vpinsrd $0, (%rax), %xmm2, %xmm2
; SKX-NEXT: testb $2, %dl
; SKX-NEXT: je .LBB31_4
; SKX-NEXT: .LBB31_3: # %cond.load1
; SKX-NEXT: vpextrq $1, %xmm0, %rcx
; SKX-NEXT: vpinsrd $1, (%rcx), %xmm3, %xmm3
; SKX-NEXT: testb $4, %al
; SKX-NEXT: vpextrq $1, %xmm0, %rax
; SKX-NEXT: vpinsrd $1, (%rax), %xmm2, %xmm2
; SKX-NEXT: testb $4, %dl
; SKX-NEXT: je .LBB31_6
; SKX-NEXT: .LBB31_5: # %cond.load4
; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0
; SKX-NEXT: vmovq %xmm0, %rax
; SKX-NEXT: vpinsrd $2, (%rax), %xmm3, %xmm3
; SKX-NEXT: vmovdqa %xmm3, %xmm0
; SKX-NEXT: vpinsrd $2, (%rax), %xmm2, %xmm2
; SKX-NEXT: vmovdqa %xmm2, %xmm0
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
;
; SKX_32-LABEL: test30:
; SKX_32: # %bb.0:
; SKX_32-NEXT: subl $12, %esp
; SKX_32-NEXT: .cfi_def_cfa_offset 16
; SKX_32-NEXT: vmovdqa %xmm0, %xmm3
; SKX_32-NEXT: vpslld $31, %xmm2, %xmm0
; SKX_32-NEXT: vpmovd2m %xmm0, %k0
; SKX_32-NEXT: kshiftrb $1, %k0, %k1
; SKX_32-NEXT: kshiftrb $2, %k0, %k2
; SKX_32-NEXT: kmovw %k0, %eax
; SKX_32-NEXT: pushl %eax
; SKX_32-NEXT: .cfi_def_cfa_offset 8
; SKX_32-NEXT: movb {{[0-9]+}}(%esp), %al
; SKX_32-NEXT: andb $1, %al
; SKX_32-NEXT: kmovw %k1, %ecx
; SKX_32-NEXT: movb {{[0-9]+}}(%esp), %cl
; SKX_32-NEXT: andb $1, %cl
; SKX_32-NEXT: addb %cl, %cl
; SKX_32-NEXT: orb %al, %cl
; SKX_32-NEXT: kmovw %k2, %eax
; SKX_32-NEXT: movb {{[0-9]+}}(%esp), %al
; SKX_32-NEXT: andb $1, %al
; SKX_32-NEXT: shlb $2, %al
; SKX_32-NEXT: orb %cl, %al
; SKX_32-NEXT: vmovdqa {{[0-9]+}}(%esp), %xmm0
; SKX_32-NEXT: vpslld $2, %xmm1, %xmm1
; SKX_32-NEXT: vpaddd %xmm1, %xmm3, %xmm1
; SKX_32-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; SKX_32-NEXT: testb $1, %al
; SKX_32-NEXT: jne .LBB31_1
; SKX_32-NEXT: # %bb.2: # %else
@@ -1805,28 +1776,27 @@ define <3 x i32> @test30(<3 x i32*> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x
; SKX_32-NEXT: jne .LBB31_3
; SKX_32-NEXT: .LBB31_4: # %else2
; SKX_32-NEXT: testb $4, %al
; SKX_32-NEXT: jne .LBB31_5
; SKX_32-NEXT: je .LBB31_6
; SKX_32-NEXT: .LBB31_5: # %cond.load4
; SKX_32-NEXT: vpextrd $2, %xmm0, %eax
; SKX_32-NEXT: vpinsrd $2, (%eax), %xmm2, %xmm2
; SKX_32-NEXT: .LBB31_6: # %else5
; SKX_32-NEXT: addl $12, %esp
; SKX_32-NEXT: vmovdqa %xmm2, %xmm0
; SKX_32-NEXT: popl %eax
; SKX_32-NEXT: .cfi_def_cfa_offset 4
; SKX_32-NEXT: retl
; SKX_32-NEXT: .LBB31_1: # %cond.load
; SKX_32-NEXT: .cfi_def_cfa_offset 16
; SKX_32-NEXT: vmovd %xmm1, %ecx
; SKX_32-NEXT: vpinsrd $0, (%ecx), %xmm0, %xmm0
; SKX_32-NEXT: .cfi_def_cfa_offset 8
; SKX_32-NEXT: vmovd %xmm0, %ecx
; SKX_32-NEXT: vpinsrd $0, (%ecx), %xmm2, %xmm2
; SKX_32-NEXT: testb $2, %al
; SKX_32-NEXT: je .LBB31_4
; SKX_32-NEXT: .LBB31_3: # %cond.load1
; SKX_32-NEXT: vpextrd $1, %xmm1, %ecx
; SKX_32-NEXT: vpinsrd $1, (%ecx), %xmm0, %xmm0
; SKX_32-NEXT: vpextrd $1, %xmm0, %ecx
; SKX_32-NEXT: vpinsrd $1, (%ecx), %xmm2, %xmm2
; SKX_32-NEXT: testb $4, %al
; SKX_32-NEXT: je .LBB31_6
; SKX_32-NEXT: .LBB31_5: # %cond.load4
; SKX_32-NEXT: vpextrd $2, %xmm1, %eax
; SKX_32-NEXT: vpinsrd $2, (%eax), %xmm0, %xmm0
; SKX_32-NEXT: addl $12, %esp
; SKX_32-NEXT: .cfi_def_cfa_offset 4
; SKX_32-NEXT: retl
; SKX_32-NEXT: jne .LBB31_5
; SKX_32-NEXT: jmp .LBB31_6
%sext_ind = sext <3 x i32> %ind to <3 x i64>
%gep.random = getelementptr i32, <3 x i32*> %base, <3 x i64> %sext_ind


@@ -4913,12 +4913,24 @@ define void @widen_masked_store(<3 x i32> %v, <3 x i32>* %p, <3 x i1> %mask) {
; AVX512F-LABEL: widen_masked_store:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT: vpslld $31, %xmm1, %xmm1
; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k1
; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3]
; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0
; AVX512F-NEXT: kmovw %edx, %k0
; AVX512F-NEXT: andl $1, %esi
; AVX512F-NEXT: kmovw %esi, %k1
; AVX512F-NEXT: kxorw %k0, %k0, %k2
; AVX512F-NEXT: kshiftrw $1, %k2, %k2
; AVX512F-NEXT: kshiftlw $1, %k2, %k2
; AVX512F-NEXT: korw %k1, %k2, %k1
; AVX512F-NEXT: kshiftrw $1, %k1, %k2
; AVX512F-NEXT: kxorw %k0, %k2, %k0
; AVX512F-NEXT: kshiftlw $15, %k0, %k0
; AVX512F-NEXT: kshiftrw $14, %k0, %k0
; AVX512F-NEXT: kxorw %k0, %k1, %k0
; AVX512F-NEXT: kshiftrw $2, %k0, %k1
; AVX512F-NEXT: kmovw %ecx, %k2
; AVX512F-NEXT: kxorw %k2, %k1, %k1
; AVX512F-NEXT: kshiftlw $15, %k1, %k1
; AVX512F-NEXT: kshiftrw $13, %k1, %k1
; AVX512F-NEXT: kxorw %k1, %k0, %k0
; AVX512F-NEXT: kshiftlw $12, %k0, %k0
; AVX512F-NEXT: kshiftrw $12, %k0, %k1
; AVX512F-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1}
@@ -4927,24 +4939,48 @@ define void @widen_masked_store(<3 x i32> %v, <3 x i32>* %p, <3 x i1> %mask) {
;
; AVX512VLDQ-LABEL: widen_masked_store:
; AVX512VLDQ: ## %bb.0:
; AVX512VLDQ-NEXT: vpslld $31, %xmm1, %xmm1
; AVX512VLDQ-NEXT: vpmovd2m %xmm1, %k0
; AVX512VLDQ-NEXT: vpmovm2d %k0, %xmm1
; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLDQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3]
; AVX512VLDQ-NEXT: vpmovd2m %xmm1, %k1
; AVX512VLDQ-NEXT: kmovw %edx, %k0
; AVX512VLDQ-NEXT: kmovw %esi, %k1
; AVX512VLDQ-NEXT: kshiftlb $7, %k1, %k1
; AVX512VLDQ-NEXT: kshiftrb $7, %k1, %k1
; AVX512VLDQ-NEXT: kxorw %k0, %k0, %k2
; AVX512VLDQ-NEXT: kshiftrb $1, %k2, %k2
; AVX512VLDQ-NEXT: kshiftlb $1, %k2, %k2
; AVX512VLDQ-NEXT: korb %k1, %k2, %k1
; AVX512VLDQ-NEXT: kshiftrb $1, %k1, %k2
; AVX512VLDQ-NEXT: kxorb %k0, %k2, %k0
; AVX512VLDQ-NEXT: kshiftlb $7, %k0, %k0
; AVX512VLDQ-NEXT: kshiftrb $6, %k0, %k0
; AVX512VLDQ-NEXT: kxorb %k0, %k1, %k0
; AVX512VLDQ-NEXT: kshiftrb $2, %k0, %k1
; AVX512VLDQ-NEXT: kmovw %ecx, %k2
; AVX512VLDQ-NEXT: kxorb %k2, %k1, %k1
; AVX512VLDQ-NEXT: kshiftlb $7, %k1, %k1
; AVX512VLDQ-NEXT: kshiftrb $5, %k1, %k1
; AVX512VLDQ-NEXT: kxorw %k1, %k0, %k1
; AVX512VLDQ-NEXT: vmovdqa32 %xmm0, (%rdi) {%k1}
; AVX512VLDQ-NEXT: retq
;
; AVX512VLBW-LABEL: widen_masked_store:
; AVX512VLBW: ## %bb.0:
; AVX512VLBW-NEXT: vpslld $31, %xmm1, %xmm1
; AVX512VLBW-NEXT: vptestmd %xmm1, %xmm1, %k1
; AVX512VLBW-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX512VLBW-NEXT: vmovdqa32 %xmm1, %xmm1 {%k1} {z}
; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLBW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3]
; AVX512VLBW-NEXT: vptestmd %xmm1, %xmm1, %k1
; AVX512VLBW-NEXT: kmovd %edx, %k0
; AVX512VLBW-NEXT: andl $1, %esi
; AVX512VLBW-NEXT: kmovw %esi, %k1
; AVX512VLBW-NEXT: kxorw %k0, %k0, %k2
; AVX512VLBW-NEXT: kshiftrw $1, %k2, %k2
; AVX512VLBW-NEXT: kshiftlw $1, %k2, %k2
; AVX512VLBW-NEXT: korw %k1, %k2, %k1
; AVX512VLBW-NEXT: kshiftrw $1, %k1, %k2
; AVX512VLBW-NEXT: kxorw %k0, %k2, %k0
; AVX512VLBW-NEXT: kshiftlw $15, %k0, %k0
; AVX512VLBW-NEXT: kshiftrw $14, %k0, %k0
; AVX512VLBW-NEXT: kxorw %k0, %k1, %k0
; AVX512VLBW-NEXT: kshiftrw $2, %k0, %k1
; AVX512VLBW-NEXT: kmovd %ecx, %k2
; AVX512VLBW-NEXT: kxorw %k2, %k1, %k1
; AVX512VLBW-NEXT: kshiftlw $15, %k1, %k1
; AVX512VLBW-NEXT: kshiftrw $13, %k1, %k1
; AVX512VLBW-NEXT: kxorw %k1, %k0, %k1
; AVX512VLBW-NEXT: vmovdqa32 %xmm0, (%rdi) {%k1}
; AVX512VLBW-NEXT: retq
call void @llvm.masked.store.v3i32.p0v3i32(<3 x i32> %v, <3 x i32>* %p, i32 16, <3 x i1> %mask)