; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=sse2 | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=sse4.2 | FileCheck %s --check-prefixes=SSE,SSE42
; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=avx | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX1
; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=avx2 | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX2
; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=avx512f | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F
; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=avx512f,avx512dq,avx512vl | FileCheck %s --check-prefixes=AVX,AVX512,AVX512VL,AVX512VLDQ
; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=avx512f,avx512bw,avx512vl | FileCheck %s --check-prefixes=AVX,AVX512,AVX512VL,AVX512VLBW
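;
; Note (illustrative comment, not part of the upstream test): @llvm.masked.load
; loads each lane whose mask bit is set and takes the corresponding lane from the
; passthru operand otherwise; masked-off lanes must not be accessed, which is why
; the scalar SSE lowerings below branch around every per-element load. A minimal
; sketch of that scalarization for lane 0 of a <2 x double> load (value names are
; hypothetical, chosen to mirror the %cond.load/%else block names in the checks):
;   %m0 = extractelement <2 x i1> %mask, i32 0
;   br i1 %m0, label %cond.load, label %else   ; only touch memory when %m0 is set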
;
; vXf64
;
define <1 x double> @load_v1f64_v1i64(<1 x i64> %trigger, <1 x double>* %addr, <1 x double> %dst) {
; SSE-LABEL: load_v1f64_v1i64:
; SSE: ## %bb.0:
; SSE-NEXT: testq %rdi, %rdi
; SSE-NEXT: jne LBB0_2
; SSE-NEXT: ## %bb.1: ## %cond.load
; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT: LBB0_2: ## %else
; SSE-NEXT: retq
;
; AVX-LABEL: load_v1f64_v1i64:
; AVX: ## %bb.0:
; AVX-NEXT: testq %rdi, %rdi
; AVX-NEXT: jne LBB0_2
; AVX-NEXT: ## %bb.1: ## %cond.load
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: LBB0_2: ## %else
; AVX-NEXT: retq
%mask = icmp eq <1 x i64> %trigger, zeroinitializer
%res = call <1 x double> @llvm.masked.load.v1f64.p0v1f64(<1 x double>* %addr, i32 4, <1 x i1> %mask, <1 x double> %dst)
ret <1 x double> %res
}
define <2 x double> @load_v2f64_v2i64(<2 x i64> %trigger, <2 x double>* %addr, <2 x double> %dst) {
; SSE2-LABEL: load_v2f64_v2i64:
; SSE2: ## %bb.0:
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: pcmpeqd %xmm0, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2]
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: movmskpd %xmm0, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: jne LBB1_1
; SSE2-NEXT: ## %bb.2: ## %else
; SSE2-NEXT: testb $2, %al
; SSE2-NEXT: jne LBB1_3
; SSE2-NEXT: LBB1_4: ## %else2
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
; SSE2-NEXT: LBB1_1: ## %cond.load
; SSE2-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
; SSE2-NEXT: testb $2, %al
; SSE2-NEXT: je LBB1_4
; SSE2-NEXT: LBB1_3: ## %cond.load1
; SSE2-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE42-LABEL: load_v2f64_v2i64:
; SSE42: ## %bb.0:
; SSE42-NEXT: pxor %xmm2, %xmm2
; SSE42-NEXT: pcmpeqq %xmm0, %xmm2
; SSE42-NEXT: movmskpd %xmm2, %eax
; SSE42-NEXT: testb $1, %al
; SSE42-NEXT: jne LBB1_1
; SSE42-NEXT: ## %bb.2: ## %else
; SSE42-NEXT: testb $2, %al
; SSE42-NEXT: jne LBB1_3
; SSE42-NEXT: LBB1_4: ## %else2
; SSE42-NEXT: movaps %xmm1, %xmm0
; SSE42-NEXT: retq
; SSE42-NEXT: LBB1_1: ## %cond.load
; SSE42-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
; SSE42-NEXT: testb $2, %al
; SSE42-NEXT: je LBB1_4
; SSE42-NEXT: LBB1_3: ## %cond.load1
; SSE42-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
; SSE42-NEXT: movaps %xmm1, %xmm0
; SSE42-NEXT: retq
;
; AVX1OR2-LABEL: load_v2f64_v2i64:
; AVX1OR2: ## %bb.0:
; AVX1OR2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1OR2-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0
; AVX1OR2-NEXT: vmaskmovpd (%rdi), %xmm0, %xmm2
; AVX1OR2-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
; AVX1OR2-NEXT: retq
;
; AVX512F-LABEL: load_v2f64_v2i64:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT: vptestnmq %zmm0, %zmm0, %k0
; AVX512F-NEXT: kshiftlw $14, %k0, %k0
; AVX512F-NEXT: kshiftrw $14, %k0, %k1
; AVX512F-NEXT: vblendmpd (%rdi), %zmm1, %zmm0 {%k1}
; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: load_v2f64_v2i64:
; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: vptestnmq %xmm0, %xmm0, %k1
; AVX512VL-NEXT: vblendmpd (%rdi), %xmm1, %xmm0 {%k1}
; AVX512VL-NEXT: retq
%mask = icmp eq <2 x i64> %trigger, zeroinitializer
%res = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %addr, i32 4, <2 x i1> %mask, <2 x double> %dst)
ret <2 x double> %res
}
define <4 x double> @load_v4f64_v4i32(<4 x i32> %trigger, <4 x double>* %addr, <4 x double> %dst) {
; SSE-LABEL: load_v4f64_v4i32:
; SSE: ## %bb.0:
; SSE-NEXT: pxor %xmm3, %xmm3
; SSE-NEXT: pcmpeqd %xmm0, %xmm3
; SSE-NEXT: movmskps %xmm3, %eax
; SSE-NEXT: testb $1, %al
; SSE-NEXT: jne LBB2_1
; SSE-NEXT: ## %bb.2: ## %else
; SSE-NEXT: testb $2, %al
; SSE-NEXT: jne LBB2_3
; SSE-NEXT: LBB2_4: ## %else2
; SSE-NEXT: testb $4, %al
; SSE-NEXT: jne LBB2_5
; SSE-NEXT: LBB2_6: ## %else5
; SSE-NEXT: testb $8, %al
; SSE-NEXT: je LBB2_8
; SSE-NEXT: LBB2_7: ## %cond.load7
; SSE-NEXT: movhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1]
; SSE-NEXT: LBB2_8: ## %else8
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: retq
; SSE-NEXT: LBB2_1: ## %cond.load
; SSE-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
; SSE-NEXT: testb $2, %al
; SSE-NEXT: je LBB2_4
; SSE-NEXT: LBB2_3: ## %cond.load1
; SSE-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
; SSE-NEXT: testb $4, %al
; SSE-NEXT: je LBB2_6
; SSE-NEXT: LBB2_5: ## %cond.load4
; SSE-NEXT: movlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3]
; SSE-NEXT: testb $8, %al
; SSE-NEXT: jne LBB2_7
; SSE-NEXT: jmp LBB2_8
;
; AVX1-LABEL: load_v4f64_v4i32:
; AVX1: ## %bb.0:
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpmovsxdq %xmm0, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm2
; AVX1-NEXT: vblendvpd %ymm0, %ymm2, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_v4f64_v4i32:
; AVX2: ## %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
; AVX2-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm2
; AVX2-NEXT: vblendvpd %ymm0, %ymm2, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: load_v4f64_v4i32:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: ## kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: kshiftlw $12, %k0, %k0
; AVX512F-NEXT: kshiftrw $12, %k0, %k1
; AVX512F-NEXT: vblendmpd (%rdi), %zmm1, %zmm0 {%k1}
; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: load_v4f64_v4i32:
; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: vptestnmd %xmm0, %xmm0, %k1
; AVX512VL-NEXT: vblendmpd (%rdi), %ymm1, %ymm0 {%k1}
; AVX512VL-NEXT: retq
%mask = icmp eq <4 x i32> %trigger, zeroinitializer
%res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %addr, i32 32, <4 x i1> %mask, <4 x double> %dst)
ret <4 x double> %res
}
define <4 x double> @load_v4f64_v4i32_zero(<4 x i32> %trigger, <4 x double>* %addr) {
; SSE-LABEL: load_v4f64_v4i32_zero:
; SSE: ## %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: pxor %xmm0, %xmm0
; SSE-NEXT: pcmpeqd %xmm0, %xmm1
; SSE-NEXT: movmskps %xmm1, %eax
; SSE-NEXT: testb $1, %al
; SSE-NEXT: xorps %xmm1, %xmm1
; SSE-NEXT: jne LBB3_1
; SSE-NEXT: ## %bb.2: ## %else
; SSE-NEXT: testb $2, %al
; SSE-NEXT: jne LBB3_3
; SSE-NEXT: LBB3_4: ## %else2
; SSE-NEXT: testb $4, %al
; SSE-NEXT: jne LBB3_5
; SSE-NEXT: LBB3_6: ## %else5
; SSE-NEXT: testb $8, %al
; SSE-NEXT: jne LBB3_7
; SSE-NEXT: LBB3_8: ## %else8
; SSE-NEXT: retq
; SSE-NEXT: LBB3_1: ## %cond.load
; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT: xorps %xmm1, %xmm1
; SSE-NEXT: testb $2, %al
; SSE-NEXT: je LBB3_4
; SSE-NEXT: LBB3_3: ## %cond.load1
; SSE-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
; SSE-NEXT: testb $4, %al
; SSE-NEXT: je LBB3_6
; SSE-NEXT: LBB3_5: ## %cond.load4
; SSE-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
; SSE-NEXT: testb $8, %al
; SSE-NEXT: je LBB3_8
; SSE-NEXT: LBB3_7: ## %cond.load7
; SSE-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
; SSE-NEXT: retq
;
; AVX1-LABEL: load_v4f64_v4i32_zero:
; AVX1: ## %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_v4f64_v4i32_zero:
; AVX2: ## %bb.0:
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
; AVX2-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: load_v4f64_v4i32_zero:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: kshiftlw $12, %k0, %k0
; AVX512F-NEXT: kshiftrw $12, %k0, %k1
; AVX512F-NEXT: vmovupd (%rdi), %zmm0 {%k1} {z}
; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: load_v4f64_v4i32_zero:
; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: vptestnmd %xmm0, %xmm0, %k1
; AVX512VL-NEXT: vmovapd (%rdi), %ymm0 {%k1} {z}
; AVX512VL-NEXT: retq
%mask = icmp eq <4 x i32> %trigger, zeroinitializer
%res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %addr, i32 32, <4 x i1> %mask, <4 x double>zeroinitializer)
ret <4 x double> %res
}
define <4 x double> @load_v4f64_v4i64(<4 x i64> %trigger, <4 x double>* %addr, <4 x double> %dst) {
; SSE2-LABEL: load_v4f64_v4i64:
; SSE2: ## %bb.0:
; SSE2-NEXT: pxor %xmm4, %xmm4
; SSE2-NEXT: pcmpeqd %xmm4, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,0,3,2]
; SSE2-NEXT: pand %xmm1, %xmm5
; SSE2-NEXT: pcmpeqd %xmm4, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: packssdw %xmm5, %xmm1
; SSE2-NEXT: movmskps %xmm1, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: jne LBB4_1
; SSE2-NEXT: ## %bb.2: ## %else
; SSE2-NEXT: testb $2, %al
; SSE2-NEXT: jne LBB4_3
; SSE2-NEXT: LBB4_4: ## %else2
; SSE2-NEXT: testb $4, %al
; SSE2-NEXT: jne LBB4_5
; SSE2-NEXT: LBB4_6: ## %else5
; SSE2-NEXT: testb $8, %al
; SSE2-NEXT: je LBB4_8
; SSE2-NEXT: LBB4_7: ## %cond.load7
; SSE2-NEXT: movhps {{.*#+}} xmm3 = xmm3[0,1],mem[0,1]
; SSE2-NEXT: LBB4_8: ## %else8
; SSE2-NEXT: movaps %xmm2, %xmm0
; SSE2-NEXT: movaps %xmm3, %xmm1
; SSE2-NEXT: retq
; SSE2-NEXT: LBB4_1: ## %cond.load
; SSE2-NEXT: movlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3]
; SSE2-NEXT: testb $2, %al
; SSE2-NEXT: je LBB4_4
; SSE2-NEXT: LBB4_3: ## %cond.load1
; SSE2-NEXT: movhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1]
; SSE2-NEXT: testb $4, %al
; SSE2-NEXT: je LBB4_6
; SSE2-NEXT: LBB4_5: ## %cond.load4
; SSE2-NEXT: movlps {{.*#+}} xmm3 = mem[0,1],xmm3[2,3]
; SSE2-NEXT: testb $8, %al
; SSE2-NEXT: jne LBB4_7
; SSE2-NEXT: jmp LBB4_8
;
; SSE42-LABEL: load_v4f64_v4i64:
; SSE42: ## %bb.0:
; SSE42-NEXT: pxor %xmm4, %xmm4
; SSE42-NEXT: pcmpeqq %xmm4, %xmm1
; SSE42-NEXT: pcmpeqq %xmm4, %xmm0
; SSE42-NEXT: packssdw %xmm1, %xmm0
; SSE42-NEXT: movmskps %xmm0, %eax
; SSE42-NEXT: testb $1, %al
; SSE42-NEXT: jne LBB4_1
; SSE42-NEXT: ## %bb.2: ## %else
; SSE42-NEXT: testb $2, %al
; SSE42-NEXT: jne LBB4_3
; SSE42-NEXT: LBB4_4: ## %else2
; SSE42-NEXT: testb $4, %al
; SSE42-NEXT: jne LBB4_5
; SSE42-NEXT: LBB4_6: ## %else5
; SSE42-NEXT: testb $8, %al
; SSE42-NEXT: je LBB4_8
; SSE42-NEXT: LBB4_7: ## %cond.load7
; SSE42-NEXT: movhps {{.*#+}} xmm3 = xmm3[0,1],mem[0,1]
; SSE42-NEXT: LBB4_8: ## %else8
; SSE42-NEXT: movaps %xmm2, %xmm0
; SSE42-NEXT: movaps %xmm3, %xmm1
; SSE42-NEXT: retq
; SSE42-NEXT: LBB4_1: ## %cond.load
; SSE42-NEXT: movlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3]
; SSE42-NEXT: testb $2, %al
; SSE42-NEXT: je LBB4_4
; SSE42-NEXT: LBB4_3: ## %cond.load1
; SSE42-NEXT: movhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1]
; SSE42-NEXT: testb $4, %al
; SSE42-NEXT: je LBB4_6
; SSE42-NEXT: LBB4_5: ## %cond.load4
; SSE42-NEXT: movlps {{.*#+}} xmm3 = mem[0,1],xmm3[2,3]
; SSE42-NEXT: testb $8, %al
; SSE42-NEXT: jne LBB4_7
; SSE42-NEXT: jmp LBB4_8
;
; AVX1-LABEL: load_v4f64_v4i64:
; AVX1: ## %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpcmpeqq %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpcmpeqq %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm2
; AVX1-NEXT: vblendvpd %ymm0, %ymm2, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_v4f64_v4i64:
; AVX2: ## %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpcmpeqq %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm2
; AVX2-NEXT: vblendvpd %ymm0, %ymm2, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: load_v4f64_v4i64:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: ## kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: vptestnmq %zmm0, %zmm0, %k0
; AVX512F-NEXT: kshiftlw $12, %k0, %k0
; AVX512F-NEXT: kshiftrw $12, %k0, %k1
; AVX512F-NEXT: vblendmpd (%rdi), %zmm1, %zmm0 {%k1}
; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: load_v4f64_v4i64:
; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: vptestnmq %ymm0, %ymm0, %k1
; AVX512VL-NEXT: vblendmpd (%rdi), %ymm1, %ymm0 {%k1}
; AVX512VL-NEXT: retq
%mask = icmp eq <4 x i64> %trigger, zeroinitializer
%res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %addr, i32 4, <4 x i1> %mask, <4 x double> %dst)
ret <4 x double> %res
}
define <8 x double> @load_v8f64_v8i16(<8 x i16> %trigger, <8 x double>* %addr, <8 x double> %dst) {
; SSE-LABEL: load_v8f64_v8i16:
; SSE: ## %bb.0:
; SSE-NEXT: pxor %xmm5, %xmm5
; SSE-NEXT: pcmpeqw %xmm0, %xmm5
; SSE-NEXT: packsswb %xmm5, %xmm5
; SSE-NEXT: pmovmskb %xmm5, %eax
; SSE-NEXT: testb $1, %al
; SSE-NEXT: jne LBB5_1
; SSE-NEXT: ## %bb.2: ## %else
; SSE-NEXT: testb $2, %al
; SSE-NEXT: jne LBB5_3
; SSE-NEXT: LBB5_4: ## %else2
; SSE-NEXT: testb $4, %al
; SSE-NEXT: jne LBB5_5
; SSE-NEXT: LBB5_6: ## %else5
; SSE-NEXT: testb $8, %al
; SSE-NEXT: jne LBB5_7
; SSE-NEXT: LBB5_8: ## %else8
; SSE-NEXT: testb $16, %al
; SSE-NEXT: jne LBB5_9
; SSE-NEXT: LBB5_10: ## %else11
; SSE-NEXT: testb $32, %al
; SSE-NEXT: jne LBB5_11
; SSE-NEXT: LBB5_12: ## %else14
; SSE-NEXT: testb $64, %al
; SSE-NEXT: jne LBB5_13
; SSE-NEXT: LBB5_14: ## %else17
; SSE-NEXT: testb $-128, %al
; SSE-NEXT: je LBB5_16
; SSE-NEXT: LBB5_15: ## %cond.load19
; SSE-NEXT: movhps {{.*#+}} xmm4 = xmm4[0,1],mem[0,1]
; SSE-NEXT: LBB5_16: ## %else20
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movaps %xmm3, %xmm2
; SSE-NEXT: movaps %xmm4, %xmm3
; SSE-NEXT: retq
; SSE-NEXT: LBB5_1: ## %cond.load
; SSE-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
; SSE-NEXT: testb $2, %al
; SSE-NEXT: je LBB5_4
; SSE-NEXT: LBB5_3: ## %cond.load1
; SSE-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
; SSE-NEXT: testb $4, %al
; SSE-NEXT: je LBB5_6
; SSE-NEXT: LBB5_5: ## %cond.load4
; SSE-NEXT: movlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3]
; SSE-NEXT: testb $8, %al
; SSE-NEXT: je LBB5_8
; SSE-NEXT: LBB5_7: ## %cond.load7
; SSE-NEXT: movhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1]
; SSE-NEXT: testb $16, %al
; SSE-NEXT: je LBB5_10
; SSE-NEXT: LBB5_9: ## %cond.load10
; SSE-NEXT: movlps {{.*#+}} xmm3 = mem[0,1],xmm3[2,3]
; SSE-NEXT: testb $32, %al
; SSE-NEXT: je LBB5_12
; SSE-NEXT: LBB5_11: ## %cond.load13
; SSE-NEXT: movhps {{.*#+}} xmm3 = xmm3[0,1],mem[0,1]
; SSE-NEXT: testb $64, %al
; SSE-NEXT: je LBB5_14
; SSE-NEXT: LBB5_13: ## %cond.load16
; SSE-NEXT: movlps {{.*#+}} xmm4 = mem[0,1],xmm4[2,3]
; SSE-NEXT: testb $-128, %al
; SSE-NEXT: jne LBB5_15
; SSE-NEXT: jmp LBB5_16
;
; AVX1-LABEL: load_v8f64_v8i16:
; AVX1: ## %bb.0:
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX1-NEXT: vpcmpeqw %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpmovsxwd %xmm3, %xmm3
; AVX1-NEXT: vpmovsxdq %xmm3, %xmm5
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
; AVX1-NEXT: vpmovsxdq %xmm3, %xmm3
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3
; AVX1-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpmovsxwq %xmm0, %xmm4
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; AVX1-NEXT: vpmovsxwq %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0
; AVX1-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm4
; AVX1-NEXT: vblendvpd %ymm0, %ymm4, %ymm1, %ymm0
; AVX1-NEXT: vmaskmovpd 32(%rdi), %ymm3, %ymm1
; AVX1-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_v8f64_v8i16:
; AVX2: ## %bb.0:
; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX2-NEXT: vpcmpeqw %xmm4, %xmm3, %xmm3
; AVX2-NEXT: vpmovsxwd %xmm3, %xmm3
; AVX2-NEXT: vpmovsxdq %xmm3, %ymm3
; AVX2-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm0
; AVX2-NEXT: vpmovsxwq %xmm0, %ymm0
; AVX2-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm4
; AVX2-NEXT: vblendvpd %ymm0, %ymm4, %ymm1, %ymm0
; AVX2-NEXT: vmaskmovpd 32(%rdi), %ymm3, %ymm1
; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
; AVX2-NEXT: retq
;
; AVX512F-LABEL: load_v8f64_v8i16:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0
; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
; AVX512F-NEXT: vblendmpd (%rdi), %zmm1, %zmm0 {%k1}
; AVX512F-NEXT: retq
;
; AVX512VLDQ-LABEL: load_v8f64_v8i16:
; AVX512VLDQ: ## %bb.0:
; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLDQ-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm0
; AVX512VLDQ-NEXT: vpmovsxwd %xmm0, %ymm0
; AVX512VLDQ-NEXT: vpmovd2m %ymm0, %k1
; AVX512VLDQ-NEXT: vblendmpd (%rdi), %zmm1, %zmm0 {%k1}
; AVX512VLDQ-NEXT: retq
;
; AVX512VLBW-LABEL: load_v8f64_v8i16:
; AVX512VLBW: ## %bb.0:
; AVX512VLBW-NEXT: vptestnmw %xmm0, %xmm0, %k1
; AVX512VLBW-NEXT: vblendmpd (%rdi), %zmm1, %zmm0 {%k1}
; AVX512VLBW-NEXT: retq
%mask = icmp eq <8 x i16> %trigger, zeroinitializer
%res = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* %addr, i32 4, <8 x i1> %mask, <8 x double> %dst)
ret <8 x double> %res
}
define <8 x double> @load_v8f64_v8i64(<8 x i64> %trigger, <8 x double>* %addr, <8 x double> %dst) {
; SSE2-LABEL: load_v8f64_v8i64:
; SSE2: ## %bb.0:
; SSE2-NEXT: movdqa %xmm7, %xmm8
; SSE2-NEXT: movaps %xmm6, %xmm9
; SSE2-NEXT: pxor %xmm7, %xmm7
; SSE2-NEXT: pcmpeqd %xmm7, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[1,0,3,2]
; SSE2-NEXT: pand %xmm3, %xmm6
; SSE2-NEXT: pcmpeqd %xmm7, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2]
; SSE2-NEXT: pand %xmm2, %xmm3
; SSE2-NEXT: packssdw %xmm6, %xmm3
; SSE2-NEXT: pcmpeqd %xmm7, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,0,3,2]
; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: pcmpeqd %xmm7, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: packssdw %xmm2, %xmm1
; SSE2-NEXT: packssdw %xmm3, %xmm1
; SSE2-NEXT: packsswb %xmm1, %xmm1
; SSE2-NEXT: pmovmskb %xmm1, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: jne LBB6_1
; SSE2-NEXT: ## %bb.2: ## %else
; SSE2-NEXT: testb $2, %al
; SSE2-NEXT: jne LBB6_3
; SSE2-NEXT: LBB6_4: ## %else2
; SSE2-NEXT: testb $4, %al
; SSE2-NEXT: jne LBB6_5
; SSE2-NEXT: LBB6_6: ## %else5
; SSE2-NEXT: testb $8, %al
; SSE2-NEXT: jne LBB6_7
; SSE2-NEXT: LBB6_8: ## %else8
; SSE2-NEXT: testb $16, %al
; SSE2-NEXT: jne LBB6_9
; SSE2-NEXT: LBB6_10: ## %else11
; SSE2-NEXT: testb $32, %al
; SSE2-NEXT: jne LBB6_11
; SSE2-NEXT: LBB6_12: ## %else14
; SSE2-NEXT: testb $64, %al
; SSE2-NEXT: jne LBB6_13
; SSE2-NEXT: LBB6_14: ## %else17
; SSE2-NEXT: testb $-128, %al
; SSE2-NEXT: je LBB6_16
; SSE2-NEXT: LBB6_15: ## %cond.load19
; SSE2-NEXT: movhps {{.*#+}} xmm8 = xmm8[0,1],mem[0,1]
; SSE2-NEXT: LBB6_16: ## %else20
; SSE2-NEXT: movaps %xmm4, %xmm0
; SSE2-NEXT: movaps %xmm5, %xmm1
; SSE2-NEXT: movaps %xmm9, %xmm2
; SSE2-NEXT: movaps %xmm8, %xmm3
; SSE2-NEXT: retq
; SSE2-NEXT: LBB6_1: ## %cond.load
; SSE2-NEXT: movlps {{.*#+}} xmm4 = mem[0,1],xmm4[2,3]
; SSE2-NEXT: testb $2, %al
; SSE2-NEXT: je LBB6_4
; SSE2-NEXT: LBB6_3: ## %cond.load1
; SSE2-NEXT: movhps {{.*#+}} xmm4 = xmm4[0,1],mem[0,1]
; SSE2-NEXT: testb $4, %al
; SSE2-NEXT: je LBB6_6
; SSE2-NEXT: LBB6_5: ## %cond.load4
; SSE2-NEXT: movlps {{.*#+}} xmm5 = mem[0,1],xmm5[2,3]
; SSE2-NEXT: testb $8, %al
; SSE2-NEXT: je LBB6_8
; SSE2-NEXT: LBB6_7: ## %cond.load7
; SSE2-NEXT: movhps {{.*#+}} xmm5 = xmm5[0,1],mem[0,1]
; SSE2-NEXT: testb $16, %al
; SSE2-NEXT: je LBB6_10
; SSE2-NEXT: LBB6_9: ## %cond.load10
; SSE2-NEXT: movlps {{.*#+}} xmm9 = mem[0,1],xmm9[2,3]
; SSE2-NEXT: testb $32, %al
; SSE2-NEXT: je LBB6_12
; SSE2-NEXT: LBB6_11: ## %cond.load13
; SSE2-NEXT: movhps {{.*#+}} xmm9 = xmm9[0,1],mem[0,1]
; SSE2-NEXT: testb $64, %al
; SSE2-NEXT: je LBB6_14
; SSE2-NEXT: LBB6_13: ## %cond.load16
; SSE2-NEXT: movlps {{.*#+}} xmm8 = mem[0,1],xmm8[2,3]
; SSE2-NEXT: testb $-128, %al
; SSE2-NEXT: jne LBB6_15
; SSE2-NEXT: jmp LBB6_16
;
; SSE42-LABEL: load_v8f64_v8i64:
; SSE42: ## %bb.0:
; SSE42-NEXT: movdqa %xmm7, %xmm8
; SSE42-NEXT: pxor %xmm7, %xmm7
; SSE42-NEXT: pcmpeqq %xmm7, %xmm3
; SSE42-NEXT: pcmpeqq %xmm7, %xmm2
; SSE42-NEXT: packssdw %xmm3, %xmm2
; SSE42-NEXT: pcmpeqq %xmm7, %xmm1
; SSE42-NEXT: pcmpeqq %xmm7, %xmm0
; SSE42-NEXT: packssdw %xmm1, %xmm0
; SSE42-NEXT: packssdw %xmm2, %xmm0
; SSE42-NEXT: packsswb %xmm0, %xmm0
; SSE42-NEXT: pmovmskb %xmm0, %eax
; SSE42-NEXT: testb $1, %al
; SSE42-NEXT: jne LBB6_1
; SSE42-NEXT: ## %bb.2: ## %else
; SSE42-NEXT: testb $2, %al
; SSE42-NEXT: jne LBB6_3
; SSE42-NEXT: LBB6_4: ## %else2
; SSE42-NEXT: testb $4, %al
; SSE42-NEXT: jne LBB6_5
; SSE42-NEXT: LBB6_6: ## %else5
; SSE42-NEXT: testb $8, %al
; SSE42-NEXT: jne LBB6_7
; SSE42-NEXT: LBB6_8: ## %else8
; SSE42-NEXT: testb $16, %al
; SSE42-NEXT: jne LBB6_9
; SSE42-NEXT: LBB6_10: ## %else11
; SSE42-NEXT: testb $32, %al
; SSE42-NEXT: jne LBB6_11
; SSE42-NEXT: LBB6_12: ## %else14
; SSE42-NEXT: testb $64, %al
; SSE42-NEXT: jne LBB6_13
; SSE42-NEXT: LBB6_14: ## %else17
; SSE42-NEXT: testb $-128, %al
; SSE42-NEXT: je LBB6_16
; SSE42-NEXT: LBB6_15: ## %cond.load19
; SSE42-NEXT: movhps {{.*#+}} xmm8 = xmm8[0,1],mem[0,1]
; SSE42-NEXT: LBB6_16: ## %else20
; SSE42-NEXT: movaps %xmm4, %xmm0
; SSE42-NEXT: movaps %xmm5, %xmm1
; SSE42-NEXT: movaps %xmm6, %xmm2
; SSE42-NEXT: movaps %xmm8, %xmm3
; SSE42-NEXT: retq
; SSE42-NEXT: LBB6_1: ## %cond.load
; SSE42-NEXT: movlps {{.*#+}} xmm4 = mem[0,1],xmm4[2,3]
; SSE42-NEXT: testb $2, %al
; SSE42-NEXT: je LBB6_4
; SSE42-NEXT: LBB6_3: ## %cond.load1
; SSE42-NEXT: movhps {{.*#+}} xmm4 = xmm4[0,1],mem[0,1]
; SSE42-NEXT: testb $4, %al
; SSE42-NEXT: je LBB6_6
; SSE42-NEXT: LBB6_5: ## %cond.load4
; SSE42-NEXT: movlps {{.*#+}} xmm5 = mem[0,1],xmm5[2,3]
; SSE42-NEXT: testb $8, %al
; SSE42-NEXT: je LBB6_8
; SSE42-NEXT: LBB6_7: ## %cond.load7
; SSE42-NEXT: movhps {{.*#+}} xmm5 = xmm5[0,1],mem[0,1]
; SSE42-NEXT: testb $16, %al
; SSE42-NEXT: je LBB6_10
; SSE42-NEXT: LBB6_9: ## %cond.load10
; SSE42-NEXT: movlps {{.*#+}} xmm6 = mem[0,1],xmm6[2,3]
; SSE42-NEXT: testb $32, %al
; SSE42-NEXT: je LBB6_12
; SSE42-NEXT: LBB6_11: ## %cond.load13
; SSE42-NEXT: movhps {{.*#+}} xmm6 = xmm6[0,1],mem[0,1]
; SSE42-NEXT: testb $64, %al
; SSE42-NEXT: je LBB6_14
; SSE42-NEXT: LBB6_13: ## %cond.load16
; SSE42-NEXT: movlps {{.*#+}} xmm8 = mem[0,1],xmm8[2,3]
; SSE42-NEXT: testb $-128, %al
; SSE42-NEXT: jne LBB6_15
; SSE42-NEXT: jmp LBB6_16
;
; AVX1-LABEL: load_v8f64_v8i64:
; AVX1: ## %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
; AVX1-NEXT: vpcmpeqq %xmm5, %xmm4, %xmm4
; AVX1-NEXT: vpcmpeqq %xmm5, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT: vpcmpeqq %xmm5, %xmm4, %xmm4
; AVX1-NEXT: vpcmpeqq %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
; AVX1-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm4
; AVX1-NEXT: vblendvpd %ymm0, %ymm4, %ymm2, %ymm0
; AVX1-NEXT: vmaskmovpd 32(%rdi), %ymm1, %ymm2
; AVX1-NEXT: vblendvpd %ymm1, %ymm2, %ymm3, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_v8f64_v8i64:
; AVX2: ## %bb.0:
; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX2-NEXT: vpcmpeqq %ymm4, %ymm1, %ymm1
; AVX2-NEXT: vpcmpeqq %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm4
; AVX2-NEXT: vblendvpd %ymm0, %ymm4, %ymm2, %ymm0
; AVX2-NEXT: vmaskmovpd 32(%rdi), %ymm1, %ymm2
; AVX2-NEXT: vblendvpd %ymm1, %ymm2, %ymm3, %ymm1
; AVX2-NEXT: retq
;
; AVX512-LABEL: load_v8f64_v8i64:
; AVX512: ## %bb.0:
; AVX512-NEXT: vptestnmq %zmm0, %zmm0, %k1
; AVX512-NEXT: vblendmpd (%rdi), %zmm1, %zmm0 {%k1}
; AVX512-NEXT: retq
%mask = icmp eq <8 x i64> %trigger, zeroinitializer
%res = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* %addr, i32 4, <8 x i1> %mask, <8 x double> %dst)
ret <8 x double> %res
}
;
; vXf32
;
define <2 x float> @load_v2f32_v2i32(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %dst) {
; SSE2-LABEL: load_v2f32_v2i32:
; SSE2: ## %bb.0:
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: pcmpeqd %xmm0, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,1,1]
; SSE2-NEXT: movmskpd %xmm0, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: jne LBB7_1
; SSE2-NEXT: ## %bb.2: ## %else
; SSE2-NEXT: testb $2, %al
; SSE2-NEXT: jne LBB7_3
; SSE2-NEXT: LBB7_4: ## %else2
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
; SSE2-NEXT: LBB7_1: ## %cond.load
; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE2-NEXT: testb $2, %al
; SSE2-NEXT: je LBB7_4
; SSE2-NEXT: LBB7_3: ## %cond.load1
; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE42-LABEL: load_v2f32_v2i32:
; SSE42: ## %bb.0:
; SSE42-NEXT: pxor %xmm2, %xmm2
; SSE42-NEXT: pcmpeqd %xmm0, %xmm2
; SSE42-NEXT: pmovsxdq %xmm2, %xmm0
; SSE42-NEXT: movmskpd %xmm0, %eax
; SSE42-NEXT: testb $1, %al
; SSE42-NEXT: jne LBB7_1
; SSE42-NEXT: ## %bb.2: ## %else
; SSE42-NEXT: testb $2, %al
; SSE42-NEXT: jne LBB7_3
; SSE42-NEXT: LBB7_4: ## %else2
; SSE42-NEXT: movaps %xmm1, %xmm0
; SSE42-NEXT: retq
; SSE42-NEXT: LBB7_1: ## %cond.load
; SSE42-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE42-NEXT: blendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE42-NEXT: testb $2, %al
; SSE42-NEXT: je LBB7_4
; SSE42-NEXT: LBB7_3: ## %cond.load1
; SSE42-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3]
; SSE42-NEXT: movaps %xmm1, %xmm0
; SSE42-NEXT: retq
;
; AVX1OR2-LABEL: load_v2f32_v2i32:
; AVX1OR2: ## %bb.0:
; AVX1OR2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1OR2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
; AVX1OR2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX1OR2-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2
; AVX1OR2-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
; AVX1OR2-NEXT: retq
;
; AVX512F-LABEL: load_v2f32_v2i32:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: kshiftlw $14, %k0, %k0
; AVX512F-NEXT: kshiftrw $14, %k0, %k1
; AVX512F-NEXT: vblendmps (%rdi), %zmm1, %zmm0 {%k1}
; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VLDQ-LABEL: load_v2f32_v2i32:
; AVX512VLDQ: ## %bb.0:
; AVX512VLDQ-NEXT: vptestnmd %xmm0, %xmm0, %k0
; AVX512VLDQ-NEXT: kshiftlb $6, %k0, %k0
; AVX512VLDQ-NEXT: kshiftrb $6, %k0, %k1
; AVX512VLDQ-NEXT: vblendmps (%rdi), %xmm1, %xmm0 {%k1}
; AVX512VLDQ-NEXT: retq
;
; AVX512VLBW-LABEL: load_v2f32_v2i32:
; AVX512VLBW: ## %bb.0:
; AVX512VLBW-NEXT: vptestnmd %xmm0, %xmm0, %k0
; AVX512VLBW-NEXT: kshiftlw $14, %k0, %k0
; AVX512VLBW-NEXT: kshiftrw $14, %k0, %k1
; AVX512VLBW-NEXT: vblendmps (%rdi), %xmm1, %xmm0 {%k1}
; AVX512VLBW-NEXT: retq
%mask = icmp eq <2 x i32> %trigger, zeroinitializer
%res = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* %addr, i32 4, <2 x i1> %mask, <2 x float> %dst)
ret <2 x float> %res
}
define <2 x float> @load_v2f32_v2i32_undef(<2 x i32> %trigger, <2 x float>* %addr) {
; SSE2-LABEL: load_v2f32_v2i32_undef:
; SSE2: ## %bb.0:
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: pcmpeqd %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,1,1]
; SSE2-NEXT: movmskpd %xmm0, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: ## implicit-def: $xmm0
; SSE2-NEXT: jne LBB8_1
; SSE2-NEXT: ## %bb.2: ## %else
; SSE2-NEXT: testb $2, %al
; SSE2-NEXT: jne LBB8_3
; SSE2-NEXT: LBB8_4: ## %else2
; SSE2-NEXT: retq
; SSE2-NEXT: LBB8_1: ## %cond.load
; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: testb $2, %al
; SSE2-NEXT: je LBB8_4
; SSE2-NEXT: LBB8_3: ## %cond.load1
; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE42-LABEL: load_v2f32_v2i32_undef:
; SSE42: ## %bb.0:
; SSE42-NEXT: pxor %xmm1, %xmm1
; SSE42-NEXT: pcmpeqd %xmm0, %xmm1
; SSE42-NEXT: pmovsxdq %xmm1, %xmm0
; SSE42-NEXT: movmskpd %xmm0, %eax
; SSE42-NEXT: testb $1, %al
; SSE42-NEXT: ## implicit-def: $xmm0
; SSE42-NEXT: jne LBB8_1
; SSE42-NEXT: ## %bb.2: ## %else
; SSE42-NEXT: testb $2, %al
; SSE42-NEXT: jne LBB8_3
; SSE42-NEXT: LBB8_4: ## %else2
; SSE42-NEXT: retq
; SSE42-NEXT: LBB8_1: ## %cond.load
; SSE42-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE42-NEXT: testb $2, %al
; SSE42-NEXT: je LBB8_4
; SSE42-NEXT: LBB8_3: ## %cond.load1
; SSE42-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
; SSE42-NEXT: retq
;
; AVX1OR2-LABEL: load_v2f32_v2i32_undef:
; AVX1OR2: ## %bb.0:
; AVX1OR2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1OR2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; AVX1OR2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX1OR2-NEXT: vmaskmovps (%rdi), %xmm0, %xmm0
; AVX1OR2-NEXT: retq
;
; AVX512F-LABEL: load_v2f32_v2i32_undef:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: kshiftlw $14, %k0, %k0
; AVX512F-NEXT: kshiftrw $14, %k0, %k1
; AVX512F-NEXT: vmovups (%rdi), %zmm0 {%k1} {z}
; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VLDQ-LABEL: load_v2f32_v2i32_undef:
; AVX512VLDQ: ## %bb.0:
; AVX512VLDQ-NEXT: vptestnmd %xmm0, %xmm0, %k0
; AVX512VLDQ-NEXT: kshiftlb $6, %k0, %k0
; AVX512VLDQ-NEXT: kshiftrb $6, %k0, %k1
; AVX512VLDQ-NEXT: vmovups (%rdi), %xmm0 {%k1} {z}
; AVX512VLDQ-NEXT: retq
;
; AVX512VLBW-LABEL: load_v2f32_v2i32_undef:
; AVX512VLBW: ## %bb.0:
; AVX512VLBW-NEXT: vptestnmd %xmm0, %xmm0, %k0
; AVX512VLBW-NEXT: kshiftlw $14, %k0, %k0
; AVX512VLBW-NEXT: kshiftrw $14, %k0, %k1
; AVX512VLBW-NEXT: vmovups (%rdi), %xmm0 {%k1} {z}
; AVX512VLBW-NEXT: retq
%mask = icmp eq <2 x i32> %trigger, zeroinitializer
%res = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* %addr, i32 4, <2 x i1> %mask, <2 x float>undef)
ret <2 x float> %res
}
define <4 x float> @load_v4f32_v4i32(<4 x i32> %trigger, <4 x float>* %addr, <4 x float> %dst) {
; SSE2-LABEL: load_v4f32_v4i32:
; SSE2: ## %bb.0:
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: pcmpeqd %xmm0, %xmm2
; SSE2-NEXT: movmskps %xmm2, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: jne LBB9_1
; SSE2-NEXT: ## %bb.2: ## %else
; SSE2-NEXT: testb $2, %al
; SSE2-NEXT: jne LBB9_3
; SSE2-NEXT: LBB9_4: ## %else2
; SSE2-NEXT: testb $4, %al
; SSE2-NEXT: jne LBB9_5
; SSE2-NEXT: LBB9_6: ## %else5
; SSE2-NEXT: testb $8, %al
; SSE2-NEXT: jne LBB9_7
; SSE2-NEXT: LBB9_8: ## %else8
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
; SSE2-NEXT: LBB9_1: ## %cond.load
; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE2-NEXT: testb $2, %al
; SSE2-NEXT: je LBB9_4
; SSE2-NEXT: LBB9_3: ## %cond.load1
; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: testb $4, %al
; SSE2-NEXT: je LBB9_6
; SSE2-NEXT: LBB9_5: ## %cond.load4
; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2]
; SSE2-NEXT: testb $8, %al
; SSE2-NEXT: je LBB9_8
; SSE2-NEXT: LBB9_7: ## %cond.load7
; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[2,0]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE42-LABEL: load_v4f32_v4i32:
; SSE42: ## %bb.0:
; SSE42-NEXT: pxor %xmm2, %xmm2
; SSE42-NEXT: pcmpeqd %xmm0, %xmm2
; SSE42-NEXT: movmskps %xmm2, %eax
; SSE42-NEXT: testb $1, %al
; SSE42-NEXT: jne LBB9_1
; SSE42-NEXT: ## %bb.2: ## %else
; SSE42-NEXT: testb $2, %al
; SSE42-NEXT: jne LBB9_3
; SSE42-NEXT: LBB9_4: ## %else2
; SSE42-NEXT: testb $4, %al
; SSE42-NEXT: jne LBB9_5
; SSE42-NEXT: LBB9_6: ## %else5
; SSE42-NEXT: testb $8, %al
; SSE42-NEXT: jne LBB9_7
; SSE42-NEXT: LBB9_8: ## %else8
; SSE42-NEXT: movaps %xmm1, %xmm0
; SSE42-NEXT: retq
; SSE42-NEXT: LBB9_1: ## %cond.load
; SSE42-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; SSE42-NEXT: testb $2, %al
; SSE42-NEXT: je LBB9_4
; SSE42-NEXT: LBB9_3: ## %cond.load1
; SSE42-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3]
; SSE42-NEXT: testb $4, %al
; SSE42-NEXT: je LBB9_6
; SSE42-NEXT: LBB9_5: ## %cond.load4
; SSE42-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3]
; SSE42-NEXT: testb $8, %al
; SSE42-NEXT: je LBB9_8
; SSE42-NEXT: LBB9_7: ## %cond.load7
; SSE42-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0]
; SSE42-NEXT: movaps %xmm1, %xmm0
; SSE42-NEXT: retq
;
; AVX1OR2-LABEL: load_v4f32_v4i32:
; AVX1OR2: ## %bb.0:
; AVX1OR2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1OR2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
; AVX1OR2-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2
; AVX1OR2-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
; AVX1OR2-NEXT: retq
;
; AVX512F-LABEL: load_v4f32_v4i32:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: kshiftlw $12, %k0, %k0
; AVX512F-NEXT: kshiftrw $12, %k0, %k1
; AVX512F-NEXT: vblendmps (%rdi), %zmm1, %zmm0 {%k1}
; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: load_v4f32_v4i32:
; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: vptestnmd %xmm0, %xmm0, %k1
; AVX512VL-NEXT: vblendmps (%rdi), %xmm1, %xmm0 {%k1}
; AVX512VL-NEXT: retq
%mask = icmp eq <4 x i32> %trigger, zeroinitializer
%res = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %addr, i32 4, <4 x i1> %mask, <4 x float> %dst)
ret <4 x float> %res
}
define <8 x float> @load_v8f32_v8i1_zero(<8 x i1> %mask, <8 x float>* %addr) {
; SSE2-LABEL: load_v8f32_v8i1_zero:
; SSE2: ## %bb.0:
; SSE2-NEXT: psllw $15, %xmm0
; SSE2-NEXT: packsswb %xmm0, %xmm0
; SSE2-NEXT: pmovmskb %xmm0, %eax
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: xorps %xmm1, %xmm1
; SSE2-NEXT: jne LBB10_1
; SSE2-NEXT: ## %bb.2: ## %else
; SSE2-NEXT: testb $2, %al
; SSE2-NEXT: jne LBB10_3
; SSE2-NEXT: LBB10_4: ## %else2
; SSE2-NEXT: testb $4, %al
; SSE2-NEXT: jne LBB10_5
; SSE2-NEXT: LBB10_6: ## %else5
; SSE2-NEXT: testb $8, %al
; SSE2-NEXT: jne LBB10_7
; SSE2-NEXT: LBB10_8: ## %else8
; SSE2-NEXT: testb $16, %al
; SSE2-NEXT: jne LBB10_9
; SSE2-NEXT: LBB10_10: ## %else11
; SSE2-NEXT: testb $32, %al
; SSE2-NEXT: jne LBB10_11
; SSE2-NEXT: LBB10_12: ## %else14
; SSE2-NEXT: testb $64, %al
; SSE2-NEXT: jne LBB10_13
; SSE2-NEXT: LBB10_14: ## %else17
; SSE2-NEXT: testb $-128, %al
; SSE2-NEXT: jne LBB10_15
; SSE2-NEXT: LBB10_16: ## %else20
; SSE2-NEXT: retq
; SSE2-NEXT: LBB10_1: ## %cond.load
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: xorps %xmm1, %xmm1
; SSE2-NEXT: testb $2, %al
; SSE2-NEXT: je LBB10_4
; SSE2-NEXT: LBB10_3: ## %cond.load1
; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[0,0]
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[2,3]
; SSE2-NEXT: movaps %xmm2, %xmm0
; SSE2-NEXT: testb $4, %al
; SSE2-NEXT: je LBB10_6
; SSE2-NEXT: LBB10_5: ## %cond.load4
; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[3,0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0,2]
; SSE2-NEXT: testb $8, %al
; SSE2-NEXT: je LBB10_8
; SSE2-NEXT: LBB10_7: ## %cond.load7
; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[2,0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0]
; SSE2-NEXT: testb $16, %al
; SSE2-NEXT: je LBB10_10
; SSE2-NEXT: LBB10_9: ## %cond.load10
; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
; SSE2-NEXT: testb $32, %al
; SSE2-NEXT: je LBB10_12
; SSE2-NEXT: LBB10_11: ## %cond.load13
; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[0,0]
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,3]
; SSE2-NEXT: movaps %xmm2, %xmm1
; SSE2-NEXT: testb $64, %al
; SSE2-NEXT: je LBB10_14
; SSE2-NEXT: LBB10_13: ## %cond.load16
; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[3,0]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2]
; SSE2-NEXT: testb $-128, %al
; SSE2-NEXT: je LBB10_16
; SSE2-NEXT: LBB10_15: ## %cond.load19
; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[2,0]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,0]
; SSE2-NEXT: retq
;
; SSE42-LABEL: load_v8f32_v8i1_zero:
; SSE42: ## %bb.0:
; SSE42-NEXT: psllw $15, %xmm0
; SSE42-NEXT: packsswb %xmm0, %xmm0
; SSE42-NEXT: pmovmskb %xmm0, %eax
; SSE42-NEXT: pxor %xmm0, %xmm0
; SSE42-NEXT: testb $1, %al
; SSE42-NEXT: xorps %xmm1, %xmm1
; SSE42-NEXT: jne LBB10_1
; SSE42-NEXT: ## %bb.2: ## %else
; SSE42-NEXT: testb $2, %al
; SSE42-NEXT: jne LBB10_3
; SSE42-NEXT: LBB10_4: ## %else2
; SSE42-NEXT: testb $4, %al
; SSE42-NEXT: jne LBB10_5
; SSE42-NEXT: LBB10_6: ## %else5
; SSE42-NEXT: testb $8, %al
; SSE42-NEXT: jne LBB10_7
; SSE42-NEXT: LBB10_8: ## %else8
; SSE42-NEXT: testb $16, %al
; SSE42-NEXT: jne LBB10_9
; SSE42-NEXT: LBB10_10: ## %else11
; SSE42-NEXT: testb $32, %al
; SSE42-NEXT: jne LBB10_11
; SSE42-NEXT: LBB10_12: ## %else14
; SSE42-NEXT: testb $64, %al
; SSE42-NEXT: jne LBB10_13
; SSE42-NEXT: LBB10_14: ## %else17
; SSE42-NEXT: testb $-128, %al
; SSE42-NEXT: jne LBB10_15
; SSE42-NEXT: LBB10_16: ## %else20
; SSE42-NEXT: retq
; SSE42-NEXT: LBB10_1: ## %cond.load
; SSE42-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE42-NEXT: xorps %xmm1, %xmm1
; SSE42-NEXT: testb $2, %al
; SSE42-NEXT: je LBB10_4
; SSE42-NEXT: LBB10_3: ## %cond.load1
; SSE42-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
; SSE42-NEXT: testb $4, %al
; SSE42-NEXT: je LBB10_6
; SSE42-NEXT: LBB10_5: ## %cond.load4
; SSE42-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
; SSE42-NEXT: testb $8, %al
; SSE42-NEXT: je LBB10_8
; SSE42-NEXT: LBB10_7: ## %cond.load7
; SSE42-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
; SSE42-NEXT: testb $16, %al
; SSE42-NEXT: je LBB10_10
; SSE42-NEXT: LBB10_9: ## %cond.load10
; SSE42-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE42-NEXT: blendps {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
; SSE42-NEXT: testb $32, %al
; SSE42-NEXT: je LBB10_12
; SSE42-NEXT: LBB10_11: ## %cond.load13
; SSE42-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3]
; SSE42-NEXT: testb $64, %al
; SSE42-NEXT: je LBB10_14
; SSE42-NEXT: LBB10_13: ## %cond.load16
; SSE42-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3]
; SSE42-NEXT: testb $-128, %al
; SSE42-NEXT: je LBB10_16
; SSE42-NEXT: LBB10_15: ## %cond.load19
; SSE42-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0]
; SSE42-NEXT: retq
;
; AVX1-LABEL: load_v8f32_v8i1_zero:
; AVX1: ## %bb.0:
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX1-NEXT: vpslld $31, %xmm1, %xmm1
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpslld $31, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vmaskmovps (%rdi), %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_v8f32_v8i1_zero:
; AVX2: ## %bb.0:
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: vpslld $31, %ymm0, %ymm0
; AVX2-NEXT: vmaskmovps (%rdi), %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: load_v8f32_v8i1_zero:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0
; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
; AVX512F-NEXT: vmovups (%rdi), %zmm0 {%k1} {z}
; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT: retq
;
; AVX512VLDQ-LABEL: load_v8f32_v8i1_zero:
; AVX512VLDQ: ## %bb.0:
; AVX512VLDQ-NEXT: vpmovsxwd %xmm0, %ymm0
; AVX512VLDQ-NEXT: vpslld $31, %ymm0, %ymm0
; AVX512VLDQ-NEXT: vpmovd2m %ymm0, %k1
; AVX512VLDQ-NEXT: vmovaps (%rdi), %ymm0 {%k1} {z}
; AVX512VLDQ-NEXT: retq
;
; AVX512VLBW-LABEL: load_v8f32_v8i1_zero:
; AVX512VLBW: ## %bb.0:
; AVX512VLBW-NEXT: vpsllw $15, %xmm0, %xmm0
; AVX512VLBW-NEXT: vpmovw2m %xmm0, %k1
; AVX512VLBW-NEXT: vmovaps (%rdi), %ymm0 {%k1} {z}
; AVX512VLBW-NEXT: retq
%res = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* %addr, i32 32, <8 x i1> %mask, <8 x float> zeroinitializer)
ret <8 x float> %res
}
define <8 x float> @load_v8f32_v8i32(<8 x i32> %trigger, <8 x float>* %addr, <8 x float> %dst) {
; SSE2-LABEL: load_v8f32_v8i32:
; SSE2: ## %bb.0:
; SSE2-NEXT: pxor %xmm4, %xmm4
; SSE2-NEXT: pcmpeqd %xmm4, %xmm1
; SSE2-NEXT: pcmpeqd %xmm4, %xmm0
; SSE2-NEXT: packssdw %xmm1, %xmm0
; SSE2-NEXT: packsswb %xmm0, %xmm0
; SSE2-NEXT: pmovmskb %xmm0, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: jne LBB11_1
; SSE2-NEXT: ## %bb.2: ## %else
; SSE2-NEXT: testb $2, %al
; SSE2-NEXT: jne LBB11_3
; SSE2-NEXT: LBB11_4: ## %else2
; SSE2-NEXT: testb $4, %al
; SSE2-NEXT: jne LBB11_5
; SSE2-NEXT: LBB11_6: ## %else5
; SSE2-NEXT: testb $8, %al
; SSE2-NEXT: jne LBB11_7
; SSE2-NEXT: LBB11_8: ## %else8
; SSE2-NEXT: testb $16, %al
; SSE2-NEXT: jne LBB11_9
; SSE2-NEXT: LBB11_10: ## %else11
; SSE2-NEXT: testb $32, %al
; SSE2-NEXT: jne LBB11_11
; SSE2-NEXT: LBB11_12: ## %else14
; SSE2-NEXT: testb $64, %al
; SSE2-NEXT: jne LBB11_13
; SSE2-NEXT: LBB11_14: ## %else17
; SSE2-NEXT: testb $-128, %al
; SSE2-NEXT: je LBB11_16
; SSE2-NEXT: LBB11_15: ## %cond.load19
; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm3[2,0]
; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0]
; SSE2-NEXT: LBB11_16: ## %else20
; SSE2-NEXT: movaps %xmm2, %xmm0
; SSE2-NEXT: movaps %xmm3, %xmm1
; SSE2-NEXT: retq
; SSE2-NEXT: LBB11_1: ## %cond.load
; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
; SSE2-NEXT: testb $2, %al
; SSE2-NEXT: je LBB11_4
; SSE2-NEXT: LBB11_3: ## %cond.load1
; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm2[0,0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,3]
; SSE2-NEXT: movaps %xmm0, %xmm2
; SSE2-NEXT: testb $4, %al
; SSE2-NEXT: je LBB11_6
; SSE2-NEXT: LBB11_5: ## %cond.load4
; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm2[3,0]
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[0,2]
; SSE2-NEXT: testb $8, %al
; SSE2-NEXT: je LBB11_8
; SSE2-NEXT: LBB11_7: ## %cond.load7
; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm2[2,0]
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0]
; SSE2-NEXT: testb $16, %al
; SSE2-NEXT: je LBB11_10
; SSE2-NEXT: LBB11_9: ## %cond.load10
; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: movss {{.*#+}} xmm3 = xmm0[0],xmm3[1,2,3]
; SSE2-NEXT: testb $32, %al
; SSE2-NEXT: je LBB11_12
; SSE2-NEXT: LBB11_11: ## %cond.load13
; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm3[0,0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[2,3]
; SSE2-NEXT: movaps %xmm0, %xmm3
; SSE2-NEXT: testb $64, %al
; SSE2-NEXT: je LBB11_14
; SSE2-NEXT: LBB11_13: ## %cond.load16
; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm3[3,0]
; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[0,2]
; SSE2-NEXT: testb $-128, %al
; SSE2-NEXT: jne LBB11_15
; SSE2-NEXT: jmp LBB11_16
;
; SSE42-LABEL: load_v8f32_v8i32:
; SSE42: ## %bb.0:
; SSE42-NEXT: pxor %xmm4, %xmm4
; SSE42-NEXT: pcmpeqd %xmm4, %xmm1
; SSE42-NEXT: pcmpeqd %xmm4, %xmm0
; SSE42-NEXT: packssdw %xmm1, %xmm0
; SSE42-NEXT: packsswb %xmm0, %xmm0
; SSE42-NEXT: pmovmskb %xmm0, %eax
; SSE42-NEXT: testb $1, %al
; SSE42-NEXT: jne LBB11_1
; SSE42-NEXT: ## %bb.2: ## %else
; SSE42-NEXT: testb $2, %al
; SSE42-NEXT: jne LBB11_3
; SSE42-NEXT: LBB11_4: ## %else2
; SSE42-NEXT: testb $4, %al
; SSE42-NEXT: jne LBB11_5
; SSE42-NEXT: LBB11_6: ## %else5
; SSE42-NEXT: testb $8, %al
; SSE42-NEXT: jne LBB11_7
; SSE42-NEXT: LBB11_8: ## %else8
; SSE42-NEXT: testb $16, %al
; SSE42-NEXT: jne LBB11_9
; SSE42-NEXT: LBB11_10: ## %else11
; SSE42-NEXT: testb $32, %al
; SSE42-NEXT: jne LBB11_11
; SSE42-NEXT: LBB11_12: ## %else14
; SSE42-NEXT: testb $64, %al
; SSE42-NEXT: jne LBB11_13
; SSE42-NEXT: LBB11_14: ## %else17
; SSE42-NEXT: testb $-128, %al
; SSE42-NEXT: je LBB11_16
; SSE42-NEXT: LBB11_15: ## %cond.load19
; SSE42-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1,2],mem[0]
; SSE42-NEXT: LBB11_16: ## %else20
; SSE42-NEXT: movaps %xmm2, %xmm0
; SSE42-NEXT: movaps %xmm3, %xmm1
; SSE42-NEXT: retq
; SSE42-NEXT: LBB11_1: ## %cond.load
; SSE42-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3,4,5,6,7]
; SSE42-NEXT: testb $2, %al
; SSE42-NEXT: je LBB11_4
; SSE42-NEXT: LBB11_3: ## %cond.load1
; SSE42-NEXT: insertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
; SSE42-NEXT: testb $4, %al
; SSE42-NEXT: je LBB11_6
; SSE42-NEXT: LBB11_5: ## %cond.load4
; SSE42-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
; SSE42-NEXT: testb $8, %al
; SSE42-NEXT: je LBB11_8
; SSE42-NEXT: LBB11_7: ## %cond.load7
; SSE42-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0]
; SSE42-NEXT: testb $16, %al
; SSE42-NEXT: je LBB11_10
; SSE42-NEXT: LBB11_9: ## %cond.load10
; SSE42-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm0[0,1],xmm3[2,3,4,5,6,7]
; SSE42-NEXT: testb $32, %al
; SSE42-NEXT: je LBB11_12
; SSE42-NEXT: LBB11_11: ## %cond.load13
; SSE42-NEXT: insertps {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[2,3]
; SSE42-NEXT: testb $64, %al
; SSE42-NEXT: je LBB11_14
; SSE42-NEXT: LBB11_13: ## %cond.load16
; SSE42-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1],mem[0],xmm3[3]
; SSE42-NEXT: testb $-128, %al
; SSE42-NEXT: jne LBB11_15
; SSE42-NEXT: jmp LBB11_16
;
; AVX1-LABEL: load_v8f32_v8i32:
; AVX1: ## %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vmaskmovps (%rdi), %ymm0, %ymm2
; AVX1-NEXT: vblendvps %ymm0, %ymm2, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_v8f32_v8i32:
; AVX2: ## %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpcmpeqd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vmaskmovps (%rdi), %ymm0, %ymm2
; AVX2-NEXT: vblendvps %ymm0, %ymm2, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: load_v8f32_v8i32:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: ## kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: kshiftlw $8, %k0, %k0
; AVX512F-NEXT: kshiftrw $8, %k0, %k1
; AVX512F-NEXT: vblendmps (%rdi), %zmm1, %zmm0 {%k1}
; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: load_v8f32_v8i32:
; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k1
; AVX512VL-NEXT: vblendmps (%rdi), %ymm1, %ymm0 {%k1}
; AVX512VL-NEXT: retq
%mask = icmp eq <8 x i32> %trigger, zeroinitializer
%res = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* %addr, i32 32, <8 x i1> %mask, <8 x float> %dst)
ret <8 x float> %res
}
;
; vXi64
;
define <1 x i64> @load_v1i64_v1i64(<1 x i64> %trigger, <1 x i64>* %addr, <1 x i64> %dst) {
; SSE-LABEL: load_v1i64_v1i64:
; SSE: ## %bb.0:
; SSE-NEXT: testq %rdi, %rdi
; SSE-NEXT: jne LBB12_1
; SSE-NEXT: ## %bb.2: ## %cond.load
; SSE-NEXT: movq (%rsi), %rax
; SSE-NEXT: retq
; SSE-NEXT: LBB12_1:
; SSE-NEXT: movq %rdx, %rax
; SSE-NEXT: retq
;
; AVX-LABEL: load_v1i64_v1i64:
; AVX: ## %bb.0:
; AVX-NEXT: testq %rdi, %rdi
; AVX-NEXT: jne LBB12_1
; AVX-NEXT: ## %bb.2: ## %cond.load
; AVX-NEXT: movq (%rsi), %rax
; AVX-NEXT: retq
; AVX-NEXT: LBB12_1:
; AVX-NEXT: movq %rdx, %rax
; AVX-NEXT: retq
%mask = icmp eq <1 x i64> %trigger, zeroinitializer
%res = call <1 x i64> @llvm.masked.load.v1i64.p0v1i64(<1 x i64>* %addr, i32 4, <1 x i1> %mask, <1 x i64> %dst)
ret <1 x i64> %res
}
define <2 x i64> @load_v2i64_v2i64(<2 x i64> %trigger, <2 x i64>* %addr, <2 x i64> %dst) {
; SSE2-LABEL: load_v2i64_v2i64:
; SSE2: ## %bb.0:
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: pcmpeqd %xmm0, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2]
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: movmskpd %xmm0, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: jne LBB13_1
; SSE2-NEXT: ## %bb.2: ## %else
; SSE2-NEXT: testb $2, %al
; SSE2-NEXT: jne LBB13_3
; SSE2-NEXT: LBB13_4: ## %else2
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
; SSE2-NEXT: LBB13_1: ## %cond.load
; SSE2-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
; SSE2-NEXT: testb $2, %al
; SSE2-NEXT: je LBB13_4
; SSE2-NEXT: LBB13_3: ## %cond.load1
; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE42-LABEL: load_v2i64_v2i64:
; SSE42: ## %bb.0:
; SSE42-NEXT: pxor %xmm2, %xmm2
; SSE42-NEXT: pcmpeqq %xmm0, %xmm2
; SSE42-NEXT: movmskpd %xmm2, %eax
; SSE42-NEXT: testb $1, %al
; SSE42-NEXT: jne LBB13_1
; SSE42-NEXT: ## %bb.2: ## %else
; SSE42-NEXT: testb $2, %al
; SSE42-NEXT: jne LBB13_3
; SSE42-NEXT: LBB13_4: ## %else2
; SSE42-NEXT: movdqa %xmm1, %xmm0
; SSE42-NEXT: retq
; SSE42-NEXT: LBB13_1: ## %cond.load
; SSE42-NEXT: pinsrq $0, (%rdi), %xmm1
; SSE42-NEXT: testb $2, %al
; SSE42-NEXT: je LBB13_4
; SSE42-NEXT: LBB13_3: ## %cond.load1
; SSE42-NEXT: pinsrq $1, 8(%rdi), %xmm1
; SSE42-NEXT: movdqa %xmm1, %xmm0
; SSE42-NEXT: retq
;
; AVX1-LABEL: load_v2i64_v2i64:
; AVX1: ## %bb.0:
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vmaskmovpd (%rdi), %xmm0, %xmm2
; AVX1-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_v2i64_v2i64:
; AVX2: ## %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpmaskmovq (%rdi), %xmm0, %xmm2
; AVX2-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: load_v2i64_v2i64:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT: vptestnmq %zmm0, %zmm0, %k0
; AVX512F-NEXT: kshiftlw $14, %k0, %k0
; AVX512F-NEXT: kshiftrw $14, %k0, %k1
; AVX512F-NEXT: vpblendmq (%rdi), %zmm1, %zmm0 {%k1}
; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: load_v2i64_v2i64:
; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: vptestnmq %xmm0, %xmm0, %k1
; AVX512VL-NEXT: vpblendmq (%rdi), %xmm1, %xmm0 {%k1}
; AVX512VL-NEXT: retq
%mask = icmp eq <2 x i64> %trigger, zeroinitializer
%res = call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* %addr, i32 4, <2 x i1> %mask, <2 x i64> %dst)
ret <2 x i64> %res
}
define <4 x i64> @load_v4i64_v4i64(<4 x i64> %trigger, <4 x i64>* %addr, <4 x i64> %dst) {
; SSE2-LABEL: load_v4i64_v4i64:
; SSE2: ## %bb.0:
; SSE2-NEXT: pxor %xmm4, %xmm4
; SSE2-NEXT: pcmpeqd %xmm4, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,0,3,2]
; SSE2-NEXT: pand %xmm1, %xmm5
; SSE2-NEXT: pcmpeqd %xmm4, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: packssdw %xmm5, %xmm1
; SSE2-NEXT: movmskps %xmm1, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: jne LBB14_1
; SSE2-NEXT: ## %bb.2: ## %else
; SSE2-NEXT: testb $2, %al
; SSE2-NEXT: jne LBB14_3
; SSE2-NEXT: LBB14_4: ## %else2
; SSE2-NEXT: testb $4, %al
; SSE2-NEXT: jne LBB14_5
; SSE2-NEXT: LBB14_6: ## %else5
; SSE2-NEXT: testb $8, %al
; SSE2-NEXT: je LBB14_8
; SSE2-NEXT: LBB14_7: ## %cond.load7
; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0]
; SSE2-NEXT: LBB14_8: ## %else8
; SSE2-NEXT: movaps %xmm2, %xmm0
; SSE2-NEXT: movaps %xmm3, %xmm1
; SSE2-NEXT: retq
; SSE2-NEXT: LBB14_1: ## %cond.load
; SSE2-NEXT: movlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3]
; SSE2-NEXT: testb $2, %al
; SSE2-NEXT: je LBB14_4
; SSE2-NEXT: LBB14_3: ## %cond.load1
; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
; SSE2-NEXT: testb $4, %al
; SSE2-NEXT: je LBB14_6
; SSE2-NEXT: LBB14_5: ## %cond.load4
; SSE2-NEXT: movlps {{.*#+}} xmm3 = mem[0,1],xmm3[2,3]
; SSE2-NEXT: testb $8, %al
; SSE2-NEXT: jne LBB14_7
; SSE2-NEXT: jmp LBB14_8
;
; SSE42-LABEL: load_v4i64_v4i64:
; SSE42: ## %bb.0:
; SSE42-NEXT: pxor %xmm4, %xmm4
; SSE42-NEXT: pcmpeqq %xmm4, %xmm1
; SSE42-NEXT: pcmpeqq %xmm4, %xmm0
; SSE42-NEXT: packssdw %xmm1, %xmm0
; SSE42-NEXT: movmskps %xmm0, %eax
; SSE42-NEXT: testb $1, %al
; SSE42-NEXT: jne LBB14_1
; SSE42-NEXT: ## %bb.2: ## %else
; SSE42-NEXT: testb $2, %al
; SSE42-NEXT: jne LBB14_3
; SSE42-NEXT: LBB14_4: ## %else2
; SSE42-NEXT: testb $4, %al
; SSE42-NEXT: jne LBB14_5
; SSE42-NEXT: LBB14_6: ## %else5
; SSE42-NEXT: testb $8, %al
; SSE42-NEXT: je LBB14_8
; SSE42-NEXT: LBB14_7: ## %cond.load7
; SSE42-NEXT: pinsrq $1, 24(%rdi), %xmm3
; SSE42-NEXT: LBB14_8: ## %else8
; SSE42-NEXT: movdqa %xmm2, %xmm0
; SSE42-NEXT: movdqa %xmm3, %xmm1
; SSE42-NEXT: retq
; SSE42-NEXT: LBB14_1: ## %cond.load
; SSE42-NEXT: pinsrq $0, (%rdi), %xmm2
; SSE42-NEXT: testb $2, %al
; SSE42-NEXT: je LBB14_4
; SSE42-NEXT: LBB14_3: ## %cond.load1
; SSE42-NEXT: pinsrq $1, 8(%rdi), %xmm2
; SSE42-NEXT: testb $4, %al
; SSE42-NEXT: je LBB14_6
; SSE42-NEXT: LBB14_5: ## %cond.load4
; SSE42-NEXT: pinsrq $0, 16(%rdi), %xmm3
; SSE42-NEXT: testb $8, %al
; SSE42-NEXT: jne LBB14_7
; SSE42-NEXT: jmp LBB14_8
;
; AVX1-LABEL: load_v4i64_v4i64:
; AVX1: ## %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpcmpeqq %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpcmpeqq %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm2
; AVX1-NEXT: vblendvpd %ymm0, %ymm2, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_v4i64_v4i64:
; AVX2: ## %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpcmpeqq %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpmaskmovq (%rdi), %ymm0, %ymm2
; AVX2-NEXT: vblendvpd %ymm0, %ymm2, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: load_v4i64_v4i64:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: ## kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: vptestnmq %zmm0, %zmm0, %k0
; AVX512F-NEXT: kshiftlw $12, %k0, %k0
; AVX512F-NEXT: kshiftrw $12, %k0, %k1
; AVX512F-NEXT: vpblendmq (%rdi), %zmm1, %zmm0 {%k1}
; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: load_v4i64_v4i64:
; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: vptestnmq %ymm0, %ymm0, %k1
; AVX512VL-NEXT: vpblendmq (%rdi), %ymm1, %ymm0 {%k1}
; AVX512VL-NEXT: retq
%mask = icmp eq <4 x i64> %trigger, zeroinitializer
%res = call <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* %addr, i32 4, <4 x i1> %mask, <4 x i64> %dst)
ret <4 x i64> %res
}
define <8 x i64> @load_v8i64_v8i16(<8 x i16> %trigger, <8 x i64>* %addr, <8 x i64> %dst) {
; SSE2-LABEL: load_v8i64_v8i16:
; SSE2: ## %bb.0:
; SSE2-NEXT: pxor %xmm5, %xmm5
; SSE2-NEXT: pcmpeqw %xmm0, %xmm5
; SSE2-NEXT: packsswb %xmm5, %xmm5
; SSE2-NEXT: pmovmskb %xmm5, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: jne LBB15_1
; SSE2-NEXT: ## %bb.2: ## %else
; SSE2-NEXT: testb $2, %al
; SSE2-NEXT: jne LBB15_3
; SSE2-NEXT: LBB15_4: ## %else2
; SSE2-NEXT: testb $4, %al
; SSE2-NEXT: jne LBB15_5
; SSE2-NEXT: LBB15_6: ## %else5
; SSE2-NEXT: testb $8, %al
; SSE2-NEXT: jne LBB15_7
; SSE2-NEXT: LBB15_8: ## %else8
; SSE2-NEXT: testb $16, %al
; SSE2-NEXT: jne LBB15_9
; SSE2-NEXT: LBB15_10: ## %else11
; SSE2-NEXT: testb $32, %al
; SSE2-NEXT: jne LBB15_11
; SSE2-NEXT: LBB15_12: ## %else14
; SSE2-NEXT: testb $64, %al
; SSE2-NEXT: jne LBB15_13
; SSE2-NEXT: LBB15_14: ## %else17
; SSE2-NEXT: testb $-128, %al
; SSE2-NEXT: je LBB15_16
; SSE2-NEXT: LBB15_15: ## %cond.load19
; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0]
; SSE2-NEXT: LBB15_16: ## %else20
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: movaps %xmm2, %xmm1
; SSE2-NEXT: movaps %xmm3, %xmm2
; SSE2-NEXT: movaps %xmm4, %xmm3
; SSE2-NEXT: retq
; SSE2-NEXT: LBB15_1: ## %cond.load
; SSE2-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
; SSE2-NEXT: testb $2, %al
; SSE2-NEXT: je LBB15_4
; SSE2-NEXT: LBB15_3: ## %cond.load1
; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT: testb $4, %al
; SSE2-NEXT: je LBB15_6
; SSE2-NEXT: LBB15_5: ## %cond.load4
; SSE2-NEXT: movlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3]
; SSE2-NEXT: testb $8, %al
; SSE2-NEXT: je LBB15_8
; SSE2-NEXT: LBB15_7: ## %cond.load7
; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
; SSE2-NEXT: testb $16, %al
; SSE2-NEXT: je LBB15_10
; SSE2-NEXT: LBB15_9: ## %cond.load10
; SSE2-NEXT: movlps {{.*#+}} xmm3 = mem[0,1],xmm3[2,3]
; SSE2-NEXT: testb $32, %al
; SSE2-NEXT: je LBB15_12
; SSE2-NEXT: LBB15_11: ## %cond.load13
; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0]
; SSE2-NEXT: testb $64, %al
; SSE2-NEXT: je LBB15_14
; SSE2-NEXT: LBB15_13: ## %cond.load16
; SSE2-NEXT: movlps {{.*#+}} xmm4 = mem[0,1],xmm4[2,3]
; SSE2-NEXT: testb $-128, %al
; SSE2-NEXT: jne LBB15_15
; SSE2-NEXT: jmp LBB15_16
;
; SSE42-LABEL: load_v8i64_v8i16:
; SSE42: ## %bb.0:
; SSE42-NEXT: pxor %xmm5, %xmm5
; SSE42-NEXT: pcmpeqw %xmm0, %xmm5
; SSE42-NEXT: packsswb %xmm5, %xmm5
; SSE42-NEXT: pmovmskb %xmm5, %eax
; SSE42-NEXT: testb $1, %al
; SSE42-NEXT: jne LBB15_1
; SSE42-NEXT: ## %bb.2: ## %else
; SSE42-NEXT: testb $2, %al
; SSE42-NEXT: jne LBB15_3
; SSE42-NEXT: LBB15_4: ## %else2
; SSE42-NEXT: testb $4, %al
; SSE42-NEXT: jne LBB15_5
; SSE42-NEXT: LBB15_6: ## %else5
; SSE42-NEXT: testb $8, %al
; SSE42-NEXT: jne LBB15_7
; SSE42-NEXT: LBB15_8: ## %else8
; SSE42-NEXT: testb $16, %al
; SSE42-NEXT: jne LBB15_9
; SSE42-NEXT: LBB15_10: ## %else11
; SSE42-NEXT: testb $32, %al
; SSE42-NEXT: jne LBB15_11
; SSE42-NEXT: LBB15_12: ## %else14
; SSE42-NEXT: testb $64, %al
; SSE42-NEXT: jne LBB15_13
; SSE42-NEXT: LBB15_14: ## %else17
; SSE42-NEXT: testb $-128, %al
; SSE42-NEXT: je LBB15_16
; SSE42-NEXT: LBB15_15: ## %cond.load19
; SSE42-NEXT: pinsrq $1, 56(%rdi), %xmm4
; SSE42-NEXT: LBB15_16: ## %else20
; SSE42-NEXT: movdqa %xmm1, %xmm0
; SSE42-NEXT: movdqa %xmm2, %xmm1
; SSE42-NEXT: movdqa %xmm3, %xmm2
; SSE42-NEXT: movdqa %xmm4, %xmm3
; SSE42-NEXT: retq
; SSE42-NEXT: LBB15_1: ## %cond.load
; SSE42-NEXT: pinsrq $0, (%rdi), %xmm1
; SSE42-NEXT: testb $2, %al
; SSE42-NEXT: je LBB15_4
; SSE42-NEXT: LBB15_3: ## %cond.load1
; SSE42-NEXT: pinsrq $1, 8(%rdi), %xmm1
; SSE42-NEXT: testb $4, %al
; SSE42-NEXT: je LBB15_6
; SSE42-NEXT: LBB15_5: ## %cond.load4
; SSE42-NEXT: pinsrq $0, 16(%rdi), %xmm2
; SSE42-NEXT: testb $8, %al
; SSE42-NEXT: je LBB15_8
; SSE42-NEXT: LBB15_7: ## %cond.load7
; SSE42-NEXT: pinsrq $1, 24(%rdi), %xmm2
; SSE42-NEXT: testb $16, %al
; SSE42-NEXT: je LBB15_10
; SSE42-NEXT: LBB15_9: ## %cond.load10
; SSE42-NEXT: pinsrq $0, 32(%rdi), %xmm3
; SSE42-NEXT: testb $32, %al
; SSE42-NEXT: je LBB15_12
; SSE42-NEXT: LBB15_11: ## %cond.load13
; SSE42-NEXT: pinsrq $1, 40(%rdi), %xmm3
; SSE42-NEXT: testb $64, %al
; SSE42-NEXT: je LBB15_14
; SSE42-NEXT: LBB15_13: ## %cond.load16
; SSE42-NEXT: pinsrq $0, 48(%rdi), %xmm4
; SSE42-NEXT: testb $-128, %al
; SSE42-NEXT: jne LBB15_15
; SSE42-NEXT: jmp LBB15_16
;
; AVX1-LABEL: load_v8i64_v8i16:
; AVX1: ## %bb.0:
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX1-NEXT: vpcmpeqw %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpmovsxwd %xmm3, %xmm3
; AVX1-NEXT: vpmovsxdq %xmm3, %xmm5
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
; AVX1-NEXT: vpmovsxdq %xmm3, %xmm3
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3
; AVX1-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpmovsxwq %xmm0, %xmm4
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; AVX1-NEXT: vpmovsxwq %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0
; AVX1-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm4
; AVX1-NEXT: vblendvpd %ymm0, %ymm4, %ymm1, %ymm0
; AVX1-NEXT: vmaskmovpd 32(%rdi), %ymm3, %ymm1
; AVX1-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_v8i64_v8i16:
; AVX2: ## %bb.0:
; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX2-NEXT: vpcmpeqw %xmm4, %xmm3, %xmm3
; AVX2-NEXT: vpmovsxwd %xmm3, %xmm3
; AVX2-NEXT: vpmovsxdq %xmm3, %ymm3
; AVX2-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm0
; AVX2-NEXT: vpmovsxwq %xmm0, %ymm0
; AVX2-NEXT: vpmaskmovq (%rdi), %ymm0, %ymm4
; AVX2-NEXT: vblendvpd %ymm0, %ymm4, %ymm1, %ymm0
; AVX2-NEXT: vpmaskmovq 32(%rdi), %ymm3, %ymm1
; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
; AVX2-NEXT: retq
;
; AVX512F-LABEL: load_v8i64_v8i16:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0
; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
; AVX512F-NEXT: vpblendmq (%rdi), %zmm1, %zmm0 {%k1}
; AVX512F-NEXT: retq
;
; AVX512VLDQ-LABEL: load_v8i64_v8i16:
; AVX512VLDQ: ## %bb.0:
; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLDQ-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm0
; AVX512VLDQ-NEXT: vpmovsxwd %xmm0, %ymm0
; AVX512VLDQ-NEXT: vpmovd2m %ymm0, %k1
; AVX512VLDQ-NEXT: vpblendmq (%rdi), %zmm1, %zmm0 {%k1}
; AVX512VLDQ-NEXT: retq
;
; AVX512VLBW-LABEL: load_v8i64_v8i16:
; AVX512VLBW: ## %bb.0:
; AVX512VLBW-NEXT: vptestnmw %xmm0, %xmm0, %k1
; AVX512VLBW-NEXT: vpblendmq (%rdi), %zmm1, %zmm0 {%k1}
; AVX512VLBW-NEXT: retq
%mask = icmp eq <8 x i16> %trigger, zeroinitializer
%res = call <8 x i64> @llvm.masked.load.v8i64.p0v8i64(<8 x i64>* %addr, i32 4, <8 x i1> %mask, <8 x i64> %dst)
ret <8 x i64> %res
}
define <8 x i64> @load_v8i64_v8i64(<8 x i64> %trigger, <8 x i64>* %addr, <8 x i64> %dst) {
; SSE2-LABEL: load_v8i64_v8i64:
; SSE2: ## %bb.0:
; SSE2-NEXT: movdqa %xmm7, %xmm8
; SSE2-NEXT: movaps %xmm6, %xmm9
; SSE2-NEXT: pxor %xmm7, %xmm7
; SSE2-NEXT: pcmpeqd %xmm7, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[1,0,3,2]
; SSE2-NEXT: pand %xmm3, %xmm6
; SSE2-NEXT: pcmpeqd %xmm7, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2]
; SSE2-NEXT: pand %xmm2, %xmm3
; SSE2-NEXT: packssdw %xmm6, %xmm3
; SSE2-NEXT: pcmpeqd %xmm7, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,0,3,2]
; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: pcmpeqd %xmm7, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: packssdw %xmm2, %xmm1
; SSE2-NEXT: packssdw %xmm3, %xmm1
; SSE2-NEXT: packsswb %xmm1, %xmm1
; SSE2-NEXT: pmovmskb %xmm1, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: jne LBB16_1
; SSE2-NEXT: ## %bb.2: ## %else
; SSE2-NEXT: testb $2, %al
; SSE2-NEXT: jne LBB16_3
; SSE2-NEXT: LBB16_4: ## %else2
; SSE2-NEXT: testb $4, %al
; SSE2-NEXT: jne LBB16_5
; SSE2-NEXT: LBB16_6: ## %else5
; SSE2-NEXT: testb $8, %al
; SSE2-NEXT: jne LBB16_7
; SSE2-NEXT: LBB16_8: ## %else8
; SSE2-NEXT: testb $16, %al
; SSE2-NEXT: jne LBB16_9
; SSE2-NEXT: LBB16_10: ## %else11
; SSE2-NEXT: testb $32, %al
; SSE2-NEXT: jne LBB16_11
; SSE2-NEXT: LBB16_12: ## %else14
; SSE2-NEXT: testb $64, %al
; SSE2-NEXT: jne LBB16_13
; SSE2-NEXT: LBB16_14: ## %else17
; SSE2-NEXT: testb $-128, %al
; SSE2-NEXT: je LBB16_16
; SSE2-NEXT: LBB16_15: ## %cond.load19
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm0[0]
; SSE2-NEXT: LBB16_16: ## %else20
; SSE2-NEXT: movaps %xmm4, %xmm0
; SSE2-NEXT: movaps %xmm5, %xmm1
; SSE2-NEXT: movaps %xmm9, %xmm2
; SSE2-NEXT: movdqa %xmm8, %xmm3
; SSE2-NEXT: retq
; SSE2-NEXT: LBB16_1: ## %cond.load
; SSE2-NEXT: movlps {{.*#+}} xmm4 = mem[0,1],xmm4[2,3]
; SSE2-NEXT: testb $2, %al
; SSE2-NEXT: je LBB16_4
; SSE2-NEXT: LBB16_3: ## %cond.load1
; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0]
; SSE2-NEXT: testb $4, %al
; SSE2-NEXT: je LBB16_6
; SSE2-NEXT: LBB16_5: ## %cond.load4
; SSE2-NEXT: movlps {{.*#+}} xmm5 = mem[0,1],xmm5[2,3]
; SSE2-NEXT: testb $8, %al
; SSE2-NEXT: je LBB16_8
; SSE2-NEXT: LBB16_7: ## %cond.load7
; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0]
; SSE2-NEXT: testb $16, %al
; SSE2-NEXT: je LBB16_10
; SSE2-NEXT: LBB16_9: ## %cond.load10
; SSE2-NEXT: movlps {{.*#+}} xmm9 = mem[0,1],xmm9[2,3]
; SSE2-NEXT: testb $32, %al
; SSE2-NEXT: je LBB16_12
; SSE2-NEXT: LBB16_11: ## %cond.load13
; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm0[0]
; SSE2-NEXT: testb $64, %al
; SSE2-NEXT: je LBB16_14
; SSE2-NEXT: LBB16_13: ## %cond.load16
; SSE2-NEXT: movlps {{.*#+}} xmm8 = mem[0,1],xmm8[2,3]
; SSE2-NEXT: testb $-128, %al
; SSE2-NEXT: jne LBB16_15
; SSE2-NEXT: jmp LBB16_16
;
; SSE42-LABEL: load_v8i64_v8i64:
; SSE42: ## %bb.0:
; SSE42-NEXT: movdqa %xmm7, %xmm8
; SSE42-NEXT: pxor %xmm7, %xmm7
; SSE42-NEXT: pcmpeqq %xmm7, %xmm3
; SSE42-NEXT: pcmpeqq %xmm7, %xmm2
; SSE42-NEXT: packssdw %xmm3, %xmm2
; SSE42-NEXT: pcmpeqq %xmm7, %xmm1
; SSE42-NEXT: pcmpeqq %xmm7, %xmm0
; SSE42-NEXT: packssdw %xmm1, %xmm0
; SSE42-NEXT: packssdw %xmm2, %xmm0
; SSE42-NEXT: packsswb %xmm0, %xmm0
; SSE42-NEXT: pmovmskb %xmm0, %eax
; SSE42-NEXT: testb $1, %al
; SSE42-NEXT: jne LBB16_1
; SSE42-NEXT: ## %bb.2: ## %else
; SSE42-NEXT: testb $2, %al
; SSE42-NEXT: jne LBB16_3
; SSE42-NEXT: LBB16_4: ## %else2
; SSE42-NEXT: testb $4, %al
; SSE42-NEXT: jne LBB16_5
; SSE42-NEXT: LBB16_6: ## %else5
; SSE42-NEXT: testb $8, %al
; SSE42-NEXT: jne LBB16_7
; SSE42-NEXT: LBB16_8: ## %else8
; SSE42-NEXT: testb $16, %al
; SSE42-NEXT: jne LBB16_9
; SSE42-NEXT: LBB16_10: ## %else11
; SSE42-NEXT: testb $32, %al
; SSE42-NEXT: jne LBB16_11
; SSE42-NEXT: LBB16_12: ## %else14
; SSE42-NEXT: testb $64, %al
; SSE42-NEXT: jne LBB16_13
; SSE42-NEXT: LBB16_14: ## %else17
; SSE42-NEXT: testb $-128, %al
; SSE42-NEXT: je LBB16_16
; SSE42-NEXT: LBB16_15: ## %cond.load19
; SSE42-NEXT: pinsrq $1, 56(%rdi), %xmm8
; SSE42-NEXT: LBB16_16: ## %else20
; SSE42-NEXT: movdqa %xmm4, %xmm0
; SSE42-NEXT: movdqa %xmm5, %xmm1
; SSE42-NEXT: movdqa %xmm6, %xmm2
; SSE42-NEXT: movdqa %xmm8, %xmm3
; SSE42-NEXT: retq
; SSE42-NEXT: LBB16_1: ## %cond.load
; SSE42-NEXT: pinsrq $0, (%rdi), %xmm4
; SSE42-NEXT: testb $2, %al
; SSE42-NEXT: je LBB16_4
; SSE42-NEXT: LBB16_3: ## %cond.load1
; SSE42-NEXT: pinsrq $1, 8(%rdi), %xmm4
; SSE42-NEXT: testb $4, %al
; SSE42-NEXT: je LBB16_6
; SSE42-NEXT: LBB16_5: ## %cond.load4
; SSE42-NEXT: pinsrq $0, 16(%rdi), %xmm5
; SSE42-NEXT: testb $8, %al
; SSE42-NEXT: je LBB16_8
; SSE42-NEXT: LBB16_7: ## %cond.load7
; SSE42-NEXT: pinsrq $1, 24(%rdi), %xmm5
; SSE42-NEXT: testb $16, %al
; SSE42-NEXT: je LBB16_10
; SSE42-NEXT: LBB16_9: ## %cond.load10
; SSE42-NEXT: pinsrq $0, 32(%rdi), %xmm6
; SSE42-NEXT: testb $32, %al
; SSE42-NEXT: je LBB16_12
; SSE42-NEXT: LBB16_11: ## %cond.load13
; SSE42-NEXT: pinsrq $1, 40(%rdi), %xmm6
; SSE42-NEXT: testb $64, %al
; SSE42-NEXT: je LBB16_14
; SSE42-NEXT: LBB16_13: ## %cond.load16
; SSE42-NEXT: pinsrq $0, 48(%rdi), %xmm8
; SSE42-NEXT: testb $-128, %al
; SSE42-NEXT: jne LBB16_15
; SSE42-NEXT: jmp LBB16_16
;
; AVX1-LABEL: load_v8i64_v8i64:
; AVX1: ## %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
; AVX1-NEXT: vpcmpeqq %xmm5, %xmm4, %xmm4
; AVX1-NEXT: vpcmpeqq %xmm5, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT: vpcmpeqq %xmm5, %xmm4, %xmm4
; AVX1-NEXT: vpcmpeqq %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
; AVX1-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm4
; AVX1-NEXT: vblendvpd %ymm0, %ymm4, %ymm2, %ymm0
; AVX1-NEXT: vmaskmovpd 32(%rdi), %ymm1, %ymm2
; AVX1-NEXT: vblendvpd %ymm1, %ymm2, %ymm3, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_v8i64_v8i64:
; AVX2: ## %bb.0:
; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX2-NEXT: vpcmpeqq %ymm4, %ymm1, %ymm1
; AVX2-NEXT: vpcmpeqq %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vpmaskmovq (%rdi), %ymm0, %ymm4
; AVX2-NEXT: vblendvpd %ymm0, %ymm4, %ymm2, %ymm0
; AVX2-NEXT: vpmaskmovq 32(%rdi), %ymm1, %ymm2
; AVX2-NEXT: vblendvpd %ymm1, %ymm2, %ymm3, %ymm1
; AVX2-NEXT: retq
;
; AVX512-LABEL: load_v8i64_v8i64:
; AVX512: ## %bb.0:
; AVX512-NEXT: vptestnmq %zmm0, %zmm0, %k1
; AVX512-NEXT: vpblendmq (%rdi), %zmm1, %zmm0 {%k1}
; AVX512-NEXT: retq
%mask = icmp eq <8 x i64> %trigger, zeroinitializer
%res = call <8 x i64> @llvm.masked.load.v8i64.p0v8i64(<8 x i64>* %addr, i32 4, <8 x i1> %mask, <8 x i64> %dst)
ret <8 x i64> %res
}
;
; vXi32
;
define <2 x i32> @load_v2i32_v2i32(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %dst) {
; SSE2-LABEL: load_v2i32_v2i32:
; SSE2: ## %bb.0:
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: pcmpeqd %xmm0, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,1,1]
; SSE2-NEXT: movmskpd %xmm0, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: jne LBB17_1
; SSE2-NEXT: ## %bb.2: ## %else
; SSE2-NEXT: testb $2, %al
; SSE2-NEXT: jne LBB17_3
; SSE2-NEXT: LBB17_4: ## %else2
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
; SSE2-NEXT: LBB17_1: ## %cond.load
; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE2-NEXT: testb $2, %al
; SSE2-NEXT: je LBB17_4
; SSE2-NEXT: LBB17_3: ## %cond.load1
; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE42-LABEL: load_v2i32_v2i32:
; SSE42: ## %bb.0:
; SSE42-NEXT: pxor %xmm2, %xmm2
; SSE42-NEXT: pcmpeqd %xmm0, %xmm2
; SSE42-NEXT: pmovsxdq %xmm2, %xmm0
; SSE42-NEXT: movmskpd %xmm0, %eax
; SSE42-NEXT: testb $1, %al
; SSE42-NEXT: jne LBB17_1
; SSE42-NEXT: ## %bb.2: ## %else
; SSE42-NEXT: testb $2, %al
; SSE42-NEXT: jne LBB17_3
; SSE42-NEXT: LBB17_4: ## %else2
; SSE42-NEXT: movdqa %xmm1, %xmm0
; SSE42-NEXT: retq
; SSE42-NEXT: LBB17_1: ## %cond.load
; SSE42-NEXT: pinsrd $0, (%rdi), %xmm1
; SSE42-NEXT: testb $2, %al
; SSE42-NEXT: je LBB17_4
; SSE42-NEXT: LBB17_3: ## %cond.load1
; SSE42-NEXT: pinsrd $1, 4(%rdi), %xmm1
; SSE42-NEXT: movdqa %xmm1, %xmm0
; SSE42-NEXT: retq
;
; AVX1-LABEL: load_v2i32_v2i32:
; AVX1: ## %bb.0:
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX1-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2
; AVX1-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_v2i32_v2i32:
; AVX2: ## %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX2-NEXT: vpmaskmovd (%rdi), %xmm0, %xmm2
; AVX2-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: load_v2i32_v2i32:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: kshiftlw $14, %k0, %k0
; AVX512F-NEXT: kshiftrw $14, %k0, %k1
; AVX512F-NEXT: vpblendmd (%rdi), %zmm1, %zmm0 {%k1}
; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VLDQ-LABEL: load_v2i32_v2i32:
; AVX512VLDQ: ## %bb.0:
; AVX512VLDQ-NEXT: vptestnmd %xmm0, %xmm0, %k0
; AVX512VLDQ-NEXT: kshiftlb $6, %k0, %k0
; AVX512VLDQ-NEXT: kshiftrb $6, %k0, %k1
; AVX512VLDQ-NEXT: vpblendmd (%rdi), %xmm1, %xmm0 {%k1}
; AVX512VLDQ-NEXT: retq
;
; AVX512VLBW-LABEL: load_v2i32_v2i32:
; AVX512VLBW: ## %bb.0:
; AVX512VLBW-NEXT: vptestnmd %xmm0, %xmm0, %k0
; AVX512VLBW-NEXT: kshiftlw $14, %k0, %k0
; AVX512VLBW-NEXT: kshiftrw $14, %k0, %k1
; AVX512VLBW-NEXT: vpblendmd (%rdi), %xmm1, %xmm0 {%k1}
; AVX512VLBW-NEXT: retq
%mask = icmp eq <2 x i32> %trigger, zeroinitializer
%res = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst)
ret <2 x i32> %res
}
define <4 x i32> @load_v4i32_v4i32(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %dst) {
; SSE2-LABEL: load_v4i32_v4i32:
; SSE2: ## %bb.0:
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: pcmpeqd %xmm0, %xmm2
; SSE2-NEXT: movmskps %xmm2, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: jne LBB18_1
; SSE2-NEXT: ## %bb.2: ## %else
; SSE2-NEXT: testb $2, %al
; SSE2-NEXT: jne LBB18_3
; SSE2-NEXT: LBB18_4: ## %else2
; SSE2-NEXT: testb $4, %al
; SSE2-NEXT: jne LBB18_5
; SSE2-NEXT: LBB18_6: ## %else5
; SSE2-NEXT: testb $8, %al
; SSE2-NEXT: jne LBB18_7
; SSE2-NEXT: LBB18_8: ## %else8
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
; SSE2-NEXT: LBB18_1: ## %cond.load
; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE2-NEXT: testb $2, %al
; SSE2-NEXT: je LBB18_4
; SSE2-NEXT: LBB18_3: ## %cond.load1
; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: testb $4, %al
; SSE2-NEXT: je LBB18_6
; SSE2-NEXT: LBB18_5: ## %cond.load4
; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2]
; SSE2-NEXT: testb $8, %al
; SSE2-NEXT: je LBB18_8
; SSE2-NEXT: LBB18_7: ## %cond.load7
; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[2,0]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE42-LABEL: load_v4i32_v4i32:
; SSE42: ## %bb.0:
; SSE42-NEXT: pxor %xmm2, %xmm2
; SSE42-NEXT: pcmpeqd %xmm0, %xmm2
; SSE42-NEXT: movmskps %xmm2, %eax
; SSE42-NEXT: testb $1, %al
; SSE42-NEXT: jne LBB18_1
; SSE42-NEXT: ## %bb.2: ## %else
; SSE42-NEXT: testb $2, %al
; SSE42-NEXT: jne LBB18_3
; SSE42-NEXT: LBB18_4: ## %else2
; SSE42-NEXT: testb $4, %al
; SSE42-NEXT: jne LBB18_5
; SSE42-NEXT: LBB18_6: ## %else5
; SSE42-NEXT: testb $8, %al
; SSE42-NEXT: jne LBB18_7
; SSE42-NEXT: LBB18_8: ## %else8
; SSE42-NEXT: movdqa %xmm1, %xmm0
; SSE42-NEXT: retq
; SSE42-NEXT: LBB18_1: ## %cond.load
; SSE42-NEXT: pinsrd $0, (%rdi), %xmm1
; SSE42-NEXT: testb $2, %al
; SSE42-NEXT: je LBB18_4
; SSE42-NEXT: LBB18_3: ## %cond.load1
; SSE42-NEXT: pinsrd $1, 4(%rdi), %xmm1
; SSE42-NEXT: testb $4, %al
; SSE42-NEXT: je LBB18_6
; SSE42-NEXT: LBB18_5: ## %cond.load4
; SSE42-NEXT: pinsrd $2, 8(%rdi), %xmm1
; SSE42-NEXT: testb $8, %al
; SSE42-NEXT: je LBB18_8
; SSE42-NEXT: LBB18_7: ## %cond.load7
; SSE42-NEXT: pinsrd $3, 12(%rdi), %xmm1
; SSE42-NEXT: movdqa %xmm1, %xmm0
; SSE42-NEXT: retq
;
; AVX1-LABEL: load_v4i32_v4i32:
; AVX1: ## %bb.0:
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2
; AVX1-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_v4i32_v4i32:
; AVX2: ## %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpmaskmovd (%rdi), %xmm0, %xmm2
; AVX2-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: load_v4i32_v4i32:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: kshiftlw $12, %k0, %k0
; AVX512F-NEXT: kshiftrw $12, %k0, %k1
; AVX512F-NEXT: vpblendmd (%rdi), %zmm1, %zmm0 {%k1}
; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: load_v4i32_v4i32:
; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: vptestnmd %xmm0, %xmm0, %k1
; AVX512VL-NEXT: vpblendmd (%rdi), %xmm1, %xmm0 {%k1}
; AVX512VL-NEXT: retq
%mask = icmp eq <4 x i32> %trigger, zeroinitializer
%res = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst)
ret <4 x i32> %res
}
define <8 x i32> @load_v8i32_v8i1(<8 x i1> %mask, <8 x i32>* %addr, <8 x i32> %dst) {
; SSE2-LABEL: load_v8i32_v8i1:
; SSE2: ## %bb.0:
; SSE2-NEXT: psllw $15, %xmm0
; SSE2-NEXT: packsswb %xmm0, %xmm0
; SSE2-NEXT: pmovmskb %xmm0, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: jne LBB19_1
; SSE2-NEXT: ## %bb.2: ## %else
; SSE2-NEXT: testb $2, %al
; SSE2-NEXT: jne LBB19_3
; SSE2-NEXT: LBB19_4: ## %else2
; SSE2-NEXT: testb $4, %al
; SSE2-NEXT: jne LBB19_5
; SSE2-NEXT: LBB19_6: ## %else5
; SSE2-NEXT: testb $8, %al
; SSE2-NEXT: jne LBB19_7
; SSE2-NEXT: LBB19_8: ## %else8
; SSE2-NEXT: testb $16, %al
; SSE2-NEXT: jne LBB19_9
; SSE2-NEXT: LBB19_10: ## %else11
; SSE2-NEXT: testb $32, %al
; SSE2-NEXT: jne LBB19_11
; SSE2-NEXT: LBB19_12: ## %else14
; SSE2-NEXT: testb $64, %al
; SSE2-NEXT: jne LBB19_13
; SSE2-NEXT: LBB19_14: ## %else17
; SSE2-NEXT: testb $-128, %al
; SSE2-NEXT: je LBB19_16
; SSE2-NEXT: LBB19_15: ## %cond.load19
; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm2[2,0]
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0]
; SSE2-NEXT: LBB19_16: ## %else20
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: movaps %xmm2, %xmm1
; SSE2-NEXT: retq
; SSE2-NEXT: LBB19_1: ## %cond.load
; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE2-NEXT: testb $2, %al
; SSE2-NEXT: je LBB19_4
; SSE2-NEXT: LBB19_3: ## %cond.load1
; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: testb $4, %al
; SSE2-NEXT: je LBB19_6
; SSE2-NEXT: LBB19_5: ## %cond.load4
; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2]
; SSE2-NEXT: testb $8, %al
; SSE2-NEXT: je LBB19_8
; SSE2-NEXT: LBB19_7: ## %cond.load7
; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[2,0]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
; SSE2-NEXT: testb $16, %al
; SSE2-NEXT: je LBB19_10
; SSE2-NEXT: LBB19_9: ## %cond.load10
; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
; SSE2-NEXT: testb $32, %al
; SSE2-NEXT: je LBB19_12
; SSE2-NEXT: LBB19_11: ## %cond.load13
; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm2[0,0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,3]
; SSE2-NEXT: movaps %xmm0, %xmm2
; SSE2-NEXT: testb $64, %al
; SSE2-NEXT: je LBB19_14
; SSE2-NEXT: LBB19_13: ## %cond.load16
; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm2[3,0]
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[0,2]
; SSE2-NEXT: testb $-128, %al
; SSE2-NEXT: jne LBB19_15
; SSE2-NEXT: jmp LBB19_16
;
; SSE42-LABEL: load_v8i32_v8i1:
; SSE42: ## %bb.0:
; SSE42-NEXT: psllw $15, %xmm0
; SSE42-NEXT: packsswb %xmm0, %xmm0
; SSE42-NEXT: pmovmskb %xmm0, %eax
; SSE42-NEXT: testb $1, %al
; SSE42-NEXT: jne LBB19_1
; SSE42-NEXT: ## %bb.2: ## %else
; SSE42-NEXT: testb $2, %al
; SSE42-NEXT: jne LBB19_3
; SSE42-NEXT: LBB19_4: ## %else2
; SSE42-NEXT: testb $4, %al
; SSE42-NEXT: jne LBB19_5
; SSE42-NEXT: LBB19_6: ## %else5
; SSE42-NEXT: testb $8, %al
; SSE42-NEXT: jne LBB19_7
; SSE42-NEXT: LBB19_8: ## %else8
; SSE42-NEXT: testb $16, %al
; SSE42-NEXT: jne LBB19_9
; SSE42-NEXT: LBB19_10: ## %else11
; SSE42-NEXT: testb $32, %al
; SSE42-NEXT: jne LBB19_11
; SSE42-NEXT: LBB19_12: ## %else14
; SSE42-NEXT: testb $64, %al
; SSE42-NEXT: jne LBB19_13
; SSE42-NEXT: LBB19_14: ## %else17
; SSE42-NEXT: testb $-128, %al
; SSE42-NEXT: je LBB19_16
; SSE42-NEXT: LBB19_15: ## %cond.load19
; SSE42-NEXT: pinsrd $3, 28(%rdi), %xmm2
; SSE42-NEXT: LBB19_16: ## %else20
; SSE42-NEXT: movdqa %xmm1, %xmm0
; SSE42-NEXT: movdqa %xmm2, %xmm1
; SSE42-NEXT: retq
; SSE42-NEXT: LBB19_1: ## %cond.load
; SSE42-NEXT: pinsrd $0, (%rdi), %xmm1
; SSE42-NEXT: testb $2, %al
; SSE42-NEXT: je LBB19_4
; SSE42-NEXT: LBB19_3: ## %cond.load1
; SSE42-NEXT: pinsrd $1, 4(%rdi), %xmm1
; SSE42-NEXT: testb $4, %al
; SSE42-NEXT: je LBB19_6
; SSE42-NEXT: LBB19_5: ## %cond.load4
; SSE42-NEXT: pinsrd $2, 8(%rdi), %xmm1
; SSE42-NEXT: testb $8, %al
; SSE42-NEXT: je LBB19_8
; SSE42-NEXT: LBB19_7: ## %cond.load7
; SSE42-NEXT: pinsrd $3, 12(%rdi), %xmm1
; SSE42-NEXT: testb $16, %al
; SSE42-NEXT: je LBB19_10
; SSE42-NEXT: LBB19_9: ## %cond.load10
; SSE42-NEXT: pinsrd $0, 16(%rdi), %xmm2
; SSE42-NEXT: testb $32, %al
; SSE42-NEXT: je LBB19_12
; SSE42-NEXT: LBB19_11: ## %cond.load13
; SSE42-NEXT: pinsrd $1, 20(%rdi), %xmm2
; SSE42-NEXT: testb $64, %al
; SSE42-NEXT: je LBB19_14
; SSE42-NEXT: LBB19_13: ## %cond.load16
; SSE42-NEXT: pinsrd $2, 24(%rdi), %xmm2
; SSE42-NEXT: testb $-128, %al
; SSE42-NEXT: jne LBB19_15
; SSE42-NEXT: jmp LBB19_16
;
; AVX1-LABEL: load_v8i32_v8i1:
; AVX1: ## %bb.0:
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX1-NEXT: vpslld $31, %xmm2, %xmm2
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpslld $31, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT: vmaskmovps (%rdi), %ymm0, %ymm2
; AVX1-NEXT: vblendvps %ymm0, %ymm2, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_v8i32_v8i1:
; AVX2: ## %bb.0:
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: vpslld $31, %ymm0, %ymm0
; AVX2-NEXT: vpmaskmovd (%rdi), %ymm0, %ymm2
; AVX2-NEXT: vblendvps %ymm0, %ymm2, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: load_v8i32_v8i1:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: ## kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0
; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
; AVX512F-NEXT: vpblendmd (%rdi), %zmm1, %zmm0 {%k1}
; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT: retq
;
; AVX512VLDQ-LABEL: load_v8i32_v8i1:
; AVX512VLDQ: ## %bb.0:
; AVX512VLDQ-NEXT: vpmovsxwd %xmm0, %ymm0
; AVX512VLDQ-NEXT: vpslld $31, %ymm0, %ymm0
; AVX512VLDQ-NEXT: vpmovd2m %ymm0, %k1
; AVX512VLDQ-NEXT: vpblendmd (%rdi), %ymm1, %ymm0 {%k1}
; AVX512VLDQ-NEXT: retq
;
; AVX512VLBW-LABEL: load_v8i32_v8i1:
; AVX512VLBW: ## %bb.0:
; AVX512VLBW-NEXT: vpsllw $15, %xmm0, %xmm0
; AVX512VLBW-NEXT: vpmovw2m %xmm0, %k1
; AVX512VLBW-NEXT: vpblendmd (%rdi), %ymm1, %ymm0 {%k1}
; AVX512VLBW-NEXT: retq
%res = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* %addr, i32 4, <8 x i1> %mask, <8 x i32> %dst)
ret <8 x i32> %res
}
define <8 x i32> @load_v8i32_v8i1_zero(<8 x i1> %mask, <8 x i32>* %addr) {
; SSE2-LABEL: load_v8i32_v8i1_zero:
; SSE2: ## %bb.0:
; SSE2-NEXT: psllw $15, %xmm0
; SSE2-NEXT: packsswb %xmm0, %xmm0
; SSE2-NEXT: pmovmskb %xmm0, %eax
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: xorps %xmm1, %xmm1
; SSE2-NEXT: jne LBB20_1
; SSE2-NEXT: ## %bb.2: ## %else
; SSE2-NEXT: testb $2, %al
; SSE2-NEXT: jne LBB20_3
; SSE2-NEXT: LBB20_4: ## %else2
; SSE2-NEXT: testb $4, %al
; SSE2-NEXT: jne LBB20_5
; SSE2-NEXT: LBB20_6: ## %else5
; SSE2-NEXT: testb $8, %al
; SSE2-NEXT: jne LBB20_7
; SSE2-NEXT: LBB20_8: ## %else8
; SSE2-NEXT: testb $16, %al
; SSE2-NEXT: jne LBB20_9
; SSE2-NEXT: LBB20_10: ## %else11
; SSE2-NEXT: testb $32, %al
; SSE2-NEXT: jne LBB20_11
; SSE2-NEXT: LBB20_12: ## %else14
; SSE2-NEXT: testb $64, %al
; SSE2-NEXT: jne LBB20_13
; SSE2-NEXT: LBB20_14: ## %else17
; SSE2-NEXT: testb $-128, %al
; SSE2-NEXT: jne LBB20_15
; SSE2-NEXT: LBB20_16: ## %else20
; SSE2-NEXT: retq
; SSE2-NEXT: LBB20_1: ## %cond.load
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: xorps %xmm1, %xmm1
; SSE2-NEXT: testb $2, %al
; SSE2-NEXT: je LBB20_4
; SSE2-NEXT: LBB20_3: ## %cond.load1
; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[0,0]
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[2,3]
; SSE2-NEXT: movaps %xmm2, %xmm0
; SSE2-NEXT: testb $4, %al
; SSE2-NEXT: je LBB20_6
; SSE2-NEXT: LBB20_5: ## %cond.load4
; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[3,0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0,2]
; SSE2-NEXT: testb $8, %al
; SSE2-NEXT: je LBB20_8
; SSE2-NEXT: LBB20_7: ## %cond.load7
; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[2,0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0]
; SSE2-NEXT: testb $16, %al
; SSE2-NEXT: je LBB20_10
; SSE2-NEXT: LBB20_9: ## %cond.load10
; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
; SSE2-NEXT: testb $32, %al
; SSE2-NEXT: je LBB20_12
; SSE2-NEXT: LBB20_11: ## %cond.load13
; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[0,0]
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,3]
; SSE2-NEXT: movaps %xmm2, %xmm1
; SSE2-NEXT: testb $64, %al
; SSE2-NEXT: je LBB20_14
; SSE2-NEXT: LBB20_13: ## %cond.load16
; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[3,0]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2]
; SSE2-NEXT: testb $-128, %al
; SSE2-NEXT: je LBB20_16
; SSE2-NEXT: LBB20_15: ## %cond.load19
; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[2,0]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,0]
; SSE2-NEXT: retq
;
; SSE42-LABEL: load_v8i32_v8i1_zero:
; SSE42: ## %bb.0:
; SSE42-NEXT: psllw $15, %xmm0
; SSE42-NEXT: packsswb %xmm0, %xmm0
; SSE42-NEXT: pmovmskb %xmm0, %eax
; SSE42-NEXT: pxor %xmm0, %xmm0
; SSE42-NEXT: testb $1, %al
; SSE42-NEXT: pxor %xmm1, %xmm1
; SSE42-NEXT: jne LBB20_1
; SSE42-NEXT: ## %bb.2: ## %else
; SSE42-NEXT: testb $2, %al
; SSE42-NEXT: jne LBB20_3
; SSE42-NEXT: LBB20_4: ## %else2
; SSE42-NEXT: testb $4, %al
; SSE42-NEXT: jne LBB20_5
; SSE42-NEXT: LBB20_6: ## %else5
; SSE42-NEXT: testb $8, %al
; SSE42-NEXT: jne LBB20_7
; SSE42-NEXT: LBB20_8: ## %else8
; SSE42-NEXT: testb $16, %al
; SSE42-NEXT: jne LBB20_9
; SSE42-NEXT: LBB20_10: ## %else11
; SSE42-NEXT: testb $32, %al
; SSE42-NEXT: jne LBB20_11
; SSE42-NEXT: LBB20_12: ## %else14
; SSE42-NEXT: testb $64, %al
; SSE42-NEXT: jne LBB20_13
; SSE42-NEXT: LBB20_14: ## %else17
; SSE42-NEXT: testb $-128, %al
; SSE42-NEXT: jne LBB20_15
; SSE42-NEXT: LBB20_16: ## %else20
; SSE42-NEXT: retq
; SSE42-NEXT: LBB20_1: ## %cond.load
; SSE42-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE42-NEXT: pxor %xmm1, %xmm1
; SSE42-NEXT: testb $2, %al
; SSE42-NEXT: je LBB20_4
; SSE42-NEXT: LBB20_3: ## %cond.load1
; SSE42-NEXT: pinsrd $1, 4(%rdi), %xmm0
; SSE42-NEXT: testb $4, %al
; SSE42-NEXT: je LBB20_6
; SSE42-NEXT: LBB20_5: ## %cond.load4
; SSE42-NEXT: pinsrd $2, 8(%rdi), %xmm0
; SSE42-NEXT: testb $8, %al
; SSE42-NEXT: je LBB20_8
; SSE42-NEXT: LBB20_7: ## %cond.load7
; SSE42-NEXT: pinsrd $3, 12(%rdi), %xmm0
; SSE42-NEXT: testb $16, %al
; SSE42-NEXT: je LBB20_10
; SSE42-NEXT: LBB20_9: ## %cond.load10
; SSE42-NEXT: pinsrd $0, 16(%rdi), %xmm1
; SSE42-NEXT: testb $32, %al
; SSE42-NEXT: je LBB20_12
; SSE42-NEXT: LBB20_11: ## %cond.load13
; SSE42-NEXT: pinsrd $1, 20(%rdi), %xmm1
; SSE42-NEXT: testb $64, %al
; SSE42-NEXT: je LBB20_14
; SSE42-NEXT: LBB20_13: ## %cond.load16
; SSE42-NEXT: pinsrd $2, 24(%rdi), %xmm1
; SSE42-NEXT: testb $-128, %al
; SSE42-NEXT: je LBB20_16
; SSE42-NEXT: LBB20_15: ## %cond.load19
; SSE42-NEXT: pinsrd $3, 28(%rdi), %xmm1
; SSE42-NEXT: retq
;
; AVX1-LABEL: load_v8i32_v8i1_zero:
; AVX1: ## %bb.0:
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX1-NEXT: vpslld $31, %xmm1, %xmm1
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpslld $31, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vmaskmovps (%rdi), %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_v8i32_v8i1_zero:
; AVX2: ## %bb.0:
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: vpslld $31, %ymm0, %ymm0
; AVX2-NEXT: vpmaskmovd (%rdi), %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: load_v8i32_v8i1_zero:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0
; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
; AVX512F-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} {z}
; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT: retq
;
; AVX512VLDQ-LABEL: load_v8i32_v8i1_zero:
; AVX512VLDQ: ## %bb.0:
; AVX512VLDQ-NEXT: vpmovsxwd %xmm0, %ymm0
; AVX512VLDQ-NEXT: vpslld $31, %ymm0, %ymm0
; AVX512VLDQ-NEXT: vpmovd2m %ymm0, %k1
; AVX512VLDQ-NEXT: vmovdqu32 (%rdi), %ymm0 {%k1} {z}
; AVX512VLDQ-NEXT: retq
;
; AVX512VLBW-LABEL: load_v8i32_v8i1_zero:
; AVX512VLBW: ## %bb.0:
; AVX512VLBW-NEXT: vpsllw $15, %xmm0, %xmm0
; AVX512VLBW-NEXT: vpmovw2m %xmm0, %k1
; AVX512VLBW-NEXT: vmovdqu32 (%rdi), %ymm0 {%k1} {z}
; AVX512VLBW-NEXT: retq
%res = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* %addr, i32 4, <8 x i1> %mask, <8 x i32> zeroinitializer)
ret <8 x i32> %res
}
;
; vXi16
;
define <8 x i16> @load_v8i16_v8i16(<8 x i16> %trigger, <8 x i16>* %addr, <8 x i16> %dst) {
; SSE-LABEL: load_v8i16_v8i16:
; SSE: ## %bb.0:
; SSE-NEXT: packsswb %xmm0, %xmm0
; SSE-NEXT: pmovmskb %xmm0, %eax
; SSE-NEXT: testb $1, %al
; SSE-NEXT: jne LBB21_1
; SSE-NEXT: ## %bb.2: ## %else
; SSE-NEXT: testb $2, %al
; SSE-NEXT: jne LBB21_3
; SSE-NEXT: LBB21_4: ## %else2
; SSE-NEXT: testb $4, %al
; SSE-NEXT: jne LBB21_5
; SSE-NEXT: LBB21_6: ## %else5
; SSE-NEXT: testb $8, %al
; SSE-NEXT: jne LBB21_7
; SSE-NEXT: LBB21_8: ## %else8
; SSE-NEXT: testb $16, %al
; SSE-NEXT: jne LBB21_9
; SSE-NEXT: LBB21_10: ## %else11
; SSE-NEXT: testb $32, %al
; SSE-NEXT: jne LBB21_11
; SSE-NEXT: LBB21_12: ## %else14
; SSE-NEXT: testb $64, %al
; SSE-NEXT: jne LBB21_13
; SSE-NEXT: LBB21_14: ## %else17
; SSE-NEXT: testb $-128, %al
; SSE-NEXT: jne LBB21_15
; SSE-NEXT: LBB21_16: ## %else20
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
; SSE-NEXT: LBB21_1: ## %cond.load
; SSE-NEXT: pinsrw $0, (%rdi), %xmm1
; SSE-NEXT: testb $2, %al
; SSE-NEXT: je LBB21_4
; SSE-NEXT: LBB21_3: ## %cond.load1
; SSE-NEXT: pinsrw $1, 2(%rdi), %xmm1
; SSE-NEXT: testb $4, %al
; SSE-NEXT: je LBB21_6
; SSE-NEXT: LBB21_5: ## %cond.load4
; SSE-NEXT: pinsrw $2, 4(%rdi), %xmm1
; SSE-NEXT: testb $8, %al
; SSE-NEXT: je LBB21_8
; SSE-NEXT: LBB21_7: ## %cond.load7
; SSE-NEXT: pinsrw $3, 6(%rdi), %xmm1
; SSE-NEXT: testb $16, %al
; SSE-NEXT: je LBB21_10
; SSE-NEXT: LBB21_9: ## %cond.load10
; SSE-NEXT: pinsrw $4, 8(%rdi), %xmm1
; SSE-NEXT: testb $32, %al
; SSE-NEXT: je LBB21_12
; SSE-NEXT: LBB21_11: ## %cond.load13
; SSE-NEXT: pinsrw $5, 10(%rdi), %xmm1
; SSE-NEXT: testb $64, %al
; SSE-NEXT: je LBB21_14
; SSE-NEXT: LBB21_13: ## %cond.load16
; SSE-NEXT: pinsrw $6, 12(%rdi), %xmm1
; SSE-NEXT: testb $-128, %al
; SSE-NEXT: je LBB21_16
; SSE-NEXT: LBB21_15: ## %cond.load19
; SSE-NEXT: pinsrw $7, 14(%rdi), %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1OR2-LABEL: load_v8i16_v8i16:
; AVX1OR2: ## %bb.0:
; AVX1OR2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
; AVX1OR2-NEXT: vpmovmskb %xmm0, %eax
; AVX1OR2-NEXT: testb $1, %al
; AVX1OR2-NEXT: jne LBB21_1
; AVX1OR2-NEXT: ## %bb.2: ## %else
; AVX1OR2-NEXT: testb $2, %al
; AVX1OR2-NEXT: jne LBB21_3
; AVX1OR2-NEXT: LBB21_4: ## %else2
; AVX1OR2-NEXT: testb $4, %al
; AVX1OR2-NEXT: jne LBB21_5
; AVX1OR2-NEXT: LBB21_6: ## %else5
; AVX1OR2-NEXT: testb $8, %al
; AVX1OR2-NEXT: jne LBB21_7
; AVX1OR2-NEXT: LBB21_8: ## %else8
; AVX1OR2-NEXT: testb $16, %al
; AVX1OR2-NEXT: jne LBB21_9
; AVX1OR2-NEXT: LBB21_10: ## %else11
; AVX1OR2-NEXT: testb $32, %al
; AVX1OR2-NEXT: jne LBB21_11
; AVX1OR2-NEXT: LBB21_12: ## %else14
; AVX1OR2-NEXT: testb $64, %al
; AVX1OR2-NEXT: jne LBB21_13
; AVX1OR2-NEXT: LBB21_14: ## %else17
; AVX1OR2-NEXT: testb $-128, %al
; AVX1OR2-NEXT: jne LBB21_15
; AVX1OR2-NEXT: LBB21_16: ## %else20
; AVX1OR2-NEXT: vmovdqa %xmm1, %xmm0
; AVX1OR2-NEXT: retq
; AVX1OR2-NEXT: LBB21_1: ## %cond.load
; AVX1OR2-NEXT: vpinsrw $0, (%rdi), %xmm1, %xmm1
; AVX1OR2-NEXT: testb $2, %al
; AVX1OR2-NEXT: je LBB21_4
; AVX1OR2-NEXT: LBB21_3: ## %cond.load1
; AVX1OR2-NEXT: vpinsrw $1, 2(%rdi), %xmm1, %xmm1
; AVX1OR2-NEXT: testb $4, %al
; AVX1OR2-NEXT: je LBB21_6
; AVX1OR2-NEXT: LBB21_5: ## %cond.load4
; AVX1OR2-NEXT: vpinsrw $2, 4(%rdi), %xmm1, %xmm1
; AVX1OR2-NEXT: testb $8, %al
; AVX1OR2-NEXT: je LBB21_8
; AVX1OR2-NEXT: LBB21_7: ## %cond.load7
; AVX1OR2-NEXT: vpinsrw $3, 6(%rdi), %xmm1, %xmm1
; AVX1OR2-NEXT: testb $16, %al
; AVX1OR2-NEXT: je LBB21_10
; AVX1OR2-NEXT: LBB21_9: ## %cond.load10
; AVX1OR2-NEXT: vpinsrw $4, 8(%rdi), %xmm1, %xmm1
; AVX1OR2-NEXT: testb $32, %al
; AVX1OR2-NEXT: je LBB21_12
; AVX1OR2-NEXT: LBB21_11: ## %cond.load13
; AVX1OR2-NEXT: vpinsrw $5, 10(%rdi), %xmm1, %xmm1
; AVX1OR2-NEXT: testb $64, %al
; AVX1OR2-NEXT: je LBB21_14
; AVX1OR2-NEXT: LBB21_13: ## %cond.load16
; AVX1OR2-NEXT: vpinsrw $6, 12(%rdi), %xmm1, %xmm1
; AVX1OR2-NEXT: testb $-128, %al
; AVX1OR2-NEXT: je LBB21_16
; AVX1OR2-NEXT: LBB21_15: ## %cond.load19
; AVX1OR2-NEXT: vpinsrw $7, 14(%rdi), %xmm1, %xmm1
; AVX1OR2-NEXT: vmovdqa %xmm1, %xmm0
; AVX1OR2-NEXT: retq
;
; AVX512F-LABEL: load_v8i16_v8i16:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vpcmpgtw %xmm0, %xmm2, %xmm0
; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0
; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: testb $1, %al
; AVX512F-NEXT: jne LBB21_1
; AVX512F-NEXT: ## %bb.2: ## %else
; AVX512F-NEXT: testb $2, %al
; AVX512F-NEXT: jne LBB21_3
; AVX512F-NEXT: LBB21_4: ## %else2
; AVX512F-NEXT: testb $4, %al
; AVX512F-NEXT: jne LBB21_5
; AVX512F-NEXT: LBB21_6: ## %else5
; AVX512F-NEXT: testb $8, %al
; AVX512F-NEXT: jne LBB21_7
; AVX512F-NEXT: LBB21_8: ## %else8
; AVX512F-NEXT: testb $16, %al
; AVX512F-NEXT: jne LBB21_9
; AVX512F-NEXT: LBB21_10: ## %else11
; AVX512F-NEXT: testb $32, %al
; AVX512F-NEXT: jne LBB21_11
; AVX512F-NEXT: LBB21_12: ## %else14
; AVX512F-NEXT: testb $64, %al
; AVX512F-NEXT: jne LBB21_13
; AVX512F-NEXT: LBB21_14: ## %else17
; AVX512F-NEXT: testb $-128, %al
; AVX512F-NEXT: jne LBB21_15
; AVX512F-NEXT: LBB21_16: ## %else20
; AVX512F-NEXT: vmovdqa %xmm1, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
; AVX512F-NEXT: LBB21_1: ## %cond.load
; AVX512F-NEXT: vpinsrw $0, (%rdi), %xmm1, %xmm1
; AVX512F-NEXT: testb $2, %al
; AVX512F-NEXT: je LBB21_4
; AVX512F-NEXT: LBB21_3: ## %cond.load1
; AVX512F-NEXT: vpinsrw $1, 2(%rdi), %xmm1, %xmm1
; AVX512F-NEXT: testb $4, %al
; AVX512F-NEXT: je LBB21_6
; AVX512F-NEXT: LBB21_5: ## %cond.load4
; AVX512F-NEXT: vpinsrw $2, 4(%rdi), %xmm1, %xmm1
; AVX512F-NEXT: testb $8, %al
; AVX512F-NEXT: je LBB21_8
; AVX512F-NEXT: LBB21_7: ## %cond.load7
; AVX512F-NEXT: vpinsrw $3, 6(%rdi), %xmm1, %xmm1
; AVX512F-NEXT: testb $16, %al
; AVX512F-NEXT: je LBB21_10
; AVX512F-NEXT: LBB21_9: ## %cond.load10
; AVX512F-NEXT: vpinsrw $4, 8(%rdi), %xmm1, %xmm1
; AVX512F-NEXT: testb $32, %al
; AVX512F-NEXT: je LBB21_12
; AVX512F-NEXT: LBB21_11: ## %cond.load13
; AVX512F-NEXT: vpinsrw $5, 10(%rdi), %xmm1, %xmm1
; AVX512F-NEXT: testb $64, %al
; AVX512F-NEXT: je LBB21_14
; AVX512F-NEXT: LBB21_13: ## %cond.load16
; AVX512F-NEXT: vpinsrw $6, 12(%rdi), %xmm1, %xmm1
; AVX512F-NEXT: testb $-128, %al
; AVX512F-NEXT: je LBB21_16
; AVX512F-NEXT: LBB21_15: ## %cond.load19
; AVX512F-NEXT: vpinsrw $7, 14(%rdi), %xmm1, %xmm1
; AVX512F-NEXT: vmovdqa %xmm1, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VLDQ-LABEL: load_v8i16_v8i16:
; AVX512VLDQ: ## %bb.0:
; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLDQ-NEXT: vpcmpgtw %xmm0, %xmm2, %xmm0
; AVX512VLDQ-NEXT: vpmovsxwd %xmm0, %ymm0
; AVX512VLDQ-NEXT: vpmovd2m %ymm0, %k0
; AVX512VLDQ-NEXT: kmovw %k0, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: jne LBB21_1
; AVX512VLDQ-NEXT: ## %bb.2: ## %else
; AVX512VLDQ-NEXT: testb $2, %al
; AVX512VLDQ-NEXT: jne LBB21_3
; AVX512VLDQ-NEXT: LBB21_4: ## %else2
; AVX512VLDQ-NEXT: testb $4, %al
; AVX512VLDQ-NEXT: jne LBB21_5
; AVX512VLDQ-NEXT: LBB21_6: ## %else5
; AVX512VLDQ-NEXT: testb $8, %al
; AVX512VLDQ-NEXT: jne LBB21_7
; AVX512VLDQ-NEXT: LBB21_8: ## %else8
; AVX512VLDQ-NEXT: testb $16, %al
; AVX512VLDQ-NEXT: jne LBB21_9
; AVX512VLDQ-NEXT: LBB21_10: ## %else11
; AVX512VLDQ-NEXT: testb $32, %al
; AVX512VLDQ-NEXT: jne LBB21_11
; AVX512VLDQ-NEXT: LBB21_12: ## %else14
; AVX512VLDQ-NEXT: testb $64, %al
; AVX512VLDQ-NEXT: jne LBB21_13
; AVX512VLDQ-NEXT: LBB21_14: ## %else17
; AVX512VLDQ-NEXT: testb $-128, %al
; AVX512VLDQ-NEXT: jne LBB21_15
; AVX512VLDQ-NEXT: LBB21_16: ## %else20
; AVX512VLDQ-NEXT: vmovdqa %xmm1, %xmm0
; AVX512VLDQ-NEXT: vzeroupper
; AVX512VLDQ-NEXT: retq
; AVX512VLDQ-NEXT: LBB21_1: ## %cond.load
; AVX512VLDQ-NEXT: vpinsrw $0, (%rdi), %xmm1, %xmm1
; AVX512VLDQ-NEXT: testb $2, %al
; AVX512VLDQ-NEXT: je LBB21_4
; AVX512VLDQ-NEXT: LBB21_3: ## %cond.load1
; AVX512VLDQ-NEXT: vpinsrw $1, 2(%rdi), %xmm1, %xmm1
; AVX512VLDQ-NEXT: testb $4, %al
; AVX512VLDQ-NEXT: je LBB21_6
; AVX512VLDQ-NEXT: LBB21_5: ## %cond.load4
; AVX512VLDQ-NEXT: vpinsrw $2, 4(%rdi), %xmm1, %xmm1
; AVX512VLDQ-NEXT: testb $8, %al
; AVX512VLDQ-NEXT: je LBB21_8
; AVX512VLDQ-NEXT: LBB21_7: ## %cond.load7
; AVX512VLDQ-NEXT: vpinsrw $3, 6(%rdi), %xmm1, %xmm1
; AVX512VLDQ-NEXT: testb $16, %al
; AVX512VLDQ-NEXT: je LBB21_10
; AVX512VLDQ-NEXT: LBB21_9: ## %cond.load10
; AVX512VLDQ-NEXT: vpinsrw $4, 8(%rdi), %xmm1, %xmm1
; AVX512VLDQ-NEXT: testb $32, %al
; AVX512VLDQ-NEXT: je LBB21_12
; AVX512VLDQ-NEXT: LBB21_11: ## %cond.load13
; AVX512VLDQ-NEXT: vpinsrw $5, 10(%rdi), %xmm1, %xmm1
; AVX512VLDQ-NEXT: testb $64, %al
; AVX512VLDQ-NEXT: je LBB21_14
; AVX512VLDQ-NEXT: LBB21_13: ## %cond.load16
; AVX512VLDQ-NEXT: vpinsrw $6, 12(%rdi), %xmm1, %xmm1
; AVX512VLDQ-NEXT: testb $-128, %al
; AVX512VLDQ-NEXT: je LBB21_16
; AVX512VLDQ-NEXT: LBB21_15: ## %cond.load19
; AVX512VLDQ-NEXT: vpinsrw $7, 14(%rdi), %xmm1, %xmm1
; AVX512VLDQ-NEXT: vmovdqa %xmm1, %xmm0
; AVX512VLDQ-NEXT: vzeroupper
; AVX512VLDQ-NEXT: retq
;
; AVX512VLBW-LABEL: load_v8i16_v8i16:
; AVX512VLBW: ## %bb.0:
; AVX512VLBW-NEXT: vpmovw2m %xmm0, %k1
; AVX512VLBW-NEXT: vpblendmw (%rdi), %xmm1, %xmm0 {%k1}
; AVX512VLBW-NEXT: retq
%mask = icmp slt <8 x i16> %trigger, zeroinitializer
%res = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %addr, i32 4, <8 x i1> %mask, <8 x i16> %dst)
ret <8 x i16> %res
}
define <16 x i16> @load_v16i16_v16i16(<16 x i16> %trigger, <16 x i16>* %addr, <16 x i16> %dst) {
; SSE-LABEL: load_v16i16_v16i16:
; SSE: ## %bb.0:
; SSE-NEXT: packsswb %xmm1, %xmm0
; SSE-NEXT: pmovmskb %xmm0, %eax
; SSE-NEXT: testb $1, %al
; SSE-NEXT: jne LBB22_1
; SSE-NEXT: ## %bb.2: ## %else
; SSE-NEXT: testb $2, %al
; SSE-NEXT: jne LBB22_3
; SSE-NEXT: LBB22_4: ## %else2
; SSE-NEXT: testb $4, %al
; SSE-NEXT: jne LBB22_5
; SSE-NEXT: LBB22_6: ## %else5
; SSE-NEXT: testb $8, %al
; SSE-NEXT: jne LBB22_7
; SSE-NEXT: LBB22_8: ## %else8
; SSE-NEXT: testb $16, %al
; SSE-NEXT: jne LBB22_9
; SSE-NEXT: LBB22_10: ## %else11
; SSE-NEXT: testb $32, %al
; SSE-NEXT: jne LBB22_11
; SSE-NEXT: LBB22_12: ## %else14
; SSE-NEXT: testb $64, %al
; SSE-NEXT: jne LBB22_13
; SSE-NEXT: LBB22_14: ## %else17
; SSE-NEXT: testb $-128, %al
; SSE-NEXT: jne LBB22_15
; SSE-NEXT: LBB22_16: ## %else20
; SSE-NEXT: testl $256, %eax ## imm = 0x100
; SSE-NEXT: jne LBB22_17
; SSE-NEXT: LBB22_18: ## %else23
; SSE-NEXT: testl $512, %eax ## imm = 0x200
; SSE-NEXT: jne LBB22_19
; SSE-NEXT: LBB22_20: ## %else26
; SSE-NEXT: testl $1024, %eax ## imm = 0x400
; SSE-NEXT: jne LBB22_21
; SSE-NEXT: LBB22_22: ## %else29
; SSE-NEXT: testl $2048, %eax ## imm = 0x800
; SSE-NEXT: jne LBB22_23
; SSE-NEXT: LBB22_24: ## %else32
; SSE-NEXT: testl $4096, %eax ## imm = 0x1000
; SSE-NEXT: jne LBB22_25
; SSE-NEXT: LBB22_26: ## %else35
; SSE-NEXT: testl $8192, %eax ## imm = 0x2000
; SSE-NEXT: jne LBB22_27
; SSE-NEXT: LBB22_28: ## %else38
; SSE-NEXT: testl $16384, %eax ## imm = 0x4000
; SSE-NEXT: jne LBB22_29
; SSE-NEXT: LBB22_30: ## %else41
; SSE-NEXT: testl $32768, %eax ## imm = 0x8000
; SSE-NEXT: je LBB22_32
; SSE-NEXT: LBB22_31: ## %cond.load43
; SSE-NEXT: pinsrw $7, 30(%rdi), %xmm3
; SSE-NEXT: LBB22_32: ## %else44
; SSE-NEXT: movdqa %xmm2, %xmm0
; SSE-NEXT: movdqa %xmm3, %xmm1
; SSE-NEXT: retq
; SSE-NEXT: LBB22_1: ## %cond.load
; SSE-NEXT: pinsrw $0, (%rdi), %xmm2
; SSE-NEXT: testb $2, %al
; SSE-NEXT: je LBB22_4
; SSE-NEXT: LBB22_3: ## %cond.load1
; SSE-NEXT: pinsrw $1, 2(%rdi), %xmm2
; SSE-NEXT: testb $4, %al
; SSE-NEXT: je LBB22_6
; SSE-NEXT: LBB22_5: ## %cond.load4
; SSE-NEXT: pinsrw $2, 4(%rdi), %xmm2
; SSE-NEXT: testb $8, %al
; SSE-NEXT: je LBB22_8
; SSE-NEXT: LBB22_7: ## %cond.load7
; SSE-NEXT: pinsrw $3, 6(%rdi), %xmm2
; SSE-NEXT: testb $16, %al
; SSE-NEXT: je LBB22_10
; SSE-NEXT: LBB22_9: ## %cond.load10
; SSE-NEXT: pinsrw $4, 8(%rdi), %xmm2
; SSE-NEXT: testb $32, %al
; SSE-NEXT: je LBB22_12
; SSE-NEXT: LBB22_11: ## %cond.load13
; SSE-NEXT: pinsrw $5, 10(%rdi), %xmm2
; SSE-NEXT: testb $64, %al
; SSE-NEXT: je LBB22_14
; SSE-NEXT: LBB22_13: ## %cond.load16
; SSE-NEXT: pinsrw $6, 12(%rdi), %xmm2
; SSE-NEXT: testb $-128, %al
; SSE-NEXT: je LBB22_16
; SSE-NEXT: LBB22_15: ## %cond.load19
; SSE-NEXT: pinsrw $7, 14(%rdi), %xmm2
; SSE-NEXT: testl $256, %eax ## imm = 0x100
; SSE-NEXT: je LBB22_18
; SSE-NEXT: LBB22_17: ## %cond.load22
; SSE-NEXT: pinsrw $0, 16(%rdi), %xmm3
; SSE-NEXT: testl $512, %eax ## imm = 0x200
; SSE-NEXT: je LBB22_20
; SSE-NEXT: LBB22_19: ## %cond.load25
; SSE-NEXT: pinsrw $1, 18(%rdi), %xmm3
; SSE-NEXT: testl $1024, %eax ## imm = 0x400
; SSE-NEXT: je LBB22_22
; SSE-NEXT: LBB22_21: ## %cond.load28
; SSE-NEXT: pinsrw $2, 20(%rdi), %xmm3
; SSE-NEXT: testl $2048, %eax ## imm = 0x800
; SSE-NEXT: je LBB22_24
; SSE-NEXT: LBB22_23: ## %cond.load31
; SSE-NEXT: pinsrw $3, 22(%rdi), %xmm3
; SSE-NEXT: testl $4096, %eax ## imm = 0x1000
; SSE-NEXT: je LBB22_26
; SSE-NEXT: LBB22_25: ## %cond.load34
; SSE-NEXT: pinsrw $4, 24(%rdi), %xmm3
; SSE-NEXT: testl $8192, %eax ## imm = 0x2000
; SSE-NEXT: je LBB22_28
; SSE-NEXT: LBB22_27: ## %cond.load37
; SSE-NEXT: pinsrw $5, 26(%rdi), %xmm3
; SSE-NEXT: testl $16384, %eax ## imm = 0x4000
; SSE-NEXT: je LBB22_30
; SSE-NEXT: LBB22_29: ## %cond.load40
; SSE-NEXT: pinsrw $6, 28(%rdi), %xmm3
; SSE-NEXT: testl $32768, %eax ## imm = 0x8000
; SSE-NEXT: jne LBB22_31
; SSE-NEXT: jmp LBB22_32
;
; AVX1-LABEL: load_v16i16_v16i16:
; AVX1: ## %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpacksswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpmovmskb %xmm0, %eax
; AVX1-NEXT: testb $1, %al
; AVX1-NEXT: jne LBB22_1
; AVX1-NEXT: ## %bb.2: ## %else
; AVX1-NEXT: testb $2, %al
; AVX1-NEXT: jne LBB22_3
; AVX1-NEXT: LBB22_4: ## %else2
; AVX1-NEXT: testb $4, %al
; AVX1-NEXT: jne LBB22_5
; AVX1-NEXT: LBB22_6: ## %else5
; AVX1-NEXT: testb $8, %al
; AVX1-NEXT: jne LBB22_7
; AVX1-NEXT: LBB22_8: ## %else8
; AVX1-NEXT: testb $16, %al
; AVX1-NEXT: jne LBB22_9
; AVX1-NEXT: LBB22_10: ## %else11
; AVX1-NEXT: testb $32, %al
; AVX1-NEXT: jne LBB22_11
; AVX1-NEXT: LBB22_12: ## %else14
; AVX1-NEXT: testb $64, %al
; AVX1-NEXT: jne LBB22_13
; AVX1-NEXT: LBB22_14: ## %else17
; AVX1-NEXT: testb $-128, %al
; AVX1-NEXT: jne LBB22_15
; AVX1-NEXT: LBB22_16: ## %else20
; AVX1-NEXT: testl $256, %eax ## imm = 0x100
; AVX1-NEXT: jne LBB22_17
; AVX1-NEXT: LBB22_18: ## %else23
; AVX1-NEXT: testl $512, %eax ## imm = 0x200
; AVX1-NEXT: jne LBB22_19
; AVX1-NEXT: LBB22_20: ## %else26
; AVX1-NEXT: testl $1024, %eax ## imm = 0x400
; AVX1-NEXT: jne LBB22_21
; AVX1-NEXT: LBB22_22: ## %else29
; AVX1-NEXT: testl $2048, %eax ## imm = 0x800
; AVX1-NEXT: jne LBB22_23
; AVX1-NEXT: LBB22_24: ## %else32
; AVX1-NEXT: testl $4096, %eax ## imm = 0x1000
; AVX1-NEXT: jne LBB22_25
; AVX1-NEXT: LBB22_26: ## %else35
; AVX1-NEXT: testl $8192, %eax ## imm = 0x2000
; AVX1-NEXT: jne LBB22_27
; AVX1-NEXT: LBB22_28: ## %else38
; AVX1-NEXT: testl $16384, %eax ## imm = 0x4000
; AVX1-NEXT: jne LBB22_29
; AVX1-NEXT: LBB22_30: ## %else41
; AVX1-NEXT: testl $32768, %eax ## imm = 0x8000
; AVX1-NEXT: jne LBB22_31
; AVX1-NEXT: LBB22_32: ## %else44
; AVX1-NEXT: vmovaps %ymm1, %ymm0
; AVX1-NEXT: retq
; AVX1-NEXT: LBB22_1: ## %cond.load
; AVX1-NEXT: vpinsrw $0, (%rdi), %xmm1, %xmm0
; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX1-NEXT: testb $2, %al
; AVX1-NEXT: je LBB22_4
; AVX1-NEXT: LBB22_3: ## %cond.load1
; AVX1-NEXT: vpinsrw $1, 2(%rdi), %xmm1, %xmm0
; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX1-NEXT: testb $4, %al
; AVX1-NEXT: je LBB22_6
; AVX1-NEXT: LBB22_5: ## %cond.load4
; AVX1-NEXT: vpinsrw $2, 4(%rdi), %xmm1, %xmm0
; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX1-NEXT: testb $8, %al
; AVX1-NEXT: je LBB22_8
; AVX1-NEXT: LBB22_7: ## %cond.load7
; AVX1-NEXT: vpinsrw $3, 6(%rdi), %xmm1, %xmm0
; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX1-NEXT: testb $16, %al
; AVX1-NEXT: je LBB22_10
; AVX1-NEXT: LBB22_9: ## %cond.load10
; AVX1-NEXT: vpinsrw $4, 8(%rdi), %xmm1, %xmm0
; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX1-NEXT: testb $32, %al
; AVX1-NEXT: je LBB22_12
; AVX1-NEXT: LBB22_11: ## %cond.load13
; AVX1-NEXT: vpinsrw $5, 10(%rdi), %xmm1, %xmm0
; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX1-NEXT: testb $64, %al
; AVX1-NEXT: je LBB22_14
; AVX1-NEXT: LBB22_13: ## %cond.load16
; AVX1-NEXT: vpinsrw $6, 12(%rdi), %xmm1, %xmm0
; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX1-NEXT: testb $-128, %al
; AVX1-NEXT: je LBB22_16
; AVX1-NEXT: LBB22_15: ## %cond.load19
; AVX1-NEXT: vpinsrw $7, 14(%rdi), %xmm1, %xmm0
; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX1-NEXT: testl $256, %eax ## imm = 0x100
; AVX1-NEXT: je LBB22_18
; AVX1-NEXT: LBB22_17: ## %cond.load22
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
; AVX1-NEXT: vpinsrw $0, 16(%rdi), %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
; AVX1-NEXT: testl $512, %eax ## imm = 0x200
; AVX1-NEXT: je LBB22_20
; AVX1-NEXT: LBB22_19: ## %cond.load25
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
; AVX1-NEXT: vpinsrw $1, 18(%rdi), %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
; AVX1-NEXT: testl $1024, %eax ## imm = 0x400
; AVX1-NEXT: je LBB22_22
; AVX1-NEXT: LBB22_21: ## %cond.load28
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
; AVX1-NEXT: vpinsrw $2, 20(%rdi), %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
; AVX1-NEXT: testl $2048, %eax ## imm = 0x800
; AVX1-NEXT: je LBB22_24
; AVX1-NEXT: LBB22_23: ## %cond.load31
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
; AVX1-NEXT: vpinsrw $3, 22(%rdi), %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
; AVX1-NEXT: testl $4096, %eax ## imm = 0x1000
; AVX1-NEXT: je LBB22_26
; AVX1-NEXT: LBB22_25: ## %cond.load34
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
; AVX1-NEXT: vpinsrw $4, 24(%rdi), %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
; AVX1-NEXT: testl $8192, %eax ## imm = 0x2000
; AVX1-NEXT: je LBB22_28
; AVX1-NEXT: LBB22_27: ## %cond.load37
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
; AVX1-NEXT: vpinsrw $5, 26(%rdi), %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
; AVX1-NEXT: testl $16384, %eax ## imm = 0x4000
; AVX1-NEXT: je LBB22_30
; AVX1-NEXT: LBB22_29: ## %cond.load40
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
; AVX1-NEXT: vpinsrw $6, 28(%rdi), %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
; AVX1-NEXT: testl $32768, %eax ## imm = 0x8000
; AVX1-NEXT: je LBB22_32
; AVX1-NEXT: LBB22_31: ## %cond.load43
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
; AVX1-NEXT: vpinsrw $7, 30(%rdi), %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
; AVX1-NEXT: vmovaps %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_v16i16_v16i16:
; AVX2: ## %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX2-NEXT: vpacksswb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpmovmskb %xmm0, %eax
; AVX2-NEXT: testb $1, %al
; AVX2-NEXT: jne LBB22_1
; AVX2-NEXT: ## %bb.2: ## %else
; AVX2-NEXT: testb $2, %al
; AVX2-NEXT: jne LBB22_3
; AVX2-NEXT: LBB22_4: ## %else2
; AVX2-NEXT: testb $4, %al
; AVX2-NEXT: jne LBB22_5
; AVX2-NEXT: LBB22_6: ## %else5
; AVX2-NEXT: testb $8, %al
; AVX2-NEXT: jne LBB22_7
; AVX2-NEXT: LBB22_8: ## %else8
; AVX2-NEXT: testb $16, %al
; AVX2-NEXT: jne LBB22_9
; AVX2-NEXT: LBB22_10: ## %else11
; AVX2-NEXT: testb $32, %al
; AVX2-NEXT: jne LBB22_11
; AVX2-NEXT: LBB22_12: ## %else14
; AVX2-NEXT: testb $64, %al
; AVX2-NEXT: jne LBB22_13
; AVX2-NEXT: LBB22_14: ## %else17
; AVX2-NEXT: testb $-128, %al
; AVX2-NEXT: jne LBB22_15
; AVX2-NEXT: LBB22_16: ## %else20
; AVX2-NEXT: testl $256, %eax ## imm = 0x100
; AVX2-NEXT: jne LBB22_17
; AVX2-NEXT: LBB22_18: ## %else23
; AVX2-NEXT: testl $512, %eax ## imm = 0x200
; AVX2-NEXT: jne LBB22_19
; AVX2-NEXT: LBB22_20: ## %else26
; AVX2-NEXT: testl $1024, %eax ## imm = 0x400
; AVX2-NEXT: jne LBB22_21
; AVX2-NEXT: LBB22_22: ## %else29
; AVX2-NEXT: testl $2048, %eax ## imm = 0x800
; AVX2-NEXT: jne LBB22_23
; AVX2-NEXT: LBB22_24: ## %else32
; AVX2-NEXT: testl $4096, %eax ## imm = 0x1000
; AVX2-NEXT: jne LBB22_25
; AVX2-NEXT: LBB22_26: ## %else35
; AVX2-NEXT: testl $8192, %eax ## imm = 0x2000
; AVX2-NEXT: jne LBB22_27
; AVX2-NEXT: LBB22_28: ## %else38
; AVX2-NEXT: testl $16384, %eax ## imm = 0x4000
; AVX2-NEXT: jne LBB22_29
; AVX2-NEXT: LBB22_30: ## %else41
; AVX2-NEXT: testl $32768, %eax ## imm = 0x8000
; AVX2-NEXT: jne LBB22_31
; AVX2-NEXT: LBB22_32: ## %else44
; AVX2-NEXT: vmovdqa %ymm1, %ymm0
; AVX2-NEXT: retq
; AVX2-NEXT: LBB22_1: ## %cond.load
; AVX2-NEXT: vpinsrw $0, (%rdi), %xmm1, %xmm0
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: testb $2, %al
; AVX2-NEXT: je LBB22_4
; AVX2-NEXT: LBB22_3: ## %cond.load1
; AVX2-NEXT: vpinsrw $1, 2(%rdi), %xmm1, %xmm0
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: testb $4, %al
; AVX2-NEXT: je LBB22_6
; AVX2-NEXT: LBB22_5: ## %cond.load4
; AVX2-NEXT: vpinsrw $2, 4(%rdi), %xmm1, %xmm0
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: testb $8, %al
; AVX2-NEXT: je LBB22_8
; AVX2-NEXT: LBB22_7: ## %cond.load7
; AVX2-NEXT: vpinsrw $3, 6(%rdi), %xmm1, %xmm0
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: testb $16, %al
; AVX2-NEXT: je LBB22_10
; AVX2-NEXT: LBB22_9: ## %cond.load10
; AVX2-NEXT: vpinsrw $4, 8(%rdi), %xmm1, %xmm0
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: testb $32, %al
; AVX2-NEXT: je LBB22_12
; AVX2-NEXT: LBB22_11: ## %cond.load13
; AVX2-NEXT: vpinsrw $5, 10(%rdi), %xmm1, %xmm0
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: testb $64, %al
; AVX2-NEXT: je LBB22_14
; AVX2-NEXT: LBB22_13: ## %cond.load16
; AVX2-NEXT: vpinsrw $6, 12(%rdi), %xmm1, %xmm0
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: testb $-128, %al
; AVX2-NEXT: je LBB22_16
; AVX2-NEXT: LBB22_15: ## %cond.load19
; AVX2-NEXT: vpinsrw $7, 14(%rdi), %xmm1, %xmm0
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: testl $256, %eax ## imm = 0x100
; AVX2-NEXT: je LBB22_18
; AVX2-NEXT: LBB22_17: ## %cond.load22
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
; AVX2-NEXT: vpinsrw $0, 16(%rdi), %xmm0, %xmm0
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
; AVX2-NEXT: testl $512, %eax ## imm = 0x200
; AVX2-NEXT: je LBB22_20
; AVX2-NEXT: LBB22_19: ## %cond.load25
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
; AVX2-NEXT: vpinsrw $1, 18(%rdi), %xmm0, %xmm0
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
; AVX2-NEXT: testl $1024, %eax ## imm = 0x400
; AVX2-NEXT: je LBB22_22
; AVX2-NEXT: LBB22_21: ## %cond.load28
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
; AVX2-NEXT: vpinsrw $2, 20(%rdi), %xmm0, %xmm0
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
; AVX2-NEXT: testl $2048, %eax ## imm = 0x800
; AVX2-NEXT: je LBB22_24
; AVX2-NEXT: LBB22_23: ## %cond.load31
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
; AVX2-NEXT: vpinsrw $3, 22(%rdi), %xmm0, %xmm0
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
; AVX2-NEXT: testl $4096, %eax ## imm = 0x1000
; AVX2-NEXT: je LBB22_26
; AVX2-NEXT: LBB22_25: ## %cond.load34
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
; AVX2-NEXT: vpinsrw $4, 24(%rdi), %xmm0, %xmm0
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
; AVX2-NEXT: testl $8192, %eax ## imm = 0x2000
; AVX2-NEXT: je LBB22_28
; AVX2-NEXT: LBB22_27: ## %cond.load37
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
; AVX2-NEXT: vpinsrw $5, 26(%rdi), %xmm0, %xmm0
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
; AVX2-NEXT: testl $16384, %eax ## imm = 0x4000
; AVX2-NEXT: je LBB22_30
; AVX2-NEXT: LBB22_29: ## %cond.load40
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
; AVX2-NEXT: vpinsrw $6, 28(%rdi), %xmm0, %xmm0
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
; AVX2-NEXT: testl $32768, %eax ## imm = 0x8000
; AVX2-NEXT: je LBB22_32
; AVX2-NEXT: LBB22_31: ## %cond.load43
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
; AVX2-NEXT: vpinsrw $7, 30(%rdi), %xmm0, %xmm0
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
; AVX2-NEXT: vmovdqa %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: load_v16i16_v16i16:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: testb $1, %al
; AVX512F-NEXT: jne LBB22_1
; AVX512F-NEXT: ## %bb.2: ## %else
; AVX512F-NEXT: testb $2, %al
; AVX512F-NEXT: jne LBB22_3
; AVX512F-NEXT: LBB22_4: ## %else2
; AVX512F-NEXT: testb $4, %al
; AVX512F-NEXT: jne LBB22_5
; AVX512F-NEXT: LBB22_6: ## %else5
; AVX512F-NEXT: testb $8, %al
; AVX512F-NEXT: jne LBB22_7
; AVX512F-NEXT: LBB22_8: ## %else8
; AVX512F-NEXT: testb $16, %al
; AVX512F-NEXT: jne LBB22_9
; AVX512F-NEXT: LBB22_10: ## %else11
; AVX512F-NEXT: testb $32, %al
; AVX512F-NEXT: jne LBB22_11
; AVX512F-NEXT: LBB22_12: ## %else14
; AVX512F-NEXT: testb $64, %al
; AVX512F-NEXT: jne LBB22_13
; AVX512F-NEXT: LBB22_14: ## %else17
; AVX512F-NEXT: testb $-128, %al
; AVX512F-NEXT: jne LBB22_15
; AVX512F-NEXT: LBB22_16: ## %else20
; AVX512F-NEXT: testl $256, %eax ## imm = 0x100
; AVX512F-NEXT: jne LBB22_17
; AVX512F-NEXT: LBB22_18: ## %else23
; AVX512F-NEXT: testl $512, %eax ## imm = 0x200
; AVX512F-NEXT: jne LBB22_19
; AVX512F-NEXT: LBB22_20: ## %else26
; AVX512F-NEXT: testl $1024, %eax ## imm = 0x400
; AVX512F-NEXT: jne LBB22_21
; AVX512F-NEXT: LBB22_22: ## %else29
; AVX512F-NEXT: testl $2048, %eax ## imm = 0x800
; AVX512F-NEXT: jne LBB22_23
; AVX512F-NEXT: LBB22_24: ## %else32
; AVX512F-NEXT: testl $4096, %eax ## imm = 0x1000
; AVX512F-NEXT: jne LBB22_25
; AVX512F-NEXT: LBB22_26: ## %else35
; AVX512F-NEXT: testl $8192, %eax ## imm = 0x2000
; AVX512F-NEXT: jne LBB22_27
; AVX512F-NEXT: LBB22_28: ## %else38
; AVX512F-NEXT: testl $16384, %eax ## imm = 0x4000
; AVX512F-NEXT: jne LBB22_29
; AVX512F-NEXT: LBB22_30: ## %else41
; AVX512F-NEXT: testl $32768, %eax ## imm = 0x8000
; AVX512F-NEXT: jne LBB22_31
; AVX512F-NEXT: LBB22_32: ## %else44
; AVX512F-NEXT: vmovdqa %ymm1, %ymm0
; AVX512F-NEXT: retq
; AVX512F-NEXT: LBB22_1: ## %cond.load
; AVX512F-NEXT: vpinsrw $0, (%rdi), %xmm1, %xmm0
; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512F-NEXT: testb $2, %al
; AVX512F-NEXT: je LBB22_4
; AVX512F-NEXT: LBB22_3: ## %cond.load1
; AVX512F-NEXT: vpinsrw $1, 2(%rdi), %xmm1, %xmm0
; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512F-NEXT: testb $4, %al
; AVX512F-NEXT: je LBB22_6
; AVX512F-NEXT: LBB22_5: ## %cond.load4
; AVX512F-NEXT: vpinsrw $2, 4(%rdi), %xmm1, %xmm0
; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512F-NEXT: testb $8, %al
; AVX512F-NEXT: je LBB22_8
; AVX512F-NEXT: LBB22_7: ## %cond.load7
; AVX512F-NEXT: vpinsrw $3, 6(%rdi), %xmm1, %xmm0
; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512F-NEXT: testb $16, %al
; AVX512F-NEXT: je LBB22_10
; AVX512F-NEXT: LBB22_9: ## %cond.load10
; AVX512F-NEXT: vpinsrw $4, 8(%rdi), %xmm1, %xmm0
; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512F-NEXT: testb $32, %al
; AVX512F-NEXT: je LBB22_12
; AVX512F-NEXT: LBB22_11: ## %cond.load13
; AVX512F-NEXT: vpinsrw $5, 10(%rdi), %xmm1, %xmm0
; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512F-NEXT: testb $64, %al
; AVX512F-NEXT: je LBB22_14
; AVX512F-NEXT: LBB22_13: ## %cond.load16
; AVX512F-NEXT: vpinsrw $6, 12(%rdi), %xmm1, %xmm0
; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512F-NEXT: testb $-128, %al
; AVX512F-NEXT: je LBB22_16
; AVX512F-NEXT: LBB22_15: ## %cond.load19
; AVX512F-NEXT: vpinsrw $7, 14(%rdi), %xmm1, %xmm0
; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512F-NEXT: testl $256, %eax ## imm = 0x100
; AVX512F-NEXT: je LBB22_18
; AVX512F-NEXT: LBB22_17: ## %cond.load22
; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
; AVX512F-NEXT: vpinsrw $0, 16(%rdi), %xmm0, %xmm0
; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
; AVX512F-NEXT: testl $512, %eax ## imm = 0x200
; AVX512F-NEXT: je LBB22_20
; AVX512F-NEXT: LBB22_19: ## %cond.load25
; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
; AVX512F-NEXT: vpinsrw $1, 18(%rdi), %xmm0, %xmm0
; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
; AVX512F-NEXT: testl $1024, %eax ## imm = 0x400
; AVX512F-NEXT: je LBB22_22
; AVX512F-NEXT: LBB22_21: ## %cond.load28
; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
; AVX512F-NEXT: vpinsrw $2, 20(%rdi), %xmm0, %xmm0
; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
; AVX512F-NEXT: testl $2048, %eax ## imm = 0x800
; AVX512F-NEXT: je LBB22_24
; AVX512F-NEXT: LBB22_23: ## %cond.load31
; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
; AVX512F-NEXT: vpinsrw $3, 22(%rdi), %xmm0, %xmm0
; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
; AVX512F-NEXT: testl $4096, %eax ## imm = 0x1000
; AVX512F-NEXT: je LBB22_26
; AVX512F-NEXT: LBB22_25: ## %cond.load34
; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
; AVX512F-NEXT: vpinsrw $4, 24(%rdi), %xmm0, %xmm0
; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
; AVX512F-NEXT: testl $8192, %eax ## imm = 0x2000
; AVX512F-NEXT: je LBB22_28
; AVX512F-NEXT: LBB22_27: ## %cond.load37
; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
; AVX512F-NEXT: vpinsrw $5, 26(%rdi), %xmm0, %xmm0
; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
; AVX512F-NEXT: testl $16384, %eax ## imm = 0x4000
; AVX512F-NEXT: je LBB22_30
; AVX512F-NEXT: LBB22_29: ## %cond.load40
; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
; AVX512F-NEXT: vpinsrw $6, 28(%rdi), %xmm0, %xmm0
; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
; AVX512F-NEXT: testl $32768, %eax ## imm = 0x8000
; AVX512F-NEXT: je LBB22_32
; AVX512F-NEXT: LBB22_31: ## %cond.load43
; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
; AVX512F-NEXT: vpinsrw $7, 30(%rdi), %xmm0, %xmm0
; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
; AVX512F-NEXT: vmovdqa %ymm1, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VLDQ-LABEL: load_v16i16_v16i16:
; AVX512VLDQ: ## %bb.0:
; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLDQ-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0
; AVX512VLDQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512VLDQ-NEXT: vpmovd2m %zmm0, %k0
; AVX512VLDQ-NEXT: kmovw %k0, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: jne LBB22_1
; AVX512VLDQ-NEXT: ## %bb.2: ## %else
; AVX512VLDQ-NEXT: testb $2, %al
; AVX512VLDQ-NEXT: jne LBB22_3
; AVX512VLDQ-NEXT: LBB22_4: ## %else2
; AVX512VLDQ-NEXT: testb $4, %al
; AVX512VLDQ-NEXT: jne LBB22_5
; AVX512VLDQ-NEXT: LBB22_6: ## %else5
; AVX512VLDQ-NEXT: testb $8, %al
; AVX512VLDQ-NEXT: jne LBB22_7
; AVX512VLDQ-NEXT: LBB22_8: ## %else8
; AVX512VLDQ-NEXT: testb $16, %al
; AVX512VLDQ-NEXT: jne LBB22_9
; AVX512VLDQ-NEXT: LBB22_10: ## %else11
; AVX512VLDQ-NEXT: testb $32, %al
; AVX512VLDQ-NEXT: jne LBB22_11
; AVX512VLDQ-NEXT: LBB22_12: ## %else14
; AVX512VLDQ-NEXT: testb $64, %al
; AVX512VLDQ-NEXT: jne LBB22_13
; AVX512VLDQ-NEXT: LBB22_14: ## %else17
; AVX512VLDQ-NEXT: testb $-128, %al
; AVX512VLDQ-NEXT: jne LBB22_15
; AVX512VLDQ-NEXT: LBB22_16: ## %else20
; AVX512VLDQ-NEXT: testl $256, %eax ## imm = 0x100
; AVX512VLDQ-NEXT: jne LBB22_17
; AVX512VLDQ-NEXT: LBB22_18: ## %else23
; AVX512VLDQ-NEXT: testl $512, %eax ## imm = 0x200
; AVX512VLDQ-NEXT: jne LBB22_19
; AVX512VLDQ-NEXT: LBB22_20: ## %else26
; AVX512VLDQ-NEXT: testl $1024, %eax ## imm = 0x400
; AVX512VLDQ-NEXT: jne LBB22_21
; AVX512VLDQ-NEXT: LBB22_22: ## %else29
; AVX512VLDQ-NEXT: testl $2048, %eax ## imm = 0x800
; AVX512VLDQ-NEXT: jne LBB22_23
; AVX512VLDQ-NEXT: LBB22_24: ## %else32
; AVX512VLDQ-NEXT: testl $4096, %eax ## imm = 0x1000
; AVX512VLDQ-NEXT: jne LBB22_25
; AVX512VLDQ-NEXT: LBB22_26: ## %else35
; AVX512VLDQ-NEXT: testl $8192, %eax ## imm = 0x2000
; AVX512VLDQ-NEXT: jne LBB22_27
; AVX512VLDQ-NEXT: LBB22_28: ## %else38
; AVX512VLDQ-NEXT: testl $16384, %eax ## imm = 0x4000
; AVX512VLDQ-NEXT: jne LBB22_29
; AVX512VLDQ-NEXT: LBB22_30: ## %else41
; AVX512VLDQ-NEXT: testl $32768, %eax ## imm = 0x8000
; AVX512VLDQ-NEXT: jne LBB22_31
; AVX512VLDQ-NEXT: LBB22_32: ## %else44
; AVX512VLDQ-NEXT: vmovdqa %ymm1, %ymm0
; AVX512VLDQ-NEXT: retq
; AVX512VLDQ-NEXT: LBB22_1: ## %cond.load
; AVX512VLDQ-NEXT: vpinsrw $0, (%rdi), %xmm1, %xmm0
; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512VLDQ-NEXT: testb $2, %al
; AVX512VLDQ-NEXT: je LBB22_4
; AVX512VLDQ-NEXT: LBB22_3: ## %cond.load1
; AVX512VLDQ-NEXT: vpinsrw $1, 2(%rdi), %xmm1, %xmm0
; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512VLDQ-NEXT: testb $4, %al
; AVX512VLDQ-NEXT: je LBB22_6
; AVX512VLDQ-NEXT: LBB22_5: ## %cond.load4
; AVX512VLDQ-NEXT: vpinsrw $2, 4(%rdi), %xmm1, %xmm0
; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512VLDQ-NEXT: testb $8, %al
; AVX512VLDQ-NEXT: je LBB22_8
; AVX512VLDQ-NEXT: LBB22_7: ## %cond.load7
; AVX512VLDQ-NEXT: vpinsrw $3, 6(%rdi), %xmm1, %xmm0
; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512VLDQ-NEXT: testb $16, %al
; AVX512VLDQ-NEXT: je LBB22_10
; AVX512VLDQ-NEXT: LBB22_9: ## %cond.load10
; AVX512VLDQ-NEXT: vpinsrw $4, 8(%rdi), %xmm1, %xmm0
; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512VLDQ-NEXT: testb $32, %al
; AVX512VLDQ-NEXT: je LBB22_12
; AVX512VLDQ-NEXT: LBB22_11: ## %cond.load13
; AVX512VLDQ-NEXT: vpinsrw $5, 10(%rdi), %xmm1, %xmm0
; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512VLDQ-NEXT: testb $64, %al
; AVX512VLDQ-NEXT: je LBB22_14
; AVX512VLDQ-NEXT: LBB22_13: ## %cond.load16
; AVX512VLDQ-NEXT: vpinsrw $6, 12(%rdi), %xmm1, %xmm0
; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512VLDQ-NEXT: testb $-128, %al
; AVX512VLDQ-NEXT: je LBB22_16
; AVX512VLDQ-NEXT: LBB22_15: ## %cond.load19
; AVX512VLDQ-NEXT: vpinsrw $7, 14(%rdi), %xmm1, %xmm0
; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512VLDQ-NEXT: testl $256, %eax ## imm = 0x100
; AVX512VLDQ-NEXT: je LBB22_18
; AVX512VLDQ-NEXT: LBB22_17: ## %cond.load22
; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
; AVX512VLDQ-NEXT: vpinsrw $0, 16(%rdi), %xmm0, %xmm0
; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
; AVX512VLDQ-NEXT: testl $512, %eax ## imm = 0x200
; AVX512VLDQ-NEXT: je LBB22_20
; AVX512VLDQ-NEXT: LBB22_19: ## %cond.load25
; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
; AVX512VLDQ-NEXT: vpinsrw $1, 18(%rdi), %xmm0, %xmm0
; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
; AVX512VLDQ-NEXT: testl $1024, %eax ## imm = 0x400
; AVX512VLDQ-NEXT: je LBB22_22
; AVX512VLDQ-NEXT: LBB22_21: ## %cond.load28
; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
; AVX512VLDQ-NEXT: vpinsrw $2, 20(%rdi), %xmm0, %xmm0
; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
; AVX512VLDQ-NEXT: testl $2048, %eax ## imm = 0x800
; AVX512VLDQ-NEXT: je LBB22_24
; AVX512VLDQ-NEXT: LBB22_23: ## %cond.load31
; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
; AVX512VLDQ-NEXT: vpinsrw $3, 22(%rdi), %xmm0, %xmm0
; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
; AVX512VLDQ-NEXT: testl $4096, %eax ## imm = 0x1000
; AVX512VLDQ-NEXT: je LBB22_26
; AVX512VLDQ-NEXT: LBB22_25: ## %cond.load34
; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
; AVX512VLDQ-NEXT: vpinsrw $4, 24(%rdi), %xmm0, %xmm0
; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
; AVX512VLDQ-NEXT: testl $8192, %eax ## imm = 0x2000
; AVX512VLDQ-NEXT: je LBB22_28
; AVX512VLDQ-NEXT: LBB22_27: ## %cond.load37
; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
; AVX512VLDQ-NEXT: vpinsrw $5, 26(%rdi), %xmm0, %xmm0
; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
; AVX512VLDQ-NEXT: testl $16384, %eax ## imm = 0x4000
; AVX512VLDQ-NEXT: je LBB22_30
; AVX512VLDQ-NEXT: LBB22_29: ## %cond.load40
; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
; AVX512VLDQ-NEXT: vpinsrw $6, 28(%rdi), %xmm0, %xmm0
; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
; AVX512VLDQ-NEXT: testl $32768, %eax ## imm = 0x8000
; AVX512VLDQ-NEXT: je LBB22_32
; AVX512VLDQ-NEXT: LBB22_31: ## %cond.load43
; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
; AVX512VLDQ-NEXT: vpinsrw $7, 30(%rdi), %xmm0, %xmm0
; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
; AVX512VLDQ-NEXT: vmovdqa %ymm1, %ymm0
; AVX512VLDQ-NEXT: retq
;
; AVX512VLBW-LABEL: load_v16i16_v16i16:
; AVX512VLBW: ## %bb.0:
; AVX512VLBW-NEXT: vpmovw2m %ymm0, %k1
; AVX512VLBW-NEXT: vpblendmw (%rdi), %ymm1, %ymm0 {%k1}
; AVX512VLBW-NEXT: retq
%mask = icmp slt <16 x i16> %trigger, zeroinitializer
%res = call <16 x i16> @llvm.masked.load.v16i16.p0v16i16(<16 x i16>* %addr, i32 4, <16 x i1> %mask, <16 x i16> %dst)
ret <16 x i16> %res
}
;
; vXi8
;
define <16 x i8> @load_v16i8_v16i8(<16 x i8> %trigger, <16 x i8>* %addr, <16 x i8> %dst) {
; SSE2-LABEL: load_v16i8_v16i8:
; SSE2: ## %bb.0:
; SSE2-NEXT: pmovmskb %xmm0, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: jne LBB23_1
; SSE2-NEXT: ## %bb.2: ## %else
; SSE2-NEXT: testb $2, %al
; SSE2-NEXT: jne LBB23_3
; SSE2-NEXT: LBB23_4: ## %else2
; SSE2-NEXT: testb $4, %al
; SSE2-NEXT: jne LBB23_5
; SSE2-NEXT: LBB23_6: ## %else5
; SSE2-NEXT: testb $8, %al
; SSE2-NEXT: jne LBB23_7
; SSE2-NEXT: LBB23_8: ## %else8
; SSE2-NEXT: testb $16, %al
; SSE2-NEXT: jne LBB23_9
; SSE2-NEXT: LBB23_10: ## %else11
; SSE2-NEXT: testb $32, %al
; SSE2-NEXT: jne LBB23_11
; SSE2-NEXT: LBB23_12: ## %else14
; SSE2-NEXT: testb $64, %al
; SSE2-NEXT: jne LBB23_13
; SSE2-NEXT: LBB23_14: ## %else17
; SSE2-NEXT: testb $-128, %al
; SSE2-NEXT: jne LBB23_15
; SSE2-NEXT: LBB23_16: ## %else20
; SSE2-NEXT: testl $256, %eax ## imm = 0x100
; SSE2-NEXT: jne LBB23_17
; SSE2-NEXT: LBB23_18: ## %else23
; SSE2-NEXT: testl $512, %eax ## imm = 0x200
; SSE2-NEXT: jne LBB23_19
; SSE2-NEXT: LBB23_20: ## %else26
; SSE2-NEXT: testl $1024, %eax ## imm = 0x400
; SSE2-NEXT: jne LBB23_21
; SSE2-NEXT: LBB23_22: ## %else29
; SSE2-NEXT: testl $2048, %eax ## imm = 0x800
; SSE2-NEXT: jne LBB23_23
; SSE2-NEXT: LBB23_24: ## %else32
; SSE2-NEXT: testl $4096, %eax ## imm = 0x1000
; SSE2-NEXT: jne LBB23_25
; SSE2-NEXT: LBB23_26: ## %else35
; SSE2-NEXT: testl $8192, %eax ## imm = 0x2000
; SSE2-NEXT: jne LBB23_27
; SSE2-NEXT: LBB23_28: ## %else38
; SSE2-NEXT: testl $16384, %eax ## imm = 0x4000
; SSE2-NEXT: jne LBB23_29
; SSE2-NEXT: LBB23_30: ## %else41
; SSE2-NEXT: testl $32768, %eax ## imm = 0x8000
; SSE2-NEXT: jne LBB23_31
; SSE2-NEXT: LBB23_32: ## %else44
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: retq
; SSE2-NEXT: LBB23_1: ## %cond.load
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: movzbl (%rdi), %ecx
; SSE2-NEXT: movd %ecx, %xmm2
; SSE2-NEXT: pandn %xmm2, %xmm0
; SSE2-NEXT: por %xmm0, %xmm1
; SSE2-NEXT: testb $2, %al
; SSE2-NEXT: je LBB23_4
; SSE2-NEXT: LBB23_3: ## %cond.load1
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: movzbl 1(%rdi), %ecx
; SSE2-NEXT: movd %ecx, %xmm2
; SSE2-NEXT: psllw $8, %xmm2
; SSE2-NEXT: pandn %xmm2, %xmm0
; SSE2-NEXT: por %xmm0, %xmm1
; SSE2-NEXT: testb $4, %al
; SSE2-NEXT: je LBB23_6
; SSE2-NEXT: LBB23_5: ## %cond.load4
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: movzbl 2(%rdi), %ecx
; SSE2-NEXT: movd %ecx, %xmm2
; SSE2-NEXT: pslld $16, %xmm2
; SSE2-NEXT: pandn %xmm2, %xmm0
; SSE2-NEXT: por %xmm0, %xmm1
; SSE2-NEXT: testb $8, %al
; SSE2-NEXT: je LBB23_8
; SSE2-NEXT: LBB23_7: ## %cond.load7
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: movzbl 3(%rdi), %ecx
; SSE2-NEXT: movd %ecx, %xmm2
; SSE2-NEXT: pslld $24, %xmm2
; SSE2-NEXT: pandn %xmm2, %xmm0
; SSE2-NEXT: por %xmm0, %xmm1
; SSE2-NEXT: testb $16, %al
; SSE2-NEXT: je LBB23_10
; SSE2-NEXT: LBB23_9: ## %cond.load10
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: movzbl 4(%rdi), %ecx
; SSE2-NEXT: movd %ecx, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
; SSE2-NEXT: pandn %xmm2, %xmm0
; SSE2-NEXT: por %xmm0, %xmm1
; SSE2-NEXT: testb $32, %al
; SSE2-NEXT: je LBB23_12
; SSE2-NEXT: LBB23_11: ## %cond.load13
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: movzbl 5(%rdi), %ecx
; SSE2-NEXT: movd %ecx, %xmm2
; SSE2-NEXT: psllq $40, %xmm2
; SSE2-NEXT: pandn %xmm2, %xmm0
; SSE2-NEXT: por %xmm0, %xmm1
; SSE2-NEXT: testb $64, %al
; SSE2-NEXT: je LBB23_14
; SSE2-NEXT: LBB23_13: ## %cond.load16
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: movzbl 6(%rdi), %ecx
; SSE2-NEXT: movd %ecx, %xmm2
; SSE2-NEXT: psllq $48, %xmm2
; SSE2-NEXT: pandn %xmm2, %xmm0
; SSE2-NEXT: por %xmm0, %xmm1
; SSE2-NEXT: testb $-128, %al
; SSE2-NEXT: je LBB23_16
; SSE2-NEXT: LBB23_15: ## %cond.load19
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: movzbl 7(%rdi), %ecx
; SSE2-NEXT: movd %ecx, %xmm2
; SSE2-NEXT: psllq $56, %xmm2
; SSE2-NEXT: pandn %xmm2, %xmm0
; SSE2-NEXT: por %xmm0, %xmm1
; SSE2-NEXT: testl $256, %eax ## imm = 0x100
; SSE2-NEXT: je LBB23_18
; SSE2-NEXT: LBB23_17: ## %cond.load22
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: movzbl 8(%rdi), %ecx
; SSE2-NEXT: movd %ecx, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
; SSE2-NEXT: pandn %xmm2, %xmm0
; SSE2-NEXT: por %xmm0, %xmm1
; SSE2-NEXT: testl $512, %eax ## imm = 0x200
; SSE2-NEXT: je LBB23_20
; SSE2-NEXT: LBB23_19: ## %cond.load25
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: movzbl 9(%rdi), %ecx
; SSE2-NEXT: movd %ecx, %xmm2
; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6]
; SSE2-NEXT: pandn %xmm2, %xmm0
; SSE2-NEXT: por %xmm0, %xmm1
; SSE2-NEXT: testl $1024, %eax ## imm = 0x400
; SSE2-NEXT: je LBB23_22
; SSE2-NEXT: LBB23_21: ## %cond.load28
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255]
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: movzbl 10(%rdi), %ecx
; SSE2-NEXT: movd %ecx, %xmm2
; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5]
; SSE2-NEXT: pandn %xmm2, %xmm0
; SSE2-NEXT: por %xmm0, %xmm1
; SSE2-NEXT: testl $2048, %eax ## imm = 0x800
; SSE2-NEXT: je LBB23_24
; SSE2-NEXT: LBB23_23: ## %cond.load31
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255]
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: movzbl 11(%rdi), %ecx
; SSE2-NEXT: movd %ecx, %xmm2
; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4]
; SSE2-NEXT: pandn %xmm2, %xmm0
; SSE2-NEXT: por %xmm0, %xmm1
; SSE2-NEXT: testl $4096, %eax ## imm = 0x1000
; SSE2-NEXT: je LBB23_26
; SSE2-NEXT: LBB23_25: ## %cond.load34
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: movzbl 12(%rdi), %ecx
; SSE2-NEXT: movd %ecx, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
; SSE2-NEXT: pandn %xmm2, %xmm0
; SSE2-NEXT: por %xmm0, %xmm1
; SSE2-NEXT: testl $8192, %eax ## imm = 0x2000
; SSE2-NEXT: je LBB23_28
; SSE2-NEXT: LBB23_27: ## %cond.load37
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255]
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: movzbl 13(%rdi), %ecx
; SSE2-NEXT: movd %ecx, %xmm2
; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2]
; SSE2-NEXT: pandn %xmm2, %xmm0
; SSE2-NEXT: por %xmm0, %xmm1
; SSE2-NEXT: testl $16384, %eax ## imm = 0x4000
; SSE2-NEXT: je LBB23_30
; SSE2-NEXT: LBB23_29: ## %cond.load40
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255]
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: movzbl 14(%rdi), %ecx
; SSE2-NEXT: movd %ecx, %xmm2
; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1]
; SSE2-NEXT: pandn %xmm2, %xmm0
; SSE2-NEXT: por %xmm0, %xmm1
; SSE2-NEXT: testl $32768, %eax ## imm = 0x8000
; SSE2-NEXT: je LBB23_32
; SSE2-NEXT: LBB23_31: ## %cond.load43
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
; SSE2-NEXT: movzbl 15(%rdi), %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
; SSE2-NEXT: por %xmm0, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE42-LABEL: load_v16i8_v16i8:
; SSE42: ## %bb.0:
; SSE42-NEXT: pmovmskb %xmm0, %eax
; SSE42-NEXT: testb $1, %al
; SSE42-NEXT: jne LBB23_1
; SSE42-NEXT: ## %bb.2: ## %else
; SSE42-NEXT: testb $2, %al
; SSE42-NEXT: jne LBB23_3
; SSE42-NEXT: LBB23_4: ## %else2
; SSE42-NEXT: testb $4, %al
; SSE42-NEXT: jne LBB23_5
; SSE42-NEXT: LBB23_6: ## %else5
; SSE42-NEXT: testb $8, %al
; SSE42-NEXT: jne LBB23_7
; SSE42-NEXT: LBB23_8: ## %else8
; SSE42-NEXT: testb $16, %al
; SSE42-NEXT: jne LBB23_9
; SSE42-NEXT: LBB23_10: ## %else11
; SSE42-NEXT: testb $32, %al
; SSE42-NEXT: jne LBB23_11
; SSE42-NEXT: LBB23_12: ## %else14
; SSE42-NEXT: testb $64, %al
; SSE42-NEXT: jne LBB23_13
; SSE42-NEXT: LBB23_14: ## %else17
; SSE42-NEXT: testb $-128, %al
; SSE42-NEXT: jne LBB23_15
; SSE42-NEXT: LBB23_16: ## %else20
; SSE42-NEXT: testl $256, %eax ## imm = 0x100
; SSE42-NEXT: jne LBB23_17
; SSE42-NEXT: LBB23_18: ## %else23
; SSE42-NEXT: testl $512, %eax ## imm = 0x200
; SSE42-NEXT: jne LBB23_19
; SSE42-NEXT: LBB23_20: ## %else26
; SSE42-NEXT: testl $1024, %eax ## imm = 0x400
; SSE42-NEXT: jne LBB23_21
; SSE42-NEXT: LBB23_22: ## %else29
; SSE42-NEXT: testl $2048, %eax ## imm = 0x800
; SSE42-NEXT: jne LBB23_23
; SSE42-NEXT: LBB23_24: ## %else32
; SSE42-NEXT: testl $4096, %eax ## imm = 0x1000
; SSE42-NEXT: jne LBB23_25
; SSE42-NEXT: LBB23_26: ## %else35
; SSE42-NEXT: testl $8192, %eax ## imm = 0x2000
; SSE42-NEXT: jne LBB23_27
; SSE42-NEXT: LBB23_28: ## %else38
; SSE42-NEXT: testl $16384, %eax ## imm = 0x4000
; SSE42-NEXT: jne LBB23_29
; SSE42-NEXT: LBB23_30: ## %else41
; SSE42-NEXT: testl $32768, %eax ## imm = 0x8000
; SSE42-NEXT: jne LBB23_31
; SSE42-NEXT: LBB23_32: ## %else44
; SSE42-NEXT: movdqa %xmm1, %xmm0
; SSE42-NEXT: retq
; SSE42-NEXT: LBB23_1: ## %cond.load
; SSE42-NEXT: pinsrb $0, (%rdi), %xmm1
; SSE42-NEXT: testb $2, %al
; SSE42-NEXT: je LBB23_4
; SSE42-NEXT: LBB23_3: ## %cond.load1
; SSE42-NEXT: pinsrb $1, 1(%rdi), %xmm1
; SSE42-NEXT: testb $4, %al
; SSE42-NEXT: je LBB23_6
; SSE42-NEXT: LBB23_5: ## %cond.load4
; SSE42-NEXT: pinsrb $2, 2(%rdi), %xmm1
; SSE42-NEXT: testb $8, %al
; SSE42-NEXT: je LBB23_8
; SSE42-NEXT: LBB23_7: ## %cond.load7
; SSE42-NEXT: pinsrb $3, 3(%rdi), %xmm1
; SSE42-NEXT: testb $16, %al
; SSE42-NEXT: je LBB23_10
; SSE42-NEXT: LBB23_9: ## %cond.load10
; SSE42-NEXT: pinsrb $4, 4(%rdi), %xmm1
; SSE42-NEXT: testb $32, %al
; SSE42-NEXT: je LBB23_12
; SSE42-NEXT: LBB23_11: ## %cond.load13
; SSE42-NEXT: pinsrb $5, 5(%rdi), %xmm1
; SSE42-NEXT: testb $64, %al
; SSE42-NEXT: je LBB23_14
; SSE42-NEXT: LBB23_13: ## %cond.load16
; SSE42-NEXT: pinsrb $6, 6(%rdi), %xmm1
; SSE42-NEXT: testb $-128, %al
; SSE42-NEXT: je LBB23_16
; SSE42-NEXT: LBB23_15: ## %cond.load19
; SSE42-NEXT: pinsrb $7, 7(%rdi), %xmm1
; SSE42-NEXT: testl $256, %eax ## imm = 0x100
; SSE42-NEXT: je LBB23_18
; SSE42-NEXT: LBB23_17: ## %cond.load22
; SSE42-NEXT: pinsrb $8, 8(%rdi), %xmm1
; SSE42-NEXT: testl $512, %eax ## imm = 0x200
; SSE42-NEXT: je LBB23_20
; SSE42-NEXT: LBB23_19: ## %cond.load25
; SSE42-NEXT: pinsrb $9, 9(%rdi), %xmm1
; SSE42-NEXT: testl $1024, %eax ## imm = 0x400
; SSE42-NEXT: je LBB23_22
; SSE42-NEXT: LBB23_21: ## %cond.load28
; SSE42-NEXT: pinsrb $10, 10(%rdi), %xmm1
; SSE42-NEXT: testl $2048, %eax ## imm = 0x800
; SSE42-NEXT: je LBB23_24
; SSE42-NEXT: LBB23_23: ## %cond.load31
; SSE42-NEXT: pinsrb $11, 11(%rdi), %xmm1
; SSE42-NEXT: testl $4096, %eax ## imm = 0x1000
; SSE42-NEXT: je LBB23_26
; SSE42-NEXT: LBB23_25: ## %cond.load34
; SSE42-NEXT: pinsrb $12, 12(%rdi), %xmm1
; SSE42-NEXT: testl $8192, %eax ## imm = 0x2000
; SSE42-NEXT: je LBB23_28
; SSE42-NEXT: LBB23_27: ## %cond.load37
; SSE42-NEXT: pinsrb $13, 13(%rdi), %xmm1
; SSE42-NEXT: testl $16384, %eax ## imm = 0x4000
; SSE42-NEXT: je LBB23_30
; SSE42-NEXT: LBB23_29: ## %cond.load40
; SSE42-NEXT: pinsrb $14, 14(%rdi), %xmm1
; SSE42-NEXT: testl $32768, %eax ## imm = 0x8000
; SSE42-NEXT: je LBB23_32
; SSE42-NEXT: LBB23_31: ## %cond.load43
; SSE42-NEXT: pinsrb $15, 15(%rdi), %xmm1
; SSE42-NEXT: movdqa %xmm1, %xmm0
; SSE42-NEXT: retq
;
; AVX1OR2-LABEL: load_v16i8_v16i8:
; AVX1OR2: ## %bb.0:
; AVX1OR2-NEXT: vpmovmskb %xmm0, %eax
; AVX1OR2-NEXT: testb $1, %al
; AVX1OR2-NEXT: jne LBB23_1
; AVX1OR2-NEXT: ## %bb.2: ## %else
; AVX1OR2-NEXT: testb $2, %al
; AVX1OR2-NEXT: jne LBB23_3
; AVX1OR2-NEXT: LBB23_4: ## %else2
; AVX1OR2-NEXT: testb $4, %al
; AVX1OR2-NEXT: jne LBB23_5
; AVX1OR2-NEXT: LBB23_6: ## %else5
; AVX1OR2-NEXT: testb $8, %al
; AVX1OR2-NEXT: jne LBB23_7
; AVX1OR2-NEXT: LBB23_8: ## %else8
; AVX1OR2-NEXT: testb $16, %al
; AVX1OR2-NEXT: jne LBB23_9
; AVX1OR2-NEXT: LBB23_10: ## %else11
; AVX1OR2-NEXT: testb $32, %al
; AVX1OR2-NEXT: jne LBB23_11
; AVX1OR2-NEXT: LBB23_12: ## %else14
; AVX1OR2-NEXT: testb $64, %al
; AVX1OR2-NEXT: jne LBB23_13
; AVX1OR2-NEXT: LBB23_14: ## %else17
; AVX1OR2-NEXT: testb $-128, %al
; AVX1OR2-NEXT: jne LBB23_15
; AVX1OR2-NEXT: LBB23_16: ## %else20
; AVX1OR2-NEXT: testl $256, %eax ## imm = 0x100
; AVX1OR2-NEXT: jne LBB23_17
; AVX1OR2-NEXT: LBB23_18: ## %else23
; AVX1OR2-NEXT: testl $512, %eax ## imm = 0x200
; AVX1OR2-NEXT: jne LBB23_19
; AVX1OR2-NEXT: LBB23_20: ## %else26
; AVX1OR2-NEXT: testl $1024, %eax ## imm = 0x400
; AVX1OR2-NEXT: jne LBB23_21
; AVX1OR2-NEXT: LBB23_22: ## %else29
; AVX1OR2-NEXT: testl $2048, %eax ## imm = 0x800
; AVX1OR2-NEXT: jne LBB23_23
; AVX1OR2-NEXT: LBB23_24: ## %else32
; AVX1OR2-NEXT: testl $4096, %eax ## imm = 0x1000
; AVX1OR2-NEXT: jne LBB23_25
; AVX1OR2-NEXT: LBB23_26: ## %else35
; AVX1OR2-NEXT: testl $8192, %eax ## imm = 0x2000
; AVX1OR2-NEXT: jne LBB23_27
; AVX1OR2-NEXT: LBB23_28: ## %else38
; AVX1OR2-NEXT: testl $16384, %eax ## imm = 0x4000
; AVX1OR2-NEXT: jne LBB23_29
; AVX1OR2-NEXT: LBB23_30: ## %else41
; AVX1OR2-NEXT: testl $32768, %eax ## imm = 0x8000
; AVX1OR2-NEXT: jne LBB23_31
; AVX1OR2-NEXT: LBB23_32: ## %else44
; AVX1OR2-NEXT: vmovdqa %xmm1, %xmm0
; AVX1OR2-NEXT: retq
; AVX1OR2-NEXT: LBB23_1: ## %cond.load
; AVX1OR2-NEXT: vpinsrb $0, (%rdi), %xmm1, %xmm1
; AVX1OR2-NEXT: testb $2, %al
; AVX1OR2-NEXT: je LBB23_4
; AVX1OR2-NEXT: LBB23_3: ## %cond.load1
; AVX1OR2-NEXT: vpinsrb $1, 1(%rdi), %xmm1, %xmm1
; AVX1OR2-NEXT: testb $4, %al
; AVX1OR2-NEXT: je LBB23_6
; AVX1OR2-NEXT: LBB23_5: ## %cond.load4
; AVX1OR2-NEXT: vpinsrb $2, 2(%rdi), %xmm1, %xmm1
; AVX1OR2-NEXT: testb $8, %al
; AVX1OR2-NEXT: je LBB23_8
; AVX1OR2-NEXT: LBB23_7: ## %cond.load7
; AVX1OR2-NEXT: vpinsrb $3, 3(%rdi), %xmm1, %xmm1
; AVX1OR2-NEXT: testb $16, %al
; AVX1OR2-NEXT: je LBB23_10
; AVX1OR2-NEXT: LBB23_9: ## %cond.load10
; AVX1OR2-NEXT: vpinsrb $4, 4(%rdi), %xmm1, %xmm1
; AVX1OR2-NEXT: testb $32, %al
; AVX1OR2-NEXT: je LBB23_12
; AVX1OR2-NEXT: LBB23_11: ## %cond.load13
; AVX1OR2-NEXT: vpinsrb $5, 5(%rdi), %xmm1, %xmm1
; AVX1OR2-NEXT: testb $64, %al
; AVX1OR2-NEXT: je LBB23_14
; AVX1OR2-NEXT: LBB23_13: ## %cond.load16
; AVX1OR2-NEXT: vpinsrb $6, 6(%rdi), %xmm1, %xmm1
; AVX1OR2-NEXT: testb $-128, %al
; AVX1OR2-NEXT: je LBB23_16
; AVX1OR2-NEXT: LBB23_15: ## %cond.load19
; AVX1OR2-NEXT: vpinsrb $7, 7(%rdi), %xmm1, %xmm1
; AVX1OR2-NEXT: testl $256, %eax ## imm = 0x100
; AVX1OR2-NEXT: je LBB23_18
; AVX1OR2-NEXT: LBB23_17: ## %cond.load22
; AVX1OR2-NEXT: vpinsrb $8, 8(%rdi), %xmm1, %xmm1
; AVX1OR2-NEXT: testl $512, %eax ## imm = 0x200
; AVX1OR2-NEXT: je LBB23_20
; AVX1OR2-NEXT: LBB23_19: ## %cond.load25
; AVX1OR2-NEXT: vpinsrb $9, 9(%rdi), %xmm1, %xmm1
; AVX1OR2-NEXT: testl $1024, %eax ## imm = 0x400
; AVX1OR2-NEXT: je LBB23_22
; AVX1OR2-NEXT: LBB23_21: ## %cond.load28
; AVX1OR2-NEXT: vpinsrb $10, 10(%rdi), %xmm1, %xmm1
; AVX1OR2-NEXT: testl $2048, %eax ## imm = 0x800
; AVX1OR2-NEXT: je LBB23_24
; AVX1OR2-NEXT: LBB23_23: ## %cond.load31
; AVX1OR2-NEXT: vpinsrb $11, 11(%rdi), %xmm1, %xmm1
; AVX1OR2-NEXT: testl $4096, %eax ## imm = 0x1000
; AVX1OR2-NEXT: je LBB23_26
; AVX1OR2-NEXT: LBB23_25: ## %cond.load34
; AVX1OR2-NEXT: vpinsrb $12, 12(%rdi), %xmm1, %xmm1
; AVX1OR2-NEXT: testl $8192, %eax ## imm = 0x2000
; AVX1OR2-NEXT: je LBB23_28
; AVX1OR2-NEXT: LBB23_27: ## %cond.load37
; AVX1OR2-NEXT: vpinsrb $13, 13(%rdi), %xmm1, %xmm1
; AVX1OR2-NEXT: testl $16384, %eax ## imm = 0x4000
; AVX1OR2-NEXT: je LBB23_30
; AVX1OR2-NEXT: LBB23_29: ## %cond.load40
; AVX1OR2-NEXT: vpinsrb $14, 14(%rdi), %xmm1, %xmm1
; AVX1OR2-NEXT: testl $32768, %eax ## imm = 0x8000
; AVX1OR2-NEXT: je LBB23_32
; AVX1OR2-NEXT: LBB23_31: ## %cond.load43
; AVX1OR2-NEXT: vpinsrb $15, 15(%rdi), %xmm1, %xmm1
; AVX1OR2-NEXT: vmovdqa %xmm1, %xmm0
; AVX1OR2-NEXT: retq
;
; AVX512F-LABEL: load_v16i8_v16i8:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: vpmovmskb %xmm0, %eax
; AVX512F-NEXT: testb $1, %al
; AVX512F-NEXT: jne LBB23_1
; AVX512F-NEXT: ## %bb.2: ## %else
; AVX512F-NEXT: testb $2, %al
; AVX512F-NEXT: jne LBB23_3
; AVX512F-NEXT: LBB23_4: ## %else2
; AVX512F-NEXT: testb $4, %al
; AVX512F-NEXT: jne LBB23_5
; AVX512F-NEXT: LBB23_6: ## %else5
; AVX512F-NEXT: testb $8, %al
; AVX512F-NEXT: jne LBB23_7
; AVX512F-NEXT: LBB23_8: ## %else8
; AVX512F-NEXT: testb $16, %al
; AVX512F-NEXT: jne LBB23_9
; AVX512F-NEXT: LBB23_10: ## %else11
; AVX512F-NEXT: testb $32, %al
; AVX512F-NEXT: jne LBB23_11
; AVX512F-NEXT: LBB23_12: ## %else14
; AVX512F-NEXT: testb $64, %al
; AVX512F-NEXT: jne LBB23_13
; AVX512F-NEXT: LBB23_14: ## %else17
; AVX512F-NEXT: testb $-128, %al
; AVX512F-NEXT: jne LBB23_15
; AVX512F-NEXT: LBB23_16: ## %else20
; AVX512F-NEXT: testl $256, %eax ## imm = 0x100
; AVX512F-NEXT: jne LBB23_17
; AVX512F-NEXT: LBB23_18: ## %else23
; AVX512F-NEXT: testl $512, %eax ## imm = 0x200
; AVX512F-NEXT: jne LBB23_19
; AVX512F-NEXT: LBB23_20: ## %else26
; AVX512F-NEXT: testl $1024, %eax ## imm = 0x400
; AVX512F-NEXT: jne LBB23_21
; AVX512F-NEXT: LBB23_22: ## %else29
; AVX512F-NEXT: testl $2048, %eax ## imm = 0x800
; AVX512F-NEXT: jne LBB23_23
; AVX512F-NEXT: LBB23_24: ## %else32
; AVX512F-NEXT: testl $4096, %eax ## imm = 0x1000
; AVX512F-NEXT: jne LBB23_25
; AVX512F-NEXT: LBB23_26: ## %else35
; AVX512F-NEXT: testl $8192, %eax ## imm = 0x2000
; AVX512F-NEXT: jne LBB23_27
; AVX512F-NEXT: LBB23_28: ## %else38
; AVX512F-NEXT: testl $16384, %eax ## imm = 0x4000
; AVX512F-NEXT: jne LBB23_29
; AVX512F-NEXT: LBB23_30: ## %else41
; AVX512F-NEXT: testl $32768, %eax ## imm = 0x8000
; AVX512F-NEXT: jne LBB23_31
; AVX512F-NEXT: LBB23_32: ## %else44
; AVX512F-NEXT: vmovdqa %xmm1, %xmm0
; AVX512F-NEXT: retq
; AVX512F-NEXT: LBB23_1: ## %cond.load
; AVX512F-NEXT: vpinsrb $0, (%rdi), %xmm1, %xmm1
; AVX512F-NEXT: testb $2, %al
; AVX512F-NEXT: je LBB23_4
; AVX512F-NEXT: LBB23_3: ## %cond.load1
; AVX512F-NEXT: vpinsrb $1, 1(%rdi), %xmm1, %xmm1
; AVX512F-NEXT: testb $4, %al
; AVX512F-NEXT: je LBB23_6
; AVX512F-NEXT: LBB23_5: ## %cond.load4
; AVX512F-NEXT: vpinsrb $2, 2(%rdi), %xmm1, %xmm1
; AVX512F-NEXT: testb $8, %al
; AVX512F-NEXT: je LBB23_8
; AVX512F-NEXT: LBB23_7: ## %cond.load7
; AVX512F-NEXT: vpinsrb $3, 3(%rdi), %xmm1, %xmm1
; AVX512F-NEXT: testb $16, %al
; AVX512F-NEXT: je LBB23_10
; AVX512F-NEXT: LBB23_9: ## %cond.load10
; AVX512F-NEXT: vpinsrb $4, 4(%rdi), %xmm1, %xmm1
; AVX512F-NEXT: testb $32, %al
; AVX512F-NEXT: je LBB23_12
; AVX512F-NEXT: LBB23_11: ## %cond.load13
; AVX512F-NEXT: vpinsrb $5, 5(%rdi), %xmm1, %xmm1
; AVX512F-NEXT: testb $64, %al
; AVX512F-NEXT: je LBB23_14
; AVX512F-NEXT: LBB23_13: ## %cond.load16
; AVX512F-NEXT: vpinsrb $6, 6(%rdi), %xmm1, %xmm1
; AVX512F-NEXT: testb $-128, %al
; AVX512F-NEXT: je LBB23_16
; AVX512F-NEXT: LBB23_15: ## %cond.load19
; AVX512F-NEXT: vpinsrb $7, 7(%rdi), %xmm1, %xmm1
; AVX512F-NEXT: testl $256, %eax ## imm = 0x100
; AVX512F-NEXT: je LBB23_18
; AVX512F-NEXT: LBB23_17: ## %cond.load22
; AVX512F-NEXT: vpinsrb $8, 8(%rdi), %xmm1, %xmm1
; AVX512F-NEXT: testl $512, %eax ## imm = 0x200
; AVX512F-NEXT: je LBB23_20
; AVX512F-NEXT: LBB23_19: ## %cond.load25
; AVX512F-NEXT: vpinsrb $9, 9(%rdi), %xmm1, %xmm1
; AVX512F-NEXT: testl $1024, %eax ## imm = 0x400
; AVX512F-NEXT: je LBB23_22
; AVX512F-NEXT: LBB23_21: ## %cond.load28
; AVX512F-NEXT: vpinsrb $10, 10(%rdi), %xmm1, %xmm1
; AVX512F-NEXT: testl $2048, %eax ## imm = 0x800
; AVX512F-NEXT: je LBB23_24
; AVX512F-NEXT: LBB23_23: ## %cond.load31
; AVX512F-NEXT: vpinsrb $11, 11(%rdi), %xmm1, %xmm1
; AVX512F-NEXT: testl $4096, %eax ## imm = 0x1000
; AVX512F-NEXT: je LBB23_26
; AVX512F-NEXT: LBB23_25: ## %cond.load34
; AVX512F-NEXT: vpinsrb $12, 12(%rdi), %xmm1, %xmm1
; AVX512F-NEXT: testl $8192, %eax ## imm = 0x2000
; AVX512F-NEXT: je LBB23_28
; AVX512F-NEXT: LBB23_27: ## %cond.load37
; AVX512F-NEXT: vpinsrb $13, 13(%rdi), %xmm1, %xmm1
; AVX512F-NEXT: testl $16384, %eax ## imm = 0x4000
; AVX512F-NEXT: je LBB23_30
; AVX512F-NEXT: LBB23_29: ## %cond.load40
; AVX512F-NEXT: vpinsrb $14, 14(%rdi), %xmm1, %xmm1
; AVX512F-NEXT: testl $32768, %eax ## imm = 0x8000
; AVX512F-NEXT: je LBB23_32
; AVX512F-NEXT: LBB23_31: ## %cond.load43
; AVX512F-NEXT: vpinsrb $15, 15(%rdi), %xmm1, %xmm1
; AVX512F-NEXT: vmovdqa %xmm1, %xmm0
; AVX512F-NEXT: retq
;
; AVX512VLDQ-LABEL: load_v16i8_v16i8:
; AVX512VLDQ: ## %bb.0:
; AVX512VLDQ-NEXT: vpmovmskb %xmm0, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: jne LBB23_1
; AVX512VLDQ-NEXT: ## %bb.2: ## %else
; AVX512VLDQ-NEXT: testb $2, %al
; AVX512VLDQ-NEXT: jne LBB23_3
; AVX512VLDQ-NEXT: LBB23_4: ## %else2
; AVX512VLDQ-NEXT: testb $4, %al
; AVX512VLDQ-NEXT: jne LBB23_5
; AVX512VLDQ-NEXT: LBB23_6: ## %else5
; AVX512VLDQ-NEXT: testb $8, %al
; AVX512VLDQ-NEXT: jne LBB23_7
; AVX512VLDQ-NEXT: LBB23_8: ## %else8
; AVX512VLDQ-NEXT: testb $16, %al
; AVX512VLDQ-NEXT: jne LBB23_9
; AVX512VLDQ-NEXT: LBB23_10: ## %else11
; AVX512VLDQ-NEXT: testb $32, %al
; AVX512VLDQ-NEXT: jne LBB23_11
; AVX512VLDQ-NEXT: LBB23_12: ## %else14
; AVX512VLDQ-NEXT: testb $64, %al
; AVX512VLDQ-NEXT: jne LBB23_13
; AVX512VLDQ-NEXT: LBB23_14: ## %else17
; AVX512VLDQ-NEXT: testb $-128, %al
; AVX512VLDQ-NEXT: jne LBB23_15
; AVX512VLDQ-NEXT: LBB23_16: ## %else20
; AVX512VLDQ-NEXT: testl $256, %eax ## imm = 0x100
; AVX512VLDQ-NEXT: jne LBB23_17
; AVX512VLDQ-NEXT: LBB23_18: ## %else23
; AVX512VLDQ-NEXT: testl $512, %eax ## imm = 0x200
; AVX512VLDQ-NEXT: jne LBB23_19
; AVX512VLDQ-NEXT: LBB23_20: ## %else26
; AVX512VLDQ-NEXT: testl $1024, %eax ## imm = 0x400
; AVX512VLDQ-NEXT: jne LBB23_21
; AVX512VLDQ-NEXT: LBB23_22: ## %else29
; AVX512VLDQ-NEXT: testl $2048, %eax ## imm = 0x800
; AVX512VLDQ-NEXT: jne LBB23_23
; AVX512VLDQ-NEXT: LBB23_24: ## %else32
; AVX512VLDQ-NEXT: testl $4096, %eax ## imm = 0x1000
; AVX512VLDQ-NEXT: jne LBB23_25
; AVX512VLDQ-NEXT: LBB23_26: ## %else35
; AVX512VLDQ-NEXT: testl $8192, %eax ## imm = 0x2000
; AVX512VLDQ-NEXT: jne LBB23_27
; AVX512VLDQ-NEXT: LBB23_28: ## %else38
; AVX512VLDQ-NEXT: testl $16384, %eax ## imm = 0x4000
; AVX512VLDQ-NEXT: jne LBB23_29
; AVX512VLDQ-NEXT: LBB23_30: ## %else41
; AVX512VLDQ-NEXT: testl $32768, %eax ## imm = 0x8000
; AVX512VLDQ-NEXT: jne LBB23_31
; AVX512VLDQ-NEXT: LBB23_32: ## %else44
; AVX512VLDQ-NEXT: vmovdqa %xmm1, %xmm0
; AVX512VLDQ-NEXT: retq
; AVX512VLDQ-NEXT: LBB23_1: ## %cond.load
; AVX512VLDQ-NEXT: vpinsrb $0, (%rdi), %xmm1, %xmm1
; AVX512VLDQ-NEXT: testb $2, %al
; AVX512VLDQ-NEXT: je LBB23_4
; AVX512VLDQ-NEXT: LBB23_3: ## %cond.load1
; AVX512VLDQ-NEXT: vpinsrb $1, 1(%rdi), %xmm1, %xmm1
; AVX512VLDQ-NEXT: testb $4, %al
; AVX512VLDQ-NEXT: je LBB23_6
; AVX512VLDQ-NEXT: LBB23_5: ## %cond.load4
; AVX512VLDQ-NEXT: vpinsrb $2, 2(%rdi), %xmm1, %xmm1
; AVX512VLDQ-NEXT: testb $8, %al
; AVX512VLDQ-NEXT: je LBB23_8
; AVX512VLDQ-NEXT: LBB23_7: ## %cond.load7
; AVX512VLDQ-NEXT: vpinsrb $3, 3(%rdi), %xmm1, %xmm1
; AVX512VLDQ-NEXT: testb $16, %al
; AVX512VLDQ-NEXT: je LBB23_10
; AVX512VLDQ-NEXT: LBB23_9: ## %cond.load10
; AVX512VLDQ-NEXT: vpinsrb $4, 4(%rdi), %xmm1, %xmm1
; AVX512VLDQ-NEXT: testb $32, %al
; AVX512VLDQ-NEXT: je LBB23_12
; AVX512VLDQ-NEXT: LBB23_11: ## %cond.load13
; AVX512VLDQ-NEXT: vpinsrb $5, 5(%rdi), %xmm1, %xmm1
; AVX512VLDQ-NEXT: testb $64, %al
; AVX512VLDQ-NEXT: je LBB23_14
; AVX512VLDQ-NEXT: LBB23_13: ## %cond.load16
; AVX512VLDQ-NEXT: vpinsrb $6, 6(%rdi), %xmm1, %xmm1
; AVX512VLDQ-NEXT: testb $-128, %al
; AVX512VLDQ-NEXT: je LBB23_16
; AVX512VLDQ-NEXT: LBB23_15: ## %cond.load19
; AVX512VLDQ-NEXT: vpinsrb $7, 7(%rdi), %xmm1, %xmm1
; AVX512VLDQ-NEXT: testl $256, %eax ## imm = 0x100
; AVX512VLDQ-NEXT: je LBB23_18
; AVX512VLDQ-NEXT: LBB23_17: ## %cond.load22
; AVX512VLDQ-NEXT: vpinsrb $8, 8(%rdi), %xmm1, %xmm1
; AVX512VLDQ-NEXT: testl $512, %eax ## imm = 0x200
; AVX512VLDQ-NEXT: je LBB23_20
; AVX512VLDQ-NEXT: LBB23_19: ## %cond.load25
; AVX512VLDQ-NEXT: vpinsrb $9, 9(%rdi), %xmm1, %xmm1
; AVX512VLDQ-NEXT: testl $1024, %eax ## imm = 0x400
; AVX512VLDQ-NEXT: je LBB23_22
; AVX512VLDQ-NEXT: LBB23_21: ## %cond.load28
; AVX512VLDQ-NEXT: vpinsrb $10, 10(%rdi), %xmm1, %xmm1
; AVX512VLDQ-NEXT: testl $2048, %eax ## imm = 0x800
; AVX512VLDQ-NEXT: je LBB23_24
; AVX512VLDQ-NEXT: LBB23_23: ## %cond.load31
; AVX512VLDQ-NEXT: vpinsrb $11, 11(%rdi), %xmm1, %xmm1
; AVX512VLDQ-NEXT: testl $4096, %eax ## imm = 0x1000
; AVX512VLDQ-NEXT: je LBB23_26
; AVX512VLDQ-NEXT: LBB23_25: ## %cond.load34
; AVX512VLDQ-NEXT: vpinsrb $12, 12(%rdi), %xmm1, %xmm1
; AVX512VLDQ-NEXT: testl $8192, %eax ## imm = 0x2000
; AVX512VLDQ-NEXT: je LBB23_28
; AVX512VLDQ-NEXT: LBB23_27: ## %cond.load37
; AVX512VLDQ-NEXT: vpinsrb $13, 13(%rdi), %xmm1, %xmm1
; AVX512VLDQ-NEXT: testl $16384, %eax ## imm = 0x4000
; AVX512VLDQ-NEXT: je LBB23_30
; AVX512VLDQ-NEXT: LBB23_29: ## %cond.load40
; AVX512VLDQ-NEXT: vpinsrb $14, 14(%rdi), %xmm1, %xmm1
; AVX512VLDQ-NEXT: testl $32768, %eax ## imm = 0x8000
; AVX512VLDQ-NEXT: je LBB23_32
; AVX512VLDQ-NEXT: LBB23_31: ## %cond.load43
; AVX512VLDQ-NEXT: vpinsrb $15, 15(%rdi), %xmm1, %xmm1
; AVX512VLDQ-NEXT: vmovdqa %xmm1, %xmm0
; AVX512VLDQ-NEXT: retq
;
; AVX512VLBW-LABEL: load_v16i8_v16i8:
; AVX512VLBW: ## %bb.0:
; AVX512VLBW-NEXT: vpmovb2m %xmm0, %k1
; AVX512VLBW-NEXT: vpblendmb (%rdi), %xmm1, %xmm0 {%k1}
; AVX512VLBW-NEXT: retq
%mask = icmp slt <16 x i8> %trigger, zeroinitializer
%res = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %addr, i32 4, <16 x i1> %mask, <16 x i8> %dst)
ret <16 x i8> %res
}
define <32 x i8> @load_v32i8_v32i8(<32 x i8> %trigger, <32 x i8>* %addr, <32 x i8> %dst) {
; SSE2-LABEL: load_v32i8_v32i8:
; SSE2: ## %bb.0:
; SSE2-NEXT: pmovmskb %xmm0, %ecx
; SSE2-NEXT: pmovmskb %xmm1, %eax
; SSE2-NEXT: shll $16, %eax
; SSE2-NEXT: orl %ecx, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: jne LBB24_1
; SSE2-NEXT: ## %bb.2: ## %else
; SSE2-NEXT: testb $2, %al
; SSE2-NEXT: jne LBB24_3
; SSE2-NEXT: LBB24_4: ## %else2
; SSE2-NEXT: testb $4, %al
; SSE2-NEXT: jne LBB24_5
; SSE2-NEXT: LBB24_6: ## %else5
; SSE2-NEXT: testb $8, %al
; SSE2-NEXT: jne LBB24_7
; SSE2-NEXT: LBB24_8: ## %else8
; SSE2-NEXT: testb $16, %al
; SSE2-NEXT: jne LBB24_9
; SSE2-NEXT: LBB24_10: ## %else11
; SSE2-NEXT: testb $32, %al
; SSE2-NEXT: jne LBB24_11
; SSE2-NEXT: LBB24_12: ## %else14
; SSE2-NEXT: testb $64, %al
; SSE2-NEXT: jne LBB24_13
; SSE2-NEXT: LBB24_14: ## %else17
; SSE2-NEXT: testb $-128, %al
; SSE2-NEXT: jne LBB24_15
; SSE2-NEXT: LBB24_16: ## %else20
; SSE2-NEXT: testl $256, %eax ## imm = 0x100
; SSE2-NEXT: jne LBB24_17
; SSE2-NEXT: LBB24_18: ## %else23
; SSE2-NEXT: testl $512, %eax ## imm = 0x200
; SSE2-NEXT: jne LBB24_19
; SSE2-NEXT: LBB24_20: ## %else26
; SSE2-NEXT: testl $1024, %eax ## imm = 0x400
; SSE2-NEXT: jne LBB24_21
; SSE2-NEXT: LBB24_22: ## %else29
; SSE2-NEXT: testl $2048, %eax ## imm = 0x800
; SSE2-NEXT: jne LBB24_23
; SSE2-NEXT: LBB24_24: ## %else32
; SSE2-NEXT: testl $4096, %eax ## imm = 0x1000
; SSE2-NEXT: jne LBB24_25
; SSE2-NEXT: LBB24_26: ## %else35
; SSE2-NEXT: testl $8192, %eax ## imm = 0x2000
; SSE2-NEXT: jne LBB24_27
; SSE2-NEXT: LBB24_28: ## %else38
; SSE2-NEXT: testl $16384, %eax ## imm = 0x4000
; SSE2-NEXT: jne LBB24_29
; SSE2-NEXT: LBB24_30: ## %else41
; SSE2-NEXT: testl $32768, %eax ## imm = 0x8000
; SSE2-NEXT: jne LBB24_31
; SSE2-NEXT: LBB24_32: ## %else44
; SSE2-NEXT: testl $65536, %eax ## imm = 0x10000
; SSE2-NEXT: jne LBB24_33
; SSE2-NEXT: LBB24_34: ## %else47
; SSE2-NEXT: testl $131072, %eax ## imm = 0x20000
; SSE2-NEXT: jne LBB24_35
; SSE2-NEXT: LBB24_36: ## %else50
; SSE2-NEXT: testl $262144, %eax ## imm = 0x40000
; SSE2-NEXT: jne LBB24_37
; SSE2-NEXT: LBB24_38: ## %else53
; SSE2-NEXT: testl $524288, %eax ## imm = 0x80000
; SSE2-NEXT: jne LBB24_39
; SSE2-NEXT: LBB24_40: ## %else56
; SSE2-NEXT: testl $1048576, %eax ## imm = 0x100000
; SSE2-NEXT: jne LBB24_41
; SSE2-NEXT: LBB24_42: ## %else59
; SSE2-NEXT: testl $2097152, %eax ## imm = 0x200000
; SSE2-NEXT: jne LBB24_43
; SSE2-NEXT: LBB24_44: ## %else62
; SSE2-NEXT: testl $4194304, %eax ## imm = 0x400000
; SSE2-NEXT: jne LBB24_45
; SSE2-NEXT: LBB24_46: ## %else65
; SSE2-NEXT: testl $8388608, %eax ## imm = 0x800000
; SSE2-NEXT: jne LBB24_47
; SSE2-NEXT: LBB24_48: ## %else68
; SSE2-NEXT: testl $16777216, %eax ## imm = 0x1000000
; SSE2-NEXT: jne LBB24_49
; SSE2-NEXT: LBB24_50: ## %else71
; SSE2-NEXT: testl $33554432, %eax ## imm = 0x2000000
; SSE2-NEXT: jne LBB24_51
; SSE2-NEXT: LBB24_52: ## %else74
; SSE2-NEXT: testl $67108864, %eax ## imm = 0x4000000
; SSE2-NEXT: jne LBB24_53
; SSE2-NEXT: LBB24_54: ## %else77
; SSE2-NEXT: testl $134217728, %eax ## imm = 0x8000000
; SSE2-NEXT: jne LBB24_55
; SSE2-NEXT: LBB24_56: ## %else80
; SSE2-NEXT: testl $268435456, %eax ## imm = 0x10000000
; SSE2-NEXT: jne LBB24_57
; SSE2-NEXT: LBB24_58: ## %else83
; SSE2-NEXT: testl $536870912, %eax ## imm = 0x20000000
; SSE2-NEXT: jne LBB24_59
; SSE2-NEXT: LBB24_60: ## %else86
; SSE2-NEXT: testl $1073741824, %eax ## imm = 0x40000000
; SSE2-NEXT: jne LBB24_61
; SSE2-NEXT: LBB24_62: ## %else89
; SSE2-NEXT: testl $-2147483648, %eax ## imm = 0x80000000
; SSE2-NEXT: je LBB24_64
; SSE2-NEXT: LBB24_63: ## %cond.load91
; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
; SSE2-NEXT: movzbl 31(%rdi), %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
; SSE2-NEXT: por %xmm0, %xmm3
; SSE2-NEXT: LBB24_64: ## %else92
; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm3, %xmm1
; SSE2-NEXT: retq
; SSE2-NEXT: LBB24_1: ## %cond.load
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm0, %xmm2
; SSE2-NEXT: movzbl (%rdi), %ecx
; SSE2-NEXT: movd %ecx, %xmm1
; SSE2-NEXT: pandn %xmm1, %xmm0
; SSE2-NEXT: por %xmm0, %xmm2
; SSE2-NEXT: testb $2, %al
; SSE2-NEXT: je LBB24_4
; SSE2-NEXT: LBB24_3: ## %cond.load1
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm0, %xmm2
; SSE2-NEXT: movzbl 1(%rdi), %ecx
; SSE2-NEXT: movd %ecx, %xmm1
; SSE2-NEXT: psllw $8, %xmm1
; SSE2-NEXT: pandn %xmm1, %xmm0
; SSE2-NEXT: por %xmm0, %xmm2
; SSE2-NEXT: testb $4, %al
; SSE2-NEXT: je LBB24_6
; SSE2-NEXT: LBB24_5: ## %cond.load4
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm0, %xmm2
; SSE2-NEXT: movzbl 2(%rdi), %ecx
; SSE2-NEXT: movd %ecx, %xmm1
; SSE2-NEXT: pslld $16, %xmm1
; SSE2-NEXT: pandn %xmm1, %xmm0
; SSE2-NEXT: por %xmm0, %xmm2
; SSE2-NEXT: testb $8, %al
; SSE2-NEXT: je LBB24_8
; SSE2-NEXT: LBB24_7: ## %cond.load7
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm0, %xmm2
; SSE2-NEXT: movzbl 3(%rdi), %ecx
; SSE2-NEXT: movd %ecx, %xmm1
; SSE2-NEXT: pslld $24, %xmm1
; SSE2-NEXT: pandn %xmm1, %xmm0
; SSE2-NEXT: por %xmm0, %xmm2
; SSE2-NEXT: testb $16, %al
; SSE2-NEXT: je LBB24_10
; SSE2-NEXT: LBB24_9: ## %cond.load10
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm0, %xmm2
; SSE2-NEXT: movzbl 4(%rdi), %ecx
; SSE2-NEXT: movd %ecx, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
; SSE2-NEXT: pandn %xmm1, %xmm0
; SSE2-NEXT: por %xmm0, %xmm2
; SSE2-NEXT: testb $32, %al
; SSE2-NEXT: je LBB24_12
; SSE2-NEXT: LBB24_11: ## %cond.load13
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm0, %xmm2
; SSE2-NEXT: movzbl 5(%rdi), %ecx
; SSE2-NEXT: movd %ecx, %xmm1
; SSE2-NEXT: psllq $40, %xmm1
; SSE2-NEXT: pandn %xmm1, %xmm0
; SSE2-NEXT: por %xmm0, %xmm2
; SSE2-NEXT: testb $64, %al
; SSE2-NEXT: je LBB24_14
; SSE2-NEXT: LBB24_13: ## %cond.load16
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm0, %xmm2
; SSE2-NEXT: movzbl 6(%rdi), %ecx
; SSE2-NEXT: movd %ecx, %xmm1
; SSE2-NEXT: psllq $48, %xmm1
; SSE2-NEXT: pandn %xmm1, %xmm0
; SSE2-NEXT: por %xmm0, %xmm2
; SSE2-NEXT: testb $-128, %al
; SSE2-NEXT: je LBB24_16
; SSE2-NEXT: LBB24_15: ## %cond.load19
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm0, %xmm2
; SSE2-NEXT: movzbl 7(%rdi), %ecx
; SSE2-NEXT: movd %ecx, %xmm1
; SSE2-NEXT: psllq $56, %xmm1
; SSE2-NEXT: pandn %xmm1, %xmm0
; SSE2-NEXT: por %xmm0, %xmm2
; SSE2-NEXT: testl $256, %eax ## imm = 0x100
; SSE2-NEXT: je LBB24_18
; SSE2-NEXT: LBB24_17: ## %cond.load22
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm0, %xmm2
; SSE2-NEXT: movzbl 8(%rdi), %ecx
; SSE2-NEXT: movd %ecx, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; SSE2-NEXT: pandn %xmm1, %xmm0
; SSE2-NEXT: por %xmm0, %xmm2
; SSE2-NEXT: testl $512, %eax ## imm = 0x200
; SSE2-NEXT: je LBB24_20
; SSE2-NEXT: LBB24_19: ## %cond.load25
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm0, %xmm2
; SSE2-NEXT: movzbl 9(%rdi), %ecx
; SSE2-NEXT: movd %ecx, %xmm1
; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6]
; SSE2-NEXT: pandn %xmm1, %xmm0
; SSE2-NEXT: por %xmm0, %xmm2
; SSE2-NEXT: testl $1024, %eax ## imm = 0x400
; SSE2-NEXT: je LBB24_22
; SSE2-NEXT: LBB24_21: ## %cond.load28
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255]
; SSE2-NEXT: pand %xmm0, %xmm2
; SSE2-NEXT: movzbl 10(%rdi), %ecx
; SSE2-NEXT: movd %ecx, %xmm1
; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5]
; SSE2-NEXT: pandn %xmm1, %xmm0
; SSE2-NEXT: por %xmm0, %xmm2
; SSE2-NEXT: testl $2048, %eax ## imm = 0x800
; SSE2-NEXT: je LBB24_24
; SSE2-NEXT: LBB24_23: ## %cond.load31
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255]
; SSE2-NEXT: pand %xmm0, %xmm2
; SSE2-NEXT: movzbl 11(%rdi), %ecx
; SSE2-NEXT: movd %ecx, %xmm1
; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4]
; SSE2-NEXT: pandn %xmm1, %xmm0
; SSE2-NEXT: por %xmm0, %xmm2
; SSE2-NEXT: testl $4096, %eax ## imm = 0x1000
; SSE2-NEXT: je LBB24_26
; SSE2-NEXT: LBB24_25: ## %cond.load34
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
; SSE2-NEXT: pand %xmm0, %xmm2
; SSE2-NEXT: movzbl 12(%rdi), %ecx
; SSE2-NEXT: movd %ecx, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; SSE2-NEXT: pandn %xmm1, %xmm0
; SSE2-NEXT: por %xmm0, %xmm2
; SSE2-NEXT: testl $8192, %eax ## imm = 0x2000
; SSE2-NEXT: je LBB24_28
; SSE2-NEXT: LBB24_27: ## %cond.load37
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255]
; SSE2-NEXT: pand %xmm0, %xmm2
; SSE2-NEXT: movzbl 13(%rdi), %ecx
; SSE2-NEXT: movd %ecx, %xmm1
; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2]
; SSE2-NEXT: pandn %xmm1, %xmm0
; SSE2-NEXT: por %xmm0, %xmm2
; SSE2-NEXT: testl $16384, %eax ## imm = 0x4000
; SSE2-NEXT: je LBB24_30
; SSE2-NEXT: LBB24_29: ## %cond.load40
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255]
; SSE2-NEXT: pand %xmm0, %xmm2
; SSE2-NEXT: movzbl 14(%rdi), %ecx
; SSE2-NEXT: movd %ecx, %xmm1
; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
; SSE2-NEXT: pandn %xmm1, %xmm0
; SSE2-NEXT: por %xmm0, %xmm2
; SSE2-NEXT: testl $32768, %eax ## imm = 0x8000
; SSE2-NEXT: je LBB24_32
; SSE2-NEXT: LBB24_31: ## %cond.load43
; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
; SSE2-NEXT: movzbl 15(%rdi), %ecx
; SSE2-NEXT: movd %ecx, %xmm0
; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
; SSE2-NEXT: por %xmm0, %xmm2
; SSE2-NEXT: testl $65536, %eax ## imm = 0x10000
; SSE2-NEXT: je LBB24_34
; SSE2-NEXT: LBB24_33: ## %cond.load46
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm0, %xmm3
; SSE2-NEXT: movzbl 16(%rdi), %ecx
; SSE2-NEXT: movd %ecx, %xmm1
; SSE2-NEXT: pandn %xmm1, %xmm0
; SSE2-NEXT: por %xmm0, %xmm3
; SSE2-NEXT: testl $131072, %eax ## imm = 0x20000
; SSE2-NEXT: je LBB24_36
; SSE2-NEXT: LBB24_35: ## %cond.load49
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm0, %xmm3
; SSE2-NEXT: movzbl 17(%rdi), %ecx
; SSE2-NEXT: movd %ecx, %xmm1
; SSE2-NEXT: psllw $8, %xmm1
; SSE2-NEXT: pandn %xmm1, %xmm0
; SSE2-NEXT: por %xmm0, %xmm3
; SSE2-NEXT: testl $262144, %eax ## imm = 0x40000
; SSE2-NEXT: je LBB24_38
; SSE2-NEXT: LBB24_37: ## %cond.load52
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm0, %xmm3
; SSE2-NEXT: movzbl 18(%rdi), %ecx
; SSE2-NEXT: movd %ecx, %xmm1
; SSE2-NEXT: pslld $16, %xmm1
; SSE2-NEXT: pandn %xmm1, %xmm0
; SSE2-NEXT: por %xmm0, %xmm3
; SSE2-NEXT: testl $524288, %eax ## imm = 0x80000
; SSE2-NEXT: je LBB24_40
; SSE2-NEXT: LBB24_39: ## %cond.load55
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm0, %xmm3
; SSE2-NEXT: movzbl 19(%rdi), %ecx
; SSE2-NEXT: movd %ecx, %xmm1
; SSE2-NEXT: pslld $24, %xmm1
; SSE2-NEXT: pandn %xmm1, %xmm0
; SSE2-NEXT: por %xmm0, %xmm3
; SSE2-NEXT: testl $1048576, %eax ## imm = 0x100000
; SSE2-NEXT: je LBB24_42
; SSE2-NEXT: LBB24_41: ## %cond.load58
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm0, %xmm3
; SSE2-NEXT: movzbl 20(%rdi), %ecx
; SSE2-NEXT: movd %ecx, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
; SSE2-NEXT: pandn %xmm1, %xmm0
; SSE2-NEXT: por %xmm0, %xmm3
; SSE2-NEXT: testl $2097152, %eax ## imm = 0x200000
; SSE2-NEXT: je LBB24_44
; SSE2-NEXT: LBB24_43: ## %cond.load61
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm0, %xmm3
; SSE2-NEXT: movzbl 21(%rdi), %ecx
; SSE2-NEXT: movd %ecx, %xmm1
; SSE2-NEXT: psllq $40, %xmm1
; SSE2-NEXT: pandn %xmm1, %xmm0
; SSE2-NEXT: por %xmm0, %xmm3
; SSE2-NEXT: testl $4194304, %eax ## imm = 0x400000
; SSE2-NEXT: je LBB24_46
; SSE2-NEXT: LBB24_45: ## %cond.load64
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm0, %xmm3
; SSE2-NEXT: movzbl 22(%rdi), %ecx
; SSE2-NEXT: movd %ecx, %xmm1
; SSE2-NEXT: psllq $48, %xmm1
; SSE2-NEXT: pandn %xmm1, %xmm0
; SSE2-NEXT: por %xmm0, %xmm3
; SSE2-NEXT: testl $8388608, %eax ## imm = 0x800000
; SSE2-NEXT: je LBB24_48
; SSE2-NEXT: LBB24_47: ## %cond.load67
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm0, %xmm3
; SSE2-NEXT: movzbl 23(%rdi), %ecx
; SSE2-NEXT: movd %ecx, %xmm1
; SSE2-NEXT: psllq $56, %xmm1
; SSE2-NEXT: pandn %xmm1, %xmm0
; SSE2-NEXT: por %xmm0, %xmm3
; SSE2-NEXT: testl $16777216, %eax ## imm = 0x1000000
; SSE2-NEXT: je LBB24_50
; SSE2-NEXT: LBB24_49: ## %cond.load70
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm0, %xmm3
; SSE2-NEXT: movzbl 24(%rdi), %ecx
; SSE2-NEXT: movd %ecx, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; SSE2-NEXT: pandn %xmm1, %xmm0
; SSE2-NEXT: por %xmm0, %xmm3
; SSE2-NEXT: testl $33554432, %eax ## imm = 0x2000000
; SSE2-NEXT: je LBB24_52
; SSE2-NEXT: LBB24_51: ## %cond.load73
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm0, %xmm3
; SSE2-NEXT: movzbl 25(%rdi), %ecx
; SSE2-NEXT: movd %ecx, %xmm1
; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6]
; SSE2-NEXT: pandn %xmm1, %xmm0
; SSE2-NEXT: por %xmm0, %xmm3
; SSE2-NEXT: testl $67108864, %eax ## imm = 0x4000000
; SSE2-NEXT: je LBB24_54
; SSE2-NEXT: LBB24_53: ## %cond.load76
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255]
; SSE2-NEXT: pand %xmm0, %xmm3
; SSE2-NEXT: movzbl 26(%rdi), %ecx
; SSE2-NEXT: movd %ecx, %xmm1
; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5]
; SSE2-NEXT: pandn %xmm1, %xmm0
; SSE2-NEXT: por %xmm0, %xmm3
; SSE2-NEXT: testl $134217728, %eax ## imm = 0x8000000
; SSE2-NEXT: je LBB24_56
; SSE2-NEXT: LBB24_55: ## %cond.load79
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255]
; SSE2-NEXT: pand %xmm0, %xmm3
; SSE2-NEXT: movzbl 27(%rdi), %ecx
; SSE2-NEXT: movd %ecx, %xmm1
; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4]
; SSE2-NEXT: pandn %xmm1, %xmm0
; SSE2-NEXT: por %xmm0, %xmm3
; SSE2-NEXT: testl $268435456, %eax ## imm = 0x10000000
; SSE2-NEXT: je LBB24_58
; SSE2-NEXT: LBB24_57: ## %cond.load82
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
; SSE2-NEXT: pand %xmm0, %xmm3
; SSE2-NEXT: movzbl 28(%rdi), %ecx
; SSE2-NEXT: movd %ecx, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; SSE2-NEXT: pandn %xmm1, %xmm0
; SSE2-NEXT: por %xmm0, %xmm3
; SSE2-NEXT: testl $536870912, %eax ## imm = 0x20000000
; SSE2-NEXT: je LBB24_60
; SSE2-NEXT: LBB24_59: ## %cond.load85
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255]
; SSE2-NEXT: pand %xmm0, %xmm3
; SSE2-NEXT: movzbl 29(%rdi), %ecx
; SSE2-NEXT: movd %ecx, %xmm1
; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2]
; SSE2-NEXT: pandn %xmm1, %xmm0
; SSE2-NEXT: por %xmm0, %xmm3
; SSE2-NEXT: testl $1073741824, %eax ## imm = 0x40000000
; SSE2-NEXT: je LBB24_62
; SSE2-NEXT: LBB24_61: ## %cond.load88
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255]
; SSE2-NEXT: pand %xmm0, %xmm3
; SSE2-NEXT: movzbl 30(%rdi), %ecx
; SSE2-NEXT: movd %ecx, %xmm1
; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
; SSE2-NEXT: pandn %xmm1, %xmm0
; SSE2-NEXT: por %xmm0, %xmm3
; SSE2-NEXT: testl $-2147483648, %eax ## imm = 0x80000000
; SSE2-NEXT: jne LBB24_63
; SSE2-NEXT: jmp LBB24_64
;
; SSE42-LABEL: load_v32i8_v32i8:
; SSE42: ## %bb.0:
; SSE42-NEXT: pmovmskb %xmm0, %ecx
; SSE42-NEXT: pmovmskb %xmm1, %eax
; SSE42-NEXT: shll $16, %eax
; SSE42-NEXT: orl %ecx, %eax
; SSE42-NEXT: testb $1, %al
; SSE42-NEXT: jne LBB24_1
; SSE42-NEXT: ## %bb.2: ## %else
; SSE42-NEXT: testb $2, %al
; SSE42-NEXT: jne LBB24_3
; SSE42-NEXT: LBB24_4: ## %else2
; SSE42-NEXT: testb $4, %al
; SSE42-NEXT: jne LBB24_5
; SSE42-NEXT: LBB24_6: ## %else5
; SSE42-NEXT: testb $8, %al
; SSE42-NEXT: jne LBB24_7
; SSE42-NEXT: LBB24_8: ## %else8
; SSE42-NEXT: testb $16, %al
; SSE42-NEXT: jne LBB24_9
; SSE42-NEXT: LBB24_10: ## %else11
; SSE42-NEXT: testb $32, %al
; SSE42-NEXT: jne LBB24_11
; SSE42-NEXT: LBB24_12: ## %else14
; SSE42-NEXT: testb $64, %al
; SSE42-NEXT: jne LBB24_13
; SSE42-NEXT: LBB24_14: ## %else17
; SSE42-NEXT: testb $-128, %al
; SSE42-NEXT: jne LBB24_15
; SSE42-NEXT: LBB24_16: ## %else20
; SSE42-NEXT: testl $256, %eax ## imm = 0x100
; SSE42-NEXT: jne LBB24_17
; SSE42-NEXT: LBB24_18: ## %else23
; SSE42-NEXT: testl $512, %eax ## imm = 0x200
; SSE42-NEXT: jne LBB24_19
; SSE42-NEXT: LBB24_20: ## %else26
; SSE42-NEXT: testl $1024, %eax ## imm = 0x400
; SSE42-NEXT: jne LBB24_21
; SSE42-NEXT: LBB24_22: ## %else29
; SSE42-NEXT: testl $2048, %eax ## imm = 0x800
; SSE42-NEXT: jne LBB24_23
; SSE42-NEXT: LBB24_24: ## %else32
; SSE42-NEXT: testl $4096, %eax ## imm = 0x1000
; SSE42-NEXT: jne LBB24_25
; SSE42-NEXT: LBB24_26: ## %else35
; SSE42-NEXT: testl $8192, %eax ## imm = 0x2000
; SSE42-NEXT: jne LBB24_27
; SSE42-NEXT: LBB24_28: ## %else38
; SSE42-NEXT: testl $16384, %eax ## imm = 0x4000
; SSE42-NEXT: jne LBB24_29
; SSE42-NEXT: LBB24_30: ## %else41
; SSE42-NEXT: testl $32768, %eax ## imm = 0x8000
; SSE42-NEXT: jne LBB24_31
; SSE42-NEXT: LBB24_32: ## %else44
; SSE42-NEXT: testl $65536, %eax ## imm = 0x10000
; SSE42-NEXT: jne LBB24_33
; SSE42-NEXT: LBB24_34: ## %else47
; SSE42-NEXT: testl $131072, %eax ## imm = 0x20000
; SSE42-NEXT: jne LBB24_35
; SSE42-NEXT: LBB24_36: ## %else50
; SSE42-NEXT: testl $262144, %eax ## imm = 0x40000
; SSE42-NEXT: jne LBB24_37
; SSE42-NEXT: LBB24_38: ## %else53
; SSE42-NEXT: testl $524288, %eax ## imm = 0x80000
; SSE42-NEXT: jne LBB24_39
; SSE42-NEXT: LBB24_40: ## %else56
; SSE42-NEXT: testl $1048576, %eax ## imm = 0x100000
; SSE42-NEXT: jne LBB24_41
; SSE42-NEXT: LBB24_42: ## %else59
; SSE42-NEXT: testl $2097152, %eax ## imm = 0x200000
; SSE42-NEXT: jne LBB24_43
; SSE42-NEXT: LBB24_44: ## %else62
; SSE42-NEXT: testl $4194304, %eax ## imm = 0x400000
; SSE42-NEXT: jne LBB24_45
; SSE42-NEXT: LBB24_46: ## %else65
; SSE42-NEXT: testl $8388608, %eax ## imm = 0x800000
; SSE42-NEXT: jne LBB24_47
; SSE42-NEXT: LBB24_48: ## %else68
; SSE42-NEXT: testl $16777216, %eax ## imm = 0x1000000
; SSE42-NEXT: jne LBB24_49
; SSE42-NEXT: LBB24_50: ## %else71
; SSE42-NEXT: testl $33554432, %eax ## imm = 0x2000000
; SSE42-NEXT: jne LBB24_51
; SSE42-NEXT: LBB24_52: ## %else74
; SSE42-NEXT: testl $67108864, %eax ## imm = 0x4000000
; SSE42-NEXT: jne LBB24_53
; SSE42-NEXT: LBB24_54: ## %else77
; SSE42-NEXT: testl $134217728, %eax ## imm = 0x8000000
; SSE42-NEXT: jne LBB24_55
; SSE42-NEXT: LBB24_56: ## %else80
; SSE42-NEXT: testl $268435456, %eax ## imm = 0x10000000
; SSE42-NEXT: jne LBB24_57
; SSE42-NEXT: LBB24_58: ## %else83
; SSE42-NEXT: testl $536870912, %eax ## imm = 0x20000000
; SSE42-NEXT: jne LBB24_59
; SSE42-NEXT: LBB24_60: ## %else86
; SSE42-NEXT: testl $1073741824, %eax ## imm = 0x40000000
; SSE42-NEXT: jne LBB24_61
; SSE42-NEXT: LBB24_62: ## %else89
; SSE42-NEXT: testl $-2147483648, %eax ## imm = 0x80000000
; SSE42-NEXT: je LBB24_64
; SSE42-NEXT: LBB24_63: ## %cond.load91
; SSE42-NEXT: pinsrb $15, 31(%rdi), %xmm3
; SSE42-NEXT: LBB24_64: ## %else92
; SSE42-NEXT: movdqa %xmm2, %xmm0
; SSE42-NEXT: movdqa %xmm3, %xmm1
; SSE42-NEXT: retq
; SSE42-NEXT: LBB24_1: ## %cond.load
; SSE42-NEXT: pinsrb $0, (%rdi), %xmm2
; SSE42-NEXT: testb $2, %al
; SSE42-NEXT: je LBB24_4
; SSE42-NEXT: LBB24_3: ## %cond.load1
; SSE42-NEXT: pinsrb $1, 1(%rdi), %xmm2
; SSE42-NEXT: testb $4, %al
; SSE42-NEXT: je LBB24_6
; SSE42-NEXT: LBB24_5: ## %cond.load4
; SSE42-NEXT: pinsrb $2, 2(%rdi), %xmm2
; SSE42-NEXT: testb $8, %al
; SSE42-NEXT: je LBB24_8
; SSE42-NEXT: LBB24_7: ## %cond.load7
; SSE42-NEXT: pinsrb $3, 3(%rdi), %xmm2
; SSE42-NEXT: testb $16, %al
; SSE42-NEXT: je LBB24_10
; SSE42-NEXT: LBB24_9: ## %cond.load10
; SSE42-NEXT: pinsrb $4, 4(%rdi), %xmm2
; SSE42-NEXT: testb $32, %al
; SSE42-NEXT: je LBB24_12
; SSE42-NEXT: LBB24_11: ## %cond.load13
; SSE42-NEXT: pinsrb $5, 5(%rdi), %xmm2
; SSE42-NEXT: testb $64, %al
; SSE42-NEXT: je LBB24_14
; SSE42-NEXT: LBB24_13: ## %cond.load16
; SSE42-NEXT: pinsrb $6, 6(%rdi), %xmm2
; SSE42-NEXT: testb $-128, %al
; SSE42-NEXT: je LBB24_16
; SSE42-NEXT: LBB24_15: ## %cond.load19
; SSE42-NEXT: pinsrb $7, 7(%rdi), %xmm2
; SSE42-NEXT: testl $256, %eax ## imm = 0x100
; SSE42-NEXT: je LBB24_18
; SSE42-NEXT: LBB24_17: ## %cond.load22
; SSE42-NEXT: pinsrb $8, 8(%rdi), %xmm2
; SSE42-NEXT: testl $512, %eax ## imm = 0x200
; SSE42-NEXT: je LBB24_20
; SSE42-NEXT: LBB24_19: ## %cond.load25
; SSE42-NEXT: pinsrb $9, 9(%rdi), %xmm2
; SSE42-NEXT: testl $1024, %eax ## imm = 0x400
; SSE42-NEXT: je LBB24_22
; SSE42-NEXT: LBB24_21: ## %cond.load28
; SSE42-NEXT: pinsrb $10, 10(%rdi), %xmm2
; SSE42-NEXT: testl $2048, %eax ## imm = 0x800
; SSE42-NEXT: je LBB24_24
; SSE42-NEXT: LBB24_23: ## %cond.load31
; SSE42-NEXT: pinsrb $11, 11(%rdi), %xmm2
; SSE42-NEXT: testl $4096, %eax ## imm = 0x1000
; SSE42-NEXT: je LBB24_26
; SSE42-NEXT: LBB24_25: ## %cond.load34
; SSE42-NEXT: pinsrb $12, 12(%rdi), %xmm2
; SSE42-NEXT: testl $8192, %eax ## imm = 0x2000
; SSE42-NEXT: je LBB24_28
; SSE42-NEXT: LBB24_27: ## %cond.load37
; SSE42-NEXT: pinsrb $13, 13(%rdi), %xmm2
; SSE42-NEXT: testl $16384, %eax ## imm = 0x4000
; SSE42-NEXT: je LBB24_30
; SSE42-NEXT: LBB24_29: ## %cond.load40
; SSE42-NEXT: pinsrb $14, 14(%rdi), %xmm2
; SSE42-NEXT: testl $32768, %eax ## imm = 0x8000
; SSE42-NEXT: je LBB24_32
; SSE42-NEXT: LBB24_31: ## %cond.load43
; SSE42-NEXT: pinsrb $15, 15(%rdi), %xmm2
; SSE42-NEXT: testl $65536, %eax ## imm = 0x10000
; SSE42-NEXT: je LBB24_34
; SSE42-NEXT: LBB24_33: ## %cond.load46
; SSE42-NEXT: pinsrb $0, 16(%rdi), %xmm3
; SSE42-NEXT: testl $131072, %eax ## imm = 0x20000
; SSE42-NEXT: je LBB24_36
; SSE42-NEXT: LBB24_35: ## %cond.load49
; SSE42-NEXT: pinsrb $1, 17(%rdi), %xmm3
; SSE42-NEXT: testl $262144, %eax ## imm = 0x40000
; SSE42-NEXT: je LBB24_38
; SSE42-NEXT: LBB24_37: ## %cond.load52
; SSE42-NEXT: pinsrb $2, 18(%rdi), %xmm3
; SSE42-NEXT: testl $524288, %eax ## imm = 0x80000
; SSE42-NEXT: je LBB24_40
; SSE42-NEXT: LBB24_39: ## %cond.load55
; SSE42-NEXT: pinsrb $3, 19(%rdi), %xmm3
; SSE42-NEXT: testl $1048576, %eax ## imm = 0x100000
; SSE42-NEXT: je LBB24_42
; SSE42-NEXT: LBB24_41: ## %cond.load58
; SSE42-NEXT: pinsrb $4, 20(%rdi), %xmm3
; SSE42-NEXT: testl $2097152, %eax ## imm = 0x200000
; SSE42-NEXT: je LBB24_44
; SSE42-NEXT: LBB24_43: ## %cond.load61
; SSE42-NEXT: pinsrb $5, 21(%rdi), %xmm3
; SSE42-NEXT: testl $4194304, %eax ## imm = 0x400000
; SSE42-NEXT: je LBB24_46
; SSE42-NEXT: LBB24_45: ## %cond.load64
; SSE42-NEXT: pinsrb $6, 22(%rdi), %xmm3
; SSE42-NEXT: testl $8388608, %eax ## imm = 0x800000
; SSE42-NEXT: je LBB24_48
; SSE42-NEXT: LBB24_47: ## %cond.load67
; SSE42-NEXT: pinsrb $7, 23(%rdi), %xmm3
; SSE42-NEXT: testl $16777216, %eax ## imm = 0x1000000
; SSE42-NEXT: je LBB24_50
; SSE42-NEXT: LBB24_49: ## %cond.load70
; SSE42-NEXT: pinsrb $8, 24(%rdi), %xmm3
; SSE42-NEXT: testl $33554432, %eax ## imm = 0x2000000
; SSE42-NEXT: je LBB24_52
; SSE42-NEXT: LBB24_51: ## %cond.load73
; SSE42-NEXT: pinsrb $9, 25(%rdi), %xmm3
; SSE42-NEXT: testl $67108864, %eax ## imm = 0x4000000
; SSE42-NEXT: je LBB24_54
; SSE42-NEXT: LBB24_53: ## %cond.load76
; SSE42-NEXT: pinsrb $10, 26(%rdi), %xmm3
; SSE42-NEXT: testl $134217728, %eax ## imm = 0x8000000
; SSE42-NEXT: je LBB24_56
; SSE42-NEXT: LBB24_55: ## %cond.load79
; SSE42-NEXT: pinsrb $11, 27(%rdi), %xmm3
; SSE42-NEXT: testl $268435456, %eax ## imm = 0x10000000
; SSE42-NEXT: je LBB24_58
; SSE42-NEXT: LBB24_57: ## %cond.load82
; SSE42-NEXT: pinsrb $12, 28(%rdi), %xmm3
; SSE42-NEXT: testl $536870912, %eax ## imm = 0x20000000
; SSE42-NEXT: je LBB24_60
; SSE42-NEXT: LBB24_59: ## %cond.load85
; SSE42-NEXT: pinsrb $13, 29(%rdi), %xmm3
; SSE42-NEXT: testl $1073741824, %eax ## imm = 0x40000000
; SSE42-NEXT: je LBB24_62
; SSE42-NEXT: LBB24_61: ## %cond.load88
; SSE42-NEXT: pinsrb $14, 30(%rdi), %xmm3
; SSE42-NEXT: testl $-2147483648, %eax ## imm = 0x80000000
; SSE42-NEXT: jne LBB24_63
; SSE42-NEXT: jmp LBB24_64
;
; AVX1-LABEL: load_v32i8_v32i8:
; AVX1: ## %bb.0:
; AVX1-NEXT: vpmovmskb %xmm0, %ecx
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpmovmskb %xmm0, %eax
; AVX1-NEXT: shll $16, %eax
; AVX1-NEXT: orl %ecx, %eax
; AVX1-NEXT: testb $1, %al
; AVX1-NEXT: jne LBB24_1
; AVX1-NEXT: ## %bb.2: ## %else
; AVX1-NEXT: testb $2, %al
; AVX1-NEXT: jne LBB24_3
; AVX1-NEXT: LBB24_4: ## %else2
; AVX1-NEXT: testb $4, %al
; AVX1-NEXT: jne LBB24_5
; AVX1-NEXT: LBB24_6: ## %else5
; AVX1-NEXT: testb $8, %al
; AVX1-NEXT: jne LBB24_7
; AVX1-NEXT: LBB24_8: ## %else8
; AVX1-NEXT: testb $16, %al
; AVX1-NEXT: jne LBB24_9
; AVX1-NEXT: LBB24_10: ## %else11
; AVX1-NEXT: testb $32, %al
; AVX1-NEXT: jne LBB24_11
; AVX1-NEXT: LBB24_12: ## %else14
; AVX1-NEXT: testb $64, %al
; AVX1-NEXT: jne LBB24_13
; AVX1-NEXT: LBB24_14: ## %else17
; AVX1-NEXT: testb $-128, %al
; AVX1-NEXT: jne LBB24_15
; AVX1-NEXT: LBB24_16: ## %else20
; AVX1-NEXT: testl $256, %eax ## imm = 0x100
; AVX1-NEXT: jne LBB24_17
; AVX1-NEXT: LBB24_18: ## %else23
; AVX1-NEXT: testl $512, %eax ## imm = 0x200
; AVX1-NEXT: jne LBB24_19
; AVX1-NEXT: LBB24_20: ## %else26
; AVX1-NEXT: testl $1024, %eax ## imm = 0x400
; AVX1-NEXT: jne LBB24_21
; AVX1-NEXT: LBB24_22: ## %else29
; AVX1-NEXT: testl $2048, %eax ## imm = 0x800
; AVX1-NEXT: jne LBB24_23
; AVX1-NEXT: LBB24_24: ## %else32
; AVX1-NEXT: testl $4096, %eax ## imm = 0x1000
; AVX1-NEXT: jne LBB24_25
; AVX1-NEXT: LBB24_26: ## %else35
; AVX1-NEXT: testl $8192, %eax ## imm = 0x2000
; AVX1-NEXT: jne LBB24_27
; AVX1-NEXT: LBB24_28: ## %else38
; AVX1-NEXT: testl $16384, %eax ## imm = 0x4000
; AVX1-NEXT: jne LBB24_29
; AVX1-NEXT: LBB24_30: ## %else41
; AVX1-NEXT: testl $32768, %eax ## imm = 0x8000
; AVX1-NEXT: jne LBB24_31
; AVX1-NEXT: LBB24_32: ## %else44
; AVX1-NEXT: testl $65536, %eax ## imm = 0x10000
; AVX1-NEXT: jne LBB24_33
; AVX1-NEXT: LBB24_34: ## %else47
; AVX1-NEXT: testl $131072, %eax ## imm = 0x20000
; AVX1-NEXT: jne LBB24_35
; AVX1-NEXT: LBB24_36: ## %else50
; AVX1-NEXT: testl $262144, %eax ## imm = 0x40000
; AVX1-NEXT: jne LBB24_37
; AVX1-NEXT: LBB24_38: ## %else53
; AVX1-NEXT: testl $524288, %eax ## imm = 0x80000
; AVX1-NEXT: jne LBB24_39
; AVX1-NEXT: LBB24_40: ## %else56
; AVX1-NEXT: testl $1048576, %eax ## imm = 0x100000
; AVX1-NEXT: jne LBB24_41
; AVX1-NEXT: LBB24_42: ## %else59
; AVX1-NEXT: testl $2097152, %eax ## imm = 0x200000
; AVX1-NEXT: jne LBB24_43
; AVX1-NEXT: LBB24_44: ## %else62
; AVX1-NEXT: testl $4194304, %eax ## imm = 0x400000
; AVX1-NEXT: jne LBB24_45
; AVX1-NEXT: LBB24_46: ## %else65
; AVX1-NEXT: testl $8388608, %eax ## imm = 0x800000
; AVX1-NEXT: jne LBB24_47
; AVX1-NEXT: LBB24_48: ## %else68
; AVX1-NEXT: testl $16777216, %eax ## imm = 0x1000000
; AVX1-NEXT: jne LBB24_49
; AVX1-NEXT: LBB24_50: ## %else71
; AVX1-NEXT: testl $33554432, %eax ## imm = 0x2000000
; AVX1-NEXT: jne LBB24_51
; AVX1-NEXT: LBB24_52: ## %else74
; AVX1-NEXT: testl $67108864, %eax ## imm = 0x4000000
; AVX1-NEXT: jne LBB24_53
; AVX1-NEXT: LBB24_54: ## %else77
; AVX1-NEXT: testl $134217728, %eax ## imm = 0x8000000
; AVX1-NEXT: jne LBB24_55
; AVX1-NEXT: LBB24_56: ## %else80
; AVX1-NEXT: testl $268435456, %eax ## imm = 0x10000000
; AVX1-NEXT: jne LBB24_57
; AVX1-NEXT: LBB24_58: ## %else83
; AVX1-NEXT: testl $536870912, %eax ## imm = 0x20000000
; AVX1-NEXT: jne LBB24_59
; AVX1-NEXT: LBB24_60: ## %else86
; AVX1-NEXT: testl $1073741824, %eax ## imm = 0x40000000
; AVX1-NEXT: jne LBB24_61
; AVX1-NEXT: LBB24_62: ## %else89
; AVX1-NEXT: testl $-2147483648, %eax ## imm = 0x80000000
; AVX1-NEXT: jne LBB24_63
; AVX1-NEXT: LBB24_64: ## %else92
; AVX1-NEXT: vmovaps %ymm1, %ymm0
; AVX1-NEXT: retq
; AVX1-NEXT: LBB24_1: ## %cond.load
; AVX1-NEXT: vpinsrb $0, (%rdi), %xmm1, %xmm0
; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX1-NEXT: testb $2, %al
; AVX1-NEXT: je LBB24_4
; AVX1-NEXT: LBB24_3: ## %cond.load1
; AVX1-NEXT: vpinsrb $1, 1(%rdi), %xmm1, %xmm0
; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX1-NEXT: testb $4, %al
; AVX1-NEXT: je LBB24_6
; AVX1-NEXT: LBB24_5: ## %cond.load4
; AVX1-NEXT: vpinsrb $2, 2(%rdi), %xmm1, %xmm0
; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX1-NEXT: testb $8, %al
; AVX1-NEXT: je LBB24_8
; AVX1-NEXT: LBB24_7: ## %cond.load7
; AVX1-NEXT: vpinsrb $3, 3(%rdi), %xmm1, %xmm0
; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX1-NEXT: testb $16, %al
; AVX1-NEXT: je LBB24_10
; AVX1-NEXT: LBB24_9: ## %cond.load10
; AVX1-NEXT: vpinsrb $4, 4(%rdi), %xmm1, %xmm0
; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX1-NEXT: testb $32, %al
; AVX1-NEXT: je LBB24_12
; AVX1-NEXT: LBB24_11: ## %cond.load13
; AVX1-NEXT: vpinsrb $5, 5(%rdi), %xmm1, %xmm0
; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX1-NEXT: testb $64, %al
; AVX1-NEXT: je LBB24_14
; AVX1-NEXT: LBB24_13: ## %cond.load16
; AVX1-NEXT: vpinsrb $6, 6(%rdi), %xmm1, %xmm0
; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX1-NEXT: testb $-128, %al
; AVX1-NEXT: je LBB24_16
; AVX1-NEXT: LBB24_15: ## %cond.load19
; AVX1-NEXT: vpinsrb $7, 7(%rdi), %xmm1, %xmm0
; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX1-NEXT: testl $256, %eax ## imm = 0x100
; AVX1-NEXT: je LBB24_18
; AVX1-NEXT: LBB24_17: ## %cond.load22
; AVX1-NEXT: vpinsrb $8, 8(%rdi), %xmm1, %xmm0
; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX1-NEXT: testl $512, %eax ## imm = 0x200
; AVX1-NEXT: je LBB24_20
; AVX1-NEXT: LBB24_19: ## %cond.load25
; AVX1-NEXT: vpinsrb $9, 9(%rdi), %xmm1, %xmm0
; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX1-NEXT: testl $1024, %eax ## imm = 0x400
; AVX1-NEXT: je LBB24_22
; AVX1-NEXT: LBB24_21: ## %cond.load28
; AVX1-NEXT: vpinsrb $10, 10(%rdi), %xmm1, %xmm0
; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX1-NEXT: testl $2048, %eax ## imm = 0x800
; AVX1-NEXT: je LBB24_24
; AVX1-NEXT: LBB24_23: ## %cond.load31
; AVX1-NEXT: vpinsrb $11, 11(%rdi), %xmm1, %xmm0
; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX1-NEXT: testl $4096, %eax ## imm = 0x1000
; AVX1-NEXT: je LBB24_26
; AVX1-NEXT: LBB24_25: ## %cond.load34
; AVX1-NEXT: vpinsrb $12, 12(%rdi), %xmm1, %xmm0
; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX1-NEXT: testl $8192, %eax ## imm = 0x2000
; AVX1-NEXT: je LBB24_28
; AVX1-NEXT: LBB24_27: ## %cond.load37
; AVX1-NEXT: vpinsrb $13, 13(%rdi), %xmm1, %xmm0
; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX1-NEXT: testl $16384, %eax ## imm = 0x4000
; AVX1-NEXT: je LBB24_30
; AVX1-NEXT: LBB24_29: ## %cond.load40
; AVX1-NEXT: vpinsrb $14, 14(%rdi), %xmm1, %xmm0
; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX1-NEXT: testl $32768, %eax ## imm = 0x8000
; AVX1-NEXT: je LBB24_32
; AVX1-NEXT: LBB24_31: ## %cond.load43
; AVX1-NEXT: vpinsrb $15, 15(%rdi), %xmm1, %xmm0
; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX1-NEXT: testl $65536, %eax ## imm = 0x10000
; AVX1-NEXT: je LBB24_34
; AVX1-NEXT: LBB24_33: ## %cond.load46
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
; AVX1-NEXT: vpinsrb $0, 16(%rdi), %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
; AVX1-NEXT: testl $131072, %eax ## imm = 0x20000
; AVX1-NEXT: je LBB24_36
; AVX1-NEXT: LBB24_35: ## %cond.load49
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
; AVX1-NEXT: vpinsrb $1, 17(%rdi), %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
; AVX1-NEXT: testl $262144, %eax ## imm = 0x40000
; AVX1-NEXT: je LBB24_38
; AVX1-NEXT: LBB24_37: ## %cond.load52
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
; AVX1-NEXT: vpinsrb $2, 18(%rdi), %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
; AVX1-NEXT: testl $524288, %eax ## imm = 0x80000
; AVX1-NEXT: je LBB24_40
; AVX1-NEXT: LBB24_39: ## %cond.load55
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
; AVX1-NEXT: vpinsrb $3, 19(%rdi), %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
; AVX1-NEXT: testl $1048576, %eax ## imm = 0x100000
; AVX1-NEXT: je LBB24_42
; AVX1-NEXT: LBB24_41: ## %cond.load58
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
; AVX1-NEXT: vpinsrb $4, 20(%rdi), %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
; AVX1-NEXT: testl $2097152, %eax ## imm = 0x200000
; AVX1-NEXT: je LBB24_44
; AVX1-NEXT: LBB24_43: ## %cond.load61
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
; AVX1-NEXT: vpinsrb $5, 21(%rdi), %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
; AVX1-NEXT: testl $4194304, %eax ## imm = 0x400000
; AVX1-NEXT: je LBB24_46
; AVX1-NEXT: LBB24_45: ## %cond.load64
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
; AVX1-NEXT: vpinsrb $6, 22(%rdi), %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
; AVX1-NEXT: testl $8388608, %eax ## imm = 0x800000
; AVX1-NEXT: je LBB24_48
; AVX1-NEXT: LBB24_47: ## %cond.load67
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
; AVX1-NEXT: vpinsrb $7, 23(%rdi), %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
; AVX1-NEXT: testl $16777216, %eax ## imm = 0x1000000
; AVX1-NEXT: je LBB24_50
; AVX1-NEXT: LBB24_49: ## %cond.load70
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
; AVX1-NEXT: vpinsrb $8, 24(%rdi), %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
; AVX1-NEXT: testl $33554432, %eax ## imm = 0x2000000
; AVX1-NEXT: je LBB24_52
; AVX1-NEXT: LBB24_51: ## %cond.load73
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
; AVX1-NEXT: vpinsrb $9, 25(%rdi), %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
; AVX1-NEXT: testl $67108864, %eax ## imm = 0x4000000
; AVX1-NEXT: je LBB24_54
; AVX1-NEXT: LBB24_53: ## %cond.load76
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
; AVX1-NEXT: vpinsrb $10, 26(%rdi), %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
; AVX1-NEXT: testl $134217728, %eax ## imm = 0x8000000
; AVX1-NEXT: je LBB24_56
; AVX1-NEXT: LBB24_55: ## %cond.load79
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
; AVX1-NEXT: vpinsrb $11, 27(%rdi), %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
; AVX1-NEXT: testl $268435456, %eax ## imm = 0x10000000
; AVX1-NEXT: je LBB24_58
; AVX1-NEXT: LBB24_57: ## %cond.load82
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
; AVX1-NEXT: vpinsrb $12, 28(%rdi), %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
; AVX1-NEXT: testl $536870912, %eax ## imm = 0x20000000
; AVX1-NEXT: je LBB24_60
; AVX1-NEXT: LBB24_59: ## %cond.load85
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
; AVX1-NEXT: vpinsrb $13, 29(%rdi), %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
; AVX1-NEXT: testl $1073741824, %eax ## imm = 0x40000000
; AVX1-NEXT: je LBB24_62
; AVX1-NEXT: LBB24_61: ## %cond.load88
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
; AVX1-NEXT: vpinsrb $14, 30(%rdi), %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
; AVX1-NEXT: testl $-2147483648, %eax ## imm = 0x80000000
; AVX1-NEXT: je LBB24_64
; AVX1-NEXT: LBB24_63: ## %cond.load91
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
; AVX1-NEXT: vpinsrb $15, 31(%rdi), %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
; AVX1-NEXT: vmovaps %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_v32i8_v32i8:
; AVX2: ## %bb.0:
; AVX2-NEXT: vpmovmskb %ymm0, %eax
; AVX2-NEXT: testb $1, %al
; AVX2-NEXT: jne LBB24_1
; AVX2-NEXT: ## %bb.2: ## %else
; AVX2-NEXT: testb $2, %al
; AVX2-NEXT: jne LBB24_3
; AVX2-NEXT: LBB24_4: ## %else2
; AVX2-NEXT: testb $4, %al
; AVX2-NEXT: jne LBB24_5
; AVX2-NEXT: LBB24_6: ## %else5
; AVX2-NEXT: testb $8, %al
; AVX2-NEXT: jne LBB24_7
; AVX2-NEXT: LBB24_8: ## %else8
; AVX2-NEXT: testb $16, %al
; AVX2-NEXT: jne LBB24_9
; AVX2-NEXT: LBB24_10: ## %else11
; AVX2-NEXT: testb $32, %al
; AVX2-NEXT: jne LBB24_11
; AVX2-NEXT: LBB24_12: ## %else14
; AVX2-NEXT: testb $64, %al
; AVX2-NEXT: jne LBB24_13
; AVX2-NEXT: LBB24_14: ## %else17
; AVX2-NEXT: testb $-128, %al
; AVX2-NEXT: jne LBB24_15
; AVX2-NEXT: LBB24_16: ## %else20
; AVX2-NEXT: testl $256, %eax ## imm = 0x100
; AVX2-NEXT: jne LBB24_17
; AVX2-NEXT: LBB24_18: ## %else23
; AVX2-NEXT: testl $512, %eax ## imm = 0x200
; AVX2-NEXT: jne LBB24_19
; AVX2-NEXT: LBB24_20: ## %else26
; AVX2-NEXT: testl $1024, %eax ## imm = 0x400
; AVX2-NEXT: jne LBB24_21
; AVX2-NEXT: LBB24_22: ## %else29
; AVX2-NEXT: testl $2048, %eax ## imm = 0x800
; AVX2-NEXT: jne LBB24_23
; AVX2-NEXT: LBB24_24: ## %else32
; AVX2-NEXT: testl $4096, %eax ## imm = 0x1000
; AVX2-NEXT: jne LBB24_25
; AVX2-NEXT: LBB24_26: ## %else35
; AVX2-NEXT: testl $8192, %eax ## imm = 0x2000
; AVX2-NEXT: jne LBB24_27
; AVX2-NEXT: LBB24_28: ## %else38
; AVX2-NEXT: testl $16384, %eax ## imm = 0x4000
; AVX2-NEXT: jne LBB24_29
; AVX2-NEXT: LBB24_30: ## %else41
; AVX2-NEXT: testl $32768, %eax ## imm = 0x8000
; AVX2-NEXT: jne LBB24_31
; AVX2-NEXT: LBB24_32: ## %else44
; AVX2-NEXT: testl $65536, %eax ## imm = 0x10000
; AVX2-NEXT: jne LBB24_33
; AVX2-NEXT: LBB24_34: ## %else47
; AVX2-NEXT: testl $131072, %eax ## imm = 0x20000
; AVX2-NEXT: jne LBB24_35
; AVX2-NEXT: LBB24_36: ## %else50
; AVX2-NEXT: testl $262144, %eax ## imm = 0x40000
; AVX2-NEXT: jne LBB24_37
; AVX2-NEXT: LBB24_38: ## %else53
; AVX2-NEXT: testl $524288, %eax ## imm = 0x80000
; AVX2-NEXT: jne LBB24_39
; AVX2-NEXT: LBB24_40: ## %else56
; AVX2-NEXT: testl $1048576, %eax ## imm = 0x100000
; AVX2-NEXT: jne LBB24_41
; AVX2-NEXT: LBB24_42: ## %else59
; AVX2-NEXT: testl $2097152, %eax ## imm = 0x200000
; AVX2-NEXT: jne LBB24_43
; AVX2-NEXT: LBB24_44: ## %else62
; AVX2-NEXT: testl $4194304, %eax ## imm = 0x400000
; AVX2-NEXT: jne LBB24_45
; AVX2-NEXT: LBB24_46: ## %else65
; AVX2-NEXT: testl $8388608, %eax ## imm = 0x800000
; AVX2-NEXT: jne LBB24_47
; AVX2-NEXT: LBB24_48: ## %else68
; AVX2-NEXT: testl $16777216, %eax ## imm = 0x1000000
; AVX2-NEXT: jne LBB24_49
; AVX2-NEXT: LBB24_50: ## %else71
; AVX2-NEXT: testl $33554432, %eax ## imm = 0x2000000
; AVX2-NEXT: jne LBB24_51
; AVX2-NEXT: LBB24_52: ## %else74
; AVX2-NEXT: testl $67108864, %eax ## imm = 0x4000000
; AVX2-NEXT: jne LBB24_53
; AVX2-NEXT: LBB24_54: ## %else77
; AVX2-NEXT: testl $134217728, %eax ## imm = 0x8000000
; AVX2-NEXT: jne LBB24_55
; AVX2-NEXT: LBB24_56: ## %else80
; AVX2-NEXT: testl $268435456, %eax ## imm = 0x10000000
; AVX2-NEXT: jne LBB24_57
; AVX2-NEXT: LBB24_58: ## %else83
; AVX2-NEXT: testl $536870912, %eax ## imm = 0x20000000
; AVX2-NEXT: jne LBB24_59
; AVX2-NEXT: LBB24_60: ## %else86
; AVX2-NEXT: testl $1073741824, %eax ## imm = 0x40000000
; AVX2-NEXT: jne LBB24_61
; AVX2-NEXT: LBB24_62: ## %else89
; AVX2-NEXT: testl $-2147483648, %eax ## imm = 0x80000000
; AVX2-NEXT: jne LBB24_63
; AVX2-NEXT: LBB24_64: ## %else92
; AVX2-NEXT: vmovdqa %ymm1, %ymm0
; AVX2-NEXT: retq
; AVX2-NEXT: LBB24_1: ## %cond.load
; AVX2-NEXT: vpinsrb $0, (%rdi), %xmm1, %xmm0
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: testb $2, %al
; AVX2-NEXT: je LBB24_4
; AVX2-NEXT: LBB24_3: ## %cond.load1
; AVX2-NEXT: vpinsrb $1, 1(%rdi), %xmm1, %xmm0
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: testb $4, %al
; AVX2-NEXT: je LBB24_6
; AVX2-NEXT: LBB24_5: ## %cond.load4
; AVX2-NEXT: vpinsrb $2, 2(%rdi), %xmm1, %xmm0
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: testb $8, %al
; AVX2-NEXT: je LBB24_8
; AVX2-NEXT: LBB24_7: ## %cond.load7
; AVX2-NEXT: vpinsrb $3, 3(%rdi), %xmm1, %xmm0
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: testb $16, %al
; AVX2-NEXT: je LBB24_10
; AVX2-NEXT: LBB24_9: ## %cond.load10
; AVX2-NEXT: vpinsrb $4, 4(%rdi), %xmm1, %xmm0
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: testb $32, %al
; AVX2-NEXT: je LBB24_12
; AVX2-NEXT: LBB24_11: ## %cond.load13
; AVX2-NEXT: vpinsrb $5, 5(%rdi), %xmm1, %xmm0
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: testb $64, %al
; AVX2-NEXT: je LBB24_14
; AVX2-NEXT: LBB24_13: ## %cond.load16
; AVX2-NEXT: vpinsrb $6, 6(%rdi), %xmm1, %xmm0
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: testb $-128, %al
; AVX2-NEXT: je LBB24_16
; AVX2-NEXT: LBB24_15: ## %cond.load19
; AVX2-NEXT: vpinsrb $7, 7(%rdi), %xmm1, %xmm0
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: testl $256, %eax ## imm = 0x100
; AVX2-NEXT: je LBB24_18
; AVX2-NEXT: LBB24_17: ## %cond.load22
; AVX2-NEXT: vpinsrb $8, 8(%rdi), %xmm1, %xmm0
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: testl $512, %eax ## imm = 0x200
; AVX2-NEXT: je LBB24_20
; AVX2-NEXT: LBB24_19: ## %cond.load25
; AVX2-NEXT: vpinsrb $9, 9(%rdi), %xmm1, %xmm0
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: testl $1024, %eax ## imm = 0x400
; AVX2-NEXT: je LBB24_22
; AVX2-NEXT: LBB24_21: ## %cond.load28
; AVX2-NEXT: vpinsrb $10, 10(%rdi), %xmm1, %xmm0
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: testl $2048, %eax ## imm = 0x800
; AVX2-NEXT: je LBB24_24
; AVX2-NEXT: LBB24_23: ## %cond.load31
; AVX2-NEXT: vpinsrb $11, 11(%rdi), %xmm1, %xmm0
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: testl $4096, %eax ## imm = 0x1000
; AVX2-NEXT: je LBB24_26
; AVX2-NEXT: LBB24_25: ## %cond.load34
; AVX2-NEXT: vpinsrb $12, 12(%rdi), %xmm1, %xmm0
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: testl $8192, %eax ## imm = 0x2000
; AVX2-NEXT: je LBB24_28
; AVX2-NEXT: LBB24_27: ## %cond.load37
; AVX2-NEXT: vpinsrb $13, 13(%rdi), %xmm1, %xmm0
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: testl $16384, %eax ## imm = 0x4000
; AVX2-NEXT: je LBB24_30
; AVX2-NEXT: LBB24_29: ## %cond.load40
; AVX2-NEXT: vpinsrb $14, 14(%rdi), %xmm1, %xmm0
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: testl $32768, %eax ## imm = 0x8000
; AVX2-NEXT: je LBB24_32
; AVX2-NEXT: LBB24_31: ## %cond.load43
; AVX2-NEXT: vpinsrb $15, 15(%rdi), %xmm1, %xmm0
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: testl $65536, %eax ## imm = 0x10000
; AVX2-NEXT: je LBB24_34
; AVX2-NEXT: LBB24_33: ## %cond.load46
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
; AVX2-NEXT: vpinsrb $0, 16(%rdi), %xmm0, %xmm0
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
; AVX2-NEXT: testl $131072, %eax ## imm = 0x20000
; AVX2-NEXT: je LBB24_36
; AVX2-NEXT: LBB24_35: ## %cond.load49
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
; AVX2-NEXT: vpinsrb $1, 17(%rdi), %xmm0, %xmm0
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
; AVX2-NEXT: testl $262144, %eax ## imm = 0x40000
; AVX2-NEXT: je LBB24_38
; AVX2-NEXT: LBB24_37: ## %cond.load52
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
; AVX2-NEXT: vpinsrb $2, 18(%rdi), %xmm0, %xmm0
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
; AVX2-NEXT: testl $524288, %eax ## imm = 0x80000
; AVX2-NEXT: je LBB24_40
; AVX2-NEXT: LBB24_39: ## %cond.load55
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
; AVX2-NEXT: vpinsrb $3, 19(%rdi), %xmm0, %xmm0
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
; AVX2-NEXT: testl $1048576, %eax ## imm = 0x100000
; AVX2-NEXT: je LBB24_42
; AVX2-NEXT: LBB24_41: ## %cond.load58
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
; AVX2-NEXT: vpinsrb $4, 20(%rdi), %xmm0, %xmm0
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
; AVX2-NEXT: testl $2097152, %eax ## imm = 0x200000
; AVX2-NEXT: je LBB24_44
; AVX2-NEXT: LBB24_43: ## %cond.load61
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
; AVX2-NEXT: vpinsrb $5, 21(%rdi), %xmm0, %xmm0
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
; AVX2-NEXT: testl $4194304, %eax ## imm = 0x400000
; AVX2-NEXT: je LBB24_46
; AVX2-NEXT: LBB24_45: ## %cond.load64
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
; AVX2-NEXT: vpinsrb $6, 22(%rdi), %xmm0, %xmm0
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
; AVX2-NEXT: testl $8388608, %eax ## imm = 0x800000
; AVX2-NEXT: je LBB24_48
; AVX2-NEXT: LBB24_47: ## %cond.load67
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
; AVX2-NEXT: vpinsrb $7, 23(%rdi), %xmm0, %xmm0
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
; AVX2-NEXT: testl $16777216, %eax ## imm = 0x1000000
; AVX2-NEXT: je LBB24_50
; AVX2-NEXT: LBB24_49: ## %cond.load70
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
; AVX2-NEXT: vpinsrb $8, 24(%rdi), %xmm0, %xmm0
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
; AVX2-NEXT: testl $33554432, %eax ## imm = 0x2000000
; AVX2-NEXT: je LBB24_52
; AVX2-NEXT: LBB24_51: ## %cond.load73
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
; AVX2-NEXT: vpinsrb $9, 25(%rdi), %xmm0, %xmm0
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
; AVX2-NEXT: testl $67108864, %eax ## imm = 0x4000000
; AVX2-NEXT: je LBB24_54
; AVX2-NEXT: LBB24_53: ## %cond.load76
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
; AVX2-NEXT: vpinsrb $10, 26(%rdi), %xmm0, %xmm0
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
; AVX2-NEXT: testl $134217728, %eax ## imm = 0x8000000
; AVX2-NEXT: je LBB24_56
; AVX2-NEXT: LBB24_55: ## %cond.load79
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
; AVX2-NEXT: vpinsrb $11, 27(%rdi), %xmm0, %xmm0
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
; AVX2-NEXT: testl $268435456, %eax ## imm = 0x10000000
; AVX2-NEXT: je LBB24_58
; AVX2-NEXT: LBB24_57: ## %cond.load82
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
; AVX2-NEXT: vpinsrb $12, 28(%rdi), %xmm0, %xmm0
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
; AVX2-NEXT: testl $536870912, %eax ## imm = 0x20000000
; AVX2-NEXT: je LBB24_60
; AVX2-NEXT: LBB24_59: ## %cond.load85
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
; AVX2-NEXT: vpinsrb $13, 29(%rdi), %xmm0, %xmm0
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
; AVX2-NEXT: testl $1073741824, %eax ## imm = 0x40000000
; AVX2-NEXT: je LBB24_62
; AVX2-NEXT: LBB24_61: ## %cond.load88
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
; AVX2-NEXT: vpinsrb $14, 30(%rdi), %xmm0, %xmm0
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
; AVX2-NEXT: testl $-2147483648, %eax ## imm = 0x80000000
; AVX2-NEXT: je LBB24_64
; AVX2-NEXT: LBB24_63: ## %cond.load91
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
; AVX2-NEXT: vpinsrb $15, 31(%rdi), %xmm0, %xmm0
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
; AVX2-NEXT: vmovdqa %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: load_v32i8_v32i8:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: vpmovmskb %ymm0, %eax
; AVX512F-NEXT: testb $1, %al
; AVX512F-NEXT: jne LBB24_1
; AVX512F-NEXT: ## %bb.2: ## %else
; AVX512F-NEXT: testb $2, %al
; AVX512F-NEXT: jne LBB24_3
; AVX512F-NEXT: LBB24_4: ## %else2
; AVX512F-NEXT: testb $4, %al
; AVX512F-NEXT: jne LBB24_5
; AVX512F-NEXT: LBB24_6: ## %else5
; AVX512F-NEXT: testb $8, %al
; AVX512F-NEXT: jne LBB24_7
; AVX512F-NEXT: LBB24_8: ## %else8
; AVX512F-NEXT: testb $16, %al
; AVX512F-NEXT: jne LBB24_9
; AVX512F-NEXT: LBB24_10: ## %else11
; AVX512F-NEXT: testb $32, %al
; AVX512F-NEXT: jne LBB24_11
; AVX512F-NEXT: LBB24_12: ## %else14
; AVX512F-NEXT: testb $64, %al
; AVX512F-NEXT: jne LBB24_13
; AVX512F-NEXT: LBB24_14: ## %else17
; AVX512F-NEXT: testb $-128, %al
; AVX512F-NEXT: jne LBB24_15
; AVX512F-NEXT: LBB24_16: ## %else20
; AVX512F-NEXT: testl $256, %eax ## imm = 0x100
; AVX512F-NEXT: jne LBB24_17
; AVX512F-NEXT: LBB24_18: ## %else23
; AVX512F-NEXT: testl $512, %eax ## imm = 0x200
; AVX512F-NEXT: jne LBB24_19
; AVX512F-NEXT: LBB24_20: ## %else26
; AVX512F-NEXT: testl $1024, %eax ## imm = 0x400
; AVX512F-NEXT: jne LBB24_21
; AVX512F-NEXT: LBB24_22: ## %else29
; AVX512F-NEXT: testl $2048, %eax ## imm = 0x800
; AVX512F-NEXT: jne LBB24_23
; AVX512F-NEXT: LBB24_24: ## %else32
; AVX512F-NEXT: testl $4096, %eax ## imm = 0x1000
; AVX512F-NEXT: jne LBB24_25
; AVX512F-NEXT: LBB24_26: ## %else35
; AVX512F-NEXT: testl $8192, %eax ## imm = 0x2000
; AVX512F-NEXT: jne LBB24_27
; AVX512F-NEXT: LBB24_28: ## %else38
; AVX512F-NEXT: testl $16384, %eax ## imm = 0x4000
; AVX512F-NEXT: jne LBB24_29
; AVX512F-NEXT: LBB24_30: ## %else41
; AVX512F-NEXT: testl $32768, %eax ## imm = 0x8000
; AVX512F-NEXT: jne LBB24_31
; AVX512F-NEXT: LBB24_32: ## %else44
; AVX512F-NEXT: testl $65536, %eax ## imm = 0x10000
; AVX512F-NEXT: jne LBB24_33
; AVX512F-NEXT: LBB24_34: ## %else47
; AVX512F-NEXT: testl $131072, %eax ## imm = 0x20000
; AVX512F-NEXT: jne LBB24_35
; AVX512F-NEXT: LBB24_36: ## %else50
; AVX512F-NEXT: testl $262144, %eax ## imm = 0x40000
; AVX512F-NEXT: jne LBB24_37
; AVX512F-NEXT: LBB24_38: ## %else53
; AVX512F-NEXT: testl $524288, %eax ## imm = 0x80000
; AVX512F-NEXT: jne LBB24_39
; AVX512F-NEXT: LBB24_40: ## %else56
; AVX512F-NEXT: testl $1048576, %eax ## imm = 0x100000
; AVX512F-NEXT: jne LBB24_41
; AVX512F-NEXT: LBB24_42: ## %else59
; AVX512F-NEXT: testl $2097152, %eax ## imm = 0x200000
; AVX512F-NEXT: jne LBB24_43
; AVX512F-NEXT: LBB24_44: ## %else62
; AVX512F-NEXT: testl $4194304, %eax ## imm = 0x400000
; AVX512F-NEXT: jne LBB24_45
; AVX512F-NEXT: LBB24_46: ## %else65
; AVX512F-NEXT: testl $8388608, %eax ## imm = 0x800000
; AVX512F-NEXT: jne LBB24_47
; AVX512F-NEXT: LBB24_48: ## %else68
; AVX512F-NEXT: testl $16777216, %eax ## imm = 0x1000000
; AVX512F-NEXT: jne LBB24_49
; AVX512F-NEXT: LBB24_50: ## %else71
; AVX512F-NEXT: testl $33554432, %eax ## imm = 0x2000000
; AVX512F-NEXT: jne LBB24_51
; AVX512F-NEXT: LBB24_52: ## %else74
; AVX512F-NEXT: testl $67108864, %eax ## imm = 0x4000000
; AVX512F-NEXT: jne LBB24_53
; AVX512F-NEXT: LBB24_54: ## %else77
; AVX512F-NEXT: testl $134217728, %eax ## imm = 0x8000000
; AVX512F-NEXT: jne LBB24_55
; AVX512F-NEXT: LBB24_56: ## %else80
; AVX512F-NEXT: testl $268435456, %eax ## imm = 0x10000000
; AVX512F-NEXT: jne LBB24_57
; AVX512F-NEXT: LBB24_58: ## %else83
; AVX512F-NEXT: testl $536870912, %eax ## imm = 0x20000000
; AVX512F-NEXT: jne LBB24_59
; AVX512F-NEXT: LBB24_60: ## %else86
; AVX512F-NEXT: testl $1073741824, %eax ## imm = 0x40000000
; AVX512F-NEXT: jne LBB24_61
; AVX512F-NEXT: LBB24_62: ## %else89
; AVX512F-NEXT: testl $-2147483648, %eax ## imm = 0x80000000
; AVX512F-NEXT: jne LBB24_63
; AVX512F-NEXT: LBB24_64: ## %else92
; AVX512F-NEXT: vmovdqa %ymm1, %ymm0
; AVX512F-NEXT: retq
; AVX512F-NEXT: LBB24_1: ## %cond.load
; AVX512F-NEXT: vpinsrb $0, (%rdi), %xmm1, %xmm0
; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512F-NEXT: testb $2, %al
; AVX512F-NEXT: je LBB24_4
; AVX512F-NEXT: LBB24_3: ## %cond.load1
; AVX512F-NEXT: vpinsrb $1, 1(%rdi), %xmm1, %xmm0
; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512F-NEXT: testb $4, %al
; AVX512F-NEXT: je LBB24_6
; AVX512F-NEXT: LBB24_5: ## %cond.load4
; AVX512F-NEXT: vpinsrb $2, 2(%rdi), %xmm1, %xmm0
; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512F-NEXT: testb $8, %al
; AVX512F-NEXT: je LBB24_8
; AVX512F-NEXT: LBB24_7: ## %cond.load7
; AVX512F-NEXT: vpinsrb $3, 3(%rdi), %xmm1, %xmm0
; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512F-NEXT: testb $16, %al
; AVX512F-NEXT: je LBB24_10
; AVX512F-NEXT: LBB24_9: ## %cond.load10
; AVX512F-NEXT: vpinsrb $4, 4(%rdi), %xmm1, %xmm0
; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512F-NEXT: testb $32, %al
; AVX512F-NEXT: je LBB24_12
; AVX512F-NEXT: LBB24_11: ## %cond.load13
; AVX512F-NEXT: vpinsrb $5, 5(%rdi), %xmm1, %xmm0
; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512F-NEXT: testb $64, %al
; AVX512F-NEXT: je LBB24_14
; AVX512F-NEXT: LBB24_13: ## %cond.load16
; AVX512F-NEXT: vpinsrb $6, 6(%rdi), %xmm1, %xmm0
; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512F-NEXT: testb $-128, %al
; AVX512F-NEXT: je LBB24_16
; AVX512F-NEXT: LBB24_15: ## %cond.load19
; AVX512F-NEXT: vpinsrb $7, 7(%rdi), %xmm1, %xmm0
; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512F-NEXT: testl $256, %eax ## imm = 0x100
; AVX512F-NEXT: je LBB24_18
; AVX512F-NEXT: LBB24_17: ## %cond.load22
; AVX512F-NEXT: vpinsrb $8, 8(%rdi), %xmm1, %xmm0
; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512F-NEXT: testl $512, %eax ## imm = 0x200
; AVX512F-NEXT: je LBB24_20
; AVX512F-NEXT: LBB24_19: ## %cond.load25
; AVX512F-NEXT: vpinsrb $9, 9(%rdi), %xmm1, %xmm0
; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512F-NEXT: testl $1024, %eax ## imm = 0x400
; AVX512F-NEXT: je LBB24_22
; AVX512F-NEXT: LBB24_21: ## %cond.load28
; AVX512F-NEXT: vpinsrb $10, 10(%rdi), %xmm1, %xmm0
; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512F-NEXT: testl $2048, %eax ## imm = 0x800
; AVX512F-NEXT: je LBB24_24
; AVX512F-NEXT: LBB24_23: ## %cond.load31
; AVX512F-NEXT: vpinsrb $11, 11(%rdi), %xmm1, %xmm0
; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512F-NEXT: testl $4096, %eax ## imm = 0x1000
; AVX512F-NEXT: je LBB24_26
; AVX512F-NEXT: LBB24_25: ## %cond.load34
; AVX512F-NEXT: vpinsrb $12, 12(%rdi), %xmm1, %xmm0
; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512F-NEXT: testl $8192, %eax ## imm = 0x2000
; AVX512F-NEXT: je LBB24_28
; AVX512F-NEXT: LBB24_27: ## %cond.load37
; AVX512F-NEXT: vpinsrb $13, 13(%rdi), %xmm1, %xmm0
; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512F-NEXT: testl $16384, %eax ## imm = 0x4000
; AVX512F-NEXT: je LBB24_30
; AVX512F-NEXT: LBB24_29: ## %cond.load40
; AVX512F-NEXT: vpinsrb $14, 14(%rdi), %xmm1, %xmm0
; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512F-NEXT: testl $32768, %eax ## imm = 0x8000
; AVX512F-NEXT: je LBB24_32
; AVX512F-NEXT: LBB24_31: ## %cond.load43
; AVX512F-NEXT: vpinsrb $15, 15(%rdi), %xmm1, %xmm0
; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512F-NEXT: testl $65536, %eax ## imm = 0x10000
; AVX512F-NEXT: je LBB24_34
; AVX512F-NEXT: LBB24_33: ## %cond.load46
; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
; AVX512F-NEXT: vpinsrb $0, 16(%rdi), %xmm0, %xmm0
; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
; AVX512F-NEXT: testl $131072, %eax ## imm = 0x20000
; AVX512F-NEXT: je LBB24_36
; AVX512F-NEXT: LBB24_35: ## %cond.load49
; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
; AVX512F-NEXT: vpinsrb $1, 17(%rdi), %xmm0, %xmm0
; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
; AVX512F-NEXT: testl $262144, %eax ## imm = 0x40000
; AVX512F-NEXT: je LBB24_38
; AVX512F-NEXT: LBB24_37: ## %cond.load52
; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
; AVX512F-NEXT: vpinsrb $2, 18(%rdi), %xmm0, %xmm0
; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
; AVX512F-NEXT: testl $524288, %eax ## imm = 0x80000
; AVX512F-NEXT: je LBB24_40
; AVX512F-NEXT: LBB24_39: ## %cond.load55
; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
; AVX512F-NEXT: vpinsrb $3, 19(%rdi), %xmm0, %xmm0
; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
; AVX512F-NEXT: testl $1048576, %eax ## imm = 0x100000
; AVX512F-NEXT: je LBB24_42
; AVX512F-NEXT: LBB24_41: ## %cond.load58
; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
; AVX512F-NEXT: vpinsrb $4, 20(%rdi), %xmm0, %xmm0
; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
; AVX512F-NEXT: testl $2097152, %eax ## imm = 0x200000
; AVX512F-NEXT: je LBB24_44
; AVX512F-NEXT: LBB24_43: ## %cond.load61
; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
; AVX512F-NEXT: vpinsrb $5, 21(%rdi), %xmm0, %xmm0
; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
; AVX512F-NEXT: testl $4194304, %eax ## imm = 0x400000
; AVX512F-NEXT: je LBB24_46
; AVX512F-NEXT: LBB24_45: ## %cond.load64
; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
; AVX512F-NEXT: vpinsrb $6, 22(%rdi), %xmm0, %xmm0
; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
; AVX512F-NEXT: testl $8388608, %eax ## imm = 0x800000
; AVX512F-NEXT: je LBB24_48
; AVX512F-NEXT: LBB24_47: ## %cond.load67
; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
; AVX512F-NEXT: vpinsrb $7, 23(%rdi), %xmm0, %xmm0
; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
; AVX512F-NEXT: testl $16777216, %eax ## imm = 0x1000000
; AVX512F-NEXT: je LBB24_50
; AVX512F-NEXT: LBB24_49: ## %cond.load70
; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
; AVX512F-NEXT: vpinsrb $8, 24(%rdi), %xmm0, %xmm0
; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
; AVX512F-NEXT: testl $33554432, %eax ## imm = 0x2000000
; AVX512F-NEXT: je LBB24_52
; AVX512F-NEXT: LBB24_51: ## %cond.load73
; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
; AVX512F-NEXT: vpinsrb $9, 25(%rdi), %xmm0, %xmm0
; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
; AVX512F-NEXT: testl $67108864, %eax ## imm = 0x4000000
; AVX512F-NEXT: je LBB24_54
; AVX512F-NEXT: LBB24_53: ## %cond.load76
; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
; AVX512F-NEXT: vpinsrb $10, 26(%rdi), %xmm0, %xmm0
; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
; AVX512F-NEXT: testl $134217728, %eax ## imm = 0x8000000
; AVX512F-NEXT: je LBB24_56
; AVX512F-NEXT: LBB24_55: ## %cond.load79
; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
; AVX512F-NEXT: vpinsrb $11, 27(%rdi), %xmm0, %xmm0
; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
; AVX512F-NEXT: testl $268435456, %eax ## imm = 0x10000000
; AVX512F-NEXT: je LBB24_58
; AVX512F-NEXT: LBB24_57: ## %cond.load82
; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
; AVX512F-NEXT: vpinsrb $12, 28(%rdi), %xmm0, %xmm0
; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
; AVX512F-NEXT: testl $536870912, %eax ## imm = 0x20000000
; AVX512F-NEXT: je LBB24_60
; AVX512F-NEXT: LBB24_59: ## %cond.load85
; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
; AVX512F-NEXT: vpinsrb $13, 29(%rdi), %xmm0, %xmm0
; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
; AVX512F-NEXT: testl $1073741824, %eax ## imm = 0x40000000
; AVX512F-NEXT: je LBB24_62
; AVX512F-NEXT: LBB24_61: ## %cond.load88
; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
; AVX512F-NEXT: vpinsrb $14, 30(%rdi), %xmm0, %xmm0
; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
; AVX512F-NEXT: testl $-2147483648, %eax ## imm = 0x80000000
; AVX512F-NEXT: je LBB24_64
; AVX512F-NEXT: LBB24_63: ## %cond.load91
; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
; AVX512F-NEXT: vpinsrb $15, 31(%rdi), %xmm0, %xmm0
; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
; AVX512F-NEXT: vmovdqa %ymm1, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VLDQ-LABEL: load_v32i8_v32i8:
; AVX512VLDQ: ## %bb.0:
; AVX512VLDQ-NEXT: vpmovmskb %ymm0, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: jne LBB24_1
; AVX512VLDQ-NEXT: ## %bb.2: ## %else
; AVX512VLDQ-NEXT: testb $2, %al
; AVX512VLDQ-NEXT: jne LBB24_3
; AVX512VLDQ-NEXT: LBB24_4: ## %else2
; AVX512VLDQ-NEXT: testb $4, %al
; AVX512VLDQ-NEXT: jne LBB24_5
; AVX512VLDQ-NEXT: LBB24_6: ## %else5
; AVX512VLDQ-NEXT: testb $8, %al
; AVX512VLDQ-NEXT: jne LBB24_7
; AVX512VLDQ-NEXT: LBB24_8: ## %else8
; AVX512VLDQ-NEXT: testb $16, %al
; AVX512VLDQ-NEXT: jne LBB24_9
; AVX512VLDQ-NEXT: LBB24_10: ## %else11
; AVX512VLDQ-NEXT: testb $32, %al
; AVX512VLDQ-NEXT: jne LBB24_11
; AVX512VLDQ-NEXT: LBB24_12: ## %else14
; AVX512VLDQ-NEXT: testb $64, %al
; AVX512VLDQ-NEXT: jne LBB24_13
; AVX512VLDQ-NEXT: LBB24_14: ## %else17
; AVX512VLDQ-NEXT: testb $-128, %al
; AVX512VLDQ-NEXT: jne LBB24_15
; AVX512VLDQ-NEXT: LBB24_16: ## %else20
; AVX512VLDQ-NEXT: testl $256, %eax ## imm = 0x100
; AVX512VLDQ-NEXT: jne LBB24_17
; AVX512VLDQ-NEXT: LBB24_18: ## %else23
; AVX512VLDQ-NEXT: testl $512, %eax ## imm = 0x200
; AVX512VLDQ-NEXT: jne LBB24_19
; AVX512VLDQ-NEXT: LBB24_20: ## %else26
; AVX512VLDQ-NEXT: testl $1024, %eax ## imm = 0x400
; AVX512VLDQ-NEXT: jne LBB24_21
; AVX512VLDQ-NEXT: LBB24_22: ## %else29
; AVX512VLDQ-NEXT: testl $2048, %eax ## imm = 0x800
; AVX512VLDQ-NEXT: jne LBB24_23
; AVX512VLDQ-NEXT: LBB24_24: ## %else32
; AVX512VLDQ-NEXT: testl $4096, %eax ## imm = 0x1000
; AVX512VLDQ-NEXT: jne LBB24_25
; AVX512VLDQ-NEXT: LBB24_26: ## %else35
; AVX512VLDQ-NEXT: testl $8192, %eax ## imm = 0x2000
; AVX512VLDQ-NEXT: jne LBB24_27
; AVX512VLDQ-NEXT: LBB24_28: ## %else38
; AVX512VLDQ-NEXT: testl $16384, %eax ## imm = 0x4000
; AVX512VLDQ-NEXT: jne LBB24_29
; AVX512VLDQ-NEXT: LBB24_30: ## %else41
; AVX512VLDQ-NEXT: testl $32768, %eax ## imm = 0x8000
; AVX512VLDQ-NEXT: jne LBB24_31
; AVX512VLDQ-NEXT: LBB24_32: ## %else44
; AVX512VLDQ-NEXT: testl $65536, %eax ## imm = 0x10000
; AVX512VLDQ-NEXT: jne LBB24_33
; AVX512VLDQ-NEXT: LBB24_34: ## %else47
; AVX512VLDQ-NEXT: testl $131072, %eax ## imm = 0x20000
; AVX512VLDQ-NEXT: jne LBB24_35
; AVX512VLDQ-NEXT: LBB24_36: ## %else50
; AVX512VLDQ-NEXT: testl $262144, %eax ## imm = 0x40000
; AVX512VLDQ-NEXT: jne LBB24_37
; AVX512VLDQ-NEXT: LBB24_38: ## %else53
; AVX512VLDQ-NEXT: testl $524288, %eax ## imm = 0x80000
; AVX512VLDQ-NEXT: jne LBB24_39
; AVX512VLDQ-NEXT: LBB24_40: ## %else56
; AVX512VLDQ-NEXT: testl $1048576, %eax ## imm = 0x100000
; AVX512VLDQ-NEXT: jne LBB24_41
; AVX512VLDQ-NEXT: LBB24_42: ## %else59
; AVX512VLDQ-NEXT: testl $2097152, %eax ## imm = 0x200000
; AVX512VLDQ-NEXT: jne LBB24_43
; AVX512VLDQ-NEXT: LBB24_44: ## %else62
; AVX512VLDQ-NEXT: testl $4194304, %eax ## imm = 0x400000
; AVX512VLDQ-NEXT: jne LBB24_45
; AVX512VLDQ-NEXT: LBB24_46: ## %else65
; AVX512VLDQ-NEXT: testl $8388608, %eax ## imm = 0x800000
; AVX512VLDQ-NEXT: jne LBB24_47
; AVX512VLDQ-NEXT: LBB24_48: ## %else68
; AVX512VLDQ-NEXT: testl $16777216, %eax ## imm = 0x1000000
; AVX512VLDQ-NEXT: jne LBB24_49
; AVX512VLDQ-NEXT: LBB24_50: ## %else71
; AVX512VLDQ-NEXT: testl $33554432, %eax ## imm = 0x2000000
; AVX512VLDQ-NEXT: jne LBB24_51
; AVX512VLDQ-NEXT: LBB24_52: ## %else74
; AVX512VLDQ-NEXT: testl $67108864, %eax ## imm = 0x4000000
; AVX512VLDQ-NEXT: jne LBB24_53
; AVX512VLDQ-NEXT: LBB24_54: ## %else77
; AVX512VLDQ-NEXT: testl $134217728, %eax ## imm = 0x8000000
; AVX512VLDQ-NEXT: jne LBB24_55
; AVX512VLDQ-NEXT: LBB24_56: ## %else80
; AVX512VLDQ-NEXT: testl $268435456, %eax ## imm = 0x10000000
; AVX512VLDQ-NEXT: jne LBB24_57
; AVX512VLDQ-NEXT: LBB24_58: ## %else83
; AVX512VLDQ-NEXT: testl $536870912, %eax ## imm = 0x20000000
; AVX512VLDQ-NEXT: jne LBB24_59
; AVX512VLDQ-NEXT: LBB24_60: ## %else86
; AVX512VLDQ-NEXT: testl $1073741824, %eax ## imm = 0x40000000
; AVX512VLDQ-NEXT: jne LBB24_61
; AVX512VLDQ-NEXT: LBB24_62: ## %else89
; AVX512VLDQ-NEXT: testl $-2147483648, %eax ## imm = 0x80000000
; AVX512VLDQ-NEXT: jne LBB24_63
; AVX512VLDQ-NEXT: LBB24_64: ## %else92
; AVX512VLDQ-NEXT: vmovdqa %ymm1, %ymm0
; AVX512VLDQ-NEXT: retq
; AVX512VLDQ-NEXT: LBB24_1: ## %cond.load
; AVX512VLDQ-NEXT: vpinsrb $0, (%rdi), %xmm1, %xmm0
; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512VLDQ-NEXT: testb $2, %al
; AVX512VLDQ-NEXT: je LBB24_4
; AVX512VLDQ-NEXT: LBB24_3: ## %cond.load1
; AVX512VLDQ-NEXT: vpinsrb $1, 1(%rdi), %xmm1, %xmm0
; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512VLDQ-NEXT: testb $4, %al
; AVX512VLDQ-NEXT: je LBB24_6
; AVX512VLDQ-NEXT: LBB24_5: ## %cond.load4
; AVX512VLDQ-NEXT: vpinsrb $2, 2(%rdi), %xmm1, %xmm0
; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512VLDQ-NEXT: testb $8, %al
; AVX512VLDQ-NEXT: je LBB24_8
; AVX512VLDQ-NEXT: LBB24_7: ## %cond.load7
; AVX512VLDQ-NEXT: vpinsrb $3, 3(%rdi), %xmm1, %xmm0
; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512VLDQ-NEXT: testb $16, %al
; AVX512VLDQ-NEXT: je LBB24_10
; AVX512VLDQ-NEXT: LBB24_9: ## %cond.load10
; AVX512VLDQ-NEXT: vpinsrb $4, 4(%rdi), %xmm1, %xmm0
; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512VLDQ-NEXT: testb $32, %al
; AVX512VLDQ-NEXT: je LBB24_12
; AVX512VLDQ-NEXT: LBB24_11: ## %cond.load13
; AVX512VLDQ-NEXT: vpinsrb $5, 5(%rdi), %xmm1, %xmm0
; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512VLDQ-NEXT: testb $64, %al
; AVX512VLDQ-NEXT: je LBB24_14
; AVX512VLDQ-NEXT: LBB24_13: ## %cond.load16
; AVX512VLDQ-NEXT: vpinsrb $6, 6(%rdi), %xmm1, %xmm0
; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512VLDQ-NEXT: testb $-128, %al
; AVX512VLDQ-NEXT: je LBB24_16
; AVX512VLDQ-NEXT: LBB24_15: ## %cond.load19
; AVX512VLDQ-NEXT: vpinsrb $7, 7(%rdi), %xmm1, %xmm0
; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512VLDQ-NEXT: testl $256, %eax ## imm = 0x100
; AVX512VLDQ-NEXT: je LBB24_18
; AVX512VLDQ-NEXT: LBB24_17: ## %cond.load22
; AVX512VLDQ-NEXT: vpinsrb $8, 8(%rdi), %xmm1, %xmm0
; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512VLDQ-NEXT: testl $512, %eax ## imm = 0x200
; AVX512VLDQ-NEXT: je LBB24_20
; AVX512VLDQ-NEXT: LBB24_19: ## %cond.load25
; AVX512VLDQ-NEXT: vpinsrb $9, 9(%rdi), %xmm1, %xmm0
; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512VLDQ-NEXT: testl $1024, %eax ## imm = 0x400
; AVX512VLDQ-NEXT: je LBB24_22
; AVX512VLDQ-NEXT: LBB24_21: ## %cond.load28
; AVX512VLDQ-NEXT: vpinsrb $10, 10(%rdi), %xmm1, %xmm0
; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512VLDQ-NEXT: testl $2048, %eax ## imm = 0x800
; AVX512VLDQ-NEXT: je LBB24_24
; AVX512VLDQ-NEXT: LBB24_23: ## %cond.load31
; AVX512VLDQ-NEXT: vpinsrb $11, 11(%rdi), %xmm1, %xmm0
; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512VLDQ-NEXT: testl $4096, %eax ## imm = 0x1000
; AVX512VLDQ-NEXT: je LBB24_26
; AVX512VLDQ-NEXT: LBB24_25: ## %cond.load34
; AVX512VLDQ-NEXT: vpinsrb $12, 12(%rdi), %xmm1, %xmm0
; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512VLDQ-NEXT: testl $8192, %eax ## imm = 0x2000
; AVX512VLDQ-NEXT: je LBB24_28
; AVX512VLDQ-NEXT: LBB24_27: ## %cond.load37
; AVX512VLDQ-NEXT: vpinsrb $13, 13(%rdi), %xmm1, %xmm0
; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512VLDQ-NEXT: testl $16384, %eax ## imm = 0x4000
; AVX512VLDQ-NEXT: je LBB24_30
; AVX512VLDQ-NEXT: LBB24_29: ## %cond.load40
; AVX512VLDQ-NEXT: vpinsrb $14, 14(%rdi), %xmm1, %xmm0
; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512VLDQ-NEXT: testl $32768, %eax ## imm = 0x8000
; AVX512VLDQ-NEXT: je LBB24_32
; AVX512VLDQ-NEXT: LBB24_31: ## %cond.load43
; AVX512VLDQ-NEXT: vpinsrb $15, 15(%rdi), %xmm1, %xmm0
; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512VLDQ-NEXT: testl $65536, %eax ## imm = 0x10000
; AVX512VLDQ-NEXT: je LBB24_34
; AVX512VLDQ-NEXT: LBB24_33: ## %cond.load46
; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
; AVX512VLDQ-NEXT: vpinsrb $0, 16(%rdi), %xmm0, %xmm0
; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
; AVX512VLDQ-NEXT: testl $131072, %eax ## imm = 0x20000
; AVX512VLDQ-NEXT: je LBB24_36
; AVX512VLDQ-NEXT: LBB24_35: ## %cond.load49
; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
; AVX512VLDQ-NEXT: vpinsrb $1, 17(%rdi), %xmm0, %xmm0
; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
; AVX512VLDQ-NEXT: testl $262144, %eax ## imm = 0x40000
; AVX512VLDQ-NEXT: je LBB24_38
; AVX512VLDQ-NEXT: LBB24_37: ## %cond.load52
; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
; AVX512VLDQ-NEXT: vpinsrb $2, 18(%rdi), %xmm0, %xmm0
; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
; AVX512VLDQ-NEXT: testl $524288, %eax ## imm = 0x80000
; AVX512VLDQ-NEXT: je LBB24_40
; AVX512VLDQ-NEXT: LBB24_39: ## %cond.load55
; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
; AVX512VLDQ-NEXT: vpinsrb $3, 19(%rdi), %xmm0, %xmm0
; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
; AVX512VLDQ-NEXT: testl $1048576, %eax ## imm = 0x100000
; AVX512VLDQ-NEXT: je LBB24_42
; AVX512VLDQ-NEXT: LBB24_41: ## %cond.load58
; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
; AVX512VLDQ-NEXT: vpinsrb $4, 20(%rdi), %xmm0, %xmm0
; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
; AVX512VLDQ-NEXT: testl $2097152, %eax ## imm = 0x200000
; AVX512VLDQ-NEXT: je LBB24_44
; AVX512VLDQ-NEXT: LBB24_43: ## %cond.load61
; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
; AVX512VLDQ-NEXT: vpinsrb $5, 21(%rdi), %xmm0, %xmm0
; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
; AVX512VLDQ-NEXT: testl $4194304, %eax ## imm = 0x400000
; AVX512VLDQ-NEXT: je LBB24_46
; AVX512VLDQ-NEXT: LBB24_45: ## %cond.load64
; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
; AVX512VLDQ-NEXT: vpinsrb $6, 22(%rdi), %xmm0, %xmm0
; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
; AVX512VLDQ-NEXT: testl $8388608, %eax ## imm = 0x800000
; AVX512VLDQ-NEXT: je LBB24_48
; AVX512VLDQ-NEXT: LBB24_47: ## %cond.load67
; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
; AVX512VLDQ-NEXT: vpinsrb $7, 23(%rdi), %xmm0, %xmm0
; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
; AVX512VLDQ-NEXT: testl $16777216, %eax ## imm = 0x1000000
; AVX512VLDQ-NEXT: je LBB24_50
; AVX512VLDQ-NEXT: LBB24_49: ## %cond.load70
; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
; AVX512VLDQ-NEXT: vpinsrb $8, 24(%rdi), %xmm0, %xmm0
; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
; AVX512VLDQ-NEXT: testl $33554432, %eax ## imm = 0x2000000
; AVX512VLDQ-NEXT: je LBB24_52
; AVX512VLDQ-NEXT: LBB24_51: ## %cond.load73
; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
; AVX512VLDQ-NEXT: vpinsrb $9, 25(%rdi), %xmm0, %xmm0
; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
; AVX512VLDQ-NEXT: testl $67108864, %eax ## imm = 0x4000000
; AVX512VLDQ-NEXT: je LBB24_54
; AVX512VLDQ-NEXT: LBB24_53: ## %cond.load76
; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
; AVX512VLDQ-NEXT: vpinsrb $10, 26(%rdi), %xmm0, %xmm0
; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
; AVX512VLDQ-NEXT: testl $134217728, %eax ## imm = 0x8000000
; AVX512VLDQ-NEXT: je LBB24_56
; AVX512VLDQ-NEXT: LBB24_55: ## %cond.load79
; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
; AVX512VLDQ-NEXT: vpinsrb $11, 27(%rdi), %xmm0, %xmm0
; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
; AVX512VLDQ-NEXT: testl $268435456, %eax ## imm = 0x10000000
; AVX512VLDQ-NEXT: je LBB24_58
; AVX512VLDQ-NEXT: LBB24_57: ## %cond.load82
; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
; AVX512VLDQ-NEXT: vpinsrb $12, 28(%rdi), %xmm0, %xmm0
; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
; AVX512VLDQ-NEXT: testl $536870912, %eax ## imm = 0x20000000
; AVX512VLDQ-NEXT: je LBB24_60
; AVX512VLDQ-NEXT: LBB24_59: ## %cond.load85
; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
; AVX512VLDQ-NEXT: vpinsrb $13, 29(%rdi), %xmm0, %xmm0
; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
; AVX512VLDQ-NEXT: testl $1073741824, %eax ## imm = 0x40000000
; AVX512VLDQ-NEXT: je LBB24_62
; AVX512VLDQ-NEXT: LBB24_61: ## %cond.load88
; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
; AVX512VLDQ-NEXT: vpinsrb $14, 30(%rdi), %xmm0, %xmm0
; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
; AVX512VLDQ-NEXT: testl $-2147483648, %eax ## imm = 0x80000000
; AVX512VLDQ-NEXT: je LBB24_64
; AVX512VLDQ-NEXT: LBB24_63: ## %cond.load91
; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
; AVX512VLDQ-NEXT: vpinsrb $15, 31(%rdi), %xmm0, %xmm0
; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
; AVX512VLDQ-NEXT: vmovdqa %ymm1, %ymm0
; AVX512VLDQ-NEXT: retq
;
; AVX512VLBW-LABEL: load_v32i8_v32i8:
; AVX512VLBW: ## %bb.0:
; AVX512VLBW-NEXT: vpmovb2m %ymm0, %k1
; AVX512VLBW-NEXT: vpblendmb (%rdi), %ymm1, %ymm0 {%k1}
; AVX512VLBW-NEXT: retq
%mask = icmp slt <32 x i8> %trigger, zeroinitializer
%res = call <32 x i8> @llvm.masked.load.v32i8.p0v32i8(<32 x i8>* %addr, i32 4, <32 x i1> %mask, <32 x i8> %dst)
ret <32 x i8> %res
}
;;; Loads with Constant Masks - these should be optimized to use something other than a variable blend.
; 128-bit FP vectors are supported with AVX.
define <4 x float> @mload_constmask_v4f32(<4 x float>* %addr, <4 x float> %dst) {
; SSE2-LABEL: mload_constmask_v4f32:
; SSE2: ## %bb.0:
; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2]
; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,0],xmm1[2,0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,0]
; SSE2-NEXT: retq
;
; SSE42-LABEL: mload_constmask_v4f32:
; SSE42: ## %bb.0:
; SSE42-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE42-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE42-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
; SSE42-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
; SSE42-NEXT: retq
;
; AVX1OR2-LABEL: mload_constmask_v4f32:
; AVX1OR2: ## %bb.0:
; AVX1OR2-NEXT: vblendps {{.*#+}} xmm0 = mem[0],xmm0[1],mem[2,3]
; AVX1OR2-NEXT: retq
;
; AVX512F-LABEL: mload_constmask_v4f32:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT: movw $13, %ax
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: vmovups (%rdi), %zmm0 {%k1}
; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VLDQ-LABEL: mload_constmask_v4f32:
; AVX512VLDQ: ## %bb.0:
; AVX512VLDQ-NEXT: movb $13, %al
; AVX512VLDQ-NEXT: kmovw %eax, %k1
; AVX512VLDQ-NEXT: vmovups (%rdi), %xmm0 {%k1}
; AVX512VLDQ-NEXT: retq
;
; AVX512VLBW-LABEL: mload_constmask_v4f32:
; AVX512VLBW: ## %bb.0:
; AVX512VLBW-NEXT: movb $13, %al
; AVX512VLBW-NEXT: kmovd %eax, %k1
; AVX512VLBW-NEXT: vmovups (%rdi), %xmm0 {%k1}
; AVX512VLBW-NEXT: retq
%res = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %addr, i32 4, <4 x i1> <i1 1, i1 0, i1 1, i1 1>, <4 x float> %dst)
ret <4 x float> %res
}
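; An all-ones constant mask selects every element, so no blend with the pass-through is needed.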
define <4 x float> @mload_constmask_v4f32_all(<4 x float>* %addr) {
; SSE-LABEL: mload_constmask_v4f32_all:
; SSE: ## %bb.0:
; SSE-NEXT: movups (%rdi), %xmm0
; SSE-NEXT: retq
;
; AVX1OR2-LABEL: mload_constmask_v4f32_all:
; AVX1OR2: ## %bb.0:
; AVX1OR2-NEXT: vmovups (%rdi), %xmm0
; AVX1OR2-NEXT: retq
;
; AVX512F-LABEL: mload_constmask_v4f32_all:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: movw $15, %ax
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: vmovups (%rdi), %zmm0 {%k1} {z}
; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: mload_constmask_v4f32_all:
; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: kxnorw %k0, %k0, %k1
; AVX512VL-NEXT: vmovups (%rdi), %xmm0 {%k1} {z}
; AVX512VL-NEXT: retq
%res = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %addr, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float>undef)
ret <4 x float> %res
}
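; A constant mask with only the high element set reduces to a scalar load into the upper half.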
define <2 x double> @mload_constmask_v2f64(<2 x double>* %addr, <2 x double> %dst) {
; SSE-LABEL: mload_constmask_v2f64:
; SSE: ## %bb.0:
; SSE-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
; SSE-NEXT: retq
;
; AVX-LABEL: mload_constmask_v2f64:
; AVX: ## %bb.0:
; AVX-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
; AVX-NEXT: retq
%res = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %addr, i32 4, <2 x i1> <i1 0, i1 1>, <2 x double> %dst)
ret <2 x double> %res
}
; 128-bit integer vectors are supported with AVX2.
define <4 x i32> @mload_constmask_v4i32(<4 x i32>* %addr, <4 x i32> %dst) {
; SSE2-LABEL: mload_constmask_v4i32:
; SSE2: ## %bb.0:
; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: movaps %xmm1, %xmm2
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[0,2]
; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm2[2,0]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,0]
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE42-LABEL: mload_constmask_v4i32:
; SSE42: ## %bb.0:
; SSE42-NEXT: pinsrd $1, 4(%rdi), %xmm0
; SSE42-NEXT: pinsrd $2, 8(%rdi), %xmm0
; SSE42-NEXT: pinsrd $3, 12(%rdi), %xmm0
; SSE42-NEXT: retq
;
; AVX1-LABEL: mload_constmask_v4i32:
; AVX1: ## %bb.0:
; AVX1-NEXT: vmovaps {{.*#+}} xmm1 = [0,4294967295,4294967295,4294967295]
; AVX1-NEXT: vmaskmovps (%rdi), %xmm1, %xmm1
; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: mload_constmask_v4i32:
; AVX2: ## %bb.0:
; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,4294967295,4294967295,4294967295]
; AVX2-NEXT: vpmaskmovd (%rdi), %xmm1, %xmm1
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX2-NEXT: retq
;
; AVX512F-LABEL: mload_constmask_v4i32:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT: movw $14, %ax
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1}
; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VLDQ-LABEL: mload_constmask_v4i32:
; AVX512VLDQ: ## %bb.0:
; AVX512VLDQ-NEXT: movb $14, %al
; AVX512VLDQ-NEXT: kmovw %eax, %k1
; AVX512VLDQ-NEXT: vmovdqu32 (%rdi), %xmm0 {%k1}
; AVX512VLDQ-NEXT: retq
;
; AVX512VLBW-LABEL: mload_constmask_v4i32:
; AVX512VLBW: ## %bb.0:
; AVX512VLBW-NEXT: movb $14, %al
; AVX512VLBW-NEXT: kmovd %eax, %k1
; AVX512VLBW-NEXT: vmovdqu32 (%rdi), %xmm0 {%k1}
; AVX512VLBW-NEXT: retq
%res = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr, i32 4, <4 x i1> <i1 0, i1 1, i1 1, i1 1>, <4 x i32> %dst)
ret <4 x i32> %res
}
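; Likewise for i64: only the high element is loaded and merged into the destination.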
define <2 x i64> @mload_constmask_v2i64(<2 x i64>* %addr, <2 x i64> %dst) {
; SSE2-LABEL: mload_constmask_v2i64:
; SSE2: ## %bb.0:
; SSE2-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT: retq
;
; SSE42-LABEL: mload_constmask_v2i64:
; SSE42: ## %bb.0:
; SSE42-NEXT: pinsrq $1, 8(%rdi), %xmm0
; SSE42-NEXT: retq
;
; AVX-LABEL: mload_constmask_v2i64:
; AVX: ## %bb.0:
; AVX-NEXT: vpinsrq $1, 8(%rdi), %xmm0, %xmm0
; AVX-NEXT: retq
%res = call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* %addr, i32 4, <2 x i1> <i1 0, i1 1>, <2 x i64> %dst)
ret <2 x i64> %res
}
; 256-bit FP vectors are supported with AVX.
define <8 x float> @mload_constmask_v8f32(<8 x float>* %addr, <8 x float> %dst) {
; SSE2-LABEL: mload_constmask_v8f32:
; SSE2: ## %bb.0:
; SSE2-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm3[0,0]
; SSE2-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,0],xmm0[3,0]
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm3[0,2]
; SSE2-NEXT: movaps %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE42-LABEL: mload_constmask_v8f32:
; SSE42: ## %bb.0:
; SSE42-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE42-NEXT: blendps {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
; SSE42-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
; SSE42-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
; SSE42-NEXT: retq
;
; AVX1OR2-LABEL: mload_constmask_v8f32:
; AVX1OR2: ## %bb.0:
; AVX1OR2-NEXT: vmovaps {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,0,0,0,0,0]
; AVX1OR2-NEXT: vmaskmovps (%rdi), %ymm1, %ymm1
; AVX1OR2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
; AVX1OR2-NEXT: retq
;
; AVX512F-LABEL: mload_constmask_v8f32:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: movw $7, %ax
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: vmovups (%rdi), %zmm0 {%k1}
; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT: retq
;
; AVX512VLDQ-LABEL: mload_constmask_v8f32:
; AVX512VLDQ: ## %bb.0:
; AVX512VLDQ-NEXT: movb $7, %al
; AVX512VLDQ-NEXT: kmovw %eax, %k1
; AVX512VLDQ-NEXT: vmovups (%rdi), %ymm0 {%k1}
; AVX512VLDQ-NEXT: retq
;
; AVX512VLBW-LABEL: mload_constmask_v8f32:
; AVX512VLBW: ## %bb.0:
; AVX512VLBW-NEXT: movb $7, %al
; AVX512VLBW-NEXT: kmovd %eax, %k1
; AVX512VLBW-NEXT: vmovups (%rdi), %ymm0 {%k1}
; AVX512VLBW-NEXT: retq
%res = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* %addr, i32 4, <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0>, <8 x float> %dst)
ret <8 x float> %res
}
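; Same mask with a zero pass-through: unselected lanes are zeroed instead of blended with the destination.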
define <8 x float> @mload_constmask_v8f32_zero(<8 x float>* %addr, <8 x float> %dst) {
; SSE2-LABEL: mload_constmask_v8f32_zero:
; SSE2: ## %bb.0:
; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT: xorps %xmm1, %xmm1
; SSE2-NEXT: retq
;
; SSE42-LABEL: mload_constmask_v8f32_zero:
; SSE42: ## %bb.0:
; SSE42-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; SSE42-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],zero
; SSE42-NEXT: xorps %xmm1, %xmm1
; SSE42-NEXT: retq
;
; AVX1OR2-LABEL: mload_constmask_v8f32_zero:
; AVX1OR2: ## %bb.0:
; AVX1OR2-NEXT: vmovaps {{.*#+}} ymm0 = [4294967295,4294967295,4294967295,0,0,0,0,0]
; AVX1OR2-NEXT: vmaskmovps (%rdi), %ymm0, %ymm0
; AVX1OR2-NEXT: retq
;
; AVX512F-LABEL: mload_constmask_v8f32_zero:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: movw $7, %ax
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: vmovups (%rdi), %zmm0 {%k1} {z}
; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT: retq
;
; AVX512VLDQ-LABEL: mload_constmask_v8f32_zero:
; AVX512VLDQ: ## %bb.0:
; AVX512VLDQ-NEXT: movb $7, %al
; AVX512VLDQ-NEXT: kmovw %eax, %k1
; AVX512VLDQ-NEXT: vmovups (%rdi), %ymm0 {%k1} {z}
; AVX512VLDQ-NEXT: retq
;
; AVX512VLBW-LABEL: mload_constmask_v8f32_zero:
; AVX512VLBW: ## %bb.0:
; AVX512VLBW-NEXT: movb $7, %al
; AVX512VLBW-NEXT: kmovd %eax, %k1
; AVX512VLBW-NEXT: vmovups (%rdi), %ymm0 {%k1} {z}
; AVX512VLBW-NEXT: retq
%res = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* %addr, i32 4, <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0>, <8 x float> zeroinitializer)
ret <8 x float> %res
}
define <4 x double> @mload_constmask_v4f64(<4 x double>* %addr, <4 x double> %dst) {
; SSE-LABEL: mload_constmask_v4f64:
; SSE: ## %bb.0:
; SSE-NEXT: movups (%rdi), %xmm0
; SSE-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
; SSE-NEXT: retq
;
; AVX1OR2-LABEL: mload_constmask_v4f64:
; AVX1OR2: ## %bb.0:
; AVX1OR2-NEXT: vmovapd {{.*#+}} ymm1 = [18446744073709551615,18446744073709551615,18446744073709551615,0]
; AVX1OR2-NEXT: vmaskmovpd (%rdi), %ymm1, %ymm1
; AVX1OR2-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3]
; AVX1OR2-NEXT: retq
;
; AVX512F-LABEL: mload_constmask_v4f64:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: movb $7, %al
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: vmovupd (%rdi), %zmm0 {%k1}
; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT: retq
;
; AVX512VLDQ-LABEL: mload_constmask_v4f64:
; AVX512VLDQ: ## %bb.0:
; AVX512VLDQ-NEXT: movb $7, %al
; AVX512VLDQ-NEXT: kmovw %eax, %k1
; AVX512VLDQ-NEXT: vmovupd (%rdi), %ymm0 {%k1}
; AVX512VLDQ-NEXT: retq
;
; AVX512VLBW-LABEL: mload_constmask_v4f64:
; AVX512VLBW: ## %bb.0:
; AVX512VLBW-NEXT: movb $7, %al
; AVX512VLBW-NEXT: kmovd %eax, %k1
; AVX512VLBW-NEXT: vmovupd (%rdi), %ymm0 {%k1}
; AVX512VLBW-NEXT: retq
%res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %addr, i32 4, <4 x i1> <i1 1, i1 1, i1 1, i1 0>, <4 x double> %dst)
ret <4 x double> %res
}
; 256-bit integer vectors are supported with AVX2.
define <8 x i32> @mload_constmask_v8i32(<8 x i32>* %addr, <8 x i32> %dst) {
; SSE2-LABEL: mload_constmask_v8i32:
; SSE2: ## %bb.0:
; SSE2-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm3[0,0]
; SSE2-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,0],xmm0[3,0]
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm3[0,2]
; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[2,0]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
; SSE2-NEXT: movaps %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE42-LABEL: mload_constmask_v8i32:
; SSE42: ## %bb.0:
; SSE42-NEXT: pinsrd $0, (%rdi), %xmm0
; SSE42-NEXT: pinsrd $1, 4(%rdi), %xmm0
; SSE42-NEXT: pinsrd $2, 8(%rdi), %xmm0
; SSE42-NEXT: pinsrd $3, 28(%rdi), %xmm1
; SSE42-NEXT: retq
;
; AVX1OR2-LABEL: mload_constmask_v8i32:
; AVX1OR2: ## %bb.0:
; AVX1OR2-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1,2],ymm0[3,4,5,6],mem[7]
; AVX1OR2-NEXT: retq
;
; AVX512F-LABEL: mload_constmask_v8i32:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: movw $135, %ax
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1}
; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT: retq
;
; AVX512VLDQ-LABEL: mload_constmask_v8i32:
; AVX512VLDQ: ## %bb.0:
; AVX512VLDQ-NEXT: movb $-121, %al
; AVX512VLDQ-NEXT: kmovw %eax, %k1
; AVX512VLDQ-NEXT: vmovdqu32 (%rdi), %ymm0 {%k1}
; AVX512VLDQ-NEXT: retq
;
; AVX512VLBW-LABEL: mload_constmask_v8i32:
; AVX512VLBW: ## %bb.0:
; AVX512VLBW-NEXT: movb $-121, %al
; AVX512VLBW-NEXT: kmovd %eax, %k1
; AVX512VLBW-NEXT: vmovdqu32 (%rdi), %ymm0 {%k1}
; AVX512VLBW-NEXT: retq
%res = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* %addr, i32 4, <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1>, <8 x i32> %dst)
ret <8 x i32> %res
}
define <4 x i64> @mload_constmask_v4i64(<4 x i64>* %addr, <4 x i64> %dst) {
; SSE2-LABEL: mload_constmask_v4i64:
; SSE2: ## %bb.0:
; SSE2-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
; SSE2-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSE2-NEXT: retq
;
; SSE42-LABEL: mload_constmask_v4i64:
; SSE42: ## %bb.0:
; SSE42-NEXT: pinsrq $0, (%rdi), %xmm0
; SSE42-NEXT: pinsrq $1, 24(%rdi), %xmm1
; SSE42-NEXT: retq
;
; AVX1OR2-LABEL: mload_constmask_v4i64:
; AVX1OR2: ## %bb.0:
; AVX1OR2-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1],ymm0[2,3,4,5],mem[6,7]
; AVX1OR2-NEXT: retq
;
; AVX512F-LABEL: mload_constmask_v4i64:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: movb $9, %al
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: vmovdqu64 (%rdi), %zmm0 {%k1}
; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT: retq
;
; AVX512VLDQ-LABEL: mload_constmask_v4i64:
; AVX512VLDQ: ## %bb.0:
; AVX512VLDQ-NEXT: movb $9, %al
; AVX512VLDQ-NEXT: kmovw %eax, %k1
; AVX512VLDQ-NEXT: vmovdqu64 (%rdi), %ymm0 {%k1}
; AVX512VLDQ-NEXT: retq
;
; AVX512VLBW-LABEL: mload_constmask_v4i64:
; AVX512VLBW: ## %bb.0:
; AVX512VLBW-NEXT: movb $9, %al
; AVX512VLBW-NEXT: kmovd %eax, %k1
; AVX512VLBW-NEXT: vmovdqu64 (%rdi), %ymm0 {%k1}
; AVX512VLBW-NEXT: retq
%res = call <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* %addr, i32 4, <4 x i1> <i1 1, i1 0, i1 0, i1 1>, <4 x i64> %dst)
ret <4 x i64> %res
}
; 512-bit FP vectors are supported with AVX512.
define <8 x double> @mload_constmask_v8f64(<8 x double>* %addr, <8 x double> %dst) {
; SSE-LABEL: mload_constmask_v8f64:
; SSE: ## %bb.0:
; SSE-NEXT: movups (%rdi), %xmm0
; SSE-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
; SSE-NEXT: movhps {{.*#+}} xmm3 = xmm3[0,1],mem[0,1]
; SSE-NEXT: retq
;
; AVX1OR2-LABEL: mload_constmask_v8f64:
; AVX1OR2: ## %bb.0:
; AVX1OR2-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1,2,3,4,5],ymm0[6,7]
; AVX1OR2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7]
; AVX1OR2-NEXT: retq
;
; AVX512F-LABEL: mload_constmask_v8f64:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: movb $-121, %al
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: vmovupd (%rdi), %zmm0 {%k1}
; AVX512F-NEXT: retq
;
; AVX512VLDQ-LABEL: mload_constmask_v8f64:
; AVX512VLDQ: ## %bb.0:
; AVX512VLDQ-NEXT: movb $-121, %al
; AVX512VLDQ-NEXT: kmovw %eax, %k1
; AVX512VLDQ-NEXT: vmovupd (%rdi), %zmm0 {%k1}
; AVX512VLDQ-NEXT: retq
;
; AVX512VLBW-LABEL: mload_constmask_v8f64:
; AVX512VLBW: ## %bb.0:
; AVX512VLBW-NEXT: movb $-121, %al
; AVX512VLBW-NEXT: kmovd %eax, %k1
; AVX512VLBW-NEXT: vmovupd (%rdi), %zmm0 {%k1}
; AVX512VLBW-NEXT: retq
%res = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* %addr, i32 4, <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1>, <8 x double> %dst)
ret <8 x double> %res
}
; If the pass-through operand is undef, no blend is needed.
define <4 x double> @mload_constmask_v4f64_undef_passthrough(<4 x double>* %addr) {
; SSE-LABEL: mload_constmask_v4f64_undef_passthrough:
; SSE: ## %bb.0:
; SSE-NEXT: movups (%rdi), %xmm0
; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; SSE-NEXT: retq
;
; AVX1OR2-LABEL: mload_constmask_v4f64_undef_passthrough:
; AVX1OR2: ## %bb.0:
; AVX1OR2-NEXT: vmovapd {{.*#+}} ymm0 = [18446744073709551615,18446744073709551615,18446744073709551615,0]
; AVX1OR2-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm0
; AVX1OR2-NEXT: retq
;
; AVX512F-LABEL: mload_constmask_v4f64_undef_passthrough:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: movb $7, %al
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: vmovupd (%rdi), %zmm0 {%k1} {z}
; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT: retq
;
; AVX512VLDQ-LABEL: mload_constmask_v4f64_undef_passthrough:
; AVX512VLDQ: ## %bb.0:
; AVX512VLDQ-NEXT: movb $7, %al
; AVX512VLDQ-NEXT: kmovw %eax, %k1
; AVX512VLDQ-NEXT: vmovupd (%rdi), %ymm0 {%k1} {z}
; AVX512VLDQ-NEXT: retq
;
; AVX512VLBW-LABEL: mload_constmask_v4f64_undef_passthrough:
; AVX512VLBW: ## %bb.0:
; AVX512VLBW-NEXT: movb $7, %al
; AVX512VLBW-NEXT: kmovd %eax, %k1
; AVX512VLBW-NEXT: vmovupd (%rdi), %ymm0 {%k1} {z}
; AVX512VLBW-NEXT: retq
%res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %addr, i32 4, <4 x i1> <i1 1, i1 1, i1 1, i1 0>, <4 x double> undef)
ret <4 x double> %res
}
define <4 x i64> @mload_constmask_v4i64_undef_passthrough(<4 x i64>* %addr) {
; SSE-LABEL: mload_constmask_v4i64_undef_passthrough:
; SSE: ## %bb.0:
; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE-NEXT: retq
;
; AVX1-LABEL: mload_constmask_v4i64_undef_passthrough:
; AVX1: ## %bb.0:
; AVX1-NEXT: vmovapd {{.*#+}} ymm0 = [0,18446744073709551615,18446744073709551615,0]
; AVX1-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: mload_constmask_v4i64_undef_passthrough:
; AVX2: ## %bb.0:
; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [0,18446744073709551615,18446744073709551615,0]
; AVX2-NEXT: vpmaskmovq (%rdi), %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: mload_constmask_v4i64_undef_passthrough:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: movb $6, %al
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: vmovdqu64 (%rdi), %zmm0 {%k1} {z}
; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT: retq
;
; AVX512VLDQ-LABEL: mload_constmask_v4i64_undef_passthrough:
; AVX512VLDQ: ## %bb.0:
; AVX512VLDQ-NEXT: movb $6, %al
; AVX512VLDQ-NEXT: kmovw %eax, %k1
; AVX512VLDQ-NEXT: vmovdqu64 (%rdi), %ymm0 {%k1} {z}
; AVX512VLDQ-NEXT: retq
;
; AVX512VLBW-LABEL: mload_constmask_v4i64_undef_passthrough:
; AVX512VLBW: ## %bb.0:
; AVX512VLBW-NEXT: movb $6, %al
; AVX512VLBW-NEXT: kmovd %eax, %k1
; AVX512VLBW-NEXT: vmovdqu64 (%rdi), %ymm0 {%k1} {z}
; AVX512VLBW-NEXT: retq
%res = call <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* %addr, i32 4, <4 x i1> <i1 0, i1 1, i1 1, i1 0>, <4 x i64> undef)
ret <4 x i64> %res
}
; When only one element of the mask is set, reduce to a scalar load.
define <4 x i32> @load_one_mask_bit_set1(<4 x i32>* %addr, <4 x i32> %val) {
; SSE2-LABEL: load_one_mask_bit_set1:
; SSE2: ## %bb.0:
; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE2-NEXT: retq
;
; SSE42-LABEL: load_one_mask_bit_set1:
; SSE42: ## %bb.0:
; SSE42-NEXT: pinsrd $0, (%rdi), %xmm0
; SSE42-NEXT: retq
;
; AVX-LABEL: load_one_mask_bit_set1:
; AVX: ## %bb.0:
; AVX-NEXT: vpinsrd $0, (%rdi), %xmm0, %xmm0
; AVX-NEXT: retq
%res = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr, i32 4, <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x i32> %val)
ret <4 x i32> %res
}
; Choose a different element to show that the correct address offset is produced.
define <4 x float> @load_one_mask_bit_set2(<4 x float>* %addr, <4 x float> %val) {
; SSE2-LABEL: load_one_mask_bit_set2:
; SSE2: ## %bb.0:
; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
; SSE2-NEXT: retq
;
; SSE42-LABEL: load_one_mask_bit_set2:
; SSE42: ## %bb.0:
; SSE42-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
; SSE42-NEXT: retq
;
; AVX-LABEL: load_one_mask_bit_set2:
; AVX: ## %bb.0:
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
; AVX-NEXT: retq
%res = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %addr, i32 4, <4 x i1> <i1 false, i1 false, i1 true, i1 false>, <4 x float> %val)
ret <4 x float> %res
}
; Choose a different scalar type and a high element of a 256-bit vector because AVX doesn't support those evenly.
define <4 x i64> @load_one_mask_bit_set3(<4 x i64>* %addr, <4 x i64> %val) {
; SSE2-LABEL: load_one_mask_bit_set3:
; SSE2: ## %bb.0:
; SSE2-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
; SSE2-NEXT: retq
;
; SSE42-LABEL: load_one_mask_bit_set3:
; SSE42: ## %bb.0:
; SSE42-NEXT: pinsrq $0, 16(%rdi), %xmm1
; SSE42-NEXT: retq
;
; AVX1-LABEL: load_one_mask_bit_set3:
; AVX1: ## %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpinsrq $0, 16(%rdi), %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_one_mask_bit_set3:
; AVX2: ## %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpinsrq $0, 16(%rdi), %xmm1, %xmm1
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: load_one_mask_bit_set3:
; AVX512: ## %bb.0:
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpinsrq $0, 16(%rdi), %xmm1, %xmm1
; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512-NEXT: retq
%res = call <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* %addr, i32 4, <4 x i1> <i1 false, i1 false, i1 true, i1 false>, <4 x i64> %val)
ret <4 x i64> %res
}
; Choose a different scalar type and a high element of a 256-bit vector because AVX doesn't support those evenly.
define <4 x double> @load_one_mask_bit_set4(<4 x double>* %addr, <4 x double> %val) {
; SSE-LABEL: load_one_mask_bit_set4:
; SSE: ## %bb.0:
; SSE-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
; SSE-NEXT: retq
;
; AVX-LABEL: load_one_mask_bit_set4:
; AVX: ## %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-NEXT: retq
%res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %addr, i32 4, <4 x i1> <i1 false, i1 false, i1 false, i1 true>, <4 x double> %val)
ret <4 x double> %res
}
; Try a 512-bit vector to make sure AVX doesn't die and AVX512 works as expected.
define <8 x double> @load_one_mask_bit_set5(<8 x double>* %addr, <8 x double> %val) {
; SSE-LABEL: load_one_mask_bit_set5:
; SSE: ## %bb.0:
; SSE-NEXT: movhps {{.*#+}} xmm3 = xmm3[0,1],mem[0,1]
; SSE-NEXT: retq
;
; AVX1OR2-LABEL: load_one_mask_bit_set5:
; AVX1OR2: ## %bb.0:
; AVX1OR2-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1OR2-NEXT: vmovhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1]
; AVX1OR2-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1OR2-NEXT: retq
;
; AVX512-LABEL: load_one_mask_bit_set5:
; AVX512: ## %bb.0:
; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm1
; AVX512-NEXT: vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
; AVX512-NEXT: vinsertf32x4 $3, %xmm1, %zmm0, %zmm0
; AVX512-NEXT: retq
%res = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* %addr, i32 4, <8 x i1> <i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true>, <8 x double> %val)
ret <8 x double> %res
}
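; PR38986: a single-element masked load with a variable mask bit should lower to a branch around a plain scalar load.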
define i32 @pr38986(i1 %c, i32* %p) {
; SSE-LABEL: pr38986:
; SSE: ## %bb.0:
; SSE-NEXT: testb $1, %dil
; SSE-NEXT: ## implicit-def: $eax
; SSE-NEXT: je LBB43_2
; SSE-NEXT: ## %bb.1: ## %cond.load
; SSE-NEXT: movl (%rsi), %eax
; SSE-NEXT: LBB43_2: ## %else
; SSE-NEXT: retq
;
; AVX-LABEL: pr38986:
; AVX: ## %bb.0:
; AVX-NEXT: testb $1, %dil
; AVX-NEXT: ## implicit-def: $eax
; AVX-NEXT: je LBB43_2
; AVX-NEXT: ## %bb.1: ## %cond.load
; AVX-NEXT: movl (%rsi), %eax
; AVX-NEXT: LBB43_2: ## %else
; AVX-NEXT: retq
%vc = insertelement <1 x i1> undef, i1 %c, i32 0
%vp = bitcast i32* %p to <1 x i32>*
%L = call <1 x i32> @llvm.masked.load.v1i32.p0v1i32 (<1 x i32>* %vp, i32 4, <1 x i1> %vc, <1 x i32> undef)
%ret = bitcast <1 x i32> %L to i32
ret i32 %ret
}
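; An all-zero mask loads nothing - the pass-through value is returned unchanged.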
define <2 x double> @zero_mask(<2 x double>* %addr, <2 x double> %dst) {
; SSE-LABEL: zero_mask:
; SSE: ## %bb.0:
; SSE-NEXT: retq
;
; AVX-LABEL: zero_mask:
; AVX: ## %bb.0:
; AVX-NEXT: retq
%res = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %addr, i32 4, <2 x i1> zeroinitializer, <2 x double> %dst)
ret <2 x double> %res
}
declare <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>*, i32, <8 x i1>, <8 x double>)
declare <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>*, i32, <4 x i1>, <4 x double>)
declare <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>*, i32, <2 x i1>, <2 x double>)
declare <1 x double> @llvm.masked.load.v1f64.p0v1f64(<1 x double>*, i32, <1 x i1>, <1 x double>)
declare <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>*, i32, <16 x i1>, <16 x float>)
declare <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>*, i32, <8 x i1>, <8 x float>)
declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32, <4 x i1>, <4 x float>)
declare <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>*, i32, <2 x i1>, <2 x float>)
declare <8 x i64> @llvm.masked.load.v8i64.p0v8i64(<8 x i64>*, i32, <8 x i1>, <8 x i64>)
declare <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>*, i32, <4 x i1>, <4 x i64>)
declare <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>*, i32, <2 x i1>, <2 x i64>)
declare <1 x i64> @llvm.masked.load.v1i64.p0v1i64(<1 x i64>*, i32, <1 x i1>, <1 x i64>)
declare <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>*, i32, <16 x i1>, <16 x i32>)
declare <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>*, i32, <8 x i1>, <8 x i32>)
declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>)
declare <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>*, i32, <2 x i1>, <2 x i32>)
declare <1 x i32> @llvm.masked.load.v1i32.p0v1i32(<1 x i32>*, i32, <1 x i1>, <1 x i32>)
declare <32 x i16> @llvm.masked.load.v32i16.p0v32i16(<32 x i16>*, i32, <32 x i1>, <32 x i16>)
declare <16 x i16> @llvm.masked.load.v16i16.p0v16i16(<16 x i16>*, i32, <16 x i1>, <16 x i16>)
declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32, <8 x i1>, <8 x i16>)
declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32, <4 x i1>, <4 x i16>)
declare <64 x i8> @llvm.masked.load.v64i8.p0v64i8(<64 x i8>*, i32, <64 x i1>, <64 x i8>)
declare <32 x i8> @llvm.masked.load.v32i8.p0v32i8(<32 x i8>*, i32, <32 x i1>, <32 x i8>)
declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32, <16 x i1>, <16 x i8>)
declare <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>*, i32, <8 x i1>, <8 x i8>)