
[X86][AVX] Combine non-lane crossing binary shuffles using X86ISD::VPERMV3

Some of the combines might be further improved if we lower more shuffles with X86ISD::VPERMV3 directly, instead of waiting to combine the results.

llvm-svn: 359400
Simon Pilgrim 2019-04-28 14:31:01 +00:00
parent 7a1fbf2a34
commit 8a0d0867c4
24 changed files with 1329 additions and 677 deletions
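As a rough illustration of the effect (a minimal sketch distilled from the shuffle_v16i16_to_v8i16_1 test below; the function name is hypothetical), the following IR, compiled with -mattr=+avx512bw,+avx512vl, previously split the 256-bit load into two halves shuffled with vpshufb and recombined with vpunpcklqdq; the shuffle combiner now folds the whole chain into a single vpermi2w via X86ISD::VPERMV3:

; llc -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl
define <8 x i16> @odd_elts(<16 x i16>* %p) {
  %vec = load <16 x i16>, <16 x i16>* %p
  ; Pick the odd-index elements; after legalization this becomes a
  ; two-input v8i16 shuffle of the low and high halves of the load.
  %res = shufflevector <16 x i16> %vec, <16 x i16> undef,
                       <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  ret <8 x i16> %res
}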


@@ -31874,6 +31874,28 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
    }
  }

  // If we have a dual input shuffle then lower to VPERMV3.
  if (!UnaryShuffle && AllowVariableMask && !MaskContainsZeros &&
      ((Subtarget.hasAVX512() &&
        (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
         MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
       (Subtarget.hasVLX() &&
        (MaskVT == MVT::v2f64 || MaskVT == MVT::v2i64 || MaskVT == MVT::v4f64 ||
         MaskVT == MVT::v4i64 || MaskVT == MVT::v4f32 || MaskVT == MVT::v4i32 ||
         MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
       (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
       (Subtarget.hasBWI() && Subtarget.hasVLX() &&
        (MaskVT == MVT::v8i16 || MaskVT == MVT::v16i16)) ||
       (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
       (Subtarget.hasVBMI() && Subtarget.hasVLX() &&
        (MaskVT == MVT::v16i8 || MaskVT == MVT::v32i8)))) {
    SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
    V1 = DAG.getBitcast(MaskVT, V1);
    V2 = DAG.getBitcast(MaskVT, V2);
    Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, V1, VPermMask, V2);
    return DAG.getBitcast(RootVT, Res);
  }

  // Failed to find any combines.
  return SDValue();
}


@@ -150,11 +150,9 @@ define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask3(<16 x i16> %vec, <8 x
define <8 x i16> @test_16xi16_to_8xi16_perm_mem_mask0(<16 x i16>* %vp) {
; CHECK-LABEL: test_16xi16_to_8xi16_perm_mem_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa (%rdi), %xmm0
; CHECK-NEXT: vmovdqa 16(%rdi), %xmm1
; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,10,11,10,11,6,7,8,9,10,11,0,1,2,3]
; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,14,15,12,13,6,7,10,11,10,11,6,7,6,7]
; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6],xmm1[7]
; CHECK-NEXT: vmovdqa (%rdi), %xmm1
; CHECK-NEXT: vmovdqa {{.*#+}} xmm0 = [0,7,13,3,5,13,3,9]
; CHECK-NEXT: vpermi2w 16(%rdi), %xmm1, %xmm0
; CHECK-NEXT: retq
%vec = load <16 x i16>, <16 x i16>* %vp
%res = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9>
@@ -164,12 +162,10 @@ define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask0(<16 x i16>* %vp, <8
; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa (%rdi), %xmm2
; CHECK-NEXT: vmovdqa 16(%rdi), %xmm3
; CHECK-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[8,9,10,11,10,11,6,7,8,9,10,11,0,1,2,3]
; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,14,15,12,13,6,7,10,11,10,11,6,7,6,7]
; CHECK-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3,4],xmm3[5],xmm2[6],xmm3[7]
; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [0,7,13,3,5,13,3,9]
; CHECK-NEXT: vpermi2w 16(%rdi), %xmm2, %xmm3
; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT: vmovdqu16 %xmm2, %xmm0 {%k1}
; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1}
; CHECK-NEXT: retq
%vec = load <16 x i16>, <16 x i16>* %vp
%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9>
@@ -181,13 +177,11 @@ define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask0(<16 x i16>* %vp, <8
define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask0(<16 x i16>* %vp, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa (%rdi), %xmm1
; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2
; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9,10,11,10,11,6,7,8,9,10,11,0,1,2,3]
; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,14,15,12,13,6,7,10,11,10,11,6,7,6,7]
; CHECK-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6],xmm2[7]
; CHECK-NEXT: vmovdqa (%rdi), %xmm2
; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [0,7,13,3,5,13,3,9]
; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
; CHECK-NEXT: vmovdqu16 %xmm1, %xmm0 {%k1} {z}
; CHECK-NEXT: vpermi2w 16(%rdi), %xmm2, %xmm1 {%k1} {z}
; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: retq
%vec = load <16 x i16>, <16 x i16>* %vp
%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9>
@@ -200,12 +194,10 @@ define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask1(<16 x i16>* %vp, <8
; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask1:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa (%rdi), %xmm2
; CHECK-NEXT: vmovdqa 16(%rdi), %xmm3
; CHECK-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[8,9,14,15,8,9,14,15,0,1,2,3,0,1,12,13]
; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[6,7,14,15,4,5,14,15,2,3,10,11,0,1,2,3]
; CHECK-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3,4,5],xmm3[6,7]
; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [3,15,12,7,1,5,8,14]
; CHECK-NEXT: vpermi2w 16(%rdi), %xmm2, %xmm3
; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT: vmovdqu16 %xmm2, %xmm0 {%k1}
; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1}
; CHECK-NEXT: retq
%vec = load <16 x i16>, <16 x i16>* %vp
%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 3, i32 15, i32 12, i32 7, i32 1, i32 5, i32 8, i32 14>
@@ -217,13 +209,11 @@ define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask1(<16 x i16>* %vp, <8
define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask1(<16 x i16>* %vp, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask1:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa (%rdi), %xmm1
; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2
; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9,14,15,8,9,14,15,0,1,2,3,0,1,12,13]
; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,14,15,4,5,14,15,2,3,10,11,0,1,2,3]
; CHECK-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2],xmm1[3,4,5],xmm2[6,7]
; CHECK-NEXT: vmovdqa (%rdi), %xmm2
; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [3,15,12,7,1,5,8,14]
; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
; CHECK-NEXT: vmovdqu16 %xmm1, %xmm0 {%k1} {z}
; CHECK-NEXT: vpermi2w 16(%rdi), %xmm2, %xmm1 {%k1} {z}
; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: retq
%vec = load <16 x i16>, <16 x i16>* %vp
%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 3, i32 15, i32 12, i32 7, i32 1, i32 5, i32 8, i32 14>
@@ -237,10 +227,10 @@ define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask2(<16 x i16>* %vp, <8
; CHECK: # %bb.0:
; CHECK-NEXT: vpbroadcastw 2(%rdi), %xmm2
; CHECK-NEXT: vmovdqa 16(%rdi), %xmm3
; CHECK-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,0,1,6,7,0,1,10,11,0,1,14,15,2,3]
; CHECK-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3,4,5,6,7]
; CHECK-NEXT: vmovdqa {{.*#+}} xmm4 = [8,0,3,0,5,0,7,1]
; CHECK-NEXT: vpermi2w %xmm2, %xmm3, %xmm4
; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT: vmovdqu16 %xmm2, %xmm0 {%k1}
; CHECK-NEXT: vmovdqu16 %xmm4, %xmm0 {%k1}
; CHECK-NEXT: retq
%vec = load <16 x i16>, <16 x i16>* %vp
%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9>
@@ -252,12 +242,12 @@ define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask2(<16 x i16>* %vp, <8
define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask2(<16 x i16>* %vp, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask2:
; CHECK: # %bb.0:
; CHECK-NEXT: vpbroadcastw 2(%rdi), %xmm1
; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2
; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,0,1,6,7,0,1,10,11,0,1,14,15,2,3]
; CHECK-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
; CHECK-NEXT: vpbroadcastw 2(%rdi), %xmm2
; CHECK-NEXT: vmovdqa 16(%rdi), %xmm3
; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [8,0,3,0,5,0,7,1]
; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
; CHECK-NEXT: vmovdqu16 %xmm1, %xmm0 {%k1} {z}
; CHECK-NEXT: vpermi2w %xmm2, %xmm3, %xmm1 {%k1} {z}
; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: retq
%vec = load <16 x i16>, <16 x i16>* %vp
%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9>
@@ -269,9 +259,9 @@ define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask2(<16 x i16>* %vp,
define <8 x i16> @test_16xi16_to_8xi16_perm_mem_mask3(<16 x i16>* %vp) {
; CHECK-LABEL: test_16xi16_to_8xi16_perm_mem_mask3:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa (%rdi), %xmm0
; CHECK-NEXT: vpblendd {{.*#+}} xmm0 = mem[0],xmm0[1,2,3]
; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,14,15,2,3,12,13,2,3,8,9,6,7,4,5]
; CHECK-NEXT: vmovdqa (%rdi), %xmm1
; CHECK-NEXT: vmovdqa {{.*#+}} xmm0 = [9,7,9,6,9,4,3,2]
; CHECK-NEXT: vpermi2w 16(%rdi), %xmm1, %xmm0
; CHECK-NEXT: retq
%vec = load <16 x i16>, <16 x i16>* %vp
%res = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 9, i32 7, i32 9, i32 6, i32 9, i32 4, i32 3, i32 2>
@@ -281,10 +271,10 @@ define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask3(<16 x i16>* %vp, <8
; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask3:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa (%rdi), %xmm2
; CHECK-NEXT: vpblendd {{.*#+}} xmm2 = mem[0],xmm2[1,2,3]
; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,3,14,15,2,3,12,13,2,3,8,9,6,7,4,5]
; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [9,7,9,6,9,4,3,2]
; CHECK-NEXT: vpermi2w 16(%rdi), %xmm2, %xmm3
; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT: vmovdqu16 %xmm2, %xmm0 {%k1}
; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1}
; CHECK-NEXT: retq
%vec = load <16 x i16>, <16 x i16>* %vp
%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 9, i32 7, i32 9, i32 6, i32 9, i32 4, i32 3, i32 2>
@@ -296,11 +286,11 @@ define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask3(<16 x i16>* %vp, <8
define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask3(<16 x i16>* %vp, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask3:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa (%rdi), %xmm1
; CHECK-NEXT: vpblendd {{.*#+}} xmm1 = mem[0],xmm1[1,2,3]
; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,3,14,15,2,3,12,13,2,3,8,9,6,7,4,5]
; CHECK-NEXT: vmovdqa (%rdi), %xmm2
; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [9,7,9,6,9,4,3,2]
; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
; CHECK-NEXT: vmovdqu16 %xmm1, %xmm0 {%k1} {z}
; CHECK-NEXT: vpermi2w 16(%rdi), %xmm2, %xmm1 {%k1} {z}
; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: retq
%vec = load <16 x i16>, <16 x i16>* %vp
%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 9, i32 7, i32 9, i32 6, i32 9, i32 4, i32 3, i32 2>
@@ -1102,9 +1092,10 @@ define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask1(<8 x i32>* %vp, <4 x
; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mem_mask1:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa (%rdi), %xmm2
; CHECK-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],mem[1],xmm2[2,3]
; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [5,0,0,3]
; CHECK-NEXT: vpermi2d 16(%rdi), %xmm2, %xmm3
; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
; CHECK-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = xmm2[1,0,0,3]
; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1}
; CHECK-NEXT: retq
%vec = load <8 x i32>, <8 x i32>* %vp
%shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 0, i32 0, i32 3>
@@ -1116,10 +1107,11 @@ define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask1(<8 x i32>* %vp, <4 x
define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask1(<8 x i32>* %vp, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mem_mask1:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa (%rdi), %xmm1
; CHECK-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],mem[1],xmm1[2,3]
; CHECK-NEXT: vmovdqa (%rdi), %xmm2
; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [5,0,0,3]
; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
; CHECK-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm1[1,0,0,3]
; CHECK-NEXT: vpermi2d 16(%rdi), %xmm2, %xmm1 {%k1} {z}
; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: retq
%vec = load <8 x i32>, <8 x i32>* %vp
%shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 0, i32 0, i32 3>
@@ -1132,9 +1124,10 @@ define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask2(<8 x i32>* %vp, <4 x
; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mem_mask2:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2
; CHECK-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [0,7,7,0]
; CHECK-NEXT: vpermi2d (%rdi), %xmm2, %xmm3
; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
; CHECK-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = xmm2[0,3,3,0]
; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1}
; CHECK-NEXT: retq
%vec = load <8 x i32>, <8 x i32>* %vp
%shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 4, i32 3, i32 3, i32 4>
@@ -1146,10 +1139,11 @@ define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask2(<8 x i32>* %vp, <4 x
define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask2(<8 x i32>* %vp, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mem_mask2:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa 16(%rdi), %xmm1
; CHECK-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2
; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [0,7,7,0]
; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
; CHECK-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm1[0,3,3,0]
; CHECK-NEXT: vpermi2d (%rdi), %xmm2, %xmm1 {%k1} {z}
; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: retq
%vec = load <8 x i32>, <8 x i32>* %vp
%shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 4, i32 3, i32 3, i32 4>
@@ -1161,9 +1155,9 @@ define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask2(<8 x i32>* %vp, <4
define <4 x i32> @test_8xi32_to_4xi32_perm_mem_mask3(<8 x i32>* %vp) {
; CHECK-LABEL: test_8xi32_to_4xi32_perm_mem_mask3:
; CHECK: # %bb.0:
; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,1,2,3]
; CHECK-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
; CHECK-NEXT: vpbroadcastq 8(%rdi), %xmm1
; CHECK-NEXT: vmovdqa {{.*#+}} xmm0 = [5,1,2,7]
; CHECK-NEXT: vpermi2d 16(%rdi), %xmm1, %xmm0
; CHECK-NEXT: retq
%vec = load <8 x i32>, <8 x i32>* %vp
%res = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 3, i32 2, i32 7>
@@ -1172,11 +1166,11 @@ define <4 x i32> @test_8xi32_to_4xi32_perm_mem_mask3(<8 x i32>* %vp) {
define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask3(<8 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mem_mask3:
; CHECK: # %bb.0:
; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = mem[1,1,2,3]
; CHECK-NEXT: vpbroadcastq 8(%rdi), %xmm3
; CHECK-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3]
; CHECK-NEXT: vpbroadcastq 8(%rdi), %xmm2
; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [5,1,2,7]
; CHECK-NEXT: vpermi2d 16(%rdi), %xmm2, %xmm3
; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
; CHECK-NEXT: vmovdqa32 %xmm2, %xmm0 {%k1}
; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1}
; CHECK-NEXT: retq
%vec = load <8 x i32>, <8 x i32>* %vp
%shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 3, i32 2, i32 7>
@@ -1188,11 +1182,11 @@ define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask3(<8 x i32>* %vp, <4 x i32> %mask) {
define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask3(<8 x i32>* %vp, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mem_mask3:
; CHECK: # %bb.0:
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = mem[1,1,2,3]
; CHECK-NEXT: vpbroadcastq 8(%rdi), %xmm2
; CHECK-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1,2],xmm1[3]
; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [5,1,2,7]
; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
; CHECK-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1} {z}
; CHECK-NEXT: vpermi2d 16(%rdi), %xmm2, %xmm1 {%k1} {z}
; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: retq
%vec = load <8 x i32>, <8 x i32>* %vp
%shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 3, i32 2, i32 7>
@@ -1811,12 +1805,12 @@ define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask3(<16 x i32>* %vp,
define <4 x i32> @test_16xi32_to_4xi32_perm_mask9(<16 x i32> %vec) {
; CHECK-LABEL: test_16xi32_to_4xi32_perm_mask9:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [4,1,0,2]
; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2
; CHECK-NEXT: vpermps %ymm2, %ymm1, %ymm1
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
; CHECK-NEXT: vbroadcastss %xmm0, %xmm0
; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3]
; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [4,1,0,2]
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; CHECK-NEXT: vpermd %ymm2, %ymm1, %ymm1
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm2
; CHECK-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,4,3]
; CHECK-NEXT: vpermi2d %xmm2, %xmm1, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%res = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 12, i32 9, i32 4, i32 10>
@@ -2785,10 +2779,9 @@ define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mask3(<8 x float> %vec
define <4 x float> @test_8xfloat_to_4xfloat_perm_mem_mask0(<8 x float>* %vp) {
; CHECK-LABEL: test_8xfloat_to_4xfloat_perm_mem_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps (%rdi), %xmm0
; CHECK-NEXT: vmovaps 16(%rdi), %xmm1
; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,0]
; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,1]
; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [2,6,0,1]
; CHECK-NEXT: vpermi2ps (%rdi), %xmm1, %xmm0
; CHECK-NEXT: retq
%vec = load <8 x float>, <8 x float>* %vp
%res = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 6, i32 2, i32 4, i32 5>
@@ -2797,12 +2790,12 @@ define <4 x float> @test_8xfloat_to_4xfloat_perm_mem_mask0(<8 x float>* %vp) {
define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask0(<8 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps (%rdi), %xmm2
; CHECK-NEXT: vmovaps 16(%rdi), %xmm3
; CHECK-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,0],xmm3[2,0]
; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4
; CHECK-NEXT: vcmpeqps %xmm4, %xmm1, %k1
; CHECK-NEXT: vshufps {{.*#+}} xmm0 {%k1} = xmm2[2,0],xmm3[0,1]
; CHECK-NEXT: vmovaps 16(%rdi), %xmm2
; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [2,6,0,1]
; CHECK-NEXT: vpermi2ps (%rdi), %xmm2, %xmm3
; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1}
; CHECK-NEXT: retq
%vec = load <8 x float>, <8 x float>* %vp
%shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 6, i32 2, i32 4, i32 5>
@@ -2814,12 +2807,12 @@ define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask0(<8 x float>* %
define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask0(<8 x float>* %vp, <4 x float> %mask) {
; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps (%rdi), %xmm1
; CHECK-NEXT: vmovaps 16(%rdi), %xmm2
; CHECK-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[2,0]
; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [2,6,0,1]
; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1
; CHECK-NEXT: vshufps {{.*#+}} xmm0 {%k1} {z} = xmm1[2,0],xmm2[0,1]
; CHECK-NEXT: vpermi2ps (%rdi), %xmm2, %xmm1 {%k1} {z}
; CHECK-NEXT: vmovaps %xmm1, %xmm0
; CHECK-NEXT: retq
%vec = load <8 x float>, <8 x float>* %vp
%shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 6, i32 2, i32 4, i32 5>
@@ -2832,10 +2825,11 @@ define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask1(<8 x float>* %
; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask1:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps 16(%rdi), %xmm2
; CHECK-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1,2],mem[3]
; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1
; CHECK-NEXT: vpermilps {{.*#+}} xmm0 {%k1} = xmm2[2,3,3,2]
; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [2,7,7,2]
; CHECK-NEXT: vpermi2ps (%rdi), %xmm2, %xmm3
; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1}
; CHECK-NEXT: retq
%vec = load <8 x float>, <8 x float>* %vp
%shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 6, i32 3, i32 3, i32 6>
@@ -2847,11 +2841,12 @@ define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask1(<8 x float>* %
define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask1(<8 x float>* %vp, <4 x float> %mask) {
; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask1:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps 16(%rdi), %xmm1
; CHECK-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],mem[3]
; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vcmpeqps %xmm2, %xmm0, %k1
; CHECK-NEXT: vpermilps {{.*#+}} xmm0 {%k1} {z} = xmm1[2,3,3,2]
; CHECK-NEXT: vmovaps 16(%rdi), %xmm2
; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [2,7,7,2]
; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1
; CHECK-NEXT: vpermi2ps (%rdi), %xmm2, %xmm1 {%k1} {z}
; CHECK-NEXT: vmovaps %xmm1, %xmm0
; CHECK-NEXT: retq
%vec = load <8 x float>, <8 x float>* %vp
%shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 6, i32 3, i32 3, i32 6>
@@ -2864,11 +2859,11 @@ define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask2(<8 x float>* %
; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask2:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps (%rdi), %xmm2
; CHECK-NEXT: vmovaps 16(%rdi), %xmm3
; CHECK-NEXT: vshufps {{.*#+}} xmm3 = xmm3[3,0],xmm2[3,0]
; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4
; CHECK-NEXT: vcmpeqps %xmm4, %xmm1, %k1
; CHECK-NEXT: vshufps {{.*#+}} xmm0 {%k1} = xmm2[3,1],xmm3[2,0]
; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [3,1,3,7]
; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm2, %xmm3
; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1}
; CHECK-NEXT: retq
%vec = load <8 x float>, <8 x float>* %vp
%shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 1, i32 3, i32 7>
@@ -2880,12 +2875,12 @@ define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask2(<8 x float>* %
define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask2(<8 x float>* %vp, <4 x float> %mask) {
; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask2:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps (%rdi), %xmm1
; CHECK-NEXT: vmovaps 16(%rdi), %xmm2
; CHECK-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,0],xmm1[3,0]
; CHECK-NEXT: vmovaps (%rdi), %xmm2
; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [3,1,3,7]
; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1
; CHECK-NEXT: vshufps {{.*#+}} xmm0 {%k1} {z} = xmm1[3,1],xmm2[2,0]
; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm2, %xmm1 {%k1} {z}
; CHECK-NEXT: vmovaps %xmm1, %xmm0
; CHECK-NEXT: retq
%vec = load <8 x float>, <8 x float>* %vp
%shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 1, i32 3, i32 7>
@@ -2897,10 +2892,9 @@ define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask2(<8 x float>*
define <4 x float> @test_8xfloat_to_4xfloat_perm_mem_mask3(<8 x float>* %vp) {
; CHECK-LABEL: test_8xfloat_to_4xfloat_perm_mem_mask3:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps (%rdi), %xmm0
; CHECK-NEXT: vmovaps 16(%rdi), %xmm1
; CHECK-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[3,0]
; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[0,2]
; CHECK-NEXT: vmovaps (%rdi), %xmm1
; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [1,3,5,3]
; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm1, %xmm0
; CHECK-NEXT: retq
%vec = load <8 x float>, <8 x float>* %vp
%res = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 3>
@@ -2910,11 +2904,11 @@ define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask3(<8 x float>* %
; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask3:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps (%rdi), %xmm2
; CHECK-NEXT: vmovaps 16(%rdi), %xmm3
; CHECK-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,0],xmm2[3,0]
; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4
; CHECK-NEXT: vcmpeqps %xmm4, %xmm1, %k1
; CHECK-NEXT: vshufps {{.*#+}} xmm0 {%k1} = xmm2[1,3],xmm3[0,2]
; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [1,3,5,3]
; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm2, %xmm3
; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1}
; CHECK-NEXT: retq
%vec = load <8 x float>, <8 x float>* %vp
%shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 3>
@@ -2926,12 +2920,12 @@ define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask3(<8 x float>* %
define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask3(<8 x float>* %vp, <4 x float> %mask) {
; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask3:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps (%rdi), %xmm1
; CHECK-NEXT: vmovaps 16(%rdi), %xmm2
; CHECK-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,0],xmm1[3,0]
; CHECK-NEXT: vmovaps (%rdi), %xmm2
; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [1,3,5,3]
; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1
; CHECK-NEXT: vshufps {{.*#+}} xmm0 {%k1} {z} = xmm1[1,3],xmm2[0,2]
; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm2, %xmm1 {%k1} {z}
; CHECK-NEXT: vmovaps %xmm1, %xmm0
; CHECK-NEXT: retq
%vec = load <8 x float>, <8 x float>* %vp
%shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 3>
@@ -3136,11 +3130,11 @@ define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask1(<16 x float> %vec
; CHECK: # %bb.0:
; CHECK-NEXT: vextractf32x4 $2, %zmm0, %xmm3
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,2]
; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2],xmm0[3]
; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
; CHECK-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: vmovaps {{.*#+}} xmm4 = [0,6,2,6]
; CHECK-NEXT: vpermi2ps %xmm0, %xmm3, %xmm4
; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vcmpeqps %xmm0, %xmm2, %k1
; CHECK-NEXT: vblendmps %xmm4, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 8, i32 6, i32 10, i32 6>
@@ -3153,12 +3147,11 @@ define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask1(<16 x float> %v
; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask1:
; CHECK: # %bb.0:
; CHECK-NEXT: vextractf32x4 $2, %zmm0, %xmm2
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,2]
; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
; CHECK-NEXT: vmovaps %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm3
; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [0,6,2,6]
; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4
; CHECK-NEXT: vcmpeqps %xmm4, %xmm1, %k1
; CHECK-NEXT: vpermi2ps %xmm3, %xmm2, %xmm0 {%k1} {z}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 8, i32 6, i32 10, i32 6>
@@ -3203,9 +3196,9 @@ define <4 x float> @test_16xfloat_to_4xfloat_perm_mask3(<16 x float> %vec) {
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm1
; CHECK-NEXT: vextractf32x4 $2, %zmm0, %xmm0
; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,3,3]
; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; CHECK-NEXT: vextractf32x4 $2, %zmm0, %xmm2
; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [2,5,3,7]
; CHECK-NEXT: vpermi2ps %xmm1, %xmm2, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%res = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 10, i32 2, i32 11, i32 6>
@@ -3217,11 +3210,11 @@ define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask3(<16 x float> %vec
; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [0,2,4,6,4,6,6,7]
; CHECK-NEXT: vpermps %ymm0, %ymm3, %ymm3
; CHECK-NEXT: vextractf32x4 $2, %zmm0, %xmm0
; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,3,3]
; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3]
; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
; CHECK-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: vmovaps {{.*#+}} xmm4 = [2,5,3,7]
; CHECK-NEXT: vpermi2ps %xmm3, %xmm0, %xmm4
; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vcmpeqps %xmm0, %xmm2, %k1
; CHECK-NEXT: vblendmps %xmm4, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 10, i32 2, i32 11, i32 6>
@@ -3235,12 +3228,11 @@ define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask3(<16 x float> %v
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
; CHECK-NEXT: vpermps %ymm0, %ymm2, %ymm2
; CHECK-NEXT: vextractf32x4 $2, %zmm0, %xmm0
; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,3,3]
; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
; CHECK-NEXT: vmovaps %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: vextractf32x4 $2, %zmm0, %xmm3
; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [2,5,3,7]
; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4
; CHECK-NEXT: vcmpeqps %xmm4, %xmm1, %k1
; CHECK-NEXT: vpermi2ps %xmm2, %xmm3, %xmm0 {%k1} {z}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 10, i32 2, i32 11, i32 6>
@@ -3409,9 +3401,9 @@ define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask3(<16 x float
define <4 x float> @test_16xfloat_to_4xfloat_perm_mem_mask0(<16 x float>* %vp) {
; CHECK-LABEL: test_16xfloat_to_4xfloat_perm_mem_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,2,3,3]
; CHECK-NEXT: vpermpd {{.*#+}} ymm1 = mem[3,1,2,3]
; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [0,6,7,3]
; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm1, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%vec = load <16 x float>, <16 x float>* %vp
@@ -3421,12 +3413,12 @@ define <4 x float> @test_16xfloat_to_4xfloat_perm_mem_mask0(<16 x float>* %vp) {
define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask0(<16 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vpermilps {{.*#+}} xmm2 = mem[0,2,3,3]
; CHECK-NEXT: vpermpd {{.*#+}} ymm3 = mem[3,1,2,3]
; CHECK-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0],xmm2[1,2],xmm3[3]
; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1
; CHECK-NEXT: vmovaps %xmm2, %xmm0 {%k1}
; CHECK-NEXT: vpermpd {{.*#+}} ymm2 = mem[3,1,2,3]
; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [0,6,7,3]
; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm2, %xmm3
; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%vec = load <16 x float>, <16 x float>* %vp
@@ -3439,12 +3431,12 @@ define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask0(<16 x float>*
define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask0(<16 x float>* %vp, <4 x float> %mask) {
; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vpermilps {{.*#+}} xmm1 = mem[0,2,3,3]
; CHECK-NEXT: vpermpd {{.*#+}} ymm2 = mem[3,1,2,3]
; CHECK-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3]
; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vcmpeqps %xmm2, %xmm0, %k1
; CHECK-NEXT: vmovaps %xmm1, %xmm0 {%k1} {z}
; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [0,6,7,3]
; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1
; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm2, %xmm1 {%k1} {z}
; CHECK-NEXT: vmovaps %xmm1, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%vec = load <16 x float>, <16 x float>* %vp


@@ -296,13 +296,20 @@ define <16 x i16> @insert_v16i16_x12345x789ABCDEx(<16 x i16> %a) {
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: retq
;
; AVX512-LABEL: insert_v16i16_x12345x789ABCDEx:
; AVX512: # %bb.0:
; AVX512-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2,3,4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10,11,12,13],ymm1[14],ymm0[15]
; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15]
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; AVX512-NEXT: retq
; AVX512F-LABEL: insert_v16i16_x12345x789ABCDEx:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX512F-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2,3,4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10,11,12,13],ymm1[14],ymm0[15]
; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15]
; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: insert_v16i16_x12345x789ABCDEx:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [16,1,2,3,4,5,22,7,8,9,10,11,12,13,14,31]
; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
; AVX512VL-NEXT: retq
%1 = insertelement <16 x i16> %a, i16 -1, i32 0
%2 = insertelement <16 x i16> %1, i16 -1, i32 6
%3 = insertelement <16 x i16> %2, i16 -1, i32 15


@@ -47,16 +47,46 @@ define void @shuffle_v16i16_to_v8i16_1(<16 x i16>* %L, <8 x i16>* %S) nounwind {
; AVX-NEXT: vmovdqa %xmm0, (%rsi)
; AVX-NEXT: retq
;
; AVX512-LABEL: shuffle_v16i16_to_v8i16_1:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512-NEXT: retq
; AVX512F-LABEL: shuffle_v16i16_to_v8i16_1:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_to_v8i16_1:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v16i16_to_v8i16_1:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v16i16_to_v8i16_1:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm1 = [1,3,5,7,9,11,13,15]
; AVX512BWVL-NEXT: vpermi2w 16(%rdi), %xmm0, %xmm1
; AVX512BWVL-NEXT: vmovdqa %xmm1, (%rsi)
; AVX512BWVL-NEXT: retq
%vec = load <16 x i16>, <16 x i16>* %L
%strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
store <8 x i16> %strided.vec, <8 x i16>* %S
@@ -192,12 +222,9 @@ define void @shuffle_v32i8_to_v8i8_2(<32 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX512BWVL-LABEL: shuffle_v32i8_to_v8i8_2:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi)
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm1 = [1,3,5,7,9,11,13,15]
; AVX512BWVL-NEXT: vpermi2w 16(%rdi), %xmm0, %xmm1
; AVX512BWVL-NEXT: vpmovwb %xmm1, (%rsi)
; AVX512BWVL-NEXT: retq
%vec = load <32 x i8>, <32 x i8>* %L
%strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
@@ -333,12 +360,9 @@ define void @shuffle_v16i16_to_v4i16_1(<16 x i16>* %L, <4 x i16>* %S) nounwind {
; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,2,3,10,11,10,11,8,9,10,11,12,13,14,15]
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT: vpmovdw %xmm0, (%rsi)
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm1 = [1,1,5,5,9,9,13,13]
; AVX512BWVL-NEXT: vpermi2w 16(%rdi), %xmm0, %xmm1
; AVX512BWVL-NEXT: vpmovdw %xmm1, (%rsi)
; AVX512BWVL-NEXT: retq
%vec = load <16 x i16>, <16 x i16>* %L
%strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
@@ -485,12 +509,9 @@ define void @shuffle_v16i16_to_v4i16_3(<16 x i16>* %L, <4 x i16>* %S) nounwind {
; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,14,15,14,15,6,7,8,9,10,11,12,13,14,15]
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT: vpmovdw %xmm0, (%rsi)
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm1 = [3,7,7,3,11,15,15,11]
; AVX512BWVL-NEXT: vpermi2w 16(%rdi), %xmm0, %xmm1
; AVX512BWVL-NEXT: vpmovdw %xmm1, (%rsi)
; AVX512BWVL-NEXT: retq
%vec = load <16 x i16>, <16 x i16>* %L
%strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
@@ -607,12 +628,9 @@ define void @shuffle_v32i8_to_v4i8_2(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_2:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,2,3,10,11,10,11,8,9,10,11,12,13,14,15]
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi)
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm1 = [1,1,5,5,9,9,13,13]
; AVX512BWVL-NEXT: vpermi2w 16(%rdi), %xmm0, %xmm1
; AVX512BWVL-NEXT: vpmovdb %xmm1, (%rsi)
; AVX512BWVL-NEXT: retq
%vec = load <32 x i8>, <32 x i8>* %L
%strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 2, i32 10, i32 18, i32 26>
@@ -843,12 +861,9 @@ define void @shuffle_v32i8_to_v4i8_6(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_6:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,14,15,14,15,6,7,8,9,10,11,12,13,14,15]
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi)
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm1 = [3,7,7,3,11,15,15,11]
; AVX512BWVL-NEXT: vpermi2w 16(%rdi), %xmm0, %xmm1
; AVX512BWVL-NEXT: vpmovdb %xmm1, (%rsi)
; AVX512BWVL-NEXT: retq
%vec = load <32 x i8>, <32 x i8>* %L
%strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 6, i32 14, i32 22, i32 30>


@@ -24,16 +24,57 @@ define void @shuffle_v32i8_to_v16i8(<32 x i8>* %L, <16 x i8>* %S) nounwind {
; AVX-NEXT: vmovdqa %xmm0, (%rsi)
; AVX-NEXT: retq
;
; AVX512-LABEL: shuffle_v32i8_to_v16i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512-NEXT: retq
; AVX512F-LABEL: shuffle_v32i8_to_v16i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v16i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v16i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v16i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512BWVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: shuffle_v32i8_to_v16i8:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
; AVX512VBMIVL-NEXT: vpermi2b 16(%rdi), %xmm0, %xmm1
; AVX512VBMIVL-NEXT: vmovdqa %xmm1, (%rsi)
; AVX512VBMIVL-NEXT: retq
%vec = load <32 x i8>, <32 x i8>* %L
%strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
store <16 x i8> %strided.vec, <16 x i8>* %S
@@ -115,16 +156,54 @@ define void @shuffle_v16i16_to_v8i16(<16 x i16>* %L, <8 x i16>* %S) nounwind {
; AVX-NEXT: vmovdqa %xmm0, (%rsi)
; AVX-NEXT: retq
;
; AVX512-LABEL: shuffle_v16i16_to_v8i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512-NEXT: retq
; AVX512F-LABEL: shuffle_v16i16_to_v8i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_to_v8i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v16i16_to_v8i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v16i16_to_v8i16:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,2,4,6,8,10,12,14]
; AVX512BWVL-NEXT: vpermi2w 16(%rdi), %xmm0, %xmm1
; AVX512BWVL-NEXT: vmovdqa %xmm1, (%rsi)
; AVX512BWVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: shuffle_v16i16_to_v8i16:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,2,4,6,8,10,12,14]
; AVX512VBMIVL-NEXT: vpermi2w 16(%rdi), %xmm0, %xmm1
; AVX512VBMIVL-NEXT: vmovdqa %xmm1, (%rsi)
; AVX512VBMIVL-NEXT: retq
%vec = load <16 x i16>, <16 x i16>* %L
%strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
store <8 x i16> %strided.vec, <8 x i16>* %S
@@ -293,16 +372,57 @@ define void @shuffle_v32i8_to_v8i8(<32 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX-NEXT: vmovq %xmm0, (%rsi)
; AVX-NEXT: retq
;
; AVX512-LABEL: shuffle_v32i8_to_v8i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512-NEXT: vmovq %xmm0, (%rsi)
; AVX512-NEXT: retq
; AVX512F-LABEL: shuffle_v32i8_to_v8i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512F-NEXT: vmovq %xmm0, (%rsi)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v8i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512VL-NEXT: vmovq %xmm0, (%rsi)
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v8i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v8i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512BWVL-NEXT: vmovq %xmm0, (%rsi)
; AVX512BWVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: shuffle_v32i8_to_v8i8:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512VBMIVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2024390091656922112,2024390091656922112]
; AVX512VBMIVL-NEXT: vpermi2b 16(%rdi), %xmm0, %xmm1
; AVX512VBMIVL-NEXT: vmovq %xmm1, (%rsi)
; AVX512VBMIVL-NEXT: retq
%vec = load <32 x i8>, <32 x i8>* %L
%strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
store <8 x i8> %strided.vec, <8 x i8>* %S
@@ -1038,23 +1158,17 @@ define void @shuffle_v16i16_to_v4i16(<16 x i16>* %L, <4 x i16>* %S) nounwind {
; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512BWVL-NEXT: vmovq %xmm0, (%rsi)
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,4,8,12,4,5,12,13]
; AVX512BWVL-NEXT: vpermi2w 16(%rdi), %xmm0, %xmm1
; AVX512BWVL-NEXT: vmovq %xmm1, (%rsi)
; AVX512BWVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: shuffle_v16i16_to_v4i16:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512VBMIVL-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
; AVX512VBMIVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512VBMIVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512VBMIVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512VBMIVL-NEXT: vmovq %xmm0, (%rsi)
; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,4,8,12,4,5,12,13]
; AVX512VBMIVL-NEXT: vpermi2w 16(%rdi), %xmm0, %xmm1
; AVX512VBMIVL-NEXT: vmovq %xmm1, (%rsi)
; AVX512VBMIVL-NEXT: retq
%vec = load <16 x i16>, <16 x i16>* %L
%strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
@@ -1149,16 +1263,57 @@ define void @shuffle_v32i8_to_v4i8(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX-NEXT: vmovd %xmm0, (%rsi)
; AVX-NEXT: retq
;
; AVX512-LABEL: shuffle_v32i8_to_v4i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512-NEXT: vmovd %xmm0, (%rsi)
; AVX512-NEXT: retq
; AVX512F-LABEL: shuffle_v32i8_to_v4i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512F-NEXT: vmovd %xmm0, (%rsi)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v4i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512VL-NEXT: vmovd %xmm0, (%rsi)
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v4i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512BWVL-NEXT: vmovd %xmm0, (%rsi)
; AVX512BWVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: shuffle_v32i8_to_v4i8:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512VBMIVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [403703808,403703808,403703808,403703808]
; AVX512VBMIVL-NEXT: vpermi2b 16(%rdi), %xmm0, %xmm1
; AVX512VBMIVL-NEXT: vmovd %xmm1, (%rsi)
; AVX512VBMIVL-NEXT: retq
%vec = load <32 x i8>, <32 x i8>* %L
%strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 0, i32 8, i32 16, i32 24>
store <4 x i8> %strided.vec, <4 x i8>* %S


@@ -24,16 +24,57 @@ define void @shuffle_v32i8_to_v16i8(<32 x i8>* %L, <16 x i8>* %S) nounwind {
; AVX-NEXT: vmovdqa %xmm0, (%rsi)
; AVX-NEXT: retq
;
; AVX512-LABEL: shuffle_v32i8_to_v16i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512-NEXT: retq
; AVX512F-LABEL: shuffle_v32i8_to_v16i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v16i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v16i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v16i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512BWVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: shuffle_v32i8_to_v16i8:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
; AVX512VBMIVL-NEXT: vpermi2b 16(%rdi), %xmm0, %xmm1
; AVX512VBMIVL-NEXT: vmovdqa %xmm1, (%rsi)
; AVX512VBMIVL-NEXT: retq
%vec = load <32 x i8>, <32 x i8>* %L
%strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
store <16 x i8> %strided.vec, <16 x i8>* %S
@@ -115,16 +156,54 @@ define void @shuffle_v16i16_to_v8i16(<16 x i16>* %L, <8 x i16>* %S) nounwind {
; AVX-NEXT: vmovdqa %xmm0, (%rsi)
; AVX-NEXT: retq
;
; AVX512-LABEL: shuffle_v16i16_to_v8i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512-NEXT: retq
; AVX512F-LABEL: shuffle_v16i16_to_v8i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_to_v8i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v16i16_to_v8i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v16i16_to_v8i16:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,2,4,6,8,10,12,14]
; AVX512BWVL-NEXT: vpermi2w 16(%rdi), %xmm0, %xmm1
; AVX512BWVL-NEXT: vmovdqa %xmm1, (%rsi)
; AVX512BWVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: shuffle_v16i16_to_v8i16:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,2,4,6,8,10,12,14]
; AVX512VBMIVL-NEXT: vpermi2w 16(%rdi), %xmm0, %xmm1
; AVX512VBMIVL-NEXT: vmovdqa %xmm1, (%rsi)
; AVX512VBMIVL-NEXT: retq
%vec = load <16 x i16>, <16 x i16>* %L
%strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
store <8 x i16> %strided.vec, <8 x i16>* %S
@ -329,23 +408,17 @@ define void @shuffle_v32i8_to_v8i8(<32 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX512BWVL-LABEL: shuffle_v32i8_to_v8i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi)
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,2,4,6,8,10,12,14]
; AVX512BWVL-NEXT: vpermi2w 16(%rdi), %xmm0, %xmm1
; AVX512BWVL-NEXT: vpmovwb %xmm1, (%rsi)
; AVX512BWVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: shuffle_v32i8_to_v8i8:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512VBMIVL-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX512VBMIVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512VBMIVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512VBMIVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VBMIVL-NEXT: vpmovwb %xmm0, (%rsi)
; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,2,4,6,8,10,12,14]
; AVX512VBMIVL-NEXT: vpermi2w 16(%rdi), %xmm0, %xmm1
; AVX512VBMIVL-NEXT: vpmovwb %xmm1, (%rsi)
; AVX512VBMIVL-NEXT: retq
%vec = load <32 x i8>, <32 x i8>* %L
%strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
store <8 x i8> %strided.vec, <8 x i8>* %S

@ -855,11 +855,11 @@ define <4 x double> @PR34175(<32 x i16>* %p) {
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqu (%rdi), %xmm0
; AVX512VL-NEXT: vmovdqu 32(%rdi), %xmm1
; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; AVX512VL-NEXT: vpbroadcastd %xmm1, %xmm1
; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,4,2,3]
; AVX512VL-NEXT: vpermi2d %xmm1, %xmm0, %xmm2
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; AVX512VL-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX512VL-NEXT: retq
;

@ -846,11 +846,11 @@ define <4 x double> @PR34175(<32 x i16>* %p) {
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqu (%rdi), %xmm0
; AVX512VL-NEXT: vmovdqu 32(%rdi), %xmm1
; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; AVX512VL-NEXT: vpbroadcastd %xmm1, %xmm1
; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,4,2,3]
; AVX512VL-NEXT: vpermi2d %xmm1, %xmm0, %xmm2
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; AVX512VL-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX512VL-NEXT: retq
;

@ -407,15 +407,15 @@ define <3 x i32> @smulo_v3i32(<3 x i32> %a0, <3 x i32> %a1, <3 x i32>* %p2) noun
;
; AVX512-LABEL: smulo_v3i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; AVX512-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
; AVX512-NEXT: vpmuldq %xmm1, %xmm0, %xmm3
; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3]
; AVX512-NEXT: vpmuldq %xmm1, %xmm0, %xmm2
; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
; AVX512-NEXT: vpmuldq %xmm3, %xmm4, %xmm3
; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = [1,5,3,7]
; AVX512-NEXT: vpermi2d %xmm3, %xmm2, %xmm4
; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm1
; AVX512-NEXT: vpsrad $31, %xmm1, %xmm0
; AVX512-NEXT: vpcmpneqd %xmm0, %xmm2, %k1
; AVX512-NEXT: vpcmpneqd %xmm0, %xmm4, %k1
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT: vpextrd $2, %xmm1, 8(%rdi)
@ -537,15 +537,15 @@ define <4 x i32> @smulo_v4i32(<4 x i32> %a0, <4 x i32> %a1, <4 x i32>* %p2) noun
;
; AVX512-LABEL: smulo_v4i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; AVX512-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
; AVX512-NEXT: vpmuldq %xmm1, %xmm0, %xmm3
; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3]
; AVX512-NEXT: vpmuldq %xmm1, %xmm0, %xmm2
; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
; AVX512-NEXT: vpmuldq %xmm3, %xmm4, %xmm3
; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = [1,5,3,7]
; AVX512-NEXT: vpermi2d %xmm3, %xmm2, %xmm4
; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm1
; AVX512-NEXT: vpsrad $31, %xmm1, %xmm0
; AVX512-NEXT: vpcmpneqd %xmm0, %xmm2, %k1
; AVX512-NEXT: vpcmpneqd %xmm0, %xmm4, %k1
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT: vmovdqa %xmm1, (%rdi)
@ -796,15 +796,15 @@ define <6 x i32> @smulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, <6 x i32>* %p2) noun
;
; AVX512-LABEL: smulo_v6i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7]
; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7]
; AVX512-NEXT: vpmuldq %ymm2, %ymm3, %ymm2
; AVX512-NEXT: vpmuldq %ymm1, %ymm0, %ymm3
; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,1,3,3,5,5,7,7]
; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7]
; AVX512-NEXT: vpmuldq %ymm1, %ymm0, %ymm2
; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[1,1,3,3,5,5,7,7]
; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm0[1,1,3,3,5,5,7,7]
; AVX512-NEXT: vpmuldq %ymm3, %ymm4, %ymm3
; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [1,9,3,11,5,13,7,15]
; AVX512-NEXT: vpermi2d %ymm3, %ymm2, %ymm4
; AVX512-NEXT: vpmulld %ymm1, %ymm0, %ymm1
; AVX512-NEXT: vpsrad $31, %ymm1, %ymm0
; AVX512-NEXT: vpcmpneqd %ymm0, %ymm2, %k1
; AVX512-NEXT: vpcmpneqd %ymm0, %ymm4, %k1
; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2
@ -995,15 +995,15 @@ define <8 x i32> @smulo_v8i32(<8 x i32> %a0, <8 x i32> %a1, <8 x i32>* %p2) noun
;
; AVX512-LABEL: smulo_v8i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7]
; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7]
; AVX512-NEXT: vpmuldq %ymm2, %ymm3, %ymm2
; AVX512-NEXT: vpmuldq %ymm1, %ymm0, %ymm3
; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,1,3,3,5,5,7,7]
; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7]
; AVX512-NEXT: vpmuldq %ymm1, %ymm0, %ymm2
; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[1,1,3,3,5,5,7,7]
; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm0[1,1,3,3,5,5,7,7]
; AVX512-NEXT: vpmuldq %ymm3, %ymm4, %ymm3
; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [1,9,3,11,5,13,7,15]
; AVX512-NEXT: vpermi2d %ymm3, %ymm2, %ymm4
; AVX512-NEXT: vpmulld %ymm1, %ymm0, %ymm1
; AVX512-NEXT: vpsrad $31, %ymm1, %ymm0
; AVX512-NEXT: vpcmpneqd %ymm0, %ymm2, %k1
; AVX512-NEXT: vpcmpneqd %ymm0, %ymm4, %k1
; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512-NEXT: vmovdqa %ymm1, (%rdi)
@ -2103,19 +2103,19 @@ define <4 x i32> @smulo_v4i24(<4 x i24> %a0, <4 x i24> %a1, <4 x i24>* %p2) noun
;
; AVX512-LABEL: smulo_v4i24:
; AVX512: # %bb.0:
; AVX512-NEXT: vpslld $8, %xmm0, %xmm0
; AVX512-NEXT: vpsrad $8, %xmm0, %xmm0
; AVX512-NEXT: vpslld $8, %xmm1, %xmm1
; AVX512-NEXT: vpsrad $8, %xmm1, %xmm1
; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; AVX512-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
; AVX512-NEXT: vpmuldq %xmm1, %xmm0, %xmm3
; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3]
; AVX512-NEXT: vpslld $8, %xmm0, %xmm0
; AVX512-NEXT: vpsrad $8, %xmm0, %xmm0
; AVX512-NEXT: vpmuldq %xmm1, %xmm0, %xmm2
; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
; AVX512-NEXT: vpmuldq %xmm3, %xmm4, %xmm3
; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = [1,5,3,7]
; AVX512-NEXT: vpermi2d %xmm3, %xmm2, %xmm4
; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm1
; AVX512-NEXT: vpsrad $31, %xmm1, %xmm0
; AVX512-NEXT: vpcmpneqd %xmm0, %xmm2, %k0
; AVX512-NEXT: vpcmpneqd %xmm0, %xmm4, %k0
; AVX512-NEXT: vpslld $8, %xmm1, %xmm0
; AVX512-NEXT: vpsrad $8, %xmm0, %xmm0
; AVX512-NEXT: vpcmpneqd %xmm1, %xmm0, %k1

@ -367,13 +367,13 @@ define <3 x i32> @umulo_v3i32(<3 x i32> %a0, <3 x i32> %a1, <3 x i32>* %p2) noun
;
; AVX512-LABEL: umulo_v3i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; AVX512-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
; AVX512-NEXT: vpmuludq %xmm1, %xmm0, %xmm3
; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3]
; AVX512-NEXT: vptestmd %xmm2, %xmm2, %k1
; AVX512-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
; AVX512-NEXT: vpmuludq %xmm3, %xmm4, %xmm3
; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = [1,5,3,7]
; AVX512-NEXT: vpermi2d %xmm3, %xmm2, %xmm4
; AVX512-NEXT: vptestmd %xmm4, %xmm4, %k1
; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm1
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
@ -483,13 +483,13 @@ define <4 x i32> @umulo_v4i32(<4 x i32> %a0, <4 x i32> %a1, <4 x i32>* %p2) noun
;
; AVX512-LABEL: umulo_v4i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; AVX512-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
; AVX512-NEXT: vpmuludq %xmm1, %xmm0, %xmm3
; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3]
; AVX512-NEXT: vptestmd %xmm2, %xmm2, %k1
; AVX512-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
; AVX512-NEXT: vpmuludq %xmm3, %xmm4, %xmm3
; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = [1,5,3,7]
; AVX512-NEXT: vpermi2d %xmm3, %xmm2, %xmm4
; AVX512-NEXT: vptestmd %xmm4, %xmm4, %k1
; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm1
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
@ -703,13 +703,13 @@ define <6 x i32> @umulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, <6 x i32>* %p2) noun
;
; AVX512-LABEL: umulo_v6i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7]
; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7]
; AVX512-NEXT: vpmuludq %ymm2, %ymm3, %ymm2
; AVX512-NEXT: vpmuludq %ymm1, %ymm0, %ymm3
; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,1,3,3,5,5,7,7]
; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7]
; AVX512-NEXT: vptestmd %ymm2, %ymm2, %k1
; AVX512-NEXT: vpmuludq %ymm1, %ymm0, %ymm2
; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[1,1,3,3,5,5,7,7]
; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm0[1,1,3,3,5,5,7,7]
; AVX512-NEXT: vpmuludq %ymm3, %ymm4, %ymm3
; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [1,9,3,11,5,13,7,15]
; AVX512-NEXT: vpermi2d %ymm3, %ymm2, %ymm4
; AVX512-NEXT: vptestmd %ymm4, %ymm4, %k1
; AVX512-NEXT: vpmulld %ymm1, %ymm0, %ymm1
; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
@ -873,13 +873,13 @@ define <8 x i32> @umulo_v8i32(<8 x i32> %a0, <8 x i32> %a1, <8 x i32>* %p2) noun
;
; AVX512-LABEL: umulo_v8i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7]
; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7]
; AVX512-NEXT: vpmuludq %ymm2, %ymm3, %ymm2
; AVX512-NEXT: vpmuludq %ymm1, %ymm0, %ymm3
; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,1,3,3,5,5,7,7]
; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7]
; AVX512-NEXT: vptestmd %ymm2, %ymm2, %k1
; AVX512-NEXT: vpmuludq %ymm1, %ymm0, %ymm2
; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[1,1,3,3,5,5,7,7]
; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm0[1,1,3,3,5,5,7,7]
; AVX512-NEXT: vpmuludq %ymm3, %ymm4, %ymm3
; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [1,9,3,11,5,13,7,15]
; AVX512-NEXT: vpermi2d %ymm3, %ymm2, %ymm4
; AVX512-NEXT: vptestmd %ymm4, %ymm4, %k1
; AVX512-NEXT: vpmulld %ymm1, %ymm0, %ymm1
; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
@ -1878,17 +1878,17 @@ define <4 x i32> @umulo_v4i24(<4 x i24> %a0, <4 x i24> %a1, <4 x i24>* %p2) noun
; AVX512-LABEL: umulo_v4i24:
; AVX512: # %bb.0:
; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [16777215,16777215,16777215,16777215]
; AVX512-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; AVX512-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
; AVX512-NEXT: vpmuludq %xmm1, %xmm0, %xmm3
; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3]
; AVX512-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
; AVX512-NEXT: vpmuludq %xmm3, %xmm4, %xmm3
; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = [1,5,3,7]
; AVX512-NEXT: vpermi2d %xmm3, %xmm2, %xmm4
; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm1
; AVX512-NEXT: vpsrld $24, %xmm1, %xmm0
; AVX512-NEXT: vpor %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpor %xmm4, %xmm0, %xmm0
; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k1
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}

@ -1961,9 +1961,10 @@ define <16 x i16> @constant_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y) nounwin
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpsrlvw {{.*}}(%rip), %ymm1, %ymm1
; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %ymm0, %ymm2
; AVX512VLBW-NEXT: vpor %ymm1, %ymm2, %ymm1
; AVX512VLBW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
; AVX512VLBW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512VLBW-NEXT: vpor %ymm1, %ymm2, %ymm2
; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm1 = [16,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512VLBW-NEXT: vpermi2w %ymm0, %ymm2, %ymm1
; AVX512VLBW-NEXT: vmovdqa %ymm1, %ymm0
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: constant_funnnel_v16i16:

@ -1971,9 +1971,9 @@ define <16 x i16> @constant_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y) nounwin
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpsrlvw {{.*}}(%rip), %ymm1, %ymm2
; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %ymm0, %ymm0
; AVX512VLBW-NEXT: vpor %ymm2, %ymm0, %ymm0
; AVX512VLBW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
; AVX512VLBW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512VLBW-NEXT: vpor %ymm2, %ymm0, %ymm2
; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm0 = [16,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512VLBW-NEXT: vpermi2w %ymm1, %ymm2, %ymm0
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: constant_funnnel_v16i16:

@ -286,11 +286,24 @@ define <16 x i8> @shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07(
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07:
; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpbroadcastb %xmm1, %xmm1
; AVX2OR512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX2OR512VL-NEXT: retq
; AVX2-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1
; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX2-NEXT: retq
;
; AVX512VLBW-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpbroadcastb %xmm1, %xmm1
; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07:
; AVX512VLVBMI: # %bb.0:
; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm2 = [0,16,0,17,0,18,0,19,0,20,0,21,0,22,0,23]
; AVX512VLVBMI-NEXT: vpermi2b %xmm0, %xmm1, %xmm2
; AVX512VLVBMI-NEXT: vmovdqa %xmm2, %xmm0
; AVX512VLVBMI-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 0, i32 16, i32 1, i32 16, i32 2, i32 16, i32 3, i32 16, i32 4, i32 16, i32 5, i32 16, i32 6, i32 16, i32 7>
ret <16 x i8> %shuffle
}
@ -352,11 +365,23 @@ define <16 x i8> @shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20(
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[6,4,2,0,14,12,10,8,7,5,3,1,15,13,11,9]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20:
; AVX: # %bb.0:
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,4,2,0,14,12,10,8,7,5,3,1,15,13,11,9]
; AVX-NEXT: retq
; AVX1OR2-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20:
; AVX1OR2: # %bb.0:
; AVX1OR2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX1OR2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,4,2,0,14,12,10,8,7,5,3,1,15,13,11,9]
; AVX1OR2-NEXT: retq
;
; AVX512VLBW-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX512VLBW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,4,2,0,14,12,10,8,7,5,3,1,15,13,11,9]
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20:
; AVX512VLVBMI: # %bb.0:
; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm2 = [3,2,1,0,7,6,5,4,19,18,17,16,23,22,21,20]
; AVX512VLVBMI-NEXT: vpermt2b %xmm1, %xmm2, %xmm0
; AVX512VLVBMI-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 19, i32 18, i32 17, i32 16, i32 23, i32 22, i32 21, i32 20>
ret <16 x i8> %shuffle
}
@ -394,12 +419,25 @@ define <16 x i8> @shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20(
; SSE41-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20:
; AVX: # %bb.0:
; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,14,13,12,7,6,5,4,u,u,u,u,u,u,u,u]
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,11,10,9,8,u,u,u,u,u,u,u,u]
; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX-NEXT: retq
; AVX1OR2-LABEL: shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20:
; AVX1OR2: # %bb.0:
; AVX1OR2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,14,13,12,7,6,5,4,u,u,u,u,u,u,u,u]
; AVX1OR2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,11,10,9,8,u,u,u,u,u,u,u,u]
; AVX1OR2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1OR2-NEXT: retq
;
; AVX512VLBW-LABEL: shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,14,13,12,7,6,5,4,u,u,u,u,u,u,u,u]
; AVX512VLBW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,11,10,9,8,u,u,u,u,u,u,u,u]
; AVX512VLBW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI-LABEL: shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20:
; AVX512VLVBMI: # %bb.0:
; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm2 = [3,2,1,0,31,30,29,28,11,10,9,8,23,22,21,20]
; AVX512VLVBMI-NEXT: vpermt2b %xmm1, %xmm2, %xmm0
; AVX512VLVBMI-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 31, i32 30, i32 29, i32 28, i32 11, i32 10, i32 9, i32 8, i32 23, i32 22, i32 21, i32 20>
ret <16 x i8> %shuffle
}
@ -1136,11 +1174,23 @@ define <16 x i8> @shuffle_v16i8_00_16_01_17_04_20_05_21_02_18_03_19_06_22_07_23(
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_00_16_01_17_04_20_05_21_02_18_03_19_06_22_07_23:
; AVX: # %bb.0:
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT: retq
; AVX1OR2-LABEL: shuffle_v16i8_00_16_01_17_04_20_05_21_02_18_03_19_06_22_07_23:
; AVX1OR2: # %bb.0:
; AVX1OR2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX1OR2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX1OR2-NEXT: retq
;
; AVX512VLBW-LABEL: shuffle_v16i8_00_16_01_17_04_20_05_21_02_18_03_19_06_22_07_23:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX512VLBW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI-LABEL: shuffle_v16i8_00_16_01_17_04_20_05_21_02_18_03_19_06_22_07_23:
; AVX512VLVBMI: # %bb.0:
; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm2 = [0,16,1,17,4,20,5,21,2,18,3,19,6,22,7,23]
; AVX512VLVBMI-NEXT: vpermt2b %xmm1, %xmm2, %xmm0
; AVX512VLVBMI-NEXT: retq
%shuffle = shufflevector <16 x i8> %val1, <16 x i8> %val2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23>
ret <16 x i8> %shuffle
}
@ -1381,12 +1431,25 @@ define <16 x i8> @shuffe_v16i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_2
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: shuffe_v16i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30:
; AVX: # %bb.0:
; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0
; AVX-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
; AVX1OR2-LABEL: shuffe_v16i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30:
; AVX1OR2: # %bb.0:
; AVX1OR2-NEXT: vpsrlw $8, %xmm0, %xmm0
; AVX1OR2-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX1OR2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1OR2-NEXT: retq
;
; AVX512VLBW-LABEL: shuffe_v16i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpsrlw $8, %xmm0, %xmm0
; AVX512VLBW-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX512VLBW-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI-LABEL: shuffe_v16i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30:
; AVX512VLVBMI: # %bb.0:
; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
; AVX512VLVBMI-NEXT: vpermt2b %xmm1, %xmm2, %xmm0
; AVX512VLVBMI-NEXT: retq
%1 = lshr <8 x i16> %a0, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
%2 = lshr <8 x i16> %a1, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
%3 = bitcast <8 x i16> %1 to <16 x i8>
@ -1647,13 +1710,27 @@ define <16 x i8> @PR12412(<16 x i8> %inval1, <16 x i8> %inval2) {
; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE41-NEXT: retq
;
; AVX-LABEL: PR12412:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT: retq
; AVX1OR2-LABEL: PR12412:
; AVX1OR2: # %bb.0: # %entry
; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX1OR2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1OR2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1OR2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1OR2-NEXT: retq
;
; AVX512VLBW-LABEL: PR12412:
; AVX512VLBW: # %bb.0: # %entry
; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX512VLBW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512VLBW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512VLBW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI-LABEL: PR12412:
; AVX512VLVBMI: # %bb.0: # %entry
; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm2 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
; AVX512VLVBMI-NEXT: vpermt2b %xmm1, %xmm2, %xmm0
; AVX512VLVBMI-NEXT: retq
entry:
%0 = shufflevector <16 x i8> %inval1, <16 x i8> %inval2, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
ret <16 x i8> %0

@ -360,11 +360,17 @@ define <4 x i32> @shuffle_v4i32_0124(<4 x i32> %a, <4 x i32> %b) {
; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v4i32_0124:
; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vbroadcastss %xmm1, %xmm1
; AVX2OR512VL-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; AVX2OR512VL-NEXT: retq
; AVX2-LABEL: shuffle_v4i32_0124:
; AVX2: # %bb.0:
; AVX2-NEXT: vbroadcastss %xmm1, %xmm1
; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v4i32_0124:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,4]
; AVX512VL-NEXT: vpermt2d %xmm1, %xmm2, %xmm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
ret <4 x i32> %shuffle
}
@ -401,12 +407,18 @@ define <4 x i32> @shuffle_v4i32_0142(<4 x i32> %a, <4 x i32> %b) {
; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v4i32_0142:
; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vbroadcastss %xmm1, %xmm1
; AVX2OR512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,2,2]
; AVX2OR512VL-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
; AVX2OR512VL-NEXT: retq
; AVX2-LABEL: shuffle_v4i32_0142:
; AVX2: # %bb.0:
; AVX2-NEXT: vbroadcastss %xmm1, %xmm1
; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,2,2]
; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v4i32_0142:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,2]
; AVX512VL-NEXT: vpermt2d %xmm1, %xmm2, %xmm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 2>
ret <4 x i32> %shuffle
}
@ -446,12 +458,18 @@ define <4 x i32> @shuffle_v4i32_0412(<4 x i32> %a, <4 x i32> %b) {
; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v4i32_0412:
; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vbroadcastss %xmm1, %xmm1
; AVX2OR512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,2]
; AVX2OR512VL-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX2OR512VL-NEXT: retq
; AVX2-LABEL: shuffle_v4i32_0412:
; AVX2: # %bb.0:
; AVX2-NEXT: vbroadcastss %xmm1, %xmm1
; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,2]
; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v4i32_0412:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,4,1,2]
; AVX512VL-NEXT: vpermt2d %xmm1, %xmm2, %xmm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 2>
ret <4 x i32> %shuffle
}
@ -483,11 +501,17 @@ define <4 x i32> @shuffle_v4i32_4012(<4 x i32> %a, <4 x i32> %b) {
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v4i32_4012:
; AVX: # %bb.0:
; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,2]
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; AVX-NEXT: retq
; AVX1OR2-LABEL: shuffle_v4i32_4012:
; AVX1OR2: # %bb.0:
; AVX1OR2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,2]
; AVX1OR2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; AVX1OR2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v4i32_4012:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [4,0,1,2]
; AVX512VL-NEXT: vpermt2d %xmm1, %xmm2, %xmm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 0, i32 1, i32 2>
ret <4 x i32> %shuffle
}
@ -537,12 +561,18 @@ define <4 x i32> @shuffle_v4i32_0451(<4 x i32> %a, <4 x i32> %b) {
; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v4i32_0451:
; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,1,1]
; AVX2OR512VL-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX2OR512VL-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
; AVX2OR512VL-NEXT: retq
; AVX2-LABEL: shuffle_v4i32_0451:
; AVX2: # %bb.0:
; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,1,1]
; AVX2-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v4i32_0451:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,4,5,1]
; AVX512VL-NEXT: vpermt2d %xmm1, %xmm2, %xmm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 5, i32 1>
ret <4 x i32> %shuffle
}
@ -593,12 +623,18 @@ define <4 x i32> @shuffle_v4i32_4015(<4 x i32> %a, <4 x i32> %b) {
; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v4i32_4015:
; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0]
; AVX2OR512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX2OR512VL-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
; AVX2OR512VL-NEXT: retq
; AVX2-LABEL: shuffle_v4i32_4015:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0]
; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v4i32_4015:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [4,0,1,5]
; AVX512VL-NEXT: vpermt2d %xmm1, %xmm2, %xmm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 0, i32 1, i32 5>
ret <4 x i32> %shuffle
}
@ -1841,16 +1877,10 @@ define <4 x float> @shuffle_v4f32_bitcast_4401(<4 x float> %a, <4 x i32> %b) {
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1OR2-LABEL: shuffle_v4f32_bitcast_4401:
; AVX1OR2: # %bb.0:
; AVX1OR2-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,0],xmm0[0,1]
; AVX1OR2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v4f32_bitcast_4401:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vbroadcastss %xmm1, %xmm1
; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512VL-NEXT: retq
; AVX-LABEL: shuffle_v4f32_bitcast_4401:
; AVX: # %bb.0:
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,0],xmm0[0,1]
; AVX-NEXT: retq
%1 = shufflevector <4 x i32> %b, <4 x i32> undef, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
%2 = bitcast <4 x i32> %1 to <2 x double>
%3 = bitcast <4 x float> %a to <2 x double>

@ -974,11 +974,23 @@ define <8 x i16> @shuffle_v8i16_0c1d2e3f(<8 x i16> %a, <8 x i16> %b) {
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_0c1d2e3f:
; AVX: # %bb.0:
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT: retq
; AVX1OR2-LABEL: shuffle_v8i16_0c1d2e3f:
; AVX1OR2: # %bb.0:
; AVX1OR2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; AVX1OR2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1OR2-NEXT: retq
;
; AVX512VL-SLOW-LABEL: shuffle_v8i16_0c1d2e3f:
; AVX512VL-SLOW: # %bb.0:
; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; AVX512VL-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512VL-SLOW-NEXT: retq
;
; AVX512VL-FAST-LABEL: shuffle_v8i16_0c1d2e3f:
; AVX512VL-FAST: # %bb.0:
; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,12,1,13,2,14,3,15]
; AVX512VL-FAST-NEXT: vpermt2w %xmm1, %xmm2, %xmm0
; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 12, i32 1, i32 13, i32 2, i32 14, i32 3, i32 15>
ret <8 x i16> %shuffle
}
@ -1004,11 +1016,23 @@ define <8 x i16> @shuffle_v8i16_48596a7b(<8 x i16> %a, <8 x i16> %b) {
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_48596a7b:
; AVX: # %bb.0:
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT: retq
; AVX1OR2-LABEL: shuffle_v8i16_48596a7b:
; AVX1OR2: # %bb.0:
; AVX1OR2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX1OR2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1OR2-NEXT: retq
;
; AVX512VL-SLOW-LABEL: shuffle_v8i16_48596a7b:
; AVX512VL-SLOW: # %bb.0:
; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX512VL-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512VL-SLOW-NEXT: retq
;
; AVX512VL-FAST-LABEL: shuffle_v8i16_48596a7b:
; AVX512VL-FAST: # %bb.0:
; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [4,8,5,9,6,10,7,11]
; AVX512VL-FAST-NEXT: vpermt2w %xmm1, %xmm2, %xmm0
; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 8, i32 5, i32 9, i32 6, i32 10, i32 7, i32 11>
ret <8 x i16> %shuffle
}
@ -1021,12 +1045,18 @@ define <8 x i16> @shuffle_v8i16_08196e7f(<8 x i16> %a, <8 x i16> %b) {
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_08196e7f:
; AVX: # %bb.0:
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT: retq
; AVX1OR2-LABEL: shuffle_v8i16_08196e7f:
; AVX1OR2: # %bb.0:
; AVX1OR2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
; AVX1OR2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
; AVX1OR2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1OR2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v8i16_08196e7f:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,8,1,9,6,14,7,15]
; AVX512VL-NEXT: vpermt2w %xmm1, %xmm2, %xmm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 6, i32 14, i32 7, i32 15>
ret <8 x i16> %shuffle
}
@ -1039,12 +1069,18 @@ define <8 x i16> @shuffle_v8i16_0c1d6879(<8 x i16> %a, <8 x i16> %b) {
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_0c1d6879:
; AVX: # %bb.0:
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,0,2,3]
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT: retq
; AVX1OR2-LABEL: shuffle_v8i16_0c1d6879:
; AVX1OR2: # %bb.0:
; AVX1OR2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,0,2,3]
; AVX1OR2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
; AVX1OR2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1OR2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v8i16_0c1d6879:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,12,1,13,6,8,7,9]
; AVX512VL-NEXT: vpermt2w %xmm1, %xmm2, %xmm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 12, i32 1, i32 13, i32 6, i32 8, i32 7, i32 9>
ret <8 x i16> %shuffle
}
@ -1077,18 +1113,11 @@ define <8 x i16> @shuffle_v8i16_109832ba(<8 x i16> %a, <8 x i16> %b) {
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,6,7,2,3,12,13,8,9,14,15,10,11]
; AVX2-FAST-NEXT: retq
;
; AVX512VL-SLOW-LABEL: shuffle_v8i16_109832ba:
; AVX512VL-SLOW: # %bb.0:
; AVX512VL-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,3,1,4,5,6,7]
; AVX512VL-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,7,5]
; AVX512VL-SLOW-NEXT: retq
;
; AVX512VL-FAST-LABEL: shuffle_v8i16_109832ba:
; AVX512VL-FAST: # %bb.0:
; AVX512VL-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,6,7,2,3,12,13,8,9,14,15,10,11]
; AVX512VL-FAST-NEXT: retq
; AVX512VL-LABEL: shuffle_v8i16_109832ba:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [1,0,9,8,3,2,11,10]
; AVX512VL-NEXT: vpermt2w %xmm1, %xmm2, %xmm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 0, i32 9, i32 8, i32 3, i32 2, i32 11, i32 10>
ret <8 x i16> %shuffle
}
@ -1165,19 +1194,11 @@ define <8 x i16> @shuffle_v8i16_0213cedf(<8 x i16> %a, <8 x i16> %b) {
; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-FAST-NEXT: retq
;
; AVX512VL-SLOW-LABEL: shuffle_v8i16_0213cedf:
; AVX512VL-SLOW: # %bb.0:
; AVX512VL-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,5,7]
; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7]
; AVX512VL-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512VL-SLOW-NEXT: retq
;
; AVX512VL-FAST-LABEL: shuffle_v8i16_0213cedf:
; AVX512VL-FAST: # %bb.0:
; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,12,13,10,11,14,15,8,9,12,13,10,11,14,15]
; AVX512VL-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7]
; AVX512VL-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-FAST-NEXT: retq
; AVX512VL-LABEL: shuffle_v8i16_0213cedf:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,2,1,3,12,14,13,15]
; AVX512VL-NEXT: vpermt2w %xmm1, %xmm2, %xmm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 1, i32 3, i32 12, i32 14, i32 13, i32 15>
ret <8 x i16> %shuffle
}
@ -1227,18 +1248,11 @@ define <8 x i16> @shuffle_v8i16_443aXXXX(<8 x i16> %a, <8 x i16> %b) {
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,6,7,4,5,8,9,10,11,12,13,14,15]
; AVX2-FAST-NEXT: retq
;
; AVX512VL-SLOW-LABEL: shuffle_v8i16_443aXXXX:
; AVX512VL-SLOW: # %bb.0:
; AVX512VL-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6,7]
; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,2,4,5,6,7]
; AVX512VL-SLOW-NEXT: retq
;
; AVX512VL-FAST-LABEL: shuffle_v8i16_443aXXXX:
; AVX512VL-FAST: # %bb.0:
; AVX512VL-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6,7]
; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,6,7,4,5,8,9,10,11,12,13,14,15]
; AVX512VL-FAST-NEXT: retq
; AVX512VL-LABEL: shuffle_v8i16_443aXXXX:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [4,4,3,10,4,5,6,7]
; AVX512VL-NEXT: vpermt2w %xmm1, %xmm2, %xmm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 4, i32 3, i32 10, i32 undef, i32 undef, i32 undef, i32 undef>
ret <8 x i16> %shuffle
}
@ -1272,11 +1286,17 @@ define <8 x i16> @shuffle_v8i16_032dXXXX(<8 x i16> %a, <8 x i16> %b) {
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,6,7,4,5,10,11,0,1,10,11,0,1,2,3]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v8i16_032dXXXX:
; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,6,7,4,5,10,11,0,1,10,11,0,1,2,3]
; AVX2OR512VL-NEXT: retq
; AVX2-LABEL: shuffle_v8i16_032dXXXX:
; AVX2: # %bb.0:
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,6,7,4,5,10,11,0,1,10,11,0,1,2,3]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v8i16_032dXXXX:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,3,2,13,0,13,0,1]
; AVX512VL-NEXT: vpermt2w %xmm1, %xmm2, %xmm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 3, i32 2, i32 13, i32 undef, i32 undef, i32 undef, i32 undef>
ret <8 x i16> %shuffle
}
@ -1317,11 +1337,23 @@ define <8 x i16> @shuffle_v8i16_012dXXXX(<8 x i16> %a, <8 x i16> %b) {
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_012dXXXX:
; AVX: # %bb.0:
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
; AVX-NEXT: retq
; AVX1OR2-LABEL: shuffle_v8i16_012dXXXX:
; AVX1OR2: # %bb.0:
; AVX1OR2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
; AVX1OR2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
; AVX1OR2-NEXT: retq
;
; AVX512VL-SLOW-LABEL: shuffle_v8i16_012dXXXX:
; AVX512VL-SLOW: # %bb.0:
; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
; AVX512VL-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
; AVX512VL-SLOW-NEXT: retq
;
; AVX512VL-FAST-LABEL: shuffle_v8i16_012dXXXX:
; AVX512VL-FAST: # %bb.0:
; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,13,4,5,6,7]
; AVX512VL-FAST-NEXT: vpermt2w %xmm1, %xmm2, %xmm0
; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 13, i32 undef, i32 undef, i32 undef, i32 undef>
ret <8 x i16> %shuffle
}
@ -1356,11 +1388,24 @@ define <8 x i16> @shuffle_v8i16_XXXXcde3(<8 x i16> %a, <8 x i16> %b) {
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6],xmm0[7]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v8i16_XXXXcde3:
; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpbroadcastq %xmm0, %xmm0
; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6],xmm0[7]
; AVX2OR512VL-NEXT: retq
; AVX2-LABEL: shuffle_v8i16_XXXXcde3:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6],xmm0[7]
; AVX2-NEXT: retq
;
; AVX512VL-SLOW-LABEL: shuffle_v8i16_XXXXcde3:
; AVX512VL-SLOW: # %bb.0:
; AVX512VL-SLOW-NEXT: vpbroadcastq %xmm0, %xmm0
; AVX512VL-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6],xmm0[7]
; AVX512VL-SLOW-NEXT: retq
;
; AVX512VL-FAST-LABEL: shuffle_v8i16_XXXXcde3:
; AVX512VL-FAST: # %bb.0:
; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,11]
; AVX512VL-FAST-NEXT: vpermi2w %xmm0, %xmm1, %xmm2
; AVX512VL-FAST-NEXT: vmovdqa %xmm2, %xmm0
; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 12, i32 13, i32 14, i32 3>
ret <8 x i16> %shuffle
}
@ -1389,11 +1434,24 @@ define <8 x i16> @shuffle_v8i16_cde3XXXX(<8 x i16> %a, <8 x i16> %b) {
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5,6,7]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_cde3XXXX:
; AVX: # %bb.0:
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5,6,7]
; AVX-NEXT: retq
; AVX1OR2-LABEL: shuffle_v8i16_cde3XXXX:
; AVX1OR2: # %bb.0:
; AVX1OR2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; AVX1OR2-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5,6,7]
; AVX1OR2-NEXT: retq
;
; AVX512VL-SLOW-LABEL: shuffle_v8i16_cde3XXXX:
; AVX512VL-SLOW: # %bb.0:
; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; AVX512VL-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5,6,7]
; AVX512VL-SLOW-NEXT: retq
;
; AVX512VL-FAST-LABEL: shuffle_v8i16_cde3XXXX:
; AVX512VL-FAST: # %bb.0:
; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,6,11,0,1,2,3]
; AVX512VL-FAST-NEXT: vpermi2w %xmm0, %xmm1, %xmm2
; AVX512VL-FAST-NEXT: vmovdqa %xmm2, %xmm0
; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 12, i32 13, i32 14, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
ret <8 x i16> %shuffle
}
@ -1430,11 +1488,17 @@ define <8 x i16> @shuffle_v8i16_012dcde3(<8 x i16> %a, <8 x i16> %b) {
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,10,11,8,9,10,11,12,13,6,7]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v8i16_012dcde3:
; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,10,11,8,9,10,11,12,13,6,7]
; AVX2OR512VL-NEXT: retq
; AVX2-LABEL: shuffle_v8i16_012dcde3:
; AVX2: # %bb.0:
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,10,11,8,9,10,11,12,13,6,7]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v8i16_012dcde3:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,13,12,13,14,3]
; AVX512VL-NEXT: vpermt2w %xmm1, %xmm2, %xmm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 13, i32 12, i32 13, i32 14, i32 3>
ret <8 x i16> %shuffle
}
@ -1520,20 +1584,11 @@ define <8 x i16> @shuffle_v8i16_XXX1X579(<8 x i16> %a, <8 x i16> %b) {
; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
; AVX2-FAST-NEXT: retq
;
; AVX512VL-SLOW-LABEL: shuffle_v8i16_XXX1X579:
; AVX512VL-SLOW: # %bb.0:
; AVX512VL-SLOW-NEXT: vpbroadcastd %xmm1, %xmm1
; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7]
; AVX512VL-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7]
; AVX512VL-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
; AVX512VL-SLOW-NEXT: retq
;
; AVX512VL-FAST-LABEL: shuffle_v8i16_XXX1X579:
; AVX512VL-FAST: # %bb.0:
; AVX512VL-FAST-NEXT: vpbroadcastd %xmm1, %xmm1
; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,2,3,8,9,10,11,14,15,14,15]
; AVX512VL-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
; AVX512VL-FAST-NEXT: retq
; AVX512VL-LABEL: shuffle_v8i16_XXX1X579:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,1,4,5,7,9]
; AVX512VL-NEXT: vpermt2w %xmm1, %xmm2, %xmm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 5, i32 7, i32 9>
ret <8 x i16> %shuffle
}
@ -1568,12 +1623,18 @@ define <8 x i16> @shuffle_v8i16_XX4X8acX(<8 x i16> %a, <8 x i16> %b) {
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v8i16_XX4X8acX:
; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,4,5,6,7,0,1,4,5,8,9,4,5]
; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
; AVX2OR512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX2OR512VL-NEXT: retq
; AVX2-LABEL: shuffle_v8i16_XX4X8acX:
; AVX2: # %bb.0:
; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,4,5,6,7,0,1,4,5,8,9,4,5]
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v8i16_XX4X8acX:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,4,5,8,10,12,10]
; AVX512VL-NEXT: vpermt2w %xmm1, %xmm2, %xmm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 undef, i32 4, i32 undef, i32 8, i32 10, i32 12, i32 undef>
ret <8 x i16> %shuffle
}
@ -2449,12 +2510,19 @@ define <8 x i16> @shuffle_v8i16_fu3ucc5u(<8 x i16> %a, <8 x i16> %b) {
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_fu3ucc5u:
; AVX: # %bb.0:
; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
; AVX-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,4,4]
; AVX-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX-NEXT: retq
; AVX1OR2-LABEL: shuffle_v8i16_fu3ucc5u:
; AVX1OR2: # %bb.0:
; AVX1OR2-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
; AVX1OR2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,4,4]
; AVX1OR2-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX1OR2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v8i16_fu3ucc5u:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [7,5,11,12,4,4,13,14]
; AVX512VL-NEXT: vpermi2w %xmm0, %xmm1, %xmm2
; AVX512VL-NEXT: vmovdqa %xmm2, %xmm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 15, i32 undef, i32 3, i32 undef, i32 12, i32 12, i32 5, i32 undef>
ret <8 x i16> %shuffle
}

@ -1005,11 +1005,17 @@ define <16 x i16> @shuffle_v16i16_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_3
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_31:
; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15]
; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2OR512VL-NEXT: retq
; AVX2-LABEL: shuffle_v16i16_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_31:
; AVX2: # %bb.0:
; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_31:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,31]
; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 31>
ret <16 x i16> %shuffle
}
@ -1023,11 +1029,17 @@ define <16 x i16> @shuffle_v16i16_16_01_02_03_04_05_06_07_08_09_10_11_12_13_14_1
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_16_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15:
; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2OR512VL-NEXT: retq
; AVX2-LABEL: shuffle_v16i16_16_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15:
; AVX2: # %bb.0:
; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_16_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [16,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 16, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
ret <16 x i16> %shuffle
}

@ -2978,12 +2978,25 @@ define <32 x i8> @shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpbroadcastb %xmm1, %xmm1
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,15,15,15,15,15,15,15,15,15,15,15,14,14,15,15]
; AVX2OR512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2OR512VL-NEXT: retq
; AVX2-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,15,15,15,15,15,15,15,15,15,15,15,14,14,15,15]
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT: retq
;
; AVX512VLBW-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpbroadcastb %xmm1, %xmm1
; AVX512VLBW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,15,15,15,15,15,15,15,15,15,15,15,14,14,15,15]
; AVX512VLBW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
; AVX512VLVBMI: # %bb.0:
; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,16,16,16,16,16,16,16,16]
; AVX512VLVBMI-NEXT: vpermt2b %xmm1, %xmm2, %xmm0
; AVX512VLVBMI-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
ret <32 x i8> %shuffle
}

@ -318,22 +318,40 @@ define <4 x double> @shuffle_v4f64_0213(<4 x double> %a, <4 x double> %b) {
}
define <4 x double> @shuffle_v4f64_0423(<4 x double> %a, <4 x double> %b) {
; ALL-LABEL: shuffle_v4f64_0423:
; ALL: # %bb.0:
; ALL-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0]
; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
; ALL-NEXT: retq
; AVX1OR2-LABEL: shuffle_v4f64_0423:
; AVX1OR2: # %bb.0:
; AVX1OR2-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0]
; AVX1OR2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
; AVX1OR2-NEXT: retq
;
; AVX512VL-SLOW-LABEL: shuffle_v4f64_0423:
; AVX512VL-SLOW: # %bb.0:
; AVX512VL-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0]
; AVX512VL-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
; AVX512VL-SLOW-NEXT: retq
;
; AVX512VL-FAST-LABEL: shuffle_v4f64_0423:
; AVX512VL-FAST: # %bb.0:
; AVX512VL-FAST-NEXT: vmovapd {{.*#+}} ymm2 = [0,4,2,3]
; AVX512VL-FAST-NEXT: vpermt2pd %ymm1, %ymm2, %ymm0
; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 3>
ret <4 x double> %shuffle
}
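Note: the AVX512VL-SLOW/AVX512VL-FAST split typically keys off the fast-variable-shuffle tuning attribute, so the single vpermt2pd is only chosen where variable shuffles are known to be cheap. A sketch of how such RUN lines are commonly wired up (the exact flags and check prefixes used by this file are an assumption):

; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefixes=ALL,AVX512VL-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,AVX512VL-FAST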
define <4 x double> @shuffle_v4f64_0462(<4 x double> %a, <4 x double> %b) {
; ALL-LABEL: shuffle_v4f64_0462:
; ALL: # %bb.0:
; ALL-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; ALL-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
; ALL-NEXT: retq
; AVX1OR2-LABEL: shuffle_v4f64_0462:
; AVX1OR2: # %bb.0:
; AVX1OR2-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX1OR2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX1OR2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
; AVX1OR2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v4f64_0462:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovapd {{.*#+}} ymm2 = [0,4,6,2]
; AVX512VL-NEXT: vpermt2pd %ymm1, %ymm2, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 4, i32 6, i32 2>
ret <4 x double> %shuffle
}
@ -483,11 +501,23 @@ define <4 x double> @shuffle_v4f64_3276(<4 x double> %a, <4 x double> %b) {
}
define <4 x double> @shuffle_v4f64_1076(<4 x double> %a, <4 x double> %b) {
; ALL-LABEL: shuffle_v4f64_1076:
; ALL: # %bb.0:
; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
; ALL-NEXT: retq
; AVX1OR2-LABEL: shuffle_v4f64_1076:
; AVX1OR2: # %bb.0:
; AVX1OR2-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
; AVX1OR2-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
; AVX1OR2-NEXT: retq
;
; AVX512VL-SLOW-LABEL: shuffle_v4f64_1076:
; AVX512VL-SLOW: # %bb.0:
; AVX512VL-SLOW-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
; AVX512VL-SLOW-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
; AVX512VL-SLOW-NEXT: retq
;
; AVX512VL-FAST-LABEL: shuffle_v4f64_1076:
; AVX512VL-FAST: # %bb.0:
; AVX512VL-FAST-NEXT: vmovapd {{.*#+}} ymm2 = [1,0,7,6]
; AVX512VL-FAST-NEXT: vpermt2pd %ymm1, %ymm2, %ymm0
; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 0, i32 7, i32 6>
ret <4 x double> %shuffle
}
@ -906,12 +936,19 @@ define <4 x i64> @shuffle_v4i64_0142(<4 x i64> %a, <4 x i64> %b) {
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v4i64_0142:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,2,2]
; AVX512VL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX512VL-NEXT: retq
; AVX512VL-SLOW-LABEL: shuffle_v4i64_0142:
; AVX512VL-SLOW: # %bb.0:
; AVX512VL-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,2,2]
; AVX512VL-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX512VL-SLOW-NEXT: retq
;
; AVX512VL-FAST-LABEL: shuffle_v4i64_0142:
; AVX512VL-FAST: # %bb.0:
; AVX512VL-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,6,2]
; AVX512VL-FAST-NEXT: vpermt2q %ymm1, %ymm2, %ymm0
; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 2>
ret <4 x i64> %shuffle
}
@ -1185,11 +1222,17 @@ define <4 x i64> @shuffle_v4i64_1076(<4 x i64> %a, <4 x i64> %b) {
; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v4i64_1076:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512VL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
; AVX512VL-NEXT: retq
; AVX512VL-SLOW-LABEL: shuffle_v4i64_1076:
; AVX512VL-SLOW: # %bb.0:
; AVX512VL-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
; AVX512VL-SLOW-NEXT: retq
;
; AVX512VL-FAST-LABEL: shuffle_v4i64_1076:
; AVX512VL-FAST: # %bb.0:
; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,7,6]
; AVX512VL-FAST-NEXT: vpermt2q %ymm1, %ymm2, %ymm0
; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 0, i32 7, i32 6>
ret <4 x i64> %shuffle
}

@ -283,11 +283,23 @@ define <8 x float> @shuffle_v8f32_08080808(<8 x float> %a, <8 x float> %b) {
}
define <8 x float> @shuffle_v8f32_08084c4c(<8 x float> %a, <8 x float> %b) {
; ALL-LABEL: shuffle_v8f32_08084c4c:
; ALL: # %bb.0:
; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4]
; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
; ALL-NEXT: retq
; AVX1OR2-LABEL: shuffle_v8f32_08084c4c:
; AVX1OR2: # %bb.0:
; AVX1OR2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4]
; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
; AVX1OR2-NEXT: retq
;
; AVX512VL-SLOW-LABEL: shuffle_v8f32_08084c4c:
; AVX512VL-SLOW: # %bb.0:
; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4]
; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
; AVX512VL-SLOW-NEXT: retq
;
; AVX512VL-FAST-LABEL: shuffle_v8f32_08084c4c:
; AVX512VL-FAST: # %bb.0:
; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [0,8,0,8,4,12,4,12]
; AVX512VL-FAST-NEXT: vpermt2ps %ymm1, %ymm2, %ymm0
; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 8, i32 0, i32 8, i32 4, i32 12, i32 4, i32 12>
ret <8 x float> %shuffle
}
@ -933,11 +945,23 @@ define <8 x float> @shuffle_v8f32_3210ba98(<8 x float> %a, <8 x float> %b) {
}
define <8 x float> @shuffle_v8f32_3210fedc(<8 x float> %a, <8 x float> %b) {
; ALL-LABEL: shuffle_v8f32_3210fedc:
; ALL: # %bb.0:
; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; ALL-NEXT: retq
; AVX1OR2-LABEL: shuffle_v8f32_3210fedc:
; AVX1OR2: # %bb.0:
; AVX1OR2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX1OR2-NEXT: retq
;
; AVX512VL-SLOW-LABEL: shuffle_v8f32_3210fedc:
; AVX512VL-SLOW: # %bb.0:
; AVX512VL-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX512VL-SLOW-NEXT: retq
;
; AVX512VL-FAST-LABEL: shuffle_v8f32_3210fedc:
; AVX512VL-FAST: # %bb.0:
; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [3,2,1,0,15,14,13,12]
; AVX512VL-FAST-NEXT: vpermt2ps %ymm1, %ymm2, %ymm0
; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12>
ret <8 x float> %shuffle
}
@ -1017,11 +1041,24 @@ define <8 x float> @PR21138(<8 x float> %truc, <8 x float> %tchose) {
}
define <8 x float> @shuffle_v8f32_ba987654(<8 x float> %a, <8 x float> %b) {
; ALL-LABEL: shuffle_v8f32_ba987654:
; ALL: # %bb.0:
; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; ALL-NEXT: retq
; AVX1OR2-LABEL: shuffle_v8f32_ba987654:
; AVX1OR2: # %bb.0:
; AVX1OR2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX1OR2-NEXT: retq
;
; AVX512VL-SLOW-LABEL: shuffle_v8f32_ba987654:
; AVX512VL-SLOW: # %bb.0:
; AVX512VL-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX512VL-SLOW-NEXT: retq
;
; AVX512VL-FAST-LABEL: shuffle_v8f32_ba987654:
; AVX512VL-FAST: # %bb.0:
; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [3,2,1,0,15,14,13,12]
; AVX512VL-FAST-NEXT: vpermi2ps %ymm0, %ymm1, %ymm2
; AVX512VL-FAST-NEXT: vmovaps %ymm2, %ymm0
; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4>
ret <8 x float> %shuffle
}
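Note on the extra vmovaps above: vpermt2ps and vpermi2ps compute the same permute over the concatenation of their two data sources and differ only in which register the result clobbers. In AT&T syntax, as used in these checks:

; vpermt2ps %src2, %idx,  %dst   # %dst enters as data source 1 and is overwritten with the result
; vpermi2ps %src2, %src1, %dst   # %dst enters as the index vector and is overwritten with the result

Here the low elements of the result come from %b (ymm1), so the permute is materialized in the constant-index register via vpermi2ps and then copied back, which is why ymm2 is moved to ymm0.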
@ -1468,11 +1505,23 @@ define <8 x i32> @shuffle_v8i32_08084c4c(<8 x i32> %a, <8 x i32> %b) {
; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v8i32_08084c4c:
; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; AVX2OR512VL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5]
; AVX2OR512VL-NEXT: retq
; AVX2-LABEL: shuffle_v8i32_08084c4c:
; AVX2: # %bb.0:
; AVX2-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5]
; AVX2-NEXT: retq
;
; AVX512VL-SLOW-LABEL: shuffle_v8i32_08084c4c:
; AVX512VL-SLOW: # %bb.0:
; AVX512VL-SLOW-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5]
; AVX512VL-SLOW-NEXT: retq
;
; AVX512VL-FAST-LABEL: shuffle_v8i32_08084c4c:
; AVX512VL-FAST: # %bb.0:
; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,8,0,8,4,12,4,12]
; AVX512VL-FAST-NEXT: vpermt2d %ymm1, %ymm2, %ymm0
; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 8, i32 0, i32 8, i32 4, i32 12, i32 4, i32 12>
ret <8 x i32> %shuffle
}
@ -2188,11 +2237,23 @@ define <8 x i32> @shuffle_v8i32_3210ba98(<8 x i32> %a, <8 x i32> %b) {
}
define <8 x i32> @shuffle_v8i32_3210fedc(<8 x i32> %a, <8 x i32> %b) {
; ALL-LABEL: shuffle_v8i32_3210fedc:
; ALL: # %bb.0:
; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; ALL-NEXT: retq
; AVX1OR2-LABEL: shuffle_v8i32_3210fedc:
; AVX1OR2: # %bb.0:
; AVX1OR2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX1OR2-NEXT: retq
;
; AVX512VL-SLOW-LABEL: shuffle_v8i32_3210fedc:
; AVX512VL-SLOW: # %bb.0:
; AVX512VL-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX512VL-SLOW-NEXT: retq
;
; AVX512VL-FAST-LABEL: shuffle_v8i32_3210fedc:
; AVX512VL-FAST: # %bb.0:
; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [3,2,1,0,15,14,13,12]
; AVX512VL-FAST-NEXT: vpermt2d %ymm1, %ymm2, %ymm0
; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12>
ret <8 x i32> %shuffle
}
@ -2243,21 +2304,47 @@ define <8 x i32> @shuffle_v8i32_fedc7654(<8 x i32> %a, <8 x i32> %b) {
}
define <8 x i32> @shuffle_v8i32_ba987654(<8 x i32> %a, <8 x i32> %b) {
; ALL-LABEL: shuffle_v8i32_ba987654:
; ALL: # %bb.0:
; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; ALL-NEXT: retq
; AVX1OR2-LABEL: shuffle_v8i32_ba987654:
; AVX1OR2: # %bb.0:
; AVX1OR2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX1OR2-NEXT: retq
;
; AVX512VL-SLOW-LABEL: shuffle_v8i32_ba987654:
; AVX512VL-SLOW: # %bb.0:
; AVX512VL-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX512VL-SLOW-NEXT: retq
;
; AVX512VL-FAST-LABEL: shuffle_v8i32_ba987654:
; AVX512VL-FAST: # %bb.0:
; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [3,2,1,0,15,14,13,12]
; AVX512VL-FAST-NEXT: vpermi2d %ymm0, %ymm1, %ymm2
; AVX512VL-FAST-NEXT: vmovdqa %ymm2, %ymm0
; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4>
ret <8 x i32> %shuffle
}
define <8 x i32> @shuffle_v8i32_ba983210(<8 x i32> %a, <8 x i32> %b) {
; ALL-LABEL: shuffle_v8i32_ba983210:
; ALL: # %bb.0:
; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; ALL-NEXT: retq
; AVX1OR2-LABEL: shuffle_v8i32_ba983210:
; AVX1OR2: # %bb.0:
; AVX1OR2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX1OR2-NEXT: retq
;
; AVX512VL-SLOW-LABEL: shuffle_v8i32_ba983210:
; AVX512VL-SLOW: # %bb.0:
; AVX512VL-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX512VL-SLOW-NEXT: retq
;
; AVX512VL-FAST-LABEL: shuffle_v8i32_ba983210:
; AVX512VL-FAST: # %bb.0:
; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [3,2,1,0,15,14,13,12]
; AVX512VL-FAST-NEXT: vpermi2d %ymm0, %ymm1, %ymm2
; AVX512VL-FAST-NEXT: vmovdqa %ymm2, %ymm0
; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4>
ret <8 x i32> %shuffle
}
@ -2903,14 +2990,32 @@ define <8 x float> @broadcast_concat_crash(<4 x float> %x, <4 x float> %y, float
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: broadcast_concat_crash:
; AVX2OR512VL: # %bb.0: # %entry
; AVX2OR512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2OR512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,1,2,3]
; AVX2OR512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX2OR512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
; AVX2OR512VL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX2OR512VL-NEXT: retq
; AVX2-LABEL: broadcast_concat_crash:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,1,2,3]
; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512VL-SLOW-LABEL: broadcast_concat_crash:
; AVX512VL-SLOW: # %bb.0: # %entry
; AVX512VL-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,1,2,3]
; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX512VL-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
; AVX512VL-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX512VL-SLOW-NEXT: retq
;
; AVX512VL-FAST-LABEL: broadcast_concat_crash:
; AVX512VL-FAST: # %bb.0: # %entry
; AVX512VL-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX512VL-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,1,2,3]
; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} xmm1 = [1,4,3,3]
; AVX512VL-FAST-NEXT: vpermi2ps %xmm2, %xmm0, %xmm1
; AVX512VL-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX512VL-FAST-NEXT: retq
entry:
%tmp = shufflevector <4 x float> %x, <4 x float> %y, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%bc = bitcast <8 x float> %tmp to <4 x i64>

@ -595,9 +595,8 @@ define <64 x i8> @shuffle_v64i8_shift_00_02_04_06_08_10_12_14_64_66_68_70_72_74_
;
; AVX512VBMI-LABEL: shuffle_v64i8_shift_00_02_04_06_08_10_12_14_64_66_68_70_72_74_76_78_16_18_20_22_24_26_28_30_80_82_84_86_88_90_92_94_32_34_36_38_40_42_44_46_96_98_100_102_104_106_108_110_48_50_52_54_56_58_60_62_112_114_116_118_120_122_124_126:
; AVX512VBMI: # %bb.0:
; AVX512VBMI-NEXT: vpsrlw $8, %zmm0, %zmm0
; AVX512VBMI-NEXT: vpsrlw $8, %zmm1, %zmm1
; AVX512VBMI-NEXT: vpackuswb %zmm1, %zmm0, %zmm0
; AVX512VBMI-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,3,5,7,9,11,13,15,65,67,69,71,73,75,77,79,17,19,21,23,25,27,29,31,81,83,85,87,89,91,93,95,33,35,37,39,41,43,45,47,97,99,101,103,105,107,109,111,49,51,53,55,57,59,61,63,113,115,117,119,121,123,125,127]
; AVX512VBMI-NEXT: vpermt2b %zmm1, %zmm2, %zmm0
; AVX512VBMI-NEXT: retq
%1 = lshr <32 x i16> %a0, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
%2 = lshr <32 x i16> %a1, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
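The hunk above shows the combine at full 512-bit width: shifting each 16-bit lane right by 8 and packing the high bytes is just a selection of the odd bytes of both inputs, so the vpsrlw/vpsrlw/vpackuswb triple becomes one vpermt2b under VBMI. A self-contained, narrower version of the pattern (widths and name chosen for illustration; under VBMI+VLX this may likewise fold to a single byte permute):

define <16 x i8> @odd_bytes_sketch(<8 x i16> %a, <8 x i16> %b) {
  ; Keep only the high byte of every 16-bit element, i.e. bytes 1,3,5,... of each input.
  %sa = lshr <8 x i16> %a, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %sb = lshr <8 x i16> %b, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %ta = trunc <8 x i16> %sa to <8 x i8>
  %tb = trunc <8 x i16> %sb to <8 x i8>
  ; Concatenating the two truncated halves yields the two-input shuffle the combiner can fold.
  %r = shufflevector <8 x i8> %ta, <8 x i8> %tb, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i8> %r
}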

@ -84,10 +84,8 @@ define <64 x i8> @combine_vpermi2var_64i8_as_vpermb(<64 x i8> %x0, <64 x i8> %x1
define <16 x i8> @combine_vpermt2var_vpermi2var_16i8_as_vperm2(<16 x i8> %x0, <16 x i8> %x1) {
; CHECK-LABEL: combine_vpermt2var_vpermi2var_16i8_as_vperm2:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [0,31,2,29,4,27,6,25,8,23,10,21,12,19,14,17]
; CHECK-NEXT: vpermi2b %xmm1, %xmm0, %xmm2
; CHECK-NEXT: vmovdqa {{.*#+}} xmm0 = [0,17,2,18,4,19,6,21,8,23,10,25,12,27,14,29]
; CHECK-NEXT: vpermi2b %xmm2, %xmm2, %xmm0
; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [0,31,2,2,4,29,6,27,8,25,10,23,12,21,14,19]
; CHECK-NEXT: vpermt2b %xmm1, %xmm2, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
%res0 = call <16 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.128(<16 x i8> %x0, <16 x i8> <i8 0, i8 31, i8 2, i8 29, i8 4, i8 27, i8 6, i8 25, i8 8, i8 23, i8 10, i8 21, i8 12, i8 19, i8 14, i8 17>, <16 x i8> %x1, i16 -1)
%res1 = call <16 x i8> @llvm.x86.avx512.maskz.vpermt2var.qi.128(<16 x i8> <i8 0, i8 17, i8 2, i8 18, i8 4, i8 19, i8 6, i8 21, i8 8, i8 23, i8 10, i8 25, i8 12, i8 27, i8 14, i8 29>, <16 x i8> %res0, <16 x i8> %res0, i16 -1)

@ -769,9 +769,9 @@ define i8 @shuf8i1_9_6_1_10_3_7_7_0_all_ones(<8 x i1> %a) {
; AVX512VL-NEXT: vptestmd %ymm0, %ymm0, %k1
; AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} {z}
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7]
; AVX512VL-NEXT: vptestmd %ymm0, %ymm0, %k0
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [9,1,2,3,4,5,6,7]
; AVX512VL-NEXT: vpermi2d %ymm1, %ymm0, %ymm2
; AVX512VL-NEXT: vptestmd %ymm2, %ymm2, %k0
; AVX512VL-NEXT: kmovw %k0, %eax
; AVX512VL-NEXT: # kill: def $al killed $al killed $eax
; AVX512VL-NEXT: vzeroupper
@ -782,10 +782,10 @@ define i8 @shuf8i1_9_6_1_10_3_7_7_0_all_ones(<8 x i1> %a) {
; VL_BW_DQ-NEXT: vpsllw $15, %xmm0, %xmm0
; VL_BW_DQ-NEXT: vpmovw2m %xmm0, %k0
; VL_BW_DQ-NEXT: vpmovm2d %k0, %ymm0
; VL_BW_DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; VL_BW_DQ-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; VL_BW_DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
; VL_BW_DQ-NEXT: vpmovd2m %ymm0, %k0
; VL_BW_DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [9,1,2,3,4,5,6,7]
; VL_BW_DQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; VL_BW_DQ-NEXT: vpermt2d %ymm0, %ymm1, %ymm2
; VL_BW_DQ-NEXT: vpmovd2m %ymm2, %k0
; VL_BW_DQ-NEXT: kmovd %k0, %eax
; VL_BW_DQ-NEXT: # kill: def $al killed $al killed $eax
; VL_BW_DQ-NEXT: vzeroupper
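These shuf8i1 diffs show the combine firing on widened i1 masks: the mask is expanded to dword elements, permuted against an all-ones vector with vpermi2d/vpermt2d, and compressed back to a k-register. Roughly the IR shape involved, guessing the test body from its name (the real test may differ):

define i8 @shuf8i1_sketch(<8 x i1> %a) {
  %s = shufflevector <8 x i1> %a, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> <i32 9, i32 6, i32 1, i32 10, i32 3, i32 7, i32 7, i32 0>
  %r = bitcast <8 x i1> %s to i8
  ret i8 %r
}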

@ -1649,13 +1649,35 @@ define <8 x i16> @trunc2x4i32_8i16(<4 x i32> %a, <4 x i32> %b) {
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT: retq
;
; AVX512-LABEL: trunc2x4i32_8i16:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512-NEXT: retq
; AVX512F-LABEL: trunc2x4i32_8i16:
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc2x4i32_8i16:
; AVX512VL: # %bb.0: # %entry
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc2x4i32_8i16:
; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc2x4i32_8i16:
; AVX512BWVL: # %bb.0: # %entry
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,2,4,6,8,10,12,14]
; AVX512BWVL-NEXT: vpermt2w %xmm1, %xmm2, %xmm0
; AVX512BWVL-NEXT: retq
entry:
%0 = trunc <4 x i32> %a to <4 x i16>
%1 = trunc <4 x i32> %b to <4 x i16>
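The vpermt2w here folds the truncations away entirely: viewed as words, the low half of every dword of %a sits at even word indices 0,2,4,6 and of %b at 8,10,12,14, so one permute with the constant mask [0,2,4,6,8,10,12,14] produces the concatenated truncation directly. A self-contained version of the pattern (hypothetical name; the elided tail of the test above presumably concatenates the two halves the same way):

define <8 x i16> @trunc_concat_sketch(<4 x i32> %a, <4 x i32> %b) {
  ; Truncate both inputs to words and concatenate; the backend can fold this into one vpermt2w.
  %ta = trunc <4 x i32> %a to <4 x i16>
  %tb = trunc <4 x i32> %b to <4 x i16>
  %r = shufflevector <4 x i16> %ta, <4 x i16> %tb, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i16> %r
}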

@ -1560,12 +1560,10 @@ define <8 x i16> @trunc2x4i64_8i16(<4 x i64> %a, <4 x i64> %b) {
;
; AVX512BWVL-LABEL: trunc2x4i64_8i16:
; AVX512BWVL: # %bb.0: # %entry
; AVX512BWVL-NEXT: vpmovqd %ymm0, %xmm0
; AVX512BWVL-NEXT: vpmovqd %ymm0, %xmm2
; AVX512BWVL-NEXT: vpmovqd %ymm1, %xmm1
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm0 = [0,2,4,6,8,10,12,14]
; AVX512BWVL-NEXT: vpermi2w %xmm1, %xmm2, %xmm0
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
entry:
@ -1657,13 +1655,35 @@ define <8 x i16> @trunc2x4i32_8i16(<4 x i32> %a, <4 x i32> %b) {
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT: retq
;
; AVX512-LABEL: trunc2x4i32_8i16:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512-NEXT: retq
; AVX512F-LABEL: trunc2x4i32_8i16:
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc2x4i32_8i16:
; AVX512VL: # %bb.0: # %entry
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc2x4i32_8i16:
; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc2x4i32_8i16:
; AVX512BWVL: # %bb.0: # %entry
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,2,4,6,8,10,12,14]
; AVX512BWVL-NEXT: vpermt2w %xmm1, %xmm2, %xmm0
; AVX512BWVL-NEXT: retq
entry:
%0 = trunc <4 x i32> %a to <4 x i16>
%1 = trunc <4 x i32> %b to <4 x i16>