diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index ffe36189956..235b27bf99c 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -36173,7 +36173,8 @@ static SDValue combineX86ShuffleChain(ArrayRef Inputs, SDValue Root, (Depth >= VariablePerLaneShuffleDepth) || HasVariableMask; // VPERMI2W/VPERMI2B are 3 uops on Skylake and Icelake so we require a // higher depth before combining them. - bool AllowBWIVPERMV3 = (Depth >= 2 || HasVariableMask); + bool AllowBWIVPERMV3 = + (Depth >= (VariableCrossLaneShuffleDepth + 2) || HasVariableMask); bool MaskContainsZeros = isAnyZero(Mask); diff --git a/test/CodeGen/X86/insertelement-ones.ll b/test/CodeGen/X86/insertelement-ones.ll index fcc8b4b44f4..f8eca3608d8 100644 --- a/test/CodeGen/X86/insertelement-ones.ll +++ b/test/CodeGen/X86/insertelement-ones.ll @@ -5,8 +5,8 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE41 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,avx512bw | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX,AVX512,AVX512VL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,avx512bw | FileCheck %s --check-prefixes=AVX,AVX512 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX,AVX512 define <2 x i64> @insert_v2i64_x1(<2 x i64> %a) { ; SSE2-LABEL: insert_v2i64_x1: @@ -297,21 +297,14 @@ define <16 x i16> @insert_v16i16_x12345x789ABCDEx(<16 x i16> %a) { ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: retq ; -; AVX512F-LABEL: insert_v16i16_x12345x789ABCDEx: -; AVX512F: # %bb.0: -; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [32,1,2,3,4,5,38,7,8,9,10,11,12,13,14,47] -; AVX512F-NEXT: vpermt2w %zmm1, %zmm2, %zmm0 -; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: insert_v16i16_x12345x789ABCDEx: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [16,1,2,3,4,5,22,7,8,9,10,11,12,13,14,31] -; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 -; AVX512VL-NEXT: retq +; AVX512-LABEL: insert_v16i16_x12345x789ABCDEx: +; AVX512: # %bb.0: +; AVX512-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3,4,5],xmm1[6],xmm0[7] +; AVX512-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 +; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7],ymm0[8,9,10,11,12,13,14],ymm2[15] +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512-NEXT: retq %1 = insertelement <16 x i16> %a, i16 -1, i32 0 %2 = insertelement <16 x i16> %1, i16 -1, i32 6 %3 = insertelement <16 x i16> %2, i16 -1, i32 15 diff --git a/test/CodeGen/X86/min-legal-vector-width.ll b/test/CodeGen/X86/min-legal-vector-width.ll index 94511650356..8e3554f2f1b 100644 --- a/test/CodeGen/X86/min-legal-vector-width.ll +++ b/test/CodeGen/X86/min-legal-vector-width.ll @@ -1736,54 +1736,30 @@ define <32 x i8> @splatvar_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind "mi } define <32 x i8> @constant_rotate_v32i8(<32 x i8> %a) nounwind "min-legal-vector-width"="256" { -; CHECK-AVX512-LABEL: constant_rotate_v32i8: -; CHECK-AVX512: # %bb.0: -; CHECK-AVX512-NEXT: vpsllw $4, %ymm0, %ymm1 -; CHECK-AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; CHECK-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256] -; CHECK-AVX512-NEXT: # ymm2 = mem[0,1,0,1] -; CHECK-AVX512-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm1 -; CHECK-AVX512-NEXT: vpsllw $2, %ymm1, %ymm3 -; CHECK-AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 -; CHECK-AVX512-NEXT: vpaddb %ymm2, %ymm2, %ymm2 -; CHECK-AVX512-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1 -; CHECK-AVX512-NEXT: vpaddb %ymm1, %ymm1, %ymm3 -; CHECK-AVX512-NEXT: vpaddb %ymm2, %ymm2, %ymm2 -; CHECK-AVX512-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1 -; CHECK-AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-AVX512-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31] -; CHECK-AVX512-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 -; CHECK-AVX512-NEXT: vpsrlw $8, %ymm3, %ymm3 -; CHECK-AVX512-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23] -; CHECK-AVX512-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; CHECK-AVX512-NEXT: vpsrlw $8, %ymm0, %ymm0 -; CHECK-AVX512-NEXT: vpackuswb %ymm3, %ymm0, %ymm0 -; CHECK-AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 -; CHECK-AVX512-NEXT: retq -; -; CHECK-VBMI-LABEL: constant_rotate_v32i8: -; CHECK-VBMI: # %bb.0: -; CHECK-VBMI-NEXT: vpsllw $4, %ymm0, %ymm1 -; CHECK-VBMI-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; CHECK-VBMI-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256] -; CHECK-VBMI-NEXT: # ymm2 = mem[0,1,0,1] -; CHECK-VBMI-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm1 -; CHECK-VBMI-NEXT: vpsllw $2, %ymm1, %ymm3 -; CHECK-VBMI-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 -; CHECK-VBMI-NEXT: vpaddb %ymm2, %ymm2, %ymm2 -; CHECK-VBMI-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1 -; CHECK-VBMI-NEXT: vpaddb %ymm1, %ymm1, %ymm3 -; CHECK-VBMI-NEXT: vpaddb %ymm2, %ymm2, %ymm2 -; CHECK-VBMI-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1 -; CHECK-VBMI-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-VBMI-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31] -; CHECK-VBMI-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 -; CHECK-VBMI-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23] -; CHECK-VBMI-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; CHECK-VBMI-NEXT: vmovdqa {{.*#+}} ymm2 = [1,3,5,7,9,11,13,15,33,35,37,39,41,43,45,47,17,19,21,23,25,27,29,31,49,51,53,55,57,59,61,63] -; CHECK-VBMI-NEXT: vpermi2b %ymm3, %ymm0, %ymm2 -; CHECK-VBMI-NEXT: vpor %ymm2, %ymm1, %ymm0 -; CHECK-VBMI-NEXT: retq +; CHECK-LABEL: constant_rotate_v32i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vpsllw $4, %ymm0, %ymm1 +; CHECK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; CHECK-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256] +; CHECK-NEXT: # ymm2 = mem[0,1,0,1] +; CHECK-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm1 +; CHECK-NEXT: vpsllw $2, %ymm1, %ymm3 +; CHECK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 +; CHECK-NEXT: vpaddb %ymm2, %ymm2, %ymm2 +; CHECK-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1 +; CHECK-NEXT: vpaddb %ymm1, %ymm1, %ymm3 +; CHECK-NEXT: vpaddb %ymm2, %ymm2, %ymm2 +; CHECK-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1 +; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31] +; CHECK-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 +; CHECK-NEXT: vpsrlw $8, %ymm3, %ymm3 +; CHECK-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23] +; CHECK-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; CHECK-NEXT: vpsrlw $8, %ymm0, %ymm0 +; CHECK-NEXT: vpackuswb %ymm3, %ymm0, %ymm0 +; CHECK-NEXT: vpor %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: retq %shl = shl <32 x i8> %a, %lshr = lshr <32 x i8> %a, %or = or <32 x i8> %shl, %lshr diff --git a/test/CodeGen/X86/vector-shuffle-128-v16.ll b/test/CodeGen/X86/vector-shuffle-128-v16.ll index b655f3f74f5..74917512698 100644 --- a/test/CodeGen/X86/vector-shuffle-128-v16.ll +++ b/test/CodeGen/X86/vector-shuffle-128-v16.ll @@ -1643,25 +1643,12 @@ define <16 x i8> @shuffe_v16i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_2 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffe_v16i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30: -; AVX2: # %bb.0: -; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX2-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: retq -; -; AVX512VLBW-LABEL: shuffe_v16i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30: -; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX512VLBW-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX512VLBW-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX512VLBW-NEXT: retq -; -; AVX512VLVBMI-LABEL: shuffe_v16i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30: -; AVX512VLVBMI: # %bb.0: -; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] -; AVX512VLVBMI-NEXT: vpermt2b %xmm1, %xmm2, %xmm0 -; AVX512VLVBMI-NEXT: retq +; AVX2OR512VL-LABEL: shuffe_v16i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX2OR512VL-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX2OR512VL-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2OR512VL-NEXT: retq ; ; XOP-LABEL: shuffe_v16i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30: ; XOP: # %bb.0: diff --git a/test/CodeGen/X86/vector-shuffle-128-v8.ll b/test/CodeGen/X86/vector-shuffle-128-v8.ll index 64ffbda2ccd..2159fa4f918 100644 --- a/test/CodeGen/X86/vector-shuffle-128-v8.ll +++ b/test/CodeGen/X86/vector-shuffle-128-v8.ll @@ -1073,18 +1073,12 @@ define <8 x i16> @shuffle_v8i16_08196e7f(<8 x i16> %a, <8 x i16> %b) { ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v8i16_08196e7f: -; AVX2: # %bb.0: -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i16_08196e7f: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,8,1,9,6,14,7,15] -; AVX512VL-NEXT: vpermt2w %xmm1, %xmm2, %xmm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v8i16_08196e7f: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] +; AVX2OR512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2OR512VL-NEXT: retq ; ; XOP-LABEL: shuffle_v8i16_08196e7f: ; XOP: # %bb.0: @@ -1109,18 +1103,12 @@ define <8 x i16> @shuffle_v8i16_0c1d6879(<8 x i16> %a, <8 x i16> %b) { ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v8i16_0c1d6879: -; AVX2: # %bb.0: -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,0,2,3] -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i16_0c1d6879: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,12,1,13,6,8,7,9] -; AVX512VL-NEXT: vpermt2w %xmm1, %xmm2, %xmm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v8i16_0c1d6879: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,0,2,3] +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] +; AVX2OR512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2OR512VL-NEXT: retq ; ; XOP-LABEL: shuffle_v8i16_0c1d6879: ; XOP: # %bb.0: @@ -1158,11 +1146,18 @@ define <8 x i16> @shuffle_v8i16_109832ba(<8 x i16> %a, <8 x i16> %b) { ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,6,7,2,3,12,13,8,9,14,15,10,11] ; AVX2-FAST-NEXT: retq ; -; AVX512VL-LABEL: shuffle_v8i16_109832ba: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [1,0,9,8,3,2,11,10] -; AVX512VL-NEXT: vpermt2w %xmm1, %xmm2, %xmm0 -; AVX512VL-NEXT: retq +; AVX512VL-SLOW-LABEL: shuffle_v8i16_109832ba: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,3,1,4,5,6,7] +; AVX512VL-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,7,5] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-LABEL: shuffle_v8i16_109832ba: +; AVX512VL-FAST: # %bb.0: +; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [1,0,9,8,3,2,11,10] +; AVX512VL-FAST-NEXT: vpermt2w %xmm1, %xmm2, %xmm0 +; AVX512VL-FAST-NEXT: retq ; ; XOP-LABEL: shuffle_v8i16_109832ba: ; XOP: # %bb.0: @@ -1244,11 +1239,18 @@ define <8 x i16> @shuffle_v8i16_0213cedf(<8 x i16> %a, <8 x i16> %b) { ; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX2-FAST-NEXT: retq ; -; AVX512VL-LABEL: shuffle_v8i16_0213cedf: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,2,1,3,12,14,13,15] -; AVX512VL-NEXT: vpermt2w %xmm1, %xmm2, %xmm0 -; AVX512VL-NEXT: retq +; AVX512VL-SLOW-LABEL: shuffle_v8i16_0213cedf: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,5,7] +; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7] +; AVX512VL-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-LABEL: shuffle_v8i16_0213cedf: +; AVX512VL-FAST: # %bb.0: +; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,2,1,3,12,14,13,15] +; AVX512VL-FAST-NEXT: vpermt2w %xmm1, %xmm2, %xmm0 +; AVX512VL-FAST-NEXT: retq ; ; XOP-LABEL: shuffle_v8i16_0213cedf: ; XOP: # %bb.0: @@ -1303,11 +1305,18 @@ define <8 x i16> @shuffle_v8i16_443aXXXX(<8 x i16> %a, <8 x i16> %b) { ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,6,7,4,5,8,9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: retq ; -; AVX512VL-LABEL: shuffle_v8i16_443aXXXX: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [4,4,3,10,4,5,6,7] -; AVX512VL-NEXT: vpermt2w %xmm1, %xmm2, %xmm0 -; AVX512VL-NEXT: retq +; AVX512VL-SLOW-LABEL: shuffle_v8i16_443aXXXX: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6,7] +; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] +; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,2,4,5,6,7] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-LABEL: shuffle_v8i16_443aXXXX: +; AVX512VL-FAST: # %bb.0: +; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [4,4,3,10,4,5,6,7] +; AVX512VL-FAST-NEXT: vpermt2w %xmm1, %xmm2, %xmm0 +; AVX512VL-FAST-NEXT: retq ; ; XOP-LABEL: shuffle_v8i16_443aXXXX: ; XOP: # %bb.0: @@ -1628,11 +1637,19 @@ define <8 x i16> @shuffle_v8i16_XXX1X579(<8 x i16> %a, <8 x i16> %b) { ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] ; AVX2-FAST-NEXT: retq ; -; AVX512VL-LABEL: shuffle_v8i16_XXX1X579: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,1,1,4,5,7,9] -; AVX512VL-NEXT: vpermt2w %xmm1, %xmm2, %xmm0 -; AVX512VL-NEXT: retq +; AVX512VL-SLOW-LABEL: shuffle_v8i16_XXX1X579: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vpbroadcastd %xmm1, %xmm1 +; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7] +; AVX512VL-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7] +; AVX512VL-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-LABEL: shuffle_v8i16_XXX1X579: +; AVX512VL-FAST: # %bb.0: +; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,1,1,4,5,7,9] +; AVX512VL-FAST-NEXT: vpermt2w %xmm1, %xmm2, %xmm0 +; AVX512VL-FAST-NEXT: retq ; ; XOP-LABEL: shuffle_v8i16_XXX1X579: ; XOP: # %bb.0: @@ -2849,19 +2866,12 @@ define <8 x i16> @shuffle_v8i16_fu3ucc5u(<8 x i16> %a, <8 x i16> %b) { ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v8i16_fu3ucc5u: -; AVX2: # %bb.0: -; AVX2-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,4,4] -; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i16_fu3ucc5u: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [7,5,11,12,4,4,13,14] -; AVX512VL-NEXT: vpermi2w %xmm0, %xmm1, %xmm2 -; AVX512VL-NEXT: vmovdqa %xmm2, %xmm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v8i16_fu3ucc5u: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX2OR512VL-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,4,4] +; AVX2OR512VL-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2OR512VL-NEXT: retq ; ; XOP-LABEL: shuffle_v8i16_fu3ucc5u: ; XOP: # %bb.0: diff --git a/test/CodeGen/X86/vector-shuffle-256-v16.ll b/test/CodeGen/X86/vector-shuffle-256-v16.ll index c16e6ece6ec..2e7e59ab456 100644 --- a/test/CodeGen/X86/vector-shuffle-256-v16.ll +++ b/test/CodeGen/X86/vector-shuffle-256-v16.ll @@ -1374,17 +1374,11 @@ define <16 x i16> @shuffle_v16i16_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_3 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_31: -; AVX2: # %bb.0: -; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i16_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_31: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,31] -; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v16i16_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_31: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15] +; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2OR512VL-NEXT: retq ; ; XOPAVX1-LABEL: shuffle_v16i16_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_31: ; XOPAVX1: # %bb.0: @@ -1409,17 +1403,11 @@ define <16 x i16> @shuffle_v16i16_16_01_02_03_04_05_06_07_08_09_10_11_12_13_14_1 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_16_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15: -; AVX2: # %bb.0: -; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3,4,5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i16_16_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [16,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v16i16_16_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3,4,5,6,7] +; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2OR512VL-NEXT: retq ; ; XOPAVX1-LABEL: shuffle_v16i16_16_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15: ; XOPAVX1: # %bb.0: diff --git a/test/CodeGen/X86/vector-shuffle-512-v64.ll b/test/CodeGen/X86/vector-shuffle-512-v64.ll index fbdc5a5bf60..fbbaf720336 100644 --- a/test/CodeGen/X86/vector-shuffle-512-v64.ll +++ b/test/CodeGen/X86/vector-shuffle-512-v64.ll @@ -831,8 +831,9 @@ define <64 x i8> @shuffle_v64i8_shift_00_02_04_06_08_10_12_14_64_66_68_70_72_74_ ; ; AVX512VBMI-LABEL: shuffle_v64i8_shift_00_02_04_06_08_10_12_14_64_66_68_70_72_74_76_78_16_18_20_22_24_26_28_30_80_82_84_86_88_90_92_94_32_34_36_38_40_42_44_46_96_98_100_102_104_106_108_110_48_50_52_54_56_58_60_62_112_114_116_118_120_122_124_126: ; AVX512VBMI: # %bb.0: -; AVX512VBMI-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,3,5,7,9,11,13,15,65,67,69,71,73,75,77,79,17,19,21,23,25,27,29,31,81,83,85,87,89,91,93,95,33,35,37,39,41,43,45,47,97,99,101,103,105,107,109,111,49,51,53,55,57,59,61,63,113,115,117,119,121,123,125,127] -; AVX512VBMI-NEXT: vpermt2b %zmm1, %zmm2, %zmm0 +; AVX512VBMI-NEXT: vpsrlw $8, %zmm0, %zmm0 +; AVX512VBMI-NEXT: vpsrlw $8, %zmm1, %zmm1 +; AVX512VBMI-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 ; AVX512VBMI-NEXT: retq %1 = lshr <32 x i16> %a0, %2 = lshr <32 x i16> %a1, diff --git a/test/CodeGen/X86/vector-shuffle-v48.ll b/test/CodeGen/X86/vector-shuffle-v48.ll index 974eb083eaf..7eed2dc614e 100644 --- a/test/CodeGen/X86/vector-shuffle-v48.ll +++ b/test/CodeGen/X86/vector-shuffle-v48.ll @@ -71,9 +71,9 @@ define <32 x i8> @foo(<48 x i8>* %x0) { ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqu8 %ymm2, %ymm1 {%k1} ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,1,2,4,5,7,8,10,11,13,14] -; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31] -; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm0 +; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512BW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512BW-NEXT: retq ; ; AVX512VBMI-LABEL: foo: diff --git a/test/CodeGen/X86/x86-interleaved-access.ll b/test/CodeGen/X86/x86-interleaved-access.ll index 75a45c459d5..88bff5fac9e 100644 --- a/test/CodeGen/X86/x86-interleaved-access.ll +++ b/test/CodeGen/X86/x86-interleaved-access.ll @@ -443,15 +443,16 @@ define <8 x i8> @interleaved_load_vf8_i8_stride4(<32 x i8>* %ptr) { ; AVX512: # %bb.0: ; AVX512-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [1,0,3,2,33,32,35,34] -; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [6,7,2,3,14,15,10,11,14,15,10,11,12,13,14,15] -; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm4 -; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm3 -; AVX512-NEXT: vpermi2w %zmm4, %zmm3, %zmm2 -; AVX512-NEXT: vprold $8, %zmm4, %zmm4 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,2,3,14,15,10,11,14,15,10,11,12,13,14,15] +; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm3 +; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[1,0,3,2,4,5,6,7] +; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm2 +; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[1,0,3,2,4,5,6,7] +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm5[0],xmm4[0] ; AVX512-NEXT: vprold $8, %zmm3, %zmm3 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0] -; AVX512-NEXT: vpaddb %xmm2, %xmm3, %xmm2 +; AVX512-NEXT: vprold $8, %zmm2, %zmm2 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; AVX512-NEXT: vpaddb %xmm4, %xmm2, %xmm2 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = <1,u,5,u,9,u,13,u,13,u,5,u,12,u,13,u> ; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm0