From 3dde4af99bc8476516217ea56918bbcd1f160813 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Sun, 3 Jul 2016 19:50:06 +0000 Subject: [PATCH] [X86][AVX512] Add support for 512-bit shuffle lowering to VPERMPD/VPERMQ llvm-svn: 274473 --- lib/Target/X86/X86ISelLowering.cpp | 53 +++++++--- test/CodeGen/X86/vector-shuffle-512-v8.ll | 114 ++++++++-------------- 2 files changed, 78 insertions(+), 89 deletions(-) diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 34f6f23d7e8..02f1bf5a314 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -7040,10 +7040,10 @@ static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef Mask) { return false; } -/// \brief Test whether a shuffle mask is equivalent within each 128-bit lane. +/// \brief Test whether a shuffle mask is equivalent within each sub-lane. /// /// This checks a shuffle mask to see if it is performing the same -/// 128-bit lane-relative shuffle in each 128-bit lane. This trivially implies +/// lane-relative shuffle in each sub-lane. This trivially implies /// that it is also not lane-crossing. It may however involve a blend from the /// same lane of a second vector. /// @@ -7051,10 +7051,10 @@ static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef Mask) { /// non-trivial to compute in the face of undef lanes. The representation is /// suitable for use with existing 128-bit shuffles as entries from the second /// vector have been remapped to [LaneSize, 2*LaneSize). -static bool -is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef Mask, - SmallVectorImpl &RepeatedMask) { - int LaneSize = 128 / VT.getScalarSizeInBits(); +static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT, + ArrayRef Mask, + SmallVectorImpl &RepeatedMask) { + int LaneSize = LaneSizeInBits / VT.getScalarSizeInBits(); RepeatedMask.assign(LaneSize, -1); int Size = Mask.size(); for (int i = 0; i < Size; ++i) { @@ -7078,6 +7078,20 @@ is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef Mask, return true; } +/// Test whether a shuffle mask is equivalent within each 128-bit lane. +static bool +is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef Mask, + SmallVectorImpl &RepeatedMask) { + return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask); +} + +/// Test whether a shuffle mask is equivalent within each 256-bit lane. +static bool +is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef Mask, + SmallVectorImpl &RepeatedMask) { + return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask); +} + /// \brief Checks whether a shuffle mask is equivalent to an explicit list of /// arguments. /// @@ -11732,6 +11746,11 @@ static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef Mask, return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1, DAG.getConstant(VPERMILPMask, DL, MVT::i8)); } + + SmallVector RepeatedMask; + if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) + return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1, + getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG)); } if (SDValue Shuf128 = @@ -11791,16 +11810,17 @@ static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef Mask, lowerV4X128VectorShuffle(DL, MVT::v8i64, Mask, V1, V2, DAG)) return Shuf128; - // When the shuffle is mirrored between the 128-bit lanes of the unit, we can - // use lower latency instructions that will operate on both 128-bit lanes. - SmallVector RepeatedMask; - if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, RepeatedMask)) { - if (V2.isUndef()) { + if (V2.isUndef()) { + // When the shuffle is mirrored between the 128-bit lanes of the unit, we + // can use lower latency instructions that will operate on all four + // 128-bit lanes. + SmallVector Repeated128Mask; + if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) { int PSHUFDMask[] = {-1, -1, -1, -1}; for (int i = 0; i < 2; ++i) - if (RepeatedMask[i] >= 0) { - PSHUFDMask[2 * i] = 2 * RepeatedMask[i]; - PSHUFDMask[2 * i + 1] = 2 * RepeatedMask[i] + 1; + if (Repeated128Mask[i] >= 0) { + PSHUFDMask[2 * i] = 2 * Repeated128Mask[i]; + PSHUFDMask[2 * i + 1] = 2 * Repeated128Mask[i] + 1; } return DAG.getBitcast( MVT::v8i64, @@ -11808,6 +11828,11 @@ static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef Mask, DAG.getBitcast(MVT::v16i32, V1), getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG))); } + + SmallVector Repeated256Mask; + if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask)) + return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1, + getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG)); } // Try to use shift instructions. diff --git a/test/CodeGen/X86/vector-shuffle-512-v8.ll b/test/CodeGen/X86/vector-shuffle-512-v8.ll index 3edcfd98077..6924c8f169a 100644 --- a/test/CodeGen/X86/vector-shuffle-512-v8.ll +++ b/test/CodeGen/X86/vector-shuffle-512-v8.ll @@ -1,9 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX512F +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX512F ; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX512F-32 -target triple = "x86_64-unknown-unknown" - define <8 x double> @shuffle_v8f64_00000000(<8 x double> %a, <8 x double> %b) { ; AVX512F-LABEL: shuffle_v8f64_00000000: ; AVX512F: # BB#0: @@ -172,12 +170,12 @@ define <8 x double> @shuffle_v8f64_70000000(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_01014545(<8 x double> %a, <8 x double> %b) { ; AVX512F-LABEL: shuffle_v8f64_01014545: ; AVX512F: # BB#0: -; AVX512F-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5] +; AVX512F-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5] ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8f64_01014545: ; AVX512F-32: # BB#0: -; AVX512F-32-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5] +; AVX512F-32-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5] ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -435,14 +433,12 @@ define <8 x double> @shuffle_v8f64_00014445(<8 x double> %a, <8 x double> %b) { ; ; AVX512F-LABEL: shuffle_v8f64_00014445: ; AVX512F: # BB#0: -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,1,4,4,4,5] -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[0,0,0,1,4,4,4,5] ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8f64_00014445: ; AVX512F-32: # BB#0: -; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,0,4,0,4,0,4,0,5,0] -; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[0,0,0,1,4,4,4,5] ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -452,14 +448,12 @@ define <8 x double> @shuffle_v8f64_00204464(<8 x double> %a, <8 x double> %b) { ; ; AVX512F-LABEL: shuffle_v8f64_00204464: ; AVX512F: # BB#0: -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,2,0,4,4,6,4] -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[0,0,2,0,4,4,6,4] ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8f64_00204464: ; AVX512F-32: # BB#0: -; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,2,0,0,0,4,0,4,0,6,0,4,0] -; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[0,0,2,0,4,4,6,4] ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -469,14 +463,12 @@ define <8 x double> @shuffle_v8f64_03004744(<8 x double> %a, <8 x double> %b) { ; ; AVX512F-LABEL: shuffle_v8f64_03004744: ; AVX512F: # BB#0: -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,3,0,0,4,7,4,4] -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[0,3,0,0,4,7,4,4] ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8f64_03004744: ; AVX512F-32: # BB#0: -; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,3,0,0,0,0,0,4,0,7,0,4,0,4,0] -; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[0,3,0,0,4,7,4,4] ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -486,14 +478,12 @@ define <8 x double> @shuffle_v8f64_10005444(<8 x double> %a, <8 x double> %b) { ; ; AVX512F-LABEL: shuffle_v8f64_10005444: ; AVX512F: # BB#0: -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,0,0,0,5,4,4,4] -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[1,0,0,0,5,4,4,4] ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8f64_10005444: ; AVX512F-32: # BB#0: -; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,0,0,0,0,0,0,0,5,0,4,0,4,0,4,0] -; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[1,0,0,0,5,4,4,4] ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -503,14 +493,12 @@ define <8 x double> @shuffle_v8f64_22006644(<8 x double> %a, <8 x double> %b) { ; ; AVX512F-LABEL: shuffle_v8f64_22006644: ; AVX512F: # BB#0: -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,0,0,6,6,4,4] -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[2,2,0,0,6,6,4,4] ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8f64_22006644: ; AVX512F-32: # BB#0: -; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,0,2,0,0,0,0,0,6,0,6,0,4,0,4,0] -; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[2,2,0,0,6,6,4,4] ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -520,14 +508,12 @@ define <8 x double> @shuffle_v8f64_33307774(<8 x double> %a, <8 x double> %b) { ; ; AVX512F-LABEL: shuffle_v8f64_33307774: ; AVX512F: # BB#0: -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,3,3,0,7,7,7,4] -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[3,3,3,0,7,7,7,4] ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8f64_33307774: ; AVX512F-32: # BB#0: -; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,0,3,0,3,0,0,0,7,0,7,0,7,0,4,0] -; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[3,3,3,0,7,7,7,4] ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -537,14 +523,12 @@ define <8 x double> @shuffle_v8f64_32107654(<8 x double> %a, <8 x double> %b) { ; ; AVX512F-LABEL: shuffle_v8f64_32107654: ; AVX512F: # BB#0: -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,2,1,0,7,6,5,4] -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4] ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8f64_32107654: ; AVX512F-32: # BB#0: -; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,0,2,0,1,0,0,0,7,0,6,0,5,0,4,0] -; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4] ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -1425,14 +1409,12 @@ define <8 x i64> @shuffle_v8i64_00014445(<8 x i64> %a, <8 x i64> %b) { ; ; AVX512F-LABEL: shuffle_v8i64_00014445: ; AVX512F: # BB#0: -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,1,4,4,4,5] -; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,0,0,1,4,4,4,5] ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8i64_00014445: ; AVX512F-32: # BB#0: -; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,0,4,0,4,0,4,0,5,0] -; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,0,0,1,4,4,4,5] ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -1442,14 +1424,12 @@ define <8 x i64> @shuffle_v8i64_00204464(<8 x i64> %a, <8 x i64> %b) { ; ; AVX512F-LABEL: shuffle_v8i64_00204464: ; AVX512F: # BB#0: -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,2,0,4,4,6,4] -; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,0,2,0,4,4,6,4] ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8i64_00204464: ; AVX512F-32: # BB#0: -; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,2,0,0,0,4,0,4,0,6,0,4,0] -; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,0,2,0,4,4,6,4] ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -1459,14 +1439,12 @@ define <8 x i64> @shuffle_v8i64_03004744(<8 x i64> %a, <8 x i64> %b) { ; ; AVX512F-LABEL: shuffle_v8i64_03004744: ; AVX512F: # BB#0: -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,3,0,0,4,7,4,4] -; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,3,0,0,4,7,4,4] ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8i64_03004744: ; AVX512F-32: # BB#0: -; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,3,0,0,0,0,0,4,0,7,0,4,0,4,0] -; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,3,0,0,4,7,4,4] ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -1476,14 +1454,12 @@ define <8 x i64> @shuffle_v8i64_10005444(<8 x i64> %a, <8 x i64> %b) { ; ; AVX512F-LABEL: shuffle_v8i64_10005444: ; AVX512F: # BB#0: -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,0,0,0,5,4,4,4] -; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vpermq {{.*#+}} zmm0 = zmm0[1,0,0,0,5,4,4,4] ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8i64_10005444: ; AVX512F-32: # BB#0: -; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,0,0,0,0,0,0,0,5,0,4,0,4,0,4,0] -; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vpermq {{.*#+}} zmm0 = zmm0[1,0,0,0,5,4,4,4] ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -1493,14 +1469,12 @@ define <8 x i64> @shuffle_v8i64_22006644(<8 x i64> %a, <8 x i64> %b) { ; ; AVX512F-LABEL: shuffle_v8i64_22006644: ; AVX512F: # BB#0: -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,0,0,6,6,4,4] -; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,2,0,0,6,6,4,4] ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8i64_22006644: ; AVX512F-32: # BB#0: -; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,0,2,0,0,0,0,0,6,0,6,0,4,0,4,0] -; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,2,0,0,6,6,4,4] ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -1510,14 +1484,12 @@ define <8 x i64> @shuffle_v8i64_33307774(<8 x i64> %a, <8 x i64> %b) { ; ; AVX512F-LABEL: shuffle_v8i64_33307774: ; AVX512F: # BB#0: -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,3,3,0,7,7,7,4] -; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vpermq {{.*#+}} zmm0 = zmm0[3,3,3,0,7,7,7,4] ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8i64_33307774: ; AVX512F-32: # BB#0: -; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,0,3,0,3,0,0,0,7,0,7,0,7,0,4,0] -; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vpermq {{.*#+}} zmm0 = zmm0[3,3,3,0,7,7,7,4] ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -1527,14 +1499,12 @@ define <8 x i64> @shuffle_v8i64_32107654(<8 x i64> %a, <8 x i64> %b) { ; ; AVX512F-LABEL: shuffle_v8i64_32107654: ; AVX512F: # BB#0: -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,2,1,0,7,6,5,4] -; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vpermq {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4] ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8i64_32107654: ; AVX512F-32: # BB#0: -; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,0,2,0,1,0,0,0,7,0,6,0,5,0,4,0] -; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vpermq {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4] ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -1544,14 +1514,12 @@ define <8 x i64> @shuffle_v8i64_00234467(<8 x i64> %a, <8 x i64> %b) { ; ; AVX512F-LABEL: shuffle_v8i64_00234467: ; AVX512F: # BB#0: -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,2,3,4,4,6,7] -; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,0,2,3,4,4,6,7] ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8i64_00234467: ; AVX512F-32: # BB#0: -; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,2,0,3,0,4,0,4,0,6,0,7,0] -; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,0,2,3,4,4,6,7] ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -1606,14 +1574,12 @@ define <8 x i64> @shuffle_v8i64_10235467(<8 x i64> %a, <8 x i64> %b) { ; ; AVX512F-LABEL: shuffle_v8i64_10235467: ; AVX512F: # BB#0: -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,0,2,3,5,4,6,7] -; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vpermq {{.*#+}} zmm0 = zmm0[1,0,2,3,5,4,6,7] ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8i64_10235467: ; AVX512F-32: # BB#0: -; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,0,0,0,2,0,3,0,5,0,4,0,6,0,7,0] -; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vpermq {{.*#+}} zmm0 = zmm0[1,0,2,3,5,4,6,7] ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -1623,14 +1589,12 @@ define <8 x i64> @shuffle_v8i64_10225466(<8 x i64> %a, <8 x i64> %b) { ; ; AVX512F-LABEL: shuffle_v8i64_10225466: ; AVX512F: # BB#0: -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,0,2,2,5,4,6,6] -; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vpermq {{.*#+}} zmm0 = zmm0[1,0,2,2,5,4,6,6] ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8i64_10225466: ; AVX512F-32: # BB#0: -; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,0,0,0,2,0,2,0,5,0,4,0,6,0,6,0] -; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vpermq {{.*#+}} zmm0 = zmm0[1,0,2,2,5,4,6,6] ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -2269,12 +2233,12 @@ define <8 x double> @shuffle_v8f64_2301uu67(<8 x double> %a0, <8 x double> %a1) define <8 x double> @shuffle_v8f64_2301uuuu(<8 x double> %a0, <8 x double> %a1) { ; AVX512F-LABEL: shuffle_v8f64_2301uuuu: ; AVX512F: # BB#0: -; AVX512F-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm1[2,3,0,1],zmm0[0,1,0,1] +; AVX512F-NEXT: vpermpd {{.*#+}} zmm0 = zmm1[2,3,0,1,6,7,4,5] ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8f64_2301uuuu: ; AVX512F-32: # BB#0: -; AVX512F-32-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm1[2,3,0,1],zmm0[0,1,0,1] +; AVX512F-32-NEXT: vpermpd {{.*#+}} zmm0 = zmm1[2,3,0,1,6,7,4,5] ; AVX512F-32-NEXT: retl %1 = shufflevector <8 x double> %a1, <8 x double> undef, <8 x i32> ret <8 x double> %1