From d91fbd97b2e101e1826f151de3db1bc4299d78fc Mon Sep 17 00:00:00 2001
From: Elena Demikhovsky <elena.demikhovsky@intel.com>
Date: Tue, 2 Jun 2015 13:43:18 +0000
Subject: [PATCH] AVX-512: Shorten implementation of lowerV16X32VectorShuffle()
 using lowerVectorShuffleWithSHUFPS() and other shuffle-helpers routines.
 Added matching of VALIGN instruction.

llvm-svn: 238830
---
 lib/Target/X86/X86ISelLowering.cpp | 100 ++++++++++++++++++-----------
 test/CodeGen/X86/avx512-shuffle.ll |  30 ++++++++-
 2 files changed, 88 insertions(+), 42 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 908e7b07a15..a4787c81661 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -10089,6 +10089,49 @@ static SDValue lower256BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   }
 }
 
+static SDValue lowerVectorShuffleWithVALIGN(SDLoc DL, MVT VT,
+                                            ArrayRef<int> Mask, SDValue V1,
+                                            SDValue V2, SelectionDAG &DAG) {
+
+  assert(VT.getScalarSizeInBits() >= 32 && "Unexpected data type for VALIGN");
+  // VALIGN pattern 2, 3, 4, 5, .. (sequential, shifted right)
+  int AlignVal = -1;
+  for (int i = 0; i < (signed)VT.getVectorNumElements(); ++i) {
+    if (Mask[i] < 0)
+      continue;
+    if (Mask[i] < i)
+      return SDValue();
+    if (AlignVal == -1)
+      AlignVal = Mask[i] - i;
+    else if (Mask[i] - i != AlignVal)
+      return SDValue();
+  }
+  return DAG.getNode(X86ISD::VALIGN, DL, VT, V1, V2,
+                     DAG.getConstant(AlignVal, DL, MVT::i8));
+}
+
+static SDValue lowerVectorShuffleWithPERMV(SDLoc DL, MVT VT,
+                                           ArrayRef<int> Mask, SDValue V1,
+                                           SDValue V2, SelectionDAG &DAG) {
+
+  assert(VT.getScalarSizeInBits() >= 16 && "Unexpected data type for PERMV");
+
+  MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
+  MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements());
+
+  SmallVector<SDValue, 32> VPermMask;
+  for (unsigned i = 0; i < VT.getVectorNumElements(); ++i)
+    VPermMask.push_back(Mask[i] < 0 ? DAG.getUNDEF(MaskEltVT) :
+                        DAG.getConstant(Mask[i], DL,MaskEltVT));
+  SDValue MaskNode = DAG.getNode(ISD::BUILD_VECTOR, DL, MaskVecVT,
+                                 VPermMask);
+  if (isSingleInputShuffleMask(Mask))
+    return DAG.getNode(X86ISD::VPERMV, DL, VT, MaskNode, V1);
+
+  return DAG.getNode(X86ISD::VPERMV3, DL, VT, MaskNode, V1, V2);
+}
+
+
 /// \brief Handle lowering of 8-lane 64-bit floating point shuffles.
 static SDValue lowerV8X64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                                        const X86Subtarget *Subtarget,
@@ -10110,6 +10153,9 @@ static SDValue lowerV8X64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   if (isShuffleEquivalent(V1, V2, Mask, {1, 9, 3, 11, 5, 13, 7, 15}))
     return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
 
+  if (SDValue Op = lowerVectorShuffleWithVALIGN(DL, VT, Mask, V1, V2, DAG))
+    return Op;
+
   // VSHUFPD instruction - mask 0/1, 8/9, 2/3, 10/11, 4/5, 12/13, 6/7, 14/15
   bool ShufpdMask = true;
   unsigned Immediate = 0;
@@ -10124,8 +10170,8 @@ static SDValue lowerV8X64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
     Immediate |= (Mask[i]%2) << i;
   }
   if (ShufpdMask)
-    return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
-                      DAG.getConstant(Immediate, DL, MVT::i8));
+    return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
+                       DAG.getConstant(Immediate, DL, MVT::i8));
 
   // PERMILPD instruction - mask 0/1, 0/1, 2/3, 2/3, 4/5, 4/5, 6/7, 6/7
   if (isSingleInputShuffleMask(Mask)) {
@@ -10155,16 +10201,7 @@ static SDValue lowerV8X64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
       DAG.getConstant(Immediate, DL, MVT::i8));
     }
   }
-  SDValue VPermMask[8];
-  for (int i = 0; i < 8; ++i)
-    VPermMask[i] = Mask[i] < 0 ? 
DAG.getUNDEF(MVT::i64)
-                               : DAG.getConstant(Mask[i], DL, MVT::i64);
-  SDValue MaskNode = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i64,
-                                 VPermMask);
-  if (isSingleInputShuffleMask(Mask))
-    return DAG.getNode(X86ISD::VPERMV, DL, VT, MaskNode, V1);
-
-  return DAG.getNode(X86ISD::VPERMV3, DL, VT, MaskNode, V1, V2);
+  return lowerVectorShuffleWithPERMV(DL, VT, Mask, V1, V2, DAG);
 }
 
 /// \brief Handle lowering of 16-lane 32-bit integer shuffles.
@@ -10196,46 +10233,31 @@ static SDValue lowerV16X32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
     return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
 
   if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10,
-                                        12, 12, 14, 14}))
+                                         12, 12, 14, 14}))
     return DAG.getNode(X86ISD::MOVSLDUP, DL, VT, V1);
   if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11,
-                                        13, 13, 15, 15}))
+                                         13, 13, 15, 15}))
     return DAG.getNode(X86ISD::MOVSHDUP, DL, VT, V1);
 
   SmallVector<int, 4> RepeatedMask;
   if (is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask)) {
-    unsigned Immediate = 0;
-    for (int i = 0; i < 4; ++i)
-      if (RepeatedMask[i] > 0)
-        Immediate |= (RepeatedMask[i] & 3) << (i*2);
-
     if (isSingleInputShuffleMask(Mask)) {
       unsigned Opc = VT.isInteger() ? X86ISD::PSHUFD : X86ISD::VPERMILPI;
       return DAG.getNode(Opc, DL, VT, V1,
-                         DAG.getConstant(Immediate, DL, MVT::i8));
+                         getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
     }
 
-    // VSHUFPS pattern: 0-3, 0-3, 16-19, 16-19, 4-7, 4-7, 20-23, 20-23 ..
-    bool InterleavedMask = true;
-    for (int i = 0; i < 4; ++i)
-      if (RepeatedMask[i] >= 0 &&
-          ((i < 2 && RepeatedMask[i] > 2) || ( i >=2 && RepeatedMask[i] < 16)))
-        InterleavedMask = false;
-
-    if (InterleavedMask)
-      return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
-                         DAG.getConstant(Immediate, DL, MVT::i8));
+    for (int i = 0; i < 4; ++i) {
+      if (RepeatedMask[i] >= 16)
+        RepeatedMask[i] -= 12;
+    }
+    return lowerVectorShuffleWithSHUFPS(DL, VT, RepeatedMask, V1, V2, DAG);
   }
-  SDValue VPermMask[16];
-  for (int i = 0; i < 16; ++i)
-    VPermMask[i] = Mask[i] < 0 ? DAG.getUNDEF(MVT::i32)
-                               : DAG.getConstant(Mask[i], DL, MVT::i32);
-  SDValue MaskNode = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i32,
-                                 VPermMask);
-  if (V2.getOpcode() == ISD::UNDEF)
-    return DAG.getNode(X86ISD::VPERMV, DL, VT, MaskNode, V1);
-  return DAG.getNode(X86ISD::VPERMV3, DL, VT, MaskNode, V1, V2);
+
+  if (SDValue Op = lowerVectorShuffleWithVALIGN(DL, VT, Mask, V1, V2, DAG))
+    return Op;
+
+  return lowerVectorShuffleWithPERMV(DL, VT, Mask, V1, V2, DAG);
 }
 
 /// \brief Handle lowering of 32-lane 16-bit integer shuffles.
diff --git a/test/CodeGen/X86/avx512-shuffle.ll b/test/CodeGen/X86/avx512-shuffle.ll
index 1fbe76f2648..5d0de6f2d4e 100644
--- a/test/CodeGen/X86/avx512-shuffle.ll
+++ b/test/CodeGen/X86/avx512-shuffle.ll
@@ -114,6 +114,13 @@ define <16 x i32> @test15(<16 x i32> %a) {
  %b = shufflevector <16 x i32> %a, <16 x i32> undef, <16 x i32>
  ret <16 x i32> %b
 }
+; CHECK-LABEL: test16
+; CHECK: valignq $2, %zmm0, %zmm1
+; CHECK: ret
+define <8 x double> @test16(<8 x double> %a, <8 x double> %b) nounwind {
+  %c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
+  ret <8 x double> %c
+}
 
 ; CHECK-LABEL: test17
 ; CHECK: vshufpd $19, %zmm1, %zmm0
@@ -163,11 +170,20 @@ define <8 x i64> @test22(<8 x i64> %a, <8 x i64> %b) {
   ret <8 x i64> %shuffle
 }
 
+; CHECK-LABEL: @test23
+; CHECK: vshufps
+; CHECK: vshufps
+; CHECK: ret
+define <16 x i32> @test23(<16 x i32> %a, <16 x i32> %b) nounwind {
+  %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 2, i32 4, i32 20, i32 5, i32 6, i32 8, i32 24, i32 9, i32 10, i32 12, i32 28, i32 13, i32 14>
+  ret <16 x i32> %c
+}
+
 ; CHECK-LABEL: @test24
 ; CHECK: vpermt2d
 ; CHECK: ret
 define <16 x i32> @test24(<16 x i32> %a, <16 x i32> %b) nounwind {
-  %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32>
+  %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32>
   ret <16 x i32> %c
 }
 
@@ -175,8 +191,8 @@ define <16 x i32> @test24(<16 x i32> %a, <16 x i32> %b) nounwind {
 ; CHECK: vshufps $52
 ; CHECK: ret
 define <16 x i32> @test25(<16 x i32> %a, <16 x i32> %b) nounwind {
-; mask - 0-1-3-undef 00110100 = 0x34 = 52
-  %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 0, i32 1, i32 19, i32 undef, i32 4, i32 5, i32 23, i32 undef, i32 8, i32 9, i32 27, i32 undef, i32 12, i32 13, i32 31, i32 undef>
+; mask - 0-1-3-0 00110100 = 0x34 = 52
+  %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 0, i32 1, i32 19, i32 16, i32 4, i32 5, i32 23, i32 20, i32 8, i32 9, i32 27, i32 24, i32 12, i32 13, i32 31, i32 28>
   ret <16 x i32> %c
 }
 
@@ -218,3 +234,11 @@ define <16 x float> @test30(<16 x float> %a, <16 x float> %c) {
  %b = shufflevector <16 x float> %a, <16 x float> %c, <16 x i32>
  ret <16 x float> %b
 }
+
+; CHECK-LABEL: test31
+; CHECK: valignd $3, %zmm0, %zmm1
+; CHECK: ret
+define <16 x i32> @test31(<16 x i32> %a, <16 x i32> %b) nounwind {
+  %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18>
+  ret <16 x i32> %c
+}