From ca60fdbee15ac8ec180c1b2c6b4d5bb983bcc0a3 Mon Sep 17 00:00:00 2001 From: Chandler Carruth Date: Wed, 3 Sep 2014 22:48:34 +0000 Subject: [PATCH] [x86] Teach the new vector shuffle lowering about the simplest of 'insertps' patterns. This replaces two shuffles with a single insertps in very common cases. My next patch will extend this to leverage the zeroing capabilities of insertps which will allow it to be used in a much wider set of cases. llvm-svn: 217100 --- lib/Target/X86/X86ISelLowering.cpp | 29 +++++++++++++++++++++++ test/CodeGen/X86/vector-shuffle-128-v4.ll | 16 +++++++++---- 2 files changed, 41 insertions(+), 4 deletions(-) diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index b024f331d10..8b102e4fbb9 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -7182,6 +7182,21 @@ static bool isSingleInputShuffleMask(ArrayRef<int> Mask) { return true; } +/// \brief Check whether all of one set of inputs to a shuffle mask are in place. +/// +/// Mask entries pointing at the other input or undef will be skipped. +static bool isShuffleMaskInputInPlace(ArrayRef<int> Mask, bool LoInput = true) { + int Size = Mask.size(); + for (int i = 0; i < Size; ++i) { + int M = Mask[i]; + if (M == -1 || (LoInput && M >= 4) || (!LoInput && M < 4)) + continue; + if (M - (LoInput ? 0 : Size) != i) + return false; + } + return true; +} + // Hide this symbol with an anonymous namespace instead of 'static' so that MSVC // 2013 will allow us to use it as a non-type template parameter. namespace { @@ -7365,6 +7380,20 @@ static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, int V2Index = std::find_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }) - Mask.begin(); + + // Check for whether we can use INSERTPS to perform the blend. 
We only use + // INSERTPS when the V1 elements are already in the correct locations + // because otherwise we can just always use two SHUFPS instructions which + // are much smaller to encode than a SHUFPS and an INSERTPS. + if (Subtarget->hasSSE41() && + isShuffleMaskInputInPlace(Mask, /*LoInput*/ true)) { + // Insert the V2 element into the desired position. + SDValue InsertPSMask = + DAG.getIntPtrConstant(Mask[V2Index] << 6 | V2Index << 4); + return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2, + InsertPSMask); + } + // Compute the index adjacent to V2Index and in the same half by toggling // the low bit. int V2AdjIndex = V2Index ^ 1; diff --git a/test/CodeGen/X86/vector-shuffle-128-v4.ll b/test/CodeGen/X86/vector-shuffle-128-v4.ll index a044dedc4df..0c43e0e9d27 100644 --- a/test/CodeGen/X86/vector-shuffle-128-v4.ll +++ b/test/CodeGen/X86/vector-shuffle-128-v4.ll @@ -121,10 +121,18 @@ define <4 x float> @shuffle_v4f32_3210(<4 x float> %a, <4 x float> %b) { } define <4 x i32> @shuffle_v4i32_0124(<4 x i32> %a, <4 x i32> %b) { -; ALL-LABEL: @shuffle_v4i32_0124 -; ALL: shufps {{.*}} # xmm1 = xmm1[0,0],xmm0[2,0] -; ALL-NEXT: shufps {{.*}} # xmm0 = xmm0[0,1],xmm1[2,0] -; ALL-NEXT: retq +; SSE2-LABEL: @shuffle_v4i32_0124 +; SSE2: shufps {{.*}} # xmm1 = xmm1[0,0],xmm0[2,0] +; SSE2-NEXT: shufps {{.*}} # xmm0 = xmm0[0,1],xmm1[2,0] +; SSE2-NEXT: retq +; +; SSE41-LABEL: @shuffle_v4i32_0124 +; SSE41: insertps {{.*}} # xmm0 = xmm0[0,1,2],xmm1[0] +; SSE41-NEXT: retq +; +; AVX1-LABEL: @shuffle_v4i32_0124 +; AVX1: vinsertps {{.*}} # xmm0 = xmm0[0,1,2],xmm1[0] +; AVX1-NEXT: retq %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
 ret <4 x i32> %shuffle }