diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 8415c8f708d..f85b00a84f8 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -6840,6 +6840,13 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { // convert it to a vector with movd (S2V+shuffle to zero extend). Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item); Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item); + + // If using the new shuffle lowering, just directly insert this. + if (ExperimentalVectorShuffleLowering) + return DAG.getNode( + ISD::BITCAST, dl, VT, + getShuffleVectorZeroOrUndef(Item, Idx * 2, true, Subtarget, DAG)); + Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); // Now we have our 32-bit value zero extended in the low element of @@ -6913,6 +6920,10 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { if (EVTBits == 32) { Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); + // If using the new shuffle lowering, just directly insert this. + if (ExperimentalVectorShuffleLowering) + return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG); + // Turn it into a shuffle of zero and zero-extended scalar to vector. Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, Subtarget, DAG); SmallVector MaskVec; @@ -7492,7 +7503,10 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, ArrayRef Mask = SVOp->getMask(); assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); - if (isSingleInputShuffleMask(Mask)) + int NumV2Elements = + std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }); + + if (NumV2Elements == 0) // Straight shuffle of a single input vector. For everything from SSE2 // onward this has a single fast instruction with no scary immediates. return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1, @@ -7504,6 +7518,52 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, if (isShuffleEquivalent(Mask, 2, 6, 3, 7)) return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i32, V1, V2); + // There are special ways we can lower some single-element blends. + if (NumV2Elements == 1) { + int V2Index = + std::find_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }) - + Mask.begin(); + + // Check for a single input from a SCALAR_TO_VECTOR node. + // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and + // all the smarts here sunk into that routine. However, the current + // lowering of BUILD_VECTOR makes that nearly impossible until the old + // vector shuffle lowering is dead. + if ((Mask[V2Index] == 4 && V2.getOpcode() == ISD::SCALAR_TO_VECTOR) || + V2.getOpcode() == ISD::BUILD_VECTOR) { + SDValue V2S = V2.getOperand(Mask[V2Index] - 4); + + bool V1IsAllZero = false; + if (ISD::isBuildVectorAllZeros(V1.getNode())) { + V1IsAllZero = true; + } else if (V1.getOpcode() == ISD::BUILD_VECTOR) { + V1IsAllZero = true; + for (int M : Mask) { + if (M < 0 || M >= 4) + continue; + SDValue Input = V1.getOperand(M); + if (Input.getOpcode() != ISD::UNDEF && !X86::isZeroNode(Input)) { + // A non-zero input! + V1IsAllZero = false; + break; + } + } + } + if (V1IsAllZero) { + V2 = DAG.getNode( + X86ISD::VZEXT_MOVL, DL, MVT::v4i32, + DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, V2S)); + if (V2Index != 0) { + int V2Shuffle[] = {1, 1, 1, 1}; + V2Shuffle[V2Index] = 0; + V2 = DAG.getVectorShuffle(MVT::v4i32, DL, V2, + DAG.getUNDEF(MVT::v4i32), V2Shuffle); + } + return V2; + } + } + } + // We implement this with SHUFPS because it can blend from two vectors. // Because we're going to eventually use SHUFPS, we use SHUFPS even to build // up the inputs, bypassing domain shift penalties that we would encur if we diff --git a/test/CodeGen/X86/vec_set-3.ll b/test/CodeGen/X86/vec_set-3.ll index bdd85213885..9823963d61b 100644 --- a/test/CodeGen/X86/vec_set-3.ll +++ b/test/CodeGen/X86/vec_set-3.ll @@ -1,10 +1,15 @@ ; RUN: llc < %s -march=x86 -mattr=+sse2 -mcpu=penryn | FileCheck %s +; RUN: llc < %s -march=x86 -mattr=+sse2 -mcpu=penryn -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=CHECK-EXP define <4 x float> @test(float %a) { ; CHECK-LABEL: test: ; CHECK: movss {{.*}}, %xmm0 ; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1] ; CHECK-NEXT: retl +; +; CHECK-EXP-LABEL: test: +; CHECK-EXP: insertps $285, {{.*}}, %xmm0 +; CHECK-EXP-NEXT: retl entry: %tmp = insertelement <4 x float> zeroinitializer, float %a, i32 1 @@ -18,6 +23,11 @@ define <2 x i64> @test2(i32 %a) { ; CHECK: movd {{.*}}, %xmm0 ; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,0,1] ; CHECK-NEXT: retl +; +; CHECK-EXP-LABEL: test2: +; CHECK-EXP: movd {{.*}}, %xmm0 +; CHECK-EXP-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,0,1] +; CHECK-EXP-NEXT: retl entry: %tmp7 = insertelement <4 x i32> zeroinitializer, i32 %a, i32 2 @@ -32,6 +42,10 @@ define <4 x float> @test3(<4 x float> %A) { ; CHECK-NEXT: movss %xmm0, %[[X1]] ; CHECK-NEXT: pshufd {{.*#+}} xmm0 = [[X1]][1,0,1,1] ; CHECK-NEXT: retl +; +; CHECK-EXP-LABEL: test3: +; CHECK-EXP: insertps {{.*#+}} xmm0 = zero,xmm0[0],zero,zero +; CHECK-EXP-NEXT: retl %tmp0 = extractelement <4 x float> %A, i32 0 %tmp1 = insertelement <4 x float> , float %tmp0, i32 1 diff --git a/test/CodeGen/X86/vector-shuffle-128-v4.ll b/test/CodeGen/X86/vector-shuffle-128-v4.ll index 7f448835b5d..9105197f67c 100644 --- a/test/CodeGen/X86/vector-shuffle-128-v4.ll +++ b/test/CodeGen/X86/vector-shuffle-128-v4.ll @@ -317,3 +317,52 @@ define <4 x float> @shuffle_v4f32_z6zz(<4 x float> %a) { %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> ret <4 x float> %shuffle } + +define <4 x i32> @shuffle_v4i32_4zzz(i32 %i) { +; ALL-LABEL: @shuffle_v4i32_4zzz +; ALL: movd {{.*}}, %xmm0 +; ALL-NEXT: retq + %a = insertelement <4 x i32> undef, i32 %i, i32 0 + %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> + ret <4 x i32> %shuffle +} + +define <4 x i32> @shuffle_v4i32_z4zz(i32 %i) { +; ALL-LABEL: @shuffle_v4i32_z4zz +; ALL: movd {{.*}}, %xmm0 +; ALL-NEXT: pshufd {{.*}} # xmm0 = xmm0[1,0,1,1] +; ALL-NEXT: retq + %a = insertelement <4 x i32> undef, i32 %i, i32 0 + %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> + ret <4 x i32> %shuffle +} + +define <4 x i32> @shuffle_v4i32_zz4z(i32 %i) { +; ALL-LABEL: @shuffle_v4i32_zz4z +; ALL: movd {{.*}}, %xmm0 +; ALL-NEXT: pshufd {{.*}} # xmm0 = xmm0[1,1,0,1] +; ALL-NEXT: retq + %a = insertelement <4 x i32> undef, i32 %i, i32 0 + %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> + ret <4 x i32> %shuffle +} + +define <4 x i32> @shuffle_v4i32_zuu4(i32 %i) { +; ALL-LABEL: @shuffle_v4i32_zuu4 +; ALL: movd {{.*}}, %xmm0 +; ALL-NEXT: pshufd {{.*}} # xmm0 = xmm0[1,1,1,0] +; ALL-NEXT: retq + %a = insertelement <4 x i32> undef, i32 %i, i32 0 + %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> + ret <4 x i32> %shuffle +} + +define <4 x i32> @shuffle_v4i32_z6zz(i32 %i) { +; ALL-LABEL: @shuffle_v4i32_z6zz +; ALL: movd {{.*}}, %xmm0 +; ALL-NEXT: pshufd {{.*}} # xmm0 = xmm0[1,0,1,1] +; ALL-NEXT: retq + %a = insertelement <4 x i32> undef, i32 %i, i32 2 + %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> + ret <4 x i32> %shuffle +}