diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 0c7d4bcabe8..cff91d6e3ce 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -30279,23 +30279,26 @@ static SDValue combineX86ShufflesRecursively(
   // Add the inputs to the Ops list, avoiding duplicates.
   SmallVector<SDValue, 16> Ops(SrcOps.begin(), SrcOps.end());
-  int InputIdx0 = -1, InputIdx1 = -1;
-  for (int i = 0, e = Ops.size(); i < e; ++i) {
-    SDValue BC = peekThroughBitcasts(Ops[i]);
-    if (Input0 && BC == peekThroughBitcasts(Input0))
-      InputIdx0 = i;
-    if (Input1 && BC == peekThroughBitcasts(Input1))
-      InputIdx1 = i;
-  }
+  auto AddOp = [&Ops](SDValue Input, int InsertionPoint = -1) -> int {
+    if (!Input)
+      return -1;
+    // Attempt to find an existing match.
+    SDValue InputBC = peekThroughBitcasts(Input);
+    for (int i = 0, e = Ops.size(); i < e; ++i)
+      if (InputBC == peekThroughBitcasts(Ops[i]))
+        return i;
+    // Match failed - should we replace an existing Op?
+    if (InsertionPoint >= 0) {
+      Ops[InsertionPoint] = Input;
+      return InsertionPoint;
+    }
+    // Add to the end of the Ops list.
+    Ops.push_back(Input);
+    return Ops.size() - 1;
+  };
 
-  if (Input0 && InputIdx0 < 0) {
-    InputIdx0 = SrcOpIndex;
-    Ops[SrcOpIndex] = Input0;
-  }
-  if (Input1 && InputIdx1 < 0) {
-    InputIdx1 = Ops.size();
-    Ops.push_back(Input1);
-  }
+  int InputIdx0 = AddOp(Input0, SrcOpIndex);
+  int InputIdx1 = AddOp(Input1);
 
   assert(((RootMask.size() > OpMask.size() &&
            RootMask.size() % OpMask.size() == 0) ||
diff --git a/test/CodeGen/X86/vector-shuffle-combining-avx2.ll b/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
index 05d19f315b0..122c77aefcc 100644
--- a/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
@@ -923,18 +923,12 @@ define <32 x i8> @combine_pshufb_pshufb_or_as_unpcklbw(<32 x i8> %a0, <32 x i8>
 define <32 x i8> @combine_pshufb_pshufb_or_pshufb(<32 x i8> %a0) {
 ; X32-LABEL: combine_pshufb_pshufb_or_pshufb:
 ; X32: # %bb.0:
-; X32-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,2,3],zero,zero,zero,zero,ymm0[0,1,2,3],zero,zero,zero,zero,ymm0[16,17,18,19],zero,zero,zero,zero,ymm0[16,17,18,19],zero,zero,zero,zero
-; X32-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,1,2,3],zero,zero,zero,zero,ymm0[0,1,2,3],zero,zero,zero,zero,ymm0[16,17,18,19],zero,zero,zero,zero,ymm0[16,17,18,19]
-; X32-NEXT: vpor %ymm0, %ymm1, %ymm0
-; X32-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,1,4,4,4,5]
+; X32-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: combine_pshufb_pshufb_or_pshufb:
 ; X64: # %bb.0:
-; X64-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,2,3],zero,zero,zero,zero,ymm0[0,1,2,3],zero,zero,zero,zero,ymm0[16,17,18,19],zero,zero,zero,zero,ymm0[16,17,18,19],zero,zero,zero,zero
-; X64-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,1,2,3],zero,zero,zero,zero,ymm0[0,1,2,3],zero,zero,zero,zero,ymm0[16,17,18,19],zero,zero,zero,zero,ymm0[16,17,18,19]
-; X64-NEXT: vpor %ymm0, %ymm1, %ymm0
-; X64-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,1,4,4,4,5]
+; X64-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
 ; X64-NEXT: retq
   %1 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> )
   %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> )
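For review purposes, the new AddOp helper boils down to a find-or-insert over the operand list, with an optional slot to overwrite when a recursed-through source operand has no existing match. The sketch below is a minimal standalone illustration of that behaviour, not the patch itself: the addOp name and the use of plain ints in place of SDValue are assumptions for illustration, and no peekThroughBitcasts-style canonicalisation is applied.

// Standalone sketch (not LLVM code) of the find-or-insert behaviour the new
// AddOp lambda implements.
#include <cassert>
#include <vector>

// Returns the index of Input in Ops, reusing an existing entry when possible.
// If there is no match and InsertionPoint is valid, the entry at that slot is
// overwritten (the role SrcOpIndex plays for Input0); otherwise Input is
// appended. A "null" input (0 here) maps to -1, mirroring the !Input check.
static int addOp(std::vector<int> &Ops, int Input, int InsertionPoint = -1) {
  if (Input == 0)
    return -1;
  for (int i = 0, e = (int)Ops.size(); i < e; ++i)
    if (Ops[i] == Input)
      return i;                  // Reuse an existing, deduplicated operand.
  if (InsertionPoint >= 0) {
    Ops[InsertionPoint] = Input; // Replace the slot being recursed through.
    return InsertionPoint;
  }
  Ops.push_back(Input);          // Otherwise grow the operand list.
  return (int)Ops.size() - 1;
}

int main() {
  std::vector<int> Ops = {10, 20, 30};
  assert(addOp(Ops, 20) == 1);    // Duplicate: index of the existing entry.
  assert(addOp(Ops, 40, 0) == 0); // No match: replaces slot 0.
  assert(addOp(Ops, 50) == 3);    // No match, no slot: appended at the end.
  assert(addOp(Ops, 0) == -1);    // "Null" input.
  return 0;
}

Compared with the removed code, this keeps the duplicate search and the two fallback paths in one place, so Input0 and Input1 are handled identically instead of by two hand-unrolled copies of the same logic.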
diff --git a/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll b/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
index b67f9e266eb..a1316eb41f8 100644
--- a/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
@@ -707,20 +707,23 @@ define <16 x i8> @combine_pshufb_pshufb_or_as_unpcklbw(<16 x i8> %a0, <16 x i8>
 define <16 x i8> @combine_pshufb_pshufb_or_pshufb(<16 x i8> %a0) {
 ; SSE-LABEL: combine_pshufb_pshufb_or_pshufb:
 ; SSE: # %bb.0:
-; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,2,3],zero,zero,zero,zero,xmm1[0,1,2,3],zero,zero,zero,zero
-; SSE-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero,xmm0[0,1,2,3]
-; SSE-NEXT: por %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
 ; SSE-NEXT: retq
 ;
-; AVX-LABEL: combine_pshufb_pshufb_or_pshufb:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,2,3],zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero,xmm0[0,1,2,3]
-; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,1]
-; AVX-NEXT: retq
+; AVX1-LABEL: combine_pshufb_pshufb_or_pshufb:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: combine_pshufb_pshufb_or_pshufb:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vbroadcastss %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: combine_pshufb_pshufb_or_pshufb:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vbroadcastss %xmm0, %xmm0
+; AVX512F-NEXT: retq
   %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> )
   %2 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> )
   %3 = or <16 x i8> %1, %2
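The updated CHECK lines reflect that each of the two PSHUFBs reads only bytes 0..3 of the same source and zeroes exactly the lanes the other one writes, so their OR is itself a single byte shuffle, which then resolves to the splat (pshufd/vpermilps/vbroadcastss) the tests now expect. The sketch below illustrates that mask merge in isolation; the mergeOrMasks name and the -1 "zeroed lane" sentinel are assumptions for illustration, not LLVM's internal representation.

// Illustrative-only sketch of the mask reasoning behind the new CHECK lines.
#include <cstddef>
#include <cstdio>
#include <vector>

constexpr int kZero = -1; // Stand-in for a byte lane the shuffle zeroes.

// If every lane is zeroed by at least one of the two masks, the OR of the two
// shuffles is itself a single byte shuffle: take whichever lane is live.
static bool mergeOrMasks(const std::vector<int> &M0, const std::vector<int> &M1,
                         std::vector<int> &Merged) {
  Merged.clear();
  if (M0.size() != M1.size())
    return false;
  for (std::size_t i = 0, e = M0.size(); i != e; ++i) {
    if (M0[i] != kZero && M1[i] != kZero)
      return false; // Live lanes overlap; the OR is not a plain shuffle.
    Merged.push_back(M0[i] != kZero ? M0[i] : M1[i]);
  }
  return true;
}

int main() {
  // The 128-bit test case: one PSHUFB writes bytes 0..3 of %a0 into the even
  // dwords, the other into the odd dwords (see the removed CHECK lines).
  std::vector<int> M0 = {0, 1, 2, 3, kZero, kZero, kZero, kZero,
                         0, 1, 2, 3, kZero, kZero, kZero, kZero};
  std::vector<int> M1 = {kZero, kZero, kZero, kZero, 0, 1, 2, 3,
                         kZero, kZero, kZero, kZero, 0, 1, 2, 3};
  std::vector<int> Merged;
  if (mergeOrMasks(M0, M1, Merged)) {
    // Prints "0 1 2 3" repeated four times: a splat of dword 0, i.e. the
    // single shuffle the updated tests check for.
    for (int Idx : Merged)
      std::printf("%d ", Idx);
    std::printf("\n");
  }
  return 0;
}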