diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 4a4f4a1e9ec..5fc8448d1e7 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -47297,6 +47297,11 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
        InVec.getOpcode() == X86ISD::VBROADCAST_LOAD))
     return extractSubVector(InVec, 0, DAG, SDLoc(N), VT.getSizeInBits());
 
+  // If we're extracting a broadcasted subvector, just use the source.
+  if (InVec.getOpcode() == X86ISD::SUBV_BROADCAST &&
+      InVec.getOperand(0).getValueType() == VT)
+    return InVec.getOperand(0);
+
   // If we're extracting the lowest subvector and we're the only user,
   // we may be able to perform this with a smaller vector width.
   if (IdxVal == 0 && InVec.hasOneUse()) {
diff --git a/test/CodeGen/X86/avx512-vbroadcasti256.ll b/test/CodeGen/X86/avx512-vbroadcasti256.ll
index aed8b680e40..33b3ddba0fc 100644
--- a/test/CodeGen/X86/avx512-vbroadcasti256.ll
+++ b/test/CodeGen/X86/avx512-vbroadcasti256.ll
@@ -54,9 +54,8 @@ define <16 x i32> @test_broadcast_8i32_16i32(<8 x i32> *%p) nounwind {
 define <32 x i16> @test_broadcast_16i16_32i16(<16 x i16> *%p) nounwind {
 ; X64-AVX512VL-LABEL: test_broadcast_16i16_32i16:
 ; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
+; X64-AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
 ; X64-AVX512VL-NEXT:    vpaddw {{.*}}(%rip), %ymm0, %ymm1
-; X64-AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
 ; X64-AVX512VL-NEXT:    vpaddw {{.*}}(%rip), %ymm0, %ymm0
 ; X64-AVX512VL-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; X64-AVX512VL-NEXT:    retq
@@ -69,9 +68,8 @@ define <32 x i16> @test_broadcast_16i16_32i16(<16 x i16> *%p) nounwind {
 ;
 ; X64-AVX512DQVL-LABEL: test_broadcast_16i16_32i16:
 ; X64-AVX512DQVL:       ## %bb.0:
-; X64-AVX512DQVL-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
+; X64-AVX512DQVL-NEXT:    vmovdqa (%rdi), %ymm0
 ; X64-AVX512DQVL-NEXT:    vpaddw {{.*}}(%rip), %ymm0, %ymm1
-; X64-AVX512DQVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
 ; X64-AVX512DQVL-NEXT:    vpaddw {{.*}}(%rip), %ymm0, %ymm0
 ; X64-AVX512DQVL-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; X64-AVX512DQVL-NEXT:    retq
@@ -84,9 +82,8 @@ define <32 x i16> @test_broadcast_16i16_32i16(<16 x i16> *%p) nounwind {
 define <64 x i8> @test_broadcast_32i8_64i8(<32 x i8> *%p) nounwind {
 ; X64-AVX512VL-LABEL: test_broadcast_32i8_64i8:
 ; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
+; X64-AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
 ; X64-AVX512VL-NEXT:    vpaddb {{.*}}(%rip), %ymm0, %ymm1
-; X64-AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
 ; X64-AVX512VL-NEXT:    vpaddb {{.*}}(%rip), %ymm0, %ymm0
 ; X64-AVX512VL-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; X64-AVX512VL-NEXT:    retq
@@ -99,9 +96,8 @@ define <64 x i8> @test_broadcast_32i8_64i8(<32 x i8> *%p) nounwind {
 ;
 ; X64-AVX512DQVL-LABEL: test_broadcast_32i8_64i8:
 ; X64-AVX512DQVL:       ## %bb.0:
-; X64-AVX512DQVL-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
+; X64-AVX512DQVL-NEXT:    vmovdqa (%rdi), %ymm0
 ; X64-AVX512DQVL-NEXT:    vpaddb {{.*}}(%rip), %ymm0, %ymm1
-; X64-AVX512DQVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
 ; X64-AVX512DQVL-NEXT:    vpaddb {{.*}}(%rip), %ymm0, %ymm0
 ; X64-AVX512DQVL-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; X64-AVX512DQVL-NEXT:    retq
diff --git a/test/CodeGen/X86/oddshuffles.ll b/test/CodeGen/X86/oddshuffles.ll
index 6bdc4e3d447..03f6b526651 100644
--- a/test/CodeGen/X86/oddshuffles.ll
+++ b/test/CodeGen/X86/oddshuffles.ll
@@ -1146,22 +1146,22 @@ define void @interleave_24i16_in(<24 x i16>* %p, <8 x i16>* %q1, <8 x i16>* %q2,
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vmovdqu (%rsi), %xmm0
 ; AVX2-NEXT:    vmovdqu (%rdx), %xmm1
-; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm2
-; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = mem[0,1,0,1]
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm4 = ymm2[0,1,2,3,6,7,2,3,8,9,8,9,4,5,6,7,16,17,18,19,22,23,18,19,24,25,24,25,20,21,22,23]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1]
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[u,u,0,1,u,u,u,u,2,3,u,u,u,u,4,5,u,u,22,23,u,u,u,u,24,25,u,u,u,u,26,27]
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2,3],ymm2[4],ymm4[5,6],ymm2[7],ymm4[8],ymm2[9],ymm4[10,11],ymm2[12],ymm4[13,14],ymm2[15]
+; AVX2-NEXT:    vmovdqu (%rcx), %xmm2
+; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm3
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm4 = ymm3[0,1,2,3,6,7,2,3,8,9,8,9,4,5,6,7,16,17,18,19,22,23,18,19,24,25,24,25,20,21,22,23]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm3 = ymm3[u,u,0,1,u,u,u,u,2,3,u,u,u,u,4,5,u,u,22,23,u,u,u,u,24,25,u,u,u,u,26,27]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7],ymm4[8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14],ymm3[15]
 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = <u,0,0,u,1,1,u,2>
-; AVX2-NEXT:    vpermd %ymm3, %ymm4, %ymm4
+; AVX2-NEXT:    vpermd %ymm2, %ymm4, %ymm4
 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255]
-; AVX2-NEXT:    vpblendvb %ymm5, %ymm2, %ymm4, %ymm2
+; AVX2-NEXT:    vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
 ; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
 ; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm3[2,2,3,3]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm2[2,2,3,3]
 ; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7]
 ; AVX2-NEXT:    vmovdqu %xmm0, 32(%rdi)
-; AVX2-NEXT:    vmovdqu %ymm2, (%rdi)
+; AVX2-NEXT:    vmovdqu %ymm3, (%rdi)
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
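Note (not part of the patch): the IR behind the updated test_broadcast_16i16_32i16 check lines has roughly the shape sketched below, reconstructed from the test name and the new codegen; the function name and the add constants here are illustrative rather than copied from the test file. A 256-bit load is broadcast into both halves of a 512-bit vector and each half feeds a separate 256-bit vpaddw, so with the new SUBV_BROADCAST fold each half's extract collapses back to the loaded value and the old vbroadcasti64x4/vextracti64x4 pair becomes a plain vmovdqa.

define <32 x i16> @broadcast_16i16_32i16_sketch(<16 x i16>* %p) nounwind {
  %v = load <16 x i16>, <16 x i16>* %p
  ; Duplicate the 256-bit value into both 256-bit halves of a 512-bit vector.
  %b = shufflevector <16 x i16> %v, <16 x i16> undef,
       <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
                   i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15,
                   i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
                   i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ; Distinct per-lane constants keep the two 256-bit halves from being merged,
  ; so the backend emits one vpaddw per half, as in the checks above.
  %r = add <32 x i16> %b, <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8,
                           i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16,
                           i16 17, i16 18, i16 19, i16 20, i16 21, i16 22, i16 23, i16 24,
                           i16 25, i16 26, i16 27, i16 28, i16 29, i16 30, i16 31, i16 32>
  ret <32 x i16> %r
}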