From 36a9b7e0b6274530212fa8859c5ba9551c182eac Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Sun, 11 Mar 2018 11:52:26 +0000
Subject: [PATCH] [X86][AVX] createVariablePermute - use 2xVPERMIL+PCMPGT+SELECT for v8i32/v8f32 and v4i64/v4f64 variable permutes

As VPERMILPS/VPERMILPD only select elements based on bits[1:0]/bit[1], we can permute both the (repeated) lo/hi 128-bit vectors in each case and then select between these results based on whether the index was for the lo/hi half.

For v4i64/v4f64 this avoids some rather nasty v4i64 multiplies in the AVX2 implementation, which seem to be worse than the extra port5 pressure from the additional shuffles/blends.

llvm-svn: 327239
---
 lib/Target/X86/X86ISelLowering.cpp  |  37 ++-
 test/CodeGen/X86/var-permute-256.ll | 372 ++++++++++------------------
 2 files changed, 158 insertions(+), 251 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 55c28c3b108..89ad2bbed3c 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -8035,15 +8035,24 @@ SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec,
   case MVT::v8i32:
     if (Subtarget.hasAVX2())
       Opcode = X86ISD::VPERMV;
-    else if (Subtarget.hasXOP()) {
+    else if (Subtarget.hasAVX()) {
       SrcVec = DAG.getBitcast(MVT::v8f32, SrcVec);
       SDValue LoLo = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
                                           {0, 1, 2, 3, 0, 1, 2, 3});
       SDValue HiHi = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
                                           {4, 5, 6, 7, 4, 5, 6, 7});
-      return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32,
-                                            LoLo, HiHi, IndicesVec,
-                                            DAG.getConstant(0, DL, MVT::i8)));
+      if (Subtarget.hasXOP())
+        return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32,
+                                              LoLo, HiHi, IndicesVec,
+                                              DAG.getConstant(0, DL, MVT::i8)));
+      // Permute Lo and Hi and then select based on index range.
+      // This works as VPERMILPS only uses index bits[0:1] to permute elements.
+      SDValue Res = DAG.getSelectCC(
+          DL, IndicesVec, DAG.getConstant(3, DL, MVT::v8i32),
+          DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, HiHi, IndicesVec),
+          DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, LoLo, IndicesVec),
+          ISD::CondCode::SETGT);
+      return DAG.getBitcast(VT, Res);
     }
     break;
   case MVT::v4i64:
@@ -8060,7 +8069,7 @@ SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec,
         return extract256BitVector(Res, 0, DAG, DL);
       }
       Opcode = X86ISD::VPERMV;
-    } else if (Subtarget.hasXOP()) {
+    } else if (Subtarget.hasAVX()) {
       SrcVec = DAG.getBitcast(MVT::v4f64, SrcVec);
       SDValue LoLo =
           DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {0, 1, 0, 1});
@@ -8068,12 +8077,18 @@ SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec,
       SDValue HiHi =
           DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {2, 3, 2, 3});
       // VPERMIL2PD selects with bit#1 of the index vector, so scale IndicesVec.
       IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
-      return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64,
-                                            LoLo, HiHi, IndicesVec,
-                                            DAG.getConstant(0, DL, MVT::i8)));
-    } else if (Subtarget.hasAVX2()) {
-      Opcode = X86ISD::VPERMV;
-      ShuffleVT = MVT::v8f32;
+      if (Subtarget.hasXOP())
+        return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64,
+                                              LoLo, HiHi, IndicesVec,
+                                              DAG.getConstant(0, DL, MVT::i8)));
+      // Permute Lo and Hi and then select based on index range.
+      // This works as VPERMILPD only uses index bit[1] to permute elements. 
+ SDValue Res = DAG.getSelectCC( + DL, IndicesVec, DAG.getConstant(2, DL, MVT::v4i64), + DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, HiHi, IndicesVec), + DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, LoLo, IndicesVec), + ISD::CondCode::SETGT); + return DAG.getBitcast(VT, Res); } break; case MVT::v64i8: diff --git a/test/CodeGen/X86/var-permute-256.ll b/test/CodeGen/X86/var-permute-256.ll index 87cd0918f75..ed6b48c68c5 100644 --- a/test/CodeGen/X86/var-permute-256.ll +++ b/test/CodeGen/X86/var-permute-256.ll @@ -23,45 +23,32 @@ define <4 x i64> @var_shuffle_v4i64(<4 x i64> %v, <4 x i64> %indices) nounwind { ; ; AVX1-LABEL: var_shuffle_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: pushq %rbp -; AVX1-NEXT: movq %rsp, %rbp -; AVX1-NEXT: andq $-32, %rsp -; AVX1-NEXT: subq $64, %rsp -; AVX1-NEXT: vmovq %xmm1, %rax -; AVX1-NEXT: andl $3, %eax -; AVX1-NEXT: vpextrq $1, %xmm1, %rcx -; AVX1-NEXT: andl $3, %ecx -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vmovq %xmm1, %rdx -; AVX1-NEXT: andl $3, %edx -; AVX1-NEXT: vpextrq $1, %xmm1, %rsi -; AVX1-NEXT: andl $3, %esi -; AVX1-NEXT: vmovaps %ymm0, (%rsp) -; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX1-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: movq %rbp, %rsp -; AVX1-NEXT: popq %rbp +; AVX1-NEXT: vmovdqa {{.*#+}} ymm2 = [2,2,2,2] +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vpaddq %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpaddq %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3] +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 +; AVX1-NEXT: vpermilpd %ymm1, %ymm3, %ymm3 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: vpermilpd %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vblendvpd %ymm2, %ymm3, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: var_shuffle_v4i64: ; AVX2: # %bb.0: +; AVX2-NEXT: vpaddq %ymm1, %ymm1, %ymm1 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2,2,2,2] -; AVX2-NEXT: vpmuludq %ymm2, %ymm1, %ymm2 -; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm3 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [8589934594,8589934594,8589934594,8589934594] -; AVX2-NEXT: vpmuludq %ymm4, %ymm3, %ymm3 -; AVX2-NEXT: vpaddq %ymm3, %ymm2, %ymm2 -; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2 -; AVX2-NEXT: vpmuludq %ymm4, %ymm1, %ymm1 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [4294967296,4294967296,4294967296,4294967296] -; AVX2-NEXT: vpaddq %ymm3, %ymm2, %ymm2 -; AVX2-NEXT: vpaddq %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm2 +; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[2,3,2,3] +; AVX2-NEXT: vpermilpd %ymm1, %ymm3, %ymm3 +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-NEXT: vpermilpd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vblendvpd %ymm2, %ymm3, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: var_shuffle_v4i64: @@ -114,39 +101,17 @@ define <8 x i32> @var_shuffle_v8i32(<8 x i32> %v, <8 x i32> %indices) nounwind { ; ; AVX1-LABEL: var_shuffle_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: pushq %rbp -; AVX1-NEXT: movq %rsp, %rbp -; AVX1-NEXT: andq $-32, %rsp -; AVX1-NEXT: subq $64, %rsp -; AVX1-NEXT: vmovd %xmm1, %r8d -; AVX1-NEXT: vpextrd $1, 
%xmm1, %r9d -; AVX1-NEXT: vpextrd $2, %xmm1, %r10d -; AVX1-NEXT: vpextrd $3, %xmm1, %esi -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vmovd %xmm1, %edi -; AVX1-NEXT: vpextrd $1, %xmm1, %eax -; AVX1-NEXT: vpextrd $2, %xmm1, %ecx -; AVX1-NEXT: vpextrd $3, %xmm1, %edx -; AVX1-NEXT: vmovaps %ymm0, (%rsp) -; AVX1-NEXT: andl $7, %r8d -; AVX1-NEXT: andl $7, %r9d -; AVX1-NEXT: andl $7, %r10d -; AVX1-NEXT: andl $7, %esi -; AVX1-NEXT: andl $7, %edi -; AVX1-NEXT: andl $7, %eax -; AVX1-NEXT: andl $7, %ecx -; AVX1-NEXT: andl $7, %edx -; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX1-NEXT: vpinsrd $1, (%rsp,%rax,4), %xmm0, %xmm0 -; AVX1-NEXT: vpinsrd $2, (%rsp,%rcx,4), %xmm0, %xmm0 -; AVX1-NEXT: vpinsrd $3, (%rsp,%rdx,4), %xmm0, %xmm0 -; AVX1-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX1-NEXT: vpinsrd $1, (%rsp,%r9,4), %xmm1, %xmm1 -; AVX1-NEXT: vpinsrd $2, (%rsp,%r10,4), %xmm1, %xmm1 -; AVX1-NEXT: vpinsrd $3, (%rsp,%rsi,4), %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: movq %rbp, %rsp -; AVX1-NEXT: popq %rbp +; AVX1-NEXT: vmovdqa {{.*#+}} ymm2 = [3,3,3,3,3,3,3,3] +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vpcmpgtd %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3] +; AVX1-NEXT: vpermilps %ymm1, %ymm3, %ymm3 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: vpermilps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vblendvps %ymm2, %ymm3, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; INT256-LABEL: var_shuffle_v8i32: @@ -1326,43 +1291,32 @@ define <4 x double> @var_shuffle_v4f64(<4 x double> %v, <4 x i64> %indices) noun ; ; AVX1-LABEL: var_shuffle_v4f64: ; AVX1: # %bb.0: -; AVX1-NEXT: pushq %rbp -; AVX1-NEXT: movq %rsp, %rbp -; AVX1-NEXT: andq $-32, %rsp -; AVX1-NEXT: subq $64, %rsp -; AVX1-NEXT: vmovq %xmm1, %rax -; AVX1-NEXT: andl $3, %eax -; AVX1-NEXT: vpextrq $1, %xmm1, %rcx -; AVX1-NEXT: andl $3, %ecx -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vmovq %xmm1, %rdx -; AVX1-NEXT: andl $3, %edx -; AVX1-NEXT: vpextrq $1, %xmm1, %rsi -; AVX1-NEXT: andl $3, %esi -; AVX1-NEXT: vmovaps %ymm0, (%rsp) -; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX1-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX1-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: movq %rbp, %rsp -; AVX1-NEXT: popq %rbp +; AVX1-NEXT: vmovdqa {{.*#+}} ymm2 = [2,2,2,2] +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vpaddq %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpaddq %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3] +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 +; AVX1-NEXT: vpermilpd %ymm1, %ymm3, %ymm3 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: vpermilpd %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vblendvpd %ymm2, %ymm3, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: var_shuffle_v4f64: ; AVX2: # %bb.0: +; AVX2-NEXT: vpaddq %ymm1, %ymm1, %ymm1 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2,2,2,2] -; AVX2-NEXT: vpmuludq %ymm2, %ymm1, %ymm2 -; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm3 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [8589934594,8589934594,8589934594,8589934594] 
-; AVX2-NEXT: vpmuludq %ymm4, %ymm3, %ymm3 -; AVX2-NEXT: vpaddq %ymm3, %ymm2, %ymm2 -; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2 -; AVX2-NEXT: vpmuludq %ymm4, %ymm1, %ymm1 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [4294967296,4294967296,4294967296,4294967296] -; AVX2-NEXT: vpaddq %ymm3, %ymm2, %ymm2 -; AVX2-NEXT: vpaddq %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm2 +; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[2,3,2,3] +; AVX2-NEXT: vpermilpd %ymm1, %ymm3, %ymm3 +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-NEXT: vpermilpd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vblendvpd %ymm2, %ymm3, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: var_shuffle_v4f64: @@ -1415,39 +1369,17 @@ define <8 x float> @var_shuffle_v8f32(<8 x float> %v, <8 x i32> %indices) nounwi ; ; AVX1-LABEL: var_shuffle_v8f32: ; AVX1: # %bb.0: -; AVX1-NEXT: pushq %rbp -; AVX1-NEXT: movq %rsp, %rbp -; AVX1-NEXT: andq $-32, %rsp -; AVX1-NEXT: subq $64, %rsp -; AVX1-NEXT: vmovd %xmm1, %esi -; AVX1-NEXT: vpextrd $1, %xmm1, %r8d -; AVX1-NEXT: vpextrd $2, %xmm1, %r9d -; AVX1-NEXT: vpextrd $3, %xmm1, %r10d -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vmovd %xmm1, %edx -; AVX1-NEXT: vpextrd $1, %xmm1, %edi -; AVX1-NEXT: vpextrd $2, %xmm1, %eax -; AVX1-NEXT: vpextrd $3, %xmm1, %ecx -; AVX1-NEXT: vmovaps %ymm0, (%rsp) -; AVX1-NEXT: andl $7, %esi -; AVX1-NEXT: andl $7, %r8d -; AVX1-NEXT: andl $7, %r9d -; AVX1-NEXT: andl $7, %r10d -; AVX1-NEXT: andl $7, %edx -; AVX1-NEXT: andl $7, %edi -; AVX1-NEXT: andl $7, %eax -; AVX1-NEXT: andl $7, %ecx -; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] -; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] -; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] -; AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3] -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3] -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: movq %rbp, %rsp -; AVX1-NEXT: popq %rbp +; AVX1-NEXT: vmovdqa {{.*#+}} ymm2 = [3,3,3,3,3,3,3,3] +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vpcmpgtd %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3] +; AVX1-NEXT: vpermilps %ymm1, %ymm3, %ymm3 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: vpermilps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vblendvps %ymm2, %ymm3, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; INT256-LABEL: var_shuffle_v8f32: @@ -1500,40 +1432,34 @@ define <4 x i64> @var_shuffle_v4i64_from_v2i64(<2 x i64> %v, <4 x i64> %indices) ; ; AVX1-LABEL: var_shuffle_v4i64_from_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovq %xmm1, %rax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: vpextrq $1, %xmm1, %rcx -; AVX1-NEXT: andl $1, %ecx -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vmovq %xmm1, %rdx -; AVX1-NEXT: andl $1, %edx -; AVX1-NEXT: vpextrq $1, %xmm1, %rsi -; AVX1-NEXT: andl $1, %esi -; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX1-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; AVX1-NEXT: vmovlhps 
{{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX1-NEXT: vmovdqa {{.*#+}} ymm2 = [2,2,2,2] +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vpaddq %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpaddq %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3] +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 +; AVX1-NEXT: vpermilpd %ymm1, %ymm3, %ymm3 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: vpermilpd %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vblendvpd %ymm2, %ymm3, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: var_shuffle_v4i64_from_v2i64: ; AVX2: # %bb.0: ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX2-NEXT: vpaddq %ymm1, %ymm1, %ymm1 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2,2,2,2] -; AVX2-NEXT: vpmuludq %ymm2, %ymm1, %ymm2 -; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm3 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [8589934594,8589934594,8589934594,8589934594] -; AVX2-NEXT: vpmuludq %ymm4, %ymm3, %ymm3 -; AVX2-NEXT: vpaddq %ymm3, %ymm2, %ymm2 -; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2 -; AVX2-NEXT: vpmuludq %ymm4, %ymm1, %ymm1 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [4294967296,4294967296,4294967296,4294967296] -; AVX2-NEXT: vpaddq %ymm3, %ymm2, %ymm2 -; AVX2-NEXT: vpaddq %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm2 +; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[2,3,2,3] +; AVX2-NEXT: vpermilpd %ymm1, %ymm3, %ymm3 +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-NEXT: vpermilpd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vblendvpd %ymm2, %ymm3, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: var_shuffle_v4i64_from_v2i64: @@ -1589,33 +1515,18 @@ define <8 x i32> @var_shuffle_v8i32_from_v4i32(<4 x i32> %v, <8 x i32> %indices) ; ; AVX1-LABEL: var_shuffle_v8i32_from_v4i32: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vmovd %xmm1, %r8d -; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: andl $3, %r8d -; AVX1-NEXT: vpextrd $1, %xmm1, %r9d -; AVX1-NEXT: andl $3, %r9d -; AVX1-NEXT: vpextrd $2, %xmm1, %r10d -; AVX1-NEXT: andl $3, %r10d -; AVX1-NEXT: vpextrd $3, %xmm1, %esi -; AVX1-NEXT: andl $3, %esi -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 -; AVX1-NEXT: vmovd %xmm0, %edi -; AVX1-NEXT: andl $3, %edi -; AVX1-NEXT: vpextrd $1, %xmm0, %eax -; AVX1-NEXT: andl $3, %eax -; AVX1-NEXT: vpextrd $2, %xmm0, %ecx -; AVX1-NEXT: andl $3, %ecx -; AVX1-NEXT: vpextrd $3, %xmm0, %edx -; AVX1-NEXT: andl $3, %edx -; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX1-NEXT: vpinsrd $1, -24(%rsp,%rax,4), %xmm0, %xmm0 -; AVX1-NEXT: vpinsrd $2, -24(%rsp,%rcx,4), %xmm0, %xmm0 -; AVX1-NEXT: vpinsrd $3, -24(%rsp,%rdx,4), %xmm0, %xmm0 -; AVX1-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX1-NEXT: vpinsrd $1, -24(%rsp,%r9,4), %xmm1, %xmm1 -; AVX1-NEXT: vpinsrd $2, -24(%rsp,%r10,4), %xmm1, %xmm1 -; AVX1-NEXT: vpinsrd $3, -24(%rsp,%rsi,4), %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX1-NEXT: vmovdqa {{.*#+}} ymm2 = [3,3,3,3,3,3,3,3] +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vpcmpgtd %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; 
AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3] +; AVX1-NEXT: vpermilps %ymm1, %ymm3, %ymm3 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: vpermilps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vblendvps %ymm2, %ymm3, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; INT256-LABEL: var_shuffle_v8i32_from_v4i32: @@ -2738,38 +2649,34 @@ define <4 x double> @var_shuffle_v4f64_from_v2f64(<2 x double> %v, <4 x i64> %in ; ; AVX1-LABEL: var_shuffle_v4f64_from_v2f64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovq %xmm1, %rax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: vpextrq $1, %xmm1, %rcx -; AVX1-NEXT: andl $1, %ecx -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vmovq %xmm1, %rdx -; AVX1-NEXT: andl $1, %edx -; AVX1-NEXT: vpextrq $1, %xmm1, %rsi -; AVX1-NEXT: andl $1, %esi -; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX1-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX1-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX1-NEXT: vmovdqa {{.*#+}} ymm2 = [2,2,2,2] +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vpaddq %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpaddq %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3] +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 +; AVX1-NEXT: vpermilpd %ymm1, %ymm3, %ymm3 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: vpermilpd %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vblendvpd %ymm2, %ymm3, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: var_shuffle_v4f64_from_v2f64: ; AVX2: # %bb.0: ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX2-NEXT: vpaddq %ymm1, %ymm1, %ymm1 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2,2,2,2] -; AVX2-NEXT: vpmuludq %ymm2, %ymm1, %ymm2 -; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm3 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [8589934594,8589934594,8589934594,8589934594] -; AVX2-NEXT: vpmuludq %ymm4, %ymm3, %ymm3 -; AVX2-NEXT: vpaddq %ymm3, %ymm2, %ymm2 -; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2 -; AVX2-NEXT: vpmuludq %ymm4, %ymm1, %ymm1 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [4294967296,4294967296,4294967296,4294967296] -; AVX2-NEXT: vpaddq %ymm3, %ymm2, %ymm2 -; AVX2-NEXT: vpaddq %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm2 +; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[2,3,2,3] +; AVX2-NEXT: vpermilpd %ymm1, %ymm3, %ymm3 +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-NEXT: vpermilpd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vblendvpd %ymm2, %ymm3, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: var_shuffle_v4f64_from_v2f64: @@ -2825,33 +2732,18 @@ define <8 x float> @var_shuffle_v8f32_from_v4f32(<4 x float> %v, <8 x i32> %indi ; ; AVX1-LABEL: var_shuffle_v8f32_from_v4f32: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vmovd %xmm1, %r8d -; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: andl $3, %r8d -; AVX1-NEXT: vpextrd $1, %xmm1, %r9d -; AVX1-NEXT: andl $3, %r9d -; AVX1-NEXT: vpextrd $2, %xmm1, %r10d -; AVX1-NEXT: andl $3, %r10d -; AVX1-NEXT: vpextrd $3, %xmm1, %esi -; AVX1-NEXT: andl $3, %esi -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 -; AVX1-NEXT: vmovd %xmm0, %edi -; AVX1-NEXT: andl $3, %edi -; AVX1-NEXT: vpextrd $1, %xmm0, %eax -; AVX1-NEXT: andl $3, 
%eax -; AVX1-NEXT: vpextrd $2, %xmm0, %ecx -; AVX1-NEXT: andl $3, %ecx -; AVX1-NEXT: vpextrd $3, %xmm0, %edx -; AVX1-NEXT: andl $3, %edx -; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] -; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] -; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] -; AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3] -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3] -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX1-NEXT: vmovdqa {{.*#+}} ymm2 = [3,3,3,3,3,3,3,3] +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vpcmpgtd %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3] +; AVX1-NEXT: vpermilps %ymm1, %ymm3, %ymm3 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: vpermilps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vblendvps %ymm2, %ymm3, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; INT256-LABEL: var_shuffle_v8f32_from_v4f32: