From a9ea2c1dd301258946b60c34547c7af22bb5250f Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Wed, 5 Jun 2019 12:56:53 +0000
Subject: [PATCH] [X86][AVX] combineX86ShuffleChain - combine
 shuffle(extractsubvector(x),extractsubvector(y))

We already handle the case where we combine
shuffle(extractsubvector(x),extractsubvector(x)); this relaxes the
requirement to permit different sources as long as they have the same
value type.

This causes a couple of cases where the VPERMV3 binary shuffles occur
at a wider width than before, which I intend to improve in future
commits - but as only the subvector's mask indices are defined, these
will broadcast, so we don't see any increase in constant size.

llvm-svn: 362599
---
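Not part of the commit - a minimal, hypothetical IR sketch of the pattern this
change targets: a shuffle whose two operands are subvector extractions from two
*different* sources of the same value type. The function name and mask values
below are invented for illustration; with AVX512F the two half-width extracts
typically lower to EXTRACT_SUBVECTOR nodes, so the relaxed Src1/Src2 check lets
combineX86ShuffleChain widen the shuffle to the full source width instead of
bailing out.

define <8 x i32> @two_source_subvector_shuffle(<16 x i32> %x, <16 x i32> %y) {
  ; Upper 256-bit halves of two different 512-bit sources.
  %xhi = shufflevector <16 x i32> %x, <16 x i32> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %yhi = shufflevector <16 x i32> %y, <16 x i32> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ; Interleave elements from both halves with a single shuffle. Before this
  ; patch the combine required both extracts to come from the same source.
  %r = shufflevector <8 x i32> %xhi, <8 x i32> %yhi, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
  ret <8 x i32> %r
}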
 lib/Target/X86/X86ISelLowering.cpp             | 13 +++--
 .../X86/avx512-shuffles/partial_permute.ll     | 25 +++++----
 test/CodeGen/X86/pr29112.ll                    | 54 +++++++++----------
 test/CodeGen/X86/vector-shuffle-256-v32.ll     |  5 +-
 4 files changed, 52 insertions(+), 45 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 64585c8de0a..a6aa2b77990 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -32091,19 +32091,28 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
         isa<ConstantSDNode>(V2.getOperand(1))) {
       SDValue Src1 = V1.getOperand(0);
       SDValue Src2 = V2.getOperand(0);
-      if (Src1 == Src2) {
+      if (Src1.getValueType() == Src2.getValueType()) {
         unsigned Offset1 = V1.getConstantOperandVal(1);
         unsigned Offset2 = V2.getConstantOperandVal(1);
         assert(((Offset1 % VT1.getVectorNumElements()) == 0 ||
                 (Offset2 % VT2.getVectorNumElements()) == 0 ||
                 (Src1.getValueSizeInBits() % RootSizeInBits) == 0) &&
                "Unexpected subvector extraction");
+        unsigned Scale = Src1.getValueSizeInBits() / RootSizeInBits;
+
         // Convert extraction indices to mask size.
         Offset1 /= VT1.getVectorNumElements();
         Offset2 /= VT2.getVectorNumElements();
         Offset1 *= NumMaskElts;
         Offset2 *= NumMaskElts;
 
+        SmallVector<SDValue, 2> NewInputs;
+        NewInputs.push_back(Src1);
+        if (Src1 != Src2) {
+          NewInputs.push_back(Src2);
+          Offset2 += Scale * NumMaskElts;
+        }
+
         // Create new mask for larger type.
         SmallVector<int, 64> NewMask(Mask);
         for (int &M : NewMask) {
@@ -32114,10 +32123,8 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
           else
             M = (M - NumMaskElts) + Offset2;
         }
-        unsigned Scale = Src1.getValueSizeInBits() / RootSizeInBits;
         NewMask.append((Scale - 1) * NumMaskElts, SM_SentinelUndef);
 
-        SDValue NewInputs[] = {Src1};
         if (SDValue Res = combineX86ShuffleChain(
                 NewInputs, Src1, NewMask, Depth, HasVariableMask,
                 AllowVariableMask, DAG, Subtarget)) {
diff --git a/test/CodeGen/X86/avx512-shuffles/partial_permute.ll b/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
index c1d37a77ad0..57e333d78bd 100644
--- a/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
+++ b/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
@@ -1805,12 +1805,12 @@ define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask3(<16 x i32>* %vp,
 define <4 x i32> @test_16xi32_to_4xi32_perm_mask9(<16 x i32> %vec) {
 ; CHECK-LABEL: test_16xi32_to_4xi32_perm_mask9:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,12,3]
 ; CHECK-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,1,0,2]
-; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT:    vpermd %ymm2, %ymm1, %ymm1
-; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; CHECK-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,1,4,3]
-; CHECK-NEXT:    vpermi2d %xmm2, %xmm1, %xmm0
+; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT:    vpermd %ymm3, %ymm1, %ymm1
+; CHECK-NEXT:    vpermt2d %ymm0, %ymm2, %ymm1
+; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32>
@@ -3128,10 +3128,9 @@ define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask0(<16 x float> %v
 define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask1(<16 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vextractf32x4 $2, %zmm0, %xmm3
-; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; CHECK-NEXT:    vmovaps {{.*#+}} xmm4 = [0,6,2,6]
-; CHECK-NEXT:    vpermi2ps %xmm0, %xmm3, %xmm4
+; CHECK-NEXT:    vmovaps {{.*#+}} xmm3 = [0,14,2,14]
+; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm4
+; CHECK-NEXT:    vpermt2ps %ymm0, %ymm3, %ymm4
 ; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    vcmpeqps %xmm0, %xmm2, %k1
 ; CHECK-NEXT:    vblendmps %xmm4, %xmm1, %xmm0 {%k1}
@@ -3146,12 +3145,12 @@ define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask1(<16 x float> %vec
 define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask1(<16 x float> %vec, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vextractf32x4 $2, %zmm0, %xmm2
-; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; CHECK-NEXT:    vmovaps {{.*#+}} xmm0 = [0,6,2,6]
+; CHECK-NEXT:    vmovaps {{.*#+}} xmm3 = [0,14,2,14]
+; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm2
 ; CHECK-NEXT:    vxorps %xmm4, %xmm4, %xmm4
 ; CHECK-NEXT:    vcmpeqps %xmm4, %xmm1, %k1
-; CHECK-NEXT:    vpermi2ps %xmm3, %xmm2, %xmm0 {%k1} {z}
+; CHECK-NEXT:    vpermt2ps %ymm0, %ymm3, %ymm2 {%k1} {z}
+; CHECK-NEXT:    vmovaps %xmm2, %xmm0
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32>
diff --git a/test/CodeGen/X86/pr29112.ll b/test/CodeGen/X86/pr29112.ll
index b78a5ce7c5d..2dce179f367 100644
--- a/test/CodeGen/X86/pr29112.ll
+++ b/test/CodeGen/X86/pr29112.ll
@@ -11,45 +11,45 @@ define <4 x float> @bar(<4 x float>* %a1p, <4 x float>* %a2p, <4 x float> %a3, <
 ; CHECK-NEXT:    subq $72, %rsp
 ; CHECK-NEXT:    .cfi_def_cfa_offset 80
 ; CHECK-NEXT:    vmovaps %xmm1, %xmm8
+; CHECK-NEXT:    vbroadcastsd {{.*#+}} zmm0 = [85899345925,85899345925,85899345925,85899345925,85899345925,85899345925,85899345925,85899345925]
+; CHECK-NEXT:    vpermi2ps %zmm3, %zmm2, %zmm0
 ; CHECK-NEXT:    vextractf128 $1, %ymm3, %xmm1
-; CHECK-NEXT:    vextractf128 $1, %ymm2, %xmm5
-; CHECK-NEXT:    vunpcklps {{.*#+}} xmm10 = xmm5[0],xmm1[0],xmm5[1],xmm1[1]
-; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm10[0,1],xmm2[1],xmm10[3]
-; CHECK-NEXT:    vblendps {{.*#+}} xmm9 = xmm5[0],xmm1[1],xmm5[2,3]
-; CHECK-NEXT:    vmovshdup {{.*#+}} xmm6 = xmm5[1,1,3,3]
-; CHECK-NEXT:    vunpcklps {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1]
-; CHECK-NEXT:    vinsertps {{.*#+}} xmm11 = xmm6[0,1],xmm2[1],xmm6[3]
+; CHECK-NEXT:    vextractf128 $1, %ymm2, %xmm4
+; CHECK-NEXT:    vunpcklps {{.*#+}} xmm10 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm14 = xmm10[0,1],xmm2[1],xmm10[3]
+; CHECK-NEXT:    vblendps {{.*#+}} xmm9 = xmm4[0],xmm1[1],xmm4[2,3]
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm11 = xmm0[0,1],xmm2[1],xmm0[3]
 ; CHECK-NEXT:    vextractf32x4 $3, %zmm3, %xmm7
-; CHECK-NEXT:    vunpcklps {{.*#+}} xmm4 = xmm5[0],xmm7[0],xmm5[1],xmm7[1]
-; CHECK-NEXT:    vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm2[1],xmm4[3]
-; CHECK-NEXT:    vinsertps {{.*#+}} xmm4 = xmm4[0,1,2],xmm3[1]
-; CHECK-NEXT:    vinsertps {{.*#+}} xmm6 = xmm11[0,1,2],xmm3[1]
-; CHECK-NEXT:    vaddps %xmm4, %xmm6, %xmm12
-; CHECK-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm1[1,0]
-; CHECK-NEXT:    vinsertps {{.*#+}} xmm7 = xmm5[0],xmm7[2],zero,zero
+; CHECK-NEXT:    vunpcklps {{.*#+}} xmm5 = xmm4[0],xmm7[0],xmm4[1],xmm7[1]
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm5 = xmm5[0,1],xmm2[1],xmm5[3]
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm5 = xmm5[0,1,2],xmm3[1]
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm11[0,1,2],xmm3[1]
+; CHECK-NEXT:    vaddps %xmm5, %xmm0, %xmm12
+; CHECK-NEXT:    vpermilpd {{.*#+}} xmm5 = xmm1[1,0]
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm7 = xmm4[0],xmm7[2],zero,zero
 ; CHECK-NEXT:    vinsertps {{.*#+}} xmm7 = xmm7[0,1],xmm2[1],xmm7[3]
-; CHECK-NEXT:    vinsertps {{.*#+}} xmm13 = xmm7[0,1,2],xmm4[0]
-; CHECK-NEXT:    vpermilps {{.*#+}} xmm4 = xmm2[3,1,2,3]
-; CHECK-NEXT:    vunpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
-; CHECK-NEXT:    vinsertps {{.*#+}} xmm7 = xmm5[0],xmm1[2],zero,zero
-; CHECK-NEXT:    vinsertps {{.*#+}} xmm1 = xmm0[0,1,2],xmm3[1]
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm13 = xmm7[0,1,2],xmm5[0]
+; CHECK-NEXT:    vpermilps {{.*#+}} xmm5 = xmm2[3,1,2,3]
+; CHECK-NEXT:    vunpcklps {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1]
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm7 = xmm4[0],xmm1[2],zero,zero
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm1 = xmm14[0,1,2],xmm3[1]
 ; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    vinsertps {{.*#+}} xmm5 = xmm9[0,1],xmm2[1],xmm9[3]
-; CHECK-NEXT:    vinsertps {{.*#+}} xmm6 = xmm5[0,1,2],xmm3[1]
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm4 = xmm9[0,1],xmm2[1],xmm9[3]
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm6 = xmm4[0,1,2],xmm3[1]
 ; CHECK-NEXT:    vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    vinsertps {{.*#+}} xmm5 = xmm4[0,1],xmm2[1],xmm4[3]
-; CHECK-NEXT:    vinsertps {{.*#+}} xmm9 = xmm7[0,1],xmm2[1],xmm7[3]
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm5 = xmm5[0,1],xmm2[1],xmm5[3]
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm7[0,1],xmm2[1],xmm7[3]
 ; CHECK-NEXT:    vinsertps {{.*#+}} xmm2 = xmm10[0,1],xmm2[3],xmm10[3]
 ; CHECK-NEXT:    vextractf32x4 $2, %zmm3, %xmm4
-; CHECK-NEXT:    vblendps {{.*#+}} xmm4 = xmm0[0,1,2],xmm4[3]
+; CHECK-NEXT:    vblendps {{.*#+}} xmm4 = xmm14[0,1,2],xmm4[3]
 ; CHECK-NEXT:    vinsertps {{.*#+}} xmm5 = xmm5[0,1,2],xmm3[1]
 ; CHECK-NEXT:    vblendps {{.*#+}} xmm7 = xmm11[0,1,2],xmm3[3]
-; CHECK-NEXT:    vblendps {{.*#+}} xmm10 = xmm0[0,1,2],xmm3[3]
-; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm9[0,1,2],xmm3[1]
+; CHECK-NEXT:    vblendps {{.*#+}} xmm9 = xmm14[0,1,2],xmm3[3]
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[1]
 ; CHECK-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm3[1,0]
 ; CHECK-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0]
 ; CHECK-NEXT:    vaddps %xmm0, %xmm2, %xmm2
-; CHECK-NEXT:    vaddps %xmm10, %xmm0, %xmm0
+; CHECK-NEXT:    vaddps %xmm9, %xmm0, %xmm0
 ; CHECK-NEXT:    vaddps %xmm13, %xmm1, %xmm9
 ; CHECK-NEXT:    vaddps %xmm12, %xmm0, %xmm0
 ; CHECK-NEXT:    vaddps %xmm1, %xmm1, %xmm3
diff --git a/test/CodeGen/X86/vector-shuffle-256-v32.ll b/test/CodeGen/X86/vector-shuffle-256-v32.ll
index 8dfcffbe0a4..a58b6bd5471 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v32.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v32.ll
@@ -2994,8 +2994,9 @@ define <32 x i8> @shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_
 ;
 ; AVX512VLVBMI-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
 ; AVX512VLVBMI:       # %bb.0:
-; AVX512VLVBMI-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,16,16,16,16,16,16,16,16]
-; AVX512VLVBMI-NEXT:    vpermt2b %xmm1, %xmm2, %xmm0
+; AVX512VLVBMI-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,32,32,32,32,32,32,32,32,15,15,15,15,15,15,15,15,32,32,32,32,32,32,32,32]
+; AVX512VLVBMI-NEXT:    # ymm2 = mem[0,1,0,1]
+; AVX512VLVBMI-NEXT:    vpermt2b %ymm1, %ymm2, %ymm0
 ; AVX512VLVBMI-NEXT:    retq
   %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32>
   ret <32 x i8> %shuffle