From d3230253b10efb8246d4b380c72597edc89ef1a6 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Sun, 16 Feb 2020 13:09:16 +0000
Subject: [PATCH] [X86] combineX86ShuffleChain - add support for combining 512-bit shuffles to bit shifts

---
 lib/Target/X86/X86ISelLowering.cpp           | 10 ++++++----
 .../X86/vector-shuffle-combining-avx512bw.ll | 16 ++++++++++------
 2 files changed, 16 insertions(+), 10 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index fdd4901d443..729ab7af4af 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -33538,12 +33538,14 @@ static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef<int> Mask,
   }
 
   // Attempt to match against byte/bit shifts.
-  // FIXME: Add 512-bit support.
-  if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
-                         (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
+  if (AllowIntDomain &&
+      ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
+       (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
+       (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
     int ShiftAmt = matchShuffleAsShift(ShuffleVT, Shuffle, MaskScalarSizeInBits,
                                        Mask, 0, Zeroable, Subtarget);
-    if (0 < ShiftAmt) {
+    if (0 < ShiftAmt && (!ShuffleVT.is512BitVector() || Subtarget.hasBWI() ||
+                         32 <= ShuffleVT.getScalarSizeInBits())) {
       PermuteImm = (unsigned)ShiftAmt;
       return true;
     }
diff --git a/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll b/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll
index cde9de95f47..08923cab6eb 100644
--- a/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll
@@ -92,7 +92,7 @@ define <32 x i16> @combine_permvar_as_vpbroadcastw512(<32 x i16> %x0) {
 define <64 x i8> @combine_pshufb_as_pslldq(<64 x i8> %a0) {
 ; CHECK-LABEL: combine_pshufb_as_pslldq:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[0,1,2,3,4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[16,17,18,19,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[32,33,34,35,36,37],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[48,49,50,51,52,53]
+; CHECK-NEXT:    vpslldq {{.*#+}} zmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[0,1,2,3,4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[16,17,18,19,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[32,33,34,35,36,37],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[48,49,50,51,52,53]
 ; CHECK-NEXT:    ret{{[l|q]}}
   %res0 = call <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8> %a0, <64 x i8> , <64 x i8> undef, i64 -1)
   ret <64 x i8> %res0
@@ -100,14 +100,16 @@ define <64 x i8> @combine_pshufb_as_pslldq(<64 x i8> %a0) {
 define <64 x i8> @combine_pshufb_as_pslldq_mask(<64 x i8> %a0, i64 %m) {
 ; X86-LABEL: combine_pshufb_as_pslldq_mask:
 ; X86:       # %bb.0:
+; X86-NEXT:    vpslldq {{.*#+}} zmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[0,1,2,3,4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[16,17,18,19,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[32,33,34,35,36,37],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[48,49,50,51,52,53]
 ; X86-NEXT:    kmovq {{[0-9]+}}(%esp), %k1
-; X86-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[0,1,2,3,4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[16,17,18,19,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[32,33,34,35,36,37],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[48,49,50,51,52,53]
+; X86-NEXT:    vmovdqu8 %zmm0, %zmm0 {%k1} {z}
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: combine_pshufb_as_pslldq_mask:
 ; X64:       # %bb.0:
+; X64-NEXT:    vpslldq {{.*#+}} zmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[0,1,2,3,4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[16,17,18,19,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[32,33,34,35,36,37],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[48,49,50,51,52,53]
 ; X64-NEXT:    kmovq %rdi, %k1
-; X64-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[0,1,2,3,4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[16,17,18,19,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[32,33,34,35,36,37],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[48,49,50,51,52,53]
+; X64-NEXT:    vmovdqu8 %zmm0, %zmm0 {%k1} {z}
 ; X64-NEXT:    retq
   %res0 = call <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8> %a0, <64 x i8> , <64 x i8> zeroinitializer, i64 %m)
   ret <64 x i8> %res0
@@ -116,7 +118,7 @@ define <64 x i8> @combine_pshufb_as_pslldq_mask(<64 x i8> %a0, i64 %m) {
 define <64 x i8> @combine_pshufb_as_psrldq(<64 x i8> %a0) {
 ; CHECK-LABEL: combine_pshufb_as_psrldq:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[47],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT:    vpsrldq {{.*#+}} zmm0 = zmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[47],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; CHECK-NEXT:    ret{{[l|q]}}
   %res0 = call <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8> %a0, <64 x i8> , <64 x i8> undef, i64 -1)
   ret <64 x i8> %res0
@@ -124,14 +126,16 @@ define <64 x i8> @combine_pshufb_as_psrldq(<64 x i8> %a0) {
 define <64 x i8> @combine_pshufb_as_psrldq_mask(<64 x i8> %a0, i64 %m) {
 ; X86-LABEL: combine_pshufb_as_psrldq_mask:
 ; X86:       # %bb.0:
+; X86-NEXT:    vpsrldq {{.*#+}} zmm0 = zmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[47],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; X86-NEXT:    kmovq {{[0-9]+}}(%esp), %k1
-; X86-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[47],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X86-NEXT:    vmovdqu8 %zmm0, %zmm0 {%k1} {z}
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: combine_pshufb_as_psrldq_mask:
 ; X64:       # %bb.0:
+; X64-NEXT:    vpsrldq {{.*#+}} zmm0 = zmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[47],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; X64-NEXT:    kmovq %rdi, %k1
-; X64-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[47],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X64-NEXT:    vmovdqu8 %zmm0, %zmm0 {%k1} {z}
 ; X64-NEXT:    retq
   %res0 = call <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8> %a0, <64 x i8> , <64 x i8> zeroinitializer, i64 %m)
   ret <64 x i8> %res0
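
For illustration only (not part of the patch): the shuffle shape matched here is, within every 16-byte lane, "low N result bytes are zero, remaining bytes take the lane's low source bytes", which is exactly a per-lane left byte shift (VPSLLDQ); the mirrored pattern is VPSRLDQ. Below is a minimal standalone sketch in LLVM IR, assuming an AVX512BW target; the function name and mask are made up for illustration and are not taken from the test file.

; Sketch: a per-128-bit-lane left shift by 10 bytes of a <64 x i8> vector,
; written as a plain shufflevector against zeroinitializer. Indices 0-63
; select from %a0, indices 64-127 select zero bytes.
define <64 x i8> @sketch_shuffle_as_pslldq_by_10(<64 x i8> %a0) {
  %r = shufflevector <64 x i8> %a0, <64 x i8> zeroinitializer, <64 x i32> <i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53>
  ret <64 x i8> %r
}

Compiled with something like llc -mtriple=x86_64-- -mattr=+avx512bw, this is expected to come out as a single "vpslldq $10, %zmm0, %zmm0" rather than a vpshufb with a constant-pool mask. The tests above exercise the same end result through the @llvm.x86.avx512.mask.pshuf.b.512 intrinsic, whose constant mask is folded by combineX86ShuffleChain.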