From 411f75a025b05c921e9b6fb2d51134aa446b48c8 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Wed, 21 Apr 2021 17:43:23 +0100
Subject: [PATCH] [X86][SSE] getFauxShuffleMask - don't decode
 OR(SHUFFLE,SHUFFLE) containing UNDEFs. (PR50049)

PR50049 demonstrated an infinite loop between OR(SHUFFLE,SHUFFLE) <->
BLEND(SHUFFLE,SHUFFLE) patterns.

The UNDEF elements were allowing a combined shuffle mask to be widened,
which lost the undef element, resulting in us needing to use the BLEND
pattern (as the undef element would need to be zero for the OR pattern).
But then bitcast folds would re-expose the undef element, allowing us to
use OR again...
---
 lib/Target/X86/X86ISelLowering.cpp                |  8 ++--
 test/CodeGen/X86/shuffle-vs-trunc-256.ll          |  8 ++--
 test/CodeGen/X86/vector-shuffle-128-v8.ll         |  5 ++-
 test/CodeGen/X86/vector-shuffle-256-v32.ll        |  6 +--
 .../X86/vector-shuffle-combining-sse41.ll         | 42 +++++++++++++++++++
 5 files changed, 57 insertions(+), 12 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 8ae0df6313a..b5a4159a48b 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -7542,9 +7542,11 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
     narrowShuffleMaskElts(MaskSize / SrcMask0.size(), SrcMask0, Mask0);
     narrowShuffleMaskElts(MaskSize / SrcMask1.size(), SrcMask1, Mask1);
     for (int i = 0; i != (int)MaskSize; ++i) {
-      if (Mask0[i] == SM_SentinelUndef && Mask1[i] == SM_SentinelUndef)
-        Mask.push_back(SM_SentinelUndef);
-      else if (Mask0[i] == SM_SentinelZero && Mask1[i] == SM_SentinelZero)
+      // NOTE: Don't handle SM_SentinelUndef, as we can end up in infinite
+      // loops converting between OR and BLEND shuffles due to
+      // canWidenShuffleElements merging away undef elements, meaning we
+      // fail to recognise the OR as the undef element isn't known zero.
+      if (Mask0[i] == SM_SentinelZero && Mask1[i] == SM_SentinelZero)
         Mask.push_back(SM_SentinelZero);
       else if (Mask1[i] == SM_SentinelZero)
         Mask.push_back(i);
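For reference, the per-lane rule the hunk above leaves in place for
OR(SHUFFLE0,SHUFFLE1) can be written as a minimal standalone sketch.
The function name combineOrShuffleMasks, the std::optional/std::vector
signature, and the concrete sentinel values are this sketch's own, not
LLVM's API (LLVM uses SmallVector and an in-out Mask parameter); it only
illustrates the logic:

    #include <optional>
    #include <vector>

    // Assumed sentinel values, mirroring LLVM's shuffle-mask conventions.
    constexpr int SM_SentinelUndef = -1;
    constexpr int SM_SentinelZero = -2;

    // A lane of the OR can be taken from one operand only if the other
    // operand's lane is known zero. Post-patch, undef lanes are *not*
    // folded; returning nullopt makes the caller keep the two shuffles.
    static std::optional<std::vector<int>>
    combineOrShuffleMasks(const std::vector<int> &Mask0,
                          const std::vector<int> &Mask1) {
      std::vector<int> Mask;
      for (int i = 0, e = (int)Mask0.size(); i != e; ++i) {
        if (Mask0[i] == SM_SentinelZero && Mask1[i] == SM_SentinelZero)
          Mask.push_back(SM_SentinelZero); // 0 | 0 == 0
        else if (Mask1[i] == SM_SentinelZero)
          Mask.push_back(i);               // x | 0 == x: lane i of op0
        else if (Mask0[i] == SM_SentinelZero)
          Mask.push_back(i + e);           // 0 | x == x: lane i of op1
        else
          return std::nullopt;             // lane not provably zero on
      }                                    // either side (incl. undef)
      return Mask;
    }

Before the patch, an undef lane on both sides produced an undef output
lane; removing that case is what breaks the OR <-> BLEND flip-flop.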
diff --git a/test/CodeGen/X86/shuffle-vs-trunc-256.ll b/test/CodeGen/X86/shuffle-vs-trunc-256.ll
index 123fba43714..9c32d75488d 100644
--- a/test/CodeGen/X86/shuffle-vs-trunc-256.ll
+++ b/test/CodeGen/X86/shuffle-vs-trunc-256.ll
@@ -1245,10 +1245,10 @@ define void @trunc_v4i64_to_v4i8(<32 x i8>* %L, <4 x i8>* %S) nounwind {
 define <16 x i8> @negative(<32 x i8> %v, <32 x i8> %w) nounwind {
 ; AVX1-LABEL: negative:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14]
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[u,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u],zero,zero,zero,zero,zero,zero,zero,xmm0[0,2,4,6,8,10,12,14]
+; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX1-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT:    vzeroupper
diff --git a/test/CodeGen/X86/vector-shuffle-128-v8.ll b/test/CodeGen/X86/vector-shuffle-128-v8.ll
index ad177094f1a..c88f8e47b0e 100644
--- a/test/CodeGen/X86/vector-shuffle-128-v8.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v8.ll
@@ -1651,8 +1651,9 @@ define <8 x i16> @shuffle_v8i16_XX4X8acX(<8 x i16> %a, <8 x i16> %b) {
 ;
 ; SSSE3-LABEL: shuffle_v8i16_XX4X8acX:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,0,1,4,5,8,9,u,u]
-; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,8,9,u,u],zero,zero,zero,zero,zero,zero,xmm0[u,u]
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[u,u,u,u],zero,zero,xmm1[u,u,0,1,4,5,8,9,u,u]
+; SSSE3-NEXT:    por %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: shuffle_v8i16_XX4X8acX:
diff --git a/test/CodeGen/X86/vector-shuffle-256-v32.ll b/test/CodeGen/X86/vector-shuffle-256-v32.ll
index a9d9798ebc7..dc54a60d33c 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v32.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v32.ll
@@ -3254,9 +3254,9 @@ define <32 x i8> @shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_
 ; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[u,u],zero,zero,xmm2[12],zero,xmm2[u,u,u],zero,zero,xmm2[u,0,3]
 ; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[10,13,u,u,3,3],zero,xmm1[8,u,u,u,12,1,u],zero,zero
 ; AVX1-NEXT:    vpor %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = xmm4[u,u,u,u,u,u,u,u,1,6,13,u,u,u,u,u]
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,12,13,u,u,u,u,u,u,u,u,u,12,u,u]
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5],xmm0[6,7]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = xmm4[u,u],zero,zero,xmm4[u,u,u,u,1,6,13,u,u],zero,xmm4[u,u]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,12,13,u,u,u,u],zero,zero,zero,xmm0[u,u,12,u,u]
+; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,255,0,0,255,255,255,255,0,0,0,255,255,0,255,255]
 ; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
diff --git a/test/CodeGen/X86/vector-shuffle-combining-sse41.ll b/test/CodeGen/X86/vector-shuffle-combining-sse41.ll
index a4755845b11..d741f2f9f36 100644
--- a/test/CodeGen/X86/vector-shuffle-combining-sse41.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining-sse41.ll
@@ -21,3 +21,45 @@ define <16 x i8> @combine_vpshufb_as_movzx(<16 x i8> %a0) {
   %res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> )
   ret <16 x i8> %res0
 }
+
+define <16 x i8> @PR50049(<48 x i8>* %p1, <48 x i8>* %p2) {
+; SSE-LABEL: PR50049:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa (%rdi), %xmm2
+; SSE-NEXT:    movdqa 16(%rdi), %xmm0
+; SSE-NEXT:    movdqa 32(%rdi), %xmm1
+; SSE-NEXT:    movdqa (%rsi), %xmm4
+; SSE-NEXT:    movdqa 16(%rsi), %xmm5
+; SSE-NEXT:    movdqa 32(%rsi), %xmm3
+; SSE-NEXT:    movdqa {{.*#+}} xmm6 = <128,128,128,128,128,128,2,5,8,11,14,u,u,u,u,u>
+; SSE-NEXT:    pshufb %xmm6, %xmm0
+; SSE-NEXT:    movdqa {{.*#+}} xmm7 = <0,3,6,9,12,15,128,128,128,128,128,u,u,u,u,u>
+; SSE-NEXT:    pshufb %xmm7, %xmm2
+; SSE-NEXT:    por %xmm0, %xmm2
+; SSE-NEXT:    pshufb %xmm6, %xmm5
+; SSE-NEXT:    pshufb %xmm7, %xmm4
+; SSE-NEXT:    por %xmm5, %xmm4
+; SSE-NEXT:    pmovzxbw {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
+; SSE-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; SSE-NEXT:    pmullw %xmm5, %xmm0
+; SSE-NEXT:    movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
+; SSE-NEXT:    pand %xmm5, %xmm0
+; SSE-NEXT:    movdqa {{.*#+}} xmm6 = <8,u,9,u,10,u,128,u,128,u,128,u,128,u,128,u>
+; SSE-NEXT:    pshufb %xmm6, %xmm4
+; SSE-NEXT:    movdqa {{.*#+}} xmm7 = <128,u,128,u,128,u,1,u,4,u,7,u,10,u,13,u>
+; SSE-NEXT:    pshufb %xmm7, %xmm3
+; SSE-NEXT:    por %xmm4, %xmm3
+; SSE-NEXT:    pshufb %xmm6, %xmm2
+; SSE-NEXT:    pshufb %xmm7, %xmm1
+; SSE-NEXT:    por %xmm2, %xmm1
+; SSE-NEXT:    pmullw %xmm3, %xmm1
+; SSE-NEXT:    pand %xmm5, %xmm1
+; SSE-NEXT:    packuswb %xmm1, %xmm0
+; SSE-NEXT:    retq
+  %x1 = load <48 x i8>, <48 x i8>* %p1, align 16
+  %x2 = load <48 x i8>, <48 x i8>* %p2, align 16
+  %s1 = shufflevector <48 x i8> %x1, <48 x i8> poison, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45>
+  %s2 = shufflevector <48 x i8> %x2, <48 x i8> poison, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45>
+  %r = mul <16 x i8> %s1, %s2
+  ret <16 x i8> %r
+}
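
The widening step blamed in the commit message can also be illustrated
with a minimal sketch, assuming a simplified 2-to-1 widening in the
spirit of canWidenShuffleElements; this widenShuffleMask helper, which
tracks only the undef sentinel (the real function also tracks zeroable
elements), is this sketch's own simplification:

    #include <vector>

    constexpr int SM_SentinelUndef = -1; // assumed sentinel value

    // When a mask is widened 2->1, an undef lane is absorbed into its
    // defined neighbour, so the widened mask no longer records that the
    // lane was undef. E.g. <0, undef, 6, 7> widens to <0, 3>; narrowing
    // that back yields <0, 1, 6, 7> and the undef is gone. That is what
    // let the BLEND form be chosen, only for a later bitcast fold to
    // re-expose the undef and flip the node back to the OR form.
    static bool widenShuffleMask(const std::vector<int> &Mask,
                                 std::vector<int> &WidenedMask) {
      if (Mask.size() % 2 != 0)
        return false; // sketch assumes an even-sized mask
      WidenedMask.clear();
      for (int i = 0, e = (int)Mask.size(); i != e; i += 2) {
        int M0 = Mask[i], M1 = Mask[i + 1];
        if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef)
          WidenedMask.push_back(SM_SentinelUndef);
        else if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0)
          WidenedMask.push_back(M0 / 2); // undef absorbed by neighbour
        else if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1)
          WidenedMask.push_back(M1 / 2); // undef absorbed by neighbour
        else if (M0 >= 0 && (M0 % 2) == 0 && M1 == M0 + 1)
          WidenedMask.push_back(M0 / 2); // contiguous even/odd pair
        else
          return false;                  // mask cannot be widened
      }
      return true;
    }

With the patch applied, getFauxShuffleMask simply refuses to fold the
OR when an undef lane is present, so the two forms can no longer chase
each other, as the PR50049 test above verifies.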