
[X86][SSE] combineX86ShuffleChain - attempt to recognise 'hidden' identity shuffles

See if the combined shuffle mask is equivalent to an identity shuffle. Typically this is due to repeated LHS/RHS ops in horiz-ops, but isTargetShuffleEquivalent might see other patterns as well.

This is another small step towards getting rid of foldShuffleOfHorizOp and relying on canonicalizeShuffleMaskWithHorizOp and generic shuffle combining.
Author: Simon Pilgrim
Date:   2021-03-27 11:09:30 +00:00
Parent: c9eefe2dea
Commit: cdae6be18a
2 changed files with 30 additions and 20 deletions
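As a concrete illustration of the 'hidden identity' case described above, here is a minimal standalone sketch (plain C++, not LLVM code): a horizontal add of a register with itself produces a value whose two halves are equal, so the movddup/pshufd duplication of its low half that shows up in the SSSE3 test diffs below is a no-op.

#include <array>
#include <cassert>

using V4f = std::array<float, 4>;

// haddps %a, %a: both halves of the result hold [a0+a1, a2+a3].
static V4f haddRepeated(const V4f &A) {
  return {A[0] + A[1], A[2] + A[3], A[0] + A[1], A[2] + A[3]};
}

// Generic single-input 4-lane shuffle.
static V4f shuffle(const V4f &A, std::array<int, 4> M) {
  return {A[M[0]], A[M[1]], A[M[2]], A[M[3]]};
}

int main() {
  V4f H = haddRepeated({1.0f, 2.0f, 3.0f, 4.0f});
  // <0,1,0,1> (what movddup/pshufd of the low half computes) is not the
  // identity mask <0,1,2,3>, yet on H it reproduces the input exactly -
  // a "hidden" identity.
  assert(shuffle(H, {0, 1, 0, 1}) == H);
  return 0;
}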

llvm/lib/Target/X86/X86ISelLowering.cpp

@@ -35301,6 +35301,20 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
     return CanonicalizeShuffleInput(RootVT, V1);
   }
 
+  // See if the shuffle is a hidden identity shuffle - repeated args in HOPs
+  // etc. can be simplified.
+  if (VT1 == VT2 && VT1.getSizeInBits() == RootSizeInBits) {
+    SmallVector<int> ScaledMask, IdentityMask;
+    unsigned NumElts = VT1.getVectorNumElements();
+    if (BaseMask.size() <= NumElts &&
+        scaleShuffleElements(BaseMask, NumElts, ScaledMask)) {
+      for (unsigned i = 0; i != NumElts; ++i)
+        IdentityMask.push_back(i);
+      if (isTargetShuffleEquivalent(RootVT, ScaledMask, IdentityMask, V1, V2))
+        return CanonicalizeShuffleInput(RootVT, V1);
+    }
+  }
+
   // Handle 128/256-bit lane shuffles of 512-bit vectors.
   if (RootVT.is512BitVector() &&
       (NumBaseMaskElts == 2 || NumBaseMaskElts == 4)) {

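A rough standalone model of the check added above (plain C++; scaleMask and isEffectiveIdentity are simplified stand-ins for scaleShuffleElements and isTargetShuffleEquivalent, not their real signatures): the base mask is rescaled to the operand's element count, and the shuffle is treated as an identity if every lane is undef, reads its own lane from the first operand, or reads the same lane from the second operand when both operands are known to be the same value.

#include <cstdio>
#include <vector>

// Widen a shuffle mask to NumElts lanes (each source element covers
// NumElts / Mask.size() lanes). Returns false when the sizes do not divide
// evenly, loosely mirroring scaleShuffleElements failing.
static bool scaleMask(const std::vector<int> &Mask, unsigned NumElts,
                      std::vector<int> &Scaled) {
  unsigned Size = Mask.size();
  if (Size == 0 || NumElts % Size != 0)
    return false;
  unsigned Scale = NumElts / Size;
  Scaled.clear();
  for (int M : Mask)
    for (unsigned I = 0; I != Scale; ++I)
      Scaled.push_back(M < 0 ? -1 : M * (int)Scale + (int)I);
  return true;
}

// A lane passes if it is undef (< 0), reads its own position from the first
// operand, or reads the same position from the second operand when both
// operands are the same value (the repeated-argument horizontal-op case).
static bool isEffectiveIdentity(const std::vector<int> &Mask, bool OpsEqual) {
  unsigned NumElts = Mask.size();
  for (unsigned I = 0; I != NumElts; ++I) {
    int M = Mask[I];
    if (M < 0 || M == (int)I)
      continue;
    if (OpsEqual && M == (int)(I + NumElts))
      continue;
    return false;
  }
  return true;
}

int main() {
  // A 2-element mask <0,3> (low half of op0, high half of op1) scales to
  // <0,1,6,7> on 4 lanes. With both operands equal, every lane still reads
  // its own position, so the whole chain is a hidden identity.
  std::vector<int> Scaled;
  if (scaleMask({0, 3}, 4, Scaled) && isEffectiveIdentity(Scaled, true))
    std::puts("hidden identity: reuse the canonicalized input directly");
  return 0;
}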
llvm/test/CodeGen/X86/horizontal-sum.ll

@@ -26,13 +26,12 @@ define <4 x float> @pair_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl
 ; SSSE3-SLOW-NEXT:    addps %xmm1, %xmm0
 ; SSSE3-SLOW-NEXT:    haddps %xmm2, %xmm2
 ; SSSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
-; SSSE3-SLOW-NEXT:    movddup {{.*#+}} xmm2 = xmm2[0,0]
-; SSSE3-SLOW-NEXT:    addps %xmm1, %xmm2
+; SSSE3-SLOW-NEXT:    addps %xmm2, %xmm1
 ; SSSE3-SLOW-NEXT:    haddps %xmm3, %xmm3
-; SSSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-SLOW-NEXT:    addps %xmm3, %xmm1
-; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
-; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
+; SSSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; SSSE3-SLOW-NEXT:    addps %xmm3, %xmm2
+; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3]
+; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0]
 ; SSSE3-SLOW-NEXT:    retq
 ;
 ; SSSE3-FAST-LABEL: pair_sum_v4f32_v4f32:
@@ -109,13 +108,12 @@ define <4 x i32> @pair_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2,
 ; SSSE3-SLOW-NEXT:    paddd %xmm1, %xmm0
 ; SSSE3-SLOW-NEXT:    phaddd %xmm2, %xmm2
 ; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1]
-; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
-; SSSE3-SLOW-NEXT:    paddd %xmm1, %xmm2
+; SSSE3-SLOW-NEXT:    paddd %xmm2, %xmm1
 ; SSSE3-SLOW-NEXT:    phaddd %xmm3, %xmm3
-; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1]
-; SSSE3-SLOW-NEXT:    paddd %xmm3, %xmm1
-; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
-; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
+; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,1,1,1]
+; SSSE3-SLOW-NEXT:    paddd %xmm3, %xmm2
+; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3]
+; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0]
 ; SSSE3-SLOW-NEXT:    retq
 ;
 ; SSSE3-FAST-LABEL: pair_sum_v4i32_v4i32:
@@ -136,9 +134,8 @@ define <4 x i32> @pair_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2,
 ; AVX1-SLOW-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
 ; AVX1-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX1-SLOW-NEXT:    vphaddd %xmm3, %xmm3, %xmm1
-; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
-; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; AVX1-SLOW-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
+; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[0,0,0,0]
+; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
 ; AVX1-SLOW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
 ; AVX1-SLOW-NEXT:    retq
 ;
@@ -161,9 +158,8 @@ define <4 x i32> @pair_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2,
 ; AVX2-SLOW-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
 ; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,3]
 ; AVX2-SLOW-NEXT:    vphaddd %xmm3, %xmm3, %xmm1
-; AVX2-SLOW-NEXT:    vpbroadcastq %xmm1, %xmm2
-; AVX2-SLOW-NEXT:    vpbroadcastd %xmm1, %xmm1
-; AVX2-SLOW-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
+; AVX2-SLOW-NEXT:    vpbroadcastd %xmm1, %xmm2
+; AVX2-SLOW-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
 ; AVX2-SLOW-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
 ; AVX2-SLOW-NEXT:    retq
   %5 = shufflevector <4 x i32> %0, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
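In the AVX1-SLOW and AVX2-SLOW hunks above, one of the two lane broadcasts disappears. A quick standalone check (plain C++ lane model, with the lane numbering taken from the asm above) that the shorter sequence still leaves the required pair sum in lane 3, the only lane the following vpblendw/vblendps reads:

#include <array>
#include <cassert>

using V4 = std::array<int, 4>;

// Broadcast one lane to all four lanes (vpshufd [n,n,n,n] / vpbroadcastd).
static V4 splat(const V4 &A, int Lane) {
  return {A[Lane], A[Lane], A[Lane], A[Lane]};
}

static V4 add(const V4 &A, const V4 &B) {
  return {A[0] + B[0], A[1] + B[1], A[2] + B[2], A[3] + B[3]};
}

int main() {
  V4 X = {1, 2, 3, 4};
  V4 H = {X[0] + X[1], X[2] + X[3], X[0] + X[1], X[2] + X[3]}; // vphaddd x, x

  V4 OldRes = add(splat(H, 1), splat(H, 0)); // two shuffles + add
  V4 NewRes = add(splat(H, 0), H);           // one shuffle + add
  assert(OldRes[3] == NewRes[3]);            // lane 3 feeds the final blend
  return 0;
}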
@@ -441,7 +437,7 @@ define <8 x i32> @pair_sum_v8i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2,
 ; AVX1-SLOW-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm5[6,7]
 ; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,3,1,1]
 ; AVX1-SLOW-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[1],zero
-; AVX1-SLOW-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[1]
+; AVX1-SLOW-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
 ; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
 ; AVX1-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
@@ -493,7 +489,7 @@ define <8 x i32> @pair_sum_v8i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2,
 ; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm3 = xmm2[0,2],xmm1[0,3]
 ; AVX2-SLOW-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0]
 ; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]
-; AVX2-SLOW-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[1]
+; AVX2-SLOW-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
 ; AVX2-SLOW-NEXT:    vpaddd %xmm1, %xmm3, %xmm1
 ; AVX2-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
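The vinsertps to vblendps change in the last two hunks relies on the same repeated-operand property, assuming xmm4 here is again a horizontal add of a value with itself: lanes 1 and 3 of such a value are equal, so blending lane 3 is interchangeable with inserting lane 1. A small standalone check (plain C++ lane model):

#include <array>
#include <cassert>

using V4 = std::array<int, 4>;

// vinsertps-style: overwrite lane 3 of Dst with lane SrcLane of Src.
static V4 insertLane3(V4 Dst, const V4 &Src, int SrcLane) {
  Dst[3] = Src[SrcLane];
  return Dst;
}

// vblendps-style: keep lanes 0-2 of Dst, take lane 3 from Src.
static V4 blendLane3(const V4 &Dst, const V4 &Src) {
  return {Dst[0], Dst[1], Dst[2], Src[3]};
}

int main() {
  V4 Acc = {10, 20, 30, 40};           // lanes 0-2 carry earlier pair sums
  V4 H = {5 + 6, 7 + 8, 5 + 6, 7 + 8}; // horizontal add of a value with itself
  // H[1] == H[3], so inserting lane 1 and blending lane 3 are interchangeable.
  assert(insertLane3(Acc, H, 1) == blendLane3(Acc, H));
  return 0;
}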