[X86][SSE] combineX86ShuffleChain - attempt to recognise 'hidden' identity shuffles
See if the combined shuffle mask is equivalent to an identity shuffle; typically this is due to repeated LHS/RHS ops in horiz-ops, but isTargetShuffleEquivalent might see other patterns as well. This is another small step towards getting rid of foldShuffleOfHorizOp and relying on canonicalizeShuffleMaskWithHorizOp and generic shuffle combining.
parent c9eefe2dea
commit cdae6be18a
@@ -35301,6 +35301,20 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
      return CanonicalizeShuffleInput(RootVT, V1);
  }

  // See if the shuffle is a hidden identity shuffle - repeated args in HOPs
  // etc. can be simplified.
  if (VT1 == VT2 && VT1.getSizeInBits() == RootSizeInBits) {
    SmallVector<int> ScaledMask, IdentityMask;
    unsigned NumElts = VT1.getVectorNumElements();
    if (BaseMask.size() <= NumElts &&
        scaleShuffleElements(BaseMask, NumElts, ScaledMask)) {
      for (unsigned i = 0; i != NumElts; ++i)
        IdentityMask.push_back(i);
      if (isTargetShuffleEquivalent(RootVT, ScaledMask, IdentityMask, V1, V2))
        return CanonicalizeShuffleInput(RootVT, V1);
    }
  }

  // Handle 128/256-bit lane shuffles of 512-bit vectors.
  if (RootVT.is512BitVector() &&
      (NumBaseMaskElts == 2 || NumBaseMaskElts == 4)) {
@@ -26,13 +26,12 @@ define <4 x float> @pair_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl
; SSSE3-SLOW-NEXT: addps %xmm1, %xmm0
; SSSE3-SLOW-NEXT: haddps %xmm2, %xmm2
; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSSE3-SLOW-NEXT: movddup {{.*#+}} xmm2 = xmm2[0,0]
; SSSE3-SLOW-NEXT: addps %xmm1, %xmm2
; SSSE3-SLOW-NEXT: addps %xmm2, %xmm1
; SSSE3-SLOW-NEXT: haddps %xmm3, %xmm3
; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm3[1,1,3,3]
; SSSE3-SLOW-NEXT: addps %xmm3, %xmm1
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm2 = xmm3[1,1,3,3]
; SSSE3-SLOW-NEXT: addps %xmm3, %xmm2
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3]
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0]
; SSSE3-SLOW-NEXT: retq
;
; SSSE3-FAST-LABEL: pair_sum_v4f32_v4f32:
@@ -109,13 +108,12 @@ define <4 x i32> @pair_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2,
; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0
; SSSE3-SLOW-NEXT: phaddd %xmm2, %xmm2
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1]
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm2
; SSSE3-SLOW-NEXT: paddd %xmm2, %xmm1
; SSSE3-SLOW-NEXT: phaddd %xmm3, %xmm3
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1]
; SSSE3-SLOW-NEXT: paddd %xmm3, %xmm1
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,1,1]
; SSSE3-SLOW-NEXT: paddd %xmm3, %xmm2
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3]
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0]
; SSSE3-SLOW-NEXT: retq
;
; SSSE3-FAST-LABEL: pair_sum_v4i32_v4i32:
@@ -136,9 +134,8 @@ define <4 x i32> @pair_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2,
; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-SLOW-NEXT: vphaddd %xmm3, %xmm3, %xmm1
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,0,0,0]
; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm2, %xmm1
; AVX1-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
; AVX1-SLOW-NEXT: retq
;
@@ -161,9 +158,8 @@ define <4 x i32> @pair_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2,
; AVX2-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,3]
; AVX2-SLOW-NEXT: vphaddd %xmm3, %xmm3, %xmm1
; AVX2-SLOW-NEXT: vpbroadcastq %xmm1, %xmm2
; AVX2-SLOW-NEXT: vpbroadcastd %xmm1, %xmm1
; AVX2-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX2-SLOW-NEXT: vpbroadcastd %xmm1, %xmm2
; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm2, %xmm1
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; AVX2-SLOW-NEXT: retq
%5 = shufflevector <4 x i32> %0, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
@@ -441,7 +437,7 @@ define <8 x i32> @pair_sum_v8i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2,
; AVX1-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm5[6,7]
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,3,1,1]
; AVX1-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[1],zero
; AVX1-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[1]
; AVX1-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm2, %xmm1
; AVX1-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
@@ -493,7 +489,7 @@ define <8 x i32> @pair_sum_v8i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2,
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm2[0,2],xmm1[0,3]
; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0]
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]
; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[1]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm3, %xmm1
; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]