
Revert rG66e7dce714fab "Revert "[X86][SSE] Shuffle combine blends to OR(X,Y) if the relevant elements are known zero.""

[X86][SSE] Shuffle combine blends to OR(X,Y) if the relevant elements are known zero (REAPPLIED)

This allows us to remove the (depth-violating) code in getFauxShuffleMask where we were combining the OR(SHUFFLE,SHUFFLE) shuffle inputs as well, and not just the OR().

This is a minor step toward being able to shuffle combine from/to SELECT/BLENDV as a faux shuffle.

Reapplied with fixed signed/unsigned comparisons.
Simon Pilgrim 2020-08-04 10:32:27 +01:00
parent beaa269335
commit fb31cb4e44
5 changed files with 61 additions and 35 deletions
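
As a hedged aside (not part of the commit), the identity this combine relies on can be checked with a tiny standalone C++ snippet: a blend that takes each lane from one source only where the other source's corresponding lane is known to be zero produces exactly OR(X,Y). The values and the example itself are made up for illustration.

#include <array>
#include <cassert>
#include <cstdint>

// Illustrative only: a blend whose non-selected source element is known zero
// in every lane computes the same result as a plain OR of the two vectors.
int main() {
  // Lanes 0-3 are taken from X (Y is zero there); lanes 4-7 are taken from Y
  // (X is zero there): the "relevant elements are known zero" situation.
  std::array<uint8_t, 8> X = {1, 2, 3, 4, 0, 0, 0, 0};
  std::array<uint8_t, 8> Y = {0, 0, 0, 0, 5, 6, 7, 8};
  for (int i = 0; i != 8; ++i) {
    uint8_t Blend = (i < 4) ? X[i] : Y[i]; // per-lane blend select
    assert(Blend == (X[i] | Y[i]));        // identical to OR(X,Y)
  }
  return 0;
}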


@@ -7401,8 +7401,8 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
     // Handle OR(SHUFFLE,SHUFFLE) case where one source is zero and the other
     // is a valid shuffle index.
-    SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
-    SDValue N1 = peekThroughOneUseBitcasts(N.getOperand(1));
+    SDValue N0 = peekThroughBitcasts(N.getOperand(0));
+    SDValue N1 = peekThroughBitcasts(N.getOperand(1));
     if (!N0.getValueType().isVector() || !N1.getValueType().isVector())
       return false;
     SmallVector<int, 64> SrcMask0, SrcMask1;
@@ -7413,34 +7413,24 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
                                 true))
       return false;
-    // Shuffle inputs must be the same size as the result.
-    if (llvm::any_of(SrcInputs0, [VT](SDValue Op) {
-          return VT.getSizeInBits() != Op.getValueSizeInBits();
-        }))
-      return false;
-    if (llvm::any_of(SrcInputs1, [VT](SDValue Op) {
-          return VT.getSizeInBits() != Op.getValueSizeInBits();
-        }))
-      return false;
     size_t MaskSize = std::max(SrcMask0.size(), SrcMask1.size());
     SmallVector<int, 64> Mask0, Mask1;
     narrowShuffleMaskElts(MaskSize / SrcMask0.size(), SrcMask0, Mask0);
     narrowShuffleMaskElts(MaskSize / SrcMask1.size(), SrcMask1, Mask1);
-    for (size_t i = 0; i != MaskSize; ++i) {
+    for (int i = 0; i != (int)MaskSize; ++i) {
       if (Mask0[i] == SM_SentinelUndef && Mask1[i] == SM_SentinelUndef)
         Mask.push_back(SM_SentinelUndef);
       else if (Mask0[i] == SM_SentinelZero && Mask1[i] == SM_SentinelZero)
         Mask.push_back(SM_SentinelZero);
       else if (Mask1[i] == SM_SentinelZero)
-        Mask.push_back(Mask0[i]);
+        Mask.push_back(i);
       else if (Mask0[i] == SM_SentinelZero)
-        Mask.push_back(Mask1[i] + (int)(MaskSize * SrcInputs0.size()));
+        Mask.push_back(i + MaskSize);
       else
         return false;
     }
-    Ops.append(SrcInputs0.begin(), SrcInputs0.end());
-    Ops.append(SrcInputs1.begin(), SrcInputs1.end());
+    Ops.push_back(N0);
+    Ops.push_back(N1);
     return true;
   }
   case ISD::INSERT_SUBVECTOR: {
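
For orientation only, a self-contained sketch of the merge rule the rewritten loop applies: mask indices 0..MaskSize-1 now refer to elements of the first OR operand and MaskSize..2*MaskSize-1 to the second, instead of indexing the recursively resolved shuffle inputs. The sentinel constants mirror LLVM's; the helper name and the use of std::optional are stand-ins, not LLVM API.

#include <cstddef>
#include <optional>
#include <vector>

constexpr int SM_SentinelUndef = -1; // stand-ins for LLVM's shuffle sentinels
constexpr int SM_SentinelZero = -2;

// Merge the masks of OR(N0, N1) into one mask over {N0, N1}, assuming both
// masks have already been widened/narrowed to the same element count.
std::optional<std::vector<int>> mergeOrMasks(const std::vector<int> &Mask0,
                                             const std::vector<int> &Mask1) {
  size_t MaskSize = Mask0.size();
  std::vector<int> Mask;
  for (size_t i = 0; i != MaskSize; ++i) {
    if (Mask0[i] == SM_SentinelUndef && Mask1[i] == SM_SentinelUndef)
      Mask.push_back(SM_SentinelUndef);
    else if (Mask0[i] == SM_SentinelZero && Mask1[i] == SM_SentinelZero)
      Mask.push_back(SM_SentinelZero);
    else if (Mask1[i] == SM_SentinelZero)
      Mask.push_back((int)i);              // element i of the first operand
    else if (Mask0[i] == SM_SentinelZero)
      Mask.push_back((int)(i + MaskSize)); // element i of the second operand
    else
      return std::nullopt; // both lanes may be nonzero: not a valid OR blend
  }
  return Mask;
}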
@@ -34219,6 +34209,7 @@ static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
                                SelectionDAG &DAG, const X86Subtarget &Subtarget,
                                unsigned &Shuffle, MVT &SrcVT, MVT &DstVT,
                                bool IsUnary) {
+  unsigned NumMaskElts = Mask.size();
   unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
   if (MaskVT.is128BitVector()) {
@@ -34276,6 +34267,46 @@ static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
     }
   }
+  // Attempt to match against a OR if we're performing a blend shuffle and the
+  // non-blended source element is zero in each case.
+  if ((EltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
+      (EltSizeInBits % V2.getScalarValueSizeInBits()) == 0) {
+    bool IsBlend = true;
+    unsigned NumV1Elts = V1.getValueType().getVectorNumElements();
+    unsigned NumV2Elts = V2.getValueType().getVectorNumElements();
+    unsigned Scale1 = NumV1Elts / NumMaskElts;
+    unsigned Scale2 = NumV2Elts / NumMaskElts;
+    APInt DemandedZeroV1 = APInt::getNullValue(NumV1Elts);
+    APInt DemandedZeroV2 = APInt::getNullValue(NumV2Elts);
+    for (unsigned i = 0; i != NumMaskElts; ++i) {
+      int M = Mask[i];
+      if (M == SM_SentinelUndef)
+        continue;
+      if (M == SM_SentinelZero) {
+        DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
+        DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
+        continue;
+      }
+      if (M == (int)i) {
+        DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
+        continue;
+      }
+      if (M == (int)(i + NumMaskElts)) {
+        DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
+        continue;
+      }
+      IsBlend = false;
+      break;
+    }
+    if (IsBlend &&
+        DAG.computeKnownBits(V1, DemandedZeroV1).isZero() &&
+        DAG.computeKnownBits(V2, DemandedZeroV2).isZero()) {
+      Shuffle = ISD::OR;
+      SrcVT = DstVT = EVT(MaskVT).changeTypeToInteger().getSimpleVT();
+      return true;
+    }
+  }
   return false;
 }
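
A simplified, self-contained restatement of the matching step added above, with std::vector<bool> standing in for APInt and hypothetical names (DemandedZero, computeDemandedZero); the real code additionally asks computeKnownBits whether every demanded element really is zero before rewriting the shuffle as ISD::OR.

#include <vector>

constexpr int SM_SentinelUndef = -1; // stand-ins for LLVM's shuffle sentinels
constexpr int SM_SentinelZero = -2;

struct DemandedZero {
  std::vector<bool> V1, V2; // per-source elements that must be known zero
  bool IsBlend = true;
};

// Assumes NumV1Elts and NumV2Elts are (possibly larger) multiples of the
// mask size, matching the EltSizeInBits divisibility check in the hunk.
DemandedZero computeDemandedZero(const std::vector<int> &Mask,
                                 unsigned NumV1Elts, unsigned NumV2Elts) {
  unsigned NumMaskElts = (unsigned)Mask.size();
  unsigned Scale1 = NumV1Elts / NumMaskElts;
  unsigned Scale2 = NumV2Elts / NumMaskElts;
  DemandedZero DZ{std::vector<bool>(NumV1Elts), std::vector<bool>(NumV2Elts)};
  auto SetBits = [](std::vector<bool> &Bits, unsigned Lo, unsigned Hi) {
    for (unsigned j = Lo; j != Hi; ++j)
      Bits[j] = true;
  };
  for (unsigned i = 0; i != NumMaskElts; ++i) {
    int M = Mask[i];
    if (M == SM_SentinelUndef)
      continue;
    if (M == SM_SentinelZero) {               // lane must be zero in both sources
      SetBits(DZ.V1, i * Scale1, (i + 1) * Scale1);
      SetBits(DZ.V2, i * Scale2, (i + 1) * Scale2);
    } else if (M == (int)i) {                 // lane taken from V1, so V2 must be zero
      SetBits(DZ.V2, i * Scale2, (i + 1) * Scale2);
    } else if (M == (int)(i + NumMaskElts)) { // lane taken from V2, so V1 must be zero
      SetBits(DZ.V1, i * Scale1, (i + 1) * Scale1);
    } else {                                  // lane changes position: not a blend
      DZ.IsBlend = false;
      break;
    }
  }
  return DZ;
}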


@@ -389,11 +389,9 @@ define <32 x i8> @insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx(<32 x i8> %a) {
 ; SSE2-NEXT: movdqa %xmm3, %xmm4
 ; SSE2-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0]
 ; SSE2-NEXT: por %xmm4, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255]
-; SSE2-NEXT: pand %xmm5, %xmm1
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
 ; SSE2-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1]
-; SSE2-NEXT: pandn %xmm3, %xmm5
-; SSE2-NEXT: por %xmm5, %xmm1
+; SSE2-NEXT: por %xmm3, %xmm1
 ; SSE2-NEXT: pand %xmm2, %xmm1
 ; SSE2-NEXT: por %xmm4, %xmm1
 ; SSE2-NEXT: retq
@@ -411,11 +409,9 @@ define <32 x i8> @insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx(<32 x i8> %a) {
 ; SSE3-NEXT: movdqa %xmm3, %xmm4
 ; SSE3-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0]
 ; SSE3-NEXT: por %xmm4, %xmm0
-; SSE3-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255]
-; SSE3-NEXT: pand %xmm5, %xmm1
+; SSE3-NEXT: pand {{.*}}(%rip), %xmm1
 ; SSE3-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1]
-; SSE3-NEXT: pandn %xmm3, %xmm5
-; SSE3-NEXT: por %xmm5, %xmm1
+; SSE3-NEXT: por %xmm3, %xmm1
 ; SSE3-NEXT: pand %xmm2, %xmm1
 ; SSE3-NEXT: por %xmm4, %xmm1
 ; SSE3-NEXT: retq
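
The SSE2/SSE3 diffs above replace a pand/pandn/por byte select with a single por. The scalar identity behind this, illustrative only and not taken from the commit, is that a masked select (A & M) | (B & ~M) collapses to (A & M) | B whenever the bits of B covered by M are already known to be zero, which is exactly the known-zero condition the new combine verifies.

#include <cassert>
#include <cstdint>

int main() {
  // M selects which bits come from A; B is known zero wherever M is set.
  const uint64_t M = 0x00FFFFFFFFFFFFFFULL;
  const uint64_t A = 0x1122334455667788ULL;
  const uint64_t B = 0xAA00000000000000ULL;
  assert((B & M) == 0);                          // the known-zero premise
  assert(((A & M) | (B & ~M)) == ((A & M) | B)); // masked select == plain OR
  return 0;
}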


@@ -1314,10 +1314,10 @@ define void @trunc_v4i64_to_v4i8(<32 x i8>* %L, <4 x i8>* %S) nounwind {
 define <16 x i8> @negative(<32 x i8> %v, <32 x i8> %w) nounwind {
 ; AVX1-LABEL: negative:
 ; AVX1: # %bb.0:
-; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[u,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u],zero,zero,zero,zero,zero,zero,zero,xmm0[0,2,4,6,8,10,12,14]
-; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX1-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT: vzeroupper


@@ -1713,9 +1713,8 @@ define <8 x i16> @shuffle_v8i16_XX4X8acX(<8 x i16> %a, <8 x i16> %b) {
 ;
 ; SSSE3-LABEL: shuffle_v8i16_XX4X8acX:
 ; SSSE3: # %bb.0:
-; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,8,9,u,u],zero,zero,zero,zero,zero,zero,xmm0[u,u]
-; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[u,u,u,u],zero,zero,xmm1[u,u,0,1,4,5,8,9,u,u]
-; SSSE3-NEXT: por %xmm1, %xmm0
+; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,0,1,4,5,8,9,u,u]
+; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
 ; SSSE3-NEXT: retq
 ;
 ; SSE41-LABEL: shuffle_v8i16_XX4X8acX:


@@ -3358,9 +3358,9 @@ define <32 x i8> @shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_
 ; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[u,u],zero,zero,xmm2[12],zero,xmm2[u,u,u],zero,zero,xmm2[u,0,3]
 ; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10,13,u,u,3,3],zero,xmm1[8,u,u,u,12,1,u],zero,zero
 ; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm4[u,u],zero,zero,xmm4[u,u,u,u,1,6,13,u,u],zero,xmm4[u,u]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,12,13,u,u,u,u],zero,zero,zero,xmm0[u,u,12,u,u]
-; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm4[u,u,u,u,u,u,u,u,1,6,13,u,u,u,u,u]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,12,13,u,u,u,u,u,u,u,u,u,12,u,u]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5],xmm0[6,7]
 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,0,0,255,255,255,255,0,0,0,255,255,0,255,255]
 ; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0