mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-11-23 19:23:23 +01:00
[X86][SSE] Avoid duplicate shuffle input sources in combineX86ShufflesRecursively
rL339686 added the case where a faux shuffle might have repeated shuffle inputs coming from either side of the OR(). This patch improves the insertion of the inputs into the source ops lists to account for this, as well as making it trivial to add support for shuffles with more than 2 inputs in the future. llvm-svn: 339696
This commit is contained in:
parent
9849312f06
commit
cf0f6afa45
@@ -30279,23 +30279,26 @@ static SDValue combineX86ShufflesRecursively(
|
||||
// Add the inputs to the Ops list, avoiding duplicates.
|
||||
SmallVector<SDValue, 16> Ops(SrcOps.begin(), SrcOps.end());
|
||||
|
||||
int InputIdx0 = -1, InputIdx1 = -1;
|
||||
for (int i = 0, e = Ops.size(); i < e; ++i) {
|
||||
SDValue BC = peekThroughBitcasts(Ops[i]);
|
||||
if (Input0 && BC == peekThroughBitcasts(Input0))
|
||||
InputIdx0 = i;
|
||||
if (Input1 && BC == peekThroughBitcasts(Input1))
|
||||
InputIdx1 = i;
|
||||
}
|
||||
auto AddOp = [&Ops](SDValue Input, int InsertionPoint = -1) -> int {
|
||||
if (!Input)
|
||||
return -1;
|
||||
// Attempt to find an existing match.
|
||||
SDValue InputBC = peekThroughBitcasts(Input);
|
||||
for (int i = 0, e = Ops.size(); i < e; ++i)
|
||||
if (InputBC == peekThroughBitcasts(Ops[i]))
|
||||
return i;
|
||||
// Match failed - should we replace an existing Op?
|
||||
if (InsertionPoint >= 0) {
|
||||
Ops[InsertionPoint] = Input;
|
||||
return InsertionPoint;
|
||||
}
|
||||
// Add to the end of the Ops list.
|
||||
Ops.push_back(Input);
|
||||
return Ops.size() - 1;
|
||||
};
|
||||
|
||||
if (Input0 && InputIdx0 < 0) {
|
||||
InputIdx0 = SrcOpIndex;
|
||||
Ops[SrcOpIndex] = Input0;
|
||||
}
|
||||
if (Input1 && InputIdx1 < 0) {
|
||||
InputIdx1 = Ops.size();
|
||||
Ops.push_back(Input1);
|
||||
}
|
||||
int InputIdx0 = AddOp(Input0, SrcOpIndex);
|
||||
int InputIdx1 = AddOp(Input1);
|
||||
|
||||
assert(((RootMask.size() > OpMask.size() &&
|
||||
RootMask.size() % OpMask.size() == 0) ||
|
||||
|
@@ -923,18 +923,12 @@ define <32 x i8> @combine_pshufb_pshufb_or_as_unpcklbw(<32 x i8> %a0, <32 x i8>
|
||||
define <32 x i8> @combine_pshufb_pshufb_or_pshufb(<32 x i8> %a0) {
|
||||
; X32-LABEL: combine_pshufb_pshufb_or_pshufb:
|
||||
; X32: # %bb.0:
|
||||
; X32-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,2,3],zero,zero,zero,zero,ymm0[0,1,2,3],zero,zero,zero,zero,ymm0[16,17,18,19],zero,zero,zero,zero,ymm0[16,17,18,19],zero,zero,zero,zero
|
||||
; X32-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,1,2,3],zero,zero,zero,zero,ymm0[0,1,2,3],zero,zero,zero,zero,ymm0[16,17,18,19],zero,zero,zero,zero,ymm0[16,17,18,19]
|
||||
; X32-NEXT: vpor %ymm0, %ymm1, %ymm0
|
||||
; X32-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,1,4,4,4,5]
|
||||
; X32-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
|
||||
; X32-NEXT: retl
|
||||
;
|
||||
; X64-LABEL: combine_pshufb_pshufb_or_pshufb:
|
||||
; X64: # %bb.0:
|
||||
; X64-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,2,3],zero,zero,zero,zero,ymm0[0,1,2,3],zero,zero,zero,zero,ymm0[16,17,18,19],zero,zero,zero,zero,ymm0[16,17,18,19],zero,zero,zero,zero
|
||||
; X64-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,1,2,3],zero,zero,zero,zero,ymm0[0,1,2,3],zero,zero,zero,zero,ymm0[16,17,18,19],zero,zero,zero,zero,ymm0[16,17,18,19]
|
||||
; X64-NEXT: vpor %ymm0, %ymm1, %ymm0
|
||||
; X64-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,1,4,4,4,5]
|
||||
; X64-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
|
||||
; X64-NEXT: retq
|
||||
%1 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1>)
|
||||
%2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 2, i8 3>)
|
||||
|
@@ -707,20 +707,23 @@ define <16 x i8> @combine_pshufb_pshufb_or_as_unpcklbw(<16 x i8> %a0, <16 x i8>
|
||||
define <16 x i8> @combine_pshufb_pshufb_or_pshufb(<16 x i8> %a0) {
|
||||
; SSE-LABEL: combine_pshufb_pshufb_or_pshufb:
|
||||
; SSE: # %bb.0:
|
||||
; SSE-NEXT: movdqa %xmm0, %xmm1
|
||||
; SSE-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,2,3],zero,zero,zero,zero,xmm1[0,1,2,3],zero,zero,zero,zero
|
||||
; SSE-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero,xmm0[0,1,2,3]
|
||||
; SSE-NEXT: por %xmm1, %xmm0
|
||||
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,1]
|
||||
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
|
||||
; SSE-NEXT: retq
|
||||
;
|
||||
; AVX-LABEL: combine_pshufb_pshufb_or_pshufb:
|
||||
; AVX: # %bb.0:
|
||||
; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,2,3],zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero
|
||||
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero,xmm0[0,1,2,3]
|
||||
; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0
|
||||
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,1]
|
||||
; AVX-NEXT: retq
|
||||
; AVX1-LABEL: combine_pshufb_pshufb_or_pshufb:
|
||||
; AVX1: # %bb.0:
|
||||
; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
|
||||
; AVX1-NEXT: retq
|
||||
;
|
||||
; AVX2-LABEL: combine_pshufb_pshufb_or_pshufb:
|
||||
; AVX2: # %bb.0:
|
||||
; AVX2-NEXT: vbroadcastss %xmm0, %xmm0
|
||||
; AVX2-NEXT: retq
|
||||
;
|
||||
; AVX512F-LABEL: combine_pshufb_pshufb_or_pshufb:
|
||||
; AVX512F: # %bb.0:
|
||||
; AVX512F-NEXT: vbroadcastss %xmm0, %xmm0
|
||||
; AVX512F-NEXT: retq
|
||||
%1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1>)
|
||||
%2 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 2, i8 3>)
|
||||
%3 = or <16 x i8> %1, %2
|
||||
|
Loading…
Reference in New Issue
Block a user