
[X86] Fold CONCAT(VPERMV3(X,Y,M0),VPERMV3(Z,W,M1)) -> VPERMV3(CONCAT(X,Z),CONCAT(Y,W),CONCAT(M0,M1))

Further prep work toward supporting different subvector sizes in combineX86ShufflesRecursively
Simon Pilgrim 2020-12-08 18:35:23 +00:00
parent 3423fa8d78
commit 7df5c0cf1d
2 changed files with 68 additions and 42 deletions
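
Before the diff itself, a quick sanity check of the mask arithmetic the new fold performs. This is a standalone sketch, not LLVM code: remapSubMask is a hypothetical helper that mirrors the remap loop added to combineConcatVectorOps below, using std::vector in place of SmallVector. Fed the two 256-bit vpermi2q masks that appear in the tests, it reproduces the new 512-bit mask:

#include <cstdio>
#include <vector>

// Mirror of the remap in the new X86ISD::VPERMV3 case: a negative index
// (undef) passes through; an index into the second source (>= NumSrcElts)
// skips over the widened first source; and operand OpIdx addresses the
// OpIdx'th subvector of each widened source.
static void remapSubMask(const std::vector<int> &SubMask, int NumSrcElts,
                         int OpIdx, std::vector<int> &ConcatMask) {
  for (int M : SubMask) {
    if (0 <= M) {
      M += M < NumSrcElts ? 0 : NumSrcElts;
      M += OpIdx * NumSrcElts;
    }
    ConcatMask.push_back(M);
  }
}

int main() {
  std::vector<int> ConcatMask;
  remapSubMask({0, 4, 1, 5}, 4, 0, ConcatMask); // low 256-bit vpermi2q mask
  remapSubMask({2, 6, 3, 7}, 4, 1, ConcatMask); // high 256-bit vpermi2q mask
  for (int M : ConcatMask)
    printf("%d ", M); // prints: 0 8 1 9 6 14 7 15 (the new zmm mask)
  printf("\n");
  return 0;
}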


@@ -48813,6 +48813,38 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
         return DAG.getBitcast(VT, Res);
       }
       break;
+    case X86ISD::VPERMV3:
+      if (!IsSplat && NumOps == 2 && VT.is512BitVector()) {
+        MVT OpVT = Op0.getSimpleValueType();
+        int NumSrcElts = OpVT.getVectorNumElements();
+        SmallVector<int, 64> ConcatMask;
+        for (unsigned i = 0; i != NumOps; ++i) {
+          bool IsUnary;
+          SmallVector<int, 64> SubMask;
+          SmallVector<SDValue, 2> SubOps;
+          if (!getTargetShuffleMask(Ops[i].getNode(), OpVT, false, SubOps,
+                                    SubMask, IsUnary))
+            break;
+          for (int M : SubMask) {
+            if (0 <= M) {
+              M += M < NumSrcElts ? 0 : NumSrcElts;
+              M += i * NumSrcElts;
+            }
+            ConcatMask.push_back(M);
+          }
+        }
+        if (ConcatMask.size() == (NumOps * NumSrcElts)) {
+          SDValue Src0 = concatSubVectors(Ops[0].getOperand(0),
+                                          Ops[1].getOperand(0), DAG, DL);
+          SDValue Src1 = concatSubVectors(Ops[0].getOperand(2),
+                                          Ops[1].getOperand(2), DAG, DL);
+          MVT IntMaskSVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
+          MVT IntMaskVT = MVT::getVectorVT(IntMaskSVT, NumOps * NumSrcElts);
+          SDValue Mask = getConstVector(ConcatMask, IntMaskVT, DAG, DL, true);
+          return DAG.getNode(X86ISD::VPERMV3, DL, VT, Src0, Mask, Src1);
+        }
+      }
+      break;
     case X86ISD::VSHLI:
     case X86ISD::VSRAI:
     case X86ISD::VSRLI:
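
To see why the remapped mask preserves semantics, the fold can be modeled on plain arrays. The sketch below is an illustrative model only: the vpermv3 helper imitates X86ISD::VPERMV3 element selection (mask entries below N pick from the first source, entries in [N, 2N) from the second), and the assertion checks CONCAT(VPERMV3(X,Y,M0),VPERMV3(Z,W,M1)) == VPERMV3(CONCAT(X,Z),CONCAT(Y,W),CONCAT(M0',M1')) for the masks used in the tests:

#include <cassert>
#include <vector>

// Model of X86ISD::VPERMV3: result[i] = Mask[i] < N ? A[Mask[i]] : B[Mask[i] - N].
static std::vector<int> vpermv3(const std::vector<int> &A,
                                const std::vector<int> &B,
                                const std::vector<int> &Mask) {
  int N = (int)A.size();
  std::vector<int> R;
  for (int M : Mask)
    R.push_back(M < N ? A[M] : B[M - N]);
  return R;
}

static std::vector<int> concat(std::vector<int> Lo, const std::vector<int> &Hi) {
  Lo.insert(Lo.end(), Hi.begin(), Hi.end());
  return Lo;
}

int main() {
  // Arbitrary distinct element values for X, Y, Z, W.
  std::vector<int> X = {0, 1, 2, 3}, Y = {4, 5, 6, 7};
  std::vector<int> Z = {8, 9, 10, 11}, W = {12, 13, 14, 15};
  std::vector<int> M0 = {0, 4, 1, 5}, M1 = {2, 6, 3, 7};
  int NumSrcElts = 4;

  // Build CONCAT(M0', M1') with the same remap as the hunk above.
  std::vector<int> ConcatMask;
  std::vector<std::vector<int>> SubMasks = {M0, M1};
  for (int i = 0; i != 2; ++i)
    for (int M : SubMasks[i]) {
      M += M < NumSrcElts ? 0 : NumSrcElts;
      M += i * NumSrcElts;
      ConcatMask.push_back(M);
    }

  // Narrow form: concat of the two small permutes.
  std::vector<int> Narrow = concat(vpermv3(X, Y, M0), vpermv3(Z, W, M1));
  // Wide form: one double-width permute of the concatenated sources.
  std::vector<int> Wide = vpermv3(concat(X, Z), concat(Y, W), ConcatMask);
  assert(Narrow == Wide);
  return 0;
}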


@@ -145,11 +145,10 @@ define <32 x i16> @concat_trunc_packssdw_512(<16 x i32> %a0, <16 x i32> %a1) nounwind {
 ; AVX512-NEXT:    vpsrad $23, %zmm1, %zmm1
 ; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
 ; AVX512-NEXT:    vpmovdw %zmm1, %ymm1
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm2 = [2,6,3,7]
-; AVX512-NEXT:    vpermi2q %ymm1, %ymm0, %ymm2
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,4,1,5]
-; AVX512-NEXT:    vpermi2q %ymm1, %ymm0, %ymm3
-; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm0
+; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm1, %zmm1
+; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm2
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,8,1,9,6,14,7,15]
+; AVX512-NEXT:    vpermi2q %zmm1, %zmm2, %zmm0
 ; AVX512-NEXT:    retq
   %1 = ashr <16 x i32> %a0, <i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17>
   %2 = ashr <16 x i32> %a1, <i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23>
@@ -166,11 +165,10 @@ define <32 x i16> @concat_trunc_packusdw_512(<16 x i32> %a0, <16 x i32> %a1) nounwind {
 ; AVX512-NEXT:    vpsrld $23, %zmm1, %zmm1
 ; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
 ; AVX512-NEXT:    vpmovdw %zmm1, %ymm1
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm2 = [2,6,3,7]
-; AVX512-NEXT:    vpermi2q %ymm1, %ymm0, %ymm2
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,4,1,5]
-; AVX512-NEXT:    vpermi2q %ymm1, %ymm0, %ymm3
-; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm0
+; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm1, %zmm1
+; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm2
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,8,1,9,6,14,7,15]
+; AVX512-NEXT:    vpermi2q %zmm1, %zmm2, %zmm0
 ; AVX512-NEXT:    retq
   %1 = lshr <16 x i32> %a0, <i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17>
   %2 = lshr <16 x i32> %a1, <i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23>
@@ -183,26 +181,25 @@ define <32 x i16> @concat_trunc_packusdw_512(<16 x i32> %a0, <16 x i32> %a1) nounwind {
 define <64 x i8> @concat_trunc_packsswb_512(<32 x i16> %a0, <32 x i16> %a1) nounwind {
 ; AVX512F-LABEL: concat_trunc_packsswb_512:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
-; AVX512F-NEXT:    vpsraw $15, %ymm2, %ymm2
+; AVX512F-NEXT:    vpsraw $15, %ymm0, %ymm2
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
 ; AVX512F-NEXT:    vpsraw $15, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
 ; AVX512F-NEXT:    vpmovdb %zmm2, %xmm2
-; AVX512F-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
-; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm2 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
-; AVX512F-NEXT:    vpmovdb %zmm2, %xmm2
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512F-NEXT:    vpmovdb %zmm3, %xmm3
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
 ; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
 ; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
-; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm3, %ymm1
 ; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [2,6,3,7]
-; AVX512F-NEXT:    vpermi2q %ymm1, %ymm0, %ymm2
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,4,1,5]
-; AVX512F-NEXT:    vpermi2q %ymm1, %ymm0, %ymm3
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm0
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm2
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm1, %zmm1
+; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,8,1,9,6,14,7,15]
+; AVX512F-NEXT:    vpermi2q %zmm1, %zmm2, %zmm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: concat_trunc_packsswb_512:
@@ -211,11 +208,10 @@ define <64 x i8> @concat_trunc_packsswb_512(<32 x i16> %a0, <32 x i16> %a1) nounwind {
 ; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
 ; AVX512BW-NEXT:    vpmovwb %zmm1, %ymm1
 ; AVX512BW-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm2 = [2,6,3,7]
-; AVX512BW-NEXT:    vpermi2q %ymm1, %ymm0, %ymm2
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,4,1,5]
-; AVX512BW-NEXT:    vpermi2q %ymm1, %ymm0, %ymm3
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm0
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm1, %zmm1
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm2
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,8,1,9,6,14,7,15]
+; AVX512BW-NEXT:    vpermi2q %zmm1, %zmm2, %zmm0
 ; AVX512BW-NEXT:    retq
   %1 = ashr <32 x i16> %a0, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
   %2 = and <32 x i16> %a1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
@@ -228,26 +224,25 @@ define <64 x i8> @concat_trunc_packsswb_512(<32 x i16> %a0, <32 x i16> %a1) nounwind {
 define <64 x i8> @concat_trunc_packuswb_512(<32 x i16> %a0, <32 x i16> %a1) nounwind {
 ; AVX512F-LABEL: concat_trunc_packuswb_512:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
-; AVX512F-NEXT:    vpsrlw $15, %ymm2, %ymm2
+; AVX512F-NEXT:    vpsrlw $15, %ymm0, %ymm2
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
 ; AVX512F-NEXT:    vpsrlw $15, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
 ; AVX512F-NEXT:    vpmovdb %zmm2, %xmm2
-; AVX512F-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
-; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm2 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
-; AVX512F-NEXT:    vpmovdb %zmm2, %xmm2
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512F-NEXT:    vpmovdb %zmm3, %xmm3
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
 ; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
 ; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
-; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm3, %ymm1
 ; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [2,6,3,7]
-; AVX512F-NEXT:    vpermi2q %ymm1, %ymm0, %ymm2
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,4,1,5]
-; AVX512F-NEXT:    vpermi2q %ymm1, %ymm0, %ymm3
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm0
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm2
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm1, %zmm1
+; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,8,1,9,6,14,7,15]
+; AVX512F-NEXT:    vpermi2q %zmm1, %zmm2, %zmm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: concat_trunc_packuswb_512:
@@ -256,11 +251,10 @@ define <64 x i8> @concat_trunc_packuswb_512(<32 x i16> %a0, <32 x i16> %a1) nounwind {
 ; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
 ; AVX512BW-NEXT:    vpmovwb %zmm1, %ymm1
 ; AVX512BW-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm2 = [2,6,3,7]
-; AVX512BW-NEXT:    vpermi2q %ymm1, %ymm0, %ymm2
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,4,1,5]
-; AVX512BW-NEXT:    vpermi2q %ymm1, %ymm0, %ymm3
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm0
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm1, %zmm1
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm2
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,8,1,9,6,14,7,15]
+; AVX512BW-NEXT:    vpermi2q %zmm1, %zmm2, %zmm0
 ; AVX512BW-NEXT:    retq
   %1 = lshr <32 x i16> %a0, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
   %2 = and <32 x i16> %a1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>