1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-11-23 03:02:36 +01:00

[InstCombine][SSE] Add DemandedElts support for PACKSS/PACKUS instructions

Simplify a packss/packus truncation based on the elements of the mask that are actually demanded.

Differential Revision: https://reviews.llvm.org/D28777

llvm-svn: 292591
This commit is contained in:
Simon Pilgrim 2017-01-20 09:28:21 +00:00
parent 0e35152dbe
commit 7f59f5cfb4
2 changed files with 70 additions and 30 deletions

View File

@ -1472,6 +1472,60 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
break;
}
// PACKSS/PACKUS (SSE2 / SSE4.1 / AVX2 variants).
// Translates the demanded elements of the pack result into demanded elements
// of each of the two source operands, simplifies each operand accordingly,
// and then maps each operand's undef elements back to result positions.
// NOTE(review): II, VWidth, TmpV, MadeChange, UndefElts and Depth are not
// declared in this block; they are presumably provided by the enclosing
// SimplifyDemandedVectorElts function - confirm against the full file.
case Intrinsic::x86_sse2_packssdw_128:
case Intrinsic::x86_sse2_packsswb_128:
case Intrinsic::x86_sse2_packuswb_128:
case Intrinsic::x86_sse41_packusdw:
case Intrinsic::x86_avx2_packssdw:
case Intrinsic::x86_avx2_packsswb:
case Intrinsic::x86_avx2_packusdw:
case Intrinsic::x86_avx2_packuswb: {
// TODO: Add support for Intrinsic::x86_avx512_mask_pack*
// The element count of operand 0 is taken as the per-operand width; the
// assert below requires the result (VWidth) to have exactly twice as many.
auto *Ty0 = II->getArgOperand(0)->getType();
unsigned InnerVWidth = Ty0->getVectorNumElements();
assert(VWidth == (InnerVWidth * 2) && "Unexpected input size");
// Number of 128-bit lanes in an operand, and the element counts per lane
// for the result (VWidthPerLane) and for one operand (InnerVWidthPerLane).
unsigned NumLanes = Ty0->getPrimitiveSizeInBits() / 128;
unsigned VWidthPerLane = VWidth / NumLanes;
unsigned InnerVWidthPerLane = InnerVWidth / NumLanes;
// Per lane, pack the elements of the first input and then the second.
// e.g.
// v8i16 PACK(v4i32 X, v4i32 Y) - (X[0..3],Y[0..3])
// v32i8 PACK(v16i16 X, v16i16 Y) - (X[0..7],Y[0..7]),(X[8..15],Y[8..15])
// Process operand 0 and then operand 1.
for (int OpNum = 0; OpNum != 2; ++OpNum) {
// Build this operand's demanded-element mask from the result's mask.
APInt OpDemandedElts(InnerVWidth, 0);
for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
// Index of the first result element belonging to this lane.
unsigned LaneIdx = Lane * VWidthPerLane;
for (unsigned Elt = 0; Elt != InnerVWidthPerLane; ++Elt) {
// Within a result lane, operand 0 supplies the first InnerVWidthPerLane
// elements and operand 1 the following InnerVWidthPerLane elements.
unsigned Idx = LaneIdx + Elt + InnerVWidthPerLane * OpNum;
// Result element Idx comes from operand element
// (Lane * InnerVWidthPerLane + Elt).
if (DemandedElts[Idx])
OpDemandedElts.setBit((Lane * InnerVWidthPerLane) + Elt);
}
}
// Demand elements from the operand.
auto *Op = II->getArgOperand(OpNum);
APInt OpUndefElts(InnerVWidth, 0);
TmpV = SimplifyDemandedVectorElts(Op, OpDemandedElts, OpUndefElts,
Depth + 1);
// A non-null result replaces the operand and records that a change was made.
if (TmpV) {
II->setArgOperand(OpNum, TmpV);
MadeChange = true;
}
// Pack the operand's UNDEF elements, one lane at a time.
// Widen to the result width first so the per-lane shifts below stay in range.
OpUndefElts = OpUndefElts.zext(VWidth);
for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
// Isolate the InnerVWidthPerLane undef bits belonging to this operand lane.
APInt LaneElts = OpUndefElts.lshr(InnerVWidthPerLane * Lane);
LaneElts = LaneElts.getLoBits(InnerVWidthPerLane);
// Move them to this operand's half of the matching result lane: each result
// lane holds 2 * InnerVWidthPerLane elements, and OpNum selects the half.
LaneElts = LaneElts.shl(InnerVWidthPerLane * (2 * Lane + OpNum));
UndefElts |= LaneElts;
}
}
break;
}
// PSHUFB
case Intrinsic::x86_ssse3_pshuf_b_128:
case Intrinsic::x86_avx2_pshuf_b:

View File

@ -7,11 +7,9 @@
define <8 x i16> @elts_packssdw_128(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: @elts_packssdw_128(
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32> <i32 3, i32 1, i32 undef, i32 undef>
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> %a1, <4 x i32> undef, <4 x i32> <i32 undef, i32 2, i32 1, i32 undef>
; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]])
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 7, i32 7, i32 7, i32 7>
; CHECK-NEXT: ret <8 x i16> [[TMP4]]
; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a0, <4 x i32> undef)
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: ret <8 x i16> [[TMP2]]
;
%1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32> <i32 3, i32 1, i32 undef, i32 undef>
%2 = shufflevector <4 x i32> %a1, <4 x i32> undef, <4 x i32> <i32 undef, i32 2, i32 1, i32 undef>
@ -22,10 +20,8 @@ define <8 x i16> @elts_packssdw_128(<4 x i32> %a0, <4 x i32> %a1) {
define <8 x i16> @elts_packusdw_128(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: @elts_packusdw_128(
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> %a0, i32 0, i32 0
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> %a1, i32 0, i32 3
; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]])
; CHECK-NEXT: ret <8 x i16> [[TMP3]]
; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a0, <4 x i32> %a1)
; CHECK-NEXT: ret <8 x i16> [[TMP1]]
;
%1 = insertelement <4 x i32> %a0, i32 0, i32 0
%2 = insertelement <4 x i32> %a1, i32 0, i32 3
@ -36,11 +32,9 @@ define <8 x i16> @elts_packusdw_128(<4 x i32> %a0, <4 x i32> %a1) {
define <16 x i8> @elts_packsswb_128(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: @elts_packsswb_128(
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i16> %a0, i16 0, i32 0
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i16> %a1, i16 0, i32 0
; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]])
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x i8> [[TMP3]], <16 x i8> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
; CHECK-NEXT: ret <16 x i8> [[TMP4]]
; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> <i16 0, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef>, <8 x i16> <i16 0, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef>)
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
; CHECK-NEXT: ret <16 x i8> [[TMP2]]
;
%1 = insertelement <8 x i16> %a0, i16 0, i32 0
%2 = insertelement <8 x i16> %a1, i16 0, i32 0
@ -51,9 +45,7 @@ define <16 x i8> @elts_packsswb_128(<8 x i16> %a0, <8 x i16> %a1) {
define <16 x i8> @elts_packuswb_128(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: @elts_packuswb_128(
; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> <i16 0, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef>, <8 x i16> <i16 0, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef>)
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> undef, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
; CHECK-NEXT: ret <16 x i8> [[TMP2]]
; CHECK-NEXT: ret <16 x i8> undef
;
%1 = insertelement <8 x i16> undef, i16 0, i32 0
%2 = insertelement <8 x i16> undef, i16 0, i32 0
@ -64,10 +56,8 @@ define <16 x i8> @elts_packuswb_128(<8 x i16> %a0, <8 x i16> %a1) {
define <16 x i16> @elts_packssdw_256(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: @elts_packssdw_256(
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> %a1, <8 x i32> undef, <8 x i32> <i32 undef, i32 2, i32 1, i32 undef, i32 undef, i32 6, i32 5, i32 undef>
; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]])
; CHECK-NEXT: ret <16 x i16> [[TMP3]]
; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a0, <8 x i32> undef)
; CHECK-NEXT: ret <16 x i16> [[TMP1]]
;
%1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%2 = shufflevector <8 x i32> %a1, <8 x i32> undef, <8 x i32> <i32 undef, i32 2, i32 1, i32 undef, i32 undef, i32 6, i32 5, i32 undef>
@ -79,7 +69,7 @@ define <16 x i16> @elts_packssdw_256(<8 x i32> %a0, <8 x i32> %a1) {
define <16 x i16> @elts_packusdw_256(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: @elts_packusdw_256(
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> %a1, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a0, <8 x i32> [[TMP1]])
; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> undef, <8 x i32> [[TMP1]])
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i16> [[TMP2]], <16 x i16> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: ret <16 x i16> [[TMP3]]
;
@ -92,11 +82,9 @@ define <16 x i16> @elts_packusdw_256(<8 x i32> %a0, <8 x i32> %a1) {
define <32 x i8> @elts_packsswb_256(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: @elts_packsswb_256(
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x i16> %a0, i16 0, i32 0
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <16 x i16> %a1, i16 0, i32 8
; CHECK-NEXT: [[TMP3:%.*]] = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]])
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <32 x i8> [[TMP3]], <32 x i8> undef, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24>
; CHECK-NEXT: ret <32 x i8> [[TMP4]]
; CHECK-NEXT: [[TMP1:%.*]] = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> <i16 0, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef>, <16 x i16> <i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 0, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef>)
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <32 x i8> [[TMP1]], <32 x i8> undef, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24>
; CHECK-NEXT: ret <32 x i8> [[TMP2]]
;
%1 = insertelement <16 x i16> %a0, i16 0, i32 0
%2 = insertelement <16 x i16> %a1, i16 0, i32 8
@ -107,9 +95,7 @@ define <32 x i8> @elts_packsswb_256(<16 x i16> %a0, <16 x i16> %a1) {
define <32 x i8> @elts_packuswb_256(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: @elts_packuswb_256(
; CHECK-NEXT: [[TMP1:%.*]] = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> <i16 undef, i16 0, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef>, <16 x i16> <i16 0, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef>)
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <32 x i8> [[TMP1]], <32 x i8> undef, <32 x i32> zeroinitializer
; CHECK-NEXT: ret <32 x i8> [[TMP2]]
; CHECK-NEXT: ret <32 x i8> undef
;
%1 = insertelement <16 x i16> undef, i16 0, i32 1
%2 = insertelement <16 x i16> undef, i16 0, i32 0