mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-11-23 03:02:36 +01:00
[InstCombine][SSE] Add DemandedElts support for PACKSS/PACKUS instructions
Simplify a packss/packus truncation based on the elements of the mask that are actually demanded.
Differential Revision: https://reviews.llvm.org/D28777
llvm-svn: 292591
This commit is contained in:
parent
0e35152dbe
commit
7f59f5cfb4
@ -1472,6 +1472,60 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
|
||||
break;
|
||||
}
|
||||
|
||||
case Intrinsic::x86_sse2_packssdw_128:
|
||||
case Intrinsic::x86_sse2_packsswb_128:
|
||||
case Intrinsic::x86_sse2_packuswb_128:
|
||||
case Intrinsic::x86_sse41_packusdw:
|
||||
case Intrinsic::x86_avx2_packssdw:
|
||||
case Intrinsic::x86_avx2_packsswb:
|
||||
case Intrinsic::x86_avx2_packusdw:
|
||||
case Intrinsic::x86_avx2_packuswb: {
|
||||
// TODO: Add support for Intrinsic::x86_avx512_mask_pack*.
|
||||
auto *Ty0 = II->getArgOperand(0)->getType();
|
||||
unsigned InnerVWidth = Ty0->getVectorNumElements();
|
||||
assert(VWidth == (InnerVWidth * 2) && "Unexpected input size");
|
||||
|
||||
unsigned NumLanes = Ty0->getPrimitiveSizeInBits() / 128;
|
||||
unsigned VWidthPerLane = VWidth / NumLanes;
|
||||
unsigned InnerVWidthPerLane = InnerVWidth / NumLanes;
|
||||
|
||||
// Per lane, pack the elements of the first input and then the second.
|
||||
// e.g.
|
||||
// v8i16 PACK(v4i32 X, v4i32 Y) - (X[0..3],Y[0..3])
|
||||
// v32i8 PACK(v16i16 X, v16i16 Y) - (X[0..7],Y[0..7]),(X[8..15],Y[8..15])
|
||||
for (int OpNum = 0; OpNum != 2; ++OpNum) {
|
||||
APInt OpDemandedElts(InnerVWidth, 0);
|
||||
for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
|
||||
unsigned LaneIdx = Lane * VWidthPerLane;
|
||||
for (unsigned Elt = 0; Elt != InnerVWidthPerLane; ++Elt) {
|
||||
unsigned Idx = LaneIdx + Elt + InnerVWidthPerLane * OpNum;
|
||||
if (DemandedElts[Idx])
|
||||
OpDemandedElts.setBit((Lane * InnerVWidthPerLane) + Elt);
|
||||
}
|
||||
}
|
||||
|
||||
// Demand elements from the operand.
|
||||
auto *Op = II->getArgOperand(OpNum);
|
||||
APInt OpUndefElts(InnerVWidth, 0);
|
||||
TmpV = SimplifyDemandedVectorElts(Op, OpDemandedElts, OpUndefElts,
|
||||
Depth + 1);
|
||||
if (TmpV) {
|
||||
II->setArgOperand(OpNum, TmpV);
|
||||
MadeChange = true;
|
||||
}
|
||||
|
||||
// Pack the operand's UNDEF elements, one lane at a time.
|
||||
OpUndefElts = OpUndefElts.zext(VWidth);
|
||||
for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
|
||||
APInt LaneElts = OpUndefElts.lshr(InnerVWidthPerLane * Lane);
|
||||
LaneElts = LaneElts.getLoBits(InnerVWidthPerLane);
|
||||
LaneElts = LaneElts.shl(InnerVWidthPerLane * (2 * Lane + OpNum));
|
||||
UndefElts |= LaneElts;
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
// PSHUFB
|
||||
case Intrinsic::x86_ssse3_pshuf_b_128:
|
||||
case Intrinsic::x86_avx2_pshuf_b:
|
||||
|
@ -7,11 +7,9 @@
|
||||
|
||||
define <8 x i16> @elts_packssdw_128(<4 x i32> %a0, <4 x i32> %a1) {
|
||||
; CHECK-LABEL: @elts_packssdw_128(
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32> <i32 3, i32 1, i32 undef, i32 undef>
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> %a1, <4 x i32> undef, <4 x i32> <i32 undef, i32 2, i32 1, i32 undef>
|
||||
; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]])
|
||||
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 7, i32 7, i32 7, i32 7>
|
||||
; CHECK-NEXT: ret <8 x i16> [[TMP4]]
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a0, <4 x i32> undef)
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 undef, i32 undef, i32 undef, i32 undef>
|
||||
; CHECK-NEXT: ret <8 x i16> [[TMP2]]
|
||||
;
|
||||
%1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32> <i32 3, i32 1, i32 undef, i32 undef>
|
||||
%2 = shufflevector <4 x i32> %a1, <4 x i32> undef, <4 x i32> <i32 undef, i32 2, i32 1, i32 undef>
|
||||
@ -22,10 +20,8 @@ define <8 x i16> @elts_packssdw_128(<4 x i32> %a0, <4 x i32> %a1) {
|
||||
|
||||
define <8 x i16> @elts_packusdw_128(<4 x i32> %a0, <4 x i32> %a1) {
|
||||
; CHECK-LABEL: @elts_packusdw_128(
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> %a0, i32 0, i32 0
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> %a1, i32 0, i32 3
|
||||
; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]])
|
||||
; CHECK-NEXT: ret <8 x i16> [[TMP3]]
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a0, <4 x i32> %a1)
|
||||
; CHECK-NEXT: ret <8 x i16> [[TMP1]]
|
||||
;
|
||||
%1 = insertelement <4 x i32> %a0, i32 0, i32 0
|
||||
%2 = insertelement <4 x i32> %a1, i32 0, i32 3
|
||||
@ -36,11 +32,9 @@ define <8 x i16> @elts_packusdw_128(<4 x i32> %a0, <4 x i32> %a1) {
|
||||
|
||||
define <16 x i8> @elts_packsswb_128(<8 x i16> %a0, <8 x i16> %a1) {
|
||||
; CHECK-LABEL: @elts_packsswb_128(
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i16> %a0, i16 0, i32 0
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i16> %a1, i16 0, i32 0
|
||||
; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]])
|
||||
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x i8> [[TMP3]], <16 x i8> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
|
||||
; CHECK-NEXT: ret <16 x i8> [[TMP4]]
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> <i16 0, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef>, <8 x i16> <i16 0, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef>)
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
|
||||
; CHECK-NEXT: ret <16 x i8> [[TMP2]]
|
||||
;
|
||||
%1 = insertelement <8 x i16> %a0, i16 0, i32 0
|
||||
%2 = insertelement <8 x i16> %a1, i16 0, i32 0
|
||||
@ -51,9 +45,7 @@ define <16 x i8> @elts_packsswb_128(<8 x i16> %a0, <8 x i16> %a1) {
|
||||
|
||||
define <16 x i8> @elts_packuswb_128(<8 x i16> %a0, <8 x i16> %a1) {
|
||||
; CHECK-LABEL: @elts_packuswb_128(
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> <i16 0, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef>, <8 x i16> <i16 0, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef>)
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> undef, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
|
||||
; CHECK-NEXT: ret <16 x i8> [[TMP2]]
|
||||
; CHECK-NEXT: ret <16 x i8> undef
|
||||
;
|
||||
%1 = insertelement <8 x i16> undef, i16 0, i32 0
|
||||
%2 = insertelement <8 x i16> undef, i16 0, i32 0
|
||||
@ -64,10 +56,8 @@ define <16 x i8> @elts_packuswb_128(<8 x i16> %a0, <8 x i16> %a1) {
|
||||
|
||||
define <16 x i16> @elts_packssdw_256(<8 x i32> %a0, <8 x i32> %a1) {
|
||||
; CHECK-LABEL: @elts_packssdw_256(
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> %a1, <8 x i32> undef, <8 x i32> <i32 undef, i32 2, i32 1, i32 undef, i32 undef, i32 6, i32 5, i32 undef>
|
||||
; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]])
|
||||
; CHECK-NEXT: ret <16 x i16> [[TMP3]]
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a0, <8 x i32> undef)
|
||||
; CHECK-NEXT: ret <16 x i16> [[TMP1]]
|
||||
;
|
||||
%1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
%2 = shufflevector <8 x i32> %a1, <8 x i32> undef, <8 x i32> <i32 undef, i32 2, i32 1, i32 undef, i32 undef, i32 6, i32 5, i32 undef>
|
||||
@ -79,7 +69,7 @@ define <16 x i16> @elts_packssdw_256(<8 x i32> %a0, <8 x i32> %a1) {
|
||||
define <16 x i16> @elts_packusdw_256(<8 x i32> %a0, <8 x i32> %a1) {
|
||||
; CHECK-LABEL: @elts_packusdw_256(
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> %a1, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a0, <8 x i32> [[TMP1]])
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> undef, <8 x i32> [[TMP1]])
|
||||
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i16> [[TMP2]], <16 x i16> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef>
|
||||
; CHECK-NEXT: ret <16 x i16> [[TMP3]]
|
||||
;
|
||||
@ -92,11 +82,9 @@ define <16 x i16> @elts_packusdw_256(<8 x i32> %a0, <8 x i32> %a1) {
|
||||
|
||||
define <32 x i8> @elts_packsswb_256(<16 x i16> %a0, <16 x i16> %a1) {
|
||||
; CHECK-LABEL: @elts_packsswb_256(
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x i16> %a0, i16 0, i32 0
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <16 x i16> %a1, i16 0, i32 8
|
||||
; CHECK-NEXT: [[TMP3:%.*]] = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]])
|
||||
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <32 x i8> [[TMP3]], <32 x i8> undef, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24>
|
||||
; CHECK-NEXT: ret <32 x i8> [[TMP4]]
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> <i16 0, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef>, <16 x i16> <i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 0, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef>)
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <32 x i8> [[TMP1]], <32 x i8> undef, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24>
|
||||
; CHECK-NEXT: ret <32 x i8> [[TMP2]]
|
||||
;
|
||||
%1 = insertelement <16 x i16> %a0, i16 0, i32 0
|
||||
%2 = insertelement <16 x i16> %a1, i16 0, i32 8
|
||||
@ -107,9 +95,7 @@ define <32 x i8> @elts_packsswb_256(<16 x i16> %a0, <16 x i16> %a1) {
|
||||
|
||||
define <32 x i8> @elts_packuswb_256(<16 x i16> %a0, <16 x i16> %a1) {
|
||||
; CHECK-LABEL: @elts_packuswb_256(
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> <i16 undef, i16 0, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef>, <16 x i16> <i16 0, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef>)
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <32 x i8> [[TMP1]], <32 x i8> undef, <32 x i32> zeroinitializer
|
||||
; CHECK-NEXT: ret <32 x i8> [[TMP2]]
|
||||
; CHECK-NEXT: ret <32 x i8> undef
|
||||
;
|
||||
%1 = insertelement <16 x i16> undef, i16 0, i32 1
|
||||
%2 = insertelement <16 x i16> undef, i16 0, i32 0
|
||||
|
Loading…
Reference in New Issue
Block a user