[X86][AVX2] Vectorized i16 shift operators

Part of D9474. This patch extends AVX2 v16i16 shift operands to 2 x v8i32 vectors and uses variable i32 shifts before packing the results back to i16. Adds AVX2 tests for v8i16 and v16i16.

llvm-svn: 238149
commit 9bb9f0f392
parent 4f9ed639c3
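The transform described above is easy to check in scalar form: widen each 16-bit lane to 32 bits (zero-extend for logical shifts, sign-extend for arithmetic), shift in the wider type, then truncate. Below is a minimal C++ sketch of that equivalence; masking the amount to [0,15] is a modeling assumption, standing in for the rule that oversized shift amounts are undefined for the IR shifts.

// Scalar model of the widen-shift-truncate lowering, one lane at a time.
#include <cassert>
#include <cstdint>

// shl: zero-extension is enough (matches the ISD::ZERO_EXTEND choice).
static uint16_t shl16_via_i32(uint16_t r, unsigned amt) {
  return static_cast<uint16_t>(static_cast<uint32_t>(r) << (amt & 15));
}

// lshr: zero-extension keeps the vacated high bits zero.
static uint16_t lshr16_via_i32(uint16_t r, unsigned amt) {
  return static_cast<uint16_t>(static_cast<uint32_t>(r) >> (amt & 15));
}

// ashr: sign-extension replicates bit 15 (matches the ISD::SIGN_EXTEND choice).
static uint16_t ashr16_via_i32(uint16_t r, unsigned amt) {
  int32_t wide = static_cast<int16_t>(r); // sign-extend to i32
  return static_cast<uint16_t>(wide >> (amt & 15));
}

int main() {
  assert(shl16_via_i32(0x00ff, 4) == 0x0ff0);
  assert(lshr16_via_i32(0x8000, 15) == 0x0001);
  assert(ashr16_via_i32(0x8000, 15) == 0xffff);
  return 0;
}

AVX2's vpsllvd/vpsrlvd/vpsravd perform exactly this per-lane variable shift on eight i32 lanes at once, which is why the v8i16 and v16i16 cases below become short extend/shift/pack sequences.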
@@ -16537,6 +16537,10 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
+    MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
+
     if (Op.getOpcode() == ISD::SHL) {
       // Simple i8 add case
       if (ShiftAmt == 1)
         return DAG.getNode(ISD::ADD, dl, VT, R, R);
 
       // Make a large shift.
       SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT,
                                                R, ShiftAmt, DAG);
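The ShiftAmt == 1 special case above relies on the identity x << 1 == x + x (mod 2^N), which lets an i8 shift-left-by-one be emitted as a vector add even though x86 has no 8-bit vector shift instruction. A one-loop C++ check of that identity:

#include <cassert>
#include <cstdint>

int main() {
  // x << 1 and x + x agree for every 8-bit value, including overflow cases.
  for (unsigned v = 0; v < 256; ++v) {
    uint8_t x = static_cast<uint8_t>(v);
    assert(static_cast<uint8_t>(x << 1) == static_cast<uint8_t>(x + x));
  }
  return 0;
}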
@@ -16881,13 +16885,31 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
   // the extra overheads to get from v16i8 to v8i32 make the existing SSE
   // solution better.
   if (Subtarget->hasInt256() && VT == MVT::v8i16) {
-    MVT NewVT = VT == MVT::v8i16 ? MVT::v8i32 : MVT::v16i16;
+    MVT ExtVT = MVT::v8i32;
     unsigned ExtOpc =
         Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
-    R = DAG.getNode(ExtOpc, dl, NewVT, R);
-    Amt = DAG.getNode(ISD::ANY_EXTEND, dl, NewVT, Amt);
+    R = DAG.getNode(ExtOpc, dl, ExtVT, R);
+    Amt = DAG.getNode(ISD::ANY_EXTEND, dl, ExtVT, Amt);
     return DAG.getNode(ISD::TRUNCATE, dl, VT,
-                       DAG.getNode(Op.getOpcode(), dl, NewVT, R, Amt));
+                       DAG.getNode(Op.getOpcode(), dl, ExtVT, R, Amt));
   }
 
+  if (Subtarget->hasInt256() && VT == MVT::v16i16) {
+    MVT ExtVT = MVT::v8i32;
+    SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
+    SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Amt, Z);
+    SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Amt, Z);
+    SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, R, R);
+    SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, R, R);
+    ALo = DAG.getNode(ISD::BITCAST, dl, ExtVT, ALo);
+    AHi = DAG.getNode(ISD::BITCAST, dl, ExtVT, AHi);
+    RLo = DAG.getNode(ISD::BITCAST, dl, ExtVT, RLo);
+    RHi = DAG.getNode(ISD::BITCAST, dl, ExtVT, RHi);
+    SDValue Lo = DAG.getNode(Op.getOpcode(), dl, ExtVT, RLo, ALo);
+    SDValue Hi = DAG.getNode(Op.getOpcode(), dl, ExtVT, RHi, AHi);
+    Lo = DAG.getNode(ISD::SRL, dl, ExtVT, Lo, DAG.getConstant(16, dl, ExtVT));
+    Hi = DAG.getNode(ISD::SRL, dl, ExtVT, Hi, DAG.getConstant(16, dl, ExtVT));
+    return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
+  }
+
   // Decompose 256-bit shifts into smaller 128-bit shifts.
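The v16i16 block avoids a cross-lane extend entirely: vpunpcklwd/vpunpckhwd pair each 16-bit element with a second word inside one 32-bit lane, the variable i32 shift runs with the element in the lane's upper half, and vpsrld $16 moves the result back down before vpackusdw re-packs the lanes. A scalar C++ sketch of one lane, shown for the right-shift cases; the amount mask is again a modeling assumption:

#include <cassert>
#include <cstdint>

// One 32-bit lane after UNPCKL(R, R): the element r sits in both halves.
static uint32_t unpack_rr(uint16_t r) {
  return (static_cast<uint32_t>(r) << 16) | r;
}

// vpsrlvd on the lane, then the vpsrld $16 fixup.
static uint16_t lshr_lane(uint16_t r, unsigned amt) {
  uint32_t lane = unpack_rr(r) >> (amt & 15);
  return static_cast<uint16_t>(lane >> 16);
}

// vpsravd: the copy of r in the upper half puts its sign bit at bit 31,
// so the i32 arithmetic shift replicates the correct sign.
static uint16_t ashr_lane(uint16_t r, unsigned amt) {
  int32_t lane = static_cast<int32_t>(unpack_rr(r)) >> (amt & 15);
  return static_cast<uint16_t>(static_cast<uint32_t>(lane) >> 16);
}

int main() {
  for (unsigned r = 0; r < 0x10000; r += 257) {
    for (unsigned amt = 0; amt < 16; ++amt) {
      uint16_t x = static_cast<uint16_t>(r);
      assert(lshr_lane(x, amt) == static_cast<uint16_t>(x >> amt));
      assert(ashr_lane(x, amt) ==
             static_cast<uint16_t>(static_cast<int16_t>(x) >> amt));
    }
  }
  return 0;
}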
@@ -20859,7 +20881,7 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
   if (!InVec.hasOneUse())
     return SDValue();
   EVT BCVT = InVec.getOperand(0).getValueType();
   if (!BCVT.isVector() ||
       BCVT.getVectorNumElements() != OriginalVT.getVectorNumElements())
     return SDValue();
   InVec = InVec.getOperand(0);
@@ -20991,7 +21013,7 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
   }
 
   EVT VT = N->getValueType(0);
 
   if (VT == MVT::i1 && dyn_cast<ConstantSDNode>(N->getOperand(1)) &&
       InputVector.getOpcode() == ISD::BITCAST &&
       dyn_cast<ConstantSDNode>(InputVector.getOperand(0))) {
@@ -153,15 +153,15 @@ unsigned X86TTIImpl::getArithmeticInstrCost(
     { ISD::SHL,  MVT::v4i64,   1 },
     { ISD::SRL,  MVT::v4i64,   1 },
 
     { ISD::SHL,  MVT::v32i8,   42 },    // cmpeqb sequence.
-    { ISD::SHL,  MVT::v16i16,  16*10 }, // Scalarized.
+    { ISD::SHL,  MVT::v16i16,  10 },    // extend/vpsllvd/pack sequence.
 
     { ISD::SRL,  MVT::v32i8,   32*10 }, // Scalarized.
-    { ISD::SRL,  MVT::v16i16,  8*10 },  // Scalarized.
+    { ISD::SRL,  MVT::v16i16,  10 },    // extend/vpsrlvd/pack sequence.
 
     { ISD::SRA,  MVT::v32i8,   32*10 }, // Scalarized.
-    { ISD::SRA,  MVT::v16i16,  16*10 }, // Scalarized.
+    { ISD::SRA,  MVT::v16i16,  10 },    // extend/vpsravd/pack sequence.
     { ISD::SRA,  MVT::v4i64,   4*10 },  // Scalarized.
 
     // Vectorizing division is a bad idea. See the SSE2 table for more comments.
     { ISD::SDIV, MVT::v32i8,   32*20 },
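The cost-model change mirrors the new lowering: a v16i16 variable shift drops from a scalarized estimate (one extract/shift/insert per element, hence 16*10) to a flat 10 covering the short extend/shift/pack sequence. A simplified C++ stand-in for how such a table is consulted; the entry struct and lookup below are illustrative, not LLVM's actual CostTblEntry/CostTableLookup API:

#include <cstddef>

// Hypothetical mirror of one cost-table row: (opcode, type, cost).
struct CostEntry {
  int Opcode;     // stand-in for ISD::SHL / ISD::SRL / ISD::SRA
  int Type;       // stand-in for MVT::v16i16 and friends
  unsigned Cost;  // relative throughput estimate in "simple op" units
};

// Linear scan, first match wins; entries are ordered most-specific first.
static const CostEntry *lookupCost(const CostEntry *Table, size_t N,
                                   int Opcode, int Type) {
  for (size_t i = 0; i != N; ++i)
    if (Table[i].Opcode == Opcode && Table[i].Type == Type)
      return &Table[i];
  return nullptr; // caller falls back to a generic estimate
}

Roughly speaking, getArithmeticInstrCost legalizes the type first, then returns the matching table cost scaled by the legalization factor; a miss falls through to the base implementation.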
@@ -266,3 +266,94 @@ define <4 x i32> @srl_trunc_and_v4i64(<4 x i32> %x, <4 x i64> %y) nounwind {
   %sra = lshr <4 x i32> %x, %trunc
   ret <4 x i32> %sra
 }
+
+;
+; Vectorized word shifts
+;
+
+define <8 x i16> @shl_8i16(<8 x i16> %r, <8 x i16> %a) nounwind {
+; CHECK-LABEL: shl_8i16
+; CHECK: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; CHECK-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; CHECK-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; CHECK: retq
+  %shl = shl <8 x i16> %r, %a
+  ret <8 x i16> %shl
+}
+
+define <16 x i16> @shl_16i16(<16 x i16> %r, <16 x i16> %a) nounwind {
+; CHECK-LABEL: shl_16i16
+; CHECK: vpxor %ymm2, %ymm2, %ymm2
+; CHECK-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
+; CHECK-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
+; CHECK-NEXT: vpsllvd %ymm3, %ymm4, %ymm3
+; CHECK-NEXT: vpsrld $16, %ymm3, %ymm3
+; CHECK-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
+; CHECK-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; CHECK-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vpsrld $16, %ymm0, %ymm0
+; CHECK-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
+; CHECK-NEXT: retq
+  %shl = shl <16 x i16> %r, %a
+  ret <16 x i16> %shl
+}
+
+define <8 x i16> @ashr_8i16(<8 x i16> %r, <8 x i16> %a) nounwind {
+; CHECK-LABEL: ashr_8i16
+; CHECK: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; CHECK-NEXT: vpmovsxwd %xmm0, %ymm0
+; CHECK-NEXT: vpsravd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; CHECK: retq
+  %ashr = ashr <8 x i16> %r, %a
+  ret <8 x i16> %ashr
+}
+
+define <16 x i16> @ashr_16i16(<16 x i16> %r, <16 x i16> %a) nounwind {
+; CHECK-LABEL: ashr_16i16
+; CHECK: vpxor %ymm2, %ymm2, %ymm2
+; CHECK-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
+; CHECK-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
+; CHECK-NEXT: vpsravd %ymm3, %ymm4, %ymm3
+; CHECK-NEXT: vpsrld $16, %ymm3, %ymm3
+; CHECK-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
+; CHECK-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; CHECK-NEXT: vpsravd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vpsrld $16, %ymm0, %ymm0
+; CHECK-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
+; CHECK-NEXT: retq
+  %ashr = ashr <16 x i16> %r, %a
+  ret <16 x i16> %ashr
+}
+
+define <8 x i16> @lshr_8i16(<8 x i16> %r, <8 x i16> %a) nounwind {
+; CHECK-LABEL: lshr_8i16
+; CHECK: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; CHECK-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; CHECK-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; CHECK: retq
+  %lshr = lshr <8 x i16> %r, %a
+  ret <8 x i16> %lshr
+}
+
+define <16 x i16> @lshr_16i16(<16 x i16> %r, <16 x i16> %a) nounwind {
+; CHECK-LABEL: lshr_16i16
+; CHECK: vpxor %ymm2, %ymm2, %ymm2
+; CHECK-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
+; CHECK-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
+; CHECK-NEXT: vpsrlvd %ymm3, %ymm4, %ymm3
+; CHECK-NEXT: vpsrld $16, %ymm3, %ymm3
+; CHECK-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
+; CHECK-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; CHECK-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vpsrld $16, %ymm0, %ymm0
+; CHECK-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
+; CHECK-NEXT: retq
+  %lshr = lshr <16 x i16> %r, %a
+  ret <16 x i16> %lshr
+}