From 9bb9f0f392a634bbfaa8f047d311bd2fee8e8a6e Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Mon, 25 May 2015 17:49:13 +0000
Subject: [PATCH] [X86][AVX2] Vectorized i16 shift operators

Part of D9474, this patch extends AVX2 v16i16 types to 2 x v8i32 vectors and
uses variable i32 shifts before packing back to i16.

Adds AVX2 tests for v8i16 and v16i16.

llvm-svn: 238149
---
 lib/Target/X86/X86ISelLowering.cpp        | 34 +++++++--
 lib/Target/X86/X86TargetTransformInfo.cpp | 14 ++--
 test/CodeGen/X86/avx2-vector-shifts.ll    | 91 +++++++++++++++++++++++
 3 files changed, 126 insertions(+), 13 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index d04d455a0e4..ed9e1016e70 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -16537,6 +16537,10 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
     MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
 
     if (Op.getOpcode() == ISD::SHL) {
+      // Simple i8 add case
+      if (ShiftAmt == 1)
+        return DAG.getNode(ISD::ADD, dl, VT, R, R);
+
       // Make a large shift.
       SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT,
                                                R, ShiftAmt, DAG);
@@ -16881,13 +16885,31 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
   // the extra overheads to get from v16i8 to v8i32 make the existing SSE
   // solution better.
   if (Subtarget->hasInt256() && VT == MVT::v8i16) {
-    MVT NewVT = VT == MVT::v8i16 ? MVT::v8i32 : MVT::v16i16;
+    MVT ExtVT = MVT::v8i32;
     unsigned ExtOpc =
         Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
-    R = DAG.getNode(ExtOpc, dl, NewVT, R);
-    Amt = DAG.getNode(ISD::ANY_EXTEND, dl, NewVT, Amt);
+    R = DAG.getNode(ExtOpc, dl, ExtVT, R);
+    Amt = DAG.getNode(ISD::ANY_EXTEND, dl, ExtVT, Amt);
     return DAG.getNode(ISD::TRUNCATE, dl, VT,
-                       DAG.getNode(Op.getOpcode(), dl, NewVT, R, Amt));
+                       DAG.getNode(Op.getOpcode(), dl, ExtVT, R, Amt));
+  }
+
+  if (Subtarget->hasInt256() && VT == MVT::v16i16) {
+    MVT ExtVT = MVT::v8i32;
+    SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
+    SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Amt, Z);
+    SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Amt, Z);
+    SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, R, R);
+    SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, R, R);
+    ALo = DAG.getNode(ISD::BITCAST, dl, ExtVT, ALo);
+    AHi = DAG.getNode(ISD::BITCAST, dl, ExtVT, AHi);
+    RLo = DAG.getNode(ISD::BITCAST, dl, ExtVT, RLo);
+    RHi = DAG.getNode(ISD::BITCAST, dl, ExtVT, RHi);
+    SDValue Lo = DAG.getNode(Op.getOpcode(), dl, ExtVT, RLo, ALo);
+    SDValue Hi = DAG.getNode(Op.getOpcode(), dl, ExtVT, RHi, AHi);
+    Lo = DAG.getNode(ISD::SRL, dl, ExtVT, Lo, DAG.getConstant(16, dl, ExtVT));
+    Hi = DAG.getNode(ISD::SRL, dl, ExtVT, Hi, DAG.getConstant(16, dl, ExtVT));
+    return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
   }
 
   // Decompose 256-bit shifts into smaller 128-bit shifts.
@@ -20859,7 +20881,7 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
   if (!InVec.hasOneUse())
     return SDValue();
   EVT BCVT = InVec.getOperand(0).getValueType();
-  if (!BCVT.isVector() || 
+  if (!BCVT.isVector() ||
       BCVT.getVectorNumElements() != OriginalVT.getVectorNumElements())
     return SDValue();
   InVec = InVec.getOperand(0);
@@ -20991,7 +21013,7 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
   }
 
   EVT VT = N->getValueType(0);
-  
+
   if (VT == MVT::i1 && dyn_cast<ConstantSDNode>(N->getOperand(1)) &&
       InputVector.getOpcode() == ISD::BITCAST &&
       dyn_cast<ConstantSDNode>(InputVector.getOperand(0))) {
diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp
index 17c86a7b9f0..bbfeba8b9d8 100644
--- a/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -153,15 +153,15 @@ unsigned X86TTIImpl::getArithmeticInstrCost(
     { ISD::SHL,  MVT::v4i64,    1 },
     { ISD::SRL,  MVT::v4i64,    1 },
 
-    { ISD::SHL,  MVT::v32i8,  42 }, // cmpeqb sequence.
-    { ISD::SHL,  MVT::v16i16,  16*10 }, // Scalarized.
+    { ISD::SHL,  MVT::v32i8,      42 }, // cmpeqb sequence.
+    { ISD::SHL,  MVT::v16i16,     10 }, // extend/vpsrlvd/pack sequence.
 
-    { ISD::SRL,  MVT::v32i8,  32*10 }, // Scalarized.
-    { ISD::SRL,  MVT::v16i16,  8*10 }, // Scalarized.
+    { ISD::SRL,  MVT::v32i8,   32*10 }, // Scalarized.
+    { ISD::SRL,  MVT::v16i16,     10 }, // extend/vpsrlvd/pack sequence.
 
-    { ISD::SRA,  MVT::v32i8,  32*10 }, // Scalarized.
-    { ISD::SRA,  MVT::v16i16,  16*10 }, // Scalarized.
-    { ISD::SRA,  MVT::v4i64,  4*10 }, // Scalarized.
+    { ISD::SRA,  MVT::v32i8,   32*10 }, // Scalarized.
+    { ISD::SRA,  MVT::v16i16,     10 }, // extend/vpsravd/pack sequence.
+    { ISD::SRA,  MVT::v4i64,    4*10 }, // Scalarized.
 
     // Vectorizing division is a bad idea. See the SSE2 table for more comments.
     { ISD::SDIV,  MVT::v32i8,  32*20 },
diff --git a/test/CodeGen/X86/avx2-vector-shifts.ll b/test/CodeGen/X86/avx2-vector-shifts.ll
index e355301dd05..8aae90c3c03 100644
--- a/test/CodeGen/X86/avx2-vector-shifts.ll
+++ b/test/CodeGen/X86/avx2-vector-shifts.ll
@@ -266,3 +266,94 @@ define <4 x i32> @srl_trunc_and_v4i64(<4 x i32> %x, <4 x i64> %y) nounwind {
   %sra = lshr <4 x i32> %x, %trunc
   ret <4 x i32> %sra
 }
+
+;
+; Vectorized i16 shifts
+;
+
+define <8 x i16> @shl_8i16(<8 x i16> %r, <8 x i16> %a) nounwind {
+; CHECK-LABEL: shl_8i16
+; CHECK: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; CHECK-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; CHECK-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; CHECK: retq
+  %shl = shl <8 x i16> %r, %a
+  ret <8 x i16> %shl
+}
+
+define <16 x i16> @shl_16i16(<16 x i16> %r, <16 x i16> %a) nounwind {
+; CHECK-LABEL: shl_16i16
+; CHECK: vpxor %ymm2, %ymm2, %ymm2
+; CHECK-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
+; CHECK-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
+; CHECK-NEXT: vpsllvd %ymm3, %ymm4, %ymm3
+; CHECK-NEXT: vpsrld $16, %ymm3, %ymm3
+; CHECK-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
+; CHECK-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; CHECK-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vpsrld $16, %ymm0, %ymm0
+; CHECK-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
+; CHECK-NEXT: retq
+  %shl = shl <16 x i16> %r, %a
+  ret <16 x i16> %shl
+}
+
+define <8 x i16> @ashr_8i16(<8 x i16> %r, <8 x i16> %a) nounwind {
+; CHECK-LABEL: ashr_8i16
+; CHECK: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; CHECK-NEXT: vpmovsxwd %xmm0, %ymm0
+; CHECK-NEXT: vpsravd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; CHECK: retq
+  %ashr = ashr <8 x i16> %r, %a
+  ret <8 x i16> %ashr
+}
+
+define <16 x i16> @ashr_16i16(<16 x i16> %r, <16 x i16> %a) nounwind {
+; CHECK-LABEL: ashr_16i16
+; CHECK: vpxor %ymm2, %ymm2, %ymm2
+; CHECK-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
+; CHECK-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
+; CHECK-NEXT: vpsravd %ymm3, %ymm4, %ymm3
+; CHECK-NEXT: vpsrld $16, %ymm3, %ymm3
+; CHECK-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
+; CHECK-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; CHECK-NEXT: vpsravd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vpsrld $16, %ymm0, %ymm0
+; CHECK-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
+; CHECK-NEXT: retq
+  %ashr = ashr <16 x i16> %r, %a
+  ret <16 x i16> %ashr
+}
+
+define <8 x i16> @lshr_8i16(<8 x i16> %r, <8 x i16> %a) nounwind {
+; CHECK-LABEL: lshr_8i16
+; CHECK: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; CHECK-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; CHECK-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; CHECK: retq
+  %lshr = lshr <8 x i16> %r, %a
+  ret <8 x i16> %lshr
+}
+
+define <16 x i16> @lshr_16i16(<16 x i16> %r, <16 x i16> %a) nounwind {
+; CHECK-LABEL: lshr_16i16
+; CHECK: vpxor %ymm2, %ymm2, %ymm2
+; CHECK-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
+; CHECK-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
+; CHECK-NEXT: vpsrlvd %ymm3, %ymm4, %ymm3
+; CHECK-NEXT: vpsrld $16, %ymm3, %ymm3
+; CHECK-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
+; CHECK-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; CHECK-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vpsrld $16, %ymm0, %ymm0
+; CHECK-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
+; CHECK-NEXT: retq
+  %lshr = lshr <16 x i16> %r, %a
+  ret <16 x i16> %lshr
+}
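
For illustration only, below is a standalone AVX2 intrinsics sketch of the two lowering strategies used above, restricted to the logical-right-shift (lshr) flavour. It is not code from the patch or from the LLVM sources: the helper names lshr_v8i16/lshr_v16i16 and the final packing shuffles are illustrative choices. The first helper mirrors the v8i16 extend/vpsrlvd/truncate path; the second mirrors the v16i16 unpack/vpsrlvd/vpsrld/vpackusdw path added to LowerShift.

  #include <immintrin.h>

  // v8i16 lshr: zero-extend values and shift amounts to 32-bit lanes
  // (vpmovzxwd), do the per-lane variable shift (vpsrlvd), then narrow
  // back to 16-bit lanes.
  static inline __m128i lshr_v8i16(__m128i r, __m128i a) {
    __m256i r32 = _mm256_cvtepu16_epi32(r);
    __m256i a32 = _mm256_cvtepu16_epi32(a);
    __m256i s32 = _mm256_srlv_epi32(r32, a32);
    // Results fit in 16 bits, so vpackusdw never saturates; vpermq gathers
    // the two 64-bit result chunks into the low 128 bits.
    __m256i pkd = _mm256_packus_epi32(s32, _mm256_setzero_si256());
    pkd = _mm256_permute4x64_epi64(pkd, _MM_SHUFFLE(3, 1, 2, 0));
    return _mm256_castsi256_si128(pkd);
  }

  // v16i16 lshr: interleave each 16-bit value with itself and each shift
  // amount with zero to form 32-bit lanes, shift, drop the low 16 bits,
  // then repack. vpackusdw interleaves per 128-bit half, which matches the
  // lane order produced by vpunpcklwd/vpunpckhwd.
  static inline __m256i lshr_v16i16(__m256i r, __m256i a) {
    __m256i z   = _mm256_setzero_si256();
    __m256i alo = _mm256_unpacklo_epi16(a, z);
    __m256i ahi = _mm256_unpackhi_epi16(a, z);
    __m256i rlo = _mm256_unpacklo_epi16(r, r);
    __m256i rhi = _mm256_unpackhi_epi16(r, r);
    __m256i lo  = _mm256_srli_epi32(_mm256_srlv_epi32(rlo, alo), 16);
    __m256i hi  = _mm256_srli_epi32(_mm256_srlv_epi32(rhi, ahi), 16);
    return _mm256_packus_epi32(lo, hi);
  }

Built with -mavx2, the second helper should compile to essentially the vpunpck/vpsrlvd/vpsrld/vpackusdw sequence checked in lshr_16i16 above, since each intrinsic maps directly to one instruction.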