From 4743d020ce85fca81a90a901a803b57776c24a50 Mon Sep 17 00:00:00 2001
From: Nekotekina
Date: Tue, 30 Jan 2018 00:41:56 +0300
Subject: [PATCH] X86: LowerShift: new algorithm for vector-vector shifts

Emit pair of shifts of double size if possible
---
 lib/Target/X86/X86ISelLowering.cpp | 69 ++++++++++++++++++++++++++++++
 1 file changed, 69 insertions(+)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index e3106438df1..78a3539c9c3 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -28446,6 +28446,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
   SDValue Amt = Op.getOperand(1);
   unsigned EltSizeInBits = VT.getScalarSizeInBits();
   bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
+  bool OptForMinSize = DAG.getMachineFunction().getFunction().hasMinSize();
 
   unsigned Opc = Op.getOpcode();
   unsigned X86OpcV = getTargetVShiftUniformOpcode(Opc, true);
@@ -28654,6 +28655,74 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
     return DAG.getVectorShuffle(VT, dl, R01, R23, {0, 3, 4, 7});
   }
 
+  // Alternative version of vector-vector shifts for unsupported types.
+  // Should be more effective with slow cross-lane moves.
+  if ((!OptForMinSize && Subtarget.hasInt256() && VT == MVT::v8i16) ||
+      (Subtarget.hasInt256() && VT == MVT::v16i16) ||
+      (!OptForMinSize && Subtarget.hasAVX512() && VT == MVT::v16i16) ||
+      (Subtarget.hasAVX512() && VT == MVT::v32i16) ||
+      (!OptForMinSize && Subtarget.hasBWI() && VT == MVT::v16i8) ||
+      (!OptForMinSize && Subtarget.hasBWI() && VT == MVT::v32i8) ||
+      (Subtarget.hasBWI() && VT == MVT::v64i8)) {
+    MVT EvtSVT = Subtarget.hasBWI() && VT != MVT::v32i16 ? MVT::i16 : MVT::i32;
+    MVT ExtVT = MVT::getVectorVT(EvtSVT, VT.getVectorNumElements() / 2);
+    int DivCnt = EvtSVT == MVT::i16 ? 4 : 2;
+    MVT Ext32 = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / DivCnt);
+    MVT Ext64 = MVT::getVectorVT(MVT::i64, Ext32.getVectorNumElements() / 2);
+    int ShC = EvtSVT == MVT::i16 ? 8 : 16;
+    unsigned MaskValue = EvtSVT == MVT::i16 ? 0xff00U : 0xffff0000U;
+    SDValue MaskH;
+    SDValue AH;
+    SDValue RH;
+
+    R = DAG.getBitcast(ExtVT, R);
+    RH = DAG.getBitcast(ExtVT, R);
+    MaskH = DAG.getConstant(MaskValue, dl, ExtVT);
+    AH = DAG.getBitcast(ExtVT, Amt);
+    AH = DAG.getNode(ISD::SRL, dl, ExtVT, AH, DAG.getConstant(ShC, dl, ExtVT));
+    if (Op->getOpcode() == ISD::SRA) {
+      // Increase shift value in order to emplace it into the low position
+      Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, DAG.getConstant(ShC, dl, VT));
+    }
+    if (VT != MVT::v64i8) {
+      SDValue Mask8 = DAG.getBitcast(VT, MaskH);
+      Amt = DAG.getNode(ISD::USUBSAT, dl, VT, Amt, Mask8);
+      Amt = DAG.getBitcast(ExtVT, Amt);
+    } else {
+      SDValue Mask64 = DAG.getBitcast(Ext64, MaskH);
+      Amt = DAG.getBitcast(Ext64, Amt);
+      Amt = DAG.getNode(X86ISD::ANDNP, dl, Ext64, Mask64, Amt);
+      Amt = DAG.getBitcast(ExtVT, Amt);
+    }
+    if (Op->getOpcode() == ISD::SHL) {
+      RH = DAG.getNode(ISD::AND, dl, ExtVT, R, MaskH);
+    }
+    if (Op->getOpcode() == ISD::SRL) {
+      SDValue Mask64 = DAG.getBitcast(Ext64, MaskH);
+      R = DAG.getBitcast(Ext64, R);
+      R = DAG.getNode(X86ISD::ANDNP, dl, Ext64, Mask64, R);
+      R = DAG.getBitcast(ExtVT, R);
+    }
+    if (Op->getOpcode() == ISD::SRA) {
+      R = DAG.getNode(ISD::SHL, dl, ExtVT, R, DAG.getConstant(ShC, dl, ExtVT));
+    }
+    RH = DAG.getNode(Op->getOpcode(), dl, ExtVT, RH, AH);
+    R = DAG.getNode(Op->getOpcode(), dl, ExtVT, R, Amt);
+
+    // Merge high and low results (Mask ? RH : R)
+    if (Subtarget.hasAVX512()) {
+      R = DAG.getNode(X86ISD::VPTERNLOG, dl, Ext32, DAG.getBitcast(Ext32, RH),
+                      DAG.getBitcast(Ext32, R), DAG.getBitcast(Ext32, MaskH),
+                      DAG.getTargetConstant(0xe4, dl, MVT::i8));
+    } else {
+      R = DAG.getNode(X86ISD::BLENDI, dl, VT, DAG.getBitcast(VT, R),
+                      DAG.getBitcast(VT, RH),
+                      DAG.getTargetConstant(0xaa, dl, MVT::i8));
+    }
+
+    return DAG.getBitcast(VT, R);
+  }
+
   // It's worth extending once and using the vXi16/vXi32 shifts for smaller
   // types, but without AVX512 the extra overheads to get from vXi8 to vXi32
   // make the existing SSE solution better.