1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-11-22 02:33:06 +01:00

X86: LowerShift: new algorithm for vector-vector shifts

Emit a pair of double-size shifts when possible
This commit is contained in:
Nekotekina 2018-01-30 00:41:56 +03:00
parent 80d76b612e
commit 6e3871c033

View File

@ -5051,7 +5051,7 @@ bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
ISD::LoadExtType ExtTy,
EVT NewVT) const {
assert(cast<LoadSDNode>(Load)->isSimple() && "illegal to narrow");
// "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
// relocation target a movq or addq instruction: don't let the load shrink.
SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
@ -26838,6 +26838,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
SDValue Amt = Op.getOperand(1);
unsigned EltSizeInBits = VT.getScalarSizeInBits();
bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
bool OptForMinSize = DAG.getMachineFunction().getFunction().hasMinSize();
unsigned Opc = Op.getOpcode();
unsigned X86OpcV = getTargetVShiftUniformOpcode(Opc, true);
@ -27046,6 +27047,74 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
return DAG.getVectorShuffle(VT, dl, R01, R23, {0, 3, 4, 7});
}
// Alternative lowering of vector-vector shifts for element types without
// native variable-shift support. Should be more efficient on targets with
// slow cross-lane moves.
if ((!OptForMinSize && Subtarget.hasInt256() && VT == MVT::v8i16) ||
(Subtarget.hasInt256() && VT == MVT::v16i16) ||
(!OptForMinSize && Subtarget.hasAVX512() && VT == MVT::v16i16) ||
(Subtarget.hasAVX512() && VT == MVT::v32i16) ||
(!OptForMinSize && Subtarget.hasBWI() && VT == MVT::v16i8) ||
(!OptForMinSize && Subtarget.hasBWI() && VT == MVT::v32i8) ||
(Subtarget.hasBWI() && VT == MVT::v64i8)) {
MVT EvtSVT = Subtarget.hasBWI() && VT != MVT::v32i16 ? MVT::i16 : MVT::i32;
MVT ExtVT = MVT::getVectorVT(EvtSVT, VT.getVectorNumElements() / 2);
int DivCnt = EvtSVT == MVT::i16 ? 4 : 2;
MVT Ext32 = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / DivCnt);
MVT Ext64 = MVT::getVectorVT(MVT::i64, Ext32.getVectorNumElements() / 2);
int ShC = EvtSVT == MVT::i16 ? 8 : 16;
unsigned MaskValue = EvtSVT == MVT::i16 ? 0xff00U : 0xffff0000U;
SDValue MaskH;
SDValue AH;
SDValue RH;
R = DAG.getBitcast(ExtVT, R);
RH = DAG.getBitcast(ExtVT, R);
MaskH = DAG.getConstant(MaskValue, dl, ExtVT);
AH = DAG.getBitcast(ExtVT, Amt);
AH = DAG.getNode(ISD::SRL, dl, ExtVT, AH, DAG.getConstant(ShC, dl, ExtVT));
if (Op->getOpcode() == ISD::SRA) {
// Increase the shift amount so the result is placed in the low position
Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, DAG.getConstant(ShC, dl, VT));
}
if (VT != MVT::v64i8) {
SDValue Mask8 = DAG.getBitcast(VT, MaskH);
Amt = DAG.getNode(ISD::USUBSAT, dl, VT, Amt, Mask8);
Amt = DAG.getBitcast(ExtVT, Amt);
} else {
SDValue Mask64 = DAG.getBitcast(Ext64, MaskH);
Amt = DAG.getBitcast(Ext64, Amt);
Amt = DAG.getNode(X86ISD::ANDNP, dl, Ext64, Mask64, Amt);
Amt = DAG.getBitcast(ExtVT, Amt);
}
if (Op->getOpcode() == ISD::SHL) {
RH = DAG.getNode(ISD::AND, dl, ExtVT, R, MaskH);
}
if (Op->getOpcode() == ISD::SRL) {
SDValue Mask64 = DAG.getBitcast(Ext64, MaskH);
R = DAG.getBitcast(Ext64, R);
R = DAG.getNode(X86ISD::ANDNP, dl, Ext64, Mask64, R);
R = DAG.getBitcast(ExtVT, R);
}
if (Op->getOpcode() == ISD::SRA) {
R = DAG.getNode(ISD::SHL, dl, ExtVT, R, DAG.getConstant(ShC, dl, ExtVT));
}
RH = DAG.getNode(Op->getOpcode(), dl, ExtVT, RH, AH);
R = DAG.getNode(Op->getOpcode(), dl, ExtVT, R, Amt);
// Merge high and low results (Mask ? RH : R)
if (Subtarget.hasAVX512()) {
R = DAG.getNode(X86ISD::VPTERNLOG, dl, Ext32, DAG.getBitcast(Ext32, RH),
DAG.getBitcast(Ext32, R), DAG.getBitcast(Ext32, MaskH),
DAG.getTargetConstant(0xe4, dl, MVT::i8));
} else {
R = DAG.getNode(X86ISD::BLENDI, dl, VT, DAG.getBitcast(VT, R),
DAG.getBitcast(VT, RH),
DAG.getTargetConstant(0xaa, dl, MVT::i8));
}
return DAG.getBitcast(VT, R);
}
// It's worth extending once and using the vXi16/vXi32 shifts for smaller
// types, but without AVX512 the extra overheads to get from vXi8 to vXi32
// make the existing SSE solution better.