mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-11-21 18:22:53 +01:00
X86: LowerShift: new algorithm for vector-vector shifts
Emit a pair of shifts at double the element width where possible.
This commit is contained in:
parent
d5b5885c23
commit
4743d020ce
@ -28446,6 +28446,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
|
||||
SDValue Amt = Op.getOperand(1);
|
||||
unsigned EltSizeInBits = VT.getScalarSizeInBits();
|
||||
bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
|
||||
bool OptForMinSize = DAG.getMachineFunction().getFunction().hasMinSize();
|
||||
|
||||
unsigned Opc = Op.getOpcode();
|
||||
unsigned X86OpcV = getTargetVShiftUniformOpcode(Opc, true);
|
||||
@ -28654,6 +28655,74 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
|
||||
return DAG.getVectorShuffle(VT, dl, R01, R23, {0, 3, 4, 7});
|
||||
}
|
||||
|
||||
// Alternative version of vector-vector shifts for unsupported types.
|
||||
// Should be more efficient on targets with slow cross-lane moves.
|
||||
if ((!OptForMinSize && Subtarget.hasInt256() && VT == MVT::v8i16) ||
|
||||
(Subtarget.hasInt256() && VT == MVT::v16i16) ||
|
||||
(!OptForMinSize && Subtarget.hasAVX512() && VT == MVT::v16i16) ||
|
||||
(Subtarget.hasAVX512() && VT == MVT::v32i16) ||
|
||||
(!OptForMinSize && Subtarget.hasBWI() && VT == MVT::v16i8) ||
|
||||
(!OptForMinSize && Subtarget.hasBWI() && VT == MVT::v32i8) ||
|
||||
(Subtarget.hasBWI() && VT == MVT::v64i8)) {
|
||||
MVT EvtSVT = Subtarget.hasBWI() && VT != MVT::v32i16 ? MVT::i16 : MVT::i32;
|
||||
MVT ExtVT = MVT::getVectorVT(EvtSVT, VT.getVectorNumElements() / 2);
|
||||
int DivCnt = EvtSVT == MVT::i16 ? 4 : 2;
|
||||
MVT Ext32 = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / DivCnt);
|
||||
MVT Ext64 = MVT::getVectorVT(MVT::i64, Ext32.getVectorNumElements() / 2);
|
||||
int ShC = EvtSVT == MVT::i16 ? 8 : 16;
|
||||
unsigned MaskValue = EvtSVT == MVT::i16 ? 0xff00U : 0xffff0000U;
|
||||
SDValue MaskH;
|
||||
SDValue AH;
|
||||
SDValue RH;
|
||||
|
||||
R = DAG.getBitcast(ExtVT, R);
|
||||
RH = DAG.getBitcast(ExtVT, R);
|
||||
MaskH = DAG.getConstant(MaskValue, dl, ExtVT);
|
||||
AH = DAG.getBitcast(ExtVT, Amt);
|
||||
AH = DAG.getNode(ISD::SRL, dl, ExtVT, AH, DAG.getConstant(ShC, dl, ExtVT));
|
||||
if (Op->getOpcode() == ISD::SRA) {
|
||||
// Increase the shift amount by ShC: the low half of R is pre-shifted left by ShC below, so the extra ShC brings the result back into the low half.
|
||||
Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, DAG.getConstant(ShC, dl, VT));
|
||||
}
|
||||
if (VT != MVT::v64i8) {
|
||||
SDValue Mask8 = DAG.getBitcast(VT, MaskH);
|
||||
Amt = DAG.getNode(ISD::USUBSAT, dl, VT, Amt, Mask8);
|
||||
Amt = DAG.getBitcast(ExtVT, Amt);
|
||||
} else {
|
||||
SDValue Mask64 = DAG.getBitcast(Ext64, MaskH);
|
||||
Amt = DAG.getBitcast(Ext64, Amt);
|
||||
Amt = DAG.getNode(X86ISD::ANDNP, dl, Ext64, Mask64, Amt);
|
||||
Amt = DAG.getBitcast(ExtVT, Amt);
|
||||
}
|
||||
if (Op->getOpcode() == ISD::SHL) {
|
||||
RH = DAG.getNode(ISD::AND, dl, ExtVT, R, MaskH);
|
||||
}
|
||||
if (Op->getOpcode() == ISD::SRL) {
|
||||
SDValue Mask64 = DAG.getBitcast(Ext64, MaskH);
|
||||
R = DAG.getBitcast(Ext64, R);
|
||||
R = DAG.getNode(X86ISD::ANDNP, dl, Ext64, Mask64, R);
|
||||
R = DAG.getBitcast(ExtVT, R);
|
||||
}
|
||||
if (Op->getOpcode() == ISD::SRA) {
|
||||
R = DAG.getNode(ISD::SHL, dl, ExtVT, R, DAG.getConstant(ShC, dl, ExtVT));
|
||||
}
|
||||
RH = DAG.getNode(Op->getOpcode(), dl, ExtVT, RH, AH);
|
||||
R = DAG.getNode(Op->getOpcode(), dl, ExtVT, R, Amt);
|
||||
|
||||
// Merge high and low results (Mask ? RH : R)
|
||||
if (Subtarget.hasAVX512()) {
|
||||
R = DAG.getNode(X86ISD::VPTERNLOG, dl, Ext32, DAG.getBitcast(Ext32, RH),
|
||||
DAG.getBitcast(Ext32, R), DAG.getBitcast(Ext32, MaskH),
|
||||
DAG.getTargetConstant(0xe4, dl, MVT::i8));
|
||||
} else {
|
||||
R = DAG.getNode(X86ISD::BLENDI, dl, VT, DAG.getBitcast(VT, R),
|
||||
DAG.getBitcast(VT, RH),
|
||||
DAG.getTargetConstant(0xaa, dl, MVT::i8));
|
||||
}
|
||||
|
||||
return DAG.getBitcast(VT, R);
|
||||
}
|
||||
|
||||
// It's worth extending once and using the vXi16/vXi32 shifts for smaller
|
||||
// types, but without AVX512 the extra overheads to get from vXi8 to vXi32
|
||||
// make the existing SSE solution better.
|
||||
|
Loading…
Reference in New Issue
Block a user