[SelectionDAG] Initial support for FSHL/FSHR funnel shift opcodes (PR39467)

This is an initial patch to add a minimum level of support for funnel shifts to the SelectionDAG and to begin wiring it up to the X86 SHLD/SHRD instructions. Some partial legalization code has been added to handle the 'SlowSHLD' case, where we want to expand instead, and a few DAG combines have been added so we don't regress from the existing DAG builder expansion code.

Differential Revision: https://reviews.llvm.org/D54698

llvm-svn: 348353
This commit is contained in:
parent 8d062f4fb7
commit 573fb67bcf
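As a point of reference for the changes below, the funnel-shift semantics being wired up can be modelled in plain C++. This is an illustrative sketch based on the fshl/fshr formulas documented in the patch, not code from the commit; fshl32/fshr32 are hypothetical helper names.

#include <cstdint>

// Model of ISD::FSHL / ISD::FSHR for a 32-bit element (BW = 32): the
// shift amount is reduced modulo the bit width, and an effective shift
// of zero must return X (fshl) or Y (fshr), since the opposite-direction
// shift would be by the full bit width, which is undefined.
uint32_t fshl32(uint32_t X, uint32_t Y, uint32_t Z) {
  unsigned Amt = Z % 32;  // Z % BW
  return Amt ? (X << Amt) | (Y >> (32 - Amt)) : X;
}
uint32_t fshr32(uint32_t X, uint32_t Y, uint32_t Z) {
  unsigned Amt = Z % 32;  // Z % BW
  return Amt ? (X << (32 - Amt)) | (Y >> Amt) : Y;
}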
@@ -394,9 +394,13 @@ namespace ISD {
/// When the 1st operand is a vector, the shift amount must be in the same
/// type. (TLI.getShiftAmountTy() will return the same type when the input
/// type is a vector.)
/// For rotates, the shift amount is treated as an unsigned amount modulo
/// the element size of the first operand.
SHL, SRA, SRL, ROTL, ROTR,
/// For rotates and funnel shifts, the shift amount is treated as an unsigned
/// amount modulo the element size of the first operand.
///
/// Funnel 'double' shifts take 3 operands, 2 inputs and the shift amount.
/// fshl(X,Y,Z): (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
/// fshr(X,Y,Z): (X << (BW - (Z % BW))) | (Y >> (Z % BW))
SHL, SRA, SRL, ROTL, ROTR, FSHL, FSHR,

/// Byte Swap and Counting operators.
BSWAP, CTTZ, CTLZ, CTPOP, BITREVERSE,
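The modulo rule documented above is what the constant-folding combine and the 'overshift' tests later in this commit exercise. A worked instance, mirroring the test comment "41-32=9 ... 32-9=23" (illustrative):

// fshr i32 X, Y, 41  ==  fshr i32 X, Y, 9        (41 % 32 == 9)
//                    ==  (X << 23) | (Y >> 9)    (BW - 9 == 23)
static_assert(41 % 32 == 9 && 32 - 9 == 23, "overshift reduces to 9");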
@@ -3689,6 +3689,12 @@ public:
SDValue LL = SDValue(), SDValue LH = SDValue(),
SDValue RL = SDValue(), SDValue RH = SDValue()) const;

/// Expand funnel shift.
/// \param N Node to expand
/// \param Result output after conversion
/// \returns True, if the expansion was successful, false otherwise
bool expandFunnelShift(SDNode *N, SDValue &Result, SelectionDAG &DAG) const;

/// Expand float(f32) to SINT(i64) conversion
/// \param N Node to expand
/// \param Result output after conversion

@@ -325,6 +325,7 @@ namespace {
SDValue visitSHL(SDNode *N);
SDValue visitSRA(SDNode *N);
SDValue visitSRL(SDNode *N);
SDValue visitFunnelShift(SDNode *N);
SDValue visitRotate(SDNode *N);
SDValue visitABS(SDNode *N);
SDValue visitBSWAP(SDNode *N);

@@ -1513,6 +1514,8 @@ SDValue DAGCombiner::visit(SDNode *N) {
case ISD::SRL: return visitSRL(N);
case ISD::ROTR:
case ISD::ROTL: return visitRotate(N);
case ISD::FSHL:
case ISD::FSHR: return visitFunnelShift(N);
case ISD::ABS: return visitABS(N);
case ISD::BSWAP: return visitBSWAP(N);
case ISD::BITREVERSE: return visitBITREVERSE(N);

@@ -6926,6 +6929,39 @@ SDValue DAGCombiner::visitSRL(SDNode *N) {
return SDValue();
}

SDValue DAGCombiner::visitFunnelShift(SDNode *N) {
EVT VT = N->getValueType(0);
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
SDValue N2 = N->getOperand(2);
bool IsFSHL = N->getOpcode() == ISD::FSHL;
unsigned BitWidth = VT.getScalarSizeInBits();

// fold (fshl N0, N1, 0) -> N0
// fold (fshr N0, N1, 0) -> N1
if (DAG.MaskedValueIsZero(N2, APInt::getAllOnesValue(BitWidth)))
return IsFSHL ? N0 : N1;

// fold (fsh* N0, N1, c) -> (fsh* N0, N1, c % BitWidth)
if (ConstantSDNode *Cst = isConstOrConstSplat(N2)) {
if (Cst->getAPIntValue().uge(BitWidth)) {
uint64_t RotAmt = Cst->getAPIntValue().urem(BitWidth);
return DAG.getNode(N->getOpcode(), SDLoc(N), VT, N0, N1,
DAG.getConstant(RotAmt, SDLoc(N), N2.getValueType()));
}
}

// fold (fshl N0, N0, N2) -> (rotl N0, N2)
// fold (fshr N0, N0, N2) -> (rotr N0, N2)
// TODO: Investigate flipping this rotate if only one is legal, if funnel shift
// is legal as well we might be better off avoiding non-constant (BW - N2).
unsigned RotOpc = IsFSHL ? ISD::ROTL : ISD::ROTR;
if (N0 == N1 && hasOperation(RotOpc, VT))
return DAG.getNode(RotOpc, SDLoc(N), VT, N0, N2);

return SDValue();
}
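The rotate fold above relies on the identity fshl(X, X, Z) == rotl(X, Z) (and likewise fshr/rotr): when both funnel inputs are the same value, the bits shifted out of one end re-enter at the other. A minimal sketch of the identity (hypothetical helper, not from the commit):

#include <cstdint>

// fshl with both inputs equal is a left rotate.
uint32_t rotl32(uint32_t X, uint32_t Z) {
  unsigned Amt = Z % 32;
  return Amt ? (X << Amt) | (X >> (32 - Amt)) : X;  // == fshl(X, X, Z)
}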
SDValue DAGCombiner::visitABS(SDNode *N) {
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);

@@ -1170,6 +1170,8 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) {
}
}
break;
case ISD::FSHL:
case ISD::FSHR:
case ISD::SRL_PARTS:
case ISD::SRA_PARTS:
case ISD::SHL_PARTS: {

@@ -3262,6 +3264,11 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
}
break;
}
case ISD::FSHL:
case ISD::FSHR:
if (TLI.expandFunnelShift(Node, Tmp1, DAG))
Results.push_back(Tmp1);
break;
case ISD::SADDSAT:
case ISD::UADDSAT:
case ISD::SSUBSAT:

@@ -129,12 +129,13 @@ class VectorLegalizer {
SDValue ExpandFNEG(SDValue Op);
SDValue ExpandFSUB(SDValue Op);
SDValue ExpandBITREVERSE(SDValue Op);
SDValue ExpandCTPOP(SDValue Op);
SDValue ExpandCTLZ(SDValue Op);
SDValue ExpandCTTZ(SDValue Op);
SDValue ExpandFMINNUM_FMAXNUM(SDValue Op);
SDValue ExpandStrictFPOp(SDValue Op);

SDValue ExpandCTPOP(SDValue Op);
SDValue ExpandCTLZ(SDValue Op);
SDValue ExpandCTTZ(SDValue Op);
SDValue ExpandFunnelShift(SDValue Op);
SDValue ExpandFMINNUM_FMAXNUM(SDValue Op);
SDValue ExpandStrictFPOp(SDValue Op);

/// Implements vector promotion.
///
/// This is essentially just bitcasting the operands to a different type and

@@ -746,12 +747,15 @@ SDValue VectorLegalizer::Expand(SDValue Op) {
case ISD::CTLZ:
case ISD::CTLZ_ZERO_UNDEF:
return ExpandCTLZ(Op);
case ISD::CTTZ:
case ISD::CTTZ_ZERO_UNDEF:
return ExpandCTTZ(Op);
case ISD::FMINNUM:
case ISD::FMAXNUM:
return ExpandFMINNUM_FMAXNUM(Op);
case ISD::CTTZ:
case ISD::CTTZ_ZERO_UNDEF:
return ExpandCTTZ(Op);
case ISD::FSHL:
case ISD::FSHR:
return ExpandFunnelShift(Op);
case ISD::FMINNUM:
case ISD::FMAXNUM:
return ExpandFMINNUM_FMAXNUM(Op);
case ISD::STRICT_FADD:
case ISD::STRICT_FSUB:
case ISD::STRICT_FMUL:

@@ -1123,32 +1127,40 @@ SDValue VectorLegalizer::ExpandFSUB(SDValue Op) {
return Op; // Defer to LegalizeDAG

return DAG.UnrollVectorOp(Op.getNode());
}

SDValue VectorLegalizer::ExpandCTPOP(SDValue Op) {
SDValue Result;
if (TLI.expandCTPOP(Op.getNode(), Result, DAG))
return Result;

return DAG.UnrollVectorOp(Op.getNode());
}

SDValue VectorLegalizer::ExpandCTLZ(SDValue Op) {
SDValue Result;
if (TLI.expandCTLZ(Op.getNode(), Result, DAG))
return Result;

return DAG.UnrollVectorOp(Op.getNode());
}

SDValue VectorLegalizer::ExpandCTTZ(SDValue Op) {
SDValue Result;
if (TLI.expandCTTZ(Op.getNode(), Result, DAG))
return Result;

return DAG.UnrollVectorOp(Op.getNode());
}

}

SDValue VectorLegalizer::ExpandCTPOP(SDValue Op) {
SDValue Result;
if (TLI.expandCTPOP(Op.getNode(), Result, DAG))
return Result;

return DAG.UnrollVectorOp(Op.getNode());
}

SDValue VectorLegalizer::ExpandCTLZ(SDValue Op) {
SDValue Result;
if (TLI.expandCTLZ(Op.getNode(), Result, DAG))
return Result;

return DAG.UnrollVectorOp(Op.getNode());
}

SDValue VectorLegalizer::ExpandCTTZ(SDValue Op) {
SDValue Result;
if (TLI.expandCTTZ(Op.getNode(), Result, DAG))
return Result;

return DAG.UnrollVectorOp(Op.getNode());
}

SDValue VectorLegalizer::ExpandFunnelShift(SDValue Op) {
SDValue Result;
if (TLI.expandFunnelShift(Op.getNode(), Result, DAG))
return Result;

return DAG.UnrollVectorOp(Op.getNode());
}

SDValue VectorLegalizer::ExpandFMINNUM_FMAXNUM(SDValue Op) {
if (SDValue Expanded = TLI.expandFMINNUM_FMAXNUM(Op.getNode(), DAG))
return Expanded;
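ExpandFunnelShift above follows the same try-the-generic-expansion-else-unroll pattern as the CTPOP/CTLZ/CTTZ hooks. When TLI.expandFunnelShift refuses (a vector type without legal SHL/SRL/SUB/OR), unrolling applies the scalar operation lane by lane; an illustrative model for <4 x i32> (assumed helper name, not from the commit):

#include <array>
#include <cstdint>

// Per-lane fallback semantics for a <4 x i32> fshl.
std::array<uint32_t, 4> fshl_v4i32(const std::array<uint32_t, 4> &X,
                                   const std::array<uint32_t, 4> &Y,
                                   const std::array<uint32_t, 4> &Z) {
  std::array<uint32_t, 4> R{};
  for (int I = 0; I != 4; ++I) {
    unsigned Amt = Z[I] % 32;  // amount is modulo the element size
    R[I] = Amt ? (X[I] << Amt) | (Y[I] >> (32 - Amt)) : X[I];
  }
  return R;
}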
@@ -5751,6 +5751,12 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
SDValue Zero = DAG.getConstant(0, sdl, VT);
SDValue ShAmt = DAG.getNode(ISD::UREM, sdl, VT, Z, BitWidthC);

auto FunnelOpcode = IsFSHL ? ISD::FSHL : ISD::FSHR;
if (TLI.isOperationLegalOrCustom(FunnelOpcode, VT)) {
setValue(&I, DAG.getNode(FunnelOpcode, sdl, VT, X, Y, Z));
return nullptr;
}

// When X == Y, this is rotate. If the data type has a power-of-2 size, we
// avoid the select that is necessary in the general case to filter out
// the 0-shift possibility that leads to UB.
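The comment above is the key to the rotate special case: when the bit width is a power of 2, masking the amount makes both shifts well defined without a select. A sketch of the select-free rotate (illustrative, not from the commit):

#include <cstdint>

// For a power-of-2 width, (32 - Amt) & 31 keeps the opposite shift in
// range even when Amt == 0, so no compare-and-select is needed.
uint32_t rotl32_no_select(uint32_t X, uint32_t Z) {
  unsigned Amt = Z & 31;                           // Z % 32
  return (X << Amt) | (X >> ((32u - Amt) & 31));   // defined for Amt == 0
}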
@@ -237,6 +237,8 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
case ISD::SRL: return "srl";
case ISD::ROTL: return "rotl";
case ISD::ROTR: return "rotr";
case ISD::FSHL: return "fshl";
case ISD::FSHR: return "fshr";
case ISD::FADD: return "fadd";
case ISD::STRICT_FADD: return "strict_fadd";
case ISD::FSUB: return "fsub";

@@ -4114,6 +4114,54 @@ bool TargetLowering::expandMUL(SDNode *N, SDValue &Lo, SDValue &Hi, EVT HiLoVT,
return Ok;
}

bool TargetLowering::expandFunnelShift(SDNode *Node, SDValue &Result,
SelectionDAG &DAG) const {
EVT VT = Node->getValueType(0);

if (VT.isVector() && (!isOperationLegalOrCustom(ISD::SHL, VT) ||
!isOperationLegalOrCustom(ISD::SRL, VT) ||
!isOperationLegalOrCustom(ISD::SUB, VT) ||
!isOperationLegalOrCustomOrPromote(ISD::OR, VT)))
return false;

// fshl: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
// fshr: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
SDValue X = Node->getOperand(0);
SDValue Y = Node->getOperand(1);
SDValue Z = Node->getOperand(2);

unsigned EltSizeInBits = VT.getScalarSizeInBits();
bool IsFSHL = Node->getOpcode() == ISD::FSHL;
SDLoc DL(SDValue(Node, 0));

EVT ShVT = Z.getValueType();
SDValue BitWidthC = DAG.getConstant(EltSizeInBits, DL, ShVT);
SDValue Zero = DAG.getConstant(0, DL, ShVT);

SDValue ShAmt;
if (isPowerOf2_32(EltSizeInBits)) {
SDValue Mask = DAG.getConstant(EltSizeInBits - 1, DL, ShVT);
ShAmt = DAG.getNode(ISD::AND, DL, ShVT, Z, Mask);
} else {
ShAmt = DAG.getNode(ISD::UREM, DL, ShVT, Z, BitWidthC);
}

SDValue InvShAmt = DAG.getNode(ISD::SUB, DL, ShVT, BitWidthC, ShAmt);
SDValue ShX = DAG.getNode(ISD::SHL, DL, VT, X, IsFSHL ? ShAmt : InvShAmt);
SDValue ShY = DAG.getNode(ISD::SRL, DL, VT, Y, IsFSHL ? InvShAmt : ShAmt);
SDValue Or = DAG.getNode(ISD::OR, DL, VT, ShX, ShY);

// If (Z % BW == 0), then the opposite direction shift is shift-by-bitwidth,
// and that is undefined. We must compare and select to avoid UB.
EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), ShVT);

// For fshl, 0-shift returns the 1st arg (X).
// For fshr, 0-shift returns the 2nd arg (Y).
SDValue IsZeroShift = DAG.getSetCC(DL, CCVT, ShAmt, Zero, ISD::SETEQ);
Result = DAG.getSelect(DL, VT, IsZeroShift, IsFSHL ? X : Y, Or);
return true;
}
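To see why expandFunnelShift ends in a compare-and-select: when Z % BW == 0, InvShAmt is BW itself, and a shift by the full bit width is undefined, so the Or value cannot be used in that case. A C++ model of the expansion (illustrative; the explicit guard on the inner shift only exists to keep the C++ sketch well defined, the DAG builds that node unconditionally and the select discards it):

#include <cstdint>

uint32_t expand_fshl32(uint32_t X, uint32_t Y, uint32_t Z) {
  unsigned ShAmt = Z % 32;         // UREM, or AND for power-of-2 widths
  unsigned InvShAmt = 32 - ShAmt;  // equals 32 (== BW) when ShAmt == 0
  uint32_t Or = (X << ShAmt) | (ShAmt ? (Y >> InvShAmt) : 0);
  return ShAmt == 0 ? X : Or;      // the IsZeroShift select from the code
}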
bool TargetLowering::expandFP_TO_SINT(SDNode *Node, SDValue &Result,
SelectionDAG &DAG) const {
SDValue Src = Node->getOperand(0);

@@ -610,6 +610,8 @@ void TargetLoweringBase::initActions() {
setOperationAction(ISD::UMIN, VT, Expand);
setOperationAction(ISD::UMAX, VT, Expand);
setOperationAction(ISD::ABS, VT, Expand);
setOperationAction(ISD::FSHL, VT, Expand);
setOperationAction(ISD::FSHR, VT, Expand);
setOperationAction(ISD::SADDSAT, VT, Expand);
setOperationAction(ISD::UADDSAT, VT, Expand);
setOperationAction(ISD::SSUBSAT, VT, Expand);

@@ -195,6 +195,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::ABS , MVT::i64 , Custom);
}

// Funnel shifts.
for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) {
setOperationAction(ShiftOp , MVT::i16 , Custom);
setOperationAction(ShiftOp , MVT::i32 , Custom);
if (Subtarget.is64Bit())
setOperationAction(ShiftOp , MVT::i64 , Custom);
}

// Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
// operation.
setOperationAction(ISD::UINT_TO_FP , MVT::i1 , Promote);

@@ -16972,6 +16980,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {

/// Lower SRA_PARTS and friends, which return two i32 values
/// and take a 2 x i32 value to shift plus a shift amount.
/// TODO: Can this be moved to general expansion code?
static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
assert(Op.getNumOperands() == 3 && "Not a double-shift!");
MVT VT = Op.getSimpleValueType();

@@ -16981,8 +16990,8 @@ static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
SDValue ShOpLo = Op.getOperand(0);
SDValue ShOpHi = Op.getOperand(1);
SDValue ShAmt = Op.getOperand(2);
// X86ISD::SHLD and X86ISD::SHRD have defined overflow behavior but the
// generic ISD nodes haven't. Insert an AND to be safe, it's optimized away
// ISD::FSHL and ISD::FSHR have defined overflow behavior but ISD::SHL and
// ISD::SRA/L nodes haven't. Insert an AND to be safe, it's optimized away
// during isel.
SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
DAG.getConstant(VTBits - 1, dl, MVT::i8));

@@ -16992,10 +17001,10 @@ static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {

SDValue Tmp2, Tmp3;
if (Op.getOpcode() == ISD::SHL_PARTS) {
Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
Tmp2 = DAG.getNode(ISD::FSHL, dl, VT, ShOpHi, ShOpLo, ShAmt);
Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt);
} else {
Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
Tmp2 = DAG.getNode(ISD::FSHR, dl, VT, ShOpHi, ShOpLo, ShAmt);
Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt);
}

@@ -17019,6 +17028,37 @@ static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
return DAG.getMergeValues({ Lo, Hi }, dl);
}

static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
assert((Op.getOpcode() == ISD::FSHL || Op.getOpcode() == ISD::FSHR) &&
"Unexpected funnel shift opcode!");
assert((VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
"Unexpected funnel shift type!");

SDLoc DL(Op);
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
SDValue Amt = Op.getOperand(2);

// Expand slow SHLD/SHRD cases.
// TODO - can we be more selective here: OptSize/RMW etc.?
if (Subtarget.isSHLDSlow())
return SDValue();

bool IsFSHR = Op.getOpcode() == ISD::FSHR;
if (IsFSHR)
std::swap(Op0, Op1);

// i16 needs to modulo the shift amount, but i32/i64 have implicit modulo.
if (VT == MVT::i16)
Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt,
DAG.getConstant(15, DL, Amt.getValueType()));

unsigned SHDOp = (IsFSHR ? X86ISD::SHRD : X86ISD::SHLD);
return DAG.getNode(SHDOp, DL, VT, Op0, Op1, Amt);
}
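On the fast-SHLD path the operand swap is what makes one node cover both opcodes: x86's SHRD computes dst = (dst >> cnt) | (src << (BW - cnt)), which is fshr with the funnel operands exchanged. A model of the equivalence (illustrative; i32/i64 hardware masks the count, which is why only i16 needs the explicit AND with 15 above):

#include <cstdint>

// SHRD-style semantics and the fshr mapping used by LowerFunnelShift.
uint32_t shrd_model(uint32_t Dst, uint32_t Src, unsigned Cnt) {
  Cnt %= 32;  // implicit hardware modulo for 32-bit operands
  return Cnt ? (Dst >> Cnt) | (Src << (32 - Cnt)) : Dst;
}
uint32_t fshr_via_shrd(uint32_t X, uint32_t Y, uint32_t Z) {
  return shrd_model(Y, X, Z);  // the std::swap(Op0, Op1) in the code
}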
// Try to use a packed vector operation to handle i64 on 32-bit targets when
// AVX512DQ is enabled.
static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, SelectionDAG &DAG,

@@ -26115,6 +26155,8 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::SHL_PARTS:
case ISD::SRA_PARTS:
case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
case ISD::FSHL:
case ISD::FSHR: return LowerFunnelShift(Op, Subtarget, DAG);
case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);

@@ -2022,7 +2022,7 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
static const CostTblEntry X64CostTbl[] = { // 64-bit targets
{ ISD::ROTL, MVT::i64, 1 },
{ ISD::ROTR, MVT::i64, 1 },
{ X86ISD::SHLD, MVT::i64, 4 }
{ ISD::FSHL, MVT::i64, 4 }
};
static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets
{ ISD::ROTL, MVT::i32, 1 },

@@ -2031,9 +2031,9 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
{ ISD::ROTR, MVT::i32, 1 },
{ ISD::ROTR, MVT::i16, 1 },
{ ISD::ROTR, MVT::i8, 1 },
{ X86ISD::SHLD, MVT::i32, 4 },
{ X86ISD::SHLD, MVT::i16, 4 },
{ X86ISD::SHLD, MVT::i8, 4 }
{ ISD::FSHL, MVT::i32, 4 },
{ ISD::FSHL, MVT::i16, 4 },
{ ISD::FSHL, MVT::i8, 4 }
};

unsigned ISD = ISD::DELETED_NODE;

@@ -2041,13 +2041,13 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
default:
break;
case Intrinsic::fshl:
ISD = X86ISD::SHLD;
ISD = ISD::FSHL;
if (Args[0] == Args[1])
ISD = ISD::ROTL;
break;
case Intrinsic::fshr:
// SHRD has same costs so don't duplicate.
ISD = X86ISD::SHLD;
// FSHR has same costs so don't duplicate.
ISD = ISD::FSHL;
if (Args[0] == Args[1])
ISD = ISD::ROTR;
break;
@@ -58,20 +58,11 @@ define i8 @var_shift_i8(i8 %x, i8 %y, i8 %z) nounwind {
define i16 @var_shift_i16(i16 %x, i16 %y, i16 %z) nounwind {
; X86-FAST-LABEL: var_shift_i16:
; X86-FAST: # %bb.0:
; X86-FAST-NEXT: pushl %esi
; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-FAST-NEXT: movzwl {{[0-9]+}}(%esp), %esi
; X86-FAST-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
; X86-FAST-NEXT: andl $15, %ecx
; X86-FAST-NEXT: movl %eax, %edx
; X86-FAST-NEXT: shldw %cl, %si, %dx
; X86-FAST-NEXT: testw %cx, %cx
; X86-FAST-NEXT: je .LBB1_2
; X86-FAST-NEXT: # %bb.1:
; X86-FAST-NEXT: movl %edx, %eax
; X86-FAST-NEXT: .LBB1_2:
; X86-FAST-NEXT: # kill: def $ax killed $ax killed $eax
; X86-FAST-NEXT: popl %esi
; X86-FAST-NEXT: movzwl {{[0-9]+}}(%esp), %edx
; X86-FAST-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-FAST-NEXT: movb {{[0-9]+}}(%esp), %cl
; X86-FAST-NEXT: andb $15, %cl
; X86-FAST-NEXT: shldw %cl, %dx, %ax
; X86-FAST-NEXT: retl
;
; X86-SLOW-LABEL: var_shift_i16:
@@ -79,17 +70,16 @@ define i16 @var_shift_i16(i16 %x, i16 %y, i16 %z) nounwind {
; X86-SLOW-NEXT: pushl %edi
; X86-SLOW-NEXT: pushl %esi
; X86-SLOW-NEXT: movzwl {{[0-9]+}}(%esp), %esi
; X86-SLOW-NEXT: movzwl {{[0-9]+}}(%esp), %edx
; X86-SLOW-NEXT: andl $15, %edx
; X86-SLOW-NEXT: movb {{[0-9]+}}(%esp), %dl
; X86-SLOW-NEXT: andb $15, %dl
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SLOW-NEXT: movl %eax, %edi
; X86-SLOW-NEXT: movl %edx, %ecx
; X86-SLOW-NEXT: shll %cl, %edi
; X86-SLOW-NEXT: movl $16, %ecx
; X86-SLOW-NEXT: subl %edx, %ecx
; X86-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx
; X86-SLOW-NEXT: movb $16, %cl
; X86-SLOW-NEXT: subb %dl, %cl
; X86-SLOW-NEXT: shrl %cl, %esi
; X86-SLOW-NEXT: testw %dx, %dx
; X86-SLOW-NEXT: testb %dl, %dl
; X86-SLOW-NEXT: je .LBB1_2
; X86-SLOW-NEXT: # %bb.1:
; X86-SLOW-NEXT: orl %esi, %edi
@@ -103,27 +93,25 @@ define i16 @var_shift_i16(i16 %x, i16 %y, i16 %z) nounwind {
; X64-FAST-LABEL: var_shift_i16:
; X64-FAST: # %bb.0:
; X64-FAST-NEXT: movl %edx, %ecx
; X64-FAST-NEXT: andl $15, %ecx
; X64-FAST-NEXT: movl %edi, %eax
; X64-FAST-NEXT: andb $15, %cl
; X64-FAST-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-FAST-NEXT: shldw %cl, %si, %ax
; X64-FAST-NEXT: testw %cx, %cx
; X64-FAST-NEXT: cmovel %edi, %eax
; X64-FAST-NEXT: # kill: def $ax killed $ax killed $eax
; X64-FAST-NEXT: retq
;
; X64-SLOW-LABEL: var_shift_i16:
; X64-SLOW: # %bb.0:
; X64-SLOW-NEXT: movzwl %si, %eax
; X64-SLOW-NEXT: andl $15, %edx
; X64-SLOW-NEXT: andb $15, %dl
; X64-SLOW-NEXT: movl %edi, %esi
; X64-SLOW-NEXT: movl %edx, %ecx
; X64-SLOW-NEXT: shll %cl, %esi
; X64-SLOW-NEXT: movl $16, %ecx
; X64-SLOW-NEXT: subl %edx, %ecx
; X64-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-SLOW-NEXT: movb $16, %cl
; X64-SLOW-NEXT: subb %dl, %cl
; X64-SLOW-NEXT: shrl %cl, %eax
; X64-SLOW-NEXT: orl %esi, %eax
; X64-SLOW-NEXT: testw %dx, %dx
; X64-SLOW-NEXT: testb %dl, %dl
; X64-SLOW-NEXT: cmovel %edi, %eax
; X64-SLOW-NEXT: # kill: def $ax killed $ax killed $eax
; X64-SLOW-NEXT: retq
@@ -134,19 +122,10 @@ define i16 @var_shift_i16(i16 %x, i16 %y, i16 %z) nounwind {
define i32 @var_shift_i32(i32 %x, i32 %y, i32 %z) nounwind {
; X86-FAST-LABEL: var_shift_i32:
; X86-FAST: # %bb.0:
; X86-FAST-NEXT: pushl %esi
; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-FAST-NEXT: movb {{[0-9]+}}(%esp), %cl
; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-FAST-NEXT: andl $31, %ecx
; X86-FAST-NEXT: movl %eax, %edx
; X86-FAST-NEXT: shldl %cl, %esi, %edx
; X86-FAST-NEXT: testl %ecx, %ecx
; X86-FAST-NEXT: je .LBB2_2
; X86-FAST-NEXT: # %bb.1:
; X86-FAST-NEXT: movl %edx, %eax
; X86-FAST-NEXT: .LBB2_2:
; X86-FAST-NEXT: popl %esi
; X86-FAST-NEXT: shldl %cl, %edx, %eax
; X86-FAST-NEXT: retl
;
; X86-SLOW-LABEL: var_shift_i32:
@@ -154,17 +133,16 @@ define i32 @var_shift_i32(i32 %x, i32 %y, i32 %z) nounwind {
; X86-SLOW-NEXT: pushl %edi
; X86-SLOW-NEXT: pushl %esi
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-SLOW-NEXT: movb {{[0-9]+}}(%esp), %dl
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-SLOW-NEXT: andl $31, %edx
; X86-SLOW-NEXT: movl %eax, %edi
; X86-SLOW-NEXT: movl %edx, %ecx
; X86-SLOW-NEXT: shll %cl, %edi
; X86-SLOW-NEXT: andb $31, %dl
; X86-SLOW-NEXT: movl %edx, %ecx
; X86-SLOW-NEXT: negl %ecx
; X86-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx
; X86-SLOW-NEXT: negb %cl
; X86-SLOW-NEXT: shrl %cl, %esi
; X86-SLOW-NEXT: testl %edx, %edx
; X86-SLOW-NEXT: testb %dl, %dl
; X86-SLOW-NEXT: je .LBB2_2
; X86-SLOW-NEXT: # %bb.1:
; X86-SLOW-NEXT: orl %esi, %edi
@@ -177,26 +155,23 @@ define i32 @var_shift_i32(i32 %x, i32 %y, i32 %z) nounwind {
; X64-FAST-LABEL: var_shift_i32:
; X64-FAST: # %bb.0:
; X64-FAST-NEXT: movl %edx, %ecx
; X64-FAST-NEXT: andl $31, %ecx
; X64-FAST-NEXT: movl %edi, %eax
; X64-FAST-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-FAST-NEXT: shldl %cl, %esi, %eax
; X64-FAST-NEXT: testl %ecx, %ecx
; X64-FAST-NEXT: cmovel %edi, %eax
; X64-FAST-NEXT: retq
;
; X64-SLOW-LABEL: var_shift_i32:
; X64-SLOW: # %bb.0:
; X64-SLOW-NEXT: movl %esi, %eax
; X64-SLOW-NEXT: andl $31, %edx
; X64-SLOW-NEXT: movl %edi, %esi
; X64-SLOW-NEXT: movl %edx, %ecx
; X64-SLOW-NEXT: shll %cl, %esi
; X64-SLOW-NEXT: andb $31, %dl
; X64-SLOW-NEXT: movl %edx, %ecx
; X64-SLOW-NEXT: negl %ecx
; X64-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-SLOW-NEXT: negb %cl
; X64-SLOW-NEXT: shrl %cl, %eax
; X64-SLOW-NEXT: orl %esi, %eax
; X64-SLOW-NEXT: testl %edx, %edx
; X64-SLOW-NEXT: testb %dl, %dl
; X64-SLOW-NEXT: cmovel %edi, %eax
; X64-SLOW-NEXT: retq
%tmp = tail call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 %z)
@@ -204,85 +179,166 @@ define i32 @var_shift_i32(i32 %x, i32 %y, i32 %z) nounwind {
}
define i64 @var_shift_i64(i64 %x, i64 %y, i64 %z) nounwind {
; X86-LABEL: var_shift_i64:
; X86: # %bb.0:
; X86-NEXT: pushl %ebp
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: pushl %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X86-NEXT: andl $63, %ebx
; X86-NEXT: movl %eax, %edi
; X86-NEXT: movl %ebx, %ecx
; X86-NEXT: shll %cl, %edi
; X86-NEXT: shldl %cl, %eax, %ebp
; X86-NEXT: testb $32, %bl
; X86-NEXT: je .LBB3_2
; X86-NEXT: # %bb.1:
; X86-NEXT: movl %edi, %ebp
; X86-NEXT: xorl %edi, %edi
; X86-NEXT: .LBB3_2:
; X86-NEXT: movb $64, %cl
; X86-NEXT: subb %bl, %cl
; X86-NEXT: movl %edx, %esi
; X86-NEXT: shrl %cl, %esi
; X86-NEXT: shrdl %cl, %edx, (%esp) # 4-byte Folded Spill
; X86-NEXT: testb $32, %cl
; X86-NEXT: jne .LBB3_3
; X86-NEXT: # %bb.4:
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl (%esp), %ecx # 4-byte Reload
; X86-NEXT: testl %ebx, %ebx
; X86-NEXT: jne .LBB3_6
; X86-NEXT: jmp .LBB3_7
; X86-NEXT: .LBB3_3:
; X86-NEXT: movl %esi, %ecx
; X86-NEXT: xorl %esi, %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: testl %ebx, %ebx
; X86-NEXT: je .LBB3_7
; X86-NEXT: .LBB3_6:
; X86-NEXT: orl %esi, %ebp
; X86-NEXT: orl %ecx, %edi
; X86-NEXT: movl %edi, %eax
; X86-NEXT: movl %ebp, %edx
; X86-NEXT: .LBB3_7:
; X86-NEXT: addl $4, %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
; X86-NEXT: popl %ebp
; X86-NEXT: retl
; X86-FAST-LABEL: var_shift_i64:
; X86-FAST: # %bb.0:
; X86-FAST-NEXT: pushl %ebp
; X86-FAST-NEXT: pushl %ebx
; X86-FAST-NEXT: pushl %edi
; X86-FAST-NEXT: pushl %esi
; X86-FAST-NEXT: pushl %eax
; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-FAST-NEXT: movl %eax, (%esp) # 4-byte Spill
; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X86-FAST-NEXT: andl $63, %ebx
; X86-FAST-NEXT: movl %eax, %edi
; X86-FAST-NEXT: movl %ebx, %ecx
; X86-FAST-NEXT: shll %cl, %edi
; X86-FAST-NEXT: shldl %cl, %eax, %ebp
; X86-FAST-NEXT: testb $32, %bl
; X86-FAST-NEXT: je .LBB3_2
; X86-FAST-NEXT: # %bb.1:
; X86-FAST-NEXT: movl %edi, %ebp
; X86-FAST-NEXT: xorl %edi, %edi
; X86-FAST-NEXT: .LBB3_2:
; X86-FAST-NEXT: movb $64, %cl
; X86-FAST-NEXT: subb %bl, %cl
; X86-FAST-NEXT: movl %edx, %esi
; X86-FAST-NEXT: shrl %cl, %esi
; X86-FAST-NEXT: shrdl %cl, %edx, (%esp) # 4-byte Folded Spill
; X86-FAST-NEXT: testb $32, %cl
; X86-FAST-NEXT: jne .LBB3_3
; X86-FAST-NEXT: # %bb.4:
; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-FAST-NEXT: movl (%esp), %ecx # 4-byte Reload
; X86-FAST-NEXT: testl %ebx, %ebx
; X86-FAST-NEXT: jne .LBB3_6
; X86-FAST-NEXT: jmp .LBB3_7
; X86-FAST-NEXT: .LBB3_3:
; X86-FAST-NEXT: movl %esi, %ecx
; X86-FAST-NEXT: xorl %esi, %esi
; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-FAST-NEXT: testl %ebx, %ebx
; X86-FAST-NEXT: je .LBB3_7
; X86-FAST-NEXT: .LBB3_6:
; X86-FAST-NEXT: orl %esi, %ebp
; X86-FAST-NEXT: orl %ecx, %edi
; X86-FAST-NEXT: movl %edi, %eax
; X86-FAST-NEXT: movl %ebp, %edx
; X86-FAST-NEXT: .LBB3_7:
; X86-FAST-NEXT: addl $4, %esp
; X86-FAST-NEXT: popl %esi
; X86-FAST-NEXT: popl %edi
; X86-FAST-NEXT: popl %ebx
; X86-FAST-NEXT: popl %ebp
; X86-FAST-NEXT: retl
;
; X86-SLOW-LABEL: var_shift_i64:
; X86-SLOW: # %bb.0:
; X86-SLOW-NEXT: pushl %ebp
; X86-SLOW-NEXT: pushl %ebx
; X86-SLOW-NEXT: pushl %edi
; X86-SLOW-NEXT: pushl %esi
; X86-SLOW-NEXT: subl $8, %esp
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X86-SLOW-NEXT: andl $63, %ebx
; X86-SLOW-NEXT: movb $64, %dh
; X86-SLOW-NEXT: subb %bl, %dh
; X86-SLOW-NEXT: movl %eax, (%esp) # 4-byte Spill
; X86-SLOW-NEXT: movb %dh, %cl
; X86-SLOW-NEXT: shrl %cl, %eax
; X86-SLOW-NEXT: movb %dh, %dl
; X86-SLOW-NEXT: andb $31, %dl
; X86-SLOW-NEXT: movl %edx, %ecx
; X86-SLOW-NEXT: negb %cl
; X86-SLOW-NEXT: movl %esi, %ebp
; X86-SLOW-NEXT: shll %cl, %ebp
; X86-SLOW-NEXT: testb %dl, %dl
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SLOW-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SLOW-NEXT: je .LBB3_2
; X86-SLOW-NEXT: # %bb.1:
; X86-SLOW-NEXT: orl %eax, %ebp
; X86-SLOW-NEXT: movl %ebp, (%esp) # 4-byte Spill
; X86-SLOW-NEXT: .LBB3_2:
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-SLOW-NEXT: movl %ebp, %eax
; X86-SLOW-NEXT: movl %ebx, %ecx
; X86-SLOW-NEXT: shll %cl, %eax
; X86-SLOW-NEXT: movb %bl, %ch
; X86-SLOW-NEXT: andb $31, %ch
; X86-SLOW-NEXT: movb %ch, %cl
; X86-SLOW-NEXT: negb %cl
; X86-SLOW-NEXT: shrl %cl, %edi
; X86-SLOW-NEXT: testb %ch, %ch
; X86-SLOW-NEXT: je .LBB3_4
; X86-SLOW-NEXT: # %bb.3:
; X86-SLOW-NEXT: orl %edi, %eax
; X86-SLOW-NEXT: movl %eax, %ebp
; X86-SLOW-NEXT: .LBB3_4:
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SLOW-NEXT: movl %eax, %edi
; X86-SLOW-NEXT: movl %ebx, %ecx
; X86-SLOW-NEXT: shll %cl, %edi
; X86-SLOW-NEXT: testb $32, %bl
; X86-SLOW-NEXT: je .LBB3_6
; X86-SLOW-NEXT: # %bb.5:
; X86-SLOW-NEXT: movl %edi, %ebp
; X86-SLOW-NEXT: xorl %edi, %edi
; X86-SLOW-NEXT: .LBB3_6:
; X86-SLOW-NEXT: movb %dh, %cl
; X86-SLOW-NEXT: shrl %cl, %esi
; X86-SLOW-NEXT: testb $32, %dh
; X86-SLOW-NEXT: jne .LBB3_7
; X86-SLOW-NEXT: # %bb.8:
; X86-SLOW-NEXT: movl (%esp), %ecx # 4-byte Reload
; X86-SLOW-NEXT: testl %ebx, %ebx
; X86-SLOW-NEXT: jne .LBB3_10
; X86-SLOW-NEXT: jmp .LBB3_11
; X86-SLOW-NEXT: .LBB3_7:
; X86-SLOW-NEXT: movl %esi, %ecx
; X86-SLOW-NEXT: xorl %esi, %esi
; X86-SLOW-NEXT: testl %ebx, %ebx
; X86-SLOW-NEXT: je .LBB3_11
; X86-SLOW-NEXT: .LBB3_10:
; X86-SLOW-NEXT: orl %esi, %ebp
; X86-SLOW-NEXT: orl %ecx, %edi
; X86-SLOW-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SLOW-NEXT: movl %edi, %eax
; X86-SLOW-NEXT: .LBB3_11:
; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-SLOW-NEXT: addl $8, %esp
; X86-SLOW-NEXT: popl %esi
; X86-SLOW-NEXT: popl %edi
; X86-SLOW-NEXT: popl %ebx
; X86-SLOW-NEXT: popl %ebp
; X86-SLOW-NEXT: retl
;
; X64-FAST-LABEL: var_shift_i64:
; X64-FAST: # %bb.0:
; X64-FAST-NEXT: movq %rdx, %rcx
; X64-FAST-NEXT: andl $63, %ecx
; X64-FAST-NEXT: movq %rdi, %rax
; X64-FAST-NEXT: # kill: def $cl killed $cl killed $rcx
; X64-FAST-NEXT: shldq %cl, %rsi, %rax
; X64-FAST-NEXT: testq %rcx, %rcx
; X64-FAST-NEXT: cmoveq %rdi, %rax
; X64-FAST-NEXT: retq
;
; X64-SLOW-LABEL: var_shift_i64:
; X64-SLOW: # %bb.0:
; X64-SLOW-NEXT: movq %rsi, %rax
; X64-SLOW-NEXT: andl $63, %edx
; X64-SLOW-NEXT: movq %rdi, %rsi
; X64-SLOW-NEXT: movl %edx, %ecx
; X64-SLOW-NEXT: shlq %cl, %rsi
; X64-SLOW-NEXT: andb $63, %dl
; X64-SLOW-NEXT: movl %edx, %ecx
; X64-SLOW-NEXT: negl %ecx
; X64-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-SLOW-NEXT: negb %cl
; X64-SLOW-NEXT: shrq %cl, %rax
; X64-SLOW-NEXT: orq %rsi, %rax
; X64-SLOW-NEXT: testq %rdx, %rdx
; X64-SLOW-NEXT: testb %dl, %dl
; X64-SLOW-NEXT: cmoveq %rdi, %rax
; X64-SLOW-NEXT: retq
%tmp = tail call i64 @llvm.fshl.i64(i64 %x, i64 %y, i64 %z)
@@ -58,20 +58,11 @@ define i8 @var_shift_i8(i8 %x, i8 %y, i8 %z) nounwind {
define i16 @var_shift_i16(i16 %x, i16 %y, i16 %z) nounwind {
; X86-FAST-LABEL: var_shift_i16:
; X86-FAST: # %bb.0:
; X86-FAST-NEXT: pushl %esi
; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-FAST-NEXT: movzwl {{[0-9]+}}(%esp), %esi
; X86-FAST-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
; X86-FAST-NEXT: andl $15, %ecx
; X86-FAST-NEXT: movl %eax, %edx
; X86-FAST-NEXT: shrdw %cl, %si, %dx
; X86-FAST-NEXT: testw %cx, %cx
; X86-FAST-NEXT: je .LBB1_2
; X86-FAST-NEXT: # %bb.1:
; X86-FAST-NEXT: movl %edx, %eax
; X86-FAST-NEXT: .LBB1_2:
; X86-FAST-NEXT: # kill: def $ax killed $ax killed $eax
; X86-FAST-NEXT: popl %esi
; X86-FAST-NEXT: movzwl {{[0-9]+}}(%esp), %edx
; X86-FAST-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-FAST-NEXT: movb {{[0-9]+}}(%esp), %cl
; X86-FAST-NEXT: andb $15, %cl
; X86-FAST-NEXT: shrdw %cl, %dx, %ax
; X86-FAST-NEXT: retl
;
; X86-SLOW-LABEL: var_shift_i16:
@@ -79,17 +70,16 @@ define i16 @var_shift_i16(i16 %x, i16 %y, i16 %z) nounwind {
; X86-SLOW-NEXT: pushl %edi
; X86-SLOW-NEXT: pushl %esi
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-SLOW-NEXT: movzwl {{[0-9]+}}(%esp), %edx
; X86-SLOW-NEXT: andl $15, %edx
; X86-SLOW-NEXT: movb {{[0-9]+}}(%esp), %dl
; X86-SLOW-NEXT: andb $15, %dl
; X86-SLOW-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-SLOW-NEXT: movl %eax, %edi
; X86-SLOW-NEXT: movl %edx, %ecx
; X86-SLOW-NEXT: shrl %cl, %edi
; X86-SLOW-NEXT: movl $16, %ecx
; X86-SLOW-NEXT: subl %edx, %ecx
; X86-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx
; X86-SLOW-NEXT: movb $16, %cl
; X86-SLOW-NEXT: subb %dl, %cl
; X86-SLOW-NEXT: shll %cl, %esi
; X86-SLOW-NEXT: testw %dx, %dx
; X86-SLOW-NEXT: testb %dl, %dl
; X86-SLOW-NEXT: je .LBB1_2
; X86-SLOW-NEXT: # %bb.1:
; X86-SLOW-NEXT: orl %edi, %esi
@@ -103,26 +93,24 @@ define i16 @var_shift_i16(i16 %x, i16 %y, i16 %z) nounwind {
; X64-FAST-LABEL: var_shift_i16:
; X64-FAST: # %bb.0:
; X64-FAST-NEXT: movl %edx, %ecx
; X64-FAST-NEXT: andl $15, %ecx
; X64-FAST-NEXT: movl %esi, %eax
; X64-FAST-NEXT: andb $15, %cl
; X64-FAST-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-FAST-NEXT: shrdw %cl, %di, %ax
; X64-FAST-NEXT: testw %cx, %cx
; X64-FAST-NEXT: cmovel %esi, %eax
; X64-FAST-NEXT: # kill: def $ax killed $ax killed $eax
; X64-FAST-NEXT: retq
;
; X64-SLOW-LABEL: var_shift_i16:
; X64-SLOW: # %bb.0:
; X64-SLOW-NEXT: movzwl %si, %eax
; X64-SLOW-NEXT: andl $15, %edx
; X64-SLOW-NEXT: andb $15, %dl
; X64-SLOW-NEXT: movl %edx, %ecx
; X64-SLOW-NEXT: shrl %cl, %eax
; X64-SLOW-NEXT: movl $16, %ecx
; X64-SLOW-NEXT: subl %edx, %ecx
; X64-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-SLOW-NEXT: movb $16, %cl
; X64-SLOW-NEXT: subb %dl, %cl
; X64-SLOW-NEXT: shll %cl, %edi
; X64-SLOW-NEXT: orl %edi, %eax
; X64-SLOW-NEXT: testw %dx, %dx
; X64-SLOW-NEXT: testb %dl, %dl
; X64-SLOW-NEXT: cmovel %esi, %eax
; X64-SLOW-NEXT: # kill: def $ax killed $ax killed $eax
; X64-SLOW-NEXT: retq
@@ -133,19 +121,10 @@ define i16 @var_shift_i16(i16 %x, i16 %y, i16 %z) nounwind {
define i32 @var_shift_i32(i32 %x, i32 %y, i32 %z) nounwind {
; X86-FAST-LABEL: var_shift_i32:
; X86-FAST: # %bb.0:
; X86-FAST-NEXT: pushl %esi
; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-FAST-NEXT: movb {{[0-9]+}}(%esp), %cl
; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-FAST-NEXT: andl $31, %ecx
; X86-FAST-NEXT: movl %eax, %edx
; X86-FAST-NEXT: shrdl %cl, %esi, %edx
; X86-FAST-NEXT: testl %ecx, %ecx
; X86-FAST-NEXT: je .LBB2_2
; X86-FAST-NEXT: # %bb.1:
; X86-FAST-NEXT: movl %edx, %eax
; X86-FAST-NEXT: .LBB2_2:
; X86-FAST-NEXT: popl %esi
; X86-FAST-NEXT: shrdl %cl, %edx, %eax
; X86-FAST-NEXT: retl
;
; X86-SLOW-LABEL: var_shift_i32:
@@ -153,17 +132,16 @@ define i32 @var_shift_i32(i32 %x, i32 %y, i32 %z) nounwind {
; X86-SLOW-NEXT: pushl %edi
; X86-SLOW-NEXT: pushl %esi
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-SLOW-NEXT: movb {{[0-9]+}}(%esp), %dl
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-SLOW-NEXT: andl $31, %edx
; X86-SLOW-NEXT: movl %eax, %edi
; X86-SLOW-NEXT: movl %edx, %ecx
; X86-SLOW-NEXT: shrl %cl, %edi
; X86-SLOW-NEXT: andb $31, %dl
; X86-SLOW-NEXT: movl %edx, %ecx
; X86-SLOW-NEXT: negl %ecx
; X86-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx
; X86-SLOW-NEXT: negb %cl
; X86-SLOW-NEXT: shll %cl, %esi
; X86-SLOW-NEXT: testl %edx, %edx
; X86-SLOW-NEXT: testb %dl, %dl
; X86-SLOW-NEXT: je .LBB2_2
; X86-SLOW-NEXT: # %bb.1:
; X86-SLOW-NEXT: orl %edi, %esi
@@ -176,26 +154,23 @@ define i32 @var_shift_i32(i32 %x, i32 %y, i32 %z) nounwind {
; X64-FAST-LABEL: var_shift_i32:
; X64-FAST: # %bb.0:
; X64-FAST-NEXT: movl %edx, %ecx
; X64-FAST-NEXT: andl $31, %ecx
; X64-FAST-NEXT: movl %esi, %eax
; X64-FAST-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-FAST-NEXT: shrdl %cl, %edi, %eax
; X64-FAST-NEXT: testl %ecx, %ecx
; X64-FAST-NEXT: cmovel %esi, %eax
; X64-FAST-NEXT: retq
;
; X64-SLOW-LABEL: var_shift_i32:
; X64-SLOW: # %bb.0:
; X64-SLOW-NEXT: movl %edi, %eax
; X64-SLOW-NEXT: andl $31, %edx
; X64-SLOW-NEXT: movl %esi, %edi
; X64-SLOW-NEXT: movl %edx, %ecx
; X64-SLOW-NEXT: shrl %cl, %edi
; X64-SLOW-NEXT: andb $31, %dl
; X64-SLOW-NEXT: movl %edx, %ecx
; X64-SLOW-NEXT: negl %ecx
; X64-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-SLOW-NEXT: negb %cl
; X64-SLOW-NEXT: shll %cl, %eax
; X64-SLOW-NEXT: orl %edi, %eax
; X64-SLOW-NEXT: testl %edx, %edx
; X64-SLOW-NEXT: testb %dl, %dl
; X64-SLOW-NEXT: cmovel %esi, %eax
; X64-SLOW-NEXT: retq
%tmp = tail call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 %z)
@@ -203,81 +178,164 @@ define i32 @var_shift_i32(i32 %x, i32 %y, i32 %z) nounwind {
}
define i64 @var_shift_i64(i64 %x, i64 %y, i64 %z) nounwind {
; X86-LABEL: var_shift_i64:
; X86: # %bb.0:
; X86-NEXT: pushl %ebp
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: pushl %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X86-NEXT: andl $63, %ebx
; X86-NEXT: movb $64, %cl
; X86-NEXT: subb %bl, %cl
; X86-NEXT: movl %eax, %edi
; X86-NEXT: shll %cl, %edi
; X86-NEXT: shldl %cl, %eax, %esi
; X86-NEXT: testb $32, %cl
; X86-NEXT: je .LBB3_2
; X86-NEXT: # %bb.1:
; X86-NEXT: movl %edi, %esi
; X86-NEXT: xorl %edi, %edi
; X86-NEXT: .LBB3_2:
; X86-NEXT: movl %edx, %ebp
; X86-NEXT: movl %ebx, %ecx
; X86-NEXT: shrl %cl, %ebp
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: shrdl %cl, %edx, %eax
; X86-NEXT: testb $32, %bl
; X86-NEXT: je .LBB3_4
; X86-NEXT: # %bb.3:
; X86-NEXT: movl %ebp, %eax
; X86-NEXT: xorl %ebp, %ebp
; X86-NEXT: .LBB3_4:
; X86-NEXT: testl %ebx, %ebx
; X86-NEXT: je .LBB3_6
; X86-NEXT: # %bb.5:
; X86-NEXT: orl %ebp, %esi
; X86-NEXT: orl %eax, %edi
; X86-NEXT: movl %edi, (%esp) # 4-byte Spill
; X86-NEXT: movl %esi, %edx
; X86-NEXT: .LBB3_6:
; X86-NEXT: movl (%esp), %eax # 4-byte Reload
; X86-NEXT: addl $4, %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
; X86-NEXT: popl %ebp
; X86-NEXT: retl
; X86-FAST-LABEL: var_shift_i64:
; X86-FAST: # %bb.0:
; X86-FAST-NEXT: pushl %ebp
; X86-FAST-NEXT: pushl %ebx
; X86-FAST-NEXT: pushl %edi
; X86-FAST-NEXT: pushl %esi
; X86-FAST-NEXT: pushl %eax
; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-FAST-NEXT: movl %eax, (%esp) # 4-byte Spill
; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X86-FAST-NEXT: andl $63, %ebx
; X86-FAST-NEXT: movb $64, %cl
; X86-FAST-NEXT: subb %bl, %cl
; X86-FAST-NEXT: movl %eax, %edi
; X86-FAST-NEXT: shll %cl, %edi
; X86-FAST-NEXT: shldl %cl, %eax, %esi
; X86-FAST-NEXT: testb $32, %cl
; X86-FAST-NEXT: je .LBB3_2
; X86-FAST-NEXT: # %bb.1:
; X86-FAST-NEXT: movl %edi, %esi
; X86-FAST-NEXT: xorl %edi, %edi
; X86-FAST-NEXT: .LBB3_2:
; X86-FAST-NEXT: movl %edx, %ebp
; X86-FAST-NEXT: movl %ebx, %ecx
; X86-FAST-NEXT: shrl %cl, %ebp
; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-FAST-NEXT: shrdl %cl, %edx, %eax
; X86-FAST-NEXT: testb $32, %bl
; X86-FAST-NEXT: je .LBB3_4
; X86-FAST-NEXT: # %bb.3:
; X86-FAST-NEXT: movl %ebp, %eax
; X86-FAST-NEXT: xorl %ebp, %ebp
; X86-FAST-NEXT: .LBB3_4:
; X86-FAST-NEXT: testl %ebx, %ebx
; X86-FAST-NEXT: je .LBB3_6
; X86-FAST-NEXT: # %bb.5:
; X86-FAST-NEXT: orl %ebp, %esi
; X86-FAST-NEXT: orl %eax, %edi
; X86-FAST-NEXT: movl %edi, (%esp) # 4-byte Spill
; X86-FAST-NEXT: movl %esi, %edx
; X86-FAST-NEXT: .LBB3_6:
; X86-FAST-NEXT: movl (%esp), %eax # 4-byte Reload
; X86-FAST-NEXT: addl $4, %esp
; X86-FAST-NEXT: popl %esi
; X86-FAST-NEXT: popl %edi
; X86-FAST-NEXT: popl %ebx
; X86-FAST-NEXT: popl %ebp
; X86-FAST-NEXT: retl
;
; X86-SLOW-LABEL: var_shift_i64:
; X86-SLOW: # %bb.0:
; X86-SLOW-NEXT: pushl %ebp
; X86-SLOW-NEXT: pushl %ebx
; X86-SLOW-NEXT: pushl %edi
; X86-SLOW-NEXT: pushl %esi
; X86-SLOW-NEXT: subl $8, %esp
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SLOW-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X86-SLOW-NEXT: andl $63, %ebx
; X86-SLOW-NEXT: movb $64, %al
; X86-SLOW-NEXT: subb %bl, %al
; X86-SLOW-NEXT: movl %edx, (%esp) # 4-byte Spill
; X86-SLOW-NEXT: movl %eax, %ecx
; X86-SLOW-NEXT: shll %cl, %edx
; X86-SLOW-NEXT: movb %al, %ch
; X86-SLOW-NEXT: andb $31, %ch
; X86-SLOW-NEXT: movb %ch, %cl
; X86-SLOW-NEXT: negb %cl
; X86-SLOW-NEXT: movl %esi, %edi
; X86-SLOW-NEXT: shrl %cl, %edi
; X86-SLOW-NEXT: testb %ch, %ch
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-SLOW-NEXT: je .LBB3_2
; X86-SLOW-NEXT: # %bb.1:
; X86-SLOW-NEXT: orl %edi, %edx
; X86-SLOW-NEXT: movl %edx, (%esp) # 4-byte Spill
; X86-SLOW-NEXT: .LBB3_2:
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SLOW-NEXT: movl %ecx, %edx
; X86-SLOW-NEXT: movl %ebx, %ecx
; X86-SLOW-NEXT: shrl %cl, %edx
; X86-SLOW-NEXT: movb %bl, %ah
; X86-SLOW-NEXT: andb $31, %ah
; X86-SLOW-NEXT: movb %ah, %cl
; X86-SLOW-NEXT: negb %cl
; X86-SLOW-NEXT: movl %ebp, %edi
; X86-SLOW-NEXT: shll %cl, %edi
; X86-SLOW-NEXT: testb %ah, %ah
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-SLOW-NEXT: je .LBB3_4
; X86-SLOW-NEXT: # %bb.3:
; X86-SLOW-NEXT: orl %edx, %edi
; X86-SLOW-NEXT: movl %edi, %ebp
; X86-SLOW-NEXT: .LBB3_4:
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-SLOW-NEXT: movl %ebx, %ecx
; X86-SLOW-NEXT: shrl %cl, %edi
; X86-SLOW-NEXT: testb $32, %bl
; X86-SLOW-NEXT: je .LBB3_6
; X86-SLOW-NEXT: # %bb.5:
; X86-SLOW-NEXT: movl %edi, %ebp
; X86-SLOW-NEXT: xorl %edi, %edi
; X86-SLOW-NEXT: .LBB3_6:
; X86-SLOW-NEXT: movl %eax, %ecx
; X86-SLOW-NEXT: shll %cl, %esi
; X86-SLOW-NEXT: testb $32, %al
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-SLOW-NEXT: jne .LBB3_7
; X86-SLOW-NEXT: # %bb.8:
; X86-SLOW-NEXT: movl (%esp), %eax # 4-byte Reload
; X86-SLOW-NEXT: testl %ebx, %ebx
; X86-SLOW-NEXT: jne .LBB3_10
; X86-SLOW-NEXT: jmp .LBB3_11
; X86-SLOW-NEXT: .LBB3_7:
; X86-SLOW-NEXT: movl %esi, %eax
; X86-SLOW-NEXT: xorl %esi, %esi
; X86-SLOW-NEXT: testl %ebx, %ebx
; X86-SLOW-NEXT: je .LBB3_11
; X86-SLOW-NEXT: .LBB3_10:
; X86-SLOW-NEXT: orl %ebp, %esi
; X86-SLOW-NEXT: orl %edi, %eax
; X86-SLOW-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SLOW-NEXT: movl %eax, %edx
; X86-SLOW-NEXT: .LBB3_11:
; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-SLOW-NEXT: addl $8, %esp
; X86-SLOW-NEXT: popl %esi
; X86-SLOW-NEXT: popl %edi
; X86-SLOW-NEXT: popl %ebx
; X86-SLOW-NEXT: popl %ebp
; X86-SLOW-NEXT: retl
;
; X64-FAST-LABEL: var_shift_i64:
; X64-FAST: # %bb.0:
; X64-FAST-NEXT: movq %rdx, %rcx
; X64-FAST-NEXT: andl $63, %ecx
; X64-FAST-NEXT: movq %rsi, %rax
; X64-FAST-NEXT: # kill: def $cl killed $cl killed $rcx
; X64-FAST-NEXT: shrdq %cl, %rdi, %rax
; X64-FAST-NEXT: testq %rcx, %rcx
; X64-FAST-NEXT: cmoveq %rsi, %rax
; X64-FAST-NEXT: retq
;
; X64-SLOW-LABEL: var_shift_i64:
; X64-SLOW: # %bb.0:
; X64-SLOW-NEXT: movq %rdi, %rax
; X64-SLOW-NEXT: andl $63, %edx
; X64-SLOW-NEXT: movq %rsi, %rdi
; X64-SLOW-NEXT: movl %edx, %ecx
; X64-SLOW-NEXT: shrq %cl, %rdi
; X64-SLOW-NEXT: andb $63, %dl
; X64-SLOW-NEXT: movl %edx, %ecx
; X64-SLOW-NEXT: negl %ecx
; X64-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-SLOW-NEXT: negb %cl
; X64-SLOW-NEXT: shlq %cl, %rax
; X64-SLOW-NEXT: orq %rdi, %rax
; X64-SLOW-NEXT: testq %rdx, %rdx
; X64-SLOW-NEXT: testb %dl, %dl
; X64-SLOW-NEXT: cmoveq %rsi, %rax
; X64-SLOW-NEXT: retq
%tmp = tail call i64 @llvm.fshr.i64(i64 %x, i64 %y, i64 %z)
@@ -315,7 +373,7 @@ define i16 @const_shift_i16(i16 %x, i16 %y) nounwind {
; X86-FAST: # %bb.0:
; X86-FAST-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
; X86-FAST-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-FAST-NEXT: shldw $9, %cx, %ax
; X86-FAST-NEXT: shrdw $7, %cx, %ax
; X86-FAST-NEXT: retl
;
; X86-SLOW-LABEL: const_shift_i16:
@@ -330,8 +388,8 @@ define i16 @const_shift_i16(i16 %x, i16 %y) nounwind {
;
; X64-FAST-LABEL: const_shift_i16:
; X64-FAST: # %bb.0:
; X64-FAST-NEXT: movl %edi, %eax
; X64-FAST-NEXT: shldw $9, %si, %ax
; X64-FAST-NEXT: movl %esi, %eax
; X64-FAST-NEXT: shrdw $7, %di, %ax
; X64-FAST-NEXT: # kill: def $ax killed $ax killed $eax
; X64-FAST-NEXT: retq
;
@@ -352,7 +410,7 @@ define i32 @const_shift_i32(i32 %x, i32 %y) nounwind {
; X86-FAST: # %bb.0:
; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-FAST-NEXT: shldl $25, %ecx, %eax
; X86-FAST-NEXT: shrdl $7, %ecx, %eax
; X86-FAST-NEXT: retl
;
; X86-SLOW-LABEL: const_shift_i32:
@@ -14,31 +14,23 @@ declare i32 @llvm.fshr.i32(i32, i32, i32)
declare i64 @llvm.fshr.i64(i64, i64, i64)
declare <4 x i32> @llvm.fshr.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)

; General case - all operands can be variables - x86 has shld, but the mask and cmov are not needed?
; General case - all operands can be variables

define i32 @fshl_i32(i32 %x, i32 %y, i32 %z) nounwind {
; X32-SSE2-LABEL: fshl_i32:
; X32-SSE2: # %bb.0:
; X32-SSE2-NEXT: pushl %esi
; X32-SSE2-NEXT: movb {{[0-9]+}}(%esp), %cl
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %esi
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-SSE2-NEXT: andl $31, %ecx
; X32-SSE2-NEXT: movl %esi, %eax
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE2-NEXT: shldl %cl, %edx, %eax
; X32-SSE2-NEXT: testl %ecx, %ecx
; X32-SSE2-NEXT: cmovel %esi, %eax
; X32-SSE2-NEXT: popl %esi
; X32-SSE2-NEXT: retl
;
; X64-AVX2-LABEL: fshl_i32:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: movl %edx, %ecx
; X64-AVX2-NEXT: andl $31, %ecx
; X64-AVX2-NEXT: movl %edi, %eax
; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-AVX2-NEXT: shldl %cl, %esi, %eax
; X64-AVX2-NEXT: testl %ecx, %ecx
; X64-AVX2-NEXT: cmovel %edi, %eax
; X64-AVX2-NEXT: retq
%f = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 %z)
ret i32 %f
@@ -212,31 +204,23 @@ define i8 @fshl_i8_const_fold() nounwind {

; Repeat everything for funnel shift right.

; General case - all operands can be variables - x86 has 'shrd', but the mask and cmov are not needed?
; General case - all operands can be variables

define i32 @fshr_i32(i32 %x, i32 %y, i32 %z) nounwind {
; X32-SSE2-LABEL: fshr_i32:
; X32-SSE2: # %bb.0:
; X32-SSE2-NEXT: pushl %esi
; X32-SSE2-NEXT: movb {{[0-9]+}}(%esp), %cl
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %esi
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-SSE2-NEXT: andl $31, %ecx
; X32-SSE2-NEXT: movl %esi, %eax
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE2-NEXT: shrdl %cl, %edx, %eax
; X32-SSE2-NEXT: testl %ecx, %ecx
; X32-SSE2-NEXT: cmovel %esi, %eax
; X32-SSE2-NEXT: popl %esi
; X32-SSE2-NEXT: retl
;
; X64-AVX2-LABEL: fshr_i32:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: movl %edx, %ecx
; X64-AVX2-NEXT: andl $31, %ecx
; X64-AVX2-NEXT: movl %esi, %eax
; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-AVX2-NEXT: shrdl %cl, %edi, %eax
; X64-AVX2-NEXT: testl %ecx, %ecx
; X64-AVX2-NEXT: cmovel %esi, %eax
; X64-AVX2-NEXT: retq
%f = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 %z)
ret i32 %f
@@ -341,7 +325,7 @@ define i32 @fshr_i32_const_shift(i32 %x, i32 %y) nounwind {
; X32-SSE2: # %bb.0:
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE2-NEXT: shldl $23, %ecx, %eax
; X32-SSE2-NEXT: shrdl $9, %ecx, %eax
; X32-SSE2-NEXT: retl
;
; X64-AVX2-LABEL: fshr_i32_const_shift:
@@ -353,14 +337,14 @@ define i32 @fshr_i32_const_shift(i32 %x, i32 %y) nounwind {
ret i32 %f
}
; Check modulo math on shift amount. 41-32=9, but right-shift became left, so 32-9=23.
; Check modulo math on shift amount. 41-32=9, but right-shift may become left, so 32-9=23.

define i32 @fshr_i32_const_overshift(i32 %x, i32 %y) nounwind {
; X32-SSE2-LABEL: fshr_i32_const_overshift:
; X32-SSE2: # %bb.0:
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE2-NEXT: shldl $23, %ecx, %eax
; X32-SSE2-NEXT: shrdl $9, %ecx, %eax
; X32-SSE2-NEXT: retl
;
; X64-AVX2-LABEL: fshr_i32_const_overshift: