
[SelectionDAG] Initial support for FSHL/FSHR funnel shift opcodes (PR39467)

This is an initial patch to add a minimum level of support for funnel shifts to the SelectionDAG and to begin wiring it up to the X86 SHLD/SHRD instructions.

Some partial legalization code has been added to handle the 'SlowSHLD' case, where we want to expand instead, and I've added a few DAG combines so we don't get regressions from the existing DAG builder expansion code.

Differential Revision: https://reviews.llvm.org/D54698

llvm-svn: 348353
Simon Pilgrim 2018-12-05 11:12:12 +00:00
parent 8d062f4fb7
commit 573fb67bcf
14 changed files with 570 additions and 307 deletions


@ -394,9 +394,13 @@ namespace ISD {
/// When the 1st operand is a vector, the shift amount must be in the same
/// type. (TLI.getShiftAmountTy() will return the same type when the input
/// type is a vector.)
/// For rotates, the shift amount is treated as an unsigned amount modulo
/// the element size of the first operand.
SHL, SRA, SRL, ROTL, ROTR,
/// For rotates and funnel shifts, the shift amount is treated as an unsigned
/// amount modulo the element size of the first operand.
///
/// Funnel 'double' shifts take 3 operands, 2 inputs and the shift amount.
/// fshl(X,Y,Z): (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
/// fshr(X,Y,Z): (X << (BW - (Z % BW))) | (Y >> (Z % BW))
SHL, SRA, SRL, ROTL, ROTR, FSHL, FSHR,
/// Byte Swap and Counting operators.
BSWAP, CTTZ, CTLZ, CTPOP, BITREVERSE,

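For reference, here is a minimal scalar C++ sketch of the fshl/fshr semantics documented above (not part of the patch; the helper names are mine). The shift amount is reduced modulo the bit width first, and the zero-amount case is returned directly because a literal shift by the full bit width would itself be undefined in C++:

#include <cassert>
#include <cstdint>

// fshl(X,Y,Z): (X << (Z % BW)) | (Y >> (BW - (Z % BW))), with BW = 32 here.
static uint32_t fshl32(uint32_t X, uint32_t Y, uint32_t Z) {
  unsigned Amt = Z % 32;
  return Amt ? (X << Amt) | (Y >> (32 - Amt)) : X;
}

// fshr(X,Y,Z): (X << (BW - (Z % BW))) | (Y >> (Z % BW)).
static uint32_t fshr32(uint32_t X, uint32_t Y, uint32_t Z) {
  unsigned Amt = Z % 32;
  return Amt ? (X << (32 - Amt)) | (Y >> Amt) : Y;
}

int main() {
  assert(fshl32(0x12345678, 0x9ABCDEF0, 8) == 0x3456789A);
  assert(fshr32(0x12345678, 0x9ABCDEF0, 8) == 0x789ABCDE);
  return 0;
}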

@ -3689,6 +3689,12 @@ public:
SDValue LL = SDValue(), SDValue LH = SDValue(),
SDValue RL = SDValue(), SDValue RH = SDValue()) const;
/// Expand funnel shift.
/// \param N Node to expand
/// \param Result output after expansion
/// \returns True if the expansion was successful, false otherwise
bool expandFunnelShift(SDNode *N, SDValue &Result, SelectionDAG &DAG) const;
/// Expand float(f32) to SINT(i64) conversion
/// \param N Node to expand
/// \param Result output after conversion


@ -325,6 +325,7 @@ namespace {
SDValue visitSHL(SDNode *N);
SDValue visitSRA(SDNode *N);
SDValue visitSRL(SDNode *N);
SDValue visitFunnelShift(SDNode *N);
SDValue visitRotate(SDNode *N);
SDValue visitABS(SDNode *N);
SDValue visitBSWAP(SDNode *N);
@ -1513,6 +1514,8 @@ SDValue DAGCombiner::visit(SDNode *N) {
case ISD::SRL: return visitSRL(N);
case ISD::ROTR:
case ISD::ROTL: return visitRotate(N);
case ISD::FSHL:
case ISD::FSHR: return visitFunnelShift(N);
case ISD::ABS: return visitABS(N);
case ISD::BSWAP: return visitBSWAP(N);
case ISD::BITREVERSE: return visitBITREVERSE(N);
@ -6926,6 +6929,39 @@ SDValue DAGCombiner::visitSRL(SDNode *N) {
return SDValue();
}
SDValue DAGCombiner::visitFunnelShift(SDNode *N) {
EVT VT = N->getValueType(0);
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
SDValue N2 = N->getOperand(2);
bool IsFSHL = N->getOpcode() == ISD::FSHL;
unsigned BitWidth = VT.getScalarSizeInBits();
// fold (fshl N0, N1, 0) -> N0
// fold (fshr N0, N1, 0) -> N1
if (DAG.MaskedValueIsZero(N2, APInt::getAllOnesValue(BitWidth)))
return IsFSHL ? N0 : N1;
// fold (fsh* N0, N1, c) -> (fsh* N0, N1, c % BitWidth)
if (ConstantSDNode *Cst = isConstOrConstSplat(N2)) {
if (Cst->getAPIntValue().uge(BitWidth)) {
uint64_t RotAmt = Cst->getAPIntValue().urem(BitWidth);
return DAG.getNode(N->getOpcode(), SDLoc(N), VT, N0, N1,
DAG.getConstant(RotAmt, SDLoc(N), N2.getValueType()));
}
}
// fold (fshl N0, N0, N2) -> (rotl N0, N2)
// fold (fshr N0, N0, N2) -> (rotr N0, N2)
// TODO: Investigate flipping this rotate if only one is legal; if funnel shift
// is legal as well, we might be better off avoiding the non-constant (BW - N2).
unsigned RotOpc = IsFSHL ? ISD::ROTL : ISD::ROTR;
if (N0 == N1 && hasOperation(RotOpc, VT))
return DAG.getNode(RotOpc, SDLoc(N), VT, N0, N2);
return SDValue();
}
SDValue DAGCombiner::visitABS(SDNode *N) {
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);

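The three folds in visitFunnelShift above can be sanity-checked against a scalar model; a small sketch with example values of my own:

#include <cassert>
#include <cstdint>

static uint32_t fshl32(uint32_t X, uint32_t Y, uint32_t Z) {
  unsigned Amt = Z % 32;
  return Amt ? (X << Amt) | (Y >> (32 - Amt)) : X;
}

static uint32_t rotl32(uint32_t X, uint32_t Z) {
  unsigned Amt = Z % 32;
  return Amt ? (X << Amt) | (X >> (32 - Amt)) : X;
}

int main() {
  uint32_t X = 0xDEADBEEF, Y = 0x12345678;
  assert(fshl32(X, Y, 0) == X);                // (fshl N0, N1, 0) -> N0
  assert(fshl32(X, Y, 40) == fshl32(X, Y, 8)); // amount folds to c % BitWidth
  for (unsigned Z = 0; Z != 64; ++Z)           // (fshl N0, N0, N2) -> (rotl N0, N2)
    assert(fshl32(X, X, Z) == rotl32(X, Z));
  return 0;
}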

@ -1170,6 +1170,8 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) {
}
}
break;
case ISD::FSHL:
case ISD::FSHR:
case ISD::SRL_PARTS:
case ISD::SRA_PARTS:
case ISD::SHL_PARTS: {
@ -3262,6 +3264,11 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
}
break;
}
case ISD::FSHL:
case ISD::FSHR:
if (TLI.expandFunnelShift(Node, Tmp1, DAG))
Results.push_back(Tmp1);
break;
case ISD::SADDSAT:
case ISD::UADDSAT:
case ISD::SSUBSAT:


@ -129,12 +129,13 @@ class VectorLegalizer {
SDValue ExpandFNEG(SDValue Op);
SDValue ExpandFSUB(SDValue Op);
SDValue ExpandBITREVERSE(SDValue Op);
SDValue ExpandCTPOP(SDValue Op);
SDValue ExpandCTLZ(SDValue Op);
SDValue ExpandCTTZ(SDValue Op);
SDValue ExpandFMINNUM_FMAXNUM(SDValue Op);
SDValue ExpandStrictFPOp(SDValue Op);
SDValue ExpandCTPOP(SDValue Op);
SDValue ExpandCTLZ(SDValue Op);
SDValue ExpandCTTZ(SDValue Op);
SDValue ExpandFunnelShift(SDValue Op);
SDValue ExpandFMINNUM_FMAXNUM(SDValue Op);
SDValue ExpandStrictFPOp(SDValue Op);
/// Implements vector promotion.
///
/// This is essentially just bitcasting the operands to a different type and
@ -746,12 +747,15 @@ SDValue VectorLegalizer::Expand(SDValue Op) {
case ISD::CTLZ:
case ISD::CTLZ_ZERO_UNDEF:
return ExpandCTLZ(Op);
case ISD::CTTZ:
case ISD::CTTZ_ZERO_UNDEF:
return ExpandCTTZ(Op);
case ISD::FMINNUM:
case ISD::FMAXNUM:
return ExpandFMINNUM_FMAXNUM(Op);
case ISD::CTTZ:
case ISD::CTTZ_ZERO_UNDEF:
return ExpandCTTZ(Op);
case ISD::FSHL:
case ISD::FSHR:
return ExpandFunnelShift(Op);
case ISD::FMINNUM:
case ISD::FMAXNUM:
return ExpandFMINNUM_FMAXNUM(Op);
case ISD::STRICT_FADD:
case ISD::STRICT_FSUB:
case ISD::STRICT_FMUL:
@ -1123,32 +1127,40 @@ SDValue VectorLegalizer::ExpandFSUB(SDValue Op) {
return Op; // Defer to LegalizeDAG
return DAG.UnrollVectorOp(Op.getNode());
}
SDValue VectorLegalizer::ExpandCTPOP(SDValue Op) {
SDValue Result;
if (TLI.expandCTPOP(Op.getNode(), Result, DAG))
return Result;
return DAG.UnrollVectorOp(Op.getNode());
}
SDValue VectorLegalizer::ExpandCTLZ(SDValue Op) {
SDValue Result;
if (TLI.expandCTLZ(Op.getNode(), Result, DAG))
return Result;
return DAG.UnrollVectorOp(Op.getNode());
}
SDValue VectorLegalizer::ExpandCTTZ(SDValue Op) {
SDValue Result;
if (TLI.expandCTTZ(Op.getNode(), Result, DAG))
return Result;
return DAG.UnrollVectorOp(Op.getNode());
}
SDValue VectorLegalizer::ExpandCTPOP(SDValue Op) {
SDValue Result;
if (TLI.expandCTPOP(Op.getNode(), Result, DAG))
return Result;
return DAG.UnrollVectorOp(Op.getNode());
}
SDValue VectorLegalizer::ExpandCTLZ(SDValue Op) {
SDValue Result;
if (TLI.expandCTLZ(Op.getNode(), Result, DAG))
return Result;
return DAG.UnrollVectorOp(Op.getNode());
}
SDValue VectorLegalizer::ExpandCTTZ(SDValue Op) {
SDValue Result;
if (TLI.expandCTTZ(Op.getNode(), Result, DAG))
return Result;
return DAG.UnrollVectorOp(Op.getNode());
}
SDValue VectorLegalizer::ExpandFunnelShift(SDValue Op) {
SDValue Result;
if (TLI.expandFunnelShift(Op.getNode(), Result, DAG))
return Result;
return DAG.UnrollVectorOp(Op.getNode());
}
SDValue VectorLegalizer::ExpandFMINNUM_FMAXNUM(SDValue Op) {
if (SDValue Expanded = TLI.expandFMINNUM_FMAXNUM(Op.getNode(), DAG))
return Expanded;


@ -5751,6 +5751,12 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
SDValue Zero = DAG.getConstant(0, sdl, VT);
SDValue ShAmt = DAG.getNode(ISD::UREM, sdl, VT, Z, BitWidthC);
auto FunnelOpcode = IsFSHL ? ISD::FSHL : ISD::FSHR;
if (TLI.isOperationLegalOrCustom(FunnelOpcode, VT)) {
setValue(&I, DAG.getNode(FunnelOpcode, sdl, VT, X, Y, Z));
return nullptr;
}
// When X == Y, this is a rotate. If the data type has a power-of-2 size, we
// avoid the select that is necessary in the general case to filter out
// the 0-shift possibility that leads to UB.

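As the trailing comment notes, when X == Y this is a rotate, and for a power-of-2 bit width the expansion can keep both shift amounts inside [0, BW-1], so the 0-shift case degenerates to X and no select is required. A scalar sketch (mine, not the builder's actual code):

#include <cassert>
#include <cstdint>

// Both counts stay in [0, 31], so Z % 32 == 0 yields (X << 0) | (X >> 0) == X
// with no compare/select needed to guard a shift-by-bitwidth.
static uint32_t rotl32_no_select(uint32_t X, uint32_t Z) {
  return (X << (Z & 31)) | (X >> ((32 - Z) & 31));
}

int main() {
  assert(rotl32_no_select(0x80000001, 1) == 0x00000003);
  assert(rotl32_no_select(0x80000001, 0) == 0x80000001);
  return 0;
}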

@ -237,6 +237,8 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
case ISD::SRL: return "srl";
case ISD::ROTL: return "rotl";
case ISD::ROTR: return "rotr";
case ISD::FSHL: return "fshl";
case ISD::FSHR: return "fshr";
case ISD::FADD: return "fadd";
case ISD::STRICT_FADD: return "strict_fadd";
case ISD::FSUB: return "fsub";


@ -4114,6 +4114,54 @@ bool TargetLowering::expandMUL(SDNode *N, SDValue &Lo, SDValue &Hi, EVT HiLoVT,
return Ok;
}
bool TargetLowering::expandFunnelShift(SDNode *Node, SDValue &Result,
SelectionDAG &DAG) const {
EVT VT = Node->getValueType(0);
if (VT.isVector() && (!isOperationLegalOrCustom(ISD::SHL, VT) ||
!isOperationLegalOrCustom(ISD::SRL, VT) ||
!isOperationLegalOrCustom(ISD::SUB, VT) ||
!isOperationLegalOrCustomOrPromote(ISD::OR, VT)))
return false;
// fshl: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
// fshr: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
SDValue X = Node->getOperand(0);
SDValue Y = Node->getOperand(1);
SDValue Z = Node->getOperand(2);
unsigned EltSizeInBits = VT.getScalarSizeInBits();
bool IsFSHL = Node->getOpcode() == ISD::FSHL;
SDLoc DL(SDValue(Node, 0));
EVT ShVT = Z.getValueType();
SDValue BitWidthC = DAG.getConstant(EltSizeInBits, DL, ShVT);
SDValue Zero = DAG.getConstant(0, DL, ShVT);
SDValue ShAmt;
if (isPowerOf2_32(EltSizeInBits)) {
SDValue Mask = DAG.getConstant(EltSizeInBits - 1, DL, ShVT);
ShAmt = DAG.getNode(ISD::AND, DL, ShVT, Z, Mask);
} else {
ShAmt = DAG.getNode(ISD::UREM, DL, ShVT, Z, BitWidthC);
}
SDValue InvShAmt = DAG.getNode(ISD::SUB, DL, ShVT, BitWidthC, ShAmt);
SDValue ShX = DAG.getNode(ISD::SHL, DL, VT, X, IsFSHL ? ShAmt : InvShAmt);
SDValue ShY = DAG.getNode(ISD::SRL, DL, VT, Y, IsFSHL ? InvShAmt : ShAmt);
SDValue Or = DAG.getNode(ISD::OR, DL, VT, ShX, ShY);
// If (Z % BW == 0), then the opposite direction shift is shift-by-bitwidth,
// and that is undefined. We must compare and select to avoid UB.
EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), ShVT);
// For fshl, 0-shift returns the 1st arg (X).
// For fshr, 0-shift returns the 2nd arg (Y).
SDValue IsZeroShift = DAG.getSetCC(DL, CCVT, ShAmt, Zero, ISD::SETEQ);
Result = DAG.getSelect(DL, VT, IsZeroShift, IsFSHL ? X : Y, Or);
return true;
}
bool TargetLowering::expandFP_TO_SINT(SDNode *Node, SDValue &Result,
SelectionDAG &DAG) const {
SDValue Src = Node->getOperand(0);

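A scalar model of the expansion sequence built above, for a 32-bit element (illustrative only; the real code constructs SDNodes):

#include <cstdint>

static uint32_t expand_fshl32(uint32_t X, uint32_t Y, uint32_t Z) {
  unsigned ShAmt = Z & 31;        // power-of-2 width, so AND instead of UREM
  unsigned InvShAmt = 32 - ShAmt; // 32 when ShAmt == 0: the undefined shift
  if (ShAmt == 0)                 // models the setcc + select on IsZeroShift
    return X;                     // fshl returns the 1st arg for a 0-shift
  return (X << ShAmt) | (Y >> InvShAmt);
}

For fshr the roles flip: the left shift uses InvShAmt, the right shift uses ShAmt, and a 0-shift returns Y.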

@ -610,6 +610,8 @@ void TargetLoweringBase::initActions() {
setOperationAction(ISD::UMIN, VT, Expand);
setOperationAction(ISD::UMAX, VT, Expand);
setOperationAction(ISD::ABS, VT, Expand);
setOperationAction(ISD::FSHL, VT, Expand);
setOperationAction(ISD::FSHR, VT, Expand);
setOperationAction(ISD::SADDSAT, VT, Expand);
setOperationAction(ISD::UADDSAT, VT, Expand);
setOperationAction(ISD::SSUBSAT, VT, Expand);


@ -195,6 +195,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::ABS , MVT::i64 , Custom);
}
// Funnel shifts.
for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) {
setOperationAction(ShiftOp , MVT::i16 , Custom);
setOperationAction(ShiftOp , MVT::i32 , Custom);
if (Subtarget.is64Bit())
setOperationAction(ShiftOp , MVT::i64 , Custom);
}
// Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
// operation.
setOperationAction(ISD::UINT_TO_FP , MVT::i1 , Promote);
@ -16972,6 +16980,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
/// Lower SRA_PARTS and friends, which return two i32 values
/// and take a 2 x i32 value to shift plus a shift amount.
/// TODO: Can this be moved to general expansion code?
static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
assert(Op.getNumOperands() == 3 && "Not a double-shift!");
MVT VT = Op.getSimpleValueType();
@ -16981,8 +16990,8 @@ static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
SDValue ShOpLo = Op.getOperand(0);
SDValue ShOpHi = Op.getOperand(1);
SDValue ShAmt = Op.getOperand(2);
// X86ISD::SHLD and X86ISD::SHRD have defined overflow behavior but the
// generic ISD nodes haven't. Insert an AND to be safe, it's optimized away
// ISD::FSHL and ISD::FSHR have defined overflow behavior but ISD::SHL and
// ISD::SRA/L nodes haven't. Insert an AND to be safe; it's optimized away
// during isel.
SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
DAG.getConstant(VTBits - 1, dl, MVT::i8));
@ -16992,10 +17001,10 @@ static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
SDValue Tmp2, Tmp3;
if (Op.getOpcode() == ISD::SHL_PARTS) {
Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
Tmp2 = DAG.getNode(ISD::FSHL, dl, VT, ShOpHi, ShOpLo, ShAmt);
Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt);
} else {
Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
Tmp2 = DAG.getNode(ISD::FSHR, dl, VT, ShOpHi, ShOpLo, ShAmt);
Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt);
}
@ -17019,6 +17028,37 @@ static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
return DAG.getMergeValues({ Lo, Hi }, dl);
}
static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
assert((Op.getOpcode() == ISD::FSHL || Op.getOpcode() == ISD::FSHR) &&
"Unexpected funnel shift opcode!");
assert((VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
"Unexpected funnel shift type!");
SDLoc DL(Op);
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
SDValue Amt = Op.getOperand(2);
// Expand slow SHLD/SHRD cases.
// TODO - can we be more selective here: OptSize/RMW etc.?
if (Subtarget.isSHLDSlow())
return SDValue();
bool IsFSHR = Op.getOpcode() == ISD::FSHR;
if (IsFSHR)
std::swap(Op0, Op1);
// i16 needs the shift amount reduced modulo 16, but i32/i64 have an implicit modulo.
if (VT == MVT::i16)
Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt,
DAG.getConstant(15, DL, Amt.getValueType()));
unsigned SHDOp = (IsFSHR ? X86ISD::SHRD : X86ISD::SHLD);
return DAG.getNode(SHDOp, DL, VT, Op0, Op1, Amt);
}
// Try to use a packed vector operation to handle i64 on 32-bit targets when
// AVX512DQ is enabled.
static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, SelectionDAG &DAG,
@ -26115,6 +26155,8 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::SHL_PARTS:
case ISD::SRA_PARTS:
case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
case ISD::FSHL:
case ISD::FSHR: return LowerFunnelShift(Op, Subtarget, DAG);
case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);

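For context on LowerFunnelShift above: SHLD/SHRD implicitly mask their count modulo 32 (or 64), which lines up with the FSHL/FSHR modulo semantics for i32/i64; for 16-bit operands the count is still masked modulo 32, so counts in [16, 31] leave the result undefined, hence the explicit AND with 15. A scalar sketch of the mapping (mine, based on the x86 manuals):

#include <cstdint>

// fshl on i32 maps directly onto SHLD: the count is implicitly masked mod 32.
static uint32_t shld32(uint32_t Hi, uint32_t Lo, uint8_t Cnt) {
  unsigned Amt = Cnt & 31;
  return Amt ? (Hi << Amt) | (Lo >> (32 - Amt)) : Hi;
}

// For i16 the lowering masks the amount with 15 first, because 16-bit SHLD
// does not reduce the count modulo 16 on its own.
static uint16_t fshl16_via_shld(uint16_t Hi, uint16_t Lo, uint8_t Cnt) {
  unsigned Amt = Cnt & 15;
  return Amt ? (uint16_t)((Hi << Amt) | (Lo >> (16 - Amt))) : Hi;
}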

@ -2022,7 +2022,7 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
static const CostTblEntry X64CostTbl[] = { // 64-bit targets
{ ISD::ROTL, MVT::i64, 1 },
{ ISD::ROTR, MVT::i64, 1 },
{ X86ISD::SHLD, MVT::i64, 4 }
{ ISD::FSHL, MVT::i64, 4 }
};
static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets
{ ISD::ROTL, MVT::i32, 1 },
@ -2031,9 +2031,9 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
{ ISD::ROTR, MVT::i32, 1 },
{ ISD::ROTR, MVT::i16, 1 },
{ ISD::ROTR, MVT::i8, 1 },
{ X86ISD::SHLD, MVT::i32, 4 },
{ X86ISD::SHLD, MVT::i16, 4 },
{ X86ISD::SHLD, MVT::i8, 4 }
{ ISD::FSHL, MVT::i32, 4 },
{ ISD::FSHL, MVT::i16, 4 },
{ ISD::FSHL, MVT::i8, 4 }
};
unsigned ISD = ISD::DELETED_NODE;
@ -2041,13 +2041,13 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
default:
break;
case Intrinsic::fshl:
ISD = X86ISD::SHLD;
ISD = ISD::FSHL;
if (Args[0] == Args[1])
ISD = ISD::ROTL;
break;
case Intrinsic::fshr:
// SHRD has same costs so don't duplicate.
ISD = X86ISD::SHLD;
// FSHR has the same costs, so don't duplicate.
ISD = ISD::FSHL;
if (Args[0] == Args[1])
ISD = ISD::ROTR;
break;


@ -58,20 +58,11 @@ define i8 @var_shift_i8(i8 %x, i8 %y, i8 %z) nounwind {
define i16 @var_shift_i16(i16 %x, i16 %y, i16 %z) nounwind {
; X86-FAST-LABEL: var_shift_i16:
; X86-FAST: # %bb.0:
; X86-FAST-NEXT: pushl %esi
; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-FAST-NEXT: movzwl {{[0-9]+}}(%esp), %esi
; X86-FAST-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
; X86-FAST-NEXT: andl $15, %ecx
; X86-FAST-NEXT: movl %eax, %edx
; X86-FAST-NEXT: shldw %cl, %si, %dx
; X86-FAST-NEXT: testw %cx, %cx
; X86-FAST-NEXT: je .LBB1_2
; X86-FAST-NEXT: # %bb.1:
; X86-FAST-NEXT: movl %edx, %eax
; X86-FAST-NEXT: .LBB1_2:
; X86-FAST-NEXT: # kill: def $ax killed $ax killed $eax
; X86-FAST-NEXT: popl %esi
; X86-FAST-NEXT: movzwl {{[0-9]+}}(%esp), %edx
; X86-FAST-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-FAST-NEXT: movb {{[0-9]+}}(%esp), %cl
; X86-FAST-NEXT: andb $15, %cl
; X86-FAST-NEXT: shldw %cl, %dx, %ax
; X86-FAST-NEXT: retl
;
; X86-SLOW-LABEL: var_shift_i16:
@ -79,17 +70,16 @@ define i16 @var_shift_i16(i16 %x, i16 %y, i16 %z) nounwind {
; X86-SLOW-NEXT: pushl %edi
; X86-SLOW-NEXT: pushl %esi
; X86-SLOW-NEXT: movzwl {{[0-9]+}}(%esp), %esi
; X86-SLOW-NEXT: movzwl {{[0-9]+}}(%esp), %edx
; X86-SLOW-NEXT: andl $15, %edx
; X86-SLOW-NEXT: movb {{[0-9]+}}(%esp), %dl
; X86-SLOW-NEXT: andb $15, %dl
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SLOW-NEXT: movl %eax, %edi
; X86-SLOW-NEXT: movl %edx, %ecx
; X86-SLOW-NEXT: shll %cl, %edi
; X86-SLOW-NEXT: movl $16, %ecx
; X86-SLOW-NEXT: subl %edx, %ecx
; X86-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx
; X86-SLOW-NEXT: movb $16, %cl
; X86-SLOW-NEXT: subb %dl, %cl
; X86-SLOW-NEXT: shrl %cl, %esi
; X86-SLOW-NEXT: testw %dx, %dx
; X86-SLOW-NEXT: testb %dl, %dl
; X86-SLOW-NEXT: je .LBB1_2
; X86-SLOW-NEXT: # %bb.1:
; X86-SLOW-NEXT: orl %esi, %edi
@ -103,27 +93,25 @@ define i16 @var_shift_i16(i16 %x, i16 %y, i16 %z) nounwind {
; X64-FAST-LABEL: var_shift_i16:
; X64-FAST: # %bb.0:
; X64-FAST-NEXT: movl %edx, %ecx
; X64-FAST-NEXT: andl $15, %ecx
; X64-FAST-NEXT: movl %edi, %eax
; X64-FAST-NEXT: andb $15, %cl
; X64-FAST-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-FAST-NEXT: shldw %cl, %si, %ax
; X64-FAST-NEXT: testw %cx, %cx
; X64-FAST-NEXT: cmovel %edi, %eax
; X64-FAST-NEXT: # kill: def $ax killed $ax killed $eax
; X64-FAST-NEXT: retq
;
; X64-SLOW-LABEL: var_shift_i16:
; X64-SLOW: # %bb.0:
; X64-SLOW-NEXT: movzwl %si, %eax
; X64-SLOW-NEXT: andl $15, %edx
; X64-SLOW-NEXT: andb $15, %dl
; X64-SLOW-NEXT: movl %edi, %esi
; X64-SLOW-NEXT: movl %edx, %ecx
; X64-SLOW-NEXT: shll %cl, %esi
; X64-SLOW-NEXT: movl $16, %ecx
; X64-SLOW-NEXT: subl %edx, %ecx
; X64-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-SLOW-NEXT: movb $16, %cl
; X64-SLOW-NEXT: subb %dl, %cl
; X64-SLOW-NEXT: shrl %cl, %eax
; X64-SLOW-NEXT: orl %esi, %eax
; X64-SLOW-NEXT: testw %dx, %dx
; X64-SLOW-NEXT: testb %dl, %dl
; X64-SLOW-NEXT: cmovel %edi, %eax
; X64-SLOW-NEXT: # kill: def $ax killed $ax killed $eax
; X64-SLOW-NEXT: retq
@ -134,19 +122,10 @@ define i16 @var_shift_i16(i16 %x, i16 %y, i16 %z) nounwind {
define i32 @var_shift_i32(i32 %x, i32 %y, i32 %z) nounwind {
; X86-FAST-LABEL: var_shift_i32:
; X86-FAST: # %bb.0:
; X86-FAST-NEXT: pushl %esi
; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-FAST-NEXT: movb {{[0-9]+}}(%esp), %cl
; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-FAST-NEXT: andl $31, %ecx
; X86-FAST-NEXT: movl %eax, %edx
; X86-FAST-NEXT: shldl %cl, %esi, %edx
; X86-FAST-NEXT: testl %ecx, %ecx
; X86-FAST-NEXT: je .LBB2_2
; X86-FAST-NEXT: # %bb.1:
; X86-FAST-NEXT: movl %edx, %eax
; X86-FAST-NEXT: .LBB2_2:
; X86-FAST-NEXT: popl %esi
; X86-FAST-NEXT: shldl %cl, %edx, %eax
; X86-FAST-NEXT: retl
;
; X86-SLOW-LABEL: var_shift_i32:
@ -154,17 +133,16 @@ define i32 @var_shift_i32(i32 %x, i32 %y, i32 %z) nounwind {
; X86-SLOW-NEXT: pushl %edi
; X86-SLOW-NEXT: pushl %esi
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-SLOW-NEXT: movb {{[0-9]+}}(%esp), %dl
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-SLOW-NEXT: andl $31, %edx
; X86-SLOW-NEXT: movl %eax, %edi
; X86-SLOW-NEXT: movl %edx, %ecx
; X86-SLOW-NEXT: shll %cl, %edi
; X86-SLOW-NEXT: andb $31, %dl
; X86-SLOW-NEXT: movl %edx, %ecx
; X86-SLOW-NEXT: negl %ecx
; X86-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx
; X86-SLOW-NEXT: negb %cl
; X86-SLOW-NEXT: shrl %cl, %esi
; X86-SLOW-NEXT: testl %edx, %edx
; X86-SLOW-NEXT: testb %dl, %dl
; X86-SLOW-NEXT: je .LBB2_2
; X86-SLOW-NEXT: # %bb.1:
; X86-SLOW-NEXT: orl %esi, %edi
@ -177,26 +155,23 @@ define i32 @var_shift_i32(i32 %x, i32 %y, i32 %z) nounwind {
; X64-FAST-LABEL: var_shift_i32:
; X64-FAST: # %bb.0:
; X64-FAST-NEXT: movl %edx, %ecx
; X64-FAST-NEXT: andl $31, %ecx
; X64-FAST-NEXT: movl %edi, %eax
; X64-FAST-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-FAST-NEXT: shldl %cl, %esi, %eax
; X64-FAST-NEXT: testl %ecx, %ecx
; X64-FAST-NEXT: cmovel %edi, %eax
; X64-FAST-NEXT: retq
;
; X64-SLOW-LABEL: var_shift_i32:
; X64-SLOW: # %bb.0:
; X64-SLOW-NEXT: movl %esi, %eax
; X64-SLOW-NEXT: andl $31, %edx
; X64-SLOW-NEXT: movl %edi, %esi
; X64-SLOW-NEXT: movl %edx, %ecx
; X64-SLOW-NEXT: shll %cl, %esi
; X64-SLOW-NEXT: andb $31, %dl
; X64-SLOW-NEXT: movl %edx, %ecx
; X64-SLOW-NEXT: negl %ecx
; X64-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-SLOW-NEXT: negb %cl
; X64-SLOW-NEXT: shrl %cl, %eax
; X64-SLOW-NEXT: orl %esi, %eax
; X64-SLOW-NEXT: testl %edx, %edx
; X64-SLOW-NEXT: testb %dl, %dl
; X64-SLOW-NEXT: cmovel %edi, %eax
; X64-SLOW-NEXT: retq
%tmp = tail call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 %z)
@ -204,85 +179,166 @@ define i32 @var_shift_i32(i32 %x, i32 %y, i32 %z) nounwind {
}
define i64 @var_shift_i64(i64 %x, i64 %y, i64 %z) nounwind {
; X86-LABEL: var_shift_i64:
; X86: # %bb.0:
; X86-NEXT: pushl %ebp
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: pushl %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X86-NEXT: andl $63, %ebx
; X86-NEXT: movl %eax, %edi
; X86-NEXT: movl %ebx, %ecx
; X86-NEXT: shll %cl, %edi
; X86-NEXT: shldl %cl, %eax, %ebp
; X86-NEXT: testb $32, %bl
; X86-NEXT: je .LBB3_2
; X86-NEXT: # %bb.1:
; X86-NEXT: movl %edi, %ebp
; X86-NEXT: xorl %edi, %edi
; X86-NEXT: .LBB3_2:
; X86-NEXT: movb $64, %cl
; X86-NEXT: subb %bl, %cl
; X86-NEXT: movl %edx, %esi
; X86-NEXT: shrl %cl, %esi
; X86-NEXT: shrdl %cl, %edx, (%esp) # 4-byte Folded Spill
; X86-NEXT: testb $32, %cl
; X86-NEXT: jne .LBB3_3
; X86-NEXT: # %bb.4:
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl (%esp), %ecx # 4-byte Reload
; X86-NEXT: testl %ebx, %ebx
; X86-NEXT: jne .LBB3_6
; X86-NEXT: jmp .LBB3_7
; X86-NEXT: .LBB3_3:
; X86-NEXT: movl %esi, %ecx
; X86-NEXT: xorl %esi, %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: testl %ebx, %ebx
; X86-NEXT: je .LBB3_7
; X86-NEXT: .LBB3_6:
; X86-NEXT: orl %esi, %ebp
; X86-NEXT: orl %ecx, %edi
; X86-NEXT: movl %edi, %eax
; X86-NEXT: movl %ebp, %edx
; X86-NEXT: .LBB3_7:
; X86-NEXT: addl $4, %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
; X86-NEXT: popl %ebp
; X86-NEXT: retl
; X86-FAST-LABEL: var_shift_i64:
; X86-FAST: # %bb.0:
; X86-FAST-NEXT: pushl %ebp
; X86-FAST-NEXT: pushl %ebx
; X86-FAST-NEXT: pushl %edi
; X86-FAST-NEXT: pushl %esi
; X86-FAST-NEXT: pushl %eax
; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-FAST-NEXT: movl %eax, (%esp) # 4-byte Spill
; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X86-FAST-NEXT: andl $63, %ebx
; X86-FAST-NEXT: movl %eax, %edi
; X86-FAST-NEXT: movl %ebx, %ecx
; X86-FAST-NEXT: shll %cl, %edi
; X86-FAST-NEXT: shldl %cl, %eax, %ebp
; X86-FAST-NEXT: testb $32, %bl
; X86-FAST-NEXT: je .LBB3_2
; X86-FAST-NEXT: # %bb.1:
; X86-FAST-NEXT: movl %edi, %ebp
; X86-FAST-NEXT: xorl %edi, %edi
; X86-FAST-NEXT: .LBB3_2:
; X86-FAST-NEXT: movb $64, %cl
; X86-FAST-NEXT: subb %bl, %cl
; X86-FAST-NEXT: movl %edx, %esi
; X86-FAST-NEXT: shrl %cl, %esi
; X86-FAST-NEXT: shrdl %cl, %edx, (%esp) # 4-byte Folded Spill
; X86-FAST-NEXT: testb $32, %cl
; X86-FAST-NEXT: jne .LBB3_3
; X86-FAST-NEXT: # %bb.4:
; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-FAST-NEXT: movl (%esp), %ecx # 4-byte Reload
; X86-FAST-NEXT: testl %ebx, %ebx
; X86-FAST-NEXT: jne .LBB3_6
; X86-FAST-NEXT: jmp .LBB3_7
; X86-FAST-NEXT: .LBB3_3:
; X86-FAST-NEXT: movl %esi, %ecx
; X86-FAST-NEXT: xorl %esi, %esi
; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-FAST-NEXT: testl %ebx, %ebx
; X86-FAST-NEXT: je .LBB3_7
; X86-FAST-NEXT: .LBB3_6:
; X86-FAST-NEXT: orl %esi, %ebp
; X86-FAST-NEXT: orl %ecx, %edi
; X86-FAST-NEXT: movl %edi, %eax
; X86-FAST-NEXT: movl %ebp, %edx
; X86-FAST-NEXT: .LBB3_7:
; X86-FAST-NEXT: addl $4, %esp
; X86-FAST-NEXT: popl %esi
; X86-FAST-NEXT: popl %edi
; X86-FAST-NEXT: popl %ebx
; X86-FAST-NEXT: popl %ebp
; X86-FAST-NEXT: retl
;
; X86-SLOW-LABEL: var_shift_i64:
; X86-SLOW: # %bb.0:
; X86-SLOW-NEXT: pushl %ebp
; X86-SLOW-NEXT: pushl %ebx
; X86-SLOW-NEXT: pushl %edi
; X86-SLOW-NEXT: pushl %esi
; X86-SLOW-NEXT: subl $8, %esp
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X86-SLOW-NEXT: andl $63, %ebx
; X86-SLOW-NEXT: movb $64, %dh
; X86-SLOW-NEXT: subb %bl, %dh
; X86-SLOW-NEXT: movl %eax, (%esp) # 4-byte Spill
; X86-SLOW-NEXT: movb %dh, %cl
; X86-SLOW-NEXT: shrl %cl, %eax
; X86-SLOW-NEXT: movb %dh, %dl
; X86-SLOW-NEXT: andb $31, %dl
; X86-SLOW-NEXT: movl %edx, %ecx
; X86-SLOW-NEXT: negb %cl
; X86-SLOW-NEXT: movl %esi, %ebp
; X86-SLOW-NEXT: shll %cl, %ebp
; X86-SLOW-NEXT: testb %dl, %dl
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SLOW-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SLOW-NEXT: je .LBB3_2
; X86-SLOW-NEXT: # %bb.1:
; X86-SLOW-NEXT: orl %eax, %ebp
; X86-SLOW-NEXT: movl %ebp, (%esp) # 4-byte Spill
; X86-SLOW-NEXT: .LBB3_2:
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-SLOW-NEXT: movl %ebp, %eax
; X86-SLOW-NEXT: movl %ebx, %ecx
; X86-SLOW-NEXT: shll %cl, %eax
; X86-SLOW-NEXT: movb %bl, %ch
; X86-SLOW-NEXT: andb $31, %ch
; X86-SLOW-NEXT: movb %ch, %cl
; X86-SLOW-NEXT: negb %cl
; X86-SLOW-NEXT: shrl %cl, %edi
; X86-SLOW-NEXT: testb %ch, %ch
; X86-SLOW-NEXT: je .LBB3_4
; X86-SLOW-NEXT: # %bb.3:
; X86-SLOW-NEXT: orl %edi, %eax
; X86-SLOW-NEXT: movl %eax, %ebp
; X86-SLOW-NEXT: .LBB3_4:
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SLOW-NEXT: movl %eax, %edi
; X86-SLOW-NEXT: movl %ebx, %ecx
; X86-SLOW-NEXT: shll %cl, %edi
; X86-SLOW-NEXT: testb $32, %bl
; X86-SLOW-NEXT: je .LBB3_6
; X86-SLOW-NEXT: # %bb.5:
; X86-SLOW-NEXT: movl %edi, %ebp
; X86-SLOW-NEXT: xorl %edi, %edi
; X86-SLOW-NEXT: .LBB3_6:
; X86-SLOW-NEXT: movb %dh, %cl
; X86-SLOW-NEXT: shrl %cl, %esi
; X86-SLOW-NEXT: testb $32, %dh
; X86-SLOW-NEXT: jne .LBB3_7
; X86-SLOW-NEXT: # %bb.8:
; X86-SLOW-NEXT: movl (%esp), %ecx # 4-byte Reload
; X86-SLOW-NEXT: testl %ebx, %ebx
; X86-SLOW-NEXT: jne .LBB3_10
; X86-SLOW-NEXT: jmp .LBB3_11
; X86-SLOW-NEXT: .LBB3_7:
; X86-SLOW-NEXT: movl %esi, %ecx
; X86-SLOW-NEXT: xorl %esi, %esi
; X86-SLOW-NEXT: testl %ebx, %ebx
; X86-SLOW-NEXT: je .LBB3_11
; X86-SLOW-NEXT: .LBB3_10:
; X86-SLOW-NEXT: orl %esi, %ebp
; X86-SLOW-NEXT: orl %ecx, %edi
; X86-SLOW-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SLOW-NEXT: movl %edi, %eax
; X86-SLOW-NEXT: .LBB3_11:
; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-SLOW-NEXT: addl $8, %esp
; X86-SLOW-NEXT: popl %esi
; X86-SLOW-NEXT: popl %edi
; X86-SLOW-NEXT: popl %ebx
; X86-SLOW-NEXT: popl %ebp
; X86-SLOW-NEXT: retl
;
; X64-FAST-LABEL: var_shift_i64:
; X64-FAST: # %bb.0:
; X64-FAST-NEXT: movq %rdx, %rcx
; X64-FAST-NEXT: andl $63, %ecx
; X64-FAST-NEXT: movq %rdi, %rax
; X64-FAST-NEXT: # kill: def $cl killed $cl killed $rcx
; X64-FAST-NEXT: shldq %cl, %rsi, %rax
; X64-FAST-NEXT: testq %rcx, %rcx
; X64-FAST-NEXT: cmoveq %rdi, %rax
; X64-FAST-NEXT: retq
;
; X64-SLOW-LABEL: var_shift_i64:
; X64-SLOW: # %bb.0:
; X64-SLOW-NEXT: movq %rsi, %rax
; X64-SLOW-NEXT: andl $63, %edx
; X64-SLOW-NEXT: movq %rdi, %rsi
; X64-SLOW-NEXT: movl %edx, %ecx
; X64-SLOW-NEXT: shlq %cl, %rsi
; X64-SLOW-NEXT: andb $63, %dl
; X64-SLOW-NEXT: movl %edx, %ecx
; X64-SLOW-NEXT: negl %ecx
; X64-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-SLOW-NEXT: negb %cl
; X64-SLOW-NEXT: shrq %cl, %rax
; X64-SLOW-NEXT: orq %rsi, %rax
; X64-SLOW-NEXT: testq %rdx, %rdx
; X64-SLOW-NEXT: testb %dl, %dl
; X64-SLOW-NEXT: cmoveq %rdi, %rax
; X64-SLOW-NEXT: retq
%tmp = tail call i64 @llvm.fshl.i64(i64 %x, i64 %y, i64 %z)


@ -58,20 +58,11 @@ define i8 @var_shift_i8(i8 %x, i8 %y, i8 %z) nounwind {
define i16 @var_shift_i16(i16 %x, i16 %y, i16 %z) nounwind {
; X86-FAST-LABEL: var_shift_i16:
; X86-FAST: # %bb.0:
; X86-FAST-NEXT: pushl %esi
; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-FAST-NEXT: movzwl {{[0-9]+}}(%esp), %esi
; X86-FAST-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
; X86-FAST-NEXT: andl $15, %ecx
; X86-FAST-NEXT: movl %eax, %edx
; X86-FAST-NEXT: shrdw %cl, %si, %dx
; X86-FAST-NEXT: testw %cx, %cx
; X86-FAST-NEXT: je .LBB1_2
; X86-FAST-NEXT: # %bb.1:
; X86-FAST-NEXT: movl %edx, %eax
; X86-FAST-NEXT: .LBB1_2:
; X86-FAST-NEXT: # kill: def $ax killed $ax killed $eax
; X86-FAST-NEXT: popl %esi
; X86-FAST-NEXT: movzwl {{[0-9]+}}(%esp), %edx
; X86-FAST-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-FAST-NEXT: movb {{[0-9]+}}(%esp), %cl
; X86-FAST-NEXT: andb $15, %cl
; X86-FAST-NEXT: shrdw %cl, %dx, %ax
; X86-FAST-NEXT: retl
;
; X86-SLOW-LABEL: var_shift_i16:
@ -79,17 +70,16 @@ define i16 @var_shift_i16(i16 %x, i16 %y, i16 %z) nounwind {
; X86-SLOW-NEXT: pushl %edi
; X86-SLOW-NEXT: pushl %esi
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-SLOW-NEXT: movzwl {{[0-9]+}}(%esp), %edx
; X86-SLOW-NEXT: andl $15, %edx
; X86-SLOW-NEXT: movb {{[0-9]+}}(%esp), %dl
; X86-SLOW-NEXT: andb $15, %dl
; X86-SLOW-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-SLOW-NEXT: movl %eax, %edi
; X86-SLOW-NEXT: movl %edx, %ecx
; X86-SLOW-NEXT: shrl %cl, %edi
; X86-SLOW-NEXT: movl $16, %ecx
; X86-SLOW-NEXT: subl %edx, %ecx
; X86-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx
; X86-SLOW-NEXT: movb $16, %cl
; X86-SLOW-NEXT: subb %dl, %cl
; X86-SLOW-NEXT: shll %cl, %esi
; X86-SLOW-NEXT: testw %dx, %dx
; X86-SLOW-NEXT: testb %dl, %dl
; X86-SLOW-NEXT: je .LBB1_2
; X86-SLOW-NEXT: # %bb.1:
; X86-SLOW-NEXT: orl %edi, %esi
@ -103,26 +93,24 @@ define i16 @var_shift_i16(i16 %x, i16 %y, i16 %z) nounwind {
; X64-FAST-LABEL: var_shift_i16:
; X64-FAST: # %bb.0:
; X64-FAST-NEXT: movl %edx, %ecx
; X64-FAST-NEXT: andl $15, %ecx
; X64-FAST-NEXT: movl %esi, %eax
; X64-FAST-NEXT: andb $15, %cl
; X64-FAST-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-FAST-NEXT: shrdw %cl, %di, %ax
; X64-FAST-NEXT: testw %cx, %cx
; X64-FAST-NEXT: cmovel %esi, %eax
; X64-FAST-NEXT: # kill: def $ax killed $ax killed $eax
; X64-FAST-NEXT: retq
;
; X64-SLOW-LABEL: var_shift_i16:
; X64-SLOW: # %bb.0:
; X64-SLOW-NEXT: movzwl %si, %eax
; X64-SLOW-NEXT: andl $15, %edx
; X64-SLOW-NEXT: andb $15, %dl
; X64-SLOW-NEXT: movl %edx, %ecx
; X64-SLOW-NEXT: shrl %cl, %eax
; X64-SLOW-NEXT: movl $16, %ecx
; X64-SLOW-NEXT: subl %edx, %ecx
; X64-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-SLOW-NEXT: movb $16, %cl
; X64-SLOW-NEXT: subb %dl, %cl
; X64-SLOW-NEXT: shll %cl, %edi
; X64-SLOW-NEXT: orl %edi, %eax
; X64-SLOW-NEXT: testw %dx, %dx
; X64-SLOW-NEXT: testb %dl, %dl
; X64-SLOW-NEXT: cmovel %esi, %eax
; X64-SLOW-NEXT: # kill: def $ax killed $ax killed $eax
; X64-SLOW-NEXT: retq
@ -133,19 +121,10 @@ define i16 @var_shift_i16(i16 %x, i16 %y, i16 %z) nounwind {
define i32 @var_shift_i32(i32 %x, i32 %y, i32 %z) nounwind {
; X86-FAST-LABEL: var_shift_i32:
; X86-FAST: # %bb.0:
; X86-FAST-NEXT: pushl %esi
; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-FAST-NEXT: movb {{[0-9]+}}(%esp), %cl
; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-FAST-NEXT: andl $31, %ecx
; X86-FAST-NEXT: movl %eax, %edx
; X86-FAST-NEXT: shrdl %cl, %esi, %edx
; X86-FAST-NEXT: testl %ecx, %ecx
; X86-FAST-NEXT: je .LBB2_2
; X86-FAST-NEXT: # %bb.1:
; X86-FAST-NEXT: movl %edx, %eax
; X86-FAST-NEXT: .LBB2_2:
; X86-FAST-NEXT: popl %esi
; X86-FAST-NEXT: shrdl %cl, %edx, %eax
; X86-FAST-NEXT: retl
;
; X86-SLOW-LABEL: var_shift_i32:
@ -153,17 +132,16 @@ define i32 @var_shift_i32(i32 %x, i32 %y, i32 %z) nounwind {
; X86-SLOW-NEXT: pushl %edi
; X86-SLOW-NEXT: pushl %esi
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-SLOW-NEXT: movb {{[0-9]+}}(%esp), %dl
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-SLOW-NEXT: andl $31, %edx
; X86-SLOW-NEXT: movl %eax, %edi
; X86-SLOW-NEXT: movl %edx, %ecx
; X86-SLOW-NEXT: shrl %cl, %edi
; X86-SLOW-NEXT: andb $31, %dl
; X86-SLOW-NEXT: movl %edx, %ecx
; X86-SLOW-NEXT: negl %ecx
; X86-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx
; X86-SLOW-NEXT: negb %cl
; X86-SLOW-NEXT: shll %cl, %esi
; X86-SLOW-NEXT: testl %edx, %edx
; X86-SLOW-NEXT: testb %dl, %dl
; X86-SLOW-NEXT: je .LBB2_2
; X86-SLOW-NEXT: # %bb.1:
; X86-SLOW-NEXT: orl %edi, %esi
@ -176,26 +154,23 @@ define i32 @var_shift_i32(i32 %x, i32 %y, i32 %z) nounwind {
; X64-FAST-LABEL: var_shift_i32:
; X64-FAST: # %bb.0:
; X64-FAST-NEXT: movl %edx, %ecx
; X64-FAST-NEXT: andl $31, %ecx
; X64-FAST-NEXT: movl %esi, %eax
; X64-FAST-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-FAST-NEXT: shrdl %cl, %edi, %eax
; X64-FAST-NEXT: testl %ecx, %ecx
; X64-FAST-NEXT: cmovel %esi, %eax
; X64-FAST-NEXT: retq
;
; X64-SLOW-LABEL: var_shift_i32:
; X64-SLOW: # %bb.0:
; X64-SLOW-NEXT: movl %edi, %eax
; X64-SLOW-NEXT: andl $31, %edx
; X64-SLOW-NEXT: movl %esi, %edi
; X64-SLOW-NEXT: movl %edx, %ecx
; X64-SLOW-NEXT: shrl %cl, %edi
; X64-SLOW-NEXT: andb $31, %dl
; X64-SLOW-NEXT: movl %edx, %ecx
; X64-SLOW-NEXT: negl %ecx
; X64-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-SLOW-NEXT: negb %cl
; X64-SLOW-NEXT: shll %cl, %eax
; X64-SLOW-NEXT: orl %edi, %eax
; X64-SLOW-NEXT: testl %edx, %edx
; X64-SLOW-NEXT: testb %dl, %dl
; X64-SLOW-NEXT: cmovel %esi, %eax
; X64-SLOW-NEXT: retq
%tmp = tail call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 %z)
@ -203,81 +178,164 @@ define i32 @var_shift_i32(i32 %x, i32 %y, i32 %z) nounwind {
}
define i64 @var_shift_i64(i64 %x, i64 %y, i64 %z) nounwind {
; X86-LABEL: var_shift_i64:
; X86: # %bb.0:
; X86-NEXT: pushl %ebp
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: pushl %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X86-NEXT: andl $63, %ebx
; X86-NEXT: movb $64, %cl
; X86-NEXT: subb %bl, %cl
; X86-NEXT: movl %eax, %edi
; X86-NEXT: shll %cl, %edi
; X86-NEXT: shldl %cl, %eax, %esi
; X86-NEXT: testb $32, %cl
; X86-NEXT: je .LBB3_2
; X86-NEXT: # %bb.1:
; X86-NEXT: movl %edi, %esi
; X86-NEXT: xorl %edi, %edi
; X86-NEXT: .LBB3_2:
; X86-NEXT: movl %edx, %ebp
; X86-NEXT: movl %ebx, %ecx
; X86-NEXT: shrl %cl, %ebp
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: shrdl %cl, %edx, %eax
; X86-NEXT: testb $32, %bl
; X86-NEXT: je .LBB3_4
; X86-NEXT: # %bb.3:
; X86-NEXT: movl %ebp, %eax
; X86-NEXT: xorl %ebp, %ebp
; X86-NEXT: .LBB3_4:
; X86-NEXT: testl %ebx, %ebx
; X86-NEXT: je .LBB3_6
; X86-NEXT: # %bb.5:
; X86-NEXT: orl %ebp, %esi
; X86-NEXT: orl %eax, %edi
; X86-NEXT: movl %edi, (%esp) # 4-byte Spill
; X86-NEXT: movl %esi, %edx
; X86-NEXT: .LBB3_6:
; X86-NEXT: movl (%esp), %eax # 4-byte Reload
; X86-NEXT: addl $4, %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
; X86-NEXT: popl %ebp
; X86-NEXT: retl
; X86-FAST-LABEL: var_shift_i64:
; X86-FAST: # %bb.0:
; X86-FAST-NEXT: pushl %ebp
; X86-FAST-NEXT: pushl %ebx
; X86-FAST-NEXT: pushl %edi
; X86-FAST-NEXT: pushl %esi
; X86-FAST-NEXT: pushl %eax
; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-FAST-NEXT: movl %eax, (%esp) # 4-byte Spill
; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X86-FAST-NEXT: andl $63, %ebx
; X86-FAST-NEXT: movb $64, %cl
; X86-FAST-NEXT: subb %bl, %cl
; X86-FAST-NEXT: movl %eax, %edi
; X86-FAST-NEXT: shll %cl, %edi
; X86-FAST-NEXT: shldl %cl, %eax, %esi
; X86-FAST-NEXT: testb $32, %cl
; X86-FAST-NEXT: je .LBB3_2
; X86-FAST-NEXT: # %bb.1:
; X86-FAST-NEXT: movl %edi, %esi
; X86-FAST-NEXT: xorl %edi, %edi
; X86-FAST-NEXT: .LBB3_2:
; X86-FAST-NEXT: movl %edx, %ebp
; X86-FAST-NEXT: movl %ebx, %ecx
; X86-FAST-NEXT: shrl %cl, %ebp
; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-FAST-NEXT: shrdl %cl, %edx, %eax
; X86-FAST-NEXT: testb $32, %bl
; X86-FAST-NEXT: je .LBB3_4
; X86-FAST-NEXT: # %bb.3:
; X86-FAST-NEXT: movl %ebp, %eax
; X86-FAST-NEXT: xorl %ebp, %ebp
; X86-FAST-NEXT: .LBB3_4:
; X86-FAST-NEXT: testl %ebx, %ebx
; X86-FAST-NEXT: je .LBB3_6
; X86-FAST-NEXT: # %bb.5:
; X86-FAST-NEXT: orl %ebp, %esi
; X86-FAST-NEXT: orl %eax, %edi
; X86-FAST-NEXT: movl %edi, (%esp) # 4-byte Spill
; X86-FAST-NEXT: movl %esi, %edx
; X86-FAST-NEXT: .LBB3_6:
; X86-FAST-NEXT: movl (%esp), %eax # 4-byte Reload
; X86-FAST-NEXT: addl $4, %esp
; X86-FAST-NEXT: popl %esi
; X86-FAST-NEXT: popl %edi
; X86-FAST-NEXT: popl %ebx
; X86-FAST-NEXT: popl %ebp
; X86-FAST-NEXT: retl
;
; X86-SLOW-LABEL: var_shift_i64:
; X86-SLOW: # %bb.0:
; X86-SLOW-NEXT: pushl %ebp
; X86-SLOW-NEXT: pushl %ebx
; X86-SLOW-NEXT: pushl %edi
; X86-SLOW-NEXT: pushl %esi
; X86-SLOW-NEXT: subl $8, %esp
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SLOW-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X86-SLOW-NEXT: andl $63, %ebx
; X86-SLOW-NEXT: movb $64, %al
; X86-SLOW-NEXT: subb %bl, %al
; X86-SLOW-NEXT: movl %edx, (%esp) # 4-byte Spill
; X86-SLOW-NEXT: movl %eax, %ecx
; X86-SLOW-NEXT: shll %cl, %edx
; X86-SLOW-NEXT: movb %al, %ch
; X86-SLOW-NEXT: andb $31, %ch
; X86-SLOW-NEXT: movb %ch, %cl
; X86-SLOW-NEXT: negb %cl
; X86-SLOW-NEXT: movl %esi, %edi
; X86-SLOW-NEXT: shrl %cl, %edi
; X86-SLOW-NEXT: testb %ch, %ch
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-SLOW-NEXT: je .LBB3_2
; X86-SLOW-NEXT: # %bb.1:
; X86-SLOW-NEXT: orl %edi, %edx
; X86-SLOW-NEXT: movl %edx, (%esp) # 4-byte Spill
; X86-SLOW-NEXT: .LBB3_2:
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SLOW-NEXT: movl %ecx, %edx
; X86-SLOW-NEXT: movl %ebx, %ecx
; X86-SLOW-NEXT: shrl %cl, %edx
; X86-SLOW-NEXT: movb %bl, %ah
; X86-SLOW-NEXT: andb $31, %ah
; X86-SLOW-NEXT: movb %ah, %cl
; X86-SLOW-NEXT: negb %cl
; X86-SLOW-NEXT: movl %ebp, %edi
; X86-SLOW-NEXT: shll %cl, %edi
; X86-SLOW-NEXT: testb %ah, %ah
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-SLOW-NEXT: je .LBB3_4
; X86-SLOW-NEXT: # %bb.3:
; X86-SLOW-NEXT: orl %edx, %edi
; X86-SLOW-NEXT: movl %edi, %ebp
; X86-SLOW-NEXT: .LBB3_4:
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-SLOW-NEXT: movl %ebx, %ecx
; X86-SLOW-NEXT: shrl %cl, %edi
; X86-SLOW-NEXT: testb $32, %bl
; X86-SLOW-NEXT: je .LBB3_6
; X86-SLOW-NEXT: # %bb.5:
; X86-SLOW-NEXT: movl %edi, %ebp
; X86-SLOW-NEXT: xorl %edi, %edi
; X86-SLOW-NEXT: .LBB3_6:
; X86-SLOW-NEXT: movl %eax, %ecx
; X86-SLOW-NEXT: shll %cl, %esi
; X86-SLOW-NEXT: testb $32, %al
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-SLOW-NEXT: jne .LBB3_7
; X86-SLOW-NEXT: # %bb.8:
; X86-SLOW-NEXT: movl (%esp), %eax # 4-byte Reload
; X86-SLOW-NEXT: testl %ebx, %ebx
; X86-SLOW-NEXT: jne .LBB3_10
; X86-SLOW-NEXT: jmp .LBB3_11
; X86-SLOW-NEXT: .LBB3_7:
; X86-SLOW-NEXT: movl %esi, %eax
; X86-SLOW-NEXT: xorl %esi, %esi
; X86-SLOW-NEXT: testl %ebx, %ebx
; X86-SLOW-NEXT: je .LBB3_11
; X86-SLOW-NEXT: .LBB3_10:
; X86-SLOW-NEXT: orl %ebp, %esi
; X86-SLOW-NEXT: orl %edi, %eax
; X86-SLOW-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SLOW-NEXT: movl %eax, %edx
; X86-SLOW-NEXT: .LBB3_11:
; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-SLOW-NEXT: addl $8, %esp
; X86-SLOW-NEXT: popl %esi
; X86-SLOW-NEXT: popl %edi
; X86-SLOW-NEXT: popl %ebx
; X86-SLOW-NEXT: popl %ebp
; X86-SLOW-NEXT: retl
;
; X64-FAST-LABEL: var_shift_i64:
; X64-FAST: # %bb.0:
; X64-FAST-NEXT: movq %rdx, %rcx
; X64-FAST-NEXT: andl $63, %ecx
; X64-FAST-NEXT: movq %rsi, %rax
; X64-FAST-NEXT: # kill: def $cl killed $cl killed $rcx
; X64-FAST-NEXT: shrdq %cl, %rdi, %rax
; X64-FAST-NEXT: testq %rcx, %rcx
; X64-FAST-NEXT: cmoveq %rsi, %rax
; X64-FAST-NEXT: retq
;
; X64-SLOW-LABEL: var_shift_i64:
; X64-SLOW: # %bb.0:
; X64-SLOW-NEXT: movq %rdi, %rax
; X64-SLOW-NEXT: andl $63, %edx
; X64-SLOW-NEXT: movq %rsi, %rdi
; X64-SLOW-NEXT: movl %edx, %ecx
; X64-SLOW-NEXT: shrq %cl, %rdi
; X64-SLOW-NEXT: andb $63, %dl
; X64-SLOW-NEXT: movl %edx, %ecx
; X64-SLOW-NEXT: negl %ecx
; X64-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-SLOW-NEXT: negb %cl
; X64-SLOW-NEXT: shlq %cl, %rax
; X64-SLOW-NEXT: orq %rdi, %rax
; X64-SLOW-NEXT: testq %rdx, %rdx
; X64-SLOW-NEXT: testb %dl, %dl
; X64-SLOW-NEXT: cmoveq %rsi, %rax
; X64-SLOW-NEXT: retq
%tmp = tail call i64 @llvm.fshr.i64(i64 %x, i64 %y, i64 %z)
@ -315,7 +373,7 @@ define i16 @const_shift_i16(i16 %x, i16 %y) nounwind {
; X86-FAST: # %bb.0:
; X86-FAST-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
; X86-FAST-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-FAST-NEXT: shldw $9, %cx, %ax
; X86-FAST-NEXT: shrdw $7, %cx, %ax
; X86-FAST-NEXT: retl
;
; X86-SLOW-LABEL: const_shift_i16:
@ -330,8 +388,8 @@ define i16 @const_shift_i16(i16 %x, i16 %y) nounwind {
;
; X64-FAST-LABEL: const_shift_i16:
; X64-FAST: # %bb.0:
; X64-FAST-NEXT: movl %edi, %eax
; X64-FAST-NEXT: shldw $9, %si, %ax
; X64-FAST-NEXT: movl %esi, %eax
; X64-FAST-NEXT: shrdw $7, %di, %ax
; X64-FAST-NEXT: # kill: def $ax killed $ax killed $eax
; X64-FAST-NEXT: retq
;
@ -352,7 +410,7 @@ define i32 @const_shift_i32(i32 %x, i32 %y) nounwind {
; X86-FAST: # %bb.0:
; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-FAST-NEXT: shldl $25, %ecx, %eax
; X86-FAST-NEXT: shrdl $7, %ecx, %eax
; X86-FAST-NEXT: retl
;
; X86-SLOW-LABEL: const_shift_i32:


@ -14,31 +14,23 @@ declare i32 @llvm.fshr.i32(i32, i32, i32)
declare i64 @llvm.fshr.i64(i64, i64, i64)
declare <4 x i32> @llvm.fshr.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
; General case - all operands can be variables - x86 has shld, but the mask and cmov are not needed?
; General case - all operands can be variables
define i32 @fshl_i32(i32 %x, i32 %y, i32 %z) nounwind {
; X32-SSE2-LABEL: fshl_i32:
; X32-SSE2: # %bb.0:
; X32-SSE2-NEXT: pushl %esi
; X32-SSE2-NEXT: movb {{[0-9]+}}(%esp), %cl
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %esi
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-SSE2-NEXT: andl $31, %ecx
; X32-SSE2-NEXT: movl %esi, %eax
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE2-NEXT: shldl %cl, %edx, %eax
; X32-SSE2-NEXT: testl %ecx, %ecx
; X32-SSE2-NEXT: cmovel %esi, %eax
; X32-SSE2-NEXT: popl %esi
; X32-SSE2-NEXT: retl
;
; X64-AVX2-LABEL: fshl_i32:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: movl %edx, %ecx
; X64-AVX2-NEXT: andl $31, %ecx
; X64-AVX2-NEXT: movl %edi, %eax
; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-AVX2-NEXT: shldl %cl, %esi, %eax
; X64-AVX2-NEXT: testl %ecx, %ecx
; X64-AVX2-NEXT: cmovel %edi, %eax
; X64-AVX2-NEXT: retq
%f = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 %z)
ret i32 %f
@ -212,31 +204,23 @@ define i8 @fshl_i8_const_fold() nounwind {
; Repeat everything for funnel shift right.
; General case - all operands can be variables - x86 has 'shrd', but the mask and cmov are not needed?
; General case - all operands can be variables
define i32 @fshr_i32(i32 %x, i32 %y, i32 %z) nounwind {
; X32-SSE2-LABEL: fshr_i32:
; X32-SSE2: # %bb.0:
; X32-SSE2-NEXT: pushl %esi
; X32-SSE2-NEXT: movb {{[0-9]+}}(%esp), %cl
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %esi
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-SSE2-NEXT: andl $31, %ecx
; X32-SSE2-NEXT: movl %esi, %eax
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE2-NEXT: shrdl %cl, %edx, %eax
; X32-SSE2-NEXT: testl %ecx, %ecx
; X32-SSE2-NEXT: cmovel %esi, %eax
; X32-SSE2-NEXT: popl %esi
; X32-SSE2-NEXT: retl
;
; X64-AVX2-LABEL: fshr_i32:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: movl %edx, %ecx
; X64-AVX2-NEXT: andl $31, %ecx
; X64-AVX2-NEXT: movl %esi, %eax
; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-AVX2-NEXT: shrdl %cl, %edi, %eax
; X64-AVX2-NEXT: testl %ecx, %ecx
; X64-AVX2-NEXT: cmovel %esi, %eax
; X64-AVX2-NEXT: retq
%f = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 %z)
ret i32 %f
@ -341,7 +325,7 @@ define i32 @fshr_i32_const_shift(i32 %x, i32 %y) nounwind {
; X32-SSE2: # %bb.0:
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE2-NEXT: shldl $23, %ecx, %eax
; X32-SSE2-NEXT: shrdl $9, %ecx, %eax
; X32-SSE2-NEXT: retl
;
; X64-AVX2-LABEL: fshr_i32_const_shift:
@ -353,14 +337,14 @@ define i32 @fshr_i32_const_shift(i32 %x, i32 %y) nounwind {
ret i32 %f
}
; Check modulo math on shift amount. 41-32=9, but right-shift became left, so 32-9=23.
; Check modulo math on shift amount. 41-32=9, but right-shift may become left, so 32-9=23.
define i32 @fshr_i32_const_overshift(i32 %x, i32 %y) nounwind {
; X32-SSE2-LABEL: fshr_i32_const_overshift:
; X32-SSE2: # %bb.0:
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE2-NEXT: shldl $23, %ecx, %eax
; X32-SSE2-NEXT: shrdl $9, %ecx, %eax
; X32-SSE2-NEXT: retl
;
; X64-AVX2-LABEL: fshr_i32_const_overshift: