[Intrinsic] Add fixed point saturating division intrinsics.

Summary: This patch adds intrinsics and ISelDAG nodes for signed and unsigned fixed-point division: ``` llvm.sdiv.fix.sat.* llvm.udiv.fix.sat.* ``` These intrinsics perform scaled, saturating division on two integers or vectors of integers. They are required for the implementation of the Embedded-C fixed-point arithmetic in Clang. Reviewers: bjope, leonardchan, craig.topper Subscribers: hiraditya, jdoerfert, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D71550
2024-11-22 02:33:06 +01:00 · 2019-12-16 15:25:52 +01:00 · 2019-12-16 15:25:52 +01:00 · 4f8b0d2f56
commit 4f8b0d2f56
parent 27f50511fb
16 changed files with 2250 additions and 51 deletions
--- a/docs/LangRef.rst
+++ b/docs/LangRef.rst
@ -14331,6 +14331,136 @@ Examples
      %res = call i4 @llvm.udiv.fix.i4(i4 3, i4 4, i32 1)  ; %res = 2 (or 1) (1.5 / 2 = 0.75)


+'``llvm.sdiv.fix.sat.*``' Intrinsics
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax
+"""""""
+
+This is an overloaded intrinsic. You can use ``llvm.sdiv.fix.sat``
+on any integer bit width or vectors of integers.
+
+::
+
+      declare i16 @llvm.sdiv.fix.sat.i16(i16 %a, i16 %b, i32 %scale)
+      declare i32 @llvm.sdiv.fix.sat.i32(i32 %a, i32 %b, i32 %scale)
+      declare i64 @llvm.sdiv.fix.sat.i64(i64 %a, i64 %b, i32 %scale)
+      declare <4 x i32> @llvm.sdiv.fix.sat.v4i32(<4 x i32> %a, <4 x i32> %b, i32 %scale)
+
+Overview
+"""""""""
+
+The '``llvm.sdiv.fix.sat``' family of intrinsic functions perform signed
+fixed point saturation division on 2 arguments of the same scale.
+
+Arguments
+""""""""""
+
+The arguments (%a and %b) and the result may be of integer types of any bit
+width, but they must have the same bit width. ``%a`` and ``%b`` are the two
+values that will undergo signed fixed point division. The argument
+``%scale`` represents the scale of both operands, and must be a constant
+integer.
+
+Semantics:
+""""""""""
+
+This operation performs fixed point division on the 2 arguments of a
+specified scale. The result will also be returned in the same scale specified
+in the third argument.
+
+If the result value cannot be precisely represented in the given scale, the
+value is rounded up or down to the closest representable value. The rounding
+direction is unspecified.
+
+The maximum value this operation can clamp to is the largest signed value
+representable by the bit width of the first 2 arguments. The minimum value is the
+smallest signed value representable by this bit width.
+
+It is undefined behavior if the second argument is zero.
+
+
+Examples
+"""""""""
+
+.. code-block:: llvm
+
+      %res = call i4 @llvm.sdiv.fix.sat.i4(i4 6, i4 2, i32 0)  ; %res = 3 (6 / 2 = 3)
+      %res = call i4 @llvm.sdiv.fix.sat.i4(i4 6, i4 4, i32 1)  ; %res = 3 (3 / 2 = 1.5)
+      %res = call i4 @llvm.sdiv.fix.sat.i4(i4 3, i4 -2, i32 1) ; %res = -3 (1.5 / -1 = -1.5)
+
+      ; The result in the following could be rounded up to 1 or down to 0.5
+      %res = call i4 @llvm.sdiv.fix.sat.i4(i4 3, i4 4, i32 1)  ; %res = 2 (or 1) (1.5 / 2 = 0.75)
+
+      ; Saturation
+      %res = call i4 @llvm.sdiv.fix.sat.i4(i4 -8, i4 -1, i32 0)  ; %res = 7 (-8 / -1 = 8 => 7)
+      %res = call i4 @llvm.sdiv.fix.sat.i4(i4 4, i4 2, i32 2)  ; %res = 7 (1 / 0.5 = 2 => 1.75)
+      %res = call i4 @llvm.sdiv.fix.sat.i4(i4 -4, i4 1, i32 2)  ; %res = -8 (-1 / 0.25 = -4 => -2)
+
+
+'``llvm.udiv.fix.sat.*``' Intrinsics
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax
+"""""""
+
+This is an overloaded intrinsic. You can use ``llvm.udiv.fix.sat``
+on any integer bit width or vectors of integers.
+
+::
+
+      declare i16 @llvm.udiv.fix.sat.i16(i16 %a, i16 %b, i32 %scale)
+      declare i32 @llvm.udiv.fix.sat.i32(i32 %a, i32 %b, i32 %scale)
+      declare i64 @llvm.udiv.fix.sat.i64(i64 %a, i64 %b, i32 %scale)
+      declare <4 x i32> @llvm.udiv.fix.sat.v4i32(<4 x i32> %a, <4 x i32> %b, i32 %scale)
+
+Overview
+"""""""""
+
+The '``llvm.udiv.fix.sat``' family of intrinsic functions perform unsigned
+fixed point saturation division on 2 arguments of the same scale.
+
+Arguments
+""""""""""
+
+The arguments (%a and %b) and the result may be of integer types of any bit
+width, but they must have the same bit width. ``%a`` and ``%b`` are the two
+values that will undergo unsigned fixed point division. The argument
+``%scale`` represents the scale of both operands, and must be a constant
+integer.
+
+Semantics:
+""""""""""
+
+This operation performs fixed point division on the 2 arguments of a
+specified scale. The result will also be returned in the same scale specified
+in the third argument.
+
+If the result value cannot be precisely represented in the given scale, the
+value is rounded up or down to the closest representable value. The rounding
+direction is unspecified.
+
+The maximum value this operation can clamp to is the largest unsigned value
+representable by the bit width of the first 2 arguments. The minimum value is the
+smallest unsigned value representable by this bit width (zero).
+
+It is undefined behavior if the second argument is zero.
+
+Examples
+"""""""""
+
+.. code-block:: llvm
+
+      %res = call i4 @llvm.udiv.fix.sat.i4(i4 6, i4 2, i32 0)  ; %res = 3 (6 / 2 = 3)
+      %res = call i4 @llvm.udiv.fix.sat.i4(i4 6, i4 4, i32 1)  ; %res = 3 (3 / 2 = 1.5)
+
+      ; The result in the following could be rounded down to 0.5 or up to 1
+      %res = call i4 @llvm.udiv.fix.sat.i4(i4 3, i4 4, i32 1)  ; %res = 1 (or 2) (1.5 / 2 = 0.75)
+
+      ; Saturation
+      %res = call i4 @llvm.udiv.fix.sat.i4(i4 8, i4 2, i32 2)  ; %res = 15 (2 / 0.5 = 4 => 3.75)
+
+
 Specialised Arithmetic Intrinsics
 ---------------------------------

--- a/include/llvm/CodeGen/ISDOpcodes.h
+++ b/include/llvm/CodeGen/ISDOpcodes.h
@ -291,6 +291,11 @@ namespace ISD {
    /// constant integer.
    SDIVFIX, UDIVFIX,

+    /// Same as the corresponding unsaturated fixed point instructions, but the
+    /// result is clamped between the min and max values representable by the
+    /// bits of the first 2 operands.
+    SDIVFIXSAT, UDIVFIXSAT,
+
    /// Simple binary floating point operators.
    FADD, FSUB, FMUL, FDIV, FREM,

--- a/include/llvm/CodeGen/TargetLowering.h
+++ b/include/llvm/CodeGen/TargetLowering.h
@ -1043,7 +1043,9 @@ public:
    case ISD::UMULFIX:
    case ISD::UMULFIXSAT:
    case ISD::SDIVFIX:
+    case ISD::SDIVFIXSAT:
    case ISD::UDIVFIX:
+    case ISD::UDIVFIXSAT:
      Supported = isSupportedFixedPointOperation(Op, VT, Scale);
      break;
    }
@ -4269,7 +4271,7 @@ public:
  /// method accepts integers as its arguments.
  SDValue expandFixedPointMul(SDNode *Node, SelectionDAG &DAG) const;

-  /// Method for building the DAG expansion of ISD::[US]DIVFIX. This
+  /// Method for building the DAG expansion of ISD::[US]DIVFIX[SAT]. This
  /// method accepts integers as its arguments.
  /// Note: This method may fail if the division could not be performed
  /// within the type. Clients must retry with a wider type if this happens.
--- a/include/llvm/IR/Intrinsics.td
+++ b/include/llvm/IR/Intrinsics.td
@ -969,6 +969,14 @@ def int_umul_fix_sat : Intrinsic<[llvm_anyint_ty],
                                 [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty],
                                 [IntrNoMem, IntrSpeculatable, IntrWillReturn, Commutative, ImmArg<2>]>;

+def int_sdiv_fix_sat : Intrinsic<[llvm_anyint_ty],
+                                 [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty],
+                                 [IntrNoMem, ImmArg<2>]>;
+
+def int_udiv_fix_sat : Intrinsic<[llvm_anyint_ty],
+                                 [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty],
+                                 [IntrNoMem, ImmArg<2>]>;
+
 //===------------------------- Memory Use Markers -------------------------===//
 //
 def int_lifetime_start  : Intrinsic<[],
--- a/include/llvm/Target/TargetSelectionDAG.td
+++ b/include/llvm/Target/TargetSelectionDAG.td
@ -402,7 +402,9 @@ def smulfixsat : SDNode<"ISD::SMULFIXSAT", SDTIntScaledBinOp, [SDNPCommutative]>
 def umulfix    : SDNode<"ISD::UMULFIX"   , SDTIntScaledBinOp, [SDNPCommutative]>;
 def umulfixsat : SDNode<"ISD::UMULFIXSAT", SDTIntScaledBinOp, [SDNPCommutative]>;
 def sdivfix    : SDNode<"ISD::SDIVFIX"   , SDTIntScaledBinOp>;
+def sdivfixsat : SDNode<"ISD::SDIVFIXSAT", SDTIntScaledBinOp>;
 def udivfix    : SDNode<"ISD::UDIVFIX"   , SDTIntScaledBinOp>;
+def udivfixsat : SDNode<"ISD::UDIVFIXSAT", SDTIntScaledBinOp>;

 def sext_inreg : SDNode<"ISD::SIGN_EXTEND_INREG", SDTExtInreg>;
 def sext_invec : SDNode<"ISD::SIGN_EXTEND_VECTOR_INREG", SDTExtInvec>;
--- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@ -1132,7 +1132,9 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) {
  case ISD::UMULFIX:
  case ISD::UMULFIXSAT:
  case ISD::SDIVFIX:
-  case ISD::UDIVFIX: {
+  case ISD::SDIVFIXSAT:
+  case ISD::UDIVFIX:
+  case ISD::UDIVFIXSAT: {
    unsigned Scale = Node->getConstantOperandVal(2);
    Action = TLI.getFixedPointOperationAction(Node->getOpcode(),
                                              Node->getValueType(0), Scale);
@ -3489,7 +3491,9 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
    Results.push_back(TLI.expandFixedPointMul(Node, DAG));
    break;
  case ISD::SDIVFIX:
+  case ISD::SDIVFIXSAT:
  case ISD::UDIVFIX:
+  case ISD::UDIVFIXSAT:
    if (SDValue V = TLI.expandFixedPointDiv(Node->getOpcode(), SDLoc(Node),
                                            Node->getOperand(0),
                                            Node->getOperand(1),
--- a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@ -162,7 +162,9 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) {
  case ISD::UMULFIXSAT:  Res = PromoteIntRes_MULFIX(N); break;

  case ISD::SDIVFIX:
-  case ISD::UDIVFIX:     Res = PromoteIntRes_DIVFIX(N); break;
+  case ISD::SDIVFIXSAT:
+  case ISD::UDIVFIX:
+  case ISD::UDIVFIXSAT:  Res = PromoteIntRes_DIVFIX(N); break;

  case ISD::ABS:         Res = PromoteIntRes_ABS(N); break;

@ -784,22 +786,51 @@ SDValue DAGTypeLegalizer::PromoteIntRes_MULFIX(SDNode *N) {
                     N->getOperand(2));
 }

+static SDValue SaturateWidenedDIVFIX(SDValue V, SDLoc &dl,
+                                     unsigned SatW, bool Signed,
+                                     const TargetLowering &TLI,
+                                     SelectionDAG &DAG) {
+  EVT VT = V.getValueType();
+  unsigned VTW = VT.getScalarSizeInBits();
+
+  if (!Signed) {
+    // Saturate to the unsigned maximum by getting the minimum of V and the
+    // maximum.
+    return DAG.getNode(ISD::UMIN, dl, VT, V,
+                       DAG.getConstant(APInt::getLowBitsSet(VTW, SatW),
+                                       dl, VT));
+  }
+
+  // Saturate to the signed maximum (the low SatW - 1 bits) by taking the
+  // signed minimum of it and V.
+  V = DAG.getNode(ISD::SMIN, dl, VT, V,
+                  DAG.getConstant(APInt::getLowBitsSet(VTW, SatW - 1),
+                                  dl, VT));
+  // Saturate to the signed minimum (the high SatW + 1 bits) by taking the
+  // signed maximum of it and V.
+  V = DAG.getNode(ISD::SMAX, dl, VT, V,
+                  DAG.getConstant(APInt::getHighBitsSet(VTW, VTW - SatW + 1),
+                                  dl, VT));
+  return V;
+}
+
 static SDValue earlyExpandDIVFIX(SDNode *N, SDValue LHS, SDValue RHS,
                                 unsigned Scale, const TargetLowering &TLI,
-                                    SelectionDAG &DAG) {
+                                 SelectionDAG &DAG, unsigned SatW = 0) {
  EVT VT = LHS.getValueType();
-  bool Signed = N->getOpcode() == ISD::SDIVFIX;
+  unsigned VTSize = VT.getScalarSizeInBits();
+  bool Signed = N->getOpcode() == ISD::SDIVFIX ||
+                N->getOpcode() == ISD::SDIVFIXSAT;
+  bool Saturating = N->getOpcode() == ISD::SDIVFIXSAT ||
+                    N->getOpcode() == ISD::UDIVFIXSAT;

  SDLoc dl(N);
-  // See if we can perform the division in this type without widening.
-  if (SDValue V = TLI.expandFixedPointDiv(N->getOpcode(), dl, LHS, RHS, Scale,
-                                          DAG))
-    return V;
-
-  // If that didn't work, double the type width and try again. That must work,
-  // or something is wrong.
-  EVT WideVT = EVT::getIntegerVT(*DAG.getContext(),
-                                 VT.getScalarSizeInBits() * 2);
+  // Widen the types by a factor of two. This is guaranteed to expand, since it
+  // will always have enough high bits in the LHS to shift into.
+  EVT WideVT = EVT::getIntegerVT(*DAG.getContext(), VTSize * 2);
+  if (VT.isVector())
+    WideVT = EVT::getVectorVT(*DAG.getContext(), WideVT,
+                              VT.getVectorElementCount());
  if (Signed) {
    LHS = DAG.getSExtOrTrunc(LHS, dl, WideVT);
    RHS = DAG.getSExtOrTrunc(RHS, dl, WideVT);
@ -808,18 +839,28 @@ static SDValue earlyExpandDIVFIX(SDNode *N, SDValue LHS, SDValue RHS,
    RHS = DAG.getZExtOrTrunc(RHS, dl, WideVT);
  }

-  // TODO: Saturation.
-
  SDValue Res = TLI.expandFixedPointDiv(N->getOpcode(), dl, LHS, RHS, Scale,
                                        DAG);
  assert(Res && "Expanding DIVFIX with wide type failed?");
+  if (Saturating) {
+    // If the caller has told us to saturate at something less, use that width
+    // instead of the type before doubling. However, it cannot be more than
+    // what we just widened!
+    assert(SatW <= VTSize &&
+           "Tried to saturate to more than the original type?");
+    Res = SaturateWidenedDIVFIX(Res, dl, SatW == 0 ? VTSize : SatW, Signed,
+                                TLI, DAG);
+  }
  return DAG.getZExtOrTrunc(Res, dl, VT);
 }

 SDValue DAGTypeLegalizer::PromoteIntRes_DIVFIX(SDNode *N) {
  SDLoc dl(N);
  SDValue Op1Promoted, Op2Promoted;
-  bool Signed = N->getOpcode() == ISD::SDIVFIX;
+  bool Signed = N->getOpcode() == ISD::SDIVFIX ||
+                N->getOpcode() == ISD::SDIVFIXSAT;
+  bool Saturating = N->getOpcode() == ISD::SDIVFIXSAT ||
+                    N->getOpcode() == ISD::UDIVFIXSAT;
  if (Signed) {
    Op1Promoted = SExtPromotedInteger(N->getOperand(0));
    Op2Promoted = SExtPromotedInteger(N->getOperand(1));
@ -830,23 +871,41 @@ SDValue DAGTypeLegalizer::PromoteIntRes_DIVFIX(SDNode *N) {
  EVT PromotedType = Op1Promoted.getValueType();
  unsigned Scale = N->getConstantOperandVal(2);

-  SDValue Res;
  // If the type is already legal and the operation is legal in that type, we
  // should not early expand.
  if (TLI.isTypeLegal(PromotedType)) {
    TargetLowering::LegalizeAction Action =
        TLI.getFixedPointOperationAction(N->getOpcode(), PromotedType, Scale);
-    if (Action == TargetLowering::Legal || Action == TargetLowering::Custom)
-      Res = DAG.getNode(N->getOpcode(), dl, PromotedType, Op1Promoted,
+    if (Action == TargetLowering::Legal || Action == TargetLowering::Custom) {
+      EVT ShiftTy = TLI.getShiftAmountTy(PromotedType, DAG.getDataLayout());
+      unsigned Diff = PromotedType.getScalarSizeInBits() -
+                      N->getValueType(0).getScalarSizeInBits();
+      if (Saturating)
+        Op1Promoted = DAG.getNode(ISD::SHL, dl, PromotedType, Op1Promoted,
+                                  DAG.getConstant(Diff, dl, ShiftTy));
+      SDValue Res = DAG.getNode(N->getOpcode(), dl, PromotedType, Op1Promoted,
                                Op2Promoted, N->getOperand(2));
+      if (Saturating)
+        Res = DAG.getNode(Signed ? ISD::SRA : ISD::SRL, dl, PromotedType, Res,
+                          DAG.getConstant(Diff, dl, ShiftTy));
+      return Res;
+    }
  }

-  if (!Res)
-    Res = earlyExpandDIVFIX(N, Op1Promoted, Op2Promoted, Scale, TLI, DAG);
-
-  // TODO: Saturation.
-
+  // See if we can perform the division in this type without expanding.
+  if (SDValue Res = TLI.expandFixedPointDiv(N->getOpcode(), dl, Op1Promoted,
+                                        Op2Promoted, Scale, DAG)) {
+    if (Saturating)
+      Res = SaturateWidenedDIVFIX(Res, dl,
+                                  N->getValueType(0).getScalarSizeInBits(),
+                                  Signed, TLI, DAG);
    return Res;
+  }
+  // If we cannot, expand it to twice the type width. If we are saturating, give
+  // it the original width as a saturating width so we don't need to emit
+  // two saturations.
+  return earlyExpandDIVFIX(N, Op1Promoted, Op2Promoted, Scale, TLI, DAG,
+                            N->getValueType(0).getScalarSizeInBits());
 }

 SDValue DAGTypeLegalizer::PromoteIntRes_SADDSUBO(SDNode *N, unsigned ResNo) {
@ -1315,7 +1374,9 @@ bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) {
  case ISD::UMULFIX:
  case ISD::UMULFIXSAT:
  case ISD::SDIVFIX:
-  case ISD::UDIVFIX: Res = PromoteIntOp_FIX(N); break;
+  case ISD::SDIVFIXSAT:
+  case ISD::UDIVFIX:
+  case ISD::UDIVFIXSAT: Res = PromoteIntOp_FIX(N); break;

  case ISD::FPOWI: Res = PromoteIntOp_FPOWI(N); break;

@ -1923,7 +1984,9 @@ void DAGTypeLegalizer::ExpandIntegerResult(SDNode *N, unsigned ResNo) {
  case ISD::UMULFIXSAT: ExpandIntRes_MULFIX(N, Lo, Hi); break;

  case ISD::SDIVFIX:
-  case ISD::UDIVFIX: ExpandIntRes_DIVFIX(N, Lo, Hi); break;
+  case ISD::SDIVFIXSAT:
+  case ISD::UDIVFIX:
+  case ISD::UDIVFIXSAT: ExpandIntRes_DIVFIX(N, Lo, Hi); break;

  case ISD::VECREDUCE_ADD:
  case ISD::VECREDUCE_MUL:
@ -3253,7 +3316,14 @@ void DAGTypeLegalizer::ExpandIntRes_MULFIX(SDNode *N, SDValue &Lo,

 void DAGTypeLegalizer::ExpandIntRes_DIVFIX(SDNode *N, SDValue &Lo,
                                           SDValue &Hi) {
-  SDValue Res = earlyExpandDIVFIX(N, N->getOperand(0), N->getOperand(1),
+  SDLoc dl(N);
+  // Try expanding in the existing type first.
+  SDValue Res = TLI.expandFixedPointDiv(N->getOpcode(), dl, N->getOperand(0),
+                                        N->getOperand(1),
+                                        N->getConstantOperandVal(2), DAG);
+
+  if (!Res)
+    Res = earlyExpandDIVFIX(N, N->getOperand(0), N->getOperand(1),
                            N->getConstantOperandVal(2), TLI, DAG);
  SplitInteger(Res, Lo, Hi);
 }
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@ -142,7 +142,7 @@ class VectorLegalizer {
  void ExpandUADDSUBO(SDNode *Node, SmallVectorImpl<SDValue> &Results);
  void ExpandSADDSUBO(SDNode *Node, SmallVectorImpl<SDValue> &Results);
  void ExpandMULO(SDNode *Node, SmallVectorImpl<SDValue> &Results);
-  SDValue ExpandFixedPointDiv(SDNode *Node);
+  void ExpandFixedPointDiv(SDNode *Node, SmallVectorImpl<SDValue> &Results);
  SDValue ExpandStrictFPOp(SDNode *Node);
  void ExpandStrictFPOp(SDNode *Node, SmallVectorImpl<SDValue> &Results);

@ -463,7 +463,9 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
  case ISD::UMULFIX:
  case ISD::UMULFIXSAT:
  case ISD::SDIVFIX:
-  case ISD::UDIVFIX: {
+  case ISD::SDIVFIXSAT:
+  case ISD::UDIVFIX:
+  case ISD::UDIVFIXSAT: {
    unsigned Scale = Node->getConstantOperandVal(2);
    Action = TLI.getFixedPointOperationAction(Node->getOpcode(),
                                              Node->getValueType(0), Scale);
@ -968,8 +970,11 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl<SDValue> &Results) {
    break;
  case ISD::SDIVFIX:
  case ISD::UDIVFIX:
-    Results.push_back(ExpandFixedPointDiv(Node));
+    ExpandFixedPointDiv(Node, Results);
    return;
+  case ISD::SDIVFIXSAT:
+  case ISD::UDIVFIXSAT:
+    break;
 #define DAG_INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC, DAGN)               \
  case ISD::STRICT_##DAGN:
 #include "llvm/IR/ConstrainedOps.def"
@ -1454,12 +1459,12 @@ void VectorLegalizer::ExpandMULO(SDNode *Node,
  Results.push_back(Overflow);
 }

-SDValue VectorLegalizer::ExpandFixedPointDiv(SDNode *Node) {
+void VectorLegalizer::ExpandFixedPointDiv(SDNode *Node,
+                                          SmallVectorImpl<SDValue> &Results) {
  SDNode *N = Node;
  if (SDValue Expanded = TLI.expandFixedPointDiv(N->getOpcode(), SDLoc(N),
          N->getOperand(0), N->getOperand(1), N->getConstantOperandVal(2), DAG))
-    return Expanded;
-  return DAG.UnrollVectorOp(N);
+    Results.push_back(Expanded);
 }

 void VectorLegalizer::ExpandStrictFPOp(SDNode *Node,
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@ -166,7 +166,9 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) {
  case ISD::UMULFIX:
  case ISD::UMULFIXSAT:
  case ISD::SDIVFIX:
+  case ISD::SDIVFIXSAT:
  case ISD::UDIVFIX:
+  case ISD::UDIVFIXSAT:
    R = ScalarizeVecRes_FIX(N);
    break;
  }
@ -956,7 +958,9 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
  case ISD::UMULFIX:
  case ISD::UMULFIXSAT:
  case ISD::SDIVFIX:
+  case ISD::SDIVFIXSAT:
  case ISD::UDIVFIX:
+  case ISD::UDIVFIXSAT:
    SplitVecRes_FIX(N, Lo, Hi);
    break;
  }
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@ -5451,7 +5451,8 @@ static SDValue expandDivFix(unsigned Opcode, const SDLoc &DL,
                            SDValue LHS, SDValue RHS, SDValue Scale,
                            SelectionDAG &DAG, const TargetLowering &TLI) {
  EVT VT = LHS.getValueType();
-  bool Signed = Opcode == ISD::SDIVFIX;
+  bool Signed = Opcode == ISD::SDIVFIX || Opcode == ISD::SDIVFIXSAT;
+  bool Saturating = Opcode == ISD::SDIVFIXSAT || Opcode == ISD::UDIVFIXSAT;
  LLVMContext &Ctx = *DAG.getContext();

  // If the type is legal but the operation isn't, this node might survive all
@ -5463,14 +5464,16 @@ static SDValue expandDivFix(unsigned Opcode, const SDLoc &DL,
  // by bumping the size by one bit. This will force it to Promote, enabling the
  // early expansion and avoiding the need to expand later.

-  // We don't have to do this if Scale is 0; that can always be expanded.
+  // We don't have to do this if Scale is 0; that can always be expanded, unless
+  // it's a saturating signed operation. Those can experience true integer
+  // division overflow, a case which we must avoid.

  // FIXME: We wouldn't have to do this (or any of the early
  // expansion/promotion) if it was possible to expand a libcall of an
  // illegal type during operation legalization. But it's not, so things
  // get a bit hacky.
  unsigned ScaleInt = cast<ConstantSDNode>(Scale)->getZExtValue();
-  if (ScaleInt > 0 &&
+  if ((ScaleInt > 0 || (Saturating && Signed)) &&
      (TLI.isTypeLegal(VT) ||
       (VT.isVector() && TLI.isTypeLegal(VT.getVectorElementType())))) {
    TargetLowering::LegalizeAction Action = TLI.getFixedPointOperationAction(
@ -5492,8 +5495,16 @@ static SDValue expandDivFix(unsigned Opcode, const SDLoc &DL,
        LHS = DAG.getZExtOrTrunc(LHS, DL, PromVT);
        RHS = DAG.getZExtOrTrunc(RHS, DL, PromVT);
      }
-      // TODO: Saturation.
+      EVT ShiftTy = TLI.getShiftAmountTy(PromVT, DAG.getDataLayout());
+      // For saturating operations, we need to shift up the LHS to get the
+      // proper saturation width, and then shift down again afterwards.
+      if (Saturating)
+        LHS = DAG.getNode(ISD::SHL, DL, PromVT, LHS,
+                          DAG.getConstant(1, DL, ShiftTy));
      SDValue Res = DAG.getNode(Opcode, DL, PromVT, LHS, RHS, Scale);
+      if (Saturating)
+        Res = DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, PromVT, Res,
+                          DAG.getConstant(1, DL, ShiftTy));
      return DAG.getZExtOrTrunc(Res, DL, VT);
    }
  }
@ -5757,6 +5768,10 @@ static unsigned FixedPointIntrinsicToOpcode(unsigned Intrinsic) {
    return ISD::SDIVFIX;
  case Intrinsic::udiv_fix:
    return ISD::UDIVFIX;
+  case Intrinsic::sdiv_fix_sat:
+    return ISD::SDIVFIXSAT;
+  case Intrinsic::udiv_fix_sat:
+    return ISD::UDIVFIXSAT;
  default:
    llvm_unreachable("Unhandled fixed point intrinsic");
  }
@ -6460,7 +6475,9 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
    return;
  }
  case Intrinsic::sdiv_fix:
-  case Intrinsic::udiv_fix: {
+  case Intrinsic::udiv_fix:
+  case Intrinsic::sdiv_fix_sat:
+  case Intrinsic::udiv_fix_sat: {
    SDValue Op1 = getValue(I.getArgOperand(0));
    SDValue Op2 = getValue(I.getArgOperand(1));
    SDValue Op3 = getValue(I.getArgOperand(2));
--- a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@ -314,7 +314,9 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
  case ISD::UMULFIXSAT:                 return "umulfixsat";

  case ISD::SDIVFIX:                    return "sdivfix";
+  case ISD::SDIVFIXSAT:                 return "sdivfixsat";
  case ISD::UDIVFIX:                    return "udivfix";
+  case ISD::UDIVFIXSAT:                 return "udivfixsat";

  // Conversion operators.
  case ISD::SIGN_EXTEND:                return "sign_extend";
--- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@ -7332,12 +7332,13 @@ SDValue
 TargetLowering::expandFixedPointDiv(unsigned Opcode, const SDLoc &dl,
                                    SDValue LHS, SDValue RHS,
                                    unsigned Scale, SelectionDAG &DAG) const {
-  assert((Opcode == ISD::SDIVFIX ||
-          Opcode == ISD::UDIVFIX) &&
+  assert((Opcode == ISD::SDIVFIX || Opcode == ISD::SDIVFIXSAT ||
+          Opcode == ISD::UDIVFIX || Opcode == ISD::UDIVFIXSAT) &&
         "Expected a fixed point division opcode");

  EVT VT = LHS.getValueType();
-  bool Signed = Opcode == ISD::SDIVFIX;
+  bool Signed = Opcode == ISD::SDIVFIX || Opcode == ISD::SDIVFIXSAT;
+  bool Saturating = Opcode == ISD::SDIVFIXSAT || Opcode == ISD::UDIVFIXSAT;
  EVT BoolVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);

  // If there is enough room in the type to upscale the LHS or downscale the
@ -7349,7 +7350,15 @@ TargetLowering::expandFixedPointDiv(unsigned Opcode, const SDLoc &dl,
                            : DAG.computeKnownBits(LHS).countMinLeadingZeros();
  unsigned RHSTrail = DAG.computeKnownBits(RHS).countMinTrailingZeros();

-  if (LHSLead + RHSTrail < Scale)
+  // For signed saturating operations, we need to be able to detect true integer
+  // division overflow; that is, when you have MIN / -EPS. However, this
+  // is undefined behavior and if we emit divisions that could take such
+  // values it may cause undesired behavior (arithmetic exceptions on x86, for
+  // example).
+  // Avoid this by requiring an extra bit so that we never get this case.
+  // FIXME: This is a bit unfortunate as it means that for an 8-bit 7-scale
+  // signed saturating division, we need to emit a whopping 32-bit division.
+  if (LHSLead + RHSTrail < Scale + (unsigned)(Saturating && Signed))
    return SDValue();

  unsigned LHSShift = std::min(LHSLead, Scale);
@ -7403,8 +7412,6 @@ TargetLowering::expandFixedPointDiv(unsigned Opcode, const SDLoc &dl,
    Quot = DAG.getNode(ISD::UDIV, dl, VT,
                       LHS, RHS);

-  // TODO: Saturation.
-
  return Quot;
 }

--- a/lib/CodeGen/TargetLoweringBase.cpp
+++ b/lib/CodeGen/TargetLoweringBase.cpp
@ -660,7 +660,9 @@ void TargetLoweringBase::initActions() {
    setOperationAction(ISD::UMULFIX, VT, Expand);
    setOperationAction(ISD::UMULFIXSAT, VT, Expand);
    setOperationAction(ISD::SDIVFIX, VT, Expand);
+    setOperationAction(ISD::SDIVFIXSAT, VT, Expand);
    setOperationAction(ISD::UDIVFIX, VT, Expand);
+    setOperationAction(ISD::UDIVFIXSAT, VT, Expand);

    // Overflow operations default to expand
    setOperationAction(ISD::SADDO, VT, Expand);
--- a/lib/IR/Verifier.cpp
+++ b/lib/IR/Verifier.cpp
@ -4727,7 +4727,9 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
  case Intrinsic::umul_fix:
  case Intrinsic::umul_fix_sat:
  case Intrinsic::sdiv_fix:
-  case Intrinsic::udiv_fix: {
+  case Intrinsic::sdiv_fix_sat:
+  case Intrinsic::udiv_fix:
+  case Intrinsic::udiv_fix_sat: {
    Value *Op1 = Call.getArgOperand(0);
    Value *Op2 = Call.getArgOperand(1);
    Assert(Op1->getType()->isIntOrIntVectorTy(),
@ -4742,7 +4744,7 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
           "third argument of [us][mul|div]_fix[_sat] must fit within 32 bits");

    if (ID == Intrinsic::smul_fix || ID == Intrinsic::smul_fix_sat ||
-        ID == Intrinsic::sdiv_fix) {
+        ID == Intrinsic::sdiv_fix || ID == Intrinsic::sdiv_fix_sat) {
      Assert(
          Op3->getZExtValue() < Op1->getType()->getScalarSizeInBits(),
          "the scale of s[mul|div]_fix[_sat] must be less than the width of "
--- a/test/CodeGen/X86/sdiv_fix_sat.ll
+++ b/test/CodeGen/X86/sdiv_fix_sat.ll
--- a/test/CodeGen/X86/udiv_fix_sat.ll
+++ b/test/CodeGen/X86/udiv_fix_sat.ll
@ -0,0 +1,528 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=i686 -mattr=cmov | FileCheck %s --check-prefix=X86
+
+declare  i4  @llvm.udiv.fix.sat.i4   (i4,  i4,  i32)
+declare  i15 @llvm.udiv.fix.sat.i15  (i15, i15, i32)
+declare  i16 @llvm.udiv.fix.sat.i16  (i16, i16, i32)
+declare  i18 @llvm.udiv.fix.sat.i18  (i18, i18, i32)
+declare  i64 @llvm.udiv.fix.sat.i64  (i64, i64, i32)
+declare  <4 x i32> @llvm.udiv.fix.sat.v4i32(<4 x i32>, <4 x i32>, i32)
+
+define i16 @func(i16 %x, i16 %y) nounwind {
+; X64-LABEL: func:
+; X64:       # %bb.0:
+; X64-NEXT:    movzwl %si, %ecx
+; X64-NEXT:    movzwl %di, %eax
+; X64-NEXT:    shll $8, %eax
+; X64-NEXT:    xorl %edx, %edx
+; X64-NEXT:    divl %ecx
+; X64-NEXT:    cmpl $131071, %eax # imm = 0x1FFFF
+; X64-NEXT:    movl $131071, %ecx # imm = 0x1FFFF
+; X64-NEXT:    cmovael %ecx, %eax
+; X64-NEXT:    shrl %eax
+; X64-NEXT:    # kill: def $ax killed $ax killed $eax
+; X64-NEXT:    retq
+;
+; X86-LABEL: func:
+; X86:       # %bb.0:
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movzwl %ax, %eax
+; X86-NEXT:    shll $8, %eax
+; X86-NEXT:    xorl %edx, %edx
+; X86-NEXT:    divl %ecx
+; X86-NEXT:    cmpl $131071, %eax # imm = 0x1FFFF
+; X86-NEXT:    movl $131071, %ecx # imm = 0x1FFFF
+; X86-NEXT:    cmovael %ecx, %eax
+; X86-NEXT:    shrl %eax
+; X86-NEXT:    # kill: def $ax killed $ax killed $eax
+; X86-NEXT:    retl
+  %tmp = call i16 @llvm.udiv.fix.sat.i16(i16 %x, i16 %y, i32 7)
+  ret i16 %tmp
+}
+
+define i16 @func2(i8 %x, i8 %y) nounwind {
+; X64-LABEL: func2:
+; X64:       # %bb.0:
+; X64-NEXT:    movsbl %dil, %eax
+; X64-NEXT:    andl $32767, %eax # imm = 0x7FFF
+; X64-NEXT:    movsbl %sil, %ecx
+; X64-NEXT:    andl $32767, %ecx # imm = 0x7FFF
+; X64-NEXT:    shll $14, %eax
+; X64-NEXT:    xorl %edx, %edx
+; X64-NEXT:    divl %ecx
+; X64-NEXT:    cmpl $32767, %eax # imm = 0x7FFF
+; X64-NEXT:    movl $32767, %ecx # imm = 0x7FFF
+; X64-NEXT:    cmovbl %eax, %ecx
+; X64-NEXT:    addl %ecx, %ecx
+; X64-NEXT:    movswl %cx, %eax
+; X64-NEXT:    shrl %eax
+; X64-NEXT:    # kill: def $ax killed $ax killed $eax
+; X64-NEXT:    retq
+;
+; X86-LABEL: func2:
+; X86:       # %bb.0:
+; X86-NEXT:    movsbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    andl $32767, %ecx # imm = 0x7FFF
+; X86-NEXT:    movsbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    andl $32767, %eax # imm = 0x7FFF
+; X86-NEXT:    shll $14, %eax
+; X86-NEXT:    xorl %edx, %edx
+; X86-NEXT:    divl %ecx
+; X86-NEXT:    cmpl $32767, %eax # imm = 0x7FFF
+; X86-NEXT:    movl $32767, %ecx # imm = 0x7FFF
+; X86-NEXT:    cmovbl %eax, %ecx
+; X86-NEXT:    addl %ecx, %ecx
+; X86-NEXT:    movswl %cx, %eax
+; X86-NEXT:    shrl %eax
+; X86-NEXT:    # kill: def $ax killed $ax killed $eax
+; X86-NEXT:    retl
+  %x2 = sext i8 %x to i15
+  %y2 = sext i8 %y to i15
+  %tmp = call i15 @llvm.udiv.fix.sat.i15(i15 %x2, i15 %y2, i32 14)
+  %tmp2 = sext i15 %tmp to i16
+  ret i16 %tmp2
+}
+
+define i16 @func3(i15 %x, i8 %y) nounwind {
+; X64-LABEL: func3:
+; X64:       # %bb.0:
+; X64-NEXT:    # kill: def $edi killed $edi def $rdi
+; X64-NEXT:    leal (%rdi,%rdi), %eax
+; X64-NEXT:    movzbl %sil, %ecx
+; X64-NEXT:    shll $4, %ecx
+; X64-NEXT:    # kill: def $ax killed $ax killed $eax
+; X64-NEXT:    xorl %edx, %edx
+; X64-NEXT:    divw %cx
+; X64-NEXT:    # kill: def $ax killed $ax def $eax
+; X64-NEXT:    movzwl %ax, %ecx
+; X64-NEXT:    cmpl $32767, %ecx # imm = 0x7FFF
+; X64-NEXT:    movl $32767, %ecx # imm = 0x7FFF
+; X64-NEXT:    cmovbl %eax, %ecx
+; X64-NEXT:    addl %ecx, %ecx
+; X64-NEXT:    movswl %cx, %eax
+; X64-NEXT:    shrl %eax
+; X64-NEXT:    # kill: def $ax killed $ax killed $eax
+; X64-NEXT:    retq
+;
+; X86-LABEL: func3:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    addl %eax, %eax
+; X86-NEXT:    movzbl %cl, %ecx
+; X86-NEXT:    shll $4, %ecx
+; X86-NEXT:    # kill: def $ax killed $ax killed $eax
+; X86-NEXT:    xorl %edx, %edx
+; X86-NEXT:    divw %cx
+; X86-NEXT:    # kill: def $ax killed $ax def $eax
+; X86-NEXT:    movzwl %ax, %ecx
+; X86-NEXT:    cmpl $32767, %ecx # imm = 0x7FFF
+; X86-NEXT:    movl $32767, %ecx # imm = 0x7FFF
+; X86-NEXT:    cmovbl %eax, %ecx
+; X86-NEXT:    addl %ecx, %ecx
+; X86-NEXT:    movswl %cx, %eax
+; X86-NEXT:    shrl %eax
+; X86-NEXT:    # kill: def $ax killed $ax killed $eax
+; X86-NEXT:    retl
+  %y2 = sext i8 %y to i15
+  %y3 = shl i15 %y2, 7
+  %tmp = call i15 @llvm.udiv.fix.sat.i15(i15 %x, i15 %y3, i32 4)
+  %tmp2 = sext i15 %tmp to i16
+  ret i16 %tmp2
+}
+
+define i4 @func4(i4 %x, i4 %y) nounwind {
+; X64-LABEL: func4:
+; X64:       # %bb.0:
+; X64-NEXT:    andb $15, %sil
+; X64-NEXT:    andb $15, %dil
+; X64-NEXT:    shlb $2, %dil
+; X64-NEXT:    movzbl %dil, %eax
+; X64-NEXT:    divb %sil
+; X64-NEXT:    movzbl %al, %ecx
+; X64-NEXT:    cmpb $15, %cl
+; X64-NEXT:    movl $15, %eax
+; X64-NEXT:    cmovbl %ecx, %eax
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+;
+; X86-LABEL: func4:
+; X86:       # %bb.0:
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    andb $15, %cl
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X86-NEXT:    andb $15, %al
+; X86-NEXT:    shlb $2, %al
+; X86-NEXT:    movzbl %al, %eax
+; X86-NEXT:    divb %cl
+; X86-NEXT:    movzbl %al, %ecx
+; X86-NEXT:    cmpb $15, %al
+; X86-NEXT:    movl $15, %eax
+; X86-NEXT:    cmovbl %ecx, %eax
+; X86-NEXT:    # kill: def $al killed $al killed $eax
+; X86-NEXT:    retl
+  %tmp = call i4 @llvm.udiv.fix.sat.i4(i4 %x, i4 %y, i32 2)
+  ret i4 %tmp
+}
+
+define i64 @func5(i64 %x, i64 %y) nounwind {
+; X64-LABEL: func5:
+; X64:       # %bb.0:
+; X64-NEXT:    pushq %rbx
+; X64-NEXT:    movq %rsi, %rdx
+; X64-NEXT:    leaq (%rdi,%rdi), %rsi
+; X64-NEXT:    shrq $33, %rsi
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    shrq $32, %rax
+; X64-NEXT:    andl $-2147483648, %eax # imm = 0x80000000
+; X64-NEXT:    orq %rax, %rsi
+; X64-NEXT:    shlq $32, %rdi
+; X64-NEXT:    xorl %ebx, %ebx
+; X64-NEXT:    xorl %ecx, %ecx
+; X64-NEXT:    callq __udivti3
+; X64-NEXT:    cmpq $-1, %rax
+; X64-NEXT:    movq $-1, %rcx
+; X64-NEXT:    cmovbq %rax, %rcx
+; X64-NEXT:    cmpq $1, %rdx
+; X64-NEXT:    movl $1, %esi
+; X64-NEXT:    cmovbq %rdx, %rsi
+; X64-NEXT:    sbbq %rbx, %rbx
+; X64-NEXT:    notq %rbx
+; X64-NEXT:    orq %rax, %rbx
+; X64-NEXT:    cmpq $1, %rdx
+; X64-NEXT:    cmoveq %rcx, %rbx
+; X64-NEXT:    shrdq $1, %rsi, %rbx
+; X64-NEXT:    movq %rbx, %rax
+; X64-NEXT:    popq %rbx
+; X64-NEXT:    retq
+;
+; X86-LABEL: func5:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    andl $-8, %esp
+; X86-NEXT:    subl $24, %esp
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl 12(%ebp), %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    shrl %edx
+; X86-NEXT:    shldl $31, %eax, %ecx
+; X86-NEXT:    shll $31, %eax
+; X86-NEXT:    movl %esp, %esi
+; X86-NEXT:    pushl $0
+; X86-NEXT:    pushl $0
+; X86-NEXT:    pushl 20(%ebp)
+; X86-NEXT:    pushl 16(%ebp)
+; X86-NEXT:    pushl $0
+; X86-NEXT:    pushl %edx
+; X86-NEXT:    pushl %ecx
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    calll __udivti3
+; X86-NEXT:    addl $32, %esp
+; X86-NEXT:    movl (%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpl $-1, %eax
+; X86-NEXT:    movl $-1, %ecx
+; X86-NEXT:    movl $-1, %esi
+; X86-NEXT:    cmovbl %eax, %esi
+; X86-NEXT:    cmpl $-1, %edx
+; X86-NEXT:    cmovel %edx, %eax
+; X86-NEXT:    cmovel %esi, %eax
+; X86-NEXT:    cmovael %ecx, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    orl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    cmovnel %ecx, %edx
+; X86-NEXT:    cmovnel %ecx, %eax
+; X86-NEXT:    leal -4(%ebp), %esp
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl
+  %tmp = call i64 @llvm.udiv.fix.sat.i64(i64 %x, i64 %y, i32 31)
+  ret i64 %tmp
+}
+
+define i18 @func6(i16 %x, i16 %y) nounwind {
+; X64-LABEL: func6:
+; X64:       # %bb.0:
+; X64-NEXT:    movswl %di, %eax
+; X64-NEXT:    andl $262143, %eax # imm = 0x3FFFF
+; X64-NEXT:    movswl %si, %ecx
+; X64-NEXT:    andl $262143, %ecx # imm = 0x3FFFF
+; X64-NEXT:    shll $7, %eax
+; X64-NEXT:    xorl %edx, %edx
+; X64-NEXT:    divl %ecx
+; X64-NEXT:    cmpl $262143, %eax # imm = 0x3FFFF
+; X64-NEXT:    movl $262143, %ecx # imm = 0x3FFFF
+; X64-NEXT:    cmovael %ecx, %eax
+; X64-NEXT:    retq
+;
+; X86-LABEL: func6:
+; X86:       # %bb.0:
+; X86-NEXT:    movswl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    andl $262143, %ecx # imm = 0x3FFFF
+; X86-NEXT:    movswl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    andl $262143, %eax # imm = 0x3FFFF
+; X86-NEXT:    shll $7, %eax
+; X86-NEXT:    xorl %edx, %edx
+; X86-NEXT:    divl %ecx
+; X86-NEXT:    cmpl $262143, %eax # imm = 0x3FFFF
+; X86-NEXT:    movl $262143, %ecx # imm = 0x3FFFF
+; X86-NEXT:    cmovael %ecx, %eax
+; X86-NEXT:    retl
+  %x2 = sext i16 %x to i18
+  %y2 = sext i16 %y to i18
+  %tmp = call i18 @llvm.udiv.fix.sat.i18(i18 %x2, i18 %y2, i32 7)
+  ret i18 %tmp
+}
+
+define i16 @func7(i16 %x, i16 %y) nounwind {
+; X64-LABEL: func7:
+; X64:       # %bb.0:
+; X64-NEXT:    movzwl %si, %ecx
+; X64-NEXT:    movzwl %di, %eax
+; X64-NEXT:    addl %eax, %eax
+; X64-NEXT:    shlq $16, %rax
+; X64-NEXT:    xorl %edx, %edx
+; X64-NEXT:    divq %rcx
+; X64-NEXT:    cmpq $131071, %rax # imm = 0x1FFFF
+; X64-NEXT:    movl $131071, %ecx # imm = 0x1FFFF
+; X64-NEXT:    cmovaeq %rcx, %rax
+; X64-NEXT:    shrl %eax
+; X64-NEXT:    # kill: def $ax killed $ax killed $rax
+; X64-NEXT:    retq
+;
+; X86-LABEL: func7:
+; X86:       # %bb.0:
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movzwl %cx, %ecx
+; X86-NEXT:    addl %ecx, %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    shrl $16, %edx
+; X86-NEXT:    shll $16, %ecx
+; X86-NEXT:    pushl $0
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %edx
+; X86-NEXT:    pushl %ecx
+; X86-NEXT:    calll __udivdi3
+; X86-NEXT:    addl $16, %esp
+; X86-NEXT:    cmpl $131071, %eax # imm = 0x1FFFF
+; X86-NEXT:    movl $131071, %ecx # imm = 0x1FFFF
+; X86-NEXT:    cmovael %ecx, %eax
+; X86-NEXT:    testl %edx, %edx
+; X86-NEXT:    cmovnel %ecx, %eax
+; X86-NEXT:    shrl %eax
+; X86-NEXT:    # kill: def $ax killed $ax killed $eax
+; X86-NEXT:    retl
+  %tmp = call i16 @llvm.udiv.fix.sat.i16(i16 %x, i16 %y, i32 16)
+  ret i16 %tmp
+}
+
+define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
+; X64-LABEL: vec:
+; X64:       # %bb.0:
+; X64-NEXT:    pxor %xmm8, %xmm8
+; X64-NEXT:    movdqa %xmm1, %xmm2
+; X64-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm8[2],xmm2[3],xmm8[3]
+; X64-NEXT:    movq %xmm2, %rcx
+; X64-NEXT:    movdqa %xmm0, %xmm4
+; X64-NEXT:    punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm8[2],xmm4[3],xmm8[3]
+; X64-NEXT:    paddq %xmm4, %xmm4
+; X64-NEXT:    psllq $31, %xmm4
+; X64-NEXT:    movq %xmm4, %rax
+; X64-NEXT:    xorl %edx, %edx
+; X64-NEXT:    divq %rcx
+; X64-NEXT:    movq %rax, %xmm7
+; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; X64-NEXT:    movq %xmm2, %rcx
+; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[2,3,0,1]
+; X64-NEXT:    movq %xmm2, %rax
+; X64-NEXT:    xorl %edx, %edx
+; X64-NEXT:    divq %rcx
+; X64-NEXT:    movq %rax, %xmm2
+; X64-NEXT:    punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm2[0]
+; X64-NEXT:    movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456]
+; X64-NEXT:    movdqa %xmm7, %xmm2
+; X64-NEXT:    pxor %xmm4, %xmm2
+; X64-NEXT:    movdqa {{.*#+}} xmm9 = [9223372043297226751,9223372043297226751]
+; X64-NEXT:    movdqa %xmm9, %xmm6
+; X64-NEXT:    pcmpgtd %xmm2, %xmm6
+; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm6[0,0,2,2]
+; X64-NEXT:    pcmpeqd %xmm9, %xmm2
+; X64-NEXT:    pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
+; X64-NEXT:    pand %xmm3, %xmm5
+; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3]
+; X64-NEXT:    por %xmm5, %xmm2
+; X64-NEXT:    movdqa {{.*#+}} xmm6 = [8589934591,8589934591]
+; X64-NEXT:    pand %xmm2, %xmm7
+; X64-NEXT:    pandn %xmm6, %xmm2
+; X64-NEXT:    por %xmm7, %xmm2
+; X64-NEXT:    psrlq $1, %xmm2
+; X64-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1]
+; X64-NEXT:    movq %xmm1, %rcx
+; X64-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1]
+; X64-NEXT:    paddq %xmm0, %xmm0
+; X64-NEXT:    psllq $31, %xmm0
+; X64-NEXT:    movq %xmm0, %rax
+; X64-NEXT:    xorl %edx, %edx
+; X64-NEXT:    divq %rcx
+; X64-NEXT:    movq %rax, %xmm3
+; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; X64-NEXT:    movq %xmm1, %rcx
+; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; X64-NEXT:    movq %xmm0, %rax
+; X64-NEXT:    xorl %edx, %edx
+; X64-NEXT:    divq %rcx
+; X64-NEXT:    movq %rax, %xmm0
+; X64-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0]
+; X64-NEXT:    pxor %xmm3, %xmm4
+; X64-NEXT:    movdqa %xmm9, %xmm0
+; X64-NEXT:    pcmpgtd %xmm4, %xmm0
+; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; X64-NEXT:    pcmpeqd %xmm9, %xmm4
+; X64-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; X64-NEXT:    pand %xmm1, %xmm4
+; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; X64-NEXT:    por %xmm4, %xmm0
+; X64-NEXT:    pand %xmm0, %xmm3
+; X64-NEXT:    pandn %xmm6, %xmm0
+; X64-NEXT:    por %xmm3, %xmm0
+; X64-NEXT:    psrlq $1, %xmm0
+; X64-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; X64-NEXT:    retq
+;
+; X86-LABEL: vec:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    addl %ecx, %ecx
+; X86-NEXT:    setb %al
+; X86-NEXT:    shldl $31, %ecx, %eax
+; X86-NEXT:    shll $31, %ecx
+; X86-NEXT:    pushl $0
+; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
+; X86-NEXT:    calll __udivdi3
+; X86-NEXT:    addl $16, %esp
+; X86-NEXT:    cmpl $-1, %eax
+; X86-NEXT:    movl $-1, %ecx
+; X86-NEXT:    cmovbl %eax, %ecx
+; X86-NEXT:    cmpl $1, %edx
+; X86-NEXT:    movl $0, %edi
+; X86-NEXT:    sbbl %edi, %edi
+; X86-NEXT:    notl %edi
+; X86-NEXT:    orl %eax, %edi
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    addl %esi, %esi
+; X86-NEXT:    setb %al
+; X86-NEXT:    cmpl $1, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    cmovel %ecx, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl $1, %ecx
+; X86-NEXT:    cmovael %ecx, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shldl $31, %esi, %eax
+; X86-NEXT:    shll $31, %esi
+; X86-NEXT:    pushl $0
+; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    calll __udivdi3
+; X86-NEXT:    addl $16, %esp
+; X86-NEXT:    cmpl $-1, %eax
+; X86-NEXT:    movl $-1, %ecx
+; X86-NEXT:    cmovbl %eax, %ecx
+; X86-NEXT:    cmpl $1, %edx
+; X86-NEXT:    movl $1, %esi
+; X86-NEXT:    cmovbl %edx, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl $0, %esi
+; X86-NEXT:    sbbl %esi, %esi
+; X86-NEXT:    notl %esi
+; X86-NEXT:    orl %eax, %esi
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    addl %edi, %edi
+; X86-NEXT:    setb %al
+; X86-NEXT:    cmpl $1, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    cmovel %ecx, %esi
+; X86-NEXT:    shldl $31, %edi, %eax
+; X86-NEXT:    shll $31, %edi
+; X86-NEXT:    pushl $0
+; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    calll __udivdi3
+; X86-NEXT:    addl $16, %esp
+; X86-NEXT:    cmpl $-1, %eax
+; X86-NEXT:    movl $-1, %ebx
+; X86-NEXT:    cmovbl %eax, %ebx
+; X86-NEXT:    cmpl $1, %edx
+; X86-NEXT:    movl $0, %edi
+; X86-NEXT:    sbbl %edi, %edi
+; X86-NEXT:    notl %edi
+; X86-NEXT:    orl %eax, %edi
+; X86-NEXT:    xorl %ecx, %ecx
+; X86-NEXT:    addl %ebp, %ebp
+; X86-NEXT:    setb %cl
+; X86-NEXT:    cmpl $1, %edx
+; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    movl $1, %edx
+; X86-NEXT:    cmovael %edx, %eax
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT:    cmovel %ebx, %edi
+; X86-NEXT:    shldl $31, %ebp, %ecx
+; X86-NEXT:    shll $31, %ebp
+; X86-NEXT:    pushl $0
+; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %ecx
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    calll __udivdi3
+; X86-NEXT:    addl $16, %esp
+; X86-NEXT:    cmpl $-1, %eax
+; X86-NEXT:    movl $-1, %ecx
+; X86-NEXT:    cmovbl %eax, %ecx
+; X86-NEXT:    cmpl $1, %edx
+; X86-NEXT:    movl $1, %ebx
+; X86-NEXT:    cmovbl %edx, %ebx
+; X86-NEXT:    movl $0, %ebp
+; X86-NEXT:    sbbl %ebp, %ebp
+; X86-NEXT:    notl %ebp
+; X86-NEXT:    orl %eax, %ebp
+; X86-NEXT:    cmpl $1, %edx
+; X86-NEXT:    cmovel %ecx, %ebp
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    shrdl $1, %eax, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    shrdl $1, %eax, %esi
+; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-NEXT:    shrdl $1, %eax, %edi
+; X86-NEXT:    shrdl $1, %ebx, %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %ebp, 12(%eax)
+; X86-NEXT:    movl %edi, 8(%eax)
+; X86-NEXT:    movl %esi, 4(%eax)
+; X86-NEXT:    movl %ecx, (%eax)
+; X86-NEXT:    addl $16, %esp
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl $4
+  %tmp = call <4 x i32> @llvm.udiv.fix.sat.v4i32(<4 x i32> %x, <4 x i32> %y, i32 31)
+  ret <4 x i32> %tmp
+}