[AArch64] Generate vector signed/unsigned mul and mla/mls long.

Phabricator Revision: http://reviews.llvm.org/D5589 Patch by Balaram Makam <bmakam@codeaurora.org>!! llvm-svn: 219276
2024-10-19 02:52:53 +02:00 · 2014-10-08 02:31:24 +00:00 · 2014-10-08 02:31:24 +00:00 · 75e17097bb
commit 75e17097bb
parent 3853b2ab38
4 changed files with 580 additions and 0 deletions
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@ -531,6 +531,11 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM)

    // AArch64 doesn't have MUL.2d:
    setOperationAction(ISD::MUL, MVT::v2i64, Expand);
+    // Custom handling for some quad-vector types to detect MULL.
+    setOperationAction(ISD::MUL, MVT::v8i16, Custom);
+    setOperationAction(ISD::MUL, MVT::v4i32, Custom);
+    setOperationAction(ISD::MUL, MVT::v2i64, Custom);
+
    setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Legal);
    setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
    // Likewise, narrowing and extending vector loads/stores aren't handled
@ -853,6 +858,8 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
  case AArch64ISD::ST2LANEpost:       return "AArch64ISD::ST2LANEpost";
  case AArch64ISD::ST3LANEpost:       return "AArch64ISD::ST3LANEpost";
  case AArch64ISD::ST4LANEpost:       return "AArch64ISD::ST4LANEpost";
+  case AArch64ISD::SMULL:             return "AArch64ISD::SMULL";
+  case AArch64ISD::UMULL:             return "AArch64ISD::UMULL";
  }
 }

@ -1668,6 +1675,197 @@ static SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) {
      0);
 }

+static EVT getExtensionTo64Bits(const EVT &OrigVT) {
+  if (OrigVT.getSizeInBits() >= 64)
+    return OrigVT;
+
+  assert(OrigVT.isSimple() && "Expecting a simple value type");
+
+  MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy;
+  switch (OrigSimpleTy) {
+  default: llvm_unreachable("Unexpected Vector Type");
+  case MVT::v2i8:
+  case MVT::v2i16:
+     return MVT::v2i32;
+  case MVT::v4i8:
+    return  MVT::v4i16;
+  }
+}
+
+static SDValue addRequiredExtensionForVectorMULL(SDValue N, SelectionDAG &DAG,
+                                                 const EVT &OrigTy,
+                                                 const EVT &ExtTy,
+                                                 unsigned ExtOpcode) {
+  // The vector originally had a size of OrigTy. It was then extended to ExtTy.
+  // We expect the ExtTy to be 128-bits total. If the OrigTy is less than
+  // 64-bits we need to insert a new extension so that it will be 64-bits.
+  assert(ExtTy.is128BitVector() && "Unexpected extension size");
+  if (OrigTy.getSizeInBits() >= 64)
+    return N;
+
+  // Must extend size to at least 64 bits to be used as an operand for VMULL.
+  EVT NewVT = getExtensionTo64Bits(OrigTy);
+
+  return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
+}
+
+static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG,
+                                   bool isSigned) {
+  EVT VT = N->getValueType(0);
+
+  if (N->getOpcode() != ISD::BUILD_VECTOR)
+    return false;
+
+  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
+    SDNode *Elt = N->getOperand(i).getNode();
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
+      unsigned EltSize = VT.getVectorElementType().getSizeInBits();
+      unsigned HalfSize = EltSize / 2;
+      if (isSigned) {
+        if (!isIntN(HalfSize, C->getSExtValue()))
+          return false;
+      } else {
+        if (!isUIntN(HalfSize, C->getZExtValue()))
+          return false;
+      }
+      continue;
+    }
+    return false;
+  }
+
+  return true;
+}
+
+static SDValue skipExtensionForVectorMULL(SDNode *N, SelectionDAG &DAG) {
+  if (N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND)
+    return addRequiredExtensionForVectorMULL(N->getOperand(0), DAG,
+                                             N->getOperand(0)->getValueType(0),
+                                             N->getValueType(0),
+                                             N->getOpcode());
+
+  assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
+  EVT VT = N->getValueType(0);
+  unsigned EltSize = VT.getVectorElementType().getSizeInBits() / 2;
+  unsigned NumElts = VT.getVectorNumElements();
+  MVT TruncVT = MVT::getIntegerVT(EltSize);
+  SmallVector<SDValue, 8> Ops;
+  for (unsigned i = 0; i != NumElts; ++i) {
+    ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i));
+    const APInt &CInt = C->getAPIntValue();
+    // Element types smaller than 32 bits are not legal, so use i32 elements.
+    // The values are implicitly truncated so sext vs. zext doesn't matter.
+    Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), MVT::i32));
+  }
+  return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N),
+                     MVT::getVectorVT(TruncVT, NumElts), Ops);
+}
+
+static bool isSignExtended(SDNode *N, SelectionDAG &DAG) {
+  if (N->getOpcode() == ISD::SIGN_EXTEND)
+    return true;
+  if (isExtendedBUILD_VECTOR(N, DAG, true))
+    return true;
+  return false;
+}
+
+static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) {
+  if (N->getOpcode() == ISD::ZERO_EXTEND)
+    return true;
+  if (isExtendedBUILD_VECTOR(N, DAG, false))
+    return true;
+  return false;
+}
+
+static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) {
+  unsigned Opcode = N->getOpcode();
+  if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
+    SDNode *N0 = N->getOperand(0).getNode();
+    SDNode *N1 = N->getOperand(1).getNode();
+    return N0->hasOneUse() && N1->hasOneUse() &&
+      isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
+  }
+  return false;
+}
+
+static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) {
+  unsigned Opcode = N->getOpcode();
+  if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
+    SDNode *N0 = N->getOperand(0).getNode();
+    SDNode *N1 = N->getOperand(1).getNode();
+    return N0->hasOneUse() && N1->hasOneUse() &&
+      isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
+  }
+  return false;
+}
+
+static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
+  // Multiplications are only custom-lowered for 128-bit vectors so that
+  // VMULL can be detected.  Otherwise v2i64 multiplications are not legal.
+  EVT VT = Op.getValueType();
+  assert(VT.is128BitVector() && VT.isInteger() &&
+         "unexpected type for custom-lowering ISD::MUL");
+  SDNode *N0 = Op.getOperand(0).getNode();
+  SDNode *N1 = Op.getOperand(1).getNode();
+  unsigned NewOpc = 0;
+  bool isMLA = false;
+  bool isN0SExt = isSignExtended(N0, DAG);
+  bool isN1SExt = isSignExtended(N1, DAG);
+  if (isN0SExt && isN1SExt)
+    NewOpc = AArch64ISD::SMULL;
+  else {
+    bool isN0ZExt = isZeroExtended(N0, DAG);
+    bool isN1ZExt = isZeroExtended(N1, DAG);
+    if (isN0ZExt && isN1ZExt)
+      NewOpc = AArch64ISD::UMULL;
+    else if (isN1SExt || isN1ZExt) {
+      // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
+      // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
+      if (isN1SExt && isAddSubSExt(N0, DAG)) {
+        NewOpc = AArch64ISD::SMULL;
+        isMLA = true;
+      } else if (isN1ZExt && isAddSubZExt(N0, DAG)) {
+        NewOpc =  AArch64ISD::UMULL;
+        isMLA = true;
+      } else if (isN0ZExt && isAddSubZExt(N1, DAG)) {
+        std::swap(N0, N1);
+        NewOpc =  AArch64ISD::UMULL;
+        isMLA = true;
+      }
+    }
+
+    if (!NewOpc) {
+      if (VT == MVT::v2i64)
+        // Fall through to expand this.  It is not legal.
+        return SDValue();
+      else
+        // Other vector multiplications are legal.
+        return Op;
+    }
+  }
+
+  // Legalize to a S/UMULL instruction
+  SDLoc DL(Op);
+  SDValue Op0;
+  SDValue Op1 = skipExtensionForVectorMULL(N1, DAG);
+  if (!isMLA) {
+    Op0 = skipExtensionForVectorMULL(N0, DAG);
+    assert(Op0.getValueType().is64BitVector() &&
+           Op1.getValueType().is64BitVector() &&
+           "unexpected types for extended operands to VMULL");
+    return DAG.getNode(NewOpc, DL, VT, Op0, Op1);
+  }
+  // Optimizing (zext A + zext B) * C, to (S/UMULL A, C) + (S/UMULL B, C) during
+  // isel lowering to take advantage of no-stall back to back s/umul + s/umla.
+  // This is true for CPUs with accumulate forwarding such as Cortex-A53/A57
+  SDValue N00 = skipExtensionForVectorMULL(N0->getOperand(0).getNode(), DAG);
+  SDValue N01 = skipExtensionForVectorMULL(N0->getOperand(1).getNode(), DAG);
+  EVT Op1VT = Op1.getValueType();
+  return DAG.getNode(N0->getOpcode(), DL, VT,
+                     DAG.getNode(NewOpc, DL, VT,
+                               DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
+                     DAG.getNode(NewOpc, DL, VT,
+                               DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
+}

 SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
                                              SelectionDAG &DAG) const {
@ -1768,6 +1966,8 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
    return LowerFP_TO_INT(Op, DAG);
  case ISD::FSINCOS:
    return LowerFSINCOS(Op, DAG);
+  case ISD::MUL:
+    return LowerMUL(Op, DAG);
  }
 }

--- a/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/lib/Target/AArch64/AArch64ISelLowering.h
@ -169,6 +169,9 @@ enum {
  /// mode without emitting such REV instructions.
  NVCAST,

+  SMULL,
+  UMULL,
+
  // NEON Load/Store with post-increment base updates
  LD2post = ISD::FIRST_TARGET_MEMORY_OPCODE,
  LD3post,
--- a/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/lib/Target/AArch64/AArch64InstrInfo.td
@ -239,6 +239,11 @@ def AArch64WrapperLarge : SDNode<"AArch64ISD::WrapperLarge",

 def AArch64NvCast : SDNode<"AArch64ISD::NVCAST", SDTUnaryOp>;

+def SDT_AArch64mull : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisInt<1>,
+                                    SDTCisSameAs<1, 2>]>;
+def AArch64smull    : SDNode<"AArch64ISD::SMULL", SDT_AArch64mull>;
+def AArch64umull    : SDNode<"AArch64ISD::UMULL", SDT_AArch64mull>;
+
 //===----------------------------------------------------------------------===//

 //===----------------------------------------------------------------------===//
@ -3148,6 +3153,46 @@ defm USUBL   : SIMDLongThreeVectorBHS<1, 0b0010, "usubl",
 defm USUBW   : SIMDWideThreeVectorBHS<   1, 0b0011, "usubw",
                 BinOpFrag<(sub node:$LHS, (zext node:$RHS))>>;

+// Additional patterns for SMULL and UMULL
+multiclass Neon_mul_widen_patterns<SDPatternOperator opnode,
+  Instruction INST8B, Instruction INST4H, Instruction INST2S> {
+  def : Pat<(v8i16 (opnode (v8i8 V64:$Rn), (v8i8 V64:$Rm))),
+            (INST8B V64:$Rn, V64:$Rm)>;
+  def : Pat<(v4i32 (opnode (v4i16 V64:$Rn), (v4i16 V64:$Rm))),
+            (INST4H V64:$Rn, V64:$Rm)>;
+  def : Pat<(v2i64 (opnode (v2i32 V64:$Rn), (v2i32 V64:$Rm))),
+            (INST2S V64:$Rn, V64:$Rm)>;
+}
+
+defm : Neon_mul_widen_patterns<AArch64smull, SMULLv8i8_v8i16,
+  SMULLv4i16_v4i32, SMULLv2i32_v2i64>;
+defm : Neon_mul_widen_patterns<AArch64umull, UMULLv8i8_v8i16,
+  UMULLv4i16_v4i32, UMULLv2i32_v2i64>;
+
+// Additional patterns for SMLAL/SMLSL and UMLAL/UMLSL
+multiclass Neon_mulacc_widen_patterns<SDPatternOperator opnode,
+  Instruction INST8B, Instruction INST4H, Instruction INST2S> {
+  def : Pat<(v8i16 (opnode (v8i16 V128:$Rd), (v8i8 V64:$Rn), (v8i8 V64:$Rm))),
+            (INST8B V128:$Rd, V64:$Rn, V64:$Rm)>;
+  def : Pat<(v4i32 (opnode (v4i32 V128:$Rd), (v4i16 V64:$Rn), (v4i16 V64:$Rm))),
+            (INST4H V128:$Rd, V64:$Rn, V64:$Rm)>;
+  def : Pat<(v2i64 (opnode (v2i64 V128:$Rd), (v2i32 V64:$Rn), (v2i32 V64:$Rm))),
+            (INST2S  V128:$Rd, V64:$Rn, V64:$Rm)>;
+}
+
+defm : Neon_mulacc_widen_patterns<
+  TriOpFrag<(add node:$LHS, (AArch64smull node:$MHS, node:$RHS))>,
+  SMLALv8i8_v8i16, SMLALv4i16_v4i32, SMLALv2i32_v2i64>;
+defm : Neon_mulacc_widen_patterns<
+  TriOpFrag<(add node:$LHS, (AArch64umull node:$MHS, node:$RHS))>,
+  UMLALv8i8_v8i16, UMLALv4i16_v4i32, UMLALv2i32_v2i64>;
+defm : Neon_mulacc_widen_patterns<
+  TriOpFrag<(sub node:$LHS, (AArch64smull node:$MHS, node:$RHS))>,
+  SMLSLv8i8_v8i16, SMLSLv4i16_v4i32, SMLSLv2i32_v2i64>;
+defm : Neon_mulacc_widen_patterns<
+  TriOpFrag<(sub node:$LHS, (AArch64umull node:$MHS, node:$RHS))>,
+  UMLSLv8i8_v8i16, UMLSLv4i16_v4i32, UMLSLv2i32_v2i64>;
+
 // Patterns for 64-bit pmull
 def : Pat<(int_aarch64_neon_pmull64 V64:$Rn, V64:$Rm),
          (PMULLv1i64 V64:$Rn, V64:$Rm)>;
--- a/test/CodeGen/AArch64/aarch64-smull.ll
+++ b/test/CodeGen/AArch64/aarch64-smull.ll
@ -0,0 +1,332 @@
+; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s -o -| FileCheck %s
+
+define <8 x i16> @smull_v8i8_v8i16(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+; CHECK-LABEL: smull_v8i8_v8i16:
+; CHECK: smull {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+  %tmp1 = load <8 x i8>* %A
+  %tmp2 = load <8 x i8>* %B
+  %tmp3 = sext <8 x i8> %tmp1 to <8 x i16>
+  %tmp4 = sext <8 x i8> %tmp2 to <8 x i16>
+  %tmp5 = mul <8 x i16> %tmp3, %tmp4
+  ret <8 x i16> %tmp5
+}
+
+define <4 x i32> @smull_v4i16_v4i32(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+; CHECK-LABEL: smull_v4i16_v4i32:
+; CHECK: smull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+  %tmp1 = load <4 x i16>* %A
+  %tmp2 = load <4 x i16>* %B
+  %tmp3 = sext <4 x i16> %tmp1 to <4 x i32>
+  %tmp4 = sext <4 x i16> %tmp2 to <4 x i32>
+  %tmp5 = mul <4 x i32> %tmp3, %tmp4
+  ret <4 x i32> %tmp5
+}
+
+define <2 x i64> @smull_v2i32_v2i64(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+; CHECK-LABEL: smull_v2i32_v2i64:
+; CHECK:  smull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+  %tmp1 = load <2 x i32>* %A
+  %tmp2 = load <2 x i32>* %B
+  %tmp3 = sext <2 x i32> %tmp1 to <2 x i64>
+  %tmp4 = sext <2 x i32> %tmp2 to <2 x i64>
+  %tmp5 = mul <2 x i64> %tmp3, %tmp4
+  ret <2 x i64> %tmp5
+}
+
+define <8 x i16> @umull_v8i8_v8i16(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+; CHECK-LABEL: umull_v8i8_v8i16:
+; CHECK: umull {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+  %tmp1 = load <8 x i8>* %A
+  %tmp2 = load <8 x i8>* %B
+  %tmp3 = zext <8 x i8> %tmp1 to <8 x i16>
+  %tmp4 = zext <8 x i8> %tmp2 to <8 x i16>
+  %tmp5 = mul <8 x i16> %tmp3, %tmp4
+  ret <8 x i16> %tmp5
+}
+
+define <4 x i32> @umull_v4i16_v4i32(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+; CHECK-LABEL: umull_v4i16_v4i32:
+; CHECK: umull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+  %tmp1 = load <4 x i16>* %A
+  %tmp2 = load <4 x i16>* %B
+  %tmp3 = zext <4 x i16> %tmp1 to <4 x i32>
+  %tmp4 = zext <4 x i16> %tmp2 to <4 x i32>
+  %tmp5 = mul <4 x i32> %tmp3, %tmp4
+  ret <4 x i32> %tmp5
+}
+
+define <2 x i64> @umull_v2i32_v2i64(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+; CHECK-LABEL: umull_v2i32_v2i64:
+; CHECK:  umull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+  %tmp1 = load <2 x i32>* %A
+  %tmp2 = load <2 x i32>* %B
+  %tmp3 = zext <2 x i32> %tmp1 to <2 x i64>
+  %tmp4 = zext <2 x i32> %tmp2 to <2 x i64>
+  %tmp5 = mul <2 x i64> %tmp3, %tmp4
+  ret <2 x i64> %tmp5
+}
+
+define <8 x i16> @smlal_v8i8_v8i16(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
+; CHECK-LABEL: smlal_v8i8_v8i16:
+; CHECK:  smlal {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+  %tmp1 = load <8 x i16>* %A
+  %tmp2 = load <8 x i8>* %B
+  %tmp3 = load <8 x i8>* %C
+  %tmp4 = sext <8 x i8> %tmp2 to <8 x i16>
+  %tmp5 = sext <8 x i8> %tmp3 to <8 x i16>
+  %tmp6 = mul <8 x i16> %tmp4, %tmp5
+  %tmp7 = add <8 x i16> %tmp1, %tmp6
+  ret <8 x i16> %tmp7
+}
+
+define <4 x i32> @smlal_v4i16_v4i32(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
+; CHECK-LABEL: smlal_v4i16_v4i32:
+; CHECK: smlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+  %tmp1 = load <4 x i32>* %A
+  %tmp2 = load <4 x i16>* %B
+  %tmp3 = load <4 x i16>* %C
+  %tmp4 = sext <4 x i16> %tmp2 to <4 x i32>
+  %tmp5 = sext <4 x i16> %tmp3 to <4 x i32>
+  %tmp6 = mul <4 x i32> %tmp4, %tmp5
+  %tmp7 = add <4 x i32> %tmp1, %tmp6
+  ret <4 x i32> %tmp7
+}
+
+define <2 x i64> @smlal_v2i32_v2i64(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
+; CHECK-LABEL: smlal_v2i32_v2i64:
+; CHECK: smlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+  %tmp1 = load <2 x i64>* %A
+  %tmp2 = load <2 x i32>* %B
+  %tmp3 = load <2 x i32>* %C
+  %tmp4 = sext <2 x i32> %tmp2 to <2 x i64>
+  %tmp5 = sext <2 x i32> %tmp3 to <2 x i64>
+  %tmp6 = mul <2 x i64> %tmp4, %tmp5
+  %tmp7 = add <2 x i64> %tmp1, %tmp6
+  ret <2 x i64> %tmp7
+}
+
+define <8 x i16> @umlal_v8i8_v8i16(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
+; CHECK-LABEL: umlal_v8i8_v8i16:
+; CHECK:  umlal {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+  %tmp1 = load <8 x i16>* %A
+  %tmp2 = load <8 x i8>* %B
+  %tmp3 = load <8 x i8>* %C
+  %tmp4 = zext <8 x i8> %tmp2 to <8 x i16>
+  %tmp5 = zext <8 x i8> %tmp3 to <8 x i16>
+  %tmp6 = mul <8 x i16> %tmp4, %tmp5
+  %tmp7 = add <8 x i16> %tmp1, %tmp6
+  ret <8 x i16> %tmp7
+}
+
+define <4 x i32> @umlal_v4i16_v4i32(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
+; CHECK-LABEL: umlal_v4i16_v4i32:
+; CHECK: umlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+  %tmp1 = load <4 x i32>* %A
+  %tmp2 = load <4 x i16>* %B
+  %tmp3 = load <4 x i16>* %C
+  %tmp4 = zext <4 x i16> %tmp2 to <4 x i32>
+  %tmp5 = zext <4 x i16> %tmp3 to <4 x i32>
+  %tmp6 = mul <4 x i32> %tmp4, %tmp5
+  %tmp7 = add <4 x i32> %tmp1, %tmp6
+  ret <4 x i32> %tmp7
+}
+
+define <2 x i64> @umlal_v2i32_v2i64(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
+; CHECK-LABEL: umlal_v2i32_v2i64:
+; CHECK: umlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+  %tmp1 = load <2 x i64>* %A
+  %tmp2 = load <2 x i32>* %B
+  %tmp3 = load <2 x i32>* %C
+  %tmp4 = zext <2 x i32> %tmp2 to <2 x i64>
+  %tmp5 = zext <2 x i32> %tmp3 to <2 x i64>
+  %tmp6 = mul <2 x i64> %tmp4, %tmp5
+  %tmp7 = add <2 x i64> %tmp1, %tmp6
+  ret <2 x i64> %tmp7
+}
+
+define <8 x i16> @smlsl_v8i8_v8i16(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
+; CHECK-LABEL: smlsl_v8i8_v8i16:
+; CHECK:  smlsl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+  %tmp1 = load <8 x i16>* %A
+  %tmp2 = load <8 x i8>* %B
+  %tmp3 = load <8 x i8>* %C
+  %tmp4 = sext <8 x i8> %tmp2 to <8 x i16>
+  %tmp5 = sext <8 x i8> %tmp3 to <8 x i16>
+  %tmp6 = mul <8 x i16> %tmp4, %tmp5
+  %tmp7 = sub <8 x i16> %tmp1, %tmp6
+  ret <8 x i16> %tmp7
+}
+
+define <4 x i32> @smlsl_v4i16_v4i32(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
+; CHECK-LABEL: smlsl_v4i16_v4i32:
+; CHECK: smlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+  %tmp1 = load <4 x i32>* %A
+  %tmp2 = load <4 x i16>* %B
+  %tmp3 = load <4 x i16>* %C
+  %tmp4 = sext <4 x i16> %tmp2 to <4 x i32>
+  %tmp5 = sext <4 x i16> %tmp3 to <4 x i32>
+  %tmp6 = mul <4 x i32> %tmp4, %tmp5
+  %tmp7 = sub <4 x i32> %tmp1, %tmp6
+  ret <4 x i32> %tmp7
+}
+
+define <2 x i64> @smlsl_v2i32_v2i64(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
+; CHECK-LABEL: smlsl_v2i32_v2i64:
+; CHECK: smlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+  %tmp1 = load <2 x i64>* %A
+  %tmp2 = load <2 x i32>* %B
+  %tmp3 = load <2 x i32>* %C
+  %tmp4 = sext <2 x i32> %tmp2 to <2 x i64>
+  %tmp5 = sext <2 x i32> %tmp3 to <2 x i64>
+  %tmp6 = mul <2 x i64> %tmp4, %tmp5
+  %tmp7 = sub <2 x i64> %tmp1, %tmp6
+  ret <2 x i64> %tmp7
+}
+
+define <8 x i16> @umlsl_v8i8_v8i16(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
+; CHECK-LABEL: umlsl_v8i8_v8i16:
+; CHECK:  umlsl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+  %tmp1 = load <8 x i16>* %A
+  %tmp2 = load <8 x i8>* %B
+  %tmp3 = load <8 x i8>* %C
+  %tmp4 = zext <8 x i8> %tmp2 to <8 x i16>
+  %tmp5 = zext <8 x i8> %tmp3 to <8 x i16>
+  %tmp6 = mul <8 x i16> %tmp4, %tmp5
+  %tmp7 = sub <8 x i16> %tmp1, %tmp6
+  ret <8 x i16> %tmp7
+}
+
+define <4 x i32> @umlsl_v4i16_v4i32(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
+; CHECK-LABEL: umlsl_v4i16_v4i32:
+; CHECK: umlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+  %tmp1 = load <4 x i32>* %A
+  %tmp2 = load <4 x i16>* %B
+  %tmp3 = load <4 x i16>* %C
+  %tmp4 = zext <4 x i16> %tmp2 to <4 x i32>
+  %tmp5 = zext <4 x i16> %tmp3 to <4 x i32>
+  %tmp6 = mul <4 x i32> %tmp4, %tmp5
+  %tmp7 = sub <4 x i32> %tmp1, %tmp6
+  ret <4 x i32> %tmp7
+}
+
+define <2 x i64> @umlsl_v2i32_v2i64(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
+; CHECK-LABEL: umlsl_v2i32_v2i64:
+; CHECK: umlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+  %tmp1 = load <2 x i64>* %A
+  %tmp2 = load <2 x i32>* %B
+  %tmp3 = load <2 x i32>* %C
+  %tmp4 = zext <2 x i32> %tmp2 to <2 x i64>
+  %tmp5 = zext <2 x i32> %tmp3 to <2 x i64>
+  %tmp6 = mul <2 x i64> %tmp4, %tmp5
+  %tmp7 = sub <2 x i64> %tmp1, %tmp6
+  ret <2 x i64> %tmp7
+}
+
+; SMULL recognizing BUILD_VECTORs with sign/zero-extended elements.
+define <8 x i16> @smull_extvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
+; CHECK-LABEL: smull_extvec_v8i8_v8i16:
+; CHECK: smull {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+  %tmp3 = sext <8 x i8> %arg to <8 x i16>
+  %tmp4 = mul <8 x i16> %tmp3, <i16 -12, i16 -12, i16 -12, i16 -12, i16 -12, i16 -12, i16 -12, i16 -12>
+  ret <8 x i16> %tmp4
+}
+
+define <8 x i16> @smull_noextvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
+; Do not use SMULL if the BUILD_VECTOR element values are too big.
+; CHECK-LABEL: smull_noextvec_v8i8_v8i16:
+; CHECK: movz
+; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+  %tmp3 = sext <8 x i8> %arg to <8 x i16>
+  %tmp4 = mul <8 x i16> %tmp3, <i16 -999, i16 -999, i16 -999, i16 -999, i16 -999, i16 -999, i16 -999, i16 -999>
+  ret <8 x i16> %tmp4
+}
+
+define <4 x i32> @smull_extvec_v4i16_v4i32(<4 x i16> %arg) nounwind {
+; CHECK-LABEL: smull_extvec_v4i16_v4i32:
+; CHECK:  smull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+  %tmp3 = sext <4 x i16> %arg to <4 x i32>
+  %tmp4 = mul <4 x i32> %tmp3, <i32 -12, i32 -12, i32 -12, i32 -12>
+  ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @smull_extvec_v2i32_v2i64(<2 x i32> %arg) nounwind {
+; CHECK: smull_extvec_v2i32_v2i64
+; CHECK: smull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+  %tmp3 = sext <2 x i32> %arg to <2 x i64>
+  %tmp4 = mul <2 x i64> %tmp3, <i64 -1234, i64 -1234>
+  ret <2 x i64> %tmp4
+}
+
+define <8 x i16> @umull_extvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
+; CHECK-LABEL: umull_extvec_v8i8_v8i16:
+; CHECK: umull {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+  %tmp3 = zext <8 x i8> %arg to <8 x i16>
+  %tmp4 = mul <8 x i16> %tmp3, <i16 12, i16 12, i16 12, i16 12, i16 12, i16 12, i16 12, i16 12>
+  ret <8 x i16> %tmp4
+}
+
+define <8 x i16> @umull_noextvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
+; Do not use SMULL if the BUILD_VECTOR element values are too big.
+; CHECK-LABEL: umull_noextvec_v8i8_v8i16:
+; CHECK: movz
+; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+  %tmp3 = zext <8 x i8> %arg to <8 x i16>
+  %tmp4 = mul <8 x i16> %tmp3, <i16 999, i16 999, i16 999, i16 999, i16 999, i16 999, i16 999, i16 999>
+  ret <8 x i16> %tmp4
+}
+
+define <4 x i32> @umull_extvec_v4i16_v4i32(<4 x i16> %arg) nounwind {
+; CHECK-LABEL: umull_extvec_v4i16_v4i32:
+; CHECK:  umull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+  %tmp3 = zext <4 x i16> %arg to <4 x i32>
+  %tmp4 = mul <4 x i32> %tmp3, <i32 1234, i32 1234, i32 1234, i32 1234>
+  ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @umull_extvec_v2i32_v2i64(<2 x i32> %arg) nounwind {
+; CHECK-LABEL: umull_extvec_v2i32_v2i64:
+; CHECK: umull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+  %tmp3 = zext <2 x i32> %arg to <2 x i64>
+  %tmp4 = mul <2 x i64> %tmp3, <i64 1234, i64 1234>
+  ret <2 x i64> %tmp4
+}
+
+define i16 @smullWithInconsistentExtensions(<8 x i8> %vec) {
+; If one operand has a zero-extend and the other a sign-extend, smull
+; cannot be used.
+; CHECK-LABEL: smullWithInconsistentExtensions:
+; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+  %1 = sext <8 x i8> %vec to <8 x i16>
+  %2 = mul <8 x i16> %1, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+  %3 = extractelement <8 x i16> %2, i32 0
+  ret i16 %3
+}
+
+define void @distribute(i16* %dst, i8* %src, i32 %mul) nounwind {
+entry:
+; CHECK-LABEL: distribute:
+; CHECK: umull [[REG1:(v[0-9]+.8h)]], {{v[0-9]+}}.8b, [[REG2:(v[0-9]+.8b)]]
+; CHECK: umlal [[REG1]], {{v[0-9]+}}.8b, [[REG2]]
+  %0 = trunc i32 %mul to i8
+  %1 = insertelement <8 x i8> undef, i8 %0, i32 0
+  %2 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
+  %3 = tail call <16 x i8> @llvm.aarch64.neon.vld1.v16i8(i8* %src, i32 1)
+  %4 = bitcast <16 x i8> %3 to <2 x double>
+  %5 = extractelement <2 x double> %4, i32 1
+  %6 = bitcast double %5 to <8 x i8>
+  %7 = zext <8 x i8> %6 to <8 x i16>
+  %8 = zext <8 x i8> %2 to <8 x i16>
+  %9 = extractelement <2 x double> %4, i32 0
+  %10 = bitcast double %9 to <8 x i8>
+  %11 = zext <8 x i8> %10 to <8 x i16>
+  %12 = add <8 x i16> %7, %11
+  %13 = mul <8 x i16> %12, %8
+  %14 = bitcast i16* %dst to i8*
+  tail call void @llvm.aarch64.neon.vst1.v8i16(i8* %14, <8 x i16> %13, i32 2)
+  ret void
+}
+
+declare <16 x i8> @llvm.aarch64.neon.vld1.v16i8(i8*, i32) nounwind readonly
+
+declare void @llvm.aarch64.neon.vst1.v8i16(i8*, <8 x i16>, i32) nounwind
+