[WebAssembly] Initial SIMD128 support.

Kicks off the implementation of wasm SIMD128 support (spec: https://github.com/stoklund/portable-simd/blob/master/portable-simd.md), adding support for add, sub, and mul on i8x16, i16x8, i32x4, and f32x4.

The spec is a work in progress and might change in the near future.

Patch by João Porto

Differential Revision: https://reviews.llvm.org/D22686

llvm-svn: 277543
2025-01-31 20:51:52 +01:00 · 2016-08-02 23:16:09 +00:00 · 2016-08-02 23:16:09 +00:00 · 4c52f7daa5
commit 4c52f7daa5
parent f502af3ddc
17 changed files with 401 additions and 14 deletions
--- a/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp
+++ b/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp
@ -210,6 +210,11 @@ const char *llvm::WebAssembly::TypeToString(MVT Ty) {
    return "f32";
  case MVT::f64:
    return "f64";
+  case MVT::v16i8:
+  case MVT::v8i16:
+  case MVT::v4i32:
+  case MVT::v4f32:
+    return "v128";
  default:
    llvm_unreachable("unsupported type");
  }
--- a/lib/Target/WebAssembly/WebAssembly.td
+++ b/lib/Target/WebAssembly/WebAssembly.td
@ -23,7 +23,7 @@ include "llvm/Target/Target.td"
 // WebAssembly Subtarget features.
 //===----------------------------------------------------------------------===//

-def FeatureSIMD128 : SubtargetFeature<"simd128", "HasSIMD128", "false",
+def FeatureSIMD128 : SubtargetFeature<"simd128", "HasSIMD128", "true",
                                      "Enable 128-bit SIMD">;

 //===----------------------------------------------------------------------===//
--- a/lib/Target/WebAssembly/WebAssemblyArgumentMove.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyArgumentMove.cpp
@ -26,9 +26,10 @@
 ///
 //===----------------------------------------------------------------------===//

-#include "WebAssembly.h"
 #include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "WebAssembly.h"
 #include "WebAssemblyMachineFunctionInfo.h"
+#include "WebAssemblySubtarget.h"
 #include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/Passes.h"
@ -71,6 +72,10 @@ static bool IsArgument(const MachineInstr &MI) {
  case WebAssembly::ARGUMENT_I64:
  case WebAssembly::ARGUMENT_F32:
  case WebAssembly::ARGUMENT_F64:
+  case WebAssembly::ARGUMENT_v16i8:
+  case WebAssembly::ARGUMENT_v8i16:
+  case WebAssembly::ARGUMENT_v4i32:
+  case WebAssembly::ARGUMENT_v4f32:
    return true;
  default:
    return false;
--- a/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
@ -95,7 +95,8 @@ private:

 MVT WebAssemblyAsmPrinter::getRegType(unsigned RegNo) const {
  const TargetRegisterClass *TRC = MRI->getRegClass(RegNo);
-  for (MVT T : {MVT::i32, MVT::i64, MVT::f32, MVT::f64})
+  for (MVT T : {MVT::i32, MVT::i64, MVT::f32, MVT::f64, MVT::v16i8, MVT::v8i16,
+                MVT::v4i32, MVT::v4f32})
    if (TRC->hasType(T))
      return T;
  DEBUG(errs() << "Unknown type for register number: " << RegNo);
@ -234,13 +235,21 @@ void WebAssemblyAsmPrinter::EmitInstruction(const MachineInstr *MI) {
  case WebAssembly::ARGUMENT_I64:
  case WebAssembly::ARGUMENT_F32:
  case WebAssembly::ARGUMENT_F64:
+  case WebAssembly::ARGUMENT_v16i8:
+  case WebAssembly::ARGUMENT_v8i16:
+  case WebAssembly::ARGUMENT_v4i32:
+  case WebAssembly::ARGUMENT_v4f32:
    // These represent values which are live into the function entry, so there's
    // no instruction to emit.
    break;
  case WebAssembly::FALLTHROUGH_RETURN_I32:
  case WebAssembly::FALLTHROUGH_RETURN_I64:
  case WebAssembly::FALLTHROUGH_RETURN_F32:
-  case WebAssembly::FALLTHROUGH_RETURN_F64: {
+  case WebAssembly::FALLTHROUGH_RETURN_F64:
+  case WebAssembly::FALLTHROUGH_RETURN_v16i8:
+  case WebAssembly::FALLTHROUGH_RETURN_v8i16:
+  case WebAssembly::FALLTHROUGH_RETURN_v4i32:
+  case WebAssembly::FALLTHROUGH_RETURN_v4f32: {
    // These instructions represent the implicit return at the end of a
    // function body. The operand is always a pop.
    assert(MFI->isVRegStackified(MI->getOperand(0).getReg()));
--- a/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
@ -113,6 +113,13 @@ private:
    case MVT::f32:
    case MVT::f64:
      return VT;
+    case MVT::v16i8:
+    case MVT::v8i16:
+    case MVT::v4i32:
+    case MVT::v4f32:
+      if (Subtarget->hasSIMD128())
+        return VT;
+      break;
    default:
      break;
    }
@ -575,7 +582,9 @@ bool WebAssemblyFastISel::fastLowerArguments() {
      return false;

    Type *ArgTy = Arg.getType();
-    if (ArgTy->isStructTy() || ArgTy->isArrayTy() || ArgTy->isVectorTy())
+    if (ArgTy->isStructTy() || ArgTy->isArrayTy())
+      return false;
+    if (!Subtarget->hasSIMD128() && ArgTy->isVectorTy())
      return false;

    unsigned Opc;
@ -600,6 +609,22 @@ bool WebAssemblyFastISel::fastLowerArguments() {
      Opc = WebAssembly::ARGUMENT_F64;
      RC = &WebAssembly::F64RegClass;
      break;
+    case MVT::v16i8:
+      Opc = WebAssembly::ARGUMENT_v16i8;
+      RC = &WebAssembly::V128RegClass;
+      break;
+    case MVT::v8i16:
+      Opc = WebAssembly::ARGUMENT_v8i16;
+      RC = &WebAssembly::V128RegClass;
+      break;
+    case MVT::v4i32:
+      Opc = WebAssembly::ARGUMENT_v4i32;
+      RC = &WebAssembly::V128RegClass;
+      break;
+    case MVT::v4f32:
+      Opc = WebAssembly::ARGUMENT_v4f32;
+      RC = &WebAssembly::V128RegClass;
+      break;
    default:
      return false;
    }
@ -639,6 +664,9 @@ bool WebAssemblyFastISel::selectCall(const Instruction *I) {
  if (IsVoid) {
    Opc = IsDirect ? WebAssembly::CALL_VOID : WebAssembly::CALL_INDIRECT_VOID;
  } else {
+    if (!Subtarget->hasSIMD128() && Call->getType()->isVectorTy())
+      return false;
+
    MVT::SimpleValueType RetTy = getSimpleType(Call->getType());
    switch (RetTy) {
    case MVT::i1:
@ -660,6 +688,26 @@ bool WebAssemblyFastISel::selectCall(const Instruction *I) {
      Opc = IsDirect ? WebAssembly::CALL_F64 : WebAssembly::CALL_INDIRECT_F64;
      ResultReg = createResultReg(&WebAssembly::F64RegClass);
      break;
+    case MVT::v16i8:
+      Opc =
+          IsDirect ? WebAssembly::CALL_v16i8 : WebAssembly::CALL_INDIRECT_v16i8;
+      ResultReg = createResultReg(&WebAssembly::V128RegClass);
+      break;
+    case MVT::v8i16:
+      Opc =
+          IsDirect ? WebAssembly::CALL_v8i16 : WebAssembly::CALL_INDIRECT_v8i16;
+      ResultReg = createResultReg(&WebAssembly::V128RegClass);
+      break;
+    case MVT::v4i32:
+      Opc =
+          IsDirect ? WebAssembly::CALL_v4i32 : WebAssembly::CALL_INDIRECT_v4i32;
+      ResultReg = createResultReg(&WebAssembly::V128RegClass);
+      break;
+    case MVT::v4f32:
+      Opc =
+          IsDirect ? WebAssembly::CALL_v4f32 : WebAssembly::CALL_INDIRECT_v4f32;
+      ResultReg = createResultReg(&WebAssembly::V128RegClass);
+      break;
    default:
      return false;
    }
@ -972,6 +1020,8 @@ bool WebAssemblyFastISel::selectLoad(const Instruction *I) {
  const LoadInst *Load = cast<LoadInst>(I);
  if (Load->isAtomic())
    return false;
+  if (!Subtarget->hasSIMD128() && Load->getType()->isVectorTy())
+    return false;

  Address Addr;
  if (!computeAddress(Load->getPointerOperand(), Addr))
@ -1027,6 +1077,9 @@ bool WebAssemblyFastISel::selectStore(const Instruction *I) {
  const StoreInst *Store = cast<StoreInst>(I);
  if (Store->isAtomic())
    return false;
+  if (!Subtarget->hasSIMD128() &&
+      Store->getValueOperand()->getType()->isVectorTy())
+    return false;

  Address Addr;
  if (!computeAddress(Store->getPointerOperand(), Addr))
@ -1102,7 +1155,7 @@ bool WebAssemblyFastISel::selectBr(const Instruction *I) {
  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc))
      .addMBB(TBB)
      .addReg(CondReg);
-  
+
  finishCondBranch(Br->getParent(), TBB, FBB);
  return true;
 }
@ -1120,6 +1173,9 @@ bool WebAssemblyFastISel::selectRet(const Instruction *I) {
  }

  Value *RV = Ret->getOperand(0);
+  if (!Subtarget->hasSIMD128() && RV->getType()->isVectorTy())
+    return false;
+
  unsigned Opc;
  switch (getSimpleType(RV->getType())) {
  case MVT::i1: case MVT::i8:
@ -1129,8 +1185,24 @@ bool WebAssemblyFastISel::selectRet(const Instruction *I) {
  case MVT::i64:
    Opc = WebAssembly::RETURN_I64;
    break;
-  case MVT::f32: Opc = WebAssembly::RETURN_F32; break;
-  case MVT::f64: Opc = WebAssembly::RETURN_F64; break;
+  case MVT::f32:
+    Opc = WebAssembly::RETURN_F32;
+    break;
+  case MVT::f64:
+    Opc = WebAssembly::RETURN_F64;
+    break;
+  case MVT::v16i8:
+    Opc = WebAssembly::RETURN_v16i8;
+    break;
+  case MVT::v8i16:
+    Opc = WebAssembly::RETURN_v8i16;
+    break;
+  case MVT::v4i32:
+    Opc = WebAssembly::RETURN_v4i32;
+    break;
+  case MVT::v4f32:
+    Opc = WebAssembly::RETURN_v4f32;
+    break;
  default: return false;
  }

--- a/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@ -54,6 +54,12 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
  addRegisterClass(MVT::i64, &WebAssembly::I64RegClass);
  addRegisterClass(MVT::f32, &WebAssembly::F32RegClass);
  addRegisterClass(MVT::f64, &WebAssembly::F64RegClass);
+  if (Subtarget->hasSIMD128()) {
+    addRegisterClass(MVT::v16i8, &WebAssembly::V128RegClass);
+    addRegisterClass(MVT::v8i16, &WebAssembly::V128RegClass);
+    addRegisterClass(MVT::v4i32, &WebAssembly::V128RegClass);
+    addRegisterClass(MVT::v4f32, &WebAssembly::V128RegClass);
+  }
  // Compute derived properties from the register classes.
  computeRegisterProperties(Subtarget->getRegisterInfo());

@ -190,6 +196,10 @@ WebAssemblyTargetLowering::getRegForInlineAsmConstraint(
    switch (Constraint[0]) {
      case 'r':
        assert(VT != MVT::iPTR && "Pointer MVT not expected here");
+        if (Subtarget->hasSIMD128() && VT.isVector()) {
+          if (VT.getSizeInBits() == 128)
+            return std::make_pair(0U, &WebAssembly::V128RegClass);
+        }
        if (VT.isInteger() && !VT.isVector()) {
          if (VT.getSizeInBits() <= 32)
            return std::make_pair(0U, &WebAssembly::I32RegClass);
--- a/lib/Target/WebAssembly/WebAssemblyInstrCall.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrCall.td
@ -33,11 +33,29 @@ multiclass CALL<WebAssemblyRegClass vt, string prefix> {
                            [(set vt:$dst, (WebAssemblycall1 I32:$callee))],
                            !strconcat(prefix, "call_indirect\t$dst, $callee")>;
 }
+
+multiclass SIMD_CALL<ValueType vt, string prefix> {
+  def CALL_#vt : SIMD_I<(outs V128:$dst), (ins i32imm:$callee, variable_ops),
+                         [(set (vt V128:$dst),
+                               (WebAssemblycall1 (i32 imm:$callee)))],
+                         !strconcat(prefix, "call\t$dst, $callee")>;
+  def CALL_INDIRECT_#vt : SIMD_I<(outs V128:$dst),
+                                  (ins I32:$callee, variable_ops),
+                                  [(set (vt V128:$dst),
+                                        (WebAssemblycall1 I32:$callee))],
+                                  !strconcat(prefix,
+                                             "call_indirect\t$dst, $callee")>;
+}
+
 let Uses = [SP32, SP64], isCall = 1 in {
  defm : CALL<I32, "i32.">;
  defm : CALL<I64, "i64.">;
  defm : CALL<F32, "f32.">;
  defm : CALL<F64, "f64.">;
+  defm : SIMD_CALL<v16i8, "i8x16.">;
+  defm : SIMD_CALL<v8i16, "i16x8.">;
+  defm : SIMD_CALL<v4i32, "i32x4.">;
+  defm : SIMD_CALL<v4f32, "f32x4.">;

  def CALL_VOID : I<(outs), (ins i32imm:$callee, variable_ops),
                    [(WebAssemblycall0 (i32 imm:$callee))],
@ -58,6 +76,14 @@ def : Pat<(f32 (WebAssemblycall1 (WebAssemblywrapper tglobaladdr:$callee))),
          (CALL_F32 tglobaladdr:$callee)>;
 def : Pat<(f64 (WebAssemblycall1 (WebAssemblywrapper tglobaladdr:$callee))),
          (CALL_F64 tglobaladdr:$callee)>;
+def : Pat<(v16i8 (WebAssemblycall1 (WebAssemblywrapper tglobaladdr:$callee))),
+          (CALL_v16i8 tglobaladdr:$callee)>, Requires<[HasSIMD128]>;
+def : Pat<(v8i16 (WebAssemblycall1 (WebAssemblywrapper tglobaladdr:$callee))),
+          (CALL_v8i16 tglobaladdr:$callee)>, Requires<[HasSIMD128]>;
+def : Pat<(v4i32 (WebAssemblycall1 (WebAssemblywrapper tglobaladdr:$callee))),
+          (CALL_v4i32 tglobaladdr:$callee)>, Requires<[HasSIMD128]>;
+def : Pat<(v4f32 (WebAssemblycall1 (WebAssemblywrapper tglobaladdr:$callee))),
+          (CALL_v4f32 tglobaladdr:$callee)>, Requires<[HasSIMD128]>;
 def : Pat<(WebAssemblycall0 (WebAssemblywrapper tglobaladdr:$callee)),
          (CALL_VOID tglobaladdr:$callee)>;

@ -70,5 +96,13 @@ def : Pat<(f32 (WebAssemblycall1 (WebAssemblywrapper texternalsym:$callee))),
          (CALL_F32 texternalsym:$callee)>;
 def : Pat<(f64 (WebAssemblycall1 (WebAssemblywrapper texternalsym:$callee))),
          (CALL_F64 texternalsym:$callee)>;
+def : Pat<(v16i8 (WebAssemblycall1 (WebAssemblywrapper texternalsym:$callee))),
+          (CALL_v16i8 texternalsym:$callee)>, Requires<[HasSIMD128]>;
+def : Pat<(v8i16 (WebAssemblycall1 (WebAssemblywrapper texternalsym:$callee))),
+          (CALL_v8i16 texternalsym:$callee)>, Requires<[HasSIMD128]>;
+def : Pat<(v4i32 (WebAssemblycall1 (WebAssemblywrapper texternalsym:$callee))),
+          (CALL_v4i32 texternalsym:$callee)>, Requires<[HasSIMD128]>;
+def : Pat<(v4f32 (WebAssemblycall1 (WebAssemblywrapper texternalsym:$callee))),
+          (CALL_v4f32 texternalsym:$callee)>, Requires<[HasSIMD128]>;
 def : Pat<(WebAssemblycall0 (WebAssemblywrapper texternalsym:$callee)),
          (CALL_VOID texternalsym:$callee)>;
--- a/lib/Target/WebAssembly/WebAssemblyInstrControl.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrControl.td
@ -77,12 +77,27 @@ multiclass RETURN<WebAssemblyRegClass vt> {
  def FALLTHROUGH_RETURN_#vt : I<(outs), (ins vt:$val), []>;
 }

+multiclass SIMD_RETURN<ValueType vt> {
+  def RETURN_#vt : SIMD_I<(outs), (ins V128:$val),
+                          [(WebAssemblyreturn (vt V128:$val))],
+                          "return  \t$val">;
+  // Equivalent to RETURN_#vt, for use at the end of a function when wasm
+  // semantics return by falling off the end of the block.
+  let isCodeGenOnly = 1 in
+  def FALLTHROUGH_RETURN_#vt : SIMD_I<(outs), (ins V128:$val), []>;
+}
+
 let isTerminator = 1, hasCtrlDep = 1, isBarrier = 1 in {
 let isReturn = 1 in {
  defm : RETURN<I32>;
  defm : RETURN<I64>;
  defm : RETURN<F32>;
  defm : RETURN<F64>;
+  defm : SIMD_RETURN<v16i8>;
+  defm : SIMD_RETURN<v8i16>;
+  defm : SIMD_RETURN<v4i32>;
+  defm : SIMD_RETURN<v4f32>;
+
  def RETURN_VOID : I<(outs), (ins), [(WebAssemblyreturn)], "return">;

  // This is to RETURN_VOID what FALLTHROUGH_RETURN_#vt is to RETURN_#vt.
--- a/lib/Target/WebAssembly/WebAssemblyInstrFormats.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrFormats.td
@ -28,6 +28,9 @@ class I<dag oops, dag iops, list<dag> pattern, string asmstr = "">
  let Pattern        = pattern;
 }

+class SIMD_I<dag oops, dag iops, list<dag> pattern, string asmstr = "">
+    : I<oops, iops, pattern, asmstr>, Requires<[HasSIMD128]>;
+
 // Unary and binary instructions, for the local types that WebAssembly supports.
 multiclass UnaryInt<SDNode node, string name> {
  def _I32 : I<(outs I32:$dst), (ins I32:$src),
@ -61,6 +64,21 @@ multiclass BinaryFP<SDNode node, string name> {
               [(set F64:$dst, (node F64:$lhs, F64:$rhs))],
               !strconcat("f64.", !strconcat(name, "\t$dst, $lhs, $rhs"))>;
 }
+multiclass SIMDBinary<SDNode node, SDNode fnode, string name> {
+  def _I8x16 : SIMD_I<(outs V128:$dst), (ins V128:$lhs, V128:$rhs),
+                      [(set (v16i8 V128:$dst), (node V128:$lhs, V128:$rhs))],
+                      !strconcat("i8x16.", !strconcat(name, "\t$dst, $lhs, $rhs"))>;
+  def _I16x8 : SIMD_I<(outs V128:$dst), (ins V128:$lhs, V128:$rhs),
+                      [(set (v8i16 V128:$dst), (node V128:$lhs, V128:$rhs))],
+                      !strconcat("i16x8.", !strconcat(name, "\t$dst, $lhs, $rhs"))>;
+  def _I32x4 : SIMD_I<(outs V128:$dst), (ins V128:$lhs, V128:$rhs),
+                      [(set (v4i32 V128:$dst), (node V128:$lhs, V128:$rhs))],
+                      !strconcat("i32x4.", !strconcat(name, "\t$dst, $lhs, $rhs"))>;
+  def _F32x4 : SIMD_I<(outs V128:$dst), (ins V128:$lhs, V128:$rhs),
+                      [(set (v4f32 V128:$dst), (fnode V128:$lhs, V128:$rhs))],
+                      !strconcat("f32x4.", !strconcat(name, "\t$dst, $lhs, $rhs"))>;
+
+}
 multiclass ComparisonInt<CondCode cond, string name> {
  def _I32 : I<(outs I32:$dst), (ins I32:$lhs, I32:$rhs),
               [(set I32:$dst, (setcc I32:$lhs, I32:$rhs, cond))],
--- a/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
@ -100,10 +100,20 @@ multiclass ARGUMENT<WebAssemblyRegClass vt> {
  def ARGUMENT_#vt : I<(outs vt:$res), (ins i32imm:$argno),
                       [(set vt:$res, (WebAssemblyargument timm:$argno))]>;
 }
+multiclass SIMD_ARGUMENT<ValueType vt> {
+  let hasSideEffects = 1, Uses = [ARGUMENTS], isCodeGenOnly = 1 in
+  def ARGUMENT_#vt : SIMD_I<(outs V128:$res), (ins i32imm:$argno),
+                            [(set (vt V128:$res),
+                                  (WebAssemblyargument timm:$argno))]>;
+}
 defm : ARGUMENT<I32>;
 defm : ARGUMENT<I64>;
 defm : ARGUMENT<F32>;
 defm : ARGUMENT<F64>;
+defm : SIMD_ARGUMENT<v16i8>;
+defm : SIMD_ARGUMENT<v8i16>;
+defm : SIMD_ARGUMENT<v4i32>;
+defm : SIMD_ARGUMENT<v4f32>;

 let Defs = [ARGUMENTS] in {

@ -131,6 +141,7 @@ defm : LOCAL<I32>;
 defm : LOCAL<I64>;
 defm : LOCAL<F32>;
 defm : LOCAL<F64>;
+defm : LOCAL<V128>, Requires<[HasSIMD128]>;

 let isMoveImm = 1, isAsCheapAsAMove = 1, isReMaterializable = 1 in {
 def CONST_I32 : I<(outs I32:$res), (ins i32imm:$imm),
--- a/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
@ -12,5 +12,8 @@
 ///
 //===----------------------------------------------------------------------===//

-// TODO: Implement SIMD instructions.
-// Note: use Requires<[HasSIMD128]>.
+let isCommutable = 1 in {
+defm ADD : SIMDBinary<add, fadd, "add ">;
+defm MUL: SIMDBinary<mul, fmul, "mul ">;
+} // isCommutable = 1
+defm SUB: SIMDBinary<sub, fsub, "sub ">;
--- a/lib/Target/WebAssembly/WebAssemblyPeephole.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyPeephole.cpp
@ -108,7 +108,8 @@ bool WebAssemblyPeephole::runOnMachineFunction(MachineFunction &MF) {

  MachineRegisterInfo &MRI = MF.getRegInfo();
  WebAssemblyFunctionInfo &MFI = *MF.getInfo<WebAssemblyFunctionInfo>();
-  const auto &TII = *MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
+  const auto &Subtarget = MF.getSubtarget<WebAssemblySubtarget>();
+  const auto &TII = *Subtarget.getInstrInfo();
  const WebAssemblyTargetLowering &TLI =
      *MF.getSubtarget<WebAssemblySubtarget>().getTargetLowering();
  auto &LibInfo = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
@ -186,6 +187,34 @@ bool WebAssemblyPeephole::runOnMachineFunction(MachineFunction &MF) {
            MI, MBB, MF, MFI, MRI, TII, WebAssembly::FALLTHROUGH_RETURN_F64,
            WebAssembly::COPY_LOCAL_F64);
        break;
+      case WebAssembly::RETURN_v16i8:
+        Changed |=
+            Subtarget.hasSIMD128() &&
+            MaybeRewriteToFallthrough(MI, MBB, MF, MFI, MRI, TII,
+                                      WebAssembly::FALLTHROUGH_RETURN_v16i8,
+                                      WebAssembly::COPY_LOCAL_V128);
+        break;
+      case WebAssembly::RETURN_v8i16:
+        Changed |=
+            Subtarget.hasSIMD128() &&
+            MaybeRewriteToFallthrough(MI, MBB, MF, MFI, MRI, TII,
+                                      WebAssembly::FALLTHROUGH_RETURN_v8i16,
+                                      WebAssembly::COPY_LOCAL_V128);
+        break;
+      case WebAssembly::RETURN_v4i32:
+        Changed |=
+            Subtarget.hasSIMD128() &&
+            MaybeRewriteToFallthrough(MI, MBB, MF, MFI, MRI, TII,
+                                      WebAssembly::FALLTHROUGH_RETURN_v4i32,
+                                      WebAssembly::COPY_LOCAL_V128);
+        break;
+      case WebAssembly::RETURN_v4f32:
+        Changed |=
+            Subtarget.hasSIMD128() &&
+            MaybeRewriteToFallthrough(MI, MBB, MF, MFI, MRI, TII,
+                                      WebAssembly::FALLTHROUGH_RETURN_v4f32,
+                                      WebAssembly::COPY_LOCAL_V128);
+        break;
      case WebAssembly::RETURN_VOID:
        if (!DisableWebAssemblyFallthroughReturnOpt &&
            &MBB == &MF.back() && &MI == &MBB.back())
--- a/lib/Target/WebAssembly/WebAssemblyPrepareForLiveIntervals.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyPrepareForLiveIntervals.cpp
@ -65,6 +65,10 @@ static bool IsArgument(const MachineInstr *MI) {
  case WebAssembly::ARGUMENT_I64:
  case WebAssembly::ARGUMENT_F32:
  case WebAssembly::ARGUMENT_F64:
+  case WebAssembly::ARGUMENT_v16i8:
+  case WebAssembly::ARGUMENT_v8i16:
+  case WebAssembly::ARGUMENT_v4i32:
+  case WebAssembly::ARGUMENT_v4f32:
    return true;
  default:
    return false;
@ -73,7 +77,7 @@ static bool IsArgument(const MachineInstr *MI) {

 // Test whether the given register has an ARGUMENT def.
 static bool HasArgumentDef(unsigned Reg, const MachineRegisterInfo &MRI) {
-  for (auto &Def : MRI.def_instructions(Reg))
+  for (const auto &Def : MRI.def_instructions(Reg))
    if (IsArgument(&Def))
      return true;
  return false;
--- a/lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp
@ -72,7 +72,11 @@ bool WebAssemblyRegNumbering::runOnMachineFunction(MachineFunction &MF) {
    case WebAssembly::ARGUMENT_I32:
    case WebAssembly::ARGUMENT_I64:
    case WebAssembly::ARGUMENT_F32:
-    case WebAssembly::ARGUMENT_F64: {
+    case WebAssembly::ARGUMENT_F64:
+    case WebAssembly::ARGUMENT_v16i8:
+    case WebAssembly::ARGUMENT_v8i16:
+    case WebAssembly::ARGUMENT_v4i32:
+    case WebAssembly::ARGUMENT_v4f32: {
      int64_t Imm = MI.getOperand(1).getImm();
      DEBUG(dbgs() << "Arg VReg " << MI.getOperand(0).getReg() << " -> WAReg "
                   << Imm << "\n");
--- a/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
@ -418,6 +418,8 @@ static unsigned GetTeeLocalOpcode(const TargetRegisterClass *RC) {
    return WebAssembly::TEE_LOCAL_F32;
  if (RC == &WebAssembly::F64RegClass)
    return WebAssembly::TEE_LOCAL_F64;
+  if (RC == &WebAssembly::V128RegClass)
+    return WebAssembly::TEE_LOCAL_V128;
  llvm_unreachable("Unexpected register class");
 }

@ -765,7 +767,11 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) {
        if (Def->getOpcode() == WebAssembly::ARGUMENT_I32 ||
            Def->getOpcode() == WebAssembly::ARGUMENT_I64 ||
            Def->getOpcode() == WebAssembly::ARGUMENT_F32 ||
-            Def->getOpcode() == WebAssembly::ARGUMENT_F64)
+            Def->getOpcode() == WebAssembly::ARGUMENT_F64 ||
+            Def->getOpcode() == WebAssembly::ARGUMENT_v16i8 ||
+            Def->getOpcode() == WebAssembly::ARGUMENT_v8i16 ||
+            Def->getOpcode() == WebAssembly::ARGUMENT_v4i32 ||
+            Def->getOpcode() == WebAssembly::ARGUMENT_v4f32)
          continue;

        // Decide which strategy to take. Prefer to move a single-use value
--- a/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td
+++ b/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td
@ -39,6 +39,8 @@ def SP64 : WebAssemblyReg<"%SP64">;
 def F32_0 : WebAssemblyReg<"%f32.0">;
 def F64_0 : WebAssemblyReg<"%f64.0">;

+def V128_0: WebAssemblyReg<"%v128">;
+
 // The expression stack "register". This is an opaque entity which serves to
 // order uses and defs that must remain in LIFO order.
 def EXPR_STACK : WebAssemblyReg<"STACK">;
@ -56,3 +58,5 @@ def I32 : WebAssemblyRegClass<[i32], 32, (add FP32, SP32)>;
 def I64 : WebAssemblyRegClass<[i64], 64, (add FP64, SP64)>;
 def F32 : WebAssemblyRegClass<[f32], 32, (add F32_0)>;
 def F64 : WebAssemblyRegClass<[f64], 64, (add F64_0)>;
+def V128 : WebAssemblyRegClass<[v4f32, v4i32, v16i8, v8i16], 128, (add V128_0)>;
+
--- a/test/CodeGen/WebAssembly/simd-arith.ll
+++ b/test/CodeGen/WebAssembly/simd-arith.ll
@ -0,0 +1,158 @@
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -mattr=+simd128 | FileCheck %s --check-prefixes CHECK,SIMD128
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -mattr=+simd128 -fast-isel | FileCheck %s --check-prefixes CHECK,SIMD128
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -mattr=-simd128 | FileCheck %s --check-prefixes CHECK,NO-SIMD128
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -mattr=-simd128 -fast-isel | FileCheck %s --check-prefixes CHECK,NO-SIMD128
+
+; Test that basic SIMD128 arithmetic operations assemble as expected.
+
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
+target triple = "wasm32-unknown-unknown"
+
+declare i32 @llvm.ctlz.i32(i32, i1)
+declare i32 @llvm.cttz.i32(i32, i1)
+declare i32 @llvm.ctpop.i32(i32)
+
+; ==============================================================================
+; 16 x i8
+; ==============================================================================
+; CHECK-LABEL: add_v16i8
+; NO-SIMD128-NOT: i8x16
+; SIMD128: .param v128, v128{{$}}
+; SIMD128: .result v128{{$}}
+; SIMD128: i8x16.add $push0=, $0, $1{{$}}
+; SIMD128: return $pop0{{$}}
+define <16 x i8> @add_v16i8(<16 x i8> %x, <16 x i8> %y) {
+  %a = add <16 x i8> %x, %y
+  ret <16 x i8> %a
+}
+
+; CHECK-LABEL: sub_v16i8
+; NO-SIMD128-NOT: i8x16
+; SIMD128: .param v128, v128{{$}}
+; SIMD128: .result v128{{$}}
+; SIMD128: i8x16.sub $push0=, $0, $1{{$}}
+; SIMD128: return $pop0{{$}}
+define <16 x i8> @sub_v16i8(<16 x i8> %x, <16 x i8> %y) {
+  %a = sub <16 x i8> %x, %y
+  ret <16 x i8> %a
+}
+
+; CHECK-LABEL: mul_v16i8
+; NO-SIMD128-NOT: i8x16
+; SIMD128: .param v128, v128{{$}}
+; SIMD128: .result v128{{$}}
+; SIMD128: i8x16.mul $push0=, $0, $1{{$}}
+; SIMD128: return $pop0{{$}}
+define <16 x i8> @mul_v16i8(<16 x i8> %x, <16 x i8> %y) {
+  %a = mul <16 x i8> %x, %y
+  ret <16 x i8> %a
+}
+
+; ==============================================================================
+; 8 x i16
+; ==============================================================================
+; CHECK-LABEL: add_v8i16
+; NO-SIMD128-NOT: i16x8
+; SIMD128: .param v128, v128{{$}}
+; SIMD128: .result v128{{$}}
+; SIMD128: i16x8.add $push0=, $0, $1{{$}}
+; SIMD128: return $pop0{{$}}
+define <8 x i16> @add_v8i16(<8 x i16> %x, <8 x i16> %y) {
+  %a = add <8 x i16> %x, %y
+  ret <8 x i16> %a
+}
+
+; CHECK-LABEL: sub_v8i16
+; NO-SIMD128-NOT: i16x8
+; SIMD128: .param v128, v128{{$}}
+; SIMD128: .result v128{{$}}
+; SIMD128: i16x8.sub $push0=, $0, $1{{$}}
+; SIMD128: return $pop0{{$}}
+define <8 x i16> @sub_v8i16(<8 x i16> %x, <8 x i16> %y) {
+  %a = sub <8 x i16> %x, %y
+  ret <8 x i16> %a
+}
+
+; CHECK-LABEL: mul_v8i16
+; NO-SIMD128-NOT: i16x8
+; SIMD128: .param v128, v128{{$}}
+; SIMD128: .result v128{{$}}
+; SIMD128: i16x8.mul $push0=, $0, $1{{$}}
+; SIMD128: return $pop0{{$}}
+define <8 x i16> @mul_v8i16(<8 x i16> %x, <8 x i16> %y) {
+  %a = mul <8 x i16> %x, %y
+  ret <8 x i16> %a
+}
+
+; ==============================================================================
+; 4 x i32
+; ==============================================================================
+; CHECK-LABEL: add_v4i32
+; NO-SIMD128-NOT: i32x4
+; SIMD128: .param v128, v128{{$}}
+; SIMD128: .result v128{{$}}
+; SIMD128: i32x4.add $push0=, $0, $1{{$}}
+; SIMD128: return $pop0{{$}}
+define <4 x i32> @add_v4i32(<4 x i32> %x, <4 x i32> %y) {
+  %a = add <4 x i32> %x, %y
+  ret <4 x i32> %a
+}
+
+; CHECK-LABEL: sub_v4i32
+; NO-SIMD128-NOT: i32x4
+; SIMD128: .param v128, v128{{$}}
+; SIMD128: .result v128{{$}}
+; SIMD128: i32x4.sub $push0=, $0, $1{{$}}
+; SIMD128: return $pop0{{$}}
+define <4 x i32> @sub_v4i32(<4 x i32> %x, <4 x i32> %y) {
+  %a = sub <4 x i32> %x, %y
+  ret <4 x i32> %a
+}
+
+; CHECK-LABEL: mul_v4i32
+; NO-SIMD128-NOT: i32x4
+; SIMD128: .param v128, v128{{$}}
+; SIMD128: .result v128{{$}}
+; SIMD128: i32x4.mul $push0=, $0, $1{{$}}
+; SIMD128: return $pop0{{$}}
+define <4 x i32> @mul_v4i32(<4 x i32> %x, <4 x i32> %y) {
+  %a = mul <4 x i32> %x, %y
+  ret <4 x i32> %a
+}
+
+; ==============================================================================
+; 4 x float
+; ==============================================================================
+; CHECK-LABEL: add_v4f32
+; NO-SIMD128-NOT: f32x4
+; SIMD128: .param v128, v128{{$}}
+; SIMD128: .result v128{{$}}
+; SIMD128: f32x4.add $push0=, $0, $1{{$}}
+; SIMD128: return $pop0{{$}}
+define <4 x float> @add_v4f32(<4 x float> %x, <4 x float> %y) {
+  %a = fadd <4 x float> %x, %y
+  ret <4 x float> %a
+}
+
+; CHECK-LABEL: sub_v4f32
+; NO-SIMD128-NOT: f32x4
+; SIMD128: .param v128, v128{{$}}
+; SIMD128: .result v128{{$}}
+; SIMD128: f32x4.sub $push0=, $0, $1{{$}}
+; SIMD128: return $pop0{{$}}
+define <4 x float> @sub_v4f32(<4 x float> %x, <4 x float> %y) {
+  %a = fsub <4 x float> %x, %y
+  ret <4 x float> %a
+}
+
+; CHECK-LABEL: mul_v4f32
+; NO-SIMD128-NOT: f32x4
+; SIMD128: .param v128, v128{{$}}
+; SIMD128: .result v128{{$}}
+; SIMD128: f32x4.mul $push0=, $0, $1{{$}}
+; SIMD128: return $pop0{{$}}
+define <4 x float> @mul_v4f32(<4 x float> %x, <4 x float> %y) {
+  %a = fmul <4 x float> %x, %y
+  ret <4 x float> %a
+}
+