[AArch64] Armv8.6-a Matrix Mult Assembly + Intrinsics

This patch upstreams support for the Armv8.6-a Matrix Multiplication Extension. A summary of the features can be found here: https://community.arm.com/developer/ip-products/processors/b/processors-ip-blog/posts/arm-architecture-developments-armv8-6-a This patch includes: - Assembly support for AArch64 only (no SVE or Neon) - Intrinsics Support for AArch64 Armv8.6a Matrix Multiplication Instructions (No bfloat16 matrix multiplication) No IR types or C Types are needed for this extension. This is part of a patch series, starting with BFloat16 support and the other components in the armv8.6a extension (in previous patches linked in phabricator) Based on work by: - Luke Geeson - Oliver Stannard - Luke Cheeseman Reviewers: ostannard, t.p.northover, rengolin, kmclaughlin Reviewed By: kmclaughlin Subscribers: kmclaughlin, kristof.beyls, hiraditya, danielkiss, cfe-commits Tags: #clang Differential Revision: https://reviews.llvm.org/D77871
2025-01-31 12:41:49 +01:00 · 2020-04-09 17:21:19 +01:00 · 2020-04-09 17:21:19 +01:00 · 2ce0a5d73d
commit 2ce0a5d73d
parent 115f590f26
9 changed files with 342 additions and 17 deletions
--- a/include/llvm/IR/IntrinsicsAArch64.td
+++ b/include/llvm/IR/IntrinsicsAArch64.td
@ -173,6 +173,11 @@ let TargetPrefix = "aarch64" in {  // All intrinsics start with "llvm.aarch64.".
    : Intrinsic<[llvm_anyvector_ty],
                [LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<1>],
                [IntrNoMem]>;
+
+  class AdvSIMD_MatMul_Intrinsic
+    : Intrinsic<[llvm_anyvector_ty],
+                [LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<1>],
+                [IntrNoMem]>;
 }

 // Arithmetic ops
@ -449,6 +454,12 @@ let TargetPrefix = "aarch64", IntrProperties = [IntrNoMem] in {
  def int_aarch64_neon_udot : AdvSIMD_Dot_Intrinsic;
  def int_aarch64_neon_sdot : AdvSIMD_Dot_Intrinsic;

+// v8.6-A Matrix Multiply Intrinsics
+  def int_aarch64_neon_ummla : AdvSIMD_MatMul_Intrinsic;
+  def int_aarch64_neon_smmla : AdvSIMD_MatMul_Intrinsic;
+  def int_aarch64_neon_usmmla : AdvSIMD_MatMul_Intrinsic;
+  def int_aarch64_neon_usdot : AdvSIMD_Dot_Intrinsic;
+
  // v8.2-A FP16 Fused Multiply-Add Long
  def int_aarch64_neon_fmlal : AdvSIMD_FP16FML_Intrinsic;
  def int_aarch64_neon_fmlsl : AdvSIMD_FP16FML_Intrinsic;
--- a/lib/Target/AArch64/AArch64.td
+++ b/lib/Target/AArch64/AArch64.td
@ -373,6 +373,15 @@ def FeatureTaggedGlobals : SubtargetFeature<"tagged-globals",
 def FeatureBF16 : SubtargetFeature<"bf16", "HasBF16",
    "true", "Enable BFloat16 Extension" >;

+def FeatureMatMulInt8 : SubtargetFeature<"i8mm", "HasMatMulInt8",
+    "true", "Enable Matrix Multiply Int8 Extension">;
+
+def FeatureMatMulFP32 : SubtargetFeature<"f32mm", "HasMatMulFP32",
+    "true", "Enable Matrix Multiply FP32 Extension", [FeatureSVE]>;
+
+def FeatureMatMulFP64 : SubtargetFeature<"f64mm", "HasMatMulFP64",
+    "true", "Enable Matrix Multiply FP64 Extension", [FeatureSVE]>;
+
 def FeatureFineGrainedTraps : SubtargetFeature<"fgt", "HasFineGrainedTraps",
    "true", "Enable fine grained virtualization traps extension">;

@ -380,7 +389,6 @@ def FeatureEnhancedCounterVirtualization :
      SubtargetFeature<"ecv", "HasEnhancedCounterVirtualization",
      "true", "Enable enhanced counter virtualization extension">;

-
 //===----------------------------------------------------------------------===//
 // Architectures.
 //
@ -413,7 +421,7 @@ def HasV8_6aOps : SubtargetFeature<
  "v8.6a", "HasV8_6aOps", "true", "Support ARM v8.6a instructions",

  [HasV8_5aOps, FeatureAMVS, FeatureBF16, FeatureFineGrainedTraps,
-   FeatureEnhancedCounterVirtualization]>;
+   FeatureEnhancedCounterVirtualization, FeatureMatMulInt8]>;

 //===----------------------------------------------------------------------===//
 // Register File Description
--- a/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/lib/Target/AArch64/AArch64InstrFormats.td
@ -5550,11 +5550,11 @@ multiclass SIMDLogicalThreeVectorTied<bit U, bits<2> size,

 // ARMv8.2-A Dot Product Instructions (Vector): These instructions extract
 // bytes from S-sized elements.
-class BaseSIMDThreeSameVectorDot<bit Q, bit U, string asm, string kind1,
+class BaseSIMDThreeSameVectorDot<bit Q, bit U, bit Mixed, string asm, string kind1,
                                 string kind2, RegisterOperand RegType,
                                 ValueType AccumType, ValueType InputType,
                                 SDPatternOperator OpNode> :
-        BaseSIMDThreeSameVectorTied<Q, U, 0b100, 0b10010, RegType, asm, kind1,
+        BaseSIMDThreeSameVectorTied<Q, U, 0b100, {0b1001, Mixed}, RegType, asm, kind1,
        [(set (AccumType RegType:$dst),
              (OpNode (AccumType RegType:$Rd),
                      (InputType RegType:$Rn),
@ -5562,10 +5562,10 @@ class BaseSIMDThreeSameVectorDot<bit Q, bit U, string asm, string kind1,
  let AsmString = !strconcat(asm, "{\t$Rd" # kind1 # ", $Rn" # kind2 # ", $Rm" # kind2 # "}");
 }

-multiclass SIMDThreeSameVectorDot<bit U, string asm, SDPatternOperator OpNode> {
-  def v8i8  : BaseSIMDThreeSameVectorDot<0, U, asm, ".2s", ".8b", V64,
+multiclass SIMDThreeSameVectorDot<bit U, bit Mixed, string asm, SDPatternOperator OpNode> {
+  def v8i8  : BaseSIMDThreeSameVectorDot<0, U, Mixed, asm, ".2s", ".8b", V64,
                                         v2i32, v8i8, OpNode>;
-  def v16i8 : BaseSIMDThreeSameVectorDot<1, U, asm, ".4s", ".16b", V128,
+  def v16i8 : BaseSIMDThreeSameVectorDot<1, U, Mixed, asm, ".4s", ".16b", V128,
                                         v4i32, v16i8, OpNode>;
 }

@ -7903,13 +7903,26 @@ class BF16ToSinglePrecision<string asm>
 }
 } // End of let mayStore = 0, mayLoad = 0, hasSideEffects = 0

+//----------------------------------------------------------------------------
+// Armv8.6 Matrix Multiply Extension
+//----------------------------------------------------------------------------
+
+class SIMDThreeSameVectorMatMul<bit B, bit U, string asm, SDPatternOperator OpNode>
+  : BaseSIMDThreeSameVectorTied<1, U, 0b100, {0b1010, B}, V128, asm, ".4s",
+              [(set (v4i32 V128:$dst), (OpNode (v4i32 V128:$Rd),
+                                               (v16i8 V128:$Rn),
+                                               (v16i8 V128:$Rm)))]> {
+  let AsmString = asm # "{\t$Rd.4s, $Rn.16b, $Rm.16b}";
+}
+
+//----------------------------------------------------------------------------
 // ARMv8.2-A Dot Product Instructions (Indexed)
-class BaseSIMDThreeSameVectorDotIndex<bit Q, bit U, string asm, string dst_kind,
-                                      string lhs_kind, string rhs_kind,
+class BaseSIMDThreeSameVectorDotIndex<bit Q, bit U, bit Mixed, bits<2> size, string asm,
+                                      string dst_kind, string lhs_kind, string rhs_kind,
                                      RegisterOperand RegType,
                                      ValueType AccumType, ValueType InputType,
                                      SDPatternOperator OpNode> :
-        BaseSIMDIndexedTied<Q, U, 0b0, 0b10, 0b1110, RegType, RegType, V128,
+        BaseSIMDIndexedTied<Q, U, 0b0, size, {0b111, Mixed}, RegType, RegType, V128,
                            VectorIndexS, asm, "", dst_kind, lhs_kind, rhs_kind,
        [(set (AccumType RegType:$dst),
              (AccumType (OpNode (AccumType RegType:$Rd),
@ -7922,11 +7935,11 @@ class BaseSIMDThreeSameVectorDotIndex<bit Q, bit U, string asm, string dst_kind,
  let Inst{11}    = idx{1};  // H
 }

-multiclass SIMDThreeSameVectorDotIndex<bit U, string asm,
+multiclass SIMDThreeSameVectorDotIndex<bit U, bit Mixed, bits<2> size, string asm,
                                       SDPatternOperator OpNode> {
-  def v8i8  : BaseSIMDThreeSameVectorDotIndex<0, U, asm, ".2s", ".8b", ".4b",
+  def v8i8  : BaseSIMDThreeSameVectorDotIndex<0, U, Mixed, size, asm, ".2s", ".8b", ".4b",
                                              V64, v2i32, v8i8, OpNode>;
-  def v16i8 : BaseSIMDThreeSameVectorDotIndex<1, U, asm, ".4s", ".16b", ".4b",
+  def v16i8 : BaseSIMDThreeSameVectorDotIndex<1, U, Mixed, size, asm, ".4s", ".16b", ".4b",
                                              V128, v4i32, v16i8, OpNode>;
 }

--- a/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/lib/Target/AArch64/AArch64InstrInfo.td
@ -146,6 +146,12 @@ def HasTRBE          : Predicate<"Subtarget->hasTRBE()">,
                       AssemblerPredicate<(all_of FeatureTRBE), "trbe">;
 def HasBF16          : Predicate<"Subtarget->hasBF16()">,
                       AssemblerPredicate<(all_of FeatureBF16), "bf16">;
+def HasMatMulInt8    : Predicate<"Subtarget->hasMatMulInt8()">,
+                       AssemblerPredicate<(all_of FeatureMatMulInt8), "i8mm">;
+def HasMatMulFP32    : Predicate<"Subtarget->hasMatMulFP32()">,
+                       AssemblerPredicate<(all_of FeatureMatMulFP32), "f32mm">;
+def HasMatMulFP64    : Predicate<"Subtarget->hasMatMulFP64()">,
+                       AssemblerPredicate<(all_of FeatureMatMulFP64), "f64mm">;
 def IsLE             : Predicate<"Subtarget->isLittleEndian()">;
 def IsBE             : Predicate<"!Subtarget->isLittleEndian()">;
 def IsWindows        : Predicate<"Subtarget->isTargetWindows()">;
@ -745,10 +751,10 @@ def TSB   : CRmSystemI<barrier_op, 0b010, "tsb", []> {

 // ARMv8.2-A Dot Product
 let Predicates = [HasDotProd] in {
-defm SDOT : SIMDThreeSameVectorDot<0, "sdot", int_aarch64_neon_sdot>;
-defm UDOT : SIMDThreeSameVectorDot<1, "udot", int_aarch64_neon_udot>;
-defm SDOTlane : SIMDThreeSameVectorDotIndex<0, "sdot", int_aarch64_neon_sdot>;
-defm UDOTlane : SIMDThreeSameVectorDotIndex<1, "udot", int_aarch64_neon_udot>;
+defm SDOT : SIMDThreeSameVectorDot<0, 0, "sdot", int_aarch64_neon_sdot>;
+defm UDOT : SIMDThreeSameVectorDot<1, 0, "udot", int_aarch64_neon_udot>;
+defm SDOTlane : SIMDThreeSameVectorDotIndex<0, 0, 0b10, "sdot", int_aarch64_neon_sdot>;
+defm UDOTlane : SIMDThreeSameVectorDotIndex<1, 0, 0b10, "udot", int_aarch64_neon_udot>;
 }

 // ARMv8.6-A BFloat
@ -765,6 +771,40 @@ def BFCVTN2      : SIMD_BFCVTN2;
 def BFCVT        : BF16ToSinglePrecision<"bfcvt">;
 }

+// ARMv8.6A AArch64 matrix multiplication
+let Predicates = [HasMatMulInt8] in {
+def  SMMLA : SIMDThreeSameVectorMatMul<0, 0, "smmla", int_aarch64_neon_smmla>;
+def  UMMLA : SIMDThreeSameVectorMatMul<0, 1, "ummla", int_aarch64_neon_ummla>;
+def USMMLA : SIMDThreeSameVectorMatMul<1, 0, "usmmla", int_aarch64_neon_usmmla>;
+defm USDOT : SIMDThreeSameVectorDot<0, 1, "usdot", int_aarch64_neon_usdot>;
+defm USDOTlane : SIMDThreeSameVectorDotIndex<0, 1, 0b10, "usdot", int_aarch64_neon_usdot>;
+
+// sudot lane has a pattern where usdot is expected (there is no sudot).
+// The second operand is used in the dup operation to repeat the indexed
+// element.
+class BaseSIMDSUDOTIndex<bit Q, string dst_kind, string lhs_kind,
+                         string rhs_kind, RegisterOperand RegType,
+                         ValueType AccumType, ValueType InputType>
+      : BaseSIMDThreeSameVectorDotIndex<Q, 0, 1, 0b00, "sudot", dst_kind,
+                                        lhs_kind, rhs_kind, RegType, AccumType,
+                                        InputType, null_frag> {
+  let Pattern = [(set (AccumType RegType:$dst),
+                      (AccumType (int_aarch64_neon_usdot (AccumType RegType:$Rd),
+                                 (InputType (bitconvert (AccumType
+                                    (AArch64duplane32 (v4i32 V128:$Rm),
+                                        VectorIndexS:$idx)))),
+                                 (InputType RegType:$Rn))))];
+}
+
+multiclass SIMDSUDOTIndex {
+  def v8i8  : BaseSIMDSUDOTIndex<0, ".2s", ".8b", ".4b", V64, v2i32, v8i8>;
+  def v16i8 : BaseSIMDSUDOTIndex<1, ".4s", ".16b", ".4b", V128, v4i32, v16i8>;
+}
+
+defm SUDOTlane : SIMDSUDOTIndex;
+
+}
+
 // ARMv8.2-A FP16 Fused Multiply-Add Long
 let Predicates = [HasNEON, HasFP16FML] in {
 defm FMLAL      : SIMDThreeSameVectorFML<0, 1, 0b001, "fmlal", int_aarch64_neon_fmlal>;
--- a/lib/Target/AArch64/AArch64Subtarget.h
+++ b/lib/Target/AArch64/AArch64Subtarget.h
@ -148,6 +148,9 @@ protected:

  // Armv8.6-A Extensions
  bool HasBF16 = false;
+  bool HasMatMulInt8 = false;
+  bool HasMatMulFP32 = false;
+  bool HasMatMulFP64 = false;
  bool HasAMVS = false;
  bool HasFineGrainedTraps = false;
  bool HasEnhancedCounterVirtualization = false;
@ -417,6 +420,9 @@ public:
  bool hasSVE2SM4() const { return HasSVE2SM4; }
  bool hasSVE2SHA3() const { return HasSVE2SHA3; }
  bool hasSVE2BitPerm() const { return HasSVE2BitPerm; }
+  bool hasMatMulInt8() const { return HasMatMulInt8; }
+  bool hasMatMulFP32() const { return HasMatMulFP32; }
+  bool hasMatMulFP64() const { return HasMatMulFP64; }

  // Armv8.6-A Extensions
  bool hasBF16() const { return HasBF16; }
--- a/test/CodeGen/AArch64/aarch64-matmul.ll
+++ b/test/CodeGen/AArch64/aarch64-matmul.ll
@ -0,0 +1,136 @@
+; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon,+i8mm < %s -o -| FileCheck %s
+
+define <4 x i32> @smmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: smmla.v4i32.v16i8
+; CHECK: smmla   v0.4s, v1.16b, v2.16b
+  %vmmla1.i = tail call <4 x i32> @llvm.aarch64.neon.smmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b)
+  ret <4 x i32> %vmmla1.i
+}
+
+define <4 x i32> @ummla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: ummla.v4i32.v16i8
+; CHECK: ummla   v0.4s, v1.16b, v2.16b
+  %vmmla1.i = tail call <4 x i32> @llvm.aarch64.neon.ummla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b)
+  ret <4 x i32> %vmmla1.i
+}
+
+define <4 x i32> @usmmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: usmmla.v4i32.v16i8
+; CHECK: usmmla   v0.4s, v1.16b, v2.16b
+  %vusmmla1.i = tail call <4 x i32> @llvm.aarch64.neon.usmmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) #3
+  ret <4 x i32> %vusmmla1.i
+}
+
+define <2 x i32> @usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %b) {
+entry:
+; CHECK-LABEL: usdot.v2i32.v8i8
+; CHECK: usdot   v0.2s, v1.8b, v2.8b
+  %vusdot1.i = tail call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %b)
+  ret <2 x i32> %vusdot1.i
+}
+
+define <2 x i32> @usdot_lane.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %b) {
+entry:
+; CHECK-LABEL: usdot_lane.v2i32.v8i8
+; CHECK: usdot   v0.2s, v1.8b, v2.4b[0]
+  %0 = bitcast <8 x i8> %b to <2 x i32>
+  %shuffle = shufflevector <2 x i32> %0, <2 x i32> undef, <2 x i32> zeroinitializer
+  %1 = bitcast <2 x i32> %shuffle to <8 x i8>
+  %vusdot1.i = tail call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %1)
+  ret <2 x i32> %vusdot1.i
+}
+
+define <2 x i32> @sudot_lane.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %b) {
+entry:
+; CHECK-LABEL: sudot_lane.v2i32.v8i8
+; CHECK: sudot   v0.2s, v1.8b, v2.4b[0]
+  %0 = bitcast <8 x i8> %b to <2 x i32>
+  %shuffle = shufflevector <2 x i32> %0, <2 x i32> undef, <2 x i32> zeroinitializer
+  %1 = bitcast <2 x i32> %shuffle to <8 x i8>
+  %vusdot1.i = tail call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %1, <8 x i8> %a)
+  ret <2 x i32> %vusdot1.i
+}
+
+define <2 x i32> @usdot_lane.v2i32.v16i8(<2 x i32> %r, <8 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: usdot_lane.v2i32.v16i8
+; CHECK: usdot   v0.2s, v1.8b, v2.4b[0]
+  %0 = bitcast <16 x i8> %b to <4 x i32>
+  %shuffle = shufflevector <4 x i32> %0, <4 x i32> undef, <2 x i32> zeroinitializer
+  %1 = bitcast <2 x i32> %shuffle to <8 x i8>
+  %vusdot1.i = tail call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %1)
+  ret <2 x i32> %vusdot1.i
+}
+
+define <2 x i32> @sudot_lane.v2i32.v16i8(<2 x i32> %r, <8 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: sudot_lane.v2i32.v16i8
+; CHECK: sudot   v0.2s, v1.8b, v2.4b[0]
+  %0 = bitcast <16 x i8> %b to <4 x i32>
+  %shuffle = shufflevector <4 x i32> %0, <4 x i32> undef, <2 x i32> zeroinitializer
+  %1 = bitcast <2 x i32> %shuffle to <8 x i8>
+  %vusdot1.i = tail call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %1, <8 x i8> %a) #3
+  ret <2 x i32> %vusdot1.i
+}
+
+define <4 x i32> @usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: usdot.v4i32.v16i8
+; CHECK: usdot   v0.4s, v1.16b, v2.16b
+  %vusdot1.i = tail call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) #3
+  ret <4 x i32> %vusdot1.i
+}
+
+define <4 x i32> @usdot_lane.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <8 x i8> %b) {
+entry:
+; CHECK-LABEL: usdot_lane.v4i32.v16i8
+; CHECK: usdot   v0.4s, v1.16b, v2.4b[0]
+  %0 = bitcast <8 x i8> %b to <2 x i32>
+  %shuffle = shufflevector <2 x i32> %0, <2 x i32> undef, <4 x i32> zeroinitializer
+  %1 = bitcast <4 x i32> %shuffle to <16 x i8>
+  %vusdot1.i = tail call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %1) #3
+  ret <4 x i32> %vusdot1.i
+}
+
+define <4 x i32> @sudot_lane.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <8 x i8> %b) {
+entry:
+; CHECK-LABEL: sudot_lane.v4i32.v16i8
+; CHECK: sudot   v0.4s, v1.16b, v2.4b[0]
+  %0 = bitcast <8 x i8> %b to <2 x i32>
+  %shuffle = shufflevector <2 x i32> %0, <2 x i32> undef, <4 x i32> zeroinitializer
+  %1 = bitcast <4 x i32> %shuffle to <16 x i8>
+  %vusdot1.i = tail call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %1, <16 x i8> %a) #3
+  ret <4 x i32> %vusdot1.i
+}
+
+define <4 x i32> @usdot_laneq.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: usdot_laneq.v4i32.v16i8
+; CHECK: usdot   v0.4s, v1.16b, v2.4b[0]
+  %0 = bitcast <16 x i8> %b to <4 x i32>
+  %shuffle = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32> zeroinitializer
+  %1 = bitcast <4 x i32> %shuffle to <16 x i8>
+  %vusdot1.i = tail call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %1) #3
+  ret <4 x i32> %vusdot1.i
+}
+
+define <4 x i32> @sudot_laneq.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: sudot_laneq.v4i32.v16i8
+; CHECK: sudot   v0.4s, v1.16b, v2.4b[0]
+  %0 = bitcast <16 x i8> %b to <4 x i32>
+  %shuffle = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32> zeroinitializer
+  %1 = bitcast <4 x i32> %shuffle to <16 x i8>
+  %vusdot1.i = tail call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %1, <16 x i8> %a) #3
+  ret <4 x i32> %vusdot1.i
+}
+
+declare <4 x i32> @llvm.aarch64.neon.smmla.v4i32.v16i8(<4 x i32>, <16 x i8>, <16 x i8>) #2
+declare <4 x i32> @llvm.aarch64.neon.ummla.v4i32.v16i8(<4 x i32>, <16 x i8>, <16 x i8>) #2
+declare <4 x i32> @llvm.aarch64.neon.usmmla.v4i32.v16i8(<4 x i32>, <16 x i8>, <16 x i8>) #2
+declare <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32>, <8 x i8>, <8 x i8>) #2
+declare <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32>, <16 x i8>, <16 x i8>) #2
+
--- a/test/MC/AArch64/armv8.6a-simd-matmul-error.s
+++ b/test/MC/AArch64/armv8.6a-simd-matmul-error.s
@ -0,0 +1,34 @@
+// RUN: not llvm-mc -triple aarch64 -show-encoding -mattr=+i8mm       < %s 2>&1 | FileCheck %s
+
+// No interesting edge cases for [US]MMLA, except for the fact that the data
+// types are fixed (no 64-bit version), and USMMLA exists, but SUMMLA does not.
+smmla  v1.2s, v16.8b, v31.8b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+summla v1.4s, v16.16b, v31.16b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: unrecognized instruction mnemonic, did you mean: smmla, ummla, usmmla?
+
+// USDOT (vector) has two valid data type combinations, others are rejected.
+usdot v3.4s, v15.8b, v30.8b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+usdot v3.2s, v15.16b, v30.16b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+// For USDOT and SUDOT (indexed), the index is in range [0,3] (regardless of data types)
+usdot v31.2s, v1.8b,  v2.4b[4]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: vector lane must be an integer in range [0, 3].
+usdot v31.4s, v1.16b, v2.4b[4]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: vector lane must be an integer in range [0, 3].
+sudot v31.2s, v1.8b,  v2.4b[4]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: vector lane must be an integer in range [0, 3].
+sudot v31.4s, v1.16b, v2.4b[4]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: vector lane must be an integer in range [0, 3].
+
+// The arrangement specifiers of the first two operands must match.
+usdot v31.4s, v1.8b,  v2.4b[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+usdot v31.2s, v1.16b, v2.4b[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+sudot v31.4s, v1.8b,  v2.4b[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+sudot v31.2s, v1.16b, v2.4b[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
--- a/test/MC/AArch64/armv8.6a-simd-matmul.s
+++ b/test/MC/AArch64/armv8.6a-simd-matmul.s
@ -0,0 +1,43 @@
+// RUN:     llvm-mc -triple aarch64 -show-encoding -mattr=+i8mm       < %s      | FileCheck %s
+// RUN:     llvm-mc -triple aarch64 -show-encoding -mattr=+v8.6a      < %s      | FileCheck %s
+// RUN: not llvm-mc -triple aarch64 -show-encoding -mattr=+v8.6a-i8mm < %s 2>&1 | FileCheck %s --check-prefix=NOMATMUL
+
+smmla  v1.4s, v16.16b, v31.16b
+ummla  v1.4s, v16.16b, v31.16b
+usmmla v1.4s, v16.16b, v31.16b
+// CHECK: smmla   v1.4s, v16.16b, v31.16b // encoding: [0x01,0xa6,0x9f,0x4e]
+// CHECK: ummla   v1.4s, v16.16b, v31.16b // encoding: [0x01,0xa6,0x9f,0x6e]
+// CHECK: usmmla  v1.4s, v16.16b, v31.16b // encoding: [0x01,0xae,0x9f,0x4e]
+// NOMATMUL: instruction requires: i8mm
+// NOMATMUL-NEXT: smmla  v1.4s, v16.16b, v31.16b
+// NOMATMUL: instruction requires: i8mm
+// NOMATMUL-NEXT: ummla  v1.4s, v16.16b, v31.16b
+// NOMATMUL: instruction requires: i8mm
+// NOMATMUL-NEXT: usmmla  v1.4s, v16.16b, v31.16b
+
+usdot v3.2s, v15.8b, v30.8b
+usdot v3.4s, v15.16b, v30.16b
+// CHECK: usdot   v3.2s, v15.8b, v30.8b   // encoding: [0xe3,0x9d,0x9e,0x0e]
+// CHECK: usdot   v3.4s, v15.16b, v30.16b // encoding: [0xe3,0x9d,0x9e,0x4e]
+// NOMATMUL: instruction requires: i8mm
+// NOMATMUL-NEXT: usdot v3.2s, v15.8b, v30.8b
+// NOMATMUL: instruction requires: i8mm
+// NOMATMUL-NEXT: usdot v3.4s, v15.16b, v30.16b
+
+usdot v31.2s, v1.8b,  v2.4b[3]
+usdot v31.4s, v1.16b, v2.4b[3]
+// CHECK: usdot   v31.2s, v1.8b, v2.4b[3] // encoding: [0x3f,0xf8,0xa2,0x0f]
+// CHECK: usdot   v31.4s, v1.16b, v2.4b[3] // encoding: [0x3f,0xf8,0xa2,0x4f]
+// NOMATMUL: instruction requires: i8mm
+// NOMATMUL-NEXT: usdot   v31.2s, v1.8b, v2.4b[3]
+// NOMATMUL: instruction requires: i8mm
+// NOMATMUL-NEXT: usdot   v31.4s, v1.16b, v2.4b[3]
+
+sudot v31.2s, v1.8b,  v2.4b[3]
+sudot v31.4s, v1.16b, v2.4b[3]
+// CHECK: sudot   v31.2s, v1.8b, v2.4b[3] // encoding: [0x3f,0xf8,0x22,0x0f]
+// CHECK: sudot   v31.4s, v1.16b, v2.4b[3] // encoding: [0x3f,0xf8,0x22,0x4f]
+// NOMATMUL: instruction requires: i8mm
+// NOMATMUL-NEXT: sudot   v31.2s, v1.8b, v2.4b[3]
+// NOMATMUL: instruction requires: i8mm
+// NOMATMUL-NEXT: sudot   v31.4s, v1.16b, v2.4b[3]
--- a/test/MC/Disassembler/AArch64/armv8.6a-simd-matmul.txt
+++ b/test/MC/Disassembler/AArch64/armv8.6a-simd-matmul.txt
@ -0,0 +1,34 @@
+# RUN:     llvm-mc -triple=aarch64  -mattr=+i8mm -disassemble < %s       | FileCheck %s
+# RUN:     llvm-mc -triple=aarch64  -mattr=+v8.6a -disassemble < %s      | FileCheck %s
+# RUN: not llvm-mc -triple=aarch64  -mattr=+v8.5a -disassemble < %s 2>&1 | FileCheck %s --check-prefix=NOI8MM
+
+[0x01,0xa6,0x9f,0x4e]
+[0x01,0xa6,0x9f,0x6e]
+[0x01,0xae,0x9f,0x4e]
+# CHECK: smmla   v1.4s, v16.16b, v31.16b
+# CHECK: ummla   v1.4s, v16.16b, v31.16b
+# CHECK: usmmla  v1.4s, v16.16b, v31.16b
+# NOI8MM: [[@LINE-6]]:{{[0-9]+}}: warning: invalid instruction encoding
+# NOI8MM: [[@LINE-6]]:{{[0-9]+}}: warning: invalid instruction encoding
+# NOI8MM: [[@LINE-6]]:{{[0-9]+}}: warning: invalid instruction encoding
+
+[0xe3,0x9d,0x9e,0x0e]
+[0xe3,0x9d,0x9e,0x4e]
+# CHECK: usdot   v3.2s, v15.8b, v30.8b
+# CHECK: usdot   v3.4s, v15.16b, v30.16b
+# NOI8MM: [[@LINE-4]]:{{[0-9]+}}: warning: invalid instruction encoding
+# NOI8MM: [[@LINE-4]]:{{[0-9]+}}: warning: invalid instruction encoding
+
+[0x3f,0xf8,0xa2,0x0f]
+[0x3f,0xf8,0xa2,0x4f]
+# CHECK: usdot   v31.2s, v1.8b, v2.4b[3]
+# CHECK: usdot   v31.4s, v1.16b, v2.4b[3]
+# NOI8MM: [[@LINE-4]]:{{[0-9]+}}: warning: invalid instruction encoding
+# NOI8MM: [[@LINE-4]]:{{[0-9]+}}: warning: invalid instruction encoding
+
+[0x3f,0xf8,0x22,0x0f]
+[0x3f,0xf8,0x22,0x4f]
+# CHECK: sudot   v31.2s, v1.8b, v2.4b[3]
+# CHECK: sudot   v31.4s, v1.16b, v2.4b[3]
+# NOI8MM: [[@LINE-4]]:{{[0-9]+}}: warning: invalid instruction encoding
+# NOI8MM: [[@LINE-4]]:{{[0-9]+}}: warning: invalid instruction encoding