1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2025-01-31 12:41:49 +01:00

[AArch64] Armv8.6-a Matrix Mult Assembly + Intrinsics

This patch upstreams support for the Armv8.6-a Matrix Multiplication
Extension. A summary of the features can be found here:

https://community.arm.com/developer/ip-products/processors/b/processors-ip-blog/posts/arm-architecture-developments-armv8-6-a

This patch includes:

- Assembly support for AArch64 only (no SVE or Arm32)
- Intrinsics Support for AArch64 Armv8.6a Matrix Multiplication Instructions (No bfloat16 matrix multiplication)

No IR types or C Types are needed for this extension.

This is part of a patch series, starting with BFloat16 support and
the other components in the armv8.6a extension (in previous patches
linked in phabricator)

Based on work by:
- Luke Geeson
- Oliver Stannard
- Luke Cheeseman

Reviewers: ostannard, t.p.northover, rengolin, kmclaughlin

Reviewed By: kmclaughlin

Subscribers: kmclaughlin, kristof.beyls, hiraditya, danielkiss,
cfe-commits

Tags: #clang

Differential Revision: https://reviews.llvm.org/D77871
This commit is contained in:
Luke Geeson 2020-04-09 17:21:19 +01:00
parent 115f590f26
commit 2ce0a5d73d
9 changed files with 342 additions and 17 deletions

View File

@ -173,6 +173,11 @@ let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.".
: Intrinsic<[llvm_anyvector_ty],
[LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<1>],
[IntrNoMem]>;
// Signature shared by the v8.6-A matrix multiply intrinsics: the result and
// the first operand use one overloaded vector type, while the second and
// third operands use a second overloaded vector type. Declared IntrNoMem.
class AdvSIMD_MatMul_Intrinsic
: Intrinsic<[llvm_anyvector_ty],
[LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<1>],
[IntrNoMem]>;
}
// Arithmetic ops
@ -449,6 +454,12 @@ let TargetPrefix = "aarch64", IntrProperties = [IntrNoMem] in {
def int_aarch64_neon_udot : AdvSIMD_Dot_Intrinsic;
def int_aarch64_neon_sdot : AdvSIMD_Dot_Intrinsic;
// v8.6-A Matrix Multiply Intrinsics
// ummla/smmla/usmmla use the MatMul signature; usdot reuses the Dot signature
// that udot/sdot are already defined with.
def int_aarch64_neon_ummla : AdvSIMD_MatMul_Intrinsic;
def int_aarch64_neon_smmla : AdvSIMD_MatMul_Intrinsic;
def int_aarch64_neon_usmmla : AdvSIMD_MatMul_Intrinsic;
def int_aarch64_neon_usdot : AdvSIMD_Dot_Intrinsic;
// v8.2-A FP16 Fused Multiply-Add Long
def int_aarch64_neon_fmlal : AdvSIMD_FP16FML_Intrinsic;
def int_aarch64_neon_fmlsl : AdvSIMD_FP16FML_Intrinsic;

View File

@ -373,6 +373,15 @@ def FeatureTaggedGlobals : SubtargetFeature<"tagged-globals",
def FeatureBF16 : SubtargetFeature<"bf16", "HasBF16",
"true", "Enable BFloat16 Extension" >;
// Matrix multiply extension features. "i8mm" has no implied features, while
// "f32mm" and "f64mm" each list FeatureSVE as an implied feature.
def FeatureMatMulInt8 : SubtargetFeature<"i8mm", "HasMatMulInt8",
"true", "Enable Matrix Multiply Int8 Extension">;
def FeatureMatMulFP32 : SubtargetFeature<"f32mm", "HasMatMulFP32",
"true", "Enable Matrix Multiply FP32 Extension", [FeatureSVE]>;
def FeatureMatMulFP64 : SubtargetFeature<"f64mm", "HasMatMulFP64",
"true", "Enable Matrix Multiply FP64 Extension", [FeatureSVE]>;
def FeatureFineGrainedTraps : SubtargetFeature<"fgt", "HasFineGrainedTraps",
"true", "Enable fine grained virtualization traps extension">;
@ -380,7 +389,6 @@ def FeatureEnhancedCounterVirtualization :
SubtargetFeature<"ecv", "HasEnhancedCounterVirtualization",
"true", "Enable enhanced counter virtualization extension">;
//===----------------------------------------------------------------------===//
// Architectures.
//
@ -413,7 +421,7 @@ def HasV8_6aOps : SubtargetFeature<
"v8.6a", "HasV8_6aOps", "true", "Support ARM v8.6a instructions",
[HasV8_5aOps, FeatureAMVS, FeatureBF16, FeatureFineGrainedTraps,
FeatureEnhancedCounterVirtualization]>;
FeatureEnhancedCounterVirtualization, FeatureMatMulInt8]>;
//===----------------------------------------------------------------------===//
// Register File Description

View File

@ -5550,11 +5550,11 @@ multiclass SIMDLogicalThreeVectorTied<bit U, bits<2> size,
// ARMv8.2-A Dot Product Instructions (Vector): These instructions extract
// bytes from S-sized elements.
class BaseSIMDThreeSameVectorDot<bit Q, bit U, string asm, string kind1,
class BaseSIMDThreeSameVectorDot<bit Q, bit U, bit Mixed, string asm, string kind1,
string kind2, RegisterOperand RegType,
ValueType AccumType, ValueType InputType,
SDPatternOperator OpNode> :
BaseSIMDThreeSameVectorTied<Q, U, 0b100, 0b10010, RegType, asm, kind1,
BaseSIMDThreeSameVectorTied<Q, U, 0b100, {0b1001, Mixed}, RegType, asm, kind1,
[(set (AccumType RegType:$dst),
(OpNode (AccumType RegType:$Rd),
(InputType RegType:$Rn),
@ -5562,10 +5562,10 @@ class BaseSIMDThreeSameVectorDot<bit Q, bit U, string asm, string kind1,
let AsmString = !strconcat(asm, "{\t$Rd" # kind1 # ", $Rn" # kind2 # ", $Rm" # kind2 # "}");
}
multiclass SIMDThreeSameVectorDot<bit U, string asm, SDPatternOperator OpNode> {
def v8i8 : BaseSIMDThreeSameVectorDot<0, U, asm, ".2s", ".8b", V64,
multiclass SIMDThreeSameVectorDot<bit U, bit Mixed, string asm, SDPatternOperator OpNode> {
def v8i8 : BaseSIMDThreeSameVectorDot<0, U, Mixed, asm, ".2s", ".8b", V64,
v2i32, v8i8, OpNode>;
def v16i8 : BaseSIMDThreeSameVectorDot<1, U, asm, ".4s", ".16b", V128,
def v16i8 : BaseSIMDThreeSameVectorDot<1, U, Mixed, asm, ".4s", ".16b", V128,
v4i32, v16i8, OpNode>;
}
@ -7903,13 +7903,26 @@ class BF16ToSinglePrecision<string asm>
}
} // End of let mayStore = 0, mayLoad = 0, hasSideEffects = 0
//----------------------------------------------------------------------------
// Armv8.6 Matrix Multiply Extension
//----------------------------------------------------------------------------
// Format for the integer matrix multiply-accumulate instructions. Q is fixed
// to 1 (only a 128-bit form exists), U is forwarded unchanged, and B becomes
// the low bit of the opcode field {0b1010, B}. The selection pattern takes a
// v4i32 accumulator ($Rd) plus two v16i8 sources ($Rn, $Rm) and defines a
// v4i32 $dst; the assembly string hard-codes the .4s/.16b arrangements.
class SIMDThreeSameVectorMatMul<bit B, bit U, string asm, SDPatternOperator OpNode>
: BaseSIMDThreeSameVectorTied<1, U, 0b100, {0b1010, B}, V128, asm, ".4s",
[(set (v4i32 V128:$dst), (OpNode (v4i32 V128:$Rd),
(v16i8 V128:$Rn),
(v16i8 V128:$Rm)))]> {
let AsmString = asm # "{\t$Rd.4s, $Rn.16b, $Rm.16b}";
}
//----------------------------------------------------------------------------
// ARMv8.2-A Dot Product Instructions (Indexed)
class BaseSIMDThreeSameVectorDotIndex<bit Q, bit U, string asm, string dst_kind,
string lhs_kind, string rhs_kind,
class BaseSIMDThreeSameVectorDotIndex<bit Q, bit U, bit Mixed, bits<2> size, string asm,
string dst_kind, string lhs_kind, string rhs_kind,
RegisterOperand RegType,
ValueType AccumType, ValueType InputType,
SDPatternOperator OpNode> :
BaseSIMDIndexedTied<Q, U, 0b0, 0b10, 0b1110, RegType, RegType, V128,
BaseSIMDIndexedTied<Q, U, 0b0, size, {0b111, Mixed}, RegType, RegType, V128,
VectorIndexS, asm, "", dst_kind, lhs_kind, rhs_kind,
[(set (AccumType RegType:$dst),
(AccumType (OpNode (AccumType RegType:$Rd),
@ -7922,11 +7935,11 @@ class BaseSIMDThreeSameVectorDotIndex<bit Q, bit U, string asm, string dst_kind,
let Inst{11} = idx{1}; // H
}
multiclass SIMDThreeSameVectorDotIndex<bit U, string asm,
multiclass SIMDThreeSameVectorDotIndex<bit U, bit Mixed, bits<2> size, string asm,
SDPatternOperator OpNode> {
def v8i8 : BaseSIMDThreeSameVectorDotIndex<0, U, asm, ".2s", ".8b", ".4b",
def v8i8 : BaseSIMDThreeSameVectorDotIndex<0, U, Mixed, size, asm, ".2s", ".8b", ".4b",
V64, v2i32, v8i8, OpNode>;
def v16i8 : BaseSIMDThreeSameVectorDotIndex<1, U, asm, ".4s", ".16b", ".4b",
def v16i8 : BaseSIMDThreeSameVectorDotIndex<1, U, Mixed, size, asm, ".4s", ".16b", ".4b",
V128, v4i32, v16i8, OpNode>;
}

View File

@ -146,6 +146,12 @@ def HasTRBE : Predicate<"Subtarget->hasTRBE()">,
AssemblerPredicate<(all_of FeatureTRBE), "trbe">;
def HasBF16 : Predicate<"Subtarget->hasBF16()">,
AssemblerPredicate<(all_of FeatureBF16), "bf16">;
// Predicates for the matrix multiply extensions. Each pairs a code generation
// check on the subtarget with an assembler predicate on the matching feature;
// the trailing string is the feature name given to AssemblerPredicate.
def HasMatMulInt8 : Predicate<"Subtarget->hasMatMulInt8()">,
AssemblerPredicate<(all_of FeatureMatMulInt8), "i8mm">;
def HasMatMulFP32 : Predicate<"Subtarget->hasMatMulFP32()">,
AssemblerPredicate<(all_of FeatureMatMulFP32), "f32mm">;
def HasMatMulFP64 : Predicate<"Subtarget->hasMatMulFP64()">,
AssemblerPredicate<(all_of FeatureMatMulFP64), "f64mm">;
def IsLE : Predicate<"Subtarget->isLittleEndian()">;
def IsBE : Predicate<"!Subtarget->isLittleEndian()">;
def IsWindows : Predicate<"Subtarget->isTargetWindows()">;
@ -745,10 +751,10 @@ def TSB : CRmSystemI<barrier_op, 0b010, "tsb", []> {
// ARMv8.2-A Dot Product
let Predicates = [HasDotProd] in {
defm SDOT : SIMDThreeSameVectorDot<0, "sdot", int_aarch64_neon_sdot>;
defm UDOT : SIMDThreeSameVectorDot<1, "udot", int_aarch64_neon_udot>;
defm SDOTlane : SIMDThreeSameVectorDotIndex<0, "sdot", int_aarch64_neon_sdot>;
defm UDOTlane : SIMDThreeSameVectorDotIndex<1, "udot", int_aarch64_neon_udot>;
defm SDOT : SIMDThreeSameVectorDot<0, 0, "sdot", int_aarch64_neon_sdot>;
defm UDOT : SIMDThreeSameVectorDot<1, 0, "udot", int_aarch64_neon_udot>;
defm SDOTlane : SIMDThreeSameVectorDotIndex<0, 0, 0b10, "sdot", int_aarch64_neon_sdot>;
defm UDOTlane : SIMDThreeSameVectorDotIndex<1, 0, 0b10, "udot", int_aarch64_neon_udot>;
}
// ARMv8.6-A BFloat
@ -765,6 +771,40 @@ def BFCVTN2 : SIMD_BFCVTN2;
def BFCVT : BF16ToSinglePrecision<"bfcvt">;
}
// ARMv8.6A AArch64 matrix multiplication
// Every definition in this block is gated on the HasMatMulInt8 predicate.
let Predicates = [HasMatMulInt8] in {
// Template arguments of SIMDThreeSameVectorMatMul are <B, U, asm, intrinsic>.
def SMMLA : SIMDThreeSameVectorMatMul<0, 0, "smmla", int_aarch64_neon_smmla>;
def UMMLA : SIMDThreeSameVectorMatMul<0, 1, "ummla", int_aarch64_neon_ummla>;
def USMMLA : SIMDThreeSameVectorMatMul<1, 0, "usmmla", int_aarch64_neon_usmmla>;
// usdot reuses the dot-product formats with U = 0 and the Mixed bit set.
defm USDOT : SIMDThreeSameVectorDot<0, 1, "usdot", int_aarch64_neon_usdot>;
defm USDOTlane : SIMDThreeSameVectorDotIndex<0, 1, 0b10, "usdot", int_aarch64_neon_usdot>;
// sudot lane has a pattern where usdot is expected (there is no sudot).
// The second operand is used in the dup operation to repeat the indexed
// element.
// Compared with USDOTlane the size field is 0b00 instead of 0b10, and
// null_frag is passed as OpNode because Pattern is overridden below: the
// duplicated 32-bit lane of $Rm is the first multiplicand of the usdot
// intrinsic and $Rn is the second.
class BaseSIMDSUDOTIndex<bit Q, string dst_kind, string lhs_kind,
string rhs_kind, RegisterOperand RegType,
ValueType AccumType, ValueType InputType>
: BaseSIMDThreeSameVectorDotIndex<Q, 0, 1, 0b00, "sudot", dst_kind,
lhs_kind, rhs_kind, RegType, AccumType,
InputType, null_frag> {
let Pattern = [(set (AccumType RegType:$dst),
(AccumType (int_aarch64_neon_usdot (AccumType RegType:$Rd),
(InputType (bitconvert (AccumType
(AArch64duplane32 (v4i32 V128:$Rm),
VectorIndexS:$idx)))),
(InputType RegType:$Rn))))];
}
// Instantiates the V64 (.2s/.8b) and V128 (.4s/.16b) by-element forms.
multiclass SIMDSUDOTIndex {
def v8i8 : BaseSIMDSUDOTIndex<0, ".2s", ".8b", ".4b", V64, v2i32, v8i8>;
def v16i8 : BaseSIMDSUDOTIndex<1, ".4s", ".16b", ".4b", V128, v4i32, v16i8>;
}
defm SUDOTlane : SIMDSUDOTIndex;
}
// ARMv8.2-A FP16 Fused Multiply-Add Long
let Predicates = [HasNEON, HasFP16FML] in {
defm FMLAL : SIMDThreeSameVectorFML<0, 1, 0b001, "fmlal", int_aarch64_neon_fmlal>;

View File

@ -148,6 +148,9 @@ protected:
// Armv8.6-A Extensions
bool HasBF16 = false;
// Matrix multiply extension flags; these are the field names that the
// "i8mm", "f32mm" and "f64mm" subtarget features set to true.
bool HasMatMulInt8 = false;
bool HasMatMulFP32 = false;
bool HasMatMulFP64 = false;
bool HasAMVS = false;
bool HasFineGrainedTraps = false;
bool HasEnhancedCounterVirtualization = false;
@ -417,6 +420,9 @@ public:
bool hasSVE2SM4() const { return HasSVE2SM4; }
bool hasSVE2SHA3() const { return HasSVE2SHA3; }
bool hasSVE2BitPerm() const { return HasSVE2BitPerm; }
// Matrix multiply extension queries; the HasMatMulInt8/FP32/FP64 TableGen
// predicates call these on the subtarget.
bool hasMatMulInt8() const { return HasMatMulInt8; }
bool hasMatMulFP32() const { return HasMatMulFP32; }
bool hasMatMulFP64() const { return HasMatMulFP64; }
// Armv8.6-A Extensions
bool hasBF16() const { return HasBF16; }

View File

@ -0,0 +1,136 @@
; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon,+i8mm < %s -o -| FileCheck %s
; The smmla intrinsic with a <4 x i32> accumulator and <16 x i8> sources is
; expected to select a single SMMLA instruction.
define <4 x i32> @smmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) {
entry:
; CHECK-LABEL: smmla.v4i32.v16i8
; CHECK: smmla v0.4s, v1.16b, v2.16b
%vmmla1.i = tail call <4 x i32> @llvm.aarch64.neon.smmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b)
ret <4 x i32> %vmmla1.i
}
; The ummla intrinsic with a <4 x i32> accumulator and <16 x i8> sources is
; expected to select a single UMMLA instruction.
define <4 x i32> @ummla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) {
entry:
; CHECK-LABEL: ummla.v4i32.v16i8
; CHECK: ummla v0.4s, v1.16b, v2.16b
%vmmla1.i = tail call <4 x i32> @llvm.aarch64.neon.ummla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b)
ret <4 x i32> %vmmla1.i
}
; The usmmla intrinsic with a <4 x i32> accumulator and <16 x i8> sources is
; expected to select a single USMMLA instruction.
define <4 x i32> @usmmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) {
entry:
; CHECK-LABEL: usmmla.v4i32.v16i8
; CHECK: usmmla v0.4s, v1.16b, v2.16b
%vusmmla1.i = tail call <4 x i32> @llvm.aarch64.neon.usmmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) #3
ret <4 x i32> %vusmmla1.i
}
; 64-bit vector form: usdot on <2 x i32> / <8 x i8> selects USDOT with the
; .2s/.8b arrangements.
define <2 x i32> @usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %b) {
entry:
; CHECK-LABEL: usdot.v2i32.v8i8
; CHECK: usdot v0.2s, v1.8b, v2.8b
%vusdot1.i = tail call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %b)
ret <2 x i32> %vusdot1.i
}
; %b is reinterpreted as <2 x i32>, lane 0 is splatted, and the splat is used
; as the last usdot operand; this is expected to fold into the by-element
; USDOT form with index 0.
define <2 x i32> @usdot_lane.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %b) {
entry:
; CHECK-LABEL: usdot_lane.v2i32.v8i8
; CHECK: usdot v0.2s, v1.8b, v2.4b[0]
%0 = bitcast <8 x i8> %b to <2 x i32>
%shuffle = shufflevector <2 x i32> %0, <2 x i32> undef, <2 x i32> zeroinitializer
%1 = bitcast <2 x i32> %shuffle to <8 x i8>
%vusdot1.i = tail call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %1)
ret <2 x i32> %vusdot1.i
}
; Same lane-0 splat of %b, but passed as the first multiplicand of usdot with
; %a as the second; this operand order is expected to select by-element SUDOT.
define <2 x i32> @sudot_lane.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %b) {
entry:
; CHECK-LABEL: sudot_lane.v2i32.v8i8
; CHECK: sudot v0.2s, v1.8b, v2.4b[0]
%0 = bitcast <8 x i8> %b to <2 x i32>
%shuffle = shufflevector <2 x i32> %0, <2 x i32> undef, <2 x i32> zeroinitializer
%1 = bitcast <2 x i32> %shuffle to <8 x i8>
%vusdot1.i = tail call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %1, <8 x i8> %a)
ret <2 x i32> %vusdot1.i
}
; 64-bit result with the indexed operand taken from a 128-bit %b: lane 0 of
; %b (viewed as <4 x i32>) is splatted into <2 x i32> and used as the last
; usdot operand, which should select by-element USDOT.
define <2 x i32> @usdot_lane.v2i32.v16i8(<2 x i32> %r, <8 x i8> %a, <16 x i8> %b) {
entry:
; CHECK-LABEL: usdot_lane.v2i32.v16i8
; CHECK: usdot v0.2s, v1.8b, v2.4b[0]
%0 = bitcast <16 x i8> %b to <4 x i32>
%shuffle = shufflevector <4 x i32> %0, <4 x i32> undef, <2 x i32> zeroinitializer
%1 = bitcast <2 x i32> %shuffle to <8 x i8>
%vusdot1.i = tail call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %1)
ret <2 x i32> %vusdot1.i
}
; 64-bit result with the indexed operand taken from a 128-bit %b: the lane-0
; splat is the first multiplicand of usdot and %a the second, which should
; select by-element SUDOT.
define <2 x i32> @sudot_lane.v2i32.v16i8(<2 x i32> %r, <8 x i8> %a, <16 x i8> %b) {
entry:
; CHECK-LABEL: sudot_lane.v2i32.v16i8
; CHECK: sudot v0.2s, v1.8b, v2.4b[0]
%0 = bitcast <16 x i8> %b to <4 x i32>
%shuffle = shufflevector <4 x i32> %0, <4 x i32> undef, <2 x i32> zeroinitializer
%1 = bitcast <2 x i32> %shuffle to <8 x i8>
%vusdot1.i = tail call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %1, <8 x i8> %a) #3
ret <2 x i32> %vusdot1.i
}
; 128-bit vector form: usdot on <4 x i32> / <16 x i8> selects USDOT with the
; .4s/.16b arrangements.
define <4 x i32> @usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) {
entry:
; CHECK-LABEL: usdot.v4i32.v16i8
; CHECK: usdot v0.4s, v1.16b, v2.16b
%vusdot1.i = tail call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) #3
ret <4 x i32> %vusdot1.i
}
; 128-bit result with the indexed operand taken from a 64-bit %b: lane 0 of
; %b (viewed as <2 x i32>) is splatted into <4 x i32> and used as the last
; usdot operand, which should select by-element USDOT.
define <4 x i32> @usdot_lane.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <8 x i8> %b) {
entry:
; CHECK-LABEL: usdot_lane.v4i32.v16i8
; CHECK: usdot v0.4s, v1.16b, v2.4b[0]
%0 = bitcast <8 x i8> %b to <2 x i32>
%shuffle = shufflevector <2 x i32> %0, <2 x i32> undef, <4 x i32> zeroinitializer
%1 = bitcast <4 x i32> %shuffle to <16 x i8>
%vusdot1.i = tail call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %1) #3
ret <4 x i32> %vusdot1.i
}
; 128-bit result with the indexed operand taken from a 64-bit %b: the lane-0
; splat is the first multiplicand of usdot and %a the second, which should
; select by-element SUDOT.
define <4 x i32> @sudot_lane.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <8 x i8> %b) {
entry:
; CHECK-LABEL: sudot_lane.v4i32.v16i8
; CHECK: sudot v0.4s, v1.16b, v2.4b[0]
%0 = bitcast <8 x i8> %b to <2 x i32>
%shuffle = shufflevector <2 x i32> %0, <2 x i32> undef, <4 x i32> zeroinitializer
%1 = bitcast <4 x i32> %shuffle to <16 x i8>
%vusdot1.i = tail call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %1, <16 x i8> %a) #3
ret <4 x i32> %vusdot1.i
}
; "laneq" variant: both the result and the indexed source %b are 128-bit.
; The lane-0 splat of %b is the last usdot operand, which should select
; by-element USDOT.
define <4 x i32> @usdot_laneq.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) {
entry:
; CHECK-LABEL: usdot_laneq.v4i32.v16i8
; CHECK: usdot v0.4s, v1.16b, v2.4b[0]
%0 = bitcast <16 x i8> %b to <4 x i32>
%shuffle = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32> zeroinitializer
%1 = bitcast <4 x i32> %shuffle to <16 x i8>
%vusdot1.i = tail call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %1) #3
ret <4 x i32> %vusdot1.i
}
; "laneq" variant: both the result and the indexed source %b are 128-bit.
; The lane-0 splat of %b is the first multiplicand of usdot and %a the
; second, which should select by-element SUDOT.
define <4 x i32> @sudot_laneq.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) {
entry:
; CHECK-LABEL: sudot_laneq.v4i32.v16i8
; CHECK: sudot v0.4s, v1.16b, v2.4b[0]
%0 = bitcast <16 x i8> %b to <4 x i32>
%shuffle = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32> zeroinitializer
%1 = bitcast <4 x i32> %shuffle to <16 x i8>
%vusdot1.i = tail call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %1, <16 x i8> %a) #3
ret <4 x i32> %vusdot1.i
}
; Declarations of the intrinsics called above. There is no separate sudot
; intrinsic: the sudot tests call usdot with the multiplicands swapped.
; NOTE(review): attribute groups #2 and #3 are referenced but their
; definitions are not visible in this excerpt.
declare <4 x i32> @llvm.aarch64.neon.smmla.v4i32.v16i8(<4 x i32>, <16 x i8>, <16 x i8>) #2
declare <4 x i32> @llvm.aarch64.neon.ummla.v4i32.v16i8(<4 x i32>, <16 x i8>, <16 x i8>) #2
declare <4 x i32> @llvm.aarch64.neon.usmmla.v4i32.v16i8(<4 x i32>, <16 x i8>, <16 x i8>) #2
declare <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32>, <8 x i8>, <8 x i8>) #2
declare <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32>, <16 x i8>, <16 x i8>) #2

View File

@ -0,0 +1,34 @@
// RUN: not llvm-mc -triple aarch64 -show-encoding -mattr=+i8mm < %s 2>&1 | FileCheck %s
// Negative tests: i8mm is enabled, so every diagnostic expected below comes
// from an invalid operand, mnemonic or lane index rather than a missing
// feature.
// No interesting edge cases for [US]MMLA, except for the fact that the data
// types are fixed (no 64-bit version), and USMMLA exists, but SUMMLA does not.
smmla v1.2s, v16.8b, v31.8b
// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
summla v1.4s, v16.16b, v31.16b
// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: unrecognized instruction mnemonic, did you mean: smmla, ummla, usmmla?
// USDOT (vector) has two valid data type combinations, others are rejected.
usdot v3.4s, v15.8b, v30.8b
// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
usdot v3.2s, v15.16b, v30.16b
// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
// For USDOT and SUDOT (indexed), the index is in range [0,3] (regardless of data types)
usdot v31.2s, v1.8b, v2.4b[4]
// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: vector lane must be an integer in range [0, 3].
usdot v31.4s, v1.16b, v2.4b[4]
// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: vector lane must be an integer in range [0, 3].
sudot v31.2s, v1.8b, v2.4b[4]
// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: vector lane must be an integer in range [0, 3].
sudot v31.4s, v1.16b, v2.4b[4]
// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: vector lane must be an integer in range [0, 3].
// The arrangement specifiers of the first two operands must match.
usdot v31.4s, v1.8b, v2.4b[0]
// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
usdot v31.2s, v1.16b, v2.4b[0]
// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
sudot v31.4s, v1.8b, v2.4b[0]
// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
sudot v31.2s, v1.16b, v2.4b[0]
// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction

View File

@ -0,0 +1,43 @@
// RUN: llvm-mc -triple aarch64 -show-encoding -mattr=+i8mm < %s | FileCheck %s
// RUN: llvm-mc -triple aarch64 -show-encoding -mattr=+v8.6a < %s | FileCheck %s
// RUN: not llvm-mc -triple aarch64 -show-encoding -mattr=+v8.6a-i8mm < %s 2>&1 | FileCheck %s --check-prefix=NOMATMUL
// Encoding tests: the instructions must assemble with +i8mm or with +v8.6a
// alone, and must be rejected when i8mm is removed from v8.6a (the third RUN
// line, checked under the NOMATMUL prefix).
// Integer matrix multiply-accumulate: SMMLA, UMMLA, USMMLA.
smmla v1.4s, v16.16b, v31.16b
ummla v1.4s, v16.16b, v31.16b
usmmla v1.4s, v16.16b, v31.16b
// CHECK: smmla v1.4s, v16.16b, v31.16b // encoding: [0x01,0xa6,0x9f,0x4e]
// CHECK: ummla v1.4s, v16.16b, v31.16b // encoding: [0x01,0xa6,0x9f,0x6e]
// CHECK: usmmla v1.4s, v16.16b, v31.16b // encoding: [0x01,0xae,0x9f,0x4e]
// NOMATMUL: instruction requires: i8mm
// NOMATMUL-NEXT: smmla v1.4s, v16.16b, v31.16b
// NOMATMUL: instruction requires: i8mm
// NOMATMUL-NEXT: ummla v1.4s, v16.16b, v31.16b
// NOMATMUL: instruction requires: i8mm
// NOMATMUL-NEXT: usmmla v1.4s, v16.16b, v31.16b
// USDOT (vector), 64-bit and 128-bit arrangements.
usdot v3.2s, v15.8b, v30.8b
usdot v3.4s, v15.16b, v30.16b
// CHECK: usdot v3.2s, v15.8b, v30.8b // encoding: [0xe3,0x9d,0x9e,0x0e]
// CHECK: usdot v3.4s, v15.16b, v30.16b // encoding: [0xe3,0x9d,0x9e,0x4e]
// NOMATMUL: instruction requires: i8mm
// NOMATMUL-NEXT: usdot v3.2s, v15.8b, v30.8b
// NOMATMUL: instruction requires: i8mm
// NOMATMUL-NEXT: usdot v3.4s, v15.16b, v30.16b
// USDOT (by element), using the highest valid lane index.
usdot v31.2s, v1.8b, v2.4b[3]
usdot v31.4s, v1.16b, v2.4b[3]
// CHECK: usdot v31.2s, v1.8b, v2.4b[3] // encoding: [0x3f,0xf8,0xa2,0x0f]
// CHECK: usdot v31.4s, v1.16b, v2.4b[3] // encoding: [0x3f,0xf8,0xa2,0x4f]
// NOMATMUL: instruction requires: i8mm
// NOMATMUL-NEXT: usdot v31.2s, v1.8b, v2.4b[3]
// NOMATMUL: instruction requires: i8mm
// NOMATMUL-NEXT: usdot v31.4s, v1.16b, v2.4b[3]
// SUDOT (by element); encodings differ from USDOT only in the third byte.
sudot v31.2s, v1.8b, v2.4b[3]
sudot v31.4s, v1.16b, v2.4b[3]
// CHECK: sudot v31.2s, v1.8b, v2.4b[3] // encoding: [0x3f,0xf8,0x22,0x0f]
// CHECK: sudot v31.4s, v1.16b, v2.4b[3] // encoding: [0x3f,0xf8,0x22,0x4f]
// NOMATMUL: instruction requires: i8mm
// NOMATMUL-NEXT: sudot v31.2s, v1.8b, v2.4b[3]
// NOMATMUL: instruction requires: i8mm
// NOMATMUL-NEXT: sudot v31.4s, v1.16b, v2.4b[3]

View File

@ -0,0 +1,34 @@
# RUN: llvm-mc -triple=aarch64 -mattr=+i8mm -disassemble < %s | FileCheck %s
# RUN: llvm-mc -triple=aarch64 -mattr=+v8.6a -disassemble < %s | FileCheck %s
# RUN: not llvm-mc -triple=aarch64 -mattr=+v8.5a -disassemble < %s 2>&1 | FileCheck %s --check-prefix=NOI8MM
# Disassembler tests: each byte sequence must decode with +i8mm or +v8.6a and
# must be reported as an invalid encoding with only +v8.5a.
# NOTE(review): the relative LINE offsets in the NOI8MM checks depend on the
# exact line layout of the file; keep that layout intact when editing.
# SMMLA, UMMLA, USMMLA.
[0x01,0xa6,0x9f,0x4e]
[0x01,0xa6,0x9f,0x6e]
[0x01,0xae,0x9f,0x4e]
# CHECK: smmla v1.4s, v16.16b, v31.16b
# CHECK: ummla v1.4s, v16.16b, v31.16b
# CHECK: usmmla v1.4s, v16.16b, v31.16b
# NOI8MM: [[@LINE-6]]:{{[0-9]+}}: warning: invalid instruction encoding
# NOI8MM: [[@LINE-6]]:{{[0-9]+}}: warning: invalid instruction encoding
# NOI8MM: [[@LINE-6]]:{{[0-9]+}}: warning: invalid instruction encoding
# USDOT (vector).
[0xe3,0x9d,0x9e,0x0e]
[0xe3,0x9d,0x9e,0x4e]
# CHECK: usdot v3.2s, v15.8b, v30.8b
# CHECK: usdot v3.4s, v15.16b, v30.16b
# NOI8MM: [[@LINE-4]]:{{[0-9]+}}: warning: invalid instruction encoding
# NOI8MM: [[@LINE-4]]:{{[0-9]+}}: warning: invalid instruction encoding
# USDOT (by element).
[0x3f,0xf8,0xa2,0x0f]
[0x3f,0xf8,0xa2,0x4f]
# CHECK: usdot v31.2s, v1.8b, v2.4b[3]
# CHECK: usdot v31.4s, v1.16b, v2.4b[3]
# NOI8MM: [[@LINE-4]]:{{[0-9]+}}: warning: invalid instruction encoding
# NOI8MM: [[@LINE-4]]:{{[0-9]+}}: warning: invalid instruction encoding
# SUDOT (by element).
[0x3f,0xf8,0x22,0x0f]
[0x3f,0xf8,0x22,0x4f]
# CHECK: sudot v31.2s, v1.8b, v2.4b[3]
# CHECK: sudot v31.4s, v1.16b, v2.4b[3]
# NOI8MM: [[@LINE-4]]:{{[0-9]+}}: warning: invalid instruction encoding
# NOI8MM: [[@LINE-4]]:{{[0-9]+}}: warning: invalid instruction encoding