mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2025-01-31 12:41:49 +01:00
[AArch64] Armv8.6-a Matrix Mult Assembly + Intrinsics
This patch upstreams support for the Armv8.6-a Matrix Multiplication Extension. A summary of the features can be found here: https://community.arm.com/developer/ip-products/processors/b/processors-ip-blog/posts/arm-architecture-developments-armv8-6-a This patch includes: - Assembly support for AArch64 only (no SVE or Arm32) - Intrinsics Support for AArch64 Armv8.6a Matrix Multiplication Instructions (No bfloat16 matrix multiplication) No IR types or C Types are needed for this extension. This is part of a patch series, starting with BFloat16 support and the other components in the armv8.6a extension (in previous patches linked in phabricator) Based on work by: - Luke Geeson - Oliver Stannard - Luke Cheeseman Reviewers: ostannard, t.p.northover, rengolin, kmclaughlin Reviewed By: kmclaughlin Subscribers: kmclaughlin, kristof.beyls, hiraditya, danielkiss, cfe-commits Tags: #clang Differential Revision: https://reviews.llvm.org/D77871
This commit is contained in:
parent
115f590f26
commit
2ce0a5d73d
@ -173,6 +173,11 @@ let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.".
|
||||
: Intrinsic<[llvm_anyvector_ty],
|
||||
[LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<1>],
|
||||
[IntrNoMem]>;
|
||||
|
||||
// Signature shared by the v8.6-A matrix multiply intrinsics.
//   result    : overloaded vector type #0
//   operand 0 : same type as the result (used as the accumulator input by the
//               MMLA instruction patterns)
//   operand 1 : overloaded vector type #1
//   operand 2 : same type as operand 1
// Declared IntrNoMem.
class AdvSIMD_MatMul_Intrinsic
|
||||
: Intrinsic<[llvm_anyvector_ty],
|
||||
[LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<1>],
|
||||
[IntrNoMem]>;
|
||||
}
|
||||
|
||||
// Arithmetic ops
|
||||
@ -449,6 +454,12 @@ let TargetPrefix = "aarch64", IntrProperties = [IntrNoMem] in {
|
||||
def int_aarch64_neon_udot : AdvSIMD_Dot_Intrinsic;
|
||||
def int_aarch64_neon_sdot : AdvSIMD_Dot_Intrinsic;
|
||||
|
||||
// v8.6-A Matrix Multiply Intrinsics
|
||||
def int_aarch64_neon_ummla : AdvSIMD_MatMul_Intrinsic;
|
||||
def int_aarch64_neon_smmla : AdvSIMD_MatMul_Intrinsic;
|
||||
def int_aarch64_neon_usmmla : AdvSIMD_MatMul_Intrinsic;
|
||||
def int_aarch64_neon_usdot : AdvSIMD_Dot_Intrinsic;
|
||||
|
||||
// v8.2-A FP16 Fused Multiply-Add Long
|
||||
def int_aarch64_neon_fmlal : AdvSIMD_FP16FML_Intrinsic;
|
||||
def int_aarch64_neon_fmlsl : AdvSIMD_FP16FML_Intrinsic;
|
||||
|
@ -373,6 +373,15 @@ def FeatureTaggedGlobals : SubtargetFeature<"tagged-globals",
|
||||
def FeatureBF16 : SubtargetFeature<"bf16", "HasBF16",
|
||||
"true", "Enable BFloat16 Extension" >;
|
||||
|
||||
def FeatureMatMulInt8 : SubtargetFeature<"i8mm", "HasMatMulInt8",
|
||||
"true", "Enable Matrix Multiply Int8 Extension">;
|
||||
|
||||
def FeatureMatMulFP32 : SubtargetFeature<"f32mm", "HasMatMulFP32",
|
||||
"true", "Enable Matrix Multiply FP32 Extension", [FeatureSVE]>;
|
||||
|
||||
def FeatureMatMulFP64 : SubtargetFeature<"f64mm", "HasMatMulFP64",
|
||||
"true", "Enable Matrix Multiply FP64 Extension", [FeatureSVE]>;
|
||||
|
||||
def FeatureFineGrainedTraps : SubtargetFeature<"fgt", "HasFineGrainedTraps",
|
||||
"true", "Enable fine grained virtualization traps extension">;
|
||||
|
||||
@ -380,7 +389,6 @@ def FeatureEnhancedCounterVirtualization :
|
||||
SubtargetFeature<"ecv", "HasEnhancedCounterVirtualization",
|
||||
"true", "Enable enhanced counter virtualization extension">;
|
||||
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// Architectures.
|
||||
//
|
||||
@ -413,7 +421,7 @@ def HasV8_6aOps : SubtargetFeature<
|
||||
"v8.6a", "HasV8_6aOps", "true", "Support ARM v8.6a instructions",
|
||||
|
||||
[HasV8_5aOps, FeatureAMVS, FeatureBF16, FeatureFineGrainedTraps,
|
||||
FeatureEnhancedCounterVirtualization]>;
|
||||
FeatureEnhancedCounterVirtualization, FeatureMatMulInt8]>;
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// Register File Description
|
||||
|
@ -5550,11 +5550,11 @@ multiclass SIMDLogicalThreeVectorTied<bit U, bits<2> size,
|
||||
|
||||
// ARMv8.2-A Dot Product Instructions (Vector): These instructions extract
|
||||
// bytes from S-sized elements.
|
||||
class BaseSIMDThreeSameVectorDot<bit Q, bit U, string asm, string kind1,
|
||||
class BaseSIMDThreeSameVectorDot<bit Q, bit U, bit Mixed, string asm, string kind1,
|
||||
string kind2, RegisterOperand RegType,
|
||||
ValueType AccumType, ValueType InputType,
|
||||
SDPatternOperator OpNode> :
|
||||
BaseSIMDThreeSameVectorTied<Q, U, 0b100, 0b10010, RegType, asm, kind1,
|
||||
BaseSIMDThreeSameVectorTied<Q, U, 0b100, {0b1001, Mixed}, RegType, asm, kind1,
|
||||
[(set (AccumType RegType:$dst),
|
||||
(OpNode (AccumType RegType:$Rd),
|
||||
(InputType RegType:$Rn),
|
||||
@ -5562,10 +5562,10 @@ class BaseSIMDThreeSameVectorDot<bit Q, bit U, string asm, string kind1,
|
||||
let AsmString = !strconcat(asm, "{\t$Rd" # kind1 # ", $Rn" # kind2 # ", $Rm" # kind2 # "}");
|
||||
}
|
||||
|
||||
multiclass SIMDThreeSameVectorDot<bit U, string asm, SDPatternOperator OpNode> {
|
||||
def v8i8 : BaseSIMDThreeSameVectorDot<0, U, asm, ".2s", ".8b", V64,
|
||||
multiclass SIMDThreeSameVectorDot<bit U, bit Mixed, string asm, SDPatternOperator OpNode> {
|
||||
def v8i8 : BaseSIMDThreeSameVectorDot<0, U, Mixed, asm, ".2s", ".8b", V64,
|
||||
v2i32, v8i8, OpNode>;
|
||||
def v16i8 : BaseSIMDThreeSameVectorDot<1, U, asm, ".4s", ".16b", V128,
|
||||
def v16i8 : BaseSIMDThreeSameVectorDot<1, U, Mixed, asm, ".4s", ".16b", V128,
|
||||
v4i32, v16i8, OpNode>;
|
||||
}
|
||||
|
||||
@ -7903,13 +7903,26 @@ class BF16ToSinglePrecision<string asm>
|
||||
}
|
||||
} // End of let mayStore = 0, mayLoad = 0, hasSideEffects = 0
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
// Armv8.6 Matrix Multiply Extension
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
// Integer matrix multiply-accumulate, defined for 128-bit registers only:
// a v4i32 accumulator ($Rd in, $dst out) and two v16i8 sources ($Rn, $Rm).
//   B      - appended as the low bit of the 5-bit value {0b1010, B} handed to
//            the base class.
//   U      - forwarded unchanged to the base class.
//   asm    - mnemonic.
//   OpNode - intrinsic / node matched by the selection pattern.
// The instantiations use <B, U> = <0, 0> for smmla, <0, 1> for ummla and
// <1, 0> for usmmla.
class SIMDThreeSameVectorMatMul<bit B, bit U, string asm, SDPatternOperator OpNode>
|
||||
: BaseSIMDThreeSameVectorTied<1, U, 0b100, {0b1010, B}, V128, asm, ".4s",
|
||||
[(set (v4i32 V128:$dst), (OpNode (v4i32 V128:$Rd),
|
||||
(v16i8 V128:$Rn),
|
||||
(v16i8 V128:$Rm)))]> {
|
||||
// The operand arrangements are fixed, so they are written out literally in
// the assembly string instead of being taken from parameters.
let AsmString = asm # "{\t$Rd.4s, $Rn.16b, $Rm.16b}";
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
// ARMv8.2-A Dot Product Instructions (Indexed)
|
||||
class BaseSIMDThreeSameVectorDotIndex<bit Q, bit U, string asm, string dst_kind,
|
||||
string lhs_kind, string rhs_kind,
|
||||
class BaseSIMDThreeSameVectorDotIndex<bit Q, bit U, bit Mixed, bits<2> size, string asm,
|
||||
string dst_kind, string lhs_kind, string rhs_kind,
|
||||
RegisterOperand RegType,
|
||||
ValueType AccumType, ValueType InputType,
|
||||
SDPatternOperator OpNode> :
|
||||
BaseSIMDIndexedTied<Q, U, 0b0, 0b10, 0b1110, RegType, RegType, V128,
|
||||
BaseSIMDIndexedTied<Q, U, 0b0, size, {0b111, Mixed}, RegType, RegType, V128,
|
||||
VectorIndexS, asm, "", dst_kind, lhs_kind, rhs_kind,
|
||||
[(set (AccumType RegType:$dst),
|
||||
(AccumType (OpNode (AccumType RegType:$Rd),
|
||||
@ -7922,11 +7935,11 @@ class BaseSIMDThreeSameVectorDotIndex<bit Q, bit U, string asm, string dst_kind,
|
||||
let Inst{11} = idx{1}; // H
|
||||
}
|
||||
|
||||
multiclass SIMDThreeSameVectorDotIndex<bit U, string asm,
|
||||
multiclass SIMDThreeSameVectorDotIndex<bit U, bit Mixed, bits<2> size, string asm,
|
||||
SDPatternOperator OpNode> {
|
||||
def v8i8 : BaseSIMDThreeSameVectorDotIndex<0, U, asm, ".2s", ".8b", ".4b",
|
||||
def v8i8 : BaseSIMDThreeSameVectorDotIndex<0, U, Mixed, size, asm, ".2s", ".8b", ".4b",
|
||||
V64, v2i32, v8i8, OpNode>;
|
||||
def v16i8 : BaseSIMDThreeSameVectorDotIndex<1, U, asm, ".4s", ".16b", ".4b",
|
||||
def v16i8 : BaseSIMDThreeSameVectorDotIndex<1, U, Mixed, size, asm, ".4s", ".16b", ".4b",
|
||||
V128, v4i32, v16i8, OpNode>;
|
||||
}
|
||||
|
||||
|
@ -146,6 +146,12 @@ def HasTRBE : Predicate<"Subtarget->hasTRBE()">,
|
||||
AssemblerPredicate<(all_of FeatureTRBE), "trbe">;
|
||||
def HasBF16 : Predicate<"Subtarget->hasBF16()">,
|
||||
AssemblerPredicate<(all_of FeatureBF16), "bf16">;
|
||||
def HasMatMulInt8 : Predicate<"Subtarget->hasMatMulInt8()">,
|
||||
AssemblerPredicate<(all_of FeatureMatMulInt8), "i8mm">;
|
||||
def HasMatMulFP32 : Predicate<"Subtarget->hasMatMulFP32()">,
|
||||
AssemblerPredicate<(all_of FeatureMatMulFP32), "f32mm">;
|
||||
def HasMatMulFP64 : Predicate<"Subtarget->hasMatMulFP64()">,
|
||||
AssemblerPredicate<(all_of FeatureMatMulFP64), "f64mm">;
|
||||
def IsLE : Predicate<"Subtarget->isLittleEndian()">;
|
||||
def IsBE : Predicate<"!Subtarget->isLittleEndian()">;
|
||||
def IsWindows : Predicate<"Subtarget->isTargetWindows()">;
|
||||
@ -745,10 +751,10 @@ def TSB : CRmSystemI<barrier_op, 0b010, "tsb", []> {
|
||||
|
||||
// ARMv8.2-A Dot Product
|
||||
let Predicates = [HasDotProd] in {
|
||||
defm SDOT : SIMDThreeSameVectorDot<0, "sdot", int_aarch64_neon_sdot>;
|
||||
defm UDOT : SIMDThreeSameVectorDot<1, "udot", int_aarch64_neon_udot>;
|
||||
defm SDOTlane : SIMDThreeSameVectorDotIndex<0, "sdot", int_aarch64_neon_sdot>;
|
||||
defm UDOTlane : SIMDThreeSameVectorDotIndex<1, "udot", int_aarch64_neon_udot>;
|
||||
defm SDOT : SIMDThreeSameVectorDot<0, 0, "sdot", int_aarch64_neon_sdot>;
|
||||
defm UDOT : SIMDThreeSameVectorDot<1, 0, "udot", int_aarch64_neon_udot>;
|
||||
defm SDOTlane : SIMDThreeSameVectorDotIndex<0, 0, 0b10, "sdot", int_aarch64_neon_sdot>;
|
||||
defm UDOTlane : SIMDThreeSameVectorDotIndex<1, 0, 0b10, "udot", int_aarch64_neon_udot>;
|
||||
}
|
||||
|
||||
// ARMv8.6-A BFloat
|
||||
@ -765,6 +771,40 @@ def BFCVTN2 : SIMD_BFCVTN2;
|
||||
def BFCVT : BF16ToSinglePrecision<"bfcvt">;
|
||||
}
|
||||
|
||||
// ARMv8.6A AArch64 matrix multiplication
|
||||
let Predicates = [HasMatMulInt8] in {
|
||||
def SMMLA : SIMDThreeSameVectorMatMul<0, 0, "smmla", int_aarch64_neon_smmla>;
|
||||
def UMMLA : SIMDThreeSameVectorMatMul<0, 1, "ummla", int_aarch64_neon_ummla>;
|
||||
def USMMLA : SIMDThreeSameVectorMatMul<1, 0, "usmmla", int_aarch64_neon_usmmla>;
|
||||
defm USDOT : SIMDThreeSameVectorDot<0, 1, "usdot", int_aarch64_neon_usdot>;
|
||||
defm USDOTlane : SIMDThreeSameVectorDotIndex<0, 1, 0b10, "usdot", int_aarch64_neon_usdot>;
|
||||
|
||||
// SUDOT (by element). No sudot intrinsic is defined, so this instruction is
// selected from int_aarch64_neon_usdot with its two multiplicands swapped.
|
||||
// The $Rm lane chosen by $idx is broadcast with AArch64duplane32 and
// bitconverted to InputType; that value is the intrinsic's first multiplicand
|
||||
// and $Rn is the second (the reverse of the USDOT lane form).
|
||||
class BaseSIMDSUDOTIndex<bit Q, string dst_kind, string lhs_kind,
|
||||
string rhs_kind, RegisterOperand RegType,
|
||||
ValueType AccumType, ValueType InputType>
|
||||
// Base-class arguments: U = 0, Mixed = 1, size = 0b00. null_frag is passed as
// OpNode because the selection pattern is supplied by the Pattern override
// below.
: BaseSIMDThreeSameVectorDotIndex<Q, 0, 1, 0b00, "sudot", dst_kind,
|
||||
lhs_kind, rhs_kind, RegType, AccumType,
|
||||
InputType, null_frag> {
|
||||
let Pattern = [(set (AccumType RegType:$dst),
|
||||
(AccumType (int_aarch64_neon_usdot (AccumType RegType:$Rd),
|
||||
(InputType (bitconvert (AccumType
|
||||
(AArch64duplane32 (v4i32 V128:$Rm),
|
||||
VectorIndexS:$idx)))),
|
||||
(InputType RegType:$Rn))))];
|
||||
}
|
||||
|
||||
multiclass SIMDSUDOTIndex {
|
||||
def v8i8 : BaseSIMDSUDOTIndex<0, ".2s", ".8b", ".4b", V64, v2i32, v8i8>;
|
||||
def v16i8 : BaseSIMDSUDOTIndex<1, ".4s", ".16b", ".4b", V128, v4i32, v16i8>;
|
||||
}
|
||||
|
||||
defm SUDOTlane : SIMDSUDOTIndex;
|
||||
|
||||
}
|
||||
|
||||
// ARMv8.2-A FP16 Fused Multiply-Add Long
|
||||
let Predicates = [HasNEON, HasFP16FML] in {
|
||||
defm FMLAL : SIMDThreeSameVectorFML<0, 1, 0b001, "fmlal", int_aarch64_neon_fmlal>;
|
||||
|
@ -148,6 +148,9 @@ protected:
|
||||
|
||||
// Armv8.6-A Extensions
|
||||
bool HasBF16 = false;
|
||||
bool HasMatMulInt8 = false;
|
||||
bool HasMatMulFP32 = false;
|
||||
bool HasMatMulFP64 = false;
|
||||
bool HasAMVS = false;
|
||||
bool HasFineGrainedTraps = false;
|
||||
bool HasEnhancedCounterVirtualization = false;
|
||||
@ -417,6 +420,9 @@ public:
|
||||
bool hasSVE2SM4() const { return HasSVE2SM4; }
|
||||
bool hasSVE2SHA3() const { return HasSVE2SHA3; }
|
||||
bool hasSVE2BitPerm() const { return HasSVE2BitPerm; }
|
||||
bool hasMatMulInt8() const { return HasMatMulInt8; }
|
||||
bool hasMatMulFP32() const { return HasMatMulFP32; }
|
||||
bool hasMatMulFP64() const { return HasMatMulFP64; }
|
||||
|
||||
// Armv8.6-A Extensions
|
||||
bool hasBF16() const { return HasBF16; }
|
||||
|
136
test/CodeGen/AArch64/aarch64-matmul.ll
Normal file
136
test/CodeGen/AArch64/aarch64-matmul.ll
Normal file
@ -0,0 +1,136 @@
|
||||
; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon,+i8mm < %s -o -| FileCheck %s
|
||||
|
||||
define <4 x i32> @smmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) {
|
||||
entry:
|
||||
; CHECK-LABEL: smmla.v4i32.v16i8
|
||||
; CHECK: smmla v0.4s, v1.16b, v2.16b
|
||||
%vmmla1.i = tail call <4 x i32> @llvm.aarch64.neon.smmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b)
|
||||
ret <4 x i32> %vmmla1.i
|
||||
}
|
||||
|
||||
define <4 x i32> @ummla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) {
|
||||
entry:
|
||||
; CHECK-LABEL: ummla.v4i32.v16i8
|
||||
; CHECK: ummla v0.4s, v1.16b, v2.16b
|
||||
%vmmla1.i = tail call <4 x i32> @llvm.aarch64.neon.ummla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b)
|
||||
ret <4 x i32> %vmmla1.i
|
||||
}
|
||||
|
||||
define <4 x i32> @usmmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) {
|
||||
entry:
|
||||
; CHECK-LABEL: usmmla.v4i32.v16i8
|
||||
; CHECK: usmmla v0.4s, v1.16b, v2.16b
|
||||
%vusmmla1.i = tail call <4 x i32> @llvm.aarch64.neon.usmmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) #3
|
||||
ret <4 x i32> %vusmmla1.i
|
||||
}
|
||||
|
||||
define <2 x i32> @usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %b) {
|
||||
entry:
|
||||
; CHECK-LABEL: usdot.v2i32.v8i8
|
||||
; CHECK: usdot v0.2s, v1.8b, v2.8b
|
||||
%vusdot1.i = tail call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %b)
|
||||
ret <2 x i32> %vusdot1.i
|
||||
}
|
||||
|
||||
define <2 x i32> @usdot_lane.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %b) {
|
||||
entry:
|
||||
; CHECK-LABEL: usdot_lane.v2i32.v8i8
|
||||
; CHECK: usdot v0.2s, v1.8b, v2.4b[0]
|
||||
%0 = bitcast <8 x i8> %b to <2 x i32>
|
||||
%shuffle = shufflevector <2 x i32> %0, <2 x i32> undef, <2 x i32> zeroinitializer
|
||||
%1 = bitcast <2 x i32> %shuffle to <8 x i8>
|
||||
%vusdot1.i = tail call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %1)
|
||||
ret <2 x i32> %vusdot1.i
|
||||
}
|
||||
|
||||
define <2 x i32> @sudot_lane.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %b) {
|
||||
entry:
|
||||
; CHECK-LABEL: sudot_lane.v2i32.v8i8
|
||||
; CHECK: sudot v0.2s, v1.8b, v2.4b[0]
|
||||
%0 = bitcast <8 x i8> %b to <2 x i32>
|
||||
%shuffle = shufflevector <2 x i32> %0, <2 x i32> undef, <2 x i32> zeroinitializer
|
||||
%1 = bitcast <2 x i32> %shuffle to <8 x i8>
|
||||
%vusdot1.i = tail call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %1, <8 x i8> %a)
|
||||
ret <2 x i32> %vusdot1.i
|
||||
}
|
||||
|
||||
define <2 x i32> @usdot_lane.v2i32.v16i8(<2 x i32> %r, <8 x i8> %a, <16 x i8> %b) {
|
||||
entry:
|
||||
; CHECK-LABEL: usdot_lane.v2i32.v16i8
|
||||
; CHECK: usdot v0.2s, v1.8b, v2.4b[0]
|
||||
%0 = bitcast <16 x i8> %b to <4 x i32>
|
||||
%shuffle = shufflevector <4 x i32> %0, <4 x i32> undef, <2 x i32> zeroinitializer
|
||||
%1 = bitcast <2 x i32> %shuffle to <8 x i8>
|
||||
%vusdot1.i = tail call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %1)
|
||||
ret <2 x i32> %vusdot1.i
|
||||
}
|
||||
|
||||
define <2 x i32> @sudot_lane.v2i32.v16i8(<2 x i32> %r, <8 x i8> %a, <16 x i8> %b) {
|
||||
entry:
|
||||
; CHECK-LABEL: sudot_lane.v2i32.v16i8
|
||||
; CHECK: sudot v0.2s, v1.8b, v2.4b[0]
|
||||
%0 = bitcast <16 x i8> %b to <4 x i32>
|
||||
%shuffle = shufflevector <4 x i32> %0, <4 x i32> undef, <2 x i32> zeroinitializer
|
||||
%1 = bitcast <2 x i32> %shuffle to <8 x i8>
|
||||
%vusdot1.i = tail call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %1, <8 x i8> %a) #3
|
||||
ret <2 x i32> %vusdot1.i
|
||||
}
|
||||
|
||||
define <4 x i32> @usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) {
|
||||
entry:
|
||||
; CHECK-LABEL: usdot.v4i32.v16i8
|
||||
; CHECK: usdot v0.4s, v1.16b, v2.16b
|
||||
%vusdot1.i = tail call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) #3
|
||||
ret <4 x i32> %vusdot1.i
|
||||
}
|
||||
|
||||
define <4 x i32> @usdot_lane.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <8 x i8> %b) {
|
||||
entry:
|
||||
; CHECK-LABEL: usdot_lane.v4i32.v16i8
|
||||
; CHECK: usdot v0.4s, v1.16b, v2.4b[0]
|
||||
%0 = bitcast <8 x i8> %b to <2 x i32>
|
||||
%shuffle = shufflevector <2 x i32> %0, <2 x i32> undef, <4 x i32> zeroinitializer
|
||||
%1 = bitcast <4 x i32> %shuffle to <16 x i8>
|
||||
%vusdot1.i = tail call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %1) #3
|
||||
ret <4 x i32> %vusdot1.i
|
||||
}
|
||||
|
||||
define <4 x i32> @sudot_lane.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <8 x i8> %b) {
|
||||
entry:
|
||||
; CHECK-LABEL: sudot_lane.v4i32.v16i8
|
||||
; CHECK: sudot v0.4s, v1.16b, v2.4b[0]
|
||||
%0 = bitcast <8 x i8> %b to <2 x i32>
|
||||
%shuffle = shufflevector <2 x i32> %0, <2 x i32> undef, <4 x i32> zeroinitializer
|
||||
%1 = bitcast <4 x i32> %shuffle to <16 x i8>
|
||||
%vusdot1.i = tail call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %1, <16 x i8> %a) #3
|
||||
ret <4 x i32> %vusdot1.i
|
||||
}
|
||||
|
||||
define <4 x i32> @usdot_laneq.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) {
|
||||
entry:
|
||||
; CHECK-LABEL: usdot_laneq.v4i32.v16i8
|
||||
; CHECK: usdot v0.4s, v1.16b, v2.4b[0]
|
||||
%0 = bitcast <16 x i8> %b to <4 x i32>
|
||||
%shuffle = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32> zeroinitializer
|
||||
%1 = bitcast <4 x i32> %shuffle to <16 x i8>
|
||||
%vusdot1.i = tail call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %1) #3
|
||||
ret <4 x i32> %vusdot1.i
|
||||
}
|
||||
|
||||
define <4 x i32> @sudot_laneq.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) {
|
||||
entry:
|
||||
; CHECK-LABEL: sudot_laneq.v4i32.v16i8
|
||||
; CHECK: sudot v0.4s, v1.16b, v2.4b[0]
|
||||
%0 = bitcast <16 x i8> %b to <4 x i32>
|
||||
%shuffle = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32> zeroinitializer
|
||||
%1 = bitcast <4 x i32> %shuffle to <16 x i8>
|
||||
%vusdot1.i = tail call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %1, <16 x i8> %a) #3
|
||||
ret <4 x i32> %vusdot1.i
|
||||
}
|
||||
|
||||
declare <4 x i32> @llvm.aarch64.neon.smmla.v4i32.v16i8(<4 x i32>, <16 x i8>, <16 x i8>) #2
|
||||
declare <4 x i32> @llvm.aarch64.neon.ummla.v4i32.v16i8(<4 x i32>, <16 x i8>, <16 x i8>) #2
|
||||
declare <4 x i32> @llvm.aarch64.neon.usmmla.v4i32.v16i8(<4 x i32>, <16 x i8>, <16 x i8>) #2
|
||||
declare <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32>, <8 x i8>, <8 x i8>) #2
|
||||
declare <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32>, <16 x i8>, <16 x i8>) #2
|
||||
|
34
test/MC/AArch64/armv8.6a-simd-matmul-error.s
Normal file
34
test/MC/AArch64/armv8.6a-simd-matmul-error.s
Normal file
@ -0,0 +1,34 @@
|
||||
// RUN: not llvm-mc -triple aarch64 -show-encoding -mattr=+i8mm < %s 2>&1 | FileCheck %s
|
||||
|
||||
// No interesting edge cases for [US]MMLA, except for the fact that the data
|
||||
// types are fixed (no 64-bit version), and USMMLA exists, but SUMMLA does not.
|
||||
smmla v1.2s, v16.8b, v31.8b
|
||||
// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
|
||||
summla v1.4s, v16.16b, v31.16b
|
||||
// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: unrecognized instruction mnemonic, did you mean: smmla, ummla, usmmla?
|
||||
|
||||
// USDOT (vector) has two valid data type combinations, others are rejected.
|
||||
usdot v3.4s, v15.8b, v30.8b
|
||||
// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
|
||||
usdot v3.2s, v15.16b, v30.16b
|
||||
// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
|
||||
|
||||
// For USDOT and SUDOT (indexed), the index is in range [0,3] (regardless of data types)
|
||||
usdot v31.2s, v1.8b, v2.4b[4]
|
||||
// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: vector lane must be an integer in range [0, 3].
|
||||
usdot v31.4s, v1.16b, v2.4b[4]
|
||||
// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: vector lane must be an integer in range [0, 3].
|
||||
sudot v31.2s, v1.8b, v2.4b[4]
|
||||
// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: vector lane must be an integer in range [0, 3].
|
||||
sudot v31.4s, v1.16b, v2.4b[4]
|
||||
// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: vector lane must be an integer in range [0, 3].
|
||||
|
||||
// The arrangement specifiers of the first two operands must match.
|
||||
usdot v31.4s, v1.8b, v2.4b[0]
|
||||
// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
|
||||
usdot v31.2s, v1.16b, v2.4b[0]
|
||||
// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
|
||||
sudot v31.4s, v1.8b, v2.4b[0]
|
||||
// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
|
||||
sudot v31.2s, v1.16b, v2.4b[0]
|
||||
// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
|
43
test/MC/AArch64/armv8.6a-simd-matmul.s
Normal file
43
test/MC/AArch64/armv8.6a-simd-matmul.s
Normal file
@ -0,0 +1,43 @@
|
||||
// RUN: llvm-mc -triple aarch64 -show-encoding -mattr=+i8mm < %s | FileCheck %s
|
||||
// RUN: llvm-mc -triple aarch64 -show-encoding -mattr=+v8.6a < %s | FileCheck %s
|
||||
// RUN: not llvm-mc -triple aarch64 -show-encoding -mattr=+v8.6a-i8mm < %s 2>&1 | FileCheck %s --check-prefix=NOMATMUL
|
||||
|
||||
smmla v1.4s, v16.16b, v31.16b
|
||||
ummla v1.4s, v16.16b, v31.16b
|
||||
usmmla v1.4s, v16.16b, v31.16b
|
||||
// CHECK: smmla v1.4s, v16.16b, v31.16b // encoding: [0x01,0xa6,0x9f,0x4e]
|
||||
// CHECK: ummla v1.4s, v16.16b, v31.16b // encoding: [0x01,0xa6,0x9f,0x6e]
|
||||
// CHECK: usmmla v1.4s, v16.16b, v31.16b // encoding: [0x01,0xae,0x9f,0x4e]
|
||||
// NOMATMUL: instruction requires: i8mm
|
||||
// NOMATMUL-NEXT: smmla v1.4s, v16.16b, v31.16b
|
||||
// NOMATMUL: instruction requires: i8mm
|
||||
// NOMATMUL-NEXT: ummla v1.4s, v16.16b, v31.16b
|
||||
// NOMATMUL: instruction requires: i8mm
|
||||
// NOMATMUL-NEXT: usmmla v1.4s, v16.16b, v31.16b
|
||||
|
||||
usdot v3.2s, v15.8b, v30.8b
|
||||
usdot v3.4s, v15.16b, v30.16b
|
||||
// CHECK: usdot v3.2s, v15.8b, v30.8b // encoding: [0xe3,0x9d,0x9e,0x0e]
|
||||
// CHECK: usdot v3.4s, v15.16b, v30.16b // encoding: [0xe3,0x9d,0x9e,0x4e]
|
||||
// NOMATMUL: instruction requires: i8mm
|
||||
// NOMATMUL-NEXT: usdot v3.2s, v15.8b, v30.8b
|
||||
// NOMATMUL: instruction requires: i8mm
|
||||
// NOMATMUL-NEXT: usdot v3.4s, v15.16b, v30.16b
|
||||
|
||||
usdot v31.2s, v1.8b, v2.4b[3]
|
||||
usdot v31.4s, v1.16b, v2.4b[3]
|
||||
// CHECK: usdot v31.2s, v1.8b, v2.4b[3] // encoding: [0x3f,0xf8,0xa2,0x0f]
|
||||
// CHECK: usdot v31.4s, v1.16b, v2.4b[3] // encoding: [0x3f,0xf8,0xa2,0x4f]
|
||||
// NOMATMUL: instruction requires: i8mm
|
||||
// NOMATMUL-NEXT: usdot v31.2s, v1.8b, v2.4b[3]
|
||||
// NOMATMUL: instruction requires: i8mm
|
||||
// NOMATMUL-NEXT: usdot v31.4s, v1.16b, v2.4b[3]
|
||||
|
||||
sudot v31.2s, v1.8b, v2.4b[3]
|
||||
sudot v31.4s, v1.16b, v2.4b[3]
|
||||
// CHECK: sudot v31.2s, v1.8b, v2.4b[3] // encoding: [0x3f,0xf8,0x22,0x0f]
|
||||
// CHECK: sudot v31.4s, v1.16b, v2.4b[3] // encoding: [0x3f,0xf8,0x22,0x4f]
|
||||
// NOMATMUL: instruction requires: i8mm
|
||||
// NOMATMUL-NEXT: sudot v31.2s, v1.8b, v2.4b[3]
|
||||
// NOMATMUL: instruction requires: i8mm
|
||||
// NOMATMUL-NEXT: sudot v31.4s, v1.16b, v2.4b[3]
|
34
test/MC/Disassembler/AArch64/armv8.6a-simd-matmul.txt
Normal file
34
test/MC/Disassembler/AArch64/armv8.6a-simd-matmul.txt
Normal file
@ -0,0 +1,34 @@
|
||||
# RUN: llvm-mc -triple=aarch64 -mattr=+i8mm -disassemble < %s | FileCheck %s
|
||||
# RUN: llvm-mc -triple=aarch64 -mattr=+v8.6a -disassemble < %s | FileCheck %s
|
||||
# RUN: not llvm-mc -triple=aarch64 -mattr=+v8.5a -disassemble < %s 2>&1 | FileCheck %s --check-prefix=NOI8MM
|
||||
|
||||
[0x01,0xa6,0x9f,0x4e]
|
||||
[0x01,0xa6,0x9f,0x6e]
|
||||
[0x01,0xae,0x9f,0x4e]
|
||||
# CHECK: smmla v1.4s, v16.16b, v31.16b
|
||||
# CHECK: ummla v1.4s, v16.16b, v31.16b
|
||||
# CHECK: usmmla v1.4s, v16.16b, v31.16b
|
||||
# NOI8MM: [[@LINE-6]]:{{[0-9]+}}: warning: invalid instruction encoding
|
||||
# NOI8MM: [[@LINE-6]]:{{[0-9]+}}: warning: invalid instruction encoding
|
||||
# NOI8MM: [[@LINE-6]]:{{[0-9]+}}: warning: invalid instruction encoding
|
||||
|
||||
[0xe3,0x9d,0x9e,0x0e]
|
||||
[0xe3,0x9d,0x9e,0x4e]
|
||||
# CHECK: usdot v3.2s, v15.8b, v30.8b
|
||||
# CHECK: usdot v3.4s, v15.16b, v30.16b
|
||||
# NOI8MM: [[@LINE-4]]:{{[0-9]+}}: warning: invalid instruction encoding
|
||||
# NOI8MM: [[@LINE-4]]:{{[0-9]+}}: warning: invalid instruction encoding
|
||||
|
||||
[0x3f,0xf8,0xa2,0x0f]
|
||||
[0x3f,0xf8,0xa2,0x4f]
|
||||
# CHECK: usdot v31.2s, v1.8b, v2.4b[3]
|
||||
# CHECK: usdot v31.4s, v1.16b, v2.4b[3]
|
||||
# NOI8MM: [[@LINE-4]]:{{[0-9]+}}: warning: invalid instruction encoding
|
||||
# NOI8MM: [[@LINE-4]]:{{[0-9]+}}: warning: invalid instruction encoding
|
||||
|
||||
[0x3f,0xf8,0x22,0x0f]
|
||||
[0x3f,0xf8,0x22,0x4f]
|
||||
# CHECK: sudot v31.2s, v1.8b, v2.4b[3]
|
||||
# CHECK: sudot v31.4s, v1.16b, v2.4b[3]
|
||||
# NOI8MM: [[@LINE-4]]:{{[0-9]+}}: warning: invalid instruction encoding
|
||||
# NOI8MM: [[@LINE-4]]:{{[0-9]+}}: warning: invalid instruction encoding
|
Loading…
x
Reference in New Issue
Block a user