1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-11-23 11:13:28 +01:00

[AArch64] Add v8.1a "Rounding Double Multiply Add/Subtract" extension

Reviewers: t.p.northover, jmolloy

Subscribers: llvm-commits

Differential Revision: http://reviews.llvm.org/D8502

llvm-svn: 233693
This commit is contained in:
Vladimir Sukharev 2015-03-31 13:15:48 +00:00
parent 272d4887f8
commit 22589e7b79
5 changed files with 960 additions and 0 deletions

View File

@ -5300,6 +5300,27 @@ class BaseSIMDThreeScalar<bit U, bits<2> size, bits<5> opcode,
let Inst{4-0} = Rd;
}
// Encoding base class for a scalar three-register instruction whose result
// register is tied to an input: the "$Rd = $dst" constraint forces the output
// operand to share a register with the $Rd input.
//   U, size, R, opcode - values placed in Inst{29}, Inst{23-22}, Inst{21} and
//                        Inst{15-11} respectively.
//   oops, iops         - output / input operand dags, forwarded to I<>.
//   asm, pattern       - mnemonic and selection patterns, forwarded to I<>.
let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in
class BaseSIMDThreeScalarTied<bit U, bits<2> size, bit R, bits<5> opcode,
dag oops, dag iops, string asm,
list<dag> pattern>
: I<oops, iops, asm, "\t$Rd, $Rn, $Rm", "$Rd = $dst", pattern>,
Sched<[WriteV]> {
// Register operand fields referenced by the encoding below.
bits<5> Rd;
bits<5> Rn;
bits<5> Rm;
// Fixed bits and parameter-driven bits of the 32-bit instruction word.
let Inst{31-30} = 0b01;
let Inst{29} = U;
let Inst{28-24} = 0b11110;
let Inst{23-22} = size;
let Inst{21} = R;
let Inst{20-16} = Rm;
let Inst{15-11} = opcode;
let Inst{10} = 1;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
}
multiclass SIMDThreeScalarD<bit U, bits<5> opc, string asm,
SDPatternOperator OpNode> {
def v1i64 : BaseSIMDThreeScalar<U, 0b11, opc, FPR64, asm,
@ -5327,6 +5348,16 @@ multiclass SIMDThreeScalarHS<bit U, bits<5> opc, string asm,
def v1i16 : BaseSIMDThreeScalar<U, 0b01, opc, FPR16, asm, []>;
}
// Tied scalar three-register instructions in 32-bit and 16-bit sizes.
// Instantiates BaseSIMDThreeScalarTied once for FPR32 (size 0b10) and once for
// FPR16 (size 0b01). Both defs pass an empty pattern list; OpNode is accepted
// (defaulting to null_frag) but is not referenced inside this multiclass.
multiclass SIMDThreeScalarHSTied<bit U, bit R, bits<5> opc, string asm,
SDPatternOperator OpNode = null_frag> {
def v1i32: BaseSIMDThreeScalarTied<U, 0b10, R, opc, (outs FPR32:$dst),
(ins FPR32:$Rd, FPR32:$Rn, FPR32:$Rm),
asm, []>;
def v1i16: BaseSIMDThreeScalarTied<U, 0b01, R, opc, (outs FPR16:$dst),
(ins FPR16:$Rd, FPR16:$Rn, FPR16:$Rm),
asm, []>;
}
multiclass SIMDThreeScalarSD<bit U, bit S, bits<5> opc, string asm,
SDPatternOperator OpNode = null_frag> {
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
@ -8518,6 +8549,174 @@ multiclass SIMDLdSt4SingleAliases<string asm> {
}
} // end of 'let Predicates = [HasNEON]'
//----------------------------------------------------------------------------
// AdvSIMD v8.1 Rounding Double Multiply Add/Subtract
//----------------------------------------------------------------------------
let Predicates = [HasNEON, HasV8_1a] in {
// Thin wrapper over BaseSIMDThreeSameVectorTied: forwards every template
// argument unchanged and additionally pins instruction bit 21 to 0.
class BaseSIMDThreeSameVectorTiedR0<bit Q, bit U, bits<2> size, bits<5> opcode,
RegisterOperand regtype, string asm,
string kind, list<dag> pattern>
: BaseSIMDThreeSameVectorTied<Q, U, size, opcode, regtype, asm, kind,
pattern> {
let Inst{21}=0;
}
// Vector (same-type) forms of the tied accumulate instructions for the
// .4h, .8h, .2s and .4s arrangements.
// Each def selects the DAG  Accum($Rd, int_aarch64_neon_sqrdmulh($Rn, $Rm))
// for its vector type, where Accum is the operator supplied by the caller.
// The first template argument of each def (0 or 1) picks V64 vs. V128 forms;
// the size argument is 0b01 for 16-bit and 0b10 for 32-bit elements.
multiclass SIMDThreeSameVectorSQRDMLxHTiedHS<bit U, bits<5> opc, string asm,
SDPatternOperator Accum> {
def v4i16 : BaseSIMDThreeSameVectorTiedR0<0, U, 0b01, opc, V64, asm, ".4h",
[(set (v4i16 V64:$dst),
(Accum (v4i16 V64:$Rd),
(v4i16 (int_aarch64_neon_sqrdmulh (v4i16 V64:$Rn),
(v4i16 V64:$Rm)))))]>;
def v8i16 : BaseSIMDThreeSameVectorTiedR0<1, U, 0b01, opc, V128, asm, ".8h",
[(set (v8i16 V128:$dst),
(Accum (v8i16 V128:$Rd),
(v8i16 (int_aarch64_neon_sqrdmulh (v8i16 V128:$Rn),
(v8i16 V128:$Rm)))))]>;
def v2i32 : BaseSIMDThreeSameVectorTiedR0<0, U, 0b10, opc, V64, asm, ".2s",
[(set (v2i32 V64:$dst),
(Accum (v2i32 V64:$Rd),
(v2i32 (int_aarch64_neon_sqrdmulh (v2i32 V64:$Rn),
(v2i32 V64:$Rm)))))]>;
def v4i32 : BaseSIMDThreeSameVectorTiedR0<1, U, 0b10, opc, V128, asm, ".4s",
[(set (v4i32 V128:$dst),
(Accum (v4i32 V128:$Rd),
(v4i32 (int_aarch64_neon_sqrdmulh (v4i32 V128:$Rn),
(v4i32 V128:$Rm)))))]>;
}
// By-element (indexed) forms of the tied accumulate instructions.
// Defines four vector variants (.4h, .8h, .2s, .4s), two scalar variants
// (i16, i32) and two anonymous patterns that map an i32 accumulate of an
// extracted lane onto the vector instructions.
// Every selection pattern has the shape
//   Accum($Rd, int_aarch64_neon_sqrdmulh($Rn, <lane $idx of $Rm>))
// with Accum supplied by the caller.
// Lane index encoding: 16-bit element forms use a 3-bit index spread over
// Inst{11}, Inst{21}, Inst{20}; 32-bit element forms use a 2-bit index in
// Inst{11}, Inst{21}.
multiclass SIMDIndexedSQRDMLxHSDTied<bit U, bits<4> opc, string asm,
SDPatternOperator Accum> {
// .4h by element; $Rm is taken from V128_lo.
def v4i16_indexed : BaseSIMDIndexedTied<0, U, 0, 0b01, opc,
V64, V64, V128_lo, VectorIndexH,
asm, ".4h", ".4h", ".4h", ".h",
[(set (v4i16 V64:$dst),
(Accum (v4i16 V64:$Rd),
(v4i16 (int_aarch64_neon_sqrdmulh
(v4i16 V64:$Rn),
(v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm),
VectorIndexH:$idx))))))]> {
bits<3> idx;
let Inst{11} = idx{2};
let Inst{21} = idx{1};
let Inst{20} = idx{0};
}
// .8h by element; $Rm is taken from V128_lo.
def v8i16_indexed : BaseSIMDIndexedTied<1, U, 0, 0b01, opc,
V128, V128, V128_lo, VectorIndexH,
asm, ".8h", ".8h", ".8h", ".h",
[(set (v8i16 V128:$dst),
(Accum (v8i16 V128:$Rd),
(v8i16 (int_aarch64_neon_sqrdmulh
(v8i16 V128:$Rn),
(v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm),
VectorIndexH:$idx))))))]> {
bits<3> idx;
let Inst{11} = idx{2};
let Inst{21} = idx{1};
let Inst{20} = idx{0};
}
// .2s by element.
def v2i32_indexed : BaseSIMDIndexedTied<0, U, 0, 0b10, opc,
V64, V64, V128, VectorIndexS,
asm, ".2s", ".2s", ".2s", ".s",
[(set (v2i32 V64:$dst),
(Accum (v2i32 V64:$Rd),
(v2i32 (int_aarch64_neon_sqrdmulh
(v2i32 V64:$Rn),
(v2i32 (AArch64duplane32 (v4i32 V128:$Rm),
VectorIndexS:$idx))))))]> {
bits<2> idx;
let Inst{11} = idx{1};
let Inst{21} = idx{0};
}
// i32 accumulate of lane 0 of a v2i32 by-element product: the accumulator is
// placed in a v2i32 via INSERT_SUBREG, the v2i32 instruction defined above is
// used, and the ssub subregister of its result is extracted.
// FIXME: it would be nice to use the scalar (v1i32) instruction here, but
// an intermediate EXTRACT_SUBREG would be untyped.
// FIXME: direct EXTRACT_SUBREG from v2i32 to i32 is illegal, that's why we
// got it lowered here as (i32 vector_extract (v4i32 insert_subvector(..)))
def : Pat<(i32 (Accum (i32 FPR32Op:$Rd),
(i32 (vector_extract
(v4i32 (insert_subvector
(undef),
(v2i32 (int_aarch64_neon_sqrdmulh
(v2i32 V64:$Rn),
(v2i32 (AArch64duplane32
(v4i32 V128:$Rm),
VectorIndexS:$idx)))),
(i32 0))),
(i64 0))))),
(EXTRACT_SUBREG
(v2i32 (!cast<Instruction>(NAME # v2i32_indexed)
(v2i32 (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)),
FPR32Op:$Rd,
ssub)),
V64:$Rn,
V128:$Rm,
VectorIndexS:$idx)),
ssub)>;
// .4s by element.
def v4i32_indexed : BaseSIMDIndexedTied<1, U, 0, 0b10, opc,
V128, V128, V128, VectorIndexS,
asm, ".4s", ".4s", ".4s", ".s",
[(set (v4i32 V128:$dst),
(Accum (v4i32 V128:$Rd),
(v4i32 (int_aarch64_neon_sqrdmulh
(v4i32 V128:$Rn),
(v4i32 (AArch64duplane32 (v4i32 V128:$Rm),
VectorIndexS:$idx))))))]> {
bits<2> idx;
let Inst{11} = idx{1};
let Inst{21} = idx{0};
}
// Same i32 mapping as above, for lane 0 of a v4i32 by-element product; uses
// the v4i32 instruction and extracts ssub from its result.
// FIXME: it would be nice to use the scalar (v1i32) instruction here, but
// an intermediate EXTRACT_SUBREG would be untyped.
def : Pat<(i32 (Accum (i32 FPR32Op:$Rd),
(i32 (vector_extract
(v4i32 (int_aarch64_neon_sqrdmulh
(v4i32 V128:$Rn),
(v4i32 (AArch64duplane32
(v4i32 V128:$Rm),
VectorIndexS:$idx)))),
(i64 0))))),
(EXTRACT_SUBREG
(v4i32 (!cast<Instruction>(NAME # v4i32_indexed)
(v4i32 (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)),
FPR32Op:$Rd,
ssub)),
V128:$Rn,
V128:$Rm,
VectorIndexS:$idx)),
ssub)>;
// Scalar 16-bit by element; defined with an empty selection pattern list.
def i16_indexed : BaseSIMDIndexedTied<1, U, 1, 0b01, opc,
FPR16Op, FPR16Op, V128_lo,
VectorIndexH, asm, ".h", "", "", ".h",
[]> {
bits<3> idx;
let Inst{11} = idx{2};
let Inst{21} = idx{1};
let Inst{20} = idx{0};
}
// Scalar 32-bit by element; the multiplier lane is obtained with
// vector_extract from a v4i32 $Rm.
def i32_indexed : BaseSIMDIndexedTied<1, U, 1, 0b10, opc,
FPR32Op, FPR32Op, V128, VectorIndexS,
asm, ".s", "", "", ".s",
[(set (i32 FPR32Op:$dst),
(Accum (i32 FPR32Op:$Rd),
(i32 (int_aarch64_neon_sqrdmulh
(i32 FPR32Op:$Rn),
(i32 (vector_extract (v4i32 V128:$Rm),
VectorIndexS:$idx))))))]> {
bits<2> idx;
let Inst{11} = idx{1};
let Inst{21} = idx{0};
}
}
} // let Predicates = [HasNEON, HasV8_1a]
//----------------------------------------------------------------------------
// Crypto extensions
//----------------------------------------------------------------------------

View File

@ -2778,6 +2778,10 @@ defm UQSUB : SIMDThreeSameVector<1,0b00101,"uqsub", int_aarch64_neon_uqsub>;
defm URHADD : SIMDThreeSameVectorBHS<1,0b00010,"urhadd", int_aarch64_neon_urhadd>;
defm URSHL : SIMDThreeSameVector<1,0b01010,"urshl", int_aarch64_neon_urshl>;
defm USHL : SIMDThreeSameVector<1,0b01000,"ushl", int_aarch64_neon_ushl>;
// v8.1a vector SQRDMLAH / SQRDMLSH. The intrinsic passed last is the Accum
// operator of the multiclass: sqadd for the accumulate form, sqsub for the
// subtract form.
defm SQRDMLAH : SIMDThreeSameVectorSQRDMLxHTiedHS<1,0b10000,"sqrdmlah",
int_aarch64_neon_sqadd>;
defm SQRDMLSH : SIMDThreeSameVectorSQRDMLxHTiedHS<1,0b10001,"sqrdmlsh",
int_aarch64_neon_sqsub>;
defm AND : SIMDLogicalThreeVector<0, 0b00, "and", and>;
defm BIC : SIMDLogicalThreeVector<0, 0b01, "bic",
@ -2994,6 +2998,20 @@ defm UQSHL : SIMDThreeScalarBHSD<1, 0b01001, "uqshl", int_aarch64_neon_uqshl>
defm UQSUB : SIMDThreeScalarBHSD<1, 0b00101, "uqsub", int_aarch64_neon_uqsub>;
defm URSHL : SIMDThreeScalarD< 1, 0b01010, "urshl", int_aarch64_neon_urshl>;
defm USHL : SIMDThreeScalarD< 1, 0b01000, "ushl", int_aarch64_neon_ushl>;
// v8.1a scalar SQRDMLAH / SQRDMLSH plus explicit i32 selection patterns:
// sqadd(Rd, sqrdmulh(Rn, Rm)) -> SQRDMLAHv1i32 and
// sqsub(Rd, sqrdmulh(Rn, Rm)) -> SQRDMLSHv1i32.
// NOTE(review): this block is predicated on HasV8_1a only, whereas the vector
// and indexed definitions of these instructions are placed under
// [HasNEON, HasV8_1a] - confirm whether HasNEON should be required here too.
let Predicates = [HasV8_1a] in {
defm SQRDMLAH : SIMDThreeScalarHSTied<1, 0, 0b10000, "sqrdmlah">;
defm SQRDMLSH : SIMDThreeScalarHSTied<1, 0, 0b10001, "sqrdmlsh">;
def : Pat<(i32 (int_aarch64_neon_sqadd
(i32 FPR32:$Rd),
(i32 (int_aarch64_neon_sqrdmulh (i32 FPR32:$Rn),
(i32 FPR32:$Rm))))),
(SQRDMLAHv1i32 FPR32:$Rd, FPR32:$Rn, FPR32:$Rm)>;
def : Pat<(i32 (int_aarch64_neon_sqsub
(i32 FPR32:$Rd),
(i32 (int_aarch64_neon_sqrdmulh (i32 FPR32:$Rn),
(i32 FPR32:$Rm))))),
(SQRDMLSHv1i32 FPR32:$Rd, FPR32:$Rn, FPR32:$Rm)>;
}
def : InstAlias<"cmls $dst, $src1, $src2",
(CMHSv1i64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>;
@ -4324,6 +4342,10 @@ defm SQDMLAL : SIMDIndexedLongSQDMLXSDTied<0, 0b0011, "sqdmlal",
int_aarch64_neon_sqadd>;
defm SQDMLSL : SIMDIndexedLongSQDMLXSDTied<0, 0b0111, "sqdmlsl",
int_aarch64_neon_sqsub>;
// v8.1a by-element SQRDMLAH / SQRDMLSH. The intrinsic passed last is the
// Accum operator of the multiclass: sqadd for the accumulate form, sqsub for
// the subtract form.
defm SQRDMLAH : SIMDIndexedSQRDMLxHSDTied<1, 0b1101, "sqrdmlah",
int_aarch64_neon_sqadd>;
defm SQRDMLSH : SIMDIndexedSQRDMLxHSDTied<1, 0b1111, "sqrdmlsh",
int_aarch64_neon_sqsub>;
defm SQDMULL : SIMDIndexedLongSD<0, 0b1011, "sqdmull", int_aarch64_neon_sqdmull>;
defm UMLAL : SIMDVectorIndexedLongSDTied<1, 0b0010, "umlal",
TriOpFrag<(add node:$LHS, (int_aarch64_neon_umull node:$MHS, node:$RHS))>>;

View File

@ -0,0 +1,456 @@
; Three configurations are exercised: plain v8a, +v8.1a with generic NEON
; syntax, and +v8.1a with Apple NEON syntax. Every FileCheck invocation also
; enables the bare CHECK prefix so that the per-function label directives in
; this file are honoured instead of being silently ignored.
; RUN: llc < %s -verify-machineinstrs -march=arm64 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-V8a
; RUN: llc < %s -verify-machineinstrs -march=arm64 -mattr=+v8.1a | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-V81a
; RUN: llc < %s -verify-machineinstrs -march=arm64 -mattr=+v8.1a -aarch64-neon-syntax=apple | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-V81a-apple
declare <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16>, <4 x i16>)
declare <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16>, <8 x i16>)
declare <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32>, <2 x i32>)
declare <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32>, <4 x i32>)
declare i32 @llvm.aarch64.neon.sqrdmulh.i32(i32, i32)
declare i16 @llvm.aarch64.neon.sqrdmulh.i16(i16, i16)
declare <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16>, <4 x i16>)
declare <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16>, <8 x i16>)
declare <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32>, <2 x i32>)
declare <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32>, <4 x i32>)
declare i32 @llvm.aarch64.neon.sqadd.i32(i32, i32)
declare i16 @llvm.aarch64.neon.sqadd.i16(i16, i16)
declare <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16>, <4 x i16>)
declare <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16>, <8 x i16>)
declare <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32>, <2 x i32>)
declare <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32>, <4 x i32>)
declare i32 @llvm.aarch64.neon.sqsub.i32(i32, i32)
declare i16 @llvm.aarch64.neon.sqsub.i16(i16, i16)
;-----------------------------------------------------------------------------
; RDMA Vector
; test for SIMDThreeSameVectorSQRDMLxHTiedHS
; sqadd(acc, sqrdmulh(mhs, rhs)) on <4 x i16>: plain v8a is expected to emit
; sqrdmulh, +v8.1a the fused sqrdmlah (generic and Apple syntax).
define <4 x i16> @test_sqrdmlah_v4i16(<4 x i16> %acc, <4 x i16> %mhs, <4 x i16> %rhs) {
; CHECK-LABEL: test_sqrdmlah_v4i16:
%prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %mhs, <4 x i16> %rhs)
%retval = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc, <4 x i16> %prod)
; CHECK-V8a: sqrdmulh v1.4h, v1.4h, v2.4h
; CHECK-V81a: sqrdmlah v0.4h, v1.4h, v2.4h
; CHECK-V81a-apple: sqrdmlah.4h v0, v1, v2
ret <4 x i16> %retval
}
; sqadd(acc, sqrdmulh(mhs, rhs)) on <8 x i16>: plain v8a is expected to emit
; sqrdmulh, +v8.1a the fused sqrdmlah (generic and Apple syntax).
define <8 x i16> @test_sqrdmlah_v8i16(<8 x i16> %acc, <8 x i16> %mhs, <8 x i16> %rhs) {
; CHECK-LABEL: test_sqrdmlah_v8i16:
%prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %mhs, <8 x i16> %rhs)
%retval = call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> %acc, <8 x i16> %prod)
; CHECK-V8a: sqrdmulh v1.8h, v1.8h, v2.8h
; CHECK-V81a: sqrdmlah v0.8h, v1.8h, v2.8h
; CHECK-V81a-apple: sqrdmlah.8h v0, v1, v2
ret <8 x i16> %retval
}
; sqadd(acc, sqrdmulh(mhs, rhs)) on <2 x i32>: plain v8a is expected to emit
; sqrdmulh, +v8.1a the fused sqrdmlah (generic and Apple syntax).
define <2 x i32> @test_sqrdmlah_v2i32(<2 x i32> %acc, <2 x i32> %mhs, <2 x i32> %rhs) {
; CHECK-LABEL: test_sqrdmlah_v2i32:
%prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %mhs, <2 x i32> %rhs)
%retval = call <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32> %acc, <2 x i32> %prod)
; CHECK-V8a: sqrdmulh v1.2s, v1.2s, v2.2s
; CHECK-V81a: sqrdmlah v0.2s, v1.2s, v2.2s
; CHECK-V81a-apple: sqrdmlah.2s v0, v1, v2
ret <2 x i32> %retval
}
; sqadd(acc, sqrdmulh(mhs, rhs)) on <4 x i32>: plain v8a is expected to emit
; sqrdmulh, +v8.1a the fused sqrdmlah (generic and Apple syntax).
define <4 x i32> @test_sqrdmlah_v4i32(<4 x i32> %acc, <4 x i32> %mhs, <4 x i32> %rhs) {
; CHECK-LABEL: test_sqrdmlah_v4i32:
%prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %mhs, <4 x i32> %rhs)
%retval = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %acc, <4 x i32> %prod)
; CHECK-V8a: sqrdmulh v1.4s, v1.4s, v2.4s
; CHECK-V81a: sqrdmlah v0.4s, v1.4s, v2.4s
; CHECK-V81a-apple: sqrdmlah.4s v0, v1, v2
ret <4 x i32> %retval
}
; sqsub(acc, sqrdmulh(mhs, rhs)) on <4 x i16>: plain v8a is expected to emit
; sqrdmulh, +v8.1a the fused sqrdmlsh (generic and Apple syntax).
define <4 x i16> @test_sqrdmlsh_v4i16(<4 x i16> %acc, <4 x i16> %mhs, <4 x i16> %rhs) {
; CHECK-LABEL: test_sqrdmlsh_v4i16:
%prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %mhs, <4 x i16> %rhs)
%retval = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> %acc, <4 x i16> %prod)
; CHECK-V8a: sqrdmulh v1.4h, v1.4h, v2.4h
; CHECK-V81a: sqrdmlsh v0.4h, v1.4h, v2.4h
; CHECK-V81a-apple: sqrdmlsh.4h v0, v1, v2
ret <4 x i16> %retval
}
; sqsub(acc, sqrdmulh(mhs, rhs)) on <8 x i16>: plain v8a is expected to emit
; sqrdmulh, +v8.1a the fused sqrdmlsh (generic and Apple syntax).
define <8 x i16> @test_sqrdmlsh_v8i16(<8 x i16> %acc, <8 x i16> %mhs, <8 x i16> %rhs) {
; CHECK-LABEL: test_sqrdmlsh_v8i16:
%prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %mhs, <8 x i16> %rhs)
%retval = call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> %acc, <8 x i16> %prod)
; CHECK-V8a: sqrdmulh v1.8h, v1.8h, v2.8h
; CHECK-V81a: sqrdmlsh v0.8h, v1.8h, v2.8h
; CHECK-V81a-apple: sqrdmlsh.8h v0, v1, v2
ret <8 x i16> %retval
}
; sqsub(acc, sqrdmulh(mhs, rhs)) on <2 x i32>: plain v8a is expected to emit
; sqrdmulh, +v8.1a the fused sqrdmlsh (generic and Apple syntax).
define <2 x i32> @test_sqrdmlsh_v2i32(<2 x i32> %acc, <2 x i32> %mhs, <2 x i32> %rhs) {
; CHECK-LABEL: test_sqrdmlsh_v2i32:
%prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %mhs, <2 x i32> %rhs)
%retval = call <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32> %acc, <2 x i32> %prod)
; CHECK-V8a: sqrdmulh v1.2s, v1.2s, v2.2s
; CHECK-V81a: sqrdmlsh v0.2s, v1.2s, v2.2s
; CHECK-V81a-apple: sqrdmlsh.2s v0, v1, v2
ret <2 x i32> %retval
}
; sqsub(acc, sqrdmulh(mhs, rhs)) on <4 x i32>: plain v8a is expected to emit
; sqrdmulh, +v8.1a the fused sqrdmlsh (generic and Apple syntax).
define <4 x i32> @test_sqrdmlsh_v4i32(<4 x i32> %acc, <4 x i32> %mhs, <4 x i32> %rhs) {
; CHECK-LABEL: test_sqrdmlsh_v4i32:
%prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %mhs, <4 x i32> %rhs)
%retval = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %acc, <4 x i32> %prod)
; CHECK-V8a: sqrdmulh v1.4s, v1.4s, v2.4s
; CHECK-V81a: sqrdmlsh v0.4s, v1.4s, v2.4s
; CHECK-V81a-apple: sqrdmlsh.4s v0, v1, v2
ret <4 x i32> %retval
}
;-----------------------------------------------------------------------------
; RDMA Vector, by element
; tests for vXiYY_indexed in SIMDIndexedSQRDMLxHSDTied
; By-element accumulate on <4 x i16>: lane 3 of %v is splatted with
; shufflevector, multiplied via sqrdmulh and accumulated via sqadd. Plain v8a
; is expected to emit the indexed sqrdmulh, +v8.1a the indexed sqrdmlah.
define <4 x i16> @test_sqrdmlah_lane_s16(<4 x i16> %acc, <4 x i16> %x, <4 x i16> %v) {
; CHECK-LABEL: test_sqrdmlah_lane_s16:
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
%prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x, <4 x i16> %shuffle)
%retval = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc, <4 x i16> %prod)
; CHECK-V8a: sqrdmulh v1.4h, v1.4h, v2.h[3]
; CHECK-V81a: sqrdmlah v0.4h, v1.4h, v2.h[3]
; CHECK-V81a-apple: sqrdmlah.4h v0, v1, v2[3]
ret <4 x i16> %retval
}
; By-element accumulate on <8 x i16>: lane 2 of %v is splatted, multiplied via
; sqrdmulh and accumulated via sqadd. Plain v8a is expected to emit the indexed
; sqrdmulh, +v8.1a the indexed sqrdmlah.
define <8 x i16> @test_sqrdmlahq_lane_s16(<8 x i16> %acc, <8 x i16> %x, <8 x i16> %v) {
; CHECK-LABEL: test_sqrdmlahq_lane_s16:
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
%prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x, <8 x i16> %shuffle)
%retval = call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> %acc, <8 x i16> %prod)
; CHECK-V8a: sqrdmulh v1.8h, v1.8h, v2.h[2]
; CHECK-V81a: sqrdmlah v0.8h, v1.8h, v2.h[2]
; CHECK-V81a-apple: sqrdmlah.8h v0, v1, v2[2]
ret <8 x i16> %retval
}
; By-element accumulate on <2 x i32>: lane 1 of %v is splatted, multiplied via
; sqrdmulh and accumulated via sqadd. Plain v8a is expected to emit the indexed
; sqrdmulh, +v8.1a the indexed sqrdmlah.
define <2 x i32> @test_sqrdmlah_lane_s32(<2 x i32> %acc, <2 x i32> %x, <2 x i32> %v) {
; CHECK-LABEL: test_sqrdmlah_lane_s32:
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
%prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %x, <2 x i32> %shuffle)
%retval = call <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32> %acc, <2 x i32> %prod)
; CHECK-V8a: sqrdmulh v1.2s, v1.2s, v2.s[1]
; CHECK-V81a: sqrdmlah v0.2s, v1.2s, v2.s[1]
; CHECK-V81a-apple: sqrdmlah.2s v0, v1, v2[1]
ret <2 x i32> %retval
}
; By-element accumulate on <4 x i32>: lane 0 of %v is splatted, multiplied via
; sqrdmulh and accumulated via sqadd. Plain v8a is expected to emit the indexed
; sqrdmulh, +v8.1a the indexed sqrdmlah.
define <4 x i32> @test_sqrdmlahq_lane_s32(<4 x i32> %acc,<4 x i32> %x, <4 x i32> %v) {
; CHECK-LABEL: test_sqrdmlahq_lane_s32:
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
%prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x, <4 x i32> %shuffle)
%retval = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %acc, <4 x i32> %prod)
; CHECK-V8a: sqrdmulh v1.4s, v1.4s, v2.s[0]
; CHECK-V81a: sqrdmlah v0.4s, v1.4s, v2.s[0]
; CHECK-V81a-apple: sqrdmlah.4s v0, v1, v2[0]
ret <4 x i32> %retval
}
; By-element subtract on <4 x i16>: lane 3 of %v is splatted, multiplied via
; sqrdmulh and subtracted via sqsub. Plain v8a is expected to emit the indexed
; sqrdmulh, +v8.1a the indexed sqrdmlsh.
define <4 x i16> @test_sqrdmlsh_lane_s16(<4 x i16> %acc, <4 x i16> %x, <4 x i16> %v) {
; CHECK-LABEL: test_sqrdmlsh_lane_s16:
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
%prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x, <4 x i16> %shuffle)
%retval = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> %acc, <4 x i16> %prod)
; CHECK-V8a: sqrdmulh v1.4h, v1.4h, v2.h[3]
; CHECK-V81a: sqrdmlsh v0.4h, v1.4h, v2.h[3]
; CHECK-V81a-apple: sqrdmlsh.4h v0, v1, v2[3]
ret <4 x i16> %retval
}
; By-element subtract on <8 x i16>: lane 2 of %v is splatted, multiplied via
; sqrdmulh and subtracted via sqsub. Plain v8a is expected to emit the indexed
; sqrdmulh, +v8.1a the indexed sqrdmlsh.
define <8 x i16> @test_sqrdmlshq_lane_s16(<8 x i16> %acc, <8 x i16> %x, <8 x i16> %v) {
; CHECK-LABEL: test_sqrdmlshq_lane_s16:
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
%prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x, <8 x i16> %shuffle)
%retval = call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> %acc, <8 x i16> %prod)
; CHECK-V8a: sqrdmulh v1.8h, v1.8h, v2.h[2]
; CHECK-V81a: sqrdmlsh v0.8h, v1.8h, v2.h[2]
; CHECK-V81a-apple: sqrdmlsh.8h v0, v1, v2[2]
ret <8 x i16> %retval
}
; By-element subtract on <2 x i32>: lane 1 of %v is splatted, multiplied via
; sqrdmulh and subtracted via sqsub. Plain v8a is expected to emit the indexed
; sqrdmulh, +v8.1a the indexed sqrdmlsh.
define <2 x i32> @test_sqrdmlsh_lane_s32(<2 x i32> %acc, <2 x i32> %x, <2 x i32> %v) {
; CHECK-LABEL: test_sqrdmlsh_lane_s32:
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
%prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %x, <2 x i32> %shuffle)
%retval = call <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32> %acc, <2 x i32> %prod)
; CHECK-V8a: sqrdmulh v1.2s, v1.2s, v2.s[1]
; CHECK-V81a: sqrdmlsh v0.2s, v1.2s, v2.s[1]
; CHECK-V81a-apple: sqrdmlsh.2s v0, v1, v2[1]
ret <2 x i32> %retval
}
; By-element subtract on <4 x i32>: lane 0 of %v is splatted, multiplied via
; sqrdmulh and subtracted via sqsub. Plain v8a is expected to emit the indexed
; sqrdmulh, +v8.1a the indexed sqrdmlsh.
define <4 x i32> @test_sqrdmlshq_lane_s32(<4 x i32> %acc,<4 x i32> %x, <4 x i32> %v) {
; CHECK-LABEL: test_sqrdmlshq_lane_s32:
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
%prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x, <4 x i32> %shuffle)
%retval = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %acc, <4 x i32> %prod)
; CHECK-V8a: sqrdmulh v1.4s, v1.4s, v2.s[0]
; CHECK-V81a: sqrdmlsh v0.4s, v1.4s, v2.s[0]
; CHECK-V81a-apple: sqrdmlsh.4s v0, v1, v2[0]
ret <4 x i32> %retval
}
;-----------------------------------------------------------------------------
; RDMA Vector, by element, extracted
; i16 tests are for vXi16_indexed in SIMDIndexedSQRDMLxHSDTied, with IR in ACLE style
; i32 tests are for "def : Pat" in SIMDIndexedSQRDMLxHSDTied
; i16 accumulate of a <4 x i16> by-element product: the scalar accumulator is
; inserted into lane 0 of an undef vector, sqadd is applied to the vectors and
; lane 0 of the result is returned. With +v8.1a the .4h by-element sqrdmlah is
; expected (multiplier lane v1.h[1]); plain v8a is expected to emit sqrdmulh.
define i16 @test_sqrdmlah_extracted_lane_s16(i16 %acc,<4 x i16> %x, <4 x i16> %v) {
; CHECK-LABEL: test_sqrdmlah_extracted_lane_s16:
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 1,i32 1,i32 1,i32 1>
%prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x, <4 x i16> %shuffle)
%acc_vec = insertelement <4 x i16> undef, i16 %acc, i64 0
%retval_vec = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc_vec, <4 x i16> %prod)
%retval = extractelement <4 x i16> %retval_vec, i64 0
; CHECK-V8a: sqrdmulh {{v[0-9]+}}.4h, v0.4h, v1.h[1]
; CHECK-V81a: sqrdmlah {{v[2-9]+}}.4h, v0.4h, v1.h[1]
; CHECK-V81a-apple: sqrdmlah.4h {{v[2-9]+}}, v0, v1[1]
ret i16 %retval
}
; i16 accumulate of an <8 x i16> by-element product: the scalar accumulator is
; inserted into lane 0 of an undef vector, sqadd is applied to the vectors and
; lane 0 of the result is returned. With +v8.1a the .8h by-element sqrdmlah is
; expected (multiplier lane v1.h[1]); plain v8a is expected to emit sqrdmulh.
define i16 @test_sqrdmlahq_extracted_lane_s16(i16 %acc,<8 x i16> %x, <8 x i16> %v) {
; CHECK-LABEL: test_sqrdmlahq_extracted_lane_s16:
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 1,i32 1,i32 1,i32 1, i32 1,i32 1,i32 1,i32 1>
%prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x, <8 x i16> %shuffle)
%acc_vec = insertelement <8 x i16> undef, i16 %acc, i64 0
%retval_vec = call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> %acc_vec, <8 x i16> %prod)
%retval = extractelement <8 x i16> %retval_vec, i64 0
; CHECK-V8a: sqrdmulh {{v[0-9]+}}.8h, v0.8h, v1.h[1]
; CHECK-V81a: sqrdmlah {{v[2-9]+}}.8h, v0.8h, v1.h[1]
; CHECK-V81a-apple: sqrdmlah.8h {{v[2-9]+}}, v0, v1[1]
ret i16 %retval
}
; i32 sqadd of lane 0 extracted from a <2 x i32> by-element sqrdmulh product.
; With +v8.1a the .2s by-element sqrdmlah is expected; plain v8a is expected
; to emit the indexed sqrdmulh.
define i32 @test_sqrdmlah_extracted_lane_s32(i32 %acc,<2 x i32> %x, <2 x i32> %v) {
; CHECK-LABEL: test_sqrdmlah_extracted_lane_s32:
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
%prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %x, <2 x i32> %shuffle)
%extract = extractelement <2 x i32> %prod, i64 0
%retval = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %acc, i32 %extract)
; CHECK-V8a: sqrdmulh v0.2s, v0.2s, v1.s[0]
; CHECK-V81a: sqrdmlah v2.2s, v0.2s, v1.s[0]
; CHECK-V81a-apple: sqrdmlah.2s v2, v0, v1[0]
ret i32 %retval
}
; i32 sqadd of lane 0 extracted from a <4 x i32> by-element sqrdmulh product.
; With +v8.1a the .4s by-element sqrdmlah is expected; plain v8a is expected
; to emit the indexed sqrdmulh.
define i32 @test_sqrdmlahq_extracted_lane_s32(i32 %acc,<4 x i32> %x, <4 x i32> %v) {
; CHECK-LABEL: test_sqrdmlahq_extracted_lane_s32:
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
%prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x, <4 x i32> %shuffle)
%extract = extractelement <4 x i32> %prod, i64 0
%retval = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %acc, i32 %extract)
; CHECK-V8a: sqrdmulh v0.4s, v0.4s, v1.s[0]
; CHECK-V81a: sqrdmlah v2.4s, v0.4s, v1.s[0]
; CHECK-V81a-apple: sqrdmlah.4s v2, v0, v1[0]
ret i32 %retval
}
; i16 subtract of a <4 x i16> by-element product: the scalar accumulator is
; inserted into lane 0 of an undef vector, sqsub is applied to the vectors and
; lane 0 of the result is returned. With +v8.1a the .4h by-element sqrdmlsh is
; expected (multiplier lane v1.h[1]); plain v8a is expected to emit sqrdmulh.
define i16 @test_sqrdmlsh_extracted_lane_s16(i16 %acc,<4 x i16> %x, <4 x i16> %v) {
; CHECK-LABEL: test_sqrdmlsh_extracted_lane_s16:
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 1,i32 1,i32 1,i32 1>
%prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x, <4 x i16> %shuffle)
%acc_vec = insertelement <4 x i16> undef, i16 %acc, i64 0
%retval_vec = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> %acc_vec, <4 x i16> %prod)
%retval = extractelement <4 x i16> %retval_vec, i64 0
; CHECK-V8a: sqrdmulh {{v[0-9]+}}.4h, v0.4h, v1.h[1]
; CHECK-V81a: sqrdmlsh {{v[2-9]+}}.4h, v0.4h, v1.h[1]
; CHECK-V81a-apple: sqrdmlsh.4h {{v[2-9]+}}, v0, v1[1]
ret i16 %retval
}
; i16 subtract of an <8 x i16> by-element product: the scalar accumulator is
; inserted into lane 0 of an undef vector, sqsub is applied to the vectors and
; lane 0 of the result is returned. With +v8.1a the .8h by-element sqrdmlsh is
; expected (multiplier lane v1.h[1]); plain v8a is expected to emit sqrdmulh.
define i16 @test_sqrdmlshq_extracted_lane_s16(i16 %acc,<8 x i16> %x, <8 x i16> %v) {
; CHECK-LABEL: test_sqrdmlshq_extracted_lane_s16:
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 1,i32 1,i32 1,i32 1, i32 1,i32 1,i32 1,i32 1>
%prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x, <8 x i16> %shuffle)
%acc_vec = insertelement <8 x i16> undef, i16 %acc, i64 0
%retval_vec = call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> %acc_vec, <8 x i16> %prod)
%retval = extractelement <8 x i16> %retval_vec, i64 0
; CHECK-V8a: sqrdmulh {{v[0-9]+}}.8h, v0.8h, v1.h[1]
; CHECK-V81a: sqrdmlsh {{v[2-9]+}}.8h, v0.8h, v1.h[1]
; CHECK-V81a-apple: sqrdmlsh.8h {{v[2-9]+}}, v0, v1[1]
ret i16 %retval
}
; i32 sqsub of lane 0 extracted from a <2 x i32> by-element sqrdmulh product.
; With +v8.1a the .2s by-element sqrdmlsh is expected; plain v8a is expected
; to emit the indexed sqrdmulh.
define i32 @test_sqrdmlsh_extracted_lane_s32(i32 %acc,<2 x i32> %x, <2 x i32> %v) {
; CHECK-LABEL: test_sqrdmlsh_extracted_lane_s32:
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
%prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %x, <2 x i32> %shuffle)
%extract = extractelement <2 x i32> %prod, i64 0
%retval = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %acc, i32 %extract)
; CHECK-V8a: sqrdmulh v0.2s, v0.2s, v1.s[0]
; CHECK-V81a: sqrdmlsh v2.2s, v0.2s, v1.s[0]
; CHECK-V81a-apple: sqrdmlsh.2s v2, v0, v1[0]
ret i32 %retval
}
; i32 sqsub of lane 0 extracted from a <4 x i32> by-element sqrdmulh product.
; With +v8.1a the .4s by-element sqrdmlsh is expected; plain v8a is expected
; to emit the indexed sqrdmulh.
define i32 @test_sqrdmlshq_extracted_lane_s32(i32 %acc,<4 x i32> %x, <4 x i32> %v) {
; CHECK-LABEL: test_sqrdmlshq_extracted_lane_s32:
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
%prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x, <4 x i32> %shuffle)
%extract = extractelement <4 x i32> %prod, i64 0
%retval = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %acc, i32 %extract)
; CHECK-V8a: sqrdmulh v0.4s, v0.4s, v1.s[0]
; CHECK-V81a: sqrdmlsh v2.4s, v0.4s, v1.s[0]
; CHECK-V81a-apple: sqrdmlsh.4s v2, v0, v1[0]
ret i32 %retval
}
;-----------------------------------------------------------------------------
; RDMA Scalar
; test for "def : Pat" near SIMDThreeScalarHSTied in AArch64InstrInfo.td
; i16 accumulate expressed through <4 x i16> operations: each scalar is placed
; in lane 0 of an undef vector, sqrdmulh and sqadd run on the vectors, and
; lane 0 is returned. The expectations accept any registers and require the
; .4h sqrdmlah form with +v8.1a, and sqrdmulh for plain v8a.
define i16 @test_sqrdmlah_v1i16(i16 %acc, i16 %x, i16 %y) {
; CHECK-LABEL: test_sqrdmlah_v1i16:
%x_vec = insertelement <4 x i16> undef, i16 %x, i64 0
%y_vec = insertelement <4 x i16> undef, i16 %y, i64 0
%prod_vec = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x_vec, <4 x i16> %y_vec)
%acc_vec = insertelement <4 x i16> undef, i16 %acc, i64 0
%retval_vec = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc_vec, <4 x i16> %prod_vec)
%retval = extractelement <4 x i16> %retval_vec, i64 0
; CHECK-V8a: sqrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
; CHECK-V81a: sqrdmlah {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
; CHECK-V81a-apple: sqrdmlah.4h {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
ret i16 %retval
}
; i32 accumulate expressed through <4 x i32> operations: each scalar is placed
; in lane 0 of an undef vector, sqrdmulh and sqadd run on the vectors, and
; lane 0 is returned. The expectations accept any registers and require the
; .4s sqrdmlah form with +v8.1a, and sqrdmulh for plain v8a.
define i32 @test_sqrdmlah_v1i32(i32 %acc, i32 %x, i32 %y) {
; CHECK-LABEL: test_sqrdmlah_v1i32:
%x_vec = insertelement <4 x i32> undef, i32 %x, i64 0
%y_vec = insertelement <4 x i32> undef, i32 %y, i64 0
%prod_vec = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x_vec, <4 x i32> %y_vec)
%acc_vec = insertelement <4 x i32> undef, i32 %acc, i64 0
%retval_vec = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %acc_vec, <4 x i32> %prod_vec)
%retval = extractelement <4 x i32> %retval_vec, i64 0
; CHECK-V8a: sqrdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
; CHECK-V81a: sqrdmlah {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
; CHECK-V81a-apple: sqrdmlah.4s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
ret i32 %retval
}
; i16 subtract expressed through <4 x i16> operations: each scalar is placed
; in lane 0 of an undef vector, sqrdmulh and sqsub run on the vectors, and
; lane 0 is returned. The expectations accept any registers and require the
; .4h sqrdmlsh form with +v8.1a, and sqrdmulh for plain v8a.
define i16 @test_sqrdmlsh_v1i16(i16 %acc, i16 %x, i16 %y) {
; CHECK-LABEL: test_sqrdmlsh_v1i16:
%x_vec = insertelement <4 x i16> undef, i16 %x, i64 0
%y_vec = insertelement <4 x i16> undef, i16 %y, i64 0
%prod_vec = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x_vec, <4 x i16> %y_vec)
%acc_vec = insertelement <4 x i16> undef, i16 %acc, i64 0
%retval_vec = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> %acc_vec, <4 x i16> %prod_vec)
%retval = extractelement <4 x i16> %retval_vec, i64 0
; CHECK-V8a: sqrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
; CHECK-V81a: sqrdmlsh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
; CHECK-V81a-apple: sqrdmlsh.4h {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
ret i16 %retval
}
; i32 subtract expressed through <4 x i32> operations: each scalar is placed
; in lane 0 of an undef vector, sqrdmulh and sqsub run on the vectors, and
; lane 0 is returned. The expectations accept any registers and require the
; .4s sqrdmlsh form with +v8.1a, and sqrdmulh for plain v8a.
define i32 @test_sqrdmlsh_v1i32(i32 %acc, i32 %x, i32 %y) {
; CHECK-LABEL: test_sqrdmlsh_v1i32:
%x_vec = insertelement <4 x i32> undef, i32 %x, i64 0
%y_vec = insertelement <4 x i32> undef, i32 %y, i64 0
%prod_vec = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x_vec, <4 x i32> %y_vec)
%acc_vec = insertelement <4 x i32> undef, i32 %acc, i64 0
%retval_vec = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %acc_vec, <4 x i32> %prod_vec)
%retval = extractelement <4 x i32> %retval_vec, i64 0
; CHECK-V8a: sqrdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
; CHECK-V81a: sqrdmlsh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
; CHECK-V81a-apple: sqrdmlsh.4s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
ret i32 %retval
}
; Pure scalar i32 form: sqadd.i32(acc, sqrdmulh.i32(mhs, rhs)). With +v8.1a
; the scalar (s-register) sqrdmlah is expected; plain v8a is expected to emit
; the scalar sqrdmulh.
define i32 @test_sqrdmlah_i32(i32 %acc, i32 %mhs, i32 %rhs) {
; CHECK-LABEL: test_sqrdmlah_i32:
%prod = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %mhs, i32 %rhs)
%retval = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %acc, i32 %prod)
; CHECK-V8a: sqrdmulh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
; CHECK-V81a: sqrdmlah {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
; CHECK-V81a-apple: sqrdmlah {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
ret i32 %retval
}
; Pure scalar i32 form: sqsub.i32(acc, sqrdmulh.i32(mhs, rhs)). With +v8.1a
; the scalar (s-register) sqrdmlsh is expected; plain v8a is expected to emit
; the scalar sqrdmulh.
define i32 @test_sqrdmlsh_i32(i32 %acc, i32 %mhs, i32 %rhs) {
; CHECK-LABEL: test_sqrdmlsh_i32:
%prod = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %mhs, i32 %rhs)
%retval = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %acc, i32 %prod)
; CHECK-V8a: sqrdmulh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
; CHECK-V81a: sqrdmlsh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
; CHECK-V81a-apple: sqrdmlsh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
ret i32 %retval
}
;-----------------------------------------------------------------------------
; RDMA Scalar, by element
; i16 tests are performed via tests in above chapter, with IR in ACLE style
; i32 tests are for i32_indexed in SIMDIndexedSQRDMLxHSDTied
; i16 by-element accumulate expressed through <4 x i16> operations: lane 1 of
; %y_vec is splatted, %x and %acc are each placed in lane 0 of an undef
; vector, and lane 0 of the sqadd result is returned. With +v8.1a the .4h
; by-element sqrdmlah with multiplier v0.h[1] is expected; plain v8a is
; expected to emit the indexed sqrdmulh.
define i16 @test_sqrdmlah_extract_i16(i16 %acc, i16 %x, <4 x i16> %y_vec) {
; CHECK-LABEL: test_sqrdmlah_extract_i16:
%shuffle = shufflevector <4 x i16> %y_vec, <4 x i16> undef, <4 x i32> <i32 1,i32 1,i32 1,i32 1>
%x_vec = insertelement <4 x i16> undef, i16 %x, i64 0
%prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x_vec, <4 x i16> %shuffle)
%acc_vec = insertelement <4 x i16> undef, i16 %acc, i64 0
%retval_vec = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc_vec, <4 x i16> %prod)
%retval = extractelement <4 x i16> %retval_vec, i32 0
; CHECK-V8a: sqrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, v0.h[1]
; CHECK-V81a: sqrdmlah {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, v0.h[1]
; CHECK-V81a-apple: sqrdmlah.4h {{v[0-9]+}}, {{v[0-9]+}}, v0[1]
ret i16 %retval
}
; Scalar i32 by-element accumulate: lane 3 is extracted from %rhs, multiplied
; with the scalar sqrdmulh and accumulated with the scalar sqadd. With +v8.1a
; the scalar by-element sqrdmlah with multiplier v0.s[3] is expected; plain
; v8a is expected to emit the scalar by-element sqrdmulh.
define i32 @test_sqrdmlah_extract_i32(i32 %acc, i32 %mhs, <4 x i32> %rhs) {
; CHECK-LABEL: test_sqrdmlah_extract_i32:
%extract = extractelement <4 x i32> %rhs, i32 3
%prod = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %mhs, i32 %extract)
%retval = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %acc, i32 %prod)
; CHECK-V8a: sqrdmulh {{s[0-9]+}}, {{s[0-9]+}}, v0.s[3]
; CHECK-V81a: sqrdmlah {{s[0-9]+}}, {{s[0-9]+}}, v0.s[3]
; CHECK-V81a-apple: sqrdmlah.s {{s[0-9]+}}, {{s[0-9]+}}, v0[3]
ret i32 %retval
}
; i16 by-element subtract expressed through <8 x i16> operations: lane 1 of
; %y_vec is splatted, %x and %acc are each placed in lane 0 of an undef
; vector, and lane 0 of the sqsub result is returned. With +v8.1a the .8h
; by-element sqrdmlsh with multiplier v0.h[1] is expected; plain v8a is
; expected to emit the indexed sqrdmulh.
define i16 @test_sqrdmlshq_extract_i16(i16 %acc, i16 %x, <8 x i16> %y_vec) {
; CHECK-LABEL: test_sqrdmlshq_extract_i16:
%shuffle = shufflevector <8 x i16> %y_vec, <8 x i16> undef, <8 x i32> <i32 1,i32 1,i32 1,i32 1,i32 1,i32 1,i32 1,i32 1>
%x_vec = insertelement <8 x i16> undef, i16 %x, i64 0
%prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x_vec, <8 x i16> %shuffle)
%acc_vec = insertelement <8 x i16> undef, i16 %acc, i64 0
%retval_vec = call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> %acc_vec, <8 x i16> %prod)
%retval = extractelement <8 x i16> %retval_vec, i32 0
; CHECK-V8a: sqrdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, v0.h[1]
; CHECK-V81a: sqrdmlsh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, v0.h[1]
; CHECK-V81a-apple: sqrdmlsh.8h {{v[0-9]+}}, {{v[0-9]+}}, v0[1]
ret i16 %retval
}
; Scalar i32 by-element subtract: lane 3 is extracted from %rhs, multiplied
; with the scalar sqrdmulh and subtracted with the scalar sqsub. With +v8.1a
; the scalar by-element sqrdmlsh with multiplier v0.s[3] is expected; plain
; v8a is expected to emit the scalar by-element sqrdmulh.
define i32 @test_sqrdmlsh_extract_i32(i32 %acc, i32 %mhs, <4 x i32> %rhs) {
; CHECK-LABEL: test_sqrdmlsh_extract_i32:
%extract = extractelement <4 x i32> %rhs, i32 3
%prod = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %mhs, i32 %extract)
%retval = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %acc, i32 %prod)
; CHECK-V8a: sqrdmulh {{s[0-9]+}}, {{s[0-9]+}}, v0.s[3]
; CHECK-V81a: sqrdmlsh {{s[0-9]+}}, {{s[0-9]+}}, v0.s[3]
; CHECK-V81a-apple: sqrdmlsh.s {{s[0-9]+}}, {{s[0-9]+}}, v0[3]
ret i32 %retval
}

View File

@ -0,0 +1,154 @@
// RUN: not llvm-mc -triple aarch64-none-linux-gnu -mattr=+v8.1a -show-encoding < %s 2> %t | FileCheck %s
// RUN: FileCheck --check-prefix=CHECK-ERROR < %t %s
.text
//AdvSIMD RDMA vector
// Accepted three-register vector forms, one sqrdmlah/sqrdmlsh pair for each of
// the .4h, .2s, .4s and .8h arrangements. Within a pair the expected encodings
// differ only in the second byte (0x84 for sqrdmlah, 0x8c for sqrdmlsh).
sqrdmlah v0.4h, v1.4h, v2.4h
sqrdmlsh v0.4h, v1.4h, v2.4h
sqrdmlah v0.2s, v1.2s, v2.2s
sqrdmlsh v0.2s, v1.2s, v2.2s
sqrdmlah v0.4s, v1.4s, v2.4s
sqrdmlsh v0.4s, v1.4s, v2.4s
sqrdmlah v0.8h, v1.8h, v2.8h
sqrdmlsh v0.8h, v1.8h, v2.8h
// CHECK: sqrdmlah v0.4h, v1.4h, v2.4h // encoding: [0x20,0x84,0x42,0x2e]
// CHECK: sqrdmlsh v0.4h, v1.4h, v2.4h // encoding: [0x20,0x8c,0x42,0x2e]
// CHECK: sqrdmlah v0.2s, v1.2s, v2.2s // encoding: [0x20,0x84,0x82,0x2e]
// CHECK: sqrdmlsh v0.2s, v1.2s, v2.2s // encoding: [0x20,0x8c,0x82,0x2e]
// CHECK: sqrdmlah v0.4s, v1.4s, v2.4s // encoding: [0x20,0x84,0x82,0x6e]
// CHECK: sqrdmlsh v0.4s, v1.4s, v2.4s // encoding: [0x20,0x8c,0x82,0x6e]
// CHECK: sqrdmlah v0.8h, v1.8h, v2.8h // encoding: [0x20,0x84,0x42,0x6e]
// CHECK: sqrdmlsh v0.8h, v1.8h, v2.8h // encoding: [0x20,0x8c,0x42,0x6e]
// Rejected forms. The .2h and .8s lines are each expected to produce four
// diagnostics (three "invalid vector kind qualifier" followed by one "invalid
// operand for instruction"); the last two lines mix different arrangements
// across operands and are each expected to produce a single "invalid operand
// for instruction" diagnostic.
sqrdmlah v0.2h, v1.2h, v2.2h
sqrdmlsh v0.2h, v1.2h, v2.2h
sqrdmlah v0.8s, v1.8s, v2.8s
sqrdmlsh v0.8s, v1.8s, v2.8s
sqrdmlah v0.2s, v1.4h, v2.8h
sqrdmlsh v0.4s, v1.8h, v2.2s
// CHECK-ERROR: error: invalid vector kind qualifier
// CHECK-ERROR: sqrdmlah v0.2h, v1.2h, v2.2h
// CHECK-ERROR: ^
// CHECK-ERROR: error: invalid vector kind qualifier
// CHECK-ERROR: sqrdmlah v0.2h, v1.2h, v2.2h
// CHECK-ERROR: ^
// CHECK-ERROR: error: invalid vector kind qualifier
// CHECK-ERROR: sqrdmlah v0.2h, v1.2h, v2.2h
// CHECK-ERROR: ^
// CHECK-ERROR: error: invalid operand for instruction
// CHECK-ERROR: sqrdmlah v0.2h, v1.2h, v2.2h
// CHECK-ERROR: ^
// CHECK-ERROR: error: invalid vector kind qualifier
// CHECK-ERROR: sqrdmlsh v0.2h, v1.2h, v2.2h
// CHECK-ERROR: ^
// CHECK-ERROR: error: invalid vector kind qualifier
// CHECK-ERROR: sqrdmlsh v0.2h, v1.2h, v2.2h
// CHECK-ERROR: ^
// CHECK-ERROR: error: invalid vector kind qualifier
// CHECK-ERROR: sqrdmlsh v0.2h, v1.2h, v2.2h
// CHECK-ERROR: ^
// CHECK-ERROR: error: invalid operand for instruction
// CHECK-ERROR: sqrdmlsh v0.2h, v1.2h, v2.2h
// CHECK-ERROR: ^
// CHECK-ERROR: error: invalid vector kind qualifier
// CHECK-ERROR: sqrdmlah v0.8s, v1.8s, v2.8s
// CHECK-ERROR: ^
// CHECK-ERROR: error: invalid vector kind qualifier
// CHECK-ERROR: sqrdmlah v0.8s, v1.8s, v2.8s
// CHECK-ERROR: ^
// CHECK-ERROR: error: invalid vector kind qualifier
// CHECK-ERROR: sqrdmlah v0.8s, v1.8s, v2.8s
// CHECK-ERROR: ^
// CHECK-ERROR: error: invalid operand for instruction
// CHECK-ERROR: sqrdmlah v0.8s, v1.8s, v2.8s
// CHECK-ERROR: ^
// CHECK-ERROR: error: invalid vector kind qualifier
// CHECK-ERROR: sqrdmlsh v0.8s, v1.8s, v2.8s
// CHECK-ERROR: ^
// CHECK-ERROR: error: invalid vector kind qualifier
// CHECK-ERROR: sqrdmlsh v0.8s, v1.8s, v2.8s
// CHECK-ERROR: ^
// CHECK-ERROR: error: invalid vector kind qualifier
// CHECK-ERROR: sqrdmlsh v0.8s, v1.8s, v2.8s
// CHECK-ERROR: ^
// CHECK-ERROR: error: invalid operand for instruction
// CHECK-ERROR: sqrdmlsh v0.8s, v1.8s, v2.8s
// CHECK-ERROR: ^
// CHECK-ERROR: error: invalid operand for instruction
// CHECK-ERROR: sqrdmlah v0.2s, v1.4h, v2.8h
// CHECK-ERROR: ^
// CHECK-ERROR: error: invalid operand for instruction
// CHECK-ERROR: sqrdmlsh v0.4s, v1.8h, v2.2s
// CHECK-ERROR: ^
//AdvSIMD RDMA scalar
// Accepted scalar three-register forms on h and s registers. As in the vector
// section, sqrdmlah and sqrdmlsh differ only in the second encoding byte
// (0x84 vs 0x8c); the final byte is 0x7e for every scalar form listed here.
sqrdmlah h0, h1, h2
sqrdmlsh h0, h1, h2
sqrdmlah s0, s1, s2
sqrdmlsh s0, s1, s2
// CHECK: sqrdmlah h0, h1, h2 // encoding: [0x20,0x84,0x42,0x7e]
// CHECK: sqrdmlsh h0, h1, h2 // encoding: [0x20,0x8c,0x42,0x7e]
// CHECK: sqrdmlah s0, s1, s2 // encoding: [0x20,0x84,0x82,0x7e]
// CHECK: sqrdmlsh s0, s1, s2 // encoding: [0x20,0x8c,0x82,0x7e]
//AdvSIMD RDMA vector by-element
// Accepted forms where the last operand is a single indexed lane of v2.
// Covers the .4h/.8h arrangements with an .h lane and the .2s/.4s arrangements
// with an .s lane. In the expected encodings the second byte starts with 0xd
// for sqrdmlah and 0xf for sqrdmlsh.
sqrdmlah v0.4h, v1.4h, v2.h[3]
sqrdmlsh v0.4h, v1.4h, v2.h[3]
sqrdmlah v0.2s, v1.2s, v2.s[1]
sqrdmlsh v0.2s, v1.2s, v2.s[1]
sqrdmlah v0.8h, v1.8h, v2.h[3]
sqrdmlsh v0.8h, v1.8h, v2.h[3]
sqrdmlah v0.4s, v1.4s, v2.s[3]
sqrdmlsh v0.4s, v1.4s, v2.s[3]
// CHECK: sqrdmlah v0.4h, v1.4h, v2.h[3] // encoding: [0x20,0xd0,0x72,0x2f]
// CHECK: sqrdmlsh v0.4h, v1.4h, v2.h[3] // encoding: [0x20,0xf0,0x72,0x2f]
// CHECK: sqrdmlah v0.2s, v1.2s, v2.s[1] // encoding: [0x20,0xd0,0xa2,0x2f]
// CHECK: sqrdmlsh v0.2s, v1.2s, v2.s[1] // encoding: [0x20,0xf0,0xa2,0x2f]
// CHECK: sqrdmlah v0.8h, v1.8h, v2.h[3] // encoding: [0x20,0xd0,0x72,0x6f]
// CHECK: sqrdmlsh v0.8h, v1.8h, v2.h[3] // encoding: [0x20,0xf0,0x72,0x6f]
// CHECK: sqrdmlah v0.4s, v1.4s, v2.s[3] // encoding: [0x20,0xd8,0xa2,0x6f]
// CHECK: sqrdmlsh v0.4s, v1.4s, v2.s[3] // encoding: [0x20,0xf8,0xa2,0x6f]
// Rejected forms: destination and first source with different arrangements
// (first two lines), an .s lane used with .8h vectors (third line), and an .h
// lane index of 8, which the expected diagnostic reports as outside [0, 7].
sqrdmlah v0.4s, v1.2s, v2.s[1]
sqrdmlsh v0.2s, v1.2d, v2.s[1]
sqrdmlah v0.8h, v1.8h, v2.s[3]
sqrdmlsh v0.8h, v1.8h, v2.h[8]
// CHECK-ERROR: error: invalid operand for instruction
// CHECK-ERROR: sqrdmlah v0.4s, v1.2s, v2.s[1]
// CHECK-ERROR: ^
// CHECK-ERROR: error: invalid operand for instruction
// CHECK-ERROR: sqrdmlsh v0.2s, v1.2d, v2.s[1]
// CHECK-ERROR: ^
// CHECK-ERROR: error: invalid operand for instruction
// CHECK-ERROR: sqrdmlah v0.8h, v1.8h, v2.s[3]
// CHECK-ERROR: ^
// CHECK-ERROR: error: vector lane must be an integer in range [0, 7].
// CHECK-ERROR: sqrdmlsh v0.8h, v1.8h, v2.h[8]
// CHECK-ERROR: ^
//AdvSIMD RDMA scalar by-element
// Accepted forms with scalar h/s destination and first source and an indexed
// lane of v2 as the last operand. The expected encodings end in 0x7f.
sqrdmlah h0, h1, v2.h[3]
sqrdmlsh h0, h1, v2.h[3]
sqrdmlah s0, s1, v2.s[3]
sqrdmlsh s0, s1, v2.s[3]
// CHECK: sqrdmlah h0, h1, v2.h[3] // encoding: [0x20,0xd0,0x72,0x7f]
// CHECK: sqrdmlsh h0, h1, v2.h[3] // encoding: [0x20,0xf0,0x72,0x7f]
// CHECK: sqrdmlah s0, s1, v2.s[3] // encoding: [0x20,0xd8,0xa2,0x7f]
// CHECK: sqrdmlsh s0, s1, v2.s[3] // encoding: [0x20,0xf8,0xa2,0x7f]
// Rejected forms: a b-register destination, a d-register source, h registers
// combined with an .s lane, and an .s lane index of 4, which the expected
// diagnostic reports as outside [0, 3].
sqrdmlah b0, h1, v2.h[3]
sqrdmlah s0, d1, v2.s[3]
sqrdmlsh h0, h1, v2.s[3]
sqrdmlsh s0, s1, v2.s[4]
// CHECK-ERROR: error: invalid operand for instruction
// CHECK-ERROR: sqrdmlah b0, h1, v2.h[3]
// CHECK-ERROR: ^
// CHECK-ERROR: error: invalid operand for instruction
// CHECK-ERROR: sqrdmlah s0, d1, v2.s[3]
// CHECK-ERROR: ^
// CHECK-ERROR: error: invalid operand for instruction
// CHECK-ERROR: sqrdmlsh h0, h1, v2.s[3]
// CHECK-ERROR: ^
// CHECK-ERROR: error: vector lane must be an integer in range [0, 3].
// CHECK-ERROR: sqrdmlsh s0, s1, v2.s[4]
// CHECK-ERROR: ^

View File

@ -0,0 +1,129 @@
# Disassembler test for sqrdmlah/sqrdmlsh with +v8.1a enabled. The command
# below feeds this file to llvm-mc in --disassemble mode, merges stderr into
# stdout and verifies the combined output; it is wrapped in `not`, so llvm-mc
# is expected to exit with a failure status.
# RUN: not llvm-mc -triple aarch64-none-linux-gnu -mattr=+v8.1a --disassemble < %s 2>&1 | FileCheck %s
# Three-register vector encodings that must be rejected. The trailing comment
# on each line names the b- or d-sized arrangement the bytes would correspond
# to; each group is expected to yield an "invalid instruction encoding" warning.
[0x20,0x84,0x02,0x2e] # sqrdmlah v0.8b, v1.8b, v2.8b
[0x20,0x8c,0x02,0x2e] # sqrdmlsh v0.8b, v1.8b, v2.8b
[0x20,0x84,0xc2,0x2e] # sqrdmlah v0.1d, v1.1d, v2.1d
[0x20,0x8c,0xc2,0x2e] # sqrdmlsh v0.1d, v1.1d, v2.1d
[0x20,0x84,0x02,0x6e] # sqrdmlah v0.16b, v1.16b, v2.16b
[0x20,0x8c,0x02,0x6e] # sqrdmlsh v0.16b, v1.16b, v2.16b
[0x20,0x84,0xc2,0x6e] # sqrdmlah v0.2d, v1.2d, v2.2d
[0x20,0x8c,0xc2,0x6e] # sqrdmlsh v0.2d, v1.2d, v2.2d
# CHECK: warning: invalid instruction encoding
# CHECK: [0x20,0x84,0x02,0x2e]
# CHECK: warning: invalid instruction encoding
# CHECK: [0x20,0x8c,0x02,0x2e]
# CHECK: warning: invalid instruction encoding
# CHECK: [0x20,0x84,0xc2,0x2e]
# CHECK: warning: invalid instruction encoding
# CHECK: [0x20,0x8c,0xc2,0x2e]
# CHECK: warning: invalid instruction encoding
# CHECK: [0x20,0x84,0x02,0x6e]
# CHECK: warning: invalid instruction encoding
# CHECK: [0x20,0x8c,0x02,0x6e]
# CHECK: warning: invalid instruction encoding
# CHECK: [0x20,0x84,0xc2,0x6e]
# CHECK: warning: invalid instruction encoding
# CHECK: [0x20,0x8c,0xc2,0x6e]
# Scalar three-register encodings that must be rejected; the trailing comments
# name the b- and d-register forms the bytes would correspond to. Each group is
# expected to yield an "invalid instruction encoding" warning.
[0x20,0x84,0x02,0x7e] # sqrdmlah b0, b1, b2
[0x20,0x8c,0x02,0x7e] # sqrdmlsh b0, b1, b2
[0x20,0x84,0xc2,0x7e] # sqrdmlah d0, d1, d2
[0x20,0x8c,0xc2,0x7e] # sqrdmlsh d0, d1, d2
# CHECK: warning: invalid instruction encoding
# CHECK: [0x20,0x84,0x02,0x7e]
# CHECK: warning: invalid instruction encoding
# CHECK: [0x20,0x8c,0x02,0x7e]
# CHECK: warning: invalid instruction encoding
# CHECK: [0x20,0x84,0xc2,0x7e]
# CHECK: warning: invalid instruction encoding
# CHECK: [0x20,0x8c,0xc2,0x7e]
# Vector by-element encodings that must be rejected; the trailing comments name
# the b- and d-sized forms the bytes would correspond to. Each group is
# expected to yield an "invalid instruction encoding" warning.
[0x20,0xd0,0x32,0x2f] # sqrdmlah v0.8b, v1.8b, v2.b[3]
[0x20,0xf0,0x32,0x2f] # sqrdmlsh v0.8b, v1.8b, v2.b[3]
[0x20,0xd0,0xe2,0x2f] # sqrdmlah v0.1d, v1.1d, v2.d[1]
[0x20,0xf0,0xe2,0x2f] # sqrdmlsh v0.1d, v1.1d, v2.d[1]
[0x20,0xd0,0x32,0x6f] # sqrdmlah v0.16b, v1.16b, v2.b[3]
[0x20,0xf0,0x32,0x6f] # sqrdmlsh v0.16b, v1.16b, v2.b[3]
[0x20,0xd8,0xe2,0x6f] # sqrdmlah v0.2d, v1.2d, v2.d[3]
[0x20,0xf8,0xe2,0x6f] # sqrdmlsh v0.2d, v1.2d, v2.d[3]
# CHECK: warning: invalid instruction encoding
# CHECK: [0x20,0xd0,0x32,0x2f]
# CHECK: warning: invalid instruction encoding
# CHECK: [0x20,0xf0,0x32,0x2f]
# CHECK: warning: invalid instruction encoding
# CHECK: [0x20,0xd0,0xe2,0x2f]
# CHECK: warning: invalid instruction encoding
# CHECK: [0x20,0xf0,0xe2,0x2f]
# CHECK: warning: invalid instruction encoding
# CHECK: [0x20,0xd0,0x32,0x6f]
# CHECK: warning: invalid instruction encoding
# CHECK: [0x20,0xf0,0x32,0x6f]
# CHECK: warning: invalid instruction encoding
# CHECK: [0x20,0xd8,0xe2,0x6f]
# CHECK: warning: invalid instruction encoding
# CHECK: [0x20,0xf8,0xe2,0x6f]
# Scalar by-element encodings that must be rejected; the trailing comments name
# the b- and d-register forms the bytes would correspond to. Each group is
# expected to yield an "invalid instruction encoding" warning.
[0x20,0xd0,0x32,0x7f] # sqrdmlah b0, b1, v2.b[3]
[0x20,0xf0,0x32,0x7f] # sqrdmlsh b0, b1, v2.b[3]
[0x20,0xd8,0xe2,0x7f] # sqrdmlah d0, d1, v2.d[3]
[0x20,0xf8,0xe2,0x7f] # sqrdmlsh d0, d1, v2.d[3]
# CHECK: warning: invalid instruction encoding
# CHECK: [0x20,0xd0,0x32,0x7f]
# CHECK: warning: invalid instruction encoding
# CHECK: [0x20,0xf0,0x32,0x7f]
# CHECK: warning: invalid instruction encoding
# CHECK: [0x20,0xd8,0xe2,0x7f]
# CHECK: warning: invalid instruction encoding
# CHECK: [0x20,0xf8,0xe2,0x7f]
# Valid three-register vector encodings. The eight byte groups are listed in
# the same order as the expected disassembly that follows: .4h, .2s, .8h and
# .4s, each as a sqrdmlah/sqrdmlsh pair.
[0x20,0x84,0x42,0x2e]
[0x20,0x8c,0x42,0x2e]
[0x20,0x84,0x82,0x2e]
[0x20,0x8c,0x82,0x2e]
[0x20,0x84,0x42,0x6e]
[0x20,0x8c,0x42,0x6e]
[0x20,0x84,0x82,0x6e]
[0x20,0x8c,0x82,0x6e]
# CHECK: sqrdmlah v0.4h, v1.4h, v2.4h
# CHECK: sqrdmlsh v0.4h, v1.4h, v2.4h
# CHECK: sqrdmlah v0.2s, v1.2s, v2.2s
# CHECK: sqrdmlsh v0.2s, v1.2s, v2.2s
# CHECK: sqrdmlah v0.8h, v1.8h, v2.8h
# CHECK: sqrdmlsh v0.8h, v1.8h, v2.8h
# CHECK: sqrdmlah v0.4s, v1.4s, v2.4s
# CHECK: sqrdmlsh v0.4s, v1.4s, v2.4s
# Valid scalar three-register encodings, listed in the same order as the
# expected disassembly that follows (h registers first, then s registers).
[0x20,0x84,0x42,0x7e]
[0x20,0x8c,0x42,0x7e]
[0x20,0x84,0x82,0x7e]
[0x20,0x8c,0x82,0x7e]
# CHECK: sqrdmlah h0, h1, h2
# CHECK: sqrdmlsh h0, h1, h2
# CHECK: sqrdmlah s0, s1, s2
# CHECK: sqrdmlsh s0, s1, s2
# Valid vector by-element encodings, listed in the same order as the expected
# disassembly that follows. Each 4-byte group is bracketed, matching the
# bracketed byte groups used earlier in this file, so every instruction is
# presented to the disassembler as its own group.
[0x20,0xd0,0x72,0x2f]
[0x20,0xf0,0x72,0x2f]
[0x20,0xd0,0xa2,0x2f]
[0x20,0xf0,0xa2,0x2f]
[0x20,0xd0,0x72,0x6f]
[0x20,0xf0,0x72,0x6f]
[0x20,0xd8,0xa2,0x6f]
[0x20,0xf8,0xa2,0x6f]
# CHECK: sqrdmlah v0.4h, v1.4h, v2.h[3]
# CHECK: sqrdmlsh v0.4h, v1.4h, v2.h[3]
# CHECK: sqrdmlah v0.2s, v1.2s, v2.s[1]
# CHECK: sqrdmlsh v0.2s, v1.2s, v2.s[1]
# CHECK: sqrdmlah v0.8h, v1.8h, v2.h[3]
# CHECK: sqrdmlsh v0.8h, v1.8h, v2.h[3]
# CHECK: sqrdmlah v0.4s, v1.4s, v2.s[3]
# CHECK: sqrdmlsh v0.4s, v1.4s, v2.s[3]
# Valid scalar by-element encodings, listed in the same order as the expected
# disassembly that follows. Each 4-byte group is bracketed, matching the
# bracketed byte groups used earlier in this file, so every instruction is
# presented to the disassembler as its own group.
[0x20,0xd0,0x72,0x7f]
[0x20,0xf0,0x72,0x7f]
[0x20,0xd8,0xa2,0x7f]
[0x20,0xf8,0xa2,0x7f]
# CHECK: sqrdmlah h0, h1, v2.h[3]
# CHECK: sqrdmlsh h0, h1, v2.h[3]
# CHECK: sqrdmlah s0, s1, v2.s[3]
# CHECK: sqrdmlsh s0, s1, v2.s[3]