1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-11-25 04:02:41 +01:00

[AARCH64] Improve accumulator forwarding for Cortex-A57 model

The old CPU model only had MLA->MLA forwarding. I added some missing
MUL->MLA read advances and a missing absolute diff accumulator read
advance according to the Cortex A57 Software Optimization Guide.

The patch improves performance in EEMBC rgbyiqv2 by about 6%-7% and
spec2006/milc by 8% (repeated runs on multiple devices), and causes no
significant regressions (none in SPEC).

Differential Revision: https://reviews.llvm.org/D92296
This commit is contained in:
Usman Nadeem 2021-01-04 10:58:43 +00:00 committed by David Green
parent 8956cdfa58
commit 3f17c99d80
3 changed files with 560 additions and 21 deletions

View File

@ -93,7 +93,7 @@ def : SchedAlias<WriteFCmp, A57Write_3cyc_1V>;
def : SchedAlias<WriteFCvt, A57Write_5cyc_1V>;
def : SchedAlias<WriteFCopy, A57Write_5cyc_1L>;
def : SchedAlias<WriteFImm, A57Write_3cyc_1V>;
def : SchedAlias<WriteFMul, A57Write_5cyc_1V>;
def : WriteRes<WriteFMul, [A57UnitV]> { let Latency = 5;}
def : SchedAlias<WriteFDiv, A57Write_17cyc_1W>;
def : SchedAlias<WriteV, A57Write_3cyc_1V>;
def : SchedAlias<WriteVLD, A57Write_5cyc_1L>;
@ -350,12 +350,16 @@ def : InstRW<[A57Write_8cyc_8S, WriteAdr], (instregex "ST4Fourv(2d)_POST$")
// D form - v8i8_v8i16, v4i16_v4i32, v2i32_v2i64
// Q form - v16i8_v8i16, v8i16_v4i32, v4i32_v2i64
// Cortex A57 Software Optimization Guide Sec 3.14
// Advance for absolute diff accum, pairwise add and accumulate, shift accumulate
// 3-cycle read advance, restricted to values produced by the two
// *_NonMul_Forward writes in the list.
def A57ReadIVA3 : SchedReadAdvance<3, [A57Write_4cyc_1X_NonMul_Forward, A57Write_5cyc_2X_NonMul_Forward]>;
// NOTE(review): each instregex below appears on two consecutive InstRW lines
// (a plain write, then a *_NonMul_Forward write plus A57ReadIVA3). With no
// +/- markers these look like old/new pairs -- confirm against the raw diff.
// ASIMD absolute diff accum, D-form
def : InstRW<[A57Write_4cyc_1X], (instregex "^[SU]ABA(v8i8|v4i16|v2i32)$")>;
def : InstRW<[A57Write_4cyc_1X_NonMul_Forward, A57ReadIVA3], (instregex "^[SU]ABA(v8i8|v4i16|v2i32)$")>;
// ASIMD absolute diff accum, Q-form
def : InstRW<[A57Write_5cyc_2X], (instregex "^[SU]ABA(v16i8|v8i16|v4i32)$")>;
def : InstRW<[A57Write_5cyc_2X_NonMul_Forward, A57ReadIVA3], (instregex "^[SU]ABA(v16i8|v8i16|v4i32)$")>;
// ASIMD absolute diff accum long
def : InstRW<[A57Write_4cyc_1X], (instregex "^[SU]ABAL")>;
def : InstRW<[A57Write_4cyc_1X_NonMul_Forward, A57ReadIVA3], (instregex "^[SU]ABAL")>;
// ASIMD arith, reduce, 4H/4S
def : InstRW<[A57Write_4cyc_1X], (instregex "^[SU]?ADDL?V(v8i8|v4i16|v2i32)v$")>;
@ -372,32 +376,41 @@ def : InstRW<[A57Write_7cyc_1V_1X], (instregex "^[SU](MIN|MAX)V(v8i8|v8i16)v$")>
def : InstRW<[A57Write_8cyc_2X], (instregex "^[SU](MIN|MAX)Vv16i8v$")>;
// NOTE(review): in each form below, the first InstRW (regex covering P?MUL and
// SQR?DMULH together) overlaps the two narrower InstRWs that follow it. With
// no +/- markers it looks like the removed line -- confirm against the raw diff.
// ASIMD multiply, D-form
def : InstRW<[A57Write_5cyc_1W], (instregex "^(P?MUL|SQR?DMULH)(v8i8|v4i16|v2i32|v1i8|v1i16|v1i32|v1i64)(_indexed)?$")>;
// MUL
// Uses the *_Mul_Forward write, which A57ReadIVMA4/A57ReadIVMA3 list as a
// producer their read advance applies to.
def : InstRW<[A57Write_5cyc_1W_Mul_Forward], (instregex "^MUL(v8i8|v4i16|v2i32|v1i8|v1i16|v1i32|v1i64)(_indexed)?$")>;
// PMUL, SQDMULH, SQRDMULH
// Keep the plain write, which is not in those read-advance lists.
def : InstRW<[A57Write_5cyc_1W], (instregex "^(PMUL|SQR?DMULH)(v8i8|v4i16|v2i32|v1i8|v1i16|v1i32|v1i64)(_indexed)?$")>;
// ASIMD multiply, Q-form
def : InstRW<[A57Write_6cyc_2W], (instregex "^(P?MUL|SQR?DMULH)(v16i8|v8i16|v4i32)(_indexed)?$")>;
// MUL
def : InstRW<[A57Write_6cyc_2W_Mul_Forward], (instregex "^MUL(v16i8|v8i16|v4i32)(_indexed)?$")>;
// PMUL, SQDMULH, SQRDMULH
def : InstRW<[A57Write_6cyc_2W], (instregex "^(PMUL|SQR?DMULH)(v16i8|v8i16|v4i32)(_indexed)?$")>;
// Cortex A57 Software Optimization Guide Sec 3.14
// Read advances of 4 and 3 cycles, restricted to values produced by the two
// *_Mul_Forward writes in the list.
def A57ReadIVMA4 : SchedReadAdvance<4 , [A57Write_5cyc_1W_Mul_Forward, A57Write_6cyc_2W_Mul_Forward]>;
def A57ReadIVMA3 : SchedReadAdvance<3 , [A57Write_5cyc_1W_Mul_Forward, A57Write_6cyc_2W_Mul_Forward]>;
// NOTE(review): the D-form and Q-form sections below each list the same
// instregex twice (plain write, then *_Mul_Forward write plus A57ReadIVMA4).
// With no +/- markers these look like old/new pairs -- confirm.
// ASIMD multiply accumulate, D-form
def : InstRW<[A57Write_5cyc_1W], (instregex "^ML[AS](v8i8|v4i16|v2i32)(_indexed)?$")>;
def : InstRW<[A57Write_5cyc_1W_Mul_Forward, A57ReadIVMA4], (instregex "^ML[AS](v8i8|v4i16|v2i32)(_indexed)?$")>;
// ASIMD multiply accumulate, Q-form
def : InstRW<[A57Write_6cyc_2W], (instregex "^ML[AS](v16i8|v8i16|v4i32)(_indexed)?$")>;
def : InstRW<[A57Write_6cyc_2W_Mul_Forward, A57ReadIVMA4], (instregex "^ML[AS](v16i8|v8i16|v4i32)(_indexed)?$")>;
// ASIMD multiply accumulate long
// ASIMD multiply accumulate saturating long
// NOTE(review): A57ReadIVMA4 is defined a second time below, over A57WriteIVMA.
// That definition and the InstRW using A57WriteIVMA are presumably the removed
// lines -- confirm.
def A57WriteIVMA : SchedWriteRes<[A57UnitW]> { let Latency = 5; }
def A57ReadIVMA4 : SchedReadAdvance<4, [A57WriteIVMA]>;
def : InstRW<[A57WriteIVMA, A57ReadIVMA4], (instregex "^(S|U|SQD)ML[AS]L")>;
// Non-saturating long accumulate reads with the 4-cycle advance, the
// saturating SQDML[AS]L form with the 3-cycle advance.
def : InstRW<[A57Write_5cyc_1W_Mul_Forward, A57ReadIVMA4], (instregex "^(S|U)ML[AS]L")>;
def : InstRW<[A57Write_5cyc_1W_Mul_Forward, A57ReadIVMA3], (instregex "^SQDML[AS]L")>;
// ASIMD multiply long
// NOTE(review): the first InstRW, "^(S|U|SQD)MULL", overlaps the two narrower
// ones after it and is presumably the removed line -- confirm against the raw
// diff.
def : InstRW<[A57Write_5cyc_1W], (instregex "^(S|U|SQD)MULL")>;
// SMULL/UMULL use the *_Mul_Forward write; SQDMULL and PMULL keep plain writes.
def : InstRW<[A57Write_5cyc_1W_Mul_Forward], (instregex "^(S|U)MULL")>;
def : InstRW<[A57Write_5cyc_1W], (instregex "^SQDMULL")>;
def : InstRW<[A57Write_5cyc_1W], (instregex "^PMULL(v8i8|v16i8)")>;
def : InstRW<[A57Write_3cyc_1W], (instregex "^PMULL(v1i64|v2i64)")>;
// ASIMD pairwise add and accumulate
// ASIMD shift accumulate
// NOTE(review): A57WriteIVA, the A57ReadIVA3 defined over it, and the two
// InstRWs that use A57WriteIVA are presumably the removed lines. The last two
// InstRWs use the *_NonMul_Forward write instead -- confirm.
def A57WriteIVA : SchedWriteRes<[A57UnitX]> { let Latency = 4; }
def A57ReadIVA3 : SchedReadAdvance<3, [A57WriteIVA]>;
def : InstRW<[A57WriteIVA, A57ReadIVA3], (instregex "^[SU]ADALP")>;
def : InstRW<[A57WriteIVA, A57ReadIVA3], (instregex "^(S|SR|U|UR)SRA")>;
def : InstRW<[A57Write_4cyc_1X_NonMul_Forward, A57ReadIVA3], (instregex "^[SU]ADALP")>;
def : InstRW<[A57Write_4cyc_1X_NonMul_Forward, A57ReadIVA3], (instregex "^(S|SR|U|UR)SRA")>;
// ASIMD shift by immed, complex
def : InstRW<[A57Write_4cyc_1X], (instregex "^[SU]?(Q|R){1,2}SHR")>;
@ -474,17 +487,22 @@ def : InstRW<[A57Write_9cyc_3V], (instregex "^(FMAX|FMIN)(NM)?P(v4f32|v2f64|v2i6
def : InstRW<[A57Write_10cyc_3V], (instregex "^(FMAX|FMIN)(NM)?Vv")>;
// NOTE(review): several instregex patterns below appear on two consecutive
// InstRW lines, and A57ReadFPVMA5 is defined twice. With no +/- markers these
// look like old/new pairs -- confirm against the raw diff.
// ASIMD FP multiply, D-form, FZ
def : InstRW<[A57Write_5cyc_1V], (instregex "^FMULX?(v2f32|v1i32|v2i32|v1i64|32|64)")>;
// The *_FP_Forward write is listed as a producer in the FP multiply-accumulate
// read advances below.
def : InstRW<[A57Write_5cyc_1V_FP_Forward], (instregex "^FMULX?(v2f32|v1i32|v2i32|v1i64|32|64)")>;
// ASIMD FP multiply, Q-form, FZ
def : InstRW<[A57Write_5cyc_2V], (instregex "^FMULX?(v4f32|v2f64|v4i32|v2i64)")>;
def : InstRW<[A57Write_5cyc_2V_FP_Forward], (instregex "^FMULX?(v4f32|v2f64|v4i32|v2i64)")>;
// ASIMD FP multiply accumulate, D-form, FZ
// ASIMD FP multiply accumulate, Q-form, FZ
// D-form accumulate: one V unit, latency 9. Q-form: two V units, latency 10.
def A57WriteFPVMAD : SchedWriteRes<[A57UnitV]> { let Latency = 9; }
def A57WriteFPVMAQ : SchedWriteRes<[A57UnitV, A57UnitV]> { let Latency = 10; }
def A57ReadFPVMA5 : SchedReadAdvance<5, [A57WriteFPVMAD, A57WriteFPVMAQ]>;
// Cortex A57 Software Optimization Guide Sec 3.15
// Advances from FP mul and mul-accum to mul-accum
// Both advances cover the two accumulate writes and the two FP multiply
// *_FP_Forward writes; they differ only in cycle count (5 vs 6).
def A57ReadFPVMA5 : SchedReadAdvance<5, [A57WriteFPVMAD, A57WriteFPVMAQ, A57Write_5cyc_1V_FP_Forward, A57Write_5cyc_2V_FP_Forward]>;
def A57ReadFPVMA6 : SchedReadAdvance<6, [A57WriteFPVMAD, A57WriteFPVMAQ, A57Write_5cyc_1V_FP_Forward, A57Write_5cyc_2V_FP_Forward]>;
// D-form accumulate reads with the 5-cycle advance.
def : InstRW<[A57WriteFPVMAD, A57ReadFPVMA5], (instregex "^FML[AS](v2f32|v1i32|v2i32|v1i64)")>;
// Q-form accumulate: listed once with the 5-cycle advance and once with the
// 6-cycle advance (see the note at the top of this section).
def : InstRW<[A57WriteFPVMAQ, A57ReadFPVMA5], (instregex "^FML[AS](v4f32|v2f64|v4i32|v2i64)")>;
def : InstRW<[A57WriteFPVMAQ, A57ReadFPVMA6], (instregex "^FML[AS](v4f32|v2f64|v4i32|v2i64)")>;
// ASIMD FP round, D-form
def : InstRW<[A57Write_5cyc_1V], (instregex "^FRINT[AIMNPXZ](v2f32)")>;
@ -547,8 +565,9 @@ def : InstRW<[A57Write_6cyc_3V], (instregex "^(UZP|ZIP)(1|2)(v16i8|v8i16|v4i32|v
def : InstRW<[A57Write_5cyc_1V], (instregex "^F(ADD|SUB)[DS]rr")>;
// Cortex A57 Software Optimization Guide Sec 3.10
// Scalar FP multiply-accumulate write: one V unit, latency 9.
def A57WriteFPMA : SchedWriteRes<[A57UnitV]> { let Latency = 9; }
// NOTE(review): A57ReadFPMA5 is defined twice. The first form (without
// WriteFMul) is presumably the removed line, the second its replacement --
// confirm against the raw diff.
def A57ReadFPMA5 : SchedReadAdvance<5, [A57WriteFPMA]>;
// 5-cycle read advance for values produced by A57WriteFPMA or WriteFMul.
def A57ReadFPMA5 : SchedReadAdvance<5, [A57WriteFPMA, WriteFMul]>;
// Read with no advance.
def A57ReadFPM : SchedReadAdvance<0>;
// One write followed by three reads: the first two use A57ReadFPM and the
// third uses A57ReadFPMA5.
def : InstRW<[A57WriteFPMA, A57ReadFPM, A57ReadFPM, A57ReadFPMA5], (instregex "^FN?M(ADD|SUB)[DS]rrr")>;

View File

@ -13,6 +13,10 @@
// Prefix: A57Write
// Latency: #cyc
// MicroOp Count/Types: #(B|I|M|L|S|X|W|V)
// Postfix (optional): (XYZ)_Forward
//
// The postfix is added to differentiate SchedWriteRes that are used in
// subsequent SchedReadAdvances.
//
// e.g. A57Write_6cyc_1I_6S_4V means the total latency is 6 and there are
// 11 micro-ops to be issued down one I pipe, six S pipes and four V pipes.
@ -25,7 +29,9 @@
// Single-unit write resources; each name encodes latency and issue unit.
def A57Write_5cyc_1L : SchedWriteRes<[A57UnitL]> { let Latency = 5; }
def A57Write_5cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 5; }
def A57Write_5cyc_1V : SchedWriteRes<[A57UnitV]> { let Latency = 5; }
// Same unit and latency as A57Write_5cyc_1V, kept as a separate record.
def A57Write_5cyc_1V_FP_Forward : SchedWriteRes<[A57UnitV]> { let Latency = 5; }
def A57Write_5cyc_1W : SchedWriteRes<[A57UnitW]> { let Latency = 5; }
// Same unit and latency as A57Write_5cyc_1W, kept as a separate record.
def A57Write_5cyc_1W_Mul_Forward : SchedWriteRes<[A57UnitW]> { let Latency = 5; }
def A57Write_10cyc_1V : SchedWriteRes<[A57UnitV]> { let Latency = 10; }
// Latency 17 with ResourceCycles = [17] on the W unit.
def A57Write_17cyc_1W : SchedWriteRes<[A57UnitW]> { let Latency = 17;
let ResourceCycles = [17]; }
@ -45,6 +51,7 @@ def A57Write_3cyc_1W : SchedWriteRes<[A57UnitW]> { let Latency = 3; }
// Single-unit write resources; each name encodes latency and issue unit.
def A57Write_3cyc_1X : SchedWriteRes<[A57UnitX]> { let Latency = 3; }
def A57Write_4cyc_1L : SchedWriteRes<[A57UnitL]> { let Latency = 4; }
def A57Write_4cyc_1X : SchedWriteRes<[A57UnitX]> { let Latency = 4; }
// Same unit and latency as A57Write_4cyc_1X, kept as a separate record.
def A57Write_4cyc_1X_NonMul_Forward : SchedWriteRes<[A57UnitX]> { let Latency = 4; }
def A57Write_9cyc_1V : SchedWriteRes<[A57UnitV]> { let Latency = 9; }
def A57Write_6cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 6; }
def A57Write_6cyc_1V : SchedWriteRes<[A57UnitV]> { let Latency = 6; }
@ -93,6 +100,10 @@ def A57Write_6cyc_2W : SchedWriteRes<[A57UnitW, A57UnitW]> {
let Latency = 6;
let NumMicroOps = 2;
}
// Two micro-ops on two W units, latency 6; a separate record carrying the
// optional "_Forward" postfix described in this file's naming header.
def A57Write_6cyc_2W_Mul_Forward : SchedWriteRes<[A57UnitW, A57UnitW]> {
let Latency = 6;
let NumMicroOps = 2;
}
def A57Write_5cyc_1I_1L : SchedWriteRes<[A57UnitI,
A57UnitL]> {
let Latency = 5;
@ -102,10 +113,18 @@ def A57Write_5cyc_2V : SchedWriteRes<[A57UnitV, A57UnitV]> {
let Latency = 5;
let NumMicroOps = 2;
}
// Two micro-ops on two V units, latency 5; a separate record carrying the
// optional "_Forward" postfix described in this file's naming header.
def A57Write_5cyc_2V_FP_Forward : SchedWriteRes<[A57UnitV, A57UnitV]> {
let Latency = 5;
let NumMicroOps = 2;
}
// Two micro-ops on two X units, latency 5.
def A57Write_5cyc_2X : SchedWriteRes<[A57UnitX, A57UnitX]> {
let Latency = 5;
let NumMicroOps = 2;
}
// Same units, latency and micro-op count as A57Write_5cyc_2X, kept as a
// separate record carrying the "_Forward" postfix.
def A57Write_5cyc_2X_NonMul_Forward : SchedWriteRes<[A57UnitX, A57UnitX]> {
let Latency = 5;
let NumMicroOps = 2;
}
def A57Write_10cyc_1L_1V : SchedWriteRes<[A57UnitL,
A57UnitV]> {
let Latency = 10;

View File

@ -0,0 +1,501 @@
# RUN: llvm-mca -march=aarch64 -mcpu=cortex-a57 -iterations=1 -timeline < %s | FileCheck %s
# CHECK: [0] Code Region
# CHECK: Instructions: 2
# CHECK-NEXT: Total Cycles: 12
# CHECK: Timeline view:
# CHECK: [0,0] DeeeeeER .. fmul v0.2s, v1.2s, v2.2s
# CHECK-NEXT: [0,1] DeeeeeeeeeER fmla v0.2s, v1.2s, v2.2s
# CHECK: [1] Code Region
# CHECK: Instructions: 2
# CHECK-NEXT: Total Cycles: 13
# CHECK: Timeline view:
# CHECK: [0,0] DeeeeeER . . fmul v0.4s, v1.4s, v2.4s
# CHECK-NEXT: [0,1] DeeeeeeeeeeER fmla v0.4s, v1.4s, v2.4s
# CHECK: [2] Code Region
# CHECK: Instructions: 2
# CHECK-NEXT: Total Cycles: 12
# CHECK: Timeline view:
# CHECK: [0,0] DeeeeeER .. fmulx v0.2s, v1.2s, v2.2s
# CHECK-NEXT: [0,1] DeeeeeeeeeER fmls v0.2s, v1.2s, v2.2s
# CHECK: [3] Code Region
# CHECK: Instructions: 2
# CHECK-NEXT: Total Cycles: 13
# CHECK: Timeline view:
# CHECK: [0,0] DeeeeeER . . fmulx v0.4s, v1.4s, v2.4s
# CHECK-NEXT: [0,1] DeeeeeeeeeeER fmls v0.4s, v1.4s, v2.4s
# CHECK: [4] Code Region
# CHECK: Instructions: 2
# CHECK-NEXT: Total Cycles: 16
# CHECK: Timeline view:
# CHECK: [0,0] DeeeeeeeeeER . fmla v0.2s, v1.2s, v2.2s
# CHECK-NEXT: [0,1] D====eeeeeeeeeER fmla v0.2s, v3.2s, v4.2s
# CHECK: [5] Code Region
# CHECK: Instructions: 2
# CHECK-NEXT: Total Cycles: 16
# CHECK: Timeline view:
# CHECK: [0,0] DeeeeeeeeeER . fmls v0.2s, v1.2s, v2.2s
# CHECK-NEXT: [0,1] D====eeeeeeeeeER fmls v0.2s, v3.2s, v4.2s
# CHECK: [6] Code Region
# CHECK: Instructions: 2
# CHECK-NEXT: Total Cycles: 12
# CHECK: Timeline view:
# CHECK: [0,0] DeeeeeER .. fmul d4, d5, d6
# CHECK-NEXT: [0,1] DeeeeeeeeeER fmadd d1, d2, d3, d4
# CHECK: [7] Code Region
# CHECK: Instructions: 2
# CHECK-NEXT: Total Cycles: 12
# CHECK: Timeline view:
# CHECK: [0,0] DeeeeeER .. fmul d4, d5, d6
# CHECK-NEXT: [0,1] DeeeeeeeeeER fmadd d1, d2, d3, d4
# CHECK: [8] Code Region
# CHECK: Instructions: 2
# CHECK-NEXT: Total Cycles: 16
# CHECK: Timeline view:
# CHECK: [0,0] DeeeeeeeeeER . fmadd d4, d5, d6, d7
# CHECK-NEXT: [0,1] D====eeeeeeeeeER fmadd d1, d2, d3, d4
# CHECK: [9] Code Region
# CHECK: Instructions: 2
# CHECK-NEXT: Total Cycles: 16
# CHECK: Timeline view:
# CHECK: [0,0] DeeeeeeeeeER . fmsub d4, d5, d6, d7
# CHECK-NEXT: [0,1] D====eeeeeeeeeER fmsub d1, d2, d3, d4
# CHECK: [10] Code Region
# CHECK: Instructions: 2
# CHECK-NEXT: Total Cycles: 16
# CHECK: Timeline view:
# CHECK: [0,0] DeeeeeeeeeER . fnmadd d4, d5, d6, d7
# CHECK-NEXT: [0,1] D====eeeeeeeeeER fnmadd d1, d2, d3, d4
# CHECK: [11] Code Region
# CHECK: Instructions: 2
# CHECK-NEXT: Total Cycles: 16
# CHECK: Timeline view:
# CHECK: [0,0] DeeeeeeeeeER . fnmsub d4, d5, d6, d7
# CHECK-NEXT: [0,1] D====eeeeeeeeeER fnmsub d1, d2, d3, d4
# CHECK: [12] Code Region
# CHECK: Instructions: 2
# CHECK-NEXT: Total Cycles: 8
# CHECK: Timeline view:
# CHECK: [0,0] DeeeeER. saba v0.2s, v1.2s, v2.2s
# CHECK-NEXT: [0,1] D=eeeeER saba v0.2s, v3.2s, v4.2s
# CHECK: [13] Code Region
# CHECK: Instructions: 2
# CHECK-NEXT: Total Cycles: 8
# CHECK: Timeline view:
# CHECK: [0,0] DeeeeER. sabal v0.2d, v1.2s, v2.2s
# CHECK-NEXT: [0,1] D=eeeeER sabal v0.2d, v3.2s, v4.2s
# CHECK: [14] Code Region
# CHECK: Instructions: 2
# CHECK-NEXT: Total Cycles: 8
# CHECK: Timeline view:
# CHECK: [0,0] DeeeeER. uaba v0.2s, v1.2s, v2.2s
# CHECK-NEXT: [0,1] D=eeeeER uaba v0.2s, v3.2s, v4.2s
# CHECK: [15] Code Region
# CHECK: Instructions: 2
# CHECK-NEXT: Total Cycles: 8
# CHECK: Timeline view:
# CHECK: [0,0] DeeeeER. uabal v0.2d, v1.2s, v2.2s
# CHECK-NEXT: [0,1] D=eeeeER uabal v0.2d, v3.2s, v4.2s
# CHECK: [16] Code Region
# CHECK: Instructions: 2
# CHECK-NEXT: Total Cycles: 8
# CHECK: Timeline view:
# CHECK: [0,0] DeeeeER. sadalp v0.1d, v1.2s
# CHECK-NEXT: [0,1] D=eeeeER sadalp v0.1d, v2.2s
# CHECK: [17] Code Region
# CHECK: Instructions: 2
# CHECK-NEXT: Total Cycles: 8
# CHECK: Timeline view:
# CHECK: [0,0] DeeeeER. uadalp v0.1d, v1.2s
# CHECK-NEXT: [0,1] D=eeeeER uadalp v0.1d, v2.2s
# CHECK: [18] Code Region
# CHECK: Instructions: 2
# CHECK-NEXT: Total Cycles: 8
# CHECK: Timeline view:
# CHECK: [0,0] DeeeeER. srsra v0.8b, v1.8b, #3
# CHECK-NEXT: [0,1] D=eeeeER srsra v0.8b, v2.8b, #3
# CHECK: [19] Code Region
# CHECK: Instructions: 2
# CHECK-NEXT: Total Cycles: 8
# CHECK: Timeline view:
# CHECK: [0,0] DeeeeER. ursra v0.8b, v1.8b, #3
# CHECK-NEXT: [0,1] D=eeeeER ursra v0.8b, v2.8b, #3
# CHECK: [20] Code Region
# CHECK: Instructions: 2
# CHECK-NEXT: Total Cycles: 8
# CHECK: Timeline view:
# CHECK: [0,0] DeeeeER. usra v0.4s, v1.4s, #3
# CHECK-NEXT: [0,1] D=eeeeER usra v0.4s, v2.4s, #3
# CHECK: [21] Code Region
# CHECK: Instructions: 2
# CHECK-NEXT: Total Cycles: 9
# CHECK: Timeline view:
# CHECK: [0,0] DeeeeeER. mul v0.2s, v1.2s, v2.2s
# CHECK-NEXT: [0,1] D=eeeeeER mla v0.2s, v1.2s, v2.2s
# CHECK: [22] Code Region
# CHECK: Instructions: 2
# CHECK-NEXT: Total Cycles: 13
# CHECK: Timeline view:
# CHECK: [0,0] DeeeeeER . . pmul v0.8b, v1.8b, v2.8b
# CHECK-NEXT: [0,1] D=====eeeeeER mla v0.8b, v1.8b, v2.8b
# CHECK: [23] Code Region
# CHECK: Instructions: 2
# CHECK-NEXT: Total Cycles: 13
# CHECK: Timeline view:
# CHECK: [0,0] DeeeeeER . . sqdmulh v0.2s, v1.2s, v2.2s
# CHECK-NEXT: [0,1] D=====eeeeeER mla v0.2s, v1.2s, v2.2s
# CHECK: [24] Code Region
# CHECK: Instructions: 2
# CHECK-NEXT: Total Cycles: 13
# CHECK: Timeline view:
# CHECK: [0,0] DeeeeeER . . sqrdmulh v0.2s, v1.2s, v2.2s
# CHECK-NEXT: [0,1] D=====eeeeeER mla v0.2s, v1.2s, v2.2s
# CHECK: [25] Code Region
# CHECK: Instructions: 2
# CHECK-NEXT: Total Cycles: 9
# CHECK: Timeline view:
# CHECK: [0,0] DeeeeeER. smull v0.2d, v1.2s, v2.2s
# CHECK-NEXT: [0,1] D=eeeeeER smlal v0.2d, v1.2s, v2.2s
# CHECK: [26] Code Region
# CHECK: Instructions: 2
# CHECK-NEXT: Total Cycles: 9
# CHECK: Timeline view:
# CHECK: [0,0] DeeeeeER. umull v0.2d, v1.2s, v2.2s
# CHECK-NEXT: [0,1] D=eeeeeER umlal v0.2d, v1.2s, v2.2s
# CHECK: [27] Code Region
# CHECK: Instructions: 2
# CHECK-NEXT: Total Cycles: 13
# CHECK: Timeline view:
# CHECK: [0,0] DeeeeeER . . sqdmull v0.2d, v1.2s, v2.2s
# CHECK-NEXT: [0,1] D=====eeeeeER smlal v0.2d, v1.2s, v2.2s
# CHECK: [28] Code Region
# CHECK: Instructions: 2
# CHECK-NEXT: Total Cycles: 13
# CHECK: Timeline view:
# CHECK: [0,0] DeeeeeER . . pmull v0.8h, v1.8b, v2.8b
# CHECK-NEXT: [0,1] D=====eeeeeER smlal v0.8h, v1.8b, v2.8b
# CHECK: [29] Code Region
# CHECK: Instructions: 2
# CHECK-NEXT: Total Cycles: 13
# CHECK: Timeline view:
# CHECK: [0,0] DeeeeeER . . pmull2 v0.8h, v1.16b, v2.16b
# CHECK-NEXT: [0,1] D=====eeeeeER smlal v0.8h, v1.8b, v2.8b
# CHECK: [30] Code Region
# CHECK: Instructions: 2
# CHECK-NEXT: Total Cycles: 9
# CHECK: Timeline view:
# CHECK: [0,0] DeeeeeER. mla v0.2s, v1.2s, v2.2s
# CHECK-NEXT: [0,1] D=eeeeeER mla v0.2s, v1.2s, v2.2s
# CHECK: [31] Code Region
# CHECK: Instructions: 2
# CHECK-NEXT: Total Cycles: 11
# CHECK: Timeline view:
# CHECK: [0,0] DeeeeeeER . mla v0.4s, v1.4s, v2.4s
# CHECK-NEXT: [0,1] .D=eeeeeeER mla v0.4s, v1.4s, v2.4s
# CHECK: [32] Code Region
# CHECK: Instructions: 2
# CHECK-NEXT: Total Cycles: 9
# CHECK: Timeline view:
# CHECK: [0,0] DeeeeeER. mls v0.2s, v1.2s, v2.2s
# CHECK-NEXT: [0,1] D=eeeeeER mls v0.2s, v1.2s, v2.2s
# CHECK: [33] Code Region
# CHECK: Instructions: 2
# CHECK-NEXT: Total Cycles: 11
# CHECK: Timeline view:
# CHECK: [0,0] DeeeeeeER . mls v0.4s, v1.4s, v2.4s
# CHECK-NEXT: [0,1] .D=eeeeeeER mls v0.4s, v1.4s, v2.4s
# CHECK: [34] Code Region
# CHECK: Instructions: 2
# CHECK-NEXT: Total Cycles: 9
# CHECK: Timeline view:
# CHECK: [0,0] DeeeeeER. smlal v0.2d, v1.2s, v2.2s
# CHECK-NEXT: [0,1] D=eeeeeER smlal v0.2d, v1.2s, v2.2s
# CHECK: [35] Code Region
# CHECK: Instructions: 2
# CHECK-NEXT: Total Cycles: 9
# CHECK: Timeline view:
# CHECK: [0,0] DeeeeeER. smlsl v0.2d, v1.2s, v2.2s
# CHECK-NEXT: [0,1] D=eeeeeER smlsl v0.2d, v1.2s, v2.2s
# CHECK: [36] Code Region
# CHECK: Instructions: 2
# CHECK-NEXT: Total Cycles: 9
# CHECK: Timeline view:
# CHECK: [0,0] DeeeeeER. umlal v0.2d, v1.2s, v2.2s
# CHECK-NEXT: [0,1] D=eeeeeER umlal v0.2d, v1.2s, v2.2s
# CHECK: [37] Code Region
# CHECK: Instructions: 2
# CHECK-NEXT: Total Cycles: 9
# CHECK: Timeline view:
# CHECK: [0,0] DeeeeeER. umlsl v0.2d, v1.2s, v2.2s
# CHECK-NEXT: [0,1] D=eeeeeER umlsl v0.2d, v1.2s, v2.2s
# CHECK: [38] Code Region
# CHECK: Instructions: 2
# CHECK-NEXT: Total Cycles: 10
# CHECK: Timeline view:
# CHECK: [0,0] DeeeeeER . sqdmlal v0.2d, v1.2s, v2.2s
# CHECK-NEXT: [0,1] D==eeeeeER sqdmlal v0.2d, v1.2s, v2.2s
# CHECK: [39] Code Region
# CHECK: Instructions: 2
# CHECK-NEXT: Total Cycles: 10
# CHECK: Timeline view:
# CHECK: [0,0] DeeeeeER . sqdmlsl v0.2d, v1.2s, v2.2s
# CHECK-NEXT: [0,1] D==eeeeeER sqdmlsl v0.2d, v1.2s, v2.2s
# ASIMD FP Instructions
# FMUL, FMULX, FMLA, FMLS are impacted
# testing only a subset of combinations
# Each marked region holds a producer of v0 followed by a multiply-accumulate
# on v0; the expected timelines are the numbered code regions in the checks
# at the top of this file, in the same order.
# Multiply feeding an accumulate: the expected timeline shows no wait on the
# second instruction.
# LLVM-MCA-BEGIN
fmul v0.2s, v1.2s, v2.2s
fmla v0.2s, v1.2s, v2.2s
# LLVM-MCA-END
# LLVM-MCA-BEGIN
fmul v0.4s, v1.4s, v2.4s
fmla v0.4s, v1.4s, v2.4s
# LLVM-MCA-END
# LLVM-MCA-BEGIN
fmulx v0.2s, v1.2s, v2.2s
fmls v0.2s, v1.2s, v2.2s
# LLVM-MCA-END
# LLVM-MCA-BEGIN
fmulx v0.4s, v1.4s, v2.4s
fmls v0.4s, v1.4s, v2.4s
# LLVM-MCA-END
# Accumulate feeding an accumulate: the expected timeline shows four wait
# cycles on the second instruction.
# LLVM-MCA-BEGIN
fmla v0.2s, v1.2s, v2.2s
fmla v0.2s, v3.2s, v4.2s
# LLVM-MCA-END
# LLVM-MCA-BEGIN
fmls v0.2s, v1.2s, v2.2s
fmls v0.2s, v3.2s, v4.2s
# LLVM-MCA-END
# FP Multiply Instructions
# FMUL, FNMUL, FMADD, FMSUB, FNMADD, FNMSUB are impacted
# testing only a subset of combinations
# In every region the first instruction writes d4, which the second
# instruction reads as its last source operand.
# LLVM-MCA-BEGIN
fmul d4, d5, d6
fmadd d1, d2, d3, d4
# LLVM-MCA-END
# NOTE(review): this region repeats the previous one verbatim; a different
# variant may have been intended -- confirm.
# LLVM-MCA-BEGIN
fmul d4, d5, d6
fmadd d1, d2, d3, d4
# LLVM-MCA-END
# Multiply-accumulate feeding a multiply-accumulate: the expected timeline
# shows four wait cycles on the second instruction.
# LLVM-MCA-BEGIN
fmadd d4, d5, d6, d7
fmadd d1, d2, d3, d4
# LLVM-MCA-END
# LLVM-MCA-BEGIN
fmsub d4, d5, d6, d7
fmsub d1, d2, d3, d4
# LLVM-MCA-END
# LLVM-MCA-BEGIN
fnmadd d4, d5, d6, d7
fnmadd d1, d2, d3, d4
# LLVM-MCA-END
# LLVM-MCA-BEGIN
fnmsub d4, d5, d6, d7
fnmsub d1, d2, d3, d4
# LLVM-MCA-END
# ASIMD Integer Instructions X-Unit
# SABA, UABA, SABAL, UABAL, SADALP, UADALP, SSRA, SRSRA, USRA, URSRA are impacted
# testing only a subset of combinations
# Each region chains two accumulating instructions through v0; the expected
# timeline shows one wait cycle on the second instruction.
# LLVM-MCA-BEGIN
saba v0.2s, v1.2s, v2.2s
saba v0.2s, v3.2s, v4.2s
# LLVM-MCA-END
# LLVM-MCA-BEGIN
sabal v0.2d, v1.2s, v2.2s
sabal v0.2d, v3.2s, v4.2s
# LLVM-MCA-END
# LLVM-MCA-BEGIN
uaba v0.2s, v1.2s, v2.2s
uaba v0.2s, v3.2s, v4.2s
# LLVM-MCA-END
# LLVM-MCA-BEGIN
uabal v0.2d, v1.2s, v2.2s
uabal v0.2d, v3.2s, v4.2s
# LLVM-MCA-END
# LLVM-MCA-BEGIN
sadalp v0.1d, v1.2s
sadalp v0.1d, v2.2s
# LLVM-MCA-END
# LLVM-MCA-BEGIN
uadalp v0.1d, v1.2s
uadalp v0.1d, v2.2s
# LLVM-MCA-END
# LLVM-MCA-BEGIN
srsra v0.8b, v1.8b, #3
srsra v0.8b, v2.8b, #3
# LLVM-MCA-END
# LLVM-MCA-BEGIN
ursra v0.8b, v1.8b, #3
ursra v0.8b, v2.8b, #3
# LLVM-MCA-END
# LLVM-MCA-BEGIN
usra v0.4s, v1.4s, #3
usra v0.4s, v2.4s, #3
# LLVM-MCA-END
# ASIMD Multiply Instructions W-Unit
# (the multiply writes in the Cortex-A57 model issue on A57UnitW)
# pmuls and sqd/sqrdmuls don't forward
# MULs
# Each region is a multiply writing v0 followed by a multiply-accumulate on v0.
# mul, smull and umull are expected to leave one wait cycle on the accumulate;
# the pmul/pmull and sqdmulh/sqrdmulh/sqdmull producers leave five.
# LLVM-MCA-BEGIN
mul v0.2s, v1.2s, v2.2s
mla v0.2s, v1.2s, v2.2s
# LLVM-MCA-END
# LLVM-MCA-BEGIN
pmul v0.8b, v1.8b, v2.8b
mla v0.8b, v1.8b, v2.8b
# LLVM-MCA-END
# LLVM-MCA-BEGIN
sqdmulh v0.2s, v1.2s, v2.2s
mla v0.2s, v1.2s, v2.2s
# LLVM-MCA-END
# LLVM-MCA-BEGIN
sqrdmulh v0.2s, v1.2s, v2.2s
mla v0.2s, v1.2s, v2.2s
# LLVM-MCA-END
# LLVM-MCA-BEGIN
smull v0.2d, v1.2s, v2.2s
smlal v0.2d, v1.2s, v2.2s
# LLVM-MCA-END
# LLVM-MCA-BEGIN
umull v0.2d, v1.2s, v2.2s
umlal v0.2d, v1.2s, v2.2s
# LLVM-MCA-END
# LLVM-MCA-BEGIN
sqdmull v0.2d, v1.2s, v2.2s
smlal v0.2d, v1.2s, v2.2s
# LLVM-MCA-END
# The next two regions put the arrangement suffix on the mnemonic; the
# expected output prints them as "pmull v0.8h, v1.8b, v2.8b" and
# "pmull2 v0.8h, v1.16b, v2.16b".
# LLVM-MCA-BEGIN
pmull.8h v0, v1, v2
smlal.8h v0, v1, v2
# LLVM-MCA-END
# LLVM-MCA-BEGIN
pmull2.8h v0, v1, v2
smlal.8h v0, v1, v2
# LLVM-MCA-END
# MLAs
# Each region issues the same multiply-accumulate twice, so the second one
# reads the v0 value the first one wrote.
# LLVM-MCA-BEGIN
mla v0.2s, v1.2s, v2.2s
mla v0.2s, v1.2s, v2.2s
# LLVM-MCA-END
# LLVM-MCA-BEGIN
mla v0.4s, v1.4s, v2.4s
mla v0.4s, v1.4s, v2.4s
# LLVM-MCA-END
# LLVM-MCA-BEGIN
mls v0.2s, v1.2s, v2.2s
mls v0.2s, v1.2s, v2.2s
# LLVM-MCA-END
# LLVM-MCA-BEGIN
mls v0.4s, v1.4s, v2.4s
mls v0.4s, v1.4s, v2.4s
# LLVM-MCA-END
# LLVM-MCA-BEGIN
smlal v0.2d, v1.2s, v2.2s
smlal v0.2d, v1.2s, v2.2s
# LLVM-MCA-END
# LLVM-MCA-BEGIN
smlsl v0.2d, v1.2s, v2.2s
smlsl v0.2d, v1.2s, v2.2s
# LLVM-MCA-END
# LLVM-MCA-BEGIN
umlal v0.2d, v1.2s, v2.2s
umlal v0.2d, v1.2s, v2.2s
# LLVM-MCA-END
# LLVM-MCA-BEGIN
umlsl v0.2d, v1.2s, v2.2s
umlsl v0.2d, v1.2s, v2.2s
# LLVM-MCA-END
# Saturating long accumulates: the expected timeline shows two wait cycles on
# the second instruction, one more than the non-saturating long forms above.
# LLVM-MCA-BEGIN
sqdmlal v0.2d, v1.2s, v2.2s
sqdmlal v0.2d, v1.2s, v2.2s
# LLVM-MCA-END
# LLVM-MCA-BEGIN
sqdmlsl v0.2d, v1.2s, v2.2s
sqdmlsl v0.2d, v1.2s, v2.2s
# LLVM-MCA-END