From 9c918327f6e32a0dcb9cc7ff76d8359b70fb32f4 Mon Sep 17 00:00:00 2001
From: Andrew Savonichev <andrew.savonichev@gmail.com>
Date: Fri, 15 Jan 2021 16:08:58 +0300
Subject: [PATCH] [MCA] Add tests for IPC on Cortex-A55

The tests compare IPC statistics that MCA provides with IPC values
measured on Cortex-A55 hardware. For hardware tests, each snippet is
run in a loop unrolled by 1000, and IPC is measured by linux-perf.

Several tests do not match the hardware: the skewed ALU is not
supported, and LDR seems to be missing a forwarding path.

Differential Revision: https://reviews.llvm.org/D98174
---
 .../AArch64/Cortex/IPC/A55-0-single-add.s     | 14 +++++++++++
 .../AArch64/Cortex/IPC/A55-1-add-seq.s        | 15 +++++++++++
 .../llvm-mca/AArch64/Cortex/IPC/A55-10-fma.s  | 15 +++++++++++
 .../AArch64/Cortex/IPC/A55-11-fma-mix.s       | 19 ++++++++++++++
 .../AArch64/Cortex/IPC/A55-2-skewed-alu.s     | 18 +++++++++++++
 .../llvm-mca/AArch64/Cortex/IPC/A55-3-mul.s   | 16 ++++++++++++
 .../llvm-mca/AArch64/Cortex/IPC/A55-4-sdiv.s  | 21 ++++++++++++++++
 .../AArch64/Cortex/IPC/A55-5-mul-sdiv.s       | 22 ++++++++++++++++
 .../llvm-mca/AArch64/Cortex/IPC/A55-6-mul.s   | 25 +++++++++++++++++++
 .../llvm-mca/AArch64/Cortex/IPC/A55-7-cmp.s   | 17 +++++++++++++
 .../llvm-mca/AArch64/Cortex/IPC/A55-8-ldr.s   | 19 ++++++++++++++
 .../llvm-mca/AArch64/Cortex/IPC/A55-9-fabs.s  | 15 +++++++++++
 12 files changed, 216 insertions(+)
 create mode 100644 test/tools/llvm-mca/AArch64/Cortex/IPC/A55-0-single-add.s
 create mode 100644 test/tools/llvm-mca/AArch64/Cortex/IPC/A55-1-add-seq.s
 create mode 100644 test/tools/llvm-mca/AArch64/Cortex/IPC/A55-10-fma.s
 create mode 100644 test/tools/llvm-mca/AArch64/Cortex/IPC/A55-11-fma-mix.s
 create mode 100644 test/tools/llvm-mca/AArch64/Cortex/IPC/A55-2-skewed-alu.s
 create mode 100644 test/tools/llvm-mca/AArch64/Cortex/IPC/A55-3-mul.s
 create mode 100644 test/tools/llvm-mca/AArch64/Cortex/IPC/A55-4-sdiv.s
 create mode 100644 test/tools/llvm-mca/AArch64/Cortex/IPC/A55-5-mul-sdiv.s
 create mode 100644 test/tools/llvm-mca/AArch64/Cortex/IPC/A55-6-mul.s
 create mode 100644 test/tools/llvm-mca/AArch64/Cortex/IPC/A55-7-cmp.s
 create mode 100644 test/tools/llvm-mca/AArch64/Cortex/IPC/A55-8-ldr.s
 create mode 100644 test/tools/llvm-mca/AArch64/Cortex/IPC/A55-9-fabs.s

diff --git a/test/tools/llvm-mca/AArch64/Cortex/IPC/A55-0-single-add.s b/test/tools/llvm-mca/AArch64/Cortex/IPC/A55-0-single-add.s
new file mode 100644
index 00000000000..4bb6d44393f
--- /dev/null
+++ b/test/tools/llvm-mca/AArch64/Cortex/IPC/A55-0-single-add.s
@@ -0,0 +1,14 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=aarch64 -mcpu=cortex-a55 --all-views=false --summary-view --iterations=1000 < %s | FileCheck %s
+
+add	w8, w8, #1
+
+# CHECK:      Iterations:        1000
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      1003
+# CHECK-NEXT: Total uOps:        1000
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    1.00
+# CHECK-NEXT: IPC:               1.00
+# CHECK-NEXT: Block RThroughput: 0.5
diff --git a/test/tools/llvm-mca/AArch64/Cortex/IPC/A55-1-add-seq.s b/test/tools/llvm-mca/AArch64/Cortex/IPC/A55-1-add-seq.s
new file mode 100644
index 00000000000..5ef72ee08e7
--- /dev/null
+++ b/test/tools/llvm-mca/AArch64/Cortex/IPC/A55-1-add-seq.s
@@ -0,0 +1,15 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=aarch64 -mcpu=cortex-a55 --all-views=false --summary-view --iterations=1000 < %s | FileCheck %s
+
+add	w8, w8, #1
+add	w9, w9, #1
+
+# CHECK:      Iterations:        1000
+# CHECK-NEXT: Instructions:      2000
+# CHECK-NEXT: Total Cycles:      1003
+# CHECK-NEXT: Total uOps:        2000
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    1.99
+# CHECK-NEXT: IPC:               1.99
+# CHECK-NEXT: Block RThroughput: 1.0
diff --git a/test/tools/llvm-mca/AArch64/Cortex/IPC/A55-10-fma.s b/test/tools/llvm-mca/AArch64/Cortex/IPC/A55-10-fma.s
new file mode 100644
index 00000000000..2f892c54abb
--- /dev/null
+++ b/test/tools/llvm-mca/AArch64/Cortex/IPC/A55-10-fma.s
@@ -0,0 +1,15 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=aarch64 -mcpu=cortex-a55 --all-views=false --summary-view --iterations=1000 < %s | FileCheck %s
+
+fmadd  s3, s5, s6, s7
+fmadd  s8, s9, s10, s11
+
+# CHECK:      Iterations:        1000
+# CHECK-NEXT: Instructions:      2000
+# CHECK-NEXT: Total Cycles:      1004
+# CHECK-NEXT: Total uOps:        2000
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    1.99
+# CHECK-NEXT: IPC:               1.99
+# CHECK-NEXT: Block RThroughput: 1.0
diff --git a/test/tools/llvm-mca/AArch64/Cortex/IPC/A55-11-fma-mix.s b/test/tools/llvm-mca/AArch64/Cortex/IPC/A55-11-fma-mix.s
new file mode 100644
index 00000000000..b9489852a92
--- /dev/null
+++ b/test/tools/llvm-mca/AArch64/Cortex/IPC/A55-11-fma-mix.s
@@ -0,0 +1,19 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=aarch64 -mcpu=cortex-a55 --all-views=false --summary-view --iterations=1000 < %s | FileCheck %s
+
+# FMADD writes and retires out-of-order
+fmadd  s3, s5, s6, s7
+# ADD instructions are issued and retire in-order
+add    w8, w8, #1
+add    w9, w9, #1
+add    w10, w10, #1
+
+# CHECK:      Iterations:        1000
+# CHECK-NEXT: Instructions:      4000
+# CHECK-NEXT: Total Cycles:      2003
+# CHECK-NEXT: Total uOps:        4000
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    2.00
+# CHECK-NEXT: IPC:               2.00
+# CHECK-NEXT: Block RThroughput: 2.0
diff --git a/test/tools/llvm-mca/AArch64/Cortex/IPC/A55-2-skewed-alu.s b/test/tools/llvm-mca/AArch64/Cortex/IPC/A55-2-skewed-alu.s
new file mode 100644
index 00000000000..ba18c8f2bf4
--- /dev/null
+++ b/test/tools/llvm-mca/AArch64/Cortex/IPC/A55-2-skewed-alu.s
@@ -0,0 +1,18 @@
+# RUN: llvm-mca -mtriple=aarch64 -mcpu=cortex-a55 --all-views=false --summary-view --iterations=1000 < %s | FileCheck %s
+# CHECK: IPC:
+# CHECK-SAME: 2.00
+#
+# XFAIL: *
+#
+# Cortex-A55 has a secondary skewed ALU in the Ex1 stage for simple
+# ALU instructions that do not require shifting or saturation
+# resources. Results from the skewed ALU are available 1 cycle earlier.
+#
+# This feature allows the first and the second instruction to be
+# dual-issued despite a register dependency (w8).
+#
+# MCA and LLVM scheduling model do not support this yet.
+
+add	w8, w8, #1
+add	w10, w8, #1
+add	w12, w8, #1
diff --git a/test/tools/llvm-mca/AArch64/Cortex/IPC/A55-3-mul.s b/test/tools/llvm-mca/AArch64/Cortex/IPC/A55-3-mul.s
new file mode 100644
index 00000000000..e029b05617e
--- /dev/null
+++ b/test/tools/llvm-mca/AArch64/Cortex/IPC/A55-3-mul.s
@@ -0,0 +1,16 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=aarch64 -mcpu=cortex-a55 --all-views=false --summary-view --iterations=1000 < %s | FileCheck %s
+
+add	w8, w8, #1
+add	w12, w8, #1
+mul	w10, w10, w10
+
+# CHECK:      Iterations:        1000
+# CHECK-NEXT: Instructions:      3000
+# CHECK-NEXT: Total Cycles:      3003
+# CHECK-NEXT: Total uOps:        3000
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    1.00
+# CHECK-NEXT: IPC:               1.00
+# CHECK-NEXT: Block RThroughput: 1.5
diff --git a/test/tools/llvm-mca/AArch64/Cortex/IPC/A55-4-sdiv.s b/test/tools/llvm-mca/AArch64/Cortex/IPC/A55-4-sdiv.s
new file mode 100644
index 00000000000..b7b7a510b9b
--- /dev/null
+++ b/test/tools/llvm-mca/AArch64/Cortex/IPC/A55-4-sdiv.s
@@ -0,0 +1,21 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=aarch64 -mcpu=cortex-a55 --all-views=false --summary-view --iterations=1000 < %s | FileCheck %s
+
+# DIV is not modeled precisely: on hardware it takes a variable
+# number of cycles depending on its operands, but the LLVM scheduling
+# model only provides an average latency.
+
+add	w8, w8, #1
+movz    w10, #1, lsl #16
+movz    w12, #32768, lsl #16
+sdiv	w10, w12, w10
+
+# CHECK:      Iterations:        1000
+# CHECK-NEXT: Instructions:      4000
+# CHECK-NEXT: Total Cycles:      8004
+# CHECK-NEXT: Total uOps:        4000
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.50
+# CHECK-NEXT: IPC:               0.50
+# CHECK-NEXT: Block RThroughput: 8.0
diff --git a/test/tools/llvm-mca/AArch64/Cortex/IPC/A55-5-mul-sdiv.s b/test/tools/llvm-mca/AArch64/Cortex/IPC/A55-5-mul-sdiv.s
new file mode 100644
index 00000000000..37d1829e2b4
--- /dev/null
+++ b/test/tools/llvm-mca/AArch64/Cortex/IPC/A55-5-mul-sdiv.s
@@ -0,0 +1,22 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=aarch64 -mcpu=cortex-a55 --all-views=false --summary-view --iterations=1000 < %s | FileCheck %s
+
+# DIV is not modeled precisely: on hardware it takes a variable
+# number of cycles depending on its operands. The LLVM scheduling
+# model only provides an average latency.
+
+add	w8, w8, #1
+movz    w10, #1, lsl #16
+movz    w12, #32768, lsl #16
+mul	w11, w8, w8
+sdiv	w10, w12, w10
+
+# CHECK:      Iterations:        1000
+# CHECK-NEXT: Instructions:      5000
+# CHECK-NEXT: Total Cycles:      8004
+# CHECK-NEXT: Total uOps:        5000
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.62
+# CHECK-NEXT: IPC:               0.62
+# CHECK-NEXT: Block RThroughput: 8.0
diff --git a/test/tools/llvm-mca/AArch64/Cortex/IPC/A55-6-mul.s b/test/tools/llvm-mca/AArch64/Cortex/IPC/A55-6-mul.s
new file mode 100644
index 00000000000..95b40264f9e
--- /dev/null
+++ b/test/tools/llvm-mca/AArch64/Cortex/IPC/A55-6-mul.s
@@ -0,0 +1,25 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=aarch64 -mcpu=cortex-a55 --all-views=false --summary-view --iterations=1000 < %s | FileCheck %s
+
+# It appears that ADD and MUL fuse together, if both can be issued in
+# one cycle:
+#
+#     add  w12, w8, #1
+#     mul  w10, w12, w10
+#
+# FIXME: MCA (and LLVM scheduling model) do not support this. The test
+# case uses different registers to break the pattern.
+
+add	w8, w8, #1
+add	w13, w8, #1
+mul	w10, w12, w10
+
+# CHECK:      Iterations:        1000
+# CHECK-NEXT: Instructions:      3000
+# CHECK-NEXT: Total Cycles:      3003
+# CHECK-NEXT: Total uOps:        3000
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    1.00
+# CHECK-NEXT: IPC:               1.00
+# CHECK-NEXT: Block RThroughput: 1.5
diff --git a/test/tools/llvm-mca/AArch64/Cortex/IPC/A55-7-cmp.s b/test/tools/llvm-mca/AArch64/Cortex/IPC/A55-7-cmp.s
new file mode 100644
index 00000000000..024834d97e0
--- /dev/null
+++ b/test/tools/llvm-mca/AArch64/Cortex/IPC/A55-7-cmp.s
@@ -0,0 +1,17 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=aarch64 -mcpu=cortex-a55 --all-views=false --summary-view --iterations=1000 < %s | FileCheck %s
+
+add	w8, w8, #1
+add	w12, w9, #1
+cmp	w9, #42
+mul	w10, w12, w10
+
+# CHECK:      Iterations:        1000
+# CHECK-NEXT: Instructions:      4000
+# CHECK-NEXT: Total Cycles:      3004
+# CHECK-NEXT: Total uOps:        4000
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    1.33
+# CHECK-NEXT: IPC:               1.33
+# CHECK-NEXT: Block RThroughput: 2.0
diff --git a/test/tools/llvm-mca/AArch64/Cortex/IPC/A55-8-ldr.s b/test/tools/llvm-mca/AArch64/Cortex/IPC/A55-8-ldr.s
new file mode 100644
index 00000000000..52bd9b0968a
--- /dev/null
+++ b/test/tools/llvm-mca/AArch64/Cortex/IPC/A55-8-ldr.s
@@ -0,0 +1,19 @@
+# RUN: llvm-mca -mtriple=aarch64 -mcpu=cortex-a55 --all-views=false --summary-view --iterations=1000 < %s | FileCheck %s
+# CHECK: IPC:
+# CHECK-SAME: 1.50
+#
+# XFAIL: *
+#
+# MCA reports IPC = 0.60, while hardware shows IPC = 1.50.
+#
+# 1) The skewed ALU on Cortex-A55 is not modeled: ADD and AND
+#    instructions should be issued in the same cycle.
+#    See the A55-2-skewed-alu.s test for more details.
+#
+# 2) Cortex-A55 manual mentions that there is a forwarding path from
+#    the ALU pipeline to the LD/ST pipeline. This is not implemented in
+#    the LLVM scheduling model.
+
+add	w8, w8, #1
+and	w12, w8, #0x3f
+ldr	w14, [x10, w12, uxtw #2]
diff --git a/test/tools/llvm-mca/AArch64/Cortex/IPC/A55-9-fabs.s b/test/tools/llvm-mca/AArch64/Cortex/IPC/A55-9-fabs.s
new file mode 100644
index 00000000000..b2366012828
--- /dev/null
+++ b/test/tools/llvm-mca/AArch64/Cortex/IPC/A55-9-fabs.s
@@ -0,0 +1,15 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=aarch64 -mcpu=cortex-a55 --all-views=false --summary-view --iterations=1000 < %s | FileCheck %s
+
+fabs s0, s1
+fabs s2, s3
+
+# CHECK:      Iterations:        1000
+# CHECK-NEXT: Instructions:      2000
+# CHECK-NEXT: Total Cycles:      1004
+# CHECK-NEXT: Total uOps:        2000
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    1.99
+# CHECK-NEXT: IPC:               1.99
+# CHECK-NEXT: Block RThroughput: 1.0