From 9c918327f6e32a0dcb9cc7ff76d8359b70fb32f4 Mon Sep 17 00:00:00 2001 From: Andrew Savonichev Date: Fri, 15 Jan 2021 16:08:58 +0300 Subject: [PATCH] [MCA] Add tests for IPC on Cortex-A55 The tests compare IPC statistics that MCA provides with IPC values measured on Cortex-A55 hardware. For hardware tests, each snippet is run in a loop unrolled by 1000, and IPC is measured by linux-perf. Several tests do not match the hardware: the skewed ALU is not supported, and LDR seems to be missing a forwarding path. Differential Revision: https://reviews.llvm.org/D98174 --- .../AArch64/Cortex/IPC/A55-0-single-add.s | 14 +++++++++++ .../AArch64/Cortex/IPC/A55-1-add-seq.s | 15 +++++++++++ .../llvm-mca/AArch64/Cortex/IPC/A55-10-fma.s | 15 +++++++++++ .../AArch64/Cortex/IPC/A55-11-fma-mix.s | 19 ++++++++++++++ .../AArch64/Cortex/IPC/A55-2-skewed-alu.s | 18 +++++++++++++ .../llvm-mca/AArch64/Cortex/IPC/A55-3-mul.s | 16 ++++++++++++ .../llvm-mca/AArch64/Cortex/IPC/A55-4-sdiv.s | 21 ++++++++++++++++ .../AArch64/Cortex/IPC/A55-5-mul-sdiv.s | 22 ++++++++++++++++ .../llvm-mca/AArch64/Cortex/IPC/A55-6-mul.s | 25 +++++++++++++++++++ .../llvm-mca/AArch64/Cortex/IPC/A55-7-cmp.s | 17 +++++++++++++ .../llvm-mca/AArch64/Cortex/IPC/A55-8-ldr.s | 19 ++++++++++++++ .../llvm-mca/AArch64/Cortex/IPC/A55-9-fabs.s | 15 +++++++++++ 12 files changed, 216 insertions(+) create mode 100644 test/tools/llvm-mca/AArch64/Cortex/IPC/A55-0-single-add.s create mode 100644 test/tools/llvm-mca/AArch64/Cortex/IPC/A55-1-add-seq.s create mode 100644 test/tools/llvm-mca/AArch64/Cortex/IPC/A55-10-fma.s create mode 100644 test/tools/llvm-mca/AArch64/Cortex/IPC/A55-11-fma-mix.s create mode 100644 test/tools/llvm-mca/AArch64/Cortex/IPC/A55-2-skewed-alu.s create mode 100644 test/tools/llvm-mca/AArch64/Cortex/IPC/A55-3-mul.s create mode 100644 test/tools/llvm-mca/AArch64/Cortex/IPC/A55-4-sdiv.s create mode 100644 test/tools/llvm-mca/AArch64/Cortex/IPC/A55-5-mul-sdiv.s create mode 100644 
test/tools/llvm-mca/AArch64/Cortex/IPC/A55-6-mul.s create mode 100644 test/tools/llvm-mca/AArch64/Cortex/IPC/A55-7-cmp.s create mode 100644 test/tools/llvm-mca/AArch64/Cortex/IPC/A55-8-ldr.s create mode 100644 test/tools/llvm-mca/AArch64/Cortex/IPC/A55-9-fabs.s diff --git a/test/tools/llvm-mca/AArch64/Cortex/IPC/A55-0-single-add.s b/test/tools/llvm-mca/AArch64/Cortex/IPC/A55-0-single-add.s new file mode 100644 index 00000000000..4bb6d44393f --- /dev/null +++ b/test/tools/llvm-mca/AArch64/Cortex/IPC/A55-0-single-add.s @@ -0,0 +1,14 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -mtriple=aarch64 -mcpu=cortex-a55 --all-views=false --summary-view --iterations=1000 < %s | FileCheck %s + +add w8, w8, #1 + +# CHECK: Iterations: 1000 +# CHECK-NEXT: Instructions: 1000 +# CHECK-NEXT: Total Cycles: 1003 +# CHECK-NEXT: Total uOps: 1000 + +# CHECK: Dispatch Width: 2 +# CHECK-NEXT: uOps Per Cycle: 1.00 +# CHECK-NEXT: IPC: 1.00 +# CHECK-NEXT: Block RThroughput: 0.5 diff --git a/test/tools/llvm-mca/AArch64/Cortex/IPC/A55-1-add-seq.s b/test/tools/llvm-mca/AArch64/Cortex/IPC/A55-1-add-seq.s new file mode 100644 index 00000000000..5ef72ee08e7 --- /dev/null +++ b/test/tools/llvm-mca/AArch64/Cortex/IPC/A55-1-add-seq.s @@ -0,0 +1,15 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -mtriple=aarch64 -mcpu=cortex-a55 --all-views=false --summary-view --iterations=1000 < %s | FileCheck %s + +add w8, w8, #1 +add w9, w9, #1 + +# CHECK: Iterations: 1000 +# CHECK-NEXT: Instructions: 2000 +# CHECK-NEXT: Total Cycles: 1003 +# CHECK-NEXT: Total uOps: 2000 + +# CHECK: Dispatch Width: 2 +# CHECK-NEXT: uOps Per Cycle: 1.99 +# CHECK-NEXT: IPC: 1.99 +# CHECK-NEXT: Block RThroughput: 1.0 diff --git a/test/tools/llvm-mca/AArch64/Cortex/IPC/A55-10-fma.s b/test/tools/llvm-mca/AArch64/Cortex/IPC/A55-10-fma.s new file mode 100644 index 00000000000..2f892c54abb --- /dev/null +++ 
b/test/tools/llvm-mca/AArch64/Cortex/IPC/A55-10-fma.s @@ -0,0 +1,15 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -mtriple=aarch64 -mcpu=cortex-a55 --all-views=false --summary-view --iterations=1000 < %s | FileCheck %s + +fmadd s3, s5, s6, s7 +fmadd s8, s9, s10, s11 + +# CHECK: Iterations: 1000 +# CHECK-NEXT: Instructions: 2000 +# CHECK-NEXT: Total Cycles: 1004 +# CHECK-NEXT: Total uOps: 2000 + +# CHECK: Dispatch Width: 2 +# CHECK-NEXT: uOps Per Cycle: 1.99 +# CHECK-NEXT: IPC: 1.99 +# CHECK-NEXT: Block RThroughput: 1.0 diff --git a/test/tools/llvm-mca/AArch64/Cortex/IPC/A55-11-fma-mix.s b/test/tools/llvm-mca/AArch64/Cortex/IPC/A55-11-fma-mix.s new file mode 100644 index 00000000000..b9489852a92 --- /dev/null +++ b/test/tools/llvm-mca/AArch64/Cortex/IPC/A55-11-fma-mix.s @@ -0,0 +1,19 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -mtriple=aarch64 -mcpu=cortex-a55 --all-views=false --summary-view --iterations=1000 < %s | FileCheck %s + +# FMADD writes and retires out-of-order +fmadd s3, s5, s6, s7 +# ADD instructions are issued and retire in-order +add w8, w8, #1 +add w9, w9, #1 +add w10, w10, #1 + +# CHECK: Iterations: 1000 +# CHECK-NEXT: Instructions: 4000 +# CHECK-NEXT: Total Cycles: 2003 +# CHECK-NEXT: Total uOps: 4000 + +# CHECK: Dispatch Width: 2 +# CHECK-NEXT: uOps Per Cycle: 2.00 +# CHECK-NEXT: IPC: 2.00 +# CHECK-NEXT: Block RThroughput: 2.0 diff --git a/test/tools/llvm-mca/AArch64/Cortex/IPC/A55-2-skewed-alu.s b/test/tools/llvm-mca/AArch64/Cortex/IPC/A55-2-skewed-alu.s new file mode 100644 index 00000000000..ba18c8f2bf4 --- /dev/null +++ b/test/tools/llvm-mca/AArch64/Cortex/IPC/A55-2-skewed-alu.s @@ -0,0 +1,18 @@ +# RUN: llvm-mca -mtriple=aarch64 -mcpu=cortex-a55 --all-views=false --summary-view --iterations=1000 < %s | FileCheck %s +# CHECK: IPC: +# CHECK-SAME: 2.00 +# +# XFAIL: * +# +# Cortex-A55 has a secondary skewed ALU in the Ex1 stage for simple +# 
ALU instructions that do not require shifting or saturation +# resources. Results from the skewed ALU are available 1 cycle earlier. +# +# This feature allows the first and the second instruction to be +# dual-issued despite a register dependency (w8). +# +# MCA and LLVM scheduling model do not support this yet. + +add w8, w8, #1 +add w10, w8, #1 +add w12, w8, #1 diff --git a/test/tools/llvm-mca/AArch64/Cortex/IPC/A55-3-mul.s b/test/tools/llvm-mca/AArch64/Cortex/IPC/A55-3-mul.s new file mode 100644 index 00000000000..e029b05617e --- /dev/null +++ b/test/tools/llvm-mca/AArch64/Cortex/IPC/A55-3-mul.s @@ -0,0 +1,16 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -mtriple=aarch64 -mcpu=cortex-a55 --all-views=false --summary-view --iterations=1000 < %s | FileCheck %s + +add w8, w8, #1 +add w12, w8, #1 +mul w10, w10, w10 + +# CHECK: Iterations: 1000 +# CHECK-NEXT: Instructions: 3000 +# CHECK-NEXT: Total Cycles: 3003 +# CHECK-NEXT: Total uOps: 3000 + +# CHECK: Dispatch Width: 2 +# CHECK-NEXT: uOps Per Cycle: 1.00 +# CHECK-NEXT: IPC: 1.00 +# CHECK-NEXT: Block RThroughput: 1.5 diff --git a/test/tools/llvm-mca/AArch64/Cortex/IPC/A55-4-sdiv.s b/test/tools/llvm-mca/AArch64/Cortex/IPC/A55-4-sdiv.s new file mode 100644 index 00000000000..b7b7a510b9b --- /dev/null +++ b/test/tools/llvm-mca/AArch64/Cortex/IPC/A55-4-sdiv.s @@ -0,0 +1,21 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -mtriple=aarch64 -mcpu=cortex-a55 --all-views=false --summary-view --iterations=1000 < %s | FileCheck %s + +# DIV is not modeled precisely: on hardware it takes a variable +# number of cycles depending on its operands, but LLVM scheduling +# model only provides an average latency. 
+ +add w8, w8, #1 +movz w10, #1, lsl #16 +movz w12, #32768, lsl #16 +sdiv w10, w12, w10 + +# CHECK: Iterations: 1000 +# CHECK-NEXT: Instructions: 4000 +# CHECK-NEXT: Total Cycles: 8004 +# CHECK-NEXT: Total uOps: 4000 + +# CHECK: Dispatch Width: 2 +# CHECK-NEXT: uOps Per Cycle: 0.50 +# CHECK-NEXT: IPC: 0.50 +# CHECK-NEXT: Block RThroughput: 8.0 diff --git a/test/tools/llvm-mca/AArch64/Cortex/IPC/A55-5-mul-sdiv.s b/test/tools/llvm-mca/AArch64/Cortex/IPC/A55-5-mul-sdiv.s new file mode 100644 index 00000000000..37d1829e2b4 --- /dev/null +++ b/test/tools/llvm-mca/AArch64/Cortex/IPC/A55-5-mul-sdiv.s @@ -0,0 +1,22 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -mtriple=aarch64 -mcpu=cortex-a55 --all-views=false --summary-view --iterations=1000 < %s | FileCheck %s + +# DIV is not modeled precisely: on hardware it takes a variable +# number of cycles depending on its operands. LLVM scheduling model +# only provides an average latency. + +add w8, w8, #1 +movz w10, #1, lsl #16 +movz w12, #32768, lsl #16 +mul w11, w8, w8 +sdiv w10, w12, w10 + +# CHECK: Iterations: 1000 +# CHECK-NEXT: Instructions: 5000 +# CHECK-NEXT: Total Cycles: 8004 +# CHECK-NEXT: Total uOps: 5000 + +# CHECK: Dispatch Width: 2 +# CHECK-NEXT: uOps Per Cycle: 0.62 +# CHECK-NEXT: IPC: 0.62 +# CHECK-NEXT: Block RThroughput: 8.0 diff --git a/test/tools/llvm-mca/AArch64/Cortex/IPC/A55-6-mul.s b/test/tools/llvm-mca/AArch64/Cortex/IPC/A55-6-mul.s new file mode 100644 index 00000000000..95b40264f9e --- /dev/null +++ b/test/tools/llvm-mca/AArch64/Cortex/IPC/A55-6-mul.s @@ -0,0 +1,25 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -mtriple=aarch64 -mcpu=cortex-a55 --all-views=false --summary-view --iterations=1000 < %s | FileCheck %s + +# It appears that ADD and MUL fuse together, if both can be issued in +# one cycle: +# +# add w12, w8, #1 +# mul w10, w12, w10 +# +# FIXME: MCA (and LLVM scheduling model) do not 
support this. The test +# case uses different registers to break the pattern. + +add w8, w8, #1 +add w13, w8, #1 +mul w10, w12, w10 + +# CHECK: Iterations: 1000 +# CHECK-NEXT: Instructions: 3000 +# CHECK-NEXT: Total Cycles: 3003 +# CHECK-NEXT: Total uOps: 3000 + +# CHECK: Dispatch Width: 2 +# CHECK-NEXT: uOps Per Cycle: 1.00 +# CHECK-NEXT: IPC: 1.00 +# CHECK-NEXT: Block RThroughput: 1.5 diff --git a/test/tools/llvm-mca/AArch64/Cortex/IPC/A55-7-cmp.s b/test/tools/llvm-mca/AArch64/Cortex/IPC/A55-7-cmp.s new file mode 100644 index 00000000000..024834d97e0 --- /dev/null +++ b/test/tools/llvm-mca/AArch64/Cortex/IPC/A55-7-cmp.s @@ -0,0 +1,17 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -mtriple=aarch64 -mcpu=cortex-a55 --all-views=false --summary-view --iterations=1000 < %s | FileCheck %s + +add w8, w8, #1 +add w12, w9, #1 +cmp w9, #42 +mul w10, w12, w10 + +# CHECK: Iterations: 1000 +# CHECK-NEXT: Instructions: 4000 +# CHECK-NEXT: Total Cycles: 3004 +# CHECK-NEXT: Total uOps: 4000 + +# CHECK: Dispatch Width: 2 +# CHECK-NEXT: uOps Per Cycle: 1.33 +# CHECK-NEXT: IPC: 1.33 +# CHECK-NEXT: Block RThroughput: 2.0 diff --git a/test/tools/llvm-mca/AArch64/Cortex/IPC/A55-8-ldr.s b/test/tools/llvm-mca/AArch64/Cortex/IPC/A55-8-ldr.s new file mode 100644 index 00000000000..52bd9b0968a --- /dev/null +++ b/test/tools/llvm-mca/AArch64/Cortex/IPC/A55-8-ldr.s @@ -0,0 +1,19 @@ +# RUN: llvm-mca -mtriple=aarch64 -mcpu=cortex-a55 --all-views=false --summary-view --iterations=1000 < %s | FileCheck %s +# CHECK: IPC: +# CHECK-SAME: 1.50 +# +# XFAIL: * +# +# MCA reports IPC = 0.60, while hardware shows IPC = 1.50. +# +# 1) The skewed ALU on Cortex-A55 is not modeled: ADD and AND +# instructions should be issued in the same cycle. +# See the A55-2-skewed-alu.s test for more details. +# +# 2) Cortex-A55 manual mentions that there is a forwarding path from +# the ALU pipeline to the LD/ST pipeline. This is not implemented in +# the LLVM scheduling model. 
+ +add w8, w8, #1 +and w12, w8, #0x3f +ldr w14, [x10, w12, uxtw #2] diff --git a/test/tools/llvm-mca/AArch64/Cortex/IPC/A55-9-fabs.s b/test/tools/llvm-mca/AArch64/Cortex/IPC/A55-9-fabs.s new file mode 100644 index 00000000000..b2366012828 --- /dev/null +++ b/test/tools/llvm-mca/AArch64/Cortex/IPC/A55-9-fabs.s @@ -0,0 +1,15 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -mtriple=aarch64 -mcpu=cortex-a55 --all-views=false --summary-view --iterations=1000 < %s | FileCheck %s + +fabs s0, s1 +fabs s2, s3 + +# CHECK: Iterations: 1000 +# CHECK-NEXT: Instructions: 2000 +# CHECK-NEXT: Total Cycles: 1004 +# CHECK-NEXT: Total uOps: 2000 + +# CHECK: Dispatch Width: 2 +# CHECK-NEXT: uOps Per Cycle: 1.99 +# CHECK-NEXT: IPC: 1.99 +# CHECK-NEXT: Block RThroughput: 1.0