From db1708480af4cd00a7e80176a26bab36c9a7f330 Mon Sep 17 00:00:00 2001 From: Evandro Menezes Date: Mon, 23 Sep 2019 12:59:25 -0500 Subject: [PATCH] [AArch64] Add the pipeline model for Exynos M5 Add the scheduling and cost models for Exynos M5. --- lib/Target/AArch64/AArch64.td | 3 +- lib/Target/AArch64/AArch64SchedExynosM5.td | 1012 +++++++++++++++++ test/tools/llvm-mca/AArch64/Exynos/aes.s | 57 + .../tools/llvm-mca/AArch64/Exynos/asimd-ld1.s | 189 +++ .../tools/llvm-mca/AArch64/Exynos/asimd-ld2.s | 118 ++ .../tools/llvm-mca/AArch64/Exynos/asimd-ld3.s | 118 ++ .../tools/llvm-mca/AArch64/Exynos/asimd-ld4.s | 118 ++ .../tools/llvm-mca/AArch64/Exynos/asimd-st1.s | 169 +++ .../tools/llvm-mca/AArch64/Exynos/asimd-st2.s | 97 ++ .../tools/llvm-mca/AArch64/Exynos/asimd-st3.s | 97 ++ .../tools/llvm-mca/AArch64/Exynos/asimd-st4.s | 97 ++ test/tools/llvm-mca/AArch64/Exynos/crc.s | 58 + .../llvm-mca/AArch64/Exynos/direct-branch.s | 8 + .../llvm-mca/AArch64/Exynos/divide-multiply.s | 67 ++ .../llvm-mca/AArch64/Exynos/double-recp.s | 66 ++ .../llvm-mca/AArch64/Exynos/double-rsqrt.s | 78 ++ .../llvm-mca/AArch64/Exynos/double-sqrt.s | 79 ++ .../AArch64/Exynos/extended-register.s | 16 + .../AArch64/Exynos/float-divide-multiply.s | 94 ++ .../llvm-mca/AArch64/Exynos/float-integer.s | 114 ++ .../llvm-mca/AArch64/Exynos/float-load.s | 153 +++ .../llvm-mca/AArch64/Exynos/float-recp.s | 62 + .../llvm-mca/AArch64/Exynos/float-rsqrt.s | 72 ++ .../llvm-mca/AArch64/Exynos/float-sqrt.s | 73 ++ .../llvm-mca/AArch64/Exynos/float-store.s | 142 +++ test/tools/llvm-mca/AArch64/Exynos/load.s | 66 ++ .../AArch64/Exynos/scheduler-queue-usage.s | 6 + .../AArch64/Exynos/shifted-register.s | 22 +- test/tools/llvm-mca/AArch64/Exynos/store.s | 82 ++ .../AArch64/Exynos/zero-latency-move.s | 51 +- 30 files changed, 3349 insertions(+), 35 deletions(-) create mode 100644 lib/Target/AArch64/AArch64SchedExynosM5.td create mode 100644 test/tools/llvm-mca/AArch64/Exynos/aes.s create mode 100644 
test/tools/llvm-mca/AArch64/Exynos/asimd-ld1.s create mode 100644 test/tools/llvm-mca/AArch64/Exynos/asimd-ld2.s create mode 100644 test/tools/llvm-mca/AArch64/Exynos/asimd-ld3.s create mode 100644 test/tools/llvm-mca/AArch64/Exynos/asimd-ld4.s create mode 100644 test/tools/llvm-mca/AArch64/Exynos/asimd-st1.s create mode 100644 test/tools/llvm-mca/AArch64/Exynos/asimd-st2.s create mode 100644 test/tools/llvm-mca/AArch64/Exynos/asimd-st3.s create mode 100644 test/tools/llvm-mca/AArch64/Exynos/asimd-st4.s create mode 100644 test/tools/llvm-mca/AArch64/Exynos/crc.s create mode 100644 test/tools/llvm-mca/AArch64/Exynos/divide-multiply.s create mode 100644 test/tools/llvm-mca/AArch64/Exynos/double-recp.s create mode 100644 test/tools/llvm-mca/AArch64/Exynos/double-rsqrt.s create mode 100644 test/tools/llvm-mca/AArch64/Exynos/double-sqrt.s create mode 100644 test/tools/llvm-mca/AArch64/Exynos/float-divide-multiply.s create mode 100644 test/tools/llvm-mca/AArch64/Exynos/float-integer.s create mode 100644 test/tools/llvm-mca/AArch64/Exynos/float-load.s create mode 100644 test/tools/llvm-mca/AArch64/Exynos/float-recp.s create mode 100644 test/tools/llvm-mca/AArch64/Exynos/float-rsqrt.s create mode 100644 test/tools/llvm-mca/AArch64/Exynos/float-sqrt.s create mode 100644 test/tools/llvm-mca/AArch64/Exynos/float-store.s create mode 100644 test/tools/llvm-mca/AArch64/Exynos/load.s create mode 100644 test/tools/llvm-mca/AArch64/Exynos/store.s diff --git a/lib/Target/AArch64/AArch64.td b/lib/Target/AArch64/AArch64.td index 8a397d568cd..a2f824960c1 100644 --- a/lib/Target/AArch64/AArch64.td +++ b/lib/Target/AArch64/AArch64.td @@ -450,6 +450,7 @@ include "AArch64SchedFalkor.td" include "AArch64SchedKryo.td" include "AArch64SchedExynosM3.td" include "AArch64SchedExynosM4.td" +include "AArch64SchedExynosM5.td" include "AArch64SchedThunderX.td" include "AArch64SchedThunderX2T99.td" @@ -790,7 +791,7 @@ def : ProcessorModel<"neoverse-n1", CortexA57Model, [ProcNeoverseN1]>; def : 
ProcessorModel<"cyclone", CycloneModel, [ProcCyclone]>; def : ProcessorModel<"exynos-m3", ExynosM3Model, [ProcExynosM3]>; def : ProcessorModel<"exynos-m4", ExynosM4Model, [ProcExynosM4]>; -def : ProcessorModel<"exynos-m5", ExynosM4Model, [ProcExynosM4]>; +def : ProcessorModel<"exynos-m5", ExynosM5Model, [ProcExynosM4]>; def : ProcessorModel<"falkor", FalkorModel, [ProcFalkor]>; def : ProcessorModel<"saphira", FalkorModel, [ProcSaphira]>; def : ProcessorModel<"kryo", KryoModel, [ProcKryo]>; diff --git a/lib/Target/AArch64/AArch64SchedExynosM5.td b/lib/Target/AArch64/AArch64SchedExynosM5.td new file mode 100644 index 00000000000..df7402591e7 --- /dev/null +++ b/lib/Target/AArch64/AArch64SchedExynosM5.td @@ -0,0 +1,1012 @@ +//=- AArch64SchedExynosM5.td - Samsung Exynos M5 Sched Defs --*- tablegen -*-=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines the machine model for the Samsung Exynos M5 to support +// instruction scheduling and other instruction cost heuristics. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// The Exynos-M5 is an advanced superscalar microprocessor with a 6-wide +// in-order stage for decode and dispatch and a wider issue stage. +// The execution units and loads and stores are out-of-order. + +def ExynosM5Model : SchedMachineModel { + let IssueWidth = 6; // Up to 6 uops per cycle. + let MicroOpBufferSize = 228; // ROB size. + let LoopMicroOpBufferSize = 60; // Based on the instruction queue size. + let LoadLatency = 4; // Optimistic load cases. + let MispredictPenalty = 15; // Minimum branch misprediction penalty. 
+ let CompleteModel = 1; // Use the default model otherwise. + + list UnsupportedFeatures = SVEUnsupported.F; +} + +//===----------------------------------------------------------------------===// +// Define each kind of processor resource and number available on the Exynos-M5. + +let SchedModel = ExynosM5Model in { + +def M5UnitA : ProcResource<2>; // Simple integer +def M5UnitC : ProcResource<2>; // Simple and complex integer +let Super = M5UnitC, BufferSize = 1 in +def M5UnitD : ProcResource<1>; // Integer division (inside C0, serialized) +def M5UnitE : ProcResource<2>; // Simple 32-bit integer +let Super = M5UnitC in +def M5UnitF : ProcResource<2>; // CRC (inside C) +def M5UnitB : ProcResource<1>; // Branch +def M5UnitL0 : ProcResource<1>; // Load +def M5UnitS0 : ProcResource<1>; // Store +def M5PipeLS : ProcResource<1>; // Load/Store +let Super = M5PipeLS in { + def M5UnitL1 : ProcResource<1>; + def M5UnitS1 : ProcResource<1>; +} +def M5PipeF0 : ProcResource<1>; // FP #0 +let Super = M5PipeF0 in { + def M5UnitFMAC0 : ProcResource<1>; // FP multiplication + def M5UnitFADD0 : ProcResource<1>; // Simple FP + def M5UnitNALU0 : ProcResource<1>; // Simple vector + def M5UnitNDOT0 : ProcResource<1>; // Dot product vector + def M5UnitNHAD : ProcResource<1>; // Horizontal vector + def M5UnitNMSC : ProcResource<1>; // FP and vector miscellanea + def M5UnitNMUL0 : ProcResource<1>; // Vector multiplication + def M5UnitNSHT0 : ProcResource<1>; // Vector shifting + def M5UnitNSHF0 : ProcResource<1>; // Vector shuffling + def M5UnitNCRY0 : ProcResource<1>; // Cryptographic +} +def M5PipeF1 : ProcResource<1>; // FP #1 +let Super = M5PipeF1 in { + def M5UnitFMAC1 : ProcResource<1>; // FP multiplication + def M5UnitFADD1 : ProcResource<1>; // Simple FP + def M5UnitFCVT0 : ProcResource<1>; // FP conversion + def M5UnitFDIV0 : ProcResource<2>; // FP division (serialized) + def M5UnitFSQR0 : ProcResource<2>; // FP square root (serialized) + def M5UnitFST0 : ProcResource<1>; // FP 
store + def M5UnitNALU1 : ProcResource<1>; // Simple vector + def M5UnitNDOT1 : ProcResource<1>; // Dot product vector + def M5UnitNSHT1 : ProcResource<1>; // Vector shifting + def M5UnitNSHF1 : ProcResource<1>; // Vector shuffling +} +def M5PipeF2 : ProcResource<1>; // FP #2 +let Super = M5PipeF2 in { + def M5UnitFMAC2 : ProcResource<1>; // FP multiplication + def M5UnitFADD2 : ProcResource<1>; // Simple FP + def M5UnitFCVT1 : ProcResource<1>; // FP conversion + def M5UnitFDIV1 : ProcResource<2>; // FP division (serialized) + def M5UnitFSQR1 : ProcResource<2>; // FP square root (serialized) + def M5UnitFST1 : ProcResource<1>; // FP store + def M5UnitNALU2 : ProcResource<1>; // Simple vector + def M5UnitNDOT2 : ProcResource<1>; // Dot product vector + def M5UnitNMUL1 : ProcResource<1>; // Vector multiplication + def M5UnitNSHT2 : ProcResource<1>; // Vector shifting + def M5UnitNCRY1 : ProcResource<1>; // Cryptographic +} + +def M5UnitAX : ProcResGroup<[M5UnitA, + M5UnitC]>; +def M5UnitAW : ProcResGroup<[M5UnitA, + M5UnitC, + M5UnitE]>; +def M5UnitL : ProcResGroup<[M5UnitL0, + M5UnitL1]>; +def M5UnitS : ProcResGroup<[M5UnitS0, + M5UnitS1]>; +def M5UnitFMAC : ProcResGroup<[M5UnitFMAC0, + M5UnitFMAC1, + M5UnitFMAC2]>; +def M5UnitFADD : ProcResGroup<[M5UnitFADD0, + M5UnitFADD1, + M5UnitFADD2]>; +def M5UnitFCVT : ProcResGroup<[M5UnitFCVT0, + M5UnitFCVT1]>; +def M5UnitFDIV : ProcResGroup<[M5UnitFDIV0, + M5UnitFDIV1]>; +def M5UnitFSQR : ProcResGroup<[M5UnitFSQR0, + M5UnitFSQR1]>; +def M5UnitFST : ProcResGroup<[M5UnitFST0, + M5UnitFST1]>; +def M5UnitNALU : ProcResGroup<[M5UnitNALU0, + M5UnitNALU1, + M5UnitNALU2]>; +def M5UnitNDOT : ProcResGroup<[M5UnitNDOT0, + M5UnitNDOT1, + M5UnitNDOT2]>; +def M5UnitNMUL : ProcResGroup<[M5UnitNMUL0, + M5UnitNMUL1]>; +def M5UnitNSHT : ProcResGroup<[M5UnitNSHT0, + M5UnitNSHT1, + M5UnitNSHT2]>; +def M5UnitNSHF : ProcResGroup<[M5UnitNSHF0, + M5UnitNSHF1]>; +def M5UnitNCRY : ProcResGroup<[M5UnitNCRY0, + M5UnitNCRY1]>; + 
+//===----------------------------------------------------------------------===// +// Resources details. + +def M5WriteZ0 : SchedWriteRes<[]> { let Latency = 0; } +def M5WriteZ1 : SchedWriteRes<[]> { let Latency = 1; + let NumMicroOps = 0; } +def M5WriteZ4 : SchedWriteRes<[]> { let Latency = 4; + let NumMicroOps = 0; } + +def M5WriteA1W : SchedWriteRes<[M5UnitAW]> { let Latency = 1; } +def M5WriteA1X : SchedWriteRes<[M5UnitAX]> { let Latency = 1; } +def M5WriteAAW : SchedWriteRes<[M5UnitAW]> { let Latency = 2; + let ResourceCycles = [2]; } +def M5WriteAAX : SchedWriteRes<[M5UnitAX]> { let Latency = 2; + let ResourceCycles = [2]; } +def M5WriteAB : SchedWriteRes<[M5UnitAX, + M5UnitC, + M5UnitE]> { let Latency = 2; + let NumMicroOps = 2; } +def M5WriteAC : SchedWriteRes<[M5UnitAX, + M5UnitAX, + M5UnitC]> { let Latency = 3; + let NumMicroOps = 3; } +def M5WriteAD : SchedWriteRes<[M5UnitAW, + M5UnitC]> { let Latency = 2; + let NumMicroOps = 2; } +def M5WriteAFW : SchedWriteRes<[M5UnitAW]> { let Latency = 2; + let NumMicroOps = 2; } +def M5WriteAFX : SchedWriteRes<[M5UnitAX]> { let Latency = 2; + let NumMicroOps = 2; } +def M5WriteAUW : SchedWriteVariant<[SchedVar, + SchedVar, + SchedVar, + SchedVar]>; +def M5WriteAUX : SchedWriteVariant<[SchedVar, + SchedVar, + SchedVar, + SchedVar]>; +def M5WriteAVW : SchedWriteVariant<[SchedVar, + SchedVar, + SchedVar, + SchedVar]>; +def M5WriteAVX : SchedWriteVariant<[SchedVar, + SchedVar, + SchedVar, + SchedVar]>; +def M5WriteAXW : SchedWriteVariant<[SchedVar, + SchedVar, + SchedVar]>; +def M5WriteAXX : SchedWriteVariant<[SchedVar, + SchedVar, + SchedVar]>; +def M5WriteAYW : SchedWriteVariant<[SchedVar, + SchedVar]>; +def M5WriteAYX : SchedWriteVariant<[SchedVar, + SchedVar]>; + +def M5WriteB1 : SchedWriteRes<[M5UnitB]> { let Latency = 1; } +def M5WriteBX : SchedWriteVariant<[SchedVar, + SchedVar]>; + +def M5WriteC1 : SchedWriteRes<[M5UnitC]> { let Latency = 1; } +def M5WriteC2 : SchedWriteRes<[M5UnitC]> { let Latency = 2; } +def 
M5WriteCA : SchedWriteRes<[M5UnitC]> { let Latency = 3; + let ResourceCycles = [2]; } + +def M5WriteD10 : SchedWriteRes<[M5UnitD]> { let Latency = 10; + let ResourceCycles = [10]; } +def M5WriteD16 : SchedWriteRes<[M5UnitD]> { let Latency = 16; + let ResourceCycles = [16]; } + +def M5WriteF2 : SchedWriteRes<[M5UnitF]> { let Latency = 2; } + +def M5WriteL4 : SchedWriteRes<[M5UnitL]> { let Latency = 4; } +def M5WriteL5 : SchedWriteRes<[M5UnitL]> { let Latency = 5; } +def M5WriteL6 : SchedWriteRes<[M5UnitL]> { let Latency = 6; } +def M5WriteLA : SchedWriteRes<[M5UnitL, + M5UnitL]> { let Latency = 6; + let NumMicroOps = 1; } +def M5WriteLB : SchedWriteRes<[M5UnitAX, + M5UnitL]> { let Latency = 6; + let NumMicroOps = 2; } +def M5WriteLC : SchedWriteRes<[M5UnitAX, + M5UnitL, + M5UnitL]> { let Latency = 6; + let NumMicroOps = 2; } +def M5WriteLD : SchedWriteRes<[M5UnitAX, + M5UnitL]> { let Latency = 4; + let NumMicroOps = 2; } +def M5WriteLE : SchedWriteRes<[M5UnitAX, + M5UnitL]> { let Latency = 7; + let NumMicroOps = 2; } +def M5WriteLFW : SchedWriteRes<[M5UnitAW, + M5UnitAW, + M5UnitAW, + M5UnitAW, + M5UnitL]> { let Latency = 15; + let NumMicroOps = 6; + let ResourceCycles = [1, 1, 1, 1, 15]; } +def M5WriteLFX : SchedWriteRes<[M5UnitAX, + M5UnitAX, + M5UnitAX, + M5UnitAX, + M5UnitL]> { let Latency = 15; + let NumMicroOps = 6; + let ResourceCycles = [1, 1, 1, 1, 15]; } +def M5WriteLGW : SchedWriteRes<[M5UnitAW, + M5UnitL]> { let Latency = 13; + let NumMicroOps = 1; + let ResourceCycles = [1, 13]; } +def M5WriteLGX : SchedWriteRes<[M5UnitAX, + M5UnitL]> { let Latency = 13; + let NumMicroOps = 1; + let ResourceCycles = [1, 13]; } +def M5WriteLH : SchedWriteRes<[]> { let Latency = 6; + let NumMicroOps = 0; } +def M5WriteLX : SchedWriteVariant<[SchedVar, + SchedVar]>; +def M5WriteLY : SchedWriteVariant<[SchedVar, + SchedVar]>; + +def M5WriteS1 : SchedWriteRes<[M5UnitS]> { let Latency = 1; } +def M5WriteSA : SchedWriteRes<[M5UnitS0]> { let Latency = 4; } +def M5WriteSB : 
SchedWriteRes<[M5UnitAX, + M5UnitS]> { let Latency = 2; + let NumMicroOps = 1; } +def M5WriteSX : SchedWriteVariant<[SchedVar, + SchedVar]>; + +def M5ReadAdrBase : SchedReadVariant<[SchedVar< + MCSchedPredicate< + CheckAny< + [ScaledIdxFn, + ExynosScaledIdxFn]>>, [ReadDefault]>, + SchedVar]>; + +def M5WriteNEONB : SchedWriteRes<[M5UnitNALU, + M5UnitS0]> { let Latency = 5; + let NumMicroOps = 2; } +def M5WriteNEONH : SchedWriteRes<[M5UnitNALU, + M5UnitS0]> { let Latency = 2; + let NumMicroOps = 2; } +def M5WriteNEONI : SchedWriteRes<[M5UnitS0, + M5UnitNSHF]> { let Latency = 6; + let NumMicroOps = 2; } +def M5WriteNEONK : SchedWriteRes<[M5UnitNSHF, + M5UnitFCVT0, + M5UnitS0]> { let Latency = 5; + let NumMicroOps = 2; } +def M5WriteNEONN : SchedWriteRes<[M5UnitNMSC, + M5UnitNMSC]> { let Latency = 5; + let NumMicroOps = 2; + let ResourceCycles = [7, 7]; } +def M5WriteNEONO : SchedWriteRes<[M5UnitNMSC, + M5UnitNMSC, + M5UnitNMSC]> { let Latency = 8; + let NumMicroOps = 3; + let ResourceCycles = [10, 10, 10]; } +def M5WriteNEONP : SchedWriteRes<[M5UnitNSHF, + M5UnitS0, + M5UnitFCVT]> { let Latency = 7; + let NumMicroOps = 2; } +def M5WriteNEONQ : SchedWriteRes<[M5UnitNMSC, + M5UnitC]> { let Latency = 3; + let NumMicroOps = 1; } +def M5WriteNEONU : SchedWriteRes<[M5UnitFSQR, + M5UnitFSQR]> { let Latency = 7; + let ResourceCycles = [4, 4]; } +def M5WriteNEONV : SchedWriteRes<[M5UnitFDIV, + M5UnitFDIV]> { let Latency = 7; + let ResourceCycles = [6, 6]; } +def M5WriteNEONW : SchedWriteRes<[M5UnitFDIV, + M5UnitFDIV]> { let Latency = 12; + let ResourceCycles = [9, 9]; } +def M5WriteNEONX : SchedWriteRes<[M5UnitFSQR, + M5UnitFSQR]> { let Latency = 8; + let ResourceCycles = [5, 5]; } +def M5WriteNEONY : SchedWriteRes<[M5UnitFSQR, + M5UnitFSQR]> { let Latency = 12; + let ResourceCycles = [9, 9]; } +def M5WriteNEONZ : SchedWriteVariant<[SchedVar, + SchedVar]>; + +def M5WriteFADD2 : SchedWriteRes<[M5UnitFADD]> { let Latency = 2; } + +def M5WriteFCVT2 : SchedWriteRes<[M5UnitFCVT]> { 
let Latency = 2; } +def M5WriteFCVT2A : SchedWriteRes<[M5UnitFCVT0]> { let Latency = 2; } +def M5WriteFCVT3 : SchedWriteRes<[M5UnitFCVT]> { let Latency = 3; } +def M5WriteFCVT3A : SchedWriteRes<[M5UnitFCVT0]> { let Latency = 3; } +def M5WriteFCVTA : SchedWriteRes<[M5UnitFCVT0, + M5UnitS0]> { let Latency = 3; + let NumMicroOps = 1; } +def M5WriteFCVTB : SchedWriteRes<[M5UnitFCVT, + M5UnitS0]> { let Latency = 4; + let NumMicroOps = 1; } +def M5WriteFCVTC : SchedWriteRes<[M5UnitFCVT, + M5UnitS0]> { let Latency = 6; + let NumMicroOps = 1; } + +def M5WriteFDIV5 : SchedWriteRes<[M5UnitFDIV]> { let Latency = 5; + let ResourceCycles = [2]; } +def M5WriteFDIV7 : SchedWriteRes<[M5UnitFDIV]> { let Latency = 7; + let ResourceCycles = [4]; } +def M5WriteFDIV12 : SchedWriteRes<[M5UnitFDIV]> { let Latency = 12; + let ResourceCycles = [9]; } + +def M5WriteFMAC3 : SchedWriteRes<[M5UnitFMAC]> { let Latency = 3; } +def M5WriteFMAC4 : SchedWriteRes<[M5UnitFMAC]> { let Latency = 4; } +def M5WriteFMAC5 : SchedWriteRes<[M5UnitFMAC]> { let Latency = 5; } + +def M5WriteFSQR5 : SchedWriteRes<[M5UnitFSQR]> { let Latency = 5; + let ResourceCycles = [2]; } +def M5WriteFSQR7 : SchedWriteRes<[M5UnitFSQR]> { let Latency = 7; + let ResourceCycles = [4]; } +def M5WriteFSQR8 : SchedWriteRes<[M5UnitFSQR]> { let Latency = 8; + let ResourceCycles = [5]; } +def M5WriteFSQR12 : SchedWriteRes<[M5UnitFSQR]> { let Latency = 12; + let ResourceCycles = [9]; } + +def M5WriteNALU1 : SchedWriteRes<[M5UnitNALU]> { let Latency = 1; } +def M5WriteNALU2 : SchedWriteRes<[M5UnitNALU]> { let Latency = 2; } + +def M5WriteNDOT2 : SchedWriteRes<[M5UnitNDOT]> { let Latency = 2; } + +def M5WriteNCRY2 : SchedWriteRes<[M5UnitNCRY]> { let Latency = 2; } +def M5WriteNCRY1A : SchedWriteRes<[M5UnitNCRY0]> { let Latency = 1; } +def M5WriteNCRY2A : SchedWriteRes<[M5UnitNCRY0]> { let Latency = 2; } +def M5WriteNCRY3A : SchedWriteRes<[M5UnitNCRY0]> { let Latency = 3; } +def M5WriteNCRY5A : SchedWriteRes<[M5UnitNCRY]> { let Latency = 
5; } + +def M5WriteNHAD1 : SchedWriteRes<[M5UnitNHAD]> { let Latency = 1; } +def M5WriteNHAD3 : SchedWriteRes<[M5UnitNHAD]> { let Latency = 3; } + +def M5WriteNMSC1 : SchedWriteRes<[M5UnitNMSC]> { let Latency = 1; } +def M5WriteNMSC2 : SchedWriteRes<[M5UnitNMSC]> { let Latency = 2; } + +def M5WriteNMUL3 : SchedWriteRes<[M5UnitNMUL]> { let Latency = 3; } + +def M5WriteNSHF1 : SchedWriteRes<[M5UnitNSHF]> { let Latency = 1; } +def M5WriteNSHF2 : SchedWriteRes<[M5UnitNSHF]> { let Latency = 2; } +def M5WriteNSHFA : SchedWriteRes<[M5UnitNSHF]> { let Latency = 2; } +def M5WriteNSHFB : SchedWriteRes<[M5UnitNSHF]> { let Latency = 4; + let NumMicroOps = 2; } +def M5WriteNSHFC : SchedWriteRes<[M5UnitNSHF]> { let Latency = 6; + let NumMicroOps = 3; } +def M5WriteNSHFD : SchedWriteRes<[M5UnitNSHF]> { let Latency = 8; + let NumMicroOps = 4; } + +def M5WriteNSHT2 : SchedWriteRes<[M5UnitNSHT]> { let Latency = 2; } +def M5WriteNSHT4A : SchedWriteRes<[M5UnitNSHT1]> { let Latency = 4; } + +def M5WriteVLDA : SchedWriteRes<[M5UnitL, + M5UnitL]> { let Latency = 6; + let NumMicroOps = 2; } +def M5WriteVLDB : SchedWriteRes<[M5UnitL, + M5UnitL, + M5UnitL]> { let Latency = 7; + let NumMicroOps = 3; } +def M5WriteVLDC : SchedWriteRes<[M5UnitL, + M5UnitL, + M5UnitL, + M5UnitL]> { let Latency = 7; + let NumMicroOps = 4; } +def M5WriteVLDD : SchedWriteRes<[M5UnitL, + M5UnitNSHF]> { let Latency = 7; + let NumMicroOps = 2; + let ResourceCycles = [2, 1]; } +def M5WriteVLDF : SchedWriteRes<[M5UnitL, + M5UnitL]> { let Latency = 11; + let NumMicroOps = 2; + let ResourceCycles = [6, 5]; } +def M5WriteVLDG : SchedWriteRes<[M5UnitL, + M5UnitNSHF, + M5UnitNSHF]> { let Latency = 7; + let NumMicroOps = 3; + let ResourceCycles = [2, 1, 1]; } +def M5WriteVLDI : SchedWriteRes<[M5UnitL, + M5UnitL, + M5UnitL]> { let Latency = 13; + let NumMicroOps = 3; } +def M5WriteVLDJ : SchedWriteRes<[M5UnitL, + M5UnitNSHF, + M5UnitNSHF, + M5UnitNSHF]> { let Latency = 8; + let NumMicroOps = 4; } +def M5WriteVLDK : 
SchedWriteRes<[M5UnitL, + M5UnitNSHF, + M5UnitNSHF, + M5UnitNSHF, + M5UnitNSHF]> { let Latency = 8; + let NumMicroOps = 5; } +def M5WriteVLDL : SchedWriteRes<[M5UnitL, + M5UnitNSHF, + M5UnitNSHF, + M5UnitL, + M5UnitNSHF]> { let Latency = 8; + let NumMicroOps = 5; } +def M5WriteVLDM : SchedWriteRes<[M5UnitL, + M5UnitNSHF, + M5UnitNSHF, + M5UnitL, + M5UnitNSHF, + M5UnitNSHF]> { let Latency = 8; + let NumMicroOps = 6; } +def M5WriteVLDN : SchedWriteRes<[M5UnitL, + M5UnitL, + M5UnitL, + M5UnitL]> { let Latency = 15; + let NumMicroOps = 4; + let ResourceCycles = [2, 2, 2, 2]; } + +def M5WriteVST1 : SchedWriteRes<[M5UnitS, + M5UnitFST]> { let Latency = 1; + let NumMicroOps = 1; } +def M5WriteVSTA : SchedWriteRes<[M5UnitS, + M5UnitFST, + M5UnitS, + M5UnitFST]> { let Latency = 2; + let NumMicroOps = 2; } +def M5WriteVSTB : SchedWriteRes<[M5UnitS, + M5UnitFST, + M5UnitS, + M5UnitFST, + M5UnitS, + M5UnitFST]> { let Latency = 3; + let NumMicroOps = 3; } +def M5WriteVSTC : SchedWriteRes<[M5UnitS, + M5UnitFST, + M5UnitS, + M5UnitFST, + M5UnitS, + M5UnitFST, + M5UnitS, + M5UnitFST]> { let Latency = 4; + let NumMicroOps = 4; } +def M5WriteVSTD : SchedWriteRes<[M5UnitS, + M5UnitFST]> { let Latency = 2; } +def M5WriteVSTE : SchedWriteRes<[M5UnitS, + M5UnitFST, + M5UnitS, + M5UnitFST]> { let Latency = 2; + let NumMicroOps = 1; } +def M5WriteVSTF : SchedWriteRes<[M5UnitNSHF, + M5UnitNSHF, + M5UnitS, + M5UnitFST]> { let Latency = 4; + let NumMicroOps = 3; } +def M5WriteVSTG : SchedWriteRes<[M5UnitNSHF, + M5UnitNSHF, + M5UnitNSHF, + M5UnitS, + M5UnitFST, + M5UnitS, + M5UnitFST]> { let Latency = 4; + let NumMicroOps = 5; } +def M5WriteVSTH : SchedWriteRes<[M5UnitS0, + M5UnitFST]> { let Latency = 1; + let NumMicroOps = 1; } +def M5WriteVSTI : SchedWriteRes<[M5UnitNSHF, + M5UnitNSHF, + M5UnitNSHF, + M5UnitNSHF, + M5UnitS, + M5UnitFST, + M5UnitS, + M5UnitFST, + M5UnitS, + M5UnitFST, + M5UnitS, + M5UnitFST]> { let Latency = 8; + let NumMicroOps = 5; + let ResourceCycles = [1, 1, 1, 1, 2, 1, 
2, 1, 2, 1, 2, 1]; } +def M5WriteVSTJ : SchedWriteRes<[M5UnitA, + M5UnitS0, + M5UnitFST]> { let Latency = 1; + let NumMicroOps = 1; } +def M5WriteVSTK : SchedWriteRes<[M5UnitAX, + M5UnitS, + M5UnitFST]> { let Latency = 3; + let NumMicroOps = 2; } +def M5WriteVSTL : SchedWriteRes<[M5UnitNSHF, + M5UnitNSHF, + M5UnitS, + M5UnitFST, + M5UnitS, + M5UnitFST]> { let Latency = 4; + let NumMicroOps = 4; + let ResourceCycles = [1, 1, 2, 1, 2, 1]; } +def M5WriteVSTY : SchedWriteVariant<[SchedVar, + SchedVar]>; + +// Special cases. +def M5WriteCOPY : SchedWriteVariant<[SchedVar, + SchedVar]>; +def M5WriteMOVI : SchedWriteVariant<[SchedVar, + SchedVar]>; + +// Fast forwarding. +def M5ReadFM1 : SchedReadAdvance<+1, [M5WriteF2]>; +def M5ReadAESM2 : SchedReadAdvance<+2, [M5WriteNCRY2]>; +def M5ReadFMACM1 : SchedReadAdvance<+1, [M5WriteFMAC4, + M5WriteFMAC5]>; +def M5ReadNMULM1 : SchedReadAdvance<+1, [M5WriteNMUL3]>; + +//===----------------------------------------------------------------------===// +// Coarse scheduling model. + +// Branch instructions. +def : SchedAlias; +def : SchedAlias; + +// Arithmetic and logical integer instructions. +def : SchedAlias; +def : SchedAlias; // FIXME: M5WriteAX crashes TableGen. +def : SchedAlias; // FIXME: M5WriteAX crashes TableGen. +def : SchedAlias; + +// Move instructions. +def : SchedAlias; + +// Divide and multiply instructions. +def : SchedAlias; +def : SchedAlias; +def : SchedAlias; +def : SchedAlias; + +// Miscellaneous instructions. +def : SchedAlias; + +// Addressing modes. +def : SchedAlias; +def : SchedAlias; + +// Load instructions. +def : SchedAlias; +def : SchedAlias; +def : SchedAlias; + +// Store instructions. +def : SchedAlias; +def : SchedAlias; +def : SchedAlias; +def : SchedAlias; + +// Atomic load and store instructions. +def : SchedAlias; + +// FP data instructions. +def : SchedAlias; +def : SchedAlias; +def : SchedAlias; +def : SchedAlias; + +// FP miscellaneous instructions. 
+def : SchedAlias; +def : SchedAlias; +def : SchedAlias; + +// FP load instructions. +def : SchedAlias; + +// FP store instructions. +def : SchedAlias; + +// ASIMD FP instructions. +def : SchedAlias; + +// Other miscellaneous instructions. +def : WriteRes { let Latency = 1; } +def : WriteRes { let Latency = 1; } +def : WriteRes { let Latency = 1; } + +//===----------------------------------------------------------------------===// +// Generic fast forwarding. + +// TODO: Add FP register forwarding rules. + +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +// TODO: The forwarding for 32 bits actually saves 2 cycles. +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; + +//===----------------------------------------------------------------------===// +// Finer scheduling model. + +// Branch instructions +def : InstRW<[M5WriteB1], (instrs Bcc)>; +def : InstRW<[M5WriteAFX], (instrs BL)>; +def : InstRW<[M5WriteBX], (instrs BLR)>; +def : InstRW<[M5WriteC1], (instregex "^CBN?Z[WX]")>; +def : InstRW<[M5WriteAD], (instregex "^TBN?ZW")>; +def : InstRW<[M5WriteAB], (instregex "^TBN?ZX")>; + +// Arithmetic and logical integer instructions. 
+def : InstRW<[M5WriteA1W], (instregex "^(ADC|SBC)S?Wr$")>; // W forms: M5WriteA1W, latency 1 on group M5UnitAW (A, C, E).
+def : InstRW<[M5WriteA1X], (instregex "^(ADC|SBC)S?Xr$")>; // X forms: M5WriteA1X, latency 1 on group M5UnitAX (A, C).
+def : InstRW<[M5WriteAXW], (instregex "^(ADD|AND|BIC|EON|EOR|ORN|SUB)Wrs$")>; // M5WriteAXW/AXX are SchedWriteVariants defined earlier in this file.
+def : InstRW<[M5WriteAXX], (instregex "^(ADD|AND|BIC|EON|EOR|ORN|SUB)Xrs$")>;
+def : InstRW<[M5WriteAUW], (instrs ORRWrs)>; // ORR*rs gets its own SchedWriteVariant pair (M5WriteAUW/AUX).
+def : InstRW<[M5WriteAUX], (instrs ORRXrs)>;
+def : InstRW<[M5WriteAXW], (instregex "^(ADD|AND|BIC|SUB)SWrs$")>;
+def : InstRW<[M5WriteAXX], (instregex "^(ADD|AND|BIC|SUB)SXrs$")>;
+def : InstRW<[M5WriteAXW], (instregex "^(ADD|SUB)S?Wrx(64)?$")>;
+def : InstRW<[M5WriteAXX], (instregex "^(ADD|SUB)S?Xrx(64)?$")>;
+def : InstRW<[M5WriteAVW], (instrs ADDWri, ORRWri)>; // ADD/ORR immediates use the SchedWriteVariant pair M5WriteAVW/AVX.
+def : InstRW<[M5WriteAVX], (instrs ADDXri, ORRXri)>;
+def : InstRW<[M5WriteA1W], (instregex "^CCM[NP]W[ir]$")>;
+def : InstRW<[M5WriteA1X], (instregex "^CCM[NP]X[ir]$")>;
+def : InstRW<[M5WriteA1W], (instrs CSELWr, CSINCWr, CSINVWr, CSNEGWr)>;
+def : InstRW<[M5WriteA1X], (instrs CSELXr, CSINCXr, CSINVXr, CSNEGXr)>;
+
+// Move instructions. M5WriteZ0 has latency 0 and uses no processor resources; M5WriteCOPY is a SchedWriteVariant.
+def : InstRW<[M5WriteCOPY], (instrs COPY)>;
+def : InstRW<[M5WriteZ0], (instrs ADR, ADRP)>;
+def : InstRW<[M5WriteZ0], (instregex "^MOV[NZ][WX]i$")>;
+
+// Shift instructions. Variable shifts by register: latency 1 (M5WriteA1W/A1X).
+def : InstRW<[M5WriteA1W], (instrs ASRVWr, LSLVWr, LSRVWr, RORVWr)>;
+def : InstRW<[M5WriteA1X], (instrs ASRVXr, LSLVXr, LSRVXr, RORVXr)>;
+
+// Miscellaneous instructions. EXTR uses the SchedWriteVariant pair M5WriteAYW/AYX; the rest are latency 1.
+def : InstRW<[M5WriteAYW], (instrs EXTRWrri)>;
+def : InstRW<[M5WriteAYX], (instrs EXTRXrri)>;
+def : InstRW<[M5WriteA1W], (instrs BFMWri, SBFMWri, UBFMWri)>;
+def : InstRW<[M5WriteA1X], (instrs BFMXri, SBFMXri, UBFMXri)>;
+def : InstRW<[M5WriteA1W], (instrs CLSWr, CLZWr)>;
+def : InstRW<[M5WriteA1X], (instrs CLSXr, CLZXr)>;
+def : InstRW<[M5WriteA1W], (instrs RBITWr, REVWr, REV16Wr)>;
+def : InstRW<[M5WriteA1X], (instrs RBITXr, REVXr, REV16Xr, REV32Xr)>;
+
+// Load instructions.
+def : InstRW<[M5WriteLD,
+              WriteLDHi,
+              WriteAdr], (instregex "^LDP(SW|W|X)(post|pre)")>; // M5WriteLD: M5UnitAX + M5UnitL, latency 4, 2 uops.
+def : InstRW<[M5WriteL5,
+              ReadAdrBase], (instregex "^LDR(BB|SBW|SBX|HH|SHW|SHX|SW|W|X)roW")>; // roW forms: M5WriteL5, latency 5 on M5UnitL.
+def : InstRW<[WriteLDIdx,
+              ReadAdrBase], (instregex "^LDR(BB|SBW|SBX|HH|SHW|SHX|SW|W|X)roX")>; // roX forms use WriteLDIdx rather than an M5-specific write.
+def : InstRW<[M5WriteL5,
+              ReadAdrBase], (instrs PRFMroW)>; // Prefetch is modelled like the matching LDR form.
+def : InstRW<[WriteLDIdx,
+              ReadAdrBase], (instrs PRFMroX)>;
+
+// Store instructions. Same roW/roX split as the loads above.
+def : InstRW<[M5WriteSB,
+              ReadAdrBase], (instregex "^STR(BB|HH|W|X)roW")>; // M5WriteSB: M5UnitAX + M5UnitS, latency 2, 1 uop.
+def : InstRW<[WriteST,
+              ReadAdrBase], (instregex "^STR(BB|HH|W|X)roX")>;
+
+// Atomic load and store instructions. M5WriteLGW/LGX: latency 13; M5WriteLFW/LFX: latency 15, 6 uops.
+def : InstRW<[M5WriteLGW], (instregex "^CAS(A|AL|L)?[BHW]$")>; // M5UnitL is held for 13 cycles (ResourceCycles = [1, 13]).
+def : InstRW<[M5WriteLGX], (instregex "^CAS(A|AL|L)?X$")>;
+def : InstRW<[M5WriteLFW], (instregex "^CASP(A|AL|L)?W$")>; // M5UnitL is held for 15 cycles (ResourceCycles = [1, 1, 1, 1, 15]).
+def : InstRW<[M5WriteLFX], (instregex "^CASP(A|AL|L)?X$")>;
+def : InstRW<[M5WriteLGW], (instregex "^LD(ADD|CLR|EOR|SET|[SU]MAX|[SU]MIN)(A|AL|L)?[BHW]$")>;
+def : InstRW<[M5WriteLGX], (instregex "^LD(ADD|CLR|EOR|SET|[SU]MAX|[SU]MIN)(A|AL|L)?X$")>;
+def : InstRW<[M5WriteLGW], (instregex "^SWP(A|AL|L)?[BHW]$")>;
+def : InstRW<[M5WriteLGX], (instregex "^SWP(A|AL|L)?X$")>;
+
+// FP data instructions.
+def : InstRW<[M5WriteNSHF1], (instrs FABSHr, FABSSr,FABSDr)>; // M5WriteNSHF1: latency 1 on M5UnitNSHF.
+def : InstRW<[M5WriteFADD2], (instregex "^F(ADD|SUB)[HSD]rr")>;
+def : InstRW<[M5WriteFADD2], (instregex "^FADDPv.i(16|32|64)")>;
+def : InstRW<[M5WriteNEONQ], (instregex "^FCCMPE?[HSD]rr")>; // M5WriteNEONQ: M5UnitNMSC + M5UnitC, latency 3, 1 uop.
+def : InstRW<[M5WriteNMSC2], (instregex "^FCMPE?[HSD]r[ir]")>;
+def : InstRW<[M5WriteNMSC1], (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)(16|32|64|v1)")>;
+def : InstRW<[M5WriteFDIV5], (instrs FDIVHrr)>; // Latency 5; M5UnitFDIV held for 2 cycles.
+def : InstRW<[M5WriteFDIV7], (instrs FDIVSrr)>; // Latency 7; M5UnitFDIV held for 4 cycles.
+def : InstRW<[M5WriteFDIV12], (instrs FDIVDrr)>; // Latency 12; M5UnitFDIV held for 9 cycles.
+def : InstRW<[M5WriteNMSC1], (instregex "^F(MAX|MIN)(NM)?[HSD]rr")>;
+def : InstRW<[M5WriteFMAC3], (instregex "^FN?MUL[HSD]rr")>;
+def : InstRW<[M5WriteFMAC3], (instrs FMULX16, FMULX32, FMULX64)>;
+def : InstRW<[M5WriteFMAC4,
+              M5ReadFMACM1], (instregex "^FN?M(ADD|SUB)[HSD]rrr")>; // M5ReadFMACM1: SchedReadAdvance of 1 cycle from M5WriteFMAC4/M5WriteFMAC5.
+def : InstRW<[M5WriteNALU2], (instrs FNEGHr, FNEGSr, FNEGDr)>;
+def : InstRW<[M5WriteFCVT3A], (instregex "^FRINT.+r")>; // M5WriteFCVT3A: latency 3, restricted to M5UnitFCVT0.
+def : InstRW<[M5WriteNEONH], (instregex "^FCSEL[HSD]rrr")>; // M5WriteNEONH: M5UnitNALU + M5UnitS0, latency 2, 2 uops.
+def : InstRW<[M5WriteFSQR5], (instrs FSQRTHr)>; // Latency 5; M5UnitFSQR held for 2 cycles.
+def : InstRW<[M5WriteFSQR8], (instrs FSQRTSr)>; // Latency 8; M5UnitFSQR held for 5 cycles.
+def : InstRW<[M5WriteFSQR12], (instrs FSQRTDr)>; // Latency 12; M5UnitFSQR held for 9 cycles.
+
+// FP miscellaneous instructions.
+def : InstRW<[M5WriteFCVT2], (instregex "^FCVT[HSD][HSD]r")>;
+def : InstRW<[M5WriteFCVTC], (instregex "^[SU]CVTF[SU][XW][HSD]ri")>; // M5WriteFCVTC: M5UnitFCVT + M5UnitS0, latency 6, 1 uop.
+def : InstRW<[M5WriteFCVTB], (instregex "^FCVT[AMNPZ][SU][SU][XW][HSD]r")>; // M5WriteFCVTB: M5UnitFCVT + M5UnitS0, latency 4, 1 uop.
+def : InstRW<[M5WriteNALU1], (instregex "^FMOV[HSD]i")>;
+def : InstRW<[M5WriteNALU2], (instregex "^FMOV[HSD]r")>;
+def : InstRW<[M5WriteSA], (instregex "^FMOV[WX][HSD]r")>; // M5WriteSA: latency 4 on M5UnitS0.
+def : InstRW<[M5WriteFCVTA], (instregex "^FMOV[HSD][WX]r")>; // M5WriteFCVTA: M5UnitFCVT0 + M5UnitS0, latency 3, 1 uop.
+def : InstRW<[M5WriteNEONI], (instregex "^FMOVXDHighr")>; // M5WriteNEONI: M5UnitS0 + M5UnitNSHF, latency 6, 2 uops.
+def : InstRW<[M5WriteNEONK], (instregex "^FMOVDXHighr")>; // M5WriteNEONK: M5UnitNSHF + M5UnitFCVT0 + M5UnitS0, latency 5, 2 uops.
+def : InstRW<[M5WriteFCVT3], (instregex "^F(RECP|RSQRT)Ev1(f16|i32|i64)")>;
+def : InstRW<[M5WriteNMSC1], (instregex "^FRECPXv1")>;
+def : InstRW<[M5WriteFMAC4], (instregex "^F(RECP|RSQRT)S(16|32|64)")>;
+
+// FP load instructions. M5WriteLH (latency 6, no resources, 0 uops) is the write for the second register of a pair.
+def : InstRW<[WriteVLD], (instregex "^LDR[SDQ]l")>;
+def : InstRW<[WriteVLD], (instregex "^LDUR[BHSDQ]i")>;
+def : InstRW<[WriteVLD,
+              WriteAdr], (instregex "^LDR[BHSDQ](post|pre)")>;
+def : InstRW<[WriteVLD], (instregex "^LDR[BHSDQ]ui")>;
+def : InstRW<[M5WriteLE,
+              ReadAdrBase], (instregex "^LDR[BHSDQ]roW")>; // M5WriteLE: M5UnitAX + M5UnitL, latency 7, 2 uops.
+def : InstRW<[WriteVLD,
+              ReadAdrBase], (instregex "^LDR[BHSD]roX")>;
+def : InstRW<[M5WriteLY,
+              ReadAdrBase], (instrs LDRQroX)>; // M5WriteLY is a SchedWriteVariant defined earlier in this file.
+def : InstRW<[WriteVLD,
+              M5WriteLH], (instregex "^LDN?P[SD]i")>;
+def : InstRW<[M5WriteLA,
+              M5WriteLH], (instregex "^LDN?PQi")>; // M5WriteLA: two M5UnitL entries, latency 6, 1 uop.
+def : InstRW<[M5WriteLB,
+              M5WriteLH,
+              WriteAdr], (instregex "^LDP[SD](post|pre)")>; // M5WriteLB: M5UnitAX + M5UnitL, latency 6, 2 uops.
+def : InstRW<[M5WriteLC,
+              M5WriteLH,
+              WriteAdr], (instregex "^LDPQ(post|pre)")>; // M5WriteLC: M5UnitAX + two M5UnitL entries, latency 6, 2 uops.
+
+// FP store instructions.
+def : InstRW<[WriteVST], (instregex "^STUR[BHSDQ]i")>;
+def : InstRW<[WriteVST,
+              WriteAdr], (instregex "^STR[BHSDQ](post|pre)")>;
+def : InstRW<[WriteVST], (instregex "^STR[BHSDQ]ui")>;
+def : InstRW<[WriteVST,
+              ReadAdrBase], (instregex "^STR[BHSD]ro[WX]")>;
+def : InstRW<[M5WriteVSTK,
+              ReadAdrBase], (instregex "^STRQroW")>; // M5WriteVSTK: M5UnitAX + M5UnitS + M5UnitFST, latency 3, 2 uops.
+def : InstRW<[M5WriteVSTY,
+              ReadAdrBase], (instregex "^STRQroX")>; // M5WriteVSTY is a SchedWriteVariant defined earlier in this file.
+def : InstRW<[WriteVST], (instregex "^STN?P[SD]i")>;
+def : InstRW<[M5WriteVSTH], (instregex "^STN?PQi")>; // M5WriteVSTH: M5UnitS0 + M5UnitFST, latency 1, 1 uop.
+def : InstRW<[WriteVST,
+              WriteAdr], (instregex "^STP[SD](post|pre)")>;
+def : InstRW<[M5WriteVSTJ,
+              WriteAdr], (instregex "^STPQ(post|pre)")>; // M5WriteVSTJ: M5UnitA + M5UnitS0 + M5UnitFST, latency 1, 1 uop.
+
+// ASIMD instructions. Integer vector operations; the digit in each M5Write name is its latency.
+def : InstRW<[M5WriteNHAD1], (instregex "^[SU]ABDL?v")>;
+def : InstRW<[M5WriteNHAD3], (instregex "^[SU]ABAL?v")>;
+def : InstRW<[M5WriteNMSC1], (instregex "^ABSv")>;
+def : InstRW<[M5WriteNALU2], (instregex "^(ADD|NEG|SUB)v")>;
+def : InstRW<[M5WriteNHAD3], (instregex "^[SU]?ADDL?Pv")>;
+def : InstRW<[M5WriteNHAD3], (instregex "^[SU]H(ADD|SUB)v")>;
+def : InstRW<[M5WriteNHAD3], (instregex "^[SU](ADD|SUB)[LW]v")>;
+def : InstRW<[M5WriteNHAD3], (instregex "^R?(ADD|SUB)HN2?v")>;
+def : InstRW<[M5WriteNHAD3], (instregex "^[SU]Q(ADD|SUB)v")>;
+def : InstRW<[M5WriteNHAD3], (instregex "^(SU|US)QADDv")>;
+def : InstRW<[M5WriteNHAD3], (instregex "^[SU]RHADDv")>;
+def : InstRW<[M5WriteNMSC1], (instregex "^SQ(ABS|NEG)v")>;
+def : InstRW<[M5WriteNHAD3], (instregex "^[SU]?ADDL?Vv")>;
+def : InstRW<[M5WriteNMSC1], (instregex "^CM(EQ|GE|GT|HI|HS|LE|LT)v")>;
+def : InstRW<[M5WriteNALU2], (instregex "^CMTSTv")>;
+def : InstRW<[M5WriteNALU2], (instregex "^(AND|BIC|EOR|NOT|ORN|ORR)v")>;
+def : InstRW<[M5WriteNMSC1], (instregex "^[SU](MIN|MAX)v")>;
+def : InstRW<[M5WriteNMSC2], (instregex "^[SU](MIN|MAX)Pv")>;
+def : InstRW<[M5WriteNHAD3], (instregex "^[SU](MIN|MAX)Vv")>;
+def : InstRW<[M5WriteNMUL3], (instregex "^(SQR?D)?MULH?v")>;
+def : InstRW<[M5WriteNMUL3,
+              M5ReadNMULM1], (instregex "^ML[AS]v")>; // M5ReadNMULM1: SchedReadAdvance of 1 cycle from M5WriteNMUL3.
+def : InstRW<[M5WriteNMUL3,
+              M5ReadNMULM1], (instregex "^SQRDML[AS]H")>;
+def : InstRW<[M5WriteNMUL3], (instregex "^(S|U|SQD)ML[AS]L(v1(i32|i64)|v2i32|v4i16|v8i8)")>; // NOTE(review): no M5ReadNMULM1 here, unlike the three sibling patterns below -- confirm intended.
+def : InstRW<[M5WriteNMUL3,
+              M5ReadNMULM1], (instregex "^(S|U|SQD)ML[AS]L(v4i32|v8i16|v16i8)")>;
+def : InstRW<[M5WriteNMUL3,
+              M5ReadNMULM1], (instregex "^(S|U|SQD)MULL(v1(i32|i64)|v2i32|v4i16|v8i8)")>;
+def : InstRW<[M5WriteNMUL3,
+              M5ReadNMULM1], (instregex "^(S|U|SQD)MULL(v4i32|v8i16|v16i8)")>;
+def : InstRW<[M5WriteNDOT2], (instregex "^[SU]DOT(lane)?v")>;
+def : InstRW<[M5WriteNHAD3], (instregex "^[SU]ADALPv")>;
+def : InstRW<[M5WriteNSHT4A], (instregex "^[SU]R?SRA[dv]")>; // M5WriteNSHT4A: latency 4, restricted to M5UnitNSHT1.
+def : InstRW<[M5WriteNSHT2], (instregex "^SHL[dv]")>; // M5WriteNSHT2: latency 2 on any unit of group M5UnitNSHT.
+def : InstRW<[M5WriteNSHT2], (instregex "^S[LR]I[dv]")>;
+def : InstRW<[M5WriteNSHT2], (instregex "^[SU]SH[LR][dv]")>;
+def : InstRW<[M5WriteNSHT2], (instregex "^[SU]?SHLLv")>;
+def : InstRW<[M5WriteNSHT4A], (instregex "^[SU]?Q?R?SHRU?N[bhsv]")>;
+def : InstRW<[M5WriteNSHT4A], (instregex "^[SU]RSH[LR][dv]")>;
+def : InstRW<[M5WriteNSHT4A], (instregex "^[SU]QR?SHLU?[bhsdv]")>;
+
+// ASIMD FP instructions.
+def : InstRW<[M5WriteNSHF2], (instregex "^FABSv.f(16|32|64)")>; +def : InstRW<[M5WriteFADD2], (instregex "^F(ABD|ADD|SUB)v.f(16|32|64)")>; +def : InstRW<[M5WriteFADD2], (instregex "^FADDPv.f(16|32|64)")>; +def : InstRW<[M5WriteNMSC1], (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)v[^1]")>; +def : InstRW<[M5WriteFCVT2], (instregex "^FCVT(L|N|XN)v")>; +def : InstRW<[M5WriteFCVT2A], (instregex "^FCVT[AMNPZ][SU]v")>; +def : InstRW<[M5WriteFCVT2], (instregex "^[SU]CVTFv.[fi](16|32|64)")>; +def : InstRW<[M5WriteFDIV7], (instrs FDIVv4f16)>; +def : InstRW<[M5WriteNEONV], (instrs FDIVv8f16)>; +def : InstRW<[M5WriteFDIV7], (instrs FDIVv2f32)>; +def : InstRW<[M5WriteNEONV], (instrs FDIVv4f32)>; +def : InstRW<[M5WriteNEONW], (instrs FDIVv2f64)>; +def : InstRW<[M5WriteNMSC1], (instregex "^F(MAX|MIN)(NM)?v")>; +def : InstRW<[M5WriteNMSC2], (instregex "^F(MAX|MIN)(NM)?Pv")>; +def : InstRW<[M5WriteNEONZ], (instregex "^F(MAX|MIN)(NM)?Vv")>; +def : InstRW<[M5WriteFMAC3], (instregex "^FMULX?v.[fi](16|32|64)")>; +def : InstRW<[M5WriteFMAC4, + M5ReadFMACM1], (instregex "^FML[AS]v.[fi](16|32|64)")>; +def : InstRW<[M5WriteNALU2], (instregex "^FNEGv.f(16|32|64)")>; +def : InstRW<[M5WriteFCVT3A], (instregex "^FRINT[AIMNPXZ]v")>; +def : InstRW<[M5WriteFSQR7], (instrs FSQRTv4f16)>; +def : InstRW<[M5WriteNEONU], (instrs FSQRTv8f16)>; +def : InstRW<[M5WriteFSQR8], (instrs FSQRTv2f32)>; +def : InstRW<[M5WriteNEONX], (instrs FSQRTv4f32)>; +def : InstRW<[M5WriteNEONY], (instrs FSQRTv2f64)>; + +// ASIMD miscellaneous instructions. 
+def : InstRW<[M5WriteNALU2], (instregex "^RBITv")>; +def : InstRW<[M5WriteNALU2], (instregex "^(BIF|BIT|BSL)v")>; +def : InstRW<[M5WriteNALU2], (instregex "^CL[STZ]v")>; +def : InstRW<[M5WriteNEONB], (instregex "^DUPv.+gpr")>; +def : InstRW<[M5WriteNSHF2], (instregex "^CPY")>; +def : InstRW<[M5WriteNSHF2], (instregex "^DUPv.+lane")>; +def : InstRW<[M5WriteNSHF2], (instregex "^EXTv")>; +def : InstRW<[M5WriteNSHT4A], (instregex "^XTNv")>; +def : InstRW<[M5WriteNSHT4A], (instregex "^[SU]?QXTU?Nv")>; +def : InstRW<[M5WriteNEONB], (instregex "^INSv.+gpr")>; +def : InstRW<[M5WriteNSHF2], (instregex "^INSv.+lane")>; +def : InstRW<[M5WriteMOVI], (instregex "^(MOV|MVN)I")>; +def : InstRW<[M5WriteNALU1], (instregex "^FMOVv.f(16|32|64)")>; +def : InstRW<[M5WriteFCVT3], (instregex "^F(RECP|RSQRT)Ev[248]f(16|32|64)")>; +def : InstRW<[M5WriteFCVT3], (instregex "^U(RECP|RSQRT)Ev[24]i32")>; +def : InstRW<[M5WriteFMAC4], (instregex "^F(RECP|RSQRT)Sv.f(16|32|64)")>; +def : InstRW<[M5WriteNSHF2], (instregex "^REV(16|32|64)v")>; +def : InstRW<[M5WriteNSHFA], (instregex "^TB[LX]v(8|16)i8One")>; +def : InstRW<[M5WriteNSHFB], (instregex "^TB[LX]v(8|16)i8Two")>; +def : InstRW<[M5WriteNSHFC], (instregex "^TB[LX]v(8|16)i8Three")>; +def : InstRW<[M5WriteNSHFD], (instregex "^TB[LX]v(8|16)i8Four")>; +def : InstRW<[M5WriteNEONP], (instregex "^[SU]MOVv")>; +def : InstRW<[M5WriteNSHF2], (instregex "^(TRN|UZP|ZIP)[12]v")>; + +// ASIMD load instructions. 
+def : InstRW<[WriteVLD], (instregex "LD1Onev(8b|16b|4h|8h|2s|4s|1d|2d)$")>; +def : InstRW<[WriteVLD, + M5WriteA1X, + WriteAdr], (instregex "LD1Onev(8b|16b|4h|8h|2s|4s|1d|2d)_POST$")>; +def : InstRW<[M5WriteVLDA], (instregex "LD1Twov(8b|16b|4h|8h|2s|4s|1d|2d)$")>; +def : InstRW<[M5WriteVLDA, + M5WriteA1X, + WriteAdr], (instregex "LD1Twov(8b|16b|4h|8h|2s|4s|1d|2d)_POST$")>; +def : InstRW<[M5WriteVLDB], (instregex "LD1Threev(8b|16b|4h|8h|2s|4s|1d|2d)$")>; +def : InstRW<[M5WriteVLDB, + M5WriteA1X, + WriteAdr], (instregex "LD1Threev(8b|16b|4h|8h|2s|4s|1d|2d)_POST$")>; +def : InstRW<[M5WriteVLDC], (instregex "LD1Fourv(8b|16b|4h|8h|2s|4s|1d|2d)$")>; +def : InstRW<[M5WriteVLDC, + M5WriteA1X, + WriteAdr], (instregex "LD1Fourv(8b|16b|4h|8h|2s|4s|1d|2d)_POST$")>; +def : InstRW<[M5WriteVLDD], (instregex "LD1i(8|16|32|64)$")>; +def : InstRW<[M5WriteVLDD, + M5WriteA1X, + WriteAdr], (instregex "LD1i(8|16|32|64)_POST$")>; +def : InstRW<[WriteVLD], (instregex "LD1Rv(8b|16b|4h|8h|2s|4s|1d|2d)$")>; +def : InstRW<[WriteVLD, + M5WriteA1X, + WriteAdr], (instregex "LD1Rv(8b|16b|4h|8h|2s|4s|1d|2d)_POST$")>; +def : InstRW<[M5WriteVLDF], (instregex "LD2Twov(8b|16b|4h|8h|2s|4s|2d)$")>; +def : InstRW<[M5WriteVLDF, + M5WriteA1X, + WriteAdr], (instregex "LD2Twov(8b|16b|4h|8h|2s|4s|2d)_POST$")>; +def : InstRW<[M5WriteVLDG], (instregex "LD2i(8|16|32|64)$")>; +def : InstRW<[M5WriteVLDG, + M5WriteA1X, + WriteAdr], (instregex "LD2i(8|16|32|64)_POST$")>; +def : InstRW<[M5WriteVLDA], (instregex "LD2Rv(8b|16b|4h|8h|2s|4s|1d|2d)$")>; +def : InstRW<[M5WriteVLDA, + M5WriteA1X, + WriteAdr], (instregex "LD2Rv(8b|16b|4h|8h|2s|4s|1d|2d)_POST$")>; +def : InstRW<[M5WriteVLDI], (instregex "LD3Threev(8b|16b|4h|8h|2s|4s|2d)$")>; +def : InstRW<[M5WriteVLDI, + M5WriteA1X, + WriteAdr], (instregex "LD3Threev(8b|16b|4h|8h|2s|4s|2d)_POST$")>; +def : InstRW<[M5WriteVLDJ], (instregex "LD3i(8|16|32)$")>; +def : InstRW<[M5WriteVLDJ, + M5WriteA1X, + WriteAdr], (instregex "LD3i(8|16|32)_POST$")>; +def : InstRW<[M5WriteVLDL], 
(instregex "LD3i64$")>; +def : InstRW<[M5WriteVLDL, + M5WriteA1X, + WriteAdr], (instregex "LD3i64_POST$")>; +def : InstRW<[M5WriteVLDB], (instregex "LD3Rv(8b|16b|4h|8h|2s|4s|1d|2d)$")>; +def : InstRW<[M5WriteVLDB, + M5WriteA1X, WriteAdr], (instregex "LD3Rv(8b|16b|4h|8h|2s|4s|1d|2d)_POST$")>; +def : InstRW<[M5WriteVLDN], (instregex "LD4Fourv(8b|16b|4h|8h|2s|4s|2d)$")>; +def : InstRW<[M5WriteVLDN, + M5WriteA1X, + WriteAdr], (instregex "LD4Fourv(8b|16b|4h|8h|2s|4s|2d)_POST$")>; +def : InstRW<[M5WriteVLDK], (instregex "LD4i(8|16|32)$")>; +def : InstRW<[M5WriteVLDK, + M5WriteA1X, + WriteAdr], (instregex "LD4i(8|16|32)_POST$")>; +def : InstRW<[M5WriteVLDM], (instregex "LD4i64$")>; +def : InstRW<[M5WriteVLDM, + M5WriteA1X, + WriteAdr], (instregex "LD4i64_POST$")>; +def : InstRW<[M5WriteVLDC], (instregex "LD4Rv(8b|16b|4h|8h|2s|4s|1d|2d)$")>; +def : InstRW<[M5WriteVLDC, + M5WriteA1X, + WriteAdr], (instregex "LD4Rv(8b|16b|4h|8h|2s|4s|1d|2d)_POST$")>; + +// ASIMD store instructions. +def : InstRW<[WriteVST], (instregex "ST1Onev(8b|16b|4h|8h|2s|4s|1d|2d)$")>; +def : InstRW<[WriteVST, + M5WriteA1X, + WriteAdr], (instregex "ST1Onev(8b|16b|4h|8h|2s|4s|1d|2d)_POST$")>; +def : InstRW<[M5WriteVSTA], (instregex "ST1Twov(8b|16b|4h|8h|2s|4s|1d|2d)$")>; +def : InstRW<[M5WriteVSTA, + M5WriteA1X, + WriteAdr], (instregex "ST1Twov(8b|16b|4h|8h|2s|4s|1d|2d)_POST$")>; + +def : InstRW<[M5WriteVSTB], (instregex "ST1Threev(8b|16b|4h|8h|2s|4s|1d|2d)$")>; +def : InstRW<[M5WriteVSTB, + M5WriteA1X, + WriteAdr], (instregex "ST1Threev(8b|16b|4h|8h|2s|4s|1d|2d)_POST$")>; +def : InstRW<[M5WriteVSTC], (instregex "ST1Fourv(8b|16b|4h|8h|2s|4s|1d|2d)$")>; +def : InstRW<[M5WriteVSTC, + M5WriteA1X, + WriteAdr], (instregex "ST1Fourv(8b|16b|4h|8h|2s|4s|1d|2d)_POST$")>; +def : InstRW<[WriteVST], (instregex "ST1i(8|16|32|64)$")>; +def : InstRW<[WriteVST, + M5WriteA1X, + WriteAdr], (instregex "ST1i(8|16|32|64)_POST$")>; +def : InstRW<[M5WriteVSTD], (instregex "ST2Twov(8b|4h|2s)$")>; +def : InstRW<[M5WriteVSTD, + M5WriteA1X,
+ WriteAdr], (instregex "ST2Twov(8b|4h|2s)_POST$")>; +def : InstRW<[M5WriteVSTE], (instregex "ST2Twov(16b|8h|4s|2d)$")>; +def : InstRW<[M5WriteVSTE, + M5WriteA1X, + WriteAdr], (instregex "ST2Twov(16b|8h|4s|2d)_POST$")>; +def : InstRW<[M5WriteVSTD], (instregex "ST2i(8|16|32|64)$")>; +def : InstRW<[M5WriteVSTD, + M5WriteA1X, + WriteAdr], (instregex "ST2i(8|16|32|64)_POST$")>; +def : InstRW<[M5WriteVSTF], (instregex "ST3Threev(8b|4h|2s)$")>; +def : InstRW<[M5WriteVSTF, + M5WriteA1X, + WriteAdr], (instregex "ST3Threev(8b|4h|2s)_POST$")>; +def : InstRW<[M5WriteVSTG], (instregex "ST3Threev(16b|8h|4s|2d)$")>; +def : InstRW<[M5WriteVSTG, + M5WriteA1X, + WriteAdr], (instregex "ST3Threev(16b|8h|4s|2d)_POST$")>; +def : InstRW<[M5WriteVSTA], (instregex "ST3i(8|16|32|64)$")>; +def : InstRW<[M5WriteVSTA, + M5WriteA1X, + WriteAdr], (instregex "ST3i(8|16|32|64)_POST$")>; +def : InstRW<[M5WriteVSTL], (instregex "ST4Fourv(8b|4h|2s)$")>; +def : InstRW<[M5WriteVSTL, + M5WriteA1X, + WriteAdr], (instregex "ST4Fourv(8b|4h|2s)_POST$")>; +def : InstRW<[M5WriteVSTI], (instregex "ST4Fourv(16b|8h|4s|2d)$")>; +def : InstRW<[M5WriteVSTI, + M5WriteA1X, + WriteAdr], (instregex "ST4Fourv(16b|8h|4s|2d)_POST$")>; +def : InstRW<[M5WriteVSTA], (instregex "ST4i(8|16|32|64)$")>; +def : InstRW<[M5WriteVSTA, + M5WriteA1X, + WriteAdr], (instregex "ST4i(8|16|32|64)_POST$")>; + +// Cryptography instructions. 
+def : InstRW<[M5WriteNCRY2], (instregex "^AES[DE]")>; +def : InstRW<[M5WriteNCRY2, + M5ReadAESM2], (instregex "^AESI?MC")>; +def : InstRW<[M5WriteNCRY2A], (instregex "^PMULv")>; +def : InstRW<[M5WriteNCRY1A], (instregex "^PMULLv(1|8)i")>; +def : InstRW<[M5WriteNCRY3A], (instregex "^PMULLv(2|16)i")>; +def : InstRW<[M5WriteNCRY2A], (instregex "^SHA1(H|SU[01])")>; +def : InstRW<[M5WriteNCRY5A], (instregex "^SHA1[CMP]")>; +def : InstRW<[M5WriteNCRY2A], (instrs SHA256SU0rr)>; +def : InstRW<[M5WriteNCRY5A], (instrs SHA256SU1rrr)>; +def : InstRW<[M5WriteNCRY5A], (instregex "^SHA256H2?")>; + +// CRC instructions. +def : InstRW<[M5WriteF2, + M5ReadFM1], (instregex "^CRC32C?[BHWX]")>; + +} // SchedModel = ExynosM5Model diff --git a/test/tools/llvm-mca/AArch64/Exynos/aes.s b/test/tools/llvm-mca/AArch64/Exynos/aes.s new file mode 100644 index 00000000000..e981b2aa4bd --- /dev/null +++ b/test/tools/llvm-mca/AArch64/Exynos/aes.s @@ -0,0 +1,57 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m3 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M3 +# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m4 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M4 +# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m5 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M5 + +aese v0.16b, v1.16b +aesmc v0.16b, v0.16b + +aesd v0.16b, v1.16b +aesimc v0.16b, v0.16b + +# ALL: Iterations: 100 +# ALL-NEXT: Instructions: 400 + +# M3-NEXT: Total Cycles: 203 +# M4-NEXT: Total Cycles: 203 +# M5-NEXT: Total Cycles: 403 + +# ALL-NEXT: Total uOps: 400 + +# ALL: Dispatch Width: 6 + +# M3-NEXT: uOps Per Cycle: 1.97 +# M3-NEXT: IPC: 1.97 + +# M4-NEXT: uOps Per Cycle: 1.97 +# M4-NEXT: IPC: 1.97 + +# M5-NEXT: uOps Per Cycle: 0.99 +# M5-NEXT: IPC: 0.99 + +# ALL-NEXT: Block RThroughput: 2.0 + +# ALL: Instruction Info: +# ALL-NEXT: [1]: #uOps +# ALL-NEXT: [2]: Latency +# ALL-NEXT: [3]: RThroughput +# 
ALL-NEXT: [4]: MayLoad +# ALL-NEXT: [5]: MayStore +# ALL-NEXT: [6]: HasSideEffects (U) + +# ALL: [1] [2] [3] [4] [5] [6] Instructions: + +# M3-NEXT: 1 1 0.50 aese v0.16b, v1.16b +# M3-NEXT: 1 1 0.50 aesmc v0.16b, v0.16b +# M3-NEXT: 1 1 0.50 aesd v0.16b, v1.16b +# M3-NEXT: 1 1 0.50 aesimc v0.16b, v0.16b + +# M4-NEXT: 1 1 0.50 aese v0.16b, v1.16b +# M4-NEXT: 1 1 0.50 aesmc v0.16b, v0.16b +# M4-NEXT: 1 1 0.50 aesd v0.16b, v1.16b +# M4-NEXT: 1 1 0.50 aesimc v0.16b, v0.16b + +# M5-NEXT: 1 2 0.50 aese v0.16b, v1.16b +# M5-NEXT: 1 2 0.50 aesmc v0.16b, v0.16b +# M5-NEXT: 1 2 0.50 aesd v0.16b, v1.16b +# M5-NEXT: 1 2 0.50 aesimc v0.16b, v0.16b diff --git a/test/tools/llvm-mca/AArch64/Exynos/asimd-ld1.s b/test/tools/llvm-mca/AArch64/Exynos/asimd-ld1.s new file mode 100644 index 00000000000..f23b1f71c53 --- /dev/null +++ b/test/tools/llvm-mca/AArch64/Exynos/asimd-ld1.s @@ -0,0 +1,189 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m3 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M3 +# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m4 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M4 +# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m5 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M5 + +ld1 {v0.s}[0], [sp] +ld1r {v0.2s}, [sp] +ld1 {v0.2s}, [sp] +ld1 {v0.2s, v1.2s}, [sp] +ld1 {v0.2s, v1.2s, v2.2s}, [sp] +ld1 {v0.2s, v1.2s, v2.2s, v3.2s}, [sp] + +ld1 {v0.d}[0], [sp] +ld1r {v0.2d}, [sp] +ld1 {v0.2d}, [sp] +ld1 {v0.2d, v1.2d}, [sp] +ld1 {v0.2d, v1.2d, v2.2d}, [sp] +ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [sp] + +ld1 {v0.s}[0], [sp], #4 +ld1r {v0.2s}, [sp], #4 +ld1 {v0.2s}, [sp], #8 +ld1 {v0.2s, v1.2s}, [sp], #16 +ld1 {v0.2s, v1.2s, v2.2s}, [sp], #24 +ld1 {v0.2s, v1.2s, v2.2s, v3.2s}, [sp], #32 + +ld1 {v0.d}[0], [sp], #8 +ld1r {v0.2d}, [sp], #8 +ld1 {v0.2d}, [sp], #16 +ld1 {v0.2d, v1.2d}, [sp], #32 +ld1 {v0.2d, v1.2d, v2.2d}, [sp], #48 +ld1 {v0.2d, v1.2d, v2.2d, 
v3.2d}, [sp], #64 + +ld1 {v0.s}[0], [sp], x0 +ld1r {v0.2s}, [sp], x0 +ld1 {v0.2s}, [sp], x0 +ld1 {v0.2s, v1.2s}, [sp], x0 +ld1 {v0.2s, v1.2s, v2.2s}, [sp], x0 +ld1 {v0.2s, v1.2s, v2.2s, v3.2s}, [sp], x0 + +ld1 {v0.d}[0], [sp], x0 +ld1r {v0.2d}, [sp], x0 +ld1 {v0.2d}, [sp], x0 +ld1 {v0.2d, v1.2d}, [sp], x0 +ld1 {v0.2d, v1.2d, v2.2d}, [sp], x0 +ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [sp], x0 + +# ALL: Iterations: 100 +# ALL-NEXT: Instructions: 3600 + +# M3-NEXT: Total Cycles: 14903 +# M4-NEXT: Total Cycles: 14703 +# M5-NEXT: Total Cycles: 17203 + +# ALL-NEXT: Total uOps: 10200 + +# ALL: Dispatch Width: 6 + +# M3-NEXT: uOps Per Cycle: 0.68 +# M3-NEXT: IPC: 0.24 + +# M4-NEXT: uOps Per Cycle: 0.69 +# M4-NEXT: IPC: 0.24 + +# M5-NEXT: uOps Per Cycle: 0.59 +# M5-NEXT: IPC: 0.21 + +# ALL-NEXT: Block RThroughput: 39.0 + +# ALL: Instruction Info: +# ALL-NEXT: [1]: #uOps +# ALL-NEXT: [2]: Latency +# ALL-NEXT: [3]: RThroughput +# ALL-NEXT: [4]: MayLoad +# ALL-NEXT: [5]: MayStore +# ALL-NEXT: [6]: HasSideEffects (U) + +# ALL: [1] [2] [3] [4] [5] [6] Instructions: + +# M3-NEXT: 2 7 1.00 * ld1 { v0.s }[0], [sp] +# M3-NEXT: 1 5 0.50 * ld1r { v0.2s }, [sp] +# M3-NEXT: 1 5 0.50 * ld1 { v0.2s }, [sp] +# M3-NEXT: 2 5 1.00 * ld1 { v0.2s, v1.2s }, [sp] +# M3-NEXT: 3 6 1.50 * ld1 { v0.2s, v1.2s, v2.2s }, [sp] +# M3-NEXT: 4 6 2.00 * ld1 { v0.2s, v1.2s, v2.2s, v3.2s }, [sp] +# M3-NEXT: 2 6 1.00 * ld1 { v0.d }[0], [sp] +# M3-NEXT: 1 5 0.50 * ld1r { v0.2d }, [sp] +# M3-NEXT: 1 5 0.50 * ld1 { v0.2d }, [sp] +# M3-NEXT: 2 5 1.00 * ld1 { v0.2d, v1.2d }, [sp] +# M3-NEXT: 3 6 1.50 * ld1 { v0.2d, v1.2d, v2.2d }, [sp] +# M3-NEXT: 4 6 2.00 * ld1 { v0.2d, v1.2d, v2.2d, v3.2d }, [sp] +# M3-NEXT: 3 7 1.00 * ld1 { v0.s }[0], [sp], #4 +# M3-NEXT: 2 5 0.50 * ld1r { v0.2s }, [sp], #4 +# M3-NEXT: 2 5 0.50 * ld1 { v0.2s }, [sp], #8 +# M3-NEXT: 3 5 1.00 * ld1 { v0.2s, v1.2s }, [sp], #16 +# M3-NEXT: 4 6 1.50 * ld1 { v0.2s, v1.2s, v2.2s }, [sp], #24 +# M3-NEXT: 5 6 2.00 * ld1 { v0.2s, v1.2s, v2.2s, v3.2s }, [sp], #32 
+# M3-NEXT: 3 6 1.00 * ld1 { v0.d }[0], [sp], #8 +# M3-NEXT: 2 5 0.50 * ld1r { v0.2d }, [sp], #8 +# M3-NEXT: 2 5 0.50 * ld1 { v0.2d }, [sp], #16 +# M3-NEXT: 3 5 1.00 * ld1 { v0.2d, v1.2d }, [sp], #32 +# M3-NEXT: 4 6 1.50 * ld1 { v0.2d, v1.2d, v2.2d }, [sp], #48 +# M3-NEXT: 5 6 2.00 * ld1 { v0.2d, v1.2d, v2.2d, v3.2d }, [sp], #64 +# M3-NEXT: 3 7 1.00 * ld1 { v0.s }[0], [sp], x0 +# M3-NEXT: 2 5 0.50 * ld1r { v0.2s }, [sp], x0 +# M3-NEXT: 2 5 0.50 * ld1 { v0.2s }, [sp], x0 +# M3-NEXT: 3 5 1.00 * ld1 { v0.2s, v1.2s }, [sp], x0 +# M3-NEXT: 4 6 1.50 * ld1 { v0.2s, v1.2s, v2.2s }, [sp], x0 +# M3-NEXT: 5 6 2.00 * ld1 { v0.2s, v1.2s, v2.2s, v3.2s }, [sp], x0 +# M3-NEXT: 3 6 1.00 * ld1 { v0.d }[0], [sp], x0 +# M3-NEXT: 2 5 0.50 * ld1r { v0.2d }, [sp], x0 +# M3-NEXT: 2 5 0.50 * ld1 { v0.2d }, [sp], x0 +# M3-NEXT: 3 5 1.00 * ld1 { v0.2d, v1.2d }, [sp], x0 +# M3-NEXT: 4 6 1.50 * ld1 { v0.2d, v1.2d, v2.2d }, [sp], x0 +# M3-NEXT: 5 6 2.00 * ld1 { v0.2d, v1.2d, v2.2d, v3.2d }, [sp], x0 + +# M4-NEXT: 2 6 1.00 * ld1 { v0.s }[0], [sp] +# M4-NEXT: 1 5 0.50 * ld1r { v0.2s }, [sp] +# M4-NEXT: 1 5 0.50 * ld1 { v0.2s }, [sp] +# M4-NEXT: 2 5 1.00 * ld1 { v0.2s, v1.2s }, [sp] +# M4-NEXT: 3 6 1.50 * ld1 { v0.2s, v1.2s, v2.2s }, [sp] +# M4-NEXT: 4 6 2.00 * ld1 { v0.2s, v1.2s, v2.2s, v3.2s }, [sp] +# M4-NEXT: 2 6 1.00 * ld1 { v0.d }[0], [sp] +# M4-NEXT: 1 5 0.50 * ld1r { v0.2d }, [sp] +# M4-NEXT: 1 5 0.50 * ld1 { v0.2d }, [sp] +# M4-NEXT: 2 5 1.00 * ld1 { v0.2d, v1.2d }, [sp] +# M4-NEXT: 3 6 1.50 * ld1 { v0.2d, v1.2d, v2.2d }, [sp] +# M4-NEXT: 4 6 2.00 * ld1 { v0.2d, v1.2d, v2.2d, v3.2d }, [sp] +# M4-NEXT: 3 6 1.00 * ld1 { v0.s }[0], [sp], #4 +# M4-NEXT: 2 5 0.50 * ld1r { v0.2s }, [sp], #4 +# M4-NEXT: 2 5 0.50 * ld1 { v0.2s }, [sp], #8 +# M4-NEXT: 3 5 1.00 * ld1 { v0.2s, v1.2s }, [sp], #16 +# M4-NEXT: 4 6 1.50 * ld1 { v0.2s, v1.2s, v2.2s }, [sp], #24 +# M4-NEXT: 5 6 2.00 * ld1 { v0.2s, v1.2s, v2.2s, v3.2s }, [sp], #32 +# M4-NEXT: 3 6 1.00 * ld1 { v0.d }[0], [sp], #8 +# M4-NEXT: 2 5 0.50 * ld1r 
{ v0.2d }, [sp], #8 +# M4-NEXT: 2 5 0.50 * ld1 { v0.2d }, [sp], #16 +# M4-NEXT: 3 5 1.00 * ld1 { v0.2d, v1.2d }, [sp], #32 +# M4-NEXT: 4 6 1.50 * ld1 { v0.2d, v1.2d, v2.2d }, [sp], #48 +# M4-NEXT: 5 6 2.00 * ld1 { v0.2d, v1.2d, v2.2d, v3.2d }, [sp], #64 +# M4-NEXT: 3 6 1.00 * ld1 { v0.s }[0], [sp], x0 +# M4-NEXT: 2 5 0.50 * ld1r { v0.2s }, [sp], x0 +# M4-NEXT: 2 5 0.50 * ld1 { v0.2s }, [sp], x0 +# M4-NEXT: 3 5 1.00 * ld1 { v0.2s, v1.2s }, [sp], x0 +# M4-NEXT: 4 6 1.50 * ld1 { v0.2s, v1.2s, v2.2s }, [sp], x0 +# M4-NEXT: 5 6 2.00 * ld1 { v0.2s, v1.2s, v2.2s, v3.2s }, [sp], x0 +# M4-NEXT: 3 6 1.00 * ld1 { v0.d }[0], [sp], x0 +# M4-NEXT: 2 5 0.50 * ld1r { v0.2d }, [sp], x0 +# M4-NEXT: 2 5 0.50 * ld1 { v0.2d }, [sp], x0 +# M4-NEXT: 3 5 1.00 * ld1 { v0.2d, v1.2d }, [sp], x0 +# M4-NEXT: 4 6 1.50 * ld1 { v0.2d, v1.2d, v2.2d }, [sp], x0 +# M4-NEXT: 5 6 2.00 * ld1 { v0.2d, v1.2d, v2.2d, v3.2d }, [sp], x0 + +# M5-NEXT: 2 7 1.00 * ld1 { v0.s }[0], [sp] +# M5-NEXT: 1 6 0.50 * ld1r { v0.2s }, [sp] +# M5-NEXT: 1 6 0.50 * ld1 { v0.2s }, [sp] +# M5-NEXT: 2 6 1.00 * ld1 { v0.2s, v1.2s }, [sp] +# M5-NEXT: 3 7 1.50 * ld1 { v0.2s, v1.2s, v2.2s }, [sp] +# M5-NEXT: 4 7 2.00 * ld1 { v0.2s, v1.2s, v2.2s, v3.2s }, [sp] +# M5-NEXT: 2 7 1.00 * ld1 { v0.d }[0], [sp] +# M5-NEXT: 1 6 0.50 * ld1r { v0.2d }, [sp] +# M5-NEXT: 1 6 0.50 * ld1 { v0.2d }, [sp] +# M5-NEXT: 2 6 1.00 * ld1 { v0.2d, v1.2d }, [sp] +# M5-NEXT: 3 7 1.50 * ld1 { v0.2d, v1.2d, v2.2d }, [sp] +# M5-NEXT: 4 7 2.00 * ld1 { v0.2d, v1.2d, v2.2d, v3.2d }, [sp] +# M5-NEXT: 3 7 1.00 * ld1 { v0.s }[0], [sp], #4 +# M5-NEXT: 2 6 0.50 * ld1r { v0.2s }, [sp], #4 +# M5-NEXT: 2 6 0.50 * ld1 { v0.2s }, [sp], #8 +# M5-NEXT: 3 6 1.00 * ld1 { v0.2s, v1.2s }, [sp], #16 +# M5-NEXT: 4 7 1.50 * ld1 { v0.2s, v1.2s, v2.2s }, [sp], #24 +# M5-NEXT: 5 7 2.00 * ld1 { v0.2s, v1.2s, v2.2s, v3.2s }, [sp], #32 +# M5-NEXT: 3 7 1.00 * ld1 { v0.d }[0], [sp], #8 +# M5-NEXT: 2 6 0.50 * ld1r { v0.2d }, [sp], #8 +# M5-NEXT: 2 6 0.50 * ld1 { v0.2d }, [sp], #16 +# 
M5-NEXT: 3 6 1.00 * ld1 { v0.2d, v1.2d }, [sp], #32 +# M5-NEXT: 4 7 1.50 * ld1 { v0.2d, v1.2d, v2.2d }, [sp], #48 +# M5-NEXT: 5 7 2.00 * ld1 { v0.2d, v1.2d, v2.2d, v3.2d }, [sp], #64 +# M5-NEXT: 3 7 1.00 * ld1 { v0.s }[0], [sp], x0 +# M5-NEXT: 2 6 0.50 * ld1r { v0.2s }, [sp], x0 +# M5-NEXT: 2 6 0.50 * ld1 { v0.2s }, [sp], x0 +# M5-NEXT: 3 6 1.00 * ld1 { v0.2s, v1.2s }, [sp], x0 +# M5-NEXT: 4 7 1.50 * ld1 { v0.2s, v1.2s, v2.2s }, [sp], x0 +# M5-NEXT: 5 7 2.00 * ld1 { v0.2s, v1.2s, v2.2s, v3.2s }, [sp], x0 +# M5-NEXT: 3 7 1.00 * ld1 { v0.d }[0], [sp], x0 +# M5-NEXT: 2 6 0.50 * ld1r { v0.2d }, [sp], x0 +# M5-NEXT: 2 6 0.50 * ld1 { v0.2d }, [sp], x0 +# M5-NEXT: 3 6 1.00 * ld1 { v0.2d, v1.2d }, [sp], x0 +# M5-NEXT: 4 7 1.50 * ld1 { v0.2d, v1.2d, v2.2d }, [sp], x0 +# M5-NEXT: 5 7 2.00 * ld1 { v0.2d, v1.2d, v2.2d, v3.2d }, [sp], x0 diff --git a/test/tools/llvm-mca/AArch64/Exynos/asimd-ld2.s b/test/tools/llvm-mca/AArch64/Exynos/asimd-ld2.s new file mode 100644 index 00000000000..2ca640dbea7 --- /dev/null +++ b/test/tools/llvm-mca/AArch64/Exynos/asimd-ld2.s @@ -0,0 +1,118 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m3 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M3 +# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m4 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M4 +# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m5 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M5 + +ld2 {v0.s, v1.s}[0], [sp] +ld2r {v0.2s, v1.2s}, [sp] +ld2 {v0.2s, v1.2s}, [sp] + +ld2 {v0.d, v1.d}[0], [sp] +ld2r {v0.2d, v1.2d}, [sp] +ld2 {v0.2d, v1.2d}, [sp] + +ld2 {v0.s, v1.s}[0], [sp], #8 +ld2r {v0.2s, v1.2s}, [sp], #8 +ld2 {v0.2s, v1.2s}, [sp], #16 + +ld2 {v0.d, v1.d}[0], [sp], #16 +ld2r {v0.2d, v1.2d}, [sp], #16 +ld2 {v0.2d, v1.2d}, [sp], #32 + +ld2 {v0.s, v1.s}[0], [sp], x0 +ld2r {v0.2s, v1.2s}, [sp], x0 +ld2 {v0.2s, v1.2s}, [sp], x0 + +ld2 {v0.d, v1.d}[0], [sp], x0 +ld2r 
{v0.2d, v1.2d}, [sp], x0 +ld2 {v0.2d, v1.2d}, [sp], x0 + +# ALL: Iterations: 100 +# ALL-NEXT: Instructions: 1800 + +# M3-NEXT: Total Cycles: 10003 +# M4-NEXT: Total Cycles: 9803 +# M5-NEXT: Total Cycles: 11103 + +# ALL-NEXT: Total uOps: 5400 + +# ALL: Dispatch Width: 6 + +# M3-NEXT: uOps Per Cycle: 0.54 +# M3-NEXT: IPC: 0.18 +# M3-NEXT: Block RThroughput: 42.0 + +# M4-NEXT: uOps Per Cycle: 0.55 +# M4-NEXT: IPC: 0.18 +# M4-NEXT: Block RThroughput: 30.0 + +# M5-NEXT: uOps Per Cycle: 0.49 +# M5-NEXT: IPC: 0.16 +# M5-NEXT: Block RThroughput: 45.0 + +# ALL: Instruction Info: +# ALL-NEXT: [1]: #uOps +# ALL-NEXT: [2]: Latency +# ALL-NEXT: [3]: RThroughput +# ALL-NEXT: [4]: MayLoad +# ALL-NEXT: [5]: MayStore +# ALL-NEXT: [6]: HasSideEffects (U) + +# ALL: [1] [2] [3] [4] [5] [6] Instructions: + +# M3-NEXT: 3 7 1.00 * ld2 { v0.s, v1.s }[0], [sp] +# M3-NEXT: 2 5 1.00 * ld2r { v0.2s, v1.2s }, [sp] +# M3-NEXT: 2 10 5.00 * ld2 { v0.2s, v1.2s }, [sp] +# M3-NEXT: 3 6 1.00 * ld2 { v0.d, v1.d }[0], [sp] +# M3-NEXT: 2 5 1.00 * ld2r { v0.2d, v1.2d }, [sp] +# M3-NEXT: 2 10 5.00 * ld2 { v0.2d, v1.2d }, [sp] +# M3-NEXT: 4 7 1.00 * ld2 { v0.s, v1.s }[0], [sp], #8 +# M3-NEXT: 3 5 1.00 * ld2r { v0.2s, v1.2s }, [sp], #8 +# M3-NEXT: 3 10 5.00 * ld2 { v0.2s, v1.2s }, [sp], #16 +# M3-NEXT: 4 6 1.00 * ld2 { v0.d, v1.d }[0], [sp], #16 +# M3-NEXT: 3 5 1.00 * ld2r { v0.2d, v1.2d }, [sp], #16 +# M3-NEXT: 3 10 5.00 * ld2 { v0.2d, v1.2d }, [sp], #32 +# M3-NEXT: 4 7 1.00 * ld2 { v0.s, v1.s }[0], [sp], x0 +# M3-NEXT: 3 5 1.00 * ld2r { v0.2s, v1.2s }, [sp], x0 +# M3-NEXT: 3 10 5.00 * ld2 { v0.2s, v1.2s }, [sp], x0 +# M3-NEXT: 4 6 1.00 * ld2 { v0.d, v1.d }[0], [sp], x0 +# M3-NEXT: 3 5 1.00 * ld2r { v0.2d, v1.2d }, [sp], x0 +# M3-NEXT: 3 10 5.00 * ld2 { v0.2d, v1.2d }, [sp], x0 + +# M4-NEXT: 3 6 1.00 * ld2 { v0.s, v1.s }[0], [sp] +# M4-NEXT: 2 5 1.00 * ld2r { v0.2s, v1.2s }, [sp] +# M4-NEXT: 2 10 3.00 * ld2 { v0.2s, v1.2s }, [sp] +# M4-NEXT: 3 6 1.00 * ld2 { v0.d, v1.d }[0], [sp] +# M4-NEXT: 2 5 1.00 * 
ld2r { v0.2d, v1.2d }, [sp] +# M4-NEXT: 2 10 3.00 * ld2 { v0.2d, v1.2d }, [sp] +# M4-NEXT: 4 6 1.00 * ld2 { v0.s, v1.s }[0], [sp], #8 +# M4-NEXT: 3 5 1.00 * ld2r { v0.2s, v1.2s }, [sp], #8 +# M4-NEXT: 3 10 3.00 * ld2 { v0.2s, v1.2s }, [sp], #16 +# M4-NEXT: 4 6 1.00 * ld2 { v0.d, v1.d }[0], [sp], #16 +# M4-NEXT: 3 5 1.00 * ld2r { v0.2d, v1.2d }, [sp], #16 +# M4-NEXT: 3 10 3.00 * ld2 { v0.2d, v1.2d }, [sp], #32 +# M4-NEXT: 4 6 1.00 * ld2 { v0.s, v1.s }[0], [sp], x0 +# M4-NEXT: 3 5 1.00 * ld2r { v0.2s, v1.2s }, [sp], x0 +# M4-NEXT: 3 10 3.00 * ld2 { v0.2s, v1.2s }, [sp], x0 +# M4-NEXT: 4 6 1.00 * ld2 { v0.d, v1.d }[0], [sp], x0 +# M4-NEXT: 3 5 1.00 * ld2r { v0.2d, v1.2d }, [sp], x0 +# M4-NEXT: 3 10 3.00 * ld2 { v0.2d, v1.2d }, [sp], x0 + +# M5-NEXT: 3 7 1.00 * ld2 { v0.s, v1.s }[0], [sp] +# M5-NEXT: 2 6 1.00 * ld2r { v0.2s, v1.2s }, [sp] +# M5-NEXT: 2 11 5.50 * ld2 { v0.2s, v1.2s }, [sp] +# M5-NEXT: 3 7 1.00 * ld2 { v0.d, v1.d }[0], [sp] +# M5-NEXT: 2 6 1.00 * ld2r { v0.2d, v1.2d }, [sp] +# M5-NEXT: 2 11 5.50 * ld2 { v0.2d, v1.2d }, [sp] +# M5-NEXT: 4 7 1.00 * ld2 { v0.s, v1.s }[0], [sp], #8 +# M5-NEXT: 3 6 1.00 * ld2r { v0.2s, v1.2s }, [sp], #8 +# M5-NEXT: 3 11 5.50 * ld2 { v0.2s, v1.2s }, [sp], #16 +# M5-NEXT: 4 7 1.00 * ld2 { v0.d, v1.d }[0], [sp], #16 +# M5-NEXT: 3 6 1.00 * ld2r { v0.2d, v1.2d }, [sp], #16 +# M5-NEXT: 3 11 5.50 * ld2 { v0.2d, v1.2d }, [sp], #32 +# M5-NEXT: 4 7 1.00 * ld2 { v0.s, v1.s }[0], [sp], x0 +# M5-NEXT: 3 6 1.00 * ld2r { v0.2s, v1.2s }, [sp], x0 +# M5-NEXT: 3 11 5.50 * ld2 { v0.2s, v1.2s }, [sp], x0 +# M5-NEXT: 4 7 1.00 * ld2 { v0.d, v1.d }[0], [sp], x0 +# M5-NEXT: 3 6 1.00 * ld2r { v0.2d, v1.2d }, [sp], x0 +# M5-NEXT: 3 11 5.50 * ld2 { v0.2d, v1.2d }, [sp], x0 diff --git a/test/tools/llvm-mca/AArch64/Exynos/asimd-ld3.s b/test/tools/llvm-mca/AArch64/Exynos/asimd-ld3.s new file mode 100644 index 00000000000..a6a89434754 --- /dev/null +++ b/test/tools/llvm-mca/AArch64/Exynos/asimd-ld3.s @@ -0,0 +1,118 @@ +# NOTE: Assertions have been 
autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m3 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M3 +# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m4 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M4 +# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m5 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M5 + +ld3 {v0.s, v1.s, v2.s}[0], [sp] +ld3r {v0.2s, v1.2s, v2.2s}, [sp] +ld3 {v0.2s, v1.2s, v2.2s}, [sp] + +ld3 {v0.d, v1.d, v2.d}[0], [sp] +ld3r {v0.2d, v1.2d, v2.2d}, [sp] +ld3 {v0.2d, v1.2d, v2.2d}, [sp] + +ld3 {v0.s, v1.s, v2.s}[0], [sp], #12 +ld3r {v0.2s, v1.2s, v2.2s}, [sp], #12 +ld3 {v0.2s, v1.2s, v2.2s}, [sp], #24 + +ld3 {v0.d, v1.d, v2.d}[0], [sp], #24 +ld3r {v0.2d, v1.2d, v2.2d}, [sp], #24 +ld3 {v0.2d, v1.2d, v2.2d}, [sp], #48 + +ld3 {v0.s, v1.s, v2.s}[0], [sp], x0 +ld3r {v0.2s, v1.2s, v2.2s}, [sp], x0 +ld3 {v0.2s, v1.2s, v2.2s}, [sp], x0 + +ld3 {v0.d, v1.d, v2.d}[0], [sp], x0 +ld3r {v0.2d, v1.2d, v2.2d}, [sp], x0 +ld3 {v0.2d, v1.2d, v2.2d}, [sp], x0 + +# ALL: Iterations: 100 +# ALL-NEXT: Instructions: 1800 + +# M3-NEXT: Total Cycles: 12501 +# M4-NEXT: Total Cycles: 11804 +# M5-NEXT: Total Cycles: 12903 + +# ALL-NEXT: Total uOps: 7500 + +# ALL: Dispatch Width: 6 + +# M3-NEXT: uOps Per Cycle: 0.60 +# M3-NEXT: IPC: 0.14 +# M3-NEXT: Block RThroughput: 84.0 + +# M4-NEXT: uOps Per Cycle: 0.64 +# M4-NEXT: IPC: 0.15 +# M4-NEXT: Block RThroughput: 54.0 + +# M5-NEXT: uOps Per Cycle: 0.58 +# M5-NEXT: IPC: 0.14 +# M5-NEXT: Block RThroughput: 22.5 + +# ALL: Instruction Info: +# ALL-NEXT: [1]: #uOps +# ALL-NEXT: [2]: Latency +# ALL-NEXT: [3]: RThroughput +# ALL-NEXT: [4]: MayLoad +# ALL-NEXT: [5]: MayStore +# ALL-NEXT: [6]: HasSideEffects (U) + +# ALL: [1] [2] [3] [4] [5] [6] Instructions: + +# M3-NEXT: 4 7 1.00 * ld3 { v0.s, v1.s, v2.s }[0], [sp] +# M3-NEXT: 3 6 1.50 * ld3r { v0.2s, v1.2s, v2.2s }, [sp] +# M3-NEXT: 3 12 9.00 * ld3 { v0.2s, v1.2s, v2.2s }, [sp] +# M3-NEXT: 5 6 6.00 
* ld3 { v0.d, v1.d, v2.d }[0], [sp] +# M3-NEXT: 3 6 1.50 * ld3r { v0.2d, v1.2d, v2.2d }, [sp] +# M3-NEXT: 3 12 9.00 * ld3 { v0.2d, v1.2d, v2.2d }, [sp] +# M3-NEXT: 5 7 1.00 * ld3 { v0.s, v1.s, v2.s }[0], [sp], #12 +# M3-NEXT: 4 6 1.50 * ld3r { v0.2s, v1.2s, v2.2s }, [sp], #12 +# M3-NEXT: 4 12 9.00 * ld3 { v0.2s, v1.2s, v2.2s }, [sp], #24 +# M3-NEXT: 6 6 6.00 * ld3 { v0.d, v1.d, v2.d }[0], [sp], #24 +# M3-NEXT: 4 6 1.50 * ld3r { v0.2d, v1.2d, v2.2d }, [sp], #24 +# M3-NEXT: 4 12 9.00 * ld3 { v0.2d, v1.2d, v2.2d }, [sp], #48 +# M3-NEXT: 5 7 1.00 * ld3 { v0.s, v1.s, v2.s }[0], [sp], x0 +# M3-NEXT: 4 6 1.50 * ld3r { v0.2s, v1.2s, v2.2s }, [sp], x0 +# M3-NEXT: 4 12 9.00 * ld3 { v0.2s, v1.2s, v2.2s }, [sp], x0 +# M3-NEXT: 6 6 6.00 * ld3 { v0.d, v1.d, v2.d }[0], [sp], x0 +# M3-NEXT: 4 6 1.50 * ld3r { v0.2d, v1.2d, v2.2d }, [sp], x0 +# M3-NEXT: 4 12 9.00 * ld3 { v0.2d, v1.2d, v2.2d }, [sp], x0 + +# M4-NEXT: 4 7 1.50 * ld3 { v0.s, v1.s, v2.s }[0], [sp] +# M4-NEXT: 3 6 1.50 * ld3r { v0.2s, v1.2s, v2.2s }, [sp] +# M4-NEXT: 3 12 4.50 * ld3 { v0.2s, v1.2s, v2.2s }, [sp] +# M4-NEXT: 5 7 4.50 * ld3 { v0.d, v1.d, v2.d }[0], [sp] +# M4-NEXT: 3 6 1.50 * ld3r { v0.2d, v1.2d, v2.2d }, [sp] +# M4-NEXT: 3 12 4.50 * ld3 { v0.2d, v1.2d, v2.2d }, [sp] +# M4-NEXT: 5 7 1.50 * ld3 { v0.s, v1.s, v2.s }[0], [sp], #12 +# M4-NEXT: 4 6 1.50 * ld3r { v0.2s, v1.2s, v2.2s }, [sp], #12 +# M4-NEXT: 4 12 4.50 * ld3 { v0.2s, v1.2s, v2.2s }, [sp], #24 +# M4-NEXT: 6 7 4.50 * ld3 { v0.d, v1.d, v2.d }[0], [sp], #24 +# M4-NEXT: 4 6 1.50 * ld3r { v0.2d, v1.2d, v2.2d }, [sp], #24 +# M4-NEXT: 4 12 4.50 * ld3 { v0.2d, v1.2d, v2.2d }, [sp], #48 +# M4-NEXT: 5 7 1.50 * ld3 { v0.s, v1.s, v2.s }[0], [sp], x0 +# M4-NEXT: 4 6 1.50 * ld3r { v0.2s, v1.2s, v2.2s }, [sp], x0 +# M4-NEXT: 4 12 4.50 * ld3 { v0.2s, v1.2s, v2.2s }, [sp], x0 +# M4-NEXT: 6 7 4.50 * ld3 { v0.d, v1.d, v2.d }[0], [sp], x0 +# M4-NEXT: 4 6 1.50 * ld3r { v0.2d, v1.2d, v2.2d }, [sp], x0 +# M4-NEXT: 4 12 4.50 * ld3 { v0.2d, v1.2d, v2.2d }, [sp], x0 + +# 
M5-NEXT: 4 8 1.50 * ld3 { v0.s, v1.s, v2.s }[0], [sp] +# M5-NEXT: 3 7 1.50 * ld3r { v0.2s, v1.2s, v2.2s }, [sp] +# M5-NEXT: 3 13 1.50 * ld3 { v0.2s, v1.2s, v2.2s }, [sp] +# M5-NEXT: 5 8 1.50 * ld3 { v0.d, v1.d, v2.d }[0], [sp] +# M5-NEXT: 3 7 1.50 * ld3r { v0.2d, v1.2d, v2.2d }, [sp] +# M5-NEXT: 3 13 1.50 * ld3 { v0.2d, v1.2d, v2.2d }, [sp] +# M5-NEXT: 5 8 1.50 * ld3 { v0.s, v1.s, v2.s }[0], [sp], #12 +# M5-NEXT: 4 7 1.50 * ld3r { v0.2s, v1.2s, v2.2s }, [sp], #12 +# M5-NEXT: 4 13 1.50 * ld3 { v0.2s, v1.2s, v2.2s }, [sp], #24 +# M5-NEXT: 6 8 1.50 * ld3 { v0.d, v1.d, v2.d }[0], [sp], #24 +# M5-NEXT: 4 7 1.50 * ld3r { v0.2d, v1.2d, v2.2d }, [sp], #24 +# M5-NEXT: 4 13 1.50 * ld3 { v0.2d, v1.2d, v2.2d }, [sp], #48 +# M5-NEXT: 5 8 1.50 * ld3 { v0.s, v1.s, v2.s }[0], [sp], x0 +# M5-NEXT: 4 7 1.50 * ld3r { v0.2s, v1.2s, v2.2s }, [sp], x0 +# M5-NEXT: 4 13 1.50 * ld3 { v0.2s, v1.2s, v2.2s }, [sp], x0 +# M5-NEXT: 6 8 1.50 * ld3 { v0.d, v1.d, v2.d }[0], [sp], x0 +# M5-NEXT: 4 7 1.50 * ld3r { v0.2d, v1.2d, v2.2d }, [sp], x0 +# M5-NEXT: 4 13 1.50 * ld3 { v0.2d, v1.2d, v2.2d }, [sp], x0 diff --git a/test/tools/llvm-mca/AArch64/Exynos/asimd-ld4.s b/test/tools/llvm-mca/AArch64/Exynos/asimd-ld4.s new file mode 100644 index 00000000000..c5f2c9b7ec4 --- /dev/null +++ b/test/tools/llvm-mca/AArch64/Exynos/asimd-ld4.s @@ -0,0 +1,118 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m3 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M3 +# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m4 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M4 +# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m5 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M5 + +ld4 {v0.s, v1.s, v2.s, v3.s}[0], [sp] +ld4r {v0.2s, v1.2s, v2.2s, v3.2s}, [sp] +ld4 {v0.2s, v1.2s, v2.2s, v3.2s}, [sp] + +ld4 {v0.d, v1.d, v2.d, v3.d}[0], [sp] +ld4r {v0.2d, v1.2d, v2.2d, v3.2d}, [sp] +ld4 {v0.2d, v1.2d, v2.2d, 
v3.2d}, [sp] + +ld4 {v0.s, v1.s, v2.s, v3.s}[0], [sp], #16 +ld4r {v0.2s, v1.2s, v2.2s, v3.2s}, [sp], #16 +ld4 {v0.2s, v1.2s, v2.2s, v3.2s}, [sp], #32 + +ld4 {v0.d, v1.d, v2.d, v3.d}[0], [sp], #32 +ld4r {v0.2d, v1.2d, v2.2d, v3.2d}, [sp], #32 +ld4 {v0.2d, v1.2d, v2.2d, v3.2d}, [sp], #64 + +ld4 {v0.s, v1.s, v2.s, v3.s}[0], [sp], x0 +ld4r {v0.2s, v1.2s, v2.2s, v3.2s}, [sp], x0 +ld4 {v0.2s, v1.2s, v2.2s, v3.2s}, [sp], x0 + +ld4 {v0.d, v1.d, v2.d, v3.d}[0], [sp], x0 +ld4r {v0.2d, v1.2d, v2.2d, v3.2d}, [sp], x0 +ld4 {v0.2d, v1.2d, v2.2d, v3.2d}, [sp], x0 + +# ALL: Iterations: 100 +# ALL-NEXT: Instructions: 1800 + +# M3-NEXT: Total Cycles: 15598 +# M4-NEXT: Total Cycles: 13004 +# M5-NEXT: Total Cycles: 14304 + +# ALL-NEXT: Total uOps: 9300 + +# ALL: Dispatch Width: 6 + +# M3-NEXT: uOps Per Cycle: 0.60 +# M3-NEXT: IPC: 0.12 +# M3-NEXT: Block RThroughput: 108.0 + +# M4-NEXT: uOps Per Cycle: 0.72 +# M4-NEXT: IPC: 0.14 +# M4-NEXT: Block RThroughput: 61.5 + +# M5-NEXT: uOps Per Cycle: 0.65 +# M5-NEXT: IPC: 0.13 +# M5-NEXT: Block RThroughput: 40.5 + +# ALL: Instruction Info: +# ALL-NEXT: [1]: #uOps +# ALL-NEXT: [2]: Latency +# ALL-NEXT: [3]: RThroughput +# ALL-NEXT: [4]: MayLoad +# ALL-NEXT: [5]: MayStore +# ALL-NEXT: [6]: HasSideEffects (U) + +# ALL: [1] [2] [3] [4] [5] [6] Instructions: + +# M3-NEXT: 5 9 2.00 * ld4 { v0.s, v1.s, v2.s, v3.s }[0], [sp] +# M3-NEXT: 4 6 2.00 * ld4r { v0.2s, v1.2s, v2.2s, v3.2s }, [sp] +# M3-NEXT: 4 14 12.00 * ld4 { v0.2s, v1.2s, v2.2s, v3.2s }, [sp] +# M3-NEXT: 6 7 6.00 * ld4 { v0.d, v1.d, v2.d, v3.d }[0], [sp] +# M3-NEXT: 4 6 2.00 * ld4r { v0.2d, v1.2d, v2.2d, v3.2d }, [sp] +# M3-NEXT: 4 14 12.00 * ld4 { v0.2d, v1.2d, v2.2d, v3.2d }, [sp] +# M3-NEXT: 6 9 2.00 * ld4 { v0.s, v1.s, v2.s, v3.s }[0], [sp], #16 +# M3-NEXT: 5 6 2.00 * ld4r { v0.2s, v1.2s, v2.2s, v3.2s }, [sp], #16 +# M3-NEXT: 5 14 12.00 * ld4 { v0.2s, v1.2s, v2.2s, v3.2s }, [sp], #32 +# M3-NEXT: 7 7 6.00 * ld4 { v0.d, v1.d, v2.d, v3.d }[0], [sp], #32 +# M3-NEXT: 5 6 2.00 * ld4r { 
v0.2d, v1.2d, v2.2d, v3.2d }, [sp], #32 +# M3-NEXT: 5 14 12.00 * ld4 { v0.2d, v1.2d, v2.2d, v3.2d }, [sp], #64 +# M3-NEXT: 6 9 2.00 * ld4 { v0.s, v1.s, v2.s, v3.s }[0], [sp], x0 +# M3-NEXT: 5 6 2.00 * ld4r { v0.2s, v1.2s, v2.2s, v3.2s }, [sp], x0 +# M3-NEXT: 5 14 12.00 * ld4 { v0.2s, v1.2s, v2.2s, v3.2s }, [sp], x0 +# M3-NEXT: 7 7 6.00 * ld4 { v0.d, v1.d, v2.d, v3.d }[0], [sp], x0 +# M3-NEXT: 5 6 2.00 * ld4r { v0.2d, v1.2d, v2.2d, v3.2d }, [sp], x0 +# M3-NEXT: 5 14 12.00 * ld4 { v0.2d, v1.2d, v2.2d, v3.2d }, [sp], x0 + +# M4-NEXT: 5 7 2.00 * ld4 { v0.s, v1.s, v2.s, v3.s }[0], [sp] +# M4-NEXT: 4 6 2.00 * ld4r { v0.2s, v1.2s, v2.2s, v3.2s }, [sp] +# M4-NEXT: 4 14 6.00 * ld4 { v0.2s, v1.2s, v2.2s, v3.2s }, [sp] +# M4-NEXT: 6 7 3.00 * ld4 { v0.d, v1.d, v2.d, v3.d }[0], [sp] +# M4-NEXT: 4 6 2.00 * ld4r { v0.2d, v1.2d, v2.2d, v3.2d }, [sp] +# M4-NEXT: 4 14 6.00 * ld4 { v0.2d, v1.2d, v2.2d, v3.2d }, [sp] +# M4-NEXT: 6 7 2.00 * ld4 { v0.s, v1.s, v2.s, v3.s }[0], [sp], #16 +# M4-NEXT: 5 6 2.00 * ld4r { v0.2s, v1.2s, v2.2s, v3.2s }, [sp], #16 +# M4-NEXT: 5 14 6.00 * ld4 { v0.2s, v1.2s, v2.2s, v3.2s }, [sp], #32 +# M4-NEXT: 7 7 3.00 * ld4 { v0.d, v1.d, v2.d, v3.d }[0], [sp], #32 +# M4-NEXT: 5 6 2.00 * ld4r { v0.2d, v1.2d, v2.2d, v3.2d }, [sp], #32 +# M4-NEXT: 5 14 6.00 * ld4 { v0.2d, v1.2d, v2.2d, v3.2d }, [sp], #64 +# M4-NEXT: 6 7 2.00 * ld4 { v0.s, v1.s, v2.s, v3.s }[0], [sp], x0 +# M4-NEXT: 5 6 2.00 * ld4r { v0.2s, v1.2s, v2.2s, v3.2s }, [sp], x0 +# M4-NEXT: 5 14 6.00 * ld4 { v0.2s, v1.2s, v2.2s, v3.2s }, [sp], x0 +# M4-NEXT: 7 7 3.00 * ld4 { v0.d, v1.d, v2.d, v3.d }[0], [sp], x0 +# M4-NEXT: 5 6 2.00 * ld4r { v0.2d, v1.2d, v2.2d, v3.2d }, [sp], x0 +# M4-NEXT: 5 14 6.00 * ld4 { v0.2d, v1.2d, v2.2d, v3.2d }, [sp], x0 + +# M5-NEXT: 5 8 2.00 * ld4 { v0.s, v1.s, v2.s, v3.s }[0], [sp] +# M5-NEXT: 4 7 2.00 * ld4r { v0.2s, v1.2s, v2.2s, v3.2s }, [sp] +# M5-NEXT: 4 15 4.00 * ld4 { v0.2s, v1.2s, v2.2s, v3.2s }, [sp] +# M5-NEXT: 6 8 2.00 * ld4 { v0.d, v1.d, v2.d, v3.d }[0], [sp] +# 
M5-NEXT: 4 7 2.00 * ld4r { v0.2d, v1.2d, v2.2d, v3.2d }, [sp] +# M5-NEXT: 4 15 4.00 * ld4 { v0.2d, v1.2d, v2.2d, v3.2d }, [sp] +# M5-NEXT: 6 8 2.00 * ld4 { v0.s, v1.s, v2.s, v3.s }[0], [sp], #16 +# M5-NEXT: 5 7 2.00 * ld4r { v0.2s, v1.2s, v2.2s, v3.2s }, [sp], #16 +# M5-NEXT: 5 15 4.00 * ld4 { v0.2s, v1.2s, v2.2s, v3.2s }, [sp], #32 +# M5-NEXT: 7 8 2.00 * ld4 { v0.d, v1.d, v2.d, v3.d }[0], [sp], #32 +# M5-NEXT: 5 7 2.00 * ld4r { v0.2d, v1.2d, v2.2d, v3.2d }, [sp], #32 +# M5-NEXT: 5 15 4.00 * ld4 { v0.2d, v1.2d, v2.2d, v3.2d }, [sp], #64 +# M5-NEXT: 6 8 2.00 * ld4 { v0.s, v1.s, v2.s, v3.s }[0], [sp], x0 +# M5-NEXT: 5 7 2.00 * ld4r { v0.2s, v1.2s, v2.2s, v3.2s }, [sp], x0 +# M5-NEXT: 5 15 4.00 * ld4 { v0.2s, v1.2s, v2.2s, v3.2s }, [sp], x0 +# M5-NEXT: 7 8 2.00 * ld4 { v0.d, v1.d, v2.d, v3.d }[0], [sp], x0 +# M5-NEXT: 5 7 2.00 * ld4r { v0.2d, v1.2d, v2.2d, v3.2d }, [sp], x0 +# M5-NEXT: 5 15 4.00 * ld4 { v0.2d, v1.2d, v2.2d, v3.2d }, [sp], x0 diff --git a/test/tools/llvm-mca/AArch64/Exynos/asimd-st1.s b/test/tools/llvm-mca/AArch64/Exynos/asimd-st1.s new file mode 100644 index 00000000000..81e5fe84ad0 --- /dev/null +++ b/test/tools/llvm-mca/AArch64/Exynos/asimd-st1.s @@ -0,0 +1,169 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m3 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M3 +# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m4 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M4 +# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m5 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M5 + +st1 {v0.s}[0], [sp] +st1 {v0.2s}, [sp] +st1 {v0.2s, v1.2s}, [sp] +st1 {v0.2s, v1.2s, v2.2s}, [sp] +st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [sp] + +st1 {v0.d}[0], [sp] +st1 {v0.2d}, [sp] +st1 {v0.2d, v1.2d}, [sp] +st1 {v0.2d, v1.2d, v2.2d}, [sp] +st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [sp] + +st1 {v0.s}[0], [sp], #4 +st1 {v0.2s}, [sp], #8 +st1 {v0.2s, v1.2s}, [sp], #16 +st1 
{v0.2s, v1.2s, v2.2s}, [sp], #24 +st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [sp], #32 + +st1 {v0.d}[0], [sp], #8 +st1 {v0.2d}, [sp], #16 +st1 {v0.2d, v1.2d}, [sp], #32 +st1 {v0.2d, v1.2d, v2.2d}, [sp], #48 +st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [sp], #64 + +st1 {v0.s}[0], [sp], x0 +st1 {v0.2s}, [sp], x0 +st1 {v0.2s, v1.2s}, [sp], x0 +st1 {v0.2s, v1.2s, v2.2s}, [sp], x0 +st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [sp], x0 + +st1 {v0.d}[0], [sp], x0 +st1 {v0.2d}, [sp], x0 +st1 {v0.2d, v1.2d}, [sp], x0 +st1 {v0.2d, v1.2d, v2.2d}, [sp], x0 +st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [sp], x0 + +# ALL: Iterations: 100 +# ALL-NEXT: Instructions: 3000 + +# M3-NEXT: Total Cycles: 10203 +# M3-NEXT: Total uOps: 8400 + +# M4-NEXT: Total Cycles: 6603 +# M4-NEXT: Total uOps: 8600 + +# M5-NEXT: Total Cycles: 6603 +# M5-NEXT: Total uOps: 8600 + +# ALL: Dispatch Width: 6 + +# M3-NEXT: uOps Per Cycle: 0.82 +# M3-NEXT: IPC: 0.29 +# M3-NEXT: Block RThroughput: 72.0 + +# M4-NEXT: uOps Per Cycle: 1.30 +# M4-NEXT: IPC: 0.45 +# M4-NEXT: Block RThroughput: 33.0 + +# M5-NEXT: uOps Per Cycle: 1.30 +# M5-NEXT: IPC: 0.45 +# M5-NEXT: Block RThroughput: 33.0 + +# ALL: Instruction Info: +# ALL-NEXT: [1]: #uOps +# ALL-NEXT: [2]: Latency +# ALL-NEXT: [3]: RThroughput +# ALL-NEXT: [4]: MayLoad +# ALL-NEXT: [5]: MayStore +# ALL-NEXT: [6]: HasSideEffects (U) + +# ALL: [1] [2] [3] [4] [5] [6] Instructions: + +# M3-NEXT: 4 7 3.00 * st1 { v0.s }[0], [sp] +# M3-NEXT: 1 1 1.00 * st1 { v0.2s }, [sp] +# M3-NEXT: 2 2 2.00 * st1 { v0.2s, v1.2s }, [sp] +# M3-NEXT: 3 3 3.00 * st1 { v0.2s, v1.2s, v2.2s }, [sp] +# M3-NEXT: 4 4 4.00 * st1 { v0.2s, v1.2s, v2.2s, v3.2s }, [sp] +# M3-NEXT: 4 7 3.00 * st1 { v0.d }[0], [sp] +# M3-NEXT: 1 1 1.00 * st1 { v0.2d }, [sp] +# M3-NEXT: 2 2 2.00 * st1 { v0.2d, v1.2d }, [sp] +# M3-NEXT: 3 3 3.00 * st1 { v0.2d, v1.2d, v2.2d }, [sp] +# M3-NEXT: 4 4 4.00 * st1 { v0.2d, v1.2d, v2.2d, v3.2d }, [sp] +# M3-NEXT: 4 7 3.00 * st1 { v0.s }[0], [sp], #4 +# M3-NEXT: 1 1 1.00 * st1 { v0.2s }, [sp], #8 +# M3-NEXT: 2 2 2.00 
* st1 { v0.2s, v1.2s }, [sp], #16 +# M3-NEXT: 3 3 3.00 * st1 { v0.2s, v1.2s, v2.2s }, [sp], #24 +# M3-NEXT: 4 4 4.00 * st1 { v0.2s, v1.2s, v2.2s, v3.2s }, [sp], #32 +# M3-NEXT: 4 7 3.00 * st1 { v0.d }[0], [sp], #8 +# M3-NEXT: 1 1 1.00 * st1 { v0.2d }, [sp], #16 +# M3-NEXT: 2 2 2.00 * st1 { v0.2d, v1.2d }, [sp], #32 +# M3-NEXT: 3 3 3.00 * st1 { v0.2d, v1.2d, v2.2d }, [sp], #48 +# M3-NEXT: 4 4 4.00 * st1 { v0.2d, v1.2d, v2.2d, v3.2d }, [sp], #64 +# M3-NEXT: 4 7 3.00 * st1 { v0.s }[0], [sp], x0 +# M3-NEXT: 1 1 1.00 * st1 { v0.2s }, [sp], x0 +# M3-NEXT: 2 2 2.00 * st1 { v0.2s, v1.2s }, [sp], x0 +# M3-NEXT: 3 3 3.00 * st1 { v0.2s, v1.2s, v2.2s }, [sp], x0 +# M3-NEXT: 4 4 4.00 * st1 { v0.2s, v1.2s, v2.2s, v3.2s }, [sp], x0 +# M3-NEXT: 4 7 3.00 * st1 { v0.d }[0], [sp], x0 +# M3-NEXT: 1 1 1.00 * st1 { v0.2d }, [sp], x0 +# M3-NEXT: 2 2 2.00 * st1 { v0.2d, v1.2d }, [sp], x0 +# M3-NEXT: 3 3 3.00 * st1 { v0.2d, v1.2d, v2.2d }, [sp], x0 +# M3-NEXT: 4 4 4.00 * st1 { v0.2d, v1.2d, v2.2d, v3.2d }, [sp], x0 + +# M4-NEXT: 1 1 0.50 * st1 { v0.s }[0], [sp] +# M4-NEXT: 1 1 0.50 * st1 { v0.2s }, [sp] +# M4-NEXT: 2 2 1.00 * st1 { v0.2s, v1.2s }, [sp] +# M4-NEXT: 3 3 1.50 * st1 { v0.2s, v1.2s, v2.2s }, [sp] +# M4-NEXT: 4 4 2.00 * st1 { v0.2s, v1.2s, v2.2s, v3.2s }, [sp] +# M4-NEXT: 1 1 0.50 * st1 { v0.d }[0], [sp] +# M4-NEXT: 1 1 0.50 * st1 { v0.2d }, [sp] +# M4-NEXT: 2 2 1.00 * st1 { v0.2d, v1.2d }, [sp] +# M4-NEXT: 3 3 1.50 * st1 { v0.2d, v1.2d, v2.2d }, [sp] +# M4-NEXT: 4 4 2.00 * st1 { v0.2d, v1.2d, v2.2d, v3.2d }, [sp] +# M4-NEXT: 2 1 0.50 * st1 { v0.s }[0], [sp], #4 +# M4-NEXT: 2 1 0.50 * st1 { v0.2s }, [sp], #8 +# M4-NEXT: 3 2 1.00 * st1 { v0.2s, v1.2s }, [sp], #16 +# M4-NEXT: 4 3 1.50 * st1 { v0.2s, v1.2s, v2.2s }, [sp], #24 +# M4-NEXT: 5 4 2.00 * st1 { v0.2s, v1.2s, v2.2s, v3.2s }, [sp], #32 +# M4-NEXT: 2 1 0.50 * st1 { v0.d }[0], [sp], #8 +# M4-NEXT: 2 1 0.50 * st1 { v0.2d }, [sp], #16 +# M4-NEXT: 3 2 1.00 * st1 { v0.2d, v1.2d }, [sp], #32 +# M4-NEXT: 4 3 1.50 * st1 { v0.2d, 
v1.2d, v2.2d }, [sp], #48 +# M4-NEXT: 5 4 2.00 * st1 { v0.2d, v1.2d, v2.2d, v3.2d }, [sp], #64 +# M4-NEXT: 2 1 0.50 * st1 { v0.s }[0], [sp], x0 +# M4-NEXT: 2 1 0.50 * st1 { v0.2s }, [sp], x0 +# M4-NEXT: 3 2 1.00 * st1 { v0.2s, v1.2s }, [sp], x0 +# M4-NEXT: 4 3 1.50 * st1 { v0.2s, v1.2s, v2.2s }, [sp], x0 +# M4-NEXT: 5 4 2.00 * st1 { v0.2s, v1.2s, v2.2s, v3.2s }, [sp], x0 +# M4-NEXT: 2 1 0.50 * st1 { v0.d }[0], [sp], x0 +# M4-NEXT: 2 1 0.50 * st1 { v0.2d }, [sp], x0 +# M4-NEXT: 3 2 1.00 * st1 { v0.2d, v1.2d }, [sp], x0 +# M4-NEXT: 4 3 1.50 * st1 { v0.2d, v1.2d, v2.2d }, [sp], x0 +# M4-NEXT: 5 4 2.00 * st1 { v0.2d, v1.2d, v2.2d, v3.2d }, [sp], x0 + +# M5-NEXT: 1 1 0.50 * st1 { v0.s }[0], [sp] +# M5-NEXT: 1 1 0.50 * st1 { v0.2s }, [sp] +# M5-NEXT: 2 2 1.00 * st1 { v0.2s, v1.2s }, [sp] +# M5-NEXT: 3 3 1.50 * st1 { v0.2s, v1.2s, v2.2s }, [sp] +# M5-NEXT: 4 4 2.00 * st1 { v0.2s, v1.2s, v2.2s, v3.2s }, [sp] +# M5-NEXT: 1 1 0.50 * st1 { v0.d }[0], [sp] +# M5-NEXT: 1 1 0.50 * st1 { v0.2d }, [sp] +# M5-NEXT: 2 2 1.00 * st1 { v0.2d, v1.2d }, [sp] +# M5-NEXT: 3 3 1.50 * st1 { v0.2d, v1.2d, v2.2d }, [sp] +# M5-NEXT: 4 4 2.00 * st1 { v0.2d, v1.2d, v2.2d, v3.2d }, [sp] +# M5-NEXT: 2 1 0.50 * st1 { v0.s }[0], [sp], #4 +# M5-NEXT: 2 1 0.50 * st1 { v0.2s }, [sp], #8 +# M5-NEXT: 3 2 1.00 * st1 { v0.2s, v1.2s }, [sp], #16 +# M5-NEXT: 4 3 1.50 * st1 { v0.2s, v1.2s, v2.2s }, [sp], #24 +# M5-NEXT: 5 4 2.00 * st1 { v0.2s, v1.2s, v2.2s, v3.2s }, [sp], #32 +# M5-NEXT: 2 1 0.50 * st1 { v0.d }[0], [sp], #8 +# M5-NEXT: 2 1 0.50 * st1 { v0.2d }, [sp], #16 +# M5-NEXT: 3 2 1.00 * st1 { v0.2d, v1.2d }, [sp], #32 +# M5-NEXT: 4 3 1.50 * st1 { v0.2d, v1.2d, v2.2d }, [sp], #48 +# M5-NEXT: 5 4 2.00 * st1 { v0.2d, v1.2d, v2.2d, v3.2d }, [sp], #64 +# M5-NEXT: 2 1 0.50 * st1 { v0.s }[0], [sp], x0 +# M5-NEXT: 2 1 0.50 * st1 { v0.2s }, [sp], x0 +# M5-NEXT: 3 2 1.00 * st1 { v0.2s, v1.2s }, [sp], x0 +# M5-NEXT: 4 3 1.50 * st1 { v0.2s, v1.2s, v2.2s }, [sp], x0 +# M5-NEXT: 5 4 2.00 * st1 { v0.2s, v1.2s, v2.2s, 
v3.2s }, [sp], x0 +# M5-NEXT: 2 1 0.50 * st1 { v0.d }[0], [sp], x0 +# M5-NEXT: 2 1 0.50 * st1 { v0.2d }, [sp], x0 +# M5-NEXT: 3 2 1.00 * st1 { v0.2d, v1.2d }, [sp], x0 +# M5-NEXT: 4 3 1.50 * st1 { v0.2d, v1.2d, v2.2d }, [sp], x0 +# M5-NEXT: 5 4 2.00 * st1 { v0.2d, v1.2d, v2.2d, v3.2d }, [sp], x0 diff --git a/test/tools/llvm-mca/AArch64/Exynos/asimd-st2.s b/test/tools/llvm-mca/AArch64/Exynos/asimd-st2.s new file mode 100644 index 00000000000..9506241fef2 --- /dev/null +++ b/test/tools/llvm-mca/AArch64/Exynos/asimd-st2.s @@ -0,0 +1,97 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m3 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M3 +# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m4 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M4 +# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m5 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M5 + +st2 {v0.s, v1.s}[0], [sp] +st2 {v0.2s, v1.2s}, [sp] + +st2 {v0.d, v1.d}[0], [sp] +st2 {v0.2d, v1.2d}, [sp] + +st2 {v0.s, v1.s}[0], [sp], #8 +st2 {v0.2s, v1.2s}, [sp], #16 + +st2 {v0.d, v1.d}[0], [sp], #16 +st2 {v0.2d, v1.2d}, [sp], #32 + +st2 {v0.s, v1.s}[0], [sp], x0 +st2 {v0.2s, v1.2s}, [sp], x0 + +st2 {v0.d, v1.d}[0], [sp], x0 +st2 {v0.2d, v1.2d}, [sp], x0 + +# ALL: Iterations: 100 +# ALL-NEXT: Instructions: 1200 + +# M3-NEXT: Total Cycles: 8703 +# M3-NEXT: Total uOps: 5400 + +# M4-NEXT: Total Cycles: 2403 +# M4-NEXT: Total uOps: 2300 + +# M5-NEXT: Total Cycles: 2403 +# M5-NEXT: Total uOps: 2000 + +# ALL: Dispatch Width: 6 + +# M3-NEXT: uOps Per Cycle: 0.62 +# M3-NEXT: IPC: 0.14 +# M3-NEXT: Block RThroughput: 40.5 + +# M4-NEXT: uOps Per Cycle: 0.96 +# M4-NEXT: IPC: 0.50 +# M4-NEXT: Block RThroughput: 7.5 + +# M5-NEXT: uOps Per Cycle: 0.83 +# M5-NEXT: IPC: 0.50 +# M5-NEXT: Block RThroughput: 7.5 + +# ALL: Instruction Info: +# ALL-NEXT: [1]: #uOps +# ALL-NEXT: [2]: Latency +# ALL-NEXT: [3]: RThroughput +# 
ALL-NEXT: [4]: MayLoad +# ALL-NEXT: [5]: MayStore +# ALL-NEXT: [6]: HasSideEffects (U) + +# ALL: [1] [2] [3] [4] [5] [6] Instructions: + +# M3-NEXT: 4 7 3.00 * st2 { v0.s, v1.s }[0], [sp] +# M3-NEXT: 4 7 3.00 * st2 { v0.2s, v1.2s }, [sp] +# M3-NEXT: 4 7 3.00 * st2 { v0.d, v1.d }[0], [sp] +# M3-NEXT: 6 8 4.50 * st2 { v0.2d, v1.2d }, [sp] +# M3-NEXT: 4 7 3.00 * st2 { v0.s, v1.s }[0], [sp], #8 +# M3-NEXT: 4 7 3.00 * st2 { v0.2s, v1.2s }, [sp], #16 +# M3-NEXT: 4 7 3.00 * st2 { v0.d, v1.d }[0], [sp], #16 +# M3-NEXT: 6 8 4.50 * st2 { v0.2d, v1.2d }, [sp], #32 +# M3-NEXT: 4 7 3.00 * st2 { v0.s, v1.s }[0], [sp], x0 +# M3-NEXT: 4 7 3.00 * st2 { v0.2s, v1.2s }, [sp], x0 +# M3-NEXT: 4 7 3.00 * st2 { v0.d, v1.d }[0], [sp], x0 +# M3-NEXT: 6 8 4.50 * st2 { v0.2d, v1.2d }, [sp], x0 + +# M4-NEXT: 1 2 0.50 * st2 { v0.s, v1.s }[0], [sp] +# M4-NEXT: 1 2 0.50 * st2 { v0.2s, v1.2s }, [sp] +# M4-NEXT: 1 2 0.50 * st2 { v0.d, v1.d }[0], [sp] +# M4-NEXT: 2 2 1.00 * st2 { v0.2d, v1.2d }, [sp] +# M4-NEXT: 2 2 0.50 * st2 { v0.s, v1.s }[0], [sp], #8 +# M4-NEXT: 2 2 0.50 * st2 { v0.2s, v1.2s }, [sp], #16 +# M4-NEXT: 2 2 0.50 * st2 { v0.d, v1.d }[0], [sp], #16 +# M4-NEXT: 3 2 1.00 * st2 { v0.2d, v1.2d }, [sp], #32 +# M4-NEXT: 2 2 0.50 * st2 { v0.s, v1.s }[0], [sp], x0 +# M4-NEXT: 2 2 0.50 * st2 { v0.2s, v1.2s }, [sp], x0 +# M4-NEXT: 2 2 0.50 * st2 { v0.d, v1.d }[0], [sp], x0 +# M4-NEXT: 3 2 1.00 * st2 { v0.2d, v1.2d }, [sp], x0 + +# M5-NEXT: 1 2 0.50 * st2 { v0.s, v1.s }[0], [sp] +# M5-NEXT: 1 2 0.50 * st2 { v0.2s, v1.2s }, [sp] +# M5-NEXT: 1 2 0.50 * st2 { v0.d, v1.d }[0], [sp] +# M5-NEXT: 1 2 1.00 * st2 { v0.2d, v1.2d }, [sp] +# M5-NEXT: 2 2 0.50 * st2 { v0.s, v1.s }[0], [sp], #8 +# M5-NEXT: 2 2 0.50 * st2 { v0.2s, v1.2s }, [sp], #16 +# M5-NEXT: 2 2 0.50 * st2 { v0.d, v1.d }[0], [sp], #16 +# M5-NEXT: 2 2 1.00 * st2 { v0.2d, v1.2d }, [sp], #32 +# M5-NEXT: 2 2 0.50 * st2 { v0.s, v1.s }[0], [sp], x0 +# M5-NEXT: 2 2 0.50 * st2 { v0.2s, v1.2s }, [sp], x0 +# M5-NEXT: 2 2 0.50 * st2 { v0.d, v1.d 
}[0], [sp], x0 +# M5-NEXT: 2 2 1.00 * st2 { v0.2d, v1.2d }, [sp], x0 diff --git a/test/tools/llvm-mca/AArch64/Exynos/asimd-st3.s b/test/tools/llvm-mca/AArch64/Exynos/asimd-st3.s new file mode 100644 index 00000000000..4de5213d526 --- /dev/null +++ b/test/tools/llvm-mca/AArch64/Exynos/asimd-st3.s @@ -0,0 +1,97 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m3 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M3 +# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m4 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M4 +# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m5 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M5 + +st3 {v0.s, v1.s, v2.s}[0], [sp] +st3 {v0.2s, v1.2s, v2.2s}, [sp] + +st3 {v0.d, v1.d, v2.d}[0], [sp] +st3 {v0.2d, v1.2d, v2.2d}, [sp] + +st3 {v0.s, v1.s, v2.s}[0], [sp], #12 +st3 {v0.2s, v1.2s, v2.2s}, [sp], #24 + +st3 {v0.d, v1.d, v2.d}[0], [sp], #24 +st3 {v0.2d, v1.2d, v2.2d}, [sp], #48 + +st3 {v0.s, v1.s, v2.s}[0], [sp], x0 +st3 {v0.2s, v1.2s, v2.2s}, [sp], x0 + +st3 {v0.d, v1.d, v2.d}[0], [sp], x0 +st3 {v0.2d, v1.2d, v2.2d}, [sp], x0 + +# ALL: Iterations: 100 +# ALL-NEXT: Instructions: 1200 + +# M3-NEXT: Total Cycles: 18003 +# M3-NEXT: Total uOps: 8400 + +# M4-NEXT: Total Cycles: 3903 +# M4-NEXT: Total uOps: 5000 + +# M5-NEXT: Total Cycles: 3603 +# M5-NEXT: Total uOps: 4400 + +# ALL: Dispatch Width: 6 + +# M3-NEXT: uOps Per Cycle: 0.47 +# M3-NEXT: IPC: 0.07 +# M3-NEXT: Block RThroughput: 72.0 + +# M4-NEXT: uOps Per Cycle: 1.28 +# M4-NEXT: IPC: 0.31 +# M4-NEXT: Block RThroughput: 21.0 + +# M5-NEXT: uOps Per Cycle: 1.22 +# M5-NEXT: IPC: 0.33 +# M5-NEXT: Block RThroughput: 10.5 + +# ALL: Instruction Info: +# ALL-NEXT: [1]: #uOps +# ALL-NEXT: [2]: Latency +# ALL-NEXT: [3]: RThroughput +# ALL-NEXT: [4]: MayLoad +# ALL-NEXT: [5]: MayStore +# ALL-NEXT: [6]: HasSideEffects (U) + +# ALL: [1] [2] [3] [4] [5] [6] Instructions: + +# M3-NEXT: 5 
14 4.50 * st3 { v0.s, v1.s, v2.s }[0], [sp] +# M3-NEXT: 7 15 6.00 * st3 { v0.2s, v1.2s, v2.2s }, [sp] +# M3-NEXT: 7 15 6.00 * st3 { v0.d, v1.d, v2.d }[0], [sp] +# M3-NEXT: 9 16 7.50 * st3 { v0.2d, v1.2d, v2.2d }, [sp] +# M3-NEXT: 5 14 4.50 * st3 { v0.s, v1.s, v2.s }[0], [sp], #12 +# M3-NEXT: 7 15 6.00 * st3 { v0.2s, v1.2s, v2.2s }, [sp], #24 +# M3-NEXT: 7 15 6.00 * st3 { v0.d, v1.d, v2.d }[0], [sp], #24 +# M3-NEXT: 9 16 7.50 * st3 { v0.2d, v1.2d, v2.2d }, [sp], #48 +# M3-NEXT: 5 14 4.50 * st3 { v0.s, v1.s, v2.s }[0], [sp], x0 +# M3-NEXT: 7 15 6.00 * st3 { v0.2s, v1.2s, v2.2s }, [sp], x0 +# M3-NEXT: 7 15 6.00 * st3 { v0.d, v1.d, v2.d }[0], [sp], x0 +# M3-NEXT: 9 16 7.50 * st3 { v0.2d, v1.2d, v2.2d }, [sp], x0 + +# M4-NEXT: 2 2 1.00 * st3 { v0.s, v1.s, v2.s }[0], [sp] +# M4-NEXT: 4 4 2.00 * st3 { v0.2s, v1.2s, v2.2s }, [sp] +# M4-NEXT: 2 2 1.00 * st3 { v0.d, v1.d, v2.d }[0], [sp] +# M4-NEXT: 6 5 3.00 * st3 { v0.2d, v1.2d, v2.2d }, [sp] +# M4-NEXT: 3 2 1.00 * st3 { v0.s, v1.s, v2.s }[0], [sp], #12 +# M4-NEXT: 5 4 2.00 * st3 { v0.2s, v1.2s, v2.2s }, [sp], #24 +# M4-NEXT: 3 2 1.00 * st3 { v0.d, v1.d, v2.d }[0], [sp], #24 +# M4-NEXT: 7 5 3.00 * st3 { v0.2d, v1.2d, v2.2d }, [sp], #48 +# M4-NEXT: 3 2 1.00 * st3 { v0.s, v1.s, v2.s }[0], [sp], x0 +# M4-NEXT: 5 4 2.00 * st3 { v0.2s, v1.2s, v2.2s }, [sp], x0 +# M4-NEXT: 3 2 1.00 * st3 { v0.d, v1.d, v2.d }[0], [sp], x0 +# M4-NEXT: 7 5 3.00 * st3 { v0.2d, v1.2d, v2.2d }, [sp], x0 + +# M5-NEXT: 2 2 1.00 * st3 { v0.s, v1.s, v2.s }[0], [sp] +# M5-NEXT: 3 4 1.00 * st3 { v0.2s, v1.2s, v2.2s }, [sp] +# M5-NEXT: 2 2 1.00 * st3 { v0.d, v1.d, v2.d }[0], [sp] +# M5-NEXT: 5 4 1.50 * st3 { v0.2d, v1.2d, v2.2d }, [sp] +# M5-NEXT: 3 2 1.00 * st3 { v0.s, v1.s, v2.s }[0], [sp], #12 +# M5-NEXT: 4 4 1.00 * st3 { v0.2s, v1.2s, v2.2s }, [sp], #24 +# M5-NEXT: 3 2 1.00 * st3 { v0.d, v1.d, v2.d }[0], [sp], #24 +# M5-NEXT: 6 4 1.50 * st3 { v0.2d, v1.2d, v2.2d }, [sp], #48 +# M5-NEXT: 3 2 1.00 * st3 { v0.s, v1.s, v2.s }[0], [sp], x0 +# M5-NEXT: 4 4 1.00 
* st3 { v0.2s, v1.2s, v2.2s }, [sp], x0 +# M5-NEXT: 3 2 1.00 * st3 { v0.d, v1.d, v2.d }[0], [sp], x0 +# M5-NEXT: 6 4 1.50 * st3 { v0.2d, v1.2d, v2.2d }, [sp], x0 diff --git a/test/tools/llvm-mca/AArch64/Exynos/asimd-st4.s b/test/tools/llvm-mca/AArch64/Exynos/asimd-st4.s new file mode 100644 index 00000000000..7dfe59f78c4 --- /dev/null +++ b/test/tools/llvm-mca/AArch64/Exynos/asimd-st4.s @@ -0,0 +1,97 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m3 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M3 +# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m4 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M4 +# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m5 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M5 + +st4 {v0.s, v1.s, v2.s, v3.s}[0], [sp] +st4 {v0.2s, v1.2s, v2.2s, v3.2s}, [sp] + +st4 {v0.d, v1.d, v2.d, v3.d}[0], [sp] +st4 {v0.2d, v1.2d, v2.2d, v3.2d}, [sp] + +st4 {v0.s, v1.s, v2.s, v3.s}[0], [sp], #16 +st4 {v0.2s, v1.2s, v2.2s, v3.2s}, [sp], #32 + +st4 {v0.d, v1.d, v2.d, v3.d}[0], [sp], #32 +st4 {v0.2d, v1.2d, v2.2d, v3.2d}, [sp], #64 + +st4 {v0.s, v1.s, v2.s, v3.s}[0], [sp], x0 +st4 {v0.2s, v1.2s, v2.2s, v3.2s}, [sp], x0 + +st4 {v0.d, v1.d, v2.d, v3.d}[0], [sp], x0 +st4 {v0.2d, v1.2d, v2.2d, v3.2d}, [sp], x0 + +# ALL: Iterations: 100 +# ALL-NEXT: Instructions: 1200 + +# M3-NEXT: Total Cycles: 18603 +# M3-NEXT: Total uOps: 9000 + +# M4-NEXT: Total Cycles: 4803 +# M4-NEXT: Total uOps: 4700 + +# M5-NEXT: Total Cycles: 4803 +# M5-NEXT: Total uOps: 4700 + +# ALL: Dispatch Width: 6 + +# M3-NEXT: uOps Per Cycle: 0.48 +# M3-NEXT: IPC: 0.06 +# M3-NEXT: Block RThroughput: 76.5 + +# M4-NEXT: uOps Per Cycle: 0.98 +# M4-NEXT: IPC: 0.25 +# M4-NEXT: Block RThroughput: 24.0 + +# M5-NEXT: uOps Per Cycle: 0.98 +# M5-NEXT: IPC: 0.25 +# M5-NEXT: Block RThroughput: 24.0 + +# ALL: Instruction Info: +# ALL-NEXT: [1]: #uOps +# ALL-NEXT: [2]: Latency +# ALL-NEXT: 
[3]: RThroughput +# ALL-NEXT: [4]: MayLoad +# ALL-NEXT: [5]: MayStore +# ALL-NEXT: [6]: HasSideEffects (U) + +# ALL: [1] [2] [3] [4] [5] [6] Instructions: + +# M3-NEXT: 7 15 6.00 * st4 { v0.s, v1.s, v2.s, v3.s }[0], [sp] +# M3-NEXT: 7 15 6.00 * st4 { v0.2s, v1.2s, v2.2s, v3.2s }, [sp] +# M3-NEXT: 7 15 6.00 * st4 { v0.d, v1.d, v2.d, v3.d }[0], [sp] +# M3-NEXT: 9 17 7.50 * st4 { v0.2d, v1.2d, v2.2d, v3.2d }, [sp] +# M3-NEXT: 7 15 6.00 * st4 { v0.s, v1.s, v2.s, v3.s }[0], [sp], #16 +# M3-NEXT: 7 15 6.00 * st4 { v0.2s, v1.2s, v2.2s, v3.2s }, [sp], #32 +# M3-NEXT: 7 15 6.00 * st4 { v0.d, v1.d, v2.d, v3.d }[0], [sp], #32 +# M3-NEXT: 9 17 7.50 * st4 { v0.2d, v1.2d, v2.2d, v3.2d }, [sp], #64 +# M3-NEXT: 7 15 6.00 * st4 { v0.s, v1.s, v2.s, v3.s }[0], [sp], x0 +# M3-NEXT: 7 15 6.00 * st4 { v0.2s, v1.2s, v2.2s, v3.2s }, [sp], x0 +# M3-NEXT: 7 15 6.00 * st4 { v0.d, v1.d, v2.d, v3.d }[0], [sp], x0 +# M3-NEXT: 9 17 7.50 * st4 { v0.2d, v1.2d, v2.2d, v3.2d }, [sp], x0 + +# M4-NEXT: 2 2 1.00 * st4 { v0.s, v1.s, v2.s, v3.s }[0], [sp] +# M4-NEXT: 4 4 2.00 * st4 { v0.2s, v1.2s, v2.2s, v3.2s }, [sp] +# M4-NEXT: 2 2 1.00 * st4 { v0.d, v1.d, v2.d, v3.d }[0], [sp] +# M4-NEXT: 5 8 4.00 * st4 { v0.2d, v1.2d, v2.2d, v3.2d }, [sp] +# M4-NEXT: 3 2 1.00 * st4 { v0.s, v1.s, v2.s, v3.s }[0], [sp], #16 +# M4-NEXT: 5 4 2.00 * st4 { v0.2s, v1.2s, v2.2s, v3.2s }, [sp], #32 +# M4-NEXT: 3 2 1.00 * st4 { v0.d, v1.d, v2.d, v3.d }[0], [sp], #32 +# M4-NEXT: 6 8 4.00 * st4 { v0.2d, v1.2d, v2.2d, v3.2d }, [sp], #64 +# M4-NEXT: 3 2 1.00 * st4 { v0.s, v1.s, v2.s, v3.s }[0], [sp], x0 +# M4-NEXT: 5 4 2.00 * st4 { v0.2s, v1.2s, v2.2s, v3.2s }, [sp], x0 +# M4-NEXT: 3 2 1.00 * st4 { v0.d, v1.d, v2.d, v3.d }[0], [sp], x0 +# M4-NEXT: 6 8 4.00 * st4 { v0.2d, v1.2d, v2.2d, v3.2d }, [sp], x0 + +# M5-NEXT: 2 2 1.00 * st4 { v0.s, v1.s, v2.s, v3.s }[0], [sp] +# M5-NEXT: 4 4 2.00 * st4 { v0.2s, v1.2s, v2.2s, v3.2s }, [sp] +# M5-NEXT: 2 2 1.00 * st4 { v0.d, v1.d, v2.d, v3.d }[0], [sp] +# M5-NEXT: 5 8 4.00 * st4 { v0.2d, 
v1.2d, v2.2d, v3.2d }, [sp] +# M5-NEXT: 3 2 1.00 * st4 { v0.s, v1.s, v2.s, v3.s }[0], [sp], #16 +# M5-NEXT: 5 4 2.00 * st4 { v0.2s, v1.2s, v2.2s, v3.2s }, [sp], #32 +# M5-NEXT: 3 2 1.00 * st4 { v0.d, v1.d, v2.d, v3.d }[0], [sp], #32 +# M5-NEXT: 6 8 4.00 * st4 { v0.2d, v1.2d, v2.2d, v3.2d }, [sp], #64 +# M5-NEXT: 3 2 1.00 * st4 { v0.s, v1.s, v2.s, v3.s }[0], [sp], x0 +# M5-NEXT: 5 4 2.00 * st4 { v0.2s, v1.2s, v2.2s, v3.2s }, [sp], x0 +# M5-NEXT: 3 2 1.00 * st4 { v0.d, v1.d, v2.d, v3.d }[0], [sp], x0 +# M5-NEXT: 6 8 4.00 * st4 { v0.2d, v1.2d, v2.2d, v3.2d }, [sp], x0 diff --git a/test/tools/llvm-mca/AArch64/Exynos/crc.s b/test/tools/llvm-mca/AArch64/Exynos/crc.s new file mode 100644 index 00000000000..27aa0075c57 --- /dev/null +++ b/test/tools/llvm-mca/AArch64/Exynos/crc.s @@ -0,0 +1,58 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m3 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M3 +# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m4 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M4 +# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m5 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M5 + +crc32w w0, w1, w2 +crc32w w0, w0, w3 + +crc32cx w0, w1, x2 +crc32cx w0, w0, x3 + +# ALL: Iterations: 100 +# ALL-NEXT: Instructions: 400 + +# M3-NEXT: Total Cycles: 204 +# M4-NEXT: Total Cycles: 404 +# M5-NEXT: Total Cycles: 204 + +# ALL-NEXT: Total uOps: 400 + +# ALL: Dispatch Width: 6 + +# M3-NEXT: uOps Per Cycle: 1.96 +# M3-NEXT: IPC: 1.96 +# M3-NEXT: Block RThroughput: 2.0 + +# M4-NEXT: uOps Per Cycle: 0.99 +# M4-NEXT: IPC: 0.99 +# M4-NEXT: Block RThroughput: 4.0 + +# M5-NEXT: uOps Per Cycle: 1.96 +# M5-NEXT: IPC: 1.96 +# M5-NEXT: Block RThroughput: 2.0 + +# ALL: Instruction Info: +# ALL-NEXT: [1]: #uOps +# ALL-NEXT: [2]: Latency +# ALL-NEXT: [3]: RThroughput +# ALL-NEXT: [4]: MayLoad +# ALL-NEXT: [5]: MayStore +# ALL-NEXT: [6]: HasSideEffects (U) + +# ALL: 
[1] [2] [3] [4] [5] [6] Instructions: + +# M3-NEXT: 1 2 0.50 crc32w w0, w1, w2 +# M3-NEXT: 1 2 0.50 crc32w w0, w0, w3 +# M3-NEXT: 1 2 0.50 crc32cx w0, w1, x2 +# M3-NEXT: 1 2 0.50 crc32cx w0, w0, x3 + +# M4-NEXT: 1 2 1.00 crc32w w0, w1, w2 +# M4-NEXT: 1 2 1.00 crc32w w0, w0, w3 +# M4-NEXT: 1 2 1.00 crc32cx w0, w1, x2 +# M4-NEXT: 1 2 1.00 crc32cx w0, w0, x3 + +# M5-NEXT: 1 2 0.50 crc32w w0, w1, w2 +# M5-NEXT: 1 2 0.50 crc32w w0, w0, w3 +# M5-NEXT: 1 2 0.50 crc32cx w0, w1, x2 +# M5-NEXT: 1 2 0.50 crc32cx w0, w0, x3 diff --git a/test/tools/llvm-mca/AArch64/Exynos/direct-branch.s b/test/tools/llvm-mca/AArch64/Exynos/direct-branch.s index 0819170c68b..79f810c95f0 100644 --- a/test/tools/llvm-mca/AArch64/Exynos/direct-branch.s +++ b/test/tools/llvm-mca/AArch64/Exynos/direct-branch.s @@ -1,6 +1,7 @@ # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py # RUN: llvm-mca -mtriple=aarch64-linux-gnu -mcpu=exynos-m3 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M3 # RUN: llvm-mca -mtriple=aarch64-linux-gnu -mcpu=exynos-m4 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M4 +# RUN: llvm-mca -mtriple=aarch64-linux-gnu -mcpu=exynos-m5 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M5 b main @@ -9,6 +10,7 @@ # M3-NEXT: Total Cycles: 18 # M4-NEXT: Total Cycles: 18 +# M5-NEXT: Total Cycles: 18 # ALL-NEXT: Total uOps: 100 @@ -22,6 +24,11 @@ # M4-NEXT: IPC: 5.56 # M4-NEXT: Block RThroughput: 0.2 +# M5: Dispatch Width: 6 +# M5-NEXT: uOps Per Cycle: 5.56 +# M5-NEXT: IPC: 5.56 +# M5-NEXT: Block RThroughput: 0.2 + # ALL: Instruction Info: # ALL-NEXT: [1]: #uOps # ALL-NEXT: [2]: Latency @@ -34,3 +41,4 @@ # M3-NEXT: 1 0 0.17 b main # M4-NEXT: 1 0 0.17 b main +# M5-NEXT: 1 0 0.17 b main diff --git a/test/tools/llvm-mca/AArch64/Exynos/divide-multiply.s b/test/tools/llvm-mca/AArch64/Exynos/divide-multiply.s new file mode 100644 index 00000000000..c74d1923c4a --- /dev/null +++ 
b/test/tools/llvm-mca/AArch64/Exynos/divide-multiply.s @@ -0,0 +1,67 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m3 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,EM3 +# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m4 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,EM4 +# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m5 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,EM5 + +sdiv w0, w1, w2 +udiv x1, x2, x3 + +mul w2, w3, w4 +msub x3, x4, x5, x6 + +smull x4, w5, w6 +umulh x5, x6, x7 + +# ALL: Iterations: 100 +# ALL-NEXT: Instructions: 600 + +# EM3-NEXT: Total Cycles: 3305 +# EM4-NEXT: Total Cycles: 3303 +# EM5-NEXT: Total Cycles: 2603 + +# ALL-NEXT: Total uOps: 600 + +# ALL: Dispatch Width: 6 + +# EM3-NEXT: uOps Per Cycle: 0.18 +# EM3-NEXT: IPC: 0.18 +# EM3-NEXT: Block RThroughput: 33.0 + +# EM4-NEXT: uOps Per Cycle: 0.18 +# EM4-NEXT: IPC: 0.18 +# EM4-NEXT: Block RThroughput: 33.0 + +# EM5-NEXT: uOps Per Cycle: 0.23 +# EM5-NEXT: IPC: 0.23 +# EM5-NEXT: Block RThroughput: 26.0 + +# ALL: Instruction Info: +# ALL-NEXT: [1]: #uOps +# ALL-NEXT: [2]: Latency +# ALL-NEXT: [3]: RThroughput +# ALL-NEXT: [4]: MayLoad +# ALL-NEXT: [5]: MayStore +# ALL-NEXT: [6]: HasSideEffects (U) + +# ALL: [1] [2] [3] [4] [5] [6] Instructions: + +# EM3-NEXT: 1 12 12.00 sdiv w0, w1, w2 +# EM3-NEXT: 1 21 21.00 udiv x1, x2, x3 +# EM3-NEXT: 1 3 0.50 mul w2, w3, w4 +# EM3-NEXT: 1 4 1.00 msub x3, x4, x5, x6 +# EM3-NEXT: 1 3 0.50 smull x4, w5, w6 +# EM3-NEXT: 1 4 1.00 umulh x5, x6, x7 + +# EM4-NEXT: 1 12 12.00 sdiv w0, w1, w2 +# EM4-NEXT: 1 21 21.00 udiv x1, x2, x3 +# EM4-NEXT: 1 3 0.50 mul w2, w3, w4 +# EM4-NEXT: 1 4 1.00 msub x3, x4, x5, x6 +# EM4-NEXT: 1 3 0.50 smull x4, w5, w6 +# EM4-NEXT: 1 4 1.00 umulh x5, x6, x7 + +# EM5-NEXT: 1 10 10.00 sdiv w0, w1, w2 +# EM5-NEXT: 1 16 16.00 udiv x1, x2, x3 +# EM5-NEXT: 1 2 0.50 mul w2, w3, w4 +# EM5-NEXT: 1 3 1.00 msub x3, x4, x5, 
x6 +# EM5-NEXT: 1 2 0.50 smull x4, w5, w6 +# EM5-NEXT: 1 3 1.00 umulh x5, x6, x7 diff --git a/test/tools/llvm-mca/AArch64/Exynos/double-recp.s b/test/tools/llvm-mca/AArch64/Exynos/double-recp.s new file mode 100644 index 00000000000..872f6ab7948 --- /dev/null +++ b/test/tools/llvm-mca/AArch64/Exynos/double-recp.s @@ -0,0 +1,66 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m3 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M3 +# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m4 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M4 +# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m5 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M5 + +fmov d31, #1.00000000 +fdiv d30, d31, d30 + +# Newton series for 1 / x. +frecpe d1, d0 +frecps d2, d0, d1 +fmul d1, d1, d2 +frecps d2, d0, d1 +fmul d1, d1, d2 +frecps d0, d0, d1 +fmul d0, d1, d0 + +# ALL: Iterations: 100 +# ALL-NEXT: Instructions: 900 + +# M3-NEXT: Total Cycles: 2503 +# M4-NEXT: Total Cycles: 2403 +# M5-NEXT: Total Cycles: 2403 + +# ALL-NEXT: Total uOps: 900 + +# ALL: Dispatch Width: 6 + +# M3-NEXT: uOps Per Cycle: 0.36 +# M3-NEXT: IPC: 0.36 +# M3-NEXT: Block RThroughput: 3.3 + +# M4-NEXT: uOps Per Cycle: 0.37 +# M4-NEXT: IPC: 0.37 +# M4-NEXT: Block RThroughput: 2.3 + +# M5-NEXT: uOps Per Cycle: 0.37 +# M5-NEXT: IPC: 0.37 +# M5-NEXT: Block RThroughput: 2.3 + +# ALL: Instruction Info: +# ALL-NEXT: [1]: #uOps +# ALL-NEXT: [2]: Latency +# ALL-NEXT: [3]: RThroughput +# ALL-NEXT: [4]: MayLoad +# ALL-NEXT: [5]: MayStore +# ALL-NEXT: [6]: HasSideEffects (U) + +# ALL: [1] [2] [3] [4] [5] [6] Instructions: +# ALL-NEXT: 1 1 0.33 fmov d31, #1.00000000 + +# M3-NEXT: 1 12 3.25 fdiv d30, d31, d30 +# M3-NEXT: 1 4 0.50 frecpe d1, d0 + +# M4-NEXT: 1 12 2.25 fdiv d30, d31, d30 +# M4-NEXT: 1 3 0.50 frecpe d1, d0 + +# M5-NEXT: 1 12 2.25 fdiv d30, d31, d30 +# M5-NEXT: 1 3 0.50 frecpe d1, d0 + +# ALL-NEXT: 1 4 0.33 frecps 
d2, d0, d1 +# ALL-NEXT: 1 3 0.33 fmul d1, d1, d2 +# ALL-NEXT: 1 4 0.33 frecps d2, d0, d1 +# ALL-NEXT: 1 3 0.33 fmul d1, d1, d2 +# ALL-NEXT: 1 4 0.33 frecps d0, d0, d1 +# ALL-NEXT: 1 3 0.33 fmul d0, d1, d0 diff --git a/test/tools/llvm-mca/AArch64/Exynos/double-rsqrt.s b/test/tools/llvm-mca/AArch64/Exynos/double-rsqrt.s new file mode 100644 index 00000000000..98fa404bb94 --- /dev/null +++ b/test/tools/llvm-mca/AArch64/Exynos/double-rsqrt.s @@ -0,0 +1,78 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m3 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M3 +# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m4 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M4 +# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m5 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M5 + +fsqrt d30, d30 +fmov d31, #1.00000000 +fdiv d30, d31, d30 + +# Newton series for 1 / sqrt(). +frsqrte d1, d0 +fmul d2, d1, d1 +frsqrts d2, d0, d2 +fmul d1, d1, d2 +fmul d2, d1, d1 +frsqrts d2, d0, d2 +fmul d1, d1, d2 +fmul d2, d1, d1 +frsqrts d0, d0, d2 +fmul d0, d1, d0 + +# ALL: Iterations: 100 +# ALL-NEXT: Instructions: 1300 + +# M3-NEXT: Total Cycles: 3703 +# M4-NEXT: Total Cycles: 3303 +# M5-NEXT: Total Cycles: 3303 + +# ALL-NEXT: Total uOps: 1300 + +# ALL: Dispatch Width: 6 + +# M3-NEXT: uOps Per Cycle: 0.35 +# M3-NEXT: IPC: 0.35 +# M3-NEXT: Block RThroughput: 26.0 + +# M4-NEXT: uOps Per Cycle: 0.39 +# M4-NEXT: IPC: 0.39 +# M4-NEXT: Block RThroughput: 3.0 + +# M5-NEXT: uOps Per Cycle: 0.39 +# M5-NEXT: IPC: 0.39 +# M5-NEXT: Block RThroughput: 3.0 + +# ALL: Instruction Info: +# ALL-NEXT: [1]: #uOps +# ALL-NEXT: [2]: Latency +# ALL-NEXT: [3]: RThroughput +# ALL-NEXT: [4]: MayLoad +# ALL-NEXT: [5]: MayStore +# ALL-NEXT: [6]: HasSideEffects (U) + +# ALL: [1] [2] [3] [4] [5] [6] Instructions: + +# M3-NEXT: 1 25 26.00 fsqrt d30, d30 +# M4-NEXT: 1 12 2.25 fsqrt d30, d30 +# M5-NEXT: 1 12 2.25 
fsqrt d30, d30 + +# ALL-NEXT: 1 1 0.33 fmov d31, #1.00000000 + +# M3-NEXT: 1 12 3.25 fdiv d30, d31, d30 +# M3-NEXT: 1 4 0.50 frsqrte d1, d0 + +# M4-NEXT: 1 12 2.25 fdiv d30, d31, d30 +# M4-NEXT: 1 3 0.50 frsqrte d1, d0 + +# M5-NEXT: 1 12 2.25 fdiv d30, d31, d30 +# M5-NEXT: 1 3 0.50 frsqrte d1, d0 + +# ALL-NEXT: 1 3 0.33 fmul d2, d1, d1 +# ALL-NEXT: 1 4 0.33 frsqrts d2, d0, d2 +# ALL-NEXT: 1 3 0.33 fmul d1, d1, d2 +# ALL-NEXT: 1 3 0.33 fmul d2, d1, d1 +# ALL-NEXT: 1 4 0.33 frsqrts d2, d0, d2 +# ALL-NEXT: 1 3 0.33 fmul d1, d1, d2 +# ALL-NEXT: 1 3 0.33 fmul d2, d1, d1 +# ALL-NEXT: 1 4 0.33 frsqrts d0, d0, d2 +# ALL-NEXT: 1 3 0.33 fmul d0, d1, d0 diff --git a/test/tools/llvm-mca/AArch64/Exynos/double-sqrt.s b/test/tools/llvm-mca/AArch64/Exynos/double-sqrt.s new file mode 100644 index 00000000000..b9aceff3913 --- /dev/null +++ b/test/tools/llvm-mca/AArch64/Exynos/double-sqrt.s @@ -0,0 +1,79 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m3 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M3 +# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m4 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M4 +# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m5 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M5 + +fsqrt d31, d31 + +# Newton series for sqrt(). 
+frsqrte d1, d0 +fmul d2, d1, d1 +frsqrts d2, d0, d2 +fmul d1, d1, d2 +fmul d2, d1, d1 +frsqrts d2, d0, d2 +fmul d1, d1, d2 +fmul d2, d1, d1 +frsqrts d2, d0, d2 +fmul d2, d2, d0 +fmul d1, d1, d2 +fcmp d0, #0.0 +fcsel d0, d0, d1, eq + +# ALL: Iterations: 100 +# ALL-NEXT: Instructions: 1400 + +# M3-NEXT: Total Cycles: 4203 +# M4-NEXT: Total Cycles: 4103 +# M5-NEXT: Total Cycles: 3803 + +# ALL-NEXT: Total uOps: 1500 + +# ALL: Dispatch Width: 6 + +# M3-NEXT: uOps Per Cycle: 0.36 +# M3-NEXT: IPC: 0.33 +# M3-NEXT: Block RThroughput: 27.0 + +# M4-NEXT: uOps Per Cycle: 0.37 +# M4-NEXT: IPC: 0.34 +# M4-NEXT: Block RThroughput: 3.3 + +# M5-NEXT: uOps Per Cycle: 0.39 +# M5-NEXT: IPC: 0.37 +# M5-NEXT: Block RThroughput: 3.3 + +# ALL: Instruction Info: +# ALL-NEXT: [1]: #uOps +# ALL-NEXT: [2]: Latency +# ALL-NEXT: [3]: RThroughput +# ALL-NEXT: [4]: MayLoad +# ALL-NEXT: [5]: MayStore +# ALL-NEXT: [6]: HasSideEffects (U) + +# ALL: [1] [2] [3] [4] [5] [6] Instructions: + +# M3-NEXT: 1 25 26.00 fsqrt d31, d31 +# M3-NEXT: 1 4 0.50 frsqrte d1, d0 + +# M4-NEXT: 1 12 2.25 fsqrt d31, d31 +# M4-NEXT: 1 3 0.50 frsqrte d1, d0 + +# M5-NEXT: 1 12 2.25 fsqrt d31, d31 +# M5-NEXT: 1 3 0.50 frsqrte d1, d0 + +# ALL-NEXT: 1 3 0.33 fmul d2, d1, d1 +# ALL-NEXT: 1 4 0.33 frsqrts d2, d0, d2 +# ALL-NEXT: 1 3 0.33 fmul d1, d1, d2 +# ALL-NEXT: 1 3 0.33 fmul d2, d1, d1 +# ALL-NEXT: 1 4 0.33 frsqrts d2, d0, d2 +# ALL-NEXT: 1 3 0.33 fmul d1, d1, d2 +# ALL-NEXT: 1 3 0.33 fmul d2, d1, d1 +# ALL-NEXT: 1 4 0.33 frsqrts d2, d0, d2 +# ALL-NEXT: 1 3 0.33 fmul d2, d2, d0 +# ALL-NEXT: 1 3 0.33 fmul d1, d1, d2 +# ALL-NEXT: 1 2 1.00 fcmp d0, #0.0 + +# M3-NEXT: 2 5 1.00 fcsel d0, d0, d1, eq +# M4-NEXT: 2 5 1.00 fcsel d0, d0, d1, eq +# M5-NEXT: 2 2 1.00 fcsel d0, d0, d1, eq diff --git a/test/tools/llvm-mca/AArch64/Exynos/extended-register.s b/test/tools/llvm-mca/AArch64/Exynos/extended-register.s index aa14531577a..03522cd96e7 100644 --- a/test/tools/llvm-mca/AArch64/Exynos/extended-register.s +++ 
b/test/tools/llvm-mca/AArch64/Exynos/extended-register.s @@ -1,6 +1,7 @@ # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py # RUN: llvm-mca -march=aarch64 -mcpu=exynos-m3 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,EM3 # RUN: llvm-mca -march=aarch64 -mcpu=exynos-m4 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,EM4 +# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m5 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,EM5 sub w0, w1, w2, sxtb #0 add x3, x4, w5, sxth #1 @@ -16,6 +17,7 @@ # EM3-NEXT: Total Cycles: 304 # EM4-NEXT: Total Cycles: 304 +# EM5-NEXT: Total Cycles: 254 # ALL-NEXT: Total uOps: 800 @@ -29,6 +31,11 @@ # EM4-NEXT: IPC: 2.63 # EM4-NEXT: Block RThroughput: 3.0 +# EM5: Dispatch Width: 6 +# EM5-NEXT: uOps Per Cycle: 3.15 +# EM5-NEXT: IPC: 3.15 +# EM5-NEXT: Block RThroughput: 2.5 + # ALL: Instruction Info: # ALL-NEXT: [1]: #uOps # ALL-NEXT: [2]: Latency @@ -56,3 +63,12 @@ # EM4-NEXT: 1 2 0.50 add x15, x16, w17, uxth #1 # EM4-NEXT: 1 2 0.50 subs x18, x19, w20, sxtw #2 # EM4-NEXT: 1 2 0.50 adds x21, x22, x23, sxtx #3 + +# EM5-NEXT: 1 1 0.17 sub w0, w1, w2, sxtb +# EM5-NEXT: 1 2 0.50 add x3, x4, w5, sxth #1 +# EM5-NEXT: 1 1 0.25 subs x6, x7, w8, uxtw #2 +# EM5-NEXT: 1 1 0.25 adds x9, x10, x11, uxtx #3 +# EM5-NEXT: 1 1 0.17 sub w12, w13, w14, uxtb +# EM5-NEXT: 1 2 0.50 add x15, x16, w17, uxth #1 +# EM5-NEXT: 1 2 0.50 subs x18, x19, w20, sxtw #2 +# EM5-NEXT: 1 2 0.50 adds x21, x22, x23, sxtx #3 diff --git a/test/tools/llvm-mca/AArch64/Exynos/float-divide-multiply.s b/test/tools/llvm-mca/AArch64/Exynos/float-divide-multiply.s new file mode 100644 index 00000000000..a24d8a27960 --- /dev/null +++ b/test/tools/llvm-mca/AArch64/Exynos/float-divide-multiply.s @@ -0,0 +1,94 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m3 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,EM3 +# RUN: llvm-mca 
-march=aarch64 -mcpu=exynos-m4 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,EM4 +# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m5 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,EM5 + +fdiv h0, h1, h2 +fdiv s1, s2, s3 +fdiv d2, d3, d4 + +fmul h3, h4, h5 +fmul s4, s5, s6 +fmul d5, d6, d7 + +fmadd h6, h7, h8, h9 +fmadd s7, s8, s9, s10 +fmadd d8, d9, d10, d11 + +fsqrt h9, h10 +fsqrt s10, s11 +fsqrt d11, d12 + +# ALL: Iterations: 100 + +# EM3-NEXT: Instructions: 800 +# EM3-NEXT: Total Cycles: 4503 +# EM3-NEXT: Total uOps: 800 + +# EM4-NEXT: Instructions: 1200 +# EM4-NEXT: Total Cycles: 575 +# EM4-NEXT: Total uOps: 1200 + +# EM5-NEXT: Instructions: 1200 +# EM5-NEXT: Total Cycles: 433 +# EM5-NEXT: Total uOps: 1200 + +# ALL: Dispatch Width: 6 + +# EM3-NEXT: uOps Per Cycle: 0.18 +# EM3-NEXT: IPC: 0.18 +# EM3-NEXT: Block RThroughput: 45.0 + +# EM4-NEXT: uOps Per Cycle: 2.09 +# EM4-NEXT: IPC: 2.09 +# EM4-NEXT: Block RThroughput: 4.0 + +# EM5-NEXT: uOps Per Cycle: 2.77 +# EM5-NEXT: IPC: 2.77 +# EM5-NEXT: Block RThroughput: 4.0 + +# ALL: Instruction Info: +# ALL-NEXT: [1]: #uOps +# ALL-NEXT: [2]: Latency +# ALL-NEXT: [3]: RThroughput +# ALL-NEXT: [4]: MayLoad +# ALL-NEXT: [5]: MayStore +# ALL-NEXT: [6]: HasSideEffects (U) + +# EM3: [1] [2] [3] [4] [5] [6] Instructions: +# EM3-NEXT: 1 7 2.00 fdiv s1, s2, s3 +# EM3-NEXT: 1 12 3.25 fdiv d2, d3, d4 +# EM3-NEXT: 1 3 0.33 fmul s4, s5, s6 +# EM3-NEXT: 1 3 0.33 fmul d5, d6, d7 +# EM3-NEXT: 1 4 0.33 fmadd s7, s8, s9, s10 +# EM3-NEXT: 1 4 0.33 fmadd d8, d9, d10, d11 +# EM3-NEXT: 1 18 19.00 fsqrt s10, s11 +# EM3-NEXT: 1 25 26.00 fsqrt d11, d12 + +# EM4: [1] [2] [3] [4] [5] [6] Instructions: +# EM4-NEXT: 1 7 3.00 fdiv h0, h1, h2 +# EM4-NEXT: 1 7 1.50 fdiv s1, s2, s3 +# EM4-NEXT: 1 12 2.25 fdiv d2, d3, d4 +# EM4-NEXT: 1 3 0.50 fmul h3, h4, h5 +# EM4-NEXT: 1 3 0.33 fmul s4, s5, s6 +# EM4-NEXT: 1 3 0.33 fmul d5, d6, d7 +# EM4-NEXT: 1 4 0.50 fmadd h6, h7, h8, h9 +# EM4-NEXT: 1 4 0.33 fmadd s7, s8, s9, s10 +# 
EM4-NEXT: 1 4 0.33 fmadd d8, d9, d10, d11 +# EM4-NEXT: 1 7 3.00 fsqrt h9, h10 +# EM4-NEXT: 1 8 1.75 fsqrt s10, s11 +# EM4-NEXT: 1 12 2.25 fsqrt d11, d12 + +# EM5: [1] [2] [3] [4] [5] [6] Instructions: +# EM5-NEXT: 1 5 0.50 fdiv h0, h1, h2 +# EM5-NEXT: 1 7 1.00 fdiv s1, s2, s3 +# EM5-NEXT: 1 12 2.25 fdiv d2, d3, d4 +# EM5-NEXT: 1 3 0.33 fmul h3, h4, h5 +# EM5-NEXT: 1 3 0.33 fmul s4, s5, s6 +# EM5-NEXT: 1 3 0.33 fmul d5, d6, d7 +# EM5-NEXT: 1 4 0.33 fmadd h6, h7, h8, h9 +# EM5-NEXT: 1 4 0.33 fmadd s7, s8, s9, s10 +# EM5-NEXT: 1 4 0.33 fmadd d8, d9, d10, d11 +# EM5-NEXT: 1 5 0.50 fsqrt h9, h10 +# EM5-NEXT: 1 8 1.25 fsqrt s10, s11 +# EM5-NEXT: 1 12 2.25 fsqrt d11, d12 diff --git a/test/tools/llvm-mca/AArch64/Exynos/float-integer.s b/test/tools/llvm-mca/AArch64/Exynos/float-integer.s new file mode 100644 index 00000000000..65aed321dd7 --- /dev/null +++ b/test/tools/llvm-mca/AArch64/Exynos/float-integer.s @@ -0,0 +1,114 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m3 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,EM3 +# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m4 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,EM4 +# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m5 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,EM5 + +scvtf h0, w0 +scvtf s1, w1 +scvtf d2, x2 + +fcvtzs w3, h3 +fcvtzs w4, s4 +fcvtzs x5, d5 + +fmov h6, #2.0 +fmov s7, #4.0 +fmov d8, #8.0 + +fmov h9, w9 +fmov s10, w10 +fmov d11, x11 +fmov v12.d[1], x12 + +fmov w13, h13 +fmov w14, s14 +fmov x15, d15 +fmov x16, v16.d[1] + +# ALL: Iterations: 100 + +# EM3-NEXT: Instructions: 1200 +# EM3-NEXT: Total Cycles: 405 +# EM3-NEXT: Total uOps: 1400 + +# EM4-NEXT: Instructions: 1700 +# EM4-NEXT: Total Cycles: 1108 +# EM4-NEXT: Total uOps: 1900 + +# EM5-NEXT: Instructions: 1700 +# EM5-NEXT: Total Cycles: 1407 +# EM5-NEXT: Total uOps: 1900 + +# ALL: Dispatch Width: 6 + +# EM3-NEXT: uOps Per 
Cycle: 3.46 +# EM3-NEXT: IPC: 2.96 +# EM3-NEXT: Block RThroughput: 4.0 + +# EM4-NEXT: uOps Per Cycle: 1.71 +# EM4-NEXT: IPC: 1.53 +# EM4-NEXT: Block RThroughput: 11.0 + +# EM5-NEXT: uOps Per Cycle: 1.35 +# EM5-NEXT: IPC: 1.21 +# EM5-NEXT: Block RThroughput: 14.0 + +# ALL: Instruction Info: +# ALL-NEXT: [1]: #uOps +# ALL-NEXT: [2]: Latency +# ALL-NEXT: [3]: RThroughput +# ALL-NEXT: [4]: MayLoad +# ALL-NEXT: [5]: MayStore +# ALL-NEXT: [6]: HasSideEffects (U) + +# EM3: [1] [2] [3] [4] [5] [6] Instructions: +# EM3-NEXT: 1 4 1.00 scvtf s1, w1 +# EM3-NEXT: 1 4 1.00 scvtf d2, x2 +# EM3-NEXT: 1 3 1.00 fcvtzs w4, s4 +# EM3-NEXT: 1 3 1.00 fcvtzs x5, d5 +# EM3-NEXT: 1 1 0.33 fmov s7, #4.00000000 +# EM3-NEXT: 1 1 0.33 fmov d8, #8.00000000 +# EM3-NEXT: 1 1 0.33 fmov s10, w10 +# EM3-NEXT: 1 1 0.33 fmov d11, x11 +# EM3-NEXT: 2 5 1.00 fmov v12.d[1], x12 +# EM3-NEXT: 1 1 0.33 fmov w14, s14 +# EM3-NEXT: 1 1 0.33 fmov x15, d15 +# EM3-NEXT: 2 5 1.00 fmov x16, v16.d[1] + +# EM4: [1] [2] [3] [4] [5] [6] Instructions: +# EM4-NEXT: 1 6 1.00 scvtf h0, w0 +# EM4-NEXT: 1 6 1.00 scvtf s1, w1 +# EM4-NEXT: 1 6 1.00 scvtf d2, x2 +# EM4-NEXT: 1 4 1.00 fcvtzs w3, h3 +# EM4-NEXT: 1 4 1.00 fcvtzs w4, s4 +# EM4-NEXT: 1 4 1.00 fcvtzs x5, d5 +# EM4-NEXT: 1 1 0.33 fmov h6, #2.00000000 +# EM4-NEXT: 1 1 0.33 fmov s7, #4.00000000 +# EM4-NEXT: 1 1 0.33 fmov d8, #8.00000000 +# EM4-NEXT: 1 3 1.00 fmov h9, w9 +# EM4-NEXT: 1 3 1.00 fmov s10, w10 +# EM4-NEXT: 1 3 1.00 fmov d11, x11 +# EM4-NEXT: 2 2 1.00 fmov v12.d[1], x12 +# EM4-NEXT: 1 4 1.00 fmov w13, h13 +# EM4-NEXT: 1 4 1.00 fmov w14, s14 +# EM4-NEXT: 1 4 1.00 fmov x15, d15 +# EM4-NEXT: 2 5 1.00 fmov x16, v16.d[1] + +# EM5: [1] [2] [3] [4] [5] [6] Instructions: +# EM5-NEXT: 1 6 1.00 scvtf h0, w0 +# EM5-NEXT: 1 6 1.00 scvtf s1, w1 +# EM5-NEXT: 1 6 1.00 scvtf d2, x2 +# EM5-NEXT: 1 4 1.00 fcvtzs w3, h3 +# EM5-NEXT: 1 4 1.00 fcvtzs w4, s4 +# EM5-NEXT: 1 4 1.00 fcvtzs x5, d5 +# EM5-NEXT: 1 1 0.33 fmov h6, #2.00000000 +# EM5-NEXT: 1 1 0.33 fmov s7, #4.00000000 +# 
EM5-NEXT: 1 1 0.33 fmov d8, #8.00000000 +# EM5-NEXT: 1 4 1.00 fmov h9, w9 +# EM5-NEXT: 1 4 1.00 fmov s10, w10 +# EM5-NEXT: 1 4 1.00 fmov d11, x11 +# EM5-NEXT: 2 6 1.00 fmov v12.d[1], x12 +# EM5-NEXT: 1 3 1.00 fmov w13, h13 +# EM5-NEXT: 1 3 1.00 fmov w14, s14 +# EM5-NEXT: 1 3 1.00 fmov x15, d15 +# EM5-NEXT: 2 5 1.00 fmov x16, v16.d[1] diff --git a/test/tools/llvm-mca/AArch64/Exynos/float-load.s b/test/tools/llvm-mca/AArch64/Exynos/float-load.s new file mode 100644 index 00000000000..18dcf5ebe87 --- /dev/null +++ b/test/tools/llvm-mca/AArch64/Exynos/float-load.s @@ -0,0 +1,153 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m3 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M3 +# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m4 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M4 +# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m5 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M5 + +ldr s0, 1f +ldr q0, 1f + +ldur d0, [sp, #2] +ldur q0, [sp, #16] + +ldr b0, [sp], #1 +ldr q0, [sp], #16 + +ldr h0, [sp, #2]! +ldr q0, [sp, #16]! + +ldr s0, [sp, #4] +ldr q0, [sp, #16] + +ldr d0, [sp, x0, lsl #3] +ldr q0, [sp, x0, lsl #4] + +ldr b0, [sp, x0] +ldr q0, [sp, x0] + +ldr h0, [sp, w0, sxtw #1] +ldr q0, [sp, w0, uxtw #4] + +ldr s0, [sp, w0, sxtw] +ldr q0, [sp, w0, uxtw] + +ldp d0, d1, [sp], #16 +ldp q0, q1, [sp], #32 + +ldp s0, s1, [sp, #8]! +ldp q0, q1, [sp, #32]! 
+ +ldp d0, d1, [sp, #16] +ldp q0, q1, [sp, #32] + +1: + +# ALL: Iterations: 100 +# ALL-NEXT: Instructions: 2400 + +# M3-NEXT: Total Cycles: 4708 +# M3-NEXT: Total uOps: 3200 + +# M4-NEXT: Total Cycles: 4708 +# M4-NEXT: Total uOps: 3200 + +# M5-NEXT: Total Cycles: 5509 +# M5-NEXT: Total uOps: 3300 + +# ALL: Dispatch Width: 6 + +# M3-NEXT: uOps Per Cycle: 0.68 +# M3-NEXT: IPC: 0.51 +# M3-NEXT: Block RThroughput: 13.5 + +# M4-NEXT: uOps Per Cycle: 0.68 +# M4-NEXT: IPC: 0.51 +# M4-NEXT: Block RThroughput: 13.0 + +# M5-NEXT: uOps Per Cycle: 0.60 +# M5-NEXT: IPC: 0.44 +# M5-NEXT: Block RThroughput: 13.5 + +# ALL: Instruction Info: +# ALL-NEXT: [1]: #uOps +# ALL-NEXT: [2]: Latency +# ALL-NEXT: [3]: RThroughput +# ALL-NEXT: [4]: MayLoad +# ALL-NEXT: [5]: MayStore +# ALL-NEXT: [6]: HasSideEffects (U) + +# ALL: [1] [2] [3] [4] [5] [6] Instructions: + +# M3-NEXT: 1 5 0.50 * ldr s0, {{\.?}}Ltmp0 +# M3-NEXT: 1 5 0.50 * ldr q0, {{\.?}}Ltmp0 +# M3-NEXT: 1 5 0.50 * ldur d0, [sp, #2] +# M3-NEXT: 1 5 0.50 * ldur q0, [sp, #16] +# M3-NEXT: 1 5 0.50 * ldr b0, [sp], #1 +# M3-NEXT: 1 5 0.50 * ldr q0, [sp], #16 +# M3-NEXT: 1 5 0.50 * ldr h0, [sp, #2]! +# M3-NEXT: 1 5 0.50 * ldr q0, [sp, #16]! +# M3-NEXT: 1 5 0.50 * ldr s0, [sp, #4] +# M3-NEXT: 1 5 0.50 * ldr q0, [sp, #16] +# M3-NEXT: 1 5 0.50 * ldr d0, [sp, x0, lsl #3] +# M3-NEXT: 2 6 0.50 * ldr q0, [sp, x0, lsl #4] +# M3-NEXT: 1 5 0.50 * ldr b0, [sp, x0] +# M3-NEXT: 1 5 0.50 * ldr q0, [sp, x0] +# M3-NEXT: 2 6 0.50 * ldr h0, [sp, w0, sxtw #1] +# M3-NEXT: 2 6 0.50 * ldr q0, [sp, w0, uxtw #4] +# M3-NEXT: 2 6 0.50 * ldr s0, [sp, w0, sxtw] +# M3-NEXT: 1 5 0.50 * ldr q0, [sp, w0, uxtw] +# M3-NEXT: 2 5 0.50 * ldp d0, d1, [sp], #16 +# M3-NEXT: 2 5 1.00 * ldp q0, q1, [sp], #32 +# M3-NEXT: 2 5 0.50 * ldp s0, s1, [sp, #8]! +# M3-NEXT: 2 5 1.00 * ldp q0, q1, [sp, #32]! 
+# M3-NEXT: 1 5 0.50 * ldp d0, d1, [sp, #16] +# M3-NEXT: 1 5 1.00 * ldp q0, q1, [sp, #32] + +# M4-NEXT: 1 5 0.50 * ldr s0, {{\.?}}Ltmp0 +# M4-NEXT: 1 5 0.50 * ldr q0, {{\.?}}Ltmp0 +# M4-NEXT: 1 5 0.50 * ldur d0, [sp, #2] +# M4-NEXT: 1 5 0.50 * ldur q0, [sp, #16] +# M4-NEXT: 1 5 0.50 * ldr b0, [sp], #1 +# M4-NEXT: 1 5 0.50 * ldr q0, [sp], #16 +# M4-NEXT: 1 5 0.50 * ldr h0, [sp, #2]! +# M4-NEXT: 1 5 0.50 * ldr q0, [sp, #16]! +# M4-NEXT: 1 5 0.50 * ldr s0, [sp, #4] +# M4-NEXT: 1 5 0.50 * ldr q0, [sp, #16] +# M4-NEXT: 1 5 0.50 * ldr d0, [sp, x0, lsl #3] +# M4-NEXT: 2 6 0.50 * ldr q0, [sp, x0, lsl #4] +# M4-NEXT: 1 5 0.50 * ldr b0, [sp, x0] +# M4-NEXT: 1 5 0.50 * ldr q0, [sp, x0] +# M4-NEXT: 2 6 0.50 * ldr h0, [sp, w0, sxtw #1] +# M4-NEXT: 2 6 0.50 * ldr q0, [sp, w0, uxtw #4] +# M4-NEXT: 2 6 0.50 * ldr s0, [sp, w0, sxtw] +# M4-NEXT: 2 6 0.50 * ldr q0, [sp, w0, uxtw] +# M4-NEXT: 1 5 0.50 * ldp d0, d1, [sp], #16 +# M4-NEXT: 2 5 0.50 * ldp q0, q1, [sp], #32 +# M4-NEXT: 2 5 0.50 * ldp s0, s1, [sp, #8]! +# M4-NEXT: 2 5 1.00 * ldp q0, q1, [sp, #32]! +# M4-NEXT: 1 5 0.50 * ldp d0, d1, [sp, #16] +# M4-NEXT: 1 5 1.00 * ldp q0, q1, [sp, #32] + +# M5-NEXT: 1 6 0.50 * ldr s0, {{\.?}}Ltmp0 +# M5-NEXT: 1 6 0.50 * ldr q0, {{\.?}}Ltmp0 +# M5-NEXT: 1 6 0.50 * ldur d0, [sp, #2] +# M5-NEXT: 1 6 0.50 * ldur q0, [sp, #16] +# M5-NEXT: 1 6 0.50 * ldr b0, [sp], #1 +# M5-NEXT: 1 6 0.50 * ldr q0, [sp], #16 +# M5-NEXT: 1 6 0.50 * ldr h0, [sp, #2]! +# M5-NEXT: 1 6 0.50 * ldr q0, [sp, #16]! 
+# M5-NEXT: 1 6 0.50 * ldr s0, [sp, #4] +# M5-NEXT: 1 6 0.50 * ldr q0, [sp, #16] +# M5-NEXT: 1 6 0.50 * ldr d0, [sp, x0, lsl #3] +# M5-NEXT: 2 7 0.50 * ldr q0, [sp, x0, lsl #4] +# M5-NEXT: 1 6 0.50 * ldr b0, [sp, x0] +# M5-NEXT: 1 6 0.50 * ldr q0, [sp, x0] +# M5-NEXT: 2 7 0.50 * ldr h0, [sp, w0, sxtw #1] +# M5-NEXT: 2 7 0.50 * ldr q0, [sp, w0, uxtw #4] +# M5-NEXT: 2 7 0.50 * ldr s0, [sp, w0, sxtw] +# M5-NEXT: 2 7 0.50 * ldr q0, [sp, w0, uxtw] +# M5-NEXT: 2 6 0.50 * ldp d0, d1, [sp], #16 +# M5-NEXT: 2 6 1.00 * ldp q0, q1, [sp], #32 +# M5-NEXT: 2 6 0.50 * ldp s0, s1, [sp, #8]! +# M5-NEXT: 2 6 1.00 * ldp q0, q1, [sp, #32]! +# M5-NEXT: 1 6 0.50 * ldp d0, d1, [sp, #16] +# M5-NEXT: 1 6 1.00 * ldp q0, q1, [sp, #32] diff --git a/test/tools/llvm-mca/AArch64/Exynos/float-recp.s b/test/tools/llvm-mca/AArch64/Exynos/float-recp.s new file mode 100644 index 00000000000..05245ad631f --- /dev/null +++ b/test/tools/llvm-mca/AArch64/Exynos/float-recp.s @@ -0,0 +1,62 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m3 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M3 +# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m4 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M4 +# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m5 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M5 + +fmov s31, #1.00000000 +fdiv s30, s31, s30 + +# Newton series for 1 / x. 
+frecpe s1, s0 +frecps s2, s0, s1 +fmul s1, s1, s2 +frecps s0, s0, s1 +fmul s0, s1, s0 + +# ALL: Iterations: 100 +# ALL-NEXT: Instructions: 700 + +# M3-NEXT: Total Cycles: 1803 +# M4-NEXT: Total Cycles: 1703 +# M5-NEXT: Total Cycles: 1703 + +# ALL-NEXT: Total uOps: 700 + +# ALL: Dispatch Width: 6 + +# M3-NEXT: uOps Per Cycle: 0.39 +# M3-NEXT: IPC: 0.39 +# M3-NEXT: Block RThroughput: 2.0 + +# M4-NEXT: uOps Per Cycle: 0.41 +# M4-NEXT: IPC: 0.41 +# M4-NEXT: Block RThroughput: 1.5 + +# M5-NEXT: uOps Per Cycle: 0.41 +# M5-NEXT: IPC: 0.41 +# M5-NEXT: Block RThroughput: 1.3 + +# ALL: Instruction Info: +# ALL-NEXT: [1]: #uOps +# ALL-NEXT: [2]: Latency +# ALL-NEXT: [3]: RThroughput +# ALL-NEXT: [4]: MayLoad +# ALL-NEXT: [5]: MayStore +# ALL-NEXT: [6]: HasSideEffects (U) + +# ALL: [1] [2] [3] [4] [5] [6] Instructions: +# ALL-NEXT: 1 1 0.33 fmov s31, #1.00000000 + +# M3-NEXT: 1 7 2.00 fdiv s30, s31, s30 +# M3-NEXT: 1 4 0.50 frecpe s1, s0 + +# M4-NEXT: 1 7 1.50 fdiv s30, s31, s30 +# M4-NEXT: 1 3 0.50 frecpe s1, s0 + +# M5-NEXT: 1 7 1.00 fdiv s30, s31, s30 +# M5-NEXT: 1 3 0.50 frecpe s1, s0 + +# ALL-NEXT: 1 4 0.33 frecps s2, s0, s1 +# ALL-NEXT: 1 3 0.33 fmul s1, s1, s2 +# ALL-NEXT: 1 4 0.33 frecps s0, s0, s1 +# ALL-NEXT: 1 3 0.33 fmul s0, s1, s0 diff --git a/test/tools/llvm-mca/AArch64/Exynos/float-rsqrt.s b/test/tools/llvm-mca/AArch64/Exynos/float-rsqrt.s new file mode 100644 index 00000000000..fd82cc35329 --- /dev/null +++ b/test/tools/llvm-mca/AArch64/Exynos/float-rsqrt.s @@ -0,0 +1,72 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m3 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M3 +# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m4 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M4 +# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m5 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M5 + +fsqrt s30, s30 +fmov s31, #1.00000000 +fdiv s30, s31, s30 + +# 
Newton series for 1 / sqrtf(). +frsqrte s1, s0 +fmul s2, s1, s1 +frsqrts s2, s0, s2 +fmul s1, s1, s2 +fmul s2, s1, s1 +frsqrts s0, s0, s2 +fmul s0, s1, s0 + +# ALL: Iterations: 100 +# ALL-NEXT: Instructions: 1000 + +# M3-NEXT: Total Cycles: 2503 +# M4-NEXT: Total Cycles: 2303 +# M5-NEXT: Total Cycles: 2303 + +# ALL-NEXT: Total uOps: 1000 + +# ALL: Dispatch Width: 6 + +# M3-NEXT: uOps Per Cycle: 0.40 +# M3-NEXT: IPC: 0.40 +# M3-NEXT: Block RThroughput: 19.0 + +# M4-NEXT: uOps Per Cycle: 0.43 +# M4-NEXT: IPC: 0.43 +# M4-NEXT: Block RThroughput: 2.0 + +# M5-NEXT: uOps Per Cycle: 0.43 +# M5-NEXT: IPC: 0.43 +# M5-NEXT: Block RThroughput: 2.0 + +# ALL: Instruction Info: +# ALL-NEXT: [1]: #uOps +# ALL-NEXT: [2]: Latency +# ALL-NEXT: [3]: RThroughput +# ALL-NEXT: [4]: MayLoad +# ALL-NEXT: [5]: MayStore +# ALL-NEXT: [6]: HasSideEffects (U) + +# ALL: [1] [2] [3] [4] [5] [6] Instructions: + +# M3-NEXT: 1 18 19.00 fsqrt s30, s30 +# M4-NEXT: 1 8 1.75 fsqrt s30, s30 +# M5-NEXT: 1 8 1.25 fsqrt s30, s30 + +# ALL-NEXT: 1 1 0.33 fmov s31, #1.00000000 + +# M3-NEXT: 1 7 2.00 fdiv s30, s31, s30 +# M3-NEXT: 1 4 0.50 frsqrte s1, s0 + +# M4-NEXT: 1 7 1.50 fdiv s30, s31, s30 +# M4-NEXT: 1 3 0.50 frsqrte s1, s0 + +# M5-NEXT: 1 7 1.00 fdiv s30, s31, s30 +# M5-NEXT: 1 3 0.50 frsqrte s1, s0 + +# ALL-NEXT: 1 3 0.33 fmul s2, s1, s1 +# ALL-NEXT: 1 4 0.33 frsqrts s2, s0, s2 +# ALL-NEXT: 1 3 0.33 fmul s1, s1, s2 +# ALL-NEXT: 1 3 0.33 fmul s2, s1, s1 +# ALL-NEXT: 1 4 0.33 frsqrts s0, s0, s2 +# ALL-NEXT: 1 3 0.33 fmul s0, s1, s0 diff --git a/test/tools/llvm-mca/AArch64/Exynos/float-sqrt.s b/test/tools/llvm-mca/AArch64/Exynos/float-sqrt.s new file mode 100644 index 00000000000..423fae20366 --- /dev/null +++ b/test/tools/llvm-mca/AArch64/Exynos/float-sqrt.s @@ -0,0 +1,73 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m3 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M3 +# RUN: llvm-mca -march=aarch64 
-mcpu=exynos-m4 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M4 +# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m5 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M5 + +fsqrt s31, s31 + +# Newton series for sqrtf(). +frsqrte s1, s0 +fmul s2, s1, s1 +frsqrts s2, s0, s2 +fmul s1, s1, s2 +fmul s2, s1, s1 +frsqrts s2, s0, s2 +fmul s2, s2, s0 +fmul s1, s1, s2 +fcmp s0, #0.0 +fcsel s0, s0, s1, eq + +# ALL: Iterations: 100 +# ALL-NEXT: Instructions: 1100 + +# M3-NEXT: Total Cycles: 3203 +# M4-NEXT: Total Cycles: 3103 +# M5-NEXT: Total Cycles: 2803 + +# ALL-NEXT: Total uOps: 1200 + +# ALL: Dispatch Width: 6 + +# M3-NEXT: uOps Per Cycle: 0.37 +# M3-NEXT: IPC: 0.34 +# M3-NEXT: Block RThroughput: 20.0 + +# M4-NEXT: uOps Per Cycle: 0.39 +# M4-NEXT: IPC: 0.35 +# M4-NEXT: Block RThroughput: 2.3 + +# M5-NEXT: uOps Per Cycle: 0.43 +# M5-NEXT: IPC: 0.39 +# M5-NEXT: Block RThroughput: 2.3 + +# ALL: Instruction Info: +# ALL-NEXT: [1]: #uOps +# ALL-NEXT: [2]: Latency +# ALL-NEXT: [3]: RThroughput +# ALL-NEXT: [4]: MayLoad +# ALL-NEXT: [5]: MayStore +# ALL-NEXT: [6]: HasSideEffects (U) + +# ALL: [1] [2] [3] [4] [5] [6] Instructions: + +# M3-NEXT: 1 18 19.00 fsqrt s31, s31 +# M3-NEXT: 1 4 0.50 frsqrte s1, s0 + +# M4-NEXT: 1 8 1.75 fsqrt s31, s31 +# M4-NEXT: 1 3 0.50 frsqrte s1, s0 + +# M5-NEXT: 1 8 1.25 fsqrt s31, s31 +# M5-NEXT: 1 3 0.50 frsqrte s1, s0 + +# ALL-NEXT: 1 3 0.33 fmul s2, s1, s1 +# ALL-NEXT: 1 4 0.33 frsqrts s2, s0, s2 +# ALL-NEXT: 1 3 0.33 fmul s1, s1, s2 +# ALL-NEXT: 1 3 0.33 fmul s2, s1, s1 +# ALL-NEXT: 1 4 0.33 frsqrts s2, s0, s2 +# ALL-NEXT: 1 3 0.33 fmul s2, s2, s0 +# ALL-NEXT: 1 3 0.33 fmul s1, s1, s2 +# ALL-NEXT: 1 2 1.00 fcmp s0, #0.0 + +# M3-NEXT: 2 5 1.00 fcsel s0, s0, s1, eq +# M4-NEXT: 2 5 1.00 fcsel s0, s0, s1, eq +# M5-NEXT: 2 2 1.00 fcsel s0, s0, s1, eq diff --git a/test/tools/llvm-mca/AArch64/Exynos/float-store.s b/test/tools/llvm-mca/AArch64/Exynos/float-store.s new file mode 100644 index 00000000000..55d1d60252b --- 
/dev/null +++ b/test/tools/llvm-mca/AArch64/Exynos/float-store.s @@ -0,0 +1,142 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m3 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M3 +# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m4 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M4 +# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m5 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M5 + +stur d0, [sp, #2] +stur q0, [sp, #16] + +str b0, [sp], #1 +str q0, [sp], #16 + +str h0, [sp, #2]! +str q0, [sp, #16]! + +str s0, [sp, #4] +str q0, [sp, #16] + +str d0, [sp, x0, lsl #3] +str q0, [sp, x0, lsl #4] + +str b0, [sp, x0] +str q0, [sp, x0] + +str h0, [sp, w0, sxtw #1] +str q0, [sp, w0, uxtw #4] + +str s0, [sp, w0, sxtw] +str q0, [sp, w0, uxtw] + +stp d0, d1, [sp], #16 +stp q0, q1, [sp], #32 + +stp s0, s1, [sp, #8]! +stp q0, q1, [sp, #32]! + +stp d0, d1, [sp, #16] +stp q0, q1, [sp, #32] + +# ALL: Iterations: 100 +# ALL-NEXT: Instructions: 2200 + +# M3-NEXT: Total Cycles: 3203 +# M3-NEXT: Total uOps: 2900 + +# M4-NEXT: Total Cycles: 3203 +# M4-NEXT: Total uOps: 3000 + +# M5-NEXT: Total Cycles: 2803 +# M5-NEXT: Total uOps: 2500 + +# ALL: Dispatch Width: 6 + +# M3-NEXT: uOps Per Cycle: 0.91 +# M3-NEXT: IPC: 0.69 +# M3-NEXT: Block RThroughput: 22.0 + +# M4-NEXT: uOps Per Cycle: 0.94 +# M4-NEXT: IPC: 0.69 +# M4-NEXT: Block RThroughput: 12.5 + +# M5-NEXT: uOps Per Cycle: 0.89 +# M5-NEXT: IPC: 0.78 +# M5-NEXT: Block RThroughput: 11.0 + +# ALL: Instruction Info: +# ALL-NEXT: [1]: #uOps +# ALL-NEXT: [2]: Latency +# ALL-NEXT: [3]: RThroughput +# ALL-NEXT: [4]: MayLoad +# ALL-NEXT: [5]: MayStore +# ALL-NEXT: [6]: HasSideEffects (U) + +# ALL: [1] [2] [3] [4] [5] [6] Instructions: + +# M3-NEXT: 1 1 1.00 * stur d0, [sp, #2] +# M3-NEXT: 1 1 1.00 * stur q0, [sp, #16] +# M3-NEXT: 1 1 1.00 * str b0, [sp], #1 +# M3-NEXT: 1 1 1.00 * str q0, [sp], #16 +# M3-NEXT: 1 1 1.00 * 
str h0, [sp, #2]! +# M3-NEXT: 1 1 1.00 * str q0, [sp, #16]! +# M3-NEXT: 1 1 1.00 * str s0, [sp, #4] +# M3-NEXT: 1 1 1.00 * str q0, [sp, #16] +# M3-NEXT: 1 1 1.00 * str d0, [sp, x0, lsl #3] +# M3-NEXT: 2 3 1.00 * str q0, [sp, x0, lsl #4] +# M3-NEXT: 1 1 1.00 * str b0, [sp, x0] +# M3-NEXT: 1 1 1.00 * str q0, [sp, x0] +# M3-NEXT: 2 3 1.00 * str h0, [sp, w0, sxtw #1] +# M3-NEXT: 2 3 1.00 * str q0, [sp, w0, uxtw #4] +# M3-NEXT: 2 3 1.00 * str s0, [sp, w0, sxtw] +# M3-NEXT: 2 3 1.00 * str q0, [sp, w0, uxtw] +# M3-NEXT: 1 1 1.00 * stp d0, d1, [sp], #16 +# M3-NEXT: 2 1 1.00 * stp q0, q1, [sp], #32 +# M3-NEXT: 1 1 1.00 * stp s0, s1, [sp, #8]! +# M3-NEXT: 2 1 1.00 * stp q0, q1, [sp, #32]! +# M3-NEXT: 1 1 1.00 * stp d0, d1, [sp, #16] +# M3-NEXT: 1 1 1.00 * stp q0, q1, [sp, #32] + +# M4-NEXT: 1 1 0.50 * stur d0, [sp, #2] +# M4-NEXT: 1 1 0.50 * stur q0, [sp, #16] +# M4-NEXT: 1 1 0.50 * str b0, [sp], #1 +# M4-NEXT: 1 1 0.50 * str q0, [sp], #16 +# M4-NEXT: 1 1 0.50 * str h0, [sp, #2]! +# M4-NEXT: 1 1 0.50 * str q0, [sp, #16]! +# M4-NEXT: 1 1 0.50 * str s0, [sp, #4] +# M4-NEXT: 1 1 0.50 * str q0, [sp, #16] +# M4-NEXT: 1 1 0.50 * str d0, [sp, x0, lsl #3] +# M4-NEXT: 2 3 0.50 * str q0, [sp, x0, lsl #4] +# M4-NEXT: 1 1 0.50 * str b0, [sp, x0] +# M4-NEXT: 1 1 0.50 * str q0, [sp, x0] +# M4-NEXT: 2 3 0.50 * str h0, [sp, w0, sxtw #1] +# M4-NEXT: 2 3 0.50 * str q0, [sp, w0, uxtw #4] +# M4-NEXT: 2 3 0.50 * str s0, [sp, w0, sxtw] +# M4-NEXT: 2 3 0.50 * str q0, [sp, w0, uxtw] +# M4-NEXT: 1 1 0.50 * stp d0, d1, [sp], #16 +# M4-NEXT: 2 1 1.00 * stp q0, q1, [sp], #32 +# M4-NEXT: 1 1 0.50 * stp s0, s1, [sp, #8]! +# M4-NEXT: 2 1 1.00 * stp q0, q1, [sp, #32]! +# M4-NEXT: 1 1 0.50 * stp d0, d1, [sp, #16] +# M4-NEXT: 2 1 1.00 * stp q0, q1, [sp, #32] + +# M5-NEXT: 1 1 0.50 * stur d0, [sp, #2] +# M5-NEXT: 1 1 0.50 * stur q0, [sp, #16] +# M5-NEXT: 1 1 0.50 * str b0, [sp], #1 +# M5-NEXT: 1 1 0.50 * str q0, [sp], #16 +# M5-NEXT: 1 1 0.50 * str h0, [sp, #2]! +# M5-NEXT: 1 1 0.50 * str q0, [sp, #16]! 
+# M5-NEXT: 1 1 0.50 * str s0, [sp, #4] +# M5-NEXT: 1 1 0.50 * str q0, [sp, #16] +# M5-NEXT: 1 1 0.50 * str d0, [sp, x0, lsl #3] +# M5-NEXT: 2 3 0.50 * str q0, [sp, x0, lsl #4] +# M5-NEXT: 1 1 0.50 * str b0, [sp, x0] +# M5-NEXT: 1 1 0.50 * str q0, [sp, x0] +# M5-NEXT: 1 1 0.50 * str h0, [sp, w0, sxtw #1] +# M5-NEXT: 2 3 0.50 * str q0, [sp, w0, uxtw #4] +# M5-NEXT: 1 1 0.50 * str s0, [sp, w0, sxtw] +# M5-NEXT: 2 3 0.50 * str q0, [sp, w0, uxtw] +# M5-NEXT: 1 1 0.50 * stp d0, d1, [sp], #16 +# M5-NEXT: 1 1 1.00 * stp q0, q1, [sp], #32 +# M5-NEXT: 1 1 0.50 * stp s0, s1, [sp, #8]! +# M5-NEXT: 1 1 1.00 * stp q0, q1, [sp, #32]! +# M5-NEXT: 1 1 0.50 * stp d0, d1, [sp, #16] +# M5-NEXT: 1 1 1.00 * stp q0, q1, [sp, #32] diff --git a/test/tools/llvm-mca/AArch64/Exynos/load.s b/test/tools/llvm-mca/AArch64/Exynos/load.s new file mode 100644 index 00000000000..04f30d353ae --- /dev/null +++ b/test/tools/llvm-mca/AArch64/Exynos/load.s @@ -0,0 +1,66 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m3 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M3 +# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m4 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M4 +# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m5 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M5 + +ldr w0, 1f +ldur x0, [sp, #8] +ldrb w0, [sp], #1 +ldrsh w0, [sp, #2]! +ldr x0, [sp, #8] +ldrb w0, [sp, x31] +ldrsh w0, [sp, x31, lsl #1] +ldr w0, [sp, w31, sxtw] +ldr x0, [sp, w31, uxtw #3] +ldnp w0, w1, [sp, #8] +ldp x0, x1, [sp], #16 +ldpsw x0, x1, [sp, #8]! 
+ +1: + +# ALL: Iterations: 100 +# ALL-NEXT: Instructions: 1200 +# ALL-NEXT: Total Cycles: 1904 + +# M3-NEXT: Total uOps: 1600 +# M4-NEXT: Total uOps: 1400 +# M5-NEXT: Total uOps: 1400 + +# ALL: Dispatch Width: 6 + +# M3-NEXT: uOps Per Cycle: 0.84 +# M4-NEXT: uOps Per Cycle: 0.74 +# M5-NEXT: uOps Per Cycle: 0.74 + +# ALL-NEXT: IPC: 0.63 +# ALL-NEXT: Block RThroughput: 6.0 + +# ALL: Instruction Info: +# ALL-NEXT: [1]: #uOps +# ALL-NEXT: [2]: Latency +# ALL-NEXT: [3]: RThroughput +# ALL-NEXT: [4]: MayLoad +# ALL-NEXT: [5]: MayStore +# ALL-NEXT: [6]: HasSideEffects (U) + +# ALL: [1] [2] [3] [4] [5] [6] Instructions: +# ALL-NEXT: 1 4 0.50 * ldr w0, {{\.?}}Ltmp0 +# ALL-NEXT: 1 4 0.50 * ldur x0, [sp, #8] +# ALL-NEXT: 1 4 0.50 * ldrb w0, [sp], #1 +# ALL-NEXT: 1 4 0.50 * ldrsh w0, [sp, #2]! +# ALL-NEXT: 1 4 0.50 * ldr x0, [sp, #8] +# ALL-NEXT: 1 4 0.50 * ldrb w0, [sp, xzr] +# ALL-NEXT: 1 5 0.50 * ldrsh w0, [sp, xzr, lsl #1] + +# M3-NEXT: 2 5 0.50 * ldr w0, [sp, wzr, sxtw] +# M3-NEXT: 2 5 0.50 * ldr x0, [sp, wzr, uxtw #3] + +# M4-NEXT: 1 5 0.50 * ldr w0, [sp, wzr, sxtw] +# M4-NEXT: 1 5 0.50 * ldr x0, [sp, wzr, uxtw #3] + +# M5-NEXT: 1 5 0.50 * ldr w0, [sp, wzr, sxtw] +# M5-NEXT: 1 5 0.50 * ldr x0, [sp, wzr, uxtw #3] + +# ALL-NEXT: 1 4 0.50 * ldnp w0, w1, [sp, #8] +# ALL-NEXT: 2 4 0.50 * ldp x0, x1, [sp], #16 +# ALL-NEXT: 2 4 0.50 * ldpsw x0, x1, [sp, #8]! 
diff --git a/test/tools/llvm-mca/AArch64/Exynos/scheduler-queue-usage.s b/test/tools/llvm-mca/AArch64/Exynos/scheduler-queue-usage.s index 9e8c07149ca..b3bbec5f362 100644 --- a/test/tools/llvm-mca/AArch64/Exynos/scheduler-queue-usage.s +++ b/test/tools/llvm-mca/AArch64/Exynos/scheduler-queue-usage.s @@ -1,6 +1,7 @@ # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py # RUN: llvm-mca -march=aarch64 -mcpu=exynos-m3 -iterations=1 -scheduler-stats -resource-pressure=false -instruction-info=false < %s | FileCheck %s -check-prefixes=ALL,M3 # RUN: llvm-mca -march=aarch64 -mcpu=exynos-m4 -iterations=1 -scheduler-stats -resource-pressure=false -instruction-info=false < %s | FileCheck %s -check-prefixes=ALL,M4 +# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m5 -iterations=1 -scheduler-stats -resource-pressure=false -instruction-info=false < %s | FileCheck %s -check-prefixes=ALL,M5 b main @@ -19,6 +20,11 @@ # M4-NEXT: IPC: 0.50 # M4-NEXT: Block RThroughput: 0.2 +# M5: Dispatch Width: 6 +# M5-NEXT: uOps Per Cycle: 0.50 +# M5-NEXT: IPC: 0.50 +# M5-NEXT: Block RThroughput: 0.2 + # ALL: Schedulers - number of cycles where we saw N micro opcodes issued: # ALL-NEXT: [# issued], [# cycles] # ALL-NEXT: 0, 1 (50.0%) diff --git a/test/tools/llvm-mca/AArch64/Exynos/shifted-register.s b/test/tools/llvm-mca/AArch64/Exynos/shifted-register.s index 6a1c81b5fb4..8d885f431d7 100644 --- a/test/tools/llvm-mca/AArch64/Exynos/shifted-register.s +++ b/test/tools/llvm-mca/AArch64/Exynos/shifted-register.s @@ -1,6 +1,7 @@ # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py # RUN: llvm-mca -march=aarch64 -mcpu=exynos-m3 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,EM3 # RUN: llvm-mca -march=aarch64 -mcpu=exynos-m4 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,EM4 +# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m5 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,EM5 adds w0, w1, w2, lsl #0 sub x3, x4, 
x5, lsr #1 @@ -9,13 +10,14 @@ adds w12, w13, w14, lsl #4 sub x15, x16, x17, lsr #6 ands x18, x19, x20, lsl #8 - orr w21, w22, w23, asr #10 + eor w21, w22, w23, asr #10 # ALL: Iterations: 100 # ALL-NEXT: Instructions: 800 # EM3-NEXT: Total Cycles: 354 # EM4-NEXT: Total Cycles: 329 +# EM5-NEXT: Total Cycles: 220 # ALL-NEXT: Total uOps: 800 @@ -29,6 +31,11 @@ # EM4-NEXT: IPC: 2.43 # EM4-NEXT: Block RThroughput: 3.3 +# EM5: Dispatch Width: 6 +# EM5-NEXT: uOps Per Cycle: 3.64 +# EM5-NEXT: IPC: 3.64 +# EM5-NEXT: Block RThroughput: 1.5 + # ALL: Instruction Info: # ALL-NEXT: [1]: #uOps # ALL-NEXT: [2]: Latency @@ -46,7 +53,7 @@ # EM3-NEXT: 1 2 0.50 adds w12, w13, w14, lsl #4 # EM3-NEXT: 1 2 0.50 sub x15, x16, x17, lsr #6 # EM3-NEXT: 1 2 0.50 ands x18, x19, x20, lsl #8 -# EM3-NEXT: 1 2 0.50 orr w21, w22, w23, asr #10 +# EM3-NEXT: 1 2 0.50 eor w21, w22, w23, asr #10 # EM4-NEXT: 1 1 0.25 adds w0, w1, w2 # EM4-NEXT: 1 2 0.50 sub x3, x4, x5, lsr #1 @@ -55,4 +62,13 @@ # EM4-NEXT: 1 2 0.50 adds w12, w13, w14, lsl #4 # EM4-NEXT: 1 2 0.50 sub x15, x16, x17, lsr #6 # EM4-NEXT: 1 1 0.25 ands x18, x19, x20, lsl #8 -# EM4-NEXT: 1 2 0.50 orr w21, w22, w23, asr #10 +# EM4-NEXT: 1 2 0.50 eor w21, w22, w23, asr #10 + +# EM5-NEXT: 1 1 0.17 adds w0, w1, w2 +# EM5-NEXT: 1 2 0.50 sub x3, x4, x5, lsr #1 +# EM5-NEXT: 1 1 0.25 ands x6, x7, x8, lsl #2 +# EM5-NEXT: 1 2 0.33 orr w9, w10, w11, asr #3 +# EM5-NEXT: 1 2 0.33 adds w12, w13, w14, lsl #4 +# EM5-NEXT: 1 2 0.50 sub x15, x16, x17, lsr #6 +# EM5-NEXT: 1 1 0.25 ands x18, x19, x20, lsl #8 +# EM5-NEXT: 1 2 0.33 eor w21, w22, w23, asr #10 diff --git a/test/tools/llvm-mca/AArch64/Exynos/store.s b/test/tools/llvm-mca/AArch64/Exynos/store.s new file mode 100644 index 00000000000..b86cdac50e6 --- /dev/null +++ b/test/tools/llvm-mca/AArch64/Exynos/store.s @@ -0,0 +1,82 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m3 -resource-pressure=false < %s | FileCheck %s 
-check-prefixes=ALL,M3 +# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m4 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M4 +# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m5 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M5 + +stur x0, [sp, #8] +strb w0, [sp], #1 +strh w0, [sp, #2]! +str x0, [sp, #8] +strb w0, [sp, x31] +strh w0, [sp, x31, lsl #1] +str w0, [sp, w31, sxtw] +str x0, [sp, w31, uxtw #3] +stnp w0, w1, [sp, #8] +stp x0, x1, [sp], #16 +stp w0, w1, [sp, #8]! + +# ALL: Iterations: 100 +# ALL-NEXT: Instructions: 1100 +# ALL-NEXT: Total Cycles: 1303 + +# M3-NEXT: Total uOps: 1300 +# M4-NEXT: Total uOps: 1100 +# M5-NEXT: Total uOps: 1100 + +# ALL: Dispatch Width: 6 + +# M3-NEXT: uOps Per Cycle: 1.00 +# M4-NEXT: uOps Per Cycle: 0.84 +# M5-NEXT: uOps Per Cycle: 0.84 + +# ALL-NEXT: IPC: 0.84 + +# M3-NEXT: Block RThroughput: 11.0 +# M4-NEXT: Block RThroughput: 5.5 +# M5-NEXT: Block RThroughput: 5.5 + +# ALL: Instruction Info: +# ALL-NEXT: [1]: #uOps +# ALL-NEXT: [2]: Latency +# ALL-NEXT: [3]: RThroughput +# ALL-NEXT: [4]: MayLoad +# ALL-NEXT: [5]: MayStore +# ALL-NEXT: [6]: HasSideEffects (U) + +# ALL: [1] [2] [3] [4] [5] [6] Instructions: + +# M3-NEXT: 1 1 1.00 * stur x0, [sp, #8] +# M3-NEXT: 1 1 1.00 * strb w0, [sp], #1 +# M3-NEXT: 1 1 1.00 * strh w0, [sp, #2]! +# M3-NEXT: 1 1 1.00 * str x0, [sp, #8] +# M3-NEXT: 1 1 1.00 * strb w0, [sp, xzr] +# M3-NEXT: 1 1 1.00 * strh w0, [sp, xzr, lsl #1] +# M3-NEXT: 2 2 1.00 * str w0, [sp, wzr, sxtw] +# M3-NEXT: 2 2 1.00 * str x0, [sp, wzr, uxtw #3] +# M3-NEXT: 1 1 1.00 * stnp w0, w1, [sp, #8] +# M3-NEXT: 1 1 1.00 * stp x0, x1, [sp], #16 +# M3-NEXT: 1 1 1.00 * stp w0, w1, [sp, #8]! + +# M4-NEXT: 1 1 0.50 * stur x0, [sp, #8] +# M4-NEXT: 1 1 0.50 * strb w0, [sp], #1 +# M4-NEXT: 1 1 0.50 * strh w0, [sp, #2]! 
+# M4-NEXT: 1 1 0.50 * str x0, [sp, #8] +# M4-NEXT: 1 1 0.50 * strb w0, [sp, xzr] +# M4-NEXT: 1 1 0.50 * strh w0, [sp, xzr, lsl #1] +# M4-NEXT: 1 2 0.50 * str w0, [sp, wzr, sxtw] +# M4-NEXT: 1 2 0.50 * str x0, [sp, wzr, uxtw #3] +# M4-NEXT: 1 1 0.50 * stnp w0, w1, [sp, #8] +# M4-NEXT: 1 1 0.50 * stp x0, x1, [sp], #16 +# M4-NEXT: 1 1 0.50 * stp w0, w1, [sp, #8]! + +# M5-NEXT: 1 1 0.50 * stur x0, [sp, #8] +# M5-NEXT: 1 1 0.50 * strb w0, [sp], #1 +# M5-NEXT: 1 1 0.50 * strh w0, [sp, #2]! +# M5-NEXT: 1 1 0.50 * str x0, [sp, #8] +# M5-NEXT: 1 1 0.50 * strb w0, [sp, xzr] +# M5-NEXT: 1 1 0.50 * strh w0, [sp, xzr, lsl #1] +# M5-NEXT: 1 2 0.50 * str w0, [sp, wzr, sxtw] +# M5-NEXT: 1 2 0.50 * str x0, [sp, wzr, uxtw #3] +# M5-NEXT: 1 1 0.50 * stnp w0, w1, [sp, #8] +# M5-NEXT: 1 1 0.50 * stp x0, x1, [sp], #16 +# M5-NEXT: 1 1 0.50 * stp w0, w1, [sp, #8]! diff --git a/test/tools/llvm-mca/AArch64/Exynos/zero-latency-move.s b/test/tools/llvm-mca/AArch64/Exynos/zero-latency-move.s index a42291108d0..3fecb1eebd1 100644 --- a/test/tools/llvm-mca/AArch64/Exynos/zero-latency-move.s +++ b/test/tools/llvm-mca/AArch64/Exynos/zero-latency-move.s @@ -1,6 +1,7 @@ # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py # RUN: llvm-mca -march=aarch64 -mcpu=exynos-m3 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M3 # RUN: llvm-mca -march=aarch64 -mcpu=exynos-m4 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M4 +# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m5 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M5 mov x0, x1 mov sp, x0 @@ -22,21 +23,13 @@ # ALL: Iterations: 100 # ALL-NEXT: Instructions: 1000 - -# M3-NEXT: Total Cycles: 172 -# M4-NEXT: Total Cycles: 172 - +# ALL-NEXT: Total Cycles: 172 # ALL-NEXT: Total uOps: 1000 -# M3: Dispatch Width: 6 -# M3-NEXT: uOps Per Cycle: 5.81 -# M3-NEXT: IPC: 5.81 -# M3-NEXT: Block RThroughput: 1.7 - -# M4: Dispatch Width: 6 -# M4-NEXT: uOps Per Cycle: 5.81 -# M4-NEXT: IPC: 5.81 -# 
M4-NEXT: Block RThroughput: 1.7 +# ALL: Dispatch Width: 6 +# ALL-NEXT: uOps Per Cycle: 5.81 +# ALL-NEXT: IPC: 5.81 +# ALL-NEXT: Block RThroughput: 1.7 # ALL: Instruction Info: # ALL-NEXT: [1]: #uOps @@ -47,25 +40,21 @@ # ALL-NEXT: [6]: HasSideEffects (U) # ALL: [1] [2] [3] [4] [5] [6] Instructions: +# ALL-NEXT: 1 0 0.17 mov x0, x1 +# ALL-NEXT: 1 0 0.17 mov sp, x0 +# ALL-NEXT: 1 0 0.17 mov w0, #12816 -# M3-NEXT: 1 0 0.17 mov x0, x1 -# M3-NEXT: 1 0 0.17 mov sp, x0 -# M3-NEXT: 1 0 0.17 mov w0, #12816 # M3-NEXT: 1 1 0.25 add w0, w1, #0 -# M3-NEXT: 1 0 0.17 adr x0, {{\.?}}Ltmp0 -# M3-NEXT: 1 4 0.50 * ldr x0, [x0] -# M3-NEXT: 1 0 0.17 adrp x0, {{\.?}}Ltmp0 -# M3-NEXT: 1 1 0.25 add x0, x0, :lo12:{{\.?}}Ltmp0 -# M3-NEXT: 1 1 0.33 fmov s0, s1 -# M3-NEXT: 1 0 0.17 movi d0, #0000000000000000 - -# M4-NEXT: 1 0 0.17 mov x0, x1 -# M4-NEXT: 1 0 0.17 mov sp, x0 -# M4-NEXT: 1 0 0.17 mov w0, #12816 # M4-NEXT: 1 1 0.25 add w0, w1, #0 -# M4-NEXT: 1 0 0.17 adr x0, {{\.?}}Ltmp0 -# M4-NEXT: 1 4 0.50 * ldr x0, [x0] -# M4-NEXT: 1 0 0.17 adrp x0, {{\.?}}Ltmp0 -# M4-NEXT: 1 1 0.25 add x0, x0, :lo12:{{\.?}}Ltmp0 +# M5-NEXT: 1 1 0.17 add w0, w1, #0 + +# ALL-NEXT: 1 0 0.17 adr x0, {{\.?}}Ltmp0 +# ALL-NEXT: 1 4 0.50 * ldr x0, [x0] +# ALL-NEXT: 1 0 0.17 adrp x0, {{\.?}}Ltmp0 +# ALL-NEXT: 1 1 0.25 add x0, x0, :lo12:{{\.?}}Ltmp0 + +# M3-NEXT: 1 1 0.33 fmov s0, s1 # M4-NEXT: 1 1 0.33 fmov s0, s1 -# M4-NEXT: 1 0 0.17 movi d0, #0000000000000000 +# M5-NEXT: 1 2 0.33 fmov s0, s1 + +# ALL-NEXT: 1 0 0.17 movi d0, #0000000000000000