mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-11-23 03:02:36 +01:00
5d534d8259
I really needed this, like, factually, yesterday, when verifying dependency breaking idioms for AMD Zen 3 scheduler model. Consider the following example: ``` $ ./bin/llvm-exegesis --mode=inverse_throughput --snippets-file=/tmp/snippet.s --num-repetitions=1000000 --repetition-mode=duplicate Check generated assembly with: /usr/bin/objdump -d /tmp/snippet-4a7e50.o --- mode: inverse_throughput key: instructions: - 'VPXORYrr YMM0 YMM0 YMM0' config: '' register_initial_values: [] cpu_name: znver3 llvm_triple: x86_64-unknown-linux-gnu num_repetitions: 1000000 measurements: - { key: inverse_throughput, value: 0.31025, per_snippet_value: 0.31025 } error: '' info: '' assembled_snippet: C5FDEFC0C5FDEFC0C5FDEFC0C5FDEFC0C5FDEFC0C5FDEFC0C5FDEFC0C5FDEFC0C5FDEFC0C5FDEFC0C5FDEFC0C5FDEFC0C5FDEFC0C5FDEFC0C5FDEFC0C5FDEFC0C3 ... ``` What does it tell us? So wait, it can only execute ~3 x86 AVX YMM PXOR zero-idioms per cycle? That doesn't seem right. That's even less than there are pipes supporting this type of op. Now, second example: ``` $ ./bin/llvm-exegesis --mode=inverse_throughput --snippets-file=/tmp/snippet.s --num-repetitions=1000000 --repetition-mode=loop Check generated assembly with: /usr/bin/objdump -d /tmp/snippet-2418b5.o --- mode: inverse_throughput key: instructions: - 'VPXORYrr YMM0 YMM0 YMM0' config: '' register_initial_values: [] cpu_name: znver3 llvm_triple: x86_64-unknown-linux-gnu num_repetitions: 1000000 measurements: - { key: inverse_throughput, value: 1.00011, per_snippet_value: 1.00011 } error: '' info: '' assembled_snippet: 49B80800000000000000C5FDEFC0C5FDEFC04983C0FF75F2C3 ... ``` Now that's just worse. Due to the looping, the throughput completely plummeted, and now we can only do a single instruction/cycle!? That's not great. 
And final example: ``` $ ./bin/llvm-exegesis --mode=inverse_throughput --snippets-file=/tmp/snippet.s --num-repetitions=1000000 --repetition-mode=loop --loop-body-size=1000 Check generated assembly with: /usr/bin/objdump -d /tmp/snippet-c402e2.o --- mode: inverse_throughput key: instructions: - 'VPXORYrr YMM0 YMM0 YMM0' config: '' register_initial_values: [] cpu_name: znver3 llvm_triple: x86_64-unknown-linux-gnu num_repetitions: 1000000 measurements: - { key: inverse_throughput, value: 0.167087, per_snippet_value: 0.167087 } error: '' info: '' assembled_snippet: 49B80800000000000000C5FDEFC0C5FDEFC04983C0FF75F2C3 ... ``` So if we merge the previous two approaches, do duplicate this single-instruction snippet 1000x (loop-body-size/instruction count in snippet), and run a loop with 1000 iterations over that duplicated/unrolled snippet, the measured throughput goes through the roof, up to 5.9 instructions/cycle, which finally tells us that this idiom is zero-cycle! Reviewed By: courbet Differential Revision: https://reviews.llvm.org/D102522
123 lines
3.9 KiB
C++
123 lines
3.9 KiB
C++
//===-- BenchmarkResult.h ---------------------------------------*- C++ -*-===//
|
|
//
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
///
|
|
/// \file
|
|
/// Defines classes to represent measurements and serialize/deserialize them to
/// YAML.
|
|
///
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
#ifndef LLVM_TOOLS_LLVM_EXEGESIS_BENCHMARKRESULT_H
|
|
#define LLVM_TOOLS_LLVM_EXEGESIS_BENCHMARKRESULT_H
|
|
|
|
#include "LlvmState.h"
#include "RegisterValue.h"
#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstBuilder.h"
#include "llvm/Support/YAMLTraits.h"
#include <cassert>
#include <limits>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
|
|
|
|
namespace llvm {
|
|
class Error;
|
|
|
|
namespace exegesis {
|
|
|
|
struct InstructionBenchmarkKey {
|
|
// The LLVM opcode name.
|
|
std::vector<MCInst> Instructions;
|
|
// The initial values of the registers.
|
|
std::vector<RegisterValue> RegisterInitialValues;
|
|
// An opaque configuration, that can be used to separate several benchmarks of
|
|
// the same instruction under different configurations.
|
|
std::string Config;
|
|
};
|
|
|
|
// A single named measurement, stored both per instruction and per snippet.
struct BenchmarkMeasure {
  // A helper to create an unscaled BenchmarkMeasure: the per-instruction and
  // the per-snippet values are both set to `Value`.
  static BenchmarkMeasure Create(std::string Key, double Value) {
    // `Key` is already a by-value copy owned by this function, so move it into
    // the result instead of copying the string a second time.
    return {std::move(Key), Value, Value};
  }
  std::string Key;
  // This is the per-instruction value, i.e. measured quantity scaled per
  // instruction.
  double PerInstructionValue;
  // This is the per-snippet value, i.e. measured quantity for one repetition of
  // the whole snippet.
  double PerSnippetValue;
};
|
|
|
|
// The result of an instruction benchmark.
|
|
struct InstructionBenchmark {
|
|
InstructionBenchmarkKey Key;
|
|
enum ModeE { Unknown, Latency, Uops, InverseThroughput };
|
|
ModeE Mode;
|
|
std::string CpuName;
|
|
std::string LLVMTriple;
|
|
// Which instruction is being benchmarked here?
|
|
const MCInst &keyInstruction() const { return Key.Instructions[0]; }
|
|
// The number of instructions inside the repeated snippet. For example, if a
|
|
// snippet of 3 instructions is repeated 4 times, this is 12.
|
|
unsigned NumRepetitions = 0;
|
|
enum RepetitionModeE { Duplicate, Loop, AggregateMin };
|
|
// Note that measurements are per instruction.
|
|
std::vector<BenchmarkMeasure> Measurements;
|
|
std::string Error;
|
|
std::string Info;
|
|
std::vector<uint8_t> AssembledSnippet;
|
|
// How to aggregate measurements.
|
|
enum ResultAggregationModeE { Min, Max, Mean, MinVariance };
|
|
// Read functions.
|
|
static Expected<InstructionBenchmark> readYaml(const LLVMState &State,
|
|
StringRef Filename);
|
|
|
|
static Expected<std::vector<InstructionBenchmark>>
|
|
readYamls(const LLVMState &State, StringRef Filename);
|
|
|
|
class Error readYamlFrom(const LLVMState &State, StringRef InputContent);
|
|
|
|
// Write functions, non-const because of YAML traits.
|
|
class Error writeYamlTo(const LLVMState &State, raw_ostream &S);
|
|
|
|
class Error writeYaml(const LLVMState &State, const StringRef Filename);
|
|
};
|
|
|
|
//------------------------------------------------------------------------------
|
|
// Utilities to work with Benchmark measures.
|
|
|
|
// A class that measures stats over benchmark measures.
|
|
class PerInstructionStats {
|
|
public:
|
|
void push(const BenchmarkMeasure &BM);
|
|
|
|
double avg() const {
|
|
assert(NumValues);
|
|
return SumValues / NumValues;
|
|
}
|
|
double min() const { return MinValue; }
|
|
double max() const { return MaxValue; }
|
|
|
|
const std::string &key() const { return Key; }
|
|
|
|
private:
|
|
std::string Key;
|
|
double SumValues = 0.0;
|
|
int NumValues = 0;
|
|
double MaxValue = std::numeric_limits<double>::min();
|
|
double MinValue = std::numeric_limits<double>::max();
|
|
};
|
|
|
|
} // namespace exegesis
|
|
} // namespace llvm
|
|
|
|
#endif // LLVM_TOOLS_LLVM_EXEGESIS_BENCHMARKRESULT_H
|