1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-11-23 03:02:36 +01:00
llvm-mirror/tools/llvm-exegesis/lib/BenchmarkResult.h
Roman Lebedev 5d534d8259 [llvm-exegesis] Loop unrolling for loop snippet repetitor mode
I really needed this, like, factually, yesterday,
when verifying dependency breaking idioms for AMD Zen 3 scheduler model.

Consider the following example:
```
$ ./bin/llvm-exegesis --mode=inverse_throughput --snippets-file=/tmp/snippet.s --num-repetitions=1000000 --repetition-mode=duplicate
Check generated assembly with: /usr/bin/objdump -d /tmp/snippet-4a7e50.o
---
mode:            inverse_throughput
key:
  instructions:
    - 'VPXORYrr YMM0 YMM0 YMM0'
  config:          ''
  register_initial_values: []
cpu_name:        znver3
llvm_triple:     x86_64-unknown-linux-gnu
num_repetitions: 1000000
measurements:
  - { key: inverse_throughput, value: 0.31025, per_snippet_value: 0.31025 }
error:           ''
info:            ''
assembled_snippet: C5FDEFC0C5FDEFC0C5FDEFC0C5FDEFC0C5FDEFC0C5FDEFC0C5FDEFC0C5FDEFC0C5FDEFC0C5FDEFC0C5FDEFC0C5FDEFC0C5FDEFC0C5FDEFC0C5FDEFC0C5FDEFC0C3
...

```
What does it tell us?
So wait, it can only execute ~3 x86 AVX YMM PXOR zero-idioms per cycle?
That doesn't seem right. That's even less than there are pipes supporting this type of op.

Now, second example:
```
$ ./bin/llvm-exegesis --mode=inverse_throughput --snippets-file=/tmp/snippet.s --num-repetitions=1000000 --repetition-mode=loop
Check generated assembly with: /usr/bin/objdump -d /tmp/snippet-2418b5.o
---
mode:            inverse_throughput
key:
  instructions:
    - 'VPXORYrr YMM0 YMM0 YMM0'
  config:          ''
  register_initial_values: []
cpu_name:        znver3
llvm_triple:     x86_64-unknown-linux-gnu
num_repetitions: 1000000
measurements:
  - { key: inverse_throughput, value: 1.00011, per_snippet_value: 1.00011 }
error:           ''
info:            ''
assembled_snippet: 49B80800000000000000C5FDEFC0C5FDEFC04983C0FF75F2C3
...
```
Now that's just worse. Due to the looping, the throughput completely plummeted,
and now we can only do a single instruction/cycle!?

That's not great.
And final example:
```
$ ./bin/llvm-exegesis --mode=inverse_throughput --snippets-file=/tmp/snippet.s --num-repetitions=1000000 --repetition-mode=loop --loop-body-size=1000
Check generated assembly with: /usr/bin/objdump -d /tmp/snippet-c402e2.o
---
mode:            inverse_throughput
key:
  instructions:
    - 'VPXORYrr YMM0 YMM0 YMM0'
  config:          ''
  register_initial_values: []
cpu_name:        znver3
llvm_triple:     x86_64-unknown-linux-gnu
num_repetitions: 1000000
measurements:
  - { key: inverse_throughput, value: 0.167087, per_snippet_value: 0.167087 }
error:           ''
info:            ''
assembled_snippet: 49B80800000000000000C5FDEFC0C5FDEFC04983C0FF75F2C3
...
```

So if we merge the previous two approaches, do duplicate this single-instruction snippet 1000x
(loop-body-size/instruction count in snippet), and run a loop with 1000 iterations
over that duplicated/unrolled snippet, the measured throughput goes through the roof,
up to 5.9 instructions/cycle, which finally tells us that this idiom is zero-cycle!

Reviewed By: courbet

Differential Revision: https://reviews.llvm.org/D102522
2021-05-25 12:08:27 +03:00

123 lines
3.9 KiB
C++

//===-- BenchmarkResult.h ---------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// Defines classes to represent measurements and serialize/deserialize them to
// Yaml.
///
//===----------------------------------------------------------------------===//
#ifndef LLVM_TOOLS_LLVM_EXEGESIS_BENCHMARKRESULT_H
#define LLVM_TOOLS_LLVM_EXEGESIS_BENCHMARKRESULT_H
#include "LlvmState.h"
#include "RegisterValue.h"
#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstBuilder.h"
#include "llvm/Support/YAMLTraits.h"
#include <limits>
#include <string>
#include <unordered_map>
#include <vector>
namespace llvm {
class Error;
namespace exegesis {
struct InstructionBenchmarkKey {
// The LLVM opcode name.
std::vector<MCInst> Instructions;
// The initial values of the registers.
std::vector<RegisterValue> RegisterInitialValues;
// An opaque configuration, that can be used to separate several benchmarks of
// the same instruction under different configurations.
std::string Config;
};
struct BenchmarkMeasure {
// A helper to create an unscaled BenchmarkMeasure.
static BenchmarkMeasure Create(std::string Key, double Value) {
return {Key, Value, Value};
}
std::string Key;
// This is the per-instruction value, i.e. measured quantity scaled per
// instruction.
double PerInstructionValue;
// This is the per-snippet value, i.e. measured quantity for one repetition of
// the whole snippet.
double PerSnippetValue;
};
// The result of an instruction benchmark.
struct InstructionBenchmark {
InstructionBenchmarkKey Key;
enum ModeE { Unknown, Latency, Uops, InverseThroughput };
ModeE Mode;
std::string CpuName;
std::string LLVMTriple;
// Which instruction is being benchmarked here?
const MCInst &keyInstruction() const { return Key.Instructions[0]; }
// The number of instructions inside the repeated snippet. For example, if a
// snippet of 3 instructions is repeated 4 times, this is 12.
unsigned NumRepetitions = 0;
enum RepetitionModeE { Duplicate, Loop, AggregateMin };
// Note that measurements are per instruction.
std::vector<BenchmarkMeasure> Measurements;
std::string Error;
std::string Info;
std::vector<uint8_t> AssembledSnippet;
// How to aggregate measurements.
enum ResultAggregationModeE { Min, Max, Mean, MinVariance };
// Read functions.
static Expected<InstructionBenchmark> readYaml(const LLVMState &State,
StringRef Filename);
static Expected<std::vector<InstructionBenchmark>>
readYamls(const LLVMState &State, StringRef Filename);
class Error readYamlFrom(const LLVMState &State, StringRef InputContent);
// Write functions, non-const because of YAML traits.
class Error writeYamlTo(const LLVMState &State, raw_ostream &S);
class Error writeYaml(const LLVMState &State, const StringRef Filename);
};
//------------------------------------------------------------------------------
// Utilities to work with Benchmark measures.
// A class that measures stats over benchmark measures.
class PerInstructionStats {
public:
void push(const BenchmarkMeasure &BM);
double avg() const {
assert(NumValues);
return SumValues / NumValues;
}
double min() const { return MinValue; }
double max() const { return MaxValue; }
const std::string &key() const { return Key; }
private:
std::string Key;
double SumValues = 0.0;
int NumValues = 0;
double MaxValue = std::numeric_limits<double>::min();
double MinValue = std::numeric_limits<double>::max();
};
} // namespace exegesis
} // namespace llvm
#endif // LLVM_TOOLS_LLVM_EXEGESIS_BENCHMARKRESULT_H