mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-10-19 19:12:56 +02:00
8691a62a19
Summary: This eps param is used for two distinct things: * initial point clusterization * checking clusters against the llvm values What if one wants to only look at highly different clusters, without changing the clustering itself? In particular, this helps to weed out noisy measurements (since the clusterization epsilon is still small, so there is a better chance that noisy measurements from the same opcode will go into different clusters) By splitting it into two params it is now possible. This is nearly-free performance-wise: Old: ``` $ perf stat -r 25 ./bin/llvm-exegesis -mode=analysis -benchmarks-file=/home/lebedevri/PileDriver-Sched/benchmarks-latency-1.yaml -analysis-inconsistencies-output-file=/tmp/clusters-old.html no exegesis target for x86_64-unknown-linux-gnu, using default Parsed 10099 benchmark points Printing sched class consistency analysis results to file '/tmp/clusters-old.html' ... Performance counter stats for './bin/llvm-exegesis -mode=analysis -benchmarks-file=/home/lebedevri/PileDriver-Sched/benchmarks-latency-1.yaml -analysis-inconsistencies-output-file=/tmp/clusters-old.html' (25 runs): 390.01 msec task-clock # 0.998 CPUs utilized ( +- 0.25% ) 12 context-switches # 31.735 M/sec ( +- 27.38% ) 0 cpu-migrations # 0.000 K/sec 4745 page-faults # 12183.732 M/sec ( +- 0.54% ) 1562711900 cycles # 4012303.327 GHz ( +- 0.24% ) (82.90%) 185567822 stalled-cycles-frontend # 11.87% frontend cycles idle ( +- 0.52% ) (83.30%) 392106234 stalled-cycles-backend # 25.09% backend cycles idle ( +- 1.31% ) (33.79%) 1839236666 instructions # 1.18 insn per cycle # 0.21 stalled cycles per insn ( +- 0.15% ) (50.37%) 407035764 branches # 1045074878.710 M/sec ( +- 0.12% ) (66.80%) 10896459 branch-misses # 2.68% of all branches ( +- 0.17% ) (83.20%) 0.390629 +- 0.000972 seconds time elapsed ( +- 0.25% ) ``` ``` $ perf stat -r 9 ./bin/llvm-exegesis -mode=analysis -benchmarks-file=/home/lebedevri/PileDriver-Sched/benchmarks-latency.yml 
-analysis-inconsistencies-output-file=/tmp/clusters-old.html no exegesis target for x86_64-unknown-linux-gnu, using default Parsed 50572 benchmark points Printing sched class consistency analysis results to file '/tmp/clusters-old.html' ... Performance counter stats for './bin/llvm-exegesis -mode=analysis -benchmarks-file=/home/lebedevri/PileDriver-Sched/benchmarks-latency.yml -analysis-inconsistencies-output-file=/tmp/clusters-old.html' (9 runs): 6803.36 msec task-clock # 0.999 CPUs utilized ( +- 0.96% ) 262 context-switches # 38.546 M/sec ( +- 23.06% ) 0 cpu-migrations # 0.065 M/sec ( +- 76.03% ) 13287 page-faults # 1953.206 M/sec ( +- 0.32% ) 27252537904 cycles # 4006024.257 GHz ( +- 0.95% ) (83.31%) 1496314935 stalled-cycles-frontend # 5.49% frontend cycles idle ( +- 0.97% ) (83.32%) 16128404524 stalled-cycles-backend # 59.18% backend cycles idle ( +- 0.30% ) (33.37%) 17611143370 instructions # 0.65 insn per cycle # 0.92 stalled cycles per insn ( +- 0.05% ) (50.04%) 3894906599 branches # 572537147.437 M/sec ( +- 0.03% ) (66.69%) 116314514 branch-misses # 2.99% of all branches ( +- 0.20% ) (83.35%) 6.8118 +- 0.0689 seconds time elapsed ( +- 1.01%) ``` New: ``` $ perf stat -r 25 ./bin/llvm-exegesis -mode=analysis -benchmarks-file=/home/lebedevri/PileDriver-Sched/benchmarks-latency-1.yaml -analysis-inconsistencies-output-file=/tmp/clusters-new.html no exegesis target for x86_64-unknown-linux-gnu, using default Parsed 10099 benchmark points Printing sched class consistency analysis results to file '/tmp/clusters-new.html' ... 
Performance counter stats for './bin/llvm-exegesis -mode=analysis -benchmarks-file=/home/lebedevri/PileDriver-Sched/benchmarks-latency-1.yaml -analysis-inconsistencies-output-file=/tmp/clusters-new.html' (25 runs): 400.14 msec task-clock # 0.998 CPUs utilized ( +- 0.66% ) 12 context-switches # 29.429 M/sec ( +- 25.95% ) 0 cpu-migrations # 0.100 M/sec ( +-100.00% ) 4714 page-faults # 11796.496 M/sec ( +- 0.55% ) 1603131306 cycles # 4011840.105 GHz ( +- 0.66% ) (82.85%) 199538509 stalled-cycles-frontend # 12.45% frontend cycles idle ( +- 2.40% ) (83.10%) 402249109 stalled-cycles-backend # 25.09% backend cycles idle ( +- 1.19% ) (34.05%) 1847783963 instructions # 1.15 insn per cycle # 0.22 stalled cycles per insn ( +- 0.18% ) (50.64%) 407162722 branches # 1018925730.631 M/sec ( +- 0.12% ) (67.02%) 10932779 branch-misses # 2.69% of all branches ( +- 0.51% ) (83.28%) 0.40077 +- 0.00267 seconds time elapsed ( +- 0.67% ) lebedevri@pini-pini:/build/llvm-build-Clang-release$ perf stat -r 9 ./bin/llvm-exegesis -mode=analysis -benchmarks-file=/home/lebedevri/PileDriver-Sched/benchmarks-latency.yml -analysis-inconsistencies-output-file=/tmp/clusters-new.html no exegesis target for x86_64-unknown-linux-gnu, using default Parsed 50572 benchmark points Printing sched class consistency analysis results to file '/tmp/clusters-new.html' ... 
Performance counter stats for './bin/llvm-exegesis -mode=analysis -benchmarks-file=/home/lebedevri/PileDriver-Sched/benchmarks-latency.yml -analysis-inconsistencies-output-file=/tmp/clusters-new.html' (9 runs): 6947.79 msec task-clock # 1.000 CPUs utilized ( +- 0.90% ) 217 context-switches # 31.236 M/sec ( +- 36.16% ) 1 cpu-migrations # 0.096 M/sec ( +- 50.00% ) 13258 page-faults # 1908.389 M/sec ( +- 0.34% ) 27830796523 cycles # 4006032.286 GHz ( +- 0.89% ) (83.30%) 1504554006 stalled-cycles-frontend # 5.41% frontend cycles idle ( +- 2.10% ) (83.32%) 16716574843 stalled-cycles-backend # 60.07% backend cycles idle ( +- 0.65% ) (33.38%) 17755545931 instructions # 0.64 insn per cycle # 0.94 stalled cycles per insn ( +- 0.09% ) (50.04%) 3897255686 branches # 560980426.597 M/sec ( +- 0.06% ) (66.70%) 117045395 branch-misses # 3.00% of all branches ( +- 0.47% ) (83.34%) 6.9507 +- 0.0627 seconds time elapsed ( +- 0.90% ) ``` I.e. it's +2.6% slowdown for one whole sweep, or +2% for 5 whole sweeps. Within noise i'd say. Should help with [[ https://bugs.llvm.org/show_bug.cgi?id=40787 | PR40787 ]]. Reviewers: courbet, gchatelet Reviewed By: courbet Subscribers: tschuett, RKSimon, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D58476 llvm-svn: 354767
147 lines
5.2 KiB
C++
147 lines
5.2 KiB
C++
//===-- Analysis.h ----------------------------------------------*- C++ -*-===//
|
|
//
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
///
|
|
/// \file
|
|
/// Analysis output for benchmark results.
|
|
///
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
#ifndef LLVM_TOOLS_LLVM_EXEGESIS_ANALYSIS_H
|
|
#define LLVM_TOOLS_LLVM_EXEGESIS_ANALYSIS_H
|
|
|
|
#include "Clustering.h"
|
|
#include "llvm/MC/MCContext.h"
|
|
#include "llvm/MC/MCDisassembler/MCDisassembler.h"
|
|
#include "llvm/MC/MCInstPrinter.h"
|
|
#include "llvm/MC/MCInstrInfo.h"
|
|
#include "llvm/MC/MCObjectFileInfo.h"
|
|
#include "llvm/MC/MCSubtargetInfo.h"
|
|
#include "llvm/Support/Error.h"
|
|
#include "llvm/Support/TargetRegistry.h"
|
|
#include "llvm/Support/raw_ostream.h"
|
|
#include <memory>
|
|
#include <set>
|
|
#include <string>
|
|
#include <unordered_map>
|
|
|
|
namespace llvm {
|
|
namespace exegesis {
|
|
|
|
// A helper class to analyze benchmark results for a target.
|
|
class Analysis {
|
|
public:
|
|
Analysis(const llvm::Target &Target,
|
|
std::unique_ptr<llvm::MCInstrInfo> InstrInfo,
|
|
const InstructionBenchmarkClustering &Clustering,
|
|
double AnalysisInconsistencyEpsilon,
|
|
bool AnalysisDisplayUnstableOpcodes);
|
|
|
|
// Prints a csv of instructions for each cluster.
|
|
struct PrintClusters {};
|
|
// Find potential errors in the scheduling information given measurements.
|
|
struct PrintSchedClassInconsistencies {};
|
|
|
|
template <typename Pass> llvm::Error run(llvm::raw_ostream &OS) const;
|
|
|
|
private:
|
|
using ClusterId = InstructionBenchmarkClustering::ClusterId;
|
|
|
|
// An llvm::MCSchedClassDesc augmented with some additional data.
|
|
struct ResolvedSchedClass {
|
|
ResolvedSchedClass(const llvm::MCSubtargetInfo &STI,
|
|
unsigned ResolvedSchedClassId, bool WasVariant);
|
|
|
|
const unsigned SchedClassId;
|
|
const llvm::MCSchedClassDesc *const SCDesc;
|
|
const bool WasVariant; // Whether the original class was variant.
|
|
const llvm::SmallVector<llvm::MCWriteProcResEntry, 8>
|
|
NonRedundantWriteProcRes;
|
|
const std::vector<std::pair<uint16_t, float>> IdealizedProcResPressure;
|
|
};
|
|
|
|
// Represents the intersection of a sched class and a cluster.
|
|
class SchedClassCluster {
|
|
public:
|
|
const InstructionBenchmarkClustering::ClusterId &id() const {
|
|
return ClusterId;
|
|
}
|
|
|
|
const std::vector<size_t> &getPointIds() const { return PointIds; }
|
|
|
|
// Return the cluster centroid.
|
|
const std::vector<PerInstructionStats> &getRepresentative() const {
|
|
return Representative;
|
|
}
|
|
|
|
// Returns true if the cluster representative measurements match that of SC.
|
|
bool
|
|
measurementsMatch(const llvm::MCSubtargetInfo &STI,
|
|
const ResolvedSchedClass &SC,
|
|
const InstructionBenchmarkClustering &Clustering,
|
|
const double AnalysisInconsistencyEpsilonSquared_) const;
|
|
|
|
void addPoint(size_t PointId,
|
|
const InstructionBenchmarkClustering &Clustering);
|
|
|
|
private:
|
|
InstructionBenchmarkClustering::ClusterId ClusterId;
|
|
std::vector<size_t> PointIds;
|
|
// Measurement stats for the points in the SchedClassCluster.
|
|
std::vector<PerInstructionStats> Representative;
|
|
};
|
|
|
|
void printInstructionRowCsv(size_t PointId, llvm::raw_ostream &OS) const;
|
|
|
|
void
|
|
printSchedClassClustersHtml(const std::vector<SchedClassCluster> &Clusters,
|
|
const ResolvedSchedClass &SC,
|
|
llvm::raw_ostream &OS) const;
|
|
void printSchedClassDescHtml(const ResolvedSchedClass &SC,
|
|
llvm::raw_ostream &OS) const;
|
|
|
|
// A pair of (Sched Class, indices of points that belong to the sched
|
|
// class).
|
|
struct ResolvedSchedClassAndPoints {
|
|
explicit ResolvedSchedClassAndPoints(ResolvedSchedClass &&RSC);
|
|
|
|
ResolvedSchedClass RSC;
|
|
std::vector<size_t> PointIds;
|
|
};
|
|
|
|
// Builds a list of ResolvedSchedClassAndPoints.
|
|
std::vector<ResolvedSchedClassAndPoints> makePointsPerSchedClass() const;
|
|
|
|
template <typename EscapeTag, EscapeTag Tag>
|
|
void writeSnippet(llvm::raw_ostream &OS, llvm::ArrayRef<uint8_t> Bytes,
|
|
const char *Separator) const;
|
|
|
|
const InstructionBenchmarkClustering &Clustering_;
|
|
llvm::MCObjectFileInfo ObjectFileInfo_;
|
|
std::unique_ptr<llvm::MCContext> Context_;
|
|
std::unique_ptr<llvm::MCSubtargetInfo> SubtargetInfo_;
|
|
std::unique_ptr<llvm::MCInstrInfo> InstrInfo_;
|
|
std::unique_ptr<llvm::MCRegisterInfo> RegInfo_;
|
|
std::unique_ptr<llvm::MCAsmInfo> AsmInfo_;
|
|
std::unique_ptr<llvm::MCInstPrinter> InstPrinter_;
|
|
std::unique_ptr<llvm::MCDisassembler> Disasm_;
|
|
const double AnalysisInconsistencyEpsilonSquared_;
|
|
const bool AnalysisDisplayUnstableOpcodes_;
|
|
};
|
|
|
|
// Computes the idealized ProcRes Unit pressure. This is the expected
|
|
// distribution if the CPU scheduler can distribute the load as evenly as
|
|
// possible.
|
|
std::vector<std::pair<uint16_t, float>> computeIdealizedProcResPressure(
|
|
const llvm::MCSchedModel &SM,
|
|
llvm::SmallVector<llvm::MCWriteProcResEntry, 8> WPRS);
|
|
|
|
} // namespace exegesis
|
|
} // namespace llvm
|
|
|
|
#endif // LLVM_TOOLS_LLVM_EXEGESIS_ANALYSIS_H
|