[llvm-exegesis] Let Counter return up to 16 entries

LBR contains (up to) 16 entries for the last x branches, and the X86LBRCounter (from D77422) should be able to return all of them. Currently, it just returns the latest entry, which could lead to misleading measurements. This patch also changes the LatencyBenchmarkRunner to accommodate multi-value readings. https://reviews.llvm.org/D81050
2024-11-23 03:02:36 +01:00 · 2020-06-25 11:15:16 -04:00 · 2020-06-25 11:15:16 -04:00 · 052f666a48
commit 052f666a48
parent 61be63afc4
10 changed files with 207 additions and 44 deletions
--- a/tools/llvm-exegesis/lib/BenchmarkResult.h
+++ b/tools/llvm-exegesis/lib/BenchmarkResult.h
@ -74,7 +74,8 @@ struct InstructionBenchmark {
  std::string Error;
  std::string Info;
  std::vector<uint8_t> AssembledSnippet;
-
+  // How to aggregate measurements.
+  enum ResultAggregationModeE { Min, Max, Mean, MinVariance };
  // Read functions.
  static Expected<InstructionBenchmark> readYaml(const LLVMState &State,
                                                 StringRef Filename);
--- a/tools/llvm-exegesis/lib/BenchmarkRunner.cpp
+++ b/tools/llvm-exegesis/lib/BenchmarkRunner.cpp
@ -46,9 +46,29 @@ public:

 private:
  Expected<int64_t> runAndMeasure(const char *Counters) const override {
+    // Single-value entry point: forwards to runAndSample() and reports only
+    // the first sampled value.
+    auto ResultOrError = runAndSample(Counters);
+    if (!ResultOrError)
+      return ResultOrError.takeError();
+    // Indexing [0] on an empty sample would read out of bounds, so report the
+    // missing reading as an error instead.
+    if (ResultOrError.get().empty())
+      return make_error<Failure>("counter returned no values");
+    return ResultOrError.get()[0];
+  }
+
+  // Adds NewValues element-wise into *Result. *Result is first zero-extended
+  // when NewValues holds more entries than it currently does.
+  static void
+  accumulateCounterValues(const llvm::SmallVector<int64_t, 4> &NewValues,
+                          llvm::SmallVector<int64_t, 4> *Result) {
+    llvm::SmallVector<int64_t, 4> &Sums = *Result;
+    if (Sums.size() < NewValues.size())
+      Sums.resize(NewValues.size(), 0);
+    size_t Index = 0;
+    for (const int64_t Value : NewValues)
+      Sums[Index++] += Value;
+  }
+
+  Expected<llvm::SmallVector<int64_t, 4>>
+  runAndSample(const char *Counters) const override {
    // We sum counts when there are several counters for a single ProcRes
    // (e.g. P23 on SandyBridge).
-    int64_t CounterValue = 0;
+    llvm::SmallVector<int64_t, 4> CounterValues;
+    int Reserved = 0;
    SmallVector<StringRef, 2> CounterNames;
    StringRef(Counters).split(CounterNames, '+');
    char *const ScratchPtr = Scratch->ptr();
@ -61,6 +81,17 @@ private:
        return CounterOrError.takeError();

      pfm::Counter *Counter = CounterOrError.get().get();
+      if (Reserved == 0) {
+        Reserved = Counter->numValues();
+        CounterValues.reserve(Reserved);
+      } else if (Reserved != Counter->numValues())
+        // It'd be wrong to accumulate vectors of different sizes.
+        return make_error<Failure>(
+            llvm::Twine("Inconsistent number of values for counter ")
+                .concat(CounterName)
+                .concat(std::to_string(Counter->numValues()))
+                .concat(" vs expected of ")
+                .concat(std::to_string(Reserved)));
      Scratch->clear();
      {
        CrashRecoveryContext CRC;
@ -75,9 +106,13 @@ private:
        if (Crashed)
          return make_error<SnippetCrash>("snippet crashed while running");
      }
-      CounterValue += Counter->read();
+      auto ValueOrError = Counter->readOrError();
+      if (!ValueOrError)
+        return ValueOrError.takeError();
+
+      accumulateCounterValues(ValueOrError.get(), &CounterValues);
    }
-    return CounterValue;
+    return CounterValues;
  }

  const LLVMState &State;
--- a/tools/llvm-exegesis/lib/BenchmarkRunner.h
+++ b/tools/llvm-exegesis/lib/BenchmarkRunner.h
@ -21,6 +21,7 @@
 #include "LlvmState.h"
 #include "MCInstrDescView.h"
 #include "SnippetRepetitor.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/Support/Error.h"
 #include <cstdlib>
@ -65,7 +66,11 @@ public:
  class FunctionExecutor {
  public:
    virtual ~FunctionExecutor();
+    // FIXME deprecate this.
    virtual Expected<int64_t> runAndMeasure(const char *Counters) const = 0;
+
+    virtual Expected<llvm::SmallVector<int64_t, 4>>
+    runAndSample(const char *Counters) const = 0;
  };

 protected:
--- a/tools/llvm-exegesis/lib/LatencyBenchmarkRunner.cpp
+++ b/tools/llvm-exegesis/lib/LatencyBenchmarkRunner.cpp
@ -8,48 +8,135 @@

 #include "LatencyBenchmarkRunner.h"

-#include "Target.h"
 #include "BenchmarkRunner.h"
+#include "Target.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Support/Error.h"
+#include <algorithm>
+#include <cmath>
+#include <limits>
+#include <numeric>

 namespace llvm {
 namespace exegesis {

-LatencyBenchmarkRunner::LatencyBenchmarkRunner(const LLVMState &State,
-                                               InstructionBenchmark::ModeE Mode)
+LatencyBenchmarkRunner::LatencyBenchmarkRunner(
+    const LLVMState &State, InstructionBenchmark::ModeE Mode,
+    InstructionBenchmark::ResultAggregationModeE ResultAgg)
    : BenchmarkRunner(State, Mode) {
  assert((Mode == InstructionBenchmark::Latency ||
          Mode == InstructionBenchmark::InverseThroughput) &&
         "invalid mode");
+  ResultAggMode = ResultAgg;
 }

 LatencyBenchmarkRunner::~LatencyBenchmarkRunner() = default;

+// Returns the population variance of Values (the mean of the squared
+// deviations from the mean), or 0.0 when Values is empty.
+static double computeVariance(const llvm::SmallVector<int64_t, 4> &Values) {
+  const size_t Count = Values.size();
+  if (Count == 0)
+    return 0.0;
+
+  const double Total = std::accumulate(Values.begin(), Values.end(), 0.0);
+  const double Average = Total / Count;
+  double SquaredDeviations = 0;
+  for (const int64_t Value : Values) {
+    const double Offset = Value - Average;
+    SquaredDeviations += Offset * Offset;
+  }
+  return SquaredDeviations / Count;
+}
+
+// Returns the smallest element of Values, or 0 when Values is empty.
+static int64_t findMin(const llvm::SmallVector<int64_t, 4> &Values) {
+  const auto Smallest = std::min_element(Values.begin(), Values.end());
+  return Smallest == Values.end() ? 0 : *Smallest;
+}
+
+// Returns the largest element of Values, or 0 when Values is empty.
+static int64_t findMax(const llvm::SmallVector<int64_t, 4> &Values) {
+  const auto Largest = std::max_element(Values.begin(), Values.end());
+  return Largest == Values.end() ? 0 : *Largest;
+}
+
+// Returns the arithmetic mean of Values converted to int64_t (the fractional
+// part is dropped), or 0 when Values is empty. The sum is built up as a double.
+static int64_t findMean(const llvm::SmallVector<int64_t, 4> &Values) {
+  const size_t Count = Values.size();
+  if (Count == 0)
+    return 0;
+  const double Total = std::accumulate(Values.begin(), Values.end(), 0.0);
+  return Total / static_cast<double>(Count);
+}
+
 Expected<std::vector<BenchmarkMeasure>> LatencyBenchmarkRunner::runMeasurements(
    const FunctionExecutor &Executor) const {
  // Cycle measurements include some overhead from the kernel. Repeat the
-  // measure several times and take the minimum value.
+  // measure several times and return the aggregated value, as specified by
+  // ResultAggMode.
  constexpr const int NumMeasurements = 30;
-  int64_t MinValue = std::numeric_limits<int64_t>::max();
+  llvm::SmallVector<int64_t, 4> AccumulatedValues;
+  double MinVariance = std::numeric_limits<double>::infinity();
  const char *CounterName = State.getPfmCounters().CycleCounter;
+  // Values count for each run.
+  int ValuesCount = 0;
  for (size_t I = 0; I < NumMeasurements; ++I) {
-    auto ExpectedCounterValue = Executor.runAndMeasure(CounterName);
-    if (!ExpectedCounterValue)
-      return ExpectedCounterValue.takeError();
-    if (*ExpectedCounterValue < MinValue)
-      MinValue = *ExpectedCounterValue;
+    auto ExpectedCounterValues = Executor.runAndSample(CounterName);
+    if (!ExpectedCounterValues)
+      return ExpectedCounterValues.takeError();
+    ValuesCount = ExpectedCounterValues.get().size();
+    // Single-value samples are collected one per run; for any other sample
+    // size only one whole sample (the lowest-variance one) is retained.
+    if (ValuesCount == 1)
+      AccumulatedValues.push_back(ExpectedCounterValues.get()[0]);
+    else {
+      // We'll keep the reading with lowest variance (ie., most stable)
+      double Variance = computeVariance(*ExpectedCounterValues);
+      if (MinVariance > Variance) {
+        AccumulatedValues = std::move(ExpectedCounterValues.get());
+        MinVariance = Variance;
+      }
+    }
  }
-  std::vector<BenchmarkMeasure> Result;
+
+  std::string ModeName;
  switch (Mode) {
  case InstructionBenchmark::Latency:
-    Result = {BenchmarkMeasure::Create("latency", MinValue)};
+    ModeName = "latency";
    break;
  case InstructionBenchmark::InverseThroughput:
-    Result = {BenchmarkMeasure::Create("inverse_throughput", MinValue)};
+    ModeName = "inverse_throughput";
    break;
  default:
    break;
  }
-  return std::move(Result);
+
+  switch (ResultAggMode) {
+  case InstructionBenchmark::MinVariance: {
+    if (ValuesCount == 1)
+      llvm::errs() << "Each sample only has one value. result-aggregation-mode "
+                      "of min-variance is probably non-sensical\n";
+    std::vector<BenchmarkMeasure> Result;
+    Result.reserve(AccumulatedValues.size());
+    for (const int64_t Value : AccumulatedValues)
+      Result.push_back(BenchmarkMeasure::Create(ModeName, Value));
+    return std::move(Result);
+  }
+  case InstructionBenchmark::Min: {
+    std::vector<BenchmarkMeasure> Result;
+    Result.push_back(
+        BenchmarkMeasure::Create(ModeName, findMin(AccumulatedValues)));
+    return std::move(Result);
+  }
+  case InstructionBenchmark::Max: {
+    std::vector<BenchmarkMeasure> Result;
+    Result.push_back(
+        BenchmarkMeasure::Create(ModeName, findMax(AccumulatedValues)));
+    return std::move(Result);
+  }
+  case InstructionBenchmark::Mean: {
+    std::vector<BenchmarkMeasure> Result;
+    Result.push_back(
+        BenchmarkMeasure::Create(ModeName, findMean(AccumulatedValues)));
+    return std::move(Result);
+  }
+  }
+  // Only reached when ResultAggMode matches none of the cases above.
+  return llvm::make_error<Failure>(
+      llvm::Twine("Unexpected benchmark mode (")
+          .concat(std::to_string(Mode))
+          .concat(") or unexpected ResultAggMode (")
+          .concat(std::to_string(ResultAggMode))
+          .concat(")"));
 }

 } // namespace exegesis
--- a/tools/llvm-exegesis/lib/LatencyBenchmarkRunner.h
+++ b/tools/llvm-exegesis/lib/LatencyBenchmarkRunner.h
@ -21,13 +21,16 @@ namespace exegesis {

 class LatencyBenchmarkRunner : public BenchmarkRunner {
 public:
-  LatencyBenchmarkRunner(const LLVMState &State,
-                         InstructionBenchmark::ModeE Mode);
+  LatencyBenchmarkRunner(
+      const LLVMState &State, InstructionBenchmark::ModeE Mode,
+      InstructionBenchmark::ResultAggregationModeE ResultAggMode);
  ~LatencyBenchmarkRunner() override;

 private:
  Expected<std::vector<BenchmarkMeasure>>
  runMeasurements(const FunctionExecutor &Executor) const override;
+
+  InstructionBenchmark::ResultAggregationModeE ResultAggMode;
 };
 } // namespace exegesis
 } // namespace llvm
--- a/tools/llvm-exegesis/lib/PerfHelper.cpp
+++ b/tools/llvm-exegesis/lib/PerfHelper.cpp
@ -119,23 +119,27 @@ void Counter::stop() { ioctl(FileDescriptor, PERF_EVENT_IOC_DISABLE, 0); }

 int64_t Counter::read() const {
  auto ValueOrError = readOrError();
-  if (ValueOrError)
-    return ValueOrError.get();
-
-  errs() << ValueOrError.takeError() << "\n";
+  if (ValueOrError) {
+    if (!ValueOrError.get().empty())
+      return ValueOrError.get()[0];
+    errs() << "Counter has no reading\n";
+  } else
+    errs() << ValueOrError.takeError() << "\n";
  return -1;
 }

-llvm::Expected<int64_t> Counter::readOrError() const {
+llvm::Expected<llvm::SmallVector<int64_t, 4>> Counter::readOrError() const {
  int64_t Count = 0;
  ssize_t ReadSize = ::read(FileDescriptor, &Count, sizeof(Count));
  if (ReadSize != sizeof(Count))
    return llvm::make_error<llvm::StringError>("Failed to read event counter",
                                               llvm::errc::io_error);
-
-  return Count;
+  llvm::SmallVector<int64_t, 4> Result;
+  Result.push_back(Count);
+  return Result;
 }

+int Counter::numValues() const { return 1; }
 #else

 Counter::Counter(PerfEvent &&Event) : Event(std::move(Event)) {}
@ -148,11 +152,13 @@ void Counter::stop() {}

 int64_t Counter::read() const { return 42; }

-llvm::Expected<int64_t> Counter::readOrError() const {
+llvm::Expected<llvm::SmallVector<int64_t, 4>> Counter::readOrError() const {
  return llvm::make_error<llvm::StringError>("Not implemented",
                                             llvm::errc::io_error);
 }

+int Counter::numValues() const { return 1; }
+
 #endif

 } // namespace pfm
--- a/tools/llvm-exegesis/lib/PerfHelper.h
+++ b/tools/llvm-exegesis/lib/PerfHelper.h
@ -15,9 +15,11 @@
 #define LLVM_TOOLS_LLVM_EXEGESIS_PERFHELPER_H

 #include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Config/config.h"
 #include "llvm/Support/Error.h"
+
 #include <cstdint>
 #include <functional>
 #include <memory>
@ -85,7 +87,9 @@ public:
  int64_t read() const;

  /// Returns the current value of the counter or error if it cannot be read.
-  virtual llvm::Expected<int64_t> readOrError() const;
+  virtual llvm::Expected<llvm::SmallVector<int64_t, 4>> readOrError() const;
+
+  virtual int numValues() const;

 private:
  PerfEvent Event;
--- a/tools/llvm-exegesis/lib/Target.cpp
+++ b/tools/llvm-exegesis/lib/Target.cpp
@ -68,8 +68,9 @@ std::unique_ptr<SnippetGenerator> ExegesisTarget::createSnippetGenerator(
 }

 Expected<std::unique_ptr<BenchmarkRunner>>
-ExegesisTarget::createBenchmarkRunner(InstructionBenchmark::ModeE Mode,
-                                      const LLVMState &State) const {
+ExegesisTarget::createBenchmarkRunner(
+    InstructionBenchmark::ModeE Mode, const LLVMState &State,
+    InstructionBenchmark::ResultAggregationModeE ResultAggMode) const {
  PfmCountersInfo PfmCounters = State.getPfmCounters();
  switch (Mode) {
  case InstructionBenchmark::Unknown:
@ -85,12 +86,12 @@ ExegesisTarget::createBenchmarkRunner(InstructionBenchmark::ModeE Mode,
              .concat(ModeName)
              .concat("' mode, sched model does not define a cycle counter."));
    }
-    return createLatencyBenchmarkRunner(State, Mode);
+    return createLatencyBenchmarkRunner(State, Mode, ResultAggMode);
  case InstructionBenchmark::Uops:
    if (!PfmCounters.UopsCounter && !PfmCounters.IssueCounters)
      return make_error<Failure>("can't run 'uops' mode, sched model does not "
                                 "define uops or issue counters.");
-    return createUopsBenchmarkRunner(State);
+    return createUopsBenchmarkRunner(State, ResultAggMode);
  }
  return nullptr;
 }
@ -106,12 +107,14 @@ std::unique_ptr<SnippetGenerator> ExegesisTarget::createParallelSnippetGenerator
 }

 std::unique_ptr<BenchmarkRunner> ExegesisTarget::createLatencyBenchmarkRunner(
-    const LLVMState &State, InstructionBenchmark::ModeE Mode) const {
-  return std::make_unique<LatencyBenchmarkRunner>(State, Mode);
+    const LLVMState &State, InstructionBenchmark::ModeE Mode,
+    InstructionBenchmark::ResultAggregationModeE ResultAggMode) const {
+  return std::make_unique<LatencyBenchmarkRunner>(State, Mode, ResultAggMode);
 }

-std::unique_ptr<BenchmarkRunner>
-ExegesisTarget::createUopsBenchmarkRunner(const LLVMState &State) const {
+std::unique_ptr<BenchmarkRunner> ExegesisTarget::createUopsBenchmarkRunner(
+    const LLVMState &State,
+    InstructionBenchmark::ResultAggregationModeE /*unused*/) const {
  return std::make_unique<UopsBenchmarkRunner>(State);
 }

--- a/tools/llvm-exegesis/lib/Target.h
+++ b/tools/llvm-exegesis/lib/Target.h
@ -148,9 +148,10 @@ public:
                         const LLVMState &State,
                         const SnippetGenerator::Options &Opts) const;
  // Creates a benchmark runner for the given mode.
-  Expected<std::unique_ptr<BenchmarkRunner>>
-  createBenchmarkRunner(InstructionBenchmark::ModeE Mode,
-                        const LLVMState &State) const;
+  Expected<std::unique_ptr<BenchmarkRunner>> createBenchmarkRunner(
+      InstructionBenchmark::ModeE Mode, const LLVMState &State,
+      InstructionBenchmark::ResultAggregationModeE ResultAggMode =
+          InstructionBenchmark::Min) const;

  // Returns the ExegesisTarget for the given triple or nullptr if the target
  // does not exist.
@ -176,9 +177,11 @@ private:
  std::unique_ptr<SnippetGenerator> virtual createParallelSnippetGenerator(
      const LLVMState &State, const SnippetGenerator::Options &Opts) const;
  std::unique_ptr<BenchmarkRunner> virtual createLatencyBenchmarkRunner(
-      const LLVMState &State, InstructionBenchmark::ModeE Mode) const;
+      const LLVMState &State, InstructionBenchmark::ModeE Mode,
+      InstructionBenchmark::ResultAggregationModeE ResultAggMode) const;
  std::unique_ptr<BenchmarkRunner> virtual createUopsBenchmarkRunner(
-      const LLVMState &State) const;
+      const LLVMState &State,
+      InstructionBenchmark::ResultAggregationModeE ResultAggMode) const;

  const ExegesisTarget *Next = nullptr;
  const ArrayRef<CpuAndPfmCounters> CpuPfmCounters;
--- a/tools/llvm-exegesis/llvm-exegesis.cpp
+++ b/tools/llvm-exegesis/llvm-exegesis.cpp
@ -83,6 +83,21 @@ static cl::opt<exegesis::InstructionBenchmark::ModeE> BenchmarkMode(
               clEnumValN(exegesis::InstructionBenchmark::Unknown, "analysis",
                          "Analysis")));

+// Command-line selection of the aggregation mode handed to the benchmark
+// runner; defaults to Min. Listed with the other benchmark-only options.
+static cl::opt<exegesis::InstructionBenchmark::ResultAggregationModeE>
+    ResultAggMode(
+        "result-aggregation-mode",
+        cl::desc("How to aggregate multi-values result"),
+        cl::cat(BenchmarkOptions),
+        cl::values(clEnumValN(exegesis::InstructionBenchmark::Min, "min",
+                              "Keep min reading"),
+                   clEnumValN(exegesis::InstructionBenchmark::Max, "max",
+                              "Keep max reading"),
+                   clEnumValN(exegesis::InstructionBenchmark::Mean, "mean",
+                              "Compute mean of all readings"),
+                   clEnumValN(exegesis::InstructionBenchmark::MinVariance,
+                              "min-variance",
+                              "Keep readings set with min-variance")),
+        cl::init(exegesis::InstructionBenchmark::Min));
+
 static cl::opt<exegesis::InstructionBenchmark::RepetitionModeE> RepetitionMode(
    "repetition-mode", cl::desc("how to repeat the instruction snippet"),
    cl::cat(BenchmarkOptions),
@ -281,8 +296,9 @@ void benchmarkMain() {

  const LLVMState State(CpuName);

-  const std::unique_ptr<BenchmarkRunner> Runner = ExitOnErr(
-      State.getExegesisTarget().createBenchmarkRunner(BenchmarkMode, State));
+  const std::unique_ptr<BenchmarkRunner> Runner =
+      ExitOnErr(State.getExegesisTarget().createBenchmarkRunner(
+          BenchmarkMode, State, ResultAggMode));
  if (!Runner) {
    ExitWithError("cannot create benchmark runner");
  }