mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-11-26 04:32:44 +01:00
5d534d8259
I really needed this, like, factually, yesterday, when verifying dependency breaking idioms for AMD Zen 3 scheduler model. Consider the following example: ``` $ ./bin/llvm-exegesis --mode=inverse_throughput --snippets-file=/tmp/snippet.s --num-repetitions=1000000 --repetition-mode=duplicate Check generated assembly with: /usr/bin/objdump -d /tmp/snippet-4a7e50.o --- mode: inverse_throughput key: instructions: - 'VPXORYrr YMM0 YMM0 YMM0' config: '' register_initial_values: [] cpu_name: znver3 llvm_triple: x86_64-unknown-linux-gnu num_repetitions: 1000000 measurements: - { key: inverse_throughput, value: 0.31025, per_snippet_value: 0.31025 } error: '' info: '' assembled_snippet: C5FDEFC0C5FDEFC0C5FDEFC0C5FDEFC0C5FDEFC0C5FDEFC0C5FDEFC0C5FDEFC0C5FDEFC0C5FDEFC0C5FDEFC0C5FDEFC0C5FDEFC0C5FDEFC0C5FDEFC0C5FDEFC0C3 ... ``` What does it tell us? So wait, it can only execute ~3 x86 AVX YMM PXOR zero-idioms per cycle? That doesn't seem right. That's even less than there are pipes supporting this type of op. Now, second example: ``` $ ./bin/llvm-exegesis --mode=inverse_throughput --snippets-file=/tmp/snippet.s --num-repetitions=1000000 --repetition-mode=loop Check generated assembly with: /usr/bin/objdump -d /tmp/snippet-2418b5.o --- mode: inverse_throughput key: instructions: - 'VPXORYrr YMM0 YMM0 YMM0' config: '' register_initial_values: [] cpu_name: znver3 llvm_triple: x86_64-unknown-linux-gnu num_repetitions: 1000000 measurements: - { key: inverse_throughput, value: 1.00011, per_snippet_value: 1.00011 } error: '' info: '' assembled_snippet: 49B80800000000000000C5FDEFC0C5FDEFC04983C0FF75F2C3 ... ``` Now that's just worse. Due to the looping, the throughput completely plummeted, and now we can only do a single instruction/cycle!? That's not great. 
And the final example: ``` $ ./bin/llvm-exegesis --mode=inverse_throughput --snippets-file=/tmp/snippet.s --num-repetitions=1000000 --repetition-mode=loop --loop-body-size=1000 Check generated assembly with: /usr/bin/objdump -d /tmp/snippet-c402e2.o --- mode: inverse_throughput key: instructions: - 'VPXORYrr YMM0 YMM0 YMM0' config: '' register_initial_values: [] cpu_name: znver3 llvm_triple: x86_64-unknown-linux-gnu num_repetitions: 1000000 measurements: - { key: inverse_throughput, value: 0.167087, per_snippet_value: 0.167087 } error: '' info: '' assembled_snippet: 49B80800000000000000C5FDEFC0C5FDEFC04983C0FF75F2C3 ... ``` So if we merge the previous two approaches, i.e. duplicate this single-instruction snippet 1000x (loop-body-size / instruction count in snippet) and run a loop with 1000 iterations over that duplicated/unrolled snippet, the measured throughput goes through the roof, up to 5.9 instructions/cycle, which finally tells us that this idiom is zero-cycle! Reviewed By: courbet Differential Revision: https://reviews.llvm.org/D102522
134 lines
4.6 KiB
C++
134 lines
4.6 KiB
C++
//===-- SnippetRepetitor.cpp ------------------------------------*- C++ -*-===//
|
|
//
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
#include <array>
|
|
#include <string>
|
|
|
|
#include "SnippetRepetitor.h"
|
|
#include "Target.h"
|
|
#include "llvm/ADT/Sequence.h"
|
|
#include "llvm/CodeGen/TargetInstrInfo.h"
|
|
#include "llvm/CodeGen/TargetSubtargetInfo.h"
|
|
|
|
namespace llvm {
|
|
namespace exegesis {
|
|
namespace {
|
|
|
|
class DuplicateSnippetRepetitor : public SnippetRepetitor {
|
|
public:
|
|
using SnippetRepetitor::SnippetRepetitor;
|
|
|
|
// Repeats the snippet until there are at least MinInstructions in the
|
|
// resulting code.
|
|
FillFunction Repeat(ArrayRef<MCInst> Instructions, unsigned MinInstructions,
|
|
unsigned LoopBodySize) const override {
|
|
return [Instructions, MinInstructions](FunctionFiller &Filler) {
|
|
auto Entry = Filler.getEntry();
|
|
if (!Instructions.empty()) {
|
|
// Add the whole snippet at least once.
|
|
Entry.addInstructions(Instructions);
|
|
for (unsigned I = Instructions.size(); I < MinInstructions; ++I) {
|
|
Entry.addInstruction(Instructions[I % Instructions.size()]);
|
|
}
|
|
}
|
|
Entry.addReturn();
|
|
};
|
|
}
|
|
|
|
BitVector getReservedRegs() const override {
|
|
// We're using no additional registers.
|
|
return State.getRATC().emptyRegisters();
|
|
}
|
|
};
|
|
|
|
class LoopSnippetRepetitor : public SnippetRepetitor {
|
|
public:
|
|
explicit LoopSnippetRepetitor(const LLVMState &State)
|
|
: SnippetRepetitor(State),
|
|
LoopCounter(State.getExegesisTarget().getLoopCounterRegister(
|
|
State.getTargetMachine().getTargetTriple())) {}
|
|
|
|
// Loop over the snippet ceil(MinInstructions / Instructions.Size()) times.
|
|
FillFunction Repeat(ArrayRef<MCInst> Instructions, unsigned MinInstructions,
|
|
unsigned LoopBodySize) const override {
|
|
return [this, Instructions, MinInstructions,
|
|
LoopBodySize](FunctionFiller &Filler) {
|
|
const auto &ET = State.getExegesisTarget();
|
|
auto Entry = Filler.getEntry();
|
|
auto Loop = Filler.addBasicBlock();
|
|
auto Exit = Filler.addBasicBlock();
|
|
|
|
const unsigned LoopUnrollFactor =
|
|
LoopBodySize <= Instructions.size()
|
|
? 1
|
|
: divideCeil(LoopBodySize, Instructions.size());
|
|
assert(LoopUnrollFactor >= 1 && "Should end up with at least 1 snippet.");
|
|
|
|
// Set loop counter to the right value:
|
|
const APInt LoopCount(
|
|
32,
|
|
divideCeil(MinInstructions, LoopUnrollFactor * Instructions.size()));
|
|
assert(LoopCount.uge(1) && "Trip count should be at least 1.");
|
|
for (const MCInst &Inst :
|
|
ET.setRegTo(State.getSubtargetInfo(), LoopCounter, LoopCount))
|
|
Entry.addInstruction(Inst);
|
|
|
|
// Set up the loop basic block.
|
|
Entry.MBB->addSuccessor(Loop.MBB, BranchProbability::getOne());
|
|
Loop.MBB->addSuccessor(Loop.MBB, BranchProbability::getOne());
|
|
// The live ins are: the loop counter, the registers that were setup by
|
|
// the entry block, and entry block live ins.
|
|
Loop.MBB->addLiveIn(LoopCounter);
|
|
for (unsigned Reg : Filler.getRegistersSetUp())
|
|
Loop.MBB->addLiveIn(Reg);
|
|
for (const auto &LiveIn : Entry.MBB->liveins())
|
|
Loop.MBB->addLiveIn(LiveIn);
|
|
for (auto _ : seq(0U, LoopUnrollFactor)) {
|
|
(void)_;
|
|
Loop.addInstructions(Instructions);
|
|
}
|
|
ET.decrementLoopCounterAndJump(*Loop.MBB, *Loop.MBB,
|
|
State.getInstrInfo());
|
|
|
|
// Set up the exit basic block.
|
|
Loop.MBB->addSuccessor(Exit.MBB, BranchProbability::getZero());
|
|
Exit.addReturn();
|
|
};
|
|
}
|
|
|
|
BitVector getReservedRegs() const override {
|
|
// We're using a single loop counter, but we have to reserve all aliasing
|
|
// registers.
|
|
return State.getRATC().getRegister(LoopCounter).aliasedBits();
|
|
}
|
|
|
|
private:
|
|
const unsigned LoopCounter;
|
|
};
|
|
|
|
} // namespace
|
|
|
|
// Defined out of line; the destructor itself performs no work.
SnippetRepetitor::~SnippetRepetitor() = default;
|
|
|
|
std::unique_ptr<const SnippetRepetitor>
|
|
SnippetRepetitor::Create(InstructionBenchmark::RepetitionModeE Mode,
|
|
const LLVMState &State) {
|
|
switch (Mode) {
|
|
case InstructionBenchmark::Duplicate:
|
|
return std::make_unique<DuplicateSnippetRepetitor>(State);
|
|
case InstructionBenchmark::Loop:
|
|
return std::make_unique<LoopSnippetRepetitor>(State);
|
|
case InstructionBenchmark::AggregateMin:
|
|
break;
|
|
}
|
|
llvm_unreachable("Unknown RepetitionModeE enum");
|
|
}
|
|
|
|
} // namespace exegesis
|
|
} // namespace llvm
|