1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-11-23 03:02:36 +01:00
llvm-mirror/tools/llvm-exegesis/lib/SnippetRepetitor.cpp
Roman Lebedev 5d534d8259 [llvm-exegesis] Loop unrolling for loop snippet repetitor mode
I really needed this, like, factually, yesterday,
when verifying dependency breaking idioms for AMD Zen 3 scheduler model.

Consider the following example:
```
$ ./bin/llvm-exegesis --mode=inverse_throughput --snippets-file=/tmp/snippet.s --num-repetitions=1000000 --repetition-mode=duplicate
Check generated assembly with: /usr/bin/objdump -d /tmp/snippet-4a7e50.o
---
mode:            inverse_throughput
key:
  instructions:
    - 'VPXORYrr YMM0 YMM0 YMM0'
  config:          ''
  register_initial_values: []
cpu_name:        znver3
llvm_triple:     x86_64-unknown-linux-gnu
num_repetitions: 1000000
measurements:
  - { key: inverse_throughput, value: 0.31025, per_snippet_value: 0.31025 }
error:           ''
info:            ''
assembled_snippet: C5FDEFC0C5FDEFC0C5FDEFC0C5FDEFC0C5FDEFC0C5FDEFC0C5FDEFC0C5FDEFC0C5FDEFC0C5FDEFC0C5FDEFC0C5FDEFC0C5FDEFC0C5FDEFC0C5FDEFC0C5FDEFC0C3
...

```
What does it tell us?
So wait, it can only execute ~3 x86 AVX YMM PXOR zero-idioms per cycle?
That doesn't seem right. That's even less than there are pipes supporting this type of op.

Now, second example:
```
$ ./bin/llvm-exegesis --mode=inverse_throughput --snippets-file=/tmp/snippet.s --num-repetitions=1000000 --repetition-mode=loop
Check generated assembly with: /usr/bin/objdump -d /tmp/snippet-2418b5.o
---
mode:            inverse_throughput
key:
  instructions:
    - 'VPXORYrr YMM0 YMM0 YMM0'
  config:          ''
  register_initial_values: []
cpu_name:        znver3
llvm_triple:     x86_64-unknown-linux-gnu
num_repetitions: 1000000
measurements:
  - { key: inverse_throughput, value: 1.00011, per_snippet_value: 1.00011 }
error:           ''
info:            ''
assembled_snippet: 49B80800000000000000C5FDEFC0C5FDEFC04983C0FF75F2C3
...
```
Now that's just worse. Due to the looping, the throughput completely plummeted,
and now we can only do a single instruction/cycle!?

That's not great.
And final example:
```
$ ./bin/llvm-exegesis --mode=inverse_throughput --snippets-file=/tmp/snippet.s --num-repetitions=1000000 --repetition-mode=loop --loop-body-size=1000
Check generated assembly with: /usr/bin/objdump -d /tmp/snippet-c402e2.o
---
mode:            inverse_throughput
key:
  instructions:
    - 'VPXORYrr YMM0 YMM0 YMM0'
  config:          ''
  register_initial_values: []
cpu_name:        znver3
llvm_triple:     x86_64-unknown-linux-gnu
num_repetitions: 1000000
measurements:
  - { key: inverse_throughput, value: 0.167087, per_snippet_value: 0.167087 }
error:           ''
info:            ''
assembled_snippet: 49B80800000000000000C5FDEFC0C5FDEFC04983C0FF75F2C3
...
```

So if we merge the previous two approaches -- duplicate this single-instruction snippet 1000x
(loop-body-size / instruction count in snippet), and run a loop with 1000 iterations
over that duplicated/unrolled snippet -- the measured throughput goes through the roof,
up to 5.9 instructions/cycle, which finally tells us that this idiom is zero-cycle!

Reviewed By: courbet

Differential Revision: https://reviews.llvm.org/D102522
2021-05-25 12:08:27 +03:00

134 lines
4.6 KiB
C++

//===-- SnippetRepetitor.cpp ------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include <array>
#include <string>
#include "SnippetRepetitor.h"
#include "Target.h"
#include "llvm/ADT/Sequence.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
namespace llvm {
namespace exegesis {
namespace {
// Repetitor that produces straight-line code: the snippet is copied back to
// back into the entry block, with no loop and no extra registers.
class DuplicateSnippetRepetitor : public SnippetRepetitor {
public:
  using SnippetRepetitor::SnippetRepetitor;

  // Returns a filler that emits the snippet once in full and then keeps
  // appending its instructions, cycling through the snippet, until the entry
  // block holds at least MinInstructions instructions. An empty snippet yields
  // a function consisting of just a return. LoopBodySize is not used by this
  // repetition mode.
  FillFunction Repeat(ArrayRef<MCInst> Instructions, unsigned MinInstructions,
                      unsigned LoopBodySize) const override {
    return [Instructions, MinInstructions](FunctionFiller &Filler) {
      auto Entry = Filler.getEntry();
      if (const auto SnippetSize = Instructions.size()) {
        // The first copy is always emitted whole, even when it alone already
        // exceeds MinInstructions.
        Entry.addInstructions(Instructions);
        // Top up one instruction at a time; the last copy may be partial.
        unsigned Emitted = SnippetSize;
        while (Emitted < MinInstructions) {
          Entry.addInstruction(Instructions[Emitted % SnippetSize]);
          ++Emitted;
        }
      }
      Entry.addReturn();
    };
  }

  // No registers need to be reserved: this mode adds no bookkeeping code.
  BitVector getReservedRegs() const override {
    return State.getRATC().emptyRegisters();
  }
};
// Repetitor that wraps the (optionally unrolled) snippet in a counted loop.
// One register, provided by the exegesis target, is used as the loop counter.
class LoopSnippetRepetitor : public SnippetRepetitor {
public:
  // Queries the exegesis target for the loop counter register to use for the
  // target triple of State.
  explicit LoopSnippetRepetitor(const LLVMState &State)
      : SnippetRepetitor(State),
        LoopCounter(State.getExegesisTarget().getLoopCounterRegister(
            State.getTargetMachine().getTargetTriple())) {}

  // Returns a filler that emits a loop whose body is the snippet repeated
  // LoopUnrollFactor times, where LoopUnrollFactor is
  // ceil(LoopBodySize / Instructions.size()) (or 1 when LoopBodySize does not
  // exceed the snippet size). The loop runs
  // ceil(MinInstructions / (LoopUnrollFactor * Instructions.size())) times.
  //
  // An empty snippet produces a function that only returns, matching what
  // DuplicateSnippetRepetitor does for empty input.
  FillFunction Repeat(ArrayRef<MCInst> Instructions, unsigned MinInstructions,
                      unsigned LoopBodySize) const override {
    return [this, Instructions, MinInstructions,
            LoopBodySize](FunctionFiller &Filler) {
      auto Entry = Filler.getEntry();

      // With no instructions there is nothing to loop over, and both
      // divisions below would divide by zero. Bail out before creating any
      // additional basic blocks.
      if (Instructions.empty()) {
        Entry.addReturn();
        return;
      }

      const auto &ET = State.getExegesisTarget();
      auto Loop = Filler.addBasicBlock();
      auto Exit = Filler.addBasicBlock();

      // Number of snippet copies placed in the loop body.
      const unsigned LoopUnrollFactor =
          LoopBodySize <= Instructions.size()
              ? 1
              : divideCeil(LoopBodySize, Instructions.size());
      assert(LoopUnrollFactor >= 1 && "Should end up with at least 1 snippet.");

      // Set loop counter to the right value: enough iterations of the
      // unrolled body to execute at least MinInstructions instructions.
      const APInt LoopCount(
          32,
          divideCeil(MinInstructions, LoopUnrollFactor * Instructions.size()));
      assert(LoopCount.uge(1) && "Trip count should be at least 1.");
      for (const MCInst &Inst :
           ET.setRegTo(State.getSubtargetInfo(), LoopCounter, LoopCount))
        Entry.addInstruction(Inst);

      // Set up the loop basic block. Entry falls into the loop, and the loop
      // is modelled as always branching back to itself.
      Entry.MBB->addSuccessor(Loop.MBB, BranchProbability::getOne());
      Loop.MBB->addSuccessor(Loop.MBB, BranchProbability::getOne());
      // The live ins are: the loop counter, the registers that were setup by
      // the entry block, and entry block live ins.
      Loop.MBB->addLiveIn(LoopCounter);
      for (unsigned Reg : Filler.getRegistersSetUp())
        Loop.MBB->addLiveIn(Reg);
      for (const auto &LiveIn : Entry.MBB->liveins())
        Loop.MBB->addLiveIn(LiveIn);

      // Loop body: LoopUnrollFactor back-to-back copies of the snippet.
      for (auto _ : seq(0U, LoopUnrollFactor)) {
        (void)_;
        Loop.addInstructions(Instructions);
      }
      // Let the target append the counter decrement and the jump back to the
      // loop block.
      ET.decrementLoopCounterAndJump(*Loop.MBB, *Loop.MBB,
                                     State.getInstrInfo());

      // Set up the exit basic block; the loop exit edge is given zero
      // probability.
      Loop.MBB->addSuccessor(Exit.MBB, BranchProbability::getZero());
      Exit.addReturn();
    };
  }

  BitVector getReservedRegs() const override {
    // We're using a single loop counter, but we have to reserve all aliasing
    // registers.
    return State.getRATC().getRegister(LoopCounter).aliasedBits();
  }

private:
  // Register holding the remaining trip count of the generated loop.
  const unsigned LoopCounter;
};
} // namespace
// Out-of-line destructor definition; there is nothing to clean up explicitly.
SnippetRepetitor::~SnippetRepetitor() = default;
// Factory: builds the repetitor implementing the requested repetition mode.
// Only Duplicate and Loop map to a concrete repetitor here; any other value
// (including AggregateMin) hits llvm_unreachable.
std::unique_ptr<const SnippetRepetitor>
SnippetRepetitor::Create(InstructionBenchmark::RepetitionModeE Mode,
                         const LLVMState &State) {
  std::unique_ptr<const SnippetRepetitor> Repetitor;
  // Keep the switch exhaustive over the enum (no default label) so the
  // compiler can flag newly added modes.
  switch (Mode) {
  case InstructionBenchmark::Duplicate:
    Repetitor = std::make_unique<DuplicateSnippetRepetitor>(State);
    break;
  case InstructionBenchmark::Loop:
    Repetitor = std::make_unique<LoopSnippetRepetitor>(State);
    break;
  case InstructionBenchmark::AggregateMin:
    break;
  }
  if (!Repetitor)
    llvm_unreachable("Unknown RepetitionModeE enum");
  return Repetitor;
}
} // namespace exegesis
} // namespace llvm