1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-11-26 12:43:36 +01:00
llvm-mirror/unittests/tools/llvm-exegesis/X86/SnippetRepetitorTest.cpp
Roman Lebedev 5d534d8259 [llvm-exegesis] Loop unrolling for loop snippet repetitor mode
I really needed this, like, factually, yesterday,
when verifying dependency breaking idioms for AMD Zen 3 scheduler model.

Consider the following example:
```
$ ./bin/llvm-exegesis --mode=inverse_throughput --snippets-file=/tmp/snippet.s --num-repetitions=1000000 --repetition-mode=duplicate
Check generated assembly with: /usr/bin/objdump -d /tmp/snippet-4a7e50.o
---
mode:            inverse_throughput
key:
  instructions:
    - 'VPXORYrr YMM0 YMM0 YMM0'
  config:          ''
  register_initial_values: []
cpu_name:        znver3
llvm_triple:     x86_64-unknown-linux-gnu
num_repetitions: 1000000
measurements:
  - { key: inverse_throughput, value: 0.31025, per_snippet_value: 0.31025 }
error:           ''
info:            ''
assembled_snippet: C5FDEFC0C5FDEFC0C5FDEFC0C5FDEFC0C5FDEFC0C5FDEFC0C5FDEFC0C5FDEFC0C5FDEFC0C5FDEFC0C5FDEFC0C5FDEFC0C5FDEFC0C5FDEFC0C5FDEFC0C5FDEFC0C3
...

```
What does it tell us?
So wait, it can only execute ~3 x86 AVX YMM PXOR zero-idioms per cycle?
That doesn't seem right. That's even less than there are pipes supporting this type of op.

Now, second example:
```
$ ./bin/llvm-exegesis --mode=inverse_throughput --snippets-file=/tmp/snippet.s --num-repetitions=1000000 --repetition-mode=loop
Check generated assembly with: /usr/bin/objdump -d /tmp/snippet-2418b5.o
---
mode:            inverse_throughput
key:
  instructions:
    - 'VPXORYrr YMM0 YMM0 YMM0'
  config:          ''
  register_initial_values: []
cpu_name:        znver3
llvm_triple:     x86_64-unknown-linux-gnu
num_repetitions: 1000000
measurements:
  - { key: inverse_throughput, value: 1.00011, per_snippet_value: 1.00011 }
error:           ''
info:            ''
assembled_snippet: 49B80800000000000000C5FDEFC0C5FDEFC04983C0FF75F2C3
...
```
Now that's just worse. Due to the looping, the throughput completely plummeted,
and now we can only do a single instruction/cycle!?

That's not great.
And the final example:
```
$ ./bin/llvm-exegesis --mode=inverse_throughput --snippets-file=/tmp/snippet.s --num-repetitions=1000000 --repetition-mode=loop --loop-body-size=1000
Check generated assembly with: /usr/bin/objdump -d /tmp/snippet-c402e2.o
---
mode:            inverse_throughput
key:
  instructions:
    - 'VPXORYrr YMM0 YMM0 YMM0'
  config:          ''
  register_initial_values: []
cpu_name:        znver3
llvm_triple:     x86_64-unknown-linux-gnu
num_repetitions: 1000000
measurements:
  - { key: inverse_throughput, value: 0.167087, per_snippet_value: 0.167087 }
error:           ''
info:            ''
assembled_snippet: 49B80800000000000000C5FDEFC0C5FDEFC04983C0FF75F2C3
...
```

So if we merge the previous two approaches, duplicate this single-instruction snippet 1000x
(loop-body-size/instruction count in snippet), and run a loop with 1000 iterations
over that duplicated/unrolled snippet, the measured throughput goes through the roof,
up to 5.9 instructions/cycle, which finally tells us that this idiom is zero-cycle!

Reviewed By: courbet

Differential Revision: https://reviews.llvm.org/D102522
2021-05-25 12:08:27 +03:00

99 lines
3.4 KiB
C++

//===-- SnippetRepetitorTest.cpp --------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "../Common/AssemblerUtils.h"
#include "LlvmState.h"
#include "MCInstrDescView.h"
#include "RegisterAliasing.h"
#include "TestBase.h"
#include "X86InstrInfo.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
namespace llvm {
namespace exegesis {
// Forward declaration only: neither a definition nor a call is visible in this
// file. NOTE(review): presumably registers the X86 exegesis target and is
// invoked by the X86TestBase fixture -- confirm against TestBase.h.
void InitializeX86ExegesisTarget();
namespace {
using testing::ElementsAre;
using testing::Eq;
using testing::Field;
using testing::Property;
using testing::UnorderedElementsAre;
/// Test fixture that owns a fresh module / machine function and runs a snippet
/// repetitor over a single-NOOP snippet so the tests can inspect the generated
/// basic blocks.
class X86SnippetRepetitorTest : public X86TestBase {
protected:
  /// Builds the target machine, context, module and machine-module-info, then
  /// creates the "TestFn" machine function that the repetitor fills in.
  void SetUp() override {
    TM = State.createTargetMachine();
    Context = std::make_unique<LLVMContext>();
    Mod = std::make_unique<Module>("X86SnippetRepetitorTest", *Context);
    // The module must use the data layout of the target machine created above.
    Mod->setDataLayout(TM->createDataLayout());
    MMI = std::make_unique<MachineModuleInfo>(TM.get());
    MF = &createVoidVoidPtrMachineFunction("TestFn", Mod.get(), MMI.get());
  }

  /// Repeats a one-instruction (NOOP) snippet into `MF` using the repetitor
  /// selected by \p RepetitionMode, with `kMinInstructions` and
  /// `kLoopBodySize` as the repetition parameters.
  void TestCommon(InstructionBenchmark::RepetitionModeE RepetitionMode) {
    const auto Repetitor = SnippetRepetitor::Create(RepetitionMode, State);
    const std::vector<MCInst> Snippet = {MCInstBuilder(X86::NOOP)};
    // EAX is handed to the filler; the Loop test expects it to show up as a
    // live-in of the loop block.
    FunctionFiller Filler(*MF, {X86::EAX});
    const auto EmitRepeated =
        Repetitor->Repeat(Snippet, kMinInstructions, kLoopBodySize);
    EmitRepeated(Filler);
  }

  // Repetition parameters forwarded to SnippetRepetitor::Repeat().
  static constexpr unsigned kMinInstructions = 3;
  static constexpr unsigned kLoopBodySize = 5;

  std::unique_ptr<LLVMTargetMachine> TM;
  std::unique_ptr<LLVMContext> Context;
  std::unique_ptr<Module> Mod;
  std::unique_ptr<MachineModuleInfo> MMI;
  // Points at the function created in SetUp(); initialized to nullptr.
  MachineFunction *MF = nullptr;
};
/// Builds a matcher that accepts a MachineInstr whose getOpcode() equals
/// \p Opcode.
static auto HasOpcode = [](unsigned Opcode) {
  const auto OpcodeIs = Eq(Opcode);
  return Property(&MachineInstr::getOpcode, OpcodeIs);
};
/// Builds a matcher that accepts a MachineBasicBlock::RegisterMaskPair whose
/// PhysReg field equals \p Reg.
static auto LiveReg = [](unsigned Reg) {
  const auto RegIs = Eq(Reg);
  return Field(&MachineBasicBlock::RegisterMaskPair::PhysReg, RegIs);
};
TEST_F(X86SnippetRepetitorTest, Duplicate) {
  TestCommon(InstructionBenchmark::Duplicate);

  // Duplicate mode is expected to produce exactly one basic block.
  ASSERT_EQ(MF->getNumBlockIDs(), 1u);

  // That block holds the NOOP repeated three times (kMinInstructions is 3),
  // followed by the return.
  MachineBasicBlock &OnlyBlock = *MF->getBlockNumbered(0);
  EXPECT_THAT(OnlyBlock.instrs(),
              ElementsAre(HasOpcode(X86::NOOP), HasOpcode(X86::NOOP),
                          HasOpcode(X86::NOOP), HasOpcode(X86::RETQ)));
}
TEST_F(X86SnippetRepetitorTest, Loop) {
  TestCommon(InstructionBenchmark::Loop);

  // Loop mode is expected to produce three blocks: an entry block, the loop
  // body and a return block.
  ASSERT_EQ(MF->getNumBlockIDs(), 3u);

  // The loop body holds five NOOPs (kLoopBodySize is 5), then the ADD64ri8
  // and the JCC_1 that close the loop.
  const auto &LoopBlock = *MF->getBlockNumbered(1);
  EXPECT_THAT(LoopBlock.instrs(),
              ElementsAre(HasOpcode(X86::NOOP), HasOpcode(X86::NOOP),
                          HasOpcode(X86::NOOP), HasOpcode(X86::NOOP),
                          HasOpcode(X86::NOOP), HasOpcode(X86::ADD64ri8),
                          HasOpcode(X86::JCC_1)));

  // Both the register given to the filler (EAX) and the target's loop counter
  // register must be live into the loop body, in any order.
  const unsigned LoopCounterReg =
      State.getExegesisTarget().getLoopCounterRegister(
          State.getTargetMachine().getTargetTriple());
  EXPECT_THAT(LoopBlock.liveins(),
              UnorderedElementsAre(LiveReg(X86::EAX), LiveReg(LoopCounterReg)));

  // The final block contains only the return.
  MachineBasicBlock &RetBlock = *MF->getBlockNumbered(2);
  EXPECT_THAT(RetBlock.instrs(), ElementsAre(HasOpcode(X86::RETQ)));
}
} // namespace
} // namespace exegesis
} // namespace llvm