mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-11-26 12:43:36 +01:00
5d534d8259
I really needed this, like, factually, yesterday, when verifying dependency breaking idioms for AMD Zen 3 scheduler model. Consider the following example: ``` $ ./bin/llvm-exegesis --mode=inverse_throughput --snippets-file=/tmp/snippet.s --num-repetitions=1000000 --repetition-mode=duplicate Check generated assembly with: /usr/bin/objdump -d /tmp/snippet-4a7e50.o --- mode: inverse_throughput key: instructions: - 'VPXORYrr YMM0 YMM0 YMM0' config: '' register_initial_values: [] cpu_name: znver3 llvm_triple: x86_64-unknown-linux-gnu num_repetitions: 1000000 measurements: - { key: inverse_throughput, value: 0.31025, per_snippet_value: 0.31025 } error: '' info: '' assembled_snippet: C5FDEFC0C5FDEFC0C5FDEFC0C5FDEFC0C5FDEFC0C5FDEFC0C5FDEFC0C5FDEFC0C5FDEFC0C5FDEFC0C5FDEFC0C5FDEFC0C5FDEFC0C5FDEFC0C5FDEFC0C5FDEFC0C3 ... ``` What does it tell us? So wait, it can only execute ~3 x86 AVX YMM PXOR zero-idioms per cycle? That doesn't seem right. That's even less than there are pipes supporting this type of op. Now, second example: ``` $ ./bin/llvm-exegesis --mode=inverse_throughput --snippets-file=/tmp/snippet.s --num-repetitions=1000000 --repetition-mode=loop Check generated assembly with: /usr/bin/objdump -d /tmp/snippet-2418b5.o --- mode: inverse_throughput key: instructions: - 'VPXORYrr YMM0 YMM0 YMM0' config: '' register_initial_values: [] cpu_name: znver3 llvm_triple: x86_64-unknown-linux-gnu num_repetitions: 1000000 measurements: - { key: inverse_throughput, value: 1.00011, per_snippet_value: 1.00011 } error: '' info: '' assembled_snippet: 49B80800000000000000C5FDEFC0C5FDEFC04983C0FF75F2C3 ... ``` Now that's just worse. Due to the looping, the throughput completely plummeted, and now we can only do a single instruction/cycle!? That's not great. 
And the final example: ``` $ ./bin/llvm-exegesis --mode=inverse_throughput --snippets-file=/tmp/snippet.s --num-repetitions=1000000 --repetition-mode=loop --loop-body-size=1000 Check generated assembly with: /usr/bin/objdump -d /tmp/snippet-c402e2.o --- mode: inverse_throughput key: instructions: - 'VPXORYrr YMM0 YMM0 YMM0' config: '' register_initial_values: [] cpu_name: znver3 llvm_triple: x86_64-unknown-linux-gnu num_repetitions: 1000000 measurements: - { key: inverse_throughput, value: 0.167087, per_snippet_value: 0.167087 } error: '' info: '' assembled_snippet: 49B80800000000000000C5FDEFC0C5FDEFC04983C0FF75F2C3 ... ``` So if we merge the previous two approaches, duplicate this single-instruction snippet 1000x (loop-body-size/instruction count in snippet), and run a loop with 1000 iterations over that duplicated/unrolled snippet, the measured throughput goes through the roof, up to 5.9 instructions/cycle, which finally tells us that this idiom is zero-cycle! Reviewed By: courbet Differential Revision: https://reviews.llvm.org/D102522
99 lines
3.4 KiB
C++
99 lines
3.4 KiB
C++
//===-- SnippetRepetitorTest.cpp --------------------------------*- C++ -*-===//
|
|
//
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
#include "../Common/AssemblerUtils.h"
|
|
#include "LlvmState.h"
|
|
#include "MCInstrDescView.h"
|
|
#include "RegisterAliasing.h"
|
|
#include "TestBase.h"
|
|
#include "X86InstrInfo.h"
|
|
#include "llvm/CodeGen/MachineBasicBlock.h"
|
|
|
|
namespace llvm {
|
|
namespace exegesis {
|
|
|
|
// Forward declaration only: no definition appears in this file, and nothing
// in the visible part of this file calls it directly.
// NOTE(review): presumably kept for the test base's target setup -- confirm.
void InitializeX86ExegesisTarget();
|
|
|
|
namespace {
|
|
|
|
using testing::ElementsAre;
|
|
using testing::Eq;
|
|
using testing::Field;
|
|
using testing::Property;
|
|
using testing::UnorderedElementsAre;
|
|
|
|
class X86SnippetRepetitorTest : public X86TestBase {
|
|
protected:
|
|
void SetUp() override {
|
|
TM = State.createTargetMachine();
|
|
Context = std::make_unique<LLVMContext>();
|
|
Mod = std::make_unique<Module>("X86SnippetRepetitorTest", *Context);
|
|
Mod->setDataLayout(TM->createDataLayout());
|
|
MMI = std::make_unique<MachineModuleInfo>(TM.get());
|
|
MF = &createVoidVoidPtrMachineFunction("TestFn", Mod.get(), MMI.get());
|
|
}
|
|
|
|
void TestCommon(InstructionBenchmark::RepetitionModeE RepetitionMode) {
|
|
const auto Repetitor = SnippetRepetitor::Create(RepetitionMode, State);
|
|
const std::vector<MCInst> Instructions = {MCInstBuilder(X86::NOOP)};
|
|
FunctionFiller Sink(*MF, {X86::EAX});
|
|
const auto Fill =
|
|
Repetitor->Repeat(Instructions, kMinInstructions, kLoopBodySize);
|
|
Fill(Sink);
|
|
}
|
|
|
|
static constexpr const unsigned kMinInstructions = 3;
|
|
static constexpr const unsigned kLoopBodySize = 5;
|
|
|
|
std::unique_ptr<LLVMTargetMachine> TM;
|
|
std::unique_ptr<LLVMContext> Context;
|
|
std::unique_ptr<Module> Mod;
|
|
std::unique_ptr<MachineModuleInfo> MMI;
|
|
MachineFunction *MF = nullptr;
|
|
};
|
|
|
|
static auto HasOpcode = [](unsigned Opcode) {
|
|
return Property(&MachineInstr::getOpcode, Eq(Opcode));
|
|
};
|
|
|
|
static auto LiveReg = [](unsigned Reg) {
|
|
return Field(&MachineBasicBlock::RegisterMaskPair::PhysReg, Eq(Reg));
|
|
};
|
|
|
|
TEST_F(X86SnippetRepetitorTest, Duplicate) {
  TestCommon(InstructionBenchmark::Duplicate);

  // Duplicate mode emits everything into one basic block: the snippet
  // repeated (three NOOPs here), followed by the return.
  ASSERT_EQ(MF->getNumBlockIDs(), 1u);
  const MachineBasicBlock &OnlyBlock = *MF->getBlockNumbered(0);
  EXPECT_THAT(OnlyBlock.instrs(),
              ElementsAre(HasOpcode(X86::NOOP), HasOpcode(X86::NOOP),
                          HasOpcode(X86::NOOP), HasOpcode(X86::RETQ)));
}
|
|
|
|
TEST_F(X86SnippetRepetitorTest, Loop) {
  TestCommon(InstructionBenchmark::Loop);

  // Loop mode creates three blocks: an entry block, the loop body and a
  // return block.
  ASSERT_EQ(MF->getNumBlockIDs(), 3u);

  // The loop body is expected to hold five NOOPs followed by ADD64ri8 and
  // JCC_1 (presumably the loop-counter update and the back-edge branch).
  const MachineBasicBlock &Body = *MF->getBlockNumbered(1);
  EXPECT_THAT(Body.instrs(),
              ElementsAre(HasOpcode(X86::NOOP), HasOpcode(X86::NOOP),
                          HasOpcode(X86::NOOP), HasOpcode(X86::NOOP),
                          HasOpcode(X86::NOOP), HasOpcode(X86::ADD64ri8),
                          HasOpcode(X86::JCC_1)));

  // Live-ins of the loop body: the register passed to the filler (EAX) and
  // the target's loop counter register.
  const auto LoopCounterReg =
      State.getExegesisTarget().getLoopCounterRegister(
          State.getTargetMachine().getTargetTriple());
  EXPECT_THAT(Body.liveins(),
              UnorderedElementsAre(LiveReg(X86::EAX), LiveReg(LoopCounterReg)));

  // The last block contains nothing but the return.
  const MachineBasicBlock &RetBlock = *MF->getBlockNumbered(2);
  EXPECT_THAT(RetBlock.instrs(), ElementsAre(HasOpcode(X86::RETQ)));
}
|
|
|
|
} // namespace
|
|
} // namespace exegesis
|
|
} // namespace llvm
|