2018-03-08 14:05:02 +01:00
|
|
|
//===--------------------- Instruction.cpp ----------------------*- C++ -*-===//
|
|
|
|
//
|
2019-01-19 09:50:56 +01:00
|
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
2018-03-08 14:05:02 +01:00
|
|
|
//
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
//
|
2018-06-25 18:53:00 +02:00
|
|
|
// This file defines abstractions used by the Pipeline to model register reads,
|
2018-03-08 14:05:02 +01:00
|
|
|
// register writes and instructions.
|
|
|
|
//
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
|
2018-12-17 09:08:31 +01:00
|
|
|
#include "llvm/MCA/Instruction.h"
|
2018-03-08 14:05:02 +01:00
|
|
|
#include "llvm/Support/Debug.h"
|
|
|
|
#include "llvm/Support/raw_ostream.h"
|
|
|
|
|
2018-10-30 16:56:08 +01:00
|
|
|
namespace llvm {
|
2018-03-08 14:05:02 +01:00
|
|
|
namespace mca {
|
|
|
|
|
|
|
|
void ReadState::writeStartEvent(unsigned Cycles) {
|
|
|
|
assert(DependentWrites);
|
|
|
|
assert(CyclesLeft == UNKNOWN_CYCLES);
|
|
|
|
|
|
|
|
// This read may be dependent on more than one write. This typically occurs
|
|
|
|
// when a definition is the result of multiple writes where at least one
|
|
|
|
// write does a partial register update.
|
|
|
|
// The HW is forced to do some extra bookkeeping to track of all the
|
|
|
|
// dependent writes, and implement a merging scheme for the partial writes.
|
|
|
|
--DependentWrites;
|
|
|
|
TotalCycles = std::max(TotalCycles, Cycles);
|
|
|
|
|
2018-06-27 13:17:07 +02:00
|
|
|
if (!DependentWrites) {
|
2018-03-08 14:05:02 +01:00
|
|
|
CyclesLeft = TotalCycles;
|
2018-06-27 13:17:07 +02:00
|
|
|
IsReady = !CyclesLeft;
|
|
|
|
}
|
2018-03-08 14:05:02 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
void WriteState::onInstructionIssued() {
|
|
|
|
assert(CyclesLeft == UNKNOWN_CYCLES);
|
|
|
|
// Update the number of cycles left based on the WriteDescriptor info.
|
2018-07-06 15:46:10 +02:00
|
|
|
CyclesLeft = getLatency();
|
2018-03-08 14:05:02 +01:00
|
|
|
|
2018-06-05 19:12:02 +02:00
|
|
|
// Now that the time left before write-back is known, notify
|
2018-03-08 14:05:02 +01:00
|
|
|
// all the users.
|
|
|
|
for (const std::pair<ReadState *, int> &User : Users) {
|
|
|
|
ReadState *RS = User.first;
|
|
|
|
unsigned ReadCycles = std::max(0, CyclesLeft - User.second);
|
|
|
|
RS->writeStartEvent(ReadCycles);
|
|
|
|
}
|
2018-11-22 13:48:57 +01:00
|
|
|
|
|
|
|
// Notify any writes that are in a false dependency with this write.
|
|
|
|
if (PartialWrite)
|
|
|
|
PartialWrite->writeStartEvent(CyclesLeft);
|
2018-03-08 14:05:02 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
// Registers a read (`User`) that depends on this write, together with its
// ReadAdvance.
//
// If the number of cycles left is already known, the user is not stored:
// it is notified right away with the actual number of cycles left (which may
// be zero). Otherwise the user is appended to the list of users, unless it is
// already in that list.
void WriteState::addUser(ReadState *User, int ReadAdvance) {
  if (CyclesLeft != UNKNOWN_CYCLES) {
    // Clamp at zero: ReadAdvance may exceed the cycles left.
    const int Remaining = CyclesLeft - ReadAdvance;
    User->writeStartEvent(Remaining > 0 ? Remaining : 0);
    return;
  }

  // Do not register the same read twice.
  for (const std::pair<ReadState *, int> &Entry : Users)
    if (Entry.first == User)
      return;

  Users.emplace_back(User, ReadAdvance);
}
|
|
|
|
|
2018-11-22 13:48:57 +01:00
|
|
|
// Registers a write (`User`) that depends on this write.
//
// While the number of cycles left is still unknown, `User` is recorded as the
// partial write of this object and this object is recorded as its dependent
// write. Once the cycles left are known, `User` is simply notified with that
// value (clamped at zero).
void WriteState::addUser(WriteState *User) {
  if (CyclesLeft == UNKNOWN_CYCLES) {
    assert(!PartialWrite && "PartialWrite already set!");
    PartialWrite = User;
    User->setDependentWrite(this);
    return;
  }

  User->writeStartEvent(CyclesLeft > 0 ? CyclesLeft : 0);
}
|
|
|
|
|
2018-03-08 14:05:02 +01:00
|
|
|
void WriteState::cycleEvent() {
|
|
|
|
// Note: CyclesLeft can be a negative number. It is an error to
|
|
|
|
// make it an unsigned quantity because users of this write may
|
|
|
|
// specify a negative ReadAdvance.
|
|
|
|
if (CyclesLeft != UNKNOWN_CYCLES)
|
|
|
|
CyclesLeft--;
|
2018-11-22 13:48:57 +01:00
|
|
|
|
|
|
|
if (DependentWriteCyclesLeft)
|
|
|
|
DependentWriteCyclesLeft--;
|
2018-03-08 14:05:02 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
void ReadState::cycleEvent() {
|
2018-06-05 19:12:02 +02:00
|
|
|
// Update the total number of cycles.
|
|
|
|
if (DependentWrites && TotalCycles) {
|
|
|
|
--TotalCycles;
|
2018-03-08 14:05:02 +01:00
|
|
|
return;
|
2018-06-05 19:12:02 +02:00
|
|
|
}
|
2018-03-08 14:05:02 +01:00
|
|
|
|
2018-06-05 19:12:02 +02:00
|
|
|
// Bail out immediately if we don't know how many cycles are left.
|
|
|
|
if (CyclesLeft == UNKNOWN_CYCLES)
|
2018-03-08 14:05:02 +01:00
|
|
|
return;
|
|
|
|
|
2018-06-27 13:17:07 +02:00
|
|
|
if (CyclesLeft) {
|
2018-06-05 19:12:02 +02:00
|
|
|
--CyclesLeft;
|
2018-06-27 13:17:07 +02:00
|
|
|
IsReady = !CyclesLeft;
|
|
|
|
}
|
2018-03-08 14:05:02 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
#ifndef NDEBUG
|
|
|
|
void WriteState::dump() const {
|
[llvm-mca] Lower to mca::Instructon before the pipeline is run.
Before this change, the lowering of instructions from llvm::MCInst to
mca::Instruction was done as part of the first stage of the pipeline (i.e. the
FetchStage). In particular, FetchStage was responsible for picking the next
instruction from the source sequence, and lower it to an mca::Instruction with
the help of an object of class InstrBuilder.
The dependency on InstrBuilder was problematic for a number of reasons. Class
InstrBuilder only knows how to lower from llvm::MCInst to mca::Instruction.
That means, it is hard to support a different scenario where instructions
in input are not instances of class llvm::MCInst. Even if we managed to
specialize InstrBuilder, and generalize most of its internal logic, the
dependency on InstrBuilder in FetchStage would have caused more troubles (other
than complicating the pipeline logic).
With this patch, the lowering step is done before the pipeline is run. The
pipeline is no longer responsible for lowering from MCInst to mca::Instruction.
As a consequence of this, the FetchStage no longer needs to interact with an
InstrBuilder. The mca::SourceMgr class now simply wraps a reference to a
sequence of mca::Instruction objects.
This simplifies the logic of FetchStage, and increases the usability of it. As
a result, on a debug build, we see a 7-9% speedup; on a release build, the
speedup is around 3-4%.
llvm-svn: 345500
2018-10-29 14:29:22 +01:00
|
|
|
dbgs() << "{ OpIdx=" << WD->OpIndex << ", Lat=" << getLatency() << ", RegID "
|
2018-07-05 18:13:49 +02:00
|
|
|
<< getRegisterID() << ", Cycles Left=" << getCyclesLeft() << " }";
|
2018-03-08 14:05:02 +01:00
|
|
|
}
|
2018-06-28 17:50:26 +02:00
|
|
|
|
|
|
|
void WriteRef::dump() const {
|
2018-07-05 18:13:49 +02:00
|
|
|
dbgs() << "IID=" << getSourceIndex() << ' ';
|
2018-06-28 17:50:26 +02:00
|
|
|
if (isValid())
|
|
|
|
getWriteState()->dump();
|
|
|
|
else
|
|
|
|
dbgs() << "(null)";
|
|
|
|
}
|
2018-03-08 14:05:02 +01:00
|
|
|
#endif
|
|
|
|
|
2018-03-22 12:39:34 +01:00
|
|
|
void Instruction::dispatch(unsigned RCUToken) {
|
2018-03-22 11:19:20 +01:00
|
|
|
assert(Stage == IS_INVALID);
|
|
|
|
Stage = IS_AVAILABLE;
|
2018-03-22 12:39:34 +01:00
|
|
|
RCUTokenID = RCUToken;
|
2018-03-08 14:05:02 +01:00
|
|
|
|
2018-03-22 12:39:34 +01:00
|
|
|
// Check if input operands are already available.
|
2018-03-29 16:26:56 +02:00
|
|
|
update();
|
2018-03-08 14:05:02 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
void Instruction::execute() {
|
|
|
|
assert(Stage == IS_READY);
|
|
|
|
Stage = IS_EXECUTING;
|
2018-03-22 12:39:34 +01:00
|
|
|
|
|
|
|
// Set the cycles left before the write-back stage.
|
2018-10-25 19:03:51 +02:00
|
|
|
CyclesLeft = getLatency();
|
2018-03-22 12:39:34 +01:00
|
|
|
|
2018-10-25 19:03:51 +02:00
|
|
|
for (WriteState &WS : getDefs())
|
|
|
|
WS.onInstructionIssued();
|
2018-03-22 12:39:34 +01:00
|
|
|
|
|
|
|
// Transition to the "executed" stage if this is a zero-latency instruction.
|
2018-03-22 11:19:20 +01:00
|
|
|
if (!CyclesLeft)
|
|
|
|
Stage = IS_EXECUTED;
|
2018-03-08 14:05:02 +01:00
|
|
|
}
|
|
|
|
|
2018-10-03 17:02:44 +02:00
|
|
|
// Moves a ready instruction directly to the IS_EXECUTED stage, with no
// cycles left. Register definitions are not notified here.
void Instruction::forceExecuted() {
  assert(Stage == IS_READY && "Invalid internal state!");
  Stage = IS_EXECUTED;
  CyclesLeft = 0;
}
|
|
|
|
|
2018-03-29 16:26:56 +02:00
|
|
|
void Instruction::update() {
|
2018-06-27 13:17:07 +02:00
|
|
|
assert(isDispatched() && "Unexpected instruction stage found!");
|
[llvm-mca][BtVer2] teach how to identify false dependencies on partially written
registers.
The goal of this patch is to improve the throughput analysis in llvm-mca for the
case where instructions perform partial register writes.
On x86, partial register writes are quite difficult to model, mainly because
different processors tend to implement different register merging schemes in
hardware.
When the code contains partial register writes, the IPC (instructions per
cycles) estimated by llvm-mca tends to diverge quite significantly from the
observed IPC (using perf).
Modern AMD processors (at least, from Bulldozer onwards) don't rename partial
registers. Quoting Agner Fog's microarchitecture.pdf:
" The processor always keeps the different parts of an integer register together.
For example, AL and AH are not treated as independent by the out-of-order
execution mechanism. An instruction that writes to part of a register will
therefore have a false dependence on any previous write to the same register or
any part of it."
This patch is a first important step towards improving the analysis of partial
register updates. It changes the semantic of RegisterFile descriptors in
tablegen, and teaches llvm-mca how to identify false dependences in the presence
of partial register writes (for more details: see the new code comments in
include/Target/TargetSchedule.h - class RegisterFile).
This patch doesn't address the case where a write to a part of a register is
followed by a read from the whole register. On Intel chips, high8 registers
(AH/BH/CH/DH)) can be stored in separate physical registers. However, a later
(dirty) read of the full register (example: AX/EAX) triggers a merge uOp, which
adds extra latency (and potentially affects the pipe usage).
This is a very interesting article on the subject with a very informative answer
from Peter Cordes:
https://stackoverflow.com/questions/45660139/how-exactly-do-partial-registers-on-haswell-skylake-perform-writing-al-seems-to
In future, the definition of RegisterFile can be extended with extra information
that may be used to identify delays caused by merge opcodes triggered by a dirty
read of a partial write.
Differential Revision: https://reviews.llvm.org/D49196
llvm-svn: 337123
2018-07-15 13:01:38 +02:00
|
|
|
|
2018-10-25 19:03:51 +02:00
|
|
|
if (!all_of(getUses(), [](const ReadState &Use) { return Use.isReady(); }))
|
[llvm-mca][BtVer2] teach how to identify false dependencies on partially written
registers.
The goal of this patch is to improve the throughput analysis in llvm-mca for the
case where instructions perform partial register writes.
On x86, partial register writes are quite difficult to model, mainly because
different processors tend to implement different register merging schemes in
hardware.
When the code contains partial register writes, the IPC (instructions per
cycles) estimated by llvm-mca tends to diverge quite significantly from the
observed IPC (using perf).
Modern AMD processors (at least, from Bulldozer onwards) don't rename partial
registers. Quoting Agner Fog's microarchitecture.pdf:
" The processor always keeps the different parts of an integer register together.
For example, AL and AH are not treated as independent by the out-of-order
execution mechanism. An instruction that writes to part of a register will
therefore have a false dependence on any previous write to the same register or
any part of it."
This patch is a first important step towards improving the analysis of partial
register updates. It changes the semantic of RegisterFile descriptors in
tablegen, and teaches llvm-mca how to identify false dependences in the presence
of partial register writes (for more details: see the new code comments in
include/Target/TargetSchedule.h - class RegisterFile).
This patch doesn't address the case where a write to a part of a register is
followed by a read from the whole register. On Intel chips, high8 registers
(AH/BH/CH/DH)) can be stored in separate physical registers. However, a later
(dirty) read of the full register (example: AX/EAX) triggers a merge uOp, which
adds extra latency (and potentially affects the pipe usage).
This is a very interesting article on the subject with a very informative answer
from Peter Cordes:
https://stackoverflow.com/questions/45660139/how-exactly-do-partial-registers-on-haswell-skylake-perform-writing-al-seems-to
In future, the definition of RegisterFile can be extended with extra information
that may be used to identify delays caused by merge opcodes triggered by a dirty
read of a partial write.
Differential Revision: https://reviews.llvm.org/D49196
llvm-svn: 337123
2018-07-15 13:01:38 +02:00
|
|
|
return;
|
|
|
|
|
|
|
|
// A partial register write cannot complete before a dependent write.
|
2018-10-25 19:03:51 +02:00
|
|
|
auto IsDefReady = [&](const WriteState &Def) {
|
2018-11-22 13:48:57 +01:00
|
|
|
if (!Def.getDependentWrite()) {
|
|
|
|
unsigned CyclesLeft = Def.getDependentWriteCyclesLeft();
|
|
|
|
return !CyclesLeft || CyclesLeft < getLatency();
|
[llvm-mca][BtVer2] teach how to identify false dependencies on partially written
registers.
The goal of this patch is to improve the throughput analysis in llvm-mca for the
case where instructions perform partial register writes.
On x86, partial register writes are quite difficult to model, mainly because
different processors tend to implement different register merging schemes in
hardware.
When the code contains partial register writes, the IPC (instructions per
cycles) estimated by llvm-mca tends to diverge quite significantly from the
observed IPC (using perf).
Modern AMD processors (at least, from Bulldozer onwards) don't rename partial
registers. Quoting Agner Fog's microarchitecture.pdf:
" The processor always keeps the different parts of an integer register together.
For example, AL and AH are not treated as independent by the out-of-order
execution mechanism. An instruction that writes to part of a register will
therefore have a false dependence on any previous write to the same register or
any part of it."
This patch is a first important step towards improving the analysis of partial
register updates. It changes the semantic of RegisterFile descriptors in
tablegen, and teaches llvm-mca how to identify false dependences in the presence
of partial register writes (for more details: see the new code comments in
include/Target/TargetSchedule.h - class RegisterFile).
This patch doesn't address the case where a write to a part of a register is
followed by a read from the whole register. On Intel chips, high8 registers
(AH/BH/CH/DH)) can be stored in separate physical registers. However, a later
(dirty) read of the full register (example: AX/EAX) triggers a merge uOp, which
adds extra latency (and potentially affects the pipe usage).
This is a very interesting article on the subject with a very informative answer
from Peter Cordes:
https://stackoverflow.com/questions/45660139/how-exactly-do-partial-registers-on-haswell-skylake-perform-writing-al-seems-to
In future, the definition of RegisterFile can be extended with extra information
that may be used to identify delays caused by merge opcodes triggered by a dirty
read of a partial write.
Differential Revision: https://reviews.llvm.org/D49196
llvm-svn: 337123
2018-07-15 13:01:38 +02:00
|
|
|
}
|
2018-11-22 13:48:57 +01:00
|
|
|
return false;
|
[llvm-mca][BtVer2] teach how to identify false dependencies on partially written
registers.
The goal of this patch is to improve the throughput analysis in llvm-mca for the
case where instructions perform partial register writes.
On x86, partial register writes are quite difficult to model, mainly because
different processors tend to implement different register merging schemes in
hardware.
When the code contains partial register writes, the IPC (instructions per
cycles) estimated by llvm-mca tends to diverge quite significantly from the
observed IPC (using perf).
Modern AMD processors (at least, from Bulldozer onwards) don't rename partial
registers. Quoting Agner Fog's microarchitecture.pdf:
" The processor always keeps the different parts of an integer register together.
For example, AL and AH are not treated as independent by the out-of-order
execution mechanism. An instruction that writes to part of a register will
therefore have a false dependence on any previous write to the same register or
any part of it."
This patch is a first important step towards improving the analysis of partial
register updates. It changes the semantic of RegisterFile descriptors in
tablegen, and teaches llvm-mca how to identify false dependences in the presence
of partial register writes (for more details: see the new code comments in
include/Target/TargetSchedule.h - class RegisterFile).
This patch doesn't address the case where a write to a part of a register is
followed by a read from the whole register. On Intel chips, high8 registers
(AH/BH/CH/DH)) can be stored in separate physical registers. However, a later
(dirty) read of the full register (example: AX/EAX) triggers a merge uOp, which
adds extra latency (and potentially affects the pipe usage).
This is a very interesting article on the subject with a very informative answer
from Peter Cordes:
https://stackoverflow.com/questions/45660139/how-exactly-do-partial-registers-on-haswell-skylake-perform-writing-al-seems-to
In future, the definition of RegisterFile can be extended with extra information
that may be used to identify delays caused by merge opcodes triggered by a dirty
read of a partial write.
Differential Revision: https://reviews.llvm.org/D49196
llvm-svn: 337123
2018-07-15 13:01:38 +02:00
|
|
|
};
|
|
|
|
|
2018-10-25 19:03:51 +02:00
|
|
|
if (all_of(getDefs(), IsDefReady))
|
2018-03-29 16:26:56 +02:00
|
|
|
Stage = IS_READY;
|
|
|
|
}
|
|
|
|
|
2018-03-08 14:05:02 +01:00
|
|
|
void Instruction::cycleEvent() {
|
2018-03-22 11:19:20 +01:00
|
|
|
if (isReady())
|
|
|
|
return;
|
|
|
|
|
2018-03-08 14:05:02 +01:00
|
|
|
if (isDispatched()) {
|
2018-10-25 19:03:51 +02:00
|
|
|
for (ReadState &Use : getUses())
|
|
|
|
Use.cycleEvent();
|
2018-06-27 13:17:07 +02:00
|
|
|
|
2018-11-22 13:48:57 +01:00
|
|
|
for (WriteState &Def : getDefs())
|
|
|
|
Def.cycleEvent();
|
|
|
|
|
[llvm-mca][BtVer2] teach how to identify false dependencies on partially written
registers.
The goal of this patch is to improve the throughput analysis in llvm-mca for the
case where instructions perform partial register writes.
On x86, partial register writes are quite difficult to model, mainly because
different processors tend to implement different register merging schemes in
hardware.
When the code contains partial register writes, the IPC (instructions per
cycles) estimated by llvm-mca tends to diverge quite significantly from the
observed IPC (using perf).
Modern AMD processors (at least, from Bulldozer onwards) don't rename partial
registers. Quoting Agner Fog's microarchitecture.pdf:
" The processor always keeps the different parts of an integer register together.
For example, AL and AH are not treated as independent by the out-of-order
execution mechanism. An instruction that writes to part of a register will
therefore have a false dependence on any previous write to the same register or
any part of it."
This patch is a first important step towards improving the analysis of partial
register updates. It changes the semantic of RegisterFile descriptors in
tablegen, and teaches llvm-mca how to identify false dependences in the presence
of partial register writes (for more details: see the new code comments in
include/Target/TargetSchedule.h - class RegisterFile).
This patch doesn't address the case where a write to a part of a register is
followed by a read from the whole register. On Intel chips, high8 registers
(AH/BH/CH/DH)) can be stored in separate physical registers. However, a later
(dirty) read of the full register (example: AX/EAX) triggers a merge uOp, which
adds extra latency (and potentially affects the pipe usage).
This is a very interesting article on the subject with a very informative answer
from Peter Cordes:
https://stackoverflow.com/questions/45660139/how-exactly-do-partial-registers-on-haswell-skylake-perform-writing-al-seems-to
In future, the definition of RegisterFile can be extended with extra information
that may be used to identify delays caused by merge opcodes triggered by a dirty
read of a partial write.
Differential Revision: https://reviews.llvm.org/D49196
llvm-svn: 337123
2018-07-15 13:01:38 +02:00
|
|
|
update();
|
2018-03-08 14:05:02 +01:00
|
|
|
return;
|
|
|
|
}
|
2018-03-22 11:19:20 +01:00
|
|
|
|
|
|
|
assert(isExecuting() && "Instruction not in-flight?");
|
|
|
|
assert(CyclesLeft && "Instruction already executed?");
|
2018-10-25 19:03:51 +02:00
|
|
|
for (WriteState &Def : getDefs())
|
|
|
|
Def.cycleEvent();
|
2018-03-22 11:19:20 +01:00
|
|
|
CyclesLeft--;
|
2018-03-08 14:05:02 +01:00
|
|
|
if (!CyclesLeft)
|
|
|
|
Stage = IS_EXECUTED;
|
|
|
|
}
|
2018-06-28 17:50:26 +02:00
|
|
|
|
|
|
|
// Out-of-line definition of the static member, set to the largest value
// representable by `unsigned`.
// NOTE(review): the name suggests this marks a WriteRef without a valid
// source index; confirm against the declaration in the header.
const unsigned WriteRef::INVALID_IID = std::numeric_limits<unsigned>::max();
|
|
|
|
|
2018-03-08 14:05:02 +01:00
|
|
|
} // namespace mca
|
2018-10-30 16:56:08 +01:00
|
|
|
} // namespace llvm
|