2012-07-07 06:00:00 +02:00
|
|
|
//===-- llvm/MC/MCSchedule.h - Scheduling -----------------------*- C++ -*-===//
|
|
|
|
//
|
|
|
|
// The LLVM Compiler Infrastructure
|
|
|
|
//
|
|
|
|
// This file is distributed under the University of Illinois Open Source
|
|
|
|
// License. See LICENSE.TXT for details.
|
|
|
|
//
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
//
|
|
|
|
// This file defines the classes used to describe a subtarget's machine model
|
|
|
|
// for scheduling and other instruction cost heuristics.
|
|
|
|
//
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
|
2013-01-10 01:45:19 +01:00
|
|
|
#ifndef LLVM_MC_MCSCHEDULE_H
|
|
|
|
#define LLVM_MC_MCSCHEDULE_H
|
2012-07-07 06:00:00 +02:00
|
|
|
|
2018-03-13 17:28:55 +01:00
|
|
|
#include "llvm/ADT/Optional.h"
|
2018-04-30 16:59:11 +02:00
|
|
|
#include "llvm/Config/llvm-config.h"
|
2012-07-07 06:00:00 +02:00
|
|
|
#include "llvm/Support/DataTypes.h"
|
2012-09-14 22:26:41 +02:00
|
|
|
#include <cassert>
|
2012-07-07 06:00:00 +02:00
|
|
|
|
|
|
|
namespace llvm {
|
|
|
|
|
|
|
|
struct InstrItinerary;
|
2018-03-13 16:22:13 +01:00
|
|
|
class MCSubtargetInfo;
|
2018-04-15 19:32:17 +02:00
|
|
|
class MCInstrInfo;
|
2018-05-31 15:30:42 +02:00
|
|
|
class MCInst;
|
2018-04-15 19:32:17 +02:00
|
|
|
class InstrItineraryData;
|
2012-07-07 06:00:00 +02:00
|
|
|
|
2012-09-14 22:26:41 +02:00
|
|
|
/// Define a kind of processor resource that will be modeled by the scheduler.
|
|
|
|
struct MCProcResourceDesc {
  const char *Name;
  unsigned NumUnits; // Number of resources of this kind.
  unsigned SuperIdx; // Index of the resource kind that contains this kind.

  // Number of resources that may be buffered.
  //
  // Buffered resources (BufferSize != 0) may be consumed at some indeterminate
  // cycle after dispatch. This should be used for out-of-order cpus when
  // instructions that use this resource can be buffered in a reservation
  // station.
  //
  // Unbuffered resources (BufferSize == 0) always consume their resource some
  // fixed number of cycles after dispatch. If a resource is unbuffered, then
  // the scheduler will avoid scheduling instructions with conflicting resources
  // in the same cycle. This is for in-order cpus, or the in-order portion of
  // an out-of-order cpu.
  int BufferSize;

  // If the resource has sub-units, a pointer to the first element of an array
  // of `NumUnits` elements containing the ProcResourceIdx of the sub units.
  // nullptr if the resource does not have sub-units.
  const unsigned *SubUnitsIdxBegin;

  // Two descriptors compare equal when NumUnits, SuperIdx and BufferSize
  // match. Name and SubUnitsIdxBegin do not take part in the comparison.
  bool operator==(const MCProcResourceDesc &Other) const {
    return NumUnits == Other.NumUnits && SuperIdx == Other.SuperIdx
      && BufferSize == Other.BufferSize;
  }
};
|
|
|
|
|
|
|
|
/// Identify one of the processor resource kinds consumed by a particular
|
|
|
|
/// scheduling class for the specified number of cycles.
|
|
|
|
struct MCWriteProcResEntry {
  uint16_t ProcResourceIdx; // Index of the processor resource kind consumed.
  uint16_t Cycles;          // Number of cycles the resource is consumed for.

  // Entries are equal when both the resource index and cycle count match.
  bool operator==(const MCWriteProcResEntry &Other) const {
    return ProcResourceIdx == Other.ProcResourceIdx && Cycles == Other.Cycles;
  }
};
|
|
|
|
|
|
|
|
/// Specify the latency in cpu cycles for a particular scheduling class and def
|
2012-10-17 19:27:10 +02:00
|
|
|
/// index. -1 indicates an invalid latency. Heuristics would typically consider
|
|
|
|
/// an instruction with invalid latency to have infinite latency. Also identify
|
|
|
|
/// the WriteResources of this def. When the operand expands to a sequence of
|
|
|
|
/// writes, this ID is the last write in the sequence.
|
2012-09-14 22:26:41 +02:00
|
|
|
struct MCWriteLatencyEntry {
  int16_t Cycles;           // Latency in cpu cycles; -1 marks an invalid latency.
  uint16_t WriteResourceID; // WriteResource of this def (last write when the
                            // operand expands to a sequence of writes).

  // Entries are equal when both the latency and the write resource ID match.
  bool operator==(const MCWriteLatencyEntry &Other) const {
    return Cycles == Other.Cycles && WriteResourceID == Other.WriteResourceID;
  }
};
|
|
|
|
|
|
|
|
/// Specify the number of cycles allowed after instruction issue before a
|
|
|
|
/// particular use operand reads its registers. This effectively reduces the
|
|
|
|
/// write's latency. Here we allow negative cycles for corner cases where
|
|
|
|
/// latency increases. This rule only applies when the entry's WriteResource
|
|
|
|
/// matches the write's WriteResource.
|
|
|
|
///
|
|
|
|
/// MCReadAdvanceEntries are sorted first by operand index (UseIdx), then by
|
|
|
|
/// WriteResourceIdx.
|
|
|
|
struct MCReadAdvanceEntry {
  unsigned UseIdx;          // Operand index of the use; primary sort key.
  unsigned WriteResourceID; // WriteResource this entry applies to.
  int Cycles;               // Cycles after issue before the operand is read;
                            // may be negative.

  // Entries are equal when all three fields match.
  bool operator==(const MCReadAdvanceEntry &Other) const {
    return UseIdx == Other.UseIdx && WriteResourceID == Other.WriteResourceID
      && Cycles == Other.Cycles;
  }
};
|
|
|
|
|
|
|
|
/// Summarize the scheduling resources required for an instruction of a
|
|
|
|
/// particular scheduling class.
|
|
|
|
///
|
|
|
|
/// Defined as an aggregate struct for creating tables with initializer lists.
|
|
|
|
struct MCSchedClassDesc {
  // Sentinel values for NumMicroOps. Both fit in the 14-bit field below:
  // the largest representable value marks an invalid class, and the value
  // just below it marks a variant class.
  static const unsigned short InvalidNumMicroOps = (1U << 14) - 1;
  static const unsigned short VariantNumMicroOps = InvalidNumMicroOps - 1;

  // The name is only present in builds with assertions or dump support.
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  const char* Name;
#endif
  uint16_t NumMicroOps : 14;
  bool BeginGroup : 1;
  bool EndGroup : 1;
  uint16_t WriteProcResIdx; // First index into WriteProcResTable.
  uint16_t NumWriteProcResEntries;
  uint16_t WriteLatencyIdx; // First index into WriteLatencyTable.
  uint16_t NumWriteLatencyEntries;
  uint16_t ReadAdvanceIdx; // First index into ReadAdvanceTable.
  uint16_t NumReadAdvanceEntries;

  // True unless NumMicroOps holds the InvalidNumMicroOps sentinel.
  bool isValid() const {
    return NumMicroOps != InvalidNumMicroOps;
  }
  // True when NumMicroOps holds the VariantNumMicroOps sentinel.
  bool isVariant() const {
    return NumMicroOps == VariantNumMicroOps;
  }
};
|
|
|
|
|
[MC][Tablegen] Allow the definition of processor register files in the scheduling model for llvm-mca
This patch allows the description of register files in processor scheduling
models. This addresses PR36662.
A new tablegen class named 'RegisterFile' has been added to TargetSchedule.td.
Targets can optionally describe register files for their processors using that
class. In particular, class RegisterFile allows to specify:
- The total number of physical registers.
- Which target registers are accessible through the register file.
- The cost of allocating a register at register renaming stage.
Example (from this patch - see file X86/X86ScheduleBtVer2.td)
def FpuPRF : RegisterFile<72, [VR64, VR128, VR256], [1, 1, 2]>
Here, FpuPRF describes a register file for MMX/XMM/YMM registers. On Jaguar
(btver2), a YMM register definition consumes 2 physical registers, while MMX/XMM
register definitions only cost 1 physical register.
The syntax allows to specify an empty set of register classes. An empty set of
register classes means: this register file models all the registers specified by
the Target. For each register class, users can specify an optional register
cost. By default, register costs default to 1. A value of 0 for the number of
physical registers means: "this register file has an unbounded number of
physical registers".
This patch is structured in two parts.
* Part 1 - MC/Tablegen *
A first part adds the tablegen definition of RegisterFile, and teaches the
SubtargetEmitter how to emit information related to register files.
Information about register files is accessible through an instance of
MCExtraProcessorInfo.
The idea behind this design is to logically partition the processor description
which is only used by external tools (like llvm-mca) from the processor
information used by the llvm machine schedulers.
I think that this design would make easier for targets to get rid of the extra
processor information if they don't want it.
* Part 2 - llvm-mca related *
The second part of this patch is related to changes to llvm-mca.
The main differences are:
1) class RegisterFile now needs to take into account the "cost of a register"
when allocating physical registers at register renaming stage.
2) Point 1. triggered a minor refactoring which lef to the removal of the
"maximum 32 register files" restriction.
3) The BackendStatistics view has been updated so that we can print out extra
details related to each register file implemented by the processor.
The effect of point 3. is also visible in tests register-files-[1..5].s.
Differential Revision: https://reviews.llvm.org/D44980
llvm-svn: 329067
2018-04-03 15:36:24 +02:00
|
|
|
/// Specify the cost of a register definition in terms of number of physical
|
|
|
|
/// registers allocated at register renaming stage. For example, AMD Jaguar
|
|
|
|
/// natively supports 128-bit data types, and operations on 256-bit registers
|
|
|
|
/// (i.e. YMM registers) are internally split into two COPs (complex operations)
|
|
|
|
/// and each COP updates a physical register. Basically, on Jaguar, a YMM
|
|
|
|
/// register write effectively consumes two physical registers. That means,
|
|
|
|
/// the cost of a YMM write in the BtVer2 model is 2.
|
|
|
|
struct MCRegisterCostEntry {
  unsigned RegisterClassID; // Register class this cost entry applies to.
  unsigned Cost;            // Number of physical registers consumed by a
                            // definition of a register in this class.
  bool AllowMoveElimination; // Whether register moves for this class may be
                             // eliminated at register renaming stage.
};
|
|
|
|
|
|
|
|
/// A register file descriptor.
|
|
|
|
///
|
|
|
|
/// This struct allows to describe processor register files. In particular, it
|
|
|
|
/// helps describe the size of the register file, as well as the cost of
|
|
|
|
/// allocating a register at register renaming stage.
|
|
|
|
/// FIXME: this struct can be extended to provide information about the number
|
|
|
|
/// of read/write ports to the register file. A value of zero for field
|
|
|
|
/// 'NumPhysRegs' means: this register file has an unbounded number of physical
|
|
|
|
/// registers.
|
|
|
|
struct MCRegisterFileDesc {
  const char *Name;
  // Number of physical registers; zero means the register file is unbounded.
  uint16_t NumPhysRegs;
  uint16_t NumRegisterCostEntries;
  // Index of the first cost entry in MCExtraProcessorInfo::RegisterCostTable.
  uint16_t RegisterCostEntryIdx;
  // A value of zero means: there is no limit in the number of moves that can be
  // eliminated every cycle.
  uint16_t MaxMovesEliminatedPerCycle;
  // True if this register file only knows how to optimize register moves from
  // known zero registers.
  bool AllowZeroMoveEliminationOnly;
};
|
|
|
|
|
|
|
|
/// Provide extra details about the machine processor.
|
|
|
|
///
|
|
|
|
/// This is a collection of "optional" processor information that is not
|
|
|
|
/// normally used by the LLVM machine schedulers, but that can be consumed by
|
|
|
|
/// external tools like llvm-mca to improve the quality of the performance
|
|
|
|
/// analysis.
|
|
|
|
struct MCExtraProcessorInfo {
|
2018-04-05 17:41:41 +02:00
|
|
|
// Actual size of the reorder buffer in hardware.
|
|
|
|
unsigned ReorderBufferSize;
|
|
|
|
// Number of instructions retired per cycle.
|
|
|
|
unsigned MaxRetirePerCycle;
|
[MC][Tablegen] Allow the definition of processor register files in the scheduling model for llvm-mca
This patch allows the description of register files in processor scheduling
models. This addresses PR36662.
A new tablegen class named 'RegisterFile' has been added to TargetSchedule.td.
Targets can optionally describe register files for their processors using that
class. In particular, class RegisterFile allows to specify:
- The total number of physical registers.
- Which target registers are accessible through the register file.
- The cost of allocating a register at register renaming stage.
Example (from this patch - see file X86/X86ScheduleBtVer2.td)
def FpuPRF : RegisterFile<72, [VR64, VR128, VR256], [1, 1, 2]>
Here, FpuPRF describes a register file for MMX/XMM/YMM registers. On Jaguar
(btver2), a YMM register definition consumes 2 physical registers, while MMX/XMM
register definitions only cost 1 physical register.
The syntax allows to specify an empty set of register classes. An empty set of
register classes means: this register file models all the registers specified by
the Target. For each register class, users can specify an optional register
cost. By default, register costs default to 1. A value of 0 for the number of
physical registers means: "this register file has an unbounded number of
physical registers".
This patch is structured in two parts.
* Part 1 - MC/Tablegen *
A first part adds the tablegen definition of RegisterFile, and teaches the
SubtargetEmitter how to emit information related to register files.
Information about register files is accessible through an instance of
MCExtraProcessorInfo.
The idea behind this design is to logically partition the processor description
which is only used by external tools (like llvm-mca) from the processor
information used by the llvm machine schedulers.
I think that this design would make easier for targets to get rid of the extra
processor information if they don't want it.
* Part 2 - llvm-mca related *
The second part of this patch is related to changes to llvm-mca.
The main differences are:
1) class RegisterFile now needs to take into account the "cost of a register"
when allocating physical registers at register renaming stage.
2) Point 1. triggered a minor refactoring which lef to the removal of the
"maximum 32 register files" restriction.
3) The BackendStatistics view has been updated so that we can print out extra
details related to each register file implemented by the processor.
The effect of point 3. is also visible in tests register-files-[1..5].s.
Differential Revision: https://reviews.llvm.org/D44980
llvm-svn: 329067
2018-04-03 15:36:24 +02:00
|
|
|
const MCRegisterFileDesc *RegisterFiles;
|
|
|
|
unsigned NumRegisterFiles;
|
|
|
|
const MCRegisterCostEntry *RegisterCostTable;
|
|
|
|
unsigned NumRegisterCostEntries;
|
[llvm-mca][MC] Add the ability to declare which processor resources model load/store queues (PR36666).
This patch adds the ability to specify via tablegen which processor resources
are load/store queue resources.
A new tablegen class named MemoryQueue can be optionally used to mark resources
that model load/store queues. Information about the load/store queue is
collected at 'CodeGenSchedule' stage, and analyzed by the 'SubtargetEmitter' to
initialize two new fields in struct MCExtraProcessorInfo named `LoadQueueID` and
`StoreQueueID`. Those two fields are identifiers for buffered resources used to
describe the load queue and the store queue.
Field `BufferSize` is interpreted as the number of entries in the queue, while
the number of units is a throughput indicator (i.e. number of available pickers
for loads/stores).
At construction time, LSUnit in llvm-mca checks for the presence of extra
processor information (i.e. MCExtraProcessorInfo) in the scheduling model. If
that information is available, and fields LoadQueueID and StoreQueueID are set
to a value different than zero (i.e. the invalid processor resource index), then
LSUnit initializes its LoadQueue/StoreQueue based on the BufferSize value
declared by the two processor resources.
With this patch, we more accurately track dynamic dispatch stalls caused by the
lack of LS tokens (i.e. load/store queue full). This is also shown by the
differences in two BdVer2 tests. Stalls that were previously classified as
generic SCHEDULER FULL stalls, are not correctly classified either as "load
queue full" or "store queue full".
About the differences in the -scheduler-stats view: those differences are
expected, because entries in the load/store queue are not released at
instruction issue stage. Instead, those are released at instruction executed
stage. This is the main reason why for the modified tests, the load/store
queues gets full before PdEx is full.
Differential Revision: https://reviews.llvm.org/D54957
llvm-svn: 347857
2018-11-29 13:15:56 +01:00
|
|
|
unsigned LoadQueueID;
|
|
|
|
unsigned StoreQueueID;
|
[MC][Tablegen] Allow the definition of processor register files in the scheduling model for llvm-mca
This patch allows the description of register files in processor scheduling
models. This addresses PR36662.
A new tablegen class named 'RegisterFile' has been added to TargetSchedule.td.
Targets can optionally describe register files for their processors using that
class. In particular, class RegisterFile allows to specify:
- The total number of physical registers.
- Which target registers are accessible through the register file.
- The cost of allocating a register at register renaming stage.
Example (from this patch - see file X86/X86ScheduleBtVer2.td)
def FpuPRF : RegisterFile<72, [VR64, VR128, VR256], [1, 1, 2]>
Here, FpuPRF describes a register file for MMX/XMM/YMM registers. On Jaguar
(btver2), a YMM register definition consumes 2 physical registers, while MMX/XMM
register definitions only cost 1 physical register.
The syntax allows to specify an empty set of register classes. An empty set of
register classes means: this register file models all the registers specified by
the Target. For each register class, users can specify an optional register
cost. By default, register costs default to 1. A value of 0 for the number of
physical registers means: "this register file has an unbounded number of
physical registers".
This patch is structured in two parts.
* Part 1 - MC/Tablegen *
A first part adds the tablegen definition of RegisterFile, and teaches the
SubtargetEmitter how to emit information related to register files.
Information about register files is accessible through an instance of
MCExtraProcessorInfo.
The idea behind this design is to logically partition the processor description
which is only used by external tools (like llvm-mca) from the processor
information used by the llvm machine schedulers.
I think that this design would make easier for targets to get rid of the extra
processor information if they don't want it.
* Part 2 - llvm-mca related *
The second part of this patch is related to changes to llvm-mca.
The main differences are:
1) class RegisterFile now needs to take into account the "cost of a register"
when allocating physical registers at register renaming stage.
2) Point 1. triggered a minor refactoring which lef to the removal of the
"maximum 32 register files" restriction.
3) The BackendStatistics view has been updated so that we can print out extra
details related to each register file implemented by the processor.
The effect of point 3. is also visible in tests register-files-[1..5].s.
Differential Revision: https://reviews.llvm.org/D44980
llvm-svn: 329067
2018-04-03 15:36:24 +02:00
|
|
|
};
|
|
|
|
|
2012-07-07 06:00:00 +02:00
|
|
|
/// Machine model for scheduling, bundling, and heuristics.
|
|
|
|
///
|
|
|
|
/// The machine model directly provides basic information about the
|
|
|
|
/// microarchitecture to the scheduler in the form of properties. It also
|
2012-09-14 22:26:41 +02:00
|
|
|
/// optionally refers to scheduler resource tables and itinerary
|
|
|
|
/// tables. Scheduler resource tables model the latency and cost for each
|
2013-10-22 17:18:03 +02:00
|
|
|
/// instruction type. Itinerary tables are an independent mechanism that
|
2012-07-07 06:00:00 +02:00
|
|
|
/// provides a detailed reservation table describing each cycle of instruction
|
|
|
|
/// execution. Subtargets may define any or all of the above categories of data
|
|
|
|
/// depending on the type of CPU and selected scheduler.
|
2018-05-18 17:57:54 +02:00
|
|
|
///
|
|
|
|
/// The machine independent properties defined here are used by the scheduler as
|
|
|
|
/// an abstract machine model. A real micro-architecture has a number of
|
|
|
|
/// buffers, queues, and stages. Declaring that a given machine-independent
|
|
|
|
/// abstract property corresponds to a specific physical property across all
|
|
|
|
/// subtargets can't be done. Nonetheless, the abstract model is
|
|
|
|
/// useful. Furthermore, subtargets typically extend this model with processor
|
|
|
|
/// specific resources to model any hardware features that can be exploited by
|
|
|
|
/// scheduling heuristics and aren't sufficiently represented in the abstract.
|
|
|
|
///
|
|
|
|
/// The abstract pipeline is built around the notion of an "issue point". This
|
|
|
|
/// is merely a reference point for counting machine cycles. The physical
|
|
|
|
/// machine will have pipeline stages that delay execution. The scheduler does
|
|
|
|
/// not model those delays because they are irrelevant as long as they are
|
|
|
|
/// consistent. Inaccuracies arise when instructions have different execution
|
|
|
|
/// delays relative to each other, in addition to their intrinsic latency. Those
|
|
|
|
/// special cases can be handled by TableGen constructs such as ReadAdvance,
|
|
|
|
/// which reduces latency when reading data, and ResourceCycles, which consumes
|
|
|
|
/// a processor resource when writing data for a number of abstract
|
|
|
|
/// cycles.
|
|
|
|
///
|
|
|
|
/// TODO: One tool currently missing is the ability to add a delay to
|
|
|
|
/// ResourceCycles. That would be easy to add and would likely cover all cases
|
|
|
|
/// currently handled by the legacy itinerary tables.
|
|
|
|
///
|
|
|
|
/// A note on out-of-order execution and, more generally, instruction
|
|
|
|
/// buffers. Part of the CPU pipeline is always in-order. The issue point, which
|
|
|
|
/// is the point of reference for counting cycles, only makes sense as an
|
|
|
|
/// in-order part of the pipeline. Other parts of the pipeline are sometimes
|
|
|
|
/// falling behind and sometimes catching up. It's only interesting to model
|
|
|
|
/// those other, decoupled parts of the pipeline if they may be predictably
|
|
|
|
/// resource constrained in a way that the scheduler can exploit.
|
|
|
|
///
|
|
|
|
/// The LLVM machine model distinguishes between in-order constraints and
|
|
|
|
/// out-of-order constraints so that the target's scheduling strategy can apply
|
|
|
|
/// appropriate heuristics. For a well-balanced CPU pipeline, out-of-order
|
|
|
|
/// resources would not typically be treated as a hard scheduling
|
|
|
|
/// constraint. For example, in the GenericScheduler, a delay caused by limited
|
|
|
|
/// out-of-order resources is not directly reflected in the number of cycles
|
|
|
|
/// that the scheduler sees between issuing an instruction and its dependent
|
|
|
|
/// instructions. In other words, out-of-order resources don't directly increase
|
|
|
|
/// the latency between pairs of instructions. However, they can still be used
|
|
|
|
/// to detect potential bottlenecks across a sequence of instructions and bias
|
|
|
|
/// the scheduling heuristics appropriately.
|
2014-09-02 19:43:54 +02:00
|
|
|
struct MCSchedModel {
|
2012-07-07 06:00:00 +02:00
|
|
|
// IssueWidth is the maximum number of instructions that may be scheduled in
|
2018-05-18 17:57:54 +02:00
|
|
|
// the same per-cycle group. This is meant to be a hard in-order constraint
|
|
|
|
// (a.k.a. "hazard"). In the GenericScheduler strategy, no more than
|
|
|
|
// IssueWidth micro-ops can ever be scheduled in a particular cycle.
|
|
|
|
//
|
|
|
|
// In practice, IssueWidth is useful to model any bottleneck between the
|
|
|
|
// decoder (after micro-op expansion) and the out-of-order reservation
|
|
|
|
// stations or the decoder bandwidth itself. If the total number of
|
|
|
|
// reservation stations is also a bottleneck, or if any other pipeline stage
|
|
|
|
// has a bandwidth limitation, then that can be naturally modeled by adding an
|
|
|
|
// out-of-order processor resource.
|
2012-07-07 06:00:00 +02:00
|
|
|
unsigned IssueWidth;
|
|
|
|
static const unsigned DefaultIssueWidth = 1;
|
|
|
|
|
2013-06-15 06:49:57 +02:00
|
|
|
// MicroOpBufferSize is the number of micro-ops that the processor may buffer
|
|
|
|
// for out-of-order execution.
|
2012-07-07 06:00:00 +02:00
|
|
|
//
|
2013-06-15 06:49:57 +02:00
|
|
|
// "0" means operations that are not ready in this cycle are not considered
|
|
|
|
// for scheduling (they go in the pending queue). Latency is paramount. This
|
|
|
|
// may be more efficient if many instructions are pending in a schedule.
|
2012-07-07 06:00:00 +02:00
|
|
|
//
|
2013-06-15 06:49:57 +02:00
|
|
|
// "1" means all instructions are considered for scheduling regardless of
|
|
|
|
// whether they are ready in this cycle. Latency still causes issue stalls,
|
|
|
|
// but we balance those stalls against other heuristics.
|
2012-07-07 06:00:00 +02:00
|
|
|
//
|
2013-06-15 06:49:57 +02:00
|
|
|
// "> 1" means the processor is out-of-order. This is a machine independent
|
2013-12-05 18:55:47 +01:00
|
|
|
// estimate of highly machine specific characteristics such as the register
|
2013-06-15 06:49:57 +02:00
|
|
|
// renaming pool and reorder buffer.
|
|
|
|
unsigned MicroOpBufferSize;
|
|
|
|
static const unsigned DefaultMicroOpBufferSize = 0;
|
2012-07-07 06:00:00 +02:00
|
|
|
|
2014-05-08 11:14:44 +02:00
|
|
|
// LoopMicroOpBufferSize is the number of micro-ops that the processor may
|
|
|
|
// buffer for optimized loop execution. More generally, this represents the
|
|
|
|
// optimal number of micro-ops in a loop body. A loop may be partially
|
|
|
|
// unrolled to bring the count of micro-ops in the loop body closer to this
|
|
|
|
// number.
|
|
|
|
unsigned LoopMicroOpBufferSize;
|
|
|
|
static const unsigned DefaultLoopMicroOpBufferSize = 0;
|
|
|
|
|
2012-07-07 06:00:00 +02:00
|
|
|
// LoadLatency is the expected latency of load instructions.
|
|
|
|
unsigned LoadLatency;
|
|
|
|
static const unsigned DefaultLoadLatency = 4;
|
|
|
|
|
|
|
|
// HighLatency is the expected latency of "very high latency" operations.
|
|
|
|
// See TargetInstrInfo::isHighLatencyDef().
|
|
|
|
// By default, this is set to an arbitrarily high number of cycles
|
|
|
|
// likely to have some impact on scheduling heuristics.
|
|
|
|
unsigned HighLatency;
|
|
|
|
static const unsigned DefaultHighLatency = 10;
|
|
|
|
|
2012-08-08 04:44:16 +02:00
|
|
|
// MispredictPenalty is the typical number of extra cycles the processor
|
|
|
|
// takes to recover from a branch misprediction.
|
|
|
|
unsigned MispredictPenalty;
|
|
|
|
static const unsigned DefaultMispredictPenalty = 10;
|
2015-10-05 06:43:48 +02:00
|
|
|
|
2014-07-16 00:39:58 +02:00
|
|
|
bool PostRAScheduler; // default value is false
|
2012-08-08 04:44:16 +02:00
|
|
|
|
2013-09-25 20:14:12 +02:00
|
|
|
// CompleteModel is the flag reported by isComplete(): whether this machine
// model has data for all instructions with a scheduling class.
bool CompleteModel;
|
|
|
|
|
2012-09-14 22:26:41 +02:00
|
|
|
// ProcID is the processor identifier returned by getProcessorID().
unsigned ProcID;
|
|
|
|
// ProcResourceTable is the array indexed by getProcResource(); valid indices
// are [0, NumProcResourceKinds).
const MCProcResourceDesc *ProcResourceTable;
|
|
|
|
// SchedClassTable is the array indexed by getSchedClassDesc(); valid indices
// are [0, NumSchedClasses). A null table means the model has no
// instruction-level scheduling data (see hasInstrSchedModel()).
const MCSchedClassDesc *SchedClassTable;
|
|
|
|
// Number of entries in ProcResourceTable.
unsigned NumProcResourceKinds;
|
|
|
|
// Number of entries in SchedClassTable.
unsigned NumSchedClasses;
|
2012-07-07 06:00:00 +02:00
|
|
|
// Instruction itinerary tables used by InstrItineraryData.
|
|
|
|
friend class InstrItineraryData;
|
|
|
|
const InstrItinerary *InstrItineraries;
|
|
|
|
|
[MC][Tablegen] Allow the definition of processor register files in the scheduling model for llvm-mca
This patch allows the description of register files in processor scheduling
models. This addresses PR36662.
A new tablegen class named 'RegisterFile' has been added to TargetSchedule.td.
Targets can optionally describe register files for their processors using that
class. In particular, class RegisterFile allows to specify:
- The total number of physical registers.
- Which target registers are accessible through the register file.
- The cost of allocating a register at register renaming stage.
Example (from this patch - see file X86/X86ScheduleBtVer2.td)
def FpuPRF : RegisterFile<72, [VR64, VR128, VR256], [1, 1, 2]>
Here, FpuPRF describes a register file for MMX/XMM/YMM registers. On Jaguar
(btver2), a YMM register definition consumes 2 physical registers, while MMX/XMM
register definitions only cost 1 physical register.
The syntax allows to specify an empty set of register classes. An empty set of
register classes means: this register file models all the registers specified by
the Target. For each register class, users can specify an optional register
cost. By default, register costs default to 1. A value of 0 for the number of
physical registers means: "this register file has an unbounded number of
physical registers".
This patch is structured in two parts.
* Part 1 - MC/Tablegen *
A first part adds the tablegen definition of RegisterFile, and teaches the
SubtargetEmitter how to emit information related to register files.
Information about register files is accessible through an instance of
MCExtraProcessorInfo.
The idea behind this design is to logically partition the processor description
which is only used by external tools (like llvm-mca) from the processor
information used by the llvm machine schedulers.
I think that this design would make easier for targets to get rid of the extra
processor information if they don't want it.
* Part 2 - llvm-mca related *
The second part of this patch is related to changes to llvm-mca.
The main differences are:
1) class RegisterFile now needs to take into account the "cost of a register"
when allocating physical registers at register renaming stage.
2) Point 1. triggered a minor refactoring which led to the removal of the
"maximum 32 register files" restriction.
3) The BackendStatistics view has been updated so that we can print out extra
details related to each register file implemented by the processor.
The effect of point 3. is also visible in tests register-files-[1..5].s.
Differential Revision: https://reviews.llvm.org/D44980
llvm-svn: 329067
2018-04-03 15:36:24 +02:00
|
|
|
// ExtraProcessorInfo optionally points to additional processor information.
// It may be null: hasExtraProcessorInfo() tests for that, and
// getExtraProcessorInfo() asserts on it before dereferencing.
const MCExtraProcessorInfo *ExtraProcessorInfo;
|
|
|
|
|
|
|
|
/// Return true if this model carries extra processor information, i.e. the
/// ExtraProcessorInfo pointer is non-null.
bool hasExtraProcessorInfo() const { return ExtraProcessorInfo; }
|
|
|
|
|
2012-09-14 22:26:46 +02:00
|
|
|
/// Return the ProcID stored in this model.
unsigned getProcessorID() const { return ProcID; }
|
|
|
|
|
2012-09-14 22:26:41 +02:00
|
|
|
/// Return true if this machine model includes instruction-level scheduling
/// data, i.e. SchedClassTable is non-null.
|
2012-09-18 06:18:39 +02:00
|
|
|
bool hasInstrSchedModel() const { return SchedClassTable; }
|
2012-09-14 22:26:41 +02:00
|
|
|
|
[MC][Tablegen] Allow the definition of processor register files in the scheduling model for llvm-mca
This patch allows the description of register files in processor scheduling
models. This addresses PR36662.
A new tablegen class named 'RegisterFile' has been added to TargetSchedule.td.
Targets can optionally describe register files for their processors using that
class. In particular, class RegisterFile allows to specify:
- The total number of physical registers.
- Which target registers are accessible through the register file.
- The cost of allocating a register at register renaming stage.
Example (from this patch - see file X86/X86ScheduleBtVer2.td)
def FpuPRF : RegisterFile<72, [VR64, VR128, VR256], [1, 1, 2]>
Here, FpuPRF describes a register file for MMX/XMM/YMM registers. On Jaguar
(btver2), a YMM register definition consumes 2 physical registers, while MMX/XMM
register definitions only cost 1 physical register.
The syntax allows to specify an empty set of register classes. An empty set of
register classes means: this register file models all the registers specified by
the Target. For each register class, users can specify an optional register
cost. By default, register costs default to 1. A value of 0 for the number of
physical registers means: "this register file has an unbounded number of
physical registers".
This patch is structured in two parts.
* Part 1 - MC/Tablegen *
A first part adds the tablegen definition of RegisterFile, and teaches the
SubtargetEmitter how to emit information related to register files.
Information about register files is accessible through an instance of
MCExtraProcessorInfo.
The idea behind this design is to logically partition the processor description
which is only used by external tools (like llvm-mca) from the processor
information used by the llvm machine schedulers.
I think that this design would make easier for targets to get rid of the extra
processor information if they don't want it.
* Part 2 - llvm-mca related *
The second part of this patch is related to changes to llvm-mca.
The main differences are:
1) class RegisterFile now needs to take into account the "cost of a register"
when allocating physical registers at register renaming stage.
2) Point 1. triggered a minor refactoring which led to the removal of the
"maximum 32 register files" restriction.
3) The BackendStatistics view has been updated so that we can print out extra
details related to each register file implemented by the processor.
The effect of point 3. is also visible in tests register-files-[1..5].s.
Differential Revision: https://reviews.llvm.org/D44980
llvm-svn: 329067
2018-04-03 15:36:24 +02:00
|
|
|
/// Return a reference to the extra processor information of this model.
///
/// Asserts that hasExtraProcessorInfo() is true; callers should check it
/// first, because the pointer is dereferenced unconditionally.
const MCExtraProcessorInfo &getExtraProcessorInfo() const {
|
|
|
|
assert(hasExtraProcessorInfo() &&
|
|
|
|
"No extra information available for this model");
|
|
|
|
return *ExtraProcessorInfo;
|
|
|
|
}
|
|
|
|
|
2013-09-25 20:14:12 +02:00
|
|
|
/// Return true if this machine model includes data for all instructions with
|
|
|
|
/// a scheduling class (itinerary class or SchedRW list).
|
|
|
|
bool isComplete() const { return CompleteModel; }
|
|
|
|
|
2015-07-18 01:18:30 +02:00
|
|
|
/// Return true if the machine supports out-of-order execution, which this
/// model expresses as MicroOpBufferSize > 1.
|
|
|
|
bool isOutOfOrder() const { return MicroOpBufferSize > 1; }
|
|
|
|
|
2012-11-06 08:10:38 +01:00
|
|
|
/// Return the number of processor resource kinds in this model; this is the
/// exclusive upper bound for indices accepted by getProcResource().
unsigned getNumProcResourceKinds() const {
|
|
|
|
return NumProcResourceKinds;
|
|
|
|
}
|
|
|
|
|
2012-09-14 22:26:41 +02:00
|
|
|
/// Return a pointer to the ProcResourceTable entry at \p ProcResourceIdx.
///
/// Asserts that the model has an instruction scheduling model and that
/// \p ProcResourceIdx is less than NumProcResourceKinds.
const MCProcResourceDesc *getProcResource(unsigned ProcResourceIdx) const {
|
|
|
|
assert(hasInstrSchedModel() && "No scheduling machine model");
|
|
|
|

|
|
|
assert(ProcResourceIdx < NumProcResourceKinds && "bad proc resource idx");
|
|
|
|
return &ProcResourceTable[ProcResourceIdx];
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Return a pointer to the SchedClassTable entry at \p SchedClassIdx.
///
/// Asserts that the model has an instruction scheduling model and that
/// \p SchedClassIdx is less than NumSchedClasses.
const MCSchedClassDesc *getSchedClassDesc(unsigned SchedClassIdx) const {
|
|
|
|
assert(hasInstrSchedModel() && "No scheduling machine model");
|
|
|
|

|
|
|
assert(SchedClassIdx < NumSchedClasses && "bad scheduling class idx");
|
|
|
|
return &SchedClassTable[SchedClassIdx];
|
|
|
|
}
|
2014-09-02 19:43:54 +02:00
|
|
|
|
2018-03-13 16:22:13 +01:00
|
|
|
/// Returns the latency value for the scheduling class.
|
|
|
|
static int computeInstrLatency(const MCSubtargetInfo &STI,
|
|
|
|
const MCSchedClassDesc &SCDesc);
|
|
|
|
|
2018-04-15 19:32:17 +02:00
|
|
|
int computeInstrLatency(const MCSubtargetInfo &STI, unsigned SClass) const;
|
2018-05-31 15:30:42 +02:00
|
|
|
int computeInstrLatency(const MCSubtargetInfo &STI, const MCInstrInfo &MCII,
|
|
|
|
const MCInst &Inst) const;
|
2018-04-15 19:32:17 +02:00
|
|
|
|
|
|
|
// Returns the reciprocal throughput information from an MCSchedClassDesc.
|
2018-06-06 01:34:45 +02:00
|
|
|
static double
|
2018-03-13 17:28:55 +01:00
|
|
|
getReciprocalThroughput(const MCSubtargetInfo &STI,
|
|
|
|
const MCSchedClassDesc &SCDesc);
|
|
|
|
|
2018-06-06 01:34:45 +02:00
|
|
|
static double
|
2018-04-15 19:32:17 +02:00
|
|
|
getReciprocalThroughput(unsigned SchedClass, const InstrItineraryData &IID);
|
|
|
|
|
2018-06-06 01:34:45 +02:00
|
|
|
double
|
2018-05-31 15:30:42 +02:00
|
|
|
getReciprocalThroughput(const MCSubtargetInfo &STI, const MCInstrInfo &MCII,
|
|
|
|
const MCInst &Inst) const;
|
|
|
|
|
2015-07-11 00:13:43 +02:00
|
|
|
/// Returns the default initialized model, i.e. a reference to the static
/// member Default.
|
|
|
|
static const MCSchedModel &GetDefaultSchedModel() { return Default; }
|
|
|
|
static const MCSchedModel Default;
|
2012-07-07 06:00:00 +02:00
|
|
|
};
|
|
|
|
|
2018-04-15 19:32:17 +02:00
|
|
|
} // namespace llvm
|
2012-07-07 06:00:00 +02:00
|
|
|
|
|
|
|
#endif
|