mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2025-02-01 05:01:59 +01:00
[MachineCombiner] Update instruction depths incrementally for large BBs.
Summary: For large basic blocks with lots of combinable instructions, the MachineTraceMetrics computations in MachineCombiner can dominate the compile time, as computing the trace information is quadratic in the number of instructions in a BB and it's relevant successors/predecessors. In most cases, knowing the instruction depth should be enough to make combination decisions. As we already iterate over all instructions in a basic block, the instruction depth can be computed incrementally. This reduces the cost of machine-combine drastically in cases where lots of instructions are combined. The major drawback is that AFAIK, computing the critical path length cannot be done incrementally. Therefore we only compute instruction depths incrementally, for basic blocks with more instructions than inc_threshold. The -machine-combiner-inc-threshold option can be used to set the threshold and allows for easier experimenting and checking if using incremental updates for all basic blocks has any impact on the performance. Reviewers: sanjoy, Gerolf, MatzeB, efriedma, fhahn Reviewed By: fhahn Subscribers: kiranchandramohan, javed.absar, efriedma, llvm-commits Differential Revision: https://reviews.llvm.org/D36619 llvm-svn: 312719
This commit is contained in:
parent
cc61b82fdc
commit
e0cb40c03e
@ -366,6 +366,12 @@ public:
|
||||
SparseSet<LiveRegUnit> &RegUnits);
|
||||
void updateDepth(const MachineBasicBlock *, const MachineInstr&,
|
||||
SparseSet<LiveRegUnit> &RegUnits);
|
||||
|
||||
/// Updates the depth of the instructions from Start to End.
|
||||
void updateDepths(MachineBasicBlock::iterator Start,
|
||||
MachineBasicBlock::iterator End,
|
||||
SparseSet<LiveRegUnit> &RegUnits);
|
||||
|
||||
};
|
||||
|
||||
/// Strategies for selecting traces.
|
||||
|
@ -22,6 +22,7 @@
|
||||
#include "llvm/CodeGen/MachineTraceMetrics.h"
|
||||
#include "llvm/CodeGen/Passes.h"
|
||||
#include "llvm/CodeGen/TargetSchedule.h"
|
||||
#include "llvm/Support/CommandLine.h"
|
||||
#include "llvm/Support/Debug.h"
|
||||
#include "llvm/Support/raw_ostream.h"
|
||||
#include "llvm/Target/TargetInstrInfo.h"
|
||||
@ -34,6 +35,11 @@ using namespace llvm;
|
||||
|
||||
STATISTIC(NumInstCombined, "Number of machineinst combined");
|
||||
|
||||
static cl::opt<unsigned>
|
||||
inc_threshold("machine-combiner-inc-threshold", cl::Hidden,
|
||||
cl::desc("Incremental depth computation will be used for basic "
|
||||
"blocks with more instructions."), cl::init(500));
|
||||
|
||||
namespace {
|
||||
class MachineCombiner : public MachineFunctionPass {
|
||||
const TargetInstrInfo *TII;
|
||||
@ -73,7 +79,7 @@ private:
|
||||
SmallVectorImpl<MachineInstr *> &InsInstrs,
|
||||
SmallVectorImpl<MachineInstr *> &DelInstrs,
|
||||
DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
|
||||
MachineCombinerPattern Pattern);
|
||||
MachineCombinerPattern Pattern, bool SlackIsAccurate);
|
||||
bool preservesResourceLen(MachineBasicBlock *MBB,
|
||||
MachineTraceMetrics::Trace BlockTrace,
|
||||
SmallVectorImpl<MachineInstr *> &InsInstrs,
|
||||
@ -247,7 +253,8 @@ bool MachineCombiner::improvesCriticalPathLen(
|
||||
SmallVectorImpl<MachineInstr *> &InsInstrs,
|
||||
SmallVectorImpl<MachineInstr *> &DelInstrs,
|
||||
DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
|
||||
MachineCombinerPattern Pattern) {
|
||||
MachineCombinerPattern Pattern,
|
||||
bool SlackIsAccurate) {
|
||||
assert(TSchedModel.hasInstrSchedModelOrItineraries() &&
|
||||
"Missing machine model\n");
|
||||
// NewRoot is the last instruction in the \p InsInstrs vector.
|
||||
@ -258,7 +265,7 @@ bool MachineCombiner::improvesCriticalPathLen(
|
||||
unsigned NewRootDepth = getDepth(InsInstrs, InstrIdxForVirtReg, BlockTrace);
|
||||
unsigned RootDepth = BlockTrace.getInstrCycles(*Root).Depth;
|
||||
|
||||
DEBUG(dbgs() << "DEPENDENCE DATA FOR " << Root << "\n";
|
||||
DEBUG(dbgs() << "DEPENDENCE DATA FOR " << *Root << "\n";
|
||||
dbgs() << " NewRootDepth: " << NewRootDepth << "\n";
|
||||
dbgs() << " RootDepth: " << RootDepth << "\n");
|
||||
|
||||
@ -281,17 +288,18 @@ bool MachineCombiner::improvesCriticalPathLen(
|
||||
RootLatency += TSchedModel.computeInstrLatency(I);
|
||||
|
||||
unsigned RootSlack = BlockTrace.getInstrSlack(*Root);
|
||||
|
||||
unsigned NewCycleCount = NewRootDepth + NewRootLatency;
|
||||
unsigned OldCycleCount = RootDepth + RootLatency +
|
||||
(SlackIsAccurate ? RootSlack : 0);
|
||||
DEBUG(dbgs() << " NewRootLatency: " << NewRootLatency << "\n";
|
||||
dbgs() << " RootLatency: " << RootLatency << "\n";
|
||||
dbgs() << " RootSlack: " << RootSlack << "\n";
|
||||
dbgs() << " RootSlack: " << RootSlack << " SlackIsAccurate="
|
||||
<< SlackIsAccurate << "\n";
|
||||
dbgs() << " NewRootDepth + NewRootLatency = "
|
||||
<< NewRootDepth + NewRootLatency << "\n";
|
||||
<< NewCycleCount << "\n";
|
||||
dbgs() << " RootDepth + RootLatency + RootSlack = "
|
||||
<< RootDepth + RootLatency + RootSlack << "\n";);
|
||||
|
||||
unsigned NewCycleCount = NewRootDepth + NewRootLatency;
|
||||
unsigned OldCycleCount = RootDepth + RootLatency + RootSlack;
|
||||
<< OldCycleCount << "\n";
|
||||
);
|
||||
|
||||
return NewCycleCount <= OldCycleCount;
|
||||
}
|
||||
@ -354,17 +362,44 @@ bool MachineCombiner::doSubstitute(unsigned NewSize, unsigned OldSize) {
|
||||
return false;
|
||||
}
|
||||
|
||||
/// Inserts InsInstrs and deletes DelInstrs. Incrementally updates instruction
|
||||
/// depths if requested.
|
||||
///
|
||||
/// \param MBB basic block to insert instructions in
|
||||
/// \param MI current machine instruction
|
||||
/// \param InsInstrs new instructions to insert in \p MBB
|
||||
/// \param DelInstrs instruction to delete from \p MBB
|
||||
/// \param MinInstr is a pointer to the machine trace information
|
||||
/// \param RegUnits set of live registers, needed to compute instruction depths
|
||||
/// \param IncrementalUpdate if true, compute instruction depths incrementally,
|
||||
/// otherwise invalidate the trace
|
||||
static void insertDeleteInstructions(MachineBasicBlock *MBB, MachineInstr &MI,
|
||||
SmallVector<MachineInstr *, 16> InsInstrs,
|
||||
SmallVector<MachineInstr *, 16> DelInstrs,
|
||||
MachineTraceMetrics *Traces) {
|
||||
MachineTraceMetrics::Ensemble *MinInstr,
|
||||
SparseSet<LiveRegUnit> &RegUnits,
|
||||
bool IncrementalUpdate) {
|
||||
for (auto *InstrPtr : InsInstrs)
|
||||
MBB->insert((MachineBasicBlock::iterator)&MI, InstrPtr);
|
||||
for (auto *InstrPtr : DelInstrs)
|
||||
|
||||
for (auto *InstrPtr : DelInstrs) {
|
||||
InstrPtr->eraseFromParentAndMarkDBGValuesForRemoval();
|
||||
++NumInstCombined;
|
||||
Traces->invalidate(MBB);
|
||||
Traces->verifyAnalysis();
|
||||
// Erase all LiveRegs defined by the removed instruction
|
||||
for (auto I = RegUnits.begin(); I != RegUnits.end(); ) {
|
||||
if (I->MI == InstrPtr)
|
||||
I = RegUnits.erase(I);
|
||||
else
|
||||
I++;
|
||||
}
|
||||
}
|
||||
|
||||
if (IncrementalUpdate)
|
||||
for (auto *InstrPtr : InsInstrs)
|
||||
MinInstr->updateDepth(MBB, *InstrPtr, RegUnits);
|
||||
else
|
||||
MinInstr->invalidate(MBB);
|
||||
|
||||
NumInstCombined++;
|
||||
}
|
||||
|
||||
/// Substitute a slow code sequence with a faster one by
|
||||
@ -378,9 +413,16 @@ bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) {
|
||||
bool Changed = false;
|
||||
DEBUG(dbgs() << "Combining MBB " << MBB->getName() << "\n");
|
||||
|
||||
bool IncrementalUpdate = false;
|
||||
auto BlockIter = MBB->begin();
|
||||
auto LastUpdate = BlockIter;
|
||||
// Check if the block is in a loop.
|
||||
const MachineLoop *ML = MLI->getLoopFor(MBB);
|
||||
if (!MinInstr)
|
||||
MinInstr = Traces->getEnsemble(MachineTraceMetrics::TS_MinInstrCount);
|
||||
|
||||
SparseSet<LiveRegUnit> RegUnits;
|
||||
RegUnits.setUniverse(TRI->getNumRegUnits());
|
||||
|
||||
while (BlockIter != MBB->end()) {
|
||||
auto &MI = *BlockIter++;
|
||||
@ -419,9 +461,6 @@ bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) {
|
||||
SmallVector<MachineInstr *, 16> InsInstrs;
|
||||
SmallVector<MachineInstr *, 16> DelInstrs;
|
||||
DenseMap<unsigned, unsigned> InstrIdxForVirtReg;
|
||||
if (!MinInstr)
|
||||
MinInstr = Traces->getEnsemble(MachineTraceMetrics::TS_MinInstrCount);
|
||||
Traces->verifyAnalysis();
|
||||
TII->genAlternativeCodeSequence(MI, P, InsInstrs, DelInstrs,
|
||||
InstrIdxForVirtReg);
|
||||
unsigned NewInstCount = InsInstrs.size();
|
||||
@ -436,23 +475,41 @@ bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) {
|
||||
if (ML && TII->isThroughputPattern(P))
|
||||
SubstituteAlways = true;
|
||||
|
||||
if (IncrementalUpdate) {
|
||||
// Update depths since the last incremental update.
|
||||
MinInstr->updateDepths(LastUpdate, std::next(BlockIter), RegUnits);
|
||||
LastUpdate = BlockIter;
|
||||
}
|
||||
|
||||
// Substitute when we optimize for codesize and the new sequence has
|
||||
// fewer instructions OR
|
||||
// the new sequence neither lengthens the critical path nor increases
|
||||
// resource pressure.
|
||||
if (SubstituteAlways || doSubstitute(NewInstCount, OldInstCount)) {
|
||||
insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, Traces);
|
||||
insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, MinInstr,
|
||||
RegUnits, IncrementalUpdate);
|
||||
// Eagerly stop after the first pattern fires.
|
||||
Changed = true;
|
||||
break;
|
||||
} else {
|
||||
// Calculating the trace metrics may be expensive,
|
||||
// so only do this when necessary.
|
||||
// For big basic blocks, we only compute the full trace the first time
|
||||
// we hit this. We do not invalidate the trace, but instead update the
|
||||
// instruction depths incrementally.
|
||||
// NOTE: Only the instruction depths up to MI are accurate. All other
|
||||
// trace information is not updated.
|
||||
MachineTraceMetrics::Trace BlockTrace = MinInstr->getTrace(MBB);
|
||||
Traces->verifyAnalysis();
|
||||
if (improvesCriticalPathLen(MBB, &MI, BlockTrace, InsInstrs, DelInstrs,
|
||||
InstrIdxForVirtReg, P) &&
|
||||
InstrIdxForVirtReg, P,
|
||||
!IncrementalUpdate) &&
|
||||
preservesResourceLen(MBB, BlockTrace, InsInstrs, DelInstrs)) {
|
||||
insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, Traces);
|
||||
if (MBB->size() > inc_threshold)
|
||||
// Use incremental depth updates for basic blocks above treshold
|
||||
IncrementalUpdate = true;
|
||||
|
||||
insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, MinInstr,
|
||||
RegUnits, IncrementalUpdate);
|
||||
|
||||
// Eagerly stop after the first pattern fires.
|
||||
Changed = true;
|
||||
break;
|
||||
@ -467,6 +524,8 @@ bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) {
|
||||
}
|
||||
}
|
||||
|
||||
if (Changed && IncrementalUpdate)
|
||||
Traces->invalidate(MBB);
|
||||
return Changed;
|
||||
}
|
||||
|
||||
|
@ -823,6 +823,14 @@ updateDepth(const MachineBasicBlock *MBB, const MachineInstr &UseMI,
|
||||
updateDepth(BlockInfo[MBB->getNumber()], UseMI, RegUnits);
|
||||
}
|
||||
|
||||
void MachineTraceMetrics::Ensemble::
|
||||
updateDepths(MachineBasicBlock::iterator Start,
|
||||
MachineBasicBlock::iterator End,
|
||||
SparseSet<LiveRegUnit> &RegUnits) {
|
||||
for (; Start != End; Start++)
|
||||
updateDepth(Start->getParent(), *Start, RegUnits);
|
||||
}
|
||||
|
||||
/// Compute instruction depths for all instructions above or in MBB in its
|
||||
/// trace. This assumes that the trace through MBB has already been computed.
|
||||
void MachineTraceMetrics::Ensemble::
|
||||
|
@ -1,5 +1,10 @@
|
||||
; RUN: llc -mtriple=aarch64-gnu-linux -mcpu=cortex-a57 -enable-unsafe-fp-math -disable-post-ra < %s | FileCheck %s
|
||||
|
||||
; Incremental updates of the instruction depths should be enough for this test
|
||||
; case.
|
||||
; RUN: llc -mtriple=aarch64-gnu-linux -mcpu=cortex-a57 -enable-unsafe-fp-math \
|
||||
; RUN: -disable-post-ra -machine-combiner-inc-threshold=0 < %s | FileCheck %s
|
||||
|
||||
; Verify that the first two adds are independent regardless of how the inputs are
|
||||
; commuted. The destination registers are used as source registers for the third add.
|
||||
|
||||
|
@ -1,6 +1,11 @@
|
||||
; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=sse -enable-unsafe-fp-math < %s | FileCheck %s --check-prefix=SSE
|
||||
; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=avx -enable-unsafe-fp-math < %s | FileCheck %s --check-prefix=AVX
|
||||
|
||||
; Incremental updates of the instruction depths should be enough for this test
|
||||
; case.
|
||||
; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=sse -enable-unsafe-fp-math -machine-combiner-inc-threshold=0 < %s | FileCheck %s --check-prefix=SSE
|
||||
; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=avx -enable-unsafe-fp-math -machine-combiner-inc-threshold=0 < %s | FileCheck %s --check-prefix=AVX
|
||||
|
||||
; Verify that the first two adds are independent regardless of how the inputs are
|
||||
; commuted. The destination registers are used as source registers for the third add.
|
||||
|
||||
|
@ -1,6 +1,9 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc < %s -mtriple=i686-unknown | FileCheck %s --check-prefix=X86
|
||||
; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=haswell| FileCheck %s --check-prefix=X64-HSW
|
||||
|
||||
; Incremental updates of the instruction depths should be enough for this test
|
||||
; case.
|
||||
; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=haswell -machine-combiner-inc-threshold=0| FileCheck %s --check-prefix=X64-HSW
|
||||
|
||||
; Function Attrs: norecurse nounwind readnone uwtable
|
||||
define i32 @mult(i32, i32) local_unnamed_addr #0 {
|
||||
|
Loading…
x
Reference in New Issue
Block a user