mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2025-01-31 12:41:49 +01:00
Recommit [MachineCombiner] Update instruction depths incrementally for large BBs.
This version of the patch fixes an off-by-one error causing PR34596. We do not need to use std::next(BlockIter) when calling updateDepths, as BlockIter already points to the next element. Original commit message: > For large basic blocks with lots of combinable instructions, the > MachineTraceMetrics computations in MachineCombiner can dominate the compile > time, as computing the trace information is quadratic in the number of > instructions in a BB and it's relevant successors/predecessors. > In most cases, knowing the instruction depth should be enough to make > combination decisions. As we already iterate over all instructions in a basic > block, the instruction depth can be computed incrementally. This reduces the > cost of machine-combine drastically in cases where lots of instructions > are combined. The major drawback is that AFAIK, computing the critical path > length cannot be done incrementally. Therefore we only compute > instruction depths incrementally, for basic blocks with more > instructions than inc_threshold. The -machine-combiner-inc-threshold > option can be used to set the threshold and allows for easier > experimenting and checking if using incremental updates for all basic > blocks has any impact on the performance. > > Reviewers: sanjoy, Gerolf, MatzeB, efriedma, fhahn > > Reviewed By: fhahn > > Subscribers: kiranchandramohan, javed.absar, efriedma, llvm-commits > > Differential Revision: https://reviews.llvm.org/D36619 llvm-svn: 313751
This commit is contained in:
parent
7765db43c2
commit
6c9eb841b2
@ -366,6 +366,12 @@ public:
|
||||
SparseSet<LiveRegUnit> &RegUnits);
|
||||
void updateDepth(const MachineBasicBlock *, const MachineInstr&,
|
||||
SparseSet<LiveRegUnit> &RegUnits);
|
||||
|
||||
/// Updates the depth of the instructions from Start to End.
|
||||
void updateDepths(MachineBasicBlock::iterator Start,
|
||||
MachineBasicBlock::iterator End,
|
||||
SparseSet<LiveRegUnit> &RegUnits);
|
||||
|
||||
};
|
||||
|
||||
/// Strategies for selecting traces.
|
||||
|
@ -22,6 +22,7 @@
|
||||
#include "llvm/CodeGen/MachineTraceMetrics.h"
|
||||
#include "llvm/CodeGen/Passes.h"
|
||||
#include "llvm/CodeGen/TargetSchedule.h"
|
||||
#include "llvm/Support/CommandLine.h"
|
||||
#include "llvm/Support/Debug.h"
|
||||
#include "llvm/Support/raw_ostream.h"
|
||||
#include "llvm/Target/TargetInstrInfo.h"
|
||||
@ -34,6 +35,11 @@ using namespace llvm;
|
||||
|
||||
STATISTIC(NumInstCombined, "Number of machineinst combined");
|
||||
|
||||
static cl::opt<unsigned>
|
||||
inc_threshold("machine-combiner-inc-threshold", cl::Hidden,
|
||||
cl::desc("Incremental depth computation will be used for basic "
|
||||
"blocks with more instructions."), cl::init(500));
|
||||
|
||||
namespace {
|
||||
class MachineCombiner : public MachineFunctionPass {
|
||||
const TargetInstrInfo *TII;
|
||||
@ -73,7 +79,7 @@ private:
|
||||
SmallVectorImpl<MachineInstr *> &InsInstrs,
|
||||
SmallVectorImpl<MachineInstr *> &DelInstrs,
|
||||
DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
|
||||
MachineCombinerPattern Pattern);
|
||||
MachineCombinerPattern Pattern, bool SlackIsAccurate);
|
||||
bool preservesResourceLen(MachineBasicBlock *MBB,
|
||||
MachineTraceMetrics::Trace BlockTrace,
|
||||
SmallVectorImpl<MachineInstr *> &InsInstrs,
|
||||
@ -247,7 +253,8 @@ bool MachineCombiner::improvesCriticalPathLen(
|
||||
SmallVectorImpl<MachineInstr *> &InsInstrs,
|
||||
SmallVectorImpl<MachineInstr *> &DelInstrs,
|
||||
DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
|
||||
MachineCombinerPattern Pattern) {
|
||||
MachineCombinerPattern Pattern,
|
||||
bool SlackIsAccurate) {
|
||||
assert(TSchedModel.hasInstrSchedModelOrItineraries() &&
|
||||
"Missing machine model\n");
|
||||
// NewRoot is the last instruction in the \p InsInstrs vector.
|
||||
@ -258,7 +265,7 @@ bool MachineCombiner::improvesCriticalPathLen(
|
||||
unsigned NewRootDepth = getDepth(InsInstrs, InstrIdxForVirtReg, BlockTrace);
|
||||
unsigned RootDepth = BlockTrace.getInstrCycles(*Root).Depth;
|
||||
|
||||
DEBUG(dbgs() << "DEPENDENCE DATA FOR " << Root << "\n";
|
||||
DEBUG(dbgs() << "DEPENDENCE DATA FOR " << *Root << "\n";
|
||||
dbgs() << " NewRootDepth: " << NewRootDepth << "\n";
|
||||
dbgs() << " RootDepth: " << RootDepth << "\n");
|
||||
|
||||
@ -281,17 +288,18 @@ bool MachineCombiner::improvesCriticalPathLen(
|
||||
RootLatency += TSchedModel.computeInstrLatency(I);
|
||||
|
||||
unsigned RootSlack = BlockTrace.getInstrSlack(*Root);
|
||||
|
||||
unsigned NewCycleCount = NewRootDepth + NewRootLatency;
|
||||
unsigned OldCycleCount = RootDepth + RootLatency +
|
||||
(SlackIsAccurate ? RootSlack : 0);
|
||||
DEBUG(dbgs() << " NewRootLatency: " << NewRootLatency << "\n";
|
||||
dbgs() << " RootLatency: " << RootLatency << "\n";
|
||||
dbgs() << " RootSlack: " << RootSlack << "\n";
|
||||
dbgs() << " RootSlack: " << RootSlack << " SlackIsAccurate="
|
||||
<< SlackIsAccurate << "\n";
|
||||
dbgs() << " NewRootDepth + NewRootLatency = "
|
||||
<< NewRootDepth + NewRootLatency << "\n";
|
||||
<< NewCycleCount << "\n";
|
||||
dbgs() << " RootDepth + RootLatency + RootSlack = "
|
||||
<< RootDepth + RootLatency + RootSlack << "\n";);
|
||||
|
||||
unsigned NewCycleCount = NewRootDepth + NewRootLatency;
|
||||
unsigned OldCycleCount = RootDepth + RootLatency + RootSlack;
|
||||
<< OldCycleCount << "\n";
|
||||
);
|
||||
|
||||
return NewCycleCount <= OldCycleCount;
|
||||
}
|
||||
@ -354,17 +362,44 @@ bool MachineCombiner::doSubstitute(unsigned NewSize, unsigned OldSize) {
|
||||
return false;
|
||||
}
|
||||
|
||||
/// Inserts InsInstrs and deletes DelInstrs. Incrementally updates instruction
|
||||
/// depths if requested.
|
||||
///
|
||||
/// \param MBB basic block to insert instructions in
|
||||
/// \param MI current machine instruction
|
||||
/// \param InsInstrs new instructions to insert in \p MBB
|
||||
/// \param DelInstrs instruction to delete from \p MBB
|
||||
/// \param MinInstr is a pointer to the machine trace information
|
||||
/// \param RegUnits set of live registers, needed to compute instruction depths
|
||||
/// \param IncrementalUpdate if true, compute instruction depths incrementally,
|
||||
/// otherwise invalidate the trace
|
||||
static void insertDeleteInstructions(MachineBasicBlock *MBB, MachineInstr &MI,
|
||||
SmallVector<MachineInstr *, 16> InsInstrs,
|
||||
SmallVector<MachineInstr *, 16> DelInstrs,
|
||||
MachineTraceMetrics *Traces) {
|
||||
MachineTraceMetrics::Ensemble *MinInstr,
|
||||
SparseSet<LiveRegUnit> &RegUnits,
|
||||
bool IncrementalUpdate) {
|
||||
for (auto *InstrPtr : InsInstrs)
|
||||
MBB->insert((MachineBasicBlock::iterator)&MI, InstrPtr);
|
||||
for (auto *InstrPtr : DelInstrs)
|
||||
|
||||
for (auto *InstrPtr : DelInstrs) {
|
||||
InstrPtr->eraseFromParentAndMarkDBGValuesForRemoval();
|
||||
++NumInstCombined;
|
||||
Traces->invalidate(MBB);
|
||||
Traces->verifyAnalysis();
|
||||
// Erase all LiveRegs defined by the removed instruction
|
||||
for (auto I = RegUnits.begin(); I != RegUnits.end(); ) {
|
||||
if (I->MI == InstrPtr)
|
||||
I = RegUnits.erase(I);
|
||||
else
|
||||
I++;
|
||||
}
|
||||
}
|
||||
|
||||
if (IncrementalUpdate)
|
||||
for (auto *InstrPtr : InsInstrs)
|
||||
MinInstr->updateDepth(MBB, *InstrPtr, RegUnits);
|
||||
else
|
||||
MinInstr->invalidate(MBB);
|
||||
|
||||
NumInstCombined++;
|
||||
}
|
||||
|
||||
/// Substitute a slow code sequence with a faster one by
|
||||
@ -378,9 +413,16 @@ bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) {
|
||||
bool Changed = false;
|
||||
DEBUG(dbgs() << "Combining MBB " << MBB->getName() << "\n");
|
||||
|
||||
bool IncrementalUpdate = false;
|
||||
auto BlockIter = MBB->begin();
|
||||
auto LastUpdate = BlockIter;
|
||||
// Check if the block is in a loop.
|
||||
const MachineLoop *ML = MLI->getLoopFor(MBB);
|
||||
if (!MinInstr)
|
||||
MinInstr = Traces->getEnsemble(MachineTraceMetrics::TS_MinInstrCount);
|
||||
|
||||
SparseSet<LiveRegUnit> RegUnits;
|
||||
RegUnits.setUniverse(TRI->getNumRegUnits());
|
||||
|
||||
while (BlockIter != MBB->end()) {
|
||||
auto &MI = *BlockIter++;
|
||||
@ -419,9 +461,6 @@ bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) {
|
||||
SmallVector<MachineInstr *, 16> InsInstrs;
|
||||
SmallVector<MachineInstr *, 16> DelInstrs;
|
||||
DenseMap<unsigned, unsigned> InstrIdxForVirtReg;
|
||||
if (!MinInstr)
|
||||
MinInstr = Traces->getEnsemble(MachineTraceMetrics::TS_MinInstrCount);
|
||||
Traces->verifyAnalysis();
|
||||
TII->genAlternativeCodeSequence(MI, P, InsInstrs, DelInstrs,
|
||||
InstrIdxForVirtReg);
|
||||
unsigned NewInstCount = InsInstrs.size();
|
||||
@ -436,23 +475,41 @@ bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) {
|
||||
if (ML && TII->isThroughputPattern(P))
|
||||
SubstituteAlways = true;
|
||||
|
||||
if (IncrementalUpdate) {
|
||||
// Update depths since the last incremental update.
|
||||
MinInstr->updateDepths(LastUpdate, BlockIter, RegUnits);
|
||||
LastUpdate = BlockIter;
|
||||
}
|
||||
|
||||
// Substitute when we optimize for codesize and the new sequence has
|
||||
// fewer instructions OR
|
||||
// the new sequence neither lengthens the critical path nor increases
|
||||
// resource pressure.
|
||||
if (SubstituteAlways || doSubstitute(NewInstCount, OldInstCount)) {
|
||||
insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, Traces);
|
||||
insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, MinInstr,
|
||||
RegUnits, IncrementalUpdate);
|
||||
// Eagerly stop after the first pattern fires.
|
||||
Changed = true;
|
||||
break;
|
||||
} else {
|
||||
// Calculating the trace metrics may be expensive,
|
||||
// so only do this when necessary.
|
||||
// For big basic blocks, we only compute the full trace the first time
|
||||
// we hit this. We do not invalidate the trace, but instead update the
|
||||
// instruction depths incrementally.
|
||||
// NOTE: Only the instruction depths up to MI are accurate. All other
|
||||
// trace information is not updated.
|
||||
MachineTraceMetrics::Trace BlockTrace = MinInstr->getTrace(MBB);
|
||||
Traces->verifyAnalysis();
|
||||
if (improvesCriticalPathLen(MBB, &MI, BlockTrace, InsInstrs, DelInstrs,
|
||||
InstrIdxForVirtReg, P) &&
|
||||
InstrIdxForVirtReg, P,
|
||||
!IncrementalUpdate) &&
|
||||
preservesResourceLen(MBB, BlockTrace, InsInstrs, DelInstrs)) {
|
||||
insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, Traces);
|
||||
if (MBB->size() > inc_threshold)
|
||||
// Use incremental depth updates for basic blocks above treshold
|
||||
IncrementalUpdate = true;
|
||||
|
||||
insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, MinInstr,
|
||||
RegUnits, IncrementalUpdate);
|
||||
|
||||
// Eagerly stop after the first pattern fires.
|
||||
Changed = true;
|
||||
break;
|
||||
@ -467,6 +524,8 @@ bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) {
|
||||
}
|
||||
}
|
||||
|
||||
if (Changed && IncrementalUpdate)
|
||||
Traces->invalidate(MBB);
|
||||
return Changed;
|
||||
}
|
||||
|
||||
|
@ -824,6 +824,14 @@ updateDepth(const MachineBasicBlock *MBB, const MachineInstr &UseMI,
|
||||
updateDepth(BlockInfo[MBB->getNumber()], UseMI, RegUnits);
|
||||
}
|
||||
|
||||
void MachineTraceMetrics::Ensemble::
|
||||
updateDepths(MachineBasicBlock::iterator Start,
|
||||
MachineBasicBlock::iterator End,
|
||||
SparseSet<LiveRegUnit> &RegUnits) {
|
||||
for (; Start != End; Start++)
|
||||
updateDepth(Start->getParent(), *Start, RegUnits);
|
||||
}
|
||||
|
||||
/// Compute instruction depths for all instructions above or in MBB in its
|
||||
/// trace. This assumes that the trace through MBB has already been computed.
|
||||
void MachineTraceMetrics::Ensemble::
|
||||
|
@ -1,5 +1,10 @@
|
||||
; RUN: llc -mtriple=aarch64-gnu-linux -mcpu=cortex-a57 -enable-unsafe-fp-math -disable-post-ra < %s | FileCheck %s
|
||||
|
||||
; Incremental updates of the instruction depths should be enough for this test
|
||||
; case.
|
||||
; RUN: llc -mtriple=aarch64-gnu-linux -mcpu=cortex-a57 -enable-unsafe-fp-math \
|
||||
; RUN: -disable-post-ra -machine-combiner-inc-threshold=0 < %s | FileCheck %s
|
||||
|
||||
; Verify that the first two adds are independent regardless of how the inputs are
|
||||
; commuted. The destination registers are used as source registers for the third add.
|
||||
|
||||
|
@ -1,6 +1,11 @@
|
||||
; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=sse -enable-unsafe-fp-math < %s | FileCheck %s --check-prefix=SSE
|
||||
; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=avx -enable-unsafe-fp-math < %s | FileCheck %s --check-prefix=AVX
|
||||
|
||||
; Incremental updates of the instruction depths should be enough for this test
|
||||
; case.
|
||||
; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=sse -enable-unsafe-fp-math -machine-combiner-inc-threshold=0 < %s | FileCheck %s --check-prefix=SSE
|
||||
; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=avx -enable-unsafe-fp-math -machine-combiner-inc-threshold=0 < %s | FileCheck %s --check-prefix=AVX
|
||||
|
||||
; Verify that the first two adds are independent regardless of how the inputs are
|
||||
; commuted. The destination registers are used as source registers for the third add.
|
||||
|
||||
|
@ -1,6 +1,9 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc < %s -mtriple=i686-unknown | FileCheck %s --check-prefix=X86
|
||||
; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=haswell| FileCheck %s --check-prefix=X64-HSW
|
||||
|
||||
; Incremental updates of the instruction depths should be enough for this test
|
||||
; case.
|
||||
; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=haswell -machine-combiner-inc-threshold=0| FileCheck %s --check-prefix=X64-HSW
|
||||
|
||||
; Function Attrs: norecurse nounwind readnone uwtable
|
||||
define i32 @mult(i32, i32) local_unnamed_addr #0 {
|
||||
|
Loading…
x
Reference in New Issue
Block a user