1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2025-01-31 20:51:52 +01:00

[Hexagon] Eliminate subregisters from PHI nodes before pipelining

The pipeliner needs to remove instructions from the SlotIndexes
structure when they are deleted. Otherwise, the SlotIndexes map
has stale data, and an assert will occur when adding new
instructions.

This patch also changes the pipeliner to make the back-edge of
a loop carried dependence 1 cycle. The 1 cycle latency is added
to the anti-dependence that represents the back-edge. This
changes eliminates a couple of hacks added to the pipeliner to
handle the latency of the back-edge. It is needed to correctly
pipeline the test case for the sub-register elimination pass.

llvm-svn: 328113
This commit is contained in:
Krzysztof Parzyszek 2018-03-21 16:39:11 +00:00
parent 26f700f438
commit e6c7dc68db
2 changed files with 77 additions and 42 deletions

View File

@ -218,6 +218,7 @@ public:
}
private:
void preprocessPhiNodes(MachineBasicBlock &B);
bool canPipelineLoop(MachineLoop &L);
bool scheduleLoop(MachineLoop &L);
bool swingModuloScheduler(MachineLoop &L);
@ -357,20 +358,6 @@ public:
bool isLoopCarriedOrder(SUnit *Source, const SDep &Dep, bool isSucc = true);
/// The latency of the dependence.
unsigned getLatency(SUnit *Source, const SDep &Dep) {
// Anti dependences represent recurrences, so use the latency of the
// instruction on the back-edge.
if (Dep.getKind() == SDep::Anti) {
if (Source->getInstr()->isPHI())
return Dep.getSUnit()->Latency;
if (Dep.getSUnit()->getInstr()->isPHI())
return Source->Latency;
return Dep.getLatency();
}
return Dep.getLatency();
}
/// The distance function, which indicates that operation V of iteration I
/// depends on operations U of iteration I-distance.
unsigned getDistance(SUnit *U, SUnit *V, const SDep &Dep) {
@ -484,12 +471,19 @@ class NodeSet {
int MaxDepth = 0;
unsigned Colocate = 0;
SUnit *ExceedPressure = nullptr;
unsigned Latency = 0;
public:
using iterator = SetVector<SUnit *>::const_iterator;
NodeSet() = default;
NodeSet(iterator S, iterator E) : Nodes(S, E), HasRecurrence(true) {}
NodeSet(iterator S, iterator E) : Nodes(S, E), HasRecurrence(true) {
Latency = 0;
for (unsigned i = 0, e = Nodes.size(); i < e; ++i)
for (const SDep &Succ : Nodes[i]->Succs)
if (Nodes.count(Succ.getSUnit()))
Latency += Succ.getLatency();
}
bool insert(SUnit *SU) { return Nodes.insert(SU); }
@ -820,18 +814,41 @@ bool MachinePipeliner::canPipelineLoop(MachineLoop &L) {
if (!L.getLoopPreheader())
return false;
// If any of the Phis contain subregs, then we can't pipeline
// because we don't know how to maintain subreg information in the
// VMap structure.
MachineBasicBlock *MBB = L.getHeader();
for (auto &PHI : MBB->phis())
for (unsigned i = 1; i != PHI.getNumOperands(); i += 2)
if (PHI.getOperand(i).getSubReg() != 0)
return false;
// Remove any subregisters from inputs to phi nodes.
preprocessPhiNodes(*L.getHeader());
return true;
}
void MachinePipeliner::preprocessPhiNodes(MachineBasicBlock &B) {
MachineRegisterInfo &MRI = MF->getRegInfo();
SlotIndexes &Slots = *getAnalysis<LiveIntervals>().getSlotIndexes();
for (MachineInstr &PI : make_range(B.begin(), B.getFirstNonPHI())) {
MachineOperand &DefOp = PI.getOperand(0);
assert(DefOp.getSubReg() == 0);
auto *RC = MRI.getRegClass(DefOp.getReg());
for (unsigned i = 1, n = PI.getNumOperands(); i != n; i += 2) {
MachineOperand &RegOp = PI.getOperand(i);
if (RegOp.getSubReg() == 0)
continue;
// If the operand uses a subregister, replace it with a new register
// without subregisters, and generate a copy to the new register.
unsigned NewReg = MRI.createVirtualRegister(RC);
MachineBasicBlock &PredB = *PI.getOperand(i+1).getMBB();
MachineBasicBlock::iterator At = PredB.getFirstTerminator();
const DebugLoc &DL = PredB.findDebugLoc(At);
auto Copy = BuildMI(PredB, At, DL, TII->get(TargetOpcode::COPY), NewReg)
.addReg(RegOp.getReg(), getRegState(RegOp),
RegOp.getSubReg());
Slots.insertMachineInstrInMaps(*Copy);
RegOp.setReg(NewReg);
RegOp.setSubReg(0);
}
}
}
/// The SMS algorithm consists of the following main steps:
/// 1. Computation and analysis of the dependence graph.
/// 2. Ordering of the nodes (instructions).
@ -1078,31 +1095,41 @@ void SwingSchedulerDAG::addLoopCarriedDependences(AliasAnalysis *AA) {
int64_t Offset1, Offset2;
if (!TII->getMemOpBaseRegImmOfs(LdMI, BaseReg1, Offset1, TRI) ||
!TII->getMemOpBaseRegImmOfs(MI, BaseReg2, Offset2, TRI)) {
SU.addPred(SDep(Load, SDep::Barrier));
SDep Dep(Load, SDep::Barrier);
Dep.setLatency(1);
SU.addPred(Dep);
continue;
}
if (BaseReg1 == BaseReg2 && (int)Offset1 < (int)Offset2) {
assert(TII->areMemAccessesTriviallyDisjoint(LdMI, MI, AA) &&
"What happened to the chain edge?");
SU.addPred(SDep(Load, SDep::Barrier));
SDep Dep(Load, SDep::Barrier);
Dep.setLatency(1);
SU.addPred(Dep);
continue;
}
// Second, the more expensive check that uses alias analysis on the
// base registers. If they alias, and the load offset is less than
// the store offset, the mark the dependence as loop carried.
if (!AA) {
SU.addPred(SDep(Load, SDep::Barrier));
SDep Dep(Load, SDep::Barrier);
Dep.setLatency(1);
SU.addPred(Dep);
continue;
}
MachineMemOperand *MMO1 = *LdMI.memoperands_begin();
MachineMemOperand *MMO2 = *MI.memoperands_begin();
if (!MMO1->getValue() || !MMO2->getValue()) {
SU.addPred(SDep(Load, SDep::Barrier));
SDep Dep(Load, SDep::Barrier);
Dep.setLatency(1);
SU.addPred(Dep);
continue;
}
if (MMO1->getValue() == MMO2->getValue() &&
MMO1->getOffset() <= MMO2->getOffset()) {
SU.addPred(SDep(Load, SDep::Barrier));
SDep Dep(Load, SDep::Barrier);
Dep.setLatency(1);
SU.addPred(Dep);
continue;
}
AliasResult AAResult = AA->alias(
@ -1111,8 +1138,11 @@ void SwingSchedulerDAG::addLoopCarriedDependences(AliasAnalysis *AA) {
MemoryLocation(MMO2->getValue(), MemoryLocation::UnknownSize,
MMO2->getAAInfo()));
if (AAResult != NoAlias)
SU.addPred(SDep(Load, SDep::Barrier));
if (AAResult != NoAlias) {
SDep Dep(Load, SDep::Barrier);
Dep.setLatency(1);
SU.addPred(Dep);
}
}
}
}
@ -1154,6 +1184,7 @@ void SwingSchedulerDAG::updatePhiDependences() {
if (SU != nullptr && UseMI->isPHI()) {
if (!MI->isPHI()) {
SDep Dep(SU, SDep::Anti, Reg);
Dep.setLatency(1);
I.addPred(Dep);
} else {
HasPhiDef = Reg;
@ -1599,12 +1630,12 @@ void SwingSchedulerDAG::computeNodeFunctions(NodeSetType &NodeSets) {
EP = SU->Preds.end();
IP != EP; ++IP) {
SUnit *pred = IP->getSUnit();
if (getLatency(SU, *IP) == 0)
if (IP->getLatency() == 0)
zeroLatencyDepth =
std::max(zeroLatencyDepth, getZeroLatencyDepth(pred) + 1);
if (ignoreDependence(*IP, true))
continue;
asap = std::max(asap, (int)(getASAP(pred) + getLatency(SU, *IP) -
asap = std::max(asap, (int)(getASAP(pred) + IP->getLatency() -
getDistance(pred, SU, *IP) * MII));
}
maxASAP = std::max(maxASAP, asap);
@ -1623,12 +1654,12 @@ void SwingSchedulerDAG::computeNodeFunctions(NodeSetType &NodeSets) {
ES = SU->Succs.end();
IS != ES; ++IS) {
SUnit *succ = IS->getSUnit();
if (getLatency(SU, *IS) == 0)
if (IS->getLatency() == 0)
zeroLatencyHeight =
std::max(zeroLatencyHeight, getZeroLatencyHeight(succ) + 1);
if (ignoreDependence(*IS, true))
continue;
alap = std::min(alap, (int)(getALAP(succ) - getLatency(SU, *IS) +
alap = std::min(alap, (int)(getALAP(succ) - IS->getLatency() +
getDistance(SU, succ, *IS) * MII));
}
@ -2340,6 +2371,8 @@ void SwingSchedulerDAG::generatePipelinedLoop(SMSchedule &Schedule) {
addBranches(PrologBBs, KernelBB, EpilogBBs, Schedule, VRMap);
// Remove the original loop since it's no longer referenced.
for (auto &I : *BB)
LIS.RemoveMachineInstrFromMaps(I);
BB->clear();
BB->eraseFromParent();
@ -2916,6 +2949,7 @@ void SwingSchedulerDAG::removeDeadInstructions(MachineBasicBlock *KernelBB,
used = false;
}
if (!used) {
LIS.RemoveMachineInstrFromMaps(*MI);
MI++->eraseFromParent();
continue;
}
@ -2930,6 +2964,7 @@ void SwingSchedulerDAG::removeDeadInstructions(MachineBasicBlock *KernelBB,
++BBI;
unsigned reg = MI->getOperand(0).getReg();
if (MRI.use_begin(reg) == MRI.use_end()) {
LIS.RemoveMachineInstrFromMaps(*MI);
MI->eraseFromParent();
}
}
@ -3636,7 +3671,7 @@ void SMSchedule::computeStart(SUnit *SU, int *MaxEarlyStart, int *MinLateStart,
const SDep &Dep = SU->Preds[i];
if (Dep.getSUnit() == I) {
if (!DAG->isBackedge(SU, Dep)) {
int EarlyStart = cycle + DAG->getLatency(SU, Dep) -
int EarlyStart = cycle + Dep.getLatency() -
DAG->getDistance(Dep.getSUnit(), SU, Dep) * II;
*MaxEarlyStart = std::max(*MaxEarlyStart, EarlyStart);
if (DAG->isLoopCarriedOrder(SU, Dep, false)) {
@ -3644,7 +3679,7 @@ void SMSchedule::computeStart(SUnit *SU, int *MaxEarlyStart, int *MinLateStart,
*MinEnd = std::min(*MinEnd, End);
}
} else {
int LateStart = cycle - DAG->getLatency(SU, Dep) +
int LateStart = cycle - Dep.getLatency() +
DAG->getDistance(SU, Dep.getSUnit(), Dep) * II;
*MinLateStart = std::min(*MinLateStart, LateStart);
}
@ -3660,7 +3695,7 @@ void SMSchedule::computeStart(SUnit *SU, int *MaxEarlyStart, int *MinLateStart,
if (SU->Succs[i].getSUnit() == I) {
const SDep &Dep = SU->Succs[i];
if (!DAG->isBackedge(SU, Dep)) {
int LateStart = cycle - DAG->getLatency(SU, Dep) +
int LateStart = cycle - Dep.getLatency() +
DAG->getDistance(SU, Dep.getSUnit(), Dep) * II;
*MinLateStart = std::min(*MinLateStart, LateStart);
if (DAG->isLoopCarriedOrder(SU, Dep)) {
@ -3668,7 +3703,7 @@ void SMSchedule::computeStart(SUnit *SU, int *MaxEarlyStart, int *MinLateStart,
*MaxStart = std::max(*MaxStart, Start);
}
} else {
int EarlyStart = cycle + DAG->getLatency(SU, Dep) -
int EarlyStart = cycle + Dep.getLatency() -
DAG->getDistance(Dep.getSUnit(), SU, Dep) * II;
*MaxEarlyStart = std::max(*MaxEarlyStart, EarlyStart);
}

View File

@ -9,16 +9,16 @@
; CHECK-NOT: vcmp.gt([[VREG]].uh,v{{[0-9]+}}.uh)
; CHECK: loop0
define void @f0() #0 {
define void @f0(<64 x i32> %a0, <32 x i32> %a1) #0 {
b0:
br i1 undef, label %b1, label %b5
b1: ; preds = %b0
%v0 = tail call <32 x i32> @llvm.hexagon.V6.hi.128B(<64 x i32> undef)
%v0 = tail call <32 x i32> @llvm.hexagon.V6.hi.128B(<64 x i32> %a0)
br label %b2
b2: ; preds = %b4, %b1
%v1 = phi <32 x i32> [ undef, %b1 ], [ %v7, %b4 ]
%v1 = phi <32 x i32> [ %a1, %b1 ], [ %v7, %b4 ]
br label %b3
b3: ; preds = %b3, %b2