1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2025-01-31 20:51:52 +01:00

[AMDGPU] Fix shortfalls in WQM marking

When tracking defined lanes through phi nodes in the live range
graph each branch of the phi must be handled independently.
Also rewrite the marking algorithm to reduce unnecessary
operations.

Previously a shared set of defined lanes was used which caused
marking to stop prematurely. This was observable in existing lit
tests, but test patterns did not cover this detail.

Reviewed By: piotr

Differential Revision: https://reviews.llvm.org/D98614
This commit is contained in:
Carl Ritson 2021-03-15 21:32:09 +09:00
parent f127ae93b8
commit aa27a4d8bb
3 changed files with 111 additions and 28 deletions

View File

@ -318,38 +318,63 @@ void SIWholeQuadMode::markDefs(const MachineInstr &UseMI, LiveRange &LR,
LLVM_DEBUG(dbgs() << "markDefs " << PrintState(Flag) << ": " << UseMI);
LiveQueryResult UseLRQ = LR.Query(LIS->getInstructionIndex(UseMI));
if (!UseLRQ.valueIn())
const VNInfo *Value = UseLRQ.valueIn();
if (!Value)
return;
// Note: this code assumes that lane masks on AMDGPU completely
// cover registers.
LaneBitmask DefinedLanes;
LaneBitmask UseLanes;
if (SubReg) {
UseLanes = TRI->getSubRegIndexLaneMask(SubReg);
} else if (Reg.isVirtual()) {
UseLanes = MRI->getMaxLaneMaskForVReg(Reg);
}
const LaneBitmask UseLanes =
SubReg ? TRI->getSubRegIndexLaneMask(SubReg)
: (Reg.isVirtual() ? MRI->getMaxLaneMaskForVReg(Reg)
: LaneBitmask::getNone());
SmallPtrSet<const VNInfo *, 4> Visited;
SmallVector<const VNInfo *, 4> ToProcess;
ToProcess.push_back(UseLRQ.valueIn());
// Perform a depth-first iteration of the LiveRange graph marking defs.
// Stop processing of a given branch when all use lanes have been defined.
// The first definition stops processing for a physical register.
struct PhiEntry {
const VNInfo *Phi;
unsigned PredIdx;
unsigned VisitIdx;
LaneBitmask DefinedLanes;
PhiEntry(const VNInfo *Phi, unsigned PredIdx, unsigned VisitIdx,
LaneBitmask DefinedLanes)
: Phi(Phi), PredIdx(PredIdx), VisitIdx(VisitIdx),
DefinedLanes(DefinedLanes) {}
};
SmallSetVector<const VNInfo *, 4> Visited;
SmallVector<PhiEntry, 2> PhiStack;
LaneBitmask DefinedLanes;
unsigned NextPredIdx; // Only used for processing phi nodes
do {
const VNInfo *Value = ToProcess.pop_back_val();
Visited.insert(Value);
const VNInfo *NextValue = nullptr;
if (!Visited.count(Value)) {
Visited.insert(Value);
// On first visit to a phi then start processing first predecessor
NextPredIdx = 0;
}
if (Value->isPHIDef()) {
// Need to mark all defs used in the PHI node
// Each predecessor node in the phi must be processed as a subgraph
const MachineBasicBlock *MBB = LIS->getMBBFromIndex(Value->def);
assert(MBB && "Phi-def has no defining MBB");
for (MachineBasicBlock::const_pred_iterator PI = MBB->pred_begin(),
PE = MBB->pred_end();
PI != PE; ++PI) {
// Find next predecessor to process
unsigned Idx = NextPredIdx;
auto PI = MBB->pred_begin() + Idx;
auto PE = MBB->pred_end();
for (; PI != PE && !NextValue; ++PI, ++Idx) {
if (const VNInfo *VN = LR.getVNInfoBefore(LIS->getMBBEndIdx(*PI))) {
if (!Visited.count(VN))
ToProcess.push_back(VN);
NextValue = VN;
}
}
// If there are more predecessors to process; add phi to stack
if (PI != PE)
PhiStack.emplace_back(Value, Idx, Visited.size(), DefinedLanes);
} else {
MachineInstr *MI = LIS->getInstructionFromIndex(Value->def);
assert(MI && "Def has no defining instruction");
@ -370,17 +395,20 @@ void SIWholeQuadMode::markDefs(const MachineInstr &UseMI, LiveRange &LR,
// Record if this instruction defined any of use
HasDef |= Overlap.any();
// Check if all lanes of use have been defined
// Mark any lanes defined
DefinedLanes |= OpLanes;
if ((DefinedLanes & UseLanes) != UseLanes) {
// Definition not complete; need to process input value
LiveQueryResult LRQ = LR.Query(LIS->getInstructionIndex(*MI));
if (const VNInfo *VN = LRQ.valueIn()) {
if (!Visited.count(VN))
ToProcess.push_back(VN);
}
}
// Check if all lanes of use have been defined
if ((DefinedLanes & UseLanes) != UseLanes) {
// Definition not complete; need to process input value
LiveQueryResult LRQ = LR.Query(LIS->getInstructionIndex(*MI));
if (const VNInfo *VN = LRQ.valueIn()) {
if (!Visited.count(VN))
NextValue = VN;
}
}
// Only mark the instruction if it defines some part of the use
if (HasDef)
markInstruction(*MI, Flag, Worklist);
@ -389,9 +417,21 @@ void SIWholeQuadMode::markDefs(const MachineInstr &UseMI, LiveRange &LR,
markInstruction(*MI, Flag, Worklist);
}
}
} while (!ToProcess.empty());
assert(!Reg.isVirtual() || ((DefinedLanes & UseLanes) == UseLanes));
if (!NextValue && !PhiStack.empty()) {
// Reach end of chain; revert to processing last phi
PhiEntry &Entry = PhiStack.back();
NextValue = Entry.Phi;
NextPredIdx = Entry.PredIdx;
DefinedLanes = Entry.DefinedLanes;
// Rewind visited set to correct state
while (Visited.size() > Entry.VisitIdx)
Visited.pop_back();
PhiStack.pop_back();
}
Value = NextValue;
} while (Value);
}
void SIWholeQuadMode::markOperand(const MachineInstr &MI,

View File

@ -859,6 +859,10 @@ main_body:
; CHECK-NEXT: ; %entry
; CHECK-NEXT: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec
; CHECK: s_wqm_b64 exec, exec
; CHECK: v_mov
; CHECK: v_mov
; CHECK: v_mov
; CHECK: v_mov
; CHECK: s_and_b64 exec, exec, [[LIVE]]
; CHECK: image_store
; CHECK: s_wqm_b64 exec, exec

View File

@ -259,3 +259,42 @@ body: |
$vgpr1 = STRICT_WWM %3.sub1:vreg_64, implicit $exec
SI_RETURN_TO_EPILOG $vgpr0, $vgpr1
...
---
# Check that WQM marking occurs correctly through phi nodes in live range graph.
# If not then initial V_MOV will not be in WQM.
#
#CHECK-LABEL: name: test_wqm_lr_phi
#CHECK: COPY $exec
#CHECK-NEXT: S_WQM
#CHECK-NEXT: V_MOV_B32_e32 -10
#CHECK-NEXT: V_MOV_B32_e32 0
name: test_wqm_lr_phi
tracksRegLiveness: true
body: |
bb.0:
undef %0.sub0:vreg_64 = V_MOV_B32_e32 -10, implicit $exec
%0.sub1:vreg_64 = V_MOV_B32_e32 0, implicit $exec
%1:sreg_64 = S_GETPC_B64
%2:sgpr_256 = S_LOAD_DWORDX8_IMM %1:sreg_64, 32, 0, 0
bb.1:
$vcc = V_CMP_LT_U32_e64 4, 4, implicit $exec
S_CBRANCH_VCCNZ %bb.3, implicit $vcc
S_BRANCH %bb.2
bb.2:
%0.sub0:vreg_64 = V_ADD_U32_e32 1, %0.sub1, implicit $exec
S_BRANCH %bb.3
bb.3:
%0.sub1:vreg_64 = V_ADD_U32_e32 1, %0.sub1, implicit $exec
S_BRANCH %bb.4
bb.4:
%3:sgpr_128 = IMPLICIT_DEF
%4:vreg_128 = IMAGE_SAMPLE_V4_V2 %0:vreg_64, %2:sgpr_256, %3:sgpr_128, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "ImageResource")
$vgpr0 = COPY %4.sub0:vreg_128
$vgpr1 = COPY %4.sub1:vreg_128
SI_RETURN_TO_EPILOG $vgpr0, $vgpr1
...