1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-10-18 10:32:48 +02:00

Revert "[CSSPGO][llvm-profgen] Aggregate samples on call frame trie to speed up profile generation"

This reverts commit 1714ad2336293f351b15dd4b518f9e8618ec38f2.
This commit is contained in:
wlei 2021-02-03 21:16:49 -08:00
parent b7bd8d62b5
commit 924cc19d25
4 changed files with 108 additions and 232 deletions

View File

@ -28,12 +28,11 @@ void VirtualUnwinder::unwindCall(UnwindState &State) {
// 2nd frame is in prolog/epilog. In the future, we will switch to
// pro/epi tracker(Dwarf CFI) for the precise check.
uint64_t Source = State.getCurrentLBRSource();
auto *ParentFrame = State.getParentFrame();
if (ParentFrame == State.getDummyRootPtr() ||
ParentFrame->Address != Source) {
State.switchToFrame(Source);
auto Iter = State.CallStack.begin();
if (State.CallStack.size() == 1 || *(++Iter) != Source) {
State.CallStack.front() = Source;
} else {
State.popFrame();
State.CallStack.pop_front();
}
State.InstPtr.update(Source);
}
@ -42,29 +41,26 @@ void VirtualUnwinder::unwindLinear(UnwindState &State, uint64_t Repeat) {
InstructionPointer &IP = State.InstPtr;
uint64_t Target = State.getCurrentLBRTarget();
uint64_t End = IP.Address;
if (Binary->usePseudoProbes()) {
// We don't need the top frame probe since it should be extracted
// from the range.
if (State.getBinary()->usePseudoProbes()) {
// The outcome of the virtual unwinding with pseudo probes is a
// map from a context key to the address range being unwound.
// This means basically linear unwinding is not needed for pseudo
// probes. The range will be simply recorded here and will be
// converted to a list of pseudo probes to report in ProfileGenerator.
State.getParentFrame()->recordRangeCount(Target, End, Repeat);
recordRangeCount(Target, End, State, Repeat);
} else {
// Unwind linear execution part
uint64_t LeafAddr = State.CurrentLeafFrame->Address;
while (IP.Address >= Target) {
uint64_t PrevIP = IP.Address;
IP.backward();
// Break into segments for implicit call/return due to inlining
bool SameInlinee = Binary->inlineContextEqual(PrevIP, IP.Address);
bool SameInlinee =
State.getBinary()->inlineContextEqual(PrevIP, IP.Address);
if (!SameInlinee || PrevIP == Target) {
State.switchToFrame(LeafAddr);
State.CurrentLeafFrame->recordRangeCount(PrevIP, End, Repeat);
recordRangeCount(PrevIP, End, State, Repeat);
End = IP.Address;
}
LeafAddr = IP.Address;
State.CallStack.front() = IP.Address;
}
}
}
@ -72,9 +68,9 @@ void VirtualUnwinder::unwindLinear(UnwindState &State, uint64_t Repeat) {
void VirtualUnwinder::unwindReturn(UnwindState &State) {
// Add extra frame as we unwind through the return
const LBREntry &LBR = State.getCurrentLBR();
uint64_t CallAddr = Binary->getCallAddrFromFrameAddr(LBR.Target);
State.switchToFrame(CallAddr);
State.pushFrame(LBR.Source);
uint64_t CallAddr = State.getBinary()->getCallAddrFromFrameAddr(LBR.Target);
State.CallStack.front() = CallAddr;
State.CallStack.push_front(LBR.Source);
State.InstPtr.update(LBR.Source);
}
@ -82,100 +78,79 @@ void VirtualUnwinder::unwindBranchWithinFrame(UnwindState &State) {
// TODO: Tolerate tail call for now, as we may see tail call from libraries.
// This is only for intra function branches, excluding tail calls.
uint64_t Source = State.getCurrentLBRSource();
State.switchToFrame(Source);
State.CallStack.front() = Source;
State.InstPtr.update(Source);
}
std::shared_ptr<StringBasedCtxKey> FrameStack::getContextKey() {
SampleCounter &
VirtualUnwinder::getOrCreateCounter(const ProfiledBinary *Binary,
std::list<uint64_t> &CallStack) {
if (Binary->usePseudoProbes()) {
return getOrCreateCounterForProbe(Binary, CallStack);
}
std::shared_ptr<StringBasedCtxKey> KeyStr =
std::make_shared<StringBasedCtxKey>();
KeyStr->Context = Binary->getExpandedContextStr(Stack);
KeyStr->Context = Binary->getExpandedContextStr(CallStack);
KeyStr->genHashCode();
return KeyStr;
auto Ret =
CtxCounterMap->emplace(Hashable<ContextKey>(KeyStr), SampleCounter());
return Ret.first->second;
}
std::shared_ptr<ProbeBasedCtxKey> ProbeStack::getContextKey() {
SampleCounter &
VirtualUnwinder::getOrCreateCounterForProbe(const ProfiledBinary *Binary,
std::list<uint64_t> &CallStack) {
std::shared_ptr<ProbeBasedCtxKey> ProbeBasedKey =
std::make_shared<ProbeBasedCtxKey>();
for (auto CallProbe : Stack) {
ProbeBasedKey->Probes.emplace_back(CallProbe);
if (CallStack.size() > 1) {
// We don't need the top frame probe since it should be extracted
// from the range.
// The top of stack is an instruction from the function where
// the LBR address range physically resides. Strip it since
// the function is not a part of the call context. We also
// don't need its inline context since the probes being unwound
// come with an inline context all the way back to the uninlined
// function in their prefix tree.
auto Iter = CallStack.rbegin();
auto EndT = std::prev(CallStack.rend());
for (; Iter != EndT; Iter++) {
uint64_t Address = *Iter;
const PseudoProbe *CallProbe = Binary->getCallProbeForAddr(Address);
// We may not find a probe for a merged or external callsite.
// Callsite merging may cause the loss of original probe IDs.
// Cutting off the context from here since the inliner will
// not know how to consume a context with unknown callsites.
if (!CallProbe)
break;
ProbeBasedKey->Probes.emplace_back(CallProbe);
}
}
CSProfileGenerator::compressRecursionContext<const PseudoProbe *>(
ProbeBasedKey->Probes);
ProbeBasedKey->genHashCode();
return ProbeBasedKey;
Hashable<ContextKey> ContextId(ProbeBasedKey);
auto Ret = CtxCounterMap->emplace(ContextId, SampleCounter());
return Ret.first->second;
}
template <typename T>
void VirtualUnwinder::collectSamplesFromFrame(UnwindState::ProfiledFrame *Cur,
T &Stack) {
if (Cur->RangeSamples.empty() && Cur->BranchSamples.empty())
return;
std::shared_ptr<ContextKey> Key = Stack.getContextKey();
auto Ret = CtxCounterMap->emplace(Hashable<ContextKey>(Key), SampleCounter());
SampleCounter &SCounter = Ret.first->second;
for (auto &Item : Cur->RangeSamples) {
uint64_t StartOffset = Binary->virtualAddrToOffset(std::get<0>(Item));
uint64_t EndOffset = Binary->virtualAddrToOffset(std::get<1>(Item));
SCounter.recordRangeCount(StartOffset, EndOffset, std::get<2>(Item));
}
for (auto &Item : Cur->BranchSamples) {
uint64_t SourceOffset = Binary->virtualAddrToOffset(std::get<0>(Item));
uint64_t TargetOffset = Binary->virtualAddrToOffset(std::get<1>(Item));
SCounter.recordBranchCount(SourceOffset, TargetOffset, std::get<2>(Item));
}
}
template <typename T>
void VirtualUnwinder::collectSamplesFromFrameTrie(
UnwindState::ProfiledFrame *Cur, T &Stack) {
if (!Cur->isDummyRoot()) {
if (!Stack.pushFrame(Cur)) {
// Process truncated context
for (const auto &Item : Cur->Children) {
// Start a new traversal ignoring its bottom context
collectSamplesFromFrameTrie(Item.second.get());
}
return;
}
}
collectSamplesFromFrame(Cur, Stack);
// Process children frame
for (const auto &Item : Cur->Children) {
collectSamplesFromFrameTrie(Item.second.get(), Stack);
}
// Recover the call stack
Stack.popFrame();
}
void VirtualUnwinder::collectSamplesFromFrameTrie(
UnwindState::ProfiledFrame *Cur) {
if (Binary->usePseudoProbes()) {
ProbeStack Stack(Binary);
collectSamplesFromFrameTrie<ProbeStack>(Cur, Stack);
} else {
FrameStack Stack(Binary);
collectSamplesFromFrameTrie<FrameStack>(Cur, Stack);
}
void VirtualUnwinder::recordRangeCount(uint64_t Start, uint64_t End,
UnwindState &State, uint64_t Repeat) {
uint64_t StartOffset = State.getBinary()->virtualAddrToOffset(Start);
uint64_t EndOffset = State.getBinary()->virtualAddrToOffset(End);
SampleCounter &SCounter =
getOrCreateCounter(State.getBinary(), State.CallStack);
SCounter.recordRangeCount(StartOffset, EndOffset, Repeat);
}
void VirtualUnwinder::recordBranchCount(const LBREntry &Branch,
UnwindState &State, uint64_t Repeat) {
if (Branch.IsArtificial)
return;
if (Binary->usePseudoProbes()) {
// Same as recordRangeCount, we don't need the top frame probe since we will
// extract it from the branch's source address
State.getParentFrame()->recordBranchCount(Branch.Source, Branch.Target,
Repeat);
} else {
State.CurrentLeafFrame->recordBranchCount(Branch.Source, Branch.Target,
Repeat);
}
uint64_t SourceOffset = State.getBinary()->virtualAddrToOffset(Branch.Source);
uint64_t TargetOffset = State.getBinary()->virtualAddrToOffset(Branch.Target);
SampleCounter &SCounter =
getOrCreateCounter(State.getBinary(), State.CallStack);
SCounter.recordBranchCount(SourceOffset, TargetOffset, Repeat);
}
bool VirtualUnwinder::unwind(const HybridSample *Sample, uint64_t Repeat) {
@ -224,8 +199,6 @@ bool VirtualUnwinder::unwind(const HybridSample *Sample, uint64_t Repeat) {
// Record `branch` with calling context after unwinding.
recordBranchCount(Branch, State, Repeat);
}
// As samples are aggregated on trie, record them into counter map
collectSamplesFromFrameTrie(State.getDummyRootPtr());
return true;
}
@ -352,8 +325,7 @@ void PerfReader::printUnwinderOutput() {
void PerfReader::unwindSamples() {
for (const auto &Item : AggregatedSamples) {
const HybridSample *Sample = dyn_cast<HybridSample>(Item.first.getPtr());
VirtualUnwinder Unwinder(&BinarySampleCounters[Sample->Binary],
Sample->Binary);
VirtualUnwinder Unwinder(&BinarySampleCounters[Sample->Binary]);
Unwinder.unwind(Sample, Item.second);
}
@ -362,7 +334,7 @@ void PerfReader::unwindSamples() {
}
bool PerfReader::extractLBRStack(TraceStream &TraceIt,
SmallVectorImpl<LBREntry> &LBRStack,
SmallVector<LBREntry, 16> &LBRStack,
ProfiledBinary *Binary) {
// The raw format of LBR stack is like:
// 0x4005c8/0x4005dc/P/-/-/0 0x40062f/0x4005b0/P/-/-/0 ...
@ -426,7 +398,7 @@ bool PerfReader::extractLBRStack(TraceStream &TraceIt,
}
bool PerfReader::extractCallstack(TraceStream &TraceIt,
SmallVectorImpl<uint64_t> &CallStack) {
std::list<uint64_t> &CallStack) {
// The raw format of call stack is like:
// 4005dc # leaf frame
// 400634

View File

@ -133,7 +133,7 @@ struct HybridSample : public PerfSample {
// Profiled binary that current frame address belongs to
ProfiledBinary *Binary;
// Call stack recorded in FILO(leaf to root) order
SmallVector<uint64_t, 16> CallStack;
std::list<uint64_t> CallStack;
// LBR stack recorded in FIFO order
SmallVector<LBREntry, 16> LBRStack;
@ -147,7 +147,7 @@ struct HybridSample : public PerfSample {
const HybridSample *Other = dyn_cast<HybridSample>(K);
if (Other->Binary != Binary)
return false;
const SmallVector<uint64_t, 16> &OtherCallStack = Other->CallStack;
const std::list<uint64_t> &OtherCallStack = Other->CallStack;
const SmallVector<LBREntry, 16> &OtherLBRStack = Other->LBRStack;
if (CallStack.size() != OtherCallStack.size() ||
@ -193,40 +193,14 @@ using AggregatedCounter =
std::unordered_map<Hashable<PerfSample>, uint64_t,
Hashable<PerfSample>::Hash, Hashable<PerfSample>::Equal>;
using SampleVector = SmallVector<std::tuple<uint64_t, uint64_t, uint64_t>, 16>;
// The state for the unwinder. It doesn't hold the data but only keeps
// pointers/indices into the data. While unwinding, the CallStack is changed
// dynamically and will be recorded as the context of the sample.
struct UnwindState {
// Profiled binary that current frame address belongs to
const ProfiledBinary *Binary;
// Call stack trie node
struct ProfiledFrame {
const uint64_t Address = 0;
ProfiledFrame *Parent;
SampleVector RangeSamples;
SampleVector BranchSamples;
std::unordered_map<uint64_t, std::unique_ptr<ProfiledFrame>> Children;
ProfiledFrame(uint64_t Addr = 0, ProfiledFrame *P = nullptr)
: Address(Addr), Parent(P) {}
ProfiledFrame *getOrCreateChildFrame(uint64_t Address) {
assert(Address && "Address can't be zero!");
auto Ret = Children.emplace(
Address, std::make_unique<ProfiledFrame>(Address, this));
return Ret.first->second.get();
}
void recordRangeCount(uint64_t Start, uint64_t End, uint64_t Count) {
RangeSamples.emplace_back(std::make_tuple(Start, End, Count));
}
void recordBranchCount(uint64_t Source, uint64_t Target, uint64_t Count) {
BranchSamples.emplace_back(std::make_tuple(Source, Target, Count));
}
bool isDummyRoot() { return Address == 0; }
};
ProfiledFrame DummyTrieRoot;
ProfiledFrame *CurrentLeafFrame;
// TODO: switch to use trie for call stack
std::list<uint64_t> CallStack;
// Used to fall through the LBR stack
uint32_t LBRIndex = 0;
// Reference to HybridSample.LBRStack
@ -234,20 +208,19 @@ struct UnwindState {
// Used to iterate the address range
InstructionPointer InstPtr;
UnwindState(const HybridSample *Sample)
: Binary(Sample->Binary), LBRStack(Sample->LBRStack),
InstPtr(Sample->Binary, Sample->CallStack.front()) {
initFrameTrie(Sample->CallStack);
}
: Binary(Sample->Binary), CallStack(Sample->CallStack),
LBRStack(Sample->LBRStack),
InstPtr(Sample->Binary, Sample->CallStack.front()) {}
bool validateInitialState() {
uint64_t LBRLeaf = LBRStack[LBRIndex].Target;
uint64_t LeafAddr = CurrentLeafFrame->Address;
uint64_t StackLeaf = CallStack.front();
// When we take a stack sample, ideally the sampling distance between the
// leaf IP of stack and the last LBR target shouldn't be very large.
// Use a heuristic size (0x100) to filter out broken records.
if (LeafAddr < LBRLeaf || LeafAddr >= LBRLeaf + 0x100) {
if (StackLeaf < LBRLeaf || StackLeaf >= LBRLeaf + 0x100) {
WithColor::warning() << "Bogus trace: stack tip = "
<< format("%#010x", LeafAddr)
<< format("%#010x", StackLeaf)
<< ", LBR tip = " << format("%#010x\n", LBRLeaf);
return false;
}
@ -255,40 +228,19 @@ struct UnwindState {
}
void checkStateConsistency() {
assert(InstPtr.Address == CurrentLeafFrame->Address &&
assert(InstPtr.Address == CallStack.front() &&
"IP should align with context leaf");
}
std::string getExpandedContextStr() const {
return Binary->getExpandedContextStr(CallStack);
}
const ProfiledBinary *getBinary() const { return Binary; }
bool hasNextLBR() const { return LBRIndex < LBRStack.size(); }
uint64_t getCurrentLBRSource() const { return LBRStack[LBRIndex].Source; }
uint64_t getCurrentLBRTarget() const { return LBRStack[LBRIndex].Target; }
const LBREntry &getCurrentLBR() const { return LBRStack[LBRIndex]; }
void advanceLBR() { LBRIndex++; }
ProfiledFrame *getParentFrame() { return CurrentLeafFrame->Parent; }
void pushFrame(uint64_t Address) {
CurrentLeafFrame = CurrentLeafFrame->getOrCreateChildFrame(Address);
}
void switchToFrame(uint64_t Address) {
if (CurrentLeafFrame->Address == Address)
return;
CurrentLeafFrame = CurrentLeafFrame->Parent->getOrCreateChildFrame(Address);
}
void popFrame() { CurrentLeafFrame = CurrentLeafFrame->Parent; }
void initFrameTrie(const SmallVectorImpl<uint64_t> &CallStack) {
ProfiledFrame *Cur = &DummyTrieRoot;
for (auto Address : reverse(CallStack)) {
Cur = Cur->getOrCreateChildFrame(Address);
}
CurrentLeafFrame = Cur;
}
ProfiledFrame *getDummyRootPtr() { return &DummyTrieRoot; }
};
// Base class for sample counter key with context
@ -378,56 +330,6 @@ using ContextSampleCounterMap =
std::unordered_map<Hashable<ContextKey>, SampleCounter,
Hashable<ContextKey>::Hash, Hashable<ContextKey>::Equal>;
struct FrameStack {
SmallVector<uint64_t, 16> Stack;
const ProfiledBinary *Binary;
FrameStack(const ProfiledBinary *B) : Binary(B) {}
bool pushFrame(UnwindState::ProfiledFrame *Cur) {
Stack.push_back(Cur->Address);
return true;
}
void popFrame() {
if (!Stack.empty())
Stack.pop_back();
}
std::shared_ptr<StringBasedCtxKey> getContextKey();
};
struct ProbeStack {
SmallVector<const PseudoProbe *, 16> Stack;
const ProfiledBinary *Binary;
ProbeStack(const ProfiledBinary *B) : Binary(B) {}
bool pushFrame(UnwindState::ProfiledFrame *Cur) {
const PseudoProbe *CallProbe = Binary->getCallProbeForAddr(Cur->Address);
// We may not find a probe for a merged or external callsite.
// Callsite merging may cause the loss of original probe IDs.
// Cutting off the context from here since the inliner will
// not know how to consume a context with unknown callsites.
if (!CallProbe)
return false;
Stack.push_back(CallProbe);
return true;
}
void popFrame() {
if (!Stack.empty())
Stack.pop_back();
}
// Use pseudo probe based context key to get the sample counter
// A context stands for a call path from 'main' to an uninlined
// callee with all inline frames recovered on that path. The probes
// belonging to that call path is the probes either originated from
// the callee or from any functions inlined into the callee. Since
// pseudo probes are organized in a trie-tree style after being decoded,
// the tree path from the trie-tree root (which is the uninlined
// callee) to the probe node forms an inline context.
// Here we use a list of probe pointers as the context key to speed up
// aggregation and the final context string will be generated in
// ProfileGenerator
std::shared_ptr<ProbeBasedCtxKey> getContextKey();
};
/*
As in hybrid sample we have a group of LBRs and the most recent sampling call
stack, we can walk through those LBRs to infer more call stacks which would be
@ -449,43 +351,47 @@ range as sample counter for further CS profile generation.
*/
class VirtualUnwinder {
public:
VirtualUnwinder(ContextSampleCounterMap *Counter, const ProfiledBinary *B)
: CtxCounterMap(Counter), Binary(B) {}
bool unwind(const HybridSample *Sample, uint64_t Repeat);
VirtualUnwinder(ContextSampleCounterMap *Counter) : CtxCounterMap(Counter) {}
private:
bool isCallState(UnwindState &State) const {
// The tail call frame is always missing here in stack sample, we will
// use a specific tail call tracker to infer it.
return Binary->addressIsCall(State.getCurrentLBRSource());
return State.getBinary()->addressIsCall(State.getCurrentLBRSource());
}
bool isReturnState(UnwindState &State) const {
// Simply check addressIsReturn, as ret is always reliable, both for
// regular call and tail call.
return Binary->addressIsReturn(State.getCurrentLBRSource());
return State.getBinary()->addressIsReturn(State.getCurrentLBRSource());
}
void unwindCall(UnwindState &State);
void unwindLinear(UnwindState &State, uint64_t Repeat);
void unwindReturn(UnwindState &State);
void unwindBranchWithinFrame(UnwindState &State);
template <typename T>
void collectSamplesFromFrame(UnwindState::ProfiledFrame *Cur, T &Stack);
// Collect each samples on trie node by DFS traversal
template <typename T>
void collectSamplesFromFrameTrie(UnwindState::ProfiledFrame *Cur, T &Stack);
void collectSamplesFromFrameTrie(UnwindState::ProfiledFrame *Cur);
bool unwind(const HybridSample *Sample, uint64_t Repeat);
void recordRangeCount(uint64_t Start, uint64_t End, UnwindState &State,
uint64_t Repeat);
void recordBranchCount(const LBREntry &Branch, UnwindState &State,
uint64_t Repeat);
SampleCounter &getOrCreateCounter(const ProfiledBinary *Binary,
std::list<uint64_t> &CallStack);
// Use pseudo probe based context key to get the sample counter
// A context stands for a call path from 'main' to an uninlined
// callee with all inline frames recovered on that path. The probes
// belonging to that call path is the probes either originated from
// the callee or from any functions inlined into the callee. Since
// pseudo probes are organized in a trie-tree style after being decoded,
// the tree path from the trie-tree root (which is the uninlined
// callee) to the probe node forms an inline context.
// Here we use a list of probe pointers as the context key to speed up
// aggregation and the final context string will be generated in
// ProfileGenerator
SampleCounter &getOrCreateCounterForProbe(const ProfiledBinary *Binary,
std::list<uint64_t> &CallStack);
private:
ContextSampleCounterMap *CtxCounterMap;
// Profiled binary that current frame address belongs to
const ProfiledBinary *Binary;
};
// Filename to binary map
@ -551,11 +457,10 @@ private:
// Parse the hybrid sample including the call and LBR line
void parseHybridSample(TraceStream &TraceIt);
// Extract call stack from the perf trace lines
bool extractCallstack(TraceStream &TraceIt,
SmallVectorImpl<uint64_t> &CallStack);
bool extractCallstack(TraceStream &TraceIt, std::list<uint64_t> &CallStack);
// Extract LBR stack from one perf trace line
bool extractLBRStack(TraceStream &TraceIt,
SmallVectorImpl<LBREntry> &LBRStack,
SmallVector<LBREntry, 16> &LBRStack,
ProfiledBinary *Binary);
void checkAndSetPerfType(cl::list<std::string> &PerfTraceFilenames);
// Post process the profile after trace aggregation, we will do simple range

View File

@ -126,13 +126,13 @@ bool ProfiledBinary::inlineContextEqual(uint64_t Address1,
Context2.begin(), Context2.begin() + Context2.size() - 1);
}
std::string ProfiledBinary::getExpandedContextStr(
const SmallVectorImpl<uint64_t> &Stack) const {
std::string
ProfiledBinary::getExpandedContextStr(const std::list<uint64_t> &Stack) const {
std::string ContextStr;
SmallVector<std::string, 16> ContextVec;
// Process from frame root to leaf
for (auto Address : Stack) {
uint64_t Offset = virtualAddrToOffset(Address);
for (auto Iter = Stack.rbegin(); Iter != Stack.rend(); Iter++) {
uint64_t Offset = virtualAddrToOffset(*Iter);
const FrameLocationStack &ExpandedContext = getFrameLocationStack(Offset);
for (const auto &Loc : ExpandedContext) {
ContextVec.push_back(getCallSite(Loc));

View File

@ -236,8 +236,7 @@ public:
// Get the context string of the current stack with inline context filled in.
// It will search the disassembling info stored in Offset2LocStackMap. This is
// used as the key of function sample map
std::string
getExpandedContextStr(const SmallVectorImpl<uint64_t> &Stack) const;
std::string getExpandedContextStr(const std::list<uint64_t> &stack) const;
const PseudoProbe *getCallProbeForAddr(uint64_t Address) const {
return ProbeDecoder.getCallProbeForAddr(Address);