
[CSSPGO][llvm-profgen] Refactor to unify hashable interface for trace sample and context-sensitive counter

As we plan to support both CSSPGO and AutoFDO in llvm-profgen, we will have different kinds of perf samples and different kinds of sample counters (CS/non-CS, with/without pseudo probes), both of which need to be aggregated in a hash map. This change implements a hashable interface (`Hashable`) and unified base classes for them, for better extensibility and reusability.

Currently, the perf trace sample and the context-sensitive sample counter implement `Hashable`; the class hierarchy looks like:

```
| Hashable
    | PerfSample
        | HybridSample
        | LBRSample
    | ContextKey
        | StringBasedCtxKey
        | ProbeBasedCtxKey
        | CallsiteBasedCtxKey
    | ...
```
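
As an illustration of the LLVM-style RTTI these branches rely on, here is a minimal sketch of the `ContextKey` side (assuming LLVM headers are on the include path; `printContext` is a hypothetical helper for illustration, not part of this patch):

```
#include "llvm/Support/Casting.h"
#include "llvm/Support/raw_ostream.h"
#include <string>

// Mirrors the ContextKey branch above: a Kind tag plus classof() lets
// llvm::isa/dyn_cast work without compiler RTTI.
struct ContextKey {
  enum ContextKind { CK_StringBased };
  const ContextKind Kind;
  ContextKey(ContextKind K) : Kind(K) {}
  virtual ~ContextKey() = default;
  ContextKind getKind() const { return Kind; }
};

struct StringBasedCtxKey : public ContextKey {
  std::string Context;
  StringBasedCtxKey() : ContextKey(CK_StringBased) {}
  static bool classof(const ContextKey *K) {
    return K->getKind() == CK_StringBased;
  }
};

// Hypothetical helper: recover the concrete key from a base pointer.
static void printContext(const ContextKey *K) {
  if (const auto *S = llvm::dyn_cast<StringBasedCtxKey>(K))
    llvm::outs() << S->Context << "\n";
}
```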

- Classes implementing `Hashable` should provide `getHashCode` and `isEqual`. Here we make `getHashCode` a non-virtual function to avoid vtable overhead, so a derived class should calculate its hash and assign it to the base class's `HashCode` manually. This also provides the flexibility to calculate the hash code incrementally (like a rolling hash) during frame stack unwinding, since unwinding only changes the leaf of the frame stack (see the standalone sketch after this list).
- `isEqual` is a virtual function, which will have some performance overhead. In the future, if we design a better hash function, we can skip the comparison or switch to a non-virtual function.
- Added `PerfSample` and `ContextKey` as base classes for perf samples and counter context keys, leveraging LLVM-style RTTI.
- Added a `StringBasedCtxKey` class extending `ContextKey` that uses a string as the context id.
- Refactored `AggregationCounter` (now `AggregatedCounter`) to take any kind of `PerfSample` as key.
- Refactored `ContextSampleCounter` (now `ContextSampleCounterMap`) to take any kind of `ContextKey` as key.
- Other refactoring work:
 - Created a wrapper class `SampleCounter` to wrap `RangeCounter` and `BranchCounter`
 - Hoisted `ContextId` and `FunctionProfile` out of `populateFunctionBodySamples` and `populateFunctionBoundarySamples` so they can be reused in `ProfileGenerator`
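
For a concrete picture of the overall pattern, here is a self-contained toy sketch (plain C++; `KeyBase`/`StringKey` are illustrative stand-ins, not the actual llvm-profgen classes) showing how a stored, non-virtual hash plus a virtual `isEqual` drive an `unordered_map`-based aggregation counter:

```
#include <cassert>
#include <cstdint>
#include <memory>
#include <string>
#include <unordered_map>

// Base key: the hash is stored, so getHashCode stays non-virtual.
struct KeyBase {
  uint64_t HashCode = 0;
  virtual ~KeyBase() = default;
  uint64_t getHashCode() const { return HashCode; }
  virtual bool isEqual(const KeyBase *K) const {
    return HashCode == K->HashCode;
  }
};

// Toy derived key: computes its DJB2 hash once and stores it in the base.
struct StringKey : public KeyBase {
  std::string Context;
  void genHashCode() {
    uint64_t H = 5381;
    for (unsigned char C : Context)
      H = ((H << 5) + H) + C;
    HashCode = H;
  }
  bool isEqual(const KeyBase *K) const override {
    // A real implementation would check the dynamic type first.
    return Context == static_cast<const StringKey *>(K)->Context;
  }
};

// Wrapper providing the Hash/Equal functors that unordered_map needs.
template <class T> class Hashable {
public:
  std::shared_ptr<T> Data;
  Hashable(const std::shared_ptr<T> &D) : Data(D) {}
  struct Hash {
    uint64_t operator()(const Hashable<T> &K) const {
      return K.Data->getHashCode();
    }
  };
  struct Equal {
    bool operator()(const Hashable<T> &L, const Hashable<T> &R) const {
      return L.Data->isEqual(R.Data.get());
    }
  };
};

int main() {
  std::unordered_map<Hashable<KeyBase>, uint64_t, Hashable<KeyBase>::Hash,
                     Hashable<KeyBase>::Equal>
      Counter;
  auto K = std::make_shared<StringKey>();
  K->Context = "main @ foo @ bar";
  K->genHashCode(); // the caller must hash before inserting
  Counter[Hashable<KeyBase>(K)]++;
  assert(Counter.size() == 1);
  return 0;
}
```

The same division of labor appears in the patch below: derived classes call `genHashCode()` once, and the map's `Hash`/`Equal` functors only consult the stored code and `isEqual`.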

Differential Revision: https://reviews.llvm.org/D92584
wlei 2020-12-02 23:10:11 -08:00
parent b32f59cb87
commit 35a868aba5
4 changed files with 279 additions and 157 deletions

PerfReader.cpp

@@ -72,26 +72,39 @@ void VirtualUnwinder::unwindBranchWithinFrame(UnwindState &State) {
   State.InstPtr.update(Source);
 }
+SampleCounter &
+VirtualUnwinder::getOrCreateSampleCounter(const ProfiledBinary *Binary,
+                                          std::list<uint64_t> &CallStack) {
+  std::shared_ptr<StringBasedCtxKey> KeyStr =
+      std::make_shared<StringBasedCtxKey>();
+  KeyStr->Context = Binary->getExpandedContextStr(CallStack);
+  KeyStr->genHashCode();
+  auto Ret =
+      CtxCounterMap->emplace(Hashable<ContextKey>(KeyStr), SampleCounter());
+  return Ret.first->second;
+}
 void VirtualUnwinder::recordRangeCount(uint64_t Start, uint64_t End,
                                        UnwindState &State, uint64_t Repeat) {
-  std::string &&ContextId = State.getExpandedContextStr();
   uint64_t StartOffset = State.getBinary()->virtualAddrToOffset(Start);
   uint64_t EndOffset = State.getBinary()->virtualAddrToOffset(End);
-  SampleCounters->recordRangeCount(ContextId, StartOffset, EndOffset, Repeat);
+  SampleCounter &SCounter =
+      getOrCreateSampleCounter(State.getBinary(), State.CallStack);
+  SCounter.recordRangeCount(StartOffset, EndOffset, Repeat);
 }
 void VirtualUnwinder::recordBranchCount(const LBREntry &Branch,
                                         UnwindState &State, uint64_t Repeat) {
   if (Branch.IsArtificial)
     return;
-  std::string &&ContextId = State.getExpandedContextStr();
   uint64_t SourceOffset = State.getBinary()->virtualAddrToOffset(Branch.Source);
   uint64_t TargetOffset = State.getBinary()->virtualAddrToOffset(Branch.Target);
-  SampleCounters->recordBranchCount(ContextId, SourceOffset, TargetOffset,
-                                    Repeat);
+  SampleCounter &SCounter =
+      getOrCreateSampleCounter(State.getBinary(), State.CallStack);
+  SCounter.recordBranchCount(SourceOffset, TargetOffset, Repeat);
 }
-bool VirtualUnwinder::unwind(const HybridSample &Sample, uint64_t Repeat) {
+bool VirtualUnwinder::unwind(const HybridSample *Sample, uint64_t Repeat) {
   // Capture initial state as starting point for unwinding.
   UnwindState State(Sample);
@@ -198,10 +211,10 @@ ProfiledBinary *PerfReader::getBinary(uint64_t Address) {
   return Iter->second;
 }
-static void printSampleCounter(ContextRangeCounter &Counter) {
-  // Use ordered map to make the output deterministic
-  std::map<std::string, RangeSample> OrderedCounter(Counter.begin(),
-                                                    Counter.end());
+// Use ordered map to make the output deterministic
+using OrderedCounterForPrint = std::map<StringRef, RangeSample>;
+static void printSampleCounter(OrderedCounterForPrint &OrderedCounter) {
   for (auto Range : OrderedCounter) {
     outs() << Range.first << "\n";
     for (auto I : Range.second) {
@@ -211,20 +224,40 @@ static void printSampleCounter(ContextRangeCounter &Counter) {
   }
 }
+static void printRangeCounter(ContextSampleCounterMap &Counter) {
+  OrderedCounterForPrint OrderedCounter;
+  for (auto &CI : Counter) {
+    const StringBasedCtxKey *CtxKey =
+        dyn_cast<StringBasedCtxKey>(CI.first.getPtr());
+    OrderedCounter[CtxKey->Context] = CI.second.RangeCounter;
+  }
+  printSampleCounter(OrderedCounter);
+}
+static void printBranchCounter(ContextSampleCounterMap &Counter) {
+  OrderedCounterForPrint OrderedCounter;
+  for (auto &CI : Counter) {
+    const StringBasedCtxKey *CtxKey =
+        dyn_cast<StringBasedCtxKey>(CI.first.getPtr());
+    OrderedCounter[CtxKey->Context] = CI.second.BranchCounter;
+  }
+  printSampleCounter(OrderedCounter);
+}
 void PerfReader::printUnwinderOutput() {
   for (auto I : BinarySampleCounters) {
     const ProfiledBinary *Binary = I.first;
     outs() << "Binary(" << Binary->getName().str() << ")'s Range Counter:\n";
-    printSampleCounter(I.second.RangeCounter);
+    printRangeCounter(I.second);
     outs() << "\nBinary(" << Binary->getName().str() << ")'s Branch Counter:\n";
-    printSampleCounter(I.second.BranchCounter);
+    printBranchCounter(I.second);
   }
 }
 void PerfReader::unwindSamples() {
   for (const auto &Item : AggregatedSamples) {
-    const HybridSample &Sample = Item.first;
-    VirtualUnwinder Unwinder(&BinarySampleCounters[Sample.Binary]);
+    const HybridSample *Sample = dyn_cast<HybridSample>(Item.first.getPtr());
+    VirtualUnwinder Unwinder(&BinarySampleCounters[Sample->Binary]);
     Unwinder.unwind(Sample, Item.second);
   }
@@ -366,26 +399,27 @@ void PerfReader::parseHybridSample(TraceStream &TraceIt) {
   // 0x4005c8/0x4005dc/P/-/-/0 0x40062f/0x4005b0/P/-/-/0 ...
   // ... 0x4005c8/0x4005dc/P/-/-/0 # LBR Entries
   //
-  HybridSample Sample;
+  std::shared_ptr<HybridSample> Sample = std::make_shared<HybridSample>();
   // Parsing call stack and populate into HybridSample.CallStack
-  if (!extractCallstack(TraceIt, Sample.CallStack)) {
+  if (!extractCallstack(TraceIt, Sample->CallStack)) {
     // Skip the next LBR line matched current call stack
     if (!TraceIt.isAtEoF() && TraceIt.getCurrentLine().startswith(" 0x"))
       TraceIt.advance();
     return;
   }
   // Set the binary current sample belongs to
-  Sample.Binary = getBinary(Sample.CallStack.front());
+  Sample->Binary = getBinary(Sample->CallStack.front());
   if (!TraceIt.isAtEoF() && TraceIt.getCurrentLine().startswith(" 0x")) {
     // Parsing LBR stack and populate into HybridSample.LBRStack
-    if (extractLBRStack(TraceIt, Sample.LBRStack, Sample.Binary)) {
+    if (extractLBRStack(TraceIt, Sample->LBRStack, Sample->Binary)) {
       // Canonicalize stack leaf to avoid 'random' IP from leaf frame skew LBR
       // ranges
-      Sample.CallStack.front() = Sample.LBRStack[0].Target;
+      Sample->CallStack.front() = Sample->LBRStack[0].Target;
       // Record samples by aggregation
-      AggregatedSamples[Sample]++;
+      Sample->genHashCode();
+      AggregatedSamples[Hashable<PerfSample>(Sample)]++;
     }
   } else {
     // LBR sample is encoded in single line after stack sample

PerfReader.h

@@ -10,6 +10,7 @@
 #define LLVM_TOOLS_LLVM_PROFGEN_PERFREADER_H
 #include "ErrorHandling.h"
 #include "ProfiledBinary.h"
+#include "llvm/Support/Casting.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Regex.h"
 #include <fstream>
@@ -75,8 +76,60 @@ struct LBREntry {
       : Source(S), Target(T), IsArtificial(I) {}
 };
+// Hash interface for generic data of type T
+// Data should implement a \fn getHashCode and a \fn isEqual
+// Currently getHashCode is non-virtual to avoid the overhead of calling the
+// vtable, i.e., we explicitly calculate the hash of the derived class and
+// assign it to the base class's HashCode. This also provides the flexibility
+// for calculating the hash code incrementally (like a rolling hash) during
+// frame stack unwinding, since unwinding only changes the leaf of the frame
+// stack. \fn isEqual is a virtual function, which will have perf overhead. In
+// the future, if we redesign a better hash function, then we can just skip
+// this or switch to a non-virtual function (e.g., just ignore the comparison
+// if the hash conflict probability is low).
+template <class T> class Hashable {
+public:
+  std::shared_ptr<T> Data;
+  Hashable(const std::shared_ptr<T> &D) : Data(D) {}
+  // Hash code generation
+  struct Hash {
+    uint64_t operator()(const Hashable<T> &Key) const {
+      // Don't make it virtual for getHashCode
+      assert(Key.Data->getHashCode() && "Should generate HashCode for it!");
+      return Key.Data->getHashCode();
+    }
+  };
+  // Hash equal
+  struct Equal {
+    bool operator()(const Hashable<T> &LHS, const Hashable<T> &RHS) const {
+      // Precisely compare the data; the vtable will have overhead.
+      return LHS.Data->isEqual(RHS.Data.get());
+    }
+  };
+  T *getPtr() const { return Data.get(); }
+};
+// Base class to extend for all types of perf sample
+struct PerfSample {
+  uint64_t HashCode = 0;
+  virtual ~PerfSample() = default;
+  uint64_t getHashCode() const { return HashCode; }
+  virtual bool isEqual(const PerfSample *K) const {
+    return HashCode == K->HashCode;
+  };
+  // Utilities for LLVM-style RTTI
+  enum PerfKind { PK_HybridSample };
+  const PerfKind Kind;
+  PerfKind getKind() const { return Kind; }
+  PerfSample(PerfKind K) : Kind(K){};
+};
 // The parsed hybrid sample including call stack and LBR stack.
-struct HybridSample {
+struct HybridSample : public PerfSample {
   // Profiled binary that current frame address belongs to
   ProfiledBinary *Binary;
   // Call stack recorded in FILO(leaf to root) order
@@ -84,12 +137,18 @@ struct HybridSample {
   // LBR stack recorded in FIFO order
   SmallVector<LBREntry, 16> LBRStack;
+  HybridSample() : PerfSample(PK_HybridSample){};
+  static bool classof(const PerfSample *K) {
+    return K->getKind() == PK_HybridSample;
+  }
   // Used for sample aggregation
-  bool operator==(const HybridSample &Other) const {
-    if (Other.Binary != Binary)
+  bool isEqual(const PerfSample *K) const override {
+    const HybridSample *Other = dyn_cast<HybridSample>(K);
+    if (Other->Binary != Binary)
       return false;
-    const std::list<uint64_t> &OtherCallStack = Other.CallStack;
-    const SmallVector<LBREntry, 16> &OtherLBRStack = Other.LBRStack;
+    const std::list<uint64_t> &OtherCallStack = Other->CallStack;
+    const SmallVector<LBREntry, 16> &OtherLBRStack = Other->LBRStack;
     if (CallStack.size() != OtherCallStack.size() ||
         LBRStack.size() != OtherLBRStack.size())
@@ -108,8 +167,32 @@ struct HybridSample {
     }
     return true;
   }
+  void genHashCode() {
+    // Use simple DJB2 hash
+    auto HashCombine = [](uint64_t H, uint64_t V) {
+      return ((H << 5) + H) + V;
+    };
+    uint64_t Hash = 5381;
+    Hash = HashCombine(Hash, reinterpret_cast<uint64_t>(Binary));
+    for (const auto &Value : CallStack) {
+      Hash = HashCombine(Hash, Value);
+    }
+    for (const auto &Entry : LBRStack) {
+      Hash = HashCombine(Hash, Entry.Source);
+      Hash = HashCombine(Hash, Entry.Target);
+    }
+    HashCode = Hash;
+  }
 };
+// After parsing the sample, we record the samples by aggregating them
+// into this counter. The key stores the sample data and the value is
+// the sample repeat times.
+using AggregatedCounter =
+    std::unordered_map<Hashable<PerfSample>, uint64_t,
+                       Hashable<PerfSample>::Hash, Hashable<PerfSample>::Equal>;
 // The state for the unwinder; it doesn't hold the data but only keeps
 // pointers/indices into the data. While unwinding, the CallStack is changed
 // dynamically and will be recorded as the context of the sample
@@ -124,10 +207,10 @@ struct UnwindState {
   const SmallVector<LBREntry, 16> &LBRStack;
   // Used to iterate the address range
   InstructionPointer InstPtr;
-  UnwindState(const HybridSample &Sample)
-      : Binary(Sample.Binary), CallStack(Sample.CallStack),
-        LBRStack(Sample.LBRStack),
-        InstPtr(Sample.Binary, Sample.CallStack.front()) {}
+  UnwindState(const HybridSample *Sample)
+      : Binary(Sample->Binary), CallStack(Sample->CallStack),
+        LBRStack(Sample->LBRStack),
+        InstPtr(Sample->Binary, Sample->CallStack.front()) {}
   bool validateInitialState() {
     uint64_t LBRLeaf = LBRStack[LBRIndex].Target;
@@ -160,56 +243,61 @@ struct UnwindState {
   void advanceLBR() { LBRIndex++; }
 };
+// Base class for sample counter key with context
+struct ContextKey {
+  uint64_t HashCode = 0;
+  virtual ~ContextKey() = default;
+  uint64_t getHashCode() const { return HashCode; }
+  virtual bool isEqual(const ContextKey *K) const {
+    return HashCode == K->HashCode;
+  };
+  // Utilities for LLVM-style RTTI
+  enum ContextKind { CK_StringBased };
+  const ContextKind Kind;
+  ContextKind getKind() const { return Kind; }
+  ContextKey(ContextKind K) : Kind(K){};
+};
+// String based context id
+struct StringBasedCtxKey : public ContextKey {
+  std::string Context;
+  StringBasedCtxKey() : ContextKey(CK_StringBased){};
+  static bool classof(const ContextKey *K) {
+    return K->getKind() == CK_StringBased;
+  }
+  bool isEqual(const ContextKey *K) const override {
+    const StringBasedCtxKey *Other = dyn_cast<StringBasedCtxKey>(K);
+    return Context == Other->Context;
+  }
+  void genHashCode() { HashCode = hash_value(Context); }
+};
 // The counter of branch samples for one function indexed by the branch,
 // which is represented as the source and target offset pair.
 using BranchSample = std::map<std::pair<uint64_t, uint64_t>, uint64_t>;
 // The counter of range samples for one function indexed by the range,
 // which is represented as the start and end offset pair.
 using RangeSample = std::map<std::pair<uint64_t, uint64_t>, uint64_t>;
-// Range sample counters indexed by the context string
-using ContextRangeCounter = std::unordered_map<std::string, RangeSample>;
-// Branch sample counters indexed by the context string
-using ContextBranchCounter = std::unordered_map<std::string, BranchSample>;
-// For Hybrid sample counters
-struct ContextSampleCounters {
-  ContextRangeCounter RangeCounter;
-  ContextBranchCounter BranchCounter;
-  void recordRangeCount(std::string &ContextId, uint64_t Start, uint64_t End,
-                        uint64_t Repeat) {
-    RangeCounter[ContextId][{Start, End}] += Repeat;
+// Wrapper for sample counters including range counter and branch counter
+struct SampleCounter {
+  RangeSample RangeCounter;
+  BranchSample BranchCounter;
+  void recordRangeCount(uint64_t Start, uint64_t End, uint64_t Repeat) {
+    RangeCounter[{Start, End}] += Repeat;
   }
-  void recordBranchCount(std::string &ContextId, uint64_t Source,
-                         uint64_t Target, uint64_t Repeat) {
-    BranchCounter[ContextId][{Source, Target}] += Repeat;
+  void recordBranchCount(uint64_t Source, uint64_t Target, uint64_t Repeat) {
+    BranchCounter[{Source, Target}] += Repeat;
   }
 };
-struct HybridSampleHash {
-  uint64_t hashCombine(uint64_t Hash, uint64_t Value) const {
-    // Simple DJB2 hash
-    return ((Hash << 5) + Hash) + Value;
-  }
-  uint64_t operator()(const HybridSample &Sample) const {
-    uint64_t Hash = 5381;
-    Hash = hashCombine(Hash, reinterpret_cast<uint64_t>(Sample.Binary));
-    for (const auto &Value : Sample.CallStack) {
-      Hash = hashCombine(Hash, Value);
-    }
-    for (const auto &Entry : Sample.LBRStack) {
-      Hash = hashCombine(Hash, Entry.Source);
-      Hash = hashCombine(Hash, Entry.Target);
-    }
-    return Hash;
-  }
-};
-// After parsing the sample, we record the samples by aggregating them
-// into this structure and the value is the sample counter.
-using AggregationCounter =
-    std::unordered_map<HybridSample, uint64_t, HybridSampleHash>;
+// Sample counter with context to support context-sensitive profile
+using ContextSampleCounterMap =
+    std::unordered_map<Hashable<ContextKey>, SampleCounter,
+                       Hashable<ContextKey>::Hash, Hashable<ContextKey>::Equal>;
 /*
 As in hybrid sample we have a group of LBRs and the most recent sampling call
@@ -232,7 +320,7 @@ range as sample counter for further CS profile generation.
 */
 class VirtualUnwinder {
 public:
-  VirtualUnwinder(ContextSampleCounters *Counters) : SampleCounters(Counters) {}
+  VirtualUnwinder(ContextSampleCounterMap *Counter) : CtxCounterMap(Counter) {}
   bool isCallState(UnwindState &State) const {
     // The tail call frame is always missing here in stack sample, we will
@@ -250,14 +338,16 @@ public:
   void unwindLinear(UnwindState &State, uint64_t Repeat);
   void unwindReturn(UnwindState &State);
   void unwindBranchWithinFrame(UnwindState &State);
-  bool unwind(const HybridSample &Sample, uint64_t Repeat);
+  bool unwind(const HybridSample *Sample, uint64_t Repeat);
   void recordRangeCount(uint64_t Start, uint64_t End, UnwindState &State,
                         uint64_t Repeat);
   void recordBranchCount(const LBREntry &Branch, UnwindState &State,
                          uint64_t Repeat);
+  SampleCounter &getOrCreateSampleCounter(const ProfiledBinary *Binary,
+                                          std::list<uint64_t> &CallStack);
 private:
-  ContextSampleCounters *SampleCounters;
+  ContextSampleCounterMap *CtxCounterMap;
 };
// Filename to binary map
@@ -268,7 +358,7 @@ using AddressBinaryMap = std::map<uint64_t, ProfiledBinary *>;
 // same binary loaded at different addresses, they should share the same sample
 // counter
 using BinarySampleCounterMap =
-    std::unordered_map<ProfiledBinary *, ContextSampleCounters>;
+    std::unordered_map<ProfiledBinary *, ContextSampleCounterMap>;
 // Load binaries and read perf trace to parse the events and samples
 class PerfReader {
@@ -344,7 +434,7 @@ private:
 private:
   BinarySampleCounterMap BinarySampleCounters;
   // Samples with the repeating time generated by the perf reader
-  AggregationCounter AggregatedSamples;
+  AggregatedCounter AggregatedSamples;
   PerfScriptType PerfType;
 };

ProfileGenerator.cpp

@@ -178,19 +178,13 @@ void CSProfileGenerator::updateBodySamplesforFunctionProfile(
   }
 }
-void CSProfileGenerator::populateFunctionBodySamples() {
-  for (const auto &BI : BinarySampleCounters) {
-    ProfiledBinary *Binary = BI.first;
-    for (const auto &CI : BI.second.RangeCounter) {
-      StringRef ContextId(CI.first);
-      // Get or create function profile for the range
-      FunctionSamples &FunctionProfile =
-          getFunctionProfileForContext(ContextId);
+void CSProfileGenerator::populateFunctionBodySamples(
+    FunctionSamples &FunctionProfile, const RangeSample &RangeCounter,
+    ProfiledBinary *Binary) {
   // Compute disjoint ranges first, so we can use MAX
   // for calculating count for each location.
   RangeSample Ranges;
-  findDisjointRanges(Ranges, CI.second);
+  findDisjointRanges(Ranges, RangeCounter);
   for (auto Range : Ranges) {
     uint64_t RangeBegin = Binary->offsetToVirtualAddr(Range.first.first);
     uint64_t RangeEnd = Binary->offsetToVirtualAddr(Range.first.second);
@@ -217,20 +211,13 @@ void CSProfileGenerator::populateFunctionBodySamples() {
       IP.advance();
     }
   }
-    }
-  }
 }
-void CSProfileGenerator::populateFunctionBoundarySamples() {
-  for (const auto &BI : BinarySampleCounters) {
-    ProfiledBinary *Binary = BI.first;
-    for (const auto &CI : BI.second.BranchCounter) {
-      StringRef ContextId(CI.first);
-      // Get or create function profile for branch Source
-      FunctionSamples &FunctionProfile =
-          getFunctionProfileForContext(ContextId);
-    for (auto Entry : CI.second) {
+void CSProfileGenerator::populateFunctionBoundarySamples(
+    StringRef ContextId, FunctionSamples &FunctionProfile,
+    const BranchSample &BranchCounters, ProfiledBinary *Binary) {
+  for (auto Entry : BranchCounters) {
     uint64_t SourceOffset = Entry.first.first;
     uint64_t TargetOffset = Entry.first.second;
     uint64_t Count = Entry.second;
@@ -241,8 +228,7 @@ void CSProfileGenerator::populateFunctionBoundarySamples() {
       continue;
     // Record called target sample and its count
-      const FrameLocation &LeafLoc =
-          Binary->getInlineLeafFrameLoc(SourceOffset);
+    const FrameLocation &LeafLoc = Binary->getInlineLeafFrameLoc(SourceOffset);
     FunctionProfile.addCalledTargetSamples(LeafLoc.second.LineOffset,
                                            LeafLoc.second.Discriminator,
@@ -258,16 +244,11 @@ void CSProfileGenerator::populateFunctionBoundarySamples() {
           ContextId.rsplit(" @ ").first.str() + " @ " + CalleeContextId;
     }
     if (ProfileMap.find(CalleeContextId) != ProfileMap.end()) {
-      FunctionSamples &CalleeProfile = ProfileMap[CalleeContextId];
+      FunctionSamples &CalleeProfile =
+          getFunctionProfileForContext(CalleeContextId);
       assert(Count != 0 && "Unexpected zero weight branch");
       if (CalleeProfile.getName().size()) {
        CalleeProfile.addHeadSamples(Count);
       }
     }
-    }
-  }
 }
static FrameLocation getCallerContext(StringRef CalleeContext,

ProfileGenerator.h

@@ -64,12 +64,24 @@ public:
 public:
   void generateProfile() override {
+    for (const auto &BI : BinarySampleCounters) {
+      ProfiledBinary *Binary = BI.first;
+      for (const auto &CI : BI.second) {
+        const StringBasedCtxKey *CtxKey =
+            dyn_cast<StringBasedCtxKey>(CI.first.getPtr());
+        StringRef ContextId(CtxKey->Context);
+        // Get or create function profile for the range
+        FunctionSamples &FunctionProfile =
+            getFunctionProfileForContext(ContextId);
        // Fill in function body samples
-    populateFunctionBodySamples();
+        populateFunctionBodySamples(FunctionProfile, CI.second.RangeCounter,
+                                    Binary);
        // Fill in boundary sample counts as well as call site samples for calls
-    populateFunctionBoundarySamples();
+        populateFunctionBoundarySamples(ContextId, FunctionProfile,
+                                        CI.second.BranchCounter, Binary);
+      }
+    }
     // Fill in call site value sample for inlined calls and also use context to
     // infer missing samples. Since we don't have call count for inlined
     // functions, we estimate it from inlinee's profile using the entry of the
@@ -85,8 +97,13 @@ private:
                                uint64_t Count);
   // Lookup or create FunctionSamples for the context
   FunctionSamples &getFunctionProfileForContext(StringRef ContextId);
-  void populateFunctionBodySamples();
-  void populateFunctionBoundarySamples();
+  void populateFunctionBodySamples(FunctionSamples &FunctionProfile,
+                                   const RangeSample &RangeCounters,
+                                   ProfiledBinary *Binary);
+  void populateFunctionBoundarySamples(StringRef ContextId,
+                                       FunctionSamples &FunctionProfile,
+                                       const BranchSample &BranchCounters,
+                                       ProfiledBinary *Binary);
   void populateInferredFunctionSamples();
 };