mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-11-22 10:42:39 +01:00
[XRay] Account: recursion detection
Summary: Recursion detection can be non-trivial. Currently, the state-of-the-art for LLVM, as far as i'm concerned, is D72362 `[clang-tidy] misc-no-recursion: a new check`. However, it is quite limited: * It does very basic call-graph based analysis, in the sense it will report even dynamically-unreachable recursion. * It is inherently limited to a single TU * It is hard to gauge how problematic each recursion is in practice. Some of that can be addressed by adding clang analyzer-based check, then it would at least support multiple TU's. However, we can approach this problem from another angle - dynamic run-time analysis. We already have means to capture a run-time callgraph (XRay, duh), and there are already means to reconstruct it within `llvm-xray` tool. This proposes to add a `-recursive-calls-only` switch to the `account` tool. When the switch is on, when re-constructing callgraph for latency reconstruction, each time we enter/leave some function, we increment/decrement an entry for the function in a "recursion depth" map. If, when we leave the function, said entry was at `1`, then that means the function didn't call itself, however if it is at `2` or more, then that means the function (possibly indirectly) called itself. If the depth is 1, we don't account the time spent there, unless within this call stack the function already recursed into itself. Note that we don't pay for recursion depth tracking when `recursive-calls-only` is not on, and the perf impact is insignificant (+0.3% regression) The overhead of the option is actually negative, around -5.26% user time on a medium-sized (3.5G) XRay log. As a practical example, that 3.5G log is a capture of the entire middle-end opt pipeline at `-O3` for RawSpeed unity build. There are total of `5500` functions in the log, however `-recursive-calls-only` says that `269`, or 5%, are recursive. Having this functionality could be helpful for recursion eradication. Reviewers: dberris, mboerger Reviewed By: dberris Subscribers: llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D84582
This commit is contained in:
parent
8d83600e34
commit
2e1aaf1aa5
@ -0,0 +1,46 @@
|
||||
# RUN: llvm-xray account -d %s -o - -m %S/Inputs/simple-instrmap.yaml | FileCheck --check-prefixes=ALL %s
|
||||
# RUN: llvm-xray account -d -recursive-calls-only %s -o - -m %S/Inputs/simple-instrmap.yaml | FileCheck --check-prefixes=RECURSIVE %s
|
||||
|
||||
---
|
||||
header:
|
||||
version: 1
|
||||
type: 0
|
||||
constant-tsc: true
|
||||
nonstop-tsc: true
|
||||
cycle-frequency: 0
|
||||
records:
|
||||
# Here we reconstruct the following call trace:
|
||||
#
|
||||
# f1()
|
||||
# f2()
|
||||
# f3()
|
||||
# f2()
|
||||
#
|
||||
# But we find that we're missing an exit record for f2() because it's
|
||||
# tail-called f3(). We make sure that if we see a trace like this that we can
|
||||
# deduce tail calls, and account the time (potentially wrongly) to f2() when
|
||||
# f1() exits. That is because we don't go back to f3()'s entry record to
|
||||
# properly do the math on the timing of f2().
|
||||
#
|
||||
# As a result, we can deduce that f2() is not recursive here.
|
||||
#
|
||||
# Note that by default, tail/sibling call deduction is disabled, and is enabled
|
||||
# with a flag "-d" or "-deduce-sibling-calls".
|
||||
#
|
||||
- { type: 0, func-id: 1, cpu: 1, thread: 111, kind: function-enter, tsc: 10000 }
|
||||
- { type: 0, func-id: 2, cpu: 1, thread: 111, kind: function-enter, tsc: 10001 }
|
||||
- { type: 0, func-id: 3, cpu: 1, thread: 111, kind: function-enter, tsc: 10002 }
|
||||
- { type: 0, func-id: 3, cpu: 1, thread: 111, kind: function-exit, tsc: 10003 }
|
||||
- { type: 0, func-id: 1, cpu: 1, thread: 111, kind: function-exit, tsc: 10004 }
|
||||
- { type: 0, func-id: 2, cpu: 1, thread: 111, kind: function-enter, tsc: 10005 }
|
||||
- { type: 0, func-id: 2, cpu: 1, thread: 111, kind: function-exit, tsc: 10006 }
|
||||
...
|
||||
|
||||
# ALL: Functions with latencies: 3
|
||||
# ALL-NEXT: funcid count [ min, med, 90p, 99p, max] sum function
|
||||
# ALL-NEXT: 1 1 [ 4.000000, 4.000000, 4.000000, 4.000000, 4.000000] 4.000000 <invalid>:0:0: @(1)
|
||||
# ALL-NEXT: 2 2 [ 1.000000, 3.000000, 3.000000, 3.000000, 3.000000] 4.000000 <invalid>:0:0: @(2)
|
||||
# ALL-NEXT: 3 1 [ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000] 1.000000 <invalid>:0:0: @(3)
|
||||
|
||||
# RECURSIVE: Functions with latencies: 0
|
||||
# RECURSIVE-NEXT: funcid count [ min, med, 90p, 99p, max] sum function
|
34
test/tools/llvm-xray/X86/account-recursive-calls-only.yaml
Normal file
34
test/tools/llvm-xray/X86/account-recursive-calls-only.yaml
Normal file
@ -0,0 +1,34 @@
|
||||
# RUN: llvm-xray account %s -o - -m %S/Inputs/simple-instrmap.yaml | FileCheck --check-prefixes=ALL %s
|
||||
# RUN: llvm-xray account -recursive-calls-only %s -o - -m %S/Inputs/simple-instrmap.yaml | FileCheck --check-prefixes=RECURSIVE %s
|
||||
|
||||
---
|
||||
header:
|
||||
version: 1
|
||||
type: 0
|
||||
constant-tsc: true
|
||||
nonstop-tsc: true
|
||||
cycle-frequency: 2601000000
|
||||
records:
|
||||
- { type: 0, func-id: 1, cpu: 1, thread: 111, kind: function-enter, tsc: 0 }
|
||||
- { type: 0, func-id: 1, cpu: 1, thread: 111, kind: function-exit, tsc: 100000000 }
|
||||
- { type: 0, func-id: 2, cpu: 1, thread: 111, kind: function-enter, tsc: 200000000 }
|
||||
- { type: 0, func-id: 2, cpu: 1, thread: 111, kind: function-exit, tsc: 300000000 }
|
||||
- { type: 0, func-id: 2, cpu: 1, thread: 111, kind: function-enter, tsc: 400000000 }
|
||||
- { type: 0, func-id: 3, cpu: 1, thread: 111, kind: function-enter, tsc: 500000000 }
|
||||
- { type: 0, func-id: 3, cpu: 1, thread: 111, kind: function-enter, tsc: 600000000 }
|
||||
- { type: 0, func-id: 3, cpu: 1, thread: 111, kind: function-exit, tsc: 700000000 }
|
||||
- { type: 0, func-id: 3, cpu: 1, thread: 111, kind: function-exit, tsc: 800000000 }
|
||||
- { type: 0, func-id: 2, cpu: 1, thread: 111, kind: function-exit, tsc: 900000000 }
|
||||
- { type: 0, func-id: 3, cpu: 1, thread: 111, kind: function-enter, tsc: 1000000000 }
|
||||
- { type: 0, func-id: 3, cpu: 1, thread: 111, kind: function-exit, tsc: 1100000000 }
|
||||
...
|
||||
|
||||
# ALL: Functions with latencies: 3
|
||||
# ALL-NEXT: funcid count [ min, med, 90p, 99p, max] sum function
|
||||
# ALL-NEXT: 1 1 [ 0.038447, 0.038447, 0.038447, 0.038447, 0.038447] 0.038447 <invalid>:0:0: @(1)
|
||||
# ALL-NEXT: 2 2 [ 0.038447, 0.192234, 0.192234, 0.192234, 0.192234] 0.230681 <invalid>:0:0: @(2)
|
||||
# ALL-NEXT: 3 3 [ 0.038447, 0.038447, 0.115340, 0.115340, 0.115340] 0.192234 <invalid>:0:0: @(3)
|
||||
|
||||
# RECURSIVE: Functions with latencies: 1
|
||||
# RECURSIVE-NEXT: funcid count [ min, med, 90p, 99p, max] sum function
|
||||
# RECURSIVE-NEXT: 3 2 [ 0.038447, 0.115340, 0.115340, 0.115340, 0.115340] 0.153787 <invalid>:0:0: @(3)
|
@ -35,6 +35,9 @@ static cl::opt<bool>
|
||||
cl::sub(Account), cl::init(false));
|
||||
static cl::alias AccountKeepGoing2("k", cl::aliasopt(AccountKeepGoing),
|
||||
cl::desc("Alias for -keep_going"));
|
||||
static cl::opt<bool> AccountRecursiveCallsOnly(
|
||||
"recursive-calls-only", cl::desc("Only count the calls that are recursive"),
|
||||
cl::sub(Account), cl::init(false));
|
||||
static cl::opt<bool> AccountDeduceSiblingCalls(
|
||||
"deduce-sibling-calls",
|
||||
cl::desc("Deduce sibling calls when unrolling function call stacks"),
|
||||
@ -126,6 +129,32 @@ template <class T> T diff(T L, T R) { return std::max(L, R) - std::min(L, R); }
|
||||
|
||||
} // namespace
|
||||
|
||||
using RecursionStatus = LatencyAccountant::FunctionStack::RecursionStatus;
|
||||
RecursionStatus &RecursionStatus::operator++() {
|
||||
auto Depth = Bitfield::get<RecursionStatus::Depth>(Storage);
|
||||
assert(Depth >= 0 && Depth < std::numeric_limits<decltype(Depth)>::max());
|
||||
++Depth;
|
||||
Bitfield::set<RecursionStatus::Depth>(Storage, Depth); // ++Storage
|
||||
// Did this function just (maybe indirectly) call itself the first time?
|
||||
if (!isRecursive() && Depth == 2) // Storage == 2 / Storage s> 1
|
||||
Bitfield::set<RecursionStatus::IsRecursive>(Storage,
|
||||
true); // Storage |= INT_MIN
|
||||
return *this;
|
||||
}
|
||||
RecursionStatus &RecursionStatus::operator--() {
|
||||
auto Depth = Bitfield::get<RecursionStatus::Depth>(Storage);
|
||||
assert(Depth > 0);
|
||||
--Depth;
|
||||
Bitfield::set<RecursionStatus::Depth>(Storage, Depth); // --Storage
|
||||
// Did we leave a function that previouly (maybe indirectly) called itself?
|
||||
if (isRecursive() && Depth == 0) // Storage == INT_MIN
|
||||
Bitfield::set<RecursionStatus::IsRecursive>(Storage, false); // Storage = 0
|
||||
return *this;
|
||||
}
|
||||
bool RecursionStatus::isRecursive() const {
|
||||
return Bitfield::get<RecursionStatus::IsRecursive>(Storage); // Storage s< 0
|
||||
}
|
||||
|
||||
bool LatencyAccountant::accountRecord(const XRayRecord &Record) {
|
||||
setMinMax(PerThreadMinMaxTSC[Record.TId], Record.TSC);
|
||||
setMinMax(PerCPUMinMaxTSC[Record.CPU], Record.TSC);
|
||||
@ -137,6 +166,8 @@ bool LatencyAccountant::accountRecord(const XRayRecord &Record) {
|
||||
return false;
|
||||
|
||||
auto &ThreadStack = PerThreadFunctionStack[Record.TId];
|
||||
if (RecursiveCallsOnly && !ThreadStack.RecursionDepth)
|
||||
ThreadStack.RecursionDepth.emplace();
|
||||
switch (Record.Type) {
|
||||
case RecordTypes::CUSTOM_EVENT:
|
||||
case RecordTypes::TYPED_EVENT:
|
||||
@ -144,18 +175,24 @@ bool LatencyAccountant::accountRecord(const XRayRecord &Record) {
|
||||
return true;
|
||||
case RecordTypes::ENTER:
|
||||
case RecordTypes::ENTER_ARG: {
|
||||
ThreadStack.emplace_back(Record.FuncId, Record.TSC);
|
||||
ThreadStack.Stack.emplace_back(Record.FuncId, Record.TSC);
|
||||
if (ThreadStack.RecursionDepth)
|
||||
++(*ThreadStack.RecursionDepth)[Record.FuncId];
|
||||
break;
|
||||
}
|
||||
case RecordTypes::EXIT:
|
||||
case RecordTypes::TAIL_EXIT: {
|
||||
if (ThreadStack.empty())
|
||||
if (ThreadStack.Stack.empty())
|
||||
return false;
|
||||
|
||||
if (ThreadStack.back().first == Record.FuncId) {
|
||||
const auto &Top = ThreadStack.back();
|
||||
recordLatency(Top.first, diff(Top.second, Record.TSC));
|
||||
ThreadStack.pop_back();
|
||||
if (ThreadStack.Stack.back().first == Record.FuncId) {
|
||||
const auto &Top = ThreadStack.Stack.back();
|
||||
if (!ThreadStack.RecursionDepth ||
|
||||
(*ThreadStack.RecursionDepth)[Top.first].isRecursive())
|
||||
recordLatency(Top.first, diff(Top.second, Record.TSC));
|
||||
if (ThreadStack.RecursionDepth)
|
||||
--(*ThreadStack.RecursionDepth)[Top.first];
|
||||
ThreadStack.Stack.pop_back();
|
||||
break;
|
||||
}
|
||||
|
||||
@ -164,11 +201,11 @@ bool LatencyAccountant::accountRecord(const XRayRecord &Record) {
|
||||
|
||||
// Look for the parent up the stack.
|
||||
auto Parent =
|
||||
std::find_if(ThreadStack.rbegin(), ThreadStack.rend(),
|
||||
std::find_if(ThreadStack.Stack.rbegin(), ThreadStack.Stack.rend(),
|
||||
[&](const std::pair<const int32_t, uint64_t> &E) {
|
||||
return E.first == Record.FuncId;
|
||||
});
|
||||
if (Parent == ThreadStack.rend())
|
||||
if (Parent == ThreadStack.Stack.rend())
|
||||
return false;
|
||||
|
||||
// Account time for this apparently sibling call exit up the stack.
|
||||
@ -199,11 +236,17 @@ bool LatencyAccountant::accountRecord(const XRayRecord &Record) {
|
||||
// complexity to do correctly (need to backtrack, etc.).
|
||||
//
|
||||
// FIXME: Potentially implement the more complex deduction algorithm?
|
||||
auto I = std::next(Parent).base();
|
||||
for (auto &E : make_range(I, ThreadStack.end())) {
|
||||
recordLatency(E.first, diff(E.second, Record.TSC));
|
||||
auto R = make_range(std::next(Parent).base(), ThreadStack.Stack.end());
|
||||
for (auto &E : R) {
|
||||
if (!ThreadStack.RecursionDepth ||
|
||||
(*ThreadStack.RecursionDepth)[E.first].isRecursive())
|
||||
recordLatency(E.first, diff(E.second, Record.TSC));
|
||||
}
|
||||
for (auto &Top : reverse(R)) {
|
||||
if (ThreadStack.RecursionDepth)
|
||||
--(*ThreadStack.RecursionDepth)[Top.first];
|
||||
ThreadStack.Stack.pop_back();
|
||||
}
|
||||
ThreadStack.erase(I, ThreadStack.end());
|
||||
break;
|
||||
}
|
||||
}
|
||||
@ -425,7 +468,8 @@ static CommandRegistration Unused(&Account, []() -> Error {
|
||||
symbolize::LLVMSymbolizer Symbolizer;
|
||||
llvm::xray::FuncIdConversionHelper FuncIdHelper(AccountInstrMap, Symbolizer,
|
||||
FunctionAddresses);
|
||||
xray::LatencyAccountant FCA(FuncIdHelper, AccountDeduceSiblingCalls);
|
||||
xray::LatencyAccountant FCA(FuncIdHelper, AccountRecursiveCallsOnly,
|
||||
AccountDeduceSiblingCalls);
|
||||
auto TraceOrErr = loadTraceFile(AccountInput);
|
||||
if (!TraceOrErr)
|
||||
return joinErrors(
|
||||
@ -447,12 +491,12 @@ static CommandRegistration Unused(&Account, []() -> Error {
|
||||
<< '\n';
|
||||
for (const auto &ThreadStack : FCA.getPerThreadFunctionStack()) {
|
||||
errs() << "Thread ID: " << ThreadStack.first << "\n";
|
||||
if (ThreadStack.second.empty()) {
|
||||
if (ThreadStack.second.Stack.empty()) {
|
||||
errs() << " (empty stack)\n";
|
||||
continue;
|
||||
}
|
||||
auto Level = ThreadStack.second.size();
|
||||
for (const auto &Entry : llvm::reverse(ThreadStack.second))
|
||||
auto Level = ThreadStack.second.Stack.size();
|
||||
for (const auto &Entry : llvm::reverse(ThreadStack.second.Stack))
|
||||
errs() << " #" << Level-- << "\t"
|
||||
<< FuncIdHelper.SymbolOrNumber(Entry.first) << '\n';
|
||||
}
|
||||
|
@ -18,6 +18,7 @@
|
||||
#include <vector>
|
||||
|
||||
#include "func-id-helper.h"
|
||||
#include "llvm/ADT/Bitfields.h"
|
||||
#include "llvm/Support/Program.h"
|
||||
#include "llvm/Support/raw_ostream.h"
|
||||
#include "llvm/XRay/XRayRecord.h"
|
||||
@ -33,7 +34,19 @@ public:
|
||||
PerThreadMinMaxTSCMap;
|
||||
typedef llvm::DenseMap<uint8_t, std::pair<uint64_t, uint64_t>>
|
||||
PerCPUMinMaxTSCMap;
|
||||
typedef llvm::SmallVector<std::pair<int32_t, uint64_t>, 32> FunctionStack;
|
||||
struct FunctionStack {
|
||||
llvm::SmallVector<std::pair<int32_t, uint64_t>, 32> Stack;
|
||||
class RecursionStatus {
|
||||
uint32_t Storage = 0;
|
||||
using Depth = Bitfield::Element<int32_t, 0, 31>; // Low 31 bits.
|
||||
using IsRecursive = Bitfield::Element<bool, 31, 1>; // Sign bit.
|
||||
public:
|
||||
RecursionStatus &operator++();
|
||||
RecursionStatus &operator--();
|
||||
bool isRecursive() const;
|
||||
};
|
||||
Optional<llvm::DenseMap<int32_t, RecursionStatus>> RecursionDepth;
|
||||
};
|
||||
typedef llvm::DenseMap<uint32_t, FunctionStack> PerThreadFunctionStackMap;
|
||||
|
||||
private:
|
||||
@ -43,6 +56,7 @@ private:
|
||||
PerCPUMinMaxTSCMap PerCPUMinMaxTSC;
|
||||
FuncIdConversionHelper &FuncIdHelper;
|
||||
|
||||
bool RecursiveCallsOnly = false;
|
||||
bool DeduceSiblingCalls = false;
|
||||
uint64_t CurrentMaxTSC = 0;
|
||||
|
||||
@ -52,8 +66,9 @@ private:
|
||||
|
||||
public:
|
||||
explicit LatencyAccountant(FuncIdConversionHelper &FuncIdHelper,
|
||||
bool DeduceSiblingCalls)
|
||||
: FuncIdHelper(FuncIdHelper), DeduceSiblingCalls(DeduceSiblingCalls) {}
|
||||
bool RecursiveCallsOnly, bool DeduceSiblingCalls)
|
||||
: FuncIdHelper(FuncIdHelper), RecursiveCallsOnly(RecursiveCallsOnly),
|
||||
DeduceSiblingCalls(DeduceSiblingCalls) {}
|
||||
|
||||
const FunctionLatencyMap &getFunctionLatencies() const {
|
||||
return FunctionLatencies;
|
||||
|
Loading…
Reference in New Issue
Block a user