Add ThinLtoJIT example
Summary:
Prototype of a JIT compiler that utilizes ThinLTO summaries to compile modules ahead of time. This is an implementation of the concept I presented in my "ThinLTO Summaries in JIT Compilation" talk at the 2018 Developers' Meeting: http://llvm.org/devmtg/2018-10/talk-abstracts.html#lt8
Upfront the JIT first populates the *combined ThinLTO module index*, which provides fast access to the global call-graph and module paths by function. Next, it loads the main function's module and compiles it. All functions in the module will be emitted with prolog instructions that *fire a discovery flag* once execution reaches them. In parallel, the *discovery thread* is busy-watching the existing flags. Once it detects one has fired, it uses the module index to find all functions that are reachable from it within a given number of calls and submits their defining modules to the compilation pipeline.
While execution continues, more flags are fired and further modules are added. Ideally, the JIT can be tuned so that, in the majority of cases, the code on the execution path is compiled ahead of time. In cases where that doesn't work, the JIT has a *definition generator* in place that loads modules when missing functions are reached.
Reviewers: lhames, dblaikie, jfb, tejohnson, pree-jackie, AlexDenisov, kavon
Subscribers: mgorny, mehdi_amini, inglorion, hiraditya, steven_wu, dexonsmith, arphaman, jfb, merge_guards_bot, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D72486
2020-01-11 01:09:42 +01:00
|
|
|
#include "ThinLtoModuleIndex.h"
|
|
|
|
|
|
|
|
#include "llvm/Bitcode/BitcodeReader.h"
|
|
|
|
#include "llvm/ExecutionEngine/Orc/SymbolStringPool.h"
|
|
|
|
#include "llvm/IR/LLVMContext.h"
|
|
|
|
#include "llvm/IRReader/IRReader.h"
|
|
|
|
#include "llvm/Support/SourceMgr.h"
|
|
|
|
#include "llvm/Support/raw_ostream.h"
|
|
|
|
|
|
|
|
#include <memory>
|
|
|
|
#include <string>
|
|
|
|
|
|
|
|
#define DEBUG_TYPE "thinltojit"
|
|
|
|
|
|
|
|
namespace llvm {
|
|
|
|
namespace orc {
|
|
|
|
|
|
|
|
// Read the module summary stored in the file at InputPath and merge it into
// CombinedSummaryIndex under the ID NextModuleId.
//
// Returns the error produced while opening the file or while reading its
// summary. On success the ID counter is advanced and Error::success() is
// returned.
Error ThinLtoModuleIndex::add(StringRef InputPath) {
  auto FileOrErr = errorOrToExpected(MemoryBuffer::getFile(InputPath));
  if (!FileOrErr)
    return FileOrErr.takeError();

  if (Error Err = readModuleSummaryIndex((*FileOrErr)->getMemBufferRef(),
                                         CombinedSummaryIndex, NextModuleId))
    return Err;

#ifndef NDEBUG
  // Debug-only sanity check: after sorting, equal paths would be neighbours,
  // so any adjacent pair of equal entries means a module path was added twice.
  auto SortedPaths = getAllModulePaths();
  std::sort(SortedPaths.begin(), SortedPaths.end());
  const bool AllDistinct =
      std::adjacent_find(SortedPaths.begin(), SortedPaths.end()) ==
      SortedPaths.end();
  assert(AllDistinct && "Module paths must be unique");
#endif

  // Only a successfully merged module consumes an ID.
  ++NextModuleId;
  return Error::success();
}
|
|
|
|
|
|
|
|
std::vector<StringRef> ThinLtoModuleIndex::getAllModulePaths() const {
|
|
|
|
auto ModuleTable = CombinedSummaryIndex.modulePaths();
|
|
|
|
|
|
|
|
std::vector<StringRef> Paths;
|
|
|
|
Paths.resize(ModuleTable.size());
|
|
|
|
|
|
|
|
for (const auto &KV : ModuleTable) {
|
|
|
|
assert(Paths[KV.second.first].empty() && "IDs are unique and continuous");
|
|
|
|
Paths[KV.second.first] = KV.first();
|
|
|
|
}
|
|
|
|
|
|
|
|
return Paths;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Look up the summary for the global value with the given GUID in the
// combined index. Returns nullptr if the index has no value info or no
// summary for it; otherwise returns the base object of its single summary.
GlobalValueSummary *
ThinLtoModuleIndex::getSummary(GlobalValue::GUID Function) const {
  ValueInfo Info = CombinedSummaryIndex.getValueInfo(Function);
  if (!Info)
    return nullptr;

  const auto &Summaries = Info.getSummaryList();
  if (Summaries.empty())
    return nullptr;

  // Several symbols can share one GUID: same-named locals in different but
  // same-named source files that were each compiled in their own directory
  // end up with identical source file names and thus identical GUIDs. add()
  // rules this out by checking that module paths are unique.
  //
  // TODO: Duplicates are still possible for symbols declared with
  // attribute((weak)), a GNU extension supported by gcc and clang. Supporting
  // them means looking for the symbol in the current module or in the same
  // module as the caller.
  assert(Summaries.size() == 1 && "Weak symbols not yet supported");

  return Summaries.front()->getBaseObject();
}
|
|
|
|
|
|
|
|
// Return the path of the module whose summary getSummary() finds for the
// symbol called Name, or None if no summary is found for it.
Optional<StringRef>
ThinLtoModuleIndex::getModulePathForSymbol(StringRef Name) const {
  const GlobalValue::GUID Guid = GlobalValue::getGUID(Name);
  GlobalValueSummary *Summary = getSummary(Guid);
  if (!Summary)
    return None; // We don't know the symbol.
  return Summary->modulePath();
}
|
|
|
|
|
|
|
|
// Queue the module at Path for parsing on ParseModuleWorkers unless it has
// been queued before. The resulting future is recorded in ScheduledModules.
//
// This function does not lock ScheduledModulesLock itself; callers such as
// parseModuleFromFile() take that lock before calling it.
void ThinLtoModuleIndex::scheduleModuleParsingPrelocked(StringRef Path) {
  // Once the module was scheduled, we can call takeModule().
  if (ScheduledModules.count(Path))
    return;

  // The task owns a copy of the path, so it does not depend on the lifetime
  // of the caller's StringRef.
  auto ParseTask = [this](std::string File) {
    auto ParsedOrErr = doParseModule(File);
    if (!ParsedOrErr) {
      // Nothing is stored in ParsedModules for a module that failed to parse.
      ES.reportError(ParsedOrErr.takeError());
      return;
    }

    std::lock_guard<std::mutex> Guard(ParsedModulesLock);
    ParsedModules[File] = std::move(*ParsedOrErr);

    LLVM_DEBUG(dbgs() << "Finished parsing module: " << File << "\n");
  };

  LLVM_DEBUG(dbgs() << "Schedule module for parsing: " << Path << "\n");
  ScheduledModules[Path] = ParseModuleWorkers.async(ParseTask, Path.str());
}
|
|
|
|
|
|
|
|
// Hand out the parsed module for Path, blocking until its parse task has
// finished if necessary. The module must have been scheduled before.
//
// Each module is handed out once: the stored entry is replaced by an empty
// ThreadSafeModule, so later calls for the same path return an empty module.
// An empty module is also returned when parsing the file failed.
ThreadSafeModule ThinLtoModuleIndex::takeModule(StringRef Path) {
  std::unique_lock<std::mutex> ParseLock(ParsedModulesLock);

  auto ParsedIt = ParsedModules.find(Path);
  if (ParsedIt == ParsedModules.end()) {
    // Release the lock while waiting, the parse task needs it to store its
    // result.
    ParseLock.unlock();

    // The module is not ready, wait for the future we stored.
    std::unique_lock<std::mutex> ScheduleLock(ScheduledModulesLock);
    auto ScheduledIt = ScheduledModules.find(Path);
    assert(ScheduledIt != ScheduledModules.end() &&
           "Don't call for unscheduled modules");
    std::shared_future<void> Future = ScheduledIt->getValue();
    ScheduleLock.unlock();
    Future.get();

    ParseLock.lock();
    ParsedIt = ParsedModules.find(Path);

    // The parse task only stores modules that parsed successfully; on failure
    // it reports the error and stores nothing. Return an empty module in that
    // case instead of dereferencing the end iterator.
    if (ParsedIt == ParsedModules.end())
      return ThreadSafeModule();
  }

  // We only add each module once. If it's not here anymore, we can skip it.
  ThreadSafeModule TSM = std::move(ParsedIt->getValue());
  ParsedIt->getValue() = ThreadSafeModule();
  return TSM;
}
|
|
|
|
|
|
|
|
// Make sure the module at Path is scheduled for parsing, then wait for it
// and return it via takeModule().
ThreadSafeModule ThinLtoModuleIndex::parseModuleFromFile(StringRef Path) {
  std::unique_lock<std::mutex> ScheduleGuard(ScheduledModulesLock);
  scheduleModuleParsingPrelocked(Path);
  // takeModule() acquires the locks it needs itself, so drop ours first.
  ScheduleGuard.unlock();

  return takeModule(Path);
}
|
|
|
|
|
|
|
|
// Parse the IR file at Path into a module that lives in its own, freshly
// created LLVMContext.
//
// Returns the module bundled with its context, or a string error that
// contains the file path and the printed parser diagnostic.
Expected<ThreadSafeModule> ThinLtoModuleIndex::doParseModule(StringRef Path) {
  // TODO: make a SMDiagnosticError class for this
  SMDiagnostic Err;
  auto Ctx = std::make_unique<LLVMContext>();
  auto M = parseIRFile(Path, Err, *Ctx);
  if (!M) {
    std::string ErrDescription;
    {
      // The stream is scoped so that it is flushed into ErrDescription before
      // the string is read.
      raw_string_ostream S(ErrDescription);
      Err.print("ThinLtoJIT", S);
    }

    // A StringRef is not guaranteed to be NUL-terminated, so its data()
    // pointer must not be passed to a "%s" format directly. Format from an
    // owned, terminated copy instead.
    const std::string PathStr = Path.str();
    return createStringError(inconvertibleErrorCode(),
                             "Failed to load module from file '%s' (%s)",
                             PathStr.c_str(), ErrDescription.c_str());
  }

  return ThreadSafeModule(std::move(M), std::move(Ctx));
}
|
|
|
|
|
|
|
|
// We don't filter visited functions. Discovery will often be retriggered
|
|
|
|
// from the middle of already visited functions and it aims to reach a little
|
|
|
|
// further each time.
|
|
|
|
void ThinLtoModuleIndex::discoverCalleeModulePaths(FunctionSummary *S,
|
|
|
|
unsigned LookaheadLevels) {
|
|
|
|
// Populate initial worklist
|
|
|
|
std::vector<FunctionSummary *> Worklist;
|
|
|
|
addToWorklist(Worklist, S->calls());
|
|
|
|
unsigned Distance = 0;
|
|
|
|
|
|
|
|
while (++Distance < LookaheadLevels) {
|
|
|
|
// Process current worklist and populate a new one.
|
|
|
|
std::vector<FunctionSummary *> NextWorklist;
|
|
|
|
for (FunctionSummary *F : Worklist) {
|
|
|
|
updatePathRank(F->modulePath(), Distance);
|
|
|
|
addToWorklist(NextWorklist, F->calls());
|
|
|
|
}
|
|
|
|
Worklist = std::move(NextWorklist);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Process the last worklist without filling a new one
|
|
|
|
for (FunctionSummary *F : Worklist) {
|
|
|
|
updatePathRank(F->modulePath(), Distance);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Reset counts for known paths (includes both, scheduled and parsed modules).
|
|
|
|
std::lock_guard<std::mutex> Lock(ScheduledModulesLock);
|
|
|
|
for (const auto &KV : ScheduledModules) {
|
|
|
|
PathRank[KV.first()].Count = 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void ThinLtoModuleIndex::addToWorklist(
|
|
|
|
std::vector<FunctionSummary *> &List,
|
|
|
|
ArrayRef<FunctionSummary::EdgeTy> Calls) {
|
|
|
|
for (const auto &Edge : Calls) {
|
|
|
|
const auto &SummaryList = Edge.first.getSummaryList();
|
|
|
|
if (!SummaryList.empty()) {
|
|
|
|
GlobalValueSummary *S = SummaryList.front().get()->getBaseObject();
|
|
|
|
assert(isa<FunctionSummary>(S) && "Callees must be functions");
|
|
|
|
List.push_back(cast<FunctionSummary>(S));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// PathRank is global and continuous.
|
|
|
|
void ThinLtoModuleIndex::updatePathRank(StringRef Path, unsigned Distance) {
|
|
|
|
auto &Entry = PathRank[Path];
|
|
|
|
Entry.Count += 1;
|
|
|
|
Entry.MinDist = std::min(Entry.MinDist, Distance);
|
|
|
|
assert(Entry.MinDist > 0 && "We want it as a divisor");
|
2020-03-18 15:21:14 +01:00
|
|
|
}
|
Add ThinLtoJIT example
Summary:
Prototype of a JIT compiler that utilizes ThinLTO summaries to compile modules ahead of time. This is an implementation of the concept I presented in my "ThinLTO Summaries in JIT Compilation" talk at the 2018 Developers' Meeting: http://llvm.org/devmtg/2018-10/talk-abstracts.html#lt8
Upfront the JIT first populates the *combined ThinLTO module index*, which provides fast access to the global call-graph and module paths by function. Next, it loads the main function's module and compiles it. All functions in the module will be emitted with prolog instructions that *fire a discovery flag* once execution reaches them. In parallel, the *discovery thread* is busy-watching the existing flags. Once it detects one has fired, it uses the module index to find all functions that are reachable from it within a given number of calls and submits their defining modules to the compilation pipeline.
While execution continues, more flags are fired and further modules are added. Ideally, the JIT can be tuned so that, in the majority of cases, the code on the execution path is compiled ahead of time. In cases where that doesn't work, the JIT has a *definition generator* in place that loads modules when missing functions are reached.
Reviewers: lhames, dblaikie, jfb, tejohnson, pree-jackie, AlexDenisov, kavon
Subscribers: mgorny, mehdi_amini, inglorion, hiraditya, steven_wu, dexonsmith, arphaman, jfb, merge_guards_bot, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D72486
2020-01-11 01:09:42 +01:00
|
|
|
|
|
|
|
// TODO: The size of a ThreadPool's task queue is not accessible. It would
|
|
|
|
// be great to know in order to estimate how many modules we schedule. The
|
|
|
|
// more we schedule, the less precise is the ranking. The less we schedule,
|
|
|
|
// the higher the risk for downtime.
|
|
|
|
std::vector<std::string> ThinLtoModuleIndex::selectNextPaths() {
|
|
|
|
struct ScorePath {
|
|
|
|
float Score;
|
|
|
|
unsigned MinDist;
|
|
|
|
StringRef Path;
|
|
|
|
};
|
|
|
|
|
|
|
|
std::vector<ScorePath> Candidates;
|
|
|
|
Candidates.reserve(PathRank.size());
|
|
|
|
for (const auto &KV : PathRank) {
|
|
|
|
float Score = static_cast<float>(KV.second.Count) / KV.second.MinDist;
|
|
|
|
if (Score > .0f) {
|
|
|
|
Candidates.push_back({Score, KV.second.MinDist, KV.first()});
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Sort candidates by descending score.
|
|
|
|
std::sort(Candidates.begin(), Candidates.end(),
|
|
|
|
[](const ScorePath &LHS, const ScorePath &RHS) {
|
|
|
|
return LHS.Score > RHS.Score;
|
|
|
|
});
|
|
|
|
|
|
|
|
// Sort highest score candidates by ascending minimal distance.
|
|
|
|
size_t Selected =
|
|
|
|
std::min(std::max<size_t>(NumParseModuleThreads, Candidates.size() / 2),
|
|
|
|
Candidates.size());
|
|
|
|
std::sort(Candidates.begin(), Candidates.begin() + Selected,
|
|
|
|
[](const ScorePath &LHS, const ScorePath &RHS) {
|
|
|
|
return LHS.MinDist < RHS.MinDist;
|
|
|
|
});
|
|
|
|
|
|
|
|
std::vector<std::string> Paths;
|
|
|
|
Paths.reserve(Selected);
|
|
|
|
for (unsigned i = 0; i < Selected; i++) {
|
|
|
|
Paths.push_back(Candidates[i].Path.str());
|
|
|
|
}
|
|
|
|
|
|
|
|
LLVM_DEBUG(dbgs() << "ModuleIndex: select " << Paths.size() << " out of "
|
|
|
|
<< Candidates.size() << " discovered paths\n");
|
|
|
|
|
|
|
|
return Paths;
|
|
|
|
}
|
|
|
|
|
|
|
|
unsigned ThinLtoModuleIndex::getNumDiscoveredModules() const {
|
|
|
|
// TODO: It would probably be more efficient to track the number of
|
|
|
|
// unscheduled modules.
|
|
|
|
unsigned NonNullItems = 0;
|
|
|
|
for (const auto &KV : PathRank)
|
|
|
|
if (KV.second.Count > 0)
|
|
|
|
++NonNullItems;
|
|
|
|
return NonNullItems;
|
|
|
|
}
|
|
|
|
|
|
|
|
} // namespace orc
|
|
|
|
} // namespace llvm
|