mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-11-25 20:23:11 +01:00
Retry: [llvm-profdata] Speed up merging by using a thread pool
Add a "-j" option to llvm-profdata to control the number of threads used. Auto-detect NumThreads when it isn't specified, and avoid spawning threads when they wouldn't be beneficial. I tested this patch using a raw profile produced by clang (147MB). Here is the time taken to merge 4 copies together on my laptop. No thread pool: 112.87s user, 5.92s system, 97% cpu, 2:01.08 total. With 2 threads: 134.99s user, 26.54s system, 164% cpu, 1:33.31 total. Changes since the initial commit: when handling odd-length inputs, call ThreadPool::wait() before merging the last profile; this should fix a race/off-by-one (see r275937). Differential Revision: https://reviews.llvm.org/D22438 llvm-svn: 275938
This commit is contained in:
parent
338daec4d5
commit
78dfceef4b
@ -106,6 +106,11 @@ OPTIONS
|
|||||||
conjunction with -instr. Defaults to false, since it can inhibit compiler
|
conjunction with -instr. Defaults to false, since it can inhibit compiler
|
||||||
optimization during PGO.
|
optimization during PGO.
|
||||||
|
|
||||||
|
.. option:: -num-threads=N, -j=N
|
||||||
|
|
||||||
|
Use N threads to perform profile merging. When N=0, llvm-profdata auto-detects
|
||||||
|
an appropriate number of threads to use. This is the default.
|
||||||
|
|
||||||
EXAMPLES
|
EXAMPLES
|
||||||
^^^^^^^^
|
^^^^^^^^
|
||||||
Basic Usage
|
Basic Usage
|
||||||
|
@ -47,6 +47,8 @@ public:
|
|||||||
/// for this function and the hash and number of counts match, each counter is
|
/// for this function and the hash and number of counts match, each counter is
|
||||||
/// summed. Optionally scale counts by \p Weight.
|
/// summed. Optionally scale counts by \p Weight.
|
||||||
Error addRecord(InstrProfRecord &&I, uint64_t Weight = 1);
|
Error addRecord(InstrProfRecord &&I, uint64_t Weight = 1);
|
||||||
|
/// Merge existing function counts from the given writer.
|
||||||
|
Error mergeRecordsFromWriter(InstrProfWriter &&IPW);
|
||||||
/// Write the profile to \c OS
|
/// Write the profile to \c OS
|
||||||
void write(raw_fd_ostream &OS);
|
void write(raw_fd_ostream &OS);
|
||||||
/// Write the profile in text format to \c OS
|
/// Write the profile in text format to \c OS
|
||||||
|
@ -182,6 +182,14 @@ Error InstrProfWriter::addRecord(InstrProfRecord &&I, uint64_t Weight) {
|
|||||||
return Dest.takeError();
|
return Dest.takeError();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Merge every function record held by \p IPW into this writer.
///
/// Walks each entry of \p IPW.FunctionData, moves each record out of it and
/// passes it to addRecord() with a weight of 1. Returns the first error
/// reported by addRecord() immediately (records not yet visited are left
/// unmerged); returns Error::success() when every record was added.
Error InstrProfWriter::mergeRecordsFromWriter(InstrProfWriter &&IPW) {
|
||||||
|
for (auto &I : IPW.FunctionData)
|
||||||
|
for (auto &Func : I.getValue())
|
||||||
|
// The record is moved from, so \p IPW must not be reused for its data.
if (Error E = addRecord(std::move(Func.second), 1))
|
||||||
|
return E;
|
||||||
|
return Error::success();
|
||||||
|
}
|
||||||
|
|
||||||
bool InstrProfWriter::shouldEncodeData(const ProfilingData &PD) {
|
bool InstrProfWriter::shouldEncodeData(const ProfilingData &PD) {
|
||||||
if (!Sparse)
|
if (!Sparse)
|
||||||
return true;
|
return true;
|
||||||
|
@ -51,3 +51,43 @@ DISJOINT-2: Block counts: [2, 3]
|
|||||||
DISJOINT: Total functions: 2
|
DISJOINT: Total functions: 2
|
||||||
DISJOINT: Maximum function count: 1
|
DISJOINT: Maximum function count: 1
|
||||||
DISJOINT: Maximum internal block count: 3
|
DISJOINT: Maximum internal block count: 3
|
||||||
|
|
||||||
|
RUN: llvm-profdata merge %p/Inputs/foo3-1.proftext %p/Inputs/foo3-1.proftext \
|
||||||
|
RUN: %p/Inputs/foo3-1.proftext %p/Inputs/foo3-1.proftext \
|
||||||
|
RUN: -num-threads 2 -o %t
|
||||||
|
RUN: llvm-profdata show %t -all-functions -counts | FileCheck %s --check-prefix=FOO4
|
||||||
|
RUN: llvm-profdata merge %p/Inputs/foo3-1.proftext %p/Inputs/foo3-1.proftext \
|
||||||
|
RUN: %p/Inputs/foo3-1.proftext %p/Inputs/foo3-1.proftext \
|
||||||
|
RUN: -j 3 -o %t
|
||||||
|
RUN: llvm-profdata show %t -all-functions -counts | FileCheck %s --check-prefix=FOO4
|
||||||
|
FOO4: foo:
|
||||||
|
FOO4: Counters: 3
|
||||||
|
FOO4: Function count: 4
|
||||||
|
FOO4: Block counts: [8, 12]
|
||||||
|
FOO4: Total functions: 1
|
||||||
|
FOO4: Maximum function count: 4
|
||||||
|
FOO4: Maximum internal block count: 12
|
||||||
|
|
||||||
|
RUN: llvm-profdata merge %p/Inputs/foo3-1.proftext %p/Inputs/foo3-1.proftext \
|
||||||
|
RUN: %p/Inputs/foo3-1.proftext %p/Inputs/foo3-1.proftext \
|
||||||
|
RUN: %p/Inputs/foo3-1.proftext -j 2 -o %t
|
||||||
|
RUN: llvm-profdata show %t -all-functions -counts | FileCheck %s --check-prefix=FOO5
|
||||||
|
RUN: llvm-profdata merge %p/Inputs/foo3-1.proftext %p/Inputs/foo3-1.proftext \
|
||||||
|
RUN: %p/Inputs/foo3-1.proftext %p/Inputs/foo3-1.proftext \
|
||||||
|
RUN: %p/Inputs/foo3-1.proftext -j 3 -o %t
|
||||||
|
RUN: llvm-profdata show %t -all-functions -counts | FileCheck %s --check-prefix=FOO5
|
||||||
|
RUN: llvm-profdata merge %p/Inputs/foo3-1.proftext %p/Inputs/foo3-1.proftext \
|
||||||
|
RUN: %p/Inputs/foo3-1.proftext %p/Inputs/foo3-1.proftext \
|
||||||
|
RUN: %p/Inputs/foo3-1.proftext -o %t
|
||||||
|
RUN: llvm-profdata show %t -all-functions -counts | FileCheck %s --check-prefix=FOO5
|
||||||
|
RUN: llvm-profdata merge %p/Inputs/foo3-1.proftext %p/Inputs/foo3-1.proftext \
|
||||||
|
RUN: %p/Inputs/foo3-1.proftext %p/Inputs/foo3-1.proftext \
|
||||||
|
RUN: %p/Inputs/foo3-1.proftext -j 1 -o %t
|
||||||
|
RUN: llvm-profdata show %t -all-functions -counts | FileCheck %s --check-prefix=FOO5
|
||||||
|
FOO5: foo:
|
||||||
|
FOO5: Counters: 3
|
||||||
|
FOO5: Function count: 5
|
||||||
|
FOO5: Block counts: [10, 15]
|
||||||
|
FOO5: Total functions: 1
|
||||||
|
FOO5: Maximum function count: 5
|
||||||
|
FOO5: Maximum internal block count: 15
|
||||||
|
@ -29,6 +29,7 @@
|
|||||||
#include "llvm/Support/Path.h"
|
#include "llvm/Support/Path.h"
|
||||||
#include "llvm/Support/PrettyStackTrace.h"
|
#include "llvm/Support/PrettyStackTrace.h"
|
||||||
#include "llvm/Support/Signals.h"
|
#include "llvm/Support/Signals.h"
|
||||||
|
#include "llvm/Support/ThreadPool.h"
|
||||||
#include "llvm/Support/raw_ostream.h"
|
#include "llvm/Support/raw_ostream.h"
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
|
|
||||||
@ -117,9 +118,68 @@ struct WeightedFile {
|
|||||||
};
|
};
|
||||||
typedef SmallVector<WeightedFile, 5> WeightedFileVector;
|
typedef SmallVector<WeightedFile, 5> WeightedFileVector;
|
||||||
|
|
||||||
|
/// Keep track of merged data and reported errors.
///
/// One context bundles an InstrProfWriter with the error state produced while
/// filling it. loadInput() fills a context from an input file and
/// mergeWriterContexts() folds one context's writer into another's.
|
||||||
|
struct WriterContext {
|
||||||
|
// Acquired by loadInput() for the whole duration of a load into this context.
std::mutex Lock;
|
||||||
|
// Accumulates the records merged into this context.
InstrProfWriter Writer;
|
||||||
|
// Deferred error: set by loadInput() (reader creation failure, IR/Clang
// profile mix, reader error) or by mergeWriterContexts() (merge failure).
Error Err;
|
||||||
|
// Set by loadInput() to the filename of the input it is about to read.
StringRef ErrWhence;
|
||||||
|
// Reference to a mutex owned elsewhere; loadInput() locks it around the
// WriterErrorCodes update and the handleMergeWriterError() call.
std::mutex &ErrLock;
|
||||||
|
// Reference to a set owned elsewhere; records which instrprof_error codes
// have already been seen so loadInput() can tell a first occurrence apart.
SmallSet<instrprof_error, 4> &WriterErrorCodes;
|
||||||
|
|
||||||
|
// Builds a context whose writer is constructed from \p IsSparse, with no
// pending error and an empty ErrWhence; the two references are bound to the
// caller-provided \p ErrLock and \p WriterErrorCodes.
WriterContext(bool IsSparse, std::mutex &ErrLock,
|
||||||
|
SmallSet<instrprof_error, 4> &WriterErrorCodes)
|
||||||
|
: Lock(), Writer(IsSparse), Err(Error::success()), ErrWhence(""),
|
||||||
|
ErrLock(ErrLock), WriterErrorCodes(WriterErrorCodes) {}
|
||||||
|
};
|
||||||
|
|
||||||
|
/// Load an input into a writer context.
///
/// Reads every record of \p Input.Filename and adds it to \p WC->Writer,
/// scaled by \p Input.Weight. Nothing is returned; failures are recorded:
///  - "hard" errors (reader cannot be created, IR-level vs. Clang profile
///    mismatch, reader error after iteration) are stored in \p WC->Err and
///    stop the load;
///  - per-record addRecord() failures are passed to handleMergeWriterError()
///    and the loop continues with the next record.
/// If \p WC already carries an error, the function returns without reading.
|
||||||
|
static void loadInput(const WeightedFile &Input, WriterContext *WC) {
|
||||||
|
// Held until return, so loads into the same context do not overlap.
std::unique_lock<std::mutex> CtxGuard{WC->Lock};
|
||||||
|
|
||||||
|
// If there's a pending hard error, don't do more work.
|
||||||
|
if (WC->Err)
|
||||||
|
return;
|
||||||
|
|
||||||
|
// Remember which file is being read so a later hard error can be attributed.
WC->ErrWhence = Input.Filename;
|
||||||
|
|
||||||
|
auto ReaderOrErr = InstrProfReader::create(Input.Filename);
|
||||||
|
// Store the creation result in the context; a failure value ends the load.
if ((WC->Err = ReaderOrErr.takeError()))
|
||||||
|
return;
|
||||||
|
|
||||||
|
auto Reader = std::move(ReaderOrErr.get());
|
||||||
|
bool IsIRProfile = Reader->isIRLevelProfile();
|
||||||
|
// A true result from setIsIRLevelProfile() is treated as a profile-kind
// conflict and recorded as a hard error.
if (WC->Writer.setIsIRLevelProfile(IsIRProfile)) {
|
||||||
|
WC->Err = make_error<StringError>(
|
||||||
|
"Merge IR generated profile with Clang generated profile.",
|
||||||
|
std::error_code());
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (auto &I : *Reader) {
|
||||||
|
if (Error E = WC->Writer.addRecord(std::move(I), Input.Weight)) {
|
||||||
|
// Only show hint the first time an error occurs.
|
||||||
|
instrprof_error IPE = InstrProfError::take(std::move(E));
|
||||||
|
// ErrLock protects the shared error-code set and the reporting call.
std::unique_lock<std::mutex> ErrGuard{WC->ErrLock};
|
||||||
|
bool firstTime = WC->WriterErrorCodes.insert(IPE).second;
|
||||||
|
// NOTE(review): I was passed to addRecord() via std::move above, yet I.Name
// is read here; confirm addRecord() leaves Name intact on failure.
handleMergeWriterError(make_error<InstrProfError>(IPE), Input.Filename,
|
||||||
|
I.Name, firstTime);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// A reader error left after iteration becomes the context's hard error.
if (Reader->hasError())
|
||||||
|
WC->Err = Reader->getError();
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Merge the \p Src writer context into \p Dst.
///
/// Moves \p Src->Writer into \p Dst->Writer.mergeRecordsFromWriter(). If that
/// call returns an error it is stored in \p Dst->Err. Only the writers are
/// combined: \p Src->Err and \p Src->ErrWhence are not read here, and neither
/// context's Lock is taken by this function.
|
||||||
|
static void mergeWriterContexts(WriterContext *Dst, WriterContext *Src) {
|
||||||
|
if (Error E = Dst->Writer.mergeRecordsFromWriter(std::move(Src->Writer)))
|
||||||
|
// NOTE(review): this overwrites whatever Dst->Err already holds and leaves
// Dst->ErrWhence unchanged; confirm that is acceptable when Dst already
// carries a pending error from loading.
Dst->Err = std::move(E);
|
||||||
|
}
|
||||||
|
|
||||||
static void mergeInstrProfile(const WeightedFileVector &Inputs,
|
static void mergeInstrProfile(const WeightedFileVector &Inputs,
|
||||||
StringRef OutputFilename,
|
StringRef OutputFilename,
|
||||||
ProfileFormat OutputFormat, bool OutputSparse) {
|
ProfileFormat OutputFormat, bool OutputSparse,
|
||||||
|
unsigned NumThreads) {
|
||||||
if (OutputFilename.compare("-") == 0)
|
if (OutputFilename.compare("-") == 0)
|
||||||
exitWithError("Cannot write indexed profdata format to stdout.");
|
exitWithError("Cannot write indexed profdata format to stdout.");
|
||||||
|
|
||||||
@ -131,30 +191,59 @@ static void mergeInstrProfile(const WeightedFileVector &Inputs,
|
|||||||
if (EC)
|
if (EC)
|
||||||
exitWithErrorCode(EC, OutputFilename);
|
exitWithErrorCode(EC, OutputFilename);
|
||||||
|
|
||||||
InstrProfWriter Writer(OutputSparse);
|
std::mutex ErrorLock;
|
||||||
SmallSet<instrprof_error, 4> WriterErrorCodes;
|
SmallSet<instrprof_error, 4> WriterErrorCodes;
|
||||||
for (const auto &Input : Inputs) {
|
|
||||||
auto ReaderOrErr = InstrProfReader::create(Input.Filename);
|
|
||||||
if (Error E = ReaderOrErr.takeError())
|
|
||||||
exitWithError(std::move(E), Input.Filename);
|
|
||||||
|
|
||||||
auto Reader = std::move(ReaderOrErr.get());
|
// If NumThreads is not specified, auto-detect a good default.
|
||||||
bool IsIRProfile = Reader->isIRLevelProfile();
|
if (NumThreads == 0)
|
||||||
if (Writer.setIsIRLevelProfile(IsIRProfile))
|
NumThreads = std::max(1U, std::min(std::thread::hardware_concurrency(),
|
||||||
exitWithError("Merge IR generated profile with Clang generated profile.");
|
unsigned(Inputs.size() / 2)));
|
||||||
|
|
||||||
for (auto &I : *Reader) {
|
// Initialize the writer contexts.
|
||||||
if (Error E = Writer.addRecord(std::move(I), Input.Weight)) {
|
SmallVector<std::unique_ptr<WriterContext>, 4> Contexts;
|
||||||
// Only show hint the first time an error occurs.
|
for (unsigned I = 0; I < NumThreads; ++I)
|
||||||
instrprof_error IPE = InstrProfError::take(std::move(E));
|
Contexts.emplace_back(llvm::make_unique<WriterContext>(
|
||||||
bool firstTime = WriterErrorCodes.insert(IPE).second;
|
OutputSparse, ErrorLock, WriterErrorCodes));
|
||||||
handleMergeWriterError(make_error<InstrProfError>(IPE), Input.Filename,
|
|
||||||
I.Name, firstTime);
|
if (NumThreads == 1) {
|
||||||
}
|
for (const auto &Input : Inputs)
|
||||||
|
loadInput(Input, Contexts[0].get());
|
||||||
|
} else {
|
||||||
|
ThreadPool Pool(NumThreads);
|
||||||
|
|
||||||
|
// Load the inputs in parallel (N/NumThreads serial steps).
|
||||||
|
unsigned Ctx = 0;
|
||||||
|
for (const auto &Input : Inputs) {
|
||||||
|
Pool.async(loadInput, Input, Contexts[Ctx].get());
|
||||||
|
Ctx = (Ctx + 1) % NumThreads;
|
||||||
}
|
}
|
||||||
if (Reader->hasError())
|
Pool.wait();
|
||||||
exitWithError(Reader->getError(), Input.Filename);
|
|
||||||
|
// Merge the writer contexts together (~ lg(NumThreads) serial steps).
|
||||||
|
unsigned Mid = Contexts.size() / 2;
|
||||||
|
unsigned End = Contexts.size();
|
||||||
|
assert(Mid > 0 && "Expected more than one context");
|
||||||
|
do {
|
||||||
|
for (unsigned I = 0; I < Mid; ++I)
|
||||||
|
Pool.async(mergeWriterContexts, Contexts[I].get(),
|
||||||
|
Contexts[I + Mid].get());
|
||||||
|
Pool.wait();
|
||||||
|
if (End & 1) {
|
||||||
|
Pool.async(mergeWriterContexts, Contexts[0].get(),
|
||||||
|
Contexts[End - 1].get());
|
||||||
|
Pool.wait();
|
||||||
|
}
|
||||||
|
End = Mid;
|
||||||
|
Mid /= 2;
|
||||||
|
} while (Mid > 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Handle deferred hard errors encountered during merging.
|
||||||
|
for (std::unique_ptr<WriterContext> &WC : Contexts)
|
||||||
|
if (WC->Err)
|
||||||
|
exitWithError(std::move(WC->Err), WC->ErrWhence);
|
||||||
|
|
||||||
|
InstrProfWriter &Writer = Contexts[0]->Writer;
|
||||||
if (OutputFormat == PF_Text)
|
if (OutputFormat == PF_Text)
|
||||||
Writer.writeText(Output);
|
Writer.writeText(Output);
|
||||||
else
|
else
|
||||||
@ -288,6 +377,11 @@ static int merge_main(int argc, const char *argv[]) {
|
|||||||
clEnumValEnd));
|
clEnumValEnd));
|
||||||
cl::opt<bool> OutputSparse("sparse", cl::init(false),
|
cl::opt<bool> OutputSparse("sparse", cl::init(false),
|
||||||
cl::desc("Generate a sparse profile (only meaningful for -instr)"));
|
cl::desc("Generate a sparse profile (only meaningful for -instr)"));
|
||||||
|
cl::opt<unsigned> NumThreads(
|
||||||
|
"num-threads", cl::init(0),
|
||||||
|
cl::desc("Number of merge threads to use (default: autodetect)"));
|
||||||
|
cl::alias NumThreadsA("j", cl::desc("Alias for --num-threads"),
|
||||||
|
cl::aliasopt(NumThreads));
|
||||||
|
|
||||||
cl::ParseCommandLineOptions(argc, argv, "LLVM profile data merger\n");
|
cl::ParseCommandLineOptions(argc, argv, "LLVM profile data merger\n");
|
||||||
|
|
||||||
@ -314,7 +408,7 @@ static int merge_main(int argc, const char *argv[]) {
|
|||||||
|
|
||||||
if (ProfileKind == instr)
|
if (ProfileKind == instr)
|
||||||
mergeInstrProfile(WeightedInputs, OutputFilename, OutputFormat,
|
mergeInstrProfile(WeightedInputs, OutputFilename, OutputFormat,
|
||||||
OutputSparse);
|
OutputSparse, NumThreads);
|
||||||
else
|
else
|
||||||
mergeSampleProfile(WeightedInputs, OutputFilename, OutputFormat);
|
mergeSampleProfile(WeightedInputs, OutputFilename, OutputFormat);
|
||||||
|
|
||||||
|
@ -204,6 +204,31 @@ TEST_F(InstrProfTest, get_profile_summary) {
|
|||||||
delete PSFromMD;
|
delete PSFromMD;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Checks that mergeRecordsFromWriter() carries records from a second writer
// into the fixture's writer: after merging, both "func1" (added directly) and
// "func2" (added to the second writer) are readable with their counts intact.
TEST_F(InstrProfTest, test_writer_merge) {
|
||||||
|
// "func1" goes straight into the fixture's Writer with a single counter.
InstrProfRecord Record1("func1", 0x1234, {42});
|
||||||
|
NoError(Writer.addRecord(std::move(Record1)));
|
||||||
|
|
||||||
|
// "func2" goes into a separate writer, with two zero counters.
InstrProfWriter Writer2;
|
||||||
|
InstrProfRecord Record2("func2", 0x1234, {0, 0});
|
||||||
|
NoError(Writer2.addRecord(std::move(Record2)));
|
||||||
|
|
||||||
|
// Fold the second writer into the first; the merge must not report an error.
NoError(Writer.mergeRecordsFromWriter(std::move(Writer2)));
|
||||||
|
|
||||||
|
// Serialize the merged writer and read it back through the fixture's Reader.
auto Profile = Writer.writeBuffer();
|
||||||
|
readProfile(std::move(Profile));
|
||||||
|
|
||||||
|
// The record added directly is unchanged by the merge.
Expected<InstrProfRecord> R = Reader->getInstrProfRecord("func1", 0x1234);
|
||||||
|
ASSERT_TRUE(NoError(R.takeError()));
|
||||||
|
ASSERT_EQ(1U, R->Counts.size());
|
||||||
|
ASSERT_EQ(42U, R->Counts[0]);
|
||||||
|
|
||||||
|
// The record that came from Writer2 is present with both counters.
R = Reader->getInstrProfRecord("func2", 0x1234);
|
||||||
|
ASSERT_TRUE(NoError(R.takeError()));
|
||||||
|
ASSERT_EQ(2U, R->Counts.size());
|
||||||
|
ASSERT_EQ(0U, R->Counts[0]);
|
||||||
|
ASSERT_EQ(0U, R->Counts[1]);
|
||||||
|
}
|
||||||
|
|
||||||
static const char callee1[] = "callee1";
|
static const char callee1[] = "callee1";
|
||||||
static const char callee2[] = "callee2";
|
static const char callee2[] = "callee2";
|
||||||
static const char callee3[] = "callee3";
|
static const char callee3[] = "callee3";
|
||||||
|
Loading…
Reference in New Issue
Block a user