
[ThinLTO] Allow usage of all hardware threads in the system

Before this patch, it was not possible to spread the ThinLTO threads across all SMT/CMT threads in the system. Only one thread per core was used, as dictated by the use of llvm::heavyweight_hardware_concurrency() in the ThinLTO code. Any number passed to the LLD flag /opt:lldltojobs=..., or to any other ThinLTO-specific flag, was interpreted in the context of llvm::heavyweight_hardware_concurrency(), that is, with SMT disabled.
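For context, a minimal sketch (not part of this patch) contrasting the two existing strategy helpers when sizing an llvm::ThreadPool; the thread counts in the comments assume a hypothetical 8-core/16-thread machine:

#include "llvm/Support/ThreadPool.h"
#include "llvm/Support/Threading.h"

void sizePools() {
  // One pool thread per hardware thread, SMT included: ~16 threads here.
  llvm::ThreadPool SMTPool(llvm::hardware_concurrency());
  // One pool thread per physical core, SMT excluded: ~8 threads here.
  llvm::ThreadPool PerCorePool(llvm::heavyweight_hardware_concurrency());
}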

One can now say in LLD:
/opt:lldltojobs=0 -- Use one std::thread per hardware core in the system (no SMT). This is the default when the flag is not specified.
/opt:lldltojobs=N -- Limit usage to N threads, overriding the heavyweight_hardware_concurrency() policy.
/opt:lldltojobs=all -- Use all hardware threads in the system. Equivalent to /opt:lldltojobs=$(nproc) on Linux and /opt:lldltojobs=%NUMBER_OF_PROCESSORS% on Windows. When an affinity mask is set for the process, threads are created only for the cores selected by the mask.

When N > (number of hardware threads in the system), the threads in the thread pool are distributed evenly across all CPU sockets (tested only on Windows).
When N <= (number of hardware threads on a CPU socket), the threads stay on the CPU socket where the process started (Windows only).
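Internally, the flag value maps onto a ThreadPoolStrategy through the helpers added by this patch; a rough sketch of the driver-side wiring (the wrapper function is illustrative, not LLD's actual code):

#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Threading.h"

llvm::ThreadPoolStrategy strategyForJobs(llvm::StringRef Jobs) {
  // "" or "0" -> one thread per core (the heavyweight default)
  // "N"       -> exactly N threads, SMT allowed
  // "all"     -> one thread per hardware thread, honoring the affinity mask
  return llvm::heavyweight_hardware_concurrency(Jobs);
}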

Differential Revision: https://reviews.llvm.org/D75153
Alexandre Ganea 2020-03-27 10:20:39 -04:00
parent 153a6c2dbd
commit 61ed3dc5bf
8 changed files with 106 additions and 48 deletions


@@ -228,7 +228,7 @@ using ThinBackend = std::function<std::unique_ptr<ThinBackendProc>(
 /// This ThinBackend runs the individual backend jobs in-process.
 /// The default value means to use one job per hardware core (not hyper-thread).
-ThinBackend createInProcessThinBackend(unsigned ParallelismLevel = 0);
+ThinBackend createInProcessThinBackend(ThreadPoolStrategy Parallelism);
 
 /// This ThinBackend writes individual module indexes to files, instead of
 /// running the individual backend jobs. This backend is for distributed builds


@@ -166,8 +166,20 @@ void llvm_execute_on_thread_async(
   /// sockets. \p ThreadPoolNum represents a number bounded by [0,
   /// compute_thread_count()).
   void apply_thread_strategy(unsigned ThreadPoolNum) const;
+
+  /// Finds the CPU socket where a thread should go. Returns 'None' if the
+  /// thread shall remain on the actual CPU socket.
+  Optional<unsigned> compute_cpu_socket(unsigned ThreadPoolNum) const;
 };
 
+/// Build a strategy from a number of threads as a string provided in \p Num.
+/// When Num is above the max number of threads specified by the \p Default
+/// strategy, we attempt to equally allocate the threads on all CPU sockets.
+/// "0" or an empty string will return the \p Default strategy.
+/// "all" for using all hardware threads.
+Optional<ThreadPoolStrategy>
+get_threadpool_strategy(StringRef Num, ThreadPoolStrategy Default = {});
+
 /// Returns a thread strategy for tasks requiring significant memory or other
 /// resources. To be used for workloads where hardware_concurrency() proves to
 /// be less efficient. Avoid this strategy if doing lots of I/O. Currently
@@ -182,6 +194,18 @@ void llvm_execute_on_thread_async(
   return S;
 }
 
+/// Like heavyweight_hardware_concurrency() above, but builds a strategy
+/// based on the rules described for get_threadpool_strategy().
+/// If \p Num is invalid, returns a default strategy where one thread per
+/// hardware core is used.
+inline ThreadPoolStrategy heavyweight_hardware_concurrency(StringRef Num) {
+  Optional<ThreadPoolStrategy> S =
+      get_threadpool_strategy(Num, heavyweight_hardware_concurrency());
+  if (S)
+    return *S;
+  return heavyweight_hardware_concurrency();
+}
+
 /// Returns a default thread strategy where all available hardware resources
 /// are to be used, except for those initially excluded by an affinity mask.
 /// This function takes affinity into consideration. Returns 1 when LLVM is
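Since get_threadpool_strategy() returns None for malformed input, a front-end can validate a user-supplied value before falling back on the lenient overload above; a minimal sketch (the error call is a stand-in for the host tool's own reporting):

if (!llvm::get_threadpool_strategy(Jobs))
  llvm::report_fatal_error("invalid parallelism level: " + Jobs);
llvm::ThreadPoolStrategy S = llvm::heavyweight_hardware_concurrency(Jobs);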


@@ -477,7 +477,8 @@ LTO::RegularLTOState::RegularLTOState(unsigned ParallelCodeGenParallelismLevel,
 LTO::ThinLTOState::ThinLTOState(ThinBackend Backend)
     : Backend(Backend), CombinedIndex(/*HaveGVs*/ false) {
   if (!Backend)
-    this->Backend = createInProcessThinBackend();
+    this->Backend =
+        createInProcessThinBackend(llvm::heavyweight_hardware_concurrency());
 }
 
 LTO::LTO(Config Conf, ThinBackend Backend,
LTO::LTO(Config Conf, ThinBackend Backend,
@@ -1090,13 +1091,12 @@ class InProcessThinBackend : public ThinBackendProc {
 public:
   InProcessThinBackend(
       const Config &Conf, ModuleSummaryIndex &CombinedIndex,
-      unsigned ThinLTOParallelismLevel,
+      ThreadPoolStrategy ThinLTOParallelism,
       const StringMap<GVSummaryMapTy> &ModuleToDefinedGVSummaries,
       AddStreamFn AddStream, NativeObjectCache Cache)
       : ThinBackendProc(Conf, CombinedIndex, ModuleToDefinedGVSummaries),
-        BackendThreadPool(
-            heavyweight_hardware_concurrency(ThinLTOParallelismLevel)),
-        AddStream(std::move(AddStream)), Cache(std::move(Cache)) {
+        BackendThreadPool(ThinLTOParallelism), AddStream(std::move(AddStream)),
+        Cache(std::move(Cache)) {
     for (auto &Name : CombinedIndex.cfiFunctionDefs())
       CfiFunctionDefs.insert(
           GlobalValue::getGUID(GlobalValue::dropLLVMManglingEscape(Name)));
@@ -1192,13 +1192,13 @@ public:
 };
 } // end anonymous namespace
 
-ThinBackend lto::createInProcessThinBackend(unsigned ParallelismLevel) {
+ThinBackend lto::createInProcessThinBackend(ThreadPoolStrategy Parallelism) {
   return [=](const Config &Conf, ModuleSummaryIndex &CombinedIndex,
              const StringMap<GVSummaryMapTy> &ModuleToDefinedGVSummaries,
              AddStreamFn AddStream, NativeObjectCache Cache) {
     return std::make_unique<InProcessThinBackend>(
-        Conf, CombinedIndex, ParallelismLevel, ModuleToDefinedGVSummaries,
-        AddStream, Cache);
+        Conf, CombinedIndex, Parallelism, ModuleToDefinedGVSummaries, AddStream,
+        Cache);
   };
 }
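With the factory's new signature, a caller can choose the SMT policy explicitly; a hypothetical call site, not taken from this patch:

// Run the in-process ThinLTO backend on every hardware thread.
ThinBackend Backend = lto::createInProcessThinBackend(
    llvm::heavyweight_hardware_concurrency("all"));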


@@ -84,16 +84,34 @@ void llvm::llvm_execute_on_thread_async(
 int computeHostNumHardwareThreads();
 
 unsigned llvm::ThreadPoolStrategy::compute_thread_count() const {
+  if (ThreadsRequested > 0)
+    return ThreadsRequested;
+
   int MaxThreadCount = UseHyperThreads ? computeHostNumHardwareThreads()
                                        : sys::getHostNumPhysicalCores();
   if (MaxThreadCount <= 0)
     MaxThreadCount = 1;
-
-  // No need to create more threads than there are hardware threads, it would
-  // uselessly induce more context-switching and cache eviction.
-  if (!ThreadsRequested || ThreadsRequested > (unsigned)MaxThreadCount)
-    return MaxThreadCount;
-  return ThreadsRequested;
+  return MaxThreadCount;
 }
 
+Optional<ThreadPoolStrategy>
+llvm::get_threadpool_strategy(StringRef Num, ThreadPoolStrategy Default) {
+  if (Num == "all")
+    return llvm::hardware_concurrency();
+  if (Num.empty())
+    return Default;
+  unsigned V;
+  if (Num.getAsInteger(10, V))
+    return None; // malformed 'Num' value
+  if (V == 0)
+    return Default;
+
+  // Do not take the Default into account. This effectively disables
+  // heavyweight_hardware_concurrency() if the user asks for any number of
+  // threads on the cmd-line.
+  ThreadPoolStrategy S = llvm::hardware_concurrency();
+  S.ThreadsRequested = V;
+  return S;
+}
+
 namespace {
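To make the parsing rules concrete, these are the mappings the implementation above would produce on a hypothetical 2-socket machine with 16 cores and 32 hardware threads, with Default = heavyweight_hardware_concurrency(), i.e. 16 threads:

get_threadpool_strategy("", Default)    // -> Default: 16 threads, no SMT
get_threadpool_strategy("0", Default)   // -> Default: 16 threads, no SMT
get_threadpool_strategy("8", Default)   // -> 8 threads, SMT allowed
get_threadpool_strategy("64", Default)  // -> 64 threads, spread across both sockets
get_threadpool_strategy("all", Default) // -> hardware_concurrency(): 32 threads
get_threadpool_strategy("foo", Default) // -> None (malformed input)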


@@ -273,7 +273,7 @@ SetThreadPriorityResult llvm::set_thread_priority(ThreadPriority Priority) {
 int computeHostNumHardwareThreads() {
 #if defined(HAVE_SCHED_GETAFFINITY) && defined(HAVE_CPU_COUNT)
   cpu_set_t Set;
-  if (sched_getaffinity(0, sizeof(Set), &Set))
+  if (sched_getaffinity(0, sizeof(Set), &Set) == 0)
     return CPU_COUNT(&Set);
 #endif
   // Guard against std::thread::hardware_concurrency() returning 0.


@@ -131,6 +131,10 @@ struct ProcessorGroup {
   unsigned UsableThreads;
   unsigned ThreadsPerCore;
   uint64_t Affinity;
+
+  unsigned useableCores() const {
+    return std::max(1U, UsableThreads / ThreadsPerCore);
+  }
 };
 
 template <typename F>
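For instance, a processor group with UsableThreads == 16 and ThreadsPerCore == 2 reports 8 usable cores, and the std::max keeps the result at 1 on degenerate topologies (hypothetical values):

// {UsableThreads = 16, ThreadsPerCore = 2} -> useableCores() == 8
// {UsableThreads = 1,  ThreadsPerCore = 2} -> useableCores() == 1 (clamped)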
@@ -232,33 +236,41 @@ int computeHostNumHardwareThreads() {
   return Threads;
 }
 
-// Assign the current thread to a more appropriate CPU socket or CPU group
-void llvm::ThreadPoolStrategy::apply_thread_strategy(
-    unsigned ThreadPoolNum) const {
+// Finds the proper CPU socket where a thread number should go. Returns 'None'
+// if the thread shall remain on the actual CPU socket.
+Optional<unsigned>
+llvm::ThreadPoolStrategy::compute_cpu_socket(unsigned ThreadPoolNum) const {
   ArrayRef<ProcessorGroup> Groups = getProcessorGroups();
+
+  // Only one CPU socket in the system or process affinity was set, no need to
+  // move the thread(s) to another CPU socket.
+  if (Groups.size() <= 1)
+    return None;
+
+  // We ask for less threads than there are hardware threads per CPU socket, no
+  // need to dispatch threads to other CPU sockets.
+  unsigned MaxThreadsPerSocket =
+      UseHyperThreads ? Groups[0].UsableThreads : Groups[0].useableCores();
+  if (compute_thread_count() <= MaxThreadsPerSocket)
+    return None;
 
   assert(ThreadPoolNum < compute_thread_count() &&
          "The thread index is not within thread strategy's range!");
 
-  // In this mode, the ThreadNumber represents the core number, not the
-  // hyper-thread number. Assumes all NUMA groups have the same amount of
-  // hyper-threads.
-  if (!UseHyperThreads)
-    ThreadPoolNum *= Groups[0].ThreadsPerCore;
-
-  unsigned ThreadRangeStart = 0;
-  for (unsigned I = 0; I < Groups.size(); ++I) {
-    const ProcessorGroup &G = Groups[I];
-    if (ThreadPoolNum >= ThreadRangeStart &&
-        ThreadPoolNum < ThreadRangeStart + G.UsableThreads) {
-      GROUP_AFFINITY Affinity{};
-      Affinity.Group = G.ID;
-      Affinity.Mask = G.Affinity;
-      SetThreadGroupAffinity(GetCurrentThread(), &Affinity, nullptr);
-    }
-    ThreadRangeStart += G.UsableThreads;
-  }
+  // Assumes the same number of hardware threads per CPU socket.
+  return (ThreadPoolNum * Groups.size()) / compute_thread_count();
+}
+
+// Assign the current thread to a more appropriate CPU socket or CPU group
+void llvm::ThreadPoolStrategy::apply_thread_strategy(
+    unsigned ThreadPoolNum) const {
+  Optional<unsigned> Socket = compute_cpu_socket(ThreadPoolNum);
+  if (!Socket)
+    return;
+
+  ArrayRef<ProcessorGroup> Groups = getProcessorGroups();
+  GROUP_AFFINITY Affinity{};
+  Affinity.Group = Groups[*Socket].ID;
+  Affinity.Mask = Groups[*Socket].Affinity;
+  SetThreadGroupAffinity(GetCurrentThread(), &Affinity, nullptr);
+}
 
 llvm::BitVector llvm::get_thread_affinity_mask() {
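To illustrate the round-robin dispatch above: with two processor groups and compute_thread_count() == 64, the expression (ThreadPoolNum * Groups.size()) / compute_thread_count() sends pool threads 0..31 to socket 0 and threads 32..63 to socket 1, assuming, as the code does, the same number of hardware threads per socket:

// Hypothetical 2-socket system, 64 threads requested:
// ThreadPoolNum = 10 -> (10 * 2) / 64 = 0 -> stays on socket 0
// ThreadPoolNum = 40 -> (40 * 2) / 64 = 1 -> moved to socket 1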


@@ -28,6 +28,7 @@
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/Path.h"
 #include "llvm/Support/TargetSelect.h"
+#include "llvm/Support/Threading.h"
 #include "llvm/Support/raw_ostream.h"
 #include <list>
 #include <map>
@@ -134,11 +135,9 @@ namespace options {
   };
   static OutputType TheOutputType = OT_NORMAL;
   static unsigned OptLevel = 2;
-  // Default parallelism of 0 used to indicate that user did not specify.
-  // Actual parallelism default value depends on implementation.
   // Currently only affects ThinLTO, where the default is the max cores in the
-  // system.
-  static unsigned Parallelism = 0;
+  // system. See llvm::get_threadpool_strategy() for acceptable values.
+  static std::string Parallelism;
   // Default regular LTO codegen parallelism (number of partitions).
   static unsigned ParallelCodeGenParallelismLevel = 1;
 #ifdef NDEBUG
#ifdef NDEBUG
@@ -272,8 +271,10 @@ namespace options {
       message(LDPL_FATAL, "Optimization level must be between 0 and 3");
     OptLevel = opt[1] - '0';
   } else if (opt.startswith("jobs=")) {
-    if (StringRef(opt_ + 5).getAsInteger(10, Parallelism))
-      message(LDPL_FATAL, "Invalid parallelism level: %s", opt_ + 5);
+    StringRef Num(opt_ + 5);
+    if (!get_threadpool_strategy(Num))
+      message(LDPL_FATAL, "Invalid parallelism level: %s", Num.data());
+    Parallelism = Num;
   } else if (opt.startswith("lto-partitions=")) {
     if (opt.substr(strlen("lto-partitions="))
             .getAsInteger(10, ParallelCodeGenParallelismLevel))
@@ -877,14 +878,15 @@ static std::unique_ptr<LTO> createLTO(IndexWriteCallback OnIndexWrite,
   Conf.PTO.LoopVectorization = options::OptLevel > 1;
   Conf.PTO.SLPVectorization = options::OptLevel > 1;
 
-  if (options::Parallelism)
-    Backend = createInProcessThinBackend(options::Parallelism);
   if (options::thinlto_index_only) {
     std::string OldPrefix, NewPrefix;
     getThinLTOOldAndNewPrefix(OldPrefix, NewPrefix);
     Backend = createWriteIndexesThinBackend(OldPrefix, NewPrefix,
                                             options::thinlto_emit_imports_files,
                                             LinkedObjectsFile, OnIndexWrite);
+  } else {
+    Backend = createInProcessThinBackend(
+        llvm::heavyweight_hardware_concurrency(options::Parallelism));
   }
 
   Conf.OverrideTriple = options::triple;


@@ -68,9 +68,10 @@ static cl::opt<bool>
                  "distributed backend case"));
 
 // Default to using all available threads in the system, but using only one
-// thread per core, as indicated by the usage of
-// heavyweight_hardware_concurrency() in the InProcessThinBackend constructor.
-static cl::opt<int> Threads("thinlto-threads", cl::init(0));
+// thread per core (no SMT).
+// Use -thinlto-threads=all to use hardware_concurrency() instead, which means
+// to use all hardware threads or cores in the system.
+static cl::opt<std::string> Threads("thinlto-threads");
 
 static cl::list<std::string> SymbolResolutions(
     "r",
@@ -286,7 +287,8 @@ static int run(int argc, char **argv) {
                                            /* LinkedObjectsFile */ nullptr,
                                            /* OnWrite */ {});
   else
-    Backend = createInProcessThinBackend(Threads);
+    Backend = createInProcessThinBackend(
+        llvm::heavyweight_hardware_concurrency(Threads));
 
   LTO Lto(std::move(Conf), std::move(Backend));
 
   bool HasErrors = false;
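For reference, illustrative invocations exercising the new values; the input files and symbol resolutions below are placeholders:

lld-link main.obj util.obj /out:app.exe /opt:lldltojobs=all
llvm-lto2 run -thinlto-threads=all -o app.o -r main.bc,main,plx main.bc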