mirror of https://github.com/RPCS3/llvm-mirror.git
[ThinLTO] Allow usage of all hardware threads in the system
Before this patch, it wasn't possible to extend the ThinLTO threads to all SMT/CMT threads in the system. Only one thread per core was allowed, instructed by usage of llvm::heavyweight_hardware_concurrency() in the ThinLTO code. Any number passed to the LLD flag /opt:lldltojobs=..., or any other ThinLTO-specific flag, was previously interpreted in the context of llvm::heavyweight_hardware_concurrency(), which means SMT disabled.

One can now say in LLD:

/opt:lldltojobs=0   -- Use one std::thread / hardware core in the system (no SMT). Default value if flag not specified.
/opt:lldltojobs=N   -- Limit usage to N threads, regardless of usage of heavyweight_hardware_concurrency().
/opt:lldltojobs=all -- Use all hardware threads in the system. Equivalent to /opt:lldltojobs=$(nproc) on Linux and /opt:lldltojobs=%NUMBER_OF_PROCESSORS% on Windows. When an affinity mask is set for the process, threads will be created only for the cores selected by the mask.

When N > number-of-hardware-threads-in-the-system, the threads in the thread pool will be dispatched equally on all CPU sockets (tested only on Windows).
When N <= number-of-hardware-threads-on-a-CPU-socket, the threads will remain on the CPU socket where the process started (only on Windows).

Differential Revision: https://reviews.llvm.org/D75153
parent 153a6c2dbd
commit 61ed3dc5bf
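To make the new plumbing concrete, here is a minimal sketch of how a linker driver can map a user-supplied jobs string onto the API introduced by this patch. It mirrors the gold-plugin and llvm-lto2 changes below; makeBackend itself is hypothetical, not part of the commit.

#include "llvm/LTO/LTO.h"
#include "llvm/Support/Threading.h"
using namespace llvm;

// "" or "0" -> one thread per hardware core (no SMT), the default;
// "N"       -> exactly N threads, SMT allowed;
// "all"     -> every hardware thread, honoring the process affinity mask.
lto::ThinBackend makeBackend(StringRef JobsFlag) {
  return lto::createInProcessThinBackend(
      heavyweight_hardware_concurrency(JobsFlag));
}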
@@ -228,7 +228,7 @@ using ThinBackend = std::function<std::unique_ptr<ThinBackendProc>(

 /// This ThinBackend runs the individual backend jobs in-process.
 /// The default value means to use one job per hardware core (not hyper-thread).
-ThinBackend createInProcessThinBackend(unsigned ParallelismLevel = 0);
+ThinBackend createInProcessThinBackend(ThreadPoolStrategy Parallelism);

 /// This ThinBackend writes individual module indexes to files, instead of
 /// running the individual backend jobs. This backend is for distributed builds

@@ -166,8 +166,20 @@ void llvm_execute_on_thread_async(
   /// sockets. \p ThreadPoolNum represents a number bounded by [0,
   /// compute_thread_count()).
   void apply_thread_strategy(unsigned ThreadPoolNum) const;
+
+  /// Finds the CPU socket where a thread should go. Returns 'None' if the
+  /// thread shall remain on the actual CPU socket.
+  Optional<unsigned> compute_cpu_socket(unsigned ThreadPoolNum) const;
 };

+/// Build a strategy from a number of threads as a string provided in \p Num.
+/// When Num is above the max number of threads specified by the \p Default
+/// strategy, we attempt to equally allocate the threads on all CPU sockets.
+/// "0" or an empty string will return the \p Default strategy.
+/// "all" for using all hardware threads.
+Optional<ThreadPoolStrategy>
+get_threadpool_strategy(StringRef Num, ThreadPoolStrategy Default = {});
+
 /// Returns a thread strategy for tasks requiring significant memory or other
 /// resources. To be used for workloads where hardware_concurrency() proves to
 /// be less efficient. Avoid this strategy if doing lots of I/O. Currently

@@ -182,6 +194,18 @@ void llvm_execute_on_thread_async(
   return S;
 }

+/// Like heavyweight_hardware_concurrency() above, but builds a strategy
+/// based on the rules described for get_threadpool_strategy().
+/// If \p Num is invalid, returns a default strategy where one thread per
+/// hardware core is used.
+inline ThreadPoolStrategy heavyweight_hardware_concurrency(StringRef Num) {
+  Optional<ThreadPoolStrategy> S =
+      get_threadpool_strategy(Num, heavyweight_hardware_concurrency());
+  if (S)
+    return *S;
+  return heavyweight_hardware_concurrency();
+}
+
 /// Returns a default thread strategy where all available hardware resources
 /// are to be used, except for those initially excluded by an affinity mask.
 /// This function takes affinity into consideration. Returns 1 when LLVM is

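A quick illustration of the parsing rules documented above (a sketch assuming the declarations from this header, not a test from the commit):

Optional<ThreadPoolStrategy> S;
S = get_threadpool_strategy("");    // empty -> the Default strategy
S = get_threadpool_strategy("0");   // "0" -> the Default strategy
S = get_threadpool_strategy("16");  // 16 threads, overriding the Default
S = get_threadpool_strategy("all"); // all hardware threads in the system
S = get_threadpool_strategy("x");   // malformed -> None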
@@ -477,7 +477,8 @@ LTO::RegularLTOState::RegularLTOState(unsigned ParallelCodeGenParallelismLevel,
 LTO::ThinLTOState::ThinLTOState(ThinBackend Backend)
     : Backend(Backend), CombinedIndex(/*HaveGVs*/ false) {
   if (!Backend)
-    this->Backend = createInProcessThinBackend();
+    this->Backend =
+        createInProcessThinBackend(llvm::heavyweight_hardware_concurrency());
 }

 LTO::LTO(Config Conf, ThinBackend Backend,

@@ -1090,13 +1091,12 @@ class InProcessThinBackend : public ThinBackendProc {
 public:
   InProcessThinBackend(
       const Config &Conf, ModuleSummaryIndex &CombinedIndex,
-      unsigned ThinLTOParallelismLevel,
+      ThreadPoolStrategy ThinLTOParallelism,
       const StringMap<GVSummaryMapTy> &ModuleToDefinedGVSummaries,
       AddStreamFn AddStream, NativeObjectCache Cache)
       : ThinBackendProc(Conf, CombinedIndex, ModuleToDefinedGVSummaries),
-        BackendThreadPool(
-            heavyweight_hardware_concurrency(ThinLTOParallelismLevel)),
-        AddStream(std::move(AddStream)), Cache(std::move(Cache)) {
+        BackendThreadPool(ThinLTOParallelism), AddStream(std::move(AddStream)),
+        Cache(std::move(Cache)) {
     for (auto &Name : CombinedIndex.cfiFunctionDefs())
       CfiFunctionDefs.insert(
           GlobalValue::getGUID(GlobalValue::dropLLVMManglingEscape(Name)));

@@ -1192,13 +1192,13 @@ public:
 };
 } // end anonymous namespace

-ThinBackend lto::createInProcessThinBackend(unsigned ParallelismLevel) {
+ThinBackend lto::createInProcessThinBackend(ThreadPoolStrategy Parallelism) {
   return [=](const Config &Conf, ModuleSummaryIndex &CombinedIndex,
              const StringMap<GVSummaryMapTy> &ModuleToDefinedGVSummaries,
              AddStreamFn AddStream, NativeObjectCache Cache) {
     return std::make_unique<InProcessThinBackend>(
-        Conf, CombinedIndex, ParallelismLevel, ModuleToDefinedGVSummaries,
-        AddStream, Cache);
+        Conf, CombinedIndex, Parallelism, ModuleToDefinedGVSummaries, AddStream,
+        Cache);
   };
 }

@@ -84,16 +84,34 @@ void llvm::llvm_execute_on_thread_async(
+int computeHostNumHardwareThreads();
+
 unsigned llvm::ThreadPoolStrategy::compute_thread_count() const {
+  if (ThreadsRequested > 0)
+    return ThreadsRequested;
+
   int MaxThreadCount = UseHyperThreads ? computeHostNumHardwareThreads()
                                        : sys::getHostNumPhysicalCores();
   if (MaxThreadCount <= 0)
     MaxThreadCount = 1;
+  return MaxThreadCount;
+}

-  // No need to create more threads than there are hardware threads, it would
-  // uselessly induce more context-switching and cache eviction.
-  if (!ThreadsRequested || ThreadsRequested > (unsigned)MaxThreadCount)
-    return MaxThreadCount;
-  return ThreadsRequested;
+Optional<ThreadPoolStrategy>
+llvm::get_threadpool_strategy(StringRef Num, ThreadPoolStrategy Default) {
+  if (Num == "all")
+    return llvm::hardware_concurrency();
+  if (Num.empty())
+    return Default;
+  unsigned V;
+  if (Num.getAsInteger(10, V))
+    return None; // malformed 'Num' value
+  if (V == 0)
+    return Default;
+
+  // Do not take the Default into account. This effectively disables
+  // heavyweight_hardware_concurrency() if the user asks for any number of
+  // threads on the cmd-line.
+  ThreadPoolStrategy S = llvm::hardware_concurrency();
+  S.ThreadsRequested = V;
+  return S;
 }

 namespace {

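Note the behavioral change in compute_thread_count(): an explicit ThreadsRequested is no longer clamped to the hardware thread count, which is what allows oversubscribing the machine. A sketch with hypothetical numbers:

ThreadPoolStrategy S = hardware_concurrency();
S.ThreadsRequested = 256;              // may exceed the machine's threads
unsigned N = S.compute_thread_count(); // now 256; before this patch the
                                       // value was capped at the hardware
                                       // thread count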
@@ -273,7 +273,7 @@ SetThreadPriorityResult llvm::set_thread_priority(ThreadPriority Priority) {
 int computeHostNumHardwareThreads() {
 #if defined(HAVE_SCHED_GETAFFINITY) && defined(HAVE_CPU_COUNT)
   cpu_set_t Set;
-  if (sched_getaffinity(0, sizeof(Set), &Set))
+  if (sched_getaffinity(0, sizeof(Set), &Set) == 0)
     return CPU_COUNT(&Set);
 #endif
   // Guard against std::thread::hardware_concurrency() returning 0.

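The Unix change above is a bug fix: sched_getaffinity() returns 0 on success, so the old code consulted CPU_COUNT() only when the call had failed. A standalone sketch of the corrected pattern (assumes Linux with glibc, where CPU_COUNT is available):

#include <sched.h>
#include <cstdio>

int main() {
  cpu_set_t Set;
  if (sched_getaffinity(0, sizeof(Set), &Set) == 0) // 0 means success
    std::printf("usable CPUs: %d\n", CPU_COUNT(&Set));
  return 0;
}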
@@ -131,6 +131,10 @@ struct ProcessorGroup {
   unsigned UsableThreads;
   unsigned ThreadsPerCore;
   uint64_t Affinity;
+
+  unsigned useableCores() const {
+    return std::max(1U, UsableThreads / ThreadsPerCore);
+  }
 };

 template <typename F>

@@ -232,33 +236,41 @@ int computeHostNumHardwareThreads() {
   return Threads;
 }

-// Assign the current thread to a more appropriate CPU socket or CPU group
-void llvm::ThreadPoolStrategy::apply_thread_strategy(
-    unsigned ThreadPoolNum) const {
+// Finds the proper CPU socket where a thread number should go. Returns 'None'
+// if the thread shall remain on the actual CPU socket.
+Optional<unsigned>
+llvm::ThreadPoolStrategy::compute_cpu_socket(unsigned ThreadPoolNum) const {
   ArrayRef<ProcessorGroup> Groups = getProcessorGroups();
+  // Only one CPU socket in the system or process affinity was set, no need to
+  // move the thread(s) to another CPU socket.
+  if (Groups.size() <= 1)
+    return None;
+
+  // We ask for less threads than there are hardware threads per CPU socket, no
+  // need to dispatch threads to other CPU sockets.
+  unsigned MaxThreadsPerSocket =
+      UseHyperThreads ? Groups[0].UsableThreads : Groups[0].useableCores();
+  if (compute_thread_count() <= MaxThreadsPerSocket)
+    return None;

   assert(ThreadPoolNum < compute_thread_count() &&
          "The thread index is not within thread strategy's range!");

   // In this mode, the ThreadNumber represents the core number, not the
   // hyper-thread number. Assumes all NUMA groups have the same amount of
   // hyper-threads.
   if (!UseHyperThreads)
     ThreadPoolNum *= Groups[0].ThreadsPerCore;

-  unsigned ThreadRangeStart = 0;
-  for (unsigned I = 0; I < Groups.size(); ++I) {
-    const ProcessorGroup &G = Groups[I];
-    if (ThreadPoolNum >= ThreadRangeStart &&
-        ThreadPoolNum < ThreadRangeStart + G.UsableThreads) {
-
-      GROUP_AFFINITY Affinity{};
-      Affinity.Group = G.ID;
-      Affinity.Mask = G.Affinity;
-      SetThreadGroupAffinity(GetCurrentThread(), &Affinity, nullptr);
-    }
-    ThreadRangeStart += G.UsableThreads;
-  }
+  // Assumes the same number of hardware threads per CPU socket.
+  return (ThreadPoolNum * Groups.size()) / compute_thread_count();
+}
+
+// Assign the current thread to a more appropriate CPU socket or CPU group
+void llvm::ThreadPoolStrategy::apply_thread_strategy(
+    unsigned ThreadPoolNum) const {
+  Optional<unsigned> Socket = compute_cpu_socket(ThreadPoolNum);
+  if (!Socket)
+    return;
+  ArrayRef<ProcessorGroup> Groups = getProcessorGroups();
+  GROUP_AFFINITY Affinity{};
+  Affinity.Group = Groups[*Socket].ID;
+  Affinity.Mask = Groups[*Socket].Affinity;
+  SetThreadGroupAffinity(GetCurrentThread(), &Affinity, nullptr);
 }

 llvm::BitVector llvm::get_thread_affinity_mask() {

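A worked example of the round-robin assignment computed by compute_cpu_socket() above, using hypothetical numbers (2 CPU sockets, a 16-thread strategy):

#include <cstdio>

int main() {
  unsigned Sockets = 2, ThreadCount = 16;
  for (unsigned I = 0; I < ThreadCount; ++I)
    std::printf("thread %u -> socket %u\n", I, (I * Sockets) / ThreadCount);
  // Threads 0..7 land on socket 0, threads 8..15 on socket 1.
  return 0;
}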
@@ -28,6 +28,7 @@
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/Path.h"
 #include "llvm/Support/TargetSelect.h"
+#include "llvm/Support/Threading.h"
 #include "llvm/Support/raw_ostream.h"
 #include <list>
 #include <map>

@@ -134,11 +135,9 @@ namespace options {
   };
   static OutputType TheOutputType = OT_NORMAL;
   static unsigned OptLevel = 2;
-  // Default parallelism of 0 used to indicate that user did not specify.
-  // Actual parallelism default value depends on implementation.
   // Currently only affects ThinLTO, where the default is the max cores in the
-  // system.
-  static unsigned Parallelism = 0;
+  // system. See llvm::get_threadpool_strategy() for acceptable values.
+  static std::string Parallelism;
   // Default regular LTO codegen parallelism (number of partitions).
   static unsigned ParallelCodeGenParallelismLevel = 1;
 #ifdef NDEBUG

@@ -272,8 +271,10 @@ namespace options {
       message(LDPL_FATAL, "Optimization level must be between 0 and 3");
     OptLevel = opt[1] - '0';
   } else if (opt.startswith("jobs=")) {
-    if (StringRef(opt_ + 5).getAsInteger(10, Parallelism))
-      message(LDPL_FATAL, "Invalid parallelism level: %s", opt_ + 5);
+    StringRef Num(opt_ + 5);
+    if (!get_threadpool_strategy(Num))
+      message(LDPL_FATAL, "Invalid parallelism level: %s", Num.data());
+    Parallelism = Num;
   } else if (opt.startswith("lto-partitions=")) {
     if (opt.substr(strlen("lto-partitions="))
             .getAsInteger(10, ParallelCodeGenParallelismLevel))

@@ -877,14 +878,15 @@ static std::unique_ptr<LTO> createLTO(IndexWriteCallback OnIndexWrite,
   Conf.PTO.LoopVectorization = options::OptLevel > 1;
   Conf.PTO.SLPVectorization = options::OptLevel > 1;

-  if (options::Parallelism)
-    Backend = createInProcessThinBackend(options::Parallelism);
   if (options::thinlto_index_only) {
     std::string OldPrefix, NewPrefix;
     getThinLTOOldAndNewPrefix(OldPrefix, NewPrefix);
     Backend = createWriteIndexesThinBackend(OldPrefix, NewPrefix,
                                             options::thinlto_emit_imports_files,
                                             LinkedObjectsFile, OnIndexWrite);
+  } else {
+    Backend = createInProcessThinBackend(
+        llvm::heavyweight_hardware_concurrency(options::Parallelism));
   }

   Conf.OverrideTriple = options::triple;

@@ -68,9 +68,10 @@ static cl::opt<bool>
                    "distributed backend case"));

 // Default to using all available threads in the system, but using only one
-// thread per core, as indicated by the usage of
-// heavyweight_hardware_concurrency() in the InProcessThinBackend constructor.
-static cl::opt<int> Threads("thinlto-threads", cl::init(0));
+// thread per core (no SMT).
+// Use -thinlto-threads=all to use hardware_concurrency() instead, which means
+// to use all hardware threads or cores in the system.
+static cl::opt<std::string> Threads("thinlto-threads");

 static cl::list<std::string> SymbolResolutions(
     "r",

@@ -286,7 +287,8 @@ static int run(int argc, char **argv) {
                                             /* LinkedObjectsFile */ nullptr,
                                             /* OnWrite */ {});
   else
-    Backend = createInProcessThinBackend(Threads);
+    Backend = createInProcessThinBackend(
+        llvm::heavyweight_hardware_concurrency(Threads));
   LTO Lto(std::move(Conf), std::move(Backend));

   bool HasErrors = false;
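For reference, example invocations exercising the new string-valued flags (inputs and paths are placeholders, not taken from this commit): llvm-lto2 run -thinlto-threads=all, gold with -plugin-opt=jobs=all, and LLD COFF with /opt:lldltojobs=all.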