1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-11-23 03:02:36 +01:00
llvm-mirror/lib/Support/Windows/Threading.inc

299 lines
9.7 KiB
PHP
Raw Normal View History

//===- Windows/Threading.inc - Win32 Threading Implementation - -*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file provides the Win32 specific implementation of Threading functions.
//
//===----------------------------------------------------------------------===//
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Support/Windows/WindowsSupport.h"
#include <process.h>
[Support] On Windows, ensure hardware_concurrency() extends to all CPU sockets and all NUMA groups The goal of this patch is to maximize CPU utilization on multi-socket or high core count systems, so that parallel computations such as LLD/ThinLTO can use all hardware threads in the system. Before this patch, on Windows, a maximum of 64 hardware threads could be used at most, in some cases dispatched only on one CPU socket. == Background == Windows doesn't have a flat cpu_set_t like Linux. Instead, it projects hardware CPUs (or NUMA nodes) to applications through a concept of "processor groups". A "processor" is the smallest unit of execution on a CPU, that is, an hyper-thread if SMT is active; a core otherwise. There's a limit of 32-bit processors on older 32-bit versions of Windows, which later was raised to 64-processors with 64-bit versions of Windows. This limit comes from the affinity mask, which historically is represented by the sizeof(void*). Consequently, the concept of "processor groups" was introduced for dealing with systems with more than 64 hyper-threads. By default, the Windows OS assigns only one "processor group" to each starting application, in a round-robin manner. If the application wants to use more processors, it needs to programmatically enable it, by assigning threads to other "processor groups". This also means that affinity cannot cross "processor group" boundaries; one can only specify a "preferred" group on start-up, but the application is free to allocate more groups if it wants to. This creates a peculiar situation, where newer CPUs like the AMD EPYC 7702P (64-cores, 128-hyperthreads) are projected by the OS as two (2) "processor groups". This means that by default, an application can only use half of the cores. This situation could only get worse in the years to come, as dies with more cores will appear on the market. == The problem == The heavyweight_hardware_concurrency() API was introduced so that only *one hardware thread per core* was used. Once that API returns, that original intention is lost, only the number of threads is retained. Consider a situation, on Windows, where the system has 2 CPU sockets, 18 cores each, each core having 2 hyper-threads, for a total of 72 hyper-threads. Both heavyweight_hardware_concurrency() and hardware_concurrency() currently return 36, because on Windows they are simply wrappers over std::thread::hardware_concurrency() -- which can only return processors from the current "processor group". == The changes in this patch == To solve this situation, we capture (and retain) the initial intention until the point of usage, through a new ThreadPoolStrategy class. The number of threads to use is deferred as late as possible, until the moment where the std::threads are created (ThreadPool in the case of ThinLTO). When using hardware_concurrency(), setting ThreadCount to 0 now means to use all the possible hardware CPU (SMT) threads. Providing a ThreadCount above to the maximum number of threads will have no effect, the maximum will be used instead. The heavyweight_hardware_concurrency() is similar to hardware_concurrency(), except that only one thread per hardware *core* will be used. When LLVM_ENABLE_THREADS is OFF, the threading APIs will always return 1, to ensure any caller loops will be exercised at least once. Differential Revision: https://reviews.llvm.org/D71775
2020-02-14 04:49:57 +01:00
#include <bitset>
// Windows will at times define MemoryFence.
#ifdef MemoryFence
#undef MemoryFence
#endif
static unsigned __stdcall threadFuncSync(void *Arg) {
SyncThreadInfo *TI = static_cast<SyncThreadInfo *>(Arg);
TI->UserFn(TI->UserData);
return 0;
}
static unsigned __stdcall threadFuncAsync(void *Arg) {
std::unique_ptr<AsyncThreadInfo> Info(static_cast<AsyncThreadInfo *>(Arg));
(*Info)();
return 0;
}
static void
llvm_execute_on_thread_impl(unsigned (__stdcall *ThreadFunc)(void *), void *Arg,
llvm::Optional<unsigned> StackSizeInBytes,
JoiningPolicy JP) {
HANDLE hThread = (HANDLE)::_beginthreadex(
NULL, StackSizeInBytes.getValueOr(0), ThreadFunc, Arg, 0, NULL);
if (!hThread) {
ReportLastErrorFatal("_beginthreadex failed");
}
if (JP == JoiningPolicy::Join) {
if (::WaitForSingleObject(hThread, INFINITE) == WAIT_FAILED) {
ReportLastErrorFatal("WaitForSingleObject failed");
}
}
if (::CloseHandle(hThread) == FALSE) {
ReportLastErrorFatal("CloseHandle failed");
}
}
uint64_t llvm::get_threadid() {
return uint64_t(::GetCurrentThreadId());
}
uint32_t llvm::get_max_thread_name_length() { return 0; }
#if defined(_MSC_VER)
static void SetThreadName(DWORD Id, LPCSTR Name) {
constexpr DWORD MS_VC_EXCEPTION = 0x406D1388;
#pragma pack(push, 8)
struct THREADNAME_INFO {
DWORD dwType; // Must be 0x1000.
LPCSTR szName; // Pointer to thread name
DWORD dwThreadId; // Thread ID (-1 == current thread)
DWORD dwFlags; // Reserved. Do not use.
};
#pragma pack(pop)
THREADNAME_INFO info;
info.dwType = 0x1000;
info.szName = Name;
info.dwThreadId = Id;
info.dwFlags = 0;
__try {
::RaiseException(MS_VC_EXCEPTION, 0, sizeof(info) / sizeof(ULONG_PTR),
(ULONG_PTR *)&info);
}
__except (EXCEPTION_EXECUTE_HANDLER) {
}
}
#endif
void llvm::set_thread_name(const Twine &Name) {
#if defined(_MSC_VER)
// Make sure the input is null terminated.
SmallString<64> Storage;
StringRef NameStr = Name.toNullTerminatedStringRef(Storage);
SetThreadName(::GetCurrentThreadId(), NameStr.data());
#endif
}
void llvm::get_thread_name(SmallVectorImpl<char> &Name) {
// "Name" is not an inherent property of a thread on Windows. In fact, when
// you "set" the name, you are only firing a one-time message to a debugger
// which it interprets as a program setting its threads' name. We may be
// able to get fancy by creating a TLS entry when someone calls
// set_thread_name so that subsequent calls to get_thread_name return this
// value.
Name.clear();
}
SetThreadPriorityResult llvm::set_thread_priority(ThreadPriority Priority) {
// https://docs.microsoft.com/en-us/windows/desktop/api/processthreadsapi/nf-processthreadsapi-setthreadpriority
// Begin background processing mode. The system lowers the resource scheduling
// priorities of the thread so that it can perform background work without
// significantly affecting activity in the foreground.
// End background processing mode. The system restores the resource scheduling
// priorities of the thread as they were before the thread entered background
// processing mode.
return SetThreadPriority(GetCurrentThread(),
Priority == ThreadPriority::Background
? THREAD_MODE_BACKGROUND_BEGIN
: THREAD_MODE_BACKGROUND_END)
? SetThreadPriorityResult::SUCCESS
: SetThreadPriorityResult::FAILURE;
}
[Support] On Windows, ensure hardware_concurrency() extends to all CPU sockets and all NUMA groups The goal of this patch is to maximize CPU utilization on multi-socket or high core count systems, so that parallel computations such as LLD/ThinLTO can use all hardware threads in the system. Before this patch, on Windows, a maximum of 64 hardware threads could be used at most, in some cases dispatched only on one CPU socket. == Background == Windows doesn't have a flat cpu_set_t like Linux. Instead, it projects hardware CPUs (or NUMA nodes) to applications through a concept of "processor groups". A "processor" is the smallest unit of execution on a CPU, that is, an hyper-thread if SMT is active; a core otherwise. There's a limit of 32-bit processors on older 32-bit versions of Windows, which later was raised to 64-processors with 64-bit versions of Windows. This limit comes from the affinity mask, which historically is represented by the sizeof(void*). Consequently, the concept of "processor groups" was introduced for dealing with systems with more than 64 hyper-threads. By default, the Windows OS assigns only one "processor group" to each starting application, in a round-robin manner. If the application wants to use more processors, it needs to programmatically enable it, by assigning threads to other "processor groups". This also means that affinity cannot cross "processor group" boundaries; one can only specify a "preferred" group on start-up, but the application is free to allocate more groups if it wants to. This creates a peculiar situation, where newer CPUs like the AMD EPYC 7702P (64-cores, 128-hyperthreads) are projected by the OS as two (2) "processor groups". This means that by default, an application can only use half of the cores. This situation could only get worse in the years to come, as dies with more cores will appear on the market. == The problem == The heavyweight_hardware_concurrency() API was introduced so that only *one hardware thread per core* was used. Once that API returns, that original intention is lost, only the number of threads is retained. Consider a situation, on Windows, where the system has 2 CPU sockets, 18 cores each, each core having 2 hyper-threads, for a total of 72 hyper-threads. Both heavyweight_hardware_concurrency() and hardware_concurrency() currently return 36, because on Windows they are simply wrappers over std::thread::hardware_concurrency() -- which can only return processors from the current "processor group". == The changes in this patch == To solve this situation, we capture (and retain) the initial intention until the point of usage, through a new ThreadPoolStrategy class. The number of threads to use is deferred as late as possible, until the moment where the std::threads are created (ThreadPool in the case of ThinLTO). When using hardware_concurrency(), setting ThreadCount to 0 now means to use all the possible hardware CPU (SMT) threads. Providing a ThreadCount above to the maximum number of threads will have no effect, the maximum will be used instead. The heavyweight_hardware_concurrency() is similar to hardware_concurrency(), except that only one thread per hardware *core* will be used. When LLVM_ENABLE_THREADS is OFF, the threading APIs will always return 1, to ensure any caller loops will be exercised at least once. Differential Revision: https://reviews.llvm.org/D71775
2020-02-14 04:49:57 +01:00
struct ProcessorGroup {
unsigned ID;
unsigned AllThreads;
unsigned UsableThreads;
unsigned ThreadsPerCore;
uint64_t Affinity;
unsigned useableCores() const {
return std::max(1U, UsableThreads / ThreadsPerCore);
}
[Support] On Windows, ensure hardware_concurrency() extends to all CPU sockets and all NUMA groups The goal of this patch is to maximize CPU utilization on multi-socket or high core count systems, so that parallel computations such as LLD/ThinLTO can use all hardware threads in the system. Before this patch, on Windows, a maximum of 64 hardware threads could be used at most, in some cases dispatched only on one CPU socket. == Background == Windows doesn't have a flat cpu_set_t like Linux. Instead, it projects hardware CPUs (or NUMA nodes) to applications through a concept of "processor groups". A "processor" is the smallest unit of execution on a CPU, that is, an hyper-thread if SMT is active; a core otherwise. There's a limit of 32-bit processors on older 32-bit versions of Windows, which later was raised to 64-processors with 64-bit versions of Windows. This limit comes from the affinity mask, which historically is represented by the sizeof(void*). Consequently, the concept of "processor groups" was introduced for dealing with systems with more than 64 hyper-threads. By default, the Windows OS assigns only one "processor group" to each starting application, in a round-robin manner. If the application wants to use more processors, it needs to programmatically enable it, by assigning threads to other "processor groups". This also means that affinity cannot cross "processor group" boundaries; one can only specify a "preferred" group on start-up, but the application is free to allocate more groups if it wants to. This creates a peculiar situation, where newer CPUs like the AMD EPYC 7702P (64-cores, 128-hyperthreads) are projected by the OS as two (2) "processor groups". This means that by default, an application can only use half of the cores. This situation could only get worse in the years to come, as dies with more cores will appear on the market. == The problem == The heavyweight_hardware_concurrency() API was introduced so that only *one hardware thread per core* was used. Once that API returns, that original intention is lost, only the number of threads is retained. Consider a situation, on Windows, where the system has 2 CPU sockets, 18 cores each, each core having 2 hyper-threads, for a total of 72 hyper-threads. Both heavyweight_hardware_concurrency() and hardware_concurrency() currently return 36, because on Windows they are simply wrappers over std::thread::hardware_concurrency() -- which can only return processors from the current "processor group". == The changes in this patch == To solve this situation, we capture (and retain) the initial intention until the point of usage, through a new ThreadPoolStrategy class. The number of threads to use is deferred as late as possible, until the moment where the std::threads are created (ThreadPool in the case of ThinLTO). When using hardware_concurrency(), setting ThreadCount to 0 now means to use all the possible hardware CPU (SMT) threads. Providing a ThreadCount above to the maximum number of threads will have no effect, the maximum will be used instead. The heavyweight_hardware_concurrency() is similar to hardware_concurrency(), except that only one thread per hardware *core* will be used. When LLVM_ENABLE_THREADS is OFF, the threading APIs will always return 1, to ensure any caller loops will be exercised at least once. Differential Revision: https://reviews.llvm.org/D71775
2020-02-14 04:49:57 +01:00
};
template <typename F>
static bool IterateProcInfo(LOGICAL_PROCESSOR_RELATIONSHIP Relationship, F Fn) {
DWORD Len = 0;
BOOL R = ::GetLogicalProcessorInformationEx(Relationship, NULL, &Len);
if (R || GetLastError() != ERROR_INSUFFICIENT_BUFFER) {
return false;
}
auto *Info = (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *)calloc(1, Len);
R = ::GetLogicalProcessorInformationEx(Relationship, Info, &Len);
if (R) {
auto *End =
(SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *)((uint8_t *)Info + Len);
for (auto *Curr = Info; Curr < End;
Curr = (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *)((uint8_t *)Curr +
Curr->Size)) {
if (Curr->Relationship != Relationship)
continue;
Fn(Curr);
}
}
free(Info);
return true;
}
static ArrayRef<ProcessorGroup> getProcessorGroups() {
auto computeGroups = []() {
SmallVector<ProcessorGroup, 4> Groups;
auto HandleGroup = [&](SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *ProcInfo) {
GROUP_RELATIONSHIP &El = ProcInfo->Group;
for (unsigned J = 0; J < El.ActiveGroupCount; ++J) {
ProcessorGroup G;
G.ID = Groups.size();
G.AllThreads = El.GroupInfo[J].MaximumProcessorCount;
G.UsableThreads = El.GroupInfo[J].ActiveProcessorCount;
assert(G.UsableThreads <= 64);
G.Affinity = El.GroupInfo[J].ActiveProcessorMask;
Groups.push_back(G);
}
};
if (!IterateProcInfo(RelationGroup, HandleGroup))
return std::vector<ProcessorGroup>();
auto HandleProc = [&](SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *ProcInfo) {
PROCESSOR_RELATIONSHIP &El = ProcInfo->Processor;
assert(El.GroupCount == 1);
unsigned NumHyperThreads = 1;
// If the flag is set, each core supports more than one hyper-thread.
if (El.Flags & LTP_PC_SMT)
NumHyperThreads = std::bitset<64>(El.GroupMask[0].Mask).count();
unsigned I = El.GroupMask[0].Group;
Groups[I].ThreadsPerCore = NumHyperThreads;
};
if (!IterateProcInfo(RelationProcessorCore, HandleProc))
return std::vector<ProcessorGroup>();
// If there's an affinity mask set on one of the CPUs, then assume the user
// wants to constrain the current process to only a single CPU.
for (auto &G : Groups) {
if (G.UsableThreads != G.AllThreads) {
ProcessorGroup NewG{G};
[Support] On Windows, ensure hardware_concurrency() extends to all CPU sockets and all NUMA groups The goal of this patch is to maximize CPU utilization on multi-socket or high core count systems, so that parallel computations such as LLD/ThinLTO can use all hardware threads in the system. Before this patch, on Windows, a maximum of 64 hardware threads could be used at most, in some cases dispatched only on one CPU socket. == Background == Windows doesn't have a flat cpu_set_t like Linux. Instead, it projects hardware CPUs (or NUMA nodes) to applications through a concept of "processor groups". A "processor" is the smallest unit of execution on a CPU, that is, an hyper-thread if SMT is active; a core otherwise. There's a limit of 32-bit processors on older 32-bit versions of Windows, which later was raised to 64-processors with 64-bit versions of Windows. This limit comes from the affinity mask, which historically is represented by the sizeof(void*). Consequently, the concept of "processor groups" was introduced for dealing with systems with more than 64 hyper-threads. By default, the Windows OS assigns only one "processor group" to each starting application, in a round-robin manner. If the application wants to use more processors, it needs to programmatically enable it, by assigning threads to other "processor groups". This also means that affinity cannot cross "processor group" boundaries; one can only specify a "preferred" group on start-up, but the application is free to allocate more groups if it wants to. This creates a peculiar situation, where newer CPUs like the AMD EPYC 7702P (64-cores, 128-hyperthreads) are projected by the OS as two (2) "processor groups". This means that by default, an application can only use half of the cores. This situation could only get worse in the years to come, as dies with more cores will appear on the market. == The problem == The heavyweight_hardware_concurrency() API was introduced so that only *one hardware thread per core* was used. Once that API returns, that original intention is lost, only the number of threads is retained. Consider a situation, on Windows, where the system has 2 CPU sockets, 18 cores each, each core having 2 hyper-threads, for a total of 72 hyper-threads. Both heavyweight_hardware_concurrency() and hardware_concurrency() currently return 36, because on Windows they are simply wrappers over std::thread::hardware_concurrency() -- which can only return processors from the current "processor group". == The changes in this patch == To solve this situation, we capture (and retain) the initial intention until the point of usage, through a new ThreadPoolStrategy class. The number of threads to use is deferred as late as possible, until the moment where the std::threads are created (ThreadPool in the case of ThinLTO). When using hardware_concurrency(), setting ThreadCount to 0 now means to use all the possible hardware CPU (SMT) threads. Providing a ThreadCount above to the maximum number of threads will have no effect, the maximum will be used instead. The heavyweight_hardware_concurrency() is similar to hardware_concurrency(), except that only one thread per hardware *core* will be used. When LLVM_ENABLE_THREADS is OFF, the threading APIs will always return 1, to ensure any caller loops will be exercised at least once. Differential Revision: https://reviews.llvm.org/D71775
2020-02-14 04:49:57 +01:00
Groups.clear();
Groups.push_back(NewG);
break;
[Support] On Windows, ensure hardware_concurrency() extends to all CPU sockets and all NUMA groups The goal of this patch is to maximize CPU utilization on multi-socket or high core count systems, so that parallel computations such as LLD/ThinLTO can use all hardware threads in the system. Before this patch, on Windows, a maximum of 64 hardware threads could be used at most, in some cases dispatched only on one CPU socket. == Background == Windows doesn't have a flat cpu_set_t like Linux. Instead, it projects hardware CPUs (or NUMA nodes) to applications through a concept of "processor groups". A "processor" is the smallest unit of execution on a CPU, that is, an hyper-thread if SMT is active; a core otherwise. There's a limit of 32-bit processors on older 32-bit versions of Windows, which later was raised to 64-processors with 64-bit versions of Windows. This limit comes from the affinity mask, which historically is represented by the sizeof(void*). Consequently, the concept of "processor groups" was introduced for dealing with systems with more than 64 hyper-threads. By default, the Windows OS assigns only one "processor group" to each starting application, in a round-robin manner. If the application wants to use more processors, it needs to programmatically enable it, by assigning threads to other "processor groups". This also means that affinity cannot cross "processor group" boundaries; one can only specify a "preferred" group on start-up, but the application is free to allocate more groups if it wants to. This creates a peculiar situation, where newer CPUs like the AMD EPYC 7702P (64-cores, 128-hyperthreads) are projected by the OS as two (2) "processor groups". This means that by default, an application can only use half of the cores. This situation could only get worse in the years to come, as dies with more cores will appear on the market. == The problem == The heavyweight_hardware_concurrency() API was introduced so that only *one hardware thread per core* was used. Once that API returns, that original intention is lost, only the number of threads is retained. Consider a situation, on Windows, where the system has 2 CPU sockets, 18 cores each, each core having 2 hyper-threads, for a total of 72 hyper-threads. Both heavyweight_hardware_concurrency() and hardware_concurrency() currently return 36, because on Windows they are simply wrappers over std::thread::hardware_concurrency() -- which can only return processors from the current "processor group". == The changes in this patch == To solve this situation, we capture (and retain) the initial intention until the point of usage, through a new ThreadPoolStrategy class. The number of threads to use is deferred as late as possible, until the moment where the std::threads are created (ThreadPool in the case of ThinLTO). When using hardware_concurrency(), setting ThreadCount to 0 now means to use all the possible hardware CPU (SMT) threads. Providing a ThreadCount above to the maximum number of threads will have no effect, the maximum will be used instead. The heavyweight_hardware_concurrency() is similar to hardware_concurrency(), except that only one thread per hardware *core* will be used. When LLVM_ENABLE_THREADS is OFF, the threading APIs will always return 1, to ensure any caller loops will be exercised at least once. Differential Revision: https://reviews.llvm.org/D71775
2020-02-14 04:49:57 +01:00
}
}
return std::vector<ProcessorGroup>(Groups.begin(), Groups.end());
};
static auto Groups = computeGroups();
return ArrayRef<ProcessorGroup>(Groups);
}
template <typename R, typename UnaryPredicate>
static unsigned aggregate(R &&Range, UnaryPredicate P) {
unsigned I{};
for (const auto &It : Range)
I += P(It);
return I;
}
// for sys::getHostNumPhysicalCores
int computeHostNumPhysicalCores() {
static unsigned Cores =
aggregate(getProcessorGroups(), [](const ProcessorGroup &G) {
return G.UsableThreads / G.ThreadsPerCore;
});
return Cores;
}
int computeHostNumHardwareThreads() {
static unsigned Threads =
aggregate(getProcessorGroups(),
[](const ProcessorGroup &G) { return G.UsableThreads; });
return Threads;
}
// Finds the proper CPU socket where a thread number should go. Returns 'None'
// if the thread shall remain on the actual CPU socket.
Optional<unsigned>
llvm::ThreadPoolStrategy::compute_cpu_socket(unsigned ThreadPoolNum) const {
[Support] On Windows, ensure hardware_concurrency() extends to all CPU sockets and all NUMA groups The goal of this patch is to maximize CPU utilization on multi-socket or high core count systems, so that parallel computations such as LLD/ThinLTO can use all hardware threads in the system. Before this patch, on Windows, a maximum of 64 hardware threads could be used at most, in some cases dispatched only on one CPU socket. == Background == Windows doesn't have a flat cpu_set_t like Linux. Instead, it projects hardware CPUs (or NUMA nodes) to applications through a concept of "processor groups". A "processor" is the smallest unit of execution on a CPU, that is, an hyper-thread if SMT is active; a core otherwise. There's a limit of 32-bit processors on older 32-bit versions of Windows, which later was raised to 64-processors with 64-bit versions of Windows. This limit comes from the affinity mask, which historically is represented by the sizeof(void*). Consequently, the concept of "processor groups" was introduced for dealing with systems with more than 64 hyper-threads. By default, the Windows OS assigns only one "processor group" to each starting application, in a round-robin manner. If the application wants to use more processors, it needs to programmatically enable it, by assigning threads to other "processor groups". This also means that affinity cannot cross "processor group" boundaries; one can only specify a "preferred" group on start-up, but the application is free to allocate more groups if it wants to. This creates a peculiar situation, where newer CPUs like the AMD EPYC 7702P (64-cores, 128-hyperthreads) are projected by the OS as two (2) "processor groups". This means that by default, an application can only use half of the cores. This situation could only get worse in the years to come, as dies with more cores will appear on the market. == The problem == The heavyweight_hardware_concurrency() API was introduced so that only *one hardware thread per core* was used. Once that API returns, that original intention is lost, only the number of threads is retained. Consider a situation, on Windows, where the system has 2 CPU sockets, 18 cores each, each core having 2 hyper-threads, for a total of 72 hyper-threads. Both heavyweight_hardware_concurrency() and hardware_concurrency() currently return 36, because on Windows they are simply wrappers over std::thread::hardware_concurrency() -- which can only return processors from the current "processor group". == The changes in this patch == To solve this situation, we capture (and retain) the initial intention until the point of usage, through a new ThreadPoolStrategy class. The number of threads to use is deferred as late as possible, until the moment where the std::threads are created (ThreadPool in the case of ThinLTO). When using hardware_concurrency(), setting ThreadCount to 0 now means to use all the possible hardware CPU (SMT) threads. Providing a ThreadCount above to the maximum number of threads will have no effect, the maximum will be used instead. The heavyweight_hardware_concurrency() is similar to hardware_concurrency(), except that only one thread per hardware *core* will be used. When LLVM_ENABLE_THREADS is OFF, the threading APIs will always return 1, to ensure any caller loops will be exercised at least once. Differential Revision: https://reviews.llvm.org/D71775
2020-02-14 04:49:57 +01:00
ArrayRef<ProcessorGroup> Groups = getProcessorGroups();
// Only one CPU socket in the system or process affinity was set, no need to
// move the thread(s) to another CPU socket.
if (Groups.size() <= 1)
return None;
// We ask for less threads than there are hardware threads per CPU socket, no
// need to dispatch threads to other CPU sockets.
unsigned MaxThreadsPerSocket =
UseHyperThreads ? Groups[0].UsableThreads : Groups[0].useableCores();
if (compute_thread_count() <= MaxThreadsPerSocket)
return None;
[Support] On Windows, ensure hardware_concurrency() extends to all CPU sockets and all NUMA groups The goal of this patch is to maximize CPU utilization on multi-socket or high core count systems, so that parallel computations such as LLD/ThinLTO can use all hardware threads in the system. Before this patch, on Windows, a maximum of 64 hardware threads could be used at most, in some cases dispatched only on one CPU socket. == Background == Windows doesn't have a flat cpu_set_t like Linux. Instead, it projects hardware CPUs (or NUMA nodes) to applications through a concept of "processor groups". A "processor" is the smallest unit of execution on a CPU, that is, an hyper-thread if SMT is active; a core otherwise. There's a limit of 32-bit processors on older 32-bit versions of Windows, which later was raised to 64-processors with 64-bit versions of Windows. This limit comes from the affinity mask, which historically is represented by the sizeof(void*). Consequently, the concept of "processor groups" was introduced for dealing with systems with more than 64 hyper-threads. By default, the Windows OS assigns only one "processor group" to each starting application, in a round-robin manner. If the application wants to use more processors, it needs to programmatically enable it, by assigning threads to other "processor groups". This also means that affinity cannot cross "processor group" boundaries; one can only specify a "preferred" group on start-up, but the application is free to allocate more groups if it wants to. This creates a peculiar situation, where newer CPUs like the AMD EPYC 7702P (64-cores, 128-hyperthreads) are projected by the OS as two (2) "processor groups". This means that by default, an application can only use half of the cores. This situation could only get worse in the years to come, as dies with more cores will appear on the market. == The problem == The heavyweight_hardware_concurrency() API was introduced so that only *one hardware thread per core* was used. Once that API returns, that original intention is lost, only the number of threads is retained. Consider a situation, on Windows, where the system has 2 CPU sockets, 18 cores each, each core having 2 hyper-threads, for a total of 72 hyper-threads. Both heavyweight_hardware_concurrency() and hardware_concurrency() currently return 36, because on Windows they are simply wrappers over std::thread::hardware_concurrency() -- which can only return processors from the current "processor group". == The changes in this patch == To solve this situation, we capture (and retain) the initial intention until the point of usage, through a new ThreadPoolStrategy class. The number of threads to use is deferred as late as possible, until the moment where the std::threads are created (ThreadPool in the case of ThinLTO). When using hardware_concurrency(), setting ThreadCount to 0 now means to use all the possible hardware CPU (SMT) threads. Providing a ThreadCount above to the maximum number of threads will have no effect, the maximum will be used instead. The heavyweight_hardware_concurrency() is similar to hardware_concurrency(), except that only one thread per hardware *core* will be used. When LLVM_ENABLE_THREADS is OFF, the threading APIs will always return 1, to ensure any caller loops will be exercised at least once. Differential Revision: https://reviews.llvm.org/D71775
2020-02-14 04:49:57 +01:00
assert(ThreadPoolNum < compute_thread_count() &&
"The thread index is not within thread strategy's range!");
// Assumes the same number of hardware threads per CPU socket.
return (ThreadPoolNum * Groups.size()) / compute_thread_count();
}
// Assign the current thread to a more appropriate CPU socket or CPU group
void llvm::ThreadPoolStrategy::apply_thread_strategy(
unsigned ThreadPoolNum) const {
Optional<unsigned> Socket = compute_cpu_socket(ThreadPoolNum);
if (!Socket)
return;
ArrayRef<ProcessorGroup> Groups = getProcessorGroups();
GROUP_AFFINITY Affinity{};
Affinity.Group = Groups[*Socket].ID;
Affinity.Mask = Groups[*Socket].Affinity;
SetThreadGroupAffinity(GetCurrentThread(), &Affinity, nullptr);
[Support] On Windows, ensure hardware_concurrency() extends to all CPU sockets and all NUMA groups The goal of this patch is to maximize CPU utilization on multi-socket or high core count systems, so that parallel computations such as LLD/ThinLTO can use all hardware threads in the system. Before this patch, on Windows, a maximum of 64 hardware threads could be used at most, in some cases dispatched only on one CPU socket. == Background == Windows doesn't have a flat cpu_set_t like Linux. Instead, it projects hardware CPUs (or NUMA nodes) to applications through a concept of "processor groups". A "processor" is the smallest unit of execution on a CPU, that is, an hyper-thread if SMT is active; a core otherwise. There's a limit of 32-bit processors on older 32-bit versions of Windows, which later was raised to 64-processors with 64-bit versions of Windows. This limit comes from the affinity mask, which historically is represented by the sizeof(void*). Consequently, the concept of "processor groups" was introduced for dealing with systems with more than 64 hyper-threads. By default, the Windows OS assigns only one "processor group" to each starting application, in a round-robin manner. If the application wants to use more processors, it needs to programmatically enable it, by assigning threads to other "processor groups". This also means that affinity cannot cross "processor group" boundaries; one can only specify a "preferred" group on start-up, but the application is free to allocate more groups if it wants to. This creates a peculiar situation, where newer CPUs like the AMD EPYC 7702P (64-cores, 128-hyperthreads) are projected by the OS as two (2) "processor groups". This means that by default, an application can only use half of the cores. This situation could only get worse in the years to come, as dies with more cores will appear on the market. == The problem == The heavyweight_hardware_concurrency() API was introduced so that only *one hardware thread per core* was used. Once that API returns, that original intention is lost, only the number of threads is retained. Consider a situation, on Windows, where the system has 2 CPU sockets, 18 cores each, each core having 2 hyper-threads, for a total of 72 hyper-threads. Both heavyweight_hardware_concurrency() and hardware_concurrency() currently return 36, because on Windows they are simply wrappers over std::thread::hardware_concurrency() -- which can only return processors from the current "processor group". == The changes in this patch == To solve this situation, we capture (and retain) the initial intention until the point of usage, through a new ThreadPoolStrategy class. The number of threads to use is deferred as late as possible, until the moment where the std::threads are created (ThreadPool in the case of ThinLTO). When using hardware_concurrency(), setting ThreadCount to 0 now means to use all the possible hardware CPU (SMT) threads. Providing a ThreadCount above to the maximum number of threads will have no effect, the maximum will be used instead. The heavyweight_hardware_concurrency() is similar to hardware_concurrency(), except that only one thread per hardware *core* will be used. When LLVM_ENABLE_THREADS is OFF, the threading APIs will always return 1, to ensure any caller loops will be exercised at least once. Differential Revision: https://reviews.llvm.org/D71775
2020-02-14 04:49:57 +01:00
}
llvm::BitVector llvm::get_thread_affinity_mask() {
GROUP_AFFINITY Affinity{};
GetThreadGroupAffinity(GetCurrentThread(), &Affinity);
static unsigned All =
aggregate(getProcessorGroups(),
[](const ProcessorGroup &G) { return G.AllThreads; });
unsigned StartOffset =
aggregate(getProcessorGroups(), [&](const ProcessorGroup &G) {
return G.ID < Affinity.Group ? G.AllThreads : 0;
});
llvm::BitVector V;
V.resize(All);
for (unsigned I = 0; I < sizeof(KAFFINITY) * 8; ++I) {
if ((Affinity.Mask >> I) & 1)
V.set(StartOffset + I);
}
return V;
}
unsigned llvm::get_cpus() { return getProcessorGroups().size(); }