
[ThinLTO] Allow usage of all hardware threads in the system

Before this patch, it was not possible to spread the ThinLTO threads across all SMT/CMT threads in the system. Only one thread per core was used, as dictated by the use of llvm::heavyweight_hardware_concurrency() in the ThinLTO code. Any number passed to the LLD flag /opt:lldltojobs=..., or to any other ThinLTO-specific flag, was interpreted in the context of llvm::heavyweight_hardware_concurrency(), that is, with SMT disabled.
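For context, a minimal sketch (not part of this patch) contrasting the two existing strategy helpers when sizing an llvm::ThreadPool; the thread counts in the comments assume a hypothetical 8-core/16-thread machine:

#include "llvm/Support/ThreadPool.h"
#include "llvm/Support/Threading.h"

void sizePools() {
  // One pool thread per hardware thread, SMT included: ~16 threads here.
  llvm::ThreadPool SMTPool(llvm::hardware_concurrency());
  // One pool thread per physical core, SMT excluded: ~8 threads here.
  llvm::ThreadPool PerCorePool(llvm::heavyweight_hardware_concurrency());
}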

One can now say in LLD:
/opt:lldltojobs=0 -- Use one std::thread per hardware core in the system (no SMT). This is the default when the flag is not specified.
/opt:lldltojobs=N -- Limit usage to N threads, overriding the heavyweight_hardware_concurrency() policy.
/opt:lldltojobs=all -- Use all hardware threads in the system. Equivalent to /opt:lldltojobs=$(nproc) on Linux and /opt:lldltojobs=%NUMBER_OF_PROCESSORS% on Windows. When an affinity mask is set for the process, threads are created only for the cores selected by the mask.

When N > (number of hardware threads in the system), the threads in the thread pool are distributed evenly across all CPU sockets (tested only on Windows).
When N <= (number of hardware threads on a CPU socket), the threads stay on the CPU socket where the process started (Windows only).
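Internally, the flag value maps onto a ThreadPoolStrategy through the helpers added by this patch; a rough sketch of the driver-side wiring (the wrapper function is illustrative, not LLD's actual code):

#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Threading.h"

llvm::ThreadPoolStrategy strategyForJobs(llvm::StringRef Jobs) {
  // "" or "0" -> one thread per core (the heavyweight default)
  // "N"       -> exactly N threads, SMT allowed
  // "all"     -> one thread per hardware thread, honoring the affinity mask
  return llvm::heavyweight_hardware_concurrency(Jobs);
}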

Differential Revision: https://reviews.llvm.org/D75153
Alexandre Ganea 2020-03-27 10:20:39 -04:00
parent 153a6c2dbd
commit 61ed3dc5bf
8 changed files with 106 additions and 48 deletions


@@ -228,7 +228,7 @@ using ThinBackend = std::function<std::unique_ptr<ThinBackendProc>(
 /// This ThinBackend runs the individual backend jobs in-process.
 /// The default value means to use one job per hardware core (not hyper-thread).
-ThinBackend createInProcessThinBackend(unsigned ParallelismLevel = 0);
+ThinBackend createInProcessThinBackend(ThreadPoolStrategy Parallelism);
 
 /// This ThinBackend writes individual module indexes to files, instead of
 /// running the individual backend jobs. This backend is for distributed builds


@@ -166,8 +166,20 @@ void llvm_execute_on_thread_async(
   /// sockets. \p ThreadPoolNum represents a number bounded by [0,
   /// compute_thread_count()).
   void apply_thread_strategy(unsigned ThreadPoolNum) const;
+
+  /// Finds the CPU socket where a thread should go. Returns 'None' if the
+  /// thread shall remain on the actual CPU socket.
+  Optional<unsigned> compute_cpu_socket(unsigned ThreadPoolNum) const;
 };
 
+/// Build a strategy from a number of threads as a string provided in \p Num.
+/// When Num is above the max number of threads specified by the \p Default
+/// strategy, we attempt to equally allocate the threads on all CPU sockets.
+/// "0" or an empty string will return the \p Default strategy.
+/// "all" for using all hardware threads.
+Optional<ThreadPoolStrategy>
+get_threadpool_strategy(StringRef Num, ThreadPoolStrategy Default = {});
+
 /// Returns a thread strategy for tasks requiring significant memory or other
 /// resources. To be used for workloads where hardware_concurrency() proves to
 /// be less efficient. Avoid this strategy if doing lots of I/O. Currently
@@ -182,6 +194,18 @@ void llvm_execute_on_thread_async(
   return S;
 }
 
+/// Like heavyweight_hardware_concurrency() above, but builds a strategy
+/// based on the rules described for get_threadpool_strategy().
+/// If \p Num is invalid, returns a default strategy where one thread per
+/// hardware core is used.
+inline ThreadPoolStrategy heavyweight_hardware_concurrency(StringRef Num) {
+  Optional<ThreadPoolStrategy> S =
+      get_threadpool_strategy(Num, heavyweight_hardware_concurrency());
+  if (S)
+    return *S;
+  return heavyweight_hardware_concurrency();
+}
+
 /// Returns a default thread strategy where all available hardware resources
 /// are to be used, except for those initially excluded by an affinity mask.
 /// This function takes affinity into consideration. Returns 1 when LLVM is
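Since get_threadpool_strategy() returns None for malformed input, a front-end can validate a user-supplied value before falling back on the lenient overload above; a minimal sketch (the error call is a stand-in for the host tool's own reporting):

if (!llvm::get_threadpool_strategy(Jobs))
  llvm::report_fatal_error("invalid parallelism level: " + Jobs);
llvm::ThreadPoolStrategy S = llvm::heavyweight_hardware_concurrency(Jobs);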


@@ -477,7 +477,8 @@ LTO::RegularLTOState::RegularLTOState(unsigned ParallelCodeGenParallelismLevel,
 LTO::ThinLTOState::ThinLTOState(ThinBackend Backend)
     : Backend(Backend), CombinedIndex(/*HaveGVs*/ false) {
   if (!Backend)
-    this->Backend = createInProcessThinBackend();
+    this->Backend =
+        createInProcessThinBackend(llvm::heavyweight_hardware_concurrency());
 }
 
 LTO::LTO(Config Conf, ThinBackend Backend,
LTO::LTO(Config Conf, ThinBackend Backend,
@@ -1090,13 +1091,12 @@ class InProcessThinBackend : public ThinBackendProc {
 public:
   InProcessThinBackend(
       const Config &Conf, ModuleSummaryIndex &CombinedIndex,
-      unsigned ThinLTOParallelismLevel,
+      ThreadPoolStrategy ThinLTOParallelism,
       const StringMap<GVSummaryMapTy> &ModuleToDefinedGVSummaries,
       AddStreamFn AddStream, NativeObjectCache Cache)
       : ThinBackendProc(Conf, CombinedIndex, ModuleToDefinedGVSummaries),
-        BackendThreadPool(
-            heavyweight_hardware_concurrency(ThinLTOParallelismLevel)),
-        AddStream(std::move(AddStream)), Cache(std::move(Cache)) {
+        BackendThreadPool(ThinLTOParallelism), AddStream(std::move(AddStream)),
+        Cache(std::move(Cache)) {
     for (auto &Name : CombinedIndex.cfiFunctionDefs())
       CfiFunctionDefs.insert(
           GlobalValue::getGUID(GlobalValue::dropLLVMManglingEscape(Name)));
@@ -1192,13 +1192,13 @@ public:
 };
 } // end anonymous namespace
 
-ThinBackend lto::createInProcessThinBackend(unsigned ParallelismLevel) {
+ThinBackend lto::createInProcessThinBackend(ThreadPoolStrategy Parallelism) {
   return [=](const Config &Conf, ModuleSummaryIndex &CombinedIndex,
              const StringMap<GVSummaryMapTy> &ModuleToDefinedGVSummaries,
              AddStreamFn AddStream, NativeObjectCache Cache) {
     return std::make_unique<InProcessThinBackend>(
-        Conf, CombinedIndex, ParallelismLevel, ModuleToDefinedGVSummaries,
-        AddStream, Cache);
+        Conf, CombinedIndex, Parallelism, ModuleToDefinedGVSummaries, AddStream,
+        Cache);
   };
 }
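With the factory's new signature, a caller can choose the SMT policy explicitly; a hypothetical call site, not taken from this patch:

// Run the in-process ThinLTO backend on every hardware thread.
ThinBackend Backend = lto::createInProcessThinBackend(
    llvm::heavyweight_hardware_concurrency("all"));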


@@ -84,16 +84,34 @@ void llvm::llvm_execute_on_thread_async(
 int computeHostNumHardwareThreads();
 
 unsigned llvm::ThreadPoolStrategy::compute_thread_count() const {
+  if (ThreadsRequested > 0)
+    return ThreadsRequested;
+
   int MaxThreadCount = UseHyperThreads ? computeHostNumHardwareThreads()
                                        : sys::getHostNumPhysicalCores();
   if (MaxThreadCount <= 0)
     MaxThreadCount = 1;
-
-  // No need to create more threads than there are hardware threads, it would
-  // uselessly induce more context-switching and cache eviction.
-  if (!ThreadsRequested || ThreadsRequested > (unsigned)MaxThreadCount)
-    return MaxThreadCount;
-  return ThreadsRequested;
+  return MaxThreadCount;
 }
 
+Optional<ThreadPoolStrategy>
+llvm::get_threadpool_strategy(StringRef Num, ThreadPoolStrategy Default) {
+  if (Num == "all")
+    return llvm::hardware_concurrency();
+  if (Num.empty())
+    return Default;
+  unsigned V;
+  if (Num.getAsInteger(10, V))
+    return None; // malformed 'Num' value
+  if (V == 0)
+    return Default;
+
+  // Do not take the Default into account. This effectively disables
+  // heavyweight_hardware_concurrency() if the user asks for any number of
+  // threads on the cmd-line.
+  ThreadPoolStrategy S = llvm::hardware_concurrency();
+  S.ThreadsRequested = V;
+  return S;
+}
+
 namespace {
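To make the parsing rules concrete, these are the mappings the implementation above would produce on a hypothetical 2-socket machine with 16 cores and 32 hardware threads, with Default = heavyweight_hardware_concurrency(), i.e. 16 threads:

get_threadpool_strategy("", Default)    // -> Default: 16 threads, no SMT
get_threadpool_strategy("0", Default)   // -> Default: 16 threads, no SMT
get_threadpool_strategy("8", Default)   // -> 8 threads, SMT allowed
get_threadpool_strategy("64", Default)  // -> 64 threads, spread across both sockets
get_threadpool_strategy("all", Default) // -> hardware_concurrency(): 32 threads
get_threadpool_strategy("foo", Default) // -> None (malformed input)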


@@ -273,7 +273,7 @@ SetThreadPriorityResult llvm::set_thread_priority(ThreadPriority Priority) {
 int computeHostNumHardwareThreads() {
 #if defined(HAVE_SCHED_GETAFFINITY) && defined(HAVE_CPU_COUNT)
   cpu_set_t Set;
-  if (sched_getaffinity(0, sizeof(Set), &Set))
+  if (sched_getaffinity(0, sizeof(Set), &Set) == 0)
     return CPU_COUNT(&Set);
 #endif
   // Guard against std::thread::hardware_concurrency() returning 0.


@@ -131,6 +131,10 @@ struct ProcessorGroup {
   unsigned UsableThreads;
   unsigned ThreadsPerCore;
   uint64_t Affinity;
+
+  unsigned useableCores() const {
+    return std::max(1U, UsableThreads / ThreadsPerCore);
+  }
 };
 
 template <typename F>
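For instance, a processor group with UsableThreads == 16 and ThreadsPerCore == 2 reports 8 usable cores, and the std::max keeps the result at 1 on degenerate topologies (hypothetical values):

// {UsableThreads = 16, ThreadsPerCore = 2} -> useableCores() == 8
// {UsableThreads = 1,  ThreadsPerCore = 2} -> useableCores() == 1 (clamped)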
@@ -232,33 +236,41 @@ int computeHostNumHardwareThreads() {
   return Threads;
 }
 
-// Assign the current thread to a more appropriate CPU socket or CPU group
-void llvm::ThreadPoolStrategy::apply_thread_strategy(
-    unsigned ThreadPoolNum) const {
+// Finds the proper CPU socket where a thread number should go. Returns 'None'
+// if the thread shall remain on the actual CPU socket.
+Optional<unsigned>
+llvm::ThreadPoolStrategy::compute_cpu_socket(unsigned ThreadPoolNum) const {
   ArrayRef<ProcessorGroup> Groups = getProcessorGroups();
+
+  // Only one CPU socket in the system or process affinity was set, no need to
+  // move the thread(s) to another CPU socket.
+  if (Groups.size() <= 1)
+    return None;
+
+  // We ask for less threads than there are hardware threads per CPU socket, no
+  // need to dispatch threads to other CPU sockets.
+  unsigned MaxThreadsPerSocket =
+      UseHyperThreads ? Groups[0].UsableThreads : Groups[0].useableCores();
+  if (compute_thread_count() <= MaxThreadsPerSocket)
+    return None;
 
   assert(ThreadPoolNum < compute_thread_count() &&
          "The thread index is not within thread strategy's range!");
 
-  // In this mode, the ThreadNumber represents the core number, not the
-  // hyper-thread number. Assumes all NUMA groups have the same amount of
-  // hyper-threads.
-  if (!UseHyperThreads)
-    ThreadPoolNum *= Groups[0].ThreadsPerCore;
-
-  unsigned ThreadRangeStart = 0;
-  for (unsigned I = 0; I < Groups.size(); ++I) {
-    const ProcessorGroup &G = Groups[I];
-    if (ThreadPoolNum >= ThreadRangeStart &&
-        ThreadPoolNum < ThreadRangeStart + G.UsableThreads) {
-      GROUP_AFFINITY Affinity{};
-      Affinity.Group = G.ID;
-      Affinity.Mask = G.Affinity;
-      SetThreadGroupAffinity(GetCurrentThread(), &Affinity, nullptr);
-    }
-    ThreadRangeStart += G.UsableThreads;
-  }
+  // Assumes the same number of hardware threads per CPU socket.
+  return (ThreadPoolNum * Groups.size()) / compute_thread_count();
+}
+
+// Assign the current thread to a more appropriate CPU socket or CPU group
+void llvm::ThreadPoolStrategy::apply_thread_strategy(
+    unsigned ThreadPoolNum) const {
+  Optional<unsigned> Socket = compute_cpu_socket(ThreadPoolNum);
+  if (!Socket)
+    return;
+
+  ArrayRef<ProcessorGroup> Groups = getProcessorGroups();
+  GROUP_AFFINITY Affinity{};
+  Affinity.Group = Groups[*Socket].ID;
+  Affinity.Mask = Groups[*Socket].Affinity;
+  SetThreadGroupAffinity(GetCurrentThread(), &Affinity, nullptr);
+}
 
 llvm::BitVector llvm::get_thread_affinity_mask() {
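To illustrate the round-robin dispatch above: with two processor groups and compute_thread_count() == 64, the expression (ThreadPoolNum * Groups.size()) / compute_thread_count() sends pool threads 0..31 to socket 0 and threads 32..63 to socket 1, assuming, as the code does, the same number of hardware threads per socket:

// Hypothetical 2-socket system, 64 threads requested:
// ThreadPoolNum = 10 -> (10 * 2) / 64 = 0 -> stays on socket 0
// ThreadPoolNum = 40 -> (40 * 2) / 64 = 1 -> moved to socket 1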


@@ -28,6 +28,7 @@
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/Path.h"
 #include "llvm/Support/TargetSelect.h"
+#include "llvm/Support/Threading.h"
 #include "llvm/Support/raw_ostream.h"
 #include <list>
 #include <map>
@@ -134,11 +135,9 @@ namespace options {
   };
   static OutputType TheOutputType = OT_NORMAL;
   static unsigned OptLevel = 2;
-  // Default parallelism of 0 used to indicate that user did not specify.
-  // Actual parallelism default value depends on implementation.
   // Currently only affects ThinLTO, where the default is the max cores in the
-  // system.
-  static unsigned Parallelism = 0;
+  // system. See llvm::get_threadpool_strategy() for acceptable values.
+  static std::string Parallelism;
   // Default regular LTO codegen parallelism (number of partitions).
   static unsigned ParallelCodeGenParallelismLevel = 1;
 #ifdef NDEBUG
#ifdef NDEBUG
@@ -272,8 +271,10 @@ namespace options {
       message(LDPL_FATAL, "Optimization level must be between 0 and 3");
     OptLevel = opt[1] - '0';
   } else if (opt.startswith("jobs=")) {
-    if (StringRef(opt_ + 5).getAsInteger(10, Parallelism))
-      message(LDPL_FATAL, "Invalid parallelism level: %s", opt_ + 5);
+    StringRef Num(opt_ + 5);
+    if (!get_threadpool_strategy(Num))
+      message(LDPL_FATAL, "Invalid parallelism level: %s", Num.data());
+    Parallelism = Num;
   } else if (opt.startswith("lto-partitions=")) {
     if (opt.substr(strlen("lto-partitions="))
             .getAsInteger(10, ParallelCodeGenParallelismLevel))
@@ -877,14 +878,15 @@ static std::unique_ptr<LTO> createLTO(IndexWriteCallback OnIndexWrite,
   Conf.PTO.LoopVectorization = options::OptLevel > 1;
   Conf.PTO.SLPVectorization = options::OptLevel > 1;
 
-  if (options::Parallelism)
-    Backend = createInProcessThinBackend(options::Parallelism);
   if (options::thinlto_index_only) {
     std::string OldPrefix, NewPrefix;
     getThinLTOOldAndNewPrefix(OldPrefix, NewPrefix);
     Backend = createWriteIndexesThinBackend(OldPrefix, NewPrefix,
                                             options::thinlto_emit_imports_files,
                                             LinkedObjectsFile, OnIndexWrite);
+  } else {
+    Backend = createInProcessThinBackend(
+        llvm::heavyweight_hardware_concurrency(options::Parallelism));
   }
 
   Conf.OverrideTriple = options::triple;


@@ -68,9 +68,10 @@ static cl::opt<bool>
                  "distributed backend case"));
 
 // Default to using all available threads in the system, but using only one
-// thread per core, as indicated by the usage of
-// heavyweight_hardware_concurrency() in the InProcessThinBackend constructor.
-static cl::opt<int> Threads("thinlto-threads", cl::init(0));
+// thread per core (no SMT).
+// Use -thinlto-threads=all to use hardware_concurrency() instead, which means
+// to use all hardware threads or cores in the system.
+static cl::opt<std::string> Threads("thinlto-threads");
 
 static cl::list<std::string> SymbolResolutions(
     "r",
@@ -286,7 +287,8 @@ static int run(int argc, char **argv) {
                                            /* LinkedObjectsFile */ nullptr,
                                            /* OnWrite */ {});
   else
-    Backend = createInProcessThinBackend(Threads);
+    Backend = createInProcessThinBackend(
+        llvm::heavyweight_hardware_concurrency(Threads));
 
   LTO Lto(std::move(Conf), std::move(Backend));
 
   bool HasErrors = false;
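For reference, illustrative invocations exercising the new values; the input files and symbol resolutions below are placeholders:

lld-link main.obj util.obj /out:app.exe /opt:lldltojobs=all
llvm-lto2 run -thinlto-threads=all -o app.o -r main.bc,main,plx main.bc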