1
0
mirror of https://github.com/RPCS3/rpcs3.git synced 2024-11-22 02:32:36 +01:00

cell/scheduler: Manage thread placement depending on cpu hardware

This commit is contained in:
kd-11 2017-10-21 14:21:37 +03:00
parent 54fbde0de1
commit cbc8bf01a1
13 changed files with 185 additions and 40 deletions

View File

@ -6,7 +6,9 @@
#include "Emu/Cell/lv2/sys_mmapper.h"
#include "Emu/Cell/lv2/sys_event.h"
#include "Thread.h"
#include "sysinfo.h"
#include <typeinfo>
#include <thread>
#ifdef _WIN32
#include <Windows.h>
@ -1547,6 +1549,8 @@ thread_local DECLARE(thread_ctrl::g_tls_this_thread) = nullptr;
extern thread_local std::string(*g_tls_log_prefix)();
DECLARE(thread_ctrl::g_native_core_layout) { native_core_arrangement::undefined };
void thread_ctrl::start(const std::shared_ptr<thread_ctrl>& ctrl, task_stack task)
{
#ifdef _WIN32
@ -1853,6 +1857,89 @@ void thread_ctrl::test()
}
}
// Probe the host CPU once and record its core arrangement in g_native_core_layout.
// Safe to call from multiple threads; only the first caller performs detection.
void thread_ctrl::detect_cpu_layout()
{
	// Atomically flip undefined -> generic; if that fails, detection already ran
	if (!g_native_core_layout.compare_and_swap_test(native_core_arrangement::undefined, native_core_arrangement::generic))
		return;

	const auto system_id = utils::get_system_info();
	const auto contains = [&](const char* token)
	{
		return system_id.find(token) != std::string::npos;
	};

	if (contains("Ryzen"))
	{
		// AMD Zen family: cores are grouped into CCX units
		g_native_core_layout.store(native_core_arrangement::amd_ccx);
	}
	else if (contains("i3") || contains("i7"))
	{
		// Intel parts that expose Hyper-Threading logical cores
		g_native_core_layout.store(native_core_arrangement::intel_ht);
	}
}
// Compute a 16-bit core affinity mask for the given thread class, based on the
// detected CPU layout. Bit N set means logical core N may run the thread.
// Falls back to "all cores" when the core count is unknown or the layout is generic.
u16 thread_ctrl::get_affinity_mask(thread_class group)
{
	// Make sure the layout has been probed before consulting it
	detect_cpu_layout();

	const auto thread_count = std::thread::hardware_concurrency();

	if (thread_count == 0)
	{
		// Core count could not be determined; do not restrict placement
		return UINT16_MAX;
	}

	// One bit per available hardware thread, capped at 16 bits
	const u16 all_cores_mask = thread_count < 16 ? (u16)(~(UINT16_MAX << thread_count)) : UINT16_MAX;

	switch (g_native_core_layout)
	{
	case native_core_arrangement::amd_ccx:
	{
		u16 primary_ccx_unit_mask;

		if (thread_count >= 16)
		{
			// Threadripper, R7
			// Assign threads 8-16
			// It appears some windows code is bound to lower core addresses, binding 8-16 is a lot faster than 0-7
			primary_ccx_unit_mask = 0b1111111100000000;
		}
		else
		{
			// R5 & R3 don't seem to improve performance no matter how these are shuffled (including 1600)
			primary_ccx_unit_mask = 0b11111111 & all_cores_mask;
		}

		// Emulated-core threads stay on one CCX; everything else may roam
		if (group == thread_class::rsx || group == thread_class::ppu || group == thread_class::spu)
		{
			return primary_ccx_unit_mask;
		}

		return all_cores_mask;
	}
	case native_core_arrangement::intel_ht:
	{
		if (thread_count <= 4)
		{
			// i3 or worse: separate RSX/PPU from SPU onto sibling logical cores
			if (group == thread_class::rsx || group == thread_class::ppu)
			{
				return (0b0101 & all_cores_mask);
			}

			if (group == thread_class::spu)
			{
				return (0b1010 & all_cores_mask);
			}
		}

		return all_cores_mask;
	}
	default:
	case native_core_arrangement::generic:
	{
		return all_cores_mask;
	}
	}
}
void thread_ctrl::set_native_priority(int priority)
{
#ifdef _WIN32
@ -1886,24 +1973,31 @@ void thread_ctrl::set_native_priority(int priority)
#endif
}
void thread_ctrl::set_ideal_processor_core(int core)
void thread_ctrl::set_thread_affinity_mask(u16 mask)
{
#ifdef _WIN32
HANDLE _this_thread = GetCurrentThread();
SetThreadIdealProcessor(_this_thread, core);
SetThreadAffinityMask(_this_thread, (DWORD_PTR)mask);
#elif __APPLE__
thread_affinity_policy_data_t policy = { static_cast<integer_t>(core) };
thread_affinity_policy_data_t policy = { static_cast<integer_t>(mask) };
thread_port_t mach_thread = pthread_mach_thread_np(pthread_self());
thread_policy_set(mach_thread, THREAD_AFFINITY_POLICY, (thread_policy_t)&policy, 1);
#elif defined(__linux__) || defined(__DragonFly__) || defined(__FreeBSD__)
cpu_set_t cs;
CPU_ZERO(&cs);
CPU_SET(core, &cs);
for (u32 core = 0; core < 16u; ++core)
{
if ((u32)mask & (1u << core))
{
CPU_SET(core, &cs);
}
}
pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cs);
#endif
}
// Default constructor: no member initialization required here
named_thread::named_thread()
{
}

View File

@ -16,6 +16,23 @@
// Will report exception and call std::abort() if put in catch(...)
[[noreturn]] void catch_all_exceptions();
// Hardware core layout, as detected by thread_ctrl::detect_cpu_layout()
enum class native_core_arrangement : u32
{
	undefined, // Not probed yet; resolved on first detect_cpu_layout() call
	generic,   // No special placement rules apply
	intel_ht,  // Intel CPU with Hyper-Threading (paired logical cores)
	amd_ccx    // AMD Ryzen CPU (cores grouped into CCX units)
};

// Thread categories used to select an affinity set in get_affinity_mask()
enum class thread_class : u32
{
	general,
	rsx,
	spu,
	ppu
};
// Simple list of void() functors
class task_stack
{
@ -91,6 +108,9 @@ class thread_ctrl final
// Current thread
static thread_local thread_ctrl* g_tls_this_thread;
// Target cpu core layout
static atomic_t<native_core_arrangement> g_native_core_layout;
// Self pointer
std::shared_ptr<thread_ctrl> m_self;
@ -234,8 +254,17 @@ public:
thread_ctrl::start(out, std::forward<F>(func));
}
// Detect layout
static void detect_cpu_layout();
// Returns the core affinity mask appropriate for the given thread class
static u16 get_affinity_mask(thread_class group);
// Sets the native thread priority
static void set_native_priority(int priority);
static void set_ideal_processor_core(int core);
// Sets the preferred affinity mask for this thread
static void set_thread_affinity_mask(u16 mask);
};
class named_thread

View File

@ -3,6 +3,7 @@
#include "Emu/Memory/vm.h"
#include "Emu/Cell/SPUThread.h"
#include "Emu/Cell/lv2/sys_sync.h"
#include "Emu/System.h"
#include "MFC.h"
const bool s_use_rtm = utils::has_rtm();
@ -375,3 +376,12 @@ void mfc_thread::add_spu(spu_ptr _spu)
run();
}
// Apply thread placement for the MFC thread when the custom scheduler is active.
void mfc_thread::on_spawn()
{
	if (!g_cfg.core.thread_scheduler_enabled)
	{
		return;
	}

	// Bind to same set with the SPUs
	thread_ctrl::set_thread_affinity_mask(thread_ctrl::get_affinity_mask(thread_class::spu));
}

View File

@ -113,4 +113,6 @@ public:
virtual void cpu_task() override;
virtual void add_spu(spu_ptr _spu);
virtual void on_spawn() override;
};

View File

@ -334,6 +334,15 @@ extern void ppu_breakpoint(u32 addr)
}
}
// Apply thread placement for PPU threads when the custom scheduler is active.
void ppu_thread::on_spawn()
{
	if (!g_cfg.core.thread_scheduler_enabled)
	{
		return;
	}

	// Bind to primary set
	thread_ctrl::set_thread_affinity_mask(thread_ctrl::get_affinity_mask(thread_class::ppu));
}
void ppu_thread::on_init(const std::shared_ptr<void>& _this)
{
if (!stack_addr)

View File

@ -30,6 +30,7 @@ public:
static const u32 id_step = 1;
static const u32 id_count = 2048;
virtual void on_spawn() override;
virtual void on_init(const std::shared_ptr<void>&) override;
virtual std::string get_name() const override;
virtual std::string dump() const override;

View File

@ -134,8 +134,8 @@ namespace spu
{
if (timeout_ms > 0)
{
const auto timeout = timeout_ms * 1000u; //convert to microseconds
const auto start = get_system_time();
const u64 timeout = timeout_ms * 1000u; //convert to microseconds
const u64 start = get_system_time();
auto remaining = timeout;
while (atomic_instruction_table[pc_offset].load(std::memory_order_consume) >= max_concurrent_instructions)
@ -143,7 +143,7 @@ namespace spu
if (remaining >= native_jiffy_duration_us)
std::this_thread::sleep_for(1ms);
else
std::this_thread::yield();
busy_wait(remaining);
const auto now = get_system_time();
const auto elapsed = now - start;
@ -155,7 +155,8 @@ namespace spu
else
{
//Slight pause if function is overburdened
thread_ctrl::wait_for(100);
const auto count = atomic_instruction_table[pc_offset].load(std::memory_order_consume) * 100ull;
busy_wait(count);
}
}
@ -278,25 +279,15 @@ spu_imm_table_t::spu_imm_table_t()
// Configure affinity and priority for a freshly spawned SPU thread.
// NOTE(review): this span was diff residue — the removed bind_spu_cores body was
// interleaved with its replacement; reconstructed the post-commit function.
void SPUThread::on_spawn()
{
	if (g_cfg.core.thread_scheduler_enabled)
	{
		// Bind to the SPU affinity set chosen for this CPU layout
		thread_ctrl::set_thread_affinity_mask(thread_ctrl::get_affinity_mask(thread_class::spu));
	}

	if (g_cfg.core.lower_spu_priority)
	{
		thread_ctrl::set_native_priority(-1);
	}

	g_num_spu_threads++;
}
void SPUThread::on_init(const std::shared_ptr<void>& _this)

View File

@ -398,7 +398,11 @@ namespace rsx
// Raise priority above other threads
thread_ctrl::set_native_priority(1);
thread_ctrl::set_ideal_processor_core(0);
if (g_cfg.core.thread_scheduler_enabled)
{
thread_ctrl::set_thread_affinity_mask(thread_ctrl::get_affinity_mask(thread_class::rsx));
}
// Round to nearest to deal with forward/reverse scaling
fesetround(FE_TONEAREST);

View File

@ -286,8 +286,13 @@ struct cfg_root : cfg::node
cfg::_bool llvm_logs{this, "Save LLVM logs"};
cfg::string llvm_cpu{this, "Use LLVM CPU"};
#ifdef _WIN32
cfg::_bool thread_scheduler_enabled{ this, "Enable thread scheduler", true };
#else
cfg::_bool thread_scheduler_enabled{ this, "Enable thread scheduler", false };
#endif
cfg::_enum<spu_decoder_type> spu_decoder{this, "SPU Decoder", spu_decoder_type::asmjit};
cfg::_bool bind_spu_cores{this, "Bind SPU threads to secondary cores"};
cfg::_bool lower_spu_priority{this, "Lower SPU thread priority"};
cfg::_bool spu_debug{this, "SPU Debug"};
cfg::_int<0, 16384> max_spu_immediate_write_size{this, "Maximum immediate DMA write size", 16384}; // Maximum size that an SPU thread can write directly without posting to MFC

View File

@ -26,7 +26,7 @@
},
"checkboxes": {
"hookStFunc": "Allows to hook some functions like 'memcpy' replacing them with high-level implementations. May do nothing or break things. Experimental.",
"bindSPUThreads": "If your CPU has SMT (Hyper-Threading) SPU threads will run on these logical cores instead.\nUsually faster on an i3, possibly slower or no difference on an i7 or Ryzen.",
"enableThreadScheduler": "Allows rpcs3 to manually schedule physical cores to run specific tasks on, instead of letting the OS handle it.\nVery useful on windows, especially for AMD Ryzen systems where it can give huge performance gains.",
"lowerSPUThrPrio": "Runs SPU threads with lower priority than PPU threads.\nUsually faster on an i3 or i5, possibly slower or no difference on an i7 or Ryzen.",
"spuLoopDetection": "Try to detect loop conditions in SPU kernels and use them as scheduling hints.\nImproves performance and reduces CPU usage.\nMay cause severe audio stuttering in rare cases."
},

View File

@ -30,7 +30,7 @@ public:
SPUDecoder,
LibLoadOptions,
HookStaticFuncs,
BindSPUThreads,
EnableThreadScheduler,
LowerSPUThreadPrio,
SPULoopDetection,
PreferredSPUThreads,
@ -183,16 +183,16 @@ private:
const QMap<SettingsType, cfg_location> SettingsLoc =
{
// Core Tab
{ PPUDecoder, { "Core", "PPU Decoder"}},
{ SPUDecoder, { "Core", "SPU Decoder"}},
{ LibLoadOptions, { "Core", "Lib Loader"}},
{ HookStaticFuncs, { "Core", "Hook static functions"}},
{ BindSPUThreads, { "Core", "Bind SPU threads to secondary cores"}},
{ LowerSPUThreadPrio, { "Core", "Lower SPU thread priority"}},
{ SPULoopDetection, { "Core", "SPU loop detection"}},
{ PreferredSPUThreads, { "Core", "Preferred SPU Threads"}},
{ PPUDebug, { "Core", "PPU Debug"}},
{ SPUDebug, { "Core", "SPU Debug"}},
{ PPUDecoder, { "Core", "PPU Decoder"}},
{ SPUDecoder, { "Core", "SPU Decoder"}},
{ LibLoadOptions, { "Core", "Lib Loader"}},
{ HookStaticFuncs, { "Core", "Hook static functions"}},
{ EnableThreadScheduler, { "Core", "Enable thread scheduler"}},
{ LowerSPUThreadPrio, { "Core", "Lower SPU thread priority"}},
{ SPULoopDetection, { "Core", "SPU loop detection"}},
{ PreferredSPUThreads, { "Core", "Preferred SPU Threads"}},
{ PPUDebug, { "Core", "PPU Debug"}},
{ SPUDebug, { "Core", "SPU Debug"}},
// Graphics Tab
{ Renderer, { "Video", "Renderer"}},

View File

@ -129,8 +129,8 @@ settings_dialog::settings_dialog(std::shared_ptr<gui_settings> guiSettings, std:
xemu_settings->EnhanceCheckBox(ui->hookStFunc, emu_settings::HookStaticFuncs);
SubscribeTooltip(ui->hookStFunc, json_cpu_cbs["hookStFunc"].toString());
xemu_settings->EnhanceCheckBox(ui->bindSPUThreads, emu_settings::BindSPUThreads);
SubscribeTooltip(ui->bindSPUThreads, json_cpu_cbs["bindSPUThreads"].toString());
xemu_settings->EnhanceCheckBox(ui->enableScheduler, emu_settings::EnableThreadScheduler);
SubscribeTooltip(ui->enableScheduler, json_cpu_cbs["enableThreadScheduler"].toString());
xemu_settings->EnhanceCheckBox(ui->lowerSPUThrPrio, emu_settings::LowerSPUThreadPrio);
SubscribeTooltip(ui->lowerSPUThrPrio, json_cpu_cbs["lowerSPUThrPrio"].toString());

View File

@ -207,9 +207,9 @@
</property>
<layout class="QVBoxLayout" name="verticalLayout_4">
<item>
<widget class="QCheckBox" name="bindSPUThreads">
<widget class="QCheckBox" name="enableScheduler">
<property name="text">
<string>Bind SPU threads to secondary cores</string>
<string>Enable thread scheduler</string>
</property>
</widget>
</item>