diff --git a/Utilities/Thread.cpp b/Utilities/Thread.cpp index fc1fa44f0b..cce866eecf 100644 --- a/Utilities/Thread.cpp +++ b/Utilities/Thread.cpp @@ -6,7 +6,9 @@ #include "Emu/Cell/lv2/sys_mmapper.h" #include "Emu/Cell/lv2/sys_event.h" #include "Thread.h" +#include "sysinfo.h" #include +#include #ifdef _WIN32 #include @@ -1547,6 +1549,8 @@ thread_local DECLARE(thread_ctrl::g_tls_this_thread) = nullptr; extern thread_local std::string(*g_tls_log_prefix)(); +DECLARE(thread_ctrl::g_native_core_layout) { native_core_arrangement::undefined }; + void thread_ctrl::start(const std::shared_ptr& ctrl, task_stack task) { #ifdef _WIN32 @@ -1853,6 +1857,89 @@ void thread_ctrl::test() } } +void thread_ctrl::detect_cpu_layout() +{ + if (!g_native_core_layout.compare_and_swap_test(native_core_arrangement::undefined, native_core_arrangement::generic)) + return; + + const auto system_id = utils::get_system_info(); + if (system_id.find("Ryzen") != std::string::npos) + { + g_native_core_layout.store(native_core_arrangement::amd_ccx); + } + else if (system_id.find("i3") != std::string::npos || system_id.find("i7") != std::string::npos) + { + g_native_core_layout.store(native_core_arrangement::intel_ht); + } +} + +u16 thread_ctrl::get_affinity_mask(thread_class group) +{ + detect_cpu_layout(); + + if (const auto thread_count = std::thread::hardware_concurrency()) + { + const u16 all_cores_mask = thread_count < 16 ? 
(u16)(~(UINT16_MAX << thread_count)): UINT16_MAX; + + switch (g_native_core_layout) + { + default: + case native_core_arrangement::generic: + { + return all_cores_mask; + } + case native_core_arrangement::amd_ccx: + { + u16 primary_ccx_unit_mask; + if (thread_count >= 16) + { + // Threadripper, R7 + // Assign threads 8-15 + // It appears some Windows code is bound to lower core addresses; binding 8-15 is a lot faster than 0-7 + primary_ccx_unit_mask = 0b1111111100000000; + } + else + { + // R5 & R3 don't seem to improve performance no matter how these are shuffled (including 1600) + primary_ccx_unit_mask = 0b11111111 & all_cores_mask; + } + + switch (group) + { + default: + case thread_class::general: + return all_cores_mask; + case thread_class::rsx: + case thread_class::ppu: + case thread_class::spu: + return primary_ccx_unit_mask; + } + } + case native_core_arrangement::intel_ht: + { + if (thread_count <= 4) + { + //i3 or worse + switch (group) + { + case thread_class::rsx: + case thread_class::ppu: + return (0b0101 & all_cores_mask); + case thread_class::spu: + return (0b1010 & all_cores_mask); + case thread_class::general: + return all_cores_mask; + } + } + + return all_cores_mask; + } + } + } + + return UINT16_MAX; +} + void thread_ctrl::set_native_priority(int priority) { #ifdef _WIN32 @@ -1886,24 +1973,31 @@ void thread_ctrl::set_native_priority(int priority) #endif } -void thread_ctrl::set_ideal_processor_core(int core) +void thread_ctrl::set_thread_affinity_mask(u16 mask) { #ifdef _WIN32 HANDLE _this_thread = GetCurrentThread(); - SetThreadIdealProcessor(_this_thread, core); + SetThreadAffinityMask(_this_thread, (DWORD_PTR)mask); #elif __APPLE__ - thread_affinity_policy_data_t policy = { static_cast(core) }; + thread_affinity_policy_data_t policy = { static_cast(mask) }; thread_port_t mach_thread = pthread_mach_thread_np(pthread_self()); thread_policy_set(mach_thread, THREAD_AFFINITY_POLICY, (thread_policy_t)&policy, 1); #elif defined(__linux__) || 
defined(__DragonFly__) || defined(__FreeBSD__) cpu_set_t cs; CPU_ZERO(&cs); - CPU_SET(core, &cs); + + for (u32 core = 0; core < 16u; ++core) + { + if ((u32)mask & (1u << core)) + { + CPU_SET(core, &cs); + } + } + pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cs); #endif } - named_thread::named_thread() { } diff --git a/Utilities/Thread.h b/Utilities/Thread.h index 031ac9eecc..11cb504253 100644 --- a/Utilities/Thread.h +++ b/Utilities/Thread.h @@ -16,6 +16,23 @@ // Will report exception and call std::abort() if put in catch(...) [[noreturn]] void catch_all_exceptions(); +// Hardware core layout +enum class native_core_arrangement : u32 +{ + undefined, + generic, + intel_ht, + amd_ccx +}; + +enum class thread_class : u32 +{ + general, + rsx, + spu, + ppu +}; + // Simple list of void() functors class task_stack { @@ -91,6 +108,9 @@ class thread_ctrl final // Current thread static thread_local thread_ctrl* g_tls_this_thread; + // Target cpu core layout + static atomic_t g_native_core_layout; + // Self pointer std::shared_ptr m_self; @@ -234,8 +254,17 @@ public: thread_ctrl::start(out, std::forward(func)); } + // Detect layout + static void detect_cpu_layout(); + + // Returns a core affinity mask. 
The mask is chosen from the detected core layout and the given thread class + static u16 get_affinity_mask(thread_class group); + + // Sets the native thread priority static void set_native_priority(int priority); - static void set_ideal_processor_core(int core); + + // Sets the preferred affinity mask for this thread + static void set_thread_affinity_mask(u16 mask); }; class named_thread diff --git a/rpcs3/Emu/Cell/MFC.cpp b/rpcs3/Emu/Cell/MFC.cpp index d6c0b01cbc..833a0ddae9 100644 --- a/rpcs3/Emu/Cell/MFC.cpp +++ b/rpcs3/Emu/Cell/MFC.cpp @@ -3,6 +3,7 @@ #include "Emu/Memory/vm.h" #include "Emu/Cell/SPUThread.h" #include "Emu/Cell/lv2/sys_sync.h" +#include "Emu/System.h" #include "MFC.h" const bool s_use_rtm = utils::has_rtm(); @@ -375,3 +376,12 @@ void mfc_thread::add_spu(spu_ptr _spu) run(); } + +void mfc_thread::on_spawn() +{ + if (g_cfg.core.thread_scheduler_enabled) + { + // Bind to the same set as the SPUs + thread_ctrl::set_thread_affinity_mask(thread_ctrl::get_affinity_mask(thread_class::spu)); + } +} diff --git a/rpcs3/Emu/Cell/MFC.h b/rpcs3/Emu/Cell/MFC.h index 5be7986845..1fa4b55f6c 100644 --- a/rpcs3/Emu/Cell/MFC.h +++ b/rpcs3/Emu/Cell/MFC.h @@ -113,4 +113,6 @@ public: virtual void cpu_task() override; virtual void add_spu(spu_ptr _spu); + + virtual void on_spawn() override; }; diff --git a/rpcs3/Emu/Cell/PPUThread.cpp b/rpcs3/Emu/Cell/PPUThread.cpp index 7f9a2e52e8..1fe24516b6 100644 --- a/rpcs3/Emu/Cell/PPUThread.cpp +++ b/rpcs3/Emu/Cell/PPUThread.cpp @@ -334,6 +334,15 @@ extern void ppu_breakpoint(u32 addr) } } +void ppu_thread::on_spawn() +{ + if (g_cfg.core.thread_scheduler_enabled) + { + // Bind to primary set + thread_ctrl::set_thread_affinity_mask(thread_ctrl::get_affinity_mask(thread_class::ppu)); + } +} + void ppu_thread::on_init(const std::shared_ptr& _this) { if (!stack_addr) diff --git a/rpcs3/Emu/Cell/PPUThread.h b/rpcs3/Emu/Cell/PPUThread.h index 3d5be3f8ab..f5db19fcea 100644 --- a/rpcs3/Emu/Cell/PPUThread.h +++ b/rpcs3/Emu/Cell/PPUThread.h @@ -30,6 +30,7 @@ 
public: static const u32 id_step = 1; static const u32 id_count = 2048; + virtual void on_spawn() override; virtual void on_init(const std::shared_ptr&) override; virtual std::string get_name() const override; virtual std::string dump() const override; diff --git a/rpcs3/Emu/Cell/SPUThread.cpp b/rpcs3/Emu/Cell/SPUThread.cpp index c33b3a1442..483ddbd30f 100644 --- a/rpcs3/Emu/Cell/SPUThread.cpp +++ b/rpcs3/Emu/Cell/SPUThread.cpp @@ -134,8 +134,8 @@ namespace spu { if (timeout_ms > 0) { - const auto timeout = timeout_ms * 1000u; //convert to microseconds - const auto start = get_system_time(); + const u64 timeout = timeout_ms * 1000u; //convert to microseconds + const u64 start = get_system_time(); auto remaining = timeout; while (atomic_instruction_table[pc_offset].load(std::memory_order_consume) >= max_concurrent_instructions) @@ -143,7 +143,7 @@ namespace spu if (remaining >= native_jiffy_duration_us) std::this_thread::sleep_for(1ms); else - std::this_thread::yield(); + busy_wait(remaining); const auto now = get_system_time(); const auto elapsed = now - start; @@ -155,7 +155,8 @@ namespace spu else { //Slight pause if function is overburdened - thread_ctrl::wait_for(100); + const auto count = atomic_instruction_table[pc_offset].load(std::memory_order_consume) * 100ull; + busy_wait(count); } } @@ -278,25 +279,15 @@ spu_imm_table_t::spu_imm_table_t() void SPUThread::on_spawn() { - if (g_cfg.core.bind_spu_cores) + if (g_cfg.core.thread_scheduler_enabled) { - //Get next secondary core number - auto core_count = std::thread::hardware_concurrency(); - if (core_count > 0 && core_count <= 16) - { - auto half_count = core_count / 2; - auto assigned_secondary_core = ((g_num_spu_threads % half_count) * 2) + 1; - - thread_ctrl::set_ideal_processor_core((s32)assigned_secondary_core); - } + thread_ctrl::set_thread_affinity_mask(thread_ctrl::get_affinity_mask(thread_class::spu)); } if (g_cfg.core.lower_spu_priority) { thread_ctrl::set_native_priority(-1); } - - 
g_num_spu_threads++; } void SPUThread::on_init(const std::shared_ptr& _this) diff --git a/rpcs3/Emu/RSX/RSXThread.cpp b/rpcs3/Emu/RSX/RSXThread.cpp index ae86c25072..04c4de0fc8 100644 --- a/rpcs3/Emu/RSX/RSXThread.cpp +++ b/rpcs3/Emu/RSX/RSXThread.cpp @@ -398,7 +398,11 @@ namespace rsx // Raise priority above other threads thread_ctrl::set_native_priority(1); - thread_ctrl::set_ideal_processor_core(0); + + if (g_cfg.core.thread_scheduler_enabled) + { + thread_ctrl::set_thread_affinity_mask(thread_ctrl::get_affinity_mask(thread_class::rsx)); + } // Round to nearest to deal with forward/reverse scaling fesetround(FE_TONEAREST); diff --git a/rpcs3/Emu/System.h b/rpcs3/Emu/System.h index 2d2ad68f50..3044d98e61 100644 --- a/rpcs3/Emu/System.h +++ b/rpcs3/Emu/System.h @@ -286,8 +286,13 @@ struct cfg_root : cfg::node cfg::_bool llvm_logs{this, "Save LLVM logs"}; cfg::string llvm_cpu{this, "Use LLVM CPU"}; +#ifdef _WIN32 + cfg::_bool thread_scheduler_enabled{ this, "Enable thread scheduler", true }; +#else + cfg::_bool thread_scheduler_enabled{ this, "Enable thread scheduler", false }; +#endif + cfg::_enum spu_decoder{this, "SPU Decoder", spu_decoder_type::asmjit}; - cfg::_bool bind_spu_cores{this, "Bind SPU threads to secondary cores"}; cfg::_bool lower_spu_priority{this, "Lower SPU thread priority"}; cfg::_bool spu_debug{this, "SPU Debug"}; cfg::_int<0, 16384> max_spu_immediate_write_size{this, "Maximum immediate DMA write size", 16384}; // Maximum size that an SPU thread can write directly without posting to MFC diff --git a/rpcs3/Json/tooltips.json b/rpcs3/Json/tooltips.json index 62fa57d0ae..cca09a5f36 100644 --- a/rpcs3/Json/tooltips.json +++ b/rpcs3/Json/tooltips.json @@ -26,7 +26,7 @@ }, "checkboxes": { "hookStFunc": "Allows to hook some functions like 'memcpy' replacing them with high-level implementations. May do nothing or break things. 
Experimental.", - "bindSPUThreads": "If your CPU has SMT (Hyper-Threading) SPU threads will run on these logical cores instead.\nUsually faster on an i3, possibly slower or no difference on an i7 or Ryzen.", + "enableThreadScheduler": "Allows rpcs3 to manually schedule physical cores to run specific tasks on, instead of letting the OS handle it.\nVery useful on windows, especially for AMD Ryzen systems where it can give huge performance gains.", "lowerSPUThrPrio": "Runs SPU threads with lower priority than PPU threads.\nUsually faster on an i3 or i5, possibly slower or no difference on an i7 or Ryzen.", "spuLoopDetection": "Try to detect loop conditions in SPU kernels and use them as scheduling hints.\nImproves performance and reduces CPU usage.\nMay cause severe audio stuttering in rare cases." }, diff --git a/rpcs3/rpcs3qt/emu_settings.h b/rpcs3/rpcs3qt/emu_settings.h index d8970c6d8c..696dda4857 100644 --- a/rpcs3/rpcs3qt/emu_settings.h +++ b/rpcs3/rpcs3qt/emu_settings.h @@ -30,7 +30,7 @@ public: SPUDecoder, LibLoadOptions, HookStaticFuncs, - BindSPUThreads, + EnableThreadScheduler, LowerSPUThreadPrio, SPULoopDetection, PreferredSPUThreads, @@ -183,16 +183,16 @@ private: const QMap SettingsLoc = { // Core Tab - { PPUDecoder, { "Core", "PPU Decoder"}}, - { SPUDecoder, { "Core", "SPU Decoder"}}, - { LibLoadOptions, { "Core", "Lib Loader"}}, - { HookStaticFuncs, { "Core", "Hook static functions"}}, - { BindSPUThreads, { "Core", "Bind SPU threads to secondary cores"}}, - { LowerSPUThreadPrio, { "Core", "Lower SPU thread priority"}}, - { SPULoopDetection, { "Core", "SPU loop detection"}}, - { PreferredSPUThreads, { "Core", "Preferred SPU Threads"}}, - { PPUDebug, { "Core", "PPU Debug"}}, - { SPUDebug, { "Core", "SPU Debug"}}, + { PPUDecoder, { "Core", "PPU Decoder"}}, + { SPUDecoder, { "Core", "SPU Decoder"}}, + { LibLoadOptions, { "Core", "Lib Loader"}}, + { HookStaticFuncs, { "Core", "Hook static functions"}}, + { EnableThreadScheduler, { "Core", "Enable thread 
scheduler"}}, + { LowerSPUThreadPrio, { "Core", "Lower SPU thread priority"}}, + { SPULoopDetection, { "Core", "SPU loop detection"}}, + { PreferredSPUThreads, { "Core", "Preferred SPU Threads"}}, + { PPUDebug, { "Core", "PPU Debug"}}, + { SPUDebug, { "Core", "SPU Debug"}}, // Graphics Tab { Renderer, { "Video", "Renderer"}}, diff --git a/rpcs3/rpcs3qt/settings_dialog.cpp b/rpcs3/rpcs3qt/settings_dialog.cpp index be7b8e08ce..5ad5b7c072 100644 --- a/rpcs3/rpcs3qt/settings_dialog.cpp +++ b/rpcs3/rpcs3qt/settings_dialog.cpp @@ -129,8 +129,8 @@ settings_dialog::settings_dialog(std::shared_ptr guiSettings, std: xemu_settings->EnhanceCheckBox(ui->hookStFunc, emu_settings::HookStaticFuncs); SubscribeTooltip(ui->hookStFunc, json_cpu_cbs["hookStFunc"].toString()); - xemu_settings->EnhanceCheckBox(ui->bindSPUThreads, emu_settings::BindSPUThreads); - SubscribeTooltip(ui->bindSPUThreads, json_cpu_cbs["bindSPUThreads"].toString()); + xemu_settings->EnhanceCheckBox(ui->enableScheduler, emu_settings::EnableThreadScheduler); + SubscribeTooltip(ui->enableScheduler, json_cpu_cbs["enableThreadScheduler"].toString()); xemu_settings->EnhanceCheckBox(ui->lowerSPUThrPrio, emu_settings::LowerSPUThreadPrio); SubscribeTooltip(ui->lowerSPUThrPrio, json_cpu_cbs["lowerSPUThrPrio"].toString()); diff --git a/rpcs3/rpcs3qt/settings_dialog.ui b/rpcs3/rpcs3qt/settings_dialog.ui index 8fc47c7017..63bdb78730 100644 --- a/rpcs3/rpcs3qt/settings_dialog.ui +++ b/rpcs3/rpcs3qt/settings_dialog.ui @@ -207,9 +207,9 @@ - + - Bind SPU threads to secondary cores + Enable thread scheduler