1
0
mirror of https://github.com/RPCS3/rpcs3.git synced 2024-11-25 04:02:42 +01:00

cell/scheduler: Manage thread placement depending on cpu hardware

This commit is contained in:
kd-11 2017-10-21 14:21:37 +03:00
parent 54fbde0de1
commit cbc8bf01a1
13 changed files with 185 additions and 40 deletions

View File

@ -6,7 +6,9 @@
#include "Emu/Cell/lv2/sys_mmapper.h"
#include "Emu/Cell/lv2/sys_event.h"
#include "Thread.h"
#include "sysinfo.h"
#include <typeinfo>
#include <thread>
#ifdef _WIN32
#include <Windows.h>
@ -1547,6 +1549,8 @@ thread_local DECLARE(thread_ctrl::g_tls_this_thread) = nullptr;
extern thread_local std::string(*g_tls_log_prefix)();
DECLARE(thread_ctrl::g_native_core_layout) { native_core_arrangement::undefined };
void thread_ctrl::start(const std::shared_ptr<thread_ctrl>& ctrl, task_stack task)
{
#ifdef _WIN32
@ -1853,6 +1857,89 @@ void thread_ctrl::test()
}
}
void thread_ctrl::detect_cpu_layout()
{
if (!g_native_core_layout.compare_and_swap_test(native_core_arrangement::undefined, native_core_arrangement::generic))
return;
const auto system_id = utils::get_system_info();
if (system_id.find("Ryzen") != std::string::npos)
{
g_native_core_layout.store(native_core_arrangement::amd_ccx);
}
else if (system_id.find("i3") != std::string::npos || system_id.find("i7") != std::string::npos)
{
g_native_core_layout.store(native_core_arrangement::intel_ht);
}
}
// Returns a 16-bit affinity mask (bit N set = logical core N allowed) for threads of
// the given class, chosen according to the detected CPU core layout.
// Returns UINT16_MAX when std::thread::hardware_concurrency() reports 0.
u16 thread_ctrl::get_affinity_mask(thread_class group)
{
	detect_cpu_layout();

	if (const auto thread_count = std::thread::hardware_concurrency())
	{
		// One bit per logical core, limited to the 16 cores a u16 can describe
		const u16 all_cores_mask = thread_count < 16 ? static_cast<u16>(~(UINT16_MAX << thread_count)) : UINT16_MAX;

		switch (g_native_core_layout)
		{
		default:
		case native_core_arrangement::generic:
		{
			return all_cores_mask;
		}
		case native_core_arrangement::amd_ccx:
		{
			u16 primary_ccx_unit_mask;
			if (thread_count >= 16)
			{
				// Threadripper, R7
				// Assign logical cores 8-15
				// It appears some Windows code is bound to lower core addresses; binding 8-15 is a lot faster than 0-7
				primary_ccx_unit_mask = 0b1111111100000000;
			}
			else
			{
				// R5 & R3 don't seem to improve performance no matter how these are shuffled (including 1600)
				primary_ccx_unit_mask = 0b11111111 & all_cores_mask;
			}

			switch (group)
			{
			default:
			case thread_class::general:
				return all_cores_mask;
			case thread_class::rsx:
			case thread_class::ppu:
			case thread_class::spu:
				return primary_ccx_unit_mask;
			}
		}
		case native_core_arrangement::intel_ht:
		{
			if (thread_count <= 4)
			{
				// i3 or worse: RSX/PPU take the even logical cores, SPU the odd ones
				u16 class_mask = all_cores_mask;
				switch (group)
				{
				case thread_class::rsx:
				case thread_class::ppu:
					class_mask = static_cast<u16>(0b0101 & all_cores_mask);
					break;
				case thread_class::spu:
					class_mask = static_cast<u16>(0b1010 & all_cores_mask);
					break;
				case thread_class::general:
					break;
				}

				// With a single logical core the odd-core set is empty; an empty mask is
				// rejected by the OS affinity calls, so fall back to every available core.
				return class_mask != 0 ? class_mask : all_cores_mask;
			}

			return all_cores_mask;
		}
		}
	}

	return UINT16_MAX;
}
void thread_ctrl::set_native_priority(int priority)
{
#ifdef _WIN32
@ -1886,24 +1973,31 @@ void thread_ctrl::set_native_priority(int priority)
#endif
}
// Applies an affinity setting to the calling thread using the platform's native API.
// None of the OS calls below have their return value checked.
// NOTE(review): this listing carries two signature lines and two variants of some
// statements (one using `core`, one using `mask`); it looks like the before/after
// lines of a diff shown together - confirm which variant is live.
void thread_ctrl::set_ideal_processor_core(int core)
void thread_ctrl::set_thread_affinity_mask(u16 mask)
{
#ifdef _WIN32
// Pseudo-handle referring to the calling thread
HANDLE _this_thread = GetCurrentThread();
SetThreadIdealProcessor(_this_thread, core);
SetThreadAffinityMask(_this_thread, (DWORD_PTR)mask);
#elif __APPLE__
// NOTE(review): the value is passed as the policy's single integer field; confirm
// that a bit mask is the intended meaning for THREAD_AFFINITY_POLICY.
thread_affinity_policy_data_t policy = { static_cast<integer_t>(core) };
thread_affinity_policy_data_t policy = { static_cast<integer_t>(mask) };
thread_port_t mach_thread = pthread_mach_thread_np(pthread_self());
thread_policy_set(mach_thread, THREAD_AFFINITY_POLICY, (thread_policy_t)&policy, 1);
#elif defined(__linux__) || defined(__DragonFly__) || defined(__FreeBSD__)
// Translate each set bit of the 16-bit mask into an entry of the cpu_set_t
cpu_set_t cs;
CPU_ZERO(&cs);
for (u32 core = 0; core < 16u; ++core)
{
if ((u32)mask & (1u << core))
{
CPU_SET(core, &cs);
}
}
pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cs);
#endif
}
// Default constructor; the body performs no work.
named_thread::named_thread()
{
}

View File

@ -16,6 +16,23 @@
// Will report exception and call std::abort() if put in catch(...)
[[noreturn]] void catch_all_exceptions();
// Hardware core layout, as classified by thread_ctrl::detect_cpu_layout()
enum class native_core_arrangement : u32
{
	undefined = 0, // No layout has been detected yet
	generic   = 1, // No special-cased layout; all cores are treated alike
	intel_ht  = 2, // Chosen when the system id string contains "i3" or "i7"
	amd_ccx   = 3, // Chosen when the system id string contains "Ryzen"
};
// Category of thread requesting an affinity mask from thread_ctrl::get_affinity_mask()
enum class thread_class : u32
{
	general = 0, // Always receives the all-cores mask
	rsx     = 1, // Requested by the RSX thread
	spu     = 2, // Requested by SPU threads and the MFC thread
	ppu     = 3, // Requested by PPU threads
};
// Simple list of void() functors
class task_stack
{
@ -91,6 +108,9 @@ class thread_ctrl final
// Current thread
static thread_local thread_ctrl* g_tls_this_thread;
// Target cpu core layout
static atomic_t<native_core_arrangement> g_native_core_layout;
// Self pointer
std::shared_ptr<thread_ctrl> m_self;
@ -234,8 +254,17 @@ public:
thread_ctrl::start(out, std::forward<F>(func));
}
// Detect layout
static void detect_cpu_layout();
// Returns the core affinity mask to use for threads of the given class on the detected CPU layout
static u16 get_affinity_mask(thread_class group);
// Sets the native thread priority
static void set_native_priority(int priority);
static void set_ideal_processor_core(int core);
// Sets the preferred affinity mask for this thread
static void set_thread_affinity_mask(u16 mask);
};
class named_thread

View File

@ -3,6 +3,7 @@
#include "Emu/Memory/vm.h"
#include "Emu/Cell/SPUThread.h"
#include "Emu/Cell/lv2/sys_sync.h"
#include "Emu/System.h"
#include "MFC.h"
const bool s_use_rtm = utils::has_rtm();
@ -375,3 +376,12 @@ void mfc_thread::add_spu(spu_ptr _spu)
run();
}
void mfc_thread::on_spawn()
{
if (g_cfg.core.thread_scheduler_enabled)
{
// Bind to same set with the SPUs
thread_ctrl::set_thread_affinity_mask(thread_ctrl::get_affinity_mask(thread_class::spu));
}
}

View File

@ -113,4 +113,6 @@ public:
virtual void cpu_task() override;
virtual void add_spu(spu_ptr _spu);
virtual void on_spawn() override;
};

View File

@ -334,6 +334,15 @@ extern void ppu_breakpoint(u32 addr)
}
}
void ppu_thread::on_spawn()
{
if (g_cfg.core.thread_scheduler_enabled)
{
// Bind to primary set
thread_ctrl::set_thread_affinity_mask(thread_ctrl::get_affinity_mask(thread_class::ppu));
}
}
void ppu_thread::on_init(const std::shared_ptr<void>& _this)
{
if (!stack_addr)

View File

@ -30,6 +30,7 @@ public:
static const u32 id_step = 1;
static const u32 id_count = 2048;
virtual void on_spawn() override;
virtual void on_init(const std::shared_ptr<void>&) override;
virtual std::string get_name() const override;
virtual std::string dump() const override;

View File

@ -134,8 +134,8 @@ namespace spu
{
if (timeout_ms > 0)
{
const auto timeout = timeout_ms * 1000u; //convert to microseconds
const auto start = get_system_time();
const u64 timeout = timeout_ms * 1000u; //convert to microseconds
const u64 start = get_system_time();
auto remaining = timeout;
while (atomic_instruction_table[pc_offset].load(std::memory_order_consume) >= max_concurrent_instructions)
@ -143,7 +143,7 @@ namespace spu
if (remaining >= native_jiffy_duration_us)
std::this_thread::sleep_for(1ms);
else
std::this_thread::yield();
busy_wait(remaining);
const auto now = get_system_time();
const auto elapsed = now - start;
@ -155,7 +155,8 @@ namespace spu
else
{
//Slight pause if function is overburdened
thread_ctrl::wait_for(100);
const auto count = atomic_instruction_table[pc_offset].load(std::memory_order_consume) * 100ull;
busy_wait(count);
}
}
@ -278,25 +279,15 @@ spu_imm_table_t::spu_imm_table_t()
// on_spawn hook for SPU threads: optionally sets thread affinity and lowers the
// native thread priority, depending on core configuration flags.
// NOTE(review): this listing contains two consecutive `if` conditions and both a
// per-core "ideal processor" path and an affinity-mask path; it looks like the
// before/after lines of a diff shown together - confirm which statements are live.
void SPUThread::on_spawn()
{
if (g_cfg.core.bind_spu_cores)
if (g_cfg.core.thread_scheduler_enabled)
{
//Get next secondary core number
auto core_count = std::thread::hardware_concurrency();
if (core_count > 0 && core_count <= 16)
{
// Spread SPU threads round-robin over the odd-numbered logical cores
auto half_count = core_count / 2;
auto assigned_secondary_core = ((g_num_spu_threads % half_count) * 2) + 1;
thread_ctrl::set_ideal_processor_core((s32)assigned_secondary_core);
}
// Apply the affinity mask computed for the SPU thread class
thread_ctrl::set_thread_affinity_mask(thread_ctrl::get_affinity_mask(thread_class::spu));
}
if (g_cfg.core.lower_spu_priority)
{
// Request a native priority of -1 for this thread
thread_ctrl::set_native_priority(-1);
}
g_num_spu_threads++;
}
void SPUThread::on_init(const std::shared_ptr<void>& _this)

View File

@ -398,7 +398,11 @@ namespace rsx
// Raise priority above other threads
thread_ctrl::set_native_priority(1);
thread_ctrl::set_ideal_processor_core(0);
if (g_cfg.core.thread_scheduler_enabled)
{
thread_ctrl::set_thread_affinity_mask(thread_ctrl::get_affinity_mask(thread_class::rsx));
}
// Round to nearest to deal with forward/reverse scaling
fesetround(FE_TONEAREST);

View File

@ -286,8 +286,13 @@ struct cfg_root : cfg::node
cfg::_bool llvm_logs{this, "Save LLVM logs"};
cfg::string llvm_cpu{this, "Use LLVM CPU"};
#ifdef _WIN32
cfg::_bool thread_scheduler_enabled{ this, "Enable thread scheduler", true };
#else
cfg::_bool thread_scheduler_enabled{ this, "Enable thread scheduler", false };
#endif
cfg::_enum<spu_decoder_type> spu_decoder{this, "SPU Decoder", spu_decoder_type::asmjit};
cfg::_bool bind_spu_cores{this, "Bind SPU threads to secondary cores"};
cfg::_bool lower_spu_priority{this, "Lower SPU thread priority"};
cfg::_bool spu_debug{this, "SPU Debug"};
cfg::_int<0, 16384> max_spu_immediate_write_size{this, "Maximum immediate DMA write size", 16384}; // Maximum size that an SPU thread can write directly without posting to MFC

View File

@ -26,7 +26,7 @@
},
"checkboxes": {
"hookStFunc": "Allows to hook some functions like 'memcpy' replacing them with high-level implementations. May do nothing or break things. Experimental.",
"bindSPUThreads": "If your CPU has SMT (Hyper-Threading) SPU threads will run on these logical cores instead.\nUsually faster on an i3, possibly slower or no difference on an i7 or Ryzen.",
"enableThreadScheduler": "Allows RPCS3 to manually schedule physical cores to run specific tasks on, instead of letting the OS handle it.\nVery useful on Windows, especially for AMD Ryzen systems where it can give huge performance gains.",
"lowerSPUThrPrio": "Runs SPU threads with lower priority than PPU threads.\nUsually faster on an i3 or i5, possibly slower or no difference on an i7 or Ryzen.",
"spuLoopDetection": "Try to detect loop conditions in SPU kernels and use them as scheduling hints.\nImproves performance and reduces CPU usage.\nMay cause severe audio stuttering in rare cases."
},

View File

@ -30,7 +30,7 @@ public:
SPUDecoder,
LibLoadOptions,
HookStaticFuncs,
BindSPUThreads,
EnableThreadScheduler,
LowerSPUThreadPrio,
SPULoopDetection,
PreferredSPUThreads,
@ -187,7 +187,7 @@ private:
{ SPUDecoder, { "Core", "SPU Decoder"}},
{ LibLoadOptions, { "Core", "Lib Loader"}},
{ HookStaticFuncs, { "Core", "Hook static functions"}},
{ BindSPUThreads, { "Core", "Bind SPU threads to secondary cores"}},
{ EnableThreadScheduler, { "Core", "Enable thread scheduler"}},
{ LowerSPUThreadPrio, { "Core", "Lower SPU thread priority"}},
{ SPULoopDetection, { "Core", "SPU loop detection"}},
{ PreferredSPUThreads, { "Core", "Preferred SPU Threads"}},

View File

@ -129,8 +129,8 @@ settings_dialog::settings_dialog(std::shared_ptr<gui_settings> guiSettings, std:
xemu_settings->EnhanceCheckBox(ui->hookStFunc, emu_settings::HookStaticFuncs);
SubscribeTooltip(ui->hookStFunc, json_cpu_cbs["hookStFunc"].toString());
xemu_settings->EnhanceCheckBox(ui->bindSPUThreads, emu_settings::BindSPUThreads);
SubscribeTooltip(ui->bindSPUThreads, json_cpu_cbs["bindSPUThreads"].toString());
xemu_settings->EnhanceCheckBox(ui->enableScheduler, emu_settings::EnableThreadScheduler);
SubscribeTooltip(ui->enableScheduler, json_cpu_cbs["enableThreadScheduler"].toString());
xemu_settings->EnhanceCheckBox(ui->lowerSPUThrPrio, emu_settings::LowerSPUThreadPrio);
SubscribeTooltip(ui->lowerSPUThrPrio, json_cpu_cbs["lowerSPUThrPrio"].toString());

View File

@ -207,9 +207,9 @@
</property>
<layout class="QVBoxLayout" name="verticalLayout_4">
<item>
<widget class="QCheckBox" name="bindSPUThreads">
<widget class="QCheckBox" name="enableScheduler">
<property name="text">
<string>Bind SPU threads to secondary cores</string>
<string>Enable thread scheduler</string>
</property>
</widget>
</item>