From cf0fcf5a2adf74cd56186ea4416b3ebac0a587a6 Mon Sep 17 00:00:00 2001
From: Eladash
Date: Tue, 23 Jun 2020 16:41:16 +0300
Subject: [PATCH] SPU: Implement execution wake-up delay

Move the accurate-delay loop out of rsx::thread::fifo_wake_delay into the
new thread_ctrl::wait_for_accurate(usec) and reuse it from
spu_thread::wakeup_delay(), which is called after the SPU wake-up points
touched by this patch (get_ch_value, set_ch_value, stop_and_signal).

The delay in microseconds and the per-thread-index enable mask are read
from the new "SPU Wake-Up Delay" and "SPU Wake-Up Delay Thread Mask"
settings. thread_ctrl::_wait_for is folded into the public wait_for.

Inside wait_for_accurate the remaining time is converted to microseconds
with duration_cast: the difference of two steady_clock time points is
expressed in the clock's own tick period (nanoseconds on the mainstream
standard libraries), so using .count() directly overstated the remaining
time and made the following wait overshoot the deadline.
---
 Utilities/Thread.cpp         | 56 ++++++++++++++++++++++++++++++++++--
 Utilities/Thread.h           | 15 ++++------
 rpcs3/Emu/Cell/SPUThread.cpp | 15 ++++++++++
 rpcs3/Emu/Cell/SPUThread.h   |  1 +
 rpcs3/Emu/RSX/RSXThread.cpp  | 53 ++--------------------------------
 rpcs3/Emu/RSX/RSXThread.h    |  3 +-
 rpcs3/Emu/system_config.h    |  4 ++-
 7 files changed, 83 insertions(+), 64 deletions(-)

diff --git a/Utilities/Thread.cpp b/Utilities/Thread.cpp
index d13346fb1c..542443c37e 100644
--- a/Utilities/Thread.cpp
+++ b/Utilities/Thread.cpp
@@ -2322,14 +2322,14 @@ thread_state thread_ctrl::state()
 	return static_cast(_this->m_sync & 3);
 }
 
-void thread_ctrl::_wait_for(u64 usec, [[maybe_unused]] bool alert /* true */)
+void thread_ctrl::wait_for(u64 usec, [[maybe_unused]] bool alert /* true */)
 {
 	auto _this = g_tls_this_thread;
 
 #ifdef __linux__
 	static thread_local struct linux_timer_handle_t
 	{
-		// Allocate timer only if needed (i.e. someone calls _wait_for with alert and short period)
+		// Allocate timer only if needed (i.e. someone calls wait_for with alert and short period)
 		const int m_timer = timerfd_create(CLOCK_MONOTONIC, 0);
 
 		linux_timer_handle_t() noexcept
@@ -2383,6 +2383,58 @@ void thread_ctrl::_wait_for(u64 usec, [[maybe_unused]] bool alert /* true */)
 	list.wait(atomic_wait_timeout{usec <= 0xffff'ffff'ffff'ffff / 1000 ? usec * 1000 : 0xffff'ffff'ffff'ffff});
 }
 
+void thread_ctrl::wait_for_accurate(u64 usec)
+{
+	if (!usec)
+	{
+		return;
+	}
+
+	using namespace std::chrono_literals;
+
+	const auto until = std::chrono::steady_clock::now() + 1us * usec;
+
+	while (true)
+	{
+#ifdef __linux__
+		// NOTE: Assumption that timer initialization has succeeded
+		u64 host_min_quantum = usec <= 1000 ? 10 : 50;
+#else
+		// Host scheduler quantum for windows (worst case)
+		// NOTE: On ps3 this function has very high accuracy
+		constexpr u64 host_min_quantum = 500;
+#endif
+		if (usec >= host_min_quantum)
+		{
+#ifdef __linux__
+			// Do not wait for the last quantum to avoid loss of accuracy
+			wait_for(usec - ((usec % host_min_quantum) + host_min_quantum), false);
+#else
+			// Wait on multiple of min quantum for large durations to avoid overloading low thread cpus
+			wait_for(usec - (usec % host_min_quantum), false);
+#endif
+		}
+		// TODO: Determine best value for yield delay
+		else if (usec >= host_min_quantum / 2)
+		{
+			std::this_thread::yield();
+		}
+		else
+		{
+			busy_wait(100);
+		}
+
+		const auto current = std::chrono::steady_clock::now();
+
+		if (current >= until)
+		{
+			break;
+		}
+
+		usec = std::chrono::duration_cast<std::chrono::microseconds>(until - current).count(); // Clock ticks are not microseconds, convert explicitly
+	}
+}
+
 std::string thread_ctrl::get_name_cached()
 {
 	auto _this = thread_ctrl::g_tls_this_thread;
diff --git a/Utilities/Thread.h b/Utilities/Thread.h
index 3e5c909d38..3130c0fcb2 100644
--- a/Utilities/Thread.h
+++ b/Utilities/Thread.h
@@ -201,9 +201,6 @@ class thread_ctrl final
 	// Target cpu core layout
 	static atomic_t g_native_core_layout;
 
-	// Internal waiting function, may throw. Infinite value is -1.
-	static void _wait_for(u64 usec, bool alert);
-
 	friend class thread_base;
 
 	// Optimized get_name() for logging
@@ -263,16 +260,16 @@ public:
 	// Read current state, possibly executing some tasks
 	static thread_state state();
 
-	// Wait once with timeout. May spuriously return false.
-	static inline void wait_for(u64 usec, bool alert = true)
-	{
-		_wait_for(usec, alert);
-	}
+	// Wait once with timeout. Infinite value is -1.
+	static void wait_for(u64 usec, bool alert = true);
+
+	// Block until the given timeout (in microseconds) has fully elapsed, favoring accuracy. Zero returns immediately.
+	static void wait_for_accurate(u64 usec);
 
 	// Wait.
 	static inline void wait()
 	{
-		_wait_for(-1, true);
+		wait_for(-1, true);
 	}
 
 	// Wait for both thread sync var and provided atomic var
diff --git a/rpcs3/Emu/Cell/SPUThread.cpp b/rpcs3/Emu/Cell/SPUThread.cpp
index fa269e82b9..46f19ac28f 100644
--- a/rpcs3/Emu/Cell/SPUThread.cpp
+++ b/rpcs3/Emu/Cell/SPUThread.cpp
@@ -3849,6 +3849,12 @@ s64 spu_thread::get_ch_value(u32 ch)
 		}
 
 		const s64 out = channel.pop_wait(*this);
+
+		if (state & cpu_flag::wait)
+		{
+			wakeup_delay();
+		}
+
 		static_cast(test_stopped());
 		return out;
 	};
@@ -4068,6 +4074,7 @@ s64 spu_thread::get_ch_value(u32 ch)
 			thread_ctrl::wait_on(state, old, 100);
 		}
 
+		wakeup_delay();
 		check_state();
 		return events.events & mask1;
 	}
@@ -4114,6 +4121,7 @@ bool spu_thread::set_ch_value(u32 ch, u32 value)
 			}
 
 			int_ctrl[2].set(SPU_INT2_STAT_MAILBOX_INT);
+			wakeup_delay();
 			check_state();
 			return true;
 		}
@@ -4680,6 +4688,7 @@ bool spu_thread::stop_and_signal(u32 code)
 			thread_ctrl::wait_on(state, old);
 		}
 
+		wakeup_delay();
 		return true;
 	}
 
@@ -5000,6 +5009,12 @@ bool spu_thread::capture_local_storage() const
 	return true;
 }
 
+void spu_thread::wakeup_delay(u32 div) const
+{
+	if (g_cfg.core.spu_wakeup_delay_mask & (1u << index))
+		thread_ctrl::wait_for_accurate(utils::aligned_div(+g_cfg.core.spu_wakeup_delay, div));
+}
+
 spu_function_logger::spu_function_logger(spu_thread& spu, const char* func)
 	: spu(spu)
 {
diff --git a/rpcs3/Emu/Cell/SPUThread.h b/rpcs3/Emu/Cell/SPUThread.h
index 935c9d1ef1..8d005c520d 100644
--- a/rpcs3/Emu/Cell/SPUThread.h
+++ b/rpcs3/Emu/Cell/SPUThread.h
@@ -872,6 +872,7 @@ public:
 	void fast_call(u32 ls_addr);
 
 	bool capture_local_storage() const;
+	void wakeup_delay(u32 div = 1) const;
 
 	// Convert specified SPU LS address to a pointer of specified (possibly converted to BE) type
 	template
diff --git a/rpcs3/Emu/RSX/RSXThread.cpp b/rpcs3/Emu/RSX/RSXThread.cpp
index 37b1c5e638..3f721da53f 100644
--- a/rpcs3/Emu/RSX/RSXThread.cpp
+++ b/rpcs3/Emu/RSX/RSXThread.cpp
@@ -2780,59 +2780,10 @@ namespace rsx
 		return result;
 	}
 
-	void thread::fifo_wake_delay(u64 div)
+	void thread::fifo_wake_delay(u32 div)
 	{
-		// TODO: Nanoseconds accuracy
-		u64 remaining = g_cfg.video.driver_wakeup_delay;
-
-		if (!remaining)
-		{
-			return;
-		}
-
 		// Some cases do not need full delay
-		remaining = utils::aligned_div(remaining, div);
-		const u64 until = rsx::uclock() + remaining;
-
-		while (true)
-		{
-#ifdef __linux__
-			// NOTE: Assumption that timer initialization has succeeded
-			u64 host_min_quantum = remaining <= 1000 ? 10 : 50;
-#else
-			// Host scheduler quantum for windows (worst case)
-			// NOTE: On ps3 this function has very high accuracy
-			constexpr u64 host_min_quantum = 500;
-#endif
-			if (remaining >= host_min_quantum)
-			{
-#ifdef __linux__
-				// Do not wait for the last quantum to avoid loss of accuracy
-				thread_ctrl::wait_for(remaining - ((remaining % host_min_quantum) + host_min_quantum), false);
-#else
-				// Wait on multiple of min quantum for large durations to avoid overloading low thread cpus
-				thread_ctrl::wait_for(remaining - (remaining % host_min_quantum), false);
-#endif
-			}
-			// TODO: Determine best value for yield delay
-			else if (remaining >= host_min_quantum / 2)
-			{
-				std::this_thread::yield();
-			}
-			else
-			{
-				busy_wait(100);
-			}
-
-			const u64 current = rsx::uclock();
-
-			if (current >= until)
-			{
-				break;
-			}
-
-			remaining = until - current;
-		}
+		thread_ctrl::wait_for_accurate(utils::aligned_div(+g_cfg.video.driver_wakeup_delay, div));
 	}
 
 	u32 thread::get_fifo_cmd() const
diff --git a/rpcs3/Emu/RSX/RSXThread.h b/rpcs3/Emu/RSX/RSXThread.h
index f767880058..917c98b0ca 100644
--- a/rpcs3/Emu/RSX/RSXThread.h
+++ b/rpcs3/Emu/RSX/RSXThread.h
@@ -24,6 +24,7 @@
 
 #include "Capture/rsx_trace.h"
 #include "Capture/rsx_replay.h"
+#include "Emu/system_config.h"
 #include "Emu/Cell/lv2/sys_rsx.h"
 #include "Emu/IdManager.h"
 #include "Emu/system_config.h"
@@ -518,7 +519,7 @@ namespace rsx
 			const char* file = __builtin_FILE(),
 			const char* func = __builtin_FUNCTION());
 
-		static void fifo_wake_delay(u64 div = 1);
+		static void fifo_wake_delay(u32 div = 1);
 		u32 get_fifo_cmd() const;
 
 		void dump_regs(std::string&) const override;
diff --git a/rpcs3/Emu/system_config.h b/rpcs3/Emu/system_config.h
index 2a172e5831..c2e4c1b741 100644
--- a/rpcs3/Emu/system_config.h
+++ b/rpcs3/Emu/system_config.h
@@ -87,6 +87,8 @@ struct cfg_root : cfg::node
 		cfg::uint64 tx_limit2_ns{this, "TSX Transaction Second Limit", 2000}; // In nanoseconds
 
 		cfg::_int<10, 3000> clocks_scale{ this, "Clocks scale", 100 }; // Changing this from 100 (percentage) may affect game speed in unexpected ways
+		cfg::uint<0, 3000> spu_wakeup_delay{ this, "SPU Wake-Up Delay", 0, true };
+		cfg::uint<0, (1 << 6) - 1> spu_wakeup_delay_mask{ this, "SPU Wake-Up Delay Thread Mask", (1 << 6) - 1, true };
 #if defined (__linux__) || defined (__APPLE__)
 		cfg::_enum sleep_timers_accuracy{ this, "Sleep Timers Accuracy", sleep_timers_accuracy_level::_as_host, true };
 #else
@@ -168,7 +170,7 @@ struct cfg_root : cfg::node
 		cfg::_int<1, 1024> min_scalable_dimension{ this, "Minimum Scalable Dimension", 16 };
 		cfg::_int<0, 16> shader_compiler_threads_count{ this, "Shader Compiler Threads", 0 };
 		cfg::_int<0, 30000000> driver_recovery_timeout{ this, "Driver Recovery Timeout", 1000000, true };
-		cfg::_int<0, 16667> driver_wakeup_delay{ this, "Driver Wake-Up Delay", 1, true };
+		cfg::uint<0, 16667> driver_wakeup_delay{ this, "Driver Wake-Up Delay", 1, true };
 		cfg::_int<1, 1800> vblank_rate{ this, "Vblank Rate", 60, true }; // Changing this from 60 may affect game speed in unexpected ways
 		cfg::_bool vblank_ntsc{ this, "Vblank NTSC Fixup", false, true };
 		cfg::_bool decr_memory_layout{ this, "DECR memory layout", false}; // Force enable increased allowed main memory range as DECR console