
Implement independent CPU preemptions

This commit is contained in:
Eladash 2022-09-06 18:59:23 +03:00 committed by kd-11
parent b9e20dc5c9
commit ec7b18dab5
12 changed files with 358 additions and 4 deletions

View File

@@ -55,6 +55,8 @@ void fmt_class_string<cpu_flag>::format(std::string& out, u64 arg)
case cpu_flag::memory: return "mem";
case cpu_flag::pending: return "pend";
case cpu_flag::pending_recheck: return "pend-re";
case cpu_flag::yield: return "y";
case cpu_flag::preempt: return "PREEMPT";
case cpu_flag::dbg_global_pause: return "G-PAUSE";
case cpu_flag::dbg_pause: return "PAUSE";
case cpu_flag::dbg_step: return "STEP";
@@ -575,6 +577,7 @@ void cpu_thread::operator()()
if (!(state0 & cpu_flag::stop))
{
cpu_task();
state += cpu_flag::wait;
if (state & cpu_flag::ret && state.test_and_reset(cpu_flag::ret))
{
@@ -731,7 +734,7 @@ bool cpu_thread::check_state() noexcept
if (!is_stopped(flags) && flags.none_of(cpu_flag::ret))
{
// Check pause flags which hold thread inside check_state (ignore suspend/debug flags on cpu_flag::temp)
if (flags & (cpu_flag::pause + cpu_flag::memory) || (cpu_can_stop && flags & (cpu_flag::dbg_global_pause + cpu_flag::dbg_pause + cpu_flag::suspend)))
if (flags & (cpu_flag::pause + cpu_flag::memory + cpu_flag::yield + cpu_flag::preempt) || (cpu_can_stop && flags & (cpu_flag::dbg_global_pause + cpu_flag::dbg_pause + cpu_flag::suspend)))
{
if (!(flags & cpu_flag::wait))
{
@@ -739,6 +742,12 @@ bool cpu_thread::check_state() noexcept
store = true;
}
if (flags & (cpu_flag::yield + cpu_flag::preempt))
{
flags -= (cpu_flag::yield + cpu_flag::preempt);
store = true;
}
escape = false;
state1 = flags;
return store;
@@ -768,6 +777,30 @@ bool cpu_thread::check_state() noexcept
return store;
}).first;
if (state0 & cpu_flag::preempt)
{
if (cpu_flag::wait - state0)
{
// Yield itself
state.wait(state1, atomic_wait_timeout{20'000});
}
if (const u128 bits = s_cpu_bits)
{
reader_lock lock(s_cpu_lock);
cpu_counter::for_all_cpu(bits & s_cpu_bits, [](cpu_thread* cpu)
{
if (cpu->state.none_of(cpu_flag::wait + cpu_flag::yield))
{
cpu->state += cpu_flag::yield;
}
return true;
});
}
}
if (escape)
{
if (s_tls_thread_slot == umax && !retval)
@@ -856,6 +889,14 @@ bool cpu_thread::check_state() noexcept
break;
}
}
continue;
}
if (state0 & cpu_flag::yield && cpu_flag::wait - state0)
{
// Short sleep when the yield flag is present alone (it is pointless when other mechanisms that can stop thread execution have already run)
state.wait(state1, atomic_wait_timeout{20'000});
}
}
}
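
For orientation, a minimal self-contained sketch of the fan-out the new check_state() branch performs: the thread that observes cpu_flag::preempt clears its own request flags, then raises a yield request on every other registered CPU thread that is not already waiting. The real code uses bs_t<cpu_flag>, s_cpu_bits, s_cpu_lock and cpu_counter::for_all_cpu; fake_cpu, fan_out_yield, g_cpus and g_cpu_bits below are names invented for this sketch, and plain std::atomic flag words stand in for the engine's atomic bitsets.

#include <array>
#include <atomic>
#include <cstddef>
#include <cstdint>

enum flag_bits : uint32_t { f_wait = 1u << 0, f_yield = 1u << 1, f_preempt = 1u << 2 };

struct fake_cpu { std::atomic<uint32_t> state{0}; };

std::array<fake_cpu, 8> g_cpus;          // stand-in for the cpu_counter slots
std::atomic<uint64_t> g_cpu_bits{0xff};  // one bit per registered slot (s_cpu_bits in the real code)

void fan_out_yield(fake_cpu& self)
{
    self.state.fetch_and(~uint32_t(f_yield | f_preempt)); // consume the request on the current thread

    const uint64_t bits = g_cpu_bits.load();

    for (std::size_t i = 0; i < g_cpus.size(); i++)
    {
        if (!(bits & (uint64_t{1} << i)) || &g_cpus[i] == &self)
            continue;

        // Skip threads that are already waiting or were already asked to yield,
        // mirroring the none_of(cpu_flag::wait + cpu_flag::yield) check above.
        if (!(g_cpus[i].state.load() & (f_wait | f_yield)))
            g_cpus[i].state.fetch_or(f_yield);
    }
}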

View File

@@ -25,6 +25,8 @@ enum class cpu_flag : u32
pending, // Thread has postponed work
pending_recheck, // Thread needs to recheck if there is pending work before ::pending removal
notify, // Flag meant solely to allow atomic notification on state without changing other flags
yield, // Thread is being requested to yield its execution time if it's running
preempt, // Thread is being requested to preempt the execution of all CPU threads
dbg_global_pause, // Emulation paused
dbg_pause, // Thread paused
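
The expressions in this commit rely on RPCS3's flag-set arithmetic: '+' builds a set of flags, '-' removes flags, and a set converts to bool when non-empty, so cpu_flag::wait - state0 reads as "wait is not set in state0" while state += cpu_flag::wait raises a flag. Below is a toy, non-atomic model of that arithmetic for illustration only; flag and flag_set are names invented here, and the real implementation is RPCS3's bs_t bitset template.

#include <cstdint>

enum class flag : uint32_t { wait, yield, preempt };

struct flag_set
{
    uint32_t bits = 0;
    constexpr flag_set() = default;
    constexpr flag_set(flag f) : bits(1u << static_cast<uint32_t>(f)) {}
    constexpr explicit operator bool() const { return bits != 0; }
};

constexpr flag_set operator+(flag_set a, flag_set b) { flag_set r; r.bits = a.bits | b.bits; return r; }
constexpr flag_set operator-(flag_set a, flag_set b) { flag_set r; r.bits = a.bits & ~b.bits; return r; }
constexpr flag_set operator&(flag_set a, flag_set b) { flag_set r; r.bits = a.bits & b.bits; return r; }

// flag::wait - state is truthy exactly when wait is absent from state:
static_assert(static_cast<bool>(flag::wait - (flag_set(flag::yield) + flag::preempt)));
static_assert(!static_cast<bool>(flag::wait - flag_set(flag::wait)));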

View File

@@ -4247,7 +4247,7 @@ s64 spu_thread::get_ch_value(u32 ch)
spu_function_logger logger(*this, "MFC Events read");
state += cpu_flag::wait;
lv2_obj::prepare_for_sleep(*this);
using resrv_ptr = std::add_pointer_t<const decltype(rdata)>;
@@ -4943,7 +4943,7 @@ bool spu_thread::stop_and_signal(u32 code)
return ch_in_mbox.set_values(1, CELL_EINVAL), true;
}
state += cpu_flag::wait;
lv2_obj::prepare_for_sleep(*this);
spu_function_logger logger(*this, "sys_spu_thread_receive_event");

View File

@@ -51,9 +51,15 @@
#include <optional>
#include <deque>
#include "util/tsc.hpp"
extern std::string ppu_get_syscall_name(u64 code);
namespace rsx
{
void set_rsx_yield_flag() noexcept;
}
template <>
void fmt_class_string<ppu_syscall_code>::format(std::string& out, u64 arg)
{
@@ -1202,6 +1208,10 @@ static std::deque<std::pair<u64, class cpu_thread*>> g_waiting;
// Threads which must call lv2_obj::sleep before the scheduler starts
static std::deque<class cpu_thread*> g_to_sleep;
static atomic_t<u64> s_yield_frequency = 0;
static atomic_t<u64> s_max_allowed_yield_tsc = 0;
static u64 s_last_yield_tsc = 0;
namespace cpu_counter
{
void remove(cpu_thread*) noexcept;
@@ -1577,6 +1587,7 @@ void lv2_obj::cleanup()
g_to_sleep.clear();
g_waiting.clear();
g_pending = 0;
s_yield_frequency = 0;
}
void lv2_obj::schedule_all(u64 current_time)
@@ -1653,6 +1664,22 @@ void lv2_obj::schedule_all(u64 current_time)
// Null-terminate the list if it ends before last slot
g_to_notify[notify_later_idx] = nullptr;
}
if (const u64 freq = s_yield_frequency)
{
if (auto cpu = cpu_thread::get_current())
{
const u64 tsc = utils::get_tsc();
const u64 last_tsc = s_last_yield_tsc;
if (tsc >= last_tsc && tsc <= s_max_allowed_yield_tsc && tsc - last_tsc >= freq)
{
cpu->state += cpu_flag::preempt;
s_last_yield_tsc = tsc;
rsx::set_rsx_yield_flag();
}
}
}
}
ppu_thread_status lv2_obj::ppu_state(ppu_thread* ppu, bool lock_idm, bool lock_lv2)
@@ -1737,6 +1764,12 @@ bool lv2_obj::has_ppus_in_running_state()
return false;
}
void lv2_obj::set_yield_frequency(u64 freq, u64 max_allowed_tsc)
{
s_yield_frequency.release(freq);
s_max_allowed_yield_tsc.release(max_allowed_tsc);
}
bool lv2_obj::wait_timeout(u64 usec, ppu_thread* cpu, bool scale, bool is_usleep)
{
static_assert(u64{umax} / max_timeout >= 100, "max timeout is not valid for scaling");
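
The consumer side of the throttle sits in schedule_all() above: a preemption is requested at most once per s_yield_frequency TSC ticks, never before the previous mark and never past s_max_allowed_yield_tsc. A minimal stand-alone model of that condition follows; yield_throttle and should_preempt are names invented for this sketch, not RPCS3 API.

#include <cstdint>

struct yield_throttle
{
    uint64_t freq = 0;             // 0 = disabled (as after lv2_obj::cleanup())
    uint64_t max_allowed_tsc = 0;  // hard cutoff near the end of the frame
    uint64_t last_tsc = 0;

    bool should_preempt(uint64_t tsc)
    {
        if (!freq)
            return false;

        if (tsc < last_tsc || tsc > max_allowed_tsc || tsc - last_tsc < freq)
            return false;

        last_tsc = tsc; // consume this slot, matching the s_last_yield_tsc update
        return true;
    }
};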

View File

@@ -266,6 +266,8 @@ public:
// Must be called under IDM lock
static bool has_ppus_in_running_state();
static void set_yield_frequency(u64 freq, u64 max_allowed_tsc);
static void cleanup();
template <typename T>

View File

@@ -2,6 +2,7 @@
#include "RSXThread.h"
#include "Emu/Cell/PPUCallback.h"
#include "Emu/Cell/SPUThread.h"
#include "Emu/Cell/timers.hpp"
#include "Common/BufferUtils.h"
@@ -42,6 +43,8 @@ rsx::frame_capture_data frame_capture;
extern CellGcmOffsetTable offsetTable;
extern thread_local std::string(*g_tls_log_prefix)();
LOG_CHANNEL(perf_log, "PERF");
template <>
bool serialize<rsx::rsx_state>(utils::serial& ar, rsx::rsx_state& o)
{
@@ -241,6 +244,17 @@ namespace rsx
fmt::throw_exception("rsx::get_address(offset=0x%x, location=0x%x): %s%s", offset, location, msg, src_loc{line, col, file, func});
}
extern void set_rsx_yield_flag() noexcept
{
if (auto rsx = get_current_renderer())
{
if (g_cfg.core.allow_rsx_cpu_preempt)
{
rsx->state += cpu_flag::yield;
}
}
}
std::pair<u32, u32> interleaved_range_info::calculate_required_range(u32 first, u32 count) const
{
if (single_vertex)
@@ -3373,7 +3387,7 @@ namespace rsx
switch (frame_limit)
{
case frame_limit_type::none: limit = 0.; break;
case frame_limit_type::none: limit = g_cfg.core.max_cpu_preempt_count_per_frame ? static_cast<double>(g_cfg.video.vblank_rate) : 0.; break;
case frame_limit_type::_50: limit = 50.; break;
case frame_limit_type::_60: limit = 60.; break;
case frame_limit_type::_30: limit = 30.; break;
@@ -3478,5 +3492,183 @@ namespace rsx
intr_thread->cmd_notify.notify_one();
}
}
evaluate_cpu_usage_reduction_limits();
}
void thread::evaluate_cpu_usage_reduction_limits()
{
const u64 max_preempt_count = g_cfg.core.max_cpu_preempt_count_per_frame;
if (!max_preempt_count)
{
frame_times.clear();
lv2_obj::set_yield_frequency(0, 0);
return;
}
const u64 current_time = get_system_time();
const u64 current_tsc = utils::get_tsc();
u64 preempt_count = 0;
if (frame_times.size() >= 60)
{
u64 diffs = 0;
for (usz i = 1; i < frame_times.size(); i++)
{
const u64 cur_diff = frame_times[i].timestamp - frame_times[i - 1].timestamp;
diffs += cur_diff;
}
const usz avg_frame_time = diffs / 59;
u32 lowered_delay = 0;
u32 highered_delay = 0;
bool can_reevaluate = true;
u64 prev_preempt_count = umax;
for (usz i = frame_times.size() - 30; i < frame_times.size(); i++)
{
if (prev_preempt_count == umax)
{
prev_preempt_count = frame_times[i].preempt_count;
continue;
}
if (prev_preempt_count != frame_times[i].preempt_count)
{
if (prev_preempt_count > frame_times[i].preempt_count)
{
lowered_delay++;
}
else if (prev_preempt_count < frame_times[i].preempt_count)
{
highered_delay++;
}
if (i > frame_times.size() - 30)
{
// Slow preemption count increase
can_reevaluate = false;
}
}
prev_preempt_count = frame_times[i].preempt_count;
}
preempt_count = frame_times.back().preempt_count;
u32 fails = 0;
u32 hard_fails = 0;
bool is_last_frame_a_fail = false;
auto abs_dst = [](u64 a, u64 b)
{
return a >= b ? a - b : b - a;
};
for (u32 i = 1; i <= frame_times.size(); i++)
{
const u64 cur_diff = (i == frame_times.size() ? current_time : frame_times[i].timestamp) - frame_times[i - 1].timestamp;
if (const u64 diff_of_diff = abs_dst(cur_diff, avg_frame_time);
diff_of_diff >= avg_frame_time / 4)
{
if (diff_of_diff >= avg_frame_time / 2)
{
highered_delay++;
hard_fails++;
if (i == frame_times.size())
{
is_last_frame_a_fail = true;
}
}
if (fails != umax)
{
fails++;
}
}
}
bool hard_measures_taken = false;
const usz fps_10 = 10'000'000 / avg_frame_time;
auto lower_preemption_count = [&]()
{
if (preempt_count >= 10)
{
preempt_count -= 10;
}
else
{
preempt_count = 0;
}
if (hard_fails > 2 && is_last_frame_a_fail)
{
hard_measures_taken = preempt_count > 1;
preempt_count = preempt_count * 7 / 8;
prevent_preempt_increase_tickets = 10;
}
else
{
prevent_preempt_increase_tickets = std::max<u32>(7, prevent_preempt_increase_tickets);
}
};
if (can_reevaluate)
{
const bool is_avg_fps_ok = (abs_dst(fps_10, 300) < 3 || abs_dst(fps_10, 600) < 4 || abs_dst(fps_10, g_cfg.video.vblank_rate * 10) < 4 || abs_dst(fps_10, g_cfg.video.vblank_rate * 10 / 2) < 3);
if (!hard_fails && fails < 6 && is_avg_fps_ok)
{
if (prevent_preempt_increase_tickets)
{
prevent_preempt_increase_tickets--;
}
else if (preempt_count < max_preempt_count)
{
preempt_count += 4;
}
}
else
{
lower_preemption_count();
}
}
// Sudden FPS drop detection
else if ((fails > 10 || hard_fails > 2 || !(abs_dst(fps_10, 300) < 20 || abs_dst(fps_10, 600) < 30 || abs_dst(fps_10, g_cfg.video.vblank_rate * 10) < 20 || abs_dst(fps_10, g_cfg.video.vblank_rate * 10 / 2) < 30)) && lowered_delay < highered_delay && is_last_frame_a_fail)
{
lower_preemption_count();
}
perf_log.trace("CPU preemption control: reeval=%d, preempt_count=%d, fails=%d, hard=%d, avg_frame_time=%d, highered=%d, lowered=%d", can_reevaluate, preempt_count, fails, hard_fails, avg_frame_time, highered_delay, lowered_delay);
if (hard_measures_taken)
{
preempt_fail_old_preempt_count = std::max<u32>(preempt_fail_old_preempt_count, frame_times.back().preempt_count);
}
else if (preempt_fail_old_preempt_count)
{
perf_log.error("Lowering current preemption count significantly due to a performance drop, if this issue persists frequantly consider lowering max preemptions count to 'new-count' or lower. (old-count=%d, new-count=%d)", preempt_fail_old_preempt_count, preempt_count);
preempt_fail_old_preempt_count = 0;
}
const u64 tsc_diff = (current_tsc - frame_times.back().tsc);
// Set an upper limit so that a backoff takes effect if there is a sudden performance drop
// Leave the final 6% of the frame free of yields to significantly reduce the risk of stutter
lv2_obj::set_yield_frequency(preempt_count ? tsc_diff / preempt_count : 0, current_tsc + (tsc_diff * 94 / 100));
frame_times.pop_front();
}
else
{
lv2_obj::set_yield_frequency(0, 0);
}
frame_times.push_back(frame_time_t{preempt_count, current_time, current_tsc});
}
}
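
To make the hand-off at the end of evaluate_cpu_usage_reduction_limits() concrete, a small worked example with made-up numbers: a previous frame measured as 40,000,000 TSC ticks and a preemption budget of 50. Only the arithmetic mirrors the set_yield_frequency() call above; every value here is illustrative.

#include <cstdint>
#include <cstdio>

int main()
{
    const std::uint64_t tsc_diff      = 40'000'000;  // TSC ticks spent on the previous frame (assumed)
    const std::uint64_t preempt_count = 50;          // budget picked by the control loop above (assumed)
    const std::uint64_t current_tsc   = 900'000'000; // arbitrary "now"

    // Mirrors lv2_obj::set_yield_frequency(tsc_diff / preempt_count, current_tsc + tsc_diff * 94 / 100):
    const std::uint64_t freq        = tsc_diff / preempt_count;           // one preemption every 800'000 ticks
    const std::uint64_t max_allowed = current_tsc + tsc_diff * 94 / 100;  // the final 6% of the frame never yields

    std::printf("preempt every %llu ticks, stop after tsc %llu\n",
                static_cast<unsigned long long>(freq),
                static_cast<unsigned long long>(max_allowed));
    return 0;
}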

View File

@@ -689,6 +689,17 @@ namespace rsx
std::queue<desync_fifo_cmd_info> recovered_fifo_cmds_history;
struct frame_time_t
{
u64 preempt_count;
u64 timestamp;
u64 tsc;
};
std::deque<frame_time_t> frame_times;
u32 prevent_preempt_increase_tickets = 0;
u32 preempt_fail_old_preempt_count = 0;
atomic_t<s32> async_tasks_pending{ 0 };
reports::conditional_render_eval cond_render_ctrl;
@@ -793,6 +804,7 @@ namespace rsx
shared_mutex m_mtx_task;
void handle_emu_flip(u32 buffer);
void evaluate_cpu_usage_reduction_limits();
void handle_invalidated_memory_range();
public:

View File

@@ -91,6 +91,8 @@ struct cfg_root : cfg::node
cfg::_int<10, 3000> clocks_scale{ this, "Clocks scale", 100 }; // Changing this from 100 (percentage) may affect game speed in unexpected ways
cfg::uint<0, 3000> spu_wakeup_delay{ this, "SPU Wake-Up Delay", 0, true };
cfg::uint<0, (1 << 6) - 1> spu_wakeup_delay_mask{ this, "SPU Wake-Up Delay Thread Mask", (1 << 6) - 1, true };
cfg::uint<0, 300> max_cpu_preempt_count_per_frame{ this, "Max CPU Preempt Count", 0, true };
cfg::_bool allow_rsx_cpu_preempt{ this, "Allow RSX CPU Preemptions", true, true };
#if defined (__linux__) || defined (__APPLE__)
cfg::_enum<sleep_timers_accuracy_level> sleep_timers_accuracy{ this, "Sleep Timers Accuracy", sleep_timers_accuracy_level::_as_host, true };
#else
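
For reference, a sketch of how the two new settings would appear in a configuration file, assuming the usual top-level "Core" node; the key names come from the declarations above, the values are examples only.

Core:
  Max CPU Preempt Count: 50
  Allow RSX CPU Preemptions: true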

View File

@@ -47,6 +47,7 @@ enum class emu_settings_type
FixupPPUVNAN,
AccuratePPUVNAN,
AccuratePPUFPCC,
MaxPreemptCount,
// Graphics
Renderer,
@@ -221,6 +222,7 @@ inline static const QMap<emu_settings_type, cfg_location> settings_location =
{ emu_settings_type::FixupPPUVNAN, { "Core", "PPU Fixup Vector NaN Values"}},
{ emu_settings_type::AccuratePPUVNAN, { "Core", "PPU Accurate Vector NaN Values"}},
{ emu_settings_type::AccuratePPUFPCC, { "Core", "PPU Set FPCC Bits"}},
{ emu_settings_type::MaxPreemptCount, { "Core", "Max CPU Preempt Count"}},
// Graphics Tab
{ emu_settings_type::Renderer, { "Video", "Renderer"}},

View File

@@ -1409,6 +1409,24 @@ settings_dialog::settings_dialog(std::shared_ptr<gui_settings> gui_settings, std
ui->clockScale->setValue(clocks_scale_def);
});
EnhanceSlider(emu_settings_type::MaxPreemptCount, ui->maxPreemptCount, ui->preemptText, tr(reinterpret_cast<const char*>(u8"%0"), "Max CPU preempt count"));
SubscribeTooltip(ui->gb_max_preempt_count, tooltips.settings.max_cpu_preempt);
#ifdef _WIN32
// Windows' thread execution slice is much larger than on other platforms
SnapSlider(ui->maxPreemptCount, 5);
ui->maxPreemptCount->setPageStep(20);
#else
SnapSlider(ui->maxPreemptCount, 10);
ui->maxPreemptCount->setPageStep(50);
#endif
const int preempt_def = stoi(m_emu_settings->GetSettingDefault(emu_settings_type::MaxPreemptCount));
connect(ui->preemptReset, &QAbstractButton::clicked, [preempt_def, this]()
{
ui->maxPreemptCount->setValue(preempt_def);
});
if (!game) // Prevent users from doing dumb things
{
ui->gb_vblank->setDisabled(true);

View File

@@ -193,6 +193,55 @@
</layout>
</widget>
</item>
<item>
<widget class="QGroupBox" name="gb_max_preempt_count">
<property name="sizePolicy">
<sizepolicy hsizetype="Preferred" vsizetype="Minimum">
<horstretch>0</horstretch>
<verstretch>0</verstretch>
</sizepolicy>
</property>
<property name="title">
<string>Max Power Saving CPU-preemptions</string>
</property>
<layout class="QVBoxLayout" name="gb_max_preempt_layout">
<item>
<widget class="QSlider" name="maxPreemptCount">
<property name="sizePolicy">
<sizepolicy hsizetype="Preferred" vsizetype="Minimum">
<horstretch>0</horstretch>
<verstretch>0</verstretch>
</sizepolicy>
</property>
<property name="orientation">
<enum>Qt::Horizontal</enum>
</property>
</widget>
</item>
<item>
<layout class="QHBoxLayout" name="maxPreemptLayout" stretch="0,0">
<item>
<widget class="QLabel" name="preemptText">
<property name="text">
<string>0</string>
</property>
<property name="alignment">
<set>Qt::AlignCenter</set>
</property>
</widget>
</item>
<item>
<widget class="QPushButton" name="preemptReset">
<property name="text">
<string>Reset</string>
</property>
</widget>
</item>
</layout>
</item>
</layout>
</widget>
</item>
<item>
<spacer name="coreTabMiddleLayoutSpacer">
<property name="orientation">

View File

@@ -85,6 +85,7 @@ public:
const QString fixup_ppuvnan = tr("Fixup NaN results in vector instructions in PPU backends.\nIf unsure, do not modify this setting.");
const QString accurate_ppuvnan = tr("Accurately set NaN results in vector instructions in PPU backends.\nIf unsure, do not modify this setting.");
const QString accurate_ppufpcc = tr("Accurately set FPCC Bits in PPU backends.\nIf unsure, do not modify this setting.");
const QString max_cpu_preempt = tr("Reduces CPU usage and power consumption; on mobile devices it improves battery life. (0 means disabled)\nHigher values have a more pronounced effect but may cause audio or performance issues. A value of 50 or less is recommended.\nThis option forces an FPS limit because it only activates while the framerate is stable.\nThe lighter the game is on the hardware, the more power is saved (until the preemption count cap is reached).");
// debug