PPU/LV2: Implement HW threads switching delay when signaling higher priority threads

On a real PS3 (it seems), when a thread with a higher priority than the caller is signaled and there is available space on the running queue for the other hardware thread to start, the signaled thread is prioritized: the caller's hardware thread switches instantly to the new thread's code while signaling the other hardware thread to execute the caller's code. This results in a delay for the caller after such a thread is signaled.
2024-11-22 18:53:28 +01:00 · 2024-02-09 19:45:07 +02:00 · 2024-02-09 19:45:07 +02:00 · ec59f4d37e
commit ec59f4d37e
parent 2c03878c3b
4 changed files with 59 additions and 7 deletions
--- a/rpcs3/Emu/CPU/CPUThread.cpp
+++ b/rpcs3/Emu/CPU/CPUThread.cpp
@ -964,6 +964,14 @@ bool cpu_thread::check_state() noexcept
 				}
 				else if (auto ppu = try_get<ppu_thread>())
 				{
+					if (u32 usec = ppu->hw_sleep_time)
+					{
+						thread_ctrl::wait_for_accurate(usec);
+						ppu->hw_sleep_time = 0;
+						ppu->raddr = 0; // Also lose reservation if there is any (reservation is unsaved on hw thread switch)
+						continue;
+					}
+
 					if (ppu->raddr && ppu->rtime == vm::reservation_acquire(ppu->raddr))
 					{
 						// Same
--- a/rpcs3/Emu/Cell/PPUThread.h
+++ b/rpcs3/Emu/Cell/PPUThread.h
@ -276,6 +276,7 @@ public:
 	const u32 stack_addr; // Stack address

 	atomic_t<ppu_join_status> joiner; // Joining thread or status
+	u32 hw_sleep_time = 0; // Very specific delay for hardware threads switching, see lv2_obj::awake_unlocked for more details

 	lf_fifo<atomic_t<cmd64>, 127> cmd_queue; // Command queue for asynchronous operations.

--- a/rpcs3/Emu/Cell/lv2/lv2.cpp
+++ b/rpcs3/Emu/Cell/lv2/lv2.cpp
@ -1662,12 +1662,16 @@ bool lv2_obj::awake_unlocked(cpu_thread* cpu, s32 prio)
 	// Yield changed the queue before
 	bool changed_queue = prio == yield_cmd;

+	s32 lowest_new_priority = smax;
+	const bool has_free_hw_thread_space = count_non_sleeping_threads().onproc_count < g_cfg.core.ppu_threads + 0u;
+
 	if (cpu && prio != yield_cmd)
 	{
 		// Emplace current thread
 		if (emplace_thread(cpu))
 		{
 			changed_queue = true;
+			lowest_new_priority = std::min<s32>(static_cast<ppu_thread*>(cpu)->prio.load().prio, lowest_new_priority);
 		}
 	}
 	else for (const auto _cpu : g_to_awake)
@ -1676,13 +1680,15 @@ bool lv2_obj::awake_unlocked(cpu_thread* cpu, s32 prio)
 		if (emplace_thread(_cpu))
 		{
 			changed_queue = true;
+			lowest_new_priority = std::min<s32>(static_cast<ppu_thread*>(_cpu)->prio.load().prio, lowest_new_priority);
 		}
 	}

 	auto target = +g_ppu;
+	usz i = 0;

 	// Suspend threads if necessary
-	for (usz i = 0, thread_count = g_cfg.core.ppu_threads; target; target = target->next_ppu, i++)
+	for (usz thread_count = g_cfg.core.ppu_threads; target; target = target->next_ppu, i++)
 	{
 		if (i >= thread_count && cpu_flag::suspend - target->state)
 		{
@ -1709,6 +1715,27 @@ bool lv2_obj::awake_unlocked(cpu_thread* cpu, s32 prio)
 		}
 	}

+	// On a real PS3 (it seems), when a thread with a higher priority than the caller is signaled and
+	// there is available space on the running queue for the other hardware thread to start,
+	// the signaled thread is prioritized: the caller's hardware thread switches instantly to the new thread's code
+	// while signaling the other hardware thread to execute the caller's code.
+	// This results in a delay for the caller after such a thread is signaled.
+
+	if (current_ppu && changed_queue && has_free_hw_thread_space)
+	{
+		if (current_ppu->prio.load().prio > lowest_new_priority)
+		{
+			if (!current_ppu->state.test_and_set(cpu_flag::yield) || current_ppu->hw_sleep_time != 0)
+			{
+				current_ppu->hw_sleep_time += 35; // Seems like 35us after extensive testing
+			}
+			else
+			{
+				current_ppu->hw_sleep_time = 30000; // In addition to another flag's use (TODO: Refactor and clean this)
+			}
+		}
+	}
+
 	return changed_queue;
 }

@ -1920,19 +1947,24 @@ bool lv2_obj::is_scheduler_ready()
 	return g_to_sleep.empty();
 }

-bool lv2_obj::has_ppus_in_running_state()
+ppu_non_sleeping_count_t lv2_obj::count_non_sleeping_threads()
 {
+	ppu_non_sleeping_count_t total{}; // Value-initialized: has_running = false, onproc_count = 0

 	auto target = atomic_storage<ppu_thread*>::load(g_ppu);

-	for (usz i = 0, thread_count = g_cfg.core.ppu_threads; target; target = atomic_storage<ppu_thread*>::load(target->next_ppu), i++)
+	for (usz thread_count = g_cfg.core.ppu_threads; target; target = atomic_storage<ppu_thread*>::load(target->next_ppu))
 	{
-		if (i >= thread_count)
+		if (total.onproc_count == thread_count) // Configured limit already counted and one more thread is still linked in the list
 		{
-			return true;
+			total.has_running = true; // Same condition under which the old has_ppus_in_running_state() returned true
+			break; // Stop walking the list: onproc_count never exceeds thread_count
 		}
+
+		total.onproc_count++; // Count this list entry (at most thread_count entries are counted)
 	}

-	return false;
+	return total;
 }

 void lv2_obj::set_yield_frequency(u64 freq, u64 max_allowed_tsc)
--- a/rpcs3/Emu/Cell/lv2/sys_sync.h
+++ b/rpcs3/Emu/Cell/lv2/sys_sync.h
@ -60,6 +60,12 @@ enum

 enum ppu_thread_status : u32;

+struct ppu_non_sleeping_count_t
+{
+	bool has_running; // True when the thread list holds more entries than g_cfg.core.ppu_threads; only a flag, no actual count, for optimization's sake
+	u32 onproc_count; // Number of thread list entries counted, capped at g_cfg.core.ppu_threads
+};
+
 namespace vm
 {
 	extern u8 g_reservations[65536 / 128 * 64];
@ -280,7 +286,12 @@ public:
 	static bool is_scheduler_ready();

 	// Must be called under IDM lock
-	static bool has_ppus_in_running_state();
+	static ppu_non_sleeping_count_t count_non_sleeping_threads();
+
+	static inline bool has_ppus_in_running_state() noexcept
+	{
+		return count_non_sleeping_threads().has_running != 0; // Keeps the old boolean API on top of count_non_sleeping_threads()
+	}

 	static void set_yield_frequency(u64 freq, u64 max_allowed_tsx);