Mirror of https://github.com/RPCS3/rpcs3.git
SPU: multithread compilation
Allow parallel compilation of SPU code, both at startup and at runtime.
Remove the 'SPU Shared Runtime' option (it became obsolete).
Refactor the spu_runtime class (now common to ASMJIT and LLVM).
Implement SPU ubertrampoline generation in raw assembly (LLVM).
Minor improvement of balanced_wait_until<> and balanced_awaken<>.
Make the JIT MemoryManager2 shared (global).
Fix a wrong assertion in cond_variable.
Parent: 8d5d44141e
Commit: 4f152ad126
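The first change listed above ("Allow parallel compilation of SPU code") is realized at startup by handing the cached function list to a pool of worker threads that claim work through a shared atomic index (see the spu_cache::initialize() hunks in the diff below). The stand-alone sketch that follows illustrates only that claiming pattern; the plain std::thread workers and the compile_one() placeholder are illustrative assumptions, not the emulator's actual named_thread and recompiler types.

#include <algorithm>
#include <atomic>
#include <cstddef>
#include <cstdio>
#include <thread>
#include <vector>

// Hypothetical stand-in for compiling one cached SPU function.
static void compile_one(const std::vector<unsigned>& func)
{
    std::printf("compiled function starting at 0x%x\n", func.empty() ? 0u : func[0]);
}

int main()
{
    // Pretend this list came from the on-disk SPU cache.
    std::vector<std::vector<unsigned>> func_list(100, std::vector<unsigned>{0x1234});

    // Shared cursor: each worker claims the next un-compiled function index.
    std::atomic<std::size_t> fnext{0};

    const unsigned thread_count = std::max(1u, std::thread::hardware_concurrency());
    std::vector<std::thread> workers;

    for (unsigned i = 0; i < thread_count; i++)
    {
        workers.emplace_back([&]
        {
            // fetch-and-increment hands out each index exactly once
            for (std::size_t idx = fnext++; idx < func_list.size(); idx = fnext++)
            {
                compile_one(func_list[idx]);
            }
        });
    }

    for (auto& t : workers)
    {
        t.join();
    }
}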
@@ -95,6 +95,12 @@ static void* const s_memory = []() -> void*
 return utils::memory_reserve(s_memory_size);
 }();
 
+// Reserve 2G of memory, should replace previous area for ASLR compatibility
+static void* const s_memory2 = utils::memory_reserve(0x80000000);
+
+static u64 s_code_pos = 0;
+static u64 s_data_pos = 0;
+
 static void* s_next = s_memory;
 
 #ifdef _WIN32
@@ -129,6 +135,11 @@ extern void jit_finalize()
 utils::memory_decommit(s_memory, s_memory_size);
 
 s_next = s_memory;
+
+utils::memory_decommit(s_memory2, 0x80000000);
+
+s_code_pos = 0;
+s_data_pos = 0;
 }
 
 // Helper class
@@ -311,24 +322,25 @@ struct MemoryManager : llvm::RTDyldMemoryManager
 // Simple memory manager
 struct MemoryManager2 : llvm::RTDyldMemoryManager
 {
-// Reserve 2 GiB
-void* const m_memory = utils::memory_reserve(0x80000000);
+// Patchwork again...
+void* const m_memory = s_memory2;
 
 u8* const m_code = static_cast<u8*>(m_memory) + 0x00000000;
 u8* const m_data = static_cast<u8*>(m_memory) + 0x40000000;
 
-u64 m_code_pos = 0;
-u64 m_data_pos = 0;
+u64& m_code_pos = s_code_pos;
+u64& m_data_pos = s_data_pos;
 
 MemoryManager2() = default;
 
 ~MemoryManager2() override
 {
-utils::memory_release(m_memory, 0x80000000);
 }
 
 u8* allocateCodeSection(std::uintptr_t size, uint align, uint sec_id, llvm::StringRef sec_name) override
 {
+std::lock_guard lock(s_mutex);
+
 // Simple allocation
 const u64 old = m_code_pos;
 const u64 pos = ::align(m_code_pos, align);
@@ -349,12 +361,20 @@ struct MemoryManager2 : llvm::RTDyldMemoryManager
 utils::memory_commit(m_code + olda, newa - olda, utils::protection::wx);
 }
 
+if (!sec_id && sec_name.empty())
+{
+// Special case: don't log
+return m_code + pos;
+}
+
 LOG_NOTICE(GENERAL, "LLVM: Code section %u '%s' allocated -> %p (size=0x%x, align=0x%x)", sec_id, sec_name.data(), m_code + pos, size, align);
 return m_code + pos;
 }
 
 u8* allocateDataSection(std::uintptr_t size, uint align, uint sec_id, llvm::StringRef sec_name, bool is_ro) override
 {
+std::lock_guard lock(s_mutex);
+
 // Simple allocation
 const u64 old = m_data_pos;
 const u64 pos = ::align(m_data_pos, align);
@@ -642,33 +662,12 @@ u64 jit_compiler::get(const std::string& name)
 return m_engine->getGlobalValueAddress(name);
 }
 
-std::unordered_map<std::string, u64> jit_compiler::add(std::unordered_map<std::string, std::string> data)
+u8* jit_compiler::alloc(u32 size)
 {
-// Lock memory manager
-std::lock_guard lock(s_mutex);
+// Dummy memory manager object
+MemoryManager2 mm;
 
-std::unordered_map<std::string, u64> result;
-
-std::size_t size = 0;
-
-for (auto&& pair : data)
-{
-size += ::align(pair.second.size(), 16);
-}
-
-utils::memory_commit(s_next, size, utils::protection::wx);
-std::memset(s_next, 0xc3, ::align(size, 4096));
-
-for (auto&& pair : data)
-{
-std::memcpy(s_next, pair.second.data(), pair.second.size());
-result.emplace(pair.first, (u64)s_next);
-s_next = (void*)::align((u64)s_next + pair.second.size(), 16);
-}
-
-s_next = (void*)::align((u64)s_next, 4096);
-
-return result;
+return mm.allocateCodeSection(size, 16, 0, {});
 }
 
 #endif
@@ -61,6 +61,7 @@ FT build_function_asm(F&& builder)
 
 #include <memory>
 #include <string>
+#include <string_view>
 #include <unordered_map>
 
 #include "types.h"
@@ -129,8 +130,8 @@ public:
 // Get compiled function address
 u64 get(const std::string& name);
 
-// Add functions directly to the memory manager (name -> code)
-static std::unordered_map<std::string, u64> add(std::unordered_map<std::string, std::string>);
+// Allocate writable executable memory (alignment is assumed 16)
+static u8* alloc(u32 size);
 
 // Get CPU info
 static std::string cpu(const std::string& _cpu);
@@ -10,7 +10,7 @@
 
 bool cond_variable::imp_wait(u32 _old, u64 _timeout) noexcept
 {
-verify("cond_variable overflow" HERE), (_old & 0xffff) == 0; // Very unlikely: it requires 65535 distinct threads to wait simultaneously
+verify("cond_variable overflow" HERE), (_old & 0xffff) != 0xffff; // Very unlikely: it requires 65535 distinct threads to wait simultaneously
 
 return balanced_wait_until(m_value, _timeout, [&](u32& value, auto... ret) -> int
 {
@@ -42,7 +42,8 @@ bool cond_variable::imp_wait(u32 _old, u64 _timeout) noexcept
 
 void cond_variable::imp_wake(u32 _count) noexcept
 {
-balanced_awaken(m_value, m_value.atomic_op([&](u32& value) -> u32
+// TODO (notify_one)
+balanced_awaken<true>(m_value, m_value.atomic_op([&](u32& value) -> u32
 {
 // Subtract already signaled number from total amount of waiters
 const u32 can_sig = (value & 0xffff) - (value >> 16);
@@ -266,7 +267,7 @@ void cond_x16::imp_notify() noexcept
 return;
 }
 
-balanced_awaken(m_cvx16, utils::popcnt16(wait_mask));
+balanced_awaken<true>(m_cvx16, utils::popcnt16(wait_mask));
 }
 
 bool lf_queue_base::wait(u64 _timeout)
@@ -186,7 +186,7 @@ bool balanced_wait_until(atomic_t<T>& var, u64 usec_timeout, Pred&& pred)
 {
 if (OptWaitOnAddress(&var, &value, sizeof(T), is_inf ? INFINITE : usec_timeout / 1000))
 {
-if (!test_pred(value) && !test_pred(value, nullptr))
+if (!test_pred(value, nullptr))
 {
 return false;
 }
@@ -220,7 +220,7 @@ bool balanced_wait_until(atomic_t<T>& var, u64 usec_timeout, Pred&& pred)
 return true;
 }
 
-if (!test_pred(value) && !test_pred(value, nullptr))
+if (!test_pred(value, nullptr))
 {
 // Stolen notification: restore balance
 NtReleaseKeyedEvent(nullptr, &var, false, nullptr);
@@ -237,7 +237,7 @@ bool balanced_wait_until(atomic_t<T>& var, u64 usec_timeout, Pred&& pred)
 {
 if (futex(&var, FUTEX_WAIT_PRIVATE, static_cast<u32>(value), is_inf ? nullptr : &timeout) == 0)
 {
-if (!test_pred(value) && !test_pred(value, nullptr))
+if (!test_pred(value, nullptr))
 {
 return false;
 }
@@ -257,7 +257,7 @@ bool balanced_wait_until(atomic_t<T>& var, u64 usec_timeout, Pred&& pred)
 #endif
 }
 
-template <typename T>
+template <bool All = false, typename T>
 void balanced_awaken(atomic_t<T>& var, u32 weight)
 {
 static_assert(sizeof(T) == 4 || sizeof(T) == 8);
@@ -265,11 +265,13 @@ void balanced_awaken(atomic_t<T>& var, u32 weight)
 #ifdef _WIN32
 if (OptWaitOnAddress)
 {
-if (weight > 1)
+if (All || weight > 3)
 {
 OptWakeByAddressAll(&var);
+return;
 }
-else if (weight == 1)
+
+for (u32 i = 0; i < weight; i++)
 {
 OptWakeByAddressSingle(&var);
 }
@@ -282,9 +284,9 @@ void balanced_awaken(atomic_t<T>& var, u32 weight)
 NtReleaseKeyedEvent(nullptr, &var, false, nullptr);
 }
 #else
-if (weight)
+if (All || weight)
 {
-futex(&var, FUTEX_WAKE_PRIVATE, std::min<u32>(INT_MAX, weight));
+futex(&var, FUTEX_WAKE_PRIVATE, All ? INT_MAX : std::min<u32>(INT_MAX, weight));
 }
 
 return;
@@ -32,33 +32,8 @@ std::unique_ptr<spu_recompiler_base> spu_recompiler_base::make_asmjit_recompiler
 return std::make_unique<spu_recompiler>();
 }
 
-spu_runtime::spu_runtime()
-{
-m_cache_path = fxm::check_unlocked<ppu_module>()->cache;
-
-if (g_cfg.core.spu_debug)
-{
-fs::file(m_cache_path + "spu.log", fs::rewrite);
-}
-
-LOG_SUCCESS(SPU, "SPU Recompiler Runtime (ASMJIT) initialized...");
-
-// Initialize lookup table
-for (auto& v : m_dispatcher)
-{
-v.raw() = &spu_recompiler_base::dispatch;
-}
-
-// Initialize "empty" block
-m_map[std::vector<u32>()] = &spu_recompiler_base::dispatch;
-}
-
 spu_recompiler::spu_recompiler()
 {
-if (!g_cfg.core.spu_shared_runtime)
-{
-m_spurt = std::make_shared<spu_runtime>();
-}
 }
 
 void spu_recompiler::init()
@@ -68,6 +43,7 @@ void spu_recompiler::init()
 {
 m_cache = fxm::get<spu_cache>();
 m_spurt = fxm::get_always<spu_runtime>();
+m_asmrt = m_spurt->get_asmjit_rt();
 }
 }
 
@@ -83,19 +59,22 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& func_rv)
 {
 init();
 
-// Don't lock without shared runtime
-std::unique_lock lock(m_spurt->m_mutex, std::defer_lock);
-
-if (g_cfg.core.spu_shared_runtime)
-{
-lock.lock();
-}
+std::unique_lock lock(m_spurt->m_mutex);
 
 // Try to find existing function, register new one if necessary
 const auto fn_info = m_spurt->m_map.emplace(std::move(func_rv), nullptr);
 
 auto& fn_location = fn_info.first->second;
 
+if (!fn_location && !fn_info.second)
+{
+// Wait if already in progress
+while (!fn_location)
+{
+m_spurt->m_cond.wait(lock);
+}
+}
+
 if (fn_location)
 {
 return fn_location;
@@ -103,6 +82,8 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& func_rv)
 
 auto& func = fn_info.first->first;
 
+lock.unlock();
+
 using namespace asmjit;
 
 SPUDisAsm dis_asm(CPUDisAsm_InterpreterMode);
@@ -124,7 +105,7 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& func_rv)
 }
 
 CodeHolder code;
-code.init(m_spurt->m_jitrt.getCodeInfo());
+code.init(m_asmrt->getCodeInfo());
 code._globalHints = asmjit::CodeEmitter::kHintOptimizedAlign;
 
 X86Assembler compiler(&code);
@@ -861,14 +842,11 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& func_rv)
 // Compile and get function address
 spu_function_t fn;
 
-if (m_spurt->m_jitrt.add(&fn, &code))
+if (m_asmrt->add(&fn, &code))
 {
 LOG_FATAL(SPU, "Failed to build a function");
 }
 
-// Register function
-fn_location = fn;
-
 if (g_cfg.core.spu_debug)
 {
 // Add ASMJIT logs
@@ -885,6 +863,11 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& func_rv)
 m_cache->add(func);
 }
 
+lock.lock();
+
+// Register function (possibly temporarily)
+fn_location = fn;
+
 // Generate a dispatcher (übertrampoline)
 std::vector<u32> addrv{func[0]};
 const auto beg = m_spurt->m_map.lower_bound(addrv);
@@ -899,19 +882,11 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& func_rv)
 else
 {
 CodeHolder code;
-code.init(m_spurt->m_jitrt.getCodeInfo());
+code.init(m_asmrt->getCodeInfo());
 
 X86Assembler compiler(&code);
 this->c = &compiler;
 
-if (g_cfg.core.spu_debug)
-{
-// Set logger
-code.setLogger(&logger);
-}
-
-compiler.comment("\n\nTrampoline:\n\n");
-
 struct work
 {
 u32 size;
@@ -1110,7 +1085,7 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& func_rv)
 
 spu_function_t tr;
 
-if (m_spurt->m_jitrt.add(&tr, &code))
+if (m_asmrt->add(&tr, &code))
 {
 LOG_FATAL(SPU, "Failed to build a trampoline");
 }
@@ -1118,6 +1093,9 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& func_rv)
 m_spurt->m_dispatcher[func[0] / 4] = tr;
 }
 
+lock.unlock();
+m_spurt->m_cond.notify_all();
+
 return fn;
 }
 
@@ -1,33 +1,10 @@
 #pragma once
 
 #include "Utilities/JIT.h"
-#include "Utilities/mutex.h"
 #include "SPURecompiler.h"
 
 #include <functional>
 
-// SPU ASMJIT Runtime object (global)
-class spu_runtime
-{
-shared_mutex m_mutex;
-
-asmjit::JitRuntime m_jitrt;
-
-// All functions
-std::map<std::vector<u32>, spu_function_t> m_map;
-
-// All dispatchers
-std::array<atomic_t<spu_function_t>, 0x10000> m_dispatcher;
-
-// Debug module output location
-std::string m_cache_path;
-
-friend class spu_recompiler;
-
-public:
-spu_runtime();
-};
-
 // SPU ASMJIT Recompiler
 class spu_recompiler : public spu_recompiler_base
 {
@@ -43,6 +20,9 @@ public:
 virtual spu_function_t compile(std::vector<u32>&&) override;
 
 private:
+// ASMJIT runtime
+asmjit::JitRuntime* m_asmrt;
+
 // emitter:
 asmjit::X86Assembler* c;
 
@@ -24,7 +24,7 @@ const spu_decoder<spu_iname> s_spu_iname;
 extern u64 get_timebased_time();
 
 spu_cache::spu_cache(const std::string& loc)
-: m_file(loc, fs::read + fs::write + fs::create)
+: m_file(loc, fs::read + fs::write + fs::create + fs::append)
 {
 }
 
@@ -76,18 +76,22 @@ void spu_cache::add(const std::vector<u32>& func)
 return;
 }
 
-be_t<u32> size = ::size32(func) - 1;
-be_t<u32> addr = func[0];
-m_file.write(size);
-m_file.write(addr);
-m_file.write(func.data() + 1, func.size() * 4 - 4);
+// Allocate buffer
+const auto buf = std::make_unique<be_t<u32>[]>(func.size() + 1);
+
+buf[0] = ::size32(func) - 1;
+buf[1] = func[0];
+std::memcpy(buf.get() + 2, func.data() + 1, func.size() * 4 - 4);
+
+// Append data
+m_file.write(buf.get(), func.size() * 4 + 4);
 }
 
 void spu_cache::initialize()
 {
 const std::string ppu_cache = Emu.PPUCache();
 
-if (ppu_cache.empty() || !g_cfg.core.spu_shared_runtime)
+if (ppu_cache.empty())
 {
 return;
 }
@@ -105,30 +109,34 @@ void spu_cache::initialize()
 
 // Read cache
 auto func_list = cache->get();
+atomic_t<std::size_t> fnext{};
 
-// Recompiler instance for cache initialization
-std::unique_ptr<spu_recompiler_base> compiler;
+// Initialize compiler instances for parallel compilation
+u32 max_threads = static_cast<u32>(g_cfg.core.llvm_threads);
+u32 thread_count = max_threads > 0 ? std::min(max_threads, std::thread::hardware_concurrency()) : std::thread::hardware_concurrency();
+std::vector<std::unique_ptr<spu_recompiler_base>> compilers{thread_count};
 
-if (g_cfg.core.spu_decoder == spu_decoder_type::asmjit)
+for (auto& compiler : compilers)
 {
-compiler = spu_recompiler_base::make_asmjit_recompiler();
-}
+if (g_cfg.core.spu_decoder == spu_decoder_type::asmjit)
+{
+compiler = spu_recompiler_base::make_asmjit_recompiler();
+}
+else if (g_cfg.core.spu_decoder == spu_decoder_type::llvm)
+{
+compiler = spu_recompiler_base::make_llvm_recompiler();
+}
+else
+{
+compilers.clear();
+break;
+}
 
-if (g_cfg.core.spu_decoder == spu_decoder_type::llvm)
-{
-compiler = spu_recompiler_base::make_llvm_recompiler();
-}
-
-if (compiler)
-{
 compiler->init();
 }
 
-if (compiler && !func_list.empty())
+if (compilers.size() && !func_list.empty())
 {
-// Fake LS
-std::vector<be_t<u32>> ls(0x10000);
-
 // Initialize progress dialog (wait for previous progress done)
 while (g_progr_ptotal)
 {
@@ -137,10 +145,20 @@ void spu_cache::initialize()
 
 g_progr = "Building SPU cache...";
 g_progr_ptotal += func_list.size();
+}
+
+std::deque<named_thread<std::function<void()>>> thread_queue;
+
+for (std::size_t i = 0; i < compilers.size(); i++) thread_queue.emplace_back("Worker " + std::to_string(i), [&, compiler = compilers[i].get()]()
+{
+// Fake LS
+std::vector<be_t<u32>> ls(0x10000);
 
 // Build functions
-for (auto&& func : func_list)
+for (std::size_t func_i = fnext++; func_i < func_list.size(); func_i = fnext++)
 {
+std::vector<u32>& func = func_list[func_i];
+
 if (Emu.IsStopped())
 {
 g_progr_pdone++;
@@ -185,13 +203,22 @@ void spu_cache::initialize()
 
 g_progr_pdone++;
 }
+});
 
-if (Emu.IsStopped())
-{
-LOG_ERROR(SPU, "SPU Runtime: Cache building aborted.");
-return;
+// Join all threads
+while (!thread_queue.empty())
+{
+thread_queue.pop_front();
 }
 
+if (Emu.IsStopped())
+{
+LOG_ERROR(SPU, "SPU Runtime: Cache building aborted.");
+return;
+}
+
+if (compilers.size() && !func_list.empty())
+{
 LOG_SUCCESS(SPU, "SPU Runtime: Built %u functions.", func_list.size());
 }
 
@@ -202,6 +229,317 @@ void spu_cache::initialize()
 });
 }
 
+spu_runtime::spu_runtime()
+{
+// Initialize lookup table
+for (auto& v : m_dispatcher)
+{
+v.raw() = &spu_recompiler_base::dispatch;
+}
+
+// Initialize "empty" block
+m_map[std::vector<u32>()] = &spu_recompiler_base::dispatch;
+
+// Clear LLVM output
+m_cache_path = Emu.PPUCache();
+fs::create_dir(m_cache_path + "llvm/");
+fs::remove_all(m_cache_path + "llvm/", false);
+
+if (g_cfg.core.spu_debug)
+{
+fs::file(m_cache_path + "spu.log", fs::rewrite);
+}
+
+LOG_SUCCESS(SPU, "SPU Recompiler Runtime initialized...");
+}
+
+asmjit::JitRuntime* spu_runtime::get_asmjit_rt()
+{
+std::lock_guard lock(m_mutex);
+
+m_asmjit_rts.emplace_back(std::make_unique<asmjit::JitRuntime>());
+
+return m_asmjit_rts.back().get();
+}
+
+void spu_runtime::add(std::pair<const std::vector<u32>, spu_function_t>& where, spu_function_t compiled)
+{
+std::unique_lock lock(m_mutex);
+
+// Function info
+const std::vector<u32>& func = where.first;
+
+//
+const u32 start = func[0] * (g_cfg.core.spu_block_size != spu_block_size_type::giga);
+
+// Set pointer to the compiled function
+where.second = compiled;
+
+// Generate a dispatcher (übertrampoline)
+std::vector<u32> addrv{func[0]};
+const auto beg = m_map.lower_bound(addrv);
+addrv[0] += 4;
+const auto _end = m_map.lower_bound(addrv);
+const u32 size0 = std::distance(beg, _end);
+
+if (size0 == 1)
+{
+m_dispatcher[func[0] / 4] = compiled;
+}
+else
+{
+// Allocate some writable executable memory
+#ifdef LLVM_AVAILABLE
+const auto wxptr = jit_compiler::alloc(size0 * 20);
+#else
+u8* const wxptr = new u8[size0 * 20]; // dummy
+#endif
+
+// Raw assembly pointer
+u8* raw = wxptr;
+
+struct work
+{
+u32 size;
+u32 level;
+u8* rel32;
+std::map<std::vector<u32>, spu_function_t>::iterator beg;
+std::map<std::vector<u32>, spu_function_t>::iterator end;
+};
+
+// Write jump instruction with rel32 immediate
+auto make_jump = [&](u8 op, auto target)
+{
+verify("Asm overflow" HERE), raw + 6 <= wxptr + size0 * 20;
+
+if (!target && !tr_dispatch)
+{
+// Generate a special trampoline with pause instruction
+#ifdef LLVM_AVAILABLE
+const auto trptr = jit_compiler::alloc(16);
+#else
+u8* const trptr = new u8[16]; // dummy
+#endif
+trptr[0] = 0xf3; // pause
+trptr[1] = 0x90;
+trptr[2] = 0xff; // jmp [rip]
+trptr[3] = 0x25;
+std::memset(trptr + 4, 0, 4);
+const u64 target = reinterpret_cast<u64>(&spu_recompiler_base::dispatch);
+std::memcpy(trptr + 8, &target, 8);
+tr_dispatch = reinterpret_cast<spu_function_t>(trptr);
+}
+
+// Fallback to dispatch if no target
+const u64 taddr = target ? reinterpret_cast<u64>(target) : reinterpret_cast<u64>(tr_dispatch);
+
+// Compute the distance
+const s64 rel = taddr - reinterpret_cast<u64>(raw) - (op != 0xe9 ? 6 : 5);
+
+verify(HERE), rel >= INT32_MIN, rel <= INT32_MAX;
+
+if (op != 0xe9)
+{
+// First jcc byte
+*raw++ = 0x0f;
+verify(HERE), (op >> 4) == 0x8;
+}
+
+*raw++ = op;
+
+const s32 r32 = static_cast<s32>(rel);
+
+std::memcpy(raw, &r32, 4);
+raw += 4;
+};
+
+std::vector<work> workload;
+workload.reserve(size0);
+workload.emplace_back();
+workload.back().size = size0;
+workload.back().level = 1;
+workload.back().rel32 = 0;
+workload.back().beg = beg;
+workload.back().end = _end;
+
+for (std::size_t i = 0; i < workload.size(); i++)
+{
+// Get copy of the workload info
+work w = workload[i];
+
+// Split range in two parts
+auto it = w.beg;
+auto it2 = w.beg;
+u32 size1 = w.size / 2;
+u32 size2 = w.size - size1;
+std::advance(it2, w.size / 2);
+
+while (true)
+{
+it = it2;
+size1 = w.size - size2;
+
+if (w.level >= w.beg->first.size())
+{
+// Cannot split: smallest function is a prefix of bigger ones (TODO)
+break;
+}
+
+const u32 x1 = w.beg->first.at(w.level);
+
+if (!x1)
+{
+// Cannot split: some functions contain holes at this level
+w.level++;
+continue;
+}
+
+// Adjust ranges (forward)
+while (it != w.end && x1 == it->first.at(w.level))
+{
+it++;
+size1++;
+}
+
+if (it == w.end)
+{
+// Cannot split: words are identical within the range at this level
+w.level++;
+}
+else
+{
+size2 = w.size - size1;
+break;
+}
+}
+
+if (w.rel32)
+{
+// Patch rel32 linking it to the current location if necessary
+const s32 r32 = ::narrow<s32>(raw - w.rel32, HERE);
+std::memcpy(w.rel32 - 4, &r32, 4);
+}
+
+if (w.level >= w.beg->first.size())
+{
+// If functions cannot be compared, assume smallest function
+LOG_ERROR(SPU, "Trampoline simplified at 0x%x (level=%u)", func[0], w.level);
+make_jump(0xe9, w.beg->second); // jmp rel32
+continue;
+}
+
+// Value for comparison
+const u32 x = it->first.at(w.level);
+
+// Adjust ranges (backward)
+while (true)
+{
+it--;
+
+if (it->first.at(w.level) != x)
+{
+it++;
+break;
+}
+
+verify(HERE), it != w.beg;
+size1--;
+size2++;
+}
+
+// Emit 32-bit comparison: cmp [ls+addr], imm32
+verify("Asm overflow" HERE), raw + 10 <= wxptr + size0 * 20;
+const u32 cmp_lsa = start + (w.level - 1) * 4;
+*raw++ = 0x81;
+#ifdef _WIN32
+*raw++ = 0xba;
+#else
+*raw++ = 0xbe;
+#endif
+std::memcpy(raw, &cmp_lsa, 4);
+std::memcpy(raw + 4, &x, 4);
+raw += 8;
+
+// Low subrange target
+if (size1 == 1)
+{
+make_jump(0x82, w.beg->second); // jb rel32
+}
+else
+{
+make_jump(0x82, raw); // jb rel32 (stub)
+workload.push_back(w);
+workload.back().end = it;
+workload.back().size = size1;
+workload.back().rel32 = raw;
+}
+
+// Second subrange target
+if (size2 == 1)
+{
+make_jump(0xe9, it->second); // jmp rel32
+}
+else
+{
+it2 = it;
+
+// Select additional midrange for equality comparison
+while (it2 != w.end && it2->first.at(w.level) == x)
+{
+size2--;
+it2++;
+}
+
+if (it2 != w.end)
+{
+// High subrange target
+if (size2 == 1)
+{
+make_jump(0x87, it2->second); // ja rel32
+}
+else
+{
+make_jump(0x87, raw); // ja rel32 (stub)
+workload.push_back(w);
+workload.back().beg = it2;
+workload.back().size = size2;
+workload.back().rel32 = raw;
+}
+
+const u32 size3 = w.size - size1 - size2;
+
+if (size3 == 1)
+{
+make_jump(0xe9, it->second); // jmp rel32
+}
+else
+{
+make_jump(0xe9, raw); // jmp rel32 (stub)
+workload.push_back(w);
+workload.back().beg = it;
+workload.back().end = it2;
+workload.back().size = size3;
+workload.back().rel32 = raw;
+}
+}
+else
+{
+make_jump(0xe9, raw); // jmp rel32 (stub)
+workload.push_back(w);
+workload.back().beg = it;
+workload.back().size = w.size - size1;
+workload.back().rel32 = raw;
+}
+}
+}
+
+m_dispatcher[func[0] / 4] = reinterpret_cast<spu_function_t>(reinterpret_cast<u64>(wxptr));
+}
+
+lock.unlock();
+m_cond.notify_all();
+}
+
 spu_recompiler_base::spu_recompiler_base()
 {
 }
@@ -1491,55 +1829,14 @@ void spu_recompiler_base::dump(std::string& out)
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/IPO.h"
 #include "llvm/Transforms/Vectorize.h"
-#include "Utilities/JIT.h"
 
-class spu_llvm_runtime
-{
-shared_mutex m_mutex;
-
-// All functions
-std::map<std::vector<u32>, spu_function_t> m_map;
-
-// All dispatchers
-std::array<atomic_t<spu_function_t>, 0x10000> m_dispatcher;
-
-// JIT instance
-jit_compiler m_jit{{}, jit_compiler::cpu(g_cfg.core.llvm_cpu)};
-
-// Debug module output location
-std::string m_cache_path;
-
-friend class spu_llvm_recompiler;
-
-public:
-spu_llvm_runtime()
-{
-// Initialize lookup table
-for (auto& v : m_dispatcher)
-{
-v.raw() = &spu_recompiler_base::dispatch;
-}
-
-// Initialize "empty" block
-m_map[std::vector<u32>()] = &spu_recompiler_base::dispatch;
-
-// Clear LLVM output
-m_cache_path = Emu.PPUCache();
-fs::create_dir(m_cache_path + "llvm/");
-fs::remove_all(m_cache_path + "llvm/", false);
-
-if (g_cfg.core.spu_debug)
-{
-fs::file(m_cache_path + "spu.log", fs::rewrite);
-}
-
-LOG_SUCCESS(SPU, "SPU Recompiler Runtime (LLVM) initialized...");
-}
-};
-
 class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator
 {
-std::shared_ptr<spu_llvm_runtime> m_spurt;
+// SPU Runtime Instance
+std::shared_ptr<spu_runtime> m_spurt;
+
+// JIT Instance
+jit_compiler m_jit{{}, jit_compiler::cpu(g_cfg.core.llvm_cpu)};
 
 // Current function (chunk)
 llvm::Function* m_function;
@@ -2239,11 +2536,6 @@ public:
 : spu_recompiler_base()
 , cpu_translator(nullptr, false)
 {
-if (g_cfg.core.spu_shared_runtime)
-{
-// TODO (local context is unsupported)
-//m_spurt = std::make_shared<spu_llvm_runtime>();
-}
 }
 
 virtual void init() override
@@ -2252,9 +2544,9 @@ public:
 if (!m_spurt)
 {
 m_cache = fxm::get<spu_cache>();
-m_spurt = fxm::get_always<spu_llvm_runtime>();
-m_context = m_spurt->m_jit.get_context();
-m_use_ssse3 = m_spurt->m_jit.has_ssse3();
+m_spurt = fxm::get_always<spu_runtime>();
+m_context = m_jit.get_context();
+m_use_ssse3 = m_jit.has_ssse3();
 }
 }
 
@@ -2271,18 +2563,22 @@ public:
 init();
 
 // Don't lock without shared runtime
-std::unique_lock lock(m_spurt->m_mutex, std::defer_lock);
-
-if (g_cfg.core.spu_shared_runtime)
-{
-lock.lock();
-}
+std::unique_lock lock(m_spurt->m_mutex);
 
 // Try to find existing function, register new one if necessary
 const auto fn_info = m_spurt->m_map.emplace(std::move(func_rv), nullptr);
 
 auto& fn_location = fn_info.first->second;
 
+if (!fn_location && !fn_info.second)
+{
+// Wait if already in progress
+while (!fn_location)
+{
+m_spurt->m_cond.wait(lock);
+}
+}
+
 if (fn_location)
 {
 return fn_location;
@@ -2290,6 +2586,8 @@ public:
 
 auto& func = fn_info.first->first;
 
+lock.unlock();
+
 std::string hash;
 {
 sha1_context ctx;
@@ -2770,179 +3068,6 @@ public:
 m_scan_queue.clear();
 m_function_table = nullptr;
 
-// Generate a dispatcher (übertrampoline)
-std::vector<u32> addrv{func[0]};
-const auto beg = m_spurt->m_map.lower_bound(addrv);
-addrv[0] += 4;
-const auto _end = m_spurt->m_map.lower_bound(addrv);
-const u32 size0 = std::distance(beg, _end);
-
-if (size0 > 1)
-{
-const auto trampoline = cast<Function>(module->getOrInsertFunction(fmt::format("spu-0x%05x-trampoline-%03u", func[0], size0), get_type<void>(), get_type<u8*>(), get_type<u8*>()));
-set_function(trampoline);
-
-struct work
-{
-u32 size;
-u32 level;
-BasicBlock* label;
-std::map<std::vector<u32>, spu_function_t>::iterator beg;
-std::map<std::vector<u32>, spu_function_t>::iterator end;
-};
-
-std::vector<work> workload;
-workload.reserve(size0);
-workload.emplace_back();
-workload.back().size = size0;
-workload.back().level = 1;
-workload.back().beg = beg;
-workload.back().end = _end;
-workload.back().label = m_ir->GetInsertBlock();
-
-for (std::size_t i = 0; i < workload.size(); i++)
-{
-// Get copy of the workload info
-work w = workload[i];
-
-// Switch targets
-std::vector<std::pair<u32, llvm::BasicBlock*>> targets;
-
-llvm::BasicBlock* def{};
-
-bool unsorted = false;
-
-while (w.level < w.beg->first.size())
-{
-const u32 x1 = w.beg->first.at(w.level);
-
-if (x1 == 0)
-{
-// Cannot split: some functions contain holes at this level
-auto it = w.end;
-it--;
-
-if (it->first.at(w.level) != 0)
-{
-unsorted = true;
-}
-
-w.level++;
-continue;
-}
-
-auto it = w.beg;
-auto it2 = it;
-u32 x = x1;
-bool split = false;
-
-while (it2 != w.end)
-{
-it2++;
-
-const u32 x2 = it2 != w.end ? it2->first.at(w.level) : x1;
-
-if (x2 != x)
-{
-const u32 dist = std::distance(it, it2);
-
-const auto b = llvm::BasicBlock::Create(m_context, "", m_function);
-
-if (dist == 1 && x != 0)
-{
-m_ir->SetInsertPoint(b);
-
-if (const u64 fval = reinterpret_cast<u64>(it->second))
-{
-const auto ptr = m_ir->CreateIntToPtr(m_ir->getInt64(fval), main_func->getType());
-m_ir->CreateCall(ptr, {m_thread, m_lsptr})->setTailCall();
-}
-else
-{
-verify(HERE, &it->second == &fn_location);
-m_ir->CreateCall(main_func, {m_thread, m_lsptr})->setTailCall();
-}
-
-m_ir->CreateRetVoid();
-}
-else
-{
-workload.emplace_back(w);
-workload.back().beg = it;
-workload.back().end = it2;
-workload.back().label = b;
-workload.back().size = dist;
-}
-
-if (x == 0)
-{
-def = b;
-}
-else
-{
-targets.emplace_back(std::make_pair(x, b));
-}
-
-x = x2;
-it = it2;
-split = true;
-}
-}
-
-if (!split)
-{
-// Cannot split: words are identical within the range at this level
-w.level++;
-}
-else
-{
-break;
-}
-}
-
-if (!def && targets.empty())
-{
-LOG_ERROR(SPU, "Trampoline simplified at 0x%x (level=%u)", func[0], w.level);
-m_ir->SetInsertPoint(w.label);
-
-if (const u64 fval = reinterpret_cast<u64>(w.beg->second))
-{
-const auto ptr = m_ir->CreateIntToPtr(m_ir->getInt64(fval), main_func->getType());
-m_ir->CreateCall(ptr, {m_thread, m_lsptr})->setTailCall();
-}
-else
-{
-verify(HERE, &w.beg->second == &fn_location);
-m_ir->CreateCall(main_func, {m_thread, m_lsptr})->setTailCall();
-}
-
-m_ir->CreateRetVoid();
-continue;
-}
-
-if (!def)
-{
-def = llvm::BasicBlock::Create(m_context, "", m_function);
-
-m_ir->SetInsertPoint(def);
-tail(&spu_recompiler_base::dispatch, m_thread, m_ir->getInt32(0), m_ir->getInt32(0));
-}
-
-m_ir->SetInsertPoint(w.label);
-const auto add = m_ir->CreateGEP(m_lsptr, m_ir->getInt64(start + w.level * 4 - 4));
-const auto ptr = m_ir->CreateBitCast(add, get_type<u32*>());
-const auto val = m_ir->CreateLoad(ptr);
-const auto sw = m_ir->CreateSwitch(val, def, ::size32(targets));
-
-for (auto& pair : targets)
-{
-sw->addCase(m_ir->getInt32(pair.first), pair.second);
-}
-}
-}
-
-spu_function_t fn{}, tr{};
-
 std::string log;
 
 raw_string_ostream out(log);
@@ -2970,32 +3095,19 @@ public:
 if (g_cfg.core.spu_debug)
 {
 // Testing only
-m_spurt->m_jit.add(std::move(module), m_spurt->m_cache_path + "llvm/");
+m_jit.add(std::move(module), m_spurt->m_cache_path + "llvm/");
 }
 else
 {
-m_spurt->m_jit.add(std::move(module));
+m_jit.add(std::move(module));
 }
 
-m_spurt->m_jit.fin();
-fn = reinterpret_cast<spu_function_t>(m_spurt->m_jit.get_engine().getPointerToFunction(main_func));
-tr = fn;
-
-if (size0 > 1)
-{
-tr = reinterpret_cast<spu_function_t>(m_spurt->m_jit.get_engine().getPointerToFunction(m_function));
-}
+m_jit.fin();
 
 // Register function pointer
-fn_location = fn;
+const spu_function_t fn = reinterpret_cast<spu_function_t>(m_jit.get_engine().getPointerToFunction(main_func));
 
-// Trampoline
-m_spurt->m_dispatcher[func[0] / 4] = tr;
-
-LOG_NOTICE(SPU, "[0x%x] Compiled: %p", func[0], fn);
-
-if (tr != fn)
-LOG_NOTICE(SPU, "[0x%x] T: %p", func[0], tr);
+m_spurt->add(*fn_info.first, fn);
 
 if (g_cfg.core.spu_debug)
 {
@@ -1,6 +1,9 @@
 #pragma once
 
 #include "Utilities/File.h"
+#include "Utilities/mutex.h"
+#include "Utilities/cond.h"
+#include "Utilities/JIT.h"
 #include "SPUThread.h"
 #include <vector>
 #include <bitset>
@@ -30,6 +33,40 @@ public:
 static void initialize();
 };
 
+// Helper class
+class spu_runtime
+{
+public:
+shared_mutex m_mutex;
+
+cond_variable m_cond;
+
+// All functions
+std::map<std::vector<u32>, spu_function_t> m_map;
+
+// All dispatchers
+std::array<atomic_t<spu_function_t>, 0x10000> m_dispatcher;
+
+// Debug module output location
+std::string m_cache_path;
+
+private:
+// Temporarily: asmjit runtime collection
+std::deque<std::unique_ptr<asmjit::JitRuntime>> m_asmjit_rts;
+
+// Trampoline to spu_recompiler_base::dispatch
+spu_function_t tr_dispatch = nullptr;
+
+public:
+spu_runtime();
+
+// Get new ASMJIT runtime
+asmjit::JitRuntime* get_asmjit_rt();
+
+// Add compiled function and generate trampoline if necessary
+void add(std::pair<const std::vector<u32>, spu_function_t>& where, spu_function_t compiled);
+};
+
 // SPU Recompiler instance base class
 class spu_recompiler_base
 {
@@ -367,7 +367,6 @@ struct cfg_root : cfg::node
 cfg::_int<0, 6> preferred_spu_threads{this, "Preferred SPU Threads", 0}; //Numnber of hardware threads dedicated to heavy simultaneous spu tasks
 cfg::_int<0, 16> spu_delay_penalty{this, "SPU delay penalty", 3}; //Number of milliseconds to block a thread if a virtual 'core' isn't free
 cfg::_bool spu_loop_detection{this, "SPU loop detection", true}; //Try to detect wait loops and trigger thread yield
-cfg::_bool spu_shared_runtime{this, "SPU Shared Runtime", true}; // Share compiled SPU functions between all threads
 cfg::_enum<spu_block_size_type> spu_block_size{this, "SPU Block Size", spu_block_size_type::safe};
 cfg::_bool spu_accurate_getllar{this, "Accurate GETLLAR", false};
 cfg::_bool spu_accurate_putlluc{this, "Accurate PUTLLUC", false};