SPU: multithread compilation

Allow parallel compilation of SPU code, both at startup and at runtime.
Remove the 'SPU Shared Runtime' option (it became obsolete).
Refactor the spu_runtime class (it is now common to both ASMJIT and LLVM).
Implement SPU ubertrampoline generation in raw assembly (LLVM).
Minor improvement of balanced_wait_until<> and balanced_awaken<>.
Make JIT MemoryManager2 shared (global).
Fix wrong assertion in cond_variable.
2025-01-31 12:31:45 +01:00 · 2019-01-21 21:04:32 +03:00 · 2019-01-21 21:04:32 +03:00 · 4f152ad126
commit 4f152ad126
parent 8d5d44141e
9 changed files with 503 additions and 394 deletions
--- a/Utilities/JIT.cpp
+++ b/Utilities/JIT.cpp
@ -95,6 +95,12 @@ static void* const s_memory = []() -> void*
 	return utils::memory_reserve(s_memory_size);
 }();

+// Reserve 2G of memory, should replace previous area for ASLR compatibility
+static void* const s_memory2 = utils::memory_reserve(0x80000000);
+
+static u64 s_code_pos = 0;
+static u64 s_data_pos = 0;
+
 static void* s_next = s_memory;

 #ifdef _WIN32
@ -129,6 +135,11 @@ extern void jit_finalize()
 	utils::memory_decommit(s_memory, s_memory_size);

 	s_next = s_memory;
+
+	utils::memory_decommit(s_memory2, 0x80000000);
+
+	s_code_pos = 0;
+	s_data_pos = 0;
 }

 // Helper class
@ -311,24 +322,25 @@ struct MemoryManager : llvm::RTDyldMemoryManager
 // Simple memory manager
 struct MemoryManager2 : llvm::RTDyldMemoryManager
 {
-	// Reserve 2 GiB
-	void* const m_memory = utils::memory_reserve(0x80000000);
+	// Patchwork again...
+	void* const m_memory = s_memory2;

 	u8* const m_code = static_cast<u8*>(m_memory) + 0x00000000;
 	u8* const m_data = static_cast<u8*>(m_memory) + 0x40000000;

-	u64 m_code_pos = 0;
-	u64 m_data_pos = 0;
+	u64& m_code_pos = s_code_pos;
+	u64& m_data_pos = s_data_pos;

 	MemoryManager2() = default;

 	~MemoryManager2() override
 	{
-		utils::memory_release(m_memory, 0x80000000);
 	}

 	u8* allocateCodeSection(std::uintptr_t size, uint align, uint sec_id, llvm::StringRef sec_name) override
 	{
+		std::lock_guard lock(s_mutex);
+
 		// Simple allocation
 		const u64 old = m_code_pos;
 		const u64 pos = ::align(m_code_pos, align);
@ -349,12 +361,20 @@ struct MemoryManager2 : llvm::RTDyldMemoryManager
 			utils::memory_commit(m_code + olda, newa - olda, utils::protection::wx);
 		}

+		if (!sec_id && sec_name.empty())
+		{
+			// Special case: don't log
+			return m_code + pos;
+		}
+
 		LOG_NOTICE(GENERAL, "LLVM: Code section %u '%s' allocated -> %p (size=0x%x, align=0x%x)", sec_id, sec_name.data(), m_code + pos, size, align);
 		return m_code + pos;
 	}

 	u8* allocateDataSection(std::uintptr_t size, uint align, uint sec_id, llvm::StringRef sec_name, bool is_ro) override
 	{
+		std::lock_guard lock(s_mutex);
+
 		// Simple allocation
 		const u64 old = m_data_pos;
 		const u64 pos = ::align(m_data_pos, align);
@ -642,33 +662,12 @@ u64 jit_compiler::get(const std::string& name)
 	return m_engine->getGlobalValueAddress(name);
 }

-std::unordered_map<std::string, u64> jit_compiler::add(std::unordered_map<std::string, std::string> data)
+u8* jit_compiler::alloc(u32 size)
 {
-	// Lock memory manager
-	std::lock_guard lock(s_mutex);
+	// Dummy memory manager object
+	MemoryManager2 mm;

-	std::unordered_map<std::string, u64> result;
-
-	std::size_t size = 0;
-
-	for (auto&& pair : data)
-	{
-		size += ::align(pair.second.size(), 16);
-	}
-
-	utils::memory_commit(s_next, size, utils::protection::wx);
-	std::memset(s_next, 0xc3, ::align(size, 4096));
-
-	for (auto&& pair : data)
-	{
-		std::memcpy(s_next, pair.second.data(), pair.second.size());
-		result.emplace(pair.first, (u64)s_next);
-		s_next = (void*)::align((u64)s_next + pair.second.size(), 16);
-	}
-
-	s_next = (void*)::align((u64)s_next, 4096);
-
-	return result;
+	return mm.allocateCodeSection(size, 16, 0, {});
 }

 #endif
--- a/Utilities/JIT.h
+++ b/Utilities/JIT.h
@ -61,6 +61,7 @@ FT build_function_asm(F&& builder)

 #include <memory>
 #include <string>
+#include <string_view>
 #include <unordered_map>

 #include "types.h"
@ -129,8 +130,8 @@ public:
 	// Get compiled function address
 	u64 get(const std::string& name);

-	// Add functions directly to the memory manager (name -> code)
-	static std::unordered_map<std::string, u64> add(std::unordered_map<std::string, std::string>);
+	// Allocate writable executable memory (alignment is assumed 16)
+	static u8* alloc(u32 size);

 	// Get CPU info
 	static std::string cpu(const std::string& _cpu);
--- a/Utilities/cond.cpp
+++ b/Utilities/cond.cpp
@ -10,7 +10,7 @@

 bool cond_variable::imp_wait(u32 _old, u64 _timeout) noexcept
 {
-	verify("cond_variable overflow" HERE), (_old & 0xffff) == 0; // Very unlikely: it requires 65535 distinct threads to wait simultaneously
+	verify("cond_variable overflow" HERE), (_old & 0xffff) != 0xffff; // Very unlikely: it requires 65535 distinct threads to wait simultaneously

 	return balanced_wait_until(m_value, _timeout, [&](u32& value, auto... ret) -> int
 	{
@ -42,7 +42,8 @@ bool cond_variable::imp_wait(u32 _old, u64 _timeout) noexcept

 void cond_variable::imp_wake(u32 _count) noexcept
 {
-	balanced_awaken(m_value, m_value.atomic_op([&](u32& value) -> u32
+	// TODO (notify_one)
+	balanced_awaken<true>(m_value, m_value.atomic_op([&](u32& value) -> u32
 	{
 		// Subtract already signaled number from total amount of waiters
 		const u32 can_sig = (value & 0xffff) - (value >> 16);
@ -266,7 +267,7 @@ void cond_x16::imp_notify() noexcept
 		return;
 	}

-	balanced_awaken(m_cvx16, utils::popcnt16(wait_mask));
+	balanced_awaken<true>(m_cvx16, utils::popcnt16(wait_mask));
 }

 bool lf_queue_base::wait(u64 _timeout)
--- a/Utilities/sync.h
+++ b/Utilities/sync.h
@ -186,7 +186,7 @@ bool balanced_wait_until(atomic_t<T>& var, u64 usec_timeout, Pred&& pred)
 		{
 			if (OptWaitOnAddress(&var, &value, sizeof(T), is_inf ? INFINITE : usec_timeout / 1000))
 			{
-				if (!test_pred(value) && !test_pred(value, nullptr))
+				if (!test_pred(value, nullptr))
 				{
 					return false;
 				}
@ -220,7 +220,7 @@ bool balanced_wait_until(atomic_t<T>& var, u64 usec_timeout, Pred&& pred)
 		return true;
 	}

-	if (!test_pred(value) && !test_pred(value, nullptr))
+	if (!test_pred(value, nullptr))
 	{
 		// Stolen notification: restore balance
 		NtReleaseKeyedEvent(nullptr, &var, false, nullptr);
@ -237,7 +237,7 @@ bool balanced_wait_until(atomic_t<T>& var, u64 usec_timeout, Pred&& pred)
 	{
 		if (futex(&var, FUTEX_WAIT_PRIVATE, static_cast<u32>(value), is_inf ? nullptr : &timeout) == 0)
 		{
-			if (!test_pred(value) && !test_pred(value, nullptr))
+			if (!test_pred(value, nullptr))
 			{
 				return false;
 			}
@ -257,7 +257,7 @@ bool balanced_wait_until(atomic_t<T>& var, u64 usec_timeout, Pred&& pred)
 #endif
 }

-template <typename T>
+template <bool All = false, typename T>
 void balanced_awaken(atomic_t<T>& var, u32 weight)
 {
 	static_assert(sizeof(T) == 4 || sizeof(T) == 8);
@ -265,11 +265,13 @@ void balanced_awaken(atomic_t<T>& var, u32 weight)
 #ifdef _WIN32
 	if (OptWaitOnAddress)
 	{
-		if (weight > 1)
+		if (All || weight > 3)
 		{
 			OptWakeByAddressAll(&var);
+			return;
 		}
-		else if (weight == 1)
+
+		for (u32 i = 0; i < weight; i++)
 		{
 			OptWakeByAddressSingle(&var);
 		}
@ -282,9 +284,9 @@ void balanced_awaken(atomic_t<T>& var, u32 weight)
 		NtReleaseKeyedEvent(nullptr, &var, false, nullptr);
 	}
 #else
-	if (weight)
+	if (All || weight)
 	{
-		futex(&var, FUTEX_WAKE_PRIVATE, std::min<u32>(INT_MAX, weight));
+		futex(&var, FUTEX_WAKE_PRIVATE, All ? INT_MAX : std::min<u32>(INT_MAX, weight));
 	}

 	return;
--- a/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp
+++ b/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp
@ -32,33 +32,8 @@ std::unique_ptr<spu_recompiler_base> spu_recompiler_base::make_asmjit_recompiler
 	return std::make_unique<spu_recompiler>();
 }

-spu_runtime::spu_runtime()
-{
-	m_cache_path = fxm::check_unlocked<ppu_module>()->cache;
-
-	if (g_cfg.core.spu_debug)
-	{
-		fs::file(m_cache_path + "spu.log", fs::rewrite);
-	}
-
-	LOG_SUCCESS(SPU, "SPU Recompiler Runtime (ASMJIT) initialized...");
-
-	// Initialize lookup table
-	for (auto& v : m_dispatcher)
-	{
-		v.raw() = &spu_recompiler_base::dispatch;
-	}
-
-	// Initialize "empty" block
-	m_map[std::vector<u32>()] = &spu_recompiler_base::dispatch;
-}
-
 spu_recompiler::spu_recompiler()
 {
-	if (!g_cfg.core.spu_shared_runtime)
-	{
-		m_spurt = std::make_shared<spu_runtime>();
-	}
 }

 void spu_recompiler::init()
@ -68,6 +43,7 @@ void spu_recompiler::init()
 	{
 		m_cache = fxm::get<spu_cache>();
 		m_spurt = fxm::get_always<spu_runtime>();
+		m_asmrt = m_spurt->get_asmjit_rt();
 	}
 }

@ -83,19 +59,22 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& func_rv)
 {
 	init();

-	// Don't lock without shared runtime
-	std::unique_lock lock(m_spurt->m_mutex, std::defer_lock);
-
-	if (g_cfg.core.spu_shared_runtime)
-	{
-		lock.lock();
-	}
+	std::unique_lock lock(m_spurt->m_mutex);

 	// Try to find existing function, register new one if necessary
 	const auto fn_info = m_spurt->m_map.emplace(std::move(func_rv), nullptr);

 	auto& fn_location = fn_info.first->second;

+	if (!fn_location && !fn_info.second)
+	{
+		// Wait if already in progress
+		while (!fn_location)
+		{
+			m_spurt->m_cond.wait(lock);
+		}
+	}
+
 	if (fn_location)
 	{
 		return fn_location;
@ -103,6 +82,8 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& func_rv)

 	auto& func = fn_info.first->first;

+	lock.unlock();
+
 	using namespace asmjit;

 	SPUDisAsm dis_asm(CPUDisAsm_InterpreterMode);
@ -124,7 +105,7 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& func_rv)
 	}

 	CodeHolder code;
-	code.init(m_spurt->m_jitrt.getCodeInfo());
+	code.init(m_asmrt->getCodeInfo());
 	code._globalHints = asmjit::CodeEmitter::kHintOptimizedAlign;

 	X86Assembler compiler(&code);
@ -861,14 +842,11 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& func_rv)
 	// Compile and get function address
 	spu_function_t fn;

-	if (m_spurt->m_jitrt.add(&fn, &code))
+	if (m_asmrt->add(&fn, &code))
 	{
 		LOG_FATAL(SPU, "Failed to build a function");
 	}

-	// Register function
-	fn_location = fn;
-
 	if (g_cfg.core.spu_debug)
 	{
 		// Add ASMJIT logs
@ -885,6 +863,11 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& func_rv)
 		m_cache->add(func);
 	}

+	lock.lock();
+
+	// Register function (possibly temporarily)
+	fn_location = fn;
+
 	// Generate a dispatcher (übertrampoline)
 	std::vector<u32> addrv{func[0]};
 	const auto beg = m_spurt->m_map.lower_bound(addrv);
@ -899,19 +882,11 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& func_rv)
 	else
 	{
 		CodeHolder code;
-		code.init(m_spurt->m_jitrt.getCodeInfo());
+		code.init(m_asmrt->getCodeInfo());

 		X86Assembler compiler(&code);
 		this->c = &compiler;

-		if (g_cfg.core.spu_debug)
-		{
-			// Set logger
-			code.setLogger(&logger);
-		}
-
-		compiler.comment("\n\nTrampoline:\n\n");
-
 		struct work
 		{
 			u32 size;
@ -1110,7 +1085,7 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& func_rv)

 		spu_function_t tr;

-		if (m_spurt->m_jitrt.add(&tr, &code))
+		if (m_asmrt->add(&tr, &code))
 		{
 			LOG_FATAL(SPU, "Failed to build a trampoline");
 		}
@ -1118,6 +1093,9 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& func_rv)
 		m_spurt->m_dispatcher[func[0] / 4] = tr;
 	}

+	lock.unlock();
+	m_spurt->m_cond.notify_all();
+
 	return fn;
 }

--- a/rpcs3/Emu/Cell/SPUASMJITRecompiler.h
+++ b/rpcs3/Emu/Cell/SPUASMJITRecompiler.h
@ -1,33 +1,10 @@
 #pragma once

 #include "Utilities/JIT.h"
-#include "Utilities/mutex.h"
 #include "SPURecompiler.h"

 #include <functional>

-// SPU ASMJIT Runtime object (global)
-class spu_runtime
-{
-	shared_mutex m_mutex;
-
-	asmjit::JitRuntime m_jitrt;
-
-	// All functions
-	std::map<std::vector<u32>, spu_function_t> m_map;
-
-	// All dispatchers
-	std::array<atomic_t<spu_function_t>, 0x10000> m_dispatcher;
-
-	// Debug module output location
-	std::string m_cache_path;
-
-	friend class spu_recompiler;
-
-public:
-	spu_runtime();
-};
-
 // SPU ASMJIT Recompiler
 class spu_recompiler : public spu_recompiler_base
 {
@ -43,6 +20,9 @@ public:
 	virtual spu_function_t compile(std::vector<u32>&&) override;

 private:
+	// ASMJIT runtime
+	asmjit::JitRuntime* m_asmrt;
+
 	// emitter:
 	asmjit::X86Assembler* c;

--- a/rpcs3/Emu/Cell/SPURecompiler.cpp
+++ b/rpcs3/Emu/Cell/SPURecompiler.cpp
@ -24,7 +24,7 @@ const spu_decoder<spu_iname> s_spu_iname;
 extern u64 get_timebased_time();

 spu_cache::spu_cache(const std::string& loc)
-	: m_file(loc, fs::read + fs::write + fs::create)
+	: m_file(loc, fs::read + fs::write + fs::create + fs::append)
 {
 }

@ -76,18 +76,22 @@ void spu_cache::add(const std::vector<u32>& func)
 		return;
 	}

-	be_t<u32> size = ::size32(func) - 1;
-	be_t<u32> addr = func[0];
-	m_file.write(size);
-	m_file.write(addr);
-	m_file.write(func.data() + 1, func.size() * 4 - 4);
+	// Allocate buffer
+	const auto buf = std::make_unique<be_t<u32>[]>(func.size() + 1);
+
+	buf[0] = ::size32(func) - 1;
+	buf[1] = func[0];
+	std::memcpy(buf.get() + 2, func.data() + 1, func.size() * 4 - 4);
+
+	// Append data
+	m_file.write(buf.get(), func.size() * 4 + 4);
 }

 void spu_cache::initialize()
 {
 	const std::string ppu_cache = Emu.PPUCache();

-	if (ppu_cache.empty() || !g_cfg.core.spu_shared_runtime)
+	if (ppu_cache.empty())
 	{
 		return;
 	}
@ -105,30 +109,34 @@ void spu_cache::initialize()

 	// Read cache
 	auto func_list = cache->get();
+	atomic_t<std::size_t> fnext{};

-	// Recompiler instance for cache initialization
-	std::unique_ptr<spu_recompiler_base> compiler;
+	// Initialize compiler instances for parallel compilation
+	u32 max_threads = static_cast<u32>(g_cfg.core.llvm_threads);
+	u32 thread_count = max_threads > 0 ? std::min(max_threads, std::thread::hardware_concurrency()) : std::thread::hardware_concurrency();
+	std::vector<std::unique_ptr<spu_recompiler_base>> compilers{thread_count};

-	if (g_cfg.core.spu_decoder == spu_decoder_type::asmjit)
+	for (auto& compiler : compilers)
 	{
-		compiler = spu_recompiler_base::make_asmjit_recompiler();
-	}
+		if (g_cfg.core.spu_decoder == spu_decoder_type::asmjit)
+		{
+			compiler = spu_recompiler_base::make_asmjit_recompiler();
+		}
+		else if (g_cfg.core.spu_decoder == spu_decoder_type::llvm)
+		{
+			compiler = spu_recompiler_base::make_llvm_recompiler();
+		}
+		else
+		{
+			compilers.clear();
+			break;
+		}

-	if (g_cfg.core.spu_decoder == spu_decoder_type::llvm)
-	{
-		compiler = spu_recompiler_base::make_llvm_recompiler();
-	}
-
-	if (compiler)
-	{
 		compiler->init();
 	}

-	if (compiler && !func_list.empty())
+	if (compilers.size() && !func_list.empty())
 	{
-		// Fake LS
-		std::vector<be_t<u32>> ls(0x10000);
-
 		// Initialize progress dialog (wait for previous progress done)
 		while (g_progr_ptotal)
 		{
@ -137,10 +145,20 @@ void spu_cache::initialize()

 		g_progr = "Building SPU cache...";
 		g_progr_ptotal += func_list.size();
+	}
+
+	std::deque<named_thread<std::function<void()>>> thread_queue;
+
+	for (std::size_t i = 0; i < compilers.size(); i++) thread_queue.emplace_back("Worker " + std::to_string(i), [&, compiler = compilers[i].get()]()
+	{
+		// Fake LS
+		std::vector<be_t<u32>> ls(0x10000);

 		// Build functions
-		for (auto&& func : func_list)
+		for (std::size_t func_i = fnext++; func_i < func_list.size(); func_i = fnext++)
 		{
+			std::vector<u32>& func = func_list[func_i];
+
 			if (Emu.IsStopped())
 			{
 				g_progr_pdone++;
@ -185,13 +203,22 @@ void spu_cache::initialize()

 			g_progr_pdone++;
 		}
+	});

-		if (Emu.IsStopped())
-		{
-			LOG_ERROR(SPU, "SPU Runtime: Cache building aborted.");
-			return;
-		}
+	// Join all threads
+	while (!thread_queue.empty())
+	{
+		thread_queue.pop_front();
+	}

+	if (Emu.IsStopped())
+	{
+		LOG_ERROR(SPU, "SPU Runtime: Cache building aborted.");
+		return;
+	}
+
+	if (compilers.size() && !func_list.empty())
+	{
 		LOG_SUCCESS(SPU, "SPU Runtime: Built %u functions.", func_list.size());
 	}

@ -202,6 +229,317 @@ void spu_cache::initialize()
 	});
 }

+spu_runtime::spu_runtime()
+{
+	// Initialize lookup table
+	for (auto& v : m_dispatcher)
+	{
+		v.raw() = &spu_recompiler_base::dispatch;
+	}
+
+	// Initialize "empty" block
+	m_map[std::vector<u32>()] = &spu_recompiler_base::dispatch;
+
+	// Clear LLVM output
+	m_cache_path = Emu.PPUCache();
+	fs::create_dir(m_cache_path + "llvm/");
+	fs::remove_all(m_cache_path + "llvm/", false);
+
+	if (g_cfg.core.spu_debug)
+	{
+		fs::file(m_cache_path + "spu.log", fs::rewrite);
+	}
+
+	LOG_SUCCESS(SPU, "SPU Recompiler Runtime initialized...");
+}
+
+asmjit::JitRuntime* spu_runtime::get_asmjit_rt()
+{
+	std::lock_guard lock(m_mutex);
+
+	m_asmjit_rts.emplace_back(std::make_unique<asmjit::JitRuntime>());
+
+	return m_asmjit_rts.back().get();
+}
+
+void spu_runtime::add(std::pair<const std::vector<u32>, spu_function_t>& where, spu_function_t compiled)
+{
+	std::unique_lock lock(m_mutex);
+
+	// Function info
+	const std::vector<u32>& func = where.first;
+
+	//
+	const u32 start = func[0] * (g_cfg.core.spu_block_size != spu_block_size_type::giga);
+
+	// Set pointer to the compiled function
+	where.second = compiled;
+
+	// Generate a dispatcher (übertrampoline)
+	std::vector<u32> addrv{func[0]};
+	const auto beg = m_map.lower_bound(addrv);
+	addrv[0] += 4;
+	const auto _end = m_map.lower_bound(addrv);
+	const u32 size0 = std::distance(beg, _end);
+
+	if (size0 == 1)
+	{
+		m_dispatcher[func[0] / 4] = compiled;
+	}
+	else
+	{
+		// Allocate some writable executable memory
+#ifdef LLVM_AVAILABLE
+		const auto wxptr = jit_compiler::alloc(size0 * 20);
+#else
+		u8* const wxptr = new u8[size0 * 20]; // dummy
+#endif
+
+		// Raw assembly pointer
+		u8* raw = wxptr;
+
+		struct work
+		{
+			u32 size;
+			u32 level;
+			u8* rel32;
+			std::map<std::vector<u32>, spu_function_t>::iterator beg;
+			std::map<std::vector<u32>, spu_function_t>::iterator end;
+		};
+
+		// Write jump instruction with rel32 immediate
+		auto make_jump = [&](u8 op, auto target)
+		{
+			verify("Asm overflow" HERE), raw + 6 <= wxptr + size0 * 20;
+
+			if (!target && !tr_dispatch)
+			{
+				// Generate a special trampoline with pause instruction
+#ifdef LLVM_AVAILABLE
+				const auto trptr = jit_compiler::alloc(16);
+#else
+				u8* const trptr = new u8[16]; // dummy
+#endif
+				trptr[0] = 0xf3; // pause
+				trptr[1] = 0x90;
+				trptr[2] = 0xff; // jmp [rip]
+				trptr[3] = 0x25;
+				std::memset(trptr + 4, 0, 4);
+				const u64 target = reinterpret_cast<u64>(&spu_recompiler_base::dispatch);
+				std::memcpy(trptr + 8, &target, 8);
+				tr_dispatch = reinterpret_cast<spu_function_t>(trptr);
+			}
+
+			// Fallback to dispatch if no target
+			const u64 taddr = target ? reinterpret_cast<u64>(target) : reinterpret_cast<u64>(tr_dispatch);
+
+			// Compute the distance
+			const s64 rel = taddr - reinterpret_cast<u64>(raw) - (op != 0xe9 ? 6 : 5);
+
+			verify(HERE), rel >= INT32_MIN, rel <= INT32_MAX;
+
+			if (op != 0xe9)
+			{
+				// First jcc byte
+				*raw++ = 0x0f;
+				verify(HERE), (op >> 4) == 0x8;
+			}
+
+			*raw++ = op;
+
+			const s32 r32 = static_cast<s32>(rel);
+
+			std::memcpy(raw, &r32, 4);
+			raw += 4;
+		};
+
+		std::vector<work> workload;
+		workload.reserve(size0);
+		workload.emplace_back();
+		workload.back().size  = size0;
+		workload.back().level = 1;
+		workload.back().rel32 = 0;
+		workload.back().beg   = beg;
+		workload.back().end   = _end;
+
+		for (std::size_t i = 0; i < workload.size(); i++)
+		{
+			// Get copy of the workload info
+			work w = workload[i];
+
+			// Split range in two parts
+			auto it = w.beg;
+			auto it2 = w.beg;
+			u32 size1 = w.size / 2;
+			u32 size2 = w.size - size1;
+			std::advance(it2, w.size / 2);
+
+			while (true)
+			{
+				it = it2;
+				size1 = w.size - size2;
+
+				if (w.level >= w.beg->first.size())
+				{
+					// Cannot split: smallest function is a prefix of bigger ones (TODO)
+					break;
+				}
+
+				const u32 x1 = w.beg->first.at(w.level);
+
+				if (!x1)
+				{
+					// Cannot split: some functions contain holes at this level
+					w.level++;
+					continue;
+				}
+
+				// Adjust ranges (forward)
+				while (it != w.end && x1 == it->first.at(w.level))
+				{
+					it++;
+					size1++;
+				}
+
+				if (it == w.end)
+				{
+					// Cannot split: words are identical within the range at this level
+					w.level++;
+				}
+				else
+				{
+					size2 = w.size - size1;
+					break;
+				}
+			}
+
+			if (w.rel32)
+			{
+				// Patch rel32 linking it to the current location if necessary
+				const s32 r32 = ::narrow<s32>(raw - w.rel32, HERE);
+				std::memcpy(w.rel32 - 4, &r32, 4);
+			}
+
+			if (w.level >= w.beg->first.size())
+			{
+				// If functions cannot be compared, assume smallest function
+				LOG_ERROR(SPU, "Trampoline simplified at 0x%x (level=%u)", func[0], w.level);
+				make_jump(0xe9, w.beg->second); // jmp rel32
+				continue;
+			}
+
+			// Value for comparison
+			const u32 x = it->first.at(w.level);
+
+			// Adjust ranges (backward)
+			while (true)
+			{
+				it--;
+
+				if (it->first.at(w.level) != x)
+				{
+					it++;
+					break;
+				}
+
+				verify(HERE), it != w.beg;
+				size1--;
+				size2++;
+			}
+
+			// Emit 32-bit comparison: cmp [ls+addr], imm32
+			verify("Asm overflow" HERE), raw + 10 <= wxptr + size0 * 20;
+			const u32 cmp_lsa = start + (w.level - 1) * 4;
+			*raw++ = 0x81;
+#ifdef _WIN32
+			*raw++ = 0xba;
+#else
+			*raw++ = 0xbe;
+#endif
+			std::memcpy(raw, &cmp_lsa, 4);
+			std::memcpy(raw + 4, &x, 4);
+			raw += 8;
+
+			// Low subrange target
+			if (size1 == 1)
+			{
+				make_jump(0x82, w.beg->second); // jb rel32
+			}
+			else
+			{
+				make_jump(0x82, raw); // jb rel32 (stub)
+				workload.push_back(w);
+				workload.back().end = it;
+				workload.back().size = size1;
+				workload.back().rel32 = raw;
+			}
+
+			// Second subrange target
+			if (size2 == 1)
+			{
+				make_jump(0xe9, it->second); // jmp rel32
+			}
+			else
+			{
+				it2 = it;
+
+				// Select additional midrange for equality comparison
+				while (it2 != w.end && it2->first.at(w.level) == x)
+				{
+					size2--;
+					it2++;
+				}
+
+				if (it2 != w.end)
+				{
+					// High subrange target
+					if (size2 == 1)
+					{
+						make_jump(0x87, it2->second); // ja rel32
+					}
+					else
+					{
+						make_jump(0x87, raw); // ja rel32 (stub)
+						workload.push_back(w);
+						workload.back().beg = it2;
+						workload.back().size = size2;
+						workload.back().rel32 = raw;
+					}
+
+					const u32 size3 = w.size - size1 - size2;
+
+					if (size3 == 1)
+					{
+						make_jump(0xe9, it->second); // jmp rel32
+					}
+					else
+					{
+						make_jump(0xe9, raw); // jmp rel32 (stub)
+						workload.push_back(w);
+						workload.back().beg = it;
+						workload.back().end = it2;
+						workload.back().size = size3;
+						workload.back().rel32 = raw;
+					}
+				}
+				else
+				{
+					make_jump(0xe9, raw); // jmp rel32 (stub)
+					workload.push_back(w);
+					workload.back().beg = it;
+					workload.back().size = w.size - size1;
+					workload.back().rel32 = raw;
+				}
+			}
+		}
+
+		m_dispatcher[func[0] / 4] = reinterpret_cast<spu_function_t>(reinterpret_cast<u64>(wxptr));
+	}
+
+	lock.unlock();
+	m_cond.notify_all();
+}
+
 spu_recompiler_base::spu_recompiler_base()
 {
 }
@ -1491,55 +1829,14 @@ void spu_recompiler_base::dump(std::string& out)
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/IPO.h"
 #include "llvm/Transforms/Vectorize.h"
-#include "Utilities/JIT.h"
-
-class spu_llvm_runtime
-{
-	shared_mutex m_mutex;
-
-	// All functions
-	std::map<std::vector<u32>, spu_function_t> m_map;
-
-	// All dispatchers
-	std::array<atomic_t<spu_function_t>, 0x10000> m_dispatcher;
-
-	// JIT instance
-	jit_compiler m_jit{{}, jit_compiler::cpu(g_cfg.core.llvm_cpu)};
-
-	// Debug module output location
-	std::string m_cache_path;
-
-	friend class spu_llvm_recompiler;
-
-public:
-	spu_llvm_runtime()
-	{
-		// Initialize lookup table
-		for (auto& v : m_dispatcher)
-		{
-			v.raw() = &spu_recompiler_base::dispatch;
-		}
-
-		// Initialize "empty" block
-		m_map[std::vector<u32>()] = &spu_recompiler_base::dispatch;
-
-		// Clear LLVM output
-		m_cache_path = Emu.PPUCache();
-		fs::create_dir(m_cache_path + "llvm/");
-		fs::remove_all(m_cache_path + "llvm/", false);
-
-		if (g_cfg.core.spu_debug)
-		{
-			fs::file(m_cache_path + "spu.log", fs::rewrite);
-		}
-
-		LOG_SUCCESS(SPU, "SPU Recompiler Runtime (LLVM) initialized...");
-	}
-};

 class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator
 {
-	std::shared_ptr<spu_llvm_runtime> m_spurt;
+	// SPU Runtime Instance
+	std::shared_ptr<spu_runtime> m_spurt;
+
+	// JIT Instance
+	jit_compiler m_jit{{}, jit_compiler::cpu(g_cfg.core.llvm_cpu)};

 	// Current function (chunk)
 	llvm::Function* m_function;
@ -2239,11 +2536,6 @@ public:
 		: spu_recompiler_base()
 		, cpu_translator(nullptr, false)
 	{
-		if (g_cfg.core.spu_shared_runtime)
-		{
-			// TODO (local context is unsupported)
-			//m_spurt = std::make_shared<spu_llvm_runtime>();
-		}
 	}

 	virtual void init() override
@ -2252,9 +2544,9 @@ public:
 		if (!m_spurt)
 		{
 			m_cache = fxm::get<spu_cache>();
-			m_spurt = fxm::get_always<spu_llvm_runtime>();
-			m_context = m_spurt->m_jit.get_context();
-			m_use_ssse3 = m_spurt->m_jit.has_ssse3();
+			m_spurt = fxm::get_always<spu_runtime>();
+			m_context = m_jit.get_context();
+			m_use_ssse3 = m_jit.has_ssse3();
 		}
 	}

@ -2271,18 +2563,22 @@ public:
 		init();

 		// Don't lock without shared runtime
-		std::unique_lock lock(m_spurt->m_mutex, std::defer_lock);
-
-		if (g_cfg.core.spu_shared_runtime)
-		{
-			lock.lock();
-		}
+		std::unique_lock lock(m_spurt->m_mutex);

 		// Try to find existing function, register new one if necessary
 		const auto fn_info = m_spurt->m_map.emplace(std::move(func_rv), nullptr);

 		auto& fn_location = fn_info.first->second;

+		if (!fn_location && !fn_info.second)
+		{
+			// Wait if already in progress
+			while (!fn_location)
+			{
+				m_spurt->m_cond.wait(lock);
+			}
+		}
+
 		if (fn_location)
 		{
 			return fn_location;
@ -2290,6 +2586,8 @@ public:

 		auto& func = fn_info.first->first;

+		lock.unlock();
+
 		std::string hash;
 		{
 			sha1_context ctx;
@ -2770,179 +3068,6 @@ public:
 		m_scan_queue.clear();
 		m_function_table = nullptr;

-		// Generate a dispatcher (übertrampoline)
-		std::vector<u32> addrv{func[0]};
-		const auto beg = m_spurt->m_map.lower_bound(addrv);
-		addrv[0] += 4;
-		const auto _end = m_spurt->m_map.lower_bound(addrv);
-		const u32 size0 = std::distance(beg, _end);
-
-		if (size0 > 1)
-		{
-			const auto trampoline = cast<Function>(module->getOrInsertFunction(fmt::format("spu-0x%05x-trampoline-%03u", func[0], size0), get_type<void>(), get_type<u8*>(), get_type<u8*>()));
-			set_function(trampoline);
-
-			struct work
-			{
-				u32 size;
-				u32 level;
-				BasicBlock* label;
-				std::map<std::vector<u32>, spu_function_t>::iterator beg;
-				std::map<std::vector<u32>, spu_function_t>::iterator end;
-			};
-
-			std::vector<work> workload;
-			workload.reserve(size0);
-			workload.emplace_back();
-			workload.back().size = size0;
-			workload.back().level = 1;
-			workload.back().beg = beg;
-			workload.back().end = _end;
-			workload.back().label = m_ir->GetInsertBlock();
-
-			for (std::size_t i = 0; i < workload.size(); i++)
-			{
-				// Get copy of the workload info
-				work w = workload[i];
-
-				// Switch targets
-				std::vector<std::pair<u32, llvm::BasicBlock*>> targets;
-
-				llvm::BasicBlock* def{};
-
-				bool unsorted = false;
-
-				while (w.level < w.beg->first.size())
-				{
-					const u32 x1 = w.beg->first.at(w.level);
-
-					if (x1 == 0)
-					{
-						// Cannot split: some functions contain holes at this level
-						auto it = w.end;
-						it--;
-
-						if (it->first.at(w.level) != 0)
-						{
-							unsorted = true;
-						}
-
-						w.level++;
-						continue;
-					}
-
-					auto it = w.beg;
-					auto it2 = it;
-					u32 x = x1;
-					bool split = false;
-
-					while (it2 != w.end)
-					{
-						it2++;
-
-						const u32 x2 = it2 != w.end ? it2->first.at(w.level) : x1;
-
-						if (x2 != x)
-						{
-							const u32 dist = std::distance(it, it2);
-
-							const auto b = llvm::BasicBlock::Create(m_context, "", m_function);
-
-							if (dist == 1 && x != 0)
-							{
-								m_ir->SetInsertPoint(b);
-
-								if (const u64 fval = reinterpret_cast<u64>(it->second))
-								{
-									const auto ptr = m_ir->CreateIntToPtr(m_ir->getInt64(fval), main_func->getType());
-									m_ir->CreateCall(ptr, {m_thread, m_lsptr})->setTailCall();
-								}
-								else
-								{
-									verify(HERE, &it->second == &fn_location);
-									m_ir->CreateCall(main_func, {m_thread, m_lsptr})->setTailCall();
-								}
-
-								m_ir->CreateRetVoid();
-							}
-							else
-							{
-								workload.emplace_back(w);
-								workload.back().beg = it;
-								workload.back().end = it2;
-								workload.back().label = b;
-								workload.back().size = dist;
-							}
-
-							if (x == 0)
-							{
-								def = b;
-							}
-							else
-							{
-								targets.emplace_back(std::make_pair(x, b));
-							}
-
-							x = x2;
-							it = it2;
-							split = true;
-						}
-					}
-
-					if (!split)
-					{
-						// Cannot split: words are identical within the range at this level
-						w.level++;
-					}
-					else
-					{
-						break;
-					}
-				}
-
-				if (!def && targets.empty())
-				{
-					LOG_ERROR(SPU, "Trampoline simplified at 0x%x (level=%u)", func[0], w.level);
-					m_ir->SetInsertPoint(w.label);
-
-					if (const u64 fval = reinterpret_cast<u64>(w.beg->second))
-					{
-						const auto ptr = m_ir->CreateIntToPtr(m_ir->getInt64(fval), main_func->getType());
-						m_ir->CreateCall(ptr, {m_thread, m_lsptr})->setTailCall();
-					}
-					else
-					{
-						verify(HERE, &w.beg->second == &fn_location);
-						m_ir->CreateCall(main_func, {m_thread, m_lsptr})->setTailCall();
-					}
-
-					m_ir->CreateRetVoid();
-					continue;
-				}
-
-				if (!def)
-				{
-					def = llvm::BasicBlock::Create(m_context, "", m_function);
-
-					m_ir->SetInsertPoint(def);
-					tail(&spu_recompiler_base::dispatch, m_thread, m_ir->getInt32(0), m_ir->getInt32(0));
-				}
-
-				m_ir->SetInsertPoint(w.label);
-				const auto add = m_ir->CreateGEP(m_lsptr, m_ir->getInt64(start + w.level * 4 - 4));
-				const auto ptr = m_ir->CreateBitCast(add, get_type<u32*>());
-				const auto val = m_ir->CreateLoad(ptr);
-				const auto sw = m_ir->CreateSwitch(val, def, ::size32(targets));
-
-				for (auto& pair : targets)
-				{
-					sw->addCase(m_ir->getInt32(pair.first), pair.second);
-				}
-			}
-		}
-
-		spu_function_t fn{}, tr{};
-
 		std::string log;

 		raw_string_ostream out(log);
@ -2970,32 +3095,19 @@ public:
 		if (g_cfg.core.spu_debug)
 		{
 			// Testing only
-			m_spurt->m_jit.add(std::move(module), m_spurt->m_cache_path + "llvm/");
+			m_jit.add(std::move(module), m_spurt->m_cache_path + "llvm/");
 		}
 		else
 		{
-			m_spurt->m_jit.add(std::move(module));
+			m_jit.add(std::move(module));
 		}

-		m_spurt->m_jit.fin();
-		fn = reinterpret_cast<spu_function_t>(m_spurt->m_jit.get_engine().getPointerToFunction(main_func));
-		tr = fn;
-
-		if (size0 > 1)
-		{
-			tr = reinterpret_cast<spu_function_t>(m_spurt->m_jit.get_engine().getPointerToFunction(m_function));
-		}
+		m_jit.fin();

 		// Register function pointer
-		fn_location = fn;
+		const spu_function_t fn = reinterpret_cast<spu_function_t>(m_jit.get_engine().getPointerToFunction(main_func));

-		// Trampoline
-		m_spurt->m_dispatcher[func[0] / 4] = tr;
-
-		LOG_NOTICE(SPU, "[0x%x] Compiled: %p", func[0], fn);
-
-		if (tr != fn)
-			LOG_NOTICE(SPU, "[0x%x] T: %p", func[0], tr);
+		m_spurt->add(*fn_info.first, fn);

 		if (g_cfg.core.spu_debug)
 		{
--- a/rpcs3/Emu/Cell/SPURecompiler.h
+++ b/rpcs3/Emu/Cell/SPURecompiler.h
@ -1,6 +1,9 @@
 #pragma once

 #include "Utilities/File.h"
+#include "Utilities/mutex.h"
+#include "Utilities/cond.h"
+#include "Utilities/JIT.h"
 #include "SPUThread.h"
 #include <vector>
 #include <bitset>
@ -30,6 +33,40 @@ public:
 	static void initialize();
 };

+// Helper class
+class spu_runtime
+{
+public:
+	shared_mutex m_mutex;
+
+	cond_variable m_cond;
+
+	// All functions
+	std::map<std::vector<u32>, spu_function_t> m_map;
+
+	// All dispatchers
+	std::array<atomic_t<spu_function_t>, 0x10000> m_dispatcher;
+
+	// Debug module output location
+	std::string m_cache_path;
+
+private:
+	// Temporarily: asmjit runtime collection
+	std::deque<std::unique_ptr<asmjit::JitRuntime>> m_asmjit_rts;
+
+	// Trampoline to spu_recompiler_base::dispatch
+	spu_function_t tr_dispatch = nullptr;
+
+public:
+	spu_runtime();
+
+	// Get new ASMJIT runtime
+	asmjit::JitRuntime* get_asmjit_rt();
+
+	// Add compiled function and generate trampoline if necessary
+	void add(std::pair<const std::vector<u32>, spu_function_t>& where, spu_function_t compiled);
+};
+
 // SPU Recompiler instance base class
 class spu_recompiler_base
 {
--- a/rpcs3/Emu/System.h
+++ b/rpcs3/Emu/System.h
@ -367,7 +367,6 @@ struct cfg_root : cfg::node
 		cfg::_int<0, 6> preferred_spu_threads{this, "Preferred SPU Threads", 0}; //Numnber of hardware threads dedicated to heavy simultaneous spu tasks
 		cfg::_int<0, 16> spu_delay_penalty{this, "SPU delay penalty", 3}; //Number of milliseconds to block a thread if a virtual 'core' isn't free
 		cfg::_bool spu_loop_detection{this, "SPU loop detection", true}; //Try to detect wait loops and trigger thread yield
-		cfg::_bool spu_shared_runtime{this, "SPU Shared Runtime", true}; // Share compiled SPU functions between all threads
 		cfg::_enum<spu_block_size_type> spu_block_size{this, "SPU Block Size", spu_block_size_type::safe};
 		cfg::_bool spu_accurate_getllar{this, "Accurate GETLLAR", false};
 		cfg::_bool spu_accurate_putlluc{this, "Accurate PUTLLUC", false};