1
0
mirror of https://github.com/RPCS3/rpcs3.git synced 2024-11-21 18:22:33 +01:00

SPU: analyser v4 and fixes

Build SPU cache after PPU, fix mixing progress
SPU ASMJIT: add support for Giga mode
SPU ASMJIT: use the same spu.log location as SPU LLVM
SPU: improve spu.log disasm
SPU: improve trampolines, unify with SPU ASMJIT
SPU: decode interrupt handler address from BR/BRA at 0x0
SPU LLVM: support Mega/Giga modes
SPU LLVM: implement function chunks
SPU LLVM: use PHI nodes, value visibility across basic blocks
SPU LLVM: implement function chunk table
New simple memory manager for LLVM (bugfix)
This commit is contained in:
Nekotekina 2018-06-10 15:46:01 +03:00
parent 3e433ef05c
commit e4da284176
7 changed files with 1577 additions and 572 deletions

View File

@ -308,6 +308,83 @@ struct MemoryManager : llvm::RTDyldMemoryManager
}
};
// Simple memory manager: bump-pointer allocator over one large reservation
struct MemoryManager2 : llvm::RTDyldMemoryManager
{
	// Reserve 2 GiB of address space up front; commit pages lazily as sections grow
	void* const m_memory = utils::memory_reserve(0x80000000);

	// Lower half holds code, upper half holds data
	u8* const m_code = static_cast<u8*>(m_memory) + 0x00000000;
	u8* const m_data = static_cast<u8*>(m_memory) + 0x40000000;

	// Bump pointers (offsets into the code/data halves)
	u64 m_code_pos = 0;
	u64 m_data_pos = 0;

	MemoryManager2() = default;

	~MemoryManager2() override
	{
		// Return the whole reservation in one call
		utils::memory_release(m_memory, 0x80000000);
	}

	// Allocate an executable section; returns nullptr when the 1 GiB code half is exhausted
	u8* allocateCodeSection(std::uintptr_t size, uint align, uint sec_id, llvm::StringRef sec_name) override
	{
		// Bump allocation: advance the cursor, keeping both ends aligned
		const u64 prev = m_code_pos;
		const u64 base = ::align(m_code_pos, align);
		m_code_pos = ::align(base + size, align);

		if (m_code_pos > 0x40000000)
		{
			LOG_FATAL(GENERAL, "LLVM: Out of code memory (size=0x%x, align=0x%x)", size, align);
			return nullptr;
		}

		// Commit any newly-touched 64 KiB pages as writable+executable
		const u64 page_lo = ::align(prev, 0x10000);
		const u64 page_hi = ::align(m_code_pos, 0x10000);

		if (page_hi != page_lo)
		{
			utils::memory_commit(m_code + page_lo, page_hi - page_lo, utils::protection::wx);
		}

		LOG_NOTICE(GENERAL, "LLVM: Code section %u '%s' allocated -> %p (size=0x%x, align=0x%x)", sec_id, sec_name.data(), m_code + base, size, align);
		return m_code + base;
	}

	// Allocate a data section; returns nullptr when the 1 GiB data half is exhausted
	u8* allocateDataSection(std::uintptr_t size, uint align, uint sec_id, llvm::StringRef sec_name, bool is_ro) override
	{
		// Bump allocation: advance the cursor, keeping both ends aligned
		const u64 prev = m_data_pos;
		const u64 base = ::align(m_data_pos, align);
		m_data_pos = ::align(base + size, align);

		if (m_data_pos > 0x40000000)
		{
			LOG_FATAL(GENERAL, "LLVM: Out of data memory (size=0x%x, align=0x%x)", size, align);
			return nullptr;
		}

		// Commit any newly-touched 64 KiB pages (default read/write protection;
		// NOTE(review): is_ro is only logged, not enforced — presumably intentional for this simple manager)
		const u64 page_lo = ::align(prev, 0x10000);
		const u64 page_hi = ::align(m_data_pos, 0x10000);

		if (page_hi != page_lo)
		{
			utils::memory_commit(m_data + page_lo, page_hi - page_lo);
		}

		LOG_NOTICE(GENERAL, "LLVM: Data section %u '%s' allocated -> %p (size=0x%x, align=0x%x, %s)", sec_id, sec_name.data(), m_data + base, size, align, is_ro ? "ro" : "rw");
		return m_data + base;
	}

	// Nothing to finalize: pages are committed with their final protection on allocation
	bool finalizeMemory(std::string* = nullptr) override
	{
		return false;
	}
};
// Helper class
struct EventListener : llvm::JITEventListener
{
@ -383,7 +460,7 @@ public:
std::string name = m_path;
name.append(module->getName());
fs::file(name, fs::rewrite).write(obj.getBufferStart(), obj.getBufferSize());
LOG_SUCCESS(GENERAL, "LLVM: Created module: %s", module->getName().data());
LOG_NOTICE(GENERAL, "LLVM: Created module: %s", module->getName().data());
}
static std::unique_ptr<llvm::MemoryBuffer> load(const std::string& path)
@ -405,7 +482,7 @@ public:
if (auto buf = load(path))
{
LOG_SUCCESS(GENERAL, "LLVM: Loaded module: %s", module->getName().data());
LOG_NOTICE(GENERAL, "LLVM: Loaded module: %s", module->getName().data());
return buf;
}
@ -464,6 +541,7 @@ jit_compiler::jit_compiler(const std::unordered_map<std::string, u64>& _link, co
m_engine.reset(llvm::EngineBuilder(std::make_unique<llvm::Module>("null_", m_context))
.setErrorStr(&result)
.setEngineKind(llvm::EngineKind::JIT)
.setMCJITMemoryManager(std::make_unique<MemoryManager2>())
.setOptLevel(llvm::CodeGenOpt::Aggressive)
.setCodeModel(large ? llvm::CodeModel::Large : llvm::CodeModel::Small)
.setMCPU(m_cpu)

View File

@ -1225,9 +1225,6 @@ extern void ppu_initialize()
fmt::throw_exception("Failed to create cache directory: %s (%s)", _main->cache, fs::g_tls_error);
}
// Initialize SPU cache
spu_cache::initialize();
if (Emu.IsStopped())
{
return;
@ -1248,6 +1245,9 @@ extern void ppu_initialize()
{
ppu_initialize(*ptr);
}
// Initialize SPU cache
spu_cache::initialize();
}
extern void ppu_initialize(const ppu_module& info)

View File

@ -7,6 +7,7 @@
#include "SPUThread.h"
#include "SPUInterpreter.h"
#include "Utilities/sysinfo.h"
#include "PPUAnalyser.h"
#include <cmath>
#include <mutex>
@ -32,6 +33,13 @@ std::unique_ptr<spu_recompiler_base> spu_recompiler_base::make_asmjit_recompiler
spu_runtime::spu_runtime()
{
m_cache_path = fxm::check_unlocked<ppu_module>()->cache;
if (g_cfg.core.spu_debug)
{
fs::file(m_cache_path + "spu.log", fs::rewrite);
}
LOG_SUCCESS(SPU, "SPU Recompiler Runtime (ASMJIT) initialized...");
// Initialize lookup table
@ -97,7 +105,12 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& func_rv)
using namespace asmjit;
SPUDisAsm dis_asm(CPUDisAsm_InterpreterMode);
dis_asm.offset = reinterpret_cast<const u8*>(func.data() + 1) - func[0];
dis_asm.offset = reinterpret_cast<const u8*>(func.data() + 1);
if (g_cfg.core.spu_block_size != spu_block_size_type::giga)
{
dis_asm.offset -= func[0];
}
StringLogger logger;
logger.addOptions(Logger::kOptionBinaryForm);
@ -163,15 +176,16 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& func_rv)
// Start compilation
m_pos = func[0];
const u32 start = m_pos;
const u32 end = m_pos + (func.size() - 1) * 4;
m_size = ::size32(func) * 4 - 4;
const u32 start = m_pos * (g_cfg.core.spu_block_size != spu_block_size_type::giga);
const u32 end = start + m_size;
// Create instruction labels (TODO: some of them are unnecessary)
for (u32 i = 1; i < func.size(); i++)
{
if (func[i])
{
instr_labels[i * 4 - 4 + m_pos] = c->newLabel();
instr_labels[i * 4 - 4 + start] = c->newLabel();
}
}
@ -210,15 +224,15 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& func_rv)
{
// Disable check (unsafe)
}
else if (func.size() - 1 == 1)
else if (m_size == 4)
{
c->cmp(x86::dword_ptr(*ls, m_pos), func[1]);
c->cmp(x86::dword_ptr(*ls, start), func[1]);
c->jnz(label_diff);
}
else if (func.size() - 1 == 2)
else if (m_size == 8)
{
c->mov(*qw1, static_cast<u64>(func[2]) << 32 | func[1]);
c->cmp(*qw1, x86::qword_ptr(*ls, m_pos));
c->cmp(*qw1, x86::qword_ptr(*ls, start));
c->jnz(label_diff);
}
else if (utils::has_512() && false)
@ -226,16 +240,15 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& func_rv)
// AVX-512 optimized check using 512-bit registers (disabled)
words_align = 64;
const u32 starta = m_pos & -64;
const u32 starta = start & -64;
const u32 enda = ::align(end, 64);
const u32 sizea = (enda - starta) / 64;
verify(HERE), sizea;
// Initialize pointers
c->lea(x86::rax, x86::qword_ptr(label_code));
c->lea(*qw1, x86::qword_ptr(*ls, starta));
u32 code_off = 0;
u32 ls_off = starta;
u32 ls_off = -8192;
for (u32 j = starta; j < enda; j += 64)
{
@ -246,6 +259,8 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& func_rv)
continue;
}
const bool first = ls_off == -8192;
// Ensure small distance for disp8*N
if (j - ls_off >= 8192)
{
@ -279,7 +294,7 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& func_rv)
c->vmovdqa32(x86::zmm0, x86::zword_ptr(*qw1, j - ls_off));
}
if (j == starta)
if (first)
{
c->vpcmpud(x86::k1, x86::zmm0, x86::zword_ptr(x86::rax, code_off), 4);
}
@ -291,7 +306,7 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& func_rv)
for (u32 i = j; i < j + 64; i += 4)
{
words.push_back(i >= m_pos && i < end ? func[(i - m_pos) / 4 + 1] : 0);
words.push_back(i >= start && i < end ? func[(i - start) / 4 + 1] : 0);
}
code_off += 64;
@ -305,7 +320,7 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& func_rv)
// AVX-512 optimized check using 256-bit registers
words_align = 32;
const u32 starta = m_pos & -32;
const u32 starta = start & -32;
const u32 enda = ::align(end, 32);
const u32 sizea = (enda - starta) / 32;
verify(HERE), sizea;
@ -330,10 +345,10 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& func_rv)
for (u32 i = starta; i < enda; i += 4)
{
words.push_back(i >= m_pos && i < end ? func[(i - m_pos) / 4 + 1] : 0);
words.push_back(i >= start && i < end ? func[(i - start) / 4 + 1] : 0);
}
}
else if (sizea == 2 && (end - m_pos) <= 32)
else if (sizea == 2 && (end - start) <= 32)
{
const u32 cmask0 = get_code_mask(starta, starta + 32);
const u32 cmask1 = get_code_mask(starta + 32, enda);
@ -347,7 +362,7 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& func_rv)
for (u32 i = starta; i < starta + 32; i += 4)
{
words.push_back(i >= m_pos ? func[(i - m_pos) / 4 + 1] : i + 32 < end ? func[(i + 32 - m_pos) / 4 + 1] : 0);
words.push_back(i >= start ? func[(i - start) / 4 + 1] : i + 32 < end ? func[(i + 32 - start) / 4 + 1] : 0);
}
}
else
@ -356,9 +371,8 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& func_rv)
// Initialize pointers
c->lea(x86::rax, x86::qword_ptr(label_code));
c->lea(*qw1, x86::qword_ptr(*ls, starta));
u32 code_off = 0;
u32 ls_off = starta;
u32 ls_off = -4096;
for (u32 j = starta; j < enda; j += 32)
{
@ -369,6 +383,8 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& func_rv)
continue;
}
const bool first = ls_off == -4096;
// Ensure small distance for disp8*N
if (j - ls_off >= 4096)
{
@ -398,7 +414,7 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& func_rv)
}
// Perform bitwise comparison and accumulate
if (j == starta)
if (first)
{
c->vpxor(x86::ymm0, x86::ymm1, x86::yword_ptr(x86::rax, code_off));
}
@ -409,7 +425,7 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& func_rv)
for (u32 i = j; i < j + 32; i += 4)
{
words.push_back(i >= m_pos && i < end ? func[(i - m_pos) / 4 + 1] : 0);
words.push_back(i >= start && i < end ? func[(i - start) / 4 + 1] : 0);
}
code_off += 32;
@ -424,7 +440,7 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& func_rv)
// Mainstream AVX
words_align = 32;
const u32 starta = m_pos & -32;
const u32 starta = start & -32;
const u32 enda = ::align(end, 32);
const u32 sizea = (enda - starta) / 32;
verify(HERE), sizea;
@ -449,10 +465,10 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& func_rv)
for (u32 i = starta; i < enda; i += 4)
{
words.push_back(i >= m_pos && i < end ? func[(i - m_pos) / 4 + 1] : 0);
words.push_back(i >= start && i < end ? func[(i - start) / 4 + 1] : 0);
}
}
else if (sizea == 2 && (end - m_pos) <= 32)
else if (sizea == 2 && (end - start) <= 32)
{
const u32 cmask0 = get_code_mask(starta, starta + 32);
const u32 cmask1 = get_code_mask(starta + 32, enda);
@ -466,7 +482,7 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& func_rv)
for (u32 i = starta; i < starta + 32; i += 4)
{
words.push_back(i >= m_pos ? func[(i - m_pos) / 4 + 1] : i + 32 < end ? func[(i + 32 - m_pos) / 4 + 1] : 0);
words.push_back(i >= start ? func[(i - start) / 4 + 1] : i + 32 < end ? func[(i + 32 - start) / 4 + 1] : 0);
}
}
else
@ -541,7 +557,7 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& func_rv)
for (u32 i = j; i < j + 32; i += 4)
{
words.push_back(i >= m_pos && i < end ? func[(i - m_pos) / 4 + 1] : 0);
words.push_back(i >= start && i < end ? func[(i - start) / 4 + 1] : 0);
}
code_off += 32;
@ -568,7 +584,7 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& func_rv)
// Compatible SSE2
words_align = 16;
const u32 starta = m_pos & -16;
const u32 starta = start & -16;
const u32 enda = ::align(end, 16);
const u32 sizea = (enda - starta) / 16;
verify(HERE), sizea;
@ -614,10 +630,10 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& func_rv)
}
// Determine which value will be duplicated at hole positions
const u32 w3 = func.at((j - m_pos + ~::cntlz32(cmask, true) % 4 * 4) / 4 + 1);
words.push_back(cmask & 1 ? func[(j - m_pos + 0) / 4 + 1] : w3);
words.push_back(cmask & 2 ? func[(j - m_pos + 4) / 4 + 1] : w3);
words.push_back(cmask & 4 ? func[(j - m_pos + 8) / 4 + 1] : w3);
const u32 w3 = func.at((j - start + ~::cntlz32(cmask, true) % 4 * 4) / 4 + 1);
words.push_back(cmask & 1 ? func[(j - start + 0) / 4 + 1] : w3);
words.push_back(cmask & 2 ? func[(j - start + 4) / 4 + 1] : w3);
words.push_back(cmask & 4 ? func[(j - start + 8) / 4 + 1] : w3);
words.push_back(w3);
// PSHUFD immediate table for all possible hole mask values, holes repeat highest valid word
@ -641,7 +657,9 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& func_rv)
0b11100100, // full
};
const auto& dest = !order++ ? reg0 : reg1;
const bool first = !order++;
const auto& dest = first ? reg0 : reg1;
// Load aligned code block from LS
if (cmask != 0xf)
@ -656,7 +674,7 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& func_rv)
// Perform bitwise comparison and accumulate
c->xorps(dest, x86::dqword_ptr(x86::rax, code_off));
if (j != starta && j != starta + 16)
if (first)
{
c->orps(reg0, dest);
}
@ -690,24 +708,38 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& func_rv)
c->vzeroupper();
}
c->inc(SPU_OFF_64(block_counter));
// Acknowledge success and add statistics
c->add(SPU_OFF_64(block_counter), ::size32(words) / (words_align / 4));
if (g_cfg.core.spu_block_size == spu_block_size_type::giga && m_pos != start)
{
// Jump to the entry point if necessary
c->jmp(instr_labels[m_pos]);
m_pos = -1;
}
for (u32 i = 1; i < func.size(); i++)
{
const u32 pos = start + (i - 1) * 4;
const u32 op = se_storage<u32>::swap(func[i]);
if (g_cfg.core.spu_debug)
{
// Disasm
dis_asm.dump_pc = pos;
dis_asm.disasm(pos);
compiler.comment(dis_asm.last_opcode.c_str());
log += dis_asm.last_opcode;
log += '\n';
}
// Get opcode
const u32 op = se_storage<u32>::swap(func[i]);
if (op)
{
log += '>';
log += dis_asm.last_opcode;
log += '\n';
}
else
{
fmt::append(log, ">[%08x] xx xx xx xx: <hole>\n", pos);
}
}
if (!op)
{
@ -738,6 +770,12 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& func_rv)
c->bind(found->second);
}
if (g_cfg.core.spu_debug)
{
// Disasm inside the ASMJIT log
compiler.comment(dis_asm.last_opcode.c_str());
}
// Execute recompiler function
(this->*s_spu_decoder.decode(op))({op});
@ -751,6 +789,7 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& func_rv)
if (g_cfg.core.spu_debug)
{
log += '\n';
this->dump(log);
}
// Make fallthrough if necessary
@ -784,6 +823,10 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& func_rv)
c->align(kAlignData, 8);
c->bind(instr_table);
// Get actual instruction table bounds
const u32 start = instr_labels.begin()->first;
const u32 end = instr_labels.rbegin()->first + 4;
for (u32 addr = start; addr < end; addr += 4)
{
const auto found = instr_labels.find(addr);
@ -825,6 +868,22 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& func_rv)
// Register function
fn_location = fn;
if (g_cfg.core.spu_debug)
{
// Add ASMJIT logs
fmt::append(log, "Address: %p\n\n", fn);
log += logger.getString();
log += "\n\n\n";
// Append log file
fs::file(m_spurt->m_cache_path + "spu.log", fs::write + fs::append).write(log);
}
if (m_cache && g_cfg.core.spu_cache)
{
m_cache->add(func);
}
// Generate a dispatcher (übertrampoline)
std::vector<u32> addrv{func[0]};
const auto beg = m_spurt->m_map.lower_bound(addrv);
@ -886,6 +945,12 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& func_rv)
it = it2;
size1 = w.size - size2;
if (w.level >= w.beg->first.size())
{
// Cannot split: smallest function is a prefix of bigger ones (TODO)
break;
}
const u32 x1 = w.beg->first.at(w.level);
if (!x1)
@ -914,6 +979,20 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& func_rv)
}
}
if (w.label.isValid())
{
c->align(kAlignCode, 16);
c->bind(w.label);
}
if (w.level >= w.beg->first.size())
{
// If functions cannot be compared, assume smallest function
LOG_ERROR(SPU, "Trampoline simplified at 0x%x (level=%u)", func[0], w.level);
c->jmp(imm_ptr(w.beg->second ? w.beg->second : &dispatch));
continue;
}
// Value for comparison
const u32 x = it->first.at(w.level);
@ -933,13 +1012,7 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& func_rv)
size2++;
}
if (w.label.isValid())
{
c->align(kAlignCode, 16);
c->bind(w.label);
}
c->cmp(x86::dword_ptr(*ls, func[0] + (w.level - 1) * 4), x);
c->cmp(x86::dword_ptr(*ls, start + (w.level - 1) * 4), x);
// Low subrange target label
Label label_below;
@ -1044,22 +1117,6 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& func_rv)
m_spurt->m_dispatcher[func[0] / 4] = tr;
}
if (g_cfg.core.spu_debug)
{
// Add ASMJIT logs
fmt::append(log, "Address: %p (%p)\n\n", fn, +m_spurt->m_dispatcher[func[0] / 4]);
log += logger.getString();
log += "\n\n\n";
// Append log file
fs::file(Emu.GetCachePath() + "SPUJIT.log", fs::write + fs::append).write(log);
}
if (m_cache && g_cfg.core.spu_cache)
{
m_cache->add(func);
}
return fn;
}
@ -1131,17 +1188,6 @@ static void check_state(SPUThread* _spu, spu_function_t _ret)
_ret = &check_state_ret;
}
if (g_cfg.core.spu_block_size != spu_block_size_type::safe)
{
// Get stack pointer, try to use native return address (check SPU return address)
const auto x = _spu->stack_mirror[(_spu->gpr[1]._u32[3] & 0x3fff0) >> 4];
if (x._u32[2] == _spu->pc)
{
_ret = reinterpret_cast<spu_function_t>(x._u64[0]);
}
}
_ret(*_spu, _spu->_ptr<u8>(0), nullptr);
}
@ -1195,36 +1241,12 @@ void spu_recompiler::branch_indirect(spu_opcode_t op, bool jt, bool ret)
{
using namespace asmjit;
if (g_cfg.core.spu_block_size != spu_block_size_type::giga && !jt)
{
// Simply external call (return or indirect call)
c->mov(x86::r10, x86::qword_ptr(*cpu, addr->r64(), 1, offset32(&SPUThread::jit_dispatcher)));
c->xor_(qw0->r32(), qw0->r32());
}
else
{
if (!instr_table.isValid())
{
// Request instruction table
instr_table = c->newLabel();
}
const u32 start = instr_labels.begin()->first;
const u32 end = instr_labels.rbegin()->first + 4;
// Load indirect jump address, choose between local and external
c->lea(x86::r10, x86::qword_ptr(instr_table));
c->lea(*qw1, x86::qword_ptr(*addr, 0 - start));
c->xor_(qw0->r32(), qw0->r32());
c->cmp(qw1->r32(), end - start);
c->cmovae(qw1->r32(), qw0->r32());
c->cmovb(x86::r10, x86::qword_ptr(x86::r10, *qw1, 1, 0));
c->cmovae(x86::r10, x86::qword_ptr(*cpu, addr->r64(), 1, offset32(&SPUThread::jit_dispatcher)));
}
// Initialize third arg to zero
c->xor_(qw0->r32(), qw0->r32());
if (op.d)
{
c->lock().btr(SPU_OFF_8(interrupts_enabled), 0);
c->mov(SPU_OFF_8(interrupts_enabled), 0);
}
else if (op.e)
{
@ -1232,7 +1254,7 @@ void spu_recompiler::branch_indirect(spu_opcode_t op, bool jt, bool ret)
Label intr = c->newLabel();
Label fail = c->newLabel();
c->lock().bts(SPU_OFF_8(interrupts_enabled), 0);
c->mov(SPU_OFF_8(interrupts_enabled), 1);
c->mov(qw1->r32(), SPU_OFF_32(ch_event_mask));
c->test(qw1->r32(), ~SPU_EVENT_INTR_IMPLEMENTED);
c->jnz(fail);
@ -1244,19 +1266,50 @@ void spu_recompiler::branch_indirect(spu_opcode_t op, bool jt, bool ret)
c->mov(SPU_OFF_32(pc), *addr);
c->mov(addr->r64(), reinterpret_cast<u64>(vm::base(0xffdead00)));
c->mov(asmjit::x86::dword_ptr(addr->r64()), "INTR"_u32);
// Save addr in srr0 and disable interrupts
c->bind(intr);
c->lock().btr(SPU_OFF_8(interrupts_enabled), 0);
c->mov(SPU_OFF_8(interrupts_enabled), 0);
c->mov(SPU_OFF_32(srr0), *addr);
c->mov(*addr, qw0->r32());
c->mov(x86::r10, x86::qword_ptr(*cpu, offset32(&SPUThread::jit_dispatcher)));
// Test for BR/BRA instructions (they are equivalent at zero pc)
c->mov(*addr, x86::dword_ptr(*ls));
c->and_(*addr, 0xfffffffd);
c->xor_(*addr, 0x30);
c->bswap(*addr);
c->test(*addr, 0xff80007f);
c->cmovnz(*addr, qw0->r32());
c->shr(*addr, 5);
c->align(kAlignCode, 16);
c->bind(no_intr);
}
Label label_check = c->newLabel();
c->mov(SPU_OFF_32(pc), *addr);
c->cmp(SPU_OFF_32(state), 0);
c->jnz(label_check);
if (!jt && g_cfg.core.spu_block_size != spu_block_size_type::giga)
{
// Simply external call (return or indirect call)
c->mov(x86::r10, x86::qword_ptr(*cpu, addr->r64(), 1, offset32(&SPUThread::jit_dispatcher)));
}
else
{
if (!instr_table.isValid())
{
// Request instruction table
instr_table = c->newLabel();
}
// Get actual instruction table bounds
const u32 start = instr_labels.begin()->first;
const u32 end = instr_labels.rbegin()->first + 4;
// Load indirect jump address, choose between local and external
c->lea(*qw1, x86::qword_ptr(addr->r64(), 0 - start));
c->lea(x86::r10, x86::qword_ptr(instr_table));
c->cmp(qw1->r32(), end - start);
c->lea(x86::r10, x86::qword_ptr(x86::r10, *qw1, 1, 0));
c->lea(*qw1, x86::qword_ptr(*cpu, addr->r64(), 1, offset32(&SPUThread::jit_dispatcher)));
c->cmovae(x86::r10, *qw1);
c->mov(x86::r10, x86::qword_ptr(x86::r10));
}
if (g_cfg.core.spu_block_size != spu_block_size_type::safe && ret)
{
@ -1268,6 +1321,10 @@ void spu_recompiler::branch_indirect(spu_opcode_t op, bool jt, bool ret)
c->cmove(x86::r10, x86::qword_ptr(*qw1));
}
Label label_check = c->newLabel();
c->mov(SPU_OFF_32(pc), *addr);
c->cmp(SPU_OFF_32(state), 0);
c->jnz(label_check);
c->jmp(x86::r10);
c->bind(label_check);
c->mov(*ls, x86::r10);
@ -2856,9 +2913,9 @@ void spu_recompiler::STQX(spu_opcode_t op)
void spu_recompiler::BI(spu_opcode_t op)
{
const auto found = m_targets.find(m_pos);
const auto is_jt = found == m_targets.end() || found->second.size() != 1 || found->second.front() != -1;
const auto is_jt = found == m_targets.end() || found->second.size() > 1;
if (found == m_targets.end() || found->second.empty())
if (found == m_targets.end())
{
LOG_ERROR(SPU, "[0x%x] BI: no targets", m_pos);
}

View File

@ -19,6 +19,9 @@ class spu_runtime
// All dispatchers
std::array<atomic_t<spu_function_t>, 0x10000> m_dispatcher;
// Debug module output location
std::string m_cache_path;
friend class spu_recompiler;
public:

File diff suppressed because it is too large Load Diff

View File

@ -42,12 +42,18 @@ protected:
// GPR modified by the instruction (-1 = not set)
std::array<u8, 0x10000> m_regmod;
// List of possible targets for the instruction ({} = next instruction, {-1} = no targets)
// List of possible targets for the instruction (entry shouldn't exist for simple instructions)
std::unordered_map<u32, std::basic_string<u32>, value_hash<u32, 2>> m_targets;
// List of block predecessors (incomplete, doesn't include all fallthrough predecessors)
// List of block predecessors
std::unordered_map<u32, std::basic_string<u32>, value_hash<u32, 2>> m_preds;
// List of function entry points and return points (set after BRSL, BRASL, BISL, BISLED)
std::bitset<0x10000> m_entry_info;
// Compressed address of unique entry point for each instruction
std::array<u16, 0x10000> m_entry_map{};
std::shared_ptr<spu_cache> m_cache;
private:
@ -77,9 +83,15 @@ public:
// Get the block at specified address
std::vector<u32> block(const be_t<u32>* ls, u32 lsa);
// Print analyser internal state
void dump(std::string& out);
// Create recompiler instance (ASMJIT)
static std::unique_ptr<spu_recompiler_base> make_asmjit_recompiler();
// Create recompiler instance (LLVM)
static std::unique_ptr<spu_recompiler_base> make_llvm_recompiler();
// Max number of registers (for m_regmod)
static constexpr u8 s_reg_max = 128;
};

View File

@ -527,6 +527,8 @@ void SPUThread::cpu_task()
jit_dispatcher[pc / 4](*this, vm::_ptr<u8>(offset), nullptr);
}
// Print some stats
LOG_NOTICE(SPU, "Stats: block %u (fails: %u);", block_counter, block_failure);
return;
}