PPU LLVM: Use symbol resolver function to resolve functions

2024-11-25 20:22:30 +01:00 · 2024-03-18 16:14:45 +02:00 · 2024-03-18 16:14:45 +02:00 · 2f822abb47
commit 2f822abb47
parent 1ca83bc629
4 changed files with 182 additions and 71 deletions
--- a/rpcs3/Emu/CPU/CPUTranslator.h
+++ b/rpcs3/Emu/CPU/CPUTranslator.h
@ -3703,13 +3703,18 @@ public:
 		return result;
 	}

+	// Emits a load from global `g`, addressed by a GEP with indices {0, i}.
+	// `i` is zero-extended or truncated to 64 bits before it is used as the index.
+	// The loaded type is `type` when given, otherwise the value type of `g`.
+	llvm::Value* load_const(llvm::GlobalVariable* g, llvm::Value* i, llvm::Type* type = nullptr)
+	{
+		const auto global_type = g->getValueType();
+		const auto index = m_ir->CreateZExtOrTrunc(i, get_type<u64>());
+		const auto element_ptr = m_ir->CreateGEP(global_type, g, {m_ir->getInt64(0), index});
+
+		return m_ir->CreateLoad(type ? type : global_type, element_ptr);
+	}
+
 	template <typename T>
 	llvm::Value* load_const(llvm::GlobalVariable* g, llvm::Value* i)
 	{
-		return m_ir->CreateLoad(get_type<T>(), m_ir->CreateGEP(g->getValueType(), g, {m_ir->getInt64(0), m_ir->CreateZExtOrTrunc(i, get_type<u64>())}));
+		return load_const(g, i, get_type<T>());
 	}

-	template <typename T, typename I>
+	template <typename T, typename I> requires requires () { std::declval<I>().eval(std::declval<llvm::IRBuilder<>*>()); }
 	value_t<T> load_const(llvm::GlobalVariable* g, I i)
 	{
 		value_t<T> result;
@ -3717,6 +3722,12 @@ public:
 		return result;
 	}

+	// Creates a private-linkage global variable of type T in the current module
+	// (isConstant = false), initialized through llvm::ConstantInt::get with
+	// `initializing_value`. The raw `new` follows the LLVM API: the module is
+	// passed to the constructor and the caller only receives the pointer.
+	template <typename T>
+	llvm::GlobalVariable* make_local_variable(T initializing_value)
+	{
+		const auto var_type = get_type<T>();
+		const auto initializer = llvm::ConstantInt::get(var_type, initializing_value);
+
+		return new llvm::GlobalVariable(*m_module, var_type, false, llvm::GlobalVariable::PrivateLinkage, initializer);
+	}
+
 	template <typename R = v128>
 	std::pair<bool, R> get_const_vector(llvm::Value*, u32 pos, u32 = __builtin_LINE());

--- a/rpcs3/Emu/Cell/PPUThread.cpp
+++ b/rpcs3/Emu/Cell/PPUThread.cpp
@ -175,7 +175,7 @@ bool serialize<ppu_thread::cr_bits>(utils::serial& ar, typename ppu_thread::cr_b
 extern void ppu_initialize();
 extern void ppu_finalize(const ppu_module& info);
 extern bool ppu_initialize(const ppu_module& info, bool check_only = false, u64 file_size = 0);
-static void ppu_initialize2(class jit_compiler& jit, const ppu_module& module_part, const std::string& cache_path, const std::string& obj_name);
+static void ppu_initialize2(class jit_compiler& jit, const ppu_module& module_part, const std::string& cache_path, const std::string& obj_name, const ppu_module& whole_module);
 extern bool ppu_load_exec(const ppu_exec_object&, bool virtual_load, const std::string&, utils::serial* = nullptr);
 extern std::pair<std::shared_ptr<lv2_overlay>, CellError> ppu_load_overlay(const ppu_exec_object&, bool virtual_load, const std::string& path, s64 file_offset, utils::serial* = nullptr);
 extern void ppu_unload_prx(const lv2_prx&);
@ -3460,7 +3460,7 @@ namespace
 	// Compiled PPU module info
 	struct jit_module
 	{
-		std::vector<ppu_intrp_func_t> funcs;
+		void(*symbol_resolver)(u8*, u64) = nullptr;
 		std::shared_ptr<jit_compiler> pjit;
 		bool init = false;
 	};
@ -3502,7 +3502,6 @@ namespace
 				return;
 			}

-			to_destroy.funcs = std::move(found->second.funcs);
 			to_destroy.pjit = std::move(found->second.pjit);

 			bucket.map.erase(found);
@ -4611,6 +4610,7 @@ bool ppu_initialize(const ppu_module& info, bool check_only, u64 file_size)
 				accurate_fpcc,
 				accurate_vnan,
 				accurate_nj_mode,
+				contains_symbol_resolver,

 				__bitset_enum_max
 			};
@ -4640,6 +4640,8 @@ bool ppu_initialize(const ppu_module& info, bool check_only, u64 file_size)
 				settings += ppu_settings::accurate_vnan, settings -= ppu_settings::fixup_vnan, fmt::throw_exception("VNAN Not implemented");
 			if (g_cfg.core.ppu_use_nj_bit)
 				settings += ppu_settings::accurate_nj_mode, settings -= ppu_settings::fixup_nj_denormals, fmt::throw_exception("NJ Not implemented");
+			if (fpos >= info.funcs.size())
+				settings += ppu_settings::contains_symbol_resolver; // Avoid invalidating all modules for this purpose

 			// Write version, hash, CPU, settings
 			fmt::append(obj_name, "v6-kusa-%s-%s-%s.obj", fmt::base57(output, 16), fmt::base57(settings), jit_compiler::cpu(g_cfg.core.llvm_cpu));
@ -4724,16 +4726,18 @@ bool ppu_initialize(const ppu_module& info, bool check_only, u64 file_size)
 		{
 			atomic_t<u32>& work_cv;
 			std::vector<std::pair<std::string, ppu_module>>& workload;
+			const ppu_module& main_module;
 			const std::string& cache_path;
 			const cpu_thread* cpu;

 			std::unique_lock<decltype(jit_core_allocator::sem)> core_lock;

 			thread_op(atomic_t<u32>& work_cv, std::vector<std::pair<std::string, ppu_module>>& workload
-				, const cpu_thread* cpu, const std::string& cache_path, decltype(jit_core_allocator::sem)& sem) noexcept
+				, const cpu_thread* cpu, const ppu_module& main_module, const std::string& cache_path, decltype(jit_core_allocator::sem)& sem) noexcept

 				: work_cv(work_cv)
 				, workload(workload)
+				, main_module(main_module)
 				, cache_path(cache_path)
 				, cpu(cpu)
 			{
@ -4744,6 +4748,7 @@ bool ppu_initialize(const ppu_module& info, bool check_only, u64 file_size)
 			thread_op(const thread_op& other) noexcept
 				: work_cv(other.work_cv)
 				, workload(other.workload)
+				, main_module(other.main_module)
 				, cache_path(other.cache_path)
 				, cpu(other.cpu)
 			{
@ -4778,7 +4783,7 @@ bool ppu_initialize(const ppu_module& info, bool check_only, u64 file_size)

 					// Use another JIT instance
 					jit_compiler jit2({}, g_cfg.core.llvm_cpu, 0x1);
-					ppu_initialize2(jit2, part, cache_path, obj_name);
+					ppu_initialize2(jit2, part, cache_path, obj_name, i == workload.size() - 1 ? main_module : part);

 					ppu_log.success("LLVM: Compiled module %s", obj_name);
 				}
@ -4791,7 +4796,7 @@ bool ppu_initialize(const ppu_module& info, bool check_only, u64 file_size)
 		g_watchdog_hold_ctr++;

 		named_thread_group threads(fmt::format("PPUW.%u.", ++g_fxo->get<thread_index_allocator>().index), thread_count
-			, thread_op(work_cv, workload, cpu, cache_path, g_fxo->get<jit_core_allocator>().sem)
+			, thread_op(work_cv, workload, cpu, info, cache_path, g_fxo->get<jit_core_allocator>().sem)
 			, [&](u32 /*thread_index*/, thread_op& op)
 		{
 			// Allocate "core"
@ -4835,8 +4840,6 @@ bool ppu_initialize(const ppu_module& info, bool check_only, u64 file_size)
 		}
 	}

-	progr.reset();
-
 	if (!is_being_used_in_emulation || (cpu ? cpu->state.all_of(cpu_flag::exit) : Emu.IsStopped()))
 	{
 		return compiled_new;
@ -4851,83 +4854,39 @@ bool ppu_initialize(const ppu_module& info, bool check_only, u64 file_size)

 	const bool is_first = jit && !jit_mod.init;

+	const bool showing_only_apply_stage = !g_progr.load() && !g_progr_ptotal && !g_progr_ftotal && g_progr_ptotal.compare_and_swap_test(0, 1);
+
+	progr.emplace("Applying PPU Code...");
+
 	if (is_first)
 	{
 		jit->fin();
 	}

-	u32 index = 0;
-	u32 max_count = 0;
-
-	for (const auto& func : info.funcs)
+	if (is_first)
 	{
-		if (func.size)
-		{
-			max_count++;
-		}
+		jit_mod.symbol_resolver = reinterpret_cast<void(*)(u8*, u64)>(jit->get("__resolve_symbols"));
+	}
+	else
+	{
+		ensure(jit_mod.symbol_resolver);
 	}

-	u32 pending_progress = umax;
+	jit_mod.symbol_resolver(vm::g_exec_addr, info.segs[0].addr);

-	bool early_exit = false;
-
-	// Get and install function addresses
+	// Find a BLR-only function in order to copy it to all BLRs (some games need it)
 	for (const auto& func : info.funcs)
 	{
-		if (!func.size)
+		if (func.size == 4 && *info.get_ptr<u32>(func.addr) == ppu_instructions::BLR())
 		{
-			continue;
-		}
+			const auto name = fmt::format("__0x%x", func.addr - reloc);

-		if (cpu ? cpu->state.all_of(cpu_flag::exit) : Emu.IsStopped())
-		{
-			// Revert partially commited changes
-			jit_mod.funcs.clear();
-			BLR_func = nullptr;
-			early_exit = true;
+			BLR_func = reinterpret_cast<ppu_intrp_func_t>(jit->get(name));
 			break;
 		}
-
-		const auto name = fmt::format("__0x%x", func.addr - reloc);
-
-		// Try to locate existing function if it is not the first time
-		const auto addr = is_first ? ensure(reinterpret_cast<ppu_intrp_func_t>(jit->get(name)))
-			: reinterpret_cast<ppu_intrp_func_t>(ensure(jit_mod.funcs[index]));
-
-		jit_mod.funcs.emplace_back(addr);
-
-		if (func.size == 4 && !BLR_func && *info.get_ptr<u32>(func.addr) == ppu_instructions::BLR())
-		{
-			BLR_func = addr;
-		}
-
-		ppu_register_function_at(func.addr, 4, addr);
-
-		if (g_cfg.core.ppu_debug)
-			ppu_log.trace("Installing function %s at 0x%x: %p (reloc = 0x%x)", name, func.addr, ppu_ref(func.addr), reloc);
-
-		index++;
-
-		if (pending_progress != umax)
-		{
-			pending_progress++;
-
-			if (pending_progress == 1024)
-			{
-				pending_progress = 0;
-				g_progr_pdone++;
-			}
-		}
-		else if (!g_progr.load() && !g_progr_ptotal && !g_progr_ftotal)
-		{
-			g_progr_pdone += index / 1024;
-			g_progr_ptotal += max_count / 1024;
-			pending_progress = index % 1024;
-			progr.emplace("Applying PPU Code...");
-		}
 	}

-	if (is_first && !early_exit)
+	if (is_first)
 	{
 		jit_mod.init = true;
 	}
@ -4945,13 +4904,19 @@ bool ppu_initialize(const ppu_module& info, bool check_only, u64 file_size)
 		}
 	}

+	if (showing_only_apply_stage)
+	{
+		// Done
+		g_progr_pdone++;
+	}
+
 	return compiled_new;
 #else
 	fmt::throw_exception("LLVM is not available in this build.");
 #endif
 }

-static void ppu_initialize2(jit_compiler& jit, const ppu_module& module_part, const std::string& cache_path, const std::string& obj_name)
+static void ppu_initialize2(jit_compiler& jit, const ppu_module& module_part, const std::string& cache_path, const std::string& obj_name, const ppu_module& whole_module)
 {
 #ifdef LLVM_AVAILABLE
 	using namespace llvm;
@ -5042,6 +5007,21 @@ static void ppu_initialize2(jit_compiler& jit, const ppu_module& module_part, co
 			}
 		}

+		// Run this only in one module for all functions
+		if (&whole_module != &module_part)
+		{
+			if (const auto func = translator.GetSymbolResolver(whole_module))
+			{
+				// Run optimization passes
+				pm.run(*func);
+			}
+			else
+			{
+				Emu.Pause();
+				return;
+			}
+		}
+
 		//legacy::PassManager mpm;

 		// Remove unused functions, structs, global variables, etc
--- a/rpcs3/Emu/Cell/PPUTranslator.cpp
+++ b/rpcs3/Emu/Cell/PPUTranslator.cpp
@ -274,6 +274,125 @@ Function* PPUTranslator::Translate(const ppu_function& info)
 	return m_function;
 }

+// Builds the "__resolve_symbols" function for `info` and returns it.
+// The generated function takes (u8* exec base, u64 segment address). For every
+// function of `info` with a non-zero size it stores one 64-bit entry, made of the
+// function's address combined with the shifted segment address, into the table
+// addressed through the exec base argument.
+Function* PPUTranslator::GetSymbolResolver(const ppu_module& info)
+{
+	m_function = cast<Function>(m_module->getOrInsertFunction("__resolve_symbols", FunctionType::get(get_type<void>(), { get_type<u8*>(), get_type<u64>() }, false)).getCallee());
+
+	IRBuilder<> irb(BasicBlock::Create(m_context, "__entry", m_function));
+	m_ir = &irb;
+
+	// Function addresses are stored relative to the relocation base (if any)
+	const u64 base = m_reloc ? m_reloc->addr : 0;
+
+	m_exec = m_function->getArg(0);
+	m_seg0 = m_function->getArg(1);
+
+	const auto ftype = FunctionType::get(get_type<void>(), {
+		get_type<u8*>(), // Exec base
+		GetContextType()->getPointerTo(), // PPU context
+		get_type<u64>(), // Segment address (for PRX)
+		get_type<u8*>(), // Memory base
+		get_type<u64>(), // r0
+		get_type<u64>(), // r1
+		get_type<u64>(), // r2
+		}, false);
+
+	const auto func_ptr_type = ftype->getPointerTo();
+
+	// Store function addresses in the PPU jumptable using internal resolving instead of patching it externally,
+	// because LLVM processed the external approach extremely slowly (regression).
+	// This is done in a loop instead of being inlined because the inlined form took a tremendous amount of time to compile.
+
+	// Relative addresses and function pointers of all non-empty functions (both vectors share the same indexing)
+	std::vector<u32> vec_addrs;
+	vec_addrs.reserve(info.funcs.size());
+
+	std::vector<Constant*> func_ptrs;
+	func_ptrs.reserve(info.funcs.size());
+
+	for (const auto& f : info.funcs)
+	{
+		if (!f.size)
+		{
+			continue;
+		}
+
+		vec_addrs.push_back(f.addr - base);
+		func_ptrs.push_back(cast<Function>(m_module->getOrInsertFunction(fmt::format("__0x%x", f.addr - base), ftype).getCallee()));
+	}
+
+	if (vec_addrs.empty())
+	{
+		// Possible special case for no functions (allowing the do-while optimization)
+		m_ir->CreateRetVoid();
+		replace_intrinsics(*m_function);
+		return m_function;
+	}
+
+	const auto addr_array_type = ArrayType::get(get_type<u32>(), vec_addrs.size());
+	const auto addr_array = new GlobalVariable(*m_module, addr_array_type, false, GlobalValue::PrivateLinkage, ConstantDataArray::get(m_context, vec_addrs));
+
+	// Create the table of function pointers.
+	// Its length must equal the number of initializers: zero-sized functions were skipped above,
+	// so sizing it with info.funcs.size() would not match the initializer list.
+	const auto func_table_type = ArrayType::get(func_ptr_type, func_ptrs.size());
+	const auto init_func_table = ConstantArray::get(func_table_type, func_ptrs);
+	const auto func_table = new GlobalVariable(*m_module, func_table_type, false, GlobalVariable::PrivateLinkage, init_func_table);
+
+	const auto loop_block = BasicBlock::Create(m_context, "__loop", m_function);
+	const auto after_loop = BasicBlock::Create(m_context, "__after_loop", m_function);
+
+	m_ir->CreateBr(loop_block);
+	m_ir->SetInsertPoint(loop_block);
+
+	const auto init_index_value = m_ir->getInt64(0);
+
+	// Loop body
+	const auto body_block = BasicBlock::Create(m_context, "__body", m_function);
+
+	m_ir->CreateBr(body_block); // As do-while because vec_addrs is known to be more than 0
+	m_ir->SetInsertPoint(body_block);
+
+	const auto index_value = m_ir->CreatePHI(get_type<u64>(), 2);
+	index_value->addIncoming(init_index_value, loop_block);
+
+	// Load the relative address of the current function (element type of addr_array is u32)
+	const auto addr_ptr = m_ir->CreateGEP(addr_array_type, addr_array, {m_ir->getInt64(0), index_value});
+	const auto func_pc = ZExt(m_ir->CreateLoad(get_type<u32>(), addr_ptr), get_type<u64>());
+
+	// Load the matching function pointer (element type of func_table is func_ptr_type)
+	const auto func_slot = m_ir->CreateGEP(func_table_type, func_table, {m_ir->getInt64(0), index_value});
+	const auto faddr = m_ir->CreateLoad(func_ptr_type, func_slot);
+
+	const auto faddr_int = m_ir->CreatePtrToInt(faddr, get_type<uptr>());
+	const auto fval = m_ir->CreateOr(m_ir->CreateShl(m_seg0, 32 + 3), faddr_int);
+	const auto pos = m_ir->CreateShl(m_reloc ? m_ir->CreateAdd(func_pc, m_seg0) : func_pc, 1);
+	const auto ptr = m_ir->CreateGEP(get_type<u8>(), m_exec, pos);
+
+	// Store to jumptable
+	m_ir->CreateStore(fval, ptr);
+
+	// Increment index and branch back to loop
+	const auto post_add = m_ir->CreateAdd(index_value, m_ir->getInt64(1));
+	index_value->addIncoming(post_add, body_block);
+
+	Value* index_check = m_ir->CreateICmpULT(post_add, m_ir->getInt64(vec_addrs.size()));
+	m_ir->CreateCondBr(index_check, body_block, after_loop);
+
+	// Set insertion point to after_loop
+	m_ir->SetInsertPoint(after_loop);
+
+	m_ir->CreateRetVoid();
+
+	replace_intrinsics(*m_function);
+	return m_function;
+}
+
 Value* PPUTranslator::VecHandleNan(Value* val)
 {
 	const auto is_nan = m_ir->CreateFCmpUNO(val, val);
--- a/rpcs3/Emu/Cell/PPUTranslator.h
+++ b/rpcs3/Emu/Cell/PPUTranslator.h
@ -336,6 +336,7 @@ public:

 	// Parses PPU opcodes and translate them into LLVM IR
 	llvm::Function* Translate(const ppu_function& info);
+	llvm::Function* GetSymbolResolver(const ppu_module& info);

 	void MFVSCR(ppu_opcode_t op);
 	void MTVSCR(ppu_opcode_t op);