SPU analyser: basic function detection in Giga mode

Misc: fix EH frame registration (LLVM, non-Windows). Misc: constant-folding bitcast (cpu_translator). Misc: add syntax for LLVM arrays (cpu_translator). Misc: use function names for proper linkage (SPU LLVM). Changed function search and verification in Giga mode. Basic stack frame layout analysis. Function detection in Giga mode. Basic use of the new information in SPU LLVM. Fixed jump table compilation in SPU LLVM. Disabled a broken optimization in Accurate xfloat mode. Made compiled SPU modules position-independent in SPU LLVM. Optimizations include, but are not limited to: * Compiling SPU functions as native functions when eligible * Avoiding register context write-out * Aligned stack assumption (CWD-like instructions)
2025-01-31 12:31:45 +01:00 · 2019-05-05 16:28:41 +03:00 · 2019-05-05 16:28:41 +03:00 · 7492f335e9
commit 7492f335e9
parent fce9d6a7b8
15 changed files with 1588 additions and 492 deletions
--- a/Utilities/JIT.cpp
+++ b/Utilities/JIT.cpp
@ -474,7 +474,7 @@ struct MemoryManager : llvm::RTDyldMemoryManager
 		s_unfire.push_front(std::make_pair(addr, size));
 #endif

-		return RTDyldMemoryManager::registerEHFrames(addr, load_addr, size);
+		return RTDyldMemoryManager::registerEHFramesInProcess(addr, size);
 	}

 	void deregisterEHFrames() override
@ -508,6 +508,10 @@ struct MemoryManager2 : llvm::RTDyldMemoryManager

 	void registerEHFrames(u8* addr, u64 load_addr, std::size_t size) override
 	{
+#ifndef _WIN32
+		RTDyldMemoryManager::registerEHFramesInProcess(addr, size);
+		s_unfire.push_front(std::make_pair(addr, size));
+#endif
 	}

 	void deregisterEHFrames() override
@ -770,25 +774,6 @@ jit_compiler::~jit_compiler()
 {
 }

-bool jit_compiler::has_ssse3() const
-{
-	if (m_cpu == "generic" ||
-		m_cpu == "k8" ||
-		m_cpu == "opteron" ||
-		m_cpu == "athlon64" ||
-		m_cpu == "athlon-fx" ||
-		m_cpu == "k8-sse3" ||
-		m_cpu == "opteron-sse3" ||
-		m_cpu == "athlon64-sse3" ||
-		m_cpu == "amdfam10" ||
-		m_cpu == "barcelona")
-	{
-		return false;
-	}
-
-	return true;
-}
-
 void jit_compiler::add(std::unique_ptr<llvm::Module> module, const std::string& path)
 {
 	ObjectCache cache{path};
--- a/Utilities/JIT.h
+++ b/Utilities/JIT.h
@ -142,9 +142,6 @@ public:
 		return *m_engine;
 	}

-	// Test SSSE3 feature
-	bool has_ssse3() const;
-
 	// Add module (path to obj cache dir)
 	void add(std::unique_ptr<llvm::Module> module, const std::string& path);

--- a/rpcs3/Emu/CPU/CPUTranslator.cpp
+++ b/rpcs3/Emu/CPU/CPUTranslator.cpp
@ -9,7 +9,54 @@ cpu_translator::cpu_translator(llvm::Module* module, bool is_be)
 	, m_module(module)
 	, m_is_be(is_be)
 {
+}

+void cpu_translator::initialize(llvm::LLVMContext& context, llvm::ExecutionEngine& engine)
+{
+	m_context = context;
+	m_engine = &engine;
+
+	const auto cpu = m_engine->getTargetMachine()->getTargetCPU();
+
+	m_use_ssse3 = true;
+
+	// Test SSSE3 feature (TODO)
+	if (cpu == "generic" ||
+		cpu == "k8" ||
+		cpu == "opteron" ||
+		cpu == "athlon64" ||
+		cpu == "athlon-fx" ||
+		cpu == "k8-sse3" ||
+		cpu == "opteron-sse3" ||
+		cpu == "athlon64-sse3" ||
+		cpu == "amdfam10" ||
+		cpu == "barcelona")
+	{
+		m_use_ssse3 = false;
+	}
+}
+
+llvm::Value* cpu_translator::bitcast(llvm::Value* val, llvm::Type* type)
+{
+	uint s1 = type->getScalarSizeInBits();
+	uint s2 = val->getType()->getScalarSizeInBits();
+
+	if (type->isVectorTy())
+		s1 *= type->getVectorNumElements();
+	if (val->getType()->isVectorTy())
+		s2 *= val->getType()->getVectorNumElements();
+
+	if (s1 != s2)
+	{
+		fmt::throw_exception("cpu_translator::bitcast(): incompatible type sizes (%u vs %u)", s1, s2);
+	}
+
+	if (const auto c1 = llvm::dyn_cast<llvm::Constant>(val))
+	{
+		return verify(HERE, llvm::ConstantFoldCastOperand(llvm::Instruction::BitCast, c1, type, m_module->getDataLayout()));
+	}
+
+	return m_ir->CreateBitCast(val, type);
 }

 template <>
--- a/rpcs3/Emu/CPU/CPUTranslator.h
+++ b/rpcs3/Emu/CPU/CPUTranslator.h
@ -9,6 +9,7 @@
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Module.h"
+#include "llvm/Target/TargetMachine.h"
 #include "llvm/Analysis/ConstantFolding.h"
 #ifdef _MSC_VER
 #pragma warning(pop)
@ -19,6 +20,8 @@
 #include "../Utilities/StrFmt.h"
 #include "../Utilities/BEType.h"
 #include "../Utilities/BitField.h"
+#include "../Utilities/Log.h"
+#include "../Utilities/JIT.h"

 #include <unordered_map>
 #include <map>
@ -47,6 +50,7 @@ struct llvm_value_t
 	static constexpr bool is_sint    = false;
 	static constexpr bool is_uint    = false;
 	static constexpr bool is_float   = false;
+	static constexpr uint is_array   = false;
 	static constexpr uint is_vector  = false;
 	static constexpr uint is_pointer = false;

@ -314,6 +318,7 @@ struct llvm_value_t<T*> : llvm_value_t<T>
 	static constexpr bool is_sint    = false;
 	static constexpr bool is_uint    = false;
 	static constexpr bool is_float   = false;
+	static constexpr uint is_array   = false;
 	static constexpr uint is_vector  = false;
 	static constexpr uint is_pointer = llvm_value_t<T>::is_pointer + 1;

@ -333,6 +338,7 @@ struct llvm_value_t<T[N]> : llvm_value_t<T>
 	using base = llvm_value_t<T>;
 	using base::base;

+	static constexpr uint is_array   = 0;
 	static constexpr uint is_vector  = N;
 	static constexpr uint is_pointer = 0;

@ -342,6 +348,48 @@ struct llvm_value_t<T[N]> : llvm_value_t<T>
 	}
 };

+template <typename T, uint N>
+struct llvm_value_t<T[0][N]> : llvm_value_t<T>
+{
+	using type = T[0][N];
+	using base = llvm_value_t<T>;
+	using base::base;
+
+	static constexpr bool is_int     = false;
+	static constexpr bool is_sint    = false;
+	static constexpr bool is_uint    = false;
+	static constexpr bool is_float   = false;
+	static constexpr uint is_array   = N;
+	static constexpr uint is_vector  = false;
+	static constexpr uint is_pointer = false;
+
+	static llvm::Type* get_type(llvm::LLVMContext& context)
+	{
+		return llvm::ArrayType::get(llvm_value_t<T>::get_type(context), N);
+	}
+};
+
+template <typename T, uint V, uint N>
+struct llvm_value_t<T[V][N]> : llvm_value_t<T[V]>
+{
+	using type = T[V][N];
+	using base = llvm_value_t<T[V]>;
+	using base::base;
+
+	static constexpr bool is_int     = false;
+	static constexpr bool is_sint    = false;
+	static constexpr bool is_uint    = false;
+	static constexpr bool is_float   = false;
+	static constexpr uint is_array   = N;
+	static constexpr uint is_vector  = false;
+	static constexpr uint is_pointer = false;
+
+	static llvm::Type* get_type(llvm::LLVMContext& context)
+	{
+		return llvm::ArrayType::get(llvm_value_t<T[V]>::get_type(context), N);
+	}
+};
+
 template <typename T>
 using llvm_expr_t = std::decay_t<T>;

@ -2368,6 +2416,9 @@ protected:
 	// Module to which all generated code is output to
 	llvm::Module* m_module;

+	// Execution engine from JIT instance
+	llvm::ExecutionEngine* m_engine{};
+
 	// Endianness, affects vector element numbering (TODO)
 	bool m_is_be;

@ -2377,6 +2428,8 @@ protected:
 	// IR builder
 	llvm::IRBuilder<>* m_ir;

+	void initialize(llvm::LLVMContext& context, llvm::ExecutionEngine& engine);
+
 public:
 	// Convert a C++ type to an LLVM type (TODO: remove)
 	template <typename T>
@ -2421,6 +2474,26 @@ public:
 		return result;
 	}

+	// Call external function: provide name and function pointer
+	template <typename RT, typename... FArgs, typename... Args>
+	llvm::CallInst* call(std::string_view lame, RT(*_func)(FArgs...), Args... args)
+	{
+		static_assert(sizeof...(FArgs) == sizeof...(Args), "spu_llvm_recompiler::call(): unexpected arg number");
+		const auto type = llvm::FunctionType::get(get_type<RT>(), {args->getType()...}, false);
+		const auto func = llvm::cast<llvm::Function>(m_module->getOrInsertFunction({lame.data(), lame.size()}, type).getCallee());
+		m_engine->addGlobalMapping({lame.data(), lame.size()}, reinterpret_cast<std::uintptr_t>(_func));
+		return m_ir->CreateCall(func, {args...});
+	}
+
+	// Bitcast with immediate constant folding
+	llvm::Value* bitcast(llvm::Value* val, llvm::Type* type);
+
+	template <typename T>
+	llvm::Value* bitcast(llvm::Value* val)
+	{
+		return bitcast(val, get_type<T>());
+	}
+
 	template <typename T>
 	static llvm_placeholder_t<T> match()
 	{
--- a/rpcs3/Emu/Cell/PPUInterpreter.cpp
+++ b/rpcs3/Emu/Cell/PPUInterpreter.cpp
@ -4677,7 +4677,7 @@ bool ppu_interpreter::MTFSB0(ppu_thread& ppu, ppu_opcode_t op)
 bool ppu_interpreter::MTFSFI(ppu_thread& ppu, ppu_opcode_t op)
 {
 	const u32 bf = op.crfd * 4;
-	if (bf != 4 * 4) 
+	if (bf != 4 * 4)
 	{
 		// Do nothing on non-FPCC field (TODO)
 		LOG_WARNING(PPU, "MTFSFI(%d)", op.crfd);
--- a/rpcs3/Emu/Cell/PPUThread.cpp
+++ b/rpcs3/Emu/Cell/PPUThread.cpp
@ -1711,7 +1711,7 @@ static void ppu_initialize2(jit_compiler& jit, const ppu_module& module_part, co
 	module->setDataLayout(jit.get_engine().getTargetMachine()->createDataLayout());

 	// Initialize translator
-	PPUTranslator translator(jit.get_context(), module.get(), module_part, jit.has_ssse3());
+	PPUTranslator translator(jit.get_context(), module.get(), module_part, jit.get_engine());

 	// Define some types
 	const auto _void = Type::getVoidTy(jit.get_context());
--- a/rpcs3/Emu/Cell/PPUThread.h
+++ b/rpcs3/Emu/Cell/PPUThread.h
@ -79,7 +79,7 @@ public:
 				result |= bit;
 			}

-			return result;	
+			return result;
 		}

 		// Unpack CR bits
--- a/rpcs3/Emu/Cell/PPUTranslator.cpp
+++ b/rpcs3/Emu/Cell/PPUTranslator.cpp
@ -11,14 +11,13 @@ using namespace llvm;

 const ppu_decoder<PPUTranslator> s_ppu_decoder;

-PPUTranslator::PPUTranslator(LLVMContext& context, Module* module, const ppu_module& info, bool ssse3)
+PPUTranslator::PPUTranslator(LLVMContext& context, Module* module, const ppu_module& info, ExecutionEngine& engine)
 	: cpu_translator(module, false)
 	, m_info(info)
 	, m_pure_attr(AttributeList::get(m_context, AttributeList::FunctionIndex, {Attribute::NoUnwind, Attribute::ReadNone}))
 {
 	// Bind context
-	m_context = context;
-	m_use_ssse3 = ssse3;
+	cpu_translator::initialize(context, engine);

 	// There is no weak linkage on JIT, so let's create variables with different names for each module part
 	const u32 gsuffix = m_info.name.empty() ? info.funcs[0].addr : info.funcs[0].addr - m_info.segs[0].addr;
--- a/rpcs3/Emu/Cell/PPUTranslator.h
+++ b/rpcs3/Emu/Cell/PPUTranslator.h
@ -315,7 +315,7 @@ public:
 	// Handle compilation errors
 	void CompilationError(const std::string& error);

-	PPUTranslator(llvm::LLVMContext& context, llvm::Module* module, const ppu_module& info, bool ssse3);
+	PPUTranslator(llvm::LLVMContext& context, llvm::Module* module, const ppu_module& info, llvm::ExecutionEngine& engine);
 	~PPUTranslator();

 	// Get thread context struct type
--- a/rpcs3/Emu/Cell/RawSPUThread.cpp
+++ b/rpcs3/Emu/Cell/RawSPUThread.cpp
@ -260,7 +260,7 @@ bool spu_thread::write_reg(const u32 addr, const u32 value)

 void spu_load_exec(const spu_exec_object& elf)
 {
-	auto ls0 = vm::cast(vm::falloc(RAW_SPU_BASE_ADDR, 0x40000, vm::spu));
+	auto ls0 = vm::cast(vm::falloc(RAW_SPU_BASE_ADDR, 0x80000, vm::spu));
 	auto spu = idm::make_ptr<named_thread<spu_thread>>("TEST_SPU", ls0, nullptr, 0, "");

 	spu_thread::g_raw_spu_ctr++;
--- a/rpcs3/Emu/Cell/SPUAnalyser.h
+++ b/rpcs3/Emu/Cell/SPUAnalyser.h
@ -11,6 +11,7 @@ struct spu_itype
 	static constexpr struct branch_tag{} branch{}; // Branch Instructions
 	static constexpr struct floating_tag{} floating{}; // Floating-Point Instructions
 	static constexpr struct quadrop_tag{} _quadrop{}; // 4-op Instructions
+	static constexpr struct xfloat_tag{} xfloat{}; // Instructions producing xfloat values

 	enum type : unsigned char
 	{
@ -146,24 +147,26 @@ struct spu_itype
 		FMS, // quadrop_tag last

 		FA,
-		DFA,
 		FS,
-		DFS,
 		FM,
+		FREST,
+		FRSQEST,
+		FI,
+		CSFLT,
+		CUFLT,
+		FRDS, // xfloat_tag last
+
+		DFA,
+		DFS,
 		DFM,
 		DFMA,
 		DFNMS,
 		DFMS,
 		DFNMA,
-		FREST,
-		FRSQEST,
-		FI,
-		CSFLT,
-		CFLTS,
-		CUFLT,
-		CFLTU,
-		FRDS,
 		FESD,
+
+		CFLTS,
+		CFLTU,
 		FCEQ,
 		FCMEQ,
 		FCGT,
@ -252,6 +255,12 @@ struct spu_itype
 	{
 		return value >= MPYA && value <= FMS;
 	}
+
+	// Test for xfloat instruction
+	friend constexpr bool operator &(type value, xfloat_tag)
+	{
+		return value >= FMA && value <= FRDS;
+	}
 };

 struct spu_iflag
--- a/rpcs3/Emu/Cell/SPURecompiler.cpp
+++ b/rpcs3/Emu/Cell/SPURecompiler.cpp
--- a/rpcs3/Emu/Cell/SPURecompiler.h
+++ b/rpcs3/Emu/Cell/SPURecompiler.h
@ -44,8 +44,14 @@ class spu_runtime

 	atomic_t<u64> m_reset_count{0};

+	struct func_compare
+	{
+		// Comparison function for SPU programs
+		bool operator()(const std::vector<u32>& lhs, const std::vector<u32>& rhs) const;
+	};
+
 	// All functions
-	std::map<std::vector<u32>, spu_function_t> m_map;
+	std::map<std::vector<u32>, spu_function_t, func_compare> m_map;

 	// Debug module output location
 	std::string m_cache_path;
@ -57,8 +63,8 @@ class spu_runtime
 		u16 from;
 		u16 level;
 		u8* rel32;
-		std::map<std::vector<u32>, spu_function_t>::iterator beg;
-		std::map<std::vector<u32>, spu_function_t>::iterator end;
+		decltype(m_map)::iterator beg;
+		decltype(m_map)::iterator end;
 	};

 	// Scratch vector
@ -199,6 +205,17 @@ public:
 		s_reg_max
 	};

+	// Classify terminator instructions
+	enum class term_type : unsigned char
+	{
+		br,
+		ret,
+		call,
+		fallthrough,
+		indirect_call,
+		interrupt_call,
+	};
+
 protected:
 	std::shared_ptr<spu_runtime> m_spurt;

@ -239,12 +256,39 @@ protected:
 		// Internal use flag
 		bool analysed = false;

+		// Terminator instruction type
+		term_type terminator;
+
 		// Bit mask of the registers modified in the block
 		std::bitset<s_reg_max> reg_mod{};

+		// Set if last modifying instruction produces xfloat
+		std::bitset<s_reg_max> reg_mod_xf{};
+
+		// Set if the initial register value in this block may be xfloat
+		std::bitset<s_reg_max> reg_maybe_xf{};
+
 		// Bit mask of the registers used (before modified)
 		std::bitset<s_reg_max> reg_use{};

+		// Bit mask of the trivial (u32 x 4) constant value resulting in this block
+		std::bitset<s_reg_max> reg_const{};
+
+		// Bit mask of register saved onto the stack before use
+		std::bitset<s_reg_max> reg_save_dom{};
+
+		// Address of the function
+		u32 func = 0x40000;
+
+		// Value subtracted from $SP in this block, negative if something funny is done on $SP
+		u32 stack_sub = 0;
+
+		// Constant values associated with reg_const
+		std::array<u32, s_reg_max> reg_val32;
+
+		// Registers loaded from the stack in this block (stack offset)
+		std::array<u32, s_reg_max> reg_load_mod{};
+
 		// Single source of the reg value (dominating block address within the same chunk) or a negative number
 		std::array<u32, s_reg_max> reg_origin, reg_origin_abs;

@ -258,13 +302,27 @@ protected:
 	// Sorted basic block info
 	std::map<u32, block_info> m_bbs;

-	// Advanced block (chunk) information
-	struct chunk_info
+	// Sorted advanced block (chunk) list
+	std::basic_string<u32> m_chunks;
+
+	// Function information
+	struct func_info
 	{
+		// Size to the end of last basic block
+		u16 size = 0;
+
+		// Determines whether a function is eligible for optimizations
+		bool good = false;
+
+		// Call targets
+		std::basic_string<u32> calls;
+
+		// Register save info (stack offset)
+		std::array<u32, s_reg_max> reg_save_off{};
 	};

-	// Sorted chunk info
-	std::map<u32, chunk_info> m_chunks;
+	// Sorted function info
+	std::map<u32, func_info> m_funcs;

 	std::shared_ptr<spu_cache> m_cache;

@ -272,6 +330,9 @@ private:
 	// For private use
 	std::bitset<0x10000> m_bits;

+	// For private use
+	std::vector<u32> workload;
+
 	// Result of analyse(), to avoid copying and allocation
 	std::vector<u32> result;

--- a/rpcs3/Emu/Cell/SPUThread.h
+++ b/rpcs3/Emu/Cell/SPUThread.h
@ -579,6 +579,10 @@ public:
 	u64 block_recover = 0;
 	u64 block_failure = 0;

+	u64 saved_native_sp = 0; // Host thread's stack pointer for emulated longjmp
+
+	u8* memory_base_addr = vm::g_base_addr;
+
 	std::array<v128, 0x4000> stack_mirror; // Return address information

 	void push_snr(u32 number, u32 value);
--- a/rpcs3/Emu/Cell/lv2/sys_spu.cpp
+++ b/rpcs3/Emu/Cell/lv2/sys_spu.cpp
@ -232,7 +232,7 @@ error_code sys_spu_thread_initialize(vm::ptr<u32> thread, u32 group_id, u32 spu_
 		sys_spu.todo("Unimplemented SPU Thread options (0x%x)", option);
 	}

-	const vm::addr_t ls_addr{verify("SPU LS" HERE, vm::alloc(0x40000, vm::main))};
+	const vm::addr_t ls_addr{verify("SPU LS" HERE, vm::alloc(0x80000, vm::main))};

 	const u32 tid = idm::import<named_thread<spu_thread>>([&]()
 	{