From 7492f335e98e17b4217bf25cc04b2c9a2b51377e Mon Sep 17 00:00:00 2001
From: Nekotekina
Date: Sun, 5 May 2019 16:28:41 +0300
Subject: [PATCH] SPU analyser: basic function detection in Giga mode

Misc: fix EH frame registration (LLVM, non-Windows).
Misc: constant-folding bitcast (cpu_translator).
Misc: add syntax for LLVM arrays (cpu_translator).
Misc: use function names for proper linkage (SPU LLVM).

Changed function search and verification in Giga mode.
Basic stack frame layout analysis.
Function detection in Giga mode.
Basic use of new information in SPU LLVM.
Fixed jump table compilation in SPU LLVM.
Disable broken optimization in Accurate xfloat mode.
Make compiled SPU modules position-independent in SPU LLVM.
Optimizations include, but are not limited to:
* Compiling SPU functions as native functions when eligible
* Avoiding register context write-out
* Aligned stack assumption (CWD-like instruction)
---
 Utilities/JIT.cpp                 |   25 +-
 Utilities/JIT.h                   |    3 -
 rpcs3/Emu/CPU/CPUTranslator.cpp   |   47 +
 rpcs3/Emu/CPU/CPUTranslator.h     |   73 ++
 rpcs3/Emu/Cell/PPUInterpreter.cpp |    2 +-
 rpcs3/Emu/Cell/PPUThread.cpp      |    2 +-
 rpcs3/Emu/Cell/PPUThread.h        |    2 +-
 rpcs3/Emu/Cell/PPUTranslator.cpp  |    5 +-
 rpcs3/Emu/Cell/PPUTranslator.h    |    2 +-
 rpcs3/Emu/Cell/RawSPUThread.cpp   |    2 +-
 rpcs3/Emu/Cell/SPUAnalyser.h      |   29 +-
 rpcs3/Emu/Cell/SPURecompiler.cpp  | 1807 ++++++++++++++++++++++-------
 rpcs3/Emu/Cell/SPURecompiler.h    |   75 +-
 rpcs3/Emu/Cell/SPUThread.h        |    4 +
 rpcs3/Emu/Cell/lv2/sys_spu.cpp    |    2 +-
 15 files changed, 1588 insertions(+), 492 deletions(-)

diff --git a/Utilities/JIT.cpp b/Utilities/JIT.cpp
index 8de280bc4f..11e799ba1e 100644
--- a/Utilities/JIT.cpp
+++ b/Utilities/JIT.cpp
@@ -474,7 +474,7 @@ struct MemoryManager : llvm::RTDyldMemoryManager
 		s_unfire.push_front(std::make_pair(addr, size));
 #endif
 
-		return RTDyldMemoryManager::registerEHFrames(addr, load_addr, size);
+		return RTDyldMemoryManager::registerEHFramesInProcess(addr, size);
 	}
 
 	void deregisterEHFrames() override
@@ -508,6 +508,10 @@ struct MemoryManager2 : llvm::RTDyldMemoryManager
 
 	void registerEHFrames(u8* addr, u64 load_addr, std::size_t size) override
 	{
+#ifndef _WIN32
+		RTDyldMemoryManager::registerEHFramesInProcess(addr, size);
+		s_unfire.push_front(std::make_pair(addr, size));
+#endif
 	}
 
 	void deregisterEHFrames() override
@@ -770,25 +774,6 @@ jit_compiler::~jit_compiler()
 {
 }
 
-bool jit_compiler::has_ssse3() const
-{
-	if (m_cpu == "generic" ||
-		m_cpu == "k8" ||
-		m_cpu == "opteron" ||
-		m_cpu == "athlon64" ||
-		m_cpu == "athlon-fx" ||
-		m_cpu == "k8-sse3" ||
-		m_cpu == "opteron-sse3" ||
-		m_cpu == "athlon64-sse3" ||
-		m_cpu == "amdfam10" ||
-		m_cpu == "barcelona")
-	{
-		return false;
-	}
-
-	return true;
-}
-
 void jit_compiler::add(std::unique_ptr<llvm::Module> module, const std::string& path)
 {
 	ObjectCache cache{path};
diff --git a/Utilities/JIT.h b/Utilities/JIT.h
index eeb03c0ac5..d3028ce47e 100644
--- a/Utilities/JIT.h
+++ b/Utilities/JIT.h
@@ -142,9 +142,6 @@ public:
 		return *m_engine;
 	}
 
-	// Test SSSE3 feature
-	bool has_ssse3() const;
-
 	// Add module (path to obj cache dir)
 	void add(std::unique_ptr<llvm::Module> module, const std::string& path);
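Note on the EH frame hunks above: LLVM's RTDyldMemoryManager already knows how to register .eh_frame data with the in-process unwinder, so the patch forwards to registerEHFramesInProcess and keeps its own list (s_unfire) only for later deregistration. A minimal sketch of the same pattern, assuming LLVM's SectionMemoryManager base and an invented manager/list name:

    #include "llvm/ExecutionEngine/SectionMemoryManager.h"
    #include <cstdint>
    #include <deque>
    #include <utility>

    // Sketch: register unwind info with the current process and remember
    // each frame so it can be dropped again in deregisterEHFrames().
    struct eh_aware_manager : llvm::SectionMemoryManager
    {
        std::deque<std::pair<std::uint8_t*, std::size_t>> frames; // like s_unfire

        void registerEHFrames(std::uint8_t* addr, std::uint64_t, std::size_t size) override
        {
    #ifndef _WIN32
            registerEHFramesInProcess(addr, size); // inherited static helper
            frames.emplace_front(addr, size);
    #endif
        }

        void deregisterEHFrames() override
        {
    #ifndef _WIN32
            for (auto [addr, size] : frames)
                deregisterEHFramesInProcess(addr, size);
            frames.clear();
    #endif
        }
    };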
diff --git a/rpcs3/Emu/CPU/CPUTranslator.cpp b/rpcs3/Emu/CPU/CPUTranslator.cpp
index c77567be79..df09467a22 100644
--- a/rpcs3/Emu/CPU/CPUTranslator.cpp
+++ b/rpcs3/Emu/CPU/CPUTranslator.cpp
@@ -9,7 +9,54 @@ cpu_translator::cpu_translator(llvm::Module* module, bool is_be)
 	, m_module(module)
 	, m_is_be(is_be)
 {
+}
 
+void cpu_translator::initialize(llvm::LLVMContext& context, llvm::ExecutionEngine& engine)
+{
+	m_context = context;
+	m_engine = &engine;
+
+	const auto cpu = m_engine->getTargetMachine()->getTargetCPU();
+
+	m_use_ssse3 = true;
+
+	// Test SSSE3 feature (TODO)
+	if (cpu == "generic" ||
+		cpu == "k8" ||
+		cpu == "opteron" ||
+		cpu == "athlon64" ||
+		cpu == "athlon-fx" ||
+		cpu == "k8-sse3" ||
+		cpu == "opteron-sse3" ||
+		cpu == "athlon64-sse3" ||
+		cpu == "amdfam10" ||
+		cpu == "barcelona")
+	{
+		m_use_ssse3 = false;
+	}
+}
+
+llvm::Value* cpu_translator::bitcast(llvm::Value* val, llvm::Type* type)
+{
+	uint s1 = type->getScalarSizeInBits();
+	uint s2 = val->getType()->getScalarSizeInBits();
+
+	if (type->isVectorTy())
+		s1 *= type->getVectorNumElements();
+	if (val->getType()->isVectorTy())
+		s2 *= val->getType()->getVectorNumElements();
+
+	if (s1 != s2)
+	{
+		fmt::throw_exception("cpu_translator::bitcast(): incompatible type sizes (%u vs %u)", s1, s2);
+	}
+
+	if (const auto c1 = llvm::dyn_cast<llvm::Constant>(val))
+	{
+		return verify(HERE, llvm::ConstantFoldCastOperand(llvm::Instruction::BitCast, c1, type, m_module->getDataLayout()));
+	}
+
+	return m_ir->CreateBitCast(val, type);
 }
 
 template <>
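Note on cpu_translator::bitcast above: routing every bitcast through ConstantFoldCastOperand means constants stay constants instead of becoming IR instructions. A small usage sketch under those assumptions (values and surrounding context are illustrative; get_type and bitcast are the translator's own helpers):

    // Inside a cpu_translator-derived emitter:
    llvm::Value* splat_ones = llvm::ConstantInt::get(get_type<u32[4]>(), 0x01010101);

    // Previously this emitted a BitCast instruction even for constants; now
    // the cast is folded immediately, so the result is still an
    // llvm::Constant that later passes can inspect at compile time.
    llvm::Value* as_bytes = bitcast(splat_ones, get_type<u8[16]>());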
diff --git a/rpcs3/Emu/CPU/CPUTranslator.h b/rpcs3/Emu/CPU/CPUTranslator.h
index 848eda53f8..493048893a 100644
--- a/rpcs3/Emu/CPU/CPUTranslator.h
+++ b/rpcs3/Emu/CPU/CPUTranslator.h
@@ -9,6 +9,7 @@
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Module.h"
+#include "llvm/Target/TargetMachine.h"
 #include "llvm/Analysis/ConstantFolding.h"
 #ifdef _MSC_VER
 #pragma warning(pop)
@@ -19,6 +20,8 @@
 #include "../Utilities/StrFmt.h"
 #include "../Utilities/BEType.h"
 #include "../Utilities/BitField.h"
+#include "../Utilities/Log.h"
+#include "../Utilities/JIT.h"
 
 #include <array>
 #include <vector>
@@ -47,6 +50,7 @@ struct llvm_value_t
 	static constexpr bool is_sint = false;
 	static constexpr bool is_uint = false;
 	static constexpr bool is_float = false;
+	static constexpr uint is_array = false;
 	static constexpr uint is_vector = false;
 	static constexpr uint is_pointer = false;
 
@@ -314,6 +318,7 @@ struct llvm_value_t<T*> : llvm_value_t<T>
 	static constexpr bool is_sint = false;
 	static constexpr bool is_uint = false;
 	static constexpr bool is_float = false;
+	static constexpr uint is_array = false;
 	static constexpr uint is_vector = false;
 	static constexpr uint is_pointer = llvm_value_t<T>::is_pointer + 1;
 
@@ -333,6 +338,7 @@ struct llvm_value_t<T[N]> : llvm_value_t<T>
 	using base = llvm_value_t<T>;
 	using base::base;
 
+	static constexpr uint is_array = 0;
 	static constexpr uint is_vector = N;
 	static constexpr uint is_pointer = 0;
 
@@ -342,6 +348,48 @@ struct llvm_value_t<T[N]> : llvm_value_t<T>
 	}
 };
 
+template <typename T, uint N>
+struct llvm_value_t<T[0][N]> : llvm_value_t<T>
+{
+	using type = T[0][N];
+	using base = llvm_value_t<T>;
+	using base::base;
+
+	static constexpr bool is_int = false;
+	static constexpr bool is_sint = false;
+	static constexpr bool is_uint = false;
+	static constexpr bool is_float = false;
+	static constexpr uint is_array = N;
+	static constexpr uint is_vector = false;
+	static constexpr uint is_pointer = false;
+
+	static llvm::Type* get_type(llvm::LLVMContext& context)
+	{
+		return llvm::ArrayType::get(llvm_value_t<T>::get_type(context), N);
+	}
+};
+
+template <typename T, uint V, uint N>
+struct llvm_value_t<T[V][N]> : llvm_value_t<T[V]>
+{
+	using type = T[V][N];
+	using base = llvm_value_t<T[V]>;
+	using base::base;
+
+	static constexpr bool is_int = false;
+	static constexpr bool is_sint = false;
+	static constexpr bool is_uint = false;
+	static constexpr bool is_float = false;
+	static constexpr uint is_array = N;
+	static constexpr uint is_vector = false;
+	static constexpr uint is_pointer = false;
+
+	static llvm::Type* get_type(llvm::LLVMContext& context)
+	{
+		return llvm::ArrayType::get(llvm_value_t<T[V]>::get_type(context), N);
+	}
+};
+
 template <typename T>
 using llvm_expr_t = std::decay_t<T>;
 
@@ -2368,6 +2416,9 @@ protected:
 	// Module to which all generated code is output to
 	llvm::Module* m_module;
 
+	// Execution engine from JIT instance
+	llvm::ExecutionEngine* m_engine{};
+
 	// Endianness, affects vector element numbering (TODO)
 	bool m_is_be;
 
@@ -2377,6 +2428,8 @@ protected:
 	// IR builder
 	llvm::IRBuilder<>* m_ir;
 
+	void initialize(llvm::LLVMContext& context, llvm::ExecutionEngine& engine);
+
 public:
 	// Convert a C++ type to an LLVM type (TODO: remove)
 	template <typename T>
@@ -2421,6 +2474,26 @@ public:
 		return result;
 	}
 
+	// Call external function: provide name and function pointer
+	template <typename RT, typename... FArgs, typename... Args>
+	llvm::CallInst* call(std::string_view lame, RT(*_func)(FArgs...), Args... args)
+	{
+		static_assert(sizeof...(FArgs) == sizeof...(Args), "spu_llvm_recompiler::call(): unexpected arg number");
+		const auto type = llvm::FunctionType::get(get_type<RT>(), {args->getType()...}, false);
+		const auto func = llvm::cast<llvm::Function>(m_module->getOrInsertFunction({lame.data(), lame.size()}, type).getCallee());
+		m_engine->addGlobalMapping({lame.data(), lame.size()}, reinterpret_cast<u64>(_func));
+		return m_ir->CreateCall(func, {args...});
+	}
+
+	// Bitcast with immediate constant folding
+	llvm::Value* bitcast(llvm::Value* val, llvm::Type* type);
+
+	template <typename T>
+	llvm::Value* bitcast(llvm::Value* val)
+	{
+		return bitcast(val, get_type<T>());
+	}
+
 	template <typename T>
 	static llvm_placeholder_t<T> match()
 	{
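Note on the new call() helper above: emitting the callee by name via getOrInsertFunction and binding the address with addGlobalMapping (instead of baking a raw pointer into the IR) is what the commit message calls "use function names for proper linkage" — it keeps the emitted object position-independent and cacheable. A hedged usage sketch (the helper function and symbol name here are invented):

    // A host-side helper the generated code may call; the symbol name,
    // not the raw address, is what ends up embedded in the module.
    static void dump_pc(spu_thread* spu, u32 pc)
    {
        LOG_NOTICE(SPU, "[0x%05x] reached", pc);
    }

    // In an emitter derived from cpu_translator:
    //   call("spu_dump_pc", &dump_pc, m_thread, m_ir->getInt32(m_pos));
    // emits "call void @spu_dump_pc(...)"; addGlobalMapping() resolves the
    // symbol to &dump_pc when the object is loaded, fresh or from cache.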
diff --git a/rpcs3/Emu/Cell/PPUInterpreter.cpp b/rpcs3/Emu/Cell/PPUInterpreter.cpp
index e0c1ba6399..339e5dff47 100644
--- a/rpcs3/Emu/Cell/PPUInterpreter.cpp
+++ b/rpcs3/Emu/Cell/PPUInterpreter.cpp
@@ -4677,7 +4677,7 @@ bool ppu_interpreter::MTFSB0(ppu_thread& ppu, ppu_opcode_t op)
 bool ppu_interpreter::MTFSFI(ppu_thread& ppu, ppu_opcode_t op)
 {
 	const u32 bf = op.crfd * 4;
-	if (bf != 4 * 4) 
+	if (bf != 4 * 4)
 	{
 		// Do nothing on non-FPCC field (TODO)
 		LOG_WARNING(PPU, "MTFSFI(%d)", op.crfd);
diff --git a/rpcs3/Emu/Cell/PPUThread.cpp b/rpcs3/Emu/Cell/PPUThread.cpp
index 09affb232a..e09f8e1eef 100644
--- a/rpcs3/Emu/Cell/PPUThread.cpp
+++ b/rpcs3/Emu/Cell/PPUThread.cpp
@@ -1711,7 +1711,7 @@ static void ppu_initialize2(jit_compiler& jit, const ppu_module& module_part, co
 	module->setDataLayout(jit.get_engine().getTargetMachine()->createDataLayout());
 
 	// Initialize translator
-	PPUTranslator translator(jit.get_context(), module.get(), module_part, jit.has_ssse3());
+	PPUTranslator translator(jit.get_context(), module.get(), module_part, jit.get_engine());
 
 	// Define some types
 	const auto _void = Type::getVoidTy(jit.get_context());
diff --git a/rpcs3/Emu/Cell/PPUThread.h b/rpcs3/Emu/Cell/PPUThread.h
index f2ab2ed390..b4c7178dd5 100644
--- a/rpcs3/Emu/Cell/PPUThread.h
+++ b/rpcs3/Emu/Cell/PPUThread.h
@@ -79,7 +79,7 @@ public:
 			result |= bit;
 		}
 
-		return result; 
+		return result;
 	}
 
 	// Unpack CR bits
diff --git a/rpcs3/Emu/Cell/PPUTranslator.cpp b/rpcs3/Emu/Cell/PPUTranslator.cpp
index 4fa058b827..5531bfa835 100644
--- a/rpcs3/Emu/Cell/PPUTranslator.cpp
+++ b/rpcs3/Emu/Cell/PPUTranslator.cpp
@@ -11,14 +11,13 @@ using namespace llvm;
 
 const ppu_decoder<ppu_itype> s_ppu_decoder;
 
-PPUTranslator::PPUTranslator(LLVMContext& context, Module* module, const ppu_module& info, bool ssse3)
+PPUTranslator::PPUTranslator(LLVMContext& context, Module* module, const ppu_module& info, ExecutionEngine& engine)
 	: cpu_translator(module, false)
 	, m_info(info)
 	, m_pure_attr(AttributeList::get(m_context, AttributeList::FunctionIndex, {Attribute::NoUnwind, Attribute::ReadNone}))
 {
 	// Bind context
-	m_context = context;
-	m_use_ssse3 = ssse3;
+	cpu_translator::initialize(context, engine);
 
 	// There is no weak linkage on JIT, so let's create variables with different names for each module part
 	const u32 gsuffix = m_info.name.empty() ? info.funcs[0].addr : info.funcs[0].addr - m_info.segs[0].addr;
diff --git a/rpcs3/Emu/Cell/PPUTranslator.h b/rpcs3/Emu/Cell/PPUTranslator.h
index beb6017bd8..95d44375da 100644
--- a/rpcs3/Emu/Cell/PPUTranslator.h
+++ b/rpcs3/Emu/Cell/PPUTranslator.h
@@ -315,7 +315,7 @@ public:
 	// Handle compilation errors
 	void CompilationError(const std::string& error);
 
-	PPUTranslator(llvm::LLVMContext& context, llvm::Module* module, const ppu_module& info, bool ssse3);
+	PPUTranslator(llvm::LLVMContext& context, llvm::Module* module, const ppu_module& info, llvm::ExecutionEngine& engine);
 	~PPUTranslator();
 
 	// Get thread context struct type
diff --git a/rpcs3/Emu/Cell/RawSPUThread.cpp b/rpcs3/Emu/Cell/RawSPUThread.cpp
index aaedc088a0..9a68324234 100644
--- a/rpcs3/Emu/Cell/RawSPUThread.cpp
+++ b/rpcs3/Emu/Cell/RawSPUThread.cpp
@@ -260,7 +260,7 @@ bool spu_thread::write_reg(const u32 addr, const u32 value)
 
 void spu_load_exec(const spu_exec_object& elf)
 {
-	auto ls0 = vm::cast(vm::falloc(RAW_SPU_BASE_ADDR, 0x40000, vm::spu));
+	auto ls0 = vm::cast(vm::falloc(RAW_SPU_BASE_ADDR, 0x80000, vm::spu));
 	auto spu = idm::make_ptr<named_thread<spu_thread>>("TEST_SPU", ls0, nullptr, 0, "");
 
 	spu_thread::g_raw_spu_ctr++;
diff --git a/rpcs3/Emu/Cell/SPUAnalyser.h b/rpcs3/Emu/Cell/SPUAnalyser.h
index adaa4ebc64..65ac1d5d97 100644
--- a/rpcs3/Emu/Cell/SPUAnalyser.h
+++ b/rpcs3/Emu/Cell/SPUAnalyser.h
@@ -11,6 +11,7 @@ struct spu_itype
 	static constexpr struct branch_tag{} branch{}; // Branch Instructions
 	static constexpr struct floating_tag{} floating{}; // Floating-Point Instructions
 	static constexpr struct quadrop_tag{} _quadrop{}; // 4-op Instructions
+	static constexpr struct xfloat_tag{} xfloat{}; // Instructions producing xfloat values
 
 	enum type : unsigned char
 	{
@@ -146,24 +147,26 @@ struct spu_itype
 		FMS, // quadrop_tag last
 
 		FA,
-		DFA,
 		FS,
-		DFS,
 		FM,
+		FREST,
+		FRSQEST,
+		FI,
+		CSFLT,
+		CUFLT,
+		FRDS, // xfloat_tag last
+
+		DFA,
+		DFS,
 		DFM,
 		DFMA,
 		DFNMS,
 		DFMS,
 		DFNMA,
-		FREST,
-		FRSQEST,
-		FI,
-		CSFLT,
-		CFLTS,
-		CUFLT,
-		CFLTU,
-		FRDS,
 		FESD,
+
+		CFLTS,
+		CFLTU,
 		FCEQ,
 		FCMEQ,
 		FCGT,
@@ -252,6 +255,12 @@ struct spu_itype
 	{
 		return value >= MPYA && value <= FMS;
 	}
+
+	// Test for xfloat instruction
+	friend constexpr bool operator &(type value, xfloat_tag)
+	{
+		return value >= FMA && value <= FRDS;
+	}
 };
 
 struct spu_iflag
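Note on the spu_itype reordering above: grouping FMA..FRDS contiguously is what lets the new xfloat_tag test classify an instruction with a single range check. A correct instance of the tag test (only the wrapper name is invented):

    // One range check covers every instruction that can produce an
    // extended-precision float result after the enum reorder.
    constexpr bool produces_xfloat(spu_itype::type t)
    {
        return t & spu_itype::xfloat; // true exactly for FMA..FRDS
    }

    static_assert(produces_xfloat(spu_itype::FM));
    static_assert(!produces_xfloat(spu_itype::DFA)); // double ops are exact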
diff --git a/rpcs3/Emu/Cell/SPURecompiler.cpp b/rpcs3/Emu/Cell/SPURecompiler.cpp
index 54ef3a8cd2..abb69062cf 100644
--- a/rpcs3/Emu/Cell/SPURecompiler.cpp
+++ b/rpcs3/Emu/Cell/SPURecompiler.cpp
@@ -307,6 +307,53 @@ void spu_cache::initialize()
 	});
 }
 
+bool spu_runtime::func_compare::operator()(const std::vector<u32>& lhs, const std::vector<u32>& rhs) const
+{
+	if (lhs.empty())
+		return !rhs.empty();
+	else if (rhs.empty())
+		return false;
+
+	const u32 lhs_addr = lhs[0];
+	const u32 rhs_addr = rhs[0];
+
+	if (lhs_addr < rhs_addr)
+		return true;
+	else if (lhs_addr > rhs_addr)
+		return false;
+
+	// Select range for comparison
+	std::basic_string_view<u32> lhs_data(lhs.data() + 1, lhs.size() - 1);
+	std::basic_string_view<u32> rhs_data(rhs.data() + 1, rhs.size() - 1);
+
+	if (lhs_data.empty())
+		return !rhs_data.empty();
+	else if (rhs_data.empty())
+		return false;
+
+	if (g_cfg.core.spu_block_size == spu_block_size_type::giga)
+	{
+		// In Giga mode, compare instructions starting from the entry point first
+		lhs_data.remove_prefix(lhs_addr / 4);
+		rhs_data.remove_prefix(rhs_addr / 4);
+		const auto cmp0 = lhs_data.compare(rhs_data);
+
+		if (cmp0 < 0)
+			return true;
+		else if (cmp0 > 0)
+			return false;
+
+		// Compare from address 0 to the point before the entry point (undesirable)
+		lhs_data = {lhs.data() + 1, lhs_addr / 4};
+		rhs_data = {rhs.data() + 1, rhs_addr / 4};
+		return lhs_data < rhs_data;
+	}
+	else
+	{
+		return lhs_data < rhs_data;
+	}
+}
+
 spu_runtime::spu_runtime()
 {
 	// Initialize "empty" block
@@ -411,6 +458,12 @@ bool spu_runtime::add(u64 last_reset_count, void* _where, spu_function_t compile
 	workload.back().beg = beg;
 	workload.back().end = _end;
 
+	if (g_cfg.core.spu_block_size == spu_block_size_type::giga)
+	{
+		// In Giga mode, start comparing instructions from the actual entry point
+		verify("spu_runtime::work::level overflow" HERE), workload.back().level += func[0] / 4;
+	}
+
 	for (std::size_t i = 0; i < workload.size(); i++)
 	{
 		// Get copy of the workload info
@@ -835,7 +888,7 @@ void spu_recompiler_base::dispatch(spu_thread& spu, void*, u8* rip)
 	{
 		const v128 _info = spu.stack_mirror[(spu.gpr[1]._u32[3] & 0x3fff0) >> 4];
 
-		if (_info._u64[0] != -1)
+		if (_info._u64[0] + 1)
 		{
 			LOG_TRACE(SPU, "Called from 0x%x", _info._u32[2] - 4);
 		}
@@ -904,7 +957,7 @@ const std::vector<u32>& spu_recompiler_base::analyse(const be_t<u32>* ls, u32 en
 	m_ret_info.reset();
 
 	// Simple block entry workload list
-	std::vector<u32> workload;
+	workload.clear();
 	workload.push_back(entry_point);
 
 	std::memset(m_regmod.data(), 0xff, sizeof(m_regmod));
@@ -915,6 +968,8 @@ const std::vector<u32>& spu_recompiler_base::analyse(const be_t<u32>* ls, u32 en
 	m_preds.clear();
 	m_preds[entry_point];
 	m_bbs.clear();
+	m_chunks.clear();
+	m_funcs.clear();
 
 	// Value flags (TODO)
 	enum class vf : u32
@@ -979,7 +1034,7 @@ const std::vector<u32>& spu_recompiler_base::analyse(const be_t<u32>* ls, u32 en
 			}
 
 			// Add predecessor
-			if (m_preds[target].find_first_of(pos) == -1)
+			if (m_preds[target].find_first_of(pos) + 1 == 0)
 			{
 				m_preds[target].push_back(pos);
 			}
@@ -1885,13 +1940,36 @@ const std::vector<u32>& spu_recompiler_base::analyse(const be_t<u32>* ls, u32 en
 	{
 		block.size++;
 
+		// Decode instruction
+		const spu_opcode_t op{se_storage<u32>::swap(result[(ia - lsa) / 4 + 1])};
+
+		const auto type = s_spu_itype.decode(op.opcode);
+
+		u8 reg_save = 255;
+
+		if (type == spu_itype::STQD && op.ra == s_reg_sp && !block.reg_mod[op.rt] && !block.reg_use[op.rt])
+		{
+			// Register saved onto the stack before use
+			block.reg_save_dom[op.rt] = true;
+
+			reg_save = op.rt;
+		}
+
 		for (auto* _use : {&m_use_ra, &m_use_rb, &m_use_rc})
 		{
 			if (u8 reg = (*_use)[ia / 4]; reg < s_reg_max)
 			{
 				// Register reg use only if it happens before reg mod
 				if (!block.reg_mod[reg])
+				{
 					block.reg_use.set(reg);
+
+					if (reg_save != reg && block.reg_save_dom[reg])
+					{
+						// Register is still used after saving; probably not eligible for optimization
+						block.reg_save_dom[reg] = false;
+					}
+				}
 			}
 		}
 
@@ -1909,6 +1987,16 @@ const std::vector<u32>& spu_recompiler_base::analyse(const be_t<u32>* ls, u32 en
 		if (u8 reg = m_regmod[ia / 4]; reg < s_reg_max)
 		{
 			block.reg_mod.set(reg);
+			block.reg_mod_xf.set(reg, type & spu_itype::xfloat);
+
+			if (type == spu_itype::SELB && (block.reg_mod_xf[op.ra] || block.reg_mod_xf[op.rb]))
+				block.reg_mod_xf.set(reg);
+
+			// Possible post-dominating register load
+			if (type == spu_itype::LQD && op.ra == s_reg_sp)
+				block.reg_load_mod[reg] = ia + 1;
+			else
+				block.reg_load_mod[reg] = 0;
 		}
 
 		// Find targets (also means end of the block)
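Note on reg_save_dom/reg_load_mod above: together they encode the save-before-use / load-before-return shape of a conventional SPU prologue and epilogue (STQD of a register relative to $SP before any use, and a post-dominating LQD from the same slot). A sketch of the predicate the later verification pass effectively applies per register (helper invented; the offset encoding — 0 for "never", -1 for "conflicting" — comes from the patch itself):

    // Both offsets are stack-frame-adjusted; they must agree and be valid.
    constexpr bool save_restore_matches(u32 load_off, u32 save_off)
    {
        return save_off != 0 && save_off + 1 != 0 && load_off == save_off;
    }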
@@ -1918,6 +2006,44 @@ const std::vector<u32>& spu_recompiler_base::analyse(const be_t<u32>* ls, u32 en
 		{
 			// Copy targets
 			block.targets = tfound->second;
+
+			// Assume that the call reads and modifies all volatile registers (TODO)
+			bool is_call = false;
+			bool is_tail = false;
+			switch (type)
+			{
+			case spu_itype::BRSL:
+				is_call = spu_branch_target(ia, op.i16) != ia + 4;
+				break;
+			case spu_itype::BRASL:
+				is_call = spu_branch_target(0, op.i16) != ia + 4;
+				break;
+			case spu_itype::BISL:
+			case spu_itype::BISLED:
+				is_call = true;
+				break;
+			default:
+				break;
+			}
+
+			if (is_call)
+			{
+				for (u32 i = 0; i < s_reg_max; ++i)
+				{
+					if (i == s_reg_lr || (i >= 2 && i < s_reg_80) || i > s_reg_127)
+					{
+						if (!block.reg_mod[i])
+							block.reg_use.set(i);
+
+						if (!is_tail)
+						{
+							block.reg_mod.set(i);
+							block.reg_mod_xf[i] = false;
+						}
+					}
+				}
+			}
+
 			break;
 		}
 	}
@@ -1926,13 +2052,97 @@ const std::vector<u32>& spu_recompiler_base::analyse(const be_t<u32>* ls, u32 en
 	// Fixup block predeccessors to point to basic blocks, not last instructions
 	for (auto& bb : m_bbs)
 	{
+		const u32 addr = bb.first;
+
 		for (u32& pred : bb.second.preds)
 		{
 			pred = std::prev(m_bbs.upper_bound(pred))->first;
 		}
+
+		if (m_entry_info[addr / 4] && g_cfg.core.spu_block_size == spu_block_size_type::giga)
+		{
+			// Register empty chunk
+			m_chunks.push_back(addr);
+
+			// Register function if necessary
+			if (!m_ret_info[addr / 4])
+			{
+				m_funcs[addr];
+			}
+		}
 	}
 
-	// Fill entry map, add chunk addresses
+	// Ensure there is a function at the lowest address
+	if (g_cfg.core.spu_block_size == spu_block_size_type::giga)
+	{
+		if (auto emp = m_funcs.try_emplace(m_bbs.begin()->first); emp.second)
+		{
+			const u32 addr = emp.first->first;
+			LOG_ERROR(SPU, "[0x%05x] Fixed first function at 0x%05x", entry_point, addr);
+			m_entry_info[addr / 4] = true;
+			m_ret_info[addr / 4] = false;
+		}
+	}
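Note on the is_call handling above: a call site is assumed to read and clobber the full volatile register set, the usual conservative ABI assumption. A sketch of the register classes as this patch partitions them (the constants 0/2/80/127 mirror s_reg_lr/s_reg_80/s_reg_127; the helper name is invented, and this is not a full ABI specification):

    //   $0 ($lr)    volatile: return address
    //   $1 ($sp)    preserved: stack pointer
    //   $2..$79     volatile: arguments / scratch
    //   $80..$127   non-volatile: callee-saved
    //   above $127  analyser-internal pseudo-registers, treated as volatile
    constexpr bool clobbered_by_call(u32 i)
    {
        return i == 0 || (i >= 2 && i < 80) || i > 127;
    }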
+	// Split functions
+	while (g_cfg.core.spu_block_size == spu_block_size_type::giga)
+	{
+		bool need_repeat = false;
+
+		u32 start = 0;
+		u32 limit = 0x40000;
+
+		// Walk block list in ascending order
+		for (auto& block : m_bbs)
+		{
+			const u32 addr = block.first;
+
+			if (m_entry_info[addr / 4] && !m_ret_info[addr / 4])
+			{
+				const auto upper = m_funcs.upper_bound(addr);
+				start = addr;
+				limit = upper == m_funcs.end() ? 0x40000 : upper->first;
+			}
+
+			// Find targets that exceed [start; limit) range and make new functions from them
+			for (u32 target : block.second.targets)
+			{
+				const auto tfound = m_bbs.find(target);
+
+				if (tfound == m_bbs.end())
+				{
+					continue;
+				}
+
+				if (target < start || target >= limit)
+				{
+					if (!m_entry_info[target / 4] || m_ret_info[target / 4])
+					{
+						// Create new function entry (likely a tail call)
+						m_entry_info[target / 4] = true;
+
+						m_ret_info[target / 4] = false;
+
+						m_funcs.try_emplace(target);
+
+						if (target < limit)
+						{
+							need_repeat = true;
+						}
+					}
+				}
+			}
+
+			block.second.func = start;
+		}
+
+		if (!need_repeat)
+		{
+			break;
+		}
+	}
+
+	// Fill entry map
 	while (true)
 	{
 		workload.clear();
@@ -1951,7 +2161,7 @@ const std::vector<u32>& spu_recompiler_base::analyse(const be_t<u32>* ls, u32 en
 			// Check block predecessors
 			for (u32 pred : block.preds)
 			{
-				const u32 _old = m_bbs[pred].chunk;
+				const u32 _old = m_bbs.at(pred).chunk;
 
 				if (_old < 0x40000 && _old != _new)
 				{
@@ -2040,6 +2250,16 @@ const std::vector<u32>& spu_recompiler_base::analyse(const be_t<u32>* ls, u32 en
 					workload.push_back(target);
 					tb.analysed = true;
 				}
+
+				// Limited xfloat hint propagation (possibly TODO)
+				if (tb.chunk == block.chunk)
+				{
+					tb.reg_maybe_xf &= block.reg_mod_xf;
+				}
+				else
+				{
+					tb.reg_maybe_xf.reset();
+				}
 			}
 
 			block.reg_origin.fill(0x80000000);
@@ -2066,13 +2286,13 @@ const std::vector<u32>& spu_recompiler_base::analyse(const be_t<u32>* ls, u32 en
 				}
 			}
 
-			if (m_entry_info[addr / 4] && !m_ret_info[addr / 4])
+			if (g_cfg.core.spu_block_size == spu_block_size_type::giga && m_entry_info[addr / 4] && !m_ret_info[addr / 4])
 			{
 				for (u32 i = 0; i < s_reg_max; i++)
 				{
 					if (block.reg_origin_abs[i] == 0x80000000)
 						block.reg_origin_abs[i] = 0x40000;
-					else if (block.reg_origin_abs[i] == -1)
+					else if (block.reg_origin_abs[i] + 1 == 0)
 						block.reg_origin_abs[i] = -2;
 				}
 			}
@@ -2090,7 +2310,7 @@ const std::vector<u32>& spu_recompiler_base::analyse(const be_t<u32>* ls, u32 en
 
 			for (u32 i = 0; i < s_reg_max; i++)
 			{
-				if (tb.chunk == block.chunk && tb.reg_origin[i] != -1)
+				if (tb.chunk == block.chunk && tb.reg_origin[i] + 1)
 				{
 					const u32 expected = block.reg_mod[i] ? addr : block.reg_origin[i];
 
@@ -2107,13 +2327,7 @@ const std::vector<u32>& spu_recompiler_base::analyse(const be_t<u32>* ls, u32 en
 					}
 				}
 
-				if (tb.chunk != block.chunk && !(m_entry_info[target / 4] && m_ret_info[target / 4]))
-				{
-					// Skip call targets completely
-					continue;
-				}
-
-				if (tb.reg_origin_abs[i] != -2)
+				if (g_cfg.core.spu_block_size == spu_block_size_type::giga && tb.func == block.func && tb.reg_origin_abs[i] + 2)
 				{
 					const u32 expected = block.reg_mod[i] ? addr : block.reg_origin_abs[i];
 
@@ -2123,14 +2337,14 @@ const std::vector<u32>& spu_recompiler_base::analyse(const be_t<u32>* ls, u32 en
 					}
 					else if (tb.reg_origin_abs[i] != expected)
 					{
-						if (tb.reg_origin_abs[i] == 0x40000 || expected == -2 || expected == 0x40000)
+						if (tb.reg_origin_abs[i] == 0x40000 || expected + 2 == 0 || expected == 0x40000)
 						{
 							// Set -2: sticky value indicating possible external reg origin (0x40000)
 							tb.reg_origin_abs[i] = -2;
 
 							must_repeat |= !tb.targets.empty();
 						}
-						else if (tb.reg_origin_abs[i] != -1)
+						else if (tb.reg_origin_abs[i] + 1)
 						{
 							tb.reg_origin_abs[i] = -1;
 
 							must_repeat |= !tb.targets.empty();
@@ -2163,6 +2377,510 @@ const std::vector<u32>& spu_recompiler_base::analyse(const be_t<u32>* ls, u32 en
 		}
 	}
 
+	// Fill more block info
+	for (u32 wi = 0; wi < workload.size(); wi++)
+	{
+		if (g_cfg.core.spu_block_size != spu_block_size_type::giga)
+		{
+			break;
+		}
+
+		const u32 addr = workload[wi];
+		auto& bb = m_bbs.at(addr);
+		auto& func = m_funcs.at(bb.func);
+
+		// Update function size
+		func.size = std::max(func.size, bb.size + (addr - bb.func) / 4);
+
+		// Copy constants according to reg origin info
+		for (u32 i = 0; i < s_reg_max; i++)
+		{
+			const u32 orig = bb.reg_origin_abs[i];
+
+			if (orig < 0x40000)
+			{
+				auto& src = m_bbs.at(orig);
+				bb.reg_const[i] = src.reg_const[i];
+				bb.reg_val32[i] = src.reg_val32[i];
+			}
+
+			if (!bb.reg_save_dom[i] && bb.reg_use[i] && (orig == 0x40000 || orig + 2 == 0))
+			{
+				// Destroy offset if external reg value is used
+				func.reg_save_off[i] = -1;
+			}
+		}
+
+		if (u32 orig = bb.reg_origin_abs[s_reg_sp]; orig < 0x40000)
+		{
+			auto& prologue = m_bbs.at(orig);
+
+			// Copy stack offset (from the assumed prologue)
+			bb.stack_sub = prologue.stack_sub;
+		}
+		else if (orig > 0x40000)
+		{
+			// Unpredictable stack
+			bb.stack_sub = 0x80000000;
+		}
+
+		spu_opcode_t op;
+
+		auto last_inst = spu_itype::UNK;
+
+		for (u32 ia = addr; ia < addr + bb.size * 4; ia += 4)
+		{
+			// Decode instruction again
+			op.opcode = se_storage<u32>::swap(result[(ia - lsa) / 4 + 1]);
+			last_inst = s_spu_itype.decode(op.opcode);
+
+			// Propagate some constants
+			switch (last_inst)
+			{
+			case spu_itype::IL:
+			{
+				bb.reg_const[op.rt] = true;
+				bb.reg_val32[op.rt] = op.si16;
+				break;
+			}
+			case spu_itype::ILA:
+			{
+				bb.reg_const[op.rt] = true;
+				bb.reg_val32[op.rt] = op.i18;
+				break;
+			}
+			case spu_itype::ILHU:
+			{
+				bb.reg_const[op.rt] = true;
+				bb.reg_val32[op.rt] = op.i16 << 16;
+				break;
+			}
+			case spu_itype::ILH:
+			{
+				bb.reg_const[op.rt] = true;
+				bb.reg_val32[op.rt] = op.i16 << 16 | op.i16;
+				break;
+			}
+			case spu_itype::IOHL:
+			{
+				bb.reg_val32[op.rt] = bb.reg_val32[op.rt] | op.i16;
+				break;
+			}
+			case spu_itype::ORI:
+			{
+				bb.reg_const[op.rt] = bb.reg_const[op.ra];
+				bb.reg_val32[op.rt] = bb.reg_val32[op.ra] | op.si10;
+				break;
+			}
+			case spu_itype::OR:
+			{
+				bb.reg_const[op.rt] = bb.reg_const[op.ra] & bb.reg_const[op.rb];
+				bb.reg_val32[op.rt] = bb.reg_val32[op.ra] | bb.reg_val32[op.rb];
+				break;
+			}
+			case spu_itype::AI:
+			{
+				bb.reg_const[op.rt] = bb.reg_const[op.ra];
+				bb.reg_val32[op.rt] = bb.reg_val32[op.ra] + op.si10;
+				break;
+			}
+			case spu_itype::A:
+			{
+				bb.reg_const[op.rt] = bb.reg_const[op.ra] & bb.reg_const[op.rb];
+				bb.reg_val32[op.rt] = bb.reg_val32[op.ra] + bb.reg_val32[op.rb];
+				break;
+			}
+			case spu_itype::SFI:
+			{
+				bb.reg_const[op.rt] = bb.reg_const[op.ra];
+				bb.reg_val32[op.rt] = op.si10 - bb.reg_val32[op.ra];
+				break;
+			}
+			case spu_itype::SF:
+			{
+				bb.reg_const[op.rt] = bb.reg_const[op.ra] & bb.reg_const[op.rb];
+				bb.reg_val32[op.rt] = bb.reg_val32[op.rb] - bb.reg_val32[op.ra];
+				break;
+			}
+			case spu_itype::STQD:
+			{
+				if (op.ra == s_reg_sp && bb.stack_sub != 0x80000000 && bb.reg_save_dom[op.rt])
+				{
+					const u32 offset = 0x80000000 + op.si10 * 16 - bb.stack_sub;
+
+					if (func.reg_save_off[op.rt] == 0)
+					{
+						// Store reg save offset
+						func.reg_save_off[op.rt] = offset;
+					}
+					else if (func.reg_save_off[op.rt] != offset)
+					{
+						// Conflict of different offsets
+						func.reg_save_off[op.rt] = -1;
+					}
+				}
+
+				break;
+			}
+			case spu_itype::LQD:
+			{
+				if (op.ra == s_reg_sp && bb.stack_sub != 0x80000000 && bb.reg_load_mod[op.rt] == ia + 1)
+				{
+					// Adjust reg load offset
+					bb.reg_load_mod[op.rt] = 0x80000000 + op.si10 * 16 - bb.stack_sub;
+				}
+
+				// Clear const
+				bb.reg_const[op.rt] = false;
+				break;
+			}
+			default:
+			{
+				// Clear const if reg is modified here
+				if (u8 reg = m_regmod[ia / 4]; reg < s_reg_max)
+					bb.reg_const[reg] = false;
+				break;
+			}
+			}
+
+			// $SP is modified
+			if (m_regmod[ia / 4] == s_reg_sp)
+			{
+				if (bb.reg_const[s_reg_sp])
+				{
+					// Making $SP a constant is a funny thing too.
+					bb.stack_sub = 0x80000000;
+				}
+
+				if (bb.stack_sub != 0x80000000)
+				{
+					switch (last_inst)
+					{
+					case spu_itype::AI:
+					{
+						if (op.ra == s_reg_sp)
+							bb.stack_sub -= op.si10;
+						else
+							bb.stack_sub = 0x80000000;
+						break;
+					}
+					case spu_itype::A:
+					{
+						if (op.ra == s_reg_sp && bb.reg_const[op.rb])
+							bb.stack_sub -= bb.reg_val32[op.rb];
+						else if (op.rb == s_reg_sp && bb.reg_const[op.ra])
+							bb.stack_sub -= bb.reg_val32[op.ra];
+						else
+							bb.stack_sub = 0x80000000;
+						break;
+					}
+					case spu_itype::SF:
+					{
+						if (op.rb == s_reg_sp && bb.reg_const[op.ra])
+							bb.stack_sub += bb.reg_val32[op.ra];
+						else
+							bb.stack_sub = 0x80000000;
+						break;
+					}
+					default:
+					{
+						bb.stack_sub = 0x80000000;
+						break;
+					}
+					}
+				}
+
+				// Check for funny values.
+				if (bb.stack_sub >= 0x40000 || bb.stack_sub % 16)
+				{
+					bb.stack_sub = 0x80000000;
+				}
+			}
+		}
+
+		// Analyse terminator instruction
+		const u32 tia = addr + bb.size * 4 - 4;
+
+		switch (last_inst)
+		{
+		case spu_itype::BR:
+		case spu_itype::BRA:
+		case spu_itype::BRNZ:
+		case spu_itype::BRZ:
+		case spu_itype::BRHNZ:
+		case spu_itype::BRHZ:
+		case spu_itype::BRSL:
+		case spu_itype::BRASL:
+		{
+			const u32 target = spu_branch_target(last_inst == spu_itype::BRA || last_inst == spu_itype::BRASL ? 0 : tia, op.i16);
+
+			if (target == tia + 4)
+			{
+				bb.terminator = term_type::fallthrough;
+			}
+			else if (last_inst != spu_itype::BRSL && last_inst != spu_itype::BRASL)
+			{
+				// No-op terminator or simple branch instruction
+				bb.terminator = term_type::br;
+
+				if (target == bb.func)
+				{
+					// Recursive tail call
+					bb.terminator = term_type::ret;
+				}
+			}
+			else if (op.rt == s_reg_lr)
+			{
+				bb.terminator = term_type::call;
+			}
+			else
+			{
+				bb.terminator = term_type::interrupt_call;
+			}
+
+			break;
+		}
+		case spu_itype::BI:
+		{
+			if (op.d || op.e || bb.targets.size() == 1)
+			{
+				bb.terminator = term_type::interrupt_call;
+			}
+			else if (bb.targets.size() > 1)
+			{
+				// Jump table
+				bb.terminator = term_type::br;
+			}
+			else if (op.ra == s_reg_lr)
+			{
+				// Return (TODO)
+				bb.terminator = term_type::ret;
+			}
+			else
+			{
+				// Indirect tail call (TODO)
+				bb.terminator = term_type::interrupt_call;
+			}
+
+			break;
+		}
+		case spu_itype::BISLED:
+		case spu_itype::IRET:
+		{
+			bb.terminator = term_type::interrupt_call;
+			break;
+		}
+		case spu_itype::BISL:
+		case spu_itype::BIZ:
+		case spu_itype::BINZ:
+		case spu_itype::BIHZ:
+		case spu_itype::BIHNZ:
+		{
+			if (op.d || op.e || bb.targets.size() != 1)
+			{
+				bb.terminator = term_type::interrupt_call;
+			}
+			else if (last_inst != spu_itype::BISL && bb.targets[0] == tia + 4 && op.ra == s_reg_lr)
+			{
+				// Conditional return (TODO)
+				bb.terminator = term_type::ret;
+			}
+			else if (last_inst == spu_itype::BISL)
+			{
+				// Indirect call
+				bb.terminator = term_type::indirect_call;
+			}
+			else
+			{
+				// TODO
+				bb.terminator = term_type::interrupt_call;
+			}
+
+			break;
+		}
+		default:
+		{
+			// Normal instruction
+			bb.terminator = term_type::fallthrough;
+			break;
+		}
+		}
+	}
+
+	// Check function blocks, verify and print some reasons
+	for (auto& f : m_funcs)
+	{
+		if (g_cfg.core.spu_block_size != spu_block_size_type::giga)
+		{
+			break;
+		}
+
+		bool is_ok = true;
+
+		u32 used_stack = 0;
+
+		for (auto it = m_bbs.lower_bound(f.first); it != m_bbs.end() && it->second.func == f.first; ++it)
+		{
+			auto& bb = it->second;
+			auto& func = m_funcs.at(bb.func);
+			const u32 addr = it->first;
+			const u32 flim = bb.func + func.size * 4;
+
+			used_stack |= bb.stack_sub;
+
+			if (is_ok && bb.terminator >= term_type::indirect_call)
+			{
+				is_ok = false;
+			}
+
+			if (is_ok && bb.terminator == term_type::ret)
+			{
+				// Check $LR (alternative return registers are currently not supported)
+				if (u32 lr_orig = bb.reg_mod[s_reg_lr] ? addr : bb.reg_origin_abs[s_reg_lr]; lr_orig < 0x40000)
+				{
+					auto& src = m_bbs.at(lr_orig);
+
+					if (src.reg_load_mod[s_reg_lr] != func.reg_save_off[s_reg_lr])
+					{
+						LOG_ERROR(SPU, "Function 0x%05x: [0x%05x] $LR mismatch (src=0x%x; 0x%x vs 0x%x)", f.first, addr, lr_orig, src.reg_load_mod[s_reg_lr], func.reg_save_off[s_reg_lr]);
+						is_ok = false;
+					}
+					else if (src.reg_load_mod[s_reg_lr] == 0)
+					{
+						LOG_ERROR(SPU, "Function 0x%05x: [0x%05x] $LR modified (src=0x%x)", f.first, addr, lr_orig);
+						is_ok = false;
+					}
+				}
+				else if (lr_orig > 0x40000)
+				{
+					LOG_TODO(SPU, "Function 0x%05x: [0x%05x] $LR unpredictable (src=0x%x)", f.first, addr, lr_orig);
+					is_ok = false;
+				}
+
+				// Check $80..$127 (should be restored or unmodified)
+				for (u32 i = s_reg_80; is_ok && i <= s_reg_127; i++)
+				{
+					if (u32 orig = bb.reg_mod[i] ? addr : bb.reg_origin_abs[i]; orig < 0x40000)
+					{
+						auto& src = m_bbs.at(orig);
+
+						if (src.reg_load_mod[i] != func.reg_save_off[i])
+						{
+							LOG_ERROR(SPU, "Function 0x%05x: [0x%05x] $%u mismatch (src=0x%x; 0x%x vs 0x%x)", f.first, addr, i, orig, src.reg_load_mod[i], func.reg_save_off[i]);
+							is_ok = false;
+						}
+					}
+					else if (orig > 0x40000)
+					{
+						LOG_TODO(SPU, "Function 0x%05x: [0x%05x] $%u unpredictable (src=0x%x)", f.first, addr, i, orig);
+						is_ok = false;
+					}
+
+					if (func.reg_save_off[i] + 1 == 0)
+					{
+						LOG_ERROR(SPU, "Function 0x%05x: [0x%05x] $%u used incorrectly", f.first, addr, i);
+						is_ok = false;
+					}
+				}
+
+				// Check $SP (should be restored or unmodified)
+				if (bb.stack_sub != 0 && bb.stack_sub != 0x80000000)
+				{
+					LOG_ERROR(SPU, "Function 0x%05x: [0x%05x] return with stack frame 0x%x", f.first, addr, bb.stack_sub);
+					is_ok = false;
+				}
+			}
+
+			if (is_ok && bb.terminator == term_type::call)
+			{
+				// Check call instruction (TODO)
+				if (bb.stack_sub == 0)
+				{
+					// Call without a stack frame
+					LOG_ERROR(SPU, "Function 0x%05x: [0x%05x] frameless call", f.first, addr);
+					is_ok = false;
+				}
+			}
+
+			if (is_ok && bb.terminator == term_type::fallthrough)
+			{
+				// Can't just fall out of the function
+				if (bb.targets.size() != 1 || bb.targets[0] >= flim)
+				{
+					LOG_ERROR(SPU, "Function 0x%05x: [0x%05x] bad fallthrough to 0x%x", f.first, addr, bb.targets[0]);
+					is_ok = false;
+				}
+			}
+
+			if (is_ok && bb.stack_sub == 0x80000000)
+			{
+				LOG_ERROR(SPU, "Function 0x%05x: [0x%05x] bad stack frame", f.first, addr);
+				is_ok = false;
+			}
+
+			// Fill external function targets (calls, possibly tail calls)
+			for (u32 target : bb.targets)
+			{
+				if (target < bb.func || target >= flim || (bb.terminator == term_type::call && target == bb.func))
+				{
+					if (func.calls.find_first_of(target) + 1 == 0)
+					{
+						func.calls.push_back(target);
+					}
+				}
+			}
+		}
+
+		if (is_ok && used_stack && f.first == entry_point)
+		{
+			LOG_ERROR(SPU, "Function 0x%05x: considered possible chunk", f.first);
+			is_ok = false;
+		}
+
+		// if (is_ok && f.first > 0x1d240 && f.first < 0x1e000)
+		// {
+		// 	LOG_ERROR(SPU, "Function 0x%05x: manually disabled", f.first);
+		// 	is_ok = false;
+		// }
+
+		f.second.good = is_ok;
+	}
+
+	// Check function call graph
+	while (g_cfg.core.spu_block_size == spu_block_size_type::giga)
+	{
+		bool need_repeat = false;
+
+		for (auto& f : m_funcs)
+		{
+			if (!f.second.good)
+			{
+				continue;
+			}
+
+			for (u32 call : f.second.calls)
+			{
+				const auto ffound = std::as_const(m_funcs).find(call);
+
+				if (ffound == m_funcs.cend() || ffound->second.good == false)
+				{
+					need_repeat = true;
+
+					if (f.second.good)
+					{
+						LOG_ERROR(SPU, "Function 0x%05x: calls bad function (0x%05x)", f.first, call);
+						f.second.good = false;
+					}
+				}
+			}
+		}
+
+		if (!need_repeat)
+		{
+			break;
+		}
+	}
+
 	if (result.size() == 1)
 	{
 		// Blocks starting from 0x0 or invalid instruction won't be compiled, may need special interpreter fallback
@@ -2178,7 +2896,9 @@ void spu_recompiler_base::dump(std::string& out)
 	{
 		if (m_block_info[bb.first / 4])
 		{
-			fmt::append(out, "?: [0x%05x] %s\n", bb.first, m_entry_info[bb.first / 4] ? (m_ret_info[bb.first / 4] ? "Chunk" : "Entry") : "Block");
+			fmt::append(out, "A: [0x%05x] %s\n", bb.first, m_entry_info[bb.first / 4] ? (m_ret_info[bb.first / 4] ? "Chunk" : "Entry") : "Block");
+
+			fmt::append(out, "\tF: 0x%05x\n", bb.second.func);
 
 			for (u32 pred : bb.second.preds)
 			{
@@ -2187,12 +2907,24 @@ void spu_recompiler_base::dump(std::string& out)
 
 			for (u32 target : bb.second.targets)
 			{
-				fmt::append(out, "\t-> 0x%05x\n", target);
+				fmt::append(out, "\t-> 0x%05x%s\n", target, m_bbs.count(target) ? "" : " (null)");
 			}
 		}
 		else
 		{
-			fmt::append(out, "?: [0x%05x] ?\n", bb.first);
+			fmt::append(out, "A: [0x%05x] ?\n", bb.first);
+		}
+	}
+
+	for (auto& f : m_funcs)
+	{
+		fmt::append(out, "F: [0x%05x]%s\n", f.first, f.second.good ? " (good)" : " (bad)");
+
+		fmt::append(out, "\tN: 0x%05x\n", f.second.size * 4 + f.first);
+
+		for (u32 call : f.second.calls)
+		{
+			fmt::append(out, "\t>> 0x%05x%s\n", call, m_funcs.count(call) ? "" : " (null)");
 		}
 	}
 
@@ -2225,6 +2957,9 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator
 	// Current function chunk entry point
 	u32 m_entry;
 
+	// Main entry point offset
+	u32 m_base;
+
 	// Current function (chunk)
 	llvm::Function* m_function;
 
@@ -2237,6 +2972,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator
 	llvm::Value* m_interp_regs;
 
 	// Helpers
+	llvm::Value* m_base_pc;
 	llvm::Value* m_interp_pc_next;
 	llvm::BasicBlock* m_interp_bblock;
 
@@ -2256,11 +2992,17 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator
 	// Helper for check_state
 	llvm::GlobalVariable* m_fake_global1{};
 
+	// Function for check_state execution
+	llvm::Function* m_test_state{};
+
 	llvm::MDNode* m_md_unlikely;
 	llvm::MDNode* m_md_likely;
 
 	struct block_info
 	{
+		// Pointer to the analyser
+		spu_recompiler_base::block_info* bb{};
+
 		// Current block's entry block
 		llvm::BasicBlock* block;
 
@@ -2277,27 +3019,23 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator
 		std::array<llvm::StoreInst*, s_reg_max> store{};
 	};
 
-	struct chunk_info
+	struct function_info
 	{
+		// Standard callable chunk
+		llvm::Function* chunk{};
+
 		// Callable function
-		llvm::Function* func;
+		llvm::Function* fn{};
 
-		// Constants in non-volatile registers at the entry point
-		std::array<llvm::Constant*, s_reg_max> reg{};
-
-		chunk_info() = default;
-
-		chunk_info(llvm::Function* func)
-			: func(func)
-		{
-		}
+		// Registers possibly loaded in the entry block
+		std::array<llvm::Value*, s_reg_max> load{};
 	};
 
 	// Current block
 	block_info* m_block;
 
-	// Current chunk
-	chunk_info* m_finfo;
+	// Current function or chunk
+	function_info* m_finfo;
 
@@ -2306,52 +3044,152 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator
 	// All function chunks in current SPU compile unit
-	std::unordered_map<u32, chunk_info, value_hash<u32, 2>> m_functions;
+	std::unordered_map<u32, function_info, value_hash<u32, 2>> m_functions;
 
 	// Function chunk list for processing
 	std::vector<u32> m_function_queue;
 
-	// Helper
-	std::vector<u32> m_scan_queue;
-
 	// Add or get the function chunk
-	llvm::Function* add_function(u32 addr)
+	function_info* add_function(u32 addr)
 	{
+		// Enqueue if necessary
+		const auto empl = m_functions.try_emplace(addr);
+
+		if (!empl.second)
+		{
+			return &empl.first->second;
+		}
+
+		// Chunk function type
+		// 0. Result (void)
+		// 1. Thread context
+		// 2. Local storage pointer
+		// 3. Base PC
+		const auto chunk_type = get_ftype<void, u8*, u8*, u32>();
+
 		// Get function chunk name
 		const std::string name = fmt::format("spu-chunk-0x%05x", addr);
-		llvm::Function* result = llvm::cast<llvm::Function>(m_module->getOrInsertFunction(name, get_ftype<void, u8*, u8*, u32>()).getCallee());
+		llvm::Function* result = llvm::cast<llvm::Function>(m_module->getOrInsertFunction(name, chunk_type).getCallee());
 
 		// Set parameters
 		result->setLinkage(llvm::GlobalValue::InternalLinkage);
 		result->addAttribute(1, llvm::Attribute::NoAlias);
 		result->addAttribute(2, llvm::Attribute::NoAlias);
+		result->setCallingConv(llvm::CallingConv::GHC);
 
-		// Enqueue if necessary
-		const auto empl = m_functions.emplace(addr, chunk_info{result});
+		empl.first->second.chunk = result;
 
-		if (empl.second)
+		if (g_cfg.core.spu_block_size == spu_block_size_type::giga)
 		{
-			m_function_queue.push_back(addr);
+			// Find good real function
+			const auto ffound = m_funcs.find(addr);
 
-			if (m_block && g_cfg.core.spu_block_size != spu_block_size_type::safe)
+			if (ffound != m_funcs.end() && ffound->second.good)
 			{
-				// Initialize constants for non-volatile registers (TODO)
-				auto& regs = empl.first->second.reg;
+				// Real function type (not equal to chunk type)
+				// 4. $SP (only 32 bit value)
+				const auto func_type = get_ftype<u32[4][2], u8*, u8*, u32, u32, u32[4], u32[4]>();
 
-				for (u32 i = 80; i <= 127; i++)
-				{
-					if (auto c = llvm::dyn_cast_or_null<llvm::Constant>(m_block->reg[i]))
-					{
-						if (m_bbs.at(addr).reg_origin_abs[i] < 0x40000)
-						{
-							regs[i] = c;
-						}
-					}
-				}
+				const std::string fname = fmt::format("spu-function-0x%05x", addr);
+				llvm::Function* fn = llvm::cast<llvm::Function>(m_module->getOrInsertFunction(fname, func_type).getCallee());
+
+				fn->setLinkage(llvm::GlobalValue::InternalLinkage);
+				fn->addAttribute(1, llvm::Attribute::NoAlias);
+				fn->addAttribute(2, llvm::Attribute::NoAlias);
+				fn->setCallingConv(llvm::CallingConv::GHC);
+				empl.first->second.fn = fn;
 			}
 		}
 
-		return result;
+		// Enqueue
+		m_function_queue.push_back(addr);
+
+		return &empl.first->second;
 	}
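Note on add_function above: both the chunk and the optional real function are created with CallingConv::GHC, which in LLVM treats essentially all registers as caller-saved and supports guaranteed tail calls — the property the chunk-to-chunk tail-call chain relies on. A minimal declaration sketch under those assumptions (names and types illustrative, not the actual emitter):

    // Declaring a GHC, internally-linked, tail-callable chunk symbol the
    // way add_function() does: void(u8* thread, u8* ls, u32 base_pc).
    llvm::Function* declare_chunk(llvm::Module& m, llvm::LLVMContext& c, u32 addr)
    {
        const auto i8p = llvm::Type::getInt8PtrTy(c);
        const auto ty = llvm::FunctionType::get(llvm::Type::getVoidTy(c),
            {i8p, i8p, llvm::Type::getInt32Ty(c)}, false);
        const auto f = llvm::cast<llvm::Function>(
            m.getOrInsertFunction(fmt::format("spu-chunk-0x%05x", addr), ty).getCallee());
        f->setLinkage(llvm::GlobalValue::InternalLinkage);
        f->setCallingConv(llvm::CallingConv::GHC); // enables guaranteed tail calls
        return f;
    }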
base_pc : m_base_pc}); + call->setCallingConv(llvm::CallingConv::GHC); + call->setTailCall(); + m_ir->CreateRetVoid(); + } + + // Call the real function + void call_function(llvm::Function* fn, bool tail = false) + { + llvm::Value* lr{}; + llvm::Value* sp{}; + llvm::Value* args[2]{}; + + if (!m_finfo->fn && !m_block) + { + lr = m_ir->CreateLoad(spu_ptr(&spu_thread::gpr, +s_reg_lr, &v128::_u32, 3)); + sp = m_ir->CreateLoad(spu_ptr(&spu_thread::gpr, +s_reg_sp, &v128::_u32, 3)); + + for (u32 i = 3; i < 3 + std::size(args); i++) + { + args[i - 3] = m_ir->CreateLoad(spu_ptr(&spu_thread::gpr, +i)); + } + } + else + { + lr = m_ir->CreateExtractElement(get_reg_fixed(s_reg_lr).value, 3); + sp = m_ir->CreateExtractElement(get_reg_fixed(s_reg_sp).value, 3); + + for (u32 i = 3; i < 3 + std::size(args); i++) + { + args[i - 3] = get_reg_fixed(i).value; + } + } + + const auto _call = m_ir->CreateCall(verify(HERE, fn), {m_thread, m_lsptr, m_base_pc, sp, args[0], args[1]}); + + _call->setCallingConv(llvm::CallingConv::GHC); + + // Tail call using loaded LR value (gateway from a chunk) + if (!m_finfo->fn) + { + lr = m_ir->CreateAnd(lr, 0x3fffc); + m_ir->CreateStore(lr, spu_ptr(&spu_thread::pc)); + m_ir->CreateStore(_call, spu_ptr(&spu_thread::gpr, 3)); + m_ir->CreateBr(add_block_indirect({}, value(lr))); + } + else if (tail) + { + _call->setTailCall(); + m_ir->CreateRet(_call); + } + else + { + // TODO: initialize $LR with a constant + for (u32 i = 0; i < s_reg_max; i++) + { + if (i != s_reg_lr && i != s_reg_sp && (i < s_reg_80 || i > s_reg_127)) + { + m_block->reg[i] = m_ir->CreateLoad(init_reg_fixed(i)); + } + } + + for (u32 i = 3; i < 3 + std::size(args); i++) + { + m_block->reg[i] = m_ir->CreateExtractValue(_call, {i - 3}); + } + } + } + + // Emit return from the real function + void ret_function() + { + llvm::Value* r = llvm::ConstantAggregateZero::get(get_type()); + + for (u32 i = 3; i < 5; i++) + { + r = m_ir->CreateInsertValue(r, get_reg_fixed(i).value, {i - 3}); + } + + m_ir->CreateRet(r); } void set_function(llvm::Function* func) @@ -2359,6 +3197,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator m_function = func; m_thread = &*func->arg_begin(); m_lsptr = &*(func->arg_begin() + 1); + m_base_pc = &*(func->arg_begin() + 2); m_reg_addr.fill(nullptr); m_block = nullptr; @@ -2366,27 +3205,76 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator m_blocks.clear(); m_block_queue.clear(); m_ir->SetInsertPoint(llvm::BasicBlock::Create(m_context, "", m_function)); - m_memptr = m_ir->CreateIntToPtr(m_ir->getInt64((u64)vm::g_base_addr), get_type()); + m_memptr = m_ir->CreateLoad(spu_ptr(&spu_thread::memory_base_addr)); } // Add block with current block as a predecessor llvm::BasicBlock* add_block(u32 target) { // Check the predecessor - const bool pred_found = m_block_info[target / 4] && m_preds[target].find_first_of(m_pos) != -1; + const bool pred_found = m_block_info[target / 4] && m_preds[target].find_first_of(m_pos) + 1; if (m_blocks.empty()) { // Special case: first block, proceed normally + if (auto fn = std::exchange(m_finfo->fn, nullptr)) + { + // Create a gateway + call_function(fn, true); + + m_finfo->fn = fn; + m_function = fn; + m_thread = &*fn->arg_begin(); + m_lsptr = &*(fn->arg_begin() + 1); + m_base_pc = &*(fn->arg_begin() + 2); + m_ir->SetInsertPoint(llvm::BasicBlock::Create(m_context, "", fn)); + m_memptr = m_ir->CreateLoad(spu_ptr(&spu_thread::memory_base_addr)); + + // Load registers at the entry chunk + for (u32 i = 0; i < 
s_reg_max; i++) + { + if (i >= s_reg_80 && i <= s_reg_127) + { + // TODO + //m_finfo->load[i] = llvm::UndefValue::get(get_reg_type(i)); + } + + m_finfo->load[i] = m_ir->CreateLoad(init_reg_fixed(i)); + } + + // Load $SP + //m_finfo->load[s_reg_sp] = m_ir->CreateVectorSplat(4, &*(fn->arg_begin() + 3)); + + // Load first args + for (u32 i = 3; i < 5; i++) + { + m_finfo->load[i] = &*(fn->arg_begin() + i + 1); + } + } } - else if (m_block_info[target / 4] && m_entry_info[target / 4] && !(pred_found && m_entry == target)) + else if (m_block_info[target / 4] && m_entry_info[target / 4] && !(pred_found && m_entry == target) && (!m_finfo->fn || !m_ret_info[target / 4])) { // Generate a tail call to the function chunk const auto cblock = m_ir->GetInsertBlock(); const auto result = llvm::BasicBlock::Create(m_context, "", m_function); m_ir->SetInsertPoint(result); - m_ir->CreateStore(m_ir->getInt32(target), spu_ptr(&spu_thread::pc)); - tail(add_function(target)); + const auto pfinfo = add_function(target); + + if (pfinfo->fn) + { + // Tail call to the real function + call_function(pfinfo->fn, true); + + if (!result->getTerminator()) + ret_function(); + } + else + { + // Just a boring tail call to another chunk + update_pc(target); + tail_chunk(pfinfo->chunk); + } + m_ir->SetInsertPoint(cblock); return result; } @@ -2397,14 +3285,11 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator LOG_ERROR(SPU, "[0x%x] Predecessor not found for target 0x%x (chunk=0x%x, entry=0x%x, size=%u)", m_pos, target, m_entry, m_function_queue[0], m_size / 4); } - // Generate a patchpoint for fixed location const auto cblock = m_ir->GetInsertBlock(); - const auto ppptr = m_spurt->make_branch_patchpoint(target); const auto result = llvm::BasicBlock::Create(m_context, "", m_function); m_ir->SetInsertPoint(result); - m_ir->CreateStore(m_ir->getInt32(target), spu_ptr(&spu_thread::pc)); - const auto type = llvm::FunctionType::get(get_type(), {get_type(), get_type(), get_type()}, false)->getPointerTo(); - tail(m_ir->CreateIntToPtr(m_ir->getInt64(reinterpret_cast(ppptr ? 
ppptr : &spu_recompiler_base::dispatch)), type)); + update_pc(target); + m_ir->CreateRetVoid(); m_ir->SetInsertPoint(cblock); return result; } @@ -2541,58 +3426,12 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator llvm::Value* double_as_uint64(llvm::Value* val) { - if (llvm::isa(val)) - { - return splat(0).eval(m_ir); - } - - if (auto cv = llvm::dyn_cast(val)) - { - const f64 data[4] - { - cv->getElementAsDouble(0), - cv->getElementAsDouble(1), - cv->getElementAsDouble(2), - cv->getElementAsDouble(3) - }; - - return llvm::ConstantDataVector::get(m_context, llvm::makeArrayRef((const u64*)(const u8*)+data, 4)); - } - - if (llvm::isa(val)) - { - fmt::throw_exception("[0x%x] double_as_uint64: bad constant type", m_pos); - } - - return m_ir->CreateBitCast(val, get_type()); + return bitcast(val); } llvm::Value* uint64_as_double(llvm::Value* val) { - if (llvm::isa(val)) - { - return fsplat(0.).eval(m_ir); - } - - if (auto cv = llvm::dyn_cast(val)) - { - const u64 data[4] - { - cv->getElementAsInteger(0), - cv->getElementAsInteger(1), - cv->getElementAsInteger(2), - cv->getElementAsInteger(3) - }; - - return llvm::ConstantDataVector::get(m_context, llvm::makeArrayRef((const f64*)(const u8*)+data, 4)); - } - - if (llvm::isa(val)) - { - fmt::throw_exception("[0x%x] uint64_as_double: bad constant type", m_pos); - } - - return m_ir->CreateBitCast(val, get_type()); + return bitcast(val); } llvm::Value* double_to_xfloat(llvm::Value* val) @@ -2664,7 +3503,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator if (!reg) { // Load register value if necessary - reg = m_ir->CreateLoad(init_reg_fixed(index)); + reg = m_finfo && m_finfo->load[index] ? m_finfo->load[index] : m_ir->CreateLoad(init_reg_fixed(index)); } if (reg->getType() == get_type()) @@ -2674,79 +3513,15 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator return reg; } - const auto res = double_to_xfloat(reg); - - if (auto c = llvm::dyn_cast(res)) - { - return make_const_vector(get_const_vector(c, m_pos, 1000 + index), type); - } - - return m_ir->CreateBitCast(res, type); + return bitcast(double_to_xfloat(reg), type); } if (type == get_type()) { - if (const auto phi = llvm::dyn_cast(reg)) - { - if (phi->getNumUses()) - { - LOG_WARNING(SPU, "[0x%x] $%u: Phi has uses :(", m_pos, index); - } - else - { - const auto cblock = m_ir->GetInsertBlock(); - m_ir->SetInsertPoint(phi); - - const auto newphi = m_ir->CreatePHI(get_type(), phi->getNumIncomingValues()); - - for (u32 i = 0; i < phi->getNumIncomingValues(); i++) - { - const auto iblock = phi->getIncomingBlock(i); - m_ir->SetInsertPoint(iblock->getTerminator()); - const auto ivalue = phi->getIncomingValue(i); - newphi->addIncoming(xfloat_to_double(ivalue), iblock); - } - - for (auto& b : m_blocks) - { - if (b.second.phi[index] == phi) - { - b.second.phi[index] = newphi; - } - - if (b.second.reg[index] == phi) - { - b.second.reg[index] = newphi; - } - } - - reg = newphi; - - m_ir->SetInsertPoint(cblock); - phi->eraseFromParent(); - return reg; - } - } - - if (auto c = llvm::dyn_cast(reg)) - { - return xfloat_to_double(make_const_vector(get_const_vector(c, m_pos, 2000 + index), get_type())); - } - - return xfloat_to_double(m_ir->CreateBitCast(reg, get_type())); + return xfloat_to_double(bitcast(reg)); } - // Bitcast the constant if necessary - if (auto c = llvm::dyn_cast(reg)) - { - // TODO - if (index < 128) - { - return make_const_vector(get_const_vector(c, m_pos, index), type); - } - } - - return 
m_ir->CreateBitCast(reg, type); + return bitcast(reg, type); } template @@ -2765,7 +3540,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator if ((m_op_const_mask & index.data_mask()) != index.data_mask()) { // Update const mask if necessary - if (I >= (32 - m_interp_magn)) + if (I >= (32u - m_interp_magn)) { m_op_const_mask |= index.data_mask(); } @@ -2828,7 +3603,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator template bool match_vr(const bf_t& index, F&& pred) { - return ((match_vr(index) && pred(match_vr(index), match())) || ...); + return (( match_vr(index) ? pred(match_vr(index), match()) : false ) || ...); } template @@ -2839,28 +3614,32 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator // Extract scalar value from the preferred slot template - auto get_scalar(T&& value) + auto get_scalar(value_t value) { - using v_type = typename llvm_expr_t::type; - using e_type = std::remove_extent_t; + using e_type = std::remove_extent_t; - static_assert(sizeof(v_type) == 16 || std::is_same_v, "Unknown vector type"); + static_assert(sizeof(T) == 16 || std::is_same_v, "Unknown vector type"); + + if (auto [ok, v] = match_expr(value, vsplat(match())); ok) + { + return eval(v); + } if constexpr (sizeof(e_type) == 1) { - return extract(std::forward(value), 12); + return eval(extract(value, 12)); } else if constexpr (sizeof(e_type) == 2) { - return extract(std::forward(value), 6); + return eval(extract(value, 6)); } - else if constexpr (sizeof(e_type) == 4 || sizeof(v_type) == 32) + else if constexpr (sizeof(e_type) == 4 || sizeof(T) == 32) { - return extract(std::forward(value), 3); + return eval(extract(value, 3)); } else { - return extract(std::forward(value), 1); + return eval(extract(value, 1)); } } @@ -2895,6 +3674,15 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator _store->eraseFromParent(); } + if (m_finfo && m_finfo->fn) + { + if (index == s_reg_lr || (index >= 3 && index <= 4) || (index >= s_reg_80 && index <= s_reg_127)) + { + // Don't save some registers in true functions + return; + } + } + // Write register to the context _store = m_ir->CreateStore(is_xfloat ? double_to_xfloat(saved_value) : m_ir->CreateBitCast(value, addr->getType()->getPointerElementType()), addr); } @@ -2911,7 +3699,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator if ((m_op_const_mask & index.data_mask()) != index.data_mask()) { // Update const mask if necessary - if (I >= (32 - m_interp_magn)) + if (I >= (32u - m_interp_magn)) { m_op_const_mask |= index.data_mask(); } @@ -2933,7 +3721,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator if ((m_op_const_mask & imm.data_mask()) != imm.data_mask()) { // Update const mask if necessary - if (I >= (32 - m_interp_magn)) + if (I >= (32u - m_interp_magn)) { m_op_const_mask |= imm.data_mask(); } @@ -2966,7 +3754,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator if ((m_op_const_mask & imm.data_mask()) != imm.data_mask()) { // Update const mask if necessary - if (I >= (32 - m_interp_magn)) + if (I >= (32u - m_interp_magn)) { m_op_const_mask |= imm.data_mask(); } @@ -2974,8 +3762,8 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator // Extract signed immediate (skip sign ext if truncated anyway) value_t r; r.value = m_interp_op; - r.value = I + N == 32 || N >= r.esize ? 
r.value : m_ir->CreateShl(r.value, u64{32 - I - N}); - r.value = N == 32 || N >= r.esize ? r.value : m_ir->CreateAShr(r.value, u64{32 - N}); + r.value = I + N == 32 || N >= r.esize ? r.value : m_ir->CreateShl(r.value, u64{32u - I - N}); + r.value = N == 32 || N >= r.esize ? r.value : m_ir->CreateAShr(r.value, u64{32u - N}); r.value = I == 0 || N < r.esize ? r.value : m_ir->CreateLShr(r.value, u64{I}); if (r.esize != 32) @@ -2994,9 +3782,16 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator return eval(splat(imm)); } - void update_pc() + // Get PC for given instruction address + llvm::Value* get_pc(u32 addr) { - m_ir->CreateStore(m_ir->getInt32(m_pos), spu_ptr(&spu_thread::pc))->setVolatile(true); + return m_ir->CreateAdd(m_base_pc, m_ir->getInt32(addr - m_base)); + } + + // Update PC for current or explicitly specified instruction address + void update_pc(u32 target = -1) + { + m_ir->CreateStore(get_pc(target + 1 ? target : m_pos), spu_ptr(&spu_thread::pc), true); } // Call cpu_thread::check_state if necessary and return or continue (full check) @@ -3005,50 +3800,14 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator const auto pstate = spu_ptr(&spu_thread::state); const auto _body = llvm::BasicBlock::Create(m_context, "", m_function); const auto check = llvm::BasicBlock::Create(m_context, "", m_function); - const auto stop = llvm::BasicBlock::Create(m_context, "", m_function); m_ir->CreateCondBr(m_ir->CreateICmpEQ(m_ir->CreateLoad(pstate, true), m_ir->getInt32(0)), _body, check, m_md_likely); m_ir->SetInsertPoint(check); - m_ir->CreateStore(m_ir->getInt32(addr), spu_ptr(&spu_thread::pc)); - m_ir->CreateCondBr(m_ir->CreateLoad(m_fake_global1, true), stop, _body, m_md_unlikely); - m_ir->SetInsertPoint(stop); + update_pc(addr); m_ir->CreateStore(m_ir->getFalse(), m_fake_global1, true); m_ir->CreateBr(_body); m_ir->SetInsertPoint(_body); } - // Perform external call - template - llvm::CallInst* call(RT(*_func)(FArgs...), Args... args) - { - static_assert(sizeof...(FArgs) == sizeof...(Args), "spu_llvm_recompiler::call(): unexpected arg number"); - const auto iptr = reinterpret_cast(_func); - const auto type = llvm::FunctionType::get(get_type(), {args->getType()...}, false)->getPointerTo(); - return m_ir->CreateCall(m_ir->CreateIntToPtr(m_ir->getInt64(iptr), type), {args...}); - } - - // Perform external call and return - template - void tail(RT(*_func)(FArgs...), Args... 
args) - { - const auto inst = call(_func, args...); - inst->setTailCall(); - - if (inst->getType() == get_type()) - { - m_ir->CreateRetVoid(); - } - else - { - m_ir->CreateRet(inst); - } - } - - void tail(llvm::Value* func_ptr) - { - m_ir->CreateCall(func_ptr, {m_thread, m_lsptr, m_ir->getInt32(0)})->setTailCall(); - m_ir->CreateRetVoid(); - } - public: spu_llvm_recompiler(u8 interp_magn = 0) : spu_recompiler_base() @@ -3064,8 +3823,7 @@ public: { m_cache = fxm::get(); m_spurt = fxm::get_always(); - m_context = m_jit.get_context(); - m_use_ssse3 = m_jit.has_ssse3(); + cpu_translator::initialize(m_jit.get_context(), m_jit.get_engine()); const auto md_name = llvm::MDString::get(m_context, "branch_weights"); const auto md_low = llvm::ValueAsMetadata::get(llvm::ConstantInt::get(GetType(), 1)); @@ -3131,6 +3889,7 @@ public: } m_pos = func[0]; + m_base = func[0]; m_size = (func.size() - 1) * 4; const u32 start = m_pos * (g_cfg.core.spu_block_size != spu_block_size_type::giga); const u32 end = start + m_size; @@ -3187,14 +3946,14 @@ public: set_function(main_func); // Start compilation - - update_pc(); - const auto label_test = BasicBlock::Create(m_context, "", m_function); const auto label_diff = BasicBlock::Create(m_context, "", m_function); const auto label_body = BasicBlock::Create(m_context, "", m_function); const auto label_stop = BasicBlock::Create(m_context, "", m_function); + // Load PC, which will be the actual value of 'm_base' + m_base_pc = m_ir->CreateLoad(spu_ptr(&spu_thread::pc)); + // Emit state check const auto pstate = spu_ptr(&spu_thread::state); m_ir->CreateCondBr(m_ir->CreateICmpNE(m_ir->CreateLoad(pstate, true), m_ir->getInt32(0)), label_stop, label_test, m_md_unlikely); @@ -3210,24 +3969,40 @@ public: } else if (func.size() - 1 == 1) { - const auto cond = m_ir->CreateICmpNE(m_ir->CreateLoad(_ptr(m_lsptr, start)), m_ir->getInt32(func[1])); + const auto pu32 = m_ir->CreateBitCast(m_ir->CreateGEP(m_lsptr, m_base_pc), get_type()); + const auto cond = m_ir->CreateICmpNE(m_ir->CreateLoad(pu32), m_ir->getInt32(func[1])); m_ir->CreateCondBr(cond, label_diff, label_body, m_md_unlikely); } - else if (func.size() - 1 == 2) + else if (func.size() - 1 == 2 && g_cfg.core.spu_block_size != spu_block_size_type::giga) { - const auto cond = m_ir->CreateICmpNE(m_ir->CreateLoad(_ptr(m_lsptr, start)), m_ir->getInt64(static_cast(func[2]) << 32 | func[1])); + const auto pu64 = m_ir->CreateBitCast(m_ir->CreateGEP(m_lsptr, m_base_pc), get_type()); + const auto cond = m_ir->CreateICmpNE(m_ir->CreateLoad(pu64), m_ir->getInt64(static_cast(func[2]) << 32 | func[1])); m_ir->CreateCondBr(cond, label_diff, label_body, m_md_unlikely); } else { - const u32 starta = start & -32; - const u32 enda = ::align(end, 32); - const u32 sizea = (enda - starta) / 32; - verify(HERE), sizea; + u32 starta = start; + + // Skip holes at the beginning (giga only) + for (u32 j = start; j < end; j += 4) + { + if (!func[(j - start) / 4 + 1]) + { + starta += 4; + } + else + { + break; + } + } + + // Get actual pc corresponding to the found beginning of the data + llvm::Value* starta_pc = m_ir->CreateAnd(get_pc(starta), 0x3fffc); + llvm::Value* data_addr = m_ir->CreateGEP(m_lsptr, starta_pc); llvm::Value* acc = nullptr; - for (u32 j = starta; j < enda; j += 32) + for (u32 j = starta; j < end; j += 32) { u32 indices[8]; bool holes = false; @@ -3251,12 +4026,12 @@ public: if (!data) { - // Skip aligned holes + // Skip full-sized holes continue; } - // Load aligned code block from LS - llvm::Value* vls = 
m_ir->CreateLoad(_ptr(m_lsptr, j)); + // Load unaligned code block from LS + llvm::Value* vls = m_ir->CreateAlignedLoad(_ptr(data_addr, j - starta), 4); // Mask if necessary if (holes) @@ -3295,11 +4070,12 @@ public: const auto pbcount = spu_ptr(&spu_thread::block_counter); m_ir->CreateStore(m_ir->CreateAdd(m_ir->CreateLoad(pbcount), m_ir->getInt64(check_iterations)), pbcount); - // Call the entry function chunk - const auto entry_chunk = add_function(m_pos); - m_ir->CreateCall(entry_chunk, {m_thread, m_lsptr, m_ir->getInt32(0)})->setTailCall(); - m_ir->CreateRetVoid(); + const auto gateway = llvm::cast(m_module->getOrInsertFunction("spu_chunk_gateway", get_ftype()).getCallee()); + gateway->setLinkage(GlobalValue::InternalLinkage); + gateway->setCallingConv(CallingConv::GHC); + m_ir->CreateCall(gateway, {m_thread, m_lsptr, m_base_pc})->setCallingConv(CallingConv::GHC); + m_ir->CreateRetVoid(); m_ir->SetInsertPoint(label_stop); m_ir->CreateRetVoid(); @@ -3309,22 +4085,56 @@ public: { const auto pbfail = spu_ptr(&spu_thread::block_failure); m_ir->CreateStore(m_ir->CreateAdd(m_ir->CreateLoad(pbfail), m_ir->getInt64(1)), pbfail); - tail(&spu_recompiler_base::dispatch, m_thread, m_ir->getInt32(0), main_arg2); + call("spu_dispatch", &spu_recompiler_base::dispatch, m_thread, m_ir->getInt32(0), main_arg2)->setTailCall(); + m_ir->CreateRetVoid(); } else { m_ir->CreateUnreachable(); } + set_function(gateway); + + // Save host thread's stack pointer in the gateway + const auto native_sp = spu_ptr(&spu_thread::saved_native_sp); + const auto rsp_name = MetadataAsValue::get(m_context, MDNode::get(m_context, {MDString::get(m_context, "rsp")})); + m_ir->CreateStore(m_ir->CreateCall(get_intrinsic(Intrinsic::read_register), {rsp_name}), native_sp); + + // Call the entry function chunk + const auto entry_chunk = add_function(m_pos); + tail_chunk(entry_chunk->chunk); + + // Longjmp analogue (restore saved host thread's stack pointer) + const auto escape = llvm::cast(m_module->getOrInsertFunction("spu_escape", get_ftype()).getCallee()); + escape->setLinkage(GlobalValue::InternalLinkage); + m_ir->SetInsertPoint(BasicBlock::Create(m_context, "", escape)); + const auto load_sp = m_ir->CreateLoad(_ptr(&*escape->arg_begin(), ::offset32(&spu_thread::saved_native_sp))); + m_ir->CreateCall(get_intrinsic(Intrinsic::write_register), {rsp_name, load_sp}); + m_ir->CreateRetVoid(); + + // Function that executes check_state and escapes if necessary + m_test_state = llvm::cast(m_module->getOrInsertFunction("spu_test_state", get_ftype()).getCallee()); + m_test_state->setLinkage(GlobalValue::InternalLinkage); + m_test_state->setCallingConv(CallingConv::PreserveAll); + m_ir->SetInsertPoint(BasicBlock::Create(m_context, "", m_test_state)); + const auto escape_yes = BasicBlock::Create(m_context, "", m_test_state); + const auto escape_no = BasicBlock::Create(m_context, "", m_test_state); + m_ir->CreateCondBr(call("spu_exec_check_state", &exec_check_state, &*m_test_state->arg_begin()), escape_yes, escape_no); + m_ir->SetInsertPoint(escape_yes); + m_ir->CreateCall(escape, {&*m_test_state->arg_begin()}); + m_ir->CreateRetVoid(); + m_ir->SetInsertPoint(escape_no); + m_ir->CreateRetVoid(); + // Create function table (uninitialized) - m_function_table = new llvm::GlobalVariable(*m_module, llvm::ArrayType::get(entry_chunk->getType(), m_size / 4), true, llvm::GlobalValue::InternalLinkage, nullptr); + m_function_table = new llvm::GlobalVariable(*m_module, llvm::ArrayType::get(entry_chunk->chunk->getType(), m_size / 4), true, 
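// ---------------------------------------------------------------------------
// [Editor's note] Illustrative sketch, not part of this patch. The gateway /
// spu_escape pair above is a hand-rolled setjmp/longjmp: the gateway saves the
// host rsp into spu_thread::saved_native_sp with llvm.read_register, and
// spu_escape restores it with llvm.write_register, unwinding any depth of
// GHC-convention tail calls in one step. A portable model of that control
// flow using the standard facilities (function names are hypothetical):
#include <csetjmp>
#include <cstdio>

static std::jmp_buf s_gateway_env; // plays the role of saved_native_sp

static void spu_escape_model() // analogue of the generated spu_escape
{
	std::longjmp(s_gateway_env, 1);
}

static void run_chunks() // stands in for an arbitrarily deep chain of chunks
{
	spu_escape_model(); // e.g. a state check decided to leave the JIT'd code
}

int main()
{
	if (setjmp(s_gateway_env) == 0)
	{
		run_chunks(); // gateway body: save context, enter the first chunk
	}

	std::puts("returned to the gateway via escape");
}
// ---------------------------------------------------------------------------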
llvm::GlobalValue::InternalLinkage, nullptr); // Create function chunks for (std::size_t fi = 0; fi < m_function_queue.size(); fi++) { // Initialize function info m_entry = m_function_queue[fi]; - set_function(m_functions[m_entry].func); + set_function(m_functions[m_entry].chunk); m_finfo = &m_functions[m_entry]; m_ir->CreateBr(add_block(m_entry)); @@ -3337,18 +4147,21 @@ public: m_ir->SetInsertPoint(m_block->block); auto& bb = m_bbs.at(baddr); bool need_check = false; + m_block->bb = &bb; if (bb.preds.size()) { // Initialize registers and build PHI nodes if necessary for (u32 i = 0; i < s_reg_max; i++) { - const u32 src = bb.reg_origin[i]; + const u32 src = m_finfo->fn ? bb.reg_origin_abs[i] : bb.reg_origin[i]; - if (src == -1) + if (src > 0x40000) { - // TODO: type - const auto _phi = m_ir->CreatePHI(get_reg_type(i), ::size32(bb.preds)); + // Use the xfloat hint to create 256-bit (4x double) PHI + llvm::Type* type = g_cfg.core.spu_accurate_xfloat && bb.reg_maybe_xf[i] ? get_type() : get_reg_type(i); + + const auto _phi = m_ir->CreatePHI(type, ::size32(bb.preds), fmt::format("phi0x%05x_r%u", baddr, i)); m_block->phi[i] = _phi; m_block->reg[i] = _phi; @@ -3369,22 +4182,20 @@ public: if (!value) { // Value hasn't been loaded yet - value = m_finfo->reg[i] ? m_finfo->reg[i] : m_ir->CreateLoad(regptr); + value = m_finfo && m_finfo->load[i] ? m_finfo->load[i] : m_ir->CreateLoad(regptr); } - if (value->getType() == get_type()) + if (value->getType() == get_type() && type != get_type()) { value = double_to_xfloat(value); } - else if (i < 128 && llvm::isa(value)) + else if (value->getType() != get_type() && type == get_type()) { - // Bitcast the constant - value = make_const_vector(get_const_vector(llvm::cast(value), baddr, i), _phi->getType()); + value = xfloat_to_double(bitcast(value)); } else { - // Ensure correct value type - value = m_ir->CreateBitCast(value, _phi->getType()); + value = bitcast(value, _phi->getType()); } m_ir->SetInsertPoint(cblock); @@ -3402,7 +4213,7 @@ public: const auto regptr = init_reg_fixed(i); const auto cblock = m_ir->GetInsertBlock(); m_ir->SetInsertPoint(m_function->getEntryBlock().getTerminator()); - const auto value = m_finfo->reg[i] ? m_finfo->reg[i] : m_ir->CreateLoad(regptr); + const auto value = m_finfo && m_finfo->load[i] ? 
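// ---------------------------------------------------------------------------
// [Editor's note] Illustrative sketch, not part of this patch; assumes LLVM
// development headers. It mirrors the CreatePHI/addIncoming pattern used per
// register above, where the PHI type is either the register type or 4 x double
// when the xfloat hint is set. Minimal two-predecessor merge:
#include <llvm/IR/IRBuilder.h>
#include <llvm/IR/Module.h>
#include <llvm/IR/Verifier.h>

llvm::Function* phi_example(llvm::Module& m)
{
	auto& ctx = m.getContext();
	const auto fty = llvm::FunctionType::get(llvm::Type::getInt32Ty(ctx), {llvm::Type::getInt1Ty(ctx)}, false);
	const auto f = llvm::Function::Create(fty, llvm::Function::InternalLinkage, "phi_example", &m);

	const auto entry = llvm::BasicBlock::Create(ctx, "entry", f);
	const auto bb_a = llvm::BasicBlock::Create(ctx, "a", f);
	const auto bb_b = llvm::BasicBlock::Create(ctx, "b", f);
	const auto merge = llvm::BasicBlock::Create(ctx, "merge", f);

	llvm::IRBuilder<> ir(entry);
	ir.CreateCondBr(&*f->arg_begin(), bb_a, bb_b);
	ir.SetInsertPoint(bb_a);
	ir.CreateBr(merge);
	ir.SetInsertPoint(bb_b);
	ir.CreateBr(merge);

	// Reserve one slot per predecessor, then fill in each incoming edge
	ir.SetInsertPoint(merge);
	const auto phi = ir.CreatePHI(llvm::Type::getInt32Ty(ctx), 2, "r3");
	phi->addIncoming(ir.getInt32(1), bb_a);
	phi->addIncoming(ir.getInt32(2), bb_b);
	ir.CreateRet(phi);

	llvm::verifyFunction(*f);
	return f;
}
// ---------------------------------------------------------------------------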
m_finfo->load[i] : m_ir->CreateLoad(regptr); m_ir->SetInsertPoint(cblock); _phi->addIncoming(value, &m_function->getEntryBlock()); } @@ -3421,10 +4232,9 @@ public: LOG_ERROR(SPU, "[0x%05x] Value not found ($%u from 0x%05x)", baddr, i, src); } } - else if (baddr == m_entry) + else { - // Passthrough constant from a different chunk (will be removed in future) - m_block->reg[i] = m_finfo->reg[i]; + m_block->reg[i] = m_finfo->load[i]; } } @@ -3491,7 +4301,7 @@ public: { const auto tfound = m_targets.find(m_pos); - if (tfound == m_targets.end() || tfound->second.find_first_of(target) == -1) + if (tfound == m_targets.end() || tfound->second.find_first_of(target) + 1 == 0) { LOG_ERROR(SPU, "Unregistered fallthrough to 0x%x (chunk=0x%x, entry=0x%x)", target, m_entry, m_function_queue[0]); } @@ -3512,8 +4322,9 @@ public: std::vector chunks; chunks.reserve(m_size / 4); - const auto null = cast(module->getOrInsertFunction("spu-null", get_ftype()).getCallee()); + const auto null = cast(module->getOrInsertFunction("spu-null", entry_chunk->chunk->getFunctionType()).getCallee()); null->setLinkage(llvm::GlobalValue::InternalLinkage); + null->setCallingConv(llvm::CallingConv::GHC); set_function(null); m_ir->CreateRetVoid(); @@ -3523,29 +4334,14 @@ public: if (found == m_functions.end()) { - if (m_entry_info[i / 4]) - { - LOG_ERROR(SPU, "[0x%x] Function chunk not compiled: 0x%x", func[0], i); - } - chunks.push_back(null); continue; } - chunks.push_back(found->second.func); - - // If a chunk has incoming constants, we can't add it to the function table (TODO) - for (const auto c : found->second.reg) - { - if (c != nullptr) - { - chunks.back() = null; - break; - } - } + chunks.push_back(found->second.chunk); } - m_function_table->setInitializer(llvm::ConstantArray::get(llvm::ArrayType::get(entry_chunk->getType(), m_size / 4), chunks)); + m_function_table->setInitializer(llvm::ConstantArray::get(llvm::ArrayType::get(entry_chunk->chunk->getType(), m_size / 4), chunks)); } else { @@ -3566,44 +4362,31 @@ public: for (const auto& func : m_functions) { - const auto f = func.second.func; + const auto f = func.second.fn ? 
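// ---------------------------------------------------------------------------
// [Editor's note] Illustrative sketch, not part of this patch. The pass below
// relies on a marker pattern: during translation a volatile store to a fake
// global stands in for a state check, and once spu_test_state exists the
// marker is rewritten into a real call. A minimal version of such a fixup
// pass (function name and simplifications are the editor's; the real pass
// additionally checks the stored operand):
#include <llvm/IR/Function.h>
#include <llvm/IR/Instructions.h>
#include <llvm/IR/GlobalVariable.h>

void replace_markers(llvm::Function& f, llvm::GlobalVariable* marker, llvm::FunctionCallee test_state)
{
	for (auto& bb : f)
	{
		for (auto it = bb.begin(); it != bb.end();)
		{
			const auto si = llvm::dyn_cast<llvm::StoreInst>(&*it++);

			if (si && si->getPointerOperand() == marker)
			{
				// Pass the function's first argument (the thread context)
				llvm::CallInst::Create(test_state, {&*f.arg_begin()}, "", si);
				si->eraseFromParent();
			}
		}
	}
}
// ---------------------------------------------------------------------------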
func.second.fn : func.second.chunk; pm.run(*f); for (auto& bb : *f) { for (auto& i : bb) { - // Replace volatile fake load with check_state call - if (auto li = dyn_cast(&i); li && li->getOperand(0) == m_fake_global1) - { - m_ir->SetInsertPoint(bb.getTerminator()); - li->replaceAllUsesWith(call(&exec_check_state, &*f->arg_begin())); - li->eraseFromParent(); - break; - } - - // Replace volatile fake store with return + // Replace volatile fake store with spu_test_state call if (auto si = dyn_cast(&i); si && si->getOperand(1) == m_fake_global1) { - const auto br = bb.getTerminator(); + m_ir->SetInsertPoint(si); - for (auto& j : *br->getSuccessor(0)) + CallInst* ci{}; + if (si->getOperand(0) == m_ir->getFalse()) { - // Cleanup PHI nodes if exist - if (auto phi = dyn_cast(&j)) - { - phi->removeIncomingValue(&bb, false); - } - else - { - break; - } + ci = m_ir->CreateCall(m_test_state, {&*f->arg_begin()}); + ci->setCallingConv(CallingConv::PreserveAll); + } + else + { + continue; } - m_ir->SetInsertPoint(bb.getTerminator()); - m_ir->CreateRetVoid(); + si->replaceAllUsesWith(ci); si->eraseFromParent(); - br->eraseFromParent(); break; } } @@ -3615,7 +4398,6 @@ public: m_block_queue.clear(); m_functions.clear(); m_function_queue.clear(); - m_scan_queue.clear(); m_function_table = nullptr; std::string log; @@ -3752,8 +4534,13 @@ public: // Pinned constant, address of first register m_interp_regs = _ptr(m_thread, get_reg_offset(0)); + // Save host thread's stack pointer + const auto native_sp = spu_ptr(&spu_thread::saved_native_sp); + const auto rsp_name = MetadataAsValue::get(m_context, MDNode::get(m_context, {MDString::get(m_context, "rsp")})); + m_ir->CreateStore(m_ir->CreateCall(get_intrinsic(Intrinsic::read_register), {rsp_name}), native_sp); + // Decode (shift) and load function pointer - const auto first = m_ir->CreateLoad(m_ir->CreateGEP(m_ir->CreateBitCast(m_interp_table, if_pptr), m_ir->CreateLShr(m_interp_op, 32 - m_interp_magn))); + const auto first = m_ir->CreateLoad(m_ir->CreateGEP(m_ir->CreateBitCast(m_interp_table, if_pptr), m_ir->CreateLShr(m_interp_op, 32u - m_interp_magn))); const auto call0 = m_ir->CreateCall(first, {m_lsptr, m_thread, m_interp_pc, m_interp_op, m_interp_table, m_interp_7f0, m_interp_regs}); call0->setCallingConv(CallingConv::GHC); m_ir->CreateRetVoid(); @@ -3787,7 +4574,7 @@ public: for (u32 i = 0; i < 1u << m_interp_magn;) { // Fake opcode - const u32 op = i << (32 - m_interp_magn); + const u32 op = i << (32u - m_interp_magn); // Instruction type const auto itype = s_spu_itype.decode(op); @@ -3803,7 +4590,7 @@ public: else { // Inject const mask into function name - fmt::append(fname, "_%X", (i & (m_op_const_mask >> (32 - m_interp_magn))) | (1u << m_interp_magn)); + fmt::append(fname, "_%X", (i & (m_op_const_mask >> (32u - m_interp_magn))) | (1u << m_interp_magn)); } // Decode instruction name, access function @@ -3825,6 +4612,7 @@ public: m_interp_regs = &*(f->arg_begin() + 6); m_ir->SetInsertPoint(BasicBlock::Create(m_context, "", f)); + m_memptr = m_ir->CreateLoad(spu_ptr(&spu_thread::memory_base_addr)); switch (itype) { @@ -3892,14 +4680,14 @@ public: const auto next_pc = itype & spu_itype::branch ? 
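// ---------------------------------------------------------------------------
// [Editor's note] Illustrative sketch, not part of this patch. The interpreter
// entry above dispatches by shifting the big-endian opcode right so that only
// its top m_interp_magn bits remain, then indexing a table of handlers and
// tail-calling through it (GHC convention keeps the chain cheap). Plain-C++
// model; the concrete value of 'magn' here is an assumption:
#include <cstdint>

static constexpr unsigned magn = 11; // table keyed on the top 11 opcode bits
using handler = void(*)(std::uint32_t op);

void dispatch(const handler (&table)[1u << magn], std::uint32_t op)
{
	// Same shift as CreateLShr(m_interp_op, 32u - m_interp_magn)
	table[op >> (32u - magn)](op);
}
// ---------------------------------------------------------------------------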
m_interp_pc : m_interp_pc_next; const auto be32_op = m_ir->CreateLoad(m_ir->CreateBitCast(m_ir->CreateGEP(m_lsptr, m_ir->CreateZExt(next_pc, get_type())), get_type())); const auto next_op = m_ir->CreateCall(get_intrinsic(Intrinsic::bswap), {be32_op}); - const auto next_if = m_ir->CreateLoad(m_ir->CreateGEP(m_ir->CreateBitCast(m_interp_table, if_pptr), m_ir->CreateLShr(next_op, 32 - m_interp_magn))); + const auto next_if = m_ir->CreateLoad(m_ir->CreateGEP(m_ir->CreateBitCast(m_interp_table, if_pptr), m_ir->CreateLShr(next_op, 32u - m_interp_magn))); llvm::cast(next_if)->setVolatile(true); if (!(itype & spu_itype::branch)) { if (check) { - call(&interp_check, m_thread, m_ir->getFalse()); + call("spu_interp_check", &interp_check, m_thread, m_ir->getFalse()); } // Normal instruction. @@ -3907,7 +4695,7 @@ public: if (check && !m_ir->GetInsertBlock()->getTerminator()) { - call(&interp_check, m_thread, m_ir->getTrue()); + call("spu_interp_check", &interp_check, m_thread, m_ir->getTrue()); } m_interp_pc = m_interp_pc_next; @@ -4048,14 +4836,16 @@ public: template void fall(spu_opcode_t op) { + std::string name = fmt::format("spu_%s", s_spu_iname.decode(op.opcode)); + if (m_interp_magn) { - call(F, m_thread, m_interp_op); + call(name, F, m_thread, m_interp_op); return; } update_pc(); - call(&exec_fall, m_thread, m_ir->getInt32(op.opcode)); + call(name, &exec_fall, m_thread, m_ir->getInt32(op.opcode)); } static void exec_unk(spu_thread* _spu, u32 op) @@ -4068,13 +4858,14 @@ public: if (m_interp_magn) { m_ir->CreateStore(m_interp_pc, spu_ptr(&spu_thread::pc)); - call(&exec_unk, m_thread, m_ir->getInt32(op_unk.opcode)); + call("spu_unknown", &exec_unk, m_thread, m_ir->getInt32(op_unk.opcode)); return; } m_block->block_end = m_ir->GetInsertBlock(); update_pc(); - tail(&exec_unk, m_thread, m_ir->getInt32(op_unk.opcode)); + call("spu_unknown", &exec_unk, m_thread, m_ir->getInt32(op_unk.opcode)); + m_ir->CreateRetVoid(); } static bool exec_stop(spu_thread* _spu, u32 code) @@ -4086,7 +4877,7 @@ public: { if (m_interp_magn) { - const auto succ = call(&exec_stop, m_thread, m_ir->CreateAnd(m_interp_op, m_ir->getInt32(0x3fff))); + const auto succ = call("spu_syscall", &exec_stop, m_thread, m_ir->CreateAnd(m_interp_op, m_ir->getInt32(0x3fff))); const auto next = llvm::BasicBlock::Create(m_context, "", m_function); const auto stop = llvm::BasicBlock::Create(m_context, "", m_function); m_ir->CreateCondBr(succ, next, stop); @@ -4097,18 +4888,19 @@ public: } update_pc(); - const auto succ = call(&exec_stop, m_thread, m_ir->getInt32(op.opcode & 0x3fff)); + const auto succ = call("spu_syscall", &exec_stop, m_thread, m_ir->getInt32(op.opcode & 0x3fff)); const auto next = llvm::BasicBlock::Create(m_context, "", m_function); const auto stop = llvm::BasicBlock::Create(m_context, "", m_function); m_ir->CreateCondBr(succ, next, stop); m_ir->SetInsertPoint(stop); - m_ir->CreateRetVoid(); + m_ir->CreateStore(m_ir->getFalse(), m_fake_global1, true); + m_ir->CreateBr(next); m_ir->SetInsertPoint(next); if (g_cfg.core.spu_block_size == spu_block_size_type::safe) { m_block->block_end = m_ir->GetInsertBlock(); - m_ir->CreateStore(m_ir->getInt32(m_pos + 4), spu_ptr(&spu_thread::pc)); + update_pc(m_pos + 4); m_ir->CreateRetVoid(); } else @@ -4121,7 +4913,7 @@ public: { if (m_interp_magn) { - const auto succ = call(&exec_stop, m_thread, m_ir->getInt32(0x3fff)); + const auto succ = call("spu_syscall", &exec_stop, m_thread, m_ir->getInt32(0x3fff)); const auto next = llvm::BasicBlock::Create(m_context, "", m_function); const auto stop 
= llvm::BasicBlock::Create(m_context, "", m_function); m_ir->CreateCondBr(succ, next, stop); @@ -4180,8 +4972,8 @@ public: } else { - const auto val = m_ir->CreateLoad(ptr); - m_ir->CreateStore(m_ir->getInt64(0), ptr); + const auto val = m_ir->CreateLoad(ptr, true); + m_ir->CreateStore(m_ir->getInt64(0), ptr, true); val0 = val; } @@ -4191,14 +4983,16 @@ public: const auto stop = llvm::BasicBlock::Create(m_context, "", m_function); m_ir->CreateCondBr(m_ir->CreateICmpSLT(val0, m_ir->getInt64(0)), done, wait); m_ir->SetInsertPoint(wait); - const auto val1 = call(&exec_rdch, m_thread, m_ir->getInt32(op.ra)); + const auto val1 = call("spu_read_channel", &exec_rdch, m_thread, m_ir->getInt32(op.ra)); m_ir->CreateCondBr(m_ir->CreateICmpSLT(val1, m_ir->getInt64(0)), stop, done); m_ir->SetInsertPoint(stop); - m_ir->CreateRetVoid(); + m_ir->CreateStore(m_ir->getFalse(), m_fake_global1, true); + m_ir->CreateBr(done); m_ir->SetInsertPoint(done); const auto rval = m_ir->CreatePHI(get_type(), 2); rval->addIncoming(val0, _cur); rval->addIncoming(val1, wait); + rval->addIncoming(m_ir->getInt64(0), stop); return m_ir->CreateTrunc(rval, get_type()); } @@ -4208,7 +5002,7 @@ public: if (m_interp_magn) { - res.value = call(&exec_rdch, m_thread, get_imm(op.ra).value); + res.value = call("spu_read_channel", &exec_rdch, m_thread, get_imm(op.ra).value); const auto next = llvm::BasicBlock::Create(m_context, "", m_function); const auto stop = llvm::BasicBlock::Create(m_context, "", m_function); m_ir->CreateCondBr(m_ir->CreateICmpSLT(res.value, m_ir->getInt64(0)), stop, next); @@ -4230,12 +5024,13 @@ public: case SPU_RdInMbox: { update_pc(); - res.value = call(&exec_read_in_mbox, m_thread); + res.value = call("spu_read_in_mbox", &exec_read_in_mbox, m_thread); const auto next = llvm::BasicBlock::Create(m_context, "", m_function); const auto stop = llvm::BasicBlock::Create(m_context, "", m_function); m_ir->CreateCondBr(m_ir->CreateICmpSLT(res.value, m_ir->getInt64(0)), stop, next); m_ir->SetInsertPoint(stop); - m_ir->CreateRetVoid(); + m_ir->CreateStore(m_ir->getFalse(), m_fake_global1, true); + m_ir->CreateBr(next); m_ir->SetInsertPoint(next); res.value = m_ir->CreateTrunc(res.value, get_type()); break; @@ -4272,7 +5067,7 @@ public: } case SPU_RdDec: { - res.value = call(&exec_read_dec, m_thread); + res.value = call("spu_read_decrementer", &exec_read_dec, m_thread); break; } case SPU_RdEventMask: @@ -4283,12 +5078,13 @@ public: case SPU_RdEventStat: { update_pc(); - res.value = call(&exec_read_events, m_thread); + res.value = call("spu_read_events", &exec_read_events, m_thread); const auto next = llvm::BasicBlock::Create(m_context, "", m_function); const auto stop = llvm::BasicBlock::Create(m_context, "", m_function); m_ir->CreateCondBr(m_ir->CreateICmpSLT(res.value, m_ir->getInt64(0)), stop, next); m_ir->SetInsertPoint(stop); - m_ir->CreateRetVoid(); + m_ir->CreateStore(m_ir->getFalse(), m_fake_global1, true); + m_ir->CreateBr(next); m_ir->SetInsertPoint(next); res.value = m_ir->CreateTrunc(res.value, get_type()); break; @@ -4302,12 +5098,13 @@ public: default: { update_pc(); - res.value = call(&exec_rdch, m_thread, m_ir->getInt32(op.ra)); + res.value = call("spu_read_channel", &exec_rdch, m_thread, m_ir->getInt32(op.ra)); const auto next = llvm::BasicBlock::Create(m_context, "", m_function); const auto stop = llvm::BasicBlock::Create(m_context, "", m_function); m_ir->CreateCondBr(m_ir->CreateICmpSLT(res.value, m_ir->getInt64(0)), stop, next); m_ir->SetInsertPoint(stop); - m_ir->CreateRetVoid(); + 
m_ir->CreateStore(m_ir->getFalse(), m_fake_global1, true); + m_ir->CreateBr(next); m_ir->SetInsertPoint(next); res.value = m_ir->CreateTrunc(res.value, get_type()); break; @@ -4340,7 +5137,7 @@ public: if (m_interp_magn) { - res.value = call(&exec_rchcnt, m_thread, get_imm(op.ra).value); + res.value = call("spu_read_channel_count", &exec_rchcnt, m_thread, get_imm(op.ra).value); set_vr(op.rt, insert(splat(0), 3, res)); return; } @@ -4404,7 +5201,7 @@ public: } case SPU_RdEventStat: { - res.value = call(&exec_get_events, m_thread); + res.value = call("spu_get_events", &exec_get_events, m_thread); res.value = m_ir->CreateICmpNE(res.value, m_ir->getInt32(0)); res.value = m_ir->CreateZExt(res.value, get_type()); break; @@ -4412,7 +5209,7 @@ public: default: { - res.value = call(&exec_rchcnt, m_thread, m_ir->getInt32(op.ra)); + res.value = call("spu_read_channel_count", &exec_rchcnt, m_thread, m_ir->getInt32(op.ra)); break; } } @@ -4454,7 +5251,7 @@ public: if (m_interp_magn) { - const auto succ = call(&exec_wrch, m_thread, get_imm(op.ra).value, val.value); + const auto succ = call("spu_write_channel", &exec_wrch, m_thread, get_imm(op.ra).value, val.value); const auto next = llvm::BasicBlock::Create(m_context, "", m_function); const auto stop = llvm::BasicBlock::Create(m_context, "", m_function); m_ir->CreateCondBr(succ, next, stop); @@ -4612,7 +5409,7 @@ public: m_ir->CreateUnreachable(); m_ir->SetInsertPoint(next); m_ir->CreateStore(ci, spu_ptr(&spu_thread::ch_mfc_cmd, &spu_mfc_cmd::cmd)); - call(&exec_mfc_cmd, m_thread); + call("spu_exec_mfc_cmd", &exec_mfc_cmd, m_thread); return; } case MFC_SNDSIG_CMD: @@ -4665,7 +5462,7 @@ public: m_ir->CreateCondBr(m_ir->CreateICmpUGE(eal.value, m_ir->getInt32(0xe0000000)), mmio, copy, m_md_unlikely); m_ir->SetInsertPoint(mmio); m_ir->CreateStore(ci, spu_ptr(&spu_thread::ch_mfc_cmd, &spu_mfc_cmd::cmd)); - call(&exec_mfc_cmd, m_thread); + call("spu_exec_mfc_cmd", &exec_mfc_cmd, m_thread); m_ir->CreateBr(next); m_ir->SetInsertPoint(copy); @@ -4842,14 +5639,14 @@ public: const auto _mfc = llvm::BasicBlock::Create(m_context, "", m_function); m_ir->CreateCondBr(m_ir->CreateICmpNE(_old, _new), _mfc, next); m_ir->SetInsertPoint(_mfc); - call(&exec_list_unstall, m_thread, eval(val & 0x1f).value); + call("spu_list_unstall", &exec_list_unstall, m_thread, eval(val & 0x1f).value); m_ir->CreateBr(next); m_ir->SetInsertPoint(next); return; } case SPU_WrDec: { - m_ir->CreateStore(call(&get_timebased_time), spu_ptr(&spu_thread::ch_dec_start_timestamp)); + m_ir->CreateStore(call("get_timebased_time", &get_timebased_time), spu_ptr(&spu_thread::ch_dec_start_timestamp)); m_ir->CreateStore(val.value, spu_ptr(&spu_thread::ch_dec_value)); return; } @@ -4870,12 +5667,13 @@ public: } update_pc(); - const auto succ = call(&exec_wrch, m_thread, m_ir->getInt32(op.ra), val.value); + const auto succ = call("spu_write_channel", &exec_wrch, m_thread, m_ir->getInt32(op.ra), val.value); const auto next = llvm::BasicBlock::Create(m_context, "", m_function); const auto stop = llvm::BasicBlock::Create(m_context, "", m_function); m_ir->CreateCondBr(succ, next, stop); m_ir->SetInsertPoint(stop); - m_ir->CreateRetVoid(); + m_ir->CreateStore(m_ir->getFalse(), m_fake_global1, true); + m_ir->CreateBr(next); m_ir->SetInsertPoint(next); } @@ -4895,7 +5693,7 @@ public: if (g_cfg.core.spu_block_size == spu_block_size_type::safe && !m_interp_magn) { m_block->block_end = m_ir->GetInsertBlock(); - m_ir->CreateStore(m_ir->getInt32(m_pos + 4), spu_ptr(&spu_thread::pc)); + update_pc(m_pos + 4); 
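// ---------------------------------------------------------------------------
// [Editor's note] Illustrative sketch, not part of this patch. The helper
// calls above now carry stable symbol names ("spu_read_channel",
// "spu_write_channel", ...) instead of absolute host addresses baked into the
// IR as IntToPtr constants; named declarations can be relocated and resolved
// by the JIT at link time, which keeps the emitted module cacheable and
// position-independent. Hypothetical shape of such a name-based resolver:
#include <cstdint>
#include <map>
#include <string>

struct symbol_resolver
{
	std::map<std::string, std::uint64_t> symbols;

	void provide(std::string name, void* host_fn)
	{
		symbols[std::move(name)] = reinterpret_cast<std::uint64_t>(host_fn);
	}

	std::uint64_t resolve(const std::string& name) const
	{
		const auto it = symbols.find(name);
		return it == symbols.end() ? 0 : it->second;
	}
};
// ---------------------------------------------------------------------------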
 			m_ir->CreateRetVoid();
 		}
 	}
@@ -5196,24 +5994,52 @@ public:
 
 	void CBX(spu_opcode_t op)
 	{
+		if (m_finfo && m_finfo->fn && op.ra == s_reg_sp)
+		{
+			// Optimization with aligned stack assumption. Strange because SPU code could use CBD instead, but encountered in the wild.
+			set_vr(op.rt, spu_get_insertion_shuffle_mask(~get_scalar(get_vr(op.rb)) & 0xf));
+			return;
+		}
+
 		const auto s = get_scalar(get_vr(op.ra)) + get_scalar(get_vr(op.rb));
 		set_vr(op.rt, spu_get_insertion_shuffle_mask(~s & 0xf));
 	}
 
 	void CHX(spu_opcode_t op)
 	{
+		if (m_finfo && m_finfo->fn && op.ra == s_reg_sp)
+		{
+			// See CBX.
+			set_vr(op.rt, spu_get_insertion_shuffle_mask(~get_scalar(get_vr(op.rb)) >> 1 & 0x7));
+			return;
+		}
+
 		const auto s = get_scalar(get_vr(op.ra)) + get_scalar(get_vr(op.rb));
 		set_vr(op.rt, spu_get_insertion_shuffle_mask(~s >> 1 & 0x7));
 	}
 
 	void CWX(spu_opcode_t op)
 	{
+		if (m_finfo && m_finfo->fn && op.ra == s_reg_sp)
+		{
+			// See CBX.
+			set_vr(op.rt, spu_get_insertion_shuffle_mask(~get_scalar(get_vr(op.rb)) >> 2 & 0x3));
+			return;
+		}
+
 		const auto s = get_scalar(get_vr(op.ra)) + get_scalar(get_vr(op.rb));
 		set_vr(op.rt, spu_get_insertion_shuffle_mask(~s >> 2 & 0x3));
 	}
 
 	void CDX(spu_opcode_t op)
 	{
+		if (m_finfo && m_finfo->fn && op.ra == s_reg_sp)
+		{
+			// See CBX.
+			set_vr(op.rt, spu_get_insertion_shuffle_mask(~get_scalar(get_vr(op.rb)) >> 3 & 0x1));
+			return;
+		}
+
 		const auto s = get_scalar(get_vr(op.ra)) + get_scalar(get_vr(op.rb));
 		set_vr(op.rt, spu_get_insertion_shuffle_mask(~s >> 3 & 0x1));
 	}
@@ -5276,24 +6102,52 @@ public:
 
 	void CBD(spu_opcode_t op)
 	{
+		if (m_finfo && m_finfo->fn && op.ra == s_reg_sp)
+		{
+			// Known constant with aligned stack assumption (optimization).
+			set_vr(op.rt, spu_get_insertion_shuffle_mask(~get_imm(op.i7) & 0xf));
+			return;
+		}
+
 		const auto a = get_scalar(get_vr(op.ra)) + get_imm(op.i7);
 		set_vr(op.rt, spu_get_insertion_shuffle_mask(~a & 0xf));
 	}
 
 	void CHD(spu_opcode_t op)
 	{
+		if (m_finfo && m_finfo->fn && op.ra == s_reg_sp)
+		{
+			// See CBD.
+			set_vr(op.rt, spu_get_insertion_shuffle_mask(~get_imm(op.i7) >> 1 & 0x7));
+			return;
+		}
+
 		const auto a = get_scalar(get_vr(op.ra)) + get_imm(op.i7);
 		set_vr(op.rt, spu_get_insertion_shuffle_mask(~a >> 1 & 0x7));
 	}
 
 	void CWD(spu_opcode_t op)
 	{
+		if (m_finfo && m_finfo->fn && op.ra == s_reg_sp)
+		{
+			// See CBD.
+			set_vr(op.rt, spu_get_insertion_shuffle_mask(~get_imm(op.i7) >> 2 & 0x3));
+			return;
+		}
+
 		const auto a = get_scalar(get_vr(op.ra)) + get_imm(op.i7);
 		set_vr(op.rt, spu_get_insertion_shuffle_mask(~a >> 2 & 0x3));
 	}
 
 	void CDD(spu_opcode_t op)
 	{
+		if (m_finfo && m_finfo->fn && op.ra == s_reg_sp)
+		{
+			// See CBD.
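// ---------------------------------------------------------------------------
// [Editor's note] Illustrative sketch, not part of this patch. The CBX/CBD
// family optimizations above work because a 16-byte-aligned $SP contributes
// nothing to the low four bits of the effective address: (sp + x) & 0xf is
// x & 0xf, so the insertion shuffle mask depends only on the other operand.
// A tiny check of that identity:
#include <cassert>
#include <cstdint>

int main()
{
	for (std::uint32_t sp = 0; sp < 0x1000; sp += 16) // aligned stack values
	{
		for (std::uint32_t x = 0; x < 32; x++)
		{
			assert(((sp + x) & 0xf) == (x & 0xf));
		}
	}
}
// ---------------------------------------------------------------------------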
+ set_vr(op.rt, spu_get_insertion_shuffle_mask(~get_imm(op.i7) >> 3 & 0x1)); + return; + } + const auto a = get_scalar(get_vr(op.ra)) + get_imm(op.i7); set_vr(op.rt, spu_get_insertion_shuffle_mask(~a >> 3 & 0x1)); } @@ -5460,7 +6314,7 @@ public: { const auto [a, b] = get_vrs(op.ra, op.rb); const auto c = get_vr(op.rt) << 31; - set_vr(op.rt, zext(a <= b & ~(a == b & c >= 0))); + set_vr(op.rt, zext((a <= b) & ~((a == b) & (c >= 0)))); } void MPYHHA(spu_opcode_t op) @@ -5661,75 +6515,52 @@ public: void SELB(spu_opcode_t op) { - if (auto ei = llvm::dyn_cast_or_null(get_reg_raw(op.rc))) + if (match_vr(op.rc, [&](auto c, auto MP) { - // Detect if the mask comes from a comparison instruction - if (ei->getOpcode() == llvm::Instruction::SExt && ei->getSrcTy()->isIntOrIntVectorTy(1)) + using VT = typename decltype(MP)::type; + + // If the control mask comes from a comparison instruction, replace SELB with select + if (auto [ok, x] = match_expr(c, sext(match]>())); ok) { - auto op0 = ei->getOperand(0); - auto typ = ei->getDestTy(); - auto op1 = get_reg_raw(op.rb); - auto op2 = get_reg_raw(op.ra); - - if (typ == get_type()) + if constexpr (std::extent_v == 2) // u64[2] { - if (op1 && op1->getType() == get_type() || op2 && op2->getType() == get_type()) + // Try to select floats as floats if a OR b is typed as f64[2] + if (auto [a, b] = match_vrs(op.ra, op.rb); a || b) { - op1 = get_vr(op.rb).value; - op2 = get_vr(op.ra).value; + set_vr(op.rt4, select(x, get_vr(op.rb), get_vr(op.ra))); + return true; } - else - { - op1 = get_vr(op.rb).value; - op2 = get_vr(op.ra).value; - } - } - else if (typ == get_type()) - { - if (op1 && op1->getType() == get_type() || op2 && op2->getType() == get_type()) - { - op1 = get_vr(op.rb).value; - op2 = get_vr(op.ra).value; - } - else if (op1 && op1->getType() == get_type() || op2 && op2->getType() == get_type()) - { - op1 = get_vr(op.rb).value; - op2 = get_vr(op.ra).value; - } - else - { - op1 = get_vr(op.rb).value; - op2 = get_vr(op.ra).value; - } - } - else if (typ == get_type()) - { - op1 = get_vr(op.rb).value; - op2 = get_vr(op.ra).value; - } - else if (typ == get_type()) - { - op1 = get_vr(op.rb).value; - op2 = get_vr(op.ra).value; - } - else - { - LOG_ERROR(SPU, "[0x%x] SELB: unknown cast destination type", m_pos); - op0 = nullptr; } - if (op0 && op1 && op2) + if constexpr (std::extent_v == 4) // u32[4] { - set_reg_fixed(op.rt4, m_ir->CreateSelect(op0, op1, op2)); - return; + if (auto [a, b] = match_vrs(op.ra, op.rb); a || b) + { + set_vr(op.rt4, select(x, get_vr(op.rb), get_vr(op.ra))); + return true; + } + + if (auto [a, b] = match_vrs(op.ra, op.rb); a || b) + { + set_vr(op.rt4, select(x, get_vr(op.rb), get_vr(op.ra))); + return true; + } } + + set_vr(op.rt4, select(x, get_vr(op.rb), get_vr(op.ra))); + return true; } + + return false; + })) + { + return; } const auto op1 = get_reg_raw(op.rb); const auto op2 = get_reg_raw(op.ra); - if (op1 && op1->getType() == get_type() || op2 && op2->getType() == get_type()) + if ((op1 && op1->getType() == get_type()) || (op2 && op2->getType() == get_type())) { // Optimization: keep xfloat values in doubles even if the mask is unpredictable (hard way) const auto c = get_vr(op.rc); @@ -5755,7 +6586,7 @@ public: // If the mask comes from a constant generation instruction, replace SHUFB with insert if (auto [ok, i] = match_expr(c, spu_get_insertion_shuffle_mask(match())); ok) { - set_vr(op.rt4, insert(get_vr_as(c, op.rb), i, get_scalar(get_vr_as(c, op.ra)))); + set_vr(op.rt4, insert(get_vr(op.rb), i, get_scalar(get_vr(op.ra)))); 
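// ---------------------------------------------------------------------------
// [Editor's note] Illustrative sketch, not part of this patch. The SELB
// rewrite above is justified by a bitwise identity: SELB computes
// (c & b) | (~c & a), and when c is the sign-extended result of a comparison
// (all ones or all zeros per lane) that is exactly select(cmp, b, a). Scalar
// demonstration of the identity:
#include <cassert>
#include <cstdint>

int main()
{
	const std::uint32_t a = 0x11111111, b = 0x22222222;

	for (bool cmp : {false, true})
	{
		const std::uint32_t c = cmp ? ~0u : 0u; // sext of an i1 lane
		assert(((c & b) | (~c & a)) == (cmp ? b : a));
	}
}
// ---------------------------------------------------------------------------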
return true; } @@ -6428,7 +7259,7 @@ public: void STQR(spu_opcode_t op) // { value_t addr; - addr.value = m_interp_magn ? m_ir->CreateZExt(m_interp_pc, get_type()) : m_ir->getInt64(m_pos); + addr.value = m_ir->CreateZExt(m_interp_magn ? m_interp_pc : get_pc(m_pos), get_type()); addr = eval(((get_imm(op.i16, false) << 2) + addr) & 0x3fff0); make_store_ls(addr, get_vr(op.rt)); } @@ -6436,13 +7267,24 @@ public: void LQR(spu_opcode_t op) // { value_t addr; - addr.value = m_interp_magn ? m_ir->CreateZExt(m_interp_pc, get_type()) : m_ir->getInt64(m_pos); + addr.value = m_ir->CreateZExt(m_interp_magn ? m_interp_pc : get_pc(m_pos), get_type()); addr = eval(((get_imm(op.i16, false) << 2) + addr) & 0x3fff0); set_vr(op.rt, make_load_ls(addr)); } void STQD(spu_opcode_t op) { + if (m_finfo && m_finfo->fn) + { + if (op.rt == s_reg_lr || (op.rt >= s_reg_80 && op.rt <= s_reg_127)) + { + if (m_block->bb->reg_save_dom[op.rt] && get_reg_raw(op.rt) == m_finfo->load[op.rt]) + { + return; + } + } + } + value_t addr = eval(zext((extract(get_vr(op.ra), 3) + (get_imm(op.si10) << 4)) & 0x3fff0)); make_store_ls(addr, get_vr(op.rt)); } @@ -6560,7 +7402,7 @@ public: m_ir->SetInsertPoint(result); m_ir->CreateCondBr(get_imm(op.e).value, e_exec, d_test, m_md_unlikely); m_ir->SetInsertPoint(e_exec); - const auto e_addr = call(&exec_check_interrupts, m_thread, addr.value); + const auto e_addr = call("spu_check_interrupts", &exec_check_interrupts, m_thread, addr.value); m_ir->CreateBr(d_test); m_ir->SetInsertPoint(d_test); const auto target = m_ir->CreatePHI(get_type(), 2); @@ -6578,7 +7420,7 @@ public: } // Convert an indirect branch into a static one if possible - if (const auto _int = llvm::dyn_cast(addr.value)) + if (const auto _int = llvm::dyn_cast(addr.value); _int && op.opcode) { const u32 target = ::narrow(_int->getZExtValue(), HERE); @@ -6601,17 +7443,34 @@ public: // Fixed branch excludes the possibility it's a function return (TODO) ret = false; } - else if (llvm::isa(addr.value)) + else if (llvm::isa(addr.value) && op.opcode) { LOG_ERROR(SPU, "[0x%x] Unexpected constant (add_block_indirect)", m_pos); } + if (m_finfo && m_finfo->fn && op.opcode) + { + const auto cblock = m_ir->GetInsertBlock(); + const auto result = llvm::BasicBlock::Create(m_context, "", m_function); + m_ir->SetInsertPoint(result); + ret_function(); + m_ir->SetInsertPoint(cblock); + return result; + } + // Load stack addr if necessary value_t sp; if (ret && g_cfg.core.spu_block_size != spu_block_size_type::safe) { - sp = eval(extract(get_reg_fixed(1), 3) & 0x3fff0); + if (op.opcode) + { + sp = eval(extract(get_reg_fixed(1), 3) & 0x3fff0); + } + else + { + sp.value = m_ir->CreateLoad(spu_ptr(&spu_thread::gpr, 1, &v128::_u32, 3)); + } } const auto cblock = m_ir->GetInsertBlock(); @@ -6620,7 +7479,7 @@ public: if (op.e) { - addr.value = call(&exec_check_interrupts, m_thread, addr.value); + addr.value = call("spu_check_interrupts", &exec_check_interrupts, m_thread, addr.value); } if (op.d) @@ -6629,9 +7488,7 @@ public: } m_ir->CreateStore(addr.value, spu_ptr(&spu_thread::pc)); - const auto type = llvm::FunctionType::get(get_type(), {get_type(), get_type(), get_type()}, false)->getPointerTo()->getPointerTo(); - const auto disp = m_ir->CreateIntToPtr(m_ir->getInt64((u64)spu_runtime::g_dispatcher), type); - const auto ad64 = m_ir->CreateZExt(addr.value, get_type()); + const auto type = m_finfo->chunk->getFunctionType()->getPointerTo()->getPointerTo(); if (ret && g_cfg.core.spu_block_size != spu_block_size_type::safe) { @@ -6642,25 +7499,30 @@ public: 
const auto link = m_ir->CreateLoad(m_ir->CreateBitCast(m_ir->CreateGEP(m_thread, stack1.value), get_type())); const auto fail = llvm::BasicBlock::Create(m_context, "", m_function); const auto done = llvm::BasicBlock::Create(m_context, "", m_function); - m_ir->CreateCondBr(m_ir->CreateICmpEQ(ad64, link), done, fail, m_md_likely); + m_ir->CreateCondBr(m_ir->CreateICmpEQ(addr.value, m_ir->CreateTrunc(link, get_type())), done, fail, m_md_likely); m_ir->SetInsertPoint(done); // Clear stack mirror and return by tail call to the provided return address m_ir->CreateStore(splat(-1).eval(m_ir), m_ir->CreateBitCast(m_ir->CreateGEP(m_thread, stack0.value), get_type())); - tail(_ret); + tail_chunk(_ret, m_ir->CreateTrunc(m_ir->CreateLShr(link, 32), get_type())); m_ir->SetInsertPoint(fail); } - llvm::Value* ptr = m_ir->CreateGEP(disp, m_ir->CreateLShr(ad64, 2, "", true)); - if (g_cfg.core.spu_block_size == spu_block_size_type::giga) { // Try to load chunk address from the function table - const auto use_ftable = m_ir->CreateICmpULT(ad64, m_ir->getInt64(m_size)); - ptr = m_ir->CreateSelect(use_ftable, m_ir->CreateGEP(m_function_table, {m_ir->getInt64(0), m_ir->CreateLShr(ad64, 2, "", true)}), ptr); + const auto fail = llvm::BasicBlock::Create(m_context, "", m_function); + const auto done = llvm::BasicBlock::Create(m_context, "", m_function); + m_ir->CreateCondBr(m_ir->CreateICmpULT(addr.value, m_ir->getInt32(m_size)), done, fail, m_md_likely); + m_ir->SetInsertPoint(done); + + const auto ad64 = m_ir->CreateZExt(addr.value, get_type()); + const auto pptr = m_ir->CreateGEP(m_function_table, {m_ir->getInt64(0), m_ir->CreateLShr(ad64, 2, "", true)}); + tail_chunk(m_ir->CreateLoad(pptr)); + m_ir->SetInsertPoint(fail); } - tail(m_ir->CreateLoad(ptr)); + m_ir->CreateRetVoid(); m_ir->SetInsertPoint(cblock); return result; } @@ -6732,10 +7594,11 @@ public: // Create jump table if necessary (TODO) const auto tfound = m_targets.find(m_pos); - if (!op.d && !op.e && tfound != m_targets.end() && tfound->second.size()) + if (!op.d && !op.e && tfound != m_targets.end() && tfound->second.size() > 1) { // Shift aligned address for switch - const auto sw_arg = m_ir->CreateLShr(addr.value, 2, "", true); + const auto addrfx = m_ir->CreateAdd(m_ir->CreateSub(addr.value, m_base_pc), m_ir->getInt32(m_base)); + const auto sw_arg = m_ir->CreateLShr(addrfx, 2, "", true); // Initialize jump table targets std::map targets; @@ -6754,6 +7617,14 @@ public: pair.second = add_block(pair.first); } + if (targets.empty()) + { + // Emergency exit + LOG_ERROR(SPU, "[0x%05x] No jump table targets at 0x%05x (%u)", m_entry, m_pos, tfound->second.size()); + m_ir->CreateBr(add_block_indirect(op, addr)); + return; + } + // Get jump table bounds (optimization) const u32 start = targets.begin()->first; const u32 end = targets.rbegin()->first + 4; @@ -6779,8 +7650,19 @@ public: // Exit function on unexpected target m_ir->SetInsertPoint(sw->getDefaultDest()); - m_ir->CreateStore(addr.value, spu_ptr(&spu_thread::pc)); - m_ir->CreateRetVoid(); + m_ir->CreateStore(addr.value, spu_ptr(&spu_thread::pc), true); + + if (m_finfo && m_finfo->fn) + { + // Can't afford external tail call in true functions + m_ir->CreateStore(m_ir->getInt32("BIJT"_u32), _ptr(m_memptr, 0xffdead20))->setVolatile(true); + m_ir->CreateStore(m_ir->getFalse(), m_fake_global1, true); + m_ir->CreateBr(sw->getDefaultDest()); + } + else + { + m_ir->CreateRetVoid(); + } } else { @@ -6810,10 +7692,9 @@ public: if (m_block) m_block->block_end = m_ir->GetInsertBlock(); const auto addr = 
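// ---------------------------------------------------------------------------
// [Editor's note] Illustrative sketch, not part of this patch. The link
// comparison above is the probe side of the stack-mirror return prediction
// (the matching store is emitted by set_link further down): each call stores a
// (chunk pointer, base_pc << 32 | return PC) pair into a shadow array indexed
// by $SP, and the return sequence tail-calls the saved chunk directly when the
// link register matches. Conceptual C++ model with hypothetical names:
#include <cstdint>

struct mirror_entry
{
	void* chunk;        // compiled chunk to return to
	std::uint64_t link; // base_pc << 32 | expected return PC
};

void* predict_return(mirror_entry* mirror, std::uint32_t sp, std::uint32_t lr, std::uint32_t& base_pc)
{
	mirror_entry& e = mirror[(sp & 0x3fff0) / 16];

	if (static_cast<std::uint32_t>(e.link) != lr)
	{
		return nullptr; // misprediction: fall back to the normal dispatcher
	}

	void* const target = e.chunk;
	e.chunk = nullptr; // invalidate the slot after use (the patch stores -1)
	base_pc = static_cast<std::uint32_t>(e.link >> 32);
	return target;
}
// ---------------------------------------------------------------------------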
eval(extract(get_vr(op.ra), 3) & 0x3fffc); set_link(op); - value_t res; - res.value = call(&exec_get_events, m_thread); + const auto res = call("spu_get_events", &exec_get_events, m_thread); const auto target = add_block_indirect(op, addr); - m_ir->CreateCondBr(m_ir->CreateICmpNE(res.value, m_ir->getInt32(0)), target, add_block_next()); + m_ir->CreateCondBr(m_ir->CreateICmpNE(res, m_ir->getInt32(0)), target, add_block_next()); } void BRZ(spu_opcode_t op) // @@ -6920,6 +7801,23 @@ public: void BRASL(spu_opcode_t op) // { set_link(op); + + const u32 target = spu_branch_target(0, op.i16); + + if (m_finfo && m_finfo->fn && target != m_pos + 4) + { + if (auto fn = add_function(target)->fn) + { + call_function(fn); + return; + } + else + { + LOG_FATAL(SPU, "[0x%x] Can't add function 0x%x", m_pos, target); + return; + } + } + BRA(op); } @@ -6946,6 +7844,23 @@ public: void BRSL(spu_opcode_t op) // { set_link(op); + + const u32 target = spu_branch_target(m_pos, op.i16); + + if (m_finfo && m_finfo->fn && target != m_pos + 4) + { + if (auto fn = add_function(target)->fn) + { + call_function(fn); + return; + } + else + { + LOG_FATAL(SPU, "[0x%x] Can't add function 0x%x", m_pos, target); + return; + } + } + BR(op); } @@ -6959,16 +7874,22 @@ public: return; } - set_vr(op.rt, build(0, 0, 0, spu_branch_target(m_pos + 4))); + set_vr(op.rt, insert(splat(0), 3, value(get_pc(m_pos + 4)))); + + if (m_finfo && m_finfo->fn) + { + return; + } if (g_cfg.core.spu_block_size != spu_block_size_type::safe && m_block_info[m_pos / 4 + 1] && m_entry_info[m_pos / 4 + 1]) { // Store the return function chunk address at the stack mirror - const auto func = add_function(m_pos + 4); + const auto pfunc = add_function(m_pos + 4); const auto stack0 = eval(zext(extract(get_reg_fixed(1), 3) & 0x3fff0) + ::offset32(&spu_thread::stack_mirror)); const auto stack1 = eval(stack0 + 8); - m_ir->CreateStore(func, m_ir->CreateBitCast(m_ir->CreateGEP(m_thread, stack0.value), func->getType()->getPointerTo())); - m_ir->CreateStore(m_ir->getInt64(m_pos + 4), m_ir->CreateBitCast(m_ir->CreateGEP(m_thread, stack1.value), get_type())); + const auto base_plus_pc = m_ir->CreateOr(m_ir->CreateShl(m_ir->CreateZExt(m_base_pc, get_type()), 32), m_ir->getInt64(m_pos + 4)); + m_ir->CreateStore(pfunc->chunk, m_ir->CreateBitCast(m_ir->CreateGEP(m_thread, stack0.value), pfunc->chunk->getType()->getPointerTo())); + m_ir->CreateStore(base_plus_pc, m_ir->CreateBitCast(m_ir->CreateGEP(m_thread, stack1.value), get_type())); } } diff --git a/rpcs3/Emu/Cell/SPURecompiler.h b/rpcs3/Emu/Cell/SPURecompiler.h index af5ad3c70f..0815b917f0 100644 --- a/rpcs3/Emu/Cell/SPURecompiler.h +++ b/rpcs3/Emu/Cell/SPURecompiler.h @@ -44,8 +44,14 @@ class spu_runtime atomic_t m_reset_count{0}; + struct func_compare + { + // Comparison function for SPU programs + bool operator()(const std::vector& lhs, const std::vector& rhs) const; + }; + // All functions - std::map, spu_function_t> m_map; + std::map, spu_function_t, func_compare> m_map; // Debug module output location std::string m_cache_path; @@ -57,8 +63,8 @@ class spu_runtime u16 from; u16 level; u8* rel32; - std::map, spu_function_t>::iterator beg; - std::map, spu_function_t>::iterator end; + decltype(m_map)::iterator beg; + decltype(m_map)::iterator end; }; // Scratch vector @@ -199,6 +205,17 @@ public: s_reg_max }; + // Classify terminator instructions + enum class term_type : unsigned char + { + br, + ret, + call, + fallthrough, + indirect_call, + interrupt_call, + }; + protected: std::shared_ptr m_spurt; @@ -239,12 +256,39 
@@ protected:
 		// Internal use flag
 		bool analysed = false;
 
+		// Terminator instruction type
+		term_type terminator;
+
 		// Bit mask of the registers modified in the block
 		std::bitset<s_reg_max> reg_mod{};
 
+		// Set if the last modifying instruction produces xfloat
+		std::bitset<s_reg_max> reg_mod_xf{};
+
+		// Set if the initial register value in this block may be xfloat
+		std::bitset<s_reg_max> reg_maybe_xf{};
+
 		// Bit mask of the registers used (before modified)
 		std::bitset<s_reg_max> reg_use{};
 
+		// Bit mask of the registers holding a trivial (u32 x 4) constant produced in this block
+		std::bitset<s_reg_max> reg_const{};
+
+		// Bit mask of the registers saved onto the stack before use
+		std::bitset<s_reg_max> reg_save_dom{};
+
+		// Address of the function
+		u32 func = 0x40000;
+
+		// Value subtracted from $SP in this block, negative if something funny is done on $SP
+		u32 stack_sub = 0;
+
+		// Constant values associated with reg_const
+		std::array<u32, s_reg_max> reg_val32;
+
+		// Registers loaded from the stack in this block (stack offset)
+		std::array<u32, s_reg_max> reg_load_mod{};
+
 		// Single source of the reg value (dominating block address within the same chunk) or a negative number
 		std::array<u32, s_reg_max> reg_origin, reg_origin_abs;
@@ -258,13 +302,27 @@ protected:
 	// Sorted basic block info
 	std::map<u32, block_info> m_bbs;
 
-	// Advanced block (chunk) information
-	struct chunk_info
+	// Sorted advanced block (chunk) list
+	std::basic_string<u32> m_chunks;
+
+	// Function information
+	struct func_info
 	{
+		// Size to the end of the last basic block
+		u16 size = 0;
+
+		// Determines whether a function is eligible for optimizations
+		bool good = false;
+
+		// Call targets
+		std::basic_string<u32> calls;
+
+		// Register save info (stack offset)
+		std::array<u32, s_reg_max> reg_save_off{};
 	};
 
-	// Sorted chunk info
-	std::map<u32, chunk_info> m_chunks;
+	// Sorted function info
+	std::map<u32, func_info> m_funcs;
 
 	std::shared_ptr<spu_cache> m_cache;
 
@@ -272,6 +330,9 @@ private:
 	// For private use
 	std::bitset<0x10000> m_bits;
 
+	// For private use
+	std::vector<u32> workload;
+
 	// Result of analyse(), to avoid copying and allocation
 	std::vector<u32> result;
diff --git a/rpcs3/Emu/Cell/SPUThread.h b/rpcs3/Emu/Cell/SPUThread.h
index 55181a622d..8cdce4e74e 100644
--- a/rpcs3/Emu/Cell/SPUThread.h
+++ b/rpcs3/Emu/Cell/SPUThread.h
@@ -579,6 +579,10 @@ public:
 	u64 block_recover = 0;
 	u64 block_failure = 0;
 
+	u64 saved_native_sp = 0; // Host thread's stack pointer for emulated longjmp
+
+	u8* memory_base_addr = vm::g_base_addr;
+
 	std::array<v128, 0x4000> stack_mirror; // Return address information
 
 	void push_snr(u32 number, u32 value);
diff --git a/rpcs3/Emu/Cell/lv2/sys_spu.cpp b/rpcs3/Emu/Cell/lv2/sys_spu.cpp
index 06b42e871c..3b855de455 100644
--- a/rpcs3/Emu/Cell/lv2/sys_spu.cpp
+++ b/rpcs3/Emu/Cell/lv2/sys_spu.cpp
@@ -232,7 +232,7 @@ error_code sys_spu_thread_initialize(vm::ptr<u32> thread, u32 group_id, u32 spu_
 		sys_spu.todo("Unimplemented SPU Thread options (0x%x)", option);
 	}
 
-	const vm::addr_t ls_addr{verify("SPU LS" HERE, vm::alloc(0x40000, vm::main))};
+	const vm::addr_t ls_addr{verify("SPU LS" HERE, vm::alloc(0x80000, vm::main))};
 
 	const u32 tid = idm::import<named_thread<spu_thread>>([&]()
 	{
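// ---------------------------------------------------------------------------
// [Editor's note] Illustrative sketch, not part of this patch: a toy model of
// the stack-frame layout analysis the new block_info/func_info fields support.
// Tracking how much a block subtracts from $SP and which registers it saves
// before first use is what enables the STQD save-elision and the "good"
// (native-call-eligible) function classification above. All names and the
// exact bookkeeping here are the editor's simplification:
#include <array>
#include <cstdint>

struct toy_frame_info
{
	// Bytes this block subtracts from $SP (e.g. from 'ai $SP, $SP, -N');
	// a non-constant adjustment would disqualify the function
	std::uint32_t stack_sub = 0;

	// For each register: stack offset of a dominating save, if any
	std::array<std::uint32_t, 128> save_off{};
};

// 'stqd $reg, off($SP)' before any other use of $reg records a dominating save
void record_save(toy_frame_info& f, unsigned reg, std::uint32_t off, bool first_use)
{
	if (first_use && reg < f.save_off.size())
	{
		f.save_off[reg] = off;
	}
}
// ---------------------------------------------------------------------------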