diff --git a/Utilities/JITASM.cpp b/Utilities/JITASM.cpp index 94d1ca5fe9..2eef415ca9 100644 --- a/Utilities/JITASM.cpp +++ b/Utilities/JITASM.cpp @@ -4,14 +4,10 @@ #include "StrFmt.h" #include "File.h" #include "util/logs.hpp" -#include "mutex.h" #include "util/vm.hpp" #include "util/asm.hpp" #include "util/v128.hpp" #include "util/simd.hpp" -#include "Crypto/unzip.h" - -#include #ifdef __linux__ #define CAN_OVERCOMMIT @@ -845,648 +841,3 @@ void asmjit::simd_builder::vec_extract_gpr(u32 esize, const x86::Gp& dst, const } #endif /* X86 */ - -#ifdef LLVM_AVAILABLE - -#include -#include -#include - -#ifdef _MSC_VER -#pragma warning(push, 0) -#else -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wall" -#pragma GCC diagnostic ignored "-Wextra" -#pragma GCC diagnostic ignored "-Wold-style-cast" -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wredundant-decls" -#pragma GCC diagnostic ignored "-Weffc++" -#pragma GCC diagnostic ignored "-Wmissing-noreturn" -#endif -#include "llvm/Support/TargetSelect.h" -#include "llvm/Support/FormattedStream.h" -#include "llvm/TargetParser/Host.h" -#include "llvm/ExecutionEngine/ExecutionEngine.h" -#include "llvm/ExecutionEngine/RTDyldMemoryManager.h" -#include "llvm/ExecutionEngine/ObjectCache.h" -#include "llvm/ExecutionEngine/JITEventListener.h" -#include "llvm/Object/ObjectFile.h" -#include "llvm/Object/SymbolSize.h" -#ifdef _MSC_VER -#pragma warning(pop) -#else -#pragma GCC diagnostic pop -#endif - -const bool jit_initialize = []() -> bool -{ - llvm::InitializeNativeTarget(); - llvm::InitializeNativeTargetAsmPrinter(); - llvm::InitializeNativeTargetAsmParser(); - LLVMLinkInMCJIT(); - return true; -}(); - -[[noreturn]] static void null(const char* name) -{ - fmt::throw_exception("Null function: %s", name); -} - -namespace vm -{ - extern u8* const g_sudo_addr; -} - -static shared_mutex null_mtx; - -static std::unordered_map null_funcs; - -static u64 make_null_function(const std::string& name) -{ - if (name.starts_with("__0x")) - { - u32 addr = -1; - auto res = std::from_chars(name.c_str() + 4, name.c_str() + name.size(), addr, 16); - - if (res.ec == std::errc() && res.ptr == name.c_str() + name.size() && addr < 0x8000'0000) - { - // Point the garbage to reserved, non-executable memory - return reinterpret_cast(vm::g_sudo_addr + addr); - } - } - - std::lock_guard lock(null_mtx); - - if (u64& func_ptr = null_funcs[name]) [[likely]] - { - // Already exists - return func_ptr; - } - else - { - using namespace asmjit; - - // Build a "null" function that contains its name - const auto func = build_function_asm("NULL", [&](native_asm& c, auto& args) - { -#if defined(ARCH_X64) - Label data = c.newLabel(); - c.lea(args[0], x86::qword_ptr(data, 0)); - c.jmp(Imm(&null)); - c.align(AlignMode::kCode, 16); - c.bind(data); - - // Copy function name bytes - for (char ch : name) - c.db(ch); - c.db(0); - c.align(AlignMode::kData, 16); -#else - // AArch64 implementation - Label jmp_address = c.newLabel(); - Label data = c.newLabel(); - // Force absolute jump to prevent out of bounds PC-rel jmp - c.ldr(args[0], arm::ptr(jmp_address)); - c.br(args[0]); - c.align(AlignMode::kCode, 16); - - c.bind(data); - c.embed(name.c_str(), name.size()); - c.embedUInt8(0U); - c.bind(jmp_address); - c.embedUInt64(reinterpret_cast(&null)); - c.align(AlignMode::kData, 16); -#endif - }); - - func_ptr = reinterpret_cast(func); - return func_ptr; - } -} - -struct JITAnnouncer : llvm::JITEventListener -{ - void notifyObjectLoaded(u64, const llvm::object::ObjectFile& obj, const llvm::RuntimeDyld::LoadedObjectInfo& info) override - { - using namespace llvm; - - object::OwningBinary debug_obj_ = info.getObjectForDebug(obj); - if (!debug_obj_.getBinary()) - { -#ifdef __linux__ - jit_log.error("LLVM: Failed to announce JIT events (no debug object)"); -#endif - return; - } - - const object::ObjectFile& debug_obj = *debug_obj_.getBinary(); - - for (const auto& [sym, size] : computeSymbolSizes(debug_obj)) - { - Expected type_ = sym.getType(); - if (!type_ || *type_ != object::SymbolRef::ST_Function) - continue; - - Expected name = sym.getName(); - if (!name) - continue; - - Expected addr = sym.getAddress(); - if (!addr) - continue; - - jit_announce(*addr, size, {name->data(), name->size()}); - } - } -}; - -// Simple memory manager -struct MemoryManager1 : llvm::RTDyldMemoryManager -{ - // 256 MiB for code or data - static constexpr u64 c_max_size = 0x20000000 / 2; - - // Allocation unit (2M) - static constexpr u64 c_page_size = 2 * 1024 * 1024; - - // Reserve 512 MiB - u8* const ptr = static_cast(utils::memory_reserve(c_max_size * 2)); - - u64 code_ptr = 0; - u64 data_ptr = c_max_size; - - MemoryManager1() = default; - - MemoryManager1(const MemoryManager1&) = delete; - - MemoryManager1& operator=(const MemoryManager1&) = delete; - - ~MemoryManager1() override - { - // Hack: don't release to prevent reuse of address space, see jit_announce - utils::memory_decommit(ptr, c_max_size * 2); - } - - llvm::JITSymbol findSymbol(const std::string& name) override - { - u64 addr = RTDyldMemoryManager::getSymbolAddress(name); - - if (!addr) - { - addr = make_null_function(name); - - if (!addr) - { - fmt::throw_exception("Failed to link '%s'", name); - } - } - - return {addr, llvm::JITSymbolFlags::Exported}; - } - - u8* allocate(u64& oldp, uptr size, uint align, utils::protection prot) - { - if (align > c_page_size) - { - jit_log.fatal("Unsupported alignment (size=0x%x, align=0x%x)", size, align); - return nullptr; - } - - const u64 olda = utils::align(oldp, align); - const u64 newp = utils::align(olda + size, align); - - if ((newp - 1) / c_max_size != oldp / c_max_size) - { - jit_log.fatal("Out of memory (size=0x%x, align=0x%x)", size, align); - return nullptr; - } - - if ((oldp - 1) / c_page_size != (newp - 1) / c_page_size) - { - // Allocate pages on demand - const u64 pagea = utils::align(oldp, c_page_size); - const u64 psize = utils::align(newp - pagea, c_page_size); - utils::memory_commit(this->ptr + pagea, psize, prot); - } - - // Update allocation counter - oldp = newp; - - return this->ptr + olda; - } - - u8* allocateCodeSection(uptr size, uint align, uint /*sec_id*/, llvm::StringRef /*sec_name*/) override - { - return allocate(code_ptr, size, align, utils::protection::wx); - } - - u8* allocateDataSection(uptr size, uint align, uint /*sec_id*/, llvm::StringRef /*sec_name*/, bool /*is_ro*/) override - { - return allocate(data_ptr, size, align, utils::protection::rw); - } - - bool finalizeMemory(std::string* = nullptr) override - { - return false; - } - - void registerEHFrames(u8*, u64, usz) override - { - } - - void deregisterEHFrames() override - { - } -}; - -// Simple memory manager -struct MemoryManager2 : llvm::RTDyldMemoryManager -{ - MemoryManager2() = default; - - ~MemoryManager2() override - { - } - - llvm::JITSymbol findSymbol(const std::string& name) override - { - u64 addr = RTDyldMemoryManager::getSymbolAddress(name); - - if (!addr) - { - addr = make_null_function(name); - - if (!addr) - { - fmt::throw_exception("Failed to link '%s' (MM2)", name); - } - } - - return {addr, llvm::JITSymbolFlags::Exported}; - } - - u8* allocateCodeSection(uptr size, uint align, uint /*sec_id*/, llvm::StringRef /*sec_name*/) override - { - return jit_runtime::alloc(size, align, true); - } - - u8* allocateDataSection(uptr size, uint align, uint /*sec_id*/, llvm::StringRef /*sec_name*/, bool /*is_ro*/) override - { - return jit_runtime::alloc(size, align, false); - } - - bool finalizeMemory(std::string* = nullptr) override - { - return false; - } - - void registerEHFrames(u8*, u64, usz) override - { - } - - void deregisterEHFrames() override - { - } -}; - -// Helper class -class ObjectCache final : public llvm::ObjectCache -{ - const std::string& m_path; - -public: - ObjectCache(const std::string& path) - : m_path(path) - { - } - - ~ObjectCache() override = default; - - void notifyObjectCompiled(const llvm::Module* _module, llvm::MemoryBufferRef obj) override - { - std::string name = m_path; - name.append(_module->getName().data()); - //fs::file(name, fs::rewrite).write(obj.getBufferStart(), obj.getBufferSize()); - name.append(".gz"); - - fs::file module_file(name, fs::rewrite); - - if (!module_file) - { - jit_log.error("LLVM: Failed to create module file: %s (%s)", name, fs::g_tls_error); - return; - } - - if (!zip(obj.getBufferStart(), obj.getBufferSize(), module_file)) - { - jit_log.error("LLVM: Failed to compress module: %s", _module->getName().data()); - module_file.close(); - fs::remove_file(name); - return; - } - - jit_log.notice("LLVM: Created module: %s", _module->getName().data()); - } - - static std::unique_ptr load(const std::string& path) - { - if (fs::file cached{path + ".gz", fs::read}) - { - const std::vector cached_data = cached.to_vector(); - - if (cached_data.empty()) [[unlikely]] - { - return nullptr; - } - - const std::vector out = unzip(cached_data); - - if (out.empty()) - { - jit_log.error("LLVM: Failed to unzip module: '%s'", path); - return nullptr; - } - - auto buf = llvm::WritableMemoryBuffer::getNewUninitMemBuffer(out.size()); - std::memcpy(buf->getBufferStart(), out.data(), out.size()); - return buf; - } - - if (fs::file cached{path, fs::read}) - { - if (cached.size() == 0) [[unlikely]] - { - return nullptr; - } - - auto buf = llvm::WritableMemoryBuffer::getNewUninitMemBuffer(cached.size()); - cached.read(buf->getBufferStart(), buf->getBufferSize()); - return buf; - } - - return nullptr; - } - - std::unique_ptr getObject(const llvm::Module* _module) override - { - std::string path = m_path; - path.append(_module->getName().data()); - - if (auto buf = load(path)) - { - jit_log.notice("LLVM: Loaded module: %s", _module->getName().data()); - return buf; - } - - return nullptr; - } -}; - -std::string jit_compiler::cpu(const std::string& _cpu) -{ - std::string m_cpu = _cpu; - - if (m_cpu.empty()) - { - m_cpu = llvm::sys::getHostCPUName().str(); - - if (m_cpu == "sandybridge" || - m_cpu == "ivybridge" || - m_cpu == "haswell" || - m_cpu == "broadwell" || - m_cpu == "skylake" || - m_cpu == "skylake-avx512" || - m_cpu == "cascadelake" || - m_cpu == "cooperlake" || - m_cpu == "cannonlake" || - m_cpu == "icelake" || - m_cpu == "icelake-client" || - m_cpu == "icelake-server" || - m_cpu == "tigerlake" || - m_cpu == "rocketlake" || - m_cpu == "alderlake" || - m_cpu == "raptorlake" || - m_cpu == "meteorlake") - { - // Downgrade if AVX is not supported by some chips - if (!utils::has_avx()) - { - m_cpu = "nehalem"; - } - } - - if (m_cpu == "skylake-avx512" || - m_cpu == "cascadelake" || - m_cpu == "cooperlake" || - m_cpu == "cannonlake" || - m_cpu == "icelake" || - m_cpu == "icelake-client" || - m_cpu == "icelake-server" || - m_cpu == "tigerlake" || - m_cpu == "rocketlake") - { - // Downgrade if AVX-512 is disabled or not supported - if (!utils::has_avx512()) - { - m_cpu = "skylake"; - } - } - - if (m_cpu == "znver1" && utils::has_clwb()) - { - // Upgrade - m_cpu = "znver2"; - } - - if ((m_cpu == "znver3" || m_cpu == "goldmont" || m_cpu == "alderlake" || m_cpu == "raptorlake" || m_cpu == "meteorlake") && utils::has_avx512_icl()) - { - // Upgrade - m_cpu = "icelake-client"; - } - - if (m_cpu == "goldmont" && utils::has_avx2()) - { - // Upgrade - m_cpu = "alderlake"; - } - } - - return m_cpu; -} - -std::string jit_compiler::triple1() -{ -#if defined(_WIN32) - return llvm::Triple::normalize(llvm::sys::getProcessTriple()); -#elif defined(__APPLE__) && defined(ARCH_X64) - return llvm::Triple::normalize("x86_64-unknown-linux-gnu"); -#elif defined(__APPLE__) && defined(ARCH_ARM64) - return llvm::Triple::normalize("aarch64-unknown-linux-gnu"); -#else - return llvm::Triple::normalize(llvm::sys::getProcessTriple()); -#endif -} - -std::string jit_compiler::triple2() -{ -#if defined(_WIN32) && defined(ARCH_X64) - return llvm::Triple::normalize("x86_64-unknown-linux-gnu"); -#elif defined(_WIN32) && defined(ARCH_ARM64) - return llvm::Triple::normalize("aarch64-unknown-linux-gnu"); -#elif defined(__APPLE__) && defined(ARCH_X64) - return llvm::Triple::normalize("x86_64-unknown-linux-gnu"); -#elif defined(__APPLE__) && defined(ARCH_ARM64) - return llvm::Triple::normalize("aarch64-unknown-linux-gnu"); -#else - return llvm::Triple::normalize(llvm::sys::getProcessTriple()); -#endif -} - -jit_compiler::jit_compiler(const std::unordered_map& _link, const std::string& _cpu, u32 flags) - : m_context(new llvm::LLVMContext) - , m_cpu(cpu(_cpu)) -{ - std::string result; - - auto null_mod = std::make_unique ("null_", *m_context); - null_mod->setTargetTriple(jit_compiler::triple1()); - - std::unique_ptr mem; - - if (_link.empty()) - { - // Auxiliary JIT (does not use custom memory manager, only writes the objects) - if (flags & 0x1) - { - mem = std::make_unique(); - } - else - { - mem = std::make_unique(); - null_mod->setTargetTriple(jit_compiler::triple2()); - } - } - else - { - mem = std::make_unique(); - } - - { - m_engine.reset(llvm::EngineBuilder(std::move(null_mod)) - .setErrorStr(&result) - .setEngineKind(llvm::EngineKind::JIT) - .setMCJITMemoryManager(std::move(mem)) - .setOptLevel(llvm::CodeGenOpt::Aggressive) - .setCodeModel(flags & 0x2 ? llvm::CodeModel::Large : llvm::CodeModel::Small) -#ifdef __APPLE__ - //.setCodeModel(llvm::CodeModel::Large) -#endif - .setRelocationModel(llvm::Reloc::Model::PIC_) - .setMCPU(m_cpu) - .create()); - } - - if (!_link.empty()) - { - for (auto&& [name, addr] : _link) - { - m_engine->updateGlobalMapping(name, addr); - } - } - - if (!_link.empty() || !(flags & 0x1)) - { - m_engine->RegisterJITEventListener(llvm::JITEventListener::createIntelJITEventListener()); - m_engine->RegisterJITEventListener(new JITAnnouncer); - } - - if (!m_engine) - { - fmt::throw_exception("LLVM: Failed to create ExecutionEngine: %s", result); - } -} - -jit_compiler::~jit_compiler() -{ -} - -void jit_compiler::add(std::unique_ptr _module, const std::string& path) -{ - ObjectCache cache{path}; - m_engine->setObjectCache(&cache); - - const auto ptr = _module.get(); - m_engine->addModule(std::move(_module)); - m_engine->generateCodeForModule(ptr); - m_engine->setObjectCache(nullptr); - - for (auto& func : ptr->functions()) - { - // Delete IR to lower memory consumption - func.deleteBody(); - } -} - -void jit_compiler::add(std::unique_ptr _module) -{ - const auto ptr = _module.get(); - m_engine->addModule(std::move(_module)); - m_engine->generateCodeForModule(ptr); - - for (auto& func : ptr->functions()) - { - // Delete IR to lower memory consumption - func.deleteBody(); - } -} - -void jit_compiler::add(const std::string& path) -{ - auto cache = ObjectCache::load(path); - - if (auto object_file = llvm::object::ObjectFile::createObjectFile(*cache)) - { - m_engine->addObjectFile(llvm::object::OwningBinary(std::move(*object_file), std::move(cache))); - } - else - { - jit_log.error("ObjectCache: Adding failed: %s", path); - } -} - -bool jit_compiler::check(const std::string& path) -{ - if (auto cache = ObjectCache::load(path)) - { - if (auto object_file = llvm::object::ObjectFile::createObjectFile(*cache)) - { - return true; - } - - if (fs::remove_file(path)) - { - jit_log.error("ObjectCache: Removed damaged file: %s", path); - } - } - - return false; -} - -void jit_compiler::update_global_mapping(const std::string& name, u64 addr) -{ - m_engine->updateGlobalMapping(name, addr); -} - -void jit_compiler::fin() -{ - m_engine->finalizeObject(); -} - -u64 jit_compiler::get(const std::string& name) -{ - return m_engine->getGlobalValueAddress(name); -} - -#endif // LLVM_AVAILABLE diff --git a/Utilities/JITLLVM.cpp b/Utilities/JITLLVM.cpp index 94d1ca5fe9..a14cf25a67 100644 --- a/Utilities/JITLLVM.cpp +++ b/Utilities/JITLLVM.cpp @@ -7,850 +7,15 @@ #include "mutex.h" #include "util/vm.hpp" #include "util/asm.hpp" -#include "util/v128.hpp" -#include "util/simd.hpp" #include "Crypto/unzip.h" #include -#ifdef __linux__ -#define CAN_OVERCOMMIT -#endif - LOG_CHANNEL(jit_log, "JIT"); -void jit_announce(uptr func, usz size, std::string_view name) -{ -#ifdef __linux__ - static const struct tmp_perf_map - { - std::string name{fmt::format("/tmp/perf-%d.map", getpid())}; - fs::file data{name, fs::rewrite + fs::append}; - - tmp_perf_map() = default; - tmp_perf_map(const tmp_perf_map&) = delete; - tmp_perf_map& operator=(const tmp_perf_map&) = delete; - - ~tmp_perf_map() - { - fs::remove_file(name); - } - } s_map; - - if (size && name.size()) - { - s_map.data.write(fmt::format("%x %x %s\n", func, size, name)); - } - - if (!func && !size && !name.size()) - { - fs::remove_file(s_map.name); - return; - } -#endif - - if (!size) - { - jit_log.error("Empty function announced: %s (%p)", name, func); - return; - } - - // If directory ASMJIT doesn't exist, nothing will be written - static constexpr u64 c_dump_size = 0x1'0000'0000; - static constexpr u64 c_index_size = c_dump_size / 16; - static atomic_t g_index_off = 0; - static atomic_t g_data_off = c_index_size; - - static void* g_asm = []() -> void* - { - fs::remove_all(fs::get_cache_dir() + "/ASMJIT/", false); - - fs::file objs(fmt::format("%s/ASMJIT/.objects", fs::get_cache_dir()), fs::read + fs::rewrite); - - if (!objs || !objs.trunc(c_dump_size)) - { - return nullptr; - } - - return utils::memory_map_fd(objs.get_handle(), c_dump_size, utils::protection::rw); - }(); - - if (g_asm && size < c_index_size) - { - struct entry - { - u64 addr; // RPCS3 process address - u32 size; // Function size - u32 off; // Function offset - }; - - // Write index entry at the beginning of file, and data + NTS name at fixed offset - const u64 index_off = g_index_off.fetch_add(1); - const u64 size_all = size + name.size() + 1; - const u64 data_off = g_data_off.fetch_add(size_all); - - // If either index or data area is exhausted, nothing will be written - if (index_off < c_index_size / sizeof(entry) && data_off + size_all < c_dump_size) - { - entry& index = static_cast(g_asm)[index_off]; - - std::memcpy(static_cast(g_asm) + data_off, reinterpret_cast(func), size); - std::memcpy(static_cast(g_asm) + data_off + size, name.data(), name.size()); - index.size = static_cast(size); - index.off = static_cast(data_off); - atomic_storage::store(index.addr, func); - } - } - - if (g_asm && !name.empty() && name[0] != '_') - { - // Save some objects separately - fs::file dump(fmt::format("%s/ASMJIT/%s", fs::get_cache_dir(), name), fs::rewrite); - - if (dump) - { - dump.write(reinterpret_cast(func), size); - } - } -} - -static u8* get_jit_memory() -{ - // Reserve 2G memory (magic static) - static void* const s_memory2 = []() -> void* - { - void* ptr = utils::memory_reserve(0x80000000); -#ifdef CAN_OVERCOMMIT - utils::memory_commit(ptr, 0x80000000); - utils::memory_protect(ptr, 0x40000000, utils::protection::wx); -#endif - return ptr; - }(); - - return static_cast(s_memory2); -} - -// Allocation counters (1G code, 1G data subranges) -static atomic_t s_code_pos{0}, s_data_pos{0}; - -// Snapshot of code generated before main() -static std::vector s_code_init, s_data_init; - -template & Ctr, uint Off, utils::protection Prot> -static u8* add_jit_memory(usz size, usz align) -{ - // Select subrange - u8* pointer = get_jit_memory() + Off; - - if (!size && !align) [[unlikely]] - { - // Return subrange info - return pointer; - } - - u64 olda, newa; - - // Simple allocation by incrementing pointer to the next free data - const u64 pos = Ctr.atomic_op([&](u64& ctr) -> u64 - { - const u64 _pos = utils::align(ctr & 0xffff'ffff, align); - const u64 _new = utils::align(_pos + size, align); - - if (_new > 0x40000000) [[unlikely]] - { - // Sorry, we failed, and further attempts should fail too. - ctr |= 0x40000000; - return -1; - } - - // Last allocation is stored in highest bits - olda = ctr >> 32; - newa = olda; - - // Check the necessity to commit more memory - if (_new > olda) [[unlikely]] - { - newa = utils::align(_new, 0x200000); - } - - ctr += _new - (ctr & 0xffff'ffff); - return _pos; - }); - - if (pos == umax) [[unlikely]] - { - jit_log.error("Out of memory (size=0x%x, align=0x%x, off=0x%x)", size, align, Off); - return nullptr; - } - - if (olda != newa) [[unlikely]] - { -#ifndef CAN_OVERCOMMIT - // Commit more memory - utils::memory_commit(pointer + olda, newa - olda, Prot); -#endif - // Acknowledge committed memory - Ctr.atomic_op([&](u64& ctr) - { - if ((ctr >> 32) < newa) - { - ctr += (newa - (ctr >> 32)) << 32; - } - }); - } - - ensure(pointer + pos >= get_jit_memory() + Off); - ensure(pointer + pos < get_jit_memory() + Off + 0x40000000); - - return pointer + pos; -} - -const asmjit::Environment& jit_runtime_base::environment() const noexcept -{ - static const asmjit::Environment g_env = asmjit::Environment::host(); - - return g_env; -} - -void* jit_runtime_base::_add(asmjit::CodeHolder* code) noexcept -{ - ensure(!code->flatten()); - ensure(!code->resolveUnresolvedLinks()); - usz codeSize = code->codeSize(); - if (!codeSize) - return nullptr; - - auto p = ensure(this->_alloc(codeSize, 64)); - ensure(!code->relocateToBase(uptr(p))); - - { - // We manage rw <-> rx transitions manually on Apple - // because it's easier to keep track of when and where we need to toggle W^X -#if !(defined(ARCH_ARM64) && defined(__APPLE__)) - asmjit::VirtMem::ProtectJitReadWriteScope rwScope(p, codeSize); -#endif - - for (asmjit::Section* section : code->_sections) - { - std::memcpy(p + section->offset(), section->data(), section->bufferSize()); - } - } - - return p; -} - -jit_runtime::jit_runtime() -{ -} - -jit_runtime::~jit_runtime() -{ -} - -uchar* jit_runtime::_alloc(usz size, usz align) noexcept -{ - return jit_runtime::alloc(size, align, true); -} - -u8* jit_runtime::alloc(usz size, usz align, bool exec) noexcept -{ - if (exec) - { - return add_jit_memory(size, align); - } - else - { - return add_jit_memory(size, align); - } -} - -void jit_runtime::initialize() -{ - if (!s_code_init.empty() || !s_data_init.empty()) - { - return; - } - - // Create code/data snapshot - s_code_init.resize(s_code_pos & 0xffff'ffff); - std::memcpy(s_code_init.data(), alloc(0, 0, true), s_code_init.size()); - s_data_init.resize(s_data_pos & 0xffff'ffff); - std::memcpy(s_data_init.data(), alloc(0, 0, false), s_data_init.size()); -} - -void jit_runtime::finalize() noexcept -{ -#ifdef __APPLE__ - pthread_jit_write_protect_np(false); -#endif - // Reset JIT memory -#ifdef CAN_OVERCOMMIT - utils::memory_reset(get_jit_memory(), 0x80000000); - utils::memory_protect(get_jit_memory(), 0x40000000, utils::protection::wx); -#else - utils::memory_decommit(get_jit_memory(), 0x80000000); -#endif - - s_code_pos = 0; - s_data_pos = 0; - - // Restore code/data snapshot - std::memcpy(alloc(s_code_init.size(), 1, true), s_code_init.data(), s_code_init.size()); - std::memcpy(alloc(s_data_init.size(), 1, false), s_data_init.data(), s_data_init.size()); - -#ifdef __APPLE__ - pthread_jit_write_protect_np(true); -#endif -#ifdef ARCH_ARM64 - // Flush all cache lines after potentially writing executable code - asm("ISB"); - asm("DSB ISH"); -#endif -} - -jit_runtime_base& asmjit::get_global_runtime() -{ - // 16 MiB for internal needs - static constexpr u64 size = 1024 * 1024 * 16; - - struct custom_runtime final : jit_runtime_base - { - custom_runtime() noexcept - { - // Search starting in first 2 GiB of memory - for (u64 addr = size;; addr += size) - { - if (auto ptr = utils::memory_reserve(size, reinterpret_cast(addr))) - { - m_pos.raw() = static_cast(ptr); - break; - } - } - - // Initialize "end" pointer - m_max = m_pos + size; - - // Make memory writable + executable - utils::memory_commit(m_pos, size, utils::protection::wx); - } - - uchar* _alloc(usz size, usz align) noexcept override - { - return m_pos.atomic_op([&](uchar*& pos) -> uchar* - { - const auto r = reinterpret_cast(utils::align(uptr(pos), align)); - - if (r >= pos && r + size > pos && r + size <= m_max) - { - pos = r + size; - return r; - } - - return nullptr; - }); - } - - private: - atomic_t m_pos{}; - - uchar* m_max{}; - }; - - // Magic static - static custom_runtime g_rt; - return g_rt; -} - -asmjit::inline_runtime::inline_runtime(uchar* data, usz size) - : m_data(data) - , m_size(size) -{ -} - -uchar* asmjit::inline_runtime::_alloc(usz size, usz align) noexcept -{ - ensure(align <= 4096); - - return size <= m_size ? m_data : nullptr; -} - -asmjit::inline_runtime::~inline_runtime() -{ - utils::memory_protect(m_data, m_size, utils::protection::rx); -} - -#if defined(ARCH_X64) -asmjit::simd_builder::simd_builder(CodeHolder* ch) noexcept - : native_asm(ch) -{ - _init(0); - consts[~v128()] = this->newLabel(); -} - -asmjit::simd_builder::~simd_builder() -{ -} - -void asmjit::simd_builder::_init(uint new_vsize) -{ - if ((!new_vsize && utils::has_avx512_icl()) || new_vsize == 64) - { - v0 = x86::zmm0; - v1 = x86::zmm1; - v2 = x86::zmm2; - v3 = x86::zmm3; - v4 = x86::zmm4; - v5 = x86::zmm5; - vsize = 64; - } - else if ((!new_vsize && utils::has_avx2()) || new_vsize == 32) - { - v0 = x86::ymm0; - v1 = x86::ymm1; - v2 = x86::ymm2; - v3 = x86::ymm3; - v4 = x86::ymm4; - v5 = x86::ymm5; - vsize = 32; - } - else - { - v0 = x86::xmm0; - v1 = x86::xmm1; - v2 = x86::xmm2; - v3 = x86::xmm3; - v4 = x86::xmm4; - v5 = x86::xmm5; - vsize = new_vsize ? new_vsize : 16; - } - - if (utils::has_avx512()) - { - if (!new_vsize) - vmask = -1; - } - else - { - vmask = 0; - } -} - -void asmjit::simd_builder::operator()() noexcept -{ - for (auto&& [x, y] : consts) - { - this->align(AlignMode::kData, 16); - this->bind(y); - this->embed(&x, 16); - } -} - -void asmjit::simd_builder::vec_cleanup_ret() -{ - if (utils::has_avx() && vsize > 16) - this->vzeroupper(); - this->ret(); -} - -void asmjit::simd_builder::vec_set_all_zeros(const Operand& v) -{ - x86::Xmm reg(v.id()); - if (utils::has_avx()) - this->vpxor(reg, reg, reg); - else - this->xorps(reg, reg); -} - -void asmjit::simd_builder::vec_set_all_ones(const Operand& v) -{ - x86::Xmm reg(v.id()); - if (x86::Zmm zr(v.id()); zr == v) - this->vpternlogd(zr, zr, zr, 0xff); - else if (x86::Ymm yr(v.id()); yr == v) - this->vpcmpeqd(yr, yr, yr); - else if (utils::has_avx()) - this->vpcmpeqd(reg, reg, reg); - else - this->pcmpeqd(reg, reg); -} - -void asmjit::simd_builder::vec_set_const(const Operand& v, const v128& val) -{ - if (!val._u) - return vec_set_all_zeros(v); - if (!~val._u) - return vec_set_all_ones(v); - else - { - Label co = consts[val]; - if (!co.isValid()) - co = consts[val] = this->newLabel(); - if (x86::Zmm zr(v.id()); zr == v) - this->vbroadcasti32x4(zr, x86::oword_ptr(co)); - else if (x86::Ymm yr(v.id()); yr == v) - this->vbroadcasti128(yr, x86::oword_ptr(co)); - else if (utils::has_avx()) - this->vmovaps(x86::Xmm(v.id()), x86::oword_ptr(co)); - else - this->movaps(x86::Xmm(v.id()), x86::oword_ptr(co)); - } -} - -void asmjit::simd_builder::vec_clobbering_test(u32 esize, const Operand& v, const Operand& rhs) -{ - if (esize == 64) - { - this->emit(x86::Inst::kIdVptestmd, x86::k0, v, rhs); - this->ktestw(x86::k0, x86::k0); - } - else if (esize == 32) - { - this->emit(x86::Inst::kIdVptest, v, rhs); - } - else if (esize == 16 && utils::has_avx()) - { - this->emit(x86::Inst::kIdVptest, v, rhs); - } - else if (esize == 16 && utils::has_sse41()) - { - this->emit(x86::Inst::kIdPtest, v, rhs); - } - else - { - if (v != rhs) - this->emit(x86::Inst::kIdPand, v, rhs); - if (esize == 16) - this->emit(x86::Inst::kIdPacksswb, v, v); - this->emit(x86::Inst::kIdMovq, x86::rax, v); - if (esize == 16 || esize == 8) - this->test(x86::rax, x86::rax); - else if (esize == 4) - this->test(x86::eax, x86::eax); - else if (esize == 2) - this->test(x86::ax, x86::ax); - else if (esize == 1) - this->test(x86::al, x86::al); - else - fmt::throw_exception("Unimplemented"); - } -} - -void asmjit::simd_builder::vec_broadcast_gpr(u32 esize, const Operand& v, const x86::Gp& r) -{ - if (esize == 2) - { - if (utils::has_avx512()) - this->emit(x86::Inst::kIdVpbroadcastw, v, r.r32()); - else if (utils::has_avx()) - { - this->emit(x86::Inst::kIdVmovd, v, r.r32()); - if (utils::has_avx2()) - this->emit(x86::Inst::kIdVpbroadcastw, v, v); - else - { - this->emit(x86::Inst::kIdVpunpcklwd, v, v, v); - this->emit(x86::Inst::kIdVpshufd, v, v, Imm(0)); - } - } - else - { - this->emit(x86::Inst::kIdMovd, v, r.r32()); - this->emit(x86::Inst::kIdPunpcklwd, v, v); - this->emit(x86::Inst::kIdPshufd, v, v, Imm(0)); - } - } - else if (esize == 4) - { - if (utils::has_avx512()) - this->emit(x86::Inst::kIdVpbroadcastd, v, r.r32()); - else if (utils::has_avx()) - { - this->emit(x86::Inst::kIdVmovd, v, r.r32()); - if (utils::has_avx2()) - this->emit(x86::Inst::kIdVpbroadcastd, v, v); - else - this->emit(x86::Inst::kIdVpshufd, v, v, Imm(0)); - } - else - { - this->emit(x86::Inst::kIdMovd, v, r.r32()); - this->emit(x86::Inst::kIdPshufd, v, v, Imm(0)); - } - } - else - { - fmt::throw_exception("Unimplemented"); - } -} - -asmjit::x86::Mem asmjit::simd_builder::ptr_scale_for_vec(u32 esize, const x86::Gp& base, const x86::Gp& index) -{ - switch (ensure(esize)) - { - case 1: return x86::ptr(base, index, 0, 0); - case 2: return x86::ptr(base, index, 1, 0); - case 4: return x86::ptr(base, index, 2, 0); - case 8: return x86::ptr(base, index, 3, 0); - default: fmt::throw_exception("Bad esize"); - } -} - -void asmjit::simd_builder::vec_load_unaligned(u32 esize, const Operand& v, const x86::Mem& src) -{ - ensure(std::has_single_bit(esize)); - ensure(std::has_single_bit(vsize)); - - if (esize == 2) - { - ensure(vsize >= 2); - if (vsize == 2) - vec_set_all_zeros(v); - if (vsize == 2 && utils::has_avx()) - this->emit(x86::Inst::kIdVpinsrw, x86::Xmm(v.id()), x86::Xmm(v.id()), src, Imm(0)); - else if (vsize == 2) - this->emit(x86::Inst::kIdPinsrw, v, src, Imm(0)); - else if ((vmask && vmask < 8) || vsize >= 64) - this->emit(x86::Inst::kIdVmovdqu16, v, src); - else - return vec_load_unaligned(vsize, v, src); - } - else if (esize == 4) - { - ensure(vsize >= 4); - if (vsize == 4 && utils::has_avx()) - this->emit(x86::Inst::kIdVmovd, x86::Xmm(v.id()), src); - else if (vsize == 4) - this->emit(x86::Inst::kIdMovd, v, src); - else if ((vmask && vmask < 8) || vsize >= 64) - this->emit(x86::Inst::kIdVmovdqu32, v, src); - else - return vec_load_unaligned(vsize, v, src); - } - else if (esize == 8) - { - ensure(vsize >= 8); - if (vsize == 8 && utils::has_avx()) - this->emit(x86::Inst::kIdVmovq, x86::Xmm(v.id()), src); - else if (vsize == 8) - this->emit(x86::Inst::kIdMovq, v, src); - else if ((vmask && vmask < 8) || vsize >= 64) - this->emit(x86::Inst::kIdVmovdqu64, v, src); - else - return vec_load_unaligned(vsize, v, src); - } - else if (esize >= 16) - { - ensure(vsize >= 16); - if ((vmask && vmask < 8) || vsize >= 64) - this->emit(x86::Inst::kIdVmovdqu64, v, src); // Not really needed - else if (utils::has_avx()) - this->emit(x86::Inst::kIdVmovdqu, v, src); - else - this->emit(x86::Inst::kIdMovups, v, src); - } - else - { - fmt::throw_exception("Unimplemented"); - } -} - -void asmjit::simd_builder::vec_store_unaligned(u32 esize, const Operand& v, const x86::Mem& dst) -{ - ensure(std::has_single_bit(esize)); - ensure(std::has_single_bit(vsize)); - - if (esize == 2) - { - ensure(vsize >= 2); - if (vsize == 2 && utils::has_avx()) - this->emit(x86::Inst::kIdVpextrw, dst, x86::Xmm(v.id()), Imm(0)); - else if (vsize == 2 && utils::has_sse41()) - this->emit(x86::Inst::kIdPextrw, dst, v, Imm(0)); - else if (vsize == 2) - this->push(x86::rax), this->pextrw(x86::eax, x86::Xmm(v.id()), 0), this->mov(dst, x86::ax), this->pop(x86::rax); - else if ((vmask && vmask < 8) || vsize >= 64) - this->emit(x86::Inst::kIdVmovdqu16, dst, v); - else - return vec_store_unaligned(vsize, v, dst); - } - else if (esize == 4) - { - ensure(vsize >= 4); - if (vsize == 4 && utils::has_avx()) - this->emit(x86::Inst::kIdVmovd, dst, x86::Xmm(v.id())); - else if (vsize == 4) - this->emit(x86::Inst::kIdMovd, dst, v); - else if ((vmask && vmask < 8) || vsize >= 64) - this->emit(x86::Inst::kIdVmovdqu32, dst, v); - else - return vec_store_unaligned(vsize, v, dst); - } - else if (esize == 8) - { - ensure(vsize >= 8); - if (vsize == 8 && utils::has_avx()) - this->emit(x86::Inst::kIdVmovq, dst, x86::Xmm(v.id())); - else if (vsize == 8) - this->emit(x86::Inst::kIdMovq, dst, v); - else if ((vmask && vmask < 8) || vsize >= 64) - this->emit(x86::Inst::kIdVmovdqu64, dst, v); - else - return vec_store_unaligned(vsize, v, dst); - } - else if (esize >= 16) - { - ensure(vsize >= 16); - if ((vmask && vmask < 8) || vsize >= 64) - this->emit(x86::Inst::kIdVmovdqu64, dst, v); // Not really needed - else if (utils::has_avx()) - this->emit(x86::Inst::kIdVmovdqu, dst, v); - else - this->emit(x86::Inst::kIdMovups, dst, v); - } - else - { - fmt::throw_exception("Unimplemented"); - } -} - -void asmjit::simd_builder::_vec_binary_op(x86::Inst::Id sse_op, x86::Inst::Id vex_op, x86::Inst::Id evex_op, const Operand& dst, const Operand& lhs, const Operand& rhs) -{ - if (utils::has_avx()) - { - if (evex_op != x86::Inst::kIdNone && (vex_op == x86::Inst::kIdNone || this->_extraReg.isReg() || vsize >= 64)) - { - this->evex().emit(evex_op, dst, lhs, rhs); - } - else - { - this->emit(vex_op, dst, lhs, rhs); - } - } - else if (dst == lhs) - { - this->emit(sse_op, dst, rhs); - } - else if (dst == rhs) - { - fmt::throw_exception("Unimplemented"); - } - else - { - this->emit(x86::Inst::kIdMovaps, dst, lhs); - this->emit(sse_op, dst, rhs); - } -} - -void asmjit::simd_builder::vec_umin(u32 esize, const Operand& dst, const Operand& lhs, const Operand& rhs) -{ - using enum x86::Inst::Id; - if (esize == 2) - { - if (utils::has_sse41()) - return _vec_binary_op(kIdPminuw, kIdVpminuw, kIdVpminuw, dst, lhs, rhs); - } - else if (esize == 4) - { - if (utils::has_sse41()) - return _vec_binary_op(kIdPminud, kIdVpminud, kIdVpminud, dst, lhs, rhs); - } - - fmt::throw_exception("Unimplemented"); -} - -void asmjit::simd_builder::vec_umax(u32 esize, const Operand& dst, const Operand& lhs, const Operand& rhs) -{ - using enum x86::Inst::Id; - if (esize == 2) - { - if (utils::has_sse41()) - return _vec_binary_op(kIdPmaxuw, kIdVpmaxuw, kIdVpmaxuw, dst, lhs, rhs); - } - else if (esize == 4) - { - if (utils::has_sse41()) - return _vec_binary_op(kIdPmaxud, kIdVpmaxud, kIdVpmaxud, dst, lhs, rhs); - } - - fmt::throw_exception("Unimplemented"); -} - -void asmjit::simd_builder::vec_cmp_eq(u32 esize, const Operand& dst, const Operand& lhs, const Operand& rhs) -{ - using enum x86::Inst::Id; - if (esize == 2) - { - if (vsize == 64) - { - this->evex().emit(kIdVpcmpeqw, x86::k0, lhs, rhs); - this->evex().emit(kIdVpmovm2w, dst, x86::k0); - } - else - { - _vec_binary_op(kIdPcmpeqw, kIdVpcmpeqw, kIdNone, dst, lhs, rhs); - } - } - else if (esize == 4) - { - if (vsize == 64) - { - this->evex().emit(kIdVpcmpeqd, x86::k0, lhs, rhs); - this->evex().emit(kIdVpmovm2d, dst, x86::k0); - } - else - { - _vec_binary_op(kIdPcmpeqd, kIdVpcmpeqd, kIdNone, dst, lhs, rhs); - } - } - else - { - fmt::throw_exception("Unimplemented"); - } -} - -void asmjit::simd_builder::vec_extract_high(u32, const Operand& dst, const Operand& src) -{ - if (vsize == 32) - this->vextracti32x8(x86::Ymm(dst.id()), x86::Zmm(src.id()), 1); - else if (vsize == 16) - this->vextracti128(x86::Xmm(dst.id()), x86::Ymm(src.id()), 1); - else - { - if (utils::has_avx()) - this->vpsrldq(x86::Xmm(dst.id()), x86::Xmm(src.id()), vsize); - else - { - this->movdqa(x86::Xmm(dst.id()), x86::Xmm(src.id())); - this->psrldq(x86::Xmm(dst.id()), vsize); - } - } -} - -void asmjit::simd_builder::vec_extract_gpr(u32 esize, const x86::Gp& dst, const Operand& src) -{ - if (esize == 8 && utils::has_avx()) - this->vmovq(dst.r64(), x86::Xmm(src.id())); - else if (esize == 8) - this->movq(dst.r64(), x86::Xmm(src.id())); - else if (esize == 4 && utils::has_avx()) - this->vmovd(dst.r32(), x86::Xmm(src.id())); - else if (esize == 4) - this->movd(dst.r32(), x86::Xmm(src.id())); - else if (esize == 2 && utils::has_avx()) - this->vpextrw(dst.r32(), x86::Xmm(src.id()), 0); - else if (esize == 2) - this->pextrw(dst.r32(), x86::Xmm(src.id()), 0); - else - fmt::throw_exception("Unimplemented"); -} - -#endif /* X86 */ - #ifdef LLVM_AVAILABLE #include -#include -#include #ifdef _MSC_VER #pragma warning(push, 0) @@ -866,7 +31,6 @@ void asmjit::simd_builder::vec_extract_gpr(u32 esize, const x86::Gp& dst, const #pragma GCC diagnostic ignored "-Wmissing-noreturn" #endif #include "llvm/Support/TargetSelect.h" -#include "llvm/Support/FormattedStream.h" #include "llvm/TargetParser/Host.h" #include "llvm/ExecutionEngine/ExecutionEngine.h" #include "llvm/ExecutionEngine/RTDyldMemoryManager.h" diff --git a/rpcs3/Emu/CMakeLists.txt b/rpcs3/Emu/CMakeLists.txt index ea546e40d4..39913591b3 100644 --- a/rpcs3/Emu/CMakeLists.txt +++ b/rpcs3/Emu/CMakeLists.txt @@ -80,29 +80,15 @@ target_link_libraries(rpcs3_emu PUBLIC 3rdparty::pugixml) -if(MSVC) - set_source_files_properties("../../Utilities/JIT.cpp" PROPERTIES - COMPILE_FLAGS /GR- - SKIP_PRECOMPILE_HEADERS ON - ) -else() - set_source_files_properties("../../Utilities/JIT.cpp" PROPERTIES - COMPILE_FLAGS -fno-rtti - SKIP_PRECOMPILE_HEADERS ON - ) -endif() +set_source_files_properties("../../Utilities/JITLLVM.cpp" "../../Utilities/JITASM.cpp" PROPERTIES + COMPILE_FLAGS "$,/GR-,-fno-rtti>" + SKIP_PRECOMPILE_HEADERS ON +) -if(MSVC) - set_source_files_properties("../util/yaml.cpp" PROPERTIES - COMPILE_FLAGS /EHsc - SKIP_PRECOMPILE_HEADERS ON - ) -else() - set_source_files_properties("../util/yaml.cpp" PROPERTIES - COMPILE_FLAGS -fexceptions - SKIP_PRECOMPILE_HEADERS ON - ) -endif() +set_source_files_properties("../util/yaml.cpp" PROPERTIES + COMPILE_FLAGS "$,/EHsc,-fexceptions>" + SKIP_PRECOMPILE_HEADERS ON +) # Crypto target_sources(rpcs3_emu PRIVATE diff --git a/rpcs3/Emu/Cell/SPUCommonRecompiler.cpp b/rpcs3/Emu/Cell/SPUCommonRecompiler.cpp index ffe3373425..f9c8726b19 100644 --- a/rpcs3/Emu/Cell/SPUCommonRecompiler.cpp +++ b/rpcs3/Emu/Cell/SPUCommonRecompiler.cpp @@ -7,7 +7,6 @@ #include "Emu/system_utils.hpp" #include "Emu/cache_utils.hpp" #include "Emu/IdManager.h" -#include "Emu/Cell/timers.hpp" #include "Crypto/sha1.h" #include "Utilities/StrUtil.h" #include "Utilities/JIT.h" @@ -19,8 +18,6 @@ #include "SPUInterpreter.h" #include "SPUDisAsm.h" #include -#include -#include #include #include @@ -4396,7402 +4393,6 @@ void spu_recompiler_base::dump(const spu_program& result, std::string& out) out += '\n'; } -#ifdef LLVM_AVAILABLE - -#include "Emu/CPU/CPUTranslator.h" - -#ifdef _MSC_VER -#pragma warning(push, 0) -#else -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wall" -#pragma GCC diagnostic ignored "-Wextra" -#pragma GCC diagnostic ignored "-Wold-style-cast" -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Weffc++" -#pragma GCC diagnostic ignored "-Wmissing-noreturn" -#endif -#if LLVM_VERSION_MAJOR < 17 -#include "llvm/ADT/Triple.h" -#endif -#include "llvm/TargetParser/Host.h" -#include "llvm/IR/LegacyPassManager.h" -#include "llvm/IR/Verifier.h" -#include "llvm/IR/InlineAsm.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Scalar.h" -#include "llvm/Analysis/PostDominators.h" -#include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/ADT/PostOrderIterator.h" -#ifdef _MSC_VER -#pragma warning(pop) -#else -#pragma GCC diagnostic pop -#endif - -class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator -{ - // JIT Instance - jit_compiler m_jit{{}, jit_compiler::cpu(g_cfg.core.llvm_cpu)}; - - // Interpreter table size power - const u8 m_interp_magn; - - // Constant opcode bits - u32 m_op_const_mask = -1; - - // Current function chunk entry point - u32 m_entry; - - // Main entry point offset - u32 m_base; - - // Module name - std::string m_hash; - - // Patchpoint unique id - u32 m_pp_id = 0; - - // Next opcode - u32 m_next_op = 0; - - // Current function (chunk) - llvm::Function* m_function; - - llvm::Value* m_thread; - llvm::Value* m_lsptr; - llvm::Value* m_interp_op; - llvm::Value* m_interp_pc; - llvm::Value* m_interp_table; - llvm::Value* m_interp_7f0; - llvm::Value* m_interp_regs; - - // Helpers - llvm::Value* m_base_pc; - llvm::Value* m_interp_pc_next; - llvm::BasicBlock* m_interp_bblock; - - // i8*, contains constant vm::g_base_addr value - llvm::Value* m_memptr; - - // Pointers to registers in the thread context - std::array m_reg_addr; - - // Global variable (function table) - llvm::GlobalVariable* m_function_table{}; - - // Helpers (interpreter) - llvm::GlobalVariable* m_scale_float_to{}; - llvm::GlobalVariable* m_scale_to_float{}; - - // Function for check_state execution - llvm::Function* m_test_state{}; - - // Chunk for external tail call (dispatch) - llvm::Function* m_dispatch{}; - - llvm::MDNode* m_md_unlikely; - llvm::MDNode* m_md_likely; - - struct block_info - { - // Pointer to the analyser - spu_recompiler_base::block_info* bb{}; - - // Current block's entry block - llvm::BasicBlock* block; - - // Final block (for PHI nodes, set after completion) - llvm::BasicBlock* block_end{}; - - // Additional blocks for sinking instructions after block_end: - std::unordered_map> block_edges; - - // Current register values - std::array reg{}; - - // PHI nodes created for this block (if any) - std::array phi{}; - - // Store instructions - std::array store{}; - - // Store reordering/elimination protection - std::array store_context_last_id = fill_array(0); // Protects against illegal forward ordering - std::array store_context_first_id = fill_array(usz{umax}); // Protects against illegal past store elimination (backwards ordering is not implemented) - std::array store_context_ctr = fill_array(1); // Store barrier cointer - - bool does_gpr_barrier_proceed_last_store(u32 i) const noexcept - { - const usz counter = store_context_ctr[i]; - return counter != 1 && counter > store_context_last_id[i]; - } - - bool does_gpr_barrier_preceed_first_store(u32 i) const noexcept - { - const usz counter = store_context_ctr[i]; - const usz first_id = store_context_first_id[i]; - return counter != 1 && first_id != umax && counter < first_id; - } - }; - - struct function_info - { - // Standard callable chunk - llvm::Function* chunk{}; - - // Callable function - llvm::Function* fn{}; - - // Registers possibly loaded in the entry block - std::array load{}; - }; - - // Current block - block_info* m_block; - - // Current function or chunk - function_info* m_finfo; - - // All blocks in the current function chunk - std::unordered_map> m_blocks; - - // Block list for processing - std::vector m_block_queue; - - // All function chunks in current SPU compile unit - std::unordered_map> m_functions; - - // Function chunk list for processing - std::vector m_function_queue; - - // Add or get the function chunk - function_info* add_function(u32 addr) - { - // Enqueue if necessary - const auto empl = m_functions.try_emplace(addr); - - if (!empl.second) - { - return &empl.first->second; - } - - // Chunk function type - // 0. Result (tail call target) - // 1. Thread context - // 2. Local storage pointer - // 3. -#if 0 - const auto chunk_type = get_ftype(); -#else - const auto chunk_type = get_ftype(); -#endif - - // Get function chunk name - const std::string name = fmt::format("__spu-cx%05x-%s", addr, fmt::base57(be_t{m_hash_start})); - llvm::Function* result = llvm::cast(m_module->getOrInsertFunction(name, chunk_type).getCallee()); - - // Set parameters - result->setLinkage(llvm::GlobalValue::InternalLinkage); - result->addParamAttr(0, llvm::Attribute::NoAlias); - result->addParamAttr(1, llvm::Attribute::NoAlias); -#if 1 - result->setCallingConv(llvm::CallingConv::GHC); -#endif - - empl.first->second.chunk = result; - - if (g_cfg.core.spu_block_size == spu_block_size_type::giga) - { - // Find good real function - const auto ffound = m_funcs.find(addr); - - if (ffound != m_funcs.end() && ffound->second.good) - { - // Real function type (not equal to chunk type) - // 4. $SP - // 5. $3 - const auto func_type = get_ftype(); - - const std::string fname = fmt::format("__spu-fx%05x-%s", addr, fmt::base57(be_t{m_hash_start})); - llvm::Function* fn = llvm::cast(m_module->getOrInsertFunction(fname, func_type).getCallee()); - - fn->setLinkage(llvm::GlobalValue::InternalLinkage); - fn->addParamAttr(0, llvm::Attribute::NoAlias); - fn->addParamAttr(1, llvm::Attribute::NoAlias); -#if 1 - fn->setCallingConv(llvm::CallingConv::GHC); -#endif - empl.first->second.fn = fn; - } - } - - // Enqueue - m_function_queue.push_back(addr); - - return &empl.first->second; - } - - // Create tail call to the function chunk (non-tail calls are just out of question) - void tail_chunk(llvm::FunctionCallee callee, llvm::Value* base_pc = nullptr) - { - if (!callee && !g_cfg.core.spu_verification) - { - // Disable patchpoints if verification is disabled - callee = m_dispatch; - } - else if (!callee) - { - // Create branch patchpoint if chunk == nullptr - ensure(m_finfo && (!m_finfo->fn || m_function == m_finfo->chunk)); - - // Register under a unique linkable name - const std::string ppname = fmt::format("%s-pp-%u", m_hash, m_pp_id++); - m_engine->updateGlobalMapping(ppname, reinterpret_cast(m_spurt->make_branch_patchpoint())); - - // Create function with not exactly correct type - const auto ppfunc = llvm::cast(m_module->getOrInsertFunction(ppname, m_finfo->chunk->getFunctionType()).getCallee()); - ppfunc->setCallingConv(m_finfo->chunk->getCallingConv()); - - if (m_finfo->chunk->getReturnType() != get_type()) - { - m_ir->CreateRet(ppfunc); - return; - } - - callee = ppfunc; - base_pc = m_ir->getInt32(0); - } - - ensure(callee); - auto call = m_ir->CreateCall(callee, {m_thread, m_lsptr, base_pc ? base_pc : m_base_pc}); - auto func = m_finfo ? m_finfo->chunk : llvm::dyn_cast(callee.getCallee()); - call->setCallingConv(func->getCallingConv()); - call->setTailCall(); - - if (func->getReturnType() == get_type()) - { - m_ir->CreateRetVoid(); - } - else - { - m_ir->CreateRet(call); - } - } - - // Call the real function - void call_function(llvm::Function* fn, bool tail = false) - { - llvm::Value* lr{}; - llvm::Value* sp{}; - llvm::Value* r3{}; - - if (!m_finfo->fn && !m_block) - { - lr = m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::gpr, +s_reg_lr, &v128::_u32, 3)); - sp = m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::gpr, +s_reg_sp)); - r3 = m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::gpr, 3)); - } - else - { - lr = m_ir->CreateExtractElement(get_reg_fixed(s_reg_lr).value, 3); - sp = get_reg_fixed(s_reg_sp).value; - r3 = get_reg_fixed(3).value; - } - - const auto _call = m_ir->CreateCall(ensure(fn), {m_thread, m_lsptr, m_base_pc, sp, r3}); - - _call->setCallingConv(fn->getCallingConv()); - - // Tail call using loaded LR value (gateway from a chunk) - if (!m_finfo->fn) - { - lr = m_ir->CreateAnd(lr, 0x3fffc); - m_ir->CreateStore(lr, spu_ptr(&spu_thread::pc)); - m_ir->CreateStore(_call, spu_ptr(&spu_thread::gpr, 3)); - m_ir->CreateBr(add_block_indirect({}, value(lr))); - } - else if (tail) - { - _call->setTailCall(); - m_ir->CreateRet(_call); - } - else - { - // TODO: initialize $LR with a constant - for (u32 i = 0; i < s_reg_max; i++) - { - if (i != s_reg_lr && i != s_reg_sp && (i < s_reg_80 || i > s_reg_127)) - { - m_block->reg[i] = m_ir->CreateLoad(get_reg_type(i), init_reg_fixed(i)); - } - } - - // Set result - m_block->reg[3] = _call; - } - } - - // Emit return from the real function - void ret_function() - { - m_ir->CreateRet(get_reg_fixed(3).value); - } - - void set_function(llvm::Function* func) - { - m_function = func; - m_thread = func->getArg(0); - m_lsptr = func->getArg(1); - m_base_pc = func->getArg(2); - - m_reg_addr.fill(nullptr); - m_block = nullptr; - m_finfo = nullptr; - m_blocks.clear(); - m_block_queue.clear(); - m_ir->SetInsertPoint(llvm::BasicBlock::Create(m_context, "", m_function)); - m_memptr = m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::memory_base_addr)); - } - - // Add block with current block as a predecessor - llvm::BasicBlock* add_block(u32 target, bool absolute = false) - { - // Check the predecessor - const bool pred_found = m_block_info[target / 4] && m_preds[target].find_first_of(m_pos) + 1; - - if (m_blocks.empty()) - { - // Special case: first block, proceed normally - if (auto fn = std::exchange(m_finfo->fn, nullptr)) - { - // Create a gateway - call_function(fn, true); - - m_finfo->fn = fn; - m_function = fn; - m_thread = fn->getArg(0); - m_lsptr = fn->getArg(1); - m_base_pc = fn->getArg(2); - m_ir->SetInsertPoint(llvm::BasicBlock::Create(m_context, "", fn)); - m_memptr = m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::memory_base_addr)); - - // Load registers at the entry chunk - for (u32 i = 0; i < s_reg_max; i++) - { - if (i >= s_reg_80 && i <= s_reg_127) - { - // TODO - //m_finfo->load[i] = llvm::UndefValue::get(get_reg_type(i)); - } - - m_finfo->load[i] = m_ir->CreateLoad(get_reg_type(i), init_reg_fixed(i)); - } - - // Load $SP - m_finfo->load[s_reg_sp] = fn->getArg(3); - - // Load first args - m_finfo->load[3] = fn->getArg(4); - } - } - else if (m_block_info[target / 4] && m_entry_info[target / 4] && !(pred_found && m_entry == target) && (!m_finfo->fn || !m_ret_info[target / 4])) - { - // Generate a tail call to the function chunk - const auto cblock = m_ir->GetInsertBlock(); - const auto result = llvm::BasicBlock::Create(m_context, "", m_function); - m_ir->SetInsertPoint(result); - const auto pfinfo = add_function(target); - - if (absolute) - { - ensure(!m_finfo->fn); - - const auto next = llvm::BasicBlock::Create(m_context, "", m_function); - const auto fail = llvm::BasicBlock::Create(m_context, "", m_function); - m_ir->CreateCondBr(m_ir->CreateICmpEQ(m_base_pc, m_ir->getInt32(m_base)), next, fail); - m_ir->SetInsertPoint(fail); - m_ir->CreateStore(m_ir->getInt32(target), spu_ptr(&spu_thread::pc)); - tail_chunk(nullptr); - m_ir->SetInsertPoint(next); - } - - if (pfinfo->fn) - { - // Tail call to the real function - call_function(pfinfo->fn, true); - - if (!result->getTerminator()) - ret_function(); - } - else - { - // Just a boring tail call to another chunk - update_pc(target); - tail_chunk(pfinfo->chunk); - } - - m_ir->SetInsertPoint(cblock); - return result; - } - else if (!pred_found || !m_block_info[target / 4]) - { - if (m_block_info[target / 4]) - { - spu_log.error("[%s] [0x%x] Predecessor not found for target 0x%x (chunk=0x%x, entry=0x%x, size=%u)", m_hash, m_pos, target, m_entry, m_function_queue[0], m_size / 4); - } - - const auto cblock = m_ir->GetInsertBlock(); - const auto result = llvm::BasicBlock::Create(m_context, "", m_function); - m_ir->SetInsertPoint(result); - - if (absolute) - { - ensure(!m_finfo->fn); - - m_ir->CreateStore(m_ir->getInt32(target), spu_ptr(&spu_thread::pc)); - } - else - { - update_pc(target); - } - - tail_chunk(nullptr); - m_ir->SetInsertPoint(cblock); - return result; - } - - ensure(!absolute); - - auto& result = m_blocks[target].block; - - if (!result) - { - result = llvm::BasicBlock::Create(m_context, fmt::format("b-0x%x", target), m_function); - - // Add the block to the queue - m_block_queue.push_back(target); - } - else if (m_block && m_blocks[target].block_end) - { - // Connect PHI nodes if necessary - for (u32 i = 0; i < s_reg_max; i++) - { - if (const auto phi = m_blocks[target].phi[i]) - { - const auto typ = phi->getType() == get_type() ? get_type() : get_reg_type(i); - phi->addIncoming(get_reg_fixed(i, typ), m_block->block_end); - } - } - } - - return result; - } - - template - llvm::Value* _ptr(llvm::Value* base, u32 offset) - { - return m_ir->CreateGEP(get_type(), base, m_ir->getInt64(offset)); - } - - template - llvm::Value* spu_ptr(Args... offset_args) - { - return _ptr(m_thread, ::offset32(offset_args...)); - } - - template - llvm::Value* spu_ptr(value_t add, Args... offset_args) - { - const auto off = m_ir->CreateGEP(get_type(), m_thread, m_ir->getInt64(::offset32(offset_args...))); - return m_ir->CreateAdd(off, add.value); - } - - // Return default register type - llvm::Type* get_reg_type(u32 index) - { - if (index < 128) - { - return get_type(); - } - - switch (index) - { - case s_reg_mfc_eal: - case s_reg_mfc_lsa: - return get_type(); - case s_reg_mfc_tag: - return get_type(); - case s_reg_mfc_size: - return get_type(); - default: - fmt::throw_exception("get_reg_type(%u): invalid register index", index); - } - } - - u32 get_reg_offset(u32 index) - { - if (index < 128) - { - return ::offset32(&spu_thread::gpr, index); - } - - switch (index) - { - case s_reg_mfc_eal: return ::offset32(&spu_thread::ch_mfc_cmd, &spu_mfc_cmd::eal); - case s_reg_mfc_lsa: return ::offset32(&spu_thread::ch_mfc_cmd, &spu_mfc_cmd::lsa); - case s_reg_mfc_tag: return ::offset32(&spu_thread::ch_mfc_cmd, &spu_mfc_cmd::tag); - case s_reg_mfc_size: return ::offset32(&spu_thread::ch_mfc_cmd, &spu_mfc_cmd::size); - default: - fmt::throw_exception("get_reg_offset(%u): invalid register index", index); - } - } - - llvm::Value* init_reg_fixed(u32 index) - { - if (!m_block) - { - return _ptr(m_thread, get_reg_offset(index)); - } - - auto& ptr = ::at32(m_reg_addr, index); - - if (!ptr) - { - // Save and restore current insert point if necessary - const auto block_cur = m_ir->GetInsertBlock(); - - // Emit register pointer at the beginning of the function chunk - m_ir->SetInsertPoint(m_function->getEntryBlock().getTerminator()); - ptr = _ptr(m_thread, get_reg_offset(index)); - m_ir->SetInsertPoint(block_cur); - } - - return ptr; - } - - // Get pointer to the vector register (interpreter only) - template - llvm::Value* init_vr(const bf_t&) - { - if (!m_interp_magn) - { - m_interp_7f0 = m_ir->getInt32(0x7f0); - m_interp_regs = _ptr(m_thread, get_reg_offset(0)); - } - - // Extract reg index - const auto isl = I >= 4 ? m_interp_op : m_ir->CreateShl(m_interp_op, u64{4 - I}); - const auto isr = I <= 4 ? m_interp_op : m_ir->CreateLShr(m_interp_op, u64{I - 4}); - const auto idx = m_ir->CreateAnd(I > 4 ? isr : isl, m_interp_7f0); - - // Pointer to the register - return m_ir->CreateGEP(get_type(), m_interp_regs, m_ir->CreateZExt(idx, get_type())); - } - - llvm::Value* double_as_uint64(llvm::Value* val) - { - return bitcast(val); - } - - llvm::Value* uint64_as_double(llvm::Value* val) - { - return bitcast(val); - } - - llvm::Value* double_to_xfloat(llvm::Value* val) - { - ensure(val && val->getType() == get_type()); - - const auto d = double_as_uint64(val); - const auto s = m_ir->CreateAnd(m_ir->CreateLShr(d, 32), 0x80000000); - const auto m = m_ir->CreateXor(m_ir->CreateLShr(d, 29), 0x40000000); - const auto r = m_ir->CreateOr(m_ir->CreateAnd(m, 0x7fffffff), s); - return m_ir->CreateTrunc(m_ir->CreateSelect(m_ir->CreateIsNotNull(d), r, splat(0).eval(m_ir)), get_type()); - } - - llvm::Value* xfloat_to_double(llvm::Value* val) - { - ensure(val && val->getType() == get_type()); - - const auto x = m_ir->CreateZExt(val, get_type()); - const auto s = m_ir->CreateShl(m_ir->CreateAnd(x, 0x80000000), 32); - const auto a = m_ir->CreateAnd(x, 0x7fffffff); - const auto m = m_ir->CreateShl(m_ir->CreateAdd(a, splat(0x1c0000000).eval(m_ir)), 29); - const auto r = m_ir->CreateSelect(m_ir->CreateICmpSGT(a, splat(0x7fffff).eval(m_ir)), m, splat(0).eval(m_ir)); - const auto f = m_ir->CreateOr(s, r); - return uint64_as_double(f); - } - - // Clamp double values to ±Smax, flush values smaller than ±Smin to positive zero - llvm::Value* xfloat_in_double(llvm::Value* val) - { - ensure(val && val->getType() == get_type()); - - const auto smax = uint64_as_double(splat(0x47ffffffe0000000).eval(m_ir)); - const auto smin = uint64_as_double(splat(0x3810000000000000).eval(m_ir)); - - const auto d = double_as_uint64(val); - const auto s = m_ir->CreateAnd(d, 0x8000000000000000); - const auto a = uint64_as_double(m_ir->CreateAnd(d, 0x7fffffffe0000000)); - const auto n = m_ir->CreateFCmpOLT(a, smax); - const auto z = m_ir->CreateFCmpOLT(a, smin); - const auto c = double_as_uint64(m_ir->CreateSelect(n, a, smax)); - return m_ir->CreateSelect(z, fsplat(0.).eval(m_ir), uint64_as_double(m_ir->CreateOr(c, s))); - } - - // Expand 32-bit mask for xfloat values to 64-bit, 29 least significant bits are always zero - llvm::Value* conv_xfloat_mask(llvm::Value* val) - { - const auto d = m_ir->CreateZExt(val, get_type()); - const auto s = m_ir->CreateShl(m_ir->CreateAnd(d, 0x80000000), 32); - const auto e = m_ir->CreateLShr(m_ir->CreateAShr(m_ir->CreateShl(d, 33), 4), 1); - return m_ir->CreateOr(s, e); - } - - llvm::Value* get_reg_raw(u32 index) - { - if (!m_block || index >= m_block->reg.size()) - { - return nullptr; - } - - return m_block->reg[index]; - } - - llvm::Value* get_reg_fixed(u32 index, llvm::Type* type) - { - llvm::Value* dummy{}; - - auto& reg = *(m_block ? &::at32(m_block->reg, index) : &dummy); - - if (!reg) - { - // Load register value if necessary - reg = m_finfo && m_finfo->load[index] ? m_finfo->load[index] : m_ir->CreateLoad(get_reg_type(index), init_reg_fixed(index)); - } - - if (reg->getType() == get_type()) - { - if (type == reg->getType()) - { - return reg; - } - - return bitcast(double_to_xfloat(reg), type); - } - - if (type == get_type()) - { - return xfloat_to_double(bitcast(reg)); - } - - return bitcast(reg, type); - } - - template - value_t get_reg_fixed(u32 index) - { - value_t r; - r.value = get_reg_fixed(index, get_type()); - return r; - } - - template - value_t get_vr(const bf_t& index) - { - value_t r; - - if ((m_op_const_mask & index.data_mask()) != index.data_mask()) - { - // Update const mask if necessary - if (I >= (32u - m_interp_magn)) - { - m_op_const_mask |= index.data_mask(); - } - - // Load reg - if (get_type() == get_type()) - { - r.value = xfloat_to_double(m_ir->CreateLoad(get_type(), init_vr(index))); - } - else - { - r.value = m_ir->CreateLoad(get_type(), init_vr(index)); - } - } - else - { - r.value = get_reg_fixed(index, get_type()); - } - - return r; - } - - template - auto get_vr_as(U&&, const bf_t& index) - { - return get_vr::type>(index); - } - - template - std::tuple>...> get_vrs(const Args&... args) - { - return {get_vr(args)...}; - } - - template - llvm_match_t match_vr(const bf_t& index) - { - llvm_match_t r; - - if (m_block) - { - auto v = ::at32(m_block->reg, index); - - if (v && v->getType() == get_type()) - { - r.value = v; - return r; - } - } - - return r; - } - - template - auto match_vr_as(U&&, const bf_t& index) - { - return match_vr::type>(index); - } - - template - bool match_vr(const bf_t& index, F&& pred) - { - return (( match_vr(index) ? pred(match_vr(index), match()) : false ) || ...); - } - - template - std::tuple>...> match_vrs(const Args&... args) - { - return {match_vr(args)...}; - } - - // Extract scalar value from the preferred slot - template - auto get_scalar(value_t value) - { - using e_type = std::remove_extent_t; - - static_assert(sizeof(T) == 16 || std::is_same_v, "Unknown vector type"); - - if (auto [ok, v] = match_expr(value, vsplat(match())); ok) - { - return eval(v); - } - - if constexpr (sizeof(e_type) == 1) - { - return eval(extract(value, 12)); - } - else if constexpr (sizeof(e_type) == 2) - { - return eval(extract(value, 6)); - } - else if constexpr (sizeof(e_type) == 4 || sizeof(T) == 32) - { - return eval(extract(value, 3)); - } - else - { - return eval(extract(value, 1)); - } - } - - // Splat scalar value from the preferred slot - template - auto splat_scalar(T&& arg) - { - using VT = std::remove_extent_t::type>; - - if constexpr (sizeof(VT) == 1) - { - return zshuffle(std::forward(arg), 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12); - } - else if constexpr (sizeof(VT) == 2) - { - return zshuffle(std::forward(arg), 6, 6, 6, 6, 6, 6, 6, 6); - } - else if constexpr (sizeof(VT) == 4) - { - return zshuffle(std::forward(arg), 3, 3, 3, 3); - } - else if constexpr (sizeof(VT) == 8) - { - return zshuffle(std::forward(arg), 1, 1); - } - else - { - static_assert(sizeof(VT) == 16); - return std::forward(arg); - } - } - - void set_reg_fixed(u32 index, llvm::Value* value, bool fixup = true) - { - llvm::StoreInst* dummy{}; - - // Check - ensure(!m_block || m_regmod[m_pos / 4] == index); - - // Test for special case - const bool is_xfloat = value->getType() == get_type(); - - // Clamp value if necessary - const auto saved_value = is_xfloat && fixup ? xfloat_in_double(value) : value; - - // Set register value - if (m_block) - { -#ifndef _WIN32 - if (g_cfg.core.spu_debug) - value->setName(fmt::format("result_0x%05x", m_pos)); -#endif - - ::at32(m_block->reg, index) = saved_value; - } - - // Get register location - const auto addr = init_reg_fixed(index); - - auto& _store = *(m_block ? &m_block->store[index] : &dummy); - - // Erase previous dead store instruction if necessary - if (_store) - { - if (m_block->store_context_last_id[index] == m_block->store_context_ctr[index]) - { - // Erase store of it is not preserved by ensure_gpr_stores() - _store->eraseFromParent(); - } - } - - if (m_block) - { - // Keep the store's location in history of gpr preservaions - m_block->store_context_last_id[index] = m_block->store_context_ctr[index]; - m_block->store_context_first_id[index] = std::min(m_block->store_context_first_id[index], m_block->store_context_ctr[index]); - } - - if (m_finfo && m_finfo->fn) - { - if (index <= 3 || (index >= s_reg_80 && index <= s_reg_127)) - { - // Don't save some registers in true functions - return; - } - } - - // Write register to the context - _store = m_ir->CreateStore(is_xfloat ? double_to_xfloat(saved_value) : m_ir->CreateBitCast(value, get_reg_type(index)), addr); - } - - template - void set_vr(const bf_t& index, T expr, std::function vr_assume = nullptr, bool fixup = true) - { - // Process expression - const auto value = expr.eval(m_ir); - - // Test for special case - const bool is_xfloat = value->getType() == get_type(); - - if ((m_op_const_mask & index.data_mask()) != index.data_mask()) - { - // Update const mask if necessary - if (I >= (32u - m_interp_magn)) - { - m_op_const_mask |= index.data_mask(); - } - - // Clamp value if necessary - const auto saved_value = is_xfloat && fixup ? xfloat_in_double(value) : value; - - // Store value - m_ir->CreateStore(is_xfloat ? double_to_xfloat(saved_value) : m_ir->CreateBitCast(value, get_type()), init_vr(index)); - return; - } - - if (vr_assume) - { - } - - set_reg_fixed(index, value, fixup); - } - - template - value_t get_imm(const bf_t& imm, bool mask = true) - { - if ((m_op_const_mask & imm.data_mask()) != imm.data_mask()) - { - // Update const mask if necessary - if (I >= (32u - m_interp_magn)) - { - m_op_const_mask |= imm.data_mask(); - } - - // Extract unsigned immediate (skip AND if mask == false or truncated anyway) - value_t r; - r.value = m_interp_op; - r.value = I == 0 ? r.value : m_ir->CreateLShr(r.value, u64{I}); - r.value = !mask || N >= r.esize ? r.value : m_ir->CreateAnd(r.value, imm.data_mask() >> I); - - if constexpr (r.esize != 32) - { - r.value = m_ir->CreateZExtOrTrunc(r.value, get_type()->getScalarType()); - } - - if (r.is_vector) - { - r.value = m_ir->CreateVectorSplat(r.is_vector, r.value); - } - - return r; - } - - return eval(splat(imm)); - } - - template - value_t get_imm(const bf_t& imm) - { - if ((m_op_const_mask & imm.data_mask()) != imm.data_mask()) - { - // Update const mask if necessary - if (I >= (32u - m_interp_magn)) - { - m_op_const_mask |= imm.data_mask(); - } - - // Extract signed immediate (skip sign ext if truncated anyway) - value_t r; - r.value = m_interp_op; - r.value = I + N == 32 || N >= r.esize ? r.value : m_ir->CreateShl(r.value, u64{32u - I - N}); - r.value = N == 32 || N >= r.esize ? r.value : m_ir->CreateAShr(r.value, u64{32u - N}); - r.value = I == 0 || N < r.esize ? r.value : m_ir->CreateLShr(r.value, u64{I}); - - if constexpr (r.esize != 32) - { - r.value = m_ir->CreateSExtOrTrunc(r.value, get_type()->getScalarType()); - } - - if (r.is_vector) - { - r.value = m_ir->CreateVectorSplat(r.is_vector, r.value); - } - - return r; - } - - return eval(splat(imm)); - } - - // Get PC for given instruction address - llvm::Value* get_pc(u32 addr) - { - return m_ir->CreateAdd(m_base_pc, m_ir->getInt32(addr - m_base)); - } - - // Update PC for current or explicitly specified instruction address - void update_pc(u32 target = -1) - { - m_ir->CreateStore(m_ir->CreateAnd(get_pc(target + 1 ? target : m_pos), 0x3fffc), spu_ptr(&spu_thread::pc))->setVolatile(true); - } - - // Call cpu_thread::check_state if necessary and return or continue (full check) - void check_state(u32 addr, bool may_be_unsafe_for_savestate = true) - { - const auto pstate = spu_ptr(&spu_thread::state); - const auto _body = llvm::BasicBlock::Create(m_context, "", m_function); - const auto check = llvm::BasicBlock::Create(m_context, "", m_function); - m_ir->CreateCondBr(m_ir->CreateICmpEQ(m_ir->CreateLoad(get_type(), pstate, true), m_ir->getInt32(0)), _body, check, m_md_likely); - m_ir->SetInsertPoint(check); - update_pc(addr); - - if (may_be_unsafe_for_savestate && std::none_of(std::begin(m_block->phi), std::end(m_block->phi), FN(!!x))) - { - may_be_unsafe_for_savestate = false; - } - - if (may_be_unsafe_for_savestate) - { - m_ir->CreateStore(m_ir->getInt8(1), spu_ptr(&spu_thread::unsavable))->setVolatile(true); - } - - m_ir->CreateCall(m_test_state, {m_thread}); - - if (may_be_unsafe_for_savestate) - { - m_ir->CreateStore(m_ir->getInt8(0), spu_ptr(&spu_thread::unsavable))->setVolatile(true); - } - - m_ir->CreateBr(_body); - m_ir->SetInsertPoint(_body); - } - -public: - spu_llvm_recompiler(u8 interp_magn = 0) - : spu_recompiler_base() - , cpu_translator(nullptr, false) - , m_interp_magn(interp_magn) - { - } - - virtual void init() override - { - // Initialize if necessary - if (!m_spurt) - { - m_spurt = &g_fxo->get(); - cpu_translator::initialize(m_jit.get_context(), m_jit.get_engine()); - - const auto md_name = llvm::MDString::get(m_context, "branch_weights"); - const auto md_low = llvm::ValueAsMetadata::get(llvm::ConstantInt::get(GetType(), 1)); - const auto md_high = llvm::ValueAsMetadata::get(llvm::ConstantInt::get(GetType(), 999)); - - // Metadata for branch weights - m_md_likely = llvm::MDTuple::get(m_context, {md_name, md_high, md_low}); - m_md_unlikely = llvm::MDTuple::get(m_context, {md_name, md_low, md_high}); - } - } - - virtual spu_function_t compile(spu_program&& _func) override - { - if (_func.data.empty() && m_interp_magn) - { - return compile_interpreter(); - } - - const u32 start0 = _func.entry_point; - - const auto add_loc = m_spurt->add_empty(std::move(_func)); - - if (!add_loc) - { - return nullptr; - } - - const spu_program& func = add_loc->data; - - if (func.entry_point != start0) - { - // Wait for the duplicate - while (!add_loc->compiled) - { - add_loc->compiled.wait(nullptr); - } - - return add_loc->compiled; - } - - std::string log; - - if (auto& cache = g_fxo->get(); cache && g_cfg.core.spu_cache && !add_loc->cached.exchange(1)) - { - cache.add(func); - } - - { - sha1_context ctx; - u8 output[20]; - - sha1_starts(&ctx); - sha1_update(&ctx, reinterpret_cast(func.data.data()), func.data.size() * 4); - sha1_finish(&ctx, output); - - m_hash.clear(); - fmt::append(m_hash, "__spu-0x%05x-%s", func.entry_point, fmt::base57(output)); - - be_t hash_start; - std::memcpy(&hash_start, output, sizeof(hash_start)); - m_hash_start = hash_start; - } - - spu_log.notice("Building function 0x%x... (size %u, %s)", func.entry_point, func.data.size(), m_hash); - - m_pos = func.lower_bound; - m_base = func.entry_point; - m_size = ::size32(func.data) * 4; - const u32 start = m_pos; - const u32 end = start + m_size; - - m_pp_id = 0; - - if (g_cfg.core.spu_debug && !add_loc->logged.exchange(1)) - { - this->dump(func, log); - fs::file(m_spurt->get_cache_path() + "spu.log", fs::write + fs::append).write(log); - } - - using namespace llvm; - - m_engine->clearAllGlobalMappings(); - - // Create LLVM module - std::unique_ptr _module = std::make_unique(m_hash + ".obj", m_context); - _module->setTargetTriple(jit_compiler::triple2()); - _module->setDataLayout(m_jit.get_engine().getTargetMachine()->createDataLayout()); - m_module = _module.get(); - - // Initialize IR Builder - IRBuilder<> irb(m_context); - m_ir = &irb; - - // Add entry function (contains only state/code check) - const auto main_func = llvm::cast(m_module->getOrInsertFunction(m_hash, get_ftype()).getCallee()); - const auto main_arg2 = main_func->getArg(2); - main_func->setCallingConv(CallingConv::GHC); - set_function(main_func); - - // Start compilation - const auto label_test = BasicBlock::Create(m_context, "", m_function); - const auto label_diff = BasicBlock::Create(m_context, "", m_function); - const auto label_body = BasicBlock::Create(m_context, "", m_function); - const auto label_stop = BasicBlock::Create(m_context, "", m_function); - - // Load PC, which will be the actual value of 'm_base' - m_base_pc = m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::pc)); - - // Emit state check - const auto pstate = spu_ptr(&spu_thread::state); - m_ir->CreateCondBr(m_ir->CreateICmpNE(m_ir->CreateLoad(get_type(), pstate), m_ir->getInt32(0)), label_stop, label_test, m_md_unlikely); - - // Emit code check - u32 check_iterations = 0; - m_ir->SetInsertPoint(label_test); - - // Set block hash for profiling (if enabled) - if (g_cfg.core.spu_prof && g_cfg.core.spu_verification) - m_ir->CreateStore(m_ir->getInt64((m_hash_start & -65536)), spu_ptr(&spu_thread::block_hash)); - - if (!g_cfg.core.spu_verification) - { - // Disable check (unsafe) - m_ir->CreateBr(label_body); - } - else if (func.data.size() == 1) - { - const auto pu32 = m_ir->CreateGEP(get_type(), m_lsptr, m_base_pc); - const auto cond = m_ir->CreateICmpNE(m_ir->CreateLoad(get_type(), pu32), m_ir->getInt32(func.data[0])); - m_ir->CreateCondBr(cond, label_diff, label_body, m_md_unlikely); - } - else if (func.data.size() == 2) - { - const auto pu64 = m_ir->CreateGEP(get_type(), m_lsptr, m_base_pc); - const auto cond = m_ir->CreateICmpNE(m_ir->CreateLoad(get_type(), pu64), m_ir->getInt64(static_cast(func.data[1]) << 32 | func.data[0])); - m_ir->CreateCondBr(cond, label_diff, label_body, m_md_unlikely); - } - else - { - u32 starta = start; - - // Skip holes at the beginning (giga only) - for (u32 j = start; j < end; j += 4) - { - if (!func.data[(j - start) / 4]) - { - starta += 4; - } - else - { - break; - } - } - - u32 stride; - u32 elements; - u32 dwords; - - if (m_use_avx512 && g_cfg.core.full_width_avx512) - { - stride = 64; - elements = 16; - dwords = 8; - } - else if (m_use_avx) - { - stride = 32; - elements = 8; - dwords = 4; - } - else - { - stride = 16; - elements = 4; - dwords = 2; - } - - // Get actual pc corresponding to the found beginning of the data - llvm::Value* starta_pc = m_ir->CreateAnd(get_pc(starta), 0x3fffc); - llvm::Value* data_addr = m_ir->CreateGEP(get_type(), m_lsptr, starta_pc); - - llvm::Value* acc = nullptr; - - for (u32 j = starta; j < end; j += stride) - { - int indices[16]; - bool holes = false; - bool data = false; - - for (u32 i = 0; i < elements; i++) - { - const u32 k = j + i * 4; - - if (k < start || k >= end || !func.data[(k - start) / 4]) - { - indices[i] = elements; - holes = true; - } - else - { - indices[i] = i; - data = true; - } - } - - if (!data) - { - // Skip full-sized holes - continue; - } - - llvm::Value* vls = nullptr; - - // Load unaligned code block from LS - if (m_use_avx512 && g_cfg.core.full_width_avx512) - { - vls = m_ir->CreateAlignedLoad(get_type(), _ptr(data_addr, j - starta), llvm::MaybeAlign{4}); - } - else if (m_use_avx) - { - vls = m_ir->CreateAlignedLoad(get_type(), _ptr(data_addr, j - starta), llvm::MaybeAlign{4}); - } - else - { - vls = m_ir->CreateAlignedLoad(get_type(), _ptr(data_addr, j - starta), llvm::MaybeAlign{4}); - } - - // Mask if necessary - if (holes) - { - vls = m_ir->CreateShuffleVector(vls, ConstantAggregateZero::get(vls->getType()), llvm::ArrayRef(indices, elements)); - } - - // Perform bitwise comparison and accumulate - u32 words[16]; - - for (u32 i = 0; i < elements; i++) - { - const u32 k = j + i * 4; - words[i] = k >= start && k < end ? func.data[(k - start) / 4] : 0; - } - - vls = m_ir->CreateXor(vls, ConstantDataVector::get(m_context, llvm::ArrayRef(words, elements))); - acc = acc ? m_ir->CreateOr(acc, vls) : vls; - check_iterations++; - } - - // Pattern for PTEST - if (m_use_avx512 && g_cfg.core.full_width_avx512) - { - acc = m_ir->CreateBitCast(acc, get_type()); - } - else if (m_use_avx) - { - acc = m_ir->CreateBitCast(acc, get_type()); - } - else - { - acc = m_ir->CreateBitCast(acc, get_type()); - } - - llvm::Value* elem = m_ir->CreateExtractElement(acc, u64{0}); - - for (u32 i = 1; i < dwords; i++) - { - elem = m_ir->CreateOr(elem, m_ir->CreateExtractElement(acc, i)); - } - - // Compare result with zero - const auto cond = m_ir->CreateICmpNE(elem, m_ir->getInt64(0)); - m_ir->CreateCondBr(cond, label_diff, label_body, m_md_unlikely); - } - - // Increase block counter with statistics - m_ir->SetInsertPoint(label_body); - const auto pbcount = spu_ptr(&spu_thread::block_counter); - m_ir->CreateStore(m_ir->CreateAdd(m_ir->CreateLoad(get_type(), pbcount), m_ir->getInt64(check_iterations)), pbcount); - - // Call the entry function chunk - const auto entry_chunk = add_function(m_pos); - const auto entry_call = m_ir->CreateCall(entry_chunk->chunk, {m_thread, m_lsptr, m_base_pc}); - entry_call->setCallingConv(entry_chunk->chunk->getCallingConv()); - - const auto dispatcher = llvm::cast(m_module->getOrInsertFunction("spu_dispatcher", main_func->getType()).getCallee()); - m_engine->updateGlobalMapping("spu_dispatcher", reinterpret_cast(spu_runtime::tr_all)); - dispatcher->setCallingConv(main_func->getCallingConv()); - - // Proceed to the next code - if (entry_chunk->chunk->getReturnType() != get_type()) - { - const auto next_call = m_ir->CreateCall(main_func->getFunctionType(), entry_call, {m_thread, m_lsptr, m_ir->getInt64(0)}); - next_call->setCallingConv(main_func->getCallingConv()); - next_call->setTailCall(); - } - else - { - entry_call->setTailCall(); - } - - m_ir->CreateRetVoid(); - - m_ir->SetInsertPoint(label_stop); - call("spu_escape", spu_runtime::g_escape, m_thread)->setTailCall(); - m_ir->CreateRetVoid(); - - m_ir->SetInsertPoint(label_diff); - - if (g_cfg.core.spu_verification) - { - const auto pbfail = spu_ptr(&spu_thread::block_failure); - m_ir->CreateStore(m_ir->CreateAdd(m_ir->CreateLoad(get_type(), pbfail), m_ir->getInt64(1)), pbfail); - const auto dispci = call("spu_dispatch", spu_runtime::tr_dispatch, m_thread, m_lsptr, main_arg2); - dispci->setCallingConv(CallingConv::GHC); - dispci->setTailCall(); - m_ir->CreateRetVoid(); - } - else - { - m_ir->CreateUnreachable(); - } - - m_dispatch = cast(_module->getOrInsertFunction("__spu-null", entry_chunk->chunk->getFunctionType()).getCallee()); - m_dispatch->setLinkage(llvm::GlobalValue::InternalLinkage); - m_dispatch->setCallingConv(entry_chunk->chunk->getCallingConv()); - set_function(m_dispatch); - - if (entry_chunk->chunk->getReturnType() == get_type()) - { - const auto next_call = m_ir->CreateCall(main_func->getFunctionType(), dispatcher, {m_thread, m_lsptr, m_ir->getInt64(0)}); - next_call->setCallingConv(main_func->getCallingConv()); - next_call->setTailCall(); - m_ir->CreateRetVoid(); - } - else - { - m_ir->CreateRet(dispatcher); - } - - // Function that executes check_state and escapes if necessary - m_test_state = llvm::cast(m_module->getOrInsertFunction("spu_test_state", get_ftype()).getCallee()); - m_test_state->setLinkage(GlobalValue::InternalLinkage); -#ifdef ARCH_ARM64 - // LLVM doesn't support PreserveAll on arm64. - m_test_state->setCallingConv(CallingConv::PreserveMost); -#else - m_test_state->setCallingConv(CallingConv::PreserveAll); -#endif - m_ir->SetInsertPoint(BasicBlock::Create(m_context, "", m_test_state)); - const auto escape_yes = BasicBlock::Create(m_context, "", m_test_state); - const auto escape_no = BasicBlock::Create(m_context, "", m_test_state); - m_ir->CreateCondBr(call("spu_exec_check_state", &exec_check_state, m_test_state->getArg(0)), escape_yes, escape_no); - m_ir->SetInsertPoint(escape_yes); - call("spu_escape", spu_runtime::g_escape, m_test_state->getArg(0)); - m_ir->CreateRetVoid(); - m_ir->SetInsertPoint(escape_no); - m_ir->CreateRetVoid(); - - // Create function table (uninitialized) - m_function_table = new llvm::GlobalVariable(*m_module, llvm::ArrayType::get(entry_chunk->chunk->getType(), m_size / 4), true, llvm::GlobalValue::InternalLinkage, nullptr); - - // Create function chunks - for (usz fi = 0; fi < m_function_queue.size(); fi++) - { - // Initialize function info - m_entry = m_function_queue[fi]; - set_function(m_functions[m_entry].chunk); - - // Set block hash for profiling (if enabled) - if (g_cfg.core.spu_prof) - m_ir->CreateStore(m_ir->getInt64((m_hash_start & -65536) | (m_entry >> 2)), spu_ptr(&spu_thread::block_hash)); - - m_finfo = &m_functions[m_entry]; - m_ir->CreateBr(add_block(m_entry)); - - // Emit instructions for basic blocks - for (usz bi = 0; bi < m_block_queue.size(); bi++) - { - // Initialize basic block info - const u32 baddr = m_block_queue[bi]; - m_block = &m_blocks[baddr]; - m_ir->SetInsertPoint(m_block->block); - auto& bb = ::at32(m_bbs, baddr); - bool need_check = false; - m_block->bb = &bb; - - if (!bb.preds.empty()) - { - // Initialize registers and build PHI nodes if necessary - for (u32 i = 0; i < s_reg_max; i++) - { - const u32 src = m_finfo->fn ? bb.reg_origin_abs[i] : bb.reg_origin[i]; - - if (src > 0x40000) - { - // Use the xfloat hint to create 256-bit (4x double) PHI - llvm::Type* type = g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::accurate && bb.reg_maybe_xf[i] ? get_type() : get_reg_type(i); - - const auto _phi = m_ir->CreatePHI(type, ::size32(bb.preds), fmt::format("phi0x%05x_r%u", baddr, i)); - m_block->phi[i] = _phi; - m_block->reg[i] = _phi; - - for (u32 pred : bb.preds) - { - const auto bfound = m_blocks.find(pred); - - if (bfound != m_blocks.end() && bfound->second.block_end) - { - auto& value = bfound->second.reg[i]; - - if (!value || value->getType() != _phi->getType()) - { - const auto regptr = init_reg_fixed(i); - const auto cblock = m_ir->GetInsertBlock(); - m_ir->SetInsertPoint(bfound->second.block_end->getTerminator()); - - if (!value) - { - // Value hasn't been loaded yet - value = m_finfo && m_finfo->load[i] ? m_finfo->load[i] : m_ir->CreateLoad(get_reg_type(i), regptr); - } - - if (value->getType() == get_type() && type != get_type()) - { - value = double_to_xfloat(value); - } - else if (value->getType() != get_type() && type == get_type()) - { - value = xfloat_to_double(bitcast(value)); - } - else - { - value = bitcast(value, _phi->getType()); - } - - m_ir->SetInsertPoint(cblock); - - ensure(bfound->second.block_end->getTerminator()); - } - - _phi->addIncoming(value, bfound->second.block_end); - } - } - - if (baddr == m_entry) - { - // Load value at the function chunk's entry block if necessary - const auto regptr = init_reg_fixed(i); - const auto cblock = m_ir->GetInsertBlock(); - m_ir->SetInsertPoint(m_function->getEntryBlock().getTerminator()); - const auto value = m_finfo && m_finfo->load[i] ? m_finfo->load[i] : m_ir->CreateLoad(get_reg_type(i), regptr); - m_ir->SetInsertPoint(cblock); - _phi->addIncoming(value, &m_function->getEntryBlock()); - } - } - else if (src < 0x40000) - { - // Passthrough register value - const auto bfound = m_blocks.find(src); - - if (bfound != m_blocks.end()) - { - m_block->reg[i] = bfound->second.reg[i]; - } - else - { - spu_log.error("[0x%05x] Value not found ($%u from 0x%05x)", baddr, i, src); - } - } - else - { - m_block->reg[i] = m_finfo->load[i]; - } - } - - // Emit state check if necessary (TODO: more conditions) - for (u32 pred : bb.preds) - { - if (pred >= baddr) - { - // If this block is a target of a backward branch (possibly loop), emit a check - need_check = true; - break; - } - } - } - - // State check at the beginning of the chunk - if (need_check || (bi == 0 && g_cfg.core.spu_block_size != spu_block_size_type::safe)) - { - check_state(baddr); - } - - // Emit instructions - for (m_pos = baddr; m_pos >= start && m_pos < end && !m_ir->GetInsertBlock()->getTerminator(); m_pos += 4) - { - if (m_pos != baddr && m_block_info[m_pos / 4]) - { - break; - } - - const u32 op = std::bit_cast>(func.data[(m_pos - start) / 4]); - - if (!op) - { - spu_log.error("[%s] Unexpected fallthrough to 0x%x (chunk=0x%x, entry=0x%x)", m_hash, m_pos, m_entry, m_function_queue[0]); - break; - } - - // Set variable for set_link() - if (m_pos + 4 >= end) - m_next_op = 0; - else - m_next_op = func.data[(m_pos - start) / 4 + 1]; - - // Execute recompiler function (TODO) - (this->*decode(op))({op}); - } - - // Finalize block with fallthrough if necessary - if (!m_ir->GetInsertBlock()->getTerminator()) - { - const u32 target = m_pos == baddr ? baddr : m_pos & 0x3fffc; - - if (m_pos != baddr) - { - m_pos -= 4; - - if (target >= start && target < end) - { - const auto tfound = m_targets.find(m_pos); - - if (tfound == m_targets.end() || tfound->second.find_first_of(target) + 1 == 0) - { - spu_log.error("[%s] Unregistered fallthrough to 0x%x (chunk=0x%x, entry=0x%x)", m_hash, target, m_entry, m_function_queue[0]); - } - } - } - - m_block->block_end = m_ir->GetInsertBlock(); - m_ir->CreateBr(add_block(target)); - } - - ensure(m_block->block_end); - } - - // Work on register stores. - // 1. Remove stores which are overwritten later. - // 2. Sink stores to post-dominating blocks. - llvm::PostDominatorTree pdt(*m_function); - llvm::DominatorTree dt(*m_function); - - // Post-order indices - std::unordered_map pois; - { - usz i = 0; - for (auto* bb : llvm::post_order(m_function)) - pois[bb] = i++; - } - - // Basic block to block_info - std::unordered_map bb_to_info; - - std::vector block_q; - block_q.reserve(m_blocks.size()); - for (auto& [a, b] : m_blocks) - { - block_q.emplace_back(&b); - bb_to_info[b.block] = &b; - } - - for (usz bi = 0; bi < block_q.size();) - { - auto bqbi = block_q[bi++]; - - // TODO: process all registers up to s_reg_max - for (u32 i = 0; i < 128; i++) - { - // Check if the store is beyond the last barrier - if (auto& bs = bqbi->store[i]; bs && !bqbi->does_gpr_barrier_proceed_last_store(i)) - { - for (auto& [a, b] : m_blocks) - { - // Check if the store occurs before any barrier in the block - if (b.store[i] && b.store[i] != bs && b.store_context_first_id[i] == 1) - { - if (pdt.dominates(b.store[i], bs)) - { - bs->eraseFromParent(); - bs = nullptr; - break; - } - } - } - - if (!bs) - continue; - - // Set of store instructions which overwrite bs - std::vector killers; - - for (auto& [a, b] : m_blocks) - { - const auto si = b.store[i]; - - if (si && si != bs) - { - if (pois[bs->getParent()] > pois[si->getParent()]) - { - killers.emplace_back(si->getParent()); - } - else - { - // Reset: store is not the first in the set - killers.clear(); - break; - } - } - } - - if (killers.empty()) - continue; - - // Find nearest common post-dominator - llvm::BasicBlock* common_pdom = killers[0]; - for (auto* bbb : llvm::drop_begin(killers)) - { - if (!common_pdom) - break; - common_pdom = pdt.findNearestCommonDominator(common_pdom, bbb); - } - - // Shortcut - if (!pdt.dominates(common_pdom, bs->getParent())) - common_pdom = nullptr; - - // Look for possibly-dead store in CFG starting from the exit nodes - llvm::SetVector work_list; - std::unordered_map worked_on; - - if (!common_pdom || std::count(killers.begin(), killers.end(), common_pdom) == 0) - { - if (common_pdom) - { - // Shortcut - work_list.insert(common_pdom); - worked_on[common_pdom] = true; - } - else - { - // Check all exits - for (auto* r : pdt.roots()) - { - worked_on[r] = true; - work_list.insert(r); - } - } - } - - // bool flag indicates the presence of a memory barrier before the killer store - std::vector> work2_list; - - for (usz wi = 0; wi < work_list.size(); wi++) - { - auto* cur = work_list[wi]; - if (std::count(killers.begin(), killers.end(), cur)) - { - work2_list.emplace_back(cur, bb_to_info[cur] && bb_to_info[cur]->does_gpr_barrier_preceed_first_store(i)); - continue; - } - - if (cur == bs->getParent()) - { - // Reset: store is not dead - killers.clear(); - break; - } - - for (auto* p : llvm::predecessors(cur)) - { - if (!worked_on[p]) - { - worked_on[p] = true; - work_list.insert(p); - } - } - } - - if (killers.empty()) - continue; - - worked_on.clear(); - - for (usz wi = 0; wi < work2_list.size(); wi++) - { - worked_on[work2_list[wi].first] = true; - } - - // Need to treat tails differently: do not require checking barrier (checked before in a suitable manner) - const usz work_list_tail_blocks_max_index = work2_list.size(); - - for (usz wi = 0; wi < work2_list.size(); wi++) - { - auto [cur, found_user] = work2_list[wi]; - - ensure(cur != bs->getParent()); - - if (!found_user && wi >= work_list_tail_blocks_max_index) - { - if (auto info = bb_to_info[cur]) - { - if (info->store_context_ctr[i] != 1) - { - found_user = true; - } - } - } - - for (auto* p : llvm::predecessors(cur)) - { - if (p == bs->getParent()) - { - if (found_user) - { - // Reset: store is being used and preserved by ensure_gpr_stores() - killers.clear(); - break; - } - - continue; - } - - if (!worked_on[p]) - { - worked_on[p] = true; - work2_list.push_back(std::make_pair(p, found_user)); - } - // Enqueue a second iteration for found_user=true if only found with found_user=false - else if (found_user && !std::find_if(work2_list.rbegin(), work2_list.rend(), [&](auto& it){ return it.first == p; })->second) - { - work2_list.push_back(std::make_pair(p, true)); - } - } - - if (killers.empty()) - { - break; - } - } - - // Finally erase the dead store - if (!killers.empty()) - { - bs->eraseFromParent(); - bs = nullptr; - - // Run the loop from the start - bi = 0; - } - } - } - } - - block_q.clear(); - for (auto& [a, b] : m_blocks) - { - block_q.emplace_back(&b); - } - - for (usz bi = 0; bi < block_q.size(); bi++) - { - for (u32 i = 0; i < 128; i++) - { - // If store isn't erased, try to sink it - if (auto& bs = block_q[bi]->store[i]; bs && block_q[bi]->bb->targets.size() > 1 && !block_q[bi]->does_gpr_barrier_proceed_last_store(i)) - { - std::map> sucs; - - for (u32 tj : block_q[bi]->bb->targets) - { - auto b2it = m_blocks.find(tj); - - if (b2it != m_blocks.end()) - { - sucs.emplace(tj, &b2it->second); - } - } - - for (auto [a2, b2] : sucs) - { - if (b2 != block_q[bi]) - { - auto ins = b2->block->getFirstNonPHI(); - - if (b2->bb->preds.size() == 1) - { - if (!dt.dominates(bs->getOperand(0), ins)) - continue; - if (!pdt.dominates(ins, bs)) - continue; - - m_ir->SetInsertPoint(ins); - auto si = llvm::cast(m_ir->Insert(bs->clone())); - if (b2->store[i] == nullptr) - { - b2->store[i] = si; - b2->store_context_last_id[i] = 0; - - if (!std::count(block_q.begin() + bi, block_q.end(), b2)) - { - // Sunk store can be checked again - block_q.push_back(b2); - } - } - } - else - { - // Initialize additional block between two basic blocks - auto& edge = block_q[bi]->block_edges[a2]; - if (!edge) - { - const auto succ_range = llvm::successors(block_q[bi]->block_end); - - auto succ = b2->block; - - llvm::SmallSetVector succ_q; - succ_q.insert(b2->block); - - for (usz j = 0; j < 32 && j < succ_q.size(); j++) - { - if (!llvm::count(succ_range, (succ = succ_q[j]))) - { - for (auto pred : llvm::predecessors(succ)) - { - succ_q.insert(pred); - } - } - else - { - break; - } - } - - if (!llvm::count(succ_range, succ)) - { - // TODO: figure this out - spu_log.notice("[%s] Failed successor to 0x%05x", fmt::base57(be_t{m_hash_start}), a2); - continue; - } - - edge = llvm::SplitEdge(block_q[bi]->block_end, succ); - pdt.recalculate(*m_function); - dt.recalculate(*m_function); - } - - ins = edge->getTerminator(); - if (!dt.dominates(bs->getOperand(0), ins)) - continue; - if (!pdt.dominates(ins, bs)) - continue; - - m_ir->SetInsertPoint(ins); - m_ir->Insert(bs->clone()); - } - - bs->eraseFromParent(); - bs = nullptr; - - pdt.recalculate(*m_function); - dt.recalculate(*m_function); - break; - } - } - } - } - } - } - - // Create function table if necessary - if (m_function_table->getNumUses()) - { - std::vector chunks; - chunks.reserve(m_size / 4); - - for (u32 i = start; i < end; i += 4) - { - const auto found = m_functions.find(i); - - if (found == m_functions.end()) - { - if (false && g_cfg.core.spu_verification) - { - const std::string ppname = fmt::format("%s-chunkpp-0x%05x", m_hash, i); - m_engine->updateGlobalMapping(ppname, reinterpret_cast(m_spurt->make_branch_patchpoint(i / 4))); - - const auto ppfunc = llvm::cast(m_module->getOrInsertFunction(ppname, m_finfo->chunk->getFunctionType()).getCallee()); - ppfunc->setCallingConv(m_finfo->chunk->getCallingConv()); - - chunks.push_back(ppfunc); - continue; - } - - chunks.push_back(m_dispatch); - continue; - } - - chunks.push_back(found->second.chunk); - } - - m_function_table->setInitializer(llvm::ConstantArray::get(llvm::ArrayType::get(entry_chunk->chunk->getType(), m_size / 4), chunks)); - } - else - { - m_function_table->eraseFromParent(); - } - - // Initialize pass manager - legacy::FunctionPassManager pm(_module.get()); - - // Basic optimizations - pm.add(createEarlyCSEPass()); - pm.add(createCFGSimplificationPass()); - //pm.add(createNewGVNPass()); -#if LLVM_VERSION_MAJOR < 17 - pm.add(createDeadStoreEliminationPass()); -#endif - pm.add(createLICMPass()); -#if LLVM_VERSION_MAJOR < 17 - pm.add(createAggressiveDCEPass()); -#else - pm.add(createDeadCodeEliminationPass()); -#endif - //pm.add(createLintPass()); // Check - - for (auto& f : *m_module) - { - replace_intrinsics(f); - } - - for (const auto& func : m_functions) - { - const auto f = func.second.fn ? func.second.fn : func.second.chunk; - pm.run(*f); - } - - // Clear context (TODO) - m_blocks.clear(); - m_block_queue.clear(); - m_functions.clear(); - m_function_queue.clear(); - m_function_table = nullptr; - - raw_string_ostream out(log); - - if (g_cfg.core.spu_debug) - { - fmt::append(log, "LLVM IR at 0x%x:\n", func.entry_point); - out << *_module; // print IR - out << "\n\n"; - } - - if (verifyModule(*_module, &out)) - { - out.flush(); - spu_log.error("LLVM: Verification failed at 0x%x:\n%s", func.entry_point, log); - - if (g_cfg.core.spu_debug) - { - fs::file(m_spurt->get_cache_path() + "spu-ir.log", fs::write + fs::append).write(log); - } - - fmt::throw_exception("Compilation failed"); - } - -#if defined(__APPLE__) - pthread_jit_write_protect_np(false); -#endif - - if (g_cfg.core.spu_debug) - { - // Testing only - m_jit.add(std::move(_module), m_spurt->get_cache_path() + "llvm/"); - } - else - { - m_jit.add(std::move(_module)); - } - - m_jit.fin(); - - // Register function pointer - const spu_function_t fn = reinterpret_cast(m_jit.get_engine().getPointerToFunction(main_func)); - - // Install unconditionally, possibly replacing existing one from spu_fast - add_loc->compiled = fn; - - // Rebuild trampoline if necessary - if (!m_spurt->rebuild_ubertrampoline(func.data[0])) - { - return nullptr; - } - - add_loc->compiled.notify_all(); - - if (g_cfg.core.spu_debug) - { - out.flush(); - fs::write_file(m_spurt->get_cache_path() + "spu-ir.log", fs::create + fs::write + fs::append, log); - } - -#if defined(__APPLE__) - pthread_jit_write_protect_np(true); -#endif -#if defined(ARCH_ARM64) - // Flush all cache lines after potentially writing executable code - asm("ISB"); - asm("DSB ISH"); -#endif - - if (g_fxo->get().operator bool()) - { - spu_log.success("New block compiled successfully"); - } - - return fn; - } - - static void interp_check(spu_thread* _spu, bool after) - { - static thread_local std::array s_gpr; - - if (!after) - { - // Preserve reg state - s_gpr = _spu->gpr; - - // Execute interpreter instruction - const u32 op = *reinterpret_cast*>(_spu->_ptr(0) + _spu->pc); - if (!g_fxo->get().decode(op)(*_spu, {op})) - spu_log.fatal("Bad instruction"); - - // Swap state - for (u32 i = 0; i < s_gpr.size(); ++i) - std::swap(_spu->gpr[i], s_gpr[i]); - } - else - { - // Check saved state - for (u32 i = 0; i < s_gpr.size(); ++i) - { - if (_spu->gpr[i] != s_gpr[i]) - { - spu_log.fatal("Register mismatch: $%u\n%s\n%s", i, _spu->gpr[i], s_gpr[i]); - _spu->state += cpu_flag::dbg_pause; - } - } - } - } - - spu_function_t compile_interpreter() - { - using namespace llvm; - - m_engine->clearAllGlobalMappings(); - - // Create LLVM module - std::unique_ptr _module = std::make_unique("spu_interpreter.obj", m_context); - _module->setTargetTriple(jit_compiler::triple2()); - _module->setDataLayout(m_jit.get_engine().getTargetMachine()->createDataLayout()); - m_module = _module.get(); - - // Initialize IR Builder - IRBuilder<> irb(m_context); - m_ir = &irb; - - // Create interpreter table - const auto if_type = get_ftype(); - m_function_table = new GlobalVariable(*m_module, ArrayType::get(if_type->getPointerTo(), 1ull << m_interp_magn), true, GlobalValue::InternalLinkage, nullptr); - - // Add return function - const auto ret_func = cast(_module->getOrInsertFunction("spu_ret", if_type).getCallee()); - ret_func->setCallingConv(CallingConv::GHC); - ret_func->setLinkage(GlobalValue::InternalLinkage); - m_ir->SetInsertPoint(BasicBlock::Create(m_context, "", ret_func)); - m_thread = ret_func->getArg(1); - m_interp_pc = ret_func->getArg(2); - m_ir->CreateRetVoid(); - - // Add entry function, serves as a trampoline - const auto main_func = llvm::cast(m_module->getOrInsertFunction("spu_interpreter", get_ftype()).getCallee()); -#ifdef _WIN32 - main_func->setCallingConv(CallingConv::Win64); -#endif - set_function(main_func); - - // Load pc and opcode - m_interp_pc = m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::pc)); - m_interp_op = m_ir->CreateLoad(get_type(), m_ir->CreateGEP(get_type(), m_lsptr, m_ir->CreateZExt(m_interp_pc, get_type()))); - m_interp_op = m_ir->CreateCall(get_intrinsic(Intrinsic::bswap), {m_interp_op}); - - // Pinned constant, address of interpreter table - m_interp_table = m_ir->CreateGEP(m_function_table->getValueType(), m_function_table, {m_ir->getInt64(0), m_ir->getInt64(0)}); - - // Pinned constant, mask for shifted register index - m_interp_7f0 = m_ir->getInt32(0x7f0); - - // Pinned constant, address of first register - m_interp_regs = _ptr(m_thread, get_reg_offset(0)); - - // Save host thread's stack pointer - const auto native_sp = spu_ptr(&spu_thread::saved_native_sp); -#if defined(ARCH_X64) - const auto rsp_name = MetadataAsValue::get(m_context, MDNode::get(m_context, {MDString::get(m_context, "rsp")})); -#elif defined(ARCH_ARM64) - const auto rsp_name = MetadataAsValue::get(m_context, MDNode::get(m_context, {MDString::get(m_context, "sp")})); -#endif - m_ir->CreateStore(m_ir->CreateCall(get_intrinsic(Intrinsic::read_register), {rsp_name}), native_sp); - - // Decode (shift) and load function pointer - const auto first = m_ir->CreateLoad(if_type->getPointerTo(), m_ir->CreateGEP(if_type->getPointerTo(), m_interp_table, m_ir->CreateLShr(m_interp_op, 32u - m_interp_magn))); - const auto call0 = m_ir->CreateCall(if_type, first, {m_lsptr, m_thread, m_interp_pc, m_interp_op, m_interp_table, m_interp_7f0, m_interp_regs}); - call0->setCallingConv(CallingConv::GHC); - m_ir->CreateRetVoid(); - - // Create helper globals - { - std::vector float_to; - std::vector to_float; - float_to.reserve(256); - to_float.reserve(256); - - for (int i = 0; i < 256; ++i) - { - float_to.push_back(ConstantFP::get(get_type(), std::exp2(173 - i))); - to_float.push_back(ConstantFP::get(get_type(), std::exp2(i - 155))); - } - - const auto atype = ArrayType::get(get_type(), 256); - m_scale_float_to = new GlobalVariable(*m_module, atype, true, GlobalValue::InternalLinkage, ConstantArray::get(atype, float_to)); - m_scale_to_float = new GlobalVariable(*m_module, atype, true, GlobalValue::InternalLinkage, ConstantArray::get(atype, to_float)); - } - - // Fill interpreter table - std::array ifuncs{}; - std::vector iptrs; - iptrs.reserve(1ull << m_interp_magn); - - m_block = nullptr; - - auto last_itype = spu_itype::type{255}; - - for (u32 i = 0; i < 1u << m_interp_magn;) - { - // Fake opcode - const u32 op = i << (32u - m_interp_magn); - - // Instruction type - const auto itype = g_spu_itype.decode(op); - - // Function name - std::string fname = fmt::format("spu_%s", g_spu_iname.decode(op)); - - if (last_itype != itype) - { - // Trigger automatic information collection (probing) - m_op_const_mask = 0; - } - else - { - // Inject const mask into function name - fmt::append(fname, "_%X", (i & (m_op_const_mask >> (32u - m_interp_magn))) | (1u << m_interp_magn)); - } - - // Decode instruction name, access function - const auto f = cast(_module->getOrInsertFunction(fname, if_type).getCallee()); - - // Build if necessary - if (f->empty()) - { - if (last_itype != itype) - { - ifuncs[static_cast(itype)] = f; - } - - f->setCallingConv(CallingConv::GHC); - - m_function = f; - m_lsptr = f->getArg(0); - m_thread = f->getArg(1); - m_interp_pc = f->getArg(2); - m_interp_op = f->getArg(3); - m_interp_table = f->getArg(4); - m_interp_7f0 = f->getArg(5); - m_interp_regs = f->getArg(6); - - m_ir->SetInsertPoint(BasicBlock::Create(m_context, "", f)); - m_memptr = m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::memory_base_addr)); - - switch (itype) - { - case spu_itype::UNK: - case spu_itype::DFCEQ: - case spu_itype::DFCMEQ: - case spu_itype::DFCGT: - case spu_itype::DFCMGT: - case spu_itype::DFTSV: - case spu_itype::STOP: - case spu_itype::STOPD: - case spu_itype::RDCH: - case spu_itype::WRCH: - { - // Invalid or abortable instruction. Save current address. - m_ir->CreateStore(m_interp_pc, spu_ptr(&spu_thread::pc)); - [[fallthrough]]; - } - default: - { - break; - } - } - - { - m_interp_bblock = nullptr; - - // Next instruction (no wraparound at the end of LS) - m_interp_pc_next = m_ir->CreateAdd(m_interp_pc, m_ir->getInt32(4)); - - bool check = false; - - if (itype == spu_itype::WRCH || - itype == spu_itype::RDCH || - itype == spu_itype::RCHCNT || - itype == spu_itype::STOP || - itype == spu_itype::STOPD || - itype & spu_itype::floating || - itype & spu_itype::branch) - { - check = false; - } - - if (itype & spu_itype::branch) - { - // Instruction changes pc - change order. - (this->*decode(op))({op}); - - if (m_interp_bblock) - { - m_ir->SetInsertPoint(m_interp_bblock); - m_interp_bblock = nullptr; - } - } - - if (!m_ir->GetInsertBlock()->getTerminator()) - { - if (check) - { - m_ir->CreateStore(m_interp_pc, spu_ptr(&spu_thread::pc)); - } - - // Decode next instruction. - const auto next_pc = itype & spu_itype::branch ? m_interp_pc : m_interp_pc_next; - const auto be32_op = m_ir->CreateLoad(get_type(), m_ir->CreateGEP(get_type(), m_lsptr, m_ir->CreateZExt(next_pc, get_type()))); - const auto next_op = m_ir->CreateCall(get_intrinsic(Intrinsic::bswap), {be32_op}); - const auto next_if = m_ir->CreateLoad(if_type->getPointerTo(), m_ir->CreateGEP(if_type->getPointerTo(), m_interp_table, m_ir->CreateLShr(next_op, 32u - m_interp_magn))); - llvm::cast(next_if)->setVolatile(true); - - if (!(itype & spu_itype::branch)) - { - if (check) - { - call("spu_interp_check", &interp_check, m_thread, m_ir->getFalse()); - } - - // Normal instruction. - (this->*decode(op))({op}); - - if (check && !m_ir->GetInsertBlock()->getTerminator()) - { - call("spu_interp_check", &interp_check, m_thread, m_ir->getTrue()); - } - - m_interp_pc = m_interp_pc_next; - } - - if (last_itype != itype) - { - // Reset to discard dead code - llvm::cast(next_if)->setVolatile(false); - - if (itype & spu_itype::branch) - { - const auto _stop = BasicBlock::Create(m_context, "", f); - const auto _next = BasicBlock::Create(m_context, "", f); - m_ir->CreateCondBr(m_ir->CreateIsNotNull(m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::state))), _stop, _next, m_md_unlikely); - m_ir->SetInsertPoint(_stop); - m_ir->CreateStore(m_interp_pc, spu_ptr(&spu_thread::pc)); - - const auto escape_yes = BasicBlock::Create(m_context, "", f); - const auto escape_no = BasicBlock::Create(m_context, "", f); - m_ir->CreateCondBr(call("spu_exec_check_state", &exec_check_state, m_thread), escape_yes, escape_no); - m_ir->SetInsertPoint(escape_yes); - call("spu_escape", spu_runtime::g_escape, m_thread); - m_ir->CreateBr(_next); - m_ir->SetInsertPoint(escape_no); - m_ir->CreateBr(_next); - m_ir->SetInsertPoint(_next); - } - - llvm::Value* fret = m_interp_table; - - if (itype == spu_itype::WRCH || - itype == spu_itype::RDCH || - itype == spu_itype::RCHCNT || - itype == spu_itype::STOP || - itype == spu_itype::STOPD || - itype == spu_itype::UNK || - itype == spu_itype::DFCMEQ || - itype == spu_itype::DFCMGT || - itype == spu_itype::DFCGT || - itype == spu_itype::DFCEQ || - itype == spu_itype::DFTSV) - { - m_interp_7f0 = m_ir->getInt32(0x7f0); - m_interp_regs = _ptr(m_thread, get_reg_offset(0)); - fret = ret_func; - } - else if (!(itype & spu_itype::branch)) - { - // Hack: inline ret instruction before final jmp; this is not reliable. -#ifdef ARCH_X64 - m_ir->CreateCall(InlineAsm::get(get_ftype(), "ret", "", true, false, InlineAsm::AD_Intel)); -#else - m_ir->CreateCall(InlineAsm::get(get_ftype(), "ret", "", true, false)); -#endif - fret = ret_func; - } - - const auto arg3 = UndefValue::get(get_type()); - const auto _ret = m_ir->CreateCall(if_type, fret, {m_lsptr, m_thread, m_interp_pc, arg3, m_interp_table, m_interp_7f0, m_interp_regs}); - _ret->setCallingConv(CallingConv::GHC); - _ret->setTailCall(); - m_ir->CreateRetVoid(); - } - - if (!m_ir->GetInsertBlock()->getTerminator()) - { - // Call next instruction. - const auto _stop = BasicBlock::Create(m_context, "", f); - const auto _next = BasicBlock::Create(m_context, "", f); - m_ir->CreateCondBr(m_ir->CreateIsNotNull(m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::state))), _stop, _next, m_md_unlikely); - m_ir->SetInsertPoint(_next); - - if (itype == spu_itype::WRCH || - itype == spu_itype::RDCH || - itype == spu_itype::RCHCNT || - itype == spu_itype::STOP || - itype == spu_itype::STOPD) - { - m_interp_7f0 = m_ir->getInt32(0x7f0); - m_interp_regs = _ptr(m_thread, get_reg_offset(0)); - } - - const auto ncall = m_ir->CreateCall(if_type, next_if, {m_lsptr, m_thread, m_interp_pc, next_op, m_interp_table, m_interp_7f0, m_interp_regs}); - ncall->setCallingConv(CallingConv::GHC); - ncall->setTailCall(); - m_ir->CreateRetVoid(); - m_ir->SetInsertPoint(_stop); - m_ir->CreateStore(m_interp_pc, spu_ptr(&spu_thread::pc)); - call("spu_escape", spu_runtime::g_escape, m_thread)->setTailCall(); - m_ir->CreateRetVoid(); - } - } - } - } - - if (last_itype != itype && g_cfg.core.spu_decoder != spu_decoder_type::llvm) - { - // Repeat after probing - last_itype = itype; - } - else - { - // Add to the table - iptrs.push_back(f); - i++; - } - } - - m_function_table->setInitializer(ConstantArray::get(ArrayType::get(if_type->getPointerTo(), 1ull << m_interp_magn), iptrs)); - m_function_table = nullptr; - - // Initialize pass manager - legacy::FunctionPassManager pm(_module.get()); - - // Basic optimizations - pm.add(createEarlyCSEPass()); - pm.add(createCFGSimplificationPass()); -#if LLVM_VERSION_MAJOR < 17 - pm.add(createDeadStoreEliminationPass()); - pm.add(createAggressiveDCEPass()); -#else - pm.add(createDeadCodeEliminationPass()); -#endif - //pm.add(createLintPass()); - - for (auto& f : *_module) - { - replace_intrinsics(f); - //pm.run(f); - } - - std::string log; - - raw_string_ostream out(log); - - if (g_cfg.core.spu_debug) - { - fmt::append(log, "LLVM IR (interpreter):\n"); - out << *_module; // print IR - out << "\n\n"; - } - - if (verifyModule(*_module, &out)) - { - out.flush(); - spu_log.error("LLVM: Verification failed:\n%s", log); - - if (g_cfg.core.spu_debug) - { - fs::write_file(m_spurt->get_cache_path() + "spu-ir.log", fs::create + fs::write + fs::append, log); - } - - fmt::throw_exception("Compilation failed"); - } - - if (g_cfg.core.spu_debug) - { - // Testing only - m_jit.add(std::move(_module), m_spurt->get_cache_path() + "llvm/"); - } - else - { - m_jit.add(std::move(_module)); - } - - m_jit.fin(); - - // Register interpreter entry point - spu_runtime::g_interpreter = reinterpret_cast(m_jit.get_engine().getPointerToFunction(main_func)); - - for (u32 i = 0; i < spu_runtime::g_interpreter_table.size(); i++) - { - // Fill exported interpreter table - spu_runtime::g_interpreter_table[i] = ifuncs[i] ? reinterpret_cast(m_jit.get_engine().getPointerToFunction(ifuncs[i])) : 0; - } - - if (!spu_runtime::g_interpreter) - { - return nullptr; - } - - if (g_cfg.core.spu_debug) - { - out.flush(); - fs::write_file(m_spurt->get_cache_path() + "spu-ir.log", fs::create + fs::write + fs::append, log); - } - - return spu_runtime::g_interpreter; - } - - static bool exec_check_state(spu_thread* _spu) - { - return _spu->check_state(); - } - - template - static void exec_fall(spu_thread* _spu, spu_opcode_t op) - { - if (F(*_spu, op)) - { - _spu->pc += 4; - } - } - - template - void fall(spu_opcode_t op) - { - std::string name = fmt::format("spu_%s", g_spu_iname.decode(op.opcode)); - - if (m_interp_magn) - { - call(name, F, m_thread, m_interp_op); - return; - } - - update_pc(); - call(name, &exec_fall, m_thread, m_ir->getInt32(op.opcode)); - } - - [[noreturn]] static void exec_unk(spu_thread*, u32 op) - { - fmt::throw_exception("Unknown/Illegal instruction (0x%08x)", op); - } - - void UNK(spu_opcode_t op_unk) - { - if (m_interp_magn) - { - m_ir->CreateStore(m_interp_pc, spu_ptr(&spu_thread::pc)); - call("spu_unknown", &exec_unk, m_thread, m_ir->getInt32(op_unk.opcode)); - return; - } - - m_block->block_end = m_ir->GetInsertBlock(); - update_pc(); - call("spu_unknown", &exec_unk, m_thread, m_ir->getInt32(op_unk.opcode)); - } - - static void exec_stop(spu_thread* _spu, u32 code) - { - if (!_spu->stop_and_signal(code) || _spu->state & cpu_flag::again) - { - spu_runtime::g_escape(_spu); - } - - if (_spu->test_stopped()) - { - _spu->pc += 4; - spu_runtime::g_escape(_spu); - } - } - - void STOP(spu_opcode_t op) // - { - if (m_interp_magn) - { - call("spu_syscall", &exec_stop, m_thread, m_ir->CreateAnd(m_interp_op, m_ir->getInt32(0x3fff))); - return; - } - - update_pc(); - call("spu_syscall", &exec_stop, m_thread, m_ir->getInt32(op.opcode & 0x3fff)); - - if (g_cfg.core.spu_block_size == spu_block_size_type::safe) - { - m_block->block_end = m_ir->GetInsertBlock(); - update_pc(m_pos + 4); - ensure_gpr_stores(); - tail_chunk(m_dispatch); - return; - } - } - - void STOPD(spu_opcode_t) // - { - if (m_interp_magn) - { - call("spu_syscall", &exec_stop, m_thread, m_ir->getInt32(0x3fff)); - return; - } - - STOP(spu_opcode_t{0x3fff}); - } - - static u32 exec_rdch(spu_thread* _spu, u32 ch) - { - const s64 result = _spu->get_ch_value(ch); - - if (result < 0 || _spu->state & cpu_flag::again) - { - spu_runtime::g_escape(_spu); - } - - static_cast(_spu->test_stopped()); - return static_cast(result & 0xffffffff); - } - - static u32 exec_read_in_mbox(spu_thread* _spu) - { - // TODO - return exec_rdch(_spu, SPU_RdInMbox); - } - - static u32 exec_read_dec(spu_thread* _spu) - { - const u32 res = _spu->read_dec().first; - - if (res > 1500 && g_cfg.core.spu_loop_detection) - { - _spu->state += cpu_flag::wait; - std::this_thread::yield(); - static_cast(_spu->test_stopped()); - } - - return res; - } - - static u32 exec_read_events(spu_thread* _spu) - { - // TODO - return exec_rdch(_spu, SPU_RdEventStat); - } - - void ensure_gpr_stores() - { - if (m_block) - { - // Make previous stores not able to be reordered beyond this point or be deleted - std::for_each(m_block->store_context_ctr.begin(), m_block->store_context_ctr.end(), FN(x++)); - } - } - - llvm::Value* get_rdch(spu_opcode_t op, u32 off, bool atomic) - { - const auto ptr = _ptr(m_thread, off); - llvm::Value* val0; - - if (atomic) - { - const auto val = m_ir->CreateAtomicRMW(llvm::AtomicRMWInst::Xchg, ptr, m_ir->getInt64(0), llvm::MaybeAlign{8}, llvm::AtomicOrdering::Acquire); - val0 = val; - } - else - { - const auto val = m_ir->CreateLoad(get_type(), ptr); - val->setAtomic(llvm::AtomicOrdering::Acquire); - m_ir->CreateStore(m_ir->getInt64(0), ptr)->setAtomic(llvm::AtomicOrdering::Release); - val0 = val; - } - - const auto _cur = m_ir->GetInsertBlock(); - const auto done = llvm::BasicBlock::Create(m_context, "", m_function); - const auto wait = llvm::BasicBlock::Create(m_context, "", m_function); - const auto cond = m_ir->CreateICmpSLT(val0, m_ir->getInt64(0)); - val0 = m_ir->CreateTrunc(val0, get_type()); - m_ir->CreateCondBr(cond, done, wait); - m_ir->SetInsertPoint(wait); - update_pc(); - const auto val1 = call("spu_read_channel", &exec_rdch, m_thread, m_ir->getInt32(op.ra)); - m_ir->CreateBr(done); - m_ir->SetInsertPoint(done); - const auto rval = m_ir->CreatePHI(get_type(), 2); - rval->addIncoming(val0, _cur); - rval->addIncoming(val1, wait); - return rval; - } - - void RDCH(spu_opcode_t op) // - { - value_t res; - - if (m_interp_magn) - { - res.value = call("spu_read_channel", &exec_rdch, m_thread, get_imm(op.ra).value); - set_vr(op.rt, insert(splat(0), 3, res)); - return; - } - - switch (op.ra) - { - case SPU_RdSRR0: - { - res.value = m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::srr0)); - break; - } - case SPU_RdInMbox: - { - update_pc(); - ensure_gpr_stores(); - res.value = call("spu_read_in_mbox", &exec_read_in_mbox, m_thread); - break; - } - case MFC_RdTagStat: - { - res.value = get_rdch(op, ::offset32(&spu_thread::ch_tag_stat), false); - break; - } - case MFC_RdTagMask: - { - res.value = m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::ch_tag_mask)); - break; - } - case SPU_RdSigNotify1: - { - update_pc(); - ensure_gpr_stores(); - res.value = get_rdch(op, ::offset32(&spu_thread::ch_snr1), true); - break; - } - case SPU_RdSigNotify2: - { - update_pc(); - ensure_gpr_stores(); - res.value = get_rdch(op, ::offset32(&spu_thread::ch_snr2), true); - break; - } - case MFC_RdAtomicStat: - { - res.value = get_rdch(op, ::offset32(&spu_thread::ch_atomic_stat), false); - break; - } - case MFC_RdListStallStat: - { - res.value = get_rdch(op, ::offset32(&spu_thread::ch_stall_stat), false); - break; - } - case SPU_RdDec: - { - if (utils::get_tsc_freq() && !(g_cfg.core.spu_loop_detection) && (g_cfg.core.clocks_scale == 100)) - { - const auto timestamp = m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::ch_dec_start_timestamp)); - const auto dec_value = m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::ch_dec_value)); - const auto tsc = m_ir->CreateCall(get_intrinsic(llvm::Intrinsic::x86_rdtsc)); - const auto tscx = m_ir->CreateMul(m_ir->CreateUDiv(tsc, m_ir->getInt64(utils::get_tsc_freq())), m_ir->getInt64(80000000)); - const auto tscm = m_ir->CreateUDiv(m_ir->CreateMul(m_ir->CreateURem(tsc, m_ir->getInt64(utils::get_tsc_freq())), m_ir->getInt64(80000000)), m_ir->getInt64(utils::get_tsc_freq())); - const auto tsctb = m_ir->CreateAdd(tscx, tscm); - - const auto frz = m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::is_dec_frozen)); - const auto frzev = m_ir->CreateICmpEQ(frz, m_ir->getInt8(0)); - - const auto delta = m_ir->CreateTrunc(m_ir->CreateSub(tsctb, timestamp), get_type()); - const auto deltax = m_ir->CreateSelect(frzev, delta, m_ir->getInt32(0)); - res.value = m_ir->CreateSub(dec_value, deltax); - break; - } - - res.value = call("spu_read_decrementer", &exec_read_dec, m_thread); - break; - } - case SPU_RdEventMask: - { - const auto value = m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::ch_events)); - value->setAtomic(llvm::AtomicOrdering::Acquire); - res.value = m_ir->CreateTrunc(m_ir->CreateLShr(value, 32), get_type()); - break; - } - case SPU_RdEventStat: - { - update_pc(); - - if (g_cfg.savestate.compatible_mode) - { - ensure_gpr_stores(); - } - else - { - m_ir->CreateStore(m_ir->getInt8(1), spu_ptr(&spu_thread::unsavable)); - } - - res.value = call("spu_read_events", &exec_read_events, m_thread); - - if (!g_cfg.savestate.compatible_mode) - { - m_ir->CreateStore(m_ir->getInt8(0), spu_ptr(&spu_thread::unsavable)); - } - - break; - } - case SPU_RdMachStat: - { - res.value = m_ir->CreateZExt(m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::interrupts_enabled)), get_type()); - res.value = m_ir->CreateOr(res.value, m_ir->CreateAnd(m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::thread_type)), m_ir->getInt32(2))); - break; - } - - default: - { - update_pc(); - ensure_gpr_stores(); - res.value = call("spu_read_channel", &exec_rdch, m_thread, m_ir->getInt32(op.ra)); - break; - } - } - - set_vr(op.rt, insert(splat(0), 3, res)); - } - - static u32 exec_rchcnt(spu_thread* _spu, u32 ch) - { - return _spu->get_ch_count(ch); - } - - static u32 exec_get_events(spu_thread* _spu, u32 mask) - { - return _spu->get_events(mask).count; - } - - llvm::Value* get_rchcnt(u32 off, u64 inv = 0) - { - const auto val = m_ir->CreateLoad(get_type(), _ptr(m_thread, off)); - val->setAtomic(llvm::AtomicOrdering::Acquire); - const auto shv = m_ir->CreateLShr(val, spu_channel::off_count); - return m_ir->CreateTrunc(m_ir->CreateXor(shv, u64{inv}), get_type()); - } - - void RCHCNT(spu_opcode_t op) // - { - value_t res; - - if (m_interp_magn) - { - res.value = call("spu_read_channel_count", &exec_rchcnt, m_thread, get_imm(op.ra).value); - set_vr(op.rt, insert(splat(0), 3, res)); - return; - } - - switch (op.ra) - { - case SPU_WrOutMbox: - { - res.value = get_rchcnt(::offset32(&spu_thread::ch_out_mbox), true); - break; - } - case SPU_WrOutIntrMbox: - { - res.value = get_rchcnt(::offset32(&spu_thread::ch_out_intr_mbox), true); - break; - } - case MFC_RdTagStat: - { - res.value = get_rchcnt(::offset32(&spu_thread::ch_tag_stat)); - break; - } - case MFC_RdListStallStat: - { - res.value = get_rchcnt(::offset32(&spu_thread::ch_stall_stat)); - break; - } - case SPU_RdSigNotify1: - { - res.value = get_rchcnt(::offset32(&spu_thread::ch_snr1)); - break; - } - case SPU_RdSigNotify2: - { - res.value = get_rchcnt(::offset32(&spu_thread::ch_snr2)); - break; - } - case MFC_RdAtomicStat: - { - res.value = get_rchcnt(::offset32(&spu_thread::ch_atomic_stat)); - break; - } - case MFC_WrTagUpdate: - { - res.value = m_ir->getInt32(1); - break; - } - case MFC_Cmd: - { - res.value = m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::mfc_size)); - res.value = m_ir->CreateSub(m_ir->getInt32(16), res.value); - break; - } - case SPU_RdInMbox: - { - const auto value = m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::ch_in_mbox)); - value->setAtomic(llvm::AtomicOrdering::Acquire); - res.value = value; - res.value = m_ir->CreateLShr(res.value, 8); - res.value = m_ir->CreateAnd(res.value, 7); - break; - } - case SPU_RdEventStat: - { - const auto mask = m_ir->CreateTrunc(m_ir->CreateLShr(m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::ch_events)), 32), get_type()); - res.value = call("spu_get_events", &exec_get_events, m_thread, mask); - break; - } - - // Channels with a constant count of 1: - case SPU_WrEventMask: - case SPU_WrEventAck: - case SPU_WrDec: - case SPU_RdDec: - case SPU_RdEventMask: - case SPU_RdMachStat: - case SPU_WrSRR0: - case SPU_RdSRR0: - case SPU_Set_Bkmk_Tag: - case SPU_PM_Start_Ev: - case SPU_PM_Stop_Ev: - case MFC_RdTagMask: - case MFC_LSA: - case MFC_EAH: - case MFC_EAL: - case MFC_Size: - case MFC_TagID: - case MFC_WrTagMask: - case MFC_WrListStallAck: - { - res.value = m_ir->getInt32(1); - break; - } - - default: - { - res.value = call("spu_read_channel_count", &exec_rchcnt, m_thread, m_ir->getInt32(op.ra)); - break; - } - } - - set_vr(op.rt, insert(splat(0), 3, res)); - } - - static void exec_wrch(spu_thread* _spu, u32 ch, u32 value) - { - if (!_spu->set_ch_value(ch, value) || _spu->state & cpu_flag::again) - { - spu_runtime::g_escape(_spu); - } - - static_cast(_spu->test_stopped()); - } - - static void exec_list_unstall(spu_thread* _spu, u32 tag) - { - for (u32 i = 0; i < _spu->mfc_size; i++) - { - if (_spu->mfc_queue[i].tag == (tag | 0x80)) - { - _spu->mfc_queue[i].tag &= 0x7f; - } - } - - _spu->do_mfc(); - } - - static void exec_mfc_cmd(spu_thread* _spu) - { - if (!_spu->process_mfc_cmd() || _spu->state & cpu_flag::again) - { - spu_runtime::g_escape(_spu); - } - - static_cast(_spu->test_stopped()); - } - - void WRCH(spu_opcode_t op) // - { - const auto val = eval(extract(get_vr(op.rt), 3)); - - if (m_interp_magn) - { - call("spu_write_channel", &exec_wrch, m_thread, get_imm(op.ra).value, val.value); - return; - } - - switch (op.ra) - { - case SPU_WrSRR0: - { - m_ir->CreateStore(eval(val & 0x3fffc).value, spu_ptr(&spu_thread::srr0)); - return; - } - case SPU_WrOutIntrMbox: - { - // TODO - break; - } - case SPU_WrOutMbox: - { - // TODO - break; - } - case MFC_WrTagMask: - { - // TODO - m_ir->CreateStore(val.value, spu_ptr(&spu_thread::ch_tag_mask)); - const auto next = llvm::BasicBlock::Create(m_context, "", m_function); - const auto _mfc = llvm::BasicBlock::Create(m_context, "", m_function); - m_ir->CreateCondBr(m_ir->CreateICmpNE(m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::ch_tag_upd)), m_ir->getInt32(MFC_TAG_UPDATE_IMMEDIATE)), _mfc, next); - m_ir->SetInsertPoint(_mfc); - update_pc(); - call("spu_write_channel", &exec_wrch, m_thread, m_ir->getInt32(op.ra), val.value); - m_ir->CreateBr(next); - m_ir->SetInsertPoint(next); - return; - } - case MFC_WrTagUpdate: - { - if (true) - { - const auto tag_mask = m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::ch_tag_mask)); - const auto mfc_fence = m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::mfc_fence)); - const auto completed = m_ir->CreateAnd(tag_mask, m_ir->CreateNot(mfc_fence)); - const auto upd_ptr = spu_ptr(&spu_thread::ch_tag_upd); - const auto stat_ptr = spu_ptr(&spu_thread::ch_tag_stat); - const auto stat_val = m_ir->CreateOr(m_ir->CreateZExt(completed, get_type()), s64{smin}); - - const auto next = llvm::BasicBlock::Create(m_context, "", m_function); - const auto next0 = llvm::BasicBlock::Create(m_context, "", m_function); - const auto imm = llvm::BasicBlock::Create(m_context, "", m_function); - const auto any = llvm::BasicBlock::Create(m_context, "", m_function); - const auto fail = llvm::BasicBlock::Create(m_context, "", m_function); - const auto update = llvm::BasicBlock::Create(m_context, "", m_function); - - m_ir->CreateCondBr(m_ir->CreateICmpEQ(val.value, m_ir->getInt32(MFC_TAG_UPDATE_IMMEDIATE)), imm, next0); - m_ir->SetInsertPoint(imm); - m_ir->CreateStore(val.value, upd_ptr); - m_ir->CreateStore(stat_val, stat_ptr); - m_ir->CreateBr(next); - m_ir->SetInsertPoint(next0); - m_ir->CreateCondBr(m_ir->CreateICmpULE(val.value, m_ir->getInt32(MFC_TAG_UPDATE_ALL)), any, fail, m_md_likely); - - // Illegal update, access violate with special address - m_ir->SetInsertPoint(fail); - const auto ptr = _ptr(m_memptr, 0xffdead04); - m_ir->CreateStore(m_ir->getInt32("TAG\0"_u32), ptr); - m_ir->CreateBr(next); - - m_ir->SetInsertPoint(any); - const auto cond = m_ir->CreateSelect(m_ir->CreateICmpEQ(val.value, m_ir->getInt32(MFC_TAG_UPDATE_ANY)) - , m_ir->CreateICmpNE(completed, m_ir->getInt32(0)), m_ir->CreateICmpEQ(completed, tag_mask)); - - m_ir->CreateStore(m_ir->CreateSelect(cond, m_ir->getInt32(MFC_TAG_UPDATE_IMMEDIATE), val.value), upd_ptr); - m_ir->CreateCondBr(cond, update, next, m_md_likely); - m_ir->SetInsertPoint(update); - m_ir->CreateStore(stat_val, stat_ptr); - m_ir->CreateBr(next); - m_ir->SetInsertPoint(next); - return; - } - } - case MFC_LSA: - { - set_reg_fixed(s_reg_mfc_lsa, val.value); - return; - } - case MFC_EAH: - { - if (auto ci = llvm::dyn_cast(val.value)) - { - if (ci->getZExtValue() == 0) - { - return; - } - } - - spu_log.warning("[0x%x] MFC_EAH: $%u is not a zero constant", m_pos, +op.rt); - //m_ir->CreateStore(val.value, spu_ptr(&spu_thread::ch_mfc_cmd, &spu_mfc_cmd::eah)); - return; - } - case MFC_EAL: - { - set_reg_fixed(s_reg_mfc_eal, val.value); - return; - } - case MFC_Size: - { - set_reg_fixed(s_reg_mfc_size, trunc(val).eval(m_ir)); - return; - } - case MFC_TagID: - { - set_reg_fixed(s_reg_mfc_tag, trunc(val & 0x1f).eval(m_ir)); - return; - } - case MFC_Cmd: - { - // Prevent store elimination (TODO) - m_block->store_context_ctr[s_reg_mfc_eal]++; - m_block->store_context_ctr[s_reg_mfc_lsa]++; - m_block->store_context_ctr[s_reg_mfc_tag]++; - m_block->store_context_ctr[s_reg_mfc_size]++; - - if (auto ci = llvm::dyn_cast(trunc(val).eval(m_ir))) - { - if (g_cfg.core.mfc_debug) - { - break; - } - - bool must_use_cpp_functions = !!g_cfg.core.spu_accurate_dma; - - if (u64 cmdh = ci->getZExtValue() & ~(MFC_BARRIER_MASK | MFC_FENCE_MASK | MFC_RESULT_MASK); g_cfg.core.rsx_fifo_accuracy || g_cfg.video.strict_rendering_mode || !g_use_rtm) - { - // TODO: don't require TSX (current implementation is TSX-only) - if (cmdh == MFC_PUT_CMD || cmdh == MFC_SNDSIG_CMD) - { - must_use_cpp_functions = true; - } - } - - const auto eal = get_reg_fixed(s_reg_mfc_eal); - const auto lsa = get_reg_fixed(s_reg_mfc_lsa); - const auto tag = get_reg_fixed(s_reg_mfc_tag); - - const auto size = get_reg_fixed(s_reg_mfc_size); - const auto mask = m_ir->CreateShl(m_ir->getInt32(1), zext(tag).eval(m_ir)); - const auto exec = llvm::BasicBlock::Create(m_context, "", m_function); - const auto fail = llvm::BasicBlock::Create(m_context, "", m_function); - const auto next = llvm::BasicBlock::Create(m_context, "", m_function); - - const auto pf = spu_ptr(&spu_thread::mfc_fence); - const auto pb = spu_ptr(&spu_thread::mfc_barrier); - - switch (u64 cmd = ci->getZExtValue()) - { - case MFC_SDCRT_CMD: - case MFC_SDCRTST_CMD: - { - return; - } - case MFC_PUTL_CMD: - case MFC_PUTLB_CMD: - case MFC_PUTLF_CMD: - case MFC_PUTRL_CMD: - case MFC_PUTRLB_CMD: - case MFC_PUTRLF_CMD: - case MFC_GETL_CMD: - case MFC_GETLB_CMD: - case MFC_GETLF_CMD: - { - ensure_gpr_stores(); - [[fallthrough]]; - } - case MFC_SDCRZ_CMD: - case MFC_GETLLAR_CMD: - case MFC_PUTLLC_CMD: - case MFC_PUTLLUC_CMD: - case MFC_PUTQLLUC_CMD: - { - // TODO - m_ir->CreateBr(next); - m_ir->SetInsertPoint(exec); - m_ir->CreateUnreachable(); - m_ir->SetInsertPoint(fail); - m_ir->CreateUnreachable(); - m_ir->SetInsertPoint(next); - m_ir->CreateStore(ci, spu_ptr(&spu_thread::ch_mfc_cmd, &spu_mfc_cmd::cmd)); - update_pc(); - call("spu_exec_mfc_cmd", &exec_mfc_cmd, m_thread); - return; - } - case MFC_SNDSIG_CMD: - case MFC_SNDSIGB_CMD: - case MFC_SNDSIGF_CMD: - case MFC_PUT_CMD: - case MFC_PUTB_CMD: - case MFC_PUTF_CMD: - case MFC_PUTR_CMD: - case MFC_PUTRB_CMD: - case MFC_PUTRF_CMD: - case MFC_GET_CMD: - case MFC_GETB_CMD: - case MFC_GETF_CMD: - { - // Try to obtain constant size - u64 csize = -1; - - if (auto ci = llvm::dyn_cast(size.value)) - { - csize = ci->getZExtValue(); - } - - if (cmd >= MFC_SNDSIG_CMD && csize != 4) - { - csize = -1; - } - - llvm::Value* src = m_ir->CreateGEP(get_type(), m_lsptr, zext(lsa).eval(m_ir)); - llvm::Value* dst = m_ir->CreateGEP(get_type(), m_memptr, zext(eal).eval(m_ir)); - - if (cmd & MFC_GET_CMD) - { - std::swap(src, dst); - } - - llvm::Value* barrier = m_ir->CreateLoad(get_type(), pb); - - if (cmd & (MFC_BARRIER_MASK | MFC_FENCE_MASK)) - { - barrier = m_ir->CreateOr(barrier, m_ir->CreateLoad(get_type(), pf)); - } - - const auto cond = m_ir->CreateIsNull(m_ir->CreateAnd(mask, barrier)); - m_ir->CreateCondBr(cond, exec, fail, m_md_likely); - m_ir->SetInsertPoint(exec); - - const auto copy = llvm::BasicBlock::Create(m_context, "", m_function); - - // Always use interpreter function for MFC debug option - if (!must_use_cpp_functions) - { - const auto mmio = llvm::BasicBlock::Create(m_context, "", m_function); - m_ir->CreateCondBr(m_ir->CreateICmpUGE(eal.value, m_ir->getInt32(0xe0000000)), mmio, copy, m_md_unlikely); - m_ir->SetInsertPoint(mmio); - } - - m_ir->CreateStore(ci, spu_ptr(&spu_thread::ch_mfc_cmd, &spu_mfc_cmd::cmd)); - call("spu_exec_mfc_cmd", &exec_mfc_cmd, m_thread); - m_ir->CreateBr(next); - m_ir->SetInsertPoint(copy); - - llvm::Type* vtype = get_type(); - - switch (csize) - { - case 0: - case umax: - { - break; - } - case 1: - { - vtype = get_type(); - break; - } - case 2: - { - vtype = get_type(); - break; - } - case 4: - { - vtype = get_type(); - break; - } - case 8: - { - vtype = get_type(); - break; - } - default: - { - if (csize % 16 || csize > 0x4000) - { - spu_log.error("[0x%x] MFC_Cmd: invalid size %u", m_pos, csize); - } - } - } - - // Check if the LS address is constant and 256 bit aligned - u64 clsa = umax; - - if (auto ci = llvm::dyn_cast(lsa.value)) - { - clsa = ci->getZExtValue(); - } - - u32 stride = 16; - - if (m_use_avx && csize >= 32 && !(clsa % 32)) - { - vtype = get_type(); - stride = 32; - } - - if (csize > 0 && csize <= 16) - { - // Generate single copy operation - m_ir->CreateStore(m_ir->CreateLoad(vtype, src), dst); - } - else if (csize <= stride * 16 && !(csize % 32)) - { - // Generate fixed sequence of copy operations - for (u32 i = 0; i < csize; i += stride) - { - const auto _src = m_ir->CreateGEP(get_type(), src, m_ir->getInt32(i)); - const auto _dst = m_ir->CreateGEP(get_type(), dst, m_ir->getInt32(i)); - if (csize - i < stride) - { - m_ir->CreateStore(m_ir->CreateLoad(get_type(), _src), _dst); - } - else - { - m_ir->CreateAlignedStore(m_ir->CreateAlignedLoad(vtype, _src, llvm::MaybeAlign{16}), _dst, llvm::MaybeAlign{16}); - } - } - } - else if (csize) - { - // TODO - auto spu_memcpy = [](u8* dst, const u8* src, u32 size) - { - std::memcpy(dst, src, size); - }; - call("spu_memcpy", +spu_memcpy, dst, src, zext(size).eval(m_ir)); - } - - // Disable certain thing - m_ir->CreateStore(m_ir->getInt32(0), spu_ptr(&spu_thread::last_faddr)); - m_ir->CreateBr(next); - break; - } - case MFC_BARRIER_CMD: - case MFC_EIEIO_CMD: - case MFC_SYNC_CMD: - { - const auto cond = m_ir->CreateIsNull(m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::mfc_size))); - m_ir->CreateCondBr(cond, exec, fail, m_md_likely); - m_ir->SetInsertPoint(exec); - m_ir->CreateFence(llvm::AtomicOrdering::SequentiallyConsistent); - m_ir->CreateBr(next); - break; - } - default: - { - // TODO - spu_log.error("[0x%x] MFC_Cmd: unknown command (0x%x)", m_pos, cmd); - m_ir->CreateBr(next); - m_ir->SetInsertPoint(exec); - m_ir->CreateUnreachable(); - break; - } - } - - // Fallback: enqueue the command - m_ir->SetInsertPoint(fail); - - // Get MFC slot, redirect to invalid memory address - const auto slot = m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::mfc_size)); - const auto off0 = m_ir->CreateAdd(m_ir->CreateMul(slot, m_ir->getInt32(sizeof(spu_mfc_cmd))), m_ir->getInt32(::offset32(&spu_thread::mfc_queue))); - const auto ptr0 = m_ir->CreateGEP(get_type(), m_thread, m_ir->CreateZExt(off0, get_type())); - const auto ptr1 = m_ir->CreateGEP(get_type(), m_memptr, m_ir->getInt64(0xffdeadf0)); - const auto pmfc = m_ir->CreateSelect(m_ir->CreateICmpULT(slot, m_ir->getInt32(16)), ptr0, ptr1); - m_ir->CreateStore(ci, _ptr(pmfc, ::offset32(&spu_mfc_cmd::cmd))); - - switch (u64 cmd = ci->getZExtValue()) - { - case MFC_GETLLAR_CMD: - case MFC_PUTLLC_CMD: - case MFC_PUTLLUC_CMD: - case MFC_PUTQLLUC_CMD: - { - break; - } - case MFC_PUTL_CMD: - case MFC_PUTLB_CMD: - case MFC_PUTLF_CMD: - case MFC_PUTRL_CMD: - case MFC_PUTRLB_CMD: - case MFC_PUTRLF_CMD: - case MFC_GETL_CMD: - case MFC_GETLB_CMD: - case MFC_GETLF_CMD: - { - break; - } - case MFC_SDCRZ_CMD: - { - break; - } - case MFC_SNDSIG_CMD: - case MFC_SNDSIGB_CMD: - case MFC_SNDSIGF_CMD: - case MFC_PUT_CMD: - case MFC_PUTB_CMD: - case MFC_PUTF_CMD: - case MFC_PUTR_CMD: - case MFC_PUTRB_CMD: - case MFC_PUTRF_CMD: - case MFC_GET_CMD: - case MFC_GETB_CMD: - case MFC_GETF_CMD: - { - m_ir->CreateStore(tag.value, _ptr(pmfc, ::offset32(&spu_mfc_cmd::tag))); - m_ir->CreateStore(size.value, _ptr(pmfc, ::offset32(&spu_mfc_cmd::size))); - m_ir->CreateStore(lsa.value, _ptr(pmfc, ::offset32(&spu_mfc_cmd::lsa))); - m_ir->CreateStore(eal.value, _ptr(pmfc, ::offset32(&spu_mfc_cmd::eal))); - m_ir->CreateStore(m_ir->CreateOr(m_ir->CreateLoad(get_type(), pf), mask), pf); - if (cmd & MFC_BARRIER_MASK) - m_ir->CreateStore(m_ir->CreateOr(m_ir->CreateLoad(get_type(), pb), mask), pb); - break; - } - case MFC_BARRIER_CMD: - case MFC_EIEIO_CMD: - case MFC_SYNC_CMD: - { - m_ir->CreateStore(m_ir->getInt32(-1), pb); - m_ir->CreateStore(m_ir->CreateOr(m_ir->CreateLoad(get_type(), pf), mask), pf); - break; - } - default: - { - m_ir->CreateUnreachable(); - break; - } - } - - m_ir->CreateStore(m_ir->CreateAdd(slot, m_ir->getInt32(1)), spu_ptr(&spu_thread::mfc_size)); - m_ir->CreateBr(next); - m_ir->SetInsertPoint(next); - return; - } - - // Fallback to unoptimized WRCH implementation (TODO) - spu_log.warning("[0x%x] MFC_Cmd: $%u is not a constant", m_pos, +op.rt); - break; - } - case MFC_WrListStallAck: - { - const auto mask = eval(splat(1) << (val & 0x1f)); - const auto _ptr = spu_ptr(&spu_thread::ch_stall_mask); - const auto _old = m_ir->CreateLoad(get_type(), _ptr); - const auto _new = m_ir->CreateAnd(_old, m_ir->CreateNot(mask.value)); - m_ir->CreateStore(_new, _ptr); - const auto next = llvm::BasicBlock::Create(m_context, "", m_function); - const auto _mfc = llvm::BasicBlock::Create(m_context, "", m_function); - m_ir->CreateCondBr(m_ir->CreateICmpNE(_old, _new), _mfc, next); - m_ir->SetInsertPoint(_mfc); - ensure_gpr_stores(); - update_pc(); - call("spu_list_unstall", &exec_list_unstall, m_thread, eval(val & 0x1f).value); - m_ir->CreateBr(next); - m_ir->SetInsertPoint(next); - return; - } - case SPU_WrDec: - { - call("spu_get_events", &exec_get_events, m_thread, m_ir->getInt32(SPU_EVENT_TM)); - - if (utils::get_tsc_freq() && !(g_cfg.core.spu_loop_detection) && (g_cfg.core.clocks_scale == 100)) - { - const auto tsc = m_ir->CreateCall(get_intrinsic(llvm::Intrinsic::x86_rdtsc)); - const auto tscx = m_ir->CreateMul(m_ir->CreateUDiv(tsc, m_ir->getInt64(utils::get_tsc_freq())), m_ir->getInt64(80000000)); - const auto tscm = m_ir->CreateUDiv(m_ir->CreateMul(m_ir->CreateURem(tsc, m_ir->getInt64(utils::get_tsc_freq())), m_ir->getInt64(80000000)), m_ir->getInt64(utils::get_tsc_freq())); - const auto tsctb = m_ir->CreateAdd(tscx, tscm); - m_ir->CreateStore(tsctb, spu_ptr(&spu_thread::ch_dec_start_timestamp)); - } - else - { - m_ir->CreateStore(call("get_timebased_time", &get_timebased_time), spu_ptr(&spu_thread::ch_dec_start_timestamp)); - } - - m_ir->CreateStore(val.value, spu_ptr(&spu_thread::ch_dec_value)); - m_ir->CreateStore(m_ir->getInt8(0), spu_ptr(&spu_thread::is_dec_frozen)); - return; - } - case SPU_Set_Bkmk_Tag: - case SPU_PM_Start_Ev: - case SPU_PM_Stop_Ev: - { - return; - } - default: break; - } - - update_pc(); - ensure_gpr_stores(); - call("spu_write_channel", &exec_wrch, m_thread, m_ir->getInt32(op.ra), val.value); - } - - void LNOP(spu_opcode_t) // - { - } - - void NOP(spu_opcode_t) // - { - } - - void SYNC(spu_opcode_t) // - { - // This instruction must be used following a store instruction that modifies the instruction stream. - m_ir->CreateFence(llvm::AtomicOrdering::SequentiallyConsistent); - - if (g_cfg.core.spu_block_size == spu_block_size_type::safe && !m_interp_magn) - { - m_block->block_end = m_ir->GetInsertBlock(); - update_pc(m_pos + 4); - tail_chunk(m_dispatch); - } - } - - void DSYNC(spu_opcode_t) // - { - // This instruction forces all earlier load, store, and channel instructions to complete before proceeding. - m_ir->CreateFence(llvm::AtomicOrdering::SequentiallyConsistent); - } - - void MFSPR(spu_opcode_t op) // - { - // Check SPUInterpreter for notes. - set_vr(op.rt, splat(0)); - } - - void MTSPR(spu_opcode_t) // - { - // Check SPUInterpreter for notes. - } - - template - auto mpyh(TA&& a, TB&& b) - { - return bitcast(bitcast((std::forward(a) >> 16)) * bitcast(std::forward(b))) << 16; - } - - template - auto mpyu(TA&& a, TB&& b) - { - return (std::forward(a) << 16 >> 16) * (std::forward(b) << 16 >> 16); - } - - void SF(spu_opcode_t op) - { - set_vr(op.rt, get_vr(op.rb) - get_vr(op.ra)); - } - - void OR(spu_opcode_t op) - { - set_vr(op.rt, get_vr(op.ra) | get_vr(op.rb)); - } - - void BG(spu_opcode_t op) - { - const auto [a, b] = get_vrs(op.ra, op.rb); - set_vr(op.rt, zext(a <= b)); - } - - void SFH(spu_opcode_t op) - { - set_vr(op.rt, get_vr(op.rb) - get_vr(op.ra)); - } - - void NOR(spu_opcode_t op) - { - set_vr(op.rt, ~(get_vr(op.ra) | get_vr(op.rb))); - } - - void ABSDB(spu_opcode_t op) - { - const auto [a, b] = get_vrs(op.ra, op.rb); - set_vr(op.rt, absd(a, b)); - } - - void ROT(spu_opcode_t op) - { - const auto [a, b] = get_vrs(op.ra, op.rb); - set_vr(op.rt, rol(a, b)); - } - - void ROTM(spu_opcode_t op) - { - const auto [a, b] = get_vrs(op.ra, op.rb); - - auto minusb = eval(-b); - if (auto [ok, x] = match_expr(b, -match()); ok) - { - minusb = eval(x); - } - - if (auto k = get_known_bits(minusb); !!(k.Zero & 32)) - { - set_vr(op.rt, a >> (minusb & 31)); - return; - } - - set_vr(op.rt, inf_lshr(a, minusb & 63)); - } - - void ROTMA(spu_opcode_t op) - { - const auto [a, b] = get_vrs(op.ra, op.rb); - - auto minusb = eval(-b); - if (auto [ok, x] = match_expr(b, -match()); ok) - { - minusb = eval(x); - } - - if (auto k = get_known_bits(minusb); !!(k.Zero & 32)) - { - set_vr(op.rt, a >> (minusb & 31)); - return; - } - - set_vr(op.rt, inf_ashr(a, minusb & 63)); - } - - void SHL(spu_opcode_t op) - { - const auto [a, b] = get_vrs(op.ra, op.rb); - - if (auto k = get_known_bits(b); !!(k.Zero & 32)) - { - set_vr(op.rt, a << (b & 31)); - return; - } - - set_vr(op.rt, inf_shl(a, b & 63)); - } - - void ROTH(spu_opcode_t op) - { - const auto [a, b] = get_vrs(op.ra, op.rb); - set_vr(op.rt, rol(a, b)); - } - - void ROTHM(spu_opcode_t op) - { - const auto [a, b] = get_vrs(op.ra, op.rb); - - auto minusb = eval(-b); - if (auto [ok, x] = match_expr(b, -match()); ok) - { - minusb = eval(x); - } - - if (auto k = get_known_bits(minusb); !!(k.Zero & 16)) - { - set_vr(op.rt, a >> (minusb & 15)); - return; - } - - set_vr(op.rt, inf_lshr(a, minusb & 31)); - } - - void ROTMAH(spu_opcode_t op) - { - const auto [a, b] = get_vrs(op.ra, op.rb); - - auto minusb = eval(-b); - if (auto [ok, x] = match_expr(b, -match()); ok) - { - minusb = eval(x); - } - - if (auto k = get_known_bits(minusb); !!(k.Zero & 16)) - { - set_vr(op.rt, a >> (minusb & 15)); - return; - } - - set_vr(op.rt, inf_ashr(a, minusb & 31)); - } - - void SHLH(spu_opcode_t op) - { - const auto [a, b] = get_vrs(op.ra, op.rb); - - if (auto k = get_known_bits(b); !!(k.Zero & 16)) - { - set_vr(op.rt, a << (b & 15)); - return; - } - - set_vr(op.rt, inf_shl(a, b & 31)); - } - - void ROTI(spu_opcode_t op) - { - const auto a = get_vr(op.ra); - const auto i = get_imm(op.i7, false); - set_vr(op.rt, rol(a, i)); - } - - void ROTMI(spu_opcode_t op) - { - const auto a = get_vr(op.ra); - const auto i = get_imm(op.i7, false); - set_vr(op.rt, inf_lshr(a, -i & 63)); - } - - void ROTMAI(spu_opcode_t op) - { - const auto a = get_vr(op.ra); - const auto i = get_imm(op.i7, false); - set_vr(op.rt, inf_ashr(a, -i & 63)); - } - - void SHLI(spu_opcode_t op) - { - const auto a = get_vr(op.ra); - const auto i = get_imm(op.i7, false); - set_vr(op.rt, inf_shl(a, i & 63)); - } - - void ROTHI(spu_opcode_t op) - { - const auto a = get_vr(op.ra); - const auto i = get_imm(op.i7, false); - set_vr(op.rt, rol(a, i)); - } - - void ROTHMI(spu_opcode_t op) - { - const auto a = get_vr(op.ra); - const auto i = get_imm(op.i7, false); - set_vr(op.rt, inf_lshr(a, -i & 31)); - } - - void ROTMAHI(spu_opcode_t op) - { - const auto a = get_vr(op.ra); - const auto i = get_imm(op.i7, false); - set_vr(op.rt, inf_ashr(a, -i & 31)); - } - - void SHLHI(spu_opcode_t op) - { - const auto a = get_vr(op.ra); - const auto i = get_imm(op.i7, false); - set_vr(op.rt, inf_shl(a, i & 31)); - } - - void A(spu_opcode_t op) - { - if (auto [a, b] = match_vrs(op.ra, op.rb); a && b) - { - static const auto MP = match(); - - if (auto [ok, a0, b0, b1, a1] = match_expr(a, mpyh(MP, MP) + mpyh(MP, MP)); ok) - { - if (auto [ok, a2, b2] = match_expr(b, mpyu(MP, MP)); ok && a2.eq(a0, a1) && b2.eq(b0, b1)) - { - // 32-bit multiplication - spu_log.notice("mpy32 in %s at 0x%05x", m_hash, m_pos); - set_vr(op.rt, a0 * b0); - return; - } - } - } - - set_vr(op.rt, get_vr(op.ra) + get_vr(op.rb)); - } - - void AND(spu_opcode_t op) - { - if (match_vr(op.ra, [&](auto a, auto /*MP1*/) - { - if (auto b = match_vr_as(a, op.rb)) - { - set_vr(op.rt, a & b); - return true; - } - - return match_vr(op.rb, [&](auto /*b*/, auto /*MP2*/) - { - set_vr(op.rt, a & get_vr_as(a, op.rb)); - return true; - }); - })) - { - return; - } - - set_vr(op.rt, get_vr(op.ra) & get_vr(op.rb)); - } - - void CG(spu_opcode_t op) - { - const auto [a, b] = get_vrs(op.ra, op.rb); - set_vr(op.rt, zext(a + b < a)); - } - - void AH(spu_opcode_t op) - { - set_vr(op.rt, get_vr(op.ra) + get_vr(op.rb)); - } - - void NAND(spu_opcode_t op) - { - set_vr(op.rt, ~(get_vr(op.ra) & get_vr(op.rb))); - } - - void AVGB(spu_opcode_t op) - { - set_vr(op.rt, avg(get_vr(op.ra), get_vr(op.rb))); - } - - void GB(spu_opcode_t op) - { - // GFNI trick to extract selected bit from bytes - // By treating the first input as constant, and the second input as variable, - // with only 1 bit set in our constant, gf2p8affineqb will extract that selected bit - // from each byte of the second operand - if (m_use_gfni) - { - const auto a = get_vr(op.ra); - const auto as = zshuffle(a, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 12, 8, 4, 0); - set_vr(op.rt, gf2p8affineqb(build(0x0, 0x0, 0x0, 0x0, 0x01, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x01, 0x0, 0x0, 0x0), as, 0x0)); - return; - } - - const auto a = get_vr(op.ra); - const auto m = zext(bitcast(trunc(a))); - set_vr(op.rt, insert(splat(0), 3, eval(m))); - } - - void GBH(spu_opcode_t op) - { - if (m_use_gfni) - { - const auto a = get_vr(op.ra); - const auto as = zshuffle(a, 16, 16, 16, 16, 16, 16, 16, 16, 14, 12, 10, 8, 6, 4, 2, 0); - set_vr(op.rt, gf2p8affineqb(build(0x0, 0x0, 0x0, 0x0, 0x01, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x01, 0x0, 0x0, 0x0), as, 0x0)); - return; - } - - const auto a = get_vr(op.ra); - const auto m = zext(bitcast(trunc(a))); - set_vr(op.rt, insert(splat(0), 3, eval(m))); - } - - void GBB(spu_opcode_t op) - { - const auto a = get_vr(op.ra); - - if (m_use_gfni) - { - const auto as = zshuffle(a, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8); - const auto m = gf2p8affineqb(build(0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x01, 0x01, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0), as, 0x0); - set_vr(op.rt, zshuffle(m, 16, 16, 16, 16, 16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10)); - return; - } - - const auto m = zext(bitcast(trunc(a))); - set_vr(op.rt, insert(splat(0), 3, eval(m))); - } - - void FSM(spu_opcode_t op) - { - // FSM following a comparison instruction - if (match_vr(op.ra, [&](auto c, auto MP) - { - using VT = typename decltype(MP)::type; - - if (auto [ok, x] = match_expr(c, sext(match]>())); ok) - { - set_vr(op.rt, (splat_scalar(c))); - return true; - } - - return false; - })) - { - return; - } - - const auto v = extract(get_vr(op.ra), 3); - const auto m = bitcast(trunc(v)); - set_vr(op.rt, sext(m)); - } - - void FSMH(spu_opcode_t op) - { - const auto v = extract(get_vr(op.ra), 3); - const auto m = bitcast(trunc(v)); - set_vr(op.rt, sext(m)); - } - - void FSMB(spu_opcode_t op) - { - const auto v = extract(get_vr(op.ra), 3); - const auto m = bitcast(trunc(v)); - set_vr(op.rt, sext(m)); - } - - template - static auto byteswap(TA&& a) - { - return zshuffle(std::forward(a), 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - } - - void ROTQBYBI(spu_opcode_t op) - { - const auto a = get_vr(op.ra); - - // Data with swapped endian from a load instruction - if (auto [ok, as] = match_expr(a, byteswap(match())); ok) - { - const auto sc = build(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - const auto sh = sc + (splat_scalar(get_vr(op.rb)) >> 3); - - if (m_use_avx512_icl) - { - set_vr(op.rt, vpermb(as, sh)); - return; - } - - set_vr(op.rt, pshufb(as, (sh & 0xf))); - return; - } - const auto sc = build(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - const auto sh = sc - (splat_scalar(get_vr(op.rb)) >> 3); - - if (m_use_avx512_icl) - { - set_vr(op.rt, vpermb(a, sh)); - return; - } - - set_vr(op.rt, pshufb(a, (sh & 0xf))); - } - - void ROTQMBYBI(spu_opcode_t op) - { - const auto a = get_vr(op.ra); - const auto b = get_vr(op.rb); - - auto minusb = eval(-(b >> 3)); - if (auto [ok, v0, v1] = match_expr(b, match() - match()); ok) - { - if (auto [ok1, data] = get_const_vector(v0.value, m_pos); ok1) - { - if (data == v128::from32p(7)) - { - minusb = eval(v1 >> 3); - } - } - } - - const auto minusbx = eval(bitcast(minusb) & 0x1f); - - // Data with swapped endian from a load instruction - if (auto [ok, as] = match_expr(a, byteswap(match())); ok) - { - const auto sc = build(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - const auto sh = sc - splat_scalar(minusbx); - set_vr(op.rt, pshufb(as, sh)); - return; - } - - const auto sc = build(112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127); - const auto sh = sc + splat_scalar(minusbx); - set_vr(op.rt, pshufb(a, sh)); - } - - void SHLQBYBI(spu_opcode_t op) - { - const auto a = get_vr(op.ra); - const auto b = get_vr(op.rb); - - // Data with swapped endian from a load instruction - if (auto [ok, as] = match_expr(a, byteswap(match())); ok) - { - const auto sc = build(127, 126, 125, 124, 123, 122, 121, 120, 119, 118, 117, 116, 115, 114, 113, 112); - const auto sh = sc + (splat_scalar(b) >> 3); - set_vr(op.rt, pshufb(as, sh)); - return; - } - - const auto sc = build(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - const auto sh = sc - (splat_scalar(b) >> 3); - set_vr(op.rt, pshufb(a, sh)); - } - - template - auto spu_get_insertion_shuffle_mask(T&& index) - { - const auto c = bitcast(build(0x1f, 0x1e, 0x1d, 0x1c, 0x1b, 0x1a, 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10)); - using e_type = std::remove_extent_t; - const auto v = splat(static_cast(sizeof(e_type) == 8 ? 0x01020304050607ull : 0x010203ull)); - return insert(c, std::forward(index), v); - } - - void CBX(spu_opcode_t op) - { - if (m_finfo && m_finfo->fn && op.ra == s_reg_sp) - { - // Optimization with aligned stack assumption. Strange because SPU code could use CBD instead, but encountered in wild. - set_vr(op.rt, spu_get_insertion_shuffle_mask(~get_scalar(get_vr(op.rb)) & 0xf)); - return; - } - - const auto s = get_scalar(get_vr(op.ra)) + get_scalar(get_vr(op.rb)); - set_vr(op.rt, spu_get_insertion_shuffle_mask(~s & 0xf)); - } - - void CHX(spu_opcode_t op) - { - if (m_finfo && m_finfo->fn && op.ra == s_reg_sp) - { - // See CBX. - set_vr(op.rt, spu_get_insertion_shuffle_mask(~get_scalar(get_vr(op.rb)) >> 1 & 0x7)); - return; - } - - const auto s = get_scalar(get_vr(op.ra)) + get_scalar(get_vr(op.rb)); - set_vr(op.rt, spu_get_insertion_shuffle_mask(~s >> 1 & 0x7)); - } - - void CWX(spu_opcode_t op) - { - if (m_finfo && m_finfo->fn && op.ra == s_reg_sp) - { - // See CBX. - set_vr(op.rt, spu_get_insertion_shuffle_mask(~get_scalar(get_vr(op.rb)) >> 2 & 0x3)); - return; - } - - const auto s = get_scalar(get_vr(op.ra)) + get_scalar(get_vr(op.rb)); - set_vr(op.rt, spu_get_insertion_shuffle_mask(~s >> 2 & 0x3)); - } - - void CDX(spu_opcode_t op) - { - if (m_finfo && m_finfo->fn && op.ra == s_reg_sp) - { - // See CBX. - set_vr(op.rt, spu_get_insertion_shuffle_mask(~get_scalar(get_vr(op.rb)) >> 3 & 0x1)); - return; - } - - const auto s = get_scalar(get_vr(op.ra)) + get_scalar(get_vr(op.rb)); - set_vr(op.rt, spu_get_insertion_shuffle_mask(~s >> 3 & 0x1)); - } - - void ROTQBI(spu_opcode_t op) - { - const auto a = get_vr(op.ra); - const auto b = splat_scalar(get_vr(op.rb) & 0x7); - set_vr(op.rt, fshl(a, zshuffle(a, 3, 0, 1, 2), b)); - } - - void ROTQMBI(spu_opcode_t op) - { - const auto [a, b] = get_vrs(op.ra, op.rb); - - auto minusb = eval(-b); - if (auto [ok, x] = match_expr(b, -match()); ok) - { - minusb = eval(x); - } - - const auto bx = splat_scalar(minusb) & 0x7; - set_vr(op.rt, fshr(zshuffle(a, 1, 2, 3, 4), a, bx)); - } - - void SHLQBI(spu_opcode_t op) - { - const auto a = get_vr(op.ra); - const auto b = splat_scalar(get_vr(op.rb) & 0x7); - set_vr(op.rt, fshl(a, zshuffle(a, 4, 0, 1, 2), b)); - } - -#if defined(ARCH_X64) - static __m128i exec_rotqby(__m128i a, u8 b) - { - alignas(32) const __m128i buf[2]{a, a}; - return _mm_loadu_si128(reinterpret_cast(reinterpret_cast(buf) + (16 - (b & 0xf)))); - } -#elif defined(ARCH_ARM64) -#else -#error "Unimplemented" -#endif - - void ROTQBY(spu_opcode_t op) - { - const auto a = get_vr(op.ra); - const auto b = get_vr(op.rb); - -#if defined(ARCH_X64) - if (!m_use_ssse3) - { - value_t r; - r.value = call("spu_rotqby", &exec_rotqby, a.value, eval(extract(b, 12)).value); - set_vr(op.rt, r); - return; - } -#endif - - // Data with swapped endian from a load instruction - if (auto [ok, as] = match_expr(a, byteswap(match())); ok) - { - const auto sc = build(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - const auto sh = eval(sc + splat_scalar(b)); - - if (m_use_avx512_icl) - { - set_vr(op.rt, vpermb(as, sh)); - return; - } - - set_vr(op.rt, pshufb(as, (sh & 0xf))); - return; - } - - const auto sc = build(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - const auto sh = eval(sc - splat_scalar(b)); - - if (m_use_avx512_icl) - { - set_vr(op.rt, vpermb(a, sh)); - return; - } - - set_vr(op.rt, pshufb(a, (sh & 0xf))); - } - - void ROTQMBY(spu_opcode_t op) - { - const auto a = get_vr(op.ra); - const auto b = get_vr(op.rb); - - auto minusb = eval(-b); - if (auto [ok, x] = match_expr(b, -match()); ok) - { - minusb = eval(x); - } - - const auto minusbx = bitcast(minusb); - - // Data with swapped endian from a load instruction - if (auto [ok, as] = match_expr(a, byteswap(match())); ok) - { - const auto sc = build(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - const auto sh = sc - (splat_scalar(minusbx) & 0x1f); - set_vr(op.rt, pshufb(as, sh)); - return; - } - - const auto sc = build(112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127); - const auto sh = sc + (splat_scalar(minusbx) & 0x1f); - set_vr(op.rt, pshufb(a, sh)); - } - - void SHLQBY(spu_opcode_t op) - { - const auto a = get_vr(op.ra); - const auto b = get_vr(op.rb); - - // Data with swapped endian from a load instruction - if (auto [ok, as] = match_expr(a, byteswap(match())); ok) - { - const auto sc = build(127, 126, 125, 124, 123, 122, 121, 120, 119, 118, 117, 116, 115, 114, 113, 112); - const auto sh = sc + (splat_scalar(b) & 0x1f); - set_vr(op.rt, pshufb(as, sh)); - return; - } - - const auto sc = build(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - const auto sh = sc - (splat_scalar(b) & 0x1f); - set_vr(op.rt, pshufb(a, sh)); - } - - template - static llvm_calli orx(T&& a) - { - return {"spu_orx", {std::forward(a)}}; - } - - void ORX(spu_opcode_t op) - { - register_intrinsic("spu_orx", [&](llvm::CallInst* ci) - { - const auto a = value(ci->getOperand(0)); - const auto x = zshuffle(a, 2, 3, 0, 1) | a; - const auto y = zshuffle(x, 1, 0, 3, 2) | x; - return zshuffle(y, 4, 4, 4, 3); - }); - - set_vr(op.rt, orx(get_vr(op.ra))); - } - - void CBD(spu_opcode_t op) - { - if (m_finfo && m_finfo->fn && op.ra == s_reg_sp) - { - // Known constant with aligned stack assumption (optimization). - set_vr(op.rt, spu_get_insertion_shuffle_mask(~get_imm(op.i7) & 0xf)); - return; - } - - const auto a = get_scalar(get_vr(op.ra)) + get_imm(op.i7); - set_vr(op.rt, spu_get_insertion_shuffle_mask(~a & 0xf)); - } - - void CHD(spu_opcode_t op) - { - if (m_finfo && m_finfo->fn && op.ra == s_reg_sp) - { - // See CBD. - set_vr(op.rt, spu_get_insertion_shuffle_mask(~get_imm(op.i7) >> 1 & 0x7)); - return; - } - - const auto a = get_scalar(get_vr(op.ra)) + get_imm(op.i7); - set_vr(op.rt, spu_get_insertion_shuffle_mask(~a >> 1 & 0x7)); - } - - void CWD(spu_opcode_t op) - { - if (m_finfo && m_finfo->fn && op.ra == s_reg_sp) - { - // See CBD. - set_vr(op.rt, spu_get_insertion_shuffle_mask(~get_imm(op.i7) >> 2 & 0x3)); - return; - } - - const auto a = get_scalar(get_vr(op.ra)) + get_imm(op.i7); - set_vr(op.rt, spu_get_insertion_shuffle_mask(~a >> 2 & 0x3)); - } - - void CDD(spu_opcode_t op) - { - if (m_finfo && m_finfo->fn && op.ra == s_reg_sp) - { - // See CBD. - set_vr(op.rt, spu_get_insertion_shuffle_mask(~get_imm(op.i7) >> 3 & 0x1)); - return; - } - - const auto a = get_scalar(get_vr(op.ra)) + get_imm(op.i7); - set_vr(op.rt, spu_get_insertion_shuffle_mask(~a >> 3 & 0x1)); - } - - void ROTQBII(spu_opcode_t op) - { - const auto a = get_vr(op.ra); - const auto b = eval(get_imm(op.i7, false) & 0x7); - set_vr(op.rt, fshl(a, zshuffle(a, 3, 0, 1, 2), b)); - } - - void ROTQMBII(spu_opcode_t op) - { - const auto a = get_vr(op.ra); - const auto b = eval(-get_imm(op.i7, false) & 0x7); - set_vr(op.rt, fshr(zshuffle(a, 1, 2, 3, 4), a, b)); - } - - void SHLQBII(spu_opcode_t op) - { - const auto a = get_vr(op.ra); - const auto b = eval(get_imm(op.i7, false) & 0x7); - set_vr(op.rt, fshl(a, zshuffle(a, 4, 0, 1, 2), b)); - } - - void ROTQBYI(spu_opcode_t op) - { - const auto a = get_vr(op.ra); - const auto sc = build(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - const auto sh = (sc - get_imm(op.i7, false)) & 0xf; - set_vr(op.rt, pshufb(a, sh)); - } - - void ROTQMBYI(spu_opcode_t op) - { - const auto a = get_vr(op.ra); - const auto sc = build(112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127); - const auto sh = sc + (-get_imm(op.i7, false) & 0x1f); - set_vr(op.rt, pshufb(a, sh)); - } - - void SHLQBYI(spu_opcode_t op) - { - if (get_reg_raw(op.ra) && !op.i7) return set_reg_fixed(op.rt, get_reg_raw(op.ra), false); // For expressions matching - const auto a = get_vr(op.ra); - const auto sc = build(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - const auto sh = sc - (get_imm(op.i7, false) & 0x1f); - set_vr(op.rt, pshufb(a, sh)); - } - - void CGT(spu_opcode_t op) - { - set_vr(op.rt, sext(get_vr(op.ra) > get_vr(op.rb))); - } - - void XOR(spu_opcode_t op) - { - set_vr(op.rt, get_vr(op.ra) ^ get_vr(op.rb)); - } - - void CGTH(spu_opcode_t op) - { - set_vr(op.rt, sext(get_vr(op.ra) > get_vr(op.rb))); - } - - void EQV(spu_opcode_t op) - { - set_vr(op.rt, ~(get_vr(op.ra) ^ get_vr(op.rb))); - } - - void CGTB(spu_opcode_t op) - { - set_vr(op.rt, sext(get_vr(op.ra) > get_vr(op.rb))); - } - - void SUMB(spu_opcode_t op) - { - if (m_use_avx512) - { - const auto [a, b] = get_vrs(op.ra, op.rb); - const auto zeroes = splat(0); - - if (op.ra == op.rb && !m_interp_magn) - { - set_vr(op.rt, vdbpsadbw(a, zeroes, 0)); - return; - } - - const auto ax = vdbpsadbw(a, zeroes, 0); - const auto bx = vdbpsadbw(b, zeroes, 0); - set_vr(op.rt, shuffle2(ax, bx, 0, 9, 2, 11, 4, 13, 6, 15)); - return; - } - - if (m_use_vnni) - { - const auto [a, b] = get_vrs(op.ra, op.rb); - const auto zeroes = splat(0); - const auto ones = splat(0x01010101); - const auto ax = bitcast(vpdpbusd(zeroes, a, ones)); - const auto bx = bitcast(vpdpbusd(zeroes, b, ones)); - set_vr(op.rt, shuffle2(ax, bx, 0, 8, 2, 10, 4, 12, 6, 14)); - return; - } - - const auto [a, b] = get_vrs(op.ra, op.rb); - const auto ahs = eval((a >> 8) + (a & 0xff)); - const auto bhs = eval((b >> 8) + (b & 0xff)); - const auto lsh = shuffle2(ahs, bhs, 0, 9, 2, 11, 4, 13, 6, 15); - const auto hsh = shuffle2(ahs, bhs, 1, 8, 3, 10, 5, 12, 7, 14); - set_vr(op.rt, lsh + hsh); - } - - void CLZ(spu_opcode_t op) - { - set_vr(op.rt, ctlz(get_vr(op.ra))); - } - - void XSWD(spu_opcode_t op) - { - set_vr(op.rt, get_vr(op.ra) << 32 >> 32); - } - - void XSHW(spu_opcode_t op) - { - set_vr(op.rt, get_vr(op.ra) << 16 >> 16); - } - - void CNTB(spu_opcode_t op) - { - set_vr(op.rt, ctpop(get_vr(op.ra))); - } - - void XSBH(spu_opcode_t op) - { - set_vr(op.rt, get_vr(op.ra) << 8 >> 8); - } - - void CLGT(spu_opcode_t op) - { - set_vr(op.rt, sext(get_vr(op.ra) > get_vr(op.rb))); - } - - void ANDC(spu_opcode_t op) - { - set_vr(op.rt, get_vr(op.ra) & ~get_vr(op.rb)); - } - - void CLGTH(spu_opcode_t op) - { - set_vr(op.rt, sext(get_vr(op.ra) > get_vr(op.rb))); - } - - void ORC(spu_opcode_t op) - { - set_vr(op.rt, get_vr(op.ra) | ~get_vr(op.rb)); - } - - void CLGTB(spu_opcode_t op) - { - set_vr(op.rt, sext(get_vr(op.ra) > get_vr(op.rb))); - } - - void CEQ(spu_opcode_t op) - { - set_vr(op.rt, sext(get_vr(op.ra) == get_vr(op.rb))); - } - - void MPYHHU(spu_opcode_t op) - { - set_vr(op.rt, (get_vr(op.ra) >> 16) * (get_vr(op.rb) >> 16)); - } - - void ADDX(spu_opcode_t op) - { - set_vr(op.rt, llvm_sum{get_vr(op.ra), get_vr(op.rb), get_vr(op.rt) & 1}); - } - - void SFX(spu_opcode_t op) - { - set_vr(op.rt, get_vr(op.rb) - get_vr(op.ra) - (~get_vr(op.rt) & 1)); - } - - void CGX(spu_opcode_t op) - { - const auto [a, b] = get_vrs(op.ra, op.rb); - const auto x = (get_vr(op.rt) << 31) >> 31; - const auto s = eval(a + b); - set_vr(op.rt, noncast(sext(s < a) | (sext(s == noncast(x)) & x)) >> 31); - } - - void BGX(spu_opcode_t op) - { - const auto [a, b] = get_vrs(op.ra, op.rb); - const auto c = get_vr(op.rt) << 31; - set_vr(op.rt, noncast(sext(b > a) | (sext(a == b) & c)) >> 31); - } - - void MPYHHA(spu_opcode_t op) - { - set_vr(op.rt, (get_vr(op.ra) >> 16) * (get_vr(op.rb) >> 16) + get_vr(op.rt)); - } - - void MPYHHAU(spu_opcode_t op) - { - set_vr(op.rt, (get_vr(op.ra) >> 16) * (get_vr(op.rb) >> 16) + get_vr(op.rt)); - } - - void MPY(spu_opcode_t op) - { - set_vr(op.rt, (get_vr(op.ra) << 16 >> 16) * (get_vr(op.rb) << 16 >> 16)); - } - - void MPYH(spu_opcode_t op) - { - set_vr(op.rt, mpyh(get_vr(op.ra), get_vr(op.rb))); - } - - void MPYHH(spu_opcode_t op) - { - set_vr(op.rt, (get_vr(op.ra) >> 16) * (get_vr(op.rb) >> 16)); - } - - void MPYS(spu_opcode_t op) - { - set_vr(op.rt, (get_vr(op.ra) << 16 >> 16) * (get_vr(op.rb) << 16 >> 16) >> 16); - } - - void CEQH(spu_opcode_t op) - { - set_vr(op.rt, sext(get_vr(op.ra) == get_vr(op.rb))); - } - - void MPYU(spu_opcode_t op) - { - set_vr(op.rt, mpyu(get_vr(op.ra), get_vr(op.rb))); - } - - void CEQB(spu_opcode_t op) - { - set_vr(op.rt, sext(get_vr(op.ra) == get_vr(op.rb))); - } - - void FSMBI(spu_opcode_t op) - { - const auto m = bitcast(get_imm(op.i16)); - set_vr(op.rt, sext(m)); - } - - void IL(spu_opcode_t op) - { - set_vr(op.rt, get_imm(op.si16)); - } - - void ILHU(spu_opcode_t op) - { - set_vr(op.rt, get_imm(op.i16) << 16); - } - - void ILH(spu_opcode_t op) - { - set_vr(op.rt, get_imm(op.i16)); - } - - void IOHL(spu_opcode_t op) - { - set_vr(op.rt, get_vr(op.rt) | get_imm(op.i16)); - } - - void ORI(spu_opcode_t op) - { - if (get_reg_raw(op.ra) && !op.si10) return set_reg_fixed(op.rt, get_reg_raw(op.ra), false); // For expressions matching - set_vr(op.rt, get_vr(op.ra) | get_imm(op.si10)); - } - - void ORHI(spu_opcode_t op) - { - if (get_reg_raw(op.ra) && !op.si10) return set_reg_fixed(op.rt, get_reg_raw(op.ra), false); - set_vr(op.rt, get_vr(op.ra) | get_imm(op.si10)); - } - - void ORBI(spu_opcode_t op) - { - if (get_reg_raw(op.ra) && !op.si10) return set_reg_fixed(op.rt, get_reg_raw(op.ra), false); - set_vr(op.rt, get_vr(op.ra) | get_imm(op.si10)); - } - - void SFI(spu_opcode_t op) - { - set_vr(op.rt, get_imm(op.si10) - get_vr(op.ra)); - } - - void SFHI(spu_opcode_t op) - { - set_vr(op.rt, get_imm(op.si10) - get_vr(op.ra)); - } - - void ANDI(spu_opcode_t op) - { - if (get_reg_raw(op.ra) && op.si10 == -1) return set_reg_fixed(op.rt, get_reg_raw(op.ra), false); - set_vr(op.rt, get_vr(op.ra) & get_imm(op.si10)); - } - - void ANDHI(spu_opcode_t op) - { - if (get_reg_raw(op.ra) && op.si10 == -1) return set_reg_fixed(op.rt, get_reg_raw(op.ra), false); - set_vr(op.rt, get_vr(op.ra) & get_imm(op.si10)); - } - - void ANDBI(spu_opcode_t op) - { - if (get_reg_raw(op.ra) && static_cast(op.si10) == -1) return set_reg_fixed(op.rt, get_reg_raw(op.ra), false); - set_vr(op.rt, get_vr(op.ra) & get_imm(op.si10)); - } - - void AI(spu_opcode_t op) - { - if (get_reg_raw(op.ra) && !op.si10) return set_reg_fixed(op.rt, get_reg_raw(op.ra), false); - set_vr(op.rt, get_vr(op.ra) + get_imm(op.si10)); - } - - void AHI(spu_opcode_t op) - { - if (get_reg_raw(op.ra) && !op.si10) return set_reg_fixed(op.rt, get_reg_raw(op.ra), false); - set_vr(op.rt, get_vr(op.ra) + get_imm(op.si10)); - } - - void XORI(spu_opcode_t op) - { - if (get_reg_raw(op.ra) && !op.si10) return set_reg_fixed(op.rt, get_reg_raw(op.ra), false); - set_vr(op.rt, get_vr(op.ra) ^ get_imm(op.si10)); - } - - void XORHI(spu_opcode_t op) - { - if (get_reg_raw(op.ra) && !op.si10) return set_reg_fixed(op.rt, get_reg_raw(op.ra), false); - set_vr(op.rt, get_vr(op.ra) ^ get_imm(op.si10)); - } - - void XORBI(spu_opcode_t op) - { - if (get_reg_raw(op.ra) && !op.si10) return set_reg_fixed(op.rt, get_reg_raw(op.ra), false); - set_vr(op.rt, get_vr(op.ra) ^ get_imm(op.si10)); - } - - void CGTI(spu_opcode_t op) - { - set_vr(op.rt, sext(get_vr(op.ra) > get_imm(op.si10))); - } - - void CGTHI(spu_opcode_t op) - { - set_vr(op.rt, sext(get_vr(op.ra) > get_imm(op.si10))); - } - - void CGTBI(spu_opcode_t op) - { - set_vr(op.rt, sext(get_vr(op.ra) > get_imm(op.si10))); - } - - void CLGTI(spu_opcode_t op) - { - set_vr(op.rt, sext(get_vr(op.ra) > get_imm(op.si10))); - } - - void CLGTHI(spu_opcode_t op) - { - set_vr(op.rt, sext(get_vr(op.ra) > get_imm(op.si10))); - } - - void CLGTBI(spu_opcode_t op) - { - set_vr(op.rt, sext(get_vr(op.ra) > get_imm(op.si10))); - } - - void MPYI(spu_opcode_t op) - { - set_vr(op.rt, (get_vr(op.ra) << 16 >> 16) * get_imm(op.si10)); - } - - void MPYUI(spu_opcode_t op) - { - set_vr(op.rt, (get_vr(op.ra) << 16 >> 16) * (get_imm(op.si10) & 0xffff)); - } - - void CEQI(spu_opcode_t op) - { - set_vr(op.rt, sext(get_vr(op.ra) == get_imm(op.si10))); - } - - void CEQHI(spu_opcode_t op) - { - set_vr(op.rt, sext(get_vr(op.ra) == get_imm(op.si10))); - } - - void CEQBI(spu_opcode_t op) - { - set_vr(op.rt, sext(get_vr(op.ra) == get_imm(op.si10))); - } - - void ILA(spu_opcode_t op) - { - set_vr(op.rt, get_imm(op.i18)); - } - - void SELB(spu_opcode_t op) - { - if (match_vr(op.rc, [&](auto c, auto MP) - { - using VT = typename decltype(MP)::type; - - // If the control mask comes from a comparison instruction, replace SELB with select - if (auto [ok, x] = match_expr(c, sext(match]>())); ok) - { - if constexpr (std::extent_v == 2) // u64[2] - { - // Try to select floats as floats if a OR b is typed as f64[2] - if (auto [a, b] = match_vrs(op.ra, op.rb); a || b) - { - set_vr(op.rt4, select(x, get_vr(op.rb), get_vr(op.ra))); - return true; - } - } - - if constexpr (std::extent_v == 4) // u32[4] - { - // Match division (adjusted) (TODO) - if (auto a = match_vr(op.ra)) - { - static const auto MT = match(); - - if (auto [div_ok, diva, divb] = match_expr(a, MT / MT); div_ok) - { - if (auto b = match_vr(op.rb)) - { - if (auto [add1_ok] = match_expr(b, bitcast(a) + splat(1)); add1_ok) - { - if (auto [fm_ok, a1, b1] = match_expr(x, bitcast(fm(MT, MT)) > splat(-1)); fm_ok) - { - if (auto [fnma_ok] = match_expr(a1, fnms(divb, bitcast(b), diva)); fnma_ok) - { - if (fabs(b1).eval(m_ir) == fsplat(1.0).eval(m_ir)) - { - set_vr(op.rt4, diva / divb); - return true; - } - - if (auto [sel_ok] = match_expr(b1, bitcast((bitcast(diva) & 0x80000000) | 0x3f800000)); sel_ok) - { - set_vr(op.rt4, diva / divb); - return true; - } - } - } - } - } - } - } - - if (auto [a, b] = match_vrs(op.ra, op.rb); a || b) - { - set_vr(op.rt4, select(x, get_vr(op.rb), get_vr(op.ra))); - return true; - } - - if (auto [a, b] = match_vrs(op.ra, op.rb); a || b) - { - set_vr(op.rt4, select(x, get_vr(op.rb), get_vr(op.ra))); - return true; - } - } - - if (auto [ok, y] = match_expr(x, bitcast]>(match>>())); ok) - { - // Don't ruin FSMB/FSM/FSMH instructions - return false; - } - - set_vr(op.rt4, select(x, get_vr(op.rb), get_vr(op.ra))); - return true; - } - - return false; - })) - { - return; - } - - const auto c = get_vr(op.rc); - - // Check if the constant mask doesn't require bit granularity - if (auto [ok, mask] = get_const_vector(c.value, m_pos); ok) - { - bool sel_32 = true; - for (u32 i = 0; i < 4; i++) - { - if (mask._u32[i] && mask._u32[i] != 0xFFFFFFFF) - { - sel_32 = false; - break; - } - } - - if (sel_32) - { - if (auto [a, b] = match_vrs(op.ra, op.rb); a || b) - { - set_vr(op.rt4, select(noncast(c) != 0, get_vr(op.rb), get_vr(op.ra))); - return; - } - else if (auto [a, b] = match_vrs(op.ra, op.rb); a || b) - { - set_vr(op.rt4, select(noncast(c) != 0, get_vr(op.rb), get_vr(op.ra))); - return; - } - - set_vr(op.rt4, select(noncast(c) != 0, get_vr(op.rb), get_vr(op.ra))); - return; - } - - bool sel_16 = true; - for (u32 i = 0; i < 8; i++) - { - if (mask._u16[i] && mask._u16[i] != 0xFFFF) - { - sel_16 = false; - break; - } - } - - if (sel_16) - { - set_vr(op.rt4, select(bitcast(c) != 0, get_vr(op.rb), get_vr(op.ra))); - return; - } - - bool sel_8 = true; - for (u32 i = 0; i < 16; i++) - { - if (mask._u8[i] && mask._u8[i] != 0xFF) - { - sel_8 = false; - break; - } - } - - if (sel_8) - { - set_vr(op.rt4, select(bitcast(c) != 0,get_vr(op.rb), get_vr(op.ra))); - return; - } - } - - const auto op1 = get_reg_raw(op.rb); - const auto op2 = get_reg_raw(op.ra); - - if ((op1 && op1->getType() == get_type()) || (op2 && op2->getType() == get_type())) - { - // Optimization: keep xfloat values in doubles even if the mask is unpredictable (hard way) - const auto c = get_vr(op.rc); - const auto b = get_vr(op.rb); - const auto a = get_vr(op.ra); - const auto m = conv_xfloat_mask(c.value); - const auto x = m_ir->CreateAnd(double_as_uint64(b.value), m); - const auto y = m_ir->CreateAnd(double_as_uint64(a.value), m_ir->CreateNot(m)); - set_reg_fixed(op.rt4, uint64_as_double(m_ir->CreateOr(x, y))); - return; - } - - set_vr(op.rt4, (get_vr(op.rb) & c) | (get_vr(op.ra) & ~c)); - } - - void SHUFB(spu_opcode_t op) // - { - if (match_vr(op.rc, [&](auto c, auto MP) - { - using VT = typename decltype(MP)::type; - - // If the mask comes from a constant generation instruction, replace SHUFB with insert - if (auto [ok, i] = match_expr(c, spu_get_insertion_shuffle_mask(match())); ok) - { - set_vr(op.rt4, insert(get_vr(op.rb), i, get_scalar(get_vr(op.ra)))); - return true; - } - - return false; - })) - { - return; - } - - const auto c = get_vr(op.rc); - - if (auto [ok, mask] = get_const_vector(c.value, m_pos); ok) - { - // Optimization: SHUFB with constant mask - if (((mask._u64[0] | mask._u64[1]) & 0xe0e0e0e0e0e0e0e0) == 0) - { - // Trivial insert or constant shuffle (TODO) - static constexpr struct mask_info - { - u64 i1; - u64 i0; - decltype(&cpu_translator::get_type) type; - u64 extract_from; - u64 insert_to; - } s_masks[30] - { - { 0x0311121314151617, 0x18191a1b1c1d1e1f, &cpu_translator::get_type, 12, 15 }, - { 0x1003121314151617, 0x18191a1b1c1d1e1f, &cpu_translator::get_type, 12, 14 }, - { 0x1011031314151617, 0x18191a1b1c1d1e1f, &cpu_translator::get_type, 12, 13 }, - { 0x1011120314151617, 0x18191a1b1c1d1e1f, &cpu_translator::get_type, 12, 12 }, - { 0x1011121303151617, 0x18191a1b1c1d1e1f, &cpu_translator::get_type, 12, 11 }, - { 0x1011121314031617, 0x18191a1b1c1d1e1f, &cpu_translator::get_type, 12, 10 }, - { 0x1011121314150317, 0x18191a1b1c1d1e1f, &cpu_translator::get_type, 12, 9 }, - { 0x1011121314151603, 0x18191a1b1c1d1e1f, &cpu_translator::get_type, 12, 8 }, - { 0x1011121314151617, 0x03191a1b1c1d1e1f, &cpu_translator::get_type, 12, 7 }, - { 0x1011121314151617, 0x18031a1b1c1d1e1f, &cpu_translator::get_type, 12, 6 }, - { 0x1011121314151617, 0x1819031b1c1d1e1f, &cpu_translator::get_type, 12, 5 }, - { 0x1011121314151617, 0x18191a031c1d1e1f, &cpu_translator::get_type, 12, 4 }, - { 0x1011121314151617, 0x18191a1b031d1e1f, &cpu_translator::get_type, 12, 3 }, - { 0x1011121314151617, 0x18191a1b1c031e1f, &cpu_translator::get_type, 12, 2 }, - { 0x1011121314151617, 0x18191a1b1c1d031f, &cpu_translator::get_type, 12, 1 }, - { 0x1011121314151617, 0x18191a1b1c1d1e03, &cpu_translator::get_type, 12, 0 }, - { 0x0203121314151617, 0x18191a1b1c1d1e1f, &cpu_translator::get_type, 6, 7 }, - { 0x1011020314151617, 0x18191a1b1c1d1e1f, &cpu_translator::get_type, 6, 6 }, - { 0x1011121302031617, 0x18191a1b1c1d1e1f, &cpu_translator::get_type, 6, 5 }, - { 0x1011121314150203, 0x18191a1b1c1d1e1f, &cpu_translator::get_type, 6, 4 }, - { 0x1011121314151617, 0x02031a1b1c1d1e1f, &cpu_translator::get_type, 6, 3 }, - { 0x1011121314151617, 0x181902031c1d1e1f, &cpu_translator::get_type, 6, 2 }, - { 0x1011121314151617, 0x18191a1b02031e1f, &cpu_translator::get_type, 6, 1 }, - { 0x1011121314151617, 0x18191a1b1c1d0203, &cpu_translator::get_type, 6, 0 }, - { 0x0001020314151617, 0x18191a1b1c1d1e1f, &cpu_translator::get_type, 3, 3 }, - { 0x1011121300010203, 0x18191a1b1c1d1e1f, &cpu_translator::get_type, 3, 2 }, - { 0x1011121314151617, 0x000102031c1d1e1f, &cpu_translator::get_type, 3, 1 }, - { 0x1011121314151617, 0x18191a1b00010203, &cpu_translator::get_type, 3, 0 }, - { 0x0001020304050607, 0x18191a1b1c1d1e1f, &cpu_translator::get_type, 1, 1 }, - { 0x1011121303151617, 0x0001020304050607, &cpu_translator::get_type, 1, 0 }, - }; - - // Check important constants from CWD-like constant generation instructions - for (const auto& cm : s_masks) - { - if (mask._u64[0] == cm.i0 && mask._u64[1] == cm.i1) - { - const auto t = (this->*cm.type)(); - const auto a = get_reg_fixed(op.ra, t); - const auto b = get_reg_fixed(op.rb, t); - const auto e = m_ir->CreateExtractElement(a, cm.extract_from); - set_reg_fixed(op.rt4, m_ir->CreateInsertElement(b, e, cm.insert_to)); - return; - } - } - } - - // Adjusted shuffle mask - v128 smask = ~mask & v128::from8p(op.ra == op.rb ? 0xf : 0x1f); - - // Blend mask for encoded constants - v128 bmask{}; - - for (u32 i = 0; i < 16; i++) - { - if (mask._bytes[i] >= 0xe0) - bmask._bytes[i] = 0x80; - else if (mask._bytes[i] >= 0xc0) - bmask._bytes[i] = 0xff; - } - - const auto a = get_vr(op.ra); - const auto b = get_vr(op.rb); - const auto c = make_const_vector(smask, get_type()); - const auto d = make_const_vector(bmask, get_type()); - - llvm::Value* r = d; - - if ((~mask._u64[0] | ~mask._u64[1]) & 0x8080808080808080) [[likely]] - { - r = m_ir->CreateShuffleVector(b.value, op.ra == op.rb ? b.value : a.value, m_ir->CreateZExt(c, get_type())); - - if ((mask._u64[0] | mask._u64[1]) & 0x8080808080808080) - { - r = m_ir->CreateSelect(m_ir->CreateICmpSLT(make_const_vector(mask, get_type()), llvm::ConstantInt::get(get_type(), 0)), d, r); - } - } - - set_reg_fixed(op.rt4, r); - return; - } - - // Check whether shuffle mask doesn't contain fixed value selectors - bool perm_only = false; - - if (auto k = get_known_bits(c); !!(k.Zero & 0x80)) - { - perm_only = true; - } - - const auto a = get_vr(op.ra); - const auto b = get_vr(op.rb); - - // Data with swapped endian from a load instruction - if (auto [ok, as] = match_expr(a, byteswap(match())); ok) - { - if (auto [ok, bs] = match_expr(b, byteswap(match())); ok) - { - // Undo endian swapping, and rely on pshufb/vperm2b to re-reverse endianness - if (m_use_avx512_icl && (op.ra != op.rb)) - { - if (perm_only) - { - set_vr(op.rt4, vperm2b(as, bs, c)); - return; - } - - const auto m = gf2p8affineqb(c, build(0x40, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x40, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20), 0x7f); - const auto mm = select(noncast(m) >= 0, splat(0), m); - const auto ab = vperm2b(as, bs, c); - set_vr(op.rt4, select(noncast(c) >= 0, ab, mm)); - return; - } - - const auto x = pshufb(build(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0x80, 0x80), (c >> 4)); - const auto ax = pshufb(as, c); - const auto bx = pshufb(bs, c); - - if (perm_only) - set_vr(op.rt4, select_by_bit4(c, ax, bx)); - else - set_vr(op.rt4, select_by_bit4(c, ax, bx) | x); - return; - } - - if (auto [ok, data] = get_const_vector(b.value, m_pos); ok) - { - if (data == v128::from8p(data._u8[0])) - { - if (m_use_avx512_icl) - { - if (perm_only) - { - set_vr(op.rt4, vperm2b256to128(as, b, c)); - return; - } - - const auto m = gf2p8affineqb(c, build(0x40, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x40, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20), 0x7f); - const auto mm = select(noncast(m) >= 0, splat(0), m); - const auto ab = vperm2b256to128(as, b, c); - set_vr(op.rt4, select(noncast(c) >= 0, ab, mm)); - return; - } - // See above - const auto x = pshufb(build(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0x80, 0x80), (c >> 4)); - const auto ax = pshufb(as, c); - - if (perm_only) - set_vr(op.rt4, select_by_bit4(c, ax, b)); - else - set_vr(op.rt4, select_by_bit4(c, ax, b) | x); - return; - } - } - } - - if (auto [ok, bs] = match_expr(b, byteswap(match())); ok) - { - if (auto [ok, data] = get_const_vector(a.value, m_pos); ok) - { - if (data == v128::from8p(data._u8[0])) - { - // See above - const auto x = pshufb(build(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0x80, 0x80), (c >> 4)); - const auto bx = pshufb(bs, c); - - if (perm_only) - set_vr(op.rt4, select_by_bit4(c, a, bx)); - else - set_vr(op.rt4, select_by_bit4(c, a, bx) | x); - return; - } - } - } - - if (m_use_avx512_icl && (op.ra != op.rb || m_interp_magn)) - { - if (auto [ok, data] = get_const_vector(b.value, m_pos); ok) - { - if (data == v128::from8p(data._u8[0])) - { - if (perm_only) - { - set_vr(op.rt4, vperm2b256to128(a, b, eval(c ^ 0xf))); - return; - } - - const auto m = gf2p8affineqb(c, build(0x40, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x40, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20), 0x7f); - const auto mm = select(noncast(m) >= 0, splat(0), m); - const auto ab = vperm2b256to128(a, b, eval(c ^ 0xf)); - set_vr(op.rt4, select(noncast(c) >= 0, ab, mm)); - return; - } - } - - if (auto [ok, data] = get_const_vector(a.value, m_pos); ok) - { - if (data == v128::from8p(data._u8[0])) - { - if (perm_only) - { - set_vr(op.rt4, vperm2b256to128(b, a, eval(c ^ 0x1f))); - return; - } - - const auto m = gf2p8affineqb(c, build(0x40, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x40, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20), 0x7f); - const auto mm = select(noncast(m) >= 0, splat(0), m); - const auto ab = vperm2b256to128(b, a, eval(c ^ 0x1f)); - set_vr(op.rt4, select(noncast(c) >= 0, ab, mm)); - return; - } - } - - if (perm_only) - { - set_vr(op.rt4, vperm2b(a, b, eval(c ^ 0xf))); - return; - } - - const auto m = gf2p8affineqb(c, build(0x40, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x40, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20), 0x7f); - const auto mm = select(noncast(m) >= 0, splat(0), m); - const auto cr = eval(c ^ 0xf); - const auto ab = vperm2b(a, b, cr); - set_vr(op.rt4, select(noncast(c) >= 0, ab, mm)); - return; - } - - const auto x = pshufb(build(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0x80, 0x80), (c >> 4)); - const auto cr = eval(c ^ 0xf); - const auto ax = pshufb(a, cr); - const auto bx = pshufb(b, cr); - - if (perm_only) - set_vr(op.rt4, select_by_bit4(cr, ax, bx)); - else - set_vr(op.rt4, select_by_bit4(cr, ax, bx) | x); - } - - void MPYA(spu_opcode_t op) - { - set_vr(op.rt4, (get_vr(op.ra) << 16 >> 16) * (get_vr(op.rb) << 16 >> 16) + get_vr(op.rc)); - } - - void FSCRRD(spu_opcode_t op) // - { - // Hack - set_vr(op.rt, splat(0)); - } - - void FSCRWR(spu_opcode_t /*op*/) // - { - // Hack - } - - void DFCGT(spu_opcode_t op) // - { - return UNK(op); - } - - void DFCEQ(spu_opcode_t op) // - { - return UNK(op); - } - - void DFCMGT(spu_opcode_t op) // - { - return UNK(op); - } - - void DFCMEQ(spu_opcode_t op) // - { - return UNK(op); - } - - void DFTSV(spu_opcode_t op) // - { - return UNK(op); - } - - void DFA(spu_opcode_t op) - { - set_vr(op.rt, get_vr(op.ra) + get_vr(op.rb)); - } - - void DFS(spu_opcode_t op) - { - set_vr(op.rt, get_vr(op.ra) - get_vr(op.rb)); - } - - void DFM(spu_opcode_t op) - { - set_vr(op.rt, get_vr(op.ra) * get_vr(op.rb)); - } - - void DFMA(spu_opcode_t op) - { - const auto [a, b, c] = get_vrs(op.ra, op.rb, op.rt); - - if (g_cfg.core.use_accurate_dfma) - set_vr(op.rt, fmuladd(a, b, c, true)); - else - set_vr(op.rt, a * b + c); - } - - void DFMS(spu_opcode_t op) - { - const auto [a, b, c] = get_vrs(op.ra, op.rb, op.rt); - - if (g_cfg.core.use_accurate_dfma) - set_vr(op.rt, fmuladd(a, b, -c, true)); - else - set_vr(op.rt, a * b - c); - } - - void DFNMS(spu_opcode_t op) - { - const auto [a, b, c] = get_vrs(op.ra, op.rb, op.rt); - - if (g_cfg.core.use_accurate_dfma) - set_vr(op.rt, fmuladd(-a, b, c, true)); - else - set_vr(op.rt, c - (a * b)); - } - - void DFNMA(spu_opcode_t op) - { - const auto [a, b, c] = get_vrs(op.ra, op.rb, op.rt); - - if (g_cfg.core.use_accurate_dfma) - set_vr(op.rt, -fmuladd(a, b, c, true)); - else - set_vr(op.rt, -(a * b + c)); - } - - bool is_input_positive(value_t a) - { - if (auto [ok, v0, v1] = match_expr(a, match() * match()); ok && v0.eq(v1)) - { - return true; - } - - return false; - } - - // clamping helpers - value_t clamp_positive_smax(value_t v) - { - return eval(bitcast(min(bitcast(v),splat(0x7f7fffff)))); - } - - value_t clamp_negative_smax(value_t v) - { - if (is_input_positive(v)) - { - return v; - } - - return eval(bitcast(min(bitcast(v),splat(0xff7fffff)))); - } - - value_t clamp_smax(value_t v) - { - if (m_use_avx512) - { - if (is_input_positive(v)) - { - return eval(clamp_positive_smax(v)); - } - - if (auto [ok, data] = get_const_vector(v.value, m_pos); ok) - { - // Avoid pessimation when input is constant - return eval(clamp_positive_smax(clamp_negative_smax(v))); - } - - return eval(vrangeps(v, fsplat(std::bit_cast(0x7f7fffff)), 0x2, 0xff)); - } - - return eval(clamp_positive_smax(clamp_negative_smax(v))); - } - - // FMA favouring zeros - value_t xmuladd(value_t a, value_t b, value_t c) - { - const auto ma = eval(sext(fcmp_uno(a != fsplat(0.)))); - const auto mb = eval(sext(fcmp_uno(b != fsplat(0.)))); - const auto ca = eval(bitcast(bitcast(a) & mb)); - const auto cb = eval(bitcast(bitcast(b) & ma)); - return eval(fmuladd(ca, cb, c)); - } - - // Checks for postive and negative zero, or Denormal (treated as zero) - // If sign is +-1 check equality againts all sign bits - bool is_spu_float_zero(v128 a, int sign = 0) - { - for (u32 i = 0; i < 4; i++) - { - const u32 exponent = a._u32[i] & 0x7f800000u; - - if (exponent || (sign && (sign >= 0) != (a._s32[i] >= 0))) - { - // Normalized number - return false; - } - } - return true; - } - - template - static llvm_calli frest(T&& a) - { - return {"spu_frest", {std::forward(a)}}; - } - - void FREST(spu_opcode_t op) - { - // TODO - if (g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::accurate) - { - const auto a = get_vr(op.ra); - const auto mask_ov = sext(bitcast(fabs(a)) > splat(0x7e7fffff)); - const auto mask_de = eval(noncast(sext(fcmp_ord(a == fsplat(0.)))) >> 1); - set_vr(op.rt, (bitcast(fsplat(1.0) / a) & ~mask_ov) | noncast(mask_de)); - return; - } - - if (g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::approximate) - { - register_intrinsic("spu_frest", [&](llvm::CallInst* ci) - { - const auto a = value(ci->getOperand(0)); - // Gives accuracy penalty, frest result is within one newton-raphson iteration for accuracy - const auto approx_result = fsplat(0.999875069f) / a; - // Zeroes the last 11 bytes of the mantissa so FI calculations end up correct if needed - return bitcast(bitcast(approx_result) & splat(0xFFFFF800)); - }); - } - else - { - register_intrinsic("spu_frest", [&](llvm::CallInst* ci) - { - const auto a = value(ci->getOperand(0)); - // Fast but this makes the result vary per cpu - return fre(a); - }); - } - - set_vr(op.rt, frest(get_vr(op.ra))); - } - - template - static llvm_calli frsqest(T&& a) - { - return {"spu_frsqest", {std::forward(a)}}; - } - - void FRSQEST(spu_opcode_t op) - { - // TODO - if (g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::accurate) - { - set_vr(op.rt, fsplat(1.0) / fsqrt(fabs(get_vr(op.ra)))); - return; - } - - if (g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::approximate) - { - register_intrinsic("spu_frsqest", [&](llvm::CallInst* ci) - { - const auto a = value(ci->getOperand(0)); - // Gives accuracy penalty, frsqest result is within one newton-raphson iteration for accuracy - const auto approx_result = fsplat(0.999763668f) / fsqrt(fabs(a)); - // Zeroes the last 11 bytes of the mantissa so FI calculations end up correct if needed - return bitcast(bitcast(approx_result) & splat(0xFFFFF800)); - }); - } - else - { - register_intrinsic("spu_frsqest", [&](llvm::CallInst* ci) - { - const auto a = value(ci->getOperand(0)); - // Fast but this makes the result vary per cpu - return frsqe(fabs(a)); - }); - } - - set_vr(op.rt, frsqest(get_vr(op.ra))); - } - - template - static llvm_calli fcgt(T&& a, U&& b) - { - return {"spu_fcgt", {std::forward(a), std::forward(b)}}; - } - - void FCGT(spu_opcode_t op) - { - if (g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::accurate) - { - set_vr(op.rt, sext(fcmp_ord(get_vr(op.ra) > get_vr(op.rb)))); - return; - } - - register_intrinsic("spu_fcgt", [&](llvm::CallInst* ci) - { - const auto a = value(ci->getOperand(0)); - const auto b = value(ci->getOperand(1)); - - const value_t ab[2]{a, b}; - - std::bitset<2> safe_int_compare(0); - std::bitset<2> safe_nonzero_compare(0); - - for (u32 i = 0; i < 2; i++) - { - if (auto [ok, data] = get_const_vector(ab[i].value, m_pos, __LINE__ + i); ok) - { - safe_int_compare.set(i); - safe_nonzero_compare.set(i); - - for (u32 j = 0; j < 4; j++) - { - const u32 value = data._u32[j]; - const u8 exponent = static_cast(value >> 23); - - if (value >= 0x7f7fffffu || !exponent) - { - // Postive or negative zero, Denormal (treated as zero), Negative constant, or Normalized number with exponent +127 - // Cannot used signed integer compare safely - // Note: Technically this optimization is accurate for any positive value, but due to the fact that - // we don't produce "extended range" values the same way as real hardware, it's not safe to apply - // this optimization for values outside of the range of x86 floating point hardware. - safe_int_compare.reset(i); - if (!exponent) safe_nonzero_compare.reset(i); - } - } - } - } - - if (safe_int_compare.any()) - { - return eval(sext(bitcast(a) > bitcast(b))); - } - - if (g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::approximate || g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::relaxed) - { - const auto ai = eval(bitcast(a)); - const auto bi = eval(bitcast(b)); - - if (!safe_nonzero_compare.any()) - { - return eval(sext(fcmp_uno(a != b) & select((ai & bi) >= 0, ai > bi, ai < bi))); - } - else - { - return eval(sext(select((ai & bi) >= 0, ai > bi, ai < bi))); - } - } - else - { - return eval(sext(fcmp_ord(a > b))); - } - }); - - set_vr(op.rt, fcgt(get_vr(op.ra), get_vr(op.rb))); - } - - template - static llvm_calli fcmgt(T&& a, U&& b) - { - return {"spu_fcmgt", {std::forward(a), std::forward(b)}}; - } - - void FCMGT(spu_opcode_t op) - { - if (g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::accurate) - { - set_vr(op.rt, sext(fcmp_ord(fabs(get_vr(op.ra)) > fabs(get_vr(op.rb))))); - return; - } - - register_intrinsic("spu_fcmgt", [&](llvm::CallInst* ci) - { - const auto a = value(ci->getOperand(0)); - const auto b = value(ci->getOperand(1)); - - const value_t ab[2]{a, b}; - - std::bitset<2> safe_int_compare(0); - - for (u32 i = 0; i < 2; i++) - { - if (auto [ok, data] = get_const_vector(ab[i].value, m_pos, __LINE__ + i); ok) - { - safe_int_compare.set(i); - - for (u32 j = 0; j < 4; j++) - { - const u32 value = data._u32[j]; - const u8 exponent = static_cast(value >> 23); - - if ((value & 0x7fffffffu) >= 0x7f7fffffu || !exponent) - { - // See above - safe_int_compare.reset(i); - } - } - } - } - - const auto ma = eval(fabs(a)); - const auto mb = eval(fabs(b)); - - const auto mai = eval(bitcast(ma)); - const auto mbi = eval(bitcast(mb)); - - if (safe_int_compare.any()) - { - return eval(sext(mai > mbi)); - } - - if (g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::approximate) - { - return eval(sext(fcmp_uno(ma > mb) & (mai > mbi))); - } - else - { - return eval(sext(fcmp_ord(ma > mb))); - } - }); - - set_vr(op.rt, fcmgt(get_vr(op.ra), get_vr(op.rb))); - } - - template - static llvm_calli fa(T&& a, U&& b) - { - return {"spu_fa", {std::forward(a), std::forward(b)}}; - } - - void FA(spu_opcode_t op) - { - if (g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::accurate) - { - set_vr(op.rt, get_vr(op.ra) + get_vr(op.rb)); - return; - } - - register_intrinsic("spu_fa", [&](llvm::CallInst* ci) - { - const auto a = value(ci->getOperand(0)); - const auto b = value(ci->getOperand(1)); - - return a + b; - }); - - set_vr(op.rt, fa(get_vr(op.ra), get_vr(op.rb))); - } - - template - static llvm_calli fs(T&& a, U&& b) - { - return {"spu_fs", {std::forward(a), std::forward(b)}}; - } - - void FS(spu_opcode_t op) - { - if (g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::accurate) - { - set_vr(op.rt, get_vr(op.ra) - get_vr(op.rb)); - return; - } - - register_intrinsic("spu_fs", [&](llvm::CallInst* ci) - { - const auto a = value(ci->getOperand(0)); - const auto b = value(ci->getOperand(1)); - - if (g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::approximate) - { - const auto bc = clamp_smax(b); // for #4478 - return eval(a - bc); - } - else - { - return eval(a - b); - } - }); - - set_vr(op.rt, fs(get_vr(op.ra), get_vr(op.rb))); - } - - template - static llvm_calli fm(T&& a, U&& b) - { - return {"spu_fm", {std::forward(a), std::forward(b)}}; - } - - void FM(spu_opcode_t op) - { - if (g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::accurate) - { - set_vr(op.rt, get_vr(op.ra) * get_vr(op.rb)); - return; - } - - register_intrinsic("spu_fm", [&](llvm::CallInst* ci) - { - const auto a = value(ci->getOperand(0)); - const auto b = value(ci->getOperand(1)); - - if (g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::approximate) - { - if (a.value == b.value) - { - return eval(a * b); - } - - const auto ma = sext(fcmp_uno(a != fsplat(0.))); - const auto mb = sext(fcmp_uno(b != fsplat(0.))); - return eval(bitcast(bitcast(a * b) & ma & mb)); - } - else - { - return eval(a * b); - } - }); - - const auto [a, b] = get_vrs(op.ra, op.rb); - - if (op.ra == op.rb && !m_interp_magn) - { - set_vr(op.rt, fm(a, a)); - return; - } - - set_vr(op.rt, fm(a, b)); - } - - template - static llvm_calli fesd(T&& a) - { - return {"spu_fesd", {std::forward(a)}}; - } - - void FESD(spu_opcode_t op) - { - if (g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::accurate) - { - const auto r = zshuffle(get_vr(op.ra), 1, 3); - const auto d = bitcast(r); - const auto a = eval(d & 0x7fffffffffffffff); - const auto s = eval(d & 0x8000000000000000); - const auto i = select(a == 0x47f0000000000000, eval(s | 0x7ff0000000000000), d); - const auto n = select(a > 0x47f0000000000000, splat(0x7ff8000000000000), i); - set_vr(op.rt, bitcast(n)); - return; - } - - register_intrinsic("spu_fesd", [&](llvm::CallInst* ci) - { - const auto a = value(ci->getOperand(0)); - - return fpcast(zshuffle(a, 1, 3)); - }); - - set_vr(op.rt, fesd(get_vr(op.ra))); - } - - template - static llvm_calli frds(T&& a) - { - return {"spu_frds", {std::forward(a)}}; - } - - void FRDS(spu_opcode_t op) - { - if (g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::accurate) - { - const auto r = get_vr(op.ra); - const auto d = bitcast(r); - const auto a = eval(d & 0x7fffffffffffffff); - const auto s = eval(d & 0x8000000000000000); - const auto i = select(a > 0x47f0000000000000, eval(s | 0x47f0000000000000), d); - const auto n = select(a > 0x7ff0000000000000, splat(0x47f8000000000000), i); - const auto z = select(a < 0x3810000000000000, s, n); - set_vr(op.rt, zshuffle(bitcast(z), 2, 0, 3, 1), nullptr, false); - return; - } - - register_intrinsic("spu_frds", [&](llvm::CallInst* ci) - { - const auto a = value(ci->getOperand(0)); - - return zshuffle(fpcast(a), 2, 0, 3, 1); - }); - - set_vr(op.rt, frds(get_vr(op.ra))); - } - - template - static llvm_calli fceq(T&& a, U&& b) - { - return {"spu_fceq", {std::forward(a), std::forward(b)}}; - } - - void FCEQ(spu_opcode_t op) - { - if (g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::accurate) - { - set_vr(op.rt, sext(fcmp_ord(get_vr(op.ra) == get_vr(op.rb)))); - return; - } - - register_intrinsic("spu_fceq", [&](llvm::CallInst* ci) - { - const auto a = value(ci->getOperand(0)); - const auto b = value(ci->getOperand(1)); - - const value_t ab[2]{a, b}; - - std::bitset<2> safe_float_compare(0); - std::bitset<2> safe_int_compare(0); - - for (u32 i = 0; i < 2; i++) - { - if (auto [ok, data] = get_const_vector(ab[i].value, m_pos, __LINE__ + i); ok) - { - safe_float_compare.set(i); - safe_int_compare.set(i); - - for (u32 j = 0; j < 4; j++) - { - const u32 value = data._u32[j]; - const u8 exponent = static_cast(value >> 23); - - // unsafe if nan - if (exponent == 255) - { - safe_float_compare.reset(i); - } - - // unsafe if denormal or 0 - if (!exponent) - { - safe_int_compare.reset(i); - } - } - } - } - - if (safe_float_compare.any()) - { - return eval(sext(fcmp_ord(a == b))); - } - - if (safe_int_compare.any()) - { - return eval(sext(bitcast(a) == bitcast(b))); - } - - if (g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::approximate) - { - return eval(sext(fcmp_ord(a == b)) | sext(bitcast(a) == bitcast(b))); - } - else - { - return eval(sext(fcmp_ord(a == b))); - } - }); - - set_vr(op.rt, fceq(get_vr(op.ra), get_vr(op.rb))); - } - - template - static llvm_calli fcmeq(T&& a, U&& b) - { - return {"spu_fcmeq", {std::forward(a), std::forward(b)}}; - } - - void FCMEQ(spu_opcode_t op) - { - if (g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::accurate) - { - set_vr(op.rt, sext(fcmp_ord(fabs(get_vr(op.ra)) == fabs(get_vr(op.rb))))); - return; - } - - register_intrinsic("spu_fcmeq", [&](llvm::CallInst* ci) - { - const auto a = value(ci->getOperand(0)); - const auto b = value(ci->getOperand(1)); - - const value_t ab[2]{a, b}; - - std::bitset<2> safe_float_compare(0); - std::bitset<2> safe_int_compare(0); - - for (u32 i = 0; i < 2; i++) - { - if (auto [ok, data] = get_const_vector(ab[i].value, m_pos, __LINE__ + i); ok) - { - safe_float_compare.set(i); - safe_int_compare.set(i); - - for (u32 j = 0; j < 4; j++) - { - const u32 value = data._u32[j]; - const u8 exponent = static_cast(value >> 23); - - // unsafe if nan - if (exponent == 255) - { - safe_float_compare.reset(i); - } - - // unsafe if denormal or 0 - if (!exponent) - { - safe_int_compare.reset(i); - } - } - } - } - - const auto fa = eval(fabs(a)); - const auto fb = eval(fabs(b)); - - if (safe_float_compare.any()) - { - return eval(sext(fcmp_ord(fa == fb))); - } - - if (safe_int_compare.any()) - { - return eval(sext(bitcast(fa) == bitcast(fb))); - } - - if (g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::approximate) - { - return eval(sext(fcmp_ord(fa == fb)) | sext(bitcast(fa) == bitcast(fb))); - } - else - { - return eval(sext(fcmp_ord(fa == fb))); - } - }); - - set_vr(op.rt, fcmeq(get_vr(op.ra), get_vr(op.rb))); - } - - value_t fma32x4(value_t a, value_t b, value_t c) - { - // Optimization: Emit only a floating multiply if the addend is zero - // This is odd since SPU code could just use the FM instruction, but it seems common enough - if (auto [ok, data] = get_const_vector(c.value, m_pos); ok) - { - if (is_spu_float_zero(data, -1)) - { - return eval(a * b); - } - - if (!m_use_fma && is_spu_float_zero(data, +1)) - { - return eval(a * b + fsplat(0.f)); - } - } - - if ([&]() - { - if (auto [ok, data] = get_const_vector(a.value, m_pos); ok) - { - if (!is_spu_float_zero(data, +1)) - { - return false; - } - - if (auto [ok0, data0] = get_const_vector(b.value, m_pos); ok0) - { - if (is_spu_float_zero(data0, +1)) - { - return true; - } - } - } - - if (auto [ok, data] = get_const_vector(a.value, m_pos); ok) - { - if (!is_spu_float_zero(data, -1)) - { - return false; - } - - if (auto [ok0, data0] = get_const_vector(b.value, m_pos); ok0) - { - if (is_spu_float_zero(data0, -1)) - { - return true; - } - } - } - - return false; - }()) - { - // Just return the added value if both a and b is +0 or -0 (+0 and -0 arent't allowed alone) - return c; - } - - if (m_use_fma) - { - return eval(fmuladd(a, b, c, true)); - } - - // Convert to doubles - const auto xa = fpcast(a); - const auto xb = fpcast(b); - const auto xc = fpcast(c); - const auto xr = fmuladd(xa, xb, xc, false); - return eval(fpcast(xr)); - } - - template - static llvm_calli fnms(T&& a, U&& b, V&& c) - { - return {"spu_fnms", {std::forward(a), std::forward(b), std::forward(c)}}; - } - - void FNMS(spu_opcode_t op) - { - // See FMA. - if (g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::accurate) - { - const auto [a, b, c] = get_vrs(op.ra, op.rb, op.rc); - set_vr(op.rt4, fmuladd(-a, b, c)); - return; - } - - register_intrinsic("spu_fnms", [&](llvm::CallInst* ci) - { - const auto a = value(ci->getOperand(0)); - const auto b = value(ci->getOperand(1)); - const auto c = value(ci->getOperand(2)); - - if (g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::approximate || g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::relaxed) - { - return fma32x4(eval(-clamp_smax(a)), clamp_smax(b), c); - } - else - { - return fma32x4(eval(-a), b, c); - } - }); - - set_vr(op.rt4, fnms(get_vr(op.ra), get_vr(op.rb), get_vr(op.rc))); - } - - template - static llvm_calli fma(T&& a, U&& b, V&& c) - { - return {"spu_fma", {std::forward(a), std::forward(b), std::forward(c)}}; - } - - void FMA(spu_opcode_t op) - { - // Hardware FMA produces the same result as multiple + add on the limited double range (xfloat). - if (g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::accurate) - { - const auto [a, b, c] = get_vrs(op.ra, op.rb, op.rc); - set_vr(op.rt4, fmuladd(a, b, c)); - return; - } - - register_intrinsic("spu_fma", [&](llvm::CallInst* ci) - { - const auto a = value(ci->getOperand(0)); - const auto b = value(ci->getOperand(1)); - const auto c = value(ci->getOperand(2)); - - if (g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::approximate) - { - const auto ma = sext(fcmp_uno(a != fsplat(0.))); - const auto mb = sext(fcmp_uno(b != fsplat(0.))); - const auto ca = bitcast(bitcast(a) & mb); - const auto cb = bitcast(bitcast(b) & ma); - return fma32x4(eval(ca), eval(cb), c); - } - else - { - return fma32x4(a, b, c); - } - }); - - const auto [a, b, c] = get_vrs(op.ra, op.rb, op.rc); - - static const auto MT = match(); - - // Match sqrt - if (auto [ok_fnma, a1, b1] = match_expr(a, fnms(MT, MT, fsplat(1.00000011920928955078125))); ok_fnma) - { - if (auto [ok_fm2, a2] = match_expr(b, fm(MT, fsplat(0.5))); ok_fm2 && a2.eq(b1)) - { - if (auto [ok_fm1, a3, b3] = match_expr(c, fm(MT, MT)); ok_fm1 && a3.eq(a1)) - { - if (auto [ok_sqrte, src] = match_expr(a3, spu_rsqrte(MT)); ok_sqrte && src.eq(b3)) - { - erase_stores(a, b, c, a3); - set_vr(op.rt4, fsqrt(fabs(src))); - return; - } - } - } - } - - // Match division (fast) - if (auto [ok_fnma, divb, diva] = match_expr(a, fnms(c, MT, MT)); ok_fnma) - { - if (auto [ok_fm] = match_expr(c, fm(diva, b)); ok_fm) - { - if (auto [ok_re] = match_expr(b, spu_re(divb)); ok_re) - { - erase_stores(b, c); - set_vr(op.rt4, diva / divb); - return; - } - } - } - - set_vr(op.rt4, fma(a, b, c)); - } - - template - static llvm_calli fms(T&& a, U&& b, V&& c) - { - return {"spu_fms", {std::forward(a), std::forward(b), std::forward(c)}}; - } - - void FMS(spu_opcode_t op) - { - // See FMA. - if (g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::accurate) - { - const auto [a, b, c] = get_vrs(op.ra, op.rb, op.rc); - set_vr(op.rt4, fmuladd(a, b, -c)); - return; - } - - register_intrinsic("spu_fms", [&](llvm::CallInst* ci) - { - const auto a = value(ci->getOperand(0)); - const auto b = value(ci->getOperand(1)); - const auto c = value(ci->getOperand(2)); - - if (g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::approximate) - { - return fma32x4(clamp_smax(a), clamp_smax(b), eval(-c)); - } - else - { - return fma32x4(a, b, eval(-c)); - } - }); - - set_vr(op.rt4, fms(get_vr(op.ra), get_vr(op.rb), get_vr(op.rc))); - } - - template - static llvm_calli fi(T&& a, U&& b) - { - return {"spu_fi", {std::forward(a), std::forward(b)}}; - } - - template - static llvm_calli spu_re(T&& a) - { - return {"spu_re", {std::forward(a)}}; - } - - template - static llvm_calli spu_rsqrte(T&& a) - { - return {"spu_rsqrte", {std::forward(a)}}; - } - - void FI(spu_opcode_t op) - { - // TODO - if (g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::accurate) - { - set_vr(op.rt, get_vr(op.rb)); - // const auto [a, b] = get_vrs(op.ra, op.rb); - - // const auto mask_se = splat(0xfff0000000000000ull); - // const auto mask_bf = splat(0x000fff8000000000ull); - // const auto mask_sf = splat(0x0000007fe0000000ull); - // const auto mask_yf = splat(0x0000ffffe0000000ull); - - // const auto base = bitcast((bitcast(b) & mask_bf) | 0x3ff0000000000000ull); - // const auto step = fpcast(bitcast(b) & mask_sf) * fsplat(std::exp2(-13.f)); - // const auto yval = fpcast(bitcast(a) & mask_yf) * fsplat(std::exp2(-19.f)); - // set_vr(op.rt, bitcast((bitcast(b) & mask_se) | (bitcast(base - step * yval) & ~mask_se))); - return; - } - - register_intrinsic("spu_fi", [&](llvm::CallInst* ci) - { - const auto a = bitcast(value(ci->getOperand(0))); - const auto b = bitcast(value(ci->getOperand(1))); - - const auto base = (b & 0x007ffc00u) << 9; // Base fraction - const auto ymul = (b & 0x3ff) * (a & 0x7ffff); // Step fraction * Y fraction (fixed point at 2^-32) - const auto bnew = bitcast((base - ymul) >> 9) + (sext(ymul <= base) & (1 << 23)); // Subtract and correct invisible fraction bit - return bitcast((b & 0xff800000u) | (bitcast(fpcast(bnew)) & ~0xff800000u)); // Inject old sign and exponent - }); - - if (g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::approximate) - { - register_intrinsic("spu_re", [&](llvm::CallInst* ci) - { - const auto a = value(ci->getOperand(0)); - // Gives accuracy penalty, frest result is within one newton-raphson iteration for accuracy - const auto approx_result = fsplat(0.999875069f) / a; - return approx_result; - }); - - register_intrinsic("spu_rsqrte", [&](llvm::CallInst* ci) - { - const auto a = value(ci->getOperand(0)); - // Gives accuracy penalty, frsqest result is within one newton-raphson iteration for accuracy - const auto approx_result = fsplat(0.999763668f) / fsqrt(fabs(a)); - return approx_result; - }); - } - else - { - // For relaxed use intrinsics, those make the results vary per cpu - register_intrinsic("spu_re", [&](llvm::CallInst* ci) - { - const auto a = value(ci->getOperand(0)); - return fre(a); - }); - - register_intrinsic("spu_rsqrte", [&](llvm::CallInst* ci) - { - const auto a = value(ci->getOperand(0)); - return frsqe(a); - }); - } - - const auto [a, b] = get_vrs(op.ra, op.rb); - - if (const auto [ok, mb] = match_expr(b, frest(match())); ok && mb.eq(a)) - { - erase_stores(b); - set_vr(op.rt, spu_re(a)); - return; - } - - if (const auto [ok, mb] = match_expr(b, frsqest(match())); ok && mb.eq(a)) - { - erase_stores(b); - set_vr(op.rt, spu_rsqrte(a)); - return; - } - - const auto r = eval(fi(a, b)); - if (!m_interp_magn) - spu_log.todo("[%s:0x%05x] Unmatched spu_fi found", m_hash, m_pos); - - set_vr(op.rt, r); - } - - void CFLTS(spu_opcode_t op) - { - if (g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::accurate) - { - value_t a = get_vr(op.ra); - value_t s; - if (m_interp_magn) - s = eval(vsplat(bitcast(((1023 + 173) - get_imm(op.i8)) << 52))); - else - s = eval(fsplat(std::exp2(static_cast(173 - op.i8)))); - if (op.i8 != 173 || m_interp_magn) - a = eval(a * s); - - value_t r; - - if (auto ca = llvm::dyn_cast(a.value)) - { - const f64 data[4] - { - ca->getElementAsDouble(0), - ca->getElementAsDouble(1), - ca->getElementAsDouble(2), - ca->getElementAsDouble(3) - }; - - v128 result; - - for (u32 i = 0; i < 4; i++) - { - if (data[i] >= std::exp2(31.f)) - { - result._s32[i] = smax; - } - else if (data[i] < std::exp2(-31.f)) - { - result._s32[i] = smin; - } - else - { - result._s32[i] = static_cast(data[i]); - } - } - - r.value = make_const_vector(result, get_type()); - set_vr(op.rt, r); - return; - } - - if (llvm::isa(a.value)) - { - set_vr(op.rt, splat(0)); - return; - } - - r.value = m_ir->CreateFPToSI(a.value, get_type()); - set_vr(op.rt, r ^ sext(fcmp_ord(a >= fsplat(std::exp2(31.f))))); - } - else - { - value_t a = get_vr(op.ra); - value_t s; - if (m_interp_magn) - s = eval(vsplat(load_const(m_scale_float_to, get_imm(op.i8)))); - else - s = eval(fsplat(std::exp2(static_cast(static_cast(173 - op.i8))))); - if (op.i8 != 173 || m_interp_magn) - a = eval(a * s); - - value_t r; - r.value = m_ir->CreateFPToSI(a.value, get_type()); - set_vr(op.rt, r ^ sext(bitcast(a) > splat(((31 + 127) << 23) - 1))); - } - } - - void CFLTU(spu_opcode_t op) - { - if (g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::accurate) - { - value_t a = get_vr(op.ra); - value_t s; - if (m_interp_magn) - s = eval(vsplat(bitcast(((1023 + 173) - get_imm(op.i8)) << 52))); - else - s = eval(fsplat(std::exp2(static_cast(173 - op.i8)))); - if (op.i8 != 173 || m_interp_magn) - a = eval(a * s); - - value_t r; - - if (auto ca = llvm::dyn_cast(a.value)) - { - const f64 data[4] - { - ca->getElementAsDouble(0), - ca->getElementAsDouble(1), - ca->getElementAsDouble(2), - ca->getElementAsDouble(3) - }; - - v128 result; - - for (u32 i = 0; i < 4; i++) - { - if (data[i] >= std::exp2(32.f)) - { - result._u32[i] = umax; - } - else if (data[i] < 0.) - { - result._u32[i] = 0; - } - else - { - result._u32[i] = static_cast(data[i]); - } - } - - r.value = make_const_vector(result, get_type()); - set_vr(op.rt, r); - return; - } - - if (llvm::isa(a.value)) - { - set_vr(op.rt, splat(0)); - return; - } - - r.value = m_ir->CreateFPToUI(a.value, get_type()); - set_vr(op.rt, select(fcmp_ord(a >= fsplat(std::exp2(32.f))), splat(-1), r & sext(fcmp_ord(a >= fsplat(0.))))); - } - else - { - value_t a = get_vr(op.ra); - value_t s; - if (m_interp_magn) - s = eval(vsplat(load_const(m_scale_float_to, get_imm(op.i8)))); - else - s = eval(fsplat(std::exp2(static_cast(static_cast(173 - op.i8))))); - if (op.i8 != 173 || m_interp_magn) - a = eval(a * s); - - value_t r; - - if (m_use_avx512) - { - const auto sc = eval(bitcast(max(bitcast(a),splat(0x0)))); - r.value = m_ir->CreateFPToUI(sc.value, get_type()); - set_vr(op.rt, r); - return; - } - - r.value = m_ir->CreateFPToUI(a.value, get_type()); - set_vr(op.rt, select(bitcast(a) > splat(((32 + 127) << 23) - 1), splat(-1), r & ~(bitcast(a) >> 31))); - } - } - - void CSFLT(spu_opcode_t op) - { - if (g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::accurate) - { - value_t a = get_vr(op.ra); - value_t r; - - if (auto [ok, data] = get_const_vector(a.value, m_pos); ok) - { - r.value = build(data._s32[0], data._s32[1], data._s32[2], data._s32[3]).eval(m_ir); - } - else - { - r.value = m_ir->CreateSIToFP(a.value, get_type()); - } - - value_t s; - if (m_interp_magn) - s = eval(vsplat(bitcast((get_imm(op.i8) + (1023 - 155)) << 52))); - else - s = eval(fsplat(std::exp2(static_cast(op.i8 - 155)))); - if (op.i8 != 155 || m_interp_magn) - r = eval(r * s); - set_vr(op.rt, r); - } - else - { - value_t r; - r.value = m_ir->CreateSIToFP(get_vr(op.ra).value, get_type()); - value_t s; - if (m_interp_magn) - s = eval(vsplat(load_const(m_scale_to_float, get_imm(op.i8)))); - else - s = eval(fsplat(std::exp2(static_cast(static_cast(op.i8 - 155))))); - if (op.i8 != 155 || m_interp_magn) - r = eval(r * s); - set_vr(op.rt, r); - } - } - - void CUFLT(spu_opcode_t op) - { - if (g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::accurate) - { - value_t a = get_vr(op.ra); - value_t r; - - if (auto [ok, data] = get_const_vector(a.value, m_pos); ok) - { - r.value = build(data._u32[0], data._u32[1], data._u32[2], data._u32[3]).eval(m_ir); - } - else - { - r.value = m_ir->CreateUIToFP(a.value, get_type()); - } - - value_t s; - if (m_interp_magn) - s = eval(vsplat(bitcast((get_imm(op.i8) + (1023 - 155)) << 52))); - else - s = eval(fsplat(std::exp2(static_cast(op.i8 - 155)))); - if (op.i8 != 155 || m_interp_magn) - r = eval(r * s); - set_vr(op.rt, r); - } - else - { - value_t r; - r.value = m_ir->CreateUIToFP(get_vr(op.ra).value, get_type()); - value_t s; - if (m_interp_magn) - s = eval(vsplat(load_const(m_scale_to_float, get_imm(op.i8)))); - else - s = eval(fsplat(std::exp2(static_cast(static_cast(op.i8 - 155))))); - if (op.i8 != 155 || m_interp_magn) - r = eval(r * s); - set_vr(op.rt, r); - } - } - - void make_store_ls(value_t addr, value_t data) - { - const auto bswapped = byteswap(data); - m_ir->CreateStore(bswapped.eval(m_ir), m_ir->CreateGEP(get_type(), m_lsptr, addr.value)); - } - - auto make_load_ls(value_t addr) - { - value_t data; - data.value = m_ir->CreateLoad(get_type(), m_ir->CreateGEP(get_type(), m_lsptr, addr.value)); - return byteswap(data); - } - - void STQX(spu_opcode_t op) - { - const auto a = get_vr(op.ra); - const auto b = get_vr(op.rb); - - for (auto pair : std::initializer_list, value_t>>{{a, b}, {b, a}}) - { - if (auto [ok, data] = get_const_vector(pair.first.value, m_pos); ok) - { - data._u32[3] %= SPU_LS_SIZE; - - if (data._u32[3] % 0x10 == 0) - { - value_t addr = eval(splat(data._u32[3]) + zext(extract(pair.second, 3) & 0x3fff0)); - make_store_ls(addr, get_vr(op.rt)); - return; - } - } - } - - value_t addr = eval(zext((extract(a, 3) + extract(b, 3)) & 0x3fff0)); - make_store_ls(addr, get_vr(op.rt)); - } - - void LQX(spu_opcode_t op) - { - const auto a = get_vr(op.ra); - const auto b = get_vr(op.rb); - - for (auto pair : std::initializer_list, value_t>>{{a, b}, {b, a}}) - { - if (auto [ok, data] = get_const_vector(pair.first.value, m_pos); ok) - { - data._u32[3] %= SPU_LS_SIZE; - - if (data._u32[3] % 0x10 == 0) - { - value_t addr = eval(splat(data._u32[3]) + zext(extract(pair.second, 3) & 0x3fff0)); - set_vr(op.rt, make_load_ls(addr)); - return; - } - } - } - - value_t addr = eval(zext((extract(a, 3) + extract(b, 3)) & 0x3fff0)); - set_vr(op.rt, make_load_ls(addr)); - } - - void STQA(spu_opcode_t op) - { - value_t addr = eval((get_imm(op.i16, false) << 2) & 0x3fff0); - make_store_ls(addr, get_vr(op.rt)); - } - - void LQA(spu_opcode_t op) - { - value_t addr = eval((get_imm(op.i16, false) << 2) & 0x3fff0); - set_vr(op.rt, make_load_ls(addr)); - } - - llvm::Value* get_pc_as_u64(u32 addr) - { - return m_ir->CreateAdd(m_ir->CreateZExt(m_base_pc, get_type()), m_ir->getInt64(addr - m_base)); - } - - void STQR(spu_opcode_t op) // - { - value_t addr; - addr.value = m_interp_magn ? m_ir->CreateZExt(m_interp_pc, get_type()) : get_pc_as_u64(m_pos); - addr = eval(((get_imm(op.i16, false) << 2) + addr) & (m_interp_magn ? 0x3fff0 : ~0xf)); - make_store_ls(addr, get_vr(op.rt)); - } - - void LQR(spu_opcode_t op) // - { - value_t addr; - addr.value = m_interp_magn ? m_ir->CreateZExt(m_interp_pc, get_type()) : get_pc_as_u64(m_pos); - addr = eval(((get_imm(op.i16, false) << 2) + addr) & (m_interp_magn ? 0x3fff0 : ~0xf)); - set_vr(op.rt, make_load_ls(addr)); - } - - void STQD(spu_opcode_t op) - { - if (m_finfo && m_finfo->fn) - { - if (op.rt <= s_reg_sp || (op.rt >= s_reg_80 && op.rt <= s_reg_127)) - { - if (m_block->bb->reg_save_dom[op.rt] && get_reg_raw(op.rt) == m_finfo->load[op.rt]) - { - return; - } - } - } - - value_t addr = eval(zext(extract(get_vr(op.ra), 3) & 0x3fff0) + (get_imm(op.si10) << 4)); - make_store_ls(addr, get_vr(op.rt)); - } - - void LQD(spu_opcode_t op) - { - value_t addr = eval(zext(extract(get_vr(op.ra), 3) & 0x3fff0) + (get_imm(op.si10) << 4)); - set_vr(op.rt, make_load_ls(addr)); - } - - void make_halt(value_t cond) - { - const auto next = llvm::BasicBlock::Create(m_context, "", m_function); - const auto halt = llvm::BasicBlock::Create(m_context, "", m_function); - m_ir->CreateCondBr(cond.value, halt, next, m_md_unlikely); - m_ir->SetInsertPoint(halt); - if (m_interp_magn) - m_ir->CreateStore(m_function->getArg(2), spu_ptr(&spu_thread::pc)); - else - update_pc(); - const auto ptr = _ptr(m_memptr, 0xffdead00); - m_ir->CreateStore(m_ir->getInt32("HALT"_u32), ptr); - m_ir->CreateBr(next); - m_ir->SetInsertPoint(next); - } - - void HGT(spu_opcode_t op) - { - const auto cond = eval(extract(get_vr(op.ra), 3) > extract(get_vr(op.rb), 3)); - make_halt(cond); - } - - void HEQ(spu_opcode_t op) - { - const auto cond = eval(extract(get_vr(op.ra), 3) == extract(get_vr(op.rb), 3)); - make_halt(cond); - } - - void HLGT(spu_opcode_t op) - { - const auto cond = eval(extract(get_vr(op.ra), 3) > extract(get_vr(op.rb), 3)); - make_halt(cond); - } - - void HGTI(spu_opcode_t op) - { - const auto cond = eval(extract(get_vr(op.ra), 3) > get_imm(op.si10)); - make_halt(cond); - } - - void HEQI(spu_opcode_t op) - { - const auto cond = eval(extract(get_vr(op.ra), 3) == get_imm(op.si10)); - make_halt(cond); - } - - void HLGTI(spu_opcode_t op) - { - const auto cond = eval(extract(get_vr(op.ra), 3) > get_imm(op.si10)); - make_halt(cond); - } - - void HBR([[maybe_unused]] spu_opcode_t op) // - { - // TODO: use the hint. - } - - void HBRA([[maybe_unused]] spu_opcode_t op) // - { - // TODO: use the hint. - } - - void HBRR([[maybe_unused]] spu_opcode_t op) // - { - // TODO: use the hint. - } - - // TODO - static u32 exec_check_interrupts(spu_thread* _spu, u32 addr) - { - _spu->set_interrupt_status(true); - - if (_spu->ch_events.load().count) - { - _spu->interrupts_enabled = false; - _spu->srr0 = addr; - - // Test for BR/BRA instructions (they are equivalent at zero pc) - const u32 br = _spu->_ref(0); - - if ((br & 0xfd80007f) == 0x30000000) - { - return (br >> 5) & 0x3fffc; - } - - return 0; - } - - return addr; - } - - llvm::BasicBlock* add_block_indirect(spu_opcode_t op, value_t addr, bool ret = true) - { - if (m_interp_magn) - { - m_interp_bblock = llvm::BasicBlock::Create(m_context, "", m_function); - - const auto cblock = m_ir->GetInsertBlock(); - const auto result = llvm::BasicBlock::Create(m_context, "", m_function); - const auto e_exec = llvm::BasicBlock::Create(m_context, "", m_function); - const auto d_test = llvm::BasicBlock::Create(m_context, "", m_function); - const auto d_exec = llvm::BasicBlock::Create(m_context, "", m_function); - const auto d_done = llvm::BasicBlock::Create(m_context, "", m_function); - m_ir->SetInsertPoint(result); - m_ir->CreateCondBr(get_imm(op.e).value, e_exec, d_test, m_md_unlikely); - m_ir->SetInsertPoint(e_exec); - const auto e_addr = call("spu_check_interrupts", &exec_check_interrupts, m_thread, addr.value); - m_ir->CreateBr(d_test); - m_ir->SetInsertPoint(d_test); - const auto target = m_ir->CreatePHI(get_type(), 2); - target->addIncoming(addr.value, result); - target->addIncoming(e_addr, e_exec); - m_ir->CreateCondBr(get_imm(op.d).value, d_exec, d_done, m_md_unlikely); - m_ir->SetInsertPoint(d_exec); - m_ir->CreateStore(m_ir->getFalse(), spu_ptr(&spu_thread::interrupts_enabled)); - m_ir->CreateBr(d_done); - m_ir->SetInsertPoint(d_done); - m_ir->CreateBr(m_interp_bblock); - m_ir->SetInsertPoint(cblock); - m_interp_pc = target; - return result; - } - - if (llvm::isa(addr.value)) - { - // Fixed branch excludes the possibility it's a function return (TODO) - ret = false; - } - - if (m_finfo && m_finfo->fn && op.opcode) - { - const auto cblock = m_ir->GetInsertBlock(); - const auto result = llvm::BasicBlock::Create(m_context, "", m_function); - m_ir->SetInsertPoint(result); - ret_function(); - m_ir->SetInsertPoint(cblock); - return result; - } - - // Load stack addr if necessary - value_t sp; - - if (ret && g_cfg.core.spu_block_size != spu_block_size_type::safe) - { - if (op.opcode) - { - sp = eval(extract(get_reg_fixed(1), 3) & 0x3fff0); - } - else - { - sp.value = m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::gpr, 1, &v128::_u32, 3)); - } - } - - const auto cblock = m_ir->GetInsertBlock(); - const auto result = llvm::BasicBlock::Create(m_context, "", m_function); - m_ir->SetInsertPoint(result); - - if (op.e) - { - addr.value = call("spu_check_interrupts", &exec_check_interrupts, m_thread, addr.value); - } - - if (op.d) - { - m_ir->CreateStore(m_ir->getFalse(), spu_ptr(&spu_thread::interrupts_enabled)); - } - - m_ir->CreateStore(addr.value, spu_ptr(&spu_thread::pc)); - - if (ret && g_cfg.core.spu_block_size >= spu_block_size_type::mega) - { - // Compare address stored in stack mirror with addr - const auto stack0 = eval(zext(sp) + ::offset32(&spu_thread::stack_mirror)); - const auto stack1 = eval(stack0 + 8); - const auto _ret = m_ir->CreateLoad(get_type(), m_ir->CreateGEP(get_type(), m_thread, stack0.value)); - const auto link = m_ir->CreateLoad(get_type(), m_ir->CreateGEP(get_type(), m_thread, stack1.value)); - const auto fail = llvm::BasicBlock::Create(m_context, "", m_function); - const auto done = llvm::BasicBlock::Create(m_context, "", m_function); - const auto next = llvm::BasicBlock::Create(m_context, "", m_function); - m_ir->CreateCondBr(m_ir->CreateICmpEQ(addr.value, m_ir->CreateTrunc(link, get_type())), next, fail, m_md_likely); - m_ir->SetInsertPoint(next); - const auto cmp2 = m_ir->CreateLoad(get_type(), m_ir->CreateGEP(get_type(), m_lsptr, addr.value)); - m_ir->CreateCondBr(m_ir->CreateICmpEQ(cmp2, m_ir->CreateTrunc(_ret, get_type())), done, fail, m_md_likely); - m_ir->SetInsertPoint(done); - - // Clear stack mirror and return by tail call to the provided return address - m_ir->CreateStore(splat(-1).eval(m_ir), m_ir->CreateGEP(get_type(), m_thread, stack0.value)); - const auto targ = m_ir->CreateAdd(m_ir->CreateLShr(_ret, 32), get_segment_base()); - const auto type = m_finfo->chunk->getFunctionType(); - const auto fval = m_ir->CreateIntToPtr(targ, type->getPointerTo()); - tail_chunk({type, fval}, m_ir->CreateTrunc(m_ir->CreateLShr(link, 32), get_type())); - m_ir->SetInsertPoint(fail); - } - - if (g_cfg.core.spu_block_size >= spu_block_size_type::mega) - { - // Try to load chunk address from the function table - const auto fail = llvm::BasicBlock::Create(m_context, "", m_function); - const auto done = llvm::BasicBlock::Create(m_context, "", m_function); - const auto ad32 = m_ir->CreateSub(addr.value, m_base_pc); - m_ir->CreateCondBr(m_ir->CreateICmpULT(ad32, m_ir->getInt32(m_size)), done, fail, m_md_likely); - m_ir->SetInsertPoint(done); - - const auto ad64 = m_ir->CreateZExt(ad32, get_type()); - const auto pptr = dyn_cast(m_ir->CreateGEP(m_function_table->getValueType(), m_function_table, {m_ir->getInt64(0), m_ir->CreateLShr(ad64, 2, "", true)})); - tail_chunk({m_dispatch->getFunctionType(), m_ir->CreateLoad(pptr->getResultElementType(), pptr)}); - m_ir->SetInsertPoint(fail); - } - - tail_chunk(nullptr); - m_ir->SetInsertPoint(cblock); - return result; - } - - llvm::BasicBlock* add_block_next() - { - if (m_interp_magn) - { - const auto cblock = m_ir->GetInsertBlock(); - m_ir->SetInsertPoint(m_interp_bblock); - const auto target = m_ir->CreatePHI(get_type(), 2); - target->addIncoming(m_interp_pc_next, cblock); - target->addIncoming(m_interp_pc, m_interp_bblock->getSinglePredecessor()); - m_ir->SetInsertPoint(cblock); - m_interp_pc = target; - return m_interp_bblock; - } - - return add_block(m_pos + 4); - } - - void BIZ(spu_opcode_t op) // - { - if (m_block) m_block->block_end = m_ir->GetInsertBlock(); - - const auto rt = get_vr(op.rt); - - // Checking for zero doesn't care about the order of the bytes, - // so load the data before it's byteswapped - if (auto [ok, as] = match_expr(rt, byteswap(match())); ok) - { - m_block->block_end = m_ir->GetInsertBlock(); - const auto cond = eval(extract(bitcast(as), 0) == 0); - const auto addr = eval(extract(get_vr(op.ra), 3) & 0x3fffc); - const auto target = add_block_indirect(op, addr); - m_ir->CreateCondBr(cond.value, target, add_block_next()); - return; - } - - const auto ox = get_vr(op.rt); - - // Instead of extracting the value generated by orx, just test the input to orx with ptest - if (auto [ok, as] = match_expr(ox, orx(match())); ok) - { - m_block->block_end = m_ir->GetInsertBlock(); - const auto a = extract(bitcast(as), 0); - const auto b = extract(bitcast(as), 1); - const auto cond = eval((a | b) == 0); - const auto addr = eval(extract(get_vr(op.ra), 3) & 0x3fffc); - const auto target = add_block_indirect(op, addr); - m_ir->CreateCondBr(cond.value, target, add_block_next()); - return; - } - - - // Check sign bit instead (optimization) - if (match_vr(op.rt, [&](auto c, auto MP) - { - using VT = typename decltype(MP)::type; - - if (auto [ok, x] = match_expr(c, sext(match]>())); ok) - { - const auto a = get_vr(op.rt); - const auto cond = eval(bitcast(trunc(a)) >= 0); - const auto addr = eval(extract(get_vr(op.ra), 3) & 0x3fffc); - const auto target = add_block_indirect(op, addr); - m_ir->CreateCondBr(cond.value, target, add_block_next()); - return true; - } - - return false; - })) - { - return; - } - - const auto cond = eval(extract(get_vr(op.rt), 3) == 0); - const auto addr = eval(extract(get_vr(op.ra), 3) & 0x3fffc); - const auto target = add_block_indirect(op, addr); - m_ir->CreateCondBr(cond.value, target, add_block_next()); - } - - void BINZ(spu_opcode_t op) // - { - if (m_block) m_block->block_end = m_ir->GetInsertBlock(); - - const auto rt = get_vr(op.rt); - - // Checking for zero doesn't care about the order of the bytes, - // so load the data before it's byteswapped - if (auto [ok, as] = match_expr(rt, byteswap(match())); ok) - { - m_block->block_end = m_ir->GetInsertBlock(); - const auto cond = eval(extract(bitcast(as), 0) != 0); - const auto addr = eval(extract(get_vr(op.ra), 3) & 0x3fffc); - const auto target = add_block_indirect(op, addr); - m_ir->CreateCondBr(cond.value, target, add_block_next()); - return; - } - - const auto ox = get_vr(op.rt); - - // Instead of extracting the value generated by orx, just test the input to orx with ptest - if (auto [ok, as] = match_expr(ox, orx(match())); ok) - { - m_block->block_end = m_ir->GetInsertBlock(); - const auto a = extract(bitcast(as), 0); - const auto b = extract(bitcast(as), 1); - const auto cond = eval((a | b) != 0); - const auto addr = eval(extract(get_vr(op.ra), 3) & 0x3fffc); - const auto target = add_block_indirect(op, addr); - m_ir->CreateCondBr(cond.value, target, add_block_next()); - return; - } - - - // Check sign bit instead (optimization) - if (match_vr(op.rt, [&](auto c, auto MP) - { - using VT = typename decltype(MP)::type; - - if (auto [ok, x] = match_expr(c, sext(match]>())); ok) - { - const auto a = get_vr(op.rt); - const auto cond = eval(bitcast(trunc(a)) < 0); - const auto addr = eval(extract(get_vr(op.ra), 3) & 0x3fffc); - const auto target = add_block_indirect(op, addr); - m_ir->CreateCondBr(cond.value, target, add_block_next()); - return true; - } - - return false; - })) - { - return; - } - - const auto cond = eval(extract(get_vr(op.rt), 3) != 0); - const auto addr = eval(extract(get_vr(op.ra), 3) & 0x3fffc); - const auto target = add_block_indirect(op, addr); - m_ir->CreateCondBr(cond.value, target, add_block_next()); - } - - void BIHZ(spu_opcode_t op) // - { - if (m_block) m_block->block_end = m_ir->GetInsertBlock(); - - // Check sign bits of 2 vector elements (optimization) - if (match_vr(op.rt, [&](auto c, auto MP) - { - using VT = typename decltype(MP)::type; - - if (auto [ok, x] = match_expr(c, sext(match]>())); ok) - { - const auto a = get_vr(op.rt); - const auto cond = eval((bitcast(trunc(a)) & 0x3000) == 0); - const auto addr = eval(extract(get_vr(op.ra), 3) & 0x3fffc); - const auto target = add_block_indirect(op, addr); - m_ir->CreateCondBr(cond.value, target, add_block_next()); - return true; - } - - return false; - })) - { - return; - } - - const auto cond = eval(extract(get_vr(op.rt), 6) == 0); - const auto addr = eval(extract(get_vr(op.ra), 3) & 0x3fffc); - const auto target = add_block_indirect(op, addr); - m_ir->CreateCondBr(cond.value, target, add_block_next()); - } - - void BIHNZ(spu_opcode_t op) // - { - if (m_block) m_block->block_end = m_ir->GetInsertBlock(); - - // Check sign bits of 2 vector elements (optimization) - if (match_vr(op.rt, [&](auto c, auto MP) - { - using VT = typename decltype(MP)::type; - - if (auto [ok, x] = match_expr(c, sext(match]>())); ok) - { - const auto a = get_vr(op.rt); - const auto cond = eval((bitcast(trunc(a)) & 0x3000) != 0); - const auto addr = eval(extract(get_vr(op.ra), 3) & 0x3fffc); - const auto target = add_block_indirect(op, addr); - m_ir->CreateCondBr(cond.value, target, add_block_next()); - return true; - } - - return false; - })) - { - return; - } - - const auto cond = eval(extract(get_vr(op.rt), 6) != 0); - const auto addr = eval(extract(get_vr(op.ra), 3) & 0x3fffc); - const auto target = add_block_indirect(op, addr); - m_ir->CreateCondBr(cond.value, target, add_block_next()); - } - - void BI(spu_opcode_t op) // - { - if (m_block) m_block->block_end = m_ir->GetInsertBlock(); - const auto addr = eval(extract(get_vr(op.ra), 3) & 0x3fffc); - - if (m_interp_magn) - { - m_ir->CreateBr(add_block_indirect(op, addr)); - return; - } - - // Create jump table if necessary (TODO) - const auto tfound = m_targets.find(m_pos); - - if (op.d && tfound != m_targets.end() && tfound->second.size() == 1 && tfound->second[0] == spu_branch_target(m_pos, 1)) - { - // Interrupts-disable pattern - m_ir->CreateStore(m_ir->getFalse(), spu_ptr(&spu_thread::interrupts_enabled)); - return; - } - - if (!op.d && !op.e && tfound != m_targets.end() && tfound->second.size() > 1) - { - // Shift aligned address for switch - const auto addrfx = m_ir->CreateSub(addr.value, m_base_pc); - const auto sw_arg = m_ir->CreateLShr(addrfx, 2, "", true); - - // Initialize jump table targets - std::map targets; - - for (u32 target : tfound->second) - { - if (m_block_info[target / 4]) - { - targets.emplace(target, nullptr); - } - } - - // Initialize target basic blocks - for (auto& pair : targets) - { - pair.second = add_block(pair.first); - } - - if (targets.empty()) - { - // Emergency exit - spu_log.error("[%s] [0x%05x] No jump table targets at 0x%05x (%u)", m_hash, m_entry, m_pos, tfound->second.size()); - m_ir->CreateBr(add_block_indirect(op, addr)); - return; - } - - // Get jump table bounds (optimization) - const u32 start = targets.begin()->first; - const u32 end = targets.rbegin()->first + 4; - - // Emit switch instruction aiming for a jumptable in the end (indirectbr could guarantee it) - const auto sw = m_ir->CreateSwitch(sw_arg, llvm::BasicBlock::Create(m_context, "", m_function), (end - start) / 4); - - for (u32 pos = start; pos < end; pos += 4) - { - if (m_block_info[pos / 4] && targets.count(pos)) - { - const auto found = targets.find(pos); - - if (found != targets.end()) - { - sw->addCase(m_ir->getInt32(pos / 4 - m_base / 4), found->second); - continue; - } - } - - sw->addCase(m_ir->getInt32(pos / 4 - m_base / 4), sw->getDefaultDest()); - } - - // Exit function on unexpected target - m_ir->SetInsertPoint(sw->getDefaultDest()); - m_ir->CreateStore(addr.value, spu_ptr(&spu_thread::pc)); - - if (m_finfo && m_finfo->fn) - { - // Can't afford external tail call in true functions - m_ir->CreateStore(m_ir->getInt32("BIJT"_u32), _ptr(m_memptr, 0xffdead20)); - m_ir->CreateCall(m_test_state, {m_thread}); - m_ir->CreateBr(sw->getDefaultDest()); - } - else - { - tail_chunk(nullptr); - } - } - else - { - // Simple indirect branch - m_ir->CreateBr(add_block_indirect(op, addr)); - } - } - - void BISL(spu_opcode_t op) // - { - if (m_block) m_block->block_end = m_ir->GetInsertBlock(); - const auto addr = eval(extract(get_vr(op.ra), 3) & 0x3fffc); - set_link(op); - m_ir->CreateBr(add_block_indirect(op, addr, false)); - } - - void IRET(spu_opcode_t op) // - { - if (m_block) m_block->block_end = m_ir->GetInsertBlock(); - value_t srr0; - srr0.value = m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::srr0)); - m_ir->CreateBr(add_block_indirect(op, srr0)); - } - - void BISLED(spu_opcode_t op) // - { - if (m_block) m_block->block_end = m_ir->GetInsertBlock(); - const auto addr = eval(extract(get_vr(op.ra), 3) & 0x3fffc); - set_link(op); - const auto mask = m_ir->CreateTrunc(m_ir->CreateLShr(m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::ch_events), true), 32), get_type()); - const auto res = call("spu_get_events", &exec_get_events, m_thread, mask); - const auto target = add_block_indirect(op, addr); - m_ir->CreateCondBr(m_ir->CreateICmpNE(res, m_ir->getInt32(0)), target, add_block_next()); - } - - void BRZ(spu_opcode_t op) // - { - if (m_interp_magn) - { - value_t target; - target.value = m_interp_pc; - target = eval((target + (get_imm(op.i16, false) << 2)) & 0x3fffc); - m_interp_pc = m_ir->CreateSelect(eval(extract(get_vr(op.rt), 3) == 0).value, target.value, m_interp_pc_next); - return; - } - - const u32 target = spu_branch_target(m_pos, op.i16); - - const auto rt = get_vr(op.rt); - - // Checking for zero doesn't care about the order of the bytes, - // so load the data before it's byteswapped - if (auto [ok, as] = match_expr(rt, byteswap(match())); ok) - { - if (target != m_pos + 4) - { - m_block->block_end = m_ir->GetInsertBlock(); - const auto cond = eval(extract(bitcast(as), 0) == 0); - m_ir->CreateCondBr(cond.value, add_block(target), add_block(m_pos + 4)); - return; - } - } - - const auto ox = get_vr(op.rt); - - // Instead of extracting the value generated by orx, just test the input to orx with ptest - if (auto [ok, as] = match_expr(ox, orx(match())); ok) - { - if (target != m_pos + 4) - { - m_block->block_end = m_ir->GetInsertBlock(); - const auto a = extract(bitcast(as), 0); - const auto b = extract(bitcast(as), 1); - const auto cond = eval((a | b) == 0); - m_ir->CreateCondBr(cond.value, add_block(target), add_block(m_pos + 4)); - return; - } - } - - - // Check sign bit instead (optimization) - if (match_vr(op.rt, [&](auto c, auto MP) - { - using VT = typename decltype(MP)::type; - - if (auto [ok, x] = match_expr(c, sext(match]>())); ok) - { - if (target != m_pos + 4) - { - m_block->block_end = m_ir->GetInsertBlock(); - const auto a = get_vr(op.rt); - const auto cond = eval(bitcast(trunc(a)) >= 0); - m_ir->CreateCondBr(cond.value, add_block(target), add_block(m_pos + 4)); - return true; - } - } - - return false; - })) - { - return; - } - - if (target != m_pos + 4) - { - m_block->block_end = m_ir->GetInsertBlock(); - const auto cond = eval(extract(get_vr(op.rt), 3) == 0); - m_ir->CreateCondBr(cond.value, add_block(target), add_block(m_pos + 4)); - } - } - - void BRNZ(spu_opcode_t op) // - { - if (m_interp_magn) - { - value_t target; - target.value = m_interp_pc; - target = eval((target + (get_imm(op.i16, false) << 2)) & 0x3fffc); - m_interp_pc = m_ir->CreateSelect(eval(extract(get_vr(op.rt), 3) != 0).value, target.value, m_interp_pc_next); - return; - } - - const u32 target = spu_branch_target(m_pos, op.i16); - - const auto rt = get_vr(op.rt); - - // Checking for zero doesn't care about the order of the bytes, - // so load the data before it's byteswapped - if (auto [ok, as] = match_expr(rt, byteswap(match())); ok) - { - if (target != m_pos + 4) - { - m_block->block_end = m_ir->GetInsertBlock(); - const auto cond = eval(extract(bitcast(as), 0) != 0); - m_ir->CreateCondBr(cond.value, add_block(target), add_block(m_pos + 4)); - return; - } - } - - const auto ox = get_vr(op.rt); - - // Instead of extracting the value generated by orx, just test the input to orx with ptest - if (auto [ok, as] = match_expr(ox, orx(match())); ok) - { - if (target != m_pos + 4) - { - m_block->block_end = m_ir->GetInsertBlock(); - const auto a = extract(bitcast(as), 0); - const auto b = extract(bitcast(as), 1); - const auto cond = eval((a | b) != 0); - m_ir->CreateCondBr(cond.value, add_block(target), add_block(m_pos + 4)); - return; - } - } - - // Check sign bit instead (optimization) - if (match_vr(op.rt, [&](auto c, auto MP) - { - using VT = typename decltype(MP)::type; - - if (auto [ok, x] = match_expr(c, sext(match]>())); ok) - { - if (target != m_pos + 4) - { - m_block->block_end = m_ir->GetInsertBlock(); - const auto a = get_vr(op.rt); - const auto cond = eval(bitcast(trunc(a)) < 0); - m_ir->CreateCondBr(cond.value, add_block(target), add_block(m_pos + 4)); - return true; - } - } - - return false; - })) - { - return; - } - - if (target != m_pos + 4) - { - m_block->block_end = m_ir->GetInsertBlock(); - const auto cond = eval(extract(get_vr(op.rt), 3) != 0); - m_ir->CreateCondBr(cond.value, add_block(target), add_block(m_pos + 4)); - } - } - - void BRHZ(spu_opcode_t op) // - { - if (m_interp_magn) - { - value_t target; - target.value = m_interp_pc; - target = eval((target + (get_imm(op.i16, false) << 2)) & 0x3fffc); - m_interp_pc = m_ir->CreateSelect(eval(extract(get_vr(op.rt), 6) == 0).value, target.value, m_interp_pc_next); - return; - } - - const u32 target = spu_branch_target(m_pos, op.i16); - - // Check sign bits of 2 vector elements (optimization) - if (match_vr(op.rt, [&](auto c, auto MP) - { - using VT = typename decltype(MP)::type; - - if (auto [ok, x] = match_expr(c, sext(match]>())); ok) - { - if (target != m_pos + 4) - { - m_block->block_end = m_ir->GetInsertBlock(); - const auto a = get_vr(op.rt); - const auto cond = eval((bitcast(trunc(a)) & 0x3000) == 0); - m_ir->CreateCondBr(cond.value, add_block(target), add_block(m_pos + 4)); - return true; - } - } - - return false; - })) - { - return; - } - - if (target != m_pos + 4) - { - m_block->block_end = m_ir->GetInsertBlock(); - const auto cond = eval(extract(get_vr(op.rt), 6) == 0); - m_ir->CreateCondBr(cond.value, add_block(target), add_block(m_pos + 4)); - } - } - - void BRHNZ(spu_opcode_t op) // - { - if (m_interp_magn) - { - value_t target; - target.value = m_interp_pc; - target = eval((target + (get_imm(op.i16, false) << 2)) & 0x3fffc); - m_interp_pc = m_ir->CreateSelect(eval(extract(get_vr(op.rt), 6) != 0).value, target.value, m_interp_pc_next); - return; - } - - const u32 target = spu_branch_target(m_pos, op.i16); - - // Check sign bits of 2 vector elements (optimization) - if (match_vr(op.rt, [&](auto c, auto MP) - { - using VT = typename decltype(MP)::type; - - if (auto [ok, x] = match_expr(c, sext(match]>())); ok) - { - if (target != m_pos + 4) - { - m_block->block_end = m_ir->GetInsertBlock(); - const auto a = get_vr(op.rt); - const auto cond = eval((bitcast(trunc(a)) & 0x3000) != 0); - m_ir->CreateCondBr(cond.value, add_block(target), add_block(m_pos + 4)); - return true; - } - } - - return false; - })) - { - return; - } - - if (target != m_pos + 4) - { - m_block->block_end = m_ir->GetInsertBlock(); - const auto cond = eval(extract(get_vr(op.rt), 6) != 0); - m_ir->CreateCondBr(cond.value, add_block(target), add_block(m_pos + 4)); - } - } - - void BRA(spu_opcode_t op) // - { - if (m_interp_magn) - { - m_interp_pc = eval((get_imm(op.i16, false) << 2) & 0x3fffc).value; - return; - } - - const u32 target = spu_branch_target(0, op.i16); - - m_block->block_end = m_ir->GetInsertBlock(); - m_ir->CreateBr(add_block(target, true)); - } - - void BRASL(spu_opcode_t op) // - { - set_link(op); - BRA(op); - } - - void BR(spu_opcode_t op) // - { - if (m_interp_magn) - { - value_t target; - target.value = m_interp_pc; - target = eval((target + (get_imm(op.i16, false) << 2)) & 0x3fffc); - m_interp_pc = target.value; - return; - } - - const u32 target = spu_branch_target(m_pos, op.i16); - - if (target != m_pos + 4) - { - m_block->block_end = m_ir->GetInsertBlock(); - m_ir->CreateBr(add_block(target)); - } - } - - void BRSL(spu_opcode_t op) // - { - set_link(op); - - const u32 target = spu_branch_target(m_pos, op.i16); - - if (m_finfo && m_finfo->fn && target != m_pos + 4) - { - if (auto fn = add_function(target)->fn) - { - call_function(fn); - return; - } - else - { - spu_log.fatal("[0x%x] Can't add function 0x%x", m_pos, target); - return; - } - } - - BR(op); - } - - void set_link(spu_opcode_t op) - { - if (m_interp_magn) - { - value_t next; - next.value = m_interp_pc_next; - set_vr(op.rt, insert(splat(0), 3, next)); - return; - } - - set_vr(op.rt, insert(splat(0), 3, value(get_pc(m_pos + 4)) & 0x3fffc)); - - if (m_finfo && m_finfo->fn) - { - return; - } - - if (g_cfg.core.spu_block_size >= spu_block_size_type::mega && m_block_info[m_pos / 4 + 1] && m_entry_info[m_pos / 4 + 1]) - { - // Store the return function chunk address at the stack mirror - const auto pfunc = add_function(m_pos + 4); - const auto stack0 = eval(zext(extract(get_reg_fixed(1), 3) & 0x3fff0) + ::offset32(&spu_thread::stack_mirror)); - const auto stack1 = eval(stack0 + 8); - const auto rel_ptr = m_ir->CreateSub(m_ir->CreatePtrToInt(pfunc->chunk, get_type()), get_segment_base()); - const auto ptr_plus_op = m_ir->CreateOr(m_ir->CreateShl(rel_ptr, 32), m_ir->getInt64(m_next_op)); - const auto base_plus_pc = m_ir->CreateOr(m_ir->CreateShl(m_ir->CreateZExt(m_base_pc, get_type()), 32), m_ir->getInt64(m_pos + 4)); - m_ir->CreateStore(ptr_plus_op, m_ir->CreateGEP(get_type(), m_thread, stack0.value)); - m_ir->CreateStore(base_plus_pc, m_ir->CreateGEP(get_type(), m_thread, stack1.value)); - } - } - - llvm::Value* get_segment_base() - { - const auto type = llvm::FunctionType::get(get_type(), {}, false); - const auto func = llvm::cast(m_module->getOrInsertFunction("spu_segment_base", type).getCallee()); - m_engine->updateGlobalMapping("spu_segment_base", reinterpret_cast(jit_runtime::alloc(0, 0))); - return m_ir->CreatePtrToInt(func, get_type()); - } - - static decltype(&spu_llvm_recompiler::UNK) decode(u32 op); -}; - -std::unique_ptr spu_recompiler_base::make_llvm_recompiler(u8 magn) -{ - return std::make_unique(magn); -} - -const spu_decoder s_spu_llvm_decoder; - -decltype(&spu_llvm_recompiler::UNK) spu_llvm_recompiler::decode(u32 op) -{ - return s_spu_llvm_decoder.decode(op); -} - -#else - -std::unique_ptr spu_recompiler_base::make_llvm_recompiler(u8 magn) -{ - if (magn) - { - return nullptr; - } - - fmt::throw_exception("LLVM is not available in this build."); -} - -#endif // LLVM_AVAILABLE - struct spu_llvm_worker { lf_queue> registered; diff --git a/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp b/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp index 396412dc7a..279eeb53bf 100644 --- a/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp +++ b/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp @@ -3,25 +3,15 @@ #include "Emu/System.h" #include "Emu/system_config.h" -#include "Emu/system_progress.hpp" -#include "Emu/system_utils.hpp" -#include "Emu/cache_utils.hpp" #include "Emu/IdManager.h" #include "Emu/Cell/timers.hpp" #include "Crypto/sha1.h" -#include "Utilities/StrUtil.h" #include "Utilities/JIT.h" -#include "util/init_mutex.hpp" -#include "util/shared_ptr.hpp" #include "SPUThread.h" #include "SPUAnalyser.h" #include "SPUInterpreter.h" -#include "SPUDisAsm.h" -#include -#include #include -#include #include #include "util/v128.hpp" @@ -32,4370 +22,6 @@ const extern spu_decoder g_spu_itype; const extern spu_decoder g_spu_iname; const extern spu_decoder g_spu_iflag; -// Move 4 args for calling native function from a GHC calling convention function -#if defined(ARCH_X64) -static u8* move_args_ghc_to_native(u8* raw) -{ -#ifdef _WIN32 - // mov rcx, r13 - // mov rdx, rbp - // mov r8, r12 - // mov r9, rbx - std::memcpy(raw, "\x4C\x89\xE9\x48\x89\xEA\x4D\x89\xE0\x49\x89\xD9", 12); -#else - // mov rdi, r13 - // mov rsi, rbp - // mov rdx, r12 - // mov rcx, rbx - std::memcpy(raw, "\x4C\x89\xEF\x48\x89\xEE\x4C\x89\xE2\x48\x89\xD9", 12); -#endif - - return raw + 12; -} -#elif defined(ARCH_ARM64) -static void ghc_cpp_trampoline(u64 fn_target, native_asm& c, auto& args) -{ - using namespace asmjit; - - Label target = c.newLabel(); - c.mov(args[0], a64::x19); - c.mov(args[1], a64::x20); - c.mov(args[2], a64::x21); - c.mov(args[3], a64::x22); - - c.ldr(a64::x15, arm::Mem(target)); - c.br(a64::x15); - - c.brk(Imm(0x42)); // Unreachable - - c.bind(target); - c.embedUInt64(fn_target); -} -#endif - -DECLARE(spu_runtime::tr_dispatch) = [] -{ -#ifdef __APPLE__ - pthread_jit_write_protect_np(false); -#endif -#if defined(ARCH_X64) - // Generate a special trampoline to spu_recompiler_base::dispatch with pause instruction - u8* const trptr = jit_runtime::alloc(32, 16); - u8* raw = move_args_ghc_to_native(trptr); - *raw++ = 0xf3; // pause - *raw++ = 0x90; - *raw++ = 0xff; // jmp [rip] - *raw++ = 0x25; - std::memset(raw, 0, 4); - const u64 target = reinterpret_cast(&spu_recompiler_base::dispatch); - std::memcpy(raw + 4, &target, 8); - return reinterpret_cast(trptr); -#elif defined(ARCH_ARM64) - auto trptr = build_function_asm("tr_dispatch", - [](native_asm& c, auto& args) - { - c.yield(); - ghc_cpp_trampoline(reinterpret_cast(&spu_recompiler_base::dispatch), c, args); - - c.embed("tr_dispatch", 11); - }); - return trptr; -#else -#error "Unimplemented" -#endif -}(); - -DECLARE(spu_runtime::tr_branch) = [] -{ -#if defined(ARCH_X64) - // Generate a trampoline to spu_recompiler_base::branch - u8* const trptr = jit_runtime::alloc(32, 16); - u8* raw = move_args_ghc_to_native(trptr); - *raw++ = 0xff; // jmp [rip] - *raw++ = 0x25; - std::memset(raw, 0, 4); - const u64 target = reinterpret_cast(&spu_recompiler_base::branch); - std::memcpy(raw + 4, &target, 8); - return reinterpret_cast(trptr); - -#elif defined(ARCH_ARM64) - auto trptr = build_function_asm("tr_branch", - [](native_asm& c, auto& args) - { - ghc_cpp_trampoline(reinterpret_cast(&spu_recompiler_base::branch), c, args); - - c.embed("tr_branch", 9); - }); - return trptr; -#else -#error "Unimplemented" -#endif -}(); - -DECLARE(spu_runtime::tr_interpreter) = [] -{ -#if defined(ARCH_X64) - u8* const trptr = jit_runtime::alloc(32, 16); - u8* raw = move_args_ghc_to_native(trptr); - *raw++ = 0xff; // jmp [rip] - *raw++ = 0x25; - std::memset(raw, 0, 4); - const u64 target = reinterpret_cast(&spu_recompiler_base::old_interpreter); - std::memcpy(raw + 4, &target, 8); - return reinterpret_cast(trptr); -#elif defined(ARCH_ARM64) - auto trptr = build_function_asm("tr_interpreter", - [](native_asm& c, auto& args) - { - ghc_cpp_trampoline(reinterpret_cast(&spu_recompiler_base::old_interpreter), c, args); - - c.embed("tr_interpreter", 14); - }); - return trptr; -#endif -}(); - -DECLARE(spu_runtime::g_dispatcher) = [] -{ - // Allocate 2^20 positions in data area - const auto ptr = reinterpret_cast>(jit_runtime::alloc(sizeof(*g_dispatcher), 64, false)); - - for (auto& x : *ptr) - { - x.raw() = tr_dispatch; - } - - return ptr; -}(); - -DECLARE(spu_runtime::tr_all) = [] -{ -#if defined(ARCH_X64) - u8* const trptr = jit_runtime::alloc(32, 16); - u8* raw = trptr; - - // Load PC: mov eax, [r13 + spu_thread::pc] - *raw++ = 0x41; - *raw++ = 0x8b; - *raw++ = 0x45; - *raw++ = ::narrow(::offset32(&spu_thread::pc)); - - // Get LS address starting from PC: lea rcx, [rbp + rax] - *raw++ = 0x48; - *raw++ = 0x8d; - *raw++ = 0x4c; - *raw++ = 0x05; - *raw++ = 0x00; - - // mov eax, [rcx] - *raw++ = 0x8b; - *raw++ = 0x01; - - // shr eax, (32 - 20) - *raw++ = 0xc1; - *raw++ = 0xe8; - *raw++ = 0x0c; - - // Load g_dispatcher to rdx - *raw++ = 0x48; - *raw++ = 0x8d; - *raw++ = 0x15; - const s32 r32 = ::narrow(reinterpret_cast(g_dispatcher) - reinterpret_cast(raw) - 4); - std::memcpy(raw, &r32, 4); - raw += 4; - - // Update block_hash (set zero): mov [r13 + spu_thread::m_block_hash], 0 - *raw++ = 0x49; - *raw++ = 0xc7; - *raw++ = 0x45; - *raw++ = ::narrow(::offset32(&spu_thread::block_hash)); - *raw++ = 0x00; - *raw++ = 0x00; - *raw++ = 0x00; - *raw++ = 0x00; - - // jmp [rdx + rax * 8] - *raw++ = 0xff; - *raw++ = 0x24; - *raw++ = 0xc2; - - return reinterpret_cast(trptr); - -#elif defined(ARCH_ARM64) - auto trptr = build_function_asm("tr_all", - [](native_asm& c, auto& args) - { - using namespace asmjit; - - // w1: PC (eax in x86 SPU) - // x7: lsa (rcx in x86 SPU) - - // Load PC - Label pc_offset = c.newLabel(); - c.ldr(a64::x0, arm::Mem(pc_offset)); - c.ldr(a64::w1, arm::Mem(a64::x19, a64::x0)); // REG_Base + offset(spu_thread::pc) - // Compute LS address = REG_Sp + PC, store into x7 (use later) - c.add(a64::x7, a64::x20, a64::x1); - // Load 32b from LS address - c.ldr(a64::w3, arm::Mem(a64::x7)); - // shr (32 - 20) - c.lsr(a64::w3, a64::w3, Imm(32 - 20)); - // Load g_dispatcher - Label g_dispatcher_offset = c.newLabel(); - c.ldr(a64::x4, arm::Mem(g_dispatcher_offset)); - // Update block hash - Label block_hash_offset = c.newLabel(); - c.mov(a64::x5, Imm(0)); - c.ldr(a64::x6, arm::Mem(block_hash_offset)); - c.str(a64::x5, arm::Mem(a64::x19, a64::x6)); // REG_Base + offset(spu_thread::block_hash) - // Jump to [g_dispatcher + idx * 8] - c.mov(a64::x6, Imm(8)); - c.mul(a64::x6, a64::x3, a64::x6); - c.add(a64::x4, a64::x4, a64::x6); - c.ldr(a64::x4, arm::Mem(a64::x4)); - c.br(a64::x4); - - c.bind(pc_offset); - c.embedUInt64(::offset32(&spu_thread::pc)); - c.bind(g_dispatcher_offset); - c.embedUInt64(reinterpret_cast(g_dispatcher)); - c.bind(block_hash_offset); - c.embedUInt64(::offset32(&spu_thread::block_hash)); - c.embed("tr_all", 6); - }); - return trptr; -#else -#error "Unimplemented" -#endif -}(); - -DECLARE(spu_runtime::g_gateway) = build_function_asm("spu_gateway", [](native_asm& c, auto& args) -{ - // Gateway for SPU dispatcher, converts from native to GHC calling convention, also saves RSP value for spu_escape - using namespace asmjit; - -#if defined(ARCH_X64) -#ifdef _WIN32 - c.push(x86::r15); - c.push(x86::r14); - c.push(x86::r13); - c.push(x86::r12); - c.push(x86::rsi); - c.push(x86::rdi); - c.push(x86::rbp); - c.push(x86::rbx); - c.sub(x86::rsp, 0xa8); - c.movaps(x86::oword_ptr(x86::rsp, 0x90), x86::xmm15); - c.movaps(x86::oword_ptr(x86::rsp, 0x80), x86::xmm14); - c.movaps(x86::oword_ptr(x86::rsp, 0x70), x86::xmm13); - c.movaps(x86::oword_ptr(x86::rsp, 0x60), x86::xmm12); - c.movaps(x86::oword_ptr(x86::rsp, 0x50), x86::xmm11); - c.movaps(x86::oword_ptr(x86::rsp, 0x40), x86::xmm10); - c.movaps(x86::oword_ptr(x86::rsp, 0x30), x86::xmm9); - c.movaps(x86::oword_ptr(x86::rsp, 0x20), x86::xmm8); - c.movaps(x86::oword_ptr(x86::rsp, 0x10), x86::xmm7); - c.movaps(x86::oword_ptr(x86::rsp, 0), x86::xmm6); -#else - c.push(x86::rbp); - c.push(x86::r15); - c.push(x86::r14); - c.push(x86::r13); - c.push(x86::r12); - c.push(x86::rbx); - c.push(x86::rax); -#endif - - // Save native stack pointer for longjmp emulation - c.mov(x86::qword_ptr(args[0], ::offset32(&spu_thread::saved_native_sp)), x86::rsp); - - // Move 4 args (despite spu_function_t def) - c.mov(x86::r13, args[0]); - c.mov(x86::rbp, args[1]); - c.mov(x86::r12, args[2]); - c.mov(x86::rbx, args[3]); - - if (utils::has_avx()) - { - c.vzeroupper(); - } - - c.call(spu_runtime::tr_all); - - if (utils::has_avx()) - { - c.vzeroupper(); - } - -#ifdef _WIN32 - c.movaps(x86::xmm6, x86::oword_ptr(x86::rsp, 0)); - c.movaps(x86::xmm7, x86::oword_ptr(x86::rsp, 0x10)); - c.movaps(x86::xmm8, x86::oword_ptr(x86::rsp, 0x20)); - c.movaps(x86::xmm9, x86::oword_ptr(x86::rsp, 0x30)); - c.movaps(x86::xmm10, x86::oword_ptr(x86::rsp, 0x40)); - c.movaps(x86::xmm11, x86::oword_ptr(x86::rsp, 0x50)); - c.movaps(x86::xmm12, x86::oword_ptr(x86::rsp, 0x60)); - c.movaps(x86::xmm13, x86::oword_ptr(x86::rsp, 0x70)); - c.movaps(x86::xmm14, x86::oword_ptr(x86::rsp, 0x80)); - c.movaps(x86::xmm15, x86::oword_ptr(x86::rsp, 0x90)); - c.add(x86::rsp, 0xa8); - c.pop(x86::rbx); - c.pop(x86::rbp); - c.pop(x86::rdi); - c.pop(x86::rsi); - c.pop(x86::r12); - c.pop(x86::r13); - c.pop(x86::r14); - c.pop(x86::r15); -#else - c.add(x86::rsp, +8); - c.pop(x86::rbx); - c.pop(x86::r12); - c.pop(x86::r13); - c.pop(x86::r14); - c.pop(x86::r15); - c.pop(x86::rbp); -#endif - - c.ret(); -#elif defined(ARCH_ARM64) - // Push callee saved registers to the stack - // We need to save x18-x30 = 13 x 8B each + 8 bytes for 16B alignment = 112B - c.sub(a64::sp, a64::sp, Imm(112)); - c.stp(a64::x18, a64::x19, arm::Mem(a64::sp)); - c.stp(a64::x20, a64::x21, arm::Mem(a64::sp, 16)); - c.stp(a64::x22, a64::x23, arm::Mem(a64::sp, 32)); - c.stp(a64::x24, a64::x25, arm::Mem(a64::sp, 48)); - c.stp(a64::x26, a64::x27, arm::Mem(a64::sp, 64)); - c.stp(a64::x28, a64::x29, arm::Mem(a64::sp, 80)); - c.str(a64::x30, arm::Mem(a64::sp, 96)); - - // Save native stack pointer for longjmp emulation - Label sp_offset = c.newLabel(); - c.ldr(a64::x26, arm::Mem(sp_offset)); - // sp not allowed to be used in load/stores directly - c.mov(a64::x15, a64::sp); - c.str(a64::x15, arm::Mem(args[0], a64::x26)); - - // Move 4 args (despite spu_function_t def) - c.mov(a64::x19, args[0]); - c.mov(a64::x20, args[1]); - c.mov(a64::x21, args[2]); - c.mov(a64::x22, args[3]); - - // Save ret address to stack - // since non-tail calls to cpp fns may corrupt lr and - // g_tail_escape may jump out of a fn before the epilogue can restore lr - Label ret_addr = c.newLabel(); - c.adr(a64::x0, ret_addr); - c.str(a64::x0, arm::Mem(a64::sp, 104)); - - Label call_target = c.newLabel(); - c.ldr(a64::x0, arm::Mem(call_target)); - c.blr(a64::x0); - - c.bind(ret_addr); - - // Restore stack ptr - c.ldr(a64::x26, arm::Mem(sp_offset)); - c.ldr(a64::x15, arm::Mem(a64::x19, a64::x26)); - c.mov(a64::sp, a64::x15); - - // Restore registers from the stack - c.ldp(a64::x18, a64::x19, arm::Mem(a64::sp)); - c.ldp(a64::x20, a64::x21, arm::Mem(a64::sp, 16)); - c.ldp(a64::x22, a64::x23, arm::Mem(a64::sp, 32)); - c.ldp(a64::x24, a64::x25, arm::Mem(a64::sp, 48)); - c.ldp(a64::x26, a64::x27, arm::Mem(a64::sp, 64)); - c.ldp(a64::x28, a64::x29, arm::Mem(a64::sp, 80)); - c.ldr(a64::x30, arm::Mem(a64::sp, 96)); - // Restore stack ptr - c.add(a64::sp, a64::sp, Imm(112)); - // Return - c.ret(a64::x30); - - c.bind(sp_offset); - c.embedUInt64(::offset32(&spu_thread::saved_native_sp)); - c.bind(call_target); - c.embedUInt64(reinterpret_cast(spu_runtime::tr_all)); - c.embed("spu_gateway", 11); -#else -#error "Unimplemented" -#endif -}); - -DECLARE(spu_runtime::g_escape) = build_function_asm("spu_escape", [](native_asm& c, auto& args) -{ - using namespace asmjit; - -#if defined(ARCH_X64) - // Restore native stack pointer (longjmp emulation) - c.mov(x86::rsp, x86::qword_ptr(args[0], ::offset32(&spu_thread::saved_native_sp))); - - // Return to the return location - c.sub(x86::rsp, 8); - c.ret(); -#elif defined(ARCH_ARM64) - // Restore native stack pointer (longjmp emulation) - Label sp_offset = c.newLabel(); - c.ldr(a64::x15, arm::Mem(sp_offset)); - c.ldr(a64::x15, arm::Mem(args[0], a64::x15)); - c.mov(a64::sp, a64::x15); - - c.ldr(a64::x30, arm::Mem(a64::sp, 104)); - c.ret(a64::x30); - - c.bind(sp_offset); - c.embedUInt64(::offset32(&spu_thread::saved_native_sp)); - - c.embed("spu_escape", 10); -#else -#error "Unimplemented" -#endif -}); - -DECLARE(spu_runtime::g_tail_escape) = build_function_asm("spu_tail_escape", [](native_asm& c, auto& args) -{ - using namespace asmjit; - -#if defined(ARCH_X64) - // Restore native stack pointer (longjmp emulation) - c.mov(x86::rsp, x86::qword_ptr(args[0], ::offset32(&spu_thread::saved_native_sp))); - - // Adjust stack for initial call instruction in the gateway - c.sub(x86::rsp, 16); - - // Tail call, GHC CC (second arg) - c.mov(x86::r13, args[0]); - c.mov(x86::rbp, x86::qword_ptr(args[0], ::offset32(&spu_thread::ls))); - c.mov(x86::r12, args[2]); - c.xor_(x86::ebx, x86::ebx); - c.mov(x86::qword_ptr(x86::rsp), args[1]); - c.ret(); -#elif defined(ARCH_ARM64) - // Restore native stack pointer (longjmp emulation) - Label sp_offset = c.newLabel(); - c.ldr(a64::x15, arm::Mem(sp_offset)); - c.ldr(a64::x15, arm::Mem(args[0], a64::x15)); - c.mov(a64::sp, a64::x15); - - // Reload lr, since it might've been clobbered by a cpp fn - // and g_tail_escape runs before epilogue - c.ldr(a64::x30, arm::Mem(a64::sp, 104)); - - // Tail call, GHC CC - c.mov(a64::x19, args[0]); // REG_Base - Label ls_offset = c.newLabel(); - c.ldr(a64::x20, arm::Mem(ls_offset)); - c.ldr(a64::x20, arm::Mem(args[0], a64::x20)); // REG_Sp - c.mov(a64::x21, args[2]); // REG_Hp - c.eor(a64::w22, a64::w22, a64::w22); // REG_R1 - - c.br(args[1]); - - c.bind(ls_offset); - c.embedUInt64(::offset32(&spu_thread::ls)); - c.bind(sp_offset); - c.embedUInt64(::offset32(&spu_thread::saved_native_sp)); - - c.embed("spu_tail_escape", 15); -#else -#error "Unimplemented" -#endif -}); - -DECLARE(spu_runtime::g_interpreter_table) = {}; - -DECLARE(spu_runtime::g_interpreter) = nullptr; - -spu_cache::spu_cache(const std::string& loc) - : m_file(loc, fs::read + fs::write + fs::create + fs::append) -{ -} - -spu_cache::~spu_cache() -{ -} - -extern void utilize_spu_data_segment(u32 vaddr, const void* ls_data_vaddr, u32 size) -{ - if (vaddr % 4) - { - return; - } - - size &= -4; - - if (!size || vaddr + size > SPU_LS_SIZE) - { - return; - } - - if (!g_cfg.core.llvm_precompilation) - { - return; - } - - g_fxo->need(); - - if (!g_fxo->get().collect_funcs_to_precompile) - { - return; - } - - std::basic_string data(size / 4, 0); - std::memcpy(data.data(), ls_data_vaddr, size); - - spu_cache::precompile_data_t obj{vaddr, std::move(data)}; - - obj.funcs = spu_thread::discover_functions(vaddr, { reinterpret_cast(ls_data_vaddr), size }, vaddr != 0, umax); - - if (obj.funcs.empty()) - { - // Nothing to add - return; - } - - for (u32 addr : obj.funcs) - { - spu_log.notice("Found SPU function at: 0x%05x", addr); - } - - spu_log.notice("Found %u SPU functions", obj.funcs.size()); - - g_fxo->get().precompile_funcs.push(std::move(obj)); -} - -// For SPU cache validity check -static u16 calculate_crc16(const uchar* data, usz length) -{ - u16 crc = umax; - - while (length--) - { - u8 x = (crc >> 8) ^ *data++; - x ^= (x >> 4); - crc = static_cast((crc << 8) ^ (x << 12) ^ (x << 5) ^ x); - } - - return crc; -} - -std::deque spu_cache::get() -{ - std::deque result; - - if (!m_file) - { - return result; - } - - m_file.seek(0); - - // TODO: signal truncated or otherwise broken file - while (true) - { - struct block_info_t - { - be_t crc; - be_t size; - be_t addr; - } block_info{}; - - if (!m_file.read(block_info)) - { - break; - } - - const u32 crc = block_info.crc; - const u32 size = block_info.size; - const u32 addr = block_info.addr; - - if (utils::add_saturate(addr, size * 4) > SPU_LS_SIZE) - { - break; - } - - std::vector func; - - if (!m_file.read(func, size)) - { - break; - } - - if (!size || !func[0]) - { - // Skip old format Giga entries - continue; - } - - // CRC check is optional to be compatible with old format - if (crc && std::max(calculate_crc16(reinterpret_cast(func.data()), size * 4), 1) != crc) - { - // Invalid, but continue anyway - continue; - } - - spu_program res; - res.entry_point = addr; - res.lower_bound = addr; - res.data = std::move(func); - result.emplace_front(std::move(res)); - } - - return result; -} - -void spu_cache::add(const spu_program& func) -{ - if (!m_file) - { - return; - } - - be_t size = ::size32(func.data); - be_t addr = func.entry_point; - - // Add CRC (forced non-zero) - size |= std::max(calculate_crc16(reinterpret_cast(func.data.data()), size * 4), 1) << 16; - - const fs::iovec_clone gather[3] - { - {&size, sizeof(size)}, - {&addr, sizeof(addr)}, - {func.data.data(), func.data.size() * 4} - }; - - // Append data - m_file.write_gather(gather, 3); -} - -void spu_cache::initialize(bool build_existing_cache) -{ - spu_runtime::g_interpreter = spu_runtime::g_gateway; - - if (g_cfg.core.spu_decoder == spu_decoder_type::_static || g_cfg.core.spu_decoder == spu_decoder_type::dynamic) - { - for (auto& x : *spu_runtime::g_dispatcher) - { - x.raw() = spu_runtime::tr_interpreter; - } - } - - const std::string ppu_cache = rpcs3::cache::get_ppu_cache(); - - if (ppu_cache.empty()) - { - return; - } - - // SPU cache file (version + block size type) - const std::string loc = ppu_cache + "spu-" + fmt::to_lower(g_cfg.core.spu_block_size.to_string()) + "-v1-tane.dat"; - - spu_cache cache(loc); - - if (!cache) - { - spu_log.error("Failed to initialize SPU cache at: %s", loc); - return; - } - - // Read cache - auto func_list = cache.get(); - atomic_t fnext{}; - atomic_t fail_flag{0}; - - auto data_list = g_fxo->get().precompile_funcs.pop_all(); - g_fxo->get().collect_funcs_to_precompile = false; - - u32 total_precompile = 0; - - for (auto& sec : data_list) - { - total_precompile += sec.funcs.size(); - } - - const bool spu_precompilation_enabled = func_list.empty() && g_cfg.core.spu_cache && g_cfg.core.llvm_precompilation; - - if (spu_precompilation_enabled) - { - // What compiles in this case goes straight to disk - g_fxo->get() = std::move(cache); - } - else if (!build_existing_cache) - { - return; - } - else - { - total_precompile = 0; - data_list = {}; - } - - atomic_t data_indexer = 0; - - if (g_cfg.core.spu_decoder == spu_decoder_type::dynamic || g_cfg.core.spu_decoder == spu_decoder_type::llvm) - { - if (auto compiler = spu_recompiler_base::make_llvm_recompiler(11)) - { - compiler->init(); - - if (compiler->compile({}) && spu_runtime::g_interpreter) - { - spu_log.success("SPU Runtime: Built the interpreter."); - - if (g_cfg.core.spu_decoder != spu_decoder_type::llvm) - { - return; - } - } - else - { - spu_log.fatal("SPU Runtime: Failed to build the interpreter."); - } - } - } - - u32 worker_count = 0; - - std::optional progr; - - u32 total_funcs = 0; - - if (g_cfg.core.spu_decoder == spu_decoder_type::asmjit || g_cfg.core.spu_decoder == spu_decoder_type::llvm) - { - const u32 add_count = ::size32(func_list) + total_precompile; - - if (add_count) - { - total_funcs = build_existing_cache ? add_count : 0; - } - - worker_count = std::min(rpcs3::utils::get_max_threads(), add_count); - } - - atomic_t pending_progress = 0; - atomic_t showing_progress = false; - - if (!g_progr_ptotal) - { - g_progr_ptotal += total_funcs; - showing_progress.release(true); - progr.emplace("Building SPU cache..."); - } - - named_thread_group workers("SPU Worker ", worker_count, [&]() -> uint - { -#ifdef __APPLE__ - pthread_jit_write_protect_np(false); -#endif - // Set low priority - thread_ctrl::scoped_priority low_prio(-1); - - // Initialize compiler instances for parallel compilation - std::unique_ptr compiler; - - if (g_cfg.core.spu_decoder == spu_decoder_type::asmjit) - { - compiler = spu_recompiler_base::make_asmjit_recompiler(); - } - else if (g_cfg.core.spu_decoder == spu_decoder_type::llvm) - { - compiler = spu_recompiler_base::make_llvm_recompiler(); - } - - compiler->init(); - - // How much every thread compiled - uint result = 0; - - // Fake LS - std::vector> ls(0x10000); - - usz func_i = fnext++; - - // Ensure some actions are performed on a single thread - const bool is_first_thread = func_i == 0; - - // Build functions - for (; func_i < func_list.size(); func_i = fnext++, (showing_progress ? g_progr_pdone : pending_progress) += build_existing_cache ? 1 : 0) - { - const spu_program& func = std::as_const(func_list)[func_i]; - - if (Emu.IsStopped() || fail_flag) - { - continue; - } - - // Get data start - const u32 start = func.lower_bound; - const u32 size0 = ::size32(func.data); - - be_t hash_start; - { - sha1_context ctx; - u8 output[20]; - - sha1_starts(&ctx); - sha1_update(&ctx, reinterpret_cast(func.data.data()), func.data.size() * 4); - sha1_finish(&ctx, output); - std::memcpy(&hash_start, output, sizeof(hash_start)); - } - - // Check hash against allowed bounds - const bool inverse_bounds = g_cfg.core.spu_llvm_lower_bound > g_cfg.core.spu_llvm_upper_bound; - - if ((!inverse_bounds && (hash_start < g_cfg.core.spu_llvm_lower_bound || hash_start > g_cfg.core.spu_llvm_upper_bound)) || - (inverse_bounds && (hash_start < g_cfg.core.spu_llvm_lower_bound && hash_start > g_cfg.core.spu_llvm_upper_bound))) - { - spu_log.error("[Debug] Skipped function %s", fmt::base57(hash_start)); - result++; - continue; - } - - // Initialize LS with function data only - for (u32 i = 0, pos = start; i < size0; i++, pos += 4) - { - ls[pos / 4] = std::bit_cast>(func.data[i]); - } - - // Call analyser - spu_program func2 = compiler->analyse(ls.data(), func.entry_point); - - if (func2 != func) - { - spu_log.error("[0x%05x] SPU Analyser failed, %u vs %u", func2.entry_point, func2.data.size(), size0); - } - else if (!compiler->compile(std::move(func2))) - { - // Likely, out of JIT memory. Signal to prevent further building. - fail_flag |= 1; - continue; - } - - // Clear fake LS - std::memset(ls.data() + start / 4, 0, 4 * (size0 - 1)); - - result++; - - if (is_first_thread && !showing_progress) - { - if (!g_progr.load() && !g_progr_ptotal && !g_progr_ftotal) - { - showing_progress = true; - g_progr_pdone += pending_progress.exchange(0); - g_progr_ptotal += total_funcs; - progr.emplace("Building SPU cache..."); - } - } - else if (showing_progress && pending_progress) - { - // Cover missing progress due to a race - g_progr_pdone += pending_progress.exchange(0); - } - } - - u32 last_sec_idx = umax; - - for (func_i = data_indexer++;; func_i = data_indexer++, (showing_progress ? g_progr_pdone : pending_progress) += build_existing_cache ? 1 : 0) - { - u32 passed_count = 0; - u32 func_addr = 0; - u32 next_func = 0; - u32 sec_addr = umax; - u32 sec_idx = 0; - std::basic_string_view inst_data; - - // Try to get the data this index points to - for (auto& sec : data_list) - { - if (func_i < passed_count + sec.funcs.size()) - { - const u32 func_idx = func_i - passed_count; - sec_addr = sec.vaddr; - func_addr = ::at32(sec.funcs, func_idx); - inst_data = sec.inst_data; - next_func = sec.funcs.size() >= func_idx ? sec_addr + inst_data.size() * 4 : sec.funcs[func_idx]; - break; - } - - passed_count += sec.funcs.size(); - sec_idx++; - } - - if (sec_addr == umax) - { - // End of compilation for thread - break; - } - - if (Emu.IsStopped() || fail_flag) - { - continue; - } - - if (last_sec_idx != sec_idx) - { - if (last_sec_idx != umax) - { - // Clear fake LS of previous section - auto& sec = data_list[last_sec_idx]; - std::memset(ls.data() + sec.vaddr / 4, 0, sec.inst_data.size() * 4); - } - - // Initialize LS with the entire section data - for (u32 i = 0, pos = sec_addr; i < inst_data.size(); i++, pos += 4) - { - ls[pos / 4] = std::bit_cast>(inst_data[i]); - } - - last_sec_idx = sec_idx; - } - - u32 block_addr = func_addr; - - std::map> targets; - - // Call analyser - spu_program func2 = compiler->analyse(ls.data(), block_addr, &targets); - - while (!func2.data.empty()) - { - const u32 last_inst = std::bit_cast>(func2.data.back()); - const u32 prog_size = func2.data.size(); - - if (!compiler->compile(std::move(func2))) - { - // Likely, out of JIT memory. Signal to prevent further building. - fail_flag |= 1; - break; - } - - result++; - - const u32 start_new = block_addr + prog_size * 4; - - if (start_new >= next_func || (start_new == next_func - 4 && ls[start_new / 4] == 0x200000u)) - { - // Completed - break; - } - - if (auto type = g_spu_itype.decode(last_inst); - type == spu_itype::BRSL || type == spu_itype::BRASL || type == spu_itype::BISL || type == spu_itype::SYNC) - { - if (ls[start_new / 4] && g_spu_itype.decode(ls[start_new / 4]) != spu_itype::UNK) - { - spu_log.notice("Precompiling fallthrough to 0x%05x", start_new); - func2 = compiler->analyse(ls.data(), start_new, &targets); - block_addr = start_new; - continue; - } - } - - if (targets.empty()) - { - break; - } - - const auto upper = targets.upper_bound(func_addr); - - if (upper == targets.begin()) - { - break; - } - - u32 new_entry = umax; - - // Find the lowest target in the space in-between - for (auto it = std::prev(upper); it != targets.end() && it->first < start_new && new_entry > start_new; it++) - { - for (u32 target : it->second) - { - if (target >= start_new && target < next_func) - { - if (target < new_entry) - { - new_entry = target; - - if (new_entry == start_new) - { - // Cannot go lower - break; - } - } - } - } - } - - if (new_entry != umax && !spu_thread::is_exec_code(new_entry, { reinterpret_cast(ls.data()), SPU_LS_SIZE })) - { - new_entry = umax; - } - - if (new_entry == umax) - { - new_entry = start_new; - - while (new_entry < next_func && (ls[start_new / 4] < 0x3fffc || !spu_thread::is_exec_code(new_entry, { reinterpret_cast(ls.data()), SPU_LS_SIZE }))) - { - new_entry += 4; - } - - if (new_entry >= next_func || (new_entry == next_func - 4 && ls[new_entry / 4] == 0x200000u)) - { - // Completed - break; - } - } - - - spu_log.notice("Precompiling filler space at 0x%05x (next=0x%05x)", new_entry, next_func); - func2 = compiler->analyse(ls.data(), new_entry, &targets); - block_addr = new_entry; - } - - if (is_first_thread && !showing_progress) - { - if (!g_progr.load() && !g_progr_ptotal && !g_progr_ftotal) - { - showing_progress = true; - g_progr_pdone += pending_progress.exchange(0); - g_progr_ptotal += total_funcs; - progr.emplace("Building SPU cache..."); - } - } - else if (showing_progress && pending_progress) - { - // Cover missing progress due to a race - g_progr_pdone += pending_progress.exchange(0); - } - } - - if (showing_progress && pending_progress) - { - // Cover missing progress due to a race - g_progr_pdone += pending_progress.exchange(0); - } - - return result; - }); - - u32 built_total = 0; - - // Join (implicitly) and print individual results - for (u32 i = 0; i < workers.size(); i++) - { - spu_log.notice("SPU Runtime: Worker %u built %u programs.", i + 1, workers[i]); - built_total += workers[i]; - } - - spu_log.notice("SPU Runtime: Workers built %u programs.", built_total); - - if (Emu.IsStopped()) - { - spu_log.error("SPU Runtime: Cache building aborted."); - return; - } - - if (fail_flag) - { - spu_log.fatal("SPU Runtime: Cache building failed (out of memory)."); - return; - } - - if ((g_cfg.core.spu_decoder == spu_decoder_type::asmjit || g_cfg.core.spu_decoder == spu_decoder_type::llvm) && !func_list.empty()) - { - spu_log.success("SPU Runtime: Built %u functions.", func_list.size()); - - if (g_cfg.core.spu_debug) - { - std::string dump; - dump.reserve(10'000'000); - - std::map, spu_program*> sorted; - - for (auto&& f : func_list) - { - // Interpret as a byte string - std::basic_string_view data = {reinterpret_cast(f.data.data()), f.data.size() * sizeof(u32)}; - - sorted[data] = &f; - } - - std::unordered_set depth_n; - - u32 n_max = 0; - - for (auto&& [bytes, f] : sorted) - { - { - sha1_context ctx; - u8 output[20]; - - sha1_starts(&ctx); - sha1_update(&ctx, bytes.data(), bytes.size()); - sha1_finish(&ctx, output); - fmt::append(dump, "\n\t[%s] ", fmt::base57(output)); - } - - u32 depth_m = 0; - - for (auto&& [data, f2] : sorted) - { - u32 depth = 0; - - if (f2 == f) - { - continue; - } - - for (u32 i = 0; i < bytes.size(); i++) - { - if (i < data.size() && data[i] == bytes[i]) - { - depth++; - } - else - { - break; - } - } - - depth_n.emplace(depth); - depth_m = std::max(depth, depth_m); - } - - fmt::append(dump, "c=%06d,d=%06d ", depth_n.size(), depth_m); - - bool sk = false; - - for (u32 i = 0; i < bytes.size(); i++) - { - if (depth_m == i) - { - dump += '|'; - sk = true; - } - - fmt::append(dump, "%02x", bytes[i]); - - if (i % 4 == 3) - { - if (sk) - { - sk = false; - } - else - { - dump += ' '; - } - - dump += ' '; - } - } - - fmt::append(dump, "\n\t%49s", ""); - - for (u32 i = 0; i < f->data.size(); i++) - { - fmt::append(dump, "%-10s", g_spu_iname.decode(std::bit_cast>(f->data[i]))); - } - - n_max = std::max(n_max, ::size32(depth_n)); - - depth_n.clear(); - } - - spu_log.notice("SPU Cache Dump (max_c=%d): %s", n_max, dump); - } - } - - // Initialize global cache instance - if (g_cfg.core.spu_cache && cache) - { - g_fxo->get() = std::move(cache); - } -} - -bool spu_program::operator==(const spu_program& rhs) const noexcept -{ - // TODO - return entry_point - lower_bound == rhs.entry_point - rhs.lower_bound && data == rhs.data; -} - -bool spu_program::operator<(const spu_program& rhs) const noexcept -{ - const u32 lhs_offs = (entry_point - lower_bound) / 4; - const u32 rhs_offs = (rhs.entry_point - rhs.lower_bound) / 4; - - // Select range for comparison - std::basic_string_view lhs_data(data.data() + lhs_offs, data.size() - lhs_offs); - std::basic_string_view rhs_data(rhs.data.data() + rhs_offs, rhs.data.size() - rhs_offs); - const auto cmp0 = lhs_data.compare(rhs_data); - - if (cmp0 < 0) - return true; - else if (cmp0 > 0) - return false; - - // Compare from address 0 to the point before the entry point (TODO: undesirable) - lhs_data = {data.data(), lhs_offs}; - rhs_data = {rhs.data.data(), rhs_offs}; - const auto cmp1 = lhs_data.compare(rhs_data); - - if (cmp1 < 0) - return true; - else if (cmp1 > 0) - return false; - - // TODO - return lhs_offs < rhs_offs; -} - -spu_runtime::spu_runtime() -{ - // Clear LLVM output - m_cache_path = rpcs3::cache::get_ppu_cache(); - - if (m_cache_path.empty()) - { - return; - } - - fs::create_dir(m_cache_path + "llvm/"); - fs::remove_all(m_cache_path + "llvm/", false); - - if (g_cfg.core.spu_debug) - { - fs::file(m_cache_path + "spu.log", fs::rewrite); - fs::file(m_cache_path + "spu-ir.log", fs::rewrite); - } -} - -spu_item* spu_runtime::add_empty(spu_program&& data) -{ - if (data.data.empty()) - { - return nullptr; - } - - // Store previous item if already added - spu_item* prev = nullptr; - - //Try to add item that doesn't exist yet - const auto ret = m_stuff[data.data[0] >> 12].push_if([&](spu_item& _new, spu_item& _old) - { - if (_new.data == _old.data) - { - prev = &_old; - return false; - } - - return true; - }, std::move(data)); - - if (ret) - { - return ret; - } - - return prev; -} - -spu_function_t spu_runtime::rebuild_ubertrampoline(u32 id_inst) -{ - // Prepare sorted list - static thread_local std::vector, spu_function_t>> m_flat_list; - - // Remember top position - auto stuff_it = ::at32(m_stuff, id_inst >> 12).begin(); - auto stuff_end = ::at32(m_stuff, id_inst >> 12).end(); - { - if (stuff_it->trampoline) - { - return stuff_it->trampoline; - } - - m_flat_list.clear(); - - for (auto it = stuff_it; it != stuff_end; ++it) - { - if (const auto ptr = it->compiled.load()) - { - std::basic_string_view range{it->data.data.data(), it->data.data.size()}; - range.remove_prefix((it->data.entry_point - it->data.lower_bound) / 4); - m_flat_list.emplace_back(range, ptr); - } - else - { - // Pull oneself deeper (TODO) - ++stuff_it; - } - } - } - - std::sort(m_flat_list.begin(), m_flat_list.end(), FN(x.first < y.first)); - - struct work - { - u32 size; - u16 from; - u16 level; - u8* rel32; - decltype(m_flat_list)::iterator beg; - decltype(m_flat_list)::iterator end; - }; - - // Scratch vector - static thread_local std::vector workload; - - // Generate a dispatcher (übertrampoline) - const auto beg = m_flat_list.begin(); - const auto _end = m_flat_list.end(); - const u32 size0 = ::size32(m_flat_list); - - auto result = beg->second; - - if (size0 != 1) - { -#if defined(ARCH_ARM64) - // Allocate some writable executable memory - u8* const wxptr = jit_runtime::alloc(size0 * 128 + 16, 16); - - if (!wxptr) - { - return nullptr; - } - - // Raw assembly pointer - u8* raw = wxptr; - - auto make_jump = [&](asmjit::arm::CondCode op, auto target) - { - // 36 bytes - // Fallback to dispatch if no target - const u64 taddr = target ? reinterpret_cast(target) : reinterpret_cast(tr_dispatch); - - // ldr x9, #16 -> ldr x9, taddr - *raw++ = 0x89; - *raw++ = 0x00; - *raw++ = 0x00; - *raw++ = 0x58; - - if (op == asmjit::arm::CondCode::kAlways) - { - // br x9 - *raw++ = 0x20; - *raw++ = 0x01; - *raw++ = 0x1F; - *raw++ = 0xD6; - - // nop - *raw++ = 0x1F; - *raw++ = 0x20; - *raw++ = 0x03; - *raw++ = 0xD5; - - // nop - *raw++ = 0x1F; - *raw++ = 0x20; - *raw++ = 0x03; - *raw++ = 0xD5; - } - else - { - // b.COND #8 -> b.COND do_branch - switch (op) - { - case asmjit::arm::CondCode::kUnsignedLT: - *raw++ = 0x43; - break; - case asmjit::arm::CondCode::kUnsignedGT: - *raw++ = 0x48; - break; - default: - asm("brk 0x42"); - } - - *raw++ = 0x00; - *raw++ = 0x00; - *raw++ = 0x54; - - // b #16 -> b cont - *raw++ = 0x04; - *raw++ = 0x00; - *raw++ = 0x00; - *raw++ = 0x14; - - // do_branch: br x9 - *raw++ = 0x20; - *raw++ = 0x01; - *raw++ = 0x1f; - *raw++ = 0xD6; - } - - // taddr - std::memcpy(raw, &taddr, 8); - raw += 8; - - // cont: next instruction - }; -#elif defined(ARCH_X64) - // Allocate some writable executable memory - u8* const wxptr = jit_runtime::alloc(size0 * 22 + 14, 16); - - if (!wxptr) - { - return nullptr; - } - - // Raw assembly pointer - u8* raw = wxptr; - - // Write jump instruction with rel32 immediate - auto make_jump = [&](u8 op, auto target) - { - ensure(raw + 8 <= wxptr + size0 * 22 + 16); - - // Fallback to dispatch if no target - const u64 taddr = target ? reinterpret_cast(target) : reinterpret_cast(tr_dispatch); - - // Compute the distance - const s64 rel = taddr - reinterpret_cast(raw) - (op != 0xe9 ? 6 : 5); - - ensure(rel >= s32{smin} && rel <= s32{smax}); - - if (op != 0xe9) - { - // First jcc byte - *raw++ = 0x0f; - ensure((op >> 4) == 0x8); - } - - *raw++ = op; - - const s32 r32 = static_cast(rel); - - std::memcpy(raw, &r32, 4); - raw += 4; - }; -#endif - - workload.clear(); - workload.reserve(size0); - workload.emplace_back(); - workload.back().size = size0; - workload.back().level = 0; - workload.back().from = -1; - workload.back().rel32 = nullptr; - workload.back().beg = beg; - workload.back().end = _end; - - // LS address starting from PC is already loaded into rcx (see spu_runtime::tr_all) - - for (usz i = 0; i < workload.size(); i++) - { - // Get copy of the workload info - auto w = workload[i]; - - // Split range in two parts - auto it = w.beg; - auto it2 = w.beg; - u32 size1 = w.size / 2; - u32 size2 = w.size - size1; - std::advance(it2, w.size / 2); - - while (ensure(w.level < umax)) - { - it = it2; - size1 = w.size - size2; - - if (w.level >= w.beg->first.size()) - { - // Cannot split: smallest function is a prefix of bigger ones (TODO) - break; - } - - const u32 x1 = ::at32(w.beg->first, w.level); - - if (!x1) - { - // Cannot split: some functions contain holes at this level - w.level++; - - // Resort subrange starting from the new level - std::stable_sort(w.beg, w.end, [&](const auto& a, const auto& b) - { - std::basic_string_view lhs = a.first; - std::basic_string_view rhs = b.first; - - lhs.remove_prefix(w.level); - rhs.remove_prefix(w.level); - - return lhs < rhs; - }); - - continue; - } - - // Adjust ranges (forward) - while (it != w.end && x1 == ::at32(it->first, w.level)) - { - it++; - size1++; - } - - if (it == w.end) - { - // Cannot split: words are identical within the range at this level - w.level++; - } - else - { - size2 = w.size - size1; - break; - } - } - - if (w.rel32) - { -#if defined(ARCH_X64) - // Patch rel32 linking it to the current location if necessary - const s32 r32 = ::narrow(raw - w.rel32); - std::memcpy(w.rel32 - 4, &r32, 4); -#elif defined(ARCH_ARM64) - // Rewrite jump address - { - u64 raw64 = reinterpret_cast(raw); - memcpy(w.rel32 - 8, &raw64, 8); - } -#else -#error "Unimplemented" -#endif - } - - if (w.level >= w.beg->first.size() || w.level >= it->first.size()) - { - // If functions cannot be compared, assume smallest function - spu_log.error("Trampoline simplified at ??? (level=%u)", w.level); -#if defined(ARCH_X64) - make_jump(0xe9, w.beg->second); // jmp rel32 -#elif defined(ARCH_ARM64) - u64 branch_target = reinterpret_cast(w.beg->second); - make_jump(asmjit::arm::CondCode::kAlways, branch_target); -#else -#error "Unimplemented" -#endif - continue; - } - - // Value for comparison - const u32 x = ::at32(it->first, w.level); - - // Adjust ranges (backward) - while (it != m_flat_list.begin()) - { - it--; - - if (w.level >= it->first.size()) - { - it = m_flat_list.end(); - break; - } - - if (::at32(it->first, w.level) != x) - { - it++; - break; - } - - ensure(it != w.beg); - size1--; - size2++; - } - - if (it == m_flat_list.end()) - { - spu_log.error("Trampoline simplified (II) at ??? (level=%u)", w.level); -#if defined(ARCH_X64) - make_jump(0xe9, w.beg->second); // jmp rel32 -#elif defined(ARCH_ARM64) - u64 branch_target = reinterpret_cast(w.beg->second); - make_jump(asmjit::arm::CondCode::kAlways, branch_target); -#else -#error "Unimplemented" -#endif - continue; - } - - // Emit 32-bit comparison -#if defined(ARCH_X64) - ensure(raw + 12 <= wxptr + size0 * 22 + 16); // "Asm overflow" -#elif defined(ARCH_ARM64) - ensure(raw + (4 * 4) <= wxptr + size0 * 128 + 16); -#else -#error "Unimplemented" -#endif - - if (w.from != w.level) - { - // If necessary (level has advanced), emit load: mov eax, [rcx + addr] - const u32 cmp_lsa = w.level * 4u; -#if defined(ARCH_X64) - if (cmp_lsa < 0x80) - { - *raw++ = 0x8b; - *raw++ = 0x41; - *raw++ = ::narrow(cmp_lsa); - } - else - { - *raw++ = 0x8b; - *raw++ = 0x81; - std::memcpy(raw, &cmp_lsa, 4); - raw += 4; - } -#elif defined(ARCH_ARM64) - // ldr w9, #8 - *raw++ = 0x49; - *raw++ = 0x00; - *raw++ = 0x00; - *raw++ = 0x18; - - // b #8 - *raw++ = 0x02; - *raw++ = 0x00; - *raw++ = 0x00; - *raw++ = 0x14; - - // cmp_lsa - std::memcpy(raw, &cmp_lsa, 4); - raw += 4; - - // ldr w1, [x7, x9] - *raw++ = 0xE1; - *raw++ = 0x68; - *raw++ = 0x69; - *raw++ = 0xB8; -#else -#error "Unimplemented" -#endif - } - - // Emit comparison: cmp eax, imm32 -#if defined(ARCH_X64) - *raw++ = 0x3d; - std::memcpy(raw, &x, 4); - raw += 4; -#elif defined(ARCH_ARM64) - // ldr w9, #8 - *raw++ = 0x49; - *raw++ = 0x00; - *raw++ = 0x00; - *raw++ = 0x18; - - // b #8 - *raw++ = 0x02; - *raw++ = 0x00; - *raw++ = 0x00; - *raw++ = 0x14; - - // x - std::memcpy(raw, &x, 4); - raw += 4; - - // cmp w1, w9 - *raw++ = 0x3f; - *raw++ = 0x00; - *raw++ = 0x09; - *raw++ = 0x6B; -#else -#error "Unimplemented" -#endif - - // Low subrange target - if (size1 == 1) - { -#if defined(ARCH_X64) - make_jump(0x82, w.beg->second); // jb rel32 -#elif defined(ARCH_ARM64) - u64 branch_target = reinterpret_cast(w.beg->second); - make_jump(asmjit::arm::CondCode::kUnsignedLT, branch_target); -#else -#error "Unimplemented" -#endif - } - else - { -#if defined(ARCH_X64) - make_jump(0x82, raw); // jb rel32 (stub) -#elif defined(ARCH_ARM64) - make_jump(asmjit::arm::CondCode::kUnsignedLT, raw); -#else -#error "Unimplemented" -#endif - auto& to = workload.emplace_back(w); - to.end = it; - to.size = size1; - to.rel32 = raw; - to.from = w.level; - } - - // Second subrange target - if (size2 == 1) - { -#if defined(ARCH_X64) - make_jump(0xe9, it->second); // jmp rel32 -#elif defined(ARCH_ARM64) - u64 branch_target = reinterpret_cast(it->second); - make_jump(asmjit::arm::CondCode::kAlways, branch_target); -#else -#error "Unimplemented" -#endif - } - else - { - it2 = it; - - // Select additional midrange for equality comparison - while (it2 != w.end && ::at32(it2->first, w.level) == x) - { - size2--; - it2++; - } - - if (it2 != w.end) - { - // High subrange target - if (size2 == 1) - { -#if defined(ARCH_X64) - make_jump(0x87, it2->second); // ja rel32 -#elif defined(ARCH_ARM64) - u64 branch_target = reinterpret_cast(it2->second); - make_jump(asmjit::arm::CondCode::kUnsignedGT, branch_target); -#else -#throw "Unimplemented" -#endif - } - else - { -#if defined(ARCH_X64) - make_jump(0x87, raw); // ja rel32 (stub) -#elif defined(ARCH_ARM64) - make_jump(asmjit::arm::CondCode::kUnsignedGT, raw); -#else -#error "Unimplemented" -#endif - auto& to = workload.emplace_back(w); - to.beg = it2; - to.size = size2; - to.rel32 = raw; - to.from = w.level; - } - - const u32 size3 = w.size - size1 - size2; - - if (size3 == 1) - { -#if defined(ARCH_X64) - make_jump(0xe9, it->second); // jmp rel32 -#elif defined(ARCH_ARM64) - u64 branch_target = reinterpret_cast(it->second); - make_jump(asmjit::arm::CondCode::kAlways, branch_target); -#else -#error "Unimplemented" -#endif - } - else - { -#if defined(ARCH_X64) - make_jump(0xe9, raw); // jmp rel32 (stub) -#elif defined(ARCH_ARM64) - make_jump(asmjit::arm::CondCode::kAlways, raw); -#else -#error "Unimplemented" -#endif - auto& to = workload.emplace_back(w); - to.beg = it; - to.end = it2; - to.size = size3; - to.rel32 = raw; - to.from = w.level; - } - } - else - { -#if defined(ARCH_X64) - make_jump(0xe9, raw); // jmp rel32 (stub) -#elif defined(ARCH_ARM64) - make_jump(asmjit::arm::CondCode::kAlways, raw); -#else -#error "Unimplemented" -#endif - auto& to = workload.emplace_back(w); - to.beg = it; - to.size = w.size - size1; - to.rel32 = raw; - to.from = w.level; - } - } - } - - workload.clear(); - result = reinterpret_cast(reinterpret_cast(wxptr)); - - std::string fname; - fmt::append(fname, "__ub%u", m_flat_list.size()); - jit_announce(wxptr, raw - wxptr, fname); - } - - if (auto _old = stuff_it->trampoline.compare_and_swap(nullptr, result)) - { - return _old; - } - - // Install ubertrampoline - auto& insert_to = ::at32(*spu_runtime::g_dispatcher, id_inst >> 12); - - auto _old = insert_to.load(); - - do - { - // Make sure we are replacing an older ubertrampoline but not newer one - if (_old != tr_dispatch) - { - bool ok = false; - - for (auto it = stuff_it; it != stuff_end; ++it) - { - if (it->trampoline == _old) - { - ok = true; - break; - } - } - - if (!ok) - { - return result; - } - } - } - while (!insert_to.compare_exchange(_old, result)); - - return result; -} - -spu_function_t spu_runtime::find(const u32* ls, u32 addr) const -{ - const u32 index = ls[addr / 4] >> 12; - for (const auto& item : ::at32(m_stuff, index)) - { - if (const auto ptr = item.compiled.load()) - { - std::basic_string_view range{item.data.data.data(), item.data.data.size()}; - range.remove_prefix((item.data.entry_point - item.data.lower_bound) / 4); - - if (addr / 4 + range.size() > 0x10000) - { - continue; - } - - if (range.compare(0, range.size(), ls + addr / 4, range.size()) == 0) - { - return ptr; - } - } - } - - return nullptr; -} - -spu_function_t spu_runtime::make_branch_patchpoint(u16 data) const -{ -#if defined(ARCH_X64) - u8* const raw = jit_runtime::alloc(16, 16); - - if (!raw) - { - return nullptr; - } - - // Save address of the following jmp (GHC CC 3rd argument) - raw[0] = 0x4c; // lea r12, [rip+1] - raw[1] = 0x8d; - raw[2] = 0x25; - raw[3] = 0x01; - raw[4] = 0x00; - raw[5] = 0x00; - raw[6] = 0x00; - - raw[7] = 0x90; // nop - - // Jump to spu_recompiler_base::branch - raw[8] = 0xe9; - // Compute the distance - const s64 rel = reinterpret_cast(tr_branch) - reinterpret_cast(raw + 8) - 5; - std::memcpy(raw + 9, &rel, 4); - raw[13] = 0xcc; - raw[14] = data >> 8; - raw[15] = data & 0xff; - - return reinterpret_cast(raw); -#elif defined(ARCH_ARM64) -#if defined(__APPLE__) - pthread_jit_write_protect_np(false); -#endif - - u8* const patch_fn = ensure(jit_runtime::alloc(36, 16)); - u8* raw = patch_fn; - - // adr x21, #16 - *raw++ = 0x95; - *raw++ = 0x00; - *raw++ = 0x00; - *raw++ = 0x10; - - // nop x3 - for (int i = 0; i < 3; i++) - { - *raw++ = 0x1F; - *raw++ = 0x20; - *raw++ = 0x03; - *raw++ = 0xD5; - } - - // ldr x9, #8 - *raw++ = 0x49; - *raw++ = 0x00; - *raw++ = 0x00; - *raw++ = 0x58; - - // br x9 - *raw++ = 0x20; - *raw++ = 0x01; - *raw++ = 0x1F; - *raw++ = 0xD6; - - u64 branch_target = reinterpret_cast(tr_branch); - std::memcpy(raw, &branch_target, 8); - raw += 8; - - *raw++ = static_cast(data >> 8); - *raw++ = static_cast(data & 0xff); - -#if defined(__APPLE__) - pthread_jit_write_protect_np(true); -#endif - - // Flush all cache lines after potentially writing executable code - asm("ISB"); - asm("DSB ISH"); - - return reinterpret_cast(patch_fn); -#else -#error "Unimplemented" -#endif -} - -spu_recompiler_base::spu_recompiler_base() -{ -} - -spu_recompiler_base::~spu_recompiler_base() -{ -} - -void spu_recompiler_base::dispatch(spu_thread& spu, void*, u8* rip) -{ - // If code verification failed from a patched patchpoint, clear it with a dispatcher jump - if (rip) - { -#if defined(ARCH_X64) - const s64 rel = reinterpret_cast(spu_runtime::tr_all) - reinterpret_cast(rip - 8) - 5; - - union - { - u8 bytes[8]; - u64 result; - }; - - bytes[0] = 0xe9; // jmp rel32 - std::memcpy(bytes + 1, &rel, 4); - bytes[5] = 0x66; // lnop (2 bytes) - bytes[6] = 0x90; - bytes[7] = 0x90; - - atomic_storage::release(*reinterpret_cast(rip - 8), result); -#elif defined(ARCH_ARM64) - union - { - u8 bytes[16]; - u128 result; - }; - - // ldr x9, #8 - bytes[0] = 0x49; - bytes[1] = 0x00; - bytes[2] = 0x00; - bytes[3] = 0x58; - - // br x9 - bytes[4] = 0x20; - bytes[5] = 0x01; - bytes[6] = 0x1F; - bytes[7] = 0xD6; - - const u64 target = reinterpret_cast(spu_runtime::tr_all); - std::memcpy(bytes + 8, &target, 8); -#if defined(__APPLE__) - pthread_jit_write_protect_np(false); -#endif - atomic_storage::release(*reinterpret_cast(rip), result); -#if defined(__APPLE__) - pthread_jit_write_protect_np(true); -#endif - - // Flush all cache lines after potentially writing executable code - asm("ISB"); - asm("DSB ISH"); -#else -#error "Unimplemented" -#endif - } - - // Second attempt (recover from the recursion after repeated unsuccessful trampoline call) - if (spu.block_counter != spu.block_recover && &dispatch != ::at32(*spu_runtime::g_dispatcher, spu._ref>(spu.pc) >> 12)) - { - spu.block_recover = spu.block_counter; - return; - } - - spu.jit->init(); - - // Compile - if (spu._ref(spu.pc) == 0u) - { - spu_runtime::g_escape(&spu); - return; - } - - const auto func = spu.jit->compile(spu.jit->analyse(spu._ptr(0), spu.pc)); - - if (!func) - { - spu_log.fatal("[0x%05x] Compilation failed.", spu.pc); - return; - } - - // Diagnostic - if (g_cfg.core.spu_block_size == spu_block_size_type::giga) - { - const v128 _info = spu.stack_mirror[(spu.gpr[1]._u32[3] & 0x3fff0) >> 4]; - - if (_info._u64[0] + 1) - { - spu_log.trace("Called from 0x%x", _info._u32[2] - 4); - } - } -#if defined(__APPLE__) - pthread_jit_write_protect_np(true); -#endif - -#if defined(ARCH_ARM64) - // Flush all cache lines after potentially writing executable code - asm("ISB"); - asm("DSB ISH"); -#endif - spu_runtime::g_tail_escape(&spu, func, nullptr); -} - -void spu_recompiler_base::branch(spu_thread& spu, void*, u8* rip) -{ -#if defined(ARCH_X64) - if (const u32 ls_off = ((rip[6] << 8) | rip[7]) * 4) -#elif defined(ARCH_ARM64) - if (const u32 ls_off = ((rip[16] << 8) | rip[17]) * 4) // See branch_patchpoint `data` -#else -#error "Unimplemented" -#endif - { - spu_log.todo("Special branch patchpoint hit.\nPlease report to the developer (0x%05x).", ls_off); - } - - // Find function - const auto func = spu.jit->get_runtime().find(static_cast(spu._ptr(0)), spu.pc); - - if (!func) - { - return; - } - -#if defined(ARCH_X64) - // Overwrite jump to this function with jump to the compiled function - const s64 rel = reinterpret_cast(func) - reinterpret_cast(rip) - 5; - - union - { - u8 bytes[8]; - u64 result; - }; - - if (rel >= s32{smin} && rel <= s32{smax}) - { - const s64 rel8 = (rel + 5) - 2; - - if (rel8 >= s8{smin} && rel8 <= s8{smax}) - { - bytes[0] = 0xeb; // jmp rel8 - bytes[1] = static_cast(rel8); - std::memset(bytes + 2, 0xcc, 4); - } - else - { - bytes[0] = 0xe9; // jmp rel32 - std::memcpy(bytes + 1, &rel, 4); - bytes[5] = 0xcc; - } - - bytes[6] = rip[6]; - bytes[7] = rip[7]; - } - else - { - fmt::throw_exception("Impossible far jump: %p -> %p", rip, func); - } - - atomic_storage::release(*reinterpret_cast(rip), result); -#elif defined(ARCH_ARM64) - union - { - u8 bytes[16]; - u128 result; - }; - - // ldr x9, #8 - bytes[0] = 0x49; - bytes[1] = 0x00; - bytes[2] = 0x00; - bytes[3] = 0x58; - - // br x9 - bytes[4] = 0x20; - bytes[5] = 0x01; - bytes[6] = 0x1F; - bytes[7] = 0xD6; - - const u64 target = reinterpret_cast(func); - std::memcpy(bytes + 8, &target, 8); -#if defined(__APPLE__) - pthread_jit_write_protect_np(false); -#endif - atomic_storage::release(*reinterpret_cast(rip), result); -#if defined(__APPLE__) - pthread_jit_write_protect_np(true); -#endif - - // Flush all cache lines after potentially writing executable code - asm("ISB"); - asm("DSB ISH"); -#else -#error "Unimplemented" -#endif - - spu_runtime::g_tail_escape(&spu, func, rip); -} - -void spu_recompiler_base::old_interpreter(spu_thread& spu, void* ls, u8* /*rip*/) -{ - if (g_cfg.core.spu_decoder != spu_decoder_type::_static) - { - fmt::throw_exception("Invalid SPU decoder"); - } - - // Select opcode table - const auto& table = g_fxo->get(); - - // LS pointer - const auto base = static_cast(ls); - - while (true) - { - if (spu.state) [[unlikely]] - { - if (spu.check_state()) - break; - } - - const u32 op = *reinterpret_cast*>(base + spu.pc); - if (table.decode(op)(spu, {op})) - spu.pc += 4; - } -} - -std::vector spu_thread::discover_functions(u32 base_addr, std::span ls, bool is_known_addr, u32 /*entry*/) -{ - std::vector calls; - std::vector branches; - - calls.reserve(100); - - // Discover functions - // Use the most simple method: search for instructions that calls them - // And then filter invalid cases - // TODO: Does not detect jumptables or fixed-addr indirect calls - const v128 brasl_mask = is_known_addr ? v128::from32p(0x62u << 23) : v128::from32p(umax); - - for (u32 i = utils::align(base_addr, 0x10); i < std::min(base_addr + ls.size(), 0x3FFF0); i += 0x10) - { - // Search for BRSL LR and BRASL LR or BR - // TODO: BISL - const v128 inst = read_from_ptr>(ls.data(), i - base_addr); - const v128 cleared_i16 = gv_and32(inst, v128::from32p(utils::rol32(~0xffff, 7))); - const v128 eq_brsl = gv_eq32(cleared_i16, v128::from32p(0x66u << 23)); - const v128 eq_brasl = gv_eq32(cleared_i16, brasl_mask); - const v128 eq_br = gv_eq32(cleared_i16, v128::from32p(0x64u << 23)); - const v128 result = eq_brsl | eq_brasl; - - if (!gv_testz(result)) - { - for (u32 j = 0; j < 4; j++) - { - if (result.u32r[j]) - { - calls.push_back(i + j * 4); - } - } - } - - if (!gv_testz(eq_br)) - { - for (u32 j = 0; j < 4; j++) - { - if (eq_br.u32r[j]) - { - branches.push_back(i + j * 4); - } - } - } - } - - calls.erase(std::remove_if(calls.begin(), calls.end(), [&](u32 caller) - { - // Check the validity of both the callee code and the following caller code - return !is_exec_code(caller, ls, base_addr) || !is_exec_code(caller + 4, ls, base_addr); - }), calls.end()); - - branches.erase(std::remove_if(branches.begin(), branches.end(), [&](u32 caller) - { - // Check the validity of the callee code - return !is_exec_code(caller, ls, base_addr); - }), branches.end()); - - std::vector addrs; - - for (u32 addr : calls) - { - const spu_opcode_t op{read_from_ptr>(ls, addr - base_addr)}; - - const u32 func = op_branch_targets(addr, op)[0]; - - if (func == umax || addr + 4 == func || func == addr || std::count(addrs.begin(), addrs.end(), func)) - { - continue; - } - - addrs.push_back(func); - } - - for (u32 addr : branches) - { - const spu_opcode_t op{read_from_ptr>(ls, addr - base_addr)}; - - const u32 func = op_branch_targets(addr, op)[0]; - - if (func == umax || addr + 4 == func || func == addr || !addr) - { - continue; - } - - // Search for AI R1, +x or OR R3/4, Rx, 0 - // Reasoning: AI R1, +x means stack pointer restoration, branch after that is likely a tail call - // R3 and R4 are common function arguments because they are the first two - for (u32 back = addr - 4, it = 10; it && back >= base_addr && back < std::min(base_addr + ls.size(), 0x3FFF0); it--, back -= 4) - { - const spu_opcode_t test_op{read_from_ptr>(ls, back - base_addr)}; - const auto type = g_spu_itype.decode(test_op.opcode); - - if (type & spu_itype::branch) - { - break; - } - - bool is_tail = false; - - if (type == spu_itype::AI && test_op.rt == 1u && test_op.ra == 1u) - { - if (test_op.si10 <= 0) - { - break; - } - - is_tail = true; - } - else if (!(type & spu_itype::zregmod)) - { - const u32 op_rt = type & spu_itype::_quadrop ? +test_op.rt4 : +test_op.rt; - - if (op_rt >= 80u && (type != spu_itype::LQD || test_op.ra != 1u)) - { - // Modifying non-volatile registers, not a call (and not context restoration) - break; - } - - //is_tail = op_rt == 3u || op_rt == 4u; - } - - if (!is_tail) - { - continue; - } - - if (std::count(addrs.begin(), addrs.end(), func)) - { - break; - } - - addrs.push_back(func); - break; - } - } - - std::sort(addrs.begin(), addrs.end()); - - return addrs; -} - -spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, std::map>* out_target_list) -{ - // Result: addr + raw instruction data - spu_program result; - result.data.reserve(10000); - result.entry_point = entry_point; - result.lower_bound = entry_point; - - // Initialize block entries - m_block_info.reset(); - m_block_info.set(entry_point / 4); - m_entry_info.reset(); - m_entry_info.set(entry_point / 4); - m_ret_info.reset(); - - // Simple block entry workload list - workload.clear(); - workload.push_back(entry_point); - - std::memset(m_regmod.data(), 0xff, sizeof(m_regmod)); - std::memset(m_use_ra.data(), 0xff, sizeof(m_use_ra)); - std::memset(m_use_rb.data(), 0xff, sizeof(m_use_rb)); - std::memset(m_use_rc.data(), 0xff, sizeof(m_use_rc)); - m_targets.clear(); - m_preds.clear(); - m_preds[entry_point]; - m_bbs.clear(); - m_chunks.clear(); - m_funcs.clear(); - - // Value flags (TODO: only is_const is implemented) - enum class vf : u32 - { - is_const, - is_mask, - is_rel, - - __bitset_enum_max - }; - - // Weak constant propagation context (for guessing branch targets) - std::array, 128> vflags{}; - - // Associated constant values for 32-bit preferred slot - std::array values; - - // SYNC instruction found - bool sync = false; - - u32 hbr_loc = 0; - u32 hbr_tg = -1; - - // Result bounds - u32 lsa = entry_point; - u32 limit = 0x40000; - - if (g_cfg.core.spu_block_size == spu_block_size_type::giga) - { - } - - for (u32 wi = 0, wa = workload[0]; wi < workload.size();) - { - const auto next_block = [&] - { - // Reset value information - vflags.fill({}); - sync = false; - hbr_loc = 0; - hbr_tg = -1; - wi++; - - if (wi < workload.size()) - { - wa = workload[wi]; - } - }; - - const u32 pos = wa; - - const auto add_block = [&](u32 target) - { - // Validate new target (TODO) - if (target >= lsa && target < limit) - { - // Check for redundancy - if (!m_block_info[target / 4]) - { - m_block_info[target / 4] = true; - workload.push_back(target); - } - - // Add predecessor - if (m_preds[target].find_first_of(pos) + 1 == 0) - { - m_preds[target].push_back(pos); - } - } - }; - - if (pos < lsa || pos >= limit) - { - // Don't analyse if already beyond the limit - next_block(); - continue; - } - - const u32 data = ls[pos / 4]; - const auto op = spu_opcode_t{data}; - - wa += 4; - - m_targets.erase(pos); - - // Fill register access info - if (auto iflags = g_spu_iflag.decode(data)) - { - if (+iflags & +spu_iflag::use_ra) - m_use_ra[pos / 4] = op.ra; - if (+iflags & +spu_iflag::use_rb) - m_use_rb[pos / 4] = op.rb; - if (+iflags & +spu_iflag::use_rc) - m_use_rc[pos / 4] = op.rc; - } - - // Analyse instruction - switch (const auto type = g_spu_itype.decode(data)) - { - case spu_itype::UNK: - case spu_itype::DFCEQ: - case spu_itype::DFCMEQ: - case spu_itype::DFCGT: - case spu_itype::DFCMGT: - case spu_itype::DFTSV: - { - // Stop before invalid instructions (TODO) - next_block(); - continue; - } - - case spu_itype::SYNC: - case spu_itype::STOP: - case spu_itype::STOPD: - { - if (data == 0) - { - // Stop before null data - next_block(); - continue; - } - - if (g_cfg.core.spu_block_size == spu_block_size_type::safe) - { - // Stop on special instructions (TODO) - m_targets[pos]; - next_block(); - break; - } - - if (type == spu_itype::SYNC) - { - // Remember - sync = true; - } - - break; - } - - case spu_itype::IRET: - { - if (op.d && op.e) - { - spu_log.error("[0x%x] Invalid interrupt flags (DE)", pos); - } - - m_targets[pos]; - next_block(); - break; - } - - case spu_itype::BI: - case spu_itype::BISL: - case spu_itype::BISLED: - case spu_itype::BIZ: - case spu_itype::BINZ: - case spu_itype::BIHZ: - case spu_itype::BIHNZ: - { - if (op.d && op.e) - { - spu_log.error("[0x%x] Invalid interrupt flags (DE)", pos); - } - - const auto af = vflags[op.ra]; - const auto av = values[op.ra]; - const bool sl = type == spu_itype::BISL || type == spu_itype::BISLED; - - if (sl) - { - m_regmod[pos / 4] = op.rt; - vflags[op.rt] = +vf::is_const; - values[op.rt] = pos + 4; - } - - if (af & vf::is_const) - { - const u32 target = spu_branch_target(av); - - spu_log.warning("[0x%x] At 0x%x: indirect branch to 0x%x%s", entry_point, pos, target, op.d ? " (D)" : op.e ? " (E)" : ""); - - if (type == spu_itype::BI && target == pos + 4 && op.d) - { - // Disable interrupts idiom - break; - } - - m_targets[pos].push_back(target); - - if (g_cfg.core.spu_block_size == spu_block_size_type::giga) - { - if (sync) - { - spu_log.notice("[0x%x] At 0x%x: ignoring %scall to 0x%x (SYNC)", entry_point, pos, sl ? "" : "tail ", target); - - if (target > entry_point) - { - limit = std::min(limit, target); - } - } - else - { - m_entry_info[target / 4] = true; - add_block(target); - } - } - else if (target > entry_point) - { - limit = std::min(limit, target); - } - - if (sl && g_cfg.core.spu_block_size != spu_block_size_type::safe) - { - m_ret_info[pos / 4 + 1] = true; - m_entry_info[pos / 4 + 1] = true; - m_targets[pos].push_back(pos + 4); - add_block(pos + 4); - } - } - else if (type == spu_itype::BI && g_cfg.core.spu_block_size != spu_block_size_type::safe && !op.d && !op.e && !sync) - { - // Analyse jump table (TODO) - std::basic_string jt_abs; - std::basic_string jt_rel; - const u32 start = pos + 4; - u64 dabs = 0; - u64 drel = 0; - - for (u32 i = start; i < limit; i += 4) - { - const u32 target = ls[i / 4]; - - if (target == 0 || target % 4) - { - // Address cannot be misaligned: abort - break; - } - - if (target >= lsa && target < 0x40000) - { - // Possible jump table entry (absolute) - jt_abs.push_back(target); - } - - if (target + start >= lsa && target + start < 0x40000) - { - // Possible jump table entry (relative) - jt_rel.push_back(target + start); - } - - if (std::max(jt_abs.size(), jt_rel.size()) * 4 + start <= i) - { - // Neither type of jump table completes - jt_abs.clear(); - jt_rel.clear(); - break; - } - } - - // Choose position after the jt as an anchor and compute the average distance - for (u32 target : jt_abs) - { - dabs += std::abs(static_cast(target - start - jt_abs.size() * 4)); - } - - for (u32 target : jt_rel) - { - drel += std::abs(static_cast(target - start - jt_rel.size() * 4)); - } - - // Add detected jump table blocks - if (jt_abs.size() >= 3 || jt_rel.size() >= 3) - { - if (jt_abs.size() == jt_rel.size()) - { - if (dabs < drel) - { - jt_rel.clear(); - } - - if (dabs > drel) - { - jt_abs.clear(); - } - - ensure(jt_abs.size() != jt_rel.size()); - } - - if (jt_abs.size() >= jt_rel.size()) - { - const u32 new_size = (start - lsa) / 4 + ::size32(jt_abs); - - if (result.data.size() < new_size) - { - result.data.resize(new_size); - } - - for (u32 i = 0; i < jt_abs.size(); i++) - { - add_block(jt_abs[i]); - result.data[(start - lsa) / 4 + i] = std::bit_cast>(jt_abs[i]); - m_targets[start + i * 4]; - } - - m_targets.emplace(pos, std::move(jt_abs)); - } - - if (jt_rel.size() >= jt_abs.size()) - { - const u32 new_size = (start - lsa) / 4 + ::size32(jt_rel); - - if (result.data.size() < new_size) - { - result.data.resize(new_size); - } - - for (u32 i = 0; i < jt_rel.size(); i++) - { - add_block(jt_rel[i]); - result.data[(start - lsa) / 4 + i] = std::bit_cast>(jt_rel[i] - start); - m_targets[start + i * 4]; - } - - m_targets.emplace(pos, std::move(jt_rel)); - } - } - else if (start + 12 * 4 < limit && - ls[start / 4 + 0] == 0x1ce00408u && - ls[start / 4 + 1] == 0x24000389u && - ls[start / 4 + 2] == 0x24004809u && - ls[start / 4 + 3] == 0x24008809u && - ls[start / 4 + 4] == 0x2400c809u && - ls[start / 4 + 5] == 0x24010809u && - ls[start / 4 + 6] == 0x24014809u && - ls[start / 4 + 7] == 0x24018809u && - ls[start / 4 + 8] == 0x1c200807u && - ls[start / 4 + 9] == 0x2401c809u) - { - spu_log.warning("[0x%x] Pattern 1 detected (hbr=0x%x:0x%x)", pos, hbr_loc, hbr_tg); - - // Add 8 targets (TODO) - for (u32 addr = start + 4; addr < start + 36; addr += 4) - { - m_targets[pos].push_back(addr); - add_block(addr); - } - } - else if (hbr_loc > start && hbr_loc < limit && hbr_tg == start) - { - spu_log.warning("[0x%x] No patterns detected (hbr=0x%x:0x%x)", pos, hbr_loc, hbr_tg); - } - } - else if (type == spu_itype::BI && sync) - { - spu_log.notice("[0x%x] At 0x%x: ignoring indirect branch (SYNC)", entry_point, pos); - } - - if (type == spu_itype::BI || sl) - { - if (type == spu_itype::BI || g_cfg.core.spu_block_size == spu_block_size_type::safe) - { - m_targets[pos]; - } - else - { - m_ret_info[pos / 4 + 1] = true; - m_entry_info[pos / 4 + 1] = true; - m_targets[pos].push_back(pos + 4); - add_block(pos + 4); - } - } - else - { - m_targets[pos].push_back(pos + 4); - add_block(pos + 4); - } - - next_block(); - break; - } - - case spu_itype::BRSL: - case spu_itype::BRASL: - { - const u32 target = spu_branch_target(type == spu_itype::BRASL ? 0 : pos, op.i16); - - m_regmod[pos / 4] = op.rt; - vflags[op.rt] = +vf::is_const; - values[op.rt] = pos + 4; - - if (type == spu_itype::BRSL && target == pos + 4) - { - // Get next instruction address idiom - break; - } - - m_targets[pos].push_back(target); - - if (g_cfg.core.spu_block_size != spu_block_size_type::safe) - { - m_ret_info[pos / 4 + 1] = true; - m_entry_info[pos / 4 + 1] = true; - m_targets[pos].push_back(pos + 4); - add_block(pos + 4); - } - - if (g_cfg.core.spu_block_size == spu_block_size_type::giga && !sync) - { - m_entry_info[target / 4] = true; - add_block(target); - } - else - { - if (g_cfg.core.spu_block_size == spu_block_size_type::giga) - { - spu_log.notice("[0x%x] At 0x%x: ignoring fixed call to 0x%x (SYNC)", entry_point, pos, target); - } - - if (target > entry_point) - { - limit = std::min(limit, target); - } - } - - next_block(); - break; - } - - case spu_itype::BRA: - { - const u32 target = spu_branch_target(0, op.i16); - - if (g_cfg.core.spu_block_size == spu_block_size_type::giga && !sync) - { - m_entry_info[target / 4] = true; - add_block(target); - } - else - { - if (g_cfg.core.spu_block_size == spu_block_size_type::giga) - { - spu_log.notice("[0x%x] At 0x%x: ignoring fixed tail call to 0x%x (SYNC)", entry_point, pos, target); - } - - if (target > entry_point) - { - limit = std::min(limit, target); - } - } - - next_block(); - break; - } - - case spu_itype::BR: - case spu_itype::BRZ: - case spu_itype::BRNZ: - case spu_itype::BRHZ: - case spu_itype::BRHNZ: - { - const u32 target = spu_branch_target(pos, op.i16); - - if (target == pos + 4) - { - // Nop - break; - } - - m_targets[pos].push_back(target); - add_block(target); - - if (type != spu_itype::BR) - { - m_targets[pos].push_back(pos + 4); - add_block(pos + 4); - } - - next_block(); - break; - } - - case spu_itype::DSYNC: - case spu_itype::HEQ: - case spu_itype::HEQI: - case spu_itype::HGT: - case spu_itype::HGTI: - case spu_itype::HLGT: - case spu_itype::HLGTI: - case spu_itype::LNOP: - case spu_itype::NOP: - case spu_itype::MTSPR: - case spu_itype::FSCRWR: - case spu_itype::STQA: - case spu_itype::STQD: - case spu_itype::STQR: - case spu_itype::STQX: - { - // Do nothing - break; - } - - case spu_itype::WRCH: - { - switch (op.ra) - { - case MFC_EAL: - { - m_regmod[pos / 4] = s_reg_mfc_eal; - break; - } - case MFC_LSA: - { - m_regmod[pos / 4] = s_reg_mfc_lsa; - break; - } - case MFC_TagID: - { - m_regmod[pos / 4] = s_reg_mfc_tag; - break; - } - case MFC_Size: - { - m_regmod[pos / 4] = s_reg_mfc_size; - break; - } - case MFC_Cmd: - { - m_use_rb[pos / 4] = s_reg_mfc_eal; - break; - } - default: break; - } - - break; - } - - case spu_itype::LQA: - case spu_itype::LQD: - case spu_itype::LQR: - case spu_itype::LQX: - { - // Unconst - m_regmod[pos / 4] = op.rt; - vflags[op.rt] = {}; - break; - } - - case spu_itype::HBR: - { - hbr_loc = spu_branch_target(pos, op.roh << 7 | op.rt); - hbr_tg = vflags[op.ra] & vf::is_const && !op.c ? values[op.ra] & 0x3fffc : -1; - break; - } - - case spu_itype::HBRA: - { - hbr_loc = spu_branch_target(pos, op.r0h << 7 | op.rt); - hbr_tg = spu_branch_target(0x0, op.i16); - break; - } - - case spu_itype::HBRR: - { - hbr_loc = spu_branch_target(pos, op.r0h << 7 | op.rt); - hbr_tg = spu_branch_target(pos, op.i16); - break; - } - - case spu_itype::IL: - { - m_regmod[pos / 4] = op.rt; - vflags[op.rt] = +vf::is_const; - values[op.rt] = op.si16; - break; - } - case spu_itype::ILA: - { - m_regmod[pos / 4] = op.rt; - vflags[op.rt] = +vf::is_const; - values[op.rt] = op.i18; - break; - } - case spu_itype::ILH: - { - m_regmod[pos / 4] = op.rt; - vflags[op.rt] = +vf::is_const; - values[op.rt] = op.i16 << 16 | op.i16; - break; - } - case spu_itype::ILHU: - { - m_regmod[pos / 4] = op.rt; - vflags[op.rt] = +vf::is_const; - values[op.rt] = op.i16 << 16; - break; - } - case spu_itype::IOHL: - { - m_regmod[pos / 4] = op.rt; - values[op.rt] = values[op.rt] | op.i16; - break; - } - case spu_itype::ORI: - { - m_regmod[pos / 4] = op.rt; - vflags[op.rt] = vflags[op.ra] & vf::is_const; - values[op.rt] = values[op.ra] | op.si10; - break; - } - case spu_itype::OR: - { - m_regmod[pos / 4] = op.rt; - vflags[op.rt] = vflags[op.ra] & vflags[op.rb] & vf::is_const; - values[op.rt] = values[op.ra] | values[op.rb]; - break; - } - case spu_itype::ANDI: - { - m_regmod[pos / 4] = op.rt; - vflags[op.rt] = vflags[op.ra] & vf::is_const; - values[op.rt] = values[op.ra] & op.si10; - break; - } - case spu_itype::AND: - { - m_regmod[pos / 4] = op.rt; - vflags[op.rt] = vflags[op.ra] & vflags[op.rb] & vf::is_const; - values[op.rt] = values[op.ra] & values[op.rb]; - break; - } - case spu_itype::AI: - { - m_regmod[pos / 4] = op.rt; - vflags[op.rt] = vflags[op.ra] & vf::is_const; - values[op.rt] = values[op.ra] + op.si10; - break; - } - case spu_itype::A: - { - m_regmod[pos / 4] = op.rt; - vflags[op.rt] = vflags[op.ra] & vflags[op.rb] & vf::is_const; - values[op.rt] = values[op.ra] + values[op.rb]; - break; - } - case spu_itype::SFI: - { - m_regmod[pos / 4] = op.rt; - vflags[op.rt] = vflags[op.ra] & vf::is_const; - values[op.rt] = op.si10 - values[op.ra]; - break; - } - case spu_itype::SF: - { - m_regmod[pos / 4] = op.rt; - vflags[op.rt] = vflags[op.ra] & vflags[op.rb] & vf::is_const; - values[op.rt] = values[op.rb] - values[op.ra]; - break; - } - case spu_itype::ROTMI: - { - m_regmod[pos / 4] = op.rt; - - if ((0 - op.i7) & 0x20) - { - vflags[op.rt] = +vf::is_const; - values[op.rt] = 0; - break; - } - - vflags[op.rt] = vflags[op.ra] & vf::is_const; - values[op.rt] = values[op.ra] >> ((0 - op.i7) & 0x1f); - break; - } - case spu_itype::SHLI: - { - m_regmod[pos / 4] = op.rt; - - if (op.i7 & 0x20) - { - vflags[op.rt] = +vf::is_const; - values[op.rt] = 0; - break; - } - - vflags[op.rt] = vflags[op.ra] & vf::is_const; - values[op.rt] = values[op.ra] << (op.i7 & 0x1f); - break; - } - - default: - { - // Unconst - const u32 op_rt = type & spu_itype::_quadrop ? +op.rt4 : +op.rt; - m_regmod[pos / 4] = op_rt; - vflags[op_rt] = {}; - break; - } - } - - // Insert raw instruction value - const u32 new_size = (pos - lsa) / 4; - - if (result.data.size() <= new_size) - { - if (result.data.size() < new_size) - { - result.data.resize(new_size); - } - - result.data.emplace_back(std::bit_cast>(data)); - } - else if (u32& raw_val = result.data[new_size]) - { - ensure(raw_val == std::bit_cast>(data)); - } - else - { - raw_val = std::bit_cast>(data); - } - } - - while (lsa > 0 || limit < 0x40000) - { - const u32 initial_size = ::size32(result.data); - - // Check unreachable blocks - limit = std::min(limit, lsa + initial_size * 4); - - for (auto& pair : m_preds) - { - bool reachable = false; - - if (pair.first >= limit) - { - continue; - } - - // All (direct and indirect) predecessors to check - std::basic_string workload; - - // Bit array used to deduplicate workload list - workload.push_back(pair.first); - m_bits[pair.first / 4] = true; - - for (usz i = 0; !reachable && i < workload.size(); i++) - { - for (u32 j = workload[i];; j -= 4) - { - // Go backward from an address until the entry point is reached - if (j == entry_point) - { - reachable = true; - break; - } - - const auto found = m_preds.find(j); - - bool had_fallthrough = false; - - if (found != m_preds.end()) - { - for (u32 new_pred : found->second) - { - // Check whether the predecessor is previous instruction - if (new_pred == j - 4) - { - had_fallthrough = true; - continue; - } - - // Check whether in range and not already added - if (new_pred >= lsa && new_pred < limit && !m_bits[new_pred / 4]) - { - workload.push_back(new_pred); - m_bits[new_pred / 4] = true; - } - } - } - - // Check for possible fallthrough predecessor - if (!had_fallthrough) - { - if (::at32(result.data, (j - lsa) / 4 - 1) == 0 || m_targets.count(j - 4)) - { - break; - } - } - - if (i == 0) - { - // TODO - } - } - } - - for (u32 pred : workload) - { - m_bits[pred / 4] = false; - } - - if (!reachable && pair.first < limit) - { - limit = pair.first; - } - } - - result.data.resize((limit - lsa) / 4); - - // Check holes in safe mode (TODO) - u32 valid_size = 0; - - for (u32 i = 0; i < result.data.size(); i++) - { - if (result.data[i] == 0) - { - const u32 pos = lsa + i * 4; - const u32 data = ls[pos / 4]; - - // Allow only NOP or LNOP instructions in holes - if (data == 0x200000 || (data & 0xffffff80) == 0x40200000) - { - continue; - } - - if (g_cfg.core.spu_block_size != spu_block_size_type::giga) - { - result.data.resize(valid_size); - break; - } - } - else - { - valid_size = i + 1; - } - } - - // Even if NOP or LNOP, should be removed at the end - result.data.resize(valid_size); - - // Repeat if blocks were removed - if (result.data.size() == initial_size) - { - break; - } - } - - limit = std::min(limit, lsa + ::size32(result.data) * 4); - - // Cleanup block info - for (u32 i = 0; i < workload.size(); i++) - { - const u32 addr = workload[i]; - - if (addr < lsa || addr >= limit || !result.data[(addr - lsa) / 4]) - { - m_block_info[addr / 4] = false; - m_entry_info[addr / 4] = false; - m_ret_info[addr / 4] = false; - m_preds.erase(addr); - } - } - - // Complete m_preds and associated m_targets for adjacent blocks - for (auto it = m_preds.begin(); it != m_preds.end();) - { - if (it->first < lsa || it->first >= limit) - { - it = m_preds.erase(it); - continue; - } - - // Erase impossible predecessors - const auto new_end = std::remove_if(it->second.begin(), it->second.end(), [&](u32 addr) - { - return addr < lsa || addr >= limit; - }); - - it->second.erase(new_end, it->second.end()); - - // Don't add fallthrough target if all predecessors are removed - if (it->second.empty() && !m_entry_info[it->first / 4]) - { - // If not an entry point, remove the block completely - m_block_info[it->first / 4] = false; - it = m_preds.erase(it); - continue; - } - - // Previous instruction address - const u32 prev = (it->first - 4) & 0x3fffc; - - // TODO: check the correctness - if (m_targets.count(prev) == 0 && prev >= lsa && prev < limit && result.data[(prev - lsa) / 4]) - { - // Add target and the predecessor - m_targets[prev].push_back(it->first); - it->second.push_back(prev); - } - - it++; - } - - if (out_target_list) - { - out_target_list->insert(m_targets.begin(), m_targets.end()); - } - - // Remove unnecessary target lists - for (auto it = m_targets.begin(); it != m_targets.end();) - { - if (it->first < lsa || it->first >= limit) - { - it = m_targets.erase(it); - continue; - } - - it++; - } - - // Fill holes which contain only NOP and LNOP instructions (TODO: compile) - for (u32 i = 0, nnop = 0, vsize = 0; i <= result.data.size(); i++) - { - if (i >= result.data.size() || result.data[i]) - { - if (nnop && nnop == i - vsize) - { - // Write only complete NOP sequence - for (u32 j = vsize; j < i; j++) - { - result.data[j] = std::bit_cast>(ls[lsa / 4 + j]); - } - } - - nnop = 0; - vsize = i + 1; - } - else - { - const u32 pos = lsa + i * 4; - const u32 data = ls[pos / 4]; - - if (data == 0x200000 || (data & 0xffffff80) == 0x40200000) - { - nnop++; - } - } - } - - // Fill block info - for (auto& pred : m_preds) - { - auto& block = m_bbs[pred.first]; - - // Copy predeccessors (wrong at this point, needs a fixup later) - block.preds = pred.second; - - // Fill register usage info - for (u32 ia = pred.first; ia < limit; ia += 4) - { - block.size++; - - // Decode instruction - const spu_opcode_t op{std::bit_cast>(result.data[(ia - lsa) / 4])}; - - const auto type = g_spu_itype.decode(op.opcode); - - u8 reg_save = 255; - - if (type == spu_itype::STQD && op.ra == s_reg_sp && !block.reg_mod[op.rt] && !block.reg_use[op.rt]) - { - // Register saved onto the stack before use - block.reg_save_dom[op.rt] = true; - - reg_save = op.rt; - } - - for (auto* _use : {&m_use_ra, &m_use_rb, &m_use_rc}) - { - if (u8 reg = (*_use)[ia / 4]; reg < s_reg_max) - { - // Register reg use only if it happens before reg mod - if (!block.reg_mod[reg]) - { - block.reg_use.set(reg); - - if (reg_save != reg && block.reg_save_dom[reg]) - { - // Register is still used after saving; probably not eligible for optimization - block.reg_save_dom[reg] = false; - } - } - } - } - - if (m_use_rb[ia / 4] == s_reg_mfc_eal) - { - // Expand MFC_Cmd reg use - for (u8 reg : {s_reg_mfc_lsa, s_reg_mfc_tag, s_reg_mfc_size}) - { - if (!block.reg_mod[reg]) - block.reg_use.set(reg); - } - } - - // Register reg modification - if (u8 reg = m_regmod[ia / 4]; reg < s_reg_max) - { - block.reg_mod.set(reg); - block.reg_mod_xf.set(reg, type & spu_itype::xfloat); - - if (type == spu_itype::SELB && (block.reg_mod_xf[op.ra] || block.reg_mod_xf[op.rb])) - block.reg_mod_xf.set(reg); - - // Possible post-dominating register load - if (type == spu_itype::LQD && op.ra == s_reg_sp) - block.reg_load_mod[reg] = ia + 1; - else - block.reg_load_mod[reg] = 0; - } - - // Find targets (also means end of the block) - const auto tfound = m_targets.find(ia); - - if (tfound != m_targets.end()) - { - // Copy targets - block.targets = tfound->second; - - // Assume that the call reads and modifies all volatile registers (TODO) - bool is_call = false; - bool is_tail = false; - switch (type) - { - case spu_itype::BRSL: - is_call = spu_branch_target(ia, op.i16) != ia + 4; - break; - case spu_itype::BRASL: - is_call = spu_branch_target(0, op.i16) != ia + 4; - break; - case spu_itype::BRA: - is_call = true; - is_tail = true; - break; - case spu_itype::BISL: - case spu_itype::BISLED: - is_call = true; - break; - default: - break; - } - - if (is_call) - { - for (u32 i = 0; i < s_reg_max; ++i) - { - if (i == s_reg_lr || (i >= 2 && i < s_reg_80) || i > s_reg_127) - { - if (!block.reg_mod[i]) - block.reg_use.set(i); - - if (!is_tail) - { - block.reg_mod.set(i); - block.reg_mod_xf[i] = false; - } - } - } - } - - break; - } - } - } - - // Fixup block predeccessors to point to basic blocks, not last instructions - for (auto& bb : m_bbs) - { - const u32 addr = bb.first; - - for (u32& pred : bb.second.preds) - { - pred = std::prev(m_bbs.upper_bound(pred))->first; - } - - if (m_entry_info[addr / 4] && g_cfg.core.spu_block_size == spu_block_size_type::giga) - { - // Register empty chunk - m_chunks.push_back(addr); - - // Register function if necessary - if (!m_ret_info[addr / 4]) - { - m_funcs[addr]; - } - } - } - - // Ensure there is a function at the lowest address - if (g_cfg.core.spu_block_size == spu_block_size_type::giga) - { - if (auto emp = m_funcs.try_emplace(m_bbs.begin()->first); emp.second) - { - const u32 addr = emp.first->first; - spu_log.error("[0x%05x] Fixed first function at 0x%05x", entry_point, addr); - m_entry_info[addr / 4] = true; - m_ret_info[addr / 4] = false; - } - } - - // Split functions - while (g_cfg.core.spu_block_size == spu_block_size_type::giga) - { - bool need_repeat = false; - - u32 start = 0; - u32 limit = 0x40000; - - // Walk block list in ascending order - for (auto& block : m_bbs) - { - const u32 addr = block.first; - - if (m_entry_info[addr / 4] && !m_ret_info[addr / 4]) - { - const auto upper = m_funcs.upper_bound(addr); - start = addr; - limit = upper == m_funcs.end() ? 0x40000 : upper->first; - } - - // Find targets that exceed [start; limit) range and make new functions from them - for (u32 target : block.second.targets) - { - const auto tfound = m_bbs.find(target); - - if (tfound == m_bbs.end()) - { - continue; - } - - if (target < start || target >= limit) - { - if (!m_entry_info[target / 4] || m_ret_info[target / 4]) - { - // Create new function entry (likely a tail call) - m_entry_info[target / 4] = true; - - m_ret_info[target / 4] = false; - - m_funcs.try_emplace(target); - - if (target < limit) - { - need_repeat = true; - } - } - } - } - - block.second.func = start; - } - - if (!need_repeat) - { - break; - } - } - - // Fill entry map - while (true) - { - workload.clear(); - workload.push_back(entry_point); - ensure(m_bbs.count(entry_point)); - - std::basic_string new_entries; - - for (u32 wi = 0; wi < workload.size(); wi++) - { - const u32 addr = workload[wi]; - auto& block = ::at32(m_bbs, addr); - const u32 _new = block.chunk; - - if (!m_entry_info[addr / 4]) - { - // Check block predecessors - for (u32 pred : block.preds) - { - const u32 _old = ::at32(m_bbs, pred).chunk; - - if (_old < 0x40000 && _old != _new) - { - // If block has multiple 'entry' points, it becomes an entry point itself - new_entries.push_back(addr); - } - } - } - - // Update chunk address - block.chunk = m_entry_info[addr / 4] ? addr : _new; - - // Process block targets - for (u32 target : block.targets) - { - const auto tfound = m_bbs.find(target); - - if (tfound == m_bbs.end()) - { - continue; - } - - auto& tb = tfound->second; - - const u32 value = m_entry_info[target / 4] ? target : block.chunk; - - if (u32& tval = tb.chunk; tval < 0x40000) - { - // TODO: fix condition - if (tval != value && !m_entry_info[target / 4]) - { - new_entries.push_back(target); - } - } - else - { - tval = value; - workload.emplace_back(target); - } - } - } - - if (new_entries.empty()) - { - break; - } - - for (u32 entry : new_entries) - { - m_entry_info[entry / 4] = true; - - // Acknowledge artificial (reversible) chunk entry point - m_ret_info[entry / 4] = true; - } - - for (auto& bb : m_bbs) - { - // Reset chunk info - bb.second.chunk = 0x40000; - } - } - - workload.clear(); - workload.push_back(entry_point); - - // Fill workload adding targets - for (u32 wi = 0; wi < workload.size(); wi++) - { - const u32 addr = workload[wi]; - auto& block = ::at32(m_bbs, addr); - block.analysed = true; - - for (u32 target : block.targets) - { - const auto tfound = m_bbs.find(target); - - if (tfound == m_bbs.end()) - { - continue; - } - - auto& tb = tfound->second; - - if (!tb.analysed) - { - workload.push_back(target); - tb.analysed = true; - } - - // Limited xfloat hint propagation (possibly TODO) - if (tb.chunk == block.chunk) - { - tb.reg_maybe_xf &= block.reg_mod_xf; - } - else - { - tb.reg_maybe_xf.reset(); - } - } - - block.reg_origin.fill(0x80000000); - block.reg_origin_abs.fill(0x80000000); - } - - // Fill register origin info - while (true) - { - bool must_repeat = false; - - for (u32 wi = 0; wi < workload.size(); wi++) - { - const u32 addr = workload[wi]; - auto& block = ::at32(m_bbs, addr); - - // Initialize entry point with default value: unknown origin (requires load) - if (m_entry_info[addr / 4]) - { - for (u32 i = 0; i < s_reg_max; i++) - { - if (block.reg_origin[i] == 0x80000000) - block.reg_origin[i] = 0x40000; - } - } - - if (g_cfg.core.spu_block_size == spu_block_size_type::giga && m_entry_info[addr / 4] && !m_ret_info[addr / 4]) - { - for (u32 i = 0; i < s_reg_max; i++) - { - if (block.reg_origin_abs[i] == 0x80000000) - block.reg_origin_abs[i] = 0x40000; - else if (block.reg_origin_abs[i] + 1 == 0) - block.reg_origin_abs[i] = -2; - } - } - - for (u32 target : block.targets) - { - const auto tfound = m_bbs.find(target); - - if (tfound == m_bbs.end()) - { - continue; - } - - auto& tb = tfound->second; - - for (u32 i = 0; i < s_reg_max; i++) - { - if (tb.chunk == block.chunk && tb.reg_origin[i] + 1) - { - const u32 expected = block.reg_mod[i] ? addr : block.reg_origin[i]; - - if (tb.reg_origin[i] == 0x80000000) - { - tb.reg_origin[i] = expected; - } - else if (tb.reg_origin[i] != expected) - { - // Set -1 if multiple origins merged (requires PHI node) - tb.reg_origin[i] = -1; - - must_repeat |= !tb.targets.empty(); - } - } - - if (g_cfg.core.spu_block_size == spu_block_size_type::giga && tb.func == block.func && tb.reg_origin_abs[i] + 2) - { - const u32 expected = block.reg_mod[i] ? addr : block.reg_origin_abs[i]; - - if (tb.reg_origin_abs[i] == 0x80000000) - { - tb.reg_origin_abs[i] = expected; - } - else if (tb.reg_origin_abs[i] != expected) - { - if (tb.reg_origin_abs[i] == 0x40000 || expected + 2 == 0 || expected == 0x40000) - { - // Set -2: sticky value indicating possible external reg origin (0x40000) - tb.reg_origin_abs[i] = -2; - - must_repeat |= !tb.targets.empty(); - } - else if (tb.reg_origin_abs[i] + 1) - { - tb.reg_origin_abs[i] = -1; - - must_repeat |= !tb.targets.empty(); - } - } - } - } - } - } - - if (!must_repeat) - { - break; - } - - for (u32 wi = 0; wi < workload.size(); wi++) - { - const u32 addr = workload[wi]; - auto& block = ::at32(m_bbs, addr); - - // Reset values for the next attempt (keep negative values) - for (u32 i = 0; i < s_reg_max; i++) - { - if (block.reg_origin[i] <= 0x40000) - block.reg_origin[i] = 0x80000000; - if (block.reg_origin_abs[i] <= 0x40000) - block.reg_origin_abs[i] = 0x80000000; - } - } - } - - // Fill more block info - for (u32 wi = 0; wi < workload.size(); wi++) - { - if (g_cfg.core.spu_block_size != spu_block_size_type::giga) - { - break; - } - - const u32 addr = workload[wi]; - auto& bb = ::at32(m_bbs, addr); - auto& func = ::at32(m_funcs, bb.func); - - // Update function size - func.size = std::max(func.size, bb.size + (addr - bb.func) / 4); - - // Copy constants according to reg origin info - for (u32 i = 0; i < s_reg_max; i++) - { - const u32 orig = bb.reg_origin_abs[i]; - - if (orig < 0x40000) - { - auto& src = ::at32(m_bbs, orig); - bb.reg_const[i] = src.reg_const[i]; - bb.reg_val32[i] = src.reg_val32[i]; - } - - if (!bb.reg_save_dom[i] && bb.reg_use[i] && (orig == 0x40000 || orig + 2 == 0)) - { - // Destroy offset if external reg value is used - func.reg_save_off[i] = -1; - } - } - - if (u32 orig = bb.reg_origin_abs[s_reg_sp]; orig < 0x40000) - { - auto& prologue = ::at32(m_bbs, orig); - - // Copy stack offset (from the assumed prologue) - bb.stack_sub = prologue.stack_sub; - } - else if (orig > 0x40000) - { - // Unpredictable stack - bb.stack_sub = 0x80000000; - } - - spu_opcode_t op{}; - - auto last_inst = spu_itype::UNK; - - for (u32 ia = addr; ia < addr + bb.size * 4; ia += 4) - { - // Decode instruction again - op.opcode = std::bit_cast>(result.data[(ia - lsa) / 41]); - last_inst = g_spu_itype.decode(op.opcode); - - // Propagate some constants - switch (last_inst) - { - case spu_itype::IL: - { - bb.reg_const[op.rt] = true; - bb.reg_val32[op.rt] = op.si16; - break; - } - case spu_itype::ILA: - { - bb.reg_const[op.rt] = true; - bb.reg_val32[op.rt] = op.i18; - break; - } - case spu_itype::ILHU: - { - bb.reg_const[op.rt] = true; - bb.reg_val32[op.rt] = op.i16 << 16; - break; - } - case spu_itype::ILH: - { - bb.reg_const[op.rt] = true; - bb.reg_val32[op.rt] = op.i16 << 16 | op.i16; - break; - } - case spu_itype::IOHL: - { - bb.reg_val32[op.rt] = bb.reg_val32[op.rt] | op.i16; - break; - } - case spu_itype::ORI: - { - bb.reg_const[op.rt] = bb.reg_const[op.ra]; - bb.reg_val32[op.rt] = bb.reg_val32[op.ra] | op.si10; - break; - } - case spu_itype::OR: - { - bb.reg_const[op.rt] = bb.reg_const[op.ra] && bb.reg_const[op.rb]; - bb.reg_val32[op.rt] = bb.reg_val32[op.ra] | bb.reg_val32[op.rb]; - break; - } - case spu_itype::AI: - { - bb.reg_const[op.rt] = bb.reg_const[op.ra]; - bb.reg_val32[op.rt] = bb.reg_val32[op.ra] + op.si10; - break; - } - case spu_itype::A: - { - bb.reg_const[op.rt] = bb.reg_const[op.ra] && bb.reg_const[op.rb]; - bb.reg_val32[op.rt] = bb.reg_val32[op.ra] + bb.reg_val32[op.rb]; - break; - } - case spu_itype::SFI: - { - bb.reg_const[op.rt] = bb.reg_const[op.ra]; - bb.reg_val32[op.rt] = op.si10 - bb.reg_val32[op.ra]; - break; - } - case spu_itype::SF: - { - bb.reg_const[op.rt] = bb.reg_const[op.ra] && bb.reg_const[op.rb]; - bb.reg_val32[op.rt] = bb.reg_val32[op.rb] - bb.reg_val32[op.ra]; - break; - } - case spu_itype::STQD: - { - if (op.ra == s_reg_sp && bb.stack_sub != 0x80000000 && bb.reg_save_dom[op.rt]) - { - const u32 offset = 0x80000000 + op.si10 * 16 - bb.stack_sub; - - if (func.reg_save_off[op.rt] == 0) - { - // Store reg save offset - func.reg_save_off[op.rt] = offset; - } - else if (func.reg_save_off[op.rt] != offset) - { - // Conflict of different offsets - func.reg_save_off[op.rt] = -1; - } - } - - break; - } - case spu_itype::LQD: - { - if (op.ra == s_reg_sp && bb.stack_sub != 0x80000000 && bb.reg_load_mod[op.rt] == ia + 1) - { - // Adjust reg load offset - bb.reg_load_mod[op.rt] = 0x80000000 + op.si10 * 16 - bb.stack_sub; - } - - // Clear const - bb.reg_const[op.rt] = false; - break; - } - default: - { - // Clear const if reg is modified here - if (u8 reg = m_regmod[ia / 4]; reg < s_reg_max) - bb.reg_const[reg] = false; - break; - } - } - - // $SP is modified - if (m_regmod[ia / 4] == s_reg_sp) - { - if (bb.reg_const[s_reg_sp]) - { - // Making $SP a constant is a funny thing too. - bb.stack_sub = 0x80000000; - } - - if (bb.stack_sub != 0x80000000) - { - switch (last_inst) - { - case spu_itype::AI: - { - if (op.ra == s_reg_sp) - bb.stack_sub -= op.si10; - else - bb.stack_sub = 0x80000000; - break; - } - case spu_itype::A: - { - if (op.ra == s_reg_sp && bb.reg_const[op.rb]) - bb.stack_sub -= bb.reg_val32[op.rb]; - else if (op.rb == s_reg_sp && bb.reg_const[op.ra]) - bb.stack_sub -= bb.reg_val32[op.ra]; - else - bb.stack_sub = 0x80000000; - break; - } - case spu_itype::SF: - { - if (op.rb == s_reg_sp && bb.reg_const[op.ra]) - bb.stack_sub += bb.reg_val32[op.ra]; - else - bb.stack_sub = 0x80000000; - break; - } - default: - { - bb.stack_sub = 0x80000000; - break; - } - } - } - - // Check for funny values. - if (bb.stack_sub >= 0x40000 || bb.stack_sub % 16) - { - bb.stack_sub = 0x80000000; - } - } - } - - // Analyse terminator instruction - const u32 tia = addr + bb.size * 4 - 4; - - switch (last_inst) - { - case spu_itype::BR: - case spu_itype::BRNZ: - case spu_itype::BRZ: - case spu_itype::BRHNZ: - case spu_itype::BRHZ: - case spu_itype::BRSL: - { - const u32 target = spu_branch_target(tia, op.i16); - - if (target == tia + 4) - { - bb.terminator = term_type::fallthrough; - } - else if (last_inst != spu_itype::BRSL) - { - // No-op terminator or simple branch instruction - bb.terminator = term_type::br; - - if (target == bb.func) - { - // Recursive tail call - bb.terminator = term_type::ret; - } - } - else if (op.rt == s_reg_lr) - { - bb.terminator = term_type::call; - } - else - { - bb.terminator = term_type::interrupt_call; - } - - break; - } - case spu_itype::BRA: - case spu_itype::BRASL: - { - bb.terminator = term_type::indirect_call; - break; - } - case spu_itype::BI: - { - if (op.d || op.e || bb.targets.size() == 1) - { - bb.terminator = term_type::interrupt_call; - } - else if (bb.targets.size() > 1) - { - // Jump table - bb.terminator = term_type::br; - } - else if (op.ra == s_reg_lr) - { - // Return (TODO) - bb.terminator = term_type::ret; - } - else - { - // Indirect tail call (TODO) - bb.terminator = term_type::interrupt_call; - } - - break; - } - case spu_itype::BISLED: - case spu_itype::IRET: - { - bb.terminator = term_type::interrupt_call; - break; - } - case spu_itype::BISL: - case spu_itype::BIZ: - case spu_itype::BINZ: - case spu_itype::BIHZ: - case spu_itype::BIHNZ: - { - if (op.d || op.e || bb.targets.size() != 1) - { - bb.terminator = term_type::interrupt_call; - } - else if (last_inst != spu_itype::BISL && bb.targets[0] == tia + 4 && op.ra == s_reg_lr) - { - // Conditional return (TODO) - bb.terminator = term_type::ret; - } - else if (last_inst == spu_itype::BISL) - { - // Indirect call - bb.terminator = term_type::indirect_call; - } - else - { - // TODO - bb.terminator = term_type::interrupt_call; - } - - break; - } - default: - { - // Normal instruction - bb.terminator = term_type::fallthrough; - break; - } - } - } - - // Check function blocks, verify and print some reasons - for (auto& f : m_funcs) - { - if (g_cfg.core.spu_block_size != spu_block_size_type::giga) - { - break; - } - - bool is_ok = true; - - u32 used_stack = 0; - - for (auto it = m_bbs.lower_bound(f.first); it != m_bbs.end() && it->second.func == f.first; ++it) - { - auto& bb = it->second; - auto& func = ::at32(m_funcs, bb.func); - const u32 addr = it->first; - const u32 flim = bb.func + func.size * 4; - - used_stack |= bb.stack_sub; - - if (is_ok && bb.terminator >= term_type::indirect_call) - { - is_ok = false; - } - - if (is_ok && bb.terminator == term_type::ret) - { - // Check $LR (alternative return registers are currently not supported) - if (u32 lr_orig = bb.reg_mod[s_reg_lr] ? addr : bb.reg_origin_abs[s_reg_lr]; lr_orig < 0x40000) - { - auto& src = ::at32(m_bbs, lr_orig); - - if (src.reg_load_mod[s_reg_lr] != func.reg_save_off[s_reg_lr]) - { - spu_log.error("Function 0x%05x: [0x%05x] $LR mismatch (src=0x%x; 0x%x vs 0x%x)", f.first, addr, lr_orig, src.reg_load_mod[0], func.reg_save_off[0]); - is_ok = false; - } - else if (src.reg_load_mod[s_reg_lr] == 0) - { - spu_log.error("Function 0x%05x: [0x%05x] $LR modified (src=0x%x)", f.first, addr, lr_orig); - is_ok = false; - } - } - else if (lr_orig > 0x40000) - { - spu_log.todo("Function 0x%05x: [0x%05x] $LR unpredictable (src=0x%x)", f.first, addr, lr_orig); - is_ok = false; - } - - // Check $80..$127 (should be restored or unmodified) - for (u32 i = s_reg_80; is_ok && i <= s_reg_127; i++) - { - if (u32 orig = bb.reg_mod[i] ? addr : bb.reg_origin_abs[i]; orig < 0x40000) - { - auto& src = ::at32(m_bbs, orig); - - if (src.reg_load_mod[i] != func.reg_save_off[i]) - { - spu_log.error("Function 0x%05x: [0x%05x] $%u mismatch (src=0x%x; 0x%x vs 0x%x)", f.first, addr, i, orig, src.reg_load_mod[i], func.reg_save_off[i]); - is_ok = false; - } - } - else if (orig > 0x40000) - { - spu_log.todo("Function 0x%05x: [0x%05x] $%u unpredictable (src=0x%x)", f.first, addr, i, orig); - is_ok = false; - } - - if (func.reg_save_off[i] + 1 == 0) - { - spu_log.error("Function 0x%05x: [0x%05x] $%u used incorrectly", f.first, addr, i); - is_ok = false; - } - } - - // Check $SP (should be restored or unmodified) - if (bb.stack_sub != 0 && bb.stack_sub != 0x80000000) - { - spu_log.error("Function 0x%05x: [0x%05x] return with stack frame 0x%x", f.first, addr, bb.stack_sub); - is_ok = false; - } - } - - if (is_ok && bb.terminator == term_type::call) - { - // Check call instruction (TODO) - if (bb.stack_sub == 0) - { - // Call without a stack frame - spu_log.error("Function 0x%05x: [0x%05x] frameless call", f.first, addr); - is_ok = false; - } - } - - if (is_ok && bb.terminator == term_type::fallthrough) - { - // Can't just fall out of the function - if (bb.targets.size() != 1 || bb.targets[0] >= flim) - { - spu_log.error("Function 0x%05x: [0x%05x] bad fallthrough to 0x%x", f.first, addr, bb.targets[0]); - is_ok = false; - } - } - - if (is_ok && bb.stack_sub == 0x80000000) - { - spu_log.error("Function 0x%05x: [0x%05x] bad stack frame", f.first, addr); - is_ok = false; - } - - // Fill external function targets (calls, possibly tail calls) - for (u32 target : bb.targets) - { - if (target < bb.func || target >= flim || (bb.terminator == term_type::call && target == bb.func)) - { - if (func.calls.find_first_of(target) + 1 == 0) - { - func.calls.push_back(target); - } - } - } - } - - if (is_ok && used_stack && f.first == entry_point) - { - spu_log.error("Function 0x%05x: considered possible chunk", f.first); - is_ok = false; - } - - // if (is_ok && f.first > 0x1d240 && f.first < 0x1e000) - // { - // spu_log.error("Function 0x%05x: manually disabled", f.first); - // is_ok = false; - // } - - f.second.good = is_ok; - } - - // Check function call graph - while (g_cfg.core.spu_block_size == spu_block_size_type::giga) - { - bool need_repeat = false; - - for (auto& f : m_funcs) - { - if (!f.second.good) - { - continue; - } - - for (u32 call : f.second.calls) - { - const auto ffound = std::as_const(m_funcs).find(call); - - if (ffound == m_funcs.cend() || ffound->second.good == false) - { - need_repeat = true; - - if (f.second.good) - { - spu_log.error("Function 0x%05x: calls bad function (0x%05x)", f.first, call); - f.second.good = false; - } - } - } - } - - if (!need_repeat) - { - break; - } - } - - if (result.data.empty()) - { - // Blocks starting from 0x0 or invalid instruction won't be compiled, may need special interpreter fallback - } - - return result; -} - -void spu_recompiler_base::dump(const spu_program& result, std::string& out) -{ - SPUDisAsm dis_asm(cpu_disasm_mode::dump, reinterpret_cast(result.data.data()), result.lower_bound); - - std::string hash; - { - sha1_context ctx; - u8 output[20]; - - sha1_starts(&ctx); - sha1_update(&ctx, reinterpret_cast(result.data.data()), result.data.size() * 4); - sha1_finish(&ctx, output); - fmt::append(hash, "%s", fmt::base57(output)); - } - - fmt::append(out, "========== SPU BLOCK 0x%05x (size %u, %s) ==========\n\n", result.entry_point, result.data.size(), hash); - - for (auto& bb : m_bbs) - { - for (u32 pos = bb.first, end = bb.first + bb.second.size * 4; pos < end; pos += 4) - { - dis_asm.disasm(pos); - - if (!dis_asm.last_opcode.ends_with('\n')) - { - dis_asm.last_opcode += '\n'; - } - - fmt::append(out, ">%s", dis_asm.last_opcode); - } - - out += '\n'; - - if (m_block_info[bb.first / 4]) - { - fmt::append(out, "A: [0x%05x] %s\n", bb.first, m_entry_info[bb.first / 4] ? (m_ret_info[bb.first / 4] ? "Chunk" : "Entry") : "Block"); - - fmt::append(out, "\tF: 0x%05x\n", bb.second.func); - - for (u32 pred : bb.second.preds) - { - fmt::append(out, "\t<- 0x%05x\n", pred); - } - - for (u32 target : bb.second.targets) - { - fmt::append(out, "\t-> 0x%05x%s\n", target, m_bbs.count(target) ? "" : " (null)"); - } - } - else - { - fmt::append(out, "A: [0x%05x] ?\n", bb.first); - } - - out += '\n'; - } - - for (auto& f : m_funcs) - { - fmt::append(out, "F: [0x%05x]%s\n", f.first, f.second.good ? " (good)" : " (bad)"); - - fmt::append(out, "\tN: 0x%05x\n", f.second.size * 4 + f.first); - - for (u32 call : f.second.calls) - { - fmt::append(out, "\t>> 0x%05x%s\n", call, m_funcs.count(call) ? "" : " (null)"); - } - } - - out += '\n'; -} - #ifdef LLVM_AVAILABLE #include "Emu/CPU/CPUTranslator.h" @@ -4531,7 +157,8 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator bool does_gpr_barrier_preceed_first_store(u32 i) const noexcept { const usz counter = store_context_ctr[i]; - return counter != 1 && counter < store_context_first_id[i]; + const usz first_id = store_context_first_id[i]; + return counter != 1 && first_id != umax && counter < first_id; } }; @@ -11790,620 +7417,3 @@ std::unique_ptr spu_recompiler_base::make_llvm_recompiler(u } #endif // LLVM_AVAILABLE - -struct spu_llvm_worker -{ - lf_queue> registered; - - void operator()() - { - // SPU LLVM Recompiler instance - const auto compiler = spu_recompiler_base::make_llvm_recompiler(); - compiler->init(); - - // Fake LS - std::vector> ls(0x10000); - - for (auto slice = registered.pop_all();; [&] - { - if (slice) - { - slice.pop_front(); - } - - if (slice || thread_ctrl::state() == thread_state::aborting) - { - return; - } - - thread_ctrl::wait_on(utils::bless>(®istered)[1], 0); - slice = registered.pop_all(); - }()) - { - auto* prog = slice.get(); - - if (thread_ctrl::state() == thread_state::aborting) - { - break; - } - - if (!prog) - { - continue; - } - - if (!prog->second) - { - break; - } - - const auto& func = *prog->second; - - // Get data start - const u32 start = func.lower_bound; - const u32 size0 = ::size32(func.data); - - // Initialize LS with function data only - for (u32 i = 0, pos = start; i < size0; i++, pos += 4) - { - ls[pos / 4] = std::bit_cast>(func.data[i]); - } - - // Call analyser - spu_program func2 = compiler->analyse(ls.data(), func.entry_point); - - if (func2 != func) - { - spu_log.error("[0x%05x] SPU Analyser failed, %u vs %u", func2.entry_point, func2.data.size(), size0); - } - else if (const auto target = compiler->compile(std::move(func2))) - { - // Redirect old function (TODO: patch in multiple places) - const s64 rel = reinterpret_cast(target) - prog->first - 5; - - union - { - u8 bytes[8]; - u64 result; - }; - - bytes[0] = 0xe9; // jmp rel32 - std::memcpy(bytes + 1, &rel, 4); - bytes[5] = 0x90; - bytes[6] = 0x90; - bytes[7] = 0x90; - - atomic_storage::release(*reinterpret_cast(prog->first), result); - } - else - { - spu_log.fatal("[0x%05x] Compilation failed.", func.entry_point); - return; - } - - // Clear fake LS - std::memset(ls.data() + start / 4, 0, 4 * (size0 - 1)); - } - } -}; - -// SPU LLVM recompiler thread context -struct spu_llvm -{ - // Workload - lf_queue> registered; - atomic_ptr> m_workers; - - spu_llvm() - { - // Dependency - g_fxo->init(); - } - - void operator()() - { - if (g_cfg.core.spu_decoder != spu_decoder_type::llvm) - { - return; - } - - // To compile (hash -> item) - std::unordered_multimap> enqueued; - - // Mini-profiler (hash -> number of occurrences) - std::unordered_map, value_hash> samples; - - // For synchronization with profiler thread - stx::init_mutex prof_mutex; - - named_thread profiler("SPU LLVM Profiler"sv, [&]() - { - while (thread_ctrl::state() != thread_state::aborting) - { - { - // Lock if enabled - const auto lock = prof_mutex.access(); - - if (!lock) - { - // Wait when the profiler is disabled - prof_mutex.wait_for_initialized(); - continue; - } - - // Collect profiling samples - idm::select>([&](u32 /*id*/, spu_thread& spu) - { - const u64 name = atomic_storage::load(spu.block_hash); - - if (auto state = +spu.state; !::is_paused(state) && !::is_stopped(state) && cpu_flag::wait - state) - { - const auto found = std::as_const(samples).find(name); - - if (found != std::as_const(samples).end()) - { - const_cast&>(found->second)++; - } - } - }); - } - - // Sleep for a short period if enabled - thread_ctrl::wait_for(20, false); - } - }); - - u32 worker_count = 1; - - if (uint hc = utils::get_thread_count(); hc >= 12) - { - worker_count = hc - 10; - } - - u32 worker_index = 0; - - m_workers = make_single>("SPUW.", worker_count); - auto workers_ptr = m_workers.load(); - auto& workers = *workers_ptr; - - while (thread_ctrl::state() != thread_state::aborting) - { - for (const auto& pair : registered.pop_all()) - { - enqueued.emplace(pair); - - // Interrupt and kick profiler thread - const auto lock = prof_mutex.init_always([&]{}); - - // Register new blocks to collect samples - samples.emplace(pair.first, 0); - } - - if (enqueued.empty()) - { - // Interrupt profiler thread and put it to sleep - static_cast(prof_mutex.reset()); - thread_ctrl::wait_on(utils::bless>(®istered)[1], 0); - continue; - } - - // Find the most used enqueued item - u64 sample_max = 0; - auto found_it = enqueued.begin(); - - for (auto it = enqueued.begin(), end = enqueued.end(); it != end; ++it) - { - const u64 cur = ::at32(std::as_const(samples), it->first); - - if (cur > sample_max) - { - sample_max = cur; - found_it = it; - } - } - - // Start compiling - const spu_program& func = found_it->second->data; - - // Old function pointer (pre-recompiled) - const spu_function_t _old = found_it->second->compiled; - - // Remove item from the queue - enqueued.erase(found_it); - - // Push the workload - (workers.begin() + (worker_index++ % worker_count))->registered.push(reinterpret_cast(_old), &func); - } - - static_cast(prof_mutex.init_always([&]{ samples.clear(); })); - - m_workers.reset(); - - for (u32 i = 0; i < worker_count; i++) - { - (workers.begin() + i)->operator=(thread_state::aborting); - } - } - - spu_llvm& operator=(thread_state) - { - if (const auto workers = m_workers.load()) - { - for (u32 i = 0; i < workers->size(); i++) - { - (workers->begin() + i)->operator=(thread_state::aborting); - } - } - - return *this; - } - - static constexpr auto thread_name = "SPU LLVM"sv; -}; - -using spu_llvm_thread = named_thread; - -struct spu_fast : public spu_recompiler_base -{ - virtual void init() override - { - if (!m_spurt) - { - m_spurt = &g_fxo->get(); - } - } - - virtual spu_function_t compile(spu_program&& _func) override - { - const auto add_loc = m_spurt->add_empty(std::move(_func)); - - if (!add_loc) - { - return nullptr; - } - - if (add_loc->compiled) - { - return add_loc->compiled; - } - - const spu_program& func = add_loc->data; - - if (g_cfg.core.spu_debug && !add_loc->logged.exchange(1)) - { - std::string log; - this->dump(func, log); - fs::write_file(m_spurt->get_cache_path() + "spu.log", fs::create + fs::write + fs::append, log); - } - - // Allocate executable area with necessary size - const auto result = jit_runtime::alloc(22 + 1 + 9 + ::size32(func.data) * (16 + 16) + 36 + 47, 16); - - if (!result) - { - return nullptr; - } - - m_pos = func.lower_bound; - m_size = ::size32(func.data) * 4; - - { - sha1_context ctx; - u8 output[20]; - - sha1_starts(&ctx); - sha1_update(&ctx, reinterpret_cast(func.data.data()), func.data.size() * 4); - sha1_finish(&ctx, output); - - be_t hash_start; - std::memcpy(&hash_start, output, sizeof(hash_start)); - m_hash_start = hash_start; - } - - u8* raw = result; - - // 8-byte intruction for patching (long NOP) - *raw++ = 0x0f; - *raw++ = 0x1f; - *raw++ = 0x84; - *raw++ = 0; - *raw++ = 0; - *raw++ = 0; - *raw++ = 0; - *raw++ = 0; - - // mov rax, m_hash_start - *raw++ = 0x48; - *raw++ = 0xb8; - std::memcpy(raw, &m_hash_start, sizeof(m_hash_start)); - raw += 8; - - // Update block_hash: mov [r13 + spu_thread::m_block_hash], rax - *raw++ = 0x49; - *raw++ = 0x89; - *raw++ = 0x45; - *raw++ = ::narrow(::offset32(&spu_thread::block_hash)); - - // Load PC: mov eax, [r13 + spu_thread::pc] - *raw++ = 0x41; - *raw++ = 0x8b; - *raw++ = 0x45; - *raw++ = ::narrow(::offset32(&spu_thread::pc)); - - // Get LS address starting from PC: lea rcx, [rbp + rax] - *raw++ = 0x48; - *raw++ = 0x8d; - *raw++ = 0x4c; - *raw++ = 0x05; - *raw++ = 0x00; - - // Verification (slow) - for (u32 i = 0; i < func.data.size(); i++) - { - if (!func.data[i]) - { - continue; - } - - // cmp dword ptr [rcx + off], opc - *raw++ = 0x81; - *raw++ = 0xb9; - const u32 off = i * 4; - const u32 opc = func.data[i]; - std::memcpy(raw + 0, &off, 4); - std::memcpy(raw + 4, &opc, 4); - raw += 8; - - // jne tr_dispatch - const s64 rel = reinterpret_cast(spu_runtime::tr_dispatch) - reinterpret_cast(raw) - 6; - *raw++ = 0x0f; - *raw++ = 0x85; - std::memcpy(raw + 0, &rel, 4); - raw += 4; - } - - // trap - //*raw++ = 0xcc; - - // Secondary prologue: sub rsp,0x28 - *raw++ = 0x48; - *raw++ = 0x83; - *raw++ = 0xec; - *raw++ = 0x28; - - // Fix args: xchg r13,rbp - *raw++ = 0x49; - *raw++ = 0x87; - *raw++ = 0xed; - - // mov r12d, eax - *raw++ = 0x41; - *raw++ = 0x89; - *raw++ = 0xc4; - - // mov esi, 0x7f0 - *raw++ = 0xbe; - *raw++ = 0xf0; - *raw++ = 0x07; - *raw++ = 0x00; - *raw++ = 0x00; - - // lea rdi, [rbp + spu_thread::gpr] - *raw++ = 0x48; - *raw++ = 0x8d; - *raw++ = 0x7d; - *raw++ = ::narrow(::offset32(&spu_thread::gpr)); - - // Save base pc: mov [rbp + spu_thread::base_pc], eax - *raw++ = 0x89; - *raw++ = 0x45; - *raw++ = ::narrow(::offset32(&spu_thread::base_pc)); - - // inc block_counter - *raw++ = 0x48; - *raw++ = 0xff; - *raw++ = 0x85; - const u32 blc_off = ::offset32(&spu_thread::block_counter); - std::memcpy(raw, &blc_off, 4); - raw += 4; - - // lea r14, [local epilogue] - *raw++ = 0x4c; - *raw++ = 0x8d; - *raw++ = 0x35; - const u32 epi_off = ::size32(func.data) * 16; - std::memcpy(raw, &epi_off, 4); - raw += 4; - - // Instructions (each instruction occupies fixed number of bytes) - for (u32 i = 0; i < func.data.size(); i++) - { - const u32 pos = m_pos + i * 4; - - if (!func.data[i]) - { - // Save pc: mov [rbp + spu_thread::pc], r12d - *raw++ = 0x44; - *raw++ = 0x89; - *raw++ = 0x65; - *raw++ = ::narrow(::offset32(&spu_thread::pc)); - - // Epilogue: add rsp,0x28 - *raw++ = 0x48; - *raw++ = 0x83; - *raw++ = 0xc4; - *raw++ = 0x28; - - // ret (TODO) - *raw++ = 0xc3; - std::memset(raw, 0xcc, 16 - 9); - raw += 16 - 9; - continue; - } - - // Fix endianness - const spu_opcode_t op{std::bit_cast>(func.data[i])}; - - switch (auto type = g_spu_itype.decode(op.opcode)) - { - case spu_itype::BRZ: - case spu_itype::BRHZ: - case spu_itype::BRNZ: - case spu_itype::BRHNZ: - { - const u32 target = spu_branch_target(pos, op.i16); - - if (0 && target >= m_pos && target < m_pos + m_size) - { - *raw++ = type == spu_itype::BRHZ || type == spu_itype::BRHNZ ? 0x66 : 0x90; - *raw++ = 0x83; - *raw++ = 0xbd; - const u32 off = ::offset32(&spu_thread::gpr, op.rt) + 12; - std::memcpy(raw, &off, 4); - raw += 4; - *raw++ = 0x00; - - *raw++ = 0x0f; - *raw++ = type == spu_itype::BRZ || type == spu_itype::BRHZ ? 0x84 : 0x85; - const u32 dif = (target - (pos + 4)) / 4 * 16 + 2; - std::memcpy(raw, &dif, 4); - raw += 4; - - *raw++ = 0x66; - *raw++ = 0x90; - break; - } - - [[fallthrough]]; - } - default: - { - // Ballast: mov r15d, pos - *raw++ = 0x41; - *raw++ = 0xbf; - std::memcpy(raw, &pos, 4); - raw += 4; - - // mov ebx, opc - *raw++ = 0xbb; - std::memcpy(raw, &op, 4); - raw += 4; - - // call spu_* (specially built interpreter function) - const s64 rel = spu_runtime::g_interpreter_table[static_cast(type)] - reinterpret_cast(raw) - 5; - *raw++ = 0xe8; - std::memcpy(raw, &rel, 4); - raw += 4; - break; - } - } - } - - // Local dispatcher/epilogue: fix stack after branch instruction, then dispatch or return - - // add rsp, 8 - *raw++ = 0x48; - *raw++ = 0x83; - *raw++ = 0xc4; - *raw++ = 0x08; - - // and rsp, -16 - *raw++ = 0x48; - *raw++ = 0x83; - *raw++ = 0xe4; - *raw++ = 0xf0; - - // lea rax, [r12 - size] - *raw++ = 0x49; - *raw++ = 0x8d; - *raw++ = 0x84; - *raw++ = 0x24; - const u32 msz = 0u - m_size; - std::memcpy(raw, &msz, 4); - raw += 4; - - // sub eax, [rbp + spu_thread::base_pc] - *raw++ = 0x2b; - *raw++ = 0x45; - *raw++ = ::narrow(::offset32(&spu_thread::base_pc)); - - // cmp eax, (0 - size) - *raw++ = 0x3d; - std::memcpy(raw, &msz, 4); - raw += 4; - - // jb epilogue - *raw++ = 0x72; - *raw++ = +12; - - // movsxd rax, eax - *raw++ = 0x48; - *raw++ = 0x63; - *raw++ = 0xc0; - - // shl rax, 2 - *raw++ = 0x48; - *raw++ = 0xc1; - *raw++ = 0xe0; - *raw++ = 0x02; - - // add rax, r14 - *raw++ = 0x4c; - *raw++ = 0x01; - *raw++ = 0xf0; - - // jmp rax - *raw++ = 0xff; - *raw++ = 0xe0; - - // Save pc: mov [rbp + spu_thread::pc], r12d - *raw++ = 0x44; - *raw++ = 0x89; - *raw++ = 0x65; - *raw++ = ::narrow(::offset32(&spu_thread::pc)); - - // Epilogue: add rsp,0x28 ; ret - *raw++ = 0x48; - *raw++ = 0x83; - *raw++ = 0xc4; - *raw++ = 0x28; - *raw++ = 0xc3; - - const auto fn = reinterpret_cast(result); - - // Install pointer carefully - const bool added = !add_loc->compiled && add_loc->compiled.compare_and_swap_test(nullptr, fn); - - // Check hash against allowed bounds - const bool inverse_bounds = g_cfg.core.spu_llvm_lower_bound > g_cfg.core.spu_llvm_upper_bound; - - if ((!inverse_bounds && (m_hash_start < g_cfg.core.spu_llvm_lower_bound || m_hash_start > g_cfg.core.spu_llvm_upper_bound)) || - (inverse_bounds && (m_hash_start < g_cfg.core.spu_llvm_lower_bound && m_hash_start > g_cfg.core.spu_llvm_upper_bound))) - { - spu_log.error("[Debug] Skipped function %s", fmt::base57(be_t{m_hash_start})); - } - else if (added) - { - // Send work to LLVM compiler thread - g_fxo->get().registered.push(m_hash_start, add_loc); - } - - // Rebuild trampoline if necessary - if (!m_spurt->rebuild_ubertrampoline(func.data[0])) - { - return nullptr; - } - - if (added) - { - add_loc->compiled.notify_all(); - } - - return fn; - } -}; - -std::unique_ptr spu_recompiler_base::make_fast_llvm_recompiler() -{ - return std::make_unique(); -} diff --git a/rpcs3/emucore.vcxproj b/rpcs3/emucore.vcxproj index d63acdb83c..b1e1edadeb 100644 --- a/rpcs3/emucore.vcxproj +++ b/rpcs3/emucore.vcxproj @@ -157,7 +157,10 @@ NotUsing - + + NotUsing + + NotUsing @@ -458,7 +461,8 @@ - + + diff --git a/rpcs3/emucore.vcxproj.filters b/rpcs3/emucore.vcxproj.filters index e1522df0c7..9480ccdc25 100644 --- a/rpcs3/emucore.vcxproj.filters +++ b/rpcs3/emucore.vcxproj.filters @@ -186,7 +186,10 @@ Utilities - + + Emu\Cell + + Emu\Cell @@ -726,7 +729,10 @@ Utilities - + + Utilities + + Utilities