diff --git a/Utilities/JIT.cpp b/Utilities/JIT.cpp
index a84189bf33..3422cf529d 100644
--- a/Utilities/JIT.cpp
+++ b/Utilities/JIT.cpp
@@ -95,6 +95,12 @@ static void* const s_memory = []() -> void*
 	return utils::memory_reserve(s_memory_size);
 }();
 
+// Reserve 2G of memory, should replace previous area for ASLR compatibility
+static void* const s_memory2 = utils::memory_reserve(0x80000000);
+
+static u64 s_code_pos = 0;
+static u64 s_data_pos = 0;
+
 static void* s_next = s_memory;
 
 #ifdef _WIN32
@@ -129,6 +135,11 @@ extern void jit_finalize()
 
 	utils::memory_decommit(s_memory, s_memory_size);
 	s_next = s_memory;
+
+	utils::memory_decommit(s_memory2, 0x80000000);
+
+	s_code_pos = 0;
+	s_data_pos = 0;
 }
 
 // Helper class
@@ -311,24 +322,25 @@ struct MemoryManager : llvm::RTDyldMemoryManager
 // Simple memory manager
 struct MemoryManager2 : llvm::RTDyldMemoryManager
 {
-	// Reserve 2 GiB
-	void* const m_memory = utils::memory_reserve(0x80000000);
+	// Patchwork again...
+	void* const m_memory = s_memory2;
 
 	u8* const m_code = static_cast<u8*>(m_memory) + 0x00000000;
 	u8* const m_data = static_cast<u8*>(m_memory) + 0x40000000;
 
-	u64 m_code_pos = 0;
-	u64 m_data_pos = 0;
+	u64& m_code_pos = s_code_pos;
+	u64& m_data_pos = s_data_pos;
 
 	MemoryManager2() = default;
 
 	~MemoryManager2() override
 	{
-		utils::memory_release(m_memory, 0x80000000);
 	}
 
 	u8* allocateCodeSection(std::uintptr_t size, uint align, uint sec_id, llvm::StringRef sec_name) override
 	{
+		std::lock_guard lock(s_mutex);
+
 		// Simple allocation
 		const u64 old = m_code_pos;
 		const u64 pos = ::align(m_code_pos, align);
@@ -349,12 +361,20 @@ struct MemoryManager2 : llvm::RTDyldMemoryManager
 			utils::memory_commit(m_code + olda, newa - olda, utils::protection::wx);
 		}
 
+		if (!sec_id && sec_name.empty())
+		{
+			// Special case: don't log
+			return m_code + pos;
+		}
+
 		LOG_NOTICE(GENERAL, "LLVM: Code section %u '%s' allocated -> %p (size=0x%x, align=0x%x)", sec_id, sec_name.data(), m_code + pos, size, align);
 		return m_code + pos;
 	}
 
 	u8* allocateDataSection(std::uintptr_t size, uint align, uint sec_id, llvm::StringRef sec_name, bool is_ro) override
 	{
+		std::lock_guard lock(s_mutex);
+
 		// Simple allocation
 		const u64 old = m_data_pos;
 		const u64 pos = ::align(m_data_pos, align);
@@ -642,33 +662,12 @@ u64 jit_compiler::get(const std::string& name)
 	return m_engine->getGlobalValueAddress(name);
 }
 
-std::unordered_map<std::string, u64> jit_compiler::add(std::unordered_map<std::string, std::string> data)
+u8* jit_compiler::alloc(u32 size)
 {
-	// Lock memory manager
-	std::lock_guard lock(s_mutex);
+	// Dummy memory manager object
+	MemoryManager2 mm;
 
-	std::unordered_map<std::string, u64> result;
-
-	std::size_t size = 0;
-
-	for (auto&& pair : data)
-	{
-		size += ::align(pair.second.size(), 16);
-	}
-
-	utils::memory_commit(s_next, size, utils::protection::wx);
-	std::memset(s_next, 0xc3, ::align(size, 4096));
-
-	for (auto&& pair : data)
-	{
-		std::memcpy(s_next, pair.second.data(), pair.second.size());
-		result.emplace(pair.first, (u64)s_next);
-		s_next = (void*)::align((u64)s_next + pair.second.size(), 16);
-	}
-
-	s_next = (void*)::align((u64)s_next, 4096);
-
-	return result;
+	return mm.allocateCodeSection(size, 16, 0, {});
 }
 
 #endif
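The section allocator above is now a mutex-guarded bump allocator over one globally reserved arena; every MemoryManager2 instance advances the same s_code_pos/s_data_pos cursors, which is what lets jit_compiler::alloc hand out executable memory from a throwaway manager object. A minimal self-contained sketch of that pattern, with reserve_arena/commit_wx as illustrative stand-ins for utils::memory_reserve/utils::memory_commit (not the emulator's real helpers):

```cpp
#include <cstdint>
#include <cstdlib>
#include <mutex>

// Stand-ins: a real implementation would mmap/VirtualAlloc with wx protection
static std::uint8_t* reserve_arena(std::size_t size) { return static_cast<std::uint8_t*>(std::malloc(size)); }
static void commit_wx(std::uint8_t*, std::size_t) {}

static std::mutex g_mutex;
static std::uint8_t* const g_arena = reserve_arena(0x1000000); // 16 MiB demo arena
static std::uint64_t g_code_pos = 0;

// Bump allocation: the cursor only grows; nothing is freed piecemeal,
// the whole arena is decommitted at once (cf. jit_finalize above)
std::uint8_t* alloc_code(std::uint64_t size, std::uint64_t align)
{
	std::lock_guard lock(g_mutex);
	const std::uint64_t pos = (g_code_pos + align - 1) & ~(align - 1);
	g_code_pos = pos + size;
	commit_wx(g_arena + pos, size);
	return g_arena + pos;
}
```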
diff --git a/Utilities/JIT.h b/Utilities/JIT.h
index 6c3c792718..575016cdf5 100644
--- a/Utilities/JIT.h
+++ b/Utilities/JIT.h
@@ -61,6 +61,7 @@ FT build_function_asm(F&& builder)
 
 #include
 #include
+#include
 #include
 
 #include "types.h"
@@ -129,8 +130,8 @@ public:
 	// Get compiled function address
 	u64 get(const std::string& name);
 
-	// Add functions directly to the memory manager (name -> code)
-	static std::unordered_map<std::string, u64> add(std::unordered_map<std::string, std::string>);
+	// Allocate writable executable memory (alignment is assumed 16)
+	static u8* alloc(u32 size);
 
 	// Get CPU info
 	static std::string cpu(const std::string& _cpu);
diff --git a/Utilities/cond.cpp b/Utilities/cond.cpp
index f6d819333d..c263990227 100644
--- a/Utilities/cond.cpp
+++ b/Utilities/cond.cpp
@@ -10,7 +10,7 @@
 
 bool cond_variable::imp_wait(u32 _old, u64 _timeout) noexcept
 {
-	verify("cond_variable overflow" HERE), (_old & 0xffff) == 0; // Very unlikely: it requires 65535 distinct threads to wait simultaneously
+	verify("cond_variable overflow" HERE), (_old & 0xffff) != 0xffff; // Very unlikely: it requires 65535 distinct threads to wait simultaneously
 
 	return balanced_wait_until(m_value, _timeout, [&](u32& value, auto... ret) -> int
 	{
@@ -42,7 +42,8 @@ bool cond_variable::imp_wait(u32 _old, u64 _timeout) noexcept
 
 void cond_variable::imp_wake(u32 _count) noexcept
 {
-	balanced_awaken(m_value, m_value.atomic_op([&](u32& value) -> u32
+	// TODO (notify_one)
+	balanced_awaken<true>(m_value, m_value.atomic_op([&](u32& value) -> u32
 	{
 		// Subtract already signaled number from total amount of waiters
 		const u32 can_sig = (value & 0xffff) - (value >> 16);
@@ -266,7 +267,7 @@ void cond_x16::imp_notify() noexcept
 		return;
 	}
 
-	balanced_awaken(m_cvx16, utils::popcnt16(wait_mask));
+	balanced_awaken<true>(m_cvx16, utils::popcnt16(wait_mask));
 }
 
 bool lf_queue_base::wait(u64 _timeout)
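The packed-counter arithmetic behind these checks can be shown in isolation. A small model assuming the layout visible in imp_wait/imp_wake — low 16 bits count waiters, high 16 bits count already-signaled threads:

```cpp
#include <cassert>
#include <cstdint>

constexpr std::uint32_t waiters(std::uint32_t v)  { return v & 0xffff; }
constexpr std::uint32_t signaled(std::uint32_t v) { return v >> 16; }

// The rewritten overflow check: one more waiter fits as long as the
// counter is not already saturated at 65535
constexpr bool can_wait(std::uint32_t v) { return waiters(v) != 0xffff; }

int main()
{
	assert(can_wait(0));           // no waiters yet
	assert(!can_wait(0x0000ffff)); // counter saturated

	// imp_wake signals only the threads not already signaled:
	const std::uint32_t v = 0x00020005; // 5 waiters, 2 signaled
	assert(waiters(v) - signaled(v) == 3);
}
```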
diff --git a/Utilities/sync.h b/Utilities/sync.h
index 95a7e37226..859dc2b845 100644
--- a/Utilities/sync.h
+++ b/Utilities/sync.h
@@ -186,7 +186,7 @@ bool balanced_wait_until(atomic_t<T>& var, u64 usec_timeout, Pred&& pred)
 	{
 		if (OptWaitOnAddress(&var, &value, sizeof(T), is_inf ? INFINITE : usec_timeout / 1000))
 		{
-			if (!test_pred(value) && !test_pred(value, nullptr))
+			if (!test_pred(value, nullptr))
 			{
 				return false;
 			}
@@ -220,7 +220,7 @@ bool balanced_wait_until(atomic_t<T>& var, u64 usec_timeout, Pred&& pred)
 			return true;
 		}
 
-		if (!test_pred(value) && !test_pred(value, nullptr))
+		if (!test_pred(value, nullptr))
 		{
 			// Stolen notification: restore balance
 			NtReleaseKeyedEvent(nullptr, &var, false, nullptr);
@@ -237,7 +237,7 @@ bool balanced_wait_until(atomic_t<T>& var, u64 usec_timeout, Pred&& pred)
 	{
 		if (futex(&var, FUTEX_WAIT_PRIVATE, static_cast<u32>(value), is_inf ? nullptr : &timeout) == 0)
 		{
-			if (!test_pred(value) && !test_pred(value, nullptr))
+			if (!test_pred(value, nullptr))
 			{
 				return false;
 			}
@@ -257,7 +257,7 @@ bool balanced_wait_until(atomic_t<T>& var, u64 usec_timeout, Pred&& pred)
 #endif
 }
 
-template <typename T>
+template <bool All = false, typename T>
 void balanced_awaken(atomic_t<T>& var, u32 weight)
 {
 	static_assert(sizeof(T) == 4 || sizeof(T) == 8);
@@ -265,11 +265,13 @@ void balanced_awaken(atomic_t<T>& var, u32 weight)
 #ifdef _WIN32
 	if (OptWaitOnAddress)
 	{
-		if (weight > 1)
+		if (All || weight > 3)
 		{
 			OptWakeByAddressAll(&var);
+			return;
 		}
-		else if (weight == 1)
+
+		for (u32 i = 0; i < weight; i++)
 		{
 			OptWakeByAddressSingle(&var);
 		}
@@ -282,9 +284,9 @@ void balanced_awaken(atomic_t<T>& var, u32 weight)
 		NtReleaseKeyedEvent(nullptr, &var, false, nullptr);
 	}
 #else
-	if (weight)
+	if (All || weight)
 	{
-		futex(&var, FUTEX_WAKE_PRIVATE, std::min<u32>(INT_MAX, weight));
+		futex(&var, FUTEX_WAKE_PRIVATE, All ? INT_MAX : std::min<u32>(INT_MAX, weight));
 	}
 
 	return;
diff --git a/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp b/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp
index 2751fb5fba..49f25cfb25 100644
--- a/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp
+++ b/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp
@@ -32,33 +32,8 @@ std::unique_ptr<spu_recompiler_base> spu_recompiler_base::make_asmjit_recompiler()
 	return std::make_unique<spu_recompiler>();
 }
 
-spu_runtime::spu_runtime()
-{
-	m_cache_path = fxm::check_unlocked<ppu_module>()->cache;
-
-	if (g_cfg.core.spu_debug)
-	{
-		fs::file(m_cache_path + "spu.log", fs::rewrite);
-	}
-
-	LOG_SUCCESS(SPU, "SPU Recompiler Runtime (ASMJIT) initialized...");
-
-	// Initialize lookup table
-	for (auto& v : m_dispatcher)
-	{
-		v.raw() = &spu_recompiler_base::dispatch;
-	}
-
-	// Initialize "empty" block
-	m_map[std::vector<u32>()] = &spu_recompiler_base::dispatch;
-}
-
 spu_recompiler::spu_recompiler()
 {
-	if (!g_cfg.core.spu_shared_runtime)
-	{
-		m_spurt = std::make_shared<spu_runtime>();
-	}
 }
 
 void spu_recompiler::init()
@@ -68,6 +43,7 @@ void spu_recompiler::init()
 	{
 		m_cache = fxm::get<spu_cache>();
 		m_spurt = fxm::get_always<spu_runtime>();
+		m_asmrt = m_spurt->get_asmjit_rt();
 	}
 }
 
@@ -83,19 +59,22 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& func_rv)
 {
 	init();
 
-	// Don't lock without shared runtime
-	std::unique_lock lock(m_spurt->m_mutex, std::defer_lock);
-
-	if (g_cfg.core.spu_shared_runtime)
-	{
-		lock.lock();
-	}
+	std::unique_lock lock(m_spurt->m_mutex);
 
 	// Try to find existing function, register new one if necessary
 	const auto fn_info = m_spurt->m_map.emplace(std::move(func_rv), nullptr);
 
 	auto& fn_location = fn_info.first->second;
 
+	if (!fn_location && !fn_info.second)
+	{
+		// Wait if already in progress
+		while (!fn_location)
+		{
+			m_spurt->m_cond.wait(lock);
+		}
+	}
+
 	if (fn_location)
 	{
 		return fn_location;
@@ -103,6 +82,8 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& func_rv)
 
 	auto& func = fn_info.first->first;
 
+	lock.unlock();
+
 	using namespace asmjit;
 
 	SPUDisAsm dis_asm(CPUDisAsm_InterpreterMode);
@@ -124,7 +105,7 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& func_rv)
 	}
 
 	CodeHolder code;
-	code.init(m_spurt->m_jitrt.getCodeInfo());
+	code.init(m_asmrt->getCodeInfo());
 	code._globalHints = asmjit::CodeEmitter::kHintOptimizedAlign;
 
 	X86Assembler compiler(&code);
@@ -861,14 +842,11 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& func_rv)
 	// Compile and get function address
 	spu_function_t fn;
 
-	if (m_spurt->m_jitrt.add(&fn, &code))
+	if (m_asmrt->add(&fn, &code))
 	{
 		LOG_FATAL(SPU, "Failed to build a function");
 	}
 
-	// Register function
-	fn_location = fn;
-
 	if (g_cfg.core.spu_debug)
 	{
 		// Add ASMJIT logs
@@ -885,6 +863,11 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& func_rv)
 		m_cache->add(func);
 	}
 
+	lock.lock();
+
+	// Register function (possibly temporarily)
+	fn_location = fn;
+
 	// Generate a dispatcher (übertrampoline)
 	std::vector<u32> addrv{func[0]};
 	const auto beg = m_spurt->m_map.lower_bound(addrv);
@@ -899,19 +882,11 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& func_rv)
 	else
 	{
 		CodeHolder code;
-		code.init(m_spurt->m_jitrt.getCodeInfo());
+		code.init(m_asmrt->getCodeInfo());
 
 		X86Assembler compiler(&code);
 		this->c = &compiler;
 
-		if (g_cfg.core.spu_debug)
-		{
-			// Set logger
-			code.setLogger(&logger);
-		}
-
-		compiler.comment("\n\nTrampoline:\n\n");
-
 		struct work
 		{
 			u32 size;
@@ -1110,7 +1085,7 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& func_rv)
 
 		spu_function_t tr;
 
-		if (m_spurt->m_jitrt.add(&tr, &code))
+		if (m_asmrt->add(&tr, &code))
 		{
 			LOG_FATAL(SPU, "Failed to build a trampoline");
 		}
@@ -1118,6 +1093,9 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& func_rv)
 		m_spurt->m_dispatcher[func[0] / 4] = tr;
 	}
 
+	lock.unlock();
+	m_spurt->m_cond.notify_all();
+
 	return fn;
 }
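The locking changes above implement a build-once protocol around m_map: the first thread to emplace a key owns compilation, and later threads sleep on the condition variable until the function pointer is published. A condensed sketch of the same shape using standard primitives (do_compile is a hypothetical placeholder for the actual code generation):

```cpp
#include <condition_variable>
#include <cstdint>
#include <map>
#include <mutex>
#include <vector>

using u32 = std::uint32_t;
using spu_function_t = void(*)();

spu_function_t do_compile(const std::vector<u32>& func); // hypothetical

spu_function_t compile_once(std::map<std::vector<u32>, spu_function_t>& map,
                            std::mutex& mutex, std::condition_variable& cond,
                            std::vector<u32>&& key)
{
	std::unique_lock lock(mutex);
	const auto [it, inserted] = map.emplace(std::move(key), nullptr);

	if (!inserted && !it->second)
	{
		// Entry exists but is still null: another thread is compiling it
		while (!it->second)
			cond.wait(lock);
	}

	if (it->second)
		return it->second;

	lock.unlock();
	const spu_function_t fn = do_compile(it->first); // expensive, runs unlocked
	lock.lock();
	it->second = fn;   // publish (map nodes are stable, the iterator survives)
	lock.unlock();
	cond.notify_all(); // wake every thread waiting for this entry
	return fn;
}
```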
diff --git a/rpcs3/Emu/Cell/SPUASMJITRecompiler.h b/rpcs3/Emu/Cell/SPUASMJITRecompiler.h
index ce43792c19..f8a093d6fe 100644
--- a/rpcs3/Emu/Cell/SPUASMJITRecompiler.h
+++ b/rpcs3/Emu/Cell/SPUASMJITRecompiler.h
@@ -1,33 +1,10 @@
 #pragma once
 
 #include "Utilities/JIT.h"
-#include "Utilities/mutex.h"
 #include "SPURecompiler.h"
 
 #include
 
-// SPU ASMJIT Runtime object (global)
-class spu_runtime
-{
-	shared_mutex m_mutex;
-
-	asmjit::JitRuntime m_jitrt;
-
-	// All functions
-	std::map<std::vector<u32>, spu_function_t> m_map;
-
-	// All dispatchers
-	std::array<atomic_t<spu_function_t>, 0x10000> m_dispatcher;
-
-	// Debug module output location
-	std::string m_cache_path;
-
-	friend class spu_recompiler;
-
-public:
-	spu_runtime();
-};
-
 // SPU ASMJIT Recompiler
 class spu_recompiler : public spu_recompiler_base
 {
@@ -43,6 +20,9 @@ public:
 	virtual spu_function_t compile(std::vector<u32>&&) override;
 
 private:
+	// ASMJIT runtime
+	asmjit::JitRuntime* m_asmrt;
+
 	// emitter:
 	asmjit::X86Assembler* c;
diff --git a/rpcs3/Emu/Cell/SPURecompiler.cpp b/rpcs3/Emu/Cell/SPURecompiler.cpp
index 5fdd988569..acb2da2325 100644
--- a/rpcs3/Emu/Cell/SPURecompiler.cpp
+++ b/rpcs3/Emu/Cell/SPURecompiler.cpp
@@ -24,7 +24,7 @@ const spu_decoder<spu_iname> s_spu_iname;
 
 extern u64 get_timebased_time();
 
 spu_cache::spu_cache(const std::string& loc)
-	: m_file(loc, fs::read + fs::write + fs::create)
+	: m_file(loc, fs::read + fs::write + fs::create + fs::append)
 {
 }
 
@@ -76,18 +76,22 @@ void spu_cache::add(const std::vector<u32>& func)
 		return;
 	}
 
-	be_t<u32> size = ::size32(func) - 1;
-	be_t<u32> addr = func[0];
-	m_file.write(size);
-	m_file.write(addr);
-	m_file.write(func.data() + 1, func.size() * 4 - 4);
+	// Allocate buffer
+	const auto buf = std::make_unique<be_t<u32>[]>(func.size() + 1);
+
+	buf[0] = ::size32(func) - 1;
+	buf[1] = func[0];
+	std::memcpy(buf.get() + 2, func.data() + 1, func.size() * 4 - 4);
+
+	// Append data
+	m_file.write(buf.get(), func.size() * 4 + 4);
 }
 
 void spu_cache::initialize()
 {
 	const std::string ppu_cache = Emu.PPUCache();
 
-	if (ppu_cache.empty() || !g_cfg.core.spu_shared_runtime)
+	if (ppu_cache.empty())
 	{
 		return;
 	}
@@ -105,30 +109,34 @@ void spu_cache::initialize()
 	// Read cache
 	auto func_list = cache->get();
+	atomic_t<std::size_t> fnext{};
 
-	// Recompiler instance for cache initialization
-	std::unique_ptr<spu_recompiler_base> compiler;
+	// Initialize compiler instances for parallel compilation
+	u32 max_threads = static_cast<u32>(g_cfg.core.llvm_threads);
+	u32 thread_count = max_threads > 0 ? std::min(max_threads, std::thread::hardware_concurrency()) : std::thread::hardware_concurrency();
+	std::vector<std::unique_ptr<spu_recompiler_base>> compilers{thread_count};
 
-	if (g_cfg.core.spu_decoder == spu_decoder_type::asmjit)
+	for (auto& compiler : compilers)
 	{
-		compiler = spu_recompiler_base::make_asmjit_recompiler();
-	}
+		if (g_cfg.core.spu_decoder == spu_decoder_type::asmjit)
+		{
+			compiler = spu_recompiler_base::make_asmjit_recompiler();
+		}
+		else if (g_cfg.core.spu_decoder == spu_decoder_type::llvm)
+		{
+			compiler = spu_recompiler_base::make_llvm_recompiler();
+		}
+		else
+		{
+			compilers.clear();
+			break;
+		}
 
-	if (g_cfg.core.spu_decoder == spu_decoder_type::llvm)
-	{
-		compiler = spu_recompiler_base::make_llvm_recompiler();
-	}
-
-	if (compiler)
-	{
 		compiler->init();
 	}
 
-	if (compiler && !func_list.empty())
+	if (compilers.size() && !func_list.empty())
 	{
-		// Fake LS
-		std::vector<be_t<u32>> ls(0x10000);
-
 		// Initialize progress dialog (wait for previous progress done)
 		while (g_progr_ptotal)
 		{
@@ -137,10 +145,20 @@ void spu_cache::initialize()
 		g_progr = "Building SPU cache...";
 		g_progr_ptotal += func_list.size();
+	}
+
+	std::deque<named_thread<std::function<void()>>> thread_queue;
+
+	for (std::size_t i = 0; i < compilers.size(); i++) thread_queue.emplace_back("Worker " + std::to_string(i), [&, compiler = compilers[i].get()]()
+	{
+		// Fake LS
+		std::vector<be_t<u32>> ls(0x10000);
 
 		// Build functions
-		for (auto&& func : func_list)
+		for (std::size_t func_i = fnext++; func_i < func_list.size(); func_i = fnext++)
 		{
+			std::vector<u32>& func = func_list[func_i];
+
 			if (Emu.IsStopped())
 			{
 				g_progr_pdone++;
@@ -185,13 +203,22 @@ void spu_cache::initialize()
 
 			g_progr_pdone++;
 		}
+	});
 
-		if (Emu.IsStopped())
-		{
-			LOG_ERROR(SPU, "SPU Runtime: Cache building aborted.");
-			return;
-		}
+	// Join all threads
+	while (!thread_queue.empty())
+	{
+		thread_queue.pop_front();
+	}
 
+	if (Emu.IsStopped())
+	{
+		LOG_ERROR(SPU, "SPU Runtime: Cache building aborted.");
+		return;
+	}
+
+	if (compilers.size() && !func_list.empty())
+	{
 		LOG_SUCCESS(SPU, "SPU Runtime: Built %u functions.", func_list.size());
 	}
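Work distribution in the cache builder above needs no queue: fnext is an atomic cursor and each worker simply claims the next unprocessed index. The same pattern in miniature, with a plain std::thread pool standing in for named_thread:

```cpp
#include <atomic>
#include <cstddef>
#include <thread>
#include <vector>

int main()
{
	std::vector<int> work(1000, 1);       // stands in for func_list
	std::atomic<std::size_t> next{0};     // stands in for fnext
	int sum[4]{};                         // per-thread result, no sharing

	std::vector<std::thread> pool;
	for (int t = 0; t < 4; t++)
		pool.emplace_back([&, t] {
			// Each fetch-add hands this thread a unique index
			for (std::size_t i = next++; i < work.size(); i = next++)
				sum[t] += work[i]; // stands in for compiler->compile(...)
		});

	for (auto& th : pool)
		th.join();
}
```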
@@ -202,6 +229,317 @@ void spu_cache::initialize()
 	});
 }
 
+spu_runtime::spu_runtime()
+{
+	// Initialize lookup table
+	for (auto& v : m_dispatcher)
+	{
+		v.raw() = &spu_recompiler_base::dispatch;
+	}
+
+	// Initialize "empty" block
+	m_map[std::vector<u32>()] = &spu_recompiler_base::dispatch;
+
+	// Clear LLVM output
+	m_cache_path = Emu.PPUCache();
+	fs::create_dir(m_cache_path + "llvm/");
+	fs::remove_all(m_cache_path + "llvm/", false);
+
+	if (g_cfg.core.spu_debug)
+	{
+		fs::file(m_cache_path + "spu.log", fs::rewrite);
+	}
+
+	LOG_SUCCESS(SPU, "SPU Recompiler Runtime initialized...");
+}
+
+asmjit::JitRuntime* spu_runtime::get_asmjit_rt()
+{
+	std::lock_guard lock(m_mutex);
+
+	m_asmjit_rts.emplace_back(std::make_unique<asmjit::JitRuntime>());
+
+	return m_asmjit_rts.back().get();
+}
+
+void spu_runtime::add(std::pair<const std::vector<u32>, spu_function_t>& where, spu_function_t compiled)
+{
+	std::unique_lock lock(m_mutex);
+
+	// Function info
+	const std::vector<u32>& func = where.first;
+
+	// Base LS address used in comparisons (0 in Giga mode)
+	const u32 start = func[0] * (g_cfg.core.spu_block_size != spu_block_size_type::giga);
+
+	// Set pointer to the compiled function
+	where.second = compiled;
+
+	// Generate a dispatcher (übertrampoline)
+	std::vector<u32> addrv{func[0]};
+	const auto beg = m_map.lower_bound(addrv);
+	addrv[0] += 4;
+	const auto _end = m_map.lower_bound(addrv);
+	const u32 size0 = std::distance(beg, _end);
+
+	if (size0 == 1)
+	{
+		m_dispatcher[func[0] / 4] = compiled;
+	}
+	else
+	{
+		// Allocate some writable executable memory
+#ifdef LLVM_AVAILABLE
+		const auto wxptr = jit_compiler::alloc(size0 * 20);
+#else
+		u8* const wxptr = new u8[size0 * 20]; // dummy
+#endif
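+
+		// What follows emits the übertrampoline as a binary search in raw
+		// x86: each node compares one 32-bit word of the function contents
+		// in LS and branches (jb/ja/jmp) until a single compiled candidate
+		// remains; size0 * 20 bytes is a worst-case size estimate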
+
+		// Raw assembly pointer
+		u8* raw = wxptr;
+
+		struct work
+		{
+			u32 size;
+			u32 level;
+			u8* rel32;
+			std::map<std::vector<u32>, spu_function_t>::iterator beg;
+			std::map<std::vector<u32>, spu_function_t>::iterator end;
+		};
+
+		// Write jump instruction with rel32 immediate
+		auto make_jump = [&](u8 op, auto target)
+		{
+			verify("Asm overflow" HERE), raw + 6 <= wxptr + size0 * 20;
+
+			if (!target && !tr_dispatch)
+			{
+				// Generate a special trampoline with pause instruction
+#ifdef LLVM_AVAILABLE
+				const auto trptr = jit_compiler::alloc(16);
+#else
+				u8* const trptr = new u8[16]; // dummy
+#endif
+				trptr[0] = 0xf3; // pause
+				trptr[1] = 0x90;
+				trptr[2] = 0xff; // jmp [rip]
+				trptr[3] = 0x25;
+				std::memset(trptr + 4, 0, 4);
+				const u64 target = reinterpret_cast<u64>(&spu_recompiler_base::dispatch);
+				std::memcpy(trptr + 8, &target, 8);
+				tr_dispatch = reinterpret_cast<spu_function_t>(trptr);
+			}
+
+			// Fallback to dispatch if no target
+			const u64 taddr = target ? reinterpret_cast<u64>(target) : reinterpret_cast<u64>(tr_dispatch);
+
+			// Compute the distance
+			const s64 rel = taddr - reinterpret_cast<u64>(raw) - (op != 0xe9 ? 6 : 5);
+
+			verify(HERE), rel >= INT32_MIN, rel <= INT32_MAX;
+
+			if (op != 0xe9)
+			{
+				// First jcc byte
+				*raw++ = 0x0f;
+				verify(HERE), (op >> 4) == 0x8;
+			}
+
+			*raw++ = op;
+
+			const s32 r32 = static_cast<s32>(rel);
+
+			std::memcpy(raw, &r32, 4);
+			raw += 4;
+		};
+
+		std::vector<work> workload;
+		workload.reserve(size0);
+		workload.emplace_back();
+		workload.back().size = size0;
+		workload.back().level = 1;
+		workload.back().rel32 = 0;
+		workload.back().beg = beg;
+		workload.back().end = _end;
+
+		for (std::size_t i = 0; i < workload.size(); i++)
+		{
+			// Get copy of the workload info
+			work w = workload[i];
+
+			// Split range in two parts
+			auto it = w.beg;
+			auto it2 = w.beg;
+			u32 size1 = w.size / 2;
+			u32 size2 = w.size - size1;
+			std::advance(it2, w.size / 2);
+
+			while (true)
+			{
+				it = it2;
+				size1 = w.size - size2;
+
+				if (w.level >= w.beg->first.size())
+				{
+					// Cannot split: smallest function is a prefix of bigger ones (TODO)
+					break;
+				}
+
+				const u32 x1 = w.beg->first.at(w.level);
+
+				if (!x1)
+				{
+					// Cannot split: some functions contain holes at this level
+					w.level++;
+					continue;
+				}
+
+				// Adjust ranges (forward)
+				while (it != w.end && x1 == it->first.at(w.level))
+				{
+					it++;
+					size1++;
+				}
+
+				if (it == w.end)
+				{
+					// Cannot split: words are identical within the range at this level
+					w.level++;
+				}
+				else
+				{
+					size2 = w.size - size1;
+					break;
+				}
+			}
+
+			if (w.rel32)
+			{
+				// Patch rel32 linking it to the current location if necessary
+				const s32 r32 = ::narrow<s32>(raw - w.rel32, HERE);
+				std::memcpy(w.rel32 - 4, &r32, 4);
+			}
+
+			if (w.level >= w.beg->first.size())
+			{
+				// If functions cannot be compared, assume smallest function
+				LOG_ERROR(SPU, "Trampoline simplified at 0x%x (level=%u)", func[0], w.level);
+				make_jump(0xe9, w.beg->second); // jmp rel32
+				continue;
+			}
+
+			// Value for comparison
+			const u32 x = it->first.at(w.level);
+
+			// Adjust ranges (backward)
+			while (true)
+			{
+				it--;
+
+				if (it->first.at(w.level) != x)
+				{
+					it++;
+					break;
+				}
+
+				verify(HERE), it != w.beg;
+				size1--;
+				size2++;
+			}
+
+			// Emit 32-bit comparison: cmp [ls+addr], imm32
+			verify("Asm overflow" HERE), raw + 10 <= wxptr + size0 * 20;
+			const u32 cmp_lsa = start + (w.level - 1) * 4;
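+			// Encoding: 81 /7 is cmp r/m32, imm32; ModRM 0xba selects
+			// [rdx+disp32] (the LS register on Win64), 0xbe selects
+			// [rsi+disp32] (System V)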
+			*raw++ = 0x81;
+#ifdef _WIN32
+			*raw++ = 0xba;
+#else
+			*raw++ = 0xbe;
+#endif
+			std::memcpy(raw, &cmp_lsa, 4);
+			std::memcpy(raw + 4, &x, 4);
+			raw += 8;
+
+			// Low subrange target
+			if (size1 == 1)
+			{
+				make_jump(0x82, w.beg->second); // jb rel32
+			}
+			else
+			{
+				make_jump(0x82, raw); // jb rel32 (stub)
+				workload.push_back(w);
+				workload.back().end = it;
+				workload.back().size = size1;
+				workload.back().rel32 = raw;
+			}
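+
+			// Passing 'raw' as its own target emits a placeholder jump:
+			// rel32 records the end of that jump, and the patch step at
+			// the top of the loop rewrites [rel32-4] with the distance to
+			// wherever this work item's code is eventually emitted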
+
+			// Second subrange target
+			if (size2 == 1)
+			{
+				make_jump(0xe9, it->second); // jmp rel32
+			}
+			else
+			{
+				it2 = it;
+
+				// Select additional midrange for equality comparison
+				while (it2 != w.end && it2->first.at(w.level) == x)
+				{
+					size2--;
+					it2++;
+				}
+
+				if (it2 != w.end)
+				{
+					// High subrange target
+					if (size2 == 1)
+					{
+						make_jump(0x87, it2->second); // ja rel32
+					}
+					else
+					{
+						make_jump(0x87, raw); // ja rel32 (stub)
+						workload.push_back(w);
+						workload.back().beg = it2;
+						workload.back().size = size2;
+						workload.back().rel32 = raw;
+					}
+
+					const u32 size3 = w.size - size1 - size2;
+
+					if (size3 == 1)
+					{
+						make_jump(0xe9, it->second); // jmp rel32
+					}
+					else
+					{
+						make_jump(0xe9, raw); // jmp rel32 (stub)
+						workload.push_back(w);
+						workload.back().beg = it;
+						workload.back().end = it2;
+						workload.back().size = size3;
+						workload.back().rel32 = raw;
+					}
+				}
+				else
+				{
+					make_jump(0xe9, raw); // jmp rel32 (stub)
+					workload.push_back(w);
+					workload.back().beg = it;
+					workload.back().size = w.size - size1;
+					workload.back().rel32 = raw;
+				}
+			}
+		}
+
+		m_dispatcher[func[0] / 4] = reinterpret_cast<spu_function_t>(reinterpret_cast<void*>(wxptr));
+	}
+
+	lock.unlock();
+	m_cond.notify_all();
+}
+
 spu_recompiler_base::spu_recompiler_base()
 {
 }
@@ -1491,55 +1829,14 @@ void spu_recompiler_base::dump(std::string& out)
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/IPO.h"
 #include "llvm/Transforms/Vectorize.h"
-#include "Utilities/JIT.h"
-
-class spu_llvm_runtime
-{
-	shared_mutex m_mutex;
-
-	// All functions
-	std::map<std::vector<u32>, spu_function_t> m_map;
-
-	// All dispatchers
-	std::array<atomic_t<spu_function_t>, 0x10000> m_dispatcher;
-
-	// JIT instance
-	jit_compiler m_jit{{}, jit_compiler::cpu(g_cfg.core.llvm_cpu)};
-
-	// Debug module output location
-	std::string m_cache_path;
-
-	friend class spu_llvm_recompiler;
-
-public:
-	spu_llvm_runtime()
-	{
-		// Initialize lookup table
-		for (auto& v : m_dispatcher)
-		{
-			v.raw() = &spu_recompiler_base::dispatch;
-		}
-
-		// Initialize "empty" block
-		m_map[std::vector<u32>()] = &spu_recompiler_base::dispatch;
-
-		// Clear LLVM output
-		m_cache_path = Emu.PPUCache();
-		fs::create_dir(m_cache_path + "llvm/");
-		fs::remove_all(m_cache_path + "llvm/", false);
-
-		if (g_cfg.core.spu_debug)
-		{
-			fs::file(m_cache_path + "spu.log", fs::rewrite);
-		}
-
-		LOG_SUCCESS(SPU, "SPU Recompiler Runtime (LLVM) initialized...");
-	}
-};
 
 class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator
 {
-	std::shared_ptr<spu_llvm_runtime> m_spurt;
+	// SPU Runtime Instance
+	std::shared_ptr<spu_runtime> m_spurt;
+
+	// JIT Instance
+	jit_compiler m_jit{{}, jit_compiler::cpu(g_cfg.core.llvm_cpu)};
 
 	// Current function (chunk)
 	llvm::Function* m_function;
@@ -2239,11 +2536,6 @@ public:
 		: spu_recompiler_base()
 		, cpu_translator(nullptr, false)
 	{
-		if (g_cfg.core.spu_shared_runtime)
-		{
-			// TODO (local context is unsupported)
-			//m_spurt = std::make_shared<spu_llvm_runtime>();
-		}
 	}
 
 	virtual void init() override
@@ -2252,9 +2544,9 @@ public:
 		if (!m_spurt)
 		{
 			m_cache = fxm::get<spu_cache>();
-			m_spurt = fxm::get_always<spu_llvm_runtime>();
-			m_context = m_spurt->m_jit.get_context();
-			m_use_ssse3 = m_spurt->m_jit.has_ssse3();
+			m_spurt = fxm::get_always<spu_runtime>();
+			m_context = m_jit.get_context();
+			m_use_ssse3 = m_jit.has_ssse3();
 		}
 	}
 
@@ -2271,18 +2563,22 @@ public:
 		init();
 
-		// Don't lock without shared runtime
-		std::unique_lock lock(m_spurt->m_mutex, std::defer_lock);
-
-		if (g_cfg.core.spu_shared_runtime)
-		{
-			lock.lock();
-		}
+		std::unique_lock lock(m_spurt->m_mutex);
 
 		// Try to find existing function, register new one if necessary
 		const auto fn_info = m_spurt->m_map.emplace(std::move(func_rv), nullptr);
 
 		auto& fn_location = fn_info.first->second;
 
+		if (!fn_location && !fn_info.second)
+		{
+			// Wait if already in progress
+			while (!fn_location)
+			{
+				m_spurt->m_cond.wait(lock);
+			}
+		}
+
 		if (fn_location)
 		{
 			return fn_location;
@@ -2290,6 +2586,8 @@ public:
 
 		auto& func = fn_info.first->first;
 
+		lock.unlock();
+
 		std::string hash;
 		{
 			sha1_context ctx;
@@ -2770,179 +3068,6 @@ public:
 		m_scan_queue.clear();
 		m_function_table = nullptr;
 
-		// Generate a dispatcher (übertrampoline)
-		std::vector<u32> addrv{func[0]};
-		const auto beg = m_spurt->m_map.lower_bound(addrv);
-		addrv[0] += 4;
-		const auto _end = m_spurt->m_map.lower_bound(addrv);
-		const u32 size0 = std::distance(beg, _end);
-
-		if (size0 > 1)
-		{
-			const auto trampoline = cast<Function>(module->getOrInsertFunction(fmt::format("spu-0x%05x-trampoline-%03u", func[0], size0), get_type<void>(), get_type<u8*>(), get_type<u8*>()));
-			set_function(trampoline);
-
-			struct work
-			{
-				u32 size;
-				u32 level;
-				BasicBlock* label;
-				std::map<std::vector<u32>, spu_function_t>::iterator beg;
-				std::map<std::vector<u32>, spu_function_t>::iterator end;
-			};
-
-			std::vector<work> workload;
-			workload.reserve(size0);
-			workload.emplace_back();
-			workload.back().size = size0;
-			workload.back().level = 1;
-			workload.back().beg = beg;
-			workload.back().end = _end;
-			workload.back().label = m_ir->GetInsertBlock();
-
-			for (std::size_t i = 0; i < workload.size(); i++)
-			{
-				// Get copy of the workload info
-				work w = workload[i];
-
-				// Switch targets
-				std::vector<std::pair<u32, llvm::BasicBlock*>> targets;
-
-				llvm::BasicBlock* def{};
-
-				bool unsorted = false;
-
-				while (w.level < w.beg->first.size())
-				{
-					const u32 x1 = w.beg->first.at(w.level);
-
-					if (x1 == 0)
-					{
-						// Cannot split: some functions contain holes at this level
-						auto it = w.end;
-						it--;
-
-						if (it->first.at(w.level) != 0)
-						{
-							unsorted = true;
-						}
-
-						w.level++;
-						continue;
-					}
-
-					auto it = w.beg;
-					auto it2 = it;
-					u32 x = x1;
-					bool split = false;
-
-					while (it2 != w.end)
-					{
-						it2++;
-
-						const u32 x2 = it2 != w.end ? it2->first.at(w.level) : x1;
-
-						if (x2 != x)
-						{
-							const u32 dist = std::distance(it, it2);
-
-							const auto b = llvm::BasicBlock::Create(m_context, "", m_function);
-
-							if (dist == 1 && x != 0)
-							{
-								m_ir->SetInsertPoint(b);
-
-								if (const u64 fval = reinterpret_cast<u64>(it->second))
-								{
-									const auto ptr = m_ir->CreateIntToPtr(m_ir->getInt64(fval), main_func->getType());
-									m_ir->CreateCall(ptr, {m_thread, m_lsptr})->setTailCall();
-								}
-								else
-								{
-									verify(HERE, &it->second == &fn_location);
-									m_ir->CreateCall(main_func, {m_thread, m_lsptr})->setTailCall();
-								}
-
-								m_ir->CreateRetVoid();
-							}
-							else
-							{
-								workload.emplace_back(w);
-								workload.back().beg = it;
-								workload.back().end = it2;
-								workload.back().label = b;
-								workload.back().size = dist;
-							}
-
-							if (x == 0)
-							{
-								def = b;
-							}
-							else
-							{
-								targets.emplace_back(std::make_pair(x, b));
-							}
-
-							x = x2;
-							it = it2;
-							split = true;
-						}
-					}
-
-					if (!split)
-					{
-						// Cannot split: words are identical within the range at this level
-						w.level++;
-					}
-					else
-					{
-						break;
-					}
-				}
-
-				if (!def && targets.empty())
-				{
-					LOG_ERROR(SPU, "Trampoline simplified at 0x%x (level=%u)", func[0], w.level);
-					m_ir->SetInsertPoint(w.label);
-
-					if (const u64 fval = reinterpret_cast<u64>(w.beg->second))
-					{
-						const auto ptr = m_ir->CreateIntToPtr(m_ir->getInt64(fval), main_func->getType());
-						m_ir->CreateCall(ptr, {m_thread, m_lsptr})->setTailCall();
-					}
-					else
-					{
-						verify(HERE, &w.beg->second == &fn_location);
-						m_ir->CreateCall(main_func, {m_thread, m_lsptr})->setTailCall();
-					}
-
-					m_ir->CreateRetVoid();
-					continue;
-				}
-
-				if (!def)
-				{
-					def = llvm::BasicBlock::Create(m_context, "", m_function);
-
-					m_ir->SetInsertPoint(def);
-					tail(&spu_recompiler_base::dispatch, m_thread, m_ir->getInt32(0), m_ir->getInt32(0));
-				}
-
-				m_ir->SetInsertPoint(w.label);
-				const auto add = m_ir->CreateGEP(m_lsptr, m_ir->getInt64(start + w.level * 4 - 4));
-				const auto ptr = m_ir->CreateBitCast(add, get_type<u32*>());
-				const auto val = m_ir->CreateLoad(ptr);
-				const auto sw = m_ir->CreateSwitch(val, def, ::size32(targets));
-
-				for (auto& pair : targets)
-				{
-					sw->addCase(m_ir->getInt32(pair.first), pair.second);
-				}
-			}
-		}
-
-		spu_function_t fn{}, tr{};
-
 		std::string log;
 
 		raw_string_ostream out(log);
@@ -2970,32 +3095,19 @@ public:
 		if (g_cfg.core.spu_debug)
 		{
 			// Testing only
-			m_spurt->m_jit.add(std::move(module), m_spurt->m_cache_path + "llvm/");
+			m_jit.add(std::move(module), m_spurt->m_cache_path + "llvm/");
 		}
 		else
 		{
-			m_spurt->m_jit.add(std::move(module));
+			m_jit.add(std::move(module));
 		}
 
-		m_spurt->m_jit.fin();
-		fn = reinterpret_cast<spu_function_t>(m_spurt->m_jit.get_engine().getPointerToFunction(main_func));
-		tr = fn;
-
-		if (size0 > 1)
-		{
-			tr = reinterpret_cast<spu_function_t>(m_spurt->m_jit.get_engine().getPointerToFunction(m_function));
-		}
+		m_jit.fin();
 
 		// Register function pointer
-		fn_location = fn;
+		const spu_function_t fn = reinterpret_cast<spu_function_t>(m_jit.get_engine().getPointerToFunction(main_func));
 
-		// Trampoline
-		m_spurt->m_dispatcher[func[0] / 4] = tr;
-
-		LOG_NOTICE(SPU, "[0x%x] Compiled: %p", func[0], fn);
-
-		if (tr != fn)
-			LOG_NOTICE(SPU, "[0x%x] T: %p", func[0], tr);
+		m_spurt->add(*fn_info.first, fn);
 
 		if (g_cfg.core.spu_debug)
 		{
diff --git a/rpcs3/Emu/Cell/SPURecompiler.h b/rpcs3/Emu/Cell/SPURecompiler.h
index f75ea57faa..7d88ea9c94 100644
--- a/rpcs3/Emu/Cell/SPURecompiler.h
+++ b/rpcs3/Emu/Cell/SPURecompiler.h
@@ -1,6 +1,9 @@
 #pragma once
 
 #include "Utilities/File.h"
+#include "Utilities/mutex.h"
+#include "Utilities/cond.h"
+#include "Utilities/JIT.h"
 #include "SPUThread.h"
 #include
 #include
@@ -30,6 +33,40 @@ public:
 	static void initialize();
 };
 
+// Helper class
+class spu_runtime
+{
+public:
+	shared_mutex m_mutex;
+
+	cond_variable m_cond;
+
+	// All functions
+	std::map<std::vector<u32>, spu_function_t> m_map;
+
+	// All dispatchers
+	std::array<atomic_t<spu_function_t>, 0x10000> m_dispatcher;
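+
+	// Indexed by entry address / 4: one slot for each of the 0x10000
+	// possible word-aligned positions in the 256 KiB local storage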
+
+	// Debug module output location
+	std::string m_cache_path;
+
+private:
+	// Temporarily: asmjit runtime collection
+	std::deque<std::unique_ptr<asmjit::JitRuntime>> m_asmjit_rts;
+
+	// Trampoline to spu_recompiler_base::dispatch
+	spu_function_t tr_dispatch = nullptr;
+
+public:
+	spu_runtime();
+
+	// Get new ASMJIT runtime
+	asmjit::JitRuntime* get_asmjit_rt();
+
+	// Add compiled function and generate trampoline if necessary
+	void add(std::pair<const std::vector<u32>, spu_function_t>& where, spu_function_t compiled);
+};
+
 // SPU Recompiler instance base class
 class spu_recompiler_base
 {
diff --git a/rpcs3/Emu/System.h b/rpcs3/Emu/System.h
index 5f0f06d8e4..cf09ead762 100644
--- a/rpcs3/Emu/System.h
+++ b/rpcs3/Emu/System.h
@@ -367,7 +367,6 @@ struct cfg_root : cfg::node
 		cfg::_int<0, 6> preferred_spu_threads{this, "Preferred SPU Threads", 0}; //Number of hardware threads dedicated to heavy simultaneous spu tasks
 		cfg::_int<0, 16> spu_delay_penalty{this, "SPU delay penalty", 3}; //Number of milliseconds to block a thread if a virtual 'core' isn't free
 		cfg::_bool spu_loop_detection{this, "SPU loop detection", true}; //Try to detect wait loops and trigger thread yield
-		cfg::_bool spu_shared_runtime{this, "SPU Shared Runtime", true}; // Share compiled SPU functions between all threads
 		cfg::_enum<spu_block_size_type> spu_block_size{this, "SPU Block Size", spu_block_size_type::safe};
 		cfg::_bool spu_accurate_getllar{this, "Accurate GETLLAR", false};
 		cfg::_bool spu_accurate_putlluc{this, "Accurate PUTLLUC", false};