mirror of
https://github.com/RPCS3/rpcs3.git
synced 2025-01-31 12:31:45 +01:00
SPU: multithread compilation
Allow parallel compilation of SPU code, both at startup and at runtime.
Remove the 'SPU Shared Runtime' option (it became obsolete).
Refactor the spu_runtime class (it is now common to ASMJIT and LLVM).
Implement SPU ubertrampoline generation in raw assembly (LLVM).
Minor improvement of balanced_wait_until<> and balanced_awaken<>.
Make JIT MemoryManager2 shared (global).
Fix wrong assertion in cond_variable.
This commit is contained in:
parent
8d5d44141e
commit
4f152ad126
@ -95,6 +95,12 @@ static void* const s_memory = []() -> void*
|
||||
return utils::memory_reserve(s_memory_size);
|
||||
}();
|
||||
|
||||
// Reserve 2G of memory, should replace previous area for ASLR compatibility
|
||||
static void* const s_memory2 = utils::memory_reserve(0x80000000);
|
||||
|
||||
static u64 s_code_pos = 0;
|
||||
static u64 s_data_pos = 0;
|
||||
|
||||
static void* s_next = s_memory;
|
||||
|
||||
#ifdef _WIN32
|
||||
@ -129,6 +135,11 @@ extern void jit_finalize()
|
||||
utils::memory_decommit(s_memory, s_memory_size);
|
||||
|
||||
s_next = s_memory;
|
||||
|
||||
utils::memory_decommit(s_memory2, 0x80000000);
|
||||
|
||||
s_code_pos = 0;
|
||||
s_data_pos = 0;
|
||||
}
|
||||
|
||||
// Helper class
|
||||
@ -311,24 +322,25 @@ struct MemoryManager : llvm::RTDyldMemoryManager
|
||||
// Simple memory manager
|
||||
struct MemoryManager2 : llvm::RTDyldMemoryManager
|
||||
{
|
||||
// Reserve 2 GiB
|
||||
void* const m_memory = utils::memory_reserve(0x80000000);
|
||||
// Patchwork again...
|
||||
void* const m_memory = s_memory2;
|
||||
|
||||
u8* const m_code = static_cast<u8*>(m_memory) + 0x00000000;
|
||||
u8* const m_data = static_cast<u8*>(m_memory) + 0x40000000;
|
||||
|
||||
u64 m_code_pos = 0;
|
||||
u64 m_data_pos = 0;
|
||||
u64& m_code_pos = s_code_pos;
|
||||
u64& m_data_pos = s_data_pos;
|
||||
|
||||
MemoryManager2() = default;
|
||||
|
||||
~MemoryManager2() override
|
||||
{
|
||||
utils::memory_release(m_memory, 0x80000000);
|
||||
}
|
||||
|
||||
u8* allocateCodeSection(std::uintptr_t size, uint align, uint sec_id, llvm::StringRef sec_name) override
|
||||
{
|
||||
std::lock_guard lock(s_mutex);
|
||||
|
||||
// Simple allocation
|
||||
const u64 old = m_code_pos;
|
||||
const u64 pos = ::align(m_code_pos, align);
|
||||
@ -349,12 +361,20 @@ struct MemoryManager2 : llvm::RTDyldMemoryManager
|
||||
utils::memory_commit(m_code + olda, newa - olda, utils::protection::wx);
|
||||
}
|
||||
|
||||
if (!sec_id && sec_name.empty())
|
||||
{
|
||||
// Special case: don't log
|
||||
return m_code + pos;
|
||||
}
|
||||
|
||||
LOG_NOTICE(GENERAL, "LLVM: Code section %u '%s' allocated -> %p (size=0x%x, align=0x%x)", sec_id, sec_name.data(), m_code + pos, size, align);
|
||||
return m_code + pos;
|
||||
}
|
||||
|
||||
u8* allocateDataSection(std::uintptr_t size, uint align, uint sec_id, llvm::StringRef sec_name, bool is_ro) override
|
||||
{
|
||||
std::lock_guard lock(s_mutex);
|
||||
|
||||
// Simple allocation
|
||||
const u64 old = m_data_pos;
|
||||
const u64 pos = ::align(m_data_pos, align);
|
||||
@ -642,33 +662,12 @@ u64 jit_compiler::get(const std::string& name)
|
||||
return m_engine->getGlobalValueAddress(name);
|
||||
}
|
||||
|
||||
std::unordered_map<std::string, u64> jit_compiler::add(std::unordered_map<std::string, std::string> data)
|
||||
u8* jit_compiler::alloc(u32 size)
|
||||
{
|
||||
// Lock memory manager
|
||||
std::lock_guard lock(s_mutex);
|
||||
// Dummy memory manager object
|
||||
MemoryManager2 mm;
|
||||
|
||||
std::unordered_map<std::string, u64> result;
|
||||
|
||||
std::size_t size = 0;
|
||||
|
||||
for (auto&& pair : data)
|
||||
{
|
||||
size += ::align(pair.second.size(), 16);
|
||||
}
|
||||
|
||||
utils::memory_commit(s_next, size, utils::protection::wx);
|
||||
std::memset(s_next, 0xc3, ::align(size, 4096));
|
||||
|
||||
for (auto&& pair : data)
|
||||
{
|
||||
std::memcpy(s_next, pair.second.data(), pair.second.size());
|
||||
result.emplace(pair.first, (u64)s_next);
|
||||
s_next = (void*)::align((u64)s_next + pair.second.size(), 16);
|
||||
}
|
||||
|
||||
s_next = (void*)::align((u64)s_next, 4096);
|
||||
|
||||
return result;
|
||||
return mm.allocateCodeSection(size, 16, 0, {});
|
||||
}
|
||||
|
||||
#endif
|
||||
|
@ -61,6 +61,7 @@ FT build_function_asm(F&& builder)
|
||||
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
#include <unordered_map>
|
||||
|
||||
#include "types.h"
|
||||
@ -129,8 +130,8 @@ public:
|
||||
// Get compiled function address
|
||||
u64 get(const std::string& name);
|
||||
|
||||
// Add functions directly to the memory manager (name -> code)
|
||||
static std::unordered_map<std::string, u64> add(std::unordered_map<std::string, std::string>);
|
||||
// Allocate writable executable memory (alignment is assumed 16)
|
||||
static u8* alloc(u32 size);
|
||||
|
||||
// Get CPU info
|
||||
static std::string cpu(const std::string& _cpu);
|
||||
|
@ -10,7 +10,7 @@
|
||||
|
||||
bool cond_variable::imp_wait(u32 _old, u64 _timeout) noexcept
|
||||
{
|
||||
verify("cond_variable overflow" HERE), (_old & 0xffff) == 0; // Very unlikely: it requires 65535 distinct threads to wait simultaneously
|
||||
verify("cond_variable overflow" HERE), (_old & 0xffff) != 0xffff; // Very unlikely: it requires 65535 distinct threads to wait simultaneously
|
||||
|
||||
return balanced_wait_until(m_value, _timeout, [&](u32& value, auto... ret) -> int
|
||||
{
|
||||
@ -42,7 +42,8 @@ bool cond_variable::imp_wait(u32 _old, u64 _timeout) noexcept
|
||||
|
||||
void cond_variable::imp_wake(u32 _count) noexcept
|
||||
{
|
||||
balanced_awaken(m_value, m_value.atomic_op([&](u32& value) -> u32
|
||||
// TODO (notify_one)
|
||||
balanced_awaken<true>(m_value, m_value.atomic_op([&](u32& value) -> u32
|
||||
{
|
||||
// Subtract already signaled number from total amount of waiters
|
||||
const u32 can_sig = (value & 0xffff) - (value >> 16);
|
||||
@ -266,7 +267,7 @@ void cond_x16::imp_notify() noexcept
|
||||
return;
|
||||
}
|
||||
|
||||
balanced_awaken(m_cvx16, utils::popcnt16(wait_mask));
|
||||
balanced_awaken<true>(m_cvx16, utils::popcnt16(wait_mask));
|
||||
}
|
||||
|
||||
bool lf_queue_base::wait(u64 _timeout)
|
||||
|
@ -186,7 +186,7 @@ bool balanced_wait_until(atomic_t<T>& var, u64 usec_timeout, Pred&& pred)
|
||||
{
|
||||
if (OptWaitOnAddress(&var, &value, sizeof(T), is_inf ? INFINITE : usec_timeout / 1000))
|
||||
{
|
||||
if (!test_pred(value) && !test_pred(value, nullptr))
|
||||
if (!test_pred(value, nullptr))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
@ -220,7 +220,7 @@ bool balanced_wait_until(atomic_t<T>& var, u64 usec_timeout, Pred&& pred)
|
||||
return true;
|
||||
}
|
||||
|
||||
if (!test_pred(value) && !test_pred(value, nullptr))
|
||||
if (!test_pred(value, nullptr))
|
||||
{
|
||||
// Stolen notification: restore balance
|
||||
NtReleaseKeyedEvent(nullptr, &var, false, nullptr);
|
||||
@ -237,7 +237,7 @@ bool balanced_wait_until(atomic_t<T>& var, u64 usec_timeout, Pred&& pred)
|
||||
{
|
||||
if (futex(&var, FUTEX_WAIT_PRIVATE, static_cast<u32>(value), is_inf ? nullptr : &timeout) == 0)
|
||||
{
|
||||
if (!test_pred(value) && !test_pred(value, nullptr))
|
||||
if (!test_pred(value, nullptr))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
@ -257,7 +257,7 @@ bool balanced_wait_until(atomic_t<T>& var, u64 usec_timeout, Pred&& pred)
|
||||
#endif
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
template <bool All = false, typename T>
|
||||
void balanced_awaken(atomic_t<T>& var, u32 weight)
|
||||
{
|
||||
static_assert(sizeof(T) == 4 || sizeof(T) == 8);
|
||||
@ -265,11 +265,13 @@ void balanced_awaken(atomic_t<T>& var, u32 weight)
|
||||
#ifdef _WIN32
|
||||
if (OptWaitOnAddress)
|
||||
{
|
||||
if (weight > 1)
|
||||
if (All || weight > 3)
|
||||
{
|
||||
OptWakeByAddressAll(&var);
|
||||
return;
|
||||
}
|
||||
else if (weight == 1)
|
||||
|
||||
for (u32 i = 0; i < weight; i++)
|
||||
{
|
||||
OptWakeByAddressSingle(&var);
|
||||
}
|
||||
@ -282,9 +284,9 @@ void balanced_awaken(atomic_t<T>& var, u32 weight)
|
||||
NtReleaseKeyedEvent(nullptr, &var, false, nullptr);
|
||||
}
|
||||
#else
|
||||
if (weight)
|
||||
if (All || weight)
|
||||
{
|
||||
futex(&var, FUTEX_WAKE_PRIVATE, std::min<u32>(INT_MAX, weight));
|
||||
futex(&var, FUTEX_WAKE_PRIVATE, All ? INT_MAX : std::min<u32>(INT_MAX, weight));
|
||||
}
|
||||
|
||||
return;
|
||||
|
@ -32,33 +32,8 @@ std::unique_ptr<spu_recompiler_base> spu_recompiler_base::make_asmjit_recompiler
|
||||
return std::make_unique<spu_recompiler>();
|
||||
}
|
||||
|
||||
spu_runtime::spu_runtime()
|
||||
{
|
||||
m_cache_path = fxm::check_unlocked<ppu_module>()->cache;
|
||||
|
||||
if (g_cfg.core.spu_debug)
|
||||
{
|
||||
fs::file(m_cache_path + "spu.log", fs::rewrite);
|
||||
}
|
||||
|
||||
LOG_SUCCESS(SPU, "SPU Recompiler Runtime (ASMJIT) initialized...");
|
||||
|
||||
// Initialize lookup table
|
||||
for (auto& v : m_dispatcher)
|
||||
{
|
||||
v.raw() = &spu_recompiler_base::dispatch;
|
||||
}
|
||||
|
||||
// Initialize "empty" block
|
||||
m_map[std::vector<u32>()] = &spu_recompiler_base::dispatch;
|
||||
}
|
||||
|
||||
spu_recompiler::spu_recompiler()
|
||||
{
|
||||
if (!g_cfg.core.spu_shared_runtime)
|
||||
{
|
||||
m_spurt = std::make_shared<spu_runtime>();
|
||||
}
|
||||
}
|
||||
|
||||
void spu_recompiler::init()
|
||||
@ -68,6 +43,7 @@ void spu_recompiler::init()
|
||||
{
|
||||
m_cache = fxm::get<spu_cache>();
|
||||
m_spurt = fxm::get_always<spu_runtime>();
|
||||
m_asmrt = m_spurt->get_asmjit_rt();
|
||||
}
|
||||
}
|
||||
|
||||
@ -83,19 +59,22 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& func_rv)
|
||||
{
|
||||
init();
|
||||
|
||||
// Don't lock without shared runtime
|
||||
std::unique_lock lock(m_spurt->m_mutex, std::defer_lock);
|
||||
|
||||
if (g_cfg.core.spu_shared_runtime)
|
||||
{
|
||||
lock.lock();
|
||||
}
|
||||
std::unique_lock lock(m_spurt->m_mutex);
|
||||
|
||||
// Try to find existing function, register new one if necessary
|
||||
const auto fn_info = m_spurt->m_map.emplace(std::move(func_rv), nullptr);
|
||||
|
||||
auto& fn_location = fn_info.first->second;
|
||||
|
||||
if (!fn_location && !fn_info.second)
|
||||
{
|
||||
// Wait if already in progress
|
||||
while (!fn_location)
|
||||
{
|
||||
m_spurt->m_cond.wait(lock);
|
||||
}
|
||||
}
|
||||
|
||||
if (fn_location)
|
||||
{
|
||||
return fn_location;
|
||||
@ -103,6 +82,8 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& func_rv)
|
||||
|
||||
auto& func = fn_info.first->first;
|
||||
|
||||
lock.unlock();
|
||||
|
||||
using namespace asmjit;
|
||||
|
||||
SPUDisAsm dis_asm(CPUDisAsm_InterpreterMode);
|
||||
@ -124,7 +105,7 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& func_rv)
|
||||
}
|
||||
|
||||
CodeHolder code;
|
||||
code.init(m_spurt->m_jitrt.getCodeInfo());
|
||||
code.init(m_asmrt->getCodeInfo());
|
||||
code._globalHints = asmjit::CodeEmitter::kHintOptimizedAlign;
|
||||
|
||||
X86Assembler compiler(&code);
|
||||
@ -861,14 +842,11 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& func_rv)
|
||||
// Compile and get function address
|
||||
spu_function_t fn;
|
||||
|
||||
if (m_spurt->m_jitrt.add(&fn, &code))
|
||||
if (m_asmrt->add(&fn, &code))
|
||||
{
|
||||
LOG_FATAL(SPU, "Failed to build a function");
|
||||
}
|
||||
|
||||
// Register function
|
||||
fn_location = fn;
|
||||
|
||||
if (g_cfg.core.spu_debug)
|
||||
{
|
||||
// Add ASMJIT logs
|
||||
@ -885,6 +863,11 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& func_rv)
|
||||
m_cache->add(func);
|
||||
}
|
||||
|
||||
lock.lock();
|
||||
|
||||
// Register function (possibly temporarily)
|
||||
fn_location = fn;
|
||||
|
||||
// Generate a dispatcher (übertrampoline)
|
||||
std::vector<u32> addrv{func[0]};
|
||||
const auto beg = m_spurt->m_map.lower_bound(addrv);
|
||||
@ -899,19 +882,11 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& func_rv)
|
||||
else
|
||||
{
|
||||
CodeHolder code;
|
||||
code.init(m_spurt->m_jitrt.getCodeInfo());
|
||||
code.init(m_asmrt->getCodeInfo());
|
||||
|
||||
X86Assembler compiler(&code);
|
||||
this->c = &compiler;
|
||||
|
||||
if (g_cfg.core.spu_debug)
|
||||
{
|
||||
// Set logger
|
||||
code.setLogger(&logger);
|
||||
}
|
||||
|
||||
compiler.comment("\n\nTrampoline:\n\n");
|
||||
|
||||
struct work
|
||||
{
|
||||
u32 size;
|
||||
@ -1110,7 +1085,7 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& func_rv)
|
||||
|
||||
spu_function_t tr;
|
||||
|
||||
if (m_spurt->m_jitrt.add(&tr, &code))
|
||||
if (m_asmrt->add(&tr, &code))
|
||||
{
|
||||
LOG_FATAL(SPU, "Failed to build a trampoline");
|
||||
}
|
||||
@ -1118,6 +1093,9 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& func_rv)
|
||||
m_spurt->m_dispatcher[func[0] / 4] = tr;
|
||||
}
|
||||
|
||||
lock.unlock();
|
||||
m_spurt->m_cond.notify_all();
|
||||
|
||||
return fn;
|
||||
}
|
||||
|
||||
|
@ -1,33 +1,10 @@
|
||||
#pragma once
|
||||
|
||||
#include "Utilities/JIT.h"
|
||||
#include "Utilities/mutex.h"
|
||||
#include "SPURecompiler.h"
|
||||
|
||||
#include <functional>
|
||||
|
||||
// SPU ASMJIT Runtime object (global)
|
||||
class spu_runtime
|
||||
{
|
||||
shared_mutex m_mutex;
|
||||
|
||||
asmjit::JitRuntime m_jitrt;
|
||||
|
||||
// All functions
|
||||
std::map<std::vector<u32>, spu_function_t> m_map;
|
||||
|
||||
// All dispatchers
|
||||
std::array<atomic_t<spu_function_t>, 0x10000> m_dispatcher;
|
||||
|
||||
// Debug module output location
|
||||
std::string m_cache_path;
|
||||
|
||||
friend class spu_recompiler;
|
||||
|
||||
public:
|
||||
spu_runtime();
|
||||
};
|
||||
|
||||
// SPU ASMJIT Recompiler
|
||||
class spu_recompiler : public spu_recompiler_base
|
||||
{
|
||||
@ -43,6 +20,9 @@ public:
|
||||
virtual spu_function_t compile(std::vector<u32>&&) override;
|
||||
|
||||
private:
|
||||
// ASMJIT runtime
|
||||
asmjit::JitRuntime* m_asmrt;
|
||||
|
||||
// emitter:
|
||||
asmjit::X86Assembler* c;
|
||||
|
||||
|
@ -24,7 +24,7 @@ const spu_decoder<spu_iname> s_spu_iname;
|
||||
extern u64 get_timebased_time();
|
||||
|
||||
spu_cache::spu_cache(const std::string& loc)
|
||||
: m_file(loc, fs::read + fs::write + fs::create)
|
||||
: m_file(loc, fs::read + fs::write + fs::create + fs::append)
|
||||
{
|
||||
}
|
||||
|
||||
@ -76,18 +76,22 @@ void spu_cache::add(const std::vector<u32>& func)
|
||||
return;
|
||||
}
|
||||
|
||||
be_t<u32> size = ::size32(func) - 1;
|
||||
be_t<u32> addr = func[0];
|
||||
m_file.write(size);
|
||||
m_file.write(addr);
|
||||
m_file.write(func.data() + 1, func.size() * 4 - 4);
|
||||
// Allocate buffer
|
||||
const auto buf = std::make_unique<be_t<u32>[]>(func.size() + 1);
|
||||
|
||||
buf[0] = ::size32(func) - 1;
|
||||
buf[1] = func[0];
|
||||
std::memcpy(buf.get() + 2, func.data() + 1, func.size() * 4 - 4);
|
||||
|
||||
// Append data
|
||||
m_file.write(buf.get(), func.size() * 4 + 4);
|
||||
}
|
||||
|
||||
void spu_cache::initialize()
|
||||
{
|
||||
const std::string ppu_cache = Emu.PPUCache();
|
||||
|
||||
if (ppu_cache.empty() || !g_cfg.core.spu_shared_runtime)
|
||||
if (ppu_cache.empty())
|
||||
{
|
||||
return;
|
||||
}
|
||||
@ -105,30 +109,34 @@ void spu_cache::initialize()
|
||||
|
||||
// Read cache
|
||||
auto func_list = cache->get();
|
||||
atomic_t<std::size_t> fnext{};
|
||||
|
||||
// Recompiler instance for cache initialization
|
||||
std::unique_ptr<spu_recompiler_base> compiler;
|
||||
// Initialize compiler instances for parallel compilation
|
||||
u32 max_threads = static_cast<u32>(g_cfg.core.llvm_threads);
|
||||
u32 thread_count = max_threads > 0 ? std::min(max_threads, std::thread::hardware_concurrency()) : std::thread::hardware_concurrency();
|
||||
std::vector<std::unique_ptr<spu_recompiler_base>> compilers{thread_count};
|
||||
|
||||
if (g_cfg.core.spu_decoder == spu_decoder_type::asmjit)
|
||||
for (auto& compiler : compilers)
|
||||
{
|
||||
compiler = spu_recompiler_base::make_asmjit_recompiler();
|
||||
}
|
||||
if (g_cfg.core.spu_decoder == spu_decoder_type::asmjit)
|
||||
{
|
||||
compiler = spu_recompiler_base::make_asmjit_recompiler();
|
||||
}
|
||||
else if (g_cfg.core.spu_decoder == spu_decoder_type::llvm)
|
||||
{
|
||||
compiler = spu_recompiler_base::make_llvm_recompiler();
|
||||
}
|
||||
else
|
||||
{
|
||||
compilers.clear();
|
||||
break;
|
||||
}
|
||||
|
||||
if (g_cfg.core.spu_decoder == spu_decoder_type::llvm)
|
||||
{
|
||||
compiler = spu_recompiler_base::make_llvm_recompiler();
|
||||
}
|
||||
|
||||
if (compiler)
|
||||
{
|
||||
compiler->init();
|
||||
}
|
||||
|
||||
if (compiler && !func_list.empty())
|
||||
if (compilers.size() && !func_list.empty())
|
||||
{
|
||||
// Fake LS
|
||||
std::vector<be_t<u32>> ls(0x10000);
|
||||
|
||||
// Initialize progress dialog (wait for previous progress done)
|
||||
while (g_progr_ptotal)
|
||||
{
|
||||
@ -137,10 +145,20 @@ void spu_cache::initialize()
|
||||
|
||||
g_progr = "Building SPU cache...";
|
||||
g_progr_ptotal += func_list.size();
|
||||
}
|
||||
|
||||
std::deque<named_thread<std::function<void()>>> thread_queue;
|
||||
|
||||
for (std::size_t i = 0; i < compilers.size(); i++) thread_queue.emplace_back("Worker " + std::to_string(i), [&, compiler = compilers[i].get()]()
|
||||
{
|
||||
// Fake LS
|
||||
std::vector<be_t<u32>> ls(0x10000);
|
||||
|
||||
// Build functions
|
||||
for (auto&& func : func_list)
|
||||
for (std::size_t func_i = fnext++; func_i < func_list.size(); func_i = fnext++)
|
||||
{
|
||||
std::vector<u32>& func = func_list[func_i];
|
||||
|
||||
if (Emu.IsStopped())
|
||||
{
|
||||
g_progr_pdone++;
|
||||
@ -185,13 +203,22 @@ void spu_cache::initialize()
|
||||
|
||||
g_progr_pdone++;
|
||||
}
|
||||
});
|
||||
|
||||
if (Emu.IsStopped())
|
||||
{
|
||||
LOG_ERROR(SPU, "SPU Runtime: Cache building aborted.");
|
||||
return;
|
||||
}
|
||||
// Join all threads
|
||||
while (!thread_queue.empty())
|
||||
{
|
||||
thread_queue.pop_front();
|
||||
}
|
||||
|
||||
if (Emu.IsStopped())
|
||||
{
|
||||
LOG_ERROR(SPU, "SPU Runtime: Cache building aborted.");
|
||||
return;
|
||||
}
|
||||
|
||||
if (compilers.size() && !func_list.empty())
|
||||
{
|
||||
LOG_SUCCESS(SPU, "SPU Runtime: Built %u functions.", func_list.size());
|
||||
}
|
||||
|
||||
@ -202,6 +229,317 @@ void spu_cache::initialize()
|
||||
});
|
||||
}
|
||||
|
||||
spu_runtime::spu_runtime()
|
||||
{
|
||||
// Initialize lookup table
|
||||
for (auto& v : m_dispatcher)
|
||||
{
|
||||
v.raw() = &spu_recompiler_base::dispatch;
|
||||
}
|
||||
|
||||
// Initialize "empty" block
|
||||
m_map[std::vector<u32>()] = &spu_recompiler_base::dispatch;
|
||||
|
||||
// Clear LLVM output
|
||||
m_cache_path = Emu.PPUCache();
|
||||
fs::create_dir(m_cache_path + "llvm/");
|
||||
fs::remove_all(m_cache_path + "llvm/", false);
|
||||
|
||||
if (g_cfg.core.spu_debug)
|
||||
{
|
||||
fs::file(m_cache_path + "spu.log", fs::rewrite);
|
||||
}
|
||||
|
||||
LOG_SUCCESS(SPU, "SPU Recompiler Runtime initialized...");
|
||||
}
|
||||
|
||||
asmjit::JitRuntime* spu_runtime::get_asmjit_rt()
|
||||
{
|
||||
std::lock_guard lock(m_mutex);
|
||||
|
||||
m_asmjit_rts.emplace_back(std::make_unique<asmjit::JitRuntime>());
|
||||
|
||||
return m_asmjit_rts.back().get();
|
||||
}
|
||||
|
||||
void spu_runtime::add(std::pair<const std::vector<u32>, spu_function_t>& where, spu_function_t compiled)
|
||||
{
|
||||
std::unique_lock lock(m_mutex);
|
||||
|
||||
// Function info
|
||||
const std::vector<u32>& func = where.first;
|
||||
|
||||
//
|
||||
const u32 start = func[0] * (g_cfg.core.spu_block_size != spu_block_size_type::giga);
|
||||
|
||||
// Set pointer to the compiled function
|
||||
where.second = compiled;
|
||||
|
||||
// Generate a dispatcher (übertrampoline)
|
||||
std::vector<u32> addrv{func[0]};
|
||||
const auto beg = m_map.lower_bound(addrv);
|
||||
addrv[0] += 4;
|
||||
const auto _end = m_map.lower_bound(addrv);
|
||||
const u32 size0 = std::distance(beg, _end);
|
||||
|
||||
if (size0 == 1)
|
||||
{
|
||||
m_dispatcher[func[0] / 4] = compiled;
|
||||
}
|
||||
else
|
||||
{
|
||||
// Allocate some writable executable memory
|
||||
#ifdef LLVM_AVAILABLE
|
||||
const auto wxptr = jit_compiler::alloc(size0 * 20);
|
||||
#else
|
||||
u8* const wxptr = new u8[size0 * 20]; // dummy
|
||||
#endif
|
||||
|
||||
// Raw assembly pointer
|
||||
u8* raw = wxptr;
|
||||
|
||||
struct work
|
||||
{
|
||||
u32 size;
|
||||
u32 level;
|
||||
u8* rel32;
|
||||
std::map<std::vector<u32>, spu_function_t>::iterator beg;
|
||||
std::map<std::vector<u32>, spu_function_t>::iterator end;
|
||||
};
|
||||
|
||||
// Write jump instruction with rel32 immediate
|
||||
auto make_jump = [&](u8 op, auto target)
|
||||
{
|
||||
verify("Asm overflow" HERE), raw + 6 <= wxptr + size0 * 20;
|
||||
|
||||
if (!target && !tr_dispatch)
|
||||
{
|
||||
// Generate a special trampoline with pause instruction
|
||||
#ifdef LLVM_AVAILABLE
|
||||
const auto trptr = jit_compiler::alloc(16);
|
||||
#else
|
||||
u8* const trptr = new u8[16]; // dummy
|
||||
#endif
|
||||
trptr[0] = 0xf3; // pause
|
||||
trptr[1] = 0x90;
|
||||
trptr[2] = 0xff; // jmp [rip]
|
||||
trptr[3] = 0x25;
|
||||
std::memset(trptr + 4, 0, 4);
|
||||
const u64 target = reinterpret_cast<u64>(&spu_recompiler_base::dispatch);
|
||||
std::memcpy(trptr + 8, &target, 8);
|
||||
tr_dispatch = reinterpret_cast<spu_function_t>(trptr);
|
||||
}
|
||||
|
||||
// Fallback to dispatch if no target
|
||||
const u64 taddr = target ? reinterpret_cast<u64>(target) : reinterpret_cast<u64>(tr_dispatch);
|
||||
|
||||
// Compute the distance
|
||||
const s64 rel = taddr - reinterpret_cast<u64>(raw) - (op != 0xe9 ? 6 : 5);
|
||||
|
||||
verify(HERE), rel >= INT32_MIN, rel <= INT32_MAX;
|
||||
|
||||
if (op != 0xe9)
|
||||
{
|
||||
// First jcc byte
|
||||
*raw++ = 0x0f;
|
||||
verify(HERE), (op >> 4) == 0x8;
|
||||
}
|
||||
|
||||
*raw++ = op;
|
||||
|
||||
const s32 r32 = static_cast<s32>(rel);
|
||||
|
||||
std::memcpy(raw, &r32, 4);
|
||||
raw += 4;
|
||||
};
|
||||
|
||||
std::vector<work> workload;
|
||||
workload.reserve(size0);
|
||||
workload.emplace_back();
|
||||
workload.back().size = size0;
|
||||
workload.back().level = 1;
|
||||
workload.back().rel32 = 0;
|
||||
workload.back().beg = beg;
|
||||
workload.back().end = _end;
|
||||
|
||||
for (std::size_t i = 0; i < workload.size(); i++)
|
||||
{
|
||||
// Get copy of the workload info
|
||||
work w = workload[i];
|
||||
|
||||
// Split range in two parts
|
||||
auto it = w.beg;
|
||||
auto it2 = w.beg;
|
||||
u32 size1 = w.size / 2;
|
||||
u32 size2 = w.size - size1;
|
||||
std::advance(it2, w.size / 2);
|
||||
|
||||
while (true)
|
||||
{
|
||||
it = it2;
|
||||
size1 = w.size - size2;
|
||||
|
||||
if (w.level >= w.beg->first.size())
|
||||
{
|
||||
// Cannot split: smallest function is a prefix of bigger ones (TODO)
|
||||
break;
|
||||
}
|
||||
|
||||
const u32 x1 = w.beg->first.at(w.level);
|
||||
|
||||
if (!x1)
|
||||
{
|
||||
// Cannot split: some functions contain holes at this level
|
||||
w.level++;
|
||||
continue;
|
||||
}
|
||||
|
||||
// Adjust ranges (forward)
|
||||
while (it != w.end && x1 == it->first.at(w.level))
|
||||
{
|
||||
it++;
|
||||
size1++;
|
||||
}
|
||||
|
||||
if (it == w.end)
|
||||
{
|
||||
// Cannot split: words are identical within the range at this level
|
||||
w.level++;
|
||||
}
|
||||
else
|
||||
{
|
||||
size2 = w.size - size1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (w.rel32)
|
||||
{
|
||||
// Patch rel32 linking it to the current location if necessary
|
||||
const s32 r32 = ::narrow<s32>(raw - w.rel32, HERE);
|
||||
std::memcpy(w.rel32 - 4, &r32, 4);
|
||||
}
|
||||
|
||||
if (w.level >= w.beg->first.size())
|
||||
{
|
||||
// If functions cannot be compared, assume smallest function
|
||||
LOG_ERROR(SPU, "Trampoline simplified at 0x%x (level=%u)", func[0], w.level);
|
||||
make_jump(0xe9, w.beg->second); // jmp rel32
|
||||
continue;
|
||||
}
|
||||
|
||||
// Value for comparison
|
||||
const u32 x = it->first.at(w.level);
|
||||
|
||||
// Adjust ranges (backward)
|
||||
while (true)
|
||||
{
|
||||
it--;
|
||||
|
||||
if (it->first.at(w.level) != x)
|
||||
{
|
||||
it++;
|
||||
break;
|
||||
}
|
||||
|
||||
verify(HERE), it != w.beg;
|
||||
size1--;
|
||||
size2++;
|
||||
}
|
||||
|
||||
// Emit 32-bit comparison: cmp [ls+addr], imm32
|
||||
verify("Asm overflow" HERE), raw + 10 <= wxptr + size0 * 20;
|
||||
const u32 cmp_lsa = start + (w.level - 1) * 4;
|
||||
*raw++ = 0x81;
|
||||
#ifdef _WIN32
|
||||
*raw++ = 0xba;
|
||||
#else
|
||||
*raw++ = 0xbe;
|
||||
#endif
|
||||
std::memcpy(raw, &cmp_lsa, 4);
|
||||
std::memcpy(raw + 4, &x, 4);
|
||||
raw += 8;
|
||||
|
||||
// Low subrange target
|
||||
if (size1 == 1)
|
||||
{
|
||||
make_jump(0x82, w.beg->second); // jb rel32
|
||||
}
|
||||
else
|
||||
{
|
||||
make_jump(0x82, raw); // jb rel32 (stub)
|
||||
workload.push_back(w);
|
||||
workload.back().end = it;
|
||||
workload.back().size = size1;
|
||||
workload.back().rel32 = raw;
|
||||
}
|
||||
|
||||
// Second subrange target
|
||||
if (size2 == 1)
|
||||
{
|
||||
make_jump(0xe9, it->second); // jmp rel32
|
||||
}
|
||||
else
|
||||
{
|
||||
it2 = it;
|
||||
|
||||
// Select additional midrange for equality comparison
|
||||
while (it2 != w.end && it2->first.at(w.level) == x)
|
||||
{
|
||||
size2--;
|
||||
it2++;
|
||||
}
|
||||
|
||||
if (it2 != w.end)
|
||||
{
|
||||
// High subrange target
|
||||
if (size2 == 1)
|
||||
{
|
||||
make_jump(0x87, it2->second); // ja rel32
|
||||
}
|
||||
else
|
||||
{
|
||||
make_jump(0x87, raw); // ja rel32 (stub)
|
||||
workload.push_back(w);
|
||||
workload.back().beg = it2;
|
||||
workload.back().size = size2;
|
||||
workload.back().rel32 = raw;
|
||||
}
|
||||
|
||||
const u32 size3 = w.size - size1 - size2;
|
||||
|
||||
if (size3 == 1)
|
||||
{
|
||||
make_jump(0xe9, it->second); // jmp rel32
|
||||
}
|
||||
else
|
||||
{
|
||||
make_jump(0xe9, raw); // jmp rel32 (stub)
|
||||
workload.push_back(w);
|
||||
workload.back().beg = it;
|
||||
workload.back().end = it2;
|
||||
workload.back().size = size3;
|
||||
workload.back().rel32 = raw;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
make_jump(0xe9, raw); // jmp rel32 (stub)
|
||||
workload.push_back(w);
|
||||
workload.back().beg = it;
|
||||
workload.back().size = w.size - size1;
|
||||
workload.back().rel32 = raw;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
m_dispatcher[func[0] / 4] = reinterpret_cast<spu_function_t>(reinterpret_cast<u64>(wxptr));
|
||||
}
|
||||
|
||||
lock.unlock();
|
||||
m_cond.notify_all();
|
||||
}
|
||||
|
||||
spu_recompiler_base::spu_recompiler_base()
|
||||
{
|
||||
}
|
||||
@ -1491,55 +1829,14 @@ void spu_recompiler_base::dump(std::string& out)
|
||||
#include "llvm/Transforms/Scalar.h"
|
||||
#include "llvm/Transforms/IPO.h"
|
||||
#include "llvm/Transforms/Vectorize.h"
|
||||
#include "Utilities/JIT.h"
|
||||
|
||||
class spu_llvm_runtime
|
||||
{
|
||||
shared_mutex m_mutex;
|
||||
|
||||
// All functions
|
||||
std::map<std::vector<u32>, spu_function_t> m_map;
|
||||
|
||||
// All dispatchers
|
||||
std::array<atomic_t<spu_function_t>, 0x10000> m_dispatcher;
|
||||
|
||||
// JIT instance
|
||||
jit_compiler m_jit{{}, jit_compiler::cpu(g_cfg.core.llvm_cpu)};
|
||||
|
||||
// Debug module output location
|
||||
std::string m_cache_path;
|
||||
|
||||
friend class spu_llvm_recompiler;
|
||||
|
||||
public:
|
||||
spu_llvm_runtime()
|
||||
{
|
||||
// Initialize lookup table
|
||||
for (auto& v : m_dispatcher)
|
||||
{
|
||||
v.raw() = &spu_recompiler_base::dispatch;
|
||||
}
|
||||
|
||||
// Initialize "empty" block
|
||||
m_map[std::vector<u32>()] = &spu_recompiler_base::dispatch;
|
||||
|
||||
// Clear LLVM output
|
||||
m_cache_path = Emu.PPUCache();
|
||||
fs::create_dir(m_cache_path + "llvm/");
|
||||
fs::remove_all(m_cache_path + "llvm/", false);
|
||||
|
||||
if (g_cfg.core.spu_debug)
|
||||
{
|
||||
fs::file(m_cache_path + "spu.log", fs::rewrite);
|
||||
}
|
||||
|
||||
LOG_SUCCESS(SPU, "SPU Recompiler Runtime (LLVM) initialized...");
|
||||
}
|
||||
};
|
||||
|
||||
class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator
|
||||
{
|
||||
std::shared_ptr<spu_llvm_runtime> m_spurt;
|
||||
// SPU Runtime Instance
|
||||
std::shared_ptr<spu_runtime> m_spurt;
|
||||
|
||||
// JIT Instance
|
||||
jit_compiler m_jit{{}, jit_compiler::cpu(g_cfg.core.llvm_cpu)};
|
||||
|
||||
// Current function (chunk)
|
||||
llvm::Function* m_function;
|
||||
@ -2239,11 +2536,6 @@ public:
|
||||
: spu_recompiler_base()
|
||||
, cpu_translator(nullptr, false)
|
||||
{
|
||||
if (g_cfg.core.spu_shared_runtime)
|
||||
{
|
||||
// TODO (local context is unsupported)
|
||||
//m_spurt = std::make_shared<spu_llvm_runtime>();
|
||||
}
|
||||
}
|
||||
|
||||
virtual void init() override
|
||||
@ -2252,9 +2544,9 @@ public:
|
||||
if (!m_spurt)
|
||||
{
|
||||
m_cache = fxm::get<spu_cache>();
|
||||
m_spurt = fxm::get_always<spu_llvm_runtime>();
|
||||
m_context = m_spurt->m_jit.get_context();
|
||||
m_use_ssse3 = m_spurt->m_jit.has_ssse3();
|
||||
m_spurt = fxm::get_always<spu_runtime>();
|
||||
m_context = m_jit.get_context();
|
||||
m_use_ssse3 = m_jit.has_ssse3();
|
||||
}
|
||||
}
|
||||
|
||||
@ -2271,18 +2563,22 @@ public:
|
||||
init();
|
||||
|
||||
// Don't lock without shared runtime
|
||||
std::unique_lock lock(m_spurt->m_mutex, std::defer_lock);
|
||||
|
||||
if (g_cfg.core.spu_shared_runtime)
|
||||
{
|
||||
lock.lock();
|
||||
}
|
||||
std::unique_lock lock(m_spurt->m_mutex);
|
||||
|
||||
// Try to find existing function, register new one if necessary
|
||||
const auto fn_info = m_spurt->m_map.emplace(std::move(func_rv), nullptr);
|
||||
|
||||
auto& fn_location = fn_info.first->second;
|
||||
|
||||
if (!fn_location && !fn_info.second)
|
||||
{
|
||||
// Wait if already in progress
|
||||
while (!fn_location)
|
||||
{
|
||||
m_spurt->m_cond.wait(lock);
|
||||
}
|
||||
}
|
||||
|
||||
if (fn_location)
|
||||
{
|
||||
return fn_location;
|
||||
@ -2290,6 +2586,8 @@ public:
|
||||
|
||||
auto& func = fn_info.first->first;
|
||||
|
||||
lock.unlock();
|
||||
|
||||
std::string hash;
|
||||
{
|
||||
sha1_context ctx;
|
||||
@ -2770,179 +3068,6 @@ public:
|
||||
m_scan_queue.clear();
|
||||
m_function_table = nullptr;
|
||||
|
||||
// Generate a dispatcher (übertrampoline)
|
||||
std::vector<u32> addrv{func[0]};
|
||||
const auto beg = m_spurt->m_map.lower_bound(addrv);
|
||||
addrv[0] += 4;
|
||||
const auto _end = m_spurt->m_map.lower_bound(addrv);
|
||||
const u32 size0 = std::distance(beg, _end);
|
||||
|
||||
if (size0 > 1)
|
||||
{
|
||||
const auto trampoline = cast<Function>(module->getOrInsertFunction(fmt::format("spu-0x%05x-trampoline-%03u", func[0], size0), get_type<void>(), get_type<u8*>(), get_type<u8*>()));
|
||||
set_function(trampoline);
|
||||
|
||||
struct work
|
||||
{
|
||||
u32 size;
|
||||
u32 level;
|
||||
BasicBlock* label;
|
||||
std::map<std::vector<u32>, spu_function_t>::iterator beg;
|
||||
std::map<std::vector<u32>, spu_function_t>::iterator end;
|
||||
};
|
||||
|
||||
std::vector<work> workload;
|
||||
workload.reserve(size0);
|
||||
workload.emplace_back();
|
||||
workload.back().size = size0;
|
||||
workload.back().level = 1;
|
||||
workload.back().beg = beg;
|
||||
workload.back().end = _end;
|
||||
workload.back().label = m_ir->GetInsertBlock();
|
||||
|
||||
for (std::size_t i = 0; i < workload.size(); i++)
|
||||
{
|
||||
// Get copy of the workload info
|
||||
work w = workload[i];
|
||||
|
||||
// Switch targets
|
||||
std::vector<std::pair<u32, llvm::BasicBlock*>> targets;
|
||||
|
||||
llvm::BasicBlock* def{};
|
||||
|
||||
bool unsorted = false;
|
||||
|
||||
while (w.level < w.beg->first.size())
|
||||
{
|
||||
const u32 x1 = w.beg->first.at(w.level);
|
||||
|
||||
if (x1 == 0)
|
||||
{
|
||||
// Cannot split: some functions contain holes at this level
|
||||
auto it = w.end;
|
||||
it--;
|
||||
|
||||
if (it->first.at(w.level) != 0)
|
||||
{
|
||||
unsorted = true;
|
||||
}
|
||||
|
||||
w.level++;
|
||||
continue;
|
||||
}
|
||||
|
||||
auto it = w.beg;
|
||||
auto it2 = it;
|
||||
u32 x = x1;
|
||||
bool split = false;
|
||||
|
||||
while (it2 != w.end)
|
||||
{
|
||||
it2++;
|
||||
|
||||
const u32 x2 = it2 != w.end ? it2->first.at(w.level) : x1;
|
||||
|
||||
if (x2 != x)
|
||||
{
|
||||
const u32 dist = std::distance(it, it2);
|
||||
|
||||
const auto b = llvm::BasicBlock::Create(m_context, "", m_function);
|
||||
|
||||
if (dist == 1 && x != 0)
|
||||
{
|
||||
m_ir->SetInsertPoint(b);
|
||||
|
||||
if (const u64 fval = reinterpret_cast<u64>(it->second))
|
||||
{
|
||||
const auto ptr = m_ir->CreateIntToPtr(m_ir->getInt64(fval), main_func->getType());
|
||||
m_ir->CreateCall(ptr, {m_thread, m_lsptr})->setTailCall();
|
||||
}
|
||||
else
|
||||
{
|
||||
verify(HERE, &it->second == &fn_location);
|
||||
m_ir->CreateCall(main_func, {m_thread, m_lsptr})->setTailCall();
|
||||
}
|
||||
|
||||
m_ir->CreateRetVoid();
|
||||
}
|
||||
else
|
||||
{
|
||||
workload.emplace_back(w);
|
||||
workload.back().beg = it;
|
||||
workload.back().end = it2;
|
||||
workload.back().label = b;
|
||||
workload.back().size = dist;
|
||||
}
|
||||
|
||||
if (x == 0)
|
||||
{
|
||||
def = b;
|
||||
}
|
||||
else
|
||||
{
|
||||
targets.emplace_back(std::make_pair(x, b));
|
||||
}
|
||||
|
||||
x = x2;
|
||||
it = it2;
|
||||
split = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (!split)
|
||||
{
|
||||
// Cannot split: words are identical within the range at this level
|
||||
w.level++;
|
||||
}
|
||||
else
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!def && targets.empty())
|
||||
{
|
||||
LOG_ERROR(SPU, "Trampoline simplified at 0x%x (level=%u)", func[0], w.level);
|
||||
m_ir->SetInsertPoint(w.label);
|
||||
|
||||
if (const u64 fval = reinterpret_cast<u64>(w.beg->second))
|
||||
{
|
||||
const auto ptr = m_ir->CreateIntToPtr(m_ir->getInt64(fval), main_func->getType());
|
||||
m_ir->CreateCall(ptr, {m_thread, m_lsptr})->setTailCall();
|
||||
}
|
||||
else
|
||||
{
|
||||
verify(HERE, &w.beg->second == &fn_location);
|
||||
m_ir->CreateCall(main_func, {m_thread, m_lsptr})->setTailCall();
|
||||
}
|
||||
|
||||
m_ir->CreateRetVoid();
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!def)
|
||||
{
|
||||
def = llvm::BasicBlock::Create(m_context, "", m_function);
|
||||
|
||||
m_ir->SetInsertPoint(def);
|
||||
tail(&spu_recompiler_base::dispatch, m_thread, m_ir->getInt32(0), m_ir->getInt32(0));
|
||||
}
|
||||
|
||||
m_ir->SetInsertPoint(w.label);
|
||||
const auto add = m_ir->CreateGEP(m_lsptr, m_ir->getInt64(start + w.level * 4 - 4));
|
||||
const auto ptr = m_ir->CreateBitCast(add, get_type<u32*>());
|
||||
const auto val = m_ir->CreateLoad(ptr);
|
||||
const auto sw = m_ir->CreateSwitch(val, def, ::size32(targets));
|
||||
|
||||
for (auto& pair : targets)
|
||||
{
|
||||
sw->addCase(m_ir->getInt32(pair.first), pair.second);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
spu_function_t fn{}, tr{};
|
||||
|
||||
std::string log;
|
||||
|
||||
raw_string_ostream out(log);
|
||||
@ -2970,32 +3095,19 @@ public:
|
||||
if (g_cfg.core.spu_debug)
|
||||
{
|
||||
// Testing only
|
||||
m_spurt->m_jit.add(std::move(module), m_spurt->m_cache_path + "llvm/");
|
||||
m_jit.add(std::move(module), m_spurt->m_cache_path + "llvm/");
|
||||
}
|
||||
else
|
||||
{
|
||||
m_spurt->m_jit.add(std::move(module));
|
||||
m_jit.add(std::move(module));
|
||||
}
|
||||
|
||||
m_spurt->m_jit.fin();
|
||||
fn = reinterpret_cast<spu_function_t>(m_spurt->m_jit.get_engine().getPointerToFunction(main_func));
|
||||
tr = fn;
|
||||
|
||||
if (size0 > 1)
|
||||
{
|
||||
tr = reinterpret_cast<spu_function_t>(m_spurt->m_jit.get_engine().getPointerToFunction(m_function));
|
||||
}
|
||||
m_jit.fin();
|
||||
|
||||
// Register function pointer
|
||||
fn_location = fn;
|
||||
const spu_function_t fn = reinterpret_cast<spu_function_t>(m_jit.get_engine().getPointerToFunction(main_func));
|
||||
|
||||
// Trampoline
|
||||
m_spurt->m_dispatcher[func[0] / 4] = tr;
|
||||
|
||||
LOG_NOTICE(SPU, "[0x%x] Compiled: %p", func[0], fn);
|
||||
|
||||
if (tr != fn)
|
||||
LOG_NOTICE(SPU, "[0x%x] T: %p", func[0], tr);
|
||||
m_spurt->add(*fn_info.first, fn);
|
||||
|
||||
if (g_cfg.core.spu_debug)
|
||||
{
|
||||
|
@ -1,6 +1,9 @@
|
||||
#pragma once
|
||||
|
||||
#include "Utilities/File.h"
|
||||
#include "Utilities/mutex.h"
|
||||
#include "Utilities/cond.h"
|
||||
#include "Utilities/JIT.h"
|
||||
#include "SPUThread.h"
|
||||
#include <vector>
|
||||
#include <bitset>
|
||||
@ -30,6 +33,40 @@ public:
|
||||
static void initialize();
|
||||
};
|
||||
|
||||
// Helper class: runtime state used by the SPU recompilers through their
// m_spurt pointer. Holds the map of known functions, the dispatcher table,
// the debug output path and the mutex/condition variable pair used around them.
|
||||
class spu_runtime
|
||||
{
|
||||
public:
|
||||
// NOTE(review): presumably protects m_map; the recompiler locks before
// touching the map and unlocks before compiling -- confirm against callers
shared_mutex m_mutex;
|
||||
|
||||
// Waited on by the recompiler (m_cond.wait(lock)) while holding a lock
cond_variable m_cond;
|
||||
|
||||
// All functions: the key is a vector of u32 words whose element 0 is used as
// the function start address; the value is the compiled entry point
// (a null value is treated by the recompiler as "not compiled yet")
|
||||
std::map<std::vector<u32>, spu_function_t> m_map;
|
||||
|
||||
// All dispatchers: 0x10000 atomic entries, indexed by (start address / 4)
|
||||
std::array<atomic_t<spu_function_t>, 0x10000> m_dispatcher;
|
||||
|
||||
// Debug module output location (the LLVM recompiler appends "llvm/" to it
// when spu_debug is enabled)
|
||||
std::string m_cache_path;
|
||||
|
||||
private:
|
||||
// Temporarily: asmjit runtime collection
|
||||
std::deque<std::unique_ptr<asmjit::JitRuntime>> m_asmjit_rts;
|
||||
|
||||
// Trampoline to spu_recompiler_base::dispatch
|
||||
spu_function_t tr_dispatch = nullptr;
|
||||
|
||||
public:
|
||||
spu_runtime();
|
||||
|
||||
// Get new ASMJIT runtime
|
||||
asmjit::JitRuntime* get_asmjit_rt();
|
||||
|
||||
// Add compiled function and generate trampoline if necessary.
// where: the map entry (key/value pair) the function belongs to;
// compiled: pointer to the freshly compiled function
|
||||
void add(std::pair<const std::vector<u32>, spu_function_t>& where, spu_function_t compiled);
|
||||
};
|
||||
|
||||
// SPU Recompiler instance base class
|
||||
class spu_recompiler_base
|
||||
{
|
||||
|
@ -367,7 +367,6 @@ struct cfg_root : cfg::node
|
||||
cfg::_int<0, 6> preferred_spu_threads{this, "Preferred SPU Threads", 0}; //Number of hardware threads dedicated to heavy simultaneous spu tasks
|
||||
cfg::_int<0, 16> spu_delay_penalty{this, "SPU delay penalty", 3}; //Number of milliseconds to block a thread if a virtual 'core' isn't free
|
||||
cfg::_bool spu_loop_detection{this, "SPU loop detection", true}; //Try to detect wait loops and trigger thread yield
|
||||
cfg::_bool spu_shared_runtime{this, "SPU Shared Runtime", true}; // Share compiled SPU functions between all threads
|
||||
cfg::_enum<spu_block_size_type> spu_block_size{this, "SPU Block Size", spu_block_size_type::safe};
|
||||
cfg::_bool spu_accurate_getllar{this, "Accurate GETLLAR", false};
|
||||
cfg::_bool spu_accurate_putlluc{this, "Accurate PUTLLUC", false};
|
||||
|
Loading…
x
Reference in New Issue
Block a user