Mirror of https://github.com/RPCS3/rpcs3.git
SPU: multithread compilation
Allow parallel compilation of SPU code, both at startup and at runtime.
Remove the 'SPU Shared Runtime' option (it became obsolete).
Refactor the spu_runtime class (now common to ASMJIT and LLVM).
Implement SPU ubertrampoline generation in raw assembly (LLVM).
Minor improvement of balanced_wait_until<> and balanced_awaken<>.
Make the JIT MemoryManager2 shared (global).
Fix a wrong assertion in cond_variable.
Parent: 8d5d44141e
Commit: 4f152ad126
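The first change listed above ("Allow parallel compilation of SPU code") is realized at startup by handing the cached function list to a pool of worker threads that claim work through a shared atomic index (see the spu_cache::initialize() hunks in the diff below). The stand-alone sketch that follows illustrates only that claiming pattern; the plain std::thread workers and the compile_one() placeholder are illustrative assumptions, not the emulator's actual named_thread and recompiler types.

#include <algorithm>
#include <atomic>
#include <cstddef>
#include <cstdio>
#include <thread>
#include <vector>

// Hypothetical stand-in for compiling one cached SPU function.
static void compile_one(const std::vector<unsigned>& func)
{
    std::printf("compiled function starting at 0x%x\n", func.empty() ? 0u : func[0]);
}

int main()
{
    // Pretend this list came from the on-disk SPU cache.
    std::vector<std::vector<unsigned>> func_list(100, std::vector<unsigned>{0x1234});

    // Shared cursor: each worker claims the next un-compiled function index.
    std::atomic<std::size_t> fnext{0};

    const unsigned thread_count = std::max(1u, std::thread::hardware_concurrency());
    std::vector<std::thread> workers;

    for (unsigned i = 0; i < thread_count; i++)
    {
        workers.emplace_back([&]
        {
            // fetch-and-increment hands out each index exactly once
            for (std::size_t idx = fnext++; idx < func_list.size(); idx = fnext++)
            {
                compile_one(func_list[idx]);
            }
        });
    }

    for (auto& t : workers)
    {
        t.join();
    }
}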
@@ -95,6 +95,12 @@ static void* const s_memory = []() -> void*
 return utils::memory_reserve(s_memory_size);
 }();
 
+// Reserve 2G of memory, should replace previous area for ASLR compatibility
+static void* const s_memory2 = utils::memory_reserve(0x80000000);
+
+static u64 s_code_pos = 0;
+static u64 s_data_pos = 0;
+
 static void* s_next = s_memory;
 
 #ifdef _WIN32
@@ -129,6 +135,11 @@ extern void jit_finalize()
 utils::memory_decommit(s_memory, s_memory_size);
 
 s_next = s_memory;
+
+utils::memory_decommit(s_memory2, 0x80000000);
+
+s_code_pos = 0;
+s_data_pos = 0;
 }
 
 // Helper class
@@ -311,24 +322,25 @@ struct MemoryManager : llvm::RTDyldMemoryManager
 // Simple memory manager
 struct MemoryManager2 : llvm::RTDyldMemoryManager
 {
-// Reserve 2 GiB
-void* const m_memory = utils::memory_reserve(0x80000000);
+// Patchwork again...
+void* const m_memory = s_memory2;
 
 u8* const m_code = static_cast<u8*>(m_memory) + 0x00000000;
 u8* const m_data = static_cast<u8*>(m_memory) + 0x40000000;
 
-u64 m_code_pos = 0;
-u64 m_data_pos = 0;
+u64& m_code_pos = s_code_pos;
+u64& m_data_pos = s_data_pos;
 
 MemoryManager2() = default;
 
 ~MemoryManager2() override
 {
-utils::memory_release(m_memory, 0x80000000);
 }
 
 u8* allocateCodeSection(std::uintptr_t size, uint align, uint sec_id, llvm::StringRef sec_name) override
 {
+std::lock_guard lock(s_mutex);
+
 // Simple allocation
 const u64 old = m_code_pos;
 const u64 pos = ::align(m_code_pos, align);
@@ -349,12 +361,20 @@ struct MemoryManager2 : llvm::RTDyldMemoryManager
 utils::memory_commit(m_code + olda, newa - olda, utils::protection::wx);
 }
 
+if (!sec_id && sec_name.empty())
+{
+// Special case: don't log
+return m_code + pos;
+}
+
 LOG_NOTICE(GENERAL, "LLVM: Code section %u '%s' allocated -> %p (size=0x%x, align=0x%x)", sec_id, sec_name.data(), m_code + pos, size, align);
 return m_code + pos;
 }
 
 u8* allocateDataSection(std::uintptr_t size, uint align, uint sec_id, llvm::StringRef sec_name, bool is_ro) override
 {
+std::lock_guard lock(s_mutex);
+
 // Simple allocation
 const u64 old = m_data_pos;
 const u64 pos = ::align(m_data_pos, align);
@@ -642,33 +662,12 @@ u64 jit_compiler::get(const std::string& name)
 return m_engine->getGlobalValueAddress(name);
 }
 
-std::unordered_map<std::string, u64> jit_compiler::add(std::unordered_map<std::string, std::string> data)
+u8* jit_compiler::alloc(u32 size)
 {
-// Lock memory manager
-std::lock_guard lock(s_mutex);
+// Dummy memory manager object
+MemoryManager2 mm;
 
-std::unordered_map<std::string, u64> result;
-
-std::size_t size = 0;
-
-for (auto&& pair : data)
-{
-size += ::align(pair.second.size(), 16);
-}
-
-utils::memory_commit(s_next, size, utils::protection::wx);
-std::memset(s_next, 0xc3, ::align(size, 4096));
-
-for (auto&& pair : data)
-{
-std::memcpy(s_next, pair.second.data(), pair.second.size());
-result.emplace(pair.first, (u64)s_next);
-s_next = (void*)::align((u64)s_next + pair.second.size(), 16);
-}
-
-s_next = (void*)::align((u64)s_next, 4096);
-
-return result;
+return mm.allocateCodeSection(size, 16, 0, {});
 }
 
 #endif
@@ -61,6 +61,7 @@ FT build_function_asm(F&& builder)
 
 #include <memory>
 #include <string>
+#include <string_view>
 #include <unordered_map>
 
 #include "types.h"
@@ -129,8 +130,8 @@ public:
 // Get compiled function address
 u64 get(const std::string& name);
 
-// Add functions directly to the memory manager (name -> code)
-static std::unordered_map<std::string, u64> add(std::unordered_map<std::string, std::string>);
+// Allocate writable executable memory (alignment is assumed 16)
+static u8* alloc(u32 size);
 
 // Get CPU info
 static std::string cpu(const std::string& _cpu);
@@ -10,7 +10,7 @@
 
 bool cond_variable::imp_wait(u32 _old, u64 _timeout) noexcept
 {
-verify("cond_variable overflow" HERE), (_old & 0xffff) == 0; // Very unlikely: it requires 65535 distinct threads to wait simultaneously
+verify("cond_variable overflow" HERE), (_old & 0xffff) != 0xffff; // Very unlikely: it requires 65535 distinct threads to wait simultaneously
 
 return balanced_wait_until(m_value, _timeout, [&](u32& value, auto... ret) -> int
 {
@@ -42,7 +42,8 @@ bool cond_variable::imp_wait(u32 _old, u64 _timeout) noexcept
 
 void cond_variable::imp_wake(u32 _count) noexcept
 {
-balanced_awaken(m_value, m_value.atomic_op([&](u32& value) -> u32
+// TODO (notify_one)
+balanced_awaken<true>(m_value, m_value.atomic_op([&](u32& value) -> u32
 {
 // Subtract already signaled number from total amount of waiters
 const u32 can_sig = (value & 0xffff) - (value >> 16);
@@ -266,7 +267,7 @@ void cond_x16::imp_notify() noexcept
 return;
 }
 
-balanced_awaken(m_cvx16, utils::popcnt16(wait_mask));
+balanced_awaken<true>(m_cvx16, utils::popcnt16(wait_mask));
 }
 
 bool lf_queue_base::wait(u64 _timeout)
@@ -186,7 +186,7 @@ bool balanced_wait_until(atomic_t<T>& var, u64 usec_timeout, Pred&& pred)
 {
 if (OptWaitOnAddress(&var, &value, sizeof(T), is_inf ? INFINITE : usec_timeout / 1000))
 {
-if (!test_pred(value) && !test_pred(value, nullptr))
+if (!test_pred(value, nullptr))
 {
 return false;
 }
@@ -220,7 +220,7 @@ bool balanced_wait_until(atomic_t<T>& var, u64 usec_timeout, Pred&& pred)
 return true;
 }
 
-if (!test_pred(value) && !test_pred(value, nullptr))
+if (!test_pred(value, nullptr))
 {
 // Stolen notification: restore balance
 NtReleaseKeyedEvent(nullptr, &var, false, nullptr);
@@ -237,7 +237,7 @@ bool balanced_wait_until(atomic_t<T>& var, u64 usec_timeout, Pred&& pred)
 {
 if (futex(&var, FUTEX_WAIT_PRIVATE, static_cast<u32>(value), is_inf ? nullptr : &timeout) == 0)
 {
-if (!test_pred(value) && !test_pred(value, nullptr))
+if (!test_pred(value, nullptr))
 {
 return false;
 }
@@ -257,7 +257,7 @@ bool balanced_wait_until(atomic_t<T>& var, u64 usec_timeout, Pred&& pred)
 #endif
 }
 
-template <typename T>
+template <bool All = false, typename T>
 void balanced_awaken(atomic_t<T>& var, u32 weight)
 {
 static_assert(sizeof(T) == 4 || sizeof(T) == 8);
@@ -265,11 +265,13 @@ void balanced_awaken(atomic_t<T>& var, u32 weight)
 #ifdef _WIN32
 if (OptWaitOnAddress)
 {
-if (weight > 1)
+if (All || weight > 3)
 {
 OptWakeByAddressAll(&var);
+return;
 }
-else if (weight == 1)
+
+for (u32 i = 0; i < weight; i++)
 {
 OptWakeByAddressSingle(&var);
 }
@@ -282,9 +284,9 @@ void balanced_awaken(atomic_t<T>& var, u32 weight)
 NtReleaseKeyedEvent(nullptr, &var, false, nullptr);
 }
 #else
-if (weight)
+if (All || weight)
 {
-futex(&var, FUTEX_WAKE_PRIVATE, std::min<u32>(INT_MAX, weight));
+futex(&var, FUTEX_WAKE_PRIVATE, All ? INT_MAX : std::min<u32>(INT_MAX, weight));
 }
 
 return;
@@ -32,33 +32,8 @@ std::unique_ptr<spu_recompiler_base> spu_recompiler_base::make_asmjit_recompiler
 return std::make_unique<spu_recompiler>();
 }
 
-spu_runtime::spu_runtime()
-{
-m_cache_path = fxm::check_unlocked<ppu_module>()->cache;
-
-if (g_cfg.core.spu_debug)
-{
-fs::file(m_cache_path + "spu.log", fs::rewrite);
-}
-
-LOG_SUCCESS(SPU, "SPU Recompiler Runtime (ASMJIT) initialized...");
-
-// Initialize lookup table
-for (auto& v : m_dispatcher)
-{
-v.raw() = &spu_recompiler_base::dispatch;
-}
-
-// Initialize "empty" block
-m_map[std::vector<u32>()] = &spu_recompiler_base::dispatch;
-}
-
 spu_recompiler::spu_recompiler()
 {
-if (!g_cfg.core.spu_shared_runtime)
-{
-m_spurt = std::make_shared<spu_runtime>();
-}
 }
 
 void spu_recompiler::init()
@@ -68,6 +43,7 @@ void spu_recompiler::init()
 {
 m_cache = fxm::get<spu_cache>();
 m_spurt = fxm::get_always<spu_runtime>();
+m_asmrt = m_spurt->get_asmjit_rt();
 }
 }
 
@@ -83,19 +59,22 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& func_rv)
 {
 init();
 
-// Don't lock without shared runtime
-std::unique_lock lock(m_spurt->m_mutex, std::defer_lock);
-
-if (g_cfg.core.spu_shared_runtime)
-{
-lock.lock();
-}
+std::unique_lock lock(m_spurt->m_mutex);
 
 // Try to find existing function, register new one if necessary
 const auto fn_info = m_spurt->m_map.emplace(std::move(func_rv), nullptr);
 
 auto& fn_location = fn_info.first->second;
 
+if (!fn_location && !fn_info.second)
+{
+// Wait if already in progress
+while (!fn_location)
+{
+m_spurt->m_cond.wait(lock);
+}
+}
+
 if (fn_location)
 {
 return fn_location;
@@ -103,6 +82,8 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& func_rv)
 
 auto& func = fn_info.first->first;
 
+lock.unlock();
+
 using namespace asmjit;
 
 SPUDisAsm dis_asm(CPUDisAsm_InterpreterMode);
@@ -124,7 +105,7 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& func_rv)
 }
 
 CodeHolder code;
-code.init(m_spurt->m_jitrt.getCodeInfo());
+code.init(m_asmrt->getCodeInfo());
 code._globalHints = asmjit::CodeEmitter::kHintOptimizedAlign;
 
 X86Assembler compiler(&code);
@@ -861,14 +842,11 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& func_rv)
 // Compile and get function address
 spu_function_t fn;
 
-if (m_spurt->m_jitrt.add(&fn, &code))
+if (m_asmrt->add(&fn, &code))
 {
 LOG_FATAL(SPU, "Failed to build a function");
 }
 
-// Register function
-fn_location = fn;
-
 if (g_cfg.core.spu_debug)
 {
 // Add ASMJIT logs
@@ -885,6 +863,11 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& func_rv)
 m_cache->add(func);
 }
 
+lock.lock();
+
+// Register function (possibly temporarily)
+fn_location = fn;
+
 // Generate a dispatcher (übertrampoline)
 std::vector<u32> addrv{func[0]};
 const auto beg = m_spurt->m_map.lower_bound(addrv);
@@ -899,19 +882,11 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& func_rv)
 else
 {
 CodeHolder code;
-code.init(m_spurt->m_jitrt.getCodeInfo());
+code.init(m_asmrt->getCodeInfo());
 
 X86Assembler compiler(&code);
 this->c = &compiler;
 
-if (g_cfg.core.spu_debug)
-{
-// Set logger
-code.setLogger(&logger);
-}
-
-compiler.comment("\n\nTrampoline:\n\n");
-
 struct work
 {
 u32 size;
@@ -1110,7 +1085,7 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& func_rv)
 
 spu_function_t tr;
 
-if (m_spurt->m_jitrt.add(&tr, &code))
+if (m_asmrt->add(&tr, &code))
 {
 LOG_FATAL(SPU, "Failed to build a trampoline");
 }
@@ -1118,6 +1093,9 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& func_rv)
 m_spurt->m_dispatcher[func[0] / 4] = tr;
 }
 
+lock.unlock();
+m_spurt->m_cond.notify_all();
+
 return fn;
 }
 
@@ -1,33 +1,10 @@
 #pragma once
 
 #include "Utilities/JIT.h"
-#include "Utilities/mutex.h"
 #include "SPURecompiler.h"
 
 #include <functional>
 
-// SPU ASMJIT Runtime object (global)
-class spu_runtime
-{
-shared_mutex m_mutex;
-
-asmjit::JitRuntime m_jitrt;
-
-// All functions
-std::map<std::vector<u32>, spu_function_t> m_map;
-
-// All dispatchers
-std::array<atomic_t<spu_function_t>, 0x10000> m_dispatcher;
-
-// Debug module output location
-std::string m_cache_path;
-
-friend class spu_recompiler;
-
-public:
-spu_runtime();
-};
-
 // SPU ASMJIT Recompiler
 class spu_recompiler : public spu_recompiler_base
 {
@@ -43,6 +20,9 @@ public:
 virtual spu_function_t compile(std::vector<u32>&&) override;
 
 private:
+// ASMJIT runtime
+asmjit::JitRuntime* m_asmrt;
+
 // emitter:
 asmjit::X86Assembler* c;
 
@@ -24,7 +24,7 @@ const spu_decoder<spu_iname> s_spu_iname;
 extern u64 get_timebased_time();
 
 spu_cache::spu_cache(const std::string& loc)
-: m_file(loc, fs::read + fs::write + fs::create)
+: m_file(loc, fs::read + fs::write + fs::create + fs::append)
 {
 }
 
@@ -76,18 +76,22 @@ void spu_cache::add(const std::vector<u32>& func)
 return;
 }
 
-be_t<u32> size = ::size32(func) - 1;
-be_t<u32> addr = func[0];
-m_file.write(size);
-m_file.write(addr);
-m_file.write(func.data() + 1, func.size() * 4 - 4);
+// Allocate buffer
+const auto buf = std::make_unique<be_t<u32>[]>(func.size() + 1);
+
+buf[0] = ::size32(func) - 1;
+buf[1] = func[0];
+std::memcpy(buf.get() + 2, func.data() + 1, func.size() * 4 - 4);
+
+// Append data
+m_file.write(buf.get(), func.size() * 4 + 4);
 }
 
 void spu_cache::initialize()
 {
 const std::string ppu_cache = Emu.PPUCache();
 
-if (ppu_cache.empty() || !g_cfg.core.spu_shared_runtime)
+if (ppu_cache.empty())
 {
 return;
 }
@@ -105,30 +109,34 @@ void spu_cache::initialize()
 
 // Read cache
 auto func_list = cache->get();
+atomic_t<std::size_t> fnext{};
 
-// Recompiler instance for cache initialization
-std::unique_ptr<spu_recompiler_base> compiler;
+// Initialize compiler instances for parallel compilation
+u32 max_threads = static_cast<u32>(g_cfg.core.llvm_threads);
+u32 thread_count = max_threads > 0 ? std::min(max_threads, std::thread::hardware_concurrency()) : std::thread::hardware_concurrency();
+std::vector<std::unique_ptr<spu_recompiler_base>> compilers{thread_count};
 
-if (g_cfg.core.spu_decoder == spu_decoder_type::asmjit)
+for (auto& compiler : compilers)
 {
-compiler = spu_recompiler_base::make_asmjit_recompiler();
-}
+if (g_cfg.core.spu_decoder == spu_decoder_type::asmjit)
+{
+compiler = spu_recompiler_base::make_asmjit_recompiler();
+}
+else if (g_cfg.core.spu_decoder == spu_decoder_type::llvm)
+{
+compiler = spu_recompiler_base::make_llvm_recompiler();
+}
+else
+{
+compilers.clear();
+break;
+}
 
-if (g_cfg.core.spu_decoder == spu_decoder_type::llvm)
-{
-compiler = spu_recompiler_base::make_llvm_recompiler();
-}
-
-if (compiler)
-{
 compiler->init();
 }
 
-if (compiler && !func_list.empty())
+if (compilers.size() && !func_list.empty())
 {
-// Fake LS
-std::vector<be_t<u32>> ls(0x10000);
-
 // Initialize progress dialog (wait for previous progress done)
 while (g_progr_ptotal)
 {
@@ -137,10 +145,20 @@ void spu_cache::initialize()
 
 g_progr = "Building SPU cache...";
 g_progr_ptotal += func_list.size();
+}
+
+std::deque<named_thread<std::function<void()>>> thread_queue;
+
+for (std::size_t i = 0; i < compilers.size(); i++) thread_queue.emplace_back("Worker " + std::to_string(i), [&, compiler = compilers[i].get()]()
+{
+// Fake LS
+std::vector<be_t<u32>> ls(0x10000);
 
 // Build functions
-for (auto&& func : func_list)
+for (std::size_t func_i = fnext++; func_i < func_list.size(); func_i = fnext++)
 {
+std::vector<u32>& func = func_list[func_i];
+
 if (Emu.IsStopped())
 {
 g_progr_pdone++;
@@ -185,13 +203,22 @@ void spu_cache::initialize()
 
 g_progr_pdone++;
 }
+});
 
-if (Emu.IsStopped())
-{
-LOG_ERROR(SPU, "SPU Runtime: Cache building aborted.");
-return;
+// Join all threads
+while (!thread_queue.empty())
+{
+thread_queue.pop_front();
 }
 
+if (Emu.IsStopped())
+{
+LOG_ERROR(SPU, "SPU Runtime: Cache building aborted.");
+return;
+}
+
+if (compilers.size() && !func_list.empty())
+{
 LOG_SUCCESS(SPU, "SPU Runtime: Built %u functions.", func_list.size());
 }
 
@@ -202,6 +229,317 @@ void spu_cache::initialize()
 });
 }
 
+spu_runtime::spu_runtime()
+{
+// Initialize lookup table
+for (auto& v : m_dispatcher)
+{
+v.raw() = &spu_recompiler_base::dispatch;
+}
+
+// Initialize "empty" block
+m_map[std::vector<u32>()] = &spu_recompiler_base::dispatch;
+
+// Clear LLVM output
+m_cache_path = Emu.PPUCache();
+fs::create_dir(m_cache_path + "llvm/");
+fs::remove_all(m_cache_path + "llvm/", false);
+
+if (g_cfg.core.spu_debug)
+{
+fs::file(m_cache_path + "spu.log", fs::rewrite);
+}
+
+LOG_SUCCESS(SPU, "SPU Recompiler Runtime initialized...");
+}
+
+asmjit::JitRuntime* spu_runtime::get_asmjit_rt()
+{
+std::lock_guard lock(m_mutex);
+
+m_asmjit_rts.emplace_back(std::make_unique<asmjit::JitRuntime>());
+
+return m_asmjit_rts.back().get();
+}
+
+void spu_runtime::add(std::pair<const std::vector<u32>, spu_function_t>& where, spu_function_t compiled)
+{
+std::unique_lock lock(m_mutex);
+
+// Function info
+const std::vector<u32>& func = where.first;
+
+//
+const u32 start = func[0] * (g_cfg.core.spu_block_size != spu_block_size_type::giga);
+
+// Set pointer to the compiled function
+where.second = compiled;
+
+// Generate a dispatcher (übertrampoline)
+std::vector<u32> addrv{func[0]};
+const auto beg = m_map.lower_bound(addrv);
+addrv[0] += 4;
+const auto _end = m_map.lower_bound(addrv);
+const u32 size0 = std::distance(beg, _end);
+
+if (size0 == 1)
+{
+m_dispatcher[func[0] / 4] = compiled;
+}
+else
+{
+// Allocate some writable executable memory
+#ifdef LLVM_AVAILABLE
+const auto wxptr = jit_compiler::alloc(size0 * 20);
+#else
+u8* const wxptr = new u8[size0 * 20]; // dummy
+#endif
+
+// Raw assembly pointer
+u8* raw = wxptr;
+
+struct work
+{
+u32 size;
+u32 level;
+u8* rel32;
+std::map<std::vector<u32>, spu_function_t>::iterator beg;
+std::map<std::vector<u32>, spu_function_t>::iterator end;
+};
+
+// Write jump instruction with rel32 immediate
+auto make_jump = [&](u8 op, auto target)
+{
+verify("Asm overflow" HERE), raw + 6 <= wxptr + size0 * 20;
+
+if (!target && !tr_dispatch)
+{
+// Generate a special trampoline with pause instruction
+#ifdef LLVM_AVAILABLE
+const auto trptr = jit_compiler::alloc(16);
+#else
+u8* const trptr = new u8[16]; // dummy
+#endif
+trptr[0] = 0xf3; // pause
+trptr[1] = 0x90;
+trptr[2] = 0xff; // jmp [rip]
+trptr[3] = 0x25;
+std::memset(trptr + 4, 0, 4);
+const u64 target = reinterpret_cast<u64>(&spu_recompiler_base::dispatch);
+std::memcpy(trptr + 8, &target, 8);
+tr_dispatch = reinterpret_cast<spu_function_t>(trptr);
+}
+
+// Fallback to dispatch if no target
+const u64 taddr = target ? reinterpret_cast<u64>(target) : reinterpret_cast<u64>(tr_dispatch);
+
+// Compute the distance
+const s64 rel = taddr - reinterpret_cast<u64>(raw) - (op != 0xe9 ? 6 : 5);
+
+verify(HERE), rel >= INT32_MIN, rel <= INT32_MAX;
+
+if (op != 0xe9)
+{
+// First jcc byte
+*raw++ = 0x0f;
+verify(HERE), (op >> 4) == 0x8;
+}
+
+*raw++ = op;
+
+const s32 r32 = static_cast<s32>(rel);
+
+std::memcpy(raw, &r32, 4);
+raw += 4;
+};
+
+std::vector<work> workload;
+workload.reserve(size0);
+workload.emplace_back();
+workload.back().size = size0;
+workload.back().level = 1;
+workload.back().rel32 = 0;
+workload.back().beg = beg;
+workload.back().end = _end;
+
+for (std::size_t i = 0; i < workload.size(); i++)
+{
+// Get copy of the workload info
+work w = workload[i];
+
+// Split range in two parts
+auto it = w.beg;
+auto it2 = w.beg;
+u32 size1 = w.size / 2;
+u32 size2 = w.size - size1;
+std::advance(it2, w.size / 2);
+
+while (true)
+{
+it = it2;
+size1 = w.size - size2;
+
+if (w.level >= w.beg->first.size())
+{
+// Cannot split: smallest function is a prefix of bigger ones (TODO)
+break;
+}
+
+const u32 x1 = w.beg->first.at(w.level);
+
+if (!x1)
+{
+// Cannot split: some functions contain holes at this level
+w.level++;
+continue;
+}
+
+// Adjust ranges (forward)
+while (it != w.end && x1 == it->first.at(w.level))
+{
+it++;
+size1++;
+}
+
+if (it == w.end)
+{
+// Cannot split: words are identical within the range at this level
+w.level++;
+}
+else
+{
+size2 = w.size - size1;
+break;
+}
+}
+
+if (w.rel32)
+{
+// Patch rel32 linking it to the current location if necessary
+const s32 r32 = ::narrow<s32>(raw - w.rel32, HERE);
+std::memcpy(w.rel32 - 4, &r32, 4);
+}
+
+if (w.level >= w.beg->first.size())
+{
+// If functions cannot be compared, assume smallest function
+LOG_ERROR(SPU, "Trampoline simplified at 0x%x (level=%u)", func[0], w.level);
+make_jump(0xe9, w.beg->second); // jmp rel32
+continue;
+}
+
+// Value for comparison
+const u32 x = it->first.at(w.level);
+
+// Adjust ranges (backward)
+while (true)
+{
+it--;
+
+if (it->first.at(w.level) != x)
+{
+it++;
+break;
+}
+
+verify(HERE), it != w.beg;
+size1--;
+size2++;
+}
+
+// Emit 32-bit comparison: cmp [ls+addr], imm32
+verify("Asm overflow" HERE), raw + 10 <= wxptr + size0 * 20;
+const u32 cmp_lsa = start + (w.level - 1) * 4;
+*raw++ = 0x81;
+#ifdef _WIN32
+*raw++ = 0xba;
+#else
+*raw++ = 0xbe;
+#endif
+std::memcpy(raw, &cmp_lsa, 4);
+std::memcpy(raw + 4, &x, 4);
+raw += 8;
+
+// Low subrange target
+if (size1 == 1)
+{
+make_jump(0x82, w.beg->second); // jb rel32
+}
+else
+{
+make_jump(0x82, raw); // jb rel32 (stub)
+workload.push_back(w);
+workload.back().end = it;
+workload.back().size = size1;
+workload.back().rel32 = raw;
+}
+
+// Second subrange target
+if (size2 == 1)
+{
+make_jump(0xe9, it->second); // jmp rel32
+}
+else
+{
+it2 = it;
+
+// Select additional midrange for equality comparison
+while (it2 != w.end && it2->first.at(w.level) == x)
+{
+size2--;
+it2++;
+}
+
+if (it2 != w.end)
+{
+// High subrange target
+if (size2 == 1)
+{
+make_jump(0x87, it2->second); // ja rel32
+}
+else
+{
+make_jump(0x87, raw); // ja rel32 (stub)
+workload.push_back(w);
+workload.back().beg = it2;
+workload.back().size = size2;
+workload.back().rel32 = raw;
+}
+
+const u32 size3 = w.size - size1 - size2;
+
+if (size3 == 1)
+{
+make_jump(0xe9, it->second); // jmp rel32
+}
+else
+{
+make_jump(0xe9, raw); // jmp rel32 (stub)
+workload.push_back(w);
+workload.back().beg = it;
+workload.back().end = it2;
+workload.back().size = size3;
+workload.back().rel32 = raw;
+}
+}
+else
+{
+make_jump(0xe9, raw); // jmp rel32 (stub)
+workload.push_back(w);
+workload.back().beg = it;
+workload.back().size = w.size - size1;
+workload.back().rel32 = raw;
+}
+}
+}
+
+m_dispatcher[func[0] / 4] = reinterpret_cast<spu_function_t>(reinterpret_cast<u64>(wxptr));
+}
+
+lock.unlock();
+m_cond.notify_all();
+}
+
 spu_recompiler_base::spu_recompiler_base()
 {
 }
@@ -1491,55 +1829,14 @@ void spu_recompiler_base::dump(std::string& out)
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/IPO.h"
 #include "llvm/Transforms/Vectorize.h"
-#include "Utilities/JIT.h"
 
-class spu_llvm_runtime
-{
-shared_mutex m_mutex;
-
-// All functions
-std::map<std::vector<u32>, spu_function_t> m_map;
-
-// All dispatchers
-std::array<atomic_t<spu_function_t>, 0x10000> m_dispatcher;
-
-// JIT instance
-jit_compiler m_jit{{}, jit_compiler::cpu(g_cfg.core.llvm_cpu)};
-
-// Debug module output location
-std::string m_cache_path;
-
-friend class spu_llvm_recompiler;
-
-public:
-spu_llvm_runtime()
-{
-// Initialize lookup table
-for (auto& v : m_dispatcher)
-{
-v.raw() = &spu_recompiler_base::dispatch;
-}
-
-// Initialize "empty" block
-m_map[std::vector<u32>()] = &spu_recompiler_base::dispatch;
-
-// Clear LLVM output
-m_cache_path = Emu.PPUCache();
-fs::create_dir(m_cache_path + "llvm/");
-fs::remove_all(m_cache_path + "llvm/", false);
-
-if (g_cfg.core.spu_debug)
-{
-fs::file(m_cache_path + "spu.log", fs::rewrite);
-}
-
-LOG_SUCCESS(SPU, "SPU Recompiler Runtime (LLVM) initialized...");
-}
-};
-
 class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator
 {
-std::shared_ptr<spu_llvm_runtime> m_spurt;
+// SPU Runtime Instance
+std::shared_ptr<spu_runtime> m_spurt;
+
+// JIT Instance
+jit_compiler m_jit{{}, jit_compiler::cpu(g_cfg.core.llvm_cpu)};
 
 // Current function (chunk)
 llvm::Function* m_function;
@@ -2239,11 +2536,6 @@ public:
 : spu_recompiler_base()
 , cpu_translator(nullptr, false)
 {
-if (g_cfg.core.spu_shared_runtime)
-{
-// TODO (local context is unsupported)
-//m_spurt = std::make_shared<spu_llvm_runtime>();
-}
 }
 
 virtual void init() override
@@ -2252,9 +2544,9 @@ public:
 if (!m_spurt)
 {
 m_cache = fxm::get<spu_cache>();
-m_spurt = fxm::get_always<spu_llvm_runtime>();
-m_context = m_spurt->m_jit.get_context();
-m_use_ssse3 = m_spurt->m_jit.has_ssse3();
+m_spurt = fxm::get_always<spu_runtime>();
+m_context = m_jit.get_context();
+m_use_ssse3 = m_jit.has_ssse3();
 }
 }
 
@@ -2271,18 +2563,22 @@ public:
 init();
 
 // Don't lock without shared runtime
-std::unique_lock lock(m_spurt->m_mutex, std::defer_lock);
-
-if (g_cfg.core.spu_shared_runtime)
-{
-lock.lock();
-}
+std::unique_lock lock(m_spurt->m_mutex);
 
 // Try to find existing function, register new one if necessary
 const auto fn_info = m_spurt->m_map.emplace(std::move(func_rv), nullptr);
 
 auto& fn_location = fn_info.first->second;
 
+if (!fn_location && !fn_info.second)
+{
+// Wait if already in progress
+while (!fn_location)
+{
+m_spurt->m_cond.wait(lock);
+}
+}
+
 if (fn_location)
 {
 return fn_location;
@@ -2290,6 +2586,8 @@ public:
 
 auto& func = fn_info.first->first;
 
+lock.unlock();
+
 std::string hash;
 {
 sha1_context ctx;
@@ -2770,179 +3068,6 @@ public:
 m_scan_queue.clear();
 m_function_table = nullptr;
 
-// Generate a dispatcher (übertrampoline)
-std::vector<u32> addrv{func[0]};
-const auto beg = m_spurt->m_map.lower_bound(addrv);
-addrv[0] += 4;
-const auto _end = m_spurt->m_map.lower_bound(addrv);
-const u32 size0 = std::distance(beg, _end);
-
-if (size0 > 1)
-{
-const auto trampoline = cast<Function>(module->getOrInsertFunction(fmt::format("spu-0x%05x-trampoline-%03u", func[0], size0), get_type<void>(), get_type<u8*>(), get_type<u8*>()));
-set_function(trampoline);
-
-struct work
-{
-u32 size;
-u32 level;
-BasicBlock* label;
-std::map<std::vector<u32>, spu_function_t>::iterator beg;
-std::map<std::vector<u32>, spu_function_t>::iterator end;
-};
-
-std::vector<work> workload;
-workload.reserve(size0);
-workload.emplace_back();
-workload.back().size = size0;
-workload.back().level = 1;
-workload.back().beg = beg;
-workload.back().end = _end;
-workload.back().label = m_ir->GetInsertBlock();
-
-for (std::size_t i = 0; i < workload.size(); i++)
-{
-// Get copy of the workload info
-work w = workload[i];
-
-// Switch targets
-std::vector<std::pair<u32, llvm::BasicBlock*>> targets;
-
-llvm::BasicBlock* def{};
-
-bool unsorted = false;
-
-while (w.level < w.beg->first.size())
-{
-const u32 x1 = w.beg->first.at(w.level);
-
-if (x1 == 0)
-{
-// Cannot split: some functions contain holes at this level
-auto it = w.end;
-it--;
-
-if (it->first.at(w.level) != 0)
-{
-unsorted = true;
-}
-
-w.level++;
-continue;
-}
-
-auto it = w.beg;
-auto it2 = it;
-u32 x = x1;
-bool split = false;
-
-while (it2 != w.end)
-{
-it2++;
-
-const u32 x2 = it2 != w.end ? it2->first.at(w.level) : x1;
-
-if (x2 != x)
-{
-const u32 dist = std::distance(it, it2);
-
-const auto b = llvm::BasicBlock::Create(m_context, "", m_function);
-
-if (dist == 1 && x != 0)
-{
-m_ir->SetInsertPoint(b);
-
-if (const u64 fval = reinterpret_cast<u64>(it->second))
-{
-const auto ptr = m_ir->CreateIntToPtr(m_ir->getInt64(fval), main_func->getType());
-m_ir->CreateCall(ptr, {m_thread, m_lsptr})->setTailCall();
-}
-else
-{
-verify(HERE, &it->second == &fn_location);
-m_ir->CreateCall(main_func, {m_thread, m_lsptr})->setTailCall();
-}
-
-m_ir->CreateRetVoid();
-}
-else
-{
-workload.emplace_back(w);
-workload.back().beg = it;
-workload.back().end = it2;
-workload.back().label = b;
-workload.back().size = dist;
-}
-
-if (x == 0)
-{
-def = b;
-}
-else
-{
-targets.emplace_back(std::make_pair(x, b));
-}
-
-x = x2;
-it = it2;
-split = true;
-}
-}
-
-if (!split)
-{
-// Cannot split: words are identical within the range at this level
-w.level++;
-}
-else
-{
-break;
-}
-}
-
-if (!def && targets.empty())
-{
-LOG_ERROR(SPU, "Trampoline simplified at 0x%x (level=%u)", func[0], w.level);
-m_ir->SetInsertPoint(w.label);
-
-if (const u64 fval = reinterpret_cast<u64>(w.beg->second))
-{
-const auto ptr = m_ir->CreateIntToPtr(m_ir->getInt64(fval), main_func->getType());
-m_ir->CreateCall(ptr, {m_thread, m_lsptr})->setTailCall();
-}
-else
-{
-verify(HERE, &w.beg->second == &fn_location);
-m_ir->CreateCall(main_func, {m_thread, m_lsptr})->setTailCall();
-}
-
-m_ir->CreateRetVoid();
-continue;
-}
-
-if (!def)
-{
-def = llvm::BasicBlock::Create(m_context, "", m_function);
-
-m_ir->SetInsertPoint(def);
-tail(&spu_recompiler_base::dispatch, m_thread, m_ir->getInt32(0), m_ir->getInt32(0));
-}
-
-m_ir->SetInsertPoint(w.label);
-const auto add = m_ir->CreateGEP(m_lsptr, m_ir->getInt64(start + w.level * 4 - 4));
-const auto ptr = m_ir->CreateBitCast(add, get_type<u32*>());
-const auto val = m_ir->CreateLoad(ptr);
-const auto sw = m_ir->CreateSwitch(val, def, ::size32(targets));
-
-for (auto& pair : targets)
-{
-sw->addCase(m_ir->getInt32(pair.first), pair.second);
-}
-}
-}
-
-spu_function_t fn{}, tr{};
-
 std::string log;
 
 raw_string_ostream out(log);
@@ -2970,32 +3095,19 @@ public:
 if (g_cfg.core.spu_debug)
 {
 // Testing only
-m_spurt->m_jit.add(std::move(module), m_spurt->m_cache_path + "llvm/");
+m_jit.add(std::move(module), m_spurt->m_cache_path + "llvm/");
 }
 else
 {
-m_spurt->m_jit.add(std::move(module));
+m_jit.add(std::move(module));
 }
 
-m_spurt->m_jit.fin();
-fn = reinterpret_cast<spu_function_t>(m_spurt->m_jit.get_engine().getPointerToFunction(main_func));
-tr = fn;
-
-if (size0 > 1)
-{
-tr = reinterpret_cast<spu_function_t>(m_spurt->m_jit.get_engine().getPointerToFunction(m_function));
-}
+m_jit.fin();
 
 // Register function pointer
-fn_location = fn;
+const spu_function_t fn = reinterpret_cast<spu_function_t>(m_jit.get_engine().getPointerToFunction(main_func));
 
-// Trampoline
-m_spurt->m_dispatcher[func[0] / 4] = tr;
-
-LOG_NOTICE(SPU, "[0x%x] Compiled: %p", func[0], fn);
-
-if (tr != fn)
-LOG_NOTICE(SPU, "[0x%x] T: %p", func[0], tr);
+m_spurt->add(*fn_info.first, fn);
 
 if (g_cfg.core.spu_debug)
 {
@@ -1,6 +1,9 @@
 #pragma once
 
 #include "Utilities/File.h"
+#include "Utilities/mutex.h"
+#include "Utilities/cond.h"
+#include "Utilities/JIT.h"
 #include "SPUThread.h"
 #include <vector>
 #include <bitset>
@@ -30,6 +33,40 @@ public:
 static void initialize();
 };
 
+// Helper class
+class spu_runtime
+{
+public:
+shared_mutex m_mutex;
+
+cond_variable m_cond;
+
+// All functions
+std::map<std::vector<u32>, spu_function_t> m_map;
+
+// All dispatchers
+std::array<atomic_t<spu_function_t>, 0x10000> m_dispatcher;
+
+// Debug module output location
+std::string m_cache_path;
+
+private:
+// Temporarily: asmjit runtime collection
+std::deque<std::unique_ptr<asmjit::JitRuntime>> m_asmjit_rts;
+
+// Trampoline to spu_recompiler_base::dispatch
+spu_function_t tr_dispatch = nullptr;
+
+public:
+spu_runtime();
+
+// Get new ASMJIT runtime
+asmjit::JitRuntime* get_asmjit_rt();
+
+// Add compiled function and generate trampoline if necessary
+void add(std::pair<const std::vector<u32>, spu_function_t>& where, spu_function_t compiled);
+};
+
 // SPU Recompiler instance base class
 class spu_recompiler_base
 {
@@ -367,7 +367,6 @@ struct cfg_root : cfg::node
 cfg::_int<0, 6> preferred_spu_threads{this, "Preferred SPU Threads", 0}; //Numnber of hardware threads dedicated to heavy simultaneous spu tasks
 cfg::_int<0, 16> spu_delay_penalty{this, "SPU delay penalty", 3}; //Number of milliseconds to block a thread if a virtual 'core' isn't free
 cfg::_bool spu_loop_detection{this, "SPU loop detection", true}; //Try to detect wait loops and trigger thread yield
-cfg::_bool spu_shared_runtime{this, "SPU Shared Runtime", true}; // Share compiled SPU functions between all threads
 cfg::_enum<spu_block_size_type> spu_block_size{this, "SPU Block Size", spu_block_size_type::safe};
 cfg::_bool spu_accurate_getllar{this, "Accurate GETLLAR", false};
 cfg::_bool spu_accurate_putlluc{this, "Accurate PUTLLUC", false};