
SPU: multithread compilation

Allow parallel compilation of SPU code, both at startup and runtime
Remove 'SPU Shared Runtime' option (it became obsolete)
Refactor spu_runtime class (now common to both ASMJIT and LLVM)
Implement SPU ubertrampoline generation in raw assembly (LLVM)
Minor improvement of balanced_wait_until<> and balanced_awaken<>
Make JIT MemoryManager2 shared (global)
Fix wrong assertion in cond_variable
Nekotekina 2019-01-21 21:04:32 +03:00
parent 8d5d44141e
commit 4f152ad126
9 changed files with 503 additions and 394 deletions
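A quick illustration of the parallel startup compilation introduced here (see the spu_cache::initialize() hunk in SPURecompiler.cpp further down): worker threads claim indices from a shared atomic counter until the cached function list is exhausted. This is a simplified standalone sketch, not rpcs3 code; names such as compile_one and build_cache_parallel are placeholders, and the real implementation uses named_thread workers and the fnext counter.

#include <atomic>
#include <cstddef>
#include <thread>
#include <vector>

using func_body = std::vector<unsigned>; // stand-in for a decoded SPU function

static void compile_one(const func_body&)
{
    // placeholder for spu_recompiler_base::compile()
}

static void build_cache_parallel(std::vector<func_body>& func_list, unsigned thread_count)
{
    std::atomic<std::size_t> next{0}; // shared work index (plays the role of fnext)
    std::vector<std::thread> workers;

    for (unsigned i = 0; i < thread_count; i++)
    {
        workers.emplace_back([&]
        {
            // Each worker atomically claims the next unprocessed function
            for (std::size_t n = next++; n < func_list.size(); n = next++)
            {
                compile_one(func_list[n]);
            }
        });
    }

    for (auto& w : workers)
    {
        w.join();
    }
}

int main()
{
    unsigned tc = std::thread::hardware_concurrency();
    std::vector<func_body> funcs(100, func_body{0x1000});
    build_cache_parallel(funcs, tc ? tc : 1);
}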

View File

@ -95,6 +95,12 @@ static void* const s_memory = []() -> void*
return utils::memory_reserve(s_memory_size);
}();
// Reserve 2G of memory, should replace previous area for ASLR compatibility
static void* const s_memory2 = utils::memory_reserve(0x80000000);
static u64 s_code_pos = 0;
static u64 s_data_pos = 0;
static void* s_next = s_memory;
#ifdef _WIN32
@ -129,6 +135,11 @@ extern void jit_finalize()
utils::memory_decommit(s_memory, s_memory_size);
s_next = s_memory;
utils::memory_decommit(s_memory2, 0x80000000);
s_code_pos = 0;
s_data_pos = 0;
}
// Helper class
@ -311,24 +322,25 @@ struct MemoryManager : llvm::RTDyldMemoryManager
// Simple memory manager
struct MemoryManager2 : llvm::RTDyldMemoryManager
{
// Reserve 2 GiB
void* const m_memory = utils::memory_reserve(0x80000000);
// Patchwork again...
void* const m_memory = s_memory2;
u8* const m_code = static_cast<u8*>(m_memory) + 0x00000000;
u8* const m_data = static_cast<u8*>(m_memory) + 0x40000000;
u64 m_code_pos = 0;
u64 m_data_pos = 0;
u64& m_code_pos = s_code_pos;
u64& m_data_pos = s_data_pos;
MemoryManager2() = default;
~MemoryManager2() override
{
utils::memory_release(m_memory, 0x80000000);
}
u8* allocateCodeSection(std::uintptr_t size, uint align, uint sec_id, llvm::StringRef sec_name) override
{
std::lock_guard lock(s_mutex);
// Simple allocation
const u64 old = m_code_pos;
const u64 pos = ::align(m_code_pos, align);
@ -349,12 +361,20 @@ struct MemoryManager2 : llvm::RTDyldMemoryManager
utils::memory_commit(m_code + olda, newa - olda, utils::protection::wx);
}
if (!sec_id && sec_name.empty())
{
// Special case: don't log
return m_code + pos;
}
LOG_NOTICE(GENERAL, "LLVM: Code section %u '%s' allocated -> %p (size=0x%x, align=0x%x)", sec_id, sec_name.data(), m_code + pos, size, align);
return m_code + pos;
}
u8* allocateDataSection(std::uintptr_t size, uint align, uint sec_id, llvm::StringRef sec_name, bool is_ro) override
{
std::lock_guard lock(s_mutex);
// Simple allocation
const u64 old = m_data_pos;
const u64 pos = ::align(m_data_pos, align);
@ -642,33 +662,12 @@ u64 jit_compiler::get(const std::string& name)
return m_engine->getGlobalValueAddress(name);
}
std::unordered_map<std::string, u64> jit_compiler::add(std::unordered_map<std::string, std::string> data)
u8* jit_compiler::alloc(u32 size)
{
// Lock memory manager
std::lock_guard lock(s_mutex);
// Dummy memory manager object
MemoryManager2 mm;
std::unordered_map<std::string, u64> result;
std::size_t size = 0;
for (auto&& pair : data)
{
size += ::align(pair.second.size(), 16);
}
utils::memory_commit(s_next, size, utils::protection::wx);
std::memset(s_next, 0xc3, ::align(size, 4096));
for (auto&& pair : data)
{
std::memcpy(s_next, pair.second.data(), pair.second.size());
result.emplace(pair.first, (u64)s_next);
s_next = (void*)::align((u64)s_next + pair.second.size(), 16);
}
s_next = (void*)::align((u64)s_next, 4096);
return result;
return mm.allocateCodeSection(size, 16, 0, {});
}
#endif
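For reference, the removed jit_compiler::add() copied name→code pairs into the old reserved region; the new jit_compiler::alloc() simply hands out 16-byte-aligned writable+executable memory from the shared MemoryManager2 arena, which is how spu_runtime::add() later obtains space for its hand-written dispatcher. A minimal usage sketch (hypothetical helper, assumes an LLVM-enabled build; not part of this commit):

#include <cstdint>
#include "Utilities/JIT.h"

using stub_t = void(*)();

// Emit a single-instruction stub by hand into the shared W|X arena
static stub_t make_empty_stub()
{
    u8* const ptr = jit_compiler::alloc(16); // room for one instruction, 16-byte aligned
    ptr[0] = 0xc3; // x86 'ret'
    return reinterpret_cast<stub_t>(reinterpret_cast<std::uintptr_t>(ptr));
}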

View File

@ -61,6 +61,7 @@ FT build_function_asm(F&& builder)
#include <memory>
#include <string>
#include <string_view>
#include <unordered_map>
#include "types.h"
@ -129,8 +130,8 @@ public:
// Get compiled function address
u64 get(const std::string& name);
// Add functions directly to the memory manager (name -> code)
static std::unordered_map<std::string, u64> add(std::unordered_map<std::string, std::string>);
// Allocate writable executable memory (alignment is assumed 16)
static u8* alloc(u32 size);
// Get CPU info
static std::string cpu(const std::string& _cpu);

View File

@ -10,7 +10,7 @@
bool cond_variable::imp_wait(u32 _old, u64 _timeout) noexcept
{
verify("cond_variable overflow" HERE), (_old & 0xffff) == 0; // Very unlikely: it requires 65535 distinct threads to wait simultaneously
verify("cond_variable overflow" HERE), (_old & 0xffff) != 0xffff; // Very unlikely: it requires 65535 distinct threads to wait simultaneously
return balanced_wait_until(m_value, _timeout, [&](u32& value, auto... ret) -> int
{
@ -42,7 +42,8 @@ bool cond_variable::imp_wait(u32 _old, u64 _timeout) noexcept
void cond_variable::imp_wake(u32 _count) noexcept
{
balanced_awaken(m_value, m_value.atomic_op([&](u32& value) -> u32
// TODO (notify_one)
balanced_awaken<true>(m_value, m_value.atomic_op([&](u32& value) -> u32
{
// Subtract already signaled number from total amount of waiters
const u32 can_sig = (value & 0xffff) - (value >> 16);
@ -266,7 +267,7 @@ void cond_x16::imp_notify() noexcept
return;
}
balanced_awaken(m_cvx16, utils::popcnt16(wait_mask));
balanced_awaken<true>(m_cvx16, utils::popcnt16(wait_mask));
}
bool lf_queue_base::wait(u64 _timeout)
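Context for the corrected assertion above: m_value packs the waiter count in its low 16 bits and the number of already-delivered signals in its high 16 bits (the wake path computes can_sig = (value & 0xffff) - (value >> 16)). The old check required the low 16 bits of _old to be zero, which would trip long before any real overflow; the new one only fires when the 16-bit waiter field has saturated. A small sketch of that layout (not rpcs3 code):

#include <cstdint>

struct cv_state
{
    std::uint32_t raw;

    std::uint32_t waiters()  const { return raw & 0xffff; } // threads currently waiting
    std::uint32_t signaled() const { return raw >> 16; }    // signals already handed out
};

// Overflow only happens once 0xffff (65535) threads wait at the same time
inline bool overflowed(cv_state s) { return s.waiters() == 0xffff; }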

View File

@ -186,7 +186,7 @@ bool balanced_wait_until(atomic_t<T>& var, u64 usec_timeout, Pred&& pred)
{
if (OptWaitOnAddress(&var, &value, sizeof(T), is_inf ? INFINITE : usec_timeout / 1000))
{
if (!test_pred(value) && !test_pred(value, nullptr))
if (!test_pred(value, nullptr))
{
return false;
}
@ -220,7 +220,7 @@ bool balanced_wait_until(atomic_t<T>& var, u64 usec_timeout, Pred&& pred)
return true;
}
if (!test_pred(value) && !test_pred(value, nullptr))
if (!test_pred(value, nullptr))
{
// Stolen notification: restore balance
NtReleaseKeyedEvent(nullptr, &var, false, nullptr);
@ -237,7 +237,7 @@ bool balanced_wait_until(atomic_t<T>& var, u64 usec_timeout, Pred&& pred)
{
if (futex(&var, FUTEX_WAIT_PRIVATE, static_cast<u32>(value), is_inf ? nullptr : &timeout) == 0)
{
if (!test_pred(value) && !test_pred(value, nullptr))
if (!test_pred(value, nullptr))
{
return false;
}
@ -257,7 +257,7 @@ bool balanced_wait_until(atomic_t<T>& var, u64 usec_timeout, Pred&& pred)
#endif
}
template <typename T>
template <bool All = false, typename T>
void balanced_awaken(atomic_t<T>& var, u32 weight)
{
static_assert(sizeof(T) == 4 || sizeof(T) == 8);
@ -265,11 +265,13 @@ void balanced_awaken(atomic_t<T>& var, u32 weight)
#ifdef _WIN32
if (OptWaitOnAddress)
{
if (weight > 1)
if (All || weight > 3)
{
OptWakeByAddressAll(&var);
return;
}
else if (weight == 1)
for (u32 i = 0; i < weight; i++)
{
OptWakeByAddressSingle(&var);
}
@ -282,9 +284,9 @@ void balanced_awaken(atomic_t<T>& var, u32 weight)
NtReleaseKeyedEvent(nullptr, &var, false, nullptr);
}
#else
if (weight)
if (All || weight)
{
futex(&var, FUTEX_WAKE_PRIVATE, std::min<u32>(INT_MAX, weight));
futex(&var, FUTEX_WAKE_PRIVATE, All ? INT_MAX : std::min<u32>(INT_MAX, weight));
}
return;
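Net effect of the new template parameter: balanced_awaken<true>(var, n) always broadcasts (OptWakeByAddressAll on Windows, FUTEX_WAKE with INT_MAX elsewhere), while the default balanced_awaken(var, n) wakes up to n threads, falling back to a broadcast on Windows once the weight exceeds 3. A compile-time sketch of that decision (illustration only, not rpcs3 code):

enum class wake_mode { broadcast, counted };

constexpr wake_mode pick_mode(bool All, unsigned weight)
{
    // Mirrors the Windows WaitOnAddress branch above
    return (All || weight > 3) ? wake_mode::broadcast : wake_mode::counted;
}

static_assert(pick_mode(true, 1) == wake_mode::broadcast);  // balanced_awaken<true>: notify_all
static_assert(pick_mode(false, 2) == wake_mode::counted);   // default: wake exactly 2 waiters
static_assert(pick_mode(false, 8) == wake_mode::broadcast); // large weight: cheaper to broadcast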

View File

@ -32,33 +32,8 @@ std::unique_ptr<spu_recompiler_base> spu_recompiler_base::make_asmjit_recompiler
return std::make_unique<spu_recompiler>();
}
spu_runtime::spu_runtime()
{
m_cache_path = fxm::check_unlocked<ppu_module>()->cache;
if (g_cfg.core.spu_debug)
{
fs::file(m_cache_path + "spu.log", fs::rewrite);
}
LOG_SUCCESS(SPU, "SPU Recompiler Runtime (ASMJIT) initialized...");
// Initialize lookup table
for (auto& v : m_dispatcher)
{
v.raw() = &spu_recompiler_base::dispatch;
}
// Initialize "empty" block
m_map[std::vector<u32>()] = &spu_recompiler_base::dispatch;
}
spu_recompiler::spu_recompiler()
{
if (!g_cfg.core.spu_shared_runtime)
{
m_spurt = std::make_shared<spu_runtime>();
}
}
void spu_recompiler::init()
@ -68,6 +43,7 @@ void spu_recompiler::init()
{
m_cache = fxm::get<spu_cache>();
m_spurt = fxm::get_always<spu_runtime>();
m_asmrt = m_spurt->get_asmjit_rt();
}
}
@ -83,19 +59,22 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& func_rv)
{
init();
// Don't lock without shared runtime
std::unique_lock lock(m_spurt->m_mutex, std::defer_lock);
if (g_cfg.core.spu_shared_runtime)
{
lock.lock();
}
std::unique_lock lock(m_spurt->m_mutex);
// Try to find existing function, register new one if necessary
const auto fn_info = m_spurt->m_map.emplace(std::move(func_rv), nullptr);
auto& fn_location = fn_info.first->second;
if (!fn_location && !fn_info.second)
{
// Wait if already in progress
while (!fn_location)
{
m_spurt->m_cond.wait(lock);
}
}
if (fn_location)
{
return fn_location;
@ -103,6 +82,8 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& func_rv)
auto& func = fn_info.first->first;
lock.unlock();
using namespace asmjit;
SPUDisAsm dis_asm(CPUDisAsm_InterpreterMode);
@ -124,7 +105,7 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& func_rv)
}
CodeHolder code;
code.init(m_spurt->m_jitrt.getCodeInfo());
code.init(m_asmrt->getCodeInfo());
code._globalHints = asmjit::CodeEmitter::kHintOptimizedAlign;
X86Assembler compiler(&code);
@ -861,14 +842,11 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& func_rv)
// Compile and get function address
spu_function_t fn;
if (m_spurt->m_jitrt.add(&fn, &code))
if (m_asmrt->add(&fn, &code))
{
LOG_FATAL(SPU, "Failed to build a function");
}
// Register function
fn_location = fn;
if (g_cfg.core.spu_debug)
{
// Add ASMJIT logs
@ -885,6 +863,11 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& func_rv)
m_cache->add(func);
}
lock.lock();
// Register function (possibly temporarily)
fn_location = fn;
// Generate a dispatcher (übertrampoline)
std::vector<u32> addrv{func[0]};
const auto beg = m_spurt->m_map.lower_bound(addrv);
@ -899,19 +882,11 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& func_rv)
else
{
CodeHolder code;
code.init(m_spurt->m_jitrt.getCodeInfo());
code.init(m_asmrt->getCodeInfo());
X86Assembler compiler(&code);
this->c = &compiler;
if (g_cfg.core.spu_debug)
{
// Set logger
code.setLogger(&logger);
}
compiler.comment("\n\nTrampoline:\n\n");
struct work
{
u32 size;
@ -1110,7 +1085,7 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& func_rv)
spu_function_t tr;
if (m_spurt->m_jitrt.add(&tr, &code))
if (m_asmrt->add(&tr, &code))
{
LOG_FATAL(SPU, "Failed to build a trampoline");
}
@ -1118,6 +1093,9 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& func_rv)
m_spurt->m_dispatcher[func[0] / 4] = tr;
}
lock.unlock();
m_spurt->m_cond.notify_all();
return fn;
}

View File

@ -1,33 +1,10 @@
#pragma once
#include "Utilities/JIT.h"
#include "Utilities/mutex.h"
#include "SPURecompiler.h"
#include <functional>
// SPU ASMJIT Runtime object (global)
class spu_runtime
{
shared_mutex m_mutex;
asmjit::JitRuntime m_jitrt;
// All functions
std::map<std::vector<u32>, spu_function_t> m_map;
// All dispatchers
std::array<atomic_t<spu_function_t>, 0x10000> m_dispatcher;
// Debug module output location
std::string m_cache_path;
friend class spu_recompiler;
public:
spu_runtime();
};
// SPU ASMJIT Recompiler
class spu_recompiler : public spu_recompiler_base
{
@ -43,6 +20,9 @@ public:
virtual spu_function_t compile(std::vector<u32>&&) override;
private:
// ASMJIT runtime
asmjit::JitRuntime* m_asmrt;
// emitter:
asmjit::X86Assembler* c;

View File

@ -24,7 +24,7 @@ const spu_decoder<spu_iname> s_spu_iname;
extern u64 get_timebased_time();
spu_cache::spu_cache(const std::string& loc)
: m_file(loc, fs::read + fs::write + fs::create)
: m_file(loc, fs::read + fs::write + fs::create + fs::append)
{
}
@ -76,18 +76,22 @@ void spu_cache::add(const std::vector<u32>& func)
return;
}
be_t<u32> size = ::size32(func) - 1;
be_t<u32> addr = func[0];
m_file.write(size);
m_file.write(addr);
m_file.write(func.data() + 1, func.size() * 4 - 4);
// Allocate buffer
const auto buf = std::make_unique<be_t<u32>[]>(func.size() + 1);
buf[0] = ::size32(func) - 1;
buf[1] = func[0];
std::memcpy(buf.get() + 2, func.data() + 1, func.size() * 4 - 4);
// Append data
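// (one contiguous write per record, presumably so appends from concurrent compiler threads cannot interleave)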
m_file.write(buf.get(), func.size() * 4 + 4);
}
void spu_cache::initialize()
{
const std::string ppu_cache = Emu.PPUCache();
if (ppu_cache.empty() || !g_cfg.core.spu_shared_runtime)
if (ppu_cache.empty())
{
return;
}
@ -105,30 +109,34 @@ void spu_cache::initialize()
// Read cache
auto func_list = cache->get();
atomic_t<std::size_t> fnext{};
// Recompiler instance for cache initialization
std::unique_ptr<spu_recompiler_base> compiler;
// Initialize compiler instances for parallel compilation
u32 max_threads = static_cast<u32>(g_cfg.core.llvm_threads);
u32 thread_count = max_threads > 0 ? std::min(max_threads, std::thread::hardware_concurrency()) : std::thread::hardware_concurrency();
std::vector<std::unique_ptr<spu_recompiler_base>> compilers{thread_count};
for (auto& compiler : compilers)
{
if (g_cfg.core.spu_decoder == spu_decoder_type::asmjit)
{
compiler = spu_recompiler_base::make_asmjit_recompiler();
}
if (g_cfg.core.spu_decoder == spu_decoder_type::llvm)
else if (g_cfg.core.spu_decoder == spu_decoder_type::llvm)
{
compiler = spu_recompiler_base::make_llvm_recompiler();
}
if (compiler)
else
{
compilers.clear();
break;
}
compiler->init();
}
if (compiler && !func_list.empty())
if (compilers.size() && !func_list.empty())
{
// Fake LS
std::vector<be_t<u32>> ls(0x10000);
// Initialize progress dialog (wait for previous progress done)
while (g_progr_ptotal)
{
@ -137,10 +145,20 @@ void spu_cache::initialize()
g_progr = "Building SPU cache...";
g_progr_ptotal += func_list.size();
}
std::deque<named_thread<std::function<void()>>> thread_queue;
for (std::size_t i = 0; i < compilers.size(); i++) thread_queue.emplace_back("Worker " + std::to_string(i), [&, compiler = compilers[i].get()]()
{
// Fake LS
std::vector<be_t<u32>> ls(0x10000);
// Build functions
for (auto&& func : func_list)
for (std::size_t func_i = fnext++; func_i < func_list.size(); func_i = fnext++)
{
std::vector<u32>& func = func_list[func_i];
if (Emu.IsStopped())
{
g_progr_pdone++;
@ -185,6 +203,13 @@ void spu_cache::initialize()
g_progr_pdone++;
}
});
// Join all threads
while (!thread_queue.empty())
{
thread_queue.pop_front();
}
if (Emu.IsStopped())
{
@ -192,6 +217,8 @@ void spu_cache::initialize()
return;
}
if (compilers.size() && !func_list.empty())
{
LOG_SUCCESS(SPU, "SPU Runtime: Built %u functions.", func_list.size());
}
@ -202,6 +229,317 @@ void spu_cache::initialize()
});
}
spu_runtime::spu_runtime()
{
// Initialize lookup table
for (auto& v : m_dispatcher)
{
v.raw() = &spu_recompiler_base::dispatch;
}
// Initialize "empty" block
m_map[std::vector<u32>()] = &spu_recompiler_base::dispatch;
// Clear LLVM output
m_cache_path = Emu.PPUCache();
fs::create_dir(m_cache_path + "llvm/");
fs::remove_all(m_cache_path + "llvm/", false);
if (g_cfg.core.spu_debug)
{
fs::file(m_cache_path + "spu.log", fs::rewrite);
}
LOG_SUCCESS(SPU, "SPU Recompiler Runtime initialized...");
}
asmjit::JitRuntime* spu_runtime::get_asmjit_rt()
{
std::lock_guard lock(m_mutex);
m_asmjit_rts.emplace_back(std::make_unique<asmjit::JitRuntime>());
return m_asmjit_rts.back().get();
}
void spu_runtime::add(std::pair<const std::vector<u32>, spu_function_t>& where, spu_function_t compiled)
{
std::unique_lock lock(m_mutex);
// Function info
const std::vector<u32>& func = where.first;
//
const u32 start = func[0] * (g_cfg.core.spu_block_size != spu_block_size_type::giga);
// Set pointer to the compiled function
where.second = compiled;
// Generate a dispatcher (übertrampoline)
std::vector<u32> addrv{func[0]};
const auto beg = m_map.lower_bound(addrv);
addrv[0] += 4;
const auto _end = m_map.lower_bound(addrv);
const u32 size0 = std::distance(beg, _end);
if (size0 == 1)
{
m_dispatcher[func[0] / 4] = compiled;
}
else
{
// Allocate some writable executable memory
#ifdef LLVM_AVAILABLE
const auto wxptr = jit_compiler::alloc(size0 * 20);
#else
u8* const wxptr = new u8[size0 * 20]; // dummy
#endif
// Raw assembly pointer
u8* raw = wxptr;
struct work
{
u32 size;
u32 level;
u8* rel32;
std::map<std::vector<u32>, spu_function_t>::iterator beg;
std::map<std::vector<u32>, spu_function_t>::iterator end;
};
// Write jump instruction with rel32 immediate
auto make_jump = [&](u8 op, auto target)
{
verify("Asm overflow" HERE), raw + 6 <= wxptr + size0 * 20;
if (!target && !tr_dispatch)
{
// Generate a special trampoline with pause instruction
#ifdef LLVM_AVAILABLE
const auto trptr = jit_compiler::alloc(16);
#else
u8* const trptr = new u8[16]; // dummy
#endif
trptr[0] = 0xf3; // pause
trptr[1] = 0x90;
trptr[2] = 0xff; // jmp [rip]
trptr[3] = 0x25;
std::memset(trptr + 4, 0, 4);
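// ff 25 with a zero disp32 is 'jmp qword ptr [rip+0]': it jumps through the 8-byte absolute target stored right after the instruction (written below)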
const u64 target = reinterpret_cast<u64>(&spu_recompiler_base::dispatch);
std::memcpy(trptr + 8, &target, 8);
tr_dispatch = reinterpret_cast<spu_function_t>(trptr);
}
// Fallback to dispatch if no target
const u64 taddr = target ? reinterpret_cast<u64>(target) : reinterpret_cast<u64>(tr_dispatch);
// Compute the distance
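// rel32 is counted from the end of the jump: jcc rel32 (0f 8x) is 6 bytes long, jmp rel32 (e9) is 5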
const s64 rel = taddr - reinterpret_cast<u64>(raw) - (op != 0xe9 ? 6 : 5);
verify(HERE), rel >= INT32_MIN, rel <= INT32_MAX;
if (op != 0xe9)
{
// First jcc byte
*raw++ = 0x0f;
verify(HERE), (op >> 4) == 0x8;
}
*raw++ = op;
const s32 r32 = static_cast<s32>(rel);
std::memcpy(raw, &r32, 4);
raw += 4;
};
std::vector<work> workload;
workload.reserve(size0);
workload.emplace_back();
workload.back().size = size0;
workload.back().level = 1;
workload.back().rel32 = 0;
workload.back().beg = beg;
workload.back().end = _end;
for (std::size_t i = 0; i < workload.size(); i++)
{
// Get copy of the workload info
work w = workload[i];
// Split range in two parts
auto it = w.beg;
auto it2 = w.beg;
u32 size1 = w.size / 2;
u32 size2 = w.size - size1;
std::advance(it2, w.size / 2);
while (true)
{
it = it2;
size1 = w.size - size2;
if (w.level >= w.beg->first.size())
{
// Cannot split: smallest function is a prefix of bigger ones (TODO)
break;
}
const u32 x1 = w.beg->first.at(w.level);
if (!x1)
{
// Cannot split: some functions contain holes at this level
w.level++;
continue;
}
// Adjust ranges (forward)
while (it != w.end && x1 == it->first.at(w.level))
{
it++;
size1++;
}
if (it == w.end)
{
// Cannot split: words are identical within the range at this level
w.level++;
}
else
{
size2 = w.size - size1;
break;
}
}
if (w.rel32)
{
// Patch rel32 linking it to the current location if necessary
const s32 r32 = ::narrow<s32>(raw - w.rel32, HERE);
std::memcpy(w.rel32 - 4, &r32, 4);
}
if (w.level >= w.beg->first.size())
{
// If functions cannot be compared, assume smallest function
LOG_ERROR(SPU, "Trampoline simplified at 0x%x (level=%u)", func[0], w.level);
make_jump(0xe9, w.beg->second); // jmp rel32
continue;
}
// Value for comparison
const u32 x = it->first.at(w.level);
// Adjust ranges (backward)
while (true)
{
it--;
if (it->first.at(w.level) != x)
{
it++;
break;
}
verify(HERE), it != w.beg;
size1--;
size2++;
}
// Emit 32-bit comparison: cmp [ls+addr], imm32
verify("Asm overflow" HERE), raw + 10 <= wxptr + size0 * 20;
const u32 cmp_lsa = start + (w.level - 1) * 4;
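// 81 /7 encodes 'cmp r/m32, imm32'; the ModRM byte selects [rdx+disp32] on Win64 (LS pointer is the 2nd argument in rdx) or [rsi+disp32] on System V (2nd argument in rsi)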
*raw++ = 0x81;
#ifdef _WIN32
*raw++ = 0xba;
#else
*raw++ = 0xbe;
#endif
std::memcpy(raw, &cmp_lsa, 4);
std::memcpy(raw + 4, &x, 4);
raw += 8;
// Low subrange target
if (size1 == 1)
{
make_jump(0x82, w.beg->second); // jb rel32
}
else
{
make_jump(0x82, raw); // jb rel32 (stub)
workload.push_back(w);
workload.back().end = it;
workload.back().size = size1;
workload.back().rel32 = raw;
}
// Second subrange target
if (size2 == 1)
{
make_jump(0xe9, it->second); // jmp rel32
}
else
{
it2 = it;
// Select additional midrange for equality comparison
while (it2 != w.end && it2->first.at(w.level) == x)
{
size2--;
it2++;
}
if (it2 != w.end)
{
// High subrange target
if (size2 == 1)
{
make_jump(0x87, it2->second); // ja rel32
}
else
{
make_jump(0x87, raw); // ja rel32 (stub)
workload.push_back(w);
workload.back().beg = it2;
workload.back().size = size2;
workload.back().rel32 = raw;
}
const u32 size3 = w.size - size1 - size2;
if (size3 == 1)
{
make_jump(0xe9, it->second); // jmp rel32
}
else
{
make_jump(0xe9, raw); // jmp rel32 (stub)
workload.push_back(w);
workload.back().beg = it;
workload.back().end = it2;
workload.back().size = size3;
workload.back().rel32 = raw;
}
}
else
{
make_jump(0xe9, raw); // jmp rel32 (stub)
workload.push_back(w);
workload.back().beg = it;
workload.back().size = w.size - size1;
workload.back().rel32 = raw;
}
}
}
m_dispatcher[func[0] / 4] = reinterpret_cast<spu_function_t>(reinterpret_cast<u64>(wxptr));
}
lock.unlock();
m_cond.notify_all();
}
spu_recompiler_base::spu_recompiler_base()
{
}
@ -1491,55 +1829,14 @@ void spu_recompiler_base::dump(std::string& out)
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/Vectorize.h"
#include "Utilities/JIT.h"
class spu_llvm_runtime
{
shared_mutex m_mutex;
// All functions
std::map<std::vector<u32>, spu_function_t> m_map;
// All dispatchers
std::array<atomic_t<spu_function_t>, 0x10000> m_dispatcher;
// JIT instance
jit_compiler m_jit{{}, jit_compiler::cpu(g_cfg.core.llvm_cpu)};
// Debug module output location
std::string m_cache_path;
friend class spu_llvm_recompiler;
public:
spu_llvm_runtime()
{
// Initialize lookup table
for (auto& v : m_dispatcher)
{
v.raw() = &spu_recompiler_base::dispatch;
}
// Initialize "empty" block
m_map[std::vector<u32>()] = &spu_recompiler_base::dispatch;
// Clear LLVM output
m_cache_path = Emu.PPUCache();
fs::create_dir(m_cache_path + "llvm/");
fs::remove_all(m_cache_path + "llvm/", false);
if (g_cfg.core.spu_debug)
{
fs::file(m_cache_path + "spu.log", fs::rewrite);
}
LOG_SUCCESS(SPU, "SPU Recompiler Runtime (LLVM) initialized...");
}
};
class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator
{
std::shared_ptr<spu_llvm_runtime> m_spurt;
// SPU Runtime Instance
std::shared_ptr<spu_runtime> m_spurt;
// JIT Instance
jit_compiler m_jit{{}, jit_compiler::cpu(g_cfg.core.llvm_cpu)};
// Current function (chunk)
llvm::Function* m_function;
@ -2239,11 +2536,6 @@ public:
: spu_recompiler_base()
, cpu_translator(nullptr, false)
{
if (g_cfg.core.spu_shared_runtime)
{
// TODO (local context is unsupported)
//m_spurt = std::make_shared<spu_llvm_runtime>();
}
}
virtual void init() override
@ -2252,9 +2544,9 @@ public:
if (!m_spurt)
{
m_cache = fxm::get<spu_cache>();
m_spurt = fxm::get_always<spu_llvm_runtime>();
m_context = m_spurt->m_jit.get_context();
m_use_ssse3 = m_spurt->m_jit.has_ssse3();
m_spurt = fxm::get_always<spu_runtime>();
m_context = m_jit.get_context();
m_use_ssse3 = m_jit.has_ssse3();
}
}
@ -2271,18 +2563,22 @@ public:
init();
// Don't lock without shared runtime
std::unique_lock lock(m_spurt->m_mutex, std::defer_lock);
if (g_cfg.core.spu_shared_runtime)
{
lock.lock();
}
std::unique_lock lock(m_spurt->m_mutex);
// Try to find existing function, register new one if necessary
const auto fn_info = m_spurt->m_map.emplace(std::move(func_rv), nullptr);
auto& fn_location = fn_info.first->second;
if (!fn_location && !fn_info.second)
{
// Wait if already in progress
while (!fn_location)
{
m_spurt->m_cond.wait(lock);
}
}
if (fn_location)
{
return fn_location;
@ -2290,6 +2586,8 @@ public:
auto& func = fn_info.first->first;
lock.unlock();
std::string hash;
{
sha1_context ctx;
@ -2770,179 +3068,6 @@ public:
m_scan_queue.clear();
m_function_table = nullptr;
// Generate a dispatcher (übertrampoline)
std::vector<u32> addrv{func[0]};
const auto beg = m_spurt->m_map.lower_bound(addrv);
addrv[0] += 4;
const auto _end = m_spurt->m_map.lower_bound(addrv);
const u32 size0 = std::distance(beg, _end);
if (size0 > 1)
{
const auto trampoline = cast<Function>(module->getOrInsertFunction(fmt::format("spu-0x%05x-trampoline-%03u", func[0], size0), get_type<void>(), get_type<u8*>(), get_type<u8*>()));
set_function(trampoline);
struct work
{
u32 size;
u32 level;
BasicBlock* label;
std::map<std::vector<u32>, spu_function_t>::iterator beg;
std::map<std::vector<u32>, spu_function_t>::iterator end;
};
std::vector<work> workload;
workload.reserve(size0);
workload.emplace_back();
workload.back().size = size0;
workload.back().level = 1;
workload.back().beg = beg;
workload.back().end = _end;
workload.back().label = m_ir->GetInsertBlock();
for (std::size_t i = 0; i < workload.size(); i++)
{
// Get copy of the workload info
work w = workload[i];
// Switch targets
std::vector<std::pair<u32, llvm::BasicBlock*>> targets;
llvm::BasicBlock* def{};
bool unsorted = false;
while (w.level < w.beg->first.size())
{
const u32 x1 = w.beg->first.at(w.level);
if (x1 == 0)
{
// Cannot split: some functions contain holes at this level
auto it = w.end;
it--;
if (it->first.at(w.level) != 0)
{
unsorted = true;
}
w.level++;
continue;
}
auto it = w.beg;
auto it2 = it;
u32 x = x1;
bool split = false;
while (it2 != w.end)
{
it2++;
const u32 x2 = it2 != w.end ? it2->first.at(w.level) : x1;
if (x2 != x)
{
const u32 dist = std::distance(it, it2);
const auto b = llvm::BasicBlock::Create(m_context, "", m_function);
if (dist == 1 && x != 0)
{
m_ir->SetInsertPoint(b);
if (const u64 fval = reinterpret_cast<u64>(it->second))
{
const auto ptr = m_ir->CreateIntToPtr(m_ir->getInt64(fval), main_func->getType());
m_ir->CreateCall(ptr, {m_thread, m_lsptr})->setTailCall();
}
else
{
verify(HERE, &it->second == &fn_location);
m_ir->CreateCall(main_func, {m_thread, m_lsptr})->setTailCall();
}
m_ir->CreateRetVoid();
}
else
{
workload.emplace_back(w);
workload.back().beg = it;
workload.back().end = it2;
workload.back().label = b;
workload.back().size = dist;
}
if (x == 0)
{
def = b;
}
else
{
targets.emplace_back(std::make_pair(x, b));
}
x = x2;
it = it2;
split = true;
}
}
if (!split)
{
// Cannot split: words are identical within the range at this level
w.level++;
}
else
{
break;
}
}
if (!def && targets.empty())
{
LOG_ERROR(SPU, "Trampoline simplified at 0x%x (level=%u)", func[0], w.level);
m_ir->SetInsertPoint(w.label);
if (const u64 fval = reinterpret_cast<u64>(w.beg->second))
{
const auto ptr = m_ir->CreateIntToPtr(m_ir->getInt64(fval), main_func->getType());
m_ir->CreateCall(ptr, {m_thread, m_lsptr})->setTailCall();
}
else
{
verify(HERE, &w.beg->second == &fn_location);
m_ir->CreateCall(main_func, {m_thread, m_lsptr})->setTailCall();
}
m_ir->CreateRetVoid();
continue;
}
if (!def)
{
def = llvm::BasicBlock::Create(m_context, "", m_function);
m_ir->SetInsertPoint(def);
tail(&spu_recompiler_base::dispatch, m_thread, m_ir->getInt32(0), m_ir->getInt32(0));
}
m_ir->SetInsertPoint(w.label);
const auto add = m_ir->CreateGEP(m_lsptr, m_ir->getInt64(start + w.level * 4 - 4));
const auto ptr = m_ir->CreateBitCast(add, get_type<u32*>());
const auto val = m_ir->CreateLoad(ptr);
const auto sw = m_ir->CreateSwitch(val, def, ::size32(targets));
for (auto& pair : targets)
{
sw->addCase(m_ir->getInt32(pair.first), pair.second);
}
}
}
spu_function_t fn{}, tr{};
std::string log;
raw_string_ostream out(log);
@ -2970,32 +3095,19 @@ public:
if (g_cfg.core.spu_debug)
{
// Testing only
m_spurt->m_jit.add(std::move(module), m_spurt->m_cache_path + "llvm/");
m_jit.add(std::move(module), m_spurt->m_cache_path + "llvm/");
}
else
{
m_spurt->m_jit.add(std::move(module));
m_jit.add(std::move(module));
}
m_spurt->m_jit.fin();
fn = reinterpret_cast<spu_function_t>(m_spurt->m_jit.get_engine().getPointerToFunction(main_func));
tr = fn;
if (size0 > 1)
{
tr = reinterpret_cast<spu_function_t>(m_spurt->m_jit.get_engine().getPointerToFunction(m_function));
}
m_jit.fin();
// Register function pointer
fn_location = fn;
const spu_function_t fn = reinterpret_cast<spu_function_t>(m_jit.get_engine().getPointerToFunction(main_func));
// Trampoline
m_spurt->m_dispatcher[func[0] / 4] = tr;
LOG_NOTICE(SPU, "[0x%x] Compiled: %p", func[0], fn);
if (tr != fn)
LOG_NOTICE(SPU, "[0x%x] T: %p", func[0], tr);
m_spurt->add(*fn_info.first, fn);
if (g_cfg.core.spu_debug)
{

View File

@ -1,6 +1,9 @@
#pragma once
#include "Utilities/File.h"
#include "Utilities/mutex.h"
#include "Utilities/cond.h"
#include "Utilities/JIT.h"
#include "SPUThread.h"
#include <vector>
#include <bitset>
@ -30,6 +33,40 @@ public:
static void initialize();
};
// Helper class
class spu_runtime
{
public:
shared_mutex m_mutex;
cond_variable m_cond;
// All functions
std::map<std::vector<u32>, spu_function_t> m_map;
// All dispatchers
std::array<atomic_t<spu_function_t>, 0x10000> m_dispatcher;
// Debug module output location
std::string m_cache_path;
private:
// Temporarily: asmjit runtime collection
std::deque<std::unique_ptr<asmjit::JitRuntime>> m_asmjit_rts;
// Trampoline to spu_recompiler_base::dispatch
spu_function_t tr_dispatch = nullptr;
public:
spu_runtime();
// Get new ASMJIT runtime
asmjit::JitRuntime* get_asmjit_rt();
// Add compiled function and generate trampoline if necessary
void add(std::pair<const std::vector<u32>, spu_function_t>& where, spu_function_t compiled);
};
// SPU Recompiler instance base class
class spu_recompiler_base
{

View File

@ -367,7 +367,6 @@ struct cfg_root : cfg::node
cfg::_int<0, 6> preferred_spu_threads{this, "Preferred SPU Threads", 0}; //Number of hardware threads dedicated to heavy simultaneous SPU tasks
cfg::_int<0, 16> spu_delay_penalty{this, "SPU delay penalty", 3}; //Number of milliseconds to block a thread if a virtual 'core' isn't free
cfg::_bool spu_loop_detection{this, "SPU loop detection", true}; //Try to detect wait loops and trigger thread yield
cfg::_bool spu_shared_runtime{this, "SPU Shared Runtime", true}; // Share compiled SPU functions between all threads
cfg::_enum<spu_block_size_type> spu_block_size{this, "SPU Block Size", spu_block_size_type::safe};
cfg::_bool spu_accurate_getllar{this, "Accurate GETLLAR", false};
cfg::_bool spu_accurate_putlluc{this, "Accurate PUTLLUC", false};