mirror of
https://github.com/RPCS3/rpcs3.git
synced 2024-11-22 10:42:36 +01:00
SPU LLVM: implemented asynchronous compilation
Implemented interpreter-based pre-recompiler. Interpreter functions are built with SPU LLVM.
This commit is contained in:
parent
29cddc30f0
commit
b329bb604c
@ -45,9 +45,12 @@ void spu_recompiler::init()
|
||||
}
|
||||
}
|
||||
|
||||
spu_function_t spu_recompiler::compile(u64 last_reset_count, const std::vector<u32>& func)
|
||||
spu_function_t spu_recompiler::compile(u64 last_reset_count, const std::vector<u32>& func, void* fn_location)
|
||||
{
|
||||
const auto fn_location = m_spurt->find(last_reset_count, func);
|
||||
if (!fn_location)
|
||||
{
|
||||
fn_location = m_spurt->find(last_reset_count, func);
|
||||
}
|
||||
|
||||
if (fn_location == spu_runtime::g_dispatcher)
|
||||
{
|
||||
|
@ -13,7 +13,7 @@ public:
|
||||
|
||||
virtual void init() override;
|
||||
|
||||
virtual spu_function_t compile(u64 last_reset_count, const std::vector<u32>&) override;
|
||||
virtual spu_function_t compile(u64 last_reset_count, const std::vector<u32>&, void*) override;
|
||||
|
||||
private:
|
||||
// ASMJIT runtime
|
||||
|
@ -277,6 +277,8 @@ DECLARE(spu_runtime::g_tail_escape) = build_function_asm<void(*)(spu_thread*, sp
|
||||
c.jmp(args[1]);
|
||||
});
|
||||
|
||||
DECLARE(spu_runtime::g_interpreter_table) = {};
|
||||
|
||||
DECLARE(spu_runtime::g_interpreter) = nullptr;
|
||||
|
||||
spu_cache::spu_cache(const std::string& loc)
|
||||
@ -392,16 +394,20 @@ void spu_cache::initialize()
|
||||
u32 thread_count = max_threads > 0 ? std::min(max_threads, std::thread::hardware_concurrency()) : std::thread::hardware_concurrency();
|
||||
std::vector<std::unique_ptr<spu_recompiler_base>> compilers{thread_count};
|
||||
|
||||
if (g_cfg.core.spu_decoder == spu_decoder_type::fast)
|
||||
if (g_cfg.core.spu_decoder == spu_decoder_type::fast || g_cfg.core.spu_decoder == spu_decoder_type::llvm)
|
||||
{
|
||||
if (auto compiler = spu_recompiler_base::make_llvm_recompiler(11))
|
||||
{
|
||||
compiler->init();
|
||||
|
||||
if (compiler->compile(0, {}) && spu_runtime::g_interpreter)
|
||||
if (compiler->compile(0, {}, nullptr) && spu_runtime::g_interpreter)
|
||||
{
|
||||
LOG_SUCCESS(SPU, "SPU Runtime: built interpreter.");
|
||||
return;
|
||||
|
||||
if (g_cfg.core.spu_decoder != spu_decoder_type::llvm)
|
||||
{
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -450,7 +456,7 @@ void spu_cache::initialize()
|
||||
// Build functions
|
||||
for (std::size_t func_i = fnext++; func_i < func_list.size(); func_i = fnext++)
|
||||
{
|
||||
std::vector<u32>& func = func_list[func_i];
|
||||
const std::vector<u32>& func = std::as_const(func_list)[func_i];
|
||||
|
||||
if (Emu.IsStopped() || fail_flag)
|
||||
{
|
||||
@ -476,7 +482,7 @@ void spu_cache::initialize()
|
||||
LOG_ERROR(SPU, "[0x%05x] SPU Analyser failed, %u vs %u", func2[0], func2.size() - 1, size0 - 1);
|
||||
}
|
||||
|
||||
if (!compiler->compile(0, func))
|
||||
if (!compiler->compile(0, func, nullptr))
|
||||
{
|
||||
// Likely, out of JIT memory. Signal to prevent further building.
|
||||
fail_flag |= 1;
|
||||
@ -615,7 +621,7 @@ bool spu_runtime::add(u64 last_reset_count, void* _where, spu_function_t compile
|
||||
auto& where = *static_cast<decltype(m_map)::value_type*>(_where);
|
||||
|
||||
// Function info
|
||||
const std::vector<u32>& func = where.first;
|
||||
const std::vector<u32>& func = get_func(_where);
|
||||
|
||||
//
|
||||
const u32 _off = 1 + (func[0] / 4) * (false);
|
||||
@ -1137,7 +1143,7 @@ void spu_recompiler_base::make_function(const std::vector<u32>& data)
|
||||
{
|
||||
for (u64 reset_count = m_spurt->get_reset_count();;)
|
||||
{
|
||||
if (LIKELY(compile(reset_count, data)))
|
||||
if (LIKELY(compile(reset_count, data, nullptr)))
|
||||
{
|
||||
break;
|
||||
}
|
||||
@ -3276,6 +3282,7 @@ void spu_recompiler_base::dump(std::string& out)
|
||||
#include "llvm/ADT/Triple.h"
|
||||
#include "llvm/IR/LegacyPassManager.h"
|
||||
#include "llvm/IR/Verifier.h"
|
||||
#include "llvm/IR/InlineAsm.h"
|
||||
#include "llvm/Analysis/Lint.h"
|
||||
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
|
||||
#include "llvm/Transforms/Scalar.h"
|
||||
@ -4231,14 +4238,17 @@ public:
|
||||
}
|
||||
}
|
||||
|
||||
virtual spu_function_t compile(u64 last_reset_count, const std::vector<u32>& func) override
|
||||
virtual spu_function_t compile(u64 last_reset_count, const std::vector<u32>& func, void* fn_location) override
|
||||
{
|
||||
if (func.empty() && last_reset_count == 0 && m_interp_magn)
|
||||
{
|
||||
return compile_interpreter();
|
||||
}
|
||||
|
||||
const auto fn_location = m_spurt->find(last_reset_count, func);
|
||||
if (!fn_location)
|
||||
{
|
||||
fn_location = m_spurt->find(last_reset_count, func);
|
||||
}
|
||||
|
||||
if (fn_location == spu_runtime::g_dispatcher)
|
||||
{
|
||||
@ -4273,14 +4283,7 @@ public:
|
||||
m_hash_start = hash_start;
|
||||
}
|
||||
|
||||
if (g_fxo->get<spu_cache>())
|
||||
{
|
||||
LOG_SUCCESS(SPU, "LLVM: Building %s (size %u)...", m_hash, func.size() - 1);
|
||||
}
|
||||
else
|
||||
{
|
||||
LOG_NOTICE(SPU, "Building function 0x%x... (size %u, %s)", func[0], func.size() - 1, m_hash);
|
||||
}
|
||||
LOG_NOTICE(SPU, "Building function 0x%x... (size %u, %s)", func[0], func.size() - 1, m_hash);
|
||||
|
||||
m_pos = func[0];
|
||||
m_base = func[0];
|
||||
@ -4838,6 +4841,11 @@ public:
|
||||
fs::file(m_spurt->get_cache_path() + "spu-ir.log", fs::write + fs::append).write(log);
|
||||
}
|
||||
|
||||
if (g_fxo->get<spu_cache>())
|
||||
{
|
||||
LOG_SUCCESS(SPU, "New block compiled successfully");
|
||||
}
|
||||
|
||||
return fn;
|
||||
}
|
||||
|
||||
@ -4899,7 +4907,6 @@ public:
|
||||
m_ir->SetInsertPoint(BasicBlock::Create(m_context, "", ret_func));
|
||||
m_thread = &*(ret_func->arg_begin() + 1);
|
||||
m_interp_pc = &*(ret_func->arg_begin() + 2);
|
||||
m_ir->CreateStore(m_interp_pc, spu_ptr<u32>(&spu_thread::pc));
|
||||
m_ir->CreateRetVoid();
|
||||
|
||||
// Add entry function, serves as a trampoline
|
||||
@ -4953,12 +4960,13 @@ public:
|
||||
}
|
||||
|
||||
// Fill interpreter table
|
||||
std::array<llvm::Function*, 256> ifuncs{};
|
||||
std::vector<llvm::Constant*> iptrs;
|
||||
iptrs.reserve(1ull << m_interp_magn);
|
||||
|
||||
m_block = nullptr;
|
||||
|
||||
auto last_itype = spu_itype::UNK;
|
||||
auto last_itype = spu_itype::type{255};
|
||||
|
||||
for (u32 i = 0; i < 1u << m_interp_magn;)
|
||||
{
|
||||
@ -4988,8 +4996,12 @@ public:
|
||||
// Build if necessary
|
||||
if (f->empty())
|
||||
{
|
||||
if (last_itype != itype)
|
||||
{
|
||||
ifuncs[itype] = f;
|
||||
}
|
||||
|
||||
f->setCallingConv(CallingConv::GHC);
|
||||
f->setLinkage(GlobalValue::InternalLinkage);
|
||||
|
||||
m_function = f;
|
||||
m_lsptr = &*(f->arg_begin() + 0);
|
||||
@ -5090,6 +5102,62 @@ public:
|
||||
m_interp_pc = m_interp_pc_next;
|
||||
}
|
||||
|
||||
if (last_itype != itype)
|
||||
{
|
||||
// Reset to discard dead code
|
||||
llvm::cast<LoadInst>(next_if)->setVolatile(false);
|
||||
|
||||
if (itype & spu_itype::branch)
|
||||
{
|
||||
const auto _stop = BasicBlock::Create(m_context, "", f);
|
||||
const auto _next = BasicBlock::Create(m_context, "", f);
|
||||
m_ir->CreateCondBr(m_ir->CreateIsNotNull(m_ir->CreateLoad(spu_ptr<u32>(&spu_thread::state))), _stop, _next, m_md_unlikely);
|
||||
m_ir->SetInsertPoint(_stop);
|
||||
m_ir->CreateStore(m_interp_pc, spu_ptr<u32>(&spu_thread::pc));
|
||||
|
||||
const auto escape_yes = BasicBlock::Create(m_context, "", f);
|
||||
const auto escape_no = BasicBlock::Create(m_context, "", f);
|
||||
m_ir->CreateCondBr(call("spu_exec_check_state", &exec_check_state, m_thread), escape_yes, escape_no);
|
||||
m_ir->SetInsertPoint(escape_yes);
|
||||
call("spu_escape", spu_runtime::g_escape, m_thread);
|
||||
m_ir->CreateBr(_next);
|
||||
m_ir->SetInsertPoint(escape_no);
|
||||
m_ir->CreateBr(_next);
|
||||
m_ir->SetInsertPoint(_next);
|
||||
}
|
||||
|
||||
llvm::Value* fret = m_ir->CreateBitCast(m_interp_table, if_type->getPointerTo());
|
||||
|
||||
if (itype == spu_itype::WRCH ||
|
||||
itype == spu_itype::RDCH ||
|
||||
itype == spu_itype::RCHCNT ||
|
||||
itype == spu_itype::STOP ||
|
||||
itype == spu_itype::STOPD ||
|
||||
itype == spu_itype::UNK ||
|
||||
itype == spu_itype::DFCMEQ ||
|
||||
itype == spu_itype::DFCMGT ||
|
||||
itype == spu_itype::DFCGT ||
|
||||
itype == spu_itype::DFCEQ ||
|
||||
itype == spu_itype::DFTSV)
|
||||
{
|
||||
m_interp_7f0 = m_ir->getInt32(0x7f0);
|
||||
m_interp_regs = _ptr(m_thread, get_reg_offset(0));
|
||||
fret = ret_func;
|
||||
}
|
||||
else if (!(itype & spu_itype::branch))
|
||||
{
|
||||
// Hack: inline ret instruction before final jmp; this is not reliable.
|
||||
m_ir->CreateCall(InlineAsm::get(get_ftype<void>(), "ret", "", true, false, InlineAsm::AD_Intel));
|
||||
fret = ret_func;
|
||||
}
|
||||
|
||||
const auto arg3 = UndefValue::get(get_type<u32>());
|
||||
const auto _ret = m_ir->CreateCall(fret, {m_lsptr, m_thread, m_interp_pc, arg3, m_interp_table, m_interp_7f0, m_interp_regs});
|
||||
_ret->setCallingConv(CallingConv::GHC);
|
||||
_ret->setTailCall();
|
||||
m_ir->CreateRetVoid();
|
||||
}
|
||||
|
||||
if (!m_ir->GetInsertBlock()->getTerminator())
|
||||
{
|
||||
// Call next instruction.
|
||||
@ -5129,7 +5197,7 @@ public:
|
||||
}
|
||||
}
|
||||
|
||||
if (last_itype != itype)
|
||||
if (last_itype != itype && g_cfg.core.spu_decoder != spu_decoder_type::llvm)
|
||||
{
|
||||
// Repeat after probing
|
||||
last_itype = itype;
|
||||
@ -5194,6 +5262,12 @@ public:
|
||||
// Register interpreter entry point
|
||||
spu_runtime::g_interpreter = reinterpret_cast<spu_function_t>(m_jit.get_engine().getPointerToFunction(main_func));
|
||||
|
||||
for (u32 i = 0; i < spu_runtime::g_interpreter_table.size(); i++)
|
||||
{
|
||||
// Fill exported interpreter table
|
||||
spu_runtime::g_interpreter_table[i] = ifuncs[i] ? reinterpret_cast<u64>(m_jit.get_engine().getPointerToFunction(ifuncs[i])) : 0;
|
||||
}
|
||||
|
||||
if (!spu_runtime::g_interpreter)
|
||||
{
|
||||
return nullptr;
|
||||
@ -8242,3 +8316,416 @@ std::unique_ptr<spu_recompiler_base> spu_recompiler_base::make_llvm_recompiler(u
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
// SPU LLVM recompiler thread context
// Callable object wrapped by spu_llvm_thread (see the alias after the struct). Its
// operator() walks the `registered` queue, compiles every queued function with the
// LLVM recompiler and then overwrites the first 8 bytes at the queued code address
// with a jump to the freshly compiled function.
|
||||
struct spu_llvm
|
||||
{
|
||||
// Workload
// Each entry is (opaque function location, address of the code to patch).
// spu_fast::compile pushes (fn_location, start of its generated code) here; that
// generated code starts with an 8-byte NOP reserved for patching.
|
||||
lf_queue<std::pair<void*, u8*>> registered;
|
||||
|
||||
// Worker body: process queued functions until the queue iteration ends or the
// thread state becomes `aborting`.
void operator()()
|
||||
{
|
||||
// SPU LLVM Recompiler instance
// NOTE(review): the result is used without a null check - confirm that
// make_llvm_recompiler() cannot return null in builds where this thread runs.
|
||||
const auto compiler = spu_recompiler_base::make_llvm_recompiler();
|
||||
compiler->init();
|
||||
|
||||
// Fake LS
// 0x10000 32-bit words (0x40000 bytes, matching the memset at the end of the loop body)
|
||||
std::vector<be_t<u32>> ls(0x10000);
|
||||
|
||||
for (auto* parg : registered)
|
||||
{
|
||||
if (thread_ctrl::state() == thread_state::aborting)
|
||||
{
|
||||
break;
|
||||
}
|
||||
|
||||
// Skip null queue entries
if (!parg)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
// Function data: func[0] is the start address, func[1..] are the instruction words
const std::vector<u32>& func = spu_runtime::get_func(parg->first);
|
||||
|
||||
// Get data start
|
||||
const u32 start = func[0];
|
||||
const u32 size0 = ::size32(func);
|
||||
|
||||
// Initialize LS with function data only
|
||||
for (u32 i = 1, pos = start; i < size0; i++, pos += 4)
|
||||
{
|
||||
ls[pos / 4] = se_storage<u32>::swap(func[i]);
|
||||
}
|
||||
|
||||
// Call analyser
// The analyser is expected to reproduce the same function from the fake LS;
// a size mismatch is only reported, compilation below still uses `func`.
|
||||
const std::vector<u32>& func2 = compiler->analyse(ls.data(), func[0]);
|
||||
|
||||
if (func2.size() != size0)
|
||||
{
|
||||
LOG_ERROR(SPU, "[0x%05x] SPU Analyser failed, %u vs %u", func2[0], func2.size() - 1, size0 - 1);
|
||||
}
|
||||
|
||||
// Compile for the already known location (parg->first), so compile() skips its own lookup
if (const auto target = compiler->compile(0, func, parg->first))
|
||||
{
|
||||
// Redirect old function
// Displacement is relative to the end of the 5-byte jmp placed at parg->second.
// Only the low 4 bytes of `rel` are encoded below.
// NOTE(review): assumes target is within rel32 range of parg->second - confirm.
|
||||
const s64 rel = reinterpret_cast<u64>(target) - reinterpret_cast<u64>(parg->second) - 5;
|
||||
|
||||
// Build the 8 replacement bytes and view them as one u64 for a single store
union
|
||||
{
|
||||
u8 bytes[8];
|
||||
u64 result;
|
||||
};
|
||||
|
||||
bytes[0] = 0xe9; // jmp rel32
|
||||
std::memcpy(bytes + 1, &rel, 4);
|
||||
// Fill the remaining 3 bytes with single-byte NOPs (0x90)
bytes[5] = 0x90;
|
||||
bytes[6] = 0x90;
|
||||
bytes[7] = 0x90;
|
||||
|
||||
// Publish all 8 bytes with one release store
atomic_storage<u64>::release(*reinterpret_cast<u64*>(parg->second), result);
|
||||
}
|
||||
else
|
||||
{
|
||||
LOG_FATAL(SPU, "[0x%05x] Compilation failed.", func2[0]);
|
||||
}
|
||||
|
||||
// Clear fake LS
// Also cross-checks every word reported by the analyser against the LS contents
|
||||
for (u32 i = 1, pos = start; i < func2.size(); i++, pos += 4)
|
||||
{
|
||||
if (se_storage<u32>::swap(func2[i]) != ls[pos / 4])
|
||||
{
|
||||
LOG_ERROR(SPU, "[0x%05x] SPU Analyser failed at 0x%x", func2[0], pos);
|
||||
}
|
||||
|
||||
ls[pos / 4] = 0;
|
||||
}
|
||||
|
||||
// Sizes differed, so the loop above may not have covered every written word: wipe the whole fake LS
if (func2.size() != size0)
|
||||
{
|
||||
std::memset(ls.data(), 0, 0x40000);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static constexpr auto thread_name = "SPU LLVM"sv;
|
||||
};
|
||||
|
||||
// Thread type wrapping the spu_llvm context
using spu_llvm_thread = named_thread<spu_llvm>;
|
||||
|
||||
// Interpreter-based recompiler. compile() emits a fixed-layout x86-64 stub for an SPU
// function: the stub first compares the LS contents with the expected instruction words,
// then executes one 16-byte slot per instruction, each slot calling a prebuilt interpreter
// function from spu_runtime::g_interpreter_table. Every successfully registered function
// is also queued for the SPU LLVM thread (spu_llvm_thread).
struct spu_fast : public spu_recompiler_base
|
||||
{
|
||||
// Acquire the SPU runtime from g_fxo if it has not been set yet
virtual void init() override
|
||||
{
|
||||
if (!m_spurt)
|
||||
{
|
||||
m_spurt = g_fxo->get<spu_runtime>();
|
||||
}
|
||||
}
|
||||
|
||||
// Build the stub for `func` (func[0] is the start address, func[1..] are the instruction words).
// fn_location: opaque location as returned by spu_runtime::find(); looked up here when null.
// Returns &dispatch when the location equals spu_runtime::g_dispatcher; nullptr when no
// location was obtained, the executable allocation failed, or spu_runtime::add() returned
// false; otherwise the address of the generated code.
virtual spu_function_t compile(u64 last_reset_count, const std::vector<u32>& func, void* fn_location) override
|
||||
{
|
||||
if (!fn_location)
|
||||
{
|
||||
fn_location = m_spurt->find(last_reset_count, func);
|
||||
}
|
||||
|
||||
if (fn_location == spu_runtime::g_dispatcher)
|
||||
{
|
||||
return &dispatch;
|
||||
}
|
||||
|
||||
if (!fn_location)
|
||||
{
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// Optional debug output: append the dump() text to spu.log in the cache path
if (g_cfg.core.spu_debug)
|
||||
{
|
||||
std::string log;
|
||||
this->dump(log);
|
||||
fs::file(m_spurt->get_cache_path() + "spu.log", fs::write + fs::append).write(log);
|
||||
}
|
||||
|
||||
// Allocate executable area with necessary size
// 8 (patchable NOP) + 9 (PC load and LS address setup) + 32 per instruction word
// (16 bytes of verification + 16-byte instruction slot) + 36 (secondary prologue)
// + 47 (local dispatcher/epilogue). The extra "+ 1" presumably reserves room for
// the disabled trap byte below - TODO confirm.
|
||||
const auto result = jit_runtime::alloc(8 + 1 + 9 + (::size32(func) - 1) * (16 + 16) + 36 + 47, 16);
|
||||
|
||||
if (!result)
|
||||
{
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// Start address and size in bytes (4 bytes per instruction word)
m_pos = func[0];
|
||||
m_size = (::size32(func) - 1) * 4;
|
||||
|
||||
// Write cursor into the allocated area
u8* raw = result;
|
||||
|
||||
// 8-byte NOP for patching
// (the SPU LLVM thread later overwrites these 8 bytes with a jmp to the LLVM-compiled function)
|
||||
*raw++ = 0x0f;
|
||||
*raw++ = 0x1f;
|
||||
*raw++ = 0x84;
|
||||
*raw++ = 0x00;
|
||||
*raw++ = 0x00;
|
||||
*raw++ = 0x00;
|
||||
*raw++ = 0x00;
|
||||
*raw++ = 0x00;
|
||||
|
||||
// Load PC: mov eax, [r13 + spu_thread::pc]
|
||||
*raw++ = 0x41;
|
||||
*raw++ = 0x8b;
|
||||
*raw++ = 0x45;
|
||||
*raw++ = ::narrow<s8>(::offset32(&spu_thread::pc));
|
||||
|
||||
// Get LS address starting from PC: lea rcx, [rbp + rax]
|
||||
*raw++ = 0x48;
|
||||
*raw++ = 0x8d;
|
||||
*raw++ = 0x4c;
|
||||
*raw++ = 0x05;
|
||||
*raw++ = 0x00;
|
||||
|
||||
// Verification (slow)
// One cmp + jne pair (16 bytes) per non-zero instruction word
|
||||
for (u32 i = 1; i < func.size(); i++)
|
||||
{
|
||||
// Zero words are not verified
if (!func[i])
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
// cmp dword ptr [rcx + off], opc
|
||||
*raw++ = 0x81;
|
||||
*raw++ = 0xb9;
|
||||
const u32 off = (i - 1) * 4;
|
||||
const u32 opc = func[i];
|
||||
std::memcpy(raw + 0, &off, 4);
|
||||
std::memcpy(raw + 4, &opc, 4);
|
||||
raw += 8;
|
||||
|
||||
// jne tr_dispatch
// (rel32 is relative to the end of the 6-byte jne; only the low 4 bytes of `rel` are written)
|
||||
const s64 rel = reinterpret_cast<u64>(spu_runtime::tr_dispatch) - reinterpret_cast<u64>(raw) - 6;
|
||||
*raw++ = 0x0f;
|
||||
*raw++ = 0x85;
|
||||
std::memcpy(raw + 0, &rel, 4);
|
||||
raw += 4;
|
||||
}
|
||||
|
||||
// trap
// (disabled)
|
||||
//*raw++ = 0xcc;
|
||||
|
||||
// Secondary prologue: sub rsp,0x28
|
||||
*raw++ = 0x48;
|
||||
*raw++ = 0x83;
|
||||
*raw++ = 0xec;
|
||||
*raw++ = 0x28;
|
||||
|
||||
// Fix args: xchg r13,rbp
|
||||
*raw++ = 0x49;
|
||||
*raw++ = 0x87;
|
||||
*raw++ = 0xed;
|
||||
|
||||
// mov r12d, eax
// (eax holds the PC loaded above)
|
||||
*raw++ = 0x41;
|
||||
*raw++ = 0x89;
|
||||
*raw++ = 0xc4;
|
||||
|
||||
// mov esi, 0x7f0
|
||||
*raw++ = 0xbe;
|
||||
*raw++ = 0xf0;
|
||||
*raw++ = 0x07;
|
||||
*raw++ = 0x00;
|
||||
*raw++ = 0x00;
|
||||
|
||||
// lea rdi, [rbp + spu_thread::gpr]
|
||||
*raw++ = 0x48;
|
||||
*raw++ = 0x8d;
|
||||
*raw++ = 0x7d;
|
||||
*raw++ = ::narrow<s8>(::offset32(&spu_thread::gpr));
|
||||
|
||||
// Save base pc: mov [rbp + spu_thread::base_pc], eax
|
||||
*raw++ = 0x89;
|
||||
*raw++ = 0x45;
|
||||
*raw++ = ::narrow<s8>(::offset32(&spu_thread::base_pc));
|
||||
|
||||
// inc block_counter
|
||||
*raw++ = 0x48;
|
||||
*raw++ = 0xff;
|
||||
*raw++ = 0x85;
|
||||
const u32 blc_off = ::offset32(&spu_thread::block_counter);
|
||||
std::memcpy(raw, &blc_off, 4);
|
||||
raw += 4;
|
||||
|
||||
// lea r14, [local epilogue]
// The displacement equals the total size of the 16-byte instruction slots that follow
|
||||
*raw++ = 0x4c;
|
||||
*raw++ = 0x8d;
|
||||
*raw++ = 0x35;
|
||||
const u32 epi_off = (::size32(func) - 1) * 16;
|
||||
std::memcpy(raw, &epi_off, 4);
|
||||
raw += 4;
|
||||
|
||||
// Instructions (each instruction occupies fixed number of bytes)
// Every path through this loop body emits exactly 16 bytes
|
||||
for (u32 i = 1; i < func.size(); i++)
|
||||
{
|
||||
// SPU address of the current instruction word
const u32 pos = m_pos + (i - 1) * 4;
|
||||
|
||||
// Zero word: emit a slot that stores the pc and returns instead of calling an interpreter function
if (!func[i])
|
||||
{
|
||||
// Save pc: mov [rbp + spu_thread::pc], r12d
|
||||
*raw++ = 0x44;
|
||||
*raw++ = 0x89;
|
||||
*raw++ = 0x65;
|
||||
*raw++ = ::narrow<s8>(::offset32(&spu_thread::pc));
|
||||
|
||||
// Epilogue: add rsp,0x28
|
||||
*raw++ = 0x48;
|
||||
*raw++ = 0x83;
|
||||
*raw++ = 0xc4;
|
||||
*raw++ = 0x28;
|
||||
|
||||
// ret (TODO)
|
||||
*raw++ = 0xc3;
|
||||
// Pad the remaining 7 bytes of the 16-byte slot with 0xcc
std::memset(raw, 0xcc, 16 - 9);
|
||||
raw += 16 - 9;
|
||||
continue;
|
||||
}
|
||||
|
||||
// Fix endianness
|
||||
const spu_opcode_t op{se_storage<u32>::swap(func[i])};
|
||||
|
||||
switch (auto type = s_spu_itype.decode(op.opcode))
|
||||
{
|
||||
case spu_itype::BRZ:
|
||||
case spu_itype::BRHZ:
|
||||
case spu_itype::BRNZ:
|
||||
case spu_itype::BRHNZ:
|
||||
{
|
||||
const u32 target = spu_branch_target(pos, op.i16);
|
||||
|
||||
// NOTE: the leading "0 &&" makes this condition always false, so the inline
// compare-and-jump encoding below is currently disabled and these branch
// types always fall through to the default path.
if (0 && target >= m_pos && target < m_pos + m_size)
|
||||
{
|
||||
*raw++ = type == spu_itype::BRHZ || type == spu_itype::BRHNZ ? 0x66 : 0x90;
|
||||
*raw++ = 0x83;
|
||||
*raw++ = 0xbd;
|
||||
const u32 off = ::offset32(&spu_thread::gpr, op.rt) + 12;
|
||||
std::memcpy(raw, &off, 4);
|
||||
raw += 4;
|
||||
*raw++ = 0x00;
|
||||
|
||||
*raw++ = 0x0f;
|
||||
*raw++ = type == spu_itype::BRZ || type == spu_itype::BRHZ ? 0x84 : 0x85;
|
||||
const u32 dif = (target - (pos + 4)) / 4 * 16 + 2;
|
||||
std::memcpy(raw, &dif, 4);
|
||||
raw += 4;
|
||||
|
||||
*raw++ = 0x66;
|
||||
*raw++ = 0x90;
|
||||
break;
|
||||
}
|
||||
|
||||
[[fallthrough]];
|
||||
}
|
||||
default:
|
||||
{
|
||||
// Ballast: mov r15d, pos
|
||||
*raw++ = 0x41;
|
||||
*raw++ = 0xbf;
|
||||
std::memcpy(raw, &pos, 4);
|
||||
raw += 4;
|
||||
|
||||
// mov ebx, opc
|
||||
*raw++ = 0xbb;
|
||||
std::memcpy(raw, &op, 4);
|
||||
raw += 4;
|
||||
|
||||
// call spu_* (specially built interpreter function)
// (rel32 is relative to the end of the 5-byte call; only the low 4 bytes of `rel` are written)
// NOTE(review): a zero g_interpreter_table entry is not checked here - confirm every decoded type has one.
|
||||
const s64 rel = spu_runtime::g_interpreter_table[type] - reinterpret_cast<u64>(raw) - 5;
|
||||
*raw++ = 0xe8;
|
||||
std::memcpy(raw, &rel, 4);
|
||||
raw += 4;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Local dispatcher/epilogue: fix stack after branch instruction, then dispatch or return
|
||||
|
||||
// add rsp, 8
|
||||
*raw++ = 0x48;
|
||||
*raw++ = 0x83;
|
||||
*raw++ = 0xc4;
|
||||
*raw++ = 0x08;
|
||||
|
||||
// and rsp, -16
|
||||
*raw++ = 0x48;
|
||||
*raw++ = 0x83;
|
||||
*raw++ = 0xe4;
|
||||
*raw++ = 0xf0;
|
||||
|
||||
// lea rax, [r12 - size]
|
||||
*raw++ = 0x49;
|
||||
*raw++ = 0x8d;
|
||||
*raw++ = 0x84;
|
||||
*raw++ = 0x24;
|
||||
// Negated function size, reused as the cmp immediate below
const u32 msz = 0u - m_size;
|
||||
std::memcpy(raw, &msz, 4);
|
||||
raw += 4;
|
||||
|
||||
// sub eax, [rbp + spu_thread::base_pc]
|
||||
*raw++ = 0x2b;
|
||||
*raw++ = 0x45;
|
||||
*raw++ = ::narrow<s8>(::offset32(&spu_thread::base_pc));
|
||||
|
||||
// cmp eax, (0 - size)
|
||||
*raw++ = 0x3d;
|
||||
std::memcpy(raw, &msz, 4);
|
||||
raw += 4;
|
||||
|
||||
// jb epilogue
// (+12 skips the movsxd/shl/add/jmp sequence that follows, 3 + 4 + 3 + 2 bytes)
|
||||
*raw++ = 0x72;
|
||||
*raw++ = +12;
|
||||
|
||||
// movsxd rax, eax
|
||||
*raw++ = 0x48;
|
||||
*raw++ = 0x63;
|
||||
*raw++ = 0xc0;
|
||||
|
||||
// shl rax, 2
// (4 bytes of SPU code correspond to one 16-byte slot)
|
||||
*raw++ = 0x48;
|
||||
*raw++ = 0xc1;
|
||||
*raw++ = 0xe0;
|
||||
*raw++ = 0x02;
|
||||
|
||||
// add rax, r14
|
||||
*raw++ = 0x4c;
|
||||
*raw++ = 0x01;
|
||||
*raw++ = 0xf0;
|
||||
|
||||
// jmp rax
|
||||
*raw++ = 0xff;
|
||||
*raw++ = 0xe0;
|
||||
|
||||
// Save pc: mov [rbp + spu_thread::pc], r12d
|
||||
*raw++ = 0x44;
|
||||
*raw++ = 0x89;
|
||||
*raw++ = 0x65;
|
||||
*raw++ = ::narrow<s8>(::offset32(&spu_thread::pc));
|
||||
|
||||
// Epilogue: add rsp,0x28 ; ret
|
||||
*raw++ = 0x48;
|
||||
*raw++ = 0x83;
|
||||
*raw++ = 0xc4;
|
||||
*raw++ = 0x28;
|
||||
*raw++ = 0xc3;
|
||||
|
||||
// Register the generated code in the runtime; give up if that fails
if (!m_spurt->add(last_reset_count, fn_location, reinterpret_cast<spu_function_t>(result)))
|
||||
{
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// Send work to LLVM compiler thread; after add() to avoid race
|
||||
g_fxo->get<spu_llvm_thread>()->registered.push(fn_location, result);
|
||||
|
||||
return reinterpret_cast<spu_function_t>(result);
|
||||
}
|
||||
};
|
||||
|
||||
std::unique_ptr<spu_recompiler_base> spu_recompiler_base::make_fast_llvm_recompiler()
|
||||
{
|
||||
return std::make_unique<spu_fast>();
|
||||
}
|
||||
|
@ -104,6 +104,12 @@ public:
|
||||
// Return opaque pointer for add()
|
||||
void* find(u64 last_reset_count, const std::vector<u32>&);
|
||||
|
||||
// Get func from opaque ptr
|
||||
static inline const std::vector<u32>& get_func(void* _where)
|
||||
{
|
||||
return static_cast<decltype(m_map)::value_type*>(_where)->first;
|
||||
}
|
||||
|
||||
// Find existing function
|
||||
spu_function_t find(const u32* ls, u32 addr) const;
|
||||
|
||||
@ -134,6 +140,9 @@ public:
|
||||
// Similar to g_escape, but doing tail call to the new function.
|
||||
static void(*const g_tail_escape)(spu_thread*, spu_function_t, u8*);
|
||||
|
||||
// Interpreter table (spu_itype -> ptr)
|
||||
static std::array<u64, 256> g_interpreter_table;
|
||||
|
||||
// Interpreter entry point
|
||||
static spu_function_t g_interpreter;
|
||||
|
||||
@ -364,7 +373,7 @@ public:
|
||||
virtual void init() = 0;
|
||||
|
||||
// Compile function (may fail)
|
||||
virtual spu_function_t compile(u64 last_reset_count, const std::vector<u32>&) = 0;
|
||||
virtual spu_function_t compile(u64 last_reset_count, const std::vector<u32>&, void*) = 0;
|
||||
|
||||
// Compile function, handle failure
|
||||
void make_function(const std::vector<u32>&);
|
||||
@ -400,4 +409,7 @@ public:
|
||||
|
||||
// Create recompiler instance (LLVM)
|
||||
static std::unique_ptr<spu_recompiler_base> make_llvm_recompiler(u8 magn = 0);
|
||||
|
||||
// Create recompiler instance (interpreter-based LLVM)
|
||||
static std::unique_ptr<spu_recompiler_base> make_fast_llvm_recompiler();
|
||||
};
|
||||
|
@ -1227,7 +1227,7 @@ spu_thread::spu_thread(vm::addr_t ls, lv2_spu_group* group, u32 index, std::stri
|
||||
|
||||
if (g_cfg.core.spu_decoder == spu_decoder_type::llvm)
|
||||
{
|
||||
jit = spu_recompiler_base::make_llvm_recompiler();
|
||||
jit = spu_recompiler_base::make_fast_llvm_recompiler();
|
||||
}
|
||||
|
||||
if (g_cfg.core.spu_decoder != spu_decoder_type::fast && g_cfg.core.spu_decoder != spu_decoder_type::precise)
|
||||
|
@ -514,6 +514,12 @@ public:
|
||||
|
||||
u32 pc = 0;
|
||||
|
||||
// May be used internally by recompilers.
|
||||
u32 base_pc = 0;
|
||||
|
||||
// May be used by recompilers.
|
||||
u8* memory_base_addr = vm::g_base_addr;
|
||||
|
||||
// General-Purpose Registers
|
||||
std::array<v128, 128> gpr;
|
||||
SPU_FPSCR fpscr;
|
||||
@ -581,8 +587,6 @@ public:
|
||||
|
||||
u64 saved_native_sp = 0; // Host thread's stack pointer for emulated longjmp
|
||||
|
||||
u8* memory_base_addr = vm::g_base_addr;
|
||||
|
||||
std::array<v128, 0x4000> stack_mirror; // Return address information
|
||||
|
||||
void push_snr(u32 number, u32 value);
|
||||
|
Loading…
Reference in New Issue
Block a user