mirror of
https://github.com/RPCS3/rpcs3.git
synced 2024-11-22 02:32:36 +01:00
SPU LLVM: reintroduce branch patchpoints
Branch patchpoints were previously used only by the SPU ASMJIT recompiler; they may improve performance in some cases. Patchpoint generation is now refactored into spu_runtime::make_branch_patchpoint.
This commit is contained in:
parent
7f6a410770
commit
fb64b28886
@ -947,35 +947,11 @@ void spu_recompiler::branch_fixed(u32 target)
|
||||
return;
|
||||
}
|
||||
|
||||
c->mov(x86::rax, imm_ptr(spu_runtime::g_dispatcher + target / 4));
|
||||
c->mov(x86::rax, x86::qword_ptr(x86::rax));
|
||||
|
||||
c->mov(SPU_OFF_32(pc), target);
|
||||
c->xor_(qw0->r32(), qw0->r32());
|
||||
c->cmp(SPU_OFF_32(state), 0);
|
||||
c->jnz(label_stop);
|
||||
|
||||
if (false)
|
||||
{
|
||||
// Don't generate patch points (TODO)
|
||||
c->xor_(qw0->r32(), qw0->r32());
|
||||
c->jmp(x86::rax);
|
||||
return;
|
||||
}
|
||||
|
||||
// Set patch address as a third argument and fallback to it
|
||||
Label patch_point = c->newLabel();
|
||||
c->lea(*qw0, x86::qword_ptr(patch_point));
|
||||
|
||||
// Need to emit exactly one executable instruction within 8 bytes
|
||||
c->align(kAlignCode, 8);
|
||||
c->bind(patch_point);
|
||||
//c->dq(0x841f0f);
|
||||
c->jmp(imm_ptr(&spu_recompiler_base::branch));
|
||||
|
||||
// Fallback to the branch via dispatcher
|
||||
c->align(kAlignCode, 8);
|
||||
c->xor_(qw0->r32(), qw0->r32());
|
||||
c->jmp(x86::rax);
|
||||
c->jmp(imm_ptr(m_spurt->make_branch_patchpoint(target)));
|
||||
}
|
||||
|
||||
void spu_recompiler::branch_indirect(spu_opcode_t op, bool jt, bool ret)
|
||||
|
@ -259,6 +259,15 @@ spu_runtime::spu_runtime()
|
||||
|
||||
workload.reserve(250);
|
||||
|
||||
// Generate a trampoline to spu_recompiler_base::branch
|
||||
u8* const trptr = jit_runtime::alloc(16, 16);
|
||||
trptr[0] = 0xff; // jmp [rip]
|
||||
trptr[1] = 0x25;
|
||||
std::memset(trptr + 2, 0, 4);
|
||||
const u64 target = reinterpret_cast<u64>(&spu_recompiler_base::branch);
|
||||
std::memcpy(trptr + 6, &target, 8);
|
||||
tr_branch = reinterpret_cast<spu_function_t>(trptr);
|
||||
|
||||
LOG_SUCCESS(SPU, "SPU Recompiler Runtime initialized...");
|
||||
}
|
||||
|
||||
@ -539,6 +548,40 @@ void spu_runtime::add(std::pair<const std::vector<u32>, spu_function_t>& where,
|
||||
m_cond.notify_all();
|
||||
}
|
||||
|
||||
// Build a 16-byte patchable stub that enters spu_recompiler_base::branch via the tr_branch trampoline.
//
// Stub layout:
//   [0..6]   lea <3rd argument register>, [rip+1]  -> address of the jmp at offset 8
//   [7]      nop
//   [8..12]  jmp rel32 to tr_branch
//   [13]     0xcc
//   [14..15] bits 2..17 of `target`, little-endian (read back as *(u16*)(rip + 6) * 4)
//
// Returns the address of the stub as a callable spu_function_t.
spu_function_t spu_runtime::make_branch_patchpoint(u32 target) const
{
	// NOTE(review): the allocation result is used unchecked — confirm jit_runtime::alloc cannot return null
	u8* const stub = jit_runtime::alloc(16, 16);

	// The lea is 7 bytes long, so [rip+1] resolves to stub + 8 (the jmp below)
#ifdef _WIN32
	const u8 prologue[8] = {0x4c, 0x8d, 0x05, 0x01, 0x00, 0x00, 0x00, 0x90}; // lea r8, [rip+1]; nop
#else
	const u8 prologue[8] = {0x48, 0x8d, 0x15, 0x01, 0x00, 0x00, 0x00, 0x90}; // lea rdx, [rip+1]; nop
#endif
	std::memcpy(stub, prologue, sizeof(prologue));

	// Relative jump to the spu_recompiler_base::branch trampoline; displacement is counted from the end of the 5-byte jmp
	u8* const jump = stub + 8;
	const s64 rel = reinterpret_cast<u64>(tr_branch) - reinterpret_cast<u64>(jump) - 5;

	jump[0] = 0xe9; // jmp rel32
	// NOTE(review): only the low 4 bytes of rel are stored — presumably the trampoline is always within rel32 range; confirm
	std::memcpy(jump + 1, &rel, 4);
	jump[5] = 0xcc;

	// Compressed target address (target / 4, truncated to 16 bits)
	stub[14] = target >> 2;
	stub[15] = target >> 10;

	return reinterpret_cast<spu_function_t>(stub);
}
|
||||
|
||||
// Constructor with an empty body: performs no work of its own.
spu_recompiler_base::spu_recompiler_base()
|
||||
{
|
||||
}
|
||||
@ -549,10 +592,25 @@ spu_recompiler_base::~spu_recompiler_base()
|
||||
|
||||
void spu_recompiler_base::dispatch(spu_thread& spu, void*, u8* rip)
|
||||
{
|
||||
// If code verification failed from a patched patchpoint, clear it with a single NOP
|
||||
// If code verification failed from a patched patchpoint, clear it with a dispatcher jump
|
||||
if (rip)
|
||||
{
|
||||
atomic_storage<u64>::release(*reinterpret_cast<u64*>(rip), 0x841f0f);
|
||||
const u32 target = *(u16*)(rip + 6) * 4;
|
||||
const s64 rel = reinterpret_cast<u64>(spu_runtime::g_dispatcher) + 2 * target - reinterpret_cast<u64>(rip - 8) - 6;
|
||||
|
||||
union
|
||||
{
|
||||
u8 bytes[8];
|
||||
u64 result;
|
||||
};
|
||||
|
||||
bytes[0] = 0xff; // jmp [rip + 0x...]
|
||||
bytes[1] = 0x25;
|
||||
std::memcpy(bytes + 2, &rel, 4);
|
||||
bytes[6] = 0x90;
|
||||
bytes[7] = 0x90;
|
||||
|
||||
atomic_storage<u64>::release(*reinterpret_cast<u64*>(rip - 8), result);
|
||||
}
|
||||
|
||||
// Second attempt (recover from the recursion after repeated unsuccessful trampoline call)
|
||||
@ -580,7 +638,7 @@ void spu_recompiler_base::dispatch(spu_thread& spu, void*, u8* rip)
|
||||
void spu_recompiler_base::branch(spu_thread& spu, void*, u8* rip)
|
||||
{
|
||||
// Compile (TODO: optimize search of the existing functions)
|
||||
const auto func = verify(HERE, spu.jit->compile(spu.jit->block(spu._ptr<u32>(0), spu.pc)));
|
||||
const auto func = verify(HERE, spu.jit->compile(spu.jit->block(spu._ptr<u32>(0), *(u16*)(rip + 6) * 4)));
|
||||
|
||||
// Overwrite jump to this function with jump to the compiled function
|
||||
const s64 rel = reinterpret_cast<u64>(func) - reinterpret_cast<u64>(rip) - 5;
|
||||
@ -599,23 +657,22 @@ void spu_recompiler_base::branch(spu_thread& spu, void*, u8* rip)
|
||||
{
|
||||
bytes[0] = 0xeb; // jmp rel8
|
||||
bytes[1] = static_cast<s8>(rel8);
|
||||
std::memset(bytes + 2, 0x90, 6);
|
||||
std::memset(bytes + 2, 0xcc, 4);
|
||||
}
|
||||
else
|
||||
{
|
||||
bytes[0] = 0xe9; // jmp rel32
|
||||
std::memcpy(bytes + 1, &rel, 4);
|
||||
std::memset(bytes + 5, 0x90, 3);
|
||||
bytes[5] = 0xcc;
|
||||
}
|
||||
|
||||
// Preserve target address
|
||||
bytes[6] = rip[6];
|
||||
bytes[7] = rip[7];
|
||||
}
|
||||
else
|
||||
{
|
||||
// Far jumps: extremely rare and disabled due to implementation complexity
|
||||
LOG_ERROR(SPU, "Impossible far jump");
|
||||
bytes[0] = 0x0f; // nop (8-byte form)
|
||||
bytes[1] = 0x1f;
|
||||
bytes[2] = 0x84;
|
||||
std::memset(bytes + 3, 0x00, 5);
|
||||
fmt::throw_exception("Impossible far jump: %p -> %p", rip, func);
|
||||
}
|
||||
|
||||
atomic_storage<u64>::release(*reinterpret_cast<u64*>(rip), result);
|
||||
@ -1985,13 +2042,13 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator
|
||||
LOG_ERROR(SPU, "[0x%x] Predecessor not found for target 0x%x (chunk=0x%x, entry=0x%x, size=%u)", m_pos, target, m_entry, m_function_queue[0], m_size / 4);
|
||||
}
|
||||
|
||||
// Generate external indirect tail call
|
||||
// Generate a patchpoint for fixed location
|
||||
const auto cblock = m_ir->GetInsertBlock();
|
||||
const auto result = llvm::BasicBlock::Create(m_context, "", m_function);
|
||||
m_ir->SetInsertPoint(result);
|
||||
m_ir->CreateStore(m_ir->getInt32(target), spu_ptr<u32>(&spu_thread::pc));
|
||||
const auto type = llvm::FunctionType::get(get_type<void>(), {get_type<u8*>(), get_type<u8*>(), get_type<u32>()}, false)->getPointerTo()->getPointerTo();
|
||||
tail(m_ir->CreateLoad(m_ir->CreateIntToPtr(m_ir->getInt64((u64)(spu_runtime::g_dispatcher + target / 4)), type)));
|
||||
const auto type = llvm::FunctionType::get(get_type<void>(), {get_type<u8*>(), get_type<u8*>(), get_type<u32>()}, false)->getPointerTo();
|
||||
tail(m_ir->CreateIntToPtr(m_ir->getInt64((u64)m_spurt->make_branch_patchpoint(target)), type));
|
||||
m_ir->SetInsertPoint(cblock);
|
||||
return result;
|
||||
}
|
||||
|
@ -67,12 +67,18 @@ private:
|
||||
// Trampoline to spu_recompiler_base::dispatch
|
||||
spu_function_t tr_dispatch = nullptr;
|
||||
|
||||
// Trampoline to spu_recompiler_base::branch
|
||||
spu_function_t tr_branch = nullptr;
|
||||
|
||||
public:
|
||||
spu_runtime();
|
||||
|
||||
// Add compiled function and generate trampoline if necessary
|
||||
void add(std::pair<const std::vector<u32>, spu_function_t>& where, spu_function_t compiled);
|
||||
|
||||
// Generate a patchable trampoline to spu_recompiler_base::branch
|
||||
spu_function_t make_branch_patchpoint(u32 target) const;
|
||||
|
||||
// All dispatchers (array allocated in jit memory)
|
||||
static atomic_t<spu_function_t>* const g_dispatcher;
|
||||
};
|
||||
|
Loading…
Reference in New Issue
Block a user