1
0
mirror of https://github.com/RPCS3/rpcs3.git synced 2024-11-22 02:32:36 +01:00

SPU LLVM: reintroduce branch patchpoints

Previously only used on SPU ASMJIT, may improve perf in some cases.
Now refactored to spu_runtime::make_branch_patchpoint.
This commit is contained in:
Nekotekina 2019-02-28 01:08:18 +03:00
parent 7f6a410770
commit fb64b28886
3 changed files with 79 additions and 40 deletions

View File

@ -947,35 +947,11 @@ void spu_recompiler::branch_fixed(u32 target)
return;
}
c->mov(x86::rax, imm_ptr(spu_runtime::g_dispatcher + target / 4));
c->mov(x86::rax, x86::qword_ptr(x86::rax));
c->mov(SPU_OFF_32(pc), target);
c->xor_(qw0->r32(), qw0->r32());
c->cmp(SPU_OFF_32(state), 0);
c->jnz(label_stop);
if (false)
{
// Don't generate patch points (TODO)
c->xor_(qw0->r32(), qw0->r32());
c->jmp(x86::rax);
return;
}
// Set patch address as a third argument and fallback to it
Label patch_point = c->newLabel();
c->lea(*qw0, x86::qword_ptr(patch_point));
// Need to emit exactly one executable instruction within 8 bytes
c->align(kAlignCode, 8);
c->bind(patch_point);
//c->dq(0x841f0f);
c->jmp(imm_ptr(&spu_recompiler_base::branch));
// Fallback to the branch via dispatcher
c->align(kAlignCode, 8);
c->xor_(qw0->r32(), qw0->r32());
c->jmp(x86::rax);
c->jmp(imm_ptr(m_spurt->make_branch_patchpoint(target)));
}
void spu_recompiler::branch_indirect(spu_opcode_t op, bool jt, bool ret)

View File

@ -259,6 +259,15 @@ spu_runtime::spu_runtime()
workload.reserve(250);
// Generate a trampoline to spu_recompiler_base::branch
u8* const trptr = jit_runtime::alloc(16, 16);
trptr[0] = 0xff; // jmp [rip]
trptr[1] = 0x25;
std::memset(trptr + 2, 0, 4);
const u64 target = reinterpret_cast<u64>(&spu_recompiler_base::branch);
std::memcpy(trptr + 6, &target, 8);
tr_branch = reinterpret_cast<spu_function_t>(trptr);
LOG_SUCCESS(SPU, "SPU Recompiler Runtime initialized...");
}
@ -539,6 +548,40 @@ void spu_runtime::add(std::pair<const std::vector<u32>, spu_function_t>& where,
m_cond.notify_all();
}
spu_function_t spu_runtime::make_branch_patchpoint(u32 target) const
{
	// Layout of the 16-byte stub produced here:
	//   [0..6]   lea r8/rdx, [rip+1]  - loads the address of byte 8 (the jmp below)
	//   [7]      nop
	//   [8..12]  jmp rel32            - goes to the tr_branch trampoline
	//   [13]     int3
	//   [14..15] target / 4, low byte first (read back as u16 at "jmp address + 6")
	u8* const stub = jit_runtime::alloc(16, 16);

	// Fixed leading bytes: the lea, the nop and the jmp opcode
	const u8 head[9] =
	{
#ifdef _WIN32
		0x4c, 0x8d, 0x05, // lea r8, [rip+disp32]
#else
		0x48, 0x8d, 0x15, // lea rdx, [rip+disp32]
#endif
		0x01, 0x00, 0x00, 0x00, // disp32 = 1: skips the nop, lands on the jmp
		0x90, // nop
		0xe9, // jmp rel32 opcode
	};

	std::memcpy(stub, head, sizeof(head));

	// Displacement is relative to the end of the 5-byte jmp (stub + 13); only the low 4 bytes are stored
	const s64 disp = reinterpret_cast<u64>(tr_branch) - reinterpret_cast<u64>(stub + 13);
	std::memcpy(stub + 9, &disp, 4);

	// Padding byte after the jmp
	stub[13] = 0xcc;

	// Compressed target address: bits 2..17 of target packed into two bytes
	stub[14] = static_cast<u8>(target >> 2);
	stub[15] = static_cast<u8>(target >> 10);

	return reinterpret_cast<spu_function_t>(stub);
}
// Nothing is initialized explicitly in the constructor body; default construction is used as-is.
spu_recompiler_base::spu_recompiler_base() = default;
@ -549,10 +592,25 @@ spu_recompiler_base::~spu_recompiler_base()
void spu_recompiler_base::dispatch(spu_thread& spu, void*, u8* rip)
{
// If code verification failed from a patched patchpoint, clear it with a single NOP
// If code verification failed from a patched patchpoint, clear it with a dispatcher jump
if (rip)
{
atomic_storage<u64>::release(*reinterpret_cast<u64*>(rip), 0x841f0f);
const u32 target = *(u16*)(rip + 6) * 4;
const s64 rel = reinterpret_cast<u64>(spu_runtime::g_dispatcher) + 2 * target - reinterpret_cast<u64>(rip - 8) - 6;
union
{
u8 bytes[8];
u64 result;
};
bytes[0] = 0xff; // jmp [rip + 0x...]
bytes[1] = 0x25;
std::memcpy(bytes + 2, &rel, 4);
bytes[6] = 0x90;
bytes[7] = 0x90;
atomic_storage<u64>::release(*reinterpret_cast<u64*>(rip - 8), result);
}
// Second attempt (recover from the recursion after repeated unsuccessful trampoline call)
@ -580,7 +638,7 @@ void spu_recompiler_base::dispatch(spu_thread& spu, void*, u8* rip)
void spu_recompiler_base::branch(spu_thread& spu, void*, u8* rip)
{
// Compile (TODO: optimize search of the existing functions)
const auto func = verify(HERE, spu.jit->compile(spu.jit->block(spu._ptr<u32>(0), spu.pc)));
const auto func = verify(HERE, spu.jit->compile(spu.jit->block(spu._ptr<u32>(0), *(u16*)(rip + 6) * 4)));
// Overwrite jump to this function with jump to the compiled function
const s64 rel = reinterpret_cast<u64>(func) - reinterpret_cast<u64>(rip) - 5;
@ -599,23 +657,22 @@ void spu_recompiler_base::branch(spu_thread& spu, void*, u8* rip)
{
bytes[0] = 0xeb; // jmp rel8
bytes[1] = static_cast<s8>(rel8);
std::memset(bytes + 2, 0x90, 6);
std::memset(bytes + 2, 0xcc, 4);
}
else
{
bytes[0] = 0xe9; // jmp rel32
std::memcpy(bytes + 1, &rel, 4);
std::memset(bytes + 5, 0x90, 3);
bytes[5] = 0xcc;
}
// Preserve target address
bytes[6] = rip[6];
bytes[7] = rip[7];
}
else
{
// Far jumps: extremely rare and disabled due to implementation complexity
LOG_ERROR(SPU, "Impossible far jump");
bytes[0] = 0x0f; // nop (8-byte form)
bytes[1] = 0x1f;
bytes[2] = 0x84;
std::memset(bytes + 3, 0x00, 5);
fmt::throw_exception("Impossible far jump: %p -> %p", rip, func);
}
atomic_storage<u64>::release(*reinterpret_cast<u64*>(rip), result);
@ -1985,13 +2042,13 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator
LOG_ERROR(SPU, "[0x%x] Predecessor not found for target 0x%x (chunk=0x%x, entry=0x%x, size=%u)", m_pos, target, m_entry, m_function_queue[0], m_size / 4);
}
// Generate external indirect tail call
// Generate a patchpoint for fixed location
const auto cblock = m_ir->GetInsertBlock();
const auto result = llvm::BasicBlock::Create(m_context, "", m_function);
m_ir->SetInsertPoint(result);
m_ir->CreateStore(m_ir->getInt32(target), spu_ptr<u32>(&spu_thread::pc));
const auto type = llvm::FunctionType::get(get_type<void>(), {get_type<u8*>(), get_type<u8*>(), get_type<u32>()}, false)->getPointerTo()->getPointerTo();
tail(m_ir->CreateLoad(m_ir->CreateIntToPtr(m_ir->getInt64((u64)(spu_runtime::g_dispatcher + target / 4)), type)));
const auto type = llvm::FunctionType::get(get_type<void>(), {get_type<u8*>(), get_type<u8*>(), get_type<u32>()}, false)->getPointerTo();
tail(m_ir->CreateIntToPtr(m_ir->getInt64((u64)m_spurt->make_branch_patchpoint(target)), type));
m_ir->SetInsertPoint(cblock);
return result;
}

View File

@ -67,12 +67,18 @@ private:
// Trampoline to spu_recompiler_base::dispatch
spu_function_t tr_dispatch = nullptr;
// Trampoline to spu_recompiler_base::branch
spu_function_t tr_branch = nullptr;
public:
spu_runtime();
// Add compiled function and generate trampoline if necessary
void add(std::pair<const std::vector<u32>, spu_function_t>& where, spu_function_t compiled);
// Generate a patchable trampoline to spu_recompiler_base::branch
spu_function_t make_branch_patchpoint(u32 target) const;
// All dispatchers (array allocated in jit memory)
static atomic_t<spu_function_t>* const g_dispatcher;
};