mirror of
https://github.com/RPCS3/rpcs3.git
synced 2024-11-22 02:32:36 +01:00
TSX: new fallback method (time-based)
The new fallback is based on the timestamp counter (TSC). vm::reservation_op has been rewritten on the same principle, and another transaction helper has been rewritten as well. Two new settings are added for configuring the fallbacks: a first and a second limit, both specified in nanoseconds. The PUTLLC reload logic is fixed (prevents reusing garbage).
This commit is contained in:
parent
80530e8aef
commit
86fc842c89
@ -57,31 +57,68 @@ namespace asmjit
|
||||
|
||||
// Emit xbegin and adjacent loop, return label at xbegin (don't use xabort please)
|
||||
template <typename F>
|
||||
[[nodiscard]] inline asmjit::Label build_transaction_enter(asmjit::X86Assembler& c, asmjit::Label fallback, const asmjit::X86Gp& ctr, uint less_than, F func)
|
||||
[[nodiscard]] inline asmjit::Label build_transaction_enter(asmjit::X86Assembler& c, asmjit::Label fallback, F func)
|
||||
{
|
||||
Label fall = c.newLabel();
|
||||
Label begin = c.newLabel();
|
||||
c.jmp(begin);
|
||||
c.bind(fall);
|
||||
|
||||
// First invoked after failure
|
||||
func();
|
||||
|
||||
c.add(ctr, 1);
|
||||
|
||||
// Don't repeat on zero status (may indicate syscall or interrupt)
|
||||
c.test(x86::eax, x86::eax);
|
||||
c.jz(fallback);
|
||||
|
||||
// First invoked after failure (can fallback to proceed, or jump anywhere else)
|
||||
func();
|
||||
|
||||
// Other bad statuses are ignored regardless of repeat flag (TODO)
|
||||
c.cmp(ctr, less_than);
|
||||
c.jae(fallback);
|
||||
c.align(kAlignCode, 16);
|
||||
c.bind(begin);
|
||||
return fall;
|
||||
|
||||
// xbegin should be issued manually, allows to add more check before entering transaction
|
||||
}
|
||||
|
||||
// Helper to spill RDX (EDX) register for RDTSC
|
||||
inline void build_swap_rdx_with(asmjit::X86Assembler& c, std::array<X86Gp, 4>& args, const asmjit::X86Gp& with)
|
||||
{
|
||||
#ifdef _WIN32
|
||||
c.xchg(args[1], with);
|
||||
args[1] = with;
|
||||
#else
|
||||
c.xchg(args[2], with);
|
||||
args[2] = with;
|
||||
#endif
|
||||
}
|
||||
|
||||
// Get full RDTSC value into chosen register (clobbers rax/rdx or saves only rax with other target)
|
||||
inline void build_get_tsc(asmjit::X86Assembler& c, const asmjit::X86Gp& to = asmjit::x86::rax)
|
||||
{
|
||||
if (&to != &x86::rax && &to != &x86::rdx)
|
||||
{
|
||||
// Swap to save its contents
|
||||
c.xchg(x86::rax, to);
|
||||
}
|
||||
|
||||
c.rdtsc();
|
||||
c.shl(x86::rdx, 32);
|
||||
|
||||
if (&to == &x86::rax)
|
||||
{
|
||||
c.or_(x86::rax, x86::rdx);
|
||||
}
|
||||
else if (&to == &x86::rdx)
|
||||
{
|
||||
c.or_(x86::rdx, x86::rax);
|
||||
}
|
||||
else
|
||||
{
|
||||
// Swap back, maybe there is more effective way to do it
|
||||
c.xchg(x86::rax, to);
|
||||
c.mov(to.r32(), to.r32());
|
||||
c.or_(to.r64(), x86::rdx);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Build runtime function with asmjit::X86Assembler
|
||||
|
@ -2,15 +2,18 @@
|
||||
|
||||
#include "types.h"
|
||||
|
||||
extern bool g_use_rtm;
|
||||
extern u64 g_rtm_tx_limit1;
|
||||
|
||||
namespace utils
|
||||
{
|
||||
// Transaction helper (Max = max attempts) (result = pair of success and op result)
|
||||
template <uint Max = 10, typename F, typename R = std::invoke_result_t<F>>
|
||||
// Transaction helper (result = pair of success and op result, or just bool)
|
||||
template <typename F, typename R = std::invoke_result_t<F>>
|
||||
inline auto tx_start(F op)
|
||||
{
|
||||
uint status = -1;
|
||||
|
||||
for (uint i = 0; i < Max; i++)
|
||||
for (auto stamp0 = __rdtsc(), stamp1 = stamp0; g_use_rtm && stamp1 - stamp0 <= g_rtm_tx_limit1; stamp1 = __rdtsc())
|
||||
{
|
||||
#ifndef _MSC_VER
|
||||
__asm__ goto ("xbegin %l[retry];" ::: "memory" : retry);
|
||||
|
@ -73,6 +73,11 @@ public:
|
||||
return !!(state & (cpu_flag::suspend + cpu_flag::dbg_global_pause + cpu_flag::dbg_pause));
|
||||
}
|
||||
|
||||
bool has_pause_flag() const
|
||||
{
|
||||
return !!(state & cpu_flag::pause);
|
||||
}
|
||||
|
||||
// Check thread type
|
||||
u32 id_type() const
|
||||
{
|
||||
|
@ -292,7 +292,7 @@ namespace _spurs
|
||||
namespace _spurs
|
||||
{
|
||||
// Add workload
|
||||
s32 add_workload(vm::ptr<CellSpurs> spurs, vm::ptr<u32> wid, vm::cptr<void> pm, u32 size, u64 data, const u8(&priorityTable)[8], u32 minContention, u32 maxContention, vm::cptr<char> nameClass, vm::cptr<char> nameInstance, vm::ptr<CellSpursShutdownCompletionEventHook> hook, vm::ptr<void> hookArg);
|
||||
s32 add_workload(ppu_thread& ppu, vm::ptr<CellSpurs> spurs, vm::ptr<u32> wid, vm::cptr<void> pm, u32 size, u64 data, const u8(&priorityTable)[8], u32 minContention, u32 maxContention, vm::cptr<char> nameClass, vm::cptr<char> nameInstance, vm::ptr<CellSpursShutdownCompletionEventHook> hook, vm::ptr<void> hookArg);
|
||||
}
|
||||
|
||||
//s32 _cellSpursWorkloadAttributeInitialize(vm::ptr<CellSpursWorkloadAttribute> attr, u32 revision, u32 sdkVersion, vm::cptr<void> pm, u32 size, u64 data, vm::cptr<u8[8]> priority, u32 minCnt, u32 maxCnt);
|
||||
@ -2295,7 +2295,7 @@ s32 cellSpursWorkloadAttributeSetShutdownCompletionEventHook(vm::ptr<CellSpursWo
|
||||
return CELL_OK;
|
||||
}
|
||||
|
||||
s32 _spurs::add_workload(vm::ptr<CellSpurs> spurs, vm::ptr<u32> wid, vm::cptr<void> pm, u32 size, u64 data, const u8(&priorityTable)[8], u32 minContention, u32 maxContention, vm::cptr<char> nameClass, vm::cptr<char> nameInstance, vm::ptr<CellSpursShutdownCompletionEventHook> hook, vm::ptr<void> hookArg)
|
||||
s32 _spurs::add_workload(ppu_thread& ppu, vm::ptr<CellSpurs> spurs, vm::ptr<u32> wid, vm::cptr<void> pm, u32 size, u64 data, const u8(&priorityTable)[8], u32 minContention, u32 maxContention, vm::cptr<char> nameClass, vm::cptr<char> nameInstance, vm::ptr<CellSpursShutdownCompletionEventHook> hook, vm::ptr<void> hookArg)
|
||||
{
|
||||
if (!spurs || !wid || !pm)
|
||||
{
|
||||
@ -2420,7 +2420,7 @@ s32 _spurs::add_workload(vm::ptr<CellSpurs> spurs, vm::ptr<u32> wid, vm::cptr<vo
|
||||
|
||||
u32 res_wkl;
|
||||
const auto wkl = &spurs->wklInfo(wnum);
|
||||
vm::reservation_op(vm::unsafe_ptr_cast<spurs_wkl_state_op>(spurs.ptr(&CellSpurs::wklState1)), [&](spurs_wkl_state_op& op)
|
||||
vm::reservation_op(ppu, vm::unsafe_ptr_cast<spurs_wkl_state_op>(spurs.ptr(&CellSpurs::wklState1)), [&](spurs_wkl_state_op& op)
|
||||
{
|
||||
const u32 mask = op.wklMskB & ~(0x80000000u >> wnum);
|
||||
res_wkl = 0;
|
||||
@ -2456,12 +2456,12 @@ s32 _spurs::add_workload(vm::ptr<CellSpurs> spurs, vm::ptr<u32> wid, vm::cptr<vo
|
||||
}
|
||||
|
||||
/// Add workload
|
||||
s32 cellSpursAddWorkload(vm::ptr<CellSpurs> spurs, vm::ptr<u32> wid, vm::cptr<void> pm, u32 size, u64 data, vm::cptr<u8[8]> priority, u32 minCnt, u32 maxCnt)
|
||||
s32 cellSpursAddWorkload(ppu_thread& ppu, vm::ptr<CellSpurs> spurs, vm::ptr<u32> wid, vm::cptr<void> pm, u32 size, u64 data, vm::cptr<u8[8]> priority, u32 minCnt, u32 maxCnt)
|
||||
{
|
||||
cellSpurs.warning("cellSpursAddWorkload(spurs=*0x%x, wid=*0x%x, pm=*0x%x, size=0x%x, data=0x%llx, priority=*0x%x, minCnt=0x%x, maxCnt=0x%x)",
|
||||
spurs, wid, pm, size, data, priority, minCnt, maxCnt);
|
||||
|
||||
return _spurs::add_workload(spurs, wid, pm, size, data, *priority, minCnt, maxCnt, vm::null, vm::null, vm::null, vm::null);
|
||||
return _spurs::add_workload(ppu, spurs, wid, pm, size, data, *priority, minCnt, maxCnt, vm::null, vm::null, vm::null, vm::null);
|
||||
}
|
||||
|
||||
/// Add workload
|
||||
@ -2484,7 +2484,7 @@ s32 cellSpursAddWorkloadWithAttribute(ppu_thread& ppu, vm::ptr<CellSpurs> spurs,
|
||||
return CELL_SPURS_POLICY_MODULE_ERROR_INVAL;
|
||||
}
|
||||
|
||||
return _spurs::add_workload(spurs, wid, attr->pm, attr->size, attr->data, attr->priority, attr->minContention, attr->maxContention, attr->nameClass, attr->nameInstance, attr->hook, attr->hookArg);
|
||||
return _spurs::add_workload(ppu, spurs, wid, attr->pm, attr->size, attr->data, attr->priority, attr->minContention, attr->maxContention, attr->nameClass, attr->nameInstance, attr->hook, attr->hookArg);
|
||||
}
|
||||
|
||||
/// Request workload shutdown
|
||||
@ -2506,7 +2506,7 @@ s32 cellSpursShutdownWorkload(ppu_thread& ppu, vm::ptr<CellSpurs> spurs, u32 wid
|
||||
|
||||
bool send_event;
|
||||
s32 rc, old_state;
|
||||
if (!vm::reservation_op(vm::unsafe_ptr_cast<spurs_wkl_state_op>(spurs.ptr(&CellSpurs::wklState1)), [&](spurs_wkl_state_op& op)
|
||||
if (!vm::reservation_op(ppu, vm::unsafe_ptr_cast<spurs_wkl_state_op>(spurs.ptr(&CellSpurs::wklState1)), [&](spurs_wkl_state_op& op)
|
||||
{
|
||||
auto& state = wid < CELL_SPURS_MAX_WORKLOAD ? op.wklState1[wid] : op.wklState2[wid % 16];
|
||||
|
||||
@ -2663,7 +2663,7 @@ s32 cellSpursRemoveWorkload(ppu_thread& ppu, vm::ptr<CellSpurs> spurs, u32 wid)
|
||||
}
|
||||
|
||||
s32 rc;
|
||||
vm::reservation_op(vm::unsafe_ptr_cast<spurs_wkl_state_op>(spurs.ptr(&CellSpurs::wklState1)), [&](spurs_wkl_state_op& op)
|
||||
vm::reservation_op(ppu, vm::unsafe_ptr_cast<spurs_wkl_state_op>(spurs.ptr(&CellSpurs::wklState1)), [&](spurs_wkl_state_op& op)
|
||||
{
|
||||
auto& state = wid < CELL_SPURS_MAX_WORKLOAD ? op.wklState1[wid] : op.wklState2[wid % 16];
|
||||
|
||||
@ -3040,7 +3040,7 @@ s32 _cellSpursWorkloadFlagReceiver(ppu_thread& ppu, vm::ptr<CellSpurs> spurs, u3
|
||||
};
|
||||
|
||||
s32 res;
|
||||
vm::reservation_op(vm::unsafe_ptr_cast<wklFlagOp>(spurs), [&](wklFlagOp& val)
|
||||
vm::reservation_op(ppu, vm::unsafe_ptr_cast<wklFlagOp>(spurs), [&](wklFlagOp& val)
|
||||
{
|
||||
if (is_set)
|
||||
{
|
||||
@ -3189,7 +3189,7 @@ s32 cellSpursEventFlagSet(ppu_thread& ppu, vm::ptr<CellSpursEventFlag> eventFlag
|
||||
u16 pendingRecv;
|
||||
u16 pendingRecvTaskEvents[16];
|
||||
|
||||
vm::reservation_op(vm::unsafe_ptr_cast<CellSpursEventFlag_x00>(eventFlag), [bits, &send, &ppuWaitSlot, &ppuEvents, &pendingRecv, &pendingRecvTaskEvents](CellSpursEventFlag_x00& eventFlag)
|
||||
vm::reservation_op(ppu, vm::unsafe_ptr_cast<CellSpursEventFlag_x00>(eventFlag), [bits, &send, &ppuWaitSlot, &ppuEvents, &pendingRecv, &pendingRecvTaskEvents](CellSpursEventFlag_x00& eventFlag)
|
||||
{
|
||||
send = false;
|
||||
ppuWaitSlot = 0;
|
||||
@ -4081,7 +4081,7 @@ s32 _cellSpursSendSignal(ppu_thread& ppu, vm::ptr<CellSpursTaskset> taskset, u32
|
||||
|
||||
int signal;
|
||||
|
||||
vm::reservation_op(vm::unsafe_ptr_cast<spurs_taskset_signal_op>(taskset), [&](spurs_taskset_signal_op& op)
|
||||
vm::reservation_op(ppu, vm::unsafe_ptr_cast<spurs_taskset_signal_op>(taskset), [&](spurs_taskset_signal_op& op)
|
||||
{
|
||||
const u32 signalled = op.signalled[taskId / 32];
|
||||
const u32 running = op.running[taskId / 32];
|
||||
@ -4972,7 +4972,7 @@ s32 cellSpursJobGuardNotify(ppu_thread& ppu, vm::ptr<CellSpursJobGuard> jobGuard
|
||||
u32 allow_jobchain_run = 0; // Affects cellSpursJobChainRun execution
|
||||
u32 old = 0;
|
||||
|
||||
const bool ok = vm::reservation_op(vm::unsafe_ptr_cast<CellSpursJobGuard_x00>(jobGuard), [&](CellSpursJobGuard_x00& jg)
|
||||
const bool ok = vm::reservation_op(ppu, vm::unsafe_ptr_cast<CellSpursJobGuard_x00>(jobGuard), [&](CellSpursJobGuard_x00& jg)
|
||||
{
|
||||
allow_jobchain_run = jg.zero;
|
||||
old = jg.ncount0;
|
||||
@ -5136,7 +5136,7 @@ s32 cellSpursAddUrgentCommand(ppu_thread& ppu, vm::ptr<CellSpursJobChain> jobCha
|
||||
|
||||
s32 result = CELL_OK;
|
||||
|
||||
vm::reservation_op(vm::unsafe_ptr_cast<CellSpursJobChain_x00>(jobChain), [&](CellSpursJobChain_x00& jch)
|
||||
vm::reservation_op(ppu, vm::unsafe_ptr_cast<CellSpursJobChain_x00>(jobChain), [&](CellSpursJobChain_x00& jch)
|
||||
{
|
||||
for (auto& cmd : jch.urgentCmds)
|
||||
{
|
||||
|
@ -2074,7 +2074,7 @@ void spursJobchainPopUrgentCommand(spu_thread& spu)
|
||||
const auto jc = vm::unsafe_ptr_cast<CellSpursJobChain_x00>(+ctxt->jobChain);
|
||||
|
||||
const bool alterQueue = ctxt->unkFlag0;
|
||||
vm::reservation_op(jc, [&](CellSpursJobChain_x00& op)
|
||||
vm::reservation_op(spu, jc, [&](CellSpursJobChain_x00& op)
|
||||
{
|
||||
const auto ls = reinterpret_cast<CellSpursJobChain_x00*>(ctxt->tempAreaJobChain);
|
||||
|
||||
|
@ -1216,6 +1216,7 @@ static T ppu_load_acquire_reservation(ppu_thread& ppu, u32 addr)
|
||||
{
|
||||
ppu.rtime = ppu.last_ftime;
|
||||
ppu.raddr = ppu.last_faddr;
|
||||
ppu.last_ftime = 0;
|
||||
return static_cast<T>(rdata << data_off >> size_off);
|
||||
}
|
||||
|
||||
@ -1261,7 +1262,7 @@ extern u64 ppu_ldarx(ppu_thread& ppu, u32 addr)
|
||||
return ppu_load_acquire_reservation<u64>(ppu, addr);
|
||||
}
|
||||
|
||||
const auto ppu_stcx_accurate_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime, const void* _old, u64 _new)>([](asmjit::X86Assembler& c, auto& args)
|
||||
const auto ppu_stcx_accurate_tx = build_function_asm<u64(*)(u32 raddr, u64 rtime, const void* _old, u64 _new)>([](asmjit::X86Assembler& c, auto& args)
|
||||
{
|
||||
using namespace asmjit;
|
||||
|
||||
@ -1282,6 +1283,8 @@ const auto ppu_stcx_accurate_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime
|
||||
c.push(x86::r13);
|
||||
c.push(x86::r12);
|
||||
c.push(x86::rbx);
|
||||
c.push(x86::r14);
|
||||
c.push(x86::r15);
|
||||
c.sub(x86::rsp, 40);
|
||||
#ifdef _WIN32
|
||||
if (!s_tsx_avx)
|
||||
@ -1292,6 +1295,7 @@ const auto ppu_stcx_accurate_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime
|
||||
#endif
|
||||
|
||||
// Prepare registers
|
||||
build_swap_rdx_with(c, args, x86::r12);
|
||||
c.mov(x86::rbx, imm_ptr(+vm::g_reservations));
|
||||
c.mov(x86::rax, imm_ptr(&vm::g_sudo_addr));
|
||||
c.mov(x86::rbp, x86::qword_ptr(x86::rax));
|
||||
@ -1305,7 +1309,6 @@ const auto ppu_stcx_accurate_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime
|
||||
c.and_(x86::rbx, -128 / 2);
|
||||
c.prefetchw(x86::byte_ptr(x86::rbx));
|
||||
c.and_(args[0].r32(), 63);
|
||||
c.mov(x86::r12d, 1);
|
||||
c.mov(x86::r13, args[1]);
|
||||
|
||||
// Prepare data
|
||||
@ -1328,8 +1331,20 @@ const auto ppu_stcx_accurate_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime
|
||||
c.movaps(x86::xmm7, x86::oword_ptr(args[2], 112));
|
||||
}
|
||||
|
||||
// Alloc r14 to stamp0
|
||||
const auto stamp0 = x86::r14;
|
||||
const auto stamp1 = x86::r15;
|
||||
build_get_tsc(c, stamp0);
|
||||
|
||||
// Begin transaction
|
||||
Label tx0 = build_transaction_enter(c, fall, x86::r12d, 4, []{});
|
||||
Label tx0 = build_transaction_enter(c, fall, [&]()
|
||||
{
|
||||
build_get_tsc(c, stamp1);
|
||||
c.sub(stamp1, stamp0);
|
||||
c.cmp(stamp1, imm_ptr(&g_rtm_tx_limit1));
|
||||
c.xor_(x86::eax, x86::eax);
|
||||
c.jae(fall);
|
||||
});
|
||||
c.bt(x86::dword_ptr(args[2], ::offset32(&spu_thread::state) - ::offset32(&ppu_thread::rdata)), static_cast<u32>(cpu_flag::pause));
|
||||
c.mov(x86::eax, _XABORT_EXPLICIT);
|
||||
c.jc(fall);
|
||||
@ -1380,7 +1395,8 @@ const auto ppu_stcx_accurate_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime
|
||||
// Update reservation
|
||||
c.sub(x86::qword_ptr(x86::rbx), -128);
|
||||
c.xend();
|
||||
c.mov(x86::eax, x86::r12d);
|
||||
build_get_tsc(c);
|
||||
c.sub(x86::rax, stamp0);
|
||||
c.jmp(_ret);
|
||||
|
||||
// XABORT is expensive so finish with xend instead
|
||||
@ -1411,6 +1427,7 @@ const auto ppu_stcx_accurate_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime
|
||||
|
||||
c.bind(skip);
|
||||
c.xend();
|
||||
build_get_tsc(c, stamp1);
|
||||
c.mov(x86::eax, _XABORT_EXPLICIT);
|
||||
//c.jmp(fall);
|
||||
|
||||
@ -1436,11 +1453,28 @@ const auto ppu_stcx_accurate_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime
|
||||
c.test(x86::eax, vm::rsrv_unique_lock);
|
||||
c.jnz(fail2);
|
||||
|
||||
// Allow only first shared lock to proceed
|
||||
// Check if already updated
|
||||
c.and_(x86::rax, -128);
|
||||
c.cmp(x86::rax, x86::r13);
|
||||
c.jne(fail2);
|
||||
|
||||
Label tx1 = build_transaction_enter(c, fall2, x86::r12d, 666, []{});
|
||||
// Exclude some time spent on touching memory: stamp1 contains last success or failure
|
||||
c.mov(x86::rax, stamp1);
|
||||
c.sub(x86::rax, stamp0);
|
||||
c.cmp(x86::rax, imm_ptr(&g_rtm_tx_limit2));
|
||||
c.jae(fall2);
|
||||
build_get_tsc(c, stamp1);
|
||||
c.sub(stamp1, x86::rax);
|
||||
|
||||
Label tx1 = build_transaction_enter(c, fall2, [&]()
|
||||
{
|
||||
build_get_tsc(c);
|
||||
c.sub(x86::rax, stamp1);
|
||||
c.cmp(x86::rax, imm_ptr(&g_rtm_tx_limit2));
|
||||
c.jae(fall2);
|
||||
c.test(x86::qword_ptr(x86::rbx), 127 - 1);
|
||||
c.jnz(fall2);
|
||||
});
|
||||
c.prefetchw(x86::byte_ptr(x86::rbp, 0));
|
||||
c.prefetchw(x86::byte_ptr(x86::rbp, 64));
|
||||
|
||||
@ -1448,8 +1482,6 @@ const auto ppu_stcx_accurate_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime
|
||||
c.bt(x86::dword_ptr(args[2], ::offset32(&ppu_thread::state) - ::offset32(&ppu_thread::rdata)), static_cast<u32>(cpu_flag::pause));
|
||||
c.jc(fall2);
|
||||
c.mov(x86::rax, x86::qword_ptr(x86::rbx));
|
||||
c.test(x86::rax, 127 - 1);
|
||||
c.jnz(fall2);
|
||||
c.and_(x86::rax, -128);
|
||||
c.cmp(x86::rax, x86::r13);
|
||||
c.jne(fail2);
|
||||
@ -1493,7 +1525,8 @@ const auto ppu_stcx_accurate_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime
|
||||
|
||||
c.xend();
|
||||
c.lock().add(x86::qword_ptr(x86::rbx), 127);
|
||||
c.mov(x86::eax, x86::r12d);
|
||||
build_get_tsc(c);
|
||||
c.sub(x86::rax, stamp0);
|
||||
c.jmp(_ret);
|
||||
|
||||
// XABORT is expensive so try to finish with xend instead
|
||||
@ -1523,7 +1556,7 @@ const auto ppu_stcx_accurate_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime
|
||||
c.jmp(fail2);
|
||||
|
||||
c.bind(fall2);
|
||||
c.mov(x86::eax, -1);
|
||||
c.mov(x86::rax, -1);
|
||||
c.jmp(_ret);
|
||||
|
||||
c.bind(fail2);
|
||||
@ -1550,6 +1583,8 @@ const auto ppu_stcx_accurate_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime
|
||||
c.movaps(x86::oword_ptr(args[2], 112), x86::xmm7);
|
||||
}
|
||||
|
||||
c.mov(x86::rax, -1);
|
||||
c.mov(x86::qword_ptr(args[2], ::offset32(&spu_thread::last_ftime) - ::offset32(&spu_thread::rdata)), x86::rax);
|
||||
c.xor_(x86::eax, x86::eax);
|
||||
//c.jmp(_ret);
|
||||
|
||||
@ -1569,6 +1604,8 @@ const auto ppu_stcx_accurate_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime
|
||||
}
|
||||
|
||||
c.add(x86::rsp, 40);
|
||||
c.pop(x86::r15);
|
||||
c.pop(x86::r14);
|
||||
c.pop(x86::rbx);
|
||||
c.pop(x86::r12);
|
||||
c.pop(x86::r13);
|
||||
@ -1634,9 +1671,9 @@ static bool ppu_store_reservation(ppu_thread& ppu, u32 addr, u64 reg_value)
|
||||
{
|
||||
if (g_use_rtm) [[likely]]
|
||||
{
|
||||
switch (u32 count = ppu_stcx_accurate_tx(addr & -8, rtime, ppu.rdata, std::bit_cast<u64>(new_data)))
|
||||
switch (u64 count = ppu_stcx_accurate_tx(addr & -8, rtime, ppu.rdata, std::bit_cast<u64>(new_data)))
|
||||
{
|
||||
case UINT32_MAX:
|
||||
case UINT64_MAX:
|
||||
{
|
||||
auto& all_data = *vm::get_super_ptr<spu_rdata_t>(addr & -128);
|
||||
auto& sdata = *vm::get_super_ptr<atomic_be_t<u64>>(addr & -8);
|
||||
@ -1660,6 +1697,7 @@ static bool ppu_store_reservation(ppu_thread& ppu, u32 addr, u64 reg_value)
|
||||
break;
|
||||
}
|
||||
|
||||
ppu.last_ftime = -1;
|
||||
[[fallthrough]];
|
||||
}
|
||||
case 0:
|
||||
@ -1669,6 +1707,12 @@ static bool ppu_store_reservation(ppu_thread& ppu, u32 addr, u64 reg_value)
|
||||
ppu.last_fail++;
|
||||
}
|
||||
|
||||
if (ppu.last_ftime != umax)
|
||||
{
|
||||
ppu.last_faddr = 0;
|
||||
return false;
|
||||
}
|
||||
|
||||
_m_prefetchw(ppu.rdata);
|
||||
_m_prefetchw(ppu.rdata + 64);
|
||||
ppu.last_faddr = addr;
|
||||
@ -1678,9 +1722,9 @@ static bool ppu_store_reservation(ppu_thread& ppu, u32 addr, u64 reg_value)
|
||||
}
|
||||
default:
|
||||
{
|
||||
if (count > 60 && g_cfg.core.perf_report) [[unlikely]]
|
||||
if (count > 20000 && g_cfg.core.perf_report) [[unlikely]]
|
||||
{
|
||||
perf_log.warning("STCX: took too long: %u", count);
|
||||
perf_log.warning(u8"STCX: took too long: %.3fµs (%u c)", count / (utils::get_tsc_freq() / 1000'000.), count);
|
||||
}
|
||||
|
||||
break;
|
||||
|
@ -371,7 +371,7 @@ namespace spu
|
||||
}
|
||||
}
|
||||
|
||||
const auto spu_putllc_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime, void* _old, const void* _new)>([](asmjit::X86Assembler& c, auto& args)
|
||||
const auto spu_putllc_tx = build_function_asm<u64(*)(u32 raddr, u64 rtime, void* _old, const void* _new)>([](asmjit::X86Assembler& c, auto& args)
|
||||
{
|
||||
using namespace asmjit;
|
||||
|
||||
@ -415,6 +415,7 @@ const auto spu_putllc_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime, void*
|
||||
#endif
|
||||
|
||||
// Prepare registers
|
||||
build_swap_rdx_with(c, args, x86::r12);
|
||||
c.mov(x86::rbx, imm_ptr(+vm::g_reservations));
|
||||
c.mov(x86::rax, imm_ptr(&vm::g_sudo_addr));
|
||||
c.mov(x86::rbp, x86::qword_ptr(x86::rax));
|
||||
@ -425,7 +426,6 @@ const auto spu_putllc_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime, void*
|
||||
c.shr(args[0].r32(), 1);
|
||||
c.lea(x86::rbx, x86::qword_ptr(x86::rbx, args[0]));
|
||||
c.prefetchw(x86::byte_ptr(x86::rbx));
|
||||
c.mov(x86::r12d, 1);
|
||||
c.mov(x86::r13, args[1]);
|
||||
|
||||
// Prepare data
|
||||
@ -460,10 +460,20 @@ const auto spu_putllc_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime, void*
|
||||
c.movaps(x86::xmm15, x86::oword_ptr(args[3], 112));
|
||||
}
|
||||
|
||||
// Alloc args[0] to stamp0
|
||||
const auto stamp0 = args[0];
|
||||
const auto stamp1 = args[1];
|
||||
build_get_tsc(c, stamp0);
|
||||
|
||||
// Begin transaction
|
||||
Label tx0 = build_transaction_enter(c, fall, x86::r12d, 4, [&]()
|
||||
Label tx0 = build_transaction_enter(c, fall, [&]()
|
||||
{
|
||||
c.add(x86::qword_ptr(args[2], ::offset32(&spu_thread::ftx) - ::offset32(&spu_thread::rdata)), 1);
|
||||
build_get_tsc(c, stamp1);
|
||||
c.sub(stamp1, stamp0);
|
||||
c.cmp(stamp1, imm_ptr(&g_rtm_tx_limit1));
|
||||
c.xor_(x86::eax, x86::eax);
|
||||
c.jae(fall);
|
||||
});
|
||||
c.bt(x86::dword_ptr(args[2], ::offset32(&spu_thread::state) - ::offset32(&spu_thread::rdata)), static_cast<u32>(cpu_flag::pause));
|
||||
c.mov(x86::eax, _XABORT_EXPLICIT);
|
||||
@ -531,7 +541,8 @@ const auto spu_putllc_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime, void*
|
||||
c.sub(x86::qword_ptr(x86::rbx), -128);
|
||||
c.xend();
|
||||
c.add(x86::qword_ptr(args[2], ::offset32(&spu_thread::stx) - ::offset32(&spu_thread::rdata)), 1);
|
||||
c.mov(x86::eax, x86::r12d);
|
||||
build_get_tsc(c);
|
||||
c.sub(x86::rax, stamp0);
|
||||
c.jmp(_ret);
|
||||
|
||||
// XABORT is expensive so finish with xend instead
|
||||
@ -564,6 +575,7 @@ const auto spu_putllc_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime, void*
|
||||
c.bind(skip);
|
||||
c.xend();
|
||||
c.add(x86::qword_ptr(args[2], ::offset32(&spu_thread::stx) - ::offset32(&spu_thread::rdata)), 1);
|
||||
build_get_tsc(c, stamp1);
|
||||
c.mov(x86::eax, _XABORT_EXPLICIT);
|
||||
//c.jmp(fall);
|
||||
|
||||
@ -589,13 +601,28 @@ const auto spu_putllc_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime, void*
|
||||
c.test(x86::eax, vm::rsrv_unique_lock);
|
||||
c.jnz(fail2);
|
||||
|
||||
// Allow only first shared lock to proceed
|
||||
// Check if already updated
|
||||
c.and_(x86::rax, -128);
|
||||
c.cmp(x86::rax, x86::r13);
|
||||
c.jne(fail2);
|
||||
|
||||
Label tx1 = build_transaction_enter(c, fall2, x86::r12d, 666, [&]()
|
||||
// Exclude some time spent on touching memory: stamp1 contains last success or failure
|
||||
c.mov(x86::rax, stamp1);
|
||||
c.sub(x86::rax, stamp0);
|
||||
build_get_tsc(c, stamp1);
|
||||
c.sub(stamp1, x86::rax);
|
||||
c.cmp(x86::rax, imm_ptr(&g_rtm_tx_limit2));
|
||||
c.jae(fall2);
|
||||
|
||||
Label tx1 = build_transaction_enter(c, fall2, [&]()
|
||||
{
|
||||
c.add(x86::qword_ptr(args[2], ::offset32(&spu_thread::ftx) - ::offset32(&spu_thread::rdata)), 1);
|
||||
build_get_tsc(c);
|
||||
c.sub(x86::rax, stamp1);
|
||||
c.cmp(x86::rax, imm_ptr(&g_rtm_tx_limit2));
|
||||
c.jae(fall2);
|
||||
c.test(x86::qword_ptr(x86::rbx), 127 - 1);
|
||||
c.jnz(fall2);
|
||||
});
|
||||
c.prefetchw(x86::byte_ptr(x86::rbp, 0));
|
||||
c.prefetchw(x86::byte_ptr(x86::rbp, 64));
|
||||
@ -604,8 +631,6 @@ const auto spu_putllc_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime, void*
|
||||
c.bt(x86::dword_ptr(args[2], ::offset32(&spu_thread::state) - ::offset32(&spu_thread::rdata)), static_cast<u32>(cpu_flag::pause));
|
||||
c.jc(fall2);
|
||||
c.mov(x86::rax, x86::qword_ptr(x86::rbx));
|
||||
c.test(x86::rax, 127 - 1);
|
||||
c.jnz(fall2);
|
||||
c.and_(x86::rax, -128);
|
||||
c.cmp(x86::rax, x86::r13);
|
||||
c.jne(fail2);
|
||||
@ -666,7 +691,8 @@ const auto spu_putllc_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime, void*
|
||||
c.xend();
|
||||
c.add(x86::qword_ptr(args[2], ::offset32(&spu_thread::stx) - ::offset32(&spu_thread::rdata)), 1);
|
||||
c.lock().add(x86::qword_ptr(x86::rbx), 127);
|
||||
c.mov(x86::eax, x86::r12d);
|
||||
build_get_tsc(c);
|
||||
c.sub(x86::rax, stamp0);
|
||||
c.jmp(_ret);
|
||||
|
||||
// XABORT is expensive so try to finish with xend instead
|
||||
@ -697,7 +723,7 @@ const auto spu_putllc_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime, void*
|
||||
c.jmp(fail2);
|
||||
|
||||
c.bind(fall2);
|
||||
c.mov(x86::eax, -1);
|
||||
c.mov(x86::rax, -1);
|
||||
c.jmp(_ret);
|
||||
|
||||
c.bind(fail2);
|
||||
@ -724,6 +750,8 @@ const auto spu_putllc_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime, void*
|
||||
c.movaps(x86::oword_ptr(args[2], 112), x86::xmm7);
|
||||
}
|
||||
|
||||
c.mov(x86::rax, -1);
|
||||
c.mov(x86::qword_ptr(args[2], ::offset32(&spu_thread::last_ftime) - ::offset32(&spu_thread::rdata)), x86::rax);
|
||||
c.xor_(x86::eax, x86::eax);
|
||||
//c.jmp(_ret);
|
||||
|
||||
@ -763,7 +791,7 @@ const auto spu_putllc_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime, void*
|
||||
c.ret();
|
||||
});
|
||||
|
||||
const auto spu_putlluc_tx = build_function_asm<u32(*)(u32 raddr, const void* rdata, cpu_thread* _spu)>([](asmjit::X86Assembler& c, auto& args)
|
||||
const auto spu_putlluc_tx = build_function_asm<u64(*)(u32 raddr, const void* rdata, cpu_thread* _spu)>([](asmjit::X86Assembler& c, auto& args)
|
||||
{
|
||||
using namespace asmjit;
|
||||
|
||||
@ -792,6 +820,7 @@ const auto spu_putlluc_tx = build_function_asm<u32(*)(u32 raddr, const void* rda
|
||||
#endif
|
||||
|
||||
// Prepare registers
|
||||
build_swap_rdx_with(c, args, x86::r12);
|
||||
c.mov(x86::rbx, imm_ptr(+vm::g_reservations));
|
||||
c.mov(x86::rax, imm_ptr(&vm::g_sudo_addr));
|
||||
c.mov(x86::rbp, x86::qword_ptr(x86::rax));
|
||||
@ -802,7 +831,6 @@ const auto spu_putlluc_tx = build_function_asm<u32(*)(u32 raddr, const void* rda
|
||||
c.shr(args[0].r32(), 1);
|
||||
c.lea(x86::rbx, x86::qword_ptr(x86::rbx, args[0]));
|
||||
c.prefetchw(x86::byte_ptr(x86::rbx));
|
||||
c.mov(x86::r12d, 1);
|
||||
c.mov(x86::r13, args[1]);
|
||||
|
||||
// Prepare data
|
||||
@ -825,10 +853,20 @@ const auto spu_putlluc_tx = build_function_asm<u32(*)(u32 raddr, const void* rda
|
||||
c.movaps(x86::xmm7, x86::oword_ptr(args[1], 112));
|
||||
}
|
||||
|
||||
// Alloc args[0] to stamp0
|
||||
const auto stamp0 = args[0];
|
||||
const auto stamp1 = args[1];
|
||||
build_get_tsc(c, stamp0);
|
||||
|
||||
// Begin transaction
|
||||
Label tx0 = build_transaction_enter(c, fall, x86::r12d, 8, [&]()
|
||||
Label tx0 = build_transaction_enter(c, fall, [&]()
|
||||
{
|
||||
c.add(x86::qword_ptr(args[2], ::offset32(&spu_thread::ftx)), 1);
|
||||
build_get_tsc(c, stamp1);
|
||||
c.sub(stamp1, stamp0);
|
||||
c.cmp(stamp1, imm_ptr(&g_rtm_tx_limit1));
|
||||
c.xor_(x86::eax, x86::eax);
|
||||
c.jae(fall);
|
||||
});
|
||||
c.xbegin(tx0);
|
||||
c.test(x86::qword_ptr(x86::rbx), vm::rsrv_unique_lock);
|
||||
@ -856,12 +894,15 @@ const auto spu_putlluc_tx = build_function_asm<u32(*)(u32 raddr, const void* rda
|
||||
c.sub(x86::qword_ptr(x86::rbx), -128);
|
||||
c.xend();
|
||||
c.add(x86::qword_ptr(args[2], ::offset32(&spu_thread::stx)), 1);
|
||||
c.mov(x86::eax, 1);
|
||||
build_get_tsc(c);
|
||||
c.sub(x86::rax, stamp0);
|
||||
c.jmp(_ret);
|
||||
|
||||
c.bind(skip);
|
||||
c.xend();
|
||||
c.add(x86::qword_ptr(args[2], ::offset32(&spu_thread::stx)), 1);
|
||||
build_get_tsc(c, stamp1);
|
||||
c.mov(x86::eax, _XABORT_EXPLICIT);
|
||||
//c.jmp(fall);
|
||||
|
||||
c.bind(fall);
|
||||
@ -881,12 +922,24 @@ const auto spu_putlluc_tx = build_function_asm<u32(*)(u32 raddr, const void* rda
|
||||
// Lock reservation
|
||||
c.mov(x86::eax, 1);
|
||||
c.lock().xadd(x86::qword_ptr(x86::rbx), x86::rax);
|
||||
c.test(x86::eax, vm::rsrv_unique_lock);
|
||||
c.test(x86::eax, 127 - 1);
|
||||
c.jnz(fall2);
|
||||
|
||||
Label tx1 = build_transaction_enter(c, fall2, x86::r12d, 666, [&]()
|
||||
// Exclude some time spent on touching memory: stamp1 contains last success or failure
|
||||
c.mov(x86::rax, stamp1);
|
||||
c.sub(x86::rax, stamp0);
|
||||
c.cmp(x86::rax, imm_ptr(&g_rtm_tx_limit2));
|
||||
c.jae(fall2);
|
||||
build_get_tsc(c, stamp1);
|
||||
c.sub(stamp1, x86::rax);
|
||||
|
||||
Label tx1 = build_transaction_enter(c, fall2, [&]()
|
||||
{
|
||||
c.add(x86::qword_ptr(args[2], ::offset32(&spu_thread::ftx)), 1);
|
||||
build_get_tsc(c);
|
||||
c.sub(x86::rax, stamp1);
|
||||
c.cmp(x86::rax, imm_ptr(&g_rtm_tx_limit2));
|
||||
c.jae(fall2);
|
||||
});
|
||||
|
||||
c.prefetchw(x86::byte_ptr(x86::rbp, 0));
|
||||
@ -922,7 +975,8 @@ const auto spu_putlluc_tx = build_function_asm<u32(*)(u32 raddr, const void* rda
|
||||
c.xend();
|
||||
c.add(x86::qword_ptr(args[2], ::offset32(&spu_thread::stx)), 1);
|
||||
c.lock().add(x86::qword_ptr(x86::rbx), 127);
|
||||
c.mov(x86::eax, x86::r12d);
|
||||
build_get_tsc(c);
|
||||
c.sub(x86::rax, stamp0);
|
||||
c.jmp(_ret);
|
||||
|
||||
c.bind(fall2);
|
||||
@ -952,7 +1006,7 @@ const auto spu_putlluc_tx = build_function_asm<u32(*)(u32 raddr, const void* rda
|
||||
c.ret();
|
||||
});
|
||||
|
||||
const extern auto spu_getllar_tx = build_function_asm<u32(*)(u32 raddr, void* rdata, cpu_thread* _cpu, u64 rtime)>([](asmjit::X86Assembler& c, auto& args)
|
||||
const extern auto spu_getllar_tx = build_function_asm<u64(*)(u32 raddr, void* rdata, cpu_thread* _cpu, u64 rtime)>([](asmjit::X86Assembler& c, auto& args)
|
||||
{
|
||||
using namespace asmjit;
|
||||
|
||||
@ -979,6 +1033,7 @@ const extern auto spu_getllar_tx = build_function_asm<u32(*)(u32 raddr, void* rd
|
||||
#endif
|
||||
|
||||
// Prepare registers
|
||||
build_swap_rdx_with(c, args, x86::r12);
|
||||
c.mov(x86::rbx, imm_ptr(+vm::g_reservations));
|
||||
c.mov(x86::rax, imm_ptr(&vm::g_sudo_addr));
|
||||
c.mov(x86::rbp, x86::qword_ptr(x86::rax));
|
||||
@ -986,13 +1041,20 @@ const extern auto spu_getllar_tx = build_function_asm<u32(*)(u32 raddr, void* rd
|
||||
c.and_(args[0].r32(), 0xff80);
|
||||
c.shr(args[0].r32(), 1);
|
||||
c.lea(x86::rbx, x86::qword_ptr(x86::rbx, args[0]));
|
||||
c.mov(x86::r12d, 1);
|
||||
c.mov(x86::r13, args[1]);
|
||||
|
||||
// Alloc args[0] to stamp0
|
||||
const auto stamp0 = args[0];
|
||||
build_get_tsc(c, stamp0);
|
||||
|
||||
// Begin transaction
|
||||
Label tx0 = build_transaction_enter(c, fall, x86::r12d, 8, [&]()
|
||||
Label tx0 = build_transaction_enter(c, fall, [&]()
|
||||
{
|
||||
c.add(x86::qword_ptr(args[2], ::offset32(&spu_thread::ftx)), 1);
|
||||
build_get_tsc(c);
|
||||
c.sub(x86::rax, stamp0);
|
||||
c.cmp(x86::rax, imm_ptr(&g_rtm_tx_limit1));
|
||||
c.jae(fall);
|
||||
});
|
||||
|
||||
// Check pause flag
|
||||
@ -1026,6 +1088,8 @@ const extern auto spu_getllar_tx = build_function_asm<u32(*)(u32 raddr, void* rd
|
||||
|
||||
c.xend();
|
||||
c.add(x86::qword_ptr(args[2], ::offset32(&spu_thread::stx)), 1);
|
||||
build_get_tsc(c);
|
||||
c.sub(x86::rax, stamp0);
|
||||
|
||||
// Store data
|
||||
if (s_tsx_avx)
|
||||
@ -1047,9 +1111,7 @@ const extern auto spu_getllar_tx = build_function_asm<u32(*)(u32 raddr, void* rd
|
||||
c.movaps(x86::oword_ptr(args[1], 112), x86::xmm7);
|
||||
}
|
||||
|
||||
c.mov(x86::eax, 1);
|
||||
c.jmp(_ret);
|
||||
|
||||
c.bind(fall);
|
||||
c.xor_(x86::eax, x86::eax);
|
||||
//c.jmp(_ret);
|
||||
@ -1546,6 +1608,7 @@ void spu_thread::push_snr(u32 number, u32 value)
|
||||
const u32 event_bit = SPU_EVENT_S1 >> (number & 1);
|
||||
const u32 bitor_bit = (snr_config >> number) & 1;
|
||||
|
||||
// Redundant, g_use_rtm is checked inside tx_start now.
|
||||
if (g_use_rtm)
|
||||
{
|
||||
bool channel_notify = false;
|
||||
@ -2422,9 +2485,9 @@ bool spu_thread::do_putllc(const spu_mfc_cmd& args)
|
||||
|
||||
if (g_use_rtm) [[likely]]
|
||||
{
|
||||
switch (u32 count = spu_putllc_tx(addr, rtime, rdata, to_write))
|
||||
switch (u64 count = spu_putllc_tx(addr, rtime, rdata, to_write))
|
||||
{
|
||||
case UINT32_MAX:
|
||||
case UINT64_MAX:
|
||||
{
|
||||
auto& data = *vm::get_super_ptr<spu_rdata_t>(addr);
|
||||
|
||||
@ -2451,6 +2514,7 @@ bool spu_thread::do_putllc(const spu_mfc_cmd& args)
|
||||
break;
|
||||
}
|
||||
|
||||
last_ftime = -1;
|
||||
[[fallthrough]];
|
||||
}
|
||||
case 0:
|
||||
@ -2460,6 +2524,12 @@ bool spu_thread::do_putllc(const spu_mfc_cmd& args)
|
||||
last_fail++;
|
||||
}
|
||||
|
||||
if (last_ftime != umax)
|
||||
{
|
||||
last_faddr = 0;
|
||||
return false;
|
||||
}
|
||||
|
||||
_m_prefetchw(rdata);
|
||||
_m_prefetchw(rdata + 64);
|
||||
last_faddr = addr;
|
||||
@ -2469,9 +2539,9 @@ bool spu_thread::do_putllc(const spu_mfc_cmd& args)
|
||||
}
|
||||
default:
|
||||
{
|
||||
if (count > 60 && g_cfg.core.perf_report) [[unlikely]]
|
||||
if (count > 20000 && g_cfg.core.perf_report) [[unlikely]]
|
||||
{
|
||||
perf_log.warning("PUTLLC: took too long: %u", count);
|
||||
perf_log.warning(u8"PUTLLC: took too long: %.3fµs (%u c)", count / (utils::get_tsc_freq() / 1000'000.), count);
|
||||
}
|
||||
|
||||
break;
|
||||
@ -2566,7 +2636,7 @@ void do_cell_atomic_128_store(u32 addr, const void* to_write)
|
||||
|
||||
if (g_use_rtm) [[likely]]
|
||||
{
|
||||
const u32 result = spu_putlluc_tx(addr, to_write, cpu);
|
||||
const u64 result = spu_putlluc_tx(addr, to_write, cpu);
|
||||
|
||||
if (result == 0)
|
||||
{
|
||||
@ -2579,9 +2649,9 @@ void do_cell_atomic_128_store(u32 addr, const void* to_write)
|
||||
res += 127;
|
||||
});
|
||||
}
|
||||
else if (result > 60 && g_cfg.core.perf_report) [[unlikely]]
|
||||
else if (result > 20000 && g_cfg.core.perf_report) [[unlikely]]
|
||||
{
|
||||
perf_log.warning("STORE128: took too long: %u", result);
|
||||
perf_log.warning(u8"STORE128: took too long: %.3fµs (%u c)", result / (utils::get_tsc_freq() / 1000'000.), result);
|
||||
}
|
||||
|
||||
static_cast<void>(cpu->test_stopped());
|
||||
@ -2796,6 +2866,7 @@ bool spu_thread::process_mfc_cmd()
|
||||
{
|
||||
rtime = last_ftime;
|
||||
raddr = last_faddr;
|
||||
last_ftime = 0;
|
||||
mov_rdata(_ref<spu_rdata_t>(ch_mfc_cmd.lsa & 0x3ff80), rdata);
|
||||
|
||||
ch_atomic_stat.set_value(MFC_GETLLAR_SUCCESS);
|
||||
|
@ -550,17 +550,19 @@ namespace vm
|
||||
|
||||
void reservation_op_internal(u32 addr, std::function<bool()> func)
|
||||
{
|
||||
auto& res = vm::reservation_acquire(addr, 128);
|
||||
auto& res = vm::reservation_acquire(addr, 1);
|
||||
auto* ptr = vm::get_super_ptr(addr & -128);
|
||||
|
||||
cpu_thread::suspend_all(get_current_cpu_thread(), {&res}, [&]
|
||||
cpu_thread::suspend_all(get_current_cpu_thread(), {ptr, ptr + 64, &res}, [&]
|
||||
{
|
||||
if (func())
|
||||
{
|
||||
// Success, release all locks if necessary
|
||||
// Success, release the lock and progress
|
||||
res += 127;
|
||||
}
|
||||
else
|
||||
{
|
||||
// Only release the lock on failure
|
||||
res -= 1;
|
||||
}
|
||||
});
|
||||
|
@ -7,6 +7,7 @@
|
||||
#include <functional>
|
||||
|
||||
extern bool g_use_rtm;
|
||||
extern u64 g_rtm_tx_limit2;
|
||||
|
||||
namespace vm
|
||||
{
|
||||
@ -70,8 +71,8 @@ namespace vm
|
||||
// TODO: remove and make it external
|
||||
void reservation_op_internal(u32 addr, std::function<bool()> func);
|
||||
|
||||
template <bool Ack = false, typename T, typename AT = u32, typename F>
|
||||
SAFE_BUFFERS inline auto reservation_op(_ptr_base<T, AT> ptr, F op)
|
||||
template <bool Ack = false, typename CPU, typename T, typename AT = u32, typename F>
|
||||
SAFE_BUFFERS inline auto reservation_op(CPU& cpu, _ptr_base<T, AT> ptr, F op)
|
||||
{
|
||||
// Atomic operation will be performed on aligned 128 bytes of data, so the data size and alignment must comply
|
||||
static_assert(sizeof(T) <= 128 && alignof(T) == sizeof(T), "vm::reservation_op: unsupported type");
|
||||
@ -94,9 +95,10 @@ namespace vm
|
||||
{
|
||||
// Stage 1: single optimistic transaction attempt
|
||||
unsigned status = _XBEGIN_STARTED;
|
||||
unsigned count = 0;
|
||||
u64 _old = 0;
|
||||
|
||||
auto stamp0 = __rdtsc(), stamp1 = stamp0, stamp2 = stamp0;
|
||||
|
||||
#ifndef _MSC_VER
|
||||
__asm__ goto ("xbegin %l[stage2];" ::: "memory" : stage2);
|
||||
#else
|
||||
@ -157,6 +159,7 @@ namespace vm
|
||||
#ifndef _MSC_VER
|
||||
__asm__ volatile ("mov %%eax, %0;" : "=r" (status) :: "memory");
|
||||
#endif
|
||||
stamp1 = __rdtsc();
|
||||
|
||||
// Touch memory if transaction failed with status 0
|
||||
if (!status)
|
||||
@ -167,12 +170,17 @@ namespace vm
|
||||
// Stage 2: try to lock reservation first
|
||||
_old = res.fetch_add(1);
|
||||
|
||||
// Also identify atomic op
|
||||
count = 1;
|
||||
// Compute stamps excluding memory touch
|
||||
stamp2 = __rdtsc() - (stamp1 - stamp0);
|
||||
|
||||
// Start lightened transaction (TODO: tweaking)
|
||||
for (; !(_old & rsrv_unique_lock) && count < 60; count++)
|
||||
// Start lightened transaction
|
||||
for (; !(_old & vm::rsrv_unique_lock) && stamp2 - stamp0 <= g_rtm_tx_limit2; stamp2 = __rdtsc())
|
||||
{
|
||||
if (cpu.has_pause_flag())
|
||||
{
|
||||
break;
|
||||
}
|
||||
|
||||
#ifndef _MSC_VER
|
||||
__asm__ goto ("xbegin %l[retry];" ::: "memory" : retry);
|
||||
#else
|
||||
|
@ -55,7 +55,9 @@ LOG_CHANNEL(sys_log, "SYS");
|
||||
|
||||
stx::manual_fixed_typemap<void> g_fixed_typemap;
|
||||
|
||||
bool g_use_rtm;
|
||||
bool g_use_rtm = false;
|
||||
u64 g_rtm_tx_limit1 = 0;
|
||||
u64 g_rtm_tx_limit2 = 0;
|
||||
|
||||
std::string g_cfg_defaults;
|
||||
|
||||
@ -1019,6 +1021,14 @@ game_boot_result Emulator::Load(const std::string& title_id, bool add_only, bool
|
||||
}
|
||||
}
|
||||
|
||||
if (g_use_rtm)
|
||||
{
|
||||
// Update supplementary settings
|
||||
const f64 _1ns = utils::get_tsc_freq() / 1000'000'000.;
|
||||
g_rtm_tx_limit1 = g_cfg.core.tx_limit1_ns * _1ns;
|
||||
g_rtm_tx_limit2 = g_cfg.core.tx_limit2_ns * _1ns;
|
||||
}
|
||||
|
||||
// Load patches from different locations
|
||||
g_fxo->get<patch_engine>()->append_title_patches(m_title_id);
|
||||
|
||||
|
@ -240,3 +240,5 @@ private:
|
||||
extern Emulator Emu;
|
||||
|
||||
extern bool g_use_rtm;
|
||||
extern u64 g_rtm_tx_limit1;
|
||||
extern u64 g_rtm_tx_limit2;
|
||||
|
@ -66,6 +66,8 @@ struct cfg_root : cfg::node
|
||||
cfg::_bool hle_lwmutex{ this, "HLE lwmutex" }; // Force alternative lwmutex/lwcond implementation
|
||||
cfg::uint64 spu_llvm_lower_bound{ this, "SPU LLVM Lower Bound" };
|
||||
cfg::uint64 spu_llvm_upper_bound{ this, "SPU LLVM Upper Bound", 0xffffffffffffffff };
|
||||
cfg::uint64 tx_limit1_ns{this, "TSX Transaction First Limit", 800}; // In nanoseconds
|
||||
cfg::uint64 tx_limit2_ns{this, "TSX Transaction Second Limit", 2000}; // In nanoseconds
|
||||
|
||||
cfg::_int<10, 3000> clocks_scale{ this, "Clocks scale", 100, true }; // Changing this from 100 (percentage) may affect game speed in unexpected ways
|
||||
cfg::_enum<sleep_timers_accuracy_level> sleep_timers_accuracy{ this, "Sleep Timers Accuracy",
|
||||
|
Loading…
Reference in New Issue
Block a user