mirror of
https://github.com/RPCS3/rpcs3.git
synced 2024-11-25 04:02:42 +01:00
TSX: refactoring M
Remove first stage 'optimistic' transactions.
This commit is contained in:
parent
3e1e1a683c
commit
61c64d1060
@ -1823,8 +1823,6 @@ const auto ppu_stcx_accurate_tx = build_function_asm<u64(*)(u32 raddr, u64 rtime
|
||||
Label fall = c.newLabel();
|
||||
Label fail = c.newLabel();
|
||||
Label _ret = c.newLabel();
|
||||
Label skip = c.newLabel();
|
||||
Label next = c.newLabel();
|
||||
Label load = c.newLabel();
|
||||
|
||||
//if (utils::has_avx() && !s_tsx_avx)
|
||||
@ -1885,27 +1883,24 @@ const auto ppu_stcx_accurate_tx = build_function_asm<u64(*)(u32 raddr, u64 rtime
|
||||
|
||||
// Alloc r14 to stamp0
|
||||
const auto stamp0 = x86::r14;
|
||||
const auto stamp1 = x86::r15;
|
||||
build_get_tsc(c, stamp0);
|
||||
|
||||
// Begin transaction
|
||||
Label tx0 = build_transaction_enter(c, fall, [&]()
|
||||
Label fail2 = c.newLabel();
|
||||
|
||||
Label tx1 = build_transaction_enter(c, fall, [&]()
|
||||
{
|
||||
build_get_tsc(c, stamp1);
|
||||
c.sub(stamp1, stamp0);
|
||||
c.xor_(x86::eax, x86::eax);
|
||||
c.cmp(stamp1, x86::qword_ptr(reinterpret_cast<u64>(&g_rtm_tx_limit1)));
|
||||
build_get_tsc(c);
|
||||
c.sub(x86::rax, stamp0);
|
||||
c.cmp(x86::rax, x86::qword_ptr(reinterpret_cast<u64>(&g_rtm_tx_limit2)));
|
||||
c.jae(fall);
|
||||
});
|
||||
c.prefetchw(x86::byte_ptr(x86::rbp, 0));
|
||||
c.prefetchw(x86::byte_ptr(x86::rbp, 64));
|
||||
|
||||
// Check pause flag
|
||||
c.bt(x86::dword_ptr(args[2], ::offset32(&ppu_thread::state) - ::offset32(&ppu_thread::rdata)), static_cast<u32>(cpu_flag::pause));
|
||||
c.jc(fall);
|
||||
c.xbegin(tx0);
|
||||
c.mov(x86::rax, x86::qword_ptr(x86::rbx));
|
||||
c.test(x86::eax, vm::rsrv_unique_lock);
|
||||
c.jnz(skip);
|
||||
c.and_(x86::rax, -128);
|
||||
c.cmp(x86::rax, x86::r13);
|
||||
c.jne(fail);
|
||||
c.xbegin(tx1);
|
||||
|
||||
if (s_tsx_avx)
|
||||
{
|
||||
@ -1943,137 +1938,14 @@ const auto ppu_stcx_accurate_tx = build_function_asm<u64(*)(u32 raddr, u64 rtime
|
||||
// Store 8 bytes
|
||||
c.mov(x86::qword_ptr(x86::rbp, args[0], 1, 0), args[3]);
|
||||
|
||||
// Update reservation
|
||||
c.sub(x86::qword_ptr(x86::rbx), -128);
|
||||
c.xend();
|
||||
build_get_tsc(c);
|
||||
c.sub(x86::rax, stamp0);
|
||||
c.jmp(_ret);
|
||||
|
||||
// XABORT is expensive so finish with xend instead
|
||||
c.bind(fail);
|
||||
|
||||
// Load old data to store back in rdata
|
||||
if (s_tsx_avx)
|
||||
{
|
||||
c.vmovaps(x86::ymm0, x86::yword_ptr(x86::rbp, 0));
|
||||
c.vmovaps(x86::ymm1, x86::yword_ptr(x86::rbp, 32));
|
||||
c.vmovaps(x86::ymm2, x86::yword_ptr(x86::rbp, 64));
|
||||
c.vmovaps(x86::ymm3, x86::yword_ptr(x86::rbp, 96));
|
||||
}
|
||||
else
|
||||
{
|
||||
c.movaps(x86::xmm0, x86::oword_ptr(x86::rbp, 0));
|
||||
c.movaps(x86::xmm1, x86::oword_ptr(x86::rbp, 16));
|
||||
c.movaps(x86::xmm2, x86::oword_ptr(x86::rbp, 32));
|
||||
c.movaps(x86::xmm3, x86::oword_ptr(x86::rbp, 48));
|
||||
c.movaps(x86::xmm4, x86::oword_ptr(x86::rbp, 64));
|
||||
c.movaps(x86::xmm5, x86::oword_ptr(x86::rbp, 80));
|
||||
c.movaps(x86::xmm6, x86::oword_ptr(x86::rbp, 96));
|
||||
c.movaps(x86::xmm7, x86::oword_ptr(x86::rbp, 112));
|
||||
}
|
||||
|
||||
c.xend();
|
||||
c.jmp(load);
|
||||
|
||||
c.bind(skip);
|
||||
c.xend();
|
||||
build_get_tsc(c, stamp1);
|
||||
//c.jmp(fall);
|
||||
|
||||
c.bind(fall);
|
||||
|
||||
Label fall2 = c.newLabel();
|
||||
Label fail2 = c.newLabel();
|
||||
Label fail3 = c.newLabel();
|
||||
|
||||
// Lightened transaction: only compare and swap data
|
||||
c.bind(next);
|
||||
|
||||
// Try to "lock" reservation
|
||||
c.mov(x86::eax, 1);
|
||||
c.lock().xadd(x86::qword_ptr(x86::rbx), x86::rax);
|
||||
c.test(x86::eax, vm::rsrv_unique_lock);
|
||||
c.jnz(fail2);
|
||||
|
||||
// Check if already updated
|
||||
c.and_(x86::rax, -128);
|
||||
c.cmp(x86::rax, x86::r13);
|
||||
c.jne(fail2);
|
||||
|
||||
// Exclude some time spent on touching memory: stamp1 contains last success or failure
|
||||
c.mov(x86::rax, stamp1);
|
||||
c.sub(x86::rax, stamp0);
|
||||
c.cmp(x86::rax, x86::qword_ptr(reinterpret_cast<u64>(&g_rtm_tx_limit2)));
|
||||
c.jae(fall2);
|
||||
build_get_tsc(c, stamp1);
|
||||
c.sub(stamp1, x86::rax);
|
||||
|
||||
Label tx1 = build_transaction_enter(c, fall2, [&]()
|
||||
{
|
||||
build_get_tsc(c);
|
||||
c.sub(x86::rax, stamp1);
|
||||
c.cmp(x86::rax, x86::qword_ptr(reinterpret_cast<u64>(&g_rtm_tx_limit2)));
|
||||
c.jae(fall2);
|
||||
c.test(x86::qword_ptr(x86::rbx), 127 - 1);
|
||||
c.jnz(fall2);
|
||||
});
|
||||
c.prefetchw(x86::byte_ptr(x86::rbp, 0));
|
||||
c.prefetchw(x86::byte_ptr(x86::rbp, 64));
|
||||
|
||||
// Check pause flag
|
||||
c.bt(x86::dword_ptr(args[2], ::offset32(&ppu_thread::state) - ::offset32(&ppu_thread::rdata)), static_cast<u32>(cpu_flag::pause));
|
||||
c.jc(fall2);
|
||||
c.mov(x86::rax, x86::qword_ptr(x86::rbx));
|
||||
c.and_(x86::rax, -128);
|
||||
c.cmp(x86::rax, x86::r13);
|
||||
c.jne(fail2);
|
||||
c.xbegin(tx1);
|
||||
|
||||
if (s_tsx_avx)
|
||||
{
|
||||
c.vxorps(x86::ymm0, x86::ymm0, x86::yword_ptr(x86::rbp, 0));
|
||||
c.vxorps(x86::ymm1, x86::ymm1, x86::yword_ptr(x86::rbp, 32));
|
||||
c.vxorps(x86::ymm2, x86::ymm2, x86::yword_ptr(x86::rbp, 64));
|
||||
c.vxorps(x86::ymm3, x86::ymm3, x86::yword_ptr(x86::rbp, 96));
|
||||
c.vorps(x86::ymm0, x86::ymm0, x86::ymm1);
|
||||
c.vorps(x86::ymm1, x86::ymm2, x86::ymm3);
|
||||
c.vorps(x86::ymm0, x86::ymm1, x86::ymm0);
|
||||
c.vptest(x86::ymm0, x86::ymm0);
|
||||
}
|
||||
else
|
||||
{
|
||||
c.xorps(x86::xmm0, x86::oword_ptr(x86::rbp, 0));
|
||||
c.xorps(x86::xmm1, x86::oword_ptr(x86::rbp, 16));
|
||||
c.xorps(x86::xmm2, x86::oword_ptr(x86::rbp, 32));
|
||||
c.xorps(x86::xmm3, x86::oword_ptr(x86::rbp, 48));
|
||||
c.xorps(x86::xmm4, x86::oword_ptr(x86::rbp, 64));
|
||||
c.xorps(x86::xmm5, x86::oword_ptr(x86::rbp, 80));
|
||||
c.xorps(x86::xmm6, x86::oword_ptr(x86::rbp, 96));
|
||||
c.xorps(x86::xmm7, x86::oword_ptr(x86::rbp, 112));
|
||||
c.orps(x86::xmm0, x86::xmm1);
|
||||
c.orps(x86::xmm2, x86::xmm3);
|
||||
c.orps(x86::xmm4, x86::xmm5);
|
||||
c.orps(x86::xmm6, x86::xmm7);
|
||||
c.orps(x86::xmm0, x86::xmm2);
|
||||
c.orps(x86::xmm4, x86::xmm6);
|
||||
c.orps(x86::xmm0, x86::xmm4);
|
||||
c.ptest(x86::xmm0, x86::xmm0);
|
||||
}
|
||||
|
||||
c.jnz(fail3);
|
||||
|
||||
// Store 8 bytes
|
||||
c.mov(x86::qword_ptr(x86::rbp, args[0], 1, 0), args[3]);
|
||||
|
||||
c.xend();
|
||||
c.lock().add(x86::qword_ptr(x86::rbx), 127);
|
||||
c.lock().add(x86::qword_ptr(x86::rbx), 64);
|
||||
build_get_tsc(c);
|
||||
c.sub(x86::rax, stamp0);
|
||||
c.jmp(_ret);
|
||||
|
||||
// XABORT is expensive so try to finish with xend instead
|
||||
c.bind(fail3);
|
||||
c.bind(fail);
|
||||
|
||||
// Load old data to store back in rdata
|
||||
if (s_tsx_avx)
|
||||
@ -2098,7 +1970,7 @@ const auto ppu_stcx_accurate_tx = build_function_asm<u64(*)(u32 raddr, u64 rtime
|
||||
c.xend();
|
||||
c.jmp(fail2);
|
||||
|
||||
c.bind(fall2);
|
||||
c.bind(fall);
|
||||
c.mov(x86::rax, -1);
|
||||
c.jmp(_ret);
|
||||
|
||||
@ -2212,6 +2084,23 @@ static bool ppu_store_reservation(ppu_thread& ppu, u32 addr, u64 reg_value)
|
||||
{
|
||||
if (ppu.use_full_rdata) [[unlikely]]
|
||||
{
|
||||
auto [_oldd, _ok] = res.fetch_op([&](u64& r)
|
||||
{
|
||||
if ((r & -128) != rtime || (r & 127))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
r += vm::rsrv_unique_lock;
|
||||
return true;
|
||||
});
|
||||
|
||||
if (!_ok)
|
||||
{
|
||||
// Already locked or updated: give up
|
||||
return false;
|
||||
}
|
||||
|
||||
if (g_use_rtm) [[likely]]
|
||||
{
|
||||
switch (u64 count = ppu_stcx_accurate_tx(addr & -8, rtime, ppu.rdata, std::bit_cast<u64>(new_data)))
|
||||
@ -2226,12 +2115,12 @@ static bool ppu_store_reservation(ppu_thread& ppu, u32 addr, u64 reg_value)
|
||||
if ((res & -128) == rtime && cmp_rdata(ppu.rdata, all_data))
|
||||
{
|
||||
sdata.release(new_data);
|
||||
res += 127;
|
||||
res += 64;
|
||||
return true;
|
||||
}
|
||||
|
||||
mov_rdata_nt(ppu.rdata, all_data);
|
||||
res -= 1;
|
||||
res -= 64;
|
||||
return false;
|
||||
});
|
||||
|
||||
@ -2283,23 +2172,6 @@ static bool ppu_store_reservation(ppu_thread& ppu, u32 addr, u64 reg_value)
|
||||
return true;
|
||||
}
|
||||
|
||||
auto [_oldd, _ok] = res.fetch_op([&](u64& r)
|
||||
{
|
||||
if ((r & -128) != rtime || (r & 127))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
r += vm::rsrv_unique_lock;
|
||||
return true;
|
||||
});
|
||||
|
||||
if (!_ok)
|
||||
{
|
||||
// Already locked or updated: give up
|
||||
return false;
|
||||
}
|
||||
|
||||
// Align address: we do not need the lower 7 bits anymore
|
||||
addr &= -128;
|
||||
|
||||
|
@ -412,8 +412,6 @@ const auto spu_putllc_tx = build_function_asm<u64(*)(u32 raddr, u64 rtime, void*
|
||||
Label fall = c.newLabel();
|
||||
Label fail = c.newLabel();
|
||||
Label _ret = c.newLabel();
|
||||
Label skip = c.newLabel();
|
||||
Label next = c.newLabel();
|
||||
Label load = c.newLabel();
|
||||
|
||||
//if (utils::has_avx() && !s_tsx_avx)
|
||||
@ -494,28 +492,25 @@ const auto spu_putllc_tx = build_function_asm<u64(*)(u32 raddr, u64 rtime, void*
|
||||
|
||||
// Alloc args[0] to stamp0
|
||||
const auto stamp0 = args[0];
|
||||
const auto stamp1 = args[1];
|
||||
build_get_tsc(c, stamp0);
|
||||
|
||||
// Begin transaction
|
||||
Label tx0 = build_transaction_enter(c, fall, [&]()
|
||||
Label fail2 = c.newLabel();
|
||||
|
||||
Label tx1 = build_transaction_enter(c, fall, [&]()
|
||||
{
|
||||
c.add(x86::qword_ptr(args[2], ::offset32(&spu_thread::ftx) - ::offset32(&spu_thread::rdata)), 1);
|
||||
build_get_tsc(c, stamp1);
|
||||
c.sub(stamp1, stamp0);
|
||||
c.xor_(x86::eax, x86::eax);
|
||||
c.cmp(stamp1, x86::qword_ptr(reinterpret_cast<u64>(&g_rtm_tx_limit1)));
|
||||
build_get_tsc(c);
|
||||
c.sub(x86::rax, stamp0);
|
||||
c.cmp(x86::rax, x86::qword_ptr(reinterpret_cast<u64>(&g_rtm_tx_limit2)));
|
||||
c.jae(fall);
|
||||
});
|
||||
c.prefetchw(x86::byte_ptr(x86::rbp, 0));
|
||||
c.prefetchw(x86::byte_ptr(x86::rbp, 64));
|
||||
|
||||
// Check pause flag
|
||||
c.bt(x86::dword_ptr(args[2], ::offset32(&spu_thread::state) - ::offset32(&spu_thread::rdata)), static_cast<u32>(cpu_flag::pause));
|
||||
c.jc(fall);
|
||||
c.xbegin(tx0);
|
||||
c.mov(x86::rax, x86::qword_ptr(x86::rbx));
|
||||
c.test(x86::eax, vm::rsrv_unique_lock);
|
||||
c.jnz(skip);
|
||||
c.and_(x86::rax, -128);
|
||||
c.cmp(x86::rax, x86::r13);
|
||||
c.jne(fail);
|
||||
c.xbegin(tx1);
|
||||
|
||||
if (s_tsx_avx)
|
||||
{
|
||||
@ -569,157 +564,15 @@ const auto spu_putllc_tx = build_function_asm<u64(*)(u32 raddr, u64 rtime, void*
|
||||
c.movaps(x86::oword_ptr(x86::rbp, 112), x86::xmm15);
|
||||
}
|
||||
|
||||
c.sub(x86::qword_ptr(x86::rbx), -128);
|
||||
c.xend();
|
||||
c.lock().add(x86::qword_ptr(x86::rbx), 64);
|
||||
c.add(x86::qword_ptr(args[2], ::offset32(&spu_thread::stx) - ::offset32(&spu_thread::rdata)), 1);
|
||||
build_get_tsc(c);
|
||||
c.sub(x86::rax, stamp0);
|
||||
c.jmp(_ret);
|
||||
|
||||
// XABORT is expensive so finish with xend instead
|
||||
c.bind(fail);
|
||||
|
||||
// Load old data to store back in rdata
|
||||
if (s_tsx_avx)
|
||||
{
|
||||
c.vmovaps(x86::ymm0, x86::yword_ptr(x86::rbp, 0));
|
||||
c.vmovaps(x86::ymm1, x86::yword_ptr(x86::rbp, 32));
|
||||
c.vmovaps(x86::ymm2, x86::yword_ptr(x86::rbp, 64));
|
||||
c.vmovaps(x86::ymm3, x86::yword_ptr(x86::rbp, 96));
|
||||
}
|
||||
else
|
||||
{
|
||||
c.movaps(x86::xmm0, x86::oword_ptr(x86::rbp, 0));
|
||||
c.movaps(x86::xmm1, x86::oword_ptr(x86::rbp, 16));
|
||||
c.movaps(x86::xmm2, x86::oword_ptr(x86::rbp, 32));
|
||||
c.movaps(x86::xmm3, x86::oword_ptr(x86::rbp, 48));
|
||||
c.movaps(x86::xmm4, x86::oword_ptr(x86::rbp, 64));
|
||||
c.movaps(x86::xmm5, x86::oword_ptr(x86::rbp, 80));
|
||||
c.movaps(x86::xmm6, x86::oword_ptr(x86::rbp, 96));
|
||||
c.movaps(x86::xmm7, x86::oword_ptr(x86::rbp, 112));
|
||||
}
|
||||
|
||||
c.xend();
|
||||
c.add(x86::qword_ptr(args[2], ::offset32(&spu_thread::stx) - ::offset32(&spu_thread::rdata)), 1);
|
||||
c.jmp(load);
|
||||
|
||||
c.bind(skip);
|
||||
c.xend();
|
||||
c.add(x86::qword_ptr(args[2], ::offset32(&spu_thread::stx) - ::offset32(&spu_thread::rdata)), 1);
|
||||
build_get_tsc(c, stamp1);
|
||||
//c.jmp(fall);
|
||||
|
||||
c.bind(fall);
|
||||
|
||||
Label fall2 = c.newLabel();
|
||||
Label fail2 = c.newLabel();
|
||||
Label fail3 = c.newLabel();
|
||||
|
||||
// Lightened transaction: only compare and swap data
|
||||
c.bind(next);
|
||||
|
||||
// Try to "lock" reservation
|
||||
c.mov(x86::eax, 1);
|
||||
c.lock().xadd(x86::qword_ptr(x86::rbx), x86::rax);
|
||||
c.test(x86::eax, vm::rsrv_unique_lock);
|
||||
c.jnz(fail2);
|
||||
|
||||
// Check if already updated
|
||||
c.and_(x86::rax, -128);
|
||||
c.cmp(x86::rax, x86::r13);
|
||||
c.jne(fail2);
|
||||
|
||||
// Exclude some time spent on touching memory: stamp1 contains last success or failure
|
||||
c.mov(x86::rax, stamp1);
|
||||
c.sub(x86::rax, stamp0);
|
||||
build_get_tsc(c, stamp1);
|
||||
c.sub(stamp1, x86::rax);
|
||||
c.cmp(x86::rax, x86::qword_ptr(reinterpret_cast<u64>(&g_rtm_tx_limit2)));
|
||||
c.jae(fall2);
|
||||
|
||||
Label tx1 = build_transaction_enter(c, fall2, [&]()
|
||||
{
|
||||
c.add(x86::qword_ptr(args[2], ::offset32(&spu_thread::ftx) - ::offset32(&spu_thread::rdata)), 1);
|
||||
build_get_tsc(c);
|
||||
c.sub(x86::rax, stamp1);
|
||||
c.cmp(x86::rax, x86::qword_ptr(reinterpret_cast<u64>(&g_rtm_tx_limit2)));
|
||||
c.jae(fall2);
|
||||
c.test(x86::qword_ptr(x86::rbx), 127 - 1);
|
||||
c.jnz(fall2);
|
||||
});
|
||||
c.prefetchw(x86::byte_ptr(x86::rbp, 0));
|
||||
c.prefetchw(x86::byte_ptr(x86::rbp, 64));
|
||||
|
||||
// Check pause flag
|
||||
c.bt(x86::dword_ptr(args[2], ::offset32(&spu_thread::state) - ::offset32(&spu_thread::rdata)), static_cast<u32>(cpu_flag::pause));
|
||||
c.jc(fall2);
|
||||
c.mov(x86::rax, x86::qword_ptr(x86::rbx));
|
||||
c.and_(x86::rax, -128);
|
||||
c.cmp(x86::rax, x86::r13);
|
||||
c.jne(fail2);
|
||||
c.xbegin(tx1);
|
||||
|
||||
if (s_tsx_avx)
|
||||
{
|
||||
c.vxorps(x86::ymm0, x86::ymm0, x86::yword_ptr(x86::rbp, 0));
|
||||
c.vxorps(x86::ymm1, x86::ymm1, x86::yword_ptr(x86::rbp, 32));
|
||||
c.vxorps(x86::ymm2, x86::ymm2, x86::yword_ptr(x86::rbp, 64));
|
||||
c.vxorps(x86::ymm3, x86::ymm3, x86::yword_ptr(x86::rbp, 96));
|
||||
c.vorps(x86::ymm0, x86::ymm0, x86::ymm1);
|
||||
c.vorps(x86::ymm1, x86::ymm2, x86::ymm3);
|
||||
c.vorps(x86::ymm0, x86::ymm1, x86::ymm0);
|
||||
c.vptest(x86::ymm0, x86::ymm0);
|
||||
}
|
||||
else
|
||||
{
|
||||
c.xorps(x86::xmm0, x86::oword_ptr(x86::rbp, 0));
|
||||
c.xorps(x86::xmm1, x86::oword_ptr(x86::rbp, 16));
|
||||
c.xorps(x86::xmm2, x86::oword_ptr(x86::rbp, 32));
|
||||
c.xorps(x86::xmm3, x86::oword_ptr(x86::rbp, 48));
|
||||
c.xorps(x86::xmm4, x86::oword_ptr(x86::rbp, 64));
|
||||
c.xorps(x86::xmm5, x86::oword_ptr(x86::rbp, 80));
|
||||
c.xorps(x86::xmm6, x86::oword_ptr(x86::rbp, 96));
|
||||
c.xorps(x86::xmm7, x86::oword_ptr(x86::rbp, 112));
|
||||
c.orps(x86::xmm0, x86::xmm1);
|
||||
c.orps(x86::xmm2, x86::xmm3);
|
||||
c.orps(x86::xmm4, x86::xmm5);
|
||||
c.orps(x86::xmm6, x86::xmm7);
|
||||
c.orps(x86::xmm0, x86::xmm2);
|
||||
c.orps(x86::xmm4, x86::xmm6);
|
||||
c.orps(x86::xmm0, x86::xmm4);
|
||||
c.ptest(x86::xmm0, x86::xmm0);
|
||||
}
|
||||
|
||||
c.jnz(fail3);
|
||||
|
||||
if (s_tsx_avx)
|
||||
{
|
||||
c.vmovaps(x86::yword_ptr(x86::rbp, 0), x86::ymm4);
|
||||
c.vmovaps(x86::yword_ptr(x86::rbp, 32), x86::ymm5);
|
||||
c.vmovaps(x86::yword_ptr(x86::rbp, 64), x86::ymm6);
|
||||
c.vmovaps(x86::yword_ptr(x86::rbp, 96), x86::ymm7);
|
||||
}
|
||||
else
|
||||
{
|
||||
c.movaps(x86::oword_ptr(x86::rbp, 0), x86::xmm8);
|
||||
c.movaps(x86::oword_ptr(x86::rbp, 16), x86::xmm9);
|
||||
c.movaps(x86::oword_ptr(x86::rbp, 32), x86::xmm10);
|
||||
c.movaps(x86::oword_ptr(x86::rbp, 48), x86::xmm11);
|
||||
c.movaps(x86::oword_ptr(x86::rbp, 64), x86::xmm12);
|
||||
c.movaps(x86::oword_ptr(x86::rbp, 80), x86::xmm13);
|
||||
c.movaps(x86::oword_ptr(x86::rbp, 96), x86::xmm14);
|
||||
c.movaps(x86::oword_ptr(x86::rbp, 112), x86::xmm15);
|
||||
}
|
||||
|
||||
c.xend();
|
||||
c.add(x86::qword_ptr(args[2], ::offset32(&spu_thread::stx) - ::offset32(&spu_thread::rdata)), 1);
|
||||
c.lock().add(x86::qword_ptr(x86::rbx), 127);
|
||||
build_get_tsc(c);
|
||||
c.sub(x86::rax, stamp0);
|
||||
c.jmp(_ret);
|
||||
|
||||
// XABORT is expensive so try to finish with xend instead
|
||||
c.bind(fail3);
|
||||
c.bind(fail);
|
||||
|
||||
// Load previous data to store back to rdata
|
||||
if (s_tsx_avx)
|
||||
@ -745,12 +598,12 @@ const auto spu_putllc_tx = build_function_asm<u64(*)(u32 raddr, u64 rtime, void*
|
||||
c.add(x86::qword_ptr(args[2], ::offset32(&spu_thread::stx) - ::offset32(&spu_thread::rdata)), 1);
|
||||
c.jmp(fail2);
|
||||
|
||||
c.bind(fall2);
|
||||
c.bind(fall);
|
||||
c.mov(x86::rax, -1);
|
||||
c.jmp(_ret);
|
||||
|
||||
c.bind(fail2);
|
||||
c.lock().sub(x86::qword_ptr(x86::rbx), 1);
|
||||
c.lock().sub(x86::qword_ptr(x86::rbx), 64);
|
||||
c.bind(load);
|
||||
|
||||
// Store previous data back to rdata
|
||||
@ -814,14 +667,12 @@ const auto spu_putllc_tx = build_function_asm<u64(*)(u32 raddr, u64 rtime, void*
|
||||
c.ret();
|
||||
});
|
||||
|
||||
const auto spu_putlluc_tx = build_function_asm<u64(*)(u32 raddr, const void* rdata, cpu_thread* _spu)>([](asmjit::X86Assembler& c, auto& args)
|
||||
const auto spu_putlluc_tx = build_function_asm<u64(*)(u32 raddr, const void* rdata, u64* _stx, u64* _ftx)>([](asmjit::X86Assembler& c, auto& args)
|
||||
{
|
||||
using namespace asmjit;
|
||||
|
||||
Label fall = c.newLabel();
|
||||
Label _ret = c.newLabel();
|
||||
Label skip = c.newLabel();
|
||||
Label next = c.newLabel();
|
||||
|
||||
//if (utils::has_avx() && !s_tsx_avx)
|
||||
//{
|
||||
@ -876,94 +727,24 @@ const auto spu_putlluc_tx = build_function_asm<u64(*)(u32 raddr, const void* rda
|
||||
|
||||
// Alloc args[0] to stamp0
|
||||
const auto stamp0 = args[0];
|
||||
const auto stamp1 = args[1];
|
||||
build_get_tsc(c, stamp0);
|
||||
|
||||
// Begin transaction
|
||||
Label tx0 = build_transaction_enter(c, fall, [&]()
|
||||
Label tx1 = build_transaction_enter(c, fall, [&]()
|
||||
{
|
||||
c.add(x86::qword_ptr(args[2], ::offset32(&spu_thread::ftx)), 1);
|
||||
build_get_tsc(c, stamp1);
|
||||
c.sub(stamp1, stamp0);
|
||||
c.xor_(x86::eax, x86::eax);
|
||||
c.cmp(stamp1, x86::qword_ptr(reinterpret_cast<u64>(&g_rtm_tx_limit1)));
|
||||
c.jae(fall);
|
||||
});
|
||||
c.xbegin(tx0);
|
||||
c.test(x86::qword_ptr(x86::rbx), vm::rsrv_unique_lock);
|
||||
c.jnz(skip);
|
||||
|
||||
if (s_tsx_avx)
|
||||
{
|
||||
c.vmovaps(x86::yword_ptr(x86::rbp, 0), x86::ymm0);
|
||||
c.vmovaps(x86::yword_ptr(x86::rbp, 32), x86::ymm1);
|
||||
c.vmovaps(x86::yword_ptr(x86::rbp, 64), x86::ymm2);
|
||||
c.vmovaps(x86::yword_ptr(x86::rbp, 96), x86::ymm3);
|
||||
}
|
||||
else
|
||||
{
|
||||
c.movaps(x86::oword_ptr(x86::rbp, 0), x86::xmm0);
|
||||
c.movaps(x86::oword_ptr(x86::rbp, 16), x86::xmm1);
|
||||
c.movaps(x86::oword_ptr(x86::rbp, 32), x86::xmm2);
|
||||
c.movaps(x86::oword_ptr(x86::rbp, 48), x86::xmm3);
|
||||
c.movaps(x86::oword_ptr(x86::rbp, 64), x86::xmm4);
|
||||
c.movaps(x86::oword_ptr(x86::rbp, 80), x86::xmm5);
|
||||
c.movaps(x86::oword_ptr(x86::rbp, 96), x86::xmm6);
|
||||
c.movaps(x86::oword_ptr(x86::rbp, 112), x86::xmm7);
|
||||
}
|
||||
|
||||
c.sub(x86::qword_ptr(x86::rbx), -128);
|
||||
c.xend();
|
||||
c.add(x86::qword_ptr(args[2], ::offset32(&spu_thread::stx)), 1);
|
||||
build_get_tsc(c);
|
||||
c.sub(x86::rax, stamp0);
|
||||
c.jmp(_ret);
|
||||
|
||||
c.bind(skip);
|
||||
c.xend();
|
||||
c.add(x86::qword_ptr(args[2], ::offset32(&spu_thread::stx)), 1);
|
||||
build_get_tsc(c, stamp1);
|
||||
//c.jmp(fall);
|
||||
|
||||
c.bind(fall);
|
||||
|
||||
Label fall2 = c.newLabel();
|
||||
|
||||
// Lightened transaction
|
||||
c.bind(next);
|
||||
|
||||
// Lock reservation
|
||||
c.mov(x86::eax, 1);
|
||||
c.lock().xadd(x86::qword_ptr(x86::rbx), x86::rax);
|
||||
c.test(x86::eax, 127 - 1);
|
||||
c.jnz(fall2);
|
||||
|
||||
// Exclude some time spent on touching memory: stamp1 contains last success or failure
|
||||
c.mov(x86::rax, stamp1);
|
||||
c.sub(x86::rax, stamp0);
|
||||
c.cmp(x86::rax, x86::qword_ptr(reinterpret_cast<u64>(&g_rtm_tx_limit2)));
|
||||
c.jae(fall2);
|
||||
build_get_tsc(c, stamp1);
|
||||
c.sub(stamp1, x86::rax);
|
||||
|
||||
Label tx1 = build_transaction_enter(c, fall2, [&]()
|
||||
{
|
||||
c.add(x86::qword_ptr(args[2], ::offset32(&spu_thread::ftx)), 1);
|
||||
// ftx++;
|
||||
c.add(x86::qword_ptr(args[3]), 1);
|
||||
build_get_tsc(c);
|
||||
c.sub(x86::rax, stamp1);
|
||||
c.sub(x86::rax, stamp0);
|
||||
c.cmp(x86::rax, x86::qword_ptr(reinterpret_cast<u64>(&g_rtm_tx_limit2)));
|
||||
c.jae(fall2);
|
||||
c.jae(fall);
|
||||
});
|
||||
|
||||
c.prefetchw(x86::byte_ptr(x86::rbp, 0));
|
||||
c.prefetchw(x86::byte_ptr(x86::rbp, 64));
|
||||
|
||||
// Check pause flag
|
||||
c.bt(x86::dword_ptr(args[2], ::offset32(&cpu_thread::state)), static_cast<u32>(cpu_flag::pause));
|
||||
c.jc(fall2);
|
||||
// Check contention
|
||||
c.test(x86::qword_ptr(x86::rbx), 127 - 1);
|
||||
c.jc(fall2);
|
||||
// // Check pause flag
|
||||
// c.bt(x86::dword_ptr(args[2], ::offset32(&cpu_thread::state)), static_cast<u32>(cpu_flag::pause));
|
||||
// c.jc(fall);
|
||||
c.xbegin(tx1);
|
||||
|
||||
if (s_tsx_avx)
|
||||
@ -986,13 +767,14 @@ const auto spu_putlluc_tx = build_function_asm<u64(*)(u32 raddr, const void* rda
|
||||
}
|
||||
|
||||
c.xend();
|
||||
c.add(x86::qword_ptr(args[2], ::offset32(&spu_thread::stx)), 1);
|
||||
c.lock().add(x86::qword_ptr(x86::rbx), 127);
|
||||
c.lock().add(x86::qword_ptr(x86::rbx), 32);
|
||||
// stx++
|
||||
c.add(x86::qword_ptr(args[2]), 1);
|
||||
build_get_tsc(c);
|
||||
c.sub(x86::rax, stamp0);
|
||||
c.jmp(_ret);
|
||||
|
||||
c.bind(fall2);
|
||||
c.bind(fall);
|
||||
c.xor_(x86::eax, x86::eax);
|
||||
//c.jmp(_ret);
|
||||
|
||||
@ -1072,7 +854,7 @@ const extern auto spu_getllar_tx = build_function_asm<u64(*)(u32 raddr, void* rd
|
||||
c.bt(x86::dword_ptr(args[2], ::offset32(&cpu_thread::state)), static_cast<u32>(cpu_flag::pause));
|
||||
c.jc(fall);
|
||||
c.mov(x86::rax, x86::qword_ptr(x86::rbx));
|
||||
c.and_(x86::rax, ~vm::rsrv_shared_mask);
|
||||
c.and_(x86::rax, -128);
|
||||
c.cmp(x86::rax, args[3]);
|
||||
c.jne(fall);
|
||||
c.xbegin(tx0);
|
||||
@ -2784,6 +2566,23 @@ bool spu_thread::do_putllc(const spu_mfc_cmd& args)
|
||||
return cmp_rdata(rdata, vm::_ref<spu_rdata_t>(addr)) && res.compare_and_swap_test(rtime, rtime + 128);
|
||||
}
|
||||
|
||||
auto [_oldd, _ok] = res.fetch_op([&](u64& r)
|
||||
{
|
||||
if ((r & -128) != rtime || (r & 127))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
r += vm::rsrv_unique_lock;
|
||||
return true;
|
||||
});
|
||||
|
||||
if (!_ok)
|
||||
{
|
||||
// Already locked or updated: give up
|
||||
return false;
|
||||
}
|
||||
|
||||
if (g_use_rtm) [[likely]]
|
||||
{
|
||||
switch (u64 count = spu_putllc_tx(addr, rtime, rdata, to_write))
|
||||
@ -2799,14 +2598,14 @@ bool spu_thread::do_putllc(const spu_mfc_cmd& args)
|
||||
if (cmp_rdata(rdata, data))
|
||||
{
|
||||
mov_rdata(data, to_write);
|
||||
res += 127;
|
||||
res += 64;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
// Save previous data
|
||||
mov_rdata_nt(rdata, data);
|
||||
res -= 1;
|
||||
res -= 64;
|
||||
return false;
|
||||
});
|
||||
|
||||
@ -2865,23 +2664,6 @@ bool spu_thread::do_putllc(const spu_mfc_cmd& args)
|
||||
return true;
|
||||
}
|
||||
|
||||
auto [_oldd, _ok] = res.fetch_op([&](u64& r)
|
||||
{
|
||||
if ((r & -128) != rtime || (r & 127))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
r += vm::rsrv_unique_lock;
|
||||
return true;
|
||||
});
|
||||
|
||||
if (!_ok)
|
||||
{
|
||||
// Already locked or updated: give up
|
||||
return false;
|
||||
}
|
||||
|
||||
vm::_ref<atomic_t<u32>>(addr) += 0;
|
||||
|
||||
auto& super_data = *vm::get_super_ptr<spu_rdata_t>(addr);
|
||||
@ -2943,24 +2725,104 @@ void do_cell_atomic_128_store(u32 addr, const void* to_write)
|
||||
const auto cpu = get_current_cpu_thread();
|
||||
rsx::reservation_lock rsx_lock(addr, 128);
|
||||
|
||||
u64 shared_mem = vm::g_shmem[addr >> 16];
|
||||
|
||||
if (!shared_mem)
|
||||
{
|
||||
shared_mem = addr;
|
||||
}
|
||||
|
||||
if (g_use_rtm) [[likely]]
|
||||
{
|
||||
u64 result = spu_putlluc_tx(addr, to_write, cpu);
|
||||
auto& sdata = *vm::get_super_ptr<spu_rdata_t>(addr);
|
||||
auto& res = *utils::bless<atomic_t<u128>>(vm::g_reservations + (addr & 0xff80) / 2);
|
||||
|
||||
for (u64 j = 0;; j++)
|
||||
{
|
||||
auto [_oldd, _ok] = res.fetch_op([&](u128& r)
|
||||
{
|
||||
if (r & 127)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
r &= static_cast<u64>(r);
|
||||
r |= u128{shared_mem} << 64;
|
||||
r |= u128{vm::rsrv_unique_lock | vm::rsrv_putunc_flag};
|
||||
return true;
|
||||
});
|
||||
|
||||
if (_ok)
|
||||
{
|
||||
break;
|
||||
}
|
||||
|
||||
if (static_cast<u64>(_oldd) & vm::rsrv_putunc_flag && static_cast<u64>(_oldd >> 64) == shared_mem)
|
||||
{
|
||||
// Abandon store
|
||||
for (u64 k = 0;; k++)
|
||||
{
|
||||
if (res ^ _oldd)
|
||||
{
|
||||
break;
|
||||
}
|
||||
|
||||
if (auto cpu = get_current_cpu_thread(); cpu && cpu->state)
|
||||
{
|
||||
cpu->check_state();
|
||||
}
|
||||
else if (k < 15)
|
||||
{
|
||||
busy_wait(500);
|
||||
}
|
||||
else
|
||||
{
|
||||
std::this_thread::yield();
|
||||
}
|
||||
}
|
||||
|
||||
return static_cast<void>(cpu->test_stopped());
|
||||
}
|
||||
|
||||
if (auto cpu = get_current_cpu_thread(); cpu && cpu->state)
|
||||
{
|
||||
cpu->check_state();
|
||||
}
|
||||
else if (j < 15)
|
||||
{
|
||||
busy_wait(500);
|
||||
}
|
||||
else
|
||||
{
|
||||
std::this_thread::yield();
|
||||
}
|
||||
}
|
||||
|
||||
u64 result = 0;
|
||||
|
||||
if (cpu->state & cpu_flag::pause)
|
||||
{
|
||||
result = 0;
|
||||
}
|
||||
else if (cpu->id_type() != 2)
|
||||
{
|
||||
u64 stx, ftx;
|
||||
result = spu_putlluc_tx(addr, to_write, &stx, &ftx);
|
||||
}
|
||||
else
|
||||
{
|
||||
auto _spu = static_cast<spu_thread*>(cpu);
|
||||
result = spu_putlluc_tx(addr, to_write, &_spu->stx, &_spu->ftx);
|
||||
}
|
||||
|
||||
if (result == 0)
|
||||
{
|
||||
auto& sdata = *vm::get_super_ptr<spu_rdata_t>(addr);
|
||||
auto& res = vm::reservation_acquire(addr);
|
||||
|
||||
cpu_thread::suspend_all<+2>(cpu, {&res}, [&]
|
||||
cpu_thread::suspend_all<+2>(cpu, {}, [&]
|
||||
{
|
||||
mov_rdata_nt(sdata, *static_cast<const spu_rdata_t*>(to_write));
|
||||
res += 127;
|
||||
mov_rdata(sdata, *static_cast<const spu_rdata_t*>(to_write));
|
||||
});
|
||||
}
|
||||
|
||||
if (!result)
|
||||
{
|
||||
vm::reservation_acquire(addr) += 32;
|
||||
result = __rdtsc() - perf0.get();
|
||||
}
|
||||
|
||||
@ -3359,11 +3221,6 @@ bool spu_thread::process_mfc_cmd()
|
||||
// See previous ntime check.
|
||||
continue;
|
||||
}
|
||||
else
|
||||
{
|
||||
// If succeeded, only need to check unique lock bit
|
||||
test_mask = ~vm::rsrv_shared_mask;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
|
@ -32,7 +32,7 @@ namespace vm
|
||||
{
|
||||
rsrv_lock_mask = 127,
|
||||
rsrv_unique_lock = 64,
|
||||
rsrv_shared_mask = 63,
|
||||
rsrv_putunc_flag = 32,
|
||||
};
|
||||
|
||||
// Get reservation status for further atomic update: last update timestamp
|
||||
|
Loading…
Reference in New Issue
Block a user