
Implement cpu_thread::suspend_all

Remove Accurate PUTLLC option.
Implement fallback path for SPU transactions.
Nekotekina 2019-06-06 21:32:35 +03:00
parent 17d0dcb7a2
commit 5d45a3e47d
18 changed files with 843 additions and 362 deletions
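
The centerpiece of this commit, cpu_thread::suspend_all, is an RAII guard. A minimal usage sketch, assuming only the cpu_thread API added here (the PUTLLC/PUTLLUC fallback paths below use it exactly this way):

	// '_this' is the current cpu_thread, or nullptr outside of one
	{
		cpu_thread::suspend_all cpu_lock(_this);
		// Every other cpu_thread is now paused outside of guest code,
		// so shared state (e.g. a reservation line) can be patched safely.
	}
	// Destructor: clears cpu_flag::pause, notifies the paused threads,
	// and runs check_state() on the current thread.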


@ -190,18 +190,34 @@ asmjit::JitRuntime& asmjit::get_global_runtime()
return g_rt;
}
asmjit::Label asmjit::build_transaction_enter(asmjit::X86Assembler& c, asmjit::Label fallback)
void asmjit::build_transaction_enter(asmjit::X86Assembler& c, asmjit::Label fallback, const asmjit::X86Gp& ctr, uint less_than)
{
Label fall = c.newLabel();
Label begin = c.newLabel();
c.jmp(begin);
c.bind(fall);
c.test(x86::eax, _XABORT_RETRY);
c.jz(fallback);
if (less_than < 65)
{
c.add(ctr, 1);
c.test(x86::eax, _XABORT_RETRY);
c.jz(fallback);
}
else
{
// Count an attempt without RETRY flag as 65 normal attempts and continue
c.not_(x86::eax);
c.and_(x86::eax, _XABORT_RETRY);
c.shl(x86::eax, 5);
c.add(x86::eax, 1); // eax = RETRY ? 1 : 65
c.add(ctr, x86::rax);
}
c.cmp(ctr, less_than);
c.jae(fallback);
c.align(kAlignCode, 16);
c.bind(begin);
c.xbegin(fall);
return begin;
}
void asmjit::build_transaction_abort(asmjit::X86Assembler& c, unsigned char code)
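
The loop emitted above is roughly the following C++, assuming the RTM intrinsics from <immintrin.h>. This is a control-flow sketch only; in the real emitter the transactional body follows the xbegin inline rather than after a function return:

	#include <immintrin.h>

	// Returns true if a transaction was started, false if the budget ran out.
	bool transaction_enter(unsigned& ctr, unsigned less_than)
	{
		while (ctr < less_than)
		{
			const unsigned status = _xbegin();
			if (status == _XBEGIN_STARTED)
				return true; // transactional body runs after this point
			if (less_than < 65)
			{
				ctr += 1;
				if (!(status & _XABORT_RETRY))
					break; // hopeless abort: go to the fallback immediately
			}
			else
			{
				// An abort without the RETRY hint is charged as 65 attempts
				ctr += (status & _XABORT_RETRY) ? 1 : 65;
			}
		}
		return false; // jump to the fallback label
	}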


@ -43,7 +43,7 @@ namespace asmjit
asmjit::JitRuntime& get_global_runtime();
// Emit xbegin and adjacent loop
Label build_transaction_enter(X86Assembler& c, Label fallback);
void build_transaction_enter(X86Assembler& c, Label fallback, const X86Gp& ctr, uint less_than);
// Emit xabort
void build_transaction_abort(X86Assembler& c, unsigned char code);


@ -3,6 +3,7 @@
#include "Emu/System.h"
#include "Emu/IdManager.h"
#include "Emu/Cell/SPUThread.h"
#include "Emu/Cell/PPUThread.h"
#include "Emu/Cell/RawSPUThread.h"
#include "Emu/Cell/lv2/sys_mmapper.h"
#include "Emu/Cell/lv2/sys_event.h"
@ -1101,6 +1102,11 @@ bool handle_access_violation(u32 addr, bool is_writing, x64_context* context)
try
{
if (cpu)
{
vm::temporary_unlock(*cpu);
}
handled = rsx::g_access_violation_handler(addr, is_writing);
}
catch (const std::exception& e)
@ -1109,7 +1115,6 @@ bool handle_access_violation(u32 addr, bool is_writing, x64_context* context)
if (cpu)
{
vm::temporary_unlock(*cpu);
cpu->state += cpu_flag::dbg_pause;
if (cpu->test_stopped())
@ -1131,6 +1136,10 @@ bool handle_access_violation(u32 addr, bool is_writing, x64_context* context)
return true;
}
if (cpu && cpu->test_stopped())
{
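// Process pending state flags; the result is intentionally ignored here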
}
}
auto code = (const u8*)RIP(context);


@ -273,6 +273,177 @@ void shared_cond::imp_notify() noexcept
balanced_awaken<true>(m_cvx32, utils::popcnt32(wait_mask));
}
void shared_cond::wait_all() noexcept
{
// Try to acquire waiting state without locking but only if there are other locks
const auto [old_, result] = m_cvx32.fetch_op([](u64& cvx32) -> u64
{
// Check waiting alone
if ((cvx32 & 0xffffffff) == 0)
{
return 0;
}
// Combine used bits and invert to find least significant bit unused
const u32 slot = utils::cnttz64(~((cvx32 & 0xffffffff) | (cvx32 >> 32)), true);
// Set waiting bit (does nothing if all slots are used)
cvx32 |= (1ull << slot) & 0xffffffff;
return 1ull << slot;
});
if (!result)
{
return;
}
if (result > 0xffffffffu)
{
// All slots are used, fallback to spin wait
while (m_cvx32 & 0xffffffff)
{
busy_wait();
}
return;
}
const u64 wait_bit = result;
const u64 lock_bit = wait_bit | (wait_bit << 32);
balanced_wait_until(m_cvx32, -1, [&](u64& cvx32, auto... ret) -> int
{
if ((cvx32 & wait_bit) == 0)
{
// Remove signal and unlock at once
cvx32 &= ~lock_bit;
return +1;
}
if constexpr (sizeof...(ret))
{
cvx32 &= ~lock_bit;
return -1;
}
return 0;
});
}
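
As far as the bit operations suggest, m_cvx32 packs 32 two-bit slots: bit i is the low (wait/signal) half and bit i+32 the high (lock) half. A sketch of the assumed per-slot encoding; the constant names mirror the c_* values used below, but are assumptions, not copied from the header:

	// slot i occupies bits i and i+32 of the packed 64-bit word
	constexpr u64 c_wait = 1;                // 01: slot went to sleep, awaiting a signal
	constexpr u64 c_sig  = 1ull << 32;       // 10: signal delivered, waiter not yet resumed
	constexpr u64 c_lock = 1 | (1ull << 32); // 11: slot acquired, not waiting yet
	// 00: slot free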
bool shared_cond::wait_all(shared_cond::shared_lock& lock) noexcept
{
AUDIT(lock.m_this == this);
if (lock.m_slot >= 32)
{
// Invalid argument, assume notified
return true;
}
const u64 wait_bit = c_wait << lock.m_slot;
const u64 lock_bit = c_lock << lock.m_slot;
// Try to acquire waiting state only if there are other locks
const auto [old_, not_alone] = m_cvx32.fetch_op([&](u64& cvx32)
{
// Check locking alone
if (((cvx32 >> 32) & cvx32) == (lock_bit >> 32))
{
return false;
}
// c_lock -> c_wait, c_sig -> unlock
cvx32 &= ~(lock_bit & ~wait_bit);
return true;
});
if (!not_alone)
{
return false;
}
else
{
// Set invalid slot to acknowledge unlocking
lock.m_slot = 33;
}
if ((old_ & wait_bit) == 0)
{
// Already signaled, return without waiting
return true;
}
balanced_wait_until(m_cvx32, -1, [&](u64& cvx32, auto... ret) -> int
{
if ((cvx32 & wait_bit) == 0)
{
// Remove signal and unlock at once
cvx32 &= ~lock_bit;
return +1;
}
if constexpr (sizeof...(ret))
{
cvx32 &= ~lock_bit;
return -1;
}
return 0;
});
return true;
}
bool shared_cond::notify_all(shared_cond::shared_lock& lock) noexcept
{
AUDIT(lock.m_this == this);
if (lock.m_slot >= 32)
{
// Invalid argument
return false;
}
const u64 slot_mask = c_sig << lock.m_slot;
auto [old, ok] = m_cvx32.fetch_op([&](u64& cvx32)
{
if (((cvx32 << 32) & cvx32) != slot_mask)
{
return false;
}
if (const u64 sig_mask = cvx32 & 0xffffffff)
{
cvx32 &= (0xffffffffull << 32) & ~slot_mask;
cvx32 |= (sig_mask << 32) & ~slot_mask;
return true;
}
return false;
});
if (!ok)
{
// Not an exclusive reader
return false;
}
// Set invalid slot to acknowledge unlocking
lock.m_slot = 34;
// Determine if some waiters need a syscall notification
const u64 wait_mask = old & (~old >> 32);
if (UNLIKELY(!wait_mask))
{
return true;
}
balanced_awaken<true>(m_cvx32, utils::popcnt32(wait_mask));
return true;
}
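
The exclusivity test in notify_all reads compactly under the layout assumed above: shifting the word left by 32 aligns each slot's low bit with its high bit, so the AND keeps exactly the slots in the locked (11) state:

	const u64 locked = (cvx32 << 32) & cvx32;     // high-half mask of fully locked slots
	const bool sole_locker = locked == slot_mask; // our slot must be the only one locked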
bool lf_queue_base::wait(u64 _timeout)
{
auto _old = m_head.compare_and_swap(0, 1);


@ -206,7 +206,7 @@ class shared_cond
m_slot = m_this->m_cvx32.atomic_op([](u64& cvx32)
{
// Combine used bits and invert to find least significant bit unused
const u32 slot = utils::cnttz32(~((cvx32 & 0xffffffff) | (cvx32 >> 32)), true);
const u32 slot = utils::cnttz64(~((cvx32 & 0xffffffff) | (cvx32 >> 32)), true);
// Set lock bits (does nothing if all slots are used)
const u64 bit = (1ull << slot) & 0xffffffff;
@ -217,6 +217,13 @@ class shared_cond
shared_lock(const shared_lock&) = delete;
shared_lock(shared_lock&& rhs)
: m_this(rhs.m_this)
, m_slot(rhs.m_slot)
{
rhs.m_slot = 32;
}
shared_lock& operator=(const shared_lock&) = delete;
~shared_lock()
@ -261,6 +268,10 @@ public:
return imp_wait(lock.m_slot, usec_timeout);
}
void wait_all() noexcept;
bool wait_all(shared_lock& lock) noexcept;
void notify_all() noexcept
{
if (LIKELY(!m_cvx32))
@ -268,4 +279,6 @@ public:
imp_notify();
}
bool notify_all(shared_lock& lock) noexcept;
};

llvm

@ -1 +1 @@
Subproject commit b860b5e8f4ee90d6eb567d83ce8ed1a3e71e496f
Subproject commit 99b5284463025849c59067e79a3c08899049757e


@ -19,10 +19,13 @@ void fmt_class_string<cpu_flag>::format(std::string& out, u64 arg)
{
case cpu_flag::stop: return "STOP";
case cpu_flag::exit: return "EXIT";
case cpu_flag::wait: return "w";
case cpu_flag::pause: return "p";
case cpu_flag::suspend: return "s";
case cpu_flag::ret: return "ret";
case cpu_flag::signal: return "sig";
case cpu_flag::memory: return "mem";
case cpu_flag::jit_return: return "JIT";
case cpu_flag::dbg_global_pause: return "G-PAUSE";
case cpu_flag::dbg_global_stop: return "G-EXIT";
case cpu_flag::dbg_pause: return "PAUSE";
@ -42,10 +45,43 @@ void fmt_class_string<bs_t<cpu_flag>>::format(std::string& out, u64 arg)
thread_local cpu_thread* g_tls_current_cpu_thread = nullptr;
// For coordination and notification
alignas(64) shared_cond g_cpu_array_lock;
// For cpu_flag::pause bit setting/removing
alignas(64) shared_mutex g_cpu_pause_lock;
// For cpu_flag::pause
alignas(64) atomic_t<u64> g_cpu_pause_ctr{0};
// Semaphore for global thread array (global counter)
alignas(64) atomic_t<u32> g_cpu_array_sema{0};
// Semaphore subdivision for each array slot (64 x N in total)
atomic_t<u64> g_cpu_array_bits[6]{};
// All registered threads
atomic_t<cpu_thread*> g_cpu_array[sizeof(g_cpu_array_bits) * 8]{};
template <typename F>
void for_all_cpu(F&& func) noexcept
{
for (u32 i = 0; i < ::size32(g_cpu_array_bits); i++)
{
for (u64 bits = g_cpu_array_bits[i]; bits; bits &= bits - 1)
{
const u64 index = i * 64 + utils::cnttz64(bits, true);
if (cpu_thread* cpu = g_cpu_array[index].load())
{
func(cpu);
}
}
}
}
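
for_all_cpu walks each 64-bit occupancy mask with the standard lowest-set-bit idiom: bits &= bits - 1 clears the bit just visited, so every registered slot is enumerated exactly once. For example:

	for (u64 bits = 0b101100; bits; bits &= bits - 1)
	{
		const u64 index = utils::cnttz64(bits, true); // yields 2, 3, 5 for this mask
		// visit g_cpu_array[i * 64 + index]
	}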
void cpu_thread::operator()()
{
state -= cpu_flag::exit;
g_tls_current_cpu_thread = this;
if (g_cfg.core.thread_scheduler_enabled)
@ -58,6 +94,48 @@ void cpu_thread::operator()()
thread_ctrl::set_native_priority(-1);
}
// Register thread in g_cpu_array
if (!g_cpu_array_sema.try_inc(sizeof(g_cpu_array_bits) * 8))
{
LOG_FATAL(GENERAL, "Too many threads");
Emu.Pause();
return;
}
u64 array_slot = -1;
for (u32 i = 0;; i = (i + 1) % ::size32(g_cpu_array_bits))
{
if (LIKELY(~g_cpu_array_bits[i]))
{
const u64 found = g_cpu_array_bits[i].atomic_op([](u64& bits) -> u64
{
// Find empty array slot and set its bit
if (LIKELY(~bits))
{
const u64 bit = utils::cnttz64(~bits, true);
bits |= 1ull << bit;
return bit;
}
return 64;
});
if (LIKELY(found < 64))
{
// Compute the global slot index
array_slot = i * 64 + found;
break;
}
}
}
// Register and wait if necessary
verify("g_cpu_array[...] -> this" HERE), g_cpu_array[array_slot].exchange(this) == nullptr;
state += cpu_flag::wait;
g_cpu_array_lock.wait_all();
// Check thread status
while (!(state & (cpu_flag::exit + cpu_flag::dbg_global_stop)))
{
@ -86,6 +164,13 @@ void cpu_thread::operator()()
thread_ctrl::wait();
}
// Unregister and wait if necessary
state += cpu_flag::wait;
verify("g_cpu_array[...] -> null" HERE), g_cpu_array[array_slot].exchange(nullptr) == this;
g_cpu_array_bits[array_slot / 64] &= ~(1ull << (array_slot % 64));
g_cpu_array_sema--;
g_cpu_array_lock.wait_all();
}
void cpu_thread::on_abort()
@ -105,7 +190,7 @@ cpu_thread::cpu_thread(u32 id)
g_threads_created++;
}
bool cpu_thread::check_state()
bool cpu_thread::check_state() noexcept
{
#ifdef WITH_GDB_DEBUGGER
if (state & cpu_flag::dbg_pause)
@ -117,6 +202,11 @@ bool cpu_thread::check_state()
bool cpu_sleep_called = false;
bool cpu_flag_memory = false;
if (!(state & cpu_flag::wait))
{
state += cpu_flag::wait;
}
while (true)
{
if (state & cpu_flag::memory)
@ -131,8 +221,9 @@ bool cpu_thread::check_state()
state -= cpu_flag::memory;
}
if (state & cpu_flag::exit + cpu_flag::jit_return + cpu_flag::dbg_global_stop)
if (state & (cpu_flag::exit + cpu_flag::jit_return + cpu_flag::dbg_global_stop))
{
state += cpu_flag::wait;
return true;
}
@ -141,7 +232,24 @@ bool cpu_thread::check_state()
cpu_sleep_called = false;
}
if (!is_paused())
const auto [state0, escape] = state.fetch_op([&](bs_t<cpu_flag>& flags)
{
// Check pause flags which hold thread inside check_state
if (flags & (cpu_flag::pause + cpu_flag::suspend + cpu_flag::dbg_global_pause + cpu_flag::dbg_pause))
{
return false;
}
// Atomically clear wait flag and escape
if (!(flags & (cpu_flag::exit + cpu_flag::jit_return + cpu_flag::dbg_global_stop + cpu_flag::ret + cpu_flag::stop)))
{
flags -= cpu_flag::wait;
}
return true;
});
if (escape)
{
if (cpu_flag_memory)
{
@ -150,14 +258,43 @@ bool cpu_thread::check_state()
break;
}
else if (!cpu_sleep_called && state & cpu_flag::suspend)
else if (!cpu_sleep_called && state0 & cpu_flag::suspend)
{
cpu_sleep();
cpu_sleep_called = true;
continue;
}
thread_ctrl::wait();
if (state & cpu_flag::wait)
{
// Spin briefly before resorting to thread_ctrl::wait
for (u32 i = 0; i < 10; i++)
{
if (state0 & (cpu_flag::pause + cpu_flag::suspend))
{
busy_wait(500);
}
else
{
break;
}
}
if (!(state0 & (cpu_flag::pause + cpu_flag::suspend)))
{
continue;
}
}
if (state0 & (cpu_flag::suspend + cpu_flag::dbg_global_pause + cpu_flag::dbg_pause))
{
thread_ctrl::wait();
}
else
{
// If only cpu_flag::pause was set, notification won't arrive
g_cpu_array_lock.wait_all();
}
}
const auto state_ = state.load();
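
state.fetch_op above is RPCS3's atomic read-modify-write helper: it applies the lambda to a copy of the value in a CAS loop and returns the pair {old value, lambda result}. A sketch of the assumed semantics over std::atomic:

	#include <atomic>
	#include <utility>

	template <typename T, typename F>
	auto fetch_op(std::atomic<T>& a, F func)
	{
		T old = a.load();
		for (;;)
		{
			T copy = old;
			auto ret = func(copy); // the lambda may mutate the copy
			if (a.compare_exchange_weak(old, copy))
				return std::make_pair(old, ret);
		}
	}

So the escape path clears cpu_flag::wait and leaves check_state in one indivisible step, with no window where the thread looks runnable while a pause flag is still pending.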
@ -196,3 +333,90 @@ std::string cpu_thread::dump() const
{
return fmt::format("Type: %s\n" "State: %s\n", typeid(*this).name(), state.load());
}
cpu_thread::suspend_all::suspend_all(cpu_thread* _this) noexcept
: m_lock(g_cpu_array_lock.try_shared_lock())
, m_this(_this)
{
// TODO
if (!m_lock)
{
LOG_FATAL(GENERAL, "g_cpu_array_lock: too many concurrent accesses");
Emu.Pause();
return;
}
if (m_this)
{
m_this->state += cpu_flag::wait;
}
g_cpu_pause_ctr++;
reader_lock lock(g_cpu_pause_lock);
for_all_cpu([](cpu_thread* cpu)
{
cpu->state += cpu_flag::pause;
});
busy_wait(500);
while (true)
{
bool ok = true;
for_all_cpu([&](cpu_thread* cpu)
{
if (!(cpu->state & cpu_flag::wait))
{
ok = false;
}
});
if (LIKELY(ok))
{
break;
}
busy_wait(500);
}
}
cpu_thread::suspend_all::~suspend_all()
{
// Make sure the last thread does the cleanup and notifies others
u64 pause_ctr = 0;
while ((pause_ctr = g_cpu_pause_ctr), !g_cpu_array_lock.wait_all(m_lock))
{
if (pause_ctr)
{
std::lock_guard lock(g_cpu_pause_lock);
// Detect possible unfortunate reordering of flag clearing after suspend_all's reader lock
if (g_cpu_pause_ctr != pause_ctr)
{
continue;
}
for_all_cpu([&](cpu_thread* cpu)
{
if (g_cpu_pause_ctr == pause_ctr)
{
cpu->state -= cpu_flag::pause;
}
});
}
if (g_cpu_array_lock.notify_all(m_lock))
{
break;
}
}
if (m_this)
{
m_this->check_state();
}
}


@ -2,12 +2,15 @@
#include "../Utilities/Thread.h"
#include "../Utilities/bit_set.h"
#include "../Utilities/cond.h"
// Thread state flags
enum class cpu_flag : u32
{
stop, // Thread not running (HLE, initial state)
exit, // Irreversible exit
wait, // Indicates waiting state, set by the thread itself
pause, // Thread suspended by suspend_all technique
suspend, // Thread suspended
ret, // Callback return requested
signal, // Thread received a signal (HLE)
@ -39,15 +42,15 @@ public:
const u32 id;
// Public thread state
atomic_bs_t<cpu_flag> state{+cpu_flag::stop};
atomic_bs_t<cpu_flag> state{cpu_flag::stop + cpu_flag::wait};
// Process thread state, return true if the checker must return
bool check_state();
bool check_state() noexcept;
// Process thread state (pause)
[[nodiscard]] bool test_stopped()
{
if (UNLIKELY(state))
if (state)
{
if (check_state())
{
@ -99,6 +102,20 @@ public:
// Callback for vm::temporary_unlock
virtual void cpu_unmem() {}
// Scoped locker: suspends all other cpu_threads for the object's lifetime
class suspend_all
{
decltype(std::declval<shared_cond&>().try_shared_lock()) m_lock;
cpu_thread* m_this;
public:
suspend_all(cpu_thread* _this) noexcept;
suspend_all(const suspend_all&) = delete;
suspend_all& operator=(const suspend_all&) = delete;
~suspend_all();
};
};
inline cpu_thread* get_current_cpu_thread() noexcept


@ -1064,11 +1064,12 @@ const auto ppu_stwcx_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime, u64 rd
c.lea(x86::r11, x86::qword_ptr(x86::r11, args[0]));
c.shr(args[0], 7);
c.lea(x86::r10, x86::qword_ptr(x86::r10, args[0], 3));
c.xor_(args[0].r32(), args[0].r32());
c.bswap(args[2].r32());
c.bswap(args[3].r32());
// Begin transaction
Label begin = build_transaction_enter(c, fall);
build_transaction_enter(c, fall, args[0], 16);
c.mov(x86::rax, x86::qword_ptr(x86::r10));
c.and_(x86::rax, -128);
c.cmp(x86::rax, args[1]);
@ -1184,11 +1185,12 @@ const auto ppu_stdcx_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime, u64 rd
c.lea(x86::r11, x86::qword_ptr(x86::r11, args[0]));
c.shr(args[0], 7);
c.lea(x86::r10, x86::qword_ptr(x86::r10, args[0], 3));
c.xor_(args[0].r32(), args[0].r32());
c.bswap(args[2]);
c.bswap(args[3]);
// Begin transaction
Label begin = build_transaction_enter(c, fall);
build_transaction_enter(c, fall, args[0], 16);
c.mov(x86::rax, x86::qword_ptr(x86::r10));
c.and_(x86::rax, -128);
c.cmp(x86::rax, args[1]);


@ -1349,6 +1349,12 @@ void spu_stop(spu_thread* _spu, u32 code)
{
spu_runtime::g_escape(_spu);
}
if (_spu->test_stopped())
{
_spu->pc += 4;
spu_runtime::g_escape(_spu);
}
}
void spu_recompiler::STOP(spu_opcode_t op)
@ -1407,7 +1413,7 @@ void spu_recompiler::MFSPR(spu_opcode_t op)
c->movdqa(SPU_OFF_128(gpr, op.rt), vr);
}
static s64 spu_rdch(spu_thread* _spu, u32 ch)
static u32 spu_rdch(spu_thread* _spu, u32 ch)
{
const s64 result = _spu->get_ch_value(ch);
@ -1416,7 +1422,13 @@ static s64 spu_rdch(spu_thread* _spu, u32 ch)
spu_runtime::g_escape(_spu);
}
return result;
if (_spu->test_stopped())
{
_spu->pc += 4;
spu_runtime::g_escape(_spu);
}
return static_cast<u32>(result & 0xffffffff);
}
void spu_recompiler::RDCH(spu_opcode_t op)
@ -2319,14 +2331,26 @@ static void spu_wrch(spu_thread* _spu, u32 ch, u32 value)
{
spu_runtime::g_escape(_spu);
}
if (_spu->test_stopped())
{
_spu->pc += 4;
spu_runtime::g_escape(_spu);
}
}
static void spu_wrch_mfc(spu_thread* _spu, spu_function_t _ret)
static void spu_wrch_mfc(spu_thread* _spu)
{
if (!_spu->process_mfc_cmd())
{
spu_runtime::g_escape(_spu);
}
if (_spu->test_stopped())
{
_spu->pc += 4;
spu_runtime::g_escape(_spu);
}
}
void spu_recompiler::WRCH(spu_opcode_t op)


@ -167,6 +167,13 @@ bool spu_interpreter::RDCH(spu_thread& spu, spu_opcode_t op)
}
spu.gpr[op.rt] = v128::from32r(static_cast<u32>(result));
if (spu.state)
{
spu.pc += 4;
return false;
}
return true;
}
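
Returning false here does not signal an error: it bounces execution out to the interpreter dispatcher so the thread state can be processed. Because the channel read itself already completed, pc is advanced past the instruction first; execution resumes at the next instruction rather than re-executing the read. Schematically (a hypothetical dispatcher contract, for illustration only):

	// true  -> fall through to the next instruction
	// false -> leave the dispatch loop; spu.pc already holds the resume address
	//          and check_state() decides whether to continue, pause, or stop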
@ -414,7 +421,18 @@ bool spu_interpreter::MTSPR(spu_thread& spu, spu_opcode_t op)
bool spu_interpreter::WRCH(spu_thread& spu, spu_opcode_t op)
{
return spu.set_ch_value(op.ra, spu.gpr[op.rt]._u32[3]);
if (!spu.set_ch_value(op.ra, spu.gpr[op.rt]._u32[3]))
{
return false;
}
if (spu.state)
{
spu.pc += 4;
return false;
}
return true;
}
bool spu_interpreter::BIZ(spu_thread& spu, spu_opcode_t op)


@ -5125,34 +5125,30 @@ public:
call("spu_unknown", &exec_unk, m_thread, m_ir->getInt32(op_unk.opcode));
}
static bool exec_stop(spu_thread* _spu, u32 code)
static void exec_stop(spu_thread* _spu, u32 code)
{
return _spu->stop_and_signal(code);
if (!_spu->stop_and_signal(code))
{
spu_runtime::g_escape(_spu);
}
if (_spu->test_stopped())
{
_spu->pc += 4;
spu_runtime::g_escape(_spu);
}
}
void STOP(spu_opcode_t op) //
{
if (m_interp_magn)
{
const auto succ = call("spu_syscall", &exec_stop, m_thread, m_ir->CreateAnd(m_interp_op, m_ir->getInt32(0x3fff)));
const auto next = llvm::BasicBlock::Create(m_context, "", m_function);
const auto stop = llvm::BasicBlock::Create(m_context, "", m_function);
m_ir->CreateCondBr(succ, next, stop);
m_ir->SetInsertPoint(stop);
m_ir->CreateRetVoid();
m_ir->SetInsertPoint(next);
call("spu_syscall", &exec_stop, m_thread, m_ir->CreateAnd(m_interp_op, m_ir->getInt32(0x3fff)));
return;
}
update_pc();
const auto succ = call("spu_syscall", &exec_stop, m_thread, m_ir->getInt32(op.opcode & 0x3fff));
const auto next = llvm::BasicBlock::Create(m_context, "", m_function);
const auto stop = llvm::BasicBlock::Create(m_context, "", m_function);
m_ir->CreateCondBr(succ, next, stop);
m_ir->SetInsertPoint(stop);
m_ir->CreateStore(m_ir->getFalse(), m_fake_global1, true);
m_ir->CreateBr(next);
m_ir->SetInsertPoint(next);
call("spu_syscall", &exec_stop, m_thread, m_ir->getInt32(op.opcode & 0x3fff));
if (g_cfg.core.spu_block_size == spu_block_size_type::safe)
{
@ -5167,28 +5163,35 @@ public:
{
if (m_interp_magn)
{
const auto succ = call("spu_syscall", &exec_stop, m_thread, m_ir->getInt32(0x3fff));
const auto next = llvm::BasicBlock::Create(m_context, "", m_function);
const auto stop = llvm::BasicBlock::Create(m_context, "", m_function);
m_ir->CreateCondBr(succ, next, stop);
m_ir->SetInsertPoint(stop);
m_ir->CreateRetVoid();
m_ir->SetInsertPoint(next);
call("spu_syscall", &exec_stop, m_thread, m_ir->getInt32(0x3fff));
return;
}
STOP(spu_opcode_t{0x3fff});
}
static s64 exec_rdch(spu_thread* _spu, u32 ch)
static u32 exec_rdch(spu_thread* _spu, u32 ch)
{
return _spu->get_ch_value(ch);
const s64 result = _spu->get_ch_value(ch);
if (result < 0)
{
spu_runtime::g_escape(_spu);
}
if (_spu->test_stopped())
{
_spu->pc += 4;
spu_runtime::g_escape(_spu);
}
return static_cast<u32>(result & 0xffffffff);
}
static s64 exec_read_in_mbox(spu_thread* _spu)
static u32 exec_read_in_mbox(spu_thread* _spu)
{
// TODO
return _spu->get_ch_value(SPU_RdInMbox);
return exec_rdch(_spu, SPU_RdInMbox);
}
static u32 exec_read_dec(spu_thread* _spu)
@ -5203,7 +5206,7 @@ public:
return res;
}
static s64 exec_read_events(spu_thread* _spu)
static u32 exec_read_events(spu_thread* _spu)
{
if (const u32 events = _spu->get_events())
{
@ -5211,7 +5214,7 @@ public:
}
// TODO
return _spu->get_ch_value(SPU_RdEventStat);
return exec_rdch(_spu, SPU_RdEventStat);
}
llvm::Value* get_rdch(spu_opcode_t op, u32 off, bool atomic)
@ -5234,20 +5237,17 @@ public:
const auto _cur = m_ir->GetInsertBlock();
const auto done = llvm::BasicBlock::Create(m_context, "", m_function);
const auto wait = llvm::BasicBlock::Create(m_context, "", m_function);
const auto stop = llvm::BasicBlock::Create(m_context, "", m_function);
m_ir->CreateCondBr(m_ir->CreateICmpSLT(val0, m_ir->getInt64(0)), done, wait);
const auto cond = m_ir->CreateICmpSLT(val0, m_ir->getInt64(0));
val0 = m_ir->CreateTrunc(val0, get_type<u32>());
m_ir->CreateCondBr(cond, done, wait);
m_ir->SetInsertPoint(wait);
const auto val1 = call("spu_read_channel", &exec_rdch, m_thread, m_ir->getInt32(op.ra));
m_ir->CreateCondBr(m_ir->CreateICmpSLT(val1, m_ir->getInt64(0)), stop, done);
m_ir->SetInsertPoint(stop);
m_ir->CreateStore(m_ir->getFalse(), m_fake_global1, true);
m_ir->CreateBr(done);
m_ir->SetInsertPoint(done);
const auto rval = m_ir->CreatePHI(get_type<u64>(), 2);
const auto rval = m_ir->CreatePHI(get_type<u32>(), 2);
rval->addIncoming(val0, _cur);
rval->addIncoming(val1, wait);
rval->addIncoming(m_ir->getInt64(0), stop);
return m_ir->CreateTrunc(rval, get_type<u32>());
return rval;
}
void RDCH(spu_opcode_t op) //
@ -5257,13 +5257,6 @@ public:
if (m_interp_magn)
{
res.value = call("spu_read_channel", &exec_rdch, m_thread, get_imm<u32>(op.ra).value);
const auto next = llvm::BasicBlock::Create(m_context, "", m_function);
const auto stop = llvm::BasicBlock::Create(m_context, "", m_function);
m_ir->CreateCondBr(m_ir->CreateICmpSLT(res.value, m_ir->getInt64(0)), stop, next);
m_ir->SetInsertPoint(stop);
m_ir->CreateRetVoid();
m_ir->SetInsertPoint(next);
res.value = m_ir->CreateTrunc(res.value, get_type<u32>());
set_vr(op.rt, insert(splat<u32[4]>(0), 3, res));
return;
}
@ -5279,14 +5272,6 @@ public:
{
update_pc();
res.value = call("spu_read_in_mbox", &exec_read_in_mbox, m_thread);
const auto next = llvm::BasicBlock::Create(m_context, "", m_function);
const auto stop = llvm::BasicBlock::Create(m_context, "", m_function);
m_ir->CreateCondBr(m_ir->CreateICmpSLT(res.value, m_ir->getInt64(0)), stop, next);
m_ir->SetInsertPoint(stop);
m_ir->CreateStore(m_ir->getFalse(), m_fake_global1, true);
m_ir->CreateBr(next);
m_ir->SetInsertPoint(next);
res.value = m_ir->CreateTrunc(res.value, get_type<u32>());
break;
}
case MFC_RdTagStat:
@ -5333,14 +5318,6 @@ public:
{
update_pc();
res.value = call("spu_read_events", &exec_read_events, m_thread);
const auto next = llvm::BasicBlock::Create(m_context, "", m_function);
const auto stop = llvm::BasicBlock::Create(m_context, "", m_function);
m_ir->CreateCondBr(m_ir->CreateICmpSLT(res.value, m_ir->getInt64(0)), stop, next);
m_ir->SetInsertPoint(stop);
m_ir->CreateStore(m_ir->getFalse(), m_fake_global1, true);
m_ir->CreateBr(next);
m_ir->SetInsertPoint(next);
res.value = m_ir->CreateTrunc(res.value, get_type<u32>());
break;
}
case SPU_RdMachStat:
@ -5353,14 +5330,6 @@ public:
{
update_pc();
res.value = call("spu_read_channel", &exec_rdch, m_thread, m_ir->getInt32(op.ra));
const auto next = llvm::BasicBlock::Create(m_context, "", m_function);
const auto stop = llvm::BasicBlock::Create(m_context, "", m_function);
m_ir->CreateCondBr(m_ir->CreateICmpSLT(res.value, m_ir->getInt64(0)), stop, next);
m_ir->SetInsertPoint(stop);
m_ir->CreateStore(m_ir->getFalse(), m_fake_global1, true);
m_ir->CreateBr(next);
m_ir->SetInsertPoint(next);
res.value = m_ir->CreateTrunc(res.value, get_type<u32>());
break;
}
}
@ -5471,14 +5440,18 @@ public:
set_vr(op.rt, insert(splat<u32[4]>(0), 3, res));
}
static bool exec_wrch(spu_thread* _spu, u32 ch, u32 value)
static void exec_wrch(spu_thread* _spu, u32 ch, u32 value)
{
return _spu->set_ch_value(ch, value);
}
if (!_spu->set_ch_value(ch, value))
{
spu_runtime::g_escape(_spu);
}
static void exec_mfc(spu_thread* _spu)
{
return _spu->do_mfc();
if (_spu->test_stopped())
{
_spu->pc += 4;
spu_runtime::g_escape(_spu);
}
}
static void exec_list_unstall(spu_thread* _spu, u32 tag)
@ -5491,12 +5464,21 @@ public:
}
}
return exec_mfc(_spu);
_spu->do_mfc();
}
static bool exec_mfc_cmd(spu_thread* _spu)
static void exec_mfc_cmd(spu_thread* _spu)
{
return _spu->process_mfc_cmd();
if (!_spu->process_mfc_cmd())
{
spu_runtime::g_escape(_spu);
}
if (_spu->test_stopped())
{
_spu->pc += 4;
spu_runtime::g_escape(_spu);
}
}
void WRCH(spu_opcode_t op) //
@ -5505,13 +5487,7 @@ public:
if (m_interp_magn)
{
const auto succ = call("spu_write_channel", &exec_wrch, m_thread, get_imm<u32>(op.ra).value, val.value);
const auto next = llvm::BasicBlock::Create(m_context, "", m_function);
const auto stop = llvm::BasicBlock::Create(m_context, "", m_function);
m_ir->CreateCondBr(succ, next, stop);
m_ir->SetInsertPoint(stop);
m_ir->CreateRetVoid();
m_ir->SetInsertPoint(next);
call("spu_write_channel", &exec_wrch, m_thread, get_imm<u32>(op.ra).value, val.value);
return;
}
@ -5922,14 +5898,7 @@ public:
}
update_pc();
const auto succ = call("spu_write_channel", &exec_wrch, m_thread, m_ir->getInt32(op.ra), val.value);
const auto next = llvm::BasicBlock::Create(m_context, "", m_function);
const auto stop = llvm::BasicBlock::Create(m_context, "", m_function);
m_ir->CreateCondBr(succ, next, stop);
m_ir->SetInsertPoint(stop);
m_ir->CreateStore(m_ir->getFalse(), m_fake_global1, true);
m_ir->CreateBr(next);
m_ir->SetInsertPoint(next);
call("spu_write_channel", &exec_wrch, m_thread, m_ir->getInt32(op.ra), val.value);
}
void LNOP(spu_opcode_t op) //


@ -29,36 +29,39 @@ static const bool s_tsx_avx = utils::has_avx();
// Special case: RTM without MPX (Haswell and derivatives)
static const bool s_tsx_haswell = utils::has_rtm() && !utils::has_mpx();
#ifdef _MSC_VER
bool operator ==(const u128& lhs, const u128& rhs)
static FORCE_INLINE bool cmp_rdata(const decltype(spu_thread::rdata)& lhs, const decltype(spu_thread::rdata)& rhs)
{
return lhs.lo == rhs.lo && lhs.hi == rhs.hi;
const v128 a = (lhs[0] ^ rhs[0]) | (lhs[1] ^ rhs[1]);
const v128 b = (lhs[2] ^ rhs[2]) | (lhs[3] ^ rhs[3]);
const v128 c = (lhs[4] ^ rhs[4]) | (lhs[5] ^ rhs[5]);
const v128 d = (lhs[6] ^ rhs[6]) | (lhs[7] ^ rhs[7]);
const v128 r = (a | b) | (c | d);
return !(r._u64[0] | r._u64[1]);
}
#endif
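
cmp_rdata compares the whole 128-byte reservation block branchlessly: XOR produces all-zero lanes where the halves match, and the OR tree folds the eight 16-byte lanes into one. A scalar equivalent of the same reduction:

	#include <cstdint>

	static bool cmp_rdata_scalar(const std::uint64_t (&lhs)[16], const std::uint64_t (&rhs)[16])
	{
		std::uint64_t acc = 0;
		for (int i = 0; i < 16; i++)
			acc |= lhs[i] ^ rhs[i]; // nonzero iff any 64-bit lane differs
		return acc == 0;
	}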
static FORCE_INLINE void mov_rdata(u128* const dst, const u128* const src)
static FORCE_INLINE void mov_rdata(decltype(spu_thread::rdata)& dst, const decltype(spu_thread::rdata)& src)
{
{
const u128 data0 = src[0];
const u128 data1 = src[1];
const u128 data2 = src[2];
const v128 data0 = src[0];
const v128 data1 = src[1];
const v128 data2 = src[2];
dst[0] = data0;
dst[1] = data1;
dst[2] = data2;
}
{
const u128 data0 = src[3];
const u128 data1 = src[4];
const u128 data2 = src[5];
const v128 data0 = src[3];
const v128 data1 = src[4];
const v128 data2 = src[5];
dst[3] = data0;
dst[4] = data1;
dst[5] = data2;
}
{
const u128 data0 = src[6];
const u128 data1 = src[7];
const v128 data0 = src[6];
const v128 data1 = src[7];
dst[6] = data0;
dst[7] = data1;
}
@ -182,13 +185,15 @@ namespace spu
}
}
const auto spu_putllc_tx = build_function_asm<u64(*)(u32 raddr, u64 rtime, const void* _old, const void* _new)>([](asmjit::X86Assembler& c, auto& args)
const auto spu_putllc_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime, const void* _old, const void* _new)>([](asmjit::X86Assembler& c, auto& args)
{
using namespace asmjit;
Label fall = c.newLabel();
Label fail = c.newLabel();
Label _ret = c.newLabel();
Label skip = c.newLabel();
Label next = c.newLabel();
if (utils::has_avx() && !s_tsx_avx)
{
@ -197,8 +202,6 @@ const auto spu_putllc_tx = build_function_asm<u64(*)(u32 raddr, u64 rtime, const
// Create stack frame if necessary (Windows ABI has only 6 volatile vector registers)
c.push(x86::rbp);
c.push(x86::r15);
c.push(x86::r14);
c.push(x86::r13);
c.push(x86::r12);
c.push(x86::rbx);
@ -234,8 +237,6 @@ const auto spu_putllc_tx = build_function_asm<u64(*)(u32 raddr, u64 rtime, const
c.lea(x86::rbx, x86::qword_ptr(x86::rbx, args[0]));
c.xor_(x86::r12d, x86::r12d);
c.mov(x86::r13, args[1]);
c.mov(x86::r14, args[2]);
c.mov(x86::r15, args[3]);
// Prepare data
if (s_tsx_avx)
@ -270,10 +271,13 @@ const auto spu_putllc_tx = build_function_asm<u64(*)(u32 raddr, u64 rtime, const
}
// Begin transaction
build_transaction_enter(c, fall);
build_transaction_enter(c, fall, x86::r12, 4);
c.mov(x86::rax, x86::qword_ptr(x86::rbx));
c.and_(x86::rax, -128);
c.cmp(x86::rax, x86::r13);
c.jne(fail);
c.test(x86::qword_ptr(x86::rbx), 127);
c.jnz(skip);
if (s_tsx_avx)
{
@ -329,24 +333,34 @@ const auto spu_putllc_tx = build_function_asm<u64(*)(u32 raddr, u64 rtime, const
c.sub(x86::qword_ptr(x86::rbx), -128);
c.xend();
c.xor_(x86::eax, x86::eax);
c.mov(x86::eax, 1);
c.jmp(_ret);
// Touch memory after transaction failure
c.bind(skip);
c.xor_(x86::eax, x86::eax);
c.xor_(x86::r12d, x86::r12d);
build_transaction_abort(c, 0);
//c.jmp(fall);
c.bind(fall);
c.sar(x86::eax, 24);
c.js(fail);
c.xor_(x86::rbp, 0xf80);
c.lock().add(x86::qword_ptr(x86::rbp), 0);
c.xor_(x86::rbp, 0xf80);
c.lock().add(x86::qword_ptr(x86::rbx), 1);
c.mov(x86::r12d, 1);
c.lock().bts(x86::dword_ptr(args[2], ::offset32(&spu_thread::state) - ::offset32(&spu_thread::rdata)), static_cast<u32>(cpu_flag::wait));
// Touch memory if transaction failed without RETRY flag on the first attempt
c.cmp(x86::r12, 1);
c.jne(next);
c.xor_(x86::rbp, 0xf80);
c.lock().add(x86::dword_ptr(x86::rbp), 0);
c.xor_(x86::rbp, 0xf80);
Label fall2 = c.newLabel();
Label next2 = c.newLabel();
Label fail2 = c.newLabel();
// Lightened transaction: only compare and swap data
Label retry = build_transaction_enter(c, fall2);
c.bind(next);
build_transaction_enter(c, fall2, x86::r12, 666);
if (s_tsx_avx)
{
@ -379,7 +393,7 @@ const auto spu_putllc_tx = build_function_asm<u64(*)(u32 raddr, u64 rtime, const
c.ptest(x86::xmm0, x86::xmm0);
}
c.jnz(fail);
c.jnz(fail2);
if (s_tsx_avx)
{
@ -402,86 +416,24 @@ const auto spu_putllc_tx = build_function_asm<u64(*)(u32 raddr, u64 rtime, const
c.xend();
c.lock().add(x86::qword_ptr(x86::rbx), 127);
c.mov(x86::rax, x86::r12);
c.mov(x86::eax, 1);
c.jmp(_ret);
// Touch memory after transaction failure
c.bind(fall2);
c.lea(x86::r12, x86::qword_ptr(x86::r12, 1));
if (s_tsx_haswell || std::thread::hardware_concurrency() < 12)
{
// Call yield and restore data
c.call(imm_ptr(&std::this_thread::yield));
if (s_tsx_avx)
{
c.vmovups(x86::ymm0, x86::yword_ptr(x86::r14, 0));
c.vmovups(x86::ymm1, x86::yword_ptr(x86::r14, 32));
c.vmovups(x86::ymm2, x86::yword_ptr(x86::r14, 64));
c.vmovups(x86::ymm3, x86::yword_ptr(x86::r14, 96));
c.vmovups(x86::ymm4, x86::yword_ptr(x86::r15, 0));
c.vmovups(x86::ymm5, x86::yword_ptr(x86::r15, 32));
c.vmovups(x86::ymm6, x86::yword_ptr(x86::r15, 64));
c.vmovups(x86::ymm7, x86::yword_ptr(x86::r15, 96));
}
else
{
c.movaps(x86::xmm0, x86::oword_ptr(x86::r14, 0));
c.movaps(x86::xmm1, x86::oword_ptr(x86::r14, 16));
c.movaps(x86::xmm2, x86::oword_ptr(x86::r14, 32));
c.movaps(x86::xmm3, x86::oword_ptr(x86::r14, 48));
c.movaps(x86::xmm4, x86::oword_ptr(x86::r14, 64));
c.movaps(x86::xmm5, x86::oword_ptr(x86::r14, 80));
c.movaps(x86::xmm6, x86::oword_ptr(x86::r14, 96));
c.movaps(x86::xmm7, x86::oword_ptr(x86::r14, 112));
c.movaps(x86::xmm8, x86::oword_ptr(x86::r15, 0));
c.movaps(x86::xmm9, x86::oword_ptr(x86::r15, 16));
c.movaps(x86::xmm10, x86::oword_ptr(x86::r15, 32));
c.movaps(x86::xmm11, x86::oword_ptr(x86::r15, 48));
c.movaps(x86::xmm12, x86::oword_ptr(x86::r15, 64));
c.movaps(x86::xmm13, x86::oword_ptr(x86::r15, 80));
c.movaps(x86::xmm14, x86::oword_ptr(x86::r15, 96));
c.movaps(x86::xmm15, x86::oword_ptr(x86::r15, 112));
}
}
else
{
Label loop1 = c.newLabel();
c.mov(x86::eax, x86::r12d);
c.and_(x86::eax, 0xf);
c.shl(x86::eax, 3);
c.or_(x86::eax, 1);
c.bind(loop1);
c.pause();
c.dec(x86::eax);
c.jnz(loop1);
}
c.movzx(x86::eax, x86::r12b);
c.not_(x86::al);
c.shl(x86::eax, 4);
c.xor_(x86::rbp, x86::rax);
c.lock().add(x86::qword_ptr(x86::rbp), 0);
c.xor_(x86::rbp, x86::rax);
c.mov(x86::rax, x86::qword_ptr(x86::rbx));
c.and_(x86::rax, -128);
c.cmp(x86::rax, x86::r13);
c.jne(fail);
c.cmp(x86::r12, 16);
c.jb(retry);
c.mov(x86::rax, imm_ptr(&g_cfg.core.spu_accurate_putllc.get()));
c.test(x86::byte_ptr(x86::rax), 1);
c.jnz(retry);
c.sar(x86::eax, 24);
c.js(fail2);
c.mov(x86::eax, 2);
c.jmp(_ret);
c.bind(fail);
build_transaction_abort(c, 0xff);
c.test(x86::r12, x86::r12);
c.jz(next2);
c.xor_(x86::eax, x86::eax);
c.jmp(_ret);
c.bind(fail2);
build_transaction_abort(c, 0xff);
c.lock().sub(x86::qword_ptr(x86::rbx), 1);
c.bind(next2);
c.mov(x86::rax, x86::r12);
c.not_(x86::rax);
c.xor_(x86::eax, x86::eax);
//c.jmp(_ret);
c.bind(_ret);
@ -516,13 +468,11 @@ const auto spu_putllc_tx = build_function_asm<u64(*)(u32 raddr, u64 rtime, const
c.pop(x86::rbx);
c.pop(x86::r12);
c.pop(x86::r13);
c.pop(x86::r14);
c.pop(x86::r15);
c.pop(x86::rbp);
c.ret();
});
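
Note the changed contract: spu_putllc_tx now returns a tri-state code instead of the old attempt count. The names below are illustrative only; the caller in process_mfc_cmd tests the raw values:

	// 0 - comparison failed / reservation lost (PUTLLC fails)
	// 1 - data stored transactionally (PUTLLC succeeds)
	// 2 - transactions kept aborting; caller falls back to cpu_thread::suspend_all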
const auto spu_getll_tx = build_function_asm<u64(*)(u32 raddr, void* rdata, u64* rtime)>([](asmjit::X86Assembler& c, auto& args)
const auto spu_getll_tx = build_function_asm<u64(*)(u32 raddr, void* rdata)>([](asmjit::X86Assembler& c, auto& args)
{
using namespace asmjit;
@ -558,10 +508,9 @@ const auto spu_getll_tx = build_function_asm<u64(*)(u32 raddr, void* rdata, u64*
c.lea(x86::rbx, x86::qword_ptr(x86::rbx, args[0]));
c.xor_(x86::r12d, x86::r12d);
c.mov(x86::r13, args[1]);
c.mov(x86::qword_ptr(x86::rsp, 64), args[2]);
// Begin transaction
Label begin = build_transaction_enter(c, fall);
build_transaction_enter(c, fall, x86::r12, 16);
c.mov(x86::rax, x86::qword_ptr(x86::rbx));
if (s_tsx_avx)
@ -605,32 +554,12 @@ const auto spu_getll_tx = build_function_asm<u64(*)(u32 raddr, void* rdata, u64*
}
c.and_(x86::rax, -128);
c.mov(args[2], x86::qword_ptr(x86::rsp, 64));
c.mov(x86::qword_ptr(args[2]), x86::rax);
c.mov(x86::rax, x86::r12);
c.jmp(_ret);
// Touch memory after transaction failure
c.bind(fall);
c.lea(x86::r12, x86::qword_ptr(x86::r12, 1));
c.mov(x86::eax, 1);
//c.jmp(_ret);
if (s_tsx_haswell || std::thread::hardware_concurrency() < 12)
{
c.call(imm_ptr(&std::this_thread::yield));
}
else
{
c.mov(args[0], 500);
c.call(imm_ptr(&::busy_wait));
}
c.xor_(x86::rbp, 0xf80);
c.xor_(x86::rbx, 0xf80);
c.mov(x86::rax, x86::qword_ptr(x86::rbp));
c.mov(x86::rax, x86::qword_ptr(x86::rbx));
c.xor_(x86::rbp, 0xf80);
c.xor_(x86::rbx, 0xf80);
c.jmp(begin);
c.bind(_ret);
#ifdef _WIN32
@ -654,7 +583,7 @@ const auto spu_getll_tx = build_function_asm<u64(*)(u32 raddr, void* rdata, u64*
c.ret();
});
const auto spu_getll_fast = build_function_asm<u64(*)(u32 raddr, void* rdata, u64* rtime)>([](asmjit::X86Assembler& c, auto& args)
const auto spu_getll_inexact = build_function_asm<u64(*)(u32 raddr, void* rdata)>([](asmjit::X86Assembler& c, auto& args)
{
using namespace asmjit;
@ -691,7 +620,6 @@ const auto spu_getll_fast = build_function_asm<u64(*)(u32 raddr, void* rdata, u6
c.lea(x86::rbx, x86::qword_ptr(x86::rbx, args[0]));
c.xor_(x86::r12d, x86::r12d);
c.mov(x86::r13, args[1]);
c.mov(x86::qword_ptr(x86::rsp, 64), args[2]);
// Begin copying
Label begin = c.newLabel();
@ -719,14 +647,15 @@ const auto spu_getll_fast = build_function_asm<u64(*)(u32 raddr, void* rdata, u6
}
// Verify and retry if necessary.
c.cmp(x86::rax, x86::qword_ptr(x86::rbx));
c.je(test0);
c.pause();
c.mov(args[0], x86::rax);
c.xor_(args[0], x86::qword_ptr(x86::rbx));
c.test(args[0], -128);
c.jz(test0);
c.lea(x86::r12, x86::qword_ptr(x86::r12, 1));
c.jmp(begin);
c.bind(test0);
c.test(x86::eax, 0x7f);
c.test(x86::eax, 127);
c.jz(_ret);
c.and_(x86::rax, -128);
@ -774,8 +703,6 @@ const auto spu_getll_fast = build_function_asm<u64(*)(u32 raddr, void* rdata, u6
c.jz(_ret);
c.lea(x86::r12, x86::qword_ptr(x86::r12, 2));
c.mov(args[0], 500);
c.call(imm_ptr(&::busy_wait));
c.jmp(begin);
c.bind(_ret);
@ -799,10 +726,6 @@ const auto spu_getll_fast = build_function_asm<u64(*)(u32 raddr, void* rdata, u6
c.movaps(x86::oword_ptr(x86::r13, 112), x86::xmm7);
}
c.mov(args[2], x86::qword_ptr(x86::rsp, 64));
c.mov(x86::qword_ptr(args[2]), x86::rax);
c.mov(x86::rax, x86::r12);
#ifdef _WIN32
if (!s_tsx_avx)
{
@ -826,12 +749,14 @@ const auto spu_getll_fast = build_function_asm<u64(*)(u32 raddr, void* rdata, u6
c.ret();
});
const auto spu_putlluc_tx = build_function_asm<u64(*)(u32 raddr, const void* rdata)>([](asmjit::X86Assembler& c, auto& args)
const auto spu_putlluc_tx = build_function_asm<u32(*)(u32 raddr, const void* rdata, spu_thread* _spu)>([](asmjit::X86Assembler& c, auto& args)
{
using namespace asmjit;
Label fall = c.newLabel();
Label _ret = c.newLabel();
Label skip = c.newLabel();
Label next = c.newLabel();
if (utils::has_avx() && !s_tsx_avx)
{
@ -884,7 +809,9 @@ const auto spu_putlluc_tx = build_function_asm<u64(*)(u32 raddr, const void* rda
}
// Begin transaction
build_transaction_enter(c, fall);
build_transaction_enter(c, fall, x86::r12, 8);
c.test(x86::dword_ptr(x86::rbx), 127);
c.jnz(skip);
if (s_tsx_avx)
{
@ -907,21 +834,31 @@ const auto spu_putlluc_tx = build_function_asm<u64(*)(u32 raddr, const void* rda
c.sub(x86::qword_ptr(x86::rbx), -128);
c.xend();
c.xor_(x86::eax, x86::eax);
c.mov(x86::eax, 1);
c.jmp(_ret);
// Touch memory after transaction failure
c.bind(skip);
c.xor_(x86::eax, x86::eax);
c.xor_(x86::r12d, x86::r12d);
build_transaction_abort(c, 0);
//c.jmp(fall);
c.bind(fall);
c.xor_(x86::rbp, 0xf80);
c.lock().add(x86::qword_ptr(x86::rbp), 0);
c.xor_(x86::rbp, 0xf80);
c.lock().add(x86::qword_ptr(x86::rbx), 1);
c.mov(x86::r12d, 1);
c.lock().bts(x86::dword_ptr(args[2], ::offset32(&spu_thread::state)), static_cast<u32>(cpu_flag::wait));
// Touch memory if transaction failed without RETRY flag on the first attempt
c.cmp(x86::r12, 1);
c.jne(next);
c.xor_(x86::rbp, 0xf80);
c.lock().add(x86::dword_ptr(x86::rbp), 0);
c.xor_(x86::rbp, 0xf80);
Label fall2 = c.newLabel();
// Lightened transaction
Label retry = build_transaction_enter(c, fall2);
c.bind(next);
build_transaction_enter(c, fall2, x86::r12, 666);
if (s_tsx_avx)
{
@ -944,57 +881,12 @@ const auto spu_putlluc_tx = build_function_asm<u64(*)(u32 raddr, const void* rda
c.xend();
c.lock().add(x86::qword_ptr(x86::rbx), 127);
c.mov(x86::rax, x86::r12);
c.mov(x86::eax, 1);
c.jmp(_ret);
// Touch memory after transaction failure
c.bind(fall2);
c.lea(x86::r12, x86::qword_ptr(x86::r12, 1));
if (s_tsx_haswell || std::thread::hardware_concurrency() < 12)
{
// Call yield and restore data
c.call(imm_ptr(&std::this_thread::yield));
if (s_tsx_avx)
{
c.vmovups(x86::ymm0, x86::yword_ptr(x86::r13, 0));
c.vmovups(x86::ymm1, x86::yword_ptr(x86::r13, 32));
c.vmovups(x86::ymm2, x86::yword_ptr(x86::r13, 64));
c.vmovups(x86::ymm3, x86::yword_ptr(x86::r13, 96));
}
else
{
c.movaps(x86::xmm0, x86::oword_ptr(x86::r13, 0));
c.movaps(x86::xmm1, x86::oword_ptr(x86::r13, 16));
c.movaps(x86::xmm2, x86::oword_ptr(x86::r13, 32));
c.movaps(x86::xmm3, x86::oword_ptr(x86::r13, 48));
c.movaps(x86::xmm4, x86::oword_ptr(x86::r13, 64));
c.movaps(x86::xmm5, x86::oword_ptr(x86::r13, 80));
c.movaps(x86::xmm6, x86::oword_ptr(x86::r13, 96));
c.movaps(x86::xmm7, x86::oword_ptr(x86::r13, 112));
}
}
else
{
Label loop1 = c.newLabel();
c.mov(x86::eax, x86::r12d);
c.and_(x86::eax, 0xf);
c.shl(x86::eax, 3);
c.or_(x86::eax, 1);
c.bind(loop1);
c.pause();
c.dec(x86::eax);
c.jnz(loop1);
}
c.movzx(x86::eax, x86::r12b);
c.not_(x86::al);
c.shl(x86::eax, 4);
c.xor_(x86::rbp, x86::rax);
c.lock().add(x86::qword_ptr(x86::rbp), 0);
c.xor_(x86::rbp, x86::rax);
c.jmp(retry);
c.mov(x86::eax, 2);
//c.jmp(_ret);
c.bind(_ret);
@ -1486,7 +1378,7 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args)
while (size)
{
*reinterpret_cast<u128*>(dst) = *reinterpret_cast<const u128*>(src);
*reinterpret_cast<v128*>(dst) = *reinterpret_cast<const v128*>(src);
dst += 16;
src += 16;
@ -1501,7 +1393,7 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args)
while (size >= 128)
{
mov_rdata(reinterpret_cast<u128*>(dst), reinterpret_cast<const u128*>(src));
mov_rdata(*reinterpret_cast<decltype(spu_thread::rdata)*>(dst), *reinterpret_cast<const decltype(spu_thread::rdata)*>(src));
dst += 128;
src += 128;
@ -1510,7 +1402,7 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args)
while (size)
{
*reinterpret_cast<u128*>(dst) = *reinterpret_cast<const u128*>(src);
*reinterpret_cast<v128*>(dst) = *reinterpret_cast<const v128*>(src);
dst += 16;
src += 16;
@ -1556,7 +1448,7 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args)
{
while (size >= 128)
{
mov_rdata(reinterpret_cast<u128*>(dst), reinterpret_cast<const u128*>(src));
mov_rdata(*reinterpret_cast<decltype(spu_thread::rdata)*>(dst), *reinterpret_cast<const decltype(spu_thread::rdata)*>(src));
dst += 128;
src += 128;
@ -1565,7 +1457,7 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args)
while (size)
{
*reinterpret_cast<u128*>(dst) = *reinterpret_cast<const u128*>(src);
*reinterpret_cast<v128*>(dst) = *reinterpret_cast<const v128*>(src);
dst += 16;
src += 16;
@ -1690,7 +1582,7 @@ void spu_thread::do_putlluc(const spu_mfc_cmd& args)
if (raddr && addr == raddr)
{
// Last check for event before we clear the reservation
if ((vm::reservation_acquire(addr, 128) & -128) != rtime || rdata != vm::_ref<decltype(rdata)>(addr))
if ((vm::reservation_acquire(addr, 128) & -128) != rtime || !cmp_rdata(rdata, vm::_ref<decltype(rdata)>(addr)))
{
ch_event_stat |= SPU_EVENT_LR;
}
@ -1703,11 +1595,31 @@ void spu_thread::do_putlluc(const spu_mfc_cmd& args)
// Store unconditionally
if (LIKELY(g_use_rtm))
{
const u64 count = spu_putlluc_tx(addr, to_write.data());
const u32 result = spu_putlluc_tx(addr, to_write.data(), this);
if (count >= 10)
if (result == 2)
{
LOG_ERROR(SPU, "%s took too long: %u", args.cmd, count);
cpu_thread::suspend_all cpu_lock(this);
// Try to claim the +64 bit (bit index 6)
if (!atomic_storage<u64>::bts(vm::reservation_acquire(addr, 128).raw(), 6))
{
auto& data = vm::_ref<decltype(rdata)>(addr);
mov_rdata(data, to_write);
// Keep re-checking the written data in case a rogue transaction sneaks in
while (std::atomic_thread_fence(std::memory_order_seq_cst), !cmp_rdata(data, to_write))
{
mov_rdata(data, to_write);
}
vm::reservation_acquire(addr, 128) += 63;
}
else
{
// Give up if another PUTLLUC command took precedence
vm::reservation_acquire(addr, 128) -= 1;
}
}
}
else
@ -1722,12 +1634,12 @@ void spu_thread::do_putlluc(const spu_mfc_cmd& args)
// Full lock (heavyweight)
// TODO: vm::check_addr
vm::writer_lock lock(addr);
mov_rdata(data.data(), to_write.data());
mov_rdata(data, to_write);
res.release(res.load() + 127);
}
else
{
mov_rdata(data.data(), to_write.data());
mov_rdata(data, to_write);
res.release(res.load() + 127);
}
}
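
The accounting in the fallback keeps the reservation word consistent: it is a multiple of 128 when idle, and the low 7 bits mark writers in progress (hence the & -128 masks and test ..., 127 checks throughout). A sketch of the arithmetic, assuming the failed transaction path already applied the initial +1; test_and_set_bit stands in for atomic_storage<u64>::bts:

	// R: reservation word, idle when R % 128 == 0
	R += 1;                     // asm fallback: announce a writer after the aborts
	if (test_and_set_bit(R, 6)) // +64 already claimed: another writer won the race
		R -= 1;                 // withdraw our marker and give up
	else
		R += 63;                // 1 + 64 + 63 == 128: one full generation, low bits clear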
@ -1847,6 +1759,8 @@ bool spu_thread::process_mfc_cmd()
// Stall infinitely if MFC queue is full
while (UNLIKELY(mfc_size >= 16))
{
state += cpu_flag::wait;
if (is_stopped())
{
return false;
@ -1873,8 +1787,10 @@ bool spu_thread::process_mfc_cmd()
{
rtime = vm::reservation_acquire(addr, 128) & -128;
while (rdata == data && (vm::reservation_acquire(addr, 128)) == rtime)
while (cmp_rdata(rdata, data) && (vm::reservation_acquire(addr, 128)) == rtime)
{
state += cpu_flag::wait;
if (is_stopped())
{
break;
@ -1882,15 +1798,40 @@ bool spu_thread::process_mfc_cmd()
thread_ctrl::wait_for(100);
}
if (test_stopped())
{
return false;
}
}
if (LIKELY(g_use_rtm))
if (LIKELY(g_use_rtm && !g_cfg.core.spu_accurate_getllar && raddr != addr))
{
const u64 count = g_cfg.core.spu_accurate_getllar ? spu_getll_tx(addr, dst.data(), &ntime) : spu_getll_fast(addr, dst.data(), &ntime);
// TODO: maybe always start from a transaction
ntime = spu_getll_inexact(addr, dst.data());
}
else if (g_use_rtm)
{
ntime = spu_getll_tx(addr, dst.data());
if (count >= 10)
if (ntime == 1)
{
LOG_ERROR(SPU, "%s took too long: %u", ch_mfc_cmd.cmd, count);
if (!g_cfg.core.spu_accurate_getllar)
{
ntime = spu_getll_inexact(addr, dst.data());
}
else
{
cpu_thread::suspend_all cpu_lock(this);
while (vm::reservation_acquire(addr, 128) & 127)
{
busy_wait(100);
}
ntime = vm::reservation_acquire(addr, 128);
mov_rdata(dst, data);
}
}
}
else
@ -1907,37 +1848,37 @@ bool spu_thread::process_mfc_cmd()
vm::writer_lock lock(addr);
ntime = old_time;
mov_rdata(dst.data(), data.data());
mov_rdata(dst, data);
res.release(old_time);
}
else
{
ntime = old_time;
mov_rdata(dst.data(), data.data());
mov_rdata(dst, data);
res.release(old_time);
}
}
if (const u32 _addr = raddr)
if (raddr && raddr != addr)
{
// Last check for event before we replace the reservation with a new one
if ((vm::reservation_acquire(_addr, 128) & -128) != rtime || rdata != vm::_ref<decltype(rdata)>(_addr))
if ((vm::reservation_acquire(raddr, 128) & -128) != rtime || !cmp_rdata(rdata, vm::_ref<decltype(rdata)>(raddr)))
{
ch_event_stat |= SPU_EVENT_LR;
}
}
else if (raddr == addr)
{
// Lost previous reservation on polling
if (ntime != rtime || !cmp_rdata(rdata, dst))
{
ch_event_stat |= SPU_EVENT_LR;
if (_addr == addr)
{
// Lost current reservation
raddr = 0;
ch_atomic_stat.set_value(MFC_GETLLAR_SUCCESS);
return true;
}
}
}
raddr = addr;
rtime = ntime;
mov_rdata(rdata.data(), dst.data());
mov_rdata(rdata, dst);
ch_atomic_stat.set_value(MFC_GETLLAR_SUCCESS);
return true;
@ -1949,29 +1890,39 @@ bool spu_thread::process_mfc_cmd()
const u32 addr = ch_mfc_cmd.eal & -128u;
u32 result = 0;
if (raddr == addr && rtime == (vm::reservation_acquire(raddr, 128) & -128))
if (raddr == addr)
{
const auto& to_write = _ref<decltype(rdata)>(ch_mfc_cmd.lsa & 0x3ff80);
if (LIKELY(g_use_rtm))
{
u64 count = spu_putllc_tx(addr, rtime, rdata.data(), to_write.data());
result = spu_putllc_tx(addr, rtime, rdata.data(), to_write.data());
if ((count >> 63) == 0)
if (result == 2)
{
result = 1;
}
else
{
count = ~count;
}
result = 0;
if (count >= 10)
{
LOG_ERROR(SPU, "%s took too long: %u (r=%u)", ch_mfc_cmd.cmd, count, result);
cpu_thread::suspend_all cpu_lock(this);
// Give up if other PUTLLC/PUTLLUC commands are in progress
if (!vm::reservation_acquire(addr, 128).try_dec(rtime + 1))
{
auto& data = vm::_ref<decltype(rdata)>(addr);
if ((vm::reservation_acquire(addr, 128) & -128) == rtime && cmp_rdata(rdata, data))
{
mov_rdata(data, to_write);
vm::reservation_acquire(addr, 128) += 127;
result = 1;
}
else
{
vm::reservation_acquire(addr, 128) -= 1;
}
}
}
}
else if (auto& data = vm::_ref<decltype(rdata)>(addr); rdata == data)
else if (auto& data = vm::_ref<decltype(rdata)>(addr); rtime == (vm::reservation_acquire(raddr, 128) & -128) && cmp_rdata(rdata, data))
{
auto& res = vm::reservation_lock(raddr, 128);
const u64 old_time = res.load() & -128;
@ -1984,9 +1935,9 @@ bool spu_thread::process_mfc_cmd()
// TODO: vm::check_addr
vm::writer_lock lock(addr);
if (rdata == data)
if (cmp_rdata(rdata, data))
{
mov_rdata(data.data(), to_write.data());
mov_rdata(data, to_write);
res.release(old_time + 128);
result = 1;
}
@ -2012,7 +1963,7 @@ bool spu_thread::process_mfc_cmd()
if (raddr)
{
// Last check for event before we clear the reservation
if (raddr == addr || rtime != (vm::reservation_acquire(raddr, 128) & -128) || rdata != vm::_ref<decltype(rdata)>(raddr))
if (raddr == addr || rtime != (vm::reservation_acquire(raddr, 128) & -128) || !cmp_rdata(rdata, vm::_ref<decltype(rdata)>(raddr)))
{
ch_event_stat |= SPU_EVENT_LR;
}
@ -2164,7 +2115,7 @@ u32 spu_thread::get_events(bool waiting)
}
// Check reservation status and set SPU_EVENT_LR if lost
if (raddr && ((vm::reservation_acquire(raddr, sizeof(rdata)) & -128) != rtime || rdata != vm::_ref<decltype(rdata)>(raddr)))
if (raddr && ((vm::reservation_acquire(raddr, sizeof(rdata)) & -128) != rtime || !cmp_rdata(rdata, vm::_ref<decltype(rdata)>(raddr))))
{
ch_event_stat |= SPU_EVENT_LR;
raddr = 0;
@ -2256,6 +2207,11 @@ s64 spu_thread::get_ch_value(u32 ch)
auto read_channel = [&](spu_channel& channel) -> s64
{
if (channel.get_count() == 0)
{
state += cpu_flag::wait;
}
for (int i = 0; i < 10 && channel.get_count() == 0; i++)
{
busy_wait();
@ -2273,6 +2229,7 @@ s64 spu_thread::get_ch_value(u32 ch)
thread_ctrl::wait();
}
check_state();
return out;
};
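
This is the pattern the commit applies throughout SPUThread.cpp: before any potentially long wait, the thread raises cpu_flag::wait on itself so cpu_thread::suspend_all can count it as already stopped, and it calls check_state() on the way out to drop the flag and honor anything that arrived meanwhile. Schematically:

	state += cpu_flag::wait;  // advertise: not executing guest code
	while (!ready())
	{
		if (is_stopped())
			return; // aborted
		thread_ctrl::wait();
	}
	check_state();            // clears cpu_flag::wait, services pause/suspend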
@ -2284,6 +2241,11 @@ s64 spu_thread::get_ch_value(u32 ch)
}
case SPU_RdInMbox:
{
if (ch_in_mbox.get_count() == 0)
{
state += cpu_flag::wait;
}
while (true)
{
for (int i = 0; i < 10 && ch_in_mbox.get_count() == 0; i++)
@ -2300,6 +2262,7 @@ s64 spu_thread::get_ch_value(u32 ch)
int_ctrl[2].set(SPU_INT2_STAT_SPU_MAILBOX_THRESHOLD_INT);
}
check_state();
return out;
}
@ -2410,6 +2373,8 @@ s64 spu_thread::get_ch_value(u32 ch)
while (res = get_events(), !res)
{
state += cpu_flag::wait;
if (is_stopped())
{
return -1;
@ -2418,11 +2383,14 @@ s64 spu_thread::get_ch_value(u32 ch)
pseudo_lock.wait(100);
}
check_state();
return res;
}
while (res = get_events(true), !res)
{
state += cpu_flag::wait;
if (is_stopped())
{
return -1;
@ -2431,6 +2399,7 @@ s64 spu_thread::get_ch_value(u32 ch)
thread_ctrl::wait_for(100);
}
check_state();
return res;
}
@ -2463,6 +2432,8 @@ bool spu_thread::set_ch_value(u32 ch, u32 value)
{
while (!ch_out_intr_mbox.try_push(value))
{
state += cpu_flag::wait;
if (is_stopped())
{
return false;
@ -2472,9 +2443,12 @@ bool spu_thread::set_ch_value(u32 ch, u32 value)
}
int_ctrl[2].set(SPU_INT2_STAT_MAILBOX_INT);
check_state();
return true;
}
state += cpu_flag::wait;
const u32 code = value >> 24;
{
if (code < 64)
@ -2609,6 +2583,8 @@ bool spu_thread::set_ch_value(u32 ch, u32 value)
{
while (!ch_out_mbox.try_push(value))
{
state += cpu_flag::wait;
if (is_stopped())
{
return false;
@ -2617,6 +2593,7 @@ bool spu_thread::set_ch_value(u32 ch, u32 value)
thread_ctrl::wait();
}
check_state();
return true;
}
@ -2770,6 +2747,7 @@ bool spu_thread::stop_and_signal(u32 code)
if (offset >= RAW_SPU_BASE_ADDR)
{
state += cpu_flag::wait;
status.atomic_op([code](u32& status)
{
status = (status & 0xffff) | (code << 16);
@ -2779,6 +2757,7 @@ bool spu_thread::stop_and_signal(u32 code)
int_ctrl[2].set(SPU_INT2_STAT_SPU_STOP_AND_SIGNAL_INT);
state += cpu_flag::stop;
check_state();
return true;
}
@ -2808,6 +2787,8 @@ bool spu_thread::stop_and_signal(u32 code)
// HACK: wait for executable code
while (!_ref<u32>(pc))
{
state += cpu_flag::wait;
if (is_stopped())
{
return false;
@ -2816,12 +2797,15 @@ bool spu_thread::stop_and_signal(u32 code)
thread_ctrl::wait_for(1000);
}
check_state();
return false;
}
case 0x001:
{
state += cpu_flag::wait;
thread_ctrl::wait_for(1000); // hack
check_state();
return true;
}
@ -2857,6 +2841,8 @@ bool spu_thread::stop_and_signal(u32 code)
std::shared_ptr<lv2_event_queue> queue;
state += cpu_flag::wait;
while (true)
{
queue.reset();
@ -2897,6 +2883,7 @@ bool spu_thread::stop_and_signal(u32 code)
if (!queue)
{
check_state();
return ch_in_mbox.set_values(1, CELL_EINVAL), true; // TODO: check error value
}
@ -2927,6 +2914,7 @@ bool spu_thread::stop_and_signal(u32 code)
const auto data3 = static_cast<u32>(std::get<3>(event));
ch_in_mbox.set_values(4, CELL_OK, data1, data2, data3);
queue->events.pop_front();
check_state();
return true;
}
}
@ -2972,6 +2960,7 @@ bool spu_thread::stop_and_signal(u32 code)
}
}
check_state();
return true;
}
@ -3045,6 +3034,8 @@ bool spu_thread::stop_and_signal(u32 code)
{
/* ===== sys_spu_thread_group_exit ===== */
state += cpu_flag::wait;
u32 value = 0;
if (!ch_out_mbox.try_pop(value))
@ -3069,6 +3060,7 @@ bool spu_thread::stop_and_signal(u32 code)
group->join_state = SYS_SPU_THREAD_GROUP_JOIN_GROUP_EXIT;
state += cpu_flag::stop;
check_state();
return true;
}
@ -3076,6 +3068,8 @@ bool spu_thread::stop_and_signal(u32 code)
{
/* ===== sys_spu_thread_exit ===== */
state += cpu_flag::wait;
if (!ch_out_mbox.get_count())
{
fmt::throw_exception("sys_spu_thread_exit(): Out_MBox is empty" HERE);
@ -3084,6 +3078,7 @@ bool spu_thread::stop_and_signal(u32 code)
LOG_TRACE(SPU, "sys_spu_thread_exit(status=0x%x)", ch_out_mbox.get_value());
status |= SPU_STATUS_STOPPED_BY_STOP;
state += cpu_flag::stop;
check_state();
return true;
}
}


@ -529,7 +529,7 @@ public:
// Reservation Data
u64 rtime = 0;
std::array<u128, 8> rdata{};
std::array<v128, 8> rdata{};
u32 raddr = 0;
u32 srr0;


@ -357,6 +357,11 @@ s32 sys_net_bnet_accept(ppu_thread& ppu, s32 s, vm::ptr<sys_net_sockaddr> addr,
}
}
if (ppu.is_stopped())
{
return 0;
}
auto newsock = std::make_shared<lv2_socket>(native_socket);
result = idm::import_existing<lv2_socket>(newsock);
@ -975,6 +980,11 @@ s32 sys_net_bnet_recvfrom(ppu_thread& ppu, s32 s, vm::ptr<void> buf, u32 len, s3
}
}
if (ppu.is_stopped())
{
return 0;
}
// TODO
if (addr)
{
@ -1796,6 +1806,11 @@ s32 sys_net_bnet_select(ppu_thread& ppu, s32 nfds, vm::ptr<sys_net_fd_set> readf
}
}
if (ppu.is_stopped())
{
return 0;
}
if (readfds)
*readfds = rread;
if (writefds)


@ -172,6 +172,8 @@ namespace vm
void temporary_unlock(cpu_thread& cpu) noexcept
{
cpu.state += cpu_flag::wait;
if (g_tls_locked && g_tls_locked->compare_and_swap_test(&cpu, nullptr))
{
cpu.cpu_unmem();


@ -936,11 +936,18 @@ void Emulator::Load(const std::string& title_id, bool add_only, bool force_globa
// Set RTM usage
g_use_rtm = utils::has_rtm() && ((utils::has_mpx() && g_cfg.core.enable_TSX == tsx_usage::enabled) || g_cfg.core.enable_TSX == tsx_usage::forced);
if (g_use_rtm && !utils::has_mpx())
{
LOG_WARNING(GENERAL, "TSX forced by User");
}
if (g_use_rtm && g_cfg.core.preferred_spu_threads)
{
g_cfg.core.preferred_spu_threads.set(0);
LOG_ERROR(GENERAL, "Preferred SPU Threads forcefully disabled - not compatible with TSX in this version.");
}
// Load patches from different locations
fxm::check_unlocked<patch_engine>()->append(fs::get_config_dir() + "data/" + m_title_id + "/patch.yml");


@ -385,7 +385,6 @@ struct cfg_root : cfg::node
cfg::_enum<spu_block_size_type> spu_block_size{this, "SPU Block Size", spu_block_size_type::safe};
cfg::_bool spu_accurate_getllar{this, "Accurate GETLLAR", false};
cfg::_bool spu_accurate_putlluc{this, "Accurate PUTLLUC", false};
cfg::_bool spu_accurate_putllc{this, "Accurate PUTLLC", false};
cfg::_bool spu_verification{this, "SPU Verification", true}; // Should be enabled
cfg::_bool spu_cache{this, "SPU Cache", true};
cfg::_enum<tsx_usage> enable_TSX{this, "Enable TSX", tsx_usage::enabled}; // Enable TSX. Forcing this on Haswell/Broadwell CPUs should be used carefully