From 367f039523fb696ae2adda60b0eb0ac29eca75cb Mon Sep 17 00:00:00 2001 From: Nekotekina Date: Mon, 14 May 2018 23:07:36 +0300 Subject: [PATCH] Build transactions at runtime Drop _xbegin family intrinsics due to bad codegen Implemented `notifier` class, replacing vm::notify Minor optimization: detach transactions from global mutex on TSX path Minor optimization: don't acquire vm::passive_lock on PPU on TSX path --- Utilities/JIT.cpp | 6 +- Utilities/JIT.h | 4 +- Utilities/cond.cpp | 8 +- Utilities/cond.h | 93 ++++++++ Utilities/sysinfo.h | 23 -- rpcs3/CMakeLists.txt | 2 +- rpcs3/Emu/Cell/MFC.cpp | 2 - rpcs3/Emu/Cell/PPUThread.cpp | 185 ++++++++++++--- rpcs3/Emu/Cell/SPUThread.cpp | 421 +++++++++++++++++++--------------- rpcs3/Emu/Memory/vm.cpp | 69 +----- rpcs3/Emu/Memory/vm.h | 32 +-- rpcs3/Emu/RSX/rsx_methods.cpp | 18 +- rpcs3/Emu/System.cpp | 3 + rpcs3/Emu/System.h | 2 + 14 files changed, 529 insertions(+), 339 deletions(-) diff --git a/Utilities/JIT.cpp b/Utilities/JIT.cpp index 83a5df8953..da5716bd28 100644 --- a/Utilities/JIT.cpp +++ b/Utilities/JIT.cpp @@ -7,14 +7,14 @@ asmjit::JitRuntime& asmjit::get_global_runtime() return g_rt; } -void asmjit::build_transaction_enter(asmjit::X86Assembler& c, asmjit::Label abort) +void asmjit::build_transaction_enter(asmjit::X86Assembler& c, asmjit::Label fallback) { Label fall = c.newLabel(); Label begin = c.newLabel(); c.jmp(begin); c.bind(fall); c.test(x86::eax, _XABORT_RETRY); - c.jz(abort); + c.jz(fallback); c.align(kAlignCode, 16); c.bind(begin); c.xbegin(fall); @@ -25,8 +25,6 @@ void asmjit::build_transaction_abort(asmjit::X86Assembler& c, unsigned char code c.db(0xc6); c.db(0xf8); c.db(code); - c.xor_(x86::eax, x86::eax); - c.ret(); } #ifdef LLVM_AVAILABLE diff --git a/Utilities/JIT.h b/Utilities/JIT.h index 39797d83b3..367c52678a 100644 --- a/Utilities/JIT.h +++ b/Utilities/JIT.h @@ -12,9 +12,9 @@ namespace asmjit JitRuntime& get_global_runtime(); // Emit xbegin and adjacent loop - void build_transaction_enter(X86Assembler& c, Label abort); + void build_transaction_enter(X86Assembler& c, Label fallback); - // Emit xabort and return zero + // Emit xabort void build_transaction_abort(X86Assembler& c, unsigned char code); } diff --git a/Utilities/cond.cpp b/Utilities/cond.cpp index e3ecf847bd..b9cddf5af1 100644 --- a/Utilities/cond.cpp +++ b/Utilities/cond.cpp @@ -16,7 +16,7 @@ bool cond_variable::imp_wait(u32 _old, u64 _timeout) noexcept LARGE_INTEGER timeout; timeout.QuadPart = _timeout * -10; - if (HRESULT rc = NtWaitForKeyedEvent(nullptr, &m_value, false, is_inf ? nullptr : &timeout)) + if (HRESULT rc = _timeout ? NtWaitForKeyedEvent(nullptr, &m_value, false, is_inf ? nullptr : &timeout) : WAIT_TIMEOUT) { verify(HERE), rc == WAIT_TIMEOUT; @@ -32,6 +32,12 @@ bool cond_variable::imp_wait(u32 _old, u64 _timeout) noexcept return true; #else + if (!_timeout) + { + verify(HERE), m_value--; + return false; + } + timespec timeout; timeout.tv_sec = _timeout / 1000000; timeout.tv_nsec = (_timeout % 1000000) * 1000; diff --git a/Utilities/cond.h b/Utilities/cond.h index 74dad2d741..02c7908915 100644 --- a/Utilities/cond.h +++ b/Utilities/cond.h @@ -9,6 +9,8 @@ class cond_variable // Internal waiter counter atomic_t m_value{0}; + friend class notifier; + protected: // Internal waiting function bool imp_wait(u32 _old, u64 _timeout) noexcept; @@ -50,3 +52,94 @@ public: static constexpr u64 max_timeout = u64{UINT32_MAX} / 1000 * 1000000; }; + +// Pair of a fake shared mutex (only limited shared locking) and a condition variable +class notifier +{ + atomic_t m_counter{0}; + cond_variable m_cond; + +public: + constexpr notifier() = default; + + void lock_shared() + { + m_counter++; + } + + void unlock_shared() + { + const u32 counter = --m_counter; + + if (counter & 0x7f) + { + return; + } + + if (counter >= 0x80) + { + const u32 _old = m_counter.atomic_op([](u32& value) -> u32 + { + if (value & 0x7f) + { + return 0; + } + + return std::exchange(value, 0) >> 7; + }); + + if (_old && m_cond.m_value) + { + m_cond.imp_wake(_old); + } + } + } + + explicit_bool_t wait(u64 usec_timeout = -1) + { + const u32 _old = m_cond.m_value.fetch_add(1); + + if (0x80 <= m_counter.fetch_op([](u32& value) + { + value--; + + if (value >= 0x80) + { + value -= 0x80; + } + })) + { + // Return without waiting + m_cond.imp_wait(_old, 0); + m_counter++; + return true; + } + + const bool res = m_cond.imp_wait(_old, usec_timeout); + m_counter++; + return res; + } + + void notify_all() + { + if (m_counter) + { + m_counter.atomic_op([](u32& value) + { + if (const u32 add = value & 0x7f) + { + // Mutex is locked in shared mode + value += add << 7; + } + else + { + // Mutex is unlocked + value = 0; + } + }); + } + + // Notify after imaginary "exclusive" lock+unlock + m_cond.notify_all(); + } +}; diff --git a/Utilities/sysinfo.h b/Utilities/sysinfo.h index 1fbdd9e5f9..dce29c77ba 100644 --- a/Utilities/sysinfo.h +++ b/Utilities/sysinfo.h @@ -41,28 +41,5 @@ namespace utils bool has_xop(); - FORCE_INLINE bool transaction_enter(uint* out = nullptr) - { - while (true) - { - const uint status = _xbegin(); - - if (status == _XBEGIN_STARTED) - { - return true; - } - - if (!(status & _XABORT_RETRY)) - { - if (out) - { - *out = status; - } - - return false; - } - } - } - std::string get_system_info(); } diff --git a/rpcs3/CMakeLists.txt b/rpcs3/CMakeLists.txt index 8302a01ac7..90e4a258d4 100644 --- a/rpcs3/CMakeLists.txt +++ b/rpcs3/CMakeLists.txt @@ -122,7 +122,7 @@ if(NOT MSVC) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,--allow-multiple-definition") endif() - add_compile_options(-msse -msse2 -mcx16 -mrtm) + add_compile_options(-msse -msse2 -mcx16) if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") # This fixes 'some' of the st11range issues. See issue #2516 diff --git a/rpcs3/Emu/Cell/MFC.cpp b/rpcs3/Emu/Cell/MFC.cpp index b50ba1a176..a8826e6793 100644 --- a/rpcs3/Emu/Cell/MFC.cpp +++ b/rpcs3/Emu/Cell/MFC.cpp @@ -6,8 +6,6 @@ #include "Emu/System.h" #include "MFC.h" -const bool s_use_rtm = utils::has_rtm(); - template <> void fmt_class_string::format(std::string& out, u64 arg) { diff --git a/rpcs3/Emu/Cell/PPUThread.cpp b/rpcs3/Emu/Cell/PPUThread.cpp index d919365d08..1952bcd80c 100644 --- a/rpcs3/Emu/Cell/PPUThread.cpp +++ b/rpcs3/Emu/Cell/PPUThread.cpp @@ -1,6 +1,7 @@ #include "stdafx.h" #include "Utilities/VirtualMemory.h" #include "Utilities/sysinfo.h" +#include "Utilities/JIT.h" #include "Crypto/sha1.h" #include "Emu/Memory/Memory.h" #include "Emu/System.h" @@ -46,7 +47,6 @@ #endif #include "define_new_memleakdetect.h" -#include "Utilities/JIT.h" #include "PPUTranslator.h" #include "Modules/cellMsgDialog.h" #endif @@ -55,8 +55,6 @@ #include #include "Utilities/GSL.h" -const bool s_use_rtm = utils::has_rtm(); - const bool s_use_ssse3 = #ifdef _MSC_VER utils::has_ssse3(); @@ -713,7 +711,12 @@ ppu_thread::ppu_thread(const std::string& name, u32 prio, u32 stack) , m_name(name) { // Trigger the scheduler - state += cpu_flag::suspend + cpu_flag::memory; + state += cpu_flag::suspend; + + if (!g_use_rtm) + { + state += cpu_flag::memory; + } } void ppu_thread::cmd_push(cmd64 cmd) @@ -942,7 +945,7 @@ static T ppu_load_acquire_reservation(ppu_thread& ppu, u32 addr) ppu.raddr = addr; // Do several attemps - for (uint i = 0; i < 5; i++) + for (uint i = 0; g_use_rtm || i < 5; i++) { ppu.rtime = vm::reservation_acquire(addr, sizeof(T)); _mm_lfence(); @@ -978,6 +981,57 @@ extern u64 ppu_ldarx(ppu_thread& ppu, u32 addr) return ppu_load_acquire_reservation(ppu, addr); } +const auto ppu_stwcx_tx = build_function_asm([](asmjit::X86Assembler& c, auto& args) +{ + using namespace asmjit; + + Label fall = c.newLabel(); + Label fail = c.newLabel(); + + // Prepare registers + c.mov(x86::rax, imm_ptr(&vm::g_reservations)); + c.mov(x86::r10, x86::qword_ptr(x86::rax)); + c.mov(x86::rax, imm_ptr(&vm::g_base_addr)); + c.mov(x86::r11, x86::qword_ptr(x86::rax)); + c.lea(x86::r11, x86::qword_ptr(x86::r11, args[0])); + c.shr(args[0], 7); + c.lea(x86::r10, x86::qword_ptr(x86::r10, args[0], 3)); + c.bswap(args[2].r32()); + c.bswap(args[3].r32()); + + // Touch memory (heavyweight) + c.lock().add(x86::dword_ptr(x86::r11), 0); + c.xor_(x86::eax, x86::eax); + c.lock().xadd(x86::qword_ptr(x86::r10), x86::rax); + c.cmp(x86::rax, args[1]); + c.jne(fail); + + // Begin transaction + build_transaction_enter(c, fall); + c.cmp(x86::qword_ptr(x86::r10), args[1]); + c.jne(fail); + c.cmp(x86::dword_ptr(x86::r11), args[2].r32()); + c.jne(fail); + c.mov(x86::dword_ptr(x86::r11), args[3].r32()); + c.rdtsc(); // destroys args[1] or args[2] + c.shl(x86::rdx, 33); + c.shl(x86::rax, 1); + c.or_(x86::rax, x86::rdx); + c.mov(x86::qword_ptr(x86::r10), x86::rax); + c.xend(); + c.mov(x86::eax, 1); + c.ret(); + + c.bind(fall); + c.sar(x86::eax, 24); + c.ret(); + + c.bind(fail); + build_transaction_abort(c, 0xff); + c.or_(x86::eax, -1); + c.ret(); +}); + extern bool ppu_stwcx(ppu_thread& ppu, u32 addr, u32 reg_value) { atomic_be_t& data = vm::_ref>(addr); @@ -988,24 +1042,31 @@ extern bool ppu_stwcx(ppu_thread& ppu, u32 addr, u32 reg_value) return false; } - if (s_use_rtm && utils::transaction_enter()) + if (g_use_rtm) { - if (!vm::g_mutex.is_lockable() || vm::g_mutex.is_reading()) + // Do several attempts (TODO) + for (u32 i = 0; i < 5; i++) { - _xabort(0); + const int r = ppu_stwcx_tx(addr, ppu.rtime, ppu.rdata, reg_value); + + if (r > 0) + { + vm::reservation_notifier(addr, sizeof(u32)).notify_all(); + ppu.raddr = 0; + return true; + } + + if (r < 0) + { + // Reservation lost + ppu.raddr = 0; + return false; + } } - const bool result = ppu.rtime == vm::reservation_acquire(addr, sizeof(u32)) && data.compare_and_swap_test(static_cast(ppu.rdata), reg_value); - - if (result) - { - vm::reservation_update(addr, sizeof(u32)); - vm::notify(addr, sizeof(u32)); - } - - _xend(); + // Give up ppu.raddr = 0; - return result; + return false; } vm::writer_lock lock(0); @@ -1015,13 +1076,64 @@ extern bool ppu_stwcx(ppu_thread& ppu, u32 addr, u32 reg_value) if (result) { vm::reservation_update(addr, sizeof(u32)); - vm::notify(addr, sizeof(u32)); + vm::reservation_notifier(addr, sizeof(u32)).notify_all(); } ppu.raddr = 0; return result; } +const auto ppu_stdcx_tx = build_function_asm([](asmjit::X86Assembler& c, auto& args) +{ + using namespace asmjit; + + Label fall = c.newLabel(); + Label fail = c.newLabel(); + + // Prepare registers + c.mov(x86::rax, imm_ptr(&vm::g_reservations)); + c.mov(x86::r10, x86::qword_ptr(x86::rax)); + c.mov(x86::rax, imm_ptr(&vm::g_base_addr)); + c.mov(x86::r11, x86::qword_ptr(x86::rax)); + c.lea(x86::r11, x86::qword_ptr(x86::r11, args[0])); + c.shr(args[0], 7); + c.lea(x86::r10, x86::qword_ptr(x86::r10, args[0], 3)); + c.bswap(args[2]); + c.bswap(args[3]); + + // Touch memory (heavyweight) + c.lock().add(x86::qword_ptr(x86::r11), 0); + c.xor_(x86::eax, x86::eax); + c.lock().xadd(x86::qword_ptr(x86::r10), x86::rax); + c.cmp(x86::rax, args[1]); + c.jne(fail); + + // Begin transaction + build_transaction_enter(c, fall); + c.cmp(x86::qword_ptr(x86::r10), args[1]); + c.jne(fail); + c.cmp(x86::qword_ptr(x86::r11), args[2]); + c.jne(fail); + c.mov(x86::qword_ptr(x86::r11), args[3]); + c.rdtsc(); // destroys args[1] or args[2] + c.shl(x86::rdx, 33); + c.shl(x86::rax, 1); + c.or_(x86::rax, x86::rdx); + c.mov(x86::qword_ptr(x86::r10), x86::rax); + c.xend(); + c.mov(x86::eax, 1); + c.ret(); + + c.bind(fall); + c.sar(x86::eax, 24); + c.ret(); + + c.bind(fail); + build_transaction_abort(c, 0xff); + c.or_(x86::eax, -1); + c.ret(); +}); + extern bool ppu_stdcx(ppu_thread& ppu, u32 addr, u64 reg_value) { atomic_be_t& data = vm::_ref>(addr); @@ -1032,24 +1144,31 @@ extern bool ppu_stdcx(ppu_thread& ppu, u32 addr, u64 reg_value) return false; } - if (s_use_rtm && utils::transaction_enter()) + if (g_use_rtm) { - if (!vm::g_mutex.is_lockable() || vm::g_mutex.is_reading()) + // Do several attempts (TODO) + for (u32 i = 0; i < 5; i++) { - _xabort(0); + const int r = ppu_stdcx_tx(addr, ppu.rtime, ppu.rdata, reg_value); + + if (r > 0) + { + vm::reservation_notifier(addr, sizeof(u64)).notify_all(); + ppu.raddr = 0; + return true; + } + + if (r < 0) + { + // Reservation lost + ppu.raddr = 0; + return false; + } } - const bool result = ppu.rtime == vm::reservation_acquire(addr, sizeof(u64)) && data.compare_and_swap_test(ppu.rdata, reg_value); - - if (result) - { - vm::reservation_update(addr, sizeof(u64)); - vm::notify(addr, sizeof(u64)); - } - - _xend(); + // Give up ppu.raddr = 0; - return result; + return false; } vm::writer_lock lock(0); @@ -1059,7 +1178,7 @@ extern bool ppu_stdcx(ppu_thread& ppu, u32 addr, u64 reg_value) if (result) { vm::reservation_update(addr, sizeof(u64)); - vm::notify(addr, sizeof(u64)); + vm::reservation_notifier(addr, sizeof(u64)).notify_all(); } ppu.raddr = 0; diff --git a/rpcs3/Emu/Cell/SPUThread.cpp b/rpcs3/Emu/Cell/SPUThread.cpp index 9abc347eb5..9c9113dc41 100644 --- a/rpcs3/Emu/Cell/SPUThread.cpp +++ b/rpcs3/Emu/Cell/SPUThread.cpp @@ -1,4 +1,5 @@ #include "stdafx.h" +#include "Utilities/JIT.h" #include "Utilities/lockless.h" #include "Utilities/sysinfo.h" #include "Emu/Memory/Memory.h" @@ -22,8 +23,7 @@ #include #include #include - -const bool s_use_rtm = utils::has_rtm(); +#include const bool s_use_ssse3 = #ifdef _MSC_VER @@ -213,6 +213,175 @@ namespace spu } } +const auto spu_putllc_tx = build_function_asm([](asmjit::X86Assembler& c, auto& args) +{ + using namespace asmjit; + + Label fall = c.newLabel(); + Label fail = c.newLabel(); + + // Prepare registers + c.mov(x86::rax, imm_ptr(&vm::g_reservations)); + c.mov(x86::r10, x86::qword_ptr(x86::rax)); + c.mov(x86::rax, imm_ptr(&vm::g_base_addr)); + c.mov(x86::r11, x86::qword_ptr(x86::rax)); + c.lea(x86::r11, x86::qword_ptr(x86::r11, args[0])); + c.shr(args[0], 4); + c.lea(x86::r10, x86::qword_ptr(x86::r10, args[0])); + + // Touch memory (heavyweight) + c.mov(x86::eax, x86::dword_ptr(args[2])); + c.mov(x86::eax, x86::dword_ptr(args[3])); + c.lock().add(x86::qword_ptr(x86::r11), 0); + c.xor_(x86::eax, x86::eax); + c.lock().xadd(x86::qword_ptr(x86::r10), x86::rax); + c.cmp(x86::rax, args[1]); + c.jne(fail); + + c.vmovups(x86::ymm0, x86::yword_ptr(args[2], 0)); + c.vmovups(x86::ymm1, x86::yword_ptr(args[2], 32)); + c.vmovups(x86::ymm2, x86::yword_ptr(args[2], 64)); + c.vmovups(x86::ymm3, x86::yword_ptr(args[2], 96)); +#ifndef _WIN32 + c.vmovups(x86::ymm6, x86::yword_ptr(args[3], 0)); + c.vmovups(x86::ymm7, x86::yword_ptr(args[3], 32)); + c.vmovups(x86::ymm8, x86::yword_ptr(args[3], 64)); + c.vmovups(x86::ymm9, x86::yword_ptr(args[3], 96)); +#endif + + // Begin transaction + build_transaction_enter(c, fall); + c.cmp(x86::qword_ptr(x86::r10), args[1]); + c.jne(fail); + c.vxorps(x86::ymm0, x86::ymm0, x86::yword_ptr(x86::r11, 0)); + c.vxorps(x86::ymm1, x86::ymm1, x86::yword_ptr(x86::r11, 32)); + c.vxorps(x86::ymm2, x86::ymm2, x86::yword_ptr(x86::r11, 64)); + c.vxorps(x86::ymm3, x86::ymm3, x86::yword_ptr(x86::r11, 96)); + c.vorps(x86::ymm0, x86::ymm0, x86::ymm1); + c.vorps(x86::ymm1, x86::ymm2, x86::ymm3); + c.vorps(x86::ymm0, x86::ymm1, x86::ymm0); + c.vptest(x86::ymm0, x86::ymm0); + c.jnz(fail); +#ifdef _WIN32 + c.vmovups(x86::ymm0, x86::yword_ptr(args[3], 0)); + c.vmovaps(x86::yword_ptr(x86::r11, 0), x86::ymm0); + c.vmovups(x86::ymm1, x86::yword_ptr(args[3], 32)); + c.vmovaps(x86::yword_ptr(x86::r11, 32), x86::ymm1); + c.vmovups(x86::ymm2, x86::yword_ptr(args[3], 64)); + c.vmovaps(x86::yword_ptr(x86::r11, 64), x86::ymm2); + c.vmovups(x86::ymm3, x86::yword_ptr(args[3], 96)); + c.vmovaps(x86::yword_ptr(x86::r11, 96), x86::ymm3); +#else + c.vmovaps(x86::yword_ptr(x86::r11, 0), x86::ymm6); + c.vmovaps(x86::yword_ptr(x86::r11, 32), x86::ymm7); + c.vmovaps(x86::yword_ptr(x86::r11, 64), x86::ymm8); + c.vmovaps(x86::yword_ptr(x86::r11, 96), x86::ymm9); +#endif + c.rdtsc(); // destroys args[1] or args[2] + c.shl(x86::rdx, 33); + c.shl(x86::rax, 1); + c.or_(x86::rax, x86::rdx); + c.mov(x86::qword_ptr(x86::r10), x86::rax); + c.xend(); + c.vzeroupper(); + c.mov(x86::eax, 1); + c.ret(); + + c.bind(fall); + c.sar(x86::eax, 24); + c.ret(); + + c.bind(fail); + build_transaction_abort(c, 0xff); + c.or_(x86::eax, -1); + c.ret(); +}); + +const auto spu_getll_tx = build_function_asm([](asmjit::X86Assembler& c, auto& args) +{ + using namespace asmjit; + + Label fall = c.newLabel(); + + // Prepare registers + c.mov(x86::rax, imm_ptr(&vm::g_reservations)); + c.mov(x86::r10, x86::qword_ptr(x86::rax)); + c.mov(x86::rax, imm_ptr(&vm::g_base_addr)); + c.mov(x86::r11, x86::qword_ptr(x86::rax)); + c.lea(x86::r11, x86::qword_ptr(x86::r11, args[0])); + c.shr(args[0], 4); + c.lea(x86::r10, x86::qword_ptr(x86::r10, args[0])); + + // Touch memory + c.mov(x86::rax, x86::qword_ptr(x86::r11)); + c.mov(x86::rax, x86::qword_ptr(x86::r10)); + + // Begin transaction + build_transaction_enter(c, fall); + c.mov(x86::rax, x86::qword_ptr(x86::r10)); + c.vmovaps(x86::ymm0, x86::yword_ptr(x86::r11, 0)); + c.vmovaps(x86::ymm1, x86::yword_ptr(x86::r11, 32)); + c.vmovaps(x86::ymm2, x86::yword_ptr(x86::r11, 64)); + c.vmovaps(x86::ymm3, x86::yword_ptr(x86::r11, 96)); + c.xend(); + c.vmovups(x86::yword_ptr(args[1], 0), x86::ymm0); + c.vmovups(x86::yword_ptr(args[1], 32), x86::ymm1); + c.vmovups(x86::yword_ptr(args[1], 64), x86::ymm2); + c.vmovups(x86::yword_ptr(args[1], 96), x86::ymm3); + c.vzeroupper(); + c.ret(); + + c.bind(fall); + c.mov(x86::eax, 1); + c.ret(); +}); + +const auto spu_putlluc_tx = build_function_asm([](asmjit::X86Assembler& c, auto& args) +{ + using namespace asmjit; + + Label fall = c.newLabel(); + + // Prepare registers + c.mov(x86::rax, imm_ptr(&vm::g_reservations)); + c.mov(x86::r10, x86::qword_ptr(x86::rax)); + c.mov(x86::rax, imm_ptr(&vm::g_base_addr)); + c.mov(x86::r11, x86::qword_ptr(x86::rax)); + c.lea(x86::r11, x86::qword_ptr(x86::r11, args[0])); + c.shr(args[0], 4); + c.lea(x86::r10, x86::qword_ptr(x86::r10, args[0])); + + // Touch memory (heavyweight) + c.lock().add(x86::qword_ptr(x86::r11), 0); + c.lock().add(x86::qword_ptr(x86::r10), 0); + + // Prepare data + c.vmovups(x86::ymm0, x86::yword_ptr(args[1], 0)); + c.vmovups(x86::ymm1, x86::yword_ptr(args[1], 32)); + c.vmovups(x86::ymm2, x86::yword_ptr(args[1], 64)); + c.vmovups(x86::ymm3, x86::yword_ptr(args[1], 96)); + + // Begin transaction + build_transaction_enter(c, fall); + c.vmovaps(x86::yword_ptr(x86::r11, 0), x86::ymm0); + c.vmovaps(x86::yword_ptr(x86::r11, 32), x86::ymm1); + c.vmovaps(x86::yword_ptr(x86::r11, 64), x86::ymm2); + c.vmovaps(x86::yword_ptr(x86::r11, 96), x86::ymm3); + c.rdtsc(); // destroys args[1] or args[2] + c.shl(x86::rdx, 33); + c.shl(x86::rax, 1); + c.or_(x86::rax, x86::rdx); + c.mov(x86::qword_ptr(x86::r10), x86::rax); + c.xend(); + c.vzeroupper(); + c.mov(x86::eax, 1); + c.ret(); + + c.bind(fall); + c.xor_(x86::eax, x86::eax); + c.ret(); +}); + void spu_int_ctrl_t::set(u64 ints) { // leave only enabled interrupts @@ -516,10 +685,12 @@ void SPUThread::cpu_task() void SPUThread::cpu_mem() { + //vm::passive_lock(*this); } void SPUThread::cpu_unmem() { + //state.test_and_set(cpu_flag::memory); } SPUThread::~SPUThread() @@ -881,42 +1052,17 @@ void SPUThread::do_putlluc(const spu_mfc_cmd& args) vm::reservation_acquire(addr, 128); // Store unconditionally - if (s_use_rtm && utils::transaction_enter()) + while (g_use_rtm) { - // First transaction attempt - if (!vm::g_mutex.is_lockable() || vm::g_mutex.is_reading()) + if (spu_putlluc_tx(addr, to_write.data())) { - _xabort(0); + vm::reservation_notifier(addr, 128).notify_all(); + tx_success++; + return; } - data = to_write; - vm::reservation_update(addr, 128); - vm::notify(addr, 128); - _xend(); - return; - } - else if (s_use_rtm) - { - vm::writer_lock lock(0); - - if (utils::transaction_enter()) - { - // Second transaction attempt - data = to_write; - vm::reservation_update(addr, 128); - _xend(); - } - else - { - vm::reservation_update(addr, 128, true); - _mm_sfence(); - data = to_write; - _mm_sfence(); - vm::reservation_update(addr, 128); - } - - vm::notify(addr, 128); - return; + busy_wait(300); + tx_failure++; } vm::writer_lock lock(0); @@ -925,7 +1071,7 @@ void SPUThread::do_putlluc(const spu_mfc_cmd& args) data = to_write; _mm_sfence(); vm::reservation_update(addr, 128); - vm::notify(addr, 128); + vm::reservation_notifier(addr, 128).notify_all(); } void SPUThread::do_mfc(bool wait) @@ -970,7 +1116,7 @@ void SPUThread::do_mfc(bool wait) { if (!test(ch_stall_mask, mask)) { - if (s_use_rtm) + if (g_use_rtm) { if (do_list_transfer(args)) { @@ -1002,7 +1148,7 @@ void SPUThread::do_mfc(bool wait) if (args.size) { - if (s_use_rtm) + if (g_use_rtm) { do_dma_transfer(args); } @@ -1067,13 +1213,6 @@ bool SPUThread::process_mfc_cmd(spu_mfc_cmd args) // Stall infinitely if MFC queue is full while (UNLIKELY(mfc_size >= 16)) { - do_mfc(); - - if (mfc_size < 16) - { - break; - } - if (test(state, cpu_flag::stop)) { return false; @@ -1102,18 +1241,11 @@ bool SPUThread::process_mfc_cmd(spu_mfc_cmd args) if (is_polling) { - vm::waiter waiter; - waiter.owner = this; - waiter.addr = raddr; - waiter.size = 128; - waiter.stamp = rtime; - waiter.data = rdata.data(); - waiter.init(); + rtime = vm::reservation_acquire(raddr, 128); + _mm_lfence(); - while (vm::reservation_acquire(raddr, 128) == waiter.stamp && rdata == data) + while (vm::reservation_acquire(raddr, 128) == rtime && rdata == data) { - vm::temporary_unlock(*this); - if (test(state, cpu_flag::stop)) { break; @@ -1123,8 +1255,23 @@ bool SPUThread::process_mfc_cmd(spu_mfc_cmd args) } } + while (g_use_rtm) + { + rtime = spu_getll_tx(raddr, rdata.data()); + + if (rtime & 1) + { + tx_failure++; + busy_wait(300); + continue; + } + + tx_success++; + break; + } + // Do several attemps - for (uint i = 0; i < 5; i++) + for (uint i = 0; !g_use_rtm && i < 5; i++) { rtime = vm::reservation_acquire(raddr, 128); _mm_lfence(); @@ -1147,19 +1294,7 @@ bool SPUThread::process_mfc_cmd(spu_mfc_cmd args) busy_wait(300); } - if (s_use_rtm && utils::transaction_enter()) - { - rtime = vm::reservation_acquire(raddr, 128); - - if (rtime & 1) - { - _xabort(0); - } - - rdata = data; - _xend(); - } - else + if (!g_use_rtm) { vm::reader_lock lock; rtime = vm::reservation_acquire(raddr, 128); @@ -1182,63 +1317,25 @@ bool SPUThread::process_mfc_cmd(spu_mfc_cmd args) if (raddr == args.eal && rtime == vm::reservation_acquire(raddr, 128)) { - // TODO: vm::check_addr - if (s_use_rtm && utils::transaction_enter()) + if (g_use_rtm) { - // First transaction attempt - if (!vm::g_mutex.is_lockable() || vm::g_mutex.is_reading()) + // Do several attempts (TODO) + for (u32 i = 0;; i++) { - _xabort(0); - } + const int r = spu_putllc_tx(raddr, rtime, rdata.data(), to_write.data()); - if (rtime == vm::reservation_acquire(raddr, 128) && rdata == data) - { - data = to_write; - result = true; - - vm::reservation_update(raddr, 128); - vm::notify(raddr, 128); - } - - _xend(); - tx_success++; - } - else if (s_use_rtm) - { - // Second transaction attempt - vm::writer_lock lock(0); - - // Touch memory without modifying the value - vm::_ref>(args.eal) += 0; - - // Touch reservation memory area as well - vm::reservation_acquire(raddr, 128) += 0; - - if (utils::transaction_enter(&tx_status)) - { - if (rtime == vm::reservation_acquire(raddr, 128) && rdata == data) + if (r > 0) { - data = to_write; + vm::reservation_notifier(raddr, 128).notify_all(); result = true; - - vm::reservation_update(raddr, 128); + tx_success++; + break; } - _xend(); - tx_success++; - - if (result) + if (r < 0) { - // First transaction attempt usually fails on vm::notify - vm::notify(raddr, 128); - } - } - else - { - // Workaround MSVC - if (rtime == vm::reservation_acquire(raddr, 128) && rdata == data) - { - vm::reservation_update(raddr, 128); + // Reservation lost + break; } // Don't fallback to heavyweight lock, just give up @@ -1248,6 +1345,7 @@ bool SPUThread::process_mfc_cmd(spu_mfc_cmd args) else if (rdata == data) { // Full lock (heavyweight) + // TODO: vm::check_addr vm::writer_lock lock(1); if (rtime == vm::reservation_acquire(raddr, 128) && rdata == data) @@ -1259,12 +1357,7 @@ bool SPUThread::process_mfc_cmd(spu_mfc_cmd args) result = true; vm::reservation_update(raddr, 128); - vm::notify(raddr, 128); - tx_success++; - } - else - { - tx_failure++; + vm::reservation_notifier(raddr, 128).notify_all(); } } } @@ -1332,7 +1425,7 @@ bool SPUThread::process_mfc_cmd(spu_mfc_cmd args) { if (LIKELY(args.size)) { - if (s_use_rtm) + if (g_use_rtm) { do_dma_transfer(args); return true; @@ -1377,7 +1470,7 @@ bool SPUThread::process_mfc_cmd(spu_mfc_cmd args) { if (LIKELY(do_dma_check(args) && !test(ch_stall_mask, 1u << args.tag))) { - if (s_use_rtm) + if (g_use_rtm) { if (LIKELY(do_list_transfer(args))) { @@ -1531,14 +1624,7 @@ s64 SPUThread::get_ch_value(u32 ch) { for (int i = 0; i < 10 && channel.get_count() == 0; i++) { - // if (!s_use_rtm && mfc_size && !i) - // { - // do_mfc(); - // } - // else - { - busy_wait(); - } + busy_wait(); } u32 out; @@ -1568,14 +1654,7 @@ s64 SPUThread::get_ch_value(u32 ch) { for (int i = 0; i < 10 && ch_in_mbox.get_count() == 0; i++) { - // if (!s_use_rtm && mfc_size && !i) - // { - // do_mfc(); - // } - // else - { - busy_wait(); - } + busy_wait(); } u32 out; @@ -1601,11 +1680,6 @@ s64 SPUThread::get_ch_value(u32 ch) case MFC_RdTagStat: { - // if (!s_use_rtm && mfc_size) - // { - // do_mfc(); - // } - if (ch_tag_stat.get_count()) { u32 out = ch_tag_stat.get_value(); @@ -1676,11 +1750,6 @@ s64 SPUThread::get_ch_value(u32 ch) case SPU_RdEventStat: { - // if (!s_use_rtm && mfc_size) - // { - // do_mfc(); - // } - u32 res = get_events(); if (res) @@ -1688,19 +1757,31 @@ s64 SPUThread::get_ch_value(u32 ch) return res; } - vm::waiter waiter; + const u32 mask1 = ch_event_mask; - if (ch_event_mask & SPU_EVENT_LR) + if (mask1 & SPU_EVENT_LR && raddr) { - waiter.owner = this; - waiter.addr = raddr; - waiter.size = 128; - waiter.stamp = rtime; - waiter.data = rdata.data(); - waiter.init(); + if (mask1 != SPU_EVENT_LR) + { + fmt::throw_exception("Not supported: event mask 0x%x" HERE, mask1); + } + + std::shared_lock pseudo_lock(vm::reservation_notifier(raddr, 128)); + + while (res = get_events(), !res) + { + if (test(state, cpu_flag::stop + cpu_flag::dbg_global_stop)) + { + return -1; + } + + pseudo_lock.mutex()->wait(100); + } + + return res; } - while (!(res = get_events(true))) + while (res = get_events(true), !res) { if (test(state & cpu_flag::stop)) { @@ -1738,11 +1819,6 @@ bool SPUThread::set_ch_value(u32 ch, u32 value) case SPU_WrOutIntrMbox: { - // if (!s_use_rtm && mfc_size) - // { - // do_mfc(false); - // } - if (offset >= RAW_SPU_BASE_ADDR) { while (!ch_out_intr_mbox.try_push(value)) @@ -1891,11 +1967,6 @@ bool SPUThread::set_ch_value(u32 ch, u32 value) case SPU_WrOutMbox: { - // if (!s_use_rtm && mfc_size) - // { - // do_mfc(false); - // } - while (!ch_out_mbox.try_push(value)) { if (test(state & cpu_flag::stop)) @@ -1939,11 +2010,6 @@ bool SPUThread::set_ch_value(u32 ch, u32 value) break; } - // if (!s_use_rtm && mfc_size) - // { - // do_mfc(false); - // } - const u32 completed = get_mfc_completed(); if (!value) @@ -2066,11 +2132,6 @@ bool SPUThread::stop_and_signal(u32 code) { LOG_TRACE(SPU, "stop_and_signal(code=0x%x)", code); - // if (!s_use_rtm && mfc_size) - // { - // do_mfc(); - // } - if (offset >= RAW_SPU_BASE_ADDR) { status.atomic_op([code](u32& status) diff --git a/rpcs3/Emu/Memory/vm.cpp b/rpcs3/Emu/Memory/vm.cpp index a00a54eef9..7f9168b5fa 100644 --- a/rpcs3/Emu/Memory/vm.cpp +++ b/rpcs3/Emu/Memory/vm.cpp @@ -2,6 +2,7 @@ #include "Memory.h" #include "Emu/System.h" #include "Utilities/mutex.h" +#include "Utilities/cond.h" #include "Utilities/Thread.h" #include "Utilities/VirtualMemory.h" #include "Emu/CPU/CPUThread.h" @@ -10,6 +11,8 @@ #include #include +static_assert(sizeof(notifier) == 8, "Unexpected size of notifier"); + namespace vm { static u8* memory_reserve_4GiB(std::uintptr_t _addr = 0) @@ -38,12 +41,12 @@ namespace vm // Reservation stats (compressed x16) u8* const g_reservations = memory_reserve_4GiB((std::uintptr_t)g_stat_addr); + // Reservation sync variables + u8* const g_reservations2 = g_reservations + 0x10000000; + // Memory locations std::vector> g_locations; - // Registered waiters - std::deque g_waiters; - // Memory mutex core shared_mutex g_mutex; @@ -239,65 +242,6 @@ namespace vm // Memory pages std::array g_pages{}; - void waiter::init() - { - // Register waiter - vm::writer_lock lock(0); - - g_waiters.emplace_back(this); - } - - void waiter::test() const - { - if (std::memcmp(data, vm::base(addr), size) == 0) - { - return; - } - - if (stamp >= reservation_acquire(addr, size)) - { - return; - } - - if (owner) - { - owner->notify(); - } - } - - waiter::~waiter() - { - // Unregister waiter - vm::writer_lock lock(0); - - // Find waiter - const auto found = std::find(g_waiters.cbegin(), g_waiters.cend(), this); - - if (found != g_waiters.cend()) - { - g_waiters.erase(found); - } - } - - void notify(u32 addr, u32 size) - { - for (const waiter* ptr : g_waiters) - { - if (ptr->addr / 128 == addr / 128) - { - ptr->test(); - } - } - } - - void notify_all() - { - for (const waiter* ptr : g_waiters) - { - ptr->test(); - } - } - static void _page_map(u32 addr, u8 flags, utils::shm& shm) { const u32 size = shm.size(); @@ -539,6 +483,7 @@ namespace vm if (addr != 0xc0000000 && addr != 0xe0000000) { utils::memory_commit(g_reservations + addr / 16, size / 16); + utils::memory_commit(g_reservations2 + addr / 16, size / 16); } } diff --git a/rpcs3/Emu/Memory/vm.h b/rpcs3/Emu/Memory/vm.h index 470ff5c240..a46402af59 100644 --- a/rpcs3/Emu/Memory/vm.h +++ b/rpcs3/Emu/Memory/vm.h @@ -8,6 +8,7 @@ class shared_mutex; class named_thread; class cpu_thread; +class notifier; namespace vm { @@ -15,6 +16,7 @@ namespace vm extern u8* const g_exec_addr; extern u8* const g_stat_addr; extern u8* const g_reservations; + extern u8* const g_reservations2; enum memory_location_t : uint { @@ -41,24 +43,6 @@ namespace vm page_allocated = (1 << 7), }; - struct waiter - { - named_thread* owner; - u32 addr; - u32 size; - u64 stamp; - const void* data; - - waiter() = default; - - waiter(const waiter&) = delete; - - void init(); - void test() const; - - ~waiter(); - }; - // Address type enum addr_t : u32 {}; @@ -112,14 +96,14 @@ namespace vm inline void reservation_update(u32 addr, u32 size, bool lsb = false) { // Update reservation info with new timestamp - reservation_acquire(addr, size) = (__rdtsc() & -2) | u64{lsb}; + reservation_acquire(addr, size) = (__rdtsc() << 1) | u64{lsb}; } - // Check and notify memory changes at address - void notify(u32 addr, u32 size); - - // Check and notify memory changes - void notify_all(); + // Get reservation sync variable + inline notifier& reservation_notifier(u32 addr, u32 size) + { + return *reinterpret_cast(g_reservations2 + addr / 16); + } // Change memory protection of specified memory region bool page_protect(u32 addr, u32 size, u8 flags_test = 0, u8 flags_set = 0, u8 flags_clear = 0); diff --git a/rpcs3/Emu/RSX/rsx_methods.cpp b/rpcs3/Emu/RSX/rsx_methods.cpp index 72f112ccfc..948f049f8f 100644 --- a/rpcs3/Emu/RSX/rsx_methods.cpp +++ b/rpcs3/Emu/RSX/rsx_methods.cpp @@ -118,16 +118,20 @@ namespace rsx rsx->sync_point_request = true; const u32 addr = get_address(method_registers.semaphore_offset_406e(), method_registers.semaphore_context_dma_406e()); - if (addr >> 28 == 0x4) + if (g_use_rtm || addr >> 28 == 0x4) { - // TODO: check no reservation area instead vm::write32(addr, arg); - return; + } + else + { + vm::reader_lock lock; + vm::write32(addr, arg); } - vm::reader_lock lock; - vm::write32(addr, arg); - vm::notify(addr, 4); + if (addr >> 28 != 0x4) + { + vm::reservation_notifier(addr, 4).notify_all(); + } } } @@ -1051,7 +1055,7 @@ namespace rsx } LOG_SUCCESS(RSX, "capture successful: %s", filePath.c_str()); - + frame_capture.reset(); Emu.Pause(); } diff --git a/rpcs3/Emu/System.cpp b/rpcs3/Emu/System.cpp index 34168e60bc..d609b1940a 100644 --- a/rpcs3/Emu/System.cpp +++ b/rpcs3/Emu/System.cpp @@ -23,6 +23,7 @@ #include "Loader/ELF.h" #include "Utilities/StrUtil.h" +#include "Utilities/sysinfo.h" #include "../Crypto/unself.h" #include "../Crypto/unpkg.h" @@ -40,6 +41,8 @@ cfg_root g_cfg; +bool g_use_rtm = utils::has_rtm(); + std::string g_cfg_defaults; extern atomic_t g_thread_count; diff --git a/rpcs3/Emu/System.h b/rpcs3/Emu/System.h index 203073a013..9c93762bf6 100644 --- a/rpcs3/Emu/System.h +++ b/rpcs3/Emu/System.h @@ -456,3 +456,5 @@ struct cfg_root : cfg::node }; extern cfg_root g_cfg; + +extern bool g_use_rtm;