From 75ad56338bff69ef3f02fe90f54da566c6bc2cf3 Mon Sep 17 00:00:00 2001
From: Eladash
Date: Mon, 22 Aug 2022 21:31:12 +0300
Subject: [PATCH] SPU/Non-TSX: Implement concurrent reservations

---
 rpcs3/Emu/CPU/CPUThread.cpp | 30 ++--
 rpcs3/Emu/Cell/SPUThread.cpp | 35 ++++-
 rpcs3/Emu/Cell/lv2/sys_ppu_thread.cpp | 22 +--
 rpcs3/Emu/Memory/vm.cpp | 214 +++++++++++++++++---------
 rpcs3/Emu/Memory/vm_locking.h | 12 +-
 5 files changed, 210 insertions(+), 103 deletions(-)

diff --git a/rpcs3/Emu/CPU/CPUThread.cpp b/rpcs3/Emu/CPU/CPUThread.cpp
index dd26620ab1..031176a948 100644
--- a/rpcs3/Emu/CPU/CPUThread.cpp
+++ b/rpcs3/Emu/CPU/CPUThread.cpp
@@ -682,6 +682,7 @@ static atomic_t s_dummy_atomic = 0;
 bool cpu_thread::check_state() noexcept
 {
 	bool cpu_sleep_called = false;
+	bool cpu_memory_checked = false;
 	bool cpu_can_stop = true;
 	bool escape{}, retval{};
 
@@ -770,7 +771,7 @@ bool cpu_thread::check_state() noexcept
 		if (!is_stopped(flags) && flags.none_of(cpu_flag::ret))
 		{
 			// Check pause flags which hold thread inside check_state (ignore suspend/debug flags on cpu_flag::temp)
-			if (flags & (cpu_flag::pause + cpu_flag::memory) || (cpu_can_stop && flags & (cpu_flag::dbg_global_pause + cpu_flag::dbg_pause + cpu_flag::suspend + cpu_flag::yield + cpu_flag::preempt)))
+			if (flags & cpu_flag::pause || (!cpu_memory_checked && flags & cpu_flag::memory) || (cpu_can_stop && flags & (cpu_flag::dbg_global_pause + cpu_flag::dbg_pause + cpu_flag::suspend + cpu_flag::yield + cpu_flag::preempt)))
 			{
 				if (!(flags & cpu_flag::wait))
 				{
@@ -789,12 +790,18 @@ bool cpu_thread::check_state() noexcept
 					return store;
 				}
 
+				if (flags & (cpu_flag::wait + cpu_flag::memory))
+				{
+					flags -= (cpu_flag::wait + cpu_flag::memory);
+					store = true;
+				}
+
 				if (s_tls_thread_slot == umax)
 				{
-					if (cpu_flag::wait - state)
+					if (cpu_flag::wait - this->state.load())
 					{
 						// Force wait flag (must be set during ownership of s_cpu_lock), this makes the atomic op fail as a side effect
-						state += cpu_flag::wait;
+						this->state += cpu_flag::wait;
 						store = true;
 					}
 
@@ -802,12 +809,6 @@ bool cpu_thread::check_state() noexcept
 					cpu_counter::add(this);
 				}
 
-				if (flags & cpu_flag::wait)
-				{
-					flags -= cpu_flag::wait;
-					store = true;
-				}
-
 				retval = false;
 			}
 			else
@@ -856,6 +857,14 @@ bool cpu_thread::check_state() noexcept
 
 		if (escape)
 		{
+			if (vm::g_range_lock_bits[1] && vm::g_tls_locked && *vm::g_tls_locked == this)
+			{
+				state += cpu_flag::wait + cpu_flag::memory;
+				cpu_sleep_called = false;
+				cpu_memory_checked = false;
+				continue;
+			}
+
 			if (cpu_can_stop && state0 & cpu_flag::pending)
 			{
 				// Execute pending work
@@ -866,6 +875,7 @@ bool cpu_thread::check_state() noexcept
 				// Work could have changed flags
 				// Reset internal flags as if check_state() has just been called
 				cpu_sleep_called = false;
+				cpu_memory_checked = false;
 				continue;
 			}
 		}
@@ -883,6 +893,7 @@ bool cpu_thread::check_state() noexcept
 		{
 			cpu_sleep();
 			cpu_sleep_called = true;
+			cpu_memory_checked = false;
 
 			if (s_tls_thread_slot != umax)
 			{
@@ -907,6 +918,7 @@ bool cpu_thread::check_state() noexcept
 		if (state0 & cpu_flag::memory)
 		{
 			vm::passive_lock(*this);
+			cpu_memory_checked = true;
 			continue;
 		}
 
diff --git a/rpcs3/Emu/Cell/SPUThread.cpp b/rpcs3/Emu/Cell/SPUThread.cpp
index 805751a8de..7a36b46565 100644
--- a/rpcs3/Emu/Cell/SPUThread.cpp
+++ b/rpcs3/Emu/Cell/SPUThread.cpp
@@ -326,6 +326,29 @@ extern thread_local u64 g_tls_fault_spu;
 
 const spu_decoder<spu_itype> s_spu_itype;
 
+namespace vm
+{
+	extern atomic_t<u64> g_range_lock_set[64];
+
+	// Defined here for performance reasons
+	writer_lock::~writer_lock() noexcept
+	{
+		if (range_lock)
+		{
+			if (!*range_lock)
+			{
+				return;
+			}
+
+			g_range_lock_bits[1] &= ~(1ull << (range_lock - g_range_lock_set));
+			range_lock->release(0);
+			return;
+		}
+
+		g_range_lock_bits[1].release(0);
+	}
+}
+
 namespace spu
 {
 	namespace scheduler
@@ -3548,19 +3571,18 @@ bool spu_thread::do_putllc(const spu_mfc_cmd& args)
 				{
 					// Full lock (heavyweight)
 					// TODO: vm::check_addr
-					vm::writer_lock lock(addr);
+					vm::writer_lock lock(addr, range_lock);
 
 					if (cmp_rdata(rdata, super_data))
 					{
 						mov_rdata(super_data, to_write);
-						res += 64;
 						return true;
 					}
 
-					res -= 64;
 					return false;
 				}();
+
+				res += success ? 64 : 0 - 64;
 
 				return success;
 			}())
 			{
@@ -3695,7 +3717,8 @@ void do_cell_atomic_128_store(u32 addr, const void* to_write)
 		vm::_ref<atomic_t<u32>>(addr) += 0;
 
 		// Hard lock
-		vm::writer_lock lock(addr);
+		auto spu = cpu ? cpu->try_get<spu_thread>() : nullptr;
+		vm::writer_lock lock(addr, spu ? spu->range_lock : nullptr);
 		mov_rdata(sdata, *static_cast<const spu_rdata_t*>(to_write));
 		vm::reservation_acquire(addr) += 32;
 	}
@@ -4461,7 +4484,7 @@ bool spu_thread::reservation_check(u32 addr, const decltype(rdata)& data) const
 	// Set range_lock first optimistically
 	range_lock->store(u64{128} << 32 | addr);
 
-	u64 lock_val = vm::g_range_lock;
+	u64 lock_val = *std::prev(std::end(vm::g_range_lock_set));
 	u64 old_lock = 0;
 
 	while (lock_val != old_lock)
@@ -4516,7 +4539,7 @@ bool spu_thread::reservation_check(u32 addr, const decltype(rdata)& data) const
 			break;
 		}
 
-		old_lock = std::exchange(lock_val, vm::g_range_lock);
+		old_lock = std::exchange(lock_val, *std::prev(std::end(vm::g_range_lock_set)));
 	}
 
 	if (!range_lock->load()) [[unlikely]]
diff --git a/rpcs3/Emu/Cell/lv2/sys_ppu_thread.cpp b/rpcs3/Emu/Cell/lv2/sys_ppu_thread.cpp
index 35661a22cd..1440a32883 100644
--- a/rpcs3/Emu/Cell/lv2/sys_ppu_thread.cpp
+++ b/rpcs3/Emu/Cell/lv2/sys_ppu_thread.cpp
@@ -79,15 +79,7 @@ constexpr u32 c_max_ppu_name_size = 28;
 void _sys_ppu_thread_exit(ppu_thread& ppu, u64 errorcode)
 {
 	ppu.state += cpu_flag::wait;
-
-	// Need to wait until the current writer finish
-	if (ppu.state & cpu_flag::memory)
-	{
-		while (vm::g_range_lock)
-		{
-			busy_wait(200);
-		}
-	}
+	u64 writer_mask = 0;
 
 	sys_ppu_thread.trace("_sys_ppu_thread_exit(errorcode=0x%llx)", errorcode);
 
@@ -126,6 +118,9 @@ void _sys_ppu_thread_exit(ppu_thread& ppu, u64 errorcode)
 		old_ppu = g_fxo->get().clean(std::move(idm::find_unlocked<named_thread<ppu_thread>>(ppu.id)->second));
 	}
 
+	// Get writers mask (wait for all current writers to quit)
+	writer_mask = vm::g_range_lock_bits[1];
+
 	// Unqueue
 	lv2_obj::sleep(ppu);
 	notify.cleanup();
@@ -154,6 +149,15 @@ void _sys_ppu_thread_exit(ppu_thread& ppu, u64 errorcode)
 		// It is detached from IDM now so join must be done explicitly now
 		*static_cast<named_thread<ppu_thread>*>(old_ppu.get()) = thread_state::finished;
 	}
+
+	// Need to wait until the current writers finish
+	if (ppu.state & cpu_flag::memory)
+	{
+		for (; writer_mask; writer_mask &= vm::g_range_lock_bits[1])
+		{
+			busy_wait(200);
+		}
+	}
 }
 
 s32 sys_ppu_thread_yield(ppu_thread& ppu)
diff --git a/rpcs3/Emu/Memory/vm.cpp b/rpcs3/Emu/Memory/vm.cpp
index a7d4647434..0c2c719957 100644
--- a/rpcs3/Emu/Memory/vm.cpp
+++ b/rpcs3/Emu/Memory/vm.cpp
@@ -70,14 +70,16 @@ namespace vm
 	// Memory mutex acknowledgement
 	thread_local atomic_t<cpu_thread*>* g_tls_locked = nullptr;
 
-	// "Unique locked" range lock, as opposed to "shared" range locks from set
-	atomic_t<u64> g_range_lock = 0;
-
 	// Memory mutex: passive locks
 	std::array<atomic_t<cpu_thread*>, g_cfg.core.ppu_threads.max> g_locks{};
 
 	// Range lock slot allocation bits
-	atomic_t<u64> g_range_lock_bits{};
+	atomic_t<u64> g_range_lock_bits[2]{};
+
+	auto& get_range_lock_bits(bool
is_exclusive_range) + { + return g_range_lock_bits[+is_exclusive_range]; + } // Memory range lock slots (sparse atomics) atomic_t g_range_lock_set[64]{}; @@ -138,9 +140,10 @@ namespace vm atomic_t* alloc_range_lock() { - const auto [bits, ok] = g_range_lock_bits.fetch_op([](u64& bits) + const auto [bits, ok] = get_range_lock_bits(false).fetch_op([](u64& bits) { - if (~bits) [[likely]] + // MSB is reserved for locking with memory setting changes + if ((~(bits | (bits + 1))) << 1) [[likely]] { bits |= bits + 1; return true; @@ -157,6 +160,9 @@ namespace vm return &g_range_lock_set[std::countr_one(bits)]; } + template + static u64 for_all_range_locks(u64 input, F func); + void range_lock_internal(atomic_t* range_lock, u32 begin, u32 size) { perf_meter<"RHW_LOCK"_u64> perf0(0); @@ -168,32 +174,44 @@ namespace vm range_lock->store(to_store); } - for (u64 i = 0;; i++) + for (u64 i = 0, to_clear = umax;; i++) { - const u64 lock_val = g_range_lock.load(); const u64 is_share = g_shmem[begin >> 16].load(); + to_clear &= get_range_lock_bits(true); - u64 lock_addr = static_cast(lock_val); // -> u64 - u32 lock_size = static_cast(lock_val << range_bits >> (range_bits + 32)); - - u64 addr = begin; - - if ((lock_val & range_full_mask) == range_locked) [[likely]] + const u64 busy = for_all_range_locks(to_clear, [&](u64 addr_exec, u32 size_exec) { - lock_size = 128; + u64 addr = begin; - if (is_share) + if ((size_exec & (range_full_mask >> 32)) == (range_locked >> 32)) [[likely]] { - addr = static_cast(addr) | is_share; - lock_addr = lock_val; + size_exec = 128; + + if (is_share) + { + addr = static_cast(addr) | is_share; + } } - } - if (addr + size <= lock_addr || addr >= lock_addr + lock_size) [[likely]] + size_exec = (size_exec << range_bits) >> range_bits; + + // TODO (currently not possible): handle 2 64K pages (inverse range), or more pages + if (u64 is_shared = g_shmem[addr_exec >> 16]) [[unlikely]] + { + addr_exec = static_cast(addr_exec) | is_shared; + } + + if (addr <= addr_exec + size_exec - 1 && addr_exec <= addr + size - 1) [[unlikely]] + { + return 1; + } + + return 0; + }); + + if (!busy) [[likely]] { - const u64 new_lock_val = g_range_lock.load(); - - if (vm::check_addr(begin, vm::page_readable, size) && (!new_lock_val || new_lock_val == lock_val)) [[likely]] + if (vm::check_addr(begin, vm::page_readable, size)) [[likely]] { break; } @@ -265,7 +283,7 @@ namespace vm // Use ptr difference to determine location const auto diff = range_lock - g_range_lock_set; - g_range_lock_bits &= ~(1ull << diff); + g_range_lock_bits[0] &= ~(1ull << diff); } template @@ -295,13 +313,13 @@ namespace vm return result; } - static void _lock_main_range_lock(u64 flags, u32 addr, u32 size) + static atomic_t* _lock_main_range_lock(u64 flags, u32 addr, u32 size) { // Shouldn't really happen if (size == 0) { vm_log.warning("Tried to lock empty range (flags=0x%x, addr=0x%x)", flags >> 32, addr); - return; + return {}; } // Limit to <512 MiB at once; make sure if it operates on big amount of data, it's page-aligned @@ -311,7 +329,8 @@ namespace vm } // Block or signal new range locks - g_range_lock = addr | u64{size} << 32 | flags; + auto range_lock = &*std::prev(std::end(vm::g_range_lock_set)); + *range_lock = addr | u64{size} << 32 | flags; utils::prefetch_read(g_range_lock_set + 0); utils::prefetch_read(g_range_lock_set + 2); @@ -319,7 +338,7 @@ namespace vm const auto range = utils::address_range::start_length(addr, size); - u64 to_clear = g_range_lock_bits.load(); + u64 to_clear = get_range_lock_bits(false).load(); 
while (to_clear) { @@ -340,22 +359,21 @@ namespace vm utils::pause(); } + + return range_lock; } void passive_lock(cpu_thread& cpu) { + ensure(cpu.state & cpu_flag::wait); + bool ok = true; if (!g_tls_locked || *g_tls_locked != &cpu) [[unlikely]] { _register_lock(&cpu); - if (cpu.state & cpu_flag::memory) [[likely]] - { - cpu.state -= cpu_flag::memory; - } - - if (!g_range_lock) + if (!get_range_lock_bits(true)) { return; } @@ -367,19 +385,18 @@ namespace vm { for (u64 i = 0;; i++) { + if (cpu.is_paused()) + { + // Assume called from cpu_thread::check_state(), it can handle the pause flags better + return; + } + if (i < 100) busy_wait(200); else std::this_thread::yield(); - if (g_range_lock) - { - continue; - } - - cpu.state -= cpu_flag::memory; - - if (!g_range_lock) [[likely]] + if (!get_range_lock_bits(true)) [[likely]] { return; } @@ -427,18 +444,22 @@ namespace vm } } - writer_lock::writer_lock() - : writer_lock(0, 1) + writer_lock::writer_lock() noexcept + : writer_lock(0, nullptr, 1) { } - writer_lock::writer_lock(u32 const addr, u32 const size, u64 const flags) + writer_lock::writer_lock(u32 const addr, atomic_t* range_lock, u32 const size, u64 const flags) noexcept + : range_lock(range_lock) { - auto cpu = get_current_cpu_thread(); + cpu_thread* cpu{}; - if (cpu) + if (g_tls_locked) { - if (!g_tls_locked || *g_tls_locked != cpu || cpu->state & cpu_flag::wait) + cpu = get_current_cpu_thread(); + AUDIT(cpu); + + if (*g_tls_locked != cpu || cpu->state & cpu_flag::wait) { cpu = nullptr; } @@ -448,18 +469,51 @@ namespace vm } } + bool to_prepare_memory = addr >= 0x10000; + for (u64 i = 0;; i++) { - if (g_range_lock || !g_range_lock.compare_and_swap_test(0, addr | u64{size} << 32 | flags)) + auto& bits = get_range_lock_bits(true); + + if (!range_lock || addr < 0x10000) { - if (i < 100) - busy_wait(200); - else - std::this_thread::yield(); + if (!bits && bits.compare_and_swap_test(0, u64{umax})) + { + break; + } } else { - break; + range_lock->release(addr | u64{size} << 32 | flags); + + const auto diff = range_lock - g_range_lock_set; + + if (bits != umax && !bits.bit_test_set(diff)) + { + break; + } + + range_lock->release(0); + } + + if (i < 100) + { + if (to_prepare_memory) + { + // We have some spare time, prepare cache lines (todo: reservation tests here) + utils::prefetch_write(vm::get_super_ptr(addr)); + utils::prefetch_write(vm::get_super_ptr(addr) + 64); + to_prepare_memory = false; + } + + busy_wait(200); + } + else + { + std::this_thread::yield(); + + // Thread may have been switched or the cache clue has been undermined, cache needs to be prapred again + to_prepare_memory = true; } } @@ -469,7 +523,7 @@ namespace vm for (auto lock = g_locks.cbegin(), end = lock + g_cfg.core.ppu_threads; lock != end; lock++) { - if (auto ptr = +*lock; ptr && !(ptr->state & cpu_flag::memory)) + if (auto ptr = +*lock; ptr && ptr->state.none_of(cpu_flag::wait + cpu_flag::memory)) { ptr->state.test_and_set(cpu_flag::memory); } @@ -487,13 +541,13 @@ namespace vm utils::prefetch_read(g_range_lock_set + 2); utils::prefetch_read(g_range_lock_set + 4); - u64 to_clear = g_range_lock_bits.load(); + u64 to_clear = get_range_lock_bits(false); u64 point = addr1 / 128; while (true) { - to_clear = for_all_range_locks(to_clear, [&](u64 addr2, u32 size2) + to_clear = for_all_range_locks(to_clear & ~get_range_lock_bits(true), [&](u64 addr2, u32 size2) { // Split and check every 64K page separately for (u64 hi = addr2 >> 16, max = (addr2 + size2 - 1) >> 16; hi <= max; hi++) @@ -523,6 +577,13 @@ namespace vm 
break; } + if (to_prepare_memory) + { + utils::prefetch_write(vm::get_super_ptr(addr)); + utils::prefetch_write(vm::get_super_ptr(addr) + 64); + to_prepare_memory = false; + } + utils::pause(); } @@ -532,6 +593,13 @@ namespace vm { while (!(ptr->state & cpu_flag::wait)) { + if (to_prepare_memory) + { + utils::prefetch_write(vm::get_super_ptr(addr)); + utils::prefetch_write(vm::get_super_ptr(addr) + 64); + to_prepare_memory = false; + } + utils::pause(); } } @@ -544,11 +612,6 @@ namespace vm } } - writer_lock::~writer_lock() - { - g_range_lock = 0; - } - u64 reservation_lock_internal(u32 addr, atomic_t& res) { for (u64 i = 0;; i++) @@ -672,7 +735,7 @@ namespace vm const bool is_noop = bflags & page_size_4k && utils::c_page_size > 4096; // Lock range being mapped - _lock_main_range_lock(range_allocation, addr, size); + auto range_lock = _lock_main_range_lock(range_allocation, addr, size); if (shm && shm->flags() != 0 && shm->info++) { @@ -788,6 +851,8 @@ namespace vm fmt::throw_exception("Concurrent access (addr=0x%x, size=0x%x, flags=0x%x, current_addr=0x%x)", addr, size, flags, i * 4096); } } + + range_lock->release(0); } bool page_protect(u32 addr, u32 size, u8 flags_test, u8 flags_set, u8 flags_clear) @@ -845,7 +910,7 @@ namespace vm safe_bits |= range_writable; // Protect range locks from observing changes in memory protection - _lock_main_range_lock(safe_bits, start * 4096, page_size); + auto range_lock = _lock_main_range_lock(safe_bits, start * 4096, page_size); for (u32 j = start; j < i; j++) { @@ -857,6 +922,8 @@ namespace vm const auto protection = start_value & page_writable ? utils::protection::rw : (start_value & page_readable ? utils::protection::ro : utils::protection::no); utils::memory_protect(g_base_addr + start * 4096, page_size, protection); } + + range_lock->release(0); } start_value = new_val; @@ -904,7 +971,7 @@ namespace vm } // Protect range locks from actual memory protection changes - _lock_main_range_lock(range_allocation, addr, size); + auto range_lock = _lock_main_range_lock(range_allocation, addr, size); if (shm && shm->flags() != 0 && g_shmem[addr >> 16]) { @@ -965,6 +1032,7 @@ namespace vm } } + range_lock->release(0); return size; } @@ -1966,11 +2034,13 @@ namespace vm { auto* range_lock = alloc_range_lock(); // Released at the end of function - range_lock->store(begin | (u64{size} << 32)); + auto mem_lock = &*std::prev(std::end(vm::g_range_lock_set)); while (true) { - const u64 lock_val = g_range_lock.load(); + range_lock->store(begin | (u64{size} << 32)); + + const u64 lock_val = mem_lock->load(); const u64 is_share = g_shmem[begin >> 16].load(); u64 lock_addr = static_cast(lock_val); // -> u64 @@ -1993,7 +2063,7 @@ namespace vm { if (vm::check_addr(begin, is_write ? 
page_writable : page_readable, size)) [[likely]] { - const u64 new_lock_val = g_range_lock.load(); + const u64 new_lock_val = mem_lock->load(); if (!new_lock_val || new_lock_val == lock_val) [[likely]] { @@ -2026,8 +2096,6 @@ namespace vm range_lock->release(0); busy_wait(200); - - range_lock->store(begin | (u64{size} << 32)); } const bool result = try_access_internal(begin, ptr, size, is_write); @@ -2071,7 +2139,7 @@ namespace vm std::memset(g_reservations, 0, sizeof(g_reservations)); std::memset(g_shmem, 0, sizeof(g_shmem)); std::memset(g_range_lock_set, 0, sizeof(g_range_lock_set)); - g_range_lock_bits = 0; + std::memset(g_range_lock_bits, 0, sizeof(g_range_lock_bits)); #ifdef _WIN32 utils::memory_release(g_hook_addr, 0x800000000); @@ -2104,7 +2172,7 @@ namespace vm #endif std::memset(g_range_lock_set, 0, sizeof(g_range_lock_set)); - g_range_lock_bits = 0; + std::memset(g_range_lock_bits, 0, sizeof(g_range_lock_bits)); } void save(utils::serial& ar) @@ -2209,8 +2277,6 @@ namespace vm loc = std::make_shared(ar, shared); } } - - g_range_lock = 0; } u32 get_shm_addr(const std::shared_ptr& shared) diff --git a/rpcs3/Emu/Memory/vm_locking.h b/rpcs3/Emu/Memory/vm_locking.h index 9923464434..253af406db 100644 --- a/rpcs3/Emu/Memory/vm_locking.h +++ b/rpcs3/Emu/Memory/vm_locking.h @@ -27,7 +27,7 @@ namespace vm range_bits = 3, }; - extern atomic_t g_range_lock; + extern atomic_t g_range_lock_bits[2]; extern atomic_t g_shmem[]; @@ -61,7 +61,7 @@ namespace vm __asm__(""); // Tiny barrier #endif - if (!g_range_lock) + if (!g_range_lock_bits[1]) [[likely]] { return; } @@ -82,10 +82,12 @@ namespace vm struct writer_lock final { + atomic_t* range_lock; + writer_lock(const writer_lock&) = delete; writer_lock& operator=(const writer_lock&) = delete; - writer_lock(); - writer_lock(u32 addr, u32 size = 0, u64 flags = range_locked); - ~writer_lock(); + writer_lock() noexcept; + writer_lock(u32 addr, atomic_t* range_lock = nullptr, u32 size = 128, u64 flags = range_locked) noexcept; + ~writer_lock() noexcept; }; } // namespace vm
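Note on the locking scheme used by this patch (explanatory material, not part of the patch itself): vm.cpp keeps 64 sparse range-lock slots in g_range_lock_set plus two allocation masks in g_range_lock_bits. Index 0 tracks which slots reader-side threads have allocated (the last slot is reserved for whole-memory operations such as map/unmap and protection changes, via _lock_main_range_lock), while index 1 marks slots currently held by writers, so a waiter such as _sys_ppu_thread_exit only has to spin until the writer bits it sampled have cleared. The sketch below is a minimal model of that idea using standard C++20 std::atomic; the namespace, function names and the single-word "addr | size << 32" encoding are illustrative assumptions, not the emulator's actual atomic_t API.

// Simplified, self-contained illustration of a two-level range-lock bitmask
// (assumed names; std::atomic stands in for rpcs3's atomic_t).
#include <atomic>
#include <bit>
#include <cstdint>

namespace range_lock_demo
{
	using u64 = std::uint64_t;
	using u32 = std::uint32_t;

	// 64 sparse slots; a held slot stores "addr | size << 32".
	std::atomic<u64> lock_set[64]{};

	// lock_bits[0]: which slots are allocated to reader-side threads (bit 63 reserved).
	// lock_bits[1]: which slots are currently held by writers (the "exclusive" set).
	std::atomic<u64> lock_bits[2]{};

	// Reader side: allocate the lowest free slot, excluding the reserved MSB.
	std::atomic<u64>* alloc_slot()
	{
		for (;;)
		{
			u64 bits = lock_bits[0].load();
			const int slot = std::countr_one(bits); // index of the lowest clear bit

			if (slot >= 63)
				continue; // only the reserved slot is left; a real implementation would wait here

			if (lock_bits[0].compare_exchange_weak(bits, bits | (u64{1} << slot)))
				return &lock_set[slot];
		}
	}

	void free_slot(std::atomic<u64>* slot)
	{
		slot->store(0);
		lock_bits[0].fetch_and(~(u64{1} << (slot - lock_set)));
	}

	// Writer side: publish the locked range, then advertise the slot in the writer mask
	// so other threads can wait only for writers that already existed when they sampled it.
	void writer_acquire(std::atomic<u64>* slot, u32 addr, u32 size)
	{
		slot->store(addr | (u64{size} << 32));
		lock_bits[1].fetch_or(u64{1} << (slot - lock_set));
	}

	void writer_release(std::atomic<u64>* slot)
	{
		lock_bits[1].fetch_and(~(u64{1} << (slot - lock_set)));
		slot->store(0);
	}

	// Waiter: sample the writer mask once and spin only until those writers are gone,
	// mirroring the new wait loop in _sys_ppu_thread_exit().
	void wait_for_current_writers()
	{
		for (u64 mask = lock_bits[1].load(); mask; mask &= lock_bits[1].load())
		{
			// the emulator calls busy_wait(200) here; a plain spin stands in for it
		}
	}
}

The key property this models is that a writer never blocks slot allocation by readers: readers and writers coordinate only through per-slot bits, which is what lets multiple reservations proceed concurrently on the non-TSX path.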