From 5f618814f6e0f8c54d723f763823ccd57869b54a Mon Sep 17 00:00:00 2001 From: Nekotekina Date: Sun, 20 Dec 2020 09:27:40 +0300 Subject: [PATCH] atomic.hpp: use u128 as storage for masks/values --- Utilities/lockless.h | 12 ---- rpcs3/util/atomic.cpp | 91 ++++++++----------------- rpcs3/util/atomic.hpp | 138 +++++++------------------------------- rpcs3/util/shared_ptr.hpp | 6 +- 4 files changed, 57 insertions(+), 190 deletions(-) diff --git a/Utilities/lockless.h b/Utilities/lockless.h index 0b94557302..45f23232b5 100644 --- a/Utilities/lockless.h +++ b/Utilities/lockless.h @@ -425,18 +425,6 @@ public: } }; -namespace atomic_wait -{ - template - inline __m128i default_mask> = _mm_cvtsi64_si128(-1); - - template - constexpr __m128i get_value(lf_queue&, std::nullptr_t value = nullptr) - { - return _mm_setzero_si128(); - } -} - // Concurrent linked list, elements remain until destroyed. template class lf_bunch final diff --git a/rpcs3/util/atomic.cpp b/rpcs3/util/atomic.cpp index df69964f0e..86a818a600 100644 --- a/rpcs3/util/atomic.cpp +++ b/rpcs3/util/atomic.cpp @@ -42,11 +42,7 @@ static inline bool operator &(atomic_wait::op lhs, atomic_wait::op_flag rhs) } // Compare data in memory with old value, and return true if they are equal -static NEVER_INLINE bool -#ifdef _WIN32 -__vectorcall -#endif -ptr_cmp(const void* data, u32 _size, __m128i old128, __m128i mask128, atomic_wait::info* ext = nullptr) +static NEVER_INLINE bool ptr_cmp(const void* data, u32 _size, u128 old128, u128 mask128, atomic_wait::info* ext = nullptr) { using atomic_wait::op; using atomic_wait::op_flag; @@ -59,8 +55,8 @@ ptr_cmp(const void* data, u32 _size, __m128i old128, __m128i mask128, atomic_wai if (size <= 8) { u64 new_value = 0; - u64 old_value = _mm_cvtsi128_si64(old128); - u64 mask = _mm_cvtsi128_si64(mask128) & (UINT64_MAX >> ((64 - size * 8) & 63)); + u64 old_value = static_cast(old128); + u64 mask = static_cast(mask128) & (UINT64_MAX >> ((64 - size * 8) & 63)); // Don't load memory on empty mask switch (mask ? size : 0) @@ -145,7 +141,7 @@ ptr_cmp(const void* data, u32 _size, __m128i old128, __m128i mask128, atomic_wai case op::pop: { // Count is taken from least significant byte and ignores some flags - const u64 count = _mm_cvtsi128_si64(old128) & 0xff; + const u64 count = static_cast(old128) & 0xff; u64 bitc = new_value; bitc = (bitc & 0xaaaaaaaaaaaaaaaa) / 2 + (bitc & 0x5555555555555555); @@ -210,23 +206,18 @@ ptr_cmp(const void* data, u32 _size, __m128i old128, __m128i mask128, atomic_wai } // Returns true if mask overlaps, or the argument is invalid -static bool -#ifdef _WIN32 -__vectorcall -#endif -cmp_mask(u32 size1, __m128i mask1, __m128i val1, u32 size2, __m128i mask2, __m128i val2) +static bool cmp_mask(u32 size1, u128 mask1, u128 val1, u32 size2, u128 mask2, u128 val2) { // Compare only masks, new value is not available in this mode if (size1 == umax) { // Simple mask overlap - const auto v0 = _mm_and_si128(mask1, mask2); - const auto v1 = _mm_packs_epi16(v0, v0); - return !!_mm_cvtsi128_si64(v1); + const u128 v0 = mask1 & mask2; + return !!(v0); } // Generate masked value inequality bits - const auto v0 = _mm_and_si128(_mm_and_si128(mask1, mask2), _mm_xor_si128(val1, val2)); + const u128 v0 = (mask1 & mask2) & (val1 ^ val2); using atomic_wait::op; using atomic_wait::op_flag; @@ -244,14 +235,14 @@ cmp_mask(u32 size1, __m128i mask1, __m128i val1, u32 size2, __m128i mask2, __m12 // Generate sized mask const u64 mask = UINT64_MAX >> ((64 - size * 8) & 63); - if (!(_mm_cvtsi128_si64(v0) & mask)) + if (!(static_cast(v0) & mask)) { return !!(flag & op_flag::inverse); } } else if (size == 16) { - if (!_mm_cvtsi128_si64(_mm_packs_epi16(v0, v0))) + if (!v0) { return !!(flag & op_flag::inverse); } @@ -328,8 +319,8 @@ namespace // Combined pointer (most significant 47 bits) and ref counter (17 least significant bits) atomic_t ptr_ref; u64 tid; - __m128i mask; - __m128i oldv; + u128 mask; + u128 oldv; u64 tsc0; u16 link; @@ -367,8 +358,8 @@ namespace size = 0; flag = 0; sync.release(0); - mask = _mm_setzero_si128(); - oldv = _mm_setzero_si128(); + mask = 0; + oldv = 0; #ifdef USE_STD mtx.destroy(); @@ -557,11 +548,7 @@ namespace // TLS storage for few allocaded "semaphores" to allow skipping initialization static thread_local tls_cond_handler s_tls_conds{}; -static u32 -#ifdef _WIN32 -__vectorcall -#endif -cond_alloc(uptr iptr, __m128i mask, u32 tls_slot = -1) +static u32 cond_alloc(uptr iptr, u128 mask, u32 tls_slot = -1) { // Try to get cond from tls slot instead u16* ptls = tls_slot >= std::size(s_tls_conds.cond) ? nullptr : s_tls_conds.cond + tls_slot; @@ -672,7 +659,7 @@ static void cond_free(u32 cond_id, u32 tls_slot = -1) { // Fast finalization cond->sync.release(0); - cond->mask = _mm_setzero_si128(); + cond->mask = 0; *ptls = static_cast(cond_id); return; } @@ -709,11 +696,7 @@ static void cond_free(u32 cond_id, u32 tls_slot = -1) }); } -static cond_handle* -#ifdef _WIN32 -__vectorcall -#endif -cond_id_lock(u32 cond_id, u32 size, __m128i mask, u64 thread_id = 0, uptr iptr = 0) +static cond_handle* cond_id_lock(u32 cond_id, u32 size, u128 mask, u64 thread_id = 0, uptr iptr = 0) { if (cond_id - 1 < u32{UINT16_MAX}) { @@ -740,7 +723,7 @@ cond_id_lock(u32 cond_id, u32 size, __m128i mask, u64 thread_id = 0, uptr iptr = return false; } - const __m128i mask12 = _mm_and_si128(mask, _mm_load_si128(&cond->mask)); + const u128 mask12 = mask & cond->mask; if (thread_id) { @@ -749,7 +732,7 @@ cond_id_lock(u32 cond_id, u32 size, __m128i mask, u64 thread_id = 0, uptr iptr = return false; } } - else if (size && _mm_cvtsi128_si64(_mm_packs_epi16(mask12, mask12)) == 0) + else if (size && !mask12) { return false; } @@ -805,7 +788,7 @@ namespace static void slot_free(uptr ptr, atomic_t* slot, u32 tls_slot) noexcept; template - static auto slot_search(uptr iptr, u32 size, u64 thread_id, __m128i mask, F func) noexcept; + static auto slot_search(uptr iptr, u32 size, u64 thread_id, u128 mask, F func) noexcept; }; static_assert(sizeof(root_info) == 64); @@ -991,7 +974,7 @@ void root_info::slot_free(uptr iptr, atomic_t* slot, u32 tls_slot) noexcept } template -FORCE_INLINE auto root_info::slot_search(uptr iptr, u32 size, u64 thread_id, __m128i mask, F func) noexcept +FORCE_INLINE auto root_info::slot_search(uptr iptr, u32 size, u64 thread_id, u128 mask, F func) noexcept { u32 index = 0; u32 total = 0; @@ -1041,11 +1024,7 @@ FORCE_INLINE auto root_info::slot_search(uptr iptr, u32 size, u64 thread_id, __m } } -SAFE_BUFFERS void -#ifdef _WIN32 -__vectorcall -#endif -atomic_wait_engine::wait(const void* data, u32 size, __m128i old_value, u64 timeout, __m128i mask, atomic_wait::info* ext) +SAFE_BUFFERS void atomic_wait_engine::wait(const void* data, u32 size, u128 old_value, u64 timeout, u128 mask, atomic_wait::info* ext) { const auto stamp0 = atomic_wait::get_unique_tsc(); @@ -1300,11 +1279,7 @@ atomic_wait_engine::wait(const void* data, u32 size, __m128i old_value, u64 time } template -static u32 -#ifdef _WIN32 -__vectorcall -#endif -alert_sema(u32 cond_id, const void* data, u64 tid, u32 size, __m128i mask, __m128i phantom) +static u32 alert_sema(u32 cond_id, const void* data, u64 tid, u32 size, u128 mask, u128 phantom) { ensure(cond_id); @@ -1316,7 +1291,7 @@ alert_sema(u32 cond_id, const void* data, u64 tid, u32 size, __m128i mask, __m12 { // Redirect if necessary const auto _old = cond; - const auto _new = _old->link ? cond_id_lock(_old->link, 0, _mm_set1_epi64x(-1)) : _old; + const auto _new = _old->link ? cond_id_lock(_old->link, 0, u128(-1)) : _old; if (_new && _new->tsc0 == _old->tsc0) { @@ -1488,10 +1463,10 @@ bool atomic_wait_engine::raw_notify(const void* data, u64 thread_id) u64 progress = 0; - root_info::slot_search(iptr, 0, thread_id, _mm_set1_epi64x(-1), [&](u32 cond_id) + root_info::slot_search(iptr, 0, thread_id, u128(-1), [&](u32 cond_id) { // Forced notification - if (alert_sema(cond_id, data, thread_id, 0, _mm_setzero_si128(), _mm_setzero_si128())) + if (alert_sema(cond_id, data, thread_id, 0, 0, 0)) { if (s_tls_notify_cb) s_tls_notify_cb(data, ++progress); @@ -1514,11 +1489,7 @@ bool atomic_wait_engine::raw_notify(const void* data, u64 thread_id) return progress != 0; } -void -#ifdef _WIN32 -__vectorcall -#endif -atomic_wait_engine::notify_one(const void* data, u32 size, __m128i mask, __m128i new_value) +void atomic_wait_engine::notify_one(const void* data, u32 size, u128 mask, u128 new_value) { const uptr iptr = reinterpret_cast(data) & (~s_ref_mask >> 17); @@ -1543,11 +1514,7 @@ atomic_wait_engine::notify_one(const void* data, u32 size, __m128i mask, __m128i s_tls_notify_cb(data, -1); } -SAFE_BUFFERS void -#ifdef _WIN32 -__vectorcall -#endif -atomic_wait_engine::notify_all(const void* data, u32 size, __m128i mask) +SAFE_BUFFERS void atomic_wait_engine::notify_all(const void* data, u32 size, u128 mask) { const uptr iptr = reinterpret_cast(data) & (~s_ref_mask >> 17); @@ -1564,7 +1531,7 @@ atomic_wait_engine::notify_all(const void* data, u32 size, __m128i mask) root_info::slot_search(iptr, size, 0, mask, [&](u32 cond_id) { - u32 res = alert_sema(cond_id, data, -1, size, mask, _mm_setzero_si128()); + u32 res = alert_sema(cond_id, data, -1, size, mask, 0); if (res && ~res <= UINT16_MAX) { diff --git a/rpcs3/util/atomic.hpp b/rpcs3/util/atomic.hpp index 68a782b2d8..2518836024 100644 --- a/rpcs3/util/atomic.hpp +++ b/rpcs3/util/atomic.hpp @@ -124,32 +124,22 @@ namespace atomic_wait } any_value; template ().observe())> - inline __m128i default_mask = sizeof(T) <= 8 - ? _mm_cvtsi64_si128(UINT64_MAX >> ((64 - sizeof(T) * 8) & 63)) - : _mm_set1_epi64x(-1); + constexpr u128 default_mask = sizeof(T) <= 8 ? u128{UINT64_MAX >> ((64 - sizeof(T) * 8) & 63)} : u128(-1); template ().observe())> - constexpr __m128i get_value(X&, T value = T{}, ...) + constexpr u128 get_value(X&, T value = T{}, ...) { static_assert((sizeof(T) & (sizeof(T) - 1)) == 0); static_assert(sizeof(T) <= 16); - - if constexpr (sizeof(T) <= 8) - { - return _mm_cvtsi64_si128(std::bit_cast, T>(value)); - } - else if constexpr (sizeof(T) == 16) - { - return std::bit_cast<__m128i>(value); - } + return std::bit_cast, T>(value); } struct info { const void* data; u32 size; - __m128i old; - __m128i mask; + u128 old; + u128 mask; template ().observe())> constexpr void set_value(X& a, T value = T{}) @@ -162,15 +152,7 @@ namespace atomic_wait { static_assert((sizeof(T) & (sizeof(T) - 1)) == 0); static_assert(sizeof(T) <= 16); - - if constexpr (sizeof(T) <= 8) - { - mask = _mm_cvtsi64_si128(std::bit_cast, T>(value)); - } - else if constexpr (sizeof(T) == 16) - { - mask = std::bit_cast<__m128i>(value); - } + mask = std::bit_cast, T>(value); } template ().observe())> @@ -271,23 +253,9 @@ private: template friend class atomic_wait::list; - static void -#ifdef _WIN32 - __vectorcall -#endif - wait(const void* data, u32 size, __m128i old128, u64 timeout, __m128i mask128, atomic_wait::info* extension = nullptr); - - static void -#ifdef _WIN32 - __vectorcall -#endif - notify_one(const void* data, u32 size, __m128i mask128, __m128i val128); - - static void -#ifdef _WIN32 - __vectorcall -#endif - notify_all(const void* data, u32 size, __m128i mask128); + static void wait(const void* data, u32 size, u128 old128, u64 timeout, u128 mask128, atomic_wait::info* extension = nullptr); + static void notify_one(const void* data, u32 size, u128 mask128, u128 val128); + static void notify_all(const void* data, u32 size, u128 mask128); public: static void set_wait_callback(bool(*cb)(const void* data, u64 attempts, u64 stamp0)); @@ -1528,106 +1496,50 @@ public: template void wait(type old_value, atomic_wait_timeout timeout = atomic_wait_timeout::inf) const noexcept { - if constexpr (sizeof(T) <= 8) - { - const __m128i old = _mm_cvtsi64_si128(std::bit_cast>(old_value)); - const __m128i mask = _mm_cvtsi64_si128(UINT64_MAX >> ((64 - sizeof(T) * 8) & 63)); - atomic_wait_engine::wait(&m_data, sizeof(T) | (static_cast(Flags) << 8), old, static_cast(timeout), mask); - } - else if constexpr (sizeof(T) == 16) - { - const __m128i old = std::bit_cast<__m128i>(old_value); - atomic_wait_engine::wait(&m_data, sizeof(T) | (static_cast(Flags) << 8), old, static_cast(timeout), _mm_set1_epi64x(-1)); - } + const u128 old = std::bit_cast>(old_value); + const u128 mask = atomic_wait::default_mask; + atomic_wait_engine::wait(&m_data, sizeof(T) | (static_cast(Flags) << 8), old, static_cast(timeout), mask); } // Overload with mask (only selected bits are checked), timeout is discouraged template void wait(type old_value, type mask_value, atomic_wait_timeout timeout = atomic_wait_timeout::inf) const noexcept { - if constexpr (sizeof(T) <= 8) - { - const __m128i old = _mm_cvtsi64_si128(std::bit_cast>(old_value)); - const __m128i mask = _mm_cvtsi64_si128(std::bit_cast>(mask_value)); - atomic_wait_engine::wait(&m_data, sizeof(T) | (static_cast(Flags) << 8), old, static_cast(timeout), mask); - } - else if constexpr (sizeof(T) == 16) - { - const __m128i old = std::bit_cast<__m128i>(old_value); - const __m128i mask = std::bit_cast<__m128i>(mask_value); - atomic_wait_engine::wait(&m_data, sizeof(T) | (static_cast(Flags) << 8), old, static_cast(timeout), mask); - } + const u128 old = std::bit_cast>(old_value); + const u128 mask = std::bit_cast>(mask_value); + atomic_wait_engine::wait(&m_data, sizeof(T) | (static_cast(Flags) << 8), old, static_cast(timeout), mask); } void notify_one() noexcept { - if constexpr (sizeof(T) <= 8) - { - atomic_wait_engine::notify_one(&m_data, -1, _mm_cvtsi64_si128(UINT64_MAX >> ((64 - sizeof(T) * 8) & 63)), _mm_setzero_si128()); - } - else if constexpr (sizeof(T) == 16) - { - atomic_wait_engine::notify_one(&m_data, -1, _mm_set1_epi64x(-1), _mm_setzero_si128()); - } + atomic_wait_engine::notify_one(&m_data, -1, atomic_wait::default_mask, 0); } // Notify with mask, allowing to not wake up thread which doesn't wait on this mask void notify_one(type mask_value) noexcept { - if constexpr (sizeof(T) <= 8) - { - const __m128i mask = _mm_cvtsi64_si128(std::bit_cast>(mask_value)); - atomic_wait_engine::notify_one(&m_data, -1, mask, _mm_setzero_si128()); - } - else if constexpr (sizeof(T) == 16) - { - const __m128i mask = std::bit_cast<__m128i>(mask_value); - atomic_wait_engine::notify_one(&m_data, -1, mask, _mm_setzero_si128()); - } + const u128 mask = std::bit_cast>(mask_value); + atomic_wait_engine::notify_one(&m_data, -1, mask, 0); } // Notify with mask and value, allowing to not wake up thread which doesn't wait on them [[deprecated("Incomplete")]] void notify_one(type mask_value, type phantom_value) noexcept { - if constexpr (sizeof(T) <= 8) - { - const __m128i mask = _mm_cvtsi64_si128(std::bit_cast>(mask_value)); - const __m128i _new = _mm_cvtsi64_si128(std::bit_cast>(phantom_value)); - atomic_wait_engine::notify_one(&m_data, sizeof(T), mask, _new); - } - else if constexpr (sizeof(T) == 16) - { - const __m128i mask = std::bit_cast<__m128i>(mask_value); - const __m128i _new = std::bit_cast<__m128i>(phantom_value); - atomic_wait_engine::notify_one(&m_data, sizeof(T), mask, _new); - } + const u128 mask = std::bit_cast>(mask_value); + const u128 _new = std::bit_cast>(phantom_value); + atomic_wait_engine::notify_one(&m_data, sizeof(T), mask, _new); } void notify_all() noexcept { - if constexpr (sizeof(T) <= 8) - { - atomic_wait_engine::notify_all(&m_data, -1, _mm_cvtsi64_si128(UINT64_MAX >> ((64 - sizeof(T) * 8) & 63))); - } - else if constexpr (sizeof(T) == 16) - { - atomic_wait_engine::notify_all(&m_data, -1, _mm_set1_epi64x(-1)); - } + atomic_wait_engine::notify_all(&m_data, -1, atomic_wait::default_mask); } // Notify all threads with mask, allowing to not wake up threads which don't wait on them void notify_all(type mask_value) noexcept { - if constexpr (sizeof(T) <= 8) - { - const __m128i mask = _mm_cvtsi64_si128(std::bit_cast>(mask_value)); - atomic_wait_engine::notify_all(&m_data, -1, mask); - } - else if constexpr (sizeof(T) == 16) - { - const __m128i mask = std::bit_cast<__m128i>(mask_value); - atomic_wait_engine::notify_all(&m_data, -1, mask); - } + const u128 mask = std::bit_cast>(mask_value); + atomic_wait_engine::notify_all(&m_data, -1, mask); } }; @@ -1724,5 +1636,5 @@ public: namespace atomic_wait { template - inline __m128i default_mask> = _mm_cvtsi32_si128(1); + constexpr u128 default_mask> = 1; } diff --git a/rpcs3/util/shared_ptr.hpp b/rpcs3/util/shared_ptr.hpp index dec6a85b88..1093372faa 100644 --- a/rpcs3/util/shared_ptr.hpp +++ b/rpcs3/util/shared_ptr.hpp @@ -1117,12 +1117,12 @@ namespace stx namespace atomic_wait { template - inline __m128i default_mask> = _mm_cvtsi64_si128(stx::c_ptr_mask); + constexpr u128 default_mask> = stx::c_ptr_mask; template - constexpr __m128i get_value(stx::atomic_ptr&, const volatile void* value = nullptr) + constexpr u128 get_value(stx::atomic_ptr&, const volatile void* value = nullptr) { - return _mm_cvtsi64_si128(reinterpret_cast(value) << stx::c_ref_size); + return reinterpret_cast(value) << stx::c_ref_size; } }