1
0
mirror of https://github.com/RPCS3/rpcs3.git synced 2024-11-24 19:52:37 +01:00

atomic.hpp: use u128 as storage for masks/values

This commit is contained in:
Nekotekina 2020-12-20 09:27:40 +03:00
parent 43c87e99b0
commit 5f618814f6
4 changed files with 57 additions and 190 deletions

View File

@ -425,18 +425,6 @@ public:
}
};
namespace atomic_wait
{
template <typename T>
inline __m128i default_mask<lf_queue<T>> = _mm_cvtsi64_si128(-1);
template <typename T>
constexpr __m128i get_value(lf_queue<T>&, std::nullptr_t value = nullptr)
{
return _mm_setzero_si128();
}
}
// Concurrent linked list, elements remain until destroyed.
template <typename T>
class lf_bunch final

View File

@ -42,11 +42,7 @@ static inline bool operator &(atomic_wait::op lhs, atomic_wait::op_flag rhs)
}
// Compare data in memory with old value, and return true if they are equal
static NEVER_INLINE bool
#ifdef _WIN32
__vectorcall
#endif
ptr_cmp(const void* data, u32 _size, __m128i old128, __m128i mask128, atomic_wait::info* ext = nullptr)
static NEVER_INLINE bool ptr_cmp(const void* data, u32 _size, u128 old128, u128 mask128, atomic_wait::info* ext = nullptr)
{
using atomic_wait::op;
using atomic_wait::op_flag;
@ -59,8 +55,8 @@ ptr_cmp(const void* data, u32 _size, __m128i old128, __m128i mask128, atomic_wai
if (size <= 8)
{
u64 new_value = 0;
u64 old_value = _mm_cvtsi128_si64(old128);
u64 mask = _mm_cvtsi128_si64(mask128) & (UINT64_MAX >> ((64 - size * 8) & 63));
u64 old_value = static_cast<u64>(old128);
u64 mask = static_cast<u64>(mask128) & (UINT64_MAX >> ((64 - size * 8) & 63));
// Don't load memory on empty mask
switch (mask ? size : 0)
@ -145,7 +141,7 @@ ptr_cmp(const void* data, u32 _size, __m128i old128, __m128i mask128, atomic_wai
case op::pop:
{
// Count is taken from least significant byte and ignores some flags
const u64 count = _mm_cvtsi128_si64(old128) & 0xff;
const u64 count = static_cast<u64>(old128) & 0xff;
u64 bitc = new_value;
bitc = (bitc & 0xaaaaaaaaaaaaaaaa) / 2 + (bitc & 0x5555555555555555);
@ -210,23 +206,18 @@ ptr_cmp(const void* data, u32 _size, __m128i old128, __m128i mask128, atomic_wai
}
// Returns true if mask overlaps, or the argument is invalid
static bool
#ifdef _WIN32
__vectorcall
#endif
cmp_mask(u32 size1, __m128i mask1, __m128i val1, u32 size2, __m128i mask2, __m128i val2)
static bool cmp_mask(u32 size1, u128 mask1, u128 val1, u32 size2, u128 mask2, u128 val2)
{
// Compare only masks, new value is not available in this mode
if (size1 == umax)
{
// Simple mask overlap
const auto v0 = _mm_and_si128(mask1, mask2);
const auto v1 = _mm_packs_epi16(v0, v0);
return !!_mm_cvtsi128_si64(v1);
const u128 v0 = mask1 & mask2;
return !!(v0);
}
// Generate masked value inequality bits
const auto v0 = _mm_and_si128(_mm_and_si128(mask1, mask2), _mm_xor_si128(val1, val2));
const u128 v0 = (mask1 & mask2) & (val1 ^ val2);
using atomic_wait::op;
using atomic_wait::op_flag;
@ -244,14 +235,14 @@ cmp_mask(u32 size1, __m128i mask1, __m128i val1, u32 size2, __m128i mask2, __m12
// Generate sized mask
const u64 mask = UINT64_MAX >> ((64 - size * 8) & 63);
if (!(_mm_cvtsi128_si64(v0) & mask))
if (!(static_cast<u64>(v0) & mask))
{
return !!(flag & op_flag::inverse);
}
}
else if (size == 16)
{
if (!_mm_cvtsi128_si64(_mm_packs_epi16(v0, v0)))
if (!v0)
{
return !!(flag & op_flag::inverse);
}
@ -328,8 +319,8 @@ namespace
// Combined pointer (most significant 47 bits) and ref counter (17 least significant bits)
atomic_t<u64> ptr_ref;
u64 tid;
__m128i mask;
__m128i oldv;
u128 mask;
u128 oldv;
u64 tsc0;
u16 link;
@ -367,8 +358,8 @@ namespace
size = 0;
flag = 0;
sync.release(0);
mask = _mm_setzero_si128();
oldv = _mm_setzero_si128();
mask = 0;
oldv = 0;
#ifdef USE_STD
mtx.destroy();
@ -557,11 +548,7 @@ namespace
// TLS storage for a few allocated "semaphores" to allow skipping initialization
static thread_local tls_cond_handler s_tls_conds{};
static u32
#ifdef _WIN32
__vectorcall
#endif
cond_alloc(uptr iptr, __m128i mask, u32 tls_slot = -1)
static u32 cond_alloc(uptr iptr, u128 mask, u32 tls_slot = -1)
{
// Try to get cond from tls slot instead
u16* ptls = tls_slot >= std::size(s_tls_conds.cond) ? nullptr : s_tls_conds.cond + tls_slot;
@ -672,7 +659,7 @@ static void cond_free(u32 cond_id, u32 tls_slot = -1)
{
// Fast finalization
cond->sync.release(0);
cond->mask = _mm_setzero_si128();
cond->mask = 0;
*ptls = static_cast<u16>(cond_id);
return;
}
@ -709,11 +696,7 @@ static void cond_free(u32 cond_id, u32 tls_slot = -1)
});
}
static cond_handle*
#ifdef _WIN32
__vectorcall
#endif
cond_id_lock(u32 cond_id, u32 size, __m128i mask, u64 thread_id = 0, uptr iptr = 0)
static cond_handle* cond_id_lock(u32 cond_id, u32 size, u128 mask, u64 thread_id = 0, uptr iptr = 0)
{
if (cond_id - 1 < u32{UINT16_MAX})
{
@ -740,7 +723,7 @@ cond_id_lock(u32 cond_id, u32 size, __m128i mask, u64 thread_id = 0, uptr iptr =
return false;
}
const __m128i mask12 = _mm_and_si128(mask, _mm_load_si128(&cond->mask));
const u128 mask12 = mask & cond->mask;
if (thread_id)
{
@ -749,7 +732,7 @@ cond_id_lock(u32 cond_id, u32 size, __m128i mask, u64 thread_id = 0, uptr iptr =
return false;
}
}
else if (size && _mm_cvtsi128_si64(_mm_packs_epi16(mask12, mask12)) == 0)
else if (size && !mask12)
{
return false;
}
@ -805,7 +788,7 @@ namespace
static void slot_free(uptr ptr, atomic_t<u16>* slot, u32 tls_slot) noexcept;
template <typename F>
static auto slot_search(uptr iptr, u32 size, u64 thread_id, __m128i mask, F func) noexcept;
static auto slot_search(uptr iptr, u32 size, u64 thread_id, u128 mask, F func) noexcept;
};
static_assert(sizeof(root_info) == 64);
@ -991,7 +974,7 @@ void root_info::slot_free(uptr iptr, atomic_t<u16>* slot, u32 tls_slot) noexcept
}
template <typename F>
FORCE_INLINE auto root_info::slot_search(uptr iptr, u32 size, u64 thread_id, __m128i mask, F func) noexcept
FORCE_INLINE auto root_info::slot_search(uptr iptr, u32 size, u64 thread_id, u128 mask, F func) noexcept
{
u32 index = 0;
u32 total = 0;
@ -1041,11 +1024,7 @@ FORCE_INLINE auto root_info::slot_search(uptr iptr, u32 size, u64 thread_id, __m
}
}
SAFE_BUFFERS void
#ifdef _WIN32
__vectorcall
#endif
atomic_wait_engine::wait(const void* data, u32 size, __m128i old_value, u64 timeout, __m128i mask, atomic_wait::info* ext)
SAFE_BUFFERS void atomic_wait_engine::wait(const void* data, u32 size, u128 old_value, u64 timeout, u128 mask, atomic_wait::info* ext)
{
const auto stamp0 = atomic_wait::get_unique_tsc();
@ -1300,11 +1279,7 @@ atomic_wait_engine::wait(const void* data, u32 size, __m128i old_value, u64 time
}
template <bool NoAlert = false>
static u32
#ifdef _WIN32
__vectorcall
#endif
alert_sema(u32 cond_id, const void* data, u64 tid, u32 size, __m128i mask, __m128i phantom)
static u32 alert_sema(u32 cond_id, const void* data, u64 tid, u32 size, u128 mask, u128 phantom)
{
ensure(cond_id);
@ -1316,7 +1291,7 @@ alert_sema(u32 cond_id, const void* data, u64 tid, u32 size, __m128i mask, __m12
{
// Redirect if necessary
const auto _old = cond;
const auto _new = _old->link ? cond_id_lock(_old->link, 0, _mm_set1_epi64x(-1)) : _old;
const auto _new = _old->link ? cond_id_lock(_old->link, 0, u128(-1)) : _old;
if (_new && _new->tsc0 == _old->tsc0)
{
@ -1488,10 +1463,10 @@ bool atomic_wait_engine::raw_notify(const void* data, u64 thread_id)
u64 progress = 0;
root_info::slot_search(iptr, 0, thread_id, _mm_set1_epi64x(-1), [&](u32 cond_id)
root_info::slot_search(iptr, 0, thread_id, u128(-1), [&](u32 cond_id)
{
// Forced notification
if (alert_sema(cond_id, data, thread_id, 0, _mm_setzero_si128(), _mm_setzero_si128()))
if (alert_sema(cond_id, data, thread_id, 0, 0, 0))
{
if (s_tls_notify_cb)
s_tls_notify_cb(data, ++progress);
@ -1514,11 +1489,7 @@ bool atomic_wait_engine::raw_notify(const void* data, u64 thread_id)
return progress != 0;
}
void
#ifdef _WIN32
__vectorcall
#endif
atomic_wait_engine::notify_one(const void* data, u32 size, __m128i mask, __m128i new_value)
void atomic_wait_engine::notify_one(const void* data, u32 size, u128 mask, u128 new_value)
{
const uptr iptr = reinterpret_cast<uptr>(data) & (~s_ref_mask >> 17);
@ -1543,11 +1514,7 @@ atomic_wait_engine::notify_one(const void* data, u32 size, __m128i mask, __m128i
s_tls_notify_cb(data, -1);
}
SAFE_BUFFERS void
#ifdef _WIN32
__vectorcall
#endif
atomic_wait_engine::notify_all(const void* data, u32 size, __m128i mask)
SAFE_BUFFERS void atomic_wait_engine::notify_all(const void* data, u32 size, u128 mask)
{
const uptr iptr = reinterpret_cast<uptr>(data) & (~s_ref_mask >> 17);
@ -1564,7 +1531,7 @@ atomic_wait_engine::notify_all(const void* data, u32 size, __m128i mask)
root_info::slot_search(iptr, size, 0, mask, [&](u32 cond_id)
{
u32 res = alert_sema<true>(cond_id, data, -1, size, mask, _mm_setzero_si128());
u32 res = alert_sema<true>(cond_id, data, -1, size, mask, 0);
if (res && ~res <= UINT16_MAX)
{

View File

@ -124,32 +124,22 @@ namespace atomic_wait
} any_value;
template <typename X, typename T = decltype(std::declval<X>().observe())>
inline __m128i default_mask = sizeof(T) <= 8
? _mm_cvtsi64_si128(UINT64_MAX >> ((64 - sizeof(T) * 8) & 63))
: _mm_set1_epi64x(-1);
constexpr u128 default_mask = sizeof(T) <= 8 ? u128{UINT64_MAX >> ((64 - sizeof(T) * 8) & 63)} : u128(-1);
template <typename X, typename T = decltype(std::declval<X>().observe())>
constexpr __m128i get_value(X&, T value = T{}, ...)
constexpr u128 get_value(X&, T value = T{}, ...)
{
static_assert((sizeof(T) & (sizeof(T) - 1)) == 0);
static_assert(sizeof(T) <= 16);
if constexpr (sizeof(T) <= 8)
{
return _mm_cvtsi64_si128(std::bit_cast<get_uint_t<sizeof(T)>, T>(value));
}
else if constexpr (sizeof(T) == 16)
{
return std::bit_cast<__m128i>(value);
}
return std::bit_cast<get_uint_t<sizeof(T)>, T>(value);
}
struct info
{
const void* data;
u32 size;
__m128i old;
__m128i mask;
u128 old;
u128 mask;
template <typename X, typename T = decltype(std::declval<X>().observe())>
constexpr void set_value(X& a, T value = T{})
@ -162,15 +152,7 @@ namespace atomic_wait
{
static_assert((sizeof(T) & (sizeof(T) - 1)) == 0);
static_assert(sizeof(T) <= 16);
if constexpr (sizeof(T) <= 8)
{
mask = _mm_cvtsi64_si128(std::bit_cast<get_uint_t<sizeof(T)>, T>(value));
}
else if constexpr (sizeof(T) == 16)
{
mask = std::bit_cast<__m128i>(value);
}
mask = std::bit_cast<get_uint_t<sizeof(T)>, T>(value);
}
template <typename X, typename T = decltype(std::declval<X>().observe())>
@ -271,23 +253,9 @@ private:
template <uint Max, typename... T>
friend class atomic_wait::list;
static void
#ifdef _WIN32
__vectorcall
#endif
wait(const void* data, u32 size, __m128i old128, u64 timeout, __m128i mask128, atomic_wait::info* extension = nullptr);
static void
#ifdef _WIN32
__vectorcall
#endif
notify_one(const void* data, u32 size, __m128i mask128, __m128i val128);
static void
#ifdef _WIN32
__vectorcall
#endif
notify_all(const void* data, u32 size, __m128i mask128);
static void wait(const void* data, u32 size, u128 old128, u64 timeout, u128 mask128, atomic_wait::info* extension = nullptr);
static void notify_one(const void* data, u32 size, u128 mask128, u128 val128);
static void notify_all(const void* data, u32 size, u128 mask128);
public:
static void set_wait_callback(bool(*cb)(const void* data, u64 attempts, u64 stamp0));
@ -1528,106 +1496,50 @@ public:
template <atomic_wait::op Flags = atomic_wait::op::eq>
void wait(type old_value, atomic_wait_timeout timeout = atomic_wait_timeout::inf) const noexcept
{
if constexpr (sizeof(T) <= 8)
{
const __m128i old = _mm_cvtsi64_si128(std::bit_cast<get_uint_t<sizeof(T)>>(old_value));
const __m128i mask = _mm_cvtsi64_si128(UINT64_MAX >> ((64 - sizeof(T) * 8) & 63));
atomic_wait_engine::wait(&m_data, sizeof(T) | (static_cast<u8>(Flags) << 8), old, static_cast<u64>(timeout), mask);
}
else if constexpr (sizeof(T) == 16)
{
const __m128i old = std::bit_cast<__m128i>(old_value);
atomic_wait_engine::wait(&m_data, sizeof(T) | (static_cast<u8>(Flags) << 8), old, static_cast<u64>(timeout), _mm_set1_epi64x(-1));
}
const u128 old = std::bit_cast<get_uint_t<sizeof(T)>>(old_value);
const u128 mask = atomic_wait::default_mask<atomic_t>;
atomic_wait_engine::wait(&m_data, sizeof(T) | (static_cast<u8>(Flags) << 8), old, static_cast<u64>(timeout), mask);
}
// Overload with mask (only selected bits are checked), timeout is discouraged
template <atomic_wait::op Flags = atomic_wait::op::eq>
void wait(type old_value, type mask_value, atomic_wait_timeout timeout = atomic_wait_timeout::inf) const noexcept
{
if constexpr (sizeof(T) <= 8)
{
const __m128i old = _mm_cvtsi64_si128(std::bit_cast<get_uint_t<sizeof(T)>>(old_value));
const __m128i mask = _mm_cvtsi64_si128(std::bit_cast<get_uint_t<sizeof(T)>>(mask_value));
atomic_wait_engine::wait(&m_data, sizeof(T) | (static_cast<u8>(Flags) << 8), old, static_cast<u64>(timeout), mask);
}
else if constexpr (sizeof(T) == 16)
{
const __m128i old = std::bit_cast<__m128i>(old_value);
const __m128i mask = std::bit_cast<__m128i>(mask_value);
atomic_wait_engine::wait(&m_data, sizeof(T) | (static_cast<u8>(Flags) << 8), old, static_cast<u64>(timeout), mask);
}
const u128 old = std::bit_cast<get_uint_t<sizeof(T)>>(old_value);
const u128 mask = std::bit_cast<get_uint_t<sizeof(T)>>(mask_value);
atomic_wait_engine::wait(&m_data, sizeof(T) | (static_cast<u8>(Flags) << 8), old, static_cast<u64>(timeout), mask);
}
void notify_one() noexcept
{
if constexpr (sizeof(T) <= 8)
{
atomic_wait_engine::notify_one(&m_data, -1, _mm_cvtsi64_si128(UINT64_MAX >> ((64 - sizeof(T) * 8) & 63)), _mm_setzero_si128());
}
else if constexpr (sizeof(T) == 16)
{
atomic_wait_engine::notify_one(&m_data, -1, _mm_set1_epi64x(-1), _mm_setzero_si128());
}
atomic_wait_engine::notify_one(&m_data, -1, atomic_wait::default_mask<atomic_t>, 0);
}
// Notify with mask, so that a thread which doesn't wait on this mask is not woken up
void notify_one(type mask_value) noexcept
{
if constexpr (sizeof(T) <= 8)
{
const __m128i mask = _mm_cvtsi64_si128(std::bit_cast<get_uint_t<sizeof(T)>>(mask_value));
atomic_wait_engine::notify_one(&m_data, -1, mask, _mm_setzero_si128());
}
else if constexpr (sizeof(T) == 16)
{
const __m128i mask = std::bit_cast<__m128i>(mask_value);
atomic_wait_engine::notify_one(&m_data, -1, mask, _mm_setzero_si128());
}
const u128 mask = std::bit_cast<get_uint_t<sizeof(T)>>(mask_value);
atomic_wait_engine::notify_one(&m_data, -1, mask, 0);
}
// Notify with mask and value, so that a thread which doesn't wait on them is not woken up
[[deprecated("Incomplete")]] void notify_one(type mask_value, type phantom_value) noexcept
{
if constexpr (sizeof(T) <= 8)
{
const __m128i mask = _mm_cvtsi64_si128(std::bit_cast<get_uint_t<sizeof(T)>>(mask_value));
const __m128i _new = _mm_cvtsi64_si128(std::bit_cast<get_uint_t<sizeof(T)>>(phantom_value));
atomic_wait_engine::notify_one(&m_data, sizeof(T), mask, _new);
}
else if constexpr (sizeof(T) == 16)
{
const __m128i mask = std::bit_cast<__m128i>(mask_value);
const __m128i _new = std::bit_cast<__m128i>(phantom_value);
atomic_wait_engine::notify_one(&m_data, sizeof(T), mask, _new);
}
const u128 mask = std::bit_cast<get_uint_t<sizeof(T)>>(mask_value);
const u128 _new = std::bit_cast<get_uint_t<sizeof(T)>>(phantom_value);
atomic_wait_engine::notify_one(&m_data, sizeof(T), mask, _new);
}
void notify_all() noexcept
{
if constexpr (sizeof(T) <= 8)
{
atomic_wait_engine::notify_all(&m_data, -1, _mm_cvtsi64_si128(UINT64_MAX >> ((64 - sizeof(T) * 8) & 63)));
}
else if constexpr (sizeof(T) == 16)
{
atomic_wait_engine::notify_all(&m_data, -1, _mm_set1_epi64x(-1));
}
atomic_wait_engine::notify_all(&m_data, -1, atomic_wait::default_mask<atomic_t>);
}
// Notify all threads with mask, so that threads which don't wait on this mask are not woken up
void notify_all(type mask_value) noexcept
{
if constexpr (sizeof(T) <= 8)
{
const __m128i mask = _mm_cvtsi64_si128(std::bit_cast<get_uint_t<sizeof(T)>>(mask_value));
atomic_wait_engine::notify_all(&m_data, -1, mask);
}
else if constexpr (sizeof(T) == 16)
{
const __m128i mask = std::bit_cast<__m128i>(mask_value);
atomic_wait_engine::notify_all(&m_data, -1, mask);
}
const u128 mask = std::bit_cast<get_uint_t<sizeof(T)>>(mask_value);
atomic_wait_engine::notify_all(&m_data, -1, mask);
}
};
@ -1724,5 +1636,5 @@ public:
namespace atomic_wait
{
template <usz Align>
inline __m128i default_mask<atomic_t<bool, Align>> = _mm_cvtsi32_si128(1);
constexpr u128 default_mask<atomic_t<bool, Align>> = 1;
}

View File

@ -1117,12 +1117,12 @@ namespace stx
namespace atomic_wait
{
template <typename T>
inline __m128i default_mask<stx::atomic_ptr<T>> = _mm_cvtsi64_si128(stx::c_ptr_mask);
constexpr u128 default_mask<stx::atomic_ptr<T>> = stx::c_ptr_mask;
template <typename T>
constexpr __m128i get_value(stx::atomic_ptr<T>&, const volatile void* value = nullptr)
constexpr u128 get_value(stx::atomic_ptr<T>&, const volatile void* value = nullptr)
{
return _mm_cvtsi64_si128(reinterpret_cast<uptr>(value) << stx::c_ref_size);
return reinterpret_cast<uptr>(value) << stx::c_ref_size;
}
}