From 14cca55b50543f573995f017d1cbc7c6d14b0c12 Mon Sep 17 00:00:00 2001 From: Nekotekina Date: Mon, 17 Jan 2022 11:32:44 +0300 Subject: [PATCH] PPU: refactor vector rounding instructions Fix: nearbyint -> roundeven --- Utilities/JIT.h | 12 +-- rpcs3/Emu/Cell/PPUInterpreter.cpp | 92 +++++++--------- rpcs3/Emu/Cell/PPUTranslator.cpp | 2 +- rpcs3/Emu/RSX/Common/BufferUtils.cpp | 6 +- rpcs3/util/simd.hpp | 150 ++++++++++++++++++++++++++- 5 files changed, 190 insertions(+), 72 deletions(-) diff --git a/Utilities/JIT.h b/Utilities/JIT.h index 9faddb8615..1877b10946 100644 --- a/Utilities/JIT.h +++ b/Utilities/JIT.h @@ -270,7 +270,7 @@ public: built_function& operator=(const built_function&) = delete; - template requires (std::is_invocable_v) + template built_function(std::string_view name, F&& builder, u32 line = __builtin_LINE(), u32 col = __builtin_COLUMN(), @@ -280,16 +280,6 @@ public: { } - template requires (std::is_invocable_v) - built_function(std::string_view, F&& getter, - u32 line = __builtin_LINE(), - u32 col = __builtin_COLUMN(), - const char* file = __builtin_FILE(), - const char* func = __builtin_FUNCTION()) - : m_func(ensure(getter(), const_str(), line, col, file, func)) - { - } - operator FT() const noexcept { return m_func; diff --git a/rpcs3/Emu/Cell/PPUInterpreter.cpp b/rpcs3/Emu/Cell/PPUInterpreter.cpp index bb909d6cf9..80d285f46f 100644 --- a/rpcs3/Emu/Cell/PPUInterpreter.cpp +++ b/rpcs3/Emu/Cell/PPUInterpreter.cpp @@ -2305,14 +2305,14 @@ auto VREFP() if constexpr (Build == 0xf1a6) return ppu_exec_select::template select(); - static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { - const auto a = _mm_set_ps(1.0f, 1.0f, 1.0f, 1.0f); - const auto m = gv_bcst32(ppu.jm_mask, &ppu_thread::jm_mask); - const auto b = ppu_flush_denormal(m, ppu.vr[op.vb]); - const auto result = _mm_div_ps(a, b); - ppu.vr[op.vd] = ppu_flush_denormal(m, ppu_set_vnan(result, a, b)); + static const auto exec = [](auto&& d, auto&& b_, auto&& jm_mask) + { + auto m = 
gv_bcst32(jm_mask, &ppu_thread::jm_mask); + auto b = ppu_flush_denormal(m, std::move(b_)); + d = ppu_flush_denormal(std::move(m), ppu_set_vnan(gv_divfs(gv_bcstfs(1.0f), b), b)); }; - RETURN_(ppu, op); + + RETURN_(ppu.vr[op.vd], ppu.vr[op.vb], ppu.jm_mask); } template @@ -2321,19 +2321,14 @@ auto VRFIM() if constexpr (Build == 0xf1a6) return ppu_exec_select::template select(); - static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { - const auto m = gv_bcst32(ppu.jm_mask, &ppu_thread::jm_mask); - const auto b = ppu_flush_denormal(m, ppu.vr[op.vb]); - v128 d; - - for (uint w = 0; w < 4; w++) + static const auto exec = [](auto&& d, auto&& b_, auto&& jm_mask) { - d._f[w] = std::floor(b._f[w]); - } - - ppu.vr[op.vd] = ppu_flush_denormal(m, ppu_set_vnan(d, b)); + auto m = gv_bcst32(jm_mask, &ppu_thread::jm_mask); + auto b = ppu_flush_denormal(m, std::move(b_)); + d = ppu_flush_denormal(std::move(m), ppu_set_vnan(gv_roundfs_floor(b), b)); }; - RETURN_(ppu, op); + + RETURN_(ppu.vr[op.vd], ppu.vr[op.vb], ppu.jm_mask); } template @@ -2342,18 +2337,13 @@ auto VRFIN() if constexpr (Build == 0xf1a6) return ppu_exec_select::template select(); - static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { - const auto b = ppu.vr[op.vb]; - v128 d; - - for (uint w = 0; w < 4; w++) + static const auto exec = [](auto&& d, auto&& b, auto&& jm_mask) { - d._f[w] = std::nearbyint(b._f[w]); - } - - ppu.vr[op.vd] = ppu_flush_denormal(gv_bcst32(ppu.jm_mask, &ppu_thread::jm_mask), ppu_set_vnan(d, b)); + auto m = gv_bcst32(jm_mask, &ppu_thread::jm_mask); + d = ppu_flush_denormal(std::move(m), ppu_set_vnan(gv_roundfs_even(b), b)); }; - RETURN_(ppu, op); + + RETURN_(ppu.vr[op.vd], ppu.vr[op.vb], ppu.jm_mask); } template @@ -2362,19 +2352,14 @@ auto VRFIP() if constexpr (Build == 0xf1a6) return ppu_exec_select::template select(); - static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { - const auto m = gv_bcst32(ppu.jm_mask, &ppu_thread::jm_mask); - const auto b = 
ppu_flush_denormal(m, ppu.vr[op.vb]); - v128 d; - - for (uint w = 0; w < 4; w++) + static const auto exec = [](auto&& d, auto&& b_, auto&& jm_mask) { - d._f[w] = std::ceil(b._f[w]); - } - - ppu.vr[op.vd] = ppu_flush_denormal(m, ppu_set_vnan(d, b)); + auto m = gv_bcst32(jm_mask, &ppu_thread::jm_mask); + auto b = ppu_flush_denormal(m, std::move(b_)); + d = ppu_flush_denormal(std::move(m), ppu_set_vnan(gv_roundfs_ceil(b), b)); }; - RETURN_(ppu, op); + + RETURN_(ppu.vr[op.vd], ppu.vr[op.vb], ppu.jm_mask); } template @@ -2383,18 +2368,13 @@ auto VRFIZ() if constexpr (Build == 0xf1a6) return ppu_exec_select::template select(); - static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { - const auto b = ppu.vr[op.vb]; - v128 d; - - for (uint w = 0; w < 4; w++) + static const auto exec = [](auto&& d, auto&& b, auto&& jm_mask) { - d._f[w] = std::truncf(b._f[w]); - } - - ppu.vr[op.vd] = ppu_flush_denormal(gv_bcst32(ppu.jm_mask, &ppu_thread::jm_mask), ppu_set_vnan(d, b)); + auto m = gv_bcst32(jm_mask, &ppu_thread::jm_mask); + d = ppu_flush_denormal(std::move(m), ppu_set_vnan(gv_roundfs_trunc(b), b)); }; - RETURN_(ppu, op); + + RETURN_(ppu.vr[op.vd], ppu.vr[op.vb], ppu.jm_mask); } template @@ -2460,14 +2440,14 @@ auto VRSQRTEFP() if constexpr (Build == 0xf1a6) return ppu_exec_select::template select(); - static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { - const auto a = _mm_set_ps(1.0f, 1.0f, 1.0f, 1.0f); - const auto m = gv_bcst32(ppu.jm_mask, &ppu_thread::jm_mask); - const auto b = ppu_flush_denormal(m, ppu.vr[op.vb]); - const auto result = _mm_div_ps(a, _mm_sqrt_ps(b)); - ppu.vr[op.vd] = ppu_flush_denormal(m, ppu_set_vnan(result, a, b)); + static const auto exec = [](auto&& d, auto&& b_, auto&& jm_mask) + { + auto m = gv_bcst32(jm_mask, &ppu_thread::jm_mask); + auto b = ppu_flush_denormal(m, std::move(b_)); + d = ppu_flush_denormal(std::move(m), ppu_set_vnan(gv_divfs(gv_bcstfs(1.0f), gv_sqrtfs(b)), b)); }; - RETURN_(ppu, op); + + RETURN_(ppu.vr[op.vd], 
ppu.vr[op.vb], ppu.jm_mask); } template diff --git a/rpcs3/Emu/Cell/PPUTranslator.cpp b/rpcs3/Emu/Cell/PPUTranslator.cpp index f686e72f97..1b0c80aeac 100644 --- a/rpcs3/Emu/Cell/PPUTranslator.cpp +++ b/rpcs3/Emu/Cell/PPUTranslator.cpp @@ -1419,7 +1419,7 @@ void PPUTranslator::VRFIM(ppu_opcode_t op) void PPUTranslator::VRFIN(ppu_opcode_t op) { - set_vr(op.vd, vec_handle_result(call(get_intrinsic(Intrinsic::nearbyint), get_vr(op.vb)))); + set_vr(op.vd, vec_handle_result(call(get_intrinsic(Intrinsic::roundeven), get_vr(op.vb)))); } void PPUTranslator::VRFIP(ppu_opcode_t op) diff --git a/rpcs3/Emu/RSX/Common/BufferUtils.cpp b/rpcs3/Emu/RSX/Common/BufferUtils.cpp index 57489333a0..34c756b160 100644 --- a/rpcs3/Emu/RSX/Common/BufferUtils.cpp +++ b/rpcs3/Emu/RSX/Common/BufferUtils.cpp @@ -275,11 +275,11 @@ namespace c.jmp(asmjit::imm_ptr(&copy_data_swap_u32_naive)); } -#else +#elif defined(ARCH_ARM64) template - constexpr auto build_copy_data_swap_u32() + void build_copy_data_swap_u32(native_asm& c, native_args& args) { - return &copy_data_swap_u32_naive; + c.b(asmjit::imm_ptr(&copy_data_swap_u32_naive)); } #endif } diff --git a/rpcs3/util/simd.hpp b/rpcs3/util/simd.hpp index 36dfde87dd..a6095bc050 100644 --- a/rpcs3/util/simd.hpp +++ b/rpcs3/util/simd.hpp @@ -14,7 +14,6 @@ #include #include -#include #endif #if defined(ARCH_ARM64) @@ -22,6 +21,7 @@ #endif #include +#include #include namespace asmjit @@ -1541,6 +1541,24 @@ inline v128 gv_avgs32(const v128& a, const v128& b) #endif } +inline v128 gv_divfs(const v128& a, const v128& b) +{ +#if defined(ARCH_X64) + return _mm_div_ps(a, b); +#elif defined(ARCH_ARM64) + return vdivq_f32(a, b); +#endif +} + +inline v128 gv_sqrtfs(const v128& a) +{ +#if defined(ARCH_X64) + return _mm_sqrt_ps(a); +#elif defined(ARCH_ARM64) + return vsqrtq_f32(a); +#endif +} + inline v128 gv_fmafs(const v128& a, const v128& b, const v128& c) { #if defined(ARCH_X64) && defined(__FMA__) @@ -1925,6 +1943,136 @@ inline v128 gv_cvtfs_tou32(const v128& src) #endif } 
+namespace utils +{ + inline f32 roundevenf32(f32 arg) + { + u32 val = std::bit_cast(arg); + u32 exp = (val >> 23) & 0xff; + u32 abs = val & 0x7fffffff; + + if (exp >= 127 + 23) + { + // Big enough, NaN or INF + return arg; + } + else if (exp >= 127) + { + u32 int_pos = (127 + 23) - exp; + u32 half_pos = int_pos - 1; + u32 half_bit = 1u << half_pos; + u32 int_bit = 1u << int_pos; + if (val & (int_bit | (half_bit - 1))) + val += half_bit; + val &= ~(int_bit - 1); + } + else if (exp == 126 && abs > 0x3f000000) + { + val &= 0x80000000; + val |= 0x3f800000; + } + else + { + val &= 0x80000000; + } + + return std::bit_cast(val); + } +} + +#if defined(ARCH_X64) +template +inline built_function<__m128(*)(__m128)> sse41_roundf("sse41_roundf", [](native_asm& c, native_args&) +{ + static_assert(Mode < 4); + using namespace asmjit; + if (utils::has_avx()) + c.vroundps(x86::xmm0, x86::xmm0, 8 + Mode); + else if (utils::has_sse41()) + c.roundps(x86::xmm0, x86::xmm0, 8 + Mode); + else + c.jmp(+[](__m128 a) -> __m128 + { + v128 r = a; + for (u32 i = 0; i < 4; i++) + if constexpr (Mode == 0) + r._f[i] = utils::roundevenf32(r._f[i]); + else if constexpr (Mode == 1) + r._f[i] = ::floorf(r._f[i]); + else if constexpr (Mode == 2) + r._f[i] = ::ceilf(r._f[i]); + else if constexpr (Mode == 3) + r._f[i] = ::truncf(r._f[i]); + return r; + }); + c.ret(); +}); +#endif + +inline v128 gv_roundfs_even(const v128& a) +{ +#if defined(__SSE4_1__) + return _mm_round_ps(a, 8 + 0); +#elif defined(ARCH_ARM64) + return vrndnq_f32(a); +#elif defined(ARCH_X64) + return sse41_roundf<0>(a); +#else + v128 r; + for (u32 i = 0; i < 4; i++) + r._f[i] = utils::roundevenf32(a._f[i]); + return r; +#endif +} + +inline v128 gv_roundfs_ceil(const v128& a) +{ +#if defined(__SSE4_1__) + return _mm_round_ps(a, 8 + 2); +#elif defined(ARCH_ARM64) + return vrndpq_f32(a); +#elif defined(ARCH_X64) + return sse41_roundf<2>(a); +#else + v128 r; + for (u32 i = 0; i < 4; i++) + r._f[i] = ::ceilf(a._f[i]); + return r; +#endif +} 
+ +inline v128 gv_roundfs_floor(const v128& a) +{ +#if defined(__SSE4_1__) + return _mm_round_ps(a, 8 + 1); +#elif defined(ARCH_ARM64) + return vrndmq_f32(a); +#elif defined(ARCH_X64) + return sse41_roundf<1>(a); +#else + v128 r; + for (u32 i = 0; i < 4; i++) + r._f[i] = ::floorf(a._f[i]); + return r; +#endif +} + +inline v128 gv_roundfs_trunc(const v128& a) +{ +#if defined(__SSE4_1__) + return _mm_round_ps(a, 8 + 3); +#elif defined(ARCH_ARM64) + return vrndq_f32(a); +#elif defined(ARCH_X64) + return sse41_roundf<3>(a); +#else + v128 r; + for (u32 i = 0; i < 4; i++) + r._f[i] = ::truncf(a._f[i]); + return r; +#endif +} + inline bool gv_testz(const v128& a) { #if defined(__SSE4_1__)