diff --git a/rpcs3/Emu/Cell/PPUInterpreter.cpp b/rpcs3/Emu/Cell/PPUInterpreter.cpp index ca9bc435f5..0ca52bb57e 100644 --- a/rpcs3/Emu/Cell/PPUInterpreter.cpp +++ b/rpcs3/Emu/Cell/PPUInterpreter.cpp @@ -2839,12 +2839,9 @@ auto VSUM4SBS() static const auto exec = [](auto&& d, auto&& a, auto&& b, auto&& sat) { - //const auto r = _mm_dpbusds_epi32(b, _mm_set1_epi8(1), a); - //const auto s = _mm_dpbusd_epi32(b, _mm_set1_epi8(1), a); - auto x = gv_hadds8x4(a); - auto r = gv_adds_s32(x, b); + auto r = gv_dots_u8s8x4(gv_bcst8(1), a, b); if constexpr (((Flags == set_sat) || ...)) - sat = gv_or32(gv_xor32(gv_add32(std::move(x), std::move(b)), r), std::move(sat)); + sat = gv_or32(gv_xor32(gv_hadds8x4(std::move(a), std::move(b)), r), std::move(sat)); d = std::move(r); }; @@ -2859,12 +2856,9 @@ auto VSUM4SHS() static const auto exec = [](auto&& d, auto&& a, auto&& b, auto&& sat) { - //const auto r = _mm_dpwssds_epi32(b, a, _mm_set1_epi16(1)); - //const auto s = _mm_dpwssd_epi32(b, a, _mm_set1_epi16(1)); - auto x = gv_hadds16x2(a); - auto r = gv_adds_s32(x, b); + auto r = gv_dots_s16x2(a, gv_bcst16(1), b); if constexpr (((Flags == set_sat) || ...)) - sat = gv_or32(gv_xor32(gv_add32(std::move(x), std::move(b)), r), std::move(sat)); + sat = gv_or32(gv_xor32(gv_hadds16x2(std::move(a), std::move(b)), r), std::move(sat)); d = std::move(r); }; diff --git a/rpcs3/util/simd.hpp b/rpcs3/util/simd.hpp index 0b9eddc30c..54fcac8020 100644 --- a/rpcs3/util/simd.hpp +++ b/rpcs3/util/simd.hpp @@ -1970,16 +1970,16 @@ inline v128 gv_hadds8x2(const v128& a) #endif } -inline v128 gv_hadds8x4(const v128& a) +inline v128 gv_hadds8x4(const v128& a, const v128& c) { #if (defined(__AVX512VL__) && defined(__AVX512VNNI__)) || defined(__AVXVNNI__) - return _mm_dpbusd_epi32(_mm_setzero_si128(), _mm_set1_epi8(1), a); + return _mm_dpbusd_epi32(c, _mm_set1_epi8(1), a); #elif defined(__SSSE3__) - return _mm_madd_epi16(_mm_maddubs_epi16(_mm_set1_epi8(1), a), _mm_set1_epi16(1)); + return _mm_add_epi32(_mm_madd_epi16(_mm_maddubs_epi16(_mm_set1_epi8(1), a), _mm_set1_epi16(1)), c); #elif defined(ARCH_X64) - return _mm_madd_epi16(_mm_add_epi16(_mm_srai_epi16(a, 8), _mm_srai_epi16(_mm_slli_epi16(a, 8), 8)), _mm_set1_epi16(1)); + return _mm_add_epi32(_mm_madd_epi16(_mm_add_epi16(_mm_srai_epi16(a, 8), _mm_srai_epi16(_mm_slli_epi16(a, 8), 8)), _mm_set1_epi16(1)), c); #elif defined(ARCH_ARM64) - return vpaddlq_s16(vpaddlq_s8(a)); + return vaddq_s32(vpaddlq_s16(vpaddlq_s8(a)), c); #endif } @@ -2007,12 +2007,14 @@ inline v128 gv_haddu8x4(const v128& a) #endif } -inline v128 gv_hadds16x2(const v128& a) +inline v128 gv_hadds16x2(const v128& a, const v128& c) { -#if defined(ARCH_X64) - return _mm_madd_epi16(a, _mm_set1_epi16(1)); +#if (defined(__AVX512VL__) && defined(__AVX512VNNI__)) || defined(__AVXVNNI__) + return _mm_dpwssd_epi32(c, a, _mm_set1_epi8(1)); +#elif defined(ARCH_X64) + return _mm_add_epi32(_mm_madd_epi16(a, _mm_set1_epi16(1)), c); #elif defined(ARCH_ARM64) - return vpaddlq_s16(a); + return vaddq_s32(vpaddlq_s16(a), c); #endif } @@ -2099,6 +2101,26 @@ inline v128 gv_dotu16x2(const v128& a, const v128& b) #endif } +// Unsigned bytes from a, signed bytes from b, 32-bit accumulator c +inline v128 gv_dots_u8s8x4(const v128& a, const v128& b, const v128& c) +{ +#if (defined(__AVX512VL__) && defined(__AVX512VNNI__)) || defined(__AVXVNNI__) + return _mm_dpbusds_epi32(c, a, b); +#elif defined(ARCH_X64) + const __m128i ah = _mm_srli_epi16(a, 8); + const __m128i al = _mm_and_si128(a, _mm_set1_epi16(0x00ff)); + const __m128i bh = _mm_srai_epi16(b, 8); + const __m128i bl = _mm_srai_epi16(_mm_slli_epi16(b, 8), 8); + const __m128i mh = _mm_madd_epi16(ah, bh); + const __m128i ml = _mm_madd_epi16(al, bl); + return gv_adds_s32(c, _mm_add_epi32(mh, ml)); +#elif defined(ARCH_ARM64) + const auto l = vpaddlq_s16(vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))), vmovl_s8(vget_low_s8(b)))); + const auto h = vpaddlq_s16(vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a))), vmovl_s8(vget_high_s8(b)))); + return vqaddq_s32(c, vaddq_s32(vuzp1q_s32(l, h), vuzp2q_s32(l, h))); +#endif +} + // Signed s16 from a and b, 32-bit accumulator c; signed saturation inline v128 gv_dots_s16x2(const v128& a, const v128& b, const v128& c) {