From cfd5cf6bdb56ea52a0b2cf56c1d83b081cb0fadc Mon Sep 17 00:00:00 2001
From: linkmauve
Date: Wed, 30 Oct 2019 14:42:44 +0100
Subject: [PATCH] Optimise primitive_restart::upload_untouched() (#6881)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* rsx: Optimise primitive_restart::upload_untouched() with SSE4.1

This optimisation is only applied when skip_restart is false.

I’ve only tested the u16 codepath, as it is the one used in NieR.

In some very unscientific profiling, this function used to take 2.76% of
the total frame time at the save point of the port town; it now takes
about 0.40%.

* rsx: Mark all SSE4.1 functions with attributes on gcc and clang

This assures the compiler that we will only call these functions after
checking that the CPU supports these instructions.

* rsx: Add an AVX2 implementation of primitive restart ibo upload

* rsx: Remove redefinition of SSE4.1 instructions

Now that clang is aware that our functions are compiled with SSE4.1, it
lets us generate this code using its intrinsics.

* rsx: Optimise vector to scalar conversion

This is done using the minpos and srli intrinsics and generates less
code than before. Thanks Nekotekina for the suggestion!
---
 rpcs3/Emu/RSX/Common/BufferUtils.cpp | 305 +++++++++++++++++++++------
 1 file changed, 236 insertions(+), 69 deletions(-)
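Note (reading aid, not part of the change; git am ignores text placed between the
diffstat and the first "diff --git" line): below is a minimal sketch, assuming gcc
or clang on x86-64, of the two ideas described in the message above: compiling a
single function with a per-function target attribute and only calling it after a
runtime CPU check, and reducing a vector of u16 lanes to a scalar with minpos and
srli. The helper names (cpu_has_sse41, reduce_min_u16_sse41, reduce_max_u16_sse41)
are illustrative only; the patch itself uses utils::has_sse41()/has_avx2() and the
SSE4_1_FUNC/AVX2_FUNC macros defined in the hunks below.

    #include <cstdint>
    #include <immintrin.h>

    // Hypothetical stand-in for utils::has_sse41(): gcc/clang builtin CPU probe.
    static bool cpu_has_sse41()
    {
        return __builtin_cpu_supports("sse4.1");
    }

    // Compiled with SSE4.1 enabled for this function only; callers must check
    // cpu_has_sse41() first, which is the contract SSE4_1_FUNC expresses.
    __attribute__((target("sse4.1")))
    static uint16_t reduce_max_u16_sse41(__m128i v)
    {
        // Horizontal max of eight u16 lanes: each byte shift halves the number
        // of candidate lanes until lane 0 holds the overall maximum.
        __m128i t = _mm_srli_si128(v, 8);
        v = _mm_max_epu16(v, t);
        t = _mm_srli_si128(v, 4);
        v = _mm_max_epu16(v, t);
        t = _mm_srli_si128(v, 2);
        v = _mm_max_epu16(v, t);
        return static_cast<uint16_t>(_mm_cvtsi128_si32(v) & 0xFFFF);
    }

    // The minimum is simpler: _mm_minpos_epu16 returns the smallest u16 lane
    // in bits [15:0] of its result, so no shift cascade is needed.
    __attribute__((target("sse4.1")))
    static uint16_t reduce_min_u16_sse41(__m128i v)
    {
        return static_cast<uint16_t>(_mm_cvtsi128_si32(_mm_minpos_epu16(v)) & 0xFFFF);
    }

A caller would branch once per upload on the cached CPU-feature flag and fall back
to the scalar loop otherwise, mirroring the s_use_sse4_1 and s_use_avx2 globals
introduced in the diff below.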
diff --git a/rpcs3/Emu/RSX/Common/BufferUtils.cpp b/rpcs3/Emu/RSX/Common/BufferUtils.cpp
index bda8616edc..077c72807a 100644
--- a/rpcs3/Emu/RSX/Common/BufferUtils.cpp
+++ b/rpcs3/Emu/RSX/Common/BufferUtils.cpp
@@ -10,9 +10,12 @@
 #if defined(_MSC_VER)
 #define __SSSE3__ 1
-#define __SSE4_1__ 1
+#define SSE4_1_FUNC
+#define AVX2_FUNC
 #else
 #define __sse_intrin static FORCE_INLINE
+#define SSE4_1_FUNC __attribute__((__target__("sse4.1")))
+#define AVX2_FUNC __attribute__((__target__("avx2")))
 #endif // _MSC_VER
 // NOTE: Clang does not allow to redefine missing intrinsics
@@ -26,38 +29,11 @@ __sse_intrin __m128i __mm_shuffle_epi8(__m128i opd, __m128i opa)
 #define __mm_shuffle_epi8 _mm_shuffle_epi8
 #endif // __SSSE3__
-#ifndef __SSE4_1__
-__sse_intrin __m128i __mm_max_epu32(__m128i opd, __m128i opa)
-{
-    __asm__("pmaxud %1, %0" : "+x" (opd) : "xm" (opa));
-    return opd;
-}
-__sse_intrin __m128i __mm_min_epu32(__m128i opd, __m128i opa)
-{
-    __asm__("pminud %1, %0" : "+x" (opd) : "xm" (opa));
-    return opd;
-}
-__sse_intrin __m128i __mm_max_epu16(__m128i opd, __m128i opa)
-{
-    __asm__("pmaxuw %1, %0" : "+x" (opd) : "xm" (opa));
-    return opd;
-}
-__sse_intrin __m128i __mm_min_epu16(__m128i opd, __m128i opa)
-{
-    __asm__("pminuw %1, %0" : "+x" (opd) : "xm" (opa));
-    return opd;
-}
-#else
-#define __mm_max_epu32 _mm_max_epu32
-#define __mm_min_epu32 _mm_min_epu32
-#define __mm_max_epu16 _mm_max_epu16
-#define __mm_min_epu16 _mm_min_epu16
-#endif // __SSE4_1__
-
 #undef __sse_intrin
 const bool s_use_ssse3 = utils::has_ssse3();
 const bool s_use_sse4_1 = utils::has_sse41();
+const bool s_use_avx2 = utils::has_avx2();
 namespace
 {
@@ -602,8 +578,9 @@ namespace
     struct untouched_impl
     {
+        SSE4_1_FUNC
         static
-        std::tuple<u16, u16, u32> upload_u16_swapped(const void *src, void *dst, u32 count)
+        std::tuple<u16, u16, u32> upload_u16_swapped_sse4_1(const void *src, void *dst, u32 count)
         {
             const __m128i mask = _mm_set_epi8(
                 0xE, 0xF, 0xC, 0xD,
@@ -621,9 +598,9 @@ namespace
             for (unsigned n = 0; n < iterations; ++n)
             {
                 const __m128i raw = _mm_loadu_si128(src_stream++);
-                const __m128i value = __mm_shuffle_epi8(raw, mask);
-                max = __mm_max_epu16(max, value);
-                min = __mm_min_epu16(min, value);
+                const __m128i value = _mm_shuffle_epi8(raw, mask);
+                max = _mm_max_epu16(max, value);
+                min = _mm_min_epu16(min, value);
                 _mm_storeu_si128(dst_stream++, value);
             }
@@ -639,19 +616,19 @@ namespace
                 0, 0, 0, 0,
                 0, 0, 0, 0,
                 0, 0, 0, 0,
                 0, 0, 0x3, 0x2);
-            __m128i tmp = __mm_shuffle_epi8(min, mask_step1);
-            min = __mm_min_epu16(min, tmp);
-            tmp = __mm_shuffle_epi8(min, mask_step2);
-            min = __mm_min_epu16(min, tmp);
-            tmp = __mm_shuffle_epi8(min, mask_step3);
-            min = __mm_min_epu16(min, tmp);
+            __m128i tmp = _mm_shuffle_epi8(min, mask_step1);
+            min = _mm_min_epu16(min, tmp);
+            tmp = _mm_shuffle_epi8(min, mask_step2);
+            min = _mm_min_epu16(min, tmp);
+            tmp = _mm_shuffle_epi8(min, mask_step3);
+            min = _mm_min_epu16(min, tmp);
-            tmp = __mm_shuffle_epi8(max, mask_step1);
-            max = __mm_max_epu16(max, tmp);
-            tmp = __mm_shuffle_epi8(max, mask_step2);
-            max = __mm_max_epu16(max, tmp);
-            tmp = __mm_shuffle_epi8(max, mask_step3);
-            max = __mm_max_epu16(max, tmp);
+            tmp = _mm_shuffle_epi8(max, mask_step1);
+            max = _mm_max_epu16(max, tmp);
+            tmp = _mm_shuffle_epi8(max, mask_step2);
+            max = _mm_max_epu16(max, tmp);
+            tmp = _mm_shuffle_epi8(max, mask_step3);
+            max = _mm_max_epu16(max, tmp);
             const u16 min_index = u16(_mm_cvtsi128_si32(min) & 0xFFFF);
             const u16 max_index = u16(_mm_cvtsi128_si32(max) & 0xFFFF);
@@ -659,8 +636,9 @@ namespace
             return std::make_tuple(min_index, max_index, count);
         }
+        SSE4_1_FUNC
         static
-        std::tuple<u32, u32, u32> upload_u32_swapped(const void *src, void *dst, u32 count)
+        std::tuple<u32, u32, u32> upload_u32_swapped_sse4_1(const void *src, void *dst, u32 count)
         {
             const __m128i mask = _mm_set_epi8(
                 0xC, 0xD, 0xE, 0xF,
@@ -678,9 +656,9 @@ namespace
             for (unsigned n = 0; n < iterations; ++n)
             {
                 const __m128i raw = _mm_loadu_si128(src_stream++);
-                const __m128i value = __mm_shuffle_epi8(raw, mask);
-                max = __mm_max_epu32(max, value);
-                min = __mm_min_epu32(min, value);
+                const __m128i value = _mm_shuffle_epi8(raw, mask);
+                max = _mm_max_epu32(max, value);
+                min = _mm_min_epu32(min, value);
                 _mm_storeu_si128(dst_stream++, value);
             }
@@ -693,15 +671,15 @@ namespace
                 0, 0, 0, 0,
                 0, 0, 0, 0,
                 0, 0, 0, 0,
                 0x7, 0x6, 0x5, 0x4);
-            __m128i tmp = __mm_shuffle_epi8(min, mask_step1);
-            min = __mm_min_epu32(min, tmp);
-            tmp = __mm_shuffle_epi8(min, mask_step2);
-            min = __mm_min_epu32(min, tmp);
+            __m128i tmp = _mm_shuffle_epi8(min, mask_step1);
+            min = _mm_min_epu32(min, tmp);
+            tmp = _mm_shuffle_epi8(min, mask_step2);
+            min = _mm_min_epu32(min, tmp);
-            tmp = __mm_shuffle_epi8(max, mask_step1);
-            max = __mm_max_epu32(max, tmp);
-            tmp = __mm_shuffle_epi8(max, mask_step2);
-            max = __mm_max_epu32(max, tmp);
+            tmp = _mm_shuffle_epi8(max, mask_step1);
+            max = _mm_max_epu32(max, tmp);
+            tmp = _mm_shuffle_epi8(max, mask_step2);
+            max = _mm_max_epu32(max, tmp);
             const u32 min_index = u32(_mm_cvtsi128_si32(min));
             const u32 max_index = u32(_mm_cvtsi128_si32(max));
@@ -722,12 +700,12 @@ namespace
             if constexpr (std::is_same<T, u32>::value)
             {
                 const auto count = (remaining & ~0x3);
-                std::tie(min_index, max_index, written) = upload_u32_swapped(src.data(), dst.data(), count);
+                std::tie(min_index, max_index, written) = upload_u32_swapped_sse4_1(src.data(), dst.data(), count);
             }
             else if constexpr (std::is_same<T, u16>::value)
             {
                 const auto count = (remaining & ~0x7);
-                std::tie(min_index, max_index, written) = upload_u16_swapped(src.data(), dst.data(), count);
+                std::tie(min_index, max_index, written) = upload_u16_swapped_sse4_1(src.data(), dst.data(), count);
             }
             else
             {
@@ -755,39 +733,228 @@ namespace
     struct primitive_restart_impl
     {
+        AVX2_FUNC
+        static
+        std::tuple<u16, u16> upload_u16_swapped_avx2(const void *src, void *dst, u32 iterations, u16 restart_index)
+        {
+            const __m256i shuffle_mask = _mm256_set_epi8(
+                0xE, 0xF, 0xC, 0xD,
+                0xA, 0xB, 0x8, 0x9,
+                0x6, 0x7, 0x4, 0x5,
+                0x2, 0x3, 0x0, 0x1,
+                0xE, 0xF, 0xC, 0xD,
+                0xA, 0xB, 0x8, 0x9,
+                0x6, 0x7, 0x4, 0x5,
+                0x2, 0x3, 0x0, 0x1);
+
+            auto src_stream = (const __m256i*)src;
+            auto dst_stream = (__m256i*)dst;
+
+            __m256i restart = _mm256_set1_epi16(restart_index);
+            __m256i min = _mm256_set1_epi16(0xffff);
+            __m256i max = _mm256_set1_epi16(0);
+
+            for (unsigned n = 0; n < iterations; ++n)
+            {
+                const __m256i raw = _mm256_loadu_si256(src_stream++);
+                const __m256i value = _mm256_shuffle_epi8(raw, shuffle_mask);
+                const __m256i mask = _mm256_cmpeq_epi16(restart, value);
+                const __m256i value_with_min_restart = _mm256_andnot_si256(mask, value);
+                const __m256i value_with_max_restart = _mm256_or_si256(mask, value);
+                max = _mm256_max_epu16(max, value_with_min_restart);
+                min = _mm256_min_epu16(min, value_with_max_restart);
+                _mm256_storeu_si256(dst_stream++, value_with_max_restart);
+            }
+
+            __m128i tmp = _mm256_extracti128_si256(min, 1);
+            __m128i min2 = _mm256_castsi256_si128(min);
+            min2 = _mm_min_epu16(min2, tmp);
+            min2 = _mm_minpos_epu16(min2);
+
+            tmp = _mm256_extracti128_si256(max, 1);
+            __m128i max2 = _mm256_castsi256_si128(max);
+            max2 = _mm_max_epu16(max2, tmp);
+            tmp = _mm_srli_si128(max2, 8);
+            max2 = _mm_max_epu16(max2, tmp);
+            tmp = _mm_srli_si128(max2, 4);
+            max2 = _mm_max_epu16(max2, tmp);
+            tmp = _mm_srli_si128(max2, 2);
+            max2 = _mm_max_epu16(max2, tmp);
+
+            const u16 min_index = u16(_mm_cvtsi128_si32(min2) & 0xFFFF);
+            const u16 max_index = u16(_mm_cvtsi128_si32(max2) & 0xFFFF);
+
+            return std::make_tuple(min_index, max_index);
+        }
+
+        SSE4_1_FUNC
+        static
+        std::tuple<u16, u16> upload_u16_swapped_sse4_1(const void *src, void *dst, u32 iterations, u16 restart_index)
+        {
+            const __m128i shuffle_mask = _mm_set_epi8(
+                0xE, 0xF, 0xC, 0xD,
+                0xA, 0xB, 0x8, 0x9,
+                0x6, 0x7, 0x4, 0x5,
+                0x2, 0x3, 0x0, 0x1);
+
+            auto src_stream = (const __m128i*)src;
+            auto dst_stream = (__m128i*)dst;
+
+            __m128i restart = _mm_set1_epi16(restart_index);
+            __m128i min = _mm_set1_epi16(0xffff);
+            __m128i max = _mm_set1_epi16(0);
+
+            for (unsigned n = 0; n < iterations; ++n)
+            {
+                const __m128i raw = _mm_loadu_si128(src_stream++);
+                const __m128i value = _mm_shuffle_epi8(raw, shuffle_mask);
+                const __m128i mask = _mm_cmpeq_epi16(restart, value);
+                const __m128i value_with_min_restart = _mm_andnot_si128(mask, value);
+                const __m128i value_with_max_restart = _mm_or_si128(mask, value);
+                max = _mm_max_epu16(max, value_with_min_restart);
+                min = _mm_min_epu16(min, value_with_max_restart);
+                _mm_storeu_si128(dst_stream++, value_with_max_restart);
+            }
+
+            min = _mm_minpos_epu16(min);
+
+            __m128i tmp = _mm_srli_si128(max, 8);
+            max = _mm_max_epu16(max, tmp);
+            tmp = _mm_srli_si128(max, 4);
+            max = _mm_max_epu16(max, tmp);
+            tmp = _mm_srli_si128(max, 2);
+            max = _mm_max_epu16(max, tmp);
+
+            const u16 min_index = u16(_mm_cvtsi128_si32(min) & 0xFFFF);
+            const u16 max_index = u16(_mm_cvtsi128_si32(max) & 0xFFFF);
+
+            return std::make_tuple(min_index, max_index);
+        }
+
+        SSE4_1_FUNC
+        static
+        std::tuple<u32, u32> upload_u32_swapped_sse4_1(const void *src, void *dst, u32 iterations, u32 restart_index)
+        {
+            const __m128i shuffle_mask = _mm_set_epi8(
+                0xC, 0xD, 0xE, 0xF,
+                0x8, 0x9, 0xA, 0xB,
+                0x4, 0x5, 0x6, 0x7,
+                0x0, 0x1, 0x2, 0x3);
+
+            auto src_stream = (const __m128i*)src;
+            auto dst_stream = (__m128i*)dst;
+
+            __m128i restart = _mm_set1_epi32(restart_index);
+            __m128i min = _mm_set1_epi32(0xffffffff);
+            __m128i max = _mm_set1_epi32(0);
+
+            for (unsigned n = 0; n < iterations; ++n)
+            {
+                const __m128i raw = _mm_loadu_si128(src_stream++);
+                const __m128i value = _mm_shuffle_epi8(raw, shuffle_mask);
+                const __m128i mask = _mm_cmpeq_epi32(restart, value);
+                const __m128i value_with_min_restart = _mm_andnot_si128(mask, value);
+                const __m128i value_with_max_restart = _mm_or_si128(mask, value);
+                max = _mm_max_epu32(max, value_with_min_restart);
+                min = _mm_min_epu32(min, value_with_max_restart);
+                _mm_storeu_si128(dst_stream++, value_with_max_restart);
+            }
+
+            __m128i tmp = _mm_srli_si128(min, 8);
+            min = _mm_min_epu32(min, tmp);
+            tmp = _mm_srli_si128(min, 4);
+            min = _mm_min_epu32(min, tmp);
+
+            tmp = _mm_srli_si128(max, 8);
+            max = _mm_max_epu32(max, tmp);
+            tmp = _mm_srli_si128(max, 4);
+            max = _mm_max_epu32(max, tmp);
+
+            const u32 min_index = u32(_mm_cvtsi128_si32(min));
+            const u32 max_index = u32(_mm_cvtsi128_si32(max));
+
+            return std::make_tuple(min_index, max_index);
+        }
+
         template <typename T>
         static
-        std::tuple<T, T, u32> upload_untouched(gsl::span<to_be_t<const T>> src, gsl::span<T> dst, u32 restart_index, bool skip_restart)
+        std::tuple<T, T, u32> upload_untouched(gsl::span<to_be_t<const T>> src, gsl::span<T> dst, T restart_index, bool skip_restart)
         {
-            T min_index = index_limit<T>(), max_index = 0;
-            u32 dst_index = 0;
+            T min_index = index_limit<T>();
+            T max_index = 0;
+            u32 written = 0;
+            u32 length = src.size();
-            for (const T index : src)
+            if (length >= 32 && !skip_restart)
             {
-                if (index == restart_index)
+                if constexpr (std::is_same<T, u16>::value)
                 {
-                    if (!skip_restart)
+                    if (s_use_avx2)
                     {
-                        dst[dst_index++] = index_limit<T>();
+                        u32 iterations = length >> 4;
+                        written = length & ~0xF;
+                        std::tie(min_index, max_index) = upload_u16_swapped_avx2(src.data(), dst.data(), iterations, restart_index);
+                    }
+                    else if (s_use_sse4_1)
+                    {
+                        u32 iterations = length >> 3;
+                        written = length & ~0x7;
+                        std::tie(min_index, max_index) = upload_u16_swapped_sse4_1(src.data(), dst.data(), iterations, restart_index);
+                    }
+                }
+                else if constexpr (std::is_same<T, u32>::value)
+                {
+                    if (s_use_sse4_1)
+                    {
+                        u32 iterations = length >> 2;
+                        written = length & ~0x3;
+                        std::tie(min_index, max_index) = upload_u32_swapped_sse4_1(src.data(), dst.data(), iterations, restart_index);
                     }
                 }
                 else
                 {
-                    dst[dst_index++] = min_max(min_index, max_index, index);
+                    fmt::throw_exception("Unreachable" HERE);
                 }
             }
-            return std::make_tuple(min_index, max_index, dst_index);
+            for (u32 i = written; i < length; ++i)
+            {
+                T index = src[i];
+                if (index == restart_index)
+                {
+                    if (!skip_restart)
+                    {
+                        dst[written++] = index_limit<T>();
+                    }
+                }
+                else
+                {
+                    dst[written++] = min_max(min_index, max_index, index);
+                }
+            }
+
+            return std::make_tuple(min_index, max_index, written);
         }
     };
     template <typename T>
     std::tuple<T, T, u32> upload_untouched(gsl::span<to_be_t<const T>> src, gsl::span<T> dst, rsx::primitive_type draw_mode, bool is_primitive_restart_enabled, u32 primitive_restart_index)
     {
-        if (LIKELY(!is_primitive_restart_enabled))
+        if (!is_primitive_restart_enabled)
         {
             return untouched_impl::upload_untouched(src, dst);
         }
+        else if constexpr (std::is_same<T, u32>::value)
+        {
+            if (primitive_restart_index > 0xffff)
+            {
+                return untouched_impl::upload_untouched(src, dst);
+            }
+            else
+            {
+                return primitive_restart_impl::upload_untouched(src, dst, (u16)primitive_restart_index, is_primitive_disjointed(draw_mode));
+            }
+        }
         else
         {
             return primitive_restart_impl::upload_untouched(src, dst, primitive_restart_index, is_primitive_disjointed(draw_mode));