From cfd5cf6bdb56ea52a0b2cf56c1d83b081cb0fadc Mon Sep 17 00:00:00 2001
From: linkmauve
Date: Wed, 30 Oct 2019 14:42:44 +0100
Subject: [PATCH] Optimise primitive_restart::upload_untouched() (#6881)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* rsx: Optimise primitive_restart::upload_untouched() with SSE4.1

This optimisation is only applied when skip_restart is false.

I’ve only tested the u16 codepath, as it is the one used in NieR.

In some very unscientific profiling, this function used to take 2.76% of
the total frame time at the save point of the port town; it now takes
about 0.40%.

* rsx: Mark all SSE4.1 functions with attributes on gcc and clang

This assures the compiler that we will only call these functions after
checking that the CPU supports these instructions.

* rsx: Add an AVX2 implementation of primitive restart ibo upload

* rsx: Remove redefinition of SSE4.1 instructions

Now that clang is aware that our functions are compiled with SSE4.1, it
lets us generate this code using its intrinsics.

* rsx: Optimise vector to scalar conversion

This is done using the minpos and srli intrinsics and generates less
code than before. Thanks Nekotekina for the suggestion!
---
 rpcs3/Emu/RSX/Common/BufferUtils.cpp | 305 +++++++++++++++++++++------
 1 file changed, 236 insertions(+), 69 deletions(-)
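Note (reading aid, not part of the change; git am ignores text placed between the
diffstat and the first "diff --git" line): below is a minimal sketch, assuming gcc
or clang on x86-64, of the two ideas described in the message above: compiling a
single function with a per-function target attribute and only calling it after a
runtime CPU check, and reducing a vector of u16 lanes to a scalar with minpos and
srli. The helper names (cpu_has_sse41, reduce_min_u16_sse41, reduce_max_u16_sse41)
are illustrative only; the patch itself uses utils::has_sse41()/has_avx2() and the
SSE4_1_FUNC/AVX2_FUNC macros defined in the hunks below.

    #include <cstdint>
    #include <immintrin.h>

    // Hypothetical stand-in for utils::has_sse41(): gcc/clang builtin CPU probe.
    static bool cpu_has_sse41()
    {
        return __builtin_cpu_supports("sse4.1");
    }

    // Compiled with SSE4.1 enabled for this function only; callers must check
    // cpu_has_sse41() first, which is the contract SSE4_1_FUNC expresses.
    __attribute__((target("sse4.1")))
    static uint16_t reduce_max_u16_sse41(__m128i v)
    {
        // Horizontal max of eight u16 lanes: each byte shift halves the number
        // of candidate lanes until lane 0 holds the overall maximum.
        __m128i t = _mm_srli_si128(v, 8);
        v = _mm_max_epu16(v, t);
        t = _mm_srli_si128(v, 4);
        v = _mm_max_epu16(v, t);
        t = _mm_srli_si128(v, 2);
        v = _mm_max_epu16(v, t);
        return static_cast<uint16_t>(_mm_cvtsi128_si32(v) & 0xFFFF);
    }

    // The minimum is simpler: _mm_minpos_epu16 returns the smallest u16 lane
    // in bits [15:0] of its result, so no shift cascade is needed.
    __attribute__((target("sse4.1")))
    static uint16_t reduce_min_u16_sse41(__m128i v)
    {
        return static_cast<uint16_t>(_mm_cvtsi128_si32(_mm_minpos_epu16(v)) & 0xFFFF);
    }

A caller would branch once per upload on the cached CPU-feature flag and fall back
to the scalar loop otherwise, mirroring the s_use_sse4_1 and s_use_avx2 globals
introduced in the diff below.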
diff --git a/rpcs3/Emu/RSX/Common/BufferUtils.cpp b/rpcs3/Emu/RSX/Common/BufferUtils.cpp
index bda8616edc..077c72807a 100644
--- a/rpcs3/Emu/RSX/Common/BufferUtils.cpp
+++ b/rpcs3/Emu/RSX/Common/BufferUtils.cpp
@@ -10,9 +10,12 @@
 #if defined(_MSC_VER)
 #define __SSSE3__ 1
-#define __SSE4_1__ 1
+#define SSE4_1_FUNC
+#define AVX2_FUNC
 #else
 #define __sse_intrin static FORCE_INLINE
+#define SSE4_1_FUNC __attribute__((__target__("sse4.1")))
+#define AVX2_FUNC __attribute__((__target__("avx2")))
 #endif // _MSC_VER
 // NOTE: Clang does not allow to redefine missing intrinsics
@@ -26,38 +29,11 @@ __sse_intrin __m128i __mm_shuffle_epi8(__m128i opd, __m128i opa)
 #define __mm_shuffle_epi8 _mm_shuffle_epi8
 #endif // __SSSE3__
-#ifndef __SSE4_1__
-__sse_intrin __m128i __mm_max_epu32(__m128i opd, __m128i opa)
-{
-    __asm__("pmaxud %1, %0" : "+x" (opd) : "xm" (opa));
-    return opd;
-}
-__sse_intrin __m128i __mm_min_epu32(__m128i opd, __m128i opa)
-{
-    __asm__("pminud %1, %0" : "+x" (opd) : "xm" (opa));
-    return opd;
-}
-__sse_intrin __m128i __mm_max_epu16(__m128i opd, __m128i opa)
-{
-    __asm__("pmaxuw %1, %0" : "+x" (opd) : "xm" (opa));
-    return opd;
-}
-__sse_intrin __m128i __mm_min_epu16(__m128i opd, __m128i opa)
-{
-    __asm__("pminuw %1, %0" : "+x" (opd) : "xm" (opa));
-    return opd;
-}
-#else
-#define __mm_max_epu32 _mm_max_epu32
-#define __mm_min_epu32 _mm_min_epu32
-#define __mm_max_epu16 _mm_max_epu16
-#define __mm_min_epu16 _mm_min_epu16
-#endif // __SSE4_1__
-
 #undef __sse_intrin
 const bool s_use_ssse3 = utils::has_ssse3();
 const bool s_use_sse4_1 = utils::has_sse41();
+const bool s_use_avx2 = utils::has_avx2();
 namespace
 {
@@ -602,8 +578,9 @@ namespace
     struct untouched_impl
     {
+        SSE4_1_FUNC
         static
-        std::tuple<u16, u16, u32> upload_u16_swapped(const void *src, void *dst, u32 count)
+        std::tuple<u16, u16, u32> upload_u16_swapped_sse4_1(const void *src, void *dst, u32 count)
         {
             const __m128i mask = _mm_set_epi8(
                 0xE, 0xF, 0xC, 0xD,
@@ -621,9 +598,9 @@ namespace
             for (unsigned n = 0; n < iterations; ++n)
             {
                 const __m128i raw = _mm_loadu_si128(src_stream++);
-                const __m128i value = __mm_shuffle_epi8(raw, mask);
-                max = __mm_max_epu16(max, value);
-                min = __mm_min_epu16(min, value);
+                const __m128i value = _mm_shuffle_epi8(raw, mask);
+                max = _mm_max_epu16(max, value);
+                min = _mm_min_epu16(min, value);
                 _mm_storeu_si128(dst_stream++, value);
             }
@@ -639,19 +616,19 @@ namespace
                 0, 0, 0, 0,
                 0, 0, 0, 0,
                 0, 0, 0, 0,
                 0, 0, 0x3, 0x2);
-            __m128i tmp = __mm_shuffle_epi8(min, mask_step1);
-            min = __mm_min_epu16(min, tmp);
-            tmp = __mm_shuffle_epi8(min, mask_step2);
-            min = __mm_min_epu16(min, tmp);
-            tmp = __mm_shuffle_epi8(min, mask_step3);
-            min = __mm_min_epu16(min, tmp);
+            __m128i tmp = _mm_shuffle_epi8(min, mask_step1);
+            min = _mm_min_epu16(min, tmp);
+            tmp = _mm_shuffle_epi8(min, mask_step2);
+            min = _mm_min_epu16(min, tmp);
+            tmp = _mm_shuffle_epi8(min, mask_step3);
+            min = _mm_min_epu16(min, tmp);
-            tmp = __mm_shuffle_epi8(max, mask_step1);
-            max = __mm_max_epu16(max, tmp);
-            tmp = __mm_shuffle_epi8(max, mask_step2);
-            max = __mm_max_epu16(max, tmp);
-            tmp = __mm_shuffle_epi8(max, mask_step3);
-            max = __mm_max_epu16(max, tmp);
+            tmp = _mm_shuffle_epi8(max, mask_step1);
+            max = _mm_max_epu16(max, tmp);
+            tmp = _mm_shuffle_epi8(max, mask_step2);
+            max = _mm_max_epu16(max, tmp);
+            tmp = _mm_shuffle_epi8(max, mask_step3);
+            max = _mm_max_epu16(max, tmp);
             const u16 min_index = u16(_mm_cvtsi128_si32(min) & 0xFFFF);
             const u16 max_index = u16(_mm_cvtsi128_si32(max) & 0xFFFF);
@@ -659,8 +636,9 @@ namespace
             return std::make_tuple(min_index, max_index, count);
         }
+        SSE4_1_FUNC
         static
-        std::tuple<u32, u32, u32> upload_u32_swapped(const void *src, void *dst, u32 count)
+        std::tuple<u32, u32, u32> upload_u32_swapped_sse4_1(const void *src, void *dst, u32 count)
         {
             const __m128i mask = _mm_set_epi8(
                 0xC, 0xD, 0xE, 0xF,
@@ -678,9 +656,9 @@ namespace
             for (unsigned n = 0; n < iterations; ++n)
             {
                 const __m128i raw = _mm_loadu_si128(src_stream++);
-                const __m128i value = __mm_shuffle_epi8(raw, mask);
-                max = __mm_max_epu32(max, value);
-                min = __mm_min_epu32(min, value);
+                const __m128i value = _mm_shuffle_epi8(raw, mask);
+                max = _mm_max_epu32(max, value);
+                min = _mm_min_epu32(min, value);
                 _mm_storeu_si128(dst_stream++, value);
             }
@@ -693,15 +671,15 @@ namespace
                 0, 0, 0, 0,
                 0, 0, 0, 0,
                 0, 0, 0, 0,
                 0x7, 0x6, 0x5, 0x4);
-            __m128i tmp = __mm_shuffle_epi8(min, mask_step1);
-            min = __mm_min_epu32(min, tmp);
-            tmp = __mm_shuffle_epi8(min, mask_step2);
-            min = __mm_min_epu32(min, tmp);
+            __m128i tmp = _mm_shuffle_epi8(min, mask_step1);
+            min = _mm_min_epu32(min, tmp);
+            tmp = _mm_shuffle_epi8(min, mask_step2);
+            min = _mm_min_epu32(min, tmp);
-            tmp = __mm_shuffle_epi8(max, mask_step1);
-            max = __mm_max_epu32(max, tmp);
-            tmp = __mm_shuffle_epi8(max, mask_step2);
-            max = __mm_max_epu32(max, tmp);
+            tmp = _mm_shuffle_epi8(max, mask_step1);
+            max = _mm_max_epu32(max, tmp);
+            tmp = _mm_shuffle_epi8(max, mask_step2);
+            max = _mm_max_epu32(max, tmp);
             const u32 min_index = u32(_mm_cvtsi128_si32(min));
             const u32 max_index = u32(_mm_cvtsi128_si32(max));
@@ -722,12 +700,12 @@ namespace
             if constexpr (std::is_same<T, u32>::value)
             {
                 const auto count = (remaining & ~0x3);
-                std::tie(min_index, max_index, written) = upload_u32_swapped(src.data(), dst.data(), count);
+                std::tie(min_index, max_index, written) = upload_u32_swapped_sse4_1(src.data(), dst.data(), count);
             }
             else if constexpr (std::is_same<T, u16>::value)
             {
                 const auto count = (remaining & ~0x7);
-                std::tie(min_index, max_index, written) = upload_u16_swapped(src.data(), dst.data(), count);
+                std::tie(min_index, max_index, written) = upload_u16_swapped_sse4_1(src.data(), dst.data(), count);
             }
             else
             {
@@ -755,39 +733,228 @@ namespace
     struct primitive_restart_impl
     {
+        AVX2_FUNC
+        static
+        std::tuple<u16, u16> upload_u16_swapped_avx2(const void *src, void *dst, u32 iterations, u16 restart_index)
+        {
+            const __m256i shuffle_mask = _mm256_set_epi8(
+                0xE, 0xF, 0xC, 0xD,
+                0xA, 0xB, 0x8, 0x9,
+                0x6, 0x7, 0x4, 0x5,
+                0x2, 0x3, 0x0, 0x1,
+                0xE, 0xF, 0xC, 0xD,
+                0xA, 0xB, 0x8, 0x9,
+                0x6, 0x7, 0x4, 0x5,
+                0x2, 0x3, 0x0, 0x1);
+
+            auto src_stream = (const __m256i*)src;
+            auto dst_stream = (__m256i*)dst;
+
+            __m256i restart = _mm256_set1_epi16(restart_index);
+            __m256i min = _mm256_set1_epi16(0xffff);
+            __m256i max = _mm256_set1_epi16(0);
+
+            for (unsigned n = 0; n < iterations; ++n)
+            {
+                const __m256i raw = _mm256_loadu_si256(src_stream++);
+                const __m256i value = _mm256_shuffle_epi8(raw, shuffle_mask);
+                const __m256i mask = _mm256_cmpeq_epi16(restart, value);
+                const __m256i value_with_min_restart = _mm256_andnot_si256(mask, value);
+                const __m256i value_with_max_restart = _mm256_or_si256(mask, value);
+                max = _mm256_max_epu16(max, value_with_min_restart);
+                min = _mm256_min_epu16(min, value_with_max_restart);
+                _mm256_storeu_si256(dst_stream++, value_with_max_restart);
+            }
+
+            __m128i tmp = _mm256_extracti128_si256(min, 1);
+            __m128i min2 = _mm256_castsi256_si128(min);
+            min2 = _mm_min_epu16(min2, tmp);
+            min2 = _mm_minpos_epu16(min2);
+
+            tmp = _mm256_extracti128_si256(max, 1);
+            __m128i max2 = _mm256_castsi256_si128(max);
+            max2 = _mm_max_epu16(max2, tmp);
+            tmp = _mm_srli_si128(max2, 8);
+            max2 = _mm_max_epu16(max2, tmp);
+            tmp = _mm_srli_si128(max2, 4);
+            max2 = _mm_max_epu16(max2, tmp);
+            tmp = _mm_srli_si128(max2, 2);
+            max2 = _mm_max_epu16(max2, tmp);
+
+            const u16 min_index = u16(_mm_cvtsi128_si32(min2) & 0xFFFF);
+            const u16 max_index = u16(_mm_cvtsi128_si32(max2) & 0xFFFF);
+
+            return std::make_tuple(min_index, max_index);
+        }
+
+        SSE4_1_FUNC
+        static
+        std::tuple<u16, u16> upload_u16_swapped_sse4_1(const void *src, void *dst, u32 iterations, u16 restart_index)
+        {
+            const __m128i shuffle_mask = _mm_set_epi8(
+                0xE, 0xF, 0xC, 0xD,
+                0xA, 0xB, 0x8, 0x9,
+                0x6, 0x7, 0x4, 0x5,
+                0x2, 0x3, 0x0, 0x1);
+
+            auto src_stream = (const __m128i*)src;
+            auto dst_stream = (__m128i*)dst;
+
+            __m128i restart = _mm_set1_epi16(restart_index);
+            __m128i min = _mm_set1_epi16(0xffff);
+            __m128i max = _mm_set1_epi16(0);
+
+            for (unsigned n = 0; n < iterations; ++n)
+            {
+                const __m128i raw = _mm_loadu_si128(src_stream++);
+                const __m128i value = _mm_shuffle_epi8(raw, shuffle_mask);
+                const __m128i mask = _mm_cmpeq_epi16(restart, value);
+                const __m128i value_with_min_restart = _mm_andnot_si128(mask, value);
+                const __m128i value_with_max_restart = _mm_or_si128(mask, value);
+                max = _mm_max_epu16(max, value_with_min_restart);
+                min = _mm_min_epu16(min, value_with_max_restart);
+                _mm_storeu_si128(dst_stream++, value_with_max_restart);
+            }
+
+            min = _mm_minpos_epu16(min);
+
+            __m128i tmp = _mm_srli_si128(max, 8);
+            max = _mm_max_epu16(max, tmp);
+            tmp = _mm_srli_si128(max, 4);
+            max = _mm_max_epu16(max, tmp);
+            tmp = _mm_srli_si128(max, 2);
+            max = _mm_max_epu16(max, tmp);
+
+            const u16 min_index = u16(_mm_cvtsi128_si32(min) & 0xFFFF);
+            const u16 max_index = u16(_mm_cvtsi128_si32(max) & 0xFFFF);
+
+            return std::make_tuple(min_index, max_index);
+        }
+
+        SSE4_1_FUNC
+        static
+        std::tuple<u32, u32> upload_u32_swapped_sse4_1(const void *src, void *dst, u32 iterations, u32 restart_index)
+        {
+            const __m128i shuffle_mask = _mm_set_epi8(
+                0xC, 0xD, 0xE, 0xF,
+                0x8, 0x9, 0xA, 0xB,
+                0x4, 0x5, 0x6, 0x7,
+                0x0, 0x1, 0x2, 0x3);
+
+            auto src_stream = (const __m128i*)src;
+            auto dst_stream = (__m128i*)dst;
+
+            __m128i restart = _mm_set1_epi32(restart_index);
+            __m128i min = _mm_set1_epi32(0xffffffff);
+            __m128i max = _mm_set1_epi32(0);
+
+            for (unsigned n = 0; n < iterations; ++n)
+            {
+                const __m128i raw = _mm_loadu_si128(src_stream++);
+                const __m128i value = _mm_shuffle_epi8(raw, shuffle_mask);
+                const __m128i mask = _mm_cmpeq_epi32(restart, value);
+                const __m128i value_with_min_restart = _mm_andnot_si128(mask, value);
+                const __m128i value_with_max_restart = _mm_or_si128(mask, value);
+                max = _mm_max_epu32(max, value_with_min_restart);
+                min = _mm_min_epu32(min, value_with_max_restart);
+                _mm_storeu_si128(dst_stream++, value_with_max_restart);
+            }
+
+            __m128i tmp = _mm_srli_si128(min, 8);
+            min = _mm_min_epu32(min, tmp);
+            tmp = _mm_srli_si128(min, 4);
+            min = _mm_min_epu32(min, tmp);
+
+            tmp = _mm_srli_si128(max, 8);
+            max = _mm_max_epu32(max, tmp);
+            tmp = _mm_srli_si128(max, 4);
+            max = _mm_max_epu32(max, tmp);
+
+            const u32 min_index = u32(_mm_cvtsi128_si32(min));
+            const u32 max_index = u32(_mm_cvtsi128_si32(max));
+
+            return std::make_tuple(min_index, max_index);
+        }
+
         template <typename T>
         static
-        std::tuple<T, T, u32> upload_untouched(gsl::span<to_be_t<const T>> src, gsl::span<T> dst, u32 restart_index, bool skip_restart)
+        std::tuple<T, T, u32> upload_untouched(gsl::span<to_be_t<const T>> src, gsl::span<T> dst, T restart_index, bool skip_restart)
         {
-            T min_index = index_limit<T>(), max_index = 0;
-            u32 dst_index = 0;
+            T min_index = index_limit<T>();
+            T max_index = 0;
+            u32 written = 0;
+            u32 length = src.size();
-            for (const T index : src)
+            if (length >= 32 && !skip_restart)
             {
-                if (index == restart_index)
+                if constexpr (std::is_same<T, u16>::value)
                 {
-                    if (!skip_restart)
+                    if (s_use_avx2)
                     {
-                        dst[dst_index++] = index_limit<T>();
+                        u32 iterations = length >> 4;
+                        written = length & ~0xF;
+                        std::tie(min_index, max_index) = upload_u16_swapped_avx2(src.data(), dst.data(), iterations, restart_index);
+                    }
+                    else if (s_use_sse4_1)
+                    {
+                        u32 iterations = length >> 3;
+                        written = length & ~0x7;
+                        std::tie(min_index, max_index) = upload_u16_swapped_sse4_1(src.data(), dst.data(), iterations, restart_index);
+                    }
+                }
+                else if constexpr (std::is_same<T, u32>::value)
+                {
+                    if (s_use_sse4_1)
+                    {
+                        u32 iterations = length >> 2;
+                        written = length & ~0x3;
+                        std::tie(min_index, max_index) = upload_u32_swapped_sse4_1(src.data(), dst.data(), iterations, restart_index);
                     }
                 }
                 else
                 {
-                    dst[dst_index++] = min_max(min_index, max_index, index);
+                    fmt::throw_exception("Unreachable" HERE);
                 }
             }
-            return std::make_tuple(min_index, max_index, dst_index);
+            for (u32 i = written; i < length; ++i)
+            {
+                T index = src[i];
+                if (index == restart_index)
+                {
+                    if (!skip_restart)
+                    {
+                        dst[written++] = index_limit<T>();
+                    }
+                }
+                else
+                {
+                    dst[written++] = min_max(min_index, max_index, index);
+                }
+            }
+
+            return std::make_tuple(min_index, max_index, written);
         }
     };
     template <typename T>
     std::tuple<T, T, u32> upload_untouched(gsl::span<to_be_t<const T>> src, gsl::span<T> dst, rsx::primitive_type draw_mode, bool is_primitive_restart_enabled, u32 primitive_restart_index)
     {
-        if (LIKELY(!is_primitive_restart_enabled))
+        if (!is_primitive_restart_enabled)
         {
             return untouched_impl::upload_untouched(src, dst);
         }
+        else if constexpr (std::is_same<T, u32>::value)
+        {
+            if (primitive_restart_index > 0xffff)
+            {
+                return untouched_impl::upload_untouched(src, dst);
+            }
+            else
+            {
+                return primitive_restart_impl::upload_untouched(src, dst, (u16)primitive_restart_index, is_primitive_disjointed(draw_mode));
+            }
+        }
         else
         {
             return primitive_restart_impl::upload_untouched(src, dst, primitive_restart_index, is_primitive_disjointed(draw_mode));