From 0db9850a730d439a530e810c4e08fccef47499c5 Mon Sep 17 00:00:00 2001
From: Nekotekina
Date: Mon, 24 Jan 2022 22:22:42 +0300
Subject: [PATCH] Add loop building utilities for ASMJIT

Refactor copy_data_swap_u32 a bit
---
 Utilities/JIT.h                      |  54 ++++++++
 rpcs3/Emu/RSX/Common/BufferUtils.cpp | 181 ++++++++++++---
 2 files changed, 137 insertions(+), 98 deletions(-)

diff --git a/Utilities/JIT.h b/Utilities/JIT.h
index 3e1a1f8cae..4c70785f2e 100644
--- a/Utilities/JIT.h
+++ b/Utilities/JIT.h
@@ -209,6 +209,60 @@ namespace asmjit
 		static_cast<void>(args);
 #endif
 	}
+
+#if defined(ARCH_X64)
+	template <uint Size>
+	struct native_vec;
+
+	template <>
+	struct native_vec<16> { using type = x86::Xmm; };
+
+	template <>
+	struct native_vec<32> { using type = x86::Ymm; };
+
+	template <>
+	struct native_vec<64> { using type = x86::Zmm; };
+
+	template <uint Size>
+	using native_vec_t = typename native_vec<Size>::type;
+
+	// if (count > step) { for (; ctr < (count - step); ctr += step) {...} count -= ctr; }
+	inline void build_incomplete_loop(native_asm& c, auto ctr, auto count, u32 step, auto&& build)
+	{
+		asmjit::Label body = c.newLabel();
+		asmjit::Label exit = c.newLabel();
+
+		ensure((step & (step - 1)) == 0);
+		c.cmp(count, step);
+		c.jbe(exit);
+		c.sub(count, step);
+		c.align(asmjit::AlignMode::kCode, 16);
+		c.bind(body);
+		build();
+		c.add(ctr, step);
+		c.sub(count, step);
+		c.ja(body);
+		c.add(count, step);
+		c.bind(exit);
+	}
+
+	// for (; count > 0; ctr++, count--)
+	inline void build_loop(native_asm& c, auto ctr, auto count, auto&& build)
+	{
+		asmjit::Label body = c.newLabel();
+		asmjit::Label exit = c.newLabel();
+
+		c.test(count, count);
+		c.jz(exit);
+		c.align(asmjit::AlignMode::kCode, 16);
+		c.bind(body);
+		build();
+		c.inc(ctr);
+		c.sub(count, 1);
+		c.ja(body);
+		c.bind(exit);
+	}
+#endif
 }
 
 // Build runtime function with asmjit::X86Assembler
diff --git a/rpcs3/Emu/RSX/Common/BufferUtils.cpp b/rpcs3/Emu/RSX/Common/BufferUtils.cpp
index ddd93ba85d..acb7b7b38c 100644
--- a/rpcs3/Emu/RSX/Common/BufferUtils.cpp
+++ b/rpcs3/Emu/RSX/Common/BufferUtils.cpp
@@ -28,17 +28,14 @@
 #if defined(_MSC_VER) || !defined(__SSE2__)
 #define PLAIN_FUNC
-#define SSSE3_FUNC
 #define SSE4_1_FUNC
 #define AVX2_FUNC
 #define AVX3_FUNC
 #else
 #ifndef __clang__
 #define PLAIN_FUNC __attribute__((optimize("no-tree-vectorize")))
-#define SSSE3_FUNC __attribute__((__target__("ssse3"))) __attribute__((optimize("tree-vectorize")))
 #else
 #define PLAIN_FUNC
-#define SSSE3_FUNC __attribute__((__target__("ssse3")))
 #endif
 #define SSE4_1_FUNC __attribute__((__target__("sse4.1")))
 #define AVX2_FUNC __attribute__((__target__("avx2")))
 #define AVX3_FUNC
@@ -139,115 +136,103 @@ namespace
 		}
 	}
 
-	template <bool Compare>
-	SSSE3_FUNC auto copy_data_swap_u32_ssse3(u32* dst, const u32* src, u32 count)
-	{
-		u32 result = 0;
-
-#ifdef __clang__
-		#pragma clang loop vectorize(enable) interleave(disable) unroll(disable)
-#endif
-		for (u32 i = 0; i < count; i++)
-		{
-			const u32 data = stx::se_storage<u32>::swap(src[i]);
-
-			if constexpr (Compare)
-			{
-				result |= data ^ dst[i];
-			}
-
-			dst[i] = data;
-		}
-
-		if constexpr (Compare)
-		{
-			return static_cast<bool>(result);
-		}
-	}
-
 #if defined(ARCH_X64)
-	template <bool Compare, uint Size, typename RT>
-	void build_copy_data_swap_u32_avx3(asmjit::x86::Assembler& c, std::array<x86::Gp, 4>& args, const RT& rmask, const RT& rload, const RT& rtest)
+	template <bool Compare, uint Size, bool Avx = true>
+	void build_copy_data_swap_u32_avx3(native_asm& c, native_args& args)
 	{
 		using namespace asmjit;
 
-		Label loop = c.newLabel();
-		Label tail = c.newLabel();
+		native_vec_t<Size> vdata{0};
+		native_vec_t<Size> vmask{1};
+		native_vec_t<Size> vcmp{2};
 
-		// Get start alignment offset
-		c.mov(args[3].r32(), args[0].r32());
-		c.and_(args[3].r32(), Size * 4 - 1);
+		// Load and broadcast shuffle mask
+		if constexpr (!Avx)
+			c.movaps(vmask, x86::oword_ptr(uptr(&s_bswap_u32_mask)));
+		if constexpr (Size == 16 && Avx)
+			c.vmovaps(vmask, x86::oword_ptr(uptr(&s_bswap_u32_mask)));
+		if constexpr (Size >= 32)
+			c.vbroadcasti32x4(vmask, x86::oword_ptr(uptr(&s_bswap_u32_mask)));
 
-		// Load and duplicate shuffle mask
-		c.vbroadcasti32x4(rmask, x86::oword_ptr(uptr(&s_bswap_u32_mask)));
-		if (Compare)
-			c.vpxor(x86::xmm2, x86::xmm2, x86::xmm2);
+		// Clear vcmp (bitwise inequality accumulator)
+		if constexpr (Compare && Avx)
+			c.vxorps(x86::xmm2, x86::xmm2, x86::xmm2);
+		if constexpr (Compare && !Avx)
+			c.xorps(x86::xmm2, x86::xmm2);
+		c.mov(args[3].r32(), -1);
+		c.xor_(x86::eax, x86::eax);
 
-		c.or_(x86::eax, -1);
-		// Small data: skip to tail (ignore alignment)
-		c.cmp(args[2].r32(), Size);
-		c.jbe(tail);
-
-		// Generate mask for first iteration, adjust args using alignment offset
-		c.sub(args[1], args[3]);
-		c.shr(args[3].r32(), 2);
-		c.shlx(x86::eax, x86::eax, args[3].r32());
-		c.kmovw(x86::k1, x86::eax);
-		c.and_(args[0], -Size * 4);
-		c.add(args[2].r32(), args[3].r32());
-
-		c.k(x86::k1).z().vmovdqu32(rload, x86::Mem(args[1], 0, Size * 4u));
-		c.vpshufb(rload, rload, rmask);
-		if (Compare)
-			c.k(x86::k1).z().vpxord(rtest, rload, x86::Mem(args[0], 0, Size * 4u));
-		c.k(x86::k1).vmovdqa32(x86::Mem(args[0], 0, Size * 4u), rload);
-		c.lea(args[0], x86::qword_ptr(args[0], Size * 4));
-		c.lea(args[1], x86::qword_ptr(args[1], Size * 4));
-		c.sub(args[2].r32(), Size);
-
-		c.or_(x86::eax, -1);
-		c.align(AlignMode::kCode, 16);
-
-		c.bind(loop);
-		c.cmp(args[2].r32(), Size);
-		c.jbe(tail);
-		c.vmovdqu32(rload, x86::Mem(args[1], 0, Size * 4u));
-		c.vpshufb(rload, rload, rmask);
-		if (Compare)
-			c.vpternlogd(rtest, rload, x86::Mem(args[0], 0, Size * 4u), 0xf6); // orAxorBC
-		c.vmovdqa32(x86::Mem(args[0], 0, Size * 4u), rload);
-		c.lea(args[0], x86::qword_ptr(args[0], Size * 4));
-		c.lea(args[1], x86::qword_ptr(args[1], Size * 4));
-		c.sub(args[2].r32(), Size);
-		c.jmp(loop);
-
-		c.bind(tail);
-		c.bzhi(x86::eax, x86::eax, args[2].r32());
-		c.kmovw(x86::k1, x86::eax);
-		c.k(x86::k1).z().vmovdqu32(rload, x86::Mem(args[1], 0, Size * 4u));
-		c.vpshufb(rload, rload, rmask);
-		if (Compare)
-			c.k(x86::k1).vpternlogd(rtest, rload, x86::Mem(args[0], 0, Size * 4u), 0xf6);
-		c.k(x86::k1).vmovdqu32(x86::Mem(args[0], 0, Size * 4u), rload);
-
-		if (Compare)
+		build_incomplete_loop(c, x86::eax, args[2].r32(), Size / 4, [&]
 		{
-			if constexpr (Size != 16)
+			if constexpr (Avx)
 			{
-				c.vptest(rtest, rtest);
+				c.vmovdqu32(vdata, x86::ptr(args[1], x86::rax, 2, 0, Size));
+				c.vpshufb(vdata, vdata, vmask);
+				if constexpr (Compare)
+					c.vpternlogd(vcmp, vdata, x86::ptr(args[0], x86::rax, 2, 0, Size), 0xf6); // orAxorBC
+				c.vmovdqu32(x86::ptr(args[0], x86::rax, 2, 0, Size), vdata);
 			}
 			else
 			{
-				c.vptestmd(x86::k1, rtest, rtest);
+				c.movdqu(vdata, x86::oword_ptr(args[1], x86::rax, 2, 0));
+				c.pshufb(vdata, vmask);
+				if constexpr (Compare)
+				{
+					c.movups(x86::xmm3, x86::oword_ptr(args[0], x86::rax, 2, 0));
+					c.xorps(x86::xmm3, vdata);
+					c.orps(vcmp, x86::xmm3);
+				}
+				c.movups(x86::oword_ptr(args[0], x86::rax, 2, 0), vdata);
+			}
+		});
+
+		if constexpr (Avx)
+		{
+			c.bzhi(args[3].r32(), args[3].r32(), args[2].r32());
+			c.kmovw(x86::k1, args[3].r32());
+			c.k(x86::k1).z().vmovdqu32(vdata, x86::ptr(args[1], x86::rax, 2, 0, Size));
+			c.vpshufb(vdata, vdata, vmask);
+			if constexpr (Compare)
+				c.k(x86::k1).vpternlogd(vcmp, vdata,
+					x86::ptr(args[0], x86::rax, 2, 0, Size), 0xf6);
+			c.k(x86::k1).vmovdqu32(x86::ptr(args[0], x86::rax, 2, 0, Size), vdata);
+		}
+		else
+		{
+			build_loop(c, x86::eax, args[2].r32(), [&]
+			{
+				c.movd(vdata, x86::dword_ptr(args[1], x86::rax, 2, 0));
+				c.pshufb(vdata, vmask);
+				if constexpr (Compare)
+				{
+					c.movd(x86::xmm3, x86::dword_ptr(args[0], x86::rax, 2, 0));
+					c.pxor(x86::xmm3, vdata);
+					c.por(vcmp, x86::xmm3);
+				}
+				c.movd(x86::dword_ptr(args[0], x86::rax, 2, 0), vdata);
+			});
+		}
+
+		if (Compare)
+		{
+			if constexpr (!Avx)
+			{
+				c.ptest(vcmp, vcmp);
+			}
+			else if constexpr (Size != 64)
+			{
+				c.vptest(vcmp, vcmp);
+			}
+			else
+			{
+				c.vptestmd(x86::k1, vcmp, vcmp);
 				c.ktestw(x86::k1, x86::k1);
 			}
 
 			c.setnz(x86::al);
 		}
 
-#ifndef __AVX__
-		c.vzeroupper();
-#endif
+		if constexpr (Avx)
+			c.vzeroupper();
 		c.ret();
 	}
 
@@ -260,17 +245,17 @@
 		{
 			if (utils::has_avx512_icl())
 			{
-				build_copy_data_swap_u32_avx3<Compare, 16>(c, args, x86::zmm0, x86::zmm1, x86::zmm2);
+				build_copy_data_swap_u32_avx3<Compare, 64>(c, args);
 				return;
 			}
 
-			build_copy_data_swap_u32_avx3<Compare, 8>(c, args, x86::ymm0, x86::ymm1, x86::ymm2);
+			build_copy_data_swap_u32_avx3<Compare, 32>(c, args);
 			return;
 		}
 
-		if (utils::has_ssse3())
+		if (utils::has_sse41())
 		{
-			c.jmp(&copy_data_swap_u32_ssse3<Compare>);
+			build_copy_data_swap_u32_avx3<Compare, 16, false>(c, args);
 			return;
 		}
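
Usage note (not part of the patch): the sketch below illustrates how the new build_loop helper is meant to compose with the JIT plumbing visible in this diff. It follows the conventions of build_copy_data_swap_u32_avx3 above: args[] holds the native argument registers, args[2].r32() is a 32-bit element count, and x86::eax serves as the loop counter. The function name copy_bytes, its byte-copy body, and the exact build_function_asm invocation (the existing RPCS3 helper that compiles a builder lambda into a callable) are illustrative assumptions, not part of this commit:

	// Hypothetical example: emit "void copy_bytes(u8* dst, const u8* src, u32 n)"
	// using build_loop. args[0] = dst, args[1] = src, args[2] = n; eax is the
	// counter, so dst/src are indexed via rax (32-bit ops on eax zero-extend).
	static const auto copy_bytes = build_function_asm<void(*)(u8*, const u8*, u32)>(
		"copy_bytes", [](native_asm& c, native_args& args)
	{
		using namespace asmjit;

		c.xor_(x86::eax, x86::eax); // ctr = 0

		// Expands to: test count; jz exit; aligned body; inc ctr; sub count, 1; ja body
		build_loop(c, x86::eax, args[2].r32(), [&]
		{
			c.movzx(x86::edx, x86::byte_ptr(args[1], x86::rax)); // edx = src[ctr]
			c.mov(x86::byte_ptr(args[0], x86::rax), x86::dl);    // dst[ctr] = dl
		});

		c.ret();
	});

For bulk transforms, build_incomplete_loop runs a wide vectorized body for every complete step of `step` elements and restores the leftover element count into the count register on exit, so a scalar build_loop can finish the remainder; that main-loop/tail split is exactly what the SSE4.1 path of build_copy_data_swap_u32_avx3 does above.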