From 0db9850a730d439a530e810c4e08fccef47499c5 Mon Sep 17 00:00:00 2001
From: Nekotekina
Date: Mon, 24 Jan 2022 22:22:42 +0300
Subject: [PATCH] Add loop building utilities for ASMJIT

Refactor copy_data_swap_u32 a bit
---
 Utilities/JIT.h                      |  54 ++++++++
 rpcs3/Emu/RSX/Common/BufferUtils.cpp | 181 ++++++++++++---
 2 files changed, 137 insertions(+), 98 deletions(-)

diff --git a/Utilities/JIT.h b/Utilities/JIT.h
index 3e1a1f8cae..4c70785f2e 100644
--- a/Utilities/JIT.h
+++ b/Utilities/JIT.h
@@ -209,6 +209,60 @@ namespace asmjit
 		static_cast<void>(args);
 #endif
 	}
+
+#if defined(ARCH_X64)
+	template <uint Size>
+	struct native_vec;
+
+	template <>
+	struct native_vec<16> { using type = x86::Xmm; };
+
+	template <>
+	struct native_vec<32> { using type = x86::Ymm; };
+
+	template <>
+	struct native_vec<64> { using type = x86::Zmm; };
+
+	template <uint Size>
+	using native_vec_t = typename native_vec<Size>::type;
+
+	// if (count > step) { for (; ctr < (count - step); ctr += step) {...} count -= ctr; }
+	inline void build_incomplete_loop(native_asm& c, auto ctr, auto count, u32 step, auto&& build)
+	{
+		asmjit::Label body = c.newLabel();
+		asmjit::Label exit = c.newLabel();
+
+		ensure((step & (step - 1)) == 0);
+		c.cmp(count, step);
+		c.jbe(exit);
+		c.sub(count, step);
+		c.align(asmjit::AlignMode::kCode, 16);
+		c.bind(body);
+		build();
+		c.add(ctr, step);
+		c.sub(count, step);
+		c.ja(body);
+		c.add(count, step);
+		c.bind(exit);
+	}
+
+	// for (; count > 0; ctr++, count--)
+	inline void build_loop(native_asm& c, auto ctr, auto count, auto&& build)
+	{
+		asmjit::Label body = c.newLabel();
+		asmjit::Label exit = c.newLabel();
+
+		c.test(count, count);
+		c.jz(exit);
+		c.align(asmjit::AlignMode::kCode, 16);
+		c.bind(body);
+		build();
+		c.inc(ctr);
+		c.sub(count, 1);
+		c.ja(body);
+		c.bind(exit);
+	}
+#endif
 }
 
 // Build runtime function with asmjit::X86Assembler
diff --git a/rpcs3/Emu/RSX/Common/BufferUtils.cpp b/rpcs3/Emu/RSX/Common/BufferUtils.cpp
index ddd93ba85d..acb7b7b38c 100644
--- a/rpcs3/Emu/RSX/Common/BufferUtils.cpp
+++ b/rpcs3/Emu/RSX/Common/BufferUtils.cpp
@@ -28,17 +28,14 @@
 #if defined(_MSC_VER) || !defined(__SSE2__)
 #define PLAIN_FUNC
-#define SSSE3_FUNC
 #define SSE4_1_FUNC
 #define AVX2_FUNC
 #define AVX3_FUNC
 #else
 #ifndef __clang__
 #define PLAIN_FUNC __attribute__((optimize("no-tree-vectorize")))
-#define SSSE3_FUNC __attribute__((__target__("ssse3"))) __attribute__((optimize("tree-vectorize")))
 #else
 #define PLAIN_FUNC
-#define SSSE3_FUNC __attribute__((__target__("ssse3")))
 #endif
 #define SSE4_1_FUNC __attribute__((__target__("sse4.1")))
 #define AVX2_FUNC __attribute__((__target__("avx2")))
 #define AVX3_FUNC
@@ -139,115 +136,103 @@ namespace
 		}
 	}
 
-	template <bool Compare>
-	SSSE3_FUNC auto copy_data_swap_u32_ssse3(u32* dst, const u32* src, u32 count)
-	{
-		u32 result = 0;
-
-#ifdef __clang__
-		#pragma clang loop vectorize(enable) interleave(disable) unroll(disable)
-#endif
-		for (u32 i = 0; i < count; i++)
-		{
-			const u32 data = stx::se_storage<u32>::swap(src[i]);
-
-			if constexpr (Compare)
-			{
-				result |= data ^ dst[i];
-			}
-
-			dst[i] = data;
-		}
-
-		if constexpr (Compare)
-		{
-			return static_cast<bool>(result);
-		}
-	}
-
 #if defined(ARCH_X64)
-	template <bool Compare, uint Size, typename RT>
-	void build_copy_data_swap_u32_avx3(asmjit::x86::Assembler& c, std::array<x86::Gp, 4>& args, const RT& rmask, const RT& rload, const RT& rtest)
+	template <bool Compare, uint Size, bool Avx = true>
+	void build_copy_data_swap_u32_avx3(native_asm& c, native_args& args)
 	{
 		using namespace asmjit;
 
-		Label loop = c.newLabel();
-		Label tail = c.newLabel();
+		native_vec_t<Size> vdata{0};
+		native_vec_t<Size> vmask{1};
+		native_vec_t<Size> vcmp{2};
 
-		// Get start alignment offset
-		c.mov(args[3].r32(), args[0].r32());
-		c.and_(args[3].r32(), Size * 4 - 1);
+		// Load and broadcast shuffle mask
+		if constexpr (!Avx)
+			c.movaps(vmask, x86::oword_ptr(uptr(&s_bswap_u32_mask)));
+		if constexpr (Size == 16 && Avx)
+			c.vmovaps(vmask, x86::oword_ptr(uptr(&s_bswap_u32_mask)));
+		if constexpr (Size >= 32)
+			c.vbroadcasti32x4(vmask, x86::oword_ptr(uptr(&s_bswap_u32_mask)));
 
-		// Load and duplicate shuffle mask
-		c.vbroadcasti32x4(rmask, x86::oword_ptr(uptr(&s_bswap_u32_mask)));
-		if (Compare)
-			c.vpxor(x86::xmm2, x86::xmm2, x86::xmm2);
+		// Clear vcmp (bitwise inequality accumulator)
+		if constexpr (Compare && Avx)
+			c.vxorps(x86::xmm2, x86::xmm2, x86::xmm2);
+		if constexpr (Compare && !Avx)
+			c.xorps(x86::xmm2, x86::xmm2);
+		c.mov(args[3].r32(), -1);
+		c.xor_(x86::eax, x86::eax);
 
-		c.or_(x86::eax, -1);
-		// Small data: skip to tail (ignore alignment)
-		c.cmp(args[2].r32(), Size);
-		c.jbe(tail);
-
-		// Generate mask for first iteration, adjust args using alignment offset
-		c.sub(args[1], args[3]);
-		c.shr(args[3].r32(), 2);
-		c.shlx(x86::eax, x86::eax, args[3].r32());
-		c.kmovw(x86::k1, x86::eax);
-		c.and_(args[0], -Size * 4);
-		c.add(args[2].r32(), args[3].r32());
-
-		c.k(x86::k1).z().vmovdqu32(rload, x86::Mem(args[1], 0, Size * 4u));
-		c.vpshufb(rload, rload, rmask);
-		if (Compare)
-			c.k(x86::k1).z().vpxord(rtest, rload, x86::Mem(args[0], 0, Size * 4u));
-		c.k(x86::k1).vmovdqa32(x86::Mem(args[0], 0, Size * 4u), rload);
-		c.lea(args[0], x86::qword_ptr(args[0], Size * 4));
-		c.lea(args[1], x86::qword_ptr(args[1], Size * 4));
-		c.sub(args[2].r32(), Size);
-
-		c.or_(x86::eax, -1);
-		c.align(AlignMode::kCode, 16);
-
-		c.bind(loop);
-		c.cmp(args[2].r32(), Size);
-		c.jbe(tail);
-		c.vmovdqu32(rload, x86::Mem(args[1], 0, Size * 4u));
-		c.vpshufb(rload, rload, rmask);
-		if (Compare)
-			c.vpternlogd(rtest, rload, x86::Mem(args[0], 0, Size * 4u), 0xf6); // orAxorBC
-		c.vmovdqa32(x86::Mem(args[0], 0, Size * 4u), rload);
-		c.lea(args[0], x86::qword_ptr(args[0], Size * 4));
-		c.lea(args[1], x86::qword_ptr(args[1], Size * 4));
-		c.sub(args[2].r32(), Size);
-		c.jmp(loop);
-
-		c.bind(tail);
-		c.bzhi(x86::eax, x86::eax, args[2].r32());
-		c.kmovw(x86::k1, x86::eax);
-		c.k(x86::k1).z().vmovdqu32(rload, x86::Mem(args[1], 0, Size * 4u));
-		c.vpshufb(rload, rload, rmask);
-		if (Compare)
-			c.k(x86::k1).vpternlogd(rtest, rload, x86::Mem(args[0], 0, Size * 4u), 0xf6);
-		c.k(x86::k1).vmovdqu32(x86::Mem(args[0], 0, Size * 4u), rload);
-
-		if (Compare)
+		build_incomplete_loop(c, x86::eax, args[2].r32(), Size / 4, [&]
 		{
-			if constexpr (Size != 16)
+			if constexpr (Avx)
 			{
-				c.vptest(rtest, rtest);
+				c.vmovdqu32(vdata, x86::ptr(args[1], x86::rax, 2, 0, Size));
+				c.vpshufb(vdata, vdata, vmask);
+				if constexpr (Compare)
+					c.vpternlogd(vcmp, vdata, x86::ptr(args[0], x86::rax, 2, 0, Size), 0xf6); // orAxorBC
+				c.vmovdqu32(x86::ptr(args[0], x86::rax, 2, 0, Size), vdata);
 			}
 			else
 			{
-				c.vptestmd(x86::k1, rtest, rtest);
+				c.movdqu(vdata, x86::oword_ptr(args[1], x86::rax, 2, 0));
+				c.pshufb(vdata, vmask);
+				if constexpr (Compare)
+				{
+					c.movups(x86::xmm3, x86::oword_ptr(args[0], x86::rax, 2, 0));
+					c.xorps(x86::xmm3, vdata);
+					c.orps(vcmp, x86::xmm3);
+				}
+				c.movups(x86::oword_ptr(args[0], x86::rax, 2, 0), vdata);
+			}
+		});
+
+		if constexpr (Avx)
+		{
+			c.bzhi(args[3].r32(), args[3].r32(), args[2].r32());
+			c.kmovw(x86::k1, args[3].r32());
+			c.k(x86::k1).z().vmovdqu32(vdata, x86::ptr(args[1], x86::rax, 2, 0, Size));
+			c.vpshufb(vdata, vdata, vmask);
+			if constexpr (Compare)
+				c.k(x86::k1).vpternlogd(vcmp, vdata,
+					x86::ptr(args[0], x86::rax, 2, 0, Size), 0xf6);
+			c.k(x86::k1).vmovdqu32(x86::ptr(args[0], x86::rax, 2, 0, Size), vdata);
+		}
+		else
+		{
+			build_loop(c, x86::eax, args[2].r32(), [&]
+			{
+				c.movd(vdata, x86::dword_ptr(args[1], x86::rax, 2, 0));
+				c.pshufb(vdata, vmask);
+				if constexpr (Compare)
+				{
+					c.movd(x86::xmm3, x86::dword_ptr(args[0], x86::rax, 2, 0));
+					c.pxor(x86::xmm3, vdata);
+					c.por(vcmp, x86::xmm3);
+				}
+				c.movd(x86::dword_ptr(args[0], x86::rax, 2, 0), vdata);
+			});
+		}
+
+		if (Compare)
+		{
+			if constexpr (!Avx)
+			{
+				c.ptest(vcmp, vcmp);
+			}
+			else if constexpr (Size != 64)
+			{
+				c.vptest(vcmp, vcmp);
+			}
+			else
+			{
+				c.vptestmd(x86::k1, vcmp, vcmp);
 				c.ktestw(x86::k1, x86::k1);
 			}
 
 			c.setnz(x86::al);
 		}
 
-#ifndef __AVX__
-		c.vzeroupper();
-#endif
+		if constexpr (Avx)
+			c.vzeroupper();
 		c.ret();
 	}
 
@@ -260,17 +245,17 @@
 		{
 			if (utils::has_avx512_icl())
 			{
-				build_copy_data_swap_u32_avx3<Compare, 16>(c, args, x86::zmm0, x86::zmm1, x86::zmm2);
+				build_copy_data_swap_u32_avx3<Compare, 64>(c, args);
 				return;
 			}
 
-			build_copy_data_swap_u32_avx3<Compare, 8>(c, args, x86::ymm0, x86::ymm1, x86::ymm2);
+			build_copy_data_swap_u32_avx3<Compare, 32>(c, args);
 			return;
 		}
 
-		if (utils::has_ssse3())
+		if (utils::has_sse41())
 		{
-			c.jmp(&copy_data_swap_u32_ssse3<Compare>);
+			build_copy_data_swap_u32_avx3<Compare, 16, false>(c, args);
 			return;
 		}
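
Usage note (not part of the patch): the sketch below illustrates how the new build_loop helper is meant to compose with the JIT plumbing visible in this diff. It follows the conventions of build_copy_data_swap_u32_avx3 above: args[] holds the native argument registers, args[2].r32() is a 32-bit element count, and x86::eax serves as the loop counter. The function name copy_bytes, its byte-copy body, and the exact build_function_asm invocation (the existing RPCS3 helper that compiles a builder lambda into a callable) are illustrative assumptions, not part of this commit:

	// Hypothetical example: emit "void copy_bytes(u8* dst, const u8* src, u32 n)"
	// using build_loop. args[0] = dst, args[1] = src, args[2] = n; eax is the
	// counter, so dst/src are indexed via rax (32-bit ops on eax zero-extend).
	static const auto copy_bytes = build_function_asm<void(*)(u8*, const u8*, u32)>(
		"copy_bytes", [](native_asm& c, native_args& args)
	{
		using namespace asmjit;

		c.xor_(x86::eax, x86::eax); // ctr = 0

		// Expands to: test count; jz exit; aligned body; inc ctr; sub count, 1; ja body
		build_loop(c, x86::eax, args[2].r32(), [&]
		{
			c.movzx(x86::edx, x86::byte_ptr(args[1], x86::rax)); // edx = src[ctr]
			c.mov(x86::byte_ptr(args[0], x86::rax), x86::dl);    // dst[ctr] = dl
		});

		c.ret();
	});

For bulk transforms, build_incomplete_loop runs a wide vectorized body for every complete step of `step` elements and restores the leftover element count into the count register on exit, so a scalar build_loop can finish the remainder; that main-loop/tail split is exactly what the SSE4.1 path of build_copy_data_swap_u32_avx3 does above.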