diff --git a/rpcs3/Emu/Cell/SPUThread.cpp b/rpcs3/Emu/Cell/SPUThread.cpp
index 66d28ded23..4f5e7037c0 100644
--- a/rpcs3/Emu/Cell/SPUThread.cpp
+++ b/rpcs3/Emu/Cell/SPUThread.cpp
@@ -90,6 +90,22 @@ static const bool s_tsx_avx = utils::has_avx();
 // For special case
 static const bool s_tsx_haswell = utils::has_rtm() && !utils::has_mpx();
 
+// Threshold for when rep movsb is expected to outperform SIMD copies
+// The threshold will be 0xFFFFFFFF when the performance of rep movsb is expected to be bad
+static const u32 s_rep_movsb_threshold = utils::get_rep_movsb_threshold();
+
+#ifndef _MSC_VER
+static FORCE_INLINE void __movsb(unsigned char * Dst, const unsigned char * Src, size_t Size)
+{
+	__asm__ __volatile__
+	(
+		"rep; movsb" :
+		[Dst] "=D" (Dst), [Src] "=S" (Src), [Size] "=c" (Size) :
+		"[Dst]" (Dst), "[Src]" (Src), "[Size]" (Size)
+	);
+}
+#endif
+
 static FORCE_INLINE bool cmp_rdata_avx(const __m256i* lhs, const __m256i* rhs)
 {
 #if defined(_MSC_VER) || defined(__AVX__)
@@ -2234,32 +2250,41 @@ void spu_thread::do_dma_transfer(spu_thread* _this, const spu_mfc_cmd& args, u8* ls)
 				// Split locking + transfer in two parts (before 64K border, and after it)
 				vm::range_lock(range_lock, range_addr, size0);
 
-				// Avoid unaligned stores in mov_rdata_avx
-				if (reinterpret_cast<u64>(dst) & 0x10)
+				if (size > s_rep_movsb_threshold)
 				{
-					*reinterpret_cast<v128*>(dst) = *reinterpret_cast<const v128*>(src);
-
-					dst += 16;
-					src += 16;
-					size0 -= 16;
+					__movsb(dst, src, size0);
+					dst += size0;
+					src += size0;
 				}
-
-				while (size0 >= 128)
+				else
 				{
-					mov_rdata(*reinterpret_cast<spu_rdata_t*>(dst), *reinterpret_cast<const spu_rdata_t*>(src));
+					// Avoid unaligned stores in mov_rdata_avx
+					if (reinterpret_cast<u64>(dst) & 0x10)
+					{
+						*reinterpret_cast<v128*>(dst) = *reinterpret_cast<const v128*>(src);
 
-					dst += 128;
-					src += 128;
-					size0 -= 128;
-				}
+						dst += 16;
+						src += 16;
+						size0 -= 16;
+					}
 
-				while (size0)
-				{
-					*reinterpret_cast<v128*>(dst) = *reinterpret_cast<const v128*>(src);
+					while (size0 >= 128)
+					{
+						mov_rdata(*reinterpret_cast<spu_rdata_t*>(dst), *reinterpret_cast<const spu_rdata_t*>(src));
 
-					dst += 16;
-					src += 16;
-					size0 -= 16;
+						dst += 128;
+						src += 128;
+						size0 -= 128;
+					}
+
+					while (size0)
+					{
+						*reinterpret_cast<v128*>(dst) = *reinterpret_cast<const v128*>(src);
+
+						dst += 16;
+						src += 16;
+						size0 -= 16;
+					}
 				}
 
 				range_lock->release(0);
@@ -2268,32 +2293,39 @@ void spu_thread::do_dma_transfer(spu_thread* _this, const spu_mfc_cmd& args, u8* ls)
 
 			vm::range_lock(range_lock, range_addr, range_end - range_addr);
 
-			// Avoid unaligned stores in mov_rdata_avx
-			if (reinterpret_cast<u64>(dst) & 0x10)
+			if (size > s_rep_movsb_threshold)
 			{
-				*reinterpret_cast<v128*>(dst) = *reinterpret_cast<const v128*>(src);
-
-				dst += 16;
-				src += 16;
-				size -= 16;
+				__movsb(dst, src, size);
 			}
-
-			while (size >= 128)
+			else
 			{
-				mov_rdata(*reinterpret_cast<spu_rdata_t*>(dst), *reinterpret_cast<const spu_rdata_t*>(src));
+				// Avoid unaligned stores in mov_rdata_avx
+				if (reinterpret_cast<u64>(dst) & 0x10)
+				{
+					*reinterpret_cast<v128*>(dst) = *reinterpret_cast<const v128*>(src);
 
-				dst += 128;
-				src += 128;
-				size -= 128;
-			}
+					dst += 16;
+					src += 16;
+					size -= 16;
+				}
 
-			while (size)
-			{
-				*reinterpret_cast<v128*>(dst) = *reinterpret_cast<const v128*>(src);
+				while (size >= 128)
+				{
+					mov_rdata(*reinterpret_cast<spu_rdata_t*>(dst), *reinterpret_cast<const spu_rdata_t*>(src));
 
-				dst += 16;
-				src += 16;
-				size -= 16;
+					dst += 128;
+					src += 128;
+					size -= 128;
+				}
+
+				while (size)
+				{
+					*reinterpret_cast<v128*>(dst) = *reinterpret_cast<const v128*>(src);
+
+					dst += 16;
+					src += 16;
+					size -= 16;
+				}
 			}
 
 			range_lock->release(0);
@@ -2338,32 +2370,39 @@ plain_access:
 	}
 	default:
 	{
-		// Avoid unaligned stores in mov_rdata_avx
-		if (reinterpret_cast<u64>(dst) & 0x10)
+		if (size > s_rep_movsb_threshold)
 		{
-			*reinterpret_cast<v128*>(dst) = *reinterpret_cast<const v128*>(src);
-
-			dst += 16;
-			src += 16;
-			size -= 16;
+			__movsb(dst, src, size);
 		}
-
-		while (size >= 128)
+		else
 		{
-			mov_rdata(*reinterpret_cast<spu_rdata_t*>(dst), *reinterpret_cast<const spu_rdata_t*>(src));
+			// Avoid unaligned stores in mov_rdata_avx
+			if (reinterpret_cast<u64>(dst) & 0x10)
+			{
+				*reinterpret_cast<v128*>(dst) = *reinterpret_cast<const v128*>(src);
 
-			dst += 128;
-			src += 128;
-			size -= 128;
-		}
+				dst += 16;
+				src += 16;
+				size -= 16;
+			}
 
-		while (size)
-		{
-			*reinterpret_cast<v128*>(dst) = *reinterpret_cast<const v128*>(src);
+			while (size >= 128)
+			{
+				mov_rdata(*reinterpret_cast<spu_rdata_t*>(dst), *reinterpret_cast<const spu_rdata_t*>(src));
 
-			dst += 16;
-			src += 16;
-			size -= 16;
+				dst += 128;
+				src += 128;
+				size -= 128;
+			}
+
+			while (size)
+			{
+				*reinterpret_cast<v128*>(dst) = *reinterpret_cast<const v128*>(src);
+
+				dst += 16;
+				src += 16;
+				size -= 16;
+			}
 		}
 
 		break;
diff --git a/rpcs3/util/sysinfo.cpp b/rpcs3/util/sysinfo.cpp
index 318ec013d6..fad4ee2fe2 100755
--- a/rpcs3/util/sysinfo.cpp
+++ b/rpcs3/util/sysinfo.cpp
@@ -138,6 +138,38 @@ bool utils::has_fma4()
 	return g_value;
 }
 
+bool utils::has_erms()
+{
+	static const bool g_value = get_cpuid(0, 0)[0] >= 0x7 && (get_cpuid(7, 0)[1] & 0x200) == 0x200;
+	return g_value;
+}
+
+bool utils::has_fsrm()
+{
+	static const bool g_value = get_cpuid(0, 0)[0] >= 0x7 && (get_cpuid(7, 0)[3] & 0x10) == 0x10;
+	return g_value;
+}
+
+u32 utils::get_rep_movsb_threshold()
+{
+	static const u32 g_value = []()
+	{
+		u32 thresh_value = 0xFFFFFFFF;
+		if (has_fsrm())
+		{
+			thresh_value = 2047;
+		}
+		else if (has_erms())
+		{
+			thresh_value = 4095;
+		}
+
+		return thresh_value;
+	}();
+
+	return g_value;
+}
+
 std::string utils::get_cpu_brand()
 {
 	std::string brand;
diff --git a/rpcs3/util/sysinfo.hpp b/rpcs3/util/sysinfo.hpp
index 7925e3e1f9..4de9030164 100755
--- a/rpcs3/util/sysinfo.hpp
+++ b/rpcs3/util/sysinfo.hpp
@@ -39,6 +39,10 @@ namespace utils
 
 	bool has_fma4();
 
+	bool has_erms();
+
+	bool has_fsrm();
+
 	std::string get_cpu_brand();
 
 	std::string get_system_info();
@@ -57,5 +61,8 @@ namespace utils
 
 	u32 get_cpu_model();
 
+	// A threshold of 0xFFFFFFFF means that rep movsb is expected to be slow on this platform
+	u32 get_rep_movsb_threshold();
+
 	extern const u64 main_tid;
 }
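
For reference, the dispatch that all three SPUThread.cpp hunks implement reduces to the standalone sketch below. It is illustrative only: copy_dispatch, rep_movsb_copy, and the hard-coded threshold (the FSRM value from the patch) are not part of the patch, and std::memcpy stands in for the mov_rdata()/v128 SIMD loops, which additionally require 16-byte-aligned, 16-byte-multiple transfers. The GNU asm here is written with "+" read-write constraints and a "memory" clobber, a common way to express rep movsb; the patch itself uses matching input/output constraints instead.

#include <cstddef>
#include <cstring>
#ifdef _MSC_VER
#include <intrin.h> // provides __movsb as a compiler intrinsic on MSVC
#endif

static const std::size_t rep_movsb_threshold = 2047; // FSRM value from the patch

static void rep_movsb_copy(unsigned char* dst, const unsigned char* src, std::size_t size)
{
#ifdef _MSC_VER
	__movsb(dst, src, size);
#else
	// rep movsb copies RCX bytes from [RSI] to [RDI], incrementing both
	__asm__ __volatile__("rep; movsb"
		: "+D" (dst), "+S" (src), "+c" (size)
		:
		: "memory");
#endif
}

static void copy_dispatch(unsigned char* dst, const unsigned char* src, std::size_t size)
{
	if (size > rep_movsb_threshold)
		rep_movsb_copy(dst, src, size); // large copy: the microcoded string move wins
	else
		std::memcpy(dst, src, size);    // stand-in for the patch's SIMD loops
}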
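The sysinfo.cpp checks decode CPUID leaf 7, subleaf 0: ERMS (Enhanced REP MOVSB/STOSB) is EBX bit 9 (0x200) and FSRM (Fast Short REP MOVSB) is EDX bit 4 (0x10), each guarded by a check that leaf 7 exists at all. A minimal sketch of the same checks using GCC/Clang's <cpuid.h> instead of the emulator's get_cpuid() wrapper (function names are illustrative):

#include <cpuid.h>
#include <cstdio>

// CPUID leaf 7, subleaf 0: EBX bit 9 = ERMS
static bool cpu_has_erms()
{
	unsigned eax = 0, ebx = 0, ecx = 0, edx = 0;
	// __get_cpuid_count() returns 0 when leaf 7 is unsupported,
	// mirroring the patch's get_cpuid(0, 0)[0] >= 0x7 guard
	return __get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx) && (ebx & (1u << 9));
}

// CPUID leaf 7, subleaf 0: EDX bit 4 = FSRM
static bool cpu_has_fsrm()
{
	unsigned eax = 0, ebx = 0, ecx = 0, edx = 0;
	return __get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx) && (edx & (1u << 4));
}

int main()
{
	std::printf("ERMS: %d, FSRM: %d\n", cpu_has_erms() ? 1 : 0, cpu_has_fsrm() ? 1 : 0);
}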