1
0
mirror of https://github.com/RPCS3/rpcs3.git synced 2024-11-25 12:12:50 +01:00

SPU: Use REP MOVSB in do_dma_transfer

- Try to use REP MOVSB when the size of the transfer is above a certain threshold
- This threshold is determined by the ERMS and FSRM cpuid flags
- The threshold values are (roughly) taken from GLIBC
- A threshold of 0xFFFFFFFF indicates that the cpu has neither flag
This commit is contained in:
Malcolm Jestadt 2021-10-29 23:55:47 -04:00 committed by Ivan
parent 1c014299eb
commit 31a5a77ae5
3 changed files with 138 additions and 60 deletions

View File

@ -90,6 +90,22 @@ static const bool s_tsx_avx = utils::has_avx();
// For special case
static const bool s_tsx_haswell = utils::has_rtm() && !utils::has_mpx();
// Threshold for when rep mosvb is expected to outperform simd copies
// The threshold will be 0xFFFFFFFF when the performance of rep movsb is expected to be bad
static const u32 s_rep_movsb_threshold = utils::get_rep_movsb_threshold();
#ifndef _MSC_VER
static FORCE_INLINE void __movsb(unsigned char * Dst, const unsigned char * Src, size_t Size)
{
__asm__ __volatile__
(
"rep; movsb" :
[Dst] "=D" (Dst), [Src] "=S" (Src), [Size] "=c" (Size) :
"[Dst]" (Dst), "[Src]" (Src), "[Size]" (Size)
);
}
#endif
static FORCE_INLINE bool cmp_rdata_avx(const __m256i* lhs, const __m256i* rhs)
{
#if defined(_MSC_VER) || defined(__AVX__)
@ -2234,6 +2250,14 @@ void spu_thread::do_dma_transfer(spu_thread* _this, const spu_mfc_cmd& args, u8*
// Split locking + transfer in two parts (before 64K border, and after it)
vm::range_lock(range_lock, range_addr, size0);
if (size > s_rep_movsb_threshold)
{
__movsb(dst, src, size0);
dst += size0;
src += size0;
}
else
{
// Avoid unaligned stores in mov_rdata_avx
if (reinterpret_cast<u64>(dst) & 0x10)
{
@ -2261,6 +2285,7 @@ void spu_thread::do_dma_transfer(spu_thread* _this, const spu_mfc_cmd& args, u8*
src += 16;
size0 -= 16;
}
}
range_lock->release(0);
range_addr = nexta;
@ -2268,6 +2293,12 @@ void spu_thread::do_dma_transfer(spu_thread* _this, const spu_mfc_cmd& args, u8*
vm::range_lock(range_lock, range_addr, range_end - range_addr);
if (size > s_rep_movsb_threshold)
{
__movsb(dst, src, size);
}
else
{
// Avoid unaligned stores in mov_rdata_avx
if (reinterpret_cast<u64>(dst) & 0x10)
{
@ -2295,6 +2326,7 @@ void spu_thread::do_dma_transfer(spu_thread* _this, const spu_mfc_cmd& args, u8*
src += 16;
size -= 16;
}
}
range_lock->release(0);
break;
@ -2337,6 +2369,12 @@ plain_access:
break;
}
default:
{
if (size > s_rep_movsb_threshold)
{
__movsb(dst, src, size);
}
else
{
// Avoid unaligned stores in mov_rdata_avx
if (reinterpret_cast<u64>(dst) & 0x10)
@ -2365,6 +2403,7 @@ plain_access:
src += 16;
size -= 16;
}
}
break;
}

View File

@ -138,6 +138,38 @@ bool utils::has_fma4()
return g_value;
}
bool utils::has_erms()
{
static const bool g_value = get_cpuid(0, 0)[0] >= 0x7 && (get_cpuid(7, 0)[1] & 0x200) == 0x200;
return g_value;
}
bool utils::has_fsrm()
{
static const bool g_value = get_cpuid(0, 0)[0] >= 0x7 && (get_cpuid(7, 0)[3] & 0x10) == 0x10;
return g_value;
}
u32 utils::get_rep_movsb_threshold()
{
static const u32 g_value = []()
{
u32 thresh_value = 0xFFFFFFFF;
if (has_fsrm())
{
thresh_value = 2047;
}
else if (has_erms())
{
thresh_value = 4095;
}
return thresh_value;
}();
return g_value;
}
std::string utils::get_cpu_brand()
{
std::string brand;

View File

@ -39,6 +39,10 @@ namespace utils
bool has_fma4();
bool has_erms();
bool has_fsrm();
std::string get_cpu_brand();
std::string get_system_info();
@ -57,5 +61,8 @@ namespace utils
u32 get_cpu_model();
// A threshold of 0xFFFFFFFF means that the rep movsb is expected to be slow on this platform
u32 get_rep_movsb_threshold();
extern const u64 main_tid;
}