From d304b5239190518215d82f5ce90d1b3c7cd9b60a Mon Sep 17 00:00:00 2001 From: Malcolm Jestadt Date: Sun, 29 Aug 2021 00:48:00 -0400 Subject: [PATCH] SPU LLVM: Add VNNI optimized variant of sumb - Uses vpdpbusd to horizontally add values; for some reason this is much faster than the normal horizontal add instructions. --- rpcs3/Emu/CPU/CPUTranslator.h | 12 ++++++++++++ rpcs3/Emu/Cell/SPURecompiler.cpp | 12 ++++++++++++ 2 files changed, 24 insertions(+) diff --git a/rpcs3/Emu/CPU/CPUTranslator.h b/rpcs3/Emu/CPU/CPUTranslator.h index be5d0349e8..c062fbb936 100644 --- a/rpcs3/Emu/CPU/CPUTranslator.h +++ b/rpcs3/Emu/CPU/CPUTranslator.h @@ -2828,6 +2828,18 @@ public: return result; } + template + value_t vpdpbusd(T1 a, T2 b, T3 c) + { + value_t result; + + const auto data0 = a.eval(m_ir); + const auto data1 = b.eval(m_ir); + const auto data2 = c.eval(m_ir); + result.value = m_ir->CreateCall(get_intrinsic(llvm::Intrinsic::x86_avx512_vpdpbusd_128), {data0, data1, data2}); + return result; + } + template value_t vpermb(T1 a, T2 b) { diff --git a/rpcs3/Emu/Cell/SPURecompiler.cpp b/rpcs3/Emu/Cell/SPURecompiler.cpp index 0e7b3f0e78..1eb261b95c 100644 --- a/rpcs3/Emu/Cell/SPURecompiler.cpp +++ b/rpcs3/Emu/Cell/SPURecompiler.cpp @@ -7032,6 +7032,18 @@ public: void SUMB(spu_opcode_t op) { + // TODO: Some future CPUS will support VNNI but not avx512 + if (m_use_avx512_icl) + { + const auto [a, b] = get_vrs(op.ra, op.rb); + const auto zeroes = splat(0); + const auto ones = splat(0x01010101); + const auto ax = bitcast(vpdpbusd(zeroes, a, ones)); + const auto bx = bitcast(vpdpbusd(zeroes, b, ones)); + set_vr(op.rt, shuffle2(ax, bx, 0, 8, 2, 10, 4, 12, 6, 14)); + return; + } + const auto [a, b] = get_vrs(op.ra, op.rb); const auto ahs = eval((a >> 8) + (a & 0xff)); const auto bhs = eval((b >> 8) + (b & 0xff));