1
0
mirror of https://github.com/RPCS3/rpcs3.git synced 2024-11-26 04:32:35 +01:00

SPU LLVM: Add VNNI optimized variant of sumb

- Uses vpdpbusd to horizontally add values; for some reason this is much faster than the normal horizontal add instructions.
This commit is contained in:
Malcolm Jestadt 2021-08-29 00:48:00 -04:00 committed by Ivan
parent a86b278115
commit d304b52391
2 changed files with 24 additions and 0 deletions

View File

@ -2828,6 +2828,18 @@ public:
return result;
}
template <typename T1, typename T2, typename T3>
value_t<u32[4]> vpdpbusd(T1 a, T2 b, T3 c)
{
	// Emit the AVX-512 VNNI VPDPBUSD (128-bit) instruction via its LLVM
	// intrinsic. Per the Intel intrinsic: per 32-bit lane, multiplies the
	// unsigned bytes of one source with the signed bytes of the other and
	// accumulates the four products into the lanes of the first operand.
	// NOTE(review): argument order (a = accumulator, b/c = byte sources) is
	// inferred from the caller passing zeroes as `a` — confirm against the
	// intrinsic signature.
	const auto acc = a.eval(m_ir);
	const auto src1 = b.eval(m_ir);
	const auto src2 = c.eval(m_ir);

	value_t<u32[4]> result;
	result.value = m_ir->CreateCall(get_intrinsic(llvm::Intrinsic::x86_avx512_vpdpbusd_128), {acc, src1, src2});
	return result;
}
template <typename T1, typename T2>
value_t<u8[16]> vpermb(T1 a, T2 b)
{

View File

@ -7032,6 +7032,18 @@ public:
void SUMB(spu_opcode_t op)
{
// TODO: Some future CPUS will support VNNI but not avx512
if (m_use_avx512_icl)
{
const auto [a, b] = get_vrs<u32[4]>(op.ra, op.rb);
const auto zeroes = splat<u32[4]>(0);
const auto ones = splat<u32[4]>(0x01010101);
const auto ax = bitcast<u16[8]>(vpdpbusd(zeroes, a, ones));
const auto bx = bitcast<u16[8]>(vpdpbusd(zeroes, b, ones));
set_vr(op.rt, shuffle2(ax, bx, 0, 8, 2, 10, 4, 12, 6, 14));
return;
}
const auto [a, b] = get_vrs<u16[8]>(op.ra, op.rb);
const auto ahs = eval((a >> 8) + (a & 0xff));
const auto bhs = eval((b >> 8) + (b & 0xff));