mirror of
https://github.com/RPCS3/rpcs3.git
synced 2024-11-22 10:42:36 +01:00
SPU LLVM: Add VNNI optimized variant of sumb
- Uses vpdpbusd to horizontally add values; for some reason this is much faster than the normal horizontal add instructions.
This commit is contained in:
parent
a86b278115
commit
d304b52391
@ -2828,6 +2828,18 @@ public:
|
||||
return result;
|
||||
}
|
||||
|
||||
template <typename T1, typename T2, typename T3>
|
||||
value_t<u32[4]> vpdpbusd(T1 a, T2 b, T3 c)
|
||||
{
|
||||
value_t<u32[4]> result;
|
||||
|
||||
const auto data0 = a.eval(m_ir);
|
||||
const auto data1 = b.eval(m_ir);
|
||||
const auto data2 = c.eval(m_ir);
|
||||
result.value = m_ir->CreateCall(get_intrinsic(llvm::Intrinsic::x86_avx512_vpdpbusd_128), {data0, data1, data2});
|
||||
return result;
|
||||
}
|
||||
|
||||
template <typename T1, typename T2>
|
||||
value_t<u8[16]> vpermb(T1 a, T2 b)
|
||||
{
|
||||
|
@ -7032,6 +7032,18 @@ public:
|
||||
|
||||
void SUMB(spu_opcode_t op)
|
||||
{
|
||||
// TODO: Some future CPUS will support VNNI but not avx512
|
||||
if (m_use_avx512_icl)
|
||||
{
|
||||
const auto [a, b] = get_vrs<u32[4]>(op.ra, op.rb);
|
||||
const auto zeroes = splat<u32[4]>(0);
|
||||
const auto ones = splat<u32[4]>(0x01010101);
|
||||
const auto ax = bitcast<u16[8]>(vpdpbusd(zeroes, a, ones));
|
||||
const auto bx = bitcast<u16[8]>(vpdpbusd(zeroes, b, ones));
|
||||
set_vr(op.rt, shuffle2(ax, bx, 0, 8, 2, 10, 4, 12, 6, 14));
|
||||
return;
|
||||
}
|
||||
|
||||
const auto [a, b] = get_vrs<u16[8]>(op.ra, op.rb);
|
||||
const auto ahs = eval((a >> 8) + (a & 0xff));
|
||||
const auto bhs = eval((b >> 8) + (b & 0xff));
|
||||
|
Loading…
Reference in New Issue
Block a user