mirror of
https://github.com/RPCS3/rpcs3.git
synced 2025-01-31 20:41:45 +01:00
SPU: remove SSSE3 dependency
This commit is contained in:
parent
61de20a633
commit
4aee4ed6d7
@ -6,6 +6,7 @@
|
||||
#include "SPUThread.h"
|
||||
#include "SPUInterpreter.h"
|
||||
#include "SPUASMJITRecompiler.h"
|
||||
#include "Utilities/sysinfo.h"
|
||||
|
||||
#include <cmath>
|
||||
|
||||
@ -20,7 +21,7 @@
|
||||
#define SPU_OFF_16(x, ...) asmjit::x86::word_ptr(*cpu, offset32(&SPUThread::x, ##__VA_ARGS__))
|
||||
#define SPU_OFF_8(x, ...) asmjit::x86::byte_ptr(*cpu, offset32(&SPUThread::x, ##__VA_ARGS__))
|
||||
|
||||
const spu_decoder<spu_interpreter_fast> s_spu_interpreter; // TODO: remove
|
||||
extern const spu_decoder<spu_interpreter_fast> g_spu_interpreter_fast; // TODO: avoid
|
||||
const spu_decoder<spu_recompiler> s_spu_decoder;
|
||||
|
||||
spu_recompiler::spu_recompiler()
|
||||
@ -101,6 +102,8 @@ void spu_recompiler::compile(spu_function_t& f)
|
||||
this->qw1 = &qw1_var;
|
||||
X86Gp qw2_var = compiler.newUInt64("qw2");
|
||||
this->qw2 = &qw2_var;
|
||||
X86Gp qw3_var = compiler.newUInt64("qw3");
|
||||
this->qw3 = &qw3_var;
|
||||
|
||||
std::array<X86Xmm, 6> vec_vars;
|
||||
|
||||
@ -236,7 +239,7 @@ void spu_recompiler::compile(spu_function_t& f)
|
||||
m_jit->add(&fn, codeHolder);
|
||||
|
||||
f.compiled = asmjit::Internal::ptr_cast<decltype(f.compiled)>(fn);
|
||||
|
||||
|
||||
if (g_cfg.core.spu_debug)
|
||||
{
|
||||
// Add ASMJIT logs
|
||||
@ -351,7 +354,7 @@ void spu_recompiler::InterpreterCall(spu_opcode_t op)
|
||||
asmjit::CCFuncCall* call = c->call(asmjit::imm_ptr(asmjit::Internal::ptr_cast<void*, u32(SPUThread*, u32, spu_inter_func_t)>(gate)), asmjit::FuncSignature3<u32, void*, u32, void*>(asmjit::CallConv::kIdHost));
|
||||
call->setArg(0, *cpu);
|
||||
call->setArg(1, asmjit::imm_u(op.opcode));
|
||||
call->setArg(2, asmjit::imm_ptr(asmjit::Internal::ptr_cast<void*>(s_spu_interpreter.decode(op.opcode))));
|
||||
call->setArg(2, asmjit::imm_ptr(asmjit::Internal::ptr_cast<void*>(g_spu_interpreter_fast.decode(op.opcode))));
|
||||
call->setRet(0, *addr);
|
||||
|
||||
// return immediately if an error occurred
|
||||
@ -408,7 +411,7 @@ void spu_recompiler::FunctionCall()
|
||||
if (_spu->pc == link)
|
||||
{
|
||||
_spu->recursion_level--;
|
||||
return 0; // Successfully returned
|
||||
return 0; // Successfully returned
|
||||
}
|
||||
}
|
||||
|
||||
@ -1029,9 +1032,24 @@ void spu_recompiler::STQX(spu_opcode_t op)
|
||||
c->add(*addr, SPU_OFF_32(gpr, op.rb, &v128::_u32, 3));
|
||||
c->and_(*addr, 0x3fff0);
|
||||
|
||||
const XmmLink& vt = XmmGet(op.rt, XmmType::Int);
|
||||
c->pshufb(vt, XmmConst(_mm_set_epi32(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f)));
|
||||
c->movdqa(asmjit::x86::oword_ptr(*ls, *addr), vt);
|
||||
if (utils::has_ssse3())
|
||||
{
|
||||
const XmmLink& vt = XmmGet(op.rt, XmmType::Int);
|
||||
c->pshufb(vt, XmmConst(_mm_set_epi32(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f)));
|
||||
c->movdqa(asmjit::x86::oword_ptr(*ls, *addr), vt);
|
||||
}
|
||||
else
|
||||
{
|
||||
c->mov(*qw0, SPU_OFF_64(gpr, op.rt, &v128::_u64, 0));
|
||||
c->mov(*qw1, SPU_OFF_64(gpr, op.rt, &v128::_u64, 1));
|
||||
c->bswap(*qw0);
|
||||
c->bswap(*qw1);
|
||||
c->mov(asmjit::x86::qword_ptr(*ls, *addr, 0, 0), *qw1);
|
||||
c->mov(asmjit::x86::qword_ptr(*ls, *addr, 0, 8), *qw0);
|
||||
c->unuse(*qw0);
|
||||
c->unuse(*qw1);
|
||||
}
|
||||
|
||||
c->unuse(*addr);
|
||||
}
|
||||
|
||||
@ -1079,9 +1097,8 @@ void spu_recompiler::HBR(spu_opcode_t op)
|
||||
void spu_recompiler::GB(spu_opcode_t op)
|
||||
{
|
||||
const XmmLink& va = XmmGet(op.ra, XmmType::Int);
|
||||
c->pshufb(va, XmmConst(_mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 12, 8, 4, 0)));
|
||||
c->psllq(va, 7);
|
||||
c->pmovmskb(*addr, va);
|
||||
c->pslld(va, 31);
|
||||
c->movmskps(*addr, va);
|
||||
c->pxor(va, va);
|
||||
c->pinsrw(va, *addr, 6);
|
||||
c->movdqa(SPU_OFF_128(gpr, op.rt), va);
|
||||
@ -1091,8 +1108,8 @@ void spu_recompiler::GB(spu_opcode_t op)
|
||||
void spu_recompiler::GBH(spu_opcode_t op)
|
||||
{
|
||||
const XmmLink& va = XmmGet(op.ra, XmmType::Int);
|
||||
c->pshufb(va, XmmConst(_mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 14, 12, 10, 8, 6, 4, 2, 0)));
|
||||
c->psllq(va, 7);
|
||||
c->psllw(va, 15);
|
||||
c->packsswb(va, XmmConst(_mm_setzero_si128()));
|
||||
c->pmovmskb(*addr, va);
|
||||
c->pxor(va, va);
|
||||
c->pinsrw(va, *addr, 6);
|
||||
@ -1171,21 +1188,54 @@ void spu_recompiler::LQX(spu_opcode_t op)
|
||||
c->add(*addr, SPU_OFF_32(gpr, op.rb, &v128::_u32, 3));
|
||||
c->and_(*addr, 0x3fff0);
|
||||
|
||||
const XmmLink& vt = XmmAlloc();
|
||||
c->movdqa(vt, asmjit::x86::oword_ptr(*ls, *addr));
|
||||
c->pshufb(vt, XmmConst(_mm_set_epi32(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f)));
|
||||
c->movdqa(SPU_OFF_128(gpr, op.rt), vt);
|
||||
if (utils::has_ssse3())
|
||||
{
|
||||
const XmmLink& vt = XmmAlloc();
|
||||
c->movdqa(vt, asmjit::x86::oword_ptr(*ls, *addr));
|
||||
c->pshufb(vt, XmmConst(_mm_set_epi32(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f)));
|
||||
c->movdqa(SPU_OFF_128(gpr, op.rt), vt);
|
||||
}
|
||||
else
|
||||
{
|
||||
c->mov(*qw0, asmjit::x86::qword_ptr(*ls, *addr, 0, 0));
|
||||
c->mov(*qw1, asmjit::x86::qword_ptr(*ls, *addr, 0, 8));
|
||||
c->bswap(*qw0);
|
||||
c->bswap(*qw1);
|
||||
c->mov(SPU_OFF_64(gpr, op.rt, &v128::_u64, 0), *qw1);
|
||||
c->mov(SPU_OFF_64(gpr, op.rt, &v128::_u64, 1), *qw0);
|
||||
c->unuse(*qw0);
|
||||
c->unuse(*qw1);
|
||||
}
|
||||
|
||||
c->unuse(*addr);
|
||||
}
|
||||
|
||||
void spu_recompiler::ROTQBYBI(spu_opcode_t op)
|
||||
{
|
||||
auto body = [](u8* t, const u8* _a, u32 v) noexcept
|
||||
{
|
||||
const auto a = *(__m128i*)_a;
|
||||
alignas(32) const __m128i buf[2]{a, a};
|
||||
*(__m128i*)t = _mm_loadu_si128((__m128i*)((u8*)buf + (16 - (v >> 3 & 0xf))));
|
||||
};
|
||||
|
||||
if (!utils::has_ssse3())
|
||||
{
|
||||
c->lea(*qw0, SPU_OFF_128(gpr, op.rt));
|
||||
c->lea(*qw1, SPU_OFF_128(gpr, op.ra));
|
||||
c->mov(*addr, SPU_OFF_32(gpr, op.rb, &v128::_u32, 3));
|
||||
asmjit::CCFuncCall* call = c->call(asmjit::imm_ptr(asmjit::Internal::ptr_cast<void*, void(u8*, const u8*, u32)>(body)), asmjit::FuncSignature3<void, void*, void*, u32>(asmjit::CallConv::kIdHost));
|
||||
call->setArg(0, *qw0);
|
||||
call->setArg(1, *qw1);
|
||||
call->setArg(2, *addr);
|
||||
return;
|
||||
}
|
||||
|
||||
const XmmLink& va = XmmGet(op.ra, XmmType::Int);
|
||||
c->mov(*qw0, asmjit::imm_ptr((void*)g_spu_imm.rldq_pshufb));
|
||||
c->mov(*addr, SPU_OFF_32(gpr, op.rb, &v128::_u32, 3));
|
||||
c->and_(*addr, 0xf << 3);
|
||||
c->shl(*addr, 1);
|
||||
c->pshufb(va, asmjit::x86::oword_ptr(*qw0, *addr));
|
||||
c->pshufb(va, asmjit::x86::oword_ptr(*qw0, *addr, 1));
|
||||
c->movdqa(SPU_OFF_128(gpr, op.rt), va);
|
||||
c->unuse(*addr);
|
||||
c->unuse(*qw0);
|
||||
@ -1193,14 +1243,30 @@ void spu_recompiler::ROTQBYBI(spu_opcode_t op)
|
||||
|
||||
void spu_recompiler::ROTQMBYBI(spu_opcode_t op)
|
||||
{
|
||||
auto body = [](u8* t, const u8* _a, u32 v) noexcept
|
||||
{
|
||||
const auto a = *(__m128i*)_a;
|
||||
alignas(64) const __m128i buf[3]{a, _mm_setzero_si128(), _mm_setzero_si128()};
|
||||
*(__m128i*)t = _mm_loadu_si128((__m128i*)((u8*)buf + (v >> 3 & 0x1f)));
|
||||
};
|
||||
|
||||
if (!utils::has_ssse3())
|
||||
{
|
||||
c->lea(*qw0, SPU_OFF_128(gpr, op.rt));
|
||||
c->lea(*qw1, SPU_OFF_128(gpr, op.ra));
|
||||
c->mov(*addr, SPU_OFF_32(gpr, op.rb, &v128::_u32, 3));
|
||||
asmjit::CCFuncCall* call = c->call(asmjit::imm_ptr(asmjit::Internal::ptr_cast<void*, void(u8*, const u8*, u32)>(body)), asmjit::FuncSignature3<void, void*, void*, u32>(asmjit::CallConv::kIdHost));
|
||||
call->setArg(0, *qw0);
|
||||
call->setArg(1, *qw1);
|
||||
call->setArg(2, *addr);
|
||||
return;
|
||||
}
|
||||
|
||||
const XmmLink& va = XmmGet(op.ra, XmmType::Int);
|
||||
c->mov(*qw0, asmjit::imm_ptr((void*)g_spu_imm.srdq_pshufb));
|
||||
c->mov(*addr, SPU_OFF_32(gpr, op.rb, &v128::_u32, 3));
|
||||
c->shr(*addr, 3);
|
||||
c->neg(*addr);
|
||||
c->and_(*addr, 0x1f);
|
||||
c->shl(*addr, 4);
|
||||
c->pshufb(va, asmjit::x86::oword_ptr(*qw0, *addr));
|
||||
c->and_(*addr, 0x1f << 3);
|
||||
c->pshufb(va, asmjit::x86::oword_ptr(*qw0, *addr, 1));
|
||||
c->movdqa(SPU_OFF_128(gpr, op.rt), va);
|
||||
c->unuse(*addr);
|
||||
c->unuse(*qw0);
|
||||
@ -1208,12 +1274,30 @@ void spu_recompiler::ROTQMBYBI(spu_opcode_t op)
|
||||
|
||||
void spu_recompiler::SHLQBYBI(spu_opcode_t op)
|
||||
{
|
||||
auto body = [](u8* t, const u8* _a, u32 v) noexcept
|
||||
{
|
||||
const auto a = *(__m128i*)_a;
|
||||
alignas(64) const __m128i buf[3]{_mm_setzero_si128(), _mm_setzero_si128(), a};
|
||||
*(__m128i*)t = _mm_loadu_si128((__m128i*)((u8*)buf + (32 - (v >> 3 & 0x1f))));
|
||||
};
|
||||
|
||||
if (!utils::has_ssse3())
|
||||
{
|
||||
c->lea(*qw0, SPU_OFF_128(gpr, op.rt));
|
||||
c->lea(*qw1, SPU_OFF_128(gpr, op.ra));
|
||||
c->mov(*addr, SPU_OFF_32(gpr, op.rb, &v128::_u32, 3));
|
||||
asmjit::CCFuncCall* call = c->call(asmjit::imm_ptr(asmjit::Internal::ptr_cast<void*, void(u8*, const u8*, u32)>(body)), asmjit::FuncSignature3<void, void*, void*, u32>(asmjit::CallConv::kIdHost));
|
||||
call->setArg(0, *qw0);
|
||||
call->setArg(1, *qw1);
|
||||
call->setArg(2, *addr);
|
||||
return;
|
||||
}
|
||||
|
||||
const XmmLink& va = XmmGet(op.ra, XmmType::Int);
|
||||
c->mov(*qw0, asmjit::imm_ptr((void*)g_spu_imm.sldq_pshufb));
|
||||
c->mov(*addr, SPU_OFF_32(gpr, op.rb, &v128::_u32, 3));
|
||||
c->and_(*addr, 0x1f << 3);
|
||||
c->shl(*addr, 1);
|
||||
c->pshufb(va, asmjit::x86::oword_ptr(*qw0, *addr));
|
||||
c->pshufb(va, asmjit::x86::oword_ptr(*qw0, *addr, 1));
|
||||
c->movdqa(SPU_OFF_128(gpr, op.rt), va);
|
||||
c->unuse(*addr);
|
||||
c->unuse(*qw0);
|
||||
@ -1327,6 +1411,25 @@ void spu_recompiler::SHLQBI(spu_opcode_t op)
|
||||
|
||||
void spu_recompiler::ROTQBY(spu_opcode_t op)
|
||||
{
|
||||
auto body = [](u8* t, const u8* _a, u32 v) noexcept
|
||||
{
|
||||
const auto a = *(__m128i*)_a;
|
||||
alignas(32) const __m128i buf[2]{a, a};
|
||||
*(__m128i*)t = _mm_loadu_si128((__m128i*)((u8*)buf + (16 - (v & 0xf))));
|
||||
};
|
||||
|
||||
if (!utils::has_ssse3())
|
||||
{
|
||||
c->lea(*qw0, SPU_OFF_128(gpr, op.rt));
|
||||
c->lea(*qw1, SPU_OFF_128(gpr, op.ra));
|
||||
c->mov(*addr, SPU_OFF_32(gpr, op.rb, &v128::_u32, 3));
|
||||
asmjit::CCFuncCall* call = c->call(asmjit::imm_ptr(asmjit::Internal::ptr_cast<void*, void(u8*, const u8*, u32)>(body)), asmjit::FuncSignature3<void, void*, void*, u32>(asmjit::CallConv::kIdHost));
|
||||
call->setArg(0, *qw0);
|
||||
call->setArg(1, *qw1);
|
||||
call->setArg(2, *addr);
|
||||
return;
|
||||
}
|
||||
|
||||
const XmmLink& va = XmmGet(op.ra, XmmType::Int);
|
||||
c->mov(*qw0, asmjit::imm_ptr((void*)g_spu_imm.rldq_pshufb));
|
||||
c->mov(*addr, SPU_OFF_32(gpr, op.rb, &v128::_u32, 3));
|
||||
@ -1340,10 +1443,28 @@ void spu_recompiler::ROTQBY(spu_opcode_t op)
|
||||
|
||||
void spu_recompiler::ROTQMBY(spu_opcode_t op)
|
||||
{
|
||||
auto body = [](u8* t, const u8* _a, u32 v) noexcept
|
||||
{
|
||||
const auto a = *(__m128i*)_a;
|
||||
alignas(64) const __m128i buf[3]{a, _mm_setzero_si128(), _mm_setzero_si128()};
|
||||
*(__m128i*)t = _mm_loadu_si128((__m128i*)((u8*)buf + (v & 0x1f)));
|
||||
};
|
||||
|
||||
if (!utils::has_ssse3())
|
||||
{
|
||||
c->lea(*qw0, SPU_OFF_128(gpr, op.rt));
|
||||
c->lea(*qw1, SPU_OFF_128(gpr, op.ra));
|
||||
c->mov(*addr, SPU_OFF_32(gpr, op.rb, &v128::_u32, 3));
|
||||
asmjit::CCFuncCall* call = c->call(asmjit::imm_ptr(asmjit::Internal::ptr_cast<void*, void(u8*, const u8*, u32)>(body)), asmjit::FuncSignature3<void, void*, void*, u32>(asmjit::CallConv::kIdHost));
|
||||
call->setArg(0, *qw0);
|
||||
call->setArg(1, *qw1);
|
||||
call->setArg(2, *addr);
|
||||
return;
|
||||
}
|
||||
|
||||
const XmmLink& va = XmmGet(op.ra, XmmType::Int);
|
||||
c->mov(*qw0, asmjit::imm_ptr((void*)g_spu_imm.srdq_pshufb));
|
||||
c->mov(*addr, SPU_OFF_32(gpr, op.rb, &v128::_u32, 3));
|
||||
c->neg(*addr);
|
||||
c->and_(*addr, 0x1f);
|
||||
c->shl(*addr, 4);
|
||||
c->pshufb(va, asmjit::x86::oword_ptr(*qw0, *addr));
|
||||
@ -1354,6 +1475,25 @@ void spu_recompiler::ROTQMBY(spu_opcode_t op)
|
||||
|
||||
void spu_recompiler::SHLQBY(spu_opcode_t op)
|
||||
{
|
||||
auto body = [](u8* t, const u8* _a, u32 v) noexcept
|
||||
{
|
||||
const auto a = *(__m128i*)_a;
|
||||
alignas(64) const __m128i buf[3]{_mm_setzero_si128(), _mm_setzero_si128(), a};
|
||||
*(__m128i*)t = _mm_loadu_si128((__m128i*)((u8*)buf + (32 - (v & 0x1f))));
|
||||
};
|
||||
|
||||
if (!utils::has_ssse3())
|
||||
{
|
||||
c->lea(*qw0, SPU_OFF_128(gpr, op.rt));
|
||||
c->lea(*qw1, SPU_OFF_128(gpr, op.ra));
|
||||
c->mov(*addr, SPU_OFF_32(gpr, op.rb, &v128::_u32, 3));
|
||||
asmjit::CCFuncCall* call = c->call(asmjit::imm_ptr(asmjit::Internal::ptr_cast<void*, void(u8*, const u8*, u32)>(body)), asmjit::FuncSignature3<void, void*, void*, u32>(asmjit::CallConv::kIdHost));
|
||||
call->setArg(0, *qw0);
|
||||
call->setArg(1, *qw1);
|
||||
call->setArg(2, *addr);
|
||||
return;
|
||||
}
|
||||
|
||||
const XmmLink& va = XmmGet(op.ra, XmmType::Int);
|
||||
c->mov(*qw0, asmjit::imm_ptr((void*)g_spu_imm.sldq_pshufb));
|
||||
c->mov(*addr, SPU_OFF_32(gpr, op.rb, &v128::_u32, 3));
|
||||
@ -1523,7 +1663,27 @@ void spu_recompiler::ROTQBYI(spu_opcode_t op)
|
||||
{
|
||||
const int s = op.i7 & 0xf;
|
||||
const XmmLink& va = XmmGet(op.ra, XmmType::Int);
|
||||
c->palignr(va, va, 16 - s);
|
||||
const XmmLink& v2 = XmmAlloc();
|
||||
|
||||
if (s == 0)
|
||||
{
|
||||
}
|
||||
else if (s == 4 || s == 8 || s == 12)
|
||||
{
|
||||
c->pshufd(va, va, ::rol8(0xE4, s / 2));
|
||||
}
|
||||
else if (utils::has_ssse3())
|
||||
{
|
||||
c->palignr(va, va, 16 - s);
|
||||
}
|
||||
else
|
||||
{
|
||||
c->movdqa(v2, va);
|
||||
c->psrldq(va, 16 - s);
|
||||
c->pslldq(v2, s);
|
||||
c->por(va, v2);
|
||||
}
|
||||
|
||||
c->movdqa(SPU_OFF_128(gpr, op.rt), va);
|
||||
}
|
||||
|
||||
@ -1588,12 +1748,25 @@ void spu_recompiler::SUMB(spu_opcode_t op)
|
||||
{
|
||||
const XmmLink& va = XmmGet(op.ra, XmmType::Int);
|
||||
const XmmLink& vb = XmmGet(op.rb, XmmType::Int);
|
||||
const XmmLink& vi = XmmAlloc();
|
||||
c->movdqa(vi, XmmConst(_mm_set1_epi8(1)));
|
||||
c->pmaddubsw(va, vi);
|
||||
c->pmaddubsw(vb, vi);
|
||||
c->phaddw(va, vb);
|
||||
c->pshufb(va, XmmConst(_mm_set_epi8(15, 14, 7, 6, 13, 12, 5, 4, 11, 10, 3, 2, 9, 8, 1, 0)));
|
||||
const XmmLink& v1 = XmmAlloc();
|
||||
const XmmLink& v2 = XmmAlloc();
|
||||
c->movdqa(v2, XmmConst(_mm_set1_epi16(0xff)));
|
||||
c->movdqa(v1, va);
|
||||
c->psrlw(va, 8);
|
||||
c->pand(v1, v2);
|
||||
c->pand(v2, vb);
|
||||
c->psrlw(vb, 8);
|
||||
c->paddw(va, v1);
|
||||
c->paddw(vb, v2);
|
||||
c->movdqa(v2, XmmConst(_mm_set1_epi32(0xffff)));
|
||||
c->movdqa(v1, va);
|
||||
c->psrld(va, 16);
|
||||
c->pand(v1, v2);
|
||||
c->pandn(v2, vb);
|
||||
c->pslld(vb, 16);
|
||||
c->paddw(va, v1);
|
||||
c->paddw(vb, v2);
|
||||
c->por(va, vb);
|
||||
c->movdqa(SPU_OFF_128(gpr, op.rt), va);
|
||||
}
|
||||
|
||||
@ -1657,16 +1830,24 @@ void spu_recompiler::CNTB(spu_opcode_t op)
|
||||
const XmmLink& va = XmmGet(op.ra, XmmType::Int);
|
||||
const XmmLink& v1 = XmmAlloc();
|
||||
const XmmLink& vm = XmmAlloc();
|
||||
c->movdqa(vm, XmmConst(_mm_set1_epi8(0x55)));
|
||||
c->movdqa(v1, va);
|
||||
c->psrlq(v1, 4);
|
||||
c->movdqa(vm, XmmConst(_mm_set1_epi8(0xf)));
|
||||
c->pand(va, vm);
|
||||
c->psrlq(v1, 1);
|
||||
c->pand(v1, vm);
|
||||
c->movdqa(vm, XmmConst(_mm_set_epi8(4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0)));
|
||||
c->pshufb(vm, va);
|
||||
c->movdqa(va, XmmConst(_mm_set_epi8(4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0)));
|
||||
c->pshufb(va, v1);
|
||||
c->paddb(va, vm);
|
||||
c->paddb(va, v1);
|
||||
c->movdqa(vm, XmmConst(_mm_set1_epi8(0x33)));
|
||||
c->movdqa(v1, va);
|
||||
c->pand(va, vm);
|
||||
c->psrlq(v1, 2);
|
||||
c->pand(v1, vm);
|
||||
c->paddb(va, v1);
|
||||
c->movdqa(vm, XmmConst(_mm_set1_epi8(0x0f)));
|
||||
c->movdqa(v1, va);
|
||||
c->pand(va, vm);
|
||||
c->psrlq(v1, 4);
|
||||
c->pand(v1, vm);
|
||||
c->paddb(va, v1);
|
||||
c->movdqa(SPU_OFF_128(gpr, op.rt), va);
|
||||
}
|
||||
|
||||
@ -2319,9 +2500,23 @@ void spu_recompiler::BRZ(spu_opcode_t op)
|
||||
|
||||
void spu_recompiler::STQA(spu_opcode_t op)
{
	// Emits a 16-byte store of register rt to local storage at spu_ls_target(0, op.i16),
	// with the byte order of the whole quadword reversed.
	if (utils::has_ssse3())
	{
		// One pshufb with a descending index vector reverses all 16 bytes.
		const XmmLink& vt = XmmGet(op.rt, XmmType::Int);
		c->pshufb(vt, XmmConst(_mm_set_epi32(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f)));
		c->movdqa(asmjit::x86::oword_ptr(*ls, spu_ls_target(0, op.i16)), vt);
	}
	else
	{
		// No SSSE3: byte-swap each 64-bit half in a general-purpose register and
		// store the halves in exchanged order, which reverses the full quadword.
		c->mov(*qw0, SPU_OFF_64(gpr, op.rt, &v128::_u64, 0));
		c->mov(*qw1, SPU_OFF_64(gpr, op.rt, &v128::_u64, 1));
		c->bswap(*qw0);
		c->bswap(*qw1);
		c->mov(asmjit::x86::qword_ptr(*ls, spu_ls_target(0, op.i16) + 0), *qw1);
		c->mov(asmjit::x86::qword_ptr(*ls, spu_ls_target(0, op.i16) + 8), *qw0);
		c->unuse(*qw0);
		c->unuse(*qw1);
	}
}
|
||||
|
||||
void spu_recompiler::BRNZ(spu_opcode_t op)
|
||||
@ -2401,9 +2596,23 @@ void spu_recompiler::BRHNZ(spu_opcode_t op)
|
||||
|
||||
void spu_recompiler::STQR(spu_opcode_t op)
{
	// Emits a 16-byte store of register rt to local storage at spu_ls_target(m_pos, op.i16),
	// with the byte order of the whole quadword reversed.
	if (utils::has_ssse3())
	{
		// One pshufb with a descending index vector reverses all 16 bytes.
		const XmmLink& vt = XmmGet(op.rt, XmmType::Int);
		c->pshufb(vt, XmmConst(_mm_set_epi32(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f)));
		c->movdqa(asmjit::x86::oword_ptr(*ls, spu_ls_target(m_pos, op.i16)), vt);
	}
	else
	{
		// No SSSE3: byte-swap each 64-bit half in a general-purpose register and
		// store the halves in exchanged order, which reverses the full quadword.
		c->mov(*qw0, SPU_OFF_64(gpr, op.rt, &v128::_u64, 0));
		c->mov(*qw1, SPU_OFF_64(gpr, op.rt, &v128::_u64, 1));
		c->bswap(*qw0);
		c->bswap(*qw1);
		c->mov(asmjit::x86::qword_ptr(*ls, spu_ls_target(m_pos, op.i16) + 0), *qw1);
		c->mov(asmjit::x86::qword_ptr(*ls, spu_ls_target(m_pos, op.i16) + 8), *qw0);
		c->unuse(*qw0);
		c->unuse(*qw1);
	}
}
|
||||
|
||||
void spu_recompiler::BRA(spu_opcode_t op)
|
||||
@ -2431,10 +2640,24 @@ void spu_recompiler::BRA(spu_opcode_t op)
|
||||
|
||||
void spu_recompiler::LQA(spu_opcode_t op)
{
	// Emits a 16-byte load from local storage at spu_ls_target(0, op.i16) into register rt,
	// with the byte order of the whole quadword reversed.
	if (utils::has_ssse3())
	{
		// One pshufb with a descending index vector reverses all 16 bytes.
		const XmmLink& vt = XmmAlloc();
		c->movdqa(vt, asmjit::x86::oword_ptr(*ls, spu_ls_target(0, op.i16)));
		c->pshufb(vt, XmmConst(_mm_set_epi32(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f)));
		c->movdqa(SPU_OFF_128(gpr, op.rt), vt);
	}
	else
	{
		// No SSSE3: byte-swap each 64-bit half in a general-purpose register and
		// write the halves back in exchanged order, which reverses the full quadword.
		c->mov(*qw0, asmjit::x86::qword_ptr(*ls, spu_ls_target(0, op.i16) + 0));
		c->mov(*qw1, asmjit::x86::qword_ptr(*ls, spu_ls_target(0, op.i16) + 8));
		c->bswap(*qw0);
		c->bswap(*qw1);
		c->mov(SPU_OFF_64(gpr, op.rt, &v128::_u64, 0), *qw1);
		c->mov(SPU_OFF_64(gpr, op.rt, &v128::_u64, 1), *qw0);
		c->unuse(*qw0);
		c->unuse(*qw1);
	}
}
|
||||
|
||||
void spu_recompiler::BRASL(spu_opcode_t op)
|
||||
@ -2516,10 +2739,24 @@ void spu_recompiler::BRSL(spu_opcode_t op)
|
||||
|
||||
void spu_recompiler::LQR(spu_opcode_t op)
{
	// Emits a 16-byte load from local storage at spu_ls_target(m_pos, op.i16) into register rt,
	// with the byte order of the whole quadword reversed.
	if (utils::has_ssse3())
	{
		// One pshufb with a descending index vector reverses all 16 bytes.
		const XmmLink& vt = XmmAlloc();
		c->movdqa(vt, asmjit::x86::oword_ptr(*ls, spu_ls_target(m_pos, op.i16)));
		c->pshufb(vt, XmmConst(_mm_set_epi32(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f)));
		c->movdqa(SPU_OFF_128(gpr, op.rt), vt);
	}
	else
	{
		// No SSSE3: byte-swap each 64-bit half in a general-purpose register and
		// write the halves back in exchanged order, which reverses the full quadword.
		c->mov(*qw0, asmjit::x86::qword_ptr(*ls, spu_ls_target(m_pos, op.i16) + 0));
		c->mov(*qw1, asmjit::x86::qword_ptr(*ls, spu_ls_target(m_pos, op.i16) + 8));
		c->bswap(*qw0);
		c->bswap(*qw1);
		c->mov(SPU_OFF_64(gpr, op.rt, &v128::_u64, 0), *qw1);
		c->mov(SPU_OFF_64(gpr, op.rt, &v128::_u64, 1), *qw0);
		c->unuse(*qw0);
		c->unuse(*qw1);
	}
}
|
||||
|
||||
void spu_recompiler::IL(spu_opcode_t op)
|
||||
@ -2630,9 +2867,24 @@ void spu_recompiler::STQD(spu_opcode_t op)
|
||||
if (op.si10) c->add(*addr, op.si10 << 4);
|
||||
c->and_(*addr, 0x3fff0);
|
||||
|
||||
const XmmLink& vt = XmmGet(op.rt, XmmType::Int);
|
||||
c->pshufb(vt, XmmConst(_mm_set_epi32(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f)));
|
||||
c->movdqa(asmjit::x86::oword_ptr(*ls, *addr), vt);
|
||||
if (utils::has_ssse3())
|
||||
{
|
||||
const XmmLink& vt = XmmGet(op.rt, XmmType::Int);
|
||||
c->pshufb(vt, XmmConst(_mm_set_epi32(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f)));
|
||||
c->movdqa(asmjit::x86::oword_ptr(*ls, *addr), vt);
|
||||
}
|
||||
else
|
||||
{
|
||||
c->mov(*qw0, SPU_OFF_64(gpr, op.rt, &v128::_u64, 0));
|
||||
c->mov(*qw1, SPU_OFF_64(gpr, op.rt, &v128::_u64, 1));
|
||||
c->bswap(*qw0);
|
||||
c->bswap(*qw1);
|
||||
c->mov(asmjit::x86::qword_ptr(*ls, *addr, 0, 0), *qw1);
|
||||
c->mov(asmjit::x86::qword_ptr(*ls, *addr, 0, 8), *qw0);
|
||||
c->unuse(*qw0);
|
||||
c->unuse(*qw1);
|
||||
}
|
||||
|
||||
c->unuse(*addr);
|
||||
}
|
||||
|
||||
@ -2642,10 +2894,25 @@ void spu_recompiler::LQD(spu_opcode_t op)
|
||||
if (op.si10) c->add(*addr, op.si10 << 4);
|
||||
c->and_(*addr, 0x3fff0);
|
||||
|
||||
const XmmLink& vt = XmmAlloc();
|
||||
c->movdqa(vt, asmjit::x86::oword_ptr(*ls, *addr));
|
||||
c->pshufb(vt, XmmConst(_mm_set_epi32(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f)));
|
||||
c->movdqa(SPU_OFF_128(gpr, op.rt), vt);
|
||||
if (utils::has_ssse3())
|
||||
{
|
||||
const XmmLink& vt = XmmAlloc();
|
||||
c->movdqa(vt, asmjit::x86::oword_ptr(*ls, *addr));
|
||||
c->pshufb(vt, XmmConst(_mm_set_epi32(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f)));
|
||||
c->movdqa(SPU_OFF_128(gpr, op.rt), vt);
|
||||
}
|
||||
else
|
||||
{
|
||||
c->mov(*qw0, asmjit::x86::qword_ptr(*ls, *addr, 0, 0));
|
||||
c->mov(*qw1, asmjit::x86::qword_ptr(*ls, *addr, 0, 8));
|
||||
c->bswap(*qw0);
|
||||
c->bswap(*qw1);
|
||||
c->mov(SPU_OFF_64(gpr, op.rt, &v128::_u64, 0), *qw1);
|
||||
c->mov(SPU_OFF_64(gpr, op.rt, &v128::_u64, 1), *qw0);
|
||||
c->unuse(*qw0);
|
||||
c->unuse(*qw1);
|
||||
}
|
||||
|
||||
c->unuse(*addr);
|
||||
}
|
||||
|
||||
@ -2814,6 +3081,61 @@ void spu_recompiler::SELB(spu_opcode_t op)
|
||||
|
||||
void spu_recompiler::SHUFB(spu_opcode_t op)
|
||||
{
|
||||
alignas(16) static thread_local u8 s_lut[256]
|
||||
{
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
|
||||
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
|
||||
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
||||
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
||||
};
|
||||
|
||||
auto body = [](u8* t, const u8* a, const u8* b, const u8* c) noexcept
|
||||
{
|
||||
__m128i _a = *(__m128i*)a;
|
||||
__m128i _b = *(__m128i*)b;
|
||||
_mm_store_si128((__m128i*)(s_lut + 0x00), _a);
|
||||
_mm_store_si128((__m128i*)(s_lut + 0x10), _b);
|
||||
_mm_store_si128((__m128i*)(s_lut + 0x20), _a);
|
||||
_mm_store_si128((__m128i*)(s_lut + 0x30), _b);
|
||||
_mm_store_si128((__m128i*)(s_lut + 0x40), _a);
|
||||
_mm_store_si128((__m128i*)(s_lut + 0x50), _b);
|
||||
_mm_store_si128((__m128i*)(s_lut + 0x60), _a);
|
||||
_mm_store_si128((__m128i*)(s_lut + 0x70), _b);
|
||||
v128 mask = v128::fromV(_mm_xor_si128(*(__m128i*)c, _mm_set1_epi8(0xf)));
|
||||
|
||||
for (int i = 0; i < 16; i++)
|
||||
{
|
||||
t[i] = s_lut[mask._u8[i]];
|
||||
}
|
||||
};
|
||||
|
||||
if (!utils::has_ssse3())
|
||||
{
|
||||
c->lea(*qw0, SPU_OFF_128(gpr, op.rt4));
|
||||
c->lea(*qw1, SPU_OFF_128(gpr, op.ra));
|
||||
c->lea(*qw2, SPU_OFF_128(gpr, op.rb));
|
||||
c->lea(*qw3, SPU_OFF_128(gpr, op.rc));
|
||||
asmjit::CCFuncCall* call = c->call(asmjit::imm_ptr(asmjit::Internal::ptr_cast<void*, void(u8*, const u8*, const u8*, const u8*)>(body)), asmjit::FuncSignature4<void, void*, void*, void*, void*>(asmjit::CallConv::kIdHost));
|
||||
call->setArg(0, *qw0);
|
||||
call->setArg(1, *qw1);
|
||||
call->setArg(2, *qw2);
|
||||
call->setArg(3, *qw3);
|
||||
return;
|
||||
}
|
||||
|
||||
const XmmLink& v0 = XmmGet(op.rc, XmmType::Int); // v0 = mask
|
||||
const XmmLink& v1 = XmmAlloc();
|
||||
const XmmLink& v2 = XmmAlloc();
|
||||
|
@ -37,6 +37,7 @@ private:
|
||||
asmjit::X86Gp* qw0;
|
||||
asmjit::X86Gp* qw1;
|
||||
asmjit::X86Gp* qw2;
|
||||
asmjit::X86Gp* qw3;
|
||||
std::array<asmjit::X86Xmm*, 6> vec;
|
||||
|
||||
// labels:
|
||||
|
@ -8,6 +8,10 @@
|
||||
#include <cmath>
|
||||
#include <cfenv>
|
||||
|
||||
// Build-time fallback for non-MSVC compilers when __SSSE3__ is not defined:
// _mm_shuffle_epi8 becomes an empty object-like macro, so a use such as
// _mm_shuffle_epi8(a, b) preprocesses to the parenthesized comma expression (a, b).
// NOTE(review): presumably this exists only to keep the SSSE3-based code paths
// compiling; they do not perform a real shuffle in such builds - confirm they are
// never selected at run time there.
#if !defined(_MSC_VER) && !defined(__SSSE3__)
|
||||
#define _mm_shuffle_epi8
|
||||
#endif
|
||||
|
||||
// Compare 16 packed unsigned bytes (greater than)
|
||||
inline __m128i sse_cmpgt_epu8(__m128i A, __m128i B)
|
||||
{
|
||||
@ -73,7 +77,7 @@ void spu_interpreter::LNOP(SPUThread& spu, spu_opcode_t op)
|
||||
// This instruction must be used following a store instruction that modifies the instruction stream.
|
||||
void spu_interpreter::SYNC(SPUThread& spu, spu_opcode_t op)
{
	// A single full memory fence; issuing it twice adds nothing.
	_mm_mfence();
}
|
||||
|
||||
// This instruction forces all earlier load, store, and channel instructions to complete before proceeding.
|
||||
@ -398,12 +402,12 @@ void spu_interpreter::HBR(SPUThread& spu, spu_opcode_t op)
|
||||
|
||||
void spu_interpreter::GB(SPUThread& spu, spu_opcode_t op)
{
	// Gather bit 0 of each 32-bit lane of ra into a 4-bit mask and write it to rt
	// through v128::from32r. The shift puts bit 0 into the lane's sign bit and
	// movmskps collects the four sign bits. Only SSE/SSE2 intrinsics are used,
	// and ra is read exactly once before rt is written.
	const __m128i lsb_as_sign = _mm_slli_epi32(spu.gpr[op.ra].vi, 31);
	spu.gpr[op.rt] = v128::from32r(_mm_movemask_ps(_mm_castsi128_ps(lsb_as_sign)));
}
|
||||
|
||||
void spu_interpreter::GBH(SPUThread& spu, spu_opcode_t op)
{
	// Gather bit 0 of each 16-bit lane of ra into an 8-bit mask and write it to rt
	// through v128::from32r. The shift puts bit 0 into the lane's sign bit; the
	// signed-saturating pack narrows 0x8000 to 0x80 and 0 to 0, and the upper eight
	// bytes come from the zero operand, so only the low 8 mask bits can be set.
	// Only SSE/SSE2 intrinsics are used, and ra is read once before rt is written.
	const __m128i lsb_as_sign = _mm_slli_epi16(spu.gpr[op.ra].vi, 15);
	const __m128i packed = _mm_packs_epi16(lsb_as_sign, _mm_setzero_si128());
	spu.gpr[op.rt] = v128::from32r(_mm_movemask_epi8(packed));
}
|
||||
|
||||
void spu_interpreter::GBB(SPUThread& spu, spu_opcode_t op)
|
||||
@ -442,17 +446,38 @@ void spu_interpreter::LQX(SPUThread& spu, spu_opcode_t op)
|
||||
spu.gpr[op.rt] = spu._ref<v128>((spu.gpr[op.ra]._u32[3] + spu.gpr[op.rb]._u32[3]) & 0x3fff0);
|
||||
}
|
||||
|
||||
void spu_interpreter::ROTQBYBI(SPUThread& spu, spu_opcode_t op)
|
||||
void spu_interpreter_precise::ROTQBYBI(SPUThread& spu, spu_opcode_t op)
{
	// Two consecutive copies of ra; reading 16 bytes starting at offset (16 - n)
	// gives result byte i = source byte (i - n) mod 16, with n = bits 3..6 of rb word 3.
	const auto src = spu.gpr[op.ra].vi;
	alignas(32) const __m128i doubled[2]{src, src};

	const u32 n = spu.gpr[op.rb]._u32[3] >> 3 & 0xf;
	const u8* const window = reinterpret_cast<const u8*>(doubled) + (16 - n);
	spu.gpr[op.rt].vi = _mm_loadu_si128(reinterpret_cast<const __m128i*>(window));
}
|
||||
|
||||
void spu_interpreter_fast::ROTQBYBI(SPUThread& spu, spu_opcode_t op)
{
	// Bits 3..6 of rb word 3 pick one of the rldq_pshufb control vectors.
	const u32 index = spu.gpr[op.rb]._u32[3] >> 3 & 0xf;
	const auto& control = g_spu_imm.rldq_pshufb[index];
	spu.gpr[op.rt].vi = _mm_shuffle_epi8(spu.gpr[op.ra].vi, control.vi);
}
|
||||
|
||||
void spu_interpreter::ROTQMBYBI(SPUThread& spu, spu_opcode_t op)
|
||||
void spu_interpreter_precise::ROTQMBYBI(SPUThread& spu, spu_opcode_t op)
{
	// ra followed by 32 zero bytes; reading 16 bytes starting at offset n gives
	// result byte i = source byte (i + n), or zero once i + n runs past byte 15.
	const auto a = spu.gpr[op.ra].vi;
	alignas(64) const __m128i buf[3]{a, _mm_setzero_si128(), _mm_setzero_si128()};

	// The byte count is the negated (rb word 3 >> 3), modulo 32 - the same value the
	// SSSE3 recompiler path computes with shr/neg/and before indexing srdq_pshufb.
	// Unsigned arithmetic keeps the negation well-defined for every input.
	const u32 n = (0 - (spu.gpr[op.rb]._u32[3] >> 3)) & 0x1f;
	spu.gpr[op.rt].vi = _mm_loadu_si128((__m128i*)((u8*)buf + n));
}
|
||||
|
||||
void spu_interpreter::SHLQBYBI(SPUThread& spu, spu_opcode_t op)
|
||||
void spu_interpreter_fast::ROTQMBYBI(SPUThread& spu, spu_opcode_t op)
{
	// srdq_pshufb is indexed by the negated (rb word 3 >> 3), modulo 32 - the same
	// value the SSSE3 recompiler path computes with shr/neg/and. Unsigned arithmetic
	// keeps the negation well-defined for every input.
	const u32 n = (0 - (spu.gpr[op.rb]._u32[3] >> 3)) & 0x1f;
	spu.gpr[op.rt].vi = _mm_shuffle_epi8(spu.gpr[op.ra].vi, g_spu_imm.srdq_pshufb[n].vi);
}
|
||||
|
||||
void spu_interpreter_precise::SHLQBYBI(SPUThread& spu, spu_opcode_t op)
{
	// 32 zero bytes followed by ra; reading 16 bytes starting at offset (32 - n)
	// gives result byte i = source byte (i - n) for i >= n and zero below that,
	// with n = bits 3..7 of rb word 3.
	const auto src = spu.gpr[op.ra].vi;
	alignas(64) const __m128i padded[3]{_mm_setzero_si128(), _mm_setzero_si128(), src};

	const u32 n = spu.gpr[op.rb]._u32[3] >> 3 & 0x1f;
	const u8* const window = reinterpret_cast<const u8*>(padded) + (32 - n);
	spu.gpr[op.rt].vi = _mm_loadu_si128(reinterpret_cast<const __m128i*>(window));
}
|
||||
|
||||
void spu_interpreter_fast::SHLQBYBI(SPUThread& spu, spu_opcode_t op)
{
	// Bits 3..7 of rb word 3 pick one of the sldq_pshufb control vectors.
	const u32 index = spu.gpr[op.rb]._u32[3] >> 3 & 0x1f;
	const auto& control = g_spu_imm.sldq_pshufb[index];
	spu.gpr[op.rt].vi = _mm_shuffle_epi8(spu.gpr[op.ra].vi, control.vi);
}
|
||||
@ -509,7 +534,7 @@ void spu_interpreter::ROTQBI(SPUThread& spu, spu_opcode_t op)
|
||||
{
|
||||
const auto a = spu.gpr[op.ra].vi;
|
||||
const s32 n = spu.gpr[op.rb]._s32[3] & 0x7;
|
||||
spu.gpr[op.rt].vi = _mm_or_si128(_mm_slli_epi64(a, n), _mm_srli_epi64(_mm_alignr_epi8(a, a, 8), 64 - n));
|
||||
spu.gpr[op.rt].vi = _mm_or_si128(_mm_slli_epi64(a, n), _mm_srli_epi64(_mm_shuffle_epi32(a, 0x4E), 64 - n));
|
||||
}
|
||||
|
||||
void spu_interpreter::ROTQMBI(SPUThread& spu, spu_opcode_t op)
|
||||
@ -526,17 +551,38 @@ void spu_interpreter::SHLQBI(SPUThread& spu, spu_opcode_t op)
|
||||
spu.gpr[op.rt].vi = _mm_or_si128(_mm_slli_epi64(a, n), _mm_srli_epi64(_mm_slli_si128(a, 8), 64 - n));
|
||||
}
|
||||
|
||||
void spu_interpreter::ROTQBY(SPUThread& spu, spu_opcode_t op)
|
||||
void spu_interpreter_precise::ROTQBY(SPUThread& spu, spu_opcode_t op)
{
	// Two consecutive copies of ra; reading 16 bytes starting at offset (16 - n)
	// gives result byte i = source byte (i - n) mod 16, with n = low 4 bits of rb word 3.
	const auto src = spu.gpr[op.ra].vi;
	alignas(32) const __m128i doubled[2]{src, src};

	const u32 n = spu.gpr[op.rb]._u32[3] & 0xf;
	const u8* const window = reinterpret_cast<const u8*>(doubled) + (16 - n);
	spu.gpr[op.rt].vi = _mm_loadu_si128(reinterpret_cast<const __m128i*>(window));
}
|
||||
|
||||
void spu_interpreter_fast::ROTQBY(SPUThread& spu, spu_opcode_t op)
{
	// The low 4 bits of rb word 3 pick one of the rldq_pshufb control vectors.
	const u32 index = spu.gpr[op.rb]._u32[3] & 0xf;
	const auto& control = g_spu_imm.rldq_pshufb[index];
	spu.gpr[op.rt].vi = _mm_shuffle_epi8(spu.gpr[op.ra].vi, control.vi);
}
|
||||
|
||||
void spu_interpreter::ROTQMBY(SPUThread& spu, spu_opcode_t op)
|
||||
void spu_interpreter_precise::ROTQMBY(SPUThread& spu, spu_opcode_t op)
{
	// ra followed by 32 zero bytes; reading 16 bytes starting at offset n gives
	// result byte i = source byte (i + n), or zero once i + n runs past byte 15.
	const auto a = spu.gpr[op.ra].vi;
	alignas(64) const __m128i buf[3]{a, _mm_setzero_si128(), _mm_setzero_si128()};

	// The byte count is the negated rb word 3, modulo 32 - the same value the SSSE3
	// recompiler path computes with neg/and before indexing srdq_pshufb. Unsigned
	// arithmetic keeps the negation well-defined for every input.
	const u32 n = (0 - spu.gpr[op.rb]._u32[3]) & 0x1f;
	spu.gpr[op.rt].vi = _mm_loadu_si128((__m128i*)((u8*)buf + n));
}
|
||||
|
||||
void spu_interpreter::SHLQBY(SPUThread& spu, spu_opcode_t op)
|
||||
// ROTQMBY via pshufb: the control vector comes from g_spu_imm.srdq_pshufb,
// indexed by the low 5 bits of rb._s32[3].
void spu_interpreter_fast::ROTQMBY(SPUThread& spu, spu_opcode_t op)
{
	const s32 index = spu.gpr[op.rb]._s32[3] & 0x1f;
	const auto& control = g_spu_imm.srdq_pshufb[index];

	spu.gpr[op.rt].vi = _mm_shuffle_epi8(spu.gpr[op.ra].vi, control.vi);
}
|
||||
|
||||
// SHLQBY without SSSE3: shift the quadword in ra by the low 5 bits of
// rb._u32[3] bytes, filling with zeros.
void spu_interpreter_precise::SHLQBY(SPUThread& spu, spu_opcode_t op)
{
	const __m128i src = spu.gpr[op.ra].vi;
	const __m128i zero_a = _mm_setzero_si128();
	const __m128i zero_b = _mm_setzero_si128();

	// 32 zero bytes followed by the source: loading 16 bytes starting `count`
	// bytes before the source pulls zeros into the low end of the result.
	alignas(64) const __m128i padded[3]{zero_a, zero_b, src};

	const u32 count = spu.gpr[op.rb]._u32[3] & 0x1f;
	const u8* const window = reinterpret_cast<const u8*>(padded) + (32 - count);

	spu.gpr[op.rt].vi = _mm_loadu_si128(reinterpret_cast<const __m128i*>(window));
}
|
||||
|
||||
// SHLQBY via pshufb: the control vector comes from g_spu_imm.sldq_pshufb,
// indexed by the low 5 bits of rb._u32[3].
void spu_interpreter_fast::SHLQBY(SPUThread& spu, spu_opcode_t op)
{
	const u32 count = spu.gpr[op.rb]._u32[3] & 0x1f;
	const auto& control = g_spu_imm.sldq_pshufb[count];

	spu.gpr[op.rt].vi = _mm_shuffle_epi8(spu.gpr[op.ra].vi, control.vi);
}
|
||||
@ -598,7 +644,7 @@ void spu_interpreter::ROTQBII(SPUThread& spu, spu_opcode_t op)
|
||||
{
|
||||
const auto a = spu.gpr[op.ra].vi;
|
||||
const s32 n = op.i7 & 0x7;
|
||||
spu.gpr[op.rt].vi = _mm_or_si128(_mm_slli_epi64(a, n), _mm_srli_epi64(_mm_alignr_epi8(a, a, 8), 64 - n));
|
||||
spu.gpr[op.rt].vi = _mm_or_si128(_mm_slli_epi64(a, n), _mm_srli_epi64(_mm_shuffle_epi32(a, 0x4E), 64 - n));
|
||||
}
|
||||
|
||||
void spu_interpreter::ROTQMBII(SPUThread& spu, spu_opcode_t op)
|
||||
@ -615,17 +661,38 @@ void spu_interpreter::SHLQBII(SPUThread& spu, spu_opcode_t op)
|
||||
spu.gpr[op.rt].vi = _mm_or_si128(_mm_slli_epi64(a, n), _mm_srli_epi64(_mm_slli_si128(a, 8), 64 - n));
|
||||
}
|
||||
|
||||
void spu_interpreter::ROTQBYI(SPUThread& spu, spu_opcode_t op)
|
||||
// ROTQBYI without SSSE3: byte-rotate the quadword in ra by the low 4 bits of the i7 immediate.
void spu_interpreter_precise::ROTQBYI(SPUThread& spu, spu_opcode_t op)
{
	// Two consecutive copies of the source: an unaligned 16-byte load taken at a
	// sliding offset inside this 32-byte buffer yields the rotated value.
	const __m128i src = spu.gpr[op.ra].vi;
	alignas(32) const __m128i doubled[2]{src, src};

	const auto count = op.i7 & 0xf;
	const u8* const window = reinterpret_cast<const u8*>(doubled) + (16 - count);

	spu.gpr[op.rt].vi = _mm_loadu_si128(reinterpret_cast<const __m128i*>(window));
}
|
||||
|
||||
// ROTQBYI via pshufb: the control vector comes from g_spu_imm.rldq_pshufb,
// indexed by the low 4 bits of the i7 immediate.
void spu_interpreter_fast::ROTQBYI(SPUThread& spu, spu_opcode_t op)
{
	const auto count = op.i7 & 0xf;
	const auto& control = g_spu_imm.rldq_pshufb[count];

	spu.gpr[op.rt].vi = _mm_shuffle_epi8(spu.gpr[op.ra].vi, control.vi);
}
|
||||
|
||||
void spu_interpreter::ROTQMBYI(SPUThread& spu, spu_opcode_t op)
|
||||
// ROTQMBYI without SSSE3: shift the quadword in ra towards the low bytes of the
// register by (0 - i7) & 0x1f bytes, filling vacated bytes with zero.
//
// The shift count is the NEGATED immediate masked to 5 bits. This is the same
// amount the pshufb variant applies, because g_spu_imm.srdq_pshufb[i] is built
// for a shift of (0 - i) & 0x1f.
void spu_interpreter_precise::ROTQMBYI(SPUThread& spu, spu_opcode_t op)
{
	// Capture the source before rt is written, so rt == ra works.
	const auto a = spu.gpr[op.ra].vi;

	// Source followed by 32 zero bytes: a 16-byte load at offset `shift` (0..31)
	// stays inside the 48-byte buffer and pulls in zeros past the source.
	alignas(64) const __m128i buf[3]{a, _mm_setzero_si128(), _mm_setzero_si128()};

	const u32 shift = (0 - op.i7) & 0x1f;

	spu.gpr[op.rt].vi = _mm_loadu_si128(reinterpret_cast<const __m128i*>(reinterpret_cast<const u8*>(buf) + shift));
}
|
||||
|
||||
void spu_interpreter::SHLQBYI(SPUThread& spu, spu_opcode_t op)
|
||||
// ROTQMBYI via pshufb: the control vector comes from g_spu_imm.srdq_pshufb,
// indexed by the low 5 bits of the i7 immediate.
void spu_interpreter_fast::ROTQMBYI(SPUThread& spu, spu_opcode_t op)
{
	const auto index = op.i7 & 0x1f;
	const auto& control = g_spu_imm.srdq_pshufb[index];

	spu.gpr[op.rt].vi = _mm_shuffle_epi8(spu.gpr[op.ra].vi, control.vi);
}
|
||||
|
||||
// SHLQBYI without SSSE3: shift the quadword in ra by the low 5 bits of the i7
// immediate (in bytes), filling with zeros.
void spu_interpreter_precise::SHLQBYI(SPUThread& spu, spu_opcode_t op)
{
	const __m128i src = spu.gpr[op.ra].vi;
	const __m128i zero_a = _mm_setzero_si128();
	const __m128i zero_b = _mm_setzero_si128();

	// 32 zero bytes followed by the source: loading 16 bytes starting `count`
	// bytes before the source pulls zeros into the low end of the result.
	alignas(64) const __m128i padded[3]{zero_a, zero_b, src};

	const auto count = op.i7 & 0x1f;
	const u8* const window = reinterpret_cast<const u8*>(padded) + (32 - count);

	spu.gpr[op.rt].vi = _mm_loadu_si128(reinterpret_cast<const __m128i*>(window));
}
|
||||
|
||||
// SHLQBYI via pshufb: the control vector comes from g_spu_imm.sldq_pshufb,
// indexed by the low 5 bits of the i7 immediate.
void spu_interpreter_fast::SHLQBYI(SPUThread& spu, spu_opcode_t op)
{
	const auto count = op.i7 & 0x1f;
	const auto& control = g_spu_imm.sldq_pshufb[count];

	spu.gpr[op.rt].vi = _mm_shuffle_epi8(spu.gpr[op.ra].vi, control.vi);
}
|
||||
@ -661,10 +728,21 @@ void spu_interpreter::CGTB(SPUThread& spu, spu_opcode_t op)
|
||||
|
||||
// SUMB using SSE2 only.
// For every 32-bit lane of the result, the low 16 bits hold the sum of the four
// bytes of the matching ra lane and the high 16 bits hold the sum of the four
// bytes of the matching rb lane.
void spu_interpreter::SUMB(SPUThread& spu, spu_opcode_t op)
{
	const auto m1 = _mm_set1_epi16(0xff);   // low byte of every 16-bit lane
	const auto m2 = _mm_set1_epi32(0xffff); // low halfword of every 32-bit lane
	const auto a = spu.gpr[op.ra].vi;
	const auto b = spu.gpr[op.rb].vi;

	// Step 1: add adjacent byte pairs, producing one 16-bit partial sum per halfword.
	const auto a1 = _mm_srli_epi16(a, 8);
	const auto a2 = _mm_and_si128(a, m1);
	const auto b1 = _mm_srli_epi16(b, 8);
	const auto b2 = _mm_and_si128(b, m1);
	const auto sa = _mm_add_epi16(a1, a2);
	const auto sb = _mm_add_epi16(b1, b2);

	// Step 2: fold the two partial sums of each lane together.
	// ra's total is gathered in the low halfword (high halfword stays zero) ...
	const auto s2 = _mm_and_si128(sa, m2);
	const auto s1 = _mm_srli_epi32(sa, 16);
	// ... and rb's total in the high halfword (low halfword stays zero),
	const auto s4 = _mm_andnot_si128(m2, sb);
	const auto s3 = _mm_slli_epi32(sb, 16);

	// so the two halves can simply be OR-ed together.
	spu.gpr[op.rt].vi = _mm_or_si128(_mm_add_epi16(s1, s2), _mm_add_epi16(s3, s4));
}
|
||||
|
||||
void spu_interpreter::HGT(SPUThread& spu, spu_opcode_t op)
|
||||
@ -696,10 +774,14 @@ void spu_interpreter::XSHW(SPUThread& spu, spu_opcode_t op)
|
||||
|
||||
// CNTB using SSE2 only: per-byte population count of ra, computed by summing
// neighbouring bit groups of width 1, 2 and 4 inside every byte.
void spu_interpreter::CNTB(SPUThread& spu, spu_opcode_t op)
{
	const auto a = spu.gpr[op.ra].vi;

	// 64-bit shifts are safe here: bits that cross a byte boundary are
	// removed by the mask applied right after each shift.

	// 1-bit fields -> 2-bit sums
	const auto mask1 = _mm_set1_epi8(0x55);
	const auto sum1 = _mm_add_epi8(_mm_and_si128(_mm_srli_epi64(a, 1), mask1), _mm_and_si128(a, mask1));

	// 2-bit sums -> 4-bit sums
	const auto mask2 = _mm_set1_epi8(0x33);
	const auto sum2 = _mm_add_epi8(_mm_and_si128(_mm_srli_epi64(sum1, 2), mask2), _mm_and_si128(sum1, mask2));

	// 4-bit sums -> final per-byte count
	const auto mask3 = _mm_set1_epi8(0x0f);
	const auto sum3 = _mm_add_epi8(_mm_and_si128(_mm_srli_epi64(sum2, 4), mask3), _mm_and_si128(sum2, mask3));

	spu.gpr[op.rt].vi = sum3;
}
|
||||
|
||||
void spu_interpreter::XSBH(SPUThread& spu, spu_opcode_t op)
|
||||
@ -1354,7 +1436,49 @@ void spu_interpreter::SELB(SPUThread& spu, spu_opcode_t op)
|
||||
spu.gpr[op.rt4] = (spu.gpr[op.rc] & spu.gpr[op.rb]) | v128::andnot(spu.gpr[op.rc], spu.gpr[op.ra]);
|
||||
}
|
||||
|
||||
void spu_interpreter::SHUFB(SPUThread& spu, spu_opcode_t op)
|
||||
// SHUFB without SSSE3: every result byte is fetched from a 256-entry lookup
// table indexed by the matching control byte of rc (XOR-ed with 0x0f).
void spu_interpreter_precise::SHUFB(SPUThread& spu, spu_opcode_t op)
{
	// Per-thread table. Entries 0x00..0x7f are refreshed from ra/rb on every
	// call; entries 0x80..0xff are constants and are never written again.
	alignas(16) static thread_local u8 s_lut[256]
	{
		// 0x00..0x7f: placeholder, overwritten below
		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

		// 0x80..0xbf: constant 0x00
		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
		// 0xc0..0xdf: constant 0xff
		0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
		0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
		// 0xe0..0xff: constant 0x80
		0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
		0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
	};

	const auto src_a = spu.gpr[op.ra].vi;
	const auto src_b = spu.gpr[op.rb].vi;

	// Fill 0x00..0x7f with four repetitions of the 32-byte pattern { ra, rb }.
	for (u32 base = 0; base < 0x80; base += 0x20)
	{
		_mm_store_si128((__m128i*)(s_lut + base), src_a);
		_mm_store_si128((__m128i*)(s_lut + base + 0x10), src_b);
	}

	// The selector is copied before rt4 is written, so rt4 may alias rc.
	v128 selector = v128::fromV(_mm_xor_si128(spu.gpr[op.rc].vi, _mm_set1_epi8(0xf)));
	auto& dst = spu.gpr[op.rt4];

	for (int pos = 0; pos < 16; pos++)
	{
		dst._u8[pos] = s_lut[selector._u8[pos]];
	}
}
|
||||
|
||||
void spu_interpreter_fast::SHUFB(SPUThread& spu, spu_opcode_t op)
|
||||
{
|
||||
const auto index = _mm_xor_si128(spu.gpr[op.rc].vi, _mm_set1_epi32(0x0f0f0f0f));
|
||||
const auto res1 = _mm_shuffle_epi8(spu.gpr[op.ra].vi, index);
|
||||
|
@ -66,9 +66,6 @@ struct spu_interpreter
|
||||
static void FSMH(SPUThread&, spu_opcode_t);
|
||||
static void FSMB(SPUThread&, spu_opcode_t);
|
||||
static void LQX(SPUThread&, spu_opcode_t);
|
||||
static void ROTQBYBI(SPUThread&, spu_opcode_t);
|
||||
static void ROTQMBYBI(SPUThread&, spu_opcode_t);
|
||||
static void SHLQBYBI(SPUThread&, spu_opcode_t);
|
||||
static void CBX(SPUThread&, spu_opcode_t);
|
||||
static void CHX(SPUThread&, spu_opcode_t);
|
||||
static void CWX(SPUThread&, spu_opcode_t);
|
||||
@ -76,9 +73,6 @@ struct spu_interpreter
|
||||
static void ROTQBI(SPUThread&, spu_opcode_t);
|
||||
static void ROTQMBI(SPUThread&, spu_opcode_t);
|
||||
static void SHLQBI(SPUThread&, spu_opcode_t);
|
||||
static void ROTQBY(SPUThread&, spu_opcode_t);
|
||||
static void ROTQMBY(SPUThread&, spu_opcode_t);
|
||||
static void SHLQBY(SPUThread&, spu_opcode_t);
|
||||
static void ORX(SPUThread&, spu_opcode_t);
|
||||
static void CBD(SPUThread&, spu_opcode_t);
|
||||
static void CHD(SPUThread&, spu_opcode_t);
|
||||
@ -87,9 +81,6 @@ struct spu_interpreter
|
||||
static void ROTQBII(SPUThread&, spu_opcode_t);
|
||||
static void ROTQMBII(SPUThread&, spu_opcode_t);
|
||||
static void SHLQBII(SPUThread&, spu_opcode_t);
|
||||
static void ROTQBYI(SPUThread&, spu_opcode_t);
|
||||
static void ROTQMBYI(SPUThread&, spu_opcode_t);
|
||||
static void SHLQBYI(SPUThread&, spu_opcode_t);
|
||||
static void NOP(SPUThread&, spu_opcode_t);
|
||||
static void CGT(SPUThread&, spu_opcode_t);
|
||||
static void XOR(SPUThread&, spu_opcode_t);
|
||||
@ -175,7 +166,6 @@ struct spu_interpreter
|
||||
static void HBRR(SPUThread&, spu_opcode_t);
|
||||
static void ILA(SPUThread&, spu_opcode_t);
|
||||
static void SELB(SPUThread&, spu_opcode_t);
|
||||
static void SHUFB(SPUThread&, spu_opcode_t);
|
||||
static void MPYA(SPUThread&, spu_opcode_t);
|
||||
static void DFCGT(SPUThread&, spu_opcode_t);
|
||||
static void DFCMGT(SPUThread&, spu_opcode_t);
|
||||
@ -186,6 +176,17 @@ struct spu_interpreter
|
||||
|
||||
struct spu_interpreter_fast final : spu_interpreter
|
||||
{
|
||||
static void ROTQBYBI(SPUThread&, spu_opcode_t);
|
||||
static void ROTQMBYBI(SPUThread&, spu_opcode_t);
|
||||
static void SHLQBYBI(SPUThread&, spu_opcode_t);
|
||||
static void ROTQBY(SPUThread&, spu_opcode_t);
|
||||
static void ROTQMBY(SPUThread&, spu_opcode_t);
|
||||
static void SHLQBY(SPUThread&, spu_opcode_t);
|
||||
static void ROTQBYI(SPUThread&, spu_opcode_t);
|
||||
static void ROTQMBYI(SPUThread&, spu_opcode_t);
|
||||
static void SHLQBYI(SPUThread&, spu_opcode_t);
|
||||
static void SHUFB(SPUThread&, spu_opcode_t);
|
||||
|
||||
static void FREST(SPUThread&, spu_opcode_t);
|
||||
static void FRSQEST(SPUThread&, spu_opcode_t);
|
||||
static void FCGT(SPUThread&, spu_opcode_t);
|
||||
@ -218,6 +219,17 @@ struct spu_interpreter_fast final : spu_interpreter
|
||||
|
||||
struct spu_interpreter_precise final : spu_interpreter
|
||||
{
|
||||
static void ROTQBYBI(SPUThread&, spu_opcode_t);
|
||||
static void ROTQMBYBI(SPUThread&, spu_opcode_t);
|
||||
static void SHLQBYBI(SPUThread&, spu_opcode_t);
|
||||
static void ROTQBY(SPUThread&, spu_opcode_t);
|
||||
static void ROTQMBY(SPUThread&, spu_opcode_t);
|
||||
static void SHLQBY(SPUThread&, spu_opcode_t);
|
||||
static void ROTQBYI(SPUThread&, spu_opcode_t);
|
||||
static void ROTQMBYI(SPUThread&, spu_opcode_t);
|
||||
static void SHLQBYI(SPUThread&, spu_opcode_t);
|
||||
static void SHUFB(SPUThread&, spu_opcode_t);
|
||||
|
||||
static void FREST(SPUThread&, spu_opcode_t);
|
||||
static void FRSQEST(SPUThread&, spu_opcode_t);
|
||||
static void FCGT(SPUThread&, spu_opcode_t);
|
||||
@ -246,4 +258,4 @@ struct spu_interpreter_precise final : spu_interpreter
|
||||
static void FNMS(SPUThread&, spu_opcode_t);
|
||||
static void FMA(SPUThread&, spu_opcode_t);
|
||||
static void FMS(SPUThread&, spu_opcode_t);
|
||||
};
|
||||
};
|
@ -41,7 +41,7 @@ static u32 spu_decode(u32 inst)
|
||||
}
|
||||
|
||||
// SPU decoder object. D provides functions. T is function pointer type returned.
|
||||
template<typename D, typename T = decltype(&D::UNK)>
|
||||
template <typename D, typename T = decltype(&D::UNK)>
|
||||
class spu_decoder
|
||||
{
|
||||
// Fast lookup table
|
||||
@ -271,6 +271,12 @@ public:
|
||||
}
|
||||
}
|
||||
|
||||
template <typename F>
|
||||
spu_decoder(F&& init) : spu_decoder()
|
||||
{
|
||||
init(m_table);
|
||||
}
|
||||
|
||||
const std::array<T, 2048>& get_table() const
|
||||
{
|
||||
return m_table;
|
||||
|
@ -25,6 +25,15 @@
|
||||
|
||||
const bool s_use_rtm = utils::has_rtm();
|
||||
|
||||
// Whether SSSE3 code paths may be used: queried through utils::has_ssse3() on
// MSVC builds, taken from the __SSSE3__ compiler macro otherwise.
#if defined(_MSC_VER)
const bool s_use_ssse3 = utils::has_ssse3();
#elif __SSSE3__
const bool s_use_ssse3 = true;
#else
const bool s_use_ssse3 = false;
#endif
|
||||
|
||||
#ifdef _MSC_VER
|
||||
bool operator ==(const u128& lhs, const u128& rhs)
|
||||
{
|
||||
@ -37,10 +46,60 @@ extern u64 get_system_time();
|
||||
|
||||
extern thread_local u64 g_tls_fault_spu;
|
||||
|
||||
const spu_decoder<spu_interpreter_precise> s_spu_interpreter_precise;
|
||||
const spu_decoder<spu_interpreter_fast> s_spu_interpreter_fast;
|
||||
// Table of identical interpreter functions when precise contains SSE2 version, and fast contains SSSE3 functions
|
||||
const std::pair<spu_inter_func_t, spu_inter_func_t> s_spu_dispatch_table[]
|
||||
{
|
||||
#define FUNC(x) {&spu_interpreter_precise::x, &spu_interpreter_fast::x}
|
||||
FUNC(ROTQBYBI),
|
||||
FUNC(ROTQMBYBI),
|
||||
FUNC(SHLQBYBI),
|
||||
FUNC(ROTQBY),
|
||||
FUNC(ROTQMBY),
|
||||
FUNC(SHLQBY),
|
||||
FUNC(ROTQBYI),
|
||||
FUNC(ROTQMBYI),
|
||||
FUNC(SHLQBYI),
|
||||
FUNC(SHUFB),
|
||||
#undef FUNC
|
||||
};
|
||||
|
||||
std::atomic<u64> g_num_spu_threads = { 0ull };
|
||||
// Precise interpreter table. When SSSE3 is usable, every entry that matches the
// `first` member of a s_spu_dispatch_table pair is replaced by its `second` member.
extern const spu_decoder<spu_interpreter_precise> g_spu_interpreter_precise([](auto& table)
{
	if (!s_use_ssse3)
	{
		// Keep the table exactly as the decoder built it
		return;
	}

	for (auto& entry : table)
	{
		for (const auto& variants : s_spu_dispatch_table)
		{
			if (entry == variants.first)
			{
				entry = variants.second;
				break;
			}
		}
	}
});
|
||||
|
||||
// Fast interpreter table. When SSSE3 is NOT usable, every entry that matches the
// `second` member of a s_spu_dispatch_table pair is replaced by its `first` member.
extern const spu_decoder<spu_interpreter_fast> g_spu_interpreter_fast([](auto& table)
{
	if (s_use_ssse3)
	{
		// Keep the table exactly as the decoder built it
		return;
	}

	for (auto& entry : table)
	{
		for (const auto& variants : s_spu_dispatch_table)
		{
			if (entry == variants.second)
			{
				entry = variants.first;
				break;
			}
		}
	}
});
|
||||
|
||||
std::atomic<u64> g_num_spu_threads{0ull};
|
||||
|
||||
template <>
|
||||
void fmt_class_string<spu_decoder_type>::format(std::string& out, u64 arg)
|
||||
@ -200,9 +259,11 @@ spu_imm_table_t::spu_imm_table_t()
|
||||
|
||||
for (u32 i = 0; i < sizeof(srdq_pshufb) / sizeof(srdq_pshufb[0]); i++)
|
||||
{
|
||||
const u32 im = (0u - i) & 0x1f;
|
||||
|
||||
for (u32 j = 0; j < 16; j++)
|
||||
{
|
||||
srdq_pshufb[i]._u8[j] = (j + i > 15) ? 0xff : static_cast<u8>(j + i);
|
||||
srdq_pshufb[i]._u8[j] = (j + im > 15) ? 0xff : static_cast<u8>(j + im);
|
||||
}
|
||||
}
|
||||
|
||||
@ -314,7 +375,7 @@ extern thread_local std::string(*g_tls_log_prefix)();
|
||||
void SPUThread::cpu_task()
|
||||
{
|
||||
std::fesetround(FE_TOWARDZERO);
|
||||
|
||||
|
||||
if (g_cfg.core.spu_decoder == spu_decoder_type::asmjit)
|
||||
{
|
||||
if (!spu_db) spu_db = fxm::get_always<SPUDatabase>();
|
||||
@ -330,8 +391,8 @@ void SPUThread::cpu_task()
|
||||
|
||||
// Select opcode table
|
||||
const auto& table = *(
|
||||
g_cfg.core.spu_decoder == spu_decoder_type::precise ? &s_spu_interpreter_precise.get_table() :
|
||||
g_cfg.core.spu_decoder == spu_decoder_type::fast ? &s_spu_interpreter_fast.get_table() :
|
||||
g_cfg.core.spu_decoder == spu_decoder_type::precise ? &g_spu_interpreter_precise.get_table() :
|
||||
g_cfg.core.spu_decoder == spu_decoder_type::fast ? &g_spu_interpreter_fast.get_table() :
|
||||
(fmt::throw_exception<std::logic_error>("Invalid SPU decoder"), nullptr));
|
||||
|
||||
// LS base address
|
||||
@ -803,7 +864,7 @@ void SPUThread::process_mfc_cmd()
|
||||
do_dma_transfer(ch_mfc_cmd, false);
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
break;
|
||||
}
|
||||
case MFC_PUTL_CMD:
|
||||
@ -831,7 +892,7 @@ void SPUThread::process_mfc_cmd()
|
||||
be_t<u16> ts;
|
||||
be_t<u32> ea;
|
||||
};
|
||||
|
||||
|
||||
u32 total_size = 0;
|
||||
|
||||
while (ch_mfc_cmd.size && total_size <= max_imm_dma_size)
|
||||
@ -1156,7 +1217,7 @@ bool SPUThread::get_ch_value(u32 ch, u32& out)
|
||||
|
||||
thread_ctrl::wait_for(100);
|
||||
}
|
||||
|
||||
|
||||
out = res;
|
||||
return true;
|
||||
}
|
||||
@ -1184,7 +1245,7 @@ bool SPUThread::set_ch_value(u32 ch, u32 value)
|
||||
srr0 = value;
|
||||
break;
|
||||
}
|
||||
|
||||
|
||||
case SPU_WrOutIntrMbox:
|
||||
{
|
||||
if (offset >= RAW_SPU_BASE_ADDR)
|
||||
@ -1202,7 +1263,7 @@ bool SPUThread::set_ch_value(u32 ch, u32 value)
|
||||
int_ctrl[2].set(SPU_INT2_STAT_MAILBOX_INT);
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
const u32 code = value >> 24;
|
||||
{
|
||||
if (code < 64)
|
||||
@ -1392,7 +1453,7 @@ bool SPUThread::set_ch_value(u32 ch, u32 value)
|
||||
else
|
||||
{
|
||||
auto mfc = fxm::check_unlocked<mfc_thread>();
|
||||
|
||||
|
||||
//if (test(mfc->state, cpu_flag::is_waiting))
|
||||
{
|
||||
mfc->notify();
|
||||
@ -1447,7 +1508,7 @@ bool SPUThread::set_ch_value(u32 ch, u32 value)
|
||||
if (atomic_storage<u32>::btr(ch_stall_mask.raw(), value))
|
||||
{
|
||||
auto mfc = fxm::check_unlocked<mfc_thread>();
|
||||
|
||||
|
||||
//if (test(mfc->state, cpu_flag::is_waiting))
|
||||
{
|
||||
mfc->notify();
|
||||
@ -1687,7 +1748,7 @@ bool SPUThread::stop_and_signal(u32 code)
|
||||
}
|
||||
|
||||
semaphore_lock lock(group->mutex);
|
||||
|
||||
|
||||
if (group->run_state == SPU_THREAD_GROUP_STATUS_WAITING)
|
||||
{
|
||||
group->run_state = SPU_THREAD_GROUP_STATUS_RUNNING;
|
||||
|
Loading…
x
Reference in New Issue
Block a user