Mirror of https://github.com/RPCS3/rpcs3.git
PPU: refactor shift and splat instructions

Fix utils::rol32/64 functions. Fix immediate clamping in splat instructions. Other fixes.

parent d92008abe4
commit b42fae0989
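Background on the utils::rol32/64 change in the diff below: the old generic fallback (x << n) | (x >> (32 - n)) shifts by 32 when n == 0, which is undefined behavior in C++, while the new form masks both shift counts so every value of n is defined and the expression still compiles to a single rotate instruction on x86. A minimal sketch of the same idiom (rol32_sketch is a hypothetical name used for illustration, not RPCS3 code):

#include <cstdint>

// Well defined for every n, including n == 0: both shift counts are reduced
// mod 32, and ((0 - n) & 31) yields a 0-bit right shift where (32 - n)
// would produce an undefined 32-bit shift.
constexpr std::uint32_t rol32_sketch(std::uint32_t x, std::uint32_t n) noexcept
{
    return (x << (n & 31)) | (x >> ((0 - n) & 31));
}

static_assert(rol32_sketch(0x80000001u, 0) == 0x80000001u); // n == 0 is now defined
static_assert(rol32_sketch(0x80000001u, 1) == 0x00000003u); // ordinary rotate

The immediate-clamping part of the commit is analogous: VSPLTB/VSPLTH/VSPLTW now mask the element index (imm & 15, imm & 7, imm & 3) instead of asserting that it is in range.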
@@ -247,10 +247,16 @@ inline FT build_function_asm(std::string_view name, F&& builder)
 	Asm compiler(&code);
 	compiler.addEncodingOptions(EncodingOptions::kOptimizedAlign);
-	if constexpr (std::is_invocable_v<F, Asm&, native_args&>)
-		builder(compiler, args);
+	if constexpr (std::is_invocable_r_v<bool, F, Asm&, native_args&>)
+	{
+		if (!builder(compiler, args))
+			return nullptr;
+	}
 	else
-		builder(compiler);
+	{
+		builder(compiler, args);
+	}
 
 	rt.dump_name = name;
 	const auto result = rt._add(&code);
 	jit_announce(result, code.codeSize(), name);
@@ -111,13 +111,15 @@ struct ppu_exec_select
 #define RETURN(...) \
 	if constexpr (Build == 0) { \
-		static const ppu_intrp_func_t f = build_function_asm<ppu_intrp_func_t, asmjit::ppu_builder>("ppu_"s + __func__, [&](asmjit::ppu_builder& c) { \
+		static_cast<void>(exec); \
+		static const ppu_intrp_func_t f = build_function_asm<ppu_intrp_func_t, asmjit::ppu_builder>("ppu_"s + __func__, [&](asmjit::ppu_builder& c, native_args&) { \
 			static ppu_opcode_t op{}; \
 			static ppu_abstract_t ppu; \
 			exec(__VA_ARGS__); \
 			c.ppu_ret(); \
+			return !c.fail_flag; \
 		}); \
-		return f; \
+		if (f) return f; \
+		RETURN_(__VA_ARGS__); \
 	}
 #else
 #define RETURN RETURN_
@@ -1019,7 +1021,7 @@ auto VADDUWS()
 		}
 	};
 
-	RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb], ppu.sat);
+	RETURN(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb], ppu.sat);
 }
 
 template <u32 Build, ppu_exec_bit... Flags>
@@ -2074,7 +2076,7 @@ auto VNOR()
 		d = gv_notfs(gv_orfs(std::move(a), std::move(b)));
 	};
 
-	RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]);
+	RETURN(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]);
 }
 
 template <u32 Build, ppu_exec_bit... Flags>
@@ -2100,7 +2102,7 @@ auto VPERM()
 #if defined (ARCH_X64)
 	if constexpr (Build == 0)
 	{
-		static const ppu_intrp_func_t f = build_function_asm<ppu_intrp_func_t, asmjit::ppu_builder>("ppu_VPERM", [&](asmjit::ppu_builder& c)
+		static const ppu_intrp_func_t f = build_function_asm<ppu_intrp_func_t, asmjit::ppu_builder>("ppu_VPERM", [&](asmjit::ppu_builder& c, native_args&)
 		{
 			const auto [v0, v1, v2, v3] = c.vec_alloc<4>();
 			c.movdqa(v0, c.ppu_vr(s_op.vc));
@@ -2374,17 +2376,12 @@ auto VRLB()
 	if constexpr (Build == 0xf1a6)
 		return ppu_exec_select<Flags...>::template select<>();
 
-	static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) {
-		auto& d = ppu.vr[op.vd];
-		const auto& a = ppu.vr[op.va];
-		const auto& b = ppu.vr[op.vb];
-
-		for (uint i = 0; i < 16; i++)
-		{
-			d._u8[i] = utils::rol8(a._u8[i], b._u8[i]);
-		}
-	};
-	RETURN_(ppu, op);
+	static const auto exec = [](auto&& d, auto&& a, auto&& b)
+	{
+		d = gv_rol8(std::move(a), std::move(b));
+	};
+
+	RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]);
 }
 
 template <u32 Build, ppu_exec_bit... Flags>
@@ -2393,17 +2390,12 @@ auto VRLH()
 	if constexpr (Build == 0xf1a6)
 		return ppu_exec_select<Flags...>::template select<>();
 
-	static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) {
-		auto& d = ppu.vr[op.vd];
-		const auto& a = ppu.vr[op.va];
-		const auto& b = ppu.vr[op.vb];
-
-		for (uint i = 0; i < 8; i++)
-		{
-			d._u16[i] = utils::rol16(a._u16[i], b._u8[i * 2] & 0xf);
-		}
-	};
-	RETURN_(ppu, op);
+	static const auto exec = [](auto&& d, auto&& a, auto&& b)
+	{
+		d = gv_rol16(std::move(a), std::move(b));
+	};
+
+	RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]);
 }
 
 template <u32 Build, ppu_exec_bit... Flags>
@@ -2412,17 +2404,12 @@ auto VRLW()
 	if constexpr (Build == 0xf1a6)
 		return ppu_exec_select<Flags...>::template select<>();
 
-	static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) {
-		auto& d = ppu.vr[op.vd];
-		const auto& a = ppu.vr[op.va];
-		const auto& b = ppu.vr[op.vb];
-
-		for (uint w = 0; w < 4; w++)
-		{
-			d._u32[w] = utils::rol32(a._u32[w], b._u8[w * 4] & 0x1f);
-		}
-	};
-	RETURN_(ppu, op);
+	static const auto exec = [](auto&& d, auto&& a, auto&& b)
+	{
+		d = gv_rol32(std::move(a), std::move(b));
+	};
+
+	RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]);
 }
 
 template <u32 Build, ppu_exec_bit... Flags>
@@ -2447,15 +2434,13 @@ auto VSEL()
 	if constexpr (Build == 0xf1a6)
 		return ppu_exec_select<Flags...>::template select<>();
 
-	static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) {
-		auto& d = ppu.vr[op.vd];
-		const auto& a = ppu.vr[op.va];
-		const auto& b = ppu.vr[op.vb];
-		const auto& c = ppu.vr[op.vc];
-
-		d = (b & c) | gv_andn(c, a);
-	};
-	RETURN_(ppu, op);
+	static const auto exec = [](auto&& d, auto&& a, auto&& b, auto&& c)
+	{
+		auto x = gv_andfs(std::move(b), c);
+		d = gv_orfs(std::move(x), gv_andnfs(std::move(c), std::move(a)));
+	};
+
+	RETURN(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb], ppu.vr[op.vc]);
 }
 
 template <u32 Build, ppu_exec_bit... Flags>
@@ -2464,19 +2449,12 @@ auto VSL()
 	if constexpr (Build == 0xf1a6)
 		return ppu_exec_select<Flags...>::template select<>();
 
-	static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) {
-		auto& d = ppu.vr[op.vd];
-		v128 VA = ppu.vr[op.va];
-		u8 sh = ppu.vr[op.vb]._u8[0] & 0x7;
-
-		d._u8[0] = VA._u8[0] << sh;
-		for (uint b = 1; b < 16; b++)
-		{
-			sh = ppu.vr[op.vb]._u8[b] & 0x7;
-			d._u8[b] = (VA._u8[b] << sh) | (VA._u8[b - 1] >> (8 - sh));
-		}
-	};
-	RETURN_(ppu, op);
+	static const auto exec = [](auto&& d, auto&& a, auto&& b)
+	{
+		d = gv_fshl8(std::move(a), gv_shuffle_left<1>(a), std::move(b));
+	};
+
+	RETURN(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]);
 }
 
 template <u32 Build, ppu_exec_bit... Flags>
@@ -2485,38 +2463,35 @@ auto VSLB()
 	if constexpr (Build == 0xf1a6)
 		return ppu_exec_select<Flags...>::template select<>();
 
-	static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) {
-		auto& d = ppu.vr[op.vd];
-		const auto& a = ppu.vr[op.va];
-		const auto& b = ppu.vr[op.vb];
-
-		for (uint i = 0; i < 16; i++)
-		{
-			d._u8[i] = a._u8[i] << (b._u8[i] & 0x7);
-		}
-	};
-	RETURN_(ppu, op);
+	static const auto exec = [](auto&& d, auto&& a, auto&& b)
+	{
+		d = gv_shl8(std::move(a), std::move(b));
+	};
+
+	RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]);
 }
 
+template <u32 Count>
+struct VSLDOI
+{
+	template <ppu_exec_bit... Flags>
+	static auto select(bs_t<ppu_exec_bit> selected, auto func)
+	{
+		return ppu_exec_select<>::select<Flags...>(selected, func);
+	}
+
 	template <u32 Build, ppu_exec_bit... Flags>
-	auto VSLDOI()
+	static auto impl()
 	{
 		if constexpr (Build == 0xf1a6)
 			return ppu_exec_select<Flags...>::template select<>();
 
-		static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) {
-			auto& d = ppu.vr[op.vd];
-			u8 tmpSRC[32];
-			std::memcpy(tmpSRC, &ppu.vr[op.vb], 16);
-			std::memcpy(tmpSRC + 16, &ppu.vr[op.va], 16);
-
-			for (uint b = 0; b<16; b++)
-			{
-				d._u8[15 - b] = tmpSRC[31 - (b + op.vsh)];
-			}
-		};
-		RETURN_(ppu, op);
+		static const auto exec = [](auto&& d, auto&& a, auto&& b)
+		{
+			d = gv_or32(gv_shuffle_left<Count>(std::move(a)), gv_shuffle_right<16 - Count>(std::move(b)));
+		};
+
+		RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]);
 	}
+};
 
 template <u32 Build, ppu_exec_bit... Flags>
 auto VSLH()
@@ -2524,17 +2499,12 @@ auto VSLH()
 	if constexpr (Build == 0xf1a6)
 		return ppu_exec_select<Flags...>::template select<>();
 
-	static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) {
-		auto& d = ppu.vr[op.vd];
-		const auto& a = ppu.vr[op.va];
-		const auto& b = ppu.vr[op.vb];
-
-		for (uint h = 0; h < 8; h++)
-		{
-			d._u16[h] = a._u16[h] << (b._u16[h] & 0xf);
-		}
-	};
-	RETURN_(ppu, op);
+	static const auto exec = [](auto&& d, auto&& a, auto&& b)
+	{
+		d = gv_shl16(std::move(a), std::move(b));
+	};
+
+	RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]);
 }
 
 template <u32 Build, ppu_exec_bit... Flags>
@@ -2543,19 +2513,12 @@ auto VSLO()
 	if constexpr (Build == 0xf1a6)
 		return ppu_exec_select<Flags...>::template select<>();
 
-	static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) {
-		auto& d = ppu.vr[op.vd];
-		v128 VA = ppu.vr[op.va];
-		u8 nShift = (ppu.vr[op.vb]._u8[0] >> 3) & 0xf;
-
-		d.clear();
-
-		for (u8 b = 0; b < 16 - nShift; b++)
-		{
-			d._u8[15 - b] = VA._u8[15 - (b + nShift)];
-		}
-	};
-	RETURN_(ppu, op);
+	static const auto exec = [](auto&& d, auto&& a, auto&& b)
+	{
+		d._u = a._u << (b._u8[0] & 0x78);
+	};
+
+	RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]);
 }
 
 template <u32 Build, ppu_exec_bit... Flags>
@@ -2564,17 +2527,12 @@ auto VSLW()
 	if constexpr (Build == 0xf1a6)
 		return ppu_exec_select<Flags...>::template select<>();
 
-	static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) {
-		auto& d = ppu.vr[op.vd];
-		const auto& a = ppu.vr[op.va];
-		const auto& b = ppu.vr[op.vb];
-
-		for (uint w = 0; w < 4; w++)
-		{
-			d._u32[w] = a._u32[w] << (b._u32[w] & 0x1f);
-		}
-	};
-	RETURN_(ppu, op);
+	static const auto exec = [](auto&& d, auto&& a, auto&& b)
+	{
+		d = gv_shl32(std::move(a), std::move(b));
+	};
+
+	RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]);
 }
 
 template <u32 Build, ppu_exec_bit... Flags>
@@ -2583,16 +2541,12 @@ auto VSPLTB()
 	if constexpr (Build == 0xf1a6)
 		return ppu_exec_select<Flags...>::template select<>();
 
-	static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) {
-		auto& d = ppu.vr[op.vd];
-		u8 byte = ppu.vr[op.vb]._u8[15 - op.vuimm];
-
-		for (uint b = 0; b < 16; b++)
-		{
-			d._u8[b] = byte;
-		}
-	};
-	RETURN_(ppu, op);
+	static const auto exec = [](auto&& d, auto&& b, auto&& imm)
+	{
+		d = gv_bcst8(b.u8r[imm & 15]);
+	};
+
+	RETURN_(ppu.vr[op.vd], ppu.vr[op.vb], op.vuimm);
 }
 
 template <u32 Build, ppu_exec_bit... Flags>
@@ -2601,18 +2555,12 @@ auto VSPLTH()
 	if constexpr (Build == 0xf1a6)
 		return ppu_exec_select<Flags...>::template select<>();
 
-	static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) {
-		auto& d = ppu.vr[op.vd];
-		ensure((op.vuimm < 8));
-
-		u16 hword = ppu.vr[op.vb]._u16[7 - op.vuimm];
-
-		for (uint h = 0; h < 8; h++)
-		{
-			d._u16[h] = hword;
-		}
-	};
-	RETURN_(ppu, op);
+	static const auto exec = [](auto&& d, auto&& b, auto&& imm)
+	{
+		d = gv_bcst16(b.u16r[imm & 7]);
+	};
+
+	RETURN_(ppu.vr[op.vd], ppu.vr[op.vb], op.vuimm);
 }
 
 template <u32 Build, ppu_exec_bit... Flags>
@@ -2621,16 +2569,12 @@ auto VSPLTISB()
 	if constexpr (Build == 0xf1a6)
 		return ppu_exec_select<Flags...>::template select<>();
 
-	static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) {
-		auto& d = ppu.vr[op.vd];
-		const s8 imm = op.vsimm;
-
-		for (uint b = 0; b < 16; b++)
-		{
-			d._u8[b] = imm;
-		}
-	};
-	RETURN_(ppu, op);
+	static const auto exec = [](auto&& d, auto&& imm)
+	{
+		d = gv_bcst8(imm);
+	};
+
+	RETURN_(ppu.vr[op.vd], op.vsimm);
 }
 
 template <u32 Build, ppu_exec_bit... Flags>
@@ -2639,16 +2583,12 @@ auto VSPLTISH()
 	if constexpr (Build == 0xf1a6)
 		return ppu_exec_select<Flags...>::template select<>();
 
-	static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) {
-		auto& d = ppu.vr[op.vd];
-		const s16 imm = op.vsimm;
-
-		for (uint h = 0; h < 8; h++)
-		{
-			d._u16[h] = imm;
-		}
-	};
-	RETURN_(ppu, op);
+	static const auto exec = [](auto&& d, auto&& imm)
+	{
+		d = gv_bcst16(imm);
+	};
+
+	RETURN_(ppu.vr[op.vd], op.vsimm);
 }
 
 template <u32 Build, ppu_exec_bit... Flags>
@@ -2657,16 +2597,12 @@ auto VSPLTISW()
 	if constexpr (Build == 0xf1a6)
 		return ppu_exec_select<Flags...>::template select<>();
 
-	static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) {
-		auto& d = ppu.vr[op.vd];
-		const s32 imm = op.vsimm;
-
-		for (uint w = 0; w < 4; w++)
-		{
-			d._u32[w] = imm;
-		}
-	};
-	RETURN_(ppu, op);
+	static const auto exec = [](auto&& d, auto&& imm)
+	{
+		d = gv_bcst32(imm);
+	};
+
+	RETURN_(ppu.vr[op.vd], op.vsimm);
 }
 
 template <u32 Build, ppu_exec_bit... Flags>
@@ -2675,18 +2611,12 @@ auto VSPLTW()
 	if constexpr (Build == 0xf1a6)
 		return ppu_exec_select<Flags...>::template select<>();
 
-	static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) {
-		auto& d = ppu.vr[op.vd];
-		ensure((op.vuimm < 4));
-
-		u32 word = ppu.vr[op.vb]._u32[3 - op.vuimm];
-
-		for (uint w = 0; w < 4; w++)
-		{
-			d._u32[w] = word;
-		}
-	};
-	RETURN_(ppu, op);
+	static const auto exec = [](auto&& d, auto&& b, auto&& imm)
+	{
+		d = gv_bcst32(b.u32r[imm & 3]);
+	};
+
+	RETURN_(ppu.vr[op.vd], ppu.vr[op.vb], op.vuimm);
 }
 
 template <u32 Build, ppu_exec_bit... Flags>
@@ -2695,19 +2625,12 @@ auto VSR()
 	if constexpr (Build == 0xf1a6)
 		return ppu_exec_select<Flags...>::template select<>();
 
-	static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) {
-		auto& d = ppu.vr[op.vd];
-		v128 VA = ppu.vr[op.va];
-		u8 sh = ppu.vr[op.vb]._u8[15] & 0x7;
-
-		d._u8[15] = VA._u8[15] >> sh;
-		for (uint b = 14; ~b; b--)
-		{
-			sh = ppu.vr[op.vb]._u8[b] & 0x7;
-			d._u8[b] = (VA._u8[b] >> sh) | (VA._u8[b + 1] << (8 - sh));
-		}
-	};
-	RETURN_(ppu, op);
+	static const auto exec = [](auto&& d, auto&& a, auto&& b)
+	{
+		d = gv_fshr8(gv_shuffle_right<1>(a), std::move(a), std::move(b));
+	};
+
+	RETURN(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]);
 }
 
 template <u32 Build, ppu_exec_bit... Flags>
@@ -2716,17 +2639,12 @@ auto VSRAB()
 	if constexpr (Build == 0xf1a6)
 		return ppu_exec_select<Flags...>::template select<>();
 
-	static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) {
-		auto& d = ppu.vr[op.vd];
-		const auto& a = ppu.vr[op.va];
-		const auto& b = ppu.vr[op.vb];
-
-		for (uint i = 0; i < 16; i++)
-		{
-			d._s8[i] = a._s8[i] >> (b._u8[i] & 0x7);
-		}
-	};
-	RETURN_(ppu, op);
+	static const auto exec = [](auto&& d, auto&& a, auto&& b)
+	{
+		d = gv_sar8(std::move(a), std::move(b));
+	};
+
+	RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]);
 }
 
 template <u32 Build, ppu_exec_bit... Flags>
@@ -2735,17 +2653,12 @@ auto VSRAH()
 	if constexpr (Build == 0xf1a6)
 		return ppu_exec_select<Flags...>::template select<>();
 
-	static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) {
-		auto& d = ppu.vr[op.vd];
-		const auto& a = ppu.vr[op.va];
-		const auto& b = ppu.vr[op.vb];
-
-		for (uint h = 0; h < 8; h++)
-		{
-			d._s16[h] = a._s16[h] >> (b._u16[h] & 0xf);
-		}
-	};
-	RETURN_(ppu, op);
+	static const auto exec = [](auto&& d, auto&& a, auto&& b)
+	{
+		d = gv_sar16(std::move(a), std::move(b));
+	};
+
+	RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]);
 }
 
 template <u32 Build, ppu_exec_bit... Flags>
@@ -2754,17 +2667,12 @@ auto VSRAW()
 	if constexpr (Build == 0xf1a6)
 		return ppu_exec_select<Flags...>::template select<>();
 
-	static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) {
-		auto& d = ppu.vr[op.vd];
-		const auto& a = ppu.vr[op.va];
-		const auto& b = ppu.vr[op.vb];
-
-		for (uint w = 0; w < 4; w++)
-		{
-			d._s32[w] = a._s32[w] >> (b._u32[w] & 0x1f);
-		}
-	};
-	RETURN_(ppu, op);
+	static const auto exec = [](auto&& d, auto&& a, auto&& b)
+	{
+		d = gv_sar32(std::move(a), std::move(b));
+	};
+
+	RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]);
 }
 
 template <u32 Build, ppu_exec_bit... Flags>
@@ -2773,17 +2681,12 @@ auto VSRB()
 	if constexpr (Build == 0xf1a6)
 		return ppu_exec_select<Flags...>::template select<>();
 
-	static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) {
-		auto& d = ppu.vr[op.vd];
-		const auto& a = ppu.vr[op.va];
-		const auto& b = ppu.vr[op.vb];
-
-		for (uint i = 0; i < 16; i++)
-		{
-			d._u8[i] = a._u8[i] >> (b._u8[i] & 0x7);
-		}
-	};
-	RETURN_(ppu, op);
+	static const auto exec = [](auto&& d, auto&& a, auto&& b)
+	{
+		d = gv_shr8(std::move(a), std::move(b));
+	};
+
+	RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]);
 }
 
 template <u32 Build, ppu_exec_bit... Flags>
@@ -2792,17 +2695,12 @@ auto VSRH()
 	if constexpr (Build == 0xf1a6)
 		return ppu_exec_select<Flags...>::template select<>();
 
-	static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) {
-		auto& d = ppu.vr[op.vd];
-		const auto& a = ppu.vr[op.va];
-		const auto& b = ppu.vr[op.vb];
-
-		for (uint h = 0; h < 8; h++)
-		{
-			d._u16[h] = a._u16[h] >> (b._u16[h] & 0xf);
-		}
-	};
-	RETURN_(ppu, op);
+	static const auto exec = [](auto&& d, auto&& a, auto&& b)
+	{
+		d = gv_shr16(std::move(a), std::move(b));
+	};
+
+	RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]);
 }
 
 template <u32 Build, ppu_exec_bit... Flags>
@@ -2811,19 +2709,12 @@ auto VSRO()
 	if constexpr (Build == 0xf1a6)
 		return ppu_exec_select<Flags...>::template select<>();
 
-	static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) {
-		auto& d = ppu.vr[op.vd];
-		v128 VA = ppu.vr[op.va];
-		u8 nShift = (ppu.vr[op.vb]._u8[0] >> 3) & 0xf;
-
-		d.clear();
-
-		for (u8 b = 0; b < 16 - nShift; b++)
-		{
-			d._u8[b] = VA._u8[b + nShift];
-		}
-	};
-	RETURN_(ppu, op);
+	static const auto exec = [](auto&& d, auto&& a, auto&& b)
+	{
+		d._u = a._u >> (b._u8[0] & 0x78);
+	};
+
+	RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]);
 }
 
 template <u32 Build, ppu_exec_bit... Flags>
@@ -2832,17 +2723,12 @@ auto VSRW()
 	if constexpr (Build == 0xf1a6)
 		return ppu_exec_select<Flags...>::template select<>();
 
-	static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) {
-		auto& d = ppu.vr[op.vd];
-		const auto& a = ppu.vr[op.va];
-		const auto& b = ppu.vr[op.vb];
-
-		for (uint w = 0; w < 4; w++)
-		{
-			d._u32[w] = a._u32[w] >> (b._u32[w] & 0x1f);
-		}
-	};
-	RETURN_(ppu, op);
+	static const auto exec = [](auto&& d, auto&& a, auto&& b)
+	{
+		d = gv_shr32(std::move(a), std::move(b));
+	};
+
+	RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]);
 }
 
 template <u32 Build, ppu_exec_bit... Flags>
@@ -3184,30 +3070,14 @@ auto VUPKHPX()
 	if constexpr (Build == 0xf1a6)
 		return ppu_exec_select<Flags...>::template select<>();
 
-#if defined(ARCH_X64_0)
-	static const auto make = [](asmjit::ppu_builder& c)
-	{
-		const auto [v0, v1, v2] = c.vec_alloc<3>();
-		EMIT(punpckhwd, v0, v0, c.ppu_vr(s_op.vb));
-		EMIT(psrad, v0, v0, c.imm(16));
-		EMIT(pslld, v1, v0, c.imm(6));
-		EMIT(pslld, v2, v0, c.imm(3));
-		BCST(pand, d, v0, v0, c.get_bcst<u32>(0xff00001f));
-		BCST(pand, d, v1, v1, c.get_bcst<u32>(0x1f0000));
-		BCST(pand, d, v2, v2, c.get_bcst<u32>(0x1f00));
-		EMIT(por, v0, v0, v1);
-		EMIT(por, v0, v0, v2);
-		LDST(movaps, c.ppu_vr(s_op.vd, true), v0);
-		c.ppu_ret();
-	};
-#endif
 	static const auto exec = [](auto&& d, auto&& b)
 	{
-		const auto x = gv_extend_hi_s16(b);
-		d = gv_and32(x, gv_bcst32(0xff00001f)) | gv_and32(gv_shl32(x, 6), gv_bcst32(0x1f0000)) | gv_and32(gv_shl32(x, 3), gv_bcst32(0x1f00));
+		auto x = gv_extend_hi_s16(std::move(b));
+		auto y = gv_or32(gv_and32(gv_shl32(x, 6), gv_bcst32(0x1f0000)), gv_and32(gv_shl32(x, 3), gv_bcst32(0x1f00)));
+		d = gv_or32(std::move(y), gv_and32(std::move(x), gv_bcst32(0xff00001f)));
 	};
 
-	RETURN_(ppu.vr[op.vd], ppu.vr[op.vb]);
+	RETURN(ppu.vr[op.vd], ppu.vr[op.vb]);
 }
 
 template <u32 Build, ppu_exec_bit... Flags>
@@ -3216,22 +3086,12 @@ auto VUPKHSB()
 	if constexpr (Build == 0xf1a6)
 		return ppu_exec_select<Flags...>::template select<>();
 
-#if defined(ARCH_X64_0)
-	static const auto make = [](asmjit::ppu_builder& c)
-	{
-		const auto v0 = c.vec_alloc();
-		EMIT(punpckhbw, v0, v0, c.ppu_vr(s_op.vb));
-		EMIT(psraw, v0, v0, c.imm(8));
-		LDST(movaps, c.ppu_vr(s_op.vd, true), v0);
-		c.ppu_ret();
-	};
-#endif
 	static const auto exec = [](auto&& d, auto&& b)
 	{
-		d = gv_extend_hi_s8(b);
+		d = gv_extend_hi_s8(std::move(b));
 	};
 
-	RETURN_(ppu.vr[op.vd], ppu.vr[op.vb]);
+	RETURN(ppu.vr[op.vd], ppu.vr[op.vb]);
 }
 
 template <u32 Build, ppu_exec_bit... Flags>
@@ -3240,22 +3100,12 @@ auto VUPKHSH()
 	if constexpr (Build == 0xf1a6)
 		return ppu_exec_select<Flags...>::template select<>();
 
-#if defined(ARCH_X64_0)
-	static const auto make = [](asmjit::ppu_builder& c)
-	{
-		const auto v0 = c.vec_alloc();
-		EMIT(punpckhwd, v0, v0, c.ppu_vr(s_op.vb));
-		EMIT(psrad, v0, v0, c.imm(16));
-		LDST(movaps, c.ppu_vr(s_op.vd, true), v0);
-		c.ppu_ret();
-	};
-#endif
 	static const auto exec = [](auto&& d, auto&& b)
 	{
-		d = gv_extend_hi_s16(b);
+		d = gv_extend_hi_s16(std::move(b));
 	};
 
-	RETURN_(ppu.vr[op.vd], ppu.vr[op.vb]);
+	RETURN(ppu.vr[op.vd], ppu.vr[op.vb]);
 }
 
 template <u32 Build, ppu_exec_bit... Flags>
@@ -3264,37 +3114,14 @@ auto VUPKLPX()
 	if constexpr (Build == 0xf1a6)
 		return ppu_exec_select<Flags...>::template select<>();
 
-#if defined(ARCH_X64_0)
-	static const auto make = [](asmjit::ppu_builder& c)
-	{
-		const auto [v0, v1, v2] = c.vec_alloc<3>();
-		if (utils::has_sse41())
-		{
-			LDST(pmovsxwd, v0, c.ppu_vr<8>(s_op.vb));
-		}
-		else
-		{
-			EMIT(punpcklwd, v0, v0, c.ppu_vr(s_op.vb));
-			EMIT(psrad, v0, v0, c.imm(16));
-		}
-		EMIT(pslld, v1, v0, c.imm(6));
-		EMIT(pslld, v2, v0, c.imm(3));
-		BCST(pand, d, v0, v0, c.get_bcst<u32>(0xff00001f));
-		BCST(pand, d, v1, v1, c.get_bcst<u32>(0x1f0000));
-		BCST(pand, d, v2, v2, c.get_bcst<u32>(0x1f00));
-		EMIT(por, v0, v0, v1);
-		EMIT(por, v0, v0, v2);
-		LDST(movaps, c.ppu_vr(s_op.vd, true), v0);
-		c.ppu_ret();
-	};
-#endif
 	static const auto exec = [](auto&& d, auto&& b)
 	{
-		const auto x = gv_extend_lo_s16(b);
-		d = gv_and32(x, gv_bcst32(0xff00001f)) | gv_and32(gv_shl32(x, 6), gv_bcst32(0x1f0000)) | gv_and32(gv_shl32(x, 3), gv_bcst32(0x1f00));
+		auto x = gv_extend_lo_s16(std::move(b));
+		auto y = gv_or32(gv_and32(gv_shl32(x, 6), gv_bcst32(0x1f0000)), gv_and32(gv_shl32(x, 3), gv_bcst32(0x1f00)));
+		d = gv_or32(std::move(y), gv_and32(std::move(x), gv_bcst32(0xff00001f)));
 	};
 
-	RETURN_(ppu.vr[op.vd], ppu.vr[op.vb]);
+	RETURN(ppu.vr[op.vd], ppu.vr[op.vb]);
 }
 
 template <u32 Build, ppu_exec_bit... Flags>
@@ -3303,29 +3130,12 @@ auto VUPKLSB()
 	if constexpr (Build == 0xf1a6)
 		return ppu_exec_select<Flags...>::template select<>();
 
-#if defined(ARCH_X64_0)
-	static const auto make = [](asmjit::ppu_builder& c)
-	{
-		const auto v0 = c.vec_alloc();
-		if (utils::has_sse41())
-		{
-			LDST(pmovsxbw, v0, c.ppu_vr<8>(s_op.vb));
-		}
-		else
-		{
-			EMIT(punpcklbw, v0, v0, c.ppu_vr(s_op.vb));
-			EMIT(psraw, v0, v0, c.imm(8));
-		}
-		LDST(movaps, c.ppu_vr(s_op.vd, true), v0);
-		c.ppu_ret();
-	};
-#endif
 	static const auto exec = [](auto&& d, auto&& b)
 	{
-		d = gv_extend_lo_s8(b);
+		d = gv_extend_lo_s8(std::move(b));
 	};
 
-	RETURN_(ppu.vr[op.vd], ppu.vr[op.vb]);
+	RETURN(ppu.vr[op.vd], ppu.vr[op.vb]);
 }
 
 template <u32 Build, ppu_exec_bit... Flags>
@@ -3334,29 +3144,12 @@ auto VUPKLSH()
 	if constexpr (Build == 0xf1a6)
 		return ppu_exec_select<Flags...>::template select<>();
 
-#if defined(ARCH_X64_0)
-	static const auto make = [](asmjit::ppu_builder& c)
-	{
-		const auto v0 = c.vec_alloc();
-		if (utils::has_sse41())
-		{
-			LDST(pmovsxwd, v0, c.ppu_vr<8>(s_op.vb));
-		}
-		else
-		{
-			EMIT(punpcklwd, v0, v0, c.ppu_vr(s_op.vb));
-			EMIT(psrad, v0, v0, c.imm(16));
-		}
-		LDST(movaps, c.ppu_vr(s_op.vd, true), v0);
-		c.ppu_ret();
-	};
-#endif
 	static const auto exec = [](auto&& d, auto&& b)
 	{
-		d = gv_extend_lo_s16(b);
+		d = gv_extend_lo_s16(std::move(b));
 	};
 
-	RETURN_(ppu.vr[op.vd], ppu.vr[op.vb]);
+	RETURN(ppu.vr[op.vd], ppu.vr[op.vb]);
 }
 
 template <u32 Build, ppu_exec_bit... Flags>
@@ -7157,7 +6950,8 @@ struct ppu_interpreter_t
 	IT VSEL;
 	IT VSL;
 	IT VSLB;
-	IT VSLDOI;
+	IT VSLDOI{};
+	IT VSLDOI_[16];
 	IT VSLH;
 	IT VSLO;
 	IT VSLW;
@@ -7629,6 +7423,27 @@ ppu_interpreter_rt_base::ppu_interpreter_rt_base() noexcept
 		return ::name<0, Flags...>(); \
 	}); \
 
+#define INIT_ONE(name, bits) \
+	ptrs->name##_[0b##bits] = ::name<0b##bits>::select(selected, []<ppu_exec_bit... Flags>() { \
+		return ::name<0b##bits>::impl<0, Flags...>(); \
+	}); \
+
+#define INIT_PACK2(name, bits) \
+	INIT_ONE(name, bits##0) \
+	INIT_ONE(name, bits##1) \
+
+#define INIT_PACK4(name, bits) \
+	INIT_PACK2(name, bits##0) \
+	INIT_PACK2(name, bits##1) \
+
+#define INIT_PACK8(name, bits) \
+	INIT_PACK4(name, bits##0) \
+	INIT_PACK4(name, bits##1) \
+
+#define INIT_PACK16(name, bits) \
+	INIT_PACK8(name, bits##0) \
+	INIT_PACK8(name, bits##1) \
+
 	INIT(MFVSCR);
 	INIT(MTVSCR);
 	INIT(VADDCUW);
@@ -7732,7 +7547,7 @@ ppu_interpreter_rt_base::ppu_interpreter_rt_base() noexcept
 	INIT(VSEL);
 	INIT(VSL);
 	INIT(VSLB);
-	INIT(VSLDOI);
+	INIT_PACK16(VSLDOI,);
 	INIT(VSLH);
 	INIT(VSLO);
 	INIT(VSLW);
@@ -8051,6 +7866,7 @@ ppu_intrp_func_t ppu_interpreter_rt::decode(u32 opv) const noexcept
 
 		break;
 	}
+	case ppu_itype::VSLDOI: return ptrs->VSLDOI_[op.vsh];
 	default: break;
 	}
 
@@ -193,7 +193,7 @@ namespace utils
 #elif defined(__clang__)
 	return __builtin_rotateleft32(x, n);
 #else
-	return (x << n) | (x >> (32 - n));
+	return (x << (n & 31)) | (x >> (((0 - n) & 31)));
 #endif
 }
 
@@ -209,7 +209,7 @@ namespace utils
 #elif defined(__clang__)
 	return __builtin_rotateleft64(x, n);
 #else
-	return (x << n) | (x >> (64 - n));
+	return (x << (n & 63)) | (x >> (((0 - n) & 63)));
 #endif
 }
@@ -3,6 +3,7 @@
 #include "util/types.hpp"
 #include "util/v128.hpp"
+#include "util/sysinfo.hpp"
 #include "util/asm.hpp"
 #include "Utilities/JIT.h"
 
 #if defined(ARCH_X64)
@@ -40,6 +41,7 @@ namespace asmjit
 #else
 	struct gpr_type : Operand
 	{
+		gpr_type() = default;
 		gpr_type(u32)
 		{
 		}
@@ -47,6 +49,7 @@ namespace asmjit
 
 	struct vec_type : Operand
 	{
+		vec_type() = default;
 		vec_type(u32)
 		{
 		}
@@ -82,7 +85,7 @@ namespace asmjit
 
 	template <typename T, typename D = std::decay_t<T>>
 	constexpr arg_class arg_classify =
-		std::is_base_of_v<v128, D> ? arg_class::imm_lv + !std::is_reference_v<T> :
+		std::is_same_v<v128, D> ? arg_class::imm_lv + !std::is_reference_v<T> :
 		std::is_base_of_v<mem_type, D> ? arg_class::mem_lv :
 		std::is_base_of_v<mem_lazy, D> ? arg_class::mem_lv + !std::is_reference_v<T> :
 		std::is_reference_v<T> ? arg_class::reg_lv : arg_class::reg_rv;
@@ -91,6 +94,8 @@ namespace asmjit
 	{
 		using base = native_asm;
 
+		bool fail_flag = false;
+
 		vec_builder(CodeHolder* ch)
 			: native_asm(ch)
 		{
@@ -150,6 +155,9 @@ namespace asmjit
 
 		std::unordered_map<v128, Label> consts[16]{};
 
+#if defined(ARCH_X64)
+		std::unordered_map<v128, vec_type> const_allocs{};
+
 		template <typename T, u32 Size = sizeof(T)>
 		x86::Mem get_const(const T& data, u32 esize = Size)
 		{
@@ -180,16 +188,99 @@ namespace asmjit
 
 			return x86::Mem(_label, 0, Size);
 		}
+#endif
 	};
 
+	struct free_on_exit
+	{
+		Operand x{};
+
+		free_on_exit() = default;
+		free_on_exit(const free_on_exit&) = delete;
+		free_on_exit& operator=(const free_on_exit&) = delete;
+
+		~free_on_exit()
+		{
+			if (x.isReg())
+			{
+				vec_type v;
+				v.copyFrom(x);
+				g_vc->vec_dealloc(v);
+			}
+		}
+	};
+
 #if defined(ARCH_X64)
-	inline auto arg_eval(const v128& _c, u32 esize)
+	inline Operand arg_eval(v128& _c, u32 esize)
 	{
-		// TODO: implement PSHUFD broadcasts and AVX ones
-		auto r = g_vc->get_const(_c, esize);
-		return r;
+		const auto found = g_vc->const_allocs.find(_c);
+
+		if (found != g_vc->const_allocs.end())
+		{
+			return found->second;
+		}
+
+		vec_type reg = g_vc->vec_alloc();
+
+		// TODO: PSHUFD style broadcast? Needs known const layout
+		if (utils::has_avx() && _c._u64[0] == _c._u64[1])
+		{
+			if (_c._u32[0] == _c._u32[1])
+			{
+				if (utils::has_avx2() && _c._u16[0] == _c._u16[1])
+				{
+					if (_c._u8[0] == _c._u8[1])
+					{
+						ensure(!g_vc->vpbroadcastb(reg, g_vc->get_const(_c._u8[0])));
+					}
+					else
+					{
+						ensure(!g_vc->vpbroadcastw(reg, g_vc->get_const(_c._u16[0])));
+					}
+				}
+				else
+				{
+					ensure(!g_vc->vbroadcastss(reg, g_vc->get_const(_c._u32[0])));
+				}
+			}
+			else
+			{
+				ensure(!g_vc->vbroadcastsd(reg, g_vc->get_const(_c._u32[0])));
+			}
+		}
+		else if (!_c._u)
+		{
+			ensure(!g_vc->pxor(reg, reg));
+		}
+		else if (!~_c._u)
+		{
+			ensure(!g_vc->pcmpeqd(reg, reg));
+		}
+		else
+		{
+			ensure(!g_vc->movaps(reg, g_vc->get_const(_c, esize)));
+		}
+
+		g_vc->const_allocs.emplace(_c, reg);
+		return reg;
+	}
+
+	inline Operand arg_eval(v128&& _c, u32 esize)
+	{
+		const auto found = g_vc->const_allocs.find(_c);
+
+		if (found != g_vc->const_allocs.end())
+		{
+			vec_type r = found->second;
+			g_vc->const_allocs.erase(found);
+			g_vc->vec_dealloc(r);
+			return r;
+		}
+
+		// Hack: assume can use mem op (TODO)
+		return g_vc->get_const(_c, esize);
 	}
 
 	template <typename T> requires(std::is_base_of_v<mem_lazy, std::decay_t<T>>)
 	inline decltype(auto) arg_eval(T&& mem, u32)
 	{
@@ -211,12 +302,24 @@ namespace asmjit
 		return std::move(mem);
 	}
 
+	inline void arg_free(const v128&)
+	{
+	}
+
+	inline void arg_free(const Operand& op)
+	{
+		if (op.isReg())
+		{
+			g_vc->vec_dealloc(vec_type{op.id()});
+		}
+	}
+
 	template <typename T>
 	inline bool arg_use_evex(const auto& op)
 	{
 		constexpr auto _class = arg_classify<T>;
 		if constexpr (_class == arg_class::imm_rv)
-			return true;
+			return g_vc->const_allocs.count(op) == 0;
 		else if constexpr (_class == arg_class::imm_lv)
 			return false;
 		else if (op.isMem())
@@ -302,6 +405,7 @@ namespace asmjit
 	template <typename A, typename B, typename... Args>
 	vec_type binary_op(u32 esize, x86::Inst::Id mov_op, x86::Inst::Id sse_op, x86::Inst::Id avx_op, x86::Inst::Id evex_op, A&& a, B&& b, Args&&... args)
 	{
+		free_on_exit e;
 		Operand src1{};
 
 		if constexpr (arg_classify<A> == arg_class::reg_rv)
@@ -317,12 +421,13 @@ namespace asmjit
 
 			if constexpr (arg_classify<B> == arg_class::reg_rv)
 			{
 				g_vc->vec_dealloc(vec_type{b.id()});
-				//b = Operand();
+				e.x = b;
 			}
 		}
 		else if (utils::has_avx() && avx_op && (arg_classify<A> == arg_class::reg_lv || arg_classify<A> == arg_class::mem_lv))
 		{
+			Operand srca = arg_eval(std::forward<A>(a), 16);
 			if constexpr (arg_classify<A> == arg_class::reg_lv)
 			{
 				if constexpr (arg_classify<B> == arg_class::reg_rv)
@@ -336,47 +441,79 @@ namespace asmjit
 					src1 = g_vc->vec_alloc();
 				}
 			}
-			else // if A == arg_class::reg_rv
+			else
 			{
 				src1 = g_vc->vec_alloc();
 
+				if (!a.isReg())
+				{
+					static_cast<void>(arg_eval(std::forward<A>(a), 16));
+				}
+
 				if constexpr (arg_classify<B> == arg_class::reg_rv)
 				{
 					g_vc->vec_dealloc(vec_type{b.id()});
-					//b = Operand();
+					e.x = b;
 				}
 			}
 
 			if (utils::has_avx512() && evex_op && arg_use_evex<B>(b))
 			{
-				ensure(!g_vc->evex().emit(evex_op, src1, vec_type{a.id()}, arg_eval(std::forward<B>(b), esize), std::forward<Args>(args)...));
+				ensure(!g_vc->evex().emit(evex_op, src1, srca, arg_eval(std::forward<B>(b), esize), std::forward<Args>(args)...));
				return vec_type{src1.id()};
 			}
 
-			ensure(!g_vc->emit(avx_op, src1, vec_type{a.id()}, arg_eval(std::forward<B>(b), 16), std::forward<Args>(args)...));
+			ensure(!g_vc->emit(avx_op, src1, srca, arg_eval(std::forward<B>(b), 16), std::forward<Args>(args)...));
 			return vec_type{src1.id()};
 		}
 		else do
 		{
-			if constexpr (arg_classify<B> == arg_class::reg_rv)
+			if constexpr (arg_classify<A> == arg_class::mem_rv)
 			{
-				g_vc->vec_dealloc(vec_type{b.id()});
-				//b = Operand();
-			}
-
-			if (arg_classify<A> == arg_class::mem_rv && a.isReg())
+				if (a.isReg())
 				{
 					src1 = vec_type(a.id());
 
+					if constexpr (arg_classify<B> == arg_class::reg_rv)
+					{
+						e.x = b;
+					}
 					break;
 				}
+			}
 
+			if constexpr (arg_classify<A> == arg_class::imm_rv)
+			{
+				if (auto found = g_vc->const_allocs.find(a); found != g_vc->const_allocs.end())
+				{
+					src1 = found->second;
+					g_vc->const_allocs.erase(found);
+
+					if constexpr (arg_classify<B> == arg_class::reg_rv)
+					{
+						e.x = b;
+					}
+					break;
+				}
+			}
+
 			src1 = g_vc->vec_alloc();
 
+			if constexpr (arg_classify<B> == arg_class::reg_rv)
+			{
+				e.x = b;
+			}
+
 			if constexpr (arg_classify<A> == arg_class::imm_rv)
 			{
 				if (!a._u)
 				{
 					// All zeros
 					ensure(!g_vc->emit(x86::Inst::kIdPxor, src1, src1));
 					break;
 				}
 				else if (!~a._u)
 				{
 					// All ones
 					ensure(!g_vc->emit(x86::Inst::kIdPcmpeqd, src1, src1));
 					break;
 				}
 			}
 
 			// Fallback to arg copy
 			ensure(!g_vc->emit(mov_op, src1, arg_eval(std::forward<A>(a), 16)));
 		}
@@ -404,10 +541,14 @@
 }
 
 inline v128 gv_select8(const v128& _cmp, const v128& _true, const v128& _false);
+inline v128 gv_signselect8(const v128& bits, const v128& _true, const v128& _false);
 inline v128 gv_select16(const v128& _cmp, const v128& _true, const v128& _false);
 inline v128 gv_select32(const v128& _cmp, const v128& _true, const v128& _false);
 inline v128 gv_selectfs(const v128& _cmp, const v128& _true, const v128& _false);
 
+template <typename A, typename B> requires (asmjit::any_operand_v<A, B>)
+inline asmjit::vec_type gv_gts32(A&&, B&&);
+
 inline void gv_set_zeroing_denormals()
 {
 #if defined(ARCH_X64)
@@ -704,6 +845,16 @@ inline v128 gv_not32(const v128& a)
 #endif
 }
 
+template <typename A> requires (asmjit::any_operand_v<A>)
+inline auto gv_not32(A&& a)
+{
+#if defined(ARCH_X64)
+	asmjit::vec_type ones = g_vc->vec_alloc();
+	g_vc->pcmpeqd(ones, ones);
+	FOR_X64(binary_op, 4, kIdMovdqa, kIdPxor, kIdVpxor, kIdVpxord, std::move(ones), std::forward<A>(a));
+#endif
+}
+
 inline v128 gv_notfs(const v128& a)
 {
 #if defined(ARCH_X64)
@@ -713,6 +864,16 @@ inline v128 gv_notfs(const v128& a)
 #endif
 }
 
+template <typename A> requires (asmjit::any_operand_v<A>)
+inline auto gv_notfs(A&& a)
+{
+#if defined(ARCH_X64)
+	asmjit::vec_type ones = g_vc->vec_alloc();
+	g_vc->pcmpeqd(ones, ones);
+	FOR_X64(binary_op, 4, kIdMovaps, kIdXorps, kIdVxorps, kIdVxorps, std::move(ones), std::forward<A>(a));
+#endif
+}
+
 inline v128 gv_shl16(const v128& a, u32 count)
 {
 	if (count >= 16)
@@ -867,6 +1028,20 @@ inline v128 gv_sar64(const v128& a, u32 count)
 #endif
 }
 
+template <typename A> requires (asmjit::any_operand_v<A>)
+inline auto gv_sar64(A&& a, u32 count)
+{
+	if (count >= 64)
+		count = 63;
+#if defined(ARCH_X64)
+	using enum asmjit::x86::Inst::Id;
+	if (utils::has_avx512())
+		return asmjit::unary_op(kIdNone, kIdVpsraq, std::forward<A>(a), count);
+	g_vc->fail_flag = true;
+	return std::forward<A>(a);
+#endif
+}
+
 inline v128 gv_add8(const v128& a, const v128& b)
 {
 #if defined(ARCH_X64)
@@ -1025,6 +1200,20 @@ inline v128 gv_addus_u32(const v128& a, const v128& b)
 #endif
 }
 
+template <typename A, typename B> requires (asmjit::any_operand_v<A, B>)
+inline asmjit::vec_type gv_addus_u32(A&& a, B&& b)
+{
+#if defined(ARCH_X64)
+	if (utils::has_sse41())
+		return gv_add32(gv_minu32(std::forward<B>(b), gv_not32(a)), std::forward<A>(a));
+	auto s = gv_add32(a, b);
+	auto x = gv_xor32(std::forward<B>(b), gv_bcst32(0x80000000));
+	auto y = gv_xor32(std::forward<A>(a), gv_bcst32(0x7fffffff));
+	return gv_or32(std::move(s), gv_gts32(std::move(x), std::move(y)));
+#endif
+	return {};
+}
+
 inline v128 gv_addfs(const v128& a, const v128& b)
 {
 #if defined(ARCH_X64)
@@ -1052,6 +1241,12 @@ inline v128 gv_sub8(const v128& a, const v128& b)
 #endif
 }
 
+template <typename A, typename B> requires (asmjit::any_operand_v<A, B>)
+inline auto gv_sub8(A&& a, B&& b)
+{
+	FOR_X64(binary_op, 1, kIdMovdqa, kIdPsubb, kIdVpsubb, kIdNone, std::forward<A>(a), std::forward<B>(b));
+}
+
 inline v128 gv_sub16(const v128& a, const v128& b)
 {
 #if defined(ARCH_X64)
@@ -1265,6 +1460,21 @@ inline v128 gv_minu32(const v128& a, const v128& b)
 #endif
 }
 
+template <typename A, typename B> requires (asmjit::any_operand_v<A, B>)
+inline asmjit::vec_type gv_minu32(A&& a, B&& b)
+{
+#if defined(ARCH_X64)
+	if (utils::has_sse41())
+		FOR_X64(binary_op, 4, kIdMovdqa, kIdPminud, kIdVpminud, kIdVpminud, std::forward<A>(a), std::forward<B>(b));
+	auto s = gv_bcst32(0x80000000);
+	auto x = gv_xor32(a, s);
+	auto m = gv_gts32(std::move(x), gv_xor32(std::move(s), b));
+	auto z = gv_and32(m, std::move(b));
+	return gv_or32(std::move(z), gv_andn32(std::move(m), std::move(a)));
+#endif
+	return {};
+}
+
 inline v128 gv_mins8(const v128& a, const v128& b)
 {
 #if defined(__SSE4_1__)
@@ -1493,6 +1703,13 @@ inline v128 gv_gts8(const v128& a, const v128& b)
 #endif
 }
 
+template <typename A, typename B> requires (asmjit::any_operand_v<A, B>)
+inline asmjit::vec_type gv_gts8(A&& a, B&& b)
+{
+	FOR_X64(binary_op, 1, kIdMovdqa, kIdPcmpgtb, kIdVpcmpgtb, kIdNone, std::forward<A>(a), std::forward<B>(b));
+	return {};
+}
+
 inline v128 gv_gts16(const v128& a, const v128& b)
 {
 #if defined(ARCH_X64)
@@ -1511,6 +1728,13 @@ inline v128 gv_gts32(const v128& a, const v128& b)
 #endif
 }
 
+template <typename A, typename B> requires (asmjit::any_operand_v<A, B>)
+inline asmjit::vec_type gv_gts32(A&& a, B&& b)
+{
+	FOR_X64(binary_op, 4, kIdMovdqa, kIdPcmpgtd, kIdVpcmpgtd, kIdNone, std::forward<A>(a), std::forward<B>(b));
+	return {};
+}
+
 inline v128 gv_avgu8(const v128& a, const v128& b)
 {
 #if defined(ARCH_X64)
@@ -2154,7 +2378,7 @@ inline v128 gv_andn(const v128& a, const v128& b)
 }
 
 // Select elements; _cmp must be result of SIMD comparison; undefined otherwise
-inline v128 gv_select8(const v128& _cmp, const v128& _true, const v128& _false)
+FORCE_INLINE v128 gv_select8(const v128& _cmp, const v128& _true, const v128& _false)
 {
 #if defined(__SSE4_1__)
 	return _mm_blendv_epi8(_false, _true, _cmp);
@@ -2165,6 +2389,45 @@ inline v128 gv_select8(const v128& _cmp, const v128& _true, const v128& _false)
 #endif
 }
 
+// Select elements using sign bit only
+FORCE_INLINE v128 gv_signselect8(const v128& bits, const v128& _true, const v128& _false)
+{
+#if defined(__SSE4_1__)
+	return _mm_blendv_epi8(_false, _true, bits);
+#else
+	return gv_select8(gv_gts8(gv_bcst8(0), bits), _true, _false);
+#endif
+}
+
+template <typename A, typename B, typename C> requires (asmjit::any_operand_v<A, B, C>)
+inline asmjit::vec_type gv_signselect8(A&& bits, B&& _true, C&& _false)
+{
+	using namespace asmjit;
+#if defined(ARCH_X64)
+	if (utils::has_avx())
+	{
+		Operand arg0{};
+		Operand arg1 = arg_eval(std::forward<A>(bits), 16);
+		Operand arg2 = arg_eval(std::forward<B>(_true), 16);
+		Operand arg3 = arg_eval(std::forward<C>(_false), 16);
+		if constexpr (!std::is_reference_v<A>)
+			arg0.isReg() ? arg_free(bits) : arg0.copyFrom(arg1);
+		if constexpr (!std::is_reference_v<B>)
+			arg0.isReg() ? arg_free(_true) : arg0.copyFrom(arg2);
+		if constexpr (!std::is_reference_v<C>)
+			arg0.isReg() ? arg_free(_false) : arg0.copyFrom(arg3);
+		if (arg0.isNone())
+			arg0 = g_vc->vec_alloc();
+		g_vc->emit(x86::Inst::kIdVpblendvb, arg0, arg3, arg2, arg1);
+		vec_type r;
+		r.copyFrom(arg0);
+		return r;
+	}
+#endif
+	g_vc->fail_flag = true;
+	return vec_type{0};
+}
+
 // Select elements; _cmp must be result of SIMD comparison; undefined otherwise
 inline v128 gv_select16(const v128& _cmp, const v128& _true, const v128& _false)
 {
@@ -2305,6 +2568,17 @@ inline v128 gv_extend_lo_s8(const v128& vec)
 #endif
 }
 
+template <typename A> requires (asmjit::any_operand_v<A>)
+inline auto gv_extend_lo_s8(A&& a)
+{
+#if defined(ARCH_X64)
+	using enum asmjit::x86::Inst::Id;
+	if (utils::has_sse41())
+		return asmjit::unary_op(kIdPmovsxbw, kIdVpmovsxbw, std::forward<A>(a));
+	return asmjit::unary_op(kIdPsraw, kIdVpsraw, asmjit::unary_op(kIdNone, kIdPunpcklbw, std::forward<A>(a)), 8);
+#endif
+}
+
 inline v128 gv_extend_hi_s8(const v128& vec)
 {
 #if defined(__SSE4_1__)
@@ -2316,6 +2590,15 @@ inline v128 gv_extend_hi_s8(const v128& vec)
 #endif
 }
 
+template <typename A> requires (asmjit::any_operand_v<A>)
+inline auto gv_extend_hi_s8(A&& a)
+{
+#if defined(ARCH_X64)
+	using enum asmjit::x86::Inst::Id;
+	return asmjit::unary_op(kIdPsraw, kIdVpsraw, asmjit::unary_op(kIdNone, kIdPunpckhbw, std::forward<A>(a)), 8);
+#endif
+}
+
 inline v128 gv_unpacklo16(const v128& lows, const v128& highs)
 {
 #if defined(ARCH_X64)
@@ -2336,6 +2619,17 @@ inline v128 gv_extend_lo_s16(const v128& vec)
 #endif
 }
 
+template <typename A> requires (asmjit::any_operand_v<A>)
+inline auto gv_extend_lo_s16(A&& a)
+{
+#if defined(ARCH_X64)
+	using enum asmjit::x86::Inst::Id;
+	if (utils::has_sse41())
+		return asmjit::unary_op(kIdPmovsxwd, kIdVpmovsxwd, std::forward<A>(a));
+	return asmjit::unary_op(kIdPsrad, kIdVpsrad, asmjit::unary_op(kIdNone, kIdPunpcklwd, std::forward<A>(a)), 16);
+#endif
+}
+
 inline v128 gv_extend_hi_s16(const v128& vec)
 {
 #if defined(__SSE4_1__)
@@ -2347,6 +2641,15 @@ inline v128 gv_extend_hi_s16(const v128& vec)
 #endif
 }
 
+template <typename A> requires (asmjit::any_operand_v<A>)
+inline auto gv_extend_hi_s16(A&& a)
+{
+#if defined(ARCH_X64)
+	using enum asmjit::x86::Inst::Id;
+	return asmjit::unary_op(kIdPsrad, kIdVpsrad, asmjit::unary_op(kIdNone, kIdPunpckhwd, std::forward<A>(a)), 16);
+#endif
+}
+
 inline v128 gv_unpacklo32(const v128& lows, const v128& highs)
 {
 #if defined(ARCH_X64)
@@ -2471,3 +2774,280 @@ inline v128 gv_log2_approxfs(const v128& a)
 	return r;
 #endif
 }
+
+// For each 8-bit element, r = a << (b & 7)
+inline v128 gv_shl8(const v128& a, const v128& b)
+{
+#if defined(ARCH_ARM64)
+	return vshlq_u8(a, vandq_s8(b, gv_bcst8(7)));
+#else
+	const v128 x1 = gv_add8(a, a); // shift left by 1
+	const v128 r1 = gv_signselect8(gv_shl64(b, 7), x1, a);
+	const v128 x2 = gv_and32(gv_shl64(r1, 2), gv_bcst8(0xfc)); // shift by 2
+	const v128 r2 = gv_signselect8(gv_shl64(b, 6), x2, r1);
+	const v128 x3 = gv_and32(gv_shl64(r2, 4), gv_bcst8(0xf0)); // shift by 4
+	return gv_signselect8(gv_shl64(b, 5), x3, r2);
+#endif
+}
+
+// For each 16-bit element, r = a << (b & 15)
+inline v128 gv_shl16(const v128& a, const v128& b)
+{
+#if defined(__AVX512VL__) && defined(__AVX512BW__)
+	return _mm_sllv_epi16(a, _mm_and_si128(b, _mm_set1_epi16(15)));
+#elif defined(ARCH_ARM64)
+	return vshlq_u16(a, vandq_s16(b, gv_bcst8(15)));
+#else
+	v128 r;
+	for (u32 i = 0; i < 8; i++)
+		r._u16[i] = a._u16[i] << (b._u16[i] & 15);
+	return r;
+#endif
+}
+
+// For each 32-bit element, r = a << (b & 31)
+inline v128 gv_shl32(const v128& a, const v128& b)
+{
+#if defined(__AVX2__)
+	return _mm_sllv_epi32(a, _mm_and_si128(b, _mm_set1_epi32(31)));
+#elif defined(ARCH_ARM64)
+	return vshlq_u32(a, vandq_s32(b, gv_bcst8(31)));
+#else
+	v128 r;
+	for (u32 i = 0; i < 4; i++)
+		r._u32[i] = a._u32[i] << (b._u32[i] & 31);
+	return r;
+#endif
+}
+
+// For each unsigned 8-bit element, r = a >> (b & 7)
+inline v128 gv_shr8(const v128& a, const v128& b)
+{
+#if defined(ARCH_ARM64)
+	return vshlq_u8(a, vnegq_s8(vandq_s8(b, gv_bcst8(7))));
+#else
+	const v128 x1 = gv_and32(gv_shr64(a, 1), gv_bcst8(0x7f)); // shift right by 1
+	const v128 r1 = gv_signselect8(gv_shl64(b, 7), x1, a);
+	const v128 x2 = gv_and32(gv_shr64(r1, 2), gv_bcst8(0x3f)); // shift by 2
+	const v128 r2 = gv_signselect8(gv_shl64(b, 6), x2, r1);
+	const v128 x3 = gv_and32(gv_shr64(r2, 4), gv_bcst8(0x0f)); // shift by 4
+	return gv_signselect8(gv_shl64(b, 5), x3, r2);
+#endif
+}
+
+// For each unsigned 16-bit element, r = a >> (b & 15)
+inline v128 gv_shr16(const v128& a, const v128& b)
+{
+#if defined(__AVX512VL__) && defined(__AVX512BW__)
+	return _mm_srlv_epi16(a, _mm_and_si128(b, _mm_set1_epi16(15)));
+#elif defined(ARCH_ARM64)
+	return vshlq_u16(a, vnegq_s16(vandq_s16(b, gv_bcst8(15))));
+#else
+	v128 r;
+	for (u32 i = 0; i < 8; i++)
+		r._u16[i] = a._u16[i] >> (b._u16[i] & 15);
+	return r;
+#endif
+}
+
+// For each unsigned 32-bit element, r = a >> (b & 31)
+inline v128 gv_shr32(const v128& a, const v128& b)
+{
+#if defined(__AVX2__)
+	return _mm_srlv_epi32(a, _mm_and_si128(b, _mm_set1_epi32(31)));
+#elif defined(ARCH_ARM64)
+	return vshlq_u32(a, vnegq_s32(vandq_s32(b, gv_bcst8(31))));
+#else
+	v128 r;
+	for (u32 i = 0; i < 4; i++)
+		r._u32[i] = a._u32[i] >> (b._u32[i] & 31);
+	return r;
+#endif
+}
+
+// For each signed 8-bit element, r = a >> (b & 7)
+inline v128 gv_sar8(const v128& a, const v128& b)
+{
+#if defined(ARCH_ARM64)
+	return vshlq_s8(a, vnegq_s8(vandq_s8(b, gv_bcst8(7))));
+#else
+	v128 r;
+	for (u32 i = 0; i < 16; i++)
+		r._s8[i] = a._s8[i] >> (b._s8[i] & 7);
+	return r;
+#endif
+}
+
+// For each signed 16-bit element, r = a >> (b & 15)
+inline v128 gv_sar16(const v128& a, const v128& b)
+{
+#if defined(__AVX512VL__) && defined(__AVX512BW__)
+	return _mm_srav_epi16(a, _mm_and_si128(b, _mm_set1_epi16(15)));
+#elif defined(ARCH_ARM64)
+	return vshlq_s16(a, vnegq_s16(vandq_s16(b, gv_bcst8(15))));
+#else
+	v128 r;
+	for (u32 i = 0; i < 8; i++)
+		r._s16[i] = a._s16[i] >> (b._s16[i] & 15);
+	return r;
+#endif
+}
+
+// For each signed 32-bit element, r = a >> (b & 31)
+inline v128 gv_sar32(const v128& a, const v128& b)
+{
+#if defined(__AVX2__)
+	return _mm_srav_epi32(a, _mm_and_si128(b, _mm_set1_epi32(31)));
+#elif defined(ARCH_ARM64)
+	return vshlq_s32(a, vnegq_s32(vandq_s32(b, gv_bcst8(31))));
+#else
+	v128 r;
+	for (u32 i = 0; i < 4; i++)
+		r._s32[i] = a._s32[i] >> (b._s32[i] & 31);
+	return r;
+#endif
+}
+
+// For each 8-bit element, r = rotate a by b
+inline v128 gv_rol8(const v128& a, const v128& b)
+{
+#if defined(ARCH_ARM64)
+	const auto amt1 = vandq_s8(b, gv_bcst8(7));
+	const auto amt2 = vsubq_s8(amt1, gv_bcst8(8));
+	return vorrq_u8(vshlq_u8(a, amt1), vshlq_u8(a, amt2));
+#else
+	const v128 x1 = gv_sub8(gv_add8(a, a), gv_gts8(gv_bcst8(0), a)); // rotate left by 1
+	const v128 r1 = gv_signselect8(gv_shl64(b, 7), x1, a);
+	const v128 c2 = gv_bcst8(0x3);
+	const v128 x2 = gv_or32(gv_and32(gv_shr64(r1, 6), c2), gv_andn32(c2, gv_shl64(r1, 2))); // rotate by 2
+	const v128 r2 = gv_signselect8(gv_shl64(b, 6), x2, r1);
+	const v128 c3 = gv_bcst8(0xf);
+	const v128 x3 = gv_or32(gv_and32(gv_shr64(r2, 4), c3), gv_andn32(c3, gv_shl64(r2, 4))); // rotate by 4
+	return gv_signselect8(gv_shl64(b, 5), x3, r2);
+#endif
+}
+
+// For each 16-bit element, r = rotate a by b
+inline v128 gv_rol16(const v128& a, const v128& b)
+{
+#if defined(ARCH_ARM64)
+	const auto amt1 = vandq_s16(b, gv_bcst16(15));
+	const auto amt2 = vsubq_s16(amt1, gv_bcst16(16));
+	return vorrq_u16(vshlq_u16(a, amt1), vshlq_u16(a, amt2));
+#else
+	v128 r;
+	for (u32 i = 0; i < 8; i++)
+		r._u16[i] = utils::rol16(a._u16[i], b._u16[i]);
+	return r;
+#endif
+}
+
+// For each 32-bit element, r = rotate a by b
+inline v128 gv_rol32(const v128& a, const v128& b)
+{
+#if defined(__AVX512VL__)
+	return _mm_rolv_epi32(a, b);
+#elif defined(ARCH_ARM64)
+	const auto amt1 = vandq_s32(b, gv_bcst32(31));
+	const auto amt2 = vsubq_s32(amt1, gv_bcst32(32));
+	return vorrq_u32(vshlq_u32(a, amt1), vshlq_u32(a, amt2));
+#else
+	v128 r;
+	for (u32 i = 0; i < 4; i++)
+		r._u32[i] = utils::rol32(a._u32[i], b._u32[i]);
+	return r;
+#endif
+}
+
+// For each 8-bit element, r = (a << (c & 7)) | (b >> (~c & 7) >> 1)
+template <typename A, typename B, typename C>
+inline auto gv_fshl8(A&& a, B&& b, C&& c)
+{
+#if defined(ARCH_ARM64)
+	const auto amt1 = vandq_s8(c, gv_bcst8(7));
+	const auto amt2 = vsubq_s8(amt1, gv_bcst8(8));
+	return v128(vorrq_u8(vshlq_u8(a, amt1), vshlq_u8(b, amt2)));
+#else
+	auto x1 = gv_sub8(gv_add8(a, a), gv_gts8(gv_bcst8(0), b));
+	auto s1 = gv_shl64(c, 7);
+	auto r1 = gv_signselect8(s1, std::move(x1), std::forward<A>(a));
+	auto b1 = gv_signselect8(std::move(s1), gv_shl64(b, 1), std::forward<B>(b));
+	auto c2 = gv_bcst8(0x3);
+	auto x2 = gv_and32(gv_shr64(b1, 6), c2); x2 = gv_or32(std::move(x2), gv_andn32(std::move(c2), gv_shl64(r1, 2)));
+	auto s2 = gv_shl64(c, 6);
+	auto r2 = gv_signselect8(s2, std::move(x2), std::move(r1));
+	auto b2 = gv_signselect8(std::move(s2), gv_shl64(b1, 2), std::move(b1));
+	auto c3 = gv_bcst8(0xf);
+	auto x3 = gv_and32(gv_shr64(std::move(b2), 4), c3); x3 = gv_or32(std::move(x3), gv_andn32(std::move(c3), gv_shl64(r2, 4)));
+	return gv_signselect8(gv_shl64(std::move(c), 5), std::move(x3), std::move(r2));
+#endif
+}
+
+// For each 8-bit element, r = (b >> (c & 7)) | (a << (~c & 7) << 1)
+template <typename A, typename B, typename C>
+inline auto gv_fshr8(A&& a, B&& b, C&& c)
+{
+#if defined(ARCH_ARM64)
+	const auto amt1 = vandq_s8(c, gv_bcst8(7));
+	const auto amt2 = vsubq_s8(gv_bcst8(8), amt1);
+	return vorrq_u8(vshlq_u8(b, vnegq_s8(amt1)), vshlq_u8(a, amt2));
+#else
+	auto c1 = gv_bcst8(0x7f);
+	auto x1 = gv_and32(gv_shr64(b, 1), c1); x1 = gv_or32(std::move(x1), gv_andn32(std::move(c1), gv_shl64(a, 7)));
+	auto s1 = gv_shl64(c, 7);
+	auto r1 = gv_signselect8(s1, std::move(x1), std::move(b));
+	auto a1 = gv_signselect8(std::move(s1), gv_shr64(a, 1), std::move(a));
+	auto c2 = gv_bcst8(0x3f);
+	auto x2 = gv_and32(gv_shr64(r1, 2), c2); x2 = gv_or32(std::move(x2), gv_andn32(std::move(c2), gv_shl64(a1, 6)));
+	auto s2 = gv_shl64(c, 6);
+	auto r2 = gv_signselect8(s2, std::move(x2), std::move(r1));
+	auto a2 = gv_signselect8(std::move(s2), gv_shr64(a1, 2), std::move(a1));
+	auto c3 = gv_bcst8(0x0f);
+	auto x3 = gv_and32(gv_shr64(r2, 4), c3); x3 = gv_or32(std::move(x3), gv_andn32(std::move(c3), gv_shl64(std::move(a2), 4)));
+	return gv_signselect8(gv_shl64(std::move(c), 5), std::move(x3), std::move(r2));
+#endif
+}
+
+// Shift left by byte amount
+template <u32 Count>
+inline v128 gv_shuffle_left(const v128& a)
+{
+	if (Count > 15)
+		return {};
+#if defined(ARCH_X64)
+	return _mm_slli_si128(a, Count);
+#elif defined(ARCH_ARM64)
+	v128 idx;
+	for (u32 i = 0; i < 16; i++)
+		idx._u8[i] = u8(i - Count);
+	return vqtbl1q_u8(a, idx);
+#endif
+}
+
+template <u32 Count, typename A> requires (asmjit::any_operand_v<A>)
+inline auto gv_shuffle_left(A&& a)
+{
+	FOR_X64(unary_op, kIdPslldq, kIdVpslldq, std::forward<A>(a), Count);
+}
+
+// Shift right by byte amount
+template <u32 Count>
+inline v128 gv_shuffle_right(const v128& a)
+{
+	if (Count > 15)
+		return {};
+#if defined(ARCH_X64)
+	return _mm_srli_si128(a, Count);
+#elif defined(ARCH_ARM64)
+	v128 idx;
+	for (u32 i = 0; i < 16; i++)
+		idx._u8[i] = u8(i + Count);
+	return vqtbl1q_u8(a, idx);
+#endif
+}
+
+template <u32 Count, typename A> requires (asmjit::any_operand_v<A>)
+inline auto gv_shuffle_right(A&& a)
+{
+	FOR_X64(unary_op, kIdPsrldq, kIdVpsrldq, std::forward<A>(a), Count);
+}