mirror of https://github.com/RPCS3/rpcs3.git synced 2024-11-22 10:42:36 +01:00

PPU: refactor shift and splat instructions

Fix utils::rol32/64 functions.
Fix immediate clamping in splat instructions.
Other fixes.
Author: Nekotekina
Date: 2022-01-19 02:41:32 +03:00
parent d92008abe4
commit b42fae0989
4 changed files with 784 additions and 382 deletions
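The two fixes called out above follow a common pattern; a minimal standalone sketch of both (illustrative names, not the exact RPCS3 helpers — the real changes are in the hunks below):

#include <cstdint>

// Rotate-left with a masked amount: both shifts stay in range, so n == 0
// and n >= 32 no longer hit undefined behaviour in the generic fallback.
constexpr uint32_t rol32_sketch(uint32_t x, uint32_t n)
{
    return (x << (n & 31)) | (x >> ((0 - n) & 31));
}

// Splat-style element pick: masking the immediate clamps it to a valid lane
// instead of asserting or indexing past the vector.
inline uint8_t splat_byte_sketch(const uint8_t (&vec)[16], uint32_t imm)
{
    return vec[imm & 15];
}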

View File

@ -247,10 +247,16 @@ inline FT build_function_asm(std::string_view name, F&& builder)
Asm compiler(&code); Asm compiler(&code);
compiler.addEncodingOptions(EncodingOptions::kOptimizedAlign); compiler.addEncodingOptions(EncodingOptions::kOptimizedAlign);
if constexpr (std::is_invocable_v<F, Asm&, native_args&>) if constexpr (std::is_invocable_r_v<bool, F, Asm&, native_args&>)
builder(compiler, args); {
if (!builder(compiler, args))
return nullptr;
}
else else
builder(compiler); {
builder(compiler, args);
}
rt.dump_name = name; rt.dump_name = name;
const auto result = rt._add(&code); const auto result = rt._add(&code);
jit_announce(result, code.codeSize(), name); jit_announce(result, code.codeSize(), name);
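In words: builders that return bool can now report failure, and build_function_asm then yields nullptr so callers can fall back to the generic path (see the RETURN macro change in the next file). A compilable sketch of that contract, with stand-in types rather than the real Asm/native_args:

#include <type_traits>

using func_t = void (*)();

// Stand-in for the assembler/args pair passed to the builder.
struct fake_asm { bool fail_flag = false; };

template <typename F>
func_t build_sketch(F&& builder)
{
    fake_asm c{};

    if constexpr (std::is_invocable_r_v<bool, F, fake_asm&>)
    {
        if (!builder(c))
            return nullptr; // emission failed, no function produced
    }
    else
    {
        builder(c);
    }

    return +[] {}; // stands in for the freshly assembled function
}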

View File

@ -111,13 +111,15 @@ struct ppu_exec_select
#define RETURN(...) \ #define RETURN(...) \
if constexpr (Build == 0) { \ if constexpr (Build == 0) { \
static_cast<void>(exec); \ static_cast<void>(exec); \
static const ppu_intrp_func_t f = build_function_asm<ppu_intrp_func_t, asmjit::ppu_builder>("ppu_"s + __func__, [&](asmjit::ppu_builder& c) { \ static const ppu_intrp_func_t f = build_function_asm<ppu_intrp_func_t, asmjit::ppu_builder>("ppu_"s + __func__, [&](asmjit::ppu_builder& c, native_args&) { \
static ppu_opcode_t op{}; \ static ppu_opcode_t op{}; \
static ppu_abstract_t ppu; \ static ppu_abstract_t ppu; \
exec(__VA_ARGS__); \ exec(__VA_ARGS__); \
c.ppu_ret(); \ c.ppu_ret(); \
return !c.fail_flag; \
}); \ }); \
return f; \ if (f) return f; \
RETURN_(__VA_ARGS__); \
} }
#else #else
#define RETURN RETURN_ #define RETURN RETURN_
@ -1019,7 +1021,7 @@ auto VADDUWS()
} }
}; };
RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb], ppu.sat); RETURN(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb], ppu.sat);
} }
template <u32 Build, ppu_exec_bit... Flags> template <u32 Build, ppu_exec_bit... Flags>
@ -2074,7 +2076,7 @@ auto VNOR()
d = gv_notfs(gv_orfs(std::move(a), std::move(b))); d = gv_notfs(gv_orfs(std::move(a), std::move(b)));
}; };
RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]); RETURN(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]);
} }
template <u32 Build, ppu_exec_bit... Flags> template <u32 Build, ppu_exec_bit... Flags>
@ -2100,7 +2102,7 @@ auto VPERM()
#if defined (ARCH_X64) #if defined (ARCH_X64)
if constexpr (Build == 0) if constexpr (Build == 0)
{ {
static const ppu_intrp_func_t f = build_function_asm<ppu_intrp_func_t, asmjit::ppu_builder>("ppu_VPERM", [&](asmjit::ppu_builder& c) static const ppu_intrp_func_t f = build_function_asm<ppu_intrp_func_t, asmjit::ppu_builder>("ppu_VPERM", [&](asmjit::ppu_builder& c, native_args&)
{ {
const auto [v0, v1, v2, v3] = c.vec_alloc<4>(); const auto [v0, v1, v2, v3] = c.vec_alloc<4>();
c.movdqa(v0, c.ppu_vr(s_op.vc)); c.movdqa(v0, c.ppu_vr(s_op.vc));
@ -2374,17 +2376,12 @@ auto VRLB()
if constexpr (Build == 0xf1a6) if constexpr (Build == 0xf1a6)
return ppu_exec_select<Flags...>::template select<>(); return ppu_exec_select<Flags...>::template select<>();
static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { static const auto exec = [](auto&& d, auto&& a, auto&& b)
auto& d = ppu.vr[op.vd];
const auto& a = ppu.vr[op.va];
const auto& b = ppu.vr[op.vb];
for (uint i = 0; i < 16; i++)
{ {
d._u8[i] = utils::rol8(a._u8[i], b._u8[i]); d = gv_rol8(std::move(a), std::move(b));
}
}; };
RETURN_(ppu, op);
RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]);
} }
template <u32 Build, ppu_exec_bit... Flags> template <u32 Build, ppu_exec_bit... Flags>
@ -2393,17 +2390,12 @@ auto VRLH()
if constexpr (Build == 0xf1a6) if constexpr (Build == 0xf1a6)
return ppu_exec_select<Flags...>::template select<>(); return ppu_exec_select<Flags...>::template select<>();
static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { static const auto exec = [](auto&& d, auto&& a, auto&& b)
auto& d = ppu.vr[op.vd];
const auto& a = ppu.vr[op.va];
const auto& b = ppu.vr[op.vb];
for (uint i = 0; i < 8; i++)
{ {
d._u16[i] = utils::rol16(a._u16[i], b._u8[i * 2] & 0xf); d = gv_rol16(std::move(a), std::move(b));
}
}; };
RETURN_(ppu, op);
RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]);
} }
template <u32 Build, ppu_exec_bit... Flags> template <u32 Build, ppu_exec_bit... Flags>
@ -2412,17 +2404,12 @@ auto VRLW()
if constexpr (Build == 0xf1a6) if constexpr (Build == 0xf1a6)
return ppu_exec_select<Flags...>::template select<>(); return ppu_exec_select<Flags...>::template select<>();
static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { static const auto exec = [](auto&& d, auto&& a, auto&& b)
auto& d = ppu.vr[op.vd];
const auto& a = ppu.vr[op.va];
const auto& b = ppu.vr[op.vb];
for (uint w = 0; w < 4; w++)
{ {
d._u32[w] = utils::rol32(a._u32[w], b._u8[w * 4] & 0x1f); d = gv_rol32(std::move(a), std::move(b));
}
}; };
RETURN_(ppu, op);
RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]);
} }
template <u32 Build, ppu_exec_bit... Flags> template <u32 Build, ppu_exec_bit... Flags>
@ -2447,15 +2434,13 @@ auto VSEL()
if constexpr (Build == 0xf1a6) if constexpr (Build == 0xf1a6)
return ppu_exec_select<Flags...>::template select<>(); return ppu_exec_select<Flags...>::template select<>();
static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { static const auto exec = [](auto&& d, auto&& a, auto&& b, auto&& c)
auto& d = ppu.vr[op.vd]; {
const auto& a = ppu.vr[op.va]; auto x = gv_andfs(std::move(b), c);
const auto& b = ppu.vr[op.vb]; d = gv_orfs(std::move(x), gv_andnfs(std::move(c), std::move(a)));
const auto& c = ppu.vr[op.vc];
d = (b & c) | gv_andn(c, a);
}; };
RETURN_(ppu, op);
RETURN(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb], ppu.vr[op.vc]);
} }
template <u32 Build, ppu_exec_bit... Flags> template <u32 Build, ppu_exec_bit... Flags>
@ -2464,19 +2449,12 @@ auto VSL()
if constexpr (Build == 0xf1a6) if constexpr (Build == 0xf1a6)
return ppu_exec_select<Flags...>::template select<>(); return ppu_exec_select<Flags...>::template select<>();
static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { static const auto exec = [](auto&& d, auto&& a, auto&& b)
auto& d = ppu.vr[op.vd];
v128 VA = ppu.vr[op.va];
u8 sh = ppu.vr[op.vb]._u8[0] & 0x7;
d._u8[0] = VA._u8[0] << sh;
for (uint b = 1; b < 16; b++)
{ {
sh = ppu.vr[op.vb]._u8[b] & 0x7; d = gv_fshl8(std::move(a), gv_shuffle_left<1>(a), std::move(b));
d._u8[b] = (VA._u8[b] << sh) | (VA._u8[b - 1] >> (8 - sh));
}
}; };
RETURN_(ppu, op);
RETURN(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]);
} }
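For reference, the VSL change just above expresses the 128-bit left shift as a per-byte funnel shift with the next-lower byte (supplied by gv_shuffle_left<1>). A scalar model of what the old loop and the new gv_fshl8 form both compute, treating byte 0 as least significant as the old loop does (illustrative only):

#include <cstdint>

// d = a << sh over the whole 128-bit value, sh in 0..7, done byte-wise:
// each byte keeps its own bits shifted left plus the spill-over from the
// byte below it (zero below byte 0). The split shift keeps sh == 0 defined,
// matching the (b >> (~c & 7) >> 1) term of gv_fshl8.
inline void vsl_reference(uint8_t (&d)[16], const uint8_t (&a)[16], uint32_t sh)
{
    sh &= 7;
    for (int i = 0; i < 16; i++)
    {
        const uint8_t below = i ? a[i - 1] : 0;
        d[i] = uint8_t((a[i] << sh) | ((below >> (7 - sh)) >> 1));
    }
}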
template <u32 Build, ppu_exec_bit... Flags> template <u32 Build, ppu_exec_bit... Flags>
@ -2485,38 +2463,35 @@ auto VSLB()
if constexpr (Build == 0xf1a6) if constexpr (Build == 0xf1a6)
return ppu_exec_select<Flags...>::template select<>(); return ppu_exec_select<Flags...>::template select<>();
static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { static const auto exec = [](auto&& d, auto&& a, auto&& b)
auto& d = ppu.vr[op.vd];
const auto& a = ppu.vr[op.va];
const auto& b = ppu.vr[op.vb];
for (uint i = 0; i < 16; i++)
{ {
d._u8[i] = a._u8[i] << (b._u8[i] & 0x7); d = gv_shl8(std::move(a), std::move(b));
}
}; };
RETURN_(ppu, op);
RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]);
} }
template <u32 Build, ppu_exec_bit... Flags> template <u32 Count>
auto VSLDOI() struct VSLDOI
{ {
if constexpr (Build == 0xf1a6) template <ppu_exec_bit... Flags>
return ppu_exec_select<Flags...>::template select<>(); static auto select(bs_t<ppu_exec_bit> selected, auto func)
static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) {
auto& d = ppu.vr[op.vd];
u8 tmpSRC[32];
std::memcpy(tmpSRC, &ppu.vr[op.vb], 16);
std::memcpy(tmpSRC + 16, &ppu.vr[op.va], 16);
for (uint b = 0; b<16; b++)
{ {
d._u8[15 - b] = tmpSRC[31 - (b + op.vsh)]; return ppu_exec_select<>::select<Flags...>(selected, func);
} }
};
RETURN_(ppu, op); template <u32 Build, ppu_exec_bit... Flags>
} static auto impl()
{
static const auto exec = [](auto&& d, auto&& a, auto&& b)
{
d = gv_or32(gv_shuffle_left<Count>(std::move(a)), gv_shuffle_right<16 - Count>(std::move(b)));
};
RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]);
}
};
template <u32 Build, ppu_exec_bit... Flags> template <u32 Build, ppu_exec_bit... Flags>
auto VSLH() auto VSLH()
@ -2524,17 +2499,12 @@ auto VSLH()
if constexpr (Build == 0xf1a6) if constexpr (Build == 0xf1a6)
return ppu_exec_select<Flags...>::template select<>(); return ppu_exec_select<Flags...>::template select<>();
static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { static const auto exec = [](auto&& d, auto&& a, auto&& b)
auto& d = ppu.vr[op.vd];
const auto& a = ppu.vr[op.va];
const auto& b = ppu.vr[op.vb];
for (uint h = 0; h < 8; h++)
{ {
d._u16[h] = a._u16[h] << (b._u16[h] & 0xf); d = gv_shl16(std::move(a), std::move(b));
}
}; };
RETURN_(ppu, op);
RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]);
} }
template <u32 Build, ppu_exec_bit... Flags> template <u32 Build, ppu_exec_bit... Flags>
@ -2543,19 +2513,12 @@ auto VSLO()
if constexpr (Build == 0xf1a6) if constexpr (Build == 0xf1a6)
return ppu_exec_select<Flags...>::template select<>(); return ppu_exec_select<Flags...>::template select<>();
static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { static const auto exec = [](auto&& d, auto&& a, auto&& b)
auto& d = ppu.vr[op.vd];
v128 VA = ppu.vr[op.va];
u8 nShift = (ppu.vr[op.vb]._u8[0] >> 3) & 0xf;
d.clear();
for (u8 b = 0; b < 16 - nShift; b++)
{ {
d._u8[15 - b] = VA._u8[15 - (b + nShift)]; d._u = a._u << (b._u8[0] & 0x78);
}
}; };
RETURN_(ppu, op);
RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]);
} }
template <u32 Build, ppu_exec_bit... Flags> template <u32 Build, ppu_exec_bit... Flags>
@ -2564,17 +2527,12 @@ auto VSLW()
if constexpr (Build == 0xf1a6) if constexpr (Build == 0xf1a6)
return ppu_exec_select<Flags...>::template select<>(); return ppu_exec_select<Flags...>::template select<>();
static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { static const auto exec = [](auto&& d, auto&& a, auto&& b)
auto& d = ppu.vr[op.vd];
const auto& a = ppu.vr[op.va];
const auto& b = ppu.vr[op.vb];
for (uint w = 0; w < 4; w++)
{ {
d._u32[w] = a._u32[w] << (b._u32[w] & 0x1f); d = gv_shl32(std::move(a), std::move(b));
}
}; };
RETURN_(ppu, op);
RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]);
} }
template <u32 Build, ppu_exec_bit... Flags> template <u32 Build, ppu_exec_bit... Flags>
@ -2583,16 +2541,12 @@ auto VSPLTB()
if constexpr (Build == 0xf1a6) if constexpr (Build == 0xf1a6)
return ppu_exec_select<Flags...>::template select<>(); return ppu_exec_select<Flags...>::template select<>();
static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { static const auto exec = [](auto&& d, auto&& b, auto&& imm)
auto& d = ppu.vr[op.vd];
u8 byte = ppu.vr[op.vb]._u8[15 - op.vuimm];
for (uint b = 0; b < 16; b++)
{ {
d._u8[b] = byte; d = gv_bcst8(b.u8r[imm & 15]);
}
}; };
RETURN_(ppu, op);
RETURN_(ppu.vr[op.vd], ppu.vr[op.vb], op.vuimm);
} }
template <u32 Build, ppu_exec_bit... Flags> template <u32 Build, ppu_exec_bit... Flags>
@ -2601,18 +2555,12 @@ auto VSPLTH()
if constexpr (Build == 0xf1a6) if constexpr (Build == 0xf1a6)
return ppu_exec_select<Flags...>::template select<>(); return ppu_exec_select<Flags...>::template select<>();
static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { static const auto exec = [](auto&& d, auto&& b, auto&& imm)
auto& d = ppu.vr[op.vd];
ensure((op.vuimm < 8));
u16 hword = ppu.vr[op.vb]._u16[7 - op.vuimm];
for (uint h = 0; h < 8; h++)
{ {
d._u16[h] = hword; d = gv_bcst16(b.u16r[imm & 7]);
}
}; };
RETURN_(ppu, op);
RETURN_(ppu.vr[op.vd], ppu.vr[op.vb], op.vuimm);
} }
template <u32 Build, ppu_exec_bit... Flags> template <u32 Build, ppu_exec_bit... Flags>
@ -2621,16 +2569,12 @@ auto VSPLTISB()
if constexpr (Build == 0xf1a6) if constexpr (Build == 0xf1a6)
return ppu_exec_select<Flags...>::template select<>(); return ppu_exec_select<Flags...>::template select<>();
static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { static const auto exec = [](auto&& d, auto&& imm)
auto& d = ppu.vr[op.vd];
const s8 imm = op.vsimm;
for (uint b = 0; b < 16; b++)
{ {
d._u8[b] = imm; d = gv_bcst8(imm);
}
}; };
RETURN_(ppu, op);
RETURN_(ppu.vr[op.vd], op.vsimm);
} }
template <u32 Build, ppu_exec_bit... Flags> template <u32 Build, ppu_exec_bit... Flags>
@ -2639,16 +2583,12 @@ auto VSPLTISH()
if constexpr (Build == 0xf1a6) if constexpr (Build == 0xf1a6)
return ppu_exec_select<Flags...>::template select<>(); return ppu_exec_select<Flags...>::template select<>();
static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { static const auto exec = [](auto&& d, auto&& imm)
auto& d = ppu.vr[op.vd];
const s16 imm = op.vsimm;
for (uint h = 0; h < 8; h++)
{ {
d._u16[h] = imm; d = gv_bcst16(imm);
}
}; };
RETURN_(ppu, op);
RETURN_(ppu.vr[op.vd], op.vsimm);
} }
template <u32 Build, ppu_exec_bit... Flags> template <u32 Build, ppu_exec_bit... Flags>
@ -2657,16 +2597,12 @@ auto VSPLTISW()
if constexpr (Build == 0xf1a6) if constexpr (Build == 0xf1a6)
return ppu_exec_select<Flags...>::template select<>(); return ppu_exec_select<Flags...>::template select<>();
static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { static const auto exec = [](auto&& d, auto&& imm)
auto& d = ppu.vr[op.vd];
const s32 imm = op.vsimm;
for (uint w = 0; w < 4; w++)
{ {
d._u32[w] = imm; d = gv_bcst32(imm);
}
}; };
RETURN_(ppu, op);
RETURN_(ppu.vr[op.vd], op.vsimm);
} }
template <u32 Build, ppu_exec_bit... Flags> template <u32 Build, ppu_exec_bit... Flags>
@ -2675,18 +2611,12 @@ auto VSPLTW()
if constexpr (Build == 0xf1a6) if constexpr (Build == 0xf1a6)
return ppu_exec_select<Flags...>::template select<>(); return ppu_exec_select<Flags...>::template select<>();
static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { static const auto exec = [](auto&& d, auto&& b, auto&& imm)
auto& d = ppu.vr[op.vd];
ensure((op.vuimm < 4));
u32 word = ppu.vr[op.vb]._u32[3 - op.vuimm];
for (uint w = 0; w < 4; w++)
{ {
d._u32[w] = word; d = gv_bcst32(b.u32r[imm & 3]);
}
}; };
RETURN_(ppu, op);
RETURN_(ppu.vr[op.vd], ppu.vr[op.vb], op.vuimm);
} }
template <u32 Build, ppu_exec_bit... Flags> template <u32 Build, ppu_exec_bit... Flags>
@ -2695,19 +2625,12 @@ auto VSR()
if constexpr (Build == 0xf1a6) if constexpr (Build == 0xf1a6)
return ppu_exec_select<Flags...>::template select<>(); return ppu_exec_select<Flags...>::template select<>();
static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { static const auto exec = [](auto&& d, auto&& a, auto&& b)
auto& d = ppu.vr[op.vd];
v128 VA = ppu.vr[op.va];
u8 sh = ppu.vr[op.vb]._u8[15] & 0x7;
d._u8[15] = VA._u8[15] >> sh;
for (uint b = 14; ~b; b--)
{ {
sh = ppu.vr[op.vb]._u8[b] & 0x7; d = gv_fshr8(gv_shuffle_right<1>(a), std::move(a), std::move(b));
d._u8[b] = (VA._u8[b] >> sh) | (VA._u8[b + 1] << (8 - sh));
}
}; };
RETURN_(ppu, op);
RETURN(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]);
} }
template <u32 Build, ppu_exec_bit... Flags> template <u32 Build, ppu_exec_bit... Flags>
@ -2716,17 +2639,12 @@ auto VSRAB()
if constexpr (Build == 0xf1a6) if constexpr (Build == 0xf1a6)
return ppu_exec_select<Flags...>::template select<>(); return ppu_exec_select<Flags...>::template select<>();
static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { static const auto exec = [](auto&& d, auto&& a, auto&& b)
auto& d = ppu.vr[op.vd];
const auto& a = ppu.vr[op.va];
const auto& b = ppu.vr[op.vb];
for (uint i = 0; i < 16; i++)
{ {
d._s8[i] = a._s8[i] >> (b._u8[i] & 0x7); d = gv_sar8(std::move(a), std::move(b));
}
}; };
RETURN_(ppu, op);
RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]);
} }
template <u32 Build, ppu_exec_bit... Flags> template <u32 Build, ppu_exec_bit... Flags>
@ -2735,17 +2653,12 @@ auto VSRAH()
if constexpr (Build == 0xf1a6) if constexpr (Build == 0xf1a6)
return ppu_exec_select<Flags...>::template select<>(); return ppu_exec_select<Flags...>::template select<>();
static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { static const auto exec = [](auto&& d, auto&& a, auto&& b)
auto& d = ppu.vr[op.vd];
const auto& a = ppu.vr[op.va];
const auto& b = ppu.vr[op.vb];
for (uint h = 0; h < 8; h++)
{ {
d._s16[h] = a._s16[h] >> (b._u16[h] & 0xf); d = gv_sar16(std::move(a), std::move(b));
}
}; };
RETURN_(ppu, op);
RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]);
} }
template <u32 Build, ppu_exec_bit... Flags> template <u32 Build, ppu_exec_bit... Flags>
@ -2754,17 +2667,12 @@ auto VSRAW()
if constexpr (Build == 0xf1a6) if constexpr (Build == 0xf1a6)
return ppu_exec_select<Flags...>::template select<>(); return ppu_exec_select<Flags...>::template select<>();
static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { static const auto exec = [](auto&& d, auto&& a, auto&& b)
auto& d = ppu.vr[op.vd];
const auto& a = ppu.vr[op.va];
const auto& b = ppu.vr[op.vb];
for (uint w = 0; w < 4; w++)
{ {
d._s32[w] = a._s32[w] >> (b._u32[w] & 0x1f); d = gv_sar32(std::move(a), std::move(b));
}
}; };
RETURN_(ppu, op);
RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]);
} }
template <u32 Build, ppu_exec_bit... Flags> template <u32 Build, ppu_exec_bit... Flags>
@ -2773,17 +2681,12 @@ auto VSRB()
if constexpr (Build == 0xf1a6) if constexpr (Build == 0xf1a6)
return ppu_exec_select<Flags...>::template select<>(); return ppu_exec_select<Flags...>::template select<>();
static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { static const auto exec = [](auto&& d, auto&& a, auto&& b)
auto& d = ppu.vr[op.vd];
const auto& a = ppu.vr[op.va];
const auto& b = ppu.vr[op.vb];
for (uint i = 0; i < 16; i++)
{ {
d._u8[i] = a._u8[i] >> (b._u8[i] & 0x7); d = gv_shr8(std::move(a), std::move(b));
}
}; };
RETURN_(ppu, op);
RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]);
} }
template <u32 Build, ppu_exec_bit... Flags> template <u32 Build, ppu_exec_bit... Flags>
@ -2792,17 +2695,12 @@ auto VSRH()
if constexpr (Build == 0xf1a6) if constexpr (Build == 0xf1a6)
return ppu_exec_select<Flags...>::template select<>(); return ppu_exec_select<Flags...>::template select<>();
static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { static const auto exec = [](auto&& d, auto&& a, auto&& b)
auto& d = ppu.vr[op.vd];
const auto& a = ppu.vr[op.va];
const auto& b = ppu.vr[op.vb];
for (uint h = 0; h < 8; h++)
{ {
d._u16[h] = a._u16[h] >> (b._u16[h] & 0xf); d = gv_shr16(std::move(a), std::move(b));
}
}; };
RETURN_(ppu, op);
RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]);
} }
template <u32 Build, ppu_exec_bit... Flags> template <u32 Build, ppu_exec_bit... Flags>
@ -2811,19 +2709,12 @@ auto VSRO()
if constexpr (Build == 0xf1a6) if constexpr (Build == 0xf1a6)
return ppu_exec_select<Flags...>::template select<>(); return ppu_exec_select<Flags...>::template select<>();
static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { static const auto exec = [](auto&& d, auto&& a, auto&& b)
auto& d = ppu.vr[op.vd];
v128 VA = ppu.vr[op.va];
u8 nShift = (ppu.vr[op.vb]._u8[0] >> 3) & 0xf;
d.clear();
for (u8 b = 0; b < 16 - nShift; b++)
{ {
d._u8[b] = VA._u8[b + nShift]; d._u = a._u >> (b._u8[0] & 0x78);
}
}; };
RETURN_(ppu, op);
RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]);
} }
template <u32 Build, ppu_exec_bit... Flags> template <u32 Build, ppu_exec_bit... Flags>
@ -2832,17 +2723,12 @@ auto VSRW()
if constexpr (Build == 0xf1a6) if constexpr (Build == 0xf1a6)
return ppu_exec_select<Flags...>::template select<>(); return ppu_exec_select<Flags...>::template select<>();
static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { static const auto exec = [](auto&& d, auto&& a, auto&& b)
auto& d = ppu.vr[op.vd];
const auto& a = ppu.vr[op.va];
const auto& b = ppu.vr[op.vb];
for (uint w = 0; w < 4; w++)
{ {
d._u32[w] = a._u32[w] >> (b._u32[w] & 0x1f); d = gv_shr32(std::move(a), std::move(b));
}
}; };
RETURN_(ppu, op);
RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]);
} }
template <u32 Build, ppu_exec_bit... Flags> template <u32 Build, ppu_exec_bit... Flags>
@ -3184,30 +3070,14 @@ auto VUPKHPX()
if constexpr (Build == 0xf1a6) if constexpr (Build == 0xf1a6)
return ppu_exec_select<Flags...>::template select<>(); return ppu_exec_select<Flags...>::template select<>();
#if defined(ARCH_X64_0)
static const auto make = [](asmjit::ppu_builder& c)
{
const auto [v0, v1, v2] = c.vec_alloc<3>();
EMIT(punpckhwd, v0, v0, c.ppu_vr(s_op.vb));
EMIT(psrad, v0, v0, c.imm(16));
EMIT(pslld, v1, v0, c.imm(6));
EMIT(pslld, v2, v0, c.imm(3));
BCST(pand, d, v0, v0, c.get_bcst<u32>(0xff00001f));
BCST(pand, d, v1, v1, c.get_bcst<u32>(0x1f0000));
BCST(pand, d, v2, v2, c.get_bcst<u32>(0x1f00));
EMIT(por, v0, v0, v1);
EMIT(por, v0, v0, v2);
LDST(movaps, c.ppu_vr(s_op.vd, true), v0);
c.ppu_ret();
};
#endif
static const auto exec = [](auto&& d, auto&& b) static const auto exec = [](auto&& d, auto&& b)
{ {
const auto x = gv_extend_hi_s16(b); auto x = gv_extend_hi_s16(std::move(b));
d = gv_and32(x, gv_bcst32(0xff00001f)) | gv_and32(gv_shl32(x, 6), gv_bcst32(0x1f0000)) | gv_and32(gv_shl32(x, 3), gv_bcst32(0x1f00)); auto y = gv_or32(gv_and32(gv_shl32(x, 6), gv_bcst32(0x1f0000)), gv_and32(gv_shl32(x, 3), gv_bcst32(0x1f00)));
d = gv_or32(std::move(y), gv_and32(std::move(x), gv_bcst32(0xff00001f)));
}; };
RETURN_(ppu.vr[op.vd], ppu.vr[op.vb]); RETURN(ppu.vr[op.vd], ppu.vr[op.vb]);
} }
template <u32 Build, ppu_exec_bit... Flags> template <u32 Build, ppu_exec_bit... Flags>
@ -3216,22 +3086,12 @@ auto VUPKHSB()
if constexpr (Build == 0xf1a6) if constexpr (Build == 0xf1a6)
return ppu_exec_select<Flags...>::template select<>(); return ppu_exec_select<Flags...>::template select<>();
#if defined(ARCH_X64_0)
static const auto make = [](asmjit::ppu_builder& c)
{
const auto v0 = c.vec_alloc();
EMIT(punpckhbw, v0, v0, c.ppu_vr(s_op.vb));
EMIT(psraw, v0, v0, c.imm(8));
LDST(movaps, c.ppu_vr(s_op.vd, true), v0);
c.ppu_ret();
};
#endif
static const auto exec = [](auto&& d, auto&& b) static const auto exec = [](auto&& d, auto&& b)
{ {
d = gv_extend_hi_s8(b); d = gv_extend_hi_s8(std::move(b));
}; };
RETURN_(ppu.vr[op.vd], ppu.vr[op.vb]); RETURN(ppu.vr[op.vd], ppu.vr[op.vb]);
} }
template <u32 Build, ppu_exec_bit... Flags> template <u32 Build, ppu_exec_bit... Flags>
@ -3240,22 +3100,12 @@ auto VUPKHSH()
if constexpr (Build == 0xf1a6) if constexpr (Build == 0xf1a6)
return ppu_exec_select<Flags...>::template select<>(); return ppu_exec_select<Flags...>::template select<>();
#if defined(ARCH_X64_0)
static const auto make = [](asmjit::ppu_builder& c)
{
const auto v0 = c.vec_alloc();
EMIT(punpckhwd, v0, v0, c.ppu_vr(s_op.vb));
EMIT(psrad, v0, v0, c.imm(16));
LDST(movaps, c.ppu_vr(s_op.vd, true), v0);
c.ppu_ret();
};
#endif
static const auto exec = [](auto&& d, auto&& b) static const auto exec = [](auto&& d, auto&& b)
{ {
d = gv_extend_hi_s16(b); d = gv_extend_hi_s16(std::move(b));
}; };
RETURN_(ppu.vr[op.vd], ppu.vr[op.vb]); RETURN(ppu.vr[op.vd], ppu.vr[op.vb]);
} }
template <u32 Build, ppu_exec_bit... Flags> template <u32 Build, ppu_exec_bit... Flags>
@ -3264,37 +3114,14 @@ auto VUPKLPX()
if constexpr (Build == 0xf1a6) if constexpr (Build == 0xf1a6)
return ppu_exec_select<Flags...>::template select<>(); return ppu_exec_select<Flags...>::template select<>();
#if defined(ARCH_X64_0)
static const auto make = [](asmjit::ppu_builder& c)
{
const auto [v0, v1, v2] = c.vec_alloc<3>();
if (utils::has_sse41())
{
LDST(pmovsxwd, v0, c.ppu_vr<8>(s_op.vb));
}
else
{
EMIT(punpcklwd, v0, v0, c.ppu_vr(s_op.vb));
EMIT(psrad, v0, v0, c.imm(16));
}
EMIT(pslld, v1, v0, c.imm(6));
EMIT(pslld, v2, v0, c.imm(3));
BCST(pand, d, v0, v0, c.get_bcst<u32>(0xff00001f));
BCST(pand, d, v1, v1, c.get_bcst<u32>(0x1f0000));
BCST(pand, d, v2, v2, c.get_bcst<u32>(0x1f00));
EMIT(por, v0, v0, v1);
EMIT(por, v0, v0, v2);
LDST(movaps, c.ppu_vr(s_op.vd, true), v0);
c.ppu_ret();
};
#endif
static const auto exec = [](auto&& d, auto&& b) static const auto exec = [](auto&& d, auto&& b)
{ {
const auto x = gv_extend_lo_s16(b); auto x = gv_extend_lo_s16(std::move(b));
d = gv_and32(x, gv_bcst32(0xff00001f)) | gv_and32(gv_shl32(x, 6), gv_bcst32(0x1f0000)) | gv_and32(gv_shl32(x, 3), gv_bcst32(0x1f00)); auto y = gv_or32(gv_and32(gv_shl32(x, 6), gv_bcst32(0x1f0000)), gv_and32(gv_shl32(x, 3), gv_bcst32(0x1f00)));
d = gv_or32(std::move(y), gv_and32(std::move(x), gv_bcst32(0xff00001f)));
}; };
RETURN_(ppu.vr[op.vd], ppu.vr[op.vb]); RETURN(ppu.vr[op.vd], ppu.vr[op.vb]);
} }
template <u32 Build, ppu_exec_bit... Flags> template <u32 Build, ppu_exec_bit... Flags>
@ -3303,29 +3130,12 @@ auto VUPKLSB()
if constexpr (Build == 0xf1a6) if constexpr (Build == 0xf1a6)
return ppu_exec_select<Flags...>::template select<>(); return ppu_exec_select<Flags...>::template select<>();
#if defined(ARCH_X64_0)
static const auto make = [](asmjit::ppu_builder& c)
{
const auto v0 = c.vec_alloc();
if (utils::has_sse41())
{
LDST(pmovsxbw, v0, c.ppu_vr<8>(s_op.vb));
}
else
{
EMIT(punpcklbw, v0, v0, c.ppu_vr(s_op.vb));
EMIT(psraw, v0, v0, c.imm(8));
}
LDST(movaps, c.ppu_vr(s_op.vd, true), v0);
c.ppu_ret();
};
#endif
static const auto exec = [](auto&& d, auto&& b) static const auto exec = [](auto&& d, auto&& b)
{ {
d = gv_extend_lo_s8(b); d = gv_extend_lo_s8(std::move(b));
}; };
RETURN_(ppu.vr[op.vd], ppu.vr[op.vb]); RETURN(ppu.vr[op.vd], ppu.vr[op.vb]);
} }
template <u32 Build, ppu_exec_bit... Flags> template <u32 Build, ppu_exec_bit... Flags>
@ -3334,29 +3144,12 @@ auto VUPKLSH()
if constexpr (Build == 0xf1a6) if constexpr (Build == 0xf1a6)
return ppu_exec_select<Flags...>::template select<>(); return ppu_exec_select<Flags...>::template select<>();
#if defined(ARCH_X64_0)
static const auto make = [](asmjit::ppu_builder& c)
{
const auto v0 = c.vec_alloc();
if (utils::has_sse41())
{
LDST(pmovsxwd, v0, c.ppu_vr<8>(s_op.vb));
}
else
{
EMIT(punpcklwd, v0, v0, c.ppu_vr(s_op.vb));
EMIT(psrad, v0, v0, c.imm(16));
}
LDST(movaps, c.ppu_vr(s_op.vd, true), v0);
c.ppu_ret();
};
#endif
static const auto exec = [](auto&& d, auto&& b) static const auto exec = [](auto&& d, auto&& b)
{ {
d = gv_extend_lo_s16(b); d = gv_extend_lo_s16(std::move(b));
}; };
RETURN_(ppu.vr[op.vd], ppu.vr[op.vb]); RETURN(ppu.vr[op.vd], ppu.vr[op.vb]);
} }
template <u32 Build, ppu_exec_bit... Flags> template <u32 Build, ppu_exec_bit... Flags>
@ -7157,7 +6950,8 @@ struct ppu_interpreter_t
IT VSEL; IT VSEL;
IT VSL; IT VSL;
IT VSLB; IT VSLB;
IT VSLDOI; IT VSLDOI{};
IT VSLDOI_[16];
IT VSLH; IT VSLH;
IT VSLO; IT VSLO;
IT VSLW; IT VSLW;
@ -7629,6 +7423,27 @@ ppu_interpreter_rt_base::ppu_interpreter_rt_base() noexcept
return ::name<0, Flags...>(); \ return ::name<0, Flags...>(); \
}); \ }); \
#define INIT_ONE(name, bits) \
ptrs->name##_[0b##bits] = ::name<0b##bits>::select(selected, []<ppu_exec_bit... Flags>() { \
return ::name<0b##bits>::impl<0, Flags...>(); \
}); \
#define INIT_PACK2(name, bits) \
INIT_ONE(name, bits##0) \
INIT_ONE(name, bits##1) \
#define INIT_PACK4(name, bits) \
INIT_PACK2(name, bits##0) \
INIT_PACK2(name, bits##1) \
#define INIT_PACK8(name, bits) \
INIT_PACK4(name, bits##0) \
INIT_PACK4(name, bits##1) \
#define INIT_PACK16(name, bits) \
INIT_PACK8(name, bits##0) \
INIT_PACK8(name, bits##1) \
INIT(MFVSCR); INIT(MFVSCR);
INIT(MTVSCR); INIT(MTVSCR);
INIT(VADDCUW); INIT(VADDCUW);
@ -7732,7 +7547,7 @@ ppu_interpreter_rt_base::ppu_interpreter_rt_base() noexcept
INIT(VSEL); INIT(VSEL);
INIT(VSL); INIT(VSL);
INIT(VSLB); INIT(VSLB);
INIT(VSLDOI); INIT_PACK16(VSLDOI,);
INIT(VSLH); INIT(VSLH);
INIT(VSLO); INIT(VSLO);
INIT(VSLW); INIT(VSLW);
@ -8051,6 +7866,7 @@ ppu_intrp_func_t ppu_interpreter_rt::decode(u32 opv) const noexcept
break; break;
} }
case ppu_itype::VSLDOI: return ptrs->VSLDOI_[op.vsh];
default: break; default: break;
} }
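The VSLDOI handling above turns the 4-bit shift field into a template parameter and dispatches through a 16-entry table built with the INIT_PACK* macros earlier in this file. An equivalent standalone sketch of the idea, using index_sequence instead of the macros (my own formulation, illustrative names):

#include <array>
#include <cstddef>
#include <cstdio>
#include <utility>

// One handler per possible shift amount, so the byte-shuffle count is a
// compile-time constant inside each instantiation.
template <std::size_t Count>
void vsldoi_sketch()
{
    std::printf("VSLDOI by %zu bytes\n", Count);
}

using handler_t = void (*)();

template <std::size_t... I>
constexpr std::array<handler_t, sizeof...(I)> make_vsldoi_table(std::index_sequence<I...>)
{
    return {{&vsldoi_sketch<I>...}};
}

inline constexpr auto vsldoi_table = make_vsldoi_table(std::make_index_sequence<16>{});
// decode() then simply indexes the table: vsldoi_table[op_vsh & 15]();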

View File

@ -193,7 +193,7 @@ namespace utils
#elif defined(__clang__) #elif defined(__clang__)
return __builtin_rotateleft32(x, n); return __builtin_rotateleft32(x, n);
#else #else
return (x << n) | (x >> (32 - n)); return (x << (n & 31)) | (x >> (((0 - n) & 31)));
#endif #endif
} }
@ -209,7 +209,7 @@ namespace utils
#elif defined(__clang__) #elif defined(__clang__)
return __builtin_rotateleft64(x, n); return __builtin_rotateleft64(x, n);
#else #else
return (x << n) | (x >> (64 - n)); return (x << (n & 63)) | (x >> (((0 - n) & 63)));
#endif #endif
} }

View File

@ -3,6 +3,7 @@
#include "util/types.hpp" #include "util/types.hpp"
#include "util/v128.hpp" #include "util/v128.hpp"
#include "util/sysinfo.hpp" #include "util/sysinfo.hpp"
#include "util/asm.hpp"
#include "Utilities/JIT.h" #include "Utilities/JIT.h"
#if defined(ARCH_X64) #if defined(ARCH_X64)
@ -40,6 +41,7 @@ namespace asmjit
#else #else
struct gpr_type : Operand struct gpr_type : Operand
{ {
gpr_type() = default;
gpr_type(u32) gpr_type(u32)
{ {
} }
@ -47,6 +49,7 @@ namespace asmjit
struct vec_type : Operand struct vec_type : Operand
{ {
vec_type() = default;
vec_type(u32) vec_type(u32)
{ {
} }
@ -82,7 +85,7 @@ namespace asmjit
template <typename T, typename D = std::decay_t<T>> template <typename T, typename D = std::decay_t<T>>
constexpr arg_class arg_classify = constexpr arg_class arg_classify =
std::is_base_of_v<v128, D> ? arg_class::imm_lv + !std::is_reference_v<T> : std::is_same_v<v128, D> ? arg_class::imm_lv + !std::is_reference_v<T> :
std::is_base_of_v<mem_type, D> ? arg_class::mem_lv : std::is_base_of_v<mem_type, D> ? arg_class::mem_lv :
std::is_base_of_v<mem_lazy, D> ? arg_class::mem_lv + !std::is_reference_v<T> : std::is_base_of_v<mem_lazy, D> ? arg_class::mem_lv + !std::is_reference_v<T> :
std::is_reference_v<T> ? arg_class::reg_lv : arg_class::reg_rv; std::is_reference_v<T> ? arg_class::reg_lv : arg_class::reg_rv;
@ -91,6 +94,8 @@ namespace asmjit
{ {
using base = native_asm; using base = native_asm;
bool fail_flag = false;
vec_builder(CodeHolder* ch) vec_builder(CodeHolder* ch)
: native_asm(ch) : native_asm(ch)
{ {
@ -150,6 +155,9 @@ namespace asmjit
std::unordered_map<v128, Label> consts[16]{}; std::unordered_map<v128, Label> consts[16]{};
#if defined(ARCH_X64)
std::unordered_map<v128, vec_type> const_allocs{};
template <typename T, u32 Size = sizeof(T)> template <typename T, u32 Size = sizeof(T)>
x86::Mem get_const(const T& data, u32 esize = Size) x86::Mem get_const(const T& data, u32 esize = Size)
{ {
@ -180,14 +188,97 @@ namespace asmjit
return x86::Mem(_label, 0, Size); return x86::Mem(_label, 0, Size);
} }
#endif
};
struct free_on_exit
{
Operand x{};
free_on_exit() = default;
free_on_exit(const free_on_exit&) = delete;
free_on_exit& operator=(const free_on_exit&) = delete;
~free_on_exit()
{
if (x.isReg())
{
vec_type v;
v.copyFrom(x);
g_vc->vec_dealloc(v);
}
}
}; };
#if defined(ARCH_X64) #if defined(ARCH_X64)
inline auto arg_eval(const v128& _c, u32 esize) inline Operand arg_eval(v128& _c, u32 esize)
{ {
// TODO: implement PSHUFD broadcasts and AVX ones const auto found = g_vc->const_allocs.find(_c);
auto r = g_vc->get_const(_c, esize);
return r; if (found != g_vc->const_allocs.end())
{
return found->second;
}
vec_type reg = g_vc->vec_alloc();
// TODO: PSHUFD style broadcast? Needs known const layout
if (utils::has_avx() && _c._u64[0] == _c._u64[1])
{
if (_c._u32[0] == _c._u32[1])
{
if (utils::has_avx2() && _c._u16[0] == _c._u16[1])
{
if (_c._u8[0] == _c._u8[1])
{
ensure(!g_vc->vpbroadcastb(reg, g_vc->get_const(_c._u8[0])));
}
else
{
ensure(!g_vc->vpbroadcastw(reg, g_vc->get_const(_c._u16[0])));
}
}
else
{
ensure(!g_vc->vbroadcastss(reg, g_vc->get_const(_c._u32[0])));
}
}
else
{
ensure(!g_vc->vbroadcastsd(reg, g_vc->get_const(_c._u32[0])));
}
}
else if (!_c._u)
{
ensure(!g_vc->pxor(reg, reg));
}
else if (!~_c._u)
{
ensure(!g_vc->pcmpeqd(reg, reg));
}
else
{
ensure(!g_vc->movaps(reg, g_vc->get_const(_c, esize)));
}
g_vc->const_allocs.emplace(_c, reg);
return reg;
}
inline Operand arg_eval(v128&& _c, u32 esize)
{
const auto found = g_vc->const_allocs.find(_c);
if (found != g_vc->const_allocs.end())
{
vec_type r = found->second;
g_vc->const_allocs.erase(found);
g_vc->vec_dealloc(r);
return r;
}
// Hack: assume can use mem op (TODO)
return g_vc->get_const(_c, esize);
} }
template <typename T> requires(std::is_base_of_v<mem_lazy, std::decay_t<T>>) template <typename T> requires(std::is_base_of_v<mem_lazy, std::decay_t<T>>)
@ -211,12 +302,24 @@ namespace asmjit
return std::move(mem); return std::move(mem);
} }
inline void arg_free(const v128&)
{
}
inline void arg_free(const Operand& op)
{
if (op.isReg())
{
g_vc->vec_dealloc(vec_type{op.id()});
}
}
template <typename T> template <typename T>
inline bool arg_use_evex(const auto& op) inline bool arg_use_evex(const auto& op)
{ {
constexpr auto _class = arg_classify<T>; constexpr auto _class = arg_classify<T>;
if constexpr (_class == arg_class::imm_rv) if constexpr (_class == arg_class::imm_rv)
return true; return g_vc->const_allocs.count(op) == 0;
else if constexpr (_class == arg_class::imm_lv) else if constexpr (_class == arg_class::imm_lv)
return false; return false;
else if (op.isMem()) else if (op.isMem())
@ -302,6 +405,7 @@ namespace asmjit
template <typename A, typename B, typename... Args> template <typename A, typename B, typename... Args>
vec_type binary_op(u32 esize, x86::Inst::Id mov_op, x86::Inst::Id sse_op, x86::Inst::Id avx_op, x86::Inst::Id evex_op, A&& a, B&& b, Args&&... args) vec_type binary_op(u32 esize, x86::Inst::Id mov_op, x86::Inst::Id sse_op, x86::Inst::Id avx_op, x86::Inst::Id evex_op, A&& a, B&& b, Args&&... args)
{ {
free_on_exit e;
Operand src1{}; Operand src1{};
if constexpr (arg_classify<A> == arg_class::reg_rv) if constexpr (arg_classify<A> == arg_class::reg_rv)
@ -317,12 +421,13 @@ namespace asmjit
if constexpr (arg_classify<B> == arg_class::reg_rv) if constexpr (arg_classify<B> == arg_class::reg_rv)
{ {
g_vc->vec_dealloc(vec_type{b.id()}); e.x = b;
//b = Operand();
} }
} }
else if (utils::has_avx() && avx_op && (arg_classify<A> == arg_class::reg_lv || arg_classify<A> == arg_class::mem_lv)) else if (utils::has_avx() && avx_op && (arg_classify<A> == arg_class::reg_lv || arg_classify<A> == arg_class::mem_lv))
{ {
Operand srca = arg_eval(std::forward<A>(a), 16);
if constexpr (arg_classify<A> == arg_class::reg_lv) if constexpr (arg_classify<A> == arg_class::reg_lv)
{ {
if constexpr (arg_classify<B> == arg_class::reg_rv) if constexpr (arg_classify<B> == arg_class::reg_rv)
@ -336,47 +441,79 @@ namespace asmjit
src1 = g_vc->vec_alloc(); src1 = g_vc->vec_alloc();
} }
} }
else // if A == arg_class::reg_rv else
{ {
src1 = g_vc->vec_alloc(); src1 = g_vc->vec_alloc();
if (!a.isReg())
{
static_cast<void>(arg_eval(std::forward<A>(a), 16));
}
if constexpr (arg_classify<B> == arg_class::reg_rv) if constexpr (arg_classify<B> == arg_class::reg_rv)
{ {
g_vc->vec_dealloc(vec_type{b.id()}); e.x = b;
//b = Operand();
} }
} }
if (utils::has_avx512() && evex_op && arg_use_evex<B>(b)) if (utils::has_avx512() && evex_op && arg_use_evex<B>(b))
{ {
ensure(!g_vc->evex().emit(evex_op, src1, vec_type{a.id()}, arg_eval(std::forward<B>(b), esize), std::forward<Args>(args)...)); ensure(!g_vc->evex().emit(evex_op, src1, srca, arg_eval(std::forward<B>(b), esize), std::forward<Args>(args)...));
return vec_type{src1.id()}; return vec_type{src1.id()};
} }
ensure(!g_vc->emit(avx_op, src1, vec_type{a.id()}, arg_eval(std::forward<B>(b), 16), std::forward<Args>(args)...)); ensure(!g_vc->emit(avx_op, src1, srca, arg_eval(std::forward<B>(b), 16), std::forward<Args>(args)...));
return vec_type{src1.id()}; return vec_type{src1.id()};
} }
else do else do
{ {
if constexpr (arg_classify<B> == arg_class::reg_rv) if constexpr (arg_classify<A> == arg_class::mem_rv)
{ {
g_vc->vec_dealloc(vec_type{b.id()}); if (a.isReg())
//b = Operand(); {
src1 = vec_type(a.id());
if constexpr (arg_classify<B> == arg_class::reg_rv)
{
e.x = b;
}
break;
}
} }
if (arg_classify<A> == arg_class::mem_rv && a.isReg()) if constexpr (arg_classify<A> == arg_class::imm_rv)
{ {
src1 = vec_type(a.id()); if (auto found = g_vc->const_allocs.find(a); found != g_vc->const_allocs.end())
break; {
src1 = found->second;
g_vc->const_allocs.erase(found);
if constexpr (arg_classify<B> == arg_class::reg_rv)
{
e.x = b;
}
break;
}
} }
src1 = g_vc->vec_alloc(); src1 = g_vc->vec_alloc();
if constexpr (arg_classify<B> == arg_class::reg_rv)
{
e.x = b;
}
if constexpr (arg_classify<A> == arg_class::imm_rv)
{
if (!a._u)
{
// All zeros
ensure(!g_vc->emit(x86::Inst::kIdPxor, src1, src1));
break;
}
else if (!~a._u)
{
// All ones
ensure(!g_vc->emit(x86::Inst::kIdPcmpeqd, src1, src1));
break;
}
}
// Fallback to arg copy // Fallback to arg copy
ensure(!g_vc->emit(mov_op, src1, arg_eval(std::forward<A>(a), 16))); ensure(!g_vc->emit(mov_op, src1, arg_eval(std::forward<A>(a), 16)));
} }
@ -404,10 +541,14 @@ namespace asmjit
} }
inline v128 gv_select8(const v128& _cmp, const v128& _true, const v128& _false); inline v128 gv_select8(const v128& _cmp, const v128& _true, const v128& _false);
inline v128 gv_signselect8(const v128& bits, const v128& _true, const v128& _false);
inline v128 gv_select16(const v128& _cmp, const v128& _true, const v128& _false); inline v128 gv_select16(const v128& _cmp, const v128& _true, const v128& _false);
inline v128 gv_select32(const v128& _cmp, const v128& _true, const v128& _false); inline v128 gv_select32(const v128& _cmp, const v128& _true, const v128& _false);
inline v128 gv_selectfs(const v128& _cmp, const v128& _true, const v128& _false); inline v128 gv_selectfs(const v128& _cmp, const v128& _true, const v128& _false);
template <typename A, typename B> requires (asmjit::any_operand_v<A, B>)
inline asmjit::vec_type gv_gts32(A&&, B&&);
inline void gv_set_zeroing_denormals() inline void gv_set_zeroing_denormals()
{ {
#if defined(ARCH_X64) #if defined(ARCH_X64)
@ -704,6 +845,16 @@ inline v128 gv_not32(const v128& a)
#endif #endif
} }
template <typename A> requires (asmjit::any_operand_v<A>)
inline auto gv_not32(A&& a)
{
#if defined(ARCH_X64)
asmjit::vec_type ones = g_vc->vec_alloc();
g_vc->pcmpeqd(ones, ones);
FOR_X64(binary_op, 4, kIdMovdqa, kIdPxor, kIdVpxor, kIdVpxord, std::move(ones), std::forward<A>(a));
#endif
}
inline v128 gv_notfs(const v128& a) inline v128 gv_notfs(const v128& a)
{ {
#if defined(ARCH_X64) #if defined(ARCH_X64)
@ -713,6 +864,16 @@ inline v128 gv_notfs(const v128& a)
#endif #endif
} }
template <typename A> requires (asmjit::any_operand_v<A>)
inline auto gv_notfs(A&& a)
{
#if defined(ARCH_X64)
asmjit::vec_type ones = g_vc->vec_alloc();
g_vc->pcmpeqd(ones, ones);
FOR_X64(binary_op, 4, kIdMovaps, kIdXorps, kIdVxorps, kIdVxorps, std::move(ones), std::forward<A>(a));
#endif
}
inline v128 gv_shl16(const v128& a, u32 count) inline v128 gv_shl16(const v128& a, u32 count)
{ {
if (count >= 16) if (count >= 16)
@ -724,7 +885,7 @@ inline v128 gv_shl16(const v128& a, u32 count)
#endif #endif
} }
template <typename A> requires(asmjit::any_operand_v<A>) template <typename A> requires (asmjit::any_operand_v<A>)
inline auto gv_shl16(A&& a, u32 count) inline auto gv_shl16(A&& a, u32 count)
{ {
FOR_X64(unary_op, kIdPsllw, kIdVpsllw, std::forward<A>(a), count); FOR_X64(unary_op, kIdPsllw, kIdVpsllw, std::forward<A>(a), count);
@ -741,7 +902,7 @@ inline v128 gv_shl32(const v128& a, u32 count)
#endif #endif
} }
template <typename A> requires(asmjit::any_operand_v<A>) template <typename A> requires (asmjit::any_operand_v<A>)
inline auto gv_shl32(A&& a, u32 count) inline auto gv_shl32(A&& a, u32 count)
{ {
FOR_X64(unary_op, kIdPslld, kIdVpslld, std::forward<A>(a), count); FOR_X64(unary_op, kIdPslld, kIdVpslld, std::forward<A>(a), count);
@ -758,7 +919,7 @@ inline v128 gv_shl64(const v128& a, u32 count)
#endif #endif
} }
template <typename A> requires(asmjit::any_operand_v<A>) template <typename A> requires (asmjit::any_operand_v<A>)
inline auto gv_shl64(A&& a, u32 count) inline auto gv_shl64(A&& a, u32 count)
{ {
FOR_X64(unary_op, kIdPsllq, kIdVpsllq, std::forward<A>(a), count); FOR_X64(unary_op, kIdPsllq, kIdVpsllq, std::forward<A>(a), count);
@ -775,7 +936,7 @@ inline v128 gv_shr16(const v128& a, u32 count)
#endif #endif
} }
template <typename A> requires(asmjit::any_operand_v<A>) template <typename A> requires (asmjit::any_operand_v<A>)
inline auto gv_shr16(A&& a, u32 count) inline auto gv_shr16(A&& a, u32 count)
{ {
FOR_X64(unary_op, kIdPsrlw, kIdVpsrlw, std::forward<A>(a), count); FOR_X64(unary_op, kIdPsrlw, kIdVpsrlw, std::forward<A>(a), count);
@ -792,7 +953,7 @@ inline v128 gv_shr32(const v128& a, u32 count)
#endif #endif
} }
template <typename A> requires(asmjit::any_operand_v<A>) template <typename A> requires (asmjit::any_operand_v<A>)
inline auto gv_shr32(A&& a, u32 count) inline auto gv_shr32(A&& a, u32 count)
{ {
FOR_X64(unary_op, kIdPsrld, kIdVpsrld, std::forward<A>(a), count); FOR_X64(unary_op, kIdPsrld, kIdVpsrld, std::forward<A>(a), count);
@ -809,7 +970,7 @@ inline v128 gv_shr64(const v128& a, u32 count)
#endif #endif
} }
template <typename A> requires(asmjit::any_operand_v<A>) template <typename A> requires (asmjit::any_operand_v<A>)
inline auto gv_shr64(A&& a, u32 count) inline auto gv_shr64(A&& a, u32 count)
{ {
FOR_X64(unary_op, kIdPsrlq, kIdVpsrlq, std::forward<A>(a), count); FOR_X64(unary_op, kIdPsrlq, kIdVpsrlq, std::forward<A>(a), count);
@ -826,7 +987,7 @@ inline v128 gv_sar16(const v128& a, u32 count)
#endif #endif
} }
template <typename A> requires(asmjit::any_operand_v<A>) template <typename A> requires (asmjit::any_operand_v<A>)
inline auto gv_sar16(A&& a, u32 count) inline auto gv_sar16(A&& a, u32 count)
{ {
FOR_X64(unary_op, kIdPsraw, kIdVpsraw, std::forward<A>(a), count); FOR_X64(unary_op, kIdPsraw, kIdVpsraw, std::forward<A>(a), count);
@ -843,7 +1004,7 @@ inline v128 gv_sar32(const v128& a, u32 count)
#endif #endif
} }
template <typename A> requires(asmjit::any_operand_v<A>) template <typename A> requires (asmjit::any_operand_v<A>)
inline auto gv_sar32(A&& a, u32 count) inline auto gv_sar32(A&& a, u32 count)
{ {
FOR_X64(unary_op, kIdPsrad, kIdVpsrad, std::forward<A>(a), count); FOR_X64(unary_op, kIdPsrad, kIdVpsrad, std::forward<A>(a), count);
@ -867,6 +1028,20 @@ inline v128 gv_sar64(const v128& a, u32 count)
#endif #endif
} }
template <typename A> requires (asmjit::any_operand_v<A>)
inline auto gv_sar64(A&& a, u32 count)
{
if (count >= 64)
count = 63;
#if defined(ARCH_X64)
using enum asmjit::x86::Inst::Id;
if (utils::has_avx512())
return asmjit::unary_op(kIdNone, kIdVpsraq, std::forward<A>(a), count);
g_vc->fail_flag = true;
return std::forward<A>(a);
#endif
}
inline v128 gv_add8(const v128& a, const v128& b) inline v128 gv_add8(const v128& a, const v128& b)
{ {
#if defined(ARCH_X64) #if defined(ARCH_X64)
@ -1025,6 +1200,20 @@ inline v128 gv_addus_u32(const v128& a, const v128& b)
#endif #endif
} }
template <typename A, typename B> requires (asmjit::any_operand_v<A, B>)
inline asmjit::vec_type gv_addus_u32(A&& a, B&& b)
{
#if defined(ARCH_X64)
if (utils::has_sse41())
return gv_add32(gv_minu32(std::forward<B>(b), gv_not32(a)), std::forward<A>(a));
auto s = gv_add32(a, b);
auto x = gv_xor32(std::forward<B>(b), gv_bcst32(0x80000000));
auto y = gv_xor32(std::forward<A>(a), gv_bcst32(0x7fffffff));
return gv_or32(std::move(s), gv_gts32(std::move(x), std::move(y)));
#endif
return {};
}
inline v128 gv_addfs(const v128& a, const v128& b) inline v128 gv_addfs(const v128& a, const v128& b)
{ {
#if defined(ARCH_X64) #if defined(ARCH_X64)
@ -1052,6 +1241,12 @@ inline v128 gv_sub8(const v128& a, const v128& b)
#endif #endif
} }
template <typename A, typename B> requires (asmjit::any_operand_v<A, B>)
inline auto gv_sub8(A&& a, B&& b)
{
FOR_X64(binary_op, 1, kIdMovdqa, kIdPsubb, kIdVpsubb, kIdNone, std::forward<A>(a), std::forward<B>(b));
}
inline v128 gv_sub16(const v128& a, const v128& b) inline v128 gv_sub16(const v128& a, const v128& b)
{ {
#if defined(ARCH_X64) #if defined(ARCH_X64)
@ -1265,6 +1460,21 @@ inline v128 gv_minu32(const v128& a, const v128& b)
#endif #endif
} }
template <typename A, typename B> requires (asmjit::any_operand_v<A, B>)
inline asmjit::vec_type gv_minu32(A&& a, B&& b)
{
#if defined(ARCH_X64)
if (utils::has_sse41())
FOR_X64(binary_op, 4, kIdMovdqa, kIdPminud, kIdVpminud, kIdVpminud, std::forward<A>(a), std::forward<B>(b));
auto s = gv_bcst32(0x80000000);
auto x = gv_xor32(a, s);
auto m = gv_gts32(std::move(x), gv_xor32(std::move(s), b));
auto z = gv_and32(m, std::move(b));
return gv_or32(std::move(z), gv_andn32(std::move(m), std::move(a)));
#endif
return {};
}
inline v128 gv_mins8(const v128& a, const v128& b) inline v128 gv_mins8(const v128& a, const v128& b)
{ {
#if defined(__SSE4_1__) #if defined(__SSE4_1__)
@ -1493,6 +1703,13 @@ inline v128 gv_gts8(const v128& a, const v128& b)
#endif #endif
} }
template <typename A, typename B> requires (asmjit::any_operand_v<A, B>)
inline asmjit::vec_type gv_gts8(A&& a, B&& b)
{
FOR_X64(binary_op, 1, kIdMovdqa, kIdPcmpgtb, kIdVpcmpgtb, kIdNone, std::forward<A>(a), std::forward<B>(b));
return {};
}
inline v128 gv_gts16(const v128& a, const v128& b) inline v128 gv_gts16(const v128& a, const v128& b)
{ {
#if defined(ARCH_X64) #if defined(ARCH_X64)
@ -1511,6 +1728,13 @@ inline v128 gv_gts32(const v128& a, const v128& b)
#endif #endif
} }
template <typename A, typename B> requires (asmjit::any_operand_v<A, B>)
inline asmjit::vec_type gv_gts32(A&& a, B&& b)
{
FOR_X64(binary_op, 4, kIdMovdqa, kIdPcmpgtd, kIdVpcmpgtd, kIdNone, std::forward<A>(a), std::forward<B>(b));
return {};
}
inline v128 gv_avgu8(const v128& a, const v128& b) inline v128 gv_avgu8(const v128& a, const v128& b)
{ {
#if defined(ARCH_X64) #if defined(ARCH_X64)
@ -2154,7 +2378,7 @@ inline v128 gv_andn(const v128& a, const v128& b)
} }
// Select elements; _cmp must be result of SIMD comparison; undefined otherwise // Select elements; _cmp must be result of SIMD comparison; undefined otherwise
inline v128 gv_select8(const v128& _cmp, const v128& _true, const v128& _false) FORCE_INLINE v128 gv_select8(const v128& _cmp, const v128& _true, const v128& _false)
{ {
#if defined(__SSE4_1__) #if defined(__SSE4_1__)
return _mm_blendv_epi8(_false, _true, _cmp); return _mm_blendv_epi8(_false, _true, _cmp);
@ -2165,6 +2389,45 @@ inline v128 gv_select8(const v128& _cmp, const v128& _true, const v128& _false)
#endif #endif
} }
// Select elements using sign bit only
FORCE_INLINE v128 gv_signselect8(const v128& bits, const v128& _true, const v128& _false)
{
#if defined(__SSE4_1__)
return _mm_blendv_epi8(_false, _true, bits);
#else
return gv_select8(gv_gts8(gv_bcst8(0), bits), _true, _false);
#endif
}
template <typename A, typename B, typename C> requires (asmjit::any_operand_v<A, B, C>)
inline asmjit::vec_type gv_signselect8(A&& bits, B&& _true, C&& _false)
{
using namespace asmjit;
#if defined(ARCH_X64)
if (utils::has_avx())
{
Operand arg0{};
Operand arg1 = arg_eval(std::forward<A>(bits), 16);
Operand arg2 = arg_eval(std::forward<B>(_true), 16);
Operand arg3 = arg_eval(std::forward<C>(_false), 16);
if constexpr (!std::is_reference_v<A>)
arg0.isReg() ? arg_free(bits) : arg0.copyFrom(arg1);
if constexpr (!std::is_reference_v<B>)
arg0.isReg() ? arg_free(_true) : arg0.copyFrom(arg2);
if constexpr (!std::is_reference_v<C>)
arg0.isReg() ? arg_free(_false) : arg0.copyFrom(arg3);
if (arg0.isNone())
arg0 = g_vc->vec_alloc();
g_vc->emit(x86::Inst::kIdVpblendvb, arg0, arg3, arg2, arg1);
vec_type r;
r.copyFrom(arg0);
return r;
}
#endif
g_vc->fail_flag = true;
return vec_type{0};
}
// Select elements; _cmp must be result of SIMD comparison; undefined otherwise // Select elements; _cmp must be result of SIMD comparison; undefined otherwise
inline v128 gv_select16(const v128& _cmp, const v128& _true, const v128& _false) inline v128 gv_select16(const v128& _cmp, const v128& _true, const v128& _false)
{ {
@ -2305,6 +2568,17 @@ inline v128 gv_extend_lo_s8(const v128& vec)
#endif #endif
} }
template <typename A> requires (asmjit::any_operand_v<A>)
inline auto gv_extend_lo_s8(A&& a)
{
#if defined(ARCH_X64)
using enum asmjit::x86::Inst::Id;
if (utils::has_sse41())
return asmjit::unary_op(kIdPmovsxbw, kIdVpmovsxbw, std::forward<A>(a));
return asmjit::unary_op(kIdPsraw, kIdVpsraw, asmjit::unary_op(kIdNone, kIdPunpcklbw, std::forward<A>(a)), 8);
#endif
}
inline v128 gv_extend_hi_s8(const v128& vec) inline v128 gv_extend_hi_s8(const v128& vec)
{ {
#if defined(__SSE4_1__) #if defined(__SSE4_1__)
@ -2316,6 +2590,15 @@ inline v128 gv_extend_hi_s8(const v128& vec)
#endif #endif
} }
template <typename A> requires (asmjit::any_operand_v<A>)
inline auto gv_extend_hi_s8(A&& a)
{
#if defined(ARCH_X64)
using enum asmjit::x86::Inst::Id;
return asmjit::unary_op(kIdPsraw, kIdVpsraw, asmjit::unary_op(kIdNone, kIdPunpckhbw, std::forward<A>(a)), 8);
#endif
}
inline v128 gv_unpacklo16(const v128& lows, const v128& highs) inline v128 gv_unpacklo16(const v128& lows, const v128& highs)
{ {
#if defined(ARCH_X64) #if defined(ARCH_X64)
@ -2336,6 +2619,17 @@ inline v128 gv_extend_lo_s16(const v128& vec)
#endif #endif
} }
template <typename A> requires (asmjit::any_operand_v<A>)
inline auto gv_extend_lo_s16(A&& a)
{
#if defined(ARCH_X64)
using enum asmjit::x86::Inst::Id;
if (utils::has_sse41())
return asmjit::unary_op(kIdPmovsxwd, kIdVpmovsxwd, std::forward<A>(a));
return asmjit::unary_op(kIdPsrad, kIdVpsrad, asmjit::unary_op(kIdNone, kIdPunpcklwd, std::forward<A>(a)), 16);
#endif
}
inline v128 gv_extend_hi_s16(const v128& vec) inline v128 gv_extend_hi_s16(const v128& vec)
{ {
#if defined(__SSE4_1__) #if defined(__SSE4_1__)
@ -2347,6 +2641,15 @@ inline v128 gv_extend_hi_s16(const v128& vec)
#endif #endif
} }
template <typename A> requires (asmjit::any_operand_v<A>)
inline auto gv_extend_hi_s16(A&& a)
{
#if defined(ARCH_X64)
using enum asmjit::x86::Inst::Id;
return asmjit::unary_op(kIdPsrad, kIdVpsrad, asmjit::unary_op(kIdNone, kIdPunpckhwd, std::forward<A>(a)), 16);
#endif
}
inline v128 gv_unpacklo32(const v128& lows, const v128& highs) inline v128 gv_unpacklo32(const v128& lows, const v128& highs)
{ {
#if defined(ARCH_X64) #if defined(ARCH_X64)
@ -2471,3 +2774,280 @@ inline v128 gv_log2_approxfs(const v128& a)
return r; return r;
#endif #endif
} }
// For each 8-bit element, r = a << (b & 7)
inline v128 gv_shl8(const v128& a, const v128& b)
{
#if defined(ARCH_ARM64)
return vshlq_u8(a, vandq_s8(b, gv_bcst8(7)));
#else
const v128 x1 = gv_add8(a, a); // shift left by 1
const v128 r1 = gv_signselect8(gv_shl64(b, 7), x1, a);
const v128 x2 = gv_and32(gv_shl64(r1, 2), gv_bcst8(0xfc)); // shift by 2
const v128 r2 = gv_signselect8(gv_shl64(b, 6), x2, r1);
const v128 x3 = gv_and32(gv_shl64(r2, 4), gv_bcst8(0xf0)); // shift by 4
return gv_signselect8(gv_shl64(b, 5), x3, r2);
#endif
}
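A note on the construction above (a reading of the code, not from the commit message): the variable shift is built from conditional shift-by-1/2/4 stages, each stage selected per byte by one bit of b moved into the sign position with gv_shl64 so that gv_signselect8 acts as the mux. The same decomposition in scalar form:

#include <cstdint>

// Barrel-shifter style decomposition: enable the 1/2/4 stages from the
// low three bits of the amount; the result equals a << (b & 7).
inline uint8_t shl8_by_stages(uint8_t a, uint8_t b)
{
    uint8_t r = a;
    if (b & 1) r = uint8_t(r << 1);
    if (b & 2) r = uint8_t(r << 2);
    if (b & 4) r = uint8_t(r << 4);
    return r;
}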
// For each 16-bit element, r = a << (b & 15)
inline v128 gv_shl16(const v128& a, const v128& b)
{
#if defined(__AVX512VL__) && defined(__AVX512BW__)
return _mm_sllv_epi16(a, _mm_and_si128(b, _mm_set1_epi16(15)));
#elif defined(ARCH_ARM64)
return vshlq_u16(a, vandq_s16(b, gv_bcst8(15)));
#else
v128 r;
for (u32 i = 0; i < 8; i++)
r._u16[i] = a._u16[i] << (b._u16[i] & 15);
return r;
#endif
}
// For each 32-bit element, r = a << (b & 31)
inline v128 gv_shl32(const v128& a, const v128& b)
{
#if defined(__AVX2__)
return _mm_sllv_epi32(a, _mm_and_si128(b, _mm_set1_epi32(31)));
#elif defined(ARCH_ARM64)
return vshlq_u32(a, vandq_s32(b, gv_bcst8(31)));
#else
v128 r;
for (u32 i = 0; i < 4; i++)
r._u32[i] = a._u32[i] << (b._u32[i] & 31);
return r;
#endif
}
// For each unsigned 8-bit element, r = a >> (b & 7)
inline v128 gv_shr8(const v128& a, const v128& b)
{
#if defined(ARCH_ARM64)
return vshlq_u8(a, vnegq_s8(vandq_s8(b, gv_bcst8(7))));
#else
const v128 x1 = gv_and32(gv_shr64(a, 1), gv_bcst8(0x7f)); // shift right by 1
const v128 r1 = gv_signselect8(gv_shl64(b, 7), x1, a);
const v128 x2 = gv_and32(gv_shr64(r1, 2), gv_bcst8(0x3f)); // shift by 2
const v128 r2 = gv_signselect8(gv_shl64(b, 6), x2, r1);
const v128 x3 = gv_and32(gv_shr64(r2, 4), gv_bcst8(0x0f)); // shift by 4
return gv_signselect8(gv_shl64(b, 5), x3, r2);
#endif
}
// For each unsigned 16-bit element, r = a >> (b & 15)
inline v128 gv_shr16(const v128& a, const v128& b)
{
#if defined(__AVX512VL__) && defined(__AVX512BW__)
return _mm_srlv_epi16(a, _mm_and_si128(b, _mm_set1_epi16(15)));
#elif defined(ARCH_ARM64)
return vshlq_u16(a, vnegq_s16(vandq_s16(b, gv_bcst8(15))));
#else
v128 r;
for (u32 i = 0; i < 8; i++)
r._u16[i] = a._u16[i] >> (b._u16[i] & 15);
return r;
#endif
}
// For each unsigned 32-bit element, r = a >> (b & 31)
inline v128 gv_shr32(const v128& a, const v128& b)
{
#if defined(__AVX2__)
return _mm_srlv_epi32(a, _mm_and_si128(b, _mm_set1_epi32(31)));
#elif defined(ARCH_ARM64)
return vshlq_u32(a, vnegq_s32(vandq_s32(b, gv_bcst8(31))));
#else
v128 r;
for (u32 i = 0; i < 4; i++)
r._u32[i] = a._u32[i] >> (b._u32[i] & 31);
return r;
#endif
}
// For each signed 8-bit element, r = a >> (b & 7)
inline v128 gv_sar8(const v128& a, const v128& b)
{
#if defined(ARCH_ARM64)
return vshlq_s8(a, vnegq_s8(vandq_s8(b, gv_bcst8(7))));
#else
v128 r;
for (u32 i = 0; i < 16; i++)
r._s8[i] = a._s8[i] >> (b._s8[i] & 7);
return r;
#endif
}
// For each signed 16-bit element, r = a >> (b & 15)
inline v128 gv_sar16(const v128& a, const v128& b)
{
#if defined(__AVX512VL__) && defined(__AVX512BW__)
return _mm_srav_epi16(a, _mm_and_si128(b, _mm_set1_epi16(15)));
#elif defined(ARCH_ARM64)
return vshlq_s16(a, vnegq_s16(vandq_s16(b, gv_bcst8(15))));
#else
v128 r;
for (u32 i = 0; i < 8; i++)
r._s16[i] = a._s16[i] >> (b._s16[i] & 15);
return r;
#endif
}
// For each signed 32-bit element, r = a >> (b & 31)
inline v128 gv_sar32(const v128& a, const v128& b)
{
#if defined(__AVX2__)
return _mm_srav_epi32(a, _mm_and_si128(b, _mm_set1_epi32(31)));
#elif defined(ARCH_ARM64)
return vshlq_s32(a, vnegq_s32(vandq_s32(b, gv_bcst8(31))));
#else
v128 r;
for (u32 i = 0; i < 4; i++)
r._s32[i] = a._s32[i] >> (b._s32[i] & 31);
return r;
#endif
}
// For each 8-bit element, r = rotate a by b
inline v128 gv_rol8(const v128& a, const v128& b)
{
#if defined(ARCH_ARM64)
const auto amt1 = vandq_s8(b, gv_bcst8(7));
const auto amt2 = vsubq_s8(amt1, gv_bcst8(8));
return vorrq_u8(vshlq_u8(a, amt1), vshlq_u8(a, amt2));
#else
const v128 x1 = gv_sub8(gv_add8(a, a), gv_gts8(gv_bcst8(0), a)); // rotate left by 1
const v128 r1 = gv_signselect8(gv_shl64(b, 7), x1, a);
const v128 c2 = gv_bcst8(0x3);
const v128 x2 = gv_or32(gv_and32(gv_shr64(r1, 6), c2), gv_andn32(c2, gv_shl64(r1, 2))); // rotate by 2
const v128 r2 = gv_signselect8(gv_shl64(b, 6), x2, r1);
const v128 c3 = gv_bcst8(0xf);
const v128 x3 = gv_or32(gv_and32(gv_shr64(r2, 4), c3), gv_andn32(c3, gv_shl64(r2, 4))); // rotate by 4
return gv_signselect8(gv_shl64(b, 5), x3, r2);
#endif
}
// For each 16-bit element, r = rotate a by b
inline v128 gv_rol16(const v128& a, const v128& b)
{
#if defined(ARCH_ARM64)
const auto amt1 = vandq_s16(b, gv_bcst16(15));
const auto amt2 = vsubq_s16(amt1, gv_bcst16(16));
return vorrq_u16(vshlq_u16(a, amt1), vshlq_u16(a, amt2));
#else
v128 r;
for (u32 i = 0; i < 8; i++)
r._u16[i] = utils::rol16(a._u16[i], b._u16[i]);
return r;
#endif
}
// For each 32-bit element, r = rotate a by b
inline v128 gv_rol32(const v128& a, const v128& b)
{
#if defined(__AVX512VL__)
return _mm_rolv_epi32(a, b);
#elif defined(ARCH_ARM64)
const auto amt1 = vandq_s32(b, gv_bcst32(31));
const auto amt2 = vsubq_s32(amt1, gv_bcst32(32));
return vorrq_u32(vshlq_u32(a, amt1), vshlq_u32(a, amt2));
#else
v128 r;
for (u32 i = 0; i < 4; i++)
r._u32[i] = utils::rol32(a._u32[i], b._u32[i]);
return r;
#endif
}
// For each 8-bit element, r = (a << (c & 7)) | (b >> (~c & 7) >> 1)
template <typename A, typename B, typename C>
inline auto gv_fshl8(A&& a, B&& b, C&& c)
{
#if defined(ARCH_ARM64)
const auto amt1 = vandq_s8(c, gv_bcst8(7));
const auto amt2 = vsubq_s8(amt1, gv_bcst8(8));
return v128(vorrq_u8(vshlq_u8(a, amt1), vshlq_u8(b, amt2)));
#else
auto x1 = gv_sub8(gv_add8(a, a), gv_gts8(gv_bcst8(0), b));
auto s1 = gv_shl64(c, 7);
auto r1 = gv_signselect8(s1, std::move(x1), std::forward<A>(a));
auto b1 = gv_signselect8(std::move(s1), gv_shl64(b, 1), std::forward<B>(b));
auto c2 = gv_bcst8(0x3);
auto x2 = gv_and32(gv_shr64(b1, 6), c2); x2 = gv_or32(std::move(x2), gv_andn32(std::move(c2), gv_shl64(r1, 2)));
auto s2 = gv_shl64(c, 6);
auto r2 = gv_signselect8(s2, std::move(x2), std::move(r1));
auto b2 = gv_signselect8(std::move(s2), gv_shl64(b1, 2), std::move(b1));
auto c3 = gv_bcst8(0xf);
auto x3 = gv_and32(gv_shr64(std::move(b2), 4), c3); x3 = gv_or32(std::move(x3), gv_andn32(std::move(c3), gv_shl64(r2, 4)));
return gv_signselect8(gv_shl64(std::move(c), 5), std::move(x3), std::move(r2));
#endif
}
// For each 8-bit element, r = (b >> (c & 7)) | (a << (~c & 7) << 1)
template <typename A, typename B, typename C>
inline auto gv_fshr8(A&& a, B&& b, C&& c)
{
#if defined(ARCH_ARM64)
const auto amt1 = vandq_s8(c, gv_bcst8(7));
const auto amt2 = vsubq_s8(gv_bcst8(8), amt1);
return vorrq_u8(vshlq_u8(b, vnegq_s8(amt1)), vshlq_u8(a, amt2));
#else
auto c1 = gv_bcst8(0x7f);
auto x1 = gv_and32(gv_shr64(b, 1), c1); x1 = gv_or32(std::move(x1), gv_andn32(std::move(c1), gv_shl64(a, 7)));
auto s1 = gv_shl64(c, 7);
auto r1 = gv_signselect8(s1, std::move(x1), std::move(b));
auto a1 = gv_signselect8(std::move(s1), gv_shr64(a, 1), std::move(a));
auto c2 = gv_bcst8(0x3f);
auto x2 = gv_and32(gv_shr64(r1, 2), c2); x2 = gv_or32(std::move(x2), gv_andn32(std::move(c2), gv_shl64(a1, 6)));
auto s2 = gv_shl64(c, 6);
auto r2 = gv_signselect8(s2, std::move(x2), std::move(r1));
auto a2 = gv_signselect8(std::move(s2), gv_shr64(a1, 2), std::move(a1));
auto c3 = gv_bcst8(0x0f);
auto x3 = gv_and32(gv_shr64(r2, 4), c3); x3 = gv_or32(std::move(x3), gv_andn32(std::move(c3), gv_shl64(std::move(a2), 4)));
return gv_signselect8(gv_shl64(std::move(c), 5), std::move(x3), std::move(r2));
#endif
}
// Shift left by byte amount
template <u32 Count>
inline v128 gv_shuffle_left(const v128& a)
{
if (Count > 15)
return {};
#if defined(ARCH_X64)
return _mm_slli_si128(a, Count);
#elif defined(ARCH_ARM64)
v128 idx;
for (u32 i = 0; i < 16; i++)
idx._u8[i] = u8(i - Count);
return vqtbl1q_u8(a, idx);
#endif
}
template <u32 Count, typename A> requires (asmjit::any_operand_v<A>)
inline auto gv_shuffle_left(A&& a)
{
FOR_X64(unary_op, kIdPslldq, kIdVpslldq, std::forward<A>(a), Count);
}
// Shift right by byte amount
template <u32 Count>
inline v128 gv_shuffle_right(const v128& a)
{
if (Count > 15)
return {};
#if defined(ARCH_X64)
return _mm_srli_si128(a, Count);
#elif defined(ARCH_ARM64)
v128 idx;
for (u32 i = 0; i < 16; i++)
idx._u8[i] = u8(i + Count);
return vqtbl1q_u8(a, idx);
#endif
}
template <u32 Count, typename A> requires (asmjit::any_operand_v<A>)
inline auto gv_shuffle_right(A&& a)
{
FOR_X64(unary_op, kIdPsrldq, kIdVpsrldq, std::forward<A>(a), Count);
}