PPU: refactor shift and splat instructions

Fix utils::rol32/64 functions. Fix immediate clamping in splat instructions. Other fixes.
2024-11-22 02:32:36 +01:00 · 2022-01-19 02:41:32 +03:00 · 2022-01-19 02:41:32 +03:00 · b42fae0989
commit b42fae0989
parent d92008abe4
4 changed files with 784 additions and 382 deletions
--- a/Utilities/JIT.h
+++ b/Utilities/JIT.h
@ -247,10 +247,16 @@ inline FT build_function_asm(std::string_view name, F&& builder)
 	Asm compiler(&code);
 	compiler.addEncodingOptions(EncodingOptions::kOptimizedAlign);
-	if constexpr (std::is_invocable_v<F, Asm&, native_args&>)
+	if constexpr (std::is_invocable_r_v<bool, F, Asm&, native_args&>)
-		builder(compiler, args);
+	{
 		if (!builder(compiler, args))
 			return nullptr;
 	}
 	else
-		builder(compiler);
+	{
 		builder(compiler, args);
 	}
 	rt.dump_name = name;
 	const auto result = rt._add(&code);
 	jit_announce(result, code.codeSize(), name);
--- a/rpcs3/Emu/Cell/PPUInterpreter.cpp
+++ b/rpcs3/Emu/Cell/PPUInterpreter.cpp
@ -111,13 +111,15 @@ struct ppu_exec_select
 #define RETURN(...) \
 	if constexpr (Build == 0) { \
 		static_cast<void>(exec); \
-		static const ppu_intrp_func_t f = build_function_asm<ppu_intrp_func_t, asmjit::ppu_builder>("ppu_"s + __func__, [&](asmjit::ppu_builder& c) { \
+		static const ppu_intrp_func_t f = build_function_asm<ppu_intrp_func_t, asmjit::ppu_builder>("ppu_"s + __func__, [&](asmjit::ppu_builder& c, native_args&) { \
 			static ppu_opcode_t op{}; \
 			static ppu_abstract_t ppu; \
 			exec(__VA_ARGS__); \
 			c.ppu_ret(); \
 			return !c.fail_flag; \
 		}); \
-		return f; \
+		if (f) return f; \
 		RETURN_(__VA_ARGS__); \
 	}
 #else
 #define RETURN RETURN_
@ -1019,7 +1021,7 @@ auto VADDUWS()
 		}
 	};
-	RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb], ppu.sat);
+	RETURN(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb], ppu.sat);
 }
 template <u32 Build, ppu_exec_bit... Flags>
@ -2074,7 +2076,7 @@ auto VNOR()
 		d = gv_notfs(gv_orfs(std::move(a), std::move(b)));
 	};
-	RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]);
+	RETURN(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]);
 }
 template <u32 Build, ppu_exec_bit... Flags>
@ -2100,7 +2102,7 @@ auto VPERM()
 #if defined (ARCH_X64)
 	if constexpr (Build == 0)
 	{
-		static const ppu_intrp_func_t f = build_function_asm<ppu_intrp_func_t, asmjit::ppu_builder>("ppu_VPERM", [&](asmjit::ppu_builder& c)
+		static const ppu_intrp_func_t f = build_function_asm<ppu_intrp_func_t, asmjit::ppu_builder>("ppu_VPERM", [&](asmjit::ppu_builder& c, native_args&)
 		{
 			const auto [v0, v1, v2, v3] = c.vec_alloc<4>();
 			c.movdqa(v0, c.ppu_vr(s_op.vc));
@ -2374,17 +2376,12 @@ auto VRLB()
 	if constexpr (Build == 0xf1a6)
 		return ppu_exec_select<Flags...>::template select<>();
-	static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) {
+	static const auto exec = [](auto&& d, auto&& a, auto&& b)
 	auto& d = ppu.vr[op.vd];
 	const auto& a = ppu.vr[op.va];
 	const auto& b = ppu.vr[op.vb];
 	for (uint i = 0; i < 16; i++)
 	{
-		d._u8[i] = utils::rol8(a._u8[i], b._u8[i]);
+		d = gv_rol8(std::move(a), std::move(b));
 	}
 	};
-	RETURN_(ppu, op);
+
 	RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]);
 }
 template <u32 Build, ppu_exec_bit... Flags>
@ -2393,17 +2390,12 @@ auto VRLH()
 	if constexpr (Build == 0xf1a6)
 		return ppu_exec_select<Flags...>::template select<>();
-	static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) {
+	static const auto exec = [](auto&& d, auto&& a, auto&& b)
 	auto& d = ppu.vr[op.vd];
 	const auto& a = ppu.vr[op.va];
 	const auto& b = ppu.vr[op.vb];
 	for (uint i = 0; i < 8; i++)
 	{
-		d._u16[i] = utils::rol16(a._u16[i], b._u8[i * 2] & 0xf);
+		d = gv_rol16(std::move(a), std::move(b));
 	}
 	};
-	RETURN_(ppu, op);
+
 	RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]);
 }
 template <u32 Build, ppu_exec_bit... Flags>
@ -2412,17 +2404,12 @@ auto VRLW()
 	if constexpr (Build == 0xf1a6)
 		return ppu_exec_select<Flags...>::template select<>();
-	static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) {
+	static const auto exec = [](auto&& d, auto&& a, auto&& b)
 	auto& d = ppu.vr[op.vd];
 	const auto& a = ppu.vr[op.va];
 	const auto& b = ppu.vr[op.vb];
 	for (uint w = 0; w < 4; w++)
 	{
-		d._u32[w] = utils::rol32(a._u32[w], b._u8[w * 4] & 0x1f);
+		d = gv_rol32(std::move(a), std::move(b));
 	}
 	};
-	RETURN_(ppu, op);
+
 	RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]);
 }
 template <u32 Build, ppu_exec_bit... Flags>
@ -2447,15 +2434,13 @@ auto VSEL()
 	if constexpr (Build == 0xf1a6)
 		return ppu_exec_select<Flags...>::template select<>();
-	static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) {
+	static const auto exec = [](auto&& d, auto&& a, auto&& b, auto&& c)
-	auto& d = ppu.vr[op.vd];
+	{
-	const auto& a = ppu.vr[op.va];
+		auto x = gv_andfs(std::move(b), c);
-	const auto& b = ppu.vr[op.vb];
+		d = gv_orfs(std::move(x), gv_andnfs(std::move(c), std::move(a)));
 	const auto& c = ppu.vr[op.vc];
 	d = (b & c) | gv_andn(c, a);
 	};
-	RETURN_(ppu, op);
+
 	RETURN(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb], ppu.vr[op.vc]);
 }
 template <u32 Build, ppu_exec_bit... Flags>
@ -2464,19 +2449,12 @@ auto VSL()
 	if constexpr (Build == 0xf1a6)
 		return ppu_exec_select<Flags...>::template select<>();
-	static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) {
+	static const auto exec = [](auto&& d, auto&& a, auto&& b)
 	auto& d = ppu.vr[op.vd];
 	v128 VA = ppu.vr[op.va];
 	u8 sh = ppu.vr[op.vb]._u8[0] & 0x7;
 	d._u8[0] = VA._u8[0] << sh;
 	for (uint b = 1; b < 16; b++)
 	{
-		sh = ppu.vr[op.vb]._u8[b] & 0x7;
+		d = gv_fshl8(std::move(a), gv_shuffle_left<1>(a), std::move(b));
 		d._u8[b] = (VA._u8[b] << sh) | (VA._u8[b - 1] >> (8 - sh));
 	}
 	};
-	RETURN_(ppu, op);
+
 	RETURN(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]);
 }
 template <u32 Build, ppu_exec_bit... Flags>
@ -2485,38 +2463,35 @@ auto VSLB()
 	if constexpr (Build == 0xf1a6)
 		return ppu_exec_select<Flags...>::template select<>();
-	static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) {
+	static const auto exec = [](auto&& d, auto&& a, auto&& b)
 	auto& d = ppu.vr[op.vd];
 	const auto& a = ppu.vr[op.va];
 	const auto& b = ppu.vr[op.vb];
 	for (uint i = 0; i < 16; i++)
 	{
-		d._u8[i] = a._u8[i] << (b._u8[i] & 0x7);
+		d = gv_shl8(std::move(a), std::move(b));
 	}
 	};
-	RETURN_(ppu, op);
+
 	RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]);
 }
-template <u32 Build, ppu_exec_bit... Flags>
+template <u32 Count>
-auto VSLDOI()
+struct VSLDOI
 {
-	if constexpr (Build == 0xf1a6)
+	template <ppu_exec_bit... Flags>
-		return ppu_exec_select<Flags...>::template select<>();
+	static auto select(bs_t<ppu_exec_bit> selected, auto func)
 	static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) {
 	auto& d = ppu.vr[op.vd];
 	u8 tmpSRC[32];
 	std::memcpy(tmpSRC, &ppu.vr[op.vb], 16);
 	std::memcpy(tmpSRC + 16, &ppu.vr[op.va], 16);
 	for (uint b = 0; b<16; b++)
 	{
-		d._u8[15 - b] = tmpSRC[31 - (b + op.vsh)];
+		return ppu_exec_select<>::select<Flags...>(selected, func);
 	}
-	};
+
-	RETURN_(ppu, op);
+	template <u32 Build, ppu_exec_bit... Flags>
-}
+	static auto impl()
 	{
 		static const auto exec = [](auto&& d, auto&& a, auto&& b)
 		{
 			d = gv_or32(gv_shuffle_left<Count>(std::move(a)), gv_shuffle_right<16 - Count>(std::move(b)));
 		};
 		RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]);
 	}
 };
 template <u32 Build, ppu_exec_bit... Flags>
 auto VSLH()
@ -2524,17 +2499,12 @@ auto VSLH()
 	if constexpr (Build == 0xf1a6)
 		return ppu_exec_select<Flags...>::template select<>();
-	static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) {
+	static const auto exec = [](auto&& d, auto&& a, auto&& b)
 	auto& d = ppu.vr[op.vd];
 	const auto& a = ppu.vr[op.va];
 	const auto& b = ppu.vr[op.vb];
 	for (uint h = 0; h < 8; h++)
 	{
-		d._u16[h] = a._u16[h] << (b._u16[h] & 0xf);
+		d = gv_shl16(std::move(a), std::move(b));
 	}
 	};
-	RETURN_(ppu, op);
+
 	RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]);
 }
 template <u32 Build, ppu_exec_bit... Flags>
@ -2543,19 +2513,12 @@ auto VSLO()
 	if constexpr (Build == 0xf1a6)
 		return ppu_exec_select<Flags...>::template select<>();
-	static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) {
+	static const auto exec = [](auto&& d, auto&& a, auto&& b)
 	auto& d = ppu.vr[op.vd];
 	v128 VA = ppu.vr[op.va];
 	u8 nShift = (ppu.vr[op.vb]._u8[0] >> 3) & 0xf;
 	d.clear();
 	for (u8 b = 0; b < 16 - nShift; b++)
 	{
-		d._u8[15 - b] = VA._u8[15 - (b + nShift)];
+		d._u = a._u << (b._u8[0] & 0x78);
 	}
 	};
-	RETURN_(ppu, op);
+
 	RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]);
 }
 template <u32 Build, ppu_exec_bit... Flags>
@ -2564,17 +2527,12 @@ auto VSLW()
 	if constexpr (Build == 0xf1a6)
 		return ppu_exec_select<Flags...>::template select<>();
-	static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) {
+	static const auto exec = [](auto&& d, auto&& a, auto&& b)
 	auto& d = ppu.vr[op.vd];
 	const auto& a = ppu.vr[op.va];
 	const auto& b = ppu.vr[op.vb];
 	for (uint w = 0; w < 4; w++)
 	{
-		d._u32[w] = a._u32[w] << (b._u32[w] & 0x1f);
+		d = gv_shl32(std::move(a), std::move(b));
 	}
 	};
-	RETURN_(ppu, op);
+
 	RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]);
 }
 template <u32 Build, ppu_exec_bit... Flags>
@ -2583,16 +2541,12 @@ auto VSPLTB()
 	if constexpr (Build == 0xf1a6)
 		return ppu_exec_select<Flags...>::template select<>();
-	static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) {
+	static const auto exec = [](auto&& d, auto&& b, auto&& imm)
 	auto& d = ppu.vr[op.vd];
 	u8 byte = ppu.vr[op.vb]._u8[15 - op.vuimm];
 	for (uint b = 0; b < 16; b++)
 	{
-		d._u8[b] = byte;
+		d = gv_bcst8(b.u8r[imm & 15]);
 	}
 	};
-	RETURN_(ppu, op);
+
 	RETURN_(ppu.vr[op.vd], ppu.vr[op.vb], op.vuimm);
 }
 template <u32 Build, ppu_exec_bit... Flags>
@ -2601,18 +2555,12 @@ auto VSPLTH()
 	if constexpr (Build == 0xf1a6)
 		return ppu_exec_select<Flags...>::template select<>();
-	static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) {
+	static const auto exec = [](auto&& d, auto&& b, auto&& imm)
 	auto& d = ppu.vr[op.vd];
 	ensure((op.vuimm < 8));
 	u16 hword = ppu.vr[op.vb]._u16[7 - op.vuimm];
 	for (uint h = 0; h < 8; h++)
 	{
-		d._u16[h] = hword;
+		d = gv_bcst16(b.u16r[imm & 7]);
 	}
 	};
-	RETURN_(ppu, op);
+
 	RETURN_(ppu.vr[op.vd], ppu.vr[op.vb], op.vuimm);
 }
 template <u32 Build, ppu_exec_bit... Flags>
@ -2621,16 +2569,12 @@ auto VSPLTISB()
 	if constexpr (Build == 0xf1a6)
 		return ppu_exec_select<Flags...>::template select<>();
-	static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) {
+	static const auto exec = [](auto&& d, auto&& imm)
 	auto& d = ppu.vr[op.vd];
 	const s8 imm = op.vsimm;
 	for (uint b = 0; b < 16; b++)
 	{
-		d._u8[b] = imm;
+		d = gv_bcst8(imm);
 	}
 	};
-	RETURN_(ppu, op);
+
 	RETURN_(ppu.vr[op.vd], op.vsimm);
 }
 template <u32 Build, ppu_exec_bit... Flags>
@ -2639,16 +2583,12 @@ auto VSPLTISH()
 	if constexpr (Build == 0xf1a6)
 		return ppu_exec_select<Flags...>::template select<>();
-	static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) {
+	static const auto exec = [](auto&& d, auto&& imm)
 	auto& d = ppu.vr[op.vd];
 	const s16 imm = op.vsimm;
 	for (uint h = 0; h < 8; h++)
 	{
-		d._u16[h] = imm;
+		d = gv_bcst16(imm);
 	}
 	};
-	RETURN_(ppu, op);
+
 	RETURN_(ppu.vr[op.vd], op.vsimm);
 }
 template <u32 Build, ppu_exec_bit... Flags>
@ -2657,16 +2597,12 @@ auto VSPLTISW()
 	if constexpr (Build == 0xf1a6)
 		return ppu_exec_select<Flags...>::template select<>();
-	static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) {
+	static const auto exec = [](auto&& d, auto&& imm)
 	auto& d = ppu.vr[op.vd];
 	const s32 imm = op.vsimm;
 	for (uint w = 0; w < 4; w++)
 	{
-		d._u32[w] = imm;
+		d = gv_bcst32(imm);
 	}
 	};
-	RETURN_(ppu, op);
+
 	RETURN_(ppu.vr[op.vd], op.vsimm);
 }
 template <u32 Build, ppu_exec_bit... Flags>
@ -2675,18 +2611,12 @@ auto VSPLTW()
 	if constexpr (Build == 0xf1a6)
 		return ppu_exec_select<Flags...>::template select<>();
-	static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) {
+	static const auto exec = [](auto&& d, auto&& b, auto&& imm)
 	auto& d = ppu.vr[op.vd];
 	ensure((op.vuimm < 4));
 	u32 word = ppu.vr[op.vb]._u32[3 - op.vuimm];
 	for (uint w = 0; w < 4; w++)
 	{
-		d._u32[w] = word;
+		d = gv_bcst32(b.u32r[imm & 3]);
 	}
 	};
-	RETURN_(ppu, op);
+
 	RETURN_(ppu.vr[op.vd], ppu.vr[op.vb], op.vuimm);
 }
 template <u32 Build, ppu_exec_bit... Flags>
@ -2695,19 +2625,12 @@ auto VSR()
 	if constexpr (Build == 0xf1a6)
 		return ppu_exec_select<Flags...>::template select<>();
-	static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) {
+	static const auto exec = [](auto&& d, auto&& a, auto&& b)
 	auto& d = ppu.vr[op.vd];
 	v128 VA = ppu.vr[op.va];
 	u8 sh = ppu.vr[op.vb]._u8[15] & 0x7;
 	d._u8[15] = VA._u8[15] >> sh;
 	for (uint b = 14; ~b; b--)
 	{
-		sh = ppu.vr[op.vb]._u8[b] & 0x7;
+		d = gv_fshr8(gv_shuffle_right<1>(a), std::move(a), std::move(b));
 		d._u8[b] = (VA._u8[b] >> sh) | (VA._u8[b + 1] << (8 - sh));
 	}
 	};
-	RETURN_(ppu, op);
+
 	RETURN(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]);
 }
 template <u32 Build, ppu_exec_bit... Flags>
@ -2716,17 +2639,12 @@ auto VSRAB()
 	if constexpr (Build == 0xf1a6)
 		return ppu_exec_select<Flags...>::template select<>();
-	static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) {
+	static const auto exec = [](auto&& d, auto&& a, auto&& b)
 	auto& d = ppu.vr[op.vd];
 	const auto& a = ppu.vr[op.va];
 	const auto& b = ppu.vr[op.vb];
 	for (uint i = 0; i < 16; i++)
 	{
-		d._s8[i] = a._s8[i] >> (b._u8[i] & 0x7);
+		d = gv_sar8(std::move(a), std::move(b));
 	}
 	};
-	RETURN_(ppu, op);
+
 	RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]);
 }
 template <u32 Build, ppu_exec_bit... Flags>
@ -2735,17 +2653,12 @@ auto VSRAH()
 	if constexpr (Build == 0xf1a6)
 		return ppu_exec_select<Flags...>::template select<>();
-	static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) {
+	static const auto exec = [](auto&& d, auto&& a, auto&& b)
 	auto& d = ppu.vr[op.vd];
 	const auto& a = ppu.vr[op.va];
 	const auto& b = ppu.vr[op.vb];
 	for (uint h = 0; h < 8; h++)
 	{
-		d._s16[h] = a._s16[h] >> (b._u16[h] & 0xf);
+		d = gv_sar16(std::move(a), std::move(b));
 	}
 	};
-	RETURN_(ppu, op);
+
 	RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]);
 }
 template <u32 Build, ppu_exec_bit... Flags>
@ -2754,17 +2667,12 @@ auto VSRAW()
 	if constexpr (Build == 0xf1a6)
 		return ppu_exec_select<Flags...>::template select<>();
-	static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) {
+	static const auto exec = [](auto&& d, auto&& a, auto&& b)
 	auto& d = ppu.vr[op.vd];
 	const auto& a = ppu.vr[op.va];
 	const auto& b = ppu.vr[op.vb];
 	for (uint w = 0; w < 4; w++)
 	{
-		d._s32[w] = a._s32[w] >> (b._u32[w] & 0x1f);
+		d = gv_sar32(std::move(a), std::move(b));
 	}
 	};
-	RETURN_(ppu, op);
+
 	RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]);
 }
 template <u32 Build, ppu_exec_bit... Flags>
@ -2773,17 +2681,12 @@ auto VSRB()
 	if constexpr (Build == 0xf1a6)
 		return ppu_exec_select<Flags...>::template select<>();
-	static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) {
+	static const auto exec = [](auto&& d, auto&& a, auto&& b)
 	auto& d = ppu.vr[op.vd];
 	const auto& a = ppu.vr[op.va];
 	const auto& b = ppu.vr[op.vb];
 	for (uint i = 0; i < 16; i++)
 	{
-		d._u8[i] = a._u8[i] >> (b._u8[i] & 0x7);
+		d = gv_shr8(std::move(a), std::move(b));
 	}
 	};
-	RETURN_(ppu, op);
+
 	RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]);
 }
 template <u32 Build, ppu_exec_bit... Flags>
@ -2792,17 +2695,12 @@ auto VSRH()
 	if constexpr (Build == 0xf1a6)
 		return ppu_exec_select<Flags...>::template select<>();
-	static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) {
+	static const auto exec = [](auto&& d, auto&& a, auto&& b)
 	auto& d = ppu.vr[op.vd];
 	const auto& a = ppu.vr[op.va];
 	const auto& b = ppu.vr[op.vb];
 	for (uint h = 0; h < 8; h++)
 	{
-		d._u16[h] = a._u16[h] >> (b._u16[h] & 0xf);
+		d = gv_shr16(std::move(a), std::move(b));
 	}
 	};
-	RETURN_(ppu, op);
+
 	RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]);
 }
 template <u32 Build, ppu_exec_bit... Flags>
@ -2811,19 +2709,12 @@ auto VSRO()
 	if constexpr (Build == 0xf1a6)
 		return ppu_exec_select<Flags...>::template select<>();
-	static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) {
+	static const auto exec = [](auto&& d, auto&& a, auto&& b)
 	auto& d = ppu.vr[op.vd];
 	v128 VA = ppu.vr[op.va];
 	u8 nShift = (ppu.vr[op.vb]._u8[0] >> 3) & 0xf;
 	d.clear();
 	for (u8 b = 0; b < 16 - nShift; b++)
 	{
-		d._u8[b] = VA._u8[b + nShift];
+		d._u = a._u >> (b._u8[0] & 0x78);
 	}
 	};
-	RETURN_(ppu, op);
+
 	RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]);
 }
 template <u32 Build, ppu_exec_bit... Flags>
@ -2832,17 +2723,12 @@ auto VSRW()
 	if constexpr (Build == 0xf1a6)
 		return ppu_exec_select<Flags...>::template select<>();
-	static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) {
+	static const auto exec = [](auto&& d, auto&& a, auto&& b)
 	auto& d = ppu.vr[op.vd];
 	const auto& a = ppu.vr[op.va];
 	const auto& b = ppu.vr[op.vb];
 	for (uint w = 0; w < 4; w++)
 	{
-		d._u32[w] = a._u32[w] >> (b._u32[w] & 0x1f);
+		d = gv_shr32(std::move(a), std::move(b));
 	}
 	};
-	RETURN_(ppu, op);
+
 	RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]);
 }
 template <u32 Build, ppu_exec_bit... Flags>
@ -3184,30 +3070,14 @@ auto VUPKHPX()
 	if constexpr (Build == 0xf1a6)
 		return ppu_exec_select<Flags...>::template select<>();
 #if defined(ARCH_X64_0)
 	static const auto make = [](asmjit::ppu_builder& c)
 	{
 		const auto [v0, v1, v2] = c.vec_alloc<3>();
 		EMIT(punpckhwd, v0, v0, c.ppu_vr(s_op.vb));
 		EMIT(psrad, v0, v0, c.imm(16));
 		EMIT(pslld, v1, v0, c.imm(6));
 		EMIT(pslld, v2, v0, c.imm(3));
 		BCST(pand, d, v0, v0, c.get_bcst<u32>(0xff00001f));
 		BCST(pand, d, v1, v1, c.get_bcst<u32>(0x1f0000));
 		BCST(pand, d, v2, v2, c.get_bcst<u32>(0x1f00));
 		EMIT(por, v0, v0, v1);
 		EMIT(por, v0, v0, v2);
 		LDST(movaps, c.ppu_vr(s_op.vd, true), v0);
 		c.ppu_ret();
 	};
 #endif
 	static const auto exec = [](auto&& d, auto&& b)
 	{
-		const auto x = gv_extend_hi_s16(b);
+		auto x = gv_extend_hi_s16(std::move(b));
-		d = gv_and32(x, gv_bcst32(0xff00001f)) | gv_and32(gv_shl32(x, 6), gv_bcst32(0x1f0000)) | gv_and32(gv_shl32(x, 3), gv_bcst32(0x1f00));
+		auto y = gv_or32(gv_and32(gv_shl32(x, 6), gv_bcst32(0x1f0000)), gv_and32(gv_shl32(x, 3), gv_bcst32(0x1f00)));
 		d = gv_or32(std::move(y), gv_and32(std::move(x), gv_bcst32(0xff00001f)));
 	};
-	RETURN_(ppu.vr[op.vd], ppu.vr[op.vb]);
+	RETURN(ppu.vr[op.vd], ppu.vr[op.vb]);
 }
 template <u32 Build, ppu_exec_bit... Flags>
@ -3216,22 +3086,12 @@ auto VUPKHSB()
 	if constexpr (Build == 0xf1a6)
 		return ppu_exec_select<Flags...>::template select<>();
 #if defined(ARCH_X64_0)
 	static const auto make = [](asmjit::ppu_builder& c)
 	{
 		const auto v0 = c.vec_alloc();
 		EMIT(punpckhbw, v0, v0, c.ppu_vr(s_op.vb));
 		EMIT(psraw, v0, v0, c.imm(8));
 		LDST(movaps, c.ppu_vr(s_op.vd, true), v0);
 		c.ppu_ret();
 	};
 #endif
 	static const auto exec = [](auto&& d, auto&& b)
 	{
-		d = gv_extend_hi_s8(b);
+		d = gv_extend_hi_s8(std::move(b));
 	};
-	RETURN_(ppu.vr[op.vd], ppu.vr[op.vb]);
+	RETURN(ppu.vr[op.vd], ppu.vr[op.vb]);
 }
 template <u32 Build, ppu_exec_bit... Flags>
@ -3240,22 +3100,12 @@ auto VUPKHSH()
 	if constexpr (Build == 0xf1a6)
 		return ppu_exec_select<Flags...>::template select<>();
 #if defined(ARCH_X64_0)
 	static const auto make = [](asmjit::ppu_builder& c)
 	{
 		const auto v0 = c.vec_alloc();
 		EMIT(punpckhwd, v0, v0, c.ppu_vr(s_op.vb));
 		EMIT(psrad, v0, v0, c.imm(16));
 		LDST(movaps, c.ppu_vr(s_op.vd, true), v0);
 		c.ppu_ret();
 	};
 #endif
 	static const auto exec = [](auto&& d, auto&& b)
 	{
-		d = gv_extend_hi_s16(b);
+		d = gv_extend_hi_s16(std::move(b));
 	};
-	RETURN_(ppu.vr[op.vd], ppu.vr[op.vb]);
+	RETURN(ppu.vr[op.vd], ppu.vr[op.vb]);
 }
 template <u32 Build, ppu_exec_bit... Flags>
@ -3264,37 +3114,14 @@ auto VUPKLPX()
 	if constexpr (Build == 0xf1a6)
 		return ppu_exec_select<Flags...>::template select<>();
 #if defined(ARCH_X64_0)
 	static const auto make = [](asmjit::ppu_builder& c)
 	{
 		const auto [v0, v1, v2] = c.vec_alloc<3>();
 		if (utils::has_sse41())
 		{
 			LDST(pmovsxwd, v0, c.ppu_vr<8>(s_op.vb));
 		}
 		else
 		{
 			EMIT(punpcklwd, v0, v0, c.ppu_vr(s_op.vb));
 			EMIT(psrad, v0, v0, c.imm(16));
 		}
 		EMIT(pslld, v1, v0, c.imm(6));
 		EMIT(pslld, v2, v0, c.imm(3));
 		BCST(pand, d, v0, v0, c.get_bcst<u32>(0xff00001f));
 		BCST(pand, d, v1, v1, c.get_bcst<u32>(0x1f0000));
 		BCST(pand, d, v2, v2, c.get_bcst<u32>(0x1f00));
 		EMIT(por, v0, v0, v1);
 		EMIT(por, v0, v0, v2);
 		LDST(movaps, c.ppu_vr(s_op.vd, true), v0);
 		c.ppu_ret();
 	};
 #endif
 	static const auto exec = [](auto&& d, auto&& b)
 	{
-		const auto x = gv_extend_lo_s16(b);
+		auto x = gv_extend_lo_s16(std::move(b));
-		d = gv_and32(x, gv_bcst32(0xff00001f)) | gv_and32(gv_shl32(x, 6), gv_bcst32(0x1f0000)) | gv_and32(gv_shl32(x, 3), gv_bcst32(0x1f00));
+		auto y = gv_or32(gv_and32(gv_shl32(x, 6), gv_bcst32(0x1f0000)), gv_and32(gv_shl32(x, 3), gv_bcst32(0x1f00)));
 		d = gv_or32(std::move(y), gv_and32(std::move(x), gv_bcst32(0xff00001f)));
 	};
-	RETURN_(ppu.vr[op.vd], ppu.vr[op.vb]);
+	RETURN(ppu.vr[op.vd], ppu.vr[op.vb]);
 }
 template <u32 Build, ppu_exec_bit... Flags>
@ -3303,29 +3130,12 @@ auto VUPKLSB()
 	if constexpr (Build == 0xf1a6)
 		return ppu_exec_select<Flags...>::template select<>();
 #if defined(ARCH_X64_0)
 	static const auto make = [](asmjit::ppu_builder& c)
 	{
 		const auto v0 = c.vec_alloc();
 		if (utils::has_sse41())
 		{
 			LDST(pmovsxbw, v0, c.ppu_vr<8>(s_op.vb));
 		}
 		else
 		{
 			EMIT(punpcklbw, v0, v0, c.ppu_vr(s_op.vb));
 			EMIT(psraw, v0, v0, c.imm(8));
 		}
 		LDST(movaps, c.ppu_vr(s_op.vd, true), v0);
 		c.ppu_ret();
 	};
 #endif
 	static const auto exec = [](auto&& d, auto&& b)
 	{
-		d = gv_extend_lo_s8(b);
+		d = gv_extend_lo_s8(std::move(b));
 	};
-	RETURN_(ppu.vr[op.vd], ppu.vr[op.vb]);
+	RETURN(ppu.vr[op.vd], ppu.vr[op.vb]);
 }
 template <u32 Build, ppu_exec_bit... Flags>
@ -3334,29 +3144,12 @@ auto VUPKLSH()
 	if constexpr (Build == 0xf1a6)
 		return ppu_exec_select<Flags...>::template select<>();
 #if defined(ARCH_X64_0)
 	static const auto make = [](asmjit::ppu_builder& c)
 	{
 		const auto v0 = c.vec_alloc();
 		if (utils::has_sse41())
 		{
 			LDST(pmovsxwd, v0, c.ppu_vr<8>(s_op.vb));
 		}
 		else
 		{
 			EMIT(punpcklwd, v0, v0, c.ppu_vr(s_op.vb));
 			EMIT(psrad, v0, v0, c.imm(16));
 		}
 		LDST(movaps, c.ppu_vr(s_op.vd, true), v0);
 		c.ppu_ret();
 	};
 #endif
 	static const auto exec = [](auto&& d, auto&& b)
 	{
-		d = gv_extend_lo_s16(b);
+		d = gv_extend_lo_s16(std::move(b));
 	};
-	RETURN_(ppu.vr[op.vd], ppu.vr[op.vb]);
+	RETURN(ppu.vr[op.vd], ppu.vr[op.vb]);
 }
 template <u32 Build, ppu_exec_bit... Flags>
@ -7157,7 +6950,8 @@ struct ppu_interpreter_t
 	IT VSEL;
 	IT VSL;
 	IT VSLB;
-	IT VSLDOI;
+	IT VSLDOI{};
 	IT VSLDOI_[16];
 	IT VSLH;
 	IT VSLO;
 	IT VSLW;
@ -7629,6 +7423,27 @@ ppu_interpreter_rt_base::ppu_interpreter_rt_base() noexcept
 		return ::name<0, Flags...>(); \
 	}); \
 #define INIT_ONE(name, bits) \
 	ptrs->name##_[0b##bits] = ::name<0b##bits>::select(selected, []<ppu_exec_bit... Flags>() { \
 		return ::name<0b##bits>::impl<0, Flags...>(); \
 	}); \
 #define INIT_PACK2(name, bits) \
 	INIT_ONE(name, bits##0) \
 	INIT_ONE(name, bits##1) \
 #define INIT_PACK4(name, bits) \
 	INIT_PACK2(name, bits##0) \
 	INIT_PACK2(name, bits##1) \
 #define INIT_PACK8(name, bits) \
 	INIT_PACK4(name, bits##0) \
 	INIT_PACK4(name, bits##1) \
 #define INIT_PACK16(name, bits) \
 	INIT_PACK8(name, bits##0) \
 	INIT_PACK8(name, bits##1) \
 	INIT(MFVSCR);
 	INIT(MTVSCR);
 	INIT(VADDCUW);
@ -7732,7 +7547,7 @@ ppu_interpreter_rt_base::ppu_interpreter_rt_base() noexcept
 	INIT(VSEL);
 	INIT(VSL);
 	INIT(VSLB);
-	INIT(VSLDOI);
+	INIT_PACK16(VSLDOI,);
 	INIT(VSLH);
 	INIT(VSLO);
 	INIT(VSLW);
@ -8051,6 +7866,7 @@ ppu_intrp_func_t ppu_interpreter_rt::decode(u32 opv) const noexcept
 		break;
 	}
 	case ppu_itype::VSLDOI: return ptrs->VSLDOI_[op.vsh];
 	default: break;
 	}
--- a/rpcs3/util/asm.hpp
+++ b/rpcs3/util/asm.hpp
@ -193,7 +193,7 @@ namespace utils
 #elif defined(__clang__)
 		return __builtin_rotateleft32(x, n);
 #else
-		return (x << n) | (x >> (32 - n));
+		return (x << (n & 31)) | (x >> (((0 - n) & 31)));
 #endif
 	}
@ -209,7 +209,7 @@ namespace utils
 #elif defined(__clang__)
 		return __builtin_rotateleft64(x, n);
 #else
-		return (x << n) | (x >> (64 - n));
+		return (x << (n & 63)) | (x >> (((0 - n) & 63)));
 #endif
 	}
--- a/rpcs3/util/simd.hpp
+++ b/rpcs3/util/simd.hpp
@ -3,6 +3,7 @@
 #include "util/types.hpp"
 #include "util/v128.hpp"
 #include "util/sysinfo.hpp"
 #include "util/asm.hpp"
 #include "Utilities/JIT.h"
 #if defined(ARCH_X64)
@ -40,6 +41,7 @@ namespace asmjit
 #else
 	struct gpr_type : Operand
 	{
 		gpr_type() = default;
 		gpr_type(u32)
 		{
 		}
@ -47,6 +49,7 @@ namespace asmjit
 	struct vec_type : Operand
 	{
 		vec_type() = default;
 		vec_type(u32)
 		{
 		}
@ -82,7 +85,7 @@ namespace asmjit
 	template <typename T, typename D = std::decay_t<T>>
 	constexpr arg_class arg_classify =
-		std::is_base_of_v<v128, D> ? arg_class::imm_lv + !std::is_reference_v<T> :
+		std::is_same_v<v128, D> ? arg_class::imm_lv + !std::is_reference_v<T> :
 		std::is_base_of_v<mem_type, D> ? arg_class::mem_lv :
 		std::is_base_of_v<mem_lazy, D> ? arg_class::mem_lv + !std::is_reference_v<T> :
 		std::is_reference_v<T> ? arg_class::reg_lv : arg_class::reg_rv;
@ -91,6 +94,8 @@ namespace asmjit
 	{
 		using base = native_asm;
 		bool fail_flag = false;
 		vec_builder(CodeHolder* ch)
 			: native_asm(ch)
 		{
@ -150,6 +155,9 @@ namespace asmjit
 		std::unordered_map<v128, Label> consts[16]{};
 #if defined(ARCH_X64)
 		std::unordered_map<v128, vec_type> const_allocs{};
 		template <typename T, u32 Size = sizeof(T)>
 		x86::Mem get_const(const T& data, u32 esize = Size)
 		{
@ -180,14 +188,97 @@ namespace asmjit
 			return x86::Mem(_label, 0, Size);
 		}
 #endif
 	};
 	// Scope guard for a vector register operand.
 	// Holds an operand in `x`; when the guard is destroyed and `x` is a register,
 	// that register is handed back to the global builder via g_vc->vec_dealloc().
 	// A default-constructed guard (empty operand) does nothing on destruction.
 	struct free_on_exit
 	{
 		// Operand to release on scope exit; stays empty unless the owner assigns it.
 		Operand x{};
 		free_on_exit() = default;
 		// Non-copyable: a copy would deallocate the same register a second time.
 		free_on_exit(const free_on_exit&) = delete;
 		free_on_exit& operator=(const free_on_exit&) = delete;
 		~free_on_exit()
 		{
 			// Only register operands are released; memory/immediate operands are ignored.
 			if (x.isReg())
 			{
 				// Rebuild a typed vector register from the generic operand before deallocating.
 				vec_type v;
 				v.copyFrom(x);
 				g_vc->vec_dealloc(v);
 			}
 		}
 	};
 #if defined(ARCH_X64)
-	inline auto arg_eval(const v128& _c, u32 esize)
+	inline Operand arg_eval(v128& _c, u32 esize)
 	{
-		// TODO: implement PSHUFD broadcasts and AVX ones
+		const auto found = g_vc->const_allocs.find(_c);
-		auto r = g_vc->get_const(_c, esize);
+
-		return r;
+		if (found != g_vc->const_allocs.end())
 		{
 			return found->second;
 		}
 		vec_type reg = g_vc->vec_alloc();
 		// TODO: PSHUFD style broadcast? Needs known const layout
 		if (utils::has_avx() && _c._u64[0] == _c._u64[1])
 		{
 			if (_c._u32[0] == _c._u32[1])
 			{
 				if (utils::has_avx2() && _c._u16[0] == _c._u16[1])
 				{
 					if (_c._u8[0] == _c._u8[1])
 					{
 						ensure(!g_vc->vpbroadcastb(reg, g_vc->get_const(_c._u8[0])));
 					}
 					else
 					{
 						ensure(!g_vc->vpbroadcastw(reg, g_vc->get_const(_c._u16[0])));
 					}
 				}
 				else
 				{
 					ensure(!g_vc->vbroadcastss(reg, g_vc->get_const(_c._u32[0])));
 				}
 			}
 			else
 			{
 				ensure(!g_vc->vbroadcastsd(reg, g_vc->get_const(_c._u32[0])));
 			}
 		}
 		else if (!_c._u)
 		{
 			ensure(!g_vc->pxor(reg, reg));
 		}
 		else if (!~_c._u)
 		{
 			ensure(!g_vc->pcmpeqd(reg, reg));
 		}
 		else
 		{
 			ensure(!g_vc->movaps(reg, g_vc->get_const(_c, esize)));
 		}
 		g_vc->const_allocs.emplace(_c, reg);
 		return reg;
 	}
 	// Evaluate an rvalue v128 constant (one the caller will not use again).
 	//
 	// If a register was previously cached for this constant in g_vc->const_allocs,
 	// the cache entry is removed, the register is returned to the allocator and the
 	// same register is returned as the operand.
 	// NOTE(review): the register is deallocated before it is returned; presumably the
 	// caller consumes it in the very next emitted instruction - confirm.
 	// Otherwise the constant is returned as a memory operand from get_const().
 	//
 	// _c    - constant being consumed
 	// esize - element size forwarded to get_const()
 	inline Operand arg_eval(v128&& _c, u32 esize)
 	{
 		const auto found = g_vc->const_allocs.find(_c);
 		if (found != g_vc->const_allocs.end())
 		{
 			// Last use of the cached constant: drop the cache entry and release its register.
 			vec_type r = found->second;
 			g_vc->const_allocs.erase(found);
 			g_vc->vec_dealloc(r);
 			return r;
 		}
 		// Hack: assume can use mem op (TODO)
 		return g_vc->get_const(_c, esize);
 	}
 	template <typename T> requires(std::is_base_of_v<mem_lazy, std::decay_t<T>>)
@ -211,12 +302,24 @@ namespace asmjit
 		return std::move(mem);
 	}
 	// Release hook for a v128 constant argument: intentionally a no-op,
 	// this overload has nothing to deallocate.
 	inline void arg_free(const v128&)
 	{
 	}
 	// Release hook for a generic operand: if it is a register, return it to the
 	// global builder's allocator by id; non-register operands are left untouched.
 	inline void arg_free(const Operand& op)
 	{
 		if (op.isReg())
 		{
 			g_vc->vec_dealloc(vec_type{op.id()});
 		}
 	}
 	template <typename T>
 	inline bool arg_use_evex(const auto& op)
 	{
 		constexpr auto _class = arg_classify<T>;
 		if constexpr (_class == arg_class::imm_rv)
-			return true;
+			return g_vc->const_allocs.count(op) == 0;
 		else if constexpr (_class == arg_class::imm_lv)
 			return false;
 		else if (op.isMem())
@ -302,6 +405,7 @@ namespace asmjit
 	template <typename A, typename B, typename... Args>
 	vec_type binary_op(u32 esize, x86::Inst::Id mov_op, x86::Inst::Id sse_op, x86::Inst::Id avx_op, x86::Inst::Id evex_op, A&& a, B&& b, Args&&... args)
 	{
 		free_on_exit e;
 		Operand src1{};
 		if constexpr (arg_classify<A> == arg_class::reg_rv)
@ -317,12 +421,13 @@ namespace asmjit
 			if constexpr (arg_classify<B> == arg_class::reg_rv)
 			{
-				g_vc->vec_dealloc(vec_type{b.id()});
+				e.x = b;
 				//b = Operand();
 			}
 		}
 		else if (utils::has_avx() && avx_op && (arg_classify<A> == arg_class::reg_lv || arg_classify<A> == arg_class::mem_lv))
 		{
 			Operand srca = arg_eval(std::forward<A>(a), 16);
 			if constexpr (arg_classify<A> == arg_class::reg_lv)
 			{
 				if constexpr (arg_classify<B> == arg_class::reg_rv)
@ -336,47 +441,79 @@ namespace asmjit
 					src1 = g_vc->vec_alloc();
 				}
 			}
-			else // if A == arg_class::reg_rv
+			else
 			{
 				src1 = g_vc->vec_alloc();
 				if (!a.isReg())
 				{
 					static_cast<void>(arg_eval(std::forward<A>(a), 16));
 				}
 				if constexpr (arg_classify<B> == arg_class::reg_rv)
 				{
-					g_vc->vec_dealloc(vec_type{b.id()});
+					e.x = b;
 					//b = Operand();
 				}
 			}
 			if (utils::has_avx512() && evex_op && arg_use_evex<B>(b))
 			{
-				ensure(!g_vc->evex().emit(evex_op, src1, vec_type{a.id()}, arg_eval(std::forward<B>(b), esize), std::forward<Args>(args)...));
+				ensure(!g_vc->evex().emit(evex_op, src1, srca, arg_eval(std::forward<B>(b), esize), std::forward<Args>(args)...));
 				return vec_type{src1.id()};
 			}
-			ensure(!g_vc->emit(avx_op, src1, vec_type{a.id()}, arg_eval(std::forward<B>(b), 16), std::forward<Args>(args)...));
+			ensure(!g_vc->emit(avx_op, src1, srca, arg_eval(std::forward<B>(b), 16), std::forward<Args>(args)...));
 			return vec_type{src1.id()};
 		}
 		else do
 		{
-			if constexpr (arg_classify<B> == arg_class::reg_rv)
+			if constexpr (arg_classify<A> == arg_class::mem_rv)
 			{
-				g_vc->vec_dealloc(vec_type{b.id()});
+				if (a.isReg())
-				//b = Operand();
+				{
 					src1 = vec_type(a.id());
 					if constexpr (arg_classify<B> == arg_class::reg_rv)
 					{
 						e.x = b;
 					}
 					break;
 				}
 			}
-			if (arg_classify<A> == arg_class::mem_rv && a.isReg())
+			if constexpr (arg_classify<A> == arg_class::imm_rv)
 			{
-				src1 = vec_type(a.id());
+				if (auto found = g_vc->const_allocs.find(a); found != g_vc->const_allocs.end())
-				break;
+				{
 					src1 = found->second;
 					g_vc->const_allocs.erase(found);
 					if constexpr (arg_classify<B> == arg_class::reg_rv)
 					{
 						e.x = b;
 					}
 					break;
 				}
 			}
 			src1 = g_vc->vec_alloc();
 			if constexpr (arg_classify<B> == arg_class::reg_rv)
 			{
 				e.x = b;
 			}
 			if constexpr (arg_classify<A> == arg_class::imm_rv)
 			{
 				if (!a._u)
 				{
 					// All zeros
 					ensure(!g_vc->emit(x86::Inst::kIdPxor, src1, src1));
 					break;
 				}
 				else if (!~a._u)
 				{
 					// All ones
 					ensure(!g_vc->emit(x86::Inst::kIdPcmpeqd, src1, src1));
 					break;
 				}
 			}
 			// Fallback to arg copy
 			ensure(!g_vc->emit(mov_op, src1, arg_eval(std::forward<A>(a), 16)));
 		}
@ -404,10 +541,14 @@ namespace asmjit
 }
 // Forward declarations of helpers that are referenced before their definitions appear
 inline v128 gv_select8(const v128& _cmp, const v128& _true, const v128& _false);
 inline v128 gv_signselect8(const v128& bits, const v128& _true, const v128& _false);
 inline v128 gv_select16(const v128& _cmp, const v128& _true, const v128& _false);
 inline v128 gv_select32(const v128& _cmp, const v128& _true, const v128& _false);
 inline v128 gv_selectfs(const v128& _cmp, const v128& _true, const v128& _false);
 // JIT (operand-based) signed 32-bit greater-than comparison
 template <typename A, typename B> requires (asmjit::any_operand_v<A, B>)
 inline asmjit::vec_type gv_gts32(A&&, B&&);
 inline void gv_set_zeroing_denormals()
 {
 #if defined(ARCH_X64)
@ -704,6 +845,16 @@ inline v128 gv_not32(const v128& a)
 #endif
 }
 // JIT overload: emits code that computes the bitwise NOT of operand `a`.
 // A freshly allocated vector register is filled with all-ones and then XORed with `a`.
 template <typename A> requires (asmjit::any_operand_v<A>)
 inline auto gv_not32(A&& a)
 {
 #if defined(ARCH_X64)
 	asmjit::vec_type ones = g_vc->vec_alloc();
 	// Comparing a register with itself for equality sets every bit
 	g_vc->pcmpeqd(ones, ones);
 	// `ones` is handed over as an rvalue (NOTE(review): presumably so binary_op can reuse its register for the result - confirm)
 	FOR_X64(binary_op, 4, kIdMovdqa, kIdPxor, kIdVpxor, kIdVpxord, std::move(ones), std::forward<A>(a));
 #endif
 }
 inline v128 gv_notfs(const v128& a)
 {
 #if defined(ARCH_X64)
@ -713,6 +864,16 @@ inline v128 gv_notfs(const v128& a)
 #endif
 }
 // JIT overload: emits code that computes the bitwise NOT of operand `a` using the
 // floating-point XOR instructions (xorps / vxorps).
 template <typename A> requires (asmjit::any_operand_v<A>)
 inline auto gv_notfs(A&& a)
 {
 #if defined(ARCH_X64)
 	asmjit::vec_type ones = g_vc->vec_alloc();
 	// The all-ones mask is still produced with the integer compare (register equals itself)
 	g_vc->pcmpeqd(ones, ones);
 	FOR_X64(binary_op, 4, kIdMovaps, kIdXorps, kIdVxorps, kIdVxorps, std::move(ones), std::forward<A>(a));
 #endif
 }
 inline v128 gv_shl16(const v128& a, u32 count)
 {
 	if (count >= 16)
@ -724,7 +885,7 @@ inline v128 gv_shl16(const v128& a, u32 count)
 #endif
 }
-template <typename A> requires(asmjit::any_operand_v<A>)
+template <typename A> requires (asmjit::any_operand_v<A>)
 inline auto gv_shl16(A&& a, u32 count)
 {
 	FOR_X64(unary_op, kIdPsllw, kIdVpsllw, std::forward<A>(a), count);
@ -741,7 +902,7 @@ inline v128 gv_shl32(const v128& a, u32 count)
 #endif
 }
-template <typename A> requires(asmjit::any_operand_v<A>)
+template <typename A> requires (asmjit::any_operand_v<A>)
 inline auto gv_shl32(A&& a, u32 count)
 {
 	FOR_X64(unary_op, kIdPslld, kIdVpslld, std::forward<A>(a), count);
@ -758,7 +919,7 @@ inline v128 gv_shl64(const v128& a, u32 count)
 #endif
 }
-template <typename A> requires(asmjit::any_operand_v<A>)
+template <typename A> requires (asmjit::any_operand_v<A>)
 inline auto gv_shl64(A&& a, u32 count)
 {
 	FOR_X64(unary_op, kIdPsllq, kIdVpsllq, std::forward<A>(a), count);
@ -775,7 +936,7 @@ inline v128 gv_shr16(const v128& a, u32 count)
 #endif
 }
-template <typename A> requires(asmjit::any_operand_v<A>)
+template <typename A> requires (asmjit::any_operand_v<A>)
 inline auto gv_shr16(A&& a, u32 count)
 {
 	FOR_X64(unary_op, kIdPsrlw, kIdVpsrlw, std::forward<A>(a), count);
@ -792,7 +953,7 @@ inline v128 gv_shr32(const v128& a, u32 count)
 #endif
 }
-template <typename A> requires(asmjit::any_operand_v<A>)
+template <typename A> requires (asmjit::any_operand_v<A>)
 inline auto gv_shr32(A&& a, u32 count)
 {
 	FOR_X64(unary_op, kIdPsrld, kIdVpsrld, std::forward<A>(a), count);
@ -809,7 +970,7 @@ inline v128 gv_shr64(const v128& a, u32 count)
 #endif
 }
-template <typename A> requires(asmjit::any_operand_v<A>)
+template <typename A> requires (asmjit::any_operand_v<A>)
 inline auto gv_shr64(A&& a, u32 count)
 {
 	FOR_X64(unary_op, kIdPsrlq, kIdVpsrlq, std::forward<A>(a), count);
@ -826,7 +987,7 @@ inline v128 gv_sar16(const v128& a, u32 count)
 #endif
 }
-template <typename A> requires(asmjit::any_operand_v<A>)
+template <typename A> requires (asmjit::any_operand_v<A>)
 inline auto gv_sar16(A&& a, u32 count)
 {
 	FOR_X64(unary_op, kIdPsraw, kIdVpsraw, std::forward<A>(a), count);
@ -843,7 +1004,7 @@ inline v128 gv_sar32(const v128& a, u32 count)
 #endif
 }
-template <typename A> requires(asmjit::any_operand_v<A>)
+template <typename A> requires (asmjit::any_operand_v<A>)
 inline auto gv_sar32(A&& a, u32 count)
 {
 	FOR_X64(unary_op, kIdPsrad, kIdVpsrad, std::forward<A>(a), count);
@ -867,6 +1028,20 @@ inline v128 gv_sar64(const v128& a, u32 count)
 #endif
 }
 // JIT overload: emits a 64-bit arithmetic right shift of `a` by the immediate `count`.
 // Counts of 64 and above are clamped to 63.
 // Only an AVX-512 implementation (vpsraq) exists here; on other CPUs the generator's
 // fail_flag is raised and the operand is returned unshifted.
 template <typename A> requires (asmjit::any_operand_v<A>)
 inline auto gv_sar64(A&& a, u32 count)
 {
 	if (count >= 64)
 		count = 63;
 #if defined(ARCH_X64)
 	using enum asmjit::x86::Inst::Id;
 	if (utils::has_avx512())
 		return asmjit::unary_op(kIdNone, kIdVpsraq, std::forward<A>(a), count);
 	// No instruction available: signal failure to the code generator
 	// NOTE(review): presumably the caller then discards the generated code - confirm
 	g_vc->fail_flag = true;
 	return std::forward<A>(a);
 #endif
 }
 inline v128 gv_add8(const v128& a, const v128& b)
 {
 #if defined(ARCH_X64)
@ -1025,6 +1200,20 @@ inline v128 gv_addus_u32(const v128& a, const v128& b)
 #endif
 }
 // JIT overload: emits an unsigned saturating add of 32-bit elements (a + b, clamped to 0xffffffff).
 // Returns a default-constructed vec_type when not built for x86-64.
 template <typename A, typename B> requires (asmjit::any_operand_v<A, B>)
 inline asmjit::vec_type gv_addus_u32(A&& a, B&& b)
 {
 #if defined(ARCH_X64)
 	// SSE4.1: limiting b to ~a (the headroom left above a) makes the plain add unable to wrap
 	if (utils::has_sse41())
 		return gv_add32(gv_minu32(std::forward<B>(b), gv_not32(a)), std::forward<A>(a));
 	// Fallback: wrapping sum, then force all bits set in every element that overflowed
 	auto s = gv_add32(a, b);
 	// (b ^ 0x80000000) > (a ^ 0x7fffffff) as signed is the same test as b > ~a as unsigned
 	auto x = gv_xor32(std::forward<B>(b), gv_bcst32(0x80000000));
 	auto y = gv_xor32(std::forward<A>(a), gv_bcst32(0x7fffffff));
 	return gv_or32(std::move(s), gv_gts32(std::move(x), std::move(y)));
 #endif
 	return {};
 }
 inline v128 gv_addfs(const v128& a, const v128& b)
 {
 #if defined(ARCH_X64)
@ -1052,6 +1241,12 @@ inline v128 gv_sub8(const v128& a, const v128& b)
 #endif
 }
 // JIT overload: emits a per-byte subtraction a - b (psubb / vpsubb).
 // kIdNone is passed in the slot where other helpers supply an EVEX opcode.
 template <typename A, typename B> requires (asmjit::any_operand_v<A, B>)
 inline auto gv_sub8(A&& a, B&& b)
 {
 	FOR_X64(binary_op, 1, kIdMovdqa, kIdPsubb, kIdVpsubb, kIdNone, std::forward<A>(a), std::forward<B>(b));
 }
 inline v128 gv_sub16(const v128& a, const v128& b)
 {
 #if defined(ARCH_X64)
@ -1265,6 +1460,21 @@ inline v128 gv_minu32(const v128& a, const v128& b)
 #endif
 }
 // JIT overload: emits an unsigned minimum of 32-bit elements.
 // Returns a default-constructed vec_type when not built for x86-64.
 template <typename A, typename B> requires (asmjit::any_operand_v<A, B>)
 inline asmjit::vec_type gv_minu32(A&& a, B&& b)
 {
 #if defined(ARCH_X64)
 	// SSE4.1 provides the instruction directly (pminud / vpminud)
 	if (utils::has_sse41())
 		FOR_X64(binary_op, 4, kIdMovdqa, kIdPminud, kIdVpminud, kIdVpminud, std::forward<A>(a), std::forward<B>(b));
 	// Fallback: flip the sign bit of both inputs so a signed compare orders them as unsigned
 	auto s = gv_bcst32(0x80000000);
 	auto x = gv_xor32(a, s);
 	// m: mask of elements where a > b (unsigned)
 	auto m = gv_gts32(std::move(x), gv_xor32(std::move(s), b));
 	// Blend: take b where the mask is set, a elsewhere
 	// NOTE(review): assumes gv_andn32(m, a) computes ~m & a - confirm
 	auto z = gv_and32(m, std::move(b));
 	return gv_or32(std::move(z), gv_andn32(std::move(m), std::move(a)));
 #endif
 	return {};
 }
 inline v128 gv_mins8(const v128& a, const v128& b)
 {
 #if defined(__SSE4_1__)
@ -1493,6 +1703,13 @@ inline v128 gv_gts8(const v128& a, const v128& b)
 #endif
 }
 // JIT overload: emits a signed per-byte greater-than comparison a > b (pcmpgtb / vpcmpgtb).
 // The trailing return only provides a value for builds where FOR_X64 does not return
 // (NOTE(review): presumably non-x86-64 targets - confirm against the macro).
 template <typename A, typename B> requires (asmjit::any_operand_v<A, B>)
 inline asmjit::vec_type gv_gts8(A&& a, B&& b)
 {
 	FOR_X64(binary_op, 1, kIdMovdqa, kIdPcmpgtb, kIdVpcmpgtb, kIdNone, std::forward<A>(a), std::forward<B>(b));
 	return {};
 }
 inline v128 gv_gts16(const v128& a, const v128& b)
 {
 #if defined(ARCH_X64)
@ -1511,6 +1728,13 @@ inline v128 gv_gts32(const v128& a, const v128& b)
 #endif
 }
 // JIT overload: emits a signed 32-bit greater-than comparison a > b (pcmpgtd / vpcmpgtd).
 // The trailing return only provides a value for builds where FOR_X64 does not return
 // (NOTE(review): presumably non-x86-64 targets - confirm against the macro).
 template <typename A, typename B> requires (asmjit::any_operand_v<A, B>)
 inline asmjit::vec_type gv_gts32(A&& a, B&& b)
 {
 	FOR_X64(binary_op, 4, kIdMovdqa, kIdPcmpgtd, kIdVpcmpgtd, kIdNone, std::forward<A>(a), std::forward<B>(b));
 	return {};
 }
 inline v128 gv_avgu8(const v128& a, const v128& b)
 {
 #if defined(ARCH_X64)
@ -2154,7 +2378,7 @@ inline v128 gv_andn(const v128& a, const v128& b)
 }
 // Select elements; _cmp must be result of SIMD comparison; undefined otherwise
-inline v128 gv_select8(const v128& _cmp, const v128& _true, const v128& _false)
+FORCE_INLINE v128 gv_select8(const v128& _cmp, const v128& _true, const v128& _false)
 {
 #if defined(__SSE4_1__)
 	return _mm_blendv_epi8(_false, _true, _cmp);
@ -2165,6 +2389,45 @@ inline v128 gv_select8(const v128& _cmp, const v128& _true, const v128& _false)
 #endif
 }
 // Per-byte select controlled only by the top bit of each byte in `bits`:
 // bytes with the sign bit set come from `_true`, all others from `_false`
 FORCE_INLINE v128 gv_signselect8(const v128& bits, const v128& _true, const v128& _false)
 {
 #if !defined(__SSE4_1__)
 	// No byte blend available: expand each sign bit into a whole-byte mask (0 > bits)
 	// and defer to the mask-based select
 	const v128 negative = gv_gts8(gv_bcst8(0), bits);
 	return gv_select8(negative, _true, _false);
 #else
 	return _mm_blendv_epi8(_false, _true, bits);
 #endif
 }
 // JIT overload: emits a per-byte select driven by the sign bit of `bits` (vpblendvb).
 // Only an AVX implementation exists; without AVX the generator's fail_flag is raised
 // and a placeholder register (id 0) is returned.
 template <typename A, typename B, typename C> requires (asmjit::any_operand_v<A, B, C>)
 inline asmjit::vec_type gv_signselect8(A&& bits, B&& _true, C&& _false)
 {
 	using namespace asmjit;
 #if defined(ARCH_X64)
 	if (utils::has_avx())
 	{
 		// arg0 becomes the destination; it starts out empty
 		Operand arg0{};
 		Operand arg1 = arg_eval(std::forward<A>(bits), 16);
 		Operand arg2 = arg_eval(std::forward<B>(_true), 16);
 		Operand arg3 = arg_eval(std::forward<C>(_false), 16);
 		// For every argument passed as an rvalue: the first one donates its operand as the
 		// destination, the remaining ones are released once a register destination is chosen
 		if constexpr (!std::is_reference_v<A>)
 			arg0.isReg() ? arg_free(bits) : arg0.copyFrom(arg1);
 		if constexpr (!std::is_reference_v<B>)
 			arg0.isReg() ? arg_free(_true) : arg0.copyFrom(arg2);
 		if constexpr (!std::is_reference_v<C>)
 			arg0.isReg() ? arg_free(_false) : arg0.copyFrom(arg3);
 		// Nothing could be reused: allocate a fresh destination register
 		if (arg0.isNone())
 			arg0 = g_vc->vec_alloc();
 		// NOTE(review): the emit result is not checked here, unlike the ensure(!emit(...)) pattern used elsewhere
 		g_vc->emit(x86::Inst::kIdVpblendvb, arg0, arg3, arg2, arg1);
 		vec_type r;
 		r.copyFrom(arg0);
 		return r;
 	}
 #endif
 	g_vc->fail_flag = true;
 	return vec_type{0};
 }
 // Select elements; _cmp must be result of SIMD comparison; undefined otherwise
 inline v128 gv_select16(const v128& _cmp, const v128& _true, const v128& _false)
 {
@ -2305,6 +2568,17 @@ inline v128 gv_extend_lo_s8(const v128& vec)
 #endif
 }
 // JIT overload: emits sign extension of the low bytes of `a` to 16-bit elements.
 // With SSE4.1 a single pmovsxbw / vpmovsxbw is used; otherwise the low bytes are
 // unpacked (punpcklbw) and each 16-bit element is arithmetically shifted right by 8.
 template <typename A> requires (asmjit::any_operand_v<A>)
 inline auto gv_extend_lo_s8(A&& a)
 {
 #if defined(ARCH_X64)
 	using enum asmjit::x86::Inst::Id;
 	if (utils::has_sse41())
 		return asmjit::unary_op(kIdPmovsxbw, kIdVpmovsxbw, std::forward<A>(a));
 	// NOTE(review): the inner call passes kIdNone plus a single opcode; presumably unary_op then
 	// feeds the operand to the unpack as both sources - confirm against unary_op
 	return asmjit::unary_op(kIdPsraw, kIdVpsraw, asmjit::unary_op(kIdNone, kIdPunpcklbw, std::forward<A>(a)), 8);
 #endif
 }
 inline v128 gv_extend_hi_s8(const v128& vec)
 {
 #if defined(__SSE4_1__)
@ -2316,6 +2590,15 @@ inline v128 gv_extend_hi_s8(const v128& vec)
 #endif
 }
 // JIT overload: emits sign extension of the high bytes of `a` to 16-bit elements.
 // The high bytes are unpacked (punpckhbw) and each 16-bit element is then
 // arithmetically shifted right by 8.
 template <typename A> requires (asmjit::any_operand_v<A>)
 inline auto gv_extend_hi_s8(A&& a)
 {
 #if defined(ARCH_X64)
 	using enum asmjit::x86::Inst::Id;
 	// NOTE(review): the inner call passes kIdNone plus a single opcode; presumably unary_op then
 	// feeds the operand to the unpack as both sources - confirm against unary_op
 	return asmjit::unary_op(kIdPsraw, kIdVpsraw, asmjit::unary_op(kIdNone, kIdPunpckhbw, std::forward<A>(a)), 8);
 #endif
 }
 inline v128 gv_unpacklo16(const v128& lows, const v128& highs)
 {
 #if defined(ARCH_X64)
@ -2336,6 +2619,17 @@ inline v128 gv_extend_lo_s16(const v128& vec)
 #endif
 }
 // JIT overload: emits sign extension of the low 16-bit elements of `a` to 32-bit elements.
 // With SSE4.1 a single pmovsxwd / vpmovsxwd is used; otherwise the low words are
 // unpacked (punpcklwd) and each 32-bit element is arithmetically shifted right by 16.
 template <typename A> requires (asmjit::any_operand_v<A>)
 inline auto gv_extend_lo_s16(A&& a)
 {
 #if defined(ARCH_X64)
 	using enum asmjit::x86::Inst::Id;
 	if (utils::has_sse41())
 		return asmjit::unary_op(kIdPmovsxwd, kIdVpmovsxwd, std::forward<A>(a));
 	// NOTE(review): the inner call passes kIdNone plus a single opcode; presumably unary_op then
 	// feeds the operand to the unpack as both sources - confirm against unary_op
 	return asmjit::unary_op(kIdPsrad, kIdVpsrad, asmjit::unary_op(kIdNone, kIdPunpcklwd, std::forward<A>(a)), 16);
 #endif
 }
 inline v128 gv_extend_hi_s16(const v128& vec)
 {
 #if defined(__SSE4_1__)
@ -2347,6 +2641,15 @@ inline v128 gv_extend_hi_s16(const v128& vec)
 #endif
 }
 // JIT overload: emits sign extension of the high 16-bit elements of `a` to 32-bit elements.
 // The high words are unpacked (punpckhwd) and each 32-bit element is then
 // arithmetically shifted right by 16.
 template <typename A> requires (asmjit::any_operand_v<A>)
 inline auto gv_extend_hi_s16(A&& a)
 {
 #if defined(ARCH_X64)
 	using enum asmjit::x86::Inst::Id;
 	// NOTE(review): the inner call passes kIdNone plus a single opcode; presumably unary_op then
 	// feeds the operand to the unpack as both sources - confirm against unary_op
 	return asmjit::unary_op(kIdPsrad, kIdVpsrad, asmjit::unary_op(kIdNone, kIdPunpckhwd, std::forward<A>(a)), 16);
 #endif
 }
 inline v128 gv_unpacklo32(const v128& lows, const v128& highs)
 {
 #if defined(ARCH_X64)
@ -2471,3 +2774,280 @@ inline v128 gv_log2_approxfs(const v128& a)
 	return r;
 #endif
 }
 // Per-byte variable left shift: every byte of a is shifted by the low 3 bits of the matching byte of b
 inline v128 gv_shl8(const v128& a, const v128& b)
 {
 #if defined(ARCH_ARM64)
 	const auto amount = vandq_s8(b, gv_bcst8(7));
 	return vshlq_u8(a, amount);
 #else
 	// Bring count bits 0, 1 and 2 into the sign position of each byte to drive the selects
 	const v128 sel1 = gv_shl64(b, 7);
 	const v128 sel2 = gv_shl64(b, 6);
 	const v128 sel4 = gv_shl64(b, 5);

 	// Stage 1: optional shift by one (a + a)
 	const v128 by1 = gv_add8(a, a);
 	const v128 stage1 = gv_signselect8(sel1, by1, a);

 	// Stage 2: optional shift by two; the mask drops bits that crossed in from the neighbouring byte
 	const v128 by2 = gv_and32(gv_shl64(stage1, 2), gv_bcst8(0xfc));
 	const v128 stage2 = gv_signselect8(sel2, by2, stage1);

 	// Stage 3: optional shift by four
 	const v128 by4 = gv_and32(gv_shl64(stage2, 4), gv_bcst8(0xf0));
 	return gv_signselect8(sel4, by4, stage2);
 #endif
 }
 // Per-element variable left shift of 16-bit elements; only the low 4 bits of each count in b are used
 inline v128 gv_shl16(const v128& a, const v128& b)
 {
 #if defined(__AVX512VL__) && defined(__AVX512BW__)
 	const auto amount = _mm_and_si128(b, _mm_set1_epi16(15));
 	return _mm_sllv_epi16(a, amount);
 #elif defined(ARCH_ARM64)
 	const auto amount = vandq_s16(b, gv_bcst8(15));
 	return vshlq_u16(a, amount);
 #else
 	// Portable fallback: handle one element at a time
 	v128 out;
 	for (u32 lane = 0; lane != 8; ++lane)
 	{
 		const auto amount = b._u16[lane] & 15;
 		out._u16[lane] = a._u16[lane] << amount;
 	}
 	return out;
 #endif
 }
 // Per-element variable left shift of 32-bit elements; only the low 5 bits of each count in b are used
 inline v128 gv_shl32(const v128& a, const v128& b)
 {
 #if defined(__AVX2__)
 	const auto amount = _mm_and_si128(b, _mm_set1_epi32(31));
 	return _mm_sllv_epi32(a, amount);
 #elif defined(ARCH_ARM64)
 	const auto amount = vandq_s32(b, gv_bcst8(31));
 	return vshlq_u32(a, amount);
 #else
 	// Portable fallback: handle one element at a time
 	v128 out;
 	for (u32 lane = 0; lane != 4; ++lane)
 	{
 		const auto amount = b._u32[lane] & 31;
 		out._u32[lane] = a._u32[lane] << amount;
 	}
 	return out;
 #endif
 }
 // Per-byte variable logical right shift: every byte of a is shifted by the low 3 bits of the matching byte of b
 inline v128 gv_shr8(const v128& a, const v128& b)
 {
 #if defined(ARCH_ARM64)
 	// A negative count makes the NEON shift go right
 	const auto amount = vnegq_s8(vandq_s8(b, gv_bcst8(7)));
 	return vshlq_u8(a, amount);
 #else
 	// Bring count bits 0, 1 and 2 into the sign position of each byte to drive the selects
 	const v128 sel1 = gv_shl64(b, 7);
 	const v128 sel2 = gv_shl64(b, 6);
 	const v128 sel4 = gv_shl64(b, 5);

 	// Stage 1: optional shift by one; the mask drops bits that crossed in from the neighbouring byte
 	const v128 by1 = gv_and32(gv_shr64(a, 1), gv_bcst8(0x7f));
 	const v128 stage1 = gv_signselect8(sel1, by1, a);

 	// Stage 2: optional shift by two
 	const v128 by2 = gv_and32(gv_shr64(stage1, 2), gv_bcst8(0x3f));
 	const v128 stage2 = gv_signselect8(sel2, by2, stage1);

 	// Stage 3: optional shift by four
 	const v128 by4 = gv_and32(gv_shr64(stage2, 4), gv_bcst8(0x0f));
 	return gv_signselect8(sel4, by4, stage2);
 #endif
 }
 // Per-element variable logical right shift of 16-bit elements; only the low 4 bits of each count in b are used
 inline v128 gv_shr16(const v128& a, const v128& b)
 {
 #if defined(__AVX512VL__) && defined(__AVX512BW__)
 	const auto amount = _mm_and_si128(b, _mm_set1_epi16(15));
 	return _mm_srlv_epi16(a, amount);
 #elif defined(ARCH_ARM64)
 	// A negative count makes the NEON shift go right
 	const auto amount = vnegq_s16(vandq_s16(b, gv_bcst8(15)));
 	return vshlq_u16(a, amount);
 #else
 	// Portable fallback: handle one element at a time
 	v128 out;
 	for (u32 lane = 0; lane != 8; ++lane)
 	{
 		const auto amount = b._u16[lane] & 15;
 		out._u16[lane] = a._u16[lane] >> amount;
 	}
 	return out;
 #endif
 }
 // Per-element variable logical right shift of 32-bit elements; only the low 5 bits of each count in b are used
 inline v128 gv_shr32(const v128& a, const v128& b)
 {
 #if defined(__AVX2__)
 	const auto amount = _mm_and_si128(b, _mm_set1_epi32(31));
 	return _mm_srlv_epi32(a, amount);
 #elif defined(ARCH_ARM64)
 	// A negative count makes the NEON shift go right
 	const auto amount = vnegq_s32(vandq_s32(b, gv_bcst8(31)));
 	return vshlq_u32(a, amount);
 #else
 	// Portable fallback: handle one element at a time
 	v128 out;
 	for (u32 lane = 0; lane != 4; ++lane)
 	{
 		const auto amount = b._u32[lane] & 31;
 		out._u32[lane] = a._u32[lane] >> amount;
 	}
 	return out;
 #endif
 }
 // Per-byte variable arithmetic right shift: every signed byte of a is shifted by the low 3 bits of the matching byte of b
 inline v128 gv_sar8(const v128& a, const v128& b)
 {
 #if defined(ARCH_ARM64)
 	// A negative count makes the NEON shift go right
 	const auto amount = vnegq_s8(vandq_s8(b, gv_bcst8(7)));
 	return vshlq_s8(a, amount);
 #else
 	// Handle one element at a time
 	v128 out;
 	for (u32 lane = 0; lane != 16; ++lane)
 	{
 		const auto amount = b._s8[lane] & 7;
 		out._s8[lane] = a._s8[lane] >> amount;
 	}
 	return out;
 #endif
 }
 // Per-element variable arithmetic right shift of signed 16-bit elements; only the low 4 bits of each count in b are used
 inline v128 gv_sar16(const v128& a, const v128& b)
 {
 #if defined(__AVX512VL__) && defined(__AVX512BW__)
 	const auto amount = _mm_and_si128(b, _mm_set1_epi16(15));
 	return _mm_srav_epi16(a, amount);
 #elif defined(ARCH_ARM64)
 	// A negative count makes the NEON shift go right
 	const auto amount = vnegq_s16(vandq_s16(b, gv_bcst8(15)));
 	return vshlq_s16(a, amount);
 #else
 	// Portable fallback: handle one element at a time
 	v128 out;
 	for (u32 lane = 0; lane != 8; ++lane)
 	{
 		const auto amount = b._s16[lane] & 15;
 		out._s16[lane] = a._s16[lane] >> amount;
 	}
 	return out;
 #endif
 }
 // Per-element variable arithmetic right shift of signed 32-bit elements; only the low 5 bits of each count in b are used
 inline v128 gv_sar32(const v128& a, const v128& b)
 {
 #if defined(__AVX2__)
 	const auto amount = _mm_and_si128(b, _mm_set1_epi32(31));
 	return _mm_srav_epi32(a, amount);
 #elif defined(ARCH_ARM64)
 	// A negative count makes the NEON shift go right
 	const auto amount = vnegq_s32(vandq_s32(b, gv_bcst8(31)));
 	return vshlq_s32(a, amount);
 #else
 	// Portable fallback: handle one element at a time
 	v128 out;
 	for (u32 lane = 0; lane != 4; ++lane)
 	{
 		const auto amount = b._s32[lane] & 31;
 		out._s32[lane] = a._s32[lane] >> amount;
 	}
 	return out;
 #endif
 }
 // Per-byte variable rotate left: every byte of a is rotated by the matching byte of b
 inline v128 gv_rol8(const v128& a, const v128& b)
 {
 #if defined(ARCH_ARM64)
 	// Combine a left shift by n with a right shift by 8 - n (expressed as a negative count)
 	const auto left = vandq_s8(b, gv_bcst8(7));
 	const auto right = vsubq_s8(left, gv_bcst8(8));
 	return vorrq_u8(vshlq_u8(a, left), vshlq_u8(a, right));
 #else
 	// Bring count bits 0, 1 and 2 into the sign position of each byte to drive the selects
 	const v128 sel1 = gv_shl64(b, 7);
 	const v128 sel2 = gv_shl64(b, 6);
 	const v128 sel4 = gv_shl64(b, 5);

 	// Stage 1: optional rotate by one; subtracting the (0 > a) mask re-inserts the bit shifted out at the top
 	const v128 top_set = gv_gts8(gv_bcst8(0), a);
 	const v128 by1 = gv_sub8(gv_add8(a, a), top_set);
 	const v128 stage1 = gv_signselect8(sel1, by1, a);

 	// Stage 2: optional rotate by two, merging the wrapped low bits with the shifted high bits
 	const v128 low2 = gv_bcst8(0x3);
 	const v128 wrap2 = gv_and32(gv_shr64(stage1, 6), low2);
 	const v128 keep2 = gv_andn32(low2, gv_shl64(stage1, 2));
 	const v128 by2 = gv_or32(wrap2, keep2);
 	const v128 stage2 = gv_signselect8(sel2, by2, stage1);

 	// Stage 3: optional rotate by four (nibble swap)
 	const v128 low4 = gv_bcst8(0xf);
 	const v128 wrap4 = gv_and32(gv_shr64(stage2, 4), low4);
 	const v128 keep4 = gv_andn32(low4, gv_shl64(stage2, 4));
 	const v128 by4 = gv_or32(wrap4, keep4);
 	return gv_signselect8(sel4, by4, stage2);
 #endif
 }
 // Per-element variable rotate left of 16-bit elements: every element of a is rotated by the matching element of b
 inline v128 gv_rol16(const v128& a, const v128& b)
 {
 #if defined(ARCH_ARM64)
 	// Combine a left shift by n with a right shift by 16 - n (expressed as a negative count)
 	const auto left = vandq_s16(b, gv_bcst16(15));
 	const auto right = vsubq_s16(left, gv_bcst16(16));
 	return vorrq_u16(vshlq_u16(a, left), vshlq_u16(a, right));
 #else
 	// Handle one element at a time with the scalar rotate helper
 	v128 out;
 	for (u32 lane = 0; lane != 8; ++lane)
 	{
 		out._u16[lane] = utils::rol16(a._u16[lane], b._u16[lane]);
 	}
 	return out;
 #endif
 }
 // Per-element variable rotate left of 32-bit elements: every element of a is rotated by the matching element of b
 inline v128 gv_rol32(const v128& a, const v128& b)
 {
 #if defined(__AVX512VL__)
 	return _mm_rolv_epi32(a, b);
 #elif defined(ARCH_ARM64)
 	// Combine a left shift by n with a right shift by 32 - n (expressed as a negative count)
 	const auto left = vandq_s32(b, gv_bcst32(31));
 	const auto right = vsubq_s32(left, gv_bcst32(32));
 	return vorrq_u32(vshlq_u32(a, left), vshlq_u32(a, right));
 #else
 	// Portable fallback: handle one element at a time with the scalar rotate helper
 	v128 out;
 	for (u32 lane = 0; lane != 4; ++lane)
 	{
 		out._u32[lane] = utils::rol32(a._u32[lane], b._u32[lane]);
 	}
 	return out;
 #endif
 }
 // For each 8-bit element, r = (a << (c & 7)) | (b >> (~c & 7) >> 1)
 // Funnel shift left: a is shifted left and the vacated low bits are filled from the top of b.
 // The arguments are forwarding references: values passed as lvalues are only read,
 // values passed as rvalues may be consumed by their last use.
 template <typename A, typename B, typename C>
 inline auto gv_fshl8(A&& a, B&& b, C&& c)
 {
 #if defined(ARCH_ARM64)
 	// Left shift of a by n combined with a right shift of b by 8 - n (negative count)
 	const auto amt1 = vandq_s8(c, gv_bcst8(7));
 	const auto amt2 = vsubq_s8(amt1, gv_bcst8(8));
 	return v128(vorrq_u8(vshlq_u8(a, amt1), vshlq_u8(b, amt2)));
 #else
 	// Stage 1 (count bit 0): a << 1 with the top bit of b inserted at the bottom
 	auto x1 = gv_sub8(gv_add8(a, a), gv_gts8(gv_bcst8(0), b));
 	auto s1 = gv_shl64(c, 7);
 	auto r1 = gv_signselect8(s1, std::move(x1), std::forward<A>(a));
 	// b is advanced by the same amount so that its top bits always hold the next bits to insert
 	auto b1 = gv_signselect8(std::move(s1), gv_shl64(b, 1), std::forward<B>(b));
 	// Stage 2 (count bit 1): shift by two
 	auto c2 = gv_bcst8(0x3);
 	auto x2 = gv_and32(gv_shr64(b1, 6), c2); x2 = gv_or32(std::move(x2), gv_andn32(std::move(c2), gv_shl64(r1, 2)));
 	auto s2 = gv_shl64(c, 6);
 	auto r2 = gv_signselect8(s2, std::move(x2), std::move(r1));
 	auto b2 = gv_signselect8(std::move(s2), gv_shl64(b1, 2), std::move(b1));
 	// Stage 3 (count bit 2): shift by four
 	auto c3 = gv_bcst8(0xf);
 	auto x3 = gv_and32(gv_shr64(std::move(b2), 4), c3); x3 = gv_or32(std::move(x3), gv_andn32(std::move(c3), gv_shl64(r2, 4)));
 	// Last use of c: forward (not move) so that an lvalue argument is left untouched
 	return gv_signselect8(gv_shl64(std::forward<C>(c), 5), std::move(x3), std::move(r2));
 #endif
 }
 // For each 8-bit element, r = (b >> (c & 7)) | (a << (~c & 7) << 1)
 // Funnel shift right: b is shifted right and the vacated high bits are filled from the bottom of a.
 // The arguments are forwarding references: values passed as lvalues are only read,
 // values passed as rvalues may be consumed by their last use.
 template <typename A, typename B, typename C>
 inline auto gv_fshr8(A&& a, B&& b, C&& c)
 {
 #if defined(ARCH_ARM64)
 	// Right shift of b by n (negative count) combined with a left shift of a by 8 - n
 	const auto amt1 = vandq_s8(c, gv_bcst8(7));
 	const auto amt2 = vsubq_s8(gv_bcst8(8), amt1);
 	// Wrapped in v128 so the result type matches gv_fshl8
 	return v128(vorrq_u8(vshlq_u8(b, vnegq_s8(amt1)), vshlq_u8(a, amt2)));
 #else
 	// Stage 1 (count bit 0): b >> 1 with the bottom bit of a inserted at the top
 	auto c1 = gv_bcst8(0x7f);
 	auto x1 = gv_and32(gv_shr64(b, 1), c1); x1 = gv_or32(std::move(x1), gv_andn32(std::move(c1), gv_shl64(a, 7)));
 	auto s1 = gv_shl64(c, 7);
 	// Last uses of b and a: forward (not move) so that lvalue arguments are left untouched
 	auto r1 = gv_signselect8(s1, std::move(x1), std::forward<B>(b));
 	// a is advanced by the same amount so that its bottom bits always hold the next bits to insert
 	auto a1 = gv_signselect8(std::move(s1), gv_shr64(a, 1), std::forward<A>(a));
 	// Stage 2 (count bit 1): shift by two
 	auto c2 = gv_bcst8(0x3f);
 	auto x2 = gv_and32(gv_shr64(r1, 2), c2); x2 = gv_or32(std::move(x2), gv_andn32(std::move(c2), gv_shl64(a1, 6)));
 	auto s2 = gv_shl64(c, 6);
 	auto r2 = gv_signselect8(s2, std::move(x2), std::move(r1));
 	auto a2 = gv_signselect8(std::move(s2), gv_shr64(a1, 2), std::move(a1));
 	// Stage 3 (count bit 2): shift by four
 	auto c3 = gv_bcst8(0x0f);
 	auto x3 = gv_and32(gv_shr64(r2, 4), c3); x3 = gv_or32(std::move(x3), gv_andn32(std::move(c3), gv_shl64(std::move(a2), 4)));
 	// Last use of c: forwarded for the same reason as a and b
 	return gv_signselect8(gv_shl64(std::forward<C>(c), 5), std::move(x3), std::move(r2));
 #endif
 }
 // Shift left by byte amount
 // Moves the whole 128-bit value towards the higher byte indices by Count bytes, filling with zeros.
 // A Count above 15 shifts everything out and yields an all-zero vector.
 template <u32 Count>
 inline v128 gv_shuffle_left(const v128& a)
 {
 	// Decided at compile time so the shift below is never instantiated with an out-of-range count
 	if constexpr (Count > 15)
 	{
 		return {};
 	}
 	else
 	{
 #if defined(ARCH_X64)
 		return _mm_slli_si128(a, Count);
 #elif defined(ARCH_ARM64)
 		// Table lookup: indices that wrap below zero become out of range and produce zero bytes
 		v128 idx;
 		for (u32 i = 0; i < 16; i++)
 			idx._u8[i] = u8(i - Count);
 		return vqtbl1q_u8(a, idx);
 #endif
 	}
 }
 // JIT overload: emits a whole-register byte shift left by the immediate Count (pslldq / vpslldq)
 template <u32 Count, typename A> requires (asmjit::any_operand_v<A>)
 inline auto gv_shuffle_left(A&& a)
 {
 	FOR_X64(unary_op, kIdPslldq, kIdVpslldq, std::forward<A>(a), Count);
 }
 // Shift right by byte amount
 // Moves the whole 128-bit value towards the lower byte indices by Count bytes, filling with zeros.
 // A Count above 15 shifts everything out and yields an all-zero vector.
 template <u32 Count>
 inline v128 gv_shuffle_right(const v128& a)
 {
 	// Decided at compile time so the shift below is never instantiated with an out-of-range count
 	if constexpr (Count > 15)
 	{
 		return {};
 	}
 	else
 	{
 #if defined(ARCH_X64)
 		return _mm_srli_si128(a, Count);
 #elif defined(ARCH_ARM64)
 		// Table lookup: indices past the last byte are out of range and produce zero bytes
 		v128 idx;
 		for (u32 i = 0; i < 16; i++)
 			idx._u8[i] = u8(i + Count);
 		return vqtbl1q_u8(a, idx);
 #endif
 	}
 }
 // JIT overload: emits a whole-register byte shift right by the immediate Count (psrldq / vpsrldq)
 template <u32 Count, typename A> requires (asmjit::any_operand_v<A>)
 inline auto gv_shuffle_right(A&& a)
 {
 	FOR_X64(unary_op, kIdPsrldq, kIdVpsrldq, std::forward<A>(a), Count);
 }