diff --git a/Utilities/JIT.h b/Utilities/JIT.h index 914474d93e..6d9a70e432 100644 --- a/Utilities/JIT.h +++ b/Utilities/JIT.h @@ -247,10 +247,16 @@ inline FT build_function_asm(std::string_view name, F&& builder) Asm compiler(&code); compiler.addEncodingOptions(EncodingOptions::kOptimizedAlign); - if constexpr (std::is_invocable_v) - builder(compiler, args); + if constexpr (std::is_invocable_r_v) + { + if (!builder(compiler, args)) + return nullptr; + } else - builder(compiler); + { + builder(compiler, args); + } + rt.dump_name = name; const auto result = rt._add(&code); jit_announce(result, code.codeSize(), name); diff --git a/rpcs3/Emu/Cell/PPUInterpreter.cpp b/rpcs3/Emu/Cell/PPUInterpreter.cpp index 26ad65d79f..d658e0557c 100644 --- a/rpcs3/Emu/Cell/PPUInterpreter.cpp +++ b/rpcs3/Emu/Cell/PPUInterpreter.cpp @@ -111,13 +111,15 @@ struct ppu_exec_select #define RETURN(...) \ if constexpr (Build == 0) { \ static_cast(exec); \ - static const ppu_intrp_func_t f = build_function_asm("ppu_"s + __func__, [&](asmjit::ppu_builder& c) { \ + static const ppu_intrp_func_t f = build_function_asm("ppu_"s + __func__, [&](asmjit::ppu_builder& c, native_args&) { \ static ppu_opcode_t op{}; \ static ppu_abstract_t ppu; \ exec(__VA_ARGS__); \ c.ppu_ret(); \ + return !c.fail_flag; \ }); \ - return f; \ + if (f) return f; \ + RETURN_(__VA_ARGS__); \ } #else #define RETURN RETURN_ @@ -1019,7 +1021,7 @@ auto VADDUWS() } }; - RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb], ppu.sat); + RETURN(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb], ppu.sat); } template @@ -2074,7 +2076,7 @@ auto VNOR() d = gv_notfs(gv_orfs(std::move(a), std::move(b))); }; - RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]); + RETURN(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]); } template @@ -2100,7 +2102,7 @@ auto VPERM() #if defined (ARCH_X64) if constexpr (Build == 0) { - static const ppu_intrp_func_t f = build_function_asm("ppu_VPERM", [&](asmjit::ppu_builder& c) + static const ppu_intrp_func_t f = build_function_asm("ppu_VPERM", [&](asmjit::ppu_builder& c, native_args&) { const auto [v0, v1, v2, v3] = c.vec_alloc<4>(); c.movdqa(v0, c.ppu_vr(s_op.vc)); @@ -2374,17 +2376,12 @@ auto VRLB() if constexpr (Build == 0xf1a6) return ppu_exec_select::template select<>(); - static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { - auto& d = ppu.vr[op.vd]; - const auto& a = ppu.vr[op.va]; - const auto& b = ppu.vr[op.vb]; - - for (uint i = 0; i < 16; i++) + static const auto exec = [](auto&& d, auto&& a, auto&& b) { - d._u8[i] = utils::rol8(a._u8[i], b._u8[i]); - } + d = gv_rol8(std::move(a), std::move(b)); }; - RETURN_(ppu, op); + + RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]); } template @@ -2393,17 +2390,12 @@ auto VRLH() if constexpr (Build == 0xf1a6) return ppu_exec_select::template select<>(); - static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { - auto& d = ppu.vr[op.vd]; - const auto& a = ppu.vr[op.va]; - const auto& b = ppu.vr[op.vb]; - - for (uint i = 0; i < 8; i++) + static const auto exec = [](auto&& d, auto&& a, auto&& b) { - d._u16[i] = utils::rol16(a._u16[i], b._u8[i * 2] & 0xf); - } + d = gv_rol16(std::move(a), std::move(b)); }; - RETURN_(ppu, op); + + RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]); } template @@ -2412,17 +2404,12 @@ auto VRLW() if constexpr (Build == 0xf1a6) return ppu_exec_select::template select<>(); - static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { - auto& d = ppu.vr[op.vd]; - const auto& a = ppu.vr[op.va]; - const auto& b = ppu.vr[op.vb]; - - for (uint 
w = 0; w < 4; w++) + static const auto exec = [](auto&& d, auto&& a, auto&& b) { - d._u32[w] = utils::rol32(a._u32[w], b._u8[w * 4] & 0x1f); - } + d = gv_rol32(std::move(a), std::move(b)); }; - RETURN_(ppu, op); + + RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]); } template @@ -2447,15 +2434,13 @@ auto VSEL() if constexpr (Build == 0xf1a6) return ppu_exec_select::template select<>(); - static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { - auto& d = ppu.vr[op.vd]; - const auto& a = ppu.vr[op.va]; - const auto& b = ppu.vr[op.vb]; - const auto& c = ppu.vr[op.vc]; - - d = (b & c) | gv_andn(c, a); + static const auto exec = [](auto&& d, auto&& a, auto&& b, auto&& c) + { + auto x = gv_andfs(std::move(b), c); + d = gv_orfs(std::move(x), gv_andnfs(std::move(c), std::move(a))); }; - RETURN_(ppu, op); + + RETURN(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb], ppu.vr[op.vc]); } template @@ -2464,19 +2449,12 @@ auto VSL() if constexpr (Build == 0xf1a6) return ppu_exec_select::template select<>(); - static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { - auto& d = ppu.vr[op.vd]; - v128 VA = ppu.vr[op.va]; - u8 sh = ppu.vr[op.vb]._u8[0] & 0x7; - - d._u8[0] = VA._u8[0] << sh; - for (uint b = 1; b < 16; b++) + static const auto exec = [](auto&& d, auto&& a, auto&& b) { - sh = ppu.vr[op.vb]._u8[b] & 0x7; - d._u8[b] = (VA._u8[b] << sh) | (VA._u8[b - 1] >> (8 - sh)); - } + d = gv_fshl8(std::move(a), gv_shuffle_left<1>(a), std::move(b)); }; - RETURN_(ppu, op); + + RETURN(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]); } template @@ -2485,38 +2463,35 @@ auto VSLB() if constexpr (Build == 0xf1a6) return ppu_exec_select::template select<>(); - static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { - auto& d = ppu.vr[op.vd]; - const auto& a = ppu.vr[op.va]; - const auto& b = ppu.vr[op.vb]; - - for (uint i = 0; i < 16; i++) + static const auto exec = [](auto&& d, auto&& a, auto&& b) { - d._u8[i] = a._u8[i] << (b._u8[i] & 0x7); - } + d = gv_shl8(std::move(a), std::move(b)); }; - RETURN_(ppu, op); + + RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]); } -template -auto VSLDOI() +template +struct VSLDOI { - if constexpr (Build == 0xf1a6) - return ppu_exec_select::template select<>(); - - static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { - auto& d = ppu.vr[op.vd]; - u8 tmpSRC[32]; - std::memcpy(tmpSRC, &ppu.vr[op.vb], 16); - std::memcpy(tmpSRC + 16, &ppu.vr[op.va], 16); - - for (uint b = 0; b<16; b++) + template + static auto select(bs_t selected, auto func) { - d._u8[15 - b] = tmpSRC[31 - (b + op.vsh)]; + return ppu_exec_select<>::select(selected, func); } - }; - RETURN_(ppu, op); -} + + template + static auto impl() + { + static const auto exec = [](auto&& d, auto&& a, auto&& b) + { + d = gv_or32(gv_shuffle_left(std::move(a)), gv_shuffle_right<16 - Count>(std::move(b))); + }; + + RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]); + } +}; + template auto VSLH() @@ -2524,17 +2499,12 @@ auto VSLH() if constexpr (Build == 0xf1a6) return ppu_exec_select::template select<>(); - static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { - auto& d = ppu.vr[op.vd]; - const auto& a = ppu.vr[op.va]; - const auto& b = ppu.vr[op.vb]; - - for (uint h = 0; h < 8; h++) + static const auto exec = [](auto&& d, auto&& a, auto&& b) { - d._u16[h] = a._u16[h] << (b._u16[h] & 0xf); - } + d = gv_shl16(std::move(a), std::move(b)); }; - RETURN_(ppu, op); + + RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]); } template @@ -2543,19 +2513,12 @@ auto VSLO() if constexpr (Build == 
0xf1a6) return ppu_exec_select::template select<>(); - static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { - auto& d = ppu.vr[op.vd]; - v128 VA = ppu.vr[op.va]; - u8 nShift = (ppu.vr[op.vb]._u8[0] >> 3) & 0xf; - - d.clear(); - - for (u8 b = 0; b < 16 - nShift; b++) + static const auto exec = [](auto&& d, auto&& a, auto&& b) { - d._u8[15 - b] = VA._u8[15 - (b + nShift)]; - } + d._u = a._u << (b._u8[0] & 0x78); }; - RETURN_(ppu, op); + + RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]); } template @@ -2564,17 +2527,12 @@ auto VSLW() if constexpr (Build == 0xf1a6) return ppu_exec_select::template select<>(); - static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { - auto& d = ppu.vr[op.vd]; - const auto& a = ppu.vr[op.va]; - const auto& b = ppu.vr[op.vb]; - - for (uint w = 0; w < 4; w++) + static const auto exec = [](auto&& d, auto&& a, auto&& b) { - d._u32[w] = a._u32[w] << (b._u32[w] & 0x1f); - } + d = gv_shl32(std::move(a), std::move(b)); }; - RETURN_(ppu, op); + + RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]); } template @@ -2583,16 +2541,12 @@ auto VSPLTB() if constexpr (Build == 0xf1a6) return ppu_exec_select::template select<>(); - static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { - auto& d = ppu.vr[op.vd]; - u8 byte = ppu.vr[op.vb]._u8[15 - op.vuimm]; - - for (uint b = 0; b < 16; b++) + static const auto exec = [](auto&& d, auto&& b, auto&& imm) { - d._u8[b] = byte; - } + d = gv_bcst8(b.u8r[imm & 15]); }; - RETURN_(ppu, op); + + RETURN_(ppu.vr[op.vd], ppu.vr[op.vb], op.vuimm); } template @@ -2601,18 +2555,12 @@ auto VSPLTH() if constexpr (Build == 0xf1a6) return ppu_exec_select::template select<>(); - static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { - auto& d = ppu.vr[op.vd]; - ensure((op.vuimm < 8)); - - u16 hword = ppu.vr[op.vb]._u16[7 - op.vuimm]; - - for (uint h = 0; h < 8; h++) + static const auto exec = [](auto&& d, auto&& b, auto&& imm) { - d._u16[h] = hword; - } + d = gv_bcst16(b.u16r[imm & 7]); }; - RETURN_(ppu, op); + + RETURN_(ppu.vr[op.vd], ppu.vr[op.vb], op.vuimm); } template @@ -2621,16 +2569,12 @@ auto VSPLTISB() if constexpr (Build == 0xf1a6) return ppu_exec_select::template select<>(); - static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { - auto& d = ppu.vr[op.vd]; - const s8 imm = op.vsimm; - - for (uint b = 0; b < 16; b++) + static const auto exec = [](auto&& d, auto&& imm) { - d._u8[b] = imm; - } + d = gv_bcst8(imm); }; - RETURN_(ppu, op); + + RETURN_(ppu.vr[op.vd], op.vsimm); } template @@ -2639,16 +2583,12 @@ auto VSPLTISH() if constexpr (Build == 0xf1a6) return ppu_exec_select::template select<>(); - static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { - auto& d = ppu.vr[op.vd]; - const s16 imm = op.vsimm; - - for (uint h = 0; h < 8; h++) + static const auto exec = [](auto&& d, auto&& imm) { - d._u16[h] = imm; - } + d = gv_bcst16(imm); }; - RETURN_(ppu, op); + + RETURN_(ppu.vr[op.vd], op.vsimm); } template @@ -2657,16 +2597,12 @@ auto VSPLTISW() if constexpr (Build == 0xf1a6) return ppu_exec_select::template select<>(); - static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { - auto& d = ppu.vr[op.vd]; - const s32 imm = op.vsimm; - - for (uint w = 0; w < 4; w++) + static const auto exec = [](auto&& d, auto&& imm) { - d._u32[w] = imm; - } + d = gv_bcst32(imm); }; - RETURN_(ppu, op); + + RETURN_(ppu.vr[op.vd], op.vsimm); } template @@ -2675,18 +2611,12 @@ auto VSPLTW() if constexpr (Build == 0xf1a6) return ppu_exec_select::template select<>(); - static const auto exec = 
[](ppu_thread& ppu, ppu_opcode_t op) { - auto& d = ppu.vr[op.vd]; - ensure((op.vuimm < 4)); - - u32 word = ppu.vr[op.vb]._u32[3 - op.vuimm]; - - for (uint w = 0; w < 4; w++) + static const auto exec = [](auto&& d, auto&& b, auto&& imm) { - d._u32[w] = word; - } + d = gv_bcst32(b.u32r[imm & 3]); }; - RETURN_(ppu, op); + + RETURN_(ppu.vr[op.vd], ppu.vr[op.vb], op.vuimm); } template @@ -2695,19 +2625,12 @@ auto VSR() if constexpr (Build == 0xf1a6) return ppu_exec_select::template select<>(); - static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { - auto& d = ppu.vr[op.vd]; - v128 VA = ppu.vr[op.va]; - u8 sh = ppu.vr[op.vb]._u8[15] & 0x7; - - d._u8[15] = VA._u8[15] >> sh; - for (uint b = 14; ~b; b--) + static const auto exec = [](auto&& d, auto&& a, auto&& b) { - sh = ppu.vr[op.vb]._u8[b] & 0x7; - d._u8[b] = (VA._u8[b] >> sh) | (VA._u8[b + 1] << (8 - sh)); - } + d = gv_fshr8(gv_shuffle_right<1>(a), std::move(a), std::move(b)); }; - RETURN_(ppu, op); + + RETURN(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]); } template @@ -2716,17 +2639,12 @@ auto VSRAB() if constexpr (Build == 0xf1a6) return ppu_exec_select::template select<>(); - static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { - auto& d = ppu.vr[op.vd]; - const auto& a = ppu.vr[op.va]; - const auto& b = ppu.vr[op.vb]; - - for (uint i = 0; i < 16; i++) + static const auto exec = [](auto&& d, auto&& a, auto&& b) { - d._s8[i] = a._s8[i] >> (b._u8[i] & 0x7); - } + d = gv_sar8(std::move(a), std::move(b)); }; - RETURN_(ppu, op); + + RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]); } template @@ -2735,17 +2653,12 @@ auto VSRAH() if constexpr (Build == 0xf1a6) return ppu_exec_select::template select<>(); - static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { - auto& d = ppu.vr[op.vd]; - const auto& a = ppu.vr[op.va]; - const auto& b = ppu.vr[op.vb]; - - for (uint h = 0; h < 8; h++) + static const auto exec = [](auto&& d, auto&& a, auto&& b) { - d._s16[h] = a._s16[h] >> (b._u16[h] & 0xf); - } + d = gv_sar16(std::move(a), std::move(b)); }; - RETURN_(ppu, op); + + RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]); } template @@ -2754,17 +2667,12 @@ auto VSRAW() if constexpr (Build == 0xf1a6) return ppu_exec_select::template select<>(); - static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { - auto& d = ppu.vr[op.vd]; - const auto& a = ppu.vr[op.va]; - const auto& b = ppu.vr[op.vb]; - - for (uint w = 0; w < 4; w++) + static const auto exec = [](auto&& d, auto&& a, auto&& b) { - d._s32[w] = a._s32[w] >> (b._u32[w] & 0x1f); - } + d = gv_sar32(std::move(a), std::move(b)); }; - RETURN_(ppu, op); + + RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]); } template @@ -2773,17 +2681,12 @@ auto VSRB() if constexpr (Build == 0xf1a6) return ppu_exec_select::template select<>(); - static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { - auto& d = ppu.vr[op.vd]; - const auto& a = ppu.vr[op.va]; - const auto& b = ppu.vr[op.vb]; - - for (uint i = 0; i < 16; i++) + static const auto exec = [](auto&& d, auto&& a, auto&& b) { - d._u8[i] = a._u8[i] >> (b._u8[i] & 0x7); - } + d = gv_shr8(std::move(a), std::move(b)); }; - RETURN_(ppu, op); + + RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]); } template @@ -2792,17 +2695,12 @@ auto VSRH() if constexpr (Build == 0xf1a6) return ppu_exec_select::template select<>(); - static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { - auto& d = ppu.vr[op.vd]; - const auto& a = ppu.vr[op.va]; - const auto& b = ppu.vr[op.vb]; - - for (uint h = 0; h < 8; h++) + 
static const auto exec = [](auto&& d, auto&& a, auto&& b) { - d._u16[h] = a._u16[h] >> (b._u16[h] & 0xf); - } + d = gv_shr16(std::move(a), std::move(b)); }; - RETURN_(ppu, op); + + RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]); } template @@ -2811,19 +2709,12 @@ auto VSRO() if constexpr (Build == 0xf1a6) return ppu_exec_select::template select<>(); - static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { - auto& d = ppu.vr[op.vd]; - v128 VA = ppu.vr[op.va]; - u8 nShift = (ppu.vr[op.vb]._u8[0] >> 3) & 0xf; - - d.clear(); - - for (u8 b = 0; b < 16 - nShift; b++) + static const auto exec = [](auto&& d, auto&& a, auto&& b) { - d._u8[b] = VA._u8[b + nShift]; - } + d._u = a._u >> (b._u8[0] & 0x78); }; - RETURN_(ppu, op); + + RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]); } template @@ -2832,17 +2723,12 @@ auto VSRW() if constexpr (Build == 0xf1a6) return ppu_exec_select::template select<>(); - static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { - auto& d = ppu.vr[op.vd]; - const auto& a = ppu.vr[op.va]; - const auto& b = ppu.vr[op.vb]; - - for (uint w = 0; w < 4; w++) + static const auto exec = [](auto&& d, auto&& a, auto&& b) { - d._u32[w] = a._u32[w] >> (b._u32[w] & 0x1f); - } + d = gv_shr32(std::move(a), std::move(b)); }; - RETURN_(ppu, op); + + RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]); } template @@ -3184,30 +3070,14 @@ auto VUPKHPX() if constexpr (Build == 0xf1a6) return ppu_exec_select::template select<>(); -#if defined(ARCH_X64_0) - static const auto make = [](asmjit::ppu_builder& c) - { - const auto [v0, v1, v2] = c.vec_alloc<3>(); - EMIT(punpckhwd, v0, v0, c.ppu_vr(s_op.vb)); - EMIT(psrad, v0, v0, c.imm(16)); - EMIT(pslld, v1, v0, c.imm(6)); - EMIT(pslld, v2, v0, c.imm(3)); - BCST(pand, d, v0, v0, c.get_bcst(0xff00001f)); - BCST(pand, d, v1, v1, c.get_bcst(0x1f0000)); - BCST(pand, d, v2, v2, c.get_bcst(0x1f00)); - EMIT(por, v0, v0, v1); - EMIT(por, v0, v0, v2); - LDST(movaps, c.ppu_vr(s_op.vd, true), v0); - c.ppu_ret(); - }; -#endif static const auto exec = [](auto&& d, auto&& b) { - const auto x = gv_extend_hi_s16(b); - d = gv_and32(x, gv_bcst32(0xff00001f)) | gv_and32(gv_shl32(x, 6), gv_bcst32(0x1f0000)) | gv_and32(gv_shl32(x, 3), gv_bcst32(0x1f00)); + auto x = gv_extend_hi_s16(std::move(b)); + auto y = gv_or32(gv_and32(gv_shl32(x, 6), gv_bcst32(0x1f0000)), gv_and32(gv_shl32(x, 3), gv_bcst32(0x1f00))); + d = gv_or32(std::move(y), gv_and32(std::move(x), gv_bcst32(0xff00001f))); }; - RETURN_(ppu.vr[op.vd], ppu.vr[op.vb]); + RETURN(ppu.vr[op.vd], ppu.vr[op.vb]); } template @@ -3216,22 +3086,12 @@ auto VUPKHSB() if constexpr (Build == 0xf1a6) return ppu_exec_select::template select<>(); -#if defined(ARCH_X64_0) - static const auto make = [](asmjit::ppu_builder& c) - { - const auto v0 = c.vec_alloc(); - EMIT(punpckhbw, v0, v0, c.ppu_vr(s_op.vb)); - EMIT(psraw, v0, v0, c.imm(8)); - LDST(movaps, c.ppu_vr(s_op.vd, true), v0); - c.ppu_ret(); - }; -#endif static const auto exec = [](auto&& d, auto&& b) { - d = gv_extend_hi_s8(b); + d = gv_extend_hi_s8(std::move(b)); }; - RETURN_(ppu.vr[op.vd], ppu.vr[op.vb]); + RETURN(ppu.vr[op.vd], ppu.vr[op.vb]); } template @@ -3240,22 +3100,12 @@ auto VUPKHSH() if constexpr (Build == 0xf1a6) return ppu_exec_select::template select<>(); -#if defined(ARCH_X64_0) - static const auto make = [](asmjit::ppu_builder& c) - { - const auto v0 = c.vec_alloc(); - EMIT(punpckhwd, v0, v0, c.ppu_vr(s_op.vb)); - EMIT(psrad, v0, v0, c.imm(16)); - LDST(movaps, c.ppu_vr(s_op.vd, true), v0); - c.ppu_ret(); - }; -#endif static const 
auto exec = [](auto&& d, auto&& b) { - d = gv_extend_hi_s16(b); + d = gv_extend_hi_s16(std::move(b)); }; - RETURN_(ppu.vr[op.vd], ppu.vr[op.vb]); + RETURN(ppu.vr[op.vd], ppu.vr[op.vb]); } template @@ -3264,37 +3114,14 @@ auto VUPKLPX() if constexpr (Build == 0xf1a6) return ppu_exec_select::template select<>(); -#if defined(ARCH_X64_0) - static const auto make = [](asmjit::ppu_builder& c) - { - const auto [v0, v1, v2] = c.vec_alloc<3>(); - if (utils::has_sse41()) - { - LDST(pmovsxwd, v0, c.ppu_vr<8>(s_op.vb)); - } - else - { - EMIT(punpcklwd, v0, v0, c.ppu_vr(s_op.vb)); - EMIT(psrad, v0, v0, c.imm(16)); - } - EMIT(pslld, v1, v0, c.imm(6)); - EMIT(pslld, v2, v0, c.imm(3)); - BCST(pand, d, v0, v0, c.get_bcst(0xff00001f)); - BCST(pand, d, v1, v1, c.get_bcst(0x1f0000)); - BCST(pand, d, v2, v2, c.get_bcst(0x1f00)); - EMIT(por, v0, v0, v1); - EMIT(por, v0, v0, v2); - LDST(movaps, c.ppu_vr(s_op.vd, true), v0); - c.ppu_ret(); - }; -#endif static const auto exec = [](auto&& d, auto&& b) { - const auto x = gv_extend_lo_s16(b); - d = gv_and32(x, gv_bcst32(0xff00001f)) | gv_and32(gv_shl32(x, 6), gv_bcst32(0x1f0000)) | gv_and32(gv_shl32(x, 3), gv_bcst32(0x1f00)); + auto x = gv_extend_lo_s16(std::move(b)); + auto y = gv_or32(gv_and32(gv_shl32(x, 6), gv_bcst32(0x1f0000)), gv_and32(gv_shl32(x, 3), gv_bcst32(0x1f00))); + d = gv_or32(std::move(y), gv_and32(std::move(x), gv_bcst32(0xff00001f))); }; - RETURN_(ppu.vr[op.vd], ppu.vr[op.vb]); + RETURN(ppu.vr[op.vd], ppu.vr[op.vb]); } template @@ -3303,29 +3130,12 @@ auto VUPKLSB() if constexpr (Build == 0xf1a6) return ppu_exec_select::template select<>(); -#if defined(ARCH_X64_0) - static const auto make = [](asmjit::ppu_builder& c) - { - const auto v0 = c.vec_alloc(); - if (utils::has_sse41()) - { - LDST(pmovsxbw, v0, c.ppu_vr<8>(s_op.vb)); - } - else - { - EMIT(punpcklbw, v0, v0, c.ppu_vr(s_op.vb)); - EMIT(psraw, v0, v0, c.imm(8)); - } - LDST(movaps, c.ppu_vr(s_op.vd, true), v0); - c.ppu_ret(); - }; -#endif static const auto exec = [](auto&& d, auto&& b) { - d = gv_extend_lo_s8(b); + d = gv_extend_lo_s8(std::move(b)); }; - RETURN_(ppu.vr[op.vd], ppu.vr[op.vb]); + RETURN(ppu.vr[op.vd], ppu.vr[op.vb]); } template @@ -3334,29 +3144,12 @@ auto VUPKLSH() if constexpr (Build == 0xf1a6) return ppu_exec_select::template select<>(); -#if defined(ARCH_X64_0) - static const auto make = [](asmjit::ppu_builder& c) - { - const auto v0 = c.vec_alloc(); - if (utils::has_sse41()) - { - LDST(pmovsxwd, v0, c.ppu_vr<8>(s_op.vb)); - } - else - { - EMIT(punpcklwd, v0, v0, c.ppu_vr(s_op.vb)); - EMIT(psrad, v0, v0, c.imm(16)); - } - LDST(movaps, c.ppu_vr(s_op.vd, true), v0); - c.ppu_ret(); - }; -#endif static const auto exec = [](auto&& d, auto&& b) { - d = gv_extend_lo_s16(b); + d = gv_extend_lo_s16(std::move(b)); }; - RETURN_(ppu.vr[op.vd], ppu.vr[op.vb]); + RETURN(ppu.vr[op.vd], ppu.vr[op.vb]); } template @@ -7157,7 +6950,8 @@ struct ppu_interpreter_t IT VSEL; IT VSL; IT VSLB; - IT VSLDOI; + IT VSLDOI{}; + IT VSLDOI_[16]; IT VSLH; IT VSLO; IT VSLW; @@ -7629,6 +7423,27 @@ ppu_interpreter_rt_base::ppu_interpreter_rt_base() noexcept return ::name<0, Flags...>(); \ }); \ +#define INIT_ONE(name, bits) \ + ptrs->name##_[0b##bits] = ::name<0b##bits>::select(selected, []() { \ + return ::name<0b##bits>::impl<0, Flags...>(); \ + }); \ + +#define INIT_PACK2(name, bits) \ + INIT_ONE(name, bits##0) \ + INIT_ONE(name, bits##1) \ + +#define INIT_PACK4(name, bits) \ + INIT_PACK2(name, bits##0) \ + INIT_PACK2(name, bits##1) \ + +#define INIT_PACK8(name, bits) \ + INIT_PACK4(name, bits##0) \ + 
INIT_PACK4(name, bits##1) \ + +#define INIT_PACK16(name, bits) \ + INIT_PACK8(name, bits##0) \ + INIT_PACK8(name, bits##1) \ + INIT(MFVSCR); INIT(MTVSCR); INIT(VADDCUW); @@ -7732,7 +7547,7 @@ ppu_interpreter_rt_base::ppu_interpreter_rt_base() noexcept INIT(VSEL); INIT(VSL); INIT(VSLB); - INIT(VSLDOI); + INIT_PACK16(VSLDOI,); INIT(VSLH); INIT(VSLO); INIT(VSLW); @@ -8051,6 +7866,7 @@ ppu_intrp_func_t ppu_interpreter_rt::decode(u32 opv) const noexcept break; } + case ppu_itype::VSLDOI: return ptrs->VSLDOI_[op.vsh]; default: break; } diff --git a/rpcs3/util/asm.hpp b/rpcs3/util/asm.hpp index 5a989a0a60..5efe3c0fbb 100644 --- a/rpcs3/util/asm.hpp +++ b/rpcs3/util/asm.hpp @@ -193,7 +193,7 @@ namespace utils #elif defined(__clang__) return __builtin_rotateleft32(x, n); #else - return (x << n) | (x >> (32 - n)); + return (x << (n & 31)) | (x >> (((0 - n) & 31))); #endif } @@ -209,7 +209,7 @@ namespace utils #elif defined(__clang__) return __builtin_rotateleft64(x, n); #else - return (x << n) | (x >> (64 - n)); + return (x << (n & 63)) | (x >> (((0 - n) & 63))); #endif } diff --git a/rpcs3/util/simd.hpp b/rpcs3/util/simd.hpp index c3c1b52ff8..d04ab111db 100644 --- a/rpcs3/util/simd.hpp +++ b/rpcs3/util/simd.hpp @@ -3,6 +3,7 @@ #include "util/types.hpp" #include "util/v128.hpp" #include "util/sysinfo.hpp" +#include "util/asm.hpp" #include "Utilities/JIT.h" #if defined(ARCH_X64) @@ -40,6 +41,7 @@ namespace asmjit #else struct gpr_type : Operand { + gpr_type() = default; gpr_type(u32) { } @@ -47,6 +49,7 @@ namespace asmjit struct vec_type : Operand { + vec_type() = default; vec_type(u32) { } @@ -82,7 +85,7 @@ namespace asmjit template > constexpr arg_class arg_classify = - std::is_base_of_v ? arg_class::imm_lv + !std::is_reference_v : + std::is_same_v ? arg_class::imm_lv + !std::is_reference_v : std::is_base_of_v ? arg_class::mem_lv : std::is_base_of_v ? arg_class::mem_lv + !std::is_reference_v : std::is_reference_v ? arg_class::reg_lv : arg_class::reg_rv; @@ -91,6 +94,8 @@ namespace asmjit { using base = native_asm; + bool fail_flag = false; + vec_builder(CodeHolder* ch) : native_asm(ch) { @@ -150,6 +155,9 @@ namespace asmjit std::unordered_map consts[16]{}; +#if defined(ARCH_X64) + std::unordered_map const_allocs{}; + template x86::Mem get_const(const T& data, u32 esize = Size) { @@ -180,14 +188,97 @@ namespace asmjit return x86::Mem(_label, 0, Size); } +#endif + }; + + struct free_on_exit + { + Operand x{}; + + free_on_exit() = default; + free_on_exit(const free_on_exit&) = delete; + free_on_exit& operator=(const free_on_exit&) = delete; + + ~free_on_exit() + { + if (x.isReg()) + { + vec_type v; + v.copyFrom(x); + g_vc->vec_dealloc(v); + } + } }; #if defined(ARCH_X64) - inline auto arg_eval(const v128& _c, u32 esize) + inline Operand arg_eval(v128& _c, u32 esize) { - // TODO: implement PSHUFD broadcasts and AVX ones - auto r = g_vc->get_const(_c, esize); - return r; + const auto found = g_vc->const_allocs.find(_c); + + if (found != g_vc->const_allocs.end()) + { + return found->second; + } + + vec_type reg = g_vc->vec_alloc(); + + // TODO: PSHUFD style broadcast? 
Needs known const layout + if (utils::has_avx() && _c._u64[0] == _c._u64[1]) + { + if (_c._u32[0] == _c._u32[1]) + { + if (utils::has_avx2() && _c._u16[0] == _c._u16[1]) + { + if (_c._u8[0] == _c._u8[1]) + { + ensure(!g_vc->vpbroadcastb(reg, g_vc->get_const(_c._u8[0]))); + } + else + { + ensure(!g_vc->vpbroadcastw(reg, g_vc->get_const(_c._u16[0]))); + } + } + else + { + ensure(!g_vc->vbroadcastss(reg, g_vc->get_const(_c._u32[0]))); + } + } + else + { + ensure(!g_vc->vbroadcastsd(reg, g_vc->get_const(_c._u32[0]))); + } + } + else if (!_c._u) + { + ensure(!g_vc->pxor(reg, reg)); + } + else if (!~_c._u) + { + ensure(!g_vc->pcmpeqd(reg, reg)); + } + else + { + ensure(!g_vc->movaps(reg, g_vc->get_const(_c, esize))); + } + + g_vc->const_allocs.emplace(_c, reg); + return reg; + } + + inline Operand arg_eval(v128&& _c, u32 esize) + { + const auto found = g_vc->const_allocs.find(_c); + + if (found != g_vc->const_allocs.end()) + { + vec_type r = found->second; + g_vc->const_allocs.erase(found); + g_vc->vec_dealloc(r); + return r; + } + + // Hack: assume can use mem op (TODO) + return g_vc->get_const(_c, esize); } template requires(std::is_base_of_v>) @@ -211,12 +302,24 @@ namespace asmjit return std::move(mem); } + inline void arg_free(const v128&) + { + } + + inline void arg_free(const Operand& op) + { + if (op.isReg()) + { + g_vc->vec_dealloc(vec_type{op.id()}); + } + } + template inline bool arg_use_evex(const auto& op) { constexpr auto _class = arg_classify; if constexpr (_class == arg_class::imm_rv) - return true; + return g_vc->const_allocs.count(op) == 0; else if constexpr (_class == arg_class::imm_lv) return false; else if (op.isMem()) @@ -302,6 +405,7 @@ namespace asmjit template vec_type binary_op(u32 esize, x86::Inst::Id mov_op, x86::Inst::Id sse_op, x86::Inst::Id avx_op, x86::Inst::Id evex_op, A&& a, B&& b, Args&&... 
args) { + free_on_exit e; Operand src1{}; if constexpr (arg_classify == arg_class::reg_rv) @@ -317,12 +421,13 @@ namespace asmjit if constexpr (arg_classify == arg_class::reg_rv) { - g_vc->vec_dealloc(vec_type{b.id()}); - //b = Operand(); + e.x = b; } } else if (utils::has_avx() && avx_op && (arg_classify == arg_class::reg_lv || arg_classify == arg_class::mem_lv)) { + Operand srca = arg_eval(std::forward(a), 16); + if constexpr (arg_classify == arg_class::reg_lv) { if constexpr (arg_classify == arg_class::reg_rv) @@ -336,47 +441,79 @@ namespace asmjit src1 = g_vc->vec_alloc(); } } - else // if A == arg_class::reg_rv + else { src1 = g_vc->vec_alloc(); - if (!a.isReg()) - { - static_cast(arg_eval(std::forward(a), 16)); - } - if constexpr (arg_classify == arg_class::reg_rv) { - g_vc->vec_dealloc(vec_type{b.id()}); - //b = Operand(); + e.x = b; } } if (utils::has_avx512() && evex_op && arg_use_evex(b)) { - ensure(!g_vc->evex().emit(evex_op, src1, vec_type{a.id()}, arg_eval(std::forward(b), esize), std::forward(args)...)); + ensure(!g_vc->evex().emit(evex_op, src1, srca, arg_eval(std::forward(b), esize), std::forward(args)...)); return vec_type{src1.id()}; } - ensure(!g_vc->emit(avx_op, src1, vec_type{a.id()}, arg_eval(std::forward(b), 16), std::forward(args)...)); + ensure(!g_vc->emit(avx_op, src1, srca, arg_eval(std::forward(b), 16), std::forward(args)...)); return vec_type{src1.id()}; } else do { - if constexpr (arg_classify == arg_class::reg_rv) + if constexpr (arg_classify == arg_class::mem_rv) { - g_vc->vec_dealloc(vec_type{b.id()}); - //b = Operand(); + if (a.isReg()) + { + src1 = vec_type(a.id()); + + if constexpr (arg_classify == arg_class::reg_rv) + { + e.x = b; + } + break; + } } - if (arg_classify == arg_class::mem_rv && a.isReg()) + if constexpr (arg_classify == arg_class::imm_rv) { - src1 = vec_type(a.id()); - break; + if (auto found = g_vc->const_allocs.find(a); found != g_vc->const_allocs.end()) + { + src1 = found->second; + g_vc->const_allocs.erase(found); + + if constexpr (arg_classify == arg_class::reg_rv) + { + e.x = b; + } + break; + } } src1 = g_vc->vec_alloc(); + if constexpr (arg_classify == arg_class::reg_rv) + { + e.x = b; + } + + if constexpr (arg_classify == arg_class::imm_rv) + { + if (!a._u) + { + // All zeros + ensure(!g_vc->emit(x86::Inst::kIdPxor, src1, src1)); + break; + } + else if (!~a._u) + { + // All ones + ensure(!g_vc->emit(x86::Inst::kIdPcmpeqd, src1, src1)); + break; + } + } + // Fallback to arg copy ensure(!g_vc->emit(mov_op, src1, arg_eval(std::forward(a), 16))); } @@ -404,10 +541,14 @@ namespace asmjit } inline v128 gv_select8(const v128& _cmp, const v128& _true, const v128& _false); +inline v128 gv_signselect8(const v128& bits, const v128& _true, const v128& _false); inline v128 gv_select16(const v128& _cmp, const v128& _true, const v128& _false); inline v128 gv_select32(const v128& _cmp, const v128& _true, const v128& _false); inline v128 gv_selectfs(const v128& _cmp, const v128& _true, const v128& _false); +template requires (asmjit::any_operand_v) +inline asmjit::vec_type gv_gts32(A&&, B&&); + inline void gv_set_zeroing_denormals() { #if defined(ARCH_X64) @@ -704,6 +845,16 @@ inline v128 gv_not32(const v128& a) #endif } +template requires (asmjit::any_operand_v) +inline auto gv_not32(A&& a) +{ +#if defined(ARCH_X64) + asmjit::vec_type ones = g_vc->vec_alloc(); + g_vc->pcmpeqd(ones, ones); + FOR_X64(binary_op, 4, kIdMovdqa, kIdPxor, kIdVpxor, kIdVpxord, std::move(ones), std::forward(a)); +#endif +} + inline v128 gv_notfs(const v128& a) { #if 
defined(ARCH_X64) @@ -713,6 +864,16 @@ inline v128 gv_notfs(const v128& a) #endif } +template requires (asmjit::any_operand_v) +inline auto gv_notfs(A&& a) +{ +#if defined(ARCH_X64) + asmjit::vec_type ones = g_vc->vec_alloc(); + g_vc->pcmpeqd(ones, ones); + FOR_X64(binary_op, 4, kIdMovaps, kIdXorps, kIdVxorps, kIdVxorps, std::move(ones), std::forward(a)); +#endif +} + inline v128 gv_shl16(const v128& a, u32 count) { if (count >= 16) @@ -724,7 +885,7 @@ inline v128 gv_shl16(const v128& a, u32 count) #endif } -template requires(asmjit::any_operand_v) +template requires (asmjit::any_operand_v) inline auto gv_shl16(A&& a, u32 count) { FOR_X64(unary_op, kIdPsllw, kIdVpsllw, std::forward(a), count); @@ -741,7 +902,7 @@ inline v128 gv_shl32(const v128& a, u32 count) #endif } -template requires(asmjit::any_operand_v) +template requires (asmjit::any_operand_v) inline auto gv_shl32(A&& a, u32 count) { FOR_X64(unary_op, kIdPslld, kIdVpslld, std::forward(a), count); @@ -758,7 +919,7 @@ inline v128 gv_shl64(const v128& a, u32 count) #endif } -template requires(asmjit::any_operand_v) +template requires (asmjit::any_operand_v) inline auto gv_shl64(A&& a, u32 count) { FOR_X64(unary_op, kIdPsllq, kIdVpsllq, std::forward(a), count); @@ -775,7 +936,7 @@ inline v128 gv_shr16(const v128& a, u32 count) #endif } -template requires(asmjit::any_operand_v) +template requires (asmjit::any_operand_v) inline auto gv_shr16(A&& a, u32 count) { FOR_X64(unary_op, kIdPsrlw, kIdVpsrlw, std::forward(a), count); @@ -792,7 +953,7 @@ inline v128 gv_shr32(const v128& a, u32 count) #endif } -template requires(asmjit::any_operand_v) +template requires (asmjit::any_operand_v) inline auto gv_shr32(A&& a, u32 count) { FOR_X64(unary_op, kIdPsrld, kIdVpsrld, std::forward(a), count); @@ -809,7 +970,7 @@ inline v128 gv_shr64(const v128& a, u32 count) #endif } -template requires(asmjit::any_operand_v) +template requires (asmjit::any_operand_v) inline auto gv_shr64(A&& a, u32 count) { FOR_X64(unary_op, kIdPsrlq, kIdVpsrlq, std::forward(a), count); @@ -826,7 +987,7 @@ inline v128 gv_sar16(const v128& a, u32 count) #endif } -template requires(asmjit::any_operand_v) +template requires (asmjit::any_operand_v) inline auto gv_sar16(A&& a, u32 count) { FOR_X64(unary_op, kIdPsraw, kIdVpsraw, std::forward(a), count); @@ -843,7 +1004,7 @@ inline v128 gv_sar32(const v128& a, u32 count) #endif } -template requires(asmjit::any_operand_v) +template requires (asmjit::any_operand_v) inline auto gv_sar32(A&& a, u32 count) { FOR_X64(unary_op, kIdPsrad, kIdVpsrad, std::forward(a), count); @@ -867,6 +1028,20 @@ inline v128 gv_sar64(const v128& a, u32 count) #endif } +template requires (asmjit::any_operand_v) +inline auto gv_sar64(A&& a, u32 count) +{ + if (count >= 64) + count = 63; +#if defined(ARCH_X64) + using enum asmjit::x86::Inst::Id; + if (utils::has_avx512()) + return asmjit::unary_op(kIdNone, kIdVpsraq, std::forward(a), count); + g_vc->fail_flag = true; + return std::forward(a); +#endif +} + inline v128 gv_add8(const v128& a, const v128& b) { #if defined(ARCH_X64) @@ -1025,6 +1200,20 @@ inline v128 gv_addus_u32(const v128& a, const v128& b) #endif } +template requires (asmjit::any_operand_v) +inline asmjit::vec_type gv_addus_u32(A&& a, B&& b) +{ +#if defined(ARCH_X64) + if (utils::has_sse41()) + return gv_add32(gv_minu32(std::forward(b), gv_not32(a)), std::forward(a)); + auto s = gv_add32(a, b); + auto x = gv_xor32(std::forward(b), gv_bcst32(0x80000000)); + auto y = gv_xor32(std::forward(a), gv_bcst32(0x7fffffff)); + return gv_or32(std::move(s), 
gv_gts32(std::move(x), std::move(y))); +#endif + return {}; +} + inline v128 gv_addfs(const v128& a, const v128& b) { #if defined(ARCH_X64) @@ -1052,6 +1241,12 @@ inline v128 gv_sub8(const v128& a, const v128& b) #endif } +template requires (asmjit::any_operand_v) +inline auto gv_sub8(A&& a, B&& b) +{ + FOR_X64(binary_op, 1, kIdMovdqa, kIdPsubb, kIdVpsubb, kIdNone, std::forward(a), std::forward(b)); +} + inline v128 gv_sub16(const v128& a, const v128& b) { #if defined(ARCH_X64) @@ -1265,6 +1460,21 @@ inline v128 gv_minu32(const v128& a, const v128& b) #endif } +template requires (asmjit::any_operand_v) +inline asmjit::vec_type gv_minu32(A&& a, B&& b) +{ +#if defined(ARCH_X64) + if (utils::has_sse41()) + FOR_X64(binary_op, 4, kIdMovdqa, kIdPminud, kIdVpminud, kIdVpminud, std::forward(a), std::forward(b)); + auto s = gv_bcst32(0x80000000); + auto x = gv_xor32(a, s); + auto m = gv_gts32(std::move(x), gv_xor32(std::move(s), b)); + auto z = gv_and32(m, std::move(b)); + return gv_or32(std::move(z), gv_andn32(std::move(m), std::move(a))); +#endif + return {}; +} + inline v128 gv_mins8(const v128& a, const v128& b) { #if defined(__SSE4_1__) @@ -1493,6 +1703,13 @@ inline v128 gv_gts8(const v128& a, const v128& b) #endif } +template requires (asmjit::any_operand_v) +inline asmjit::vec_type gv_gts8(A&& a, B&& b) +{ + FOR_X64(binary_op, 1, kIdMovdqa, kIdPcmpgtb, kIdVpcmpgtb, kIdNone, std::forward(a), std::forward(b)); + return {}; +} + inline v128 gv_gts16(const v128& a, const v128& b) { #if defined(ARCH_X64) @@ -1511,6 +1728,13 @@ inline v128 gv_gts32(const v128& a, const v128& b) #endif } +template requires (asmjit::any_operand_v) +inline asmjit::vec_type gv_gts32(A&& a, B&& b) +{ + FOR_X64(binary_op, 4, kIdMovdqa, kIdPcmpgtd, kIdVpcmpgtd, kIdNone, std::forward(a), std::forward(b)); + return {}; +} + inline v128 gv_avgu8(const v128& a, const v128& b) { #if defined(ARCH_X64) @@ -2154,7 +2378,7 @@ inline v128 gv_andn(const v128& a, const v128& b) } // Select elements; _cmp must be result of SIMD comparison; undefined otherwise -inline v128 gv_select8(const v128& _cmp, const v128& _true, const v128& _false) +FORCE_INLINE v128 gv_select8(const v128& _cmp, const v128& _true, const v128& _false) { #if defined(__SSE4_1__) return _mm_blendv_epi8(_false, _true, _cmp); @@ -2165,6 +2389,45 @@ inline v128 gv_select8(const v128& _cmp, const v128& _true, const v128& _false) #endif } +// Select elements using sign bit only +FORCE_INLINE v128 gv_signselect8(const v128& bits, const v128& _true, const v128& _false) +{ +#if defined(__SSE4_1__) + return _mm_blendv_epi8(_false, _true, bits); +#else + return gv_select8(gv_gts8(gv_bcst8(0), bits), _true, _false); +#endif +} + +template requires (asmjit::any_operand_v) +inline asmjit::vec_type gv_signselect8(A&& bits, B&& _true, C&& _false) +{ + using namespace asmjit; +#if defined(ARCH_X64) + if (utils::has_avx()) + { + Operand arg0{}; + Operand arg1 = arg_eval(std::forward(bits), 16); + Operand arg2 = arg_eval(std::forward(_true), 16); + Operand arg3 = arg_eval(std::forward(_false), 16); + if constexpr (!std::is_reference_v) + arg0.isReg() ? arg_free(bits) : arg0.copyFrom(arg1); + if constexpr (!std::is_reference_v) + arg0.isReg() ? arg_free(_true) : arg0.copyFrom(arg2); + if constexpr (!std::is_reference_v) + arg0.isReg() ? 
arg_free(_false) : arg0.copyFrom(arg3); + if (arg0.isNone()) + arg0 = g_vc->vec_alloc(); + g_vc->emit(x86::Inst::kIdVpblendvb, arg0, arg3, arg2, arg1); + vec_type r; + r.copyFrom(arg0); + return r; + } +#endif + g_vc->fail_flag = true; + return vec_type{0}; +} + // Select elements; _cmp must be result of SIMD comparison; undefined otherwise inline v128 gv_select16(const v128& _cmp, const v128& _true, const v128& _false) { @@ -2305,6 +2568,17 @@ inline v128 gv_extend_lo_s8(const v128& vec) #endif } +template requires (asmjit::any_operand_v) +inline auto gv_extend_lo_s8(A&& a) +{ +#if defined(ARCH_X64) + using enum asmjit::x86::Inst::Id; + if (utils::has_sse41()) + return asmjit::unary_op(kIdPmovsxbw, kIdVpmovsxbw, std::forward(a)); + return asmjit::unary_op(kIdPsraw, kIdVpsraw, asmjit::unary_op(kIdNone, kIdPunpcklbw, std::forward(a)), 8); +#endif +} + inline v128 gv_extend_hi_s8(const v128& vec) { #if defined(__SSE4_1__) @@ -2316,6 +2590,15 @@ inline v128 gv_extend_hi_s8(const v128& vec) #endif } +template requires (asmjit::any_operand_v) +inline auto gv_extend_hi_s8(A&& a) +{ +#if defined(ARCH_X64) + using enum asmjit::x86::Inst::Id; + return asmjit::unary_op(kIdPsraw, kIdVpsraw, asmjit::unary_op(kIdNone, kIdPunpckhbw, std::forward(a)), 8); +#endif +} + inline v128 gv_unpacklo16(const v128& lows, const v128& highs) { #if defined(ARCH_X64) @@ -2336,6 +2619,17 @@ inline v128 gv_extend_lo_s16(const v128& vec) #endif } +template requires (asmjit::any_operand_v) +inline auto gv_extend_lo_s16(A&& a) +{ +#if defined(ARCH_X64) + using enum asmjit::x86::Inst::Id; + if (utils::has_sse41()) + return asmjit::unary_op(kIdPmovsxwd, kIdVpmovsxwd, std::forward(a)); + return asmjit::unary_op(kIdPsrad, kIdVpsrad, asmjit::unary_op(kIdNone, kIdPunpcklwd, std::forward(a)), 16); +#endif +} + inline v128 gv_extend_hi_s16(const v128& vec) { #if defined(__SSE4_1__) @@ -2347,6 +2641,15 @@ inline v128 gv_extend_hi_s16(const v128& vec) #endif } +template requires (asmjit::any_operand_v) +inline auto gv_extend_hi_s16(A&& a) +{ +#if defined(ARCH_X64) + using enum asmjit::x86::Inst::Id; + return asmjit::unary_op(kIdPsrad, kIdVpsrad, asmjit::unary_op(kIdNone, kIdPunpckhwd, std::forward(a)), 16); +#endif +} + inline v128 gv_unpacklo32(const v128& lows, const v128& highs) { #if defined(ARCH_X64) @@ -2471,3 +2774,280 @@ inline v128 gv_log2_approxfs(const v128& a) return r; #endif } + +// For each 8-bit element, r = a << (b & 7) +inline v128 gv_shl8(const v128& a, const v128& b) +{ +#if defined(ARCH_ARM64) + return vshlq_u8(a, vandq_s8(b, gv_bcst8(7))); +#else + const v128 x1 = gv_add8(a, a); // shift left by 1 + const v128 r1 = gv_signselect8(gv_shl64(b, 7), x1, a); + const v128 x2 = gv_and32(gv_shl64(r1, 2), gv_bcst8(0xfc)); // shift by 2 + const v128 r2 = gv_signselect8(gv_shl64(b, 6), x2, r1); + const v128 x3 = gv_and32(gv_shl64(r2, 4), gv_bcst8(0xf0)); // shift by 4 + return gv_signselect8(gv_shl64(b, 5), x3, r2); +#endif +} + +// For each 16-bit element, r = a << (b & 15) +inline v128 gv_shl16(const v128& a, const v128& b) +{ +#if defined(__AVX512VL__) && defined(__AVX512BW__) + return _mm_sllv_epi16(a, _mm_and_si128(b, _mm_set1_epi16(15))); +#elif defined(ARCH_ARM64) + return vshlq_u16(a, vandq_s16(b, gv_bcst8(15))); +#else + v128 r; + for (u32 i = 0; i < 8; i++) + r._u16[i] = a._u16[i] << (b._u16[i] & 15); + return r; +#endif +} + +// For each 32-bit element, r = a << (b & 31) +inline v128 gv_shl32(const v128& a, const v128& b) +{ +#if defined(__AVX2__) + return _mm_sllv_epi32(a, _mm_and_si128(b, 
_mm_set1_epi32(31))); +#elif defined(ARCH_ARM64) + return vshlq_u32(a, vandq_s32(b, gv_bcst8(31))); +#else + v128 r; + for (u32 i = 0; i < 4; i++) + r._u32[i] = a._u32[i] << (b._u32[i] & 31); + return r; +#endif +} + +// For each unsigned 8-bit element, r = a >> (b & 7) +inline v128 gv_shr8(const v128& a, const v128& b) +{ +#if defined(ARCH_ARM64) + return vshlq_u8(a, vnegq_s8(vandq_s8(b, gv_bcst8(7)))); +#else + const v128 x1 = gv_and32(gv_shr64(a, 1), gv_bcst8(0x7f)); // shift right by 1 + const v128 r1 = gv_signselect8(gv_shl64(b, 7), x1, a); + const v128 x2 = gv_and32(gv_shr64(r1, 2), gv_bcst8(0x3f)); // shift by 2 + const v128 r2 = gv_signselect8(gv_shl64(b, 6), x2, r1); + const v128 x3 = gv_and32(gv_shr64(r2, 4), gv_bcst8(0x0f)); // shift by 4 + return gv_signselect8(gv_shl64(b, 5), x3, r2); +#endif +} + +// For each unsigned 16-bit element, r = a >> (b & 15) +inline v128 gv_shr16(const v128& a, const v128& b) +{ +#if defined(__AVX512VL__) && defined(__AVX512BW__) + return _mm_srlv_epi16(a, _mm_and_si128(b, _mm_set1_epi16(15))); +#elif defined(ARCH_ARM64) + return vshlq_u16(a, vnegq_s16(vandq_s16(b, gv_bcst8(15)))); +#else + v128 r; + for (u32 i = 0; i < 8; i++) + r._u16[i] = a._u16[i] >> (b._u16[i] & 15); + return r; +#endif +} + +// For each unsigned 32-bit element, r = a >> (b & 31) +inline v128 gv_shr32(const v128& a, const v128& b) +{ +#if defined(__AVX2__) + return _mm_srlv_epi32(a, _mm_and_si128(b, _mm_set1_epi32(31))); +#elif defined(ARCH_ARM64) + return vshlq_u32(a, vnegq_s32(vandq_s32(b, gv_bcst8(31)))); +#else + v128 r; + for (u32 i = 0; i < 4; i++) + r._u32[i] = a._u32[i] >> (b._u32[i] & 31); + return r; +#endif +} + +// For each signed 8-bit element, r = a >> (b & 7) +inline v128 gv_sar8(const v128& a, const v128& b) +{ +#if defined(ARCH_ARM64) + return vshlq_s8(a, vnegq_s8(vandq_s8(b, gv_bcst8(7)))); +#else + v128 r; + for (u32 i = 0; i < 16; i++) + r._s8[i] = a._s8[i] >> (b._s8[i] & 7); + return r; +#endif +} + +// For each signed 16-bit element, r = a >> (b & 15) +inline v128 gv_sar16(const v128& a, const v128& b) +{ +#if defined(__AVX512VL__) && defined(__AVX512BW__) + return _mm_srav_epi16(a, _mm_and_si128(b, _mm_set1_epi16(15))); +#elif defined(ARCH_ARM64) + return vshlq_s16(a, vnegq_s16(vandq_s16(b, gv_bcst8(15)))); +#else + v128 r; + for (u32 i = 0; i < 8; i++) + r._s16[i] = a._s16[i] >> (b._s16[i] & 15); + return r; +#endif +} + +// For each signed 32-bit element, r = a >> (b & 31) +inline v128 gv_sar32(const v128& a, const v128& b) +{ +#if defined(__AVX2__) + return _mm_srav_epi32(a, _mm_and_si128(b, _mm_set1_epi32(31))); +#elif defined(ARCH_ARM64) + return vshlq_s32(a, vnegq_s32(vandq_s32(b, gv_bcst8(31)))); +#else + v128 r; + for (u32 i = 0; i < 4; i++) + r._s32[i] = a._s32[i] >> (b._s32[i] & 31); + return r; +#endif +} + +// For each 8-bit element, r = rotate a by b +inline v128 gv_rol8(const v128& a, const v128& b) +{ +#if defined(ARCH_ARM64) + const auto amt1 = vandq_s8(b, gv_bcst8(7)); + const auto amt2 = vsubq_s8(amt1, gv_bcst8(8)); + return vorrq_u8(vshlq_u8(a, amt1), vshlq_u8(a, amt2)); +#else + const v128 x1 = gv_sub8(gv_add8(a, a), gv_gts8(gv_bcst8(0), a)); // rotate left by 1 + const v128 r1 = gv_signselect8(gv_shl64(b, 7), x1, a); + const v128 c2 = gv_bcst8(0x3); + const v128 x2 = gv_or32(gv_and32(gv_shr64(r1, 6), c2), gv_andn32(c2, gv_shl64(r1, 2))); // rotate by 2 + const v128 r2 = gv_signselect8(gv_shl64(b, 6), x2, r1); + const v128 c3 = gv_bcst8(0xf); + const v128 x3 = gv_or32(gv_and32(gv_shr64(r2, 4), c3), gv_andn32(c3, gv_shl64(r2, 4))); // 
rotate by 4 + return gv_signselect8(gv_shl64(b, 5), x3, r2); +#endif +} + +// For each 16-bit element, r = rotate a by b +inline v128 gv_rol16(const v128& a, const v128& b) +{ +#if defined(ARCH_ARM64) + const auto amt1 = vandq_s16(b, gv_bcst16(15)); + const auto amt2 = vsubq_s16(amt1, gv_bcst16(16)); + return vorrq_u16(vshlq_u16(a, amt1), vshlq_u16(a, amt2)); +#else + v128 r; + for (u32 i = 0; i < 8; i++) + r._u16[i] = utils::rol16(a._u16[i], b._u16[i]); + return r; +#endif +} + +// For each 32-bit element, r = rotate a by b +inline v128 gv_rol32(const v128& a, const v128& b) +{ +#if defined(__AVX512VL__) + return _mm_rolv_epi32(a, b); +#elif defined(ARCH_ARM64) + const auto amt1 = vandq_s32(b, gv_bcst32(31)); + const auto amt2 = vsubq_s32(amt1, gv_bcst32(32)); + return vorrq_u32(vshlq_u32(a, amt1), vshlq_u32(a, amt2)); +#else + v128 r; + for (u32 i = 0; i < 4; i++) + r._u32[i] = utils::rol32(a._u32[i], b._u32[i]); + return r; +#endif +} + +// For each 8-bit element, r = (a << (c & 7)) | (b >> (~c & 7) >> 1) +template +inline auto gv_fshl8(A&& a, B&& b, C&& c) +{ +#if defined(ARCH_ARM64) + const auto amt1 = vandq_s8(c, gv_bcst8(7)); + const auto amt2 = vsubq_s8(amt1, gv_bcst8(8)); + return v128(vorrq_u8(vshlq_u8(a, amt1), vshlq_u8(b, amt2))); +#else + auto x1 = gv_sub8(gv_add8(a, a), gv_gts8(gv_bcst8(0), b)); + auto s1 = gv_shl64(c, 7); + auto r1 = gv_signselect8(s1, std::move(x1), std::forward(a)); + auto b1 = gv_signselect8(std::move(s1), gv_shl64(b, 1), std::forward(b)); + auto c2 = gv_bcst8(0x3); + auto x2 = gv_and32(gv_shr64(b1, 6), c2); x2 = gv_or32(std::move(x2), gv_andn32(std::move(c2), gv_shl64(r1, 2))); + auto s2 = gv_shl64(c, 6); + auto r2 = gv_signselect8(s2, std::move(x2), std::move(r1)); + auto b2 = gv_signselect8(std::move(s2), gv_shl64(b1, 2), std::move(b1)); + auto c3 = gv_bcst8(0xf); + auto x3 = gv_and32(gv_shr64(std::move(b2), 4), c3); x3 = gv_or32(std::move(x3), gv_andn32(std::move(c3), gv_shl64(r2, 4))); + return gv_signselect8(gv_shl64(std::move(c), 5), std::move(x3), std::move(r2)); +#endif +} + +// For each 8-bit element, r = (b >> (c & 7)) | (a << (~c & 7) << 1) +template +inline auto gv_fshr8(A&& a, B&& b, C&& c) +{ +#if defined(ARCH_ARM64) + const auto amt1 = vandq_s8(c, gv_bcst8(7)); + const auto amt2 = vsubq_s8(gv_bcst8(8), amt1); + return vorrq_u8(vshlq_u8(b, vnegq_s8(amt1)), vshlq_u8(a, amt2)); +#else + auto c1 = gv_bcst8(0x7f); + auto x1 = gv_and32(gv_shr64(b, 1), c1); x1 = gv_or32(std::move(x1), gv_andn32(std::move(c1), gv_shl64(a, 7))); + auto s1 = gv_shl64(c, 7); + auto r1 = gv_signselect8(s1, std::move(x1), std::move(b)); + auto a1 = gv_signselect8(std::move(s1), gv_shr64(a, 1), std::move(a)); + auto c2 = gv_bcst8(0x3f); + auto x2 = gv_and32(gv_shr64(r1, 2), c2); x2 = gv_or32(std::move(x2), gv_andn32(std::move(c2), gv_shl64(a1, 6))); + auto s2 = gv_shl64(c, 6); + auto r2 = gv_signselect8(s2, std::move(x2), std::move(r1)); + auto a2 = gv_signselect8(std::move(s2), gv_shr64(a1, 2), std::move(a1)); + auto c3 = gv_bcst8(0x0f); + auto x3 = gv_and32(gv_shr64(r2, 4), c3); x3 = gv_or32(std::move(x3), gv_andn32(std::move(c3), gv_shl64(std::move(a2), 4))); + return gv_signselect8(gv_shl64(std::move(c), 5), std::move(x3), std::move(r2)); +#endif +} + +// Shift left by byte amount +template +inline v128 gv_shuffle_left(const v128& a) +{ + if (Count > 15) + return {}; +#if defined(ARCH_X64) + return _mm_slli_si128(a, Count); +#elif defined(ARCH_ARM64) + v128 idx; + for (u32 i = 0; i < 16; i++) + idx._u8[i] = u8(i - Count); + return vqtbl1q_u8(a, idx); +#endif +} + 
+template <u32 Count, typename A> requires (asmjit::any_operand_v<A>)
+inline auto gv_shuffle_left(A&& a)
+{
+	FOR_X64(unary_op, kIdPslldq, kIdVpslldq, std::forward<A>(a), Count);
+}
+
+// Shift right by byte amount
+template <u32 Count>
+inline v128 gv_shuffle_right(const v128& a)
+{
+	if (Count > 15)
+		return {};
+#if defined(ARCH_X64)
+	return _mm_srli_si128(a, Count);
+#elif defined(ARCH_ARM64)
+	v128 idx;
+	for (u32 i = 0; i < 16; i++)
+		idx._u8[i] = u8(i + Count);
+	return vqtbl1q_u8(a, idx);
+#endif
+}
+
+template <u32 Count, typename A> requires (asmjit::any_operand_v<A>)
+inline auto gv_shuffle_right(A&& a)
+{
+	FOR_X64(unary_op, kIdPsrldq, kIdVpsrldq, std::forward<A>(a), Count);
+}
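
Note on the utils::rol32/rol64 change (illustrative sketch, not part of the patch; rol32_safe is a hypothetical standalone name mirroring the fallback path): the old generic fallback "(x << n) | (x >> (32 - n))" hits undefined behaviour when n == 0, because "x >> 32" is out of range for a 32-bit operand. Masking both counts keeps every shift inside [0, 31] while producing the same rotation, and also makes counts >= 32 wrap instead of being UB.

#include <cstdint>
#include <cassert>

constexpr std::uint32_t rol32_safe(std::uint32_t x, std::uint32_t n)
{
    // (0 - n) & 31 equals (32 - n) & 31, but never yields a shift count of 32
    return (x << (n & 31)) | (x >> ((0 - n) & 31));
}

int main()
{
    static_assert(rol32_safe(0x80000001u, 0) == 0x80000001u, "n == 0 is now well-defined");
    static_assert(rol32_safe(0x80000001u, 1) == 0x00000003u, "plain rotate");
    static_assert(rol32_safe(0x12345678u, 36) == rol32_safe(0x12345678u, 4), "counts wrap mod 32");
    assert(rol32_safe(0xdeadbeefu, 13) == ((0xdeadbeefu << 13) | (0xdeadbeefu >> 19)));
}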
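
The new gv_shl8/gv_shr8/gv_rol8 fallbacks build a per-byte variable shift out of three gv_signselect8 blends. A scalar model of the idea (a sketch under the assumption that one byte behaves like one SIMD lane; shl8_cascade is a hypothetical name): each stage conditionally applies a shift of 1, 2 or 4 selected by one bit of the count, so three blends cover every count in [0, 7]. In the vector code the count is pre-shifted left by 7, 6 and 5 so the relevant bit lands in each byte's sign bit, which is what gv_signselect8 tests.

#include <cstdint>
#include <cassert>

std::uint8_t shl8_cascade(std::uint8_t a, std::uint8_t b)
{
    std::uint8_t r = a;
    if (b & 1) r = static_cast<std::uint8_t>(r << 1); // stage 1: selected by sign of (b << 7)
    if (b & 2) r = static_cast<std::uint8_t>(r << 2); // stage 2: selected by sign of (b << 6)
    if (b & 4) r = static_cast<std::uint8_t>(r << 4); // stage 3: selected by sign of (b << 5)
    return r;
}

int main()
{
    for (unsigned a = 0; a < 256; a++)
        for (unsigned b = 0; b < 8; b++)
            assert(shl8_cascade(std::uint8_t(a), std::uint8_t(b)) == std::uint8_t(a << b));
}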
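
The VSLDOI rework replaces a runtime byte loop with sixteen compile-time instantiations (one per possible shift immediate) plus a table lookup in the decoder. A minimal sketch of that dispatch pattern follows (assumptions: simplified types, a printf body, and a made-up field extraction standing in for op.vsh):

#include <array>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <utility>

using handler = void (*)(std::uint32_t op);

template <unsigned Count>
void sldoi_impl(std::uint32_t op)
{
    // A real handler would shift/concatenate vectors by Count bytes here.
    std::printf("VSLDOI with shift %u (opcode 0x%08x)\n", Count, op);
}

template <std::size_t... I>
constexpr std::array<handler, 16> make_table(std::index_sequence<I...>)
{
    return {&sldoi_impl<I>...};
}

int main()
{
    constexpr auto table = make_table(std::make_index_sequence<16>{});
    const std::uint32_t op = 0x12345678;
    table[(op >> 6) & 15](op); // hypothetical immediate extraction; the patch indexes by op.vsh
}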