From 580bd2b25eba04f9f30d3ed1df3b3f31e0387b02 Mon Sep 17 00:00:00 2001
From: Nekotekina
Date: Thu, 30 Dec 2021 19:39:18 +0300
Subject: [PATCH] Initial Linux Aarch64 support

* Update asmjit dependency (aarch64 branch)
* Disable USE_DISCORD_RPC by default
* Dump some JIT objects in rpcs3 cache dir
* Add SIGILL handler for all platforms
* Fix resetting zeroing denormals in thread pool
* Refactor most v128:: utils into global gv_** functions
* Refactor PPU interpreter (incomplete), remove "precise"
* - Instruction specializations with multiple accuracy flags
* - Adjust calling convention for speed
* - Removed precise/fast setting, replaced with static
* - Started refactoring interpreters for building at runtime JIT
* (I got tired of poor compiler optimizations)
* - Expose some accuracy settings (SAT, NJ, VNAN, FPCC)
* - Add exec_bytes PPU thread variable (akin to cycle count)
* PPU LLVM: fix VCTUXS+VCTSXS instruction NaN results
* SPU interpreter: remove "precise" for now (extremely non-portable)
* - As with PPU, settings changed to static/dynamic for interpreters.
* - Precise options will be implemented later
* Fix termination after fatal error dialog
---
 3rdparty/asmjit/asmjit | 2 +-
 3rdparty/discord-rpc/CMakeLists.txt | 2 +-
 3rdparty/llvm.cmake | 14 +-
 CMakeLists.txt | 2 +-
 Utilities/JIT.cpp | 36 +-
 Utilities/JIT.h | 104 +-
 Utilities/StrFmt.h | 2 +-
 Utilities/Thread.cpp | 96 +-
 Utilities/Thread.h | 12 +
 buildfiles/cmake/ConfigureCompiler.cmake | 11 +-
 rpcs3/Crypto/aes.cpp | 6 +
 rpcs3/Crypto/aesni.cpp | 4 +
 rpcs3/Emu/CPU/CPUThread.cpp | 16 +-
 rpcs3/Emu/CPU/CPUTranslator.cpp | 2 +-
 rpcs3/Emu/CPU/CPUTranslator.h | 31 +-
 rpcs3/Emu/CPU/sse2neon.h | 8776 +++++++++++++++++
 rpcs3/Emu/Cell/Modules/cellAudio.cpp | 9 +-
 rpcs3/Emu/Cell/Modules/cellSpurs.cpp | 6 +-
 rpcs3/Emu/Cell/Modules/cellSpursSpu.cpp | 10 +-
 rpcs3/Emu/Cell/PPUAnalyser.h | 275 +
 rpcs3/Emu/Cell/PPUDisAsm.h | 137 +
 rpcs3/Emu/Cell/PPUFunction.cpp | 42 +-
 rpcs3/Emu/Cell/PPUFunction.h | 15 +-
 rpcs3/Emu/Cell/PPUInterpreter.cpp | 8362 ++++++++++------
 rpcs3/Emu/Cell/PPUInterpreter.h | 463 +-
 rpcs3/Emu/Cell/PPUModule.cpp | 4 +-
 rpcs3/Emu/Cell/PPUModule.h | 2 +-
 rpcs3/Emu/Cell/PPUOpcodes.h | 842 +-
 rpcs3/Emu/Cell/PPUThread.cpp | 330 +-
 rpcs3/Emu/Cell/PPUThread.h | 1 +
 rpcs3/Emu/Cell/PPUTranslator.cpp | 94 +-
 rpcs3/Emu/Cell/PPUTranslator.h | 137 +
 rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp | 335 +-
 rpcs3/Emu/Cell/SPUASMJITRecompiler.h | 2 -
 rpcs3/Emu/Cell/SPUAnalyser.cpp | 5 +
 rpcs3/Emu/Cell/SPUDisAsm.cpp | 17 +-
 rpcs3/Emu/Cell/SPUInterpreter.cpp | 1524 ++-
 rpcs3/Emu/Cell/SPUInterpreter.h | 257 +-
 rpcs3/Emu/Cell/SPUOpcodes.h | 418 +-
 rpcs3/Emu/Cell/SPURecompiler.cpp | 90 +-
 rpcs3/Emu/Cell/SPUThread.cpp | 66 +-
 rpcs3/Emu/Cell/SPUThread.h | 4 +-
 rpcs3/Emu/Cell/lv2/lv2.cpp | 20 +-
 rpcs3/Emu/Cell/lv2/sys_net.cpp | 7 +
 rpcs3/Emu/Cell/lv2/sys_usbd.cpp | 2 +-
 rpcs3/Emu/GDB.cpp | 7 +
 rpcs3/Emu/Memory/vm.cpp | 11 +-
 rpcs3/Emu/Memory/vm.h | 10 +-
 rpcs3/Emu/Memory/vm_reservation.h | 23 +-
 rpcs3/Emu/NP/np_dnshook.cpp | 7 +
 rpcs3/Emu/NP/np_handler.cpp | 7 +
 rpcs3/Emu/NP/rpcn_client.cpp | 7 +
 rpcs3/Emu/NP/rpcn_client.h | 7 +
 rpcs3/Emu/RSX/Common/BufferUtils.cpp | 58 +-
 rpcs3/Emu/RSX/Common/BufferUtils.h | 4 +-
 rpcs3/Emu/RSX/GL/GLTextureCache.cpp | 6 +-
 .../Emu/RSX/Program/program_state_cache2.hpp | 30 +-
 rpcs3/Emu/RSX/VK/VKGSRender.h | 7 +-
 rpcs3/Emu/RSX/VK/vkutils/device.cpp | 12 +
 rpcs3/Emu/RSX/VK/vkutils/sync.cpp | 16 +-
 rpcs3/Emu/RSX/rsx_methods.cpp | 12 +-
 rpcs3/Emu/perf_meter.cpp | 12 +-
 rpcs3/Emu/perf_meter.hpp | 19 +-
 rpcs3/Emu/system_config.h | 11 +-
rpcs3/Emu/system_config_types.cpp | 8 +- rpcs3/Emu/system_config_types.h | 8 +- rpcs3/emucore.vcxproj | 2 +- rpcs3/emucore.vcxproj.filters | 2 +- rpcs3/main.cpp | 20 +- rpcs3/rpcs3qt/cheat_manager.cpp | 2 +- rpcs3/rpcs3qt/debugger_frame.cpp | 8 +- rpcs3/rpcs3qt/emu_settings.cpp | 8 +- rpcs3/rpcs3qt/emu_settings_type.h | 18 +- rpcs3/rpcs3qt/settings_dialog.cpp | 52 +- rpcs3/rpcs3qt/settings_dialog.ui | 49 +- rpcs3/rpcs3qt/tooltips.h | 17 +- rpcs3/util/asm.hpp | 59 +- rpcs3/util/atomic.cpp | 11 +- rpcs3/util/atomic.hpp | 185 +- rpcs3/util/fence.hpp | 24 + rpcs3/util/shared_ptr.hpp | 4 +- rpcs3/util/simd.hpp | 2143 ++++ rpcs3/util/sysinfo.cpp | 134 +- rpcs3/util/sysinfo.hpp | 8 +- rpcs3/util/tsc.hpp | 25 + rpcs3/util/types.hpp | 16 +- rpcs3/util/v128.hpp | 131 +- rpcs3/util/v128sse.hpp | 178 - rpcs3/util/vm_native.cpp | 2 +- 89 files changed, 20360 insertions(+), 5612 deletions(-) create mode 100644 rpcs3/Emu/CPU/sse2neon.h create mode 100644 rpcs3/util/fence.hpp create mode 100644 rpcs3/util/simd.hpp create mode 100644 rpcs3/util/tsc.hpp delete mode 100644 rpcs3/util/v128sse.hpp diff --git a/3rdparty/asmjit/asmjit b/3rdparty/asmjit/asmjit index eae7197fce..fc2a5d82f7 160000 --- a/3rdparty/asmjit/asmjit +++ b/3rdparty/asmjit/asmjit @@ -1 +1 @@ -Subproject commit eae7197fce03fd52a6e71ca89207a88ce270fb1a +Subproject commit fc2a5d82f7434d7d03161275a764c051f970f41c diff --git a/3rdparty/discord-rpc/CMakeLists.txt b/3rdparty/discord-rpc/CMakeLists.txt index 34c7c4fdb9..cb465f33e2 100644 --- a/3rdparty/discord-rpc/CMakeLists.txt +++ b/3rdparty/discord-rpc/CMakeLists.txt @@ -2,7 +2,7 @@ add_library(3rdparty_discordRPC INTERFACE) # We don't want Discord Rich Presence on the BSDs and other OSes -if (USE_DISCORD_RPC AND (WIN32 OR CMAKE_SYSTEM MATCHES "Linux" OR APPLE)) +if (USE_DISCORD_RPC AND (WIN32 OR CMAKE_SYSTEM MATCHES "Linux" OR APPLE) AND COMPILER_X86) if (WIN32 AND NOT MSVC) ExternalProject_Add(discordRPC GIT_REPOSITORY https://github.com/discordapp/discord-rpc diff --git a/3rdparty/llvm.cmake b/3rdparty/llvm.cmake index c0c8cd5280..266ebd55b0 100644 --- a/3rdparty/llvm.cmake +++ b/3rdparty/llvm.cmake @@ -1,8 +1,10 @@ if(WITH_LLVM) + CHECK_CXX_COMPILER_FLAG("-msse -msse2 -mcx16" COMPILER_X86) + CHECK_CXX_COMPILER_FLAG("-march=armv8-a+lse" COMPILER_ARM) + if(BUILD_LLVM_SUBMODULE) message(STATUS "LLVM will be built from the submodule.") - set(LLVM_TARGETS_TO_BUILD "X86" CACHE INTERNAL "") option(LLVM_BUILD_RUNTIME OFF) option(LLVM_BUILD_TOOLS OFF) option(LLVM_INCLUDE_BENCHMARKS OFF) @@ -61,7 +63,15 @@ if(WITH_LLVM) endif() endif() - set(LLVM_LIBS LLVMMCJIT LLVMX86CodeGen LLVMX86AsmParser) + set(LLVM_LIBS LLVMMCJIT) + + if(COMPILER_X86) + set(LLVM_LIBS ${LLVM_LIBS} LLVMX86CodeGen LLVMX86AsmParser) + endif() + + if(COMPILER_ARM) + set(LLVM_LIBS ${LLVM_LIBS} LLVMX86CodeGen LLVMX86AsmParser LLVMARMCodeGen LLVMARMAsmParser) + endif() if(WIN32 OR CMAKE_SYSTEM MATCHES "Linux") set(LLVM_LIBS ${LLVM_LIBS} LLVMIntelJITEvents) diff --git a/CMakeLists.txt b/CMakeLists.txt index 74421142c5..1206969025 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -17,7 +17,7 @@ option(WITH_LLVM "Enable usage of LLVM library" ON) option(BUILD_LLVM_SUBMODULE "Build LLVM from git submodule" ON) option(USE_FAUDIO "FAudio audio backend" ON) option(USE_LIBEVDEV "libevdev-based joystick support" ON) -option(USE_DISCORD_RPC "Discord rich presence integration" ON) +option(USE_DISCORD_RPC "Discord rich presence integration" OFF) option(USE_SYSTEM_ZLIB "Prefer system ZLIB instead of the builtin one" ON) option(USE_VULKAN "Vulkan 
render backend" ON) option(USE_PRECOMPILED_HEADERS "Use precompiled headers" OFF) diff --git a/Utilities/JIT.cpp b/Utilities/JIT.cpp index ced02827e1..a6c8bb2015 100644 --- a/Utilities/JIT.cpp +++ b/Utilities/JIT.cpp @@ -18,6 +18,12 @@ LOG_CHANNEL(jit_log, "JIT"); void jit_announce(uptr func, usz size, std::string_view name) { + if (!size) + { + jit_log.error("Empty function announced: %s (%p)", name, func); + return; + } + #ifdef __linux__ static const fs::file s_map(fmt::format("/tmp/perf-%d.map", getpid()), fs::rewrite + fs::append); @@ -124,15 +130,31 @@ void* jit_runtime_base::_add(asmjit::CodeHolder* code) noexcept { ensure(!code->flatten()); ensure(!code->resolveUnresolvedLinks()); - usz codeSize = ensure(code->codeSize()); + usz codeSize = code->codeSize(); + if (!codeSize) + return nullptr; + auto p = ensure(this->_alloc(codeSize, 64)); ensure(!code->relocateToBase(uptr(p))); - asmjit::VirtMem::ProtectJitReadWriteScope rwScope(p, codeSize); - - for (asmjit::Section* section : code->_sections) { - std::memcpy(p + section->offset(), section->data(), section->bufferSize()); + asmjit::VirtMem::ProtectJitReadWriteScope rwScope(p, codeSize); + + for (asmjit::Section* section : code->_sections) + { + std::memcpy(p + section->offset(), section->data(), section->bufferSize()); + } + } + + if (!dump_name.empty()) + { + // If directory ASMJIT doesn't exist, nothing will be written + fs::file dump(fmt::format("%s/ASMJIT/%s", fs::get_cache_dir(), dump_name), fs::rewrite); + + if (dump) + { + dump.write(p, codeSize); + } } return p; @@ -349,8 +371,9 @@ static u64 make_null_function(const std::string& name) using namespace asmjit; // Build a "null" function that contains its name - const auto func = build_function_asm("NULL", [&](x86::Assembler& c, auto& args) + const auto func = build_function_asm("NULL", [&](native_asm& c, auto& args) { +#if defined(ARCH_X64) Label data = c.newLabel(); c.lea(args[0], x86::qword_ptr(data, 0)); c.jmp(Imm(&null)); @@ -362,6 +385,7 @@ static u64 make_null_function(const std::string& name) c.db(ch); c.db(0); c.align(AlignMode::kData, 16); +#endif }); func_ptr = reinterpret_cast(func); diff --git a/Utilities/JIT.h b/Utilities/JIT.h index 68e0c8bd30..9faddb8615 100644 --- a/Utilities/JIT.h +++ b/Utilities/JIT.h @@ -22,10 +22,17 @@ #pragma GCC diagnostic ignored "-Wredundant-decls" #pragma GCC diagnostic ignored "-Wnon-virtual-dtor" #pragma GCC diagnostic ignored "-Weffc++" -#ifndef __clang__ +#ifdef __clang__ +#pragma GCC diagnostic ignored "-Wdeprecated-anon-enum-enum-conversion" +#pragma GCC diagnostic ignored "-Wcast-qual" +#else #pragma GCC diagnostic ignored "-Wduplicated-branches" +#pragma GCC diagnostic ignored "-Wdeprecated-enum-enum-conversion" #endif #include +#if defined(ARCH_ARM64) +#include +#endif #pragma GCC diagnostic pop #endif @@ -36,6 +43,14 @@ #include #include +#if defined(ARCH_X64) +using native_asm = asmjit::x86::Assembler; +using native_args = std::array; +#elif defined(ARCH_ARM64) +using native_asm = asmjit::a64::Assembler; +using native_args = std::array; +#endif + void jit_announce(uptr func, usz size, std::string_view name); void jit_announce(auto* func, usz size, std::string_view name) @@ -62,6 +77,8 @@ struct jit_runtime_base const asmjit::Environment& environment() const noexcept; void* _add(asmjit::CodeHolder* code) noexcept; virtual uchar* _alloc(usz size, usz align) noexcept = 0; + + std::string_view dump_name; }; // ASMJIT runtime for emitting code in a single 2G region @@ -167,11 +184,39 @@ namespace asmjit } } + inline void 
build_init_args_from_ghc(native_asm& c, native_args& args) + { +#if defined(ARCH_X64) + // TODO: handle case when args don't overlap with r13/rbp/r12/rbx + c.mov(args[0], x86::r13); + c.mov(args[1], x86::rbp); + c.mov(args[2], x86::r12); + c.mov(args[3], x86::rbx); +#else + static_cast(c); + static_cast(args); +#endif + } + + inline void build_init_ghc_args(native_asm& c, native_args& args) + { +#if defined(ARCH_X64) + // TODO: handle case when args don't overlap with r13/rbp/r12/rbx + c.mov(x86::r13, args[0]); + c.mov(x86::rbp, args[1]); + c.mov(x86::r12, args[2]); + c.mov(x86::rbx, args[3]); +#else + static_cast(c); + static_cast(args); +#endif + } + using imm_ptr = Imm; } // Build runtime function with asmjit::X86Assembler -template +template inline FT build_function_asm(std::string_view name, F&& builder) { using namespace asmjit; @@ -181,7 +226,8 @@ inline FT build_function_asm(std::string_view name, F&& builder) CodeHolder code; code.init(rt.environment()); - std::array args; +#if defined(ARCH_X64) + native_args args; #ifdef _WIN32 args[0] = x86::rcx; args[1] = x86::rdx; @@ -193,16 +239,27 @@ inline FT build_function_asm(std::string_view name, F&& builder) args[2] = x86::rdx; args[3] = x86::rcx; #endif +#elif defined(ARCH_ARM64) + native_args args; + args[0] = a64::x0; + args[1] = a64::x1; + args[2] = a64::x2; + args[3] = a64::x3; +#endif - x86::Assembler compiler(&code); + Asm compiler(&code); compiler.addEncodingOptions(EncodingOptions::kOptimizedAlign); - builder(std::ref(compiler), args); + if constexpr (std::is_invocable_v) + builder(compiler, args); + else + builder(compiler); + rt.dump_name = name; const auto result = rt._add(&code); jit_announce(result, code.codeSize(), name); return reinterpret_cast(uptr(result)); } -#ifdef __APPLE__ +#if !defined(ARCH_X64) || defined(__APPLE__) template class built_function { @@ -213,9 +270,23 @@ public: built_function& operator=(const built_function&) = delete; - template - built_function(std::string_view name, F&& builder) - : m_func(ensure(build_function_asm(name, std::forward(builder)))) + template requires (std::is_invocable_v) + built_function(std::string_view name, F&& builder, + u32 line = __builtin_LINE(), + u32 col = __builtin_COLUMN(), + const char* file = __builtin_FILE(), + const char* func = __builtin_FUNCTION()) + : m_func(ensure(build_function_asm(name, std::forward(builder)), const_str(), line, col, file, func)) + { + } + + template requires (std::is_invocable_v) + built_function(std::string_view, F&& getter, + u32 line = __builtin_LINE(), + u32 col = __builtin_COLUMN(), + const char* file = __builtin_FILE(), + const char* func = __builtin_FUNCTION()) + : m_func(ensure(getter(), const_str(), line, col, file, func)) { } @@ -251,7 +322,8 @@ public: CodeHolder code; code.init(rt.environment()); - std::array args; +#if defined(ARCH_X64) + native_args args; #ifdef _WIN32 args[0] = x86::rcx; args[1] = x86::rdx; @@ -263,10 +335,18 @@ public: args[2] = x86::rdx; args[3] = x86::rcx; #endif +#elif defined(ARCH_ARM64) + native_args args; + args[0] = a64::x0; + args[1] = a64::x1; + args[2] = a64::x2; + args[3] = a64::x3; +#endif - x86::Assembler compiler(&code); + native_asm compiler(&code); compiler.addEncodingOptions(EncodingOptions::kOptimizedAlign); - builder(std::ref(compiler), args); + builder(compiler, args); + rt.dump_name = name; jit_announce(rt._add(&code), code.codeSize(), name); } diff --git a/Utilities/StrFmt.h b/Utilities/StrFmt.h index 747d01918d..3164d3c6f6 100644 --- a/Utilities/StrFmt.h +++ b/Utilities/StrFmt.h @@ 
-239,7 +239,7 @@ struct fmt_class_string static void format(std::string& out, u64 arg) { const auto& obj = get_object(arg); - + void format_byte_array(std::string&, const uchar*, usz); format_byte_array(out, reinterpret_cast(std::data(obj)), std::size(obj)); } diff --git a/Utilities/Thread.cpp b/Utilities/Thread.cpp index c79d3af673..34284417eb 100644 --- a/Utilities/Thread.cpp +++ b/Utilities/Thread.cpp @@ -77,7 +77,7 @@ #include "util/logs.hpp" #include "util/asm.hpp" #include "util/v128.hpp" -#include "util/v128sse.hpp" +#include "util/simd.hpp" #include "util/sysinfo.hpp" #include "Emu/Memory/vm_locking.h" @@ -189,6 +189,7 @@ bool IsDebuggerPresent() } #endif +#if defined(ARCH_X64) enum x64_reg_t : u32 { X64R_RAX = 0, @@ -839,6 +840,7 @@ void decode_x64_reg_op(const u8* code, x64_op_t& out_op, x64_reg_t& out_reg, usz #ifdef _WIN32 typedef CONTEXT x64_context; +typedef CONTEXT ucontext_t; #define X64REG(context, reg) (&(&(context)->Rax)[reg]) #define XMMREG(context, reg) (reinterpret_cast(&(&(context)->Xmm0)[reg])) @@ -1211,12 +1213,18 @@ usz get_x64_access_size(x64_context* context, x64_op_t op, x64_reg_t reg, usz d_ return d_size; } +#elif defined(ARCH_ARM64) + +#define RIP(context) ((context)->uc_mcontext.pc) + +#endif /* ARCH_ */ + namespace rsx { extern std::function g_access_violation_handler; } -bool handle_access_violation(u32 addr, bool is_writing, x64_context* context) noexcept +bool handle_access_violation(u32 addr, bool is_writing, ucontext_t* context) noexcept { g_tls_fault_all++; @@ -1243,6 +1251,7 @@ bool handle_access_violation(u32 addr, bool is_writing, x64_context* context) no } } +#if defined(ARCH_X64) const u8* const code = reinterpret_cast(RIP(context)); x64_op_t op; @@ -1382,6 +1391,9 @@ bool handle_access_violation(u32 addr, bool is_writing, x64_context* context) no g_tls_fault_spu++; return true; } while (0); +#else + static_cast(context); +#endif /* ARCH_ */ if (vm::check_addr(addr, is_writing ? vm::page_writable : vm::page_readable)) { @@ -1545,7 +1557,7 @@ bool handle_access_violation(u32 addr, bool is_writing, x64_context* context) no if (!g_tls_access_violation_recovered) { vm_log.notice("\n%s", dump_useful_thread_info()); - vm_log.error("Access violation %s location 0x%x (%s) [type=u%u]", is_writing ? "writing" : "reading", addr, (is_writing && vm::check_addr(addr)) ? "read-only memory" : "unmapped memory", d_size * 8); + vm_log.error("Access violation %s location 0x%x (%s)", is_writing ? "writing" : "reading", addr, (is_writing && vm::check_addr(addr)) ? "read-only memory" : "unmapped memory"); } // TODO: @@ -1582,7 +1594,7 @@ bool handle_access_violation(u32 addr, bool is_writing, x64_context* context) no // Do not log any further access violations in this case. if (!g_tls_access_violation_recovered) { - vm_log.fatal("Access violation %s location 0x%x (%s) [type=u%u]", is_writing ? "writing" : (cpu && cpu->id_type() == 1 && cpu->get_pc() == addr ? "executing" : "reading"), addr, (is_writing && vm::check_addr(addr)) ? "read-only memory" : "unmapped memory", d_size * 8); + vm_log.fatal("Access violation %s location 0x%x (%s)", is_writing ? "writing" : (cpu && cpu->id_type() == 1 && cpu->get_pc() == addr ? "executing" : "reading"), addr, (is_writing && vm::check_addr(addr)) ? 
"read-only memory" : "unmapped memory"); } while (Emu.IsPaused()) @@ -1754,8 +1766,9 @@ const bool s_exception_handler_set = []() -> bool static void signal_handler(int /*sig*/, siginfo_t* info, void* uct) noexcept { - x64_context* context = static_cast(uct); + ucontext_t* context = static_cast(uct); +#if defined(ARCH_X64) #ifdef __APPLE__ const u64 err = context->uc_mcontext->__es.__err; #elif defined(__DragonFly__) || defined(__FreeBSD__) @@ -1770,6 +1783,23 @@ static void signal_handler(int /*sig*/, siginfo_t* info, void* uct) noexcept const bool is_executing = err & 0x10; const bool is_writing = err & 0x2; +#elif defined(ARCH_ARM64) + const bool is_executing = uptr(info->si_addr) == RIP(context); + const u32 insn = is_executing ? 0 : *reinterpret_cast(RIP(context)); + const bool is_writing = (insn & 0xbfff0000) == 0x0c000000 + || (insn & 0xbfe00000) == 0x0c800000 + || (insn & 0xbfdf0000) == 0x0d000000 + || (insn & 0xbfc00000) == 0x0d800000 + || (insn & 0x3f400000) == 0x08000000 + || (insn & 0x3bc00000) == 0x39000000 + || (insn & 0x3fc00000) == 0x3d800000 + || (insn & 0x3bc00000) == 0x38000000 + || (insn & 0x3fe00000) == 0x3c800000 + || (insn & 0x3a400000) == 0x28000000; + +#else +#error "signal_handler not implemented" +#endif const u64 exec64 = (reinterpret_cast(info->si_addr) - reinterpret_cast(vm::g_exec_addr)) / 2; const auto cause = is_executing ? "executing" : is_writing ? "writing" : "reading"; @@ -1809,6 +1839,26 @@ static void signal_handler(int /*sig*/, siginfo_t* info, void* uct) noexcept thread_ctrl::emergency_exit(msg); } +static void sigill_handler(int /*sig*/, siginfo_t* info, void* /*uct*/) noexcept +{ + std::string msg = fmt::format("Illegal instruction at %p (%s).\n", info->si_addr, *reinterpret_cast*>(info->si_addr)); + + append_thread_name(msg); + + if (IsDebuggerPresent()) + { + sys_log.fatal("\n%s", msg); + + sys_log.notice("\n%s", dump_useful_thread_info()); + + // Convert to SIGTRAP + raise(SIGTRAP); + return; + } + + thread_ctrl::emergency_exit(msg); +} + void sigpipe_signaling_handler(int) { } @@ -1834,6 +1884,13 @@ const bool s_exception_handler_set = []() -> bool } #endif + sa.sa_sigaction = sigill_handler; + if (::sigaction(SIGILL, &sa, NULL) == -1) + { + std::fprintf(stderr, "sigaction(SIGILL) failed (%d).\n", errno); + std::abort(); + } + sa.sa_handler = sigpipe_signaling_handler; if (::sigaction(SIGPIPE, &sa, NULL) == -1) { @@ -1852,11 +1909,7 @@ const bool s_terminate_handler_set = []() -> bool std::set_terminate([]() { if (IsDebuggerPresent()) -#ifdef _MSC_VER - __debugbreak(); -#else - __asm("int3;"); -#endif + utils::trap(); report_fatal_error("RPCS3 has abnormally terminated."); }); @@ -1935,7 +1988,7 @@ void thread_base::initialize(void (*error_cb)()) { if (attempts == umax) { - g_tls_wait_time += __rdtsc() - stamp0; + g_tls_wait_time += utils::get_tsc() - stamp0; } else if (attempts > 1) { @@ -2096,6 +2149,8 @@ thread_base::native_entry thread_base::finalize(u64 _self) noexcept std::fesetround(FE_TONEAREST); + gv_unset_zeroing_denormals(); + static constexpr u64 s_stop_bit = 0x8000'0000'0000'0000ull; static atomic_t s_pool_ctr = [] @@ -2195,10 +2250,11 @@ thread_base::native_entry thread_base::finalize(u64 _self) noexcept thread_base::native_entry thread_base::make_trampoline(u64(*entry)(thread_base* _base)) { - return build_function_asm("thread_base_trampoline", [&](asmjit::x86::Assembler& c, auto& args) + return build_function_asm("thread_base_trampoline", [&](native_asm& c, auto& args) { using namespace asmjit; +#if defined(ARCH_X64) Label _ret 
= c.newLabel(); c.push(x86::rbp); c.sub(x86::rsp, 0x20); @@ -2222,6 +2278,7 @@ thread_base::native_entry thread_base::make_trampoline(u64(*entry)(thread_base* c.bind(_ret); c.add(x86::rsp, 0x28); c.ret(); +#endif }); } @@ -2364,7 +2421,7 @@ bool thread_base::join(bool dtor) const // Hacked for too sleepy threads (1ms) TODO: make sure it's unneeded and remove const auto timeout = dtor && Emu.IsStopped() ? atomic_wait_timeout{1'000'000} : atomic_wait_timeout::inf; - auto stamp0 = __rdtsc(); + auto stamp0 = utils::get_tsc(); for (u64 i = 0; (m_sync & 3) <= 1; i++) { @@ -2377,7 +2434,7 @@ bool thread_base::join(bool dtor) const if (i >= 16 && !(i & (i - 1)) && timeout != atomic_wait_timeout::inf) { - sig_log.error(u8"Thread [%s] is too sleepy. Waiting for it %.3fµs already!", *m_tname.load(), (__rdtsc() - stamp0) / (utils::get_tsc_freq() / 1000000.)); + sig_log.error(u8"Thread [%s] is too sleepy. Waiting for it %.3fµs already!", *m_tname.load(), (utils::get_tsc() - stamp0) / (utils::get_tsc_freq() / 1000000.)); } } @@ -2522,17 +2579,8 @@ void thread_base::exec() sig_log.fatal("Thread terminated due to fatal error: %s", reason); -#ifdef _WIN32 if (IsDebuggerPresent()) - { - __debugbreak(); - } -#else - if (IsDebuggerPresent()) - { - __asm("int3;"); - } -#endif + utils::trap(); if (const auto _this = g_tls_this_thread) { diff --git a/Utilities/Thread.h b/Utilities/Thread.h index 54731c4454..bc50dbbc6c 100644 --- a/Utilities/Thread.h +++ b/Utilities/Thread.h @@ -478,7 +478,19 @@ class named_thread final : public Context, result_storage, thread_base return thread::finalize(thread_state::finished); } +#if defined(ARCH_X64) static inline thread::native_entry trampoline = thread::make_trampoline(entry_point); +#else + static void* trampoline(void* arg) + { + if (const auto next = thread_base::finalize(entry_point(static_cast(arg)))) + { + return next(thread_ctrl::get_current()); + } + + return nullptr; + } +#endif friend class thread_ctrl; diff --git a/buildfiles/cmake/ConfigureCompiler.cmake b/buildfiles/cmake/ConfigureCompiler.cmake index 6a5a390b8b..da78daae7c 100644 --- a/buildfiles/cmake/ConfigureCompiler.cmake +++ b/buildfiles/cmake/ConfigureCompiler.cmake @@ -20,11 +20,20 @@ else() # Some distros have the compilers set to use PIE by default, but RPCS3 doesn't work with PIE, so we need to disable it. 
CHECK_CXX_COMPILER_FLAG("-no-pie" HAS_NO_PIE) CHECK_CXX_COMPILER_FLAG("-march=native" COMPILER_SUPPORTS_MARCH_NATIVE) + CHECK_CXX_COMPILER_FLAG("-msse -msse2 -mcx16" COMPILER_X86) + CHECK_CXX_COMPILER_FLAG("-march=armv8.1-a" COMPILER_ARM) add_compile_options(-Wall) add_compile_options(-fno-exceptions) add_compile_options(-fstack-protector) - add_compile_options(-msse -msse2 -mcx16) + + if (COMPILER_X86) + add_compile_options(-msse -msse2 -mcx16) + endif() + + if (COMPILER_ARM) + add_compile_options(-march=armv8.1-a) + endif() add_compile_options(-Werror=old-style-cast) add_compile_options(-Werror=sign-compare) diff --git a/rpcs3/Crypto/aes.cpp b/rpcs3/Crypto/aes.cpp index bc81ede958..2294ccfa57 100644 --- a/rpcs3/Crypto/aes.cpp +++ b/rpcs3/Crypto/aes.cpp @@ -461,8 +461,10 @@ int aes_setkey_enc( aes_context *ctx, const unsigned char *key, unsigned int key ctx->rk = RK = ctx->buf; +#if defined(__SSE2__) || defined(_M_X64) if( aesni_supports( POLARSSL_AESNI_AES ) ) return( aesni_setkey_enc( reinterpret_cast(ctx->rk), key, keysize ) ); +#endif for( i = 0; i < (keysize >> 5); i++ ) { @@ -564,12 +566,14 @@ int aes_setkey_dec( aes_context *ctx, const unsigned char *key, unsigned int key if( ret != 0 ) return( ret ); +#if defined(__SSE2__) || defined(_M_X64) if( aesni_supports( POLARSSL_AESNI_AES ) ) { aesni_inverse_key( reinterpret_cast(ctx->rk), reinterpret_cast(cty.rk), ctx->nr ); goto done; } +#endif SK = cty.rk + cty.nr * 4; @@ -658,8 +662,10 @@ int aes_crypt_ecb( aes_context *ctx, int i; uint32_t *RK, X0, X1, X2, X3, Y0, Y1, Y2, Y3; +#if defined(__SSE2__) || defined(_M_X64) if( aesni_supports( POLARSSL_AESNI_AES ) ) return( aesni_crypt_ecb( ctx, mode, input, output ) ); +#endif RK = ctx->rk; diff --git a/rpcs3/Crypto/aesni.cpp b/rpcs3/Crypto/aesni.cpp index c0e3fa90cf..05bb65fe7f 100644 --- a/rpcs3/Crypto/aesni.cpp +++ b/rpcs3/Crypto/aesni.cpp @@ -1,3 +1,5 @@ +#if defined(__SSE2__) || defined(_M_X64) + /* * AES-NI support functions * @@ -680,3 +682,5 @@ int aesni_setkey_enc( unsigned char *rk, return( 0 ); } + +#endif diff --git a/rpcs3/Emu/CPU/CPUThread.cpp b/rpcs3/Emu/CPU/CPUThread.cpp index 9ba98f2ed1..23d89577f5 100644 --- a/rpcs3/Emu/CPU/CPUThread.cpp +++ b/rpcs3/Emu/CPU/CPUThread.cpp @@ -17,7 +17,9 @@ #include #include +#if defined(ARCH_X64) #include +#endif DECLARE(cpu_thread::g_threads_created){0}; DECLARE(cpu_thread::g_threads_deleted){0}; @@ -410,20 +412,6 @@ void cpu_thread::operator()() { thread_ctrl::set_thread_affinity_mask(thread_ctrl::get_affinity_mask(id_type() == 1 ? 
thread_class::ppu : thread_class::spu)); } - if (id_type() == 2) - { - // force input/output denormals to zero for SPU threads (FTZ/DAZ) - _mm_setcsr( _mm_getcsr() | 0x8040 ); - - const volatile int a = 0x1fc00000; - __m128 b = _mm_castsi128_ps(_mm_set1_epi32(a)); - int c = _mm_cvtsi128_si32(_mm_castps_si128(_mm_mul_ps(b,b))); - - if (c != 0) - { - sys_log.fatal("Could not disable denormals."); - } - } while (!g_fxo->is_init()) { diff --git a/rpcs3/Emu/CPU/CPUTranslator.cpp b/rpcs3/Emu/CPU/CPUTranslator.cpp index a0a15fb34e..b492c26b9a 100644 --- a/rpcs3/Emu/CPU/CPUTranslator.cpp +++ b/rpcs3/Emu/CPU/CPUTranslator.cpp @@ -3,7 +3,7 @@ #include "CPUTranslator.h" #include "util/v128.hpp" -#include "util/v128sse.hpp" +#include "util/simd.hpp" llvm::LLVMContext g_llvm_ctx; diff --git a/rpcs3/Emu/CPU/CPUTranslator.h b/rpcs3/Emu/CPU/CPUTranslator.h index 9cd3d8cb30..94e7686165 100644 --- a/rpcs3/Emu/CPU/CPUTranslator.h +++ b/rpcs3/Emu/CPU/CPUTranslator.h @@ -2961,11 +2961,11 @@ public: } // Call external function: provide name and function pointer - template + template llvm::CallInst* call(std::string_view lame, RT(*_func)(FArgs...), Args... args) { static_assert(sizeof...(FArgs) == sizeof...(Args), "spu_llvm_recompiler::call(): unexpected arg number"); - const auto type = llvm::FunctionType::get(get_type(), {args->getType()...}, false); + const auto type = llvm::FunctionType::get(get_type, RT, RetT>>(), {args->getType()...}, false); const auto func = llvm::cast(m_module->getOrInsertFunction({lame.data(), lame.size()}, type).getCallee()); #ifdef _WIN32 func->setCallingConv(llvm::CallingConv::Win64); @@ -3680,31 +3680,4 @@ struct fmt_unveil } }; -#ifndef _MSC_VER -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wignored-attributes" -#endif - -template <> -struct llvm_value_t<__m128> : llvm_value_t -{ - -}; - -template <> -struct llvm_value_t<__m128d> : llvm_value_t -{ - -}; - -template <> -struct llvm_value_t<__m128i> : llvm_value_t -{ - -}; - -#ifndef _MSC_VER -#pragma GCC diagnostic pop -#endif - #endif diff --git a/rpcs3/Emu/CPU/sse2neon.h b/rpcs3/Emu/CPU/sse2neon.h new file mode 100644 index 0000000000..215b7e4dde --- /dev/null +++ b/rpcs3/Emu/CPU/sse2neon.h @@ -0,0 +1,8776 @@ +#ifndef SSE2NEON_H +#define SSE2NEON_H + +// This header file provides a simple API translation layer +// between SSE intrinsics to their corresponding Arm/Aarch64 NEON versions +// +// This header file does not yet translate all of the SSE intrinsics. +// +// Contributors to this work are: +// John W. Ratcliff +// Brandon Rowlett +// Ken Fast +// Eric van Beurden +// Alexander Potylitsin +// Hasindu Gamaarachchi +// Jim Huang +// Mark Cheng +// Malcolm James MacLeod +// Devin Hussey (easyaspi314) +// Sebastian Pop +// Developer Ecosystem Engineering +// Danila Kutenin +// François Turban (JishinMaster) +// Pei-Hsuan Hung +// Yang-Hao Yuan +// Syoyo Fujita +// Brecht Van Lommel + +/* + * sse2neon is freely redistributable under the MIT License. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* Tunable configurations */ + +/* Enable precise implementation of math operations + * This would slow down the computation a bit, but gives consistent result with + * x86 SSE. (e.g. would solve a hole or NaN pixel in the rendering result) + */ +/* _mm_min|max_ps|ss|pd|sd */ +#ifndef SSE2NEON_PRECISE_MINMAX +#define SSE2NEON_PRECISE_MINMAX (0) +#endif +/* _mm_rcp_ps and _mm_div_ps */ +#ifndef SSE2NEON_PRECISE_DIV +#define SSE2NEON_PRECISE_DIV (0) +#endif +/* _mm_sqrt_ps and _mm_rsqrt_ps */ +#ifndef SSE2NEON_PRECISE_SQRT +#define SSE2NEON_PRECISE_SQRT (0) +#endif +/* _mm_dp_pd */ +#ifndef SSE2NEON_PRECISE_DP +#define SSE2NEON_PRECISE_DP (0) +#endif + +/* compiler specific definitions */ +#if defined(__GNUC__) || defined(__clang__) +#pragma push_macro("FORCE_INLINE") +#pragma push_macro("ALIGN_STRUCT") +#define FORCE_INLINE static inline __attribute__((always_inline)) +#define ALIGN_STRUCT(x) __attribute__((aligned(x))) +#define _sse2neon_likely(x) __builtin_expect(!!(x), 1) +#define _sse2neon_unlikely(x) __builtin_expect(!!(x), 0) +#else /* non-GNU / non-clang compilers */ +#warning "Macro name collisions may happen with unsupported compiler." +#ifndef FORCE_INLINE +#define FORCE_INLINE static inline +#endif +#ifndef ALIGN_STRUCT +#define ALIGN_STRUCT(x) __declspec(align(x)) +#endif +#define _sse2neon_likely(x) (x) +#define _sse2neon_unlikely(x) (x) +#endif + +#include +#include + +/* Architecture-specific build options */ +/* FIXME: #pragma GCC push_options is only available on GCC */ +#if defined(__GNUC__) +#if defined(__arm__) && __ARM_ARCH == 7 +/* According to ARM C Language Extensions Architecture specification, + * __ARM_NEON is defined to a value indicating the Advanced SIMD (NEON) + * architecture supported. + */ +#if !defined(__ARM_NEON) || !defined(__ARM_NEON__) +#error "You must enable NEON instructions (e.g. -mfpu=neon) to use SSE2NEON." +#endif +#if !defined(__clang__) +#pragma GCC push_options +#pragma GCC target("fpu=neon") +#endif +#elif defined(__aarch64__) +#if !defined(__clang__) +#pragma GCC push_options +#pragma GCC target("+simd") +#endif +#else +#error "Unsupported target. Must be either ARMv7-A+NEON or ARMv8-A." 
+#endif +#endif + +#include + +/* Rounding functions require either Aarch64 instructions or libm failback */ +#if !defined(__aarch64__) +#include +#endif + +/* "__has_builtin" can be used to query support for built-in functions + * provided by gcc/clang and other compilers that support it. + */ +#ifndef __has_builtin /* GCC prior to 10 or non-clang compilers */ +/* Compatibility with gcc <= 9 */ +#if defined(__GNUC__) && (__GNUC__ <= 9) +#define __has_builtin(x) HAS##x +#define HAS__builtin_popcount 1 +#define HAS__builtin_popcountll 1 +#else +#define __has_builtin(x) 0 +#endif +#endif + +/** + * MACRO for shuffle parameter for _mm_shuffle_ps(). + * Argument fp3 is a digit[0123] that represents the fp from argument "b" + * of mm_shuffle_ps that will be placed in fp3 of result. fp2 is the same + * for fp2 in result. fp1 is a digit[0123] that represents the fp from + * argument "a" of mm_shuffle_ps that will be places in fp1 of result. + * fp0 is the same for fp0 of result. + */ +#define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \ + (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0))) + +/* Rounding mode macros. */ +#define _MM_FROUND_TO_NEAREST_INT 0x00 +#define _MM_FROUND_TO_NEG_INF 0x01 +#define _MM_FROUND_TO_POS_INF 0x02 +#define _MM_FROUND_TO_ZERO 0x03 +#define _MM_FROUND_CUR_DIRECTION 0x04 +#define _MM_FROUND_NO_EXC 0x08 +#define _MM_FROUND_RAISE_EXC 0x00 +#define _MM_FROUND_NINT (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC) +#define _MM_FROUND_FLOOR (_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC) +#define _MM_FROUND_CEIL (_MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC) +#define _MM_FROUND_TRUNC (_MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC) +#define _MM_FROUND_RINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC) +#define _MM_FROUND_NEARBYINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC) +#define _MM_ROUND_NEAREST 0x0000 +#define _MM_ROUND_DOWN 0x2000 +#define _MM_ROUND_UP 0x4000 +#define _MM_ROUND_TOWARD_ZERO 0x6000 +/* Flush zero mode macros. */ +#define _MM_FLUSH_ZERO_MASK 0x8000 +#define _MM_FLUSH_ZERO_ON 0x8000 +#define _MM_FLUSH_ZERO_OFF 0x0000 +/* Denormals are zeros mode macros. */ +#define _MM_DENORMALS_ZERO_MASK 0x0040 +#define _MM_DENORMALS_ZERO_ON 0x0040 +#define _MM_DENORMALS_ZERO_OFF 0x0000 + +/* indicate immediate constant argument in a given range */ +#define __constrange(a, b) const + +/* A few intrinsics accept traditional data types like ints or floats, but + * most operate on data types that are specific to SSE. + * If a vector type ends in d, it contains doubles, and if it does not have + * a suffix, it contains floats. An integer vector type can contain any type + * of integer, from chars to shorts to unsigned long longs. + */ +typedef int64x1_t __m64; +typedef float32x4_t __m128; /* 128-bit vector containing 4 floats */ +// On ARM 32-bit architecture, the float64x2_t is not supported. +// The data type __m128d should be represented in a different way for related +// intrinsic conversion. 
+#if defined(__aarch64__) +typedef float64x2_t __m128d; /* 128-bit vector containing 2 doubles */ +#else +typedef float32x4_t __m128d; +#endif +typedef int64x2_t __m128i; /* 128-bit vector containing integers */ + +// __int64 is defined in the Intrinsics Guide which maps to different datatype +// in different data model +#if !(defined(_WIN32) || defined(_WIN64) || defined(__int64)) +#if (defined(__x86_64__) || defined(__i386__)) +#define __int64 long long +#else +#define __int64 int64_t +#endif +#endif + +/* type-safe casting between types */ + +#define vreinterpretq_m128_f16(x) vreinterpretq_f32_f16(x) +#define vreinterpretq_m128_f32(x) (x) +#define vreinterpretq_m128_f64(x) vreinterpretq_f32_f64(x) + +#define vreinterpretq_m128_u8(x) vreinterpretq_f32_u8(x) +#define vreinterpretq_m128_u16(x) vreinterpretq_f32_u16(x) +#define vreinterpretq_m128_u32(x) vreinterpretq_f32_u32(x) +#define vreinterpretq_m128_u64(x) vreinterpretq_f32_u64(x) + +#define vreinterpretq_m128_s8(x) vreinterpretq_f32_s8(x) +#define vreinterpretq_m128_s16(x) vreinterpretq_f32_s16(x) +#define vreinterpretq_m128_s32(x) vreinterpretq_f32_s32(x) +#define vreinterpretq_m128_s64(x) vreinterpretq_f32_s64(x) + +#define vreinterpretq_f16_m128(x) vreinterpretq_f16_f32(x) +#define vreinterpretq_f32_m128(x) (x) +#define vreinterpretq_f64_m128(x) vreinterpretq_f64_f32(x) + +#define vreinterpretq_u8_m128(x) vreinterpretq_u8_f32(x) +#define vreinterpretq_u16_m128(x) vreinterpretq_u16_f32(x) +#define vreinterpretq_u32_m128(x) vreinterpretq_u32_f32(x) +#define vreinterpretq_u64_m128(x) vreinterpretq_u64_f32(x) + +#define vreinterpretq_s8_m128(x) vreinterpretq_s8_f32(x) +#define vreinterpretq_s16_m128(x) vreinterpretq_s16_f32(x) +#define vreinterpretq_s32_m128(x) vreinterpretq_s32_f32(x) +#define vreinterpretq_s64_m128(x) vreinterpretq_s64_f32(x) + +#define vreinterpretq_m128i_s8(x) vreinterpretq_s64_s8(x) +#define vreinterpretq_m128i_s16(x) vreinterpretq_s64_s16(x) +#define vreinterpretq_m128i_s32(x) vreinterpretq_s64_s32(x) +#define vreinterpretq_m128i_s64(x) (x) + +#define vreinterpretq_m128i_u8(x) vreinterpretq_s64_u8(x) +#define vreinterpretq_m128i_u16(x) vreinterpretq_s64_u16(x) +#define vreinterpretq_m128i_u32(x) vreinterpretq_s64_u32(x) +#define vreinterpretq_m128i_u64(x) vreinterpretq_s64_u64(x) + +#define vreinterpretq_f32_m128i(x) vreinterpretq_f32_s64(x) +#define vreinterpretq_f64_m128i(x) vreinterpretq_f64_s64(x) + +#define vreinterpretq_s8_m128i(x) vreinterpretq_s8_s64(x) +#define vreinterpretq_s16_m128i(x) vreinterpretq_s16_s64(x) +#define vreinterpretq_s32_m128i(x) vreinterpretq_s32_s64(x) +#define vreinterpretq_s64_m128i(x) (x) + +#define vreinterpretq_u8_m128i(x) vreinterpretq_u8_s64(x) +#define vreinterpretq_u16_m128i(x) vreinterpretq_u16_s64(x) +#define vreinterpretq_u32_m128i(x) vreinterpretq_u32_s64(x) +#define vreinterpretq_u64_m128i(x) vreinterpretq_u64_s64(x) + +#define vreinterpret_m64_s8(x) vreinterpret_s64_s8(x) +#define vreinterpret_m64_s16(x) vreinterpret_s64_s16(x) +#define vreinterpret_m64_s32(x) vreinterpret_s64_s32(x) +#define vreinterpret_m64_s64(x) (x) + +#define vreinterpret_m64_u8(x) vreinterpret_s64_u8(x) +#define vreinterpret_m64_u16(x) vreinterpret_s64_u16(x) +#define vreinterpret_m64_u32(x) vreinterpret_s64_u32(x) +#define vreinterpret_m64_u64(x) vreinterpret_s64_u64(x) + +#define vreinterpret_m64_f16(x) vreinterpret_s64_f16(x) +#define vreinterpret_m64_f32(x) vreinterpret_s64_f32(x) +#define vreinterpret_m64_f64(x) vreinterpret_s64_f64(x) + +#define vreinterpret_u8_m64(x) vreinterpret_u8_s64(x) 
+#define vreinterpret_u16_m64(x) vreinterpret_u16_s64(x) +#define vreinterpret_u32_m64(x) vreinterpret_u32_s64(x) +#define vreinterpret_u64_m64(x) vreinterpret_u64_s64(x) + +#define vreinterpret_s8_m64(x) vreinterpret_s8_s64(x) +#define vreinterpret_s16_m64(x) vreinterpret_s16_s64(x) +#define vreinterpret_s32_m64(x) vreinterpret_s32_s64(x) +#define vreinterpret_s64_m64(x) (x) + +#define vreinterpret_f32_m64(x) vreinterpret_f32_s64(x) + +#if defined(__aarch64__) +#define vreinterpretq_m128d_s32(x) vreinterpretq_f64_s32(x) +#define vreinterpretq_m128d_s64(x) vreinterpretq_f64_s64(x) + +#define vreinterpretq_m128d_u64(x) vreinterpretq_f64_u64(x) + +#define vreinterpretq_m128d_f32(x) vreinterpretq_f64_f32(x) +#define vreinterpretq_m128d_f64(x) (x) + +#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f64(x) + +#define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f64(x) +#define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f64(x) + +#define vreinterpretq_f64_m128d(x) (x) +#define vreinterpretq_f32_m128d(x) vreinterpretq_f32_f64(x) +#else +#define vreinterpretq_m128d_s32(x) vreinterpretq_f32_s32(x) +#define vreinterpretq_m128d_s64(x) vreinterpretq_f32_s64(x) + +#define vreinterpretq_m128d_u32(x) vreinterpretq_f32_u32(x) +#define vreinterpretq_m128d_u64(x) vreinterpretq_f32_u64(x) + +#define vreinterpretq_m128d_f32(x) (x) + +#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f32(x) + +#define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f32(x) +#define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f32(x) + +#define vreinterpretq_f32_m128d(x) (x) +#endif + +// A struct is defined in this header file called 'SIMDVec' which can be used +// by applications which attempt to access the contents of an __m128 struct +// directly. It is important to note that accessing the __m128 struct directly +// is bad coding practice by Microsoft: @see: +// https://docs.microsoft.com/en-us/cpp/cpp/m128 +// +// However, some legacy source code may try to access the contents of an __m128 +// struct directly so the developer can use the SIMDVec as an alias for it. Any +// casting must be done manually by the developer, as you cannot cast or +// otherwise alias the base NEON data type for intrinsic operations. +// +// union intended to allow direct access to an __m128 variable using the names +// that the MSVC compiler provides. This union should really only be used when +// trying to access the members of the vector as integer values. GCC/clang +// allow native access to the float members through a simple array access +// operator (in C since 4.6, in C++ since 4.8). +// +// Ideally direct accesses to SIMD vectors should not be used since it can cause +// a performance hit. If it really is needed however, the original __m128 +// variable can be aliased with a pointer to this union and used to access +// individual components. The use of this union should be hidden behind a macro +// that is used throughout the codebase to access the members instead of always +// declaring this type of variable. +typedef union ALIGN_STRUCT(16) SIMDVec { + float m128_f32[4]; // as floats - DON'T USE. Added for convenience. + int8_t m128_i8[16]; // as signed 8-bit integers. + int16_t m128_i16[8]; // as signed 16-bit integers. + int32_t m128_i32[4]; // as signed 32-bit integers. + int64_t m128_i64[2]; // as signed 64-bit integers. + uint8_t m128_u8[16]; // as unsigned 8-bit integers. + uint16_t m128_u16[8]; // as unsigned 16-bit integers. + uint32_t m128_u32[4]; // as unsigned 32-bit integers. 
+ uint64_t m128_u64[2]; // as unsigned 64-bit integers. +} SIMDVec; + +// casting using SIMDVec +#define vreinterpretq_nth_u64_m128i(x, n) (((SIMDVec *) &x)->m128_u64[n]) +#define vreinterpretq_nth_u32_m128i(x, n) (((SIMDVec *) &x)->m128_u32[n]) +#define vreinterpretq_nth_u8_m128i(x, n) (((SIMDVec *) &x)->m128_u8[n]) + +/* SSE macros */ +#define _MM_GET_FLUSH_ZERO_MODE _sse2neon_mm_get_flush_zero_mode +#define _MM_SET_FLUSH_ZERO_MODE _sse2neon_mm_set_flush_zero_mode +#define _MM_GET_DENORMALS_ZERO_MODE _sse2neon_mm_get_denormals_zero_mode +#define _MM_SET_DENORMALS_ZERO_MODE _sse2neon_mm_set_denormals_zero_mode + +// Function declaration +// SSE +FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE(); +FORCE_INLINE __m128 _mm_move_ss(__m128, __m128); +FORCE_INLINE __m128 _mm_or_ps(__m128, __m128); +FORCE_INLINE __m128 _mm_set_ps1(float); +FORCE_INLINE __m128 _mm_setzero_ps(void); +// SSE2 +FORCE_INLINE __m128i _mm_and_si128(__m128i, __m128i); +FORCE_INLINE __m128i _mm_castps_si128(__m128); +FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i, __m128i); +FORCE_INLINE __m128i _mm_cvtps_epi32(__m128); +FORCE_INLINE __m128d _mm_move_sd(__m128d, __m128d); +FORCE_INLINE __m128i _mm_or_si128(__m128i, __m128i); +FORCE_INLINE __m128i _mm_set_epi32(int, int, int, int); +FORCE_INLINE __m128i _mm_set_epi64x(int64_t, int64_t); +FORCE_INLINE __m128d _mm_set_pd(double, double); +FORCE_INLINE __m128i _mm_set1_epi32(int); +FORCE_INLINE __m128i _mm_setzero_si128(); +// SSE4.1 +FORCE_INLINE __m128d _mm_ceil_pd(__m128d); +FORCE_INLINE __m128 _mm_ceil_ps(__m128); +FORCE_INLINE __m128d _mm_floor_pd(__m128d); +FORCE_INLINE __m128 _mm_floor_ps(__m128); +FORCE_INLINE __m128d _mm_round_pd(__m128d, int); +FORCE_INLINE __m128 _mm_round_ps(__m128, int); +// SSE4.2 +FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t, uint8_t); + +/* Backwards compatibility for compilers with lack of specific type support */ + +// Older gcc does not define vld1q_u8_x4 type +#if defined(__GNUC__) && !defined(__clang__) && \ + ((__GNUC__ <= 10 && defined(__arm__)) || \ + (__GNUC__ == 10 && __GNUC_MINOR__ < 3 && defined(__aarch64__)) || \ + (__GNUC__ <= 9 && defined(__aarch64__))) +FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p) +{ + uint8x16x4_t ret; + ret.val[0] = vld1q_u8(p + 0); + ret.val[1] = vld1q_u8(p + 16); + ret.val[2] = vld1q_u8(p + 32); + ret.val[3] = vld1q_u8(p + 48); + return ret; +} +#else +// Wraps vld1q_u8_x4 +FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p) +{ + return vld1q_u8_x4(p); +} +#endif + +/* Function Naming Conventions + * The naming convention of SSE intrinsics is straightforward. A generic SSE + * intrinsic function is given as follows: + * _mm__ + * + * The parts of this format are given as follows: + * 1. describes the operation performed by the intrinsic + * 2. identifies the data type of the function's primary arguments + * + * This last part, , is a little complicated. 
It identifies the + * content of the input values, and can be set to any of the following values: + * + ps - vectors contain floats (ps stands for packed single-precision) + * + pd - vectors cantain doubles (pd stands for packed double-precision) + * + epi8/epi16/epi32/epi64 - vectors contain 8-bit/16-bit/32-bit/64-bit + * signed integers + * + epu8/epu16/epu32/epu64 - vectors contain 8-bit/16-bit/32-bit/64-bit + * unsigned integers + * + si128 - unspecified 128-bit vector or 256-bit vector + * + m128/m128i/m128d - identifies input vector types when they are different + * than the type of the returned vector + * + * For example, _mm_setzero_ps. The _mm implies that the function returns + * a 128-bit vector. The _ps at the end implies that the argument vectors + * contain floats. + * + * A complete example: Byte Shuffle - pshufb (_mm_shuffle_epi8) + * // Set packed 16-bit integers. 128 bits, 8 short, per 16 bits + * __m128i v_in = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8); + * // Set packed 8-bit integers + * // 128 bits, 16 chars, per 8 bits + * __m128i v_perm = _mm_setr_epi8(1, 0, 2, 3, 8, 9, 10, 11, + * 4, 5, 12, 13, 6, 7, 14, 15); + * // Shuffle packed 8-bit integers + * __m128i v_out = _mm_shuffle_epi8(v_in, v_perm); // pshufb + * + * Data (Number, Binary, Byte Index): + +------+------+-------------+------+------+-------------+ + | 1 | 2 | 3 | 4 | Number + +------+------+------+------+------+------+------+------+ + | 0000 | 0001 | 0000 | 0010 | 0000 | 0011 | 0000 | 0100 | Binary + +------+------+------+------+------+------+------+------+ + | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | Index + +------+------+------+------+------+------+------+------+ + + +------+------+------+------+------+------+------+------+ + | 5 | 6 | 7 | 8 | Number + +------+------+------+------+------+------+------+------+ + | 0000 | 0101 | 0000 | 0110 | 0000 | 0111 | 0000 | 1000 | Binary + +------+------+------+------+------+------+------+------+ + | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | Index + +------+------+------+------+------+------+------+------+ + * Index (Byte Index): + +------+------+------+------+------+------+------+------+ + | 1 | 0 | 2 | 3 | 8 | 9 | 10 | 11 | + +------+------+------+------+------+------+------+------+ + + +------+------+------+------+------+------+------+------+ + | 4 | 5 | 12 | 13 | 6 | 7 | 14 | 15 | + +------+------+------+------+------+------+------+------+ + * Result: + +------+------+------+------+------+------+------+------+ + | 1 | 0 | 2 | 3 | 8 | 9 | 10 | 11 | Index + +------+------+------+------+------+------+------+------+ + | 0001 | 0000 | 0000 | 0010 | 0000 | 0101 | 0000 | 0110 | Binary + +------+------+------+------+------+------+------+------+ + | 256 | 2 | 5 | 6 | Number + +------+------+------+------+------+------+------+------+ + + +------+------+------+------+------+------+------+------+ + | 4 | 5 | 12 | 13 | 6 | 7 | 14 | 15 | Index + +------+------+------+------+------+------+------+------+ + | 0000 | 0011 | 0000 | 0111 | 0000 | 0100 | 0000 | 1000 | Binary + +------+------+------+------+------+------+------+------+ + | 3 | 7 | 4 | 8 | Number + +------+------+------+------+------+------+-------------+ + */ + +/* Constants for use with _mm_prefetch. 
*/ +enum _mm_hint { + _MM_HINT_NTA = 0, /* load data to L1 and L2 cache, mark it as NTA */ + _MM_HINT_T0 = 1, /* load data to L1 and L2 cache */ + _MM_HINT_T1 = 2, /* load data to L2 cache only */ + _MM_HINT_T2 = 3, /* load data to L2 cache only, mark it as NTA */ + _MM_HINT_ENTA = 4, /* exclusive version of _MM_HINT_NTA */ + _MM_HINT_ET0 = 5, /* exclusive version of _MM_HINT_T0 */ + _MM_HINT_ET1 = 6, /* exclusive version of _MM_HINT_T1 */ + _MM_HINT_ET2 = 7 /* exclusive version of _MM_HINT_T2 */ +}; + +// The bit field mapping to the FPCR(floating-point control register) +typedef struct { + uint16_t res0; + uint8_t res1 : 6; + uint8_t bit22 : 1; + uint8_t bit23 : 1; + uint8_t bit24 : 1; + uint8_t res2 : 7; +#if defined(__aarch64__) + uint32_t res3; +#endif +} fpcr_bitfield; + +// Takes the upper 64 bits of a and places it in the low end of the result +// Takes the lower 64 bits of b and places it into the high end of the result. +FORCE_INLINE __m128 _mm_shuffle_ps_1032(__m128 a, __m128 b) +{ + float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a)); + float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b)); + return vreinterpretq_m128_f32(vcombine_f32(a32, b10)); +} + +// takes the lower two 32-bit values from a and swaps them and places in high +// end of result takes the higher two 32 bit values from b and swaps them and +// places in low end of result. +FORCE_INLINE __m128 _mm_shuffle_ps_2301(__m128 a, __m128 b) +{ + float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a))); + float32x2_t b23 = vrev64_f32(vget_high_f32(vreinterpretq_f32_m128(b))); + return vreinterpretq_m128_f32(vcombine_f32(a01, b23)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_0321(__m128 a, __m128 b) +{ + float32x2_t a21 = vget_high_f32( + vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3)); + float32x2_t b03 = vget_low_f32( + vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3)); + return vreinterpretq_m128_f32(vcombine_f32(a21, b03)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_2103(__m128 a, __m128 b) +{ + float32x2_t a03 = vget_low_f32( + vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3)); + float32x2_t b21 = vget_high_f32( + vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3)); + return vreinterpretq_m128_f32(vcombine_f32(a03, b21)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_1010(__m128 a, __m128 b) +{ + float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a)); + float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b)); + return vreinterpretq_m128_f32(vcombine_f32(a10, b10)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_1001(__m128 a, __m128 b) +{ + float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a))); + float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b)); + return vreinterpretq_m128_f32(vcombine_f32(a01, b10)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_0101(__m128 a, __m128 b) +{ + float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a))); + float32x2_t b01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(b))); + return vreinterpretq_m128_f32(vcombine_f32(a01, b01)); +} + +// keeps the low 64 bits of b in the low and puts the high 64 bits of a in the +// high +FORCE_INLINE __m128 _mm_shuffle_ps_3210(__m128 a, __m128 b) +{ + float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a)); + float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b)); + return vreinterpretq_m128_f32(vcombine_f32(a10, b32)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_0011(__m128 a, __m128 b) +{ + float32x2_t a11 = 
vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 1); + float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); + return vreinterpretq_m128_f32(vcombine_f32(a11, b00)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_0022(__m128 a, __m128 b) +{ + float32x2_t a22 = + vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0); + float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); + return vreinterpretq_m128_f32(vcombine_f32(a22, b00)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_2200(__m128 a, __m128 b) +{ + float32x2_t a00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 0); + float32x2_t b22 = + vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(b)), 0); + return vreinterpretq_m128_f32(vcombine_f32(a00, b22)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_3202(__m128 a, __m128 b) +{ + float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); + float32x2_t a22 = + vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0); + float32x2_t a02 = vset_lane_f32(a0, a22, 1); /* TODO: use vzip ?*/ + float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b)); + return vreinterpretq_m128_f32(vcombine_f32(a02, b32)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_1133(__m128 a, __m128 b) +{ + float32x2_t a33 = + vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 1); + float32x2_t b11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 1); + return vreinterpretq_m128_f32(vcombine_f32(a33, b11)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_2010(__m128 a, __m128 b) +{ + float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a)); + float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2); + float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); + float32x2_t b20 = vset_lane_f32(b2, b00, 1); + return vreinterpretq_m128_f32(vcombine_f32(a10, b20)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_2001(__m128 a, __m128 b) +{ + float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a))); + float32_t b2 = vgetq_lane_f32(b, 2); + float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); + float32x2_t b20 = vset_lane_f32(b2, b00, 1); + return vreinterpretq_m128_f32(vcombine_f32(a01, b20)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_2032(__m128 a, __m128 b) +{ + float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a)); + float32_t b2 = vgetq_lane_f32(b, 2); + float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); + float32x2_t b20 = vset_lane_f32(b2, b00, 1); + return vreinterpretq_m128_f32(vcombine_f32(a32, b20)); +} + +// Kahan summation for accurate summation of floating-point numbers. +// http://blog.zachbjornson.com/2019/08/11/fast-float-summation.html +FORCE_INLINE void _sse2neon_kadd_f32(float *sum, float *c, float y) +{ + y -= *c; + float t = *sum + y; + *c = (t - *sum) - y; + *sum = t; +} + +#if defined(__ARM_FEATURE_CRYPTO) +// Wraps vmull_p64 +FORCE_INLINE uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b) +{ + poly64_t a = vget_lane_p64(vreinterpret_p64_u64(_a), 0); + poly64_t b = vget_lane_p64(vreinterpret_p64_u64(_b), 0); + return vreinterpretq_u64_p128(vmull_p64(a, b)); +} +#else // ARMv7 polyfill +// ARMv7/some A64 lacks vmull_p64, but it has vmull_p8. +// +// vmull_p8 calculates 8 8-bit->16-bit polynomial multiplies, but we need a +// 64-bit->128-bit polynomial multiply. +// +// It needs some work and is somewhat slow, but it is still faster than all +// known scalar methods. 
+// +// Algorithm adapted to C from +// https://www.workofard.com/2017/07/ghash-for-low-end-cores/, which is adapted +// from "Fast Software Polynomial Multiplication on ARM Processors Using the +// NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and Ricardo Dahab +// (https://hal.inria.fr/hal-01506572) +static uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b) +{ + poly8x8_t a = vreinterpret_p8_u64(_a); + poly8x8_t b = vreinterpret_p8_u64(_b); + + // Masks + uint8x16_t k48_32 = vcombine_u8(vcreate_u8(0x0000ffffffffffff), + vcreate_u8(0x00000000ffffffff)); + uint8x16_t k16_00 = vcombine_u8(vcreate_u8(0x000000000000ffff), + vcreate_u8(0x0000000000000000)); + + // Do the multiplies, rotating with vext to get all combinations + uint8x16_t d = vreinterpretq_u8_p16(vmull_p8(a, b)); // D = A0 * B0 + uint8x16_t e = + vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 1))); // E = A0 * B1 + uint8x16_t f = + vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 1), b)); // F = A1 * B0 + uint8x16_t g = + vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 2))); // G = A0 * B2 + uint8x16_t h = + vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 2), b)); // H = A2 * B0 + uint8x16_t i = + vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 3))); // I = A0 * B3 + uint8x16_t j = + vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 3), b)); // J = A3 * B0 + uint8x16_t k = + vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 4))); // L = A0 * B4 + + // Add cross products + uint8x16_t l = veorq_u8(e, f); // L = E + F + uint8x16_t m = veorq_u8(g, h); // M = G + H + uint8x16_t n = veorq_u8(i, j); // N = I + J + + // Interleave. Using vzip1 and vzip2 prevents Clang from emitting TBL + // instructions. +#if defined(__aarch64__) + uint8x16_t lm_p0 = vreinterpretq_u8_u64( + vzip1q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m))); + uint8x16_t lm_p1 = vreinterpretq_u8_u64( + vzip2q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m))); + uint8x16_t nk_p0 = vreinterpretq_u8_u64( + vzip1q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k))); + uint8x16_t nk_p1 = vreinterpretq_u8_u64( + vzip2q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k))); +#else + uint8x16_t lm_p0 = vcombine_u8(vget_low_u8(l), vget_low_u8(m)); + uint8x16_t lm_p1 = vcombine_u8(vget_high_u8(l), vget_high_u8(m)); + uint8x16_t nk_p0 = vcombine_u8(vget_low_u8(n), vget_low_u8(k)); + uint8x16_t nk_p1 = vcombine_u8(vget_high_u8(n), vget_high_u8(k)); +#endif + // t0 = (L) (P0 + P1) << 8 + // t1 = (M) (P2 + P3) << 16 + uint8x16_t t0t1_tmp = veorq_u8(lm_p0, lm_p1); + uint8x16_t t0t1_h = vandq_u8(lm_p1, k48_32); + uint8x16_t t0t1_l = veorq_u8(t0t1_tmp, t0t1_h); + + // t2 = (N) (P4 + P5) << 24 + // t3 = (K) (P6 + P7) << 32 + uint8x16_t t2t3_tmp = veorq_u8(nk_p0, nk_p1); + uint8x16_t t2t3_h = vandq_u8(nk_p1, k16_00); + uint8x16_t t2t3_l = veorq_u8(t2t3_tmp, t2t3_h); + + // De-interleave +#if defined(__aarch64__) + uint8x16_t t0 = vreinterpretq_u8_u64( + vuzp1q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h))); + uint8x16_t t1 = vreinterpretq_u8_u64( + vuzp2q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h))); + uint8x16_t t2 = vreinterpretq_u8_u64( + vuzp1q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h))); + uint8x16_t t3 = vreinterpretq_u8_u64( + vuzp2q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h))); +#else + uint8x16_t t1 = vcombine_u8(vget_high_u8(t0t1_l), vget_high_u8(t0t1_h)); + uint8x16_t t0 = vcombine_u8(vget_low_u8(t0t1_l), vget_low_u8(t0t1_h)); + uint8x16_t t3 = 
vcombine_u8(vget_high_u8(t2t3_l), vget_high_u8(t2t3_h)); + uint8x16_t t2 = vcombine_u8(vget_low_u8(t2t3_l), vget_low_u8(t2t3_h)); +#endif + // Shift the cross products + uint8x16_t t0_shift = vextq_u8(t0, t0, 15); // t0 << 8 + uint8x16_t t1_shift = vextq_u8(t1, t1, 14); // t1 << 16 + uint8x16_t t2_shift = vextq_u8(t2, t2, 13); // t2 << 24 + uint8x16_t t3_shift = vextq_u8(t3, t3, 12); // t3 << 32 + + // Accumulate the products + uint8x16_t cross1 = veorq_u8(t0_shift, t1_shift); + uint8x16_t cross2 = veorq_u8(t2_shift, t3_shift); + uint8x16_t mix = veorq_u8(d, cross1); + uint8x16_t r = veorq_u8(mix, cross2); + return vreinterpretq_u64_u8(r); +} +#endif // ARMv7 polyfill + +// C equivalent: +// __m128i _mm_shuffle_epi32_default(__m128i a, +// __constrange(0, 255) int imm) { +// __m128i ret; +// ret[0] = a[imm & 0x3]; ret[1] = a[(imm >> 2) & 0x3]; +// ret[2] = a[(imm >> 4) & 0x03]; ret[3] = a[(imm >> 6) & 0x03]; +// return ret; +// } +#define _mm_shuffle_epi32_default(a, imm) \ + __extension__({ \ + int32x4_t ret; \ + ret = vmovq_n_s32( \ + vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm) & (0x3))); \ + ret = vsetq_lane_s32( \ + vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 2) & 0x3), \ + ret, 1); \ + ret = vsetq_lane_s32( \ + vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 4) & 0x3), \ + ret, 2); \ + ret = vsetq_lane_s32( \ + vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 6) & 0x3), \ + ret, 3); \ + vreinterpretq_m128i_s32(ret); \ + }) + +// Takes the upper 64 bits of a and places it in the low end of the result +// Takes the lower 64 bits of a and places it into the high end of the result. +FORCE_INLINE __m128i _mm_shuffle_epi_1032(__m128i a) +{ + int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a)); + int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a)); + return vreinterpretq_m128i_s32(vcombine_s32(a32, a10)); +} + +// takes the lower two 32-bit values from a and swaps them and places in low end +// of result takes the higher two 32 bit values from a and swaps them and places +// in high end of result. 
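+// i.e. for a = {a0, a1, a2, a3} the result is {a1, a0, a3, a2}.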
+FORCE_INLINE __m128i _mm_shuffle_epi_2301(__m128i a) +{ + int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a))); + int32x2_t a23 = vrev64_s32(vget_high_s32(vreinterpretq_s32_m128i(a))); + return vreinterpretq_m128i_s32(vcombine_s32(a01, a23)); +} + +// rotates the least significant 32 bits into the most significant 32 bits, and +// shifts the rest down +FORCE_INLINE __m128i _mm_shuffle_epi_0321(__m128i a) +{ + return vreinterpretq_m128i_s32( + vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 1)); +} + +// rotates the most significant 32 bits into the least significant 32 bits, and +// shifts the rest up +FORCE_INLINE __m128i _mm_shuffle_epi_2103(__m128i a) +{ + return vreinterpretq_m128i_s32( + vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 3)); +} + +// gets the lower 64 bits of a, and places it in the upper 64 bits +// gets the lower 64 bits of a and places it in the lower 64 bits +FORCE_INLINE __m128i _mm_shuffle_epi_1010(__m128i a) +{ + int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a)); + return vreinterpretq_m128i_s32(vcombine_s32(a10, a10)); +} + +// gets the lower 64 bits of a, swaps the 0 and 1 elements, and places it in the +// lower 64 bits gets the lower 64 bits of a, and places it in the upper 64 bits +FORCE_INLINE __m128i _mm_shuffle_epi_1001(__m128i a) +{ + int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a))); + int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a)); + return vreinterpretq_m128i_s32(vcombine_s32(a01, a10)); +} + +// gets the lower 64 bits of a, swaps the 0 and 1 elements and places it in the +// upper 64 bits gets the lower 64 bits of a, swaps the 0 and 1 elements, and +// places it in the lower 64 bits +FORCE_INLINE __m128i _mm_shuffle_epi_0101(__m128i a) +{ + int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a))); + return vreinterpretq_m128i_s32(vcombine_s32(a01, a01)); +} + +FORCE_INLINE __m128i _mm_shuffle_epi_2211(__m128i a) +{ + int32x2_t a11 = vdup_lane_s32(vget_low_s32(vreinterpretq_s32_m128i(a)), 1); + int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0); + return vreinterpretq_m128i_s32(vcombine_s32(a11, a22)); +} + +FORCE_INLINE __m128i _mm_shuffle_epi_0122(__m128i a) +{ + int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0); + int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a))); + return vreinterpretq_m128i_s32(vcombine_s32(a22, a01)); +} + +FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a) +{ + int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a)); + int32x2_t a33 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 1); + return vreinterpretq_m128i_s32(vcombine_s32(a32, a33)); +} + +// FORCE_INLINE __m128i _mm_shuffle_epi32_splat(__m128i a, __constrange(0,255) +// int imm) +#if defined(__aarch64__) +#define _mm_shuffle_epi32_splat(a, imm) \ + __extension__({ \ + vreinterpretq_m128i_s32( \ + vdupq_laneq_s32(vreinterpretq_s32_m128i(a), (imm))); \ + }) +#else +#define _mm_shuffle_epi32_splat(a, imm) \ + __extension__({ \ + vreinterpretq_m128i_s32( \ + vdupq_n_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm)))); \ + }) +#endif + +// NEON does not support a general purpose permute intrinsic +// Selects four specific single-precision, floating-point values from a and b, +// based on the mask i. 
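+// The mask is normally written with _MM_SHUFFLE(i3, i2, i1, i0), which packs
+// four 2-bit lane indices: i0 and i1 select dst[0]/dst[1] from a, while i2
+// and i3 select dst[2]/dst[3] from b.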
+// +// C equivalent: +// __m128 _mm_shuffle_ps_default(__m128 a, __m128 b, +// __constrange(0, 255) int imm) { +// __m128 ret; +// ret[0] = a[imm & 0x3]; ret[1] = a[(imm >> 2) & 0x3]; +// ret[2] = b[(imm >> 4) & 0x03]; ret[3] = b[(imm >> 6) & 0x03]; +// return ret; +// } +// +// https://msdn.microsoft.com/en-us/library/vstudio/5f0858x0(v=vs.100).aspx +#define _mm_shuffle_ps_default(a, b, imm) \ + __extension__({ \ + float32x4_t ret; \ + ret = vmovq_n_f32( \ + vgetq_lane_f32(vreinterpretq_f32_m128(a), (imm) & (0x3))); \ + ret = vsetq_lane_f32( \ + vgetq_lane_f32(vreinterpretq_f32_m128(a), ((imm) >> 2) & 0x3), \ + ret, 1); \ + ret = vsetq_lane_f32( \ + vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 4) & 0x3), \ + ret, 2); \ + ret = vsetq_lane_f32( \ + vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 6) & 0x3), \ + ret, 3); \ + vreinterpretq_m128_f32(ret); \ + }) + +// Shuffles the lower 4 signed or unsigned 16-bit integers in a as specified +// by imm. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/y41dkk37(v=vs.100) +// FORCE_INLINE __m128i _mm_shufflelo_epi16_function(__m128i a, +// __constrange(0,255) int +// imm) +#define _mm_shufflelo_epi16_function(a, imm) \ + __extension__({ \ + int16x8_t ret = vreinterpretq_s16_m128i(a); \ + int16x4_t lowBits = vget_low_s16(ret); \ + ret = vsetq_lane_s16(vget_lane_s16(lowBits, (imm) & (0x3)), ret, 0); \ + ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 2) & 0x3), ret, \ + 1); \ + ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 4) & 0x3), ret, \ + 2); \ + ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 6) & 0x3), ret, \ + 3); \ + vreinterpretq_m128i_s16(ret); \ + }) + +// Shuffles the upper 4 signed or unsigned 16-bit integers in a as specified +// by imm. +// https://msdn.microsoft.com/en-us/library/13ywktbs(v=vs.100).aspx +// FORCE_INLINE __m128i _mm_shufflehi_epi16_function(__m128i a, +// __constrange(0,255) int +// imm) +#define _mm_shufflehi_epi16_function(a, imm) \ + __extension__({ \ + int16x8_t ret = vreinterpretq_s16_m128i(a); \ + int16x4_t highBits = vget_high_s16(ret); \ + ret = vsetq_lane_s16(vget_lane_s16(highBits, (imm) & (0x3)), ret, 4); \ + ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 2) & 0x3), ret, \ + 5); \ + ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 4) & 0x3), ret, \ + 6); \ + ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 6) & 0x3), ret, \ + 7); \ + vreinterpretq_m128i_s16(ret); \ + }) + +/* MMX */ + +//_mm_empty is a no-op on arm +FORCE_INLINE void _mm_empty(void) {} + +/* SSE */ + +// Adds the four single-precision, floating-point values of a and b. +// +// r0 := a0 + b0 +// r1 := a1 + b1 +// r2 := a2 + b2 +// r3 := a3 + b3 +// +// https://msdn.microsoft.com/en-us/library/vstudio/c9848chc(v=vs.100).aspx +FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_f32( + vaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// adds the scalar single-precision floating point values of a and b. +// https://msdn.microsoft.com/en-us/library/be94x2y6(v=vs.100).aspx +FORCE_INLINE __m128 _mm_add_ss(__m128 a, __m128 b) +{ + float32_t b0 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 0); + float32x4_t value = vsetq_lane_f32(b0, vdupq_n_f32(0), 0); + // the upper values in the result must be the remnants of . + return vreinterpretq_m128_f32(vaddq_f32(a, value)); +} + +// Computes the bitwise AND of the four single-precision, floating-point values +// of a and b. 
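+// A common use is clearing the sign bits to take per-lane absolute values,
+// e.g. _mm_and_ps(x, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))).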
+// +// r0 := a0 & b0 +// r1 := a1 & b1 +// r2 := a2 & b2 +// r3 := a3 & b3 +// +// https://msdn.microsoft.com/en-us/library/vstudio/73ck1xc5(v=vs.100).aspx +FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_s32( + vandq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b))); +} + +// Computes the bitwise AND-NOT of the four single-precision, floating-point +// values of a and b. +// +// r0 := ~a0 & b0 +// r1 := ~a1 & b1 +// r2 := ~a2 & b2 +// r3 := ~a3 & b3 +// +// https://msdn.microsoft.com/en-us/library/vstudio/68h7wd02(v=vs.100).aspx +FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_s32( + vbicq_s32(vreinterpretq_s32_m128(b), + vreinterpretq_s32_m128(a))); // *NOTE* argument swap +} + +// Average packed unsigned 16-bit integers in a and b, and store the results in +// dst. +// +// FOR j := 0 to 3 +// i := j*16 +// dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1 +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_avg_pu16 +FORCE_INLINE __m64 _mm_avg_pu16(__m64 a, __m64 b) +{ + return vreinterpret_m64_u16( + vrhadd_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b))); +} + +// Average packed unsigned 8-bit integers in a and b, and store the results in +// dst. +// +// FOR j := 0 to 7 +// i := j*8 +// dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1 +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_avg_pu8 +FORCE_INLINE __m64 _mm_avg_pu8(__m64 a, __m64 b) +{ + return vreinterpret_m64_u8( + vrhadd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b))); +} + +// Compares for equality. +// https://msdn.microsoft.com/en-us/library/vstudio/36aectz5(v=vs.100).aspx +FORCE_INLINE __m128 _mm_cmpeq_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u32( + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Compares for equality. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/k423z28e(v=vs.100) +FORCE_INLINE __m128 _mm_cmpeq_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmpeq_ps(a, b)); +} + +// Compares for greater than or equal. +// https://msdn.microsoft.com/en-us/library/vstudio/fs813y2t(v=vs.100).aspx +FORCE_INLINE __m128 _mm_cmpge_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u32( + vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Compares for greater than or equal. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/kesh3ddc(v=vs.100) +FORCE_INLINE __m128 _mm_cmpge_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmpge_ps(a, b)); +} + +// Compares for greater than. +// +// r0 := (a0 > b0) ? 0xffffffff : 0x0 +// r1 := (a1 > b1) ? 0xffffffff : 0x0 +// r2 := (a2 > b2) ? 0xffffffff : 0x0 +// r3 := (a3 > b3) ? 0xffffffff : 0x0 +// +// https://msdn.microsoft.com/en-us/library/vstudio/11dy102s(v=vs.100).aspx +FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u32( + vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Compares for greater than. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/1xyyyy9e(v=vs.100) +FORCE_INLINE __m128 _mm_cmpgt_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmpgt_ps(a, b)); +} + +// Compares for less than or equal. +// +// r0 := (a0 <= b0) ? 0xffffffff : 0x0 +// r1 := (a1 <= b1) ? 0xffffffff : 0x0 +// r2 := (a2 <= b2) ? 0xffffffff : 0x0 +// r3 := (a3 <= b3) ? 
0xffffffff : 0x0 +// +// https://msdn.microsoft.com/en-us/library/vstudio/1s75w83z(v=vs.100).aspx +FORCE_INLINE __m128 _mm_cmple_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u32( + vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Compares for less than or equal. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/a7x0hbhw(v=vs.100) +FORCE_INLINE __m128 _mm_cmple_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmple_ps(a, b)); +} + +// Compares for less than +// https://msdn.microsoft.com/en-us/library/vstudio/f330yhc8(v=vs.100).aspx +FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u32( + vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Compares for less than +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/fy94wye7(v=vs.100) +FORCE_INLINE __m128 _mm_cmplt_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmplt_ps(a, b)); +} + +// Compares for inequality. +// https://msdn.microsoft.com/en-us/library/sf44thbx(v=vs.100).aspx +FORCE_INLINE __m128 _mm_cmpneq_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u32(vmvnq_u32( + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)))); +} + +// Compares for inequality. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/ekya8fh4(v=vs.100) +FORCE_INLINE __m128 _mm_cmpneq_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmpneq_ps(a, b)); +} + +// Compares for not greater than or equal. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/wsexys62(v=vs.100) +FORCE_INLINE __m128 _mm_cmpnge_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u32(vmvnq_u32( + vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)))); +} + +// Compares for not greater than or equal. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/fk2y80s8(v=vs.100) +FORCE_INLINE __m128 _mm_cmpnge_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmpnge_ps(a, b)); +} + +// Compares for not greater than. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/d0xh7w0s(v=vs.100) +FORCE_INLINE __m128 _mm_cmpngt_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u32(vmvnq_u32( + vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)))); +} + +// Compares for not greater than. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/z7x9ydwh(v=vs.100) +FORCE_INLINE __m128 _mm_cmpngt_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmpngt_ps(a, b)); +} + +// Compares for not less than or equal. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/6a330kxw(v=vs.100) +FORCE_INLINE __m128 _mm_cmpnle_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u32(vmvnq_u32( + vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)))); +} + +// Compares for not less than or equal. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/z7x9ydwh(v=vs.100) +FORCE_INLINE __m128 _mm_cmpnle_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmpnle_ps(a, b)); +} + +// Compares for not less than. 
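+// For NaN-free inputs this matches _mm_cmpge_ps; the two differ only for
+// unordered (NaN) comparisons, where NLT yields all-ones and GE yields zero.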
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/4686bbdw(v=vs.100) +FORCE_INLINE __m128 _mm_cmpnlt_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u32(vmvnq_u32( + vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)))); +} + +// Compares for not less than. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/56b9z2wf(v=vs.100) +FORCE_INLINE __m128 _mm_cmpnlt_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmpnlt_ps(a, b)); +} + +// Compares the four 32-bit floats in a and b to check if any values are NaN. +// Ordered compare between each value returns true for "orderable" and false for +// "not orderable" (NaN). +// https://msdn.microsoft.com/en-us/library/vstudio/0h9w00fx(v=vs.100).aspx see +// also: +// http://stackoverflow.com/questions/8627331/what-does-ordered-unordered-comparison-mean +// http://stackoverflow.com/questions/29349621/neon-isnanval-intrinsics +FORCE_INLINE __m128 _mm_cmpord_ps(__m128 a, __m128 b) +{ + // Note: NEON does not have ordered compare builtin + // Need to compare a eq a and b eq b to check for NaN + // Do AND of results to get final + uint32x4_t ceqaa = + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); + uint32x4_t ceqbb = + vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); + return vreinterpretq_m128_u32(vandq_u32(ceqaa, ceqbb)); +} + +// Compares for ordered. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/343t62da(v=vs.100) +FORCE_INLINE __m128 _mm_cmpord_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmpord_ps(a, b)); +} + +// Compares for unordered. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/khy6fk1t(v=vs.100) +FORCE_INLINE __m128 _mm_cmpunord_ps(__m128 a, __m128 b) +{ + uint32x4_t f32a = + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); + uint32x4_t f32b = + vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); + return vreinterpretq_m128_u32(vmvnq_u32(vandq_u32(f32a, f32b))); +} + +// Compares for unordered. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/2as2387b(v=vs.100) +FORCE_INLINE __m128 _mm_cmpunord_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmpunord_ps(a, b)); +} + +// Compares the lower single-precision floating point scalar values of a and b +// using an equality operation. : +// https://msdn.microsoft.com/en-us/library/93yx2h2b(v=vs.100).aspx +FORCE_INLINE int _mm_comieq_ss(__m128 a, __m128 b) +{ + uint32x4_t a_eq_b = + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); + return vgetq_lane_u32(a_eq_b, 0) & 0x1; +} + +// Compares the lower single-precision floating point scalar values of a and b +// using a greater than or equal operation. : +// https://msdn.microsoft.com/en-us/library/8t80des6(v=vs.100).aspx +FORCE_INLINE int _mm_comige_ss(__m128 a, __m128 b) +{ + uint32x4_t a_ge_b = + vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); + return vgetq_lane_u32(a_ge_b, 0) & 0x1; +} + +// Compares the lower single-precision floating point scalar values of a and b +// using a greater than operation. 
: +// https://msdn.microsoft.com/en-us/library/b0738e0t(v=vs.100).aspx +FORCE_INLINE int _mm_comigt_ss(__m128 a, __m128 b) +{ + uint32x4_t a_gt_b = + vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); + return vgetq_lane_u32(a_gt_b, 0) & 0x1; +} + +// Compares the lower single-precision floating point scalar values of a and b +// using a less than or equal operation. : +// https://msdn.microsoft.com/en-us/library/1w4t7c57(v=vs.90).aspx +FORCE_INLINE int _mm_comile_ss(__m128 a, __m128 b) +{ + uint32x4_t a_le_b = + vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); + return vgetq_lane_u32(a_le_b, 0) & 0x1; +} + +// Compares the lower single-precision floating point scalar values of a and b +// using a less than operation. : +// https://msdn.microsoft.com/en-us/library/2kwe606b(v=vs.90).aspx Important +// note!! The documentation on MSDN is incorrect! If either of the values is a +// NAN the docs say you will get a one, but in fact, it will return a zero!! +FORCE_INLINE int _mm_comilt_ss(__m128 a, __m128 b) +{ + uint32x4_t a_lt_b = + vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); + return vgetq_lane_u32(a_lt_b, 0) & 0x1; +} + +// Compares the lower single-precision floating point scalar values of a and b +// using an inequality operation. : +// https://msdn.microsoft.com/en-us/library/bafh5e0a(v=vs.90).aspx +FORCE_INLINE int _mm_comineq_ss(__m128 a, __m128 b) +{ + return !_mm_comieq_ss(a, b); +} + +// Convert packed signed 32-bit integers in b to packed single-precision +// (32-bit) floating-point elements, store the results in the lower 2 elements +// of dst, and copy the upper 2 packed elements from a to the upper elements of +// dst. +// +// dst[31:0] := Convert_Int32_To_FP32(b[31:0]) +// dst[63:32] := Convert_Int32_To_FP32(b[63:32]) +// dst[95:64] := a[95:64] +// dst[127:96] := a[127:96] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_pi2ps +FORCE_INLINE __m128 _mm_cvt_pi2ps(__m128 a, __m64 b) +{ + return vreinterpretq_m128_f32( + vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)), + vget_high_f32(vreinterpretq_f32_m128(a)))); +} + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed 32-bit integers, and store the results in dst. +// +// FOR j := 0 to 1 +// i := 32*j +// dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_ps2pi +FORCE_INLINE __m64 _mm_cvt_ps2pi(__m128 a) +{ +#if defined(__aarch64__) + return vreinterpret_m64_s32( + vget_low_s32(vcvtnq_s32_f32(vrndiq_f32(vreinterpretq_f32_m128(a))))); +#else + return vreinterpret_m64_s32(vcvt_s32_f32(vget_low_f32( + vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION))))); +#endif +} + +// Convert the signed 32-bit integer b to a single-precision (32-bit) +// floating-point element, store the result in the lower element of dst, and +// copy the upper 3 packed elements from a to the upper elements of dst. +// +// dst[31:0] := Convert_Int32_To_FP32(b[31:0]) +// dst[127:32] := a[127:32] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_si2ss +FORCE_INLINE __m128 _mm_cvt_si2ss(__m128 a, int b) +{ + return vreinterpretq_m128_f32( + vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0)); +} + +// Convert the lower single-precision (32-bit) floating-point element in a to a +// 32-bit integer, and store the result in dst. 
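+// The result is rounded according to the current rounding mode (round to
+// nearest even by default); use _mm_cvtt_ss2si for truncation.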
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_ss2si +FORCE_INLINE int _mm_cvt_ss2si(__m128 a) +{ +#if defined(__aarch64__) + return vgetq_lane_s32(vcvtnq_s32_f32(vrndiq_f32(vreinterpretq_f32_m128(a))), + 0); +#else + float32_t data = vgetq_lane_f32( + vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)), 0); + return (int32_t) data; +#endif +} + +// Convert packed 16-bit integers in a to packed single-precision (32-bit) +// floating-point elements, and store the results in dst. +// +// FOR j := 0 to 3 +// i := j*16 +// m := j*32 +// dst[m+31:m] := Convert_Int16_To_FP32(a[i+15:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi16_ps +FORCE_INLINE __m128 _mm_cvtpi16_ps(__m64 a) +{ + return vreinterpretq_m128_f32( + vcvtq_f32_s32(vmovl_s16(vreinterpret_s16_m64(a)))); +} + +// Convert packed 32-bit integers in b to packed single-precision (32-bit) +// floating-point elements, store the results in the lower 2 elements of dst, +// and copy the upper 2 packed elements from a to the upper elements of dst. +// +// dst[31:0] := Convert_Int32_To_FP32(b[31:0]) +// dst[63:32] := Convert_Int32_To_FP32(b[63:32]) +// dst[95:64] := a[95:64] +// dst[127:96] := a[127:96] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32_ps +FORCE_INLINE __m128 _mm_cvtpi32_ps(__m128 a, __m64 b) +{ + return vreinterpretq_m128_f32( + vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)), + vget_high_f32(vreinterpretq_f32_m128(a)))); +} + +// Convert packed signed 32-bit integers in a to packed single-precision +// (32-bit) floating-point elements, store the results in the lower 2 elements +// of dst, then covert the packed signed 32-bit integers in b to +// single-precision (32-bit) floating-point element, and store the results in +// the upper 2 elements of dst. +// +// dst[31:0] := Convert_Int32_To_FP32(a[31:0]) +// dst[63:32] := Convert_Int32_To_FP32(a[63:32]) +// dst[95:64] := Convert_Int32_To_FP32(b[31:0]) +// dst[127:96] := Convert_Int32_To_FP32(b[63:32]) +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32x2_ps +FORCE_INLINE __m128 _mm_cvtpi32x2_ps(__m64 a, __m64 b) +{ + return vreinterpretq_m128_f32(vcvtq_f32_s32( + vcombine_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b)))); +} + +// Convert the lower packed 8-bit integers in a to packed single-precision +// (32-bit) floating-point elements, and store the results in dst. +// +// FOR j := 0 to 3 +// i := j*8 +// m := j*32 +// dst[m+31:m] := Convert_Int8_To_FP32(a[i+7:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi8_ps +FORCE_INLINE __m128 _mm_cvtpi8_ps(__m64 a) +{ + return vreinterpretq_m128_f32(vcvtq_f32_s32( + vmovl_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_m64(a)))))); +} + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed 16-bit integers, and store the results in dst. Note: this intrinsic +// will generate 0x7FFF, rather than 0x8000, for input values between 0x7FFF and +// 0x7FFFFFFF. 
+// +// FOR j := 0 to 3 +// i := 16*j +// k := 32*j +// IF a[k+31:k] >= FP32(0x7FFF) && a[k+31:k] <= FP32(0x7FFFFFFF) +// dst[i+15:i] := 0x7FFF +// ELSE +// dst[i+15:i] := Convert_FP32_To_Int16(a[k+31:k]) +// FI +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pi16 +FORCE_INLINE __m64 _mm_cvtps_pi16(__m128 a) +{ + const __m128 i16Min = _mm_set_ps1((float) INT16_MIN); + const __m128 i16Max = _mm_set_ps1((float) INT16_MAX); + const __m128 i32Max = _mm_set_ps1((float) INT32_MAX); + const __m128i maxMask = _mm_castps_si128( + _mm_and_ps(_mm_cmpge_ps(a, i16Max), _mm_cmple_ps(a, i32Max))); + const __m128i betweenMask = _mm_castps_si128( + _mm_and_ps(_mm_cmpgt_ps(a, i16Min), _mm_cmplt_ps(a, i16Max))); + const __m128i minMask = _mm_cmpeq_epi32(_mm_or_si128(maxMask, betweenMask), + _mm_setzero_si128()); + __m128i max = _mm_and_si128(maxMask, _mm_set1_epi32(INT16_MAX)); + __m128i min = _mm_and_si128(minMask, _mm_set1_epi32(INT16_MIN)); + __m128i cvt = _mm_and_si128(betweenMask, _mm_cvtps_epi32(a)); + __m128i res32 = _mm_or_si128(_mm_or_si128(max, min), cvt); + return vreinterpret_m64_s16(vmovn_s32(vreinterpretq_s32_m128i(res32))); +} + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed 32-bit integers, and store the results in dst. +// +// FOR j := 0 to 1 +// i := 32*j +// dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pi32 +#define _mm_cvtps_pi32(a) _mm_cvt_ps2pi(a) + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed 8-bit integers, and store the results in lower 4 elements of dst. +// Note: this intrinsic will generate 0x7F, rather than 0x80, for input values +// between 0x7F and 0x7FFFFFFF. +// +// FOR j := 0 to 3 +// i := 8*j +// k := 32*j +// IF a[k+31:k] >= FP32(0x7F) && a[k+31:k] <= FP32(0x7FFFFFFF) +// dst[i+7:i] := 0x7F +// ELSE +// dst[i+7:i] := Convert_FP32_To_Int8(a[k+31:k]) +// FI +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pi8 +FORCE_INLINE __m64 _mm_cvtps_pi8(__m128 a) +{ + const __m128 i8Min = _mm_set_ps1((float) INT8_MIN); + const __m128 i8Max = _mm_set_ps1((float) INT8_MAX); + const __m128 i32Max = _mm_set_ps1((float) INT32_MAX); + const __m128i maxMask = _mm_castps_si128( + _mm_and_ps(_mm_cmpge_ps(a, i8Max), _mm_cmple_ps(a, i32Max))); + const __m128i betweenMask = _mm_castps_si128( + _mm_and_ps(_mm_cmpgt_ps(a, i8Min), _mm_cmplt_ps(a, i8Max))); + const __m128i minMask = _mm_cmpeq_epi32(_mm_or_si128(maxMask, betweenMask), + _mm_setzero_si128()); + __m128i max = _mm_and_si128(maxMask, _mm_set1_epi32(INT8_MAX)); + __m128i min = _mm_and_si128(minMask, _mm_set1_epi32(INT8_MIN)); + __m128i cvt = _mm_and_si128(betweenMask, _mm_cvtps_epi32(a)); + __m128i res32 = _mm_or_si128(_mm_or_si128(max, min), cvt); + int16x4_t res16 = vmovn_s32(vreinterpretq_s32_m128i(res32)); + int8x8_t res8 = vmovn_s16(vcombine_s16(res16, res16)); + static const uint32_t bitMask[2] = {0xFFFFFFFF, 0}; + int8x8_t mask = vreinterpret_s8_u32(vld1_u32(bitMask)); + + return vreinterpret_m64_s8(vorr_s8(vand_s8(mask, res8), vdup_n_s8(0))); +} + +// Convert packed unsigned 16-bit integers in a to packed single-precision +// (32-bit) floating-point elements, and store the results in dst. 
+// +// FOR j := 0 to 3 +// i := j*16 +// m := j*32 +// dst[m+31:m] := Convert_UInt16_To_FP32(a[i+15:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpu16_ps +FORCE_INLINE __m128 _mm_cvtpu16_ps(__m64 a) +{ + return vreinterpretq_m128_f32( + vcvtq_f32_u32(vmovl_u16(vreinterpret_u16_m64(a)))); +} + +// Convert the lower packed unsigned 8-bit integers in a to packed +// single-precision (32-bit) floating-point elements, and store the results in +// dst. +// +// FOR j := 0 to 3 +// i := j*8 +// m := j*32 +// dst[m+31:m] := Convert_UInt8_To_FP32(a[i+7:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpu8_ps +FORCE_INLINE __m128 _mm_cvtpu8_ps(__m64 a) +{ + return vreinterpretq_m128_f32(vcvtq_f32_u32( + vmovl_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_m64(a)))))); +} + +// Convert the signed 32-bit integer b to a single-precision (32-bit) +// floating-point element, store the result in the lower element of dst, and +// copy the upper 3 packed elements from a to the upper elements of dst. +// +// dst[31:0] := Convert_Int32_To_FP32(b[31:0]) +// dst[127:32] := a[127:32] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi32_ss +#define _mm_cvtsi32_ss(a, b) _mm_cvt_si2ss(a, b) + +// Convert the signed 64-bit integer b to a single-precision (32-bit) +// floating-point element, store the result in the lower element of dst, and +// copy the upper 3 packed elements from a to the upper elements of dst. +// +// dst[31:0] := Convert_Int64_To_FP32(b[63:0]) +// dst[127:32] := a[127:32] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64_ss +FORCE_INLINE __m128 _mm_cvtsi64_ss(__m128 a, int64_t b) +{ + return vreinterpretq_m128_f32( + vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0)); +} + +// Copy the lower single-precision (32-bit) floating-point element of a to dst. +// +// dst[31:0] := a[31:0] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_f32 +FORCE_INLINE float _mm_cvtss_f32(__m128 a) +{ + return vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); +} + +// Convert the lower single-precision (32-bit) floating-point element in a to a +// 32-bit integer, and store the result in dst. +// +// dst[31:0] := Convert_FP32_To_Int32(a[31:0]) +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_si32 +#define _mm_cvtss_si32(a) _mm_cvt_ss2si(a) + +// Convert the lower single-precision (32-bit) floating-point element in a to a +// 64-bit integer, and store the result in dst. +// +// dst[63:0] := Convert_FP32_To_Int64(a[31:0]) +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_si64 +FORCE_INLINE int64_t _mm_cvtss_si64(__m128 a) +{ +#if defined(__aarch64__) + return (int64_t) vgetq_lane_f32(vrndiq_f32(vreinterpretq_f32_m128(a)), 0); +#else + float32_t data = vgetq_lane_f32( + vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)), 0); + return (int64_t) data; +#endif +} + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed 32-bit integers with truncation, and store the results in dst. 
+// +// FOR j := 0 to 1 +// i := 32*j +// dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_ps2pi +FORCE_INLINE __m64 _mm_cvtt_ps2pi(__m128 a) +{ + return vreinterpret_m64_s32( + vget_low_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)))); +} + +// Convert the lower single-precision (32-bit) floating-point element in a to a +// 32-bit integer with truncation, and store the result in dst. +// +// dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0]) +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_ss2si +FORCE_INLINE int _mm_cvtt_ss2si(__m128 a) +{ + return vgetq_lane_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)), 0); +} + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed 32-bit integers with truncation, and store the results in dst. +// +// FOR j := 0 to 1 +// i := 32*j +// dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttps_pi32 +#define _mm_cvttps_pi32(a) _mm_cvtt_ps2pi(a) + +// Convert the lower single-precision (32-bit) floating-point element in a to a +// 32-bit integer with truncation, and store the result in dst. +// +// dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0]) +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttss_si32 +#define _mm_cvttss_si32(a) _mm_cvtt_ss2si(a) + +// Convert the lower single-precision (32-bit) floating-point element in a to a +// 64-bit integer with truncation, and store the result in dst. +// +// dst[63:0] := Convert_FP32_To_Int64_Truncate(a[31:0]) +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttss_si64 +FORCE_INLINE int64_t _mm_cvttss_si64(__m128 a) +{ + return (int64_t) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); +} + +// Divides the four single-precision, floating-point values of a and b. +// +// r0 := a0 / b0 +// r1 := a1 / b1 +// r2 := a2 / b2 +// r3 := a3 / b3 +// +// https://msdn.microsoft.com/en-us/library/edaw8147(v=vs.100).aspx +FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b) +{ +#if defined(__aarch64__) && !SSE2NEON_PRECISE_DIV + return vreinterpretq_m128_f32( + vdivq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +#else + float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(b)); + recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b))); +#if SSE2NEON_PRECISE_DIV + // Additional Netwon-Raphson iteration for accuracy + recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b))); +#endif + return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(a), recip)); +#endif +} + +// Divides the scalar single-precision floating point value of a by b. +// https://msdn.microsoft.com/en-us/library/4y73xa49(v=vs.100).aspx +FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b) +{ + float32_t value = + vgetq_lane_f32(vreinterpretq_f32_m128(_mm_div_ps(a, b)), 0); + return vreinterpretq_m128_f32( + vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0)); +} + +// Extract a 16-bit integer from a, selected with imm8, and store the result in +// the lower element of dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_extract_pi16 +#define _mm_extract_pi16(a, imm) \ + (int32_t) vget_lane_u16(vreinterpret_u16_m64(a), (imm)) + +// Free aligned memory that was allocated with _mm_malloc. 
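+// Typical pairing (illustrative):
+//   float *buf = (float *) _mm_malloc(256 * sizeof(float), 16);
+//   /* ... */
+//   _mm_free(buf);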
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_free +FORCE_INLINE void _mm_free(void *addr) +{ + free(addr); +} + +// Macro: Get the flush zero bits from the MXCSR control and status register. +// The flush zero may contain any of the following flags: _MM_FLUSH_ZERO_ON or +// _MM_FLUSH_ZERO_OFF +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_GET_FLUSH_ZERO_MODE +FORCE_INLINE unsigned int _sse2neon_mm_get_flush_zero_mode() +{ + union { + fpcr_bitfield field; +#if defined(__aarch64__) + uint64_t value; +#else + uint32_t value; +#endif + } r; + +#if defined(__aarch64__) + asm volatile("mrs %0, FPCR" : "=r"(r.value)); /* read */ +#else + asm volatile("vmrs %0, FPSCR" : "=r"(r.value)); /* read */ +#endif + + return r.field.bit24 ? _MM_FLUSH_ZERO_ON : _MM_FLUSH_ZERO_OFF; +} + +// Macro: Get the rounding mode bits from the MXCSR control and status register. +// The rounding mode may contain any of the following flags: _MM_ROUND_NEAREST, +// _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_GET_ROUNDING_MODE +FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE() +{ + union { + fpcr_bitfield field; +#if defined(__aarch64__) + uint64_t value; +#else + uint32_t value; +#endif + } r; + +#if defined(__aarch64__) + asm volatile("mrs %0, FPCR" : "=r"(r.value)); /* read */ +#else + asm volatile("vmrs %0, FPSCR" : "=r"(r.value)); /* read */ +#endif + + if (r.field.bit22) { + return r.field.bit23 ? _MM_ROUND_TOWARD_ZERO : _MM_ROUND_UP; + } else { + return r.field.bit23 ? _MM_ROUND_DOWN : _MM_ROUND_NEAREST; + } +} + +// Copy a to dst, and insert the 16-bit integer i into dst at the location +// specified by imm8. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_insert_pi16 +#define _mm_insert_pi16(a, b, imm) \ + __extension__({ \ + vreinterpret_m64_s16( \ + vset_lane_s16((b), vreinterpret_s16_m64(a), (imm))); \ + }) + +// Loads four single-precision, floating-point values. +// https://msdn.microsoft.com/en-us/library/vstudio/zzd50xxt(v=vs.100).aspx +FORCE_INLINE __m128 _mm_load_ps(const float *p) +{ + return vreinterpretq_m128_f32(vld1q_f32(p)); +} + +// Load a single-precision (32-bit) floating-point element from memory into all +// elements of dst. +// +// dst[31:0] := MEM[mem_addr+31:mem_addr] +// dst[63:32] := MEM[mem_addr+31:mem_addr] +// dst[95:64] := MEM[mem_addr+31:mem_addr] +// dst[127:96] := MEM[mem_addr+31:mem_addr] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_ps1 +#define _mm_load_ps1 _mm_load1_ps + +// Loads an single - precision, floating - point value into the low word and +// clears the upper three words. +// https://msdn.microsoft.com/en-us/library/548bb9h4%28v=vs.90%29.aspx +FORCE_INLINE __m128 _mm_load_ss(const float *p) +{ + return vreinterpretq_m128_f32(vsetq_lane_f32(*p, vdupq_n_f32(0), 0)); +} + +// Loads a single single-precision, floating-point value, copying it into all +// four words +// https://msdn.microsoft.com/en-us/library/vstudio/5cdkf716(v=vs.100).aspx +FORCE_INLINE __m128 _mm_load1_ps(const float *p) +{ + return vreinterpretq_m128_f32(vld1q_dup_f32(p)); +} + +// Sets the upper two single-precision, floating-point values with 64 +// bits of data loaded from the address p; the lower two values are passed +// through from a. 
+// +// r0 := a0 +// r1 := a1 +// r2 := *p0 +// r3 := *p1 +// +// https://msdn.microsoft.com/en-us/library/w92wta0x(v%3dvs.100).aspx +FORCE_INLINE __m128 _mm_loadh_pi(__m128 a, __m64 const *p) +{ + return vreinterpretq_m128_f32( + vcombine_f32(vget_low_f32(a), vld1_f32((const float32_t *) p))); +} + +// Sets the lower two single-precision, floating-point values with 64 +// bits of data loaded from the address p; the upper two values are passed +// through from a. +// +// Return Value +// r0 := *p0 +// r1 := *p1 +// r2 := a2 +// r3 := a3 +// +// https://msdn.microsoft.com/en-us/library/s57cyak2(v=vs.100).aspx +FORCE_INLINE __m128 _mm_loadl_pi(__m128 a, __m64 const *p) +{ + return vreinterpretq_m128_f32( + vcombine_f32(vld1_f32((const float32_t *) p), vget_high_f32(a))); +} + +// Load 4 single-precision (32-bit) floating-point elements from memory into dst +// in reverse order. mem_addr must be aligned on a 16-byte boundary or a +// general-protection exception may be generated. +// +// dst[31:0] := MEM[mem_addr+127:mem_addr+96] +// dst[63:32] := MEM[mem_addr+95:mem_addr+64] +// dst[95:64] := MEM[mem_addr+63:mem_addr+32] +// dst[127:96] := MEM[mem_addr+31:mem_addr] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadr_ps +FORCE_INLINE __m128 _mm_loadr_ps(const float *p) +{ + float32x4_t v = vrev64q_f32(vld1q_f32(p)); + return vreinterpretq_m128_f32(vextq_f32(v, v, 2)); +} + +// Loads four single-precision, floating-point values. +// https://msdn.microsoft.com/en-us/library/x1b16s7z%28v=vs.90%29.aspx +FORCE_INLINE __m128 _mm_loadu_ps(const float *p) +{ + // for neon, alignment doesn't matter, so _mm_load_ps and _mm_loadu_ps are + // equivalent for neon + return vreinterpretq_m128_f32(vld1q_f32(p)); +} + +// Load unaligned 16-bit integer from memory into the first element of dst. +// +// dst[15:0] := MEM[mem_addr+15:mem_addr] +// dst[MAX:16] := 0 +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si16 +FORCE_INLINE __m128i _mm_loadu_si16(const void *p) +{ + return vreinterpretq_m128i_s16( + vsetq_lane_s16(*(const int16_t *) p, vdupq_n_s16(0), 0)); +} + +// Load unaligned 64-bit integer from memory into the first element of dst. +// +// dst[63:0] := MEM[mem_addr+63:mem_addr] +// dst[MAX:64] := 0 +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si64 +FORCE_INLINE __m128i _mm_loadu_si64(const void *p) +{ + return vreinterpretq_m128i_s64( + vcombine_s64(vld1_s64((const int64_t *) p), vdup_n_s64(0))); +} + +// Allocate aligned blocks of memory. +// https://software.intel.com/en-us/ +// cpp-compiler-developer-guide-and-reference-allocating-and-freeing-aligned-memory-blocks +FORCE_INLINE void *_mm_malloc(size_t size, size_t align) +{ + void *ptr; + if (align == 1) + return malloc(size); + if (align == 2 || (sizeof(void *) == 8 && align == 4)) + align = sizeof(void *); + if (!posix_memalign(&ptr, align, size)) + return ptr; + return NULL; +} + +// Conditionally store 8-bit integer elements from a into memory using mask +// (elements are not stored when the highest bit is not set in the corresponding +// element) and a non-temporal memory hint. 
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskmove_si64 +FORCE_INLINE void _mm_maskmove_si64(__m64 a, __m64 mask, char *mem_addr) +{ + int8x8_t shr_mask = vshr_n_s8(vreinterpret_s8_m64(mask), 7); + __m128 b = _mm_load_ps((const float *) mem_addr); + int8x8_t masked = + vbsl_s8(vreinterpret_u8_s8(shr_mask), vreinterpret_s8_m64(a), + vreinterpret_s8_u64(vget_low_u64(vreinterpretq_u64_m128(b)))); + vst1_s8((int8_t *) mem_addr, masked); +} + +// Conditionally store 8-bit integer elements from a into memory using mask +// (elements are not stored when the highest bit is not set in the corresponding +// element) and a non-temporal memory hint. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_maskmovq +#define _m_maskmovq(a, mask, mem_addr) _mm_maskmove_si64(a, mask, mem_addr) + +// Compare packed signed 16-bit integers in a and b, and store packed maximum +// values in dst. +// +// FOR j := 0 to 3 +// i := j*16 +// dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pi16 +FORCE_INLINE __m64 _mm_max_pi16(__m64 a, __m64 b) +{ + return vreinterpret_m64_s16( + vmax_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b))); +} + +// Computes the maximums of the four single-precision, floating-point values of +// a and b. +// https://msdn.microsoft.com/en-us/library/vstudio/ff5d607a(v=vs.100).aspx +FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b) +{ +#if SSE2NEON_PRECISE_MINMAX + float32x4_t _a = vreinterpretq_f32_m128(a); + float32x4_t _b = vreinterpretq_f32_m128(b); + return vreinterpretq_m128_f32(vbslq_f32(vcgtq_f32(_a, _b), _a, _b)); +#else + return vreinterpretq_m128_f32( + vmaxq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +#endif +} + +// Compare packed unsigned 8-bit integers in a and b, and store packed maximum +// values in dst. +// +// FOR j := 0 to 7 +// i := j*8 +// dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pu8 +FORCE_INLINE __m64 _mm_max_pu8(__m64 a, __m64 b) +{ + return vreinterpret_m64_u8( + vmax_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b))); +} + +// Computes the maximum of the two lower scalar single-precision floating point +// values of a and b. +// https://msdn.microsoft.com/en-us/library/s6db5esz(v=vs.100).aspx +FORCE_INLINE __m128 _mm_max_ss(__m128 a, __m128 b) +{ + float32_t value = vgetq_lane_f32(_mm_max_ps(a, b), 0); + return vreinterpretq_m128_f32( + vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0)); +} + +// Compare packed signed 16-bit integers in a and b, and store packed minimum +// values in dst. +// +// FOR j := 0 to 3 +// i := j*16 +// dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pi16 +FORCE_INLINE __m64 _mm_min_pi16(__m64 a, __m64 b) +{ + return vreinterpret_m64_s16( + vmin_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b))); +} + +// Computes the minima of the four single-precision, floating-point values of a +// and b. 
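+// With SSE2NEON_PRECISE_MINMAX enabled this keeps the x86 rule of returning
+// the second argument for unordered (NaN) comparisons; plain vminq_f32 would
+// return a NaN instead.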
+// https://msdn.microsoft.com/en-us/library/vstudio/wh13kadz(v=vs.100).aspx +FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b) +{ +#if SSE2NEON_PRECISE_MINMAX + float32x4_t _a = vreinterpretq_f32_m128(a); + float32x4_t _b = vreinterpretq_f32_m128(b); + return vreinterpretq_m128_f32(vbslq_f32(vcltq_f32(_a, _b), _a, _b)); +#else + return vreinterpretq_m128_f32( + vminq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +#endif +} + +// Compare packed unsigned 8-bit integers in a and b, and store packed minimum +// values in dst. +// +// FOR j := 0 to 7 +// i := j*8 +// dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pu8 +FORCE_INLINE __m64 _mm_min_pu8(__m64 a, __m64 b) +{ + return vreinterpret_m64_u8( + vmin_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b))); +} + +// Computes the minimum of the two lower scalar single-precision floating point +// values of a and b. +// https://msdn.microsoft.com/en-us/library/0a9y7xaa(v=vs.100).aspx +FORCE_INLINE __m128 _mm_min_ss(__m128 a, __m128 b) +{ + float32_t value = vgetq_lane_f32(_mm_min_ps(a, b), 0); + return vreinterpretq_m128_f32( + vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0)); +} + +// Sets the low word to the single-precision, floating-point value of b +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/35hdzazd(v=vs.100) +FORCE_INLINE __m128 _mm_move_ss(__m128 a, __m128 b) +{ + return vreinterpretq_m128_f32( + vsetq_lane_f32(vgetq_lane_f32(vreinterpretq_f32_m128(b), 0), + vreinterpretq_f32_m128(a), 0)); +} + +// Moves the upper two values of B into the lower two values of A. +// +// r3 := a3 +// r2 := a2 +// r1 := b3 +// r0 := b2 +FORCE_INLINE __m128 _mm_movehl_ps(__m128 __A, __m128 __B) +{ + float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(__A)); + float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(__B)); + return vreinterpretq_m128_f32(vcombine_f32(b32, a32)); +} + +// Moves the lower two values of B into the upper two values of A. +// +// r3 := b1 +// r2 := b0 +// r1 := a1 +// r0 := a0 +FORCE_INLINE __m128 _mm_movelh_ps(__m128 __A, __m128 __B) +{ + float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(__A)); + float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(__B)); + return vreinterpretq_m128_f32(vcombine_f32(a10, b10)); +} + +// Create mask from the most significant bit of each 8-bit element in a, and +// store the result in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movemask_pi8 +FORCE_INLINE int _mm_movemask_pi8(__m64 a) +{ + uint8x8_t input = vreinterpret_u8_m64(a); +#if defined(__aarch64__) + static const int8x8_t shift = {0, 1, 2, 3, 4, 5, 6, 7}; + uint8x8_t tmp = vshr_n_u8(input, 7); + return vaddv_u8(vshl_u8(tmp, shift)); +#else + // Refer the implementation of `_mm_movemask_epi8` + uint16x4_t high_bits = vreinterpret_u16_u8(vshr_n_u8(input, 7)); + uint32x2_t paired16 = + vreinterpret_u32_u16(vsra_n_u16(high_bits, high_bits, 7)); + uint8x8_t paired32 = + vreinterpret_u8_u32(vsra_n_u32(paired16, paired16, 14)); + return vget_lane_u8(paired32, 0) | ((int) vget_lane_u8(paired32, 4) << 4); +#endif +} + +// NEON does not provide this method +// Creates a 4-bit mask from the most significant bits of the four +// single-precision, floating-point values. 
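+// e.g. for lanes {-1.0f, 2.0f, -3.0f, 4.0f} (lowest lane first) the result is
+// 0b0101.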
+// https://msdn.microsoft.com/en-us/library/vstudio/4490ys29(v=vs.100).aspx +FORCE_INLINE int _mm_movemask_ps(__m128 a) +{ + uint32x4_t input = vreinterpretq_u32_m128(a); +#if defined(__aarch64__) + static const int32x4_t shift = {0, 1, 2, 3}; + uint32x4_t tmp = vshrq_n_u32(input, 31); + return vaddvq_u32(vshlq_u32(tmp, shift)); +#else + // Uses the exact same method as _mm_movemask_epi8, see that for details. + // Shift out everything but the sign bits with a 32-bit unsigned shift + // right. + uint64x2_t high_bits = vreinterpretq_u64_u32(vshrq_n_u32(input, 31)); + // Merge the two pairs together with a 64-bit unsigned shift right + add. + uint8x16_t paired = + vreinterpretq_u8_u64(vsraq_n_u64(high_bits, high_bits, 31)); + // Extract the result. + return vgetq_lane_u8(paired, 0) | (vgetq_lane_u8(paired, 8) << 2); +#endif +} + +// Multiplies the four single-precision, floating-point values of a and b. +// +// r0 := a0 * b0 +// r1 := a1 * b1 +// r2 := a2 * b2 +// r3 := a3 * b3 +// +// https://msdn.microsoft.com/en-us/library/vstudio/22kbk6t9(v=vs.100).aspx +FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_f32( + vmulq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Multiply the lower single-precision (32-bit) floating-point element in a and +// b, store the result in the lower element of dst, and copy the upper 3 packed +// elements from a to the upper elements of dst. +// +// dst[31:0] := a[31:0] * b[31:0] +// dst[127:32] := a[127:32] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_ss +FORCE_INLINE __m128 _mm_mul_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_mul_ps(a, b)); +} + +// Multiply the packed unsigned 16-bit integers in a and b, producing +// intermediate 32-bit integers, and store the high 16 bits of the intermediate +// integers in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhi_pu16 +FORCE_INLINE __m64 _mm_mulhi_pu16(__m64 a, __m64 b) +{ + return vreinterpret_m64_u16(vshrn_n_u32( + vmull_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)), 16)); +} + +// Computes the bitwise OR of the four single-precision, floating-point values +// of a and b. +// https://msdn.microsoft.com/en-us/library/vstudio/7ctdsyy0(v=vs.100).aspx +FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_s32( + vorrq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b))); +} + +// Average packed unsigned 8-bit integers in a and b, and store the results in +// dst. +// +// FOR j := 0 to 7 +// i := j*8 +// dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1 +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pavgb +#define _m_pavgb(a, b) _mm_avg_pu8(a, b) + +// Average packed unsigned 16-bit integers in a and b, and store the results in +// dst. +// +// FOR j := 0 to 3 +// i := j*16 +// dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1 +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pavgw +#define _m_pavgw(a, b) _mm_avg_pu16(a, b) + +// Extract a 16-bit integer from a, selected with imm8, and store the result in +// the lower element of dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pextrw +#define _m_pextrw(a, imm) _mm_extract_pi16(a, imm) + +// Copy a to dst, and insert the 16-bit integer i into dst at the location +// specified by imm8. 
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=m_pinsrw +#define _m_pinsrw(a, i, imm) _mm_insert_pi16(a, i, imm) + +// Compare packed signed 16-bit integers in a and b, and store packed maximum +// values in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmaxsw +#define _m_pmaxsw(a, b) _mm_max_pi16(a, b) + +// Compare packed unsigned 8-bit integers in a and b, and store packed maximum +// values in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmaxub +#define _m_pmaxub(a, b) _mm_max_pu8(a, b) + +// Compare packed signed 16-bit integers in a and b, and store packed minimum +// values in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pminsw +#define _m_pminsw(a, b) _mm_min_pi16(a, b) + +// Compare packed unsigned 8-bit integers in a and b, and store packed minimum +// values in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pminub +#define _m_pminub(a, b) _mm_min_pu8(a, b) + +// Create mask from the most significant bit of each 8-bit element in a, and +// store the result in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmovmskb +#define _m_pmovmskb(a) _mm_movemask_pi8(a) + +// Multiply the packed unsigned 16-bit integers in a and b, producing +// intermediate 32-bit integers, and store the high 16 bits of the intermediate +// integers in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmulhuw +#define _m_pmulhuw(a, b) _mm_mulhi_pu16(a, b) + +// Loads one cache line of data from address p to a location closer to the +// processor. https://msdn.microsoft.com/en-us/library/84szxsww(v=vs.100).aspx +FORCE_INLINE void _mm_prefetch(const void *p, int i) +{ + (void) i; + __builtin_prefetch(p); +} + +// Compute the absolute differences of packed unsigned 8-bit integers in a and +// b, then horizontally sum each consecutive 8 differences to produce four +// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low +// 16 bits of dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=m_psadbw +#define _m_psadbw(a, b) _mm_sad_pu8(a, b) + +// Shuffle 16-bit integers in a using the control in imm8, and store the results +// in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pshufw +#define _m_pshufw(a, imm) _mm_shuffle_pi16(a, imm) + +// Compute the approximate reciprocal of packed single-precision (32-bit) +// floating-point elements in a, and store the results in dst. The maximum +// relative error for this approximation is less than 1.5*2^-12. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ps +FORCE_INLINE __m128 _mm_rcp_ps(__m128 in) +{ + float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(in)); + recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in))); +#if SSE2NEON_PRECISE_DIV + // Additional Netwon-Raphson iteration for accuracy + recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in))); +#endif + return vreinterpretq_m128_f32(recip); +} + +// Compute the approximate reciprocal of the lower single-precision (32-bit) +// floating-point element in a, store the result in the lower element of dst, +// and copy the upper 3 packed elements from a to the upper elements of dst. The +// maximum relative error for this approximation is less than 1.5*2^-12. 
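+// (Each vrecpsq_f32 step above is one Newton-Raphson iteration:
+// vrecpsq_f32(r, x) evaluates 2 - r*x, so r * vrecpsq_f32(r, x) refines r
+// toward 1/x.)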
+// +// dst[31:0] := (1.0 / a[31:0]) +// dst[127:32] := a[127:32] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ss +FORCE_INLINE __m128 _mm_rcp_ss(__m128 a) +{ + return _mm_move_ss(a, _mm_rcp_ps(a)); +} + +// Computes the approximations of the reciprocal square roots of the four +// single-precision floating point values of in. +// The current precision is 1% error. +// https://msdn.microsoft.com/en-us/library/22hfsh53(v=vs.100).aspx +FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in) +{ + float32x4_t out = vrsqrteq_f32(vreinterpretq_f32_m128(in)); +#if SSE2NEON_PRECISE_SQRT + // Additional Netwon-Raphson iteration for accuracy + out = vmulq_f32( + out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out)); + out = vmulq_f32( + out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out)); +#endif + return vreinterpretq_m128_f32(out); +} + +// Compute the approximate reciprocal square root of the lower single-precision +// (32-bit) floating-point element in a, store the result in the lower element +// of dst, and copy the upper 3 packed elements from a to the upper elements of +// dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt_ss +FORCE_INLINE __m128 _mm_rsqrt_ss(__m128 in) +{ + return vsetq_lane_f32(vgetq_lane_f32(_mm_rsqrt_ps(in), 0), in, 0); +} + +// Compute the absolute differences of packed unsigned 8-bit integers in a and +// b, then horizontally sum each consecutive 8 differences to produce four +// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low +// 16 bits of dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sad_pu8 +FORCE_INLINE __m64 _mm_sad_pu8(__m64 a, __m64 b) +{ + uint64x1_t t = vpaddl_u32(vpaddl_u16( + vpaddl_u8(vabd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b))))); + return vreinterpret_m64_u16( + vset_lane_u16(vget_lane_u64(t, 0), vdup_n_u16(0), 0)); +} + +// Macro: Set the flush zero bits of the MXCSR control and status register to +// the value in unsigned 32-bit integer a. The flush zero may contain any of the +// following flags: _MM_FLUSH_ZERO_ON or _MM_FLUSH_ZERO_OFF +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_SET_FLUSH_ZERO_MODE +FORCE_INLINE void _sse2neon_mm_set_flush_zero_mode(unsigned int flag) +{ + // AArch32 Advanced SIMD arithmetic always uses the Flush-to-zero setting, + // regardless of the value of the FZ bit. + union { + fpcr_bitfield field; +#if defined(__aarch64__) + uint64_t value; +#else + uint32_t value; +#endif + } r; + +#if defined(__aarch64__) + asm volatile("mrs %0, FPCR" : "=r"(r.value)); /* read */ +#else + asm volatile("vmrs %0, FPSCR" : "=r"(r.value)); /* read */ +#endif + + r.field.bit24 = (flag & _MM_FLUSH_ZERO_MASK) == _MM_FLUSH_ZERO_ON; + +#if defined(__aarch64__) + asm volatile("msr FPCR, %0" ::"r"(r)); /* write */ +#else + asm volatile("vmsr FPSCR, %0" ::"r"(r)); /* write */ +#endif +} + +// Sets the four single-precision, floating-point values to the four inputs. +// https://msdn.microsoft.com/en-us/library/vstudio/afh0zf75(v=vs.100).aspx +FORCE_INLINE __m128 _mm_set_ps(float w, float z, float y, float x) +{ + float ALIGN_STRUCT(16) data[4] = {x, y, z, w}; + return vreinterpretq_m128_f32(vld1q_f32(data)); +} + +// Sets the four single-precision, floating-point values to w. 
+// https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx +FORCE_INLINE __m128 _mm_set_ps1(float _w) +{ + return vreinterpretq_m128_f32(vdupq_n_f32(_w)); +} + +// Macro: Set the rounding mode bits of the MXCSR control and status register to +// the value in unsigned 32-bit integer a. The rounding mode may contain any of +// the following flags: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, +// _MM_ROUND_TOWARD_ZERO +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_SET_ROUNDING_MODE +FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding) +{ + union { + fpcr_bitfield field; +#if defined(__aarch64__) + uint64_t value; +#else + uint32_t value; +#endif + } r; + +#if defined(__aarch64__) + asm volatile("mrs %0, FPCR" : "=r"(r.value)); /* read */ +#else + asm volatile("vmrs %0, FPSCR" : "=r"(r.value)); /* read */ +#endif + + switch (rounding) { + case _MM_ROUND_TOWARD_ZERO: + r.field.bit22 = 1; + r.field.bit23 = 1; + break; + case _MM_ROUND_DOWN: + r.field.bit22 = 0; + r.field.bit23 = 1; + break; + case _MM_ROUND_UP: + r.field.bit22 = 1; + r.field.bit23 = 0; + break; + default: //_MM_ROUND_NEAREST + r.field.bit22 = 0; + r.field.bit23 = 0; + } + +#if defined(__aarch64__) + asm volatile("msr FPCR, %0" ::"r"(r)); /* write */ +#else + asm volatile("vmsr FPSCR, %0" ::"r"(r)); /* write */ +#endif +} + +// Copy single-precision (32-bit) floating-point element a to the lower element +// of dst, and zero the upper 3 elements. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_ss +FORCE_INLINE __m128 _mm_set_ss(float a) +{ + float ALIGN_STRUCT(16) data[4] = {a, 0, 0, 0}; + return vreinterpretq_m128_f32(vld1q_f32(data)); +} + +// Sets the four single-precision, floating-point values to w. +// +// r0 := r1 := r2 := r3 := w +// +// https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx +FORCE_INLINE __m128 _mm_set1_ps(float _w) +{ + return vreinterpretq_m128_f32(vdupq_n_f32(_w)); +} + +// FIXME: _mm_setcsr() implementation supports changing the rounding mode only. +FORCE_INLINE void _mm_setcsr(unsigned int a) +{ + _MM_SET_ROUNDING_MODE(a); +} + +// FIXME: _mm_getcsr() implementation supports reading the rounding mode only. +FORCE_INLINE unsigned int _mm_getcsr() +{ + return _MM_GET_ROUNDING_MODE(); +} + +// Sets the four single-precision, floating-point values to the four inputs in +// reverse order. +// https://msdn.microsoft.com/en-us/library/vstudio/d2172ct3(v=vs.100).aspx +FORCE_INLINE __m128 _mm_setr_ps(float w, float z, float y, float x) +{ + float ALIGN_STRUCT(16) data[4] = {w, z, y, x}; + return vreinterpretq_m128_f32(vld1q_f32(data)); +} + +// Clears the four single-precision, floating-point values. +// https://msdn.microsoft.com/en-us/library/vstudio/tk1t2tbz(v=vs.100).aspx +FORCE_INLINE __m128 _mm_setzero_ps(void) +{ + return vreinterpretq_m128_f32(vdupq_n_f32(0)); +} + +// Shuffle 16-bit integers in a using the control in imm8, and store the results +// in dst. 
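+
+// Illustrative sketch, not part of the original header: the bit22/bit23 writes
+// above map the MXCSR rounding modes onto FPCR.RMode (bits 23:22). The
+// hypothetical check below round-trips a mode through the setter and the
+// matching _MM_GET_ROUNDING_MODE defined earlier in this file.
+FORCE_INLINE int sse2neon_example_rounding_roundtrip(void)
+{
+    unsigned int saved = _MM_GET_ROUNDING_MODE(); // remember the current mode
+    _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO); // switch to truncation
+    int ok = _MM_GET_ROUNDING_MODE() == _MM_ROUND_TOWARD_ZERO;
+    _MM_SET_ROUNDING_MODE((int) saved);           // restore the original mode
+    return ok;
+}
+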
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_pi16 +#if __has_builtin(__builtin_shufflevector) +#define _mm_shuffle_pi16(a, imm) \ + __extension__({ \ + vreinterpret_m64_s16(__builtin_shufflevector( \ + vreinterpret_s16_m64(a), vreinterpret_s16_m64(a), (imm & 0x3), \ + ((imm >> 2) & 0x3), ((imm >> 4) & 0x3), ((imm >> 6) & 0x3))); \ + }) +#else +#define _mm_shuffle_pi16(a, imm) \ + __extension__({ \ + int16x4_t ret; \ + ret = \ + vmov_n_s16(vget_lane_s16(vreinterpret_s16_m64(a), (imm) & (0x3))); \ + ret = vset_lane_s16( \ + vget_lane_s16(vreinterpret_s16_m64(a), ((imm) >> 2) & 0x3), ret, \ + 1); \ + ret = vset_lane_s16( \ + vget_lane_s16(vreinterpret_s16_m64(a), ((imm) >> 4) & 0x3), ret, \ + 2); \ + ret = vset_lane_s16( \ + vget_lane_s16(vreinterpret_s16_m64(a), ((imm) >> 6) & 0x3), ret, \ + 3); \ + vreinterpret_m64_s16(ret); \ + }) +#endif + +// Guarantees that every preceding store is globally visible before any +// subsequent store. +// https://msdn.microsoft.com/en-us/library/5h2w73d1%28v=vs.90%29.aspx +FORCE_INLINE void _mm_sfence(void) +{ + __sync_synchronize(); +} + +// FORCE_INLINE __m128 _mm_shuffle_ps(__m128 a, __m128 b, __constrange(0,255) +// int imm) +#if __has_builtin(__builtin_shufflevector) +#define _mm_shuffle_ps(a, b, imm) \ + __extension__({ \ + float32x4_t _input1 = vreinterpretq_f32_m128(a); \ + float32x4_t _input2 = vreinterpretq_f32_m128(b); \ + float32x4_t _shuf = __builtin_shufflevector( \ + _input1, _input2, (imm) & (0x3), ((imm) >> 2) & 0x3, \ + (((imm) >> 4) & 0x3) + 4, (((imm) >> 6) & 0x3) + 4); \ + vreinterpretq_m128_f32(_shuf); \ + }) +#else // generic +#define _mm_shuffle_ps(a, b, imm) \ + __extension__({ \ + __m128 ret; \ + switch (imm) { \ + case _MM_SHUFFLE(1, 0, 3, 2): \ + ret = _mm_shuffle_ps_1032((a), (b)); \ + break; \ + case _MM_SHUFFLE(2, 3, 0, 1): \ + ret = _mm_shuffle_ps_2301((a), (b)); \ + break; \ + case _MM_SHUFFLE(0, 3, 2, 1): \ + ret = _mm_shuffle_ps_0321((a), (b)); \ + break; \ + case _MM_SHUFFLE(2, 1, 0, 3): \ + ret = _mm_shuffle_ps_2103((a), (b)); \ + break; \ + case _MM_SHUFFLE(1, 0, 1, 0): \ + ret = _mm_movelh_ps((a), (b)); \ + break; \ + case _MM_SHUFFLE(1, 0, 0, 1): \ + ret = _mm_shuffle_ps_1001((a), (b)); \ + break; \ + case _MM_SHUFFLE(0, 1, 0, 1): \ + ret = _mm_shuffle_ps_0101((a), (b)); \ + break; \ + case _MM_SHUFFLE(3, 2, 1, 0): \ + ret = _mm_shuffle_ps_3210((a), (b)); \ + break; \ + case _MM_SHUFFLE(0, 0, 1, 1): \ + ret = _mm_shuffle_ps_0011((a), (b)); \ + break; \ + case _MM_SHUFFLE(0, 0, 2, 2): \ + ret = _mm_shuffle_ps_0022((a), (b)); \ + break; \ + case _MM_SHUFFLE(2, 2, 0, 0): \ + ret = _mm_shuffle_ps_2200((a), (b)); \ + break; \ + case _MM_SHUFFLE(3, 2, 0, 2): \ + ret = _mm_shuffle_ps_3202((a), (b)); \ + break; \ + case _MM_SHUFFLE(3, 2, 3, 2): \ + ret = _mm_movehl_ps((b), (a)); \ + break; \ + case _MM_SHUFFLE(1, 1, 3, 3): \ + ret = _mm_shuffle_ps_1133((a), (b)); \ + break; \ + case _MM_SHUFFLE(2, 0, 1, 0): \ + ret = _mm_shuffle_ps_2010((a), (b)); \ + break; \ + case _MM_SHUFFLE(2, 0, 0, 1): \ + ret = _mm_shuffle_ps_2001((a), (b)); \ + break; \ + case _MM_SHUFFLE(2, 0, 3, 2): \ + ret = _mm_shuffle_ps_2032((a), (b)); \ + break; \ + default: \ + ret = _mm_shuffle_ps_default((a), (b), (imm)); \ + break; \ + } \ + ret; \ + }) +#endif + +// Computes the approximations of square roots of the four single-precision, +// floating-point values of a. First computes reciprocal square roots and then +// reciprocals of the four values. 
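+
+// Illustrative sketch, not part of the original header: the shuffle immediate
+// packs four 2-bit lane selectors, lowest destination lane first, so
+// _MM_SHUFFLE(3, 2, 1, 0) is the identity. The hypothetical helper below takes
+// lanes 0 and 2 from a and lanes 1 and 3 from b.
+FORCE_INLINE __m128 sse2neon_example_shuffle(__m128 a, __m128 b)
+{
+    // dst = { a[0], a[2], b[1], b[3] }
+    return _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 2, 0));
+}
+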
+// +// r0 := sqrt(a0) +// r1 := sqrt(a1) +// r2 := sqrt(a2) +// r3 := sqrt(a3) +// +// https://msdn.microsoft.com/en-us/library/vstudio/8z67bwwk(v=vs.100).aspx +FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in) +{ +#if SSE2NEON_PRECISE_SQRT + float32x4_t recip = vrsqrteq_f32(vreinterpretq_f32_m128(in)); + + // Test for vrsqrteq_f32(0) -> positive infinity case. + // Change to zero, so that s * 1/sqrt(s) result is zero too. + const uint32x4_t pos_inf = vdupq_n_u32(0x7F800000); + const uint32x4_t div_by_zero = + vceqq_u32(pos_inf, vreinterpretq_u32_f32(recip)); + recip = vreinterpretq_f32_u32( + vandq_u32(vmvnq_u32(div_by_zero), vreinterpretq_u32_f32(recip))); + + // Additional Netwon-Raphson iteration for accuracy + recip = vmulq_f32( + vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)), + recip); + recip = vmulq_f32( + vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)), + recip); + + // sqrt(s) = s * 1/sqrt(s) + return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(in), recip)); +#elif defined(__aarch64__) + return vreinterpretq_m128_f32(vsqrtq_f32(vreinterpretq_f32_m128(in))); +#else + float32x4_t recipsq = vrsqrteq_f32(vreinterpretq_f32_m128(in)); + float32x4_t sq = vrecpeq_f32(recipsq); + return vreinterpretq_m128_f32(sq); +#endif +} + +// Computes the approximation of the square root of the scalar single-precision +// floating point value of in. +// https://msdn.microsoft.com/en-us/library/ahfsc22d(v=vs.100).aspx +FORCE_INLINE __m128 _mm_sqrt_ss(__m128 in) +{ + float32_t value = + vgetq_lane_f32(vreinterpretq_f32_m128(_mm_sqrt_ps(in)), 0); + return vreinterpretq_m128_f32( + vsetq_lane_f32(value, vreinterpretq_f32_m128(in), 0)); +} + +// Stores four single-precision, floating-point values. +// https://msdn.microsoft.com/en-us/library/vstudio/s3h4ay6y(v=vs.100).aspx +FORCE_INLINE void _mm_store_ps(float *p, __m128 a) +{ + vst1q_f32(p, vreinterpretq_f32_m128(a)); +} + +// Store the lower single-precision (32-bit) floating-point element from a into +// 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte +// boundary or a general-protection exception may be generated. +// +// MEM[mem_addr+31:mem_addr] := a[31:0] +// MEM[mem_addr+63:mem_addr+32] := a[31:0] +// MEM[mem_addr+95:mem_addr+64] := a[31:0] +// MEM[mem_addr+127:mem_addr+96] := a[31:0] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_ps1 +FORCE_INLINE void _mm_store_ps1(float *p, __m128 a) +{ + float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); + vst1q_f32(p, vdupq_n_f32(a0)); +} + +// Stores the lower single - precision, floating - point value. +// https://msdn.microsoft.com/en-us/library/tzz10fbx(v=vs.100).aspx +FORCE_INLINE void _mm_store_ss(float *p, __m128 a) +{ + vst1q_lane_f32(p, vreinterpretq_f32_m128(a), 0); +} + +// Store the lower single-precision (32-bit) floating-point element from a into +// 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte +// boundary or a general-protection exception may be generated. +// +// MEM[mem_addr+31:mem_addr] := a[31:0] +// MEM[mem_addr+63:mem_addr+32] := a[31:0] +// MEM[mem_addr+95:mem_addr+64] := a[31:0] +// MEM[mem_addr+127:mem_addr+96] := a[31:0] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store1_ps +#define _mm_store1_ps _mm_store_ps1 + +// Stores the upper two single-precision, floating-point values of a to the +// address p. 
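+
+// Illustrative sketch, not part of the original header: vst1q_f32 has no
+// 16-byte alignment requirement, so _mm_store_ps and _mm_storeu_ps end up as
+// the same store here; keeping destinations aligned still matches what the
+// x86 intrinsic demands. Hypothetical usage combining two intrinsics above:
+FORCE_INLINE float sse2neon_example_store_sqrt(__m128 v)
+{
+    float ALIGN_STRUCT(16) out[4];     // 16-byte aligned, as _mm_store_ps expects
+    _mm_store_ps(out, _mm_sqrt_ps(v)); // element-wise square root, then store
+    return out[0];
+}
+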
+// +// *p0 := a2 +// *p1 := a3 +// +// https://msdn.microsoft.com/en-us/library/a7525fs8(v%3dvs.90).aspx +FORCE_INLINE void _mm_storeh_pi(__m64 *p, __m128 a) +{ + *p = vreinterpret_m64_f32(vget_high_f32(a)); +} + +// Stores the lower two single-precision floating point values of a to the +// address p. +// +// *p0 := a0 +// *p1 := a1 +// +// https://msdn.microsoft.com/en-us/library/h54t98ks(v=vs.90).aspx +FORCE_INLINE void _mm_storel_pi(__m64 *p, __m128 a) +{ + *p = vreinterpret_m64_f32(vget_low_f32(a)); +} + +// Store 4 single-precision (32-bit) floating-point elements from a into memory +// in reverse order. mem_addr must be aligned on a 16-byte boundary or a +// general-protection exception may be generated. +// +// MEM[mem_addr+31:mem_addr] := a[127:96] +// MEM[mem_addr+63:mem_addr+32] := a[95:64] +// MEM[mem_addr+95:mem_addr+64] := a[63:32] +// MEM[mem_addr+127:mem_addr+96] := a[31:0] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storer_ps +FORCE_INLINE void _mm_storer_ps(float *p, __m128 a) +{ + float32x4_t tmp = vrev64q_f32(vreinterpretq_f32_m128(a)); + float32x4_t rev = vextq_f32(tmp, tmp, 2); + vst1q_f32(p, rev); +} + +// Stores four single-precision, floating-point values. +// https://msdn.microsoft.com/en-us/library/44e30x22(v=vs.100).aspx +FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a) +{ + vst1q_f32(p, vreinterpretq_f32_m128(a)); +} + +// Stores 16-bits of integer data a at the address p. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_si16 +FORCE_INLINE void _mm_storeu_si16(void *p, __m128i a) +{ + vst1q_lane_s16((int16_t *) p, vreinterpretq_s16_m128i(a), 0); +} + +// Stores 64-bits of integer data a at the address p. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_si64 +FORCE_INLINE void _mm_storeu_si64(void *p, __m128i a) +{ + vst1q_lane_s64((int64_t *) p, vreinterpretq_s64_m128i(a), 0); +} + +// Store 64-bits of integer data from a into memory using a non-temporal memory +// hint. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_pi +FORCE_INLINE void _mm_stream_pi(__m64 *p, __m64 a) +{ + vst1_s64((int64_t *) p, vreinterpret_s64_m64(a)); +} + +// Store 128-bits (composed of 4 packed single-precision (32-bit) floating- +// point elements) from a into memory using a non-temporal memory hint. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_ps +FORCE_INLINE void _mm_stream_ps(float *p, __m128 a) +{ +#if __has_builtin(__builtin_nontemporal_store) + __builtin_nontemporal_store(a, (float32x4_t *) p); +#else + vst1q_f32(p, vreinterpretq_f32_m128(a)); +#endif +} + +// Subtracts the four single-precision, floating-point values of a and b. +// +// r0 := a0 - b0 +// r1 := a1 - b1 +// r2 := a2 - b2 +// r3 := a3 - b3 +// +// https://msdn.microsoft.com/en-us/library/vstudio/1zad2k61(v=vs.100).aspx +FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_f32( + vsubq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Subtract the lower single-precision (32-bit) floating-point element in b from +// the lower single-precision (32-bit) floating-point element in a, store the +// result in the lower element of dst, and copy the upper 3 packed elements from +// a to the upper elements of dst. 
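+
+// Illustrative sketch, not part of the original header: the reversal used by
+// _mm_storer_ps works in two steps -- vrev64q_f32 swaps lanes inside each
+// 64-bit half ([a b c d] -> [b a d c]) and vextq_f32(t, t, 2) rotates the
+// halves ([b a d c] -> [d c b a]). The same trick as a hypothetical helper:
+FORCE_INLINE float32x4_t sse2neon_example_reverse_f32(float32x4_t v)
+{
+    float32x4_t t = vrev64q_f32(v); // swap within each 64-bit pair
+    return vextq_f32(t, t, 2);      // rotate by two lanes to finish the reverse
+}
+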
+// +// dst[31:0] := a[31:0] - b[31:0] +// dst[127:32] := a[127:32] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_ss +FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_sub_ps(a, b)); +} + +// Macro: Transpose the 4x4 matrix formed by the 4 rows of single-precision +// (32-bit) floating-point elements in row0, row1, row2, and row3, and store the +// transposed matrix in these vectors (row0 now contains column 0, etc.). +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=MM_TRANSPOSE4_PS +#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \ + do { \ + float32x4x2_t ROW01 = vtrnq_f32(row0, row1); \ + float32x4x2_t ROW23 = vtrnq_f32(row2, row3); \ + row0 = vcombine_f32(vget_low_f32(ROW01.val[0]), \ + vget_low_f32(ROW23.val[0])); \ + row1 = vcombine_f32(vget_low_f32(ROW01.val[1]), \ + vget_low_f32(ROW23.val[1])); \ + row2 = vcombine_f32(vget_high_f32(ROW01.val[0]), \ + vget_high_f32(ROW23.val[0])); \ + row3 = vcombine_f32(vget_high_f32(ROW01.val[1]), \ + vget_high_f32(ROW23.val[1])); \ + } while (0) + +// according to the documentation, these intrinsics behave the same as the +// non-'u' versions. We'll just alias them here. +#define _mm_ucomieq_ss _mm_comieq_ss +#define _mm_ucomige_ss _mm_comige_ss +#define _mm_ucomigt_ss _mm_comigt_ss +#define _mm_ucomile_ss _mm_comile_ss +#define _mm_ucomilt_ss _mm_comilt_ss +#define _mm_ucomineq_ss _mm_comineq_ss + +// Return vector of type __m128i with undefined elements. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_undefined_si128 +FORCE_INLINE __m128i _mm_undefined_si128(void) +{ +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wuninitialized" +#endif + __m128i a; + return a; +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic pop +#endif +} + +// Return vector of type __m128 with undefined elements. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_undefined_ps +FORCE_INLINE __m128 _mm_undefined_ps(void) +{ +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wuninitialized" +#endif + __m128 a; + return a; +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic pop +#endif +} + +// Selects and interleaves the upper two single-precision, floating-point values +// from a and b. +// +// r0 := a2 +// r1 := b2 +// r2 := a3 +// r3 := b3 +// +// https://msdn.microsoft.com/en-us/library/skccxx7d%28v=vs.90%29.aspx +FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128_f32( + vzip2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +#else + float32x2_t a1 = vget_high_f32(vreinterpretq_f32_m128(a)); + float32x2_t b1 = vget_high_f32(vreinterpretq_f32_m128(b)); + float32x2x2_t result = vzip_f32(a1, b1); + return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1])); +#endif +} + +// Selects and interleaves the lower two single-precision, floating-point values +// from a and b. 
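+
+// Illustrative sketch, not part of the original header: _MM_TRANSPOSE4_PS
+// rewrites its four row arguments in place, so it must be handed lvalues. A
+// hypothetical 4x4 transpose over a row-major array, using loads and stores
+// defined earlier in this file:
+FORCE_INLINE void sse2neon_example_transpose4x4(float m[16])
+{
+    __m128 r0 = _mm_loadu_ps(m + 0);
+    __m128 r1 = _mm_loadu_ps(m + 4);
+    __m128 r2 = _mm_loadu_ps(m + 8);
+    __m128 r3 = _mm_loadu_ps(m + 12);
+    _MM_TRANSPOSE4_PS(r0, r1, r2, r3); // rows become columns
+    _mm_storeu_ps(m + 0, r0);
+    _mm_storeu_ps(m + 4, r1);
+    _mm_storeu_ps(m + 8, r2);
+    _mm_storeu_ps(m + 12, r3);
+}
+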
+// +// r0 := a0 +// r1 := b0 +// r2 := a1 +// r3 := b1 +// +// https://msdn.microsoft.com/en-us/library/25st103b%28v=vs.90%29.aspx +FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128_f32( + vzip1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +#else + float32x2_t a1 = vget_low_f32(vreinterpretq_f32_m128(a)); + float32x2_t b1 = vget_low_f32(vreinterpretq_f32_m128(b)); + float32x2x2_t result = vzip_f32(a1, b1); + return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1])); +#endif +} + +// Computes bitwise EXOR (exclusive-or) of the four single-precision, +// floating-point values of a and b. +// https://msdn.microsoft.com/en-us/library/ss6k3wk8(v=vs.100).aspx +FORCE_INLINE __m128 _mm_xor_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_s32( + veorq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b))); +} + +/* SSE2 */ + +// Adds the 8 signed or unsigned 16-bit integers in a to the 8 signed or +// unsigned 16-bit integers in b. +// https://msdn.microsoft.com/en-us/library/fceha5k4(v=vs.100).aspx +FORCE_INLINE __m128i _mm_add_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s16( + vaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Adds the 4 signed or unsigned 32-bit integers in a to the 4 signed or +// unsigned 32-bit integers in b. +// +// r0 := a0 + b0 +// r1 := a1 + b1 +// r2 := a2 + b2 +// r3 := a3 + b3 +// +// https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx +FORCE_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + vaddq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Adds the 4 signed or unsigned 64-bit integers in a to the 4 signed or +// unsigned 32-bit integers in b. +// https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx +FORCE_INLINE __m128i _mm_add_epi64(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s64( + vaddq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b))); +} + +// Adds the 16 signed or unsigned 8-bit integers in a to the 16 signed or +// unsigned 8-bit integers in b. +// https://technet.microsoft.com/en-us/subscriptions/yc7tcyzs(v=vs.90) +FORCE_INLINE __m128i _mm_add_epi8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s8( + vaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Add packed double-precision (64-bit) floating-point elements in a and b, and +// store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_pd +FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_f64( + vaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#else + double *da = (double *) &a; + double *db = (double *) &b; + double c[2]; + c[0] = da[0] + db[0]; + c[1] = da[1] + db[1]; + return vld1q_f32((float32_t *) c); +#endif +} + +// Add the lower double-precision (64-bit) floating-point element in a and b, +// store the result in the lower element of dst, and copy the upper element from +// a to the upper element of dst. 
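+
+// Illustrative sketch, not part of the original header: a common use of
+// _mm_xor_ps is flipping sign bits by XORing with a vector that has only bit
+// 31 set in each lane. The hypothetical helper below negates all four floats
+// that way.
+FORCE_INLINE __m128 sse2neon_example_negate_ps(__m128 v)
+{
+    const __m128 signbits = _mm_set1_ps(-0.0f); // only the sign bit set per lane
+    return _mm_xor_ps(v, signbits);             // flip the sign of every lane
+}
+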
+// +// dst[63:0] := a[63:0] + b[63:0] +// dst[127:64] := a[127:64] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_sd +FORCE_INLINE __m128d _mm_add_sd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) + return _mm_move_sd(a, _mm_add_pd(a, b)); +#else + double *da = (double *) &a; + double *db = (double *) &b; + double c[2]; + c[0] = da[0] + db[0]; + c[1] = da[1]; + return vld1q_f32((float32_t *) c); +#endif +} + +// Add 64-bit integers a and b, and store the result in dst. +// +// dst[63:0] := a[63:0] + b[63:0] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_si64 +FORCE_INLINE __m64 _mm_add_si64(__m64 a, __m64 b) +{ + return vreinterpret_m64_s64( + vadd_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b))); +} + +// Adds the 8 signed 16-bit integers in a to the 8 signed 16-bit integers in b +// and saturates. +// +// r0 := SignedSaturate(a0 + b0) +// r1 := SignedSaturate(a1 + b1) +// ... +// r7 := SignedSaturate(a7 + b7) +// +// https://msdn.microsoft.com/en-us/library/1a306ef8(v=vs.100).aspx +FORCE_INLINE __m128i _mm_adds_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s16( + vqaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Add packed signed 8-bit integers in a and b using saturation, and store the +// results in dst. +// +// FOR j := 0 to 15 +// i := j*8 +// dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] ) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_adds_epi8 +FORCE_INLINE __m128i _mm_adds_epi8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s8( + vqaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Add packed unsigned 16-bit integers in a and b using saturation, and store +// the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_adds_epu16 +FORCE_INLINE __m128i _mm_adds_epu16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u16( + vqaddq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b))); +} + +// Adds the 16 unsigned 8-bit integers in a to the 16 unsigned 8-bit integers in +// b and saturates.. +// https://msdn.microsoft.com/en-us/library/9hahyddy(v=vs.100).aspx +FORCE_INLINE __m128i _mm_adds_epu8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u8( + vqaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b))); +} + +// Compute the bitwise AND of packed double-precision (64-bit) floating-point +// elements in a and b, and store the results in dst. +// +// FOR j := 0 to 1 +// i := j*64 +// dst[i+63:i] := a[i+63:i] AND b[i+63:i] +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_and_pd +FORCE_INLINE __m128d _mm_and_pd(__m128d a, __m128d b) +{ + return vreinterpretq_m128d_s64( + vandq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b))); +} + +// Computes the bitwise AND of the 128-bit value in a and the 128-bit value in +// b. +// +// r := a & b +// +// https://msdn.microsoft.com/en-us/library/vstudio/6d1txsa8(v=vs.100).aspx +FORCE_INLINE __m128i _mm_and_si128(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + vandq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Compute the bitwise NOT of packed double-precision (64-bit) floating-point +// elements in a and then AND with b, and store the results in dst. 
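+
+// Illustrative sketch, not part of the original header: the saturating adds
+// clamp instead of wrapping, e.g. 200 + 100 stays 255 with _mm_adds_epu8 but
+// wraps to 44 with _mm_add_epi8. A hypothetical side-by-side comparison:
+FORCE_INLINE void sse2neon_example_saturation(uint8_t out_sat[16],
+                                              uint8_t out_wrap[16])
+{
+    __m128i a = vreinterpretq_m128i_u8(vdupq_n_u8(200));
+    __m128i b = vreinterpretq_m128i_u8(vdupq_n_u8(100));
+    vst1q_u8(out_sat, vreinterpretq_u8_m128i(_mm_adds_epu8(a, b)));  // all 255
+    vst1q_u8(out_wrap, vreinterpretq_u8_m128i(_mm_add_epi8(a, b)));  // all 44
+}
+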
+// +// FOR j := 0 to 1 +// i := j*64 +// dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_andnot_pd +FORCE_INLINE __m128d _mm_andnot_pd(__m128d a, __m128d b) +{ + // *NOTE* argument swap + return vreinterpretq_m128d_s64( + vbicq_s64(vreinterpretq_s64_m128d(b), vreinterpretq_s64_m128d(a))); +} + +// Computes the bitwise AND of the 128-bit value in b and the bitwise NOT of the +// 128-bit value in a. +// +// r := (~a) & b +// +// https://msdn.microsoft.com/en-us/library/vstudio/1beaceh8(v=vs.100).aspx +FORCE_INLINE __m128i _mm_andnot_si128(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + vbicq_s32(vreinterpretq_s32_m128i(b), + vreinterpretq_s32_m128i(a))); // *NOTE* argument swap +} + +// Computes the average of the 8 unsigned 16-bit integers in a and the 8 +// unsigned 16-bit integers in b and rounds. +// +// r0 := (a0 + b0) / 2 +// r1 := (a1 + b1) / 2 +// ... +// r7 := (a7 + b7) / 2 +// +// https://msdn.microsoft.com/en-us/library/vstudio/y13ca3c8(v=vs.90).aspx +FORCE_INLINE __m128i _mm_avg_epu16(__m128i a, __m128i b) +{ + return (__m128i) vrhaddq_u16(vreinterpretq_u16_m128i(a), + vreinterpretq_u16_m128i(b)); +} + +// Computes the average of the 16 unsigned 8-bit integers in a and the 16 +// unsigned 8-bit integers in b and rounds. +// +// r0 := (a0 + b0) / 2 +// r1 := (a1 + b1) / 2 +// ... +// r15 := (a15 + b15) / 2 +// +// https://msdn.microsoft.com/en-us/library/vstudio/8zwh554a(v%3dvs.90).aspx +FORCE_INLINE __m128i _mm_avg_epu8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u8( + vrhaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b))); +} + +// Shift a left by imm8 bytes while shifting in zeros, and store the results in +// dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_bslli_si128 +#define _mm_bslli_si128(a, imm) _mm_slli_si128(a, imm) + +// Shift a right by imm8 bytes while shifting in zeros, and store the results in +// dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_bsrli_si128 +#define _mm_bsrli_si128(a, imm) _mm_srli_si128(a, imm) + +// Cast vector of type __m128d to type __m128. This intrinsic is only used for +// compilation and does not generate any instructions, thus it has zero latency. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castpd_ps +FORCE_INLINE __m128 _mm_castpd_ps(__m128d a) +{ + return vreinterpretq_m128_s64(vreinterpretq_s64_m128d(a)); +} + +// Cast vector of type __m128d to type __m128i. This intrinsic is only used for +// compilation and does not generate any instructions, thus it has zero latency. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castpd_si128 +FORCE_INLINE __m128i _mm_castpd_si128(__m128d a) +{ + return vreinterpretq_m128i_s64(vreinterpretq_s64_m128d(a)); +} + +// Cast vector of type __m128 to type __m128d. This intrinsic is only used for +// compilation and does not generate any instructions, thus it has zero latency. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castps_pd +FORCE_INLINE __m128d _mm_castps_pd(__m128 a) +{ + return vreinterpretq_m128d_s32(vreinterpretq_s32_m128(a)); +} + +// Applies a type cast to reinterpret four 32-bit floating point values passed +// in as a 128-bit parameter as packed 32-bit integers. 
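+
+// Illustrative sketch, not part of the original header: the "argument swap"
+// noted above exists because VBIC computes (first & ~second) while the SSE
+// intrinsic is defined as (~first & second). A hypothetical helper built on
+// the intrinsic, clearing whichever bits are set in mask:
+FORCE_INLINE __m128i sse2neon_example_clear_bits(__m128i value, __m128i mask)
+{
+    return _mm_andnot_si128(mask, value); // (~mask) & value
+}
+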
+// https://msdn.microsoft.com/en-us/library/bb514099.aspx +FORCE_INLINE __m128i _mm_castps_si128(__m128 a) +{ + return vreinterpretq_m128i_s32(vreinterpretq_s32_m128(a)); +} + +// Cast vector of type __m128i to type __m128d. This intrinsic is only used for +// compilation and does not generate any instructions, thus it has zero latency. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castsi128_pd +FORCE_INLINE __m128d _mm_castsi128_pd(__m128i a) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_f64(vreinterpretq_f64_m128i(a)); +#else + return vreinterpretq_m128d_f32(vreinterpretq_f32_m128i(a)); +#endif +} + +// Applies a type cast to reinterpret four 32-bit integers passed in as a +// 128-bit parameter as packed 32-bit floating point values. +// https://msdn.microsoft.com/en-us/library/bb514029.aspx +FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a) +{ + return vreinterpretq_m128_s32(vreinterpretq_s32_m128i(a)); +} + +// Cache line containing p is flushed and invalidated from all caches in the +// coherency domain. : +// https://msdn.microsoft.com/en-us/library/ba08y07y(v=vs.100).aspx +FORCE_INLINE void _mm_clflush(void const *p) +{ + (void) p; + // no corollary for Neon? +} + +// Compares the 8 signed or unsigned 16-bit integers in a and the 8 signed or +// unsigned 16-bit integers in b for equality. +// https://msdn.microsoft.com/en-us/library/2ay060te(v=vs.100).aspx +FORCE_INLINE __m128i _mm_cmpeq_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u16( + vceqq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Compare packed 32-bit integers in a and b for equality, and store the results +// in dst +FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u32( + vceqq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Compares the 16 signed or unsigned 8-bit integers in a and the 16 signed or +// unsigned 8-bit integers in b for equality. +// https://msdn.microsoft.com/en-us/library/windows/desktop/bz5xk21a(v=vs.90).aspx +FORCE_INLINE __m128i _mm_cmpeq_epi8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u8( + vceqq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// for equality, and store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_pd +FORCE_INLINE __m128d _mm_cmpeq_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_u64( + vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#else + // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi) + uint32x4_t cmp = + vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b)); + uint32x4_t swapped = vrev64q_u32(cmp); + return vreinterpretq_m128d_u32(vandq_u32(cmp, swapped)); +#endif +} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b for equality, store the result in the lower element of dst, and copy the +// upper element from a to the upper element of dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_sd +FORCE_INLINE __m128d _mm_cmpeq_sd(__m128d a, __m128d b) +{ + return _mm_move_sd(a, _mm_cmpeq_pd(a, b)); +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// for greater-than-or-equal, and store the results in dst. 
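+
+// Illustrative sketch, not part of the original header: the ARMv7 fallback of
+// _mm_cmpeq_pd builds a 64-bit equality mask from 32-bit compares -- both
+// halves of a lane must match, so the compare result is ANDed with a copy of
+// itself with the halves swapped. Note this is a bitwise equality, so unlike
+// the IEEE compare it treats NaN as equal to itself and +0.0 as different
+// from -0.0. The same trick in isolation, as a hypothetical helper:
+FORCE_INLINE uint64x2_t sse2neon_example_eq64_via_eq32(uint64x2_t a,
+                                                       uint64x2_t b)
+{
+    uint32x4_t cmp =
+        vceqq_u32(vreinterpretq_u32_u64(a), vreinterpretq_u32_u64(b));
+    uint32x4_t swapped = vrev64q_u32(cmp); // swap the 32-bit halves per lane
+    return vreinterpretq_u64_u32(vandq_u32(cmp, swapped));
+}
+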
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpge_pd +FORCE_INLINE __m128d _mm_cmpge_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_u64( + vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#else + uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); + uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); + uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); + uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); + uint64_t d[2]; + d[0] = (*(double *) &a0) >= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0); + d[1] = (*(double *) &a1) >= (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0); + + return vreinterpretq_m128d_u64(vld1q_u64(d)); +#endif +} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b for greater-than-or-equal, store the result in the lower element of dst, +// and copy the upper element from a to the upper element of dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpge_sd +FORCE_INLINE __m128d _mm_cmpge_sd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) + return _mm_move_sd(a, _mm_cmpge_pd(a, b)); +#else + // expand "_mm_cmpge_pd()" to reduce unnecessary operations + uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); + uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); + uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); + uint64_t d[2]; + d[0] = (*(double *) &a0) >= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0); + d[1] = a1; + + return vreinterpretq_m128d_u64(vld1q_u64(d)); +#endif +} + +// Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers +// in b for greater than. +// +// r0 := (a0 > b0) ? 0xffff : 0x0 +// r1 := (a1 > b1) ? 0xffff : 0x0 +// ... +// r7 := (a7 > b7) ? 0xffff : 0x0 +// +// https://technet.microsoft.com/en-us/library/xd43yfsa(v=vs.100).aspx +FORCE_INLINE __m128i _mm_cmpgt_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u16( + vcgtq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers +// in b for greater than. +// https://msdn.microsoft.com/en-us/library/vstudio/1s9f2z0y(v=vs.100).aspx +FORCE_INLINE __m128i _mm_cmpgt_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u32( + vcgtq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers +// in b for greater than. +// +// r0 := (a0 > b0) ? 0xff : 0x0 +// r1 := (a1 > b1) ? 0xff : 0x0 +// ... +// r15 := (a15 > b15) ? 0xff : 0x0 +// +// https://msdn.microsoft.com/zh-tw/library/wf45zt2b(v=vs.100).aspx +FORCE_INLINE __m128i _mm_cmpgt_epi8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u8( + vcgtq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// for greater-than, and store the results in dst. 
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpgt_pd +FORCE_INLINE __m128d _mm_cmpgt_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_u64( + vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#else + uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); + uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); + uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); + uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); + uint64_t d[2]; + d[0] = (*(double *) &a0) > (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0); + d[1] = (*(double *) &a1) > (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0); + + return vreinterpretq_m128d_u64(vld1q_u64(d)); +#endif +} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b for greater-than, store the result in the lower element of dst, and copy +// the upper element from a to the upper element of dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpgt_sd +FORCE_INLINE __m128d _mm_cmpgt_sd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) + return _mm_move_sd(a, _mm_cmpgt_pd(a, b)); +#else + // expand "_mm_cmpge_pd()" to reduce unnecessary operations + uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); + uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); + uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); + uint64_t d[2]; + d[0] = (*(double *) &a0) > (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0); + d[1] = a1; + + return vreinterpretq_m128d_u64(vld1q_u64(d)); +#endif +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// for less-than-or-equal, and store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmple_pd +FORCE_INLINE __m128d _mm_cmple_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_u64( + vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#else + uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); + uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); + uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); + uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); + uint64_t d[2]; + d[0] = (*(double *) &a0) <= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0); + d[1] = (*(double *) &a1) <= (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0); + + return vreinterpretq_m128d_u64(vld1q_u64(d)); +#endif +} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b for less-than-or-equal, store the result in the lower element of dst, and +// copy the upper element from a to the upper element of dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmple_sd +FORCE_INLINE __m128d _mm_cmple_sd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) + return _mm_move_sd(a, _mm_cmple_pd(a, b)); +#else + // expand "_mm_cmpge_pd()" to reduce unnecessary operations + uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); + uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); + uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); + uint64_t d[2]; + d[0] = (*(double *) &a0) <= (*(double *) &b0) ? 
~UINT64_C(0) : UINT64_C(0); + d[1] = a1; + + return vreinterpretq_m128d_u64(vld1q_u64(d)); +#endif +} + +// Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers +// in b for less than. +// +// r0 := (a0 < b0) ? 0xffff : 0x0 +// r1 := (a1 < b1) ? 0xffff : 0x0 +// ... +// r7 := (a7 < b7) ? 0xffff : 0x0 +// +// https://technet.microsoft.com/en-us/library/t863edb2(v=vs.100).aspx +FORCE_INLINE __m128i _mm_cmplt_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u16( + vcltq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + + +// Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers +// in b for less than. +// https://msdn.microsoft.com/en-us/library/vstudio/4ak0bf5d(v=vs.100).aspx +FORCE_INLINE __m128i _mm_cmplt_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u32( + vcltq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers +// in b for lesser than. +// https://msdn.microsoft.com/en-us/library/windows/desktop/9s46csht(v=vs.90).aspx +FORCE_INLINE __m128i _mm_cmplt_epi8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u8( + vcltq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// for less-than, and store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmplt_pd +FORCE_INLINE __m128d _mm_cmplt_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_u64( + vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#else + uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); + uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); + uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); + uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); + uint64_t d[2]; + d[0] = (*(double *) &a0) < (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0); + d[1] = (*(double *) &a1) < (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0); + + return vreinterpretq_m128d_u64(vld1q_u64(d)); +#endif +} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b for less-than, store the result in the lower element of dst, and copy the +// upper element from a to the upper element of dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmplt_sd +FORCE_INLINE __m128d _mm_cmplt_sd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) + return _mm_move_sd(a, _mm_cmplt_pd(a, b)); +#else + uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); + uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); + uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); + uint64_t d[2]; + d[0] = (*(double *) &a0) < (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0); + d[1] = a1; + + return vreinterpretq_m128d_u64(vld1q_u64(d)); +#endif +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// for not-equal, and store the results in dst. 
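+
+// Illustrative sketch, not part of the original header: the "less than"
+// compares are simply the "greater than" compares with the operands swapped
+// (x86 itself only provides PCMPGT instructions). A hypothetical identity
+// check using the two intrinsics defined above:
+FORCE_INLINE int sse2neon_example_lt_is_swapped_gt(__m128i a, __m128i b)
+{
+    uint64x2_t lt = vreinterpretq_u64_m128i(_mm_cmplt_epi32(a, b));
+    uint64x2_t gt = vreinterpretq_u64_m128i(_mm_cmpgt_epi32(b, a));
+    uint64x2_t diff = veorq_u64(lt, gt); // identical masks XOR to zero
+    return (vgetq_lane_u64(diff, 0) | vgetq_lane_u64(diff, 1)) == 0;
+}
+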
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpneq_pd +FORCE_INLINE __m128d _mm_cmpneq_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_s32(vmvnq_s32(vreinterpretq_s32_u64( + vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))))); +#else + // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi) + uint32x4_t cmp = + vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b)); + uint32x4_t swapped = vrev64q_u32(cmp); + return vreinterpretq_m128d_u32(vmvnq_u32(vandq_u32(cmp, swapped))); +#endif +} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b for not-equal, store the result in the lower element of dst, and copy the +// upper element from a to the upper element of dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpneq_sd +FORCE_INLINE __m128d _mm_cmpneq_sd(__m128d a, __m128d b) +{ + return _mm_move_sd(a, _mm_cmpneq_pd(a, b)); +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// for not-greater-than-or-equal, and store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnge_pd +FORCE_INLINE __m128d _mm_cmpnge_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_u64(veorq_u64( + vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)), + vdupq_n_u64(UINT64_MAX))); +#else + uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); + uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); + uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); + uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); + uint64_t d[2]; + d[0] = + !((*(double *) &a0) >= (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0); + d[1] = + !((*(double *) &a1) >= (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0); + + return vreinterpretq_m128d_u64(vld1q_u64(d)); +#endif +} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b for not-greater-than-or-equal, store the result in the lower element of +// dst, and copy the upper element from a to the upper element of dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnge_sd +FORCE_INLINE __m128d _mm_cmpnge_sd(__m128d a, __m128d b) +{ + return _mm_move_sd(a, _mm_cmpnge_pd(a, b)); +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// for not-greater-than, and store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_cmpngt_pd +FORCE_INLINE __m128d _mm_cmpngt_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_u64(veorq_u64( + vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)), + vdupq_n_u64(UINT64_MAX))); +#else + uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); + uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); + uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); + uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); + uint64_t d[2]; + d[0] = + !((*(double *) &a0) > (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0); + d[1] = + !((*(double *) &a1) > (*(double *) &b1)) ? 
~UINT64_C(0) : UINT64_C(0); + + return vreinterpretq_m128d_u64(vld1q_u64(d)); +#endif +} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b for not-greater-than, store the result in the lower element of dst, and +// copy the upper element from a to the upper element of dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpngt_sd +FORCE_INLINE __m128d _mm_cmpngt_sd(__m128d a, __m128d b) +{ + return _mm_move_sd(a, _mm_cmpngt_pd(a, b)); +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// for not-less-than-or-equal, and store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnle_pd +FORCE_INLINE __m128d _mm_cmpnle_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_u64(veorq_u64( + vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)), + vdupq_n_u64(UINT64_MAX))); +#else + uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); + uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); + uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); + uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); + uint64_t d[2]; + d[0] = + !((*(double *) &a0) <= (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0); + d[1] = + !((*(double *) &a1) <= (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0); + + return vreinterpretq_m128d_u64(vld1q_u64(d)); +#endif +} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b for not-less-than-or-equal, store the result in the lower element of dst, +// and copy the upper element from a to the upper element of dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnle_sd +FORCE_INLINE __m128d _mm_cmpnle_sd(__m128d a, __m128d b) +{ + return _mm_move_sd(a, _mm_cmpnle_pd(a, b)); +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// for not-less-than, and store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnlt_pd +FORCE_INLINE __m128d _mm_cmpnlt_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_u64(veorq_u64( + vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)), + vdupq_n_u64(UINT64_MAX))); +#else + uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); + uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); + uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); + uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); + uint64_t d[2]; + d[0] = + !((*(double *) &a0) < (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0); + d[1] = + !((*(double *) &a1) < (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0); + + return vreinterpretq_m128d_u64(vld1q_u64(d)); +#endif +} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b for not-less-than, store the result in the lower element of dst, and copy +// the upper element from a to the upper element of dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnlt_sd +FORCE_INLINE __m128d _mm_cmpnlt_sd(__m128d a, __m128d b) +{ + return _mm_move_sd(a, _mm_cmpnlt_pd(a, b)); +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// to see if neither is NaN, and store the results in dst. 
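+
+// Illustrative sketch, not part of the original header: each "not-*" compare
+// above is the bitwise complement of the plain compare, which is why NaN
+// inputs yield all-ones lanes (the underlying ordered compare is false). A
+// hypothetical identity check with intrinsics defined earlier in this file:
+FORCE_INLINE int sse2neon_example_nlt_is_not_lt(__m128d a, __m128d b)
+{
+    uint64x2_t nlt = vreinterpretq_u64_m128d(_mm_cmpnlt_pd(a, b));
+    uint64x2_t lt = vreinterpretq_u64_m128d(_mm_cmplt_pd(a, b));
+    uint64x2_t x = veorq_u64(nlt, lt); // complementary masks XOR to all-ones
+    return (~vgetq_lane_u64(x, 0) | ~vgetq_lane_u64(x, 1)) == 0;
+}
+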
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpord_pd +FORCE_INLINE __m128d _mm_cmpord_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) + // Excluding NaNs, any two floating point numbers can be compared. + uint64x2_t not_nan_a = + vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a)); + uint64x2_t not_nan_b = + vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b)); + return vreinterpretq_m128d_u64(vandq_u64(not_nan_a, not_nan_b)); +#else + uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); + uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); + uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); + uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); + uint64_t d[2]; + d[0] = ((*(double *) &a0) == (*(double *) &a0) && + (*(double *) &b0) == (*(double *) &b0)) + ? ~UINT64_C(0) + : UINT64_C(0); + d[1] = ((*(double *) &a1) == (*(double *) &a1) && + (*(double *) &b1) == (*(double *) &b1)) + ? ~UINT64_C(0) + : UINT64_C(0); + + return vreinterpretq_m128d_u64(vld1q_u64(d)); +#endif +} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b to see if neither is NaN, store the result in the lower element of dst, and +// copy the upper element from a to the upper element of dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpord_sd +FORCE_INLINE __m128d _mm_cmpord_sd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) + return _mm_move_sd(a, _mm_cmpord_pd(a, b)); +#else + uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); + uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); + uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); + uint64_t d[2]; + d[0] = ((*(double *) &a0) == (*(double *) &a0) && + (*(double *) &b0) == (*(double *) &b0)) + ? ~UINT64_C(0) + : UINT64_C(0); + d[1] = a1; + + return vreinterpretq_m128d_u64(vld1q_u64(d)); +#endif +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// to see if either is NaN, and store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpunord_pd +FORCE_INLINE __m128d _mm_cmpunord_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) + // Two NaNs are not equal in comparison operation. + uint64x2_t not_nan_a = + vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a)); + uint64x2_t not_nan_b = + vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b)); + return vreinterpretq_m128d_s32( + vmvnq_s32(vreinterpretq_s32_u64(vandq_u64(not_nan_a, not_nan_b)))); +#else + uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); + uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); + uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); + uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); + uint64_t d[2]; + d[0] = ((*(double *) &a0) == (*(double *) &a0) && + (*(double *) &b0) == (*(double *) &b0)) + ? UINT64_C(0) + : ~UINT64_C(0); + d[1] = ((*(double *) &a1) == (*(double *) &a1) && + (*(double *) &b1) == (*(double *) &b1)) + ? UINT64_C(0) + : ~UINT64_C(0); + + return vreinterpretq_m128d_u64(vld1q_u64(d)); +#endif +} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b to see if either is NaN, store the result in the lower element of dst, and +// copy the upper element from a to the upper element of dst. 
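+
+// Illustrative sketch, not part of the original header: both the ordered and
+// unordered predicates rest on the rule that only NaN compares unequal to
+// itself, which is what the vceqq_f64(x, x) self-compares above exploit. The
+// same rule as a hypothetical scalar check:
+FORCE_INLINE int sse2neon_example_is_nan(double x)
+{
+    return !(x == x); // true only for NaN (assumes -ffast-math is not in use)
+}
+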
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpunord_sd +FORCE_INLINE __m128d _mm_cmpunord_sd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) + return _mm_move_sd(a, _mm_cmpunord_pd(a, b)); +#else + uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); + uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); + uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); + uint64_t d[2]; + d[0] = ((*(double *) &a0) == (*(double *) &a0) && + (*(double *) &b0) == (*(double *) &b0)) + ? UINT64_C(0) + : ~UINT64_C(0); + d[1] = a1; + + return vreinterpretq_m128d_u64(vld1q_u64(d)); +#endif +} + +// Compare the lower double-precision (64-bit) floating-point element in a and b +// for greater-than-or-equal, and return the boolean result (0 or 1). +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comige_sd +FORCE_INLINE int _mm_comige_sd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) + return vgetq_lane_u64(vcgeq_f64(a, b), 0) & 0x1; +#else + uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); + uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); + + return (*(double *) &a0 >= *(double *) &b0); +#endif +} + +// Compare the lower double-precision (64-bit) floating-point element in a and b +// for greater-than, and return the boolean result (0 or 1). +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comigt_sd +FORCE_INLINE int _mm_comigt_sd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) + return vgetq_lane_u64(vcgtq_f64(a, b), 0) & 0x1; +#else + uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); + uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); + + return (*(double *) &a0 > *(double *) &b0); +#endif +} + +// Compare the lower double-precision (64-bit) floating-point element in a and b +// for less-than-or-equal, and return the boolean result (0 or 1). +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comile_sd +FORCE_INLINE int _mm_comile_sd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) + return vgetq_lane_u64(vcleq_f64(a, b), 0) & 0x1; +#else + uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); + uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); + + return (*(double *) &a0 <= *(double *) &b0); +#endif +} + +// Compare the lower double-precision (64-bit) floating-point element in a and b +// for less-than, and return the boolean result (0 or 1). +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comilt_sd +FORCE_INLINE int _mm_comilt_sd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) + return vgetq_lane_u64(vcltq_f64(a, b), 0) & 0x1; +#else + uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); + uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); + + return (*(double *) &a0 < *(double *) &b0); +#endif +} + +// Compare the lower double-precision (64-bit) floating-point element in a and b +// for equality, and return the boolean result (0 or 1). 
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comieq_sd +FORCE_INLINE int _mm_comieq_sd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) + return vgetq_lane_u64(vceqq_f64(a, b), 0) & 0x1; +#else + uint32x4_t a_not_nan = + vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(a)); + uint32x4_t b_not_nan = + vceqq_u32(vreinterpretq_u32_m128d(b), vreinterpretq_u32_m128d(b)); + uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan); + uint32x4_t a_eq_b = + vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b)); + uint64x2_t and_results = vandq_u64(vreinterpretq_u64_u32(a_and_b_not_nan), + vreinterpretq_u64_u32(a_eq_b)); + return vgetq_lane_u64(and_results, 0) & 0x1; +#endif +} + +// Compare the lower double-precision (64-bit) floating-point element in a and b +// for not-equal, and return the boolean result (0 or 1). +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comineq_sd +FORCE_INLINE int _mm_comineq_sd(__m128d a, __m128d b) +{ + return !_mm_comieq_sd(a, b); +} + +// Convert packed signed 32-bit integers in a to packed double-precision +// (64-bit) floating-point elements, and store the results in dst. +// +// FOR j := 0 to 1 +// i := j*32 +// m := j*64 +// dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi32_pd +FORCE_INLINE __m128d _mm_cvtepi32_pd(__m128i a) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_f64( + vcvtq_f64_s64(vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a))))); +#else + double a0 = (double) vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0); + double a1 = (double) vgetq_lane_s32(vreinterpretq_s32_m128i(a), 1); + return _mm_set_pd(a1, a0); +#endif +} + +// Converts the four signed 32-bit integer values of a to single-precision, +// floating-point values +// https://msdn.microsoft.com/en-us/library/vstudio/36bwxcx5(v=vs.100).aspx +FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a) +{ + return vreinterpretq_m128_f32(vcvtq_f32_s32(vreinterpretq_s32_m128i(a))); +} + +// Convert packed double-precision (64-bit) floating-point elements in a to +// packed 32-bit integers, and store the results in dst. +// +// FOR j := 0 to 1 +// i := 32*j +// k := 64*j +// dst[i+31:i] := Convert_FP64_To_Int32(a[k+63:k]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_epi32 +FORCE_INLINE __m128i _mm_cvtpd_epi32(__m128d a) +{ + __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION); + double d0 = ((double *) &rnd)[0]; + double d1 = ((double *) &rnd)[1]; + return _mm_set_epi32(0, 0, (int32_t) d1, (int32_t) d0); +} + +// Convert packed double-precision (64-bit) floating-point elements in a to +// packed 32-bit integers, and store the results in dst. +// +// FOR j := 0 to 1 +// i := 32*j +// k := 64*j +// dst[i+31:i] := Convert_FP64_To_Int32(a[k+63:k]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_pi32 +FORCE_INLINE __m64 _mm_cvtpd_pi32(__m128d a) +{ + __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION); + double d0 = ((double *) &rnd)[0]; + double d1 = ((double *) &rnd)[1]; + int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) d0, (int32_t) d1}; + return vreinterpret_m64_s32(vld1_s32(data)); +} + +// Convert packed double-precision (64-bit) floating-point elements in a to +// packed single-precision (32-bit) floating-point elements, and store the +// results in dst. 
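+
+// Illustrative sketch, not part of the original header: _mm_cvtpd_epi32 only
+// produces two 32-bit results and zeroes the upper half of dst, while
+// _mm_cvtepi32_pd only reads the two low source lanes. A hypothetical
+// round-trip of the low integer pair:
+FORCE_INLINE __m128i sse2neon_example_pd_roundtrip(__m128i lo32)
+{
+    __m128d wide = _mm_cvtepi32_pd(lo32); // widen lanes 0 and 1 to double
+    return _mm_cvtpd_epi32(wide);         // back to int32; lanes 2 and 3 are 0
+}
+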
+// +// FOR j := 0 to 1 +// i := 32*j +// k := 64*j +// dst[i+31:i] := Convert_FP64_To_FP32(a[k+64:k]) +// ENDFOR +// dst[127:64] := 0 +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_ps +FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a) +{ +#if defined(__aarch64__) + float32x2_t tmp = vcvt_f32_f64(vreinterpretq_f64_m128d(a)); + return vreinterpretq_m128_f32(vcombine_f32(tmp, vdup_n_f32(0))); +#else + float a0 = (float) ((double *) &a)[0]; + float a1 = (float) ((double *) &a)[1]; + return _mm_set_ps(0, 0, a1, a0); +#endif +} + +// Convert packed signed 32-bit integers in a to packed double-precision +// (64-bit) floating-point elements, and store the results in dst. +// +// FOR j := 0 to 1 +// i := j*32 +// m := j*64 +// dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32_pd +FORCE_INLINE __m128d _mm_cvtpi32_pd(__m64 a) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_f64( + vcvtq_f64_s64(vmovl_s32(vreinterpret_s32_m64(a)))); +#else + double a0 = (double) vget_lane_s32(vreinterpret_s32_m64(a), 0); + double a1 = (double) vget_lane_s32(vreinterpret_s32_m64(a), 1); + return _mm_set_pd(a1, a0); +#endif +} + +// Converts the four single-precision, floating-point values of a to signed +// 32-bit integer values. +// +// r0 := (int) a0 +// r1 := (int) a1 +// r2 := (int) a2 +// r3 := (int) a3 +// +// https://msdn.microsoft.com/en-us/library/vstudio/xdc42k5e(v=vs.100).aspx +// *NOTE*. The default rounding mode on SSE is 'round to even', which ARMv7-A +// does not support! It is supported on ARMv8-A however. +FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a) +{ +#if defined(__aarch64__) + switch (_MM_GET_ROUNDING_MODE()) { + case _MM_ROUND_NEAREST: + return vreinterpretq_m128i_s32(vcvtnq_s32_f32(a)); + case _MM_ROUND_DOWN: + return vreinterpretq_m128i_s32(vcvtmq_s32_f32(a)); + case _MM_ROUND_UP: + return vreinterpretq_m128i_s32(vcvtpq_s32_f32(a)); + default: // _MM_ROUND_TOWARD_ZERO + return vreinterpretq_m128i_s32(vcvtq_s32_f32(a)); + } +#else + float *f = (float *) &a; + switch (_MM_GET_ROUNDING_MODE()) { + case _MM_ROUND_NEAREST: { + uint32x4_t signmask = vdupq_n_u32(0x80000000); + float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a), + vdupq_n_f32(0.5f)); /* +/- 0.5 */ + int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32( + vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/ + int32x4_t r_trunc = vcvtq_s32_f32( + vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */ + int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32( + vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */ + int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone), + vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */ + float32x4_t delta = vsubq_f32( + vreinterpretq_f32_m128(a), + vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */ + uint32x4_t is_delta_half = + vceqq_f32(delta, half); /* delta == +/- 0.5 */ + return vreinterpretq_m128i_s32( + vbslq_s32(is_delta_half, r_even, r_normal)); + } + case _MM_ROUND_DOWN: + return _mm_set_epi32(floorf(f[3]), floorf(f[2]), floorf(f[1]), + floorf(f[0])); + case _MM_ROUND_UP: + return _mm_set_epi32(ceilf(f[3]), ceilf(f[2]), ceilf(f[1]), + ceilf(f[0])); + default: // _MM_ROUND_TOWARD_ZERO + return _mm_set_epi32((int32_t) f[3], (int32_t) f[2], (int32_t) f[1], + (int32_t) f[0]); + } +#endif +} + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed double-precision (64-bit) floating-point 
elements, and store the +// results in dst. +// +// FOR j := 0 to 1 +// i := 64*j +// k := 32*j +// dst[i+63:i] := Convert_FP32_To_FP64(a[k+31:k]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pd +FORCE_INLINE __m128d _mm_cvtps_pd(__m128 a) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_f64( + vcvt_f64_f32(vget_low_f32(vreinterpretq_f32_m128(a)))); +#else + double a0 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); + double a1 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 1); + return _mm_set_pd(a1, a0); +#endif +} + +// Copy the lower double-precision (64-bit) floating-point element of a to dst. +// +// dst[63:0] := a[63:0] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_f64 +FORCE_INLINE double _mm_cvtsd_f64(__m128d a) +{ +#if defined(__aarch64__) + return (double) vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0); +#else + return ((double *) &a)[0]; +#endif +} + +// Convert the lower double-precision (64-bit) floating-point element in a to a +// 32-bit integer, and store the result in dst. +// +// dst[31:0] := Convert_FP64_To_Int32(a[63:0]) +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_si32 +FORCE_INLINE int32_t _mm_cvtsd_si32(__m128d a) +{ +#if defined(__aarch64__) + return (int32_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0); +#else + __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION); + double ret = ((double *) &rnd)[0]; + return (int32_t) ret; +#endif +} + +// Convert the lower double-precision (64-bit) floating-point element in a to a +// 64-bit integer, and store the result in dst. +// +// dst[63:0] := Convert_FP64_To_Int64(a[63:0]) +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_si64 +FORCE_INLINE int64_t _mm_cvtsd_si64(__m128d a) +{ +#if defined(__aarch64__) + return (int64_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0); +#else + __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION); + double ret = ((double *) &rnd)[0]; + return (int64_t) ret; +#endif +} + +// Convert the lower double-precision (64-bit) floating-point element in a to a +// 64-bit integer, and store the result in dst. +// +// dst[63:0] := Convert_FP64_To_Int64(a[63:0]) +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_si64x +#define _mm_cvtsd_si64x _mm_cvtsd_si64 + +// Convert the lower double-precision (64-bit) floating-point element in b to a +// single-precision (32-bit) floating-point element, store the result in the +// lower element of dst, and copy the upper 3 packed elements from a to the +// upper elements of dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_ss +FORCE_INLINE __m128 _mm_cvtsd_ss(__m128 a, __m128d b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128_f32(vsetq_lane_f32( + vget_lane_f32(vcvt_f32_f64(vreinterpretq_f64_m128d(b)), 0), + vreinterpretq_f32_m128(a), 0)); +#else + return vreinterpretq_m128_f32(vsetq_lane_f32((float) ((double *) &b)[0], + vreinterpretq_f32_m128(a), 0)); +#endif +} + +// Copy the lower 32-bit integer in a to dst. +// +// dst[31:0] := a[31:0] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si32 +FORCE_INLINE int _mm_cvtsi128_si32(__m128i a) +{ + return vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0); +} + +// Copy the lower 64-bit integer in a to dst. 
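A brief sketch of the scalar conversions above (hypothetical caller, assuming the header is included and the floating-point environment is left in its default round-to-nearest mode): _mm_cvtsd_f64 simply copies the low double out, while _mm_cvtsd_si32 and _mm_cvtsd_si64 round it to an integer instead of truncating.

    #include "sse2neon.h"
    #include <assert.h>

    static void cvtsd_example(void)
    {
        __m128d v = _mm_set_pd(9.0, 2.4);      /* lane 0 = 2.4, lane 1 = 9.0 */

        assert(_mm_cvtsd_f64(v) == 2.4);       /* low double, unchanged */
        assert(_mm_cvtsd_si32(v) == 2);        /* rounded to the nearest integer */
        assert(_mm_cvtsd_si64(v) == 2);        /* same behaviour, 64-bit result */
    }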
+// +// dst[63:0] := a[63:0] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64 +FORCE_INLINE int64_t _mm_cvtsi128_si64(__m128i a) +{ + return vgetq_lane_s64(vreinterpretq_s64_m128i(a), 0); +} + +// Copy the lower 64-bit integer in a to dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64x +#define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a) + +// Convert the signed 32-bit integer b to a double-precision (64-bit) +// floating-point element, store the result in the lower element of dst, and +// copy the upper element from a to the upper element of dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi32_sd +FORCE_INLINE __m128d _mm_cvtsi32_sd(__m128d a, int32_t b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_f64( + vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0)); +#else + double bf = (double) b; + return vreinterpretq_m128d_s64( + vsetq_lane_s64(*(int64_t *) &bf, vreinterpretq_s64_m128d(a), 0)); +#endif +} + +// Copy the lower 64-bit integer in a to dst. +// +// dst[63:0] := a[63:0] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64x +#define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a) + +// Moves 32-bit integer a to the least significant 32 bits of an __m128 object, +// zero extending the upper bits. +// +// r0 := a +// r1 := 0x0 +// r2 := 0x0 +// r3 := 0x0 +// +// https://msdn.microsoft.com/en-us/library/ct3539ha%28v=vs.90%29.aspx +FORCE_INLINE __m128i _mm_cvtsi32_si128(int a) +{ + return vreinterpretq_m128i_s32(vsetq_lane_s32(a, vdupq_n_s32(0), 0)); +} + +// Convert the signed 64-bit integer b to a double-precision (64-bit) +// floating-point element, store the result in the lower element of dst, and +// copy the upper element from a to the upper element of dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64_sd +FORCE_INLINE __m128d _mm_cvtsi64_sd(__m128d a, int64_t b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_f64( + vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0)); +#else + double bf = (double) b; + return vreinterpretq_m128d_s64( + vsetq_lane_s64(*(int64_t *) &bf, vreinterpretq_s64_m128d(a), 0)); +#endif +} + +// Moves 64-bit integer a to the least significant 64 bits of an __m128 object, +// zero extending the upper bits. +// +// r0 := a +// r1 := 0x0 +FORCE_INLINE __m128i _mm_cvtsi64_si128(int64_t a) +{ + return vreinterpretq_m128i_s64(vsetq_lane_s64(a, vdupq_n_s64(0), 0)); +} + +// Copy 64-bit integer a to the lower element of dst, and zero the upper +// element. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64x_si128 +#define _mm_cvtsi64x_si128(a) _mm_cvtsi64_si128(a) + +// Convert the signed 64-bit integer b to a double-precision (64-bit) +// floating-point element, store the result in the lower element of dst, and +// copy the upper element from a to the upper element of dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64x_sd +#define _mm_cvtsi64x_sd(a, b) _mm_cvtsi64_sd(a, b) + +// Convert the lower single-precision (32-bit) floating-point element in b to a +// double-precision (64-bit) floating-point element, store the result in the +// lower element of dst, and copy the upper element from a to the upper element +// of dst. 
+// +// dst[63:0] := Convert_FP32_To_FP64(b[31:0]) +// dst[127:64] := a[127:64] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_sd +FORCE_INLINE __m128d _mm_cvtss_sd(__m128d a, __m128 b) +{ + double d = (double) vgetq_lane_f32(vreinterpretq_f32_m128(b), 0); +#if defined(__aarch64__) + return vreinterpretq_m128d_f64( + vsetq_lane_f64(d, vreinterpretq_f64_m128d(a), 0)); +#else + return vreinterpretq_m128d_s64( + vsetq_lane_s64(*(int64_t *) &d, vreinterpretq_s64_m128d(a), 0)); +#endif +} + +// Convert packed double-precision (64-bit) floating-point elements in a to +// packed 32-bit integers with truncation, and store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttpd_epi32 +FORCE_INLINE __m128i _mm_cvttpd_epi32(__m128d a) +{ + double a0 = ((double *) &a)[0]; + double a1 = ((double *) &a)[1]; + return _mm_set_epi32(0, 0, (int32_t) a1, (int32_t) a0); +} + +// Convert packed double-precision (64-bit) floating-point elements in a to +// packed 32-bit integers with truncation, and store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttpd_pi32 +FORCE_INLINE __m64 _mm_cvttpd_pi32(__m128d a) +{ + double a0 = ((double *) &a)[0]; + double a1 = ((double *) &a)[1]; + int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) a0, (int32_t) a1}; + return vreinterpret_m64_s32(vld1_s32(data)); +} + +// Converts the four single-precision, floating-point values of a to signed +// 32-bit integer values using truncate. +// https://msdn.microsoft.com/en-us/library/vstudio/1h005y6x(v=vs.100).aspx +FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a) +{ + return vreinterpretq_m128i_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a))); +} + +// Convert the lower double-precision (64-bit) floating-point element in a to a +// 32-bit integer with truncation, and store the result in dst. +// +// dst[63:0] := Convert_FP64_To_Int32_Truncate(a[63:0]) +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_si32 +FORCE_INLINE int32_t _mm_cvttsd_si32(__m128d a) +{ + double ret = *((double *) &a); + return (int32_t) ret; +} + +// Convert the lower double-precision (64-bit) floating-point element in a to a +// 64-bit integer with truncation, and store the result in dst. +// +// dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0]) +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_si64 +FORCE_INLINE int64_t _mm_cvttsd_si64(__m128d a) +{ +#if defined(__aarch64__) + return vgetq_lane_s64(vcvtq_s64_f64(vreinterpretq_f64_m128d(a)), 0); +#else + double ret = *((double *) &a); + return (int64_t) ret; +#endif +} + +// Convert the lower double-precision (64-bit) floating-point element in a to a +// 64-bit integer with truncation, and store the result in dst. +// +// dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0]) +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_si64x +#define _mm_cvttsd_si64x(a) _mm_cvttsd_si64(a) + +// Divide packed double-precision (64-bit) floating-point elements in a by +// packed elements in b, and store the results in dst. 
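To contrast the truncating conversions above with their rounding counterparts, a minimal sketch (hypothetical caller, default round-to-nearest mode assumed): the extra 't' in the name means truncate toward zero, independent of the current rounding mode.

    #include "sse2neon.h"
    #include <assert.h>

    static void truncate_vs_round(void)
    {
        __m128d v = _mm_set_pd(-1.5, 2.7);     /* lane 0 = 2.7, lane 1 = -1.5 */

        assert(_mm_cvttsd_si32(v) == 2);       /* truncation drops the fraction */
        assert(_mm_cvtsd_si32(v) == 3);        /* rounding picks the nearest integer */

        __m128i t = _mm_cvttpd_epi32(v);       /* packed form: {2, -1, 0, 0} */
        assert(_mm_cvtsi128_si32(t) == 2);
    }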
+// +// FOR j := 0 to 1 +// i := 64*j +// dst[i+63:i] := a[i+63:i] / b[i+63:i] +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_pd +FORCE_INLINE __m128d _mm_div_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_f64( + vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#else + double *da = (double *) &a; + double *db = (double *) &b; + double c[2]; + c[0] = da[0] / db[0]; + c[1] = da[1] / db[1]; + return vld1q_f32((float32_t *) c); +#endif +} + +// Divide the lower double-precision (64-bit) floating-point element in a by the +// lower double-precision (64-bit) floating-point element in b, store the result +// in the lower element of dst, and copy the upper element from a to the upper +// element of dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_sd +FORCE_INLINE __m128d _mm_div_sd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) + float64x2_t tmp = + vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)); + return vreinterpretq_m128d_f64( + vsetq_lane_f64(vgetq_lane_f64(vreinterpretq_f64_m128d(a), 1), tmp, 1)); +#else + return _mm_move_sd(a, _mm_div_pd(a, b)); +#endif +} + +// Extracts the selected signed or unsigned 16-bit integer from a and zero +// extends. +// https://msdn.microsoft.com/en-us/library/6dceta0c(v=vs.100).aspx +// FORCE_INLINE int _mm_extract_epi16(__m128i a, __constrange(0,8) int imm) +#define _mm_extract_epi16(a, imm) \ + vgetq_lane_u16(vreinterpretq_u16_m128i(a), (imm)) + +// Inserts the least significant 16 bits of b into the selected 16-bit integer +// of a. +// https://msdn.microsoft.com/en-us/library/kaze8hz1%28v=vs.100%29.aspx +// FORCE_INLINE __m128i _mm_insert_epi16(__m128i a, int b, +// __constrange(0,8) int imm) +#define _mm_insert_epi16(a, b, imm) \ + __extension__({ \ + vreinterpretq_m128i_s16( \ + vsetq_lane_s16((b), vreinterpretq_s16_m128i(a), (imm))); \ + }) + +// Loads two double-precision from 16-byte aligned memory, floating-point +// values. +// +// dst[127:0] := MEM[mem_addr+127:mem_addr] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd +FORCE_INLINE __m128d _mm_load_pd(const double *p) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_f64(vld1q_f64(p)); +#else + const float *fp = (const float *) p; + float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], fp[2], fp[3]}; + return vreinterpretq_m128d_f32(vld1q_f32(data)); +#endif +} + +// Load a double-precision (64-bit) floating-point element from memory into both +// elements of dst. +// +// dst[63:0] := MEM[mem_addr+63:mem_addr] +// dst[127:64] := MEM[mem_addr+63:mem_addr] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd1 +#define _mm_load_pd1 _mm_load1_pd + +// Load a double-precision (64-bit) floating-point element from memory into the +// lower of dst, and zero the upper element. mem_addr does not need to be +// aligned on any particular boundary. +// +// dst[63:0] := MEM[mem_addr+63:mem_addr] +// dst[127:64] := 0 +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_sd +FORCE_INLINE __m128d _mm_load_sd(const double *p) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_f64(vsetq_lane_f64(*p, vdupq_n_f64(0), 0)); +#else + const float *fp = (const float *) p; + float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], 0, 0}; + return vreinterpretq_m128d_f32(vld1q_f32(data)); +#endif +} + +// Loads 128-bit value. 
: +// https://msdn.microsoft.com/en-us/library/atzzad1h(v=vs.80).aspx +FORCE_INLINE __m128i _mm_load_si128(const __m128i *p) +{ + return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p)); +} + +// Load a double-precision (64-bit) floating-point element from memory into both +// elements of dst. +// +// dst[63:0] := MEM[mem_addr+63:mem_addr] +// dst[127:64] := MEM[mem_addr+63:mem_addr] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load1_pd +FORCE_INLINE __m128d _mm_load1_pd(const double *p) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_f64(vld1q_dup_f64(p)); +#else + return vreinterpretq_m128d_s64(vdupq_n_s64(*(const int64_t *) p)); +#endif +} + +// Load a double-precision (64-bit) floating-point element from memory into the +// upper element of dst, and copy the lower element from a to dst. mem_addr does +// not need to be aligned on any particular boundary. +// +// dst[63:0] := a[63:0] +// dst[127:64] := MEM[mem_addr+63:mem_addr] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadh_pd +FORCE_INLINE __m128d _mm_loadh_pd(__m128d a, const double *p) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_f64( + vcombine_f64(vget_low_f64(vreinterpretq_f64_m128d(a)), vld1_f64(p))); +#else + return vreinterpretq_m128d_f32(vcombine_f32( + vget_low_f32(vreinterpretq_f32_m128d(a)), vld1_f32((const float *) p))); +#endif +} + +// Load 64-bit integer from memory into the first element of dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadl_epi64 +FORCE_INLINE __m128i _mm_loadl_epi64(__m128i const *p) +{ + /* Load the lower 64 bits of the value pointed to by p into the + * lower 64 bits of the result, zeroing the upper 64 bits of the result. + */ + return vreinterpretq_m128i_s32( + vcombine_s32(vld1_s32((int32_t const *) p), vcreate_s32(0))); +} + +// Load a double-precision (64-bit) floating-point element from memory into the +// lower element of dst, and copy the upper element from a to dst. mem_addr does +// not need to be aligned on any particular boundary. +// +// dst[63:0] := MEM[mem_addr+63:mem_addr] +// dst[127:64] := a[127:64] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadl_pd +FORCE_INLINE __m128d _mm_loadl_pd(__m128d a, const double *p) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_f64( + vcombine_f64(vld1_f64(p), vget_high_f64(vreinterpretq_f64_m128d(a)))); +#else + return vreinterpretq_m128d_f32( + vcombine_f32(vld1_f32((const float *) p), + vget_high_f32(vreinterpretq_f32_m128d(a)))); +#endif +} + +// Load 2 double-precision (64-bit) floating-point elements from memory into dst +// in reverse order. mem_addr must be aligned on a 16-byte boundary or a +// general-protection exception may be generated. +// +// dst[63:0] := MEM[mem_addr+127:mem_addr+64] +// dst[127:64] := MEM[mem_addr+63:mem_addr] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadr_pd +FORCE_INLINE __m128d _mm_loadr_pd(const double *p) +{ +#if defined(__aarch64__) + float64x2_t v = vld1q_f64(p); + return vreinterpretq_m128d_f64(vextq_f64(v, v, 1)); +#else + int64x2_t v = vld1q_s64((const int64_t *) p); + return vreinterpretq_m128d_s64(vextq_s64(v, v, 1)); +#endif +} + +// Loads two double-precision from unaligned memory, floating-point values. 
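A short sketch of the double-precision load helpers above (hypothetical caller; the buffer is kept 16-byte aligned for portability even though the NEON loads used here, unlike their x86 counterparts, typically tolerate unaligned addresses):

    #include "sse2neon.h"
    #include <assert.h>

    static void load_variants_example(void)
    {
        _Alignas(16) double buf[2] = {1.0, 2.0};

        __m128d mem_order = _mm_load_pd(buf);   /* {1.0, 2.0}, element 0 first */
        __m128d reversed  = _mm_loadr_pd(buf);  /* {2.0, 1.0} */
        __m128d splat     = _mm_load1_pd(buf);  /* {1.0, 1.0} */

        assert(_mm_cvtsd_f64(mem_order) == 1.0);
        assert(_mm_cvtsd_f64(reversed) == 2.0);
        assert(_mm_cvtsd_f64(splat) == 1.0);
    }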
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_pd +FORCE_INLINE __m128d _mm_loadu_pd(const double *p) +{ + return _mm_load_pd(p); +} + +// Loads 128-bit value. : +// https://msdn.microsoft.com/zh-cn/library/f4k12ae8(v=vs.90).aspx +FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p) +{ + return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p)); +} + +// Load unaligned 32-bit integer from memory into the first element of dst. +// +// dst[31:0] := MEM[mem_addr+31:mem_addr] +// dst[MAX:32] := 0 +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si32 +FORCE_INLINE __m128i _mm_loadu_si32(const void *p) +{ + return vreinterpretq_m128i_s32( + vsetq_lane_s32(*(const int32_t *) p, vdupq_n_s32(0), 0)); +} + +// Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit +// integers from b. +// +// r0 := (a0 * b0) + (a1 * b1) +// r1 := (a2 * b2) + (a3 * b3) +// r2 := (a4 * b4) + (a5 * b5) +// r3 := (a6 * b6) + (a7 * b7) +// https://msdn.microsoft.com/en-us/library/yht36sa6(v=vs.90).aspx +FORCE_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b) +{ + int32x4_t low = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)), + vget_low_s16(vreinterpretq_s16_m128i(b))); + int32x4_t high = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)), + vget_high_s16(vreinterpretq_s16_m128i(b))); + + int32x2_t low_sum = vpadd_s32(vget_low_s32(low), vget_high_s32(low)); + int32x2_t high_sum = vpadd_s32(vget_low_s32(high), vget_high_s32(high)); + + return vreinterpretq_m128i_s32(vcombine_s32(low_sum, high_sum)); +} + +// Conditionally store 8-bit integer elements from a into memory using mask +// (elements are not stored when the highest bit is not set in the corresponding +// element) and a non-temporal memory hint. mem_addr does not need to be aligned +// on any particular boundary. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskmoveu_si128 +FORCE_INLINE void _mm_maskmoveu_si128(__m128i a, __m128i mask, char *mem_addr) +{ + int8x16_t shr_mask = vshrq_n_s8(vreinterpretq_s8_m128i(mask), 7); + __m128 b = _mm_load_ps((const float *) mem_addr); + int8x16_t masked = + vbslq_s8(vreinterpretq_u8_s8(shr_mask), vreinterpretq_s8_m128i(a), + vreinterpretq_s8_m128(b)); + vst1q_s8((int8_t *) mem_addr, masked); +} + +// Computes the pairwise maxima of the 8 signed 16-bit integers from a and the 8 +// signed 16-bit integers from b. +// https://msdn.microsoft.com/en-us/LIBRary/3x060h7c(v=vs.100).aspx +FORCE_INLINE __m128i _mm_max_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s16( + vmaxq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Computes the pairwise maxima of the 16 unsigned 8-bit integers from a and the +// 16 unsigned 8-bit integers from b. +// https://msdn.microsoft.com/en-us/library/st6634za(v=vs.100).aspx +FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u8( + vmaxq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b))); +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b, +// and store packed maximum values in dst. 
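_mm_madd_epi16 above is the usual building block for 16-bit dot products: it multiplies corresponding 16-bit lanes and adds each adjacent pair into a 32-bit lane. A minimal sketch (hypothetical caller):

    #include "sse2neon.h"
    #include <assert.h>
    #include <stdint.h>

    static void madd_example(void)
    {
        __m128i a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
        __m128i b = _mm_set1_epi16(10);
        __m128i s = _mm_madd_epi16(a, b);      /* {1*10 + 2*10, 3*10 + 4*10, ...} */

        int32_t out[4];
        _mm_storeu_si128((__m128i *) out, s);
        assert(out[0] == 30 && out[1] == 70 && out[2] == 110 && out[3] == 150);
    }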
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pd +FORCE_INLINE __m128d _mm_max_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) +#if SSE2NEON_PRECISE_MINMAX + float64x2_t _a = vreinterpretq_f64_m128d(a); + float64x2_t _b = vreinterpretq_f64_m128d(b); + return vreinterpretq_m128d_f64(vbslq_f64(vcgtq_f64(_a, _b), _a, _b)); +#else + return vreinterpretq_m128d_f64( + vmaxq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#endif +#else + uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); + uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); + uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); + uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); + uint64_t d[2]; + d[0] = (*(double *) &a0) > (*(double *) &b0) ? a0 : b0; + d[1] = (*(double *) &a1) > (*(double *) &b1) ? a1 : b1; + + return vreinterpretq_m128d_u64(vld1q_u64(d)); +#endif +} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b, store the maximum value in the lower element of dst, and copy the upper +// element from a to the upper element of dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_sd +FORCE_INLINE __m128d _mm_max_sd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) + return _mm_move_sd(a, _mm_max_pd(a, b)); +#else + double *da = (double *) &a; + double *db = (double *) &b; + double c[2] = {da[0] > db[0] ? da[0] : db[0], da[1]}; + return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) c)); +#endif +} + +// Computes the pairwise minima of the 8 signed 16-bit integers from a and the 8 +// signed 16-bit integers from b. +// https://msdn.microsoft.com/en-us/library/vstudio/6te997ew(v=vs.100).aspx +FORCE_INLINE __m128i _mm_min_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s16( + vminq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Computes the pairwise minima of the 16 unsigned 8-bit integers from a and the +// 16 unsigned 8-bit integers from b. +// https://msdn.microsoft.com/ko-kr/library/17k8cf58(v=vs.100).aspxx +FORCE_INLINE __m128i _mm_min_epu8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u8( + vminq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b))); +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b, +// and store packed minimum values in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pd +FORCE_INLINE __m128d _mm_min_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) +#if SSE2NEON_PRECISE_MINMAX + float64x2_t _a = vreinterpretq_f64_m128d(a); + float64x2_t _b = vreinterpretq_f64_m128d(b); + return vreinterpretq_m128d_f64(vbslq_f64(vcltq_f64(_a, _b), _a, _b)); +#else + return vreinterpretq_m128d_f64( + vminq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#endif +#else + uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); + uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); + uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); + uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); + uint64_t d[2]; + d[0] = (*(double *) &a0) < (*(double *) &b0) ? a0 : b0; + d[1] = (*(double *) &a1) < (*(double *) &b1) ? 
a1 : b1; + return vreinterpretq_m128d_u64(vld1q_u64(d)); +#endif +} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b, store the minimum value in the lower element of dst, and copy the upper +// element from a to the upper element of dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_sd +FORCE_INLINE __m128d _mm_min_sd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) + return _mm_move_sd(a, _mm_min_pd(a, b)); +#else + double *da = (double *) &a; + double *db = (double *) &b; + double c[2] = {da[0] < db[0] ? da[0] : db[0], da[1]}; + return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) c)); +#endif +} + +// Copy the lower 64-bit integer in a to the lower element of dst, and zero the +// upper element. +// +// dst[63:0] := a[63:0] +// dst[127:64] := 0 +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_epi64 +FORCE_INLINE __m128i _mm_move_epi64(__m128i a) +{ + return vreinterpretq_m128i_s64( + vsetq_lane_s64(0, vreinterpretq_s64_m128i(a), 1)); +} + +// Move the lower double-precision (64-bit) floating-point element from b to the +// lower element of dst, and copy the upper element from a to the upper element +// of dst. +// +// dst[63:0] := b[63:0] +// dst[127:64] := a[127:64] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_sd +FORCE_INLINE __m128d _mm_move_sd(__m128d a, __m128d b) +{ + return vreinterpretq_m128d_f32( + vcombine_f32(vget_low_f32(vreinterpretq_f32_m128d(b)), + vget_high_f32(vreinterpretq_f32_m128d(a)))); +} + +// NEON does not provide a version of this function. +// Creates a 16-bit mask from the most significant bits of the 16 signed or +// unsigned 8-bit integers in a and zero extends the upper bits. +// https://msdn.microsoft.com/en-us/library/vstudio/s090c8fk(v=vs.100).aspx +FORCE_INLINE int _mm_movemask_epi8(__m128i a) +{ + // Use increasingly wide shifts+adds to collect the sign bits + // together. + // Since the widening shifts would be rather confusing to follow in little + // endian, everything will be illustrated in big endian order instead. This + // has a different result - the bits would actually be reversed on a big + // endian machine. + + // Starting input (only half the elements are shown): + // 89 ff 1d c0 00 10 99 33 + uint8x16_t input = vreinterpretq_u8_m128i(a); + + // Shift out everything but the sign bits with an unsigned shift right. + // + // Bytes of the vector:: + // 89 ff 1d c0 00 10 99 33 + // \ \ \ \ \ \ \ \ high_bits = (uint16x4_t)(input >> 7) + // | | | | | | | | + // 01 01 00 01 00 00 01 00 + // + // Bits of first important lane(s): + // 10001001 (89) + // \______ + // | + // 00000001 (01) + uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7)); + + // Merge the even lanes together with a 16-bit unsigned shift right + add. + // 'xx' represents garbage data which will be ignored in the final result. + // In the important bytes, the add functions like a binary OR. + // + // 01 01 00 01 00 00 01 00 + // \_ | \_ | \_ | \_ | paired16 = (uint32x4_t)(input + (input >> 7)) + // \| \| \| \| + // xx 03 xx 01 xx 00 xx 02 + // + // 00000001 00000001 (01 01) + // \_______ | + // \| + // xxxxxxxx xxxxxx11 (xx 03) + uint32x4_t paired16 = + vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7)); + + // Repeat with a wider 32-bit shift + add. 
+ // xx 03 xx 01 xx 00 xx 02 + // \____ | \____ | paired32 = (uint64x1_t)(paired16 + (paired16 >> + // 14)) + // \| \| + // xx xx xx 0d xx xx xx 02 + // + // 00000011 00000001 (03 01) + // \\_____ || + // '----.\|| + // xxxxxxxx xxxx1101 (xx 0d) + uint64x2_t paired32 = + vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14)); + + // Last, an even wider 64-bit shift + add to get our result in the low 8 bit + // lanes. xx xx xx 0d xx xx xx 02 + // \_________ | paired64 = (uint8x8_t)(paired32 + (paired32 >> + // 28)) + // \| + // xx xx xx xx xx xx xx d2 + // + // 00001101 00000010 (0d 02) + // \ \___ | | + // '---. \| | + // xxxxxxxx 11010010 (xx d2) + uint8x16_t paired64 = + vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28)); + + // Extract the low 8 bits from each 64-bit lane with 2 8-bit extracts. + // xx xx xx xx xx xx xx d2 + // || return paired64[0] + // d2 + // Note: Little endian would return the correct value 4b (01001011) instead. + return vgetq_lane_u8(paired64, 0) | ((int) vgetq_lane_u8(paired64, 8) << 8); +} + +// Set each bit of mask dst based on the most significant bit of the +// corresponding packed double-precision (64-bit) floating-point element in a. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movemask_pd +FORCE_INLINE int _mm_movemask_pd(__m128d a) +{ + uint64x2_t input = vreinterpretq_u64_m128d(a); + uint64x2_t high_bits = vshrq_n_u64(input, 63); + return vgetq_lane_u64(high_bits, 0) | (vgetq_lane_u64(high_bits, 1) << 1); +} + +// Copy the lower 64-bit integer in a to dst. +// +// dst[63:0] := a[63:0] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movepi64_pi64 +FORCE_INLINE __m64 _mm_movepi64_pi64(__m128i a) +{ + return vreinterpret_m64_s64(vget_low_s64(vreinterpretq_s64_m128i(a))); +} + +// Copy the 64-bit integer a to the lower element of dst, and zero the upper +// element. +// +// dst[63:0] := a[63:0] +// dst[127:64] := 0 +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movpi64_epi64 +FORCE_INLINE __m128i _mm_movpi64_epi64(__m64 a) +{ + return vreinterpretq_m128i_s64( + vcombine_s64(vreinterpret_s64_m64(a), vdup_n_s64(0))); +} + +// Multiply the low unsigned 32-bit integers from each packed 64-bit element in +// a and b, and store the unsigned 64-bit results in dst. +// +// r0 := (a0 & 0xFFFFFFFF) * (b0 & 0xFFFFFFFF) +// r1 := (a2 & 0xFFFFFFFF) * (b2 & 0xFFFFFFFF) +FORCE_INLINE __m128i _mm_mul_epu32(__m128i a, __m128i b) +{ + // vmull_u32 upcasts instead of masking, so we downcast. + uint32x2_t a_lo = vmovn_u64(vreinterpretq_u64_m128i(a)); + uint32x2_t b_lo = vmovn_u64(vreinterpretq_u64_m128i(b)); + return vreinterpretq_m128i_u64(vmull_u32(a_lo, b_lo)); +} + +// Multiply packed double-precision (64-bit) floating-point elements in a and b, +// and store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_pd +FORCE_INLINE __m128d _mm_mul_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_f64( + vmulq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#else + double *da = (double *) &a; + double *db = (double *) &b; + double c[2]; + c[0] = da[0] * db[0]; + c[1] = da[1] * db[1]; + return vld1q_f32((float32_t *) c); +#endif +} + +// Multiply the lower double-precision (64-bit) floating-point element in a and +// b, store the result in the lower element of dst, and copy the upper element +// from a to the upper element of dst. 
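The bit-gathering walkthrough above is easiest to connect to practice with a small sketch (hypothetical caller): the common pattern is to compare bytes with _mm_cmpeq_epi8 (defined earlier in this header) and then collapse the 16 lane results into a 16-bit scan mask.

    #include "sse2neon.h"
    #include <assert.h>

    static void movemask_example(void)
    {
        __m128i hay = _mm_setr_epi8('a', 'b', 'a', 'b', 'a', 'b', 'a', 'b',
                                    'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b');
        __m128i eq  = _mm_cmpeq_epi8(hay, _mm_set1_epi8('a'));
        int mask    = _mm_movemask_epi8(eq);   /* bit i is set where byte i matched */
        assert(mask == 0x5555);                /* every even byte position is 'a' */

        __m128d d = _mm_set_pd(-1.0, 1.0);     /* lane 0 = +1.0, lane 1 = -1.0 */
        assert(_mm_movemask_pd(d) == 0x2);     /* only the upper lane's sign bit is set */
    }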
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mul_sd +FORCE_INLINE __m128d _mm_mul_sd(__m128d a, __m128d b) +{ + return _mm_move_sd(a, _mm_mul_pd(a, b)); +} + +// Multiply the low unsigned 32-bit integers from a and b, and store the +// unsigned 64-bit result in dst. +// +// dst[63:0] := a[31:0] * b[31:0] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_su32 +FORCE_INLINE __m64 _mm_mul_su32(__m64 a, __m64 b) +{ + return vreinterpret_m64_u64(vget_low_u64( + vmull_u32(vreinterpret_u32_m64(a), vreinterpret_u32_m64(b)))); +} + +// Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit +// integers from b. +// +// r0 := (a0 * b0)[31:16] +// r1 := (a1 * b1)[31:16] +// ... +// r7 := (a7 * b7)[31:16] +// +// https://msdn.microsoft.com/en-us/library/vstudio/59hddw1d(v=vs.100).aspx +FORCE_INLINE __m128i _mm_mulhi_epi16(__m128i a, __m128i b) +{ + /* FIXME: issue with large values because of result saturation */ + // int16x8_t ret = vqdmulhq_s16(vreinterpretq_s16_m128i(a), + // vreinterpretq_s16_m128i(b)); /* =2*a*b */ return + // vreinterpretq_m128i_s16(vshrq_n_s16(ret, 1)); + int16x4_t a3210 = vget_low_s16(vreinterpretq_s16_m128i(a)); + int16x4_t b3210 = vget_low_s16(vreinterpretq_s16_m128i(b)); + int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */ + int16x4_t a7654 = vget_high_s16(vreinterpretq_s16_m128i(a)); + int16x4_t b7654 = vget_high_s16(vreinterpretq_s16_m128i(b)); + int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */ + uint16x8x2_t r = + vuzpq_u16(vreinterpretq_u16_s32(ab3210), vreinterpretq_u16_s32(ab7654)); + return vreinterpretq_m128i_u16(r.val[1]); +} + +// Multiply the packed unsigned 16-bit integers in a and b, producing +// intermediate 32-bit integers, and store the high 16 bits of the intermediate +// integers in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhi_epu16 +FORCE_INLINE __m128i _mm_mulhi_epu16(__m128i a, __m128i b) +{ + uint16x4_t a3210 = vget_low_u16(vreinterpretq_u16_m128i(a)); + uint16x4_t b3210 = vget_low_u16(vreinterpretq_u16_m128i(b)); + uint32x4_t ab3210 = vmull_u16(a3210, b3210); +#if defined(__aarch64__) + uint32x4_t ab7654 = + vmull_high_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)); + uint16x8_t r = vuzp2q_u16(vreinterpretq_u16_u32(ab3210), + vreinterpretq_u16_u32(ab7654)); + return vreinterpretq_m128i_u16(r); +#else + uint16x4_t a7654 = vget_high_u16(vreinterpretq_u16_m128i(a)); + uint16x4_t b7654 = vget_high_u16(vreinterpretq_u16_m128i(b)); + uint32x4_t ab7654 = vmull_u16(a7654, b7654); + uint16x8x2_t r = + vuzpq_u16(vreinterpretq_u16_u32(ab3210), vreinterpretq_u16_u32(ab7654)); + return vreinterpretq_m128i_u16(r.val[1]); +#endif +} + +// Multiplies the 8 signed or unsigned 16-bit integers from a by the 8 signed or +// unsigned 16-bit integers from b. +// +// r0 := (a0 * b0)[15:0] +// r1 := (a1 * b1)[15:0] +// ... +// r7 := (a7 * b7)[15:0] +// +// https://msdn.microsoft.com/en-us/library/vstudio/9ks1472s(v=vs.100).aspx +FORCE_INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s16( + vmulq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Compute the bitwise OR of packed double-precision (64-bit) floating-point +// elements in a and b, and store the results in dst. 
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_or_pd +FORCE_INLINE __m128d _mm_or_pd(__m128d a, __m128d b) +{ + return vreinterpretq_m128d_s64( + vorrq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b))); +} + +// Computes the bitwise OR of the 128-bit value in a and the 128-bit value in b. +// +// r := a | b +// +// https://msdn.microsoft.com/en-us/library/vstudio/ew8ty0db(v=vs.100).aspx +FORCE_INLINE __m128i _mm_or_si128(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + vorrq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Packs the 16 signed 16-bit integers from a and b into 8-bit integers and +// saturates. +// https://msdn.microsoft.com/en-us/library/k4y4f7w5%28v=vs.90%29.aspx +FORCE_INLINE __m128i _mm_packs_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s8( + vcombine_s8(vqmovn_s16(vreinterpretq_s16_m128i(a)), + vqmovn_s16(vreinterpretq_s16_m128i(b)))); +} + +// Packs the 8 signed 32-bit integers from a and b into signed 16-bit integers +// and saturates. +// +// r0 := SignedSaturate(a0) +// r1 := SignedSaturate(a1) +// r2 := SignedSaturate(a2) +// r3 := SignedSaturate(a3) +// r4 := SignedSaturate(b0) +// r5 := SignedSaturate(b1) +// r6 := SignedSaturate(b2) +// r7 := SignedSaturate(b3) +// +// https://msdn.microsoft.com/en-us/library/393t56f9%28v=vs.90%29.aspx +FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s16( + vcombine_s16(vqmovn_s32(vreinterpretq_s32_m128i(a)), + vqmovn_s32(vreinterpretq_s32_m128i(b)))); +} + +// Packs the 16 signed 16 - bit integers from a and b into 8 - bit unsigned +// integers and saturates. +// +// r0 := UnsignedSaturate(a0) +// r1 := UnsignedSaturate(a1) +// ... +// r7 := UnsignedSaturate(a7) +// r8 := UnsignedSaturate(b0) +// r9 := UnsignedSaturate(b1) +// ... +// r15 := UnsignedSaturate(b7) +// +// https://msdn.microsoft.com/en-us/library/07ad1wx4(v=vs.100).aspx +FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b) +{ + return vreinterpretq_m128i_u8( + vcombine_u8(vqmovun_s16(vreinterpretq_s16_m128i(a)), + vqmovun_s16(vreinterpretq_s16_m128i(b)))); +} + +// Pause the processor. This is typically used in spin-wait loops and depending +// on the x86 processor typical values are in the 40-100 cycle range. The +// 'yield' instruction isn't a good fit beacuse it's effectively a nop on most +// Arm cores. Experience with several databases has shown has shown an 'isb' is +// a reasonable approximation. +FORCE_INLINE void _mm_pause() +{ + __asm__ __volatile__("isb\n"); +} + +// Compute the absolute differences of packed unsigned 8-bit integers in a and +// b, then horizontally sum each consecutive 8 differences to produce two +// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low +// 16 bits of 64-bit elements in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sad_epu8 +FORCE_INLINE __m128i _mm_sad_epu8(__m128i a, __m128i b) +{ + uint16x8_t t = vpaddlq_u8(vabdq_u8((uint8x16_t) a, (uint8x16_t) b)); + return vreinterpretq_m128i_u64(vpaddlq_u32(vpaddlq_u16(t))); +} + +// Sets the 8 signed 16-bit integer values. 
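_mm_sad_epu8 above is typically used to accumulate absolute differences over 16 bytes at a time (block matching and similar scans); a brief sketch (hypothetical caller) of reading back its two 64-bit partial sums:

    #include "sse2neon.h"
    #include <assert.h>

    static void sad_example(void)
    {
        __m128i x = _mm_set1_epi8(10);
        __m128i y = _mm_set1_epi8(3);
        __m128i s = _mm_sad_epu8(x, y);        /* each 64-bit lane holds 8 * |10 - 3| = 56 */

        int lo = _mm_cvtsi128_si32(s);
        int hi = _mm_cvtsi128_si32(_mm_srli_si128(s, 8));
        assert(lo + hi == 112);                /* total over all 16 bytes */
    }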
+// https://msdn.microsoft.com/en-au/library/3e0fek84(v=vs.90).aspx +FORCE_INLINE __m128i _mm_set_epi16(short i7, + short i6, + short i5, + short i4, + short i3, + short i2, + short i1, + short i0) +{ + int16_t ALIGN_STRUCT(16) data[8] = {i0, i1, i2, i3, i4, i5, i6, i7}; + return vreinterpretq_m128i_s16(vld1q_s16(data)); +} + +// Sets the 4 signed 32-bit integer values. +// https://msdn.microsoft.com/en-us/library/vstudio/019beekt(v=vs.100).aspx +FORCE_INLINE __m128i _mm_set_epi32(int i3, int i2, int i1, int i0) +{ + int32_t ALIGN_STRUCT(16) data[4] = {i0, i1, i2, i3}; + return vreinterpretq_m128i_s32(vld1q_s32(data)); +} + +// Returns the __m128i structure with its two 64-bit integer values +// initialized to the values of the two 64-bit integers passed in. +// https://msdn.microsoft.com/en-us/library/dk2sdw0h(v=vs.120).aspx +FORCE_INLINE __m128i _mm_set_epi64(__m64 i1, __m64 i2) +{ + return _mm_set_epi64x((int64_t) i1, (int64_t) i2); +} + +// Returns the __m128i structure with its two 64-bit integer values +// initialized to the values of the two 64-bit integers passed in. +// https://msdn.microsoft.com/en-us/library/dk2sdw0h(v=vs.120).aspx +FORCE_INLINE __m128i _mm_set_epi64x(int64_t i1, int64_t i2) +{ + return vreinterpretq_m128i_s64( + vcombine_s64(vcreate_s64(i2), vcreate_s64(i1))); +} + +// Sets the 16 signed 8-bit integer values. +// https://msdn.microsoft.com/en-us/library/x0cx8zd3(v=vs.90).aspx +FORCE_INLINE __m128i _mm_set_epi8(signed char b15, + signed char b14, + signed char b13, + signed char b12, + signed char b11, + signed char b10, + signed char b9, + signed char b8, + signed char b7, + signed char b6, + signed char b5, + signed char b4, + signed char b3, + signed char b2, + signed char b1, + signed char b0) +{ + int8_t ALIGN_STRUCT(16) + data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3, + (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7, + (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11, + (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15}; + return (__m128i) vld1q_s8(data); +} + +// Set packed double-precision (64-bit) floating-point elements in dst with the +// supplied values. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_pd +FORCE_INLINE __m128d _mm_set_pd(double e1, double e0) +{ + double ALIGN_STRUCT(16) data[2] = {e0, e1}; +#if defined(__aarch64__) + return vreinterpretq_m128d_f64(vld1q_f64((float64_t *) data)); +#else + return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) data)); +#endif +} + +// Broadcast double-precision (64-bit) floating-point value a to all elements of +// dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_pd1 +#define _mm_set_pd1 _mm_set1_pd + +// Copy double-precision (64-bit) floating-point element a to the lower element +// of dst, and zero the upper element. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_sd +FORCE_INLINE __m128d _mm_set_sd(double a) +{ + return _mm_set_pd(0, a); +} + +// Sets the 8 signed 16-bit integer values to w. +// +// r0 := w +// r1 := w +// ... +// r7 := w +// +// https://msdn.microsoft.com/en-us/library/k0ya3x0e(v=vs.90).aspx +FORCE_INLINE __m128i _mm_set1_epi16(short w) +{ + return vreinterpretq_m128i_s16(vdupq_n_s16(w)); +} + +// Sets the 4 signed 32-bit integer values to i. 
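A recurring source of confusion with these constructors is argument order: the _mm_set_* helpers above take the highest-indexed element first, while the _mm_setr_* variants further down take memory order. A short sketch (hypothetical caller):

    #include "sse2neon.h"
    #include <assert.h>
    #include <stdint.h>

    static void set_order_example(void)
    {
        __m128i a = _mm_set_epi32(3, 2, 1, 0);     /* element 0 holds 0 */
        __m128i b = _mm_setr_epi32(3, 2, 1, 0);    /* element 0 holds 3 */

        int32_t out[4];
        _mm_storeu_si128((__m128i *) out, a);
        assert(out[0] == 0 && out[3] == 3);
        _mm_storeu_si128((__m128i *) out, b);
        assert(out[0] == 3 && out[3] == 0);
    }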
+// +// r0 := i +// r1 := i +// r2 := i +// r3 := I +// +// https://msdn.microsoft.com/en-us/library/vstudio/h4xscxat(v=vs.100).aspx +FORCE_INLINE __m128i _mm_set1_epi32(int _i) +{ + return vreinterpretq_m128i_s32(vdupq_n_s32(_i)); +} + +// Sets the 2 signed 64-bit integer values to i. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/whtfzhzk(v=vs.100) +FORCE_INLINE __m128i _mm_set1_epi64(__m64 _i) +{ + return vreinterpretq_m128i_s64(vdupq_n_s64((int64_t) _i)); +} + +// Sets the 2 signed 64-bit integer values to i. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_epi64x +FORCE_INLINE __m128i _mm_set1_epi64x(int64_t _i) +{ + return vreinterpretq_m128i_s64(vdupq_n_s64(_i)); +} + +// Sets the 16 signed 8-bit integer values to b. +// +// r0 := b +// r1 := b +// ... +// r15 := b +// +// https://msdn.microsoft.com/en-us/library/6e14xhyf(v=vs.100).aspx +FORCE_INLINE __m128i _mm_set1_epi8(signed char w) +{ + return vreinterpretq_m128i_s8(vdupq_n_s8(w)); +} + +// Broadcast double-precision (64-bit) floating-point value a to all elements of +// dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_pd +FORCE_INLINE __m128d _mm_set1_pd(double d) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_f64(vdupq_n_f64(d)); +#else + return vreinterpretq_m128d_s64(vdupq_n_s64(*(int64_t *) &d)); +#endif +} + +// Sets the 8 signed 16-bit integer values in reverse order. +// +// Return Value +// r0 := w0 +// r1 := w1 +// ... +// r7 := w7 +FORCE_INLINE __m128i _mm_setr_epi16(short w0, + short w1, + short w2, + short w3, + short w4, + short w5, + short w6, + short w7) +{ + int16_t ALIGN_STRUCT(16) data[8] = {w0, w1, w2, w3, w4, w5, w6, w7}; + return vreinterpretq_m128i_s16(vld1q_s16((int16_t *) data)); +} + +// Sets the 4 signed 32-bit integer values in reverse order +// https://technet.microsoft.com/en-us/library/security/27yb3ee5(v=vs.90).aspx +FORCE_INLINE __m128i _mm_setr_epi32(int i3, int i2, int i1, int i0) +{ + int32_t ALIGN_STRUCT(16) data[4] = {i3, i2, i1, i0}; + return vreinterpretq_m128i_s32(vld1q_s32(data)); +} + +// Set packed 64-bit integers in dst with the supplied values in reverse order. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_epi64 +FORCE_INLINE __m128i _mm_setr_epi64(__m64 e1, __m64 e0) +{ + return vreinterpretq_m128i_s64(vcombine_s64(e1, e0)); +} + +// Sets the 16 signed 8-bit integer values in reverse order. +// https://msdn.microsoft.com/en-us/library/2khb9c7k(v=vs.90).aspx +FORCE_INLINE __m128i _mm_setr_epi8(signed char b0, + signed char b1, + signed char b2, + signed char b3, + signed char b4, + signed char b5, + signed char b6, + signed char b7, + signed char b8, + signed char b9, + signed char b10, + signed char b11, + signed char b12, + signed char b13, + signed char b14, + signed char b15) +{ + int8_t ALIGN_STRUCT(16) + data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3, + (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7, + (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11, + (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15}; + return (__m128i) vld1q_s8(data); +} + +// Set packed double-precision (64-bit) floating-point elements in dst with the +// supplied values in reverse order. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_pd +FORCE_INLINE __m128d _mm_setr_pd(double e1, double e0) +{ + return _mm_set_pd(e0, e1); +} + +// Return vector of type __m128d with all elements set to zero. 
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setzero_pd +FORCE_INLINE __m128d _mm_setzero_pd(void) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_f64(vdupq_n_f64(0)); +#else + return vreinterpretq_m128d_f32(vdupq_n_f32(0)); +#endif +} + +// Sets the 128-bit value to zero +// https://msdn.microsoft.com/en-us/library/vstudio/ys7dw0kh(v=vs.100).aspx +FORCE_INLINE __m128i _mm_setzero_si128(void) +{ + return vreinterpretq_m128i_s32(vdupq_n_s32(0)); +} + +// Shuffles the 4 signed or unsigned 32-bit integers in a as specified by imm. +// https://msdn.microsoft.com/en-us/library/56f67xbk%28v=vs.90%29.aspx +// FORCE_INLINE __m128i _mm_shuffle_epi32(__m128i a, +// __constrange(0,255) int imm) +#if __has_builtin(__builtin_shufflevector) +#define _mm_shuffle_epi32(a, imm) \ + __extension__({ \ + int32x4_t _input = vreinterpretq_s32_m128i(a); \ + int32x4_t _shuf = __builtin_shufflevector( \ + _input, _input, (imm) & (0x3), ((imm) >> 2) & 0x3, \ + ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3); \ + vreinterpretq_m128i_s32(_shuf); \ + }) +#else // generic +#define _mm_shuffle_epi32(a, imm) \ + __extension__({ \ + __m128i ret; \ + switch (imm) { \ + case _MM_SHUFFLE(1, 0, 3, 2): \ + ret = _mm_shuffle_epi_1032((a)); \ + break; \ + case _MM_SHUFFLE(2, 3, 0, 1): \ + ret = _mm_shuffle_epi_2301((a)); \ + break; \ + case _MM_SHUFFLE(0, 3, 2, 1): \ + ret = _mm_shuffle_epi_0321((a)); \ + break; \ + case _MM_SHUFFLE(2, 1, 0, 3): \ + ret = _mm_shuffle_epi_2103((a)); \ + break; \ + case _MM_SHUFFLE(1, 0, 1, 0): \ + ret = _mm_shuffle_epi_1010((a)); \ + break; \ + case _MM_SHUFFLE(1, 0, 0, 1): \ + ret = _mm_shuffle_epi_1001((a)); \ + break; \ + case _MM_SHUFFLE(0, 1, 0, 1): \ + ret = _mm_shuffle_epi_0101((a)); \ + break; \ + case _MM_SHUFFLE(2, 2, 1, 1): \ + ret = _mm_shuffle_epi_2211((a)); \ + break; \ + case _MM_SHUFFLE(0, 1, 2, 2): \ + ret = _mm_shuffle_epi_0122((a)); \ + break; \ + case _MM_SHUFFLE(3, 3, 3, 2): \ + ret = _mm_shuffle_epi_3332((a)); \ + break; \ + case _MM_SHUFFLE(0, 0, 0, 0): \ + ret = _mm_shuffle_epi32_splat((a), 0); \ + break; \ + case _MM_SHUFFLE(1, 1, 1, 1): \ + ret = _mm_shuffle_epi32_splat((a), 1); \ + break; \ + case _MM_SHUFFLE(2, 2, 2, 2): \ + ret = _mm_shuffle_epi32_splat((a), 2); \ + break; \ + case _MM_SHUFFLE(3, 3, 3, 3): \ + ret = _mm_shuffle_epi32_splat((a), 3); \ + break; \ + default: \ + ret = _mm_shuffle_epi32_default((a), (imm)); \ + break; \ + } \ + ret; \ + }) +#endif + +// Shuffle double-precision (64-bit) floating-point elements using the control +// in imm8, and store the results in dst. +// +// dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64] +// dst[127:64] := (imm8[1] == 0) ? 
b[63:0] : b[127:64] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_pd +#if __has_builtin(__builtin_shufflevector) +#define _mm_shuffle_pd(a, b, imm8) \ + vreinterpretq_m128d_s64(__builtin_shufflevector( \ + vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b), imm8 & 0x1, \ + ((imm8 & 0x2) >> 1) + 2)) +#else +#define _mm_shuffle_pd(a, b, imm8) \ + _mm_castsi128_pd(_mm_set_epi64x( \ + vgetq_lane_s64(vreinterpretq_s64_m128d(b), (imm8 & 0x2) >> 1), \ + vgetq_lane_s64(vreinterpretq_s64_m128d(a), imm8 & 0x1))) +#endif + +// FORCE_INLINE __m128i _mm_shufflehi_epi16(__m128i a, +// __constrange(0,255) int imm) +#if __has_builtin(__builtin_shufflevector) +#define _mm_shufflehi_epi16(a, imm) \ + __extension__({ \ + int16x8_t _input = vreinterpretq_s16_m128i(a); \ + int16x8_t _shuf = __builtin_shufflevector( \ + _input, _input, 0, 1, 2, 3, ((imm) & (0x3)) + 4, \ + (((imm) >> 2) & 0x3) + 4, (((imm) >> 4) & 0x3) + 4, \ + (((imm) >> 6) & 0x3) + 4); \ + vreinterpretq_m128i_s16(_shuf); \ + }) +#else // generic +#define _mm_shufflehi_epi16(a, imm) _mm_shufflehi_epi16_function((a), (imm)) +#endif + +// FORCE_INLINE __m128i _mm_shufflelo_epi16(__m128i a, +// __constrange(0,255) int imm) +#if __has_builtin(__builtin_shufflevector) +#define _mm_shufflelo_epi16(a, imm) \ + __extension__({ \ + int16x8_t _input = vreinterpretq_s16_m128i(a); \ + int16x8_t _shuf = __builtin_shufflevector( \ + _input, _input, ((imm) & (0x3)), (((imm) >> 2) & 0x3), \ + (((imm) >> 4) & 0x3), (((imm) >> 6) & 0x3), 4, 5, 6, 7); \ + vreinterpretq_m128i_s16(_shuf); \ + }) +#else // generic +#define _mm_shufflelo_epi16(a, imm) _mm_shufflelo_epi16_function((a), (imm)) +#endif + +// Shift packed 16-bit integers in a left by count while shifting in zeros, and +// store the results in dst. +// +// FOR j := 0 to 7 +// i := j*16 +// IF count[63:0] > 15 +// dst[i+15:i] := 0 +// ELSE +// dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[63:0]) +// FI +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sll_epi16 +FORCE_INLINE __m128i _mm_sll_epi16(__m128i a, __m128i count) +{ + uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); + if (_sse2neon_unlikely(c & ~15)) + return _mm_setzero_si128(); + + int16x8_t vc = vdupq_n_s16((int16_t) c); + return vreinterpretq_m128i_s16(vshlq_s16(vreinterpretq_s16_m128i(a), vc)); +} + +// Shift packed 32-bit integers in a left by count while shifting in zeros, and +// store the results in dst. +// +// FOR j := 0 to 3 +// i := j*32 +// IF count[63:0] > 31 +// dst[i+31:i] := 0 +// ELSE +// dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[63:0]) +// FI +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sll_epi32 +FORCE_INLINE __m128i _mm_sll_epi32(__m128i a, __m128i count) +{ + uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); + if (_sse2neon_unlikely(c & ~31)) + return _mm_setzero_si128(); + + int32x4_t vc = vdupq_n_s32((int32_t) c); + return vreinterpretq_m128i_s32(vshlq_s32(vreinterpretq_s32_m128i(a), vc)); +} + +// Shift packed 64-bit integers in a left by count while shifting in zeros, and +// store the results in dst. 
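The count-register shifts above deliberately reproduce the x86 rule that any count larger than the element width clears every lane, hence the explicit early-outs on c & ~15, c & ~31 and c & ~63. A minimal sketch (hypothetical caller):

    #include "sse2neon.h"
    #include <assert.h>

    static void sll_count_example(void)
    {
        __m128i v = _mm_set1_epi32(1);

        __m128i a = _mm_sll_epi32(v, _mm_cvtsi32_si128(3));    /* each lane: 1 << 3 = 8 */
        __m128i b = _mm_sll_epi32(v, _mm_cvtsi32_si128(40));   /* count > 31: all lanes zero */

        assert(_mm_cvtsi128_si32(a) == 8);
        assert(_mm_cvtsi128_si32(b) == 0);
    }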
+// +// FOR j := 0 to 1 +// i := j*64 +// IF count[63:0] > 63 +// dst[i+63:i] := 0 +// ELSE +// dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[63:0]) +// FI +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sll_epi64 +FORCE_INLINE __m128i _mm_sll_epi64(__m128i a, __m128i count) +{ + uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); + if (_sse2neon_unlikely(c & ~63)) + return _mm_setzero_si128(); + + int64x2_t vc = vdupq_n_s64((int64_t) c); + return vreinterpretq_m128i_s64(vshlq_s64(vreinterpretq_s64_m128i(a), vc)); +} + +// Shift packed 16-bit integers in a left by imm8 while shifting in zeros, and +// store the results in dst. +// +// FOR j := 0 to 7 +// i := j*16 +// IF imm8[7:0] > 15 +// dst[i+15:i] := 0 +// ELSE +// dst[i+15:i] := ZeroExtend16(a[i+15:i] << imm8[7:0]) +// FI +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_slli_epi16 +FORCE_INLINE __m128i _mm_slli_epi16(__m128i a, int imm) +{ + if (_sse2neon_unlikely(imm & ~15)) + return _mm_setzero_si128(); + return vreinterpretq_m128i_s16( + vshlq_s16(vreinterpretq_s16_m128i(a), vdupq_n_s16(imm))); +} + +// Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and +// store the results in dst. +// +// FOR j := 0 to 3 +// i := j*32 +// IF imm8[7:0] > 31 +// dst[i+31:i] := 0 +// ELSE +// dst[i+31:i] := ZeroExtend32(a[i+31:i] << imm8[7:0]) +// FI +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_slli_epi32 +FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, int imm) +{ + if (_sse2neon_unlikely(imm & ~31)) + return _mm_setzero_si128(); + return vreinterpretq_m128i_s32( + vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(imm))); +} + +// Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and +// store the results in dst. +// +// FOR j := 0 to 1 +// i := j*64 +// IF imm8[7:0] > 63 +// dst[i+63:i] := 0 +// ELSE +// dst[i+63:i] := ZeroExtend64(a[i+63:i] << imm8[7:0]) +// FI +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_slli_epi64 +FORCE_INLINE __m128i _mm_slli_epi64(__m128i a, int imm) +{ + if (_sse2neon_unlikely(imm & ~63)) + return _mm_setzero_si128(); + return vreinterpretq_m128i_s64( + vshlq_s64(vreinterpretq_s64_m128i(a), vdupq_n_s64(imm))); +} + +// Shift a left by imm8 bytes while shifting in zeros, and store the results in +// dst. +// +// tmp := imm8[7:0] +// IF tmp > 15 +// tmp := 16 +// FI +// dst[127:0] := a[127:0] << (tmp*8) +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_slli_si128 +FORCE_INLINE __m128i _mm_slli_si128(__m128i a, int imm) +{ + if (_sse2neon_unlikely(imm & ~15)) + return _mm_setzero_si128(); + uint8x16_t tmp[2] = {vdupq_n_u8(0), vreinterpretq_u8_m128i(a)}; + return vreinterpretq_m128i_u8( + vld1q_u8(((uint8_t const *) tmp) + (16 - imm))); +} + +// Compute the square root of packed double-precision (64-bit) floating-point +// elements in a, and store the results in dst. 
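_mm_slli_si128 above shifts by whole bytes rather than bits, which is easy to misread; a short sketch (hypothetical caller) moving every 32-bit lane up by one position:

    #include "sse2neon.h"
    #include <assert.h>
    #include <stdint.h>

    static void byte_shift_example(void)
    {
        __m128i v = _mm_setr_epi32(1, 2, 3, 4);
        __m128i s = _mm_slli_si128(v, 4);      /* 4 bytes = one 32-bit lane */

        int32_t out[4];
        _mm_storeu_si128((__m128i *) out, s);
        assert(out[0] == 0 && out[1] == 1 && out[2] == 2 && out[3] == 3);
    }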
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_pd +FORCE_INLINE __m128d _mm_sqrt_pd(__m128d a) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_f64(vsqrtq_f64(vreinterpretq_f64_m128d(a))); +#else + double a0 = sqrt(((double *) &a)[0]); + double a1 = sqrt(((double *) &a)[1]); + return _mm_set_pd(a1, a0); +#endif +} + +// Compute the square root of the lower double-precision (64-bit) floating-point +// element in b, store the result in the lower element of dst, and copy the +// upper element from a to the upper element of dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_sd +FORCE_INLINE __m128d _mm_sqrt_sd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) + return _mm_move_sd(a, _mm_sqrt_pd(b)); +#else + return _mm_set_pd(((double *) &a)[1], sqrt(((double *) &b)[0])); +#endif +} + +// Shift packed 16-bit integers in a right by count while shifting in sign bits, +// and store the results in dst. +// +// FOR j := 0 to 7 +// i := j*16 +// IF count[63:0] > 15 +// dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) +// ELSE +// dst[i+15:i] := SignExtend16(a[i+15:i] >> count[63:0]) +// FI +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sra_epi16 +FORCE_INLINE __m128i _mm_sra_epi16(__m128i a, __m128i count) +{ + int64_t c = (int64_t) vget_low_s64((int64x2_t) count); + if (_sse2neon_unlikely(c & ~15)) + return _mm_cmplt_epi16(a, _mm_setzero_si128()); + return vreinterpretq_m128i_s16(vshlq_s16((int16x8_t) a, vdupq_n_s16(-c))); +} + +// Shift packed 32-bit integers in a right by count while shifting in sign bits, +// and store the results in dst. +// +// FOR j := 0 to 3 +// i := j*32 +// IF count[63:0] > 31 +// dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) +// ELSE +// dst[i+31:i] := SignExtend32(a[i+31:i] >> count[63:0]) +// FI +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sra_epi32 +FORCE_INLINE __m128i _mm_sra_epi32(__m128i a, __m128i count) +{ + int64_t c = (int64_t) vget_low_s64((int64x2_t) count); + if (_sse2neon_unlikely(c & ~31)) + return _mm_cmplt_epi32(a, _mm_setzero_si128()); + return vreinterpretq_m128i_s32(vshlq_s32((int32x4_t) a, vdupq_n_s32(-c))); +} + +// Shift packed 16-bit integers in a right by imm8 while shifting in sign +// bits, and store the results in dst. +// +// FOR j := 0 to 7 +// i := j*16 +// IF imm8[7:0] > 15 +// dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) +// ELSE +// dst[i+15:i] := SignExtend16(a[i+15:i] >> imm8[7:0]) +// FI +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srai_epi16 +FORCE_INLINE __m128i _mm_srai_epi16(__m128i a, int imm) +{ + const int count = (imm & ~15) ? 15 : imm; + return (__m128i) vshlq_s16((int16x8_t) a, vdupq_n_s16(-count)); +} + +// Shift packed 32-bit integers in a right by imm8 while shifting in sign bits, +// and store the results in dst. +// +// FOR j := 0 to 3 +// i := j*32 +// IF imm8[7:0] > 31 +// dst[i+31:i] := (a[i+31] ? 
0xFFFFFFFF : 0x0) +// ELSE +// dst[i+31:i] := SignExtend32(a[i+31:i] >> imm8[7:0]) +// FI +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srai_epi32 +// FORCE_INLINE __m128i _mm_srai_epi32(__m128i a, __constrange(0,255) int imm) +#define _mm_srai_epi32(a, imm) \ + __extension__({ \ + __m128i ret; \ + if (_sse2neon_unlikely((imm) == 0)) { \ + ret = a; \ + } else if (_sse2neon_likely(0 < (imm) && (imm) < 32)) { \ + ret = vreinterpretq_m128i_s32( \ + vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(-imm))); \ + } else { \ + ret = vreinterpretq_m128i_s32( \ + vshrq_n_s32(vreinterpretq_s32_m128i(a), 31)); \ + } \ + ret; \ + }) + +// Shift packed 16-bit integers in a right by count while shifting in zeros, and +// store the results in dst. +// +// FOR j := 0 to 7 +// i := j*16 +// IF count[63:0] > 15 +// dst[i+15:i] := 0 +// ELSE +// dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[63:0]) +// FI +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srl_epi16 +FORCE_INLINE __m128i _mm_srl_epi16(__m128i a, __m128i count) +{ + uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); + if (_sse2neon_unlikely(c & ~15)) + return _mm_setzero_si128(); + + int16x8_t vc = vdupq_n_s16(-(int16_t) c); + return vreinterpretq_m128i_u16(vshlq_u16(vreinterpretq_u16_m128i(a), vc)); +} + +// Shift packed 32-bit integers in a right by count while shifting in zeros, and +// store the results in dst. +// +// FOR j := 0 to 3 +// i := j*32 +// IF count[63:0] > 31 +// dst[i+31:i] := 0 +// ELSE +// dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[63:0]) +// FI +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srl_epi32 +FORCE_INLINE __m128i _mm_srl_epi32(__m128i a, __m128i count) +{ + uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); + if (_sse2neon_unlikely(c & ~31)) + return _mm_setzero_si128(); + + int32x4_t vc = vdupq_n_s32(-(int32_t) c); + return vreinterpretq_m128i_u32(vshlq_u32(vreinterpretq_u32_m128i(a), vc)); +} + +// Shift packed 64-bit integers in a right by count while shifting in zeros, and +// store the results in dst. +// +// FOR j := 0 to 1 +// i := j*64 +// IF count[63:0] > 63 +// dst[i+63:i] := 0 +// ELSE +// dst[i+63:i] := ZeroExtend64(a[i+63:i] >> count[63:0]) +// FI +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srl_epi64 +FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count) +{ + uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); + if (_sse2neon_unlikely(c & ~63)) + return _mm_setzero_si128(); + + int64x2_t vc = vdupq_n_s64(-(int64_t) c); + return vreinterpretq_m128i_u64(vshlq_u64(vreinterpretq_u64_m128i(a), vc)); +} + +// Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and +// store the results in dst. +// +// FOR j := 0 to 7 +// i := j*16 +// IF imm8[7:0] > 15 +// dst[i+15:i] := 0 +// ELSE +// dst[i+15:i] := ZeroExtend16(a[i+15:i] >> imm8[7:0]) +// FI +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi16 +#define _mm_srli_epi16(a, imm) \ + __extension__({ \ + __m128i ret; \ + if (_sse2neon_unlikely((imm) & ~15)) { \ + ret = _mm_setzero_si128(); \ + } else { \ + ret = vreinterpretq_m128i_u16( \ + vshlq_u16(vreinterpretq_u16_m128i(a), vdupq_n_s16(-(imm)))); \ + } \ + ret; \ + }) + +// Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and +// store the results in dst. 
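The difference between the arithmetic (srai) and logical (srli) right shifts defined here only shows up for negative inputs; a brief sketch (hypothetical caller):

    #include "sse2neon.h"
    #include <assert.h>
    #include <stdint.h>

    static void right_shift_example(void)
    {
        __m128i v = _mm_set1_epi32(-8);

        __m128i sra = _mm_srai_epi32(v, 2);    /* sign bits shift in: result is -2 */
        __m128i srl = _mm_srli_epi32(v, 2);    /* zeros shift in: 0xFFFFFFF8 >> 2 */

        assert(_mm_cvtsi128_si32(sra) == -2);
        assert((uint32_t) _mm_cvtsi128_si32(srl) == 0x3FFFFFFEu);
    }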
+// +// FOR j := 0 to 3 +// i := j*32 +// IF imm8[7:0] > 31 +// dst[i+31:i] := 0 +// ELSE +// dst[i+31:i] := ZeroExtend32(a[i+31:i] >> imm8[7:0]) +// FI +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi32 +// FORCE_INLINE __m128i _mm_srli_epi32(__m128i a, __constrange(0,255) int imm) +#define _mm_srli_epi32(a, imm) \ + __extension__({ \ + __m128i ret; \ + if (_sse2neon_unlikely((imm) & ~31)) { \ + ret = _mm_setzero_si128(); \ + } else { \ + ret = vreinterpretq_m128i_u32( \ + vshlq_u32(vreinterpretq_u32_m128i(a), vdupq_n_s32(-(imm)))); \ + } \ + ret; \ + }) + +// Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and +// store the results in dst. +// +// FOR j := 0 to 1 +// i := j*64 +// IF imm8[7:0] > 63 +// dst[i+63:i] := 0 +// ELSE +// dst[i+63:i] := ZeroExtend64(a[i+63:i] >> imm8[7:0]) +// FI +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi64 +#define _mm_srli_epi64(a, imm) \ + __extension__({ \ + __m128i ret; \ + if (_sse2neon_unlikely((imm) & ~63)) { \ + ret = _mm_setzero_si128(); \ + } else { \ + ret = vreinterpretq_m128i_u64( \ + vshlq_u64(vreinterpretq_u64_m128i(a), vdupq_n_s64(-(imm)))); \ + } \ + ret; \ + }) + +// Shift a right by imm8 bytes while shifting in zeros, and store the results in +// dst. +// +// tmp := imm8[7:0] +// IF tmp > 15 +// tmp := 16 +// FI +// dst[127:0] := a[127:0] >> (tmp*8) +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_si128 +FORCE_INLINE __m128i _mm_srli_si128(__m128i a, int imm) +{ + if (_sse2neon_unlikely(imm & ~15)) + return _mm_setzero_si128(); + uint8x16_t tmp[2] = {vreinterpretq_u8_m128i(a), vdupq_n_u8(0)}; + return vreinterpretq_m128i_u8(vld1q_u8(((uint8_t const *) tmp) + imm)); +} + +// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point +// elements) from a into memory. mem_addr must be aligned on a 16-byte boundary +// or a general-protection exception may be generated. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_pd +FORCE_INLINE void _mm_store_pd(double *mem_addr, __m128d a) +{ +#if defined(__aarch64__) + vst1q_f64((float64_t *) mem_addr, vreinterpretq_f64_m128d(a)); +#else + vst1q_f32((float32_t *) mem_addr, vreinterpretq_f32_m128d(a)); +#endif +} + +// Store the lower double-precision (64-bit) floating-point element from a into +// 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte +// boundary or a general-protection exception may be generated. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_pd1 +FORCE_INLINE void _mm_store_pd1(double *mem_addr, __m128d a) +{ +#if defined(__aarch64__) + float64x1_t a_low = vget_low_f64(vreinterpretq_f64_m128d(a)); + vst1q_f64((float64_t *) mem_addr, + vreinterpretq_f64_m128d(vcombine_f64(a_low, a_low))); +#else + float32x2_t a_low = vget_low_f32(vreinterpretq_f32_m128d(a)); + vst1q_f32((float32_t *) mem_addr, + vreinterpretq_f32_m128d(vcombine_f32(a_low, a_low))); +#endif +} + +// Store the lower double-precision (64-bit) floating-point element from a into +// memory. mem_addr does not need to be aligned on any particular boundary. 
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_store_sd +FORCE_INLINE void _mm_store_sd(double *mem_addr, __m128d a) +{ +#if defined(__aarch64__) + vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a))); +#else + vst1_u64((uint64_t *) mem_addr, vget_low_u64(vreinterpretq_u64_m128d(a))); +#endif +} + +// Stores four 32-bit integer values as (as a __m128i value) at the address p. +// https://msdn.microsoft.com/en-us/library/vstudio/edk11s13(v=vs.100).aspx +FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a) +{ + vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a)); +} + +// Store the lower double-precision (64-bit) floating-point element from a into +// 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte +// boundary or a general-protection exception may be generated. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=9,526,5601&text=_mm_store1_pd +#define _mm_store1_pd _mm_store_pd1 + +// Store the upper double-precision (64-bit) floating-point element from a into +// memory. +// +// MEM[mem_addr+63:mem_addr] := a[127:64] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeh_pd +FORCE_INLINE void _mm_storeh_pd(double *mem_addr, __m128d a) +{ +#if defined(__aarch64__) + vst1_f64((float64_t *) mem_addr, vget_high_f64(vreinterpretq_f64_m128d(a))); +#else + vst1_f32((float32_t *) mem_addr, vget_high_f32(vreinterpretq_f32_m128d(a))); +#endif +} + +// Reads the lower 64 bits of b and stores them into the lower 64 bits of a. +// https://msdn.microsoft.com/en-us/library/hhwf428f%28v=vs.90%29.aspx +FORCE_INLINE void _mm_storel_epi64(__m128i *a, __m128i b) +{ + uint64x1_t hi = vget_high_u64(vreinterpretq_u64_m128i(*a)); + uint64x1_t lo = vget_low_u64(vreinterpretq_u64_m128i(b)); + *a = vreinterpretq_m128i_u64(vcombine_u64(lo, hi)); +} + +// Store the lower double-precision (64-bit) floating-point element from a into +// memory. +// +// MEM[mem_addr+63:mem_addr] := a[63:0] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storel_pd +FORCE_INLINE void _mm_storel_pd(double *mem_addr, __m128d a) +{ +#if defined(__aarch64__) + vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a))); +#else + vst1_f32((float32_t *) mem_addr, vget_low_f32(vreinterpretq_f32_m128d(a))); +#endif +} + +// Store 2 double-precision (64-bit) floating-point elements from a into memory +// in reverse order. mem_addr must be aligned on a 16-byte boundary or a +// general-protection exception may be generated. +// +// MEM[mem_addr+63:mem_addr] := a[127:64] +// MEM[mem_addr+127:mem_addr+64] := a[63:0] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storer_pd +FORCE_INLINE void _mm_storer_pd(double *mem_addr, __m128d a) +{ + float32x4_t f = vreinterpretq_f32_m128d(a); + _mm_store_pd(mem_addr, vreinterpretq_m128d_f32(vextq_f32(f, f, 2))); +} + +// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point +// elements) from a into memory. mem_addr does not need to be aligned on any +// particular boundary. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_pd +FORCE_INLINE void _mm_storeu_pd(double *mem_addr, __m128d a) +{ + _mm_store_pd(mem_addr, a); +} + +// Stores 128-bits of integer data a at the address p. 
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_si128 +FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a) +{ + vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a)); +} + +// Stores 32-bits of integer data a at the address p. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_si32 +FORCE_INLINE void _mm_storeu_si32(void *p, __m128i a) +{ + vst1q_lane_s32((int32_t *) p, vreinterpretq_s32_m128i(a), 0); +} + +// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point +// elements) from a into memory using a non-temporal memory hint. mem_addr must +// be aligned on a 16-byte boundary or a general-protection exception may be +// generated. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_pd +FORCE_INLINE void _mm_stream_pd(double *p, __m128d a) +{ +#if __has_builtin(__builtin_nontemporal_store) + __builtin_nontemporal_store(a, (float32x4_t *) p); +#elif defined(__aarch64__) + vst1q_f64(p, vreinterpretq_f64_m128d(a)); +#else + vst1q_s64((int64_t *) p, vreinterpretq_s64_m128d(a)); +#endif +} + +// Stores the data in a to the address p without polluting the caches. If the +// cache line containing address p is already in the cache, the cache will be +// updated. +// https://msdn.microsoft.com/en-us/library/ba08y07y%28v=vs.90%29.aspx +FORCE_INLINE void _mm_stream_si128(__m128i *p, __m128i a) +{ +#if __has_builtin(__builtin_nontemporal_store) + __builtin_nontemporal_store(a, p); +#else + vst1q_s64((int64_t *) p, vreinterpretq_s64_m128i(a)); +#endif +} + +// Store 32-bit integer a into memory using a non-temporal hint to minimize +// cache pollution. If the cache line containing address mem_addr is already in +// the cache, the cache will be updated. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_si32 +FORCE_INLINE void _mm_stream_si32(int *p, int a) +{ + vst1q_lane_s32((int32_t *) p, vdupq_n_s32(a), 0); +} + +// Store 64-bit integer a into memory using a non-temporal hint to minimize +// cache pollution. If the cache line containing address mem_addr is already in +// the cache, the cache will be updated. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_si64 +FORCE_INLINE void _mm_stream_si64(__int64 *p, __int64 a) +{ + vst1_s64((int64_t *) p, vdup_n_s64((int64_t) a)); +} + +// Subtract packed 16-bit integers in b from packed 16-bit integers in a, and +// store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_epi16 +FORCE_INLINE __m128i _mm_sub_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s16( + vsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Subtracts the 4 signed or unsigned 32-bit integers of b from the 4 signed or +// unsigned 32-bit integers of a. +// +// r0 := a0 - b0 +// r1 := a1 - b1 +// r2 := a2 - b2 +// r3 := a3 - b3 +// +// https://msdn.microsoft.com/en-us/library/vstudio/fhh866h0(v=vs.100).aspx +FORCE_INLINE __m128i _mm_sub_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + vsubq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Subtract 2 packed 64-bit integers in b from 2 packed 64-bit integers in a, +// and store the results in dst. 
+// r0 := a0 - b0 +// r1 := a1 - b1 +FORCE_INLINE __m128i _mm_sub_epi64(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s64( + vsubq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b))); +} + +// Subtract packed 8-bit integers in b from packed 8-bit integers in a, and +// store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_epi8 +FORCE_INLINE __m128i _mm_sub_epi8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s8( + vsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Subtract packed double-precision (64-bit) floating-point elements in b from +// packed double-precision (64-bit) floating-point elements in a, and store the +// results in dst. +// +// FOR j := 0 to 1 +// i := j*64 +// dst[i+63:i] := a[i+63:i] - b[i+63:i] +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_sub_pd +FORCE_INLINE __m128d _mm_sub_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_f64( + vsubq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#else + double *da = (double *) &a; + double *db = (double *) &b; + double c[2]; + c[0] = da[0] - db[0]; + c[1] = da[1] - db[1]; + return vld1q_f32((float32_t *) c); +#endif +} + +// Subtract the lower double-precision (64-bit) floating-point element in b from +// the lower double-precision (64-bit) floating-point element in a, store the +// result in the lower element of dst, and copy the upper element from a to the +// upper element of dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_sd +FORCE_INLINE __m128d _mm_sub_sd(__m128d a, __m128d b) +{ + return _mm_move_sd(a, _mm_sub_pd(a, b)); +} + +// Subtract 64-bit integer b from 64-bit integer a, and store the result in dst. +// +// dst[63:0] := a[63:0] - b[63:0] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_si64 +FORCE_INLINE __m64 _mm_sub_si64(__m64 a, __m64 b) +{ + return vreinterpret_m64_s64( + vsub_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b))); +} + +// Subtracts the 8 signed 16-bit integers of b from the 8 signed 16-bit integers +// of a and saturates. +// +// r0 := SignedSaturate(a0 - b0) +// r1 := SignedSaturate(a1 - b1) +// ... +// r7 := SignedSaturate(a7 - b7) +// +// https://technet.microsoft.com/en-us/subscriptions/3247z5b8(v=vs.90) +FORCE_INLINE __m128i _mm_subs_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s16( + vqsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Subtracts the 16 signed 8-bit integers of b from the 16 signed 8-bit integers +// of a and saturates. +// +// r0 := SignedSaturate(a0 - b0) +// r1 := SignedSaturate(a1 - b1) +// ... +// r15 := SignedSaturate(a15 - b15) +// +// https://technet.microsoft.com/en-us/subscriptions/by7kzks1(v=vs.90) +FORCE_INLINE __m128i _mm_subs_epi8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s8( + vqsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Subtracts the 8 unsigned 16-bit integers of bfrom the 8 unsigned 16-bit +// integers of a and saturates.. +// https://technet.microsoft.com/en-us/subscriptions/index/f44y0s19(v=vs.90).aspx +FORCE_INLINE __m128i _mm_subs_epu16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u16( + vqsubq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b))); +} + +// Subtracts the 16 unsigned 8-bit integers of b from the 16 unsigned 8-bit +// integers of a and saturates. 
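+//
+// Illustrative sketch (assuming the _mm_set1_epi8 helper defined earlier in
+// this header); the whole _mm_subs_* family maps directly onto NEON's
+// saturating subtracts (vqsub*), so the saturation is handled in hardware:
+//
+//   __m128i r = _mm_subs_epu8(_mm_set1_epi8(10), _mm_set1_epi8(20));
+//   // every unsigned lane clamps to 0 instead of wrapping around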
+// +// r0 := UnsignedSaturate(a0 - b0) +// r1 := UnsignedSaturate(a1 - b1) +// ... +// r15 := UnsignedSaturate(a15 - b15) +// +// https://technet.microsoft.com/en-us/subscriptions/yadkxc18(v=vs.90) +FORCE_INLINE __m128i _mm_subs_epu8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u8( + vqsubq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b))); +} + +#define _mm_ucomieq_sd _mm_comieq_sd +#define _mm_ucomige_sd _mm_comige_sd +#define _mm_ucomigt_sd _mm_comigt_sd +#define _mm_ucomile_sd _mm_comile_sd +#define _mm_ucomilt_sd _mm_comilt_sd +#define _mm_ucomineq_sd _mm_comineq_sd + +// Return vector of type __m128d with undefined elements. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_undefined_pd +FORCE_INLINE __m128d _mm_undefined_pd(void) +{ +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wuninitialized" +#endif + __m128d a; + return a; +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic pop +#endif +} + +// Interleaves the upper 4 signed or unsigned 16-bit integers in a with the +// upper 4 signed or unsigned 16-bit integers in b. +// +// r0 := a4 +// r1 := b4 +// r2 := a5 +// r3 := b5 +// r4 := a6 +// r5 := b6 +// r6 := a7 +// r7 := b7 +// +// https://msdn.microsoft.com/en-us/library/03196cz7(v=vs.100).aspx +FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128i_s16( + vzip2q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +#else + int16x4_t a1 = vget_high_s16(vreinterpretq_s16_m128i(a)); + int16x4_t b1 = vget_high_s16(vreinterpretq_s16_m128i(b)); + int16x4x2_t result = vzip_s16(a1, b1); + return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1])); +#endif +} + +// Interleaves the upper 2 signed or unsigned 32-bit integers in a with the +// upper 2 signed or unsigned 32-bit integers in b. +// https://msdn.microsoft.com/en-us/library/65sa7cbs(v=vs.100).aspx +FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128i_s32( + vzip2q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +#else + int32x2_t a1 = vget_high_s32(vreinterpretq_s32_m128i(a)); + int32x2_t b1 = vget_high_s32(vreinterpretq_s32_m128i(b)); + int32x2x2_t result = vzip_s32(a1, b1); + return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1])); +#endif +} + +// Interleaves the upper signed or unsigned 64-bit integer in a with the +// upper signed or unsigned 64-bit integer in b. +// +// r0 := a1 +// r1 := b1 +FORCE_INLINE __m128i _mm_unpackhi_epi64(__m128i a, __m128i b) +{ + int64x1_t a_h = vget_high_s64(vreinterpretq_s64_m128i(a)); + int64x1_t b_h = vget_high_s64(vreinterpretq_s64_m128i(b)); + return vreinterpretq_m128i_s64(vcombine_s64(a_h, b_h)); +} + +// Interleaves the upper 8 signed or unsigned 8-bit integers in a with the upper +// 8 signed or unsigned 8-bit integers in b. +// +// r0 := a8 +// r1 := b8 +// r2 := a9 +// r3 := b9 +// ... 
+// r14 := a15 +// r15 := b15 +// +// https://msdn.microsoft.com/en-us/library/t5h7783k(v=vs.100).aspx +FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128i_s8( + vzip2q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +#else + int8x8_t a1 = + vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(a))); + int8x8_t b1 = + vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(b))); + int8x8x2_t result = vzip_s8(a1, b1); + return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1])); +#endif +} + +// Unpack and interleave double-precision (64-bit) floating-point elements from +// the high half of a and b, and store the results in dst. +// +// DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) { +// dst[63:0] := src1[127:64] +// dst[127:64] := src2[127:64] +// RETURN dst[127:0] +// } +// dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpackhi_pd +FORCE_INLINE __m128d _mm_unpackhi_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_f64( + vzip2q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#else + return vreinterpretq_m128d_s64( + vcombine_s64(vget_high_s64(vreinterpretq_s64_m128d(a)), + vget_high_s64(vreinterpretq_s64_m128d(b)))); +#endif +} + +// Interleaves the lower 4 signed or unsigned 16-bit integers in a with the +// lower 4 signed or unsigned 16-bit integers in b. +// +// r0 := a0 +// r1 := b0 +// r2 := a1 +// r3 := b1 +// r4 := a2 +// r5 := b2 +// r6 := a3 +// r7 := b3 +// +// https://msdn.microsoft.com/en-us/library/btxb17bw%28v=vs.90%29.aspx +FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128i_s16( + vzip1q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +#else + int16x4_t a1 = vget_low_s16(vreinterpretq_s16_m128i(a)); + int16x4_t b1 = vget_low_s16(vreinterpretq_s16_m128i(b)); + int16x4x2_t result = vzip_s16(a1, b1); + return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1])); +#endif +} + +// Interleaves the lower 2 signed or unsigned 32 - bit integers in a with the +// lower 2 signed or unsigned 32 - bit integers in b. +// +// r0 := a0 +// r1 := b0 +// r2 := a1 +// r3 := b1 +// +// https://msdn.microsoft.com/en-us/library/x8atst9d(v=vs.100).aspx +FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128i_s32( + vzip1q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +#else + int32x2_t a1 = vget_low_s32(vreinterpretq_s32_m128i(a)); + int32x2_t b1 = vget_low_s32(vreinterpretq_s32_m128i(b)); + int32x2x2_t result = vzip_s32(a1, b1); + return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1])); +#endif +} + +FORCE_INLINE __m128i _mm_unpacklo_epi64(__m128i a, __m128i b) +{ + int64x1_t a_l = vget_low_s64(vreinterpretq_s64_m128i(a)); + int64x1_t b_l = vget_low_s64(vreinterpretq_s64_m128i(b)); + return vreinterpretq_m128i_s64(vcombine_s64(a_l, b_l)); +} + +// Interleaves the lower 8 signed or unsigned 8-bit integers in a with the lower +// 8 signed or unsigned 8-bit integers in b. +// +// r0 := a0 +// r1 := b0 +// r2 := a1 +// r3 := b1 +// ... 
+// r14 := a7 +// r15 := b7 +// +// https://msdn.microsoft.com/en-us/library/xf7k860c%28v=vs.90%29.aspx +FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128i_s8( + vzip1q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +#else + int8x8_t a1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(a))); + int8x8_t b1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(b))); + int8x8x2_t result = vzip_s8(a1, b1); + return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1])); +#endif +} + +// Unpack and interleave double-precision (64-bit) floating-point elements from +// the low half of a and b, and store the results in dst. +// +// DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) { +// dst[63:0] := src1[63:0] +// dst[127:64] := src2[63:0] +// RETURN dst[127:0] +// } +// dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpacklo_pd +FORCE_INLINE __m128d _mm_unpacklo_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_f64( + vzip1q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#else + return vreinterpretq_m128d_s64( + vcombine_s64(vget_low_s64(vreinterpretq_s64_m128d(a)), + vget_low_s64(vreinterpretq_s64_m128d(b)))); +#endif +} + +// Compute the bitwise XOR of packed double-precision (64-bit) floating-point +// elements in a and b, and store the results in dst. +// +// FOR j := 0 to 1 +// i := j*64 +// dst[i+63:i] := a[i+63:i] XOR b[i+63:i] +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_xor_pd +FORCE_INLINE __m128d _mm_xor_pd(__m128d a, __m128d b) +{ + return vreinterpretq_m128d_s64( + veorq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b))); +} + +// Computes the bitwise XOR of the 128-bit value in a and the 128-bit value in +// b. https://msdn.microsoft.com/en-us/library/fzt08www(v=vs.100).aspx +FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + veorq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +/* SSE3 */ + +// Alternatively add and subtract packed double-precision (64-bit) +// floating-point elements in a to/from packed elements in b, and store the +// results in dst. +// +// FOR j := 0 to 1 +// i := j*64 +// IF ((j & 1) == 0) +// dst[i+63:i] := a[i+63:i] - b[i+63:i] +// ELSE +// dst[i+63:i] := a[i+63:i] + b[i+63:i] +// FI +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_addsub_pd +FORCE_INLINE __m128d _mm_addsub_pd(__m128d a, __m128d b) +{ + static const __m128d mask = _mm_set_pd(1.0f, -1.0f); +#if defined(__aarch64__) + return vreinterpretq_m128d_f64(vfmaq_f64(vreinterpretq_f64_m128d(a), + vreinterpretq_f64_m128d(b), + vreinterpretq_f64_m128d(mask))); +#else + return _mm_add_pd(_mm_mul_pd(b, mask), a); +#endif +} + +// Alternatively add and subtract packed single-precision (32-bit) +// floating-point elements in a to/from packed elements in b, and store the +// results in dst. 
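+//
+// Implementation note (an illustrative reading of the code below, not Intel
+// documentation): both addsub variants multiply b by a constant {-1, +1, ...}
+// mask, so where FMA is available the whole operation becomes a single fused
+// multiply-add:
+//
+//   dst = a + mask * b    // even lanes: a - b, odd lanes: a + b
+//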
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=addsub_ps
+FORCE_INLINE __m128 _mm_addsub_ps(__m128 a, __m128 b)
+{
+    static const __m128 mask = _mm_setr_ps(-1.0f, 1.0f, -1.0f, 1.0f);
+#if defined(__aarch64__) || defined(__ARM_FEATURE_FMA) /* VFPv4+ */
+    return vreinterpretq_m128_f32(vfmaq_f32(vreinterpretq_f32_m128(a),
+                                            vreinterpretq_f32_m128(mask),
+                                            vreinterpretq_f32_m128(b)));
+#else
+    return _mm_add_ps(_mm_mul_ps(b, mask), a);
+#endif
+}
+
+// Horizontally add adjacent pairs of double-precision (64-bit) floating-point
+// elements in a and b, and pack the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pd
+FORCE_INLINE __m128d _mm_hadd_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vpaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+    double *da = (double *) &a;
+    double *db = (double *) &b;
+    double c[] = {da[0] + da[1], db[0] + db[1]};
+    return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c));
+#endif
+}
+
+// Computes pairwise add of each argument as single-precision, floating-point
+// values a and b.
+// https://msdn.microsoft.com/en-us/library/yd9wecaa.aspx
+FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128_f32(
+        vpaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+#else
+    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
+    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
+    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
+    float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
+    return vreinterpretq_m128_f32(
+        vcombine_f32(vpadd_f32(a10, a32), vpadd_f32(b10, b32)));
+#endif
+}
+
+// Horizontally subtract adjacent pairs of double-precision (64-bit)
+// floating-point elements in a and b, and pack the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_pd
+FORCE_INLINE __m128d _mm_hsub_pd(__m128d _a, __m128d _b)
+{
+#if defined(__aarch64__)
+    float64x2_t a = vreinterpretq_f64_m128d(_a);
+    float64x2_t b = vreinterpretq_f64_m128d(_b);
+    return vreinterpretq_m128d_f64(
+        vsubq_f64(vuzp1q_f64(a, b), vuzp2q_f64(a, b)));
+#else
+    double *da = (double *) &_a;
+    double *db = (double *) &_b;
+    double c[] = {da[0] - da[1], db[0] - db[1]};
+    return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c));
+#endif
+}
+
+// Horizontally subtract adjacent pairs of single-precision (32-bit)
+// floating-point elements in a and b, and pack the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_ps
+FORCE_INLINE __m128 _mm_hsub_ps(__m128 _a, __m128 _b)
+{
+    float32x4_t a = vreinterpretq_f32_m128(_a);
+    float32x4_t b = vreinterpretq_f32_m128(_b);
+#if defined(__aarch64__)
+    return vreinterpretq_m128_f32(
+        vsubq_f32(vuzp1q_f32(a, b), vuzp2q_f32(a, b)));
+#else
+    float32x4x2_t c = vuzpq_f32(a, b);
+    return vreinterpretq_m128_f32(vsubq_f32(c.val[0], c.val[1]));
+#endif
+}
+
+// Load 128-bits of integer data from unaligned memory into dst. This intrinsic
+// may perform better than _mm_loadu_si128 when the data crosses a cache line
+// boundary.
+//
+// dst[127:0] := MEM[mem_addr+127:mem_addr]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_lddqu_si128
+#define _mm_lddqu_si128 _mm_loadu_si128
+
+// Load a double-precision (64-bit) floating-point element from memory into both
+// elements of dst.
+// +// dst[63:0] := MEM[mem_addr+63:mem_addr] +// dst[127:64] := MEM[mem_addr+63:mem_addr] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loaddup_pd +#define _mm_loaddup_pd _mm_load1_pd + +// Duplicate the low double-precision (64-bit) floating-point element from a, +// and store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movedup_pd +FORCE_INLINE __m128d _mm_movedup_pd(__m128d a) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_f64( + vdupq_laneq_f64(vreinterpretq_f64_m128d(a), 0)); +#else + return vreinterpretq_m128d_u64( + vdupq_n_u64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0))); +#endif +} + +// Duplicate odd-indexed single-precision (32-bit) floating-point elements +// from a, and store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movehdup_ps +FORCE_INLINE __m128 _mm_movehdup_ps(__m128 a) +{ +#if __has_builtin(__builtin_shufflevector) + return vreinterpretq_m128_f32(__builtin_shufflevector( + vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 1, 1, 3, 3)); +#else + float32_t a1 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 1); + float32_t a3 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 3); + float ALIGN_STRUCT(16) data[4] = {a1, a1, a3, a3}; + return vreinterpretq_m128_f32(vld1q_f32(data)); +#endif +} + +// Duplicate even-indexed single-precision (32-bit) floating-point elements +// from a, and store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_moveldup_ps +FORCE_INLINE __m128 _mm_moveldup_ps(__m128 a) +{ +#if __has_builtin(__builtin_shufflevector) + return vreinterpretq_m128_f32(__builtin_shufflevector( + vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 0, 0, 2, 2)); +#else + float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); + float32_t a2 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 2); + float ALIGN_STRUCT(16) data[4] = {a0, a0, a2, a2}; + return vreinterpretq_m128_f32(vld1q_f32(data)); +#endif +} + +/* SSSE3 */ + +// Compute the absolute value of packed signed 16-bit integers in a, and store +// the unsigned results in dst. +// +// FOR j := 0 to 7 +// i := j*16 +// dst[i+15:i] := ABS(a[i+15:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi16 +FORCE_INLINE __m128i _mm_abs_epi16(__m128i a) +{ + return vreinterpretq_m128i_s16(vabsq_s16(vreinterpretq_s16_m128i(a))); +} + +// Compute the absolute value of packed signed 32-bit integers in a, and store +// the unsigned results in dst. +// +// FOR j := 0 to 3 +// i := j*32 +// dst[i+31:i] := ABS(a[i+31:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi32 +FORCE_INLINE __m128i _mm_abs_epi32(__m128i a) +{ + return vreinterpretq_m128i_s32(vabsq_s32(vreinterpretq_s32_m128i(a))); +} + +// Compute the absolute value of packed signed 8-bit integers in a, and store +// the unsigned results in dst. +// +// FOR j := 0 to 15 +// i := j*8 +// dst[i+7:i] := ABS(a[i+7:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi8 +FORCE_INLINE __m128i _mm_abs_epi8(__m128i a) +{ + return vreinterpretq_m128i_s8(vabsq_s8(vreinterpretq_s8_m128i(a))); +} + +// Compute the absolute value of packed signed 16-bit integers in a, and store +// the unsigned results in dst. 
+// +// FOR j := 0 to 3 +// i := j*16 +// dst[i+15:i] := ABS(a[i+15:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi16 +FORCE_INLINE __m64 _mm_abs_pi16(__m64 a) +{ + return vreinterpret_m64_s16(vabs_s16(vreinterpret_s16_m64(a))); +} + +// Compute the absolute value of packed signed 32-bit integers in a, and store +// the unsigned results in dst. +// +// FOR j := 0 to 1 +// i := j*32 +// dst[i+31:i] := ABS(a[i+31:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi32 +FORCE_INLINE __m64 _mm_abs_pi32(__m64 a) +{ + return vreinterpret_m64_s32(vabs_s32(vreinterpret_s32_m64(a))); +} + +// Compute the absolute value of packed signed 8-bit integers in a, and store +// the unsigned results in dst. +// +// FOR j := 0 to 7 +// i := j*8 +// dst[i+7:i] := ABS(a[i+7:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi8 +FORCE_INLINE __m64 _mm_abs_pi8(__m64 a) +{ + return vreinterpret_m64_s8(vabs_s8(vreinterpret_s8_m64(a))); +} + +// Concatenate 16-byte blocks in a and b into a 32-byte temporary result, shift +// the result right by imm8 bytes, and store the low 16 bytes in dst. +// +// tmp[255:0] := ((a[127:0] << 128)[255:0] OR b[127:0]) >> (imm8*8) +// dst[127:0] := tmp[127:0] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_alignr_epi8 +FORCE_INLINE __m128i _mm_alignr_epi8(__m128i a, __m128i b, int imm) +{ + if (_sse2neon_unlikely(imm & ~31)) + return _mm_setzero_si128(); + int idx; + uint8x16_t tmp[2]; + if (imm >= 16) { + idx = imm - 16; + tmp[0] = vreinterpretq_u8_m128i(a); + tmp[1] = vdupq_n_u8(0); + } else { + idx = imm; + tmp[0] = vreinterpretq_u8_m128i(b); + tmp[1] = vreinterpretq_u8_m128i(a); + } + return vreinterpretq_m128i_u8(vld1q_u8(((uint8_t const *) tmp) + idx)); +} + +// Concatenate 8-byte blocks in a and b into a 16-byte temporary result, shift +// the result right by imm8 bytes, and store the low 8 bytes in dst. +// +// tmp[127:0] := ((a[63:0] << 64)[127:0] OR b[63:0]) >> (imm8*8) +// dst[63:0] := tmp[63:0] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_alignr_pi8 +#define _mm_alignr_pi8(a, b, imm) \ + __extension__({ \ + __m64 ret; \ + if (_sse2neon_unlikely((imm) >= 16)) { \ + ret = vreinterpret_m64_s8(vdup_n_s8(0)); \ + } else { \ + uint8x8_t tmp_low, tmp_high; \ + if ((imm) >= 8) { \ + const int idx = (imm) -8; \ + tmp_low = vreinterpret_u8_m64(a); \ + tmp_high = vdup_n_u8(0); \ + ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \ + } else { \ + const int idx = (imm); \ + tmp_low = vreinterpret_u8_m64(b); \ + tmp_high = vreinterpret_u8_m64(a); \ + ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \ + } \ + } \ + ret; \ + }) + +// Computes pairwise add of each argument as a 16-bit signed or unsigned integer +// values a and b. +FORCE_INLINE __m128i _mm_hadd_epi16(__m128i _a, __m128i _b) +{ + int16x8_t a = vreinterpretq_s16_m128i(_a); + int16x8_t b = vreinterpretq_s16_m128i(_b); +#if defined(__aarch64__) + return vreinterpretq_m128i_s16(vpaddq_s16(a, b)); +#else + return vreinterpretq_m128i_s16( + vcombine_s16(vpadd_s16(vget_low_s16(a), vget_high_s16(a)), + vpadd_s16(vget_low_s16(b), vget_high_s16(b)))); +#endif +} + +// Computes pairwise add of each argument as a 32-bit signed or unsigned integer +// values a and b. 
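+//
+// Illustrative sketch (assuming the _mm_setr_epi32 helper defined earlier in
+// this header); the SSE horizontal adds pair up adjacent lanes, which maps
+// directly onto NEON's pairwise-add instructions (vpadd/vpaddq):
+//
+//   __m128i a = _mm_setr_epi32(1, 2, 3, 4);
+//   __m128i b = _mm_setr_epi32(5, 6, 7, 8);
+//   __m128i r = _mm_hadd_epi32(a, b);   // r = {3, 7, 11, 15}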
+FORCE_INLINE __m128i _mm_hadd_epi32(__m128i _a, __m128i _b)
+{
+    int32x4_t a = vreinterpretq_s32_m128i(_a);
+    int32x4_t b = vreinterpretq_s32_m128i(_b);
+    return vreinterpretq_m128i_s32(
+        vcombine_s32(vpadd_s32(vget_low_s32(a), vget_high_s32(a)),
+                     vpadd_s32(vget_low_s32(b), vget_high_s32(b))));
+}
+
+// Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the
+// signed 16-bit results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pi16
+FORCE_INLINE __m64 _mm_hadd_pi16(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_s16(
+        vpadd_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
+}
+
+// Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the
+// signed 32-bit results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pi32
+FORCE_INLINE __m64 _mm_hadd_pi32(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_s32(
+        vpadd_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b)));
+}
+
+// Horizontally add adjacent pairs of signed 16-bit integers in a and b using
+// saturation, and pack the signed 16-bit results in dst.
+FORCE_INLINE __m128i _mm_hadds_epi16(__m128i _a, __m128i _b)
+{
+#if defined(__aarch64__)
+    int16x8_t a = vreinterpretq_s16_m128i(_a);
+    int16x8_t b = vreinterpretq_s16_m128i(_b);
+    return vreinterpretq_s64_s16(
+        vqaddq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
+#else
+    int32x4_t a = vreinterpretq_s32_m128i(_a);
+    int32x4_t b = vreinterpretq_s32_m128i(_b);
+    // Interleave using vshrn/vmovn
+    // [a0|a2|a4|a6|b0|b2|b4|b6]
+    // [a1|a3|a5|a7|b1|b3|b5|b7]
+    int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
+    int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
+    // Saturated add
+    return vreinterpretq_m128i_s16(vqaddq_s16(ab0246, ab1357));
+#endif
+}
+
+// Horizontally add adjacent pairs of signed 16-bit integers in a and b using
+// saturation, and pack the signed 16-bit results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadds_pi16
+FORCE_INLINE __m64 _mm_hadds_pi16(__m64 _a, __m64 _b)
+{
+    int16x4_t a = vreinterpret_s16_m64(_a);
+    int16x4_t b = vreinterpret_s16_m64(_b);
+#if defined(__aarch64__)
+    return vreinterpret_s64_s16(vqadd_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));
+#else
+    int16x4x2_t res = vuzp_s16(a, b);
+    return vreinterpret_s64_s16(vqadd_s16(res.val[0], res.val[1]));
+#endif
+}
+
+// Horizontally subtract adjacent pairs of 16-bit integers in a and b, and pack
+// the signed 16-bit results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_epi16
+FORCE_INLINE __m128i _mm_hsub_epi16(__m128i _a, __m128i _b)
+{
+    int16x8_t a = vreinterpretq_s16_m128i(_a);
+    int16x8_t b = vreinterpretq_s16_m128i(_b);
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s16(
+        vsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
+#else
+    int16x8x2_t c = vuzpq_s16(a, b);
+    return vreinterpretq_m128i_s16(vsubq_s16(c.val[0], c.val[1]));
+#endif
+}
+
+// Horizontally subtract adjacent pairs of 32-bit integers in a and b, and pack
+// the signed 32-bit results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_epi32 +FORCE_INLINE __m128i _mm_hsub_epi32(__m128i _a, __m128i _b) +{ + int32x4_t a = vreinterpretq_s32_m128i(_a); + int32x4_t b = vreinterpretq_s32_m128i(_b); +#if defined(__aarch64__) + return vreinterpretq_m128i_s32( + vsubq_s32(vuzp1q_s32(a, b), vuzp2q_s32(a, b))); +#else + int32x4x2_t c = vuzpq_s32(a, b); + return vreinterpretq_m128i_s32(vsubq_s32(c.val[0], c.val[1])); +#endif +} + +// Horizontally subtract adjacent pairs of 16-bit integers in a and b, and pack +// the signed 16-bit results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_pi16 +FORCE_INLINE __m64 _mm_hsub_pi16(__m64 _a, __m64 _b) +{ + int16x4_t a = vreinterpret_s16_m64(_a); + int16x4_t b = vreinterpret_s16_m64(_b); +#if defined(__aarch64__) + return vreinterpret_m64_s16(vsub_s16(vuzp1_s16(a, b), vuzp2_s16(a, b))); +#else + int16x4x2_t c = vuzp_s16(a, b); + return vreinterpret_m64_s16(vsub_s16(c.val[0], c.val[1])); +#endif +} + +// Horizontally subtract adjacent pairs of 32-bit integers in a and b, and pack +// the signed 32-bit results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_hsub_pi32 +FORCE_INLINE __m64 _mm_hsub_pi32(__m64 _a, __m64 _b) +{ + int32x2_t a = vreinterpret_s32_m64(_a); + int32x2_t b = vreinterpret_s32_m64(_b); +#if defined(__aarch64__) + return vreinterpret_m64_s32(vsub_s32(vuzp1_s32(a, b), vuzp2_s32(a, b))); +#else + int32x2x2_t c = vuzp_s32(a, b); + return vreinterpret_m64_s32(vsub_s32(c.val[0], c.val[1])); +#endif +} + +// Computes saturated pairwise difference of each argument as a 16-bit signed +// integer values a and b. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsubs_epi16 +FORCE_INLINE __m128i _mm_hsubs_epi16(__m128i _a, __m128i _b) +{ + int16x8_t a = vreinterpretq_s16_m128i(_a); + int16x8_t b = vreinterpretq_s16_m128i(_b); +#if defined(__aarch64__) + return vreinterpretq_m128i_s16( + vqsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b))); +#else + int16x8x2_t c = vuzpq_s16(a, b); + return vreinterpretq_m128i_s16(vqsubq_s16(c.val[0], c.val[1])); +#endif +} + +// Horizontally subtract adjacent pairs of signed 16-bit integers in a and b +// using saturation, and pack the signed 16-bit results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsubs_pi16 +FORCE_INLINE __m64 _mm_hsubs_pi16(__m64 _a, __m64 _b) +{ + int16x4_t a = vreinterpret_s16_m64(_a); + int16x4_t b = vreinterpret_s16_m64(_b); +#if defined(__aarch64__) + return vreinterpret_m64_s16(vqsub_s16(vuzp1_s16(a, b), vuzp2_s16(a, b))); +#else + int16x4x2_t c = vuzp_s16(a, b); + return vreinterpret_m64_s16(vqsub_s16(c.val[0], c.val[1])); +#endif +} + +// Vertically multiply each unsigned 8-bit integer from a with the corresponding +// signed 8-bit integer from b, producing intermediate signed 16-bit integers. +// Horizontally add adjacent pairs of intermediate signed 16-bit integers, +// and pack the saturated results in dst. 
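+//
+// Implementation note (an illustrative reading of the two code paths below):
+// a is treated as unsigned and b as signed, e.g. with a0 = 255, a1 = 1 and
+// b0 = -1, b1 = 2 the first result lane is Saturate16(255*(-1) + 1*2) = -253.
+// AArch64 widens both inputs with vmovl_u8/vmovl_s8 and folds adjacent pairs
+// with a saturating add; the ARMv7 fallback separates even and odd bytes via
+// masking and shifts so each 8-bit product can be formed in a signed 16-bit
+// lane before the final saturating add.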
+// +// FOR j := 0 to 7 +// i := j*16 +// dst[i+15:i] := Saturate_To_Int16( a[i+15:i+8]*b[i+15:i+8] + +// a[i+7:i]*b[i+7:i] ) +// ENDFOR +FORCE_INLINE __m128i _mm_maddubs_epi16(__m128i _a, __m128i _b) +{ +#if defined(__aarch64__) + uint8x16_t a = vreinterpretq_u8_m128i(_a); + int8x16_t b = vreinterpretq_s8_m128i(_b); + int16x8_t tl = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))), + vmovl_s8(vget_low_s8(b))); + int16x8_t th = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a))), + vmovl_s8(vget_high_s8(b))); + return vreinterpretq_m128i_s16( + vqaddq_s16(vuzp1q_s16(tl, th), vuzp2q_s16(tl, th))); +#else + // This would be much simpler if x86 would choose to zero extend OR sign + // extend, not both. This could probably be optimized better. + uint16x8_t a = vreinterpretq_u16_m128i(_a); + int16x8_t b = vreinterpretq_s16_m128i(_b); + + // Zero extend a + int16x8_t a_odd = vreinterpretq_s16_u16(vshrq_n_u16(a, 8)); + int16x8_t a_even = vreinterpretq_s16_u16(vbicq_u16(a, vdupq_n_u16(0xff00))); + + // Sign extend by shifting left then shifting right. + int16x8_t b_even = vshrq_n_s16(vshlq_n_s16(b, 8), 8); + int16x8_t b_odd = vshrq_n_s16(b, 8); + + // multiply + int16x8_t prod1 = vmulq_s16(a_even, b_even); + int16x8_t prod2 = vmulq_s16(a_odd, b_odd); + + // saturated add + return vreinterpretq_m128i_s16(vqaddq_s16(prod1, prod2)); +#endif +} + +// Vertically multiply each unsigned 8-bit integer from a with the corresponding +// signed 8-bit integer from b, producing intermediate signed 16-bit integers. +// Horizontally add adjacent pairs of intermediate signed 16-bit integers, and +// pack the saturated results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maddubs_pi16 +FORCE_INLINE __m64 _mm_maddubs_pi16(__m64 _a, __m64 _b) +{ + uint16x4_t a = vreinterpret_u16_m64(_a); + int16x4_t b = vreinterpret_s16_m64(_b); + + // Zero extend a + int16x4_t a_odd = vreinterpret_s16_u16(vshr_n_u16(a, 8)); + int16x4_t a_even = vreinterpret_s16_u16(vand_u16(a, vdup_n_u16(0xff))); + + // Sign extend by shifting left then shifting right. + int16x4_t b_even = vshr_n_s16(vshl_n_s16(b, 8), 8); + int16x4_t b_odd = vshr_n_s16(b, 8); + + // multiply + int16x4_t prod1 = vmul_s16(a_even, b_even); + int16x4_t prod2 = vmul_s16(a_odd, b_odd); + + // saturated add + return vreinterpret_m64_s16(vqadd_s16(prod1, prod2)); +} + +// Multiply packed signed 16-bit integers in a and b, producing intermediate +// signed 32-bit integers. Shift right by 15 bits while rounding up, and store +// the packed 16-bit integers in dst. +// +// r0 := Round(((int32_t)a0 * (int32_t)b0) >> 15) +// r1 := Round(((int32_t)a1 * (int32_t)b1) >> 15) +// r2 := Round(((int32_t)a2 * (int32_t)b2) >> 15) +// ... 
+// r7 := Round(((int32_t)a7 * (int32_t)b7) >> 15) +FORCE_INLINE __m128i _mm_mulhrs_epi16(__m128i a, __m128i b) +{ + // Has issues due to saturation + // return vreinterpretq_m128i_s16(vqrdmulhq_s16(a, b)); + + // Multiply + int32x4_t mul_lo = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)), + vget_low_s16(vreinterpretq_s16_m128i(b))); + int32x4_t mul_hi = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)), + vget_high_s16(vreinterpretq_s16_m128i(b))); + + // Rounding narrowing shift right + // narrow = (int16_t)((mul + 16384) >> 15); + int16x4_t narrow_lo = vrshrn_n_s32(mul_lo, 15); + int16x4_t narrow_hi = vrshrn_n_s32(mul_hi, 15); + + // Join together + return vreinterpretq_m128i_s16(vcombine_s16(narrow_lo, narrow_hi)); +} + +// Multiply packed signed 16-bit integers in a and b, producing intermediate +// signed 32-bit integers. Truncate each intermediate integer to the 18 most +// significant bits, round by adding 1, and store bits [16:1] to dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhrs_pi16 +FORCE_INLINE __m64 _mm_mulhrs_pi16(__m64 a, __m64 b) +{ + int32x4_t mul_extend = + vmull_s16((vreinterpret_s16_m64(a)), (vreinterpret_s16_m64(b))); + + // Rounding narrowing shift right + return vreinterpret_m64_s16(vrshrn_n_s32(mul_extend, 15)); +} + +// Shuffle packed 8-bit integers in a according to shuffle control mask in the +// corresponding 8-bit element of b, and store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_epi8 +FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b) +{ + int8x16_t tbl = vreinterpretq_s8_m128i(a); // input a + uint8x16_t idx = vreinterpretq_u8_m128i(b); // input b + uint8x16_t idx_masked = + vandq_u8(idx, vdupq_n_u8(0x8F)); // avoid using meaningless bits +#if defined(__aarch64__) + return vreinterpretq_m128i_s8(vqtbl1q_s8(tbl, idx_masked)); +#elif defined(__GNUC__) + int8x16_t ret; + // %e and %f represent the even and odd D registers + // respectively. + __asm__ __volatile__( + "vtbl.8 %e[ret], {%e[tbl], %f[tbl]}, %e[idx]\n" + "vtbl.8 %f[ret], {%e[tbl], %f[tbl]}, %f[idx]\n" + : [ret] "=&w"(ret) + : [tbl] "w"(tbl), [idx] "w"(idx_masked)); + return vreinterpretq_m128i_s8(ret); +#else + // use this line if testing on aarch64 + int8x8x2_t a_split = {vget_low_s8(tbl), vget_high_s8(tbl)}; + return vreinterpretq_m128i_s8( + vcombine_s8(vtbl2_s8(a_split, vget_low_u8(idx_masked)), + vtbl2_s8(a_split, vget_high_u8(idx_masked)))); +#endif +} + +// Shuffle packed 8-bit integers in a according to shuffle control mask in the +// corresponding 8-bit element of b, and store the results in dst. +// +// FOR j := 0 to 7 +// i := j*8 +// IF b[i+7] == 1 +// dst[i+7:i] := 0 +// ELSE +// index[2:0] := b[i+2:i] +// dst[i+7:i] := a[index*8+7:index*8] +// FI +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_pi8 +FORCE_INLINE __m64 _mm_shuffle_pi8(__m64 a, __m64 b) +{ + const int8x8_t controlMask = + vand_s8(vreinterpret_s8_m64(b), vdup_n_s8((int8_t)(0x1 << 7 | 0x07))); + int8x8_t res = vtbl1_s8(vreinterpret_s8_m64(a), controlMask); + return vreinterpret_m64_s8(res); +} + +// Negate packed 16-bit integers in a when the corresponding signed +// 16-bit integer in b is negative, and store the results in dst. +// Element in dst are zeroed out when the corresponding element +// in b is zero. 
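+//
+// Illustrative sketch (assuming the _mm_setr_epi16 helper defined earlier in
+// this header):
+//
+//   __m128i a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
+//   __m128i b = _mm_setr_epi16(-1, 0, 1, -1, 0, 1, -1, 0);
+//   __m128i r = _mm_sign_epi16(a, b);   // r = {-1, 0, 3, -4, 0, 6, -7, 0}
+//
+// The NEON implementations of the _mm_sign_* family build a "b is negative"
+// mask and a "b is zero" mask, select between a and -a with vbsl, then clear
+// the zero lanes with vbic.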
+// +// for i in 0..7 +// if b[i] < 0 +// r[i] := -a[i] +// else if b[i] == 0 +// r[i] := 0 +// else +// r[i] := a[i] +// fi +// done +FORCE_INLINE __m128i _mm_sign_epi16(__m128i _a, __m128i _b) +{ + int16x8_t a = vreinterpretq_s16_m128i(_a); + int16x8_t b = vreinterpretq_s16_m128i(_b); + + // signed shift right: faster than vclt + // (b < 0) ? 0xFFFF : 0 + uint16x8_t ltMask = vreinterpretq_u16_s16(vshrq_n_s16(b, 15)); + // (b == 0) ? 0xFFFF : 0 +#if defined(__aarch64__) + int16x8_t zeroMask = vreinterpretq_s16_u16(vceqzq_s16(b)); +#else + int16x8_t zeroMask = vreinterpretq_s16_u16(vceqq_s16(b, vdupq_n_s16(0))); +#endif + + // bitwise select either a or negative 'a' (vnegq_s16(a) equals to negative + // 'a') based on ltMask + int16x8_t masked = vbslq_s16(ltMask, vnegq_s16(a), a); + // res = masked & (~zeroMask) + int16x8_t res = vbicq_s16(masked, zeroMask); + return vreinterpretq_m128i_s16(res); +} + +// Negate packed 32-bit integers in a when the corresponding signed +// 32-bit integer in b is negative, and store the results in dst. +// Element in dst are zeroed out when the corresponding element +// in b is zero. +// +// for i in 0..3 +// if b[i] < 0 +// r[i] := -a[i] +// else if b[i] == 0 +// r[i] := 0 +// else +// r[i] := a[i] +// fi +// done +FORCE_INLINE __m128i _mm_sign_epi32(__m128i _a, __m128i _b) +{ + int32x4_t a = vreinterpretq_s32_m128i(_a); + int32x4_t b = vreinterpretq_s32_m128i(_b); + + // signed shift right: faster than vclt + // (b < 0) ? 0xFFFFFFFF : 0 + uint32x4_t ltMask = vreinterpretq_u32_s32(vshrq_n_s32(b, 31)); + + // (b == 0) ? 0xFFFFFFFF : 0 +#if defined(__aarch64__) + int32x4_t zeroMask = vreinterpretq_s32_u32(vceqzq_s32(b)); +#else + int32x4_t zeroMask = vreinterpretq_s32_u32(vceqq_s32(b, vdupq_n_s32(0))); +#endif + + // bitwise select either a or negative 'a' (vnegq_s32(a) equals to negative + // 'a') based on ltMask + int32x4_t masked = vbslq_s32(ltMask, vnegq_s32(a), a); + // res = masked & (~zeroMask) + int32x4_t res = vbicq_s32(masked, zeroMask); + return vreinterpretq_m128i_s32(res); +} + +// Negate packed 8-bit integers in a when the corresponding signed +// 8-bit integer in b is negative, and store the results in dst. +// Element in dst are zeroed out when the corresponding element +// in b is zero. +// +// for i in 0..15 +// if b[i] < 0 +// r[i] := -a[i] +// else if b[i] == 0 +// r[i] := 0 +// else +// r[i] := a[i] +// fi +// done +FORCE_INLINE __m128i _mm_sign_epi8(__m128i _a, __m128i _b) +{ + int8x16_t a = vreinterpretq_s8_m128i(_a); + int8x16_t b = vreinterpretq_s8_m128i(_b); + + // signed shift right: faster than vclt + // (b < 0) ? 0xFF : 0 + uint8x16_t ltMask = vreinterpretq_u8_s8(vshrq_n_s8(b, 7)); + + // (b == 0) ? 0xFF : 0 +#if defined(__aarch64__) + int8x16_t zeroMask = vreinterpretq_s8_u8(vceqzq_s8(b)); +#else + int8x16_t zeroMask = vreinterpretq_s8_u8(vceqq_s8(b, vdupq_n_s8(0))); +#endif + + // bitwise select either a or nagative 'a' (vnegq_s8(a) return nagative 'a') + // based on ltMask + int8x16_t masked = vbslq_s8(ltMask, vnegq_s8(a), a); + // res = masked & (~zeroMask) + int8x16_t res = vbicq_s8(masked, zeroMask); + + return vreinterpretq_m128i_s8(res); +} + +// Negate packed 16-bit integers in a when the corresponding signed 16-bit +// integer in b is negative, and store the results in dst. Element in dst are +// zeroed out when the corresponding element in b is zero. 
+// +// FOR j := 0 to 3 +// i := j*16 +// IF b[i+15:i] < 0 +// dst[i+15:i] := -(a[i+15:i]) +// ELSE IF b[i+15:i] == 0 +// dst[i+15:i] := 0 +// ELSE +// dst[i+15:i] := a[i+15:i] +// FI +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi16 +FORCE_INLINE __m64 _mm_sign_pi16(__m64 _a, __m64 _b) +{ + int16x4_t a = vreinterpret_s16_m64(_a); + int16x4_t b = vreinterpret_s16_m64(_b); + + // signed shift right: faster than vclt + // (b < 0) ? 0xFFFF : 0 + uint16x4_t ltMask = vreinterpret_u16_s16(vshr_n_s16(b, 15)); + + // (b == 0) ? 0xFFFF : 0 +#if defined(__aarch64__) + int16x4_t zeroMask = vreinterpret_s16_u16(vceqz_s16(b)); +#else + int16x4_t zeroMask = vreinterpret_s16_u16(vceq_s16(b, vdup_n_s16(0))); +#endif + + // bitwise select either a or nagative 'a' (vneg_s16(a) return nagative 'a') + // based on ltMask + int16x4_t masked = vbsl_s16(ltMask, vneg_s16(a), a); + // res = masked & (~zeroMask) + int16x4_t res = vbic_s16(masked, zeroMask); + + return vreinterpret_m64_s16(res); +} + +// Negate packed 32-bit integers in a when the corresponding signed 32-bit +// integer in b is negative, and store the results in dst. Element in dst are +// zeroed out when the corresponding element in b is zero. +// +// FOR j := 0 to 1 +// i := j*32 +// IF b[i+31:i] < 0 +// dst[i+31:i] := -(a[i+31:i]) +// ELSE IF b[i+31:i] == 0 +// dst[i+31:i] := 0 +// ELSE +// dst[i+31:i] := a[i+31:i] +// FI +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi32 +FORCE_INLINE __m64 _mm_sign_pi32(__m64 _a, __m64 _b) +{ + int32x2_t a = vreinterpret_s32_m64(_a); + int32x2_t b = vreinterpret_s32_m64(_b); + + // signed shift right: faster than vclt + // (b < 0) ? 0xFFFFFFFF : 0 + uint32x2_t ltMask = vreinterpret_u32_s32(vshr_n_s32(b, 31)); + + // (b == 0) ? 0xFFFFFFFF : 0 +#if defined(__aarch64__) + int32x2_t zeroMask = vreinterpret_s32_u32(vceqz_s32(b)); +#else + int32x2_t zeroMask = vreinterpret_s32_u32(vceq_s32(b, vdup_n_s32(0))); +#endif + + // bitwise select either a or nagative 'a' (vneg_s32(a) return nagative 'a') + // based on ltMask + int32x2_t masked = vbsl_s32(ltMask, vneg_s32(a), a); + // res = masked & (~zeroMask) + int32x2_t res = vbic_s32(masked, zeroMask); + + return vreinterpret_m64_s32(res); +} + +// Negate packed 8-bit integers in a when the corresponding signed 8-bit integer +// in b is negative, and store the results in dst. Element in dst are zeroed out +// when the corresponding element in b is zero. +// +// FOR j := 0 to 7 +// i := j*8 +// IF b[i+7:i] < 0 +// dst[i+7:i] := -(a[i+7:i]) +// ELSE IF b[i+7:i] == 0 +// dst[i+7:i] := 0 +// ELSE +// dst[i+7:i] := a[i+7:i] +// FI +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi8 +FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b) +{ + int8x8_t a = vreinterpret_s8_m64(_a); + int8x8_t b = vreinterpret_s8_m64(_b); + + // signed shift right: faster than vclt + // (b < 0) ? 0xFF : 0 + uint8x8_t ltMask = vreinterpret_u8_s8(vshr_n_s8(b, 7)); + + // (b == 0) ? 
0xFF : 0 +#if defined(__aarch64__) + int8x8_t zeroMask = vreinterpret_s8_u8(vceqz_s8(b)); +#else + int8x8_t zeroMask = vreinterpret_s8_u8(vceq_s8(b, vdup_n_s8(0))); +#endif + + // bitwise select either a or nagative 'a' (vneg_s8(a) return nagative 'a') + // based on ltMask + int8x8_t masked = vbsl_s8(ltMask, vneg_s8(a), a); + // res = masked & (~zeroMask) + int8x8_t res = vbic_s8(masked, zeroMask); + + return vreinterpret_m64_s8(res); +} + +/* SSE4.1 */ + +// Blend packed 16-bit integers from a and b using control mask imm8, and store +// the results in dst. +// +// FOR j := 0 to 7 +// i := j*16 +// IF imm8[j] +// dst[i+15:i] := b[i+15:i] +// ELSE +// dst[i+15:i] := a[i+15:i] +// FI +// ENDFOR +// FORCE_INLINE __m128i _mm_blend_epi16(__m128i a, __m128i b, +// __constrange(0,255) int imm) +#define _mm_blend_epi16(a, b, imm) \ + __extension__({ \ + const uint16_t _mask[8] = {((imm) & (1 << 0)) ? (uint16_t) -1 : 0x0, \ + ((imm) & (1 << 1)) ? (uint16_t) -1 : 0x0, \ + ((imm) & (1 << 2)) ? (uint16_t) -1 : 0x0, \ + ((imm) & (1 << 3)) ? (uint16_t) -1 : 0x0, \ + ((imm) & (1 << 4)) ? (uint16_t) -1 : 0x0, \ + ((imm) & (1 << 5)) ? (uint16_t) -1 : 0x0, \ + ((imm) & (1 << 6)) ? (uint16_t) -1 : 0x0, \ + ((imm) & (1 << 7)) ? (uint16_t) -1 : 0x0}; \ + uint16x8_t _mask_vec = vld1q_u16(_mask); \ + uint16x8_t _a = vreinterpretq_u16_m128i(a); \ + uint16x8_t _b = vreinterpretq_u16_m128i(b); \ + vreinterpretq_m128i_u16(vbslq_u16(_mask_vec, _b, _a)); \ + }) + +// Blend packed double-precision (64-bit) floating-point elements from a and b +// using control mask imm8, and store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blend_pd +#define _mm_blend_pd(a, b, imm) \ + __extension__({ \ + const uint64_t _mask[2] = { \ + ((imm) & (1 << 0)) ? ~UINT64_C(0) : UINT64_C(0), \ + ((imm) & (1 << 1)) ? ~UINT64_C(0) : UINT64_C(0)}; \ + uint64x2_t _mask_vec = vld1q_u64(_mask); \ + uint64x2_t _a = vreinterpretq_u64_m128d(a); \ + uint64x2_t _b = vreinterpretq_u64_m128d(b); \ + vreinterpretq_m128d_u64(vbslq_u64(_mask_vec, _b, _a)); \ + }) + +// Blend packed single-precision (32-bit) floating-point elements from a and b +// using mask, and store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blend_ps +FORCE_INLINE __m128 _mm_blend_ps(__m128 _a, __m128 _b, const char imm8) +{ + const uint32_t ALIGN_STRUCT(16) + data[4] = {((imm8) & (1 << 0)) ? UINT32_MAX : 0, + ((imm8) & (1 << 1)) ? UINT32_MAX : 0, + ((imm8) & (1 << 2)) ? UINT32_MAX : 0, + ((imm8) & (1 << 3)) ? UINT32_MAX : 0}; + uint32x4_t mask = vld1q_u32(data); + float32x4_t a = vreinterpretq_f32_m128(_a); + float32x4_t b = vreinterpretq_f32_m128(_b); + return vreinterpretq_m128_f32(vbslq_f32(mask, b, a)); +} + +// Blend packed 8-bit integers from a and b using mask, and store the results in +// dst. +// +// FOR j := 0 to 15 +// i := j*8 +// IF mask[i+7] +// dst[i+7:i] := b[i+7:i] +// ELSE +// dst[i+7:i] := a[i+7:i] +// FI +// ENDFOR +FORCE_INLINE __m128i _mm_blendv_epi8(__m128i _a, __m128i _b, __m128i _mask) +{ + // Use a signed shift right to create a mask with the sign bit + uint8x16_t mask = + vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_m128i(_mask), 7)); + uint8x16_t a = vreinterpretq_u8_m128i(_a); + uint8x16_t b = vreinterpretq_u8_m128i(_b); + return vreinterpretq_m128i_u8(vbslq_u8(mask, b, a)); +} + +// Blend packed double-precision (64-bit) floating-point elements from a and b +// using mask, and store the results in dst. 
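+//
+// Implementation note (an illustrative reading of the code below): only the
+// sign bit of each mask lane matters. The NEON versions broadcast that bit
+// across its lane with an arithmetic shift right (e.g. vshrq_n_s64(mask, 63))
+// and then select with vbsl, which computes (b & sel) | (a & ~sel) per lane.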
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_pd +FORCE_INLINE __m128d _mm_blendv_pd(__m128d _a, __m128d _b, __m128d _mask) +{ + uint64x2_t mask = + vreinterpretq_u64_s64(vshrq_n_s64(vreinterpretq_s64_m128d(_mask), 63)); +#if defined(__aarch64__) + float64x2_t a = vreinterpretq_f64_m128d(_a); + float64x2_t b = vreinterpretq_f64_m128d(_b); + return vreinterpretq_m128d_f64(vbslq_f64(mask, b, a)); +#else + uint64x2_t a = vreinterpretq_u64_m128d(_a); + uint64x2_t b = vreinterpretq_u64_m128d(_b); + return vreinterpretq_m128d_u64(vbslq_u64(mask, b, a)); +#endif +} + +// Blend packed single-precision (32-bit) floating-point elements from a and b +// using mask, and store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_ps +FORCE_INLINE __m128 _mm_blendv_ps(__m128 _a, __m128 _b, __m128 _mask) +{ + // Use a signed shift right to create a mask with the sign bit + uint32x4_t mask = + vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_m128(_mask), 31)); + float32x4_t a = vreinterpretq_f32_m128(_a); + float32x4_t b = vreinterpretq_f32_m128(_b); + return vreinterpretq_m128_f32(vbslq_f32(mask, b, a)); +} + +// Round the packed double-precision (64-bit) floating-point elements in a up +// to an integer value, and store the results as packed double-precision +// floating-point elements in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_pd +FORCE_INLINE __m128d _mm_ceil_pd(__m128d a) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_f64(vrndpq_f64(vreinterpretq_f64_m128d(a))); +#else + double *f = (double *) &a; + return _mm_set_pd(ceil(f[1]), ceil(f[0])); +#endif +} + +// Round the packed single-precision (32-bit) floating-point elements in a up to +// an integer value, and store the results as packed single-precision +// floating-point elements in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_ps +FORCE_INLINE __m128 _mm_ceil_ps(__m128 a) +{ +#if defined(__aarch64__) + return vreinterpretq_m128_f32(vrndpq_f32(vreinterpretq_f32_m128(a))); +#else + float *f = (float *) &a; + return _mm_set_ps(ceilf(f[3]), ceilf(f[2]), ceilf(f[1]), ceilf(f[0])); +#endif +} + +// Round the lower double-precision (64-bit) floating-point element in b up to +// an integer value, store the result as a double-precision floating-point +// element in the lower element of dst, and copy the upper element from a to the +// upper element of dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_sd +FORCE_INLINE __m128d _mm_ceil_sd(__m128d a, __m128d b) +{ + return _mm_move_sd(a, _mm_ceil_pd(b)); +} + +// Round the lower single-precision (32-bit) floating-point element in b up to +// an integer value, store the result as a single-precision floating-point +// element in the lower element of dst, and copy the upper 3 packed elements +// from a to the upper elements of dst. 
+// +// dst[31:0] := CEIL(b[31:0]) +// dst[127:32] := a[127:32] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_ss +FORCE_INLINE __m128 _mm_ceil_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_ceil_ps(b)); +} + +// Compare packed 64-bit integers in a and b for equality, and store the results +// in dst +FORCE_INLINE __m128i _mm_cmpeq_epi64(__m128i a, __m128i b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128i_u64( + vceqq_u64(vreinterpretq_u64_m128i(a), vreinterpretq_u64_m128i(b))); +#else + // ARMv7 lacks vceqq_u64 + // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi) + uint32x4_t cmp = + vceqq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)); + uint32x4_t swapped = vrev64q_u32(cmp); + return vreinterpretq_m128i_u32(vandq_u32(cmp, swapped)); +#endif +} + +// Converts the four signed 16-bit integers in the lower 64 bits to four signed +// 32-bit integers. +FORCE_INLINE __m128i _mm_cvtepi16_epi32(__m128i a) +{ + return vreinterpretq_m128i_s32( + vmovl_s16(vget_low_s16(vreinterpretq_s16_m128i(a)))); +} + +// Converts the two signed 16-bit integers in the lower 32 bits two signed +// 32-bit integers. +FORCE_INLINE __m128i _mm_cvtepi16_epi64(__m128i a) +{ + int16x8_t s16x8 = vreinterpretq_s16_m128i(a); /* xxxx xxxx xxxx 0B0A */ + int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */ + int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */ + return vreinterpretq_m128i_s64(s64x2); +} + +// Converts the two signed 32-bit integers in the lower 64 bits to two signed +// 64-bit integers. +FORCE_INLINE __m128i _mm_cvtepi32_epi64(__m128i a) +{ + return vreinterpretq_m128i_s64( + vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a)))); +} + +// Converts the four unsigned 8-bit integers in the lower 16 bits to four +// unsigned 32-bit integers. +FORCE_INLINE __m128i _mm_cvtepi8_epi16(__m128i a) +{ + int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx DCBA */ + int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */ + return vreinterpretq_m128i_s16(s16x8); +} + +// Converts the four unsigned 8-bit integers in the lower 32 bits to four +// unsigned 32-bit integers. +FORCE_INLINE __m128i _mm_cvtepi8_epi32(__m128i a) +{ + int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx DCBA */ + int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */ + int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000D 000C 000B 000A */ + return vreinterpretq_m128i_s32(s32x4); +} + +// Converts the two signed 8-bit integers in the lower 32 bits to four +// signed 64-bit integers. +FORCE_INLINE __m128i _mm_cvtepi8_epi64(__m128i a) +{ + int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx xxBA */ + int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0x0x 0B0A */ + int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */ + int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */ + return vreinterpretq_m128i_s64(s64x2); +} + +// Converts the four unsigned 16-bit integers in the lower 64 bits to four +// unsigned 32-bit integers. +FORCE_INLINE __m128i _mm_cvtepu16_epi32(__m128i a) +{ + return vreinterpretq_m128i_u32( + vmovl_u16(vget_low_u16(vreinterpretq_u16_m128i(a)))); +} + +// Converts the two unsigned 16-bit integers in the lower 32 bits to two +// unsigned 64-bit integers. 
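/* The conversions above widen by chaining vmovl_* steps, one doubling of the
 * element width per step. As a scalar reference (cvtepi8_epi64_ref is only a
 * sketch, not used by the header), _mm_cvtepi8_epi64 takes the two lowest
 * signed bytes of a and sign-extends each one to 64 bits: */
static inline void cvtepi8_epi64_ref(const int8_t a[16], int64_t dst[2])
{
    dst[0] = (int64_t) a[0]; /* sign-extended low byte    */
    dst[1] = (int64_t) a[1]; /* sign-extended second byte */
}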
+FORCE_INLINE __m128i _mm_cvtepu16_epi64(__m128i a) +{ + uint16x8_t u16x8 = vreinterpretq_u16_m128i(a); /* xxxx xxxx xxxx 0B0A */ + uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */ + uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */ + return vreinterpretq_m128i_u64(u64x2); +} + +// Converts the two unsigned 32-bit integers in the lower 64 bits to two +// unsigned 64-bit integers. +FORCE_INLINE __m128i _mm_cvtepu32_epi64(__m128i a) +{ + return vreinterpretq_m128i_u64( + vmovl_u32(vget_low_u32(vreinterpretq_u32_m128i(a)))); +} + +// Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers, +// and store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu8_epi16 +FORCE_INLINE __m128i _mm_cvtepu8_epi16(__m128i a) +{ + uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx HGFE DCBA */ + uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0H0G 0F0E 0D0C 0B0A */ + return vreinterpretq_m128i_u16(u16x8); +} + +// Converts the four unsigned 8-bit integers in the lower 32 bits to four +// unsigned 32-bit integers. +// https://msdn.microsoft.com/en-us/library/bb531467%28v=vs.100%29.aspx +FORCE_INLINE __m128i _mm_cvtepu8_epi32(__m128i a) +{ + uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx DCBA */ + uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0D0C 0B0A */ + uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000D 000C 000B 000A */ + return vreinterpretq_m128i_u32(u32x4); +} + +// Converts the two unsigned 8-bit integers in the lower 16 bits to two +// unsigned 64-bit integers. +FORCE_INLINE __m128i _mm_cvtepu8_epi64(__m128i a) +{ + uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx xxBA */ + uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0x0x 0B0A */ + uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */ + uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */ + return vreinterpretq_m128i_u64(u64x2); +} + +// Conditionally multiply the packed double-precision (64-bit) floating-point +// elements in a and b using the high 4 bits in imm8, sum the four products, and +// conditionally store the sum in dst using the low 4 bits of imm8. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dp_pd +FORCE_INLINE __m128d _mm_dp_pd(__m128d a, __m128d b, const int imm) +{ + // Generate mask value from constant immediate bit value + const int64_t bit0Mask = imm & 0x01 ? UINT64_MAX : 0; + const int64_t bit1Mask = imm & 0x02 ? UINT64_MAX : 0; +#if !SSE2NEON_PRECISE_DP + const int64_t bit4Mask = imm & 0x10 ? UINT64_MAX : 0; + const int64_t bit5Mask = imm & 0x20 ? UINT64_MAX : 0; +#endif + // Conditional multiplication +#if !SSE2NEON_PRECISE_DP + __m128d mul = _mm_mul_pd(a, b); + const __m128d mulMask = + _mm_castsi128_pd(_mm_set_epi64x(bit5Mask, bit4Mask)); + __m128d tmp = _mm_and_pd(mul, mulMask); +#else +#if defined(__aarch64__) + double d0 = (imm & 0x10) ? vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0) * + vgetq_lane_f64(vreinterpretq_f64_m128d(b), 0) + : 0; + double d1 = (imm & 0x20) ? vgetq_lane_f64(vreinterpretq_f64_m128d(a), 1) * + vgetq_lane_f64(vreinterpretq_f64_m128d(b), 1) + : 0; +#else + double d0 = (imm & 0x10) ? ((double *) &a)[0] * ((double *) &b)[0] : 0; + double d1 = (imm & 0x20) ? 
((double *) &a)[1] * ((double *) &b)[1] : 0; +#endif + __m128d tmp = _mm_set_pd(d1, d0); +#endif + // Sum the products +#if defined(__aarch64__) + double sum = vpaddd_f64(vreinterpretq_f64_m128d(tmp)); +#else + double sum = *((double *) &tmp) + *(((double *) &tmp) + 1); +#endif + // Conditionally store the sum + const __m128d sumMask = + _mm_castsi128_pd(_mm_set_epi64x(bit1Mask, bit0Mask)); + __m128d res = _mm_and_pd(_mm_set_pd1(sum), sumMask); + return res; +} + +// Conditionally multiply the packed single-precision (32-bit) floating-point +// elements in a and b using the high 4 bits in imm8, sum the four products, +// and conditionally store the sum in dst using the low 4 bits of imm. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dp_ps +FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm) +{ +#if defined(__aarch64__) + /* shortcuts */ + if (imm == 0xFF) { + return _mm_set1_ps(vaddvq_f32(_mm_mul_ps(a, b))); + } + if (imm == 0x7F) { + float32x4_t m = _mm_mul_ps(a, b); + m[3] = 0; + return _mm_set1_ps(vaddvq_f32(m)); + } +#endif + + float s = 0, c = 0; + float32x4_t f32a = vreinterpretq_f32_m128(a); + float32x4_t f32b = vreinterpretq_f32_m128(b); + + /* To improve the accuracy of floating-point summation, Kahan algorithm + * is used for each operation. + */ + if (imm & (1 << 4)) + _sse2neon_kadd_f32(&s, &c, f32a[0] * f32b[0]); + if (imm & (1 << 5)) + _sse2neon_kadd_f32(&s, &c, f32a[1] * f32b[1]); + if (imm & (1 << 6)) + _sse2neon_kadd_f32(&s, &c, f32a[2] * f32b[2]); + if (imm & (1 << 7)) + _sse2neon_kadd_f32(&s, &c, f32a[3] * f32b[3]); + s += c; + + float32x4_t res = { + (imm & 0x1) ? s : 0, + (imm & 0x2) ? s : 0, + (imm & 0x4) ? s : 0, + (imm & 0x8) ? s : 0, + }; + return vreinterpretq_m128_f32(res); +} + +// Extracts the selected signed or unsigned 32-bit integer from a and zero +// extends. +// FORCE_INLINE int _mm_extract_epi32(__m128i a, __constrange(0,4) int imm) +#define _mm_extract_epi32(a, imm) \ + vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm)) + +// Extracts the selected signed or unsigned 64-bit integer from a and zero +// extends. +// FORCE_INLINE __int64 _mm_extract_epi64(__m128i a, __constrange(0,2) int imm) +#define _mm_extract_epi64(a, imm) \ + vgetq_lane_s64(vreinterpretq_s64_m128i(a), (imm)) + +// Extracts the selected signed or unsigned 8-bit integer from a and zero +// extends. +// FORCE_INLINE int _mm_extract_epi8(__m128i a, __constrange(0,16) int imm) +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_extract_epi8 +#define _mm_extract_epi8(a, imm) vgetq_lane_u8(vreinterpretq_u8_m128i(a), (imm)) + +// Extracts the selected single-precision (32-bit) floating-point from a. +// FORCE_INLINE int _mm_extract_ps(__m128 a, __constrange(0,4) int imm) +#define _mm_extract_ps(a, imm) vgetq_lane_s32(vreinterpretq_s32_m128(a), (imm)) + +// Round the packed double-precision (64-bit) floating-point elements in a down +// to an integer value, and store the results as packed double-precision +// floating-point elements in dst. 
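/* _mm_dp_ps above accumulates its products through _sse2neon_kadd_f32. A
 * compensated (Kahan) accumulator of that shape looks roughly like the sketch
 * below; kahan_add_ref is illustrative only and the header's helper may
 * differ in detail: */
static inline void kahan_add_ref(float *sum, float *comp, float x)
{
    float y = x - *comp;    /* apply the running compensation       */
    float t = *sum + y;     /* low-order bits of y may be lost here */
    *comp = (t - *sum) - y; /* recover what was lost                */
    *sum = t;               /* new, compensated running sum         */
}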
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_pd +FORCE_INLINE __m128d _mm_floor_pd(__m128d a) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_f64(vrndmq_f64(vreinterpretq_f64_m128d(a))); +#else + double *f = (double *) &a; + return _mm_set_pd(floor(f[1]), floor(f[0])); +#endif +} + +// Round the packed single-precision (32-bit) floating-point elements in a down +// to an integer value, and store the results as packed single-precision +// floating-point elements in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_ps +FORCE_INLINE __m128 _mm_floor_ps(__m128 a) +{ +#if defined(__aarch64__) + return vreinterpretq_m128_f32(vrndmq_f32(vreinterpretq_f32_m128(a))); +#else + float *f = (float *) &a; + return _mm_set_ps(floorf(f[3]), floorf(f[2]), floorf(f[1]), floorf(f[0])); +#endif +} + +// Round the lower double-precision (64-bit) floating-point element in b down to +// an integer value, store the result as a double-precision floating-point +// element in the lower element of dst, and copy the upper element from a to the +// upper element of dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_sd +FORCE_INLINE __m128d _mm_floor_sd(__m128d a, __m128d b) +{ + return _mm_move_sd(a, _mm_floor_pd(b)); +} + +// Round the lower single-precision (32-bit) floating-point element in b down to +// an integer value, store the result as a single-precision floating-point +// element in the lower element of dst, and copy the upper 3 packed elements +// from a to the upper elements of dst. +// +// dst[31:0] := FLOOR(b[31:0]) +// dst[127:32] := a[127:32] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_ss +FORCE_INLINE __m128 _mm_floor_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_floor_ps(b)); +} + +// Inserts the least significant 32 bits of b into the selected 32-bit integer +// of a. +// FORCE_INLINE __m128i _mm_insert_epi32(__m128i a, int b, +// __constrange(0,4) int imm) +#define _mm_insert_epi32(a, b, imm) \ + __extension__({ \ + vreinterpretq_m128i_s32( \ + vsetq_lane_s32((b), vreinterpretq_s32_m128i(a), (imm))); \ + }) + +// Inserts the least significant 64 bits of b into the selected 64-bit integer +// of a. +// FORCE_INLINE __m128i _mm_insert_epi64(__m128i a, __int64 b, +// __constrange(0,2) int imm) +#define _mm_insert_epi64(a, b, imm) \ + __extension__({ \ + vreinterpretq_m128i_s64( \ + vsetq_lane_s64((b), vreinterpretq_s64_m128i(a), (imm))); \ + }) + +// Inserts the least significant 8 bits of b into the selected 8-bit integer +// of a. +// FORCE_INLINE __m128i _mm_insert_epi8(__m128i a, int b, +// __constrange(0,16) int imm) +#define _mm_insert_epi8(a, b, imm) \ + __extension__({ \ + vreinterpretq_m128i_s8( \ + vsetq_lane_s8((b), vreinterpretq_s8_m128i(a), (imm))); \ + }) + +// Copy a to tmp, then insert a single-precision (32-bit) floating-point +// element from b into tmp using the control in imm8. Store tmp to dst using +// the mask in imm8 (elements are zeroed out when the corresponding bit is set). +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=insert_ps +#define _mm_insert_ps(a, b, imm8) \ + __extension__({ \ + float32x4_t tmp1 = \ + vsetq_lane_f32(vgetq_lane_f32(b, (imm8 >> 6) & 0x3), \ + vreinterpretq_f32_m128(a), 0); \ + float32x4_t tmp2 = \ + vsetq_lane_f32(vgetq_lane_f32(tmp1, 0), vreinterpretq_f32_m128(a), \ + ((imm8 >> 4) & 0x3)); \ + const uint32_t data[4] = {((imm8) & (1 << 0)) ? 
UINT32_MAX : 0, \ + ((imm8) & (1 << 1)) ? UINT32_MAX : 0, \ + ((imm8) & (1 << 2)) ? UINT32_MAX : 0, \ + ((imm8) & (1 << 3)) ? UINT32_MAX : 0}; \ + uint32x4_t mask = vld1q_u32(data); \ + float32x4_t all_zeros = vdupq_n_f32(0); \ + \ + vreinterpretq_m128_f32( \ + vbslq_f32(mask, all_zeros, vreinterpretq_f32_m128(tmp2))); \ + }) + +// epi versions of min/max +// Computes the pariwise maximums of the four signed 32-bit integer values of a +// and b. +// +// A 128-bit parameter that can be defined with the following equations: +// r0 := (a0 > b0) ? a0 : b0 +// r1 := (a1 > b1) ? a1 : b1 +// r2 := (a2 > b2) ? a2 : b2 +// r3 := (a3 > b3) ? a3 : b3 +// +// https://msdn.microsoft.com/en-us/library/vstudio/bb514055(v=vs.100).aspx +FORCE_INLINE __m128i _mm_max_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + vmaxq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Compare packed signed 8-bit integers in a and b, and store packed maximum +// values in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epi8 +FORCE_INLINE __m128i _mm_max_epi8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s8( + vmaxq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Compare packed unsigned 16-bit integers in a and b, and store packed maximum +// values in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu16 +FORCE_INLINE __m128i _mm_max_epu16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u16( + vmaxq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b))); +} + +// Compare packed unsigned 32-bit integers in a and b, and store packed maximum +// values in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu32 +FORCE_INLINE __m128i _mm_max_epu32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u32( + vmaxq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b))); +} + +// Computes the pariwise minima of the four signed 32-bit integer values of a +// and b. +// +// A 128-bit parameter that can be defined with the following equations: +// r0 := (a0 < b0) ? a0 : b0 +// r1 := (a1 < b1) ? a1 : b1 +// r2 := (a2 < b2) ? a2 : b2 +// r3 := (a3 < b3) ? a3 : b3 +// +// https://msdn.microsoft.com/en-us/library/vstudio/bb531476(v=vs.100).aspx +FORCE_INLINE __m128i _mm_min_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + vminq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Compare packed signed 8-bit integers in a and b, and store packed minimum +// values in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epi8 +FORCE_INLINE __m128i _mm_min_epi8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s8( + vminq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Compare packed unsigned 16-bit integers in a and b, and store packed minimum +// values in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epu16 +FORCE_INLINE __m128i _mm_min_epu16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u16( + vminq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b))); +} + +// Compare packed unsigned 32-bit integers in a and b, and store packed minimum +// values in dst. 
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu32 +FORCE_INLINE __m128i _mm_min_epu32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u32( + vminq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b))); +} + +// Horizontally compute the minimum amongst the packed unsigned 16-bit integers +// in a, store the minimum and index in dst, and zero the remaining bits in dst. +// +// index[2:0] := 0 +// min[15:0] := a[15:0] +// FOR j := 0 to 7 +// i := j*16 +// IF a[i+15:i] < min[15:0] +// index[2:0] := j +// min[15:0] := a[i+15:i] +// FI +// ENDFOR +// dst[15:0] := min[15:0] +// dst[18:16] := index[2:0] +// dst[127:19] := 0 +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_minpos_epu16 +FORCE_INLINE __m128i _mm_minpos_epu16(__m128i a) +{ + __m128i dst; + uint16_t min, idx = 0; + // Find the minimum value +#if defined(__aarch64__) + min = vminvq_u16(vreinterpretq_u16_m128i(a)); +#else + __m64 tmp; + tmp = vreinterpret_m64_u16( + vmin_u16(vget_low_u16(vreinterpretq_u16_m128i(a)), + vget_high_u16(vreinterpretq_u16_m128i(a)))); + tmp = vreinterpret_m64_u16( + vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp))); + tmp = vreinterpret_m64_u16( + vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp))); + min = vget_lane_u16(vreinterpret_u16_m64(tmp), 0); +#endif + // Get the index of the minimum value + int i; + for (i = 0; i < 8; i++) { + if (min == vgetq_lane_u16(vreinterpretq_u16_m128i(a), 0)) { + idx = (uint16_t) i; + break; + } + a = _mm_srli_si128(a, 2); + } + // Generate result + dst = _mm_setzero_si128(); + dst = vreinterpretq_m128i_u16( + vsetq_lane_u16(min, vreinterpretq_u16_m128i(dst), 0)); + dst = vreinterpretq_m128i_u16( + vsetq_lane_u16(idx, vreinterpretq_u16_m128i(dst), 1)); + return dst; +} + +// Compute the sum of absolute differences (SADs) of quadruplets of unsigned +// 8-bit integers in a compared to those in b, and store the 16-bit results in +// dst. Eight SADs are performed using one quadruplet from b and eight +// quadruplets from a. One quadruplet is selected from b starting at on the +// offset specified in imm8. Eight quadruplets are formed from sequential 8-bit +// integers selected from a starting at the offset specified in imm8. 
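/* Scalar reference for the _mm_minpos_epu16 result layout above
 * (minpos_epu16_ref is a sketch, not part of the header): the minimum value
 * lands in lane 0, its index in lane 1, and the remaining lanes are zeroed. */
static inline void minpos_epu16_ref(const uint16_t a[8], uint16_t dst[8])
{
    uint16_t min = a[0], idx = 0;
    for (int j = 1; j < 8; j++) {
        if (a[j] < min) {
            min = a[j];
            idx = (uint16_t) j;
        }
    }
    for (int j = 0; j < 8; j++)
        dst[j] = 0;
    dst[0] = min; /* dst[15:0]  = minimum value */
    dst[1] = idx; /* dst[18:16] = its index     */
}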
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mpsadbw_epu8 +FORCE_INLINE __m128i _mm_mpsadbw_epu8(__m128i a, __m128i b, const int imm) +{ + uint8x16_t _a, _b; + + switch (imm & 0x4) { + case 0: + // do nothing + _a = vreinterpretq_u8_m128i(a); + break; + case 4: + _a = vreinterpretq_u8_u32(vextq_u32(vreinterpretq_u32_m128i(a), + vreinterpretq_u32_m128i(a), 1)); + break; + default: +#if defined(__GNUC__) || defined(__clang__) + __builtin_unreachable(); +#endif + break; + } + + switch (imm & 0x3) { + case 0: + _b = vreinterpretq_u8_u32( + vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 0))); + break; + case 1: + _b = vreinterpretq_u8_u32( + vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 1))); + break; + case 2: + _b = vreinterpretq_u8_u32( + vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 2))); + break; + case 3: + _b = vreinterpretq_u8_u32( + vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 3))); + break; + default: +#if defined(__GNUC__) || defined(__clang__) + __builtin_unreachable(); +#endif + break; + } + + int16x8_t c04, c15, c26, c37; + uint8x8_t low_b = vget_low_u8(_b); + c04 = vabsq_s16(vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(_a), low_b))); + _a = vextq_u8(_a, _a, 1); + c15 = vabsq_s16(vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(_a), low_b))); + _a = vextq_u8(_a, _a, 1); + c26 = vabsq_s16(vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(_a), low_b))); + _a = vextq_u8(_a, _a, 1); + c37 = vabsq_s16(vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(_a), low_b))); +#if defined(__aarch64__) + // |0|4|2|6| + c04 = vpaddq_s16(c04, c26); + // |1|5|3|7| + c15 = vpaddq_s16(c15, c37); + + int32x4_t trn1_c = + vtrn1q_s32(vreinterpretq_s32_s16(c04), vreinterpretq_s32_s16(c15)); + int32x4_t trn2_c = + vtrn2q_s32(vreinterpretq_s32_s16(c04), vreinterpretq_s32_s16(c15)); + return vreinterpretq_m128i_s16(vpaddq_s16(vreinterpretq_s16_s32(trn1_c), + vreinterpretq_s16_s32(trn2_c))); +#else + int16x4_t c01, c23, c45, c67; + c01 = vpadd_s16(vget_low_s16(c04), vget_low_s16(c15)); + c23 = vpadd_s16(vget_low_s16(c26), vget_low_s16(c37)); + c45 = vpadd_s16(vget_high_s16(c04), vget_high_s16(c15)); + c67 = vpadd_s16(vget_high_s16(c26), vget_high_s16(c37)); + + return vreinterpretq_m128i_s16( + vcombine_s16(vpadd_s16(c01, c23), vpadd_s16(c45, c67))); +#endif +} + +// Multiply the low signed 32-bit integers from each packed 64-bit element in +// a and b, and store the signed 64-bit results in dst. +// +// r0 := (int64_t)(int32_t)a0 * (int64_t)(int32_t)b0 +// r1 := (int64_t)(int32_t)a2 * (int64_t)(int32_t)b2 +FORCE_INLINE __m128i _mm_mul_epi32(__m128i a, __m128i b) +{ + // vmull_s32 upcasts instead of masking, so we downcast. + int32x2_t a_lo = vmovn_s64(vreinterpretq_s64_m128i(a)); + int32x2_t b_lo = vmovn_s64(vreinterpretq_s64_m128i(b)); + return vreinterpretq_m128i_s64(vmull_s32(a_lo, b_lo)); +} + +// Multiplies the 4 signed or unsigned 32-bit integers from a by the 4 signed or +// unsigned 32-bit integers from b. +// https://msdn.microsoft.com/en-us/library/vstudio/bb531409(v=vs.100).aspx +FORCE_INLINE __m128i _mm_mullo_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + vmulq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Packs the 8 unsigned 32-bit integers from a and b into unsigned 16-bit +// integers and saturates. 
+// +// r0 := UnsignedSaturate(a0) +// r1 := UnsignedSaturate(a1) +// r2 := UnsignedSaturate(a2) +// r3 := UnsignedSaturate(a3) +// r4 := UnsignedSaturate(b0) +// r5 := UnsignedSaturate(b1) +// r6 := UnsignedSaturate(b2) +// r7 := UnsignedSaturate(b3) +FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u16( + vcombine_u16(vqmovun_s32(vreinterpretq_s32_m128i(a)), + vqmovun_s32(vreinterpretq_s32_m128i(b)))); +} + +// Round the packed double-precision (64-bit) floating-point elements in a using +// the rounding parameter, and store the results as packed double-precision +// floating-point elements in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_pd +FORCE_INLINE __m128d _mm_round_pd(__m128d a, int rounding) +{ +#if defined(__aarch64__) + switch (rounding) { + case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC): + return vreinterpretq_m128d_f64(vrndnq_f64(vreinterpretq_f64_m128d(a))); + case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC): + return _mm_floor_pd(a); + case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC): + return _mm_ceil_pd(a); + case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC): + return vreinterpretq_m128d_f64(vrndq_f64(vreinterpretq_f64_m128d(a))); + default: //_MM_FROUND_CUR_DIRECTION + return vreinterpretq_m128d_f64(vrndiq_f64(vreinterpretq_f64_m128d(a))); + } +#else + double *v_double = (double *) &a; + + if (rounding == (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) || + (rounding == _MM_FROUND_CUR_DIRECTION && + _MM_GET_ROUNDING_MODE() == _MM_ROUND_NEAREST)) { + double res[2], tmp; + for (int i = 0; i < 2; i++) { + tmp = (v_double[i] < 0) ? -v_double[i] : v_double[i]; + double roundDown = floor(tmp); // Round down value + double roundUp = ceil(tmp); // Round up value + double diffDown = tmp - roundDown; + double diffUp = roundUp - tmp; + if (diffDown < diffUp) { + /* If it's closer to the round down value, then use it */ + res[i] = roundDown; + } else if (diffDown > diffUp) { + /* If it's closer to the round up value, then use it */ + res[i] = roundUp; + } else { + /* If it's equidistant between round up and round down value, + * pick the one which is an even number */ + double half = roundDown / 2; + if (half != floor(half)) { + /* If the round down value is odd, return the round up value + */ + res[i] = roundUp; + } else { + /* If the round up value is odd, return the round down value + */ + res[i] = roundDown; + } + } + res[i] = (v_double[i] < 0) ? -res[i] : res[i]; + } + return _mm_set_pd(res[1], res[0]); + } else if (rounding == (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) || + (rounding == _MM_FROUND_CUR_DIRECTION && + _MM_GET_ROUNDING_MODE() == _MM_ROUND_DOWN)) { + return _mm_floor_pd(a); + } else if (rounding == (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) || + (rounding == _MM_FROUND_CUR_DIRECTION && + _MM_GET_ROUNDING_MODE() == _MM_ROUND_UP)) { + return _mm_ceil_pd(a); + } + return _mm_set_pd(v_double[1] > 0 ? floor(v_double[1]) : ceil(v_double[1]), + v_double[0] > 0 ? floor(v_double[0]) : ceil(v_double[0])); +#endif +} + +// Round the packed single-precision (32-bit) floating-point elements in a using +// the rounding parameter, and store the results as packed single-precision +// floating-point elements in dst. 
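/* The ARMv7 fallback in _mm_round_pd above hand-implements the default x86
 * rounding mode, round-half-to-even. A compact scalar sketch of that
 * tie-breaking rule (round_half_even_ref is illustrative only): */
static inline double round_half_even_ref(double x)
{
    double lo = floor(x), hi = ceil(x);
    if (x - lo < hi - x)
        return lo; /* closer to the value below */
    if (x - lo > hi - x)
        return hi; /* closer to the value above */
    return (fmod(lo, 2.0) == 0.0) ? lo : hi; /* exact tie: pick the even one */
}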
+// software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ps +FORCE_INLINE __m128 _mm_round_ps(__m128 a, int rounding) +{ +#if defined(__aarch64__) + switch (rounding) { + case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC): + return vreinterpretq_m128_f32(vrndnq_f32(vreinterpretq_f32_m128(a))); + case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC): + return _mm_floor_ps(a); + case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC): + return _mm_ceil_ps(a); + case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC): + return vreinterpretq_m128_f32(vrndq_f32(vreinterpretq_f32_m128(a))); + default: //_MM_FROUND_CUR_DIRECTION + return vreinterpretq_m128_f32(vrndiq_f32(vreinterpretq_f32_m128(a))); + } +#else + float *v_float = (float *) &a; + + if (rounding == (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) || + (rounding == _MM_FROUND_CUR_DIRECTION && + _MM_GET_ROUNDING_MODE() == _MM_ROUND_NEAREST)) { + uint32x4_t signmask = vdupq_n_u32(0x80000000); + float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a), + vdupq_n_f32(0.5f)); /* +/- 0.5 */ + int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32( + vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/ + int32x4_t r_trunc = vcvtq_s32_f32( + vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */ + int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32( + vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */ + int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone), + vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */ + float32x4_t delta = vsubq_f32( + vreinterpretq_f32_m128(a), + vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */ + uint32x4_t is_delta_half = + vceqq_f32(delta, half); /* delta == +/- 0.5 */ + return vreinterpretq_m128_f32( + vcvtq_f32_s32(vbslq_s32(is_delta_half, r_even, r_normal))); + } else if (rounding == (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) || + (rounding == _MM_FROUND_CUR_DIRECTION && + _MM_GET_ROUNDING_MODE() == _MM_ROUND_DOWN)) { + return _mm_floor_ps(a); + } else if (rounding == (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) || + (rounding == _MM_FROUND_CUR_DIRECTION && + _MM_GET_ROUNDING_MODE() == _MM_ROUND_UP)) { + return _mm_ceil_ps(a); + } + return _mm_set_ps(v_float[3] > 0 ? floorf(v_float[3]) : ceilf(v_float[3]), + v_float[2] > 0 ? floorf(v_float[2]) : ceilf(v_float[2]), + v_float[1] > 0 ? floorf(v_float[1]) : ceilf(v_float[1]), + v_float[0] > 0 ? floorf(v_float[0]) : ceilf(v_float[0])); +#endif +} + +// Round the lower double-precision (64-bit) floating-point element in b using +// the rounding parameter, store the result as a double-precision floating-point +// element in the lower element of dst, and copy the upper element from a to the +// upper element of dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_sd +FORCE_INLINE __m128d _mm_round_sd(__m128d a, __m128d b, int rounding) +{ + return _mm_move_sd(a, _mm_round_pd(b, rounding)); +} + +// Round the lower single-precision (32-bit) floating-point element in b using +// the rounding parameter, store the result as a single-precision floating-point +// element in the lower element of dst, and copy the upper 3 packed elements +// from a to the upper elements of dst. 
Rounding is done according to the +// rounding[3:0] parameter, which can be one of: +// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and +// suppress exceptions +// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and +// suppress exceptions +// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress +// exceptions +// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress +// exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see +// _MM_SET_ROUNDING_MODE +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ss +FORCE_INLINE __m128 _mm_round_ss(__m128 a, __m128 b, int rounding) +{ + return _mm_move_ss(a, _mm_round_ps(b, rounding)); +} + +// Load 128-bits of integer data from memory into dst using a non-temporal +// memory hint. mem_addr must be aligned on a 16-byte boundary or a +// general-protection exception may be generated. +// +// dst[127:0] := MEM[mem_addr+127:mem_addr] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_load_si128 +FORCE_INLINE __m128i _mm_stream_load_si128(__m128i *p) +{ +#if __has_builtin(__builtin_nontemporal_store) + return __builtin_nontemporal_load(p); +#else + return vreinterpretq_m128i_s64(vld1q_s64((int64_t *) p)); +#endif +} + +// Compute the bitwise NOT of a and then AND with a 128-bit vector containing +// all 1's, and return 1 if the result is zero, otherwise return 0. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_ones +FORCE_INLINE int _mm_test_all_ones(__m128i a) +{ + return (uint64_t)(vgetq_lane_s64(a, 0) & vgetq_lane_s64(a, 1)) == + ~(uint64_t) 0; +} + +// Compute the bitwise AND of 128 bits (representing integer data) in a and +// mask, and return 1 if the result is zero, otherwise return 0. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_zeros +FORCE_INLINE int _mm_test_all_zeros(__m128i a, __m128i mask) +{ + int64x2_t a_and_mask = + vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(mask)); + return !(vgetq_lane_s64(a_and_mask, 0) | vgetq_lane_s64(a_and_mask, 1)); +} + +// Compute the bitwise AND of 128 bits (representing integer data) in a and +// mask, and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute +// the bitwise NOT of a and then AND with mask, and set CF to 1 if the result is +// zero, otherwise set CF to 0. Return 1 if both the ZF and CF values are zero, +// otherwise return 0. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_test_mix_ones_zero +FORCE_INLINE int _mm_test_mix_ones_zeros(__m128i a, __m128i mask) +{ + uint64x2_t zf = + vandq_u64(vreinterpretq_u64_m128i(mask), vreinterpretq_u64_m128i(a)); + uint64x2_t cf = + vbicq_u64(vreinterpretq_u64_m128i(mask), vreinterpretq_u64_m128i(a)); + uint64x2_t result = vandq_u64(zf, cf); + return !(vgetq_lane_u64(result, 0) | vgetq_lane_u64(result, 1)); +} + +// Compute the bitwise AND of 128 bits (representing integer data) in a and b, +// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the +// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero, +// otherwise set CF to 0. Return the CF value. 
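/* The SSE4.1 test helpers above all derive from the same two PTEST flags.
 * Scalar reference on the two 64-bit halves of each operand (testz_ref and
 * testc_ref are sketches, not used by the header):
 *   ZF = ((a & b) == 0)   -> _mm_testz_si128
 *   CF = ((~a & b) == 0)  -> _mm_testc_si128
 *   _mm_testnzc_si128 returns 1 only when both flags are 0. */
static inline int testz_ref(uint64_t a_lo, uint64_t a_hi,
                            uint64_t b_lo, uint64_t b_hi)
{
    return ((a_lo & b_lo) | (a_hi & b_hi)) == 0;
}

static inline int testc_ref(uint64_t a_lo, uint64_t a_hi,
                            uint64_t b_lo, uint64_t b_hi)
{
    return ((~a_lo & b_lo) | (~a_hi & b_hi)) == 0;
}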
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testc_si128 +FORCE_INLINE int _mm_testc_si128(__m128i a, __m128i b) +{ + int64x2_t s64 = + vandq_s64(vreinterpretq_s64_s32(vmvnq_s32(vreinterpretq_s32_m128i(a))), + vreinterpretq_s64_m128i(b)); + return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1)); +} + +// Compute the bitwise AND of 128 bits (representing integer data) in a and b, +// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the +// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero, +// otherwise set CF to 0. Return 1 if both the ZF and CF values are zero, +// otherwise return 0. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testnzc_si128 +#define _mm_testnzc_si128(a, b) _mm_test_mix_ones_zeros(a, b) + +// Compute the bitwise AND of 128 bits (representing integer data) in a and b, +// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the +// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero, +// otherwise set CF to 0. Return the ZF value. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testz_si128 +FORCE_INLINE int _mm_testz_si128(__m128i a, __m128i b) +{ + int64x2_t s64 = + vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)); + return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1)); +} + +/* SSE4.2 */ + +// Compares the 2 signed 64-bit integers in a and the 2 signed 64-bit integers +// in b for greater than. +FORCE_INLINE __m128i _mm_cmpgt_epi64(__m128i a, __m128i b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128i_u64( + vcgtq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b))); +#else + return vreinterpretq_m128i_s64(vshrq_n_s64( + vqsubq_s64(vreinterpretq_s64_m128i(b), vreinterpretq_s64_m128i(a)), + 63)); +#endif +} + +// Starting with the initial value in crc, accumulates a CRC32 value for +// unsigned 16-bit integer v. +// https://msdn.microsoft.com/en-us/library/bb531411(v=vs.100) +FORCE_INLINE uint32_t _mm_crc32_u16(uint32_t crc, uint16_t v) +{ +#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) + __asm__ __volatile__("crc32ch %w[c], %w[c], %w[v]\n\t" + : [c] "+r"(crc) + : [v] "r"(v)); +#else + crc = _mm_crc32_u8(crc, v & 0xff); + crc = _mm_crc32_u8(crc, (v >> 8) & 0xff); +#endif + return crc; +} + +// Starting with the initial value in crc, accumulates a CRC32 value for +// unsigned 32-bit integer v. +// https://msdn.microsoft.com/en-us/library/bb531394(v=vs.100) +FORCE_INLINE uint32_t _mm_crc32_u32(uint32_t crc, uint32_t v) +{ +#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) + __asm__ __volatile__("crc32cw %w[c], %w[c], %w[v]\n\t" + : [c] "+r"(crc) + : [v] "r"(v)); +#else + crc = _mm_crc32_u16(crc, v & 0xffff); + crc = _mm_crc32_u16(crc, (v >> 16) & 0xffff); +#endif + return crc; +} + +// Starting with the initial value in crc, accumulates a CRC32 value for +// unsigned 64-bit integer v. +// https://msdn.microsoft.com/en-us/library/bb514033(v=vs.100) +FORCE_INLINE uint64_t _mm_crc32_u64(uint64_t crc, uint64_t v) +{ +#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) + __asm__ __volatile__("crc32cx %w[c], %w[c], %x[v]\n\t" + : [c] "+r"(crc) + : [v] "r"(v)); +#else + crc = _mm_crc32_u32((uint32_t)(crc), v & 0xffffffff); + crc = _mm_crc32_u32((uint32_t)(crc), (v >> 32) & 0xffffffff); +#endif + return crc; +} + +// Starting with the initial value in crc, accumulates a CRC32 value for +// unsigned 8-bit integer v. 
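/* Typical use of the CRC32-C accumulators above: fold a byte buffer into a
 * running checksum. crc32c_buffer_ref is a usage sketch; the 0xFFFFFFFF seed
 * and final inversion follow the common CRC-32C convention and are the
 * caller's choice, not something the intrinsics impose. */
static inline uint32_t crc32c_buffer_ref(const uint8_t *p, size_t n)
{
    uint32_t crc = 0xFFFFFFFF;
    for (size_t i = 0; i < n; i++)
        crc = _mm_crc32_u8(crc, p[i]);
    return ~crc;
}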
+// https://msdn.microsoft.com/en-us/library/bb514036(v=vs.100) +FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v) +{ +#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) + __asm__ __volatile__("crc32cb %w[c], %w[c], %w[v]\n\t" + : [c] "+r"(crc) + : [v] "r"(v)); +#else + crc ^= v; + for (int bit = 0; bit < 8; bit++) { + if (crc & 1) + crc = (crc >> 1) ^ UINT32_C(0x82f63b78); + else + crc = (crc >> 1); + } +#endif + return crc; +} + +/* AES */ + +#if !defined(__ARM_FEATURE_CRYPTO) +/* clang-format off */ +#define SSE2NEON_AES_DATA(w) \ + { \ + w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), \ + w(0xc5), w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), \ + w(0xab), w(0x76), w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), \ + w(0x59), w(0x47), w(0xf0), w(0xad), w(0xd4), w(0xa2), w(0xaf), \ + w(0x9c), w(0xa4), w(0x72), w(0xc0), w(0xb7), w(0xfd), w(0x93), \ + w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc), w(0x34), w(0xa5), \ + w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15), w(0x04), \ + w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a), \ + w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), \ + w(0x75), w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), \ + w(0x5a), w(0xa0), w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), \ + w(0xe3), w(0x2f), w(0x84), w(0x53), w(0xd1), w(0x00), w(0xed), \ + w(0x20), w(0xfc), w(0xb1), w(0x5b), w(0x6a), w(0xcb), w(0xbe), \ + w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf), w(0xd0), w(0xef), \ + w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85), w(0x45), \ + w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8), \ + w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), \ + w(0xf5), w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), \ + w(0xf3), w(0xd2), w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), \ + w(0x97), w(0x44), w(0x17), w(0xc4), w(0xa7), w(0x7e), w(0x3d), \ + w(0x64), w(0x5d), w(0x19), w(0x73), w(0x60), w(0x81), w(0x4f), \ + w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88), w(0x46), w(0xee), \ + w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb), w(0xe0), \ + w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c), \ + w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), \ + w(0x79), w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), \ + w(0x4e), w(0xa9), w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), \ + w(0x7a), w(0xae), w(0x08), w(0xba), w(0x78), w(0x25), w(0x2e), \ + w(0x1c), w(0xa6), w(0xb4), w(0xc6), w(0xe8), w(0xdd), w(0x74), \ + w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a), w(0x70), w(0x3e), \ + w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e), w(0x61), \ + w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e), \ + w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), \ + w(0x94), w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), \ + w(0x28), w(0xdf), w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), \ + w(0xe6), w(0x42), w(0x68), w(0x41), w(0x99), w(0x2d), w(0x0f), \ + w(0xb0), w(0x54), w(0xbb), w(0x16) \ + } +/* clang-format on */ + +/* X Macro trick. See https://en.wikipedia.org/wiki/X_Macro */ +#define SSE2NEON_AES_H0(x) (x) +static const uint8_t SSE2NEON_sbox[256] = SSE2NEON_AES_DATA(SSE2NEON_AES_H0); +#undef SSE2NEON_AES_H0 + +// In the absence of crypto extensions, implement aesenc using regular neon +// intrinsics instead. 
See: +// https://www.workofard.com/2017/01/accelerated-aes-for-the-arm64-linux-kernel/ +// https://www.workofard.com/2017/07/ghash-for-low-end-cores/ and +// https://github.com/ColinIanKing/linux-next-mirror/blob/b5f466091e130caaf0735976648f72bd5e09aa84/crypto/aegis128-neon-inner.c#L52 +// for more information Reproduced with permission of the author. +FORCE_INLINE __m128i _mm_aesenc_si128(__m128i EncBlock, __m128i RoundKey) +{ +#if defined(__aarch64__) + static const uint8_t shift_rows[] = {0x0, 0x5, 0xa, 0xf, 0x4, 0x9, + 0xe, 0x3, 0x8, 0xd, 0x2, 0x7, + 0xc, 0x1, 0x6, 0xb}; + static const uint8_t ror32by8[] = {0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4, + 0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc}; + + uint8x16_t v; + uint8x16_t w = vreinterpretq_u8_m128i(EncBlock); + + // shift rows + w = vqtbl1q_u8(w, vld1q_u8(shift_rows)); + + // sub bytes + v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(SSE2NEON_sbox), w); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_sbox + 0x40), w - 0x40); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_sbox + 0x80), w - 0x80); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_sbox + 0xc0), w - 0xc0); + + // mix columns + w = (v << 1) ^ (uint8x16_t)(((int8x16_t) v >> 7) & 0x1b); + w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v); + w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8)); + + // add round key + return vreinterpretq_m128i_u8(w) ^ RoundKey; + +#else /* ARMv7-A NEON implementation */ +#define SSE2NEON_AES_B2W(b0, b1, b2, b3) \ + (((uint32_t)(b3) << 24) | ((uint32_t)(b2) << 16) | ((uint32_t)(b1) << 8) | \ + (b0)) +#define SSE2NEON_AES_F2(x) ((x << 1) ^ (((x >> 7) & 1) * 0x011b /* WPOLY */)) +#define SSE2NEON_AES_F3(x) (SSE2NEON_AES_F2(x) ^ x) +#define SSE2NEON_AES_U0(p) \ + SSE2NEON_AES_B2W(SSE2NEON_AES_F2(p), p, p, SSE2NEON_AES_F3(p)) +#define SSE2NEON_AES_U1(p) \ + SSE2NEON_AES_B2W(SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p, p) +#define SSE2NEON_AES_U2(p) \ + SSE2NEON_AES_B2W(p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p) +#define SSE2NEON_AES_U3(p) \ + SSE2NEON_AES_B2W(p, p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p)) + static const uint32_t ALIGN_STRUCT(16) aes_table[4][256] = { + SSE2NEON_AES_DATA(SSE2NEON_AES_U0), + SSE2NEON_AES_DATA(SSE2NEON_AES_U1), + SSE2NEON_AES_DATA(SSE2NEON_AES_U2), + SSE2NEON_AES_DATA(SSE2NEON_AES_U3), + }; +#undef SSE2NEON_AES_B2W +#undef SSE2NEON_AES_F2 +#undef SSE2NEON_AES_F3 +#undef SSE2NEON_AES_U0 +#undef SSE2NEON_AES_U1 +#undef SSE2NEON_AES_U2 +#undef SSE2NEON_AES_U3 + + uint32_t x0 = _mm_cvtsi128_si32(EncBlock); + uint32_t x1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0x55)); + uint32_t x2 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0xAA)); + uint32_t x3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0xFF)); + + __m128i out = _mm_set_epi32( + (aes_table[0][x3 & 0xff] ^ aes_table[1][(x0 >> 8) & 0xff] ^ + aes_table[2][(x1 >> 16) & 0xff] ^ aes_table[3][x2 >> 24]), + (aes_table[0][x2 & 0xff] ^ aes_table[1][(x3 >> 8) & 0xff] ^ + aes_table[2][(x0 >> 16) & 0xff] ^ aes_table[3][x1 >> 24]), + (aes_table[0][x1 & 0xff] ^ aes_table[1][(x2 >> 8) & 0xff] ^ + aes_table[2][(x3 >> 16) & 0xff] ^ aes_table[3][x0 >> 24]), + (aes_table[0][x0 & 0xff] ^ aes_table[1][(x1 >> 8) & 0xff] ^ + aes_table[2][(x2 >> 16) & 0xff] ^ aes_table[3][x3 >> 24])); + + return _mm_xor_si128(out, RoundKey); +#endif +} + +// Perform the last round of an AES encryption flow on data (state) in a using +// the round key in RoundKey, and store the result in dst. 
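/* MixColumns in both paths above hinges on multiplication by 2 in GF(2^8)
 * ("xtime"): shift left, then reduce by the AES polynomial when the top bit
 * was set. This is the `(v << 1) ^ ((v >> 7) & 0x1b)` step of the AArch64
 * path and the byte-sized equivalent of SSE2NEON_AES_F2. aes_xtime_ref is an
 * illustrative sketch only: */
static inline uint8_t aes_xtime_ref(uint8_t x)
{
    return (uint8_t) ((x << 1) ^ (((x >> 7) & 1) * 0x1b));
}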
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_aesenclast_si128 +FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey) +{ + /* FIXME: optimized for NEON */ + uint8_t v[4][4] = { + {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 0)], + SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 5)], + SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 10)], + SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 15)]}, + {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 4)], + SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 9)], + SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 14)], + SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 3)]}, + {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 8)], + SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 13)], + SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 2)], + SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 7)]}, + {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 12)], + SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 1)], + SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 6)], + SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 11)]}, + }; + for (int i = 0; i < 16; i++) + vreinterpretq_nth_u8_m128i(a, i) = + v[i / 4][i % 4] ^ vreinterpretq_nth_u8_m128i(RoundKey, i); + return a; +} + +// Emits the Advanced Encryption Standard (AES) instruction aeskeygenassist. +// This instruction generates a round key for AES encryption. See +// https://kazakov.life/2017/11/01/cryptocurrency-mining-on-ios-devices/ +// for details. +// +// https://msdn.microsoft.com/en-us/library/cc714138(v=vs.120).aspx +FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i key, const int rcon) +{ + uint32_t X1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0x55)); + uint32_t X3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0xFF)); + for (int i = 0; i < 4; ++i) { + ((uint8_t *) &X1)[i] = SSE2NEON_sbox[((uint8_t *) &X1)[i]]; + ((uint8_t *) &X3)[i] = SSE2NEON_sbox[((uint8_t *) &X3)[i]]; + } + return _mm_set_epi32(((X3 >> 8) | (X3 << 24)) ^ rcon, X3, + ((X1 >> 8) | (X1 << 24)) ^ rcon, X1); +} +#undef SSE2NEON_AES_DATA + +#else /* __ARM_FEATURE_CRYPTO */ +// Implements equivalent of 'aesenc' by combining AESE (with an empty key) and +// AESMC and then manually applying the real key as an xor operation. This +// unfortunately means an additional xor op; the compiler should be able to +// optimize this away for repeated calls however. See +// https://blog.michaelbrase.com/2018/05/08/emulating-x86-aes-intrinsics-on-armv8-a +// for more details. 
+FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u8( + vaesmcq_u8(vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))) ^ + vreinterpretq_u8_m128i(b)); +} + +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_aesenclast_si128 +FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey) +{ + return _mm_xor_si128(vreinterpretq_m128i_u8(vaeseq_u8( + vreinterpretq_u8_m128i(a), vdupq_n_u8(0))), + RoundKey); +} + +FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon) +{ + // AESE does ShiftRows and SubBytes on A + uint8x16_t u8 = vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0)); + + uint8x16_t dest = { + // Undo ShiftRows step from AESE and extract X1 and X3 + u8[0x4], u8[0x1], u8[0xE], u8[0xB], // SubBytes(X1) + u8[0x1], u8[0xE], u8[0xB], u8[0x4], // ROT(SubBytes(X1)) + u8[0xC], u8[0x9], u8[0x6], u8[0x3], // SubBytes(X3) + u8[0x9], u8[0x6], u8[0x3], u8[0xC], // ROT(SubBytes(X3)) + }; + uint32x4_t r = {0, (unsigned) rcon, 0, (unsigned) rcon}; + return vreinterpretq_m128i_u8(dest) ^ vreinterpretq_m128i_u32(r); +} +#endif + +/* Others */ + +// Perform a carry-less multiplication of two 64-bit integers, selected from a +// and b according to imm8, and store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_clmulepi64_si128 +FORCE_INLINE __m128i _mm_clmulepi64_si128(__m128i _a, __m128i _b, const int imm) +{ + uint64x2_t a = vreinterpretq_u64_m128i(_a); + uint64x2_t b = vreinterpretq_u64_m128i(_b); + switch (imm & 0x11) { + case 0x00: + return vreinterpretq_m128i_u64( + _sse2neon_vmull_p64(vget_low_u64(a), vget_low_u64(b))); + case 0x01: + return vreinterpretq_m128i_u64( + _sse2neon_vmull_p64(vget_high_u64(a), vget_low_u64(b))); + case 0x10: + return vreinterpretq_m128i_u64( + _sse2neon_vmull_p64(vget_low_u64(a), vget_high_u64(b))); + case 0x11: + return vreinterpretq_m128i_u64( + _sse2neon_vmull_p64(vget_high_u64(a), vget_high_u64(b))); + default: + abort(); + } +} + +FORCE_INLINE unsigned int _sse2neon_mm_get_denormals_zero_mode() +{ + union { + fpcr_bitfield field; +#if defined(__aarch64__) + uint64_t value; +#else + uint32_t value; +#endif + } r; + +#if defined(__aarch64__) + asm volatile("mrs %0, FPCR" : "=r"(r.value)); /* read */ +#else + asm volatile("vmrs %0, FPSCR" : "=r"(r.value)); /* read */ +#endif + + return r.field.bit24 ? _MM_DENORMALS_ZERO_ON : _MM_DENORMALS_ZERO_OFF; +} + +// Count the number of bits set to 1 in unsigned 32-bit integer a, and +// return that count in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u32 +FORCE_INLINE int _mm_popcnt_u32(unsigned int a) +{ +#if defined(__aarch64__) +#if __has_builtin(__builtin_popcount) + return __builtin_popcount(a); +#else + return (int) vaddlv_u8(vcnt_u8(vcreate_u8((uint64_t) a))); +#endif +#else + uint32_t count = 0; + uint8x8_t input_val, count8x8_val; + uint16x4_t count16x4_val; + uint32x2_t count32x2_val; + + input_val = vld1_u8((uint8_t *) &a); + count8x8_val = vcnt_u8(input_val); + count16x4_val = vpaddl_u8(count8x8_val); + count32x2_val = vpaddl_u16(count16x4_val); + + vst1_u32(&count, count32x2_val); + return count; +#endif +} + +// Count the number of bits set to 1 in unsigned 64-bit integer a, and +// return that count in dst. 
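/* Reference for the NEON popcount reduction used above: vcnt_u8 counts bits
 * per byte, then vpaddl_* steps fold the byte counts together. Scalar
 * equivalent for a 32-bit input (popcnt32_ref is a sketch, not used by the
 * header): */
static inline int popcnt32_ref(uint32_t a)
{
    int n = 0;
    for (; a; a &= a - 1) /* clear the lowest set bit each iteration */
        n++;
    return n;
}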
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u64 +FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a) +{ +#if defined(__aarch64__) +#if __has_builtin(__builtin_popcountll) + return __builtin_popcountll(a); +#else + return (int64_t) vaddlv_u8(vcnt_u8(vcreate_u8(a))); +#endif +#else + uint64_t count = 0; + uint8x8_t input_val, count8x8_val; + uint16x4_t count16x4_val; + uint32x2_t count32x2_val; + uint64x1_t count64x1_val; + + input_val = vld1_u8((uint8_t *) &a); + count8x8_val = vcnt_u8(input_val); + count16x4_val = vpaddl_u8(count8x8_val); + count32x2_val = vpaddl_u16(count16x4_val); + count64x1_val = vpaddl_u32(count32x2_val); + vst1_u64(&count, count64x1_val); + return count; +#endif +} + +FORCE_INLINE void _sse2neon_mm_set_denormals_zero_mode(unsigned int flag) +{ + // AArch32 Advanced SIMD arithmetic always uses the Flush-to-zero setting, + // regardless of the value of the FZ bit. + union { + fpcr_bitfield field; +#if defined(__aarch64__) + uint64_t value; +#else + uint32_t value; +#endif + } r; + +#if defined(__aarch64__) + asm volatile("mrs %0, FPCR" : "=r"(r.value)); /* read */ +#else + asm volatile("vmrs %0, FPSCR" : "=r"(r.value)); /* read */ +#endif + + r.field.bit24 = (flag & _MM_DENORMALS_ZERO_MASK) == _MM_DENORMALS_ZERO_ON; + +#if defined(__aarch64__) + asm volatile("msr FPCR, %0" ::"r"(r)); /* write */ +#else + asm volatile("vmsr FPSCR, %0" ::"r"(r)); /* write */ +#endif +} + +#if defined(__GNUC__) || defined(__clang__) +#pragma pop_macro("ALIGN_STRUCT") +#pragma pop_macro("FORCE_INLINE") +#endif + +#if defined(__GNUC__) && !defined(__clang__) +#pragma GCC pop_options +#endif + +#endif diff --git a/rpcs3/Emu/Cell/Modules/cellAudio.cpp b/rpcs3/Emu/Cell/Modules/cellAudio.cpp index 03a5ff926e..08692d2a33 100644 --- a/rpcs3/Emu/Cell/Modules/cellAudio.cpp +++ b/rpcs3/Emu/Cell/Modules/cellAudio.cpp @@ -6,9 +6,12 @@ #include "Emu/Cell/lv2/sys_event.h" #include "cellAudio.h" -#include "emmintrin.h" #include +#if defined(ARCH_X64) +#include "emmintrin.h" +#endif + LOG_CHANNEL(cellAudio); vm::gvar g_audio_buffer; @@ -1118,6 +1121,7 @@ void cell_audio_thread::mix(float *out_buffer, s32 offset) // 2x CVTPS2DQ (converts float to s32) // PACKSSDW (converts s32 to s16 with signed saturation) +#if defined(ARCH_X64) for (usz i = 0; i < out_buffer_sz; i += 8) { const auto scale = _mm_set1_ps(0x8000); @@ -1125,6 +1129,9 @@ void cell_audio_thread::mix(float *out_buffer, s32 offset) _mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(out_buffer + i), scale)), _mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(out_buffer + i + 4), scale))))); } +#else + fmt::throw_exception("Not supported"); +#endif } } diff --git a/rpcs3/Emu/Cell/Modules/cellSpurs.cpp b/rpcs3/Emu/Cell/Modules/cellSpurs.cpp index 14d2ea2f91..49720beff7 100644 --- a/rpcs3/Emu/Cell/Modules/cellSpurs.cpp +++ b/rpcs3/Emu/Cell/Modules/cellSpurs.cpp @@ -17,7 +17,7 @@ #include "util/asm.hpp" #include "util/v128.hpp" -#include "util/v128sse.hpp" +#include "util/simd.hpp" LOG_CHANNEL(cellSpurs); @@ -738,7 +738,7 @@ s32 _spurs::create_handler(vm::ptr spurs, u32 ppuPriority) void non_task() { - BIND_FUNC(_spurs::handler_entry)(*this); + //BIND_FUNC(_spurs::handler_entry)(*this); } }; @@ -933,7 +933,7 @@ s32 _spurs::create_event_helper(ppu_thread& ppu, vm::ptr spurs, u32 p void non_task() { - BIND_FUNC(_spurs::event_helper_entry)(*this); + //BIND_FUNC(_spurs::event_helper_entry)(*this); } }; diff --git a/rpcs3/Emu/Cell/Modules/cellSpursSpu.cpp b/rpcs3/Emu/Cell/Modules/cellSpursSpu.cpp index aaec2a7464..f559aeebff 100644 --- 
a/rpcs3/Emu/Cell/Modules/cellSpursSpu.cpp +++ b/rpcs3/Emu/Cell/Modules/cellSpursSpu.cpp @@ -11,7 +11,7 @@ #include "util/asm.hpp" #include "util/v128.hpp" -#include "util/v128sse.hpp" +#include "util/simd.hpp" LOG_CHANNEL(cellSpurs); @@ -1434,7 +1434,7 @@ s32 spursTasksetProcessRequest(spu_thread& spu, s32 request, u32* taskId, u32* i // Verify taskset state is valid if ((waiting & running) != v128{} || (ready & pready) != v128{} || - (v128::andnot(enabled, running | ready | pready | signalled | waiting) != v128{})) + (gv_andn(enabled, running | ready | pready | signalled | waiting) != v128{})) { spu_log.error("Invalid taskset state"); spursHalt(spu); @@ -1442,7 +1442,7 @@ s32 spursTasksetProcessRequest(spu_thread& spu, s32 request, u32* taskId, u32* i // Find the number of tasks that have become ready since the last iteration { - v128 newlyReadyTasks = v128::andnot(ready, signalled | pready); + v128 newlyReadyTasks = gv_andn(ready, signalled | pready); numNewlyReadyTasks = utils::popcnt128(newlyReadyTasks._u); } @@ -1491,7 +1491,7 @@ s32 spursTasksetProcessRequest(spu_thread& spu, s32 request, u32* taskId, u32* i } case SPURS_TASKSET_REQUEST_POLL: { - readyButNotRunning = v128::andnot(running, ready0); + readyButNotRunning = gv_andn(running, ready0); if (taskset->wkl_flag_wait_task < CELL_SPURS_MAX_TASK) { readyButNotRunning._u &= ~(u128{1} << (~taskset->wkl_flag_wait_task & 127)); @@ -1526,7 +1526,7 @@ s32 spursTasksetProcessRequest(spu_thread& spu, s32 request, u32* taskId, u32* i } case SPURS_TASKSET_REQUEST_SELECT_TASK: { - readyButNotRunning = v128::andnot(running, ready0); + readyButNotRunning = gv_andn(running, ready0); if (taskset->wkl_flag_wait_task < CELL_SPURS_MAX_TASK) { readyButNotRunning._u &= ~(u128{1} << (~taskset->wkl_flag_wait_task & 127)); diff --git a/rpcs3/Emu/Cell/PPUAnalyser.h b/rpcs3/Emu/Cell/PPUAnalyser.h index efb127f07b..77ca15c509 100644 --- a/rpcs3/Emu/Cell/PPUAnalyser.h +++ b/rpcs3/Emu/Cell/PPUAnalyser.h @@ -203,18 +203,31 @@ struct ppu_itype VCFSX, VCFUX, VCMPBFP, + VCMPBFP_, VCMPEQFP, + VCMPEQFP_, VCMPEQUB, + VCMPEQUB_, VCMPEQUH, + VCMPEQUH_, VCMPEQUW, + VCMPEQUW_, VCMPGEFP, + VCMPGEFP_, VCMPGTFP, + VCMPGTFP_, VCMPGTSB, + VCMPGTSB_, VCMPGTSH, + VCMPGTSH_, VCMPGTSW, + VCMPGTSW_, VCMPGTUB, + VCMPGTUB_, VCMPGTUH, + VCMPGTUH_, VCMPGTUW, + VCMPGTUW_, VCTSXS, VCTUXS, VEXPTEFP, @@ -367,7 +380,9 @@ struct ppu_itype LVSL, LVEBX, SUBFC, + SUBFCO, ADDC, + ADDCO, MULHDU, MULHWU, MFOCRF, @@ -382,6 +397,7 @@ struct ppu_itype LVSR, LVEHX, SUBF, + SUBFO, LDUX, DCBST, LWZUX, @@ -396,11 +412,14 @@ struct ppu_itype LBZX, LVX, NEG, + NEGO, LBZUX, NOR, STVEBX, SUBFE, + SUBFEO, ADDE, + ADDEO, MTOCRF, STDX, STWCX, @@ -410,17 +429,24 @@ struct ppu_itype STWUX, STVEWX, SUBFZE, + SUBFZEO, ADDZE, + ADDZEO, STDCX, STBX, STVX, SUBFME, + SUBFMEO, MULLD, + MULLDO, ADDME, + ADDMEO, MULLW, + MULLWO, DCBTST, STBUX, ADD, + ADDO, DCBT, LHZX, EQV, @@ -442,13 +468,17 @@ struct ppu_itype STHUX, OR, DIVDU, + DIVDUO, DIVWU, + DIVWUO, MTSPR, DCBI, NAND, STVXL, DIVD, + DIVDO, DIVW, + DIVWO, LVLX, LDBRX, LSWX, @@ -558,6 +588,112 @@ struct ppu_itype FCTID, FCTIDZ, FCFID, + + SUBFCO_, + ADDCO_, + SUBFO_, + NEGO_, + SUBFEO_, + ADDEO_, + SUBFZEO_, + ADDZEO_, + SUBFMEO_, + MULLDO_, + ADDMEO_, + MULLWO_, + ADDO_, + DIVDUO_, + DIVWUO_, + DIVDO_, + DIVWO_, + + RLWIMI_, + RLWINM_, + RLWNM_, + RLDICL_, + RLDICR_, + RLDIC_, + RLDIMI_, + RLDCL_, + RLDCR_, + SUBFC_, + MULHDU_, + ADDC_, + MULHWU_, + SLW_, + CNTLZW_, + SLD_, + AND_, + SUBF_, + CNTLZD_, + ANDC_, + MULHD_, + MULHW_, + NEG_, + NOR_, + SUBFE_, + ADDE_, 
+ SUBFZE_, + ADDZE_, + MULLD_, + SUBFME_, + ADDME_, + MULLW_, + ADD_, + EQV_, + XOR_, + ORC_, + OR_, + DIVDU_, + DIVWU_, + NAND_, + DIVD_, + DIVW_, + SRW_, + SRD_, + SRAW_, + SRAD_, + SRAWI_, + SRADI_, + EXTSH_, + EXTSB_, + EXTSW_, + FDIVS_, + FSUBS_, + FADDS_, + FSQRTS_, + FRES_, + FMULS_, + FMADDS_, + FMSUBS_, + FNMSUBS_, + FNMADDS_, + MTFSB1_, + MTFSB0_, + MTFSFI_, + MFFS_, + MTFSF_, + FRSP_, + FCTIW_, + FCTIWZ_, + FDIV_, + FSUB_, + FADD_, + FSQRT_, + FSEL_, + FMUL_, + FRSQRTE_, + FMSUB_, + FMADD_, + FNMSUB_, + FNMADD_, + FNEG_, + FMR_, + FNABS_, + FABS_, + FCTID_, + FCTIDZ_, + FCFID_, }; // Enable address-of operator for ppu_decoder<> @@ -570,6 +706,7 @@ struct ppu_itype struct ppu_iname { #define NAME(x) static constexpr const char& x = *#x; +#define NAME_(x) static constexpr const char& x##_ = *#x "."; NAME(UNK) NAME(MFVSCR) NAME(MTVSCR) @@ -595,18 +732,31 @@ struct ppu_iname NAME(VCFSX) NAME(VCFUX) NAME(VCMPBFP) + NAME_(VCMPBFP) NAME(VCMPEQFP) + NAME_(VCMPEQFP) NAME(VCMPEQUB) + NAME_(VCMPEQUB) NAME(VCMPEQUH) + NAME_(VCMPEQUH) NAME(VCMPEQUW) + NAME_(VCMPEQUW) NAME(VCMPGEFP) + NAME_(VCMPGEFP) NAME(VCMPGTFP) + NAME_(VCMPGTFP) NAME(VCMPGTSB) + NAME_(VCMPGTSB) NAME(VCMPGTSH) + NAME_(VCMPGTSH) NAME(VCMPGTSW) + NAME_(VCMPGTSW) NAME(VCMPGTUB) + NAME_(VCMPGTUB) NAME(VCMPGTUH) + NAME_(VCMPGTUH) NAME(VCMPGTUW) + NAME_(VCMPGTUW) NAME(VCTSXS) NAME(VCTUXS) NAME(VEXPTEFP) @@ -950,7 +1100,132 @@ struct ppu_iname NAME(FCTID) NAME(FCTIDZ) NAME(FCFID) + + NAME(SUBFCO) + NAME(ADDCO) + NAME(SUBFO) + NAME(NEGO) + NAME(SUBFEO) + NAME(ADDEO) + NAME(SUBFZEO) + NAME(ADDZEO) + NAME(SUBFMEO) + NAME(MULLDO) + NAME(ADDMEO) + NAME(MULLWO) + NAME(ADDO) + NAME(DIVDUO) + NAME(DIVWUO) + NAME(DIVDO) + NAME(DIVWO) + + NAME_(SUBFCO) + NAME_(ADDCO) + NAME_(SUBFO) + NAME_(NEGO) + NAME_(SUBFEO) + NAME_(ADDEO) + NAME_(SUBFZEO) + NAME_(ADDZEO) + NAME_(SUBFMEO) + NAME_(MULLDO) + NAME_(ADDMEO) + NAME_(MULLWO) + NAME_(ADDO) + NAME_(DIVDUO) + NAME_(DIVWUO) + NAME_(DIVDO) + NAME_(DIVWO) + + NAME_(RLWIMI) + NAME_(RLWINM) + NAME_(RLWNM) + NAME_(RLDICL) + NAME_(RLDICR) + NAME_(RLDIC) + NAME_(RLDIMI) + NAME_(RLDCL) + NAME_(RLDCR) + NAME_(SUBFC) + NAME_(MULHDU) + NAME_(ADDC) + NAME_(MULHWU) + NAME_(SLW) + NAME_(CNTLZW) + NAME_(SLD) + NAME_(AND) + NAME_(SUBF) + NAME_(CNTLZD) + NAME_(ANDC) + NAME_(MULHD) + NAME_(MULHW) + NAME_(NEG) + NAME_(NOR) + NAME_(SUBFE) + NAME_(ADDE) + NAME_(SUBFZE) + NAME_(ADDZE) + NAME_(MULLD) + NAME_(SUBFME) + NAME_(ADDME) + NAME_(MULLW) + NAME_(ADD) + NAME_(EQV) + NAME_(XOR) + NAME_(ORC) + NAME_(OR) + NAME_(DIVDU) + NAME_(DIVWU) + NAME_(NAND) + NAME_(DIVD) + NAME_(DIVW) + NAME_(SRW) + NAME_(SRD) + NAME_(SRAW) + NAME_(SRAD) + NAME_(SRAWI) + NAME_(SRADI) + NAME_(EXTSH) + NAME_(EXTSB) + NAME_(EXTSW) + NAME_(FDIVS) + NAME_(FSUBS) + NAME_(FADDS) + NAME_(FSQRTS) + NAME_(FRES) + NAME_(FMULS) + NAME_(FMADDS) + NAME_(FMSUBS) + NAME_(FNMSUBS) + NAME_(FNMADDS) + NAME_(MTFSB1) + NAME_(MTFSB0) + NAME_(MTFSFI) + NAME_(MFFS) + NAME_(MTFSF) + NAME_(FRSP) + NAME_(FCTIW) + NAME_(FCTIWZ) + NAME_(FDIV) + NAME_(FSUB) + NAME_(FADD) + NAME_(FSQRT) + NAME_(FSEL) + NAME_(FMUL) + NAME_(FRSQRTE) + NAME_(FMSUB) + NAME_(FMADD) + NAME_(FNMSUB) + NAME_(FNMADD) + NAME_(FNEG) + NAME_(FMR) + NAME_(FNABS) + NAME_(FABS) + NAME_(FCTID) + NAME_(FCTIDZ) + NAME_(FCFID) #undef NAME +#undef NAME_ }; // PPU Analyser Context diff --git a/rpcs3/Emu/Cell/PPUDisAsm.h b/rpcs3/Emu/Cell/PPUDisAsm.h index eff5192c1e..fd04737b1e 100644 --- a/rpcs3/Emu/Cell/PPUDisAsm.h +++ b/rpcs3/Emu/Cell/PPUDisAsm.h @@ -351,18 +351,31 @@ public: void VCFSX(ppu_opcode_t op); 
void VCFUX(ppu_opcode_t op); void VCMPBFP(ppu_opcode_t op); + void VCMPBFP_(ppu_opcode_t op) { return VCMPBFP(op); } void VCMPEQFP(ppu_opcode_t op); + void VCMPEQFP_(ppu_opcode_t op) { return VCMPEQFP(op); } void VCMPEQUB(ppu_opcode_t op); + void VCMPEQUB_(ppu_opcode_t op) { return VCMPEQUB(op); } void VCMPEQUH(ppu_opcode_t op); + void VCMPEQUH_(ppu_opcode_t op) { return VCMPEQUH(op); } void VCMPEQUW(ppu_opcode_t op); + void VCMPEQUW_(ppu_opcode_t op) { return VCMPEQUW(op); } void VCMPGEFP(ppu_opcode_t op); + void VCMPGEFP_(ppu_opcode_t op) { return VCMPGEFP(op); } void VCMPGTFP(ppu_opcode_t op); + void VCMPGTFP_(ppu_opcode_t op) { return VCMPGTFP(op); } void VCMPGTSB(ppu_opcode_t op); + void VCMPGTSB_(ppu_opcode_t op) { return VCMPGTSB(op); } void VCMPGTSH(ppu_opcode_t op); + void VCMPGTSH_(ppu_opcode_t op) { return VCMPGTSH(op); } void VCMPGTSW(ppu_opcode_t op); + void VCMPGTSW_(ppu_opcode_t op) { return VCMPGTSW(op); } void VCMPGTUB(ppu_opcode_t op); + void VCMPGTUB_(ppu_opcode_t op) { return VCMPGTUB(op); } void VCMPGTUH(ppu_opcode_t op); + void VCMPGTUH_(ppu_opcode_t op) { return VCMPGTUH(op); } void VCMPGTUW(ppu_opcode_t op); + void VCMPGTUW_(ppu_opcode_t op) { return VCMPGTUW(op); } void VCTSXS(ppu_opcode_t op); void VCTUXS(ppu_opcode_t op); void VEXPTEFP(ppu_opcode_t op); @@ -708,4 +721,128 @@ public: void FCFID(ppu_opcode_t op); void UNK(ppu_opcode_t op); + + void SUBFCO(ppu_opcode_t op) { return SUBFC(op); } + void ADDCO(ppu_opcode_t op) { return ADDC(op); } + void SUBFO(ppu_opcode_t op) { return SUBF(op); } + void NEGO(ppu_opcode_t op) { return NEG(op); } + void SUBFEO(ppu_opcode_t op) { return SUBFE(op); } + void ADDEO(ppu_opcode_t op) { return ADDE(op); } + void SUBFZEO(ppu_opcode_t op) { return SUBFZE(op); } + void ADDZEO(ppu_opcode_t op) { return ADDZE(op); } + void SUBFMEO(ppu_opcode_t op) { return SUBFME(op); } + void MULLDO(ppu_opcode_t op) { return MULLD(op); } + void ADDMEO(ppu_opcode_t op) { return ADDME(op); } + void MULLWO(ppu_opcode_t op) { return MULLW(op); } + void ADDO(ppu_opcode_t op) { return ADD(op); } + void DIVDUO(ppu_opcode_t op) { return DIVDU(op); } + void DIVWUO(ppu_opcode_t op) { return DIVWU(op); } + void DIVDO(ppu_opcode_t op) { return DIVD(op); } + void DIVWO(ppu_opcode_t op) { return DIVW(op); } + + void SUBFCO_(ppu_opcode_t op) { return SUBFC(op); } + void ADDCO_(ppu_opcode_t op) { return ADDC(op); } + void SUBFO_(ppu_opcode_t op) { return SUBF(op); } + void NEGO_(ppu_opcode_t op) { return NEG(op); } + void SUBFEO_(ppu_opcode_t op) { return SUBFE(op); } + void ADDEO_(ppu_opcode_t op) { return ADDE(op); } + void SUBFZEO_(ppu_opcode_t op) { return SUBFZE(op); } + void ADDZEO_(ppu_opcode_t op) { return ADDZE(op); } + void SUBFMEO_(ppu_opcode_t op) { return SUBFME(op); } + void MULLDO_(ppu_opcode_t op) { return MULLD(op); } + void ADDMEO_(ppu_opcode_t op) { return ADDME(op); } + void MULLWO_(ppu_opcode_t op) { return MULLW(op); } + void ADDO_(ppu_opcode_t op) { return ADD(op); } + void DIVDUO_(ppu_opcode_t op) { return DIVDU(op); } + void DIVWUO_(ppu_opcode_t op) { return DIVWU(op); } + void DIVDO_(ppu_opcode_t op) { return DIVD(op); } + void DIVWO_(ppu_opcode_t op) { return DIVW(op); } + + void RLWIMI_(ppu_opcode_t op) { return RLWIMI(op); } + void RLWINM_(ppu_opcode_t op) { return RLWINM(op); } + void RLWNM_(ppu_opcode_t op) { return RLWNM(op); } + void RLDICL_(ppu_opcode_t op) { return RLDICL(op); } + void RLDICR_(ppu_opcode_t op) { return RLDICR(op); } + void RLDIC_(ppu_opcode_t op) { return RLDIC(op); } + void RLDIMI_(ppu_opcode_t op) { return 
RLDIMI(op); } + void RLDCL_(ppu_opcode_t op) { return RLDCL(op); } + void RLDCR_(ppu_opcode_t op) { return RLDCR(op); } + void SUBFC_(ppu_opcode_t op) { return SUBFC(op); } + void MULHDU_(ppu_opcode_t op) { return MULHDU(op); } + void ADDC_(ppu_opcode_t op) { return ADDC(op); } + void MULHWU_(ppu_opcode_t op) { return MULHWU(op); } + void SLW_(ppu_opcode_t op) { return SLW(op); } + void CNTLZW_(ppu_opcode_t op) { return CNTLZW(op); } + void SLD_(ppu_opcode_t op) { return SLD(op); } + void AND_(ppu_opcode_t op) { return AND(op); } + void SUBF_(ppu_opcode_t op) { return SUBF(op); } + void CNTLZD_(ppu_opcode_t op) { return CNTLZD(op); } + void ANDC_(ppu_opcode_t op) { return ANDC(op); } + void MULHD_(ppu_opcode_t op) { return MULHD(op); } + void MULHW_(ppu_opcode_t op) { return MULHW(op); } + void NEG_(ppu_opcode_t op) { return NEG(op); } + void NOR_(ppu_opcode_t op) { return NOR(op); } + void SUBFE_(ppu_opcode_t op) { return SUBFE(op); } + void ADDE_(ppu_opcode_t op) { return ADDE(op); } + void SUBFZE_(ppu_opcode_t op) { return SUBFZE(op); } + void ADDZE_(ppu_opcode_t op) { return ADDZE(op); } + void MULLD_(ppu_opcode_t op) { return MULLD(op); } + void SUBFME_(ppu_opcode_t op) { return SUBFME(op); } + void ADDME_(ppu_opcode_t op) { return ADDME(op); } + void MULLW_(ppu_opcode_t op) { return MULLW(op); } + void ADD_(ppu_opcode_t op) { return ADD(op); } + void EQV_(ppu_opcode_t op) { return EQV(op); } + void XOR_(ppu_opcode_t op) { return XOR(op); } + void ORC_(ppu_opcode_t op) { return ORC(op); } + void OR_(ppu_opcode_t op) { return OR(op); } + void DIVDU_(ppu_opcode_t op) { return DIVDU(op); } + void DIVWU_(ppu_opcode_t op) { return DIVWU(op); } + void NAND_(ppu_opcode_t op) { return NAND(op); } + void DIVD_(ppu_opcode_t op) { return DIVD(op); } + void DIVW_(ppu_opcode_t op) { return DIVW(op); } + void SRW_(ppu_opcode_t op) { return SRW(op); } + void SRD_(ppu_opcode_t op) { return SRD(op); } + void SRAW_(ppu_opcode_t op) { return SRAW(op); } + void SRAD_(ppu_opcode_t op) { return SRAD(op); } + void SRAWI_(ppu_opcode_t op) { return SRAWI(op); } + void SRADI_(ppu_opcode_t op) { return SRADI(op); } + void EXTSH_(ppu_opcode_t op) { return EXTSH(op); } + void EXTSB_(ppu_opcode_t op) { return EXTSB(op); } + void EXTSW_(ppu_opcode_t op) { return EXTSW(op); } + void FDIVS_(ppu_opcode_t op) { return FDIVS(op); } + void FSUBS_(ppu_opcode_t op) { return FSUBS(op); } + void FADDS_(ppu_opcode_t op) { return FADDS(op); } + void FSQRTS_(ppu_opcode_t op) { return FSQRTS(op); } + void FRES_(ppu_opcode_t op) { return FRES(op); } + void FMULS_(ppu_opcode_t op) { return FMULS(op); } + void FMADDS_(ppu_opcode_t op) { return FMADDS(op); } + void FMSUBS_(ppu_opcode_t op) { return FMSUBS(op); } + void FNMSUBS_(ppu_opcode_t op) { return FNMSUBS(op); } + void FNMADDS_(ppu_opcode_t op) { return FNMADDS(op); } + void MTFSB1_(ppu_opcode_t op) { return MTFSB1(op); } + void MTFSB0_(ppu_opcode_t op) { return MTFSB0(op); } + void MTFSFI_(ppu_opcode_t op) { return MTFSFI(op); } + void MFFS_(ppu_opcode_t op) { return MFFS(op); } + void MTFSF_(ppu_opcode_t op) { return MTFSF(op); } + void FRSP_(ppu_opcode_t op) { return FRSP(op); } + void FCTIW_(ppu_opcode_t op) { return FCTIW(op); } + void FCTIWZ_(ppu_opcode_t op) { return FCTIWZ(op); } + void FDIV_(ppu_opcode_t op) { return FDIV(op); } + void FSUB_(ppu_opcode_t op) { return FSUB(op); } + void FADD_(ppu_opcode_t op) { return FADD(op); } + void FSQRT_(ppu_opcode_t op) { return FSQRT(op); } + void FSEL_(ppu_opcode_t op) { return FSEL(op); } + void FMUL_(ppu_opcode_t op) { 
return FMUL(op); } + void FRSQRTE_(ppu_opcode_t op) { return FRSQRTE(op); } + void FMSUB_(ppu_opcode_t op) { return FMSUB(op); } + void FMADD_(ppu_opcode_t op) { return FMADD(op); } + void FNMSUB_(ppu_opcode_t op) { return FNMSUB(op); } + void FNMADD_(ppu_opcode_t op) { return FNMADD(op); } + void FNEG_(ppu_opcode_t op) { return FNEG(op); } + void FMR_(ppu_opcode_t op) { return FMR(op); } + void FNABS_(ppu_opcode_t op) { return FNABS(op); } + void FABS_(ppu_opcode_t op) { return FABS(op); } + void FCTID_(ppu_opcode_t op) { return FCTID(op); } + void FCTIDZ_(ppu_opcode_t op) { return FCTIDZ(op); } + void FCFID_(ppu_opcode_t op) { return FCFID(op); } }; diff --git a/rpcs3/Emu/Cell/PPUFunction.cpp b/rpcs3/Emu/Cell/PPUFunction.cpp index 853adcd0f0..853a41a49d 100644 --- a/rpcs3/Emu/Cell/PPUFunction.cpp +++ b/rpcs3/Emu/Cell/PPUFunction.cpp @@ -1889,47 +1889,56 @@ extern std::string ppu_get_variable_name(const std::string& _module, u32 vnid) return fmt::format("0x%08X", vnid); } -std::vector& ppu_function_manager::access(bool ghc) +std::vector& ppu_function_manager::access(bool ghc) { - static std::vector list + static std::vector list { - [](ppu_thread& ppu) -> bool + [](ppu_thread& ppu, ppu_opcode_t, be_t* this_op, ppu_intrp_func*) { + ppu.cia = vm::get_addr(this_op); ppu_log.error("Unregistered function called (LR=0x%x)", ppu.lr); ppu.gpr[3] = 0; ppu.cia = static_cast(ppu.lr) & ~3; - return false; }, - [](ppu_thread& ppu) -> bool + [](ppu_thread& ppu, ppu_opcode_t, be_t* this_op, ppu_intrp_func*) { ppu.state += cpu_flag::ret; - ppu.cia += 4; - return false; + ppu.cia = vm::get_addr(this_op) + 4; }, }; - static std::vector list_ghc +#if defined(ARCH_X64) + static std::vector list_ghc { - build_function_asm("ppu_unregistered", [](asmjit::x86::Assembler& c, auto& args) + build_function_asm("ppu_unregistered", [](native_asm& c, auto& args) { using namespace asmjit; + // Take second ghc arg c.mov(args[0], x86::rbp); + c.mov(args[2].r32(), x86::dword_ptr(args[0], ::offset32(&ppu_thread::cia))); + c.add(args[2], x86::qword_ptr(reinterpret_cast(&vm::g_base_addr))); c.jmp(imm_ptr(list[0])); }), - build_function_asm("ppu_return", [](asmjit::x86::Assembler& c, auto& args) + build_function_asm("ppu_return", [](native_asm& c, auto& args) { using namespace asmjit; + // Take second ghc arg c.mov(args[0], x86::rbp); + c.mov(args[2].r32(), x86::dword_ptr(args[0], ::offset32(&ppu_thread::cia))); + c.add(args[2], x86::qword_ptr(reinterpret_cast(&vm::g_base_addr))); c.jmp(imm_ptr(list[1])); }), }; +#elif defined(ARCH_ARM64) + static std::vector list_ghc(list); +#endif return ghc ? 
list_ghc : list; } -u32 ppu_function_manager::add_function(ppu_function_t function) +u32 ppu_function_manager::add_function(ppu_intrp_func_t function) { auto& list = access(); auto& list2 = access(true); @@ -1937,13 +1946,22 @@ u32 ppu_function_manager::add_function(ppu_function_t function) list.push_back(function); // Generate trampoline - list2.push_back(build_function_asm("ppu_trampolinea", [&](asmjit::x86::Assembler& c, auto& args) +#if defined(ARCH_X64) + list2.push_back(build_function_asm("ppu_trampolinea", [&](native_asm& c, auto& args) { using namespace asmjit; + // Take second ghc arg c.mov(args[0], x86::rbp); + c.mov(args[2].r32(), x86::dword_ptr(args[0], ::offset32(&ppu_thread::cia))); + c.add(args[2], x86::qword_ptr(reinterpret_cast(&vm::g_base_addr))); c.jmp(imm_ptr(function)); })); +#elif defined(ARCH_ARM64) + list2.push_back(function); +#else +#error "Not implemented" +#endif return ::size32(list) - 1; } diff --git a/rpcs3/Emu/Cell/PPUFunction.h b/rpcs3/Emu/Cell/PPUFunction.h index 286e45535a..5b4569a02b 100644 --- a/rpcs3/Emu/Cell/PPUFunction.h +++ b/rpcs3/Emu/Cell/PPUFunction.h @@ -1,23 +1,22 @@ #pragma once #include "PPUThread.h" +#include "PPUInterpreter.h" #include "util/v128.hpp" -using ppu_function_t = bool(*)(ppu_thread&); - -// BIND_FUNC macro "converts" any appropriate HLE function to ppu_function_t, binding it to PPU thread context. -#define BIND_FUNC(func, ...) (static_cast([](ppu_thread& ppu) -> bool {\ +// BIND_FUNC macro "converts" any appropriate HLE function to ppu_intrp_func_t, binding it to PPU thread context. +#define BIND_FUNC(func, ...) (static_cast([](ppu_thread& ppu, ppu_opcode_t, be_t* this_op, ppu_intrp_func*) {\ const auto old_f = ppu.current_function;\ if (!old_f) ppu.last_function = #func;\ ppu.current_function = #func;\ + ppu.cia = vm::get_addr(this_op); \ std::memcpy(ppu.syscall_args, ppu.gpr + 3, sizeof(ppu.syscall_args)); \ ppu_func_detail::do_call(ppu, func);\ static_cast(ppu.test_stopped());\ ppu.current_function = old_f;\ ppu.cia += 4;\ __VA_ARGS__;\ - return false;\ })) struct ppu_va_args_t @@ -257,9 +256,9 @@ class ppu_function_manager }; // Access global function list - static std::vector& access(bool ghc = false); + static std::vector& access(bool ghc = false); - static u32 add_function(ppu_function_t function); + static u32 add_function(ppu_intrp_func_t function); public: ppu_function_manager() = default; @@ -270,7 +269,7 @@ public: // Register function (shall only be called during global initialization) template - static inline u32 register_function(ppu_function_t func) + static inline u32 register_function(ppu_intrp_func_t func) { return registered::index = add_function(func); } diff --git a/rpcs3/Emu/Cell/PPUInterpreter.cpp b/rpcs3/Emu/Cell/PPUInterpreter.cpp index b73f865c4f..1ee69203b4 100644 --- a/rpcs3/Emu/Cell/PPUInterpreter.cpp +++ b/rpcs3/Emu/Cell/PPUInterpreter.cpp @@ -6,6 +6,7 @@ #include "PPUThread.h" #include "Emu/Cell/Common.h" #include "Emu/Cell/PPUFunction.h" +#include "Emu/Cell/PPUAnalyser.h" #include "Emu/Cell/timers.hpp" #include "Emu/IdManager.h" @@ -15,21 +16,325 @@ #include "util/asm.hpp" #include "util/v128.hpp" -#include "util/v128sse.hpp" +#include "util/simd.hpp" #include "util/sysinfo.hpp" +#include "Utilities/JIT.h" -#if !defined(_MSC_VER) && defined(__clang__) +#if !defined(_MSC_VER) #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wold-style-cast" #endif -#if defined(_MSC_VER) +#if defined(_MSC_VER) || !defined(__SSE2__) #define SSSE3_FUNC #else #define SSSE3_FUNC 
__attribute__((__target__("ssse3"))) #endif +#if defined(ARCH_ARM64) +#if !defined(_MSC_VER) +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#endif +#undef FORCE_INLINE +#include "Emu/CPU/sse2neon.h" +#endif + +#if (defined(ARCH_X64)) && !defined(__SSSE3__) const bool s_use_ssse3 = utils::has_ssse3(); +#else +constexpr bool s_use_ssse3 = true; // Including non-x86 +#endif + +extern const ppu_decoder g_ppu_itype; +extern const ppu_decoder g_ppu_iname; + +enum class ppu_exec_bit : u64 +{ + has_oe, + has_rc, + set_sat, + use_nj, + set_vnan, + fix_vnan, + set_fpcc, + use_dfma, + set_cr_stats, + + __bitset_enum_max +}; + +using enum ppu_exec_bit; + +// Helper for combining only used subset of exec flags at compile time +template +struct ppu_exec_select +{ + template + static ppu_intrp_func_t select(bs_t selected, F func) + { + // Make sure there is no flag duplication, otherwise skip flag + if constexpr (((Flags0 != Flag) && ...)) + { + // Test only relevant flags at runtime initialization (compile both variants) + if (selected & Flag) + { + // In this branch, selected flag is added to Flags0 + return ppu_exec_select::template select(selected, func); + } + } + + return ppu_exec_select::template select(selected, func); + } + + template + static ppu_intrp_func_t select(bs_t, F func) + { + // Instantiate interpreter function with required set of flags + return func.template operator()(); + } + + template + static auto select() + { +#ifndef __INTELLISENSE__ + return [](bs_t selected, auto func) + { + return ppu_exec_select::select(selected, func); + }; +#endif + } +}; + +// Switch between inlined interpreter invocation (exec) and builder function +#if defined(ARCH_X64) +#define RETURN(...) \ + if constexpr (Build == 0) { \ + static_cast(exec); \ + static const ppu_intrp_func_t f = build_function_asm("ppu_"s + __func__, [&](asmjit::ppu_builder& c) { \ + static ppu_opcode_t op{}; \ + static ppu_abstract_t ppu; \ + exec(__VA_ARGS__); \ + c.ppu_ret(); \ + }); \ + return f; \ + } +#else +#define RETURN RETURN_ +#endif + +#define RETURN_(...) 
\ + if constexpr (Build == 0) { \ + static_cast(exec); \ + return +[](ppu_thread& ppu, ppu_opcode_t op, be_t* this_op, ppu_intrp_func* next_fn) { \ + const auto fn = atomic_storage::observe(next_fn->fn); \ + exec(__VA_ARGS__); \ + const auto next_op = this_op + 1; \ + return fn(ppu, {*next_op}, next_op, next_fn + 1); \ + }; \ + } + +static constexpr ppu_opcode_t s_op{}; + +namespace asmjit +{ +#if defined(ARCH_X64) + struct ppu_builder : vec_builder + { + using base = vec_builder; + +#ifdef _WIN32 + static constexpr x86::Gp arg_ppu = x86::rcx; + static constexpr x86::Gp arg_op = x86::edx; + static constexpr x86::Gp arg_this_op = x86::r8; + static constexpr x86::Gp arg_next_fn = x86::r9; +#else + static constexpr x86::Gp arg_ppu = x86::rdi; + static constexpr x86::Gp arg_op = x86::esi; + static constexpr x86::Gp arg_this_op = x86::rdx; + static constexpr x86::Gp arg_next_fn = x86::rcx; +#endif + + u32 xmm_count = 0; + u32 ppu_base = 0; + x86::Xmm tmp; + + ppu_builder(CodeHolder* ch) + : base(ch) + { + // Initialize pointer to next function + base::mov(x86::r11, x86::qword_ptr(arg_next_fn)); + } + + // Indexed offset to ppu.member + template ().*MPtr)[0]), uint I, uint N> + x86::Mem ppu_mem(const bf_t&, bool last = false) + { + // Required index shift for array indexing + constexpr u32 Shift = std::countr_zero(sizeof((std::declval().*MPtr)[0])); + + const u32 offset = ::offset32(MPtr); + + auto tmp_r32 = x86::eax; + auto reg_ppu = arg_ppu; + + if (last) + { + tmp_r32 = arg_op.r32(); + } + else + { + base::mov(tmp_r32, arg_op); + + if (offset % 16 == 0 && ppu_base != offset) + { + // Optimistically precompute offset to avoid [ppu + tmp*x + offset] addressing + base::lea(x86::r10, x86::qword_ptr(arg_ppu, static_cast(offset))); + ppu_base = offset; + } + } + + if (ppu_base == offset) + { + reg_ppu = x86::r10; + } + + // Use max possible index shift + constexpr u32 X86Shift = Shift > 3 ? 
3 : Shift; + constexpr u32 AddShift = Shift - X86Shift; + constexpr u32 AndMask = (1u << N) - 1; + + if constexpr (I >= AddShift) + { + if constexpr (I != AddShift) + base::shr(tmp_r32, I - AddShift); + base::and_(tmp_r32, AndMask << AddShift); + } + else + { + base::and_(tmp_r32, AndMask << I); + base::shl(tmp_r32, I + AddShift); + } + + return x86::ptr(reg_ppu, tmp_r32.r64(), X86Shift, static_cast(offset - ppu_base), Size); + } + + // Generic offset to ppu.member + template ().*MPtr)> + x86::Mem ppu_mem() + { + if (ppu_base == 0) + { + return x86::ptr(arg_ppu, static_cast(::offset32(MPtr)), Size); + } + else + { + return x86::ptr(x86::r10, static_cast(::offset32(MPtr) - ppu_base), Size); + } + } + + template + x86::Mem ppu_vr(const bf_t& bf, bool last = false) + { + return ppu_mem<&ppu_thread::vr, Size>(bf, last); + } + + x86::Mem ppu_sat() + { + return ppu_mem<&ppu_thread::sat>(); + } + + void ppu_ret(bool last = true) + { + base::add(arg_this_op, 4); + base::mov(arg_op, x86::dword_ptr(arg_this_op)); + base::bswap(arg_op); + base::add(arg_next_fn, 8); + base::jmp(x86::r11); + + // Embed constants (TODO: after last return) + if (last) + base::emit_consts(); + } + }; +#elif defined(ARCH_ARM64) + struct ppu_builder : a64::Assembler + { + }; +#else + struct ppu_builder + { + }; +#endif +} + +struct ppu_abstract_t +{ + struct abstract_vr + { + template + struct lazy_vr : asmjit::mem_lazy + { + const asmjit::Operand& eval(bool is_lv) + { + if (is_lv && !this->isReg()) + { + Operand::operator=(g_vc->vec_alloc()); + #if defined(ARCH_X64) + g_vc->emit(asmjit::x86::Inst::kIdMovaps, *this, static_cast(g_vc)->ppu_vr(bf_t{}, false)); + #endif + } + + if (!is_lv) + { + if (this->isReg()) + { + g_vc->vec_dealloc(asmjit::vec_type{this->id()}); + } + else + { + #if defined(ARCH_X64) + Operand::operator=(static_cast(g_vc)->ppu_vr(bf_t{}, false)); + #endif + } + } + + return *this; + } + + template + void operator=(T&& _val) const + { + FOR_X64(store_op, kIdMovaps, kIdVmovaps, static_cast(g_vc)->ppu_vr(bf_t{}, true), std::forward(_val)); + } + }; + + template + lazy_vr operator[](const bf_t&) const + { + return {}; + } + } vr; + + struct abstract_sat : asmjit::mem_lazy + { + const asmjit::Operand& eval(bool) + { + #if defined(ARCH_X64) + Operand::operator=(static_cast(g_vc)->ppu_sat()); + #endif + + return *this; + } + + template + void operator=(T&& _val) const + { + #if defined(ARCH_X64) + FOR_X64(store_op, kIdMovaps, kIdVmovaps, static_cast(g_vc)->ppu_sat(), std::forward(_val)); + #endif + } + } sat{}; +}; extern void do_cell_atomic_128_store(u32 addr, const void* to_write); @@ -56,6 +361,21 @@ inline void ppu_cr_set(ppu_thread& ppu, u32 field, const T& a, const T& b) ppu_cr_set(ppu, field, a < b, a > b, a == b, ppu.xer.so); } +// TODO +template +void ppu_set_cr(ppu_thread& ppu, u32 field, bool le, bool gt, bool eq, bool so) +{ + ppu.cr[field * 4 + 0] = le; + ppu.cr[field * 4 + 1] = gt; + ppu.cr[field * 4 + 2] = eq; + ppu.cr[field * 4 + 3] = so; + + if constexpr (((Flags == set_cr_stats) || ...)) + { + *reinterpret_cast(vm::g_stat_addr + ppu.cia) |= *reinterpret_cast(ppu.cr.bits + field * 4); + } +} + // Set XER.OV bit (overflow) inline void ppu_ov_set(ppu_thread& ppu, bool bit) { @@ -64,38 +384,62 @@ inline void ppu_ov_set(ppu_thread& ppu, bool bit) } // Write comparison results to FPCC field with optional CR field update -template -inline void ppu_fpcc_set(ppu_thread& ppu, const T& a, const T& b, const bool rc, const u64 cr_field = 1) +template +void ppu_set_fpcc(ppu_thread& ppu, f64 a, f64 b, u64 
cr_field = 1) { - // TODO: Do not hardcode to be endian dependant - u32 fpcc = u32{a < b} << (8 * 0) | u32{a > b} << (8 * 1) | u32{a == b} << (8 * 2); - - // Test FU - if (fpcc == 0) [[unlikely]] fpcc = 1 << (8 * 3); - - // Write FPCC - ppu.fpscr.fields[4] = fpcc; - - if (rc) [[unlikely]] + if constexpr (((Flags == set_fpcc || Flags == has_rc) || ...)) { - ppu.cr.fields[cr_field] = fpcc; + static_assert(std::endian::native == std::endian::little, "Not implemented"); - if (g_cfg.core.ppu_debug) [[unlikely]] + bool fpcc[4]; +#if defined(ARCH_X64) && !defined(_M_X64) + __asm__("comisd %[b], %[a]\n" + : "=@ccb" (fpcc[0]) + , "=@cca" (fpcc[1]) + , "=@ccz" (fpcc[2]) + , "=@ccp" (fpcc[3]) + : [a] "x" (a) + , [b] "x" (b) + : "cc"); + if (fpcc[3]) [[unlikely]] { - *reinterpret_cast(vm::g_stat_addr + ppu.cia) |= ppu.cr.fields[cr_field]; + fpcc[0] = fpcc[1] = fpcc[2] = false; + } +#else + const auto cmp = a <=> b; + fpcc[0] = cmp == std::partial_ordering::less; + fpcc[1] = cmp == std::partial_ordering::greater; + fpcc[2] = cmp == std::partial_ordering::equivalent; + fpcc[3] = cmp == std::partial_ordering::unordered; +#endif + + const u32 data = std::bit_cast(fpcc); + + // Write FPCC + ppu.fpscr.fields[4] = data; + + if constexpr (((Flags == has_rc) || ...)) + { + // Previous behaviour was throwing an exception; TODO + ppu.cr.fields[cr_field] = data; + + if (g_cfg.core.ppu_debug) [[unlikely]] + { + *reinterpret_cast(vm::g_stat_addr + ppu.cia) |= data; + } } } } // Validate read data in case does not match reservation template -FORCE_INLINE auto ppu_feed_data(ppu_thread& ppu, u64 addr) +auto ppu_feed_data(ppu_thread& ppu, u64 addr) { static_assert(sizeof(T) <= 128, "Incompatible type-size, break down into smaller loads"); auto value = vm::_ref(vm::cast(addr)); - if (!ppu.use_full_rdata) + //if (!ppu.use_full_rdata) { return value; } @@ -143,6 +487,8 @@ FORCE_INLINE auto ppu_feed_data(ppu_thread& ppu, u64 addr) // Push called address to custom call history for debugging inline u32 ppu_record_call(ppu_thread& ppu, u32 new_cia, ppu_opcode_t op, bool indirect = false) { + return new_cia; + if (auto& history = ppu.call_history; !history.data.empty()) { if (!op.lk) @@ -169,62 +515,12 @@ inline u32 ppu_record_call(ppu_thread& ppu, u32 new_cia, ppu_opcode_t op, bool i history.last_r1 = ppu.gpr[1]; history.last_r2 = ppu.gpr[2]; } - - return new_cia; -} - -// Compare 16 packed unsigned bytes (greater than) -inline __m128i sse_cmpgt_epu8(__m128i A, __m128i B) -{ - // (A xor 0x80) > (B xor 0x80) - const auto sign = _mm_set1_epi32(0x80808080); - return _mm_cmpgt_epi8(_mm_xor_si128(A, sign), _mm_xor_si128(B, sign)); -} - -inline __m128i sse_cmpgt_epu16(__m128i A, __m128i B) -{ - const auto sign = _mm_set1_epi32(0x80008000); - return _mm_cmpgt_epi16(_mm_xor_si128(A, sign), _mm_xor_si128(B, sign)); -} - -inline __m128i sse_cmpgt_epu32(__m128i A, __m128i B) -{ - const auto sign = _mm_set1_epi32(0x80000000); - return _mm_cmpgt_epi32(_mm_xor_si128(A, sign), _mm_xor_si128(B, sign)); -} - -extern __m128 sse_exp2_ps(__m128 A) -{ - const auto x0 = _mm_max_ps(_mm_min_ps(A, _mm_set1_ps(127.4999961f)), _mm_set1_ps(-127.4999961f)); - const auto x1 = _mm_add_ps(x0, _mm_set1_ps(0.5f)); - const auto x2 = _mm_sub_epi32(_mm_cvtps_epi32(x1), _mm_and_si128(_mm_castps_si128(_mm_cmpnlt_ps(_mm_setzero_ps(), x1)), _mm_set1_epi32(1))); - const auto x3 = _mm_sub_ps(x0, _mm_cvtepi32_ps(x2)); - const auto x4 = _mm_mul_ps(x3, x3); - const auto x5 = _mm_mul_ps(x3, _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(x4, _mm_set1_ps(0.023093347705f)), 
_mm_set1_ps(20.20206567f)), x4), _mm_set1_ps(1513.906801f))); - const auto x6 = _mm_mul_ps(x5, _mm_rcp_ps(_mm_sub_ps(_mm_add_ps(_mm_mul_ps(_mm_set1_ps(233.1842117f), x4), _mm_set1_ps(4368.211667f)), x5))); - return _mm_mul_ps(_mm_add_ps(_mm_add_ps(x6, x6), _mm_set1_ps(1.0f)), _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(x2, _mm_set1_epi32(127)), 23))); -} - -extern __m128 sse_log2_ps(__m128 A) -{ - const auto _1 = _mm_set1_ps(1.0f); - const auto _c = _mm_set1_ps(1.442695040f); - const auto x0 = _mm_max_ps(A, _mm_castsi128_ps(_mm_set1_epi32(0x00800000))); - const auto x1 = _mm_or_ps(_mm_and_ps(x0, _mm_castsi128_ps(_mm_set1_epi32(0x807fffff))), _1); - const auto x2 = _mm_rcp_ps(_mm_add_ps(x1, _1)); - const auto x3 = _mm_mul_ps(_mm_sub_ps(x1, _1), x2); - const auto x4 = _mm_add_ps(x3, x3); - const auto x5 = _mm_mul_ps(x4, x4); - const auto x6 = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_set1_ps(-0.7895802789f), x5), _mm_set1_ps(16.38666457f)), x5), _mm_set1_ps(-64.1409953f)); - const auto x7 = _mm_rcp_ps(_mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_set1_ps(-35.67227983f), x5), _mm_set1_ps(312.0937664f)), x5), _mm_set1_ps(-769.6919436f))); - const auto x8 = _mm_cvtepi32_ps(_mm_sub_epi32(_mm_srli_epi32(_mm_castps_si128(x0), 23), _mm_set1_epi32(127))); - return _mm_add_ps(_mm_mul_ps(_mm_mul_ps(_mm_mul_ps(_mm_mul_ps(x5, x6), x7), x4), _c), _mm_add_ps(_mm_mul_ps(x4, _c), x8)); } extern SAFE_BUFFERS(__m128i) sse_pshufb(__m128i data, __m128i index) { - v128 m = v128::fromV(_mm_and_si128(index, _mm_set1_epi8(0xf))); - v128 a = v128::fromV(data); + v128 m = _mm_and_si128(index, _mm_set1_epi8(0xf)); + v128 a = data; v128 r; for (int i = 0; i < 16; i++) @@ -232,7 +528,7 @@ extern SAFE_BUFFERS(__m128i) sse_pshufb(__m128i data, __m128i index) r._u8[i] = a._u8[m._u8[i]]; } - return _mm_and_si128(r.vi, _mm_cmpgt_epi8(index, _mm_set1_epi8(-1))); + return _mm_and_si128(r, _mm_cmpgt_epi8(index, _mm_set1_epi8(-1))); } extern SSSE3_FUNC __m128i sse_altivec_vperm(__m128i A, __m128i B, __m128i C) @@ -247,7 +543,7 @@ extern SSSE3_FUNC __m128i sse_altivec_vperm(__m128i A, __m128i B, __m128i C) extern SAFE_BUFFERS(__m128i) sse_altivec_vperm_v0(__m128i A, __m128i B, __m128i C) { __m128i ab[2]{B, A}; - v128 index = v128::fromV(_mm_andnot_si128(C, _mm_set1_epi8(0x1f))); + v128 index = _mm_andnot_si128(C, _mm_set1_epi8(0x1f)); v128 res; for (int i = 0; i < 16; i++) @@ -255,7 +551,7 @@ extern SAFE_BUFFERS(__m128i) sse_altivec_vperm_v0(__m128i A, __m128i B, __m128i res._u8[i] = reinterpret_cast(+ab)[index._u8[i]]; } - return res.vi; + return res; } extern __m128i sse_altivec_lvsl(u64 addr) @@ -451,1182 +747,1406 @@ extern bool ppu_stwcx(ppu_thread& ppu, u32 addr, u32 reg_value); extern bool ppu_stdcx(ppu_thread& ppu, u32 addr, u64 reg_value); extern void ppu_trap(ppu_thread& ppu, u64 addr); - -class ppu_scale_table_t -{ - std::array m_data{}; - -public: - ppu_scale_table_t() - { - for (s32 i = -31; i < 32; i++) - { - m_data[i + 31].vf = _mm_set1_ps(static_cast(std::exp2(i))); - } - } - - FORCE_INLINE __m128 operator [] (s32 scale) const - { - return m_data[scale + 31].vf; - } -} -const g_ppu_scale_table; - -constexpr u32 ppu_inf_u32 = 0x7F800000u; -static const f32 ppu_inf_f32 = std::bit_cast(ppu_inf_u32); -constexpr u32 ppu_nan_u32 = 0x7FC00000u; -static const f32 ppu_nan_f32 = std::bit_cast(ppu_nan_u32); -static const v128 ppu_vec_nans = v128::from32p(ppu_nan_u32); - // NaNs production precedence: NaN from Va, Vb, Vc // and lastly the result of the operation in case none of the operands is a NaN // Signaling 
NaNs are 'quieted' (MSB of fraction is set) with other bits of data remain the same -inline v128 vec_select_nan(v128 a) +inline v128 ppu_select_vnan(v128 a) { return a; } -inline v128 vec_select_nan(v128 a, v128 b) +inline v128 ppu_select_vnan(v128 a, v128 b) { - const auto not_nan = v128::eq32f(a, a); - return (b & not_nan) | v128::andnot(not_nan, a | ppu_vec_nans); + return gv_selectfs(gv_eqfs(a, a), b, a | gv_bcst32(0x7fc00000u)); } -template -inline v128 vec_select_nan(v128 a, v128 b, Args... args) +inline v128 ppu_select_vnan(v128 a, v128 b, Vector128 auto... args) { - return vec_select_nan(a, vec_select_nan(b, args...)); -} - -v128 vec_handle_nan(v128 result) -{ - const auto not_nan = v128::eq32f(result, result); - result = (result & not_nan) | v128::andnot(not_nan, ppu_vec_nans); - - return result; -} - -template -v128 vec_handle_nan(v128 result, Args... args) -{ - return vec_select_nan(args..., vec_handle_nan(result)); -} - -template -v128 vec_handle_nan(__m128 result, Args... args) -{ - return vec_handle_nan(v128::fromF(result), v128::fromF(args)...); + return ppu_select_vnan(a, ppu_select_vnan(b, args...)); } // Flush denormals to zero if NJ is 1 -inline v128 vec_handle_denormal(ppu_thread& ppu, v128 a) +template +inline v128 ppu_flush_denormal(const v128& mask, const v128& a) { - const auto mask = v128::from32p(ppu.jm_mask); - const auto nz = v128::fromV(_mm_srli_epi32(v128::eq32(mask & a, v128{}).vi, 1)); - return v128::andnot(nz, a); -} - -bool ppu_interpreter::MFVSCR(ppu_thread& ppu, ppu_opcode_t op) -{ - ppu.vr[op.vd] = v128::from32(0, 0, 0, u32{ppu.sat != v128{}} | (u32{ppu.nj} << 16)); - return true; -} - -bool ppu_interpreter::MTVSCR(ppu_thread& ppu, ppu_opcode_t op) -{ - const u32 vscr = ppu.vr[op.vb]._u32[3]; - ppu.sat = v128::from32((vscr & 1) != 0); - ppu.nj = (vscr & 0x10000) != 0; - ppu.jm_mask = ppu.nj ? ppu_inf_u32 : 0x7fff'ffff; - return true; -} - -bool ppu_interpreter::VADDCUW(ppu_thread& ppu, ppu_opcode_t op) -{ - const auto a = ppu.vr[op.va].vi; - const auto b = ppu.vr[op.vb].vi; - ppu.vr[op.vd].vi = _mm_srli_epi32(_mm_cmpgt_epi32(_mm_xor_si128(b, _mm_set1_epi32(0x80000000)), _mm_xor_si128(a, _mm_set1_epi32(0x7fffffff))), 31); - return true; -} - -bool ppu_interpreter::VADDFP(ppu_thread& ppu, ppu_opcode_t op) -{ - const auto a = vec_handle_denormal(ppu, ppu.vr[op.va]); - const auto b = vec_handle_denormal(ppu, ppu.vr[op.vb]); - const auto result = v128::addfs(a, b); - ppu.vr[op.vd] = vec_handle_denormal(ppu, vec_handle_nan(result, a, b)); - return true; -} - -bool ppu_interpreter_fast::VADDSBS(ppu_thread& ppu, ppu_opcode_t op) -{ - ppu.vr[op.vd].vi = _mm_adds_epi8(ppu.vr[op.va].vi, ppu.vr[op.vb].vi); - return true; -} - -bool ppu_interpreter_precise::VADDSBS(ppu_thread& ppu, ppu_opcode_t op) -{ - const auto& a = ppu.vr[op.va]; - const auto& b = ppu.vr[op.vb]; - auto& d = ppu.vr[op.vd]; - - for (u8 i = 0; i < 16; i++) + if constexpr (((Flags == use_nj) || ...)) { - const s16 sum = a._s8[i] + b._s8[i]; + return gv_andn(gv_shr32(gv_eq32(mask & a, gv_bcst32(0)), 1), a); + } + else + { + return a; + } +} - if (sum < INT8_MIN) +inline v128 ppu_fix_vnan(v128 r) +{ + return gv_selectfs(gv_eqfs(r, r), r, gv_bcst32(0x7fc00000u)); +} + +template +inline v128 ppu_set_vnan(v128 r, Vector128 auto... args) +{ + if constexpr (((Flags == set_vnan) || ...) 
&& sizeof...(args) > 0) + { + // Full propagation + return ppu_select_vnan(args..., ppu_fix_vnan(r)); + } + else if constexpr (((Flags == fix_vnan) || ...)) + { + // Only fix the result + return ppu_fix_vnan(r); + } + else + { + // Return as is + return r; + } +} + +template +auto MFVSCR() +{ + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select(); + + static const auto exec = [](auto&& d, auto&& sat, auto&& nj) + { + u32 sat_bit = 0; + if constexpr (((Flags == set_sat) || ...)) + sat_bit = !gv_testz(sat); //!!sat._u; + d._u64[0] = 0; + d._u64[1] = u64(sat_bit | (u32{nj} << 16)) << 32; + }; + + RETURN_(ppu.vr[op.vd], ppu.sat, ppu.nj); +} + +template +auto MTVSCR() +{ + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select(); + + static const auto exec = [](auto&& sat, auto&& nj, auto&& jm_mask, auto&& b) + { + const u32 vscr = b._u32[3]; + if constexpr (((Flags == set_sat) || ...)) + sat._u = vscr & 1; + if constexpr (((Flags == use_nj) || ...)) + jm_mask = (vscr & 0x10000) ? 0x7f80'0000 : 0x7fff'ffff; + nj = (vscr & 0x10000) != 0; + }; + + RETURN_(ppu.sat, ppu.nj, ppu.jm_mask, ppu.vr[op.vb]); +} + +template +auto VADDCUW() +{ + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](auto&& d, auto&& a, auto&& b) + { + // ~a is how much can be added to a without carry + d = gv_sub32(gv_geu32(~a, b), gv_bcst32(-1)); + }; + + RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]); +} + +template +auto VADDFP() +{ + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select(); + + static const auto exec = [](auto&& d, auto&& a_, auto&& b_, auto&& jm_mask) + { + const auto m = gv_bcst32(jm_mask, &ppu_thread::jm_mask); + const auto a = ppu_flush_denormal(m, a_); + const auto b = ppu_flush_denormal(m, b_); + d = ppu_flush_denormal(m, ppu_set_vnan(gv_addfs(a, b), a, b)); + }; + + RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb], ppu.jm_mask); +} + +template +auto VADDSBS() +{ + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select(); + + static const auto exec = [](auto&& d, auto&& a, auto&& b, auto&& sat) + { + if constexpr (((Flags == set_sat) || ...)) { - d._s8[i] = INT8_MIN; - ppu.sat._u32[0] = 1; - } - else if (sum > INT8_MAX) - { - d._s8[i] = INT8_MAX; - ppu.sat._u32[0] = 1; + auto r = gv_adds_s8(a, b); + sat = gv_or32(gv_xor32(gv_add8(std::move(a), std::move(b)), std::move(r)), std::move(sat)); + d = r; } else { - d._s8[i] = static_cast(sum); + d = gv_adds_s8(std::move(a), std::move(b)); } - } + }; - return true; + RETURN(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb], ppu.sat); } -bool ppu_interpreter_fast::VADDSHS(ppu_thread& ppu, ppu_opcode_t op) +template +auto VADDSHS() { - ppu.vr[op.vd].vi = _mm_adds_epi16(ppu.vr[op.va].vi, ppu.vr[op.vb].vi); - return true; -} + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select(); -bool ppu_interpreter_precise::VADDSHS(ppu_thread& ppu, ppu_opcode_t op) -{ - const auto& a = ppu.vr[op.va]; - const auto& b = ppu.vr[op.vb]; - auto& d = ppu.vr[op.vd]; - - for (u8 i = 0; i < 8; i++) + static const auto exec = [](auto&& d, auto&& a, auto&& b, auto&& sat) { - const s32 sum = a._s16[i] + b._s16[i]; - - if (sum < INT16_MIN) + if constexpr (((Flags == set_sat) || ...)) { - d._s16[i] = INT16_MIN; - ppu.sat._u32[0] = 1; - } - else if (sum > INT16_MAX) - { - d._s16[i] = INT16_MAX; - ppu.sat._u32[0] = 1; + auto r = gv_adds_s16(a, b); + sat = gv_or32(gv_xor32(gv_add16(std::move(a), std::move(b)), std::move(r)), 
std::move(sat)); + d = r; } else { - d._s16[i] = static_cast(sum); + d = gv_adds_s16(std::move(a), std::move(b)); } - } + }; - return true; + RETURN(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb], ppu.sat); } -// TODO: fix -bool ppu_interpreter_fast::VADDSWS(ppu_thread& ppu, ppu_opcode_t op) +template +auto VADDSWS() { - const auto a = ppu.vr[op.va]; - const auto b = ppu.vr[op.vb]; - const auto s = v128::add32(a, b); // a + b - const auto m = (a ^ s) & (b ^ s); // overflow bit - const auto x = _mm_srai_epi32(m.vi, 31); // saturation mask - const auto y = _mm_srai_epi32(_mm_and_si128(s.vi, m.vi), 31); // positive saturation mask - ppu.vr[op.vd].vi = _mm_xor_si128(_mm_xor_si128(_mm_srli_epi32(x, 1), y), _mm_or_si128(s.vi, x)); - return true; -} + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select(); -bool ppu_interpreter_precise::VADDSWS(ppu_thread& ppu, ppu_opcode_t op) -{ - const auto& a = ppu.vr[op.va]; - const auto& b = ppu.vr[op.vb]; - auto& d = ppu.vr[op.vd]; - - for (u8 i = 0; i < 4; i++) + static const auto exec = [](auto&& d, auto&& a, auto&& b, auto&& sat) { - const s64 sum = s64{a._s32[i]} + b._s32[i]; - - if (sum < INT32_MIN) + if constexpr (((Flags == set_sat) || ...)) { - d._s32[i] = INT32_MIN; - ppu.sat._u32[0] = 1; - } - else if (sum > INT32_MAX) - { - d._s32[i] = INT32_MAX; - ppu.sat._u32[0] = 1; + auto r = gv_adds_s32(a, b); + sat = gv_or32(gv_xor32(gv_add32(std::move(a), std::move(b)), std::move(r)), std::move(sat)); + d = r; } else { - d._s32[i] = static_cast(sum); + d = gv_adds_s32(std::move(a), std::move(b)); } - } + }; - return true; + RETURN(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb], ppu.sat); } -bool ppu_interpreter::VADDUBM(ppu_thread& ppu, ppu_opcode_t op) +template +auto VADDUBM() { - ppu.vr[op.vd] = v128::add8(ppu.vr[op.va], ppu.vr[op.vb]); - return true; -} + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); -bool ppu_interpreter_fast::VADDUBS(ppu_thread& ppu, ppu_opcode_t op) -{ - ppu.vr[op.vd].vi = _mm_adds_epu8(ppu.vr[op.va].vi, ppu.vr[op.vb].vi); - return true; -} - -bool ppu_interpreter_precise::VADDUBS(ppu_thread& ppu, ppu_opcode_t op) -{ - const auto& a = ppu.vr[op.va]; - const auto& b = ppu.vr[op.vb]; - auto& d = ppu.vr[op.vd]; - - for (u8 i = 0; i < 16; i++) + static const auto exec = [](auto&& d, auto&& a, auto&& b) { - const u16 sum = a._u8[i] + b._u8[i]; + d = gv_add8(std::move(a), std::move(b)); + }; - if (sum > UINT8_MAX) + RETURN(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]); +} + +template +auto VADDUBS() +{ + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select(); + + static const auto exec = [](auto&& d, auto&& a, auto&& b, auto&& sat) + { + if constexpr (((Flags == set_sat) || ...)) { - d._u8[i] = UINT8_MAX; - ppu.sat._u32[0] = 1; + auto r = gv_addus_u8(a, b); + sat = gv_or32(gv_xor32(gv_add8(std::move(a), std::move(b)), std::move(r)), std::move(sat)); + d = r; } else { - d._u8[i] = static_cast(sum); + d = gv_addus_u8(std::move(a), std::move(b)); } - } + }; - return true; + RETURN(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb], ppu.sat); } -bool ppu_interpreter::VADDUHM(ppu_thread& ppu, ppu_opcode_t op) +template +auto VADDUHM() { - ppu.vr[op.vd] = v128::add16(ppu.vr[op.va], ppu.vr[op.vb]); - return true; -} + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); -bool ppu_interpreter_fast::VADDUHS(ppu_thread& ppu, ppu_opcode_t op) -{ - ppu.vr[op.vd].vi = _mm_adds_epu16(ppu.vr[op.va].vi, ppu.vr[op.vb].vi); - return true; -} - -bool 
ppu_interpreter_precise::VADDUHS(ppu_thread& ppu, ppu_opcode_t op) -{ - const auto& a = ppu.vr[op.va]; - const auto& b = ppu.vr[op.vb]; - auto& d = ppu.vr[op.vd]; - - for (u8 i = 0; i < 8; i++) + static const auto exec = [](auto&& d, auto&& a, auto&& b) { - const u32 sum = a._u16[i] + b._u16[i]; + d = gv_add16(std::move(a), std::move(b)); + }; - if (sum > UINT16_MAX) + RETURN(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]); +} + +template +auto VADDUHS() +{ + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select(); + + static const auto exec = [](auto&& d, auto&& a, auto&& b, auto&& sat) + { + if constexpr (((Flags == set_sat) || ...)) { - d._u16[i] = UINT16_MAX; - ppu.sat._u32[0] = 1; + auto r = gv_addus_u16(a, b); + sat = gv_or32(gv_xor32(gv_add16(std::move(a), std::move(b)), std::move(r)), std::move(sat)); + d = r; } else { - d._u16[i] = static_cast(sum); + d = gv_addus_u16(std::move(a), std::move(b)); } - } + }; - return true; + RETURN(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb], ppu.sat); } -bool ppu_interpreter::VADDUWM(ppu_thread& ppu, ppu_opcode_t op) +template +auto VADDUWM() { - ppu.vr[op.vd] = v128::add32(ppu.vr[op.va], ppu.vr[op.vb]); - return true; -} + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); -// TODO: fix -bool ppu_interpreter_fast::VADDUWS(ppu_thread& ppu, ppu_opcode_t op) -{ - const auto a = ppu.vr[op.va].vi; - const auto b = ppu.vr[op.vb].vi; - ppu.vr[op.vd].vi = _mm_or_si128(_mm_add_epi32(a, b), _mm_cmpgt_epi32(_mm_xor_si128(b, _mm_set1_epi32(0x80000000)), _mm_xor_si128(a, _mm_set1_epi32(0x7fffffff)))); - return true; -} - -bool ppu_interpreter_precise::VADDUWS(ppu_thread& ppu, ppu_opcode_t op) -{ - const auto& a = ppu.vr[op.va]; - const auto& b = ppu.vr[op.vb]; - auto& d = ppu.vr[op.vd]; - - for (u8 i = 0; i < 4; i++) + static const auto exec = [](auto&& d, auto&& a, auto&& b) { - const u64 sum = u64{a._u32[i]} + b._u32[i]; + d = gv_add32(std::move(a), std::move(b)); + }; - if (sum > UINT32_MAX) + RETURN(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]); +} + +template +auto VADDUWS() +{ + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select(); + + static const auto exec = [](auto&& d, auto&& a, auto&& b, auto&& sat) + { + if constexpr (((Flags == set_sat) || ...)) { - d._u32[i] = UINT32_MAX; - ppu.sat._u32[0] = 1; + auto r = gv_addus_u32(a, b); + sat = gv_or32(gv_xor32(gv_add32(std::move(a), std::move(b)), std::move(r)), std::move(sat)); + d = r; } else { - d._u32[i] = static_cast(sum); + d = gv_addus_u32(std::move(a), std::move(b)); } - } + }; - return true; + RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb], ppu.sat); } -bool ppu_interpreter::VAND(ppu_thread& ppu, ppu_opcode_t op) +template +auto VAND() { - ppu.vr[op.vd] = ppu.vr[op.va] & ppu.vr[op.vb]; - return true; -} + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); -bool ppu_interpreter::VANDC(ppu_thread& ppu, ppu_opcode_t op) -{ - ppu.vr[op.vd] = v128::andnot(ppu.vr[op.vb], ppu.vr[op.va]); - return true; -} - -bool ppu_interpreter::VAVGSB(ppu_thread& ppu, ppu_opcode_t op) -{ - const auto a = ppu.vr[op.va]; - const auto b = v128::add8(ppu.vr[op.vb], v128::from8p(1)); // add 1 - const auto summ = v128::add8(a, b) & v128::from8p(0xfe); - const auto sign = v128::from8p(0x80); - const auto overflow = (((a ^ summ) & (b ^ summ)) ^ summ ^ v128::eq8(b, sign)) & sign; // calculate msb - ppu.vr[op.vd].vi = _mm_or_si128(overflow.vi, _mm_srli_epi64(summ.vi, 1)); - return true; -} - -bool ppu_interpreter::VAVGSH(ppu_thread& 
ppu, ppu_opcode_t op) -{ - const auto a = ppu.vr[op.va]; - const auto b = v128::add16(ppu.vr[op.vb], v128::from16p(1)); // add 1 - const auto summ = v128::add16(a, b); - const auto sign = v128::from16p(0x8000); - const auto overflow = (((a ^ summ) & (b ^ summ)) ^ summ ^ v128::eq16(b, sign)) & sign; // calculate msb - ppu.vr[op.vd].vi = _mm_or_si128(overflow.vi, _mm_srli_epi16(summ.vi, 1)); - return true; -} - -bool ppu_interpreter::VAVGSW(ppu_thread& ppu, ppu_opcode_t op) -{ - const auto a = ppu.vr[op.va]; - const auto b = v128::add32(ppu.vr[op.vb], v128::from32p(1)); // add 1 - const auto summ = v128::add32(a, b); - const auto sign = v128::from32p(0x80000000); - const auto overflow = (((a ^ summ) & (b ^ summ)) ^ summ ^ v128::eq32(b, sign)) & sign; // calculate msb - ppu.vr[op.vd].vi = _mm_or_si128(overflow.vi, _mm_srli_epi32(summ.vi, 1)); - return true; -} - -bool ppu_interpreter::VAVGUB(ppu_thread& ppu, ppu_opcode_t op) -{ - ppu.vr[op.vd].vi = _mm_avg_epu8(ppu.vr[op.va].vi, ppu.vr[op.vb].vi); - return true; -} - -bool ppu_interpreter::VAVGUH(ppu_thread& ppu, ppu_opcode_t op) -{ - ppu.vr[op.vd].vi = _mm_avg_epu16(ppu.vr[op.va].vi, ppu.vr[op.vb].vi); - return true; -} - -bool ppu_interpreter::VAVGUW(ppu_thread& ppu, ppu_opcode_t op) -{ - const auto a = ppu.vr[op.va]; - const auto b = ppu.vr[op.vb]; - const auto summ = v128::add32(v128::add32(a, b), v128::from32p(1)); - const auto carry = _mm_xor_si128(_mm_slli_epi32(sse_cmpgt_epu32(summ.vi, a.vi), 31), _mm_set1_epi32(0x80000000)); - ppu.vr[op.vd].vi = _mm_or_si128(carry, _mm_srli_epi32(summ.vi, 1)); - return true; -} - -bool ppu_interpreter::VCFSX(ppu_thread& ppu, ppu_opcode_t op) -{ - ppu.vr[op.vd].vf = _mm_mul_ps(_mm_cvtepi32_ps(ppu.vr[op.vb].vi), g_ppu_scale_table[0 - op.vuimm]); - return true; -} - -bool ppu_interpreter::VCFUX(ppu_thread& ppu, ppu_opcode_t op) -{ - const auto b = ppu.vr[op.vb].vi; - const auto fix = _mm_and_ps(_mm_castsi128_ps(_mm_srai_epi32(b, 31)), _mm_set1_ps(0x80000000)); - ppu.vr[op.vd].vf = _mm_mul_ps(_mm_add_ps(_mm_cvtepi32_ps(_mm_and_si128(b, _mm_set1_epi32(0x7fffffff))), fix), g_ppu_scale_table[0 - op.vuimm]); - return true; -} - -bool ppu_interpreter::VCMPBFP(ppu_thread& ppu, ppu_opcode_t op) -{ - const auto a = ppu.vr[op.va].vf; - const auto b = ppu.vr[op.vb].vf; - const auto sign = _mm_castsi128_ps(_mm_set1_epi32(0x80000000)); - const auto cmp1 = _mm_cmpnle_ps(a, b); - const auto cmp2 = _mm_cmpnge_ps(a, _mm_xor_ps(b, sign)); - ppu.vr[op.vd].vf = _mm_or_ps(_mm_and_ps(cmp1, sign), _mm_and_ps(cmp2, _mm_castsi128_ps(_mm_set1_epi32(0x40000000)))); - if (op.oe) [[unlikely]] ppu_cr_set(ppu, 6, false, false, _mm_movemask_ps(_mm_or_ps(cmp1, cmp2)) == 0, false); - return true; -} - -bool ppu_interpreter::VCMPEQFP(ppu_thread& ppu, ppu_opcode_t op) -{ - const auto rmask = _mm_movemask_ps(ppu.vr[op.vd].vf = _mm_cmpeq_ps(ppu.vr[op.va].vf, ppu.vr[op.vb].vf)); - if (op.oe) [[unlikely]] ppu_cr_set(ppu, 6, rmask == 0xf, false, rmask == 0, false); - return true; -} - -bool ppu_interpreter::VCMPEQUB(ppu_thread& ppu, ppu_opcode_t op) -{ - const auto rmask = _mm_movemask_epi8((ppu.vr[op.vd] = v128::eq8(ppu.vr[op.va], ppu.vr[op.vb])).vi); - if (op.oe) [[unlikely]] ppu_cr_set(ppu, 6, rmask == 0xffff, false, rmask == 0, false); - return true; -} - -bool ppu_interpreter::VCMPEQUH(ppu_thread& ppu, ppu_opcode_t op) -{ - const auto rmask = _mm_movemask_epi8((ppu.vr[op.vd] = v128::eq16(ppu.vr[op.va], ppu.vr[op.vb])).vi); - if (op.oe) [[unlikely]] ppu_cr_set(ppu, 6, rmask == 0xffff, false, rmask == 0, false); - return true; -} - -bool 
ppu_interpreter::VCMPEQUW(ppu_thread& ppu, ppu_opcode_t op) -{ - const auto rmask = _mm_movemask_epi8((ppu.vr[op.vd] = v128::eq32(ppu.vr[op.va], ppu.vr[op.vb])).vi); - if (op.oe) [[unlikely]] ppu_cr_set(ppu, 6, rmask == 0xffff, false, rmask == 0, false); - return true; -} - -bool ppu_interpreter::VCMPGEFP(ppu_thread& ppu, ppu_opcode_t op) -{ - const auto rmask = _mm_movemask_ps(ppu.vr[op.vd].vf = _mm_cmpge_ps(ppu.vr[op.va].vf, ppu.vr[op.vb].vf)); - if (op.oe) [[unlikely]] ppu_cr_set(ppu, 6, rmask == 0xf, false, rmask == 0, false); - return true; -} - -bool ppu_interpreter::VCMPGTFP(ppu_thread& ppu, ppu_opcode_t op) -{ - const auto rmask = _mm_movemask_ps(ppu.vr[op.vd].vf = _mm_cmpgt_ps(ppu.vr[op.va].vf, ppu.vr[op.vb].vf)); - if (op.oe) [[unlikely]] ppu_cr_set(ppu, 6, rmask == 0xf, false, rmask == 0, false); - return true; -} - -bool ppu_interpreter::VCMPGTSB(ppu_thread& ppu, ppu_opcode_t op) -{ - const auto rmask = _mm_movemask_epi8(ppu.vr[op.vd].vi = _mm_cmpgt_epi8(ppu.vr[op.va].vi, ppu.vr[op.vb].vi)); - if (op.oe) [[unlikely]] ppu_cr_set(ppu, 6, rmask == 0xffff, false, rmask == 0, false); - return true; -} - -bool ppu_interpreter::VCMPGTSH(ppu_thread& ppu, ppu_opcode_t op) -{ - const auto rmask = _mm_movemask_epi8(ppu.vr[op.vd].vi = _mm_cmpgt_epi16(ppu.vr[op.va].vi, ppu.vr[op.vb].vi)); - if (op.oe) [[unlikely]] ppu_cr_set(ppu, 6, rmask == 0xffff, false, rmask == 0, false); - return true; -} - -bool ppu_interpreter::VCMPGTSW(ppu_thread& ppu, ppu_opcode_t op) -{ - const auto rmask = _mm_movemask_epi8(ppu.vr[op.vd].vi = _mm_cmpgt_epi32(ppu.vr[op.va].vi, ppu.vr[op.vb].vi)); - if (op.oe) [[unlikely]] ppu_cr_set(ppu, 6, rmask == 0xffff, false, rmask == 0, false); - return true; -} - -bool ppu_interpreter::VCMPGTUB(ppu_thread& ppu, ppu_opcode_t op) -{ - const auto rmask = _mm_movemask_epi8(ppu.vr[op.vd].vi = sse_cmpgt_epu8(ppu.vr[op.va].vi, ppu.vr[op.vb].vi)); - if (op.oe) [[unlikely]] ppu_cr_set(ppu, 6, rmask == 0xffff, false, rmask == 0, false); - return true; -} - -bool ppu_interpreter::VCMPGTUH(ppu_thread& ppu, ppu_opcode_t op) -{ - const auto rmask = _mm_movemask_epi8(ppu.vr[op.vd].vi = sse_cmpgt_epu16(ppu.vr[op.va].vi, ppu.vr[op.vb].vi)); - if (op.oe) [[unlikely]] ppu_cr_set(ppu, 6, rmask == 0xffff, false, rmask == 0, false); - return true; -} - -bool ppu_interpreter::VCMPGTUW(ppu_thread& ppu, ppu_opcode_t op) -{ - const auto rmask = _mm_movemask_epi8(ppu.vr[op.vd].vi = sse_cmpgt_epu32(ppu.vr[op.va].vi, ppu.vr[op.vb].vi)); - if (op.oe) [[unlikely]] ppu_cr_set(ppu, 6, rmask == 0xffff, false, rmask == 0, false); - return true; -} - -// TODO: fix -bool ppu_interpreter_fast::VCTSXS(ppu_thread& ppu, ppu_opcode_t op) -{ - const auto scaled = _mm_mul_ps(ppu.vr[op.vb].vf, g_ppu_scale_table[op.vuimm]); - ppu.vr[op.vd].vi = _mm_xor_si128(_mm_cvttps_epi32(scaled), _mm_castps_si128(_mm_cmpge_ps(scaled, _mm_set1_ps(0x80000000)))); - return true; -} - -bool ppu_interpreter_precise::VCTSXS(ppu_thread& ppu, ppu_opcode_t op) -{ - const auto uim = op.vuimm; - const auto& b = ppu.vr[op.vb]; - auto& d = ppu.vr[op.vd]; - - for (u8 i = 0; i < 4; i++) + static const auto exec = [](auto&& d, auto&& a, auto&& b) { - const f32 X = b._f[i]; - const bool sign = std::signbit(X); - const s32 exp = fexpf(X); - const u32 frac = std::bit_cast(X) << 9; - const s32 exp2 = exp + uim - 127; + d = gv_andfs(std::move(a), std::move(b)); + }; - if (exp == 255) + RETURN(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]); +} + +template +auto VANDC() +{ + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + 
static const auto exec = [](auto&& d, auto&& a, auto&& b) + { + d = gv_andnfs(std::move(b), std::move(a)); + }; + + RETURN(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]); +} + +template +auto VAVGSB() +{ + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](auto&& d, auto&& a, auto&& b) + { + d = gv_avgs8(a, b); + }; + + RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]); +} + +template +auto VAVGSH() +{ + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](auto&& d, auto&& a, auto&& b) + { + d = gv_avgs16(a, b); + }; + + RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]); +} + +template +auto VAVGSW() +{ + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](auto&& d, auto&& a, auto&& b) + { + d = gv_avgs32(a, b); + }; + + RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]); +} + +template +auto VAVGUB() +{ + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](auto&& d, auto&& a, auto&& b) + { + d = gv_avgu8(a, b); + }; + + RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]); +} + +template +auto VAVGUH() +{ + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](auto&& d, auto&& a, auto&& b) + { + d = gv_avgu16(a, b); + }; + + RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]); +} + +template +auto VAVGUW() +{ + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](auto&& d, auto&& a, auto&& b) + { + d = gv_avgu32(a, b); + }; + + RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]); +} + +template +auto VCFSX() +{ + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](auto&& d, auto&& b, u32 i) + { + d = gv_subus_u16(gv_cvts32_tofs(b), gv_bcst32(i)); + }; + + RETURN_(ppu.vr[op.vd], ppu.vr[op.vb], op.vuimm << 23); +} + +template +auto VCFUX() +{ + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](auto&& d, auto&& b, u32 i) + { + d = gv_subus_u16(gv_cvtu32_tofs(b), gv_bcst32(i)); + }; + + RETURN_(ppu.vr[op.vd], ppu.vr[op.vb], op.vuimm << 23); +} + +template +auto VCMPBFP() +{ + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, auto&& d, auto&& a, auto&& b) + { + const auto sign = gv_bcstfs(-0.); + const auto cmp1 = gv_nlefs(a, b); + const auto cmp2 = gv_ngefs(a, b ^ sign); + const auto r = (cmp1 & sign) | gv_shr32(cmp2 & sign, 1); + if constexpr (((Flags == has_oe) || ...)) + ppu_cr_set(ppu, 6, false, false, gv_testz(r), false); + d = r; + }; + + RETURN_(ppu, ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]); +} + +template +auto VCMPEQFP() +{ + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, auto&& d, auto&& a, auto&& b) + { + const auto r = gv_eqfs(a, b); + if constexpr (((Flags == has_oe) || ...)) + ppu_cr_set(ppu, 6, gv_testall1(r), false, gv_testall0(r), false); + d = r; + }; + + RETURN_(ppu, ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]); +} + +template +auto VCMPEQUB() +{ + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, auto&& d, auto&& a, auto&& b) + { + const auto r = gv_eq8(a, b); + if constexpr 
(((Flags == has_oe) || ...)) + ppu_cr_set(ppu, 6, gv_testall1(r), false, gv_testall0(r), false); + d = r; + }; + + RETURN_(ppu, ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]); +} + +template +auto VCMPEQUH() +{ + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, auto&& d, auto&& a, auto&& b) + { + const auto r = gv_eq16(a, b); + if constexpr (((Flags == has_oe) || ...)) + ppu_cr_set(ppu, 6, gv_testall1(r), false, gv_testall0(r), false); + d = r; + }; + + RETURN_(ppu, ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]); +} + +template +auto VCMPEQUW() +{ + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, auto&& d, auto&& a, auto&& b) + { + const auto r = gv_eq32(a, b); + if constexpr (((Flags == has_oe) || ...)) + ppu_cr_set(ppu, 6, gv_testall1(r), false, gv_testall0(r), false); + d = r; + }; + + RETURN_(ppu, ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]); +} + +template +auto VCMPGEFP() +{ + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, auto&& d, auto&& a, auto&& b) + { + const auto r = gv_gefs(a, b); + if constexpr (((Flags == has_oe) || ...)) + ppu_cr_set(ppu, 6, gv_testall1(r), false, gv_testall0(r), false); + d = r; + }; + + RETURN_(ppu, ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]); +} + +template +auto VCMPGTFP() +{ + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, auto&& d, auto&& a, auto&& b) + { + const auto r = gv_gtfs(a, b); + if constexpr (((Flags == has_oe) || ...)) + ppu_cr_set(ppu, 6, gv_testall1(r), false, gv_testall0(r), false); + d = r; + }; + + RETURN_(ppu, ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]); +} + +template +auto VCMPGTSB() +{ + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, auto&& d, auto&& a, auto&& b) + { + const auto r = gv_gts8(a, b); + if constexpr (((Flags == has_oe) || ...)) + ppu_cr_set(ppu, 6, gv_testall1(r), false, gv_testall0(r), false); + d = r; + }; + + RETURN_(ppu, ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]); +} + +template +auto VCMPGTSH() +{ + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, auto&& d, auto&& a, auto&& b) + { + const auto r = gv_gts16(a, b); + if constexpr (((Flags == has_oe) || ...)) + ppu_cr_set(ppu, 6, gv_testall1(r), false, gv_testall0(r), false); + d = r; + }; + + RETURN_(ppu, ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]); +} + +template +auto VCMPGTSW() +{ + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, auto&& d, auto&& a, auto&& b) + { + const auto r = gv_gts32(a, b); + if constexpr (((Flags == has_oe) || ...)) + ppu_cr_set(ppu, 6, gv_testall1(r), false, gv_testall0(r), false); + d = r; + }; + + RETURN_(ppu, ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]); +} + +template +auto VCMPGTUB() +{ + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, auto&& d, auto&& a, auto&& b) + { + const auto r = gv_gtu8(a, b); + if constexpr (((Flags == has_oe) || ...)) + ppu_cr_set(ppu, 6, gv_testall1(r), false, gv_testall0(r), false); + d = r; + }; + + RETURN_(ppu, ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]); +} + +template +auto 
VCMPGTUH() +{ + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, auto&& d, auto&& a, auto&& b) + { + const auto r = gv_gtu16(a, b); + if constexpr (((Flags == has_oe) || ...)) + ppu_cr_set(ppu, 6, gv_testall1(r), false, gv_testall0(r), false); + d = r; + }; + + RETURN_(ppu, ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]); +} + +template +auto VCMPGTUW() +{ + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, auto&& d, auto&& a, auto&& b) + { + const auto r = gv_gtu32(a, b); + if constexpr (((Flags == has_oe) || ...)) + ppu_cr_set(ppu, 6, gv_testall1(r), false, gv_testall0(r), false); + d = r; + }; + + RETURN_(ppu, ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]); +} + +template +auto VCTSXS() +{ + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select(); + + static const auto exec = [](auto&& d, auto&& b, auto&& sat, u32 i) + { + const auto s = gv_mulfs(b, gv_bcst32(i)); + const auto l = gv_ltfs(s, gv_bcstfs(-2147483648.)); + const auto h = gv_gefs(s, gv_bcstfs(2147483648.)); + v128 r = s; +#if !defined(ARCH_X64) && !defined(ARCH_ARM64) + r = gv_selectfs(l, gv_bcstfs(-2147483648.), r); +#endif + r = gv_cvtfs_tos32(s); +#if !defined(ARCH_ARM64) + r = gv_select32(h, gv_bcst32(0x7fffffff), r); +#endif + if constexpr (((Flags == fix_vnan) || ...)) + r = r & gv_eqfs(b, b); + if constexpr (((Flags == set_sat) || ...)) + sat = sat | l | h; + d = r; + }; + + RETURN_(ppu.vr[op.vd], ppu.vr[op.vb], ppu.sat, (op.vuimm + 127) << 23); +} + +template +auto VCTUXS() +{ + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select(); + + static const auto exec = [](auto&& d, auto&& b, auto&& sat, u32 i) + { + const auto s = gv_mulfs(b, gv_bcst32(i)); + const auto l = gv_ltfs(s, gv_bcstfs(0.)); + const auto h = gv_gefs(s, gv_bcstfs(4294967296.)); + v128 r = gv_cvtfs_tou32(s); +#if !defined(ARCH_ARM64) + r = gv_andn(l, r); // saturate to zero +#endif +#if !defined(__AVX512VL__) && !defined(ARCH_ARM64) + r = r | h; // saturate to 0xffffffff +#endif + if constexpr (((Flags == fix_vnan) || ...)) + r = r & gv_eqfs(b, b); + if constexpr (((Flags == set_sat) || ...)) + sat = sat | l | h; + d = r; + }; + + RETURN_(ppu.vr[op.vd], ppu.vr[op.vb], ppu.sat, (op.vuimm + 127) << 23); +} + +template +auto VEXPTEFP() +{ + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select(); + + static const auto exec = [](auto&& d, auto&& b) + { + // for (u32 i = 0; i < 4; i++) d._f[i] = std::exp2f(b._f[i]); + d = ppu_set_vnan(gv_exp2_approxfs(b)); + }; + + RETURN_(ppu.vr[op.vd], ppu.vr[op.vb]); +} + +template +auto VLOGEFP() +{ + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select(); + + static const auto exec = [](auto&& d, auto&& b) + { + // for (u32 i = 0; i < 4; i++) d._f[i] = std::log2f(b._f[i]); + d = ppu_set_vnan(gv_log2_approxfs(b)); + }; + + RETURN_(ppu.vr[op.vd], ppu.vr[op.vb]); +} + +template +auto VMADDFP() +{ + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select(); + + static const auto exec = [](auto&& d, auto&& a_, auto&& b_, auto&& c_, auto&& jm_mask) + { + const auto m = gv_bcst32(jm_mask, &ppu_thread::jm_mask); + const auto a = ppu_flush_denormal(m, a_); + const auto b = ppu_flush_denormal(m, b_); + const auto c = ppu_flush_denormal(m, c_); + d = ppu_flush_denormal(m, ppu_set_vnan(gv_fmafs(a, c, b))); + }; + + RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb], ppu.vr[op.vc], 
ppu.jm_mask); +} + +template +auto VMAXFP() +{ + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select(); + + static const auto exec = [](auto&& d, auto&& a, auto&& b, auto&& jm_mask) + { + d = ppu_flush_denormal(gv_bcst32(jm_mask, &ppu_thread::jm_mask), ppu_set_vnan(gv_maxfs(a, b), a, b)); + }; + + RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb], ppu.jm_mask); +} + +template +auto VMAXSB() +{ + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](auto&& d, auto&& a, auto&& b) + { + d = gv_maxs8(a, b); + }; + + RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]); +} + +template +auto VMAXSH() +{ + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](auto&& d, auto&& a, auto&& b) + { + d = gv_maxs16(a, b); + }; + + RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]); +} + +template +auto VMAXSW() +{ + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](auto&& d, auto&& a, auto&& b) + { + d = gv_maxs32(a, b); + }; + + RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]); +} + +template +auto VMAXUB() +{ + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](auto&& d, auto&& a, auto&& b) + { + d = gv_maxu8(a, b); + }; + + RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]); +} + +template +auto VMAXUH() +{ + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](auto&& d, auto&& a, auto&& b) + { + d = gv_maxu16(a, b); + }; + + RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]); +} + +template +auto VMAXUW() +{ + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](auto&& d, auto&& a, auto&& b) + { + d = gv_maxu32(a, b); + }; + + RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]); +} + +template +auto VMHADDSHS() +{ + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select(); + + static const auto exec = [](auto&& d, auto&& a, auto&& b, auto&& c, auto&& sat) + { + const auto m = gv_muls_hds16(a, b); + const auto f = gv_gts16(gv_bcst16(0), c); + const auto x = gv_eq16(gv_maxs16(a, b), gv_bcst16(0x8000)); + const auto r = gv_sub16(gv_adds_s16(m, c), x & f); + const auto s = gv_add16(m, c); + if constexpr (((Flags == set_sat) || ...)) + sat = sat | gv_andn(x, s ^ r) | gv_andn(f, x); + d = r; + }; + + RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb], ppu.vr[op.vc], ppu.sat); +} + +template +auto VMHRADDSHS() +{ + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select(); + + static const auto exec = [](auto&& d, auto&& a, auto&& b, auto&& c, auto&& sat) + { + if constexpr (((Flags != set_sat) && ...)) { - if (frac != 0) + d = gv_rmuladds_hds16(a, b, c); + } + else + { + const auto m = gv_rmuls_hds16(a, b); + const auto f = gv_gts16(gv_bcst16(0), c); + const auto x = gv_eq16(gv_maxs16(a, b), gv_bcst16(0x8000)); + const auto r = gv_sub16(gv_adds_s16(m, c), x & f); + const auto s = gv_add16(m, c); + if constexpr (((Flags == set_sat) || ...)) + sat = sat | gv_andn(x, s ^ r) | gv_andn(f, x); + d = r; + } + }; + + RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb], ppu.vr[op.vc], ppu.sat); +} + +template +auto VMINFP() +{ + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select(); + + static const auto exec = [](auto&& d, auto&& a, auto&& b, auto&& jm_mask) + { + d = 
ppu_flush_denormal(gv_bcst32(jm_mask, &ppu_thread::jm_mask), ppu_set_vnan(gv_minfs(a, b), a, b)); + }; + + RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb], ppu.jm_mask); +} + +template +auto VMINSB() +{ + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](auto&& d, auto&& a, auto&& b) + { + d = gv_mins8(a, b); + }; + + RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]); +} + +template +auto VMINSH() +{ + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](auto&& d, auto&& a, auto&& b) + { + d = gv_mins16(a, b); + }; + + RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]); +} + +template +auto VMINSW() +{ + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](auto&& d, auto&& a, auto&& b) + { + d = gv_mins32(a, b); + }; + + RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]); +} + +template +auto VMINUB() +{ + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](auto&& d, auto&& a, auto&& b) + { + d = gv_minu8(a, b); + }; + + RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]); +} + +template +auto VMINUH() +{ + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](auto&& d, auto&& a, auto&& b) + { + d = gv_minu16(a, b); + }; + + RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]); +} + +template +auto VMINUW() +{ + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](auto&& d, auto&& a, auto&& b) + { + d = gv_minu32(a, b); + }; + + RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]); +} + +template +auto VMLADDUHM() +{ + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](auto&& d, auto&& a, auto&& b, auto&& c) + { + d = gv_muladd16(a, b, c); + }; + + RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb], ppu.vr[op.vc]); +} + +template +auto VMRGHB() +{ + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](auto&& d, auto&& a, auto&& b) + { + d = gv_unpackhi8(b, a); + }; + + RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]); +} + +template +auto VMRGHH() +{ + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](auto&& d, auto&& a, auto&& b) + { + d = gv_unpackhi16(b, a); + }; + + RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]); +} + +template +auto VMRGHW() +{ + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](auto&& d, auto&& a, auto&& b) + { + d = gv_unpackhi32(b, a); + }; + + RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]); +} + +template +auto VMRGLB() +{ + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](auto&& d, auto&& a, auto&& b) + { + d = gv_unpacklo8(b, a); + }; + + RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]); +} + +template +auto VMRGLH() +{ + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](auto&& d, auto&& a, auto&& b) + { + d = gv_unpacklo16(b, a); + }; + + RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]); +} + +template +auto VMRGLW() +{ + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](auto&& 
d, auto&& a, auto&& b) + { + d = gv_unpacklo32(b, a); + }; + + RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]); +} + +template +auto VMSUMMBM() +{ + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](auto&& d, auto&& a, auto&& b, auto&& c) + { + d = gv_dotu8s8x4(b, a, c); + }; + + RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb], ppu.vr[op.vc]); +} + +template +auto VMSUMSHM() +{ + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](auto&& d, auto&& a, auto&& b, auto&& c) + { + d = gv_dots16x2(a, b, c); + }; + + RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb], ppu.vr[op.vc]); +} + +template +auto VMSUMSHS() +{ + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select(); + + static const auto exec = [](auto&& d, auto&& a, auto&& b, auto&& c, auto&& sat) + { + const auto r = gv_dots_s16x2(a, b, c); + const auto s = gv_dots16x2(a, b, c); + d = r; + if constexpr (((Flags == set_sat) || ...)) + sat = sat | (s ^ r); + }; + + RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb], ppu.vr[op.vc], ppu.sat); +} + +template +auto VMSUMUBM() +{ + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](auto&& d, auto&& a, auto&& b, auto&& c) + { + d = gv_dotu8x4(a, b, c); + }; + + RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb], ppu.vr[op.vc]); +} + +template +auto VMSUMUHM() +{ + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](auto&& d, auto&& a, auto&& b, auto&& c) + { + d = gv_add32(c, gv_dotu16x2(a, b)); + }; + + RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb], ppu.vr[op.vc]); +} + +template +auto VMSUMUHS() +{ + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select(); + + static const auto exec = [](auto&& d, auto&& a, auto&& b, auto&& c, auto&& sat) + { + for (uint w = 0; w < 4; w++) + { + u64 result = 0; + u32 saturated = 0; + + for (uint h = 0; h < 2; h++) { - d._s32[i] = 0; + result += u64{a._u16[w * 2 + h]} * b._u16[w * 2 + h]; + } + + result += c._u32[w]; + + if (result > 0xffffffffu) + { + saturated = 0xffffffff; + if constexpr (((Flags == set_sat) || ...)) + sat._u32[0] = 1; } else - { - ppu.sat._u32[0] = 1; - d._s32[i] = sign ? 0x80000000 : 0x7FFFFFFF; - } - } - else if (exp2 > 30) - { - ppu.sat._u32[0] = 1; - d._s32[i] = sign ? 0x80000000 : 0x7FFFFFFF; - } - else if (exp2 < 0) - { - d._s32[i] = 0; - } - else - { - s32 significand = (0x80000000 | (frac >> 1)) >> (31 - exp2); - d._s32[i] = sign ? 
-significand : significand; - } - } + saturated = static_cast(result); - return true; + d._u32[w] = saturated; + } + }; + + RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb], ppu.vr[op.vc], ppu.sat); } -bool ppu_interpreter_fast::VCTUXS(ppu_thread& ppu, ppu_opcode_t op) +template +auto VMULESB() { - const auto scaled1 = _mm_max_ps(_mm_mul_ps(ppu.vr[op.vb].vf, g_ppu_scale_table[op.vuimm]), _mm_set1_ps(0.0f)); - const auto scaled2 = _mm_and_ps(_mm_sub_ps(scaled1, _mm_set1_ps(0x80000000)), _mm_cmpge_ps(scaled1, _mm_set1_ps(0x80000000))); - ppu.vr[op.vd].vi = _mm_or_si128(_mm_or_si128(_mm_cvttps_epi32(scaled1), _mm_cvttps_epi32(scaled2)), _mm_castps_si128(_mm_cmpge_ps(scaled1, _mm_set1_ps(0x100000000)))); - return true; -} + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); -bool ppu_interpreter_precise::VCTUXS(ppu_thread& ppu, ppu_opcode_t op) -{ - const auto uim = op.vuimm; - const auto& b = ppu.vr[op.vb]; - auto& d = ppu.vr[op.vd]; - - for (u8 i = 0; i < 4; i++) + static const auto exec = [](auto&& d, auto&& a, auto&& b) { - const f32 X = b._f[i]; - const bool sign = std::signbit(X); - const s32 exp = fexpf(X); - const u32 frac = std::bit_cast(X) << 9; - const s32 exp2 = exp + uim - 127; + d = _mm_mullo_epi16(_mm_srai_epi16(a, 8), _mm_srai_epi16(b, 8)); + }; - if (exp == 255) - { - if (frac != 0) - { - d._u32[i] = 0; - } - else - { - ppu.sat._u32[0] = 1; - d._u32[i] = sign ? 0 : 0xFFFFFFFF; - } - } - else if (exp2 > 31) - { - ppu.sat._u32[0] = 1; - d._u32[i] = sign ? 0 : 0xFFFFFFFF; - } - else if (exp2 < 0) - { - d._u32[i] = 0; - } - else if (sign) - { - ppu.sat._u32[0] = 1; - d._u32[i] = 0; - } - else - { - d._u32[i] = (0x80000000 | (frac >> 1)) >> (31 - exp2); - } - } - - return true; + RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]); } -bool ppu_interpreter::VEXPTEFP(ppu_thread& ppu, ppu_opcode_t op) +template +auto VMULESH() { - ppu.vr[op.vd].vf = sse_exp2_ps(ppu.vr[op.vb].vf); - return true; + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { + ppu.vr[op.vd] = _mm_madd_epi16(_mm_srli_epi32(ppu.vr[op.va], 16), _mm_srli_epi32(ppu.vr[op.vb], 16)); + }; + RETURN_(ppu, op); } -bool ppu_interpreter::VLOGEFP(ppu_thread& ppu, ppu_opcode_t op) +template +auto VMULEUB() { - ppu.vr[op.vd].vf = sse_log2_ps(ppu.vr[op.vb].vf); - return true; + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { + ppu.vr[op.vd] = _mm_mullo_epi16(_mm_srli_epi16(ppu.vr[op.va], 8), _mm_srli_epi16(ppu.vr[op.vb], 8)); + }; + RETURN_(ppu, op); } -bool ppu_interpreter_fast::VMADDFP(ppu_thread& ppu, ppu_opcode_t op) +template +auto VMULEUH() { - const auto a = vec_handle_denormal(ppu, ppu.vr[op.va]).vf; - const auto b = vec_handle_denormal(ppu, ppu.vr[op.vb]).vf; - const auto c = vec_handle_denormal(ppu, ppu.vr[op.vc]).vf; - const auto result = _mm_add_ps(_mm_mul_ps(a, c), b); - ppu.vr[op.vd] = vec_handle_denormal(ppu, vec_handle_nan(result)); - return true; -} + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); -bool ppu_interpreter_precise::VMADDFP(ppu_thread& ppu, ppu_opcode_t op) -{ - const auto a = vec_handle_denormal(ppu, ppu.vr[op.va]); - const auto b = vec_handle_denormal(ppu, ppu.vr[op.vb]); - const auto c = vec_handle_denormal(ppu, ppu.vr[op.vc]); - ppu.vr[op.rd] = vec_handle_denormal(ppu, vec_handle_nan(v128::fma32f(a, c, b), a, b, c)); - return true; -} - -bool 
ppu_interpreter::VMAXFP(ppu_thread& ppu, ppu_opcode_t op) -{ + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const auto a = ppu.vr[op.va]; const auto b = ppu.vr[op.vb]; - const auto result = _mm_and_ps(_mm_max_ps(a.vf, b.vf), _mm_max_ps(b.vf, a.vf)); - ppu.vr[op.vd] = vec_handle_denormal(ppu, vec_handle_nan(v128::fromF(result), a, b)); - return true; -} - -bool ppu_interpreter::VMAXSB(ppu_thread& ppu, ppu_opcode_t op) -{ - const auto a = ppu.vr[op.va].vi; - const auto b = ppu.vr[op.vb].vi; - const auto m = _mm_cmpgt_epi8(a, b); - ppu.vr[op.vd].vi = _mm_or_si128(_mm_and_si128(m, a), _mm_andnot_si128(m, b)); - return true; -} - -bool ppu_interpreter::VMAXSH(ppu_thread& ppu, ppu_opcode_t op) -{ - ppu.vr[op.vd].vi = _mm_max_epi16(ppu.vr[op.va].vi, ppu.vr[op.vb].vi); - return true; -} - -bool ppu_interpreter::VMAXSW(ppu_thread& ppu, ppu_opcode_t op) -{ - const auto a = ppu.vr[op.va].vi; - const auto b = ppu.vr[op.vb].vi; - const auto m = _mm_cmpgt_epi32(a, b); - ppu.vr[op.vd].vi = _mm_or_si128(_mm_and_si128(m, a), _mm_andnot_si128(m, b)); - return true; -} - -bool ppu_interpreter::VMAXUB(ppu_thread& ppu, ppu_opcode_t op) -{ - ppu.vr[op.vd].vi = _mm_max_epu8(ppu.vr[op.va].vi, ppu.vr[op.vb].vi); - return true; -} - -bool ppu_interpreter::VMAXUH(ppu_thread& ppu, ppu_opcode_t op) -{ - const auto mask = _mm_set1_epi32(0x80008000); - ppu.vr[op.vd].vi = _mm_xor_si128(_mm_max_epi16(_mm_xor_si128(ppu.vr[op.va].vi, mask), _mm_xor_si128(ppu.vr[op.vb].vi, mask)), mask); - return true; -} - -bool ppu_interpreter::VMAXUW(ppu_thread& ppu, ppu_opcode_t op) -{ - const auto a = ppu.vr[op.va].vi; - const auto b = ppu.vr[op.vb].vi; - const auto m = sse_cmpgt_epu32(a, b); - ppu.vr[op.vd].vi = _mm_or_si128(_mm_and_si128(m, a), _mm_andnot_si128(m, b)); - return true; -} - -bool ppu_interpreter_fast::VMHADDSHS(ppu_thread& ppu, ppu_opcode_t op) -{ - const auto a = ppu.vr[op.va].vi; - const auto b = ppu.vr[op.vb].vi; - const auto c = ppu.vr[op.vc].vi; - const auto m = _mm_or_si128(_mm_srli_epi16(_mm_mullo_epi16(a, b), 15), _mm_slli_epi16(_mm_mulhi_epi16(a, b), 1)); - const auto s = _mm_cmpeq_epi16(m, _mm_set1_epi16(-0x8000)); // detect special case (positive 0x8000) - ppu.vr[op.vd].vi = _mm_adds_epi16(_mm_adds_epi16(_mm_xor_si128(m, s), c), _mm_srli_epi16(s, 15)); - return true; -} - -bool ppu_interpreter_precise::VMHADDSHS(ppu_thread& ppu, ppu_opcode_t op) -{ - const auto& a = ppu.vr[op.va]; - const auto& b = ppu.vr[op.vb]; - const auto& c = ppu.vr[op.vc]; - auto& d = ppu.vr[op.vd]; - - for (u8 i = 0; i < 8; i++) - { - const s32 prod = a._s16[i] * b._s16[i]; - const s32 sum = (prod >> 15) + c._s16[i]; - - if (sum < INT16_MIN) - { - d._s16[i] = INT16_MIN; - ppu.sat._u32[0] = 1; - } - else if (sum > INT16_MAX) - { - d._s16[i] = INT16_MAX; - ppu.sat._u32[0] = 1; - } - else - { - d._s16[i] = static_cast(sum); - } - } - - return true; -} - -bool ppu_interpreter_fast::VMHRADDSHS(ppu_thread& ppu, ppu_opcode_t op) -{ - const auto a = ppu.vr[op.va].vi; - const auto b = ppu.vr[op.vb].vi; - const auto c = ppu.vr[op.vc].vi; - const auto x80 = _mm_set1_epi16(0x80); // 0x80 * 0x80 = 0x4000, add this to the product - const auto al = _mm_unpacklo_epi16(a, x80); - const auto ah = _mm_unpackhi_epi16(a, x80); - const auto bl = _mm_unpacklo_epi16(b, x80); - const auto bh = _mm_unpackhi_epi16(b, x80); - const auto ml = _mm_srai_epi32(_mm_madd_epi16(al, bl), 15); - const auto mh = _mm_srai_epi32(_mm_madd_epi16(ah, bh), 15); - const auto cl = _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), c), 16); - const auto ch = 
_mm_srai_epi32(_mm_unpackhi_epi16(_mm_setzero_si128(), c), 16); - ppu.vr[op.vd].vi = _mm_packs_epi32(_mm_add_epi32(ml, cl), _mm_add_epi32(mh, ch)); - return true; -} - -bool ppu_interpreter_precise::VMHRADDSHS(ppu_thread& ppu, ppu_opcode_t op) -{ - const auto& a = ppu.vr[op.va]; - const auto& b = ppu.vr[op.vb]; - const auto& c = ppu.vr[op.vc]; - auto& d = ppu.vr[op.vd]; - - for (u8 i = 0; i < 8; i++) - { - const s32 prod = a._s16[i] * b._s16[i]; - const s32 sum = ((prod + 0x00004000) >> 15) + c._s16[i]; - - if (sum < INT16_MIN) - { - d._s16[i] = INT16_MIN; - ppu.sat._u32[0] = 1; - } - else if (sum > INT16_MAX) - { - d._s16[i] = INT16_MAX; - ppu.sat._u32[0] = 1; - } - else - { - d._s16[i] = static_cast(sum); - } - } - - return true; -} - -bool ppu_interpreter::VMINFP(ppu_thread& ppu, ppu_opcode_t op) -{ - const auto a = ppu.vr[op.va].vf; - const auto b = ppu.vr[op.vb].vf; - const auto result = _mm_or_ps(_mm_min_ps(a, b), _mm_min_ps(b, a)); - ppu.vr[op.vd] = vec_handle_denormal(ppu, vec_handle_nan(result, a, b)); - return true; -} - -bool ppu_interpreter::VMINSB(ppu_thread& ppu, ppu_opcode_t op) -{ - const auto a = ppu.vr[op.va].vi; - const auto b = ppu.vr[op.vb].vi; - const auto m = _mm_cmpgt_epi8(a, b); - ppu.vr[op.vd].vi = _mm_or_si128(_mm_andnot_si128(m, a), _mm_and_si128(m, b)); - return true; -} - -bool ppu_interpreter::VMINSH(ppu_thread& ppu, ppu_opcode_t op) -{ - ppu.vr[op.vd].vi = _mm_min_epi16(ppu.vr[op.va].vi, ppu.vr[op.vb].vi); - return true; -} - -bool ppu_interpreter::VMINSW(ppu_thread& ppu, ppu_opcode_t op) -{ - const auto a = ppu.vr[op.va].vi; - const auto b = ppu.vr[op.vb].vi; - const auto m = _mm_cmpgt_epi32(a, b); - ppu.vr[op.vd].vi = _mm_or_si128(_mm_andnot_si128(m, a), _mm_and_si128(m, b)); - return true; -} - -bool ppu_interpreter::VMINUB(ppu_thread& ppu, ppu_opcode_t op) -{ - ppu.vr[op.vd].vi = _mm_min_epu8(ppu.vr[op.va].vi, ppu.vr[op.vb].vi); - return true; -} - -bool ppu_interpreter::VMINUH(ppu_thread& ppu, ppu_opcode_t op) -{ - const auto mask = _mm_set1_epi32(0x80008000); - ppu.vr[op.vd].vi = _mm_xor_si128(_mm_min_epi16(_mm_xor_si128(ppu.vr[op.va].vi, mask), _mm_xor_si128(ppu.vr[op.vb].vi, mask)), mask); - return true; -} - -bool ppu_interpreter::VMINUW(ppu_thread& ppu, ppu_opcode_t op) -{ - const auto a = ppu.vr[op.va].vi; - const auto b = ppu.vr[op.vb].vi; - const auto m = sse_cmpgt_epu32(a, b); - ppu.vr[op.vd].vi = _mm_or_si128(_mm_andnot_si128(m, a), _mm_and_si128(m, b)); - return true; -} - -bool ppu_interpreter::VMLADDUHM(ppu_thread& ppu, ppu_opcode_t op) -{ - ppu.vr[op.vd].vi = _mm_add_epi16(_mm_mullo_epi16(ppu.vr[op.va].vi, ppu.vr[op.vb].vi), ppu.vr[op.vc].vi); - return true; -} - -bool ppu_interpreter::VMRGHB(ppu_thread& ppu, ppu_opcode_t op) -{ - ppu.vr[op.vd].vi = _mm_unpackhi_epi8(ppu.vr[op.vb].vi, ppu.vr[op.va].vi); - return true; -} - -bool ppu_interpreter::VMRGHH(ppu_thread& ppu, ppu_opcode_t op) -{ - ppu.vr[op.vd].vi = _mm_unpackhi_epi16(ppu.vr[op.vb].vi, ppu.vr[op.va].vi); - return true; -} - -bool ppu_interpreter::VMRGHW(ppu_thread& ppu, ppu_opcode_t op) -{ - ppu.vr[op.vd].vi = _mm_unpackhi_epi32(ppu.vr[op.vb].vi, ppu.vr[op.va].vi); - return true; -} - -bool ppu_interpreter::VMRGLB(ppu_thread& ppu, ppu_opcode_t op) -{ - ppu.vr[op.vd].vi = _mm_unpacklo_epi8(ppu.vr[op.vb].vi, ppu.vr[op.va].vi); - return true; -} - -bool ppu_interpreter::VMRGLH(ppu_thread& ppu, ppu_opcode_t op) -{ - ppu.vr[op.vd].vi = _mm_unpacklo_epi16(ppu.vr[op.vb].vi, ppu.vr[op.va].vi); - return true; -} - -bool ppu_interpreter::VMRGLW(ppu_thread& ppu, ppu_opcode_t op) -{ - 
ppu.vr[op.vd].vi = _mm_unpacklo_epi32(ppu.vr[op.vb].vi, ppu.vr[op.va].vi); - return true; -} - -bool ppu_interpreter::VMSUMMBM(ppu_thread& ppu, ppu_opcode_t op) -{ - const auto a = ppu.vr[op.va].vi; // signed bytes - const auto b = ppu.vr[op.vb].vi; // unsigned bytes - const auto c = ppu.vr[op.vc].vi; - const auto ah = _mm_srai_epi16(a, 8); - const auto bh = _mm_srli_epi16(b, 8); - const auto al = _mm_srai_epi16(_mm_slli_epi16(a, 8), 8); - const auto bl = _mm_and_si128(b, _mm_set1_epi16(0x00ff)); - const auto sh = _mm_madd_epi16(ah, bh); - const auto sl = _mm_madd_epi16(al, bl); - ppu.vr[op.vd].vi = _mm_add_epi32(_mm_add_epi32(c, sh), sl); - return true; -} - -bool ppu_interpreter::VMSUMSHM(ppu_thread& ppu, ppu_opcode_t op) -{ - ppu.vr[op.vd].vi = _mm_add_epi32(_mm_madd_epi16(ppu.vr[op.va].vi, ppu.vr[op.vb].vi), ppu.vr[op.vc].vi); - return true; -} - -bool ppu_interpreter_fast::VMSUMSHS(ppu_thread& ppu, ppu_opcode_t op) -{ - auto& d = ppu.vr[op.vd]; - const auto& a = ppu.vr[op.va]; - const auto& b = ppu.vr[op.vb]; - const auto& c = ppu.vr[op.vc]; - - for (uint w = 0; w < 4; w++) - { - s64 result = 0; - s32 saturated = 0; - - for (uint h = 0; h < 2; h++) - { - result += a._s16[w * 2 + h] * b._s16[w * 2 + h]; - } - - result += c._s32[w]; - - if (result > 0x7fffffff) - { - saturated = 0x7fffffff; - } - else if (result < INT32_MIN) - { - saturated = 0x80000000; - } - else - saturated = static_cast(result); - - d._s32[w] = saturated; - } - return true; -} - -bool ppu_interpreter_precise::VMSUMSHS(ppu_thread& ppu, ppu_opcode_t op) -{ - auto& d = ppu.vr[op.vd]; - const auto& a = ppu.vr[op.va]; - const auto& b = ppu.vr[op.vb]; - const auto& c = ppu.vr[op.vc]; - - for (uint w = 0; w < 4; w++) - { - s64 result = 0; - s32 saturated = 0; - - for (uint h = 0; h < 2; h++) - { - result += a._s16[w * 2 + h] * b._s16[w * 2 + h]; - } - - result += c._s32[w]; - - if (result > 0x7fffffff) - { - saturated = 0x7fffffff; - ppu.sat._u32[0] = 1; - } - else if (result < INT32_MIN) - { - saturated = 0x80000000; - ppu.sat._u32[0] = 1; - } - else - saturated = static_cast(result); - - d._s32[w] = saturated; - } - return true; -} - -bool ppu_interpreter::VMSUMUBM(ppu_thread& ppu, ppu_opcode_t op) -{ - const auto a = ppu.vr[op.va].vi; - const auto b = ppu.vr[op.vb].vi; - const auto c = ppu.vr[op.vc].vi; - const auto mask = _mm_set1_epi16(0x00ff); - const auto ah = _mm_srli_epi16(a, 8); - const auto al = _mm_and_si128(a, mask); - const auto bh = _mm_srli_epi16(b, 8); - const auto bl = _mm_and_si128(b, mask); - const auto sh = _mm_madd_epi16(ah, bh); - const auto sl = _mm_madd_epi16(al, bl); - ppu.vr[op.vd].vi = _mm_add_epi32(_mm_add_epi32(c, sh), sl); - return true; -} - -bool ppu_interpreter::VMSUMUHM(ppu_thread& ppu, ppu_opcode_t op) -{ - const auto a = ppu.vr[op.va].vi; - const auto b = ppu.vr[op.vb].vi; - const auto c = ppu.vr[op.vc].vi; - const auto ml = _mm_mullo_epi16(a, b); // low results - const auto mh = _mm_mulhi_epu16(a, b); // high results - const auto ls = _mm_add_epi32(_mm_srli_epi32(ml, 16), _mm_and_si128(ml, _mm_set1_epi32(0x0000ffff))); - const auto hs = _mm_add_epi32(_mm_slli_epi32(mh, 16), _mm_and_si128(mh, _mm_set1_epi32(0xffff0000))); - ppu.vr[op.vd].vi = _mm_add_epi32(_mm_add_epi32(c, ls), hs); - return true; -} - -bool ppu_interpreter_fast::VMSUMUHS(ppu_thread& ppu, ppu_opcode_t op) -{ - auto& d = ppu.vr[op.vd]; - const auto& a = ppu.vr[op.va]; - const auto& b = ppu.vr[op.vb]; - const auto& c = ppu.vr[op.vc]; - - for (uint w = 0; w < 4; w++) - { - u64 result = 0; - u32 saturated = 0; - - for (uint 
h = 0; h < 2; h++) - { - result += u64{a._u16[w * 2 + h]} * b._u16[w * 2 + h]; - } - - result += c._u32[w]; - - if (result > 0xffffffffu) - { - saturated = 0xffffffff; - } - else - saturated = static_cast(result); - - d._u32[w] = saturated; - } - return true; -} - -bool ppu_interpreter_precise::VMSUMUHS(ppu_thread& ppu, ppu_opcode_t op) -{ - auto& d = ppu.vr[op.vd]; - const auto& a = ppu.vr[op.va]; - const auto& b = ppu.vr[op.vb]; - const auto& c = ppu.vr[op.vc]; - - for (uint w = 0; w < 4; w++) - { - u64 result = 0; - u32 saturated = 0; - - for (uint h = 0; h < 2; h++) - { - result += u64{a._u16[w * 2 + h]} * b._u16[w * 2 + h]; - } - - result += c._u32[w]; - - if (result > 0xffffffffu) - { - saturated = 0xffffffff; - ppu.sat._u32[0] = 1; - } - else - saturated = static_cast(result); - - d._u32[w] = saturated; - } - return true; -} - -bool ppu_interpreter::VMULESB(ppu_thread& ppu, ppu_opcode_t op) -{ - ppu.vr[op.vd].vi = _mm_mullo_epi16(_mm_srai_epi16(ppu.vr[op.va].vi, 8), _mm_srai_epi16(ppu.vr[op.vb].vi, 8)); - return true; -} - -bool ppu_interpreter::VMULESH(ppu_thread& ppu, ppu_opcode_t op) -{ - ppu.vr[op.vd].vi = _mm_madd_epi16(_mm_srli_epi32(ppu.vr[op.va].vi, 16), _mm_srli_epi32(ppu.vr[op.vb].vi, 16)); - return true; -} - -bool ppu_interpreter::VMULEUB(ppu_thread& ppu, ppu_opcode_t op) -{ - ppu.vr[op.vd].vi = _mm_mullo_epi16(_mm_srli_epi16(ppu.vr[op.va].vi, 8), _mm_srli_epi16(ppu.vr[op.vb].vi, 8)); - return true; -} - -bool ppu_interpreter::VMULEUH(ppu_thread& ppu, ppu_opcode_t op) -{ - const auto a = ppu.vr[op.va].vi; - const auto b = ppu.vr[op.vb].vi; const auto ml = _mm_mullo_epi16(a, b); const auto mh = _mm_mulhi_epu16(a, b); - ppu.vr[op.vd].vi = _mm_or_si128(_mm_srli_epi32(ml, 16), _mm_and_si128(mh, _mm_set1_epi32(0xffff0000))); - return true; + ppu.vr[op.vd] = _mm_or_si128(_mm_srli_epi32(ml, 16), _mm_and_si128(mh, _mm_set1_epi32(0xffff0000))); + }; + RETURN_(ppu, op); } -bool ppu_interpreter::VMULOSB(ppu_thread& ppu, ppu_opcode_t op) +template +auto VMULOSB() { - ppu.vr[op.vd].vi = _mm_mullo_epi16(_mm_srai_epi16(_mm_slli_epi16(ppu.vr[op.va].vi, 8), 8), _mm_srai_epi16(_mm_slli_epi16(ppu.vr[op.vb].vi, 8), 8)); - return true; + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { + ppu.vr[op.vd] = _mm_mullo_epi16(_mm_srai_epi16(_mm_slli_epi16(ppu.vr[op.va], 8), 8), _mm_srai_epi16(_mm_slli_epi16(ppu.vr[op.vb], 8), 8)); + }; + RETURN_(ppu, op); } -bool ppu_interpreter::VMULOSH(ppu_thread& ppu, ppu_opcode_t op) +template +auto VMULOSH() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const auto mask = _mm_set1_epi32(0x0000ffff); - ppu.vr[op.vd].vi = _mm_madd_epi16(_mm_and_si128(ppu.vr[op.va].vi, mask), _mm_and_si128(ppu.vr[op.vb].vi, mask)); - return true; + ppu.vr[op.vd] = _mm_madd_epi16(_mm_and_si128(ppu.vr[op.va], mask), _mm_and_si128(ppu.vr[op.vb], mask)); + }; + RETURN_(ppu, op); } -bool ppu_interpreter::VMULOUB(ppu_thread& ppu, ppu_opcode_t op) +template +auto VMULOUB() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const auto mask = _mm_set1_epi16(0x00ff); - ppu.vr[op.vd].vi = _mm_mullo_epi16(_mm_and_si128(ppu.vr[op.va].vi, mask), _mm_and_si128(ppu.vr[op.vb].vi, mask)); - return true; + ppu.vr[op.vd] = _mm_mullo_epi16(_mm_and_si128(ppu.vr[op.va], mask), _mm_and_si128(ppu.vr[op.vb], 
mask)); + }; + RETURN_(ppu, op); } -bool ppu_interpreter::VMULOUH(ppu_thread& ppu, ppu_opcode_t op) +template +auto VMULOUH() { - const auto a = ppu.vr[op.va].vi; - const auto b = ppu.vr[op.vb].vi; + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { + const auto a = ppu.vr[op.va]; + const auto b = ppu.vr[op.vb]; const auto ml = _mm_mullo_epi16(a, b); const auto mh = _mm_mulhi_epu16(a, b); - ppu.vr[op.vd].vi = _mm_or_si128(_mm_slli_epi32(mh, 16), _mm_and_si128(ml, _mm_set1_epi32(0x0000ffff))); - return true; + ppu.vr[op.vd] = _mm_or_si128(_mm_slli_epi32(mh, 16), _mm_and_si128(ml, _mm_set1_epi32(0x0000ffff))); + }; + RETURN_(ppu, op); } -bool ppu_interpreter_fast::VNMSUBFP(ppu_thread& ppu, ppu_opcode_t op) +template +auto VNMSUBFP() { - const auto a = _mm_sub_ps(_mm_mul_ps(ppu.vr[op.va].vf, ppu.vr[op.vc].vf), ppu.vr[op.vb].vf); - const auto b = _mm_set1_ps(-0.0f); - const auto result = _mm_xor_ps(a, b); - ppu.vr[op.vd] = vec_handle_nan(result); - return true; + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { + // An odd case with (FLT_MIN, FLT_MIN, FLT_MIN) produces FLT_MIN instead of 0 + const auto s = _mm_set1_ps(-0.0f); + const auto m = gv_bcst32(ppu.jm_mask, &ppu_thread::jm_mask); + const auto a = ppu_flush_denormal(m, ppu.vr[op.va]); + const auto b = ppu_flush_denormal(m, ppu.vr[op.vb]); + const auto c = ppu_flush_denormal(m, ppu.vr[op.vc]); + const auto r = _mm_xor_ps(gv_fmafs(a, c, _mm_xor_ps(b, s)), s); + ppu.vr[op.rd] = ppu_flush_denormal(m, ppu_set_vnan(r)); + }; + RETURN_(ppu, op); } -bool ppu_interpreter_precise::VNMSUBFP(ppu_thread& ppu, ppu_opcode_t op) +template +auto VNOR() { - const auto m = _mm_set1_ps(-0.0f); - const auto a = vec_handle_denormal(ppu, ppu.vr[op.va]); - const auto c = vec_handle_denormal(ppu, ppu.vr[op.vc]); - const auto b = v128::fromF(_mm_xor_ps(ppu.vr[op.vb].vf, m)); - const auto r = v128::fromF(_mm_xor_ps(v128::fma32f(a, c, b).vf, m)); - ppu.vr[op.rd] = vec_handle_denormal(ppu, vec_handle_nan(r, a, b, c)); - return true; -} + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); -bool ppu_interpreter::VNOR(ppu_thread& ppu, ppu_opcode_t op) -{ + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { ppu.vr[op.vd] = ~(ppu.vr[op.va] | ppu.vr[op.vb]); - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::VOR(ppu_thread& ppu, ppu_opcode_t op) +template +auto VOR() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { ppu.vr[op.vd] = ppu.vr[op.va] | ppu.vr[op.vb]; - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::VPERM(ppu_thread& ppu, ppu_opcode_t op) +template +auto VPERM() { - ppu.vr[op.vd].vi = s_use_ssse3 - ? sse_altivec_vperm(ppu.vr[op.va].vi, ppu.vr[op.vb].vi, ppu.vr[op.vc].vi) - : sse_altivec_vperm_v0(ppu.vr[op.va].vi, ppu.vr[op.vb].vi, ppu.vr[op.vc].vi); - return true; + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { + ppu.vr[op.vd] = s_use_ssse3 + ? 
sse_altivec_vperm(ppu.vr[op.va], ppu.vr[op.vb], ppu.vr[op.vc]) + : sse_altivec_vperm_v0(ppu.vr[op.va], ppu.vr[op.vb], ppu.vr[op.vc]); + }; + RETURN_(ppu, op); } -bool ppu_interpreter::VPKPX(ppu_thread& ppu, ppu_opcode_t op) +template +auto VPKPX() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { auto& d = ppu.vr[op.vd]; v128 VA = ppu.vr[op.va]; v128 VB = ppu.vr[op.vb]; @@ -1644,372 +2164,178 @@ bool ppu_interpreter::VPKPX(ppu_thread& ppu, ppu_opcode_t op) d._u16[3 - h] = (bb7 << 15) | (bb8 << 10) | (bb16 << 5) | bb24; d._u16[4 + (3 - h)] = (ab7 << 15) | (ab8 << 10) | (ab16 << 5) | ab24; } - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter_fast::VPKSHSS(ppu_thread& ppu, ppu_opcode_t op) +template +auto VPKSHSS() { - ppu.vr[op.vd].vi = _mm_packs_epi16(ppu.vr[op.vb].vi, ppu.vr[op.va].vi); - return true; -} + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select(); -bool ppu_interpreter_precise::VPKSHSS(ppu_thread& ppu, ppu_opcode_t op) -{ - const auto& a = ppu.vr[op.va]; - const auto& b = ppu.vr[op.vb]; - v128 d; - - for (u8 i = 0; i < 8; i++) - { - s16 result = a._s16[i]; - - if (result < INT8_MIN) - { - d._s8[i + 8] = INT8_MIN; - ppu.sat._u32[0] = 1; - } - else if (result > INT8_MAX) - { - d._s8[i + 8] = INT8_MAX; - ppu.sat._u32[0] = 1; - } - else - { - d._s8[i + 8] = static_cast(result); - } - - result = b._s16[i]; - - if (result < INT8_MIN) - { - d._s8[i] = INT8_MIN; - ppu.sat._u32[0] = 1; - } - else if (result > INT8_MAX) - { - d._s8[i] = INT8_MAX; - ppu.sat._u32[0] = 1; - } - else - { - d._s8[i] = static_cast(result); - } - } - - ppu.vr[op.vd] = d; - return true; -} - -bool ppu_interpreter_fast::VPKSHUS(ppu_thread& ppu, ppu_opcode_t op) -{ - ppu.vr[op.vd].vi = _mm_packus_epi16(ppu.vr[op.vb].vi, ppu.vr[op.va].vi); - return true; -} - -bool ppu_interpreter_precise::VPKSHUS(ppu_thread& ppu, ppu_opcode_t op) -{ + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const auto a = ppu.vr[op.va]; const auto b = ppu.vr[op.vb]; - - // Detect saturation - { - const u64 mask = 0xFF00FF00FF00FF00ULL; - const auto all_bits = a | b; - if ((all_bits._u64[0] | all_bits._u64[1]) & mask) - { - ppu.sat._u32[0] = 1; - } - } - - ppu.vr[op.vd].vi = _mm_packus_epi16(b.vi, a.vi); - return true; + ppu.vr[op.vd] = _mm_packs_epi16(b, a); + if constexpr (((Flags == set_sat) || ...)) + ppu.sat = ppu.sat | gv_shr16(gv_add16(a, gv_bcst16(0x80)) | gv_add16(b, gv_bcst16(0x80)), 8); + }; + RETURN_(ppu, op); } -bool ppu_interpreter_fast::VPKSWSS(ppu_thread& ppu, ppu_opcode_t op) +template +auto VPKSHUS() { - ppu.vr[op.vd].vi = _mm_packs_epi32(ppu.vr[op.vb].vi, ppu.vr[op.va].vi); - return true; + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { + const auto a = ppu.vr[op.va]; + const auto b = ppu.vr[op.vb]; + ppu.vr[op.vd] = _mm_packus_epi16(b, a); + if constexpr (((Flags == set_sat) || ...)) + ppu.sat = ppu.sat | gv_shr16(a | b, 8); + }; + RETURN_(ppu, op); } -bool ppu_interpreter_precise::VPKSWSS(ppu_thread& ppu, ppu_opcode_t op) +template +auto VPKSWSS() { - const auto& a = ppu.vr[op.va]; - const auto& b = ppu.vr[op.vb]; - v128 d; + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select(); - for (u8 i = 0; i < 4; i++) - { - s32 result = a._s32[i]; - - if (result < INT16_MIN) - { - d._s16[i + 4] = INT16_MIN; - ppu.sat._u32[0] = 1; - } - else if (result > INT16_MAX) - 
{ - d._s16[i + 4] = INT16_MAX; - ppu.sat._u32[0] = 1; - } - else - { - d._s16[i + 4] = static_cast(result); - } - - result = b._s32[i]; - - if (result < INT16_MIN) - { - d._s16[i] = INT16_MIN; - ppu.sat._u32[0] = 1; - } - else if (result > INT16_MAX) - { - d._s16[i] = INT16_MAX; - ppu.sat._u32[0] = 1; - } - else - { - d._s16[i] = static_cast(result); - } - } - - ppu.vr[op.vd] = d; - return true; + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { + const auto a = ppu.vr[op.va]; + const auto b = ppu.vr[op.vb]; + ppu.vr[op.vd] = _mm_packs_epi32(b, a); + if constexpr (((Flags == set_sat) || ...)) + ppu.sat = ppu.sat | gv_shr32(gv_add32(a, gv_bcst32(0x8000)) | gv_add32(b, gv_bcst32(0x8000)), 16); + }; + RETURN_(ppu, op); } -bool ppu_interpreter_fast::VPKSWUS(ppu_thread& ppu, ppu_opcode_t op) +template +auto VPKSWUS() { - //ppu.vr[op.vd].vi = _mm_packus_epi32(ppu.vr[op.vb].vi, ppu.vr[op.va].vi); - auto& d = ppu.vr[op.vd]; - v128 VA = ppu.vr[op.va]; - v128 VB = ppu.vr[op.vb]; - for (uint h = 0; h < 4; h++) - { - s32 result = VA._s32[h]; + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select(); - if (result > UINT16_MAX) - { - result = UINT16_MAX; - } - else if (result < 0) - { - result = 0; - } - - d._u16[h + 4] = result; - - result = VB._s32[h]; - - if (result > UINT16_MAX) - { - result = UINT16_MAX; - } - else if (result < 0) - { - result = 0; - } - - d._u16[h] = result; - } - return true; + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { + const auto a = ppu.vr[op.va]; + const auto b = ppu.vr[op.vb]; +#if defined(__SSE4_1__) || defined(ARCH_ARM64) + ppu.vr[op.vd] = _mm_packus_epi32(b, a); +#else + const auto s = _mm_srai_epi16(_mm_packs_epi32(b, a), 15); + const auto r = gv_add16(_mm_packs_epi32(gv_sub32(b, gv_bcst32(0x8000)), gv_sub32(a, gv_bcst32(0x8000))), gv_bcst16(0x8000)); + ppu.vr[op.vd] = gv_andn(s, r); +#endif + if constexpr (((Flags == set_sat) || ...)) + ppu.sat = ppu.sat | gv_shr32(a | b, 16); + }; + RETURN_(ppu, op); } -bool ppu_interpreter_precise::VPKSWUS(ppu_thread& ppu, ppu_opcode_t op) +template +auto VPKUHUM() { - //ppu.vr[op.vd].vi = _mm_packus_epi32(ppu.vr[op.vb].vi, ppu.vr[op.va].vi); - auto& d = ppu.vr[op.vd]; - v128 VA = ppu.vr[op.va]; - v128 VB = ppu.vr[op.vb]; - for (uint h = 0; h < 4; h++) - { - s32 result = VA._s32[h]; + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); - if (result > UINT16_MAX) - { - result = UINT16_MAX; - ppu.sat._u32[0] = 1; - } - else if (result < 0) - { - result = 0; - ppu.sat._u32[0] = 1; - } - - d._u16[h + 4] = result; - - result = VB._s32[h]; - - if (result > UINT16_MAX) - { - result = UINT16_MAX; - ppu.sat._u32[0] = 1; - } - else if (result < 0) - { - result = 0; - ppu.sat._u32[0] = 1; - } - - d._u16[h] = result; - } - return true; + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { + const auto a = ppu.vr[op.va]; + const auto b = ppu.vr[op.vb]; + ppu.vr[op.vd] = _mm_packus_epi16(b & _mm_set1_epi16(0xff), a & _mm_set1_epi16(0xff)); + }; + RETURN_(ppu, op); } -bool ppu_interpreter::VPKUHUM(ppu_thread& ppu, ppu_opcode_t op) +template +auto VPKUHUS() { - auto& d = ppu.vr[op.vd]; - v128 VA = ppu.vr[op.va]; - v128 VB = ppu.vr[op.vb]; - for (uint b = 0; b < 8; b++) - { - d._u8[b + 8] = VA._u8[b * 2]; - d._u8[b] = VB._u8[b * 2]; - } - return true; + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { + const auto a = ppu.vr[op.va]; + const auto b = ppu.vr[op.vb]; + 
const v128 s = _mm_cmpeq_epi8(_mm_packs_epi16(_mm_srai_epi16(b, 8), _mm_srai_epi16(a, 8)), _mm_setzero_si128()); + const v128 r = _mm_packus_epi16(b & _mm_set1_epi16(0xff), a & _mm_set1_epi16(0xff)); + ppu.vr[op.vd] = r | ~s; + if constexpr (((Flags == set_sat) || ...)) + ppu.sat = ppu.sat | gv_shr16(a | b, 8); + }; + RETURN_(ppu, op); } -bool ppu_interpreter_fast::VPKUHUS(ppu_thread& ppu, ppu_opcode_t op) +template +auto VPKUWUM() { - auto& d = ppu.vr[op.vd]; - v128 VA = ppu.vr[op.va]; - v128 VB = ppu.vr[op.vb]; - for (uint b = 0; b < 8; b++) - { - u16 result = VA._u16[b]; + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); - if (result > UINT8_MAX) - { - result = UINT8_MAX; - } - - d._u8[b + 8] = static_cast(result); - - result = VB._u16[b]; - - if (result > UINT8_MAX) - { - result = UINT8_MAX; - } - - d._u8[b] = static_cast(result); - } - return true; + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { + const auto a = ppu.vr[op.va]; + const auto b = ppu.vr[op.vb]; +#if defined(__SSE4_1__) || defined(ARCH_ARM64) + const auto r = _mm_packus_epi32(b & _mm_set1_epi32(0xffff), a & _mm_set1_epi32(0xffff)); +#else + const auto r = _mm_packs_epi32(_mm_srai_epi32(_mm_slli_epi32(b, 16), 16), _mm_srai_epi32(_mm_slli_epi32(a, 16), 16)); +#endif + ppu.vr[op.vd] = r; + }; + RETURN_(ppu, op); } -bool ppu_interpreter_precise::VPKUHUS(ppu_thread& ppu, ppu_opcode_t op) +template +auto VPKUWUS() { - auto& d = ppu.vr[op.vd]; - v128 VA = ppu.vr[op.va]; - v128 VB = ppu.vr[op.vb]; - for (uint b = 0; b < 8; b++) - { - u16 result = VA._u16[b]; + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select(); - if (result > UINT8_MAX) - { - result = UINT8_MAX; - ppu.sat._u32[0] = 1; - } - - d._u8[b + 8] = static_cast(result); - - result = VB._u16[b]; - - if (result > UINT8_MAX) - { - result = UINT8_MAX; - ppu.sat._u32[0] = 1; - } - - d._u8[b] = static_cast(result); - } - return true; + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { + const auto a = ppu.vr[op.va]; + const auto b = ppu.vr[op.vb]; + const v128 s = _mm_cmpeq_epi16(_mm_packs_epi32(_mm_srai_epi32(b, 16), _mm_srai_epi32(a, 16)), _mm_setzero_si128()); +#if defined(__SSE4_1__) || defined(ARCH_ARM64) + const v128 r = _mm_packus_epi32(b & _mm_set1_epi32(0xffff), a & _mm_set1_epi32(0xffff)); +#else + const v128 r = _mm_packs_epi32(_mm_srai_epi32(_mm_slli_epi32(b, 16), 16), _mm_srai_epi32(_mm_slli_epi32(a, 16), 16)); +#endif + ppu.vr[op.vd] = r | ~s; + if constexpr (((Flags == set_sat) || ...)) + ppu.sat = ppu.sat | gv_shr32(a | b, 16); + }; + RETURN_(ppu, op); } -bool ppu_interpreter::VPKUWUM(ppu_thread& ppu, ppu_opcode_t op) +template +auto VREFP() { - auto& d = ppu.vr[op.vd]; - v128 VA = ppu.vr[op.va]; - v128 VB = ppu.vr[op.vb]; - for (uint h = 0; h < 4; h++) - { - d._u16[h + 4] = VA._u16[h * 2]; - d._u16[h] = VB._u16[h * 2]; - } - return true; -} + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select(); -bool ppu_interpreter_fast::VPKUWUS(ppu_thread& ppu, ppu_opcode_t op) -{ - auto& d = ppu.vr[op.vd]; - v128 VA = ppu.vr[op.va]; - v128 VB = ppu.vr[op.vb]; - for (uint h = 0; h < 4; h++) - { - u32 result = VA._u32[h]; - - if (result > UINT16_MAX) - { - result = UINT16_MAX; - } - - d._u16[h + 4] = result; - - result = VB._u32[h]; - - if (result > UINT16_MAX) - { - result = UINT16_MAX; - } - - d._u16[h] = result; - } - return true; -} - -bool ppu_interpreter_precise::VPKUWUS(ppu_thread& ppu, ppu_opcode_t op) -{ - auto& d = ppu.vr[op.vd]; - v128 VA = ppu.vr[op.va]; - 
v128 VB = ppu.vr[op.vb]; - for (uint h = 0; h < 4; h++) - { - u32 result = VA._u32[h]; - - if (result > UINT16_MAX) - { - result = UINT16_MAX; - ppu.sat._u32[0] = 1; - } - - d._u16[h + 4] = result; - - result = VB._u32[h]; - - if (result > UINT16_MAX) - { - result = UINT16_MAX; - ppu.sat._u32[0] = 1; - } - - d._u16[h] = result; - } - return true; -} - -bool ppu_interpreter::VREFP(ppu_thread& ppu, ppu_opcode_t op) -{ + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const auto a = _mm_set_ps(1.0f, 1.0f, 1.0f, 1.0f); - const auto b = vec_handle_denormal(ppu, ppu.vr[op.vb]).vf; + const auto m = gv_bcst32(ppu.jm_mask, &ppu_thread::jm_mask); + const auto b = ppu_flush_denormal(m, ppu.vr[op.vb]); const auto result = _mm_div_ps(a, b); - ppu.vr[op.vd] = vec_handle_denormal(ppu, vec_handle_nan(result, a, b)); - return true; + ppu.vr[op.vd] = ppu_flush_denormal(m, ppu_set_vnan(result, a, b)); + }; + RETURN_(ppu, op); } -bool ppu_interpreter::VRFIM(ppu_thread& ppu, ppu_opcode_t op) +template +auto VRFIM() { - const auto b = vec_handle_denormal(ppu, ppu.vr[op.vb]); + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { + const auto m = gv_bcst32(ppu.jm_mask, &ppu_thread::jm_mask); + const auto b = ppu_flush_denormal(m, ppu.vr[op.vb]); v128 d; for (uint w = 0; w < 4; w++) @@ -2017,12 +2343,18 @@ bool ppu_interpreter::VRFIM(ppu_thread& ppu, ppu_opcode_t op) d._f[w] = std::floor(b._f[w]); } - ppu.vr[op.vd] = vec_handle_denormal(ppu, vec_handle_nan(d, b)); - return true; + ppu.vr[op.vd] = ppu_flush_denormal(m, ppu_set_vnan(d, b)); + }; + RETURN_(ppu, op); } -bool ppu_interpreter::VRFIN(ppu_thread& ppu, ppu_opcode_t op) +template +auto VRFIN() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const auto b = ppu.vr[op.vb]; v128 d; @@ -2031,13 +2363,20 @@ bool ppu_interpreter::VRFIN(ppu_thread& ppu, ppu_opcode_t op) d._f[w] = std::nearbyint(b._f[w]); } - ppu.vr[op.vd] = vec_handle_denormal(ppu, vec_handle_nan(d, b)); - return true; + ppu.vr[op.vd] = ppu_flush_denormal(gv_bcst32(ppu.jm_mask, &ppu_thread::jm_mask), ppu_set_vnan(d, b)); + }; + RETURN_(ppu, op); } -bool ppu_interpreter::VRFIP(ppu_thread& ppu, ppu_opcode_t op) +template +auto VRFIP() { - const auto b = vec_handle_denormal(ppu, ppu.vr[op.vb]); + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { + const auto m = gv_bcst32(ppu.jm_mask, &ppu_thread::jm_mask); + const auto b = ppu_flush_denormal(m, ppu.vr[op.vb]); v128 d; for (uint w = 0; w < 4; w++) @@ -2045,12 +2384,18 @@ bool ppu_interpreter::VRFIP(ppu_thread& ppu, ppu_opcode_t op) d._f[w] = std::ceil(b._f[w]); } - ppu.vr[op.vd] = vec_handle_denormal(ppu, vec_handle_nan(d, b)); - return true; + ppu.vr[op.vd] = ppu_flush_denormal(m, ppu_set_vnan(d, b)); + }; + RETURN_(ppu, op); } -bool ppu_interpreter::VRFIZ(ppu_thread& ppu, ppu_opcode_t op) +template +auto VRFIZ() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const auto b = ppu.vr[op.vb]; v128 d; @@ -2059,12 +2404,18 @@ bool ppu_interpreter::VRFIZ(ppu_thread& ppu, ppu_opcode_t op) d._f[w] = std::truncf(b._f[w]); } - ppu.vr[op.vd] = vec_handle_denormal(ppu, vec_handle_nan(d, b)); - return true; + ppu.vr[op.vd] = ppu_flush_denormal(gv_bcst32(ppu.jm_mask, 
&ppu_thread::jm_mask), ppu_set_vnan(d, b)); + }; + RETURN_(ppu, op); } -bool ppu_interpreter::VRLB(ppu_thread& ppu, ppu_opcode_t op) +template +auto VRLB() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { auto& d = ppu.vr[op.vd]; const auto& a = ppu.vr[op.va]; const auto& b = ppu.vr[op.vb]; @@ -2073,11 +2424,17 @@ bool ppu_interpreter::VRLB(ppu_thread& ppu, ppu_opcode_t op) { d._u8[i] = utils::rol8(a._u8[i], b._u8[i]); } - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::VRLH(ppu_thread& ppu, ppu_opcode_t op) +template +auto VRLH() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { auto& d = ppu.vr[op.vd]; const auto& a = ppu.vr[op.va]; const auto& b = ppu.vr[op.vb]; @@ -2086,11 +2443,17 @@ bool ppu_interpreter::VRLH(ppu_thread& ppu, ppu_opcode_t op) { d._u16[i] = utils::rol16(a._u16[i], b._u8[i * 2] & 0xf); } - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::VRLW(ppu_thread& ppu, ppu_opcode_t op) +template +auto VRLW() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { auto& d = ppu.vr[op.vd]; const auto& a = ppu.vr[op.va]; const auto& b = ppu.vr[op.vb]; @@ -2099,31 +2462,50 @@ bool ppu_interpreter::VRLW(ppu_thread& ppu, ppu_opcode_t op) { d._u32[w] = utils::rol32(a._u32[w], b._u8[w * 4] & 0x1f); } - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::VRSQRTEFP(ppu_thread& ppu, ppu_opcode_t op) +template +auto VRSQRTEFP() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const auto a = _mm_set_ps(1.0f, 1.0f, 1.0f, 1.0f); - const auto b = vec_handle_denormal(ppu, ppu.vr[op.vb]).vf; + const auto m = gv_bcst32(ppu.jm_mask, &ppu_thread::jm_mask); + const auto b = ppu_flush_denormal(m, ppu.vr[op.vb]); const auto result = _mm_div_ps(a, _mm_sqrt_ps(b)); - ppu.vr[op.vd] = vec_handle_denormal(ppu, vec_handle_nan(result, a, b)); - return true; + ppu.vr[op.vd] = ppu_flush_denormal(m, ppu_set_vnan(result, a, b)); + }; + RETURN_(ppu, op); } -bool ppu_interpreter::VSEL(ppu_thread& ppu, ppu_opcode_t op) +template +auto VSEL() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { auto& d = ppu.vr[op.vd]; const auto& a = ppu.vr[op.va]; const auto& b = ppu.vr[op.vb]; const auto& c = ppu.vr[op.vc]; - d = (b & c) | v128::andnot(c, a); - return true; + d = (b & c) | gv_andn(c, a); + }; + RETURN_(ppu, op); } -bool ppu_interpreter::VSL(ppu_thread& ppu, ppu_opcode_t op) +template +auto VSL() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { auto& d = ppu.vr[op.vd]; v128 VA = ppu.vr[op.va]; u8 sh = ppu.vr[op.vb]._u8[0] & 0x7; @@ -2134,11 +2516,17 @@ bool ppu_interpreter::VSL(ppu_thread& ppu, ppu_opcode_t op) sh = ppu.vr[op.vb]._u8[b] & 0x7; d._u8[b] = (VA._u8[b] << sh) | (VA._u8[b - 1] >> (8 - sh)); } - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::VSLB(ppu_thread& ppu, ppu_opcode_t op) +template +auto VSLB() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { auto& d = 
ppu.vr[op.vd]; const auto& a = ppu.vr[op.va]; const auto& b = ppu.vr[op.vb]; @@ -2147,11 +2535,17 @@ bool ppu_interpreter::VSLB(ppu_thread& ppu, ppu_opcode_t op) { d._u8[i] = a._u8[i] << (b._u8[i] & 0x7); } - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::VSLDOI(ppu_thread& ppu, ppu_opcode_t op) +template +auto VSLDOI() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { auto& d = ppu.vr[op.vd]; u8 tmpSRC[32]; std::memcpy(tmpSRC, &ppu.vr[op.vb], 16); @@ -2161,11 +2555,17 @@ bool ppu_interpreter::VSLDOI(ppu_thread& ppu, ppu_opcode_t op) { d._u8[15 - b] = tmpSRC[31 - (b + op.vsh)]; } - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::VSLH(ppu_thread& ppu, ppu_opcode_t op) +template +auto VSLH() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { auto& d = ppu.vr[op.vd]; const auto& a = ppu.vr[op.va]; const auto& b = ppu.vr[op.vb]; @@ -2174,11 +2574,17 @@ bool ppu_interpreter::VSLH(ppu_thread& ppu, ppu_opcode_t op) { d._u16[h] = a._u16[h] << (b._u16[h] & 0xf); } - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::VSLO(ppu_thread& ppu, ppu_opcode_t op) +template +auto VSLO() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { auto& d = ppu.vr[op.vd]; v128 VA = ppu.vr[op.va]; u8 nShift = (ppu.vr[op.vb]._u8[0] >> 3) & 0xf; @@ -2189,11 +2595,17 @@ bool ppu_interpreter::VSLO(ppu_thread& ppu, ppu_opcode_t op) { d._u8[15 - b] = VA._u8[15 - (b + nShift)]; } - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::VSLW(ppu_thread& ppu, ppu_opcode_t op) +template +auto VSLW() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { auto& d = ppu.vr[op.vd]; const auto& a = ppu.vr[op.va]; const auto& b = ppu.vr[op.vb]; @@ -2202,11 +2614,17 @@ bool ppu_interpreter::VSLW(ppu_thread& ppu, ppu_opcode_t op) { d._u32[w] = a._u32[w] << (b._u32[w] & 0x1f); } - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::VSPLTB(ppu_thread& ppu, ppu_opcode_t op) +template +auto VSPLTB() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { auto& d = ppu.vr[op.vd]; u8 byte = ppu.vr[op.vb]._u8[15 - op.vuimm]; @@ -2214,11 +2632,17 @@ bool ppu_interpreter::VSPLTB(ppu_thread& ppu, ppu_opcode_t op) { d._u8[b] = byte; } - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::VSPLTH(ppu_thread& ppu, ppu_opcode_t op) +template +auto VSPLTH() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { auto& d = ppu.vr[op.vd]; ensure((op.vuimm < 8)); @@ -2228,11 +2652,17 @@ bool ppu_interpreter::VSPLTH(ppu_thread& ppu, ppu_opcode_t op) { d._u16[h] = hword; } - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::VSPLTISB(ppu_thread& ppu, ppu_opcode_t op) +template +auto VSPLTISB() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { auto& d = ppu.vr[op.vd]; const s8 imm = op.vsimm; @@ -2240,11 +2670,17 @@ bool ppu_interpreter::VSPLTISB(ppu_thread& ppu, ppu_opcode_t op) { d._u8[b] = imm; } - return 
true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::VSPLTISH(ppu_thread& ppu, ppu_opcode_t op) +template +auto VSPLTISH() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { auto& d = ppu.vr[op.vd]; const s16 imm = op.vsimm; @@ -2252,11 +2688,17 @@ bool ppu_interpreter::VSPLTISH(ppu_thread& ppu, ppu_opcode_t op) { d._u16[h] = imm; } - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::VSPLTISW(ppu_thread& ppu, ppu_opcode_t op) +template +auto VSPLTISW() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { auto& d = ppu.vr[op.vd]; const s32 imm = op.vsimm; @@ -2264,11 +2706,17 @@ bool ppu_interpreter::VSPLTISW(ppu_thread& ppu, ppu_opcode_t op) { d._u32[w] = imm; } - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::VSPLTW(ppu_thread& ppu, ppu_opcode_t op) +template +auto VSPLTW() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { auto& d = ppu.vr[op.vd]; ensure((op.vuimm < 4)); @@ -2278,11 +2726,17 @@ bool ppu_interpreter::VSPLTW(ppu_thread& ppu, ppu_opcode_t op) { d._u32[w] = word; } - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::VSR(ppu_thread& ppu, ppu_opcode_t op) +template +auto VSR() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { auto& d = ppu.vr[op.vd]; v128 VA = ppu.vr[op.va]; u8 sh = ppu.vr[op.vb]._u8[15] & 0x7; @@ -2293,11 +2747,17 @@ bool ppu_interpreter::VSR(ppu_thread& ppu, ppu_opcode_t op) sh = ppu.vr[op.vb]._u8[b] & 0x7; d._u8[b] = (VA._u8[b] >> sh) | (VA._u8[b + 1] << (8 - sh)); } - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::VSRAB(ppu_thread& ppu, ppu_opcode_t op) +template +auto VSRAB() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { auto& d = ppu.vr[op.vd]; const auto& a = ppu.vr[op.va]; const auto& b = ppu.vr[op.vb]; @@ -2306,11 +2766,17 @@ bool ppu_interpreter::VSRAB(ppu_thread& ppu, ppu_opcode_t op) { d._s8[i] = a._s8[i] >> (b._u8[i] & 0x7); } - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::VSRAH(ppu_thread& ppu, ppu_opcode_t op) +template +auto VSRAH() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { auto& d = ppu.vr[op.vd]; const auto& a = ppu.vr[op.va]; const auto& b = ppu.vr[op.vb]; @@ -2319,11 +2785,17 @@ bool ppu_interpreter::VSRAH(ppu_thread& ppu, ppu_opcode_t op) { d._s16[h] = a._s16[h] >> (b._u16[h] & 0xf); } - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::VSRAW(ppu_thread& ppu, ppu_opcode_t op) +template +auto VSRAW() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { auto& d = ppu.vr[op.vd]; const auto& a = ppu.vr[op.va]; const auto& b = ppu.vr[op.vb]; @@ -2332,11 +2804,17 @@ bool ppu_interpreter::VSRAW(ppu_thread& ppu, ppu_opcode_t op) { d._s32[w] = a._s32[w] >> (b._u32[w] & 0x1f); } - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::VSRB(ppu_thread& ppu, ppu_opcode_t op) +template +auto VSRB() { + if constexpr (Build == 0xf1a6) + return 
ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { auto& d = ppu.vr[op.vd]; const auto& a = ppu.vr[op.va]; const auto& b = ppu.vr[op.vb]; @@ -2345,11 +2823,17 @@ bool ppu_interpreter::VSRB(ppu_thread& ppu, ppu_opcode_t op) { d._u8[i] = a._u8[i] >> (b._u8[i] & 0x7); } - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::VSRH(ppu_thread& ppu, ppu_opcode_t op) +template +auto VSRH() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { auto& d = ppu.vr[op.vd]; const auto& a = ppu.vr[op.va]; const auto& b = ppu.vr[op.vb]; @@ -2358,11 +2842,17 @@ bool ppu_interpreter::VSRH(ppu_thread& ppu, ppu_opcode_t op) { d._u16[h] = a._u16[h] >> (b._u16[h] & 0xf); } - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::VSRO(ppu_thread& ppu, ppu_opcode_t op) +template +auto VSRO() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { auto& d = ppu.vr[op.vd]; v128 VA = ppu.vr[op.va]; u8 nShift = (ppu.vr[op.vb]._u8[0] >> 3) & 0xf; @@ -2373,11 +2863,17 @@ bool ppu_interpreter::VSRO(ppu_thread& ppu, ppu_opcode_t op) { d._u8[b] = VA._u8[b + nShift]; } - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::VSRW(ppu_thread& ppu, ppu_opcode_t op) +template +auto VSRW() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { auto& d = ppu.vr[op.vd]; const auto& a = ppu.vr[op.va]; const auto& b = ppu.vr[op.vb]; @@ -2386,644 +2882,518 @@ bool ppu_interpreter::VSRW(ppu_thread& ppu, ppu_opcode_t op) { d._u32[w] = a._u32[w] >> (b._u32[w] & 0x1f); } - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::VSUBCUW(ppu_thread& ppu, ppu_opcode_t op) +template +auto VSUBCUW() { - auto& d = ppu.vr[op.vd]; - const auto& a = ppu.vr[op.va]; - const auto& b = ppu.vr[op.vb]; + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); - for (uint w = 0; w < 4; w++) + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { + const auto a = ppu.vr[op.va]; + const auto b = ppu.vr[op.vb]; + const auto r = gv_shr32(gv_geu32(a, b), 31); + ppu.vr[op.vd] = r; + }; + RETURN_(ppu, op); +} + +template +auto VSUBFP() +{ + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { + const auto m = gv_bcst32(ppu.jm_mask, &ppu_thread::jm_mask); + const auto a = ppu_flush_denormal(m, ppu.vr[op.va]); + const auto b = ppu_flush_denormal(m, ppu.vr[op.vb]); + const auto r = gv_subfs(a, b); + ppu.vr[op.vd] = ppu_flush_denormal(m, ppu_set_vnan(r, a, b)); + }; + RETURN_(ppu, op); +} + +template +auto VSUBSBS() +{ + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select(); + + static const auto exec = [](auto&& d, auto&& a, auto&& b, auto&& sat) { - d._u32[w] = a._u32[w] < b._u32[w] ? 
0 : 1; - } - return true; + const auto s = gv_sub8(a, b); + const auto r = gv_subs_s8(a, b); + if constexpr (((Flags == set_sat) || ...)) + sat = sat | (s ^ r); + d = r; + }; + + RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb], ppu.sat); } -bool ppu_interpreter::VSUBFP(ppu_thread& ppu, ppu_opcode_t op) +template +auto VSUBSHS() { - const auto a = vec_handle_denormal(ppu, ppu.vr[op.va]); - const auto b = vec_handle_denormal(ppu, ppu.vr[op.vb]); - const auto result = v128::subfs(a, b); - ppu.vr[op.vd] = vec_handle_denormal(ppu, vec_handle_nan(result, a, b)); - return true; -} + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select(); -bool ppu_interpreter_fast::VSUBSBS(ppu_thread& ppu, ppu_opcode_t op) -{ - ppu.vr[op.vd].vi = _mm_subs_epi8(ppu.vr[op.va].vi, ppu.vr[op.vb].vi); - return true; -} - -bool ppu_interpreter_precise::VSUBSBS(ppu_thread& ppu, ppu_opcode_t op) -{ - const auto& a = ppu.vr[op.va]; - const auto& b = ppu.vr[op.vb]; - auto& d = ppu.vr[op.vd]; - - for (u8 i = 0; i < 16; i++) + static const auto exec = [](auto&& d, auto&& a, auto&& b, auto&& sat) { - const s16 diff = a._s8[i] - b._s8[i]; + const auto s = gv_sub16(a, b); + const auto r = gv_subs_s16(a, b); + if constexpr (((Flags == set_sat) || ...)) + sat = sat | (s ^ r); + d = r; + }; - if (diff < INT8_MIN) - { - d._s8[i] = INT8_MIN; - ppu.sat._u32[0] = 1; - } - else if (diff > INT8_MAX) - { - d._s8[i] = INT8_MAX; - ppu.sat._u32[0] = 1; - } - else - { - d._s8[i] = static_cast(diff); - } - } - - return true; + RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb], ppu.sat); } -bool ppu_interpreter_fast::VSUBSHS(ppu_thread& ppu, ppu_opcode_t op) +template +auto VSUBSWS() { - ppu.vr[op.vd].vi = _mm_subs_epi16(ppu.vr[op.va].vi, ppu.vr[op.vb].vi); - return true; -} + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select(); -bool ppu_interpreter_precise::VSUBSHS(ppu_thread& ppu, ppu_opcode_t op) -{ - const auto& a = ppu.vr[op.va]; - const auto& b = ppu.vr[op.vb]; - auto& d = ppu.vr[op.vd]; - - for (u8 i = 0; i < 8; i++) + static const auto exec = [](auto&& d, auto&& a, auto&& b, auto&& sat) { - const s32 diff = a._s16[i] - b._s16[i]; + const auto s = gv_sub32(a, b); + const auto r = gv_subs_s32(a, b); + if constexpr (((Flags == set_sat) || ...)) + sat = sat | (s ^ r); + d = r; + }; - if (diff < INT16_MIN) - { - d._s16[i] = INT16_MIN; - ppu.sat._u32[0] = 1; - } - else if (diff > INT16_MAX) - { - d._s16[i] = INT16_MAX; - ppu.sat._u32[0] = 1; - } - else - { - d._s16[i] = static_cast(diff); - } - } - - return true; + RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb], ppu.sat); } -bool ppu_interpreter_fast::VSUBSWS(ppu_thread& ppu, ppu_opcode_t op) +template +auto VSUBUBM() { - auto& d = ppu.vr[op.vd]; - const auto& a = ppu.vr[op.va]; - const auto& b = ppu.vr[op.vb]; + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); - for (uint w = 0; w < 4; w++) + static const auto exec = [](auto&& d, auto&& a, auto&& b) { - const s64 result = s64{a._s32[w]} - b._s32[w]; + d = gv_sub8(a, b); + }; - if (result < INT32_MIN) - { - d._s32[w] = INT32_MIN; - } - else if (result > INT32_MAX) - { - d._s32[w] = INT32_MAX; - } - else - d._s32[w] = static_cast(result); - } - return true; + RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]); } -bool ppu_interpreter_precise::VSUBSWS(ppu_thread& ppu, ppu_opcode_t op) +template +auto VSUBUBS() { - auto& d = ppu.vr[op.vd]; - const auto& a = ppu.vr[op.va]; - const auto& b = ppu.vr[op.vb]; + if constexpr (Build == 0xf1a6) + return 
ppu_exec_select::template select(); - for (uint w = 0; w < 4; w++) + static const auto exec = [](auto&& d, auto&& a, auto&& b, auto&& sat) { - const s64 result = s64{a._s32[w]} - b._s32[w]; + const auto s = gv_sub8(a, b); + const auto r = gv_subus_u8(a, b); + if constexpr (((Flags == set_sat) || ...)) + sat = sat | (s ^ r); + d = r; + }; - if (result < INT32_MIN) - { - d._s32[w] = INT32_MIN; - ppu.sat._u32[0] = 1; - } - else if (result > INT32_MAX) - { - d._s32[w] = INT32_MAX; - ppu.sat._u32[0] = 1; - } - else - d._s32[w] = static_cast(result); - } - return true; + RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb], ppu.sat); } -bool ppu_interpreter::VSUBUBM(ppu_thread& ppu, ppu_opcode_t op) +template +auto VSUBUHM() { - ppu.vr[op.vd] = v128::sub8(ppu.vr[op.va], ppu.vr[op.vb]); - return true; -} + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); -bool ppu_interpreter_fast::VSUBUBS(ppu_thread& ppu, ppu_opcode_t op) -{ - ppu.vr[op.vd].vi = _mm_subs_epu8(ppu.vr[op.va].vi, ppu.vr[op.vb].vi); - return true; -} - -bool ppu_interpreter_precise::VSUBUBS(ppu_thread& ppu, ppu_opcode_t op) -{ - const auto& a = ppu.vr[op.va]; - const auto& b = ppu.vr[op.vb]; - auto& d = ppu.vr[op.vd]; - - for (u8 i = 0; i < 16; i++) + static const auto exec = [](auto&& d, auto&& a, auto&& b) { - const s16 diff = a._u8[i] - b._u8[i]; + d = gv_sub16(a, b); + }; - if (diff < 0) - { - d._u8[i] = 0; - ppu.sat._u32[0] = 1; - } - else if (diff > UINT8_MAX) - { - d._u8[i] = UINT8_MAX; - ppu.sat._u32[0] = 1; - } - else - { - d._u8[i] = static_cast(diff); - } - } - - return true; + RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]); } -bool ppu_interpreter::VSUBUHM(ppu_thread& ppu, ppu_opcode_t op) +template +auto VSUBUHS() { - ppu.vr[op.vd] = v128::sub16(ppu.vr[op.va], ppu.vr[op.vb]); - return true; + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select(); + + static const auto exec = [](auto&& d, auto&& a, auto&& b, auto&& sat) + { + const auto s = gv_sub16(a, b); + const auto r = gv_subus_u16(a, b); + if constexpr (((Flags == set_sat) || ...)) + sat = sat | (s ^ r); + d = r; + }; + + RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb], ppu.sat); } -bool ppu_interpreter_fast::VSUBUHS(ppu_thread& ppu, ppu_opcode_t op) +template +auto VSUBUWM() { - ppu.vr[op.vd].vi = _mm_subs_epu16(ppu.vr[op.va].vi, ppu.vr[op.vb].vi); - return true; + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](auto&& d, auto&& a, auto&& b) + { + d = gv_sub32(a, b); + }; + + RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]); } -bool ppu_interpreter_precise::VSUBUHS(ppu_thread& ppu, ppu_opcode_t op) +template +auto VSUBUWS() { - const auto& a = ppu.vr[op.va]; - const auto& b = ppu.vr[op.vb]; - auto& d = ppu.vr[op.vd]; + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select(); - for (u8 i = 0; i < 8; i++) + static const auto exec = [](auto&& d, auto&& a, auto&& b, auto&& sat) { - const s32 diff = a._u16[i] - b._u16[i]; + const auto s = gv_sub32(a, b); + const auto r = gv_subus_u32(a, b); + if constexpr (((Flags == set_sat) || ...)) + sat = sat | (s ^ r); + d = r; + }; - if (diff < 0) - { - d._u16[i] = 0; - ppu.sat._u32[0] = 1; - } - else if (diff > UINT16_MAX) - { - d._u16[i] = UINT16_MAX; - ppu.sat._u32[0] = 1; - } - else - { - d._u16[i] = static_cast(diff); - } - } - - return true; + RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb], ppu.sat); } -bool ppu_interpreter::VSUBUWM(ppu_thread& ppu, ppu_opcode_t op) +template 
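The saturating subtract handlers above (VSUBSBS, VSUBSHS, VSUBSWS, VSUBUBS and friends) replace the old per-element branches with a branch-free sticky flag: compute both the wrapping and the saturating result and OR their XOR into ppu.sat, which is nonzero afterwards exactly when some lane saturated. A scalar sketch of the same idea, assuming nothing beyond standard C++ (sub_sat_s8 and sticky are illustrative names):

#include <cstdint>
#include <cstdio>

int main()
{
    std::uint8_t sticky = 0;

    const auto sub_sat_s8 = [&](std::int8_t a, std::int8_t b) -> std::int8_t
    {
        // Wrapping difference, as gv_sub8 would produce it per lane.
        const std::int8_t wrap = static_cast<std::int8_t>(static_cast<std::uint8_t>(a) - static_cast<std::uint8_t>(b));
        // Saturating difference, as gv_subs_s8 would produce it per lane.
        const int wide = int{a} - int{b};
        const std::int8_t sat = wide < INT8_MIN ? INT8_MIN : wide > INT8_MAX ? INT8_MAX : static_cast<std::int8_t>(wide);
        sticky |= static_cast<std::uint8_t>(wrap ^ sat); // nonzero only when the two results differ, i.e. on saturation
        return sat;
    };

    sub_sat_s8(100, 50);   // no saturation, contributes nothing
    sub_sat_s8(-100, 100); // saturates to -128
    std::printf("sticky=%u\n", sticky); // nonzero
}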
+auto VSUMSWS() { - ppu.vr[op.vd] = v128::sub32(ppu.vr[op.va], ppu.vr[op.vb]); - return true; -} + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select(); -bool ppu_interpreter_fast::VSUBUWS(ppu_thread& ppu, ppu_opcode_t op) -{ - auto& d = ppu.vr[op.vd]; - const auto& a = ppu.vr[op.va]; - const auto& b = ppu.vr[op.vb]; - - for (uint w = 0; w < 4; w++) + static const auto exec = [](auto&& d, auto&& a, auto&& b, auto&& sat) { - const s64 result = s64{a._u32[w]} - b._u32[w]; - - if (result < 0) - { - d._u32[w] = 0; - } - else - d._u32[w] = static_cast(result); - } - return true; -} - -bool ppu_interpreter_precise::VSUBUWS(ppu_thread& ppu, ppu_opcode_t op) -{ - auto& d = ppu.vr[op.vd]; - const auto& a = ppu.vr[op.va]; - const auto& b = ppu.vr[op.vb]; - - for (uint w = 0; w < 4; w++) - { - const s64 result = s64{a._u32[w]} - b._u32[w]; - - if (result < 0) - { - d._u32[w] = 0; - ppu.sat._u32[0] = 1; - } - else - d._u32[w] = static_cast(result); - } - return true; -} - -bool ppu_interpreter_fast::VSUMSWS(ppu_thread& ppu, ppu_opcode_t op) -{ - auto& d = ppu.vr[op.vd]; - const auto& a = ppu.vr[op.va]; - const auto& b = ppu.vr[op.vb]; - - s64 sum = b._s32[0]; - - for (uint w = 0; w < 4; w++) - { - sum += a._s32[w]; - } - - d.clear(); - if (sum > INT32_MAX) - { - d._s32[0] = INT32_MAX; - } - else if (sum < INT32_MIN) - { - d._s32[0] = INT32_MIN; - } - else - d._s32[0] = static_cast(sum); - return true; -} - -bool ppu_interpreter_precise::VSUMSWS(ppu_thread& ppu, ppu_opcode_t op) -{ - auto& d = ppu.vr[op.vd]; - const auto& a = ppu.vr[op.va]; - const auto& b = ppu.vr[op.vb]; - - s64 sum = b._s32[0]; - - for (uint w = 0; w < 4; w++) - { - sum += a._s32[w]; - } - - d.clear(); - if (sum > INT32_MAX) - { - d._s32[0] = INT32_MAX; - ppu.sat._u32[0] = 1; - } - else if (sum < INT32_MIN) - { - d._s32[0] = INT32_MIN; - ppu.sat._u32[0] = 1; - } - else - d._s32[0] = static_cast(sum); - return true; -} - -bool ppu_interpreter_fast::VSUM2SWS(ppu_thread& ppu, ppu_opcode_t op) -{ - v128 d; - const auto& a = ppu.vr[op.va]; - const auto& b = ppu.vr[op.vb]; - - for (uint n = 0; n < 2; n++) - { - const s64 sum = s64{a._s32[n * 2]} + a._s32[n * 2 + 1] + b._s32[n * 2]; - + s64 sum = s64{b._s32[0]} + a._s32[0] + a._s32[1] + a._s32[2] + a._s32[3]; if (sum > INT32_MAX) { - d._s32[n * 2] = INT32_MAX; + sum = u32(INT32_MAX); + if constexpr (((Flags == set_sat) || ...)) + sat._bytes[0] = 1; } else if (sum < INT32_MIN) { - d._s32[n * 2] = INT32_MIN; + sum = u32(INT32_MIN); + if constexpr (((Flags == set_sat) || ...)) + sat._bytes[0] = 1; } else - d._s32[n * 2] = static_cast(sum); - } - d._s32[1] = 0; - d._s32[3] = 0; - ppu.vr[op.vd] = d; - return true; + { + sum = static_cast(sum); + } + + d._u = sum; + }; + + RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb], ppu.sat); } -bool ppu_interpreter_precise::VSUM2SWS(ppu_thread& ppu, ppu_opcode_t op) +template +auto VSUM2SWS() { - v128 d; - const auto& a = ppu.vr[op.va]; - const auto& b = ppu.vr[op.vb]; + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select(); - for (uint n = 0; n < 2; n++) + static const auto exec = [](auto&& d, auto&& a, auto&& b, auto&& sat) { - const s64 sum = s64{a._s32[n * 2]} + a._s32[n * 2 + 1] + b._s32[n * 2]; +#if defined(__AVX512VL__) + const auto x = gv_add64(gv_sar64(gv_shl64(a, 32), 32), gv_sar64(a, 32)); + const auto y = gv_add64(x, gv_sar64(gv_shl64(b, 32), 32)); + const auto r = _mm_unpacklo_epi32(_mm_cvtsepi64_epi32(y), _mm_setzero_si128()); +#elif defined(ARCH_ARM64) + const auto x = 
vaddl_s32(vget_low_s32(vuzp1q_s32(a, a)), vget_low_s32(vuzp2q_s32(a, a))); + const auto y = vaddw_s32(x, vget_low_s32(vuzp1q_s32(b, b))); + const auto r = vmovl_u32(uint32x2_t(vqmovn_s64(y))); +#else + v128 y{}; + y._s64[0] = s64{a._s32[0]} + a._s32[1] + b._s32[0]; + y._s64[1] = s64{a._s32[2]} + a._s32[3] + b._s32[2]; + v128 r{}; + r._u64[0] = y._s64[0] > INT32_MAX ? INT32_MAX : y._s64[0] < INT32_MIN ? u32(INT32_MIN) : static_cast(y._s64[0]); + r._u64[1] = y._s64[1] > INT32_MAX ? INT32_MAX : y._s64[1] < INT32_MIN ? u32(INT32_MIN) : static_cast(y._s64[1]); +#endif + if constexpr (((Flags == set_sat) || ...)) + sat = sat | gv_shr64(gv_add64(y, gv_bcst64(0x80000000u)), 32); + d = r; + }; - if (sum > INT32_MAX) + RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb], ppu.sat); +} + +template +auto VSUM4SBS() +{ + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select(); + + static const auto exec = [](auto&& d, auto&& a, auto&& b, auto&& sat) + { + //const auto r = _mm_dpbusds_epi32(b, _mm_set1_epi8(1), a); + //const auto s = _mm_dpbusd_epi32(b, _mm_set1_epi8(1), a); + const auto x = gv_hadds8x4(a); + const auto r = gv_adds_s32(x, b); + const auto s = gv_add32(x, b); + if constexpr (((Flags == set_sat) || ...)) + sat = sat | (r ^ s); + d = r; + }; + + RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb], ppu.sat); +} + +template +auto VSUM4SHS() +{ + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select(); + + static const auto exec = [](auto&& d, auto&& a, auto&& b, auto&& sat) + { + //const auto r = _mm_dpwssds_epi32(b, a, _mm_set1_epi16(1)); + //const auto s = _mm_dpwssd_epi32(b, a, _mm_set1_epi16(1)); + const auto x = gv_hadds16x2(a); + const auto r = gv_adds_s32(x, b); + const auto s = gv_add32(x, b); + if constexpr (((Flags == set_sat) || ...)) + sat = sat | (r ^ s); + d = r; + }; + + RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb], ppu.sat); +} + +template +auto VSUM4UBS() +{ + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select(); + + static const auto exec = [](auto&& d, auto&& a, auto&& b, auto&& sat) + { + const auto x = gv_haddu8x4(a); + const auto r = gv_addus_u32(x, b); + if constexpr (((Flags == set_sat) || ...)) + sat = sat | (r ^ gv_add32(x, b)); + d = r; + }; + + RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb], ppu.sat); +} + +template +auto VUPKHPX() +{ + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + +#if defined(ARCH_X64_0) + static const auto make = [](asmjit::ppu_builder& c) + { + const auto [v0, v1, v2] = c.vec_alloc<3>(); + EMIT(punpckhwd, v0, v0, c.ppu_vr(s_op.vb)); + EMIT(psrad, v0, v0, c.imm(16)); + EMIT(pslld, v1, v0, c.imm(6)); + EMIT(pslld, v2, v0, c.imm(3)); + BCST(pand, d, v0, v0, c.get_bcst(0xff00001f)); + BCST(pand, d, v1, v1, c.get_bcst(0x1f0000)); + BCST(pand, d, v2, v2, c.get_bcst(0x1f00)); + EMIT(por, v0, v0, v1); + EMIT(por, v0, v0, v2); + LDST(movaps, c.ppu_vr(s_op.vd, true), v0); + c.ppu_ret(); + }; +#endif + static const auto exec = [](auto&& d, auto&& b) + { + const auto x = gv_extend_hi_s16(b); + d = gv_and32(x, gv_bcst32(0xff00001f)) | gv_and32(gv_shl32(x, 6), gv_bcst32(0x1f0000)) | gv_and32(gv_shl32(x, 3), gv_bcst32(0x1f00)); + }; + + RETURN_(ppu.vr[op.vd], ppu.vr[op.vb]); +} + +template +auto VUPKHSB() +{ + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + +#if defined(ARCH_X64_0) + static const auto make = [](asmjit::ppu_builder& c) + { + const auto v0 = c.vec_alloc(); + EMIT(punpckhbw, v0, v0, c.ppu_vr(s_op.vb)); + 
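VSUM2SWS guards its saturation flag with gv_shr64(gv_add64(y, gv_bcst64(0x80000000u)), 32): biasing the 64-bit sum by 2^31 and logically shifting right by 32 yields zero exactly when the sum fits in a signed 32-bit integer, so the shifted value can be OR-ed straight into the sticky state. A small scalar check of that identity (overflows_s32 is an illustrative name):

#include <cstdint>
#include <cstdio>

static bool overflows_s32(std::int64_t y)
{
    // Zero iff y is in [INT32_MIN, INT32_MAX].
    return ((static_cast<std::uint64_t>(y) + 0x80000000u) >> 32) != 0;
}

int main()
{
    std::printf("%d %d %d %d\n",
        overflows_s32(0),
        overflows_s32(INT32_MAX),
        overflows_s32(std::int64_t{INT32_MAX} + 1),
        overflows_s32(INT32_MIN));
    // expected output: 0 0 1 0
}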
EMIT(psraw, v0, v0, c.imm(8)); + LDST(movaps, c.ppu_vr(s_op.vd, true), v0); + c.ppu_ret(); + }; +#endif + static const auto exec = [](auto&& d, auto&& b) + { + d = gv_extend_hi_s8(b); + }; + + RETURN_(ppu.vr[op.vd], ppu.vr[op.vb]); +} + +template +auto VUPKHSH() +{ + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + +#if defined(ARCH_X64_0) + static const auto make = [](asmjit::ppu_builder& c) + { + const auto v0 = c.vec_alloc(); + EMIT(punpckhwd, v0, v0, c.ppu_vr(s_op.vb)); + EMIT(psrad, v0, v0, c.imm(16)); + LDST(movaps, c.ppu_vr(s_op.vd, true), v0); + c.ppu_ret(); + }; +#endif + static const auto exec = [](auto&& d, auto&& b) + { + d = gv_extend_hi_s16(b); + }; + + RETURN_(ppu.vr[op.vd], ppu.vr[op.vb]); +} + +template +auto VUPKLPX() +{ + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + +#if defined(ARCH_X64_0) + static const auto make = [](asmjit::ppu_builder& c) + { + const auto [v0, v1, v2] = c.vec_alloc<3>(); + if (utils::has_sse41()) { - d._s32[n * 2] = INT32_MAX; - ppu.sat._u32[0] = 1; - } - else if (sum < INT32_MIN) - { - d._s32[n * 2] = INT32_MIN; - ppu.sat._u32[0] = 1; + LDST(pmovsxwd, v0, c.ppu_vr<8>(s_op.vb)); } else - d._s32[n * 2] = static_cast(sum); - } + { + EMIT(punpcklwd, v0, v0, c.ppu_vr(s_op.vb)); + EMIT(psrad, v0, v0, c.imm(16)); + } + EMIT(pslld, v1, v0, c.imm(6)); + EMIT(pslld, v2, v0, c.imm(3)); + BCST(pand, d, v0, v0, c.get_bcst(0xff00001f)); + BCST(pand, d, v1, v1, c.get_bcst(0x1f0000)); + BCST(pand, d, v2, v2, c.get_bcst(0x1f00)); + EMIT(por, v0, v0, v1); + EMIT(por, v0, v0, v2); + LDST(movaps, c.ppu_vr(s_op.vd, true), v0); + c.ppu_ret(); + }; +#endif + static const auto exec = [](auto&& d, auto&& b) + { + const auto x = gv_extend_lo_s16(b); + d = gv_and32(x, gv_bcst32(0xff00001f)) | gv_and32(gv_shl32(x, 6), gv_bcst32(0x1f0000)) | gv_and32(gv_shl32(x, 3), gv_bcst32(0x1f00)); + }; - d._s32[1] = 0; - d._s32[3] = 0; - ppu.vr[op.vd] = d; - return true; + RETURN_(ppu.vr[op.vd], ppu.vr[op.vb]); } -bool ppu_interpreter_fast::VSUM4SBS(ppu_thread& ppu, ppu_opcode_t op) +template +auto VUPKLSB() { - auto& d = ppu.vr[op.vd]; - const auto& a = ppu.vr[op.va]; - const auto& b = ppu.vr[op.vb]; + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); - for (uint w = 0; w < 4; w++) +#if defined(ARCH_X64_0) + static const auto make = [](asmjit::ppu_builder& c) { - s64 sum = b._s32[w]; - - for (uint b = 0; b < 4; b++) + const auto v0 = c.vec_alloc(); + if (utils::has_sse41()) { - sum += a._s8[w * 4 + b]; - } - - if (sum > INT32_MAX) - { - d._s32[w] = INT32_MAX; - } - else if (sum < INT32_MIN) - { - d._s32[w] = INT32_MIN; + LDST(pmovsxbw, v0, c.ppu_vr<8>(s_op.vb)); } else - d._s32[w] = static_cast(sum); - } - return true; + { + EMIT(punpcklbw, v0, v0, c.ppu_vr(s_op.vb)); + EMIT(psraw, v0, v0, c.imm(8)); + } + LDST(movaps, c.ppu_vr(s_op.vd, true), v0); + c.ppu_ret(); + }; +#endif + static const auto exec = [](auto&& d, auto&& b) + { + d = gv_extend_lo_s8(b); + }; + + RETURN_(ppu.vr[op.vd], ppu.vr[op.vb]); } -bool ppu_interpreter_precise::VSUM4SBS(ppu_thread& ppu, ppu_opcode_t op) +template +auto VUPKLSH() { - auto& d = ppu.vr[op.vd]; - const auto& a = ppu.vr[op.va]; - const auto& b = ppu.vr[op.vb]; + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); - for (uint w = 0; w < 4; w++) +#if defined(ARCH_X64_0) + static const auto make = [](asmjit::ppu_builder& c) { - s64 sum = b._s32[w]; - - for (uint b = 0; b < 4; b++) + const auto v0 = c.vec_alloc(); + if (utils::has_sse41()) 
{ - sum += a._s8[w * 4 + b]; - } - - if (sum > INT32_MAX) - { - d._s32[w] = INT32_MAX; - ppu.sat._u32[0] = 1; - } - else if (sum < INT32_MIN) - { - d._s32[w] = INT32_MIN; - ppu.sat._u32[0] = 1; + LDST(pmovsxwd, v0, c.ppu_vr<8>(s_op.vb)); } else - d._s32[w] = static_cast(sum); - } - return true; -} - -bool ppu_interpreter_fast::VSUM4SHS(ppu_thread& ppu, ppu_opcode_t op) -{ - auto& d = ppu.vr[op.vd]; - const auto& a = ppu.vr[op.va]; - const auto& b = ppu.vr[op.vb]; - - for (uint w = 0; w < 4; w++) - { - s64 sum = b._s32[w]; - - for (uint h = 0; h < 2; h++) { - sum += a._s16[w * 2 + h]; + EMIT(punpcklwd, v0, v0, c.ppu_vr(s_op.vb)); + EMIT(psrad, v0, v0, c.imm(16)); } - - if (sum > INT32_MAX) - { - d._s32[w] = INT32_MAX; - } - else if (sum < INT32_MIN) - { - d._s32[w] = INT32_MIN; - } - else - d._s32[w] = static_cast(sum); - } - return true; -} - -bool ppu_interpreter_precise::VSUM4SHS(ppu_thread& ppu, ppu_opcode_t op) -{ - auto& d = ppu.vr[op.vd]; - const auto& a = ppu.vr[op.va]; - const auto& b = ppu.vr[op.vb]; - - for (uint w = 0; w < 4; w++) + LDST(movaps, c.ppu_vr(s_op.vd, true), v0); + c.ppu_ret(); + }; +#endif + static const auto exec = [](auto&& d, auto&& b) { - s64 sum = b._s32[w]; + d = gv_extend_lo_s16(b); + }; - for (uint h = 0; h < 2; h++) - { - sum += a._s16[w * 2 + h]; - } - - if (sum > INT32_MAX) - { - d._s32[w] = INT32_MAX; - ppu.sat._u32[0] = 1; - } - else if (sum < INT32_MIN) - { - d._s32[w] = INT32_MIN; - ppu.sat._u32[0] = 1; - } - else - d._s32[w] = static_cast(sum); - } - return true; + RETURN_(ppu.vr[op.vd], ppu.vr[op.vb]); } -bool ppu_interpreter_fast::VSUM4UBS(ppu_thread& ppu, ppu_opcode_t op) +template +auto VXOR() { - auto& d = ppu.vr[op.vd]; - const auto& a = ppu.vr[op.va]; - const auto& b = ppu.vr[op.vb]; + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); - for (uint w = 0; w < 4; w++) + static const auto exec = [](auto&& d, auto&& a, auto&& b) { - u64 sum = b._u32[w]; + d = gv_xorfs(std::move(a), std::move(b)); + }; - for (uint b = 0; b < 4; b++) - { - sum += a._u8[w * 4 + b]; - } - - if (sum > UINT32_MAX) - { - d._u32[w] = UINT32_MAX; - } - else - d._u32[w] = static_cast(sum); - } - return true; + RETURN(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]); } -bool ppu_interpreter_precise::VSUM4UBS(ppu_thread& ppu, ppu_opcode_t op) +template +auto TDI() { - auto& d = ppu.vr[op.vd]; - const auto& a = ppu.vr[op.va]; - const auto& b = ppu.vr[op.vb]; + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); - for (uint w = 0; w < 4; w++) + if constexpr (Build == 0) return +[](ppu_thread& ppu, ppu_opcode_t op, be_t* this_op, ppu_intrp_func* next_fn) { - u64 sum = b._u32[w]; - - for (uint b = 0; b < 4; b++) - { - sum += a._u8[w * 4 + b]; - } - - if (sum > UINT32_MAX) - { - d._u32[w] = UINT32_MAX; - ppu.sat._u32[0] = 1; - } - else - d._u32[w] = static_cast(sum); - } - return true; -} - -bool ppu_interpreter::VUPKHPX(ppu_thread& ppu, ppu_opcode_t op) -{ - auto& d = ppu.vr[op.vd]; - v128 VB = ppu.vr[op.vb]; - for (uint w = 0; w < 4; w++) - { - d._s8[w * 4 + 3] = VB._s8[8 + w * 2 + 1] >> 7; // signed shift sign extends - d._u8[w * 4 + 2] = (VB._u8[8 + w * 2 + 1] >> 2) & 0x1f; - d._u8[w * 4 + 1] = ((VB._u8[8 + w * 2 + 1] & 0x3) << 3) | ((VB._u8[8 + w * 2 + 0] >> 5) & 0x7); - d._u8[w * 4 + 0] = VB._u8[8 + w * 2 + 0] & 0x1f; - } - return true; -} - -bool ppu_interpreter::VUPKHSB(ppu_thread& ppu, ppu_opcode_t op) -{ - auto& d = ppu.vr[op.vd]; - v128 VB = ppu.vr[op.vb]; - for (uint h = 0; h < 8; h++) - { - d._s16[h] = VB._s8[8 + h]; - 
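The VUPKHPX/VUPKLPX rewrite replaces the per-byte loop with a sign extension followed by three masked shifts, so it helps to see the same field math applied to a single 1:5:5:5 pixel. A scalar sketch (unpack_pixel is an illustrative name), assuming the usual sign/R/G/B layout in bits 15, 14..10, 9..5 and 4..0:

#include <cstdint>
#include <cstdio>

static std::uint32_t unpack_pixel(std::int16_t p)
{
    const std::uint32_t x = static_cast<std::uint32_t>(static_cast<std::int32_t>(p)); // sign-extend to 32 bits
    return (x & 0xff00001fu)          // sign-extension byte + 5-bit B already in place
         | ((x << 6) & 0x001f0000u)   // 5-bit R moved into bits 20..16
         | ((x << 3) & 0x00001f00u);  // 5-bit G moved into bits 12..8
}

int main()
{
    // sign bit set, R = 0x1f, G = 0x00, B = 0x15
    const std::int16_t p = static_cast<std::int16_t>(0xfc15);
    std::printf("%08x\n", unpack_pixel(p)); // prints ff1f0015
}

The gv_extend_hi_s16/gv_extend_lo_s16 calls supply that sign-extension step for the high and low halves, which is also all that VUPKHSB, VUPKLSB, VUPKHSH and VUPKLSH need on their own.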
} - return true; -} - -bool ppu_interpreter::VUPKHSH(ppu_thread& ppu, ppu_opcode_t op) -{ - auto& d = ppu.vr[op.vd]; - v128 VB = ppu.vr[op.vb]; - for (uint w = 0; w < 4; w++) - { - d._s32[w] = VB._s16[4 + w]; - } - return true; -} - -bool ppu_interpreter::VUPKLPX(ppu_thread& ppu, ppu_opcode_t op) -{ - auto& d = ppu.vr[op.vd]; - v128 VB = ppu.vr[op.vb]; - for (uint w = 0; w < 4; w++) - { - d._s8[w * 4 + 3] = VB._s8[w * 2 + 1] >> 7; // signed shift sign extends - d._u8[w * 4 + 2] = (VB._u8[w * 2 + 1] >> 2) & 0x1f; - d._u8[w * 4 + 1] = ((VB._u8[w * 2 + 1] & 0x3) << 3) | ((VB._u8[w * 2 + 0] >> 5) & 0x7); - d._u8[w * 4 + 0] = VB._u8[w * 2 + 0] & 0x1f; - } - return true; -} - -bool ppu_interpreter::VUPKLSB(ppu_thread& ppu, ppu_opcode_t op) -{ - auto& d = ppu.vr[op.vd]; - v128 VB = ppu.vr[op.vb]; - for (uint h = 0; h < 8; h++) - { - d._s16[h] = VB._s8[h]; - } - return true; -} - -bool ppu_interpreter::VUPKLSH(ppu_thread& ppu, ppu_opcode_t op) -{ - auto& d = ppu.vr[op.vd]; - v128 VB = ppu.vr[op.vb]; - for (uint w = 0; w < 4; w++) - { - d._s32[w] = VB._s16[w]; - } - return true; -} - -bool ppu_interpreter::VXOR(ppu_thread& ppu, ppu_opcode_t op) -{ - ppu.vr[op.vd] = ppu.vr[op.va] ^ ppu.vr[op.vb]; - return true; -} - -bool ppu_interpreter::TDI(ppu_thread& ppu, ppu_opcode_t op) -{ const s64 a = ppu.gpr[op.ra], b = op.simm16; const u64 a_ = a, b_ = b; @@ -3033,15 +3403,22 @@ bool ppu_interpreter::TDI(ppu_thread& ppu, ppu_opcode_t op) ((op.bo & 0x2) && a_ < b_) || ((op.bo & 0x1) && a_ > b_)) { - ppu_trap(ppu, ppu.cia); - return false; + [[unlikely]] + ppu_trap(ppu, vm::get_addr(this_op)); + return; } - - return true; + return next_fn->fn(ppu, {this_op[1]}, this_op + 1, next_fn + 1); + }; } -bool ppu_interpreter::TWI(ppu_thread& ppu, ppu_opcode_t op) +template +auto TWI() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + if constexpr (Build == 0) return +[](ppu_thread& ppu, ppu_opcode_t op, be_t* this_op, ppu_intrp_func* next_fn) + { const s32 a = static_cast(ppu.gpr[op.ra]), b = op.simm16; const u32 a_ = a, b_ = b; @@ -3051,31 +3428,49 @@ bool ppu_interpreter::TWI(ppu_thread& ppu, ppu_opcode_t op) ((op.bo & 0x2) && a_ < b_) || ((op.bo & 0x1) && a_ > b_)) { - ppu_trap(ppu, ppu.cia); - return false; + [[unlikely]] + ppu_trap(ppu, vm::get_addr(this_op)); + return; } - - return true; + return next_fn->fn(ppu, {this_op[1]}, this_op + 1, next_fn + 1); + }; } -bool ppu_interpreter::MULLI(ppu_thread& ppu, ppu_opcode_t op) +template +auto MULLI() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { ppu.gpr[op.rd] = static_cast(ppu.gpr[op.ra]) * op.simm16; - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::SUBFIC(ppu_thread& ppu, ppu_opcode_t op) +template +auto SUBFIC() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u64 a = ppu.gpr[op.ra]; const s64 i = op.simm16; const auto r = add64_flags(~a, i, 1); ppu.gpr[op.rd] = r.result; ppu.xer.ca = r.carry; - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::CMPLI(ppu_thread& ppu, ppu_opcode_t op) +template +auto CMPLI() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { if (op.l10) { ppu_cr_set(ppu, op.crfd, ppu.gpr[op.ra], op.uimm16); @@ -3084,11 +3479,17 @@ bool ppu_interpreter::CMPLI(ppu_thread& ppu, 
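TDI above is the first handler written directly against the new calling convention: instead of returning bool to an outer loop, the Build == 0 body returns a lambda that takes a be_t pointer to the current opcode plus a pointer to the next handler descriptor, and either traps or tail-calls that next handler. A toy sketch of the chaining, with made-up types (cpu_state, intrp_func, op_add and op_halt are not from the patch):

#include <cstdint>
#include <cstdio>

struct cpu_state { std::uint64_t acc = 0; };
struct intrp_func;
using intrp_fn = void (*)(cpu_state&, std::uint32_t op, const std::uint32_t* this_op, const intrp_func* next_fn);
struct intrp_func { intrp_fn fn; };

static void op_add(cpu_state& cpu, std::uint32_t op, const std::uint32_t* this_op, const intrp_func* next_fn)
{
    cpu.acc += op & 0xffff;
    // Fall through to the next opcode by tail-calling the next handler.
    return next_fn->fn(cpu, this_op[1], this_op + 1, next_fn + 1);
}

static void op_halt(cpu_state& cpu, std::uint32_t, const std::uint32_t*, const intrp_func*)
{
    std::printf("acc=%llu\n", static_cast<unsigned long long>(cpu.acc));
}

int main()
{
    const std::uint32_t code[] = {5, 7, 0};
    const intrp_func table[] = {{op_add}, {op_add}, {op_halt}};
    cpu_state cpu;
    table[0].fn(cpu, code[0], code, table); // prints acc=12
}

The same fall-through shape, next_fn->fn(ppu, {this_op[1]}, this_op + 1, next_fn + 1), recurs in the branch and trap handlers below.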
ppu_opcode_t op) { ppu_cr_set(ppu, op.crfd, static_cast(ppu.gpr[op.ra]), op.uimm16); } - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::CMPI(ppu_thread& ppu, ppu_opcode_t op) +template +auto CMPI() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { if (op.l10) { ppu_cr_set(ppu, op.crfd, ppu.gpr[op.ra], op.simm16); @@ -3097,92 +3498,159 @@ bool ppu_interpreter::CMPI(ppu_thread& ppu, ppu_opcode_t op) { ppu_cr_set(ppu, op.crfd, static_cast(ppu.gpr[op.ra]), op.simm16); } - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::ADDIC(ppu_thread& ppu, ppu_opcode_t op) +template +auto ADDIC() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const s64 a = ppu.gpr[op.ra]; const s64 i = op.simm16; const auto r = add64_flags(a, i); ppu.gpr[op.rd] = r.result; ppu.xer.ca = r.carry; if (op.main & 1) [[unlikely]] ppu_cr_set(ppu, 0, r.result, 0); - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::ADDI(ppu_thread& ppu, ppu_opcode_t op) +template +auto ADDI() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { ppu.gpr[op.rd] = op.ra ? ppu.gpr[op.ra] + op.simm16 : op.simm16; - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::ADDIS(ppu_thread& ppu, ppu_opcode_t op) +template +auto ADDIS() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { ppu.gpr[op.rd] = op.ra ? ppu.gpr[op.ra] + (op.simm16 * 65536) : (op.simm16 * 65536); - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::BC(ppu_thread& ppu, ppu_opcode_t op) +template +auto BC() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + if constexpr (Build == 0) return +[](ppu_thread& ppu, ppu_opcode_t op, be_t* this_op, ppu_intrp_func* next_fn) + { const bool bo0 = (op.bo & 0x10) != 0; const bool bo1 = (op.bo & 0x08) != 0; const bool bo2 = (op.bo & 0x04) != 0; const bool bo3 = (op.bo & 0x02) != 0; ppu.ctr -= (bo2 ^ true); - if (op.lk) ppu.lr = ppu.cia + 4; + const u32 link = vm::get_addr(this_op) + 4; + if (op.lk) ppu.lr = link; const bool ctr_ok = bo2 | ((ppu.ctr != 0) ^ bo3); const bool cond_ok = bo0 | (!!(ppu.cr[op.bi]) ^ (bo1 ^ true)); + const u32 old_cia = ppu.cia; + if (ctr_ok && cond_ok) { + ppu.cia = vm::get_addr(this_op); // Provide additional information by using the origin of the call // Because this is a fixed target branch there's no abiguity about it ppu_record_call(ppu, ppu.cia, op); ppu.cia = (op.aa ? 
0 : ppu.cia) + op.bt14; - return false; + } + else if (!ppu.state) [[likely]] + { + return next_fn->fn(ppu, {this_op[1]}, this_op + 1, next_fn + 1); } else { - return true; + ppu.cia = link; } + + ppu.exec_bytes += link - old_cia; + }; } -bool ppu_interpreter::SC(ppu_thread& ppu, ppu_opcode_t op) +template +auto SC() { - if (op.opcode != ppu_instructions::SC(0)) + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + if constexpr (Build == 0) { - return UNK(ppu, op); - } + return +[](ppu_thread& ppu, ppu_opcode_t op, be_t* this_op, ppu_intrp_func*) + { + const u32 old_cia = ppu.cia; + ppu.cia = vm::get_addr(this_op); + ppu.exec_bytes += ppu.cia - old_cia; + if (op.opcode != ppu_instructions::SC(0)) + { + fmt::throw_exception("Unknown/Illegal SC: 0x%08x", op.opcode); + } - ppu_execute_syscall(ppu, ppu.gpr[11]); - return false; + ppu_execute_syscall(ppu, ppu.gpr[11]); + }; + } } -bool ppu_interpreter::B(ppu_thread& ppu, ppu_opcode_t op) +template +auto B() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + if constexpr (Build == 0) return +[](ppu_thread& ppu, ppu_opcode_t op, be_t* this_op, ppu_intrp_func*) + { + const u32 old_cia = ppu.cia; + const u32 link = (ppu.cia = vm::get_addr(this_op)) + 4; // Provide additional information by using the origin of the call // Because this is a fixed target branch there's no abiguity about it ppu_record_call(ppu, ppu.cia, op); - const u32 link = ppu.cia + 4; ppu.cia = (op.aa ? 0 : ppu.cia) + op.bt24; if (op.lk) ppu.lr = link; - return false; + ppu.exec_bytes += link - old_cia; + }; } -bool ppu_interpreter::MCRF(ppu_thread& ppu, ppu_opcode_t op) +template +auto MCRF() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { CHECK_SIZE(ppu_thread::cr, 32); ppu.cr.fields[op.crfd] = ppu.cr.fields[op.crfs]; - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::BCLR(ppu_thread& ppu, ppu_opcode_t op) +template +auto BCLR() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + if constexpr (Build == 0) return +[](ppu_thread& ppu, ppu_opcode_t op, be_t* this_op, ppu_intrp_func* next_fn) + { const bool bo0 = (op.bo & 0x10) != 0; const bool bo1 = (op.bo & 0x08) != 0; const bool bo2 = (op.bo & 0x04) != 0; @@ -3194,191 +3662,377 @@ bool ppu_interpreter::BCLR(ppu_thread& ppu, ppu_opcode_t op) const bool cond_ok = bo0 | (!!(ppu.cr[op.bi]) ^ (bo1 ^ true)); const u32 target = static_cast(ppu.lr) & ~3; - if (op.lk) ppu.lr = ppu.cia + 4; + const u32 link = vm::get_addr(this_op) + 4; + if (op.lk) ppu.lr = link; + + const u32 old_cia = ppu.cia; if (ctr_ok && cond_ok) { - ppu.cia = ppu_record_call(ppu, target, op, true); - return false; + ppu_record_call(ppu, target, op, true); + ppu.cia = target; + } + else if (!ppu.state) [[likely]] + { + return next_fn->fn(ppu, {this_op[1]}, this_op + 1, next_fn + 1); } else { - return true; + ppu.cia = link; } + + ppu.exec_bytes += link - old_cia; + }; } -bool ppu_interpreter::CRNOR(ppu_thread& ppu, ppu_opcode_t op) +template +auto CRNOR() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { ppu.cr[op.crbd] = (ppu.cr[op.crba] | ppu.cr[op.crbb]) ^ true; - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::CRANDC(ppu_thread& ppu, ppu_opcode_t op) +template +auto CRANDC() { + if constexpr (Build == 0xf1a6) + return 
ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { ppu.cr[op.crbd] = ppu.cr[op.crba] & (ppu.cr[op.crbb] ^ true); - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::ISYNC(ppu_thread&, ppu_opcode_t) +template +auto ISYNC() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](auto&&, auto) { atomic_fence_acquire(); - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::CRXOR(ppu_thread& ppu, ppu_opcode_t op) +template +auto CRXOR() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { ppu.cr[op.crbd] = ppu.cr[op.crba] ^ ppu.cr[op.crbb]; - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::CRNAND(ppu_thread& ppu, ppu_opcode_t op) +template +auto CRNAND() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { ppu.cr[op.crbd] = (ppu.cr[op.crba] & ppu.cr[op.crbb]) ^ true; - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::CRAND(ppu_thread& ppu, ppu_opcode_t op) +template +auto CRAND() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { ppu.cr[op.crbd] = ppu.cr[op.crba] & ppu.cr[op.crbb]; - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::CREQV(ppu_thread& ppu, ppu_opcode_t op) +template +auto CREQV() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { ppu.cr[op.crbd] = (ppu.cr[op.crba] ^ ppu.cr[op.crbb]) ^ true; - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::CRORC(ppu_thread& ppu, ppu_opcode_t op) +template +auto CRORC() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { ppu.cr[op.crbd] = ppu.cr[op.crba] | (ppu.cr[op.crbb] ^ true); - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::CROR(ppu_thread& ppu, ppu_opcode_t op) +template +auto CROR() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { ppu.cr[op.crbd] = ppu.cr[op.crba] | ppu.cr[op.crbb]; - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::BCCTR(ppu_thread& ppu, ppu_opcode_t op) +template +auto BCCTR() { - if (op.lk) ppu.lr = ppu.cia + 4; + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + if constexpr (Build == 0) return +[](ppu_thread& ppu, ppu_opcode_t op, be_t* this_op, ppu_intrp_func* next_fn) + { + const u32 link = vm::get_addr(this_op) + 4; + if (op.lk) ppu.lr = link; + const u32 old_cia = ppu.cia; if (op.bo & 0x10 || ppu.cr[op.bi] == ((op.bo & 0x8) != 0)) { - ppu.cia = ppu_record_call(ppu, static_cast(ppu.ctr) & ~3, op, true); - return false; + const u32 target = static_cast(ppu.ctr) & ~3; + ppu_record_call(ppu, target, op, true); + ppu.cia = target; + } + else if (!ppu.state) [[likely]] + { + return next_fn->fn(ppu, {this_op[1]}, this_op + 1, next_fn + 1); + } + else + { + ppu.cia = link; } - return true; + ppu.exec_bytes += link - old_cia; + }; } -bool ppu_interpreter::RLWIMI(ppu_thread& ppu, ppu_opcode_t op) +template +auto RLWIMI() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template 
select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u64 mask = ppu_rotate_mask(32 + op.mb32, 32 + op.me32); ppu.gpr[op.ra] = (ppu.gpr[op.ra] & ~mask) | (dup32(utils::rol32(static_cast(ppu.gpr[op.rs]), op.sh32)) & mask); - if (op.rc) [[unlikely]] ppu_cr_set(ppu, 0, ppu.gpr[op.ra], 0); - return true; + if constexpr (((Flags == has_rc) || ...)) + ppu_cr_set(ppu, 0, ppu.gpr[op.ra], 0); + }; + RETURN_(ppu, op); } -bool ppu_interpreter::RLWINM(ppu_thread& ppu, ppu_opcode_t op) +template +auto RLWINM() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { ppu.gpr[op.ra] = dup32(utils::rol32(static_cast(ppu.gpr[op.rs]), op.sh32)) & ppu_rotate_mask(32 + op.mb32, 32 + op.me32); - if (op.rc) [[unlikely]] ppu_cr_set(ppu, 0, ppu.gpr[op.ra], 0); - return true; + if constexpr (((Flags == has_rc) || ...)) + ppu_cr_set(ppu, 0, ppu.gpr[op.ra], 0); + }; + RETURN_(ppu, op); } -bool ppu_interpreter::RLWNM(ppu_thread& ppu, ppu_opcode_t op) +template +auto RLWNM() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { ppu.gpr[op.ra] = dup32(utils::rol32(static_cast(ppu.gpr[op.rs]), ppu.gpr[op.rb] & 0x1f)) & ppu_rotate_mask(32 + op.mb32, 32 + op.me32); - if (op.rc) [[unlikely]] ppu_cr_set(ppu, 0, ppu.gpr[op.ra], 0); - return true; + if constexpr (((Flags == has_rc) || ...)) + ppu_cr_set(ppu, 0, ppu.gpr[op.ra], 0); + }; + RETURN_(ppu, op); } -bool ppu_interpreter::ORI(ppu_thread& ppu, ppu_opcode_t op) +template +auto ORI() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { ppu.gpr[op.ra] = ppu.gpr[op.rs] | op.uimm16; - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::ORIS(ppu_thread& ppu, ppu_opcode_t op) +template +auto ORIS() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { ppu.gpr[op.ra] = ppu.gpr[op.rs] | (u64{op.uimm16} << 16); - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::XORI(ppu_thread& ppu, ppu_opcode_t op) +template +auto XORI() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { ppu.gpr[op.ra] = ppu.gpr[op.rs] ^ op.uimm16; - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::XORIS(ppu_thread& ppu, ppu_opcode_t op) +template +auto XORIS() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { ppu.gpr[op.ra] = ppu.gpr[op.rs] ^ (u64{op.uimm16} << 16); - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::ANDI(ppu_thread& ppu, ppu_opcode_t op) +template +auto ANDI() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { ppu.gpr[op.ra] = ppu.gpr[op.rs] & op.uimm16; ppu_cr_set(ppu, 0, ppu.gpr[op.ra], 0); - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::ANDIS(ppu_thread& ppu, ppu_opcode_t op) +template +auto ANDIS() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { ppu.gpr[op.ra] = ppu.gpr[op.rs] & (u64{op.uimm16} << 16); ppu_cr_set(ppu, 0, 
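The if constexpr (((Flags == has_rc) || ...)) guards that replace the old if (op.rc) [[unlikely]] checks are C++17 fold expressions over the variadic Flags pack, i.e. a compile-time "is this flag present?" test. A self-contained sketch (ppu_flag and updates_cr0 are illustrative names):

#include <cstdio>

enum ppu_flag { has_rc, has_oe, set_sat };

template <ppu_flag... Flags>
constexpr bool updates_cr0()
{
    return ((Flags == has_rc) || ...); // true iff has_rc appears anywhere in Flags...
}

static_assert(updates_cr0<has_rc, has_oe>());
static_assert(!updates_cr0<has_oe>());
static_assert(!updates_cr0<>()); // an empty || fold is false

int main()
{
    std::printf("%d %d\n", updates_cr0<has_rc>(), updates_cr0<set_sat>()); // prints 1 0
}

Because the test is evaluated at compile time, the ppu_cr_set call simply does not exist in specializations that never asked for the Rc variant.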
ppu.gpr[op.ra], 0); - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::RLDICL(ppu_thread& ppu, ppu_opcode_t op) +template +auto RLDICL() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { ppu.gpr[op.ra] = utils::rol64(ppu.gpr[op.rs], op.sh64) & (~0ull >> op.mbe64); - if (op.rc) [[unlikely]] ppu_cr_set(ppu, 0, ppu.gpr[op.ra], 0); - return true; + if constexpr (((Flags == has_rc) || ...)) + ppu_cr_set(ppu, 0, ppu.gpr[op.ra], 0); + }; + RETURN_(ppu, op); } -bool ppu_interpreter::RLDICR(ppu_thread& ppu, ppu_opcode_t op) +template +auto RLDICR() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { ppu.gpr[op.ra] = utils::rol64(ppu.gpr[op.rs], op.sh64) & (~0ull << (op.mbe64 ^ 63)); - if (op.rc) [[unlikely]] ppu_cr_set(ppu, 0, ppu.gpr[op.ra], 0); - return true; + if constexpr (((Flags == has_rc) || ...)) + ppu_cr_set(ppu, 0, ppu.gpr[op.ra], 0); + }; + RETURN_(ppu, op); } -bool ppu_interpreter::RLDIC(ppu_thread& ppu, ppu_opcode_t op) +template +auto RLDIC() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { ppu.gpr[op.ra] = utils::rol64(ppu.gpr[op.rs], op.sh64) & ppu_rotate_mask(op.mbe64, op.sh64 ^ 63); - if (op.rc) [[unlikely]] ppu_cr_set(ppu, 0, ppu.gpr[op.ra], 0); - return true; + if constexpr (((Flags == has_rc) || ...)) + ppu_cr_set(ppu, 0, ppu.gpr[op.ra], 0); + }; + RETURN_(ppu, op); } -bool ppu_interpreter::RLDIMI(ppu_thread& ppu, ppu_opcode_t op) +template +auto RLDIMI() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u64 mask = ppu_rotate_mask(op.mbe64, op.sh64 ^ 63); ppu.gpr[op.ra] = (ppu.gpr[op.ra] & ~mask) | (utils::rol64(ppu.gpr[op.rs], op.sh64) & mask); - if (op.rc) [[unlikely]] ppu_cr_set(ppu, 0, ppu.gpr[op.ra], 0); - return true; + if constexpr (((Flags == has_rc) || ...)) + ppu_cr_set(ppu, 0, ppu.gpr[op.ra], 0); + }; + RETURN_(ppu, op); } -bool ppu_interpreter::RLDCL(ppu_thread& ppu, ppu_opcode_t op) +template +auto RLDCL() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { ppu.gpr[op.ra] = utils::rol64(ppu.gpr[op.rs], ppu.gpr[op.rb] & 0x3f) & (~0ull >> op.mbe64); - if (op.rc) [[unlikely]] ppu_cr_set(ppu, 0, ppu.gpr[op.ra], 0); - return true; + if constexpr (((Flags == has_rc) || ...)) + ppu_cr_set(ppu, 0, ppu.gpr[op.ra], 0); + }; + RETURN_(ppu, op); } -bool ppu_interpreter::RLDCR(ppu_thread& ppu, ppu_opcode_t op) +template +auto RLDCR() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { ppu.gpr[op.ra] = utils::rol64(ppu.gpr[op.rs], ppu.gpr[op.rb] & 0x3f) & (~0ull << (op.mbe64 ^ 63)); - if (op.rc) [[unlikely]] ppu_cr_set(ppu, 0, ppu.gpr[op.ra], 0); - return true; + if constexpr (((Flags == has_rc) || ...)) + ppu_cr_set(ppu, 0, ppu.gpr[op.ra], 0); + }; + RETURN_(ppu, op); } -bool ppu_interpreter::CMP(ppu_thread& ppu, ppu_opcode_t op) +template +auto CMP() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { if (op.l10) { ppu_cr_set(ppu, op.crfd, ppu.gpr[op.ra], ppu.gpr[op.rb]); @@ 
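The RLWINM/RLD* family keeps its rotate-then-mask structure: ppu_rotate_mask produces the PowerPC MB..ME mask (bit 0 is the MSB, and the mask wraps around when MB > ME) while dup32 duplicates the rotated 32-bit word into both halves for the RLW forms. A sketch of that core with one way such a mask could be built; rotate_mask here is an assumption, not the real ppu_rotate_mask:

#include <cstdint>
#include <cstdio>

static std::uint64_t rotate_mask(unsigned mb, unsigned me)
{
    const std::uint64_t from_mb = ~0ull >> mb;        // big-endian bits mb..63
    const std::uint64_t to_me   = ~0ull << (63 - me); // big-endian bits 0..me
    return mb <= me ? (from_mb & to_me) : (from_mb | to_me); // wrap-around when mb > me
}

static std::uint32_t rol32(std::uint32_t x, unsigned n)
{
    return (x << (n & 31)) | (x >> ((32 - n) & 31));
}

int main()
{
    // rlwinm ra, rs, sh=8, mb=24, me=31: extract the second-highest byte of the low word
    const std::uint64_t rs = 0x11223344;
    const std::uint32_t rot = rol32(static_cast<std::uint32_t>(rs), 8);
    const std::uint64_t dup = rot | (std::uint64_t{rot} << 32); // dup32
    const std::uint64_t ra = dup & rotate_mask(32 + 24, 32 + 31);
    std::printf("%llx\n", static_cast<unsigned long long>(ra)); // prints 11
}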
-3387,11 +4041,18 @@ bool ppu_interpreter::CMP(ppu_thread& ppu, ppu_opcode_t op) { ppu_cr_set(ppu, op.crfd, static_cast(ppu.gpr[op.ra]), static_cast(ppu.gpr[op.rb])); } - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::TW(ppu_thread& ppu, ppu_opcode_t op) +template +auto TW() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + if constexpr (Build == 0) return +[](ppu_thread& ppu, ppu_opcode_t op, be_t* this_op, ppu_intrp_func* next_fn) + { s32 a = static_cast(ppu.gpr[op.ra]); s32 b = static_cast(ppu.gpr[op.rb]); @@ -3401,67 +4062,117 @@ bool ppu_interpreter::TW(ppu_thread& ppu, ppu_opcode_t op) (static_cast(a) < static_cast(b) && (op.bo & 0x2)) || (static_cast(a) > static_cast(b) && (op.bo & 0x1))) { - ppu_trap(ppu, ppu.cia); - return false; + [[unlikely]] + ppu_trap(ppu, vm::get_addr(this_op)); + return; } - - return true; + return next_fn->fn(ppu, {this_op[1]}, this_op + 1, next_fn + 1); + }; } -bool ppu_interpreter::LVSL(ppu_thread& ppu, ppu_opcode_t op) +template +auto LVSL() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u64 addr = op.ra ? ppu.gpr[op.ra] + ppu.gpr[op.rb] : ppu.gpr[op.rb]; - ppu.vr[op.vd].vi = sse_altivec_lvsl(addr); - return true; + ppu.vr[op.vd] = sse_altivec_lvsl(addr); + }; + RETURN_(ppu, op); } -bool ppu_interpreter::LVEBX(ppu_thread& ppu, ppu_opcode_t op) +template +auto LVEBX() { - return LVX(ppu, op); + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { + const u64 addr = (op.ra ? ppu.gpr[op.ra] + ppu.gpr[op.rb] : ppu.gpr[op.rb]) & ~0xfull; + ppu.vr[op.vd] = ppu_feed_data(ppu, addr); + }; + RETURN_(ppu, op); } -bool ppu_interpreter::SUBFC(ppu_thread& ppu, ppu_opcode_t op) +template +auto SUBFC() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u64 RA = ppu.gpr[op.ra]; const u64 RB = ppu.gpr[op.rb]; const auto r = add64_flags(~RA, RB, 1); ppu.gpr[op.rd] = r.result; ppu.xer.ca = r.carry; - if (op.oe) [[unlikely]] ppu_ov_set(ppu, (~RA >> 63 == RB >> 63) && (~RA >> 63 != ppu.gpr[op.rd] >> 63)); - if (op.rc) [[unlikely]] ppu_cr_set(ppu, 0, r.result, 0); - return true; + if constexpr (((Flags == has_oe) || ...)) + ppu_ov_set(ppu, (~RA >> 63 == RB >> 63) && (~RA >> 63 != ppu.gpr[op.rd] >> 63)); + if constexpr (((Flags == has_rc) || ...)) + ppu_cr_set(ppu, 0, r.result, 0); + }; + RETURN_(ppu, op); } -bool ppu_interpreter::MULHDU(ppu_thread& ppu, ppu_opcode_t op) +template +auto MULHDU() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { ppu.gpr[op.rd] = utils::umulh64(ppu.gpr[op.ra], ppu.gpr[op.rb]); - if (op.rc) [[unlikely]] ppu_cr_set(ppu, 0, ppu.gpr[op.rd], 0); - return true; + if constexpr (((Flags == has_rc) || ...)) + ppu_cr_set(ppu, 0, ppu.gpr[op.rd], 0); + }; + RETURN_(ppu, op); } -bool ppu_interpreter::ADDC(ppu_thread& ppu, ppu_opcode_t op) +template +auto ADDC() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u64 RA = ppu.gpr[op.ra]; const u64 RB = ppu.gpr[op.rb]; const auto r = add64_flags(RA, RB); ppu.gpr[op.rd] = r.result; ppu.xer.ca = r.carry; - if (op.oe) [[unlikely]] ppu_ov_set(ppu, (RA >> 63 == 
RB >> 63) && (RA >> 63 != ppu.gpr[op.rd] >> 63)); - if (op.rc) [[unlikely]] ppu_cr_set(ppu, 0, r.result, 0); - return true; + if constexpr (((Flags == has_oe) || ...)) + ppu_ov_set(ppu, (RA >> 63 == RB >> 63) && (RA >> 63 != ppu.gpr[op.rd] >> 63)); + if constexpr (((Flags == has_rc) || ...)) + ppu_cr_set(ppu, 0, r.result, 0); + }; + RETURN_(ppu, op); } -bool ppu_interpreter::MULHWU(ppu_thread& ppu, ppu_opcode_t op) +template +auto MULHWU() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { u32 a = static_cast(ppu.gpr[op.ra]); u32 b = static_cast(ppu.gpr[op.rb]); ppu.gpr[op.rd] = (u64{a} * b) >> 32; - if (op.rc) [[unlikely]] ppu_cr_set(ppu, 0, ppu.gpr[op.rd], 0); - return true; + if constexpr (((Flags == has_rc) || ...)) + ppu_cr_set(ppu, 0, ppu.gpr[op.rd], 0); + }; + RETURN_(ppu, op); } -bool ppu_interpreter::MFOCRF(ppu_thread& ppu, ppu_opcode_t op) +template +auto MFOCRF() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { if (op.l11) { // MFOCRF @@ -3477,67 +4188,119 @@ bool ppu_interpreter::MFOCRF(ppu_thread& ppu, ppu_opcode_t op) be_t lane0, lane1; std::memcpy(&lane0, ppu.cr.bits, sizeof(v128)); std::memcpy(&lane1, ppu.cr.bits + 16, sizeof(v128)); - const u32 mh = _mm_movemask_epi8(_mm_slli_epi64(lane0.value().vi, 7)); - const u32 ml = _mm_movemask_epi8(_mm_slli_epi64(lane1.value().vi, 7)); + const u32 mh = _mm_movemask_epi8(_mm_slli_epi64(lane0.value(), 7)); + const u32 ml = _mm_movemask_epi8(_mm_slli_epi64(lane1.value(), 7)); ppu.gpr[op.rd] = (mh << 16) | ml; } - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::LWARX(ppu_thread& ppu, ppu_opcode_t op) +template +auto LWARX() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u64 addr = op.ra ? ppu.gpr[op.ra] + ppu.gpr[op.rb] : ppu.gpr[op.rb]; ppu.gpr[op.rd] = ppu_lwarx(ppu, vm::cast(addr)); - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::LDX(ppu_thread& ppu, ppu_opcode_t op) +template +auto LDX() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u64 addr = op.ra ? ppu.gpr[op.ra] + ppu.gpr[op.rb] : ppu.gpr[op.rb]; ppu.gpr[op.rd] = ppu_feed_data(ppu, addr); - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::LWZX(ppu_thread& ppu, ppu_opcode_t op) +template +auto LWZX() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u64 addr = op.ra ? 
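The MFOCRF change drops the .vi member access but keeps the movemask trick: the 32 CR bits are stored one per byte, each byte is shifted left by 7 so its flag lands in the sign bit, and _mm_movemask_epi8 gathers the sign bits of the two byte-swapped (be_t) halves into a 32-bit word. Ignoring the SSE details, the value produced should be equivalent to this scalar packing, with CR bit 0 ending up in the most significant bit (pack_cr is an illustrative name):

#include <cstdint>
#include <cstdio>

static std::uint32_t pack_cr(const std::uint8_t (&bits)[32]) // one 0/1 flag per byte
{
    std::uint32_t r = 0;
    for (int i = 0; i < 32; i++)
        if (bits[i] & 1)
            r |= 1u << (31 - i); // CR bit 0 becomes the MSB
    return r;
}

int main()
{
    std::uint8_t bits[32]{};
    bits[0] = 1;  // CR0[LT]
    bits[31] = 1; // CR7[SO]
    std::printf("%08x\n", pack_cr(bits)); // prints 80000001
}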
ppu.gpr[op.ra] + ppu.gpr[op.rb] : ppu.gpr[op.rb]; ppu.gpr[op.rd] = ppu_feed_data(ppu, addr); - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::SLW(ppu_thread& ppu, ppu_opcode_t op) +template +auto SLW() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { ppu.gpr[op.ra] = static_cast(ppu.gpr[op.rs] << (ppu.gpr[op.rb] & 0x3f)); - if (op.rc) [[unlikely]] ppu_cr_set(ppu, 0, ppu.gpr[op.ra], 0); - return true; + if constexpr (((Flags == has_rc) || ...)) + ppu_cr_set(ppu, 0, ppu.gpr[op.ra], 0); + }; + RETURN_(ppu, op); } -bool ppu_interpreter::CNTLZW(ppu_thread& ppu, ppu_opcode_t op) +template +auto CNTLZW() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { ppu.gpr[op.ra] = std::countl_zero(static_cast(ppu.gpr[op.rs])); - if (op.rc) [[unlikely]] ppu_cr_set(ppu, 0, ppu.gpr[op.ra], 0); - return true; + if constexpr (((Flags == has_rc) || ...)) + ppu_cr_set(ppu, 0, ppu.gpr[op.ra], 0); + }; + RETURN_(ppu, op); } -bool ppu_interpreter::SLD(ppu_thread& ppu, ppu_opcode_t op) +template +auto SLD() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u32 n = ppu.gpr[op.rb] & 0x7f; ppu.gpr[op.ra] = n & 0x40 ? 0 : ppu.gpr[op.rs] << n; - if (op.rc) [[unlikely]] ppu_cr_set(ppu, 0, ppu.gpr[op.ra], 0); - return true; + if constexpr (((Flags == has_rc) || ...)) + ppu_cr_set(ppu, 0, ppu.gpr[op.ra], 0); + }; + RETURN_(ppu, op); } -bool ppu_interpreter::AND(ppu_thread& ppu, ppu_opcode_t op) +template +auto AND() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { ppu.gpr[op.ra] = ppu.gpr[op.rs] & ppu.gpr[op.rb]; - if (op.rc) [[unlikely]] ppu_cr_set(ppu, 0, ppu.gpr[op.ra], 0); - return true; + if constexpr (((Flags == has_rc) || ...)) + ppu_cr_set(ppu, 0, ppu.gpr[op.ra], 0); + }; + RETURN_(ppu, op); } -bool ppu_interpreter::CMPL(ppu_thread& ppu, ppu_opcode_t op) +template +auto CMPL() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { if (op.l10) { ppu_cr_set(ppu, op.crfd, ppu.gpr[op.ra], ppu.gpr[op.rb]); @@ -3546,68 +4309,129 @@ bool ppu_interpreter::CMPL(ppu_thread& ppu, ppu_opcode_t op) { ppu_cr_set(ppu, op.crfd, static_cast(ppu.gpr[op.ra]), static_cast(ppu.gpr[op.rb])); } - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::LVSR(ppu_thread& ppu, ppu_opcode_t op) +template +auto LVSR() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u64 addr = op.ra ? ppu.gpr[op.ra] + ppu.gpr[op.rb] : ppu.gpr[op.rb]; - ppu.vr[op.vd].vi = sse_altivec_lvsr(addr); - return true; + ppu.vr[op.vd] = sse_altivec_lvsr(addr); + }; + RETURN_(ppu, op); } -bool ppu_interpreter::LVEHX(ppu_thread& ppu, ppu_opcode_t op) +template +auto LVEHX() { - return LVX(ppu, op); + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { + const u64 addr = (op.ra ? 
ppu.gpr[op.ra] + ppu.gpr[op.rb] : ppu.gpr[op.rb]) & ~0xfull; + ppu.vr[op.vd] = ppu_feed_data(ppu, addr); + }; + RETURN_(ppu, op); } -bool ppu_interpreter::SUBF(ppu_thread& ppu, ppu_opcode_t op) +template +auto SUBF() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u64 RA = ppu.gpr[op.ra]; const u64 RB = ppu.gpr[op.rb]; ppu.gpr[op.rd] = RB - RA; - if (op.oe) [[unlikely]] ppu_ov_set(ppu, (~RA >> 63 == RB >> 63) && (~RA >> 63 != ppu.gpr[op.rd] >> 63)); - if (op.rc) [[unlikely]] ppu_cr_set(ppu, 0, ppu.gpr[op.rd], 0); - return true; + if constexpr (((Flags == has_oe) || ...)) + ppu_ov_set(ppu, (~RA >> 63 == RB >> 63) && (~RA >> 63 != ppu.gpr[op.rd] >> 63)); + if constexpr (((Flags == has_rc) || ...)) + ppu_cr_set(ppu, 0, ppu.gpr[op.rd], 0); + }; + RETURN_(ppu, op); } -bool ppu_interpreter::LDUX(ppu_thread& ppu, ppu_opcode_t op) +template +auto LDUX() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u64 addr = ppu.gpr[op.ra] + ppu.gpr[op.rb]; ppu.gpr[op.rd] = ppu_feed_data(ppu, addr); ppu.gpr[op.ra] = addr; - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::DCBST(ppu_thread&, ppu_opcode_t) +template +auto DCBST() { - return true; + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](auto&&, auto) { + }; + RETURN_(ppu, op); } -bool ppu_interpreter::LWZUX(ppu_thread& ppu, ppu_opcode_t op) +template +auto LWZUX() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u64 addr = ppu.gpr[op.ra] + ppu.gpr[op.rb]; ppu.gpr[op.rd] = ppu_feed_data(ppu, addr); ppu.gpr[op.ra] = addr; - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::CNTLZD(ppu_thread& ppu, ppu_opcode_t op) +template +auto CNTLZD() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { ppu.gpr[op.ra] = std::countl_zero(ppu.gpr[op.rs]); - if (op.rc) [[unlikely]] ppu_cr_set(ppu, 0, ppu.gpr[op.ra], 0); - return true; + if constexpr (((Flags == has_rc) || ...)) + ppu_cr_set(ppu, 0, ppu.gpr[op.ra], 0); + }; + RETURN_(ppu, op); } -bool ppu_interpreter::ANDC(ppu_thread& ppu, ppu_opcode_t op) +template +auto ANDC() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { ppu.gpr[op.ra] = ppu.gpr[op.rs] & ~ppu.gpr[op.rb]; - if (op.rc) [[unlikely]] ppu_cr_set(ppu, 0, ppu.gpr[op.ra], 0); - return true; + if constexpr (((Flags == has_rc) || ...)) + ppu_cr_set(ppu, 0, ppu.gpr[op.ra], 0); + }; + RETURN_(ppu, op); } -bool ppu_interpreter::TD(ppu_thread& ppu, ppu_opcode_t op) +template +auto TD() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + if constexpr (Build == 0) return +[](ppu_thread& ppu, ppu_opcode_t op, be_t* this_op, ppu_intrp_func* next_fn) + { const s64 a = ppu.gpr[op.ra], b = ppu.gpr[op.rb]; const u64 a_ = a, b_ = b; @@ -3617,118 +4441,213 @@ bool ppu_interpreter::TD(ppu_thread& ppu, ppu_opcode_t op) ((op.bo & 0x2) && a_ < b_) || ((op.bo & 0x1) && a_ > b_)) { - ppu_trap(ppu, ppu.cia); - return false; + [[unlikely]] + ppu_trap(ppu, vm::get_addr(this_op)); + return; } - - return true; + return next_fn->fn(ppu, {this_op[1]}, 
this_op + 1, next_fn + 1); + }; } -bool ppu_interpreter::LVEWX(ppu_thread& ppu, ppu_opcode_t op) +template +auto LVEWX() { - return LVX(ppu, op); + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { + const u64 addr = (op.ra ? ppu.gpr[op.ra] + ppu.gpr[op.rb] : ppu.gpr[op.rb]) & ~0xfull; + ppu.vr[op.vd] = ppu_feed_data(ppu, addr); + }; + RETURN_(ppu, op); } -bool ppu_interpreter::MULHD(ppu_thread& ppu, ppu_opcode_t op) +template +auto MULHD() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { ppu.gpr[op.rd] = utils::mulh64(ppu.gpr[op.ra], ppu.gpr[op.rb]); - if (op.rc) [[unlikely]] ppu_cr_set(ppu, 0, ppu.gpr[op.rd], 0); - return true; + if constexpr (((Flags == has_rc) || ...)) + ppu_cr_set(ppu, 0, ppu.gpr[op.rd], 0); + }; + RETURN_(ppu, op); } -bool ppu_interpreter::MULHW(ppu_thread& ppu, ppu_opcode_t op) +template +auto MULHW() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { s32 a = static_cast(ppu.gpr[op.ra]); s32 b = static_cast(ppu.gpr[op.rb]); ppu.gpr[op.rd] = (s64{a} * b) >> 32; - if (op.rc) [[unlikely]] ppu_cr_set(ppu, 0, ppu.gpr[op.rd], 0); - return true; + if constexpr (((Flags == has_rc) || ...)) + ppu_cr_set(ppu, 0, ppu.gpr[op.rd], 0); + }; + RETURN_(ppu, op); } -bool ppu_interpreter::LDARX(ppu_thread& ppu, ppu_opcode_t op) +template +auto LDARX() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u64 addr = op.ra ? ppu.gpr[op.ra] + ppu.gpr[op.rb] : ppu.gpr[op.rb]; ppu.gpr[op.rd] = ppu_ldarx(ppu, vm::cast(addr)); - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::DCBF(ppu_thread&, ppu_opcode_t) +template +auto DCBF() { - return true; + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](auto&&, auto) { + }; + RETURN_(ppu, op); } -bool ppu_interpreter::LBZX(ppu_thread& ppu, ppu_opcode_t op) +template +auto LBZX() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u64 addr = op.ra ? ppu.gpr[op.ra] + ppu.gpr[op.rb] : ppu.gpr[op.rb]; ppu.gpr[op.rd] = ppu_feed_data(ppu, addr); - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::LVX(ppu_thread& ppu, ppu_opcode_t op) +template +auto LVX() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u64 addr = (op.ra ? 
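TW, TD, TWI and TDI all decode the same five-bit TO field from op.bo: each bit enables one comparison, and the trap fires if any enabled comparison holds. A compact scalar restatement (trap_condition is an illustrative name):

#include <cstdint>
#include <cstdio>

static bool trap_condition(unsigned to, std::int64_t a, std::int64_t b)
{
    const std::uint64_t a_ = a, b_ = b;
    return ((to & 0x10) && a < b)    // signed less than
        || ((to & 0x08) && a > b)    // signed greater than
        || ((to & 0x04) && a == b)   // equal
        || ((to & 0x02) && a_ < b_)  // unsigned less than
        || ((to & 0x01) && a_ > b_); // unsigned greater than
}

int main()
{
    std::printf("%d %d\n",
        trap_condition(0x10, -1, 0),  // signed: -1 < 0, traps
        trap_condition(0x02, -1, 0)); // unsigned: ~0 is not < 0, no trap
    // expected output: 1 0
}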
ppu.gpr[op.ra] + ppu.gpr[op.rb] : ppu.gpr[op.rb]) & ~0xfull; ppu.vr[op.vd] = ppu_feed_data(ppu, addr); - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::NEG(ppu_thread& ppu, ppu_opcode_t op) +template +auto NEG() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u64 RA = ppu.gpr[op.ra]; ppu.gpr[op.rd] = 0 - RA; - if (op.oe) [[unlikely]] ppu_ov_set(ppu, (~RA >> 63 == 0) && (~RA >> 63 != ppu.gpr[op.rd] >> 63)); - if (op.rc) [[unlikely]] ppu_cr_set(ppu, 0, ppu.gpr[op.rd], 0); - return true; + if constexpr (((Flags == has_oe) || ...)) + ppu_ov_set(ppu, (~RA >> 63 == 0) && (~RA >> 63 != ppu.gpr[op.rd] >> 63)); + if constexpr (((Flags == has_rc) || ...)) + ppu_cr_set(ppu, 0, ppu.gpr[op.rd], 0); + }; + RETURN_(ppu, op); } -bool ppu_interpreter::LBZUX(ppu_thread& ppu, ppu_opcode_t op) +template +auto LBZUX() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u64 addr = ppu.gpr[op.ra] + ppu.gpr[op.rb]; ppu.gpr[op.rd] = ppu_feed_data(ppu, addr); ppu.gpr[op.ra] = addr; - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::NOR(ppu_thread& ppu, ppu_opcode_t op) +template +auto NOR() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { ppu.gpr[op.ra] = ~(ppu.gpr[op.rs] | ppu.gpr[op.rb]); - if (op.rc) [[unlikely]] ppu_cr_set(ppu, 0, ppu.gpr[op.ra], 0); - return true; + if constexpr (((Flags == has_rc) || ...)) + ppu_cr_set(ppu, 0, ppu.gpr[op.ra], 0); + }; + RETURN_(ppu, op); } -bool ppu_interpreter::STVEBX(ppu_thread& ppu, ppu_opcode_t op) +template +auto STVEBX() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u64 addr = op.ra ? 
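// [editorial sketch, not part of the patch] The OE forms of SUBF/ADD/NEG
// derive XER.OV purely from sign bits: overflow iff ~RA and RB share a sign
// while the result's sign differs (NEG is the same formula with RB fixed to
// 0, so only INT64_MIN overflows). A quick self-check of the SUBF formula
// against the compiler's overflow builtin (GCC/Clang):
#include <cassert>
#include <cstdint>

static bool subf_ov(uint64_t ra, uint64_t rb)
{
    const uint64_t rd = rb - ra;
    return (~ra >> 63 == rb >> 63) && (~ra >> 63 != rd >> 63);
}

int main()
{
    const int64_t samples[] = {0, 1, -1, INT64_MIN, INT64_MAX, 42, -42};
    for (int64_t a : samples)
        for (int64_t b : samples)
        {
            int64_t r;
            const bool ref = __builtin_sub_overflow(b, a, &r); // signed b - a
            assert(subf_ov(static_cast<uint64_t>(a), static_cast<uint64_t>(b)) == ref);
        }
}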
ppu.gpr[op.ra] + ppu.gpr[op.rb] : ppu.gpr[op.rb]; const u8 eb = addr & 0xf; vm::write8(vm::cast(addr), ppu.vr[op.vs]._u8[15 - eb]); - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::SUBFE(ppu_thread& ppu, ppu_opcode_t op) +template +auto SUBFE() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u64 RA = ppu.gpr[op.ra]; const u64 RB = ppu.gpr[op.rb]; const auto r = add64_flags(~RA, RB, ppu.xer.ca); ppu.gpr[op.rd] = r.result; ppu.xer.ca = r.carry; - if (op.oe) [[unlikely]] ppu_ov_set(ppu, (~RA >> 63 == RB >> 63) && (~RA >> 63 != ppu.gpr[op.rd] >> 63)); - if (op.rc) [[unlikely]] ppu_cr_set(ppu, 0, r.result, 0); - return true; + if constexpr (((Flags == has_oe) || ...)) + ppu_ov_set(ppu, (~RA >> 63 == RB >> 63) && (~RA >> 63 != ppu.gpr[op.rd] >> 63)); + if constexpr (((Flags == has_rc) || ...)) + ppu_cr_set(ppu, 0, r.result, 0); + }; + RETURN_(ppu, op); } -bool ppu_interpreter::ADDE(ppu_thread& ppu, ppu_opcode_t op) +template +auto ADDE() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u64 RA = ppu.gpr[op.ra]; const u64 RB = ppu.gpr[op.rb]; const auto r = add64_flags(RA, RB, ppu.xer.ca); ppu.gpr[op.rd] = r.result; ppu.xer.ca = r.carry; - if (op.oe) [[unlikely]] ppu_ov_set(ppu, (RA >> 63 == RB >> 63) && (RA >> 63 != ppu.gpr[op.rd] >> 63)); - if (op.rc) [[unlikely]] ppu_cr_set(ppu, 0, r.result, 0); - return true; + if constexpr (((Flags == has_oe) || ...)) + ppu_ov_set(ppu, (RA >> 63 == RB >> 63) && (RA >> 63 != ppu.gpr[op.rd] >> 63)); + if constexpr (((Flags == has_rc) || ...)) + ppu_cr_set(ppu, 0, r.result, 0); + }; + RETURN_(ppu, op); } -bool ppu_interpreter::MTOCRF(ppu_thread& ppu, ppu_opcode_t op) +template +auto MTOCRF() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { alignas(4) static const u8 s_table[16][4] { {0, 0, 0, 0}, @@ -3772,107 +4691,189 @@ bool ppu_interpreter::MTOCRF(ppu_thread& ppu, ppu_opcode_t op) } } } - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::STDX(ppu_thread& ppu, ppu_opcode_t op) +template +auto STDX() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u64 addr = op.ra ? ppu.gpr[op.ra] + ppu.gpr[op.rb] : ppu.gpr[op.rb]; vm::write64(vm::cast(addr), ppu.gpr[op.rs]); - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::STWCX(ppu_thread& ppu, ppu_opcode_t op) +template +auto STWCX() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u64 addr = op.ra ? ppu.gpr[op.ra] + ppu.gpr[op.rb] : ppu.gpr[op.rb]; ppu_cr_set(ppu, 0, false, false, ppu_stwcx(ppu, vm::cast(addr), static_cast(ppu.gpr[op.rs])), ppu.xer.so); - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::STWX(ppu_thread& ppu, ppu_opcode_t op) +template +auto STWX() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u64 addr = op.ra ? 
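// [editorial sketch, not part of the patch] ADDE/SUBFE go through
// add64_flags(x, y, carry_in) and keep both the 64-bit result and the
// carry-out in XER.CA; SUBFE reuses the same primitive with ~RA as the first
// operand. A portable stand-in for that helper (the emulator's own
// add64_flags is defined elsewhere in the tree), checked against the
// GCC/Clang __int128 extension:
#include <cassert>
#include <cstdint>

struct add_result { uint64_t result; bool carry; };

static add_result add64_with_carry(uint64_t x, uint64_t y, bool carry_in)
{
    const uint64_t lo = x + y;
    const uint64_t r  = lo + carry_in;
    return { r, lo < x || r < lo }; // carry out of bit 63
}

int main()
{
    const uint64_t samples[] = {0, 1, ~0ull, 1ull << 63, 0x1234'5678'9abc'def0ull};
    for (uint64_t x : samples)
        for (uint64_t y : samples)
            for (bool c : {false, true})
            {
                const auto r = add64_with_carry(x, y, c);
                const unsigned __int128 wide = (unsigned __int128)x + y + c;
                assert(r.result == (uint64_t)wide);
                assert(r.carry == (bool)(wide >> 64));
            }
}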
ppu.gpr[op.ra] + ppu.gpr[op.rb] : ppu.gpr[op.rb]; vm::write32(vm::cast(addr), static_cast(ppu.gpr[op.rs])); - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::STVEHX(ppu_thread& ppu, ppu_opcode_t op) +template +auto STVEHX() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u64 addr = (op.ra ? ppu.gpr[op.ra] + ppu.gpr[op.rb] : ppu.gpr[op.rb]) & ~1ULL; const u8 eb = (addr & 0xf) >> 1; vm::write16(vm::cast(addr), ppu.vr[op.vs]._u16[7 - eb]); - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::STDUX(ppu_thread& ppu, ppu_opcode_t op) +template +auto STDUX() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u64 addr = ppu.gpr[op.ra] + ppu.gpr[op.rb]; vm::write64(vm::cast(addr), ppu.gpr[op.rs]); ppu.gpr[op.ra] = addr; - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::STWUX(ppu_thread& ppu, ppu_opcode_t op) +template +auto STWUX() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u64 addr = ppu.gpr[op.ra] + ppu.gpr[op.rb]; vm::write32(vm::cast(addr), static_cast(ppu.gpr[op.rs])); ppu.gpr[op.ra] = addr; - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::STVEWX(ppu_thread& ppu, ppu_opcode_t op) +template +auto STVEWX() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u64 addr = (op.ra ? ppu.gpr[op.ra] + ppu.gpr[op.rb] : ppu.gpr[op.rb]) & ~3ULL; const u8 eb = (addr & 0xf) >> 2; vm::write32(vm::cast(addr), ppu.vr[op.vs]._u32[3 - eb]); - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::SUBFZE(ppu_thread& ppu, ppu_opcode_t op) +template +auto SUBFZE() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u64 RA = ppu.gpr[op.ra]; const auto r = add64_flags(~RA, 0, ppu.xer.ca); ppu.gpr[op.rd] = r.result; ppu.xer.ca = r.carry; - if (op.oe) [[unlikely]] ppu_ov_set(ppu, (~RA >> 63 == 0) && (~RA >> 63 != ppu.gpr[op.rd] >> 63)); - if (op.rc) [[unlikely]] ppu_cr_set(ppu, 0, r.result, 0); - return true; + if constexpr (((Flags == has_oe) || ...)) + ppu_ov_set(ppu, (~RA >> 63 == 0) && (~RA >> 63 != ppu.gpr[op.rd] >> 63)); + if constexpr (((Flags == has_rc) || ...)) + ppu_cr_set(ppu, 0, r.result, 0); + }; + RETURN_(ppu, op); } -bool ppu_interpreter::ADDZE(ppu_thread& ppu, ppu_opcode_t op) +template +auto ADDZE() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u64 RA = ppu.gpr[op.ra]; const auto r = add64_flags(RA, 0, ppu.xer.ca); ppu.gpr[op.rd] = r.result; ppu.xer.ca = r.carry; - if (op.oe) [[unlikely]] ppu_ov_set(ppu, (RA >> 63 == 0) && (RA >> 63 != ppu.gpr[op.rd] >> 63)); - if (op.rc) [[unlikely]] ppu_cr_set(ppu, 0, r.result, 0); - return true; + if constexpr (((Flags == has_oe) || ...)) + ppu_ov_set(ppu, (RA >> 63 == 0) && (RA >> 63 != ppu.gpr[op.rd] >> 63)); + if constexpr (((Flags == has_rc) || ...)) + ppu_cr_set(ppu, 0, r.result, 0); + }; + RETURN_(ppu, op); } -bool ppu_interpreter::STDCX(ppu_thread& ppu, ppu_opcode_t op) +template +auto STDCX() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template 
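// [editorial sketch, not part of the patch] The STVE*X handlers align the
// effective address down to the element size, take the byte offset inside the
// 16-byte quadword, and index the vector register from the top because guest
// element order is big-endian (hence _u16[7 - eb], _u32[3 - eb]). The index
// math in isolation for the halfword case:
#include <cassert>
#include <cstdint>

static unsigned halfword_slot(uint64_t ea)
{
    const uint64_t addr = ea & ~1ull;        // naturally align to 2 bytes
    const unsigned eb   = (addr & 0xf) >> 1; // halfword position 0..7 in the line
    return 7 - eb;                           // big-endian element -> array slot
}

int main()
{
    assert(halfword_slot(0x10000) == 7); // first halfword of a line -> top slot
    assert(halfword_slot(0x1000e) == 0); // last halfword -> bottom slot
    assert(halfword_slot(0x10005) == 5); // odd address aligns down to 0x...4, eb = 2
}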
select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u64 addr = op.ra ? ppu.gpr[op.ra] + ppu.gpr[op.rb] : ppu.gpr[op.rb]; ppu_cr_set(ppu, 0, false, false, ppu_stdcx(ppu, vm::cast(addr), ppu.gpr[op.rs]), ppu.xer.so); - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::STBX(ppu_thread& ppu, ppu_opcode_t op) +template +auto STBX() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u64 addr = op.ra ? ppu.gpr[op.ra] + ppu.gpr[op.rb] : ppu.gpr[op.rb]; vm::write8(vm::cast(addr), static_cast(ppu.gpr[op.rs])); - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::STVX(ppu_thread& ppu, ppu_opcode_t op) +template +auto STVX() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u64 addr = (op.ra ? ppu.gpr[op.ra] + ppu.gpr[op.rb] : ppu.gpr[op.rb]) & ~0xfull; vm::_ref(vm::cast(addr)) = ppu.vr[op.vs]; - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::MULLD(ppu_thread& ppu, ppu_opcode_t op) +template +auto MULLD() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const s64 RA = ppu.gpr[op.ra]; const s64 RB = ppu.gpr[op.rb]; ppu.gpr[op.rd] = RA * RB; @@ -3881,105 +4882,194 @@ bool ppu_interpreter::MULLD(ppu_thread& ppu, ppu_opcode_t op) const s64 high = utils::mulh64(RA, RB); ppu_ov_set(ppu, high != s64(ppu.gpr[op.rd]) >> 63); } - if (op.rc) [[unlikely]] ppu_cr_set(ppu, 0, ppu.gpr[op.rd], 0); - return true; + if constexpr (((Flags == has_rc) || ...)) + ppu_cr_set(ppu, 0, ppu.gpr[op.rd], 0); + }; + RETURN_(ppu, op); } -bool ppu_interpreter::SUBFME(ppu_thread& ppu, ppu_opcode_t op) +template +auto SUBFME() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u64 RA = ppu.gpr[op.ra]; const auto r = add64_flags(~RA, ~0ull, ppu.xer.ca); ppu.gpr[op.rd] = r.result; ppu.xer.ca = r.carry; - if (op.oe) [[unlikely]] ppu_ov_set(ppu, (~RA >> 63 == 1) && (~RA >> 63 != ppu.gpr[op.rd] >> 63)); - if (op.rc) [[unlikely]] ppu_cr_set(ppu, 0, r.result, 0); - return true; + if constexpr (((Flags == has_oe) || ...)) + ppu_ov_set(ppu, (~RA >> 63 == 1) && (~RA >> 63 != ppu.gpr[op.rd] >> 63)); + if constexpr (((Flags == has_rc) || ...)) + ppu_cr_set(ppu, 0, r.result, 0); + }; + RETURN_(ppu, op); } -bool ppu_interpreter::ADDME(ppu_thread& ppu, ppu_opcode_t op) +template +auto ADDME() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const s64 RA = ppu.gpr[op.ra]; const auto r = add64_flags(RA, ~0ull, ppu.xer.ca); ppu.gpr[op.rd] = r.result; ppu.xer.ca = r.carry; - if (op.oe) [[unlikely]] ppu_ov_set(ppu, (u64(RA) >> 63 == 1) && (u64(RA) >> 63 != ppu.gpr[op.rd] >> 63)); - if (op.rc) [[unlikely]] ppu_cr_set(ppu, 0, r.result, 0); - return true; + if constexpr (((Flags == has_oe) || ...)) + ppu_ov_set(ppu, (u64(RA) >> 63 == 1) && (u64(RA) >> 63 != ppu.gpr[op.rd] >> 63)); + if constexpr (((Flags == has_rc) || ...)) + ppu_cr_set(ppu, 0, r.result, 0); + }; + RETURN_(ppu, op); } -bool ppu_interpreter::MULLW(ppu_thread& ppu, ppu_opcode_t op) +template +auto MULLW() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto 
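// [editorial sketch, not part of the patch] MULLD flags overflow when the
// high 64 bits of the signed 128-bit product are not just the sign extension
// of the low 64 bits (utils::mulh64 supplies the high half). The same test
// written directly on the GCC/Clang __int128 extension:
#include <cassert>
#include <cstdint>

static bool mulld_ov(int64_t a, int64_t b)
{
    const __int128 p   = (__int128)a * b;
    const int64_t lo   = (int64_t)p;
    const int64_t high = (int64_t)(p >> 64);
    return high != (lo >> 63); // high half must equal the low half's sign copies
}

int main()
{
    assert(!mulld_ov(3, 4));
    assert(!mulld_ov(-1, INT64_MAX));
    assert(mulld_ov(INT64_MAX, 2));
    assert(mulld_ov(INT64_MIN, -1));
}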
exec = [](ppu_thread& ppu, ppu_opcode_t op) { ppu.gpr[op.rd] = s64{static_cast(ppu.gpr[op.ra])} * static_cast(ppu.gpr[op.rb]); - if (op.oe) [[unlikely]] ppu_ov_set(ppu, s64(ppu.gpr[op.rd]) < INT32_MIN || s64(ppu.gpr[op.rd]) > INT32_MAX); - if (op.rc) [[unlikely]] ppu_cr_set(ppu, 0, ppu.gpr[op.rd], 0); - return true; + if constexpr (((Flags == has_oe) || ...)) + ppu_ov_set(ppu, s64(ppu.gpr[op.rd]) < INT32_MIN || s64(ppu.gpr[op.rd]) > INT32_MAX); + if constexpr (((Flags == has_rc) || ...)) + ppu_cr_set(ppu, 0, ppu.gpr[op.rd], 0); + }; + RETURN_(ppu, op); } -bool ppu_interpreter::DCBTST(ppu_thread&, ppu_opcode_t) +template +auto DCBTST() { - return true; + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](auto&&, auto) { + }; + RETURN_(ppu, op); } -bool ppu_interpreter::STBUX(ppu_thread& ppu, ppu_opcode_t op) +template +auto STBUX() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u64 addr = ppu.gpr[op.ra] + ppu.gpr[op.rb]; vm::write8(vm::cast(addr), static_cast(ppu.gpr[op.rs])); ppu.gpr[op.ra] = addr; - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::ADD(ppu_thread& ppu, ppu_opcode_t op) +template +auto ADD() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u64 RA = ppu.gpr[op.ra]; const u64 RB = ppu.gpr[op.rb]; ppu.gpr[op.rd] = RA + RB; - if (op.oe) [[unlikely]] ppu_ov_set(ppu, (RA >> 63 == RB >> 63) && (RA >> 63 != ppu.gpr[op.rd] >> 63)); - if (op.rc) [[unlikely]] ppu_cr_set(ppu, 0, ppu.gpr[op.rd], 0); - return true; + if constexpr (((Flags == has_oe) || ...)) + ppu_ov_set(ppu, (RA >> 63 == RB >> 63) && (RA >> 63 != ppu.gpr[op.rd] >> 63)); + if constexpr (((Flags == has_rc) || ...)) + ppu_cr_set(ppu, 0, ppu.gpr[op.rd], 0); + }; + RETURN_(ppu, op); } -bool ppu_interpreter::DCBT(ppu_thread&, ppu_opcode_t) +template +auto DCBT() { - return true; + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](auto&&, auto) { + }; + RETURN_(ppu, op); } -bool ppu_interpreter::LHZX(ppu_thread& ppu, ppu_opcode_t op) +template +auto LHZX() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u64 addr = op.ra ? 
ppu.gpr[op.ra] + ppu.gpr[op.rb] : ppu.gpr[op.rb]; ppu.gpr[op.rd] = ppu_feed_data(ppu, addr); - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::EQV(ppu_thread& ppu, ppu_opcode_t op) +template +auto EQV() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { ppu.gpr[op.ra] = ~(ppu.gpr[op.rs] ^ ppu.gpr[op.rb]); - if (op.rc) [[unlikely]] ppu_cr_set(ppu, 0, ppu.gpr[op.ra], 0); - return true; + if constexpr (((Flags == has_rc) || ...)) + ppu_cr_set(ppu, 0, ppu.gpr[op.ra], 0); + }; + RETURN_(ppu, op); } -bool ppu_interpreter::ECIWX(ppu_thread&, ppu_opcode_t) +template +auto ECIWX() { - ppu_log.fatal("ECIWX"); - return false; + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](auto&&, auto) { + fmt::throw_exception("ECIWX"); + }; + RETURN_(ppu, op); } -bool ppu_interpreter::LHZUX(ppu_thread& ppu, ppu_opcode_t op) +template +auto LHZUX() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u64 addr = op.ra ? ppu.gpr[op.ra] + ppu.gpr[op.rb] : ppu.gpr[op.rb]; ppu.gpr[op.rd] = ppu_feed_data(ppu, addr); ppu.gpr[op.ra] = addr; - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::XOR(ppu_thread& ppu, ppu_opcode_t op) +template +auto XOR() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { ppu.gpr[op.ra] = ppu.gpr[op.rs] ^ ppu.gpr[op.rb]; - if (op.rc) [[unlikely]] ppu_cr_set(ppu, 0, ppu.gpr[op.ra], 0); - return true; + if constexpr (((Flags == has_rc) || ...)) + ppu_cr_set(ppu, 0, ppu.gpr[op.ra], 0); + }; + RETURN_(ppu, op); } -bool ppu_interpreter::MFSPR(ppu_thread& ppu, ppu_opcode_t op) +template +auto MFSPR() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u32 n = (op.spr >> 5) | ((op.spr & 0x1f) << 5); switch (n) @@ -3994,37 +5084,67 @@ bool ppu_interpreter::MFSPR(ppu_thread& ppu, ppu_opcode_t op) default: fmt::throw_exception("MFSPR 0x%x", n); } - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::LWAX(ppu_thread& ppu, ppu_opcode_t op) +template +auto LWAX() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u64 addr = op.ra ? ppu.gpr[op.ra] + ppu.gpr[op.rb] : ppu.gpr[op.rb]; ppu.gpr[op.rd] = ppu_feed_data(ppu, addr); - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::DST(ppu_thread&, ppu_opcode_t) +template +auto DST() { - return true; + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](auto&&, auto) { + }; + RETURN_(ppu, op); } -bool ppu_interpreter::LHAX(ppu_thread& ppu, ppu_opcode_t op) +template +auto LHAX() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u64 addr = op.ra ? 
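// [editorial sketch, not part of the patch] MFSPR/MTSPR first undo the SPR
// field encoding: the 10-bit SPR number arrives as two swapped 5-bit halves
// (a PowerPC encoding quirk), so the handlers rebuild it with
// n = (spr >> 5) | ((spr & 0x1f) << 5). Decoding a raw field in isolation:
#include <cassert>
#include <cstdint>

static uint32_t decode_spr(uint32_t raw_field)
{
    return (raw_field >> 5) | ((raw_field & 0x1f) << 5);
}

int main()
{
    // With the halves swapped back: 0x020 -> SPR 1 (XER), 0x100 -> SPR 8 (LR),
    // 0x120 -> SPR 9 (CTR), the user-level SPRs the handlers' switches expect.
    assert(decode_spr(0x020) == 1);
    assert(decode_spr(0x100) == 8);
    assert(decode_spr(0x120) == 9);
}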
ppu.gpr[op.ra] + ppu.gpr[op.rb] : ppu.gpr[op.rb]; ppu.gpr[op.rd] = ppu_feed_data(ppu, addr); - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::LVXL(ppu_thread& ppu, ppu_opcode_t op) +template +auto LVXL() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u64 addr = (op.ra ? ppu.gpr[op.ra] + ppu.gpr[op.rb] : ppu.gpr[op.rb]) & ~0xfull; ppu.vr[op.vd] = ppu_feed_data(ppu, addr); - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::MFTB(ppu_thread& ppu, ppu_opcode_t op) +template +auto MFTB() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u32 n = (op.spr >> 5) | ((op.spr & 0x1f) << 5); switch (n) @@ -4034,87 +5154,159 @@ bool ppu_interpreter::MFTB(ppu_thread& ppu, ppu_opcode_t op) default: fmt::throw_exception("MFTB 0x%x", n); } - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::LWAUX(ppu_thread& ppu, ppu_opcode_t op) +template +auto LWAUX() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u64 addr = op.ra ? ppu.gpr[op.ra] + ppu.gpr[op.rb] : ppu.gpr[op.rb]; ppu.gpr[op.rd] = ppu_feed_data(ppu, addr); ppu.gpr[op.ra] = addr; - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::DSTST(ppu_thread&, ppu_opcode_t) +template +auto DSTST() { - return true; + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](auto&&, auto) { + }; + RETURN_(ppu, op); } -bool ppu_interpreter::LHAUX(ppu_thread& ppu, ppu_opcode_t op) +template +auto LHAUX() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u64 addr = op.ra ? ppu.gpr[op.ra] + ppu.gpr[op.rb] : ppu.gpr[op.rb]; ppu.gpr[op.rd] = ppu_feed_data(ppu, addr); ppu.gpr[op.ra] = addr; - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::STHX(ppu_thread& ppu, ppu_opcode_t op) +template +auto STHX() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u64 addr = op.ra ? 
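// [editorial sketch, not part of the patch] LHA/LHAX/LWAX/LWAUX are the
// algebraic (sign-extending) loads; in the handlers the architectural sign
// extension falls out of reading the guest value as a signed 16- or 32-bit
// quantity and assigning it into the u64 GPR. The conversion rule on its own:
#include <cassert>
#include <cstdint>

int main()
{
    const int16_t loaded = -128;                      // e.g. guest halfword 0xFF80
    const uint64_t gpr   = (uint64_t)(int64_t)loaded; // widen as signed, then store
    assert(gpr == 0xFFFF'FFFF'FFFF'FF80ull);          // upper bits filled with the sign
}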
ppu.gpr[op.ra] + ppu.gpr[op.rb] : ppu.gpr[op.rb]; vm::write16(vm::cast(addr), static_cast(ppu.gpr[op.rs])); - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::ORC(ppu_thread& ppu, ppu_opcode_t op) +template +auto ORC() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { ppu.gpr[op.ra] = ppu.gpr[op.rs] | ~ppu.gpr[op.rb]; - if (op.rc) [[unlikely]] ppu_cr_set(ppu, 0, ppu.gpr[op.ra], 0); - return true; + if constexpr (((Flags == has_rc) || ...)) + ppu_cr_set(ppu, 0, ppu.gpr[op.ra], 0); + }; + RETURN_(ppu, op); } -bool ppu_interpreter::ECOWX(ppu_thread&, ppu_opcode_t) +template +auto ECOWX() { - ppu_log.fatal("ECOWX"); - return false; + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](auto&&, auto) { + fmt::throw_exception("ECOWX"); + }; + RETURN_(ppu, op); } -bool ppu_interpreter::STHUX(ppu_thread& ppu, ppu_opcode_t op) +template +auto STHUX() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u64 addr = ppu.gpr[op.ra] + ppu.gpr[op.rb]; vm::write16(vm::cast(addr), static_cast(ppu.gpr[op.rs])); ppu.gpr[op.ra] = addr; - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::OR(ppu_thread& ppu, ppu_opcode_t op) +template +auto OR() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { ppu.gpr[op.ra] = ppu.gpr[op.rs] | ppu.gpr[op.rb]; - if (op.rc) [[unlikely]] ppu_cr_set(ppu, 0, ppu.gpr[op.ra], 0); - return true; + if constexpr (((Flags == has_rc) || ...)) + ppu_cr_set(ppu, 0, ppu.gpr[op.ra], 0); + }; + RETURN_(ppu, op); } -bool ppu_interpreter::DIVDU(ppu_thread& ppu, ppu_opcode_t op) +template +auto DIVDU() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u64 RA = ppu.gpr[op.ra]; const u64 RB = ppu.gpr[op.rb]; ppu.gpr[op.rd] = RB == 0 ? 0 : RA / RB; - if (op.oe) [[unlikely]] ppu_ov_set(ppu, RB == 0); - if (op.rc) [[unlikely]] ppu_cr_set(ppu, 0, ppu.gpr[op.rd], 0); - return true; + if constexpr (((Flags == has_oe) || ...)) + ppu_ov_set(ppu, RB == 0); + if constexpr (((Flags == has_rc) || ...)) + ppu_cr_set(ppu, 0, ppu.gpr[op.rd], 0); + }; + RETURN_(ppu, op); } -bool ppu_interpreter::DIVWU(ppu_thread& ppu, ppu_opcode_t op) +template +auto DIVWU() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u32 RA = static_cast(ppu.gpr[op.ra]); const u32 RB = static_cast(ppu.gpr[op.rb]); ppu.gpr[op.rd] = RB == 0 ? 
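// [editorial sketch, not part of the patch] Every Rc form funnels into
// ppu_cr_set(ppu, 0, result, 0), i.e. CR field 0 is derived from comparing
// the result against zero; architecturally LT/GT/EQ come from a signed
// compare and SO is copied from XER. The rule written out (the bit layout of
// the field below is illustrative, not the emulator's representation):
#include <cassert>
#include <cstdint>

static unsigned cr0_bits(uint64_t result, bool so)
{
    const int64_t s = (int64_t)result;
    const bool lt = s < 0, gt = s > 0, eq = s == 0;
    return (lt << 3) | (gt << 2) | (eq << 1) | (unsigned)so;
}

int main()
{
    assert(cr0_bits(0, false)           == 0b0010); // EQ
    assert(cr0_bits(5, false)           == 0b0100); // GT
    assert(cr0_bits((uint64_t)-3, true) == 0b1001); // LT with SO carried over
}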
0 : RA / RB; - if (op.oe) [[unlikely]] ppu_ov_set(ppu, RB == 0); - if (op.rc) [[unlikely]] ppu_cr_set(ppu, 0, ppu.gpr[op.rd], 0); - return true; + if constexpr (((Flags == has_oe) || ...)) + ppu_ov_set(ppu, RB == 0); + if constexpr (((Flags == has_rc) || ...)) + ppu_cr_set(ppu, 0, ppu.gpr[op.rd], 0); + }; + RETURN_(ppu, op); } -bool ppu_interpreter::MTSPR(ppu_thread& ppu, ppu_opcode_t op) +template +auto MTSPR() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u32 n = (op.spr >> 5) | ((op.spr & 0x1f) << 5); switch (n) @@ -4134,66 +5326,119 @@ bool ppu_interpreter::MTSPR(ppu_thread& ppu, ppu_opcode_t op) default: fmt::throw_exception("MTSPR 0x%x", n); } - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::DCBI(ppu_thread&, ppu_opcode_t) +template +auto DCBI() { - return true; + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](auto&&, auto) { + }; + RETURN_(ppu, op); } -bool ppu_interpreter::NAND(ppu_thread& ppu, ppu_opcode_t op) +template +auto NAND() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { ppu.gpr[op.ra] = ~(ppu.gpr[op.rs] & ppu.gpr[op.rb]); - if (op.rc) [[unlikely]] ppu_cr_set(ppu, 0, ppu.gpr[op.ra], 0); - return true; + if constexpr (((Flags == has_rc) || ...)) + ppu_cr_set(ppu, 0, ppu.gpr[op.ra], 0); + }; + RETURN_(ppu, op); } -bool ppu_interpreter::STVXL(ppu_thread& ppu, ppu_opcode_t op) +template +auto STVXL() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u64 addr = (op.ra ? ppu.gpr[op.ra] + ppu.gpr[op.rb] : ppu.gpr[op.rb]) & ~0xfull; vm::_ref(vm::cast(addr)) = ppu.vr[op.vs]; - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::DIVD(ppu_thread& ppu, ppu_opcode_t op) +template +auto DIVD() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const s64 RA = ppu.gpr[op.ra]; const s64 RB = ppu.gpr[op.rb]; const bool o = RB == 0 || (RA == INT64_MIN && RB == -1); ppu.gpr[op.rd] = o ? 0 : RA / RB; - if (op.oe) [[unlikely]] ppu_ov_set(ppu, o); - if (op.rc) [[unlikely]] ppu_cr_set(ppu, 0, ppu.gpr[op.rd], 0); - return true; + if constexpr (((Flags == has_oe) || ...)) + ppu_ov_set(ppu, o); + if constexpr (((Flags == has_rc) || ...)) + ppu_cr_set(ppu, 0, ppu.gpr[op.rd], 0); + }; + RETURN_(ppu, op); } -bool ppu_interpreter::DIVW(ppu_thread& ppu, ppu_opcode_t op) +template +auto DIVW() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const s32 RA = static_cast(ppu.gpr[op.ra]); const s32 RB = static_cast(ppu.gpr[op.rb]); const bool o = RB == 0 || (RA == INT32_MIN && RB == -1); ppu.gpr[op.rd] = o ? 
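// [editorial sketch, not part of the patch] DIVD/DIVW guard the two cases a
// host divide would trap on, division by zero and INT_MIN / -1, returning 0
// for them and raising OV in the OE forms (DIVDU/DIVWU only need the zero
// check). The guard on its own:
#include <cassert>
#include <cstdint>

static int64_t divd(int64_t ra, int64_t rb, bool& ov)
{
    ov = rb == 0 || (ra == INT64_MIN && rb == -1);
    return ov ? 0 : ra / rb;
}

int main()
{
    bool ov;
    assert(divd(10, 3, ov) == 3 && !ov);
    assert(divd(10, 0, ov) == 0 && ov);         // divide by zero -> 0, OV set
    assert(divd(INT64_MIN, -1, ov) == 0 && ov); // unrepresentable quotient -> 0, OV set
}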
0 : static_cast(RA / RB); - if (op.oe) [[unlikely]] ppu_ov_set(ppu, o); - if (op.rc) [[unlikely]] ppu_cr_set(ppu, 0, ppu.gpr[op.rd], 0); - return true; + if constexpr (((Flags == has_oe) || ...)) + ppu_ov_set(ppu, o); + if constexpr (((Flags == has_rc) || ...)) + ppu_cr_set(ppu, 0, ppu.gpr[op.rd], 0); + }; + RETURN_(ppu, op); } -bool ppu_interpreter::LVLX(ppu_thread& ppu, ppu_opcode_t op) +template +auto LVLX() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u64 addr = op.ra ? ppu.gpr[op.ra] + ppu.gpr[op.rb] : ppu.gpr[op.rb]; - ppu.vr[op.vd].vi = s_use_ssse3 ? sse_cellbe_lvlx(ppu, addr) : sse_cellbe_lvlx_v0(ppu, addr); - return true; + ppu.vr[op.vd] = s_use_ssse3 ? sse_cellbe_lvlx(ppu, addr) : sse_cellbe_lvlx_v0(ppu, addr); + }; + RETURN_(ppu, op); } -bool ppu_interpreter::LDBRX(ppu_thread& ppu, ppu_opcode_t op) +template +auto LDBRX() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u64 addr = op.ra ? ppu.gpr[op.ra] + ppu.gpr[op.rb] : ppu.gpr[op.rb]; ppu.gpr[op.rd] = ppu_feed_data>(ppu, addr); - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::LSWX(ppu_thread& ppu, ppu_opcode_t op) +template +auto LSWX() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { u64 addr = op.ra ? ppu.gpr[op.ra] + ppu.gpr[op.rb] : ppu.gpr[op.rb]; u32 count = ppu.xer.cnt & 0x7f; for (; count >= 4; count -= 4, addr += 4, op.rd = (op.rd + 1) & 31) @@ -4210,47 +5455,85 @@ bool ppu_interpreter::LSWX(ppu_thread& ppu, ppu_opcode_t op) } ppu.gpr[op.rd] = value; } - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::LWBRX(ppu_thread& ppu, ppu_opcode_t op) +template +auto LWBRX() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u64 addr = op.ra ? ppu.gpr[op.ra] + ppu.gpr[op.rb] : ppu.gpr[op.rb]; ppu.gpr[op.rd] = ppu_feed_data>(ppu, addr); - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::LFSX(ppu_thread& ppu, ppu_opcode_t op) +template +auto LFSX() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u64 addr = op.ra ? ppu.gpr[op.ra] + ppu.gpr[op.rb] : ppu.gpr[op.rb]; ppu.fpr[op.frd] = ppu_feed_data(ppu, addr); - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::SRW(ppu_thread& ppu, ppu_opcode_t op) +template +auto SRW() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { ppu.gpr[op.ra] = (ppu.gpr[op.rs] & 0xffffffff) >> (ppu.gpr[op.rb] & 0x3f); - if (op.rc) [[unlikely]] ppu_cr_set(ppu, 0, ppu.gpr[op.ra], 0); - return true; + if constexpr (((Flags == has_rc) || ...)) + ppu_cr_set(ppu, 0, ppu.gpr[op.ra], 0); + }; + RETURN_(ppu, op); } -bool ppu_interpreter::SRD(ppu_thread& ppu, ppu_opcode_t op) +template +auto SRD() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u32 n = ppu.gpr[op.rb] & 0x7f; ppu.gpr[op.ra] = n & 0x40 ? 
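// [editorial sketch, not part of the patch] The logical shifts take more
// shift bits than the data width on purpose: SRW uses a 6-bit amount on the
// low 32 bits (so amounts 32..63 naturally clear the value), and SRD uses a
// 7-bit amount and returns 0 outright when bit 0x40 is set, because a host
// '>> 64' on a 64-bit integer would be undefined. The same masking in
// isolation:
#include <cassert>
#include <cstdint>

static uint64_t srw(uint64_t rs, uint64_t rb)
{
    return (rs & 0xffffffff) >> (rb & 0x3f);
}

static uint64_t srd(uint64_t rs, uint64_t rb)
{
    const unsigned n = rb & 0x7f;
    return n & 0x40 ? 0 : rs >> n;
}

int main()
{
    assert(srw(0xffffffff'ffffffffull, 40) == 0); // amounts 32..63 clear the 32-bit value
    assert(srd(0xffffffff'ffffffffull, 64) == 0); // amounts 64..127 clear the 64-bit value
    assert(srd(0x100, 4) == 0x10);
}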
0 : ppu.gpr[op.rs] >> n; - if (op.rc) [[unlikely]] ppu_cr_set(ppu, 0, ppu.gpr[op.ra], 0); - return true; + if constexpr (((Flags == has_rc) || ...)) + ppu_cr_set(ppu, 0, ppu.gpr[op.ra], 0); + }; + RETURN_(ppu, op); } -bool ppu_interpreter::LVRX(ppu_thread& ppu, ppu_opcode_t op) +template +auto LVRX() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u64 addr = op.ra ? ppu.gpr[op.ra] + ppu.gpr[op.rb] : ppu.gpr[op.rb]; - ppu.vr[op.vd].vi = s_use_ssse3 ? sse_cellbe_lvrx(ppu, addr) : sse_cellbe_lvrx_v0(ppu, addr); - return true; + ppu.vr[op.vd] = s_use_ssse3 ? sse_cellbe_lvrx(ppu, addr) : sse_cellbe_lvrx_v0(ppu, addr); + }; + RETURN_(ppu, op); } -bool ppu_interpreter::LSWI(ppu_thread& ppu, ppu_opcode_t op) +template +auto LSWI() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { u64 addr = op.ra ? ppu.gpr[op.ra] : 0; u64 N = op.rb ? op.rb : 32; u8 reg = op.rd; @@ -4278,54 +5561,96 @@ bool ppu_interpreter::LSWI(ppu_thread& ppu, ppu_opcode_t op) } reg = (reg + 1) % 32; } - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::LFSUX(ppu_thread& ppu, ppu_opcode_t op) +template +auto LFSUX() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u64 addr = ppu.gpr[op.ra] + ppu.gpr[op.rb]; ppu.fpr[op.frd] = ppu_feed_data(ppu, addr); ppu.gpr[op.ra] = addr; - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::SYNC(ppu_thread&, ppu_opcode_t) +template +auto SYNC() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](auto&&, auto) { atomic_fence_seq_cst(); - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::LFDX(ppu_thread& ppu, ppu_opcode_t op) +template +auto LFDX() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u64 addr = op.ra ? ppu.gpr[op.ra] + ppu.gpr[op.rb] : ppu.gpr[op.rb]; ppu.fpr[op.frd] = ppu_feed_data(ppu, addr); - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::LFDUX(ppu_thread& ppu, ppu_opcode_t op) +template +auto LFDUX() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u64 addr = ppu.gpr[op.ra] + ppu.gpr[op.rb]; ppu.fpr[op.frd] = ppu_feed_data(ppu, addr); ppu.gpr[op.ra] = addr; - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::STVLX(ppu_thread& ppu, ppu_opcode_t op) +template +auto STVLX() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u64 addr = op.ra ? ppu.gpr[op.ra] + ppu.gpr[op.rb] : ppu.gpr[op.rb]; - s_use_ssse3 ? sse_cellbe_stvlx(addr, ppu.vr[op.vs].vi) : sse_cellbe_stvlx_v0(addr, ppu.vr[op.vs].vi); - return true; + s_use_ssse3 ? sse_cellbe_stvlx(addr, ppu.vr[op.vs]) : sse_cellbe_stvlx_v0(addr, ppu.vr[op.vs]); + }; + RETURN_(ppu, op); } -bool ppu_interpreter::STDBRX(ppu_thread& ppu, ppu_opcode_t op) +template +auto STDBRX() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u64 addr = op.ra ? 
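// [editorial sketch, not part of the patch] SYNC and EIEIO both collapse to a
// full sequentially consistent fence on the host (atomic_fence_seq_cst is an
// emulator utility; a standard-library equivalent is shown below), which is a
// conservative but portable mapping of the PPC barriers:
#include <atomic>

int main()
{
    std::atomic_thread_fence(std::memory_order_seq_cst); // what a guest SYNC boils down to
}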
ppu.gpr[op.ra] + ppu.gpr[op.rb] : ppu.gpr[op.rb]; vm::_ref>(vm::cast(addr)) = ppu.gpr[op.rs]; - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::STSWX(ppu_thread& ppu, ppu_opcode_t op) +template +auto STSWX() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { u64 addr = op.ra ? ppu.gpr[op.ra] + ppu.gpr[op.rb] : ppu.gpr[op.rb]; u32 count = ppu.xer.cnt & 0x7F; for (; count >= 4; count -= 4, addr += 4, op.rs = (op.rs + 1) & 31) @@ -4341,40 +5666,70 @@ bool ppu_interpreter::STSWX(ppu_thread& ppu, ppu_opcode_t op) vm::write8(vm::cast(addr + byte), byte_value); } } - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::STWBRX(ppu_thread& ppu, ppu_opcode_t op) +template +auto STWBRX() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u64 addr = op.ra ? ppu.gpr[op.ra] + ppu.gpr[op.rb] : ppu.gpr[op.rb]; vm::_ref>(vm::cast(addr)) = static_cast(ppu.gpr[op.rs]); - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::STFSX(ppu_thread& ppu, ppu_opcode_t op) +template +auto STFSX() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u64 addr = op.ra ? ppu.gpr[op.ra] + ppu.gpr[op.rb] : ppu.gpr[op.rb]; vm::_ref(vm::cast(addr)) = static_cast(ppu.fpr[op.frs]); - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::STVRX(ppu_thread& ppu, ppu_opcode_t op) +template +auto STVRX() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u64 addr = op.ra ? ppu.gpr[op.ra] + ppu.gpr[op.rb] : ppu.gpr[op.rb]; - s_use_ssse3 ? sse_cellbe_stvrx(addr, ppu.vr[op.vs].vi) : sse_cellbe_stvrx_v0(addr, ppu.vr[op.vs].vi); - return true; + s_use_ssse3 ? sse_cellbe_stvrx(addr, ppu.vr[op.vs]) : sse_cellbe_stvrx_v0(addr, ppu.vr[op.vs]); + }; + RETURN_(ppu, op); } -bool ppu_interpreter::STFSUX(ppu_thread& ppu, ppu_opcode_t op) +template +auto STFSUX() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u64 addr = ppu.gpr[op.ra] + ppu.gpr[op.rb]; vm::_ref(vm::cast(addr)) = static_cast(ppu.fpr[op.frs]); ppu.gpr[op.ra] = addr; - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::STSWI(ppu_thread& ppu, ppu_opcode_t op) +template +auto STSWI() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { u64 addr = op.ra ? ppu.gpr[op.ra] : 0; u64 N = op.rb ? op.rb : 32; u8 reg = op.rd; @@ -4400,38 +5755,63 @@ bool ppu_interpreter::STSWI(ppu_thread& ppu, ppu_opcode_t op) } reg = (reg + 1) % 32; } - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::STFDX(ppu_thread& ppu, ppu_opcode_t op) +template +auto STFDX() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u64 addr = op.ra ? 
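// [editorial sketch, not part of the patch] The byte-reverse stores
// (STWBRX/STHBRX/STDBRX) write through an endian-swapping reference type in
// the VM layer; relative to the guest's big-endian memory that amounts to a
// plain byte reversal. The reversal itself for the word case:
#include <cassert>
#include <cstdint>

static uint32_t bswap32(uint32_t v)
{
    return (v >> 24) | ((v >> 8) & 0xff00) | ((v << 8) & 0xff0000) | (v << 24);
}

int main()
{
    assert(bswap32(0x11223344) == 0x44332211);
}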
ppu.gpr[op.ra] + ppu.gpr[op.rb] : ppu.gpr[op.rb]; vm::_ref(vm::cast(addr)) = ppu.fpr[op.frs]; - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::STFDUX(ppu_thread& ppu, ppu_opcode_t op) +template +auto STFDUX() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u64 addr = ppu.gpr[op.ra] + ppu.gpr[op.rb]; vm::_ref(vm::cast(addr)) = ppu.fpr[op.frs]; ppu.gpr[op.ra] = addr; - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::LVLXL(ppu_thread& ppu, ppu_opcode_t op) +template +auto LVLXL() { - return LVLX(ppu, op); + return LVLX(); } -bool ppu_interpreter::LHBRX(ppu_thread& ppu, ppu_opcode_t op) +template +auto LHBRX() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u64 addr = op.ra ? ppu.gpr[op.ra] + ppu.gpr[op.rb] : ppu.gpr[op.rb]; ppu.gpr[op.rd] = ppu_feed_data>(ppu, addr); - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::SRAW(ppu_thread& ppu, ppu_opcode_t op) +template +auto SRAW() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { s32 RS = static_cast(ppu.gpr[op.rs]); u8 shift = ppu.gpr[op.rb] & 63; if (shift > 31) @@ -4445,12 +5825,19 @@ bool ppu_interpreter::SRAW(ppu_thread& ppu, ppu_opcode_t op) ppu.xer.ca = (RS < 0) && ((ppu.gpr[op.ra] << shift) != static_cast(RS)); } - if (op.rc) [[unlikely]] ppu_cr_set(ppu, 0, ppu.gpr[op.ra], 0); - return true; + if constexpr (((Flags == has_rc) || ...)) + ppu_cr_set(ppu, 0, ppu.gpr[op.ra], 0); + }; + RETURN_(ppu, op); } -bool ppu_interpreter::SRAD(ppu_thread& ppu, ppu_opcode_t op) +template +auto SRAD() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { s64 RS = ppu.gpr[op.rs]; u8 shift = ppu.gpr[op.rb] & 127; if (shift > 63) @@ -4464,99 +5851,174 @@ bool ppu_interpreter::SRAD(ppu_thread& ppu, ppu_opcode_t op) ppu.xer.ca = (RS < 0) && ((ppu.gpr[op.ra] << shift) != static_cast(RS)); } - if (op.rc) [[unlikely]] ppu_cr_set(ppu, 0, ppu.gpr[op.ra], 0); - return true; + if constexpr (((Flags == has_rc) || ...)) + ppu_cr_set(ppu, 0, ppu.gpr[op.ra], 0); + }; + RETURN_(ppu, op); } -bool ppu_interpreter::LVRXL(ppu_thread& ppu, ppu_opcode_t op) +template +auto LVRXL() { - return LVRX(ppu, op); + return LVRX(); } -bool ppu_interpreter::DSS(ppu_thread&, ppu_opcode_t) +template +auto DSS() { - return true; + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](auto&&, auto) { + }; + RETURN_(ppu, op); } -bool ppu_interpreter::SRAWI(ppu_thread& ppu, ppu_opcode_t op) +template +auto SRAWI() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { s32 RS = static_cast(ppu.gpr[op.rs]); ppu.gpr[op.ra] = RS >> op.sh32; ppu.xer.ca = (RS < 0) && (static_cast(ppu.gpr[op.ra] << op.sh32) != static_cast(RS)); - if (op.rc) [[unlikely]] ppu_cr_set(ppu, 0, ppu.gpr[op.ra], 0); - return true; + if constexpr (((Flags == has_rc) || ...)) + ppu_cr_set(ppu, 0, ppu.gpr[op.ra], 0); + }; + RETURN_(ppu, op); } -bool ppu_interpreter::SRADI(ppu_thread& ppu, ppu_opcode_t op) +template +auto SRADI() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto 
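// [editorial sketch, not part of the patch] For the algebraic shifts
// (SRAW/SRAD/SRAWI/SRADI) the carry rule is: XER.CA is set iff the source was
// negative and at least one 1-bit was shifted out. The handlers test this by
// shifting the result back and comparing with the source; the two
// formulations agree:
#include <cassert>
#include <cstdint>

static bool sradi_ca(int64_t rs, unsigned sh) // sh in 0..63
{
    const int64_t ra = rs >> sh; // arithmetic shift, as on the targeted compilers
    const bool reshift_test = (rs < 0) && ((uint64_t(ra) << sh) != uint64_t(rs));
    const bool lost_ones    = (rs < 0) && (uint64_t(rs) & ((1ull << sh) - 1)) != 0;
    assert(reshift_test == lost_ones);
    return reshift_test;
}

int main()
{
    assert(!sradi_ca(-8, 2)); // -8 >> 2 = -2 exactly, nothing lost, CA clear
    assert(sradi_ca(-9, 2));  // -9 >> 2 = -3 with bits lost, CA set
    assert(!sradi_ca(9, 2));  // a positive source never sets CA
}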
exec = [](ppu_thread& ppu, ppu_opcode_t op) { auto sh = op.sh64; s64 RS = ppu.gpr[op.rs]; ppu.gpr[op.ra] = RS >> sh; ppu.xer.ca = (RS < 0) && ((ppu.gpr[op.ra] << sh) != static_cast(RS)); - if (op.rc) [[unlikely]] ppu_cr_set(ppu, 0, ppu.gpr[op.ra], 0); - return true; + if constexpr (((Flags == has_rc) || ...)) + ppu_cr_set(ppu, 0, ppu.gpr[op.ra], 0); + }; + RETURN_(ppu, op); } -bool ppu_interpreter::EIEIO(ppu_thread&, ppu_opcode_t) +template +auto EIEIO() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](auto&&, auto) { atomic_fence_seq_cst(); - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::STVLXL(ppu_thread& ppu, ppu_opcode_t op) +template +auto STVLXL() { - return STVLX(ppu, op); + return STVLX(); } -bool ppu_interpreter::STHBRX(ppu_thread& ppu, ppu_opcode_t op) +template +auto STHBRX() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u64 addr = op.ra ? ppu.gpr[op.ra] + ppu.gpr[op.rb] : ppu.gpr[op.rb]; vm::_ref>(vm::cast(addr)) = static_cast(ppu.gpr[op.rs]); - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::EXTSH(ppu_thread& ppu, ppu_opcode_t op) +template +auto EXTSH() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { ppu.gpr[op.ra] = static_cast(ppu.gpr[op.rs]); - if (op.rc) [[unlikely]] ppu_cr_set(ppu, 0, ppu.gpr[op.ra], 0); - return true; + if constexpr (((Flags == has_rc) || ...)) + ppu_cr_set(ppu, 0, ppu.gpr[op.ra], 0); + }; + RETURN_(ppu, op); } -bool ppu_interpreter::STVRXL(ppu_thread& ppu, ppu_opcode_t op) +template +auto STVRXL() { - return STVRX(ppu, op); + return STVRX(); } -bool ppu_interpreter::EXTSB(ppu_thread& ppu, ppu_opcode_t op) +template +auto EXTSB() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { ppu.gpr[op.ra] = static_cast(ppu.gpr[op.rs]); - if (op.rc) [[unlikely]] ppu_cr_set(ppu, 0, ppu.gpr[op.ra], 0); - return true; + if constexpr (((Flags == has_rc) || ...)) + ppu_cr_set(ppu, 0, ppu.gpr[op.ra], 0); + }; + RETURN_(ppu, op); } -bool ppu_interpreter::STFIWX(ppu_thread& ppu, ppu_opcode_t op) +template +auto STFIWX() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u64 addr = op.ra ? 
ppu.gpr[op.ra] + ppu.gpr[op.rb] : ppu.gpr[op.rb]; vm::write32(vm::cast(addr), static_cast(std::bit_cast(ppu.fpr[op.frs]))); - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::EXTSW(ppu_thread& ppu, ppu_opcode_t op) +template +auto EXTSW() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { ppu.gpr[op.ra] = static_cast(ppu.gpr[op.rs]); - if (op.rc) [[unlikely]] ppu_cr_set(ppu, 0, ppu.gpr[op.ra], 0); - return true; + if constexpr (((Flags == has_rc) || ...)) + ppu_cr_set(ppu, 0, ppu.gpr[op.ra], 0); + }; + RETURN_(ppu, op); } -bool ppu_interpreter::ICBI(ppu_thread&, ppu_opcode_t) +template +auto ICBI() { - return true; + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](auto&&, auto) { + }; + RETURN_(ppu, op); } -bool ppu_interpreter::DCBZ(ppu_thread& ppu, ppu_opcode_t op) +template +auto DCBZ() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u64 addr = op.ra ? ppu.gpr[op.ra] + ppu.gpr[op.rb] : ppu.gpr[op.rb]; const u32 addr0 = vm::cast(addr) & ~127; @@ -4564,46 +6026,76 @@ bool ppu_interpreter::DCBZ(ppu_thread& ppu, ppu_opcode_t op) { alignas(64) static constexpr u8 zero_buf[128]{}; do_cell_atomic_128_store(addr0, zero_buf); - return true; + return; } std::memset(vm::base(addr0), 0, 128); - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::LWZ(ppu_thread& ppu, ppu_opcode_t op) +template +auto LWZ() { - const u64 addr = op.ra ? ppu.gpr[op.ra] + op.simm16 : op.simm16; + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { + const u64 addr = op.ra || 1 ? ppu.gpr[op.ra] + op.simm16 : op.simm16; ppu.gpr[op.rd] = ppu_feed_data(ppu, addr); - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::LWZU(ppu_thread& ppu, ppu_opcode_t op) +template +auto LWZU() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u64 addr = ppu.gpr[op.ra] + op.simm16; ppu.gpr[op.rd] = ppu_feed_data(ppu, addr); ppu.gpr[op.ra] = addr; - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::LBZ(ppu_thread& ppu, ppu_opcode_t op) +template +auto LBZ() { - const u64 addr = op.ra ? ppu.gpr[op.ra] + op.simm16 : op.simm16; + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { + const u64 addr = op.ra || 1 ? ppu.gpr[op.ra] + op.simm16 : op.simm16; ppu.gpr[op.rd] = ppu_feed_data(ppu, addr); - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::LBZU(ppu_thread& ppu, ppu_opcode_t op) +template +auto LBZU() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u64 addr = ppu.gpr[op.ra] + op.simm16; ppu.gpr[op.rd] = ppu_feed_data(ppu, addr); ppu.gpr[op.ra] = addr; - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::STW(ppu_thread& ppu, ppu_opcode_t op) +template +auto STW() { - const u64 addr = op.ra ? ppu.gpr[op.ra] + op.simm16 : op.simm16; + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { + const u64 addr = op.ra || 1 ? 
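// [editorial sketch, not part of the patch] DCBZ zeroes one whole 128-byte
// cache line: the effective address is aligned down with & ~127 and 128 bytes
// are cleared (atomically on one path in the handler, with a plain memset
// otherwise). The alignment and extent demonstrated on a host buffer:
#include <cassert>
#include <cstdint>
#include <cstring>

int main()
{
    alignas(128) unsigned char ram[256];
    std::memset(ram, 0xAA, sizeof(ram));

    const uint32_t ea   = 130;        // anywhere inside the second line
    const uint32_t line = ea & ~127u; // aligns down to 128
    std::memset(ram + line, 0, 128);  // the whole line is cleared, not just from ea

    assert(ram[127] == 0xAA && ram[128] == 0 && ram[255] == 0);
}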
ppu.gpr[op.ra] + op.simm16 : op.simm16; const u32 value = static_cast(ppu.gpr[op.rs]); vm::write32(vm::cast(addr), value); @@ -4613,351 +6105,537 @@ bool ppu_interpreter::STW(ppu_thread& ppu, ppu_opcode_t op) vm::reservation_update(vm::cast(addr)); } - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::STWU(ppu_thread& ppu, ppu_opcode_t op) +template +auto STWU() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u64 addr = ppu.gpr[op.ra] + op.simm16; vm::write32(vm::cast(addr), static_cast(ppu.gpr[op.rs])); ppu.gpr[op.ra] = addr; - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::STB(ppu_thread& ppu, ppu_opcode_t op) +template +auto STB() { - const u64 addr = op.ra ? ppu.gpr[op.ra] + op.simm16 : op.simm16; + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { + const u64 addr = op.ra || 1 ? ppu.gpr[op.ra] + op.simm16 : op.simm16; vm::write8(vm::cast(addr), static_cast(ppu.gpr[op.rs])); - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::STBU(ppu_thread& ppu, ppu_opcode_t op) +template +auto STBU() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u64 addr = ppu.gpr[op.ra] + op.simm16; vm::write8(vm::cast(addr), static_cast(ppu.gpr[op.rs])); ppu.gpr[op.ra] = addr; - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::LHZ(ppu_thread& ppu, ppu_opcode_t op) +template +auto LHZ() { - const u64 addr = op.ra ? ppu.gpr[op.ra] + op.simm16 : op.simm16; + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { + const u64 addr = op.ra || 1 ? ppu.gpr[op.ra] + op.simm16 : op.simm16; ppu.gpr[op.rd] = ppu_feed_data(ppu, addr); - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::LHZU(ppu_thread& ppu, ppu_opcode_t op) +template +auto LHZU() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u64 addr = ppu.gpr[op.ra] + op.simm16; ppu.gpr[op.rd] = ppu_feed_data(ppu, addr); ppu.gpr[op.ra] = addr; - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::LHA(ppu_thread& ppu, ppu_opcode_t op) +template +auto LHA() { - const u64 addr = op.ra ? ppu.gpr[op.ra] + op.simm16 : op.simm16; + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { + const u64 addr = op.ra || 1 ? ppu.gpr[op.ra] + op.simm16 : op.simm16; ppu.gpr[op.rd] = ppu_feed_data(ppu, addr); - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::LHAU(ppu_thread& ppu, ppu_opcode_t op) +template +auto LHAU() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u64 addr = ppu.gpr[op.ra] + op.simm16; ppu.gpr[op.rd] = ppu_feed_data(ppu, addr); ppu.gpr[op.ra] = addr; - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::STH(ppu_thread& ppu, ppu_opcode_t op) +template +auto STH() { - const u64 addr = op.ra ? 
ppu.gpr[op.ra] + op.simm16 : op.simm16; + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { + const u64 addr = op.ra || 1 ? ppu.gpr[op.ra] + op.simm16 : op.simm16; vm::write16(vm::cast(addr), static_cast(ppu.gpr[op.rs])); - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::STHU(ppu_thread& ppu, ppu_opcode_t op) +template +auto STHU() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u64 addr = ppu.gpr[op.ra] + op.simm16; vm::write16(vm::cast(addr), static_cast(ppu.gpr[op.rs])); ppu.gpr[op.ra] = addr; - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::LMW(ppu_thread& ppu, ppu_opcode_t op) +template +auto LMW() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { u64 addr = op.ra ? ppu.gpr[op.ra] + op.simm16 : op.simm16; for (u32 i = op.rd; i<32; ++i, addr += 4) { ppu.gpr[i] = ppu_feed_data(ppu, addr); } - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::STMW(ppu_thread& ppu, ppu_opcode_t op) +template +auto STMW() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { u64 addr = op.ra ? ppu.gpr[op.ra] + op.simm16 : op.simm16; for (u32 i = op.rs; i<32; ++i, addr += 4) { vm::write32(vm::cast(addr), static_cast(ppu.gpr[i])); } - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::LFS(ppu_thread& ppu, ppu_opcode_t op) +template +auto LFS() { - const u64 addr = op.ra ? ppu.gpr[op.ra] + op.simm16 : op.simm16; + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { + const u64 addr = op.ra || 1 ? ppu.gpr[op.ra] + op.simm16 : op.simm16; ppu.fpr[op.frd] = ppu_feed_data(ppu, addr); - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::LFSU(ppu_thread& ppu, ppu_opcode_t op) +template +auto LFSU() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u64 addr = ppu.gpr[op.ra] + op.simm16; ppu.fpr[op.frd] = ppu_feed_data(ppu, addr); ppu.gpr[op.ra] = addr; - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::LFD(ppu_thread& ppu, ppu_opcode_t op) +template +auto LFD() { - const u64 addr = op.ra ? ppu.gpr[op.ra] + op.simm16 : op.simm16; + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { + const u64 addr = op.ra || 1 ? ppu.gpr[op.ra] + op.simm16 : op.simm16; ppu.fpr[op.frd] = ppu_feed_data(ppu, addr); - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::LFDU(ppu_thread& ppu, ppu_opcode_t op) +template +auto LFDU() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u64 addr = ppu.gpr[op.ra] + op.simm16; ppu.fpr[op.frd] = ppu_feed_data(ppu, addr); ppu.gpr[op.ra] = addr; - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::STFS(ppu_thread& ppu, ppu_opcode_t op) +template +auto STFS() { - const u64 addr = op.ra ? 
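// [editorial sketch, not part of the patch] LMW/STMW are the "multiple word"
// forms: starting at rd/rs they walk every GPR up to r31, moving one 32-bit
// word per register at consecutive addresses, exactly the loop shape used in
// the handlers. The register/address walk on its own:
#include <cassert>
#include <cstdint>

int main()
{
    uint32_t words_moved = 0;
    uint64_t addr = 0x1000;
    for (uint32_t i = 29; i < 32; ++i, addr += 4) // e.g. LMW r29 touches r29..r31
        ++words_moved;

    assert(words_moved == 3 && addr == 0x100c);
}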
ppu.gpr[op.ra] + op.simm16 : op.simm16; + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { + const u64 addr = op.ra || 1 ? ppu.gpr[op.ra] + op.simm16 : op.simm16; vm::_ref(vm::cast(addr)) = static_cast(ppu.fpr[op.frs]); - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::STFSU(ppu_thread& ppu, ppu_opcode_t op) +template +auto STFSU() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u64 addr = ppu.gpr[op.ra] + op.simm16; vm::_ref(vm::cast(addr)) = static_cast(ppu.fpr[op.frs]); ppu.gpr[op.ra] = addr; - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::STFD(ppu_thread& ppu, ppu_opcode_t op) +template +auto STFD() { - const u64 addr = op.ra ? ppu.gpr[op.ra] + op.simm16 : op.simm16; + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { + const u64 addr = op.ra || 1 ? ppu.gpr[op.ra] + op.simm16 : op.simm16; vm::_ref(vm::cast(addr)) = ppu.fpr[op.frs]; - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::STFDU(ppu_thread& ppu, ppu_opcode_t op) +template +auto STFDU() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u64 addr = ppu.gpr[op.ra] + op.simm16; vm::_ref(vm::cast(addr)) = ppu.fpr[op.frs]; ppu.gpr[op.ra] = addr; - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::LD(ppu_thread& ppu, ppu_opcode_t op) +template +auto LD() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u64 addr = (op.simm16 & ~3) + (op.ra ? ppu.gpr[op.ra] : 0); ppu.gpr[op.rd] = ppu_feed_data(ppu, addr); - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::LDU(ppu_thread& ppu, ppu_opcode_t op) +template +auto LDU() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u64 addr = ppu.gpr[op.ra] + (op.simm16 & ~3); ppu.gpr[op.rd] = ppu_feed_data(ppu, addr); ppu.gpr[op.ra] = addr; - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::LWA(ppu_thread& ppu, ppu_opcode_t op) +template +auto LWA() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u64 addr = (op.simm16 & ~3) + (op.ra ? ppu.gpr[op.ra] : 0); ppu.gpr[op.rd] = ppu_feed_data(ppu, addr); - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::STD(ppu_thread& ppu, ppu_opcode_t op) +template +auto STD() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u64 addr = (op.simm16 & ~3) + (op.ra ? 
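// [editorial sketch, not part of the patch] LD/LDU/LWA/STD/STDU are DS-form:
// the low two bits of the 16-bit displacement field belong to the opcode, so
// the handlers mask them off with (op.simm16 & ~3) before adding the base
// register. Effect of the mask on a signed displacement:
#include <cassert>
#include <cstdint>

int main()
{
    const int32_t simm16 = -7;          // raw field with the opcode bits still set
    const int64_t ds     = simm16 & ~3; // -> -8, a multiple of 4
    assert(ds == -8 && ds % 4 == 0);
}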
ppu.gpr[op.ra] : 0); vm::write64(vm::cast(addr), ppu.gpr[op.rs]); - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::STDU(ppu_thread& ppu, ppu_opcode_t op) +template +auto STDU() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u64 addr = ppu.gpr[op.ra] + (op.simm16 & ~3); vm::write64(vm::cast(addr), ppu.gpr[op.rs]); ppu.gpr[op.ra] = addr; - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter_fast::FDIVS(ppu_thread& ppu, ppu_opcode_t op) +template +auto FDIVS() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { ppu.fpr[op.frd] = f32(ppu.fpr[op.fra] / ppu.fpr[op.frb]); - return true; + ppu_set_fpcc(ppu, ppu.fpr[op.frd], 0.); + }; + RETURN_(ppu, op); } -bool ppu_interpreter_precise::FDIVS(ppu_thread& ppu, ppu_opcode_t op) +template +auto FSUBS() { - const f64 res = ppu.fpr[op.frd] = f32(ppu.fpr[op.fra] / ppu.fpr[op.frb]); - ppu_fpcc_set(ppu, res, 0., op.rc); - return true; -} + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select(); -bool ppu_interpreter_fast::FSUBS(ppu_thread& ppu, ppu_opcode_t op) -{ + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { ppu.fpr[op.frd] = f32(ppu.fpr[op.fra] - ppu.fpr[op.frb]); - return true; + ppu_set_fpcc(ppu, ppu.fpr[op.frd], 0.); + }; + RETURN_(ppu, op); } -bool ppu_interpreter_precise::FSUBS(ppu_thread& ppu, ppu_opcode_t op) +template +auto FADDS() { - const f64 res = ppu.fpr[op.frd] = f32(ppu.fpr[op.fra] - ppu.fpr[op.frb]); - ppu_fpcc_set(ppu, res, 0., op.rc); - return true; -} + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select(); -bool ppu_interpreter_fast::FADDS(ppu_thread& ppu, ppu_opcode_t op) -{ + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { ppu.fpr[op.frd] = f32(ppu.fpr[op.fra] + ppu.fpr[op.frb]); - return true; + ppu_set_fpcc(ppu, ppu.fpr[op.frd], 0.); + }; + RETURN_(ppu, op); } -bool ppu_interpreter_precise::FADDS(ppu_thread& ppu, ppu_opcode_t op) +template +auto FSQRTS() { - const f64 res = ppu.fpr[op.frd] = f32(ppu.fpr[op.fra] + ppu.fpr[op.frb]); - ppu_fpcc_set(ppu, res, 0., op.rc); - return true; -} + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select(); -bool ppu_interpreter_fast::FSQRTS(ppu_thread& ppu, ppu_opcode_t op) -{ + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { ppu.fpr[op.frd] = f32(std::sqrt(ppu.fpr[op.frb])); - return true; + ppu_set_fpcc(ppu, ppu.fpr[op.frd], 0.); + }; + RETURN_(ppu, op); } -bool ppu_interpreter_precise::FSQRTS(ppu_thread& ppu, ppu_opcode_t op) +template +auto FRES() { - const f64 res = ppu.fpr[op.frd] = f32(std::sqrt(ppu.fpr[op.frb])); - ppu_fpcc_set(ppu, res, 0., op.rc); - return true; -} + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select(); -bool ppu_interpreter_fast::FRES(ppu_thread& ppu, ppu_opcode_t op) -{ + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { ppu.fpr[op.frd] = f32(1.0 / ppu.fpr[op.frb]); - return true; + ppu_set_fpcc(ppu, ppu.fpr[op.frd], 0.); + }; + RETURN_(ppu, op); } -bool ppu_interpreter_precise::FRES(ppu_thread& ppu, ppu_opcode_t op) +template +auto FMULS() { - const f64 res = ppu.fpr[op.frd] = f32(1.0 / ppu.fpr[op.frb]); - ppu_fpcc_set(ppu, res, 0., op.rc); - return true; -} + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select(); -bool ppu_interpreter_fast::FMULS(ppu_thread& ppu, 
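// [editorial sketch, not part of the patch] With the precise/fast split gone,
// the single-precision arithmetic handlers all follow one shape: evaluate in
// double, round through f32, store the widened result, then call ppu_set_fpcc
// with that result and 0. The rounding step plus the kind of
// less/greater/equal/unordered classification an FPCC update is based on
// (the helper below is illustrative, not the emulator's ppu_set_fpcc):
#include <cassert>
#include <cmath>

struct fpcc { bool fl, fg, fe, fu; };

static fpcc classify(double res, double with)
{
    return { res < with, res > with, res == with, std::isnan(res) || std::isnan(with) };
}

int main()
{
    const double fra = 1.0, frb = 3.0;
    const double frd = static_cast<float>(fra / frb); // FDIVS: double op, f32 rounding
    assert(frd != fra / frb);                         // the f32 rounding is observable

    const fpcc cc = classify(frd, 0.0);
    assert(cc.fg && !cc.fl && !cc.fe && !cc.fu);
}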
ppu_opcode_t op) -{ + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { ppu.fpr[op.frd] = f32(ppu.fpr[op.fra] * ppu.fpr[op.frc]); - return true; + ppu_set_fpcc(ppu, ppu.fpr[op.frd], 0.); + }; + RETURN_(ppu, op); } -bool ppu_interpreter_precise::FMULS(ppu_thread& ppu, ppu_opcode_t op) +template +auto FMADDS() { - const f64 res = ppu.fpr[op.frd] = f32(ppu.fpr[op.fra] * ppu.fpr[op.frc]); - ppu_fpcc_set(ppu, res, 0., op.rc); - return true; + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { + if constexpr (((Flags == use_dfma) || ...)) + ppu.fpr[op.frd] = f32(std::fma(ppu.fpr[op.fra], ppu.fpr[op.frc], ppu.fpr[op.frb])); + else + ppu.fpr[op.frd] = f32(ppu.fpr[op.fra] * ppu.fpr[op.frc] + ppu.fpr[op.frb]); + + ppu_set_fpcc(ppu, ppu.fpr[op.frd], 0.); + }; + RETURN_(ppu, op); } -bool ppu_interpreter_fast::FMADDS(ppu_thread& ppu, ppu_opcode_t op) +template +auto FMSUBS() { - ppu.fpr[op.frd] = f32(ppu.fpr[op.fra] * ppu.fpr[op.frc] + ppu.fpr[op.frb]); - return true; + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { + if constexpr (((Flags == use_dfma) || ...)) + ppu.fpr[op.frd] = f32(std::fma(ppu.fpr[op.fra], ppu.fpr[op.frc], -ppu.fpr[op.frb])); + else + ppu.fpr[op.frd] = f32(ppu.fpr[op.fra] * ppu.fpr[op.frc] - ppu.fpr[op.frb]); + + ppu_set_fpcc(ppu, ppu.fpr[op.frd], 0.); + }; + RETURN_(ppu, op); } -bool ppu_interpreter_precise::FMADDS(ppu_thread& ppu, ppu_opcode_t op) +template +auto FNMSUBS() { - const f64 res = ppu.fpr[op.frd] = f32(std::fma(ppu.fpr[op.fra], ppu.fpr[op.frc], ppu.fpr[op.frb])); - ppu_fpcc_set(ppu, res, 0., op.rc); - return true; + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { + if constexpr (((Flags == use_dfma) || ...)) + ppu.fpr[op.frd] = f32(-std::fma(ppu.fpr[op.fra], ppu.fpr[op.frc], -ppu.fpr[op.frb])); + else + ppu.fpr[op.frd] = f32(-(ppu.fpr[op.fra] * ppu.fpr[op.frc] - ppu.fpr[op.frb])); + + ppu_set_fpcc(ppu, ppu.fpr[op.frd], 0.); + }; + RETURN_(ppu, op); } -bool ppu_interpreter_fast::FMSUBS(ppu_thread& ppu, ppu_opcode_t op) +template +auto FNMADDS() { - ppu.fpr[op.frd] = f32(ppu.fpr[op.fra] * ppu.fpr[op.frc] - ppu.fpr[op.frb]); - return true; + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { + if constexpr (((Flags == use_dfma) || ...)) + ppu.fpr[op.frd] = f32(-std::fma(ppu.fpr[op.fra], ppu.fpr[op.frc], ppu.fpr[op.frb])); + else + ppu.fpr[op.frd] = f32(-(ppu.fpr[op.fra] * ppu.fpr[op.frc] + ppu.fpr[op.frb])); + + ppu_set_fpcc(ppu, ppu.fpr[op.frd], 0.); + }; + RETURN_(ppu, op); } -bool ppu_interpreter_precise::FMSUBS(ppu_thread& ppu, ppu_opcode_t op) +template +auto MTFSB1() { - const f64 res = ppu.fpr[op.frd] = f32(std::fma(ppu.fpr[op.fra], ppu.fpr[op.frc], -ppu.fpr[op.frb])); - ppu_fpcc_set(ppu, res, 0., op.rc); - return true; -} + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); -bool ppu_interpreter_fast::FNMSUBS(ppu_thread& ppu, ppu_opcode_t op) -{ - ppu.fpr[op.frd] = f32(-(ppu.fpr[op.fra] * ppu.fpr[op.frc] - ppu.fpr[op.frb])); - return true; -} - -bool ppu_interpreter_precise::FNMSUBS(ppu_thread& ppu, ppu_opcode_t op) -{ - const f64 res = ppu.fpr[op.frd] = f32(-std::fma(ppu.fpr[op.fra], ppu.fpr[op.frc], -ppu.fpr[op.frb])); - ppu_fpcc_set(ppu, res, 0., 
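Note: the ((Flags == use_dfma) || ...) fold expressions above resolve the accuracy choice at compile time: with use_dfma the handler calls std::fma, which rounds once, while the plain a * c + b path rounds the product first. A small standalone illustration of why the two differ follows; the values are chosen only to make the double rounding visible and are not taken from the patch.

#include <cmath>
#include <cstdio>

int main()
{
    const double a = 1.0 + 0x1p-30;   // product a*c is 1 - 2^-60 exactly
    const double c = 1.0 - 0x1p-30;
    const double b = -1.0;

    const double fused   = std::fma(a, c, b); // single rounding: -0x1p-60
    const double unfused = a * c + b;         // product rounds to 1.0, result is 0.0

    std::printf("fused = %a, unfused = %a\n", fused, unfused);
}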
op.rc); - return true; -} - -bool ppu_interpreter_fast::FNMADDS(ppu_thread& ppu, ppu_opcode_t op) -{ - ppu.fpr[op.frd] = f32(-(ppu.fpr[op.fra] * ppu.fpr[op.frc] + ppu.fpr[op.frb])); - return true; -} - -bool ppu_interpreter_precise::FNMADDS(ppu_thread& ppu, ppu_opcode_t op) -{ - const f64 res = ppu.fpr[op.frd] = f32(-std::fma(ppu.fpr[op.fra], ppu.fpr[op.frc], ppu.fpr[op.frb])); - ppu_fpcc_set(ppu, res, 0., op.rc); - return true; -} - -bool ppu_interpreter::MTFSB1(ppu_thread& ppu, ppu_opcode_t op) -{ + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u32 bit = op.crbd; if (bit < 16 || bit > 19) ppu_log.warning("MTFSB1(%d)", bit); ppu.fpscr.bits[bit] = 1; - if (op.rc) [[unlikely]] ppu_cr_set(ppu, 1, ppu.fpscr.fg, ppu.fpscr.fl, ppu.fpscr.fe, ppu.fpscr.fu); - return true; + if constexpr (((Flags == has_rc) || ...)) + ppu_cr_set(ppu, 1, ppu.fpscr.fg, ppu.fpscr.fl, ppu.fpscr.fe, ppu.fpscr.fu); + }; + RETURN_(ppu, op); } -bool ppu_interpreter::MCRFS(ppu_thread& ppu, ppu_opcode_t op) +template +auto MCRFS() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { if (op.crfs != 4) ppu_log.warning("MCRFS(%d)", op.crfs); ppu.cr.fields[op.crfd] = ppu.fpscr.fields[op.crfs]; - return true; + }; + RETURN_(ppu, op); } -bool ppu_interpreter::MTFSB0(ppu_thread& ppu, ppu_opcode_t op) +template +auto MTFSB0() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u32 bit = op.crbd; if (bit < 16 || bit > 19) ppu_log.warning("MTFSB0(%d)", bit); ppu.fpscr.bits[bit] = 0; - if (op.rc) [[unlikely]] ppu_cr_set(ppu, 1, ppu.fpscr.fg, ppu.fpscr.fl, ppu.fpscr.fe, ppu.fpscr.fu); - return true; + if constexpr (((Flags == has_rc) || ...)) + ppu_cr_set(ppu, 1, ppu.fpscr.fg, ppu.fpscr.fl, ppu.fpscr.fe, ppu.fpscr.fu); + }; + RETURN_(ppu, op); } -bool ppu_interpreter::MTFSFI(ppu_thread& ppu, ppu_opcode_t op) +template +auto MTFSFI() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u32 bf = op.crfd; if (bf != 4) @@ -4987,270 +6665,1392 @@ bool ppu_interpreter::MTFSFI(ppu_thread& ppu, ppu_opcode_t op) ppu.fpscr.fields[bf] = all_values[op.i]; } - if (op.rc) [[unlikely]] ppu_cr_set(ppu, 1, ppu.fpscr.fg, ppu.fpscr.fl, ppu.fpscr.fe, ppu.fpscr.fu); - return true; + if constexpr (((Flags == has_rc) || ...)) + ppu_cr_set(ppu, 1, ppu.fpscr.fg, ppu.fpscr.fl, ppu.fpscr.fe, ppu.fpscr.fu); + }; + RETURN_(ppu, op); } -bool ppu_interpreter::MFFS(ppu_thread& ppu, ppu_opcode_t op) +template +auto MFFS() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { ppu_log.warning("MFFS"); ppu.fpr[op.frd] = std::bit_cast(u64{ppu.fpscr.fl} << 15 | u64{ppu.fpscr.fg} << 14 | u64{ppu.fpscr.fe} << 13 | u64{ppu.fpscr.fu} << 12); - if (op.rc) [[unlikely]] ppu_cr_set(ppu, 1, ppu.fpscr.fg, ppu.fpscr.fl, ppu.fpscr.fe, ppu.fpscr.fu); - return true; + if constexpr (((Flags == has_rc) || ...)) + ppu_cr_set(ppu, 1, ppu.fpscr.fg, ppu.fpscr.fl, ppu.fpscr.fe, ppu.fpscr.fu); + }; + RETURN_(ppu, op); } -bool ppu_interpreter::MTFSF(ppu_thread& ppu, ppu_opcode_t op) +template +auto MTFSF() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](auto&& ppu, auto) { ppu_log.warning("MTFSF"); - if (op.rc) 
[[unlikely]] ppu_cr_set(ppu, 1, ppu.fpscr.fg, ppu.fpscr.fl, ppu.fpscr.fe, ppu.fpscr.fu); - return true; + if constexpr (((Flags == has_rc) || ...)) + ppu_cr_set(ppu, 1, ppu.fpscr.fg, ppu.fpscr.fl, ppu.fpscr.fe, ppu.fpscr.fu); + }; + RETURN_(ppu, op); } -bool ppu_interpreter::FCMPU(ppu_thread& ppu, ppu_opcode_t op) +template +auto FCMPU() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const f64 a = ppu.fpr[op.fra]; const f64 b = ppu.fpr[op.frb]; - ppu_fpcc_set(ppu, a, b, true, op.crfd); - return true; + ppu_set_fpcc(ppu, a, b, op.crfd); + }; + RETURN_(ppu, op); } -bool ppu_interpreter::FCTIW(ppu_thread& ppu, ppu_opcode_t op) +template +auto FCTIW() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const auto b = _mm_load_sd(&ppu.fpr[op.frb]); const auto res = _mm_xor_si128(_mm_cvtpd_epi32(b), _mm_castpd_si128(_mm_cmpge_pd(b, _mm_set1_pd(0x80000000)))); ppu.fpr[op.frd] = std::bit_cast(_mm_cvtsi128_si32(res)); - if (op.rc) [[unlikely]] fmt::throw_exception("%s: op.rc", __func__); //ppu_cr_set(ppu, 1, ppu.fpscr.fg, ppu.fpscr.fl, ppu.fpscr.fe, ppu.fpscr.fu); - return true; + ppu_set_fpcc(ppu, 0., 0.); // undefined (TODO) + }; + RETURN_(ppu, op); } -bool ppu_interpreter::FCTIWZ(ppu_thread& ppu, ppu_opcode_t op) +template +auto FCTIWZ() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const auto b = _mm_load_sd(&ppu.fpr[op.frb]); const auto res = _mm_xor_si128(_mm_cvttpd_epi32(b), _mm_castpd_si128(_mm_cmpge_pd(b, _mm_set1_pd(0x80000000)))); ppu.fpr[op.frd] = std::bit_cast(_mm_cvtsi128_si32(res)); - if (op.rc) [[unlikely]] fmt::throw_exception("%s: op.rc", __func__); //ppu_cr_set(ppu, 1, ppu.fpscr.fg, ppu.fpscr.fl, ppu.fpscr.fe, ppu.fpscr.fu); - return true; + ppu_set_fpcc(ppu, 0., 0.); // undefined (TODO) + }; + RETURN_(ppu, op); } -bool ppu_interpreter_fast::FRSP(ppu_thread& ppu, ppu_opcode_t op) +template +auto FRSP() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { ppu.fpr[op.frd] = f32(ppu.fpr[op.frb]); - return true; + ppu_set_fpcc(ppu, ppu.fpr[op.frd], 0.); + }; + RETURN_(ppu, op); } -bool ppu_interpreter_precise::FRSP(ppu_thread& ppu, ppu_opcode_t op) +template +auto FDIV() { - const f64 res = ppu.fpr[op.frd] = f32(ppu.fpr[op.frb]); - ppu_fpcc_set(ppu, res, 0., op.rc); - return true; -} + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select(); -bool ppu_interpreter_fast::FDIV(ppu_thread& ppu, ppu_opcode_t op) -{ + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { ppu.fpr[op.frd] = ppu.fpr[op.fra] / ppu.fpr[op.frb]; - return true; + ppu_set_fpcc(ppu, ppu.fpr[op.frd], 0.); + }; + RETURN_(ppu, op); } -bool ppu_interpreter_precise::FDIV(ppu_thread& ppu, ppu_opcode_t op) +template +auto FSUB() { - const f64 res = ppu.fpr[op.frd] = ppu.fpr[op.fra] / ppu.fpr[op.frb]; - ppu_fpcc_set(ppu, res, 0., op.rc); - return true; -} + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select(); -bool ppu_interpreter_fast::FSUB(ppu_thread& ppu, ppu_opcode_t op) -{ + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { ppu.fpr[op.frd] = ppu.fpr[op.fra] - ppu.fpr[op.frb]; - return true; + ppu_set_fpcc(ppu, ppu.fpr[op.frd], 0.); + }; + RETURN_(ppu, op); } -bool 
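Note: FCTIW/FCTIWZ above lean on x86 conversion semantics: _mm_cvtpd_epi32/_mm_cvttpd_epi32 return 0x80000000 for NaN and out-of-range inputs, and XOR-ing the result with the (b >= 2^31) compare mask turns that into 0x7fffffff for positive overflow, which is the PPC saturation result. A scalar restatement of the same trick, based on my reading of the intrinsics above rather than the emulator's code:

#include <cstdint>
#include <cmath>

std::int32_t fctiwz_like(double b)
{
    // x86 cvttsd2si-style behaviour: NaN and out-of-range inputs give 0x80000000.
    std::int32_t raw;
    if (std::isnan(b) || b >= 2147483648.0 || b < -2147483648.0)
        raw = INT32_MIN;                      // "integer indefinite"
    else
        raw = static_cast<std::int32_t>(b);   // truncate toward zero

    // The mask is all-ones exactly when b >= 2^31, i.e. positive overflow.
    const std::int32_t mask = (b >= 2147483648.0) ? -1 : 0;

    // XOR flips 0x80000000 into 0x7fffffff for positive overflow only,
    // matching PPC saturation; negative overflow and NaN keep 0x80000000.
    return raw ^ mask;
}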
ppu_interpreter_precise::FSUB(ppu_thread& ppu, ppu_opcode_t op) +template +auto FADD() { - const f64 res = ppu.fpr[op.frd] = ppu.fpr[op.fra] - ppu.fpr[op.frb]; - ppu_fpcc_set(ppu, res, 0., op.rc); - return true; -} + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select(); -bool ppu_interpreter_fast::FADD(ppu_thread& ppu, ppu_opcode_t op) -{ + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { ppu.fpr[op.frd] = ppu.fpr[op.fra] + ppu.fpr[op.frb]; - return true; + ppu_set_fpcc(ppu, ppu.fpr[op.frd], 0.); + }; + RETURN_(ppu, op); } -bool ppu_interpreter_precise::FADD(ppu_thread& ppu, ppu_opcode_t op) +template +auto FSQRT() { - const f64 res = ppu.fpr[op.frd] = ppu.fpr[op.fra] + ppu.fpr[op.frb]; - ppu_fpcc_set(ppu, res, 0., op.rc); - return true; -} + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select(); -bool ppu_interpreter_fast::FSQRT(ppu_thread& ppu, ppu_opcode_t op) -{ + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { ppu.fpr[op.frd] = std::sqrt(ppu.fpr[op.frb]); - return true; + ppu_set_fpcc(ppu, ppu.fpr[op.frd], 0.); + }; + RETURN_(ppu, op); } -bool ppu_interpreter_precise::FSQRT(ppu_thread& ppu, ppu_opcode_t op) +template +auto FSEL() { - const f64 res = ppu.fpr[op.frd] = std::sqrt(ppu.fpr[op.frb]); - ppu_fpcc_set(ppu, res, 0., op.rc); - return true; -} + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); -bool ppu_interpreter::FSEL(ppu_thread& ppu, ppu_opcode_t op) -{ + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { ppu.fpr[op.frd] = ppu.fpr[op.fra] >= 0.0 ? ppu.fpr[op.frc] : ppu.fpr[op.frb]; - if (op.rc) [[unlikely]] ppu_cr_set(ppu, 1, ppu.fpscr.fg, ppu.fpscr.fl, ppu.fpscr.fe, ppu.fpscr.fu); - return true; + if constexpr (((Flags == has_rc) || ...)) + ppu_cr_set(ppu, 1, ppu.fpscr.fg, ppu.fpscr.fl, ppu.fpscr.fe, ppu.fpscr.fu); + }; + RETURN_(ppu, op); } -bool ppu_interpreter_fast::FMUL(ppu_thread& ppu, ppu_opcode_t op) +template +auto FMUL() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { ppu.fpr[op.frd] = ppu.fpr[op.fra] * ppu.fpr[op.frc]; - return true; + ppu_set_fpcc(ppu, ppu.fpr[op.frd], 0.); + }; + RETURN_(ppu, op); } -bool ppu_interpreter_precise::FMUL(ppu_thread& ppu, ppu_opcode_t op) +template +auto FRSQRTE() { - const f64 res = ppu.fpr[op.frd] = ppu.fpr[op.fra] * ppu.fpr[op.frc]; - ppu_fpcc_set(ppu, res, 0., op.rc); - return true; -} + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select(); -bool ppu_interpreter_fast::FRSQRTE(ppu_thread& ppu, ppu_opcode_t op) -{ + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { ppu.fpr[op.frd] = 1.0 / std::sqrt(ppu.fpr[op.frb]); - return true; + ppu_set_fpcc(ppu, ppu.fpr[op.frd], 0.); + }; + RETURN_(ppu, op); } -bool ppu_interpreter_precise::FRSQRTE(ppu_thread& ppu, ppu_opcode_t op) +template +auto FMSUB() { - const f64 res = ppu.fpr[op.frd] = 1.0 / std::sqrt(ppu.fpr[op.frb]); - ppu_fpcc_set(ppu, res, 0., op.rc); - return true; + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { + if constexpr (((Flags == use_dfma) || ...)) + ppu.fpr[op.frd] = std::fma(ppu.fpr[op.fra], ppu.fpr[op.frc], -ppu.fpr[op.frb]); + else + ppu.fpr[op.frd] = ppu.fpr[op.fra] * ppu.fpr[op.frc] - ppu.fpr[op.frb]; + + ppu_set_fpcc(ppu, ppu.fpr[op.frd], 0.); + }; + RETURN_(ppu, op); } -bool ppu_interpreter_fast::FMSUB(ppu_thread& ppu, 
ppu_opcode_t op) +template +auto FMADD() { - ppu.fpr[op.frd] = ppu.fpr[op.fra] * ppu.fpr[op.frc] - ppu.fpr[op.frb]; - return true; + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { + if constexpr (((Flags == use_dfma) || ...)) + ppu.fpr[op.frd] = std::fma(ppu.fpr[op.fra], ppu.fpr[op.frc], ppu.fpr[op.frb]); + else + ppu.fpr[op.frd] = ppu.fpr[op.fra] * ppu.fpr[op.frc] + ppu.fpr[op.frb]; + + ppu_set_fpcc(ppu, ppu.fpr[op.frd], 0.); + }; + RETURN_(ppu, op); } -bool ppu_interpreter_precise::FMSUB(ppu_thread& ppu, ppu_opcode_t op) +template +auto FNMSUB() { - const f64 res = ppu.fpr[op.frd] = std::fma(ppu.fpr[op.fra], ppu.fpr[op.frc], -ppu.fpr[op.frb]); - ppu_fpcc_set(ppu, res, 0., op.rc); - return true; + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { + if constexpr (((Flags == use_dfma) || ...)) + ppu.fpr[op.frd] = -std::fma(ppu.fpr[op.fra], ppu.fpr[op.frc], -ppu.fpr[op.frb]); + else + ppu.fpr[op.frd] = -(ppu.fpr[op.fra] * ppu.fpr[op.frc] - ppu.fpr[op.frb]); + + ppu_set_fpcc(ppu, ppu.fpr[op.frd], 0.); + }; + RETURN_(ppu, op); } -bool ppu_interpreter_fast::FMADD(ppu_thread& ppu, ppu_opcode_t op) +template +auto FNMADD() { - ppu.fpr[op.frd] = ppu.fpr[op.fra] * ppu.fpr[op.frc] + ppu.fpr[op.frb]; - return true; + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { + if constexpr (((Flags == use_dfma) || ...)) + ppu.fpr[op.frd] = -std::fma(ppu.fpr[op.fra], ppu.fpr[op.frc], ppu.fpr[op.frb]); + else + ppu.fpr[op.frd] = -(ppu.fpr[op.fra] * ppu.fpr[op.frc] + ppu.fpr[op.frb]); + + ppu_set_fpcc(ppu, ppu.fpr[op.frd], 0.); + }; + RETURN_(ppu, op); } -bool ppu_interpreter_precise::FMADD(ppu_thread& ppu, ppu_opcode_t op) +template +auto FCMPO() { - const f64 res = ppu.fpr[op.frd] = std::fma(ppu.fpr[op.fra], ppu.fpr[op.frc], ppu.fpr[op.frb]); - ppu_fpcc_set(ppu, res, 0., op.rc); - return true; + return FCMPU(); } -bool ppu_interpreter_fast::FNMSUB(ppu_thread& ppu, ppu_opcode_t op) +template +auto FNEG() { - ppu.fpr[op.frd] = -(ppu.fpr[op.fra] * ppu.fpr[op.frc] - ppu.fpr[op.frb]); - return true; -} + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select(); -bool ppu_interpreter_precise::FNMSUB(ppu_thread& ppu, ppu_opcode_t op) -{ - const f64 res = ppu.fpr[op.frd] = -std::fma(ppu.fpr[op.fra], ppu.fpr[op.frc], -ppu.fpr[op.frb]); - ppu_fpcc_set(ppu, res, 0., op.rc); - return true; -} - -bool ppu_interpreter_fast::FNMADD(ppu_thread& ppu, ppu_opcode_t op) -{ - ppu.fpr[op.frd] = -(ppu.fpr[op.fra] * ppu.fpr[op.frc] + ppu.fpr[op.frb]); - return true; -} - -bool ppu_interpreter_precise::FNMADD(ppu_thread& ppu, ppu_opcode_t op) -{ - const f64 res = ppu.fpr[op.frd] = -std::fma(ppu.fpr[op.fra], ppu.fpr[op.frc], ppu.fpr[op.frb]); - ppu_fpcc_set(ppu, res, 0., op.rc); - return true; -} - -bool ppu_interpreter::FCMPO(ppu_thread& ppu, ppu_opcode_t op) -{ - return FCMPU(ppu, op); -} - -bool ppu_interpreter::FNEG(ppu_thread& ppu, ppu_opcode_t op) -{ + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { ppu.fpr[op.frd] = -ppu.fpr[op.frb]; - if (op.rc) [[unlikely]] ppu_cr_set(ppu, 1, ppu.fpscr.fg, ppu.fpscr.fl, ppu.fpscr.fe, ppu.fpscr.fu); - return true; + if constexpr (((Flags == has_rc) || ...)) + ppu_cr_set(ppu, 1, ppu.fpscr.fg, ppu.fpscr.fl, ppu.fpscr.fe, ppu.fpscr.fu); + }; + RETURN_(ppu, op); } -bool 
ppu_interpreter::FMR(ppu_thread& ppu, ppu_opcode_t op) +template +auto FMR() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { ppu.fpr[op.frd] = ppu.fpr[op.frb]; - if (op.rc) [[unlikely]] ppu_cr_set(ppu, 1, ppu.fpscr.fg, ppu.fpscr.fl, ppu.fpscr.fe, ppu.fpscr.fu); - return true; + if constexpr (((Flags == has_rc) || ...)) + ppu_cr_set(ppu, 1, ppu.fpscr.fg, ppu.fpscr.fl, ppu.fpscr.fe, ppu.fpscr.fu); + }; + RETURN_(ppu, op); } -bool ppu_interpreter::FNABS(ppu_thread& ppu, ppu_opcode_t op) +template +auto FNABS() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { ppu.fpr[op.frd] = -std::fabs(ppu.fpr[op.frb]); - if (op.rc) [[unlikely]] ppu_cr_set(ppu, 1, ppu.fpscr.fg, ppu.fpscr.fl, ppu.fpscr.fe, ppu.fpscr.fu); - return true; + if constexpr (((Flags == has_rc) || ...)) + ppu_cr_set(ppu, 1, ppu.fpscr.fg, ppu.fpscr.fl, ppu.fpscr.fe, ppu.fpscr.fu); + }; + RETURN_(ppu, op); } -bool ppu_interpreter::FABS(ppu_thread& ppu, ppu_opcode_t op) +template +auto FABS() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { ppu.fpr[op.frd] = std::fabs(ppu.fpr[op.frb]); - if (op.rc) [[unlikely]] ppu_cr_set(ppu, 1, ppu.fpscr.fg, ppu.fpscr.fl, ppu.fpscr.fe, ppu.fpscr.fu); - return true; + if constexpr (((Flags == has_rc) || ...)) + ppu_cr_set(ppu, 1, ppu.fpscr.fg, ppu.fpscr.fl, ppu.fpscr.fe, ppu.fpscr.fu); + }; + RETURN_(ppu, op); } -bool ppu_interpreter::FCTID(ppu_thread& ppu, ppu_opcode_t op) +template +auto FCTID() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const auto b = _mm_load_sd(&ppu.fpr[op.frb]); const auto res = _mm_xor_si128(_mm_set1_epi64x(_mm_cvtsd_si64(b)), _mm_castpd_si128(_mm_cmpge_pd(b, _mm_set1_pd(f64(1ull << 63))))); ppu.fpr[op.frd] = std::bit_cast(_mm_cvtsi128_si64(res)); - if (op.rc) [[unlikely]] fmt::throw_exception("%s: op.rc", __func__); //ppu_cr_set(ppu, 1, ppu.fpscr.fg, ppu.fpscr.fl, ppu.fpscr.fe, ppu.fpscr.fu); - return true; + ppu_set_fpcc(ppu, 0., 0.); // undefined (TODO) + }; + RETURN_(ppu, op); } -bool ppu_interpreter::FCTIDZ(ppu_thread& ppu, ppu_opcode_t op) +template +auto FCTIDZ() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const auto b = _mm_load_sd(&ppu.fpr[op.frb]); const auto res = _mm_xor_si128(_mm_set1_epi64x(_mm_cvttsd_si64(b)), _mm_castpd_si128(_mm_cmpge_pd(b, _mm_set1_pd(f64(1ull << 63))))); ppu.fpr[op.frd] = std::bit_cast(_mm_cvtsi128_si64(res)); - if (op.rc) [[unlikely]] fmt::throw_exception("%s: op.rc", __func__); //ppu_cr_set(ppu, 1, ppu.fpscr.fg, ppu.fpscr.fl, ppu.fpscr.fe, ppu.fpscr.fu); - return true; + ppu_set_fpcc(ppu, 0., 0.); // undefined (TODO) + }; + RETURN_(ppu, op); } -bool ppu_interpreter::FCFID(ppu_thread& ppu, ppu_opcode_t op) +template +auto FCFID() { + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select(); + + static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { _mm_store_sd(&ppu.fpr[op.frd], _mm_cvtsi64_sd(_mm_setzero_pd(), std::bit_cast(ppu.fpr[op.frb]))); - if (op.rc) [[unlikely]] fmt::throw_exception("%s: op.rc", __func__); //ppu_cr_set(ppu, 1, ppu.fpscr.fg, ppu.fpscr.fl, ppu.fpscr.fe, ppu.fpscr.fu); - return true; + 
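Note: the ((Flags == has_rc) || ...) checks above replace the old runtime if (op.rc) branch: each instruction now gets separate Rc=0 and Rc=1 specializations, and the CR1 update is compiled in only where it is needed. A compressed sketch of that idea with stand-in types (not the emulator's structures):

#include <cmath>

enum rc_flag { has_rc };
struct fregs { double fpr[32]; unsigned cr1; };
struct fop   { unsigned frd, frb; };

template <rc_flag... Flags>
void FABS_like(fregs& r, fop op)
{
    r.fpr[op.frd] = std::fabs(r.fpr[op.frb]);

    // Present only in the Rc=1 specialization; no runtime op.rc branch remains.
    if constexpr (((Flags == has_rc) || ...))
        r.cr1 = 0; // would copy FPSCR's FG/FL/FE/FU bits in the real handler
}

// The decoder keeps both variants and indexes them by the opcode's Rc bit.
void (*FABS_variants[2])(fregs&, fop) = { &FABS_like<>, &FABS_like<has_rc> };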
ppu_set_fpcc(ppu, ppu.fpr[op.frd], 0.); + }; + RETURN_(ppu, op); } -bool ppu_interpreter::UNK(ppu_thread& ppu, ppu_opcode_t op) +template +auto UNK() { - // HLE function index - const u32 index = (ppu.cia - g_fxo->get().addr) / 8; + if constexpr (Build == 0xf1a6) + return ppu_exec_select::template select<>(); - const auto& hle_funcs = ppu_function_manager::get(); - - if (ppu.cia % 8 == 4 && index < hle_funcs.size()) + if constexpr (Build == 0) { - return hle_funcs[index](ppu); + return +[](ppu_thread& ppu, ppu_opcode_t op, be_t* this_op, ppu_intrp_func*) + { + const u32 old_cia = ppu.cia; + ppu.cia = vm::get_addr(this_op); + ppu.exec_bytes += ppu.cia - old_cia; + + // HLE function index + const u32 index = (ppu.cia - g_fxo->get().addr) / 8; + + const auto& hle_funcs = ppu_function_manager::get(); + + if (ppu.cia % 8 == 4 && index < hle_funcs.size()) + { + return hle_funcs[index](ppu, op, this_op, nullptr); + } + + fmt::throw_exception("Unknown/Illegal opcode: 0x%08x at 0x%x", op.opcode, ppu.cia); + }; + } +} + +template +struct ppu_interpreter_t +{ + IT MFVSCR; + IT MTVSCR; + IT VADDCUW; + IT VADDFP; + IT VADDSBS; + IT VADDSHS; + IT VADDSWS; + IT VADDUBM; + IT VADDUBS; + IT VADDUHM; + IT VADDUHS; + IT VADDUWM; + IT VADDUWS; + IT VAND; + IT VANDC; + IT VAVGSB; + IT VAVGSH; + IT VAVGSW; + IT VAVGUB; + IT VAVGUH; + IT VAVGUW; + IT VCFSX; + IT VCFUX; + IT VCMPBFP; + IT VCMPBFP_; + IT VCMPEQFP; + IT VCMPEQFP_; + IT VCMPEQUB; + IT VCMPEQUB_; + IT VCMPEQUH; + IT VCMPEQUH_; + IT VCMPEQUW; + IT VCMPEQUW_; + IT VCMPGEFP; + IT VCMPGEFP_; + IT VCMPGTFP; + IT VCMPGTFP_; + IT VCMPGTSB; + IT VCMPGTSB_; + IT VCMPGTSH; + IT VCMPGTSH_; + IT VCMPGTSW; + IT VCMPGTSW_; + IT VCMPGTUB; + IT VCMPGTUB_; + IT VCMPGTUH; + IT VCMPGTUH_; + IT VCMPGTUW; + IT VCMPGTUW_; + IT VCTSXS; + IT VCTUXS; + IT VEXPTEFP; + IT VLOGEFP; + IT VMADDFP; + IT VMAXFP; + IT VMAXSB; + IT VMAXSH; + IT VMAXSW; + IT VMAXUB; + IT VMAXUH; + IT VMAXUW; + IT VMHADDSHS; + IT VMHRADDSHS; + IT VMINFP; + IT VMINSB; + IT VMINSH; + IT VMINSW; + IT VMINUB; + IT VMINUH; + IT VMINUW; + IT VMLADDUHM; + IT VMRGHB; + IT VMRGHH; + IT VMRGHW; + IT VMRGLB; + IT VMRGLH; + IT VMRGLW; + IT VMSUMMBM; + IT VMSUMSHM; + IT VMSUMSHS; + IT VMSUMUBM; + IT VMSUMUHM; + IT VMSUMUHS; + IT VMULESB; + IT VMULESH; + IT VMULEUB; + IT VMULEUH; + IT VMULOSB; + IT VMULOSH; + IT VMULOUB; + IT VMULOUH; + IT VNMSUBFP; + IT VNOR; + IT VOR; + IT VPERM; + IT VPKPX; + IT VPKSHSS; + IT VPKSHUS; + IT VPKSWSS; + IT VPKSWUS; + IT VPKUHUM; + IT VPKUHUS; + IT VPKUWUM; + IT VPKUWUS; + IT VREFP; + IT VRFIM; + IT VRFIN; + IT VRFIP; + IT VRFIZ; + IT VRLB; + IT VRLH; + IT VRLW; + IT VRSQRTEFP; + IT VSEL; + IT VSL; + IT VSLB; + IT VSLDOI; + IT VSLH; + IT VSLO; + IT VSLW; + IT VSPLTB; + IT VSPLTH; + IT VSPLTISB; + IT VSPLTISH; + IT VSPLTISW; + IT VSPLTW; + IT VSR; + IT VSRAB; + IT VSRAH; + IT VSRAW; + IT VSRB; + IT VSRH; + IT VSRO; + IT VSRW; + IT VSUBCUW; + IT VSUBFP; + IT VSUBSBS; + IT VSUBSHS; + IT VSUBSWS; + IT VSUBUBM; + IT VSUBUBS; + IT VSUBUHM; + IT VSUBUHS; + IT VSUBUWM; + IT VSUBUWS; + IT VSUMSWS; + IT VSUM2SWS; + IT VSUM4SBS; + IT VSUM4SHS; + IT VSUM4UBS; + IT VUPKHPX; + IT VUPKHSB; + IT VUPKHSH; + IT VUPKLPX; + IT VUPKLSB; + IT VUPKLSH; + IT VXOR; + IT TDI; + IT TWI; + IT MULLI; + IT SUBFIC; + IT CMPLI; + IT CMPI; + IT ADDIC; + IT ADDI; + IT ADDIS; + IT BC; + IT SC; + IT B; + IT MCRF; + IT BCLR; + IT CRNOR; + IT CRANDC; + IT ISYNC; + IT CRXOR; + IT CRNAND; + IT CRAND; + IT CREQV; + IT CRORC; + IT CROR; + IT BCCTR; + IT RLWIMI; + IT RLWINM; + IT RLWNM; + IT ORI; + IT ORIS; + IT XORI; + IT 
XORIS; + IT ANDI; + IT ANDIS; + IT RLDICL; + IT RLDICR; + IT RLDIC; + IT RLDIMI; + IT RLDCL; + IT RLDCR; + IT CMP; + IT TW; + IT LVSL; + IT LVEBX; + IT SUBFC; + IT ADDC; + IT MULHDU; + IT MULHWU; + IT MFOCRF; + IT LWARX; + IT LDX; + IT LWZX; + IT SLW; + IT CNTLZW; + IT SLD; + IT AND; + IT CMPL; + IT LVSR; + IT LVEHX; + IT SUBF; + IT LDUX; + IT DCBST; + IT LWZUX; + IT CNTLZD; + IT ANDC; + IT TD; + IT LVEWX; + IT MULHD; + IT MULHW; + IT LDARX; + IT DCBF; + IT LBZX; + IT LVX; + IT NEG; + IT LBZUX; + IT NOR; + IT STVEBX; + IT SUBFE; + IT ADDE; + IT MTOCRF; + IT STDX; + IT STWCX; + IT STWX; + IT STVEHX; + IT STDUX; + IT STWUX; + IT STVEWX; + IT SUBFZE; + IT ADDZE; + IT STDCX; + IT STBX; + IT STVX; + IT SUBFME; + IT MULLD; + IT ADDME; + IT MULLW; + IT DCBTST; + IT STBUX; + IT ADD; + IT DCBT; + IT LHZX; + IT EQV; + IT ECIWX; + IT LHZUX; + IT XOR; + IT MFSPR; + IT LWAX; + IT DST; + IT LHAX; + IT LVXL; + IT MFTB; + IT LWAUX; + IT DSTST; + IT LHAUX; + IT STHX; + IT ORC; + IT ECOWX; + IT STHUX; + IT OR; + IT DIVDU; + IT DIVWU; + IT MTSPR; + IT DCBI; + IT NAND; + IT STVXL; + IT DIVD; + IT DIVW; + IT LVLX; + IT LDBRX; + IT LSWX; + IT LWBRX; + IT LFSX; + IT SRW; + IT SRD; + IT LVRX; + IT LSWI; + IT LFSUX; + IT SYNC; + IT LFDX; + IT LFDUX; + IT STVLX; + IT STDBRX; + IT STSWX; + IT STWBRX; + IT STFSX; + IT STVRX; + IT STFSUX; + IT STSWI; + IT STFDX; + IT STFDUX; + IT LVLXL; + IT LHBRX; + IT SRAW; + IT SRAD; + IT LVRXL; + IT DSS; + IT SRAWI; + IT SRADI; + IT EIEIO; + IT STVLXL; + IT STHBRX; + IT EXTSH; + IT STVRXL; + IT EXTSB; + IT STFIWX; + IT EXTSW; + IT ICBI; + IT DCBZ; + IT LWZ; + IT LWZU; + IT LBZ; + IT LBZU; + IT STW; + IT STWU; + IT STB; + IT STBU; + IT LHZ; + IT LHZU; + IT LHA; + IT LHAU; + IT STH; + IT STHU; + IT LMW; + IT STMW; + IT LFS; + IT LFSU; + IT LFD; + IT LFDU; + IT STFS; + IT STFSU; + IT STFD; + IT STFDU; + IT LD; + IT LDU; + IT LWA; + IT STD; + IT STDU; + IT FDIVS; + IT FSUBS; + IT FADDS; + IT FSQRTS; + IT FRES; + IT FMULS; + IT FMADDS; + IT FMSUBS; + IT FNMSUBS; + IT FNMADDS; + IT MTFSB1; + IT MCRFS; + IT MTFSB0; + IT MTFSFI; + IT MFFS; + IT MTFSF; + IT FCMPU; + IT FRSP; + IT FCTIW; + IT FCTIWZ; + IT FDIV; + IT FSUB; + IT FADD; + IT FSQRT; + IT FSEL; + IT FMUL; + IT FRSQRTE; + IT FMSUB; + IT FMADD; + IT FNMSUB; + IT FNMADD; + IT FCMPO; + IT FNEG; + IT FMR; + IT FNABS; + IT FABS; + IT FCTID; + IT FCTIDZ; + IT FCFID; + IT UNK; + + IT SUBFCO; + IT ADDCO; + IT SUBFO; + IT NEGO; + IT SUBFEO; + IT ADDEO; + IT SUBFZEO; + IT ADDZEO; + IT SUBFMEO; + IT MULLDO; + IT ADDMEO; + IT MULLWO; + IT ADDO; + IT DIVDUO; + IT DIVWUO; + IT DIVDO; + IT DIVWO; + + IT SUBFCO_; + IT ADDCO_; + IT SUBFO_; + IT NEGO_; + IT SUBFEO_; + IT ADDEO_; + IT SUBFZEO_; + IT ADDZEO_; + IT SUBFMEO_; + IT MULLDO_; + IT ADDMEO_; + IT MULLWO_; + IT ADDO_; + IT DIVDUO_; + IT DIVWUO_; + IT DIVDO_; + IT DIVWO_; + + IT RLWIMI_; + IT RLWINM_; + IT RLWNM_; + IT RLDICL_; + IT RLDICR_; + IT RLDIC_; + IT RLDIMI_; + IT RLDCL_; + IT RLDCR_; + IT SUBFC_; + IT MULHDU_; + IT ADDC_; + IT MULHWU_; + IT SLW_; + IT CNTLZW_; + IT SLD_; + IT AND_; + IT SUBF_; + IT CNTLZD_; + IT ANDC_; + IT MULHD_; + IT MULHW_; + IT NEG_; + IT NOR_; + IT SUBFE_; + IT ADDE_; + IT SUBFZE_; + IT ADDZE_; + IT MULLD_; + IT SUBFME_; + IT ADDME_; + IT MULLW_; + IT ADD_; + IT EQV_; + IT XOR_; + IT ORC_; + IT OR_; + IT DIVDU_; + IT DIVWU_; + IT NAND_; + IT DIVD_; + IT DIVW_; + IT SRW_; + IT SRD_; + IT SRAW_; + IT SRAD_; + IT SRAWI_; + IT SRADI_; + IT EXTSH_; + IT EXTSB_; + IT EXTSW_; + IT FDIVS_; + IT FSUBS_; + IT FADDS_; + IT FSQRTS_; + IT FRES_; + IT FMULS_; + IT FMADDS_; 
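Note: the IT members declared in this struct are pointers of the new ppu_intrp_func_t type, which also receives the instruction pointer (this_op) and the next handler, so execution can be threaded from one handler to the next instead of each handler returning bool to a central loop. A simplified stand-in showing roughly how such a handler could chain; this is an assumed shape, not the patch's actual dispatch code.

#include <cstdint>

struct cpu;
struct next_entry;
using intrp_func_t = void(*)(cpu&, std::uint32_t op, const std::uint32_t* this_op, next_entry* next);
struct next_entry { intrp_func_t fn; };
struct cpu { std::uint64_t exec_bytes = 0; std::uint32_t cia = 0; };

// A handler derives its program counter from this_op (the real code uses
// vm::get_addr), updates exec_bytes, then continues with the next handler.
void nop_like(cpu& c, std::uint32_t /*op*/, const std::uint32_t* this_op, next_entry* next)
{
    c.cia += 4;
    c.exec_bytes += 4;
    if (next)
        next->fn(c, this_op[1], this_op + 1, next + 1);
}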
+ IT FMSUBS_; + IT FNMSUBS_; + IT FNMADDS_; + IT MTFSB1_; + IT MTFSB0_; + IT MTFSFI_; + IT MFFS_; + IT MTFSF_; + IT FRSP_; + IT FCTIW_; + IT FCTIWZ_; + IT FDIV_; + IT FSUB_; + IT FADD_; + IT FSQRT_; + IT FSEL_; + IT FMUL_; + IT FRSQRTE_; + IT FMSUB_; + IT FMADD_; + IT FNMSUB_; + IT FNMADD_; + IT FNEG_; + IT FMR_; + IT FNABS_; + IT FABS_; + IT FCTID_; + IT FCTIDZ_; + IT FCFID_; + + /* Optimized variants */ +}; + +ppu_interpreter_rt_base::ppu_interpreter_rt_base() noexcept +{ + // Obtain required set of flags from settings + bs_t selected{}; + if (g_cfg.core.ppu_set_sat_bit) + selected += set_sat; + if (g_cfg.core.ppu_use_nj_bit) + selected += use_nj; + if (g_cfg.core.ppu_set_vnan) + selected += set_vnan; + if (g_cfg.core.ppu_fix_vnan) + selected += fix_vnan; + if (g_cfg.core.ppu_set_fpcc) + selected += set_fpcc; + if (g_cfg.core.use_accurate_dfma) + selected += use_dfma; + if (g_cfg.core.ppu_debug) + selected += set_cr_stats; // TODO + + ptrs = std::make_unique(); + +#ifndef __INTELLISENSE__ + +#define INIT_VCMP(name) \ + ptrs->name = ::name<0>(); \ + ptrs->name##_ = ::name<0, has_oe>(); \ + +#define INIT_OV(name) \ + ptrs->name = ::name<0>(); \ + ptrs->name##O = ::name<0, has_oe>(); \ + +#define INIT_RC(name) \ + ptrs->name = ::name<0xf1a6>()(selected, []() { \ + return ::name<0, Flags...>(); \ + }); \ + ptrs->name##_ = ::name<0xf1a6, set_fpcc>()(selected, []() { \ + /* Minor optimization: has_rc implies set_fpcc so don't compile has_rc alone */ \ + return ::name<0, has_rc, Flags...>(); \ + }); \ + +#define INIT_RC_OV(name) \ + ptrs->name = ::name<0>(); \ + ptrs->name##O = ::name<0, has_oe>(); \ + ptrs->name##_ = ::name<0, has_rc>(); \ + ptrs->name##O_ = ::name<0, has_oe, has_rc>(); \ + + // Initialize instructions with their own sets of supported flags (except INIT_VCMP, INIT_OV, INIT_RC_OV) +#define INIT(name) \ + ptrs->name = ::name<0xf1a6>()(selected, []() { \ + return ::name<0, Flags...>(); \ + }); \ + + INIT(MFVSCR); + INIT(MTVSCR); + INIT(VADDCUW); + INIT(VADDFP); + INIT(VADDSBS); + INIT(VADDSHS); + INIT(VADDSWS); + INIT(VADDUBM); + INIT(VADDUBS); + INIT(VADDUHM); + INIT(VADDUHS); + INIT(VADDUWM); + INIT(VADDUWS); + INIT(VAND); + INIT(VANDC); + INIT(VAVGSB); + INIT(VAVGSH); + INIT(VAVGSW); + INIT(VAVGUB); + INIT(VAVGUH); + INIT(VAVGUW); + INIT(VCFSX); + INIT(VCFUX); + INIT_VCMP(VCMPBFP); + INIT_VCMP(VCMPEQFP); + INIT_VCMP(VCMPEQUB); + INIT_VCMP(VCMPEQUH); + INIT_VCMP(VCMPEQUW); + INIT_VCMP(VCMPGEFP); + INIT_VCMP(VCMPGTFP); + INIT_VCMP(VCMPGTSB); + INIT_VCMP(VCMPGTSH); + INIT_VCMP(VCMPGTSW); + INIT_VCMP(VCMPGTUB); + INIT_VCMP(VCMPGTUH); + INIT_VCMP(VCMPGTUW); + INIT(VCTSXS); + INIT(VCTUXS); + INIT(VEXPTEFP); + INIT(VLOGEFP); + INIT(VMADDFP); + INIT(VMAXFP); + INIT(VMAXSB); + INIT(VMAXSH); + INIT(VMAXSW); + INIT(VMAXUB); + INIT(VMAXUH); + INIT(VMAXUW); + INIT(VMHADDSHS); + INIT(VMHRADDSHS); + INIT(VMINFP); + INIT(VMINSB); + INIT(VMINSH); + INIT(VMINSW); + INIT(VMINUB); + INIT(VMINUH); + INIT(VMINUW); + INIT(VMLADDUHM); + INIT(VMRGHB); + INIT(VMRGHH); + INIT(VMRGHW); + INIT(VMRGLB); + INIT(VMRGLH); + INIT(VMRGLW); + INIT(VMSUMMBM); + INIT(VMSUMSHM); + INIT(VMSUMSHS); + INIT(VMSUMUBM); + INIT(VMSUMUHM); + INIT(VMSUMUHS); + INIT(VMULESB); + INIT(VMULESH); + INIT(VMULEUB); + INIT(VMULEUH); + INIT(VMULOSB); + INIT(VMULOSH); + INIT(VMULOUB); + INIT(VMULOUH); + INIT(VNMSUBFP); + INIT(VNOR); + INIT(VOR); + INIT(VPERM); + INIT(VPKPX); + INIT(VPKSHSS); + INIT(VPKSHUS); + INIT(VPKSWSS); + INIT(VPKSWUS); + INIT(VPKUHUM); + INIT(VPKUHUS); + INIT(VPKUWUM); + INIT(VPKUWUS); + INIT(VREFP); + 
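Note: the INIT/INIT_RC macros defined above turn the run-time flag set built from g_cfg into a choice among pre-compiled template specializations: the 0xf1a6 instantiation acts as a selector that receives the flag set and a generator lambda. A heavily simplified sketch of that selection step follows, using only two boolean flags; the real selector covers the whole ppu_exec_bit set.

#include <bitset>

enum flag_bit { set_fpcc_bit, use_dfma_bit, flag_count };
using flags_t = std::bitset<flag_count>;
using fn_t = void(*)();

template <bool Fpcc, bool Dfma>
void instr() { /* body specialized at compile time on both flags */ }

// Pick one pre-compiled specialization based on settings read once at startup.
fn_t select(const flags_t& f)
{
    if (f[set_fpcc_bit])
        return f[use_dfma_bit] ? &instr<true, true> : &instr<true, false>;
    else
        return f[use_dfma_bit] ? &instr<false, true> : &instr<false, false>;
}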
INIT(VRFIM); + INIT(VRFIN); + INIT(VRFIP); + INIT(VRFIZ); + INIT(VRLB); + INIT(VRLH); + INIT(VRLW); + INIT(VRSQRTEFP); + INIT(VSEL); + INIT(VSL); + INIT(VSLB); + INIT(VSLDOI); + INIT(VSLH); + INIT(VSLO); + INIT(VSLW); + INIT(VSPLTB); + INIT(VSPLTH); + INIT(VSPLTISB); + INIT(VSPLTISH); + INIT(VSPLTISW); + INIT(VSPLTW); + INIT(VSR); + INIT(VSRAB); + INIT(VSRAH); + INIT(VSRAW); + INIT(VSRB); + INIT(VSRH); + INIT(VSRO); + INIT(VSRW); + INIT(VSUBCUW); + INIT(VSUBFP); + INIT(VSUBSBS); + INIT(VSUBSHS); + INIT(VSUBSWS); + INIT(VSUBUBM); + INIT(VSUBUBS); + INIT(VSUBUHM); + INIT(VSUBUHS); + INIT(VSUBUWM); + INIT(VSUBUWS); + INIT(VSUMSWS); + INIT(VSUM2SWS); + INIT(VSUM4SBS); + INIT(VSUM4SHS); + INIT(VSUM4UBS); + INIT(VUPKHPX); + INIT(VUPKHSB); + INIT(VUPKHSH); + INIT(VUPKLPX); + INIT(VUPKLSB); + INIT(VUPKLSH); + INIT(VXOR); + INIT(TDI); + INIT(TWI); + INIT(MULLI); + INIT(SUBFIC); + INIT(CMPLI); + INIT(CMPI); + INIT(ADDIC); + INIT(ADDI); + INIT(ADDIS); + INIT(BC); + INIT(SC); + INIT(B); + INIT(MCRF); + INIT(BCLR); + INIT(CRNOR); + INIT(CRANDC); + INIT(ISYNC); + INIT(CRXOR); + INIT(CRNAND); + INIT(CRAND); + INIT(CREQV); + INIT(CRORC); + INIT(CROR); + INIT(BCCTR); + INIT_RC(RLWIMI); + INIT_RC(RLWINM); + INIT_RC(RLWNM); + INIT(ORI); + INIT(ORIS); + INIT(XORI); + INIT(XORIS); + INIT(ANDI); + INIT(ANDIS); + INIT_RC(RLDICL); + INIT_RC(RLDICR); + INIT_RC(RLDIC); + INIT_RC(RLDIMI); + INIT_RC(RLDCL); + INIT_RC(RLDCR); + INIT(CMP); + INIT(TW); + INIT(LVSL); + INIT(LVEBX); + INIT_RC_OV(SUBFC); + INIT_RC_OV(ADDC); + INIT_RC(MULHDU); + INIT_RC(MULHWU); + INIT(MFOCRF); + INIT(LWARX); + INIT(LDX); + INIT(LWZX); + INIT_RC(SLW); + INIT_RC(CNTLZW); + INIT_RC(SLD); + INIT_RC(AND); + INIT(CMPL); + INIT(LVSR); + INIT(LVEHX); + INIT_RC_OV(SUBF); + INIT(LDUX); + INIT(DCBST); + INIT(LWZUX); + INIT_RC(CNTLZD); + INIT_RC(ANDC); + INIT(TD); + INIT(LVEWX); + INIT_RC(MULHD); + INIT_RC(MULHW); + INIT(LDARX); + INIT(DCBF); + INIT(LBZX); + INIT(LVX); + INIT_RC_OV(NEG); + INIT(LBZUX); + INIT_RC(NOR); + INIT(STVEBX); + INIT_OV(SUBFE); + INIT_OV(ADDE); + INIT(MTOCRF); + INIT(STDX); + INIT(STWCX); + INIT(STWX); + INIT(STVEHX); + INIT(STDUX); + INIT(STWUX); + INIT(STVEWX); + INIT_RC_OV(SUBFZE); + INIT_RC_OV(ADDZE); + INIT(STDCX); + INIT(STBX); + INIT(STVX); + INIT_RC_OV(SUBFME); + INIT_RC_OV(MULLD); + INIT_RC_OV(ADDME); + INIT_RC_OV(MULLW); + INIT(DCBTST); + INIT(STBUX); + INIT_RC_OV(ADD); + INIT(DCBT); + INIT(LHZX); + INIT_RC(EQV); + INIT(ECIWX); + INIT(LHZUX); + INIT_RC(XOR); + INIT(MFSPR); + INIT(LWAX); + INIT(DST); + INIT(LHAX); + INIT(LVXL); + INIT(MFTB); + INIT(LWAUX); + INIT(DSTST); + INIT(LHAUX); + INIT(STHX); + INIT_RC(ORC); + INIT(ECOWX); + INIT(STHUX); + INIT_RC(OR); + INIT_RC_OV(DIVDU); + INIT_RC_OV(DIVWU); + INIT(MTSPR); + INIT(DCBI); + INIT_RC(NAND); + INIT(STVXL); + INIT_RC_OV(DIVD); + INIT_RC_OV(DIVW); + INIT(LVLX); + INIT(LDBRX); + INIT(LSWX); + INIT(LWBRX); + INIT(LFSX); + INIT_RC(SRW); + INIT_RC(SRD); + INIT(LVRX); + INIT(LSWI); + INIT(LFSUX); + INIT(SYNC); + INIT(LFDX); + INIT(LFDUX); + INIT(STVLX); + INIT(STDBRX); + INIT(STSWX); + INIT(STWBRX); + INIT(STFSX); + INIT(STVRX); + INIT(STFSUX); + INIT(STSWI); + INIT(STFDX); + INIT(STFDUX); + INIT(LVLXL); + INIT(LHBRX); + INIT_RC(SRAW); + INIT_RC(SRAD); + INIT(LVRXL); + INIT(DSS); + INIT_RC(SRAWI); + INIT_RC(SRADI); + INIT(EIEIO); + INIT(STVLXL); + INIT(STHBRX); + INIT_RC(EXTSH); + INIT(STVRXL); + INIT_RC(EXTSB); + INIT(STFIWX); + INIT_RC(EXTSW); + INIT(ICBI); + INIT(DCBZ); + INIT(LWZ); + INIT(LWZU); + INIT(LBZ); + INIT(LBZU); + INIT(STW); + INIT(STWU); + INIT(STB); + 
INIT(STBU); + INIT(LHZ); + INIT(LHZU); + INIT(LHA); + INIT(LHAU); + INIT(STH); + INIT(STHU); + INIT(LMW); + INIT(STMW); + INIT(LFS); + INIT(LFSU); + INIT(LFD); + INIT(LFDU); + INIT(STFS); + INIT(STFSU); + INIT(STFD); + INIT(STFDU); + INIT(LD); + INIT(LDU); + INIT(LWA); + INIT(STD); + INIT(STDU); + INIT_RC(FDIVS); + INIT_RC(FSUBS); + INIT_RC(FADDS); + INIT_RC(FSQRTS); + INIT_RC(FRES); + INIT_RC(FMULS); + INIT_RC(FMADDS); + INIT_RC(FMSUBS); + INIT_RC(FNMSUBS); + INIT_RC(FNMADDS); + INIT_RC(MTFSB1); + INIT(MCRFS); + INIT_RC(MTFSB0); + INIT_RC(MTFSFI); + INIT_RC(MFFS); + INIT_RC(MTFSF); + INIT(FCMPU); + INIT_RC(FRSP); + INIT_RC(FCTIW); + INIT_RC(FCTIWZ); + INIT_RC(FDIV); + INIT_RC(FSUB); + INIT_RC(FADD); + INIT_RC(FSQRT); + INIT_RC(FSEL); + INIT_RC(FMUL); + INIT_RC(FRSQRTE); + INIT_RC(FMSUB); + INIT_RC(FMADD); + INIT_RC(FNMSUB); + INIT_RC(FNMADD); + INIT(FCMPO); + INIT_RC(FNEG); + INIT_RC(FMR); + INIT_RC(FNABS); + INIT_RC(FABS); + INIT_RC(FCTID); + INIT_RC(FCTIDZ); + INIT_RC(FCFID); + INIT(UNK); +#endif +} + +ppu_interpreter_rt_base::~ppu_interpreter_rt_base() +{ +} + +ppu_interpreter_rt::ppu_interpreter_rt() noexcept + : ppu_interpreter_rt_base() + , table(*ptrs) +{ +} + +ppu_intrp_func_t ppu_interpreter_rt::decode(u32 opv) const noexcept +{ + const auto op = ppu_opcode_t{opv}; + + switch (g_ppu_itype.decode(opv)) + { + case ppu_itype::LWZ: + case ppu_itype::LBZ: + case ppu_itype::STW: + case ppu_itype::STB: + case ppu_itype::LHZ: + case ppu_itype::LHA: + case ppu_itype::STH: + case ppu_itype::LFS: + case ppu_itype::LFD: + case ppu_itype::STFS: + case ppu_itype::STFD: + { + // Minor optimization: 16-bit absolute addressing never points to a valid memory + if (!op.ra) + { + return [](ppu_thread&, ppu_opcode_t op, be_t*, ppu_intrp_func*) + { + fmt::throw_exception("Invalid instruction: %s r%d,0x%016x(r0)", g_ppu_iname.decode(op.opcode), op.rd, op.simm16); + }; + } + + break; + } + default: break; } - fmt::throw_exception("Unknown/Illegal opcode: 0x%08x at 0x%x", op.opcode, ppu.cia); + return table.decode(opv); } diff --git a/rpcs3/Emu/Cell/PPUInterpreter.h b/rpcs3/Emu/Cell/PPUInterpreter.h index 77b91e78cf..5b8f1037fd 100644 --- a/rpcs3/Emu/Cell/PPUInterpreter.h +++ b/rpcs3/Emu/Cell/PPUInterpreter.h @@ -4,454 +4,41 @@ class ppu_thread; -using ppu_inter_func_t = bool(*)(ppu_thread& ppu, ppu_opcode_t op); +using ppu_intrp_func_t = void(*)(ppu_thread& ppu_, ppu_opcode_t op, be_t* this_op, struct ppu_intrp_func* next_fn); -struct ppu_interpreter +struct ppu_intrp_func { - static bool MFVSCR(ppu_thread&, ppu_opcode_t); - static bool MTVSCR(ppu_thread&, ppu_opcode_t); - static bool VADDCUW(ppu_thread&, ppu_opcode_t); - static bool VADDFP(ppu_thread&, ppu_opcode_t); - static bool VADDUBM(ppu_thread&, ppu_opcode_t); - static bool VADDUHM(ppu_thread&, ppu_opcode_t); - static bool VADDUWM(ppu_thread&, ppu_opcode_t); - static bool VAND(ppu_thread&, ppu_opcode_t); - static bool VANDC(ppu_thread&, ppu_opcode_t); - static bool VAVGSB(ppu_thread&, ppu_opcode_t); - static bool VAVGSH(ppu_thread&, ppu_opcode_t); - static bool VAVGSW(ppu_thread&, ppu_opcode_t); - static bool VAVGUB(ppu_thread&, ppu_opcode_t); - static bool VAVGUH(ppu_thread&, ppu_opcode_t); - static bool VAVGUW(ppu_thread&, ppu_opcode_t); - static bool VCFSX(ppu_thread&, ppu_opcode_t); - static bool VCFUX(ppu_thread&, ppu_opcode_t); - static bool VCMPBFP(ppu_thread&, ppu_opcode_t); - static bool VCMPEQFP(ppu_thread&, ppu_opcode_t); - static bool VCMPEQUB(ppu_thread&, ppu_opcode_t); - static bool VCMPEQUH(ppu_thread&, ppu_opcode_t); - static bool 
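Note: decode() above filters D-form loads/stores with RA=0 before consulting the table, since a 16-bit absolute effective address can never reach mapped guest memory; this is also why the handlers themselves can use op.ra || 1. A stand-alone sketch of that pre-decode filtering, with stub names of my own:

#include <cstdint>

using func_t = void(*)();
void table_lookup_stub() {}  // would be table.decode(opv) in the real code
void invalid_stub()      {}  // would be the throwing lambda in the real code

func_t decode_like(std::uint32_t opv)
{
    const std::uint32_t main_op = opv >> 26;        // primary opcode, bits 0..5
    const std::uint32_t ra      = (opv >> 16) & 31; // RA field of a D-form load/store

    // 0x20 is lwz; with ra == 0 the EA is a 16-bit absolute address, which the
    // patch treats as never mapping to valid guest memory.
    if (main_op == 0x20 && ra == 0)
        return &invalid_stub;

    return &table_lookup_stub;
}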
VCMPEQUW(ppu_thread&, ppu_opcode_t); - static bool VCMPGEFP(ppu_thread&, ppu_opcode_t); - static bool VCMPGTFP(ppu_thread&, ppu_opcode_t); - static bool VCMPGTSB(ppu_thread&, ppu_opcode_t); - static bool VCMPGTSH(ppu_thread&, ppu_opcode_t); - static bool VCMPGTSW(ppu_thread&, ppu_opcode_t); - static bool VCMPGTUB(ppu_thread&, ppu_opcode_t); - static bool VCMPGTUH(ppu_thread&, ppu_opcode_t); - static bool VCMPGTUW(ppu_thread&, ppu_opcode_t); - static bool VEXPTEFP(ppu_thread&, ppu_opcode_t); - static bool VLOGEFP(ppu_thread&, ppu_opcode_t); - static bool VMAXFP(ppu_thread&, ppu_opcode_t); - static bool VMAXSB(ppu_thread&, ppu_opcode_t); - static bool VMAXSH(ppu_thread&, ppu_opcode_t); - static bool VMAXSW(ppu_thread&, ppu_opcode_t); - static bool VMAXUB(ppu_thread&, ppu_opcode_t); - static bool VMAXUH(ppu_thread&, ppu_opcode_t); - static bool VMAXUW(ppu_thread&, ppu_opcode_t); - static bool VMINFP(ppu_thread&, ppu_opcode_t); - static bool VMINSB(ppu_thread&, ppu_opcode_t); - static bool VMINSH(ppu_thread&, ppu_opcode_t); - static bool VMINSW(ppu_thread&, ppu_opcode_t); - static bool VMINUB(ppu_thread&, ppu_opcode_t); - static bool VMINUH(ppu_thread&, ppu_opcode_t); - static bool VMINUW(ppu_thread&, ppu_opcode_t); - static bool VMLADDUHM(ppu_thread&, ppu_opcode_t); - static bool VMRGHB(ppu_thread&, ppu_opcode_t); - static bool VMRGHH(ppu_thread&, ppu_opcode_t); - static bool VMRGHW(ppu_thread&, ppu_opcode_t); - static bool VMRGLB(ppu_thread&, ppu_opcode_t); - static bool VMRGLH(ppu_thread&, ppu_opcode_t); - static bool VMRGLW(ppu_thread&, ppu_opcode_t); - static bool VMSUMMBM(ppu_thread&, ppu_opcode_t); - static bool VMSUMSHM(ppu_thread&, ppu_opcode_t); - static bool VMSUMUBM(ppu_thread&, ppu_opcode_t); - static bool VMSUMUHM(ppu_thread&, ppu_opcode_t); - static bool VMULESB(ppu_thread&, ppu_opcode_t); - static bool VMULESH(ppu_thread&, ppu_opcode_t); - static bool VMULEUB(ppu_thread&, ppu_opcode_t); - static bool VMULEUH(ppu_thread&, ppu_opcode_t); - static bool VMULOSB(ppu_thread&, ppu_opcode_t); - static bool VMULOSH(ppu_thread&, ppu_opcode_t); - static bool VMULOUB(ppu_thread&, ppu_opcode_t); - static bool VMULOUH(ppu_thread&, ppu_opcode_t); - static bool VNOR(ppu_thread&, ppu_opcode_t); - static bool VOR(ppu_thread&, ppu_opcode_t); - static bool VPERM(ppu_thread&, ppu_opcode_t); - static bool VPKPX(ppu_thread&, ppu_opcode_t); - static bool VPKUHUM(ppu_thread&, ppu_opcode_t); - static bool VPKUWUM(ppu_thread&, ppu_opcode_t); - static bool VREFP(ppu_thread&, ppu_opcode_t); - static bool VRFIM(ppu_thread&, ppu_opcode_t); - static bool VRFIN(ppu_thread&, ppu_opcode_t); - static bool VRFIP(ppu_thread&, ppu_opcode_t); - static bool VRFIZ(ppu_thread&, ppu_opcode_t); - static bool VRLB(ppu_thread&, ppu_opcode_t); - static bool VRLH(ppu_thread&, ppu_opcode_t); - static bool VRLW(ppu_thread&, ppu_opcode_t); - static bool VRSQRTEFP(ppu_thread&, ppu_opcode_t); - static bool VSEL(ppu_thread&, ppu_opcode_t); - static bool VSL(ppu_thread&, ppu_opcode_t); - static bool VSLB(ppu_thread&, ppu_opcode_t); - static bool VSLDOI(ppu_thread&, ppu_opcode_t); - static bool VSLH(ppu_thread&, ppu_opcode_t); - static bool VSLO(ppu_thread&, ppu_opcode_t); - static bool VSLW(ppu_thread&, ppu_opcode_t); - static bool VSPLTB(ppu_thread&, ppu_opcode_t); - static bool VSPLTH(ppu_thread&, ppu_opcode_t); - static bool VSPLTISB(ppu_thread&, ppu_opcode_t); - static bool VSPLTISH(ppu_thread&, ppu_opcode_t); - static bool VSPLTISW(ppu_thread&, ppu_opcode_t); - static bool VSPLTW(ppu_thread&, ppu_opcode_t); - static bool 
VSR(ppu_thread&, ppu_opcode_t); - static bool VSRAB(ppu_thread&, ppu_opcode_t); - static bool VSRAH(ppu_thread&, ppu_opcode_t); - static bool VSRAW(ppu_thread&, ppu_opcode_t); - static bool VSRB(ppu_thread&, ppu_opcode_t); - static bool VSRH(ppu_thread&, ppu_opcode_t); - static bool VSRO(ppu_thread&, ppu_opcode_t); - static bool VSRW(ppu_thread&, ppu_opcode_t); - static bool VSUBCUW(ppu_thread&, ppu_opcode_t); - static bool VSUBFP(ppu_thread&, ppu_opcode_t); - static bool VSUBUBM(ppu_thread&, ppu_opcode_t); - static bool VSUBUHM(ppu_thread&, ppu_opcode_t); - static bool VSUBUWM(ppu_thread&, ppu_opcode_t); - static bool VUPKHPX(ppu_thread&, ppu_opcode_t); - static bool VUPKHSB(ppu_thread&, ppu_opcode_t); - static bool VUPKHSH(ppu_thread&, ppu_opcode_t); - static bool VUPKLPX(ppu_thread&, ppu_opcode_t); - static bool VUPKLSB(ppu_thread&, ppu_opcode_t); - static bool VUPKLSH(ppu_thread&, ppu_opcode_t); - static bool VXOR(ppu_thread&, ppu_opcode_t); - static bool TDI(ppu_thread&, ppu_opcode_t); - static bool TWI(ppu_thread&, ppu_opcode_t); - static bool MULLI(ppu_thread&, ppu_opcode_t); - static bool SUBFIC(ppu_thread&, ppu_opcode_t); - static bool CMPLI(ppu_thread&, ppu_opcode_t); - static bool CMPI(ppu_thread&, ppu_opcode_t); - static bool ADDIC(ppu_thread&, ppu_opcode_t); - static bool ADDI(ppu_thread&, ppu_opcode_t); - static bool ADDIS(ppu_thread&, ppu_opcode_t); - static bool BC(ppu_thread&, ppu_opcode_t); - static bool SC(ppu_thread&, ppu_opcode_t); - static bool B(ppu_thread&, ppu_opcode_t); - static bool MCRF(ppu_thread&, ppu_opcode_t); - static bool BCLR(ppu_thread&, ppu_opcode_t); - static bool CRNOR(ppu_thread&, ppu_opcode_t); - static bool CRANDC(ppu_thread&, ppu_opcode_t); - static bool ISYNC(ppu_thread&, ppu_opcode_t); - static bool CRXOR(ppu_thread&, ppu_opcode_t); - static bool CRNAND(ppu_thread&, ppu_opcode_t); - static bool CRAND(ppu_thread&, ppu_opcode_t); - static bool CREQV(ppu_thread&, ppu_opcode_t); - static bool CRORC(ppu_thread&, ppu_opcode_t); - static bool CROR(ppu_thread&, ppu_opcode_t); - static bool BCCTR(ppu_thread&, ppu_opcode_t); - static bool RLWIMI(ppu_thread&, ppu_opcode_t); - static bool RLWINM(ppu_thread&, ppu_opcode_t); - static bool RLWNM(ppu_thread&, ppu_opcode_t); - static bool ORI(ppu_thread&, ppu_opcode_t); - static bool ORIS(ppu_thread&, ppu_opcode_t); - static bool XORI(ppu_thread&, ppu_opcode_t); - static bool XORIS(ppu_thread&, ppu_opcode_t); - static bool ANDI(ppu_thread&, ppu_opcode_t); - static bool ANDIS(ppu_thread&, ppu_opcode_t); - static bool RLDICL(ppu_thread&, ppu_opcode_t); - static bool RLDICR(ppu_thread&, ppu_opcode_t); - static bool RLDIC(ppu_thread&, ppu_opcode_t); - static bool RLDIMI(ppu_thread&, ppu_opcode_t); - static bool RLDCL(ppu_thread&, ppu_opcode_t); - static bool RLDCR(ppu_thread&, ppu_opcode_t); - static bool CMP(ppu_thread&, ppu_opcode_t); - static bool TW(ppu_thread&, ppu_opcode_t); - static bool LVSL(ppu_thread&, ppu_opcode_t); - static bool LVEBX(ppu_thread&, ppu_opcode_t); - static bool SUBFC(ppu_thread&, ppu_opcode_t); - static bool MULHDU(ppu_thread&, ppu_opcode_t); - static bool ADDC(ppu_thread&, ppu_opcode_t); - static bool MULHWU(ppu_thread&, ppu_opcode_t); - static bool MFOCRF(ppu_thread&, ppu_opcode_t); - static bool LWARX(ppu_thread&, ppu_opcode_t); - static bool LDX(ppu_thread&, ppu_opcode_t); - static bool LWZX(ppu_thread&, ppu_opcode_t); - static bool SLW(ppu_thread&, ppu_opcode_t); - static bool CNTLZW(ppu_thread&, ppu_opcode_t); - static bool SLD(ppu_thread&, ppu_opcode_t); - static bool 
AND(ppu_thread&, ppu_opcode_t); - static bool CMPL(ppu_thread&, ppu_opcode_t); - static bool LVSR(ppu_thread&, ppu_opcode_t); - static bool LVEHX(ppu_thread&, ppu_opcode_t); - static bool SUBF(ppu_thread&, ppu_opcode_t); - static bool LDUX(ppu_thread&, ppu_opcode_t); - static bool DCBST(ppu_thread&, ppu_opcode_t); - static bool LWZUX(ppu_thread&, ppu_opcode_t); - static bool CNTLZD(ppu_thread&, ppu_opcode_t); - static bool ANDC(ppu_thread&, ppu_opcode_t); - static bool TD(ppu_thread&, ppu_opcode_t); - static bool LVEWX(ppu_thread&, ppu_opcode_t); - static bool MULHD(ppu_thread&, ppu_opcode_t); - static bool MULHW(ppu_thread&, ppu_opcode_t); - static bool LDARX(ppu_thread&, ppu_opcode_t); - static bool DCBF(ppu_thread&, ppu_opcode_t); - static bool LBZX(ppu_thread&, ppu_opcode_t); - static bool LVX(ppu_thread&, ppu_opcode_t); - static bool NEG(ppu_thread&, ppu_opcode_t); - static bool LBZUX(ppu_thread&, ppu_opcode_t); - static bool NOR(ppu_thread&, ppu_opcode_t); - static bool STVEBX(ppu_thread&, ppu_opcode_t); - static bool SUBFE(ppu_thread&, ppu_opcode_t); - static bool ADDE(ppu_thread&, ppu_opcode_t); - static bool MTOCRF(ppu_thread&, ppu_opcode_t); - static bool STDX(ppu_thread&, ppu_opcode_t); - static bool STWCX(ppu_thread&, ppu_opcode_t); - static bool STWX(ppu_thread&, ppu_opcode_t); - static bool STVEHX(ppu_thread&, ppu_opcode_t); - static bool STDUX(ppu_thread&, ppu_opcode_t); - static bool STWUX(ppu_thread&, ppu_opcode_t); - static bool STVEWX(ppu_thread&, ppu_opcode_t); - static bool SUBFZE(ppu_thread&, ppu_opcode_t); - static bool ADDZE(ppu_thread&, ppu_opcode_t); - static bool STDCX(ppu_thread&, ppu_opcode_t); - static bool STBX(ppu_thread&, ppu_opcode_t); - static bool STVX(ppu_thread&, ppu_opcode_t); - static bool MULLD(ppu_thread&, ppu_opcode_t); - static bool SUBFME(ppu_thread&, ppu_opcode_t); - static bool ADDME(ppu_thread&, ppu_opcode_t); - static bool MULLW(ppu_thread&, ppu_opcode_t); - static bool DCBTST(ppu_thread&, ppu_opcode_t); - static bool STBUX(ppu_thread&, ppu_opcode_t); - static bool ADD(ppu_thread&, ppu_opcode_t); - static bool DCBT(ppu_thread&, ppu_opcode_t); - static bool LHZX(ppu_thread&, ppu_opcode_t); - static bool EQV(ppu_thread&, ppu_opcode_t); - static bool ECIWX(ppu_thread&, ppu_opcode_t); - static bool LHZUX(ppu_thread&, ppu_opcode_t); - static bool XOR(ppu_thread&, ppu_opcode_t); - static bool MFSPR(ppu_thread&, ppu_opcode_t); - static bool LWAX(ppu_thread&, ppu_opcode_t); - static bool DST(ppu_thread&, ppu_opcode_t); - static bool LHAX(ppu_thread&, ppu_opcode_t); - static bool LVXL(ppu_thread&, ppu_opcode_t); - static bool MFTB(ppu_thread&, ppu_opcode_t); - static bool LWAUX(ppu_thread&, ppu_opcode_t); - static bool DSTST(ppu_thread&, ppu_opcode_t); - static bool LHAUX(ppu_thread&, ppu_opcode_t); - static bool STHX(ppu_thread&, ppu_opcode_t); - static bool ORC(ppu_thread&, ppu_opcode_t); - static bool ECOWX(ppu_thread&, ppu_opcode_t); - static bool STHUX(ppu_thread&, ppu_opcode_t); - static bool OR(ppu_thread&, ppu_opcode_t); - static bool DIVDU(ppu_thread&, ppu_opcode_t); - static bool DIVWU(ppu_thread&, ppu_opcode_t); - static bool MTSPR(ppu_thread&, ppu_opcode_t); - static bool DCBI(ppu_thread&, ppu_opcode_t); - static bool NAND(ppu_thread&, ppu_opcode_t); - static bool STVXL(ppu_thread&, ppu_opcode_t); - static bool DIVD(ppu_thread&, ppu_opcode_t); - static bool DIVW(ppu_thread&, ppu_opcode_t); - static bool LDBRX(ppu_thread&, ppu_opcode_t); - static bool LSWX(ppu_thread&, ppu_opcode_t); - static bool LWBRX(ppu_thread&, ppu_opcode_t); - static 
bool LFSX(ppu_thread&, ppu_opcode_t); - static bool SRW(ppu_thread&, ppu_opcode_t); - static bool SRD(ppu_thread&, ppu_opcode_t); - static bool LSWI(ppu_thread&, ppu_opcode_t); - static bool LFSUX(ppu_thread&, ppu_opcode_t); - static bool SYNC(ppu_thread&, ppu_opcode_t); - static bool LFDX(ppu_thread&, ppu_opcode_t); - static bool LFDUX(ppu_thread&, ppu_opcode_t); - static bool STDBRX(ppu_thread&, ppu_opcode_t); - static bool STSWX(ppu_thread&, ppu_opcode_t); - static bool STWBRX(ppu_thread&, ppu_opcode_t); - static bool STFSX(ppu_thread&, ppu_opcode_t); - static bool STFSUX(ppu_thread&, ppu_opcode_t); - static bool STSWI(ppu_thread&, ppu_opcode_t); - static bool STFDX(ppu_thread&, ppu_opcode_t); - static bool STFDUX(ppu_thread&, ppu_opcode_t); - static bool LHBRX(ppu_thread&, ppu_opcode_t); - static bool SRAW(ppu_thread&, ppu_opcode_t); - static bool SRAD(ppu_thread&, ppu_opcode_t); - static bool DSS(ppu_thread&, ppu_opcode_t); - static bool SRAWI(ppu_thread&, ppu_opcode_t); - static bool SRADI(ppu_thread&, ppu_opcode_t); - static bool EIEIO(ppu_thread&, ppu_opcode_t); - static bool STHBRX(ppu_thread&, ppu_opcode_t); - static bool EXTSH(ppu_thread&, ppu_opcode_t); - static bool EXTSB(ppu_thread&, ppu_opcode_t); - static bool STFIWX(ppu_thread&, ppu_opcode_t); - static bool EXTSW(ppu_thread&, ppu_opcode_t); - static bool ICBI(ppu_thread&, ppu_opcode_t); - static bool DCBZ(ppu_thread&, ppu_opcode_t); - static bool LWZ(ppu_thread&, ppu_opcode_t); - static bool LWZU(ppu_thread&, ppu_opcode_t); - static bool LBZ(ppu_thread&, ppu_opcode_t); - static bool LBZU(ppu_thread&, ppu_opcode_t); - static bool STW(ppu_thread&, ppu_opcode_t); - static bool STWU(ppu_thread&, ppu_opcode_t); - static bool STB(ppu_thread&, ppu_opcode_t); - static bool STBU(ppu_thread&, ppu_opcode_t); - static bool LHZ(ppu_thread&, ppu_opcode_t); - static bool LHZU(ppu_thread&, ppu_opcode_t); - static bool LHA(ppu_thread&, ppu_opcode_t); - static bool LHAU(ppu_thread&, ppu_opcode_t); - static bool STH(ppu_thread&, ppu_opcode_t); - static bool STHU(ppu_thread&, ppu_opcode_t); - static bool LMW(ppu_thread&, ppu_opcode_t); - static bool STMW(ppu_thread&, ppu_opcode_t); - static bool LFS(ppu_thread&, ppu_opcode_t); - static bool LFSU(ppu_thread&, ppu_opcode_t); - static bool LFD(ppu_thread&, ppu_opcode_t); - static bool LFDU(ppu_thread&, ppu_opcode_t); - static bool STFS(ppu_thread&, ppu_opcode_t); - static bool STFSU(ppu_thread&, ppu_opcode_t); - static bool STFD(ppu_thread&, ppu_opcode_t); - static bool STFDU(ppu_thread&, ppu_opcode_t); - static bool LD(ppu_thread&, ppu_opcode_t); - static bool LDU(ppu_thread&, ppu_opcode_t); - static bool LWA(ppu_thread&, ppu_opcode_t); - static bool STD(ppu_thread&, ppu_opcode_t); - static bool STDU(ppu_thread&, ppu_opcode_t); - static bool MTFSB1(ppu_thread&, ppu_opcode_t); - static bool MCRFS(ppu_thread&, ppu_opcode_t); - static bool MTFSB0(ppu_thread&, ppu_opcode_t); - static bool MTFSFI(ppu_thread&, ppu_opcode_t); - static bool MFFS(ppu_thread&, ppu_opcode_t); - static bool MTFSF(ppu_thread&, ppu_opcode_t); - static bool FCMPU(ppu_thread&, ppu_opcode_t); - static bool FCTIW(ppu_thread&, ppu_opcode_t); - static bool FCTIWZ(ppu_thread&, ppu_opcode_t); - static bool FSEL(ppu_thread&, ppu_opcode_t); - static bool FCMPO(ppu_thread&, ppu_opcode_t); - static bool FNEG(ppu_thread&, ppu_opcode_t); - static bool FMR(ppu_thread&, ppu_opcode_t); - static bool FNABS(ppu_thread&, ppu_opcode_t); - static bool FABS(ppu_thread&, ppu_opcode_t); - static bool FCTID(ppu_thread&, ppu_opcode_t); - static bool 
FCTIDZ(ppu_thread&, ppu_opcode_t); - static bool FCFID(ppu_thread&, ppu_opcode_t); - - static bool LVLX(ppu_thread&, ppu_opcode_t); - static bool LVLXL(ppu_thread&, ppu_opcode_t); - static bool LVRX(ppu_thread&, ppu_opcode_t); - static bool LVRXL(ppu_thread&, ppu_opcode_t); - static bool STVLX(ppu_thread&, ppu_opcode_t); - static bool STVLXL(ppu_thread&, ppu_opcode_t); - static bool STVRX(ppu_thread&, ppu_opcode_t); - static bool STVRXL(ppu_thread&, ppu_opcode_t); - - static bool UNK(ppu_thread&, ppu_opcode_t); + ppu_intrp_func_t fn; }; -struct ppu_interpreter_precise final : ppu_interpreter +template +struct ppu_interpreter_t; + +namespace asmjit { - static bool VPKSHSS(ppu_thread&, ppu_opcode_t); - static bool VPKSHUS(ppu_thread&, ppu_opcode_t); - static bool VPKSWSS(ppu_thread&, ppu_opcode_t); - static bool VPKSWUS(ppu_thread&, ppu_opcode_t); - static bool VPKUHUS(ppu_thread&, ppu_opcode_t); - static bool VPKUWUS(ppu_thread&, ppu_opcode_t); - static bool VADDSBS(ppu_thread&, ppu_opcode_t); - static bool VADDSHS(ppu_thread&, ppu_opcode_t); - static bool VADDSWS(ppu_thread&, ppu_opcode_t); - static bool VADDUBS(ppu_thread&, ppu_opcode_t); - static bool VADDUHS(ppu_thread&, ppu_opcode_t); - static bool VADDUWS(ppu_thread&, ppu_opcode_t); - static bool VSUBSBS(ppu_thread&, ppu_opcode_t); - static bool VSUBSHS(ppu_thread&, ppu_opcode_t); - static bool VSUBSWS(ppu_thread&, ppu_opcode_t); - static bool VSUBUBS(ppu_thread&, ppu_opcode_t); - static bool VSUBUHS(ppu_thread&, ppu_opcode_t); - static bool VSUBUWS(ppu_thread&, ppu_opcode_t); - static bool VMHADDSHS(ppu_thread&, ppu_opcode_t); - static bool VMHRADDSHS(ppu_thread&, ppu_opcode_t); - static bool VMSUMSHS(ppu_thread&, ppu_opcode_t); - static bool VMSUMUHS(ppu_thread&, ppu_opcode_t); - static bool VSUMSWS(ppu_thread&, ppu_opcode_t); - static bool VSUM2SWS(ppu_thread&, ppu_opcode_t); - static bool VSUM4SBS(ppu_thread&, ppu_opcode_t); - static bool VSUM4SHS(ppu_thread&, ppu_opcode_t); - static bool VSUM4UBS(ppu_thread&, ppu_opcode_t); - static bool VCTSXS(ppu_thread&, ppu_opcode_t); - static bool VCTUXS(ppu_thread&, ppu_opcode_t); - static bool VMADDFP(ppu_thread&, ppu_opcode_t); - static bool VNMSUBFP(ppu_thread&, ppu_opcode_t); + struct ppu_builder; +} - static bool FDIVS(ppu_thread&, ppu_opcode_t); - static bool FSUBS(ppu_thread&, ppu_opcode_t); - static bool FADDS(ppu_thread&, ppu_opcode_t); - static bool FSQRTS(ppu_thread&, ppu_opcode_t); - static bool FRES(ppu_thread&, ppu_opcode_t); - static bool FMULS(ppu_thread&, ppu_opcode_t); - static bool FMADDS(ppu_thread&, ppu_opcode_t); - static bool FMSUBS(ppu_thread&, ppu_opcode_t); - static bool FNMSUBS(ppu_thread&, ppu_opcode_t); - static bool FNMADDS(ppu_thread&, ppu_opcode_t); +struct ppu_interpreter_rt_base +{ +protected: + std::unique_ptr> ptrs; - static bool FRSP(ppu_thread&, ppu_opcode_t); - static bool FDIV(ppu_thread&, ppu_opcode_t); - static bool FSUB(ppu_thread&, ppu_opcode_t); - static bool FADD(ppu_thread&, ppu_opcode_t); - static bool FSQRT(ppu_thread&, ppu_opcode_t); - static bool FMUL(ppu_thread&, ppu_opcode_t); - static bool FRSQRTE(ppu_thread&, ppu_opcode_t); - static bool FMSUB(ppu_thread&, ppu_opcode_t); - static bool FMADD(ppu_thread&, ppu_opcode_t); - static bool FNMSUB(ppu_thread&, ppu_opcode_t); - static bool FNMADD(ppu_thread&, ppu_opcode_t); + ppu_interpreter_rt_base() noexcept; + + ppu_interpreter_rt_base(const ppu_interpreter_rt_base&) = delete; + + ppu_interpreter_rt_base& operator=(const ppu_interpreter_rt_base&) = delete; + + virtual 
~ppu_interpreter_rt_base(); }; -struct ppu_interpreter_fast final : ppu_interpreter +struct ppu_interpreter_rt : ppu_interpreter_rt_base { - static bool VPKSHSS(ppu_thread&, ppu_opcode_t); - static bool VPKSHUS(ppu_thread&, ppu_opcode_t); - static bool VPKSWSS(ppu_thread&, ppu_opcode_t); - static bool VPKSWUS(ppu_thread&, ppu_opcode_t); - static bool VPKUHUS(ppu_thread&, ppu_opcode_t); - static bool VPKUWUS(ppu_thread&, ppu_opcode_t); - static bool VADDSBS(ppu_thread&, ppu_opcode_t); - static bool VADDSHS(ppu_thread&, ppu_opcode_t); - static bool VADDSWS(ppu_thread&, ppu_opcode_t); - static bool VADDUBS(ppu_thread&, ppu_opcode_t); - static bool VADDUHS(ppu_thread&, ppu_opcode_t); - static bool VADDUWS(ppu_thread&, ppu_opcode_t); - static bool VSUBSBS(ppu_thread&, ppu_opcode_t); - static bool VSUBSHS(ppu_thread&, ppu_opcode_t); - static bool VSUBSWS(ppu_thread&, ppu_opcode_t); - static bool VSUBUBS(ppu_thread&, ppu_opcode_t); - static bool VSUBUHS(ppu_thread&, ppu_opcode_t); - static bool VSUBUWS(ppu_thread&, ppu_opcode_t); - static bool VMHADDSHS(ppu_thread&, ppu_opcode_t); - static bool VMHRADDSHS(ppu_thread&, ppu_opcode_t); - static bool VMSUMSHS(ppu_thread&, ppu_opcode_t); - static bool VMSUMUHS(ppu_thread&, ppu_opcode_t); - static bool VSUMSWS(ppu_thread&, ppu_opcode_t); - static bool VSUM2SWS(ppu_thread&, ppu_opcode_t); - static bool VSUM4SBS(ppu_thread&, ppu_opcode_t); - static bool VSUM4SHS(ppu_thread&, ppu_opcode_t); - static bool VSUM4UBS(ppu_thread&, ppu_opcode_t); - static bool VCTSXS(ppu_thread&, ppu_opcode_t); - static bool VCTUXS(ppu_thread&, ppu_opcode_t); - static bool VMADDFP(ppu_thread&, ppu_opcode_t); - static bool VNMSUBFP(ppu_thread&, ppu_opcode_t); + ppu_interpreter_rt() noexcept; - static bool FDIVS(ppu_thread&, ppu_opcode_t); - static bool FSUBS(ppu_thread&, ppu_opcode_t); - static bool FADDS(ppu_thread&, ppu_opcode_t); - static bool FSQRTS(ppu_thread&, ppu_opcode_t); - static bool FRES(ppu_thread&, ppu_opcode_t); - static bool FMULS(ppu_thread&, ppu_opcode_t); - static bool FMADDS(ppu_thread&, ppu_opcode_t); - static bool FMSUBS(ppu_thread&, ppu_opcode_t); - static bool FNMSUBS(ppu_thread&, ppu_opcode_t); - static bool FNMADDS(ppu_thread&, ppu_opcode_t); + ppu_intrp_func_t decode(u32 op) const noexcept; - static bool FRSP(ppu_thread&, ppu_opcode_t); - static bool FDIV(ppu_thread&, ppu_opcode_t); - static bool FSUB(ppu_thread&, ppu_opcode_t); - static bool FADD(ppu_thread&, ppu_opcode_t); - static bool FSQRT(ppu_thread&, ppu_opcode_t); - static bool FMUL(ppu_thread&, ppu_opcode_t); - static bool FRSQRTE(ppu_thread&, ppu_opcode_t); - static bool FMSUB(ppu_thread&, ppu_opcode_t); - static bool FMADD(ppu_thread&, ppu_opcode_t); - static bool FNMSUB(ppu_thread&, ppu_opcode_t); - static bool FNMADD(ppu_thread&, ppu_opcode_t); +private: + ppu_decoder, ppu_intrp_func_t> table; }; diff --git a/rpcs3/Emu/Cell/PPUModule.cpp b/rpcs3/Emu/Cell/PPUModule.cpp index 6605daa38f..de6d24455e 100644 --- a/rpcs3/Emu/Cell/PPUModule.cpp +++ b/rpcs3/Emu/Cell/PPUModule.cpp @@ -30,7 +30,7 @@ LOG_CHANNEL(ppu_loader); extern std::string ppu_get_function_name(const std::string& _module, u32 fnid); extern std::string ppu_get_variable_name(const std::string& _module, u32 vnid); extern void ppu_register_range(u32 addr, u32 size); -extern void ppu_register_function_at(u32 addr, u32 size, ppu_function_t ptr); +extern void ppu_register_function_at(u32 addr, u32 size, ppu_intrp_func_t ptr); extern void sys_initialize_tls(ppu_thread&, u64, u32, u32, u32); @@ -275,7 +275,7 @@ static void 
ppu_initialize_modules(ppu_linkage_info* link) }; // Initialize double-purpose fake OPD array for HLE functions - const auto& hle_funcs = ppu_function_manager::get(g_cfg.core.ppu_decoder == ppu_decoder_type::llvm); + const auto& hle_funcs = ppu_function_manager::get(g_cfg.core.ppu_decoder != ppu_decoder_type::_static); u32& hle_funcs_addr = g_fxo->get().addr; diff --git a/rpcs3/Emu/Cell/PPUModule.h b/rpcs3/Emu/Cell/PPUModule.h index 02de8eed52..9b261a9421 100644 --- a/rpcs3/Emu/Cell/PPUModule.h +++ b/rpcs3/Emu/Cell/PPUModule.h @@ -123,7 +123,7 @@ public: static void initialize_modules(); template - static auto& register_static_function(const char* _module, const char* name, ppu_function_t func, u32 fnid) + static auto& register_static_function(const char* _module, const char* name, ppu_intrp_func_t func, u32 fnid) { auto& info = access_static_function(_module, fnid); diff --git a/rpcs3/Emu/Cell/PPUOpcodes.h b/rpcs3/Emu/Cell/PPUOpcodes.h index 5526fed181..db96359a4e 100644 --- a/rpcs3/Emu/Cell/PPUOpcodes.h +++ b/rpcs3/Emu/Cell/PPUOpcodes.h @@ -84,19 +84,22 @@ class ppu_decoder struct instruction_info { u32 value; - T pointer; + T ptr0; + T ptr_rc; u32 magn; // Non-zero for "columns" (effectively, number of most significant bits "eaten") - constexpr instruction_info(u32 v, T p, u32 m = 0) + constexpr instruction_info(u32 v, T p, T p_rc, u32 m = 0) : value(v) - , pointer(p) + , ptr0(p) + , ptr_rc(p_rc) , magn(m) { } - constexpr instruction_info(u32 v, const T* p, u32 m = 0) + constexpr instruction_info(u32 v, const T* p, const T* p_rc, u32 m = 0) : value(v) - , pointer(*p) + , ptr0(*p) + , ptr_rc(*p_rc) , magn(m) { } @@ -113,7 +116,8 @@ class ppu_decoder { for (u32 j = 0; j < 1u << sh; j++) { - m_table.at((((((i << (count - v.magn)) | v.value) << sh) | j) << 6) | main_op) = v.pointer; + const u32 k = (((i << (count - v.magn)) | v.value) << sh) | j; + m_table.at((k << 6) | main_op) = k & 1 ? v.ptr_rc : v.ptr0; } } } @@ -125,454 +129,498 @@ class ppu_decoder { for (u32 i = 0; i < 1u << 11; i++) { - m_table.at(i << 6 | v.value) = v.pointer; + m_table.at(i << 6 | v.value) = i & 1 ? v.ptr_rc : v.ptr0; } } } } -public: - ppu_decoder() noexcept + // Helper + static const D& _first(const D& arg) { + return arg; + } + +public: + template + ppu_decoder(const Args&... 
args) noexcept + { + // If an object is passed to the constructor, assign values from that object +#define GET_(name) [&]{ if constexpr (sizeof...(Args) > 0) return _first(args...).name; else return &D::name; }() +#define GET(name) GET_(name), GET_(name) +#define GETRC(name) GET_(name), GET_(name##_) + + static_assert(sizeof...(Args) <= 1); + for (auto& x : m_table) { - x = &D::UNK; + x = GET(UNK); } // Main opcodes (field 0..5) fill_table(0x00, 6, -1, { - { 0x02, &D::TDI }, - { 0x03, &D::TWI }, - { 0x07, &D::MULLI }, - { 0x08, &D::SUBFIC }, - { 0x0a, &D::CMPLI }, - { 0x0b, &D::CMPI }, - { 0x0c, &D::ADDIC }, - { 0x0d, &D::ADDIC }, - { 0x0e, &D::ADDI }, - { 0x0f, &D::ADDIS }, - { 0x10, &D::BC }, - { 0x11, &D::SC }, - { 0x12, &D::B }, - { 0x14, &D::RLWIMI }, - { 0x15, &D::RLWINM }, - { 0x17, &D::RLWNM }, - { 0x18, &D::ORI }, - { 0x19, &D::ORIS }, - { 0x1a, &D::XORI }, - { 0x1b, &D::XORIS }, - { 0x1c, &D::ANDI }, - { 0x1d, &D::ANDIS }, - { 0x20, &D::LWZ }, - { 0x21, &D::LWZU }, - { 0x22, &D::LBZ }, - { 0x23, &D::LBZU }, - { 0x24, &D::STW }, - { 0x25, &D::STWU }, - { 0x26, &D::STB }, - { 0x27, &D::STBU }, - { 0x28, &D::LHZ }, - { 0x29, &D::LHZU }, - { 0x2a, &D::LHA }, - { 0x2b, &D::LHAU }, - { 0x2c, &D::STH }, - { 0x2d, &D::STHU }, - { 0x2e, &D::LMW }, - { 0x2f, &D::STMW }, - { 0x30, &D::LFS }, - { 0x31, &D::LFSU }, - { 0x32, &D::LFD }, - { 0x33, &D::LFDU }, - { 0x34, &D::STFS }, - { 0x35, &D::STFSU }, - { 0x36, &D::STFD }, - { 0x37, &D::STFDU }, + { 0x02, GET(TDI) }, + { 0x03, GET(TWI) }, + { 0x07, GET(MULLI) }, + { 0x08, GET(SUBFIC) }, + { 0x0a, GET(CMPLI) }, + { 0x0b, GET(CMPI) }, + { 0x0c, GET(ADDIC) }, + { 0x0d, GET(ADDIC) }, + { 0x0e, GET(ADDI) }, + { 0x0f, GET(ADDIS) }, + { 0x10, GET(BC) }, + { 0x11, GET(SC) }, + { 0x12, GET(B) }, + { 0x14, GETRC(RLWIMI) }, + { 0x15, GETRC(RLWINM) }, + { 0x17, GETRC(RLWNM) }, + { 0x18, GET(ORI) }, + { 0x19, GET(ORIS) }, + { 0x1a, GET(XORI) }, + { 0x1b, GET(XORIS) }, + { 0x1c, GET(ANDI) }, + { 0x1d, GET(ANDIS) }, + { 0x20, GET(LWZ) }, + { 0x21, GET(LWZU) }, + { 0x22, GET(LBZ) }, + { 0x23, GET(LBZU) }, + { 0x24, GET(STW) }, + { 0x25, GET(STWU) }, + { 0x26, GET(STB) }, + { 0x27, GET(STBU) }, + { 0x28, GET(LHZ) }, + { 0x29, GET(LHZU) }, + { 0x2a, GET(LHA) }, + { 0x2b, GET(LHAU) }, + { 0x2c, GET(STH) }, + { 0x2d, GET(STHU) }, + { 0x2e, GET(LMW) }, + { 0x2f, GET(STMW) }, + { 0x30, GET(LFS) }, + { 0x31, GET(LFSU) }, + { 0x32, GET(LFD) }, + { 0x33, GET(LFDU) }, + { 0x34, GET(STFS) }, + { 0x35, GET(STFSU) }, + { 0x36, GET(STFD) }, + { 0x37, GET(STFDU) }, }); // Group 0x04 opcodes (field 21..31) fill_table(0x04, 11, 0, { - { 0x0, &D::VADDUBM }, - { 0x2, &D::VMAXUB }, - { 0x4, &D::VRLB }, - { 0x6, &D::VCMPEQUB, 1 }, - { 0x8, &D::VMULOUB }, - { 0xa, &D::VADDFP }, - { 0xc, &D::VMRGHB }, - { 0xe, &D::VPKUHUM }, + { 0x0, GET(VADDUBM) }, + { 0x2, GET(VMAXUB) }, + { 0x4, GET(VRLB) }, + { 0x006, GET(VCMPEQUB) }, + { 0x406, GET(VCMPEQUB_) }, + { 0x8, GET(VMULOUB) }, + { 0xa, GET(VADDFP) }, + { 0xc, GET(VMRGHB) }, + { 0xe, GET(VPKUHUM) }, - { 0x20, &D::VMHADDSHS, 5 }, - { 0x21, &D::VMHRADDSHS, 5 }, - { 0x22, &D::VMLADDUHM, 5 }, - { 0x24, &D::VMSUMUBM, 5 }, - { 0x25, &D::VMSUMMBM, 5 }, - { 0x26, &D::VMSUMUHM, 5 }, - { 0x27, &D::VMSUMUHS, 5 }, - { 0x28, &D::VMSUMSHM, 5 }, - { 0x29, &D::VMSUMSHS, 5 }, - { 0x2a, &D::VSEL, 5 }, - { 0x2b, &D::VPERM, 5 }, - { 0x2c, &D::VSLDOI, 5 }, - { 0x2e, &D::VMADDFP, 5 }, - { 0x2f, &D::VNMSUBFP, 5 }, + { 0x20, GET(VMHADDSHS), 5 }, + { 0x21, GET(VMHRADDSHS), 5 }, + { 0x22, GET(VMLADDUHM), 5 }, + { 0x24, GET(VMSUMUBM), 5 }, + { 0x25, 
GET(VMSUMMBM), 5 }, + { 0x26, GET(VMSUMUHM), 5 }, + { 0x27, GET(VMSUMUHS), 5 }, + { 0x28, GET(VMSUMSHM), 5 }, + { 0x29, GET(VMSUMSHS), 5 }, + { 0x2a, GET(VSEL), 5 }, + { 0x2b, GET(VPERM), 5 }, + { 0x2c, GET(VSLDOI), 5 }, + { 0x2e, GET(VMADDFP), 5 }, + { 0x2f, GET(VNMSUBFP), 5 }, - { 0x40, &D::VADDUHM }, - { 0x42, &D::VMAXUH }, - { 0x44, &D::VRLH }, - { 0x46, &D::VCMPEQUH, 1 }, - { 0x48, &D::VMULOUH }, - { 0x4a, &D::VSUBFP }, - { 0x4c, &D::VMRGHH }, - { 0x4e, &D::VPKUWUM }, - { 0x80, &D::VADDUWM }, - { 0x82, &D::VMAXUW }, - { 0x84, &D::VRLW }, - { 0x86, &D::VCMPEQUW, 1 }, - { 0x8c, &D::VMRGHW }, - { 0x8e, &D::VPKUHUS }, - { 0xc6, &D::VCMPEQFP, 1 }, - { 0xce, &D::VPKUWUS }, + { 0x40, GET(VADDUHM) }, + { 0x42, GET(VMAXUH) }, + { 0x44, GET(VRLH) }, + { 0x046, GET(VCMPEQUH) }, + { 0x446, GET(VCMPEQUH_) }, + { 0x48, GET(VMULOUH) }, + { 0x4a, GET(VSUBFP) }, + { 0x4c, GET(VMRGHH) }, + { 0x4e, GET(VPKUWUM) }, + { 0x80, GET(VADDUWM) }, + { 0x82, GET(VMAXUW) }, + { 0x84, GET(VRLW) }, + { 0x086, GET(VCMPEQUW) }, + { 0x486, GET(VCMPEQUW_) }, + { 0x8c, GET(VMRGHW) }, + { 0x8e, GET(VPKUHUS) }, + { 0x0c6, GET(VCMPEQFP) }, + { 0x4c6, GET(VCMPEQFP_) }, + { 0xce, GET(VPKUWUS) }, - { 0x102, &D::VMAXSB }, - { 0x104, &D::VSLB }, - { 0x108, &D::VMULOSB }, - { 0x10a, &D::VREFP }, - { 0x10c, &D::VMRGLB }, - { 0x10e, &D::VPKSHUS }, - { 0x142, &D::VMAXSH }, - { 0x144, &D::VSLH }, - { 0x148, &D::VMULOSH }, - { 0x14a, &D::VRSQRTEFP }, - { 0x14c, &D::VMRGLH }, - { 0x14e, &D::VPKSWUS }, - { 0x180, &D::VADDCUW }, - { 0x182, &D::VMAXSW }, - { 0x184, &D::VSLW }, - { 0x18a, &D::VEXPTEFP }, - { 0x18c, &D::VMRGLW }, - { 0x18e, &D::VPKSHSS }, - { 0x1c4, &D::VSL }, - { 0x1c6, &D::VCMPGEFP, 1 }, - { 0x1ca, &D::VLOGEFP }, - { 0x1ce, &D::VPKSWSS }, - { 0x200, &D::VADDUBS }, - { 0x202, &D::VMINUB }, - { 0x204, &D::VSRB }, - { 0x206, &D::VCMPGTUB, 1 }, - { 0x208, &D::VMULEUB }, - { 0x20a, &D::VRFIN }, - { 0x20c, &D::VSPLTB }, - { 0x20e, &D::VUPKHSB }, - { 0x240, &D::VADDUHS }, - { 0x242, &D::VMINUH }, - { 0x244, &D::VSRH }, - { 0x246, &D::VCMPGTUH, 1 }, - { 0x248, &D::VMULEUH }, - { 0x24a, &D::VRFIZ }, - { 0x24c, &D::VSPLTH }, - { 0x24e, &D::VUPKHSH }, - { 0x280, &D::VADDUWS }, - { 0x282, &D::VMINUW }, - { 0x284, &D::VSRW }, - { 0x286, &D::VCMPGTUW, 1 }, - { 0x28a, &D::VRFIP }, - { 0x28c, &D::VSPLTW }, - { 0x28e, &D::VUPKLSB }, - { 0x2c4, &D::VSR }, - { 0x2c6, &D::VCMPGTFP, 1 }, - { 0x2ca, &D::VRFIM }, - { 0x2ce, &D::VUPKLSH }, - { 0x300, &D::VADDSBS }, - { 0x302, &D::VMINSB }, - { 0x304, &D::VSRAB }, - { 0x306, &D::VCMPGTSB, 1 }, - { 0x308, &D::VMULESB }, - { 0x30a, &D::VCFUX }, - { 0x30c, &D::VSPLTISB }, - { 0x30e, &D::VPKPX }, - { 0x340, &D::VADDSHS }, - { 0x342, &D::VMINSH }, - { 0x344, &D::VSRAH }, - { 0x346, &D::VCMPGTSH, 1 }, - { 0x348, &D::VMULESH }, - { 0x34a, &D::VCFSX }, - { 0x34c, &D::VSPLTISH }, - { 0x34e, &D::VUPKHPX }, - { 0x380, &D::VADDSWS }, - { 0x382, &D::VMINSW }, - { 0x384, &D::VSRAW }, - { 0x386, &D::VCMPGTSW, 1 }, - { 0x38a, &D::VCTUXS }, - { 0x38c, &D::VSPLTISW }, - { 0x3c6, &D::VCMPBFP, 1 }, - { 0x3ca, &D::VCTSXS }, - { 0x3ce, &D::VUPKLPX }, - { 0x400, &D::VSUBUBM }, - { 0x402, &D::VAVGUB }, - { 0x404, &D::VAND }, - { 0x40a, &D::VMAXFP }, - { 0x40c, &D::VSLO }, - { 0x440, &D::VSUBUHM }, - { 0x442, &D::VAVGUH }, - { 0x444, &D::VANDC }, - { 0x44a, &D::VMINFP }, - { 0x44c, &D::VSRO }, - { 0x480, &D::VSUBUWM }, - { 0x482, &D::VAVGUW }, - { 0x484, &D::VOR }, - { 0x4c4, &D::VXOR }, - { 0x502, &D::VAVGSB }, - { 0x504, &D::VNOR }, - { 0x542, &D::VAVGSH }, - { 0x580, &D::VSUBCUW }, - { 0x582, &D::VAVGSW }, - { 0x600, 
&D::VSUBUBS }, - { 0x604, &D::MFVSCR }, - { 0x608, &D::VSUM4UBS }, - { 0x640, &D::VSUBUHS }, - { 0x644, &D::MTVSCR }, - { 0x648, &D::VSUM4SHS }, - { 0x680, &D::VSUBUWS }, - { 0x688, &D::VSUM2SWS }, - { 0x700, &D::VSUBSBS }, - { 0x708, &D::VSUM4SBS }, - { 0x740, &D::VSUBSHS }, - { 0x780, &D::VSUBSWS }, - { 0x788, &D::VSUMSWS }, + { 0x102, GET(VMAXSB) }, + { 0x104, GET(VSLB) }, + { 0x108, GET(VMULOSB) }, + { 0x10a, GET(VREFP) }, + { 0x10c, GET(VMRGLB) }, + { 0x10e, GET(VPKSHUS) }, + { 0x142, GET(VMAXSH) }, + { 0x144, GET(VSLH) }, + { 0x148, GET(VMULOSH) }, + { 0x14a, GET(VRSQRTEFP) }, + { 0x14c, GET(VMRGLH) }, + { 0x14e, GET(VPKSWUS) }, + { 0x180, GET(VADDCUW) }, + { 0x182, GET(VMAXSW) }, + { 0x184, GET(VSLW) }, + { 0x18a, GET(VEXPTEFP) }, + { 0x18c, GET(VMRGLW) }, + { 0x18e, GET(VPKSHSS) }, + { 0x1c4, GET(VSL) }, + { 0x1c6, GET(VCMPGEFP) }, + { 0x5c6, GET(VCMPGEFP_) }, + { 0x1ca, GET(VLOGEFP) }, + { 0x1ce, GET(VPKSWSS) }, + { 0x200, GET(VADDUBS) }, + { 0x202, GET(VMINUB) }, + { 0x204, GET(VSRB) }, + { 0x206, GET(VCMPGTUB) }, + { 0x606, GET(VCMPGTUB_) }, + { 0x208, GET(VMULEUB) }, + { 0x20a, GET(VRFIN) }, + { 0x20c, GET(VSPLTB) }, + { 0x20e, GET(VUPKHSB) }, + { 0x240, GET(VADDUHS) }, + { 0x242, GET(VMINUH) }, + { 0x244, GET(VSRH) }, + { 0x246, GET(VCMPGTUH) }, + { 0x646, GET(VCMPGTUH_) }, + { 0x248, GET(VMULEUH) }, + { 0x24a, GET(VRFIZ) }, + { 0x24c, GET(VSPLTH) }, + { 0x24e, GET(VUPKHSH) }, + { 0x280, GET(VADDUWS) }, + { 0x282, GET(VMINUW) }, + { 0x284, GET(VSRW) }, + { 0x286, GET(VCMPGTUW) }, + { 0x686, GET(VCMPGTUW_) }, + { 0x28a, GET(VRFIP) }, + { 0x28c, GET(VSPLTW) }, + { 0x28e, GET(VUPKLSB) }, + { 0x2c4, GET(VSR) }, + { 0x2c6, GET(VCMPGTFP) }, + { 0x6c6, GET(VCMPGTFP_) }, + { 0x2ca, GET(VRFIM) }, + { 0x2ce, GET(VUPKLSH) }, + { 0x300, GET(VADDSBS) }, + { 0x302, GET(VMINSB) }, + { 0x304, GET(VSRAB) }, + { 0x306, GET(VCMPGTSB) }, + { 0x706, GET(VCMPGTSB_) }, + { 0x308, GET(VMULESB) }, + { 0x30a, GET(VCFUX) }, + { 0x30c, GET(VSPLTISB) }, + { 0x30e, GET(VPKPX) }, + { 0x340, GET(VADDSHS) }, + { 0x342, GET(VMINSH) }, + { 0x344, GET(VSRAH) }, + { 0x346, GET(VCMPGTSH) }, + { 0x746, GET(VCMPGTSH_) }, + { 0x348, GET(VMULESH) }, + { 0x34a, GET(VCFSX) }, + { 0x34c, GET(VSPLTISH) }, + { 0x34e, GET(VUPKHPX) }, + { 0x380, GET(VADDSWS) }, + { 0x382, GET(VMINSW) }, + { 0x384, GET(VSRAW) }, + { 0x386, GET(VCMPGTSW) }, + { 0x786, GET(VCMPGTSW_) }, + { 0x38a, GET(VCTUXS) }, + { 0x38c, GET(VSPLTISW) }, + { 0x3c6, GET(VCMPBFP) }, + { 0x7c6, GET(VCMPBFP_) }, + { 0x3ca, GET(VCTSXS) }, + { 0x3ce, GET(VUPKLPX) }, + { 0x400, GET(VSUBUBM) }, + { 0x402, GET(VAVGUB) }, + { 0x404, GET(VAND) }, + { 0x40a, GET(VMAXFP) }, + { 0x40c, GET(VSLO) }, + { 0x440, GET(VSUBUHM) }, + { 0x442, GET(VAVGUH) }, + { 0x444, GET(VANDC) }, + { 0x44a, GET(VMINFP) }, + { 0x44c, GET(VSRO) }, + { 0x480, GET(VSUBUWM) }, + { 0x482, GET(VAVGUW) }, + { 0x484, GET(VOR) }, + { 0x4c4, GET(VXOR) }, + { 0x502, GET(VAVGSB) }, + { 0x504, GET(VNOR) }, + { 0x542, GET(VAVGSH) }, + { 0x580, GET(VSUBCUW) }, + { 0x582, GET(VAVGSW) }, + { 0x600, GET(VSUBUBS) }, + { 0x604, GET(MFVSCR) }, + { 0x608, GET(VSUM4UBS) }, + { 0x640, GET(VSUBUHS) }, + { 0x644, GET(MTVSCR) }, + { 0x648, GET(VSUM4SHS) }, + { 0x680, GET(VSUBUWS) }, + { 0x688, GET(VSUM2SWS) }, + { 0x700, GET(VSUBSBS) }, + { 0x708, GET(VSUM4SBS) }, + { 0x740, GET(VSUBSHS) }, + { 0x780, GET(VSUBSWS) }, + { 0x788, GET(VSUMSWS) }, }); // Group 0x13 opcodes (field 21..30) fill_table(0x13, 10, 1, { - { 0x000, &D::MCRF }, - { 0x010, &D::BCLR }, - { 0x021, &D::CRNOR }, - { 0x081, &D::CRANDC }, - { 0x096, 
&D::ISYNC }, - { 0x0c1, &D::CRXOR }, - { 0x0e1, &D::CRNAND }, - { 0x101, &D::CRAND }, - { 0x121, &D::CREQV }, - { 0x1a1, &D::CRORC }, - { 0x1c1, &D::CROR }, - { 0x210, &D::BCCTR }, + { 0x000, GET(MCRF) }, + { 0x010, GET(BCLR) }, + { 0x021, GET(CRNOR) }, + { 0x081, GET(CRANDC) }, + { 0x096, GET(ISYNC) }, + { 0x0c1, GET(CRXOR) }, + { 0x0e1, GET(CRNAND) }, + { 0x101, GET(CRAND) }, + { 0x121, GET(CREQV) }, + { 0x1a1, GET(CRORC) }, + { 0x1c1, GET(CROR) }, + { 0x210, GET(BCCTR) }, }); // Group 0x1e opcodes (field 27..30) fill_table(0x1e, 4, 1, { - { 0x0, &D::RLDICL }, - { 0x1, &D::RLDICL }, - { 0x2, &D::RLDICR }, - { 0x3, &D::RLDICR }, - { 0x4, &D::RLDIC }, - { 0x5, &D::RLDIC }, - { 0x6, &D::RLDIMI }, - { 0x7, &D::RLDIMI }, - { 0x8, &D::RLDCL }, - { 0x9, &D::RLDCR }, + { 0x0, GETRC(RLDICL) }, + { 0x1, GETRC(RLDICL) }, + { 0x2, GETRC(RLDICR) }, + { 0x3, GETRC(RLDICR) }, + { 0x4, GETRC(RLDIC) }, + { 0x5, GETRC(RLDIC) }, + { 0x6, GETRC(RLDIMI) }, + { 0x7, GETRC(RLDIMI) }, + { 0x8, GETRC(RLDCL) }, + { 0x9, GETRC(RLDCR) }, }); // Group 0x1f opcodes (field 21..30) fill_table(0x1f, 10, 1, { - { 0x000, &D::CMP }, - { 0x004, &D::TW }, - { 0x006, &D::LVSL }, - { 0x007, &D::LVEBX }, - { 0x008, &D::SUBFC, 1 }, - { 0x009, &D::MULHDU }, - { 0x00a, &D::ADDC, 1 }, - { 0x00b, &D::MULHWU }, - { 0x013, &D::MFOCRF }, - { 0x014, &D::LWARX }, - { 0x015, &D::LDX }, - { 0x017, &D::LWZX }, - { 0x018, &D::SLW }, - { 0x01a, &D::CNTLZW }, - { 0x01b, &D::SLD }, - { 0x01c, &D::AND }, - { 0x020, &D::CMPL }, - { 0x026, &D::LVSR }, - { 0x027, &D::LVEHX }, - { 0x028, &D::SUBF, 1 }, - { 0x035, &D::LDUX }, - { 0x036, &D::DCBST }, - { 0x037, &D::LWZUX }, - { 0x03a, &D::CNTLZD }, - { 0x03c, &D::ANDC }, - { 0x044, &D::TD }, - { 0x047, &D::LVEWX }, - { 0x049, &D::MULHD }, - { 0x04b, &D::MULHW }, - { 0x054, &D::LDARX }, - { 0x056, &D::DCBF }, - { 0x057, &D::LBZX }, - { 0x067, &D::LVX }, - { 0x068, &D::NEG, 1 }, - { 0x077, &D::LBZUX }, - { 0x07c, &D::NOR }, - { 0x087, &D::STVEBX }, - { 0x088, &D::SUBFE, 1 }, - { 0x08a, &D::ADDE, 1 }, - { 0x090, &D::MTOCRF }, - { 0x095, &D::STDX }, - { 0x096, &D::STWCX }, - { 0x097, &D::STWX }, - { 0x0a7, &D::STVEHX }, - { 0x0b5, &D::STDUX }, - { 0x0b7, &D::STWUX }, - { 0x0c7, &D::STVEWX }, - { 0x0c8, &D::SUBFZE, 1 }, - { 0x0ca, &D::ADDZE, 1 }, - { 0x0d6, &D::STDCX }, - { 0x0d7, &D::STBX }, - { 0x0e7, &D::STVX }, - { 0x0e8, &D::SUBFME, 1 }, - { 0x0e9, &D::MULLD, 1 }, - { 0x0ea, &D::ADDME, 1 }, - { 0x0eb, &D::MULLW, 1 }, - { 0x0f6, &D::DCBTST }, - { 0x0f7, &D::STBUX }, - { 0x10a, &D::ADD, 1 }, - { 0x116, &D::DCBT }, - { 0x117, &D::LHZX }, - { 0x11c, &D::EQV }, - { 0x136, &D::ECIWX }, - { 0x137, &D::LHZUX }, - { 0x13c, &D::XOR }, - { 0x153, &D::MFSPR }, - { 0x155, &D::LWAX }, - { 0x156, &D::DST }, - { 0x157, &D::LHAX }, - { 0x167, &D::LVXL }, - { 0x173, &D::MFTB }, - { 0x175, &D::LWAUX }, - { 0x176, &D::DSTST }, - { 0x177, &D::LHAUX }, - { 0x197, &D::STHX }, - { 0x19c, &D::ORC }, - { 0x1b6, &D::ECOWX }, - { 0x1b7, &D::STHUX }, - { 0x1bc, &D::OR }, - { 0x1c9, &D::DIVDU, 1 }, - { 0x1cb, &D::DIVWU, 1 }, - { 0x1d3, &D::MTSPR }, - { 0x1d6, &D::DCBI }, - { 0x1dc, &D::NAND }, - { 0x1e7, &D::STVXL }, - { 0x1e9, &D::DIVD, 1 }, - { 0x1eb, &D::DIVW, 1 }, - { 0x207, &D::LVLX }, - { 0x214, &D::LDBRX }, - { 0x215, &D::LSWX }, - { 0x216, &D::LWBRX }, - { 0x217, &D::LFSX }, - { 0x218, &D::SRW }, - { 0x21b, &D::SRD }, - { 0x227, &D::LVRX }, - { 0x237, &D::LFSUX }, - { 0x255, &D::LSWI }, - { 0x256, &D::SYNC }, - { 0x257, &D::LFDX }, - { 0x277, &D::LFDUX }, - { 0x287, &D::STVLX }, - { 0x294, &D::STDBRX }, - { 0x295, 
&D::STSWX }, - { 0x296, &D::STWBRX }, - { 0x297, &D::STFSX }, - { 0x2a7, &D::STVRX }, - { 0x2b7, &D::STFSUX }, - { 0x2d5, &D::STSWI }, - { 0x2d7, &D::STFDX }, - { 0x2f7, &D::STFDUX }, - { 0x307, &D::LVLXL }, - { 0x316, &D::LHBRX }, - { 0x318, &D::SRAW }, - { 0x31a, &D::SRAD }, - { 0x327, &D::LVRXL }, - { 0x336, &D::DSS }, - { 0x338, &D::SRAWI }, - { 0x33a, &D::SRADI }, - { 0x33b, &D::SRADI }, - { 0x356, &D::EIEIO }, - { 0x387, &D::STVLXL }, - { 0x396, &D::STHBRX }, - { 0x39a, &D::EXTSH }, - { 0x3a7, &D::STVRXL }, - { 0x3ba, &D::EXTSB }, - { 0x3d7, &D::STFIWX }, - { 0x3da, &D::EXTSW }, - { 0x3d6, &D::ICBI }, - { 0x3f6, &D::DCBZ }, + { 0x000, GET(CMP) }, + { 0x004, GET(TW) }, + { 0x006, GET(LVSL) }, + { 0x007, GET(LVEBX) }, + { 0x008, GETRC(SUBFC) }, + { 0x208, GETRC(SUBFCO) }, + { 0x009, GETRC(MULHDU) }, + { 0x00a, GETRC(ADDC) }, + { 0x20a, GETRC(ADDCO) }, + { 0x00b, GETRC(MULHWU) }, + { 0x013, GET(MFOCRF) }, + { 0x014, GET(LWARX) }, + { 0x015, GET(LDX) }, + { 0x017, GET(LWZX) }, + { 0x018, GETRC(SLW) }, + { 0x01a, GETRC(CNTLZW) }, + { 0x01b, GETRC(SLD) }, + { 0x01c, GETRC(AND) }, + { 0x020, GET(CMPL) }, + { 0x026, GET(LVSR) }, + { 0x027, GET(LVEHX) }, + { 0x028, GETRC(SUBF) }, + { 0x228, GETRC(SUBFO) }, + { 0x035, GET(LDUX) }, + { 0x036, GET(DCBST) }, + { 0x037, GET(LWZUX) }, + { 0x03a, GETRC(CNTLZD) }, + { 0x03c, GETRC(ANDC) }, + { 0x044, GET(TD) }, + { 0x047, GET(LVEWX) }, + { 0x049, GETRC(MULHD) }, + { 0x04b, GETRC(MULHW) }, + { 0x054, GET(LDARX) }, + { 0x056, GET(DCBF) }, + { 0x057, GET(LBZX) }, + { 0x067, GET(LVX) }, + { 0x068, GETRC(NEG) }, + { 0x268, GETRC(NEGO) }, + { 0x077, GET(LBZUX) }, + { 0x07c, GETRC(NOR) }, + { 0x087, GET(STVEBX) }, + { 0x088, GETRC(SUBFE) }, + { 0x288, GETRC(SUBFEO) }, + { 0x08a, GETRC(ADDE) }, + { 0x28a, GETRC(ADDEO) }, + { 0x090, GET(MTOCRF) }, + { 0x095, GET(STDX) }, + { 0x096, GET(STWCX) }, + { 0x097, GET(STWX) }, + { 0x0a7, GET(STVEHX) }, + { 0x0b5, GET(STDUX) }, + { 0x0b7, GET(STWUX) }, + { 0x0c7, GET(STVEWX) }, + { 0x0c8, GETRC(SUBFZE) }, + { 0x2c8, GETRC(SUBFZEO) }, + { 0x0ca, GETRC(ADDZE) }, + { 0x2ca, GETRC(ADDZEO) }, + { 0x0d6, GET(STDCX) }, + { 0x0d7, GET(STBX) }, + { 0x0e7, GET(STVX) }, + { 0x0e8, GETRC(SUBFME) }, + { 0x2e8, GETRC(SUBFMEO) }, + { 0x0e9, GETRC(MULLD) }, + { 0x2e9, GETRC(MULLDO) }, + { 0x0ea, GETRC(ADDME) }, + { 0x2ea, GETRC(ADDMEO) }, + { 0x0eb, GETRC(MULLW) }, + { 0x2eb, GETRC(MULLWO) }, + { 0x0f6, GET(DCBTST) }, + { 0x0f7, GET(STBUX) }, + { 0x10a, GETRC(ADD) }, + { 0x30a, GETRC(ADDO) }, + { 0x116, GET(DCBT) }, + { 0x117, GET(LHZX) }, + { 0x11c, GETRC(EQV) }, + { 0x136, GET(ECIWX) }, + { 0x137, GET(LHZUX) }, + { 0x13c, GETRC(XOR) }, + { 0x153, GET(MFSPR) }, + { 0x155, GET(LWAX) }, + { 0x156, GET(DST) }, + { 0x157, GET(LHAX) }, + { 0x167, GET(LVXL) }, + { 0x173, GET(MFTB) }, + { 0x175, GET(LWAUX) }, + { 0x176, GET(DSTST) }, + { 0x177, GET(LHAUX) }, + { 0x197, GET(STHX) }, + { 0x19c, GETRC(ORC) }, + { 0x1b6, GET(ECOWX) }, + { 0x1b7, GET(STHUX) }, + { 0x1bc, GETRC(OR) }, + { 0x1c9, GETRC(DIVDU) }, + { 0x3c9, GETRC(DIVDUO) }, + { 0x1cb, GETRC(DIVWU) }, + { 0x3cb, GETRC(DIVWUO) }, + { 0x1d3, GET(MTSPR) }, + { 0x1d6, GET(DCBI) }, + { 0x1dc, GETRC(NAND) }, + { 0x1e7, GET(STVXL) }, + { 0x1e9, GETRC(DIVD) }, + { 0x3e9, GETRC(DIVDO) }, + { 0x1eb, GETRC(DIVW) }, + { 0x3eb, GETRC(DIVWO) }, + { 0x207, GET(LVLX) }, + { 0x214, GET(LDBRX) }, + { 0x215, GET(LSWX) }, + { 0x216, GET(LWBRX) }, + { 0x217, GET(LFSX) }, + { 0x218, GETRC(SRW) }, + { 0x21b, GETRC(SRD) }, + { 0x227, GET(LVRX) }, + { 0x237, GET(LFSUX) }, + { 0x255, GET(LSWI) }, + { 0x256, 
GET(SYNC) }, + { 0x257, GET(LFDX) }, + { 0x277, GET(LFDUX) }, + { 0x287, GET(STVLX) }, + { 0x294, GET(STDBRX) }, + { 0x295, GET(STSWX) }, + { 0x296, GET(STWBRX) }, + { 0x297, GET(STFSX) }, + { 0x2a7, GET(STVRX) }, + { 0x2b7, GET(STFSUX) }, + { 0x2d5, GET(STSWI) }, + { 0x2d7, GET(STFDX) }, + { 0x2f7, GET(STFDUX) }, + { 0x307, GET(LVLXL) }, + { 0x316, GET(LHBRX) }, + { 0x318, GETRC(SRAW) }, + { 0x31a, GETRC(SRAD) }, + { 0x327, GET(LVRXL) }, + { 0x336, GET(DSS) }, + { 0x338, GETRC(SRAWI) }, + { 0x33a, GETRC(SRADI) }, + { 0x33b, GETRC(SRADI) }, + { 0x356, GET(EIEIO) }, + { 0x387, GET(STVLXL) }, + { 0x396, GET(STHBRX) }, + { 0x39a, GETRC(EXTSH) }, + { 0x3a7, GET(STVRXL) }, + { 0x3ba, GETRC(EXTSB) }, + { 0x3d7, GET(STFIWX) }, + { 0x3da, GETRC(EXTSW) }, + { 0x3d6, GET(ICBI) }, + { 0x3f6, GET(DCBZ) }, }); // Group 0x3a opcodes (field 30..31) fill_table(0x3a, 2, 0, { - { 0x0, &D::LD }, - { 0x1, &D::LDU }, - { 0x2, &D::LWA }, + { 0x0, GET(LD) }, + { 0x1, GET(LDU) }, + { 0x2, GET(LWA) }, }); // Group 0x3b opcodes (field 21..30) fill_table(0x3b, 10, 1, { - { 0x12, &D::FDIVS, 5 }, - { 0x14, &D::FSUBS, 5 }, - { 0x15, &D::FADDS, 5 }, - { 0x16, &D::FSQRTS, 5 }, - { 0x18, &D::FRES, 5 }, - { 0x19, &D::FMULS, 5 }, - { 0x1c, &D::FMSUBS, 5 }, - { 0x1d, &D::FMADDS, 5 }, - { 0x1e, &D::FNMSUBS, 5 }, - { 0x1f, &D::FNMADDS, 5 }, + { 0x12, GETRC(FDIVS), 5 }, + { 0x14, GETRC(FSUBS), 5 }, + { 0x15, GETRC(FADDS), 5 }, + { 0x16, GETRC(FSQRTS), 5 }, + { 0x18, GETRC(FRES), 5 }, + { 0x19, GETRC(FMULS), 5 }, + { 0x1c, GETRC(FMSUBS), 5 }, + { 0x1d, GETRC(FMADDS), 5 }, + { 0x1e, GETRC(FNMSUBS), 5 }, + { 0x1f, GETRC(FNMADDS), 5 }, }); // Group 0x3e opcodes (field 30..31) fill_table(0x3e, 2, 0, { - { 0x0, &D::STD }, - { 0x1, &D::STDU }, + { 0x0, GET(STD) }, + { 0x1, GET(STDU) }, }); // Group 0x3f opcodes (field 21..30) fill_table(0x3f, 10, 1, { - { 0x026, &D::MTFSB1 }, - { 0x040, &D::MCRFS }, - { 0x046, &D::MTFSB0 }, - { 0x086, &D::MTFSFI }, - { 0x247, &D::MFFS }, - { 0x2c7, &D::MTFSF }, + { 0x026, GETRC(MTFSB1) }, + { 0x040, GET(MCRFS) }, + { 0x046, GETRC(MTFSB0) }, + { 0x086, GETRC(MTFSFI) }, + { 0x247, GETRC(MFFS) }, + { 0x2c7, GETRC(MTFSF) }, - { 0x000, &D::FCMPU }, - { 0x00c, &D::FRSP }, - { 0x00e, &D::FCTIW }, - { 0x00f, &D::FCTIWZ }, + { 0x000, GET(FCMPU) }, + { 0x00c, GETRC(FRSP) }, + { 0x00e, GETRC(FCTIW) }, + { 0x00f, GETRC(FCTIWZ) }, - { 0x012, &D::FDIV, 5 }, - { 0x014, &D::FSUB, 5 }, - { 0x015, &D::FADD, 5 }, - { 0x016, &D::FSQRT, 5 }, - { 0x017, &D::FSEL, 5 }, - { 0x019, &D::FMUL, 5 }, - { 0x01a, &D::FRSQRTE, 5 }, - { 0x01c, &D::FMSUB, 5 }, - { 0x01d, &D::FMADD, 5 }, - { 0x01e, &D::FNMSUB, 5 }, - { 0x01f, &D::FNMADD, 5 }, + { 0x012, GETRC(FDIV), 5 }, + { 0x014, GETRC(FSUB), 5 }, + { 0x015, GETRC(FADD), 5 }, + { 0x016, GETRC(FSQRT), 5 }, + { 0x017, GETRC(FSEL), 5 }, + { 0x019, GETRC(FMUL), 5 }, + { 0x01a, GETRC(FRSQRTE), 5 }, + { 0x01c, GETRC(FMSUB), 5 }, + { 0x01d, GETRC(FMADD), 5 }, + { 0x01e, GETRC(FNMSUB), 5 }, + { 0x01f, GETRC(FNMADD), 5 }, - { 0x020, &D::FCMPO }, - { 0x028, &D::FNEG }, - { 0x048, &D::FMR }, - { 0x088, &D::FNABS }, - { 0x108, &D::FABS }, - { 0x32e, &D::FCTID }, - { 0x32f, &D::FCTIDZ }, - { 0x34e, &D::FCFID }, + { 0x020, GET(FCMPO) }, + { 0x028, GETRC(FNEG) }, + { 0x048, GETRC(FMR) }, + { 0x088, GETRC(FNABS) }, + { 0x108, GETRC(FABS) }, + { 0x32e, GETRC(FCTID) }, + { 0x32f, GETRC(FCTIDZ) }, + { 0x34e, GETRC(FCFID) }, }); } @@ -587,6 +635,10 @@ public: } }; +#undef GET_ +#undef GET +#undef GETRC + namespace ppu_instructions { namespace fields diff --git a/rpcs3/Emu/Cell/PPUThread.cpp 
b/rpcs3/Emu/Cell/PPUThread.cpp index ed7a265155..e44673f9c5 100644 --- a/rpcs3/Emu/Cell/PPUThread.cpp +++ b/rpcs3/Emu/Cell/PPUThread.cpp @@ -62,7 +62,7 @@ #include "util/asm.hpp" #include "util/vm.hpp" #include "util/v128.hpp" -#include "util/v128sse.hpp" +#include "util/simd.hpp" #include "util/sysinfo.hpp" extern atomic_t g_watchdog_hold_ctr; @@ -131,9 +131,8 @@ void fmt_class_string::format(std::string& } } -const ppu_decoder g_ppu_interpreter_precise; -const ppu_decoder g_ppu_interpreter_fast; -const ppu_decoder g_ppu_itype; +extern const ppu_decoder g_ppu_itype{}; +extern const ppu_decoder g_ppu_iname{}; extern void ppu_initialize(); extern void ppu_finalize(const ppu_module& info); @@ -143,15 +142,16 @@ extern std::pair, CellError> ppu_load_overlay(const extern void ppu_unload_prx(const lv2_prx&); extern std::shared_ptr ppu_load_prx(const ppu_prx_object&, const std::string&, s64 file_offset); extern void ppu_execute_syscall(ppu_thread& ppu, u64 code); -static bool ppu_break(ppu_thread& ppu, ppu_opcode_t op); +static void ppu_break(ppu_thread&, ppu_opcode_t, be_t*, ppu_intrp_func*); extern void do_cell_atomic_128_store(u32 addr, const void* to_write); -const auto ppu_gateway = built_function("ppu_gateway", [](asmjit::x86::Assembler& c, auto& args) +const auto ppu_gateway = built_function("ppu_gateway", [](native_asm& c, auto& args) { // Gateway for PPU, converts from native to GHC calling convention, also saves RSP value for escape using namespace asmjit; +#if defined(ARCH_X64) #ifdef _WIN32 c.push(x86::r15); c.push(x86::r14); @@ -192,10 +192,10 @@ const auto ppu_gateway = built_function("ppu_gateway", []( c.mov(x86::rax, x86::qword_ptr(x86::r13, x86::edx, 1, 0)); // Load call target c.mov(x86::rdx, x86::rax); - c.shl(x86::rax, 17); - c.shr(x86::rax, 17); - c.shr(x86::rdx, 47); - c.shl(x86::rdx, 12); + c.shl(x86::rax, 16); + c.shr(x86::rax, 16); + c.shr(x86::rdx, 48); + c.shl(x86::edx, 13); c.mov(x86::r12d, x86::edx); // Load relocation base c.mov(x86::rbx, x86::qword_ptr(reinterpret_cast(&vm::g_base_addr))); @@ -246,116 +246,113 @@ const auto ppu_gateway = built_function("ppu_gateway", []( #endif c.ret(); +#else + c.ret(a64::x30); +#endif }); -const extern auto ppu_escape = build_function_asm("ppu_escape", [](asmjit::x86::Assembler& c, auto& args) +const extern auto ppu_escape = build_function_asm("ppu_escape", [](native_asm& c, auto& args) { using namespace asmjit; +#if defined(ARCH_X64) // Restore native stack pointer (longjmp emulation) c.mov(x86::rsp, x86::qword_ptr(args[0], ::offset32(&ppu_thread::saved_native_sp))); // Return to the return location c.sub(x86::rsp, 8); c.ret(); +#endif }); void ppu_recompiler_fallback(ppu_thread& ppu); -const auto ppu_recompiler_fallback_ghc = build_function_asm("ppu_trampolineb", [](asmjit::x86::Assembler& c, auto& args) +#if defined(ARCH_X64) +const auto ppu_recompiler_fallback_ghc = build_function_asm("ppu_trampolineb", [](native_asm& c, auto& args) { using namespace asmjit; c.mov(args[0], x86::rbp); c.jmp(imm_ptr(ppu_recompiler_fallback)); }); +#elif defined(ARCH_ARM64) +const auto ppu_recompiler_fallback_ghc = &ppu_recompiler_fallback; +#endif // Get pointer to executable cache -static u64& ppu_ref(u32 addr) +static ppu_intrp_func_t& ppu_ref(u32 addr) { - return *reinterpret_cast(vm::g_exec_addr + u64{addr} * 2); + return *reinterpret_cast(vm::g_exec_addr + u64{addr} * 2); } // Get interpreter cache value -static u64 ppu_cache(u32 addr) +static ppu_intrp_func_t ppu_cache(u32 addr) { - if (g_cfg.core.ppu_decoder > ppu_decoder_type::fast) + 
if (g_cfg.core.ppu_decoder != ppu_decoder_type::_static) { fmt::throw_exception("Invalid PPU decoder"); } - // Select opcode table - const auto& table = *( - g_cfg.core.ppu_decoder == ppu_decoder_type::precise - ? &g_ppu_interpreter_precise.get_table() - : &g_ppu_interpreter_fast.get_table()); - - return reinterpret_cast(table[ppu_decode(vm::read32(addr))]); + return g_fxo->get().decode(vm::read32(addr)); } -static bool ppu_fallback(ppu_thread& ppu, ppu_opcode_t op) +static ppu_intrp_func ppu_ret = {[](ppu_thread& ppu, ppu_opcode_t, be_t* this_op, ppu_intrp_func*) { - if (g_cfg.core.ppu_debug) - { - ppu_log.error("Unregistered instruction: 0x%08x", op.opcode); - } + // Fix PC and return (step execution) + ppu.cia = vm::get_addr(this_op); + return; +}}; - ppu_ref(ppu.cia) = ppu_cache(ppu.cia); - return false; +static void ppu_fallback(ppu_thread& ppu, ppu_opcode_t op, be_t* this_op, ppu_intrp_func* next_fn) +{ + const auto _pc = vm::get_addr(this_op); + const auto _fn = ppu_cache(_pc); + ppu_ref(_pc) = _fn; + return _fn(ppu, op, this_op, next_fn); } // TODO: Make this a dispatch call void ppu_recompiler_fallback(ppu_thread& ppu) { + perf_meter<"PPUFALL1"_u64> perf0; + if (g_cfg.core.ppu_debug) { - ppu_log.error("Unregistered PPU Function (LR=0x%llx)", ppu.lr); + ppu_log.error("Unregistered PPU Function (LR=0x%x)", ppu.lr); } - const auto& table = g_ppu_interpreter_fast.get_table(); - - u64 ctr = 0; + const auto& table = g_fxo->get(); while (true) { - if (uptr func = ppu_ref(ppu.cia); (func << 17 >> 17) != reinterpret_cast(ppu_recompiler_fallback_ghc)) + if (uptr func = uptr(ppu_ref(ppu.cia)); (func << 16 >> 16) != reinterpret_cast(ppu_recompiler_fallback_ghc)) { // We found a recompiler function at cia, return break; } - // Run instructions in interpreter - if (const u32 op = vm::read32(ppu.cia); ctr++, table[ppu_decode(op)](ppu, {op})) [[likely]] - { - ppu.cia += 4; - continue; - } + // Run one instruction in interpreter (TODO) + const u32 op = vm::read32(ppu.cia); + table.decode(op)(ppu, {op}, vm::_ptr(ppu.cia), &ppu_ret); if (ppu.test_stopped()) { break; } } - - if (g_cfg.core.ppu_debug) - { - ppu_log.warning("Exiting interpreter at 0x%x (executed %u functions)", ppu.cia, ctr); - } } void ppu_reservation_fallback(ppu_thread& ppu) { - const auto& table = g_ppu_interpreter_fast.get_table(); + perf_meter<"PPUFALL2"_u64> perf0; + + const auto& table = g_fxo->get(); while (true) { - // Run instructions in interpreter + // Run one instruction in interpreter (TODO) const u32 op = vm::read32(ppu.cia); - - if (table[ppu_decode(op)](ppu, {op})) [[likely]] - { - ppu.cia += 4; - } + table.decode(op)(ppu, {op}, vm::_ptr(ppu.cia), &ppu_ret); if (!ppu.raddr || !ppu.use_full_rdata) { @@ -372,7 +369,7 @@ void ppu_reservation_fallback(ppu_thread& ppu) static std::unordered_map* s_ppu_toc; -static bool ppu_check_toc(ppu_thread& ppu, ppu_opcode_t) +static void ppu_check_toc(ppu_thread& ppu, ppu_opcode_t op, be_t* this_op, ppu_intrp_func* next_fn) { // Compare TOC with expected value const auto found = s_ppu_toc->find(ppu.cia); @@ -383,18 +380,12 @@ static bool ppu_check_toc(ppu_thread& ppu, ppu_opcode_t) if (!ppu.state.test_and_set(cpu_flag::dbg_pause) && ppu.check_state()) { - return false; + return; } } // Fallback to the interpreter function - const u64 val = ppu_cache(ppu.cia); - if (reinterpret_cast(val & 0xffffffff)(ppu, {static_cast(val >> 32)})) - { - ppu.cia += 4; - } - - return false; + return ppu_cache(ppu.cia)(ppu, op, this_op, next_fn); } extern void ppu_register_range(u32 addr, u32 size) @@ 
-417,7 +408,6 @@ extern void ppu_register_range(u32 addr, u32 size) utils::memory_commit(vm::g_stat_addr + addr, size); } - const u64 fallback = reinterpret_cast(ppu_fallback); const u64 seg_base = addr; while (size) @@ -425,11 +415,11 @@ extern void ppu_register_range(u32 addr, u32 size) if (g_cfg.core.ppu_decoder == ppu_decoder_type::llvm) { // Assume addr is the start of first segment of PRX - ppu_ref(addr) = reinterpret_cast(ppu_recompiler_fallback_ghc) | (seg_base << (32 + 3)); + ppu_ref(addr) = reinterpret_cast(reinterpret_cast(ppu_recompiler_fallback_ghc) | (seg_base << (32 + 3))); } else { - ppu_ref(addr) = fallback; + ppu_ref(addr) = ppu_fallback; } addr += 4; @@ -437,14 +427,14 @@ extern void ppu_register_range(u32 addr, u32 size) } } -static bool ppu_far_jump(ppu_thread& ppu); +static void ppu_far_jump(ppu_thread&, ppu_opcode_t, be_t*, ppu_intrp_func*); -extern void ppu_register_function_at(u32 addr, u32 size, ppu_function_t ptr = nullptr) +extern void ppu_register_function_at(u32 addr, u32 size, ppu_intrp_func_t ptr = nullptr) { // Initialize specific function if (ptr) { - ppu_ref(addr) = (reinterpret_cast(ptr) & 0x7fff'ffff'ffffu) | (ppu_ref(addr) & ~0x7fff'ffff'ffffu); + ppu_ref(addr) = reinterpret_cast((reinterpret_cast(ptr) & 0xffff'ffff'ffffu) | (uptr(ppu_ref(addr)) & ~0xffff'ffff'ffffu)); return; } @@ -464,12 +454,9 @@ extern void ppu_register_function_at(u32 addr, u32 size, ppu_function_t ptr = nu } // Initialize interpreter cache - const u64 _break = reinterpret_cast(ppu_break); - const u64 far_jump = reinterpret_cast(ppu_far_jump); - while (size) { - if (ppu_ref(addr) != _break && ppu_ref(addr) != far_jump) + if (ppu_ref(addr) != ppu_break && ppu_ref(addr) != ppu_far_jump) { ppu_ref(addr) = ppu_cache(addr); } @@ -481,12 +468,12 @@ extern void ppu_register_function_at(u32 addr, u32 size, ppu_function_t ptr = nu extern void ppu_register_function_at(u32 addr, u32 size, u64 ptr) { - return ppu_register_function_at(addr, size, reinterpret_cast(ptr)); + return ppu_register_function_at(addr, size, reinterpret_cast(ptr)); } u32 ppu_get_exported_func_addr(u32 fnid, const std::string& module_name); -bool ppu_return_from_far_jump(ppu_thread& ppu) +void ppu_return_from_far_jump(ppu_thread& ppu, ppu_opcode_t, be_t*, ppu_intrp_func*) { auto& calls_info = ppu.hle_func_calls_with_toc_info; ensure(!calls_info.empty()); @@ -498,7 +485,6 @@ bool ppu_return_from_far_jump(ppu_thread& ppu) ppu.gpr[2] = restore_info->saved_r2; calls_info.pop_back(); - return false; } static const bool s_init_return_far_jump_func = [] @@ -586,9 +572,9 @@ u32 ppu_get_far_jump(u32 pc) return g_fxo->get().get_target(pc); } -static bool ppu_far_jump(ppu_thread& ppu) +static void ppu_far_jump(ppu_thread& ppu, ppu_opcode_t, be_t* this_op, ppu_intrp_func*) { - const u32 cia = g_fxo->get().get_target(ppu.cia, &ppu); + const u32 cia = g_fxo->get().get_target(vm::get_addr(this_op), &ppu); if (!vm::check_addr(cia, vm::page_executable)) { @@ -596,7 +582,6 @@ static bool ppu_far_jump(ppu_thread& ppu) } ppu.cia = cia; - return false; } bool ppu_form_branch_to_code(u32 entry, u32 target, bool link, bool with_toc, std::string module_name) @@ -658,7 +643,7 @@ bool ppu_form_branch_to_code(u32 entry, u32 target, bool link, bool with_toc, st auto& jumps = g_fxo->get(); std::lock_guard lock(jumps.mutex); - jumps.vals.insert_or_assign(entry, std::type_identity_t{target, link, with_toc, std::move(module_name)}); + jumps.vals.insert_or_assign(entry, ppu_far_jumps_t::all_info_t{target, link, with_toc, std::move(module_name)}); 
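+	// Point this instruction's slot in the PPU executable cache (size 4 = one opcode) at ppu_far_jump, so execution reaching 'entry' takes the far-jump path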
ppu_register_function_at(entry, 4, &ppu_far_jump); return true; @@ -702,10 +687,13 @@ void ppu_remove_hle_instructions(u32 addr, u32 size) atomic_t g_debugger_pause_all_threads_on_bp = true; // Breakpoint entry point -static bool ppu_break(ppu_thread& ppu, ppu_opcode_t) +static void ppu_break(ppu_thread& ppu, ppu_opcode_t, be_t* this_op, ppu_intrp_func* next_fn) { const bool pause_all = g_debugger_pause_all_threads_on_bp; + const u32 old_cia = vm::get_addr(this_op); + ppu.cia = old_cia; + // Pause ppu.state.atomic_op([&](bs_t& state) { @@ -719,19 +707,14 @@ static bool ppu_break(ppu_thread& ppu, ppu_opcode_t) Emu.CallAfter([]() { Emu.Pause(); }); } - if (ppu.check_state()) + if (ppu.check_state() || old_cia != atomic_storage::load(ppu.cia)) { - return false; + // Do not execute if PC changed + return; } // Fallback to the interpreter function - const u64 val = ppu_cache(ppu.cia); - if (reinterpret_cast(val)(ppu, {vm::read32(ppu.cia).get()})) - { - ppu.cia += 4; - } - - return false; + return ppu_cache(ppu.cia)(ppu, {*this_op}, this_op, next_fn); } // Set or remove breakpoint @@ -742,11 +725,9 @@ extern bool ppu_breakpoint(u32 addr, bool is_adding) return false; } - const u64 _break = reinterpret_cast(&ppu_break); - // Remove breakpoint parameters - u64 to_set = 0; - u64 expected = _break; + ppu_intrp_func_t to_set = 0; + ppu_intrp_func_t expected = &ppu_break; if (u32 hle_addr{}; g_fxo->is_init() && (hle_addr = g_fxo->get().addr)) { @@ -756,7 +737,7 @@ extern bool ppu_breakpoint(u32 addr, bool is_adding) if (addr % 8 == 4 && index < ppu_function_manager::get().size()) { // HLE function placement - to_set = reinterpret_cast(ppu_function_manager::get()[index]); + to_set = ppu_function_manager::get()[index]; } } @@ -766,23 +747,21 @@ extern bool ppu_breakpoint(u32 addr, bool is_adding) to_set = ppu_cache(addr); } - u64& _ref = ppu_ref(addr); + ppu_intrp_func_t& _ref = ppu_ref(addr); if (is_adding) { // Swap if adding std::swap(to_set, expected); - const u64 _fall = reinterpret_cast(&ppu_fallback); - - if (_ref == _fall) + if (_ref == &ppu_fallback) { ppu_log.error("Unregistered instruction replaced with a breakpoint at 0x%08x", addr); - expected = _fall; + expected = ppu_fallback; } } - return atomic_storage::compare_exchange(_ref, expected, to_set); + return atomic_storage::compare_exchange(_ref, expected, to_set); } extern bool ppu_patch(u32 addr, u32 value) @@ -812,12 +791,9 @@ extern bool ppu_patch(u32 addr, u32 value) *vm::get_super_ptr(addr) = value; - const u64 _break = reinterpret_cast(&ppu_break); - const u64 fallback = reinterpret_cast(&ppu_fallback); - if (is_exec) { - if (ppu_ref(addr) != _break && ppu_ref(addr) != fallback) + if (ppu_ref(addr) != ppu_break && ppu_ref(addr) != ppu_fallback) { ppu_ref(addr) = ppu_cache(addr); } @@ -1182,10 +1158,13 @@ void ppu_thread::cpu_task() { std::fesetround(FE_TONEAREST); - if (g_cfg.core.set_daz_and_ftz && g_cfg.core.ppu_decoder != ppu_decoder_type::precise) + if (g_cfg.core.set_daz_and_ftz) { - // Set DAZ and FTZ - _mm_setcsr(_mm_getcsr() | 0x8840); + gv_set_zeroing_denormals(); + } + else + { + gv_unset_zeroing_denormals(); } // Execute cmd_queue @@ -1197,9 +1176,7 @@ void ppu_thread::cpu_task() { case ppu_cmd::opcode: { - cmd_pop(), g_cfg.core.ppu_decoder == ppu_decoder_type::precise - ? 
g_ppu_interpreter_precise.decode(arg)(*this, {arg}) - : g_ppu_interpreter_fast.decode(arg)(*this, {arg}); + cmd_pop(), g_fxo->get().decode(arg)(*this, {arg}, vm::_ptr(cia - 4), &ppu_ret); break; } case ppu_cmd::set_gpr: @@ -1236,7 +1213,7 @@ void ppu_thread::cpu_task() } case ppu_cmd::hle_call: { - cmd_pop(), ppu_function_manager::get().at(arg)(*this); + cmd_pop(), ppu_function_manager::get().at(arg)(*this, {arg}, vm::_ptr(cia - 4), &ppu_ret); break; } case ppu_cmd::opd_call: @@ -1247,8 +1224,8 @@ void ppu_thread::cpu_task() } case ppu_cmd::ptr_call: { - const ppu_function_t func = cmd_get(1).as(); - cmd_pop(1), func(*this); + const ppu_intrp_func_t func = cmd_get(1).as(); + cmd_pop(1), func(*this, {}, vm::_ptr(cia - 4), &ppu_ret); break; } case ppu_cmd::initialize: @@ -1323,7 +1300,7 @@ void ppu_thread::cpu_on_stop() void ppu_thread::exec_task() { - if (g_cfg.core.ppu_decoder == ppu_decoder_type::llvm) + if (g_cfg.core.ppu_decoder != ppu_decoder_type::_static) { while (true) { @@ -1340,79 +1317,28 @@ void ppu_thread::exec_task() } const auto cache = vm::g_exec_addr; - using func_t = decltype(&ppu_interpreter::UNK); + const auto mem_ = vm::g_base_addr; while (true) { - const auto exec_op = [this](u64 op) + if (test_stopped()) [[unlikely]] { - return reinterpret_cast(op)(*this, {vm::read32(cia).get()}); - }; - - if (cia % 8 || state) [[unlikely]] - { - if (test_stopped()) return; - - // Decode single instruction (may be step) - if (exec_op(*reinterpret_cast(cache + u64{cia} * 2))) { cia += 4; } - continue; + return; } - u64 op0, op1, op2, op3; - u64 _pos = u64{cia} * 2; + gv_zeroupper(); - // Reinitialize - { - const v128 _op0 = *reinterpret_cast(cache + _pos); - const v128 _op1 = *reinterpret_cast(cache + _pos + 16); - op0 = _op0._u64[0]; - op1 = _op0._u64[1]; - op2 = _op1._u64[0]; - op3 = _op1._u64[1]; - } - - while (exec_op(op0)) [[likely]] - { - cia += 4; - - if (exec_op(op1)) [[likely]] - { - cia += 4; - - if (exec_op(op2)) [[likely]] - { - cia += 4; - - if (exec_op(op3)) [[likely]] - { - cia += 4; - - if (state) [[unlikely]] - { - break; - } - - _pos += 32; - const v128 _op0 = *reinterpret_cast(cache + _pos); - const v128 _op1 = *reinterpret_cast(cache + _pos + 16); - op0 = _op0._u64[0]; - op1 = _op0._u64[1]; - op2 = _op1._u64[0]; - op3 = _op1._u64[1]; - continue; - } - break; - } - break; - } - break; - } + // Execute instruction (may be step; execute only one instruction if state) + const auto op = reinterpret_cast*>(mem_ + u64{cia}); + const auto fn = reinterpret_cast(cache + u64{cia} * 2); + fn->fn(*this, {*op}, op, state ? 
&ppu_ret : fn + 1); } } ppu_thread::~ppu_thread() { perf_log.notice("Perf stats for STCX reload: successs %u, failure %u", last_succ, last_fail); + perf_log.notice("Perf stats for instructions: total %u", exec_bytes / 4); } ppu_thread::ppu_thread(const ppu_thread_params& param, std::string_view name, u32 prio, int detached) @@ -1638,7 +1564,7 @@ void ppu_thread::stack_pop_verbose(u32 addr, u32 size) noexcept ppu_log.error("Invalid thread"); } -extern ppu_function_t ppu_get_syscall(u64 code); +extern ppu_intrp_func_t ppu_get_syscall(u64 code); void ppu_trap(ppu_thread& ppu, u64 addr) { @@ -1728,7 +1654,7 @@ static T ppu_load_acquire_reservation(ppu_thread& ppu, u32 addr) { const auto _inst = v128::loadu(inst + i) & mask_vec; - if (_mm_movemask_epi8(v128::eq32(_inst, store_vec).vi)) + if (!gv_testz(gv_eq32(_inst, store_vec))) { return false; } @@ -1817,10 +1743,11 @@ extern u64 ppu_ldarx(ppu_thread& ppu, u32 addr) return ppu_load_acquire_reservation(ppu, addr); } -const auto ppu_stcx_accurate_tx = built_function("ppu_stcx_accurate_tx", [](asmjit::x86::Assembler& c, auto& args) +const auto ppu_stcx_accurate_tx = built_function("ppu_stcx_accurate_tx", [](native_asm& c, auto& args) { using namespace asmjit; +#if defined(ARCH_X64) Label fall = c.newLabel(); Label fail = c.newLabel(); Label _ret = c.newLabel(); @@ -2024,6 +1951,9 @@ const auto ppu_stcx_accurate_tx = built_function @@ -2147,7 +2077,7 @@ static bool ppu_store_reservation(ppu_thread& ppu, u32 addr, u64 reg_value) utils::prefetch_read(ppu.rdata + 64); ppu.last_faddr = addr; ppu.last_ftime = res.load() & -128; - ppu.last_ftsc = __rdtsc(); + ppu.last_ftsc = utils::get_tsc(); return false; } default: @@ -2249,7 +2179,7 @@ static bool ppu_store_reservation(ppu_thread& ppu, u32 addr, u64 reg_value) ppu.last_faddr = addr; ppu.last_ftime = old_rtime & -128; - ppu.last_ftsc = __rdtsc(); + ppu.last_ftsc = utils::get_tsc(); std::memcpy(&ppu.rdata[addr & 0x78], &old_data, 8); } @@ -2286,7 +2216,7 @@ namespace // Compiled PPU module info struct jit_module { - std::vector funcs; + std::vector funcs; std::shared_ptr pjit; bool init = false; }; @@ -2829,7 +2759,7 @@ bool ppu_initialize(const ppu_module& info, bool check_only) if (g_cfg.core.ppu_debug && func.size && func.toc != umax) { s_ppu_toc->emplace(func.addr, func.toc); - ppu_ref(func.addr) = reinterpret_cast(&ppu_check_toc); + ppu_ref(func.addr) = &ppu_check_toc; } } @@ -3022,7 +2952,7 @@ bool ppu_initialize(const ppu_module& info, bool check_only) // Fixup some information entry.name = fmt::format("__0x%x", entry.addr - reloc); - if (has_mfvscr) + if (has_mfvscr && g_cfg.core.ppu_set_sat_bit) { // TODO entry.attr += ppu_attr::has_mfvscr; @@ -3139,13 +3069,15 @@ bool ppu_initialize(const ppu_module& info, bool check_only) enum class ppu_settings : u32 { non_win32, - accurate_fma, - accurate_ppu_vector_nan, - java_mode_handling, + accurate_dfma, + fixup_vnan, + accurate_jm, accurate_cache_line_stores, reservations_128_byte, greedy_mode, - has_mfvscr, + accurate_sat, + accurate_fpcc, + accurate_vnan, __bitset_enum_max }; @@ -3155,20 +3087,24 @@ bool ppu_initialize(const ppu_module& info, bool check_only) #ifndef _WIN32 settings += ppu_settings::non_win32; #endif - if (g_cfg.core.llvm_accurate_dfma) - settings += ppu_settings::accurate_fma; - if (g_cfg.core.llvm_ppu_accurate_vector_nan) - settings += ppu_settings::accurate_ppu_vector_nan; - if (g_cfg.core.llvm_ppu_jm_handling) - settings += ppu_settings::java_mode_handling; + if (g_cfg.core.use_accurate_dfma) + settings += 
ppu_settings::accurate_dfma; + if (g_cfg.core.ppu_fix_vnan) + settings += ppu_settings::fixup_vnan; + if (g_cfg.core.ppu_use_nj_bit) + settings += ppu_settings::accurate_jm; if (has_dcbz == 2) settings += ppu_settings::accurate_cache_line_stores; if (g_cfg.core.ppu_128_reservations_loop_max_length) settings += ppu_settings::reservations_128_byte; if (g_cfg.core.ppu_llvm_greedy_mode) settings += ppu_settings::greedy_mode; - if (has_mfvscr) - settings += ppu_settings::has_mfvscr; + if (has_mfvscr && g_cfg.core.ppu_set_sat_bit) + settings += ppu_settings::accurate_sat; + if (g_cfg.core.ppu_set_fpcc) + settings += ppu_settings::accurate_fpcc, fmt::throw_exception("FPCC Not implemented"); + if (g_cfg.core.ppu_set_vnan) + settings += ppu_settings::accurate_vnan, fmt::throw_exception("VNAN Not implemented"); // Write version, hash, CPU, settings fmt::append(obj_name, "v5-kusa-%s-%s-%s.obj", fmt::base57(output, 16), fmt::base57(settings), jit_compiler::cpu(g_cfg.core.llvm_cpu)); @@ -3319,10 +3255,10 @@ bool ppu_initialize(const ppu_module& info, bool check_only) if (!func.size) continue; const auto name = fmt::format("__0x%x", func.addr - reloc); - const auto addr = ensure(reinterpret_cast(jit->get(name))); + const auto addr = ensure(reinterpret_cast(jit->get(name))); jit_mod.funcs.emplace_back(addr); - if (ppu_ref(func.addr) != reinterpret_cast(ppu_far_jump)) + if (ppu_ref(func.addr) != ppu_far_jump) ppu_register_function_at(func.addr, 4, addr); if (g_cfg.core.ppu_debug) @@ -3342,7 +3278,7 @@ bool ppu_initialize(const ppu_module& info, bool check_only) const u64 addr = reinterpret_cast(ensure(jit_mod.funcs[index++])); - if (ppu_ref(func.addr) != reinterpret_cast(ppu_far_jump)) + if (ppu_ref(func.addr) != ppu_far_jump) ppu_register_function_at(func.addr, 4, addr); if (g_cfg.core.ppu_debug) diff --git a/rpcs3/Emu/Cell/PPUThread.h b/rpcs3/Emu/Cell/PPUThread.h index 324e55804d..9604961da2 100644 --- a/rpcs3/Emu/Cell/PPUThread.h +++ b/rpcs3/Emu/Cell/PPUThread.h @@ -276,6 +276,7 @@ public: u32 last_faddr = 0; u64 last_fail = 0; u64 last_succ = 0; + u64 exec_bytes = 0; // Amount of "bytes" executed (4 for each instruction) u32 dbg_step_pc = 0; diff --git a/rpcs3/Emu/Cell/PPUTranslator.cpp b/rpcs3/Emu/Cell/PPUTranslator.cpp index a8ef9f0b2c..e3aa3d5384 100644 --- a/rpcs3/Emu/Cell/PPUTranslator.cpp +++ b/rpcs3/Emu/Cell/PPUTranslator.cpp @@ -3,20 +3,19 @@ #include "Emu/system_config.h" #include "PPUTranslator.h" #include "PPUThread.h" -#include "PPUInterpreter.h" #include "util/types.hpp" #include "util/endian.hpp" #include "util/logs.hpp" #include "util/v128.hpp" -#include "util/v128sse.hpp" +#include "util/simd.hpp" #include using namespace llvm; const ppu_decoder s_ppu_decoder; -const ppu_decoder s_ppu_itype; -const ppu_decoder s_ppu_iname; +extern const ppu_decoder g_ppu_itype; +extern const ppu_decoder g_ppu_iname; PPUTranslator::PPUTranslator(LLVMContext& context, Module* _module, const ppu_module& info, ExecutionEngine& engine) : cpu_translator(_module, false) @@ -151,7 +150,7 @@ Function* PPUTranslator::Translate(const ppu_function& info) { const u32 op = vm::read32(vm::cast(addr + base)); - switch (s_ppu_itype.decode(op)) + switch (g_ppu_itype.decode(op)) { case ppu_itype::UNK: case ppu_itype::ECIWX: @@ -251,7 +250,7 @@ Function* PPUTranslator::Translate(const ppu_function& info) if (m_rel) { // This is very bad. m_rel is normally set to nullptr after a relocation is handled (so it wasn't) - ppu_log.error("LLVM: [0x%x] Unsupported relocation(%u) in '%s' (opcode=0x%x '%s'). 
Please report.", rel_found->first, m_rel->type, m_info.name, op, s_ppu_iname.decode(op)); + ppu_log.error("LLVM: [0x%x] Unsupported relocation(%u) in '%s' (opcode=0x%x '%s'). Please report.", rel_found->first, m_rel->type, m_info.name, op, g_ppu_iname.decode(op)); return nullptr; } } @@ -291,8 +290,8 @@ Value* PPUTranslator::VecHandleDenormal(Value* val) Value* PPUTranslator::VecHandleResult(Value* val) { - val = g_cfg.core.llvm_ppu_accurate_vector_nan ? VecHandleNan(val) : val; - val = g_cfg.core.llvm_ppu_jm_handling ? VecHandleDenormal(val) : val; + val = g_cfg.core.ppu_fix_vnan ? VecHandleNan(val) : val; + val = g_cfg.core.ppu_use_nj_bit ? VecHandleDenormal(val) : val; return val; } @@ -391,10 +390,10 @@ void PPUTranslator::CallFunction(u64 target, Value* indirect) const auto pos = m_ir->CreateShl(indirect, 1); const auto ptr = m_ir->CreateGEP(m_exec, pos); const auto val = m_ir->CreateLoad(m_ir->CreateBitCast(ptr, get_type())); - callee = FunctionCallee(type, m_ir->CreateIntToPtr(m_ir->CreateAnd(val, 0x7fff'ffff'ffff), type->getPointerTo())); + callee = FunctionCallee(type, m_ir->CreateIntToPtr(m_ir->CreateAnd(val, 0xffff'ffff'ffff), type->getPointerTo())); // Load new segment address - seg0 = m_ir->CreateShl(m_ir->CreateLShr(val, 47), 12); + seg0 = m_ir->CreateShl(m_ir->CreateLShr(val, 48), 13); } m_ir->SetInsertPoint(block); @@ -640,7 +639,8 @@ void PPUTranslator::CompilationError(const std::string& error) void PPUTranslator::MFVSCR(ppu_opcode_t op) { - const auto vscr = m_ir->CreateOr(ZExt(IsNotZero(RegLoad(m_sat)), GetType()), m_ir->CreateShl(ZExt(RegLoad(m_nj), GetType()), 16)); + const auto vsat = g_cfg.core.ppu_set_sat_bit ? ZExt(IsNotZero(RegLoad(m_sat)), GetType()) : m_ir->getInt32(0); + const auto vscr = m_ir->CreateOr(vsat, m_ir->CreateShl(ZExt(RegLoad(m_nj), GetType()), 16)); SetVr(op.vd, m_ir->CreateInsertElement(ConstantAggregateZero::get(GetType()), vscr, m_ir->getInt32(m_is_be ? 3 : 0))); } @@ -649,8 +649,10 @@ void PPUTranslator::MTVSCR(ppu_opcode_t op) const auto vscr = m_ir->CreateExtractElement(GetVr(op.vb, VrType::vi32), m_ir->getInt32(m_is_be ? 3 : 0)); const auto nj = Trunc(m_ir->CreateLShr(vscr, 16), GetType()); RegStore(nj, m_nj); - if (g_cfg.core.llvm_ppu_jm_handling) RegStore(m_ir->CreateSelect(nj, m_ir->getInt32(0x7f80'0000), m_ir->getInt32(0x7fff'ffff)), m_jm_mask); - RegStore(m_ir->CreateInsertElement(ConstantAggregateZero::get(GetType()), m_ir->CreateAnd(vscr, 1), m_ir->getInt32(0)), m_sat); + if (g_cfg.core.ppu_use_nj_bit) + RegStore(m_ir->CreateSelect(nj, m_ir->getInt32(0x7f80'0000), m_ir->getInt32(0x7fff'ffff)), m_jm_mask); + if (g_cfg.core.ppu_set_sat_bit) + RegStore(m_ir->CreateInsertElement(ConstantAggregateZero::get(GetType()), m_ir->CreateAnd(vscr, 1), m_ir->getInt32(0)), m_sat); } void PPUTranslator::VADDCUW(ppu_opcode_t op) @@ -902,10 +904,12 @@ void PPUTranslator::VCTSXS(ppu_opcode_t op) const auto b = get_vr(op.vb); const auto scaled = b * fsplat(std::pow(2, 0 + op.vuimm)); const auto const1 = fsplat(-std::pow(2, 31)); - //const auto is_nan = fcmp_uno(b == b); // NaN -> 0.0 - const auto sat_l = fcmp_ord(scaled < const1); // TODO ??? 
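+	// Unordered self-compare is true only for NaN lanes; used below to force NaN inputs to 0 when ppu_fix_vnan is enabled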
+ const auto is_nan = fcmp_uno(b != b); + const auto sat_l = fcmp_ord(scaled < const1); const auto sat_h = fcmp_ord(scaled >= fsplat(std::pow(2, 31))); - const auto converted = fpcast(select(sat_l, const1, scaled)); + value_t converted = eval(fpcast(select(sat_l, const1, scaled))); + if (g_cfg.core.ppu_fix_vnan) + converted = eval(select(is_nan, splat(0), converted)); // NaN -> 0 set_vr(op.vd, select(sat_h, splat(0x7fff'ffff), converted)); set_sat(sext(sat_l) | sext(sat_h)); } @@ -915,10 +919,12 @@ void PPUTranslator::VCTUXS(ppu_opcode_t op) const auto b = get_vr(op.vb); const auto scaled = b * fsplat(std::pow(2, 0 + op.vuimm)); const auto const0 = fsplat(0.); - //const auto is_nan = fcmp_uno(b == b); // NaN -> 0.0 + const auto is_nan = fcmp_uno(b != b); const auto sat_l = fcmp_ord(scaled < const0); - const auto sat_h = fcmp_ord(scaled >= fsplat(std::pow(2, 32))); // TODO ??? - const auto converted = fpcast(select(sat_l, const0, scaled)); + const auto sat_h = fcmp_ord(scaled >= fsplat(std::pow(2, 32))); + value_t converted = eval(fpcast(select(sat_l, const0, scaled))); + if (g_cfg.core.ppu_fix_vnan) + converted = eval(select(is_nan, splat(0), converted)); // NaN -> 0 set_vr(op.vd, select(sat_h, splat(0xffff'ffff), converted)); set_sat(sext(sat_l) | sext(sat_h)); } @@ -1334,7 +1340,7 @@ void PPUTranslator::VPKSHSS(ppu_opcode_t op) const auto ab = shuffle2(b, a, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); const auto r = trunc(min(max(ab, splat(-0x80)), splat(0x7f))); set_vr(op.vd, r); - set_sat(((a + 0x80) | (b + 0x80)) >> 8); + set_sat(bitcast((a + 0x80) | (b + 0x80)) >> 8); } void PPUTranslator::VPKSHUS(ppu_opcode_t op) @@ -1344,7 +1350,7 @@ void PPUTranslator::VPKSHUS(ppu_opcode_t op) const auto ab = shuffle2(b, a, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); const auto r = trunc(min(max(ab, splat(0)), splat(0xff))); set_vr(op.vd, r); - set_sat((a | b) >> 8); + set_sat(bitcast(a | b) >> 8); } void PPUTranslator::VPKSWSS(ppu_opcode_t op) @@ -1354,7 +1360,7 @@ void PPUTranslator::VPKSWSS(ppu_opcode_t op) const auto ab = shuffle2(b, a, 0, 1, 2, 3, 4, 5, 6, 7); const auto r = trunc(min(max(ab, splat(-0x8000)), splat(0x7fff))); set_vr(op.vd, r); - set_sat(((a + 0x8000) | (b + 0x8000)) >> 16); + set_sat(bitcast((a + 0x8000) | (b + 0x8000)) >> 16); } void PPUTranslator::VPKSWUS(ppu_opcode_t op) @@ -1364,7 +1370,7 @@ void PPUTranslator::VPKSWUS(ppu_opcode_t op) const auto ab = shuffle2(b, a, 0, 1, 2, 3, 4, 5, 6, 7); const auto r = trunc(min(max(ab, splat(0)), splat(0xffff))); set_vr(op.vd, r); - set_sat((a | b) >> 16); + set_sat(bitcast(a | b) >> 16); } void PPUTranslator::VPKUHUM(ppu_opcode_t op) @@ -1741,7 +1747,7 @@ void PPUTranslator::VSUMSWS(ppu_opcode_t op) const auto s = eval(x + y + z); const auto r = min(max(zshuffle(s, 0, 2) + zshuffle(s, 1, 2), splat(-0x8000'0000ll)), splat(0x7fff'ffff)); set_vr(op.vd, zshuffle(bitcast(r), 0, 4, 4, 4)); - set_sat((r + 0x8000'0000) >> 32); + set_sat(bitcast(r + 0x8000'0000) >> 32); } void PPUTranslator::VSUM2SWS(ppu_opcode_t op) @@ -1752,18 +1758,15 @@ void PPUTranslator::VSUM2SWS(ppu_opcode_t op) const auto z = b >> 32; const auto r = min(max(x + y + z, splat(-0x8000'0000ll)), splat(0x7fff'ffff)); set_vr(op.vd, zshuffle(bitcast(r), 0, 4, 2, 4)); - set_sat((r + 0x8000'0000) >> 32); + set_sat(bitcast(r + 0x8000'0000) >> 32); } void PPUTranslator::VSUM4SBS(ppu_opcode_t op) { - const auto a = get_vr(op.va); + const auto a = get_vr(op.va); const auto b = get_vr(op.vb); - const auto x = a << 24 >> 24; - const auto y = a << 16 >> 24; - 
const auto z = a << 8 >> 24; - const auto w = a >> 24; - const auto s = eval(x + y + z + w); // Can't overflow + const auto x = eval(bitcast((a << 8 >> 8) + (a >> 8))); + const auto s = eval((x << 16 >> 16) + (x >> 16)); const auto r = add_sat(s, b); set_vr(op.vd, r); set_sat(r ^ (s + b)); @@ -1773,9 +1776,7 @@ void PPUTranslator::VSUM4SHS(ppu_opcode_t op) { const auto a = get_vr(op.va); const auto b = get_vr(op.vb); - const auto x = a << 16 >> 16; - const auto y = a >> 16; - const auto s = eval(x + y); // Can't overflow + const auto s = eval((a << 16 >> 16) + (a >> 16)); const auto r = add_sat(s, b); set_vr(op.vd, r); set_sat(r ^ (s + b)); @@ -1783,13 +1784,10 @@ void PPUTranslator::VSUM4SHS(ppu_opcode_t op) void PPUTranslator::VSUM4UBS(ppu_opcode_t op) { - const auto a = get_vr(op.va); + const auto a = get_vr(op.va); const auto b = get_vr(op.vb); - const auto x = a & 0xff; - const auto y = a << 16 >> 24; - const auto z = a << 8 >> 24; - const auto w = a >> 24; - const auto s = eval(x + y + z + w); // Can't overflow + const auto x = eval(bitcast((a & 0xff) + (a >> 8))); + const auto s = eval((x & 0xffff) + (x >> 16)); const auto r = add_sat(s, b); set_vr(op.vd, r); set_sat(r ^ (s + b)); @@ -4047,7 +4045,7 @@ void PPUTranslator::FMADDS(ppu_opcode_t op) const auto c = GetFpr(op.frc); llvm::Value* result; - if (g_cfg.core.llvm_accurate_dfma) + if (g_cfg.core.use_accurate_dfma) { result = m_ir->CreateCall(get_intrinsic(llvm::Intrinsic::fma), {a, c, b}); } @@ -4075,7 +4073,7 @@ void PPUTranslator::FMSUBS(ppu_opcode_t op) const auto c = GetFpr(op.frc); llvm::Value* result; - if (g_cfg.core.llvm_accurate_dfma) + if (g_cfg.core.use_accurate_dfma) { result = m_ir->CreateCall(get_intrinsic(llvm::Intrinsic::fma), {a, c, m_ir->CreateFNeg(b)}); } @@ -4103,7 +4101,7 @@ void PPUTranslator::FNMSUBS(ppu_opcode_t op) const auto c = GetFpr(op.frc); llvm::Value* result; - if (g_cfg.core.llvm_accurate_dfma) + if (g_cfg.core.use_accurate_dfma) { result = m_ir->CreateCall(get_intrinsic(llvm::Intrinsic::fma), {a, c, m_ir->CreateFNeg(b)}); } @@ -4131,7 +4129,7 @@ void PPUTranslator::FNMADDS(ppu_opcode_t op) const auto c = GetFpr(op.frc); llvm::Value* result; - if (g_cfg.core.llvm_accurate_dfma) + if (g_cfg.core.use_accurate_dfma) { result = m_ir->CreateCall(get_intrinsic(llvm::Intrinsic::fma), {a, c, b}); } @@ -4384,7 +4382,7 @@ void PPUTranslator::FMSUB(ppu_opcode_t op) const auto c = GetFpr(op.frc); llvm::Value* result; - if (g_cfg.core.llvm_accurate_dfma) + if (g_cfg.core.use_accurate_dfma) { result = m_ir->CreateCall(get_intrinsic(llvm::Intrinsic::fma), {a, c, m_ir->CreateFNeg(b)}); } @@ -4412,7 +4410,7 @@ void PPUTranslator::FMADD(ppu_opcode_t op) const auto c = GetFpr(op.frc); llvm::Value* result; - if (g_cfg.core.llvm_accurate_dfma) + if (g_cfg.core.use_accurate_dfma) { result = m_ir->CreateCall(get_intrinsic(llvm::Intrinsic::fma), { a, c, b }); } @@ -4440,7 +4438,7 @@ void PPUTranslator::FNMSUB(ppu_opcode_t op) const auto c = GetFpr(op.frc); llvm::Value* result; - if (g_cfg.core.llvm_accurate_dfma) + if (g_cfg.core.use_accurate_dfma) { result = m_ir->CreateCall(get_intrinsic(llvm::Intrinsic::fma), {a, c, m_ir->CreateFNeg(b)}); } @@ -4468,7 +4466,7 @@ void PPUTranslator::FNMADD(ppu_opcode_t op) const auto c = GetFpr(op.frc); llvm::Value* result; - if (g_cfg.core.llvm_accurate_dfma) + if (g_cfg.core.use_accurate_dfma) { result = m_ir->CreateCall(get_intrinsic(llvm::Intrinsic::fma), {a, c, b}); } diff --git a/rpcs3/Emu/Cell/PPUTranslator.h b/rpcs3/Emu/Cell/PPUTranslator.h index 67c6c29d22..23e2fecb84 100644 
--- a/rpcs3/Emu/Cell/PPUTranslator.h +++ b/rpcs3/Emu/Cell/PPUTranslator.h @@ -358,18 +358,31 @@ public: void VCFSX(ppu_opcode_t op); void VCFUX(ppu_opcode_t op); void VCMPBFP(ppu_opcode_t op); + void VCMPBFP_(ppu_opcode_t op) { return VCMPBFP(op); } void VCMPEQFP(ppu_opcode_t op); + void VCMPEQFP_(ppu_opcode_t op) { return VCMPEQFP(op); } void VCMPEQUB(ppu_opcode_t op); + void VCMPEQUB_(ppu_opcode_t op) { return VCMPEQUB(op); } void VCMPEQUH(ppu_opcode_t op); + void VCMPEQUH_(ppu_opcode_t op) { return VCMPEQUH(op); } void VCMPEQUW(ppu_opcode_t op); + void VCMPEQUW_(ppu_opcode_t op) { return VCMPEQUW(op); } void VCMPGEFP(ppu_opcode_t op); + void VCMPGEFP_(ppu_opcode_t op) { return VCMPGEFP(op); } void VCMPGTFP(ppu_opcode_t op); + void VCMPGTFP_(ppu_opcode_t op) { return VCMPGTFP(op); } void VCMPGTSB(ppu_opcode_t op); + void VCMPGTSB_(ppu_opcode_t op) { return VCMPGTSB(op); } void VCMPGTSH(ppu_opcode_t op); + void VCMPGTSH_(ppu_opcode_t op) { return VCMPGTSH(op); } void VCMPGTSW(ppu_opcode_t op); + void VCMPGTSW_(ppu_opcode_t op) { return VCMPGTSW(op); } void VCMPGTUB(ppu_opcode_t op); + void VCMPGTUB_(ppu_opcode_t op) { return VCMPGTUB(op); } void VCMPGTUH(ppu_opcode_t op); + void VCMPGTUH_(ppu_opcode_t op) { return VCMPGTUH(op); } void VCMPGTUW(ppu_opcode_t op); + void VCMPGTUW_(ppu_opcode_t op) { return VCMPGTUW(op); } void VCTSXS(ppu_opcode_t op); void VCTUXS(ppu_opcode_t op); void VEXPTEFP(ppu_opcode_t op); @@ -717,6 +730,130 @@ public: void FCFID(ppu_opcode_t op); void UNK(ppu_opcode_t op); + + void SUBFCO(ppu_opcode_t op) { return SUBFC(op); } + void ADDCO(ppu_opcode_t op) { return ADDC(op); } + void SUBFO(ppu_opcode_t op) { return SUBF(op); } + void NEGO(ppu_opcode_t op) { return NEG(op); } + void SUBFEO(ppu_opcode_t op) { return SUBFE(op); } + void ADDEO(ppu_opcode_t op) { return ADDE(op); } + void SUBFZEO(ppu_opcode_t op) { return SUBFZE(op); } + void ADDZEO(ppu_opcode_t op) { return ADDZE(op); } + void SUBFMEO(ppu_opcode_t op) { return SUBFME(op); } + void MULLDO(ppu_opcode_t op) { return MULLD(op); } + void ADDMEO(ppu_opcode_t op) { return ADDME(op); } + void MULLWO(ppu_opcode_t op) { return MULLW(op); } + void ADDO(ppu_opcode_t op) { return ADD(op); } + void DIVDUO(ppu_opcode_t op) { return DIVDU(op); } + void DIVWUO(ppu_opcode_t op) { return DIVWU(op); } + void DIVDO(ppu_opcode_t op) { return DIVD(op); } + void DIVWO(ppu_opcode_t op) { return DIVW(op); } + + void SUBFCO_(ppu_opcode_t op) { return SUBFC(op); } + void ADDCO_(ppu_opcode_t op) { return ADDC(op); } + void SUBFO_(ppu_opcode_t op) { return SUBF(op); } + void NEGO_(ppu_opcode_t op) { return NEG(op); } + void SUBFEO_(ppu_opcode_t op) { return SUBFE(op); } + void ADDEO_(ppu_opcode_t op) { return ADDE(op); } + void SUBFZEO_(ppu_opcode_t op) { return SUBFZE(op); } + void ADDZEO_(ppu_opcode_t op) { return ADDZE(op); } + void SUBFMEO_(ppu_opcode_t op) { return SUBFME(op); } + void MULLDO_(ppu_opcode_t op) { return MULLD(op); } + void ADDMEO_(ppu_opcode_t op) { return ADDME(op); } + void MULLWO_(ppu_opcode_t op) { return MULLW(op); } + void ADDO_(ppu_opcode_t op) { return ADD(op); } + void DIVDUO_(ppu_opcode_t op) { return DIVDU(op); } + void DIVWUO_(ppu_opcode_t op) { return DIVWU(op); } + void DIVDO_(ppu_opcode_t op) { return DIVD(op); } + void DIVWO_(ppu_opcode_t op) { return DIVW(op); } + + void RLWIMI_(ppu_opcode_t op) { return RLWIMI(op); } + void RLWINM_(ppu_opcode_t op) { return RLWINM(op); } + void RLWNM_(ppu_opcode_t op) { return RLWNM(op); } + void RLDICL_(ppu_opcode_t op) { return RLDICL(op); } + void 
RLDICR_(ppu_opcode_t op) { return RLDICR(op); } + void RLDIC_(ppu_opcode_t op) { return RLDIC(op); } + void RLDIMI_(ppu_opcode_t op) { return RLDIMI(op); } + void RLDCL_(ppu_opcode_t op) { return RLDCL(op); } + void RLDCR_(ppu_opcode_t op) { return RLDCR(op); } + void SUBFC_(ppu_opcode_t op) { return SUBFC(op); } + void MULHDU_(ppu_opcode_t op) { return MULHDU(op); } + void ADDC_(ppu_opcode_t op) { return ADDC(op); } + void MULHWU_(ppu_opcode_t op) { return MULHWU(op); } + void SLW_(ppu_opcode_t op) { return SLW(op); } + void CNTLZW_(ppu_opcode_t op) { return CNTLZW(op); } + void SLD_(ppu_opcode_t op) { return SLD(op); } + void AND_(ppu_opcode_t op) { return AND(op); } + void SUBF_(ppu_opcode_t op) { return SUBF(op); } + void CNTLZD_(ppu_opcode_t op) { return CNTLZD(op); } + void ANDC_(ppu_opcode_t op) { return ANDC(op); } + void MULHD_(ppu_opcode_t op) { return MULHD(op); } + void MULHW_(ppu_opcode_t op) { return MULHW(op); } + void NEG_(ppu_opcode_t op) { return NEG(op); } + void NOR_(ppu_opcode_t op) { return NOR(op); } + void SUBFE_(ppu_opcode_t op) { return SUBFE(op); } + void ADDE_(ppu_opcode_t op) { return ADDE(op); } + void SUBFZE_(ppu_opcode_t op) { return SUBFZE(op); } + void ADDZE_(ppu_opcode_t op) { return ADDZE(op); } + void MULLD_(ppu_opcode_t op) { return MULLD(op); } + void SUBFME_(ppu_opcode_t op) { return SUBFME(op); } + void ADDME_(ppu_opcode_t op) { return ADDME(op); } + void MULLW_(ppu_opcode_t op) { return MULLW(op); } + void ADD_(ppu_opcode_t op) { return ADD(op); } + void EQV_(ppu_opcode_t op) { return EQV(op); } + void XOR_(ppu_opcode_t op) { return XOR(op); } + void ORC_(ppu_opcode_t op) { return ORC(op); } + void OR_(ppu_opcode_t op) { return OR(op); } + void DIVDU_(ppu_opcode_t op) { return DIVDU(op); } + void DIVWU_(ppu_opcode_t op) { return DIVWU(op); } + void NAND_(ppu_opcode_t op) { return NAND(op); } + void DIVD_(ppu_opcode_t op) { return DIVD(op); } + void DIVW_(ppu_opcode_t op) { return DIVW(op); } + void SRW_(ppu_opcode_t op) { return SRW(op); } + void SRD_(ppu_opcode_t op) { return SRD(op); } + void SRAW_(ppu_opcode_t op) { return SRAW(op); } + void SRAD_(ppu_opcode_t op) { return SRAD(op); } + void SRAWI_(ppu_opcode_t op) { return SRAWI(op); } + void SRADI_(ppu_opcode_t op) { return SRADI(op); } + void EXTSH_(ppu_opcode_t op) { return EXTSH(op); } + void EXTSB_(ppu_opcode_t op) { return EXTSB(op); } + void EXTSW_(ppu_opcode_t op) { return EXTSW(op); } + void FDIVS_(ppu_opcode_t op) { return FDIVS(op); } + void FSUBS_(ppu_opcode_t op) { return FSUBS(op); } + void FADDS_(ppu_opcode_t op) { return FADDS(op); } + void FSQRTS_(ppu_opcode_t op) { return FSQRTS(op); } + void FRES_(ppu_opcode_t op) { return FRES(op); } + void FMULS_(ppu_opcode_t op) { return FMULS(op); } + void FMADDS_(ppu_opcode_t op) { return FMADDS(op); } + void FMSUBS_(ppu_opcode_t op) { return FMSUBS(op); } + void FNMSUBS_(ppu_opcode_t op) { return FNMSUBS(op); } + void FNMADDS_(ppu_opcode_t op) { return FNMADDS(op); } + void MTFSB1_(ppu_opcode_t op) { return MTFSB1(op); } + void MTFSB0_(ppu_opcode_t op) { return MTFSB0(op); } + void MTFSFI_(ppu_opcode_t op) { return MTFSFI(op); } + void MFFS_(ppu_opcode_t op) { return MFFS(op); } + void MTFSF_(ppu_opcode_t op) { return MTFSF(op); } + void FRSP_(ppu_opcode_t op) { return FRSP(op); } + void FCTIW_(ppu_opcode_t op) { return FCTIW(op); } + void FCTIWZ_(ppu_opcode_t op) { return FCTIWZ(op); } + void FDIV_(ppu_opcode_t op) { return FDIV(op); } + void FSUB_(ppu_opcode_t op) { return FSUB(op); } + void FADD_(ppu_opcode_t op) { return FADD(op); } 
+ void FSQRT_(ppu_opcode_t op) { return FSQRT(op); } + void FSEL_(ppu_opcode_t op) { return FSEL(op); } + void FMUL_(ppu_opcode_t op) { return FMUL(op); } + void FRSQRTE_(ppu_opcode_t op) { return FRSQRTE(op); } + void FMSUB_(ppu_opcode_t op) { return FMSUB(op); } + void FMADD_(ppu_opcode_t op) { return FMADD(op); } + void FNMSUB_(ppu_opcode_t op) { return FNMSUB(op); } + void FNMADD_(ppu_opcode_t op) { return FNMADD(op); } + void FNEG_(ppu_opcode_t op) { return FNEG(op); } + void FMR_(ppu_opcode_t op) { return FMR(op); } + void FNABS_(ppu_opcode_t op) { return FNABS(op); } + void FABS_(ppu_opcode_t op) { return FABS(op); } + void FCTID_(ppu_opcode_t op) { return FCTID(op); } + void FCTIDZ_(ppu_opcode_t op) { return FCTIDZ(op); } + void FCFID_(ppu_opcode_t op) { return FCFID(op); } }; #endif diff --git a/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp b/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp index 8651b47d3b..e5dda98e62 100644 --- a/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp +++ b/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp @@ -13,7 +13,7 @@ #include "util/asm.hpp" #include "util/v128.hpp" -#include "util/v128sse.hpp" +#include "util/simd.hpp" #include "util/sysinfo.hpp" #include @@ -25,7 +25,6 @@ #define SPU_OFF_16(x, ...) asmjit::x86::word_ptr(*cpu, offset32(&spu_thread::x, ##__VA_ARGS__)) #define SPU_OFF_8(x, ...) asmjit::x86::byte_ptr(*cpu, offset32(&spu_thread::x, ##__VA_ARGS__)) -extern const spu_decoder g_spu_interpreter_fast{}; // TODO: avoid const spu_decoder s_spu_decoder; std::unique_ptr spu_recompiler_base::make_asmjit_recompiler() @@ -978,16 +977,6 @@ inline asmjit::x86::Mem spu_recompiler::XmmConst(const v128& data) return asmjit::x86::oword_ptr(xmm_label); } -inline asmjit::x86::Mem spu_recompiler::XmmConst(const __m128& data) -{ - return XmmConst(v128::fromF(data)); -} - -inline asmjit::x86::Mem spu_recompiler::XmmConst(const __m128i& data) -{ - return XmmConst(v128::fromV(data)); -} - inline asmjit::x86::Mem spu_recompiler::get_pc(u32 addr) { return asmjit::x86::qword_ptr(*pc0, addr - m_base); @@ -1227,7 +1216,7 @@ void spu_recompiler::branch_set_link(u32 target) void spu_recompiler::fall(spu_opcode_t op) { - auto gate = [](spu_thread* _spu, u32 opcode, spu_inter_func_t _func) + auto gate = [](spu_thread* _spu, u32 opcode, spu_intrp_func_t _func) { if (!_func(*_spu, {opcode})) { @@ -1241,7 +1230,7 @@ void spu_recompiler::fall(spu_opcode_t op) c->and_(*addr, 0x3fffc); c->mov(SPU_OFF_32(pc), *addr); c->mov(arg1->r32(), op.opcode); - c->mov(*qw0, asmjit::imm_ptr(g_spu_interpreter_fast.decode(op.opcode))); + c->mov(*qw0, asmjit::imm_ptr(g_fxo->get().decode(op.opcode))); c->mov(*arg0, *cpu); c->call(asmjit::imm_ptr(+gate)); } @@ -1541,7 +1530,7 @@ void spu_recompiler::RCHCNT(spu_opcode_t op) c->movq(vr, channel_ptr); c->psrlq(vr, spu_channel::off_count); if (inv) - c->pxor(vr, XmmConst(_mm_set1_epi32(1))); + c->pxor(vr, XmmConst(v128::from32p(1))); c->pslldq(vr, 12); c->movdqa(SPU_OFF_128(gpr, op.rt), vr); }; @@ -1570,7 +1559,7 @@ void spu_recompiler::RCHCNT(spu_opcode_t op) { const XmmLink& vr = XmmAlloc(); const XmmLink& v1 = XmmAlloc(); - c->movdqa(vr, XmmConst(_mm_set1_epi32(16))); + c->movdqa(vr, XmmConst(v128::from32p(16))); c->movd(v1, SPU_OFF_32(mfc_size)); c->psubd(vr, v1); c->pslldq(vr, 12); @@ -1669,11 +1658,11 @@ void spu_recompiler::BG(spu_opcode_t op) return; } - c->movdqa(vi, XmmConst(_mm_set1_epi32(0x80000000))); + c->movdqa(vi, XmmConst(v128::from32p(0x80000000))); c->pxor(va, vi); c->pxor(vi, SPU_OFF_128(gpr, op.rb)); c->pcmpgtd(va, vi); - c->paddd(va, 
XmmConst(_mm_set1_epi32(1))); + c->paddd(va, XmmConst(v128::from32p(1))); c->movdqa(SPU_OFF_128(gpr, op.rt), va); } @@ -1697,7 +1686,7 @@ void spu_recompiler::NOR(spu_opcode_t op) } c->por(va, SPU_OFF_128(gpr, op.rb)); - c->pxor(va, XmmConst(_mm_set1_epi32(0xffffffff))); + c->pxor(va, XmmConst(v128::from32p(0xffffffff))); c->movdqa(SPU_OFF_128(gpr, op.rt), va); } @@ -1731,10 +1720,10 @@ void spu_recompiler::ROT(spu_opcode_t op) const XmmLink& vb = XmmGet(op.rb, XmmType::Int); const XmmLink& vt = XmmAlloc(); const XmmLink& v4 = XmmAlloc(); - c->movdqa(v4, XmmConst(_mm_set1_epi32(0x1f))); + c->movdqa(v4, XmmConst(v128::from32p(0x1f))); c->pand(vb, v4); c->vpsllvd(vt, va, vb); - c->psubd(vb, XmmConst(_mm_set1_epi32(1))); + c->psubd(vb, XmmConst(v128::from32p(1))); c->pandn(vb, v4); c->vpsrlvd(va, va, vb); c->por(vt, va); @@ -1768,8 +1757,8 @@ void spu_recompiler::ROTM(spu_opcode_t op) const XmmLink& va = XmmGet(op.ra, XmmType::Int); const XmmLink& vb = XmmGet(op.rb, XmmType::Int); const XmmLink& vt = XmmAlloc(); - c->psubd(vb, XmmConst(_mm_set1_epi32(1))); - c->pandn(vb, XmmConst(_mm_set1_epi32(0x3f))); + c->psubd(vb, XmmConst(v128::from32p(1))); + c->pandn(vb, XmmConst(v128::from32p(0x3f))); c->vpsrlvd(vt, va, vb); c->movdqa(SPU_OFF_128(gpr, op.rt), vt); return; @@ -1780,11 +1769,11 @@ void spu_recompiler::ROTM(spu_opcode_t op) const XmmLink& va = XmmGet(op.ra, XmmType::Int); const XmmLink& vb = XmmGet(op.rb, XmmType::Int); const XmmLink& vt = XmmAlloc(); - c->psubd(vb, XmmConst(_mm_set1_epi32(1))); - c->pandn(vb, XmmConst(_mm_set1_epi32(0x3f))); + c->psubd(vb, XmmConst(v128::from32p(1))); + c->pandn(vb, XmmConst(v128::from32p(0x3f))); c->pxor(vt, vt); c->psubd(vt, vb); - c->pcmpgtd(vb, XmmConst(_mm_set1_epi32(31))); + c->pcmpgtd(vb, XmmConst(v128::from32p(31))); c->vpshld(vt, va, vt); c->vpandn(vt, vb, vt); c->movdqa(SPU_OFF_128(gpr, op.rt), vt); @@ -1808,8 +1797,8 @@ void spu_recompiler::ROTMA(spu_opcode_t op) const XmmLink& va = XmmGet(op.ra, XmmType::Int); const XmmLink& vb = XmmGet(op.rb, XmmType::Int); const XmmLink& vt = XmmAlloc(); - c->psubd(vb, XmmConst(_mm_set1_epi32(1))); - c->pandn(vb, XmmConst(_mm_set1_epi32(0x3f))); + c->psubd(vb, XmmConst(v128::from32p(1))); + c->pandn(vb, XmmConst(v128::from32p(0x3f))); c->vpsravd(vt, va, vb); c->movdqa(SPU_OFF_128(gpr, op.rt), vt); return; @@ -1820,10 +1809,10 @@ void spu_recompiler::ROTMA(spu_opcode_t op) const XmmLink& va = XmmGet(op.ra, XmmType::Int); const XmmLink& vb = XmmGet(op.rb, XmmType::Int); const XmmLink& vt = XmmAlloc(); - c->psubd(vb, XmmConst(_mm_set1_epi32(1))); - c->pandn(vb, XmmConst(_mm_set1_epi32(0x3f))); + c->psubd(vb, XmmConst(v128::from32p(1))); + c->pandn(vb, XmmConst(v128::from32p(0x3f))); c->pxor(vt, vt); - c->pminud(vb, XmmConst(_mm_set1_epi32(31))); + c->pminud(vb, XmmConst(v128::from32p(31))); c->psubd(vt, vb); c->vpshad(vt, va, vt); c->movdqa(SPU_OFF_128(gpr, op.rt), vt); @@ -1847,7 +1836,7 @@ void spu_recompiler::SHL(spu_opcode_t op) const XmmLink& va = XmmGet(op.ra, XmmType::Int); const XmmLink& vb = XmmGet(op.rb, XmmType::Int); const XmmLink& vt = XmmAlloc(); - c->pand(vb, XmmConst(_mm_set1_epi32(0x3f))); + c->pand(vb, XmmConst(v128::from32p(0x3f))); c->vpsllvd(vt, va, vb); c->movdqa(SPU_OFF_128(gpr, op.rt), vt); return; @@ -1858,8 +1847,8 @@ void spu_recompiler::SHL(spu_opcode_t op) const XmmLink& va = XmmGet(op.ra, XmmType::Int); const XmmLink& vb = XmmGet(op.rb, XmmType::Int); const XmmLink& vt = XmmAlloc(); - c->pand(vb, XmmConst(_mm_set1_epi32(0x3f))); - c->vpcmpgtd(vt, vb, 
XmmConst(_mm_set1_epi32(31))); + c->pand(vb, XmmConst(v128::from32p(0x3f))); + c->vpcmpgtd(vt, vb, XmmConst(v128::from32p(31))); c->vpshld(vb, va, vb); c->pandn(vt, vb); c->movdqa(SPU_OFF_128(gpr, op.rt), vt); @@ -1883,7 +1872,7 @@ void spu_recompiler::ROTH(spu_opcode_t op) //nf const XmmLink& vb = XmmGet(op.rb, XmmType::Int); const XmmLink& vt = XmmAlloc(); const XmmLink& v4 = XmmAlloc(); - c->vmovdqa(v4, XmmConst(_mm_set_epi32(0x0d0c0d0c, 0x09080908, 0x05040504, 0x01000100))); + c->vmovdqa(v4, XmmConst(v128::from32r(0x0d0c0d0c, 0x09080908, 0x05040504, 0x01000100))); c->vpshufb(vt, va, v4); // duplicate low word c->vpsrld(va, va, 16); c->vpshufb(va, va, v4); @@ -1921,8 +1910,8 @@ void spu_recompiler::ROTHM(spu_opcode_t op) const XmmLink& va = XmmGet(op.ra, XmmType::Int); const XmmLink& vb = XmmGet(op.rb, XmmType::Int); const XmmLink& vt = XmmAlloc(); - c->psubw(vb, XmmConst(_mm_set1_epi16(1))); - c->pandn(vb, XmmConst(_mm_set1_epi16(0x1f))); + c->psubw(vb, XmmConst(v128::from16p(1))); + c->pandn(vb, XmmConst(v128::from16p(0x1f))); c->vpsrlvw(vt, va, vb); c->movdqa(SPU_OFF_128(gpr, op.rt), vt); return; @@ -1935,9 +1924,9 @@ void spu_recompiler::ROTHM(spu_opcode_t op) const XmmLink& vt = XmmAlloc(); const XmmLink& v4 = XmmAlloc(); const XmmLink& v5 = XmmAlloc(); - c->psubw(vb, XmmConst(_mm_set1_epi16(1))); - c->pandn(vb, XmmConst(_mm_set1_epi16(0x1f))); - c->movdqa(vt, XmmConst(_mm_set1_epi32(0xffff0000))); // mask: select high words + c->psubw(vb, XmmConst(v128::from16p(1))); + c->pandn(vb, XmmConst(v128::from16p(0x1f))); + c->movdqa(vt, XmmConst(v128::from32p(0xffff0000))); // mask: select high words c->vpsrld(v4, vb, 16); c->vpsubusw(v5, vb, vt); // clear high words (using saturation sub for throughput) c->vpandn(vb, vt, va); // clear high words @@ -1953,11 +1942,11 @@ void spu_recompiler::ROTHM(spu_opcode_t op) const XmmLink& va = XmmGet(op.ra, XmmType::Int); const XmmLink& vb = XmmGet(op.rb, XmmType::Int); const XmmLink& vt = XmmAlloc(); - c->psubw(vb, XmmConst(_mm_set1_epi16(1))); - c->pandn(vb, XmmConst(_mm_set1_epi16(0x1f))); + c->psubw(vb, XmmConst(v128::from16p(1))); + c->pandn(vb, XmmConst(v128::from16p(0x1f))); c->pxor(vt, vt); c->psubw(vt, vb); - c->pcmpgtw(vb, XmmConst(_mm_set1_epi16(15))); + c->pcmpgtw(vb, XmmConst(v128::from16p(15))); c->vpshlw(vt, va, vt); c->vpandn(vt, vb, vt); c->movdqa(SPU_OFF_128(gpr, op.rt), vt); @@ -1981,8 +1970,8 @@ void spu_recompiler::ROTMAH(spu_opcode_t op) const XmmLink& va = XmmGet(op.ra, XmmType::Int); const XmmLink& vb = XmmGet(op.rb, XmmType::Int); const XmmLink& vt = XmmAlloc(); - c->psubw(vb, XmmConst(_mm_set1_epi16(1))); - c->pandn(vb, XmmConst(_mm_set1_epi16(0x1f))); + c->psubw(vb, XmmConst(v128::from16p(1))); + c->pandn(vb, XmmConst(v128::from16p(0x1f))); c->vpsravw(vt, va, vb); c->movdqa(SPU_OFF_128(gpr, op.rt), vt); return; @@ -1995,11 +1984,11 @@ void spu_recompiler::ROTMAH(spu_opcode_t op) const XmmLink& vt = XmmAlloc(); const XmmLink& v4 = XmmAlloc(); const XmmLink& v5 = XmmAlloc(); - c->psubw(vb, XmmConst(_mm_set1_epi16(1))); - c->movdqa(vt, XmmConst(_mm_set1_epi16(0x1f))); + c->psubw(vb, XmmConst(v128::from16p(1))); + c->movdqa(vt, XmmConst(v128::from16p(0x1f))); c->vpandn(v4, vb, vt); c->vpand(v5, vb, vt); - c->movdqa(vt, XmmConst(_mm_set1_epi32(0x2f))); + c->movdqa(vt, XmmConst(v128::from32p(0x2f))); c->vpsrld(v4, v4, 16); c->vpsubusw(v5, vt, v5); // clear high word and add 16 to low word c->vpslld(vb, va, 16); @@ -2015,10 +2004,10 @@ void spu_recompiler::ROTMAH(spu_opcode_t op) const XmmLink& va = XmmGet(op.ra, XmmType::Int); 
const XmmLink& vb = XmmGet(op.rb, XmmType::Int); const XmmLink& vt = XmmAlloc(); - c->psubw(vb, XmmConst(_mm_set1_epi16(1))); - c->pandn(vb, XmmConst(_mm_set1_epi16(0x1f))); + c->psubw(vb, XmmConst(v128::from16p(1))); + c->pandn(vb, XmmConst(v128::from16p(0x1f))); c->pxor(vt, vt); - c->pminuw(vb, XmmConst(_mm_set1_epi16(15))); + c->pminuw(vb, XmmConst(v128::from16p(15))); c->psubw(vt, vb); c->vpshaw(vt, va, vt); c->movdqa(SPU_OFF_128(gpr, op.rt), vt); @@ -2042,7 +2031,7 @@ void spu_recompiler::SHLH(spu_opcode_t op) const XmmLink& va = XmmGet(op.ra, XmmType::Int); const XmmLink& vb = XmmGet(op.rb, XmmType::Int); const XmmLink& vt = XmmAlloc(); - c->pand(vb, XmmConst(_mm_set1_epi16(0x1f))); + c->pand(vb, XmmConst(v128::from16p(0x1f))); c->vpsllvw(vt, va, vb); c->movdqa(SPU_OFF_128(gpr, op.rt), vt); return; @@ -2055,8 +2044,8 @@ void spu_recompiler::SHLH(spu_opcode_t op) const XmmLink& vt = XmmAlloc(); const XmmLink& v4 = XmmAlloc(); const XmmLink& v5 = XmmAlloc(); - c->pand(vb, XmmConst(_mm_set1_epi16(0x1f))); - c->movdqa(vt, XmmConst(_mm_set1_epi32(0xffff0000))); // mask: select high words + c->pand(vb, XmmConst(v128::from16p(0x1f))); + c->movdqa(vt, XmmConst(v128::from32p(0xffff0000))); // mask: select high words c->vpsrld(v4, vb, 16); c->vpsubusw(v5, vb, vt); // clear high words (using saturation sub for throughput) c->vpand(vb, vt, va); // clear low words @@ -2072,8 +2061,8 @@ void spu_recompiler::SHLH(spu_opcode_t op) const XmmLink& va = XmmGet(op.ra, XmmType::Int); const XmmLink& vb = XmmGet(op.rb, XmmType::Int); const XmmLink& vt = XmmAlloc(); - c->pand(vb, XmmConst(_mm_set1_epi16(0x1f))); - c->vpcmpgtw(vt, vb, XmmConst(_mm_set1_epi16(15))); + c->pand(vb, XmmConst(v128::from16p(0x1f))); + c->vpcmpgtw(vt, vb, XmmConst(v128::from16p(15))); c->vpshlw(vb, va, vb); c->pandn(vt, vb); c->movdqa(SPU_OFF_128(gpr, op.rt), vt); @@ -2216,7 +2205,7 @@ void spu_recompiler::CG(spu_opcode_t op) return; } - c->movdqa(vi, XmmConst(_mm_set1_epi32(0x80000000))); + c->movdqa(vi, XmmConst(v128::from32p(0x80000000))); c->paddd(vb, va); c->pxor(va, vi); c->pxor(vb, vi); @@ -2245,7 +2234,7 @@ void spu_recompiler::NAND(spu_opcode_t op) } c->pand(va, SPU_OFF_128(gpr, op.rb)); - c->pxor(va, XmmConst(_mm_set1_epi32(0xffffffff))); + c->pxor(va, XmmConst(v128::from32p(0xffffffff))); c->movdqa(SPU_OFF_128(gpr, op.rt), va); } @@ -2593,7 +2582,7 @@ void spu_recompiler::STQX(spu_opcode_t op) if (utils::has_ssse3()) { const XmmLink& vt = XmmGet(op.rt, XmmType::Int); - c->pshufb(vt, XmmConst(_mm_set_epi32(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f))); + c->pshufb(vt, XmmConst(v128::from32r(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f))); c->movdqa(asmjit::x86::oword_ptr(*ls, addr->r64()), vt); } else @@ -2657,7 +2646,7 @@ void spu_recompiler::BISLED(spu_opcode_t op) const XmmLink& vr = XmmAlloc(); c->lea(*qw0, get_pc(m_pos + 4)); c->movd(vr, qw0->r32()); - c->pand(vr, XmmConst(_mm_set1_epi32(0x3fffc))); + c->pand(vr, XmmConst(v128::from32p(0x3fffc))); c->pslldq(vr, 12); c->movdqa(SPU_OFF_128(gpr, op.rt), vr); @@ -2694,7 +2683,7 @@ void spu_recompiler::GBH(spu_opcode_t op) { const XmmLink& va = XmmGet(op.ra, XmmType::Int); c->psllw(va, 15); - c->packsswb(va, XmmConst(_mm_setzero_si128())); + c->packsswb(va, XmmConst({})); c->pmovmskb(*addr, va); c->pxor(va, va); c->pinsrw(va, *addr, 6); @@ -2716,7 +2705,7 @@ void spu_recompiler::FSM(spu_opcode_t op) const XmmLink& va = XmmGet(op.ra, XmmType::Int); const XmmLink& vm = XmmAlloc(); c->pshufd(va, va, 0xff); - c->movdqa(vm, XmmConst(_mm_set_epi32(8, 4, 2, 1))); + 
c->movdqa(vm, XmmConst(v128::from32r(8, 4, 2, 1))); c->pand(va, vm); c->pcmpeqd(va, vm); c->movdqa(SPU_OFF_128(gpr, op.rt), va); @@ -2728,7 +2717,7 @@ void spu_recompiler::FSMH(spu_opcode_t op) const XmmLink& vm = XmmAlloc(); c->punpckhwd(va, va); c->pshufd(va, va, 0xaa); - c->movdqa(vm, XmmConst(_mm_set_epi16(128, 64, 32, 16, 8, 4, 2, 1))); + c->movdqa(vm, XmmConst(v128::from64r(0x0080004000200010, 0x0008000400020001))); c->pand(va, vm); c->pcmpeqw(va, vm); c->movdqa(SPU_OFF_128(gpr, op.rt), va); @@ -2741,7 +2730,7 @@ void spu_recompiler::FSMB(spu_opcode_t op) if (utils::has_ssse3()) { - c->pshufb(va, XmmConst(_mm_set_epi8(13, 13, 13, 13, 13, 13, 13, 13, 12, 12, 12, 12, 12, 12, 12, 12))); + c->pshufb(va, XmmConst(v128::from64r(0x0d0d0d0d0d0d0d0d, 0x0c0c0c0c0c0c0c0c))); } else { @@ -2750,7 +2739,7 @@ void spu_recompiler::FSMB(spu_opcode_t op) c->pshufd(va, va, 0xfa); } - c->movdqa(vm, XmmConst(_mm_set_epi8(-128, 64, 32, 16, 8, 4, 2, 1, -128, 64, 32, 16, 8, 4, 2, 1))); + c->movdqa(vm, XmmConst(v128::from64p(0x8040201008040201))); c->pand(va, vm); c->pcmpeqb(va, vm); c->movdqa(SPU_OFF_128(gpr, op.rt), va); @@ -2766,7 +2755,7 @@ void spu_recompiler::FREST(spu_opcode_t op) void spu_recompiler::FRSQEST(spu_opcode_t op) { const XmmLink& va = XmmGet(op.ra, XmmType::Float); - c->andps(va, XmmConst(_mm_set1_epi32(0x7fffffff))); // abs + c->andps(va, XmmConst(v128::from32p(0x7fffffff))); // abs c->rsqrtps(va, va); c->movaps(SPU_OFF_128(gpr, op.rt), va); } @@ -2781,7 +2770,7 @@ void spu_recompiler::LQX(spu_opcode_t op) { const XmmLink& vt = XmmAlloc(); c->movdqa(vt, asmjit::x86::oword_ptr(*ls, addr->r64())); - c->pshufb(vt, XmmConst(_mm_set_epi32(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f))); + c->pshufb(vt, XmmConst(v128::from32r(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f))); c->movdqa(SPU_OFF_128(gpr, op.rt), vt); } else @@ -2848,7 +2837,7 @@ void spu_recompiler::CBX(spu_opcode_t op) c->and_(*addr, 0xf); const XmmLink& vr = XmmAlloc(); - c->movdqa(vr, XmmConst(_mm_set_epi32(0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f))); + c->movdqa(vr, XmmConst(v128::from32r(0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f))); c->movdqa(SPU_OFF_128(gpr, op.rt), vr); c->mov(asmjit::x86::byte_ptr(*cpu, addr->r64(), 0, offset32(&spu_thread::gpr, op.rt)), 0x03); } @@ -2861,7 +2850,7 @@ void spu_recompiler::CHX(spu_opcode_t op) c->and_(*addr, 0xe); const XmmLink& vr = XmmAlloc(); - c->movdqa(vr, XmmConst(_mm_set_epi32(0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f))); + c->movdqa(vr, XmmConst(v128::from32r(0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f))); c->movdqa(SPU_OFF_128(gpr, op.rt), vr); c->mov(asmjit::x86::word_ptr(*cpu, addr->r64(), 0, offset32(&spu_thread::gpr, op.rt)), 0x0203); } @@ -2874,7 +2863,7 @@ void spu_recompiler::CWX(spu_opcode_t op) c->and_(*addr, 0xc); const XmmLink& vr = XmmAlloc(); - c->movdqa(vr, XmmConst(_mm_set_epi32(0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f))); + c->movdqa(vr, XmmConst(v128::from32r(0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f))); c->movdqa(SPU_OFF_128(gpr, op.rt), vr); c->mov(asmjit::x86::dword_ptr(*cpu, addr->r64(), 0, offset32(&spu_thread::gpr, op.rt)), 0x00010203); } @@ -2887,7 +2876,7 @@ void spu_recompiler::CDX(spu_opcode_t op) c->and_(*addr, 0x8); const XmmLink& vr = XmmAlloc(); - c->movdqa(vr, XmmConst(_mm_set_epi32(0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f))); + c->movdqa(vr, XmmConst(v128::from32r(0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f))); c->movdqa(SPU_OFF_128(gpr, op.rt), vr); c->mov(*qw0, asmjit::Imm(0x0001020304050607ull)); 
c->mov(asmjit::x86::qword_ptr(*cpu, addr->r64(), 0, offset32(&spu_thread::gpr, op.rt)), *qw0); @@ -2900,8 +2889,8 @@ void spu_recompiler::ROTQBI(spu_opcode_t op) const XmmLink& vt = XmmAlloc(); const XmmLink& v4 = XmmAlloc(); c->psrldq(vb, 12); - c->pand(vb, XmmConst(_mm_set_epi64x(0, 7))); - c->movdqa(v4, XmmConst(_mm_set_epi64x(0, 64))); + c->pand(vb, XmmConst(v128::from64r(0, 7))); + c->movdqa(v4, XmmConst(v128::from64r(0, 64))); c->pshufd(vt, va, 0x4e); c->psubq(v4, vb); c->psllq(va, vb); @@ -2919,8 +2908,8 @@ void spu_recompiler::ROTQMBI(spu_opcode_t op) c->psrldq(vt, 12); c->pxor(vb, vb); c->psubq(vb, vt); - c->pand(vb, XmmConst(_mm_set_epi64x(0, 7))); - c->movdqa(v4, XmmConst(_mm_set_epi64x(0, 64))); + c->pand(vb, XmmConst(v128::from64r(0, 7))); + c->movdqa(v4, XmmConst(v128::from64r(0, 64))); c->movdqa(vt, va); c->psrldq(vt, 8); c->psubq(v4, vb); @@ -2937,8 +2926,8 @@ void spu_recompiler::SHLQBI(spu_opcode_t op) const XmmLink& vt = XmmAlloc(); const XmmLink& v4 = XmmAlloc(); c->psrldq(vb, 12); - c->pand(vb, XmmConst(_mm_set_epi64x(0, 7))); - c->movdqa(v4, XmmConst(_mm_set_epi64x(0, 64))); + c->pand(vb, XmmConst(v128::from64r(0, 7))); + c->movdqa(v4, XmmConst(v128::from64r(0, 64))); c->movdqa(vt, va); c->pslldq(vt, 8); c->psubq(v4, vb); @@ -3014,7 +3003,7 @@ void spu_recompiler::CBD(spu_opcode_t op) //{ // // assuming that SP % 16 is always zero // const XmmLink& vr = XmmAlloc(); - // v128 value = v128::fromV(_mm_set_epi32(0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f)); + // v128 value = v128::fromV(v128::from32r(0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f)); // value.u8r[op.i7 & 0xf] = 0x03; // c->movdqa(vr, XmmConst(value)); // c->movdqa(SPU_OFF_128(gpr, op.rt), vr); @@ -3027,7 +3016,7 @@ void spu_recompiler::CBD(spu_opcode_t op) c->and_(*addr, 0xf); const XmmLink& vr = XmmAlloc(); - c->movdqa(vr, XmmConst(_mm_set_epi32(0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f))); + c->movdqa(vr, XmmConst(v128::from32r(0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f))); c->movdqa(SPU_OFF_128(gpr, op.rt), vr); c->mov(asmjit::x86::byte_ptr(*cpu, addr->r64(), 0, offset32(&spu_thread::gpr, op.rt)), 0x03); } @@ -3038,7 +3027,7 @@ void spu_recompiler::CHD(spu_opcode_t op) //{ // // assuming that SP % 16 is always zero // const XmmLink& vr = XmmAlloc(); - // v128 value = v128::fromV(_mm_set_epi32(0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f)); + // v128 value = v128::fromV(v128::from32r(0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f)); // value.u16r[(op.i7 >> 1) & 0x7] = 0x0203; // c->movdqa(vr, XmmConst(value)); // c->movdqa(SPU_OFF_128(gpr, op.rt), vr); @@ -3051,7 +3040,7 @@ void spu_recompiler::CHD(spu_opcode_t op) c->and_(*addr, 0xe); const XmmLink& vr = XmmAlloc(); - c->movdqa(vr, XmmConst(_mm_set_epi32(0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f))); + c->movdqa(vr, XmmConst(v128::from32r(0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f))); c->movdqa(SPU_OFF_128(gpr, op.rt), vr); c->mov(asmjit::x86::word_ptr(*cpu, addr->r64(), 0, offset32(&spu_thread::gpr, op.rt)), 0x0203); } @@ -3062,7 +3051,7 @@ void spu_recompiler::CWD(spu_opcode_t op) //{ // // assuming that SP % 16 is always zero // const XmmLink& vr = XmmAlloc(); - // v128 value = v128::fromV(_mm_set_epi32(0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f)); + // v128 value = v128::fromV(v128::from32r(0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f)); // value.u32r[(op.i7 >> 2) & 0x3] = 0x00010203; // c->movdqa(vr, XmmConst(value)); // c->movdqa(SPU_OFF_128(gpr, op.rt), vr); @@ -3075,7 +3064,7 @@ void spu_recompiler::CWD(spu_opcode_t 
op) c->and_(*addr, 0xc); const XmmLink& vr = XmmAlloc(); - c->movdqa(vr, XmmConst(_mm_set_epi32(0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f))); + c->movdqa(vr, XmmConst(v128::from32r(0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f))); c->movdqa(SPU_OFF_128(gpr, op.rt), vr); c->mov(asmjit::x86::dword_ptr(*cpu, addr->r64(), 0, offset32(&spu_thread::gpr, op.rt)), 0x00010203); } @@ -3086,7 +3075,7 @@ void spu_recompiler::CDD(spu_opcode_t op) //{ // // assuming that SP % 16 is always zero // const XmmLink& vr = XmmAlloc(); - // v128 value = v128::fromV(_mm_set_epi32(0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f)); + // v128 value = v128::fromV(v128::from32r(0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f)); // value.u64r[(op.i7 >> 3) & 0x1] = 0x0001020304050607ull; // c->movdqa(vr, XmmConst(value)); // c->movdqa(SPU_OFF_128(gpr, op.rt), vr); @@ -3099,7 +3088,7 @@ void spu_recompiler::CDD(spu_opcode_t op) c->and_(*addr, 0x8); const XmmLink& vr = XmmAlloc(); - c->movdqa(vr, XmmConst(_mm_set_epi32(0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f))); + c->movdqa(vr, XmmConst(v128::from32r(0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f))); c->movdqa(SPU_OFF_128(gpr, op.rt), vr); c->mov(*qw0, asmjit::Imm(0x0001020304050607ull)); c->mov(asmjit::x86::qword_ptr(*cpu, addr->r64(), 0, offset32(&spu_thread::gpr, op.rt)), *qw0); @@ -3221,7 +3210,7 @@ void spu_recompiler::EQV(spu_opcode_t op) return; } - c->pxor(vb, XmmConst(_mm_set1_epi32(0xffffffff))); + c->pxor(vb, XmmConst(v128::from32p(0xffffffff))); c->pxor(vb, SPU_OFF_128(gpr, op.ra)); c->movdqa(SPU_OFF_128(gpr, op.rt), vb); } @@ -3239,7 +3228,7 @@ void spu_recompiler::SUMB(spu_opcode_t op) const XmmLink& vb = XmmGet(op.rb, XmmType::Int); const XmmLink& v1 = XmmAlloc(); const XmmLink& v2 = XmmAlloc(); - c->movdqa(v2, XmmConst(_mm_set1_epi16(0xff))); + c->movdqa(v2, XmmConst(v128::from16p(0xff))); c->movdqa(v1, va); c->psrlw(va, 8); c->pand(v1, v2); @@ -3247,7 +3236,7 @@ void spu_recompiler::SUMB(spu_opcode_t op) c->psrlw(vb, 8); c->paddw(va, v1); c->paddw(vb, v2); - c->movdqa(v2, XmmConst(_mm_set1_epi32(0xffff))); + c->movdqa(v2, XmmConst(v128::from32p(0xffff))); c->movdqa(v1, va); c->psrld(va, 16); c->pand(v1, v2); @@ -3322,19 +3311,19 @@ void spu_recompiler::CNTB(spu_opcode_t op) const XmmLink& va = XmmGet(op.ra, XmmType::Int); const XmmLink& v1 = XmmAlloc(); const XmmLink& vm = XmmAlloc(); - c->movdqa(vm, XmmConst(_mm_set1_epi8(0x55))); + c->movdqa(vm, XmmConst(v128::from8p(0x55))); c->movdqa(v1, va); c->pand(va, vm); c->psrlq(v1, 1); c->pand(v1, vm); c->paddb(va, v1); - c->movdqa(vm, XmmConst(_mm_set1_epi8(0x33))); + c->movdqa(vm, XmmConst(v128::from8p(0x33))); c->movdqa(v1, va); c->pand(va, vm); c->psrlq(v1, 2); c->pand(v1, vm); c->paddb(va, v1); - c->movdqa(vm, XmmConst(_mm_set1_epi8(0x0f))); + c->movdqa(vm, XmmConst(v128::from8p(0x0f))); c->movdqa(v1, va); c->pand(va, vm); c->psrlq(v1, 4); @@ -3356,7 +3345,7 @@ void spu_recompiler::CLGT(spu_opcode_t op) // compare if-greater-than const XmmLink& va = XmmGet(op.ra, XmmType::Int); const XmmLink& vi = XmmAlloc(); - c->movdqa(vi, XmmConst(_mm_set1_epi32(0x80000000))); + c->movdqa(vi, XmmConst(v128::from32p(0x80000000))); c->pxor(va, vi); c->pxor(vi, SPU_OFF_128(gpr, op.rb)); c->pcmpgtd(va, vi); @@ -3373,8 +3362,8 @@ void spu_recompiler::ANDC(spu_opcode_t op) void spu_recompiler::FCGT(spu_opcode_t op) { - const auto last_exp_bit = XmmConst(_mm_set1_epi32(0x00800000)); - const auto all_exp_bits = XmmConst(_mm_set1_epi32(0x7f800000)); + const auto last_exp_bit = XmmConst(v128::from32p(0x00800000)); + 
const auto all_exp_bits = XmmConst(v128::from32p(0x7f800000)); const XmmLink& tmp0 = XmmAlloc(); const XmmLink& tmp1 = XmmAlloc(); @@ -3441,8 +3430,8 @@ void spu_recompiler::FS(spu_opcode_t op) void spu_recompiler::FM(spu_opcode_t op) { - const auto sign_bits = XmmConst(_mm_set1_epi32(0x80000000)); - const auto all_exp_bits = XmmConst(_mm_set1_epi32(0x7f800000)); + const auto sign_bits = XmmConst(v128::from32p(0x80000000)); + const auto all_exp_bits = XmmConst(v128::from32p(0x7f800000)); const XmmLink& tmp0 = XmmAlloc(); const XmmLink& tmp1 = XmmAlloc(); @@ -3497,7 +3486,7 @@ void spu_recompiler::CLGTH(spu_opcode_t op) // compare if-greater-than const XmmLink& va = XmmGet(op.ra, XmmType::Int); const XmmLink& vi = XmmAlloc(); - c->movdqa(vi, XmmConst(_mm_set1_epi16(smin))); + c->movdqa(vi, XmmConst(v128::from16p(0x8000))); c->pxor(va, vi); c->pxor(vi, SPU_OFF_128(gpr, op.rb)); c->pcmpgtw(va, vi); @@ -3515,7 +3504,7 @@ void spu_recompiler::ORC(spu_opcode_t op) return; } - c->pxor(vb, XmmConst(_mm_set1_epi32(0xffffffff))); + c->pxor(vb, XmmConst(v128::from32p(0xffffffff))); c->por(vb, SPU_OFF_128(gpr, op.ra)); c->movdqa(SPU_OFF_128(gpr, op.rt), vb); } @@ -3525,8 +3514,8 @@ void spu_recompiler::FCMGT(spu_opcode_t op) // reverted less-than // since comparison is absoulte, a > b if a is extended and b is not extended // flush denormals to zero to make zero == zero work - const auto all_exp_bits = XmmConst(_mm_set1_epi32(0x7f800000)); - const auto remove_sign_bits = XmmConst(_mm_set1_epi32(0x7fffffff)); + const auto all_exp_bits = XmmConst(v128::from32p(0x7f800000)); + const auto remove_sign_bits = XmmConst(v128::from32p(0x7fffffff)); const XmmLink& tmp0 = XmmAlloc(); const XmmLink& tmp1 = XmmAlloc(); @@ -3593,7 +3582,7 @@ void spu_recompiler::CLGTB(spu_opcode_t op) // compare if-greater-than const XmmLink& va = XmmGet(op.ra, XmmType::Int); const XmmLink& vi = XmmAlloc(); - c->movdqa(vi, XmmConst(_mm_set1_epi8(smin))); + c->movdqa(vi, XmmConst(v128::from8p(0x80))); c->pxor(va, vi); c->pxor(vi, SPU_OFF_128(gpr, op.rb)); c->pcmpgtb(va, vi); @@ -3654,7 +3643,7 @@ void spu_recompiler::DFNMA(spu_opcode_t op) const XmmLink& vt = XmmGet(op.rt, XmmType::Double); c->mulpd(va, SPU_OFF_128(gpr, op.rb)); c->addpd(va, vt); - c->xorpd(va, XmmConst(_mm_set1_epi64x(0x8000000000000000))); + c->xorpd(va, XmmConst(v128::from64p(0x8000000000000000))); c->movapd(SPU_OFF_128(gpr, op.rt), va); } @@ -3673,7 +3662,7 @@ void spu_recompiler::MPYHHU(spu_opcode_t op) c->movdqa(va2, va); c->pmulhuw(va, vb); c->pmullw(va2, vb); - c->pand(va, XmmConst(_mm_set1_epi32(0xffff0000))); + c->pand(va, XmmConst(v128::from32p(0xffff0000))); c->psrld(va2, 16); c->por(va, va2); c->movdqa(SPU_OFF_128(gpr, op.rt), va); @@ -3682,7 +3671,7 @@ void spu_recompiler::MPYHHU(spu_opcode_t op) void spu_recompiler::ADDX(spu_opcode_t op) { const XmmLink& vt = XmmGet(op.rt, XmmType::Int); - c->pand(vt, XmmConst(_mm_set1_epi32(1))); + c->pand(vt, XmmConst(v128::from32p(1))); c->paddd(vt, SPU_OFF_128(gpr, op.ra)); c->paddd(vt, SPU_OFF_128(gpr, op.rb)); c->movdqa(SPU_OFF_128(gpr, op.rt), vt); @@ -3692,7 +3681,7 @@ void spu_recompiler::SFX(spu_opcode_t op) { const XmmLink& vt = XmmGet(op.rt, XmmType::Int); const XmmLink& vb = XmmGet(op.rb, XmmType::Int); - c->pandn(vt, XmmConst(_mm_set1_epi32(1))); + c->pandn(vt, XmmConst(v128::from32p(1))); c->psubd(vb, SPU_OFF_128(gpr, op.ra)); c->psubd(vb, vt); c->movdqa(SPU_OFF_128(gpr, op.rt), vb); @@ -3719,7 +3708,7 @@ void spu_recompiler::CGX(spu_opcode_t op) //nf c->paddd(res, vb); } - c->movdqa(sign, 
XmmConst(_mm_set1_epi32(smin))); + c->movdqa(sign, XmmConst(v128::from32p(0x80000000))); c->pxor(va, sign); c->pxor(res, sign); c->pcmpgtd(va, res); @@ -3752,7 +3741,7 @@ void spu_recompiler::BGX(spu_opcode_t op) //nf } c->pand(vt, temp); - c->movdqa(sign, XmmConst(_mm_set1_epi32(smin))); + c->movdqa(sign, XmmConst(v128::from32p(0x80000000))); c->pxor(va, sign); c->pxor(vb, sign); c->pcmpgtd(vb, va); @@ -3782,7 +3771,7 @@ void spu_recompiler::MPYHHAU(spu_opcode_t op) c->movdqa(va2, va); c->pmulhuw(va, vb); c->pmullw(va2, vb); - c->pand(va, XmmConst(_mm_set1_epi32(0xffff0000))); + c->pand(va, XmmConst(v128::from32p(0xffff0000))); c->psrld(va2, 16); c->paddd(vt, va); c->paddd(vt, va2); @@ -3841,7 +3830,7 @@ void spu_recompiler::MPY(spu_opcode_t op) const XmmLink& va = XmmGet(op.ra, XmmType::Int); const XmmLink& vb = XmmGet(op.rb, XmmType::Int); const XmmLink& vi = XmmAlloc(); - c->movdqa(vi, XmmConst(_mm_set1_epi32(0xffff))); + c->movdqa(vi, XmmConst(v128::from32p(0xffff))); c->pand(va, vi); c->pand(vb, vi); c->pmaddwd(va, vb); @@ -3889,7 +3878,7 @@ void spu_recompiler::FCMEQ(spu_opcode_t op) { const XmmLink& vb = XmmGet(op.rb, XmmType::Float); const XmmLink& vi = XmmAlloc(); - c->movaps(vi, XmmConst(_mm_set1_epi32(0x7fffffff))); + c->movaps(vi, XmmConst(v128::from32p(0x7fffffff))); c->andps(vb, vi); // abs c->andps(vi, SPU_OFF_128(gpr, op.ra)); c->cmpps(vb, vi, 0); // == @@ -3910,7 +3899,7 @@ void spu_recompiler::MPYU(spu_opcode_t op) c->pmulhuw(va, vb); c->pmullw(va2, vb); c->pslld(va, 16); - c->pand(va2, XmmConst(_mm_set1_epi32(0xffff))); + c->pand(va2, XmmConst(v128::from32p(0xffff))); c->por(va, va2); c->movdqa(SPU_OFF_128(gpr, op.rt), va); } @@ -3954,8 +3943,8 @@ void spu_recompiler::CFLTS(spu_opcode_t op) { const XmmLink& va = XmmGet(op.ra, XmmType::Float); const XmmLink& vi = XmmAlloc(); - if (op.i8 != 173) c->mulps(va, XmmConst(_mm_set1_ps(std::exp2(static_cast(static_cast(173 - op.i8)))))); // scale - c->movaps(vi, XmmConst(_mm_set1_ps(std::exp2(31.f)))); + if (op.i8 != 173) c->mulps(va, XmmConst(v128::fromf32p(std::exp2(static_cast(static_cast(173 - op.i8)))))); // scale + c->movaps(vi, XmmConst(v128::fromf32p(std::exp2(31.f)))); c->cmpps(vi, va, 2); c->cvttps2dq(va, va); // convert to ints with truncation c->pxor(va, vi); // fix result saturation (0x80000000 -> 0x7fffffff) @@ -3968,7 +3957,7 @@ void spu_recompiler::CFLTU(spu_opcode_t op) const XmmLink& vs = XmmAlloc(); const XmmLink& vs2 = XmmAlloc(); const XmmLink& vs3 = XmmAlloc(); - if (op.i8 != 173) c->mulps(va, XmmConst(_mm_set1_ps(std::exp2(static_cast(static_cast(173 - op.i8)))))); // scale + if (op.i8 != 173) c->mulps(va, XmmConst(v128::fromf32p(std::exp2(static_cast(static_cast(173 - op.i8)))))); // scale if (utils::has_avx512()) { @@ -3984,12 +3973,12 @@ void spu_recompiler::CFLTU(spu_opcode_t op) c->andnps(va, vs); c->movaps(vs, va); // copy scaled value c->movaps(vs2, va); - c->movaps(vs3, XmmConst(_mm_set1_ps(std::exp2(31.f)))); + c->movaps(vs3, XmmConst(v128::fromf32p(std::exp2(31.f)))); c->subps(vs2, vs3); c->cmpps(vs3, vs, 2); c->andps(vs2, vs3); c->cvttps2dq(va, va); - c->cmpps(vs, XmmConst(_mm_set1_ps(std::exp2(32.f))), 5); + c->cmpps(vs, XmmConst(v128::fromf32p(std::exp2(32.f))), 5); c->cvttps2dq(vs2, vs2); c->por(va, vs); c->por(va, vs2); @@ -4000,7 +3989,7 @@ void spu_recompiler::CSFLT(spu_opcode_t op) { const XmmLink& va = XmmGet(op.ra, XmmType::Int); c->cvtdq2ps(va, va); // convert to floats - if (op.i8 != 155) c->mulps(va, XmmConst(_mm_set1_ps(std::exp2(static_cast(static_cast(op.i8 - 155)))))); // scale + 
if (op.i8 != 155) c->mulps(va, XmmConst(v128::fromf32p(std::exp2(static_cast(static_cast(op.i8 - 155)))))); // scale c->movaps(SPU_OFF_128(gpr, op.rt), va); } @@ -4016,14 +4005,14 @@ void spu_recompiler::CUFLT(spu_opcode_t op) else { c->movdqa(v1, va); - c->pand(va, XmmConst(_mm_set1_epi32(0x7fffffff))); + c->pand(va, XmmConst(v128::from32p(0x7fffffff))); c->cvtdq2ps(va, va); // convert to floats c->psrad(v1, 31); // generate mask from sign bit - c->andps(v1, XmmConst(_mm_set1_ps(std::exp2(31.f)))); // generate correction component + c->andps(v1, XmmConst(v128::fromf32p(std::exp2(31.f)))); // generate correction component c->addps(va, v1); // add correction component } - if (op.i8 != 155) c->mulps(va, XmmConst(_mm_set1_ps(std::exp2(static_cast(static_cast(op.i8 - 155)))))); // scale + if (op.i8 != 155) c->mulps(va, XmmConst(v128::fromf32p(std::exp2(static_cast(static_cast(op.i8 - 155)))))); // scale c->movaps(SPU_OFF_128(gpr, op.rt), va); } @@ -4053,7 +4042,7 @@ void spu_recompiler::STQA(spu_opcode_t op) if (utils::has_ssse3()) { const XmmLink& vt = XmmGet(op.rt, XmmType::Int); - c->pshufb(vt, XmmConst(_mm_set_epi32(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f))); + c->pshufb(vt, XmmConst(v128::from32r(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f))); c->movdqa(asmjit::x86::oword_ptr(*ls, spu_ls_target(0, op.i16)), vt); } else @@ -4138,7 +4127,7 @@ void spu_recompiler::STQR(spu_opcode_t op) if (utils::has_ssse3()) { const XmmLink& vt = XmmGet(op.rt, XmmType::Int); - c->pshufb(vt, XmmConst(_mm_set_epi32(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f))); + c->pshufb(vt, XmmConst(v128::from32r(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f))); c->movdqa(asmjit::x86::oword_ptr(*ls, addr->r64()), vt); } else @@ -4166,7 +4155,7 @@ void spu_recompiler::LQA(spu_opcode_t op) { const XmmLink& vt = XmmAlloc(); c->movdqa(vt, asmjit::x86::oword_ptr(*ls, spu_ls_target(0, op.i16))); - c->pshufb(vt, XmmConst(_mm_set_epi32(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f))); + c->pshufb(vt, XmmConst(v128::from32r(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f))); c->movdqa(SPU_OFF_128(gpr, op.rt), vt); } else @@ -4246,7 +4235,7 @@ void spu_recompiler::LQR(spu_opcode_t op) { const XmmLink& vt = XmmAlloc(); c->movdqa(vt, asmjit::x86::oword_ptr(*ls, addr->r64())); - c->pshufb(vt, XmmConst(_mm_set_epi32(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f))); + c->pshufb(vt, XmmConst(v128::from32r(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f))); c->movdqa(SPU_OFF_128(gpr, op.rt), vt); } else @@ -4263,56 +4252,56 @@ void spu_recompiler::LQR(spu_opcode_t op) void spu_recompiler::IL(spu_opcode_t op) { const XmmLink& vr = XmmAlloc(); - c->movdqa(vr, XmmConst(_mm_set1_epi32(op.si16))); + c->movdqa(vr, XmmConst(v128::from32p(op.si16))); c->movdqa(SPU_OFF_128(gpr, op.rt), vr); } void spu_recompiler::ILHU(spu_opcode_t op) { const XmmLink& vr = XmmAlloc(); - c->movdqa(vr, XmmConst(_mm_set1_epi32(op.i16 << 16))); + c->movdqa(vr, XmmConst(v128::from32p(op.i16 << 16))); c->movdqa(SPU_OFF_128(gpr, op.rt), vr); } void spu_recompiler::ILH(spu_opcode_t op) { const XmmLink& vr = XmmAlloc(); - c->movdqa(vr, XmmConst(_mm_set1_epi16(op.i16))); + c->movdqa(vr, XmmConst(v128::from16p(op.i16))); c->movdqa(SPU_OFF_128(gpr, op.rt), vr); } void spu_recompiler::IOHL(spu_opcode_t op) { const XmmLink& vt = XmmGet(op.rt, XmmType::Int); - c->por(vt, XmmConst(_mm_set1_epi32(op.i16))); + c->por(vt, XmmConst(v128::from32p(op.i16))); c->movdqa(SPU_OFF_128(gpr, op.rt), vt); } void spu_recompiler::ORI(spu_opcode_t op) { const XmmLink& va = 
XmmGet(op.ra, XmmType::Int); - if (op.si10) c->por(va, XmmConst(_mm_set1_epi32(op.si10))); + if (op.si10) c->por(va, XmmConst(v128::from32p(op.si10))); c->movdqa(SPU_OFF_128(gpr, op.rt), va); } void spu_recompiler::ORHI(spu_opcode_t op) { const XmmLink& va = XmmGet(op.ra, XmmType::Int); - c->por(va, XmmConst(_mm_set1_epi16(op.si10))); + c->por(va, XmmConst(v128::from16p(op.si10))); c->movdqa(SPU_OFF_128(gpr, op.rt), va); } void spu_recompiler::ORBI(spu_opcode_t op) { const XmmLink& va = XmmGet(op.ra, XmmType::Int); - c->por(va, XmmConst(_mm_set1_epi8(op.si10))); + c->por(va, XmmConst(v128::from8p(op.si10))); c->movdqa(SPU_OFF_128(gpr, op.rt), va); } void spu_recompiler::SFI(spu_opcode_t op) { const XmmLink& vr = XmmAlloc(); - c->movdqa(vr, XmmConst(_mm_set1_epi32(op.si10))); + c->movdqa(vr, XmmConst(v128::from32p(op.si10))); c->psubd(vr, SPU_OFF_128(gpr, op.ra)); c->movdqa(SPU_OFF_128(gpr, op.rt), vr); } @@ -4320,7 +4309,7 @@ void spu_recompiler::SFI(spu_opcode_t op) void spu_recompiler::SFHI(spu_opcode_t op) { const XmmLink& vr = XmmAlloc(); - c->movdqa(vr, XmmConst(_mm_set1_epi16(op.si10))); + c->movdqa(vr, XmmConst(v128::from16p(op.si10))); c->psubw(vr, SPU_OFF_128(gpr, op.ra)); c->movdqa(SPU_OFF_128(gpr, op.rt), vr); } @@ -4328,21 +4317,21 @@ void spu_recompiler::SFHI(spu_opcode_t op) void spu_recompiler::ANDI(spu_opcode_t op) { const XmmLink& va = XmmGet(op.ra, XmmType::Int); - c->pand(va, XmmConst(_mm_set1_epi32(op.si10))); + c->pand(va, XmmConst(v128::from32p(op.si10))); c->movdqa(SPU_OFF_128(gpr, op.rt), va); } void spu_recompiler::ANDHI(spu_opcode_t op) { const XmmLink& va = XmmGet(op.ra, XmmType::Int); - c->pand(va, XmmConst(_mm_set1_epi16(op.si10))); + c->pand(va, XmmConst(v128::from16p(op.si10))); c->movdqa(SPU_OFF_128(gpr, op.rt), va); } void spu_recompiler::ANDBI(spu_opcode_t op) { const XmmLink& va = XmmGet(op.ra, XmmType::Int); - c->pand(va, XmmConst(_mm_set1_epi8(op.si10))); + c->pand(va, XmmConst(v128::from8p(op.si10))); c->movdqa(SPU_OFF_128(gpr, op.rt), va); } @@ -4350,7 +4339,7 @@ void spu_recompiler::AI(spu_opcode_t op) { // add const XmmLink& va = XmmGet(op.ra, XmmType::Int); - c->paddd(va, XmmConst(_mm_set1_epi32(op.si10))); + c->paddd(va, XmmConst(v128::from32p(op.si10))); c->movdqa(SPU_OFF_128(gpr, op.rt), va); } @@ -4358,7 +4347,7 @@ void spu_recompiler::AHI(spu_opcode_t op) { // add const XmmLink& va = XmmGet(op.ra, XmmType::Int); - c->paddw(va, XmmConst(_mm_set1_epi16(op.si10))); + c->paddw(va, XmmConst(v128::from16p(op.si10))); c->movdqa(SPU_OFF_128(gpr, op.rt), va); } @@ -4371,7 +4360,7 @@ void spu_recompiler::STQD(spu_opcode_t op) if (utils::has_ssse3()) { const XmmLink& vt = XmmGet(op.rt, XmmType::Int); - c->pshufb(vt, XmmConst(_mm_set_epi32(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f))); + c->pshufb(vt, XmmConst(v128::from32r(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f))); c->movdqa(asmjit::x86::oword_ptr(*ls, addr->r64()), vt); } else @@ -4395,7 +4384,7 @@ void spu_recompiler::LQD(spu_opcode_t op) { const XmmLink& vt = XmmAlloc(); c->movdqa(vt, asmjit::x86::oword_ptr(*ls, addr->r64())); - c->pshufb(vt, XmmConst(_mm_set_epi32(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f))); + c->pshufb(vt, XmmConst(v128::from32r(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f))); c->movdqa(SPU_OFF_128(gpr, op.rt), vt); } else @@ -4412,42 +4401,42 @@ void spu_recompiler::LQD(spu_opcode_t op) void spu_recompiler::XORI(spu_opcode_t op) { const XmmLink& va = XmmGet(op.ra, XmmType::Int); - c->pxor(va, XmmConst(_mm_set1_epi32(op.si10))); + c->pxor(va, 
XmmConst(v128::from32p(op.si10))); c->movdqa(SPU_OFF_128(gpr, op.rt), va); } void spu_recompiler::XORHI(spu_opcode_t op) { const XmmLink& va = XmmGet(op.ra, XmmType::Int); - c->pxor(va, XmmConst(_mm_set1_epi16(op.si10))); + c->pxor(va, XmmConst(v128::from16p(op.si10))); c->movdqa(SPU_OFF_128(gpr, op.rt), va); } void spu_recompiler::XORBI(spu_opcode_t op) { const XmmLink& va = XmmGet(op.ra, XmmType::Int); - c->pxor(va, XmmConst(_mm_set1_epi8(op.si10))); + c->pxor(va, XmmConst(v128::from8p(op.si10))); c->movdqa(SPU_OFF_128(gpr, op.rt), va); } void spu_recompiler::CGTI(spu_opcode_t op) { const XmmLink& va = XmmGet(op.ra, XmmType::Int); - c->pcmpgtd(va, XmmConst(_mm_set1_epi32(op.si10))); + c->pcmpgtd(va, XmmConst(v128::from32p(op.si10))); c->movdqa(SPU_OFF_128(gpr, op.rt), va); } void spu_recompiler::CGTHI(spu_opcode_t op) { const XmmLink& va = XmmGet(op.ra, XmmType::Int); - c->pcmpgtw(va, XmmConst(_mm_set1_epi16(op.si10))); + c->pcmpgtw(va, XmmConst(v128::from16p(op.si10))); c->movdqa(SPU_OFF_128(gpr, op.rt), va); } void spu_recompiler::CGTBI(spu_opcode_t op) { const XmmLink& va = XmmGet(op.ra, XmmType::Int); - c->pcmpgtb(va, XmmConst(_mm_set1_epi8(op.si10))); + c->pcmpgtb(va, XmmConst(v128::from8p(op.si10))); c->movdqa(SPU_OFF_128(gpr, op.rt), va); } @@ -4474,24 +4463,24 @@ void spu_recompiler::HGTI(spu_opcode_t op) void spu_recompiler::CLGTI(spu_opcode_t op) { const XmmLink& va = XmmGet(op.ra, XmmType::Int); - c->pxor(va, XmmConst(_mm_set1_epi32(0x80000000))); - c->pcmpgtd(va, XmmConst(_mm_set1_epi32(op.si10 - 0x80000000))); + c->pxor(va, XmmConst(v128::from32p(0x80000000))); + c->pcmpgtd(va, XmmConst(v128::from32p(op.si10 - 0x80000000))); c->movdqa(SPU_OFF_128(gpr, op.rt), va); } void spu_recompiler::CLGTHI(spu_opcode_t op) { const XmmLink& va = XmmGet(op.ra, XmmType::Int); - c->pxor(va, XmmConst(_mm_set1_epi16(smin))); - c->pcmpgtw(va, XmmConst(_mm_set1_epi16(op.si10 - 0x8000))); + c->pxor(va, XmmConst(v128::from16p(0x8000))); + c->pcmpgtw(va, XmmConst(v128::from16p(op.si10 - 0x8000))); c->movdqa(SPU_OFF_128(gpr, op.rt), va); } void spu_recompiler::CLGTBI(spu_opcode_t op) { const XmmLink& va = XmmGet(op.ra, XmmType::Int); - c->psubb(va, XmmConst(_mm_set1_epi8(smin))); - c->pcmpgtb(va, XmmConst(_mm_set1_epi8(op.si10 - 0x80))); + c->psubb(va, XmmConst(v128::from8p(0x80))); + c->pcmpgtb(va, XmmConst(v128::from8p(op.si10 - 0x80))); c->movdqa(SPU_OFF_128(gpr, op.rt), va); } @@ -4518,7 +4507,7 @@ void spu_recompiler::HLGTI(spu_opcode_t op) void spu_recompiler::MPYI(spu_opcode_t op) { const XmmLink& va = XmmGet(op.ra, XmmType::Int); - c->pmaddwd(va, XmmConst(_mm_set1_epi32(op.si10 & 0xffff))); + c->pmaddwd(va, XmmConst(v128::from32p(op.si10 & 0xffff))); c->movdqa(SPU_OFF_128(gpr, op.rt), va); } @@ -4528,7 +4517,7 @@ void spu_recompiler::MPYUI(spu_opcode_t op) const XmmLink& vi = XmmAlloc(); const XmmLink& va2 = XmmAlloc(); c->movdqa(va2, va); - c->movdqa(vi, XmmConst(_mm_set1_epi32(op.si10 & 0xffff))); + c->movdqa(vi, XmmConst(v128::from32p(op.si10 & 0xffff))); c->pmulhuw(va, vi); c->pmullw(va2, vi); c->pslld(va, 16); @@ -4539,21 +4528,21 @@ void spu_recompiler::MPYUI(spu_opcode_t op) void spu_recompiler::CEQI(spu_opcode_t op) { const XmmLink& va = XmmGet(op.ra, XmmType::Int); - c->pcmpeqd(va, XmmConst(_mm_set1_epi32(op.si10))); + c->pcmpeqd(va, XmmConst(v128::from32p(op.si10))); c->movdqa(SPU_OFF_128(gpr, op.rt), va); } void spu_recompiler::CEQHI(spu_opcode_t op) { const XmmLink& va = XmmGet(op.ra, XmmType::Int); - c->pcmpeqw(va, XmmConst(_mm_set1_epi16(op.si10))); + c->pcmpeqw(va, 
XmmConst(v128::from16p(op.si10))); c->movdqa(SPU_OFF_128(gpr, op.rt), va); } void spu_recompiler::CEQBI(spu_opcode_t op) { const XmmLink& va = XmmGet(op.ra, XmmType::Int); - c->pcmpeqb(va, XmmConst(_mm_set1_epi8(op.si10))); + c->pcmpeqb(va, XmmConst(v128::from8p(op.si10))); c->movdqa(SPU_OFF_128(gpr, op.rt), va); } @@ -4588,7 +4577,7 @@ void spu_recompiler::HBRR([[maybe_unused]] spu_opcode_t op) void spu_recompiler::ILA(spu_opcode_t op) { const XmmLink& vr = XmmAlloc(); - c->movdqa(vr, XmmConst(_mm_set1_epi32(op.i18))); + c->movdqa(vr, XmmConst(v128::from32p(op.i18))); c->movdqa(SPU_OFF_128(gpr, op.rt), vr); } @@ -4627,15 +4616,15 @@ void spu_recompiler::SHUFB(spu_opcode_t op) const XmmLink& vc = XmmGet(op.rc, XmmType::Int); const XmmLink& vt = XmmAlloc(); const XmmLink& vm = XmmAlloc(); - c->vpcmpub(asmjit::x86::k1, vc, XmmConst(_mm_set1_epi8(-0x40)), 5 /* GE */); - c->vpxor(vm, vc, XmmConst(_mm_set1_epi8(0xf))); + c->vpcmpub(asmjit::x86::k1, vc, XmmConst(v128::from8p(-0x40)), 5 /* GE */); + c->vpxor(vm, vc, XmmConst(v128::from8p(0xf))); c->setExtraReg(asmjit::x86::k1); - c->z().vpblendmb(vc, vc, XmmConst(_mm_set1_epi8(-1))); // {k1} - c->vpcmpub(asmjit::x86::k2, vm, XmmConst(_mm_set1_epi8(-0x20)), 5 /* GE */); - c->vptestmb(asmjit::x86::k1, vm, XmmConst(_mm_set1_epi8(0x10))); + c->z().vpblendmb(vc, vc, XmmConst(v128::from8p(-1))); // {k1} + c->vpcmpub(asmjit::x86::k2, vm, XmmConst(v128::from8p(-0x20)), 5 /* GE */); + c->vptestmb(asmjit::x86::k1, vm, XmmConst(v128::from8p(0x10))); c->vpshufb(vt, va, vm); c->setExtraReg(asmjit::x86::k2); - c->z().vpblendmb(va, va, XmmConst(_mm_set1_epi8(0x7f))); // {k2} + c->z().vpblendmb(va, va, XmmConst(v128::from8p(0x7f))); // {k2} c->setExtraReg(asmjit::x86::k1); c->vpshufb(vt, vb, vm); // {k1} c->vpternlogd(vt, va, vc, 0xf6 /* orAxorBC */); @@ -4654,12 +4643,12 @@ void spu_recompiler::SHUFB(spu_opcode_t op) const XmmLink& vt = XmmAlloc(); const XmmLink& vm = XmmAlloc(); const XmmLink& v5 = XmmAlloc(); - c->movdqa(vm, XmmConst(_mm_set1_epi8(static_cast(0xc0)))); + c->movdqa(vm, XmmConst(v128::from8p(static_cast(0xc0)))); if (utils::has_avx()) { - c->vpand(v5, vc, XmmConst(_mm_set1_epi8(static_cast(0xe0)))); - c->vpxor(vc, vc, XmmConst(_mm_set1_epi8(0xf))); + c->vpand(v5, vc, XmmConst(v128::from8p(static_cast(0xe0)))); + c->vpxor(vc, vc, XmmConst(v128::from8p(0xf))); c->vpshufb(va, va, vc); c->vpslld(vt, vc, 3); c->vpcmpeqb(v5, v5, vm); @@ -4673,10 +4662,10 @@ void spu_recompiler::SHUFB(spu_opcode_t op) else { c->movdqa(v5, vc); - c->pand(v5, XmmConst(_mm_set1_epi8(static_cast(0xe0)))); + c->pand(v5, XmmConst(v128::from8p(static_cast(0xe0)))); c->movdqa(vt, vc); c->pand(vt, vm); - c->pxor(vc, XmmConst(_mm_set1_epi8(0xf))); + c->pxor(vc, XmmConst(v128::from8p(0xf))); c->pshufb(va, vc); c->pshufb(vb, vc); c->pslld(vc, 3); @@ -4699,7 +4688,7 @@ void spu_recompiler::MPYA(spu_opcode_t op) const XmmLink& va = XmmGet(op.ra, XmmType::Int); const XmmLink& vb = XmmGet(op.rb, XmmType::Int); const XmmLink& vi = XmmAlloc(); - c->movdqa(vi, XmmConst(_mm_set1_epi32(0xffff))); + c->movdqa(vi, XmmConst(v128::from32p(0xffff))); c->pand(va, vi); c->pand(vb, vi); c->pmaddwd(va, vb); @@ -4714,7 +4703,7 @@ void spu_recompiler::FNMS(spu_opcode_t op) const XmmLink& mask = XmmAlloc(); const XmmLink& v1 = XmmAlloc(); const XmmLink& v2 = XmmAlloc(); - c->movaps(mask, XmmConst(_mm_set1_epi32(0x7f800000))); + c->movaps(mask, XmmConst(v128::from32p(0x7f800000))); c->movaps(v1, va); c->movaps(v2, vb); c->andps(va, mask); @@ -4737,7 +4726,7 @@ void spu_recompiler::FMA(spu_opcode_t op) 
const XmmLink& mask = XmmAlloc(); const XmmLink& v1 = XmmAlloc(); const XmmLink& v2 = XmmAlloc(); - c->movaps(mask, XmmConst(_mm_set1_epi32(0x7f800000))); + c->movaps(mask, XmmConst(v128::from32p(0x7f800000))); c->movaps(v1, va); c->movaps(v2, vb); c->andps(va, mask); @@ -4759,7 +4748,7 @@ void spu_recompiler::FMS(spu_opcode_t op) const XmmLink& mask = XmmAlloc(); const XmmLink& v1 = XmmAlloc(); const XmmLink& v2 = XmmAlloc(); - c->movaps(mask, XmmConst(_mm_set1_epi32(0x7f800000))); + c->movaps(mask, XmmConst(v128::from32p(0x7f800000))); c->movaps(v1, va); c->movaps(v2, vb); c->andps(va, mask); diff --git a/rpcs3/Emu/Cell/SPUASMJITRecompiler.h b/rpcs3/Emu/Cell/SPUASMJITRecompiler.h index f221b33ab0..36c238f231 100644 --- a/rpcs3/Emu/Cell/SPUASMJITRecompiler.h +++ b/rpcs3/Emu/Cell/SPUASMJITRecompiler.h @@ -88,8 +88,6 @@ private: XmmLink XmmGet(s8 reg, XmmType type); asmjit::x86::Mem XmmConst(const v128& data); - asmjit::x86::Mem XmmConst(const __m128& data); - asmjit::x86::Mem XmmConst(const __m128i& data); asmjit::x86::Mem get_pc(u32 addr); void branch_fixed(u32 target, bool absolute = false); diff --git a/rpcs3/Emu/Cell/SPUAnalyser.cpp b/rpcs3/Emu/Cell/SPUAnalyser.cpp index ccf284487b..73ba8fb17c 100644 --- a/rpcs3/Emu/Cell/SPUAnalyser.cpp +++ b/rpcs3/Emu/Cell/SPUAnalyser.cpp @@ -1,2 +1,7 @@ #include "stdafx.h" #include "SPUAnalyser.h" +#include "SPUOpcodes.h" + +const extern spu_decoder g_spu_itype{}; +const extern spu_decoder g_spu_iname{}; +const extern spu_decoder g_spu_iflag{}; diff --git a/rpcs3/Emu/Cell/SPUDisAsm.cpp b/rpcs3/Emu/Cell/SPUDisAsm.cpp index 9a06ec8912..e1f643b8c6 100644 --- a/rpcs3/Emu/Cell/SPUDisAsm.cpp +++ b/rpcs3/Emu/Cell/SPUDisAsm.cpp @@ -4,11 +4,12 @@ #include "SPUThread.h" const spu_decoder s_spu_disasm; -const spu_decoder s_spu_itype; -const spu_decoder s_spu_iflag; +const extern spu_decoder g_spu_itype; +const extern spu_decoder g_spu_iname; +const extern spu_decoder g_spu_iflag; #include "util/v128.hpp" -#include "util/v128sse.hpp" +#include "util/simd.hpp" u32 SPUDisAsm::disasm(u32 pc) { @@ -49,7 +50,7 @@ std::pair SPUDisAsm::try_get_const_value(u32 reg, u32 pc, u32 TTL) c if (pc == umax) { - // Default arg: choose pc of previous instruction + // Default arg: choose pc of previous instruction if (dump_pc == 0) { @@ -68,7 +69,7 @@ std::pair SPUDisAsm::try_get_const_value(u32 reg, u32 pc, u32 TTL) c const u32 opcode = *reinterpret_cast*>(m_offset + i); const spu_opcode_t op0{ opcode }; - const auto type = s_spu_itype.decode(opcode); + const auto type = g_spu_itype.decode(opcode); if (type & spu_itype::branch || type == spu_itype::UNK || !opcode) { @@ -101,7 +102,7 @@ std::pair SPUDisAsm::try_get_const_value(u32 reg, u32 pc, u32 TTL) c var = value;\ } void() /*<- Require a semicolon*/ - //const auto flag = s_spu_iflag.decode(opcode); + //const auto flag = g_spu_iflag.decode(opcode); // TODO: It detects spurious register modifications if (u32 dst = type & spu_itype::_quadrop ? 
+op0.rt4 : +op0.rt; dst == reg) @@ -203,14 +204,14 @@ std::pair SPUDisAsm::try_get_const_value(u32 reg, u32 pc, u32 TTL) c v128 reg_val{}; GET_CONST_REG(reg_val, op0.ra); - return { true, reg_val }; + return { true, reg_val }; } case spu_itype::ORI: { v128 reg_val{}; GET_CONST_REG(reg_val, op0.ra); - return { true, reg_val | v128::from32p(op0.si10) }; + return { true, reg_val | v128::from32p(op0.si10) }; } default: return {}; } diff --git a/rpcs3/Emu/Cell/SPUInterpreter.cpp b/rpcs3/Emu/Cell/SPUInterpreter.cpp index 0d4b72c508..b3e1f729a9 100644 --- a/rpcs3/Emu/Cell/SPUInterpreter.cpp +++ b/rpcs3/Emu/Cell/SPUInterpreter.cpp @@ -4,44 +4,75 @@ #include "Utilities/JIT.h" #include "SPUThread.h" #include "Emu/Cell/Common.h" +#include "Emu/Cell/SPUAnalyser.h" +#include "Emu/system_config.h" #include "util/asm.hpp" #include "util/v128.hpp" -#include "util/v128sse.hpp" +#include "util/simd.hpp" #include "util/sysinfo.hpp" #include #include -#if !defined(_MSC_VER) && defined(__clang__) +#if !defined(_MSC_VER) #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wold-style-cast" #endif -// Compare 16 packed unsigned bytes (greater than) -inline __m128i sse_cmpgt_epu8(__m128i A, __m128i B) -{ - // (A xor 0x80) > (B xor 0x80) - const auto sign = _mm_set1_epi32(0x80808080); - return _mm_cmpgt_epi8(_mm_xor_si128(A, sign), _mm_xor_si128(B, sign)); -} +#if defined(ARCH_ARM64) +#if !defined(_MSC_VER) +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#endif +#undef FORCE_INLINE +#include "Emu/CPU/sse2neon.h" +#endif -inline __m128i sse_cmpgt_epu16(__m128i A, __m128i B) -{ - const auto sign = _mm_set1_epi32(0x80008000); - return _mm_cmpgt_epi16(_mm_xor_si128(A, sign), _mm_xor_si128(B, sign)); -} +const extern spu_decoder g_spu_itype; +const extern spu_decoder g_spu_iname; +const extern spu_decoder g_spu_iflag; -inline __m128i sse_cmpgt_epu32(__m128i A, __m128i B) +enum class spu_exec_bit : u64 { - const auto sign = _mm_set1_epi32(0x80000000); - return _mm_cmpgt_epi32(_mm_xor_si128(A, sign), _mm_xor_si128(B, sign)); -} + use_dfma, + + __bitset_enum_max +}; + +using enum spu_exec_bit; + +template +struct spu_exec_select +{ + template + static spu_intrp_func_t select(bs_t selected, F func) + { + // Make sure there is no flag duplication, otherwise skip flag + if constexpr (((Flags0 != Flag) && ...)) + { + // Test only relevant flags at runtime (compiling both variants) + if (selected & Flag) + { + // In this branch, selected flag is added to Flags0 + return spu_exec_select::template select(selected, func); + } + } + + return spu_exec_select::template select(selected, func); + } + + template + static spu_intrp_func_t select(bs_t, F func) + { + // Instantiate interpreter function with required set of flags + return func.template operator()(); + } +}; + +static constexpr spu_opcode_t s_op{}; namespace asmjit { - static constexpr spu_opcode_t s_op{}; - template static void build_spu_gpr_load(x86::Assembler& c, x86::Xmm x, const bf_t&, bool store = false) { @@ -93,7 +124,8 @@ namespace asmjit } } -bool spu_interpreter::UNK(spu_thread&, spu_opcode_t op) +template +bool UNK(spu_thread&, spu_opcode_t op) { spu_log.fatal("Unknown/Illegal instruction (0x%08x)", op.opcode); return false; @@ -123,7 +155,8 @@ void spu_interpreter::set_interrupt_status(spu_thread& spu, spu_opcode_t op) } -bool spu_interpreter::STOP(spu_thread& spu, spu_opcode_t op) +template +bool STOP(spu_thread& spu, spu_opcode_t op) { const bool allow = std::exchange(spu.allow_interrupts_in_cpu_work, false); @@ -145,32 +178,37 @@ bool 
spu_interpreter::STOP(spu_thread& spu, spu_opcode_t op) return true; } -bool spu_interpreter::LNOP(spu_thread&, spu_opcode_t) +template +bool LNOP(spu_thread&, spu_opcode_t) { return true; } // This instruction must be used following a store instruction that modifies the instruction stream. -bool spu_interpreter::SYNC(spu_thread&, spu_opcode_t) +template +bool SYNC(spu_thread&, spu_opcode_t) { atomic_fence_seq_cst(); return true; } // This instruction forces all earlier load, store, and channel instructions to complete before proceeding. -bool spu_interpreter::DSYNC(spu_thread&, spu_opcode_t) +template +bool DSYNC(spu_thread&, spu_opcode_t) { atomic_fence_seq_cst(); return true; } -bool spu_interpreter::MFSPR(spu_thread& spu, spu_opcode_t op) +template +bool MFSPR(spu_thread& spu, spu_opcode_t op) { spu.gpr[op.rt].clear(); // All SPRs read as zero. TODO: check it. return true; } -bool spu_interpreter::RDCH(spu_thread& spu, spu_opcode_t op) +template +bool RDCH(spu_thread& spu, spu_opcode_t op) { const bool allow = std::exchange(spu.allow_interrupts_in_cpu_work, false); @@ -194,51 +232,59 @@ bool spu_interpreter::RDCH(spu_thread& spu, spu_opcode_t op) return true; } -bool spu_interpreter::RCHCNT(spu_thread& spu, spu_opcode_t op) +template +bool RCHCNT(spu_thread& spu, spu_opcode_t op) { spu.gpr[op.rt] = v128::from32r(spu.get_ch_count(op.ra)); return true; } -bool spu_interpreter::SF(spu_thread& spu, spu_opcode_t op) +template +bool SF(spu_thread& spu, spu_opcode_t op) { - spu.gpr[op.rt] = v128::sub32(spu.gpr[op.rb], spu.gpr[op.ra]); + spu.gpr[op.rt] = gv_sub32(spu.gpr[op.rb], spu.gpr[op.ra]); return true; } -bool spu_interpreter::OR(spu_thread& spu, spu_opcode_t op) +template +bool OR(spu_thread& spu, spu_opcode_t op) { spu.gpr[op.rt] = spu.gpr[op.ra] | spu.gpr[op.rb]; return true; } -bool spu_interpreter::BG(spu_thread& spu, spu_opcode_t op) +template +bool BG(spu_thread& spu, spu_opcode_t op) { - spu.gpr[op.rt].vi = _mm_add_epi32(sse_cmpgt_epu32(spu.gpr[op.ra].vi, spu.gpr[op.rb].vi), _mm_set1_epi32(1)); + spu.gpr[op.rt] = _mm_add_epi32(gv_gtu32(spu.gpr[op.ra], spu.gpr[op.rb]), _mm_set1_epi32(1)); return true; } -bool spu_interpreter::SFH(spu_thread& spu, spu_opcode_t op) +template +bool SFH(spu_thread& spu, spu_opcode_t op) { - spu.gpr[op.rt] = v128::sub16(spu.gpr[op.rb], spu.gpr[op.ra]); + spu.gpr[op.rt] = gv_sub16(spu.gpr[op.rb], spu.gpr[op.ra]); return true; } -bool spu_interpreter::NOR(spu_thread& spu, spu_opcode_t op) +template +bool NOR(spu_thread& spu, spu_opcode_t op) { spu.gpr[op.rt] = ~(spu.gpr[op.ra] | spu.gpr[op.rb]); return true; } -bool spu_interpreter::ABSDB(spu_thread& spu, spu_opcode_t op) +template +bool ABSDB(spu_thread& spu, spu_opcode_t op) { const auto a = spu.gpr[op.ra]; const auto b = spu.gpr[op.rb]; - spu.gpr[op.rt] = v128::sub8(v128::maxu8(a, b), v128::minu8(a, b)); + spu.gpr[op.rt] = gv_sub8(gv_maxu8(a, b), gv_minu8(a, b)); return true; } -bool spu_interpreter::ROT(spu_thread& spu, spu_opcode_t op) +template +bool ROT(spu_thread& spu, spu_opcode_t op) { const auto a = spu.gpr[op.ra]; const auto b = spu.gpr[op.rb]; @@ -250,7 +296,8 @@ bool spu_interpreter::ROT(spu_thread& spu, spu_opcode_t op) return true; } -bool spu_interpreter::ROTM(spu_thread& spu, spu_opcode_t op) +template +bool ROTM(spu_thread& spu, spu_opcode_t op) { const auto a = spu.gpr[op.ra]; const auto b = spu.gpr[op.rb]; @@ -263,7 +310,8 @@ bool spu_interpreter::ROTM(spu_thread& spu, spu_opcode_t op) return true; } -bool spu_interpreter::ROTMA(spu_thread& spu, spu_opcode_t op) +template +bool 
ROTMA(spu_thread& spu, spu_opcode_t op) { const auto a = spu.gpr[op.ra]; const auto b = spu.gpr[op.rb]; @@ -276,7 +324,8 @@ bool spu_interpreter::ROTMA(spu_thread& spu, spu_opcode_t op) return true; } -bool spu_interpreter::SHL(spu_thread& spu, spu_opcode_t op) +template +bool SHL(spu_thread& spu, spu_opcode_t op) { const auto a = spu.gpr[op.ra]; const auto b = spu.gpr[op.rb]; @@ -289,7 +338,8 @@ bool spu_interpreter::SHL(spu_thread& spu, spu_opcode_t op) return true; } -bool spu_interpreter::ROTH(spu_thread& spu, spu_opcode_t op) +template +bool ROTH(spu_thread& spu, spu_opcode_t op) { const auto a = spu.gpr[op.ra]; const auto b = spu.gpr[op.rb]; @@ -301,7 +351,8 @@ bool spu_interpreter::ROTH(spu_thread& spu, spu_opcode_t op) return true; } -bool spu_interpreter::ROTHM(spu_thread& spu, spu_opcode_t op) +template +bool ROTHM(spu_thread& spu, spu_opcode_t op) { const auto a = spu.gpr[op.ra]; const auto b = spu.gpr[op.rb]; @@ -314,7 +365,8 @@ bool spu_interpreter::ROTHM(spu_thread& spu, spu_opcode_t op) return true; } -bool spu_interpreter::ROTMAH(spu_thread& spu, spu_opcode_t op) +template +bool ROTMAH(spu_thread& spu, spu_opcode_t op) { const auto a = spu.gpr[op.ra]; const auto b = spu.gpr[op.rb]; @@ -327,7 +379,8 @@ bool spu_interpreter::ROTMAH(spu_thread& spu, spu_opcode_t op) return true; } -bool spu_interpreter::SHLH(spu_thread& spu, spu_opcode_t op) +template +bool SHLH(spu_thread& spu, spu_opcode_t op) { const auto a = spu.gpr[op.ra]; const auto b = spu.gpr[op.rb]; @@ -340,103 +393,119 @@ bool spu_interpreter::SHLH(spu_thread& spu, spu_opcode_t op) return true; } -bool spu_interpreter::ROTI(spu_thread& spu, spu_opcode_t op) +template +bool ROTI(spu_thread& spu, spu_opcode_t op) { - const auto a = spu.gpr[op.ra].vi; + const auto a = spu.gpr[op.ra]; const s32 n = op.i7 & 0x1f; - spu.gpr[op.rt].vi = _mm_or_si128(_mm_slli_epi32(a, n), _mm_srli_epi32(a, 32 - n)); + spu.gpr[op.rt] = _mm_or_si128(_mm_slli_epi32(a, n), _mm_srli_epi32(a, 32 - n)); return true; } -bool spu_interpreter::ROTMI(spu_thread& spu, spu_opcode_t op) +template +bool ROTMI(spu_thread& spu, spu_opcode_t op) { - spu.gpr[op.rt].vi = _mm_srli_epi32(spu.gpr[op.ra].vi, (0-op.i7) & 0x3f); + spu.gpr[op.rt] = _mm_srli_epi32(spu.gpr[op.ra], (0-op.i7) & 0x3f); return true; } -bool spu_interpreter::ROTMAI(spu_thread& spu, spu_opcode_t op) +template +bool ROTMAI(spu_thread& spu, spu_opcode_t op) { - spu.gpr[op.rt].vi = _mm_srai_epi32(spu.gpr[op.ra].vi, (0-op.i7) & 0x3f); + spu.gpr[op.rt] = _mm_srai_epi32(spu.gpr[op.ra], (0-op.i7) & 0x3f); return true; } -bool spu_interpreter::SHLI(spu_thread& spu, spu_opcode_t op) +template +bool SHLI(spu_thread& spu, spu_opcode_t op) { - spu.gpr[op.rt].vi = _mm_slli_epi32(spu.gpr[op.ra].vi, op.i7 & 0x3f); + spu.gpr[op.rt] = _mm_slli_epi32(spu.gpr[op.ra], op.i7 & 0x3f); return true; } -bool spu_interpreter::ROTHI(spu_thread& spu, spu_opcode_t op) +template +bool ROTHI(spu_thread& spu, spu_opcode_t op) { - const auto a = spu.gpr[op.ra].vi; + const auto a = spu.gpr[op.ra]; const s32 n = op.i7 & 0xf; - spu.gpr[op.rt].vi = _mm_or_si128(_mm_slli_epi16(a, n), _mm_srli_epi16(a, 16 - n)); + spu.gpr[op.rt] = _mm_or_si128(_mm_slli_epi16(a, n), _mm_srli_epi16(a, 16 - n)); return true; } -bool spu_interpreter::ROTHMI(spu_thread& spu, spu_opcode_t op) +template +bool ROTHMI(spu_thread& spu, spu_opcode_t op) { - spu.gpr[op.rt].vi = _mm_srli_epi16(spu.gpr[op.ra].vi, (0-op.i7) & 0x1f); + spu.gpr[op.rt] = _mm_srli_epi16(spu.gpr[op.ra], (0-op.i7) & 0x1f); return true; } -bool spu_interpreter::ROTMAHI(spu_thread& spu, 
spu_opcode_t op) +template +bool ROTMAHI(spu_thread& spu, spu_opcode_t op) { - spu.gpr[op.rt].vi = _mm_srai_epi16(spu.gpr[op.ra].vi, (0-op.i7) & 0x1f); + spu.gpr[op.rt] = _mm_srai_epi16(spu.gpr[op.ra], (0-op.i7) & 0x1f); return true; } -bool spu_interpreter::SHLHI(spu_thread& spu, spu_opcode_t op) +template +bool SHLHI(spu_thread& spu, spu_opcode_t op) { - spu.gpr[op.rt].vi = _mm_slli_epi16(spu.gpr[op.ra].vi, op.i7 & 0x1f); + spu.gpr[op.rt] = _mm_slli_epi16(spu.gpr[op.ra], op.i7 & 0x1f); return true; } -bool spu_interpreter::A(spu_thread& spu, spu_opcode_t op) +template +bool A(spu_thread& spu, spu_opcode_t op) { - spu.gpr[op.rt] = v128::add32(spu.gpr[op.ra], spu.gpr[op.rb]); + spu.gpr[op.rt] = gv_add32(spu.gpr[op.ra], spu.gpr[op.rb]); return true; } -bool spu_interpreter::AND(spu_thread& spu, spu_opcode_t op) +template +bool AND(spu_thread& spu, spu_opcode_t op) { spu.gpr[op.rt] = spu.gpr[op.ra] & spu.gpr[op.rb]; return true; } -bool spu_interpreter::CG(spu_thread& spu, spu_opcode_t op) +template +bool CG(spu_thread& spu, spu_opcode_t op) { - const auto a = _mm_xor_si128(spu.gpr[op.ra].vi, _mm_set1_epi32(0x7fffffff)); - const auto b = _mm_xor_si128(spu.gpr[op.rb].vi, _mm_set1_epi32(0x80000000)); - spu.gpr[op.rt].vi = _mm_srli_epi32(_mm_cmpgt_epi32(b, a), 31); + const auto a = _mm_xor_si128(spu.gpr[op.ra], _mm_set1_epi32(0x7fffffff)); + const auto b = _mm_xor_si128(spu.gpr[op.rb], _mm_set1_epi32(0x80000000)); + spu.gpr[op.rt] = _mm_srli_epi32(_mm_cmpgt_epi32(b, a), 31); return true; } -bool spu_interpreter::AH(spu_thread& spu, spu_opcode_t op) +template +bool AH(spu_thread& spu, spu_opcode_t op) { - spu.gpr[op.rt] = v128::add16(spu.gpr[op.ra], spu.gpr[op.rb]); + spu.gpr[op.rt] = gv_add16(spu.gpr[op.ra], spu.gpr[op.rb]); return true; } -bool spu_interpreter::NAND(spu_thread& spu, spu_opcode_t op) +template +bool NAND(spu_thread& spu, spu_opcode_t op) { spu.gpr[op.rt] = ~(spu.gpr[op.ra] & spu.gpr[op.rb]); return true; } -bool spu_interpreter::AVGB(spu_thread& spu, spu_opcode_t op) +template +bool AVGB(spu_thread& spu, spu_opcode_t op) { - spu.gpr[op.rt].vi = _mm_avg_epu8(spu.gpr[op.ra].vi, spu.gpr[op.rb].vi); + spu.gpr[op.rt] = _mm_avg_epu8(spu.gpr[op.ra], spu.gpr[op.rb]); return true; } -bool spu_interpreter::MTSPR(spu_thread&, spu_opcode_t) +template +bool MTSPR(spu_thread&, spu_opcode_t) { // SPR writes are ignored. TODO: check it. 
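// Sketch, not from the patch: throughout this file, v128::add32/sub32 and the
// old sse_cmpgt_epu* helpers are replaced by gv_* functions (gv_add32,
// gv_sub32, gv_gtu32, gv_andn, ...) from rpcs3/util/simd.hpp, which lower to
// SSE2 on x86-64 and to NEON on AArch64. The real helpers take and return
// v128; the standalone functions below use invented names, only illustrate the
// idea, and key off compiler macros rather than rpcs3's ARCH_X64/ARCH_ARM64.
#if defined(__x86_64__) || defined(_M_X64)
#include <emmintrin.h>
using vec32x4_sketch = __m128i;

inline vec32x4_sketch gv_sub32_sketch(vec32x4_sketch a, vec32x4_sketch b)
{
	return _mm_sub_epi32(a, b);
}

inline vec32x4_sketch gv_gtu32_sketch(vec32x4_sketch a, vec32x4_sketch b)
{
	// SSE2 has no unsigned compare: bias both sides by 0x80000000 so a signed
	// compare yields the unsigned result (the trick sse_cmpgt_epu32 used).
	const __m128i sign = _mm_set1_epi32(0x80000000);
	return _mm_cmpgt_epi32(_mm_xor_si128(a, sign), _mm_xor_si128(b, sign));
}
#elif defined(__aarch64__)
#include <arm_neon.h>
using vec32x4_sketch = uint32x4_t;

inline vec32x4_sketch gv_sub32_sketch(vec32x4_sketch a, vec32x4_sketch b)
{
	return vsubq_u32(a, b);
}

inline vec32x4_sketch gv_gtu32_sketch(vec32x4_sketch a, vec32x4_sketch b)
{
	// NEON compares unsigned lanes directly, no bias trick needed.
	return vcgtq_u32(a, b);
}
#endif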
return true; } -bool spu_interpreter::WRCH(spu_thread& spu, spu_opcode_t op) +template +bool WRCH(spu_thread& spu, spu_opcode_t op) { const bool allow = std::exchange(spu.allow_interrupts_in_cpu_work, false); @@ -458,85 +527,95 @@ bool spu_interpreter::WRCH(spu_thread& spu, spu_opcode_t op) return true; } -bool spu_interpreter::BIZ(spu_thread& spu, spu_opcode_t op) +template +bool BIZ(spu_thread& spu, spu_opcode_t op) { if (spu.gpr[op.rt]._u32[3] == 0) { spu.pc = spu_branch_target(spu.gpr[op.ra]._u32[3]); - set_interrupt_status(spu, op); + spu_interpreter::set_interrupt_status(spu, op); return false; } return true; } -bool spu_interpreter::BINZ(spu_thread& spu, spu_opcode_t op) +template +bool BINZ(spu_thread& spu, spu_opcode_t op) { if (spu.gpr[op.rt]._u32[3] != 0) { spu.pc = spu_branch_target(spu.gpr[op.ra]._u32[3]); - set_interrupt_status(spu, op); + spu_interpreter::set_interrupt_status(spu, op); return false; } return true; } -bool spu_interpreter::BIHZ(spu_thread& spu, spu_opcode_t op) +template +bool BIHZ(spu_thread& spu, spu_opcode_t op) { if (spu.gpr[op.rt]._u16[6] == 0) { spu.pc = spu_branch_target(spu.gpr[op.ra]._u32[3]); - set_interrupt_status(spu, op); + spu_interpreter::set_interrupt_status(spu, op); return false; } return true; } -bool spu_interpreter::BIHNZ(spu_thread& spu, spu_opcode_t op) +template +bool BIHNZ(spu_thread& spu, spu_opcode_t op) { if (spu.gpr[op.rt]._u16[6] != 0) { spu.pc = spu_branch_target(spu.gpr[op.ra]._u32[3]); - set_interrupt_status(spu, op); + spu_interpreter::set_interrupt_status(spu, op); return false; } return true; } -bool spu_interpreter::STOPD(spu_thread& spu, spu_opcode_t) +template +bool STOPD(spu_thread& spu, spu_opcode_t) { return spu.stop_and_signal(0x3fff); } -bool spu_interpreter::STQX(spu_thread& spu, spu_opcode_t op) +template +bool STQX(spu_thread& spu, spu_opcode_t op) { spu._ref((spu.gpr[op.ra]._u32[3] + spu.gpr[op.rb]._u32[3]) & 0x3fff0) = spu.gpr[op.rt]; return true; } -bool spu_interpreter::BI(spu_thread& spu, spu_opcode_t op) +template +bool BI(spu_thread& spu, spu_opcode_t op) { spu.pc = spu_branch_target(spu.gpr[op.ra]._u32[3]); - set_interrupt_status(spu, op); + spu_interpreter::set_interrupt_status(spu, op); return false; } -bool spu_interpreter::BISL(spu_thread& spu, spu_opcode_t op) +template +bool BISL(spu_thread& spu, spu_opcode_t op) { const u32 target = spu_branch_target(spu.gpr[op.ra]._u32[3]); spu.gpr[op.rt] = v128::from32r(spu_branch_target(spu.pc + 4)); spu.pc = target; - set_interrupt_status(spu, op); + spu_interpreter::set_interrupt_status(spu, op); return false; } -bool spu_interpreter::IRET(spu_thread& spu, spu_opcode_t op) +template +bool IRET(spu_thread& spu, spu_opcode_t op) { spu.pc = spu.srr0; - set_interrupt_status(spu, op); + spu_interpreter::set_interrupt_status(spu, op); return false; } -bool spu_interpreter::BISLED(spu_thread& spu, spu_opcode_t op) +template +bool BISLED(spu_thread& spu, spu_opcode_t op) { const u32 target = spu_branch_target(spu.gpr[op.ra]._u32[3]); spu.gpr[op.rt] = v128::from32r(spu_branch_target(spu.pc + 4)); @@ -544,111 +623,120 @@ bool spu_interpreter::BISLED(spu_thread& spu, spu_opcode_t op) if (spu.get_events().count) { spu.pc = target; - set_interrupt_status(spu, op); + spu_interpreter::set_interrupt_status(spu, op); return false; } return true; } -bool spu_interpreter::HBR(spu_thread&, spu_opcode_t) +template +bool HBR(spu_thread&, spu_opcode_t) { return true; } -bool spu_interpreter::GB(spu_thread& spu, spu_opcode_t op) +template +bool GB(spu_thread& spu, spu_opcode_t op) { 
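// Sketch, not from the patch: every handler in this file now carries a
// template <spu_exec_bit... Flags> pack, and spu_exec_select (defined near the
// top of this file) turns the accuracy bits enabled at runtime into the
// matching compile-time instantiation. The self-contained analogue below uses
// invented names (exec_bit, select_sketch, pick, func_t); the real code works
// with bs_t<spu_exec_bit> and returns spu_intrp_func_t.
#include <cstdio>

enum class exec_bit { use_dfma, approx_rcp };

template <exec_bit... Flags>
bool DFMA_sketch()
{
	// A real handler would take spu_thread& and spu_opcode_t; here we only
	// show that the flag set is baked in at compile time.
	return ((Flags == exec_bit::use_dfma) || ...);
}

using func_t = bool(*)();

template <exec_bit... Flags0>
struct select_sketch
{
	template <exec_bit Flag, typename F>
	static func_t pick(bool flag_enabled, F func)
	{
		if (flag_enabled)
		{
			// Compile the variant with Flag added to the accumulated pack.
			return select_sketch<Flags0..., Flag>::pick(func);
		}

		return select_sketch<Flags0...>::pick(func);
	}

	template <typename F>
	static func_t pick(F func)
	{
		// Instantiate the handler with the accumulated flag set.
		return func.template operator()<Flags0...>();
	}
};

int main()
{
	// Runtime decision, compile-time specialization: both variants exist in
	// the binary, the enabled one is selected once at table-build time.
	const bool accurate_dfma = true;
	func_t f = select_sketch<>::pick<exec_bit::use_dfma>(accurate_dfma,
		[]<exec_bit... Flags>() { return &DFMA_sketch<Flags...>; });

	std::printf("use_dfma baked in: %d\n", f());
}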
- spu.gpr[op.rt] = v128::from32r(_mm_movemask_ps(_mm_castsi128_ps(_mm_slli_epi32(spu.gpr[op.ra].vi, 31)))); + spu.gpr[op.rt] = v128::from32r(_mm_movemask_ps(_mm_castsi128_ps(_mm_slli_epi32(spu.gpr[op.ra], 31)))); return true; } -bool spu_interpreter::GBH(spu_thread& spu, spu_opcode_t op) +template +bool GBH(spu_thread& spu, spu_opcode_t op) { - spu.gpr[op.rt] = v128::from32r(_mm_movemask_epi8(_mm_packs_epi16(_mm_slli_epi16(spu.gpr[op.ra].vi, 15), _mm_setzero_si128()))); + spu.gpr[op.rt] = v128::from32r(_mm_movemask_epi8(_mm_packs_epi16(_mm_slli_epi16(spu.gpr[op.ra], 15), _mm_setzero_si128()))); return true; } -bool spu_interpreter::GBB(spu_thread& spu, spu_opcode_t op) +template +bool GBB(spu_thread& spu, spu_opcode_t op) { - spu.gpr[op.rt] = v128::from32r(_mm_movemask_epi8(_mm_slli_epi64(spu.gpr[op.ra].vi, 7))); + spu.gpr[op.rt] = v128::from32r(_mm_movemask_epi8(_mm_slli_epi64(spu.gpr[op.ra], 7))); return true; } -#ifndef _MSC_VER -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wold-style-cast" -#endif - -bool spu_interpreter::FSM(spu_thread& spu, spu_opcode_t op) +template +bool FSM(spu_thread& spu, spu_opcode_t op) { - const auto bits = _mm_shuffle_epi32(spu.gpr[op.ra].vi, 0xff); + const auto bits = _mm_shuffle_epi32(spu.gpr[op.ra], 0xff); const auto mask = _mm_set_epi32(8, 4, 2, 1); - spu.gpr[op.rt].vi = _mm_cmpeq_epi32(_mm_and_si128(bits, mask), mask); + spu.gpr[op.rt] = _mm_cmpeq_epi32(_mm_and_si128(bits, mask), mask); return true; } -bool spu_interpreter::FSMH(spu_thread& spu, spu_opcode_t op) +template +bool FSMH(spu_thread& spu, spu_opcode_t op) { - const auto vsrc = spu.gpr[op.ra].vi; + const auto vsrc = spu.gpr[op.ra]; const auto bits = _mm_shuffle_epi32(_mm_unpackhi_epi16(vsrc, vsrc), 0xaa); const auto mask = _mm_set_epi16(128, 64, 32, 16, 8, 4, 2, 1); - spu.gpr[op.rt].vi = _mm_cmpeq_epi16(_mm_and_si128(bits, mask), mask); + spu.gpr[op.rt] = _mm_cmpeq_epi16(_mm_and_si128(bits, mask), mask); return true; } -bool spu_interpreter::FSMB(spu_thread& spu, spu_opcode_t op) +template +bool FSMB(spu_thread& spu, spu_opcode_t op) { - const auto vsrc = spu.gpr[op.ra].vi; + const auto vsrc = spu.gpr[op.ra]; const auto bits = _mm_shuffle_epi32(_mm_shufflehi_epi16(_mm_unpackhi_epi8(vsrc, vsrc), 0x50), 0xfa); const auto mask = _mm_set_epi8(-128, 64, 32, 16, 8, 4, 2, 1, -128, 64, 32, 16, 8, 4, 2, 1); - spu.gpr[op.rt].vi = _mm_cmpeq_epi8(_mm_and_si128(bits, mask), mask); + spu.gpr[op.rt] = _mm_cmpeq_epi8(_mm_and_si128(bits, mask), mask); return true; } -bool spu_interpreter_fast::FREST(spu_thread& spu, spu_opcode_t op) +template +bool FREST(spu_thread& spu, spu_opcode_t op) { - spu.gpr[op.rt].vf = _mm_rcp_ps(spu.gpr[op.ra].vf); + spu.gpr[op.rt] = _mm_rcp_ps(spu.gpr[op.ra]); return true; } -bool spu_interpreter_fast::FRSQEST(spu_thread& spu, spu_opcode_t op) +template +bool FRSQEST(spu_thread& spu, spu_opcode_t op) { const auto mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)); - spu.gpr[op.rt].vf = _mm_rsqrt_ps(_mm_and_ps(spu.gpr[op.ra].vf, mask)); + spu.gpr[op.rt] = _mm_rsqrt_ps(_mm_and_ps(spu.gpr[op.ra], mask)); return true; } -bool spu_interpreter::LQX(spu_thread& spu, spu_opcode_t op) +template +bool LQX(spu_thread& spu, spu_opcode_t op) { spu.gpr[op.rt] = spu._ref((spu.gpr[op.ra]._u32[3] + spu.gpr[op.rb]._u32[3]) & 0x3fff0); return true; } -bool spu_interpreter::ROTQBYBI(spu_thread& spu, spu_opcode_t op) +template +bool ROTQBYBI(spu_thread& spu, spu_opcode_t op) { - const auto a = spu.gpr[op.ra].vi; + const __m128i a = spu.gpr[op.ra]; alignas(32) const __m128i buf[2]{a, 
a}; - spu.gpr[op.rt].vi = _mm_loadu_si128(reinterpret_cast(reinterpret_cast(buf) + (16 - (spu.gpr[op.rb]._u32[3] >> 3 & 0xf)))); + spu.gpr[op.rt] = _mm_loadu_si128(reinterpret_cast(reinterpret_cast(buf) + (16 - (spu.gpr[op.rb]._u32[3] >> 3 & 0xf)))); return true; } -bool spu_interpreter::ROTQMBYBI(spu_thread& spu, spu_opcode_t op) +template +bool ROTQMBYBI(spu_thread& spu, spu_opcode_t op) { - const auto a = spu.gpr[op.ra].vi; + const __m128i a = spu.gpr[op.ra]; alignas(64) const __m128i buf[3]{a, _mm_setzero_si128(), _mm_setzero_si128()}; - spu.gpr[op.rt].vi = _mm_loadu_si128(reinterpret_cast(reinterpret_cast(buf) + ((0 - (spu.gpr[op.rb]._u32[3] >> 3)) & 0x1f))); + spu.gpr[op.rt] = _mm_loadu_si128(reinterpret_cast(reinterpret_cast(buf) + ((0 - (spu.gpr[op.rb]._u32[3] >> 3)) & 0x1f))); return true; } -bool spu_interpreter::SHLQBYBI(spu_thread& spu, spu_opcode_t op) +template +bool SHLQBYBI(spu_thread& spu, spu_opcode_t op) { - const auto a = spu.gpr[op.ra].vi; + const __m128i a = spu.gpr[op.ra]; alignas(64) const __m128i buf[3]{_mm_setzero_si128(), _mm_setzero_si128(), a}; - spu.gpr[op.rt].vi = _mm_loadu_si128(reinterpret_cast(reinterpret_cast(buf) + (32 - (spu.gpr[op.rb]._u32[3] >> 3 & 0x1f)))); + spu.gpr[op.rt] = _mm_loadu_si128(reinterpret_cast(reinterpret_cast(buf) + (32 - (spu.gpr[op.rb]._u32[3] >> 3 & 0x1f)))); return true; } -bool spu_interpreter::CBX(spu_thread& spu, spu_opcode_t op) +template +bool CBX(spu_thread& spu, spu_opcode_t op) { if (op.ra == 1 && (spu.gpr[1]._u32[3] & 0xF)) { @@ -661,7 +749,8 @@ bool spu_interpreter::CBX(spu_thread& spu, spu_opcode_t op) return true; } -bool spu_interpreter::CHX(spu_thread& spu, spu_opcode_t op) +template +bool CHX(spu_thread& spu, spu_opcode_t op) { if (op.ra == 1 && (spu.gpr[1]._u32[3] & 0xF)) { @@ -674,7 +763,8 @@ bool spu_interpreter::CHX(spu_thread& spu, spu_opcode_t op) return true; } -bool spu_interpreter::CWX(spu_thread& spu, spu_opcode_t op) +template +bool CWX(spu_thread& spu, spu_opcode_t op) { if (op.ra == 1 && (spu.gpr[1]._u32[3] & 0xF)) { @@ -687,7 +777,8 @@ bool spu_interpreter::CWX(spu_thread& spu, spu_opcode_t op) return true; } -bool spu_interpreter::CDX(spu_thread& spu, spu_opcode_t op) +template +bool CDX(spu_thread& spu, spu_opcode_t op) { if (op.ra == 1 && (spu.gpr[1]._u32[3] & 0xF)) { @@ -700,61 +791,69 @@ bool spu_interpreter::CDX(spu_thread& spu, spu_opcode_t op) return true; } -bool spu_interpreter::ROTQBI(spu_thread& spu, spu_opcode_t op) +template +bool ROTQBI(spu_thread& spu, spu_opcode_t op) { - const auto a = spu.gpr[op.ra].vi; + const auto a = spu.gpr[op.ra]; const s32 n = spu.gpr[op.rb]._s32[3] & 0x7; - spu.gpr[op.rt].vi = _mm_or_si128(_mm_slli_epi64(a, n), _mm_srli_epi64(_mm_shuffle_epi32(a, 0x4E), 64 - n)); + spu.gpr[op.rt] = _mm_or_si128(_mm_slli_epi64(a, n), _mm_srli_epi64(_mm_shuffle_epi32(a, 0x4E), 64 - n)); return true; } -bool spu_interpreter::ROTQMBI(spu_thread& spu, spu_opcode_t op) +template +bool ROTQMBI(spu_thread& spu, spu_opcode_t op) { - const auto a = spu.gpr[op.ra].vi; + const auto a = spu.gpr[op.ra]; const s32 n = -spu.gpr[op.rb]._s32[3] & 0x7; - spu.gpr[op.rt].vi = _mm_or_si128(_mm_srli_epi64(a, n), _mm_slli_epi64(_mm_srli_si128(a, 8), 64 - n)); + spu.gpr[op.rt] = _mm_or_si128(_mm_srli_epi64(a, n), _mm_slli_epi64(_mm_srli_si128(a, 8), 64 - n)); return true; } -bool spu_interpreter::SHLQBI(spu_thread& spu, spu_opcode_t op) +template +bool SHLQBI(spu_thread& spu, spu_opcode_t op) { - const auto a = spu.gpr[op.ra].vi; + const auto a = spu.gpr[op.ra]; const s32 n = spu.gpr[op.rb]._u32[3] & 
0x7; - spu.gpr[op.rt].vi = _mm_or_si128(_mm_slli_epi64(a, n), _mm_srli_epi64(_mm_slli_si128(a, 8), 64 - n)); + spu.gpr[op.rt] = _mm_or_si128(_mm_slli_epi64(a, n), _mm_srli_epi64(_mm_slli_si128(a, 8), 64 - n)); return true; } -bool spu_interpreter::ROTQBY(spu_thread& spu, spu_opcode_t op) +template +bool ROTQBY(spu_thread& spu, spu_opcode_t op) { - const auto a = spu.gpr[op.ra].vi; + const __m128i a = spu.gpr[op.ra]; alignas(32) const __m128i buf[2]{a, a}; - spu.gpr[op.rt].vi = _mm_loadu_si128(reinterpret_cast(reinterpret_cast(buf) + (16 - (spu.gpr[op.rb]._u32[3] & 0xf)))); + spu.gpr[op.rt] = _mm_loadu_si128(reinterpret_cast(reinterpret_cast(buf) + (16 - (spu.gpr[op.rb]._u32[3] & 0xf)))); return true; } -bool spu_interpreter::ROTQMBY(spu_thread& spu, spu_opcode_t op) +template +bool ROTQMBY(spu_thread& spu, spu_opcode_t op) { - const auto a = spu.gpr[op.ra].vi; + const __m128i a = spu.gpr[op.ra]; alignas(64) const __m128i buf[3]{a, _mm_setzero_si128(), _mm_setzero_si128()}; - spu.gpr[op.rt].vi = _mm_loadu_si128(reinterpret_cast(reinterpret_cast(buf) + ((0 - spu.gpr[op.rb]._u32[3]) & 0x1f))); + spu.gpr[op.rt] = _mm_loadu_si128(reinterpret_cast(reinterpret_cast(buf) + ((0 - spu.gpr[op.rb]._u32[3]) & 0x1f))); return true; } -bool spu_interpreter::SHLQBY(spu_thread& spu, spu_opcode_t op) +template +bool SHLQBY(spu_thread& spu, spu_opcode_t op) { - const auto a = spu.gpr[op.ra].vi; + const __m128i a = spu.gpr[op.ra]; alignas(64) const __m128i buf[3]{_mm_setzero_si128(), _mm_setzero_si128(), a}; - spu.gpr[op.rt].vi = _mm_loadu_si128(reinterpret_cast(reinterpret_cast(buf) + (32 - (spu.gpr[op.rb]._u32[3] & 0x1f)))); + spu.gpr[op.rt] = _mm_loadu_si128(reinterpret_cast(reinterpret_cast(buf) + (32 - (spu.gpr[op.rb]._u32[3] & 0x1f)))); return true; } -bool spu_interpreter::ORX(spu_thread& spu, spu_opcode_t op) +template +bool ORX(spu_thread& spu, spu_opcode_t op) { spu.gpr[op.rt] = v128::from32r(spu.gpr[op.ra]._u32[0] | spu.gpr[op.ra]._u32[1] | spu.gpr[op.ra]._u32[2] | spu.gpr[op.ra]._u32[3]); return true; } -bool spu_interpreter::CBD(spu_thread& spu, spu_opcode_t op) +template +bool CBD(spu_thread& spu, spu_opcode_t op) { if (op.ra == 1 && (spu.gpr[1]._u32[3] & 0xF)) { @@ -767,7 +866,8 @@ bool spu_interpreter::CBD(spu_thread& spu, spu_opcode_t op) return true; } -bool spu_interpreter::CHD(spu_thread& spu, spu_opcode_t op) +template +bool CHD(spu_thread& spu, spu_opcode_t op) { if (op.ra == 1 && (spu.gpr[1]._u32[3] & 0xF)) { @@ -780,7 +880,8 @@ bool spu_interpreter::CHD(spu_thread& spu, spu_opcode_t op) return true; } -bool spu_interpreter::CWD(spu_thread& spu, spu_opcode_t op) +template +bool CWD(spu_thread& spu, spu_opcode_t op) { if (op.ra == 1 && (spu.gpr[1]._u32[3] & 0xF)) { @@ -793,7 +894,8 @@ bool spu_interpreter::CWD(spu_thread& spu, spu_opcode_t op) return true; } -bool spu_interpreter::CDD(spu_thread& spu, spu_opcode_t op) +template +bool CDD(spu_thread& spu, spu_opcode_t op) { if (op.ra == 1 && (spu.gpr[1]._u32[3] & 0xF)) { @@ -806,95 +908,108 @@ bool spu_interpreter::CDD(spu_thread& spu, spu_opcode_t op) return true; } -bool spu_interpreter::ROTQBII(spu_thread& spu, spu_opcode_t op) +template +bool ROTQBII(spu_thread& spu, spu_opcode_t op) { - const auto a = spu.gpr[op.ra].vi; + const auto a = spu.gpr[op.ra]; const s32 n = op.i7 & 0x7; - spu.gpr[op.rt].vi = _mm_or_si128(_mm_slli_epi64(a, n), _mm_srli_epi64(_mm_shuffle_epi32(a, 0x4E), 64 - n)); + spu.gpr[op.rt] = _mm_or_si128(_mm_slli_epi64(a, n), _mm_srli_epi64(_mm_shuffle_epi32(a, 0x4E), 64 - n)); return true; } -bool 
spu_interpreter::ROTQMBII(spu_thread& spu, spu_opcode_t op) +template +bool ROTQMBII(spu_thread& spu, spu_opcode_t op) { - const auto a = spu.gpr[op.ra].vi; + const auto a = spu.gpr[op.ra]; const s32 n = (0-op.i7) & 0x7; - spu.gpr[op.rt].vi = _mm_or_si128(_mm_srli_epi64(a, n), _mm_slli_epi64(_mm_srli_si128(a, 8), 64 - n)); + spu.gpr[op.rt] = _mm_or_si128(_mm_srli_epi64(a, n), _mm_slli_epi64(_mm_srli_si128(a, 8), 64 - n)); return true; } -bool spu_interpreter::SHLQBII(spu_thread& spu, spu_opcode_t op) +template +bool SHLQBII(spu_thread& spu, spu_opcode_t op) { - const auto a = spu.gpr[op.ra].vi; + const auto a = spu.gpr[op.ra]; const s32 n = op.i7 & 0x7; - spu.gpr[op.rt].vi = _mm_or_si128(_mm_slli_epi64(a, n), _mm_srli_epi64(_mm_slli_si128(a, 8), 64 - n)); + spu.gpr[op.rt] = _mm_or_si128(_mm_slli_epi64(a, n), _mm_srli_epi64(_mm_slli_si128(a, 8), 64 - n)); return true; } -bool spu_interpreter::ROTQBYI(spu_thread& spu, spu_opcode_t op) +template +bool ROTQBYI(spu_thread& spu, spu_opcode_t op) { - const auto a = spu.gpr[op.ra].vi; + const __m128i a = spu.gpr[op.ra]; alignas(32) const __m128i buf[2]{a, a}; - spu.gpr[op.rt].vi = _mm_loadu_si128(reinterpret_cast(reinterpret_cast(buf) + (16 - (op.i7 & 0xf)))); + spu.gpr[op.rt] = _mm_loadu_si128(reinterpret_cast(reinterpret_cast(buf) + (16 - (op.i7 & 0xf)))); return true; } -bool spu_interpreter::ROTQMBYI(spu_thread& spu, spu_opcode_t op) +template +bool ROTQMBYI(spu_thread& spu, spu_opcode_t op) { - const auto a = spu.gpr[op.ra].vi; + const __m128i a = spu.gpr[op.ra]; alignas(64) const __m128i buf[3]{a, _mm_setzero_si128(), _mm_setzero_si128()}; - spu.gpr[op.rt].vi = _mm_loadu_si128(reinterpret_cast(reinterpret_cast(buf) + ((0 - op.i7) & 0x1f))); + spu.gpr[op.rt] = _mm_loadu_si128(reinterpret_cast(reinterpret_cast(buf) + ((0 - op.i7) & 0x1f))); return true; } -bool spu_interpreter::SHLQBYI(spu_thread& spu, spu_opcode_t op) +template +bool SHLQBYI(spu_thread& spu, spu_opcode_t op) { - const auto a = spu.gpr[op.ra].vi; + const __m128i a = spu.gpr[op.ra]; alignas(64) const __m128i buf[3]{_mm_setzero_si128(), _mm_setzero_si128(), a}; - spu.gpr[op.rt].vi = _mm_loadu_si128(reinterpret_cast(reinterpret_cast(buf) + (32 - (op.i7 & 0x1f)))); + spu.gpr[op.rt] = _mm_loadu_si128(reinterpret_cast(reinterpret_cast(buf) + (32 - (op.i7 & 0x1f)))); return true; } -bool spu_interpreter::NOP(spu_thread&, spu_opcode_t) +template +bool NOP(spu_thread&, spu_opcode_t) { return true; } -bool spu_interpreter::CGT(spu_thread& spu, spu_opcode_t op) +template +bool CGT(spu_thread& spu, spu_opcode_t op) { - spu.gpr[op.rt].vi = _mm_cmpgt_epi32(spu.gpr[op.ra].vi, spu.gpr[op.rb].vi); + spu.gpr[op.rt] = _mm_cmpgt_epi32(spu.gpr[op.ra], spu.gpr[op.rb]); return true; } -bool spu_interpreter::XOR(spu_thread& spu, spu_opcode_t op) +template +bool XOR(spu_thread& spu, spu_opcode_t op) { spu.gpr[op.rt] = spu.gpr[op.ra] ^ spu.gpr[op.rb]; return true; } -bool spu_interpreter::CGTH(spu_thread& spu, spu_opcode_t op) +template +bool CGTH(spu_thread& spu, spu_opcode_t op) { - spu.gpr[op.rt].vi = _mm_cmpgt_epi16(spu.gpr[op.ra].vi, spu.gpr[op.rb].vi); + spu.gpr[op.rt] = _mm_cmpgt_epi16(spu.gpr[op.ra], spu.gpr[op.rb]); return true; } -bool spu_interpreter::EQV(spu_thread& spu, spu_opcode_t op) +template +bool EQV(spu_thread& spu, spu_opcode_t op) { spu.gpr[op.rt] = ~(spu.gpr[op.ra] ^ spu.gpr[op.rb]); return true; } -bool spu_interpreter::CGTB(spu_thread& spu, spu_opcode_t op) +template +bool CGTB(spu_thread& spu, spu_opcode_t op) { - spu.gpr[op.rt].vi = _mm_cmpgt_epi8(spu.gpr[op.ra].vi, 
spu.gpr[op.rb].vi); + spu.gpr[op.rt] = _mm_cmpgt_epi8(spu.gpr[op.ra], spu.gpr[op.rb]); return true; } -bool spu_interpreter::SUMB(spu_thread& spu, spu_opcode_t op) +template +bool SUMB(spu_thread& spu, spu_opcode_t op) { const auto m1 = _mm_set1_epi16(0xff); const auto m2 = _mm_set1_epi32(0xffff); - const auto a = spu.gpr[op.ra].vi; - const auto b = spu.gpr[op.rb].vi; + const auto a = spu.gpr[op.ra]; + const auto b = spu.gpr[op.rb]; const auto a1 = _mm_srli_epi16(a, 8); const auto a2 = _mm_and_si128(a, m1); const auto b1 = _mm_srli_epi16(b, 8); @@ -905,11 +1020,12 @@ bool spu_interpreter::SUMB(spu_thread& spu, spu_opcode_t op) const auto s1 = _mm_srli_epi32(sa, 16); const auto s4 = _mm_andnot_si128(m2, sb); const auto s3 = _mm_slli_epi32(sb, 16); - spu.gpr[op.rt].vi = _mm_or_si128(_mm_add_epi16(s1, s2), _mm_add_epi16(s3, s4)); + spu.gpr[op.rt] = _mm_or_si128(_mm_add_epi16(s1, s2), _mm_add_epi16(s3, s4)); return true; } -bool spu_interpreter::HGT(spu_thread& spu, spu_opcode_t op) +template +bool HGT(spu_thread& spu, spu_opcode_t op) { if (spu.gpr[op.ra]._s32[3] > spu.gpr[op.rb]._s32[3]) { @@ -918,7 +1034,8 @@ bool spu_interpreter::HGT(spu_thread& spu, spu_opcode_t op) return true; } -bool spu_interpreter::CLZ(spu_thread& spu, spu_opcode_t op) +template +bool CLZ(spu_thread& spu, spu_opcode_t op) { for (u32 i = 0; i < 4; i++) { @@ -927,51 +1044,58 @@ bool spu_interpreter::CLZ(spu_thread& spu, spu_opcode_t op) return true; } -bool spu_interpreter::XSWD(spu_thread& spu, spu_opcode_t op) +template +bool XSWD(spu_thread& spu, spu_opcode_t op) { spu.gpr[op.rt]._s64[0] = spu.gpr[op.ra]._s32[0]; spu.gpr[op.rt]._s64[1] = spu.gpr[op.ra]._s32[2]; return true; } -bool spu_interpreter::XSHW(spu_thread& spu, spu_opcode_t op) +template +bool XSHW(spu_thread& spu, spu_opcode_t op) { - spu.gpr[op.rt].vi = _mm_srai_epi32(_mm_slli_epi32(spu.gpr[op.ra].vi, 16), 16); + spu.gpr[op.rt] = _mm_srai_epi32(_mm_slli_epi32(spu.gpr[op.ra], 16), 16); return true; } -bool spu_interpreter::CNTB(spu_thread& spu, spu_opcode_t op) +template +bool CNTB(spu_thread& spu, spu_opcode_t op) { - const auto a = spu.gpr[op.ra].vi; + const auto a = spu.gpr[op.ra]; const auto mask1 = _mm_set1_epi8(0x55); const auto sum1 = _mm_add_epi8(_mm_and_si128(_mm_srli_epi64(a, 1), mask1), _mm_and_si128(a, mask1)); const auto mask2 = _mm_set1_epi8(0x33); const auto sum2 = _mm_add_epi8(_mm_and_si128(_mm_srli_epi64(sum1, 2), mask2), _mm_and_si128(sum1, mask2)); const auto mask3 = _mm_set1_epi8(0x0f); const auto sum3 = _mm_add_epi8(_mm_and_si128(_mm_srli_epi64(sum2, 4), mask3), _mm_and_si128(sum2, mask3)); - spu.gpr[op.rt].vi = sum3; + spu.gpr[op.rt] = sum3; return true; } -bool spu_interpreter::XSBH(spu_thread& spu, spu_opcode_t op) +template +bool XSBH(spu_thread& spu, spu_opcode_t op) { - spu.gpr[op.rt].vi = _mm_srai_epi16(_mm_slli_epi16(spu.gpr[op.ra].vi, 8), 8); + spu.gpr[op.rt] = _mm_srai_epi16(_mm_slli_epi16(spu.gpr[op.ra], 8), 8); return true; } -bool spu_interpreter::CLGT(spu_thread& spu, spu_opcode_t op) +template +bool CLGT(spu_thread& spu, spu_opcode_t op) { - spu.gpr[op.rt].vi = sse_cmpgt_epu32(spu.gpr[op.ra].vi, spu.gpr[op.rb].vi); + spu.gpr[op.rt] = gv_gtu32(spu.gpr[op.ra], spu.gpr[op.rb]); return true; } -bool spu_interpreter::ANDC(spu_thread& spu, spu_opcode_t op) +template +bool ANDC(spu_thread& spu, spu_opcode_t op) { - spu.gpr[op.rt] = v128::andnot(spu.gpr[op.rb], spu.gpr[op.ra]); + spu.gpr[op.rt] = gv_andn(spu.gpr[op.rb], spu.gpr[op.ra]); return true; } -bool spu_interpreter_fast::FCGT(spu_thread& spu, spu_opcode_t op) +template 
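// Sketch, not from the patch: the CNTB handler above counts the set bits of
// every byte with the classic SWAR masks (0x55, 0x33, 0x0f). Per byte the same
// steps look like this (invented helper name) and match std::popcount:
#include <cstdint>

inline uint8_t spu_cntb_byte_sketch(uint8_t byte)
{
	unsigned b = byte;
	b = (b & 0x55) + ((b >> 1) & 0x55); // sums of adjacent bit pairs
	b = (b & 0x33) + ((b >> 2) & 0x33); // sums of adjacent 2-bit fields
	b = (b & 0x0f) + ((b >> 4) & 0x0f); // whole-byte sum
	return static_cast<uint8_t>(b);
}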
+bool FCGT(spu_thread& spu, spu_opcode_t op) { // IMPL NOTES: // if (v is inf) v = (inf - 1) i.e nearest normal value to inf with mantissa bits left intact @@ -980,73 +1104,77 @@ bool spu_interpreter_fast::FCGT(spu_thread& spu, spu_opcode_t op) // branching simulated using bitwise ops and_not+or const auto zero = _mm_set1_ps(0.f); - const auto nan_check_a = _mm_cmpunord_ps(spu.gpr[op.ra].vf, zero); //mask true where a is extended - const auto nan_check_b = _mm_cmpunord_ps(spu.gpr[op.rb].vf, zero); //mask true where b is extended + const auto nan_check_a = _mm_cmpunord_ps(spu.gpr[op.ra], zero); //mask true where a is extended + const auto nan_check_b = _mm_cmpunord_ps(spu.gpr[op.rb], zero); //mask true where b is extended //calculate lowered a and b. The mantissa bits are left untouched for now unless its proven they should be flushed const auto last_exp_bit = _mm_castsi128_ps(_mm_set1_epi32(0x00800000)); - const auto lowered_a =_mm_andnot_ps(last_exp_bit, spu.gpr[op.ra].vf); //a is lowered to largest unextended value with sign - const auto lowered_b = _mm_andnot_ps(last_exp_bit, spu.gpr[op.rb].vf); //b is lowered to largest unextended value with sign + const auto lowered_a =_mm_andnot_ps(last_exp_bit, spu.gpr[op.ra]); //a is lowered to largest unextended value with sign + const auto lowered_b = _mm_andnot_ps(last_exp_bit, spu.gpr[op.rb]); //b is lowered to largest unextended value with sign //check if a and b are denormalized const auto all_exp_bits = _mm_castsi128_ps(_mm_set1_epi32(0x7f800000)); - const auto denorm_check_a = _mm_cmpeq_ps(zero, _mm_and_ps(all_exp_bits, spu.gpr[op.ra].vf)); - const auto denorm_check_b = _mm_cmpeq_ps(zero, _mm_and_ps(all_exp_bits, spu.gpr[op.rb].vf)); + const auto denorm_check_a = _mm_cmpeq_ps(zero, _mm_and_ps(all_exp_bits, spu.gpr[op.ra])); + const auto denorm_check_b = _mm_cmpeq_ps(zero, _mm_and_ps(all_exp_bits, spu.gpr[op.rb])); //set a and b to their lowered values if they are extended const auto a_values_lowered = _mm_and_ps(nan_check_a, lowered_a); - const auto original_a_masked = _mm_andnot_ps(nan_check_a, spu.gpr[op.ra].vf); + const auto original_a_masked = _mm_andnot_ps(nan_check_a, spu.gpr[op.ra]); const auto a_final1 = _mm_or_ps(a_values_lowered, original_a_masked); const auto b_values_lowered = _mm_and_ps(nan_check_b, lowered_b); - const auto original_b_masked = _mm_andnot_ps(nan_check_b, spu.gpr[op.rb].vf); + const auto original_b_masked = _mm_andnot_ps(nan_check_b, spu.gpr[op.rb]); const auto b_final1 = _mm_or_ps(b_values_lowered, original_b_masked); //Flush denormals to zero const auto final_a = _mm_andnot_ps(denorm_check_a, a_final1); const auto final_b = _mm_andnot_ps(denorm_check_b, b_final1); - spu.gpr[op.rt].vf = _mm_cmplt_ps(final_b, final_a); + spu.gpr[op.rt] = _mm_cmplt_ps(final_b, final_a); return true; } -bool spu_interpreter::DFCGT(spu_thread&, spu_opcode_t) +template +bool DFCGT(spu_thread&, spu_opcode_t) { spu_log.fatal("DFCGT"); return false; } -bool spu_interpreter_fast::FA(spu_thread& spu, spu_opcode_t op) +template +bool FA(spu_thread& spu, spu_opcode_t op) { - spu.gpr[op.rt] = v128::addfs(spu.gpr[op.ra], spu.gpr[op.rb]); + spu.gpr[op.rt] = gv_addfs(spu.gpr[op.ra], spu.gpr[op.rb]); return true; } -bool spu_interpreter_fast::FS(spu_thread& spu, spu_opcode_t op) +template +bool FS(spu_thread& spu, spu_opcode_t op) { - spu.gpr[op.rt] = v128::subfs(spu.gpr[op.ra], spu.gpr[op.rb]); + spu.gpr[op.rt] = gv_subfs(spu.gpr[op.ra], spu.gpr[op.rb]); return true; } -bool spu_interpreter_fast::FM(spu_thread& spu, spu_opcode_t op) +template 
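// Sketch, not from the patch: a scalar restatement of what each lane of the
// branchless FCGT code above computes. Note that _mm_cmpunord_ps only flags
// NaN inputs, so infinities go through the ordinary compare path; denormal
// inputs are flushed to +0 before comparing. Helper names are invented.
#include <bit>
#include <cstdint>

inline float spu_fcgt_prepare_sketch(float v)
{
	uint32_t bits = std::bit_cast<uint32_t>(v);

	if (v != v)
	{
		// NaN: clear the lowest exponent bit, turning the value into a very
		// large ordinary number with sign and mantissa preserved.
		bits &= ~0x00800000u;
	}

	if ((std::bit_cast<uint32_t>(v) & 0x7f800000u) == 0u)
	{
		// Denormal input: treated as +0.
		bits = 0u;
	}

	return std::bit_cast<float>(bits);
}

inline bool spu_fcgt_lane_sketch(float a, float b)
{
	// The rt lane is all-ones when this is true, all-zeros otherwise.
	return spu_fcgt_prepare_sketch(b) < spu_fcgt_prepare_sketch(a);
}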
+bool FM(spu_thread& spu, spu_opcode_t op) { const auto zero = _mm_set1_ps(0.f); const auto sign_bits = _mm_castsi128_ps(_mm_set1_epi32(0x80000000)); const auto all_exp_bits = _mm_castsi128_ps(_mm_set1_epi32(0x7f800000)); //check denormals - const auto denorm_check_a = _mm_cmpeq_ps(zero, _mm_and_ps(all_exp_bits, spu.gpr[op.ra].vf)); - const auto denorm_check_b = _mm_cmpeq_ps(zero, _mm_and_ps(all_exp_bits, spu.gpr[op.rb].vf)); + const auto denorm_check_a = _mm_cmpeq_ps(zero, _mm_and_ps(all_exp_bits, spu.gpr[op.ra])); + const auto denorm_check_b = _mm_cmpeq_ps(zero, _mm_and_ps(all_exp_bits, spu.gpr[op.rb])); const auto denorm_operand_mask = _mm_or_ps(denorm_check_a, denorm_check_b); //compute result with flushed denormal inputs - const auto primary_result = _mm_mul_ps(spu.gpr[op.ra].vf, spu.gpr[op.rb].vf); + const auto primary_result = _mm_mul_ps(spu.gpr[op.ra], spu.gpr[op.rb]); const auto denom_result_mask = _mm_cmpeq_ps(zero, _mm_and_ps(all_exp_bits, primary_result)); const auto flushed_result = _mm_andnot_ps(_mm_or_ps(denom_result_mask, denorm_operand_mask), primary_result); //check for extended const auto nan_check = _mm_cmpeq_ps(_mm_and_ps(primary_result, all_exp_bits), all_exp_bits); - const auto sign_mask = _mm_xor_ps(_mm_and_ps(sign_bits, spu.gpr[op.ra].vf), _mm_and_ps(sign_bits, spu.gpr[op.rb].vf)); + const auto sign_mask = _mm_xor_ps(_mm_and_ps(sign_bits, spu.gpr[op.ra]), _mm_and_ps(sign_bits, spu.gpr[op.rb])); const auto extended_result = _mm_or_ps(sign_mask, _mm_andnot_ps(sign_bits, primary_result)); const auto final_extended = _mm_andnot_ps(denorm_operand_mask, extended_result); @@ -1054,38 +1182,41 @@ bool spu_interpreter_fast::FM(spu_thread& spu, spu_opcode_t op) const auto set1 = _mm_andnot_ps(nan_check, flushed_result); const auto set2 = _mm_and_ps(nan_check, final_extended); - spu.gpr[op.rt].vf = _mm_or_ps(set1, set2); + spu.gpr[op.rt] = _mm_or_ps(set1, set2); return true; } -bool spu_interpreter::CLGTH(spu_thread& spu, spu_opcode_t op) +template +bool CLGTH(spu_thread& spu, spu_opcode_t op) { - spu.gpr[op.rt].vi = sse_cmpgt_epu16(spu.gpr[op.ra].vi, spu.gpr[op.rb].vi); + spu.gpr[op.rt] = gv_gtu16(spu.gpr[op.ra], spu.gpr[op.rb]); return true; } -bool spu_interpreter::ORC(spu_thread& spu, spu_opcode_t op) +template +bool ORC(spu_thread& spu, spu_opcode_t op) { spu.gpr[op.rt] = spu.gpr[op.ra] | ~spu.gpr[op.rb]; return true; } -bool spu_interpreter_fast::FCMGT(spu_thread& spu, spu_opcode_t op) +template +bool FCMGT(spu_thread& spu, spu_opcode_t op) { //IMPL NOTES: See FCGT const auto zero = _mm_set1_ps(0.f); - const auto nan_check_a = _mm_cmpunord_ps(spu.gpr[op.ra].vf, zero); //mask true where a is extended - const auto nan_check_b = _mm_cmpunord_ps(spu.gpr[op.rb].vf, zero); //mask true where b is extended + const auto nan_check_a = _mm_cmpunord_ps(spu.gpr[op.ra], zero); //mask true where a is extended + const auto nan_check_b = _mm_cmpunord_ps(spu.gpr[op.rb], zero); //mask true where b is extended //check if a and b are denormalized const auto all_exp_bits = _mm_castsi128_ps(_mm_set1_epi32(0x7f800000)); - const auto denorm_check_a = _mm_cmpeq_ps(zero, _mm_and_ps(all_exp_bits, spu.gpr[op.ra].vf)); - const auto denorm_check_b = _mm_cmpeq_ps(zero, _mm_and_ps(all_exp_bits, spu.gpr[op.rb].vf)); + const auto denorm_check_a = _mm_cmpeq_ps(zero, _mm_and_ps(all_exp_bits, spu.gpr[op.ra])); + const auto denorm_check_b = _mm_cmpeq_ps(zero, _mm_and_ps(all_exp_bits, spu.gpr[op.rb])); //Flush denormals to zero - const auto final_a = _mm_andnot_ps(denorm_check_a, spu.gpr[op.ra].vf); - const 
auto final_b = _mm_andnot_ps(denorm_check_b, spu.gpr[op.rb].vf); + const auto final_a = _mm_andnot_ps(denorm_check_a, spu.gpr[op.ra]); + const auto final_b = _mm_andnot_ps(denorm_check_b, spu.gpr[op.rb]); //Mask to make a > b if a is extended but b is not (is this necessary on x86?) const auto nan_mask = _mm_andnot_ps(nan_check_b, _mm_xor_ps(nan_check_a, nan_check_b)); @@ -1093,41 +1224,47 @@ bool spu_interpreter_fast::FCMGT(spu_thread& spu, spu_opcode_t op) const auto sign_mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)); const auto comparison = _mm_cmplt_ps(_mm_and_ps(final_b, sign_mask), _mm_and_ps(final_a, sign_mask)); - spu.gpr[op.rt].vf = _mm_or_ps(comparison, nan_mask); + spu.gpr[op.rt] = _mm_or_ps(comparison, nan_mask); return true; } -bool spu_interpreter::DFCMGT(spu_thread&, spu_opcode_t) +template +bool DFCMGT(spu_thread&, spu_opcode_t) { spu_log.fatal("DFCMGT"); return false; } -bool spu_interpreter_fast::DFA(spu_thread& spu, spu_opcode_t op) +template +bool DFA(spu_thread& spu, spu_opcode_t op) { - spu.gpr[op.rt] = v128::addfd(spu.gpr[op.ra], spu.gpr[op.rb]); + spu.gpr[op.rt] = gv_addfd(spu.gpr[op.ra], spu.gpr[op.rb]); return true; } -bool spu_interpreter_fast::DFS(spu_thread& spu, spu_opcode_t op) +template +bool DFS(spu_thread& spu, spu_opcode_t op) { - spu.gpr[op.rt] = v128::subfd(spu.gpr[op.ra], spu.gpr[op.rb]); + spu.gpr[op.rt] = gv_subfd(spu.gpr[op.ra], spu.gpr[op.rb]); return true; } -bool spu_interpreter_fast::DFM(spu_thread& spu, spu_opcode_t op) +template +bool DFM(spu_thread& spu, spu_opcode_t op) { - spu.gpr[op.rt].vd = _mm_mul_pd(spu.gpr[op.ra].vd, spu.gpr[op.rb].vd); + spu.gpr[op.rt] = _mm_mul_pd(spu.gpr[op.ra], spu.gpr[op.rb]); return true; } -bool spu_interpreter::CLGTB(spu_thread& spu, spu_opcode_t op) +template +bool CLGTB(spu_thread& spu, spu_opcode_t op) { - spu.gpr[op.rt].vi = sse_cmpgt_epu8(spu.gpr[op.ra].vi, spu.gpr[op.rb].vi); + spu.gpr[op.rt] = gv_gtu8(spu.gpr[op.ra], spu.gpr[op.rb]); return true; } -bool spu_interpreter::HLGT(spu_thread& spu, spu_opcode_t op) +template +bool HLGT(spu_thread& spu, spu_opcode_t op) { if (spu.gpr[op.ra]._u32[3] > spu.gpr[op.rb]._u32[3]) { @@ -1136,57 +1273,66 @@ bool spu_interpreter::HLGT(spu_thread& spu, spu_opcode_t op) return true; } -bool spu_interpreter_fast::DFMA(spu_thread& spu, spu_opcode_t op) +template +bool DFMA(spu_thread& spu, spu_opcode_t op) { - spu.gpr[op.rt].vd = _mm_add_pd(_mm_mul_pd(spu.gpr[op.ra].vd, spu.gpr[op.rb].vd), spu.gpr[op.rt].vd); + spu.gpr[op.rt] = _mm_add_pd(_mm_mul_pd(spu.gpr[op.ra], spu.gpr[op.rb]), spu.gpr[op.rt]); return true; } -bool spu_interpreter_fast::DFMS(spu_thread& spu, spu_opcode_t op) +template +bool DFMS(spu_thread& spu, spu_opcode_t op) { - spu.gpr[op.rt].vd = _mm_sub_pd(_mm_mul_pd(spu.gpr[op.ra].vd, spu.gpr[op.rb].vd), spu.gpr[op.rt].vd); + spu.gpr[op.rt] = _mm_sub_pd(_mm_mul_pd(spu.gpr[op.ra], spu.gpr[op.rb]), spu.gpr[op.rt]); return true; } -bool spu_interpreter_fast::DFNMS(spu_thread& spu, spu_opcode_t op) +template +bool DFNMS(spu_thread& spu, spu_opcode_t op) { - spu.gpr[op.rt].vd = _mm_sub_pd(spu.gpr[op.rt].vd, _mm_mul_pd(spu.gpr[op.ra].vd, spu.gpr[op.rb].vd)); + spu.gpr[op.rt] = _mm_sub_pd(spu.gpr[op.rt], _mm_mul_pd(spu.gpr[op.ra], spu.gpr[op.rb])); return true; } -bool spu_interpreter_fast::DFNMA(spu_thread& spu, spu_opcode_t op) +template +bool DFNMA(spu_thread& spu, spu_opcode_t op) { - spu.gpr[op.rt].vd = _mm_xor_pd(_mm_add_pd(_mm_mul_pd(spu.gpr[op.ra].vd, spu.gpr[op.rb].vd), spu.gpr[op.rt].vd), _mm_set1_pd(-0.0)); + spu.gpr[op.rt] = 
_mm_xor_pd(_mm_add_pd(_mm_mul_pd(spu.gpr[op.ra], spu.gpr[op.rb]), spu.gpr[op.rt]), _mm_set1_pd(-0.0)); return true; } -bool spu_interpreter::CEQ(spu_thread& spu, spu_opcode_t op) +template +bool CEQ(spu_thread& spu, spu_opcode_t op) { - spu.gpr[op.rt].vi = _mm_cmpeq_epi32(spu.gpr[op.ra].vi, spu.gpr[op.rb].vi); + spu.gpr[op.rt] = _mm_cmpeq_epi32(spu.gpr[op.ra], spu.gpr[op.rb]); return true; } -bool spu_interpreter::MPYHHU(spu_thread& spu, spu_opcode_t op) +template +bool MPYHHU(spu_thread& spu, spu_opcode_t op) { - const auto a = spu.gpr[op.ra].vi; - const auto b = spu.gpr[op.rb].vi; - spu.gpr[op.rt].vi = _mm_or_si128(_mm_srli_epi32(_mm_mullo_epi16(a, b), 16), _mm_and_si128(_mm_mulhi_epu16(a, b), _mm_set1_epi32(0xffff0000))); + const auto a = spu.gpr[op.ra]; + const auto b = spu.gpr[op.rb]; + spu.gpr[op.rt] = _mm_or_si128(_mm_srli_epi32(_mm_mullo_epi16(a, b), 16), _mm_and_si128(_mm_mulhi_epu16(a, b), _mm_set1_epi32(0xffff0000))); return true; } -bool spu_interpreter::ADDX(spu_thread& spu, spu_opcode_t op) +template +bool ADDX(spu_thread& spu, spu_opcode_t op) { - spu.gpr[op.rt] = v128::add32(v128::add32(spu.gpr[op.ra], spu.gpr[op.rb]), spu.gpr[op.rt] & v128::from32p(1)); + spu.gpr[op.rt] = gv_add32(gv_add32(spu.gpr[op.ra], spu.gpr[op.rb]), spu.gpr[op.rt] & v128::from32p(1)); return true; } -bool spu_interpreter::SFX(spu_thread& spu, spu_opcode_t op) +template +bool SFX(spu_thread& spu, spu_opcode_t op) { - spu.gpr[op.rt] = v128::sub32(v128::sub32(spu.gpr[op.rb], spu.gpr[op.ra]), v128::andnot(spu.gpr[op.rt], v128::from32p(1))); + spu.gpr[op.rt] = gv_sub32(gv_sub32(spu.gpr[op.rb], spu.gpr[op.ra]), gv_andn(spu.gpr[op.rt], v128::from32p(1))); return true; } -bool spu_interpreter::CGX(spu_thread& spu, spu_opcode_t op) +template +bool CGX(spu_thread& spu, spu_opcode_t op) { for (s32 i = 0; i < 4; i++) { @@ -1196,7 +1342,8 @@ bool spu_interpreter::CGX(spu_thread& spu, spu_opcode_t op) return true; } -bool spu_interpreter::BGX(spu_thread& spu, spu_opcode_t op) +template +bool BGX(spu_thread& spu, spu_opcode_t op) { for (s32 i = 0; i < 4; i++) { @@ -1206,136 +1353,156 @@ bool spu_interpreter::BGX(spu_thread& spu, spu_opcode_t op) return true; } -bool spu_interpreter::MPYHHA(spu_thread& spu, spu_opcode_t op) +template +bool MPYHHA(spu_thread& spu, spu_opcode_t op) { - spu.gpr[op.rt].vi = _mm_add_epi32(spu.gpr[op.rt].vi, _mm_madd_epi16(_mm_srli_epi32(spu.gpr[op.ra].vi, 16), _mm_srli_epi32(spu.gpr[op.rb].vi, 16))); + spu.gpr[op.rt] = _mm_add_epi32(spu.gpr[op.rt], _mm_madd_epi16(_mm_srli_epi32(spu.gpr[op.ra], 16), _mm_srli_epi32(spu.gpr[op.rb], 16))); return true; } -bool spu_interpreter::MPYHHAU(spu_thread& spu, spu_opcode_t op) +template +bool MPYHHAU(spu_thread& spu, spu_opcode_t op) { - const auto a = spu.gpr[op.ra].vi; - const auto b = spu.gpr[op.rb].vi; - spu.gpr[op.rt].vi = _mm_add_epi32(spu.gpr[op.rt].vi, _mm_or_si128(_mm_srli_epi32(_mm_mullo_epi16(a, b), 16), _mm_and_si128(_mm_mulhi_epu16(a, b), _mm_set1_epi32(0xffff0000)))); + const auto a = spu.gpr[op.ra]; + const auto b = spu.gpr[op.rb]; + spu.gpr[op.rt] = _mm_add_epi32(spu.gpr[op.rt], _mm_or_si128(_mm_srli_epi32(_mm_mullo_epi16(a, b), 16), _mm_and_si128(_mm_mulhi_epu16(a, b), _mm_set1_epi32(0xffff0000)))); return true; } -bool spu_interpreter_fast::FSCRRD(spu_thread& spu, spu_opcode_t op) +template +bool FSCRRD(spu_thread& spu, spu_opcode_t op) { spu.gpr[op.rt].clear(); return true; } -bool spu_interpreter_fast::FESD(spu_thread& spu, spu_opcode_t op) +template +bool FESD(spu_thread& spu, spu_opcode_t op) { - const auto a = 
spu.gpr[op.ra].vf; - spu.gpr[op.rt].vd = _mm_cvtps_pd(_mm_shuffle_ps(a, a, 0x8d)); + const auto a = spu.gpr[op.ra]; + spu.gpr[op.rt] = _mm_cvtps_pd(_mm_shuffle_ps(a, a, 0x8d)); return true; } -bool spu_interpreter_fast::FRDS(spu_thread& spu, spu_opcode_t op) +template +bool FRDS(spu_thread& spu, spu_opcode_t op) { - const auto t = _mm_cvtpd_ps(spu.gpr[op.ra].vd); - spu.gpr[op.rt].vf = _mm_shuffle_ps(t, t, 0x72); + const auto t = _mm_cvtpd_ps(spu.gpr[op.ra]); + spu.gpr[op.rt] = _mm_shuffle_ps(t, t, 0x72); return true; } -bool spu_interpreter_fast::FSCRWR(spu_thread&, spu_opcode_t) +template +bool FSCRWR(spu_thread&, spu_opcode_t) { return true; } -bool spu_interpreter::DFTSV(spu_thread&, spu_opcode_t) +template +bool DFTSV(spu_thread&, spu_opcode_t) { spu_log.fatal("DFTSV"); return false; } -bool spu_interpreter_fast::FCEQ(spu_thread& spu, spu_opcode_t op) +template +bool FCEQ(spu_thread& spu, spu_opcode_t op) { - spu.gpr[op.rt].vf = _mm_cmpeq_ps(spu.gpr[op.rb].vf, spu.gpr[op.ra].vf); + spu.gpr[op.rt] = _mm_cmpeq_ps(spu.gpr[op.rb], spu.gpr[op.ra]); return true; } -bool spu_interpreter::DFCEQ(spu_thread&, spu_opcode_t) +template +bool DFCEQ(spu_thread&, spu_opcode_t) { spu_log.fatal("DFCEQ"); return false; } -bool spu_interpreter::MPY(spu_thread& spu, spu_opcode_t op) +template +bool MPY(spu_thread& spu, spu_opcode_t op) { const auto mask = _mm_set1_epi32(0xffff); - spu.gpr[op.rt].vi = _mm_madd_epi16(_mm_and_si128(spu.gpr[op.ra].vi, mask), _mm_and_si128(spu.gpr[op.rb].vi, mask)); + spu.gpr[op.rt] = _mm_madd_epi16(_mm_and_si128(spu.gpr[op.ra], mask), _mm_and_si128(spu.gpr[op.rb], mask)); return true; } -bool spu_interpreter::MPYH(spu_thread& spu, spu_opcode_t op) +template +bool MPYH(spu_thread& spu, spu_opcode_t op) { - spu.gpr[op.rt].vi = _mm_slli_epi32(_mm_mullo_epi16(_mm_srli_epi32(spu.gpr[op.ra].vi, 16), spu.gpr[op.rb].vi), 16); + spu.gpr[op.rt] = _mm_slli_epi32(_mm_mullo_epi16(_mm_srli_epi32(spu.gpr[op.ra], 16), spu.gpr[op.rb]), 16); return true; } -bool spu_interpreter::MPYHH(spu_thread& spu, spu_opcode_t op) +template +bool MPYHH(spu_thread& spu, spu_opcode_t op) { - spu.gpr[op.rt].vi = _mm_madd_epi16(_mm_srli_epi32(spu.gpr[op.ra].vi, 16), _mm_srli_epi32(spu.gpr[op.rb].vi, 16)); + spu.gpr[op.rt] = _mm_madd_epi16(_mm_srli_epi32(spu.gpr[op.ra], 16), _mm_srli_epi32(spu.gpr[op.rb], 16)); return true; } -bool spu_interpreter::MPYS(spu_thread& spu, spu_opcode_t op) +template +bool MPYS(spu_thread& spu, spu_opcode_t op) { - spu.gpr[op.rt].vi = _mm_srai_epi32(_mm_slli_epi32(_mm_mulhi_epi16(spu.gpr[op.ra].vi, spu.gpr[op.rb].vi), 16), 16); + spu.gpr[op.rt] = _mm_srai_epi32(_mm_slli_epi32(_mm_mulhi_epi16(spu.gpr[op.ra], spu.gpr[op.rb]), 16), 16); return true; } -bool spu_interpreter::CEQH(spu_thread& spu, spu_opcode_t op) +template +bool CEQH(spu_thread& spu, spu_opcode_t op) { - spu.gpr[op.rt].vi = _mm_cmpeq_epi16(spu.gpr[op.ra].vi, spu.gpr[op.rb].vi); + spu.gpr[op.rt] = _mm_cmpeq_epi16(spu.gpr[op.ra], spu.gpr[op.rb]); return true; } -bool spu_interpreter_fast::FCMEQ(spu_thread& spu, spu_opcode_t op) +template +bool FCMEQ(spu_thread& spu, spu_opcode_t op) { const auto mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)); - spu.gpr[op.rt].vf = _mm_cmpeq_ps(_mm_and_ps(spu.gpr[op.rb].vf, mask), _mm_and_ps(spu.gpr[op.ra].vf, mask)); + spu.gpr[op.rt] = _mm_cmpeq_ps(_mm_and_ps(spu.gpr[op.rb], mask), _mm_and_ps(spu.gpr[op.ra], mask)); return true; } -bool spu_interpreter::DFCMEQ(spu_thread&, spu_opcode_t) +template +bool DFCMEQ(spu_thread&, spu_opcode_t) { spu_log.fatal("DFCMEQ"); return false; } -bool 
spu_interpreter::MPYU(spu_thread& spu, spu_opcode_t op) +template +bool MPYU(spu_thread& spu, spu_opcode_t op) { - const auto a = spu.gpr[op.ra].vi; - const auto b = spu.gpr[op.rb].vi; - spu.gpr[op.rt].vi = _mm_or_si128(_mm_slli_epi32(_mm_mulhi_epu16(a, b), 16), _mm_and_si128(_mm_mullo_epi16(a, b), _mm_set1_epi32(0xffff))); + const auto a = spu.gpr[op.ra]; + const auto b = spu.gpr[op.rb]; + spu.gpr[op.rt] = _mm_or_si128(_mm_slli_epi32(_mm_mulhi_epu16(a, b), 16), _mm_and_si128(_mm_mullo_epi16(a, b), _mm_set1_epi32(0xffff))); return true; } -bool spu_interpreter::CEQB(spu_thread& spu, spu_opcode_t op) +template +bool CEQB(spu_thread& spu, spu_opcode_t op) { - spu.gpr[op.rt].vi = _mm_cmpeq_epi8(spu.gpr[op.ra].vi, spu.gpr[op.rb].vi); + spu.gpr[op.rt] = _mm_cmpeq_epi8(spu.gpr[op.ra], spu.gpr[op.rb]); return true; } -bool spu_interpreter_fast::FI(spu_thread& spu, spu_opcode_t op) +template +bool FI(spu_thread& spu, spu_opcode_t op) { // TODO const auto mask_se = _mm_castsi128_ps(_mm_set1_epi32(0xff800000)); // sign and exponent mask const auto mask_bf = _mm_castsi128_ps(_mm_set1_epi32(0x007ffc00)); // base fraction mask const auto mask_sf = _mm_set1_epi32(0x000003ff); // step fraction mask const auto mask_yf = _mm_set1_epi32(0x0007ffff); // Y fraction mask (bits 13..31) - const auto base = _mm_or_ps(_mm_and_ps(spu.gpr[op.rb].vf, mask_bf), _mm_castsi128_ps(_mm_set1_epi32(0x3f800000))); - const auto step = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(spu.gpr[op.rb].vi, mask_sf)), _mm_set1_ps(std::exp2(-13.f))); - const auto y = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(spu.gpr[op.ra].vi, mask_yf)), _mm_set1_ps(std::exp2(-19.f))); - spu.gpr[op.rt].vf = _mm_or_ps(_mm_and_ps(mask_se, spu.gpr[op.rb].vf), _mm_andnot_ps(mask_se, _mm_sub_ps(base, _mm_mul_ps(step, y)))); + const auto base = _mm_or_ps(_mm_and_ps(spu.gpr[op.rb], mask_bf), _mm_castsi128_ps(_mm_set1_epi32(0x3f800000))); + const auto step = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(spu.gpr[op.rb], mask_sf)), _mm_set1_ps(std::exp2(-13.f))); + const auto y = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(spu.gpr[op.ra], mask_yf)), _mm_set1_ps(std::exp2(-19.f))); + spu.gpr[op.rt] = _mm_or_ps(_mm_and_ps(mask_se, spu.gpr[op.rb]), _mm_andnot_ps(mask_se, _mm_sub_ps(base, _mm_mul_ps(step, y)))); return true; } -bool spu_interpreter::HEQ(spu_thread& spu, spu_opcode_t op) +template +bool HEQ(spu_thread& spu, spu_opcode_t op) { if (spu.gpr[op.ra]._s32[3] == spu.gpr[op.rb]._s32[3]) { @@ -1345,37 +1512,42 @@ bool spu_interpreter::HEQ(spu_thread& spu, spu_opcode_t op) } -bool spu_interpreter_fast::CFLTS(spu_thread& spu, spu_opcode_t op) +template +bool CFLTS(spu_thread& spu, spu_opcode_t op) { - const auto scaled = _mm_mul_ps(spu.gpr[op.ra].vf, g_spu_imm.scale[173 - op.i8]); - spu.gpr[op.rt].vi = _mm_xor_si128(_mm_cvttps_epi32(scaled), _mm_castps_si128(_mm_cmpge_ps(scaled, _mm_set1_ps(0x80000000)))); + const auto scaled = _mm_mul_ps(spu.gpr[op.ra], g_spu_imm.scale[173 - op.i8]); + spu.gpr[op.rt] = _mm_xor_si128(_mm_cvttps_epi32(scaled), _mm_castps_si128(_mm_cmpge_ps(scaled, _mm_set1_ps(0x80000000)))); return true; } -bool spu_interpreter_fast::CFLTU(spu_thread& spu, spu_opcode_t op) +template +bool CFLTU(spu_thread& spu, spu_opcode_t op) { - const auto scaled1 = _mm_max_ps(_mm_mul_ps(spu.gpr[op.ra].vf, g_spu_imm.scale[173 - op.i8]), _mm_set1_ps(0.0f)); + const auto scaled1 = _mm_max_ps(_mm_mul_ps(spu.gpr[op.ra], g_spu_imm.scale[173 - op.i8]), _mm_set1_ps(0.0f)); const auto scaled2 = _mm_and_ps(_mm_sub_ps(scaled1, _mm_set1_ps(0x80000000)), _mm_cmpge_ps(scaled1, 
_mm_set1_ps(0x80000000))); - spu.gpr[op.rt].vi = _mm_or_si128(_mm_or_si128(_mm_cvttps_epi32(scaled1), _mm_cvttps_epi32(scaled2)), _mm_castps_si128(_mm_cmpge_ps(scaled1, _mm_set1_ps(0x100000000)))); + spu.gpr[op.rt] = _mm_or_si128(_mm_or_si128(_mm_cvttps_epi32(scaled1), _mm_cvttps_epi32(scaled2)), _mm_castps_si128(_mm_cmpge_ps(scaled1, _mm_set1_ps(0x100000000)))); return true; } -bool spu_interpreter_fast::CSFLT(spu_thread& spu, spu_opcode_t op) +template +bool CSFLT(spu_thread& spu, spu_opcode_t op) { - spu.gpr[op.rt].vf = _mm_mul_ps(_mm_cvtepi32_ps(spu.gpr[op.ra].vi), g_spu_imm.scale[op.i8 - 155]); + spu.gpr[op.rt] = _mm_mul_ps(_mm_cvtepi32_ps(spu.gpr[op.ra]), g_spu_imm.scale[op.i8 - 155]); return true; } -bool spu_interpreter_fast::CUFLT(spu_thread& spu, spu_opcode_t op) +template +bool CUFLT(spu_thread& spu, spu_opcode_t op) { - const auto a = spu.gpr[op.ra].vi; + const auto a = spu.gpr[op.ra]; const auto fix = _mm_and_ps(_mm_castsi128_ps(_mm_srai_epi32(a, 31)), _mm_set1_ps(0x80000000)); - spu.gpr[op.rt].vf = _mm_mul_ps(_mm_add_ps(_mm_cvtepi32_ps(_mm_and_si128(a, _mm_set1_epi32(0x7fffffff))), fix), g_spu_imm.scale[op.i8 - 155]); + spu.gpr[op.rt] = _mm_mul_ps(_mm_add_ps(_mm_cvtepi32_ps(_mm_and_si128(a, _mm_set1_epi32(0x7fffffff))), fix), g_spu_imm.scale[op.i8 - 155]); return true; } -bool spu_interpreter::BRZ(spu_thread& spu, spu_opcode_t op) +template +bool BRZ(spu_thread& spu, spu_opcode_t op) { if (spu.gpr[op.rt]._u32[3] == 0) { @@ -1385,13 +1557,15 @@ bool spu_interpreter::BRZ(spu_thread& spu, spu_opcode_t op) return true; } -bool spu_interpreter::STQA(spu_thread& spu, spu_opcode_t op) +template +bool STQA(spu_thread& spu, spu_opcode_t op) { spu._ref(spu_ls_target(0, op.i16)) = spu.gpr[op.rt]; return true; } -bool spu_interpreter::BRNZ(spu_thread& spu, spu_opcode_t op) +template +bool BRNZ(spu_thread& spu, spu_opcode_t op) { if (spu.gpr[op.rt]._u32[3] != 0) { @@ -1401,7 +1575,8 @@ bool spu_interpreter::BRNZ(spu_thread& spu, spu_opcode_t op) return true; } -bool spu_interpreter::BRHZ(spu_thread& spu, spu_opcode_t op) +template +bool BRHZ(spu_thread& spu, spu_opcode_t op) { if (spu.gpr[op.rt]._u16[6] == 0) { @@ -1411,7 +1586,8 @@ bool spu_interpreter::BRHZ(spu_thread& spu, spu_opcode_t op) return true; } -bool spu_interpreter::BRHNZ(spu_thread& spu, spu_opcode_t op) +template +bool BRHNZ(spu_thread& spu, spu_opcode_t op) { if (spu.gpr[op.rt]._u16[6] != 0) { @@ -1421,25 +1597,29 @@ bool spu_interpreter::BRHNZ(spu_thread& spu, spu_opcode_t op) return true; } -bool spu_interpreter::STQR(spu_thread& spu, spu_opcode_t op) +template +bool STQR(spu_thread& spu, spu_opcode_t op) { spu._ref(spu_ls_target(spu.pc, op.i16)) = spu.gpr[op.rt]; return true; } -bool spu_interpreter::BRA(spu_thread& spu, spu_opcode_t op) +template +bool BRA(spu_thread& spu, spu_opcode_t op) { spu.pc = spu_branch_target(0, op.i16); return false; } -bool spu_interpreter::LQA(spu_thread& spu, spu_opcode_t op) +template +bool LQA(spu_thread& spu, spu_opcode_t op) { spu.gpr[op.rt] = spu._ref(spu_ls_target(0, op.i16)); return true; } -bool spu_interpreter::BRASL(spu_thread& spu, spu_opcode_t op) +template +bool BRASL(spu_thread& spu, spu_opcode_t op) { const u32 target = spu_branch_target(0, op.i16); spu.gpr[op.rt] = v128::from32r(spu_branch_target(spu.pc + 4)); @@ -1447,22 +1627,25 @@ bool spu_interpreter::BRASL(spu_thread& spu, spu_opcode_t op) return false; } -bool spu_interpreter::BR(spu_thread& spu, spu_opcode_t op) +template +bool BR(spu_thread& spu, spu_opcode_t op) { spu.pc = spu_branch_target(spu.pc, op.i16); return 
false; } -bool spu_interpreter::FSMBI(spu_thread& spu, spu_opcode_t op) +template +bool FSMBI(spu_thread& spu, spu_opcode_t op) { const auto vsrc = _mm_set_epi32(0, 0, 0, op.i16); const auto bits = _mm_shuffle_epi32(_mm_shufflelo_epi16(_mm_unpacklo_epi8(vsrc, vsrc), 0x50), 0x50); const auto mask = _mm_set_epi8(-128, 64, 32, 16, 8, 4, 2, 1, -128, 64, 32, 16, 8, 4, 2, 1); - spu.gpr[op.rt].vi = _mm_cmpeq_epi8(_mm_and_si128(bits, mask), mask); + spu.gpr[op.rt] = _mm_cmpeq_epi8(_mm_and_si128(bits, mask), mask); return true; } -bool spu_interpreter::BRSL(spu_thread& spu, spu_opcode_t op) +template +bool BRSL(spu_thread& spu, spu_opcode_t op) { const u32 target = spu_branch_target(spu.pc, op.i16); spu.gpr[op.rt] = v128::from32r(spu_branch_target(spu.pc + 4)); @@ -1470,146 +1653,170 @@ bool spu_interpreter::BRSL(spu_thread& spu, spu_opcode_t op) return false; } -bool spu_interpreter::LQR(spu_thread& spu, spu_opcode_t op) +template +bool LQR(spu_thread& spu, spu_opcode_t op) { spu.gpr[op.rt] = spu._ref(spu_ls_target(spu.pc, op.i16)); return true; } -bool spu_interpreter::IL(spu_thread& spu, spu_opcode_t op) +template +bool IL(spu_thread& spu, spu_opcode_t op) { - spu.gpr[op.rt].vi = _mm_set1_epi32(op.si16); + spu.gpr[op.rt] = _mm_set1_epi32(op.si16); return true; } -bool spu_interpreter::ILHU(spu_thread& spu, spu_opcode_t op) +template +bool ILHU(spu_thread& spu, spu_opcode_t op) { - spu.gpr[op.rt].vi = _mm_set1_epi32(op.i16 << 16); + spu.gpr[op.rt] = _mm_set1_epi32(op.i16 << 16); return true; } -bool spu_interpreter::ILH(spu_thread& spu, spu_opcode_t op) +template +bool ILH(spu_thread& spu, spu_opcode_t op) { - spu.gpr[op.rt].vi = _mm_set1_epi16(op.i16); + spu.gpr[op.rt] = _mm_set1_epi16(op.i16); return true; } -bool spu_interpreter::IOHL(spu_thread& spu, spu_opcode_t op) +template +bool IOHL(spu_thread& spu, spu_opcode_t op) { - spu.gpr[op.rt].vi = _mm_or_si128(spu.gpr[op.rt].vi, _mm_set1_epi32(op.i16)); + spu.gpr[op.rt] = _mm_or_si128(spu.gpr[op.rt], _mm_set1_epi32(op.i16)); return true; } -bool spu_interpreter::ORI(spu_thread& spu, spu_opcode_t op) +template +bool ORI(spu_thread& spu, spu_opcode_t op) { - spu.gpr[op.rt].vi = _mm_or_si128(spu.gpr[op.ra].vi, _mm_set1_epi32(op.si10)); + spu.gpr[op.rt] = _mm_or_si128(spu.gpr[op.ra], _mm_set1_epi32(op.si10)); return true; } -bool spu_interpreter::ORHI(spu_thread& spu, spu_opcode_t op) +template +bool ORHI(spu_thread& spu, spu_opcode_t op) { - spu.gpr[op.rt].vi = _mm_or_si128(spu.gpr[op.ra].vi, _mm_set1_epi16(op.si10)); + spu.gpr[op.rt] = _mm_or_si128(spu.gpr[op.ra], _mm_set1_epi16(op.si10)); return true; } -bool spu_interpreter::ORBI(spu_thread& spu, spu_opcode_t op) +template +bool ORBI(spu_thread& spu, spu_opcode_t op) { - spu.gpr[op.rt].vi = _mm_or_si128(spu.gpr[op.ra].vi, _mm_set1_epi8(op.i8)); + spu.gpr[op.rt] = _mm_or_si128(spu.gpr[op.ra], _mm_set1_epi8(op.i8)); return true; } -bool spu_interpreter::SFI(spu_thread& spu, spu_opcode_t op) +template +bool SFI(spu_thread& spu, spu_opcode_t op) { - spu.gpr[op.rt].vi = _mm_sub_epi32(_mm_set1_epi32(op.si10), spu.gpr[op.ra].vi); + spu.gpr[op.rt] = _mm_sub_epi32(_mm_set1_epi32(op.si10), spu.gpr[op.ra]); return true; } -bool spu_interpreter::SFHI(spu_thread& spu, spu_opcode_t op) +template +bool SFHI(spu_thread& spu, spu_opcode_t op) { - spu.gpr[op.rt].vi = _mm_sub_epi16(_mm_set1_epi16(op.si10), spu.gpr[op.ra].vi); + spu.gpr[op.rt] = _mm_sub_epi16(_mm_set1_epi16(op.si10), spu.gpr[op.ra]); return true; } -bool spu_interpreter::ANDI(spu_thread& spu, spu_opcode_t op) +template +bool ANDI(spu_thread& spu, 
spu_opcode_t op) { - spu.gpr[op.rt].vi = _mm_and_si128(spu.gpr[op.ra].vi, _mm_set1_epi32(op.si10)); + spu.gpr[op.rt] = _mm_and_si128(spu.gpr[op.ra], _mm_set1_epi32(op.si10)); return true; } -bool spu_interpreter::ANDHI(spu_thread& spu, spu_opcode_t op) +template +bool ANDHI(spu_thread& spu, spu_opcode_t op) { - spu.gpr[op.rt].vi = _mm_and_si128(spu.gpr[op.ra].vi, _mm_set1_epi16(op.si10)); + spu.gpr[op.rt] = _mm_and_si128(spu.gpr[op.ra], _mm_set1_epi16(op.si10)); return true; } -bool spu_interpreter::ANDBI(spu_thread& spu, spu_opcode_t op) +template +bool ANDBI(spu_thread& spu, spu_opcode_t op) { - spu.gpr[op.rt].vi = _mm_and_si128(spu.gpr[op.ra].vi, _mm_set1_epi8(op.i8)); + spu.gpr[op.rt] = _mm_and_si128(spu.gpr[op.ra], _mm_set1_epi8(op.i8)); return true; } -bool spu_interpreter::AI(spu_thread& spu, spu_opcode_t op) +template +bool AI(spu_thread& spu, spu_opcode_t op) { - spu.gpr[op.rt].vi = _mm_add_epi32(_mm_set1_epi32(op.si10), spu.gpr[op.ra].vi); + spu.gpr[op.rt] = _mm_add_epi32(_mm_set1_epi32(op.si10), spu.gpr[op.ra]); return true; } -bool spu_interpreter::AHI(spu_thread& spu, spu_opcode_t op) +template +bool AHI(spu_thread& spu, spu_opcode_t op) { - spu.gpr[op.rt].vi = _mm_add_epi16(_mm_set1_epi16(op.si10), spu.gpr[op.ra].vi); + spu.gpr[op.rt] = _mm_add_epi16(_mm_set1_epi16(op.si10), spu.gpr[op.ra]); return true; } -bool spu_interpreter::STQD(spu_thread& spu, spu_opcode_t op) +template +bool STQD(spu_thread& spu, spu_opcode_t op) { spu._ref((spu.gpr[op.ra]._s32[3] + (op.si10 * 16)) & 0x3fff0) = spu.gpr[op.rt]; return true; } -bool spu_interpreter::LQD(spu_thread& spu, spu_opcode_t op) +template +bool LQD(spu_thread& spu, spu_opcode_t op) { spu.gpr[op.rt] = spu._ref((spu.gpr[op.ra]._s32[3] + (op.si10 * 16)) & 0x3fff0); return true; } -bool spu_interpreter::XORI(spu_thread& spu, spu_opcode_t op) +template +bool XORI(spu_thread& spu, spu_opcode_t op) { - spu.gpr[op.rt].vi = _mm_xor_si128(spu.gpr[op.ra].vi, _mm_set1_epi32(op.si10)); + spu.gpr[op.rt] = _mm_xor_si128(spu.gpr[op.ra], _mm_set1_epi32(op.si10)); return true; } -bool spu_interpreter::XORHI(spu_thread& spu, spu_opcode_t op) +template +bool XORHI(spu_thread& spu, spu_opcode_t op) { - spu.gpr[op.rt].vi = _mm_xor_si128(spu.gpr[op.ra].vi, _mm_set1_epi16(op.si10)); + spu.gpr[op.rt] = _mm_xor_si128(spu.gpr[op.ra], _mm_set1_epi16(op.si10)); return true; } -bool spu_interpreter::XORBI(spu_thread& spu, spu_opcode_t op) +template +bool XORBI(spu_thread& spu, spu_opcode_t op) { - spu.gpr[op.rt].vi = _mm_xor_si128(spu.gpr[op.ra].vi, _mm_set1_epi8(op.i8)); + spu.gpr[op.rt] = _mm_xor_si128(spu.gpr[op.ra], _mm_set1_epi8(op.i8)); return true; } -bool spu_interpreter::CGTI(spu_thread& spu, spu_opcode_t op) +template +bool CGTI(spu_thread& spu, spu_opcode_t op) { - spu.gpr[op.rt].vi = _mm_cmpgt_epi32(spu.gpr[op.ra].vi, _mm_set1_epi32(op.si10)); + spu.gpr[op.rt] = _mm_cmpgt_epi32(spu.gpr[op.ra], _mm_set1_epi32(op.si10)); return true; } -bool spu_interpreter::CGTHI(spu_thread& spu, spu_opcode_t op) +template +bool CGTHI(spu_thread& spu, spu_opcode_t op) { - spu.gpr[op.rt].vi = _mm_cmpgt_epi16(spu.gpr[op.ra].vi, _mm_set1_epi16(op.si10)); + spu.gpr[op.rt] = _mm_cmpgt_epi16(spu.gpr[op.ra], _mm_set1_epi16(op.si10)); return true; } -bool spu_interpreter::CGTBI(spu_thread& spu, spu_opcode_t op) +template +bool CGTBI(spu_thread& spu, spu_opcode_t op) { - spu.gpr[op.rt].vi = _mm_cmpgt_epi8(spu.gpr[op.ra].vi, _mm_set1_epi8(op.i8)); + spu.gpr[op.rt] = _mm_cmpgt_epi8(spu.gpr[op.ra], _mm_set1_epi8(op.i8)); return true; } -bool spu_interpreter::HGTI(spu_thread& 
spu, spu_opcode_t op) +template +bool HGTI(spu_thread& spu, spu_opcode_t op) { if (spu.gpr[op.ra]._s32[3] > op.si10) { @@ -1618,25 +1825,29 @@ bool spu_interpreter::HGTI(spu_thread& spu, spu_opcode_t op) return true; } -bool spu_interpreter::CLGTI(spu_thread& spu, spu_opcode_t op) +template +bool CLGTI(spu_thread& spu, spu_opcode_t op) { - spu.gpr[op.rt].vi = _mm_cmpgt_epi32(_mm_xor_si128(spu.gpr[op.ra].vi, _mm_set1_epi32(0x80000000)), _mm_set1_epi32(op.si10 ^ 0x80000000)); + spu.gpr[op.rt] = _mm_cmpgt_epi32(_mm_xor_si128(spu.gpr[op.ra], _mm_set1_epi32(0x80000000)), _mm_set1_epi32(op.si10 ^ 0x80000000)); return true; } -bool spu_interpreter::CLGTHI(spu_thread& spu, spu_opcode_t op) +template +bool CLGTHI(spu_thread& spu, spu_opcode_t op) { - spu.gpr[op.rt].vi = _mm_cmpgt_epi16(_mm_xor_si128(spu.gpr[op.ra].vi, _mm_set1_epi32(0x80008000)), _mm_set1_epi16(op.si10 ^ 0x8000)); + spu.gpr[op.rt] = _mm_cmpgt_epi16(_mm_xor_si128(spu.gpr[op.ra], _mm_set1_epi32(0x80008000)), _mm_set1_epi16(op.si10 ^ 0x8000)); return true; } -bool spu_interpreter::CLGTBI(spu_thread& spu, spu_opcode_t op) +template +bool CLGTBI(spu_thread& spu, spu_opcode_t op) { - spu.gpr[op.rt].vi = _mm_cmpgt_epi8(_mm_xor_si128(spu.gpr[op.ra].vi, _mm_set1_epi32(0x80808080)), _mm_set1_epi8(op.i8 ^ 0x80)); + spu.gpr[op.rt] = _mm_cmpgt_epi8(_mm_xor_si128(spu.gpr[op.ra], _mm_set1_epi32(0x80808080)), _mm_set1_epi8(op.i8 ^ 0x80)); return true; } -bool spu_interpreter::HLGTI(spu_thread& spu, spu_opcode_t op) +template +bool HLGTI(spu_thread& spu, spu_opcode_t op) { if (spu.gpr[op.ra]._u32[3] > static_cast(op.si10)) { @@ -1645,39 +1856,45 @@ bool spu_interpreter::HLGTI(spu_thread& spu, spu_opcode_t op) return true; } -bool spu_interpreter::MPYI(spu_thread& spu, spu_opcode_t op) +template +bool MPYI(spu_thread& spu, spu_opcode_t op) { - spu.gpr[op.rt].vi = _mm_madd_epi16(spu.gpr[op.ra].vi, _mm_set1_epi32(op.si10 & 0xffff)); + spu.gpr[op.rt] = _mm_madd_epi16(spu.gpr[op.ra], _mm_set1_epi32(op.si10 & 0xffff)); return true; } -bool spu_interpreter::MPYUI(spu_thread& spu, spu_opcode_t op) +template +bool MPYUI(spu_thread& spu, spu_opcode_t op) { - const auto a = spu.gpr[op.ra].vi; + const auto a = spu.gpr[op.ra]; const auto i = _mm_set1_epi32(op.si10 & 0xffff); - spu.gpr[op.rt].vi = _mm_or_si128(_mm_slli_epi32(_mm_mulhi_epu16(a, i), 16), _mm_mullo_epi16(a, i)); + spu.gpr[op.rt] = _mm_or_si128(_mm_slli_epi32(_mm_mulhi_epu16(a, i), 16), _mm_mullo_epi16(a, i)); return true; } -bool spu_interpreter::CEQI(spu_thread& spu, spu_opcode_t op) +template +bool CEQI(spu_thread& spu, spu_opcode_t op) { - spu.gpr[op.rt].vi = _mm_cmpeq_epi32(spu.gpr[op.ra].vi, _mm_set1_epi32(op.si10)); + spu.gpr[op.rt] = _mm_cmpeq_epi32(spu.gpr[op.ra], _mm_set1_epi32(op.si10)); return true; } -bool spu_interpreter::CEQHI(spu_thread& spu, spu_opcode_t op) +template +bool CEQHI(spu_thread& spu, spu_opcode_t op) { - spu.gpr[op.rt].vi = _mm_cmpeq_epi16(spu.gpr[op.ra].vi, _mm_set1_epi16(op.si10)); + spu.gpr[op.rt] = _mm_cmpeq_epi16(spu.gpr[op.ra], _mm_set1_epi16(op.si10)); return true; } -bool spu_interpreter::CEQBI(spu_thread& spu, spu_opcode_t op) +template +bool CEQBI(spu_thread& spu, spu_opcode_t op) { - spu.gpr[op.rt].vi = _mm_cmpeq_epi8(spu.gpr[op.ra].vi, _mm_set1_epi8(op.i8)); + spu.gpr[op.rt] = _mm_cmpeq_epi8(spu.gpr[op.ra], _mm_set1_epi8(op.i8)); return true; } -bool spu_interpreter::HEQI(spu_thread& spu, spu_opcode_t op) +template +bool HEQI(spu_thread& spu, spu_opcode_t op) { if (spu.gpr[op.ra]._s32[3] == op.si10) { @@ -1687,34 +1904,39 @@ bool 
spu_interpreter::HEQI(spu_thread& spu, spu_opcode_t op) } -bool spu_interpreter::HBRA(spu_thread&, spu_opcode_t) +template +bool HBRA(spu_thread&, spu_opcode_t) { return true; } -bool spu_interpreter::HBRR(spu_thread&, spu_opcode_t) +template +bool HBRR(spu_thread&, spu_opcode_t) { return true; } -bool spu_interpreter::ILA(spu_thread& spu, spu_opcode_t op) +template +bool ILA(spu_thread& spu, spu_opcode_t op) { - spu.gpr[op.rt].vi = _mm_set1_epi32(op.i18); + spu.gpr[op.rt] = _mm_set1_epi32(op.i18); return true; } -bool spu_interpreter::SELB(spu_thread& spu, spu_opcode_t op) +template +bool SELB(spu_thread& spu, spu_opcode_t op) { - spu.gpr[op.rt4] = (spu.gpr[op.rc] & spu.gpr[op.rb]) | v128::andnot(spu.gpr[op.rc], spu.gpr[op.ra]); + spu.gpr[op.rt4] = (spu.gpr[op.rc] & spu.gpr[op.rb]) | gv_andn(spu.gpr[op.rc], spu.gpr[op.ra]); return true; } -bool spu_interpreter::SHUFB(spu_thread& spu, spu_opcode_t op) +template +bool SHUFB(spu_thread& spu, spu_opcode_t op) { - __m128i ab[2]{spu.gpr[op.rb].vi, spu.gpr[op.ra].vi}; + __m128i ab[2]{__m128i(spu.gpr[op.rb]), __m128i(spu.gpr[op.ra])}; v128 c = spu.gpr[op.rc]; - v128 x = v128::fromV(_mm_andnot_si128(c.vi, _mm_set1_epi8(0x1f))); + v128 x = _mm_andnot_si128(c, _mm_set1_epi8(0x1f)); v128 res; // Select bytes @@ -1726,14 +1948,15 @@ bool spu_interpreter::SHUFB(spu_thread& spu, spu_opcode_t op) // Select special values const auto xc0 = _mm_set1_epi8(static_cast(0xc0)); const auto xe0 = _mm_set1_epi8(static_cast(0xe0)); - const auto cmp0 = _mm_cmpgt_epi8(_mm_setzero_si128(), c.vi); - const auto cmp1 = _mm_cmpeq_epi8(_mm_and_si128(c.vi, xc0), xc0); - const auto cmp2 = _mm_cmpeq_epi8(_mm_and_si128(c.vi, xe0), xc0); - spu.gpr[op.rt4].vi = _mm_or_si128(_mm_andnot_si128(cmp0, res.vi), _mm_avg_epu8(cmp1, cmp2)); + const auto cmp0 = _mm_cmpgt_epi8(_mm_setzero_si128(), c); + const auto cmp1 = _mm_cmpeq_epi8(_mm_and_si128(c, xc0), xc0); + const auto cmp2 = _mm_cmpeq_epi8(_mm_and_si128(c, xe0), xc0); + spu.gpr[op.rt4] = _mm_or_si128(_mm_andnot_si128(cmp0, res), _mm_avg_epu8(cmp1, cmp2)); return true; } -const spu_inter_func_t optimized_shufb = build_function_asm("spu_shufb", [](asmjit::x86::Assembler& c, auto& /*args*/) +#if defined(ARCH_X64) +const spu_intrp_func_t optimized_shufb = build_function_asm("spu_shufb", [](asmjit::x86::Assembler& c, auto& /*args*/) { using namespace asmjit; @@ -1804,65 +2027,72 @@ const spu_inter_func_t optimized_shufb = build_function_asm("s c.dq(0x0f0f0f0f0f0f0f0f); c.dq(0x0f0f0f0f0f0f0f0f); }); +#endif -bool spu_interpreter::MPYA(spu_thread& spu, spu_opcode_t op) +template +bool MPYA(spu_thread& spu, spu_opcode_t op) { const auto mask = _mm_set1_epi32(0xffff); - spu.gpr[op.rt4].vi = _mm_add_epi32(spu.gpr[op.rc].vi, _mm_madd_epi16(_mm_and_si128(spu.gpr[op.ra].vi, mask), _mm_and_si128(spu.gpr[op.rb].vi, mask))); + spu.gpr[op.rt4] = _mm_add_epi32(spu.gpr[op.rc], _mm_madd_epi16(_mm_and_si128(spu.gpr[op.ra], mask), _mm_and_si128(spu.gpr[op.rb], mask))); return true; } -bool spu_interpreter_fast::FNMS(spu_thread& spu, spu_opcode_t op) +template +bool FNMS(spu_thread& spu, spu_opcode_t op) { const u32 test_bits = 0x7f800000; auto mask = _mm_set1_ps(std::bit_cast(test_bits)); - auto test_a = _mm_and_ps(spu.gpr[op.ra].vf, mask); + auto test_a = _mm_and_ps(spu.gpr[op.ra], mask); auto mask_a = _mm_cmpneq_ps(test_a, mask); - auto test_b = _mm_and_ps(spu.gpr[op.rb].vf, mask); + auto test_b = _mm_and_ps(spu.gpr[op.rb], mask); auto mask_b = _mm_cmpneq_ps(test_b, mask); - auto a = _mm_and_ps(spu.gpr[op.ra].vf, mask_a); - auto b = 
_mm_and_ps(spu.gpr[op.rb].vf, mask_b); + auto a = _mm_and_ps(spu.gpr[op.ra], mask_a); + auto b = _mm_and_ps(spu.gpr[op.rb], mask_b); - spu.gpr[op.rt4].vf = _mm_sub_ps(spu.gpr[op.rc].vf, _mm_mul_ps(a, b)); + spu.gpr[op.rt4] = _mm_sub_ps(spu.gpr[op.rc], _mm_mul_ps(a, b)); return true; } -bool spu_interpreter_fast::FMA(spu_thread& spu, spu_opcode_t op) +template +bool FMA(spu_thread& spu, spu_opcode_t op) { const u32 test_bits = 0x7f800000; auto mask = _mm_set1_ps(std::bit_cast(test_bits)); - auto test_a = _mm_and_ps(spu.gpr[op.ra].vf, mask); + auto test_a = _mm_and_ps(spu.gpr[op.ra], mask); auto mask_a = _mm_cmpneq_ps(test_a, mask); - auto test_b = _mm_and_ps(spu.gpr[op.rb].vf, mask); + auto test_b = _mm_and_ps(spu.gpr[op.rb], mask); auto mask_b = _mm_cmpneq_ps(test_b, mask); - auto a = _mm_and_ps(spu.gpr[op.ra].vf, mask_a); - auto b = _mm_and_ps(spu.gpr[op.rb].vf, mask_b); + auto a = _mm_and_ps(spu.gpr[op.ra], mask_a); + auto b = _mm_and_ps(spu.gpr[op.rb], mask_b); - spu.gpr[op.rt4].vf = _mm_add_ps(_mm_mul_ps(a, b), spu.gpr[op.rc].vf); + spu.gpr[op.rt4] = _mm_add_ps(_mm_mul_ps(a, b), spu.gpr[op.rc]); return true; } -bool spu_interpreter_fast::FMS(spu_thread& spu, spu_opcode_t op) +template +bool FMS(spu_thread& spu, spu_opcode_t op) { const u32 test_bits = 0x7f800000; auto mask = _mm_set1_ps(std::bit_cast(test_bits)); - auto test_a = _mm_and_ps(spu.gpr[op.ra].vf, mask); + auto test_a = _mm_and_ps(spu.gpr[op.ra], mask); auto mask_a = _mm_cmpneq_ps(test_a, mask); - auto test_b = _mm_and_ps(spu.gpr[op.rb].vf, mask); + auto test_b = _mm_and_ps(spu.gpr[op.rb], mask); auto mask_b = _mm_cmpneq_ps(test_b, mask); - auto a = _mm_and_ps(spu.gpr[op.ra].vf, mask_a); - auto b = _mm_and_ps(spu.gpr[op.rb].vf, mask_b); + auto a = _mm_and_ps(spu.gpr[op.ra], mask_a); + auto b = _mm_and_ps(spu.gpr[op.rb], mask_b); - spu.gpr[op.rt4].vf = _mm_sub_ps(_mm_mul_ps(a, b), spu.gpr[op.rc].vf); + spu.gpr[op.rt4] = _mm_sub_ps(_mm_mul_ps(a, b), spu.gpr[op.rc]); return true; } +#if 0 + static void SetHostRoundingMode(u32 rn) { switch (rn) @@ -1932,7 +2162,7 @@ bool spu_interpreter_precise::FREST(spu_thread& spu, spu_opcode_t op) { fesetround(FE_TOWARDZERO); const auto ra = spu.gpr[op.ra]; - auto res = v128::fromF(_mm_rcp_ps(ra.vf)); + v128 res = _mm_rcp_ps(ra); for (int i = 0; i < 4; i++) { const auto a = ra._f[i]; @@ -2664,3 +2894,437 @@ bool spu_interpreter_precise::FNMS(spu_thread& spu, spu_opcode_t op) { ::FMA(spu bool spu_interpreter_precise::FMA(spu_thread& spu, spu_opcode_t op) { ::FMA(spu, op, false, false); return true; } bool spu_interpreter_precise::FMS(spu_thread& spu, spu_opcode_t op) { ::FMA(spu, op, false, true); return true; } + +#endif /* __SSE2__ */ + +template +struct spu_interpreter_t +{ + IT UNK; + IT HEQ; + IT HEQI; + IT HGT; + IT HGTI; + IT HLGT; + IT HLGTI; + IT HBR; + IT HBRA; + IT HBRR; + IT STOP; + IT STOPD; + IT LNOP; + IT NOP; + IT SYNC; + IT DSYNC; + IT MFSPR; + IT MTSPR; + IT RDCH; + IT RCHCNT; + IT WRCH; + IT LQD; + IT LQX; + IT LQA; + IT LQR; + IT STQD; + IT STQX; + IT STQA; + IT STQR; + IT CBD; + IT CBX; + IT CHD; + IT CHX; + IT CWD; + IT CWX; + IT CDD; + IT CDX; + IT ILH; + IT ILHU; + IT IL; + IT ILA; + IT IOHL; + IT FSMBI; + IT AH; + IT AHI; + IT A; + IT AI; + IT SFH; + IT SFHI; + IT SF; + IT SFI; + IT ADDX; + IT CG; + IT CGX; + IT SFX; + IT BG; + IT BGX; + IT MPY; + IT MPYU; + IT MPYI; + IT MPYUI; + IT MPYH; + IT MPYS; + IT MPYHH; + IT MPYHHA; + IT MPYHHU; + IT MPYHHAU; + IT CLZ; + IT CNTB; + IT FSMB; + IT FSMH; + IT FSM; + IT GBB; + IT GBH; + IT GB; + IT AVGB; + IT ABSDB; + IT SUMB; + 
IT XSBH; + IT XSHW; + IT XSWD; + IT AND; + IT ANDC; + IT ANDBI; + IT ANDHI; + IT ANDI; + IT OR; + IT ORC; + IT ORBI; + IT ORHI; + IT ORI; + IT ORX; + IT XOR; + IT XORBI; + IT XORHI; + IT XORI; + IT NAND; + IT NOR; + IT EQV; + IT MPYA; + IT SELB; + IT SHUFB; + IT SHLH; + IT SHLHI; + IT SHL; + IT SHLI; + IT SHLQBI; + IT SHLQBII; + IT SHLQBY; + IT SHLQBYI; + IT SHLQBYBI; + IT ROTH; + IT ROTHI; + IT ROT; + IT ROTI; + IT ROTQBY; + IT ROTQBYI; + IT ROTQBYBI; + IT ROTQBI; + IT ROTQBII; + IT ROTHM; + IT ROTHMI; + IT ROTM; + IT ROTMI; + IT ROTQMBY; + IT ROTQMBYI; + IT ROTQMBYBI; + IT ROTQMBI; + IT ROTQMBII; + IT ROTMAH; + IT ROTMAHI; + IT ROTMA; + IT ROTMAI; + IT CEQB; + IT CEQBI; + IT CEQH; + IT CEQHI; + IT CEQ; + IT CEQI; + IT CGTB; + IT CGTBI; + IT CGTH; + IT CGTHI; + IT CGT; + IT CGTI; + IT CLGTB; + IT CLGTBI; + IT CLGTH; + IT CLGTHI; + IT CLGT; + IT CLGTI; + IT BR; + IT BRA; + IT BRSL; + IT BRASL; + IT BI; + IT IRET; + IT BISLED; + IT BISL; + IT BRNZ; + IT BRZ; + IT BRHNZ; + IT BRHZ; + IT BIZ; + IT BINZ; + IT BIHZ; + IT BIHNZ; + IT FA; + IT DFA; + IT FS; + IT DFS; + IT FM; + IT DFM; + IT DFMA; + IT DFNMS; + IT DFMS; + IT DFNMA; + IT FREST; + IT FRSQEST; + IT FI; + IT CSFLT; + IT CFLTS; + IT CUFLT; + IT CFLTU; + IT FRDS; + IT FESD; + IT FCEQ; + IT FCMEQ; + IT FCGT; + IT FCMGT; + IT FSCRWR; + IT FSCRRD; + IT DFCEQ; + IT DFCMEQ; + IT DFCGT; + IT DFCMGT; + IT DFTSV; + IT FMA; + IT FNMS; + IT FMS; +}; + +spu_interpreter_rt_base::spu_interpreter_rt_base() noexcept +{ + // Obtain required set of flags from settings + bs_t selected{}; + if (g_cfg.core.use_accurate_dfma) + selected += use_dfma; + + ptrs = std::make_unique(); + + // Initialize instructions with their own sets of supported flags +#define INIT(name, ...) \ + ptrs->name = spu_exec_select<>::select<__VA_ARGS__>(selected, [](){ return &::name; }); \ + + using enum spu_exec_bit; + + INIT(UNK); + INIT(HEQ); + INIT(HEQI); + INIT(HGT); + INIT(HGTI); + INIT(HLGT); + INIT(HLGTI); + INIT(HBR); + INIT(HBRA); + INIT(HBRR); + INIT(STOP); + INIT(STOPD); + INIT(LNOP); + INIT(NOP); + INIT(SYNC); + INIT(DSYNC); + INIT(MFSPR); + INIT(MTSPR); + INIT(RDCH); + INIT(RCHCNT); + INIT(WRCH); + INIT(LQD); + INIT(LQX); + INIT(LQA); + INIT(LQR); + INIT(STQD); + INIT(STQX); + INIT(STQA); + INIT(STQR); + INIT(CBD); + INIT(CBX); + INIT(CHD); + INIT(CHX); + INIT(CWD); + INIT(CWX); + INIT(CDD); + INIT(CDX); + INIT(ILH); + INIT(ILHU); + INIT(IL); + INIT(ILA); + INIT(IOHL); + INIT(FSMBI); + INIT(AH); + INIT(AHI); + INIT(A); + INIT(AI); + INIT(SFH); + INIT(SFHI); + INIT(SF); + INIT(SFI); + INIT(ADDX); + INIT(CG); + INIT(CGX); + INIT(SFX); + INIT(BG); + INIT(BGX); + INIT(MPY); + INIT(MPYU); + INIT(MPYI); + INIT(MPYUI); + INIT(MPYH); + INIT(MPYS); + INIT(MPYHH); + INIT(MPYHHA); + INIT(MPYHHU); + INIT(MPYHHAU); + INIT(CLZ); + INIT(CNTB); + INIT(FSMB); + INIT(FSMH); + INIT(FSM); + INIT(GBB); + INIT(GBH); + INIT(GB); + INIT(AVGB); + INIT(ABSDB); + INIT(SUMB); + INIT(XSBH); + INIT(XSHW); + INIT(XSWD); + INIT(AND); + INIT(ANDC); + INIT(ANDBI); + INIT(ANDHI); + INIT(ANDI); + INIT(OR); + INIT(ORC); + INIT(ORBI); + INIT(ORHI); + INIT(ORI); + INIT(ORX); + INIT(XOR); + INIT(XORBI); + INIT(XORHI); + INIT(XORI); + INIT(NAND); + INIT(NOR); + INIT(EQV); + INIT(MPYA); + INIT(SELB); + INIT(SHUFB); + INIT(SHLH); + INIT(SHLHI); + INIT(SHL); + INIT(SHLI); + INIT(SHLQBI); + INIT(SHLQBII); + INIT(SHLQBY); + INIT(SHLQBYI); + INIT(SHLQBYBI); + INIT(ROTH); + INIT(ROTHI); + INIT(ROT); + INIT(ROTI); + INIT(ROTQBY); + INIT(ROTQBYI); + INIT(ROTQBYBI); + INIT(ROTQBI); + INIT(ROTQBII); + INIT(ROTHM); + 
INIT(ROTHMI); + INIT(ROTM); + INIT(ROTMI); + INIT(ROTQMBY); + INIT(ROTQMBYI); + INIT(ROTQMBYBI); + INIT(ROTQMBI); + INIT(ROTQMBII); + INIT(ROTMAH); + INIT(ROTMAHI); + INIT(ROTMA); + INIT(ROTMAI); + INIT(CEQB); + INIT(CEQBI); + INIT(CEQH); + INIT(CEQHI); + INIT(CEQ); + INIT(CEQI); + INIT(CGTB); + INIT(CGTBI); + INIT(CGTH); + INIT(CGTHI); + INIT(CGT); + INIT(CGTI); + INIT(CLGTB); + INIT(CLGTBI); + INIT(CLGTH); + INIT(CLGTHI); + INIT(CLGT); + INIT(CLGTI); + INIT(BR); + INIT(BRA); + INIT(BRSL); + INIT(BRASL); + INIT(BI); + INIT(IRET); + INIT(BISLED); + INIT(BISL); + INIT(BRNZ); + INIT(BRZ); + INIT(BRHNZ); + INIT(BRHZ); + INIT(BIZ); + INIT(BINZ); + INIT(BIHZ); + INIT(BIHNZ); + INIT(FA); + INIT(DFA); + INIT(FS); + INIT(DFS); + INIT(FM); + INIT(DFM); + INIT(DFMA); + INIT(DFNMS); + INIT(DFMS); + INIT(DFNMA); + INIT(FREST); + INIT(FRSQEST); + INIT(FI); + INIT(CSFLT); + INIT(CFLTS); + INIT(CUFLT); + INIT(CFLTU); + INIT(FRDS); + INIT(FESD); + INIT(FCEQ); + INIT(FCMEQ); + INIT(FCGT); + INIT(FCMGT); + INIT(FSCRWR); + INIT(FSCRRD); + INIT(DFCEQ); + INIT(DFCMEQ); + INIT(DFCGT); + INIT(DFCMGT); + INIT(DFTSV); + INIT(FMA); + INIT(FNMS); + INIT(FMS); +} + +spu_interpreter_rt_base::~spu_interpreter_rt_base() +{ +} + +spu_interpreter_rt::spu_interpreter_rt() noexcept + : spu_interpreter_rt_base() + , table(*ptrs) +{ +} diff --git a/rpcs3/Emu/Cell/SPUInterpreter.h b/rpcs3/Emu/Cell/SPUInterpreter.h index 9e30793c3b..c6073ba575 100644 --- a/rpcs3/Emu/Cell/SPUInterpreter.h +++ b/rpcs3/Emu/Cell/SPUInterpreter.h @@ -4,246 +4,39 @@ class spu_thread; -using spu_inter_func_t = bool(*)(spu_thread& spu, spu_opcode_t op); +using spu_intrp_func_t = bool(*)(spu_thread& spu, spu_opcode_t op); + +template +struct spu_interpreter_t; struct spu_interpreter { - static bool UNK(spu_thread&, spu_opcode_t); static void set_interrupt_status(spu_thread&, spu_opcode_t); - - static bool STOP(spu_thread&, spu_opcode_t); - static bool LNOP(spu_thread&, spu_opcode_t); - static bool SYNC(spu_thread&, spu_opcode_t); - static bool DSYNC(spu_thread&, spu_opcode_t); - static bool MFSPR(spu_thread&, spu_opcode_t); - static bool RDCH(spu_thread&, spu_opcode_t); - static bool RCHCNT(spu_thread&, spu_opcode_t); - static bool SF(spu_thread&, spu_opcode_t); - static bool OR(spu_thread&, spu_opcode_t); - static bool BG(spu_thread&, spu_opcode_t); - static bool SFH(spu_thread&, spu_opcode_t); - static bool NOR(spu_thread&, spu_opcode_t); - static bool ABSDB(spu_thread&, spu_opcode_t); - static bool ROT(spu_thread&, spu_opcode_t); - static bool ROTM(spu_thread&, spu_opcode_t); - static bool ROTMA(spu_thread&, spu_opcode_t); - static bool SHL(spu_thread&, spu_opcode_t); - static bool ROTH(spu_thread&, spu_opcode_t); - static bool ROTHM(spu_thread&, spu_opcode_t); - static bool ROTMAH(spu_thread&, spu_opcode_t); - static bool SHLH(spu_thread&, spu_opcode_t); - static bool ROTI(spu_thread&, spu_opcode_t); - static bool ROTMI(spu_thread&, spu_opcode_t); - static bool ROTMAI(spu_thread&, spu_opcode_t); - static bool SHLI(spu_thread&, spu_opcode_t); - static bool ROTHI(spu_thread&, spu_opcode_t); - static bool ROTHMI(spu_thread&, spu_opcode_t); - static bool ROTMAHI(spu_thread&, spu_opcode_t); - static bool SHLHI(spu_thread&, spu_opcode_t); - static bool A(spu_thread&, spu_opcode_t); - static bool AND(spu_thread&, spu_opcode_t); - static bool CG(spu_thread&, spu_opcode_t); - static bool AH(spu_thread&, spu_opcode_t); - static bool NAND(spu_thread&, spu_opcode_t); - static bool AVGB(spu_thread&, spu_opcode_t); - static bool MTSPR(spu_thread&, spu_opcode_t); 
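The spu_interpreter_t<IT> table filled by the INIT() calls above holds one function pointer per SPU instruction, and each INIT(name) line asks spu_exec_select to pick the specialization of the templated handler that matches the accuracy flags collected from the settings (only use_dfma at this point). A minimal sketch of that selection idea, using simplified, hypothetical names rather than the real spu_exec_select machinery:

// Simplified sketch (hypothetical names): one handler compiled per flag set,
// with the matching specialization picked once at startup.
#include <cstdint>

struct spu_thread;
struct spu_opcode_t { std::uint32_t opcode; };
using spu_intrp_func_t = bool(*)(spu_thread&, spu_opcode_t);

enum spu_exec_flag : std::uint32_t { use_dfma = 1u << 0 };

template <std::uint32_t Flags>
bool DFMA(spu_thread&, spu_opcode_t)
{
    if constexpr (Flags & use_dfma)
    {
        // accurate fused multiply-add path
    }
    else
    {
        // plain multiply-then-add path
    }
    return true; // true = advance pc by one instruction
}

// Roughly what each INIT(name) line resolves when the table is constructed.
inline spu_intrp_func_t select_dfma(std::uint32_t selected_flags)
{
    return (selected_flags & use_dfma) ? &DFMA<use_dfma> : &DFMA<0>;
}

The real INIT macro passes each instruction its own set of supported flags, so a handler can specialize on more than one accuracy knob at once.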
- static bool WRCH(spu_thread&, spu_opcode_t); - static bool BIZ(spu_thread&, spu_opcode_t); - static bool BINZ(spu_thread&, spu_opcode_t); - static bool BIHZ(spu_thread&, spu_opcode_t); - static bool BIHNZ(spu_thread&, spu_opcode_t); - static bool STOPD(spu_thread&, spu_opcode_t); - static bool STQX(spu_thread&, spu_opcode_t); - static bool BI(spu_thread&, spu_opcode_t); - static bool BISL(spu_thread&, spu_opcode_t); - static bool IRET(spu_thread&, spu_opcode_t); - static bool BISLED(spu_thread&, spu_opcode_t); - static bool HBR(spu_thread&, spu_opcode_t); - static bool GB(spu_thread&, spu_opcode_t); - static bool GBH(spu_thread&, spu_opcode_t); - static bool GBB(spu_thread&, spu_opcode_t); - static bool FSM(spu_thread&, spu_opcode_t); - static bool FSMH(spu_thread&, spu_opcode_t); - static bool FSMB(spu_thread&, spu_opcode_t); - static bool LQX(spu_thread&, spu_opcode_t); - static bool ROTQBYBI(spu_thread&, spu_opcode_t); - static bool ROTQMBYBI(spu_thread&, spu_opcode_t); - static bool SHLQBYBI(spu_thread&, spu_opcode_t); - static bool CBX(spu_thread&, spu_opcode_t); - static bool CHX(spu_thread&, spu_opcode_t); - static bool CWX(spu_thread&, spu_opcode_t); - static bool CDX(spu_thread&, spu_opcode_t); - static bool ROTQBI(spu_thread&, spu_opcode_t); - static bool ROTQMBI(spu_thread&, spu_opcode_t); - static bool SHLQBI(spu_thread&, spu_opcode_t); - static bool ROTQBY(spu_thread&, spu_opcode_t); - static bool ROTQMBY(spu_thread&, spu_opcode_t); - static bool SHLQBY(spu_thread&, spu_opcode_t); - static bool ORX(spu_thread&, spu_opcode_t); - static bool CBD(spu_thread&, spu_opcode_t); - static bool CHD(spu_thread&, spu_opcode_t); - static bool CWD(spu_thread&, spu_opcode_t); - static bool CDD(spu_thread&, spu_opcode_t); - static bool ROTQBII(spu_thread&, spu_opcode_t); - static bool ROTQMBII(spu_thread&, spu_opcode_t); - static bool SHLQBII(spu_thread&, spu_opcode_t); - static bool ROTQBYI(spu_thread&, spu_opcode_t); - static bool ROTQMBYI(spu_thread&, spu_opcode_t); - static bool SHLQBYI(spu_thread&, spu_opcode_t); - static bool NOP(spu_thread&, spu_opcode_t); - static bool CGT(spu_thread&, spu_opcode_t); - static bool XOR(spu_thread&, spu_opcode_t); - static bool CGTH(spu_thread&, spu_opcode_t); - static bool EQV(spu_thread&, spu_opcode_t); - static bool CGTB(spu_thread&, spu_opcode_t); - static bool SUMB(spu_thread&, spu_opcode_t); - static bool HGT(spu_thread&, spu_opcode_t); - static bool CLZ(spu_thread&, spu_opcode_t); - static bool XSWD(spu_thread&, spu_opcode_t); - static bool XSHW(spu_thread&, spu_opcode_t); - static bool CNTB(spu_thread&, spu_opcode_t); - static bool XSBH(spu_thread&, spu_opcode_t); - static bool CLGT(spu_thread&, spu_opcode_t); - static bool ANDC(spu_thread&, spu_opcode_t); - static bool CLGTH(spu_thread&, spu_opcode_t); - static bool ORC(spu_thread&, spu_opcode_t); - static bool CLGTB(spu_thread&, spu_opcode_t); - static bool HLGT(spu_thread&, spu_opcode_t); - static bool CEQ(spu_thread&, spu_opcode_t); - static bool MPYHHU(spu_thread&, spu_opcode_t); - static bool ADDX(spu_thread&, spu_opcode_t); - static bool SFX(spu_thread&, spu_opcode_t); - static bool CGX(spu_thread&, spu_opcode_t); - static bool BGX(spu_thread&, spu_opcode_t); - static bool MPYHHA(spu_thread&, spu_opcode_t); - static bool MPYHHAU(spu_thread&, spu_opcode_t); - static bool MPY(spu_thread&, spu_opcode_t); - static bool MPYH(spu_thread&, spu_opcode_t); - static bool MPYHH(spu_thread&, spu_opcode_t); - static bool MPYS(spu_thread&, spu_opcode_t); - static bool CEQH(spu_thread&, spu_opcode_t); 
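The per-class static entry points removed here are superseded by the runtime-built spu_interpreter_rt declared further down in this header: callers obtain the table once and dispatch through decode(), as the static interpreter loop in SPURecompiler.cpp does later in this patch. A rough, self-contained sketch of that dispatch pattern (templated only to keep it standalone; big-endian opcode handling omitted):

#include <cstdint>
#include <cstring>

// TTable is expected to behave like spu_interpreter_rt from this patch:
// decode(op) returns a bool(*)(TThread&, spu_opcode_t)-style handler.
template <typename TTable, typename TThread>
void interp_step(const TTable& table, TThread& spu, const std::uint8_t* ls)
{
    std::uint32_t op;
    std::memcpy(&op, ls + spu.pc, sizeof(op)); // SPU opcodes are big-endian; byteswap omitted

    if (table.decode(op)(spu, {op})) // handler returns true to advance pc
    {
        spu.pc += 4;
    }
}

Nothing changes at the call site beyond fetching the table from g_fxo instead of a global constant interpreter table.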
- static bool MPYU(spu_thread&, spu_opcode_t); - static bool CEQB(spu_thread&, spu_opcode_t); - static bool HEQ(spu_thread&, spu_opcode_t); - static bool BRZ(spu_thread&, spu_opcode_t); - static bool STQA(spu_thread&, spu_opcode_t); - static bool BRNZ(spu_thread&, spu_opcode_t); - static bool BRHZ(spu_thread&, spu_opcode_t); - static bool BRHNZ(spu_thread&, spu_opcode_t); - static bool STQR(spu_thread&, spu_opcode_t); - static bool BRA(spu_thread&, spu_opcode_t); - static bool LQA(spu_thread&, spu_opcode_t); - static bool BRASL(spu_thread&, spu_opcode_t); - static bool BR(spu_thread&, spu_opcode_t); - static bool FSMBI(spu_thread&, spu_opcode_t); - static bool BRSL(spu_thread&, spu_opcode_t); - static bool LQR(spu_thread&, spu_opcode_t); - static bool IL(spu_thread&, spu_opcode_t); - static bool ILHU(spu_thread&, spu_opcode_t); - static bool ILH(spu_thread&, spu_opcode_t); - static bool IOHL(spu_thread&, spu_opcode_t); - static bool ORI(spu_thread&, spu_opcode_t); - static bool ORHI(spu_thread&, spu_opcode_t); - static bool ORBI(spu_thread&, spu_opcode_t); - static bool SFI(spu_thread&, spu_opcode_t); - static bool SFHI(spu_thread&, spu_opcode_t); - static bool ANDI(spu_thread&, spu_opcode_t); - static bool ANDHI(spu_thread&, spu_opcode_t); - static bool ANDBI(spu_thread&, spu_opcode_t); - static bool AI(spu_thread&, spu_opcode_t); - static bool AHI(spu_thread&, spu_opcode_t); - static bool STQD(spu_thread&, spu_opcode_t); - static bool LQD(spu_thread&, spu_opcode_t); - static bool XORI(spu_thread&, spu_opcode_t); - static bool XORHI(spu_thread&, spu_opcode_t); - static bool XORBI(spu_thread&, spu_opcode_t); - static bool CGTI(spu_thread&, spu_opcode_t); - static bool CGTHI(spu_thread&, spu_opcode_t); - static bool CGTBI(spu_thread&, spu_opcode_t); - static bool HGTI(spu_thread&, spu_opcode_t); - static bool CLGTI(spu_thread&, spu_opcode_t); - static bool CLGTHI(spu_thread&, spu_opcode_t); - static bool CLGTBI(spu_thread&, spu_opcode_t); - static bool HLGTI(spu_thread&, spu_opcode_t); - static bool MPYI(spu_thread&, spu_opcode_t); - static bool MPYUI(spu_thread&, spu_opcode_t); - static bool CEQI(spu_thread&, spu_opcode_t); - static bool CEQHI(spu_thread&, spu_opcode_t); - static bool CEQBI(spu_thread&, spu_opcode_t); - static bool HEQI(spu_thread&, spu_opcode_t); - static bool HBRA(spu_thread&, spu_opcode_t); - static bool HBRR(spu_thread&, spu_opcode_t); - static bool ILA(spu_thread&, spu_opcode_t); - static bool SELB(spu_thread&, spu_opcode_t); - static bool SHUFB(spu_thread&, spu_opcode_t); - static bool MPYA(spu_thread&, spu_opcode_t); - static bool DFCGT(spu_thread&, spu_opcode_t); - static bool DFCMGT(spu_thread&, spu_opcode_t); - static bool DFTSV(spu_thread&, spu_opcode_t); - static bool DFCEQ(spu_thread&, spu_opcode_t); - static bool DFCMEQ(spu_thread&, spu_opcode_t); }; -struct spu_interpreter_fast final : spu_interpreter +struct spu_interpreter_rt_base { - static bool FREST(spu_thread&, spu_opcode_t); - static bool FRSQEST(spu_thread&, spu_opcode_t); - static bool FCGT(spu_thread&, spu_opcode_t); - static bool FA(spu_thread&, spu_opcode_t); - static bool FS(spu_thread&, spu_opcode_t); - static bool FM(spu_thread&, spu_opcode_t); - static bool FCMGT(spu_thread&, spu_opcode_t); - static bool DFA(spu_thread&, spu_opcode_t); - static bool DFS(spu_thread&, spu_opcode_t); - static bool DFM(spu_thread&, spu_opcode_t); - static bool DFMA(spu_thread&, spu_opcode_t); - static bool DFMS(spu_thread&, spu_opcode_t); - static bool DFNMS(spu_thread&, spu_opcode_t); - static bool 
DFNMA(spu_thread&, spu_opcode_t); - static bool FSCRRD(spu_thread&, spu_opcode_t); - static bool FESD(spu_thread&, spu_opcode_t); - static bool FRDS(spu_thread&, spu_opcode_t); - static bool FSCRWR(spu_thread&, spu_opcode_t); - static bool FCEQ(spu_thread&, spu_opcode_t); - static bool FCMEQ(spu_thread&, spu_opcode_t); - static bool FI(spu_thread&, spu_opcode_t); - static bool CFLTS(spu_thread&, spu_opcode_t); - static bool CFLTU(spu_thread&, spu_opcode_t); - static bool CSFLT(spu_thread&, spu_opcode_t); - static bool CUFLT(spu_thread&, spu_opcode_t); - static bool FNMS(spu_thread&, spu_opcode_t); - static bool FMA(spu_thread&, spu_opcode_t); - static bool FMS(spu_thread&, spu_opcode_t); +protected: + std::unique_ptr> ptrs; + + spu_interpreter_rt_base() noexcept; + + spu_interpreter_rt_base(const spu_interpreter_rt_base&) = delete; + + spu_interpreter_rt_base& operator=(const spu_interpreter_rt_base&) = delete; + + virtual ~spu_interpreter_rt_base(); }; -struct spu_interpreter_precise final : spu_interpreter +struct spu_interpreter_rt : spu_interpreter_rt_base { - static bool FREST(spu_thread&, spu_opcode_t); - static bool FRSQEST(spu_thread&, spu_opcode_t); - static bool FCGT(spu_thread&, spu_opcode_t); - static bool FA(spu_thread&, spu_opcode_t); - static bool FS(spu_thread&, spu_opcode_t); - static bool FM(spu_thread&, spu_opcode_t); - static bool FCMGT(spu_thread&, spu_opcode_t); - static bool DFA(spu_thread&, spu_opcode_t); - static bool DFS(spu_thread&, spu_opcode_t); - static bool DFM(spu_thread&, spu_opcode_t); - static bool DFMA(spu_thread&, spu_opcode_t); - static bool DFMS(spu_thread&, spu_opcode_t); - static bool DFNMS(spu_thread&, spu_opcode_t); - static bool DFNMA(spu_thread&, spu_opcode_t); - static bool FSCRRD(spu_thread&, spu_opcode_t); - static bool FESD(spu_thread&, spu_opcode_t); - static bool FRDS(spu_thread&, spu_opcode_t); - static bool FSCRWR(spu_thread&, spu_opcode_t); - static bool FCEQ(spu_thread&, spu_opcode_t); - static bool FCMEQ(spu_thread&, spu_opcode_t); - static bool FI(spu_thread&, spu_opcode_t); - static bool CFLTS(spu_thread&, spu_opcode_t); - static bool CFLTU(spu_thread&, spu_opcode_t); - static bool CSFLT(spu_thread&, spu_opcode_t); - static bool CUFLT(spu_thread&, spu_opcode_t); - static bool FNMS(spu_thread&, spu_opcode_t); - static bool FMA(spu_thread&, spu_opcode_t); - static bool FMS(spu_thread&, spu_opcode_t); + spu_interpreter_rt() noexcept; + + spu_intrp_func_t decode(u32 op) const noexcept + { + return table.decode(op); + } + +private: + spu_decoder, spu_intrp_func_t> table; }; diff --git a/rpcs3/Emu/Cell/SPUOpcodes.h b/rpcs3/Emu/Cell/SPUOpcodes.h index efdc68a7f0..60e3d0d1b5 100644 --- a/rpcs3/Emu/Cell/SPUOpcodes.h +++ b/rpcs3/Emu/Cell/SPUOpcodes.h @@ -71,215 +71,227 @@ class spu_decoder } }; -public: - spu_decoder() noexcept + // Helper + static const D& _first(const D& arg) { + return arg; + } + +public: + template + spu_decoder(const Args&... 
args) noexcept + { + // If an object is passed to the constructor, assign values from that object +#define GET(name) [&]{ if constexpr (sizeof...(Args) > 0) return _first(args...).name; else return &D::name; }() + + static_assert(sizeof...(Args) <= 1); + const std::initializer_list instructions { - { 0, 0x0, &D::STOP }, - { 0, 0x1, &D::LNOP }, - { 0, 0x2, &D::SYNC }, - { 0, 0x3, &D::DSYNC }, - { 0, 0xc, &D::MFSPR }, - { 0, 0xd, &D::RDCH }, - { 0, 0xf, &D::RCHCNT }, - { 0, 0x40, &D::SF }, - { 0, 0x41, &D::OR }, - { 0, 0x42, &D::BG }, - { 0, 0x48, &D::SFH }, - { 0, 0x49, &D::NOR }, - { 0, 0x53, &D::ABSDB }, - { 0, 0x58, &D::ROT }, - { 0, 0x59, &D::ROTM }, - { 0, 0x5a, &D::ROTMA }, - { 0, 0x5b, &D::SHL }, - { 0, 0x5c, &D::ROTH }, - { 0, 0x5d, &D::ROTHM }, - { 0, 0x5e, &D::ROTMAH }, - { 0, 0x5f, &D::SHLH }, - { 0, 0x78, &D::ROTI }, - { 0, 0x79, &D::ROTMI }, - { 0, 0x7a, &D::ROTMAI }, - { 0, 0x7b, &D::SHLI }, - { 0, 0x7c, &D::ROTHI }, - { 0, 0x7d, &D::ROTHMI }, - { 0, 0x7e, &D::ROTMAHI }, - { 0, 0x7f, &D::SHLHI }, - { 0, 0xc0, &D::A }, - { 0, 0xc1, &D::AND }, - { 0, 0xc2, &D::CG }, - { 0, 0xc8, &D::AH }, - { 0, 0xc9, &D::NAND }, - { 0, 0xd3, &D::AVGB }, - { 0, 0x10c, &D::MTSPR }, - { 0, 0x10d, &D::WRCH }, - { 0, 0x128, &D::BIZ }, - { 0, 0x129, &D::BINZ }, - { 0, 0x12a, &D::BIHZ }, - { 0, 0x12b, &D::BIHNZ }, - { 0, 0x140, &D::STOPD }, - { 0, 0x144, &D::STQX }, - { 0, 0x1a8, &D::BI }, - { 0, 0x1a9, &D::BISL }, - { 0, 0x1aa, &D::IRET }, - { 0, 0x1ab, &D::BISLED }, - { 0, 0x1ac, &D::HBR }, - { 0, 0x1b0, &D::GB }, - { 0, 0x1b1, &D::GBH }, - { 0, 0x1b2, &D::GBB }, - { 0, 0x1b4, &D::FSM }, - { 0, 0x1b5, &D::FSMH }, - { 0, 0x1b6, &D::FSMB }, - { 0, 0x1b8, &D::FREST }, - { 0, 0x1b9, &D::FRSQEST }, - { 0, 0x1c4, &D::LQX }, - { 0, 0x1cc, &D::ROTQBYBI }, - { 0, 0x1cd, &D::ROTQMBYBI }, - { 0, 0x1cf, &D::SHLQBYBI }, - { 0, 0x1d4, &D::CBX }, - { 0, 0x1d5, &D::CHX }, - { 0, 0x1d6, &D::CWX }, - { 0, 0x1d7, &D::CDX }, - { 0, 0x1d8, &D::ROTQBI }, - { 0, 0x1d9, &D::ROTQMBI }, - { 0, 0x1db, &D::SHLQBI }, - { 0, 0x1dc, &D::ROTQBY }, - { 0, 0x1dd, &D::ROTQMBY }, - { 0, 0x1df, &D::SHLQBY }, - { 0, 0x1f0, &D::ORX }, - { 0, 0x1f4, &D::CBD }, - { 0, 0x1f5, &D::CHD }, - { 0, 0x1f6, &D::CWD }, - { 0, 0x1f7, &D::CDD }, - { 0, 0x1f8, &D::ROTQBII }, - { 0, 0x1f9, &D::ROTQMBII }, - { 0, 0x1fb, &D::SHLQBII }, - { 0, 0x1fc, &D::ROTQBYI }, - { 0, 0x1fd, &D::ROTQMBYI }, - { 0, 0x1ff, &D::SHLQBYI }, - { 0, 0x201, &D::NOP }, - { 0, 0x240, &D::CGT }, - { 0, 0x241, &D::XOR }, - { 0, 0x248, &D::CGTH }, - { 0, 0x249, &D::EQV }, - { 0, 0x250, &D::CGTB }, - { 0, 0x253, &D::SUMB }, - { 0, 0x258, &D::HGT }, - { 0, 0x2a5, &D::CLZ }, - { 0, 0x2a6, &D::XSWD }, - { 0, 0x2ae, &D::XSHW }, - { 0, 0x2b4, &D::CNTB }, - { 0, 0x2b6, &D::XSBH }, - { 0, 0x2c0, &D::CLGT }, - { 0, 0x2c1, &D::ANDC }, - { 0, 0x2c2, &D::FCGT }, - { 0, 0x2c3, &D::DFCGT }, - { 0, 0x2c4, &D::FA }, - { 0, 0x2c5, &D::FS }, - { 0, 0x2c6, &D::FM }, - { 0, 0x2c8, &D::CLGTH }, - { 0, 0x2c9, &D::ORC }, - { 0, 0x2ca, &D::FCMGT }, - { 0, 0x2cb, &D::DFCMGT }, - { 0, 0x2cc, &D::DFA }, - { 0, 0x2cd, &D::DFS }, - { 0, 0x2ce, &D::DFM }, - { 0, 0x2d0, &D::CLGTB }, - { 0, 0x2d8, &D::HLGT }, - { 0, 0x35c, &D::DFMA }, - { 0, 0x35d, &D::DFMS }, - { 0, 0x35e, &D::DFNMS }, - { 0, 0x35f, &D::DFNMA }, - { 0, 0x3c0, &D::CEQ }, - { 0, 0x3ce, &D::MPYHHU }, - { 0, 0x340, &D::ADDX }, - { 0, 0x341, &D::SFX }, - { 0, 0x342, &D::CGX }, - { 0, 0x343, &D::BGX }, - { 0, 0x346, &D::MPYHHA }, - { 0, 0x34e, &D::MPYHHAU }, - { 0, 0x398, &D::FSCRRD }, - { 0, 0x3b8, &D::FESD }, - { 0, 0x3b9, &D::FRDS }, - { 0, 0x3ba, 
&D::FSCRWR }, - { 0, 0x3bf, &D::DFTSV }, - { 0, 0x3c2, &D::FCEQ }, - { 0, 0x3c3, &D::DFCEQ }, - { 0, 0x3c4, &D::MPY }, - { 0, 0x3c5, &D::MPYH }, - { 0, 0x3c6, &D::MPYHH }, - { 0, 0x3c7, &D::MPYS }, - { 0, 0x3c8, &D::CEQH }, - { 0, 0x3ca, &D::FCMEQ }, - { 0, 0x3cb, &D::DFCMEQ }, - { 0, 0x3cc, &D::MPYU }, - { 0, 0x3d0, &D::CEQB }, - { 0, 0x3d4, &D::FI }, - { 0, 0x3d8, &D::HEQ }, - { 1, 0x1d8, &D::CFLTS }, - { 1, 0x1d9, &D::CFLTU }, - { 1, 0x1da, &D::CSFLT }, - { 1, 0x1db, &D::CUFLT }, - { 2, 0x40, &D::BRZ }, - { 2, 0x41, &D::STQA }, - { 2, 0x42, &D::BRNZ }, - { 2, 0x44, &D::BRHZ }, - { 2, 0x46, &D::BRHNZ }, - { 2, 0x47, &D::STQR }, - { 2, 0x60, &D::BRA }, - { 2, 0x61, &D::LQA }, - { 2, 0x62, &D::BRASL }, - { 2, 0x64, &D::BR }, - { 2, 0x65, &D::FSMBI }, - { 2, 0x66, &D::BRSL }, - { 2, 0x67, &D::LQR }, - { 2, 0x81, &D::IL }, - { 2, 0x82, &D::ILHU }, - { 2, 0x83, &D::ILH }, - { 2, 0xc1, &D::IOHL }, - { 3, 0x4, &D::ORI }, - { 3, 0x5, &D::ORHI }, - { 3, 0x6, &D::ORBI }, - { 3, 0xc, &D::SFI }, - { 3, 0xd, &D::SFHI }, - { 3, 0x14, &D::ANDI }, - { 3, 0x15, &D::ANDHI }, - { 3, 0x16, &D::ANDBI }, - { 3, 0x1c, &D::AI }, - { 3, 0x1d, &D::AHI }, - { 3, 0x24, &D::STQD }, - { 3, 0x34, &D::LQD }, - { 3, 0x44, &D::XORI }, - { 3, 0x45, &D::XORHI }, - { 3, 0x46, &D::XORBI }, - { 3, 0x4c, &D::CGTI }, - { 3, 0x4d, &D::CGTHI }, - { 3, 0x4e, &D::CGTBI }, - { 3, 0x4f, &D::HGTI }, - { 3, 0x5c, &D::CLGTI }, - { 3, 0x5d, &D::CLGTHI }, - { 3, 0x5e, &D::CLGTBI }, - { 3, 0x5f, &D::HLGTI }, - { 3, 0x74, &D::MPYI }, - { 3, 0x75, &D::MPYUI }, - { 3, 0x7c, &D::CEQI }, - { 3, 0x7d, &D::CEQHI }, - { 3, 0x7e, &D::CEQBI }, - { 3, 0x7f, &D::HEQI }, - { 4, 0x8, &D::HBRA }, - { 4, 0x9, &D::HBRR }, - { 4, 0x21, &D::ILA }, - { 7, 0x8, &D::SELB }, - { 7, 0xb, &D::SHUFB }, - { 7, 0xc, &D::MPYA }, - { 7, 0xd, &D::FNMS }, - { 7, 0xe, &D::FMA }, - { 7, 0xf, &D::FMS }, + { 0, 0x0, GET(STOP) }, + { 0, 0x1, GET(LNOP) }, + { 0, 0x2, GET(SYNC) }, + { 0, 0x3, GET(DSYNC) }, + { 0, 0xc, GET(MFSPR) }, + { 0, 0xd, GET(RDCH) }, + { 0, 0xf, GET(RCHCNT) }, + { 0, 0x40, GET(SF) }, + { 0, 0x41, GET(OR) }, + { 0, 0x42, GET(BG) }, + { 0, 0x48, GET(SFH) }, + { 0, 0x49, GET(NOR) }, + { 0, 0x53, GET(ABSDB) }, + { 0, 0x58, GET(ROT) }, + { 0, 0x59, GET(ROTM) }, + { 0, 0x5a, GET(ROTMA) }, + { 0, 0x5b, GET(SHL) }, + { 0, 0x5c, GET(ROTH) }, + { 0, 0x5d, GET(ROTHM) }, + { 0, 0x5e, GET(ROTMAH) }, + { 0, 0x5f, GET(SHLH) }, + { 0, 0x78, GET(ROTI) }, + { 0, 0x79, GET(ROTMI) }, + { 0, 0x7a, GET(ROTMAI) }, + { 0, 0x7b, GET(SHLI) }, + { 0, 0x7c, GET(ROTHI) }, + { 0, 0x7d, GET(ROTHMI) }, + { 0, 0x7e, GET(ROTMAHI) }, + { 0, 0x7f, GET(SHLHI) }, + { 0, 0xc0, GET(A) }, + { 0, 0xc1, GET(AND) }, + { 0, 0xc2, GET(CG) }, + { 0, 0xc8, GET(AH) }, + { 0, 0xc9, GET(NAND) }, + { 0, 0xd3, GET(AVGB) }, + { 0, 0x10c, GET(MTSPR) }, + { 0, 0x10d, GET(WRCH) }, + { 0, 0x128, GET(BIZ) }, + { 0, 0x129, GET(BINZ) }, + { 0, 0x12a, GET(BIHZ) }, + { 0, 0x12b, GET(BIHNZ) }, + { 0, 0x140, GET(STOPD) }, + { 0, 0x144, GET(STQX) }, + { 0, 0x1a8, GET(BI) }, + { 0, 0x1a9, GET(BISL) }, + { 0, 0x1aa, GET(IRET) }, + { 0, 0x1ab, GET(BISLED) }, + { 0, 0x1ac, GET(HBR) }, + { 0, 0x1b0, GET(GB) }, + { 0, 0x1b1, GET(GBH) }, + { 0, 0x1b2, GET(GBB) }, + { 0, 0x1b4, GET(FSM) }, + { 0, 0x1b5, GET(FSMH) }, + { 0, 0x1b6, GET(FSMB) }, + { 0, 0x1b8, GET(FREST) }, + { 0, 0x1b9, GET(FRSQEST) }, + { 0, 0x1c4, GET(LQX) }, + { 0, 0x1cc, GET(ROTQBYBI) }, + { 0, 0x1cd, GET(ROTQMBYBI) }, + { 0, 0x1cf, GET(SHLQBYBI) }, + { 0, 0x1d4, GET(CBX) }, + { 0, 0x1d5, GET(CHX) }, + { 0, 0x1d6, GET(CWX) }, + { 0, 0x1d7, GET(CDX) }, + { 0, 
0x1d8, GET(ROTQBI) }, + { 0, 0x1d9, GET(ROTQMBI) }, + { 0, 0x1db, GET(SHLQBI) }, + { 0, 0x1dc, GET(ROTQBY) }, + { 0, 0x1dd, GET(ROTQMBY) }, + { 0, 0x1df, GET(SHLQBY) }, + { 0, 0x1f0, GET(ORX) }, + { 0, 0x1f4, GET(CBD) }, + { 0, 0x1f5, GET(CHD) }, + { 0, 0x1f6, GET(CWD) }, + { 0, 0x1f7, GET(CDD) }, + { 0, 0x1f8, GET(ROTQBII) }, + { 0, 0x1f9, GET(ROTQMBII) }, + { 0, 0x1fb, GET(SHLQBII) }, + { 0, 0x1fc, GET(ROTQBYI) }, + { 0, 0x1fd, GET(ROTQMBYI) }, + { 0, 0x1ff, GET(SHLQBYI) }, + { 0, 0x201, GET(NOP) }, + { 0, 0x240, GET(CGT) }, + { 0, 0x241, GET(XOR) }, + { 0, 0x248, GET(CGTH) }, + { 0, 0x249, GET(EQV) }, + { 0, 0x250, GET(CGTB) }, + { 0, 0x253, GET(SUMB) }, + { 0, 0x258, GET(HGT) }, + { 0, 0x2a5, GET(CLZ) }, + { 0, 0x2a6, GET(XSWD) }, + { 0, 0x2ae, GET(XSHW) }, + { 0, 0x2b4, GET(CNTB) }, + { 0, 0x2b6, GET(XSBH) }, + { 0, 0x2c0, GET(CLGT) }, + { 0, 0x2c1, GET(ANDC) }, + { 0, 0x2c2, GET(FCGT) }, + { 0, 0x2c3, GET(DFCGT) }, + { 0, 0x2c4, GET(FA) }, + { 0, 0x2c5, GET(FS) }, + { 0, 0x2c6, GET(FM) }, + { 0, 0x2c8, GET(CLGTH) }, + { 0, 0x2c9, GET(ORC) }, + { 0, 0x2ca, GET(FCMGT) }, + { 0, 0x2cb, GET(DFCMGT) }, + { 0, 0x2cc, GET(DFA) }, + { 0, 0x2cd, GET(DFS) }, + { 0, 0x2ce, GET(DFM) }, + { 0, 0x2d0, GET(CLGTB) }, + { 0, 0x2d8, GET(HLGT) }, + { 0, 0x35c, GET(DFMA) }, + { 0, 0x35d, GET(DFMS) }, + { 0, 0x35e, GET(DFNMS) }, + { 0, 0x35f, GET(DFNMA) }, + { 0, 0x3c0, GET(CEQ) }, + { 0, 0x3ce, GET(MPYHHU) }, + { 0, 0x340, GET(ADDX) }, + { 0, 0x341, GET(SFX) }, + { 0, 0x342, GET(CGX) }, + { 0, 0x343, GET(BGX) }, + { 0, 0x346, GET(MPYHHA) }, + { 0, 0x34e, GET(MPYHHAU) }, + { 0, 0x398, GET(FSCRRD) }, + { 0, 0x3b8, GET(FESD) }, + { 0, 0x3b9, GET(FRDS) }, + { 0, 0x3ba, GET(FSCRWR) }, + { 0, 0x3bf, GET(DFTSV) }, + { 0, 0x3c2, GET(FCEQ) }, + { 0, 0x3c3, GET(DFCEQ) }, + { 0, 0x3c4, GET(MPY) }, + { 0, 0x3c5, GET(MPYH) }, + { 0, 0x3c6, GET(MPYHH) }, + { 0, 0x3c7, GET(MPYS) }, + { 0, 0x3c8, GET(CEQH) }, + { 0, 0x3ca, GET(FCMEQ) }, + { 0, 0x3cb, GET(DFCMEQ) }, + { 0, 0x3cc, GET(MPYU) }, + { 0, 0x3d0, GET(CEQB) }, + { 0, 0x3d4, GET(FI) }, + { 0, 0x3d8, GET(HEQ) }, + { 1, 0x1d8, GET(CFLTS) }, + { 1, 0x1d9, GET(CFLTU) }, + { 1, 0x1da, GET(CSFLT) }, + { 1, 0x1db, GET(CUFLT) }, + { 2, 0x40, GET(BRZ) }, + { 2, 0x41, GET(STQA) }, + { 2, 0x42, GET(BRNZ) }, + { 2, 0x44, GET(BRHZ) }, + { 2, 0x46, GET(BRHNZ) }, + { 2, 0x47, GET(STQR) }, + { 2, 0x60, GET(BRA) }, + { 2, 0x61, GET(LQA) }, + { 2, 0x62, GET(BRASL) }, + { 2, 0x64, GET(BR) }, + { 2, 0x65, GET(FSMBI) }, + { 2, 0x66, GET(BRSL) }, + { 2, 0x67, GET(LQR) }, + { 2, 0x81, GET(IL) }, + { 2, 0x82, GET(ILHU) }, + { 2, 0x83, GET(ILH) }, + { 2, 0xc1, GET(IOHL) }, + { 3, 0x4, GET(ORI) }, + { 3, 0x5, GET(ORHI) }, + { 3, 0x6, GET(ORBI) }, + { 3, 0xc, GET(SFI) }, + { 3, 0xd, GET(SFHI) }, + { 3, 0x14, GET(ANDI) }, + { 3, 0x15, GET(ANDHI) }, + { 3, 0x16, GET(ANDBI) }, + { 3, 0x1c, GET(AI) }, + { 3, 0x1d, GET(AHI) }, + { 3, 0x24, GET(STQD) }, + { 3, 0x34, GET(LQD) }, + { 3, 0x44, GET(XORI) }, + { 3, 0x45, GET(XORHI) }, + { 3, 0x46, GET(XORBI) }, + { 3, 0x4c, GET(CGTI) }, + { 3, 0x4d, GET(CGTHI) }, + { 3, 0x4e, GET(CGTBI) }, + { 3, 0x4f, GET(HGTI) }, + { 3, 0x5c, GET(CLGTI) }, + { 3, 0x5d, GET(CLGTHI) }, + { 3, 0x5e, GET(CLGTBI) }, + { 3, 0x5f, GET(HLGTI) }, + { 3, 0x74, GET(MPYI) }, + { 3, 0x75, GET(MPYUI) }, + { 3, 0x7c, GET(CEQI) }, + { 3, 0x7d, GET(CEQHI) }, + { 3, 0x7e, GET(CEQBI) }, + { 3, 0x7f, GET(HEQI) }, + { 4, 0x8, GET(HBRA) }, + { 4, 0x9, GET(HBRR) }, + { 4, 0x21, GET(ILA) }, + { 7, 0x8, GET(SELB) }, + { 7, 0xb, GET(SHUFB) }, + { 7, 0xc, GET(MPYA) }, + { 7, 0xd, GET(FNMS) 
}, + { 7, 0xe, GET(FMA) }, + { 7, 0xf, GET(FMS) }, }; for (auto& x : m_table) { - x = &D::UNK; + x = GET(UNK); } for (auto& entry : instructions) @@ -301,3 +313,5 @@ public: return m_table[spu_decode(inst)]; } }; + +#undef GET diff --git a/rpcs3/Emu/Cell/SPURecompiler.cpp b/rpcs3/Emu/Cell/SPURecompiler.cpp index dddb48821f..a0d75b7cc1 100644 --- a/rpcs3/Emu/Cell/SPURecompiler.cpp +++ b/rpcs3/Emu/Cell/SPURecompiler.cpp @@ -24,15 +24,12 @@ #include #include "util/v128.hpp" -#include "util/v128sse.hpp" +#include "util/simd.hpp" #include "util/sysinfo.hpp" -const spu_decoder s_spu_itype; -const spu_decoder s_spu_iname; -const spu_decoder s_spu_iflag; - -extern const spu_decoder g_spu_interpreter_precise{}; -extern const spu_decoder g_spu_interpreter_fast; +const extern spu_decoder g_spu_itype; +const extern spu_decoder g_spu_iname; +const extern spu_decoder g_spu_iflag; // Move 4 args for calling native function from a GHC calling convention function static u8* move_args_ghc_to_native(u8* raw) @@ -160,11 +157,12 @@ DECLARE(spu_runtime::tr_all) = [] return reinterpret_cast(trptr); }(); -DECLARE(spu_runtime::g_gateway) = built_function("spu_gateway", [](asmjit::x86::Assembler& c, auto& args) +DECLARE(spu_runtime::g_gateway) = built_function("spu_gateway", [](native_asm& c, auto& args) { // Gateway for SPU dispatcher, converts from native to GHC calling convention, also saves RSP value for spu_escape using namespace asmjit; +#if defined(ARCH_X64) #ifdef _WIN32 c.push(x86::r15); c.push(x86::r14); @@ -247,24 +245,30 @@ DECLARE(spu_runtime::g_gateway) = built_function("spu_gateway", #endif c.ret(); +#else + c.ret(a64::x30); +#endif }); -DECLARE(spu_runtime::g_escape) = build_function_asm("spu_escape", [](asmjit::x86::Assembler& c, auto& args) +DECLARE(spu_runtime::g_escape) = build_function_asm("spu_escape", [](native_asm& c, auto& args) { using namespace asmjit; +#if defined(ARCH_X64) // Restore native stack pointer (longjmp emulation) c.mov(x86::rsp, x86::qword_ptr(args[0], ::offset32(&spu_thread::saved_native_sp))); // Return to the return location c.sub(x86::rsp, 8); c.ret(); +#endif }); -DECLARE(spu_runtime::g_tail_escape) = build_function_asm("spu_tail_escape", [](asmjit::x86::Assembler& c, auto& args) +DECLARE(spu_runtime::g_tail_escape) = build_function_asm("spu_tail_escape", [](native_asm& c, auto& args) { using namespace asmjit; +#if defined(ARCH_X64) // Restore native stack pointer (longjmp emulation) c.mov(x86::rsp, x86::qword_ptr(args[0], ::offset32(&spu_thread::saved_native_sp))); @@ -278,6 +282,7 @@ DECLARE(spu_runtime::g_tail_escape) = build_function_asm fnext{}; atomic_t fail_flag{0}; - if (g_cfg.core.spu_decoder == spu_decoder_type::fast || g_cfg.core.spu_decoder == spu_decoder_type::llvm) + if (g_cfg.core.spu_decoder == spu_decoder_type::dynamic || g_cfg.core.spu_decoder == spu_decoder_type::llvm) { if (auto compiler = spu_recompiler_base::make_llvm_recompiler(11)) { @@ -634,7 +639,7 @@ void spu_cache::initialize() for (u32 i = 0; i < f->data.size(); i++) { - fmt::append(dump, "%-10s", s_spu_iname.decode(std::bit_cast>(f->data[i]))); + fmt::append(dump, "%-10s", g_spu_iname.decode(std::bit_cast>(f->data[i]))); } n_max = std::max(n_max, ::size32(depth_n)); @@ -1289,15 +1294,13 @@ void spu_recompiler_base::branch(spu_thread& spu, void*, u8* rip) void spu_recompiler_base::old_interpreter(spu_thread& spu, void* ls, u8* /*rip*/) { - if (g_cfg.core.spu_decoder > spu_decoder_type::fast) + if (g_cfg.core.spu_decoder != spu_decoder_type::_static) { fmt::throw_exception("Invalid SPU 
decoder"); } // Select opcode table - const auto& table = *(g_cfg.core.spu_decoder == spu_decoder_type::precise - ? &g_spu_interpreter_precise.get_table() - : &g_spu_interpreter_fast.get_table()); + const auto& table = g_fxo->get(); // LS pointer const auto base = static_cast(ls); @@ -1311,7 +1314,7 @@ void spu_recompiler_base::old_interpreter(spu_thread& spu, void* ls, u8* /*rip*/ } const u32 op = *reinterpret_cast*>(base + spu.pc); - if (table[spu_decode(op)](spu, {op})) + if (table.decode(op)(spu, {op})) spu.pc += 4; } } @@ -1430,7 +1433,7 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point) m_targets.erase(pos); // Fill register access info - if (auto iflags = s_spu_iflag.decode(data)) + if (auto iflags = g_spu_iflag.decode(data)) { if (+iflags & +spu_iflag::use_ra) m_use_ra[pos / 4] = op.ra; @@ -1441,7 +1444,7 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point) } // Analyse instruction - switch (const auto type = s_spu_itype.decode(data)) + switch (const auto type = g_spu_itype.decode(data)) { case spu_itype::UNK: case spu_itype::DFCEQ: @@ -2297,7 +2300,7 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point) // Decode instruction const spu_opcode_t op{std::bit_cast>(result.data[(ia - lsa) / 4])}; - const auto type = s_spu_itype.decode(op.opcode); + const auto type = g_spu_itype.decode(op.opcode); u8 reg_save = 255; @@ -2790,7 +2793,7 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point) { // Decode instruction again op.opcode = std::bit_cast>(result.data[(ia - lsa) / 41]); - last_inst = s_spu_itype.decode(op.opcode); + last_inst = g_spu_itype.decode(op.opcode); // Propagate some constants switch (last_inst) @@ -5035,7 +5038,7 @@ public: // Execute interpreter instruction const u32 op = *reinterpret_cast*>(_spu->_ptr(0) + _spu->pc); - if (!g_spu_interpreter_fast.decode(op)(*_spu, {op})) + if (!g_fxo->get().decode(op)(*_spu, {op})) spu_log.fatal("Bad instruction"); // Swap state @@ -5151,10 +5154,10 @@ public: const u32 op = i << (32u - m_interp_magn); // Instruction type - const auto itype = s_spu_itype.decode(op); + const auto itype = g_spu_itype.decode(op); // Function name - std::string fname = fmt::format("spu_%s", s_spu_iname.decode(op)); + std::string fname = fmt::format("spu_%s", g_spu_iname.decode(op)); if (last_itype != itype) { @@ -5460,7 +5463,7 @@ public: return _spu->check_state(); } - template + template static void exec_fall(spu_thread* _spu, spu_opcode_t op) { if (F(*_spu, op)) @@ -5469,10 +5472,10 @@ public: } } - template + template void fall(spu_opcode_t op) { - std::string name = fmt::format("spu_%s", s_spu_iname.decode(op.opcode)); + std::string name = fmt::format("spu_%s", g_spu_iname.decode(op.opcode)); if (m_interp_magn) { @@ -6808,11 +6811,21 @@ public: set_vr(op.rt, fshl(a, zshuffle(a, 4, 0, 1, 2), b)); } +#if defined(ARCH_X64) static __m128i exec_rotqby(__m128i a, u8 b) { alignas(32) const __m128i buf[2]{a, a}; return _mm_loadu_si128(reinterpret_cast(reinterpret_cast(buf) + (16 - (b & 0xf)))); } +#else + static v128 exec_rotqby(v128 a, u8 b) + { + alignas(32) const v128 buf[2]{a, a}; + alignas(16) v128 res; + std::memcpy(&res, reinterpret_cast(buf) + (16 - (b & 0xf)), 16); + return res; + } +#endif void ROTQBY(spu_opcode_t op) { @@ -6822,7 +6835,7 @@ public: if (!m_use_ssse3) { value_t r; - r.value = call("spu_rotqby", &exec_rotqby, a.value, eval(extract(b, 12)).value); + r.value = call("spu_rotqby", &exec_rotqby, a.value, eval(extract(b, 12)).value); set_vr(op.rt, 
r); return; } @@ -7805,7 +7818,7 @@ public: { const auto [a, b, c] = get_vrs(op.ra, op.rb, op.rt); - if (g_cfg.core.llvm_accurate_dfma) + if (g_cfg.core.use_accurate_dfma) set_vr(op.rt, fmuladd(a, b, c, true)); else set_vr(op.rt, a * b + c); @@ -7815,7 +7828,7 @@ public: { const auto [a, b, c] = get_vrs(op.ra, op.rb, op.rt); - if (g_cfg.core.llvm_accurate_dfma) + if (g_cfg.core.use_accurate_dfma) set_vr(op.rt, fmuladd(a, b, -c, true)); else set_vr(op.rt, a * b - c); @@ -7825,7 +7838,7 @@ public: { const auto [a, b, c] = get_vrs(op.ra, op.rb, op.rt); - if (g_cfg.core.llvm_accurate_dfma) + if (g_cfg.core.use_accurate_dfma) set_vr(op.rt, fmuladd(-a, b, c, true)); else set_vr(op.rt, c - (a * b)); @@ -7835,7 +7848,7 @@ public: { const auto [a, b, c] = get_vrs(op.ra, op.rb, op.rt); - if (g_cfg.core.llvm_accurate_dfma) + if (g_cfg.core.use_accurate_dfma) set_vr(op.rt, -fmuladd(a, b, c, true)); else set_vr(op.rt, -(a * b + c)); @@ -9894,11 +9907,11 @@ std::unique_ptr spu_recompiler_base::make_llvm_recompiler(u return std::make_unique(magn); } -const spu_decoder g_spu_llvm_decoder; +const spu_decoder s_spu_llvm_decoder; decltype(&spu_llvm_recompiler::UNK) spu_llvm_recompiler::decode(u32 op) { - return g_spu_llvm_decoder.decode(op); + return s_spu_llvm_decoder.decode(op); } #else @@ -10025,6 +10038,11 @@ struct spu_llvm void operator()() { + if (g_cfg.core.spu_decoder != spu_decoder_type::llvm) + { + return; + } + // To compile (hash -> item) std::unordered_multimap> enqueued; @@ -10345,7 +10363,7 @@ struct spu_fast : public spu_recompiler_base // Fix endianness const spu_opcode_t op{std::bit_cast>(func.data[i])}; - switch (auto type = s_spu_itype.decode(op.opcode)) + switch (auto type = g_spu_itype.decode(op.opcode)) { case spu_itype::BRZ: case spu_itype::BRHZ: diff --git a/rpcs3/Emu/Cell/SPUThread.cpp b/rpcs3/Emu/Cell/SPUThread.cpp index 4f5e7037c0..deaec570b0 100644 --- a/rpcs3/Emu/Cell/SPUThread.cpp +++ b/rpcs3/Emu/Cell/SPUThread.cpp @@ -30,7 +30,7 @@ #include "util/vm.hpp" #include "util/asm.hpp" #include "util/v128.hpp" -#include "util/v128sse.hpp" +#include "util/simd.hpp" #include "util/sysinfo.hpp" using spu_rdata_t = decltype(spu_thread::rdata); @@ -87,14 +87,13 @@ void fmt_class_string::format(std::string& out, u64 arg) // Verify AVX availability for TSX transactions static const bool s_tsx_avx = utils::has_avx(); -// For special case -static const bool s_tsx_haswell = utils::has_rtm() && !utils::has_mpx(); - // Threshold for when rep mosvb is expected to outperform simd copies // The threshold will be 0xFFFFFFFF when the performance of rep movsb is expected to be bad static const u32 s_rep_movsb_threshold = utils::get_rep_movsb_threshold(); -#ifndef _MSC_VER +#if defined(_M_X64) +extern "C" void __movsb(uchar*, const uchar*, size_t); +#elif defined(ARCH_X64) static FORCE_INLINE void __movsb(unsigned char * Dst, const unsigned char * Src, size_t Size) { __asm__ __volatile__ @@ -104,8 +103,12 @@ static FORCE_INLINE void __movsb(unsigned char * Dst, const unsigned char * Src, "[Dst]" (Dst), "[Src]" (Src), "[Size]" (Size) ); } +#else +#define s_rep_movsb_threshold umax +#define __movsb std::memcpy #endif +#if defined(ARCH_X64) static FORCE_INLINE bool cmp_rdata_avx(const __m256i* lhs, const __m256i* rhs) { #if defined(_MSC_VER) || defined(__AVX__) @@ -145,18 +148,21 @@ static FORCE_INLINE bool cmp_rdata_avx(const __m256i* lhs, const __m256i* rhs) return result; #endif } +#endif #ifdef _MSC_VER __forceinline #endif extern bool cmp_rdata(const spu_rdata_t& _lhs, const spu_rdata_t& _rhs) { +#if 
defined(ARCH_X64) #ifndef __AVX__ if (s_tsx_avx) [[likely]] #endif { return cmp_rdata_avx(reinterpret_cast(_lhs), reinterpret_cast(_rhs)); } +#endif const auto lhs = reinterpret_cast(_lhs); const auto rhs = reinterpret_cast(_rhs); @@ -165,9 +171,10 @@ extern bool cmp_rdata(const spu_rdata_t& _lhs, const spu_rdata_t& _rhs) const v128 c = (lhs[4] ^ rhs[4]) | (lhs[5] ^ rhs[5]); const v128 d = (lhs[6] ^ rhs[6]) | (lhs[7] ^ rhs[7]); const v128 r = (a | b) | (c | d); - return r == v128{}; + return gv_testz(r); } +#if defined(ARCH_X64) static FORCE_INLINE void mov_rdata_avx(__m256i* dst, const __m256i* src) { #ifdef _MSC_VER @@ -199,12 +206,14 @@ static FORCE_INLINE void mov_rdata_avx(__m256i* dst, const __m256i* src) ); #endif } +#endif #ifdef _MSC_VER __forceinline #endif extern void mov_rdata(spu_rdata_t& _dst, const spu_rdata_t& _src) { +#if defined(ARCH_X64) #ifndef __AVX__ if (s_tsx_avx) [[likely]] #endif @@ -232,8 +241,12 @@ extern void mov_rdata(spu_rdata_t& _dst, const spu_rdata_t& _src) _mm_storeu_si128(reinterpret_cast<__m128i*>(_dst + 80), v1); _mm_storeu_si128(reinterpret_cast<__m128i*>(_dst + 96), v2); _mm_storeu_si128(reinterpret_cast<__m128i*>(_dst + 112), v3); +#else + std::memcpy(_dst, _src, 128); +#endif } +#if defined(ARCH_X64) static FORCE_INLINE void mov_rdata_nt_avx(__m256i* dst, const __m256i* src) { #ifdef _MSC_VER @@ -265,9 +278,11 @@ static FORCE_INLINE void mov_rdata_nt_avx(__m256i* dst, const __m256i* src) ); #endif } +#endif extern void mov_rdata_nt(spu_rdata_t& _dst, const spu_rdata_t& _src) { +#if defined(ARCH_X64) #ifndef __AVX__ if (s_tsx_avx) [[likely]] #endif @@ -295,6 +310,9 @@ extern void mov_rdata_nt(spu_rdata_t& _dst, const spu_rdata_t& _src) _mm_stream_si128(reinterpret_cast<__m128i*>(_dst + 80), v1); _mm_stream_si128(reinterpret_cast<__m128i*>(_dst + 96), v2); _mm_stream_si128(reinterpret_cast<__m128i*>(_dst + 112), v3); +#else + std::memcpy(_dst, _src, 128); +#endif } void do_cell_atomic_128_store(u32 addr, const void* to_write); @@ -421,10 +439,11 @@ std::array op_branch_targets(u32 pc, spu_opcode_t op) return res; } -const auto spu_putllc_tx = built_function("spu_putllc_tx", [](asmjit::x86::Assembler& c, auto& args) +const auto spu_putllc_tx = built_function("spu_putllc_tx", [](native_asm& c, auto& args) { using namespace asmjit; +#if defined(ARCH_X64) Label fall = c.newLabel(); Label fail = c.newLabel(); Label _ret = c.newLabel(); @@ -677,12 +696,16 @@ const auto spu_putllc_tx = built_function("spu_putlluc_tx", [](asmjit::x86::Assembler& c, auto& args) +const auto spu_putlluc_tx = built_function("spu_putlluc_tx", [](native_asm& c, auto& args) { using namespace asmjit; +#if defined(ARCH_X64) Label fall = c.newLabel(); Label _ret = c.newLabel(); @@ -803,12 +826,16 @@ const auto spu_putlluc_tx = built_function("spu_getllar_tx", [](asmjit::x86::Assembler& c, auto& args) +const auto spu_getllar_tx = built_function("spu_getllar_tx", [](native_asm& c, auto& args) { using namespace asmjit; +#if defined(ARCH_X64) Label fall = c.newLabel(); Label _ret = c.newLabel(); @@ -938,6 +965,9 @@ const auto spu_getllar_tx = built_function(std::exp2(i))); + m_data[i + 155] = v128::fromf32p(static_cast(std::exp2(i))); } } @@ -1385,6 +1415,8 @@ void spu_thread::cpu_task() std::fesetround(FE_TOWARDZERO); + gv_set_zeroing_denormals(); + g_tls_log_prefix = [] { const auto cpu = static_cast(get_current_cpu_thread()); @@ -1622,7 +1654,7 @@ spu_thread::spu_thread(lv2_spu_group* group, u32 index, std::string_view name, u jit = spu_recompiler_base::make_fast_llvm_recompiler(); } - 
if (g_cfg.core.spu_decoder != spu_decoder_type::fast && g_cfg.core.spu_decoder != spu_decoder_type::precise) + if (g_cfg.core.spu_decoder == spu_decoder_type::asmjit || g_cfg.core.spu_decoder == spu_decoder_type::llvm) { if (g_cfg.core.spu_block_size != spu_block_size_type::safe) { @@ -2640,7 +2672,7 @@ bool spu_thread::do_putllc(const spu_mfc_cmd& args) return false; }); - const u64 count2 = __rdtsc() - perf2.get(); + const u64 count2 = utils::get_tsc() - perf2.get(); if (count2 > 20000 && g_cfg.core.perf_report) [[unlikely]] { @@ -2672,7 +2704,7 @@ bool spu_thread::do_putllc(const spu_mfc_cmd& args) utils::prefetch_read(rdata + 64); last_faddr = addr; last_ftime = res.load() & -128; - last_ftsc = __rdtsc(); + last_ftsc = utils::get_tsc(); return false; } default: @@ -2854,7 +2886,7 @@ void do_cell_atomic_128_store(u32 addr, const void* to_write) }); vm::reservation_acquire(addr) += 32; - result = __rdtsc() - perf0.get(); + result = utils::get_tsc() - perf0.get(); } if (result > 20000 && g_cfg.core.perf_report) [[unlikely]] @@ -3007,7 +3039,7 @@ bool spu_thread::do_mfc(bool can_escape, bool must_finish) { // Get commands' execution mask // Mask bits are always set when mfc_transfers_shuffling is 0 - return static_cast((0 - (1u << std::min(g_cfg.core.mfc_transfers_shuffling, size))) | __rdtsc()); + return static_cast((0 - (1u << std::min(g_cfg.core.mfc_transfers_shuffling, size))) | utils::get_tsc()); }; // Process enqueued commands @@ -3684,9 +3716,9 @@ void spu_thread::set_interrupt_status(bool enable) // Detect enabling interrupts with events masked if (auto mask = ch_events.load().mask; mask & SPU_EVENT_INTR_BUSY_CHECK) { - if (g_cfg.core.spu_decoder != spu_decoder_type::precise && g_cfg.core.spu_decoder != spu_decoder_type::fast) + if (g_cfg.core.spu_decoder != spu_decoder_type::_static) { - fmt::throw_exception("SPU Interrupts not implemented (mask=0x%x): Use interpreterts", mask); + fmt::throw_exception("SPU Interrupts not implemented (mask=0x%x): Use static interpreter", mask); } spu_log.trace("SPU Interrupts (mask=0x%x) are using CPU busy checking mode", mask); diff --git a/rpcs3/Emu/Cell/SPUThread.h b/rpcs3/Emu/Cell/SPUThread.h index 0e1f47e2be..60fb86f6a6 100644 --- a/rpcs3/Emu/Cell/SPUThread.h +++ b/rpcs3/Emu/Cell/SPUThread.h @@ -503,9 +503,9 @@ struct spu_imm_table_t public: scale_table_t(); - FORCE_INLINE const auto& operator [](s32 scale) const + FORCE_INLINE const v128& operator [](s32 scale) const { - return m_data[scale + 155].vf; + return m_data[scale + 155]; } } const scale; diff --git a/rpcs3/Emu/Cell/lv2/lv2.cpp b/rpcs3/Emu/Cell/lv2/lv2.cpp index 4163fe7e97..3e65b95457 100644 --- a/rpcs3/Emu/Cell/lv2/lv2.cpp +++ b/rpcs3/Emu/Cell/lv2/lv2.cpp @@ -75,30 +75,28 @@ void fmt_class_string::format(std::string& out, u64 arg) }); } -static bool null_func_(ppu_thread& ppu) +static void null_func_(ppu_thread& ppu, ppu_opcode_t, be_t* this_op, ppu_intrp_func*) { ppu_log.todo("Unimplemented syscall %s -> CELL_OK (r3=0x%llx, r4=0x%llx, r5=0x%llx, r6=0x%llx, r7=0x%llx, r8=0x%llx, r9=0x%llx, r10=0x%llx)", ppu_syscall_code(ppu.gpr[11]), ppu.gpr[3], ppu.gpr[4], ppu.gpr[5], ppu.gpr[6], ppu.gpr[7], ppu.gpr[8], ppu.gpr[9], ppu.gpr[10]); ppu.gpr[3] = 0; - ppu.cia += 4; - return false; + ppu.cia = vm::get_addr(this_op) + 4; } -static bool uns_func_(ppu_thread& ppu) +static void uns_func_(ppu_thread& ppu, ppu_opcode_t, be_t* this_op, ppu_intrp_func*) { ppu_log.trace("Unused syscall %d -> ENOSYS", ppu.gpr[11]); ppu.gpr[3] = CELL_ENOSYS; - ppu.cia += 4; - return false; + ppu.cia = 
vm::get_addr(this_op) + 4; } // Bind Syscall #define BIND_SYSC(func) {BIND_FUNC(func), #func} #define NULL_FUNC(name) {null_func_, #name} -constexpr std::pair null_func{null_func_, ""}; -constexpr std::pair uns_func{uns_func_, ""}; +constexpr std::pair null_func{null_func_, ""}; +constexpr std::pair uns_func{uns_func_, ""}; // UNS = Unused // ROOT = Root @@ -106,7 +104,7 @@ constexpr std::pair uns_func{uns_func_, ""}; // DEX..DECR = Unavailable on retail consoles // PM = Product Mode // AuthID = Authentication ID -const std::array, 1024> g_ppu_syscall_table +const std::array, 1024> g_ppu_syscall_table { null_func, BIND_SYSC(sys_process_getpid), //1 (0x001) @@ -1151,7 +1149,7 @@ extern void ppu_execute_syscall(ppu_thread& ppu, u64 code) if (const auto func = g_ppu_syscall_table[code].first) { - func(ppu); + func(ppu, {}, vm::_ptr(ppu.cia), nullptr); ppu_log.trace("Syscall '%s' (%llu) finished, r3=0x%llx", ppu_syscall_code(code), code, ppu.gpr[3]); return; } @@ -1160,7 +1158,7 @@ extern void ppu_execute_syscall(ppu_thread& ppu, u64 code) fmt::throw_exception("Invalid syscall number (%llu)", code); } -extern ppu_function_t ppu_get_syscall(u64 code) +extern ppu_intrp_func_t ppu_get_syscall(u64 code) { if (code < g_ppu_syscall_table.size()) { diff --git a/rpcs3/Emu/Cell/lv2/sys_net.cpp b/rpcs3/Emu/Cell/lv2/sys_net.cpp index 48a8b525ef..72dda6b6e7 100644 --- a/rpcs3/Emu/Cell/lv2/sys_net.cpp +++ b/rpcs3/Emu/Cell/lv2/sys_net.cpp @@ -11,6 +11,10 @@ #include #include #else +#ifdef __clang__ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wold-style-cast" +#endif #include #include #include @@ -22,6 +26,9 @@ #include #include #include +#ifdef __clang__ +#pragma GCC diagnostic pop +#endif #endif #include "Emu/NP/np_handler.h" diff --git a/rpcs3/Emu/Cell/lv2/sys_usbd.cpp b/rpcs3/Emu/Cell/lv2/sys_usbd.cpp index 3bdeaee9d0..aa6154ae0d 100644 --- a/rpcs3/Emu/Cell/lv2/sys_usbd.cpp +++ b/rpcs3/Emu/Cell/lv2/sys_usbd.cpp @@ -317,7 +317,7 @@ void usb_handler_thread::operator()() { timeval lusb_tv{0, 200}; - while (thread_ctrl::state() != thread_state::aborting) + while (ctx && thread_ctrl::state() != thread_state::aborting) { // Todo: Hotplug here? diff --git a/rpcs3/Emu/GDB.cpp b/rpcs3/Emu/GDB.cpp index 26da12ee51..e1c38de285 100644 --- a/rpcs3/Emu/GDB.cpp +++ b/rpcs3/Emu/GDB.cpp @@ -15,6 +15,10 @@ #include #include // sockaddr_un #else +#ifdef __clang__ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wold-style-cast" +#endif #include #include #include @@ -25,6 +29,9 @@ #include #include #include // sockaddr_un +#ifdef __clang__ +#pragma GCC diagnostic pop +#endif #endif #include diff --git a/rpcs3/Emu/Memory/vm.cpp b/rpcs3/Emu/Memory/vm.cpp index 82197099a2..474ac84d3c 100644 --- a/rpcs3/Emu/Memory/vm.cpp +++ b/rpcs3/Emu/Memory/vm.cpp @@ -684,7 +684,7 @@ namespace vm // 1. To simplify range_lock logic // 2. 
To make sure it never overlaps with 32-bit addresses // Also check that it's aligned (lowest 16 bits) - ensure((shm_self & 0xffff'8000'0000'ffff) == range_locked); + ensure((shm_self & 0xffff'0000'0000'ffff) == range_locked); // Find another mirror and map it as shareable too for (auto& ploc : g_locations) @@ -714,7 +714,7 @@ namespace vm u64 shm_self = reinterpret_cast(shm->get()) ^ range_locked; // Check (see above) - ensure((shm_self & 0xffff'8000'0000'ffff) == range_locked); + ensure((shm_self & 0xffff'0000'0000'ffff) == range_locked); // Map range as shareable for (u32 i = addr / 65536; i < addr / 65536 + size / 65536; i++) @@ -1129,13 +1129,16 @@ namespace vm { auto fill64 = [](u8* ptr, u64 data, usz count) { -#ifdef _MSC_VER +#ifdef _M_X64 __stosq(reinterpret_cast(ptr), data, count); -#else +#elif defined(ARCH_X64) __asm__ ("mov %0, %%rdi; mov %1, %%rax; mov %2, %%rcx; rep stosq;" : : "r" (ptr), "r" (data), "r" (count) : "rdi", "rax", "rcx", "memory"); +#else + for (usz i = 0; i < count; i++) + reinterpret_cast(ptr)[i] = data; #endif }; diff --git a/rpcs3/Emu/Memory/vm.h b/rpcs3/Emu/Memory/vm.h index 8f064149bf..e819c7006d 100644 --- a/rpcs3/Emu/Memory/vm.h +++ b/rpcs3/Emu/Memory/vm.h @@ -200,16 +200,10 @@ namespace vm return {}; } + // Unsafe convert host ptr to PS3 VM address (clamp with 4GiB alignment assumption) inline vm::addr_t get_addr(const void* ptr) { - const auto [addr, ok] = try_get_addr(ptr); - - if (!ok) - { - fmt::throw_exception("Not a virtual memory pointer (%p)", ptr); - } - - return addr; + return vm::addr_t{static_cast(uptr(ptr))}; } template diff --git a/rpcs3/Emu/Memory/vm_reservation.h b/rpcs3/Emu/Memory/vm_reservation.h index aeedda7409..ec6af4f7ae 100644 --- a/rpcs3/Emu/Memory/vm_reservation.h +++ b/rpcs3/Emu/Memory/vm_reservation.h @@ -3,6 +3,7 @@ #include "vm.h" #include "vm_locking.h" #include "util/atomic.hpp" +#include "util/tsc.hpp" #include extern bool g_use_rtm; @@ -11,7 +12,6 @@ extern u64 g_rtm_tx_limit2; #ifdef _MSC_VER extern "C" { - u64 __rdtsc(); u32 _xbegin(); void _xend(); } @@ -19,15 +19,6 @@ extern "C" namespace vm { - inline u64 get_tsc() - { -#ifdef _MSC_VER - return __rdtsc(); -#else - return __builtin_ia32_rdtsc(); -#endif - } - enum : u64 { rsrv_lock_mask = 127, @@ -108,13 +99,14 @@ namespace vm auto& res = vm::reservation_acquire(addr); //_m_prefetchw(&res); +#if defined(ARCH_X64) if (g_use_rtm) { // Stage 1: single optimistic transaction attempt unsigned status = -1; u64 _old = 0; - auto stamp0 = get_tsc(), stamp1 = stamp0, stamp2 = stamp0; + auto stamp0 = utils::get_tsc(), stamp1 = stamp0, stamp2 = stamp0; #ifndef _MSC_VER __asm__ goto ("xbegin %l[stage2];" ::: "memory" : stage2); @@ -176,16 +168,16 @@ namespace vm #ifndef _MSC_VER __asm__ volatile ("mov %%eax, %0;" : "=r" (status) :: "memory"); #endif - stamp1 = get_tsc(); + stamp1 = utils::get_tsc(); // Stage 2: try to lock reservation first _old = res.fetch_add(1); // Compute stamps excluding memory touch - stamp2 = get_tsc() - (stamp1 - stamp0); + stamp2 = utils::get_tsc() - (stamp1 - stamp0); // Start lightened transaction - for (; !(_old & vm::rsrv_unique_lock) && stamp2 - stamp0 <= g_rtm_tx_limit2; stamp2 = get_tsc()) + for (; !(_old & vm::rsrv_unique_lock) && stamp2 - stamp0 <= g_rtm_tx_limit2; stamp2 = utils::get_tsc()) { if (cpu.has_pause_flag()) { @@ -285,6 +277,9 @@ namespace vm return result; } } +#else + static_cast(cpu); +#endif /* ARCH_X64 */ // Lock reservation and perform heavyweight lock reservation_shared_lock_internal(res); diff --git 
a/rpcs3/Emu/NP/np_dnshook.cpp b/rpcs3/Emu/NP/np_dnshook.cpp index 92e7ea3bfc..62284c53c5 100644 --- a/rpcs3/Emu/NP/np_dnshook.cpp +++ b/rpcs3/Emu/NP/np_dnshook.cpp @@ -8,8 +8,15 @@ #ifdef _WIN32 #include #else +#ifdef __clang__ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wold-style-cast" +#endif #include #include +#ifdef __clang__ +#pragma GCC diagnostic pop +#endif #endif LOG_CHANNEL(dnshook_log, "DnsHook"); diff --git a/rpcs3/Emu/NP/np_handler.cpp b/rpcs3/Emu/NP/np_handler.cpp index 621b7a6659..a2ac4de4ac 100644 --- a/rpcs3/Emu/NP/np_handler.cpp +++ b/rpcs3/Emu/NP/np_handler.cpp @@ -19,12 +19,19 @@ #include #include #else +#ifdef __clang__ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wold-style-cast" +#endif #include #include #include #include #include #include +#ifdef __clang__ +#pragma GCC diagnostic pop +#endif #endif #if defined(__FreeBSD__) || defined(__APPLE__) diff --git a/rpcs3/Emu/NP/rpcn_client.cpp b/rpcs3/Emu/NP/rpcn_client.cpp index 427aedd7ea..d785dcb1b3 100644 --- a/rpcs3/Emu/NP/rpcn_client.cpp +++ b/rpcs3/Emu/NP/rpcn_client.cpp @@ -21,6 +21,10 @@ #include #include #else +#ifdef __clang__ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wold-style-cast" +#endif #include #include #include @@ -32,6 +36,9 @@ #include #include #include +#ifdef __clang__ +#pragma GCC diagnostic pop +#endif #endif LOG_CHANNEL(rpcn_log, "rpcn"); diff --git a/rpcs3/Emu/NP/rpcn_client.h b/rpcs3/Emu/NP/rpcn_client.h index 992ee92f2b..b674c4199a 100644 --- a/rpcs3/Emu/NP/rpcn_client.h +++ b/rpcs3/Emu/NP/rpcn_client.h @@ -11,9 +11,16 @@ #ifdef _WIN32 #include #else +#ifdef __clang__ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wold-style-cast" +#endif #include #include #include +#ifdef __clang__ +#pragma GCC diagnostic pop +#endif #endif #include "Emu/Cell/Modules/sceNp.h" diff --git a/rpcs3/Emu/RSX/Common/BufferUtils.cpp b/rpcs3/Emu/RSX/Common/BufferUtils.cpp index 314f64972d..57489333a0 100644 --- a/rpcs3/Emu/RSX/Common/BufferUtils.cpp +++ b/rpcs3/Emu/RSX/Common/BufferUtils.cpp @@ -7,15 +7,25 @@ #include "util/sysinfo.hpp" #include "util/asm.hpp" +#if defined(ARCH_X64) #include "emmintrin.h" #include "immintrin.h" +#endif -#if !defined(_MSC_VER) && defined(__clang__) +#if !defined(_MSC_VER) #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wold-style-cast" #endif -#if defined(_MSC_VER) +#ifdef ARCH_ARM64 +#if !defined(_MSC_VER) +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#endif +#undef FORCE_INLINE +#include "Emu/CPU/sse2neon.h" +#endif + +#if defined(_MSC_VER) || !defined(__SSE2__) #define PLAIN_FUNC #define SSSE3_FUNC #define SSE4_1_FUNC @@ -57,7 +67,7 @@ constexpr bool s_use_ssse3 = true; constexpr bool s_use_sse4_1 = true; constexpr bool s_use_avx2 = true; constexpr bool s_use_avx3 = false; -#elif defined(__SSE41__) +#elif defined(__SSE4_1__) constexpr bool s_use_ssse3 = true; constexpr bool s_use_sse4_1 = true; constexpr bool s_use_avx2 = false; @@ -67,11 +77,16 @@ constexpr bool s_use_ssse3 = true; constexpr bool s_use_sse4_1 = false; constexpr bool s_use_avx2 = false; constexpr bool s_use_avx3 = false; -#else +#elif defined(ARCH_X64) const bool s_use_ssse3 = utils::has_ssse3(); const bool s_use_sse4_1 = utils::has_sse41(); const bool s_use_avx2 = utils::has_avx2(); const bool s_use_avx3 = utils::has_avx512(); +#else +constexpr bool s_use_ssse3 = true; // Non x86 +constexpr bool s_use_sse4_1 = true; // Non x86 +constexpr bool s_use_avx2 = false; +constexpr bool s_use_avx3 = false; #endif const __m128i 
s_bswap_u32_mask = _mm_set_epi8( @@ -98,7 +113,7 @@ namespace utils namespace { template - PLAIN_FUNC bool copy_data_swap_u32_naive(u32* dst, const u32* src, u32 count) + PLAIN_FUNC auto copy_data_swap_u32_naive(u32* dst, const u32* src, u32 count) { u32 result = 0; @@ -117,11 +132,14 @@ namespace dst[i] = data; } - return static_cast(result); + if constexpr (Compare) + { + return static_cast(result); + } } template - SSSE3_FUNC bool copy_data_swap_u32_ssse3(u32* dst, const u32* src, u32 count) + SSSE3_FUNC auto copy_data_swap_u32_ssse3(u32* dst, const u32* src, u32 count) { u32 result = 0; @@ -140,9 +158,13 @@ namespace dst[i] = data; } - return static_cast(result); + if constexpr (Compare) + { + return static_cast(result); + } } +#if defined(ARCH_X64) template void build_copy_data_swap_u32_avx3(asmjit::x86::Assembler& c, std::array& args, const RT& rmask, const RT& rload, const RT& rtest) { @@ -199,8 +221,7 @@ namespace c.jmp(loop); c.bind(tail); - c.shlx(x86::eax, x86::eax, args[2].r32()); - c.not_(x86::eax); + c.bzhi(x86::eax, x86::eax, args[2].r32()); c.kmovw(x86::k1, x86::eax); c.k(x86::k1).z().vmovdqu32(rload, x86::Mem(args[1], 0, Size * 4u)); c.vpshufb(rload, rload, rmask); @@ -230,7 +251,7 @@ namespace } template - void build_copy_data_swap_u32(asmjit::x86::Assembler& c, std::array& args) + void build_copy_data_swap_u32(native_asm& c, native_args& args) { using namespace asmjit; @@ -254,11 +275,18 @@ namespace c.jmp(asmjit::imm_ptr(©_data_swap_u32_naive)); } +#else + template + constexpr auto build_copy_data_swap_u32() + { + return ©_data_swap_u32_naive; + } +#endif } -built_function copy_data_swap_u32("copy_data_swap_u32", &build_copy_data_swap_u32); +built_function copy_data_swap_u32("copy_data_swap_u32", &build_copy_data_swap_u32); -built_function copy_data_swap_u32_cmp("copy_data_swap_u32_cmp", &build_copy_data_swap_u32); +built_function copy_data_swap_u32_cmp("copy_data_swap_u32_cmp", &build_copy_data_swap_u32); namespace { @@ -390,6 +418,7 @@ namespace struct primitive_restart_impl { +#if defined(ARCH_X64) AVX2_FUNC static std::tuple upload_u16_swapped_avx2(const void *src, void *dst, u32 iterations, u16 restart_index) @@ -428,6 +457,7 @@ namespace return std::make_tuple(min_index, max_index); } +#endif SSE4_1_FUNC static @@ -512,9 +542,11 @@ namespace { if (s_use_avx2) { +#if defined(ARCH_X64) u32 iterations = length >> 4; written = length & ~0xF; std::tie(min_index, max_index) = upload_u16_swapped_avx2(src.data(), dst.data(), iterations, restart_index); +#endif } else if (s_use_sse4_1) { diff --git a/rpcs3/Emu/RSX/Common/BufferUtils.h b/rpcs3/Emu/RSX/Common/BufferUtils.h index 1c872e7319..ad02e72f42 100644 --- a/rpcs3/Emu/RSX/Common/BufferUtils.h +++ b/rpcs3/Emu/RSX/Common/BufferUtils.h @@ -51,7 +51,7 @@ void stream_vector(void *dst, u32 x, u32 y, u32 z, u32 w); void stream_vector_from_memory(void *dst, void *src); // Copy and swap data in 32-bit units -extern built_function copy_data_swap_u32; +extern built_function copy_data_swap_u32; // Copy and swap data in 32-bit units, return true if changed -extern built_function copy_data_swap_u32_cmp; +extern built_function copy_data_swap_u32_cmp; diff --git a/rpcs3/Emu/RSX/GL/GLTextureCache.cpp b/rpcs3/Emu/RSX/GL/GLTextureCache.cpp index f7a6cb68e2..a20b8df5b2 100644 --- a/rpcs3/Emu/RSX/GL/GLTextureCache.cpp +++ b/rpcs3/Emu/RSX/GL/GLTextureCache.cpp @@ -38,16 +38,16 @@ namespace gl ensure(real_pitch == (width * 4)); if (rsx_pitch == real_pitch) [[likely]] { - copy_data_swap_u32(dst, dst, valid_length / 4); + 
copy_data_swap_u32(static_cast(dst), static_cast(dst), valid_length / 4); } else { const u32 num_rows = utils::align(valid_length, rsx_pitch) / rsx_pitch; - u8* data = static_cast(dst); + u32* data = static_cast(dst); for (u32 row = 0; row < num_rows; ++row) { copy_data_swap_u32(data, data, width); - data += rsx_pitch; + data += rsx_pitch / 4; } } break; diff --git a/rpcs3/Emu/RSX/Program/program_state_cache2.hpp b/rpcs3/Emu/RSX/Program/program_state_cache2.hpp index 9dc1e4f132..ada78f4931 100644 --- a/rpcs3/Emu/RSX/Program/program_state_cache2.hpp +++ b/rpcs3/Emu/RSX/Program/program_state_cache2.hpp @@ -2,9 +2,12 @@ #include "ProgramStateCache.h" -#include "emmintrin.h" #include "util/asm.hpp" +#if defined(ARCH_X64) +#include "emmintrin.h" +#endif + template void program_state_cache::fill_fragment_constants_buffer(std::span dst_buffer, const RSXFragmentProgram &fragment_program, bool sanitize) const { @@ -19,12 +22,23 @@ void program_state_cache::fill_fragment_constants_buffer(std::span for (usz offset_in_fragment_program : I->second.FragmentConstantOffsetCache) { char* data = static_cast(fragment_program.get_data()) + offset_in_fragment_program; + +#if defined(ARCH_X64) const __m128i vector = _mm_loadu_si128(reinterpret_cast<__m128i*>(data)); const __m128i shuffled_vector = _mm_or_si128(_mm_slli_epi16(vector, 8), _mm_srli_epi16(vector, 8)); +#else + for (u32 i = 0; i < 4; i++) + { + const u32 value = reinterpret_cast(data)[i]; + tmp[i] = std::bit_cast(((value >> 8) & 0xff00ff) | ((value << 8) & 0xff00ff00)); + } +#endif if (!patch_table.is_empty()) { +#if defined(ARCH_X64) _mm_store_ps(tmp, _mm_castsi128_ps(shuffled_vector)); +#endif for (int i = 0; i < 4; ++i) { @@ -47,15 +61,29 @@ void program_state_cache::fill_fragment_constants_buffer(std::span } else if (sanitize) { +#if defined(ARCH_X64) //Convert NaNs and Infs to 0 const auto masked = _mm_and_si128(shuffled_vector, _mm_set1_epi32(0x7fffffff)); const auto valid = _mm_cmplt_epi32(masked, _mm_set1_epi32(0x7f800000)); const auto result = _mm_and_si128(shuffled_vector, valid); _mm_stream_si128(utils::bless<__m128i>(dst), result); +#else + for (u32 i = 0; i < 4; i++) + { + const u32 value = std::bit_cast(tmp[i]); + tmp[i] = (value & 0x7fffffff) < 0x7f800000 ? value : 0; + } + + std::memcpy(dst, tmp, 16); +#endif } else { +#if defined(ARCH_X64) _mm_stream_si128(utils::bless<__m128i>(dst), shuffled_vector); +#else + std::memcpy(dst, tmp, 16); +#endif } dst += 4; diff --git a/rpcs3/Emu/RSX/VK/VKGSRender.h b/rpcs3/Emu/RSX/VK/VKGSRender.h index da0d6d2296..50604bb3d6 100644 --- a/rpcs3/Emu/RSX/VK/VKGSRender.h +++ b/rpcs3/Emu/RSX/VK/VKGSRender.h @@ -20,6 +20,7 @@ #include "VKShaderInterpreter.h" #include "VKQueryPool.h" #include "../GCM.h" +#include "util/asm.hpp" #include #include @@ -310,11 +311,7 @@ namespace vk { while (num_waiters.load() != 0) { -#ifdef _MSC_VER - _mm_pause(); -#else - __builtin_ia32_pause(); -#endif + utils::pause(); } } diff --git a/rpcs3/Emu/RSX/VK/vkutils/device.cpp b/rpcs3/Emu/RSX/VK/vkutils/device.cpp index 5480ba4011..4837e89966 100644 --- a/rpcs3/Emu/RSX/VK/vkutils/device.cpp +++ b/rpcs3/Emu/RSX/VK/vkutils/device.cpp @@ -452,6 +452,18 @@ namespace vk enabled_features.shaderStorageImageWriteWithoutFormat = VK_FALSE; } + if (!pgpu->features.shaderClipDistance) + { + rsx_log.error("Your GPU does not support shader clip distance. 
Graphics will not render correctly."); + enabled_features.shaderClipDistance = VK_FALSE; + } + + if (!pgpu->features.shaderStorageBufferArrayDynamicIndexing) + { + rsx_log.error("Your GPU does not support shader storage buffer array dynamic indexing. Graphics will not render correctly."); + enabled_features.shaderStorageBufferArrayDynamicIndexing = VK_FALSE; + } + if (!pgpu->features.samplerAnisotropy) { rsx_log.error("Your GPU does not support anisotropic filtering. Graphics may not render correctly."); diff --git a/rpcs3/Emu/RSX/VK/vkutils/sync.cpp b/rpcs3/Emu/RSX/VK/vkutils/sync.cpp index 874217545a..292a2833f6 100644 --- a/rpcs3/Emu/RSX/VK/vkutils/sync.cpp +++ b/rpcs3/Emu/RSX/VK/vkutils/sync.cpp @@ -12,10 +12,6 @@ namespace vk { -#ifdef _MSC_VER - extern "C" void _mm_pause(); -#endif - fence::fence(VkDevice dev) { owner = dev; @@ -48,11 +44,7 @@ namespace vk { while (!flushed) { -#ifdef _MSC_VER - _mm_pause(); -#else - __builtin_ia32_pause(); -#endif + utils::pause(); } } @@ -218,11 +210,7 @@ namespace vk } } -#ifdef _MSC_VER - _mm_pause(); -#else - __builtin_ia32_pause(); -#endif + utils::pause(); } } } diff --git a/rpcs3/Emu/RSX/rsx_methods.cpp b/rpcs3/Emu/RSX/rsx_methods.cpp index 0bd21b9cde..ebba57a77c 100644 --- a/rpcs3/Emu/RSX/rsx_methods.cpp +++ b/rpcs3/Emu/RSX/rsx_methods.cpp @@ -42,8 +42,10 @@ namespace rsx { rsx->sync(); - // Write ref+get atomically (get will be written again with the same value at command end) - vm::_ref>(rsx->dma_address + ::offset32(&RsxDmaControl::get)).store(u64{rsx->fifo_ctrl->get_pos()} << 32 | arg); + // Write ref+get (get will be written again with the same value at command end) + auto& dma = vm::_ref(rsx->dma_address); + dma.get.release(rsx->fifo_ctrl->get_pos()); + dma.ref.store(arg); } void semaphore_acquire(thread* rsx, u32 /*reg*/, u32 arg) @@ -436,11 +438,11 @@ namespace rsx if (rsx->m_graphics_state & rsx::pipeline_state::transform_constants_dirty) { // Minor optimization: don't compare values if we already know we need invalidation - copy_data_swap_u32(values, vm::base(rsx->fifo_ctrl->get_current_arg_ptr()), rcount); + copy_data_swap_u32(values, static_cast(vm::base(rsx->fifo_ctrl->get_current_arg_ptr())), rcount); } else { - if (copy_data_swap_u32_cmp(values, vm::base(rsx->fifo_ctrl->get_current_arg_ptr()), rcount)) + if (copy_data_swap_u32_cmp(values, static_cast(vm::base(rsx->fifo_ctrl->get_current_arg_ptr())), rcount)) { // Transform constants invalidation is expensive (~8k bytes per update) rsx->m_graphics_state |= rsx::pipeline_state::transform_constants_dirty; @@ -472,7 +474,7 @@ namespace rsx rcount -= max - (max_vertex_program_instructions * 4); } - copy_data_swap_u32(&rsx::method_registers.transform_program[load_pos * 4 + index % 4], vm::base(rsx->fifo_ctrl->get_current_arg_ptr()), rcount); + copy_data_swap_u32(&rsx::method_registers.transform_program[load_pos * 4 + index % 4], static_cast(vm::base(rsx->fifo_ctrl->get_current_arg_ptr())), rcount); rsx->m_graphics_state |= rsx::pipeline_state::vertex_program_ucode_dirty; rsx::method_registers.transform_program_load_set(load_pos + ((rcount + index % 4) / 4)); diff --git a/rpcs3/Emu/perf_meter.cpp b/rpcs3/Emu/perf_meter.cpp index d570394066..db9aace584 100644 --- a/rpcs3/Emu/perf_meter.cpp +++ b/rpcs3/Emu/perf_meter.cpp @@ -2,6 +2,8 @@ #include "perf_meter.hpp" #include "util/sysinfo.hpp" +#include "util/fence.hpp" +#include "util/tsc.hpp" #include "Utilities/Thread.h" #include @@ -68,18 +70,10 @@ void perf_stat_base::print(const char* name) const noexcept } } -#ifdef _MSC_VER -extern 
"C" void _mm_lfence(); -#endif - SAFE_BUFFERS(void) perf_stat_base::push(u64 data[66], u64 start_time, const char* name) noexcept { // Event end -#ifdef _MSC_VER - const u64 end_time = (_mm_lfence(), get_tsc()); -#else - const u64 end_time = (__builtin_ia32_lfence(), get_tsc()); -#endif + const u64 end_time = (utils::lfence(), utils::get_tsc()); // Compute difference in seconds const f64 diff = (end_time - start_time) * 1. / utils::get_tsc_freq(); diff --git a/rpcs3/Emu/perf_meter.hpp b/rpcs3/Emu/perf_meter.hpp index a936e13994..d439adf81f 100644 --- a/rpcs3/Emu/perf_meter.hpp +++ b/rpcs3/Emu/perf_meter.hpp @@ -2,26 +2,13 @@ #include "util/types.hpp" #include "util/logs.hpp" +#include "util/tsc.hpp" #include "system_config.h" #include #include LOG_CHANNEL(perf_log, "PERF"); -#ifdef _MSC_VER -extern "C" u64 __rdtsc(); - -inline u64 get_tsc() -{ - return __rdtsc(); -} -#else -inline u64 get_tsc() -{ - return __builtin_ia32_rdtsc(); -} -#endif - // TODO: constexpr with the help of bitcast template inline const auto perf_name = [] @@ -145,7 +132,7 @@ public: if constexpr (std::array{(SubEvents == Event)...}[Index]) { // Push actual timestamp into an array - m_timestamps[Index + 1] = get_tsc(); + m_timestamps[Index + 1] = utils::get_tsc(); } else if constexpr (Index < sizeof...(SubEvents)) { @@ -169,7 +156,7 @@ public: // Re-initialize first timestamp FORCE_INLINE SAFE_BUFFERS(void) restart() noexcept { - m_timestamps[0] = get_tsc(); + m_timestamps[0] = utils::get_tsc(); std::memset(m_timestamps + 1, 0, sizeof(m_timestamps) - sizeof(u64)); } diff --git a/rpcs3/Emu/system_config.h b/rpcs3/Emu/system_config.h index a456beaf36..aa989758a8 100644 --- a/rpcs3/Emu/system_config.h +++ b/rpcs3/Emu/system_config.h @@ -52,12 +52,15 @@ struct cfg_root : cfg::node cfg::_enum enable_TSX{ this, "Enable TSX", enable_tsx_by_default() ? tsx_usage::enabled : tsx_usage::disabled }; // Enable TSX. Forcing this on Haswell/Broadwell CPUs should be used carefully cfg::_bool spu_accurate_xfloat{ this, "Accurate xfloat", false }; cfg::_bool spu_approx_xfloat{ this, "Approximate xfloat", true }; - cfg::_bool llvm_accurate_dfma{ this, "LLVM Accurate DFMA", true }; // Enable accurate double-precision FMA for CPUs which do not support it natively - cfg::_bool llvm_ppu_jm_handling{ this, "PPU LLVM Java Mode Handling", true }; // Respect current Java Mode for alti-vec ops by PPU LLVM cfg::_int<-1, 14> ppu_128_reservations_loop_max_length{ this, "Accurate PPU 128-byte Reservation Op Max Length", 0, true }; // -1: Always accurate, 0: Never accurate, 1-14: max accurate loop length - cfg::_bool llvm_ppu_accurate_vector_nan{ this, "PPU LLVM Accurate Vector NaN values", false }; cfg::_int<-64, 64> stub_ppu_traps{ this, "Stub PPU Traps", 0, true }; // Hack, skip PPU traps for rare cases where the trap is continueable (specify relative instructions to skip) - cfg::_bool full_width_avx512{ this, "Full Width AVX-512", false}; + cfg::_bool full_width_avx512{ this, "Full Width AVX-512", false }; + cfg::_bool use_accurate_dfma{ this, "Use Accurate DFMA", true }; // Enable accurate double-precision FMA for CPUs which do not support it natively + cfg::_bool ppu_set_sat_bit{ this, "PPU Set Saturation Bit", false }; // Accuracy. If unset, completely disable saturation flag handling. + cfg::_bool ppu_use_nj_bit{ this, "PPU Use Non-Java Mode Bit", false }; // Accuracy. If unset, ignore NJ flag completely. + cfg::_bool ppu_fix_vnan{ this, "PPU Fixup Vector NaN Values", false }; // Accuracy. Partial. 
+ cfg::_bool ppu_set_vnan{ this, "PPU Accurate Vector NaN Values", false }; // Accuracy. Implies ppu_fix_vnan. + cfg::_bool ppu_set_fpcc{ this, "PPU Set FPCC Bits", false }; // Accuracy. cfg::_bool debug_console_mode{ this, "Debug Console Mode", false }; // Debug console emulation, not recommended cfg::_bool hook_functions{ this, "Hook static functions" }; diff --git a/rpcs3/Emu/system_config_types.cpp b/rpcs3/Emu/system_config_types.cpp index f2a1a36edb..ab99907795 100644 --- a/rpcs3/Emu/system_config_types.cpp +++ b/rpcs3/Emu/system_config_types.cpp @@ -256,8 +256,8 @@ void fmt_class_string::format(std::string& out, u64 arg) { switch (type) { - case spu_decoder_type::precise: return "Interpreter (precise)"; - case spu_decoder_type::fast: return "Interpreter (fast)"; + case spu_decoder_type::_static: return "Interpreter (static)"; + case spu_decoder_type::dynamic: return "Interpreter (dynamic)"; case spu_decoder_type::asmjit: return "Recompiler (ASMJIT)"; case spu_decoder_type::llvm: return "Recompiler (LLVM)"; } @@ -440,8 +440,8 @@ void fmt_class_string::format(std::string& out, u64 arg) { switch (type) { - case ppu_decoder_type::precise: return "Interpreter (precise)"; - case ppu_decoder_type::fast: return "Interpreter (fast)"; + case ppu_decoder_type::_static: return "Interpreter (static)"; + case ppu_decoder_type::dynamic: return "Interpreter (dynamic)"; case ppu_decoder_type::llvm: return "Recompiler (LLVM)"; } diff --git a/rpcs3/Emu/system_config_types.h b/rpcs3/Emu/system_config_types.h index 59d26fbfea..40691f0129 100644 --- a/rpcs3/Emu/system_config_types.h +++ b/rpcs3/Emu/system_config_types.h @@ -2,15 +2,15 @@ enum class ppu_decoder_type : unsigned { - precise = 0, // Don't change (0) - fast, // Don't change (1) + _static, + dynamic, llvm, }; enum class spu_decoder_type : unsigned { - precise = 0, // Don't change (0) - fast, // Don't change (1) + _static, + dynamic, asmjit, llvm, }; diff --git a/rpcs3/emucore.vcxproj b/rpcs3/emucore.vcxproj index 27e40c5664..b8d7e47275 100644 --- a/rpcs3/emucore.vcxproj +++ b/rpcs3/emucore.vcxproj @@ -509,7 +509,7 @@ - + diff --git a/rpcs3/emucore.vcxproj.filters b/rpcs3/emucore.vcxproj.filters index 985800bbad..0820031b31 100644 --- a/rpcs3/emucore.vcxproj.filters +++ b/rpcs3/emucore.vcxproj.filters @@ -1122,7 +1122,7 @@ Utilities - + Utilities diff --git a/rpcs3/main.cpp b/rpcs3/main.cpp index cae695ddb4..7dec9dd249 100644 --- a/rpcs3/main.cpp +++ b/rpcs3/main.cpp @@ -42,6 +42,7 @@ DYNAMIC_IMPORT("ntdll.dll", NtSetTimerResolution, NTSTATUS(ULONG DesiredResoluti #include #include #include +#include #endif #ifdef __linux__ @@ -49,7 +50,7 @@ DYNAMIC_IMPORT("ntdll.dll", NtSetTimerResolution, NTSTATUS(ULONG DesiredResoluti #include #endif -#if defined(__APPLE__) && defined(BLOCKS) // BLOCKS is required for dispatch_sync, but GCC-11 does not support it +#if defined(__APPLE__) #include #endif @@ -96,7 +97,7 @@ LOG_CHANNEL(q_debug, "QDEBUG"); fmt::append(buf, "\nThread id = %s.", std::this_thread::get_id()); } - const std::string_view text = buf.empty() ? _text : buf; + std::string_view text = buf.empty() ? 
_text : buf; if (s_headless) { @@ -124,18 +125,16 @@ LOG_CHANNEL(q_debug, "QDEBUG"); std::cerr << fmt::format("RPCS3: %s\n", text); } - auto show_report = [](std::string_view text) + static auto show_report = [](std::string_view text) { fatal_error_dialog dlg(text); dlg.exec(); }; -#if defined(__APPLE__) && defined(BLOCKS) // BLOCKS is required for dispatch_sync, but GCC-11 does not support it - // Cocoa access is not allowed outside of the main thread - // Prevents crash dialogs from freezing the program +#if defined(__APPLE__) if (!pthread_main_np()) { - dispatch_sync(dispatch_get_main_queue(), ^ { show_report(text); }); + dispatch_sync_f(dispatch_get_main_queue(), &text, [](void* text){ show_report(*static_cast(text)); }); } else #endif @@ -143,9 +142,12 @@ LOG_CHANNEL(q_debug, "QDEBUG"); // If Qt is already initialized, spawn a new RPCS3 process with an --error argument if (local) { - // Since we only show an error, we can hope for a graceful exit show_report(text); - std::exit(0); +#ifdef _WIN32 + ExitProcess(0); +#else + kill(getpid(), SIGKILL); +#endif } #ifdef _WIN32 diff --git a/rpcs3/rpcs3qt/cheat_manager.cpp b/rpcs3/rpcs3qt/cheat_manager.cpp index 3520bd591a..a5b83f082b 100644 --- a/rpcs3/rpcs3qt/cheat_manager.cpp +++ b/rpcs3/rpcs3qt/cheat_manager.cpp @@ -408,7 +408,7 @@ bool cheat_engine::set_value(const u32 offset, const T value) if (exec_code_at_end || exec_code_at_start) { - extern void ppu_register_function_at(u32, u32, ppu_function_t); + extern void ppu_register_function_at(u32, u32, ppu_intrp_func_t); u32 addr = offset, size = sizeof(T); diff --git a/rpcs3/rpcs3qt/debugger_frame.cpp b/rpcs3/rpcs3qt/debugger_frame.cpp index 960a144c71..1528f30aa1 100644 --- a/rpcs3/rpcs3qt/debugger_frame.cpp +++ b/rpcs3/rpcs3qt/debugger_frame.cpp @@ -45,8 +45,8 @@ extern bool is_using_interpreter(u32 id_type) switch (id_type) { case 1: return g_cfg.core.ppu_decoder != ppu_decoder_type::llvm; - case 2: return g_cfg.core.spu_decoder == spu_decoder_type::fast || g_cfg.core.spu_decoder == spu_decoder_type::precise; - default: return true; + case 2: return g_cfg.core.spu_decoder != spu_decoder_type::asmjit && g_cfg.core.spu_decoder != spu_decoder_type::llvm; + default: return true; } } @@ -528,7 +528,7 @@ void debugger_frame::keyPressEvent(QKeyEvent* event) dis_asm.disasm(*it); fmt::append(ret, "\n(%u) 0x%08x: %s", i, *it, dis_asm.last_opcode); } - + if (ret.empty()) { ret = "No PPU calls have been logged"; @@ -1134,7 +1134,7 @@ void debugger_frame::EnableButtons(bool enable) if (!cpu) enable = false; const bool step = enable && is_using_interpreter(cpu->id_type()); - + m_go_to_addr->setEnabled(enable); m_go_to_pc->setEnabled(enable); m_btn_step->setEnabled(step); diff --git a/rpcs3/rpcs3qt/emu_settings.cpp b/rpcs3/rpcs3qt/emu_settings.cpp index ce7ec248a1..ce245cec29 100644 --- a/rpcs3/rpcs3qt/emu_settings.cpp +++ b/rpcs3/rpcs3qt/emu_settings.cpp @@ -1104,16 +1104,16 @@ QString emu_settings::GetLocalizedSetting(const QString& original, emu_settings_ case emu_settings_type::PPUDecoder: switch (static_cast(index)) { - case ppu_decoder_type::precise: return tr("Interpreter (precise)", "PPU decoder"); - case ppu_decoder_type::fast: return tr("Interpreter (fast)", "PPU decoder"); + case ppu_decoder_type::_static: return tr("Interpreter (static)", "PPU decoder"); + case ppu_decoder_type::dynamic: return tr("Interpreter (dynamic)", "PPU decoder"); case ppu_decoder_type::llvm: return tr("Recompiler (LLVM)", "PPU decoder"); } break; case emu_settings_type::SPUDecoder: switch (static_cast(index)) { - 
case spu_decoder_type::precise: return tr("Interpreter (precise)", "SPU decoder"); - case spu_decoder_type::fast: return tr("Interpreter (fast)", "SPU decoder"); + case spu_decoder_type::_static: return tr("Interpreter (static)", "SPU decoder"); + case spu_decoder_type::dynamic: return tr("Interpreter (dynamic)", "SPU decoder"); case spu_decoder_type::asmjit: return tr("Recompiler (ASMJIT)", "SPU decoder"); case spu_decoder_type::llvm: return tr("Recompiler (LLVM)", "SPU decoder"); } diff --git a/rpcs3/rpcs3qt/emu_settings_type.h b/rpcs3/rpcs3qt/emu_settings_type.h index db64d90737..2cf1eae4fe 100644 --- a/rpcs3/rpcs3qt/emu_settings_type.h +++ b/rpcs3/rpcs3qt/emu_settings_type.h @@ -23,8 +23,6 @@ enum class emu_settings_type AccurateGETLLAR, AccurateSpuDMA, AccurateClineStores, - AccurateLLVMdfma, - AccurateVectorNaN, AccurateRSXAccess, AccurateXFloat, AccuratePPU128Loop, @@ -40,7 +38,12 @@ enum class emu_settings_type ClocksScale, PerformanceReport, FullWidthAVX512, - PPULLVMJavaModeHandling, + AccurateDFMA, + AccuratePPUSAT, + AccuratePPUNJ, + FixupPPUVNAN, + AccuratePPUVNAN, + AccuratePPUFPCC, // Graphics Renderer, @@ -178,8 +181,6 @@ inline static const QMap settings_location = { emu_settings_type::AccurateGETLLAR, { "Core", "Accurate GETLLAR"}}, { emu_settings_type::AccurateSpuDMA, { "Core", "Accurate SPU DMA"}}, { emu_settings_type::AccurateClineStores, { "Core", "Accurate Cache Line Stores"}}, - { emu_settings_type::AccurateLLVMdfma, { "Core", "LLVM Accurate DFMA"}}, - { emu_settings_type::AccurateVectorNaN, { "Core", "PPU LLVM Accurate Vector NaN values"}}, { emu_settings_type::AccurateRSXAccess, { "Core", "Accurate RSX reservation access"}}, { emu_settings_type::AccurateXFloat, { "Core", "Accurate xfloat"}}, { emu_settings_type::MFCCommandsShuffling, { "Core", "MFC Commands Shuffling Limit"}}, @@ -194,7 +195,12 @@ inline static const QMap settings_location = { emu_settings_type::PerformanceReport, { "Core", "Enable Performance Report"}}, { emu_settings_type::FullWidthAVX512, { "Core", "Full Width AVX-512"}}, { emu_settings_type::NumPPUThreads, { "Core", "PPU Threads"}}, - { emu_settings_type::PPULLVMJavaModeHandling, { "Core", "PPU LLVM Java Mode Handling"}}, + { emu_settings_type::AccurateDFMA, { "Core", "Use Accurate DFMA"}}, + { emu_settings_type::AccuratePPUSAT, { "Core", "PPU Set Saturation Bit"}}, + { emu_settings_type::AccuratePPUNJ, { "Core", "PPU Use Non-Java Mode Bit"}}, + { emu_settings_type::FixupPPUVNAN, { "Core", "PPU Fixup Vector NaN Values"}}, + { emu_settings_type::AccuratePPUVNAN, { "Core", "PPU Accurate Vector NaN Values"}}, + { emu_settings_type::AccuratePPUFPCC, { "Core", "PPU Set FPCC Bits"}}, // Graphics Tab { emu_settings_type::Renderer, { "Video", "Renderer"}}, diff --git a/rpcs3/rpcs3qt/settings_dialog.cpp b/rpcs3/rpcs3qt/settings_dialog.cpp index e3f12d63eb..931fbbf58a 100644 --- a/rpcs3/rpcs3qt/settings_dialog.cpp +++ b/rpcs3/rpcs3qt/settings_dialog.cpp @@ -319,26 +319,26 @@ settings_dialog::settings_dialog(std::shared_ptr gui_settings, std } // PPU tool tips - SubscribeTooltip(ui->ppu_precise, tooltips.settings.ppu_precise); - SubscribeTooltip(ui->ppu_fast, tooltips.settings.ppu_fast); + SubscribeTooltip(ui->ppu__static, tooltips.settings.ppu__static); + SubscribeTooltip(ui->ppu_dynamic, tooltips.settings.ppu_dynamic); SubscribeTooltip(ui->ppu_llvm, tooltips.settings.ppu_llvm); QButtonGroup *ppu_bg = new QButtonGroup(this); - ppu_bg->addButton(ui->ppu_precise, static_cast(ppu_decoder_type::precise)); - ppu_bg->addButton(ui->ppu_fast, 
static_cast(ppu_decoder_type::fast)); + ppu_bg->addButton(ui->ppu__static, static_cast(ppu_decoder_type::_static)); + ppu_bg->addButton(ui->ppu_dynamic, static_cast(ppu_decoder_type::dynamic)); ppu_bg->addButton(ui->ppu_llvm, static_cast(ppu_decoder_type::llvm)); m_emu_settings->EnhanceRadioButton(ppu_bg, emu_settings_type::PPUDecoder); // SPU tool tips - SubscribeTooltip(ui->spu_precise, tooltips.settings.spu_precise); - SubscribeTooltip(ui->spu_fast, tooltips.settings.spu_fast); + SubscribeTooltip(ui->spu__static, tooltips.settings.spu__static); + SubscribeTooltip(ui->spu_dynamic, tooltips.settings.spu_dynamic); SubscribeTooltip(ui->spu_asmjit, tooltips.settings.spu_asmjit); SubscribeTooltip(ui->spu_llvm, tooltips.settings.spu_llvm); QButtonGroup *spu_bg = new QButtonGroup(this); - spu_bg->addButton(ui->spu_precise, static_cast(spu_decoder_type::precise)); - spu_bg->addButton(ui->spu_fast, static_cast(spu_decoder_type::fast)); + spu_bg->addButton(ui->spu__static, static_cast(spu_decoder_type::_static)); + spu_bg->addButton(ui->spu_dynamic, static_cast(spu_decoder_type::dynamic)); spu_bg->addButton(ui->spu_asmjit, static_cast(spu_decoder_type::asmjit)); spu_bg->addButton(ui->spu_llvm, static_cast(spu_decoder_type::llvm)); @@ -349,17 +349,24 @@ settings_dialog::settings_dialog(std::shared_ptr gui_settings, std ui->accurateXFloat->setEnabled(checked); }); - connect(ui->spu_fast, &QAbstractButton::toggled, [this](bool checked) + connect(ui->spu__static, &QAbstractButton::toggled, [this](bool checked) { ui->accurateXFloat->setEnabled(checked); }); - ui->accurateXFloat->setEnabled(ui->spu_llvm->isChecked() || ui->spu_fast->isChecked()); + connect(ui->spu_dynamic, &QAbstractButton::toggled, [this](bool checked) + { + ui->accurateXFloat->setEnabled(checked); + }); + + ui->accurateXFloat->setEnabled(ui->spu_llvm->isChecked() || ui->spu_dynamic->isChecked()); #ifndef LLVM_AVAILABLE ui->ppu_llvm->setEnabled(false); ui->spu_llvm->setEnabled(false); + ui->spu_dynamic->setEnabled(false); #endif + ui->ppu_dynamic->setEnabled(false); // _____ _____ _ _ _______ _ // / ____| __ \| | | | |__ __| | | @@ -1138,12 +1145,24 @@ settings_dialog::settings_dialog(std::shared_ptr gui_settings, std m_emu_settings->EnhanceCheckBox(ui->debugConsoleMode, emu_settings_type::DebugConsoleMode); SubscribeTooltip(ui->debugConsoleMode, tooltips.settings.debug_console_mode); - m_emu_settings->EnhanceCheckBox(ui->accurateLLVMdfma, emu_settings_type::AccurateLLVMdfma); - SubscribeTooltip(ui->accurateLLVMdfma, tooltips.settings.accurate_llvm_dfma); - ui->accurateLLVMdfma->setDisabled(utils::has_fma3() || utils::has_fma4()); + m_emu_settings->EnhanceCheckBox(ui->accurateDFMA, emu_settings_type::AccurateDFMA); + SubscribeTooltip(ui->accurateDFMA, tooltips.settings.accurate_dfma); + ui->accurateDFMA->setDisabled(utils::has_fma3() || utils::has_fma4()); - m_emu_settings->EnhanceCheckBox(ui->AccurateVectorNaN, emu_settings_type::AccurateVectorNaN); - SubscribeTooltip(ui->AccurateVectorNaN, tooltips.settings.accurate_vector_nan); + m_emu_settings->EnhanceCheckBox(ui->accuratePPUSAT, emu_settings_type::AccuratePPUSAT); + SubscribeTooltip(ui->accuratePPUSAT, tooltips.settings.accurate_ppusat); + + m_emu_settings->EnhanceCheckBox(ui->accuratePPUNJ, emu_settings_type::AccuratePPUNJ); + SubscribeTooltip(ui->accuratePPUNJ, tooltips.settings.accurate_ppunj); + + m_emu_settings->EnhanceCheckBox(ui->fixupPPUVNAN, emu_settings_type::FixupPPUVNAN); + SubscribeTooltip(ui->fixupPPUVNAN, tooltips.settings.fixup_ppuvnan); + + 
m_emu_settings->EnhanceCheckBox(ui->accuratePPUVNAN, emu_settings_type::AccuratePPUVNAN); + SubscribeTooltip(ui->accuratePPUVNAN, tooltips.settings.accurate_ppuvnan); + + m_emu_settings->EnhanceCheckBox(ui->accuratePPUFPCC, emu_settings_type::AccuratePPUFPCC); + SubscribeTooltip(ui->accuratePPUFPCC, tooltips.settings.accurate_ppufpcc); m_emu_settings->EnhanceCheckBox(ui->silenceAllLogs, emu_settings_type::SilenceAllLogs); SubscribeTooltip(ui->silenceAllLogs, tooltips.settings.silence_all_logs); @@ -1927,9 +1946,6 @@ settings_dialog::settings_dialog(std::shared_ptr gui_settings, std m_emu_settings->EnhanceCheckBox(ui->accurateRSXAccess, emu_settings_type::AccurateRSXAccess); SubscribeTooltip(ui->accurateRSXAccess, tooltips.settings.accurate_rsx_access); - m_emu_settings->EnhanceCheckBox(ui->ppuLlvmJavaModeHandling, emu_settings_type::PPULLVMJavaModeHandling); - SubscribeTooltip(ui->ppuLlvmJavaModeHandling, tooltips.settings.ppu_llvm_java_mode_handling); - m_emu_settings->EnhanceCheckBox(ui->ppuPrecompilation, emu_settings_type::PPULLVMPrecompilation); SubscribeTooltip(ui->ppuPrecompilation, tooltips.settings.ppu_precompilation); diff --git a/rpcs3/rpcs3qt/settings_dialog.ui b/rpcs3/rpcs3qt/settings_dialog.ui index b39694c486..01e6bf56ae 100644 --- a/rpcs3/rpcs3qt/settings_dialog.ui +++ b/rpcs3/rpcs3qt/settings_dialog.ui @@ -57,16 +57,16 @@ - + - Interpreter (precise) + Interpreter (static) - + - Interpreter (fast) + Interpreter (dynamic) @@ -87,16 +87,16 @@ - + - Interpreter (precise) + Interpreter (static) - + - Interpreter (fast) + Interpreter (dynamic) @@ -2050,9 +2050,9 @@ - + - Accurate LLVM DFMA + Accurate DFMA @@ -2064,16 +2064,37 @@ - + - PPU LLVM Accurate Vector NaNs + Accurate PPU Saturation Bit - + - PPU LLVM Java Mode Handling + Accurate PPU Non-Java Mode + + + + + + + PPU Vector NaN Fixup + + + + + + + Accurate PPU Vector NaN Handling + + + + + + + Accurate PPU Float Condition Control diff --git a/rpcs3/rpcs3qt/tooltips.h b/rpcs3/rpcs3qt/tooltips.h index 739bbc6b38..73ed608ec0 100644 --- a/rpcs3/rpcs3qt/tooltips.h +++ b/rpcs3/rpcs3qt/tooltips.h @@ -55,12 +55,12 @@ public: // cpu - const QString ppu_precise = tr("Interprets PPU code with absolute accuracy.\nThis is the most accurate Interpreter, but very slow to play games with.\nYou may try this as a last resort if you encounter odd bugs or crashes.\nIf unsure, use PPU Interpreter Fast or PPU Recompiler (LLVM)."); - const QString ppu_fast = tr("Interprets PPU code with sacrificed accuracy in order to achieve better performance.\nThis is the fastest interpreter.\nIt very rarely breaks games even in comparison to the Precise option.\nTry this if PPU Recompiler (LLVM) fails."); + const QString ppu__static = tr("Interpreter (slow). Try this if PPU Recompiler (LLVM) doesn't work."); + const QString ppu_dynamic = tr("Alternative interpreter (slow). May be faster than static interpreter. Try this if PPU Recompiler (LLVM) doesn't work."); const QString ppu_llvm = tr("Recompiles and caches the game's PPU code using the LLVM Recompiler once before running it for the first time.\nThis is by far the fastest option and should always be used.\nShould you face compatibility issues, fall back to one of the Interpreters and retry.\nIf unsure, use this option."); const QString ppu_precompilation = tr("Searches the game's directory and precompiles extra PPU modules during boot.\nIf disabled, these modules will only be compiled when needed. 
Depending on the game, this might interrupt the gameplay unexpectedly and possibly frequently.\nOnly disable this if you want to get ingame more quickly."); - const QString spu_precise = tr("Interprets SPU code with absolute accuracy.\nThis is extremely slow but may fix broken graphics in some games."); - const QString spu_fast = tr("Interprets SPU code with sacrificed accuracy in order to achieve better performance.\nThis is slower than the SPU Recompiler but significantly faster than the precise interpreter.\nHowever, games rarely need this."); + const QString spu__static = tr("Interpreter (slow). Try this if SPU Recompiler (LLVM) doesn't work."); + const QString spu_dynamic = tr("Alternative interpreter (slow). May be faster than static interpreter. Try this if SPU Recompiler (LLVM) doesn't work."); const QString spu_asmjit = tr("Recompiles the game's SPU code using the ASMJIT Recompiler.\nThis is the fast option with very good compatibility.\nIf unsure, use this option."); const QString spu_llvm = tr("Recompiles and caches the game's SPU code using the LLVM Recompiler before running which adds extra start-up time.\nThis is the fastest option with very good compatibility.\nIf you experience issues, use the ASMJIT Recompiler."); const QString accurate_xfloat = tr("Adds extra accuracy to SPU float vectors processing.\nFixes bugs in various games at the cost of performance.\nThis setting is only applied when SPU Decoder is set to Fast or LLVM."); @@ -70,6 +70,12 @@ public: const QString spu_block_size = tr("This option controls the SPU analyser, particularly the size of compiled units. The Mega and Giga modes may improve performance by tying smaller units together, decreasing the number of compiled units but increasing their size.\nUse the Safe mode for maximum compatibility."); const QString preferred_spu_threads = tr("Some SPU stages are sensitive to race conditions and allowing a limited number at a time helps alleviate performance stalls.\nSetting this to a smaller value might improve performance and reduce stuttering in some games.\nLeave this on auto if performance is negatively affected when setting a small value."); const QString full_width_avx512 = tr("Enables the use of code with full width AVX-512.\nThis code can be executed much faster, but may cause a loss in performance if your CPU model experiences downclocking on wide AVX-512 loads.\nNote that AVX-512 instructions will be used regardless of this option, just at 128 and 256 bit width."); + const QString accurate_dfma = tr("Use accurate double-precision FMA instructions in PPU and SPU backends.\nWhile disabling it might give a decent performance boost if your CPU doesn't support FMA, it may also introduce subtle bugs that otherwise do not occur.\nYou shouldn't disable it if your CPU supports FMA."); + const QString accurate_ppusat = tr("Accurately set Saturation Bit values in PPU backends.\nIf unsure, do not modify this setting."); + const QString accurate_ppunj = tr("Respect Non-Java Mode Bit values for vector ops in PPU backends.\nIf unsure, do not modify this setting."); + const QString fixup_ppuvnan = tr("Fixup NaN results in vector instructions in PPU backends.\nIf unsure, do not modify this setting."); + const QString accurate_ppuvnan = tr("Accurately set NaN results in vector instructions in PPU backends.\nIf unsure, do not modify this setting."); + const QString accurate_ppufpcc = tr("Accurately set FPCC Bits in PPU backends.\nIf unsure, do not modify this setting."); // debug @@ -80,8 +86,6 @@ public: const QString 
accurate_getllar = tr("Accurately processes SPU MFC_GETLLAR operation."); const QString accurate_spu_dma = tr("Accurately processes SPU DMA operations."); const QString accurate_cache_line_stores = tr("Accurately processes PPU DCBZ instruction.\nIn addition, when combined with Accurate SPU DMA, SPU PUT cache line accesses will be processed atomically."); - const QString accurate_llvm_dfma = tr("Provides extra accuracy on FMA instructions at the cost of performance.\nWhile disabling it might give a decent performance boost if your CPU doesn't support FMA, it may also introduce subtle bugs that otherwise do not occur.\nYou can't disable it if your CPU supports FMA."); - const QString accurate_vector_nan = tr("Forces the floating point NaN (Not A Number) values outputted from PPU vector instructions to be accurate to the real hardware. (0x7FC00000)"); const QString accurate_rsx_access = tr("Forces RSX pauses on SPU MFC_GETLLAR and SPU MFC_PUTLLUC operations."); const QString mfc_delay_command = tr("Forces delaying any odd MFC command, waits for at least 2 pending commands to execute them in a random order.\nMust be used with either SPU interpreters currently.\nSeverely degrades performance! If unsure, don't use this option."); const QString hook_static_functions = tr("Allows to hook some functions like 'memcpy' replacing them with high-level implementations. May do nothing or break things. Experimental."); @@ -101,7 +105,6 @@ public: const QString accurate_ppu_128_loop = tr("When enabled, PPU atomic operations will operate on entire cache line data, as opposed to a single 64bit block of memory when disabled.\nNumerical values control whether or not to enable the accurate version based on the atomic operation's length."); const QString enable_performance_report = tr("Measure certain events and print a chart after the emulator is stopped. 
Don't enable if not asked to."); const QString num_ppu_threads = tr("Affects maximum amount of PPU threads running concurrently, the value of 1 has very low compatibility with games.\n2 is the default, if unsure do not modify this setting."); - const QString ppu_llvm_java_mode_handling = tr("Respect current Java Mode for alti-vec ops by PPU LLVM.\nIf unsure, do not modify this setting."); // emulator diff --git a/rpcs3/util/asm.hpp b/rpcs3/util/asm.hpp index 5757e06233..5a989a0a60 100644 --- a/rpcs3/util/asm.hpp +++ b/rpcs3/util/asm.hpp @@ -1,15 +1,15 @@ #pragma once #include "util/types.hpp" +#include "util/tsc.hpp" #include extern bool g_use_rtm; extern u64 g_rtm_tx_limit1; -#ifdef _MSC_VER +#ifdef _M_X64 extern "C" { - u64 __rdtsc(); u32 _xbegin(); void _xend(); void _mm_pause(); @@ -27,24 +27,17 @@ extern "C" s64 _div128(s64, s64, s64, s64*); u64 _udiv128(u64, u64, u64, u64*); + void __debugbreak(); } #endif namespace utils { - inline u64 get_tsc() - { -#ifdef _MSC_VER - return __rdtsc(); -#else - return __builtin_ia32_rdtsc(); -#endif - } - // Transaction helper (result = pair of success and op result, or just bool) template > inline auto tx_start(F op) { +#if defined(ARCH_X64) uint status = -1; for (auto stamp0 = get_tsc(), stamp1 = stamp0; g_use_rtm && stamp1 - stamp0 <= g_rtm_tx_limit1; stamp1 = get_tsc()) @@ -90,6 +83,9 @@ namespace utils break; } } +#else + static_cast(op); +#endif if constexpr (std::is_void_v) { @@ -113,7 +109,7 @@ namespace utils const u64 value = reinterpret_cast(func); const void* ptr = reinterpret_cast(value); -#ifdef _MSC_VER +#ifdef _M_X64 return _mm_prefetch(static_cast(ptr), 2); #else return __builtin_prefetch(ptr, 0, 2); @@ -128,7 +124,7 @@ namespace utils return; } -#ifdef _MSC_VER +#ifdef _M_X64 return _mm_prefetch(static_cast(ptr), 3); #else return __builtin_prefetch(ptr, 0, 3); @@ -142,7 +138,7 @@ namespace utils return; } -#ifdef _MSC_VER +#ifdef _M_X64 return _m_prefetchw(ptr); #else return __builtin_prefetch(ptr, 1, 0); @@ -160,8 +156,10 @@ namespace utils return _rotl8(x, n); #elif defined(__clang__) return __builtin_rotateleft8(x, n); -#else +#elif defined(ARCH_X64) return __builtin_ia32_rolqi(x, n); +#else + return (x << (n & 7)) | (x >> ((-n & 7))); #endif } @@ -176,8 +174,10 @@ namespace utils return _rotl16(x, static_cast(n)); #elif defined(__clang__) return __builtin_rotateleft16(x, n); -#else +#elif defined(ARCH_X64) return __builtin_ia32_rolhi(x, n); +#else + return (x << (n & 15)) | (x >> ((-n & 15))); #endif } @@ -344,10 +344,14 @@ namespace utils inline void pause() { -#ifdef _MSC_VER +#if defined(ARCH_ARM64) + __asm__ volatile("yield"); +#elif defined(_M_X64) _mm_pause(); -#else +#elif defined(ARCH_X64) __builtin_ia32_pause(); +#else +#error "Missing utils::pause() implementation" #endif } @@ -391,10 +395,27 @@ namespace utils { #ifdef _MSC_VER return (T*)ptr; -#else +#elif defined(ARCH_X64) T* result; __asm__("movq %1, %0;" : "=r" (result) : "r" (ptr) : "memory"); return result; +#elif defined(ARCH_ARM64) + T* result; + __asm__("mov %0, %1" : "=r" (result) : "r" (ptr) : "memory"); + return result; +#endif + } + + inline void trap() + { +#ifdef _M_X64 + __debugbreak(); +#elif defined(ARCH_X64) + __asm__ volatile("int3"); +#elif defined(ARCH_ARM64) + __asm__ volatile("brk 0x42"); +#else +#error "Missing utils::trap() implementation" #endif } } // namespace utils diff --git a/rpcs3/util/atomic.cpp b/rpcs3/util/atomic.cpp index 54c921adb8..b42099aa56 100644 --- a/rpcs3/util/atomic.cpp +++ b/rpcs3/util/atomic.cpp @@ -35,6 +35,7 @@ 
namespace utils #include "asm.hpp" #include "endian.hpp" +#include "tsc.hpp" // Total number of entries. static constexpr usz s_hashtable_size = 1u << 17; @@ -804,17 +805,9 @@ namespace }; } -#ifdef _MSC_VER -extern "C" u64 __rdtsc(); -#endif - u64 utils::get_unique_tsc() { -#ifdef _MSC_VER - const u64 stamp0 = __rdtsc(); -#else - const u64 stamp0 = __builtin_ia32_rdtsc(); -#endif + const u64 stamp0 = utils::get_tsc(); return s_min_tsc.atomic_op([&](u64& tsc) { diff --git a/rpcs3/util/atomic.hpp b/rpcs3/util/atomic.hpp index 0f60a549d6..711b669ae6 100644 --- a/rpcs3/util/atomic.hpp +++ b/rpcs3/util/atomic.hpp @@ -4,7 +4,7 @@ #include #include -#ifdef _MSC_VER +#ifdef _M_X64 #pragma warning(push) #pragma warning(disable: 4996) @@ -67,7 +67,7 @@ namespace utils FORCE_INLINE void atomic_fence_consume() { -#ifdef _MSC_VER +#ifdef _M_X64 _ReadWriteBarrier(); #else __atomic_thread_fence(__ATOMIC_CONSUME); @@ -76,7 +76,7 @@ FORCE_INLINE void atomic_fence_consume() FORCE_INLINE void atomic_fence_acquire() { -#ifdef _MSC_VER +#ifdef _M_X64 _ReadWriteBarrier(); #else __atomic_thread_fence(__ATOMIC_ACQUIRE); @@ -85,7 +85,7 @@ FORCE_INLINE void atomic_fence_acquire() FORCE_INLINE void atomic_fence_release() { -#ifdef _MSC_VER +#ifdef _M_X64 _ReadWriteBarrier(); #else __atomic_thread_fence(__ATOMIC_RELEASE); @@ -94,7 +94,7 @@ FORCE_INLINE void atomic_fence_release() FORCE_INLINE void atomic_fence_acq_rel() { -#ifdef _MSC_VER +#ifdef _M_X64 _ReadWriteBarrier(); #else __atomic_thread_fence(__ATOMIC_ACQ_REL); @@ -103,16 +103,18 @@ FORCE_INLINE void atomic_fence_acq_rel() FORCE_INLINE void atomic_fence_seq_cst() { -#ifdef _MSC_VER +#ifdef _M_X64 _ReadWriteBarrier(); _InterlockedOr(static_cast(_AddressOfReturnAddress()), 0); _ReadWriteBarrier(); -#else +#elif defined(ARCH_X64) __asm__ volatile ("lock orl $0, 0(%%rsp);" ::: "cc", "memory"); +#else + __atomic_thread_fence(__ATOMIC_SEQ_CST); #endif } -#ifdef _MSC_VER +#ifdef _M_X64 #pragma warning(pop) #endif @@ -342,7 +344,7 @@ struct atomic_storage using type = get_uint_t; -#ifndef _MSC_VER +#ifndef _M_X64 #if defined(__ATOMIC_HLE_ACQUIRE) && defined(__ATOMIC_HLE_RELEASE) static constexpr int s_hle_ack = __ATOMIC_SEQ_CST | __ATOMIC_HLE_ACQUIRE; @@ -472,7 +474,7 @@ struct atomic_storage /* Second part: MSVC-specific */ -#ifdef _MSC_VER +#ifdef _M_X64 static inline T add_fetch(T& dest, T value) { return atomic_storage::fetch_add(dest, value) + value; @@ -529,6 +531,7 @@ struct atomic_storage static inline bool bts(T& dest, uint bit) { +#if defined(ARCH_X64) uchar* dst = reinterpret_cast(&dest); if constexpr (sizeof(T) < 4) @@ -539,18 +542,23 @@ struct atomic_storage bit = bit + (ptr & 3) * 8; dst = reinterpret_cast(ptr & -4); } +#endif -#ifdef _MSC_VER +#ifdef _M_X64 return _interlockedbittestandset((long*)dst, bit) != 0; -#else +#elif defined(ARCH_X64) bool result; __asm__ volatile ("lock btsl %2, 0(%1)\n" : "=@ccc" (result) : "r" (dst), "Ir" (bit) : "cc", "memory"); return result; +#else + const T value = static_cast(1) << bit; + return (__atomic_fetch_or(&dest, value, __ATOMIC_SEQ_CST) & value) != 0; #endif } static inline bool btr(T& dest, uint bit) { +#if defined(ARCH_X64) uchar* dst = reinterpret_cast(&dest); if constexpr (sizeof(T) < 4) @@ -561,18 +569,23 @@ struct atomic_storage bit = bit + (ptr & 3) * 8; dst = reinterpret_cast(ptr & -4); } +#endif -#ifdef _MSC_VER +#ifdef _M_X64 return _interlockedbittestandreset((long*)dst, bit) != 0; -#else +#elif defined(ARCH_X64) bool result; __asm__ volatile ("lock btrl %2, 0(%1)\n" : "=@ccc" (result) : "r" 
(dst), "Ir" (bit) : "cc", "memory"); return result; +#else + const T value = static_cast(1) << bit; + return (__atomic_fetch_and(&dest, ~value, __ATOMIC_SEQ_CST) & value) != 0; #endif } static inline bool btc(T& dest, uint bit) { +#if defined(ARCH_X64) uchar* dst = reinterpret_cast(&dest); if constexpr (sizeof(T) < 4) @@ -583,8 +596,9 @@ struct atomic_storage bit = bit + (ptr & 3) * 8; dst = reinterpret_cast(ptr & -4); } +#endif -#ifdef _MSC_VER +#ifdef _M_X64 while (true) { // Keep trying until we actually invert desired bit @@ -593,10 +607,13 @@ struct atomic_storage if (_interlockedbittestandreset((long*)dst, bit)) return true; } -#else +#elif defined(ARCH_X64) bool result; __asm__ volatile ("lock btcl %2, 0(%1)\n" : "=@ccc" (result) : "r" (dst), "Ir" (bit) : "cc", "memory"); return result; +#else + const T value = static_cast(1) << bit; + return (__atomic_fetch_xor(&dest, value, __ATOMIC_SEQ_CST) & value) != 0; #endif } }; @@ -606,7 +623,7 @@ struct atomic_storage template struct atomic_storage : atomic_storage { -#ifdef _MSC_VER +#ifdef _M_X64 static inline bool compare_exchange(T& dest, T& comp, T exch) { const char v = std::bit_cast(comp); @@ -676,7 +693,7 @@ struct atomic_storage : atomic_storage template struct atomic_storage : atomic_storage { -#ifdef _MSC_VER +#ifdef _M_X64 static inline bool compare_exchange(T& dest, T& comp, T exch) { const short v = std::bit_cast(comp); @@ -758,7 +775,7 @@ struct atomic_storage : atomic_storage template struct atomic_storage : atomic_storage { -#ifdef _MSC_VER +#ifdef _M_X64 static inline bool compare_exchange(T& dest, T& comp, T exch) { const long v = std::bit_cast(comp); @@ -854,7 +871,7 @@ struct atomic_storage : atomic_storage template struct atomic_storage : atomic_storage { -#ifdef _MSC_VER +#ifdef _M_X64 static inline bool compare_exchange(T& dest, T& comp, T exch) { const llong v = std::bit_cast(comp); @@ -950,7 +967,7 @@ struct atomic_storage : atomic_storage template struct atomic_storage : atomic_storage { -#ifdef _MSC_VER +#ifdef _M_X64 static inline T load(const T& dest) { atomic_fence_acquire(); @@ -995,7 +1012,7 @@ struct atomic_storage : atomic_storage utils::atomic_store16(&dest, std::bit_cast(value)); atomic_fence_release(); } -#else +#elif defined(ARCH_X64) static inline T load(const T& dest) { alignas(16) T r; @@ -1078,6 +1095,91 @@ struct atomic_storage : atomic_storage __asm__ volatile("movdqa %0, %1;" :: "x" (val), "m" (dest) : "memory"); #endif } +#elif defined(ARCH_ARM64) + static inline T load(const T& dest) + { + u32 tmp; + u64 data[2]; + __asm__ volatile("1:\n" + "ldaxp %x[data0], %x[data1], %[dest]\n" + "stlxp %w[tmp], %x[data0], %x[data1], %[dest]\n" + "cbnz %w[tmp], 1b\n" + : [tmp] "=&r" (tmp), [data0] "=&r" (data[0]), [data1] "=&r" (data[1]) + : [dest] "Q" (dest) + : "memory" + ); + T result; + std::memcpy(&result, data, 16); + return result; + } + + static inline T observe(const T& dest) + { + // TODO + return load(dest); + } + + static inline bool compare_exchange(T& dest, T& comp, T exch) + { + bool result; + u64 cmp[2]; + std::memcpy(cmp, &comp, 16); + u64 data[2]; + std::memcpy(data, &exch, 16); + u64 prev[2]; + __asm__ volatile("1:\n" + "ldaxp %x[prev0], %x[prev1], %[storage]\n" + "cmp %x[prev0], %x[cmp0]\n" + "ccmp %x[prev1], %x[cmp1], #0, eq\n" + "b.ne 2f\n" + "stlxp %w[result], %x[data0], %x[data1], %[storage]\n" + "cbnz %w[result], 1b\n" + "2:\n" + "cset %w[result], eq\n" + : [result] "=&r" (result), [storage] "+Q" (dest), [prev0] "=&r" (prev[0]), [prev1] "=&r" (prev[1]) + : [data0] "r" (data[0]), 
[data1] "r" (data[1]), [cmp0] "r" (cmp[0]), [cmp1] "r" (cmp[1]) + : "cc", "memory" + ); + + if (result) + { + return true; + } + + std::memcpy(&comp, prev, 16); + return false; + } + + static inline T exchange(T& dest, T value) + { + u32 tmp; + u64 src[2]; + u64 data[2]; + std::memcpy(src, &value, 16); + __asm__ volatile("1:\n" + "ldaxp %x[data0], %x[data1], %[dest]\n" + "stlxp %w[tmp], %x[src0], %x[src1], %[dest]\n" + "cbnz %w[tmp], 1b\n" + : [tmp] "=&r" (tmp), [dest] "+Q" (dest), [data0] "=&r" (data[0]), [data1] "=&r" (data[1]) + : [src0] "r" (src[0]), [src1] "r" (src[1]) + : "memory" + ); + T result; + std::memcpy(&result, data, 16); + return result; + } + + static inline void store(T& dest, T value) + { + // TODO + exchange(dest, value); + } + + static inline void release(T& dest, T value) + { + // TODO + exchange(dest, value); + } #endif // TODO @@ -1562,17 +1664,50 @@ public: bool bit_test_set(uint bit) { - return atomic_storage::bts(m_data, bit & (sizeof(T) * 8 - 1)); + if constexpr (std::is_integral::value) + { + return atomic_storage::bts(m_data, bit & (sizeof(T) * 8 - 1)); + } + + return atomic_op([](type& v) + { + const auto old = v; + const auto bit = type(1) << (sizeof(T) * 8 - 1); + v |= bit; + return !!(old & bit); + }); } bool bit_test_reset(uint bit) { - return atomic_storage::btr(m_data, bit & (sizeof(T) * 8 - 1)); + if constexpr (std::is_integral::value) + { + return atomic_storage::btr(m_data, bit & (sizeof(T) * 8 - 1)); + } + + return atomic_op([](type& v) + { + const auto old = v; + const auto bit = type(1) << (sizeof(T) * 8 - 1); + v &= ~bit; + return !!(old & bit); + }); } bool bit_test_invert(uint bit) { - return atomic_storage::btc(m_data, bit & (sizeof(T) * 8 - 1)); + if constexpr (std::is_integral::value) + { + return atomic_storage::btc(m_data, bit & (sizeof(T) * 8 - 1)); + } + + return atomic_op([](type& v) + { + const auto old = v; + const auto bit = type(1) << (sizeof(T) * 8 - 1); + v ^= bit; + return !!(old & bit); + }); } // Timeout is discouraged diff --git a/rpcs3/util/fence.hpp b/rpcs3/util/fence.hpp new file mode 100644 index 0000000000..f502bdee4f --- /dev/null +++ b/rpcs3/util/fence.hpp @@ -0,0 +1,24 @@ +#pragma once + +#include "util/types.hpp" + +#ifdef _M_X64 +extern "C" void _mm_lfence(); +#endif + +namespace utils +{ + inline void lfence() + { +#ifdef _M_X64 + _mm_lfence(); +#elif defined(ARCH_X64) + __builtin_ia32_lfence(); +#elif defined(ARCH_ARM64) + // TODO + __asm__ volatile("isb"); +#else +#error "Missing lfence() implementation" +#endif + } +} diff --git a/rpcs3/util/shared_ptr.hpp b/rpcs3/util/shared_ptr.hpp index dd515ab422..6e3e9dbfb9 100644 --- a/rpcs3/util/shared_ptr.hpp +++ b/rpcs3/util/shared_ptr.hpp @@ -19,10 +19,10 @@ namespace stx class atomic_ptr; // Basic assumption of userspace pointer size - constexpr uint c_ptr_size = 47; + constexpr uint c_ptr_size = 48; // Use lower 17 bits as atomic_ptr internal counter of borrowed refs (pointer itself is shifted) - constexpr uint c_ref_mask = 0x1ffff, c_ref_size = 17; + constexpr uint c_ref_mask = 0xffff, c_ref_size = 16; // Remaining pointer bits constexpr uptr c_ptr_mask = static_cast(-1) << c_ref_size; diff --git a/rpcs3/util/simd.hpp b/rpcs3/util/simd.hpp new file mode 100644 index 0000000000..e65f0793f8 --- /dev/null +++ b/rpcs3/util/simd.hpp @@ -0,0 +1,2143 @@ +#pragma once + +#include "util/types.hpp" +#include "util/v128.hpp" +#include "util/sysinfo.hpp" +#include "Utilities/JIT.h" + +#if defined(ARCH_X64) +#ifdef _MSC_VER +#include +#else +#include +#endif + +#include +#include 
+#include +#endif + +#if defined(ARCH_ARM64) +#include +#endif + +#include +#include + +namespace asmjit +{ + struct vec_builder; +} + +inline thread_local asmjit::vec_builder* g_vc = nullptr; + +namespace asmjit +{ +#if defined(ARCH_X64) + using gpr_type = x86::Gp; + using vec_type = x86::Xmm; + using mem_type = x86::Mem; +#else + struct gpr_type : Operand + { + gpr_type(u32) + { + } + }; + + struct vec_type : Operand + { + vec_type(u32) + { + } + }; + + struct mem_type : Operand + { + }; +#endif + + struct mem_lazy : Operand + { + const Operand& eval(bool is_lv); + }; + + enum class arg_class : u32 + { + reg_lv, // const auto x = gv_...(y, z); + reg_rv, // r = gv_...(y, z); + imm_lv, + imm_rv, + mem_lv, + mem_rv, + }; + + constexpr arg_class operator+(arg_class _base, u32 off) + { + return arg_class(u32(_base) + off); + } + + template + constexpr bool any_operand_v = (std::is_base_of_v> || ...); + + template > + constexpr arg_class arg_classify = + std::is_base_of_v ? arg_class::imm_lv + !std::is_reference_v : + std::is_base_of_v ? arg_class::mem_lv : + std::is_base_of_v ? arg_class::mem_lv + !std::is_reference_v : + std::is_reference_v ? arg_class::reg_lv : arg_class::reg_rv; + + struct vec_builder : native_asm + { + using base = native_asm; + + vec_builder(CodeHolder* ch) + : native_asm(ch) + { + if (!g_vc) + { + g_vc = this; + } + } + + ~vec_builder() + { + if (g_vc == this) + { + g_vc = nullptr; + } + } + + u32 vec_allocated = 0xffffffff << 6; + + vec_type vec_alloc() + { + ensure(~vec_allocated); + const u32 idx = std::countr_one(vec_allocated); + vec_allocated |= vec_allocated + 1; + return vec_type{idx}; + } + + template + std::array vec_alloc() + { + std::array r; + for (auto& x : r) + { + x = vec_alloc(); + } + return r; + } + + void vec_dealloc(vec_type vec) + { + vec_allocated &= ~(1u << vec.id()); + } + + void emit_consts() + { + // (TODO: sort in use order) + for (u32 sz = 1; sz <= 16; sz++) + { + for (auto& [key, _label] : consts[sz - 1]) + { + base::align(AlignMode::kData, 1u << std::countr_zero(sz)); + base::bind(_label); + base::embed(&key, sz); + } + } + } + + std::unordered_map consts[16]{}; + + template + x86::Mem get_const(const T& data, u32 esize = Size) + { + static_assert(Size <= 16); + + // Find existing const + v128 key{}; + std::memcpy(&key, &data, Size); + + if (Size == 16 && esize == 4 && key._u64[0] == key._u64[1] && key._u32[0] == key._u32[1]) + { + x86::Mem r = get_const(key._u32[0]); + r.setBroadcast(x86::Mem::Broadcast::k1To4); + return r; + } + + if (Size == 16 && esize == 8 && key._u64[0] == key._u64[1]) + { + x86::Mem r = get_const(key._u64[0]); + r.setBroadcast(x86::Mem::Broadcast::k1To2); + return r; + } + + auto& _label = consts[Size - 1][key]; + + if (!_label.isValid()) + _label = base::newLabel(); + + return x86::Mem(_label, 0, Size); + } + }; + +#if defined(ARCH_X64) + inline auto arg_eval(const v128& _c, u32 esize) + { + // TODO: implement PSHUFD broadcasts and AVX ones + auto r = g_vc->get_const(_c, esize); + return r; + } + + template requires(std::is_base_of_v>) + inline decltype(auto) arg_eval(T&& mem, u32) + { + return mem.eval(std::is_reference_v); + } + + inline decltype(auto) arg_eval(Operand& mem, u32) + { + return mem; + } + + inline decltype(auto) arg_eval(Operand&& mem, u32) + { + return std::move(mem); + } + + template + vec_type unary_op(x86::Inst::Id op, x86::Inst::Id op2, A&& a, Args&&... 
args) + { + if constexpr (arg_classify == arg_class::reg_rv) + { + if (op) + { + ensure(!g_vc->emit(op, a, std::forward(args)...)); + } + else + { + ensure(!g_vc->emit(op2, a, a, std::forward(args)...)); + } + + return a; + } + else + { + vec_type r = g_vc->vec_alloc(); + + if (op) + { + if (op2 && utils::has_avx()) + { + // Assume op2 is AVX (but could be PSHUFD as well for example) + ensure(!g_vc->emit(op2, r, arg_eval(std::forward(a), 16), std::forward(args)...)); + } + else + { + // TODO + ensure(!g_vc->emit(x86::Inst::Id::kIdMovaps, r, arg_eval(std::forward(a), 16))); + ensure(!g_vc->emit(op, r, std::forward(args)...)); + } + } + else + { + ensure(!g_vc->emit(op2, r, arg_eval(std::forward(a), 16), std::forward(args)...)); + } + + return r; + } + } + + template + void store_op(x86::Inst::Id op, x86::Inst::Id evex_op, D&& d, S&& s) + { + static_assert(arg_classify == arg_class::mem_lv); + + mem_type dst; + dst.copyFrom(arg_eval(std::forward(d), 16)); + + if (utils::has_avx512() && evex_op) + { + if (!dst.hasBaseLabel() && dst.hasOffset() && dst.offset() % dst.size() == 0 && dst.offset() / dst.size() + 128 < 256) + { + ensure(!g_vc->evex().emit(evex_op, dst, arg_eval(std::forward(s), 16))); + return; + } + } + + ensure(!g_vc->emit(op, dst, arg_eval(std::forward(s), 16))); + } + + template + vec_type binary_op(u32 esize, x86::Inst::Id mov_op, x86::Inst::Id sse_op, x86::Inst::Id avx_op, x86::Inst::Id evex_op, A&& a, B&& b, Args&&... args) + { + Operand src1{}; + + if constexpr (arg_classify == arg_class::reg_rv) + { + // Use src1 as a destination + src1 = arg_eval(std::forward(a), 16); + + if (utils::has_avx512() && evex_op && (arg_classify == arg_class::imm_rv || arg_classify == arg_class::mem_rv || b.isMem())) + { + ensure(!g_vc->evex().emit(evex_op, src1, src1, arg_eval(std::forward(b), esize), std::forward(args)...)); + return vec_type{src1.id()}; + } + + if constexpr (arg_classify == arg_class::reg_rv) + { + g_vc->vec_dealloc(vec_type{b.id()}); + } + } + else if (utils::has_avx() && avx_op && (arg_classify == arg_class::reg_lv || arg_classify == arg_class::mem_lv)) + { + if constexpr (arg_classify == arg_class::reg_lv) + { + if constexpr (arg_classify == arg_class::reg_rv) + { + // Use src2 as a destination + src1 = arg_eval(std::forward(b), 16); + } + else + { + // Use new reg as a destination + src1 = g_vc->vec_alloc(); + } + } + else // if A == arg_class::reg_rv + { + src1 = g_vc->vec_alloc(); + + if (!a.isReg()) + { + static_cast(arg_eval(std::forward(a), 16)); + } + + if constexpr (arg_classify == arg_class::reg_rv) + { + g_vc->vec_dealloc(vec_type{b.id()}); + } + } + + if (utils::has_avx512() && evex_op && (arg_classify == arg_class::imm_rv || arg_classify == arg_class::mem_rv || b.isMem())) + { + ensure(!g_vc->evex().emit(evex_op, src1, vec_type{a.id()}, arg_eval(std::forward(b), esize), std::forward(args)...)); + return vec_type{src1.id()}; + } + + ensure(!g_vc->emit(avx_op, src1, vec_type{a.id()}, arg_eval(std::forward(b), 16), std::forward(args)...)); + return vec_type{src1.id()}; + } + else do + { + if constexpr (arg_classify == arg_class::reg_rv) + { + g_vc->vec_dealloc(vec_type{b.id()}); + } + + if (arg_classify == arg_class::mem_rv && a.isReg()) + { + src1 = vec_type(a.id()); + break; + } + + src1 = g_vc->vec_alloc(); + + // Fallback to arg copy + ensure(!g_vc->emit(mov_op, src1, arg_eval(std::forward(a), 16))); + } + while (0); + + if (utils::has_avx512() && evex_op && (arg_classify == arg_class::imm_rv || arg_classify == arg_class::mem_rv || b.isMem())) + { + 
ensure(!g_vc->evex().emit(evex_op, src1, src1, arg_eval(std::forward(b), esize), std::forward(args)...)); + } + else if (sse_op) + { + ensure(!g_vc->emit(sse_op, src1, arg_eval(std::forward(b), 16), std::forward(args)...)); + } + else + { + ensure(!g_vc->emit(avx_op, src1, src1, arg_eval(std::forward(b), 16), std::forward(args)...)); + } + + return vec_type{src1.id()}; + } +#define FOR_X64(f, ...) do { using enum asmjit::x86::Inst::Id; return asmjit::f(__VA_ARGS__); } while (0) +#elif defined(ARCH_ARM64) +#define FOR_X64(...) do {} while (0) +#endif +} + +inline v128 gv_select8(const v128& _cmp, const v128& _true, const v128& _false); +inline v128 gv_select16(const v128& _cmp, const v128& _true, const v128& _false); +inline v128 gv_select32(const v128& _cmp, const v128& _true, const v128& _false); +inline v128 gv_selectfs(const v128& _cmp, const v128& _true, const v128& _false); + +inline void gv_set_zeroing_denormals() +{ +#if defined(ARCH_X64) + u32 cr = _mm_getcsr(); + cr = (cr & ~_MM_FLUSH_ZERO_MASK) | _MM_FLUSH_ZERO_ON; + cr = (cr & ~_MM_DENORMALS_ZERO_MASK) | _MM_DENORMALS_ZERO_ON; + cr = (cr | _MM_MASK_INVALID); + _mm_setcsr(cr); +#elif defined(ARCH_ARM64) + u64 cr; + __asm__ volatile("mrs %0, FPCR" : "=r"(cr)); + cr |= 0x1000000ull; + __asm__ volatile("msr FPCR, %0" :: "r"(cr)); +#else +#error "Not implemented" +#endif +} + +inline void gv_unset_zeroing_denormals() +{ +#if defined(ARCH_X64) + u32 cr = _mm_getcsr(); + cr = (cr & ~_MM_FLUSH_ZERO_MASK) | _MM_FLUSH_ZERO_OFF; + cr = (cr & ~_MM_DENORMALS_ZERO_MASK) | _MM_DENORMALS_ZERO_OFF; + cr = (cr | _MM_MASK_INVALID); + _mm_setcsr(cr); +#elif defined(ARCH_ARM64) + u64 cr; + __asm__ volatile("mrs %0, FPCR" : "=r"(cr)); + cr &= ~0x1000000ull; + __asm__ volatile("msr FPCR, %0" :: "r"(cr)); +#else +#error "Not implemented" +#endif +} + +inline void gv_zeroupper() +{ +#if defined(ARCH_X64) + if (!utils::has_avx()) + return; +#if defined(_M_X64) + _mm256_zeroupper(); +#else + __asm__ volatile("vzeroupper;"); +#endif +#endif +} + +inline v128 gv_bcst8(u8 value) +{ +#if defined(ARCH_X64) + return _mm_set1_epi8(value); +#elif defined(ARCH_ARM64) + return vdupq_n_s8(value); +#endif +} + +inline v128 gv_bcst16(u16 value) +{ +#if defined(ARCH_X64) + return _mm_set1_epi16(value); +#elif defined(ARCH_ARM64) + return vdupq_n_s16(value); +#endif +} + +// Optimized broadcast using constant offset assumption +inline v128 gv_bcst16(const u16& value, auto mptr, auto... 
args) +{ +#if defined(ARCH_X64) + const u32 offset = ::offset32(mptr, args...); + [[maybe_unused]] const __m128i* ptr = reinterpret_cast<__m128i*>(uptr(&value) - offset % 16); +#if !defined(__AVX2__) + if (offset % 16 == 0) + return _mm_shuffle_epi32(_mm_shufflelo_epi16(*ptr, 0), 0); + if (offset % 16 == 2) + return _mm_shuffle_epi32(_mm_shufflelo_epi16(*ptr, 0b01010101), 0); + if (offset % 16 == 4) + return _mm_shuffle_epi32(_mm_shufflelo_epi16(*ptr, 0b10101010), 0); + if (offset % 16 == 6) + return _mm_shuffle_epi32(_mm_shufflelo_epi16(*ptr, 0xff), 0); + if (offset % 16 == 8) + return _mm_shuffle_epi32(_mm_shufflehi_epi16(*ptr, 0), 0xff); + if (offset % 16 == 10) + return _mm_shuffle_epi32(_mm_shufflehi_epi16(*ptr, 0b01010101), 0xff); + if (offset % 16 == 12) + return _mm_shuffle_epi32(_mm_shufflehi_epi16(*ptr, 0b10101010), 0xff); + if (offset % 16 == 14) + return _mm_shuffle_epi32(_mm_shufflehi_epi16(*ptr, 0xff), 0xff); +#endif + return _mm_set1_epi16(value); +#else + static_cast(mptr); + return gv_bcst16(value); +#endif +} + +inline v128 gv_bcst32(u32 value) +{ +#if defined(ARCH_X64) + return _mm_set1_epi32(value); +#elif defined(ARCH_ARM64) + return vdupq_n_s32(value); +#endif +} + +// Optimized broadcast using constant offset assumption +inline v128 gv_bcst32(const u32& value, auto mptr, auto... args) +{ +#if defined(ARCH_X64) + const u32 offset = ::offset32(mptr, args...); + [[maybe_unused]] const __m128i* ptr = reinterpret_cast<__m128i*>(uptr(&value) - offset % 16); +#if !defined(__AVX__) + if (offset % 16 == 0) + return _mm_shuffle_epi32(*ptr, 0); + if (offset % 16 == 4) + return _mm_shuffle_epi32(*ptr, 0b01010101); + if (offset % 16 == 8) + return _mm_shuffle_epi32(*ptr, 0b10101010); + if (offset % 16 == 12) + return _mm_shuffle_epi32(*ptr, 0xff); +#endif + return _mm_set1_epi32(value); +#else + static_cast(mptr); + return gv_bcst32(value); +#endif +} + +inline v128 gv_bcst64(u64 value) +{ +#if defined(ARCH_X64) + return _mm_set1_epi64x(value); +#elif defined(ARCH_ARM64) + return vdupq_n_s64(value); +#endif +} + +// Optimized broadcast using constant offset assumption +inline v128 gv_bcst64(const u64& value, auto mptr, auto... 
args) +{ +#if defined(ARCH_X64) + const u32 offset = ::offset32(mptr, args...); + [[maybe_unused]] const __m128i* ptr = reinterpret_cast<__m128i*>(uptr(&value) - offset % 16); +#if !defined(__AVX__) + if (offset % 16 == 0) + return _mm_shuffle_epi32(*ptr, 0b00010001); + if (offset % 16 == 8) + return _mm_shuffle_epi32(*ptr, 0b10111011); +#endif + return _mm_set1_epi64x(value); +#else + static_cast(mptr); + return gv_bcst64(value); +#endif +} + +inline v128 gv_bcstfs(f32 value) +{ +#if defined(ARCH_X64) + return _mm_set1_ps(value); +#elif defined(ARCH_ARM64) + return vdupq_n_f32(value); +#endif +} + +inline v128 gv_and32(const v128& a, const v128& b) +{ +#if defined(ARCH_X64) + return _mm_and_si128(a, b); +#elif defined(ARCH_ARM64) + return vandq_s32(a, b); +#endif +} + +template requires (asmjit::any_operand_v) +inline auto gv_and32(A&& a, B&& b) +{ + FOR_X64(binary_op, 4, kIdMovdqa, kIdPand, kIdVpand, kIdVpandd, std::forward(a), std::forward(b)); +} + +inline v128 gv_andfs(const v128& a, const v128& b) +{ +#if defined(ARCH_X64) + return _mm_and_ps(a, b); +#elif defined(ARCH_ARM64) + return vandq_s32(a, b); +#endif +} + +template requires (asmjit::any_operand_v) +inline auto gv_andfs(A&& a, B&& b) +{ + FOR_X64(binary_op, 4, kIdMovaps, kIdAndps, kIdVandps, kIdVandps, std::forward(a), std::forward(b)); +} + +inline v128 gv_andn32(const v128& a, const v128& b) +{ +#if defined(ARCH_X64) + return _mm_andnot_si128(a, b); +#elif defined(ARCH_ARM64) + return vbicq_s32(b, a); +#endif +} + +template requires (asmjit::any_operand_v) +inline auto gv_andn32(A&& a, B&& b) +{ + FOR_X64(binary_op, 4, kIdMovdqa, kIdPandn, kIdVpandn, kIdVpandnd, std::forward(a), std::forward(b)); +} + +inline v128 gv_andnfs(const v128& a, const v128& b) +{ +#if defined(ARCH_X64) + return _mm_andnot_ps(a, b); +#elif defined(ARCH_ARM64) + return vbicq_s32(b, a); +#endif +} + +template requires (asmjit::any_operand_v) +inline auto gv_andnfs(A&& a, B&& b) +{ + FOR_X64(binary_op, 4, kIdMovaps, kIdAndnps, kIdVandnps, kIdVandnps, std::forward(a), std::forward(b)); +} + +inline v128 gv_or32(const v128& a, const v128& b) +{ +#if defined(ARCH_X64) + return _mm_or_si128(a, b); +#elif defined(ARCH_ARM64) + return vorrq_s32(a, b); +#endif +} + +template requires (asmjit::any_operand_v) +inline auto gv_or32(A&& a, B&& b) +{ + FOR_X64(binary_op, 4, kIdMovdqa, kIdPor, kIdVpor, kIdVpord, std::forward(a), std::forward(b)); +} + +inline v128 gv_orfs(const v128& a, const v128& b) +{ +#if defined(ARCH_X64) + return _mm_or_ps(a, b); +#elif defined(ARCH_ARM64) + return vorrq_s32(a, b); +#endif +} + +template requires (asmjit::any_operand_v) +inline auto gv_orfs(A&& a, B&& b) +{ + FOR_X64(binary_op, 4, kIdMovaps, kIdOrps, kIdVorps, kIdVorps, std::forward(a), std::forward(b)); +} + +inline v128 gv_xor32(const v128& a, const v128& b) +{ +#if defined(ARCH_X64) + return _mm_xor_si128(a, b); +#elif defined(ARCH_ARM64) + return veorq_s32(a, b); +#endif +} + +template requires (asmjit::any_operand_v) +inline auto gv_xor32(A&& a, B&& b) +{ + FOR_X64(binary_op, 4, kIdMovdqa, kIdPxor, kIdVpxor, kIdVpxord, std::forward(a), std::forward(b)); +} + +inline v128 gv_xorfs(const v128& a, const v128& b) +{ +#if defined(ARCH_X64) + return _mm_xor_ps(a, b); +#elif defined(ARCH_ARM64) + return veorq_s32(a, b); +#endif +} + +template requires (asmjit::any_operand_v) +inline auto gv_xorfs(A&& a, B&& b) +{ + FOR_X64(binary_op, 4, kIdMovaps, kIdXorps, kIdVxorps, kIdVxorps, std::forward(a), std::forward(b)); +} + +inline v128 gv_shl16(const v128& a, u32 count) +{ + if 
(count >= 16) + return v128{}; +#if defined(ARCH_X64) + return _mm_slli_epi16(a, count); +#elif defined(ARCH_ARM64) + return vshlq_s16(a, vdupq_n_s16(count)); +#endif +} + +template requires(asmjit::any_operand_v) +inline auto gv_shl16(A&& a, u32 count) +{ + FOR_X64(unary_op, kIdPsllw, kIdVpsllw, std::forward(a), count); +} + +inline v128 gv_shl32(const v128& a, u32 count) +{ + if (count >= 32) + return v128{}; +#if defined(ARCH_X64) + return _mm_slli_epi32(a, count); +#elif defined(ARCH_ARM64) + return vshlq_s32(a, vdupq_n_s32(count)); +#endif +} + +template requires(asmjit::any_operand_v) +inline auto gv_shl32(A&& a, u32 count) +{ + FOR_X64(unary_op, kIdPslld, kIdVpslld, std::forward(a), count); +} + +inline v128 gv_shl64(const v128& a, u32 count) +{ + if (count >= 64) + return v128{}; +#if defined(ARCH_X64) + return _mm_slli_epi64(a, count); +#elif defined(ARCH_ARM64) + return vshlq_s64(a, vdupq_n_s64(count)); +#endif +} + +template requires(asmjit::any_operand_v) +inline auto gv_shl64(A&& a, u32 count) +{ + FOR_X64(unary_op, kIdPsllq, kIdVpsllq, std::forward(a), count); +} + +inline v128 gv_shr16(const v128& a, u32 count) +{ + if (count >= 16) + return v128{}; +#if defined(ARCH_X64) + return _mm_srli_epi16(a, count); +#elif defined(ARCH_ARM64) + return vshlq_u16(a, vdupq_n_s16(0 - count)); +#endif +} + +template requires(asmjit::any_operand_v) +inline auto gv_shr16(A&& a, u32 count) +{ + FOR_X64(unary_op, kIdPsrlw, kIdVpsrlw, std::forward(a), count); +} + +inline v128 gv_shr32(const v128& a, u32 count) +{ + if (count >= 32) + return v128{}; +#if defined(ARCH_X64) + return _mm_srli_epi32(a, count); +#elif defined(ARCH_ARM64) + return vshlq_u32(a, vdupq_n_s32(0 - count)); +#endif +} + +template requires(asmjit::any_operand_v) +inline auto gv_shr32(A&& a, u32 count) +{ + FOR_X64(unary_op, kIdPsrld, kIdVpsrld, std::forward(a), count); +} + +inline v128 gv_shr64(const v128& a, u32 count) +{ + if (count >= 64) + return v128{}; +#if defined(ARCH_X64) + return _mm_srli_epi64(a, count); +#elif defined(ARCH_ARM64) + return vshlq_u64(a, vdupq_n_s64(0 - count)); +#endif +} + +template requires(asmjit::any_operand_v) +inline auto gv_shr64(A&& a, u32 count) +{ + FOR_X64(unary_op, kIdPsrlq, kIdVpsrlq, std::forward(a), count); +} + +inline v128 gv_sar16(const v128& a, u32 count) +{ + if (count >= 16) + count = 15; +#if defined(ARCH_X64) + return _mm_srai_epi16(a, count); +#elif defined(ARCH_ARM64) + return vshlq_s16(a, vdupq_n_s16(0 - count)); +#endif +} + +template requires(asmjit::any_operand_v) +inline auto gv_sar16(A&& a, u32 count) +{ + FOR_X64(unary_op, kIdPsraw, kIdVpsraw, std::forward(a), count); +} + +inline v128 gv_sar32(const v128& a, u32 count) +{ + if (count >= 32) + count = 31; +#if defined(ARCH_X64) + return _mm_srai_epi32(a, count); +#elif defined(ARCH_ARM64) + return vshlq_s32(a, vdupq_n_s32(0 - count)); +#endif +} + +template requires(asmjit::any_operand_v) +inline auto gv_sar32(A&& a, u32 count) +{ + FOR_X64(unary_op, kIdPsrad, kIdVpsrad, std::forward(a), count); +} + +inline v128 gv_sar64(const v128& a, u32 count) +{ + if (count >= 64) + count = 63; +#if defined(__AVX512VL__) + return _mm_srai_epi64(a, count); +#elif defined(__SSE2__) && !defined(_M_X64) + return static_cast<__v2di>(a) >> count; +#elif defined(ARCH_ARM64) + return vshlq_s64(a, vdupq_n_s64(0 - count)); +#else + v128 r; + r._s64[0] = a._s64[0] >> count; + r._s64[1] = a._s64[1] >> count; + return r; +#endif +} + +inline v128 gv_add8(const v128& a, const v128& b) +{ +#if defined(ARCH_X64) + return _mm_add_epi8(a, b); 
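// Note on the shift helpers above: counts >= the element width are handled
// explicitly (logical shifts return a zero vector, arithmetic shifts clamp to
// width - 1), presumably so callers get well-defined results regardless of how
// the host ISA treats oversized shift counts. gv_sar64 in particular needs extra
// paths because SSE2 provides no 64-bit arithmetic right shift: _mm_srai_epi64 is
// AVX-512VL only, so the helper falls back to the compiler's __v2di
// vector-extension shift where available, vshlq_s64 with a negated count on
// arm64, or a per-lane scalar shift otherwise.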
+#elif defined(ARCH_ARM64) + return vaddq_s8(a, b); +#endif +} + +template requires (asmjit::any_operand_v) +inline auto gv_add8(A&& a, B&& b) +{ + FOR_X64(binary_op, 1, kIdMovdqa, kIdPaddb, kIdVpaddb, kIdNone, std::forward(a), std::forward(b)); +} + +inline v128 gv_add16(const v128& a, const v128& b) +{ +#if defined(ARCH_X64) + return _mm_add_epi16(a, b); +#elif defined(ARCH_ARM64) + return vaddq_s16(a, b); +#endif +} + +template requires (asmjit::any_operand_v) +inline auto gv_add16(A&& a, B&& b) +{ + FOR_X64(binary_op, 2, kIdMovdqa, kIdPaddw, kIdVpaddw, kIdNone, std::forward(a), std::forward(b)); +} + +inline v128 gv_add32(const v128& a, const v128& b) +{ +#if defined(ARCH_X64) + return _mm_add_epi32(a, b); +#elif defined(ARCH_ARM64) + return vaddq_s32(a, b); +#endif +} + +template requires (asmjit::any_operand_v) +inline auto gv_add32(A&& a, B&& b) +{ + FOR_X64(binary_op, 4, kIdMovdqa, kIdPaddd, kIdVpaddd, kIdVpaddd, std::forward(a), std::forward(b)); +} + +inline v128 gv_add64(const v128& a, const v128& b) +{ +#if defined(ARCH_X64) + return _mm_add_epi64(a, b); +#elif defined(ARCH_ARM64) + return vaddq_s64(a, b); +#endif +} + +template requires (asmjit::any_operand_v) +inline auto gv_add64(A&& a, B&& b) +{ + FOR_X64(binary_op, 8, kIdMovdqa, kIdPaddq, kIdVpaddq, kIdVpaddq, std::forward(a), std::forward(b)); +} + +inline v128 gv_adds_s8(const v128& a, const v128& b) +{ +#if defined(ARCH_X64) + return _mm_adds_epi8(a, b); +#elif defined(ARCH_ARM64) + return vqaddq_s8(a, b); +#endif +} + +template requires (asmjit::any_operand_v) +inline auto gv_adds_s8(A&& a, B&& b) +{ + FOR_X64(binary_op, 1, kIdMovdqa, kIdPaddsb, kIdVpaddsb, kIdNone, std::forward(a), std::forward(b)); +} + +inline v128 gv_adds_s16(const v128& a, const v128& b) +{ +#if defined(ARCH_X64) + return _mm_adds_epi16(a, b); +#elif defined(ARCH_ARM64) + return vqaddq_s16(a, b); +#endif +} + +template requires (asmjit::any_operand_v) +inline auto gv_adds_s16(A&& a, B&& b) +{ + FOR_X64(binary_op, 2, kIdMovdqa, kIdPaddsw, kIdVpaddsw, kIdNone, std::forward(a), std::forward(b)); +} + +inline v128 gv_adds_s32(const v128& a, const v128& b) +{ +#if defined(ARCH_X64) + const v128 s = _mm_add_epi32(a, b); + const v128 m = (a ^ s) & (b ^ s); // overflow bit + const v128 x = _mm_srai_epi32(m, 31); // saturation mask + const v128 y = _mm_srai_epi32(_mm_and_si128(s, m), 31); // positive saturation mask + return _mm_xor_si128(_mm_xor_si128(_mm_srli_epi32(x, 1), y), _mm_or_si128(s, x)); +#elif defined(ARCH_ARM64) + return vqaddq_s32(a, b); +#endif +} + +template requires (asmjit::any_operand_v) +inline auto gv_adds_s32(A&& a, B&& b) +{ +#if defined(ARCH_X64) + auto s = gv_add32(a, b); + auto m = gv_and32(gv_xor32(std::forward(a), s), gv_xor32(std::forward(b), s)); + auto x = gv_sar32(m, 31); + auto y = gv_sar32(gv_and32(s, std::move(m)), 31); + auto z = gv_xor32(gv_shr32(x, 1), std::move(y)); + return gv_xor32(std::move(z), gv_or32(std::move(s), std::move(x))); +#endif +} + +inline v128 gv_addus_u8(const v128& a, const v128& b) +{ +#if defined(ARCH_X64) + return _mm_adds_epu8(a, b); +#elif defined(ARCH_ARM64) + return vqaddq_u8(a, b); +#endif +} + +template requires (asmjit::any_operand_v) +inline auto gv_addus_u8(A&& a, B&& b) +{ + FOR_X64(binary_op, 1, kIdMovdqa, kIdPaddusb, kIdVpaddusb, kIdNone, std::forward(a), std::forward(b)); +} + +inline v128 gv_addus_u16(const v128& a, const v128& b) +{ +#if defined(ARCH_X64) + return _mm_adds_epu16(a, b); +#elif defined(ARCH_ARM64) + return vqaddq_u16(a, b); +#endif +} + +template requires 
(asmjit::any_operand_v) +inline auto gv_addus_u16(A&& a, B&& b) +{ + FOR_X64(binary_op, 2, kIdMovdqa, kIdPaddusw, kIdVpaddusw, kIdNone, std::forward(a), std::forward(b)); +} + +inline v128 gv_addus_u32(const v128& a, const v128& b) +{ +#if defined(__SSE4_1__) + return _mm_add_epi32(a, _mm_min_epu32(~a, b)); +#elif defined(ARCH_X64) + const v128 s = _mm_add_epi32(a, b); + return _mm_or_si128(s, _mm_cmpgt_epi32(_mm_xor_si128(b, _mm_set1_epi32(smin)), _mm_xor_si128(a, _mm_set1_epi32(smax)))); +#elif defined(ARCH_ARM64) + return vqaddq_u32(a, b); +#endif +} + +inline v128 gv_addfs(const v128& a, const v128& b) +{ +#if defined(ARCH_X64) + return _mm_add_ps(a, b); +#elif defined(ARCH_ARM64) + return vaddq_f32(a, b); +#endif +} + +inline v128 gv_addfd(const v128& a, const v128& b) +{ +#if defined(ARCH_X64) + return _mm_add_pd(a, b); +#elif defined(ARCH_ARM64) + return vaddq_f64(a, b); +#endif +} + +inline v128 gv_sub8(const v128& a, const v128& b) +{ +#if defined(ARCH_X64) + return _mm_sub_epi8(a, b); +#elif defined(ARCH_ARM64) + return vsubq_s8(a, b); +#endif +} + +inline v128 gv_sub16(const v128& a, const v128& b) +{ +#if defined(ARCH_X64) + return _mm_sub_epi16(a, b); +#elif defined(ARCH_ARM64) + return vsubq_s16(a, b); +#endif +} + +inline v128 gv_sub32(const v128& a, const v128& b) +{ +#if defined(ARCH_X64) + return _mm_sub_epi32(a, b); +#elif defined(ARCH_ARM64) + return vsubq_s32(a, b); +#endif +} + +inline v128 gv_sub64(const v128& a, const v128& b) +{ +#if defined(ARCH_X64) + return _mm_sub_epi64(a, b); +#elif defined(ARCH_ARM64) + return vsubq_s64(a, b); +#endif +} + +inline v128 gv_subs_s8(const v128& a, const v128& b) +{ +#if defined(ARCH_X64) + return _mm_subs_epi8(a, b); +#elif defined(ARCH_ARM64) + return vqsubq_s8(a, b); +#endif +} + +inline v128 gv_subs_s16(const v128& a, const v128& b) +{ +#if defined(ARCH_X64) + return _mm_subs_epi16(a, b); +#elif defined(ARCH_ARM64) + return vqsubq_s16(a, b); +#endif +} + +inline v128 gv_subs_s32(const v128& a, const v128& b) +{ +#if defined(ARCH_X64) + const v128 d = _mm_sub_epi32(a, b); + const v128 m = (a ^ b) & (a ^ d); // overflow bit + const v128 x = _mm_srai_epi32(m, 31); + return _mm_or_si128(_mm_andnot_si128(x, d), _mm_and_si128(x, _mm_xor_si128(_mm_srli_epi32(x, 1), _mm_srai_epi32(a, 31)))); +#elif defined(ARCH_ARM64) + return vqsubq_s32(a, b); +#endif +} + +inline v128 gv_subus_u8(const v128& a, const v128& b) +{ +#if defined(ARCH_X64) + return _mm_subs_epu8(a, b); +#elif defined(ARCH_ARM64) + return vqsubq_u8(a, b); +#endif +} + +inline v128 gv_subus_u16(const v128& a, const v128& b) +{ +#if defined(ARCH_X64) + return _mm_subs_epu16(a, b); +#elif defined(ARCH_ARM64) + return vqsubq_u16(a, b); +#endif +} + +inline v128 gv_subus_u32(const v128& a, const v128& b) +{ +#if defined(__SSE4_1__) + return _mm_sub_epi32(a, _mm_min_epu32(a, b)); +#elif defined(ARCH_X64) + const auto sign = _mm_set1_epi32(smin); + return _mm_andnot_si128(_mm_cmpgt_epi32(_mm_xor_si128(b, sign), _mm_xor_si128(a, sign)), _mm_sub_epi32(a, b)); +#elif defined(ARCH_ARM64) + return vqsubq_u32(a, b); +#endif +} + +inline v128 gv_subfs(const v128& a, const v128& b) +{ +#if defined(ARCH_X64) + return _mm_sub_ps(a, b); +#elif defined(ARCH_ARM64) + return vsubq_f32(a, b); +#endif +} + +inline v128 gv_subfd(const v128& a, const v128& b) +{ +#if defined(ARCH_X64) + return _mm_sub_pd(a, b); +#elif defined(ARCH_ARM64) + return vsubq_f64(a, b); +#endif +} + +inline v128 gv_maxu8(const v128& a, const v128& b) +{ +#if defined(ARCH_X64) + return _mm_max_epu8(a, b); +#elif 
defined(ARCH_ARM64) + return vmaxq_u8(a, b); +#endif +} + +inline v128 gv_maxu16(const v128& a, const v128& b) +{ +#if defined(__SSE4_1__) + return _mm_max_epu16(a, b); +#elif defined(ARCH_X64) + return _mm_add_epi16(_mm_subs_epu16(a, b), b); +#elif defined(ARCH_ARM64) + return vmaxq_u16(a, b); +#endif +} + +inline v128 gv_maxu32(const v128& a, const v128& b) +{ +#if defined(__SSE4_1__) + return _mm_max_epu32(a, b); +#elif defined(ARCH_X64) + const __m128i s = _mm_set1_epi32(smin); + const __m128i m = _mm_cmpgt_epi32(_mm_xor_si128(a, s), _mm_xor_si128(b, s)); + return _mm_or_si128(_mm_and_si128(m, a), _mm_andnot_si128(m, b)); +#elif defined(ARCH_ARM64) + return vmaxq_u32(a, b); +#endif +} + +inline v128 gv_maxs8(const v128& a, const v128& b) +{ +#if defined(__SSE4_1__) + return _mm_max_epi8(a, b); +#elif defined(ARCH_X64) + const __m128i m = _mm_cmpgt_epi8(a, b); + return _mm_or_si128(_mm_and_si128(m, a), _mm_andnot_si128(m, b)); +#elif defined(ARCH_ARM64) + return vmaxq_s8(a, b); +#endif +} + +inline v128 gv_maxs16(const v128& a, const v128& b) +{ +#if defined(ARCH_X64) + return _mm_max_epi16(a, b); +#elif defined(ARCH_ARM64) + return vmaxq_s16(a, b); +#endif +} + +inline v128 gv_maxs32(const v128& a, const v128& b) +{ +#if defined(__SSE4_1__) + return _mm_max_epi32(a, b); +#elif defined(ARCH_X64) + const __m128i m = _mm_cmpgt_epi32(a, b); + return _mm_or_si128(_mm_and_si128(m, a), _mm_andnot_si128(m, b)); +#elif defined(ARCH_ARM64) + return vmaxq_s32(a, b); +#endif +} + +inline v128 gv_maxfs(const v128& a, const v128& b) +{ +#if defined(ARCH_X64) + return _mm_and_ps(_mm_max_ps(a, b), _mm_max_ps(b, a)); +#elif defined(ARCH_ARM64) + return vmaxq_f32(a, b); +#endif +} + +inline v128 gv_minu8(const v128& a, const v128& b) +{ +#if defined(ARCH_X64) + return _mm_min_epu8(a, b); +#elif defined(ARCH_ARM64) + return vminq_u8(a, b); +#endif +} + +inline v128 gv_minu16(const v128& a, const v128& b) +{ +#if defined(__SSE4_1__) + return _mm_min_epu16(a, b); +#elif defined(ARCH_X64) + return _mm_sub_epi16(a, _mm_subs_epu16(a, b)); +#elif defined(ARCH_ARM64) + return vminq_u16(a, b); +#endif +} + +inline v128 gv_minu32(const v128& a, const v128& b) +{ +#if defined(__SSE4_1__) + return _mm_min_epu32(a, b); +#elif defined(ARCH_X64) + const __m128i s = _mm_set1_epi32(smin); + const __m128i m = _mm_cmpgt_epi32(_mm_xor_si128(a, s), _mm_xor_si128(b, s)); + return _mm_or_si128(_mm_andnot_si128(m, a), _mm_and_si128(m, b)); +#elif defined(ARCH_ARM64) + return vminq_u32(a, b); +#endif +} + +inline v128 gv_mins8(const v128& a, const v128& b) +{ +#if defined(__SSE4_1__) + return _mm_min_epi8(a, b); +#elif defined(ARCH_X64) + const __m128i m = _mm_cmpgt_epi8(a, b); + return _mm_or_si128(_mm_andnot_si128(m, a), _mm_and_si128(m, b)); +#elif defined(ARCH_ARM64) + return vminq_s8(a, b); +#endif +} + +inline v128 gv_mins16(const v128& a, const v128& b) +{ +#if defined(ARCH_X64) + return _mm_min_epi16(a, b); +#elif defined(ARCH_ARM64) + return vminq_s16(a, b); +#endif +} + +inline v128 gv_mins32(const v128& a, const v128& b) +{ +#if defined(__SSE4_1__) + return _mm_min_epi32(a, b); +#elif defined(ARCH_X64) + const __m128i m = _mm_cmpgt_epi32(a, b); + return _mm_or_si128(_mm_andnot_si128(m, a), _mm_and_si128(m, b)); +#elif defined(ARCH_ARM64) + return vminq_s32(a, b); +#endif +} + +inline v128 gv_minfs(const v128& a, const v128& b) +{ +#if defined(ARCH_X64) + return _mm_or_ps(_mm_min_ps(a, b), _mm_min_ps(b, a)); +#elif defined(ARCH_ARM64) + return vminq_f32(a, b); +#endif +} + +inline v128 gv_eq8(const v128& a, const 
v128& b) +{ +#if defined(ARCH_X64) + return _mm_cmpeq_epi8(a, b); +#elif defined(ARCH_ARM64) + return vceqq_s8(a, b); +#endif +} + +inline v128 gv_eq16(const v128& a, const v128& b) +{ +#if defined(ARCH_X64) + return _mm_cmpeq_epi16(a, b); +#elif defined(ARCH_ARM64) + return vceqq_s16(a, b); +#endif +} + +inline v128 gv_eq32(const v128& a, const v128& b) +{ +#if defined(ARCH_X64) + return _mm_cmpeq_epi32(a, b); +#elif defined(ARCH_ARM64) + return vceqq_s32(a, b); +#endif +} + +// Ordered and equal +inline v128 gv_eqfs(const v128& a, const v128& b) +{ +#if defined(ARCH_X64) + return _mm_cmpeq_ps(a, b); +#elif defined(ARCH_ARM64) + return vceqq_f32(a, b); +#endif +} + +// Unordered or not equal +inline v128 gv_neqfs(const v128& a, const v128& b) +{ +#if defined(ARCH_X64) + return _mm_cmpneq_ps(a, b); +#elif defined(ARCH_ARM64) + return ~vceqq_f32(a, b); +#endif +} + +inline v128 gv_gtu8(const v128& a, const v128& b) +{ +#if defined(__AVX512VL__) && defined(__AVX512BW__) + return _mm_movm_epi8(_mm_cmpgt_epu8_mask(a, b)); +#elif defined(ARCH_X64) + return _mm_cmpeq_epi8(_mm_cmpeq_epi8(a, _mm_min_epu8(a, b)), _mm_setzero_si128()); +#elif defined(ARCH_ARM64) + return vcgtq_u8(a, b); +#endif +} + +inline v128 gv_gtu16(const v128& a, const v128& b) +{ +#if defined(__AVX512VL__) && defined(__AVX512BW__) + return _mm_movm_epi16(_mm_cmpgt_epu16_mask(a, b)); +#elif defined(__SSE4_1__) + return _mm_cmpeq_epi16(_mm_cmpeq_epi16(a, _mm_min_epu16(a, b)), _mm_setzero_si128()); +#elif defined(ARCH_X64) + return _mm_cmpeq_epi16(_mm_cmpeq_epi16(_mm_subs_epu16(a, b), _mm_setzero_si128()), _mm_setzero_si128()); +#elif defined(ARCH_ARM64) + return vcgtq_u16(a, b); +#endif +} + +inline v128 gv_gtu32(const v128& a, const v128& b) +{ +#if defined(__AVX512VL__) && defined(__AVX512DQ__) + return _mm_movm_epi32(_mm_cmpgt_epu32_mask(a, b)); +#elif defined(__SSE4_1__) + return _mm_cmpeq_epi32(_mm_cmpeq_epi32(a, _mm_min_epu32(a, b)), _mm_setzero_si128()); +#elif defined(ARCH_X64) + const auto sign = _mm_set1_epi32(smin); + return _mm_cmpgt_epi32(_mm_xor_si128(a, sign), _mm_xor_si128(b, sign)); +#elif defined(ARCH_ARM64) + return vcgtq_u32(a, b); +#endif +} + +// Ordered and greater than +inline v128 gv_gtfs(const v128& a, const v128& b) +{ +#if defined(ARCH_X64) + return _mm_cmpgt_ps(a, b); +#elif defined(ARCH_ARM64) + return vcgtq_f32(a, b); +#endif +} + +// Ordered and less than +inline v128 gv_ltfs(const v128& a, const v128& b) +{ +#if defined(ARCH_X64) + return _mm_cmplt_ps(a, b); +#elif defined(ARCH_ARM64) + return vcltq_f32(a, b); +#endif +} + +// Unordered or less or equal +inline v128 gv_ngtfs(const v128& a, const v128& b) +{ +#if defined(ARCH_X64) + return _mm_cmpngt_ps(a, b); +#elif defined(ARCH_ARM64) + return ~vcgtq_f32(a, b); +#endif +} + +// Unordered or greater or equal +inline v128 gv_nlefs(const v128& a, const v128& b) +{ +#if defined(ARCH_X64) + return _mm_cmpnle_ps(a, b); +#elif defined(ARCH_ARM64) + return ~vcleq_f32(a, b); +#endif +} + +inline v128 gv_geu8(const v128& a, const v128& b) +{ +#if defined(ARCH_X64) + return _mm_cmpeq_epi8(b, _mm_min_epu8(a, b)); +#elif defined(ARCH_ARM64) + return vcgeq_u8(a, b); +#endif +} + +inline v128 gv_geu16(const v128& a, const v128& b) +{ +#if defined(__SSE4_1__) + return _mm_cmpeq_epi16(b, _mm_min_epu16(a, b)); +#elif defined(ARCH_X64) + return _mm_cmpeq_epi16(_mm_subs_epu16(b, a), _mm_setzero_si128()); +#elif defined(ARCH_ARM64) + return vcgeq_u16(a, b); +#endif +} + +inline v128 gv_geu32(const v128& a, const v128& b) +{ +#if defined(__SSE4_1__) + return 
_mm_cmpeq_epi32(b, _mm_min_epu32(a, b)); +#elif defined(ARCH_X64) + const auto sign = _mm_set1_epi32(smin); + return _mm_cmpeq_epi32(_mm_cmpgt_epi32(_mm_xor_si128(b, sign), _mm_xor_si128(a, sign)), _mm_setzero_si128()); +#elif defined(ARCH_ARM64) + return vcgeq_u32(a, b); +#endif +} + +// Ordered and not less than +inline v128 gv_gefs(const v128& a, const v128& b) +{ +#if defined(ARCH_X64) + return _mm_cmpge_ps(a, b); +#elif defined(ARCH_ARM64) + return vcgeq_f32(a, b); +#endif +} + +// Unordered or less than +inline v128 gv_ngefs(const v128& a, const v128& b) +{ +#if defined(ARCH_X64) + return _mm_cmpnge_ps(a, b); +#elif defined(ARCH_ARM64) + return ~vcgeq_f32(a, b); +#endif +} + +inline v128 gv_gts8(const v128& a, const v128& b) +{ +#if defined(ARCH_X64) + return _mm_cmpgt_epi8(a, b); +#elif defined(ARCH_ARM64) + return vcgtq_s8(a, b); +#endif +} + +inline v128 gv_gts16(const v128& a, const v128& b) +{ +#if defined(ARCH_X64) + return _mm_cmpgt_epi16(a, b); +#elif defined(ARCH_ARM64) + return vcgtq_s16(a, b); +#endif +} + +inline v128 gv_gts32(const v128& a, const v128& b) +{ +#if defined(ARCH_X64) + return _mm_cmpgt_epi32(a, b); +#elif defined(ARCH_ARM64) + return vcgtq_s32(a, b); +#endif +} + +inline v128 gv_avgu8(const v128& a, const v128& b) +{ +#if defined(ARCH_X64) + return _mm_avg_epu8(a, b); +#elif defined(ARCH_ARM64) + return vrhaddq_u8(a, b); +#endif +} + +inline v128 gv_avgu16(const v128& a, const v128& b) +{ +#if defined(ARCH_X64) + return _mm_avg_epu16(a, b); +#elif defined(ARCH_ARM64) + return vrhaddq_u16(a, b); +#endif +} + +inline v128 gv_avgu32(const v128& a, const v128& b) +{ +#if defined(ARCH_X64) + const auto ones = _mm_set1_epi32(-1); + const auto summ = gv_sub32(gv_add32(a, b), ones); + const auto carry = _mm_slli_epi32(gv_geu32(a, summ), 31); + return _mm_or_si128(carry, _mm_srli_epi32(summ, 1)); +#elif defined(ARCH_ARM64) + return vrhaddq_u32(a, b); +#endif +} + +inline v128 gv_avgs8(const v128& a, const v128& b) +{ +#if defined(ARCH_X64) + const v128 sign = _mm_set1_epi8(smin); + return gv_avgu8(a ^ sign, b ^ sign) ^ sign; +#elif defined(ARCH_ARM64) + return vrhaddq_s8(a, b); +#endif +} + +inline v128 gv_avgs16(const v128& a, const v128& b) +{ +#if defined(ARCH_X64) + const v128 sign = _mm_set1_epi16(smin); + return gv_avgu16(a ^ sign, b ^ sign) ^ sign; +#elif defined(ARCH_ARM64) + return vrhaddq_s16(a, b); +#endif +} + +inline v128 gv_avgs32(const v128& a, const v128& b) +{ +#if defined(ARCH_X64) + const v128 sign = _mm_set1_epi32(smin); + return gv_avgu32(a ^ sign, b ^ sign) ^ sign; +#elif defined(ARCH_ARM64) + return vrhaddq_s32(a, b); +#endif +} + +inline v128 gv_fmafs(const v128& a, const v128& b, const v128& c) +{ +#if defined(ARCH_X64) && defined(__FMA__) + return _mm_fmadd_ps(a, b, c); +#elif defined(__FMA4__) + return _mm_macc_ps(a, b, c); +#elif defined(ARCH_X64) + // This is inaccurate implementation +#ifdef __AVX__ + const __m128 r = _mm256_cvtpd_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_cvtps_pd(a), _mm256_cvtps_pd(b)), _mm256_cvtps_pd(c))); +#else + const __m128d a0 = _mm_cvtps_pd(a); + const __m128d a1 = _mm_cvtps_pd(_mm_movehl_ps(a, a)); + const __m128d b0 = _mm_cvtps_pd(b); + const __m128d b1 = _mm_cvtps_pd(_mm_movehl_ps(b, b)); + const __m128d c0 = _mm_cvtps_pd(c); + const __m128d c1 = _mm_cvtps_pd(_mm_movehl_ps(c, c)); + const __m128d m0 = _mm_mul_pd(a0, b0); + const __m128d m1 = _mm_mul_pd(a1, b1); + const __m128d r0 = _mm_add_pd(m0, c0); + const __m128d r1 = _mm_add_pd(m1, c1); + const __m128 r = _mm_movelh_ps(_mm_cvtpd_ps(r0), _mm_cvtpd_ps(r1)); 
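// Note on the non-FMA x86 path above: the four f32 lanes are widened to f64 (in
// two halves on plain SSE2), multiplied and added in double precision, then
// narrowed back to f32. The f32*f32 product is exact in f64, but the result is
// rounded twice (once on the f64 add, once on the conversion back), which matches
// a true fused multiply-add for most inputs but is not guaranteed in all corner
// cases, hence the existing "inaccurate implementation" remark. The __AVX__
// branch presumably intends the packed-double forms (_mm256_mul_pd and
// _mm256_add_pd); the _ps spellings reproduced here would not type-check against
// the __m256d operands produced by _mm256_cvtps_pd.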
+#endif + return r; +#elif defined(ARCH_ARM64) + return vfmaq_f32(c, a, b); +#else + v128 r; + for (int i = 0; i < 4; i++) + { + r._f[i] = std::fmaf(a._f[i], b._f[i], c._f[i]); + } + return r; +#endif +} + +inline v128 gv_muladdfs(const v128& a, const v128& b, const v128& c) +{ +#if defined(ARCH_X64) && defined(__FMA__) + return _mm_fmadd_ps(a, b, c); +#elif defined(__FMA4__) + return _mm_macc_ps(a, b, c); +#elif defined(ARCH_ARM64) + return vfmaq_f32(c, a, b); +#elif defined(ARCH_X64) + return _mm_add_ps(_mm_mul_ps(a, b), c); +#endif +} + +// -> ssat((a * b * 2 + (c << 16) + 0x8000) >> 16) +inline v128 gv_rmuladds_hds16(const v128& a, const v128& b, const v128& c) +{ +#if defined(ARCH_ARM64) + return vqrdmlahq_s16(c, a, b); +#elif defined(ARCH_X64) + const auto x80 = _mm_set1_epi16(0x80); // 0x80 * 0x80 = 0x4000, add this to the product + const auto al = _mm_unpacklo_epi16(a, x80); + const auto ah = _mm_unpackhi_epi16(a, x80); + const auto bl = _mm_unpacklo_epi16(b, x80); + const auto bh = _mm_unpackhi_epi16(b, x80); + const auto ml = _mm_srai_epi32(_mm_madd_epi16(al, bl), 15); + const auto mh = _mm_srai_epi32(_mm_madd_epi16(ah, bh), 15); + const auto cl = _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), c), 16); + const auto ch = _mm_srai_epi32(_mm_unpackhi_epi16(_mm_setzero_si128(), c), 16); + const auto sl = _mm_add_epi32(ml, cl); + const auto sh = _mm_add_epi32(mh, ch); + return _mm_packs_epi32(sl, sh); +#endif +} + +// -> ssat((a * b * 2 + 0x8000) >> 16) +inline v128 gv_rmuls_hds16(const v128& a, const v128& b) +{ +#if defined(ARCH_ARM64) + return vqrdmulhq_s16(a, b); +#elif defined(ARCH_X64) + const auto x80 = _mm_set1_epi16(0x80); // 0x80 * 0x80 = 0x4000, add this to the product + const auto al = _mm_unpacklo_epi16(a, x80); + const auto ah = _mm_unpackhi_epi16(a, x80); + const auto bl = _mm_unpacklo_epi16(b, x80); + const auto bh = _mm_unpackhi_epi16(b, x80); + const auto ml = _mm_srai_epi32(_mm_madd_epi16(al, bl), 15); + const auto mh = _mm_srai_epi32(_mm_madd_epi16(ah, bh), 15); + return _mm_packs_epi32(ml, mh); +#endif +} + +// -> ssat((a * b * 2) >> 16) +inline v128 gv_muls_hds16(const v128& a, const v128& b) +{ +#if defined(ARCH_ARM64) + return vqdmulhq_s16(a, b); +#elif defined(ARCH_X64) + const auto m = _mm_or_si128(_mm_srli_epi16(_mm_mullo_epi16(a, b), 15), _mm_slli_epi16(_mm_mulhi_epi16(a, b), 1)); + const auto s = _mm_cmpeq_epi16(m, _mm_set1_epi16(-0x8000)); // detect special case (positive 0x8000) + return _mm_xor_si128(m, s); +#endif +} + +inline v128 gv_muladd16(const v128& a, const v128& b, const v128& c) +{ +#if defined(ARCH_X64) + return _mm_add_epi16(_mm_mullo_epi16(a, b), c); +#elif defined(ARCH_ARM64) + return vmlaq_s16(c, a, b); +#endif +} + +inline v128 gv_mul16(const v128& a, const v128& b) +{ +#if defined(ARCH_X64) + return _mm_mullo_epi16(a, b); +#elif defined(ARCH_ARM64) + return vmulq_s16(a, b); +#endif +} + +inline v128 gv_mul32(const v128& a, const v128& b) +{ +#if defined(__SSE4_1__) + return _mm_mullo_epi32(a, b); +#elif defined(ARCH_X64) + const __m128i lows = _mm_shuffle_epi32(_mm_mul_epu32(a, b), 8); + const __m128i highs = _mm_shuffle_epi32(_mm_mul_epu32(_mm_srli_epi64(a, 32), _mm_srli_epi64(b, 32)), 8); + return _mm_unpacklo_epi64(lows, highs); +#elif defined(ARCH_ARM64) + return vmulq_s32(a, b); +#endif +} + +inline v128 gv_mulfs(const v128& a, const v128& b) +{ +#if defined(ARCH_X64) + return _mm_mul_ps(a, b); +#elif defined(ARCH_ARM64) + return vmulq_f32(a, b); +#endif +} + +inline v128 gv_hadds8x2(const v128& a) +{ +#if 
defined(__SSSE3__) + return _mm_maddubs_epi16(_mm_set1_epi8(1), a); +#elif defined(ARCH_X64) + return _mm_add_epi16(_mm_srai_epi16(a, 8), _mm_srai_epi16(_mm_slli_epi16(a, 8), 8)); +#elif defined(ARCH_ARM64) + return vpaddlq_s8(a); +#endif +} + +inline v128 gv_hadds8x4(const v128& a) +{ +#if (defined(__AVX512VL__) && defined(__AVX512VNNI__)) || defined(__AVXVNNI__) + return _mm_dpbusd_epi32(_mm_setzero_si128(), _mm_set1_epi8(1), a); +#elif defined(__SSSE3__) + return _mm_madd_epi16(_mm_maddubs_epi16(_mm_set1_epi8(1), a), _mm_set1_epi16(1)); +#elif defined(ARCH_X64) + return _mm_madd_epi16(_mm_add_epi16(_mm_srai_epi16(a, 8), _mm_srai_epi16(_mm_slli_epi16(a, 8), 8)), _mm_set1_epi16(1)); +#elif defined(ARCH_ARM64) + return vpaddlq_s16(vpaddlq_s8(a)); +#endif +} + +inline v128 gv_haddu8x2(const v128& a) +{ +#if defined(__SSSE3__) + return _mm_maddubs_epi16(a, _mm_set1_epi8(1)); +#elif defined(ARCH_X64) + return _mm_add_epi16(_mm_srli_epi16(a, 8), _mm_and_si128(a, _mm_set1_epi16(0x00ff))); +#elif defined(ARCH_ARM64) + return vpaddlq_u8(a); +#endif +} + +inline v128 gv_haddu8x4(const v128& a) +{ +#if (defined(__AVX512VL__) && defined(__AVX512VNNI__)) || defined(__AVXVNNI__) + return _mm_dpbusd_epi32(_mm_setzero_si128(), a, _mm_set1_epi8(1)); +#elif defined(__SSSE3__) + return _mm_madd_epi16(_mm_maddubs_epi16(a, _mm_set1_epi8(1)), _mm_set1_epi16(1)); +#elif defined(ARCH_X64) + return _mm_madd_epi16(_mm_add_epi16(_mm_srli_epi16(a, 8), _mm_and_si128(a, _mm_set1_epi16(0x00ff))), _mm_set1_epi16(1)); +#elif defined(ARCH_ARM64) + return vpaddlq_u16(vpaddlq_u8(a)); +#endif +} + +inline v128 gv_hadds16x2(const v128& a) +{ +#if defined(ARCH_X64) + return _mm_madd_epi16(a, _mm_set1_epi16(1)); +#elif defined(ARCH_ARM64) + return vpaddlq_s16(a); +#endif +} + +// Unsigned bytes from a, signed bytes from b, 32-bit accumulator c +inline v128 gv_dotu8s8x4(const v128& a, const v128& b, const v128& c) +{ +#if (defined(__AVX512VL__) && defined(__AVX512VNNI__)) || defined(__AVXVNNI__) + return _mm_dpbusd_epi32(c, a, b); +#elif defined(ARCH_X64) + const __m128i ah = _mm_srli_epi16(a, 8); + const __m128i al = _mm_and_si128(a, _mm_set1_epi16(0x00ff)); + const __m128i bh = _mm_srai_epi16(b, 8); + const __m128i bl = _mm_srai_epi16(_mm_slli_epi16(b, 8), 8); + const __m128i mh = _mm_madd_epi16(ah, bh); + const __m128i ml = _mm_madd_epi16(al, bl); + const __m128i x = _mm_add_epi32(mh, ml); + return _mm_add_epi32(c, x); +#elif defined(__ARM_FEATURE_MATMUL_INT8) + return vusdotq_s32(c, a, b); +#elif defined(ARCH_ARM64) + const auto l = vpaddlq_s16(vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))), vmovl_s8(vget_low_s8(b)))); + const auto h = vpaddlq_s16(vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a))), vmovl_s8(vget_high_s8(b)))); + return vaddq_s32(c, vaddq_s32(vuzp1q_s32(l, h), vuzp2q_s32(l, h))); +#endif +} + +inline v128 gv_dotu8x4(const v128& a, const v128& b, const v128& c) +{ +#if defined(ARCH_X64) + const __m128i ah = _mm_srli_epi16(a, 8); + const __m128i al = _mm_and_si128(a, _mm_set1_epi16(0x00ff)); + const __m128i bh = _mm_srli_epi16(b, 8); + const __m128i bl = _mm_and_si128(b, _mm_set1_epi16(0x00ff)); + const __m128i mh = _mm_madd_epi16(ah, bh); + const __m128i ml = _mm_madd_epi16(al, bl); + const __m128i x = _mm_add_epi32(mh, ml); + return _mm_add_epi32(c, x); +#elif defined(__ARM_FEATURE_DOTPROD) + return vdotq_u32(c, a, b); +#elif defined(ARCH_ARM64) + const auto l = vpaddlq_u16(vmulq_u16(vmovl_u8(vget_low_u8(a)), vmovl_u8(vget_low_u8(b)))); + const auto h = 
vpaddlq_u16(vmulq_u16(vmovl_u8(vget_high_u8(a)), vmovl_u8(vget_high_u8(b)))); + return vaddq_u32(c, vaddq_u32(vuzp1q_u32(l, h), vuzp2q_u32(l, h))); +#endif +} + +inline v128 gv_dots16x2(const v128& a, const v128& b) +{ +#if defined(ARCH_X64) + return _mm_madd_epi16(a, b); +#elif defined(ARCH_ARM64) + const auto ml = vmull_s16(vget_low_s16(a), vget_low_s16(b)); + const auto mh = vmull_s16(vget_high_s16(a), vget_high_s16(b)); + const auto sl = vpadd_s32(vget_low_s32(ml), vget_high_s32(ml)); + const auto sh = vpadd_s32(vget_low_s32(mh), vget_high_s32(mh)); + return vcombine_s32(sl, sh); +#endif +} + +// Signed s16 from a and b, 32-bit accumulator c +inline v128 gv_dots16x2(const v128& a, const v128& b, const v128& c) +{ +#if (defined(__AVX512VL__) && defined(__AVX512VNNI__)) || defined(__AVXVNNI__) + return _mm_dpwssd_epi32(c, a, b); +#else + return gv_add32(c, gv_dots16x2(a, b)); +#endif +} + +inline v128 gv_dotu16x2(const v128& a, const v128& b) +{ +#if defined(ARCH_X64) + const auto ml = _mm_mullo_epi16(a, b); // low results + const auto mh = _mm_mulhi_epu16(a, b); // high results + const auto ls = _mm_add_epi32(_mm_srli_epi32(ml, 16), _mm_and_si128(ml, _mm_set1_epi32(0x0000ffff))); + const auto hs = _mm_add_epi32(_mm_slli_epi32(mh, 16), _mm_and_si128(mh, _mm_set1_epi32(0xffff0000))); + return _mm_add_epi32(ls, hs); +#elif defined(ARCH_ARM64) + const auto ml = vmull_u16(vget_low_u16(a), vget_low_u16(b)); + const auto mh = vmull_u16(vget_high_u16(a), vget_high_u16(b)); + const auto sl = vpadd_u32(vget_low_u32(ml), vget_high_u32(ml)); + const auto sh = vpadd_u32(vget_low_u32(mh), vget_high_u32(mh)); + return vcombine_u32(sl, sh); +#endif +} + +// Signed s16 from a and b, 32-bit accumulator c; signed saturation +inline v128 gv_dots_s16x2(const v128& a, const v128& b, const v128& c) +{ +#if (defined(__AVX512VL__) && defined(__AVX512VNNI__)) || defined(__AVXVNNI__) + return _mm_dpwssds_epi32(c, a, b); +#else + const auto ab = gv_dots16x2(a, b); + const auto s0 = gv_adds_s32(ab, c); + const auto s1 = gv_eq32(ab, gv_bcst32(0x80000000)); // +0x80000000, negative c -> c^0x80000000; otherwise 0x7fffffff + const auto s2 = gv_select32(gv_gts32(gv_bcst32(0), c), gv_xor32(c, gv_bcst32(0x80000000)), gv_bcst32(0x7fffffff)); + return gv_select32(s1, s2, s0); +#endif +} + +inline v128 gv_cvts32_tofs(const v128& src) +{ +#if defined(ARCH_X64) + return _mm_cvtepi32_ps(src); +#elif defined(ARCH_ARM64) + return vcvtq_f32_s32(src); +#endif +} + +inline v128 gv_cvtu32_tofs(const v128& src) +{ +#if defined(__AVX512VL__) + return _mm_cvtepu32_ps(src); +#elif defined(ARCH_X64) + const auto fix = _mm_and_ps(_mm_castsi128_ps(_mm_srai_epi32(src, 31)), _mm_set1_ps(0x80000000)); + return _mm_add_ps(_mm_cvtepi32_ps(_mm_and_si128(src, _mm_set1_epi32(0x7fffffff))), fix); +#elif defined(ARCH_ARM64) + return vcvtq_f32_u32(src); +#endif +} + +inline v128 gv_cvtfs_tos32(const v128& src) +{ +#if defined(ARCH_X64) + return _mm_cvttps_epi32(src); +#elif defined(ARCH_ARM64) + return vcvtq_s32_f32(src); +#endif +} + +inline v128 gv_cvtfs_tou32(const v128& src) +{ +#if defined(__AVX512VL__) + return _mm_cvttps_epu32(src); +#elif defined(ARCH_X64) + const auto c1 = _mm_cvttps_epi32(src); + const auto s1 = _mm_srai_epi32(c1, 31); + const auto c2 = _mm_cvttps_epi32(_mm_sub_ps(src, _mm_set1_ps(2147483648.))); + return _mm_or_si128(c1, _mm_and_si128(c2, s1)); +#elif defined(ARCH_ARM64) + return vcvtq_u32_f32(src); +#endif +} + +inline bool gv_testz(const v128& a) +{ +#if defined(__SSE4_1__) + return !!_mm_testz_si128(a, a); +#elif 
defined(ARCH_X64) + return _mm_cvtsi128_si64(_mm_packs_epi32(a, a)) == 0; +#elif defined(ARCH_ARM64) + return std::bit_cast(vqmovn_s32(a)) == 0; +#else + return !(a._u64[0] | a._u64[1]); +#endif +} + +// Same as gv_testz but tuned for pairing with gv_testall1 +inline bool gv_testall0(const v128& a) +{ +#if defined(__SSE4_1__) + return !!_mm_testz_si128(a, _mm_set1_epi32(-1)); +#elif defined(ARCH_X64) + return _mm_cvtsi128_si64(_mm_packs_epi32(a, a)) == 0; +#elif defined(ARCH_ARM64) + return std::bit_cast(vqmovn_s32(a)) == 0; +#else + return !(a._u64[0] | a._u64[1]); +#endif +} + +inline bool gv_testall1(const v128& a) +{ +#if defined(__SSE4_1__) + return !!_mm_test_all_ones(a); +#elif defined(ARCH_X64) + return _mm_cvtsi128_si64(_mm_packs_epi32(a, a)) == -1; +#elif defined(ARCH_ARM64) + return std::bit_cast(vqmovn_s32(a)) == -1; +#else + return (a._u64[0] & a._u64[1]) == UINT64_MAX; +#endif +} + +// result = (~a) & (b) +inline v128 gv_andn(const v128& a, const v128& b) +{ +#if defined(ARCH_X64) + return _mm_andnot_si128(a, b); +#elif defined(ARCH_ARM64) + return vbicq_s32(b, a); +#endif +} + +// Select elements; _cmp must be result of SIMD comparison; undefined otherwise +inline v128 gv_select8(const v128& _cmp, const v128& _true, const v128& _false) +{ +#if defined(__SSE4_1__) + return _mm_blendv_epi8(_false, _true, _cmp); +#elif defined(ARCH_ARM64) + return vbslq_u8(_cmp, _true, _false); +#else + return (_cmp & _true) | gv_andn(_cmp, _false); +#endif +} + +// Select elements; _cmp must be result of SIMD comparison; undefined otherwise +inline v128 gv_select16(const v128& _cmp, const v128& _true, const v128& _false) +{ +#if defined(__SSE4_1__) + return _mm_blendv_epi8(_false, _true, _cmp); +#elif defined(ARCH_ARM64) + return vbslq_u16(_cmp, _true, _false); +#else + return (_cmp & _true) | gv_andn(_cmp, _false); +#endif +} + +// Select elements; _cmp must be result of SIMD comparison; undefined otherwise +inline v128 gv_select32(const v128& _cmp, const v128& _true, const v128& _false) +{ +#if defined(__SSE4_1__) + return _mm_blendv_epi8(_false, _true, _cmp); +#elif defined(ARCH_ARM64) + return vbslq_u32(_cmp, _true, _false); +#else + return (_cmp & _true) | gv_andn(_cmp, _false); +#endif +} + +// Select elements; _cmp must be result of SIMD comparison; undefined otherwise +inline v128 gv_selectfs(const v128& _cmp, const v128& _true, const v128& _false) +{ +#if defined(__SSE4_1__) + return _mm_blendv_ps(_false, _true, _cmp); +#elif defined(ARCH_ARM64) + return vbslq_f32(_cmp, _true, _false); +#else + return _mm_or_ps(_mm_and_ps(_cmp, _true), _mm_andnot_ps(_cmp, _false)); +#endif +} + +inline v128 gv_unpacklo8(const v128& lows, const v128& highs) +{ +#if defined(ARCH_X64) + return _mm_unpacklo_epi8(lows, highs); +#elif defined(ARCH_ARM64) + return vzip1q_s8(lows, highs); +#endif +} + +inline v128 gv_extend_lo_s8(const v128& vec) +{ +#if defined(__SSE4_1__) + return _mm_cvtepi8_epi16(vec); +#elif defined(ARCH_X64) + return _mm_srai_epi16(_mm_unpacklo_epi8(_mm_undefined_si128(), vec), 8); +#elif defined(ARCH_ARM64) + return int16x8_t(vmovl_s8(vget_low_s8(vec))); +#endif +} + +inline v128 gv_extend_hi_s8(const v128& vec) +{ +#if defined(__SSE4_1__) + return _mm_cvtepi8_epi16(_mm_loadu_si64(vec._bytes + 8)); +#elif defined(ARCH_X64) + return _mm_srai_epi16(_mm_unpackhi_epi8(_mm_undefined_si128(), vec), 8); +#elif defined(ARCH_ARM64) + return int16x8_t(vmovl_s8(vget_high_s8(vec))); +#endif +} + +inline v128 gv_unpacklo16(const v128& lows, const v128& highs) +{ +#if defined(ARCH_X64) + return 
_mm_unpacklo_epi16(lows, highs); +#elif defined(ARCH_ARM64) + return vzip1q_s16(lows, highs); +#endif +} + +inline v128 gv_extend_lo_s16(const v128& vec) +{ +#if defined(__SSE4_1__) + return _mm_cvtepi16_epi32(vec); +#elif defined(ARCH_X64) + return _mm_srai_epi32(_mm_unpacklo_epi16(_mm_undefined_si128(), vec), 16); +#elif defined(ARCH_ARM64) + return int32x4_t(vmovl_s16(vget_low_s16(vec))); +#endif +} + +inline v128 gv_extend_hi_s16(const v128& vec) +{ +#if defined(__SSE4_1__) + return _mm_cvtepi16_epi32(_mm_loadu_si64(vec._bytes + 8)); +#elif defined(ARCH_X64) + return _mm_srai_epi32(_mm_unpackhi_epi16(_mm_undefined_si128(), vec), 16); +#elif defined(ARCH_ARM64) + return int32x4_t(vmovl_s16(vget_high_s16(vec))); +#endif +} + +inline v128 gv_unpacklo32(const v128& lows, const v128& highs) +{ +#if defined(ARCH_X64) + return _mm_unpacklo_epi32(lows, highs); +#elif defined(ARCH_ARM64) + return vzip1q_s32(lows, highs); +#endif +} + +inline v128 gv_unpackhi8(const v128& lows, const v128& highs) +{ +#if defined(ARCH_X64) + return _mm_unpackhi_epi8(lows, highs); +#elif defined(ARCH_ARM64) + return vzip2q_s8(lows, highs); +#endif +} + +inline v128 gv_unpackhi16(const v128& lows, const v128& highs) +{ +#if defined(ARCH_X64) + return _mm_unpackhi_epi16(lows, highs); +#elif defined(ARCH_ARM64) + return vzip2q_s16(lows, highs); +#endif +} + +inline v128 gv_unpackhi32(const v128& lows, const v128& highs) +{ +#if defined(ARCH_X64) + return _mm_unpackhi_epi32(lows, highs); +#elif defined(ARCH_ARM64) + return vzip2q_s32(lows, highs); +#endif +} + +inline bool v128::operator==(const v128& b) const +{ +#if defined(ARCH_X64) + return gv_testz(_mm_xor_si128(*this, b)); +#else + return gv_testz(*this ^ b); +#endif +} + +inline v128 v128::operator|(const v128& rhs) const +{ +#if defined(ARCH_X64) + return _mm_or_si128(*this, rhs); +#elif defined(ARCH_ARM64) + return vorrq_s32(*this, rhs); +#endif +} + +inline v128 v128::operator&(const v128& rhs) const +{ +#if defined(ARCH_X64) + return _mm_and_si128(*this, rhs); +#elif defined(ARCH_ARM64) + return vandq_s32(*this, rhs); +#endif +} + +inline v128 v128::operator^(const v128& rhs) const +{ +#if defined(ARCH_X64) + return _mm_xor_si128(*this, rhs); +#elif defined(ARCH_ARM64) + return veorq_s32(*this, rhs); +#endif +} + +inline v128 v128::operator~() const +{ +#if defined(ARCH_X64) + return _mm_xor_si128(*this, _mm_set1_epi32(-1)); +#elif defined(ARCH_ARM64) + return vmvnq_u32(*this); +#endif +} + +inline v128 gv_exp2_approxfs(const v128& a) +{ + // TODO +#if 0 + const auto x0 = _mm_max_ps(_mm_min_ps(a, _mm_set1_ps(127.4999961f)), _mm_set1_ps(-127.4999961f)); + const auto x1 = _mm_add_ps(x0, _mm_set1_ps(0.5f)); + const auto x2 = _mm_sub_epi32(_mm_cvtps_epi32(x1), _mm_and_si128(_mm_castps_si128(_mm_cmpnlt_ps(_mm_setzero_ps(), x1)), _mm_set1_epi32(1))); + const auto x3 = _mm_sub_ps(x0, _mm_cvtepi32_ps(x2)); + const auto x4 = _mm_mul_ps(x3, x3); + const auto x5 = _mm_mul_ps(x3, _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(x4, _mm_set1_ps(0.023093347705f)), _mm_set1_ps(20.20206567f)), x4), _mm_set1_ps(1513.906801f))); + const auto x6 = _mm_mul_ps(x5, _mm_rcp_ps(_mm_sub_ps(_mm_add_ps(_mm_mul_ps(_mm_set1_ps(233.1842117f), x4), _mm_set1_ps(4368.211667f)), x5))); + return _mm_mul_ps(_mm_add_ps(_mm_add_ps(x6, x6), _mm_set1_ps(1.0f)), _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(x2, _mm_set1_epi32(127)), 23))); +#else + v128 r; + for (u32 i = 0; i < 4; i++) + r._f[i] = std::exp2f(a._f[i]); + return r; +#endif +} + +inline v128 gv_log2_approxfs(const v128& a) +{ + // TODO 
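// Note: the "#if 0" block that follows is a disabled SSE rational-polynomial
// approximation of log2 (gv_exp2_approxfs above carries the matching disabled
// approximation for exp2). Until those are enabled, both helpers fall back to a
// scalar std::exp2f/std::log2f loop over the four lanes, as flagged by the TODO.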
+#if 0
+	const auto _1 = _mm_set1_ps(1.0f);
+	const auto _c = _mm_set1_ps(1.442695040f);
+	const auto x0 = _mm_max_ps(a, _mm_castsi128_ps(_mm_set1_epi32(0x00800000)));
+	const auto x1 = _mm_or_ps(_mm_and_ps(x0, _mm_castsi128_ps(_mm_set1_epi32(0x807fffff))), _1);
+	const auto x2 = _mm_rcp_ps(_mm_add_ps(x1, _1));
+	const auto x3 = _mm_mul_ps(_mm_sub_ps(x1, _1), x2);
+	const auto x4 = _mm_add_ps(x3, x3);
+	const auto x5 = _mm_mul_ps(x4, x4);
+	const auto x6 = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_set1_ps(-0.7895802789f), x5), _mm_set1_ps(16.38666457f)), x5), _mm_set1_ps(-64.1409953f));
+	const auto x7 = _mm_rcp_ps(_mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_set1_ps(-35.67227983f), x5), _mm_set1_ps(312.0937664f)), x5), _mm_set1_ps(-769.6919436f)));
+	const auto x8 = _mm_cvtepi32_ps(_mm_sub_epi32(_mm_srli_epi32(_mm_castps_si128(x0), 23), _mm_set1_epi32(127)));
+	return _mm_add_ps(_mm_mul_ps(_mm_mul_ps(_mm_mul_ps(_mm_mul_ps(x5, x6), x7), x4), _c), _mm_add_ps(_mm_mul_ps(x4, _c), x8));
+#else
+	v128 r;
+	for (u32 i = 0; i < 4; i++)
+		r._f[i] = std::log2f(a._f[i]);
+	return r;
+#endif
+}
diff --git a/rpcs3/util/sysinfo.cpp b/rpcs3/util/sysinfo.cpp
index 2b04395172..457cb0d417 100755
--- a/rpcs3/util/sysinfo.cpp
+++ b/rpcs3/util/sysinfo.cpp
@@ -19,15 +19,14 @@
 #endif
 
 #include "util/asm.hpp"
+#include "util/fence.hpp"
 
-#ifdef _MSC_VER
-extern "C"
-{
-	u64 _xgetbv(u32);
-}
+#ifdef _M_X64
+extern "C" u64 _xgetbv(u32);
 #endif
 
-inline std::array<u32, 4> utils::get_cpuid(u32 func, u32 subfunc)
+#if defined(ARCH_X64)
+static inline std::array<u32, 4> get_cpuid(u32 func, u32 subfunc)
 {
 	int regs[4];
 #ifdef _MSC_VER
@@ -38,7 +37,7 @@ inline std::array<u32, 4> utils::get_cpuid(u32 func, u32 subfunc)
 	return {0u+regs[0], 0u+regs[1], 0u+regs[2], 0u+regs[3]};
 }
 
-inline u64 utils::get_xgetbv(u32 xcr)
+static inline u64 get_xgetbv(u32 xcr)
 {
 #ifdef _MSC_VER
 	return _xgetbv(xcr);
@@ -48,6 +47,7 @@ inline u64 utils::get_xgetbv(u32 xcr)
 	return eax | (u64(edx) << 32);
 #endif
 }
+#endif
 
 #ifdef __APPLE__
 // sysinfo_darwin.mm
@@ -61,113 +61,192 @@ namespace Darwin_Version
 
 bool utils::has_ssse3()
 {
+#if defined(ARCH_X64)
 	static const bool g_value = get_cpuid(0, 0)[0] >= 0x1 && get_cpuid(1, 0)[2] & 0x200;
 	return g_value;
+#else
+	return false;
+#endif
 }
 
 bool utils::has_sse41()
 {
+#if defined(ARCH_X64)
 	static const bool g_value = get_cpuid(0, 0)[0] >= 0x1 && get_cpuid(1, 0)[2] & 0x80000;
 	return g_value;
+#else
+	return false;
+#endif
 }
 
 bool utils::has_avx()
 {
+#if defined(ARCH_X64)
 	static const bool g_value = get_cpuid(0, 0)[0] >= 0x1 && get_cpuid(1, 0)[2] & 0x10000000 && (get_cpuid(1, 0)[2] & 0x0C000000) == 0x0C000000 && (get_xgetbv(0) & 0x6) == 0x6;
 	return g_value;
+#else
+	return false;
+#endif
 }
 
 bool utils::has_avx2()
 {
+#if defined(ARCH_X64)
 	static const bool g_value = get_cpuid(0, 0)[0] >= 0x7 && get_cpuid(7, 0)[1] & 0x20 && (get_cpuid(1, 0)[2] & 0x0C000000) == 0x0C000000 && (get_xgetbv(0) & 0x6) == 0x6;
 	return g_value;
+#else
+	return false;
+#endif
 }
 
 bool utils::has_rtm()
 {
+#if defined(ARCH_X64)
 	static const bool g_value = get_cpuid(0, 0)[0] >= 0x7 && (get_cpuid(7, 0)[1] & 0x800) == 0x800;
 	return g_value;
+#elif defined(ARCH_ARM64)
+	return false;
+#endif
 }
 
 bool utils::has_tsx_force_abort()
 {
+#if defined(ARCH_X64)
 	static const bool g_value = get_cpuid(0, 0)[0] >= 0x7 && (get_cpuid(7, 0)[3] & 0x2000) == 0x2000;
 	return g_value;
+#else
+	return false;
+#endif
 }
 
 bool utils::has_rtm_always_abort()
 {
+#if defined(ARCH_X64)
 	static const bool g_value = get_cpuid(0, 0)[0] >= 0x7 && (get_cpuid(7, 0)[3] & 0x800) == 0x800;
 	return g_value;
+#else
+	return false;
+#endif
 }
 
 bool utils::has_mpx()
 {
+#if defined(ARCH_X64)
 	static const bool g_value = get_cpuid(0, 0)[0] >= 0x7 && (get_cpuid(7, 0)[1] & 0x4000) == 0x4000;
 	return g_value;
+#else
+	return false;
+#endif
 }
 
 bool utils::has_avx512()
 {
+#if defined(ARCH_X64)
 	// Check AVX512F, AVX512CD, AVX512DQ, AVX512BW, AVX512VL extensions (Skylake-X level support)
 	static const bool g_value = get_cpuid(0, 0)[0] >= 0x7 && (get_cpuid(7, 0)[1] & 0xd0030000) == 0xd0030000 && (get_cpuid(1, 0)[2] & 0x0C000000) == 0x0C000000 && (get_xgetbv(0) & 0xe6) == 0xe6;
 	return g_value;
+#else
+	return false;
+#endif
 }
 
 bool utils::has_avx512_icl()
 {
+#if defined(ARCH_X64)
 	// Check AVX512IFMA, AVX512VBMI, AVX512VBMI2, AVX512VPOPCNTDQ, AVX512BITALG, AVX512VNNI, AVX512VPCLMULQDQ, AVX512GFNI, AVX512VAES (Icelake-client level support)
 	static const bool g_value = has_avx512() && (get_cpuid(7, 0)[1] & 0x00200000) == 0x00200000 && (get_cpuid(7, 0)[2] & 0x00005f42) == 0x00005f42;
 	return g_value;
+#else
+	return false;
+#endif
+}
+
+bool utils::has_avx512_vnni()
+{
+#if defined(ARCH_X64)
+	// Check AVX512VNNI
+	static const bool g_value = has_avx512() && get_cpuid(7, 0)[2] & 0x00000800;
+	return g_value;
+#else
+	return false;
+#endif
 }
 
 bool utils::has_xop()
 {
+#if defined(ARCH_X64)
 	static const bool g_value = has_avx() && get_cpuid(0x80000001, 0)[2] & 0x800;
 	return g_value;
+#else
+	return false;
+#endif
 }
 
 bool utils::has_clwb()
 {
+#if defined(ARCH_X64)
 	static const bool g_value = get_cpuid(0, 0)[0] >= 0x7 && (get_cpuid(7, 0)[1] & 0x1000000) == 0x1000000;
 	return g_value;
+#else
+	return false;
+#endif
 }
 
 bool utils::has_invariant_tsc()
 {
+#if defined(ARCH_X64)
 	static const bool g_value = get_cpuid(0, 0)[0] >= 0x7 && (get_cpuid(0x80000007, 0)[3] & 0x100) == 0x100;
 	return g_value;
+#elif defined(ARCH_ARM64)
+	return true;
+#endif
 }
 
 bool utils::has_fma3()
 {
+#if defined(ARCH_X64)
 	static const bool g_value = get_cpuid(0, 0)[0] >= 0x1 && get_cpuid(1, 0)[2] & 0x1000;
 	return g_value;
+#elif defined(ARCH_ARM64)
+	return true;
+#endif
 }
 
 bool utils::has_fma4()
 {
+#if defined(ARCH_X64)
 	static const bool g_value = get_cpuid(0, 0)[0] >= 0x7 && (get_cpuid(0x80000001, 0)[2] & 0x10000) == 0x10000;
 	return g_value;
+#else
+	return false;
+#endif
 }
 
 bool utils::has_erms()
 {
+#if defined(ARCH_X64)
 	static const bool g_value = get_cpuid(0, 0)[0] >= 0x7 && (get_cpuid(7, 0)[1] & 0x200) == 0x200;
 	return g_value;
+#else
+	return false;
+#endif
 }
 
 bool utils::has_fsrm()
 {
+#if defined(ARCH_X64)
 	static const bool g_value = get_cpuid(0, 0)[0] >= 0x7 && (get_cpuid(7, 0)[3] & 0x10) == 0x10;
 	return g_value;
+#else
+	return false;
+#endif
 }
 
 u32 utils::get_rep_movsb_threshold()
 {
 	static const u32 g_value = []()
 	{
-		u32 thresh_value = 0xFFFFFFFF;
+		u32 thresh_value = umax;
 		if (has_fsrm())
 		{
 			thresh_value = 2047;
@@ -187,6 +266,7 @@ std::string utils::get_cpu_brand()
 {
 	std::string brand;
 
+#if defined(ARCH_X64)
 	if (get_cpuid(0x80000000, 0)[0] >= 0x80000004)
 	{
 		for (u32 i = 0; i < 3; i++)
@@ -198,6 +278,9 @@ std::string utils::get_cpu_brand()
 	{
 		brand = "Unknown CPU";
 	}
+#else
+	brand = "Unidentified CPU";
+#endif
 
 	brand.erase(brand.find_last_not_of('\0') + 1);
 	brand.erase(brand.find_last_not_of(' ') + 1);
@@ -396,19 +479,6 @@ static constexpr ullong round_tsc(ullong val)
 	return utils::rounded_div(val, 1'000'000) * 1'000'000;
 }
 
-#ifdef _MSC_VER
-extern "C" void _mm_lfence();
-#endif
-
-static inline void lfence()
-{
-#ifdef _MSC_VER
-	_mm_lfence();
-#else
-	__builtin_ia32_lfence();
-#endif
-}
-
 ullong utils::get_tsc_freq()
 {
 	static const ullong cal_tsc = []() -> ullong
@@ -449,17 +519,17 @@ ullong utils::get_tsc_freq()
 		{
 #ifdef _WIN32
 			Sleep(1);
-			error_data[i] = (lfence(), utils::get_tsc());
+			error_data[i] = (utils::lfence(), utils::get_tsc());
 			LARGE_INTEGER ctr;
 			QueryPerformanceCounter(&ctr);
-			rdtsc_data[i] = (lfence(), utils::get_tsc());
+			rdtsc_data[i] = (utils::lfence(), utils::get_tsc());
 			timer_data[i] = ctr.QuadPart;
 #else
 			usleep(200);
-			error_data[i] = (lfence(), utils::get_tsc());
+			error_data[i] = (utils::lfence(), utils::get_tsc());
 			struct timespec ts;
 			clock_gettime(CLOCK_MONOTONIC, &ts);
-			rdtsc_data[i] = (lfence(), utils::get_tsc());
+			rdtsc_data[i] = (utils::lfence(), utils::get_tsc());
 			timer_data[i] = ts.tv_nsec + (ts.tv_sec - sec_base) * 1'000'000'000;
 #endif
 		}
@@ -511,6 +581,7 @@ u32 utils::get_thread_count()
 
 u32 utils::get_cpu_family()
 {
+#if defined(ARCH_X64)
 	static const u32 g_value = []()
 	{
 		const u32 reg_value = get_cpuid(0x00000001, 0)[0]; // Processor feature info
@@ -528,10 +599,14 @@ u32 utils::get_cpu_family()
 	}();
 
 	return g_value;
+#elif defined(ARCH_ARM64)
+	return 0;
+#endif
 }
 
 u32 utils::get_cpu_model()
 {
+#if defined(ARCH_X64)
 	static const u32 g_value = []()
 	{
 		const u32 reg_value = get_cpuid(0x00000001, 0)[0]; // Processor feature info
@@ -550,16 +625,19 @@ u32 utils::get_cpu_model()
 	}();
 
 	return g_value;
+#elif defined(ARCH_ARM64)
+	return 0;
+#endif
 }
 
 namespace utils
 {
 	extern const u64 main_tid = []() -> u64
 	{
-	#ifdef _WIN32
+#ifdef _WIN32
 		return GetCurrentThreadId();
-	#else
+#else
 		return reinterpret_cast<u64>(pthread_self());
-	#endif
+#endif
 	}();
 }
diff --git a/rpcs3/util/sysinfo.hpp b/rpcs3/util/sysinfo.hpp
index 987da95b68..3aabc97311 100755
--- a/rpcs3/util/sysinfo.hpp
+++ b/rpcs3/util/sysinfo.hpp
@@ -5,10 +5,6 @@
 
 namespace utils
 {
-	std::array<u32, 4> get_cpuid(u32 func, u32 subfunc);
-
-	u64 get_xgetbv(u32 xcr);
-
 	bool has_ssse3();
 
 	bool has_sse41();
@@ -20,7 +16,7 @@ namespace utils
 	bool has_rtm();
 
 	bool has_tsx_force_abort();
-
+
 	bool has_rtm_always_abort();
 
 	bool has_mpx();
@@ -29,6 +25,8 @@ namespace utils
 
 	bool has_avx512_icl();
 
+	bool has_avx512_vnni();
+
 	bool has_xop();
 
 	bool has_clwb();
diff --git a/rpcs3/util/tsc.hpp b/rpcs3/util/tsc.hpp
new file mode 100644
index 0000000000..80d2c51f2f
--- /dev/null
+++ b/rpcs3/util/tsc.hpp
@@ -0,0 +1,25 @@
+#pragma once
+
+#include "util/types.hpp"
+
+#ifdef _M_X64
+extern "C" u64 __rdtsc();
+#endif
+
+namespace utils
+{
+	inline u64 get_tsc()
+	{
+#if defined(ARCH_ARM64)
+		u64 r = 0;
+		__asm__ volatile("mrs %0, cntvct_el0" : "=r" (r));
+		return r;
+#elif defined(_M_X64)
+		return __rdtsc();
+#elif defined(ARCH_X64)
+		return __builtin_ia32_rdtsc();
+#else
+#error "Missing utils::get_tsc() implementation"
+#endif
+	}
+}
diff --git a/rpcs3/util/types.hpp b/rpcs3/util/types.hpp
index e4f7558ceb..017b611986 100644
--- a/rpcs3/util/types.hpp
+++ b/rpcs3/util/types.hpp
@@ -12,6 +12,12 @@
 #include
 #include
 
+#if defined(__SSE2__) || defined(_M_X64) || defined(_M_AMD64) || defined(__x86_64__) || defined(__amd64__)
+#define ARCH_X64 1
+#elif defined(__aarch64__) || defined(__arm64__) || defined(_M_ARM64)
+#define ARCH_ARM64 1
+#endif
+
 using std::chrono::steady_clock;
 
 using namespace std::literals;
@@ -180,15 +186,15 @@ public:
 	}
 };
 
-#ifndef _MSC_VER
-
-using u128 = __uint128_t;
-using s128 = __int128_t;
-
+#if defined(ARCH_X64) && !defined(_MSC_VER)
 using __m128i = long long __attribute__((vector_size(16)));
 using __m128d = double __attribute__((vector_size(16)));
 using __m128 = float __attribute__((vector_size(16)));
+#endif
+
+#ifndef _MSC_VER
+using u128 = __uint128_t;
+using s128 = __int128_t;
 #else
 extern "C"
diff --git a/rpcs3/util/v128.hpp b/rpcs3/util/v128.hpp
index 4273d8f3d2..8ea3b335b0 100644
--- a/rpcs3/util/v128.hpp
+++ b/rpcs3/util/v128.hpp
@@ -2,6 +2,9 @@
 
 #include "util/types.hpp"
 
+template <typename T>
+concept Vector128 = (sizeof(T) == 16) && (std::is_trivial_v<T>);
+
 // 128-bit vector type
 union alignas(16) v128
 {
@@ -58,39 +61,23 @@ union alignas(16) v128
 	u128 _u;
 	s128 _s;
 
-#ifdef _MSC_VER
-	template <typename T>
-	struct opaque_wrapper
+	v128() = default;
+
+	constexpr v128(const v128&) noexcept = default;
+
+	template <Vector128 T>
+	constexpr v128(const T& rhs) noexcept
+		: v128(std::bit_cast<v128>(rhs))
 	{
-		u128 m_data;
+	}
 
-		opaque_wrapper() = default;
+	constexpr v128& operator=(const v128&) noexcept = default;
 
-		opaque_wrapper(const T& value)
-			: m_data(std::bit_cast<u128>(value))
-		{
-		}
-
-		opaque_wrapper& operator=(const T& value)
-		{
-			m_data = std::bit_cast<u128>(value);
-			return *this;
-		}
-
-		operator T() const
-		{
-			return std::bit_cast<T>(m_data);
-		}
-	};
-
-	opaque_wrapper<__m128> vf;
-	opaque_wrapper<__m128i> vi;
-	opaque_wrapper<__m128d> vd;
-#else
-	__m128 vf;
-	__m128i vi;
-	__m128d vd;
-#endif
+	template <Vector128 T>
+	constexpr operator T() const noexcept
+	{
+		return std::bit_cast<T>(*this);
+	}
 
 	using enable_bitcopy = std::true_type;
@@ -107,6 +94,14 @@ union alignas(16) v128
 		return from64(_0, _1);
 	}
 
+	static v128 from64p(u64 value)
+	{
+		v128 ret;
+		ret._u64[0] = value;
+		ret._u64[1] = value;
+		return ret;
+	}
+
 	static v128 from32(u32 _0, u32 _1 = 0, u32 _2 = 0, u32 _3 = 0)
 	{
 		v128 ret;
@@ -132,6 +127,16 @@ union alignas(16) v128
 		return ret;
 	}
 
+	static v128 fromf32p(f32 value)
+	{
+		v128 ret;
+		ret._f[0] = value;
+		ret._f[1] = value;
+		ret._f[2] = value;
+		ret._f[3] = value;
+		return ret;
+	}
+
 	static v128 from16p(u16 value)
 	{
 		v128 ret;
@@ -153,11 +158,18 @@ union alignas(16) v128
 		return ret;
 	}
 
-	static inline v128 fromV(const __m128i& value);
-
-	static inline v128 fromF(const __m128& value);
-
-	static inline v128 fromD(const __m128d& value);
+	static v128 undef()
+	{
+#if defined(__GNUC__) || defined(__clang__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wuninitialized"
+#endif
+		v128 ret;
+		return ret;
+#if defined(__GNUC__) || defined(__clang__)
+#pragma GCC diagnostic pop
+#endif
+	}
 
 	// Unaligned load with optional index offset
 	static v128 loadu(const void* ptr, usz index = 0)
@@ -173,45 +185,13 @@ union alignas(16) v128
 		std::memcpy(static_cast<u8*>(ptr) + index * sizeof(v128), &value, sizeof(v128));
 	}
 
-	static inline v128 add8(const v128& left, const v128& right);
-
-	static inline v128 add16(const v128& left, const v128& right);
-
-	static inline v128 add32(const v128& left, const v128& right);
-
-	static inline v128 addfs(const v128& left, const v128& right);
-
-	static inline v128 addfd(const v128& left, const v128& right);
-
-	static inline v128 sub8(const v128& left, const v128& right);
-
-	static inline v128 sub16(const v128& left, const v128& right);
-
-	static inline v128 sub32(const v128& left, const v128& right);
-
-	static inline v128 subfs(const v128& left, const v128& right);
-
-	static inline v128 subfd(const v128& left, const v128& right);
-
-	static inline v128 maxu8(const v128& left, const v128& right);
-
-	static inline v128 minu8(const v128& left, const v128& right);
-
-	static inline v128 eq8(const v128& left, const v128& right);
-
-	static inline v128 eq16(const v128& left, const v128& right);
-
-	static inline v128 eq32(const v128& left, const v128& right);
-
-	static inline v128 eq32f(const v128& left, const v128& right);
-
-	static inline v128 fma32f(v128 a, const v128& b, const v128& c);
+	v128 operator|(const v128&) const;
+	v128 operator&(const v128&) const;
+	v128 operator^(const v128&) const;
+	v128 operator~() const;
 
 	bool operator==(const v128& right) const;
 
-	// result = (~left) & (right)
-	static inline v128 andnot(const v128& left, const v128& right);
-
 	void clear()
 	{
 		*this = {};
@@ -227,3 +207,12 @@ struct offset32_array>
 		return u32{sizeof(T)} * (static_cast(arg) ^ static_cast(M));
 	}
 };
+
+template <>
+struct std::hash<v128>
+{
+	usz operator()(const v128& key) const
+	{
+		return key._u64[0] + key._u64[1];
+	}
+};
diff --git a/rpcs3/util/v128sse.hpp b/rpcs3/util/v128sse.hpp
deleted file mode 100644
index 006dbf6f57..0000000000
--- a/rpcs3/util/v128sse.hpp
+++ /dev/null
@@ -1,178 +0,0 @@
-#pragma once
-
-#include "util/types.hpp"
-#include "util/v128.hpp"
-#include "util/sysinfo.hpp"
-
-#ifdef _MSC_VER
-#include
-#else
-#include
-#endif
-
-#include
-#include
-
-#include
-
-inline bool v128_use_fma = utils::has_fma3();
-
-inline v128 v128::fromV(const __m128i& value)
-{
-	v128 ret;
-	ret.vi = value;
-	return ret;
-}
-
-inline v128 v128::fromF(const __m128& value)
-{
-	v128 ret;
-	ret.vf = value;
-	return ret;
-}
-
-inline v128 v128::fromD(const __m128d& value)
-{
-	v128 ret;
-	ret.vd = value;
-	return ret;
-}
-
-inline v128 v128::add8(const v128& left, const v128& right)
-{
-	return fromV(_mm_add_epi8(left.vi, right.vi));
-}
-
-inline v128 v128::add16(const v128& left, const v128& right)
-{
-	return fromV(_mm_add_epi16(left.vi, right.vi));
-}
-
-inline v128 v128::add32(const v128& left, const v128& right)
-{
-	return fromV(_mm_add_epi32(left.vi, right.vi));
-}
-
-inline v128 v128::addfs(const v128& left, const v128& right)
-{
-	return fromF(_mm_add_ps(left.vf, right.vf));
-}
-
-inline v128 v128::addfd(const v128& left, const v128& right)
-{
-	return fromD(_mm_add_pd(left.vd, right.vd));
-}
-
-inline v128 v128::sub8(const v128& left, const v128& right)
-{
-	return fromV(_mm_sub_epi8(left.vi, right.vi));
-}
-
-inline v128 v128::sub16(const v128& left, const v128& right)
-{
-	return fromV(_mm_sub_epi16(left.vi, right.vi));
-}
-
-inline v128 v128::sub32(const v128& left, const v128& right)
-{
-	return fromV(_mm_sub_epi32(left.vi, right.vi));
-}
-
-inline v128 v128::subfs(const v128& left, const v128& right)
-{
-	return fromF(_mm_sub_ps(left.vf, right.vf));
-}
-
-inline v128 v128::subfd(const v128& left, const v128& right)
-{
-	return fromD(_mm_sub_pd(left.vd, right.vd));
-}
-
-inline v128 v128::maxu8(const v128& left, const v128& right)
-{
-	return fromV(_mm_max_epu8(left.vi, right.vi));
-}
-
-inline v128 v128::minu8(const v128& left, const v128& right)
-{
-	return fromV(_mm_min_epu8(left.vi, right.vi));
-}
-
-inline v128 v128::eq8(const v128& left, const v128& right)
-{
-	return fromV(_mm_cmpeq_epi8(left.vi, right.vi));
-}
-
-inline v128 v128::eq16(const v128& left, const v128& right)
-{
-	return fromV(_mm_cmpeq_epi16(left.vi, right.vi));
-}
-
-inline v128 v128::eq32(const v128& left, const v128& right)
-{
-	return fromV(_mm_cmpeq_epi32(left.vi, right.vi));
-}
-
-inline v128 v128::eq32f(const v128& left, const v128& right)
-{
-	return fromF(_mm_cmpeq_ps(left.vf, right.vf));
-}
-
-inline v128 v128::fma32f(v128 a, const v128& b, const v128& c)
-{
-#ifndef __FMA__
-	if (v128_use_fma) [[likely]]
-	{
-#ifdef _MSC_VER
-		a.vf = _mm_fmadd_ps(a.vf, b.vf, c.vf);
-		return a;
-#else
-		__asm__("vfmadd213ps %[c], %[b], %[a]"
-			: [a] "+x" (a.vf)
-			: [b] "x" (b.vf)
-			, [c] "x" (c.vf));
-		return a;
-#endif
-	}
-
-	for (int i = 0; i < 4; i++)
-	{
-		a._f[i] = std::fmaf(a._f[i], b._f[i], c._f[i]);
-	}
-	return a;
-#else
-	a.vf = _mm_fmadd_ps(a.vf, b.vf, c.vf);
-	return a;
-#endif
-}
-
-inline bool v128::operator==(const v128& right) const
-{
-	return _mm_movemask_epi8(v128::eq32(*this, right).vi) == 0xffff;
-}
-
-// result = (~left) & (right)
-inline v128 v128::andnot(const v128& left, const v128& right)
-{
-	return fromV(_mm_andnot_si128(left.vi, right.vi));
-}
-
-inline v128 operator|(const v128& left, const v128& right)
-{
-	return v128::fromV(_mm_or_si128(left.vi, right.vi));
-}
-
-inline v128 operator&(const v128& left, const v128& right)
-{
-	return v128::fromV(_mm_and_si128(left.vi, right.vi));
-}
-
-inline v128 operator^(const v128& left, const v128& right)
-{
-	return v128::fromV(_mm_xor_si128(left.vi, right.vi));
-}
-
-inline v128 operator~(const v128& other)
-{
-	return other ^ v128::from32p(umax); // XOR with ones
-}
diff --git a/rpcs3/util/vm_native.cpp b/rpcs3/util/vm_native.cpp
index f3194f69cc..bd104c5187 100644
--- a/rpcs3/util/vm_native.cpp
+++ b/rpcs3/util/vm_native.cpp
@@ -27,7 +27,7 @@
 #ifdef __NR_memfd_create
 #elif __x86_64__
 #define __NR_memfd_create 319
-#elif __aarch64__
+#elif ARCH_ARM64
 #define __NR_memfd_create 279
 #endif
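
The portable fallback used by gv_select8/16/32 in util/simd.hpp above is the classic bitwise blend identity: when every element of _cmp is all-ones or all-zeros (which SIMD comparisons guarantee), (cmp & t) | (~cmp & f) picks t where the mask is set and f elsewhere, which is why the comment marks any other _cmp as undefined. SSE4.1 gets the same behaviour from _mm_blendv_epi8 and AArch64 from vbslq_*. A minimal standalone illustration of the identity on one 64-bit lane (plain integers, not the rpcs3 v128 type):

// Illustration only, not rpcs3 code: result = (mask & if_set) | (~mask & if_clear)
#include <cassert>
#include <cstdint>

static std::uint64_t select_bits(std::uint64_t mask, std::uint64_t if_set, std::uint64_t if_clear)
{
    // Element-wise selection is only meaningful when 'mask' came from a
    // comparison, i.e. each element is all-ones or all-zeros.
    return (mask & if_set) | (~mask & if_clear);
}

int main()
{
    const std::uint64_t mask = 0xFFFFFFFF00000000ull; // high 32-bit element "true", low one "false"
    assert(select_bits(mask, 0x1111111122222222ull, 0x3333333344444444ull)
           == 0x1111111144444444ull);
}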
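
The sysinfo.cpp changes above follow one pattern throughout: each utils::has_*() probe is compiled only for ARCH_X64, caches its CPUID query in a function-local static, and collapses to a constant on other architectures. A sketch of the same pattern outside rpcs3, assuming GCC/Clang on x86 and the <cpuid.h> helper __get_cpuid (the rpcs3 code uses its own get_cpuid wrapper instead):

// Sketch under stated assumptions; bit 19 of CPUID.1:ECX is the SSE4.1 flag,
// matching the 0x80000 mask used by utils::has_sse41() above.
#if defined(__x86_64__) || defined(__i386__)
#include <cpuid.h>
#endif

bool has_sse41_example()
{
#if defined(__x86_64__) || defined(__i386__)
    static const bool cached = []()
    {
        unsigned eax = 0, ebx = 0, ecx = 0, edx = 0;
        if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
            return false;               // leaf 1 not supported
        return (ecx & (1u << 19)) != 0; // SSE4.1 feature bit
    }();
    return cached;
#else
    return false; // non-x86 targets report the feature as absent
#endif
}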
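
The new util/tsc.hpp above maps utils::get_tsc() to RDTSC on x86-64 and to the AArch64 generic timer (CNTVCT_EL0), and get_tsc_freq() then calibrates the tick rate against the OS monotonic clock. On AArch64 the architecture also publishes the counter's nominal frequency in CNTFRQ_EL0, so ticks can be converted to time without a calibration loop; a small illustration of that (GCC/Clang inline asm, not part of the patch):

// Sketch only: read the AArch64 counter frequency and convert ticks to ns.
#include <cstdint>

inline std::uint64_t arm64_counter_hz()
{
#if defined(__aarch64__)
    std::uint64_t hz = 0;
    __asm__ volatile("mrs %0, cntfrq_el0" : "=r"(hz));
    return hz;
#else
    return 0; // x86 has no architected TSC-frequency register; calibrate as above
#endif
}

inline std::uint64_t ticks_to_ns(std::uint64_t ticks, std::uint64_t hz)
{
    // 128-bit intermediate (GCC/Clang extension) avoids overflow for large tick counts.
    return hz ? static_cast<std::uint64_t>((static_cast<unsigned __int128>(ticks) * 1'000'000'000u) / hz) : 0;
}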
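
The v128.hpp rewrite above drops the MSVC-only opaque_wrapper members (vf/vi/vd) along with the deleted v128sse.hpp backend, and instead lets any trivially copyable 16-byte vector type convert to and from v128 through std::bit_cast, constrained by the new Vector128 concept; this is what allows the gv_* helpers to return __m128i or NEON results directly. A simplified sketch of that idiom with a hypothetical vec128 union (not the actual rpcs3 type):

// Sketch of the bit_cast conversion idiom, C++20, simplified member set.
#include <bit>
#include <cstdint>
#include <type_traits>

template <typename T>
concept Vec16 = (sizeof(T) == 16) && std::is_trivial_v<T>;

union alignas(16) vec128
{
    std::uint64_t u64[2];
    float f32[4];

    vec128() = default;

    // Implicit conversion from any trivial 16-byte type (e.g. __m128i, int32x4_t).
    template <Vec16 T>
    constexpr vec128(const T& rhs) noexcept : vec128(std::bit_cast<vec128>(rhs)) {}

    // ...and back again, so intrinsics accept the union directly.
    template <Vec16 T>
    constexpr operator T() const noexcept { return std::bit_cast<T>(*this); }
};

With this shape the same function body can hand a vec128 to SSE intrinsics on x86 or to NEON intrinsics on AArch64, with the conversions compiling down to plain register moves.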