From eec11bfba93e4e1c8d0c6105685ea70aee8000c1 Mon Sep 17 00:00:00 2001 From: Nekotekina Date: Fri, 18 Dec 2020 17:43:34 +0300 Subject: [PATCH] Move align helpers to util/asm.hpp Also add some files: GLTextureCache.cpp VKTextureCache.cpp --- Utilities/File.cpp | 4 +- Utilities/JIT.cpp | 19 +- rpcs3/Crypto/unedat.cpp | 5 +- rpcs3/Emu/CMakeLists.txt | 2 + rpcs3/Emu/Cell/Modules/cellDmux.cpp | 8 +- rpcs3/Emu/Cell/Modules/cellSaveData.cpp | 8 +- rpcs3/Emu/Cell/Modules/cellVdec.cpp | 5 +- rpcs3/Emu/Cell/Modules/sceNpTrophy.cpp | 3 +- rpcs3/Emu/Cell/PPUModule.cpp | 13 +- rpcs3/Emu/Cell/PPUThread.cpp | 4 +- rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp | 6 +- rpcs3/Emu/Cell/SPUThread.cpp | 2 +- rpcs3/Emu/Cell/lv2/sys_memory.cpp | 5 +- rpcs3/Emu/Cell/lv2/sys_ppu_thread.cpp | 4 +- rpcs3/Emu/Cell/lv2/sys_spu.cpp | 2 +- rpcs3/Emu/Memory/vm.cpp | 14 +- rpcs3/Emu/NP/np_handler.cpp | 4 +- rpcs3/Emu/RSX/Capture/rsx_replay.cpp | 3 +- rpcs3/Emu/RSX/Common/TextureUtils.cpp | 12 +- rpcs3/Emu/RSX/Common/ring_buffer_helper.h | 9 +- rpcs3/Emu/RSX/Common/surface_store.cpp | 12 +- rpcs3/Emu/RSX/Common/surface_store.h | 4 +- rpcs3/Emu/RSX/GL/GLCompute.h | 4 +- rpcs3/Emu/RSX/GL/GLGSRender.cpp | 2 +- rpcs3/Emu/RSX/GL/GLHelpers.h | 17 +- rpcs3/Emu/RSX/GL/GLTexture.cpp | 12 +- rpcs3/Emu/RSX/GL/GLTextureCache.cpp | 191 ++++++++++++ rpcs3/Emu/RSX/GL/GLTextureCache.h | 184 +---------- rpcs3/Emu/RSX/RSXThread.cpp | 48 ++- rpcs3/Emu/RSX/RSXThread.h | 46 +-- rpcs3/Emu/RSX/VK/VKCompute.h | 8 +- rpcs3/Emu/RSX/VK/VKDMA.cpp | 6 +- rpcs3/Emu/RSX/VK/VKHelpers.cpp | 8 +- rpcs3/Emu/RSX/VK/VKPresent.cpp | 4 +- rpcs3/Emu/RSX/VK/VKResolveHelper.h | 4 +- rpcs3/Emu/RSX/VK/VKTexture.cpp | 18 +- rpcs3/Emu/RSX/VK/VKTextureCache.cpp | 360 ++++++++++++++++++++++ rpcs3/Emu/RSX/VK/VKTextureCache.h | 352 +-------------------- rpcs3/GLGSRender.vcxproj | 1 + rpcs3/GLGSRender.vcxproj.filters | 1 + rpcs3/Loader/PSF.cpp | 4 +- rpcs3/VKGSRender.vcxproj | 1 + rpcs3/VKGSRender.vcxproj.filters | 1 + rpcs3/rpcs3qt/cheat_manager.cpp | 9 +- rpcs3/rpcs3qt/debugger_frame.cpp | 6 +- rpcs3/rpcs3qt/memory_viewer_panel.cpp | 6 +- rpcs3/rpcs3qt/register_editor_dialog.cpp | 3 +- rpcs3/rpcs3qt/settings_dialog.cpp | 3 +- rpcs3/util/asm.hpp | 26 ++ rpcs3/util/sysinfo.cpp | 4 +- rpcs3/util/types.hpp | 25 -- rpcs3/util/vm_native.cpp | 5 +- 52 files changed, 794 insertions(+), 713 deletions(-) create mode 100644 rpcs3/Emu/RSX/GL/GLTextureCache.cpp create mode 100644 rpcs3/Emu/RSX/VK/VKTextureCache.cpp diff --git a/Utilities/File.cpp b/Utilities/File.cpp index 141251b5c0..f1f52ae816 100644 --- a/Utilities/File.cpp +++ b/Utilities/File.cpp @@ -10,6 +10,8 @@ #include #include +#include "util/asm.hpp" + using namespace std::literals::string_literals; #ifdef _WIN32 @@ -1725,7 +1727,7 @@ u64 fs::get_dir_size(const std::string& path, u64 rounding_alignment) if (!entry.is_directory) { - result += ::align(entry.size, rounding_alignment); + result += utils::align(entry.size, rounding_alignment); } else { diff --git a/Utilities/JIT.cpp b/Utilities/JIT.cpp index 83e8c130a2..14c006af12 100644 --- a/Utilities/JIT.cpp +++ b/Utilities/JIT.cpp @@ -6,6 +6,7 @@ #include "util/logs.hpp" #include "mutex.h" #include "util/vm.hpp" +#include "util/asm.hpp" #include #include @@ -52,8 +53,8 @@ static u8* add_jit_memory(usz size, uint align) // Simple allocation by incrementing pointer to the next free data const u64 pos = Ctr.atomic_op([&](u64& ctr) -> u64 { - const u64 _pos = ::align(ctr & 0xffff'ffff, align); - const u64 _new = ::align(_pos + size, align); + const u64 _pos = utils::align(ctr & 
0xffff'ffff, align); + const u64 _new = utils::align(_pos + size, align); if (_new > 0x40000000) [[unlikely]] { @@ -69,7 +70,7 @@ static u8* add_jit_memory(usz size, uint align) // Check the necessity to commit more memory if (_new > olda) [[unlikely]] { - newa = ::align(_new, 0x200000); + newa = utils::align(_new, 0x200000); } ctr += _new - (ctr & 0xffff'ffff); @@ -223,7 +224,7 @@ asmjit::Runtime& asmjit::get_global_runtime() return asmjit::kErrorNoCodeGenerated; } - void* p = m_pos.fetch_add(::align(codeSize, 4096)); + void* p = m_pos.fetch_add(utils::align(codeSize, 4096)); if (!p || m_pos > m_max) [[unlikely]] { *dst = nullptr; @@ -237,7 +238,7 @@ asmjit::Runtime& asmjit::get_global_runtime() return asmjit::kErrorInvalidState; } - utils::memory_protect(p, ::align(codeSize, 4096), utils::protection::rx); + utils::memory_protect(p, utils::align(codeSize, 4096), utils::protection::rx); flush(p, relocSize); *dst = p; @@ -351,8 +352,8 @@ struct MemoryManager1 : llvm::RTDyldMemoryManager return nullptr; } - const u64 olda = ::align(oldp, align); - const u64 newp = ::align(olda + size, align); + const u64 olda = utils::align(oldp, align); + const u64 newp = utils::align(olda + size, align); if ((newp - 1) / c_max_size != oldp / c_max_size) { @@ -363,8 +364,8 @@ struct MemoryManager1 : llvm::RTDyldMemoryManager if ((oldp - 1) / c_page_size != (newp - 1) / c_page_size) { // Allocate pages on demand - const u64 pagea = ::align(oldp, c_page_size); - const u64 psize = ::align(newp - pagea, c_page_size); + const u64 pagea = utils::align(oldp, c_page_size); + const u64 psize = utils::align(newp - pagea, c_page_size); utils::memory_commit(this->ptr + pagea, psize, prot); } diff --git a/rpcs3/Crypto/unedat.cpp b/rpcs3/Crypto/unedat.cpp index d6059de6c1..50aab0aa32 100644 --- a/rpcs3/Crypto/unedat.cpp +++ b/rpcs3/Crypto/unedat.cpp @@ -6,6 +6,7 @@ #include #include "util/v128.hpp" +#include "util/asm.hpp" LOG_CHANNEL(edat_log, "EDAT"); @@ -949,7 +950,7 @@ bool EDATADecrypter::ReadHeader() }*/ file_size = edatHeader.file_size; - total_blocks = ::aligned_div(edatHeader.file_size, edatHeader.block_size); + total_blocks = utils::aligned_div(edatHeader.file_size, edatHeader.block_size); return true; } @@ -962,7 +963,7 @@ u64 EDATADecrypter::ReadData(u64 pos, u8* data, u64 size) // now we need to offset things to account for the actual 'range' requested const u64 startOffset = pos % edatHeader.block_size; - const u32 num_blocks = static_cast(::aligned_div(startOffset + size, edatHeader.block_size)); + const u32 num_blocks = static_cast(utils::aligned_div(startOffset + size, edatHeader.block_size)); const u64 bufSize = num_blocks*edatHeader.block_size; if (data_buf_size < (bufSize)) { diff --git a/rpcs3/Emu/CMakeLists.txt b/rpcs3/Emu/CMakeLists.txt index b13c6cc7d7..df0dbe6429 100644 --- a/rpcs3/Emu/CMakeLists.txt +++ b/rpcs3/Emu/CMakeLists.txt @@ -428,6 +428,7 @@ target_sources(rpcs3_emu PRIVATE RSX/GL/GLTexture.cpp RSX/GL/GLVertexBuffers.cpp RSX/GL/GLVertexProgram.cpp + RSX/GL/GLTextureCache.cpp RSX/GL/OpenGL.cpp ) @@ -454,6 +455,7 @@ if(TARGET 3rdparty_vulkan) RSX/VK/VKTexture.cpp RSX/VK/VKVertexBuffers.cpp RSX/VK/VKVertexProgram.cpp + RSX/VK/VKTextureCache.cpp ) endif() diff --git a/rpcs3/Emu/Cell/Modules/cellDmux.cpp b/rpcs3/Emu/Cell/Modules/cellDmux.cpp index eb66739524..df3607620c 100644 --- a/rpcs3/Emu/Cell/Modules/cellDmux.cpp +++ b/rpcs3/Emu/Cell/Modules/cellDmux.cpp @@ -7,7 +7,7 @@ #include "cellPamf.h" #include "cellDmux.h" -#include +#include "util/asm.hpp" LOG_CHANNEL(cellDmux); @@ -753,9 +753,9 
@@ PesHeader::PesHeader(DemuxerStream& stream) } ElementaryStream::ElementaryStream(Demuxer* dmux, u32 addr, u32 size, u32 fidMajor, u32 fidMinor, u32 sup1, u32 sup2, vm::ptr cbFunc, u32 cbArg, u32 spec) - : put(align(addr, 128)) + : put(utils::align(addr, 128)) , dmux(dmux) - , memAddr(align(addr, 128)) + , memAddr(utils::align(addr, 128)) , memSize(size - (addr - memAddr)) , fidMajor(fidMajor) , fidMinor(fidMinor) @@ -847,7 +847,7 @@ void ElementaryStream::push_au(u32 size, u64 dts, u64 pts, u64 userdata, bool ra addr = put; - put = align(put + 128 + size, 128); + put = utils::align(put + 128 + size, 128); put_count++; } diff --git a/rpcs3/Emu/Cell/Modules/cellSaveData.cpp b/rpcs3/Emu/Cell/Modules/cellSaveData.cpp index 499d615e69..37ea8fecf5 100644 --- a/rpcs3/Emu/Cell/Modules/cellSaveData.cpp +++ b/rpcs3/Emu/Cell/Modules/cellSaveData.cpp @@ -20,6 +20,8 @@ #include #include +#include "util/asm.hpp" + LOG_CHANNEL(cellSaveData); template<> @@ -953,7 +955,7 @@ static NEVER_INLINE error_code savedata_op(ppu_thread& ppu, u32 operation, u32 v { if (!file.is_directory) { - size_bytes += ::align(file.size, 1024); + size_bytes += utils::align(file.size, 1024); } } @@ -1334,7 +1336,7 @@ static NEVER_INLINE error_code savedata_op(ppu_thread& ppu, u32 operation, u32 v { statGet->fileNum++; - size_bytes += ::align(entry.size, 1024); // firmware rounds this value up + size_bytes += utils::align(entry.size, 1024); // firmware rounds this value up if (statGet->fileListNum >= setBuf->fileListMax) continue; @@ -1892,7 +1894,7 @@ static NEVER_INLINE error_code savedata_op(ppu_thread& ppu, u32 operation, u32 v // add file list per FS order to PARAM.SFO std::string final_blist; final_blist = fmt::merge(blist, "/"); - psf::assign(psf, "RPCS3_BLIST", psf::string(::align(::size32(final_blist) + 1, 4), final_blist)); + psf::assign(psf, "RPCS3_BLIST", psf::string(utils::align(::size32(final_blist) + 1, 4), final_blist)); // Write all files in temporary directory auto& fsfo = all_files["PARAM.SFO"]; diff --git a/rpcs3/Emu/Cell/Modules/cellVdec.cpp b/rpcs3/Emu/Cell/Modules/cellVdec.cpp index 648952383c..93045fbd0b 100644 --- a/rpcs3/Emu/Cell/Modules/cellVdec.cpp +++ b/rpcs3/Emu/Cell/Modules/cellVdec.cpp @@ -34,6 +34,7 @@ extern "C" #include #include "Utilities/lockless.h" #include +#include "util/asm.hpp" std::mutex g_mutex_avcodec_open2; @@ -879,7 +880,7 @@ error_code cellVdecGetPicture(u32 handle, vm::cptr format, vm sws_scale(vdec->sws, in_data, in_line, 0, h, out_data, out_line); - //const u32 buf_size = align(av_image_get_buffer_size(vdec->ctx->pix_fmt, vdec->ctx->width, vdec->ctx->height, 1), 128); + //const u32 buf_size = utils::align(av_image_get_buffer_size(vdec->ctx->pix_fmt, vdec->ctx->width, vdec->ctx->height, 1), 128); //// TODO: zero padding bytes @@ -974,7 +975,7 @@ error_code cellVdecGetPicItem(u32 handle, vm::pptr picItem) info->startAddr = 0x00000123; // invalid value (no address for picture) const int buffer_size = av_image_get_buffer_size(vdec->ctx->pix_fmt, vdec->ctx->width, vdec->ctx->height, 1); ensure(buffer_size >= 0); - info->size = align(buffer_size, 128); + info->size = utils::align(buffer_size, 128); info->auNum = 1; info->auPts[0].lower = static_cast(pts); info->auPts[0].upper = static_cast(pts >> 32); diff --git a/rpcs3/Emu/Cell/Modules/sceNpTrophy.cpp b/rpcs3/Emu/Cell/Modules/sceNpTrophy.cpp index 1baf1b0bca..e1a4e66a32 100644 --- a/rpcs3/Emu/Cell/Modules/sceNpTrophy.cpp +++ b/rpcs3/Emu/Cell/Modules/sceNpTrophy.cpp @@ -20,6 +20,7 @@ #include "Emu/Cell/lv2/sys_process.h" #include 
+#include "util/asm.hpp" LOG_CHANNEL(sceNpTrophy); @@ -1109,7 +1110,7 @@ error_code sceNpTrophyGetGameProgress(u32 context, u32 handle, vm::ptr perc const u32 trp_count = ctxt->tropusr->GetTrophiesCount(); // Round result to nearest (TODO: Check 0 trophies) - *percentage = trp_count ? ::rounded_div(unlocked * 100, trp_count) : 0; + *percentage = trp_count ? utils::rounded_div(unlocked * 100, trp_count) : 0; if (trp_count == 0 || trp_count > 128) { diff --git a/rpcs3/Emu/Cell/PPUModule.cpp b/rpcs3/Emu/Cell/PPUModule.cpp index 53ac4c4f08..34f525bbd3 100644 --- a/rpcs3/Emu/Cell/PPUModule.cpp +++ b/rpcs3/Emu/Cell/PPUModule.cpp @@ -22,6 +22,7 @@ #include #include #include +#include "util/asm.hpp" LOG_CHANNEL(ppu_loader); @@ -263,7 +264,7 @@ static void ppu_initialize_modules(ppu_linkage_info* link) } // Set memory protection to read-only - vm::page_protect(ppu_function_manager::addr, ::align(::size32(hle_funcs) * 8, 0x1000), 0, 0, vm::page_writable); + vm::page_protect(ppu_function_manager::addr, utils::align(::size32(hle_funcs) * 8, 0x1000), 0, 0, vm::page_writable); // Initialize function names const bool is_first = g_ppu_function_names.empty(); @@ -319,7 +320,7 @@ static void ppu_initialize_modules(ppu_linkage_info* link) } else { - const u32 next = ::align(alloc_addr, variable.second.align); + const u32 next = utils::align(alloc_addr, variable.second.align); const u32 end = next + variable.second.size; if (!next || (end >> 12 != alloc_addr >> 12)) @@ -1500,7 +1501,7 @@ void ppu_load_exec(const ppu_exec_object& elf) for (const auto& arg : Emu.argv) { - const u32 arg_size = ::align(::size32(arg) + 1, 0x10); + const u32 arg_size = utils::align(::size32(arg) + 1, 0x10); const u32 arg_addr = vm::alloc(arg_size, vm::main); std::memcpy(vm::base(arg_addr), arg.data(), arg_size); @@ -1513,7 +1514,7 @@ void ppu_load_exec(const ppu_exec_object& elf) for (const auto& arg : Emu.envp) { - const u32 arg_size = ::align(::size32(arg) + 1, 0x10); + const u32 arg_size = utils::align(::size32(arg) + 1, 0x10); const u32 arg_addr = vm::alloc(arg_size, vm::main); std::memcpy(vm::base(arg_addr), arg.data(), arg_size); @@ -1533,7 +1534,7 @@ void ppu_load_exec(const ppu_exec_object& elf) case 0x70: primary_stacksize = 1024 * 1024; break; // SYS_PROCESS_PRIMARY_STACK_SIZE_1M default: { - primary_stacksize = ::align(std::clamp(sz, 0x10000, 0x100000), 4096); + primary_stacksize = utils::align(std::clamp(sz, 0x10000, 0x100000), 4096); break; } } @@ -1636,7 +1637,7 @@ void ppu_load_exec(const ppu_exec_object& elf) if (prog.p_type == 0x1u /* LOAD */ && prog.p_memsz && (prog.p_flags & 0x2) == 0u /* W */) { // Set memory protection to read-only when necessary - ensure(vm::page_protect(addr, ::align(size, 0x1000), 0, 0, vm::page_writable)); + ensure(vm::page_protect(addr, utils::align(size, 0x1000), 0, 0, vm::page_writable)); } } } diff --git a/rpcs3/Emu/Cell/PPUThread.cpp b/rpcs3/Emu/Cell/PPUThread.cpp index 2e462f8e87..9b472040ab 100644 --- a/rpcs3/Emu/Cell/PPUThread.cpp +++ b/rpcs3/Emu/Cell/PPUThread.cpp @@ -242,7 +242,7 @@ extern void ppu_register_range(u32 addr, u32 size) // Register executable range at utils::memory_commit(&ppu_ref(addr), size * 2, utils::protection::rw); - vm::page_protect(addr, align(size, 0x10000), 0, vm::page_executable); + vm::page_protect(addr, utils::align(size, 0x10000), 0, vm::page_executable); const u64 fallback = g_cfg.core.ppu_decoder == ppu_decoder_type::llvm ? 
reinterpret_cast(ppu_recompiler_fallback) : reinterpret_cast(ppu_fallback); @@ -1098,7 +1098,7 @@ u32 ppu_thread::stack_push(u32 size, u32 align_v) ppu_thread& context = static_cast(*cpu); const u32 old_pos = vm::cast(context.gpr[1]); - context.gpr[1] -= align(size + 4, 8); // room minimal possible size + context.gpr[1] -= utils::align(size + 4, 8); // room minimal possible size context.gpr[1] &= ~(u64{align_v} - 1); // fix stack alignment if (old_pos >= context.stack_addr && old_pos < context.stack_addr + context.stack_size && context.gpr[1] < context.stack_addr) diff --git a/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp b/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp index 689b6032e7..0b5e180302 100644 --- a/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp +++ b/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp @@ -288,7 +288,7 @@ spu_function_t spu_recompiler::compile(spu_program&& _func) words_align = 64; const u32 starta = start & -64; - const u32 enda = ::align(end, 64); + const u32 enda = utils::align(end, 64); const u32 sizea = (enda - starta) / 64; ensure(sizea); @@ -369,7 +369,7 @@ spu_function_t spu_recompiler::compile(spu_program&& _func) words_align = 32; const u32 starta = start & -32; - const u32 enda = ::align(end, 32); + const u32 enda = utils::align(end, 32); const u32 sizea = (enda - starta) / 32; ensure(sizea); @@ -491,7 +491,7 @@ spu_function_t spu_recompiler::compile(spu_program&& _func) words_align = 32; const u32 starta = start & -32; - const u32 enda = ::align(end, 32); + const u32 enda = utils::align(end, 32); const u32 sizea = (enda - starta) / 32; ensure(sizea); diff --git a/rpcs3/Emu/Cell/SPUThread.cpp b/rpcs3/Emu/Cell/SPUThread.cpp index bfa0a336a9..cd15901925 100644 --- a/rpcs3/Emu/Cell/SPUThread.cpp +++ b/rpcs3/Emu/Cell/SPUThread.cpp @@ -2338,7 +2338,7 @@ void spu_thread::do_dma_transfer(spu_thread* _this, const spu_mfc_cmd& args, u8* } u32 range_addr = eal & -128; - u32 range_end = ::align(eal + size, 128); + u32 range_end = utils::align(eal + size, 128); // Handle the case of crossing 64K page borders (TODO: maybe split in 4K fragments?) if (range_addr >> 16 != (range_end - 1) >> 16) diff --git a/rpcs3/Emu/Cell/lv2/sys_memory.cpp b/rpcs3/Emu/Cell/lv2/sys_memory.cpp index 3340cb8295..79d569fa06 100644 --- a/rpcs3/Emu/Cell/lv2/sys_memory.cpp +++ b/rpcs3/Emu/Cell/lv2/sys_memory.cpp @@ -8,6 +8,7 @@ #include "Emu/IdManager.h" #include "util/vm.hpp" +#include "util/asm.hpp" LOG_CHANNEL(sys_memory); @@ -57,7 +58,7 @@ error_code sys_memory_allocate(cpu_thread& cpu, u32 size, u64 flags, vm::ptralloc(size, nullptr, align)) { @@ -128,7 +129,7 @@ error_code sys_memory_allocate_from_container(cpu_thread& cpu, u32 size, u32 cid return ct.ret; } - if (const auto area = vm::reserve_map(align == 0x10000 ? vm::user64k : vm::user1m, 0, ::align(size, 0x10000000), 0x401)) + if (const auto area = vm::reserve_map(align == 0x10000 ? 
vm::user64k : vm::user1m, 0, utils::align(size, 0x10000000), 0x401)) { if (u32 addr = area->alloc(size)) { diff --git a/rpcs3/Emu/Cell/lv2/sys_ppu_thread.cpp b/rpcs3/Emu/Cell/lv2/sys_ppu_thread.cpp index 65cb1ba2e3..613f835d4d 100644 --- a/rpcs3/Emu/Cell/lv2/sys_ppu_thread.cpp +++ b/rpcs3/Emu/Cell/lv2/sys_ppu_thread.cpp @@ -12,6 +12,8 @@ #include "sys_mmapper.h" #include "sys_memory.h" +#include "util/asm.hpp" + LOG_CHANNEL(sys_ppu_thread); // Simple structure to cleanup previous thread, because can't remove its own thread @@ -388,7 +390,7 @@ error_code _sys_ppu_thread_create(ppu_thread& ppu, vm::ptr thread_id, vm::p g_fxo->get()->clean(0); // Compute actual stack size and allocate - const u32 stack_size = ::align(std::max(_stacksz, 4096), 4096); + const u32 stack_size = utils::align(std::max(_stacksz, 4096), 4096); const auto dct = g_fxo->get(); diff --git a/rpcs3/Emu/Cell/lv2/sys_spu.cpp b/rpcs3/Emu/Cell/lv2/sys_spu.cpp index da70bf7865..f715335f00 100644 --- a/rpcs3/Emu/Cell/lv2/sys_spu.cpp +++ b/rpcs3/Emu/Cell/lv2/sys_spu.cpp @@ -99,7 +99,7 @@ void sys_spu_image::load(const fs::file& stream) this->nsegs = 0; this->segs = vm::null; - vm::page_protect(segs.addr(), ::align(mem_size, 4096), 0, 0, vm::page_writable); + vm::page_protect(segs.addr(), utils::align(mem_size, 4096), 0, 0, vm::page_writable); } void sys_spu_image::free() diff --git a/rpcs3/Emu/Memory/vm.cpp b/rpcs3/Emu/Memory/vm.cpp index 06e337ad5a..576126c6fb 100644 --- a/rpcs3/Emu/Memory/vm.cpp +++ b/rpcs3/Emu/Memory/vm.cpp @@ -974,13 +974,13 @@ namespace vm if (state & page_1m_size) { - i = ::align(i + 1, 0x100000 / 4096); + i = utils::align(i + 1, 0x100000 / 4096); continue; } if (state & page_64k_size) { - i = ::align(i + 1, 0x10000 / 4096); + i = utils::align(i + 1, 0x10000 / 4096); continue; } @@ -1177,7 +1177,7 @@ namespace vm const u32 min_page_size = flags & 0x100 ? 0x1000 : 0x10000; // Align to minimal page size - const u32 size = ::align(orig_size, min_page_size) + (flags & 0x10 ? 0x2000 : 0); + const u32 size = utils::align(orig_size, min_page_size) + (flags & 0x10 ? 0x2000 : 0); // Check alignment (it's page allocation, so passing small values there is just silly) if (align < min_page_size || align != (0x80000000u >> std::countl_zero(align))) @@ -1217,7 +1217,7 @@ namespace vm vm::writer_lock lock(0); // Search for an appropriate place (unoptimized) - for (u32 addr = ::align(this->addr, align); u64{addr} + size <= u64{this->addr} + this->size; addr += align) + for (u32 addr = utils::align(this->addr, align); u64{addr} + size <= u64{this->addr} + this->size; addr += align) { if (try_alloc(addr, pflags, size, std::move(shm))) { @@ -1240,7 +1240,7 @@ namespace vm const u32 min_page_size = flags & 0x100 ? 
0x1000 : 0x10000; // Align to minimal page size - const u32 size = ::align(orig_size, min_page_size); + const u32 size = utils::align(orig_size, min_page_size); // return if addr or size is invalid if (!size || addr < this->addr || orig_size > size || addr + u64{size} > this->addr + u64{this->size} || flags & 0x10) @@ -1410,7 +1410,7 @@ namespace vm static std::shared_ptr _find_map(u32 size, u32 align, u64 flags) { - for (u32 addr = ::align(0x20000000, align); addr - 1 < 0xC0000000 - 1; addr += align) + for (u32 addr = utils::align(0x20000000, align); addr - 1 < 0xC0000000 - 1; addr += align) { if (_test_map(addr, size)) { @@ -1485,7 +1485,7 @@ namespace vm vm::writer_lock lock(0); // Align to minimal page size - const u32 size = ::align(orig_size, 0x10000); + const u32 size = utils::align(orig_size, 0x10000); // Check alignment if (align < 0x10000 || align != (0x80000000u >> std::countl_zero(align))) diff --git a/rpcs3/Emu/NP/np_handler.cpp b/rpcs3/Emu/NP/np_handler.cpp index 5c00d186bb..f624c7d520 100644 --- a/rpcs3/Emu/NP/np_handler.cpp +++ b/rpcs3/Emu/NP/np_handler.cpp @@ -32,6 +32,8 @@ #include #endif +#include "util/asm.hpp" + LOG_CHANNEL(sys_net); LOG_CHANNEL(sceNp2); LOG_CHANNEL(sceNp); @@ -384,7 +386,7 @@ vm::addr_t np_handler::allocate(u32 size) return vm::cast(static_cast(0)); // Align allocs - const u32 alloc_size = ::align(size, 4); + const u32 alloc_size = utils::align(size, 4); if (alloc_size > mpool_avail) { sceNp.error("Not enough memory available in NP pool!"); diff --git a/rpcs3/Emu/RSX/Capture/rsx_replay.cpp b/rpcs3/Emu/RSX/Capture/rsx_replay.cpp index c9f4091a36..220dd52773 100644 --- a/rpcs3/Emu/RSX/Capture/rsx_replay.cpp +++ b/rpcs3/Emu/RSX/Capture/rsx_replay.cpp @@ -7,6 +7,7 @@ #include "Emu/RSX/RSXThread.h" #include +#include "util/asm.hpp" namespace rsx { @@ -23,7 +24,7 @@ namespace rsx } // User memory + fifo size - buffer_size = ::align(buffer_size, 0x100000) + 0x10000000; + buffer_size = utils::align(buffer_size, 0x100000) + 0x10000000; // We are not allowed to drain all memory so add a little g_fxo->init(buffer_size + 0x1000000); diff --git a/rpcs3/Emu/RSX/Common/TextureUtils.cpp b/rpcs3/Emu/RSX/Common/TextureUtils.cpp index 67baa736c8..ea0cd557f1 100644 --- a/rpcs3/Emu/RSX/Common/TextureUtils.cpp +++ b/rpcs3/Emu/RSX/Common/TextureUtils.cpp @@ -4,6 +4,8 @@ #include "../RSXThread.h" #include "../rsx_utils.h" +#include "util/asm.hpp" + namespace { // FIXME: GSL as_span break build if template parameter is non const with current revision. 
@@ -346,8 +348,8 @@ namespace } else { - current_subresource_layout.width_in_block = aligned_div(miplevel_width_in_texel, block_edge_in_texel); - current_subresource_layout.height_in_block = aligned_div(miplevel_height_in_texel, block_edge_in_texel); + current_subresource_layout.width_in_block = utils::aligned_div(miplevel_width_in_texel, block_edge_in_texel); + current_subresource_layout.height_in_block = utils::aligned_div(miplevel_height_in_texel, block_edge_in_texel); } if (padded_row) @@ -375,7 +377,7 @@ namespace miplevel_height_in_texel = std::max(miplevel_height_in_texel / 2, 1); } - offset_in_src = align(offset_in_src, 128); + offset_in_src = utils::align(offset_in_src, 128); } return result; @@ -922,8 +924,8 @@ namespace rsx usz result = 0; for (u16 i = 0; i < mipmap; ++i) { - usz rowPitch = align(block_size_in_byte * width_in_blocks, row_pitch_alignment); - result += align(rowPitch * height_in_blocks * depth, mipmap_alignment); + usz rowPitch = utils::align(block_size_in_byte * width_in_blocks, row_pitch_alignment); + result += utils::align(rowPitch * height_in_blocks * depth, mipmap_alignment); height_in_blocks = std::max(height_in_blocks / 2, 1); width_in_blocks = std::max(width_in_blocks / 2, 1); } diff --git a/rpcs3/Emu/RSX/Common/ring_buffer_helper.h b/rpcs3/Emu/RSX/Common/ring_buffer_helper.h index ae59bc5685..b06b75c89f 100644 --- a/rpcs3/Emu/RSX/Common/ring_buffer_helper.h +++ b/rpcs3/Emu/RSX/Common/ring_buffer_helper.h @@ -1,6 +1,7 @@ #pragma once #include "util/logs.hpp" +#include "util/asm.hpp" /** * Ring buffer memory helper : @@ -19,8 +20,8 @@ protected: template bool can_alloc(usz size) const { - usz alloc_size = align(size, Alignment); - usz aligned_put_pos = align(m_put_pos, Alignment); + usz alloc_size = utils::align(size, Alignment); + usz aligned_put_pos = utils::align(m_put_pos, Alignment); if (aligned_put_pos + alloc_size < m_size) { // range before get @@ -83,8 +84,8 @@ public: template usz alloc(usz size) { - const usz alloc_size = align(size, Alignment); - const usz aligned_put_pos = align(m_put_pos, Alignment); + const usz alloc_size = utils::align(size, Alignment); + const usz aligned_put_pos = utils::align(m_put_pos, Alignment); if (!can_alloc(size) && !grow(aligned_put_pos + alloc_size)) { diff --git a/rpcs3/Emu/RSX/Common/surface_store.cpp b/rpcs3/Emu/RSX/Common/surface_store.cpp index 72d1008fc9..53981ac6db 100644 --- a/rpcs3/Emu/RSX/Common/surface_store.cpp +++ b/rpcs3/Emu/RSX/Common/surface_store.cpp @@ -1,6 +1,8 @@ #include "stdafx.h" #include "surface_store.h" +#include "util/asm.hpp" + namespace rsx { namespace utility @@ -23,20 +25,20 @@ namespace rsx { switch (format) { - case surface_color_format::b8: return align(width, 256); + case surface_color_format::b8: return utils::align(width, 256); case surface_color_format::g8b8: case surface_color_format::x1r5g5b5_o1r5g5b5: case surface_color_format::x1r5g5b5_z1r5g5b5: - case surface_color_format::r5g6b5: return align(width * 2, 256); + case surface_color_format::r5g6b5: return utils::align(width * 2, 256); case surface_color_format::a8b8g8r8: case surface_color_format::x8b8g8r8_o8b8g8r8: case surface_color_format::x8b8g8r8_z8b8g8r8: case surface_color_format::x8r8g8b8_o8r8g8b8: case surface_color_format::x8r8g8b8_z8r8g8b8: case surface_color_format::x32: - case surface_color_format::a8r8g8b8: return align(width * 4, 256); - case surface_color_format::w16z16y16x16: return align(width * 8, 256); - case surface_color_format::w32z32y32x32: return align(width * 16, 256); + case 
surface_color_format::a8r8g8b8: return utils::align(width * 4, 256); + case surface_color_format::w16z16y16x16: return utils::align(width * 8, 256); + case surface_color_format::w32z32y32x32: return utils::align(width * 16, 256); } fmt::throw_exception("Unknown color surface format"); } diff --git a/rpcs3/Emu/RSX/Common/surface_store.h b/rpcs3/Emu/RSX/Common/surface_store.h index 039592e40e..90fde1bd3c 100644 --- a/rpcs3/Emu/RSX/Common/surface_store.h +++ b/rpcs3/Emu/RSX/Common/surface_store.h @@ -5,6 +5,8 @@ #include "../rsx_utils.h" #include +#include "util/asm.hpp" + namespace rsx { namespace utility @@ -918,7 +920,7 @@ namespace rsx { // Width is calculated in the coordinate-space of the requester; normalize info.src_area.x = (info.src_area.x * required_bpp) / surface_bpp; - info.src_area.width = align(width * required_bpp, surface_bpp) / surface_bpp; + info.src_area.width = utils::align(width * required_bpp, surface_bpp) / surface_bpp; } else { diff --git a/rpcs3/Emu/RSX/GL/GLCompute.h b/rpcs3/Emu/RSX/GL/GLCompute.h index 2f6b7773b0..5906dcb0f2 100644 --- a/rpcs3/Emu/RSX/GL/GLCompute.h +++ b/rpcs3/Emu/RSX/GL/GLCompute.h @@ -4,6 +4,8 @@ #include "Emu/IdManager.h" #include "GLHelpers.h" +#include "util/asm.hpp" + namespace gl { struct compute_task @@ -224,7 +226,7 @@ namespace gl m_data_length = data_length; const auto num_bytes_per_invocation = optimal_group_size * kernel_size * 4; - const auto num_bytes_to_process = align(data_length, num_bytes_per_invocation); + const auto num_bytes_to_process = utils::align(data_length, num_bytes_per_invocation); const auto num_invocations = num_bytes_to_process / num_bytes_per_invocation; if ((num_bytes_to_process + data_offset) > data->size()) diff --git a/rpcs3/Emu/RSX/GL/GLGSRender.cpp b/rpcs3/Emu/RSX/GL/GLGSRender.cpp index dee0cf0617..ececc28ef6 100644 --- a/rpcs3/Emu/RSX/GL/GLGSRender.cpp +++ b/rpcs3/Emu/RSX/GL/GLGSRender.cpp @@ -740,7 +740,7 @@ void GLGSRender::load_program_env() if (update_fragment_env) m_fragment_env_buffer->reserve_storage_on_heap(128); if (update_vertex_env) m_vertex_env_buffer->reserve_storage_on_heap(256); if (update_fragment_texture_env) m_texture_parameters_buffer->reserve_storage_on_heap(256); - if (update_fragment_constants) m_fragment_constants_buffer->reserve_storage_on_heap(align(fragment_constants_size, 256)); + if (update_fragment_constants) m_fragment_constants_buffer->reserve_storage_on_heap(utils::align(fragment_constants_size, 256)); if (update_transform_constants) m_transform_constants_buffer->reserve_storage_on_heap(8192); if (update_raster_env) m_raster_env_ring_buffer->reserve_storage_on_heap(128); diff --git a/rpcs3/Emu/RSX/GL/GLHelpers.h b/rpcs3/Emu/RSX/GL/GLHelpers.h index 52a36873e3..4e0bc10609 100644 --- a/rpcs3/Emu/RSX/GL/GLHelpers.h +++ b/rpcs3/Emu/RSX/GL/GLHelpers.h @@ -16,6 +16,7 @@ #include "Utilities/mutex.h" #include "Utilities/geometry.h" #include "util/logs.hpp" +#include "util/asm.hpp" #define GL_FRAGMENT_TEXTURES_START 0 #define GL_VERTEX_TEXTURES_START (GL_FRAGMENT_TEXTURES_START + 16) @@ -808,7 +809,7 @@ namespace gl virtual std::pair alloc_from_heap(u32 alloc_size, u16 alignment) { u32 offset = m_data_loc; - if (m_data_loc) offset = align(offset, alignment); + if (m_data_loc) offset = utils::align(offset, alignment); if ((offset + alloc_size) > m_size) { @@ -827,7 +828,7 @@ namespace gl } //Align data loc to 256; allows some "guard" region so we dont trample our own data inadvertently - m_data_loc = align(offset + alloc_size, 256); + m_data_loc = utils::align(offset + alloc_size, 
256); return std::make_pair(static_cast(m_memory_mapping) + offset, offset); } @@ -897,9 +898,9 @@ namespace gl ensure(m_memory_mapping == nullptr); u32 offset = m_data_loc; - if (m_data_loc) offset = align(offset, 256); + if (m_data_loc) offset = utils::align(offset, 256); - const u32 block_size = align(alloc_size + 16, 256); //Overallocate just in case we need to realign base + const u32 block_size = utils::align(alloc_size + 16, 256); //Overallocate just in case we need to realign base if ((offset + block_size) > m_size) { @@ -933,10 +934,10 @@ namespace gl std::pair alloc_from_heap(u32 alloc_size, u16 alignment) override { u32 offset = m_data_loc; - if (m_data_loc) offset = align(offset, alignment); + if (m_data_loc) offset = utils::align(offset, alignment); u32 padding = (offset - m_data_loc); - u32 real_size = align(padding + alloc_size, alignment); //Ensures we leave the loc pointer aligned after we exit + u32 real_size = utils::align(padding + alloc_size, alignment); //Ensures we leave the loc pointer aligned after we exit if (real_size > m_mapped_bytes) { @@ -946,10 +947,10 @@ namespace gl reserve_storage_on_heap(std::max(real_size, 4096U)); offset = m_data_loc; - if (m_data_loc) offset = align(offset, alignment); + if (m_data_loc) offset = utils::align(offset, alignment); padding = (offset - m_data_loc); - real_size = align(padding + alloc_size, alignment); + real_size = utils::align(padding + alloc_size, alignment); } m_data_loc = offset + real_size; diff --git a/rpcs3/Emu/RSX/GL/GLTexture.cpp b/rpcs3/Emu/RSX/GL/GLTexture.cpp index b416a02aac..2948bf49ca 100644 --- a/rpcs3/Emu/RSX/GL/GLTexture.cpp +++ b/rpcs3/Emu/RSX/GL/GLTexture.cpp @@ -6,6 +6,8 @@ #include "../RSXThread.h" #include "../RSXTexture.h" +#include "util/asm.hpp" + namespace gl { buffer g_typeless_transfer_buffer; @@ -614,8 +616,8 @@ namespace gl { //Compressed formats have a 4-byte alignment //TODO: Verify that samplers are not affected by the padding - width = align(width, 4); - height = align(height, 4); + width = utils::align(width, 4); + height = utils::align(height, 4); } GLenum target; @@ -654,7 +656,7 @@ namespace gl { caps.supports_vtc_decoding = gl::get_driver_caps().vendor_NVIDIA; - unpack_settings.row_length(align(dst->width(), 4)); + unpack_settings.row_length(utils::align(dst->width(), 4)); unpack_settings.apply(); glBindTexture(static_cast(dst->get_target()), dst->id()); @@ -664,7 +666,7 @@ namespace gl for (const rsx::subresource_layout& layout : input_layouts) { upload_texture_subresource(staging_buffer, layout, format, is_swizzled, caps); - const sizei image_size{ align(layout.width_in_texel, 4), align(layout.height_in_texel, 4) }; + const sizei image_size{utils::align(layout.width_in_texel, 4), utils::align(layout.height_in_texel, 4)}; switch (dst->get_target()) { @@ -835,7 +837,7 @@ namespace gl void upload_texture(texture* dst, u32 gcm_format, bool is_swizzled, const std::vector& subresources_layout) { // Calculate staging buffer size - const u32 aligned_pitch = align(dst->pitch(), 4); + const u32 aligned_pitch = utils::align(dst->pitch(), 4); usz texture_data_sz = dst->depth() * dst->height() * aligned_pitch; std::vector data_upload_buf(texture_data_sz); diff --git a/rpcs3/Emu/RSX/GL/GLTextureCache.cpp b/rpcs3/Emu/RSX/GL/GLTextureCache.cpp new file mode 100644 index 0000000000..040ee067bf --- /dev/null +++ b/rpcs3/Emu/RSX/GL/GLTextureCache.cpp @@ -0,0 +1,191 @@ +#include "stdafx.h" +#include "Emu/RSX/RSXThread.h" +#include "GLTexture.h" +#include "GLTextureCache.h" + +#include "util/asm.hpp" + 
+namespace gl
+{
+	void cached_texture_section::finish_flush()
+	{
+		// Free resources
+		glUnmapBuffer(GL_PIXEL_PACK_BUFFER);
+		glBindBuffer(GL_PIXEL_PACK_BUFFER, GL_NONE);
+
+		const auto valid_range = get_confirmed_range_delta();
+		const u32 valid_offset = valid_range.first;
+		const u32 valid_length = valid_range.second;
+		void *dst = get_ptr(get_section_base() + valid_offset);
+
+		if (!gl::get_driver_caps().ARB_compute_shader_supported)
+		{
+			switch (type)
+			{
+			case gl::texture::type::sbyte:
+			case gl::texture::type::ubyte:
+			{
+				// byte swapping does not work on byte types, use uint_8_8_8_8 for rgba8 instead to avoid penalty
+				ensure(!pack_unpack_swap_bytes);
+				break;
+			}
+			case gl::texture::type::uint_24_8:
+			{
+				// Swap bytes on D24S8 does not swap the whole dword, just shuffles the 3 bytes for D24
+				// In this regard, D24S8 is the same structure on both PC and PS3, but the endianness of the whole block is reversed on PS3
+				ensure(pack_unpack_swap_bytes == false);
+				ensure(real_pitch == (width * 4));
+				if (rsx_pitch == real_pitch) [[likely]]
+				{
+					stream_data_to_memory_swapped_u32(dst, dst, valid_length / 4, 4);
+				}
+				else
+				{
+					const u32 num_rows = utils::align(valid_length, rsx_pitch) / rsx_pitch;
+					u8* data = static_cast(dst);
+					for (u32 row = 0; row < num_rows; ++row)
+					{
+						stream_data_to_memory_swapped_u32(data, data, width, 4);
+						data += rsx_pitch;
+					}
+				}
+				break;
+			}
+			default:
+				break;
+			}
+		}
+
+		if (is_swizzled())
+		{
+			// This format is completely worthless to CPU processing algorithms where cache lines on die are linear.
+			// If this is happening, usually it means it was not a planned readback (e.g. shared pages situation)
+			rsx_log.warning("[Performance warning] CPU readback of swizzled data");
+
+			// Read-modify-write to avoid corrupting already resident memory outside texture region
+			std::vector tmp_data(rsx_pitch * height);
+			std::memcpy(tmp_data.data(), dst, tmp_data.size());
+
+			switch (type)
+			{
+			case gl::texture::type::uint_8_8_8_8:
+			case gl::texture::type::uint_24_8:
+				rsx::convert_linear_swizzle(tmp_data.data(), dst, width, height, rsx_pitch);
+				break;
+			case gl::texture::type::ushort_5_6_5:
+			case gl::texture::type::ushort:
+				rsx::convert_linear_swizzle(tmp_data.data(), dst, width, height, rsx_pitch);
+				break;
+			default:
+				rsx_log.error("Unexpected swizzled texture format 0x%x", static_cast(format));
+			}
+		}
+
+		if (context == rsx::texture_upload_context::framebuffer_storage)
+		{
+			// Update memory tag
+			static_cast(vram_texture)->sync_tag();
+		}
+	}
+
+	void texture_cache::copy_transfer_regions_impl(gl::command_context& cmd, gl::texture* dst_image, const std::vector& sources) const
+	{
+		const auto dst_bpp = dst_image->pitch() / dst_image->width();
+		const auto dst_aspect = dst_image->aspect();
+
+		for (const auto &slice : sources)
+		{
+			if (!slice.src)
+				continue;
+
+			const bool typeless = dst_aspect != slice.src->aspect() ||
+				!formats_are_bitcast_compatible(static_cast(slice.src->get_internal_format()), static_cast(dst_image->get_internal_format()));
+
+			std::unique_ptr tmp;
+			auto src_image = slice.src;
+			auto src_x = slice.src_x;
+			auto src_y = slice.src_y;
+			auto src_w = slice.src_w;
+			auto src_h = slice.src_h;
+
+			if (slice.xform == rsx::surface_transform::coordinate_transform)
+			{
+				// Dimensions were given in 'dst' space. 
Work out the real source coordinates + const auto src_bpp = slice.src->pitch() / slice.src->width(); + src_x = (src_x * dst_bpp) / src_bpp; + src_w = utils::aligned_div(src_w * dst_bpp, src_bpp); + } + + if (auto surface = dynamic_cast(slice.src)) + { + surface->transform_samples_to_pixels(src_x, src_w, src_y, src_h); + } + + if (typeless) [[unlikely]] + { + const auto src_bpp = slice.src->pitch() / slice.src->width(); + const u16 convert_w = u16(slice.src->width() * src_bpp) / dst_bpp; + tmp = std::make_unique(GL_TEXTURE_2D, convert_w, slice.src->height(), 1, 1, static_cast(dst_image->get_internal_format())); + + src_image = tmp.get(); + + // Compute src region in dst format layout + const u16 src_w2 = u16(src_w * src_bpp) / dst_bpp; + const u16 src_x2 = u16(src_x * src_bpp) / dst_bpp; + + if (src_w2 == slice.dst_w && src_h == slice.dst_h && slice.level == 0) + { + // Optimization, avoid typeless copy to tmp followed by data copy to dst + // Combine the two transfers into one + const coord3u src_region = { { src_x, src_y, 0 }, { src_w, src_h, 1 } }; + const coord3u dst_region = { { slice.dst_x, slice.dst_y, slice.dst_z }, { slice.dst_w, slice.dst_h, 1 } }; + gl::copy_typeless(dst_image, slice.src, dst_region, src_region); + + continue; + } + + const coord3u src_region = { { src_x, src_y, 0 }, { src_w, src_h, 1 } }; + const coord3u dst_region = { { src_x2, src_y, 0 }, { src_w2, src_h, 1 } }; + gl::copy_typeless(src_image, slice.src, dst_region, src_region); + + src_x = src_x2; + src_w = src_w2; + } + + if (src_w == slice.dst_w && src_h == slice.dst_h) + { + glCopyImageSubData(src_image->id(), GL_TEXTURE_2D, 0, src_x, src_y, 0, + dst_image->id(), static_cast(dst_image->get_target()), slice.level, slice.dst_x, slice.dst_y, slice.dst_z, src_w, src_h, 1); + } + else + { + ensure(dst_image->get_target() == gl::texture::target::texture2D); + + auto _blitter = gl::g_hw_blitter; + const areai src_rect = { src_x, src_y, src_x + src_w, src_y + src_h }; + const areai dst_rect = { slice.dst_x, slice.dst_y, slice.dst_x + slice.dst_w, slice.dst_y + slice.dst_h }; + + gl::texture* _dst; + if (src_image->get_internal_format() == dst_image->get_internal_format() && slice.level == 0) + { + _dst = dst_image; + } + else + { + tmp = std::make_unique(GL_TEXTURE_2D, dst_rect.x2, dst_rect.y2, 1, 1, static_cast(slice.src->get_internal_format())); + _dst = tmp.get(); + } + + _blitter->scale_image(cmd, src_image, _dst, + src_rect, dst_rect, false, {}); + + if (_dst != dst_image) + { + // Data cast comes after scaling + glCopyImageSubData(tmp->id(), GL_TEXTURE_2D, 0, slice.dst_x, slice.dst_y, 0, + dst_image->id(), static_cast(dst_image->get_target()), slice.level, slice.dst_x, slice.dst_y, slice.dst_z, slice.dst_w, slice.dst_h, 1); + } + } + } + } +} diff --git a/rpcs3/Emu/RSX/GL/GLTextureCache.h b/rpcs3/Emu/RSX/GL/GLTextureCache.h index 8527b826ef..a0174b22f4 100644 --- a/rpcs3/Emu/RSX/GL/GLTextureCache.h +++ b/rpcs3/Emu/RSX/GL/GLTextureCache.h @@ -62,7 +62,7 @@ namespace gl void init_buffer(const gl::texture* src) { const u32 vram_size = src->pitch() * src->height(); - const u32 buffer_size = align(vram_size, 4096); + const u32 buffer_size = utils::align(vram_size, 4096); if (pbo) { @@ -333,86 +333,7 @@ namespace gl return glMapBufferRange(GL_PIXEL_PACK_BUFFER, offset, size, GL_MAP_READ_BIT); } - void finish_flush() - { - // Free resources - glUnmapBuffer(GL_PIXEL_PACK_BUFFER); - glBindBuffer(GL_PIXEL_PACK_BUFFER, GL_NONE); - - const auto valid_range = get_confirmed_range_delta(); - const u32 valid_offset = 
valid_range.first; - const u32 valid_length = valid_range.second; - void *dst = get_ptr(get_section_base() + valid_offset); - - if (!gl::get_driver_caps().ARB_compute_shader_supported) - { - switch (type) - { - case gl::texture::type::sbyte: - case gl::texture::type::ubyte: - { - // byte swapping does not work on byte types, use uint_8_8_8_8 for rgba8 instead to avoid penalty - ensure(!pack_unpack_swap_bytes); - break; - } - case gl::texture::type::uint_24_8: - { - // Swap bytes on D24S8 does not swap the whole dword, just shuffles the 3 bytes for D24 - // In this regard, D24S8 is the same structure on both PC and PS3, but the endianness of the whole block is reversed on PS3 - ensure(pack_unpack_swap_bytes == false); - ensure(real_pitch == (width * 4)); - if (rsx_pitch == real_pitch) [[likely]] - { - stream_data_to_memory_swapped_u32(dst, dst, valid_length / 4, 4); - } - else - { - const u32 num_rows = align(valid_length, rsx_pitch) / rsx_pitch; - u8* data = static_cast(dst); - for (u32 row = 0; row < num_rows; ++row) - { - stream_data_to_memory_swapped_u32(data, data, width, 4); - data += rsx_pitch; - } - } - break; - } - default: - break; - } - } - - if (is_swizzled()) - { - // This format is completely worthless to CPU processing algorithms where cache lines on die are linear. - // If this is happening, usually it means it was not a planned readback (e.g shared pages situation) - rsx_log.warning("[Performance warning] CPU readback of swizzled data"); - - // Read-modify-write to avoid corrupting already resident memory outside texture region - std::vector tmp_data(rsx_pitch * height); - std::memcpy(tmp_data.data(), dst, tmp_data.size()); - - switch (type) - { - case gl::texture::type::uint_8_8_8_8: - case gl::texture::type::uint_24_8: - rsx::convert_linear_swizzle(tmp_data.data(), dst, width, height, rsx_pitch); - break; - case gl::texture::type::ushort_5_6_5: - case gl::texture::type::ushort: - rsx::convert_linear_swizzle(tmp_data.data(), dst, width, height, rsx_pitch); - break; - default: - rsx_log.error("Unexpected swizzled texture format 0x%x", static_cast(format)); - } - } - - if (context == rsx::texture_upload_context::framebuffer_storage) - { - // Update memory tag - static_cast(vram_texture)->sync_tag(); - } - } + void finish_flush(); /** * Misc @@ -637,106 +558,7 @@ namespace gl } } - void copy_transfer_regions_impl(gl::command_context& cmd, gl::texture* dst_image, const std::vector& sources) const - { - const auto dst_bpp = dst_image->pitch() / dst_image->width(); - const auto dst_aspect = dst_image->aspect(); - - for (const auto &slice : sources) - { - if (!slice.src) - continue; - - const bool typeless = dst_aspect != slice.src->aspect() || - !formats_are_bitcast_compatible(static_cast(slice.src->get_internal_format()), static_cast(dst_image->get_internal_format())); - - std::unique_ptr tmp; - auto src_image = slice.src; - auto src_x = slice.src_x; - auto src_y = slice.src_y; - auto src_w = slice.src_w; - auto src_h = slice.src_h; - - if (slice.xform == rsx::surface_transform::coordinate_transform) - { - // Dimensions were given in 'dst' space. 
Work out the real source coordinates - const auto src_bpp = slice.src->pitch() / slice.src->width(); - src_x = (src_x * dst_bpp) / src_bpp; - src_w = ::aligned_div(src_w * dst_bpp, src_bpp); - } - - if (auto surface = dynamic_cast(slice.src)) - { - surface->transform_samples_to_pixels(src_x, src_w, src_y, src_h); - } - - if (typeless) [[unlikely]] - { - const auto src_bpp = slice.src->pitch() / slice.src->width(); - const u16 convert_w = u16(slice.src->width() * src_bpp) / dst_bpp; - tmp = std::make_unique(GL_TEXTURE_2D, convert_w, slice.src->height(), 1, 1, static_cast(dst_image->get_internal_format())); - - src_image = tmp.get(); - - // Compute src region in dst format layout - const u16 src_w2 = u16(src_w * src_bpp) / dst_bpp; - const u16 src_x2 = u16(src_x * src_bpp) / dst_bpp; - - if (src_w2 == slice.dst_w && src_h == slice.dst_h && slice.level == 0) - { - // Optimization, avoid typeless copy to tmp followed by data copy to dst - // Combine the two transfers into one - const coord3u src_region = { { src_x, src_y, 0 }, { src_w, src_h, 1 } }; - const coord3u dst_region = { { slice.dst_x, slice.dst_y, slice.dst_z }, { slice.dst_w, slice.dst_h, 1 } }; - gl::copy_typeless(dst_image, slice.src, dst_region, src_region); - - continue; - } - - const coord3u src_region = { { src_x, src_y, 0 }, { src_w, src_h, 1 } }; - const coord3u dst_region = { { src_x2, src_y, 0 }, { src_w2, src_h, 1 } }; - gl::copy_typeless(src_image, slice.src, dst_region, src_region); - - src_x = src_x2; - src_w = src_w2; - } - - if (src_w == slice.dst_w && src_h == slice.dst_h) - { - glCopyImageSubData(src_image->id(), GL_TEXTURE_2D, 0, src_x, src_y, 0, - dst_image->id(), static_cast(dst_image->get_target()), slice.level, slice.dst_x, slice.dst_y, slice.dst_z, src_w, src_h, 1); - } - else - { - ensure(dst_image->get_target() == gl::texture::target::texture2D); - - auto _blitter = gl::g_hw_blitter; - const areai src_rect = { src_x, src_y, src_x + src_w, src_y + src_h }; - const areai dst_rect = { slice.dst_x, slice.dst_y, slice.dst_x + slice.dst_w, slice.dst_y + slice.dst_h }; - - gl::texture* _dst; - if (src_image->get_internal_format() == dst_image->get_internal_format() && slice.level == 0) - { - _dst = dst_image; - } - else - { - tmp = std::make_unique(GL_TEXTURE_2D, dst_rect.x2, dst_rect.y2, 1, 1, static_cast(slice.src->get_internal_format())); - _dst = tmp.get(); - } - - _blitter->scale_image(cmd, src_image, _dst, - src_rect, dst_rect, false, {}); - - if (_dst != dst_image) - { - // Data cast comes after scaling - glCopyImageSubData(tmp->id(), GL_TEXTURE_2D, 0, slice.dst_x, slice.dst_y, 0, - dst_image->id(), static_cast(dst_image->get_target()), slice.level, slice.dst_x, slice.dst_y, slice.dst_z, slice.dst_w, slice.dst_h, 1); - } - } - } - } + void copy_transfer_regions_impl(gl::command_context& cmd, gl::texture* dst_image, const std::vector& sources) const; gl::texture* get_template_from_collection_impl(const std::vector& sections_to_transfer) const { diff --git a/rpcs3/Emu/RSX/RSXThread.cpp b/rpcs3/Emu/RSX/RSXThread.cpp index c27abf1f7e..d1067f406f 100644 --- a/rpcs3/Emu/RSX/RSXThread.cpp +++ b/rpcs3/Emu/RSX/RSXThread.cpp @@ -139,6 +139,52 @@ namespace rsx fmt::throw_exception("rsx::get_address(offset=0x%x, location=0x%x): %s%s", offset, location, msg, src_loc{line, col, file, func}); } + std::pair interleaved_range_info::calculate_required_range(u32 first, u32 count) const + { + if (single_vertex) + { + return { 0, 1 }; + } + + const u32 max_index = (first + count) - 1; + u32 _max_index = 0; + u32 _min_index = 
first; + + for (const auto &attrib : locations) + { + if (attrib.frequency <= 1) [[likely]] + { + _max_index = max_index; + } + else + { + if (attrib.modulo) + { + if (max_index >= attrib.frequency) + { + // Actually uses the modulo operator + _min_index = 0; + _max_index = attrib.frequency - 1; + } + else + { + // Same as having no modulo + _max_index = max_index; + } + } + else + { + // Division operator + _min_index = std::min(_min_index, first / attrib.frequency); + _max_index = std::max(_max_index, utils::aligned_div(max_index, attrib.frequency)); + } + } + } + + ensure(_max_index >= _min_index); + return { _min_index, (_max_index - _min_index) + 1 }; + } + u32 get_vertex_type_size_on_host(vertex_base_type type, u32 size) { switch (type) @@ -2521,7 +2567,7 @@ namespace rsx } // Some cases do not need full delay - remaining = ::aligned_div(remaining, div); + remaining = utils::aligned_div(remaining, div); const u64 until = get_system_time() + remaining; while (true) diff --git a/rpcs3/Emu/RSX/RSXThread.h b/rpcs3/Emu/RSX/RSXThread.h index 5b55e1e55f..9477337fba 100644 --- a/rpcs3/Emu/RSX/RSXThread.h +++ b/rpcs3/Emu/RSX/RSXThread.h @@ -246,51 +246,7 @@ namespace rsx rsx::simple_array locations; // Check if we need to upload a full unoptimized range, i.e [0-max_index] - std::pair calculate_required_range(u32 first, u32 count) const - { - if (single_vertex) - { - return { 0, 1 }; - } - - const u32 max_index = (first + count) - 1; - u32 _max_index = 0; - u32 _min_index = first; - - for (const auto &attrib : locations) - { - if (attrib.frequency <= 1) [[likely]] - { - _max_index = max_index; - } - else - { - if (attrib.modulo) - { - if (max_index >= attrib.frequency) - { - // Actually uses the modulo operator - _min_index = 0; - _max_index = attrib.frequency - 1; - } - else - { - // Same as having no modulo - _max_index = max_index; - } - } - else - { - // Division operator - _min_index = std::min(_min_index, first / attrib.frequency); - _max_index = std::max(_max_index, aligned_div(max_index, attrib.frequency)); - } - } - } - - ensure(_max_index >= _min_index); - return { _min_index, (_max_index - _min_index) + 1 }; - } + std::pair calculate_required_range(u32 first, u32 count) const; }; enum attribute_buffer_placement : u8 diff --git a/rpcs3/Emu/RSX/VK/VKCompute.h b/rpcs3/Emu/RSX/VK/VKCompute.h index f0ea98efc9..b09a91f3f0 100644 --- a/rpcs3/Emu/RSX/VK/VKCompute.h +++ b/rpcs3/Emu/RSX/VK/VKCompute.h @@ -5,6 +5,8 @@ #include "Utilities/StrUtil.h" #include "Emu/IdManager.h" +#include "util/asm.hpp" + #define VK_MAX_COMPUTE_TASKS 4096 // Max number of jobs per frame namespace vk @@ -296,7 +298,7 @@ namespace vk "%vars" "\n"; - const auto parameters_size = align(push_constants_size, 16) / 16; + const auto parameters_size = utils::align(push_constants_size, 16) / 16; const std::pair syntax_replace[] = { { "%ws", std::to_string(optimal_group_size) }, @@ -943,7 +945,7 @@ namespace vk set_parameters(cmd); const u32 num_bytes_per_invocation = (sizeof(_BlockType) * optimal_group_size); - const u32 linear_invocations = aligned_div(data_length, num_bytes_per_invocation); + const u32 linear_invocations = utils::aligned_div(data_length, num_bytes_per_invocation); compute_task::run(cmd, linear_invocations); } }; @@ -997,7 +999,7 @@ namespace vk word_count = num_words; block_length = num_words * 4; - const u32 linear_invocations = aligned_div(word_count, optimal_group_size); + const u32 linear_invocations = utils::aligned_div(word_count, optimal_group_size); compute_task::run(cmd, linear_invocations); } }; 
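
Aside: the util/asm.hpp hunk that introduces these helpers is not part of this excerpt, but every call site follows one of three patterns: utils::align(value, alignment) rounds up to a multiple, utils::aligned_div(value, divisor) is ceiling division (as in the dispatch-count computations of the VKCompute.h hunk just above), and utils::rounded_div rounds to nearest (the trophy percentage in sceNpTrophy.cpp). A minimal sketch of what the moved helpers plausibly look like, assuming power-of-two alignment for align() as with the old ::align being replaced; signatures are inferred from the call sites, not copied from the real util/asm.hpp:

    // Hypothetical reconstruction of the moved helpers; the real patch may differ.
    #include <cstdint>

    namespace utils
    {
        // Round value up to the next multiple of alignment (power of 2 assumed).
        template <typename T>
        constexpr T align(T value, std::uint64_t alignment)
        {
            return static_cast<T>((value + (alignment - 1)) & ~(alignment - 1));
        }

        // Ceiling division: aligned_div(1000, 256) == 4.
        template <typename T>
        constexpr T aligned_div(T value, std::uint64_t divisor)
        {
            return static_cast<T>((value + divisor - 1) / divisor);
        }

        // Division rounded to nearest: rounded_div(199, 2) == 100.
        template <typename T>
        constexpr T rounded_div(T value, std::uint64_t divisor)
        {
            return static_cast<T>((value + divisor / 2) / divisor);
        }
    }

For instance, the hunk above computes linear_invocations = utils::aligned_div(data_length, num_bytes_per_invocation): a 1000-byte job at 256 bytes per invocation dispatches 4 workgroups, the last one partially idle.
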
diff --git a/rpcs3/Emu/RSX/VK/VKDMA.cpp b/rpcs3/Emu/RSX/VK/VKDMA.cpp index 90d1440f22..81437cef28 100644 --- a/rpcs3/Emu/RSX/VK/VKDMA.cpp +++ b/rpcs3/Emu/RSX/VK/VKDMA.cpp @@ -3,6 +3,8 @@ #include "VKResourceManager.h" #include "VKDMA.h" +#include "util/asm.hpp" + namespace vk { static constexpr usz s_dma_block_length = 0x01000000; @@ -85,7 +87,7 @@ namespace vk { if (!inheritance_info.parent) { - const u32 start = align(range.start, s_page_size); + const u32 start = utils::align(range.start, s_page_size); const u32 end = ((range.end + 1) & s_page_align); for (u32 page = start; page < end; page += s_page_size) @@ -259,7 +261,7 @@ namespace vk } dma_block* block_head = nullptr; - auto block_end = align(limit, s_dma_block_length); + auto block_end = utils::align(limit, s_dma_block_length); // Reverse scan to try and find the minimum required length in case of other chaining for (auto block = last_block; block != first_block; block -= s_dma_block_length) diff --git a/rpcs3/Emu/RSX/VK/VKHelpers.cpp b/rpcs3/Emu/RSX/VK/VKHelpers.cpp index c7022a1019..a3aaad8907 100644 --- a/rpcs3/Emu/RSX/VK/VKHelpers.cpp +++ b/rpcs3/Emu/RSX/VK/VKHelpers.cpp @@ -132,7 +132,7 @@ namespace vk { // Create new heap. All sizes are aligned up by 64M, upto 1GiB const usz size_limit = 1024 * 0x100000; - const usz aligned_new_size = align(m_size + size, 64 * 0x100000); + const usz aligned_new_size = utils::align(m_size + size, 64 * 0x100000); if (aligned_new_size >= size_limit) { @@ -351,8 +351,8 @@ namespace vk { auto create_texture = [&]() { - u32 new_width = align(requested_width, 1024u); - u32 new_height = align(requested_height, 1024u); + u32 new_width = utils::align(requested_width, 1024u); + u32 new_height = utils::align(requested_height, 1024u); return new vk::image(*g_current_renderer, g_current_renderer->get_memory_mapping().device_local, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_IMAGE_TYPE_2D, format, new_width, new_height, 1, 1, 1, VK_SAMPLE_COUNT_1_BIT, VK_IMAGE_LAYOUT_UNDEFINED, @@ -388,7 +388,7 @@ namespace vk if (!g_scratch_buffer) { // Choose optimal size - const u64 alloc_size = std::max(64 * 0x100000, align(min_required_size, 0x100000)); + const u64 alloc_size = std::max(64 * 0x100000, utils::align(min_required_size, 0x100000)); g_scratch_buffer = std::make_unique(*g_current_renderer, alloc_size, g_current_renderer->get_memory_mapping().device_local, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, diff --git a/rpcs3/Emu/RSX/VK/VKPresent.cpp b/rpcs3/Emu/RSX/VK/VKPresent.cpp index c440f85c25..8d20697105 100644 --- a/rpcs3/Emu/RSX/VK/VKPresent.cpp +++ b/rpcs3/Emu/RSX/VK/VKPresent.cpp @@ -2,6 +2,8 @@ #include "VKGSRender.h" #include "Emu/Cell/Modules/cellVideoOut.h" +#include "util/asm.hpp" + void VKGSRender::reinitialize_swapchain() { m_swapchain_dims.width = m_frame->client_width(); @@ -651,7 +653,7 @@ void VKGSRender::flip(const rsx::display_flip_info_t& info) const usz sshot_size = buffer_height * buffer_width * 4; - vk::buffer sshot_vkbuf(*m_device, align(sshot_size, 0x100000), m_device->get_memory_mapping().host_visible_coherent, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, + vk::buffer sshot_vkbuf(*m_device, utils::align(sshot_size, 0x100000), m_device->get_memory_mapping().host_visible_coherent, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, VK_BUFFER_USAGE_TRANSFER_DST_BIT, 0); VkBufferImageCopy copy_info; diff --git a/rpcs3/Emu/RSX/VK/VKResolveHelper.h b/rpcs3/Emu/RSX/VK/VKResolveHelper.h index 9be843692b..952bb26518 100644 --- 
a/rpcs3/Emu/RSX/VK/VKResolveHelper.h +++ b/rpcs3/Emu/RSX/VK/VKResolveHelper.h @@ -131,8 +131,8 @@ namespace vk multisampled = msaa_image; resolve = resolve_image; - const u32 invocations_x = align(resolve_image->width(), cs_wave_x) / cs_wave_x; - const u32 invocations_y = align(resolve_image->height(), cs_wave_y) / cs_wave_y; + const u32 invocations_x = utils::align(resolve_image->width(), cs_wave_x) / cs_wave_x; + const u32 invocations_y = utils::align(resolve_image->height(), cs_wave_y) / cs_wave_y; compute_task::run(cmd, invocations_x, invocations_y, 1); } diff --git a/rpcs3/Emu/RSX/VK/VKTexture.cpp b/rpcs3/Emu/RSX/VK/VKTexture.cpp index 9d095b6a2c..2038d75b7d 100644 --- a/rpcs3/Emu/RSX/VK/VKTexture.cpp +++ b/rpcs3/Emu/RSX/VK/VKTexture.cpp @@ -7,6 +7,8 @@ #include "VKRenderPass.h" #include "VKRenderTargets.h" +#include "util/asm.hpp" + namespace vk { VkComponentMapping default_component_map() @@ -89,7 +91,7 @@ namespace vk ensure(dst->size() >= allocation_end); const auto data_offset = u32(region.bufferOffset); - const auto z32_offset = align(data_offset + packed16_length, 256); + const auto z32_offset = utils::align(data_offset + packed16_length, 256); // 1. Copy the depth to buffer VkBufferImageCopy region2; @@ -135,8 +137,8 @@ namespace vk ensure(dst->size() >= allocation_end); const auto data_offset = u32(region.bufferOffset); - const auto z_offset = align(data_offset + packed_length, 256); - const auto s_offset = align(z_offset + in_depth_size, 256); + const auto z_offset = utils::align(data_offset + packed_length, 256); + const auto s_offset = utils::align(z_offset + in_depth_size, 256); // 1. Copy the depth and stencil blocks to separate banks VkBufferImageCopy sub_regions[2]; @@ -225,7 +227,7 @@ namespace vk ensure(src->size() >= allocation_end); const auto data_offset = u32(region.bufferOffset); - const auto z32_offset = align(data_offset + packed16_length, 256); + const auto z32_offset = utils::align(data_offset + packed16_length, 256); // 1. Pre-compute barrier vk::insert_buffer_memory_barrier(cmd, src->value, z32_offset, packed32_length, @@ -260,8 +262,8 @@ namespace vk ensure(src->size() >= allocation_end); // "Out of memory (compute heap). Lower your resolution scale setting." 
const auto data_offset = u32(region.bufferOffset); - const auto z_offset = align(data_offset + packed_length, 256); - const auto s_offset = align(z_offset + in_depth_size, 256); + const auto z_offset = utils::align(data_offset + packed_length, 256); + const auto s_offset = utils::align(z_offset + in_depth_size, 256); // Zero out the stencil block vkCmdFillBuffer(cmd, src->value, s_offset, in_stencil_size, 0); @@ -821,7 +823,7 @@ namespace vk const auto src_offset = section.bufferOffset; // Align output to 128-byte boundary to keep some drivers happy - dst_offset = align(dst_offset, 128); + dst_offset = utils::align(dst_offset, 128); u32 data_length = 0; for (unsigned i = 0, j = packet.first; i < packet.second; ++i, ++j) @@ -930,7 +932,7 @@ namespace vk if (layout.level == 0) { // Align mip0 on a 128-byte boundary - scratch_offset = align(scratch_offset, 128); + scratch_offset = utils::align(scratch_offset, 128); } // Copy from upload heap to scratch mem diff --git a/rpcs3/Emu/RSX/VK/VKTextureCache.cpp b/rpcs3/Emu/RSX/VK/VKTextureCache.cpp new file mode 100644 index 0000000000..116e05edc2 --- /dev/null +++ b/rpcs3/Emu/RSX/VK/VKTextureCache.cpp @@ -0,0 +1,360 @@ +#include "stdafx.h" +#include "VKGSRender.h" +#include "VKTextureCache.h" + +#include "util/asm.hpp" + +namespace vk +{ + void cached_texture_section::dma_transfer(vk::command_buffer& cmd, vk::image* src, const areai& src_area, const utils::address_range& valid_range, u32 pitch) + { + ensure(src->samples() == 1); + + if (!m_device) + { + m_device = &cmd.get_command_pool().get_owner(); + } + + if (dma_fence) + { + // NOTE: This can be reached if previously synchronized, or a special path happens. + // If a hard flush occurred while this surface was flush_always the cache would have reset its protection afterwards. + // DMA resource would still be present but already used to flush previously. 
+			vk::get_resource_manager()->dispose(dma_fence);
+		}
+
+		if (vk::is_renderpass_open(cmd))
+		{
+			vk::end_renderpass(cmd);
+		}
+
+		src->push_layout(cmd, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL);
+
+		const auto internal_bpp = vk::get_format_texel_width(src->format());
+		const auto transfer_width = static_cast<u32>(src_area.width());
+		const auto transfer_height = static_cast<u32>(src_area.height());
+		real_pitch = internal_bpp * transfer_width;
+		rsx_pitch = pitch;
+
+		const bool require_format_conversion = !!(src->aspect() & VK_IMAGE_ASPECT_STENCIL_BIT) || src->format() == VK_FORMAT_D32_SFLOAT;
+		if (require_format_conversion || pack_unpack_swap_bytes)
+		{
+			const auto section_length = valid_range.length();
+			const auto transfer_pitch = real_pitch;
+			const auto task_length = transfer_pitch * src_area.height();
+
+			auto working_buffer = vk::get_scratch_buffer(task_length);
+			auto final_mapping = vk::map_dma(cmd, valid_range.start, section_length);
+
+			VkBufferImageCopy region = {};
+			region.imageSubresource = { src->aspect(), 0, 0, 1 };
+			region.imageOffset = { src_area.x1, src_area.y1, 0 };
+			region.imageExtent = { transfer_width, transfer_height, 1 };
+			vk::copy_image_to_buffer(cmd, src, working_buffer, region, (require_format_conversion && pack_unpack_swap_bytes));
+
+			// NOTE: For depth/stencil formats, copying to buffer and byteswap are combined into one step above
+			if (pack_unpack_swap_bytes && !require_format_conversion)
+			{
+				const auto texel_layout = vk::get_format_element_size(src->format());
+				const auto elem_size = texel_layout.first;
+				vk::cs_shuffle_base *shuffle_kernel;
+
+				if (elem_size == 2)
+				{
+					shuffle_kernel = vk::get_compute_task<vk::cs_shuffle_16>();
+				}
+				else if (elem_size == 4)
+				{
+					shuffle_kernel = vk::get_compute_task<vk::cs_shuffle_32>();
+				}
+				else
+				{
+					ensure(get_context() == rsx::texture_upload_context::dma);
+					shuffle_kernel = nullptr;
+				}
+
+				if (shuffle_kernel)
+				{
+					vk::insert_buffer_memory_barrier(cmd, working_buffer->value, 0, task_length,
+						VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
+						VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT);
+
+					shuffle_kernel->run(cmd, working_buffer, task_length);
+
+					vk::insert_buffer_memory_barrier(cmd, working_buffer->value, 0, task_length,
+						VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT,
+						VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_TRANSFER_READ_BIT);
+				}
+			}
+
+			if (rsx_pitch == real_pitch) [[likely]]
+			{
+				VkBufferCopy copy = {};
+				copy.dstOffset = final_mapping.first;
+				copy.size = section_length;
+				vkCmdCopyBuffer(cmd, working_buffer->value, final_mapping.second->value, 1, &copy);
+			}
+			else
+			{
+				if (context != rsx::texture_upload_context::dma)
+				{
+					// Partial load for the bits outside the existing image
+					// NOTE: A true DMA section would have been prepped beforehand
+					// TODO: Partial range load/flush
+					vk::load_dma(valid_range.start, section_length);
+				}
+
+				std::vector<VkBufferCopy> copy;
+				copy.reserve(transfer_height);
+
+				u32 dst_offset = final_mapping.first;
+				u32 src_offset = 0;
+
+				for (unsigned row = 0; row < transfer_height; ++row)
+				{
+					copy.push_back({ src_offset, dst_offset, transfer_pitch });
+					src_offset += real_pitch;
+					dst_offset += rsx_pitch;
+				}
+
+				vkCmdCopyBuffer(cmd, working_buffer->value, final_mapping.second->value, transfer_height, copy.data());
+			}
+		}
+		else
+		{
+			VkBufferImageCopy region = {};
+			region.bufferRowLength = (rsx_pitch / internal_bpp);
+			region.imageSubresource = { src->aspect(), 0, 0, 1 };
+			region.imageOffset = { src_area.x1, src_area.y1, 0 };
+			region.imageExtent = { transfer_width, transfer_height, 1 };
+
+			auto mapping = vk::map_dma(cmd, valid_range.start, valid_range.length());
+			region.bufferOffset = mapping.first;
+			vkCmdCopyImageToBuffer(cmd, src->value, src->current_layout, mapping.second->value, 1, &region);
+		}
+
+		src->pop_layout(cmd);
+
+		// Create event object for this transfer and queue signal op
+		dma_fence = std::make_unique<vk::event>(*m_device);
+		dma_fence->signal(cmd, VK_PIPELINE_STAGE_TRANSFER_BIT);
+
+		// Set cb flag for queued dma operations
+		cmd.set_flag(vk::command_buffer::cb_has_dma_transfer);
+
+		if (get_context() == rsx::texture_upload_context::dma)
+		{
+			// Save readback hint in case transformation is required later
+			switch (internal_bpp)
+			{
+			case 2:
+				gcm_format = CELL_GCM_TEXTURE_R5G6B5;
+				break;
+			case 4:
+			default:
+				gcm_format = CELL_GCM_TEXTURE_A8R8G8B8;
+				break;
+			}
+		}
+
+		synchronized = true;
+		sync_timestamp = get_system_time();
+	}
+
+	void texture_cache::copy_transfer_regions_impl(vk::command_buffer& cmd, vk::image* dst, const std::vector<copy_region_descriptor>& sections_to_transfer) const
+	{
+		const auto dst_aspect = dst->aspect();
+		const auto dst_bpp = vk::get_format_texel_width(dst->format());
+
+		for (const auto &section : sections_to_transfer)
+		{
+			if (!section.src)
+				continue;
+
+			const bool typeless = section.src->aspect() != dst_aspect ||
+				!formats_are_bitcast_compatible(dst, section.src);
+
+			// Avoid inserting unnecessary barrier GENERAL->TRANSFER_SRC->GENERAL in active render targets
+			const auto preferred_layout = (section.src->current_layout != VK_IMAGE_LAYOUT_GENERAL) ?
+				VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL : VK_IMAGE_LAYOUT_GENERAL;
+
+			section.src->push_layout(cmd, preferred_layout);
+
+			auto src_image = section.src;
+			auto src_x = section.src_x;
+			auto src_y = section.src_y;
+			auto src_w = section.src_w;
+			auto src_h = section.src_h;
+
+			rsx::flags32_t transform = section.xform;
+			if (section.xform == rsx::surface_transform::coordinate_transform)
+			{
+				// Dimensions were given in 'dst' space. Work out the real source coordinates
+				const auto src_bpp = vk::get_format_texel_width(section.src->format());
+				src_x = (src_x * dst_bpp) / src_bpp;
+				src_w = utils::aligned_div(src_w * dst_bpp, src_bpp);
+
+				transform &= ~(rsx::surface_transform::coordinate_transform);
+			}
+
+			if (auto surface = dynamic_cast<vk::render_target*>(section.src))
+			{
+				surface->transform_samples_to_pixels(src_x, src_w, src_y, src_h);
+			}
+
+			if (typeless) [[unlikely]]
+			{
+				const auto src_bpp = vk::get_format_texel_width(section.src->format());
+				const u16 convert_w = u16(src_w * src_bpp) / dst_bpp;
+				const u16 convert_x = u16(src_x * src_bpp) / dst_bpp;
+
+				if (convert_w == section.dst_w && src_h == section.dst_h &&
+					transform == rsx::surface_transform::identity &&
+					section.level == 0 && section.dst_z == 0)
+				{
+					// Optimization to avoid double transfer
+					// TODO: Handle level and layer offsets
+					const areai src_rect = coordi{{ src_x, src_y }, { src_w, src_h }};
+					const areai dst_rect = coordi{{ section.dst_x, section.dst_y }, { section.dst_w, section.dst_h }};
+					vk::copy_image_typeless(cmd, section.src, dst, src_rect, dst_rect, 1);
+
+					section.src->pop_layout(cmd);
+					continue;
+				}
+
+				src_image = vk::get_typeless_helper(dst->format(), dst->format_class(), convert_x + convert_w, src_y + src_h);
+				src_image->change_layout(cmd, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL);
+
+				const areai src_rect = coordi{{ src_x, src_y }, { src_w, src_h }};
+				const areai dst_rect = coordi{{ convert_x, src_y }, { convert_w, src_h }};
+				vk::copy_image_typeless(cmd, section.src, src_image, src_rect, dst_rect, 1);
+				src_image->change_layout(cmd, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL);
+
+				src_x = convert_x;
+				src_w = convert_w;
+			}
+
+			ensure(src_image->current_layout == VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL || src_image->current_layout == VK_IMAGE_LAYOUT_GENERAL);
+
+			// Final aspect mask of the 'final' transfer source
+			const auto new_src_aspect = src_image->aspect();
+
+			if (src_w == section.dst_w && src_h == section.dst_h && transform == rsx::surface_transform::identity) [[likely]]
+			{
+				VkImageCopy copy_rgn;
+				copy_rgn.srcOffset = { src_x, src_y, 0 };
+				copy_rgn.dstOffset = { section.dst_x, section.dst_y, 0 };
+				copy_rgn.dstSubresource = { dst_aspect, 0, 0, 1 };
+				copy_rgn.srcSubresource = { new_src_aspect, 0, 0, 1 };
+				copy_rgn.extent = { src_w, src_h, 1 };
+
+				if (dst->info.imageType == VK_IMAGE_TYPE_3D)
+				{
+					copy_rgn.dstOffset.z = section.dst_z;
+				}
+				else
+				{
+					copy_rgn.dstSubresource.baseArrayLayer = section.dst_z;
+					copy_rgn.dstSubresource.mipLevel = section.level;
+				}
+
+				vkCmdCopyImage(cmd, src_image->value, src_image->current_layout, dst->value, dst->current_layout, 1, &copy_rgn);
+			}
+			else
+			{
+				ensure(section.dst_z == 0);
+
+				u16 dst_x = section.dst_x, dst_y = section.dst_y;
+				vk::image* _dst;
+
+				if (src_image->info.format == dst->info.format && section.level == 0) [[likely]]
+				{
+					_dst = dst;
+				}
+				else
+				{
+					// Either a bitcast is required or a scale+copy to mipmap level
+					_dst = vk::get_typeless_helper(src_image->format(), src_image->format_class(), dst->width(), dst->height() * 2);
+					_dst->change_layout(cmd, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL);
+				}
+
+				if (transform == rsx::surface_transform::identity)
+				{
+					vk::copy_scaled_image(cmd, src_image, _dst,
+						coordi{ { src_x, src_y }, { src_w, src_h } },
+						coordi{ { section.dst_x, section.dst_y }, { section.dst_w, section.dst_h } },
+						1, src_image->format() == _dst->format(),
+						VK_FILTER_NEAREST);
+				}
+				else if (transform == rsx::surface_transform::argb_to_bgra)
+				{
+					VkBufferImageCopy copy{};
+					copy.imageExtent = { src_w, src_h, 1 };
+					copy.imageOffset = { src_x, src_y, 0 };
+					copy.imageSubresource = { src_image->aspect(), 0, 0, 1 };
+
+					const auto mem_length = src_w * src_h * dst_bpp;
+					auto scratch_buf = vk::get_scratch_buffer(mem_length);
+					vkCmdCopyImageToBuffer(cmd, src_image->value, src_image->current_layout, scratch_buf->value, 1, &copy);
+
+					vk::insert_buffer_memory_barrier(cmd, scratch_buf->value, 0, mem_length, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
+						VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT);
+
+					auto shuffle_kernel = vk::get_compute_task<vk::cs_shuffle_32>();
+					shuffle_kernel->run(cmd, scratch_buf, mem_length);
+
+					vk::insert_buffer_memory_barrier(cmd, scratch_buf->value, 0, mem_length, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT,
+						VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_TRANSFER_READ_BIT);
+
+					auto tmp = vk::get_typeless_helper(src_image->format(), src_image->format_class(), section.dst_x + section.dst_w, section.dst_y + section.dst_h);
+					tmp->change_layout(cmd, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL);
+
+					copy.imageOffset = { 0, 0, 0 };
+					vkCmdCopyBufferToImage(cmd, scratch_buf->value, tmp->value, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 1, &copy);
+
+					dst_x = 0;
+					dst_y = 0;
+
+					if (src_w != section.dst_w || src_h != section.dst_h)
+					{
+						// Optionally scale if needed
+						if (tmp == _dst) [[unlikely]]
+						{
+							dst_y = src_h;
+						}
+
+						vk::copy_scaled_image(cmd, tmp, _dst,
+							areai{ 0, 0, src_w, static_cast<s32>(src_h) },
+							coordi{ { dst_x, dst_y }, { section.dst_w, section.dst_h } },
+							1, tmp->info.format == _dst->info.format,
+							VK_FILTER_NEAREST);
+					}
+					else
+					{
+						_dst = tmp;
+					}
+				}
+				else
+				{
+					fmt::throw_exception("Unreachable");
+				}
+
+				if (_dst != dst) [[unlikely]]
+				{
+					// Casting comes after the scaling!
+					VkImageCopy copy_rgn;
+					copy_rgn.srcOffset = { s32(dst_x), s32(dst_y), 0 };
+					copy_rgn.dstOffset = { section.dst_x, section.dst_y, 0 };
+					copy_rgn.dstSubresource = { dst_aspect, section.level, 0, 1 };
+					copy_rgn.srcSubresource = { _dst->aspect(), 0, 0, 1 };
+					copy_rgn.extent = { section.dst_w, section.dst_h, 1 };
+
+					_dst->change_layout(cmd, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL);
+					vkCmdCopyImage(cmd, _dst->value, _dst->current_layout, dst->value, dst->current_layout, 1, &copy_rgn);
+				}
+			}
+
+			section.src->pop_layout(cmd);
+		}
+	}
+}
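copy_transfer_regions_impl above rescales source coordinates between texel widths and uses utils::aligned_div so that converted widths round up rather than truncate. A standalone sketch of that conversion (hypothetical sizes; aligned_div restates the helper added later in this patch):

    #include <cstdint>

    // Same formula as utils::aligned_div: ceiling division
    constexpr std::uint32_t aligned_div(std::uint32_t value, std::uint32_t align)
    {
        return (value + align - 1) / align;
    }

    // A width of 13 four-byte (dst) texels expressed in two-byte (src) texels:
    static_assert(aligned_div(13u * 4u, 2u) == 26);

    // Rounding up matters when the byte size does not divide evenly:
    // 13 two-byte texels = 26 bytes, which span 7 four-byte texels, not 6.
    static_assert(aligned_div(13u * 2u, 4u) == 7);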
diff --git a/rpcs3/Emu/RSX/VK/VKTextureCache.h b/rpcs3/Emu/RSX/VK/VKTextureCache.h
index 31471673f6..fdeaa08b89 100644
--- a/rpcs3/Emu/RSX/VK/VKTextureCache.h
+++ b/rpcs3/Emu/RSX/VK/VKTextureCache.h
@@ -167,160 +167,7 @@ namespace vk
 			return flushed;
 		}
 
-		void dma_transfer(vk::command_buffer& cmd, vk::image* src, const areai& src_area, const utils::address_range& valid_range, u32 pitch)
-		{
-			ensure(src->samples() == 1);
-
-			if (!m_device)
-			{
-				m_device = &cmd.get_command_pool().get_owner();
-			}
-
-			if (dma_fence)
-			{
-				// NOTE: This can be reached if previously synchronized, or a special path happens.
-				// If a hard flush occurred while this surface was flush_always the cache would have reset its protection afterwards.
-				// DMA resource would still be present but already used to flush previously.
-				vk::get_resource_manager()->dispose(dma_fence);
-			}
-
-			if (vk::is_renderpass_open(cmd))
-			{
-				vk::end_renderpass(cmd);
-			}
-
-			src->push_layout(cmd, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL);
-
-			const auto internal_bpp = vk::get_format_texel_width(src->format());
-			const auto transfer_width = static_cast<u32>(src_area.width());
-			const auto transfer_height = static_cast<u32>(src_area.height());
-			real_pitch = internal_bpp * transfer_width;
-			rsx_pitch = pitch;
-
-			const bool require_format_conversion = !!(src->aspect() & VK_IMAGE_ASPECT_STENCIL_BIT) || src->format() == VK_FORMAT_D32_SFLOAT;
-			if (require_format_conversion || pack_unpack_swap_bytes)
-			{
-				const auto section_length = valid_range.length();
-				const auto transfer_pitch = real_pitch;
-				const auto task_length = transfer_pitch * src_area.height();
-
-				auto working_buffer = vk::get_scratch_buffer(task_length);
-				auto final_mapping = vk::map_dma(cmd, valid_range.start, section_length);
-
-				VkBufferImageCopy region = {};
-				region.imageSubresource = { src->aspect(), 0, 0, 1 };
-				region.imageOffset = { src_area.x1, src_area.y1, 0 };
-				region.imageExtent = { transfer_width, transfer_height, 1 };
-				vk::copy_image_to_buffer(cmd, src, working_buffer, region, (require_format_conversion && pack_unpack_swap_bytes));
-
-				// NOTE: For depth/stencil formats, copying to buffer and byteswap are combined into one step above
-				if (pack_unpack_swap_bytes && !require_format_conversion)
-				{
-					const auto texel_layout = vk::get_format_element_size(src->format());
-					const auto elem_size = texel_layout.first;
-					vk::cs_shuffle_base *shuffle_kernel;
-
-					if (elem_size == 2)
-					{
-						shuffle_kernel = vk::get_compute_task<vk::cs_shuffle_16>();
-					}
-					else if (elem_size == 4)
-					{
-						shuffle_kernel = vk::get_compute_task<vk::cs_shuffle_32>();
-					}
-					else
-					{
-						ensure(get_context() == rsx::texture_upload_context::dma);
-						shuffle_kernel = nullptr;
-					}
-
-					if (shuffle_kernel)
-					{
-						vk::insert_buffer_memory_barrier(cmd, working_buffer->value, 0, task_length,
-							VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
-							VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT);
-
-						shuffle_kernel->run(cmd, working_buffer, task_length);
-
-						vk::insert_buffer_memory_barrier(cmd, working_buffer->value, 0, task_length,
-							VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT,
-							VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_TRANSFER_READ_BIT);
-					}
-				}
-
-				if (rsx_pitch == real_pitch) [[likely]]
-				{
-					VkBufferCopy copy = {};
-					copy.dstOffset = final_mapping.first;
-					copy.size = section_length;
-					vkCmdCopyBuffer(cmd, working_buffer->value, final_mapping.second->value, 1, &copy);
-				}
-				else
-				{
-					if (context != rsx::texture_upload_context::dma)
-					{
-						// Partial load for the bits outside the existing image
-						// NOTE: A true DMA section would have been prepped beforehand
-						// TODO: Parial range load/flush
-						vk::load_dma(valid_range.start, section_length);
-					}
-
-					std::vector<VkBufferCopy> copy;
-					copy.reserve(transfer_height);
-
-					u32 dst_offset = final_mapping.first;
-					u32 src_offset = 0;
-
-					for (unsigned row = 0; row < transfer_height; ++row)
-					{
-						copy.push_back({ src_offset, dst_offset, transfer_pitch });
-						src_offset += real_pitch;
-						dst_offset += rsx_pitch;
-					}
-
-					vkCmdCopyBuffer(cmd, working_buffer->value, final_mapping.second->value, transfer_height, copy.data());
-				}
-			}
-			else
-			{
-				VkBufferImageCopy region = {};
-				region.bufferRowLength = (rsx_pitch / internal_bpp);
-				region.imageSubresource = { src->aspect(), 0, 0, 1 };
-				region.imageOffset = { src_area.x1, src_area.y1, 0 };
-				region.imageExtent = { transfer_width, transfer_height, 1 };
-
-				auto mapping = vk::map_dma(cmd, valid_range.start, valid_range.length());
-				region.bufferOffset = mapping.first;
-				vkCmdCopyImageToBuffer(cmd, src->value, src->current_layout, mapping.second->value, 1, &region);
-			}
-
-			src->pop_layout(cmd);
-
-			// Create event object for this transfer and queue signal op
-			dma_fence = std::make_unique<vk::event>(*m_device);
-			dma_fence->signal(cmd, VK_PIPELINE_STAGE_TRANSFER_BIT);
-
-			// Set cb flag for queued dma operations
-			cmd.set_flag(vk::command_buffer::cb_has_dma_transfer);
-
-			if (get_context() == rsx::texture_upload_context::dma)
-			{
-				// Save readback hint in case transformation is required later
-				switch (internal_bpp)
-				{
-				case 2:
-					gcm_format = CELL_GCM_TEXTURE_R5G6B5;
-					break;
-				case 4:
-				default:
-					gcm_format = CELL_GCM_TEXTURE_A8R8G8B8;
-					break;
-				}
-			}
-
-			synchronized = true;
-			sync_timestamp = get_system_time();
-		}
+		void dma_transfer(vk::command_buffer& cmd, vk::image* src, const areai& src_area, const utils::address_range& valid_range, u32 pitch);
 
 		void copy_texture(vk::command_buffer& cmd, bool miss)
 		{
@@ -610,202 +457,7 @@ namespace vk
 			return mapping;
 		}
 
-		void copy_transfer_regions_impl(vk::command_buffer& cmd, vk::image* dst, const std::vector<copy_region_descriptor>& sections_to_transfer) const
-		{
-			const auto dst_aspect = dst->aspect();
-			const auto dst_bpp = vk::get_format_texel_width(dst->format());
-
-			for (const auto &section : sections_to_transfer)
-			{
-				if (!section.src)
-					continue;
-
-				const bool typeless = section.src->aspect() != dst_aspect ||
-					!formats_are_bitcast_compatible(dst, section.src);
-
-				// Avoid inserting unnecessary barrier GENERAL->TRANSFER_SRC->GENERAL in active render targets
-				const auto preferred_layout = (section.src->current_layout != VK_IMAGE_LAYOUT_GENERAL) ?
-					VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL : VK_IMAGE_LAYOUT_GENERAL;
-
-				section.src->push_layout(cmd, preferred_layout);
-
-				auto src_image = section.src;
-				auto src_x = section.src_x;
-				auto src_y = section.src_y;
-				auto src_w = section.src_w;
-				auto src_h = section.src_h;
-
-				rsx::flags32_t transform = section.xform;
-				if (section.xform == rsx::surface_transform::coordinate_transform)
-				{
-					// Dimensions were given in 'dst' space. Work out the real source coordinates
-					const auto src_bpp = vk::get_format_texel_width(section.src->format());
-					src_x = (src_x * dst_bpp) / src_bpp;
-					src_w = ::aligned_div(src_w * dst_bpp, src_bpp);
-
-					transform &= ~(rsx::surface_transform::coordinate_transform);
-				}
-
-				if (auto surface = dynamic_cast<vk::render_target*>(section.src))
-				{
-					surface->transform_samples_to_pixels(src_x, src_w, src_y, src_h);
-				}
-
-				if (typeless) [[unlikely]]
-				{
-					const auto src_bpp = vk::get_format_texel_width(section.src->format());
-					const u16 convert_w = u16(src_w * src_bpp) / dst_bpp;
-					const u16 convert_x = u16(src_x * src_bpp) / dst_bpp;
-
-					if (convert_w == section.dst_w && src_h == section.dst_h &&
-						transform == rsx::surface_transform::identity &&
-						section.level == 0 && section.dst_z == 0)
-					{
-						// Optimization to avoid double transfer
-						// TODO: Handle level and layer offsets
-						const areai src_rect = coordi{{ src_x, src_y }, { src_w, src_h }};
-						const areai dst_rect = coordi{{ section.dst_x, section.dst_y }, { section.dst_w, section.dst_h }};
-						vk::copy_image_typeless(cmd, section.src, dst, src_rect, dst_rect, 1);
-
-						section.src->pop_layout(cmd);
-						continue;
-					}
-
-					src_image = vk::get_typeless_helper(dst->format(), dst->format_class(), convert_x + convert_w, src_y + src_h);
-					src_image->change_layout(cmd, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL);
-
-					const areai src_rect = coordi{{ src_x, src_y }, { src_w, src_h }};
-					const areai dst_rect = coordi{{ convert_x, src_y }, { convert_w, src_h }};
-					vk::copy_image_typeless(cmd, section.src, src_image, src_rect, dst_rect, 1);
-					src_image->change_layout(cmd, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL);
-
-					src_x = convert_x;
-					src_w = convert_w;
-				}
-
-				ensure(src_image->current_layout == VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL || src_image->current_layout == VK_IMAGE_LAYOUT_GENERAL);
-
-				// Final aspect mask of the 'final' transfer source
-				const auto new_src_aspect = src_image->aspect();
-
-				if (src_w == section.dst_w && src_h == section.dst_h && transform == rsx::surface_transform::identity) [[likely]]
-				{
-					VkImageCopy copy_rgn;
-					copy_rgn.srcOffset = { src_x, src_y, 0 };
-					copy_rgn.dstOffset = { section.dst_x, section.dst_y, 0 };
-					copy_rgn.dstSubresource = { dst_aspect, 0, 0, 1 };
-					copy_rgn.srcSubresource = { new_src_aspect, 0, 0, 1 };
-					copy_rgn.extent = { src_w, src_h, 1 };
-
-					if (dst->info.imageType == VK_IMAGE_TYPE_3D)
-					{
-						copy_rgn.dstOffset.z = section.dst_z;
-					}
-					else
-					{
-						copy_rgn.dstSubresource.baseArrayLayer = section.dst_z;
-						copy_rgn.dstSubresource.mipLevel = section.level;
-					}
-
-					vkCmdCopyImage(cmd, src_image->value, src_image->current_layout, dst->value, dst->current_layout, 1, &copy_rgn);
-				}
-				else
-				{
-					ensure(section.dst_z == 0);
-
-					u16 dst_x = section.dst_x, dst_y = section.dst_y;
-					vk::image* _dst;
-
-					if (src_image->info.format == dst->info.format && section.level == 0) [[likely]]
-					{
-						_dst = dst;
-					}
-					else
-					{
-						// Either a bitcast is required or a scale+copy to mipmap level
-						_dst = vk::get_typeless_helper(src_image->format(), src_image->format_class(), dst->width(), dst->height() * 2);
-						_dst->change_layout(cmd, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL);
-					}
-
-					if (transform == rsx::surface_transform::identity)
-					{
-						vk::copy_scaled_image(cmd, src_image, _dst,
-							coordi{ { src_x, src_y }, { src_w, src_h } },
-							coordi{ { section.dst_x, section.dst_y }, { section.dst_w, section.dst_h } },
-							1, src_image->format() == _dst->format(),
-							VK_FILTER_NEAREST);
-					}
-					else if (transform == rsx::surface_transform::argb_to_bgra)
-					{
-						VkBufferImageCopy copy{};
-						copy.imageExtent = { src_w, src_h, 1 };
-						copy.imageOffset = { src_x, src_y, 0 };
-						copy.imageSubresource = { src_image->aspect(), 0, 0, 1 };
-
-						const auto mem_length = src_w * src_h * dst_bpp;
-						auto scratch_buf = vk::get_scratch_buffer(mem_length);
-						vkCmdCopyImageToBuffer(cmd, src_image->value, src_image->current_layout, scratch_buf->value, 1, &copy);
-
-						vk::insert_buffer_memory_barrier(cmd, scratch_buf->value, 0, mem_length, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
-							VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT);
-
-						auto shuffle_kernel = vk::get_compute_task<vk::cs_shuffle_32>();
-						shuffle_kernel->run(cmd, scratch_buf, mem_length);
-
-						vk::insert_buffer_memory_barrier(cmd, scratch_buf->value, 0, mem_length, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT,
-							VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_TRANSFER_READ_BIT);
-
-						auto tmp = vk::get_typeless_helper(src_image->format(), src_image->format_class(), section.dst_x + section.dst_w, section.dst_y + section.dst_h);
-						tmp->change_layout(cmd, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL);
-
-						copy.imageOffset = { 0, 0, 0 };
-						vkCmdCopyBufferToImage(cmd, scratch_buf->value, tmp->value, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 1, &copy);
-
-						dst_x = 0;
-						dst_y = 0;
-
-						if (src_w != section.dst_w || src_h != section.dst_h)
-						{
-							// Optionally scale if needed
-							if (tmp == _dst) [[unlikely]]
-							{
-								dst_y = src_h;
-							}
-
-							vk::copy_scaled_image(cmd, tmp, _dst,
-								areai{ 0, 0, src_w, static_cast<s32>(src_h) },
-								coordi{ { dst_x, dst_y }, { section.dst_w, section.dst_h } },
-								1, tmp->info.format == _dst->info.format,
-								VK_FILTER_NEAREST);
-						}
-						else
-						{
-							_dst = tmp;
-						}
-					}
-					else
-					{
-						fmt::throw_exception("Unreachable");
-					}
-
-					if (_dst != dst) [[unlikely]]
-					{
-						// Casting comes after the scaling!
-						VkImageCopy copy_rgn;
-						copy_rgn.srcOffset = { s32(dst_x), s32(dst_y), 0 };
-						copy_rgn.dstOffset = { section.dst_x, section.dst_y, 0 };
-						copy_rgn.dstSubresource = { dst_aspect, section.level, 0, 1 };
-						copy_rgn.srcSubresource = { _dst->aspect(), 0, 0, 1 };
-						copy_rgn.extent = { section.dst_w, section.dst_h, 1 };
-
-						_dst->change_layout(cmd, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL);
-						vkCmdCopyImage(cmd, _dst->value, _dst->current_layout, dst->value, dst->current_layout, 1, &copy_rgn);
-					}
-				}
-
-				section.src->pop_layout(cmd);
-			}
-		}
+		void copy_transfer_regions_impl(vk::command_buffer& cmd, vk::image* dst, const std::vector<copy_region_descriptor>& sections_to_transfer) const;
 
 		vk::image* get_template_from_collection_impl(const std::vector<copy_region_descriptor>& sections_to_transfer) const
 		{
diff --git a/rpcs3/GLGSRender.vcxproj b/rpcs3/GLGSRender.vcxproj
index f49cbef01e..e2ae2e13ad 100644
--- a/rpcs3/GLGSRender.vcxproj
+++ b/rpcs3/GLGSRender.vcxproj
@@ -107,6 +107,7 @@
+
diff --git a/rpcs3/GLGSRender.vcxproj.filters b/rpcs3/GLGSRender.vcxproj.filters
index 8d24eb74ba..7ba169eba9 100644
--- a/rpcs3/GLGSRender.vcxproj.filters
+++ b/rpcs3/GLGSRender.vcxproj.filters
@@ -14,6 +14,7 @@
+
diff --git a/rpcs3/Loader/PSF.cpp b/rpcs3/Loader/PSF.cpp
index 60d8cc875f..6d0952aa15 100644
--- a/rpcs3/Loader/PSF.cpp
+++ b/rpcs3/Loader/PSF.cpp
@@ -1,6 +1,8 @@
 #include "stdafx.h"
 #include "PSF.h"
 
+#include "util/asm.hpp"
+
 LOG_CHANNEL(psf_log, "PSF");
 
 template<>
@@ -208,7 +210,7 @@ namespace psf
 	}
 
 	// Align next section (data) offset
-	key_offset = ::align(key_offset, 4);
+	key_offset = utils::align(key_offset, 4);
 
 	// Generate header
 	header_t header;
diff --git a/rpcs3/VKGSRender.vcxproj b/rpcs3/VKGSRender.vcxproj
index 3f3eabafce..be1582e34e 100644
--- a/rpcs3/VKGSRender.vcxproj
+++ b/rpcs3/VKGSRender.vcxproj
@@ -67,6 +67,7 @@
+
diff --git a/rpcs3/VKGSRender.vcxproj.filters b/rpcs3/VKGSRender.vcxproj.filters
index d9ff7e0d59..8ce096bb50 100644
--- a/rpcs3/VKGSRender.vcxproj.filters
+++ b/rpcs3/VKGSRender.vcxproj.filters
@@ -18,6 +18,7 @@
+
diff --git a/rpcs3/rpcs3qt/cheat_manager.cpp b/rpcs3/rpcs3qt/cheat_manager.cpp
index b534f8b347..ea7abd3c92 100644
--- a/rpcs3/rpcs3qt/cheat_manager.cpp
+++ b/rpcs3/rpcs3qt/cheat_manager.cpp
@@ -18,6 +18,7 @@
 #include "Emu/Cell/PPUFunction.h"
 
 #include "util/yaml.hpp"
+#include "util/asm.hpp"
 #include "util/to_endian.hpp"
 #include "Utilities/StrUtil.h"
 #include "Utilities/bin_patch.h" // get_patches_path()
@@ -418,17 +419,17 @@ bool cheat_engine::set_value(const u32 offset, const T value)
 
 		if (exec_code_at_end && exec_code_at_start)
 		{
-			size = align(addr + size, 4) - (addr & -4);
+			size = utils::align(addr + size, 4) - (addr & -4);
 			addr &= -4;
 		}
 		else if (exec_code_at_end)
 		{
-			size -= align(size - 4096 + (addr & 4095), 4);
-			addr = align(addr, 4096);
+			size -= utils::align(size - 4096 + (addr & 4095), 4);
+			addr = utils::align(addr, 4096);
 		}
 		else if (exec_code_at_start)
 		{
-			size = align(4096 - (addr & 4095), 4);
+			size = utils::align(4096 - (addr & 4095), 4);
 			addr &= -4;
 		}
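The cheat_engine hunk above widens a patched range to whole words: masking with -4 rounds the start address down while utils::align rounds the end up. A standalone sketch of that arithmetic (hypothetical address and size; align_up restates the utils::align formula, and addr & ~3u is equivalent to the patch's addr & -4):

    #include <cstdint>

    constexpr std::uint32_t align_up(std::uint32_t value, std::uint32_t align)
    {
        return (value + (align - 1)) & (0 - align);
    }

    constexpr std::uint32_t addr = 0x1003, size = 6; // touches bytes 0x1003..0x1008
    constexpr std::uint32_t new_addr = addr & ~3u;                            // start rounded down
    constexpr std::uint32_t new_size = align_up(addr + size, 4) - (addr & ~3u); // end rounded up

    static_assert(new_addr == 0x1000 && new_size == 12, "range widens to 0x1000..0x100c");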
diff --git a/rpcs3/rpcs3qt/debugger_frame.cpp b/rpcs3/rpcs3qt/debugger_frame.cpp
index 011c0e49fa..7ab9da67ee 100644
--- a/rpcs3/rpcs3qt/debugger_frame.cpp
+++ b/rpcs3/rpcs3qt/debugger_frame.cpp
@@ -27,6 +27,8 @@
 #include
 #include
 
+#include "util/asm.hpp"
+
 constexpr auto qstr = QString::fromStdString;
 
 debugger_frame::debugger_frame(std::shared_ptr<gui_settings> settings, QWidget *parent)
@@ -573,7 +575,7 @@ void debugger_frame::ShowGotoAddressDialog()
 	if (cpu)
 	{
 		// -1 turns into 0
-		u32 pc = ::align(cpu->get_pc(), 4);
+		u32 pc = utils::align(cpu->get_pc(), 4);
 		address_preview_label->setText(QString("Address: 0x%1").arg(pc, 8, 16, QChar('0')));
 		expression_input->setPlaceholderText(QString("0x%1").arg(pc, 8, 16, QChar('0')));
 	}
@@ -605,7 +607,7 @@ void debugger_frame::ShowGotoAddressDialog()
 	if (diag->exec() == QDialog::Accepted)
 	{
 		// -1 turns into 0
-		u32 address = ::align(cpu ? cpu->get_pc() : 0, 4);
+		u32 address = utils::align(cpu ? cpu->get_pc() : 0, 4);
 
 		if (expression_input->text().isEmpty())
 		{
diff --git a/rpcs3/rpcs3qt/memory_viewer_panel.cpp b/rpcs3/rpcs3qt/memory_viewer_panel.cpp
index 7e91f6009c..2e79798db8 100644
--- a/rpcs3/rpcs3qt/memory_viewer_panel.cpp
+++ b/rpcs3/rpcs3qt/memory_viewer_panel.cpp
@@ -15,6 +15,8 @@
 #include
 #include
 
+#include "util/asm.hpp"
+
 constexpr auto qstr = QString::fromStdString;
 
 memory_viewer_panel::memory_viewer_panel(QWidget* parent, u32 addr)
@@ -209,7 +211,7 @@ memory_viewer_panel::memory_viewer_panel(QWidget* parent, u32 addr)
 	{
 		bool ok;
 		const QString text = m_addr_line->text();
-		m_addr = (text.startsWith("0x", Qt::CaseInsensitive) ? text.right(text.size() - 2) : text).toULong(&ok, 16);
+		m_addr = (text.startsWith("0x", Qt::CaseInsensitive) ? text.right(text.size() - 2) : text).toULong(&ok, 16);
 		m_addr -= m_addr % (m_colcount * 4); // Align by amount of bytes in a row
 		m_addr_line->setText(QString("%1").arg(m_addr, 8, 16, QChar('0'))); // get 8 digits in input line
 		ShowMemory();
@@ -293,7 +295,7 @@ void memory_viewer_panel::resizeEvent(QResizeEvent *event)
 std::string memory_viewer_panel::getHeaderAtAddr(u32 addr)
 {
 	// Check if its an SPU Local Storage beginning
-	const u32 spu_boundary = ::align(addr, SPU_LS_SIZE);
+	const u32 spu_boundary = utils::align(addr, SPU_LS_SIZE);
 
 	if (spu_boundary <= addr + m_colcount * 4 - 1)
 	{
diff --git a/rpcs3/rpcs3qt/register_editor_dialog.cpp b/rpcs3/rpcs3qt/register_editor_dialog.cpp
index ecf4268579..b16a288932 100644
--- a/rpcs3/rpcs3qt/register_editor_dialog.cpp
+++ b/rpcs3/rpcs3qt/register_editor_dialog.cpp
@@ -15,6 +15,7 @@
 #include
 
 #include "util/v128.hpp"
+#include "util/asm.hpp"
 
 constexpr auto qstr = QString::fromStdString;
 inline std::string sstr(const QString& _in) { return _in.toStdString(); }
@@ -30,7 +31,7 @@ enum registers : int
 	ppu_ff31 = ppu_ff0 + 31,
 	ppu_v0,
 	ppu_v31 = ppu_v0 + 31,
-	spu_r0 = ::align(ppu_v31 + 1u, 128),
+	spu_r0 = utils::align(ppu_v31 + 1u, 128),
 	spu_r127 = spu_r0 + 127,
 	PPU_CR,
 	PPU_LR,
diff --git a/rpcs3/rpcs3qt/settings_dialog.cpp b/rpcs3/rpcs3qt/settings_dialog.cpp
index c0761f2c1e..9ec73cf9ae 100644
--- a/rpcs3/rpcs3qt/settings_dialog.cpp
+++ b/rpcs3/rpcs3qt/settings_dialog.cpp
@@ -34,6 +34,7 @@
 #include
 
 #include "util/sysinfo.hpp"
+#include "util/asm.hpp"
 
 #ifdef WITH_DISCORD_RPC
 #include "_discord_utils.h"
@@ -1809,7 +1810,7 @@ void settings_dialog::SnapSlider(QSlider *slider, int interval)
 		{
 			return;
 		}
-		slider->setValue(::rounded_div(value, interval) * interval);
+		slider->setValue(utils::rounded_div(value, interval) * interval);
 	});
 }
diff --git a/rpcs3/util/asm.hpp b/rpcs3/util/asm.hpp
index 0d8c24bb3a..ef72fbb50e 100644
--- a/rpcs3/util/asm.hpp
+++ b/rpcs3/util/asm.hpp
@@ -292,6 +292,32 @@ namespace utils
 		do _mm_pause();
 		while (__rdtsc() - start < cycles);
 	}
+
+	// Align to power of 2
+	template <typename T, typename = std::enable_if_t<std::is_integral<T>::value && std::is_unsigned<T>::value>>
+	constexpr T align(T value, ullong align)
+	{
+		return static_cast<T>((value + (align - 1)) & (0 - align));
+	}
+
+	// General purpose aligned division, the result is rounded up not truncated
+	template <typename T, typename = std::enable_if_t<std::is_integral<T>::value && std::is_unsigned<T>::value>>
+	constexpr T aligned_div(T value, ullong align)
+	{
+		return static_cast<T>((value + align - 1) / align);
+	}
+
+	// General purpose aligned division, the result is rounded to nearest
+	template <typename T, typename = std::enable_if_t<std::is_integral<T>::value>>
+	constexpr T rounded_div(T value, std::conditional_t<std::is_signed<T>::value, llong, ullong> align)
+	{
+		if constexpr (std::is_unsigned<T>::value)
+		{
+			return static_cast<T>((value + (align / 2)) / align);
+		}
+
+		return static_cast<T>((value + (value < 0 ? 0 - align : align) / 2) / align);
+	}
 } // namespace utils
 
 using utils::busy_wait;
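A few compile-time checks of the three helpers just added to the utils namespace (a quick sanity sketch, assuming util/asm.hpp as modified above is on the include path):

    #include "util/asm.hpp"

    static_assert(utils::align(13u, 8) == 16);       // round up to a power-of-2 boundary
    static_assert(utils::align(16u, 8) == 16);       // already-aligned values are unchanged
    static_assert(utils::aligned_div(13u, 8) == 2);  // ceiling division, works for any divisor
    static_assert(utils::rounded_div(13u, 8) == 2);  // round to nearest: 13/8 = 1.625 -> 2
    static_assert(utils::rounded_div(-13, 8) == -2); // signed values round symmetrically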
diff --git a/rpcs3/util/sysinfo.cpp b/rpcs3/util/sysinfo.cpp
index 826ec1220d..d9d798766b 100755
--- a/rpcs3/util/sysinfo.cpp
+++ b/rpcs3/util/sysinfo.cpp
@@ -15,6 +15,8 @@
 #include
 #endif
 
+#include "util/asm.hpp"
+
 inline std::array<u32, 4> utils::get_cpuid(u32 func, u32 subfunc)
 {
 	int regs[4];
@@ -298,7 +300,7 @@ std::string utils::get_OS_version()
 
 static constexpr ullong round_tsc(ullong val)
 {
-	return ::rounded_div(val, 1'000'000) * 1'000'000;
+	return utils::rounded_div(val, 1'000'000) * 1'000'000;
 }
 
 ullong utils::get_tsc_freq()
diff --git a/rpcs3/util/types.hpp b/rpcs3/util/types.hpp
index 93ceb1c65b..c591badcdb 100644
--- a/rpcs3/util/types.hpp
+++ b/rpcs3/util/types.hpp
@@ -595,31 +595,6 @@ struct f16
 	}
 };
 
-template <typename T, typename = std::enable_if_t<std::is_integral<T>::value && std::is_unsigned<T>::value>>
-constexpr T align(T value, ullong align)
-{
-	return static_cast<T>((value + (align - 1)) & (0 - align));
-}
-
-// General purpose aligned division, the result is rounded up not truncated
-template <typename T, typename = std::enable_if_t<std::is_integral<T>::value && std::is_unsigned<T>::value>>
-constexpr T aligned_div(T value, ullong align)
-{
-	return static_cast<T>((value + align - 1) / align);
-}
-
-// General purpose aligned division, the result is rounded to nearest
-template <typename T, typename = std::enable_if_t<std::is_integral<T>::value>>
-constexpr T rounded_div(T value, std::conditional_t<std::is_signed<T>::value, llong, ullong> align)
-{
-	if constexpr (std::is_unsigned<T>::value)
-	{
-		return static_cast<T>((value + (align / 2)) / align);
-	}
-
-	return static_cast<T>((value + (value < 0 ? 0 - align : align) / 2) / align);
-}
-
 template <typename T, typename T2>
 inline u32 offset32(T T2::*const mptr)
 {
diff --git a/rpcs3/util/vm_native.cpp b/rpcs3/util/vm_native.cpp
index 47b132e73a..7320c3b75e 100644
--- a/rpcs3/util/vm_native.cpp
+++ b/rpcs3/util/vm_native.cpp
@@ -1,6 +1,7 @@
 #include "stdafx.h"
 #include "util/logs.hpp"
 #include "util/vm.hpp"
+#include "util/asm.hpp"
 #ifdef _WIN32
 #include "util/dyn_lib.hpp"
 #include
@@ -209,7 +210,7 @@ namespace utils
 	}
 
 	shm::shm(u32 size, u32 flags)
-		: m_size(::align(size, 0x10000))
+		: m_size(utils::align(size, 0x10000))
 		, m_flags(flags)
 		, m_ptr(0)
 	{
@@ -306,7 +307,7 @@ namespace utils
 	{
 		const u64 res64 = reinterpret_cast<u64>(::mmap(reinterpret_cast<void*>(ptr64), m_size + 0xf000, PROT_NONE, MAP_ANON | MAP_PRIVATE, -1, 0));
 
-		const u64 aligned = ::align(res64, 0x10000);
+		const u64 aligned = utils::align(res64, 0x10000);
 		const auto result = ::mmap(reinterpret_cast<void*>(aligned), m_size, +prot, MAP_SHARED | MAP_FIXED, m_file, 0);
 
 		// Now cleanup remnants