Move align helpers to util/asm.hpp

Also add some files: GLTextureCache.cpp VKTextureCache.cpp
2024-11-25 04:02:42 +01:00 · 2020-12-18 17:43:34 +03:00 · 2020-12-18 17:43:34 +03:00 · eec11bfba9
commit eec11bfba9
parent d254a5736b
52 changed files with 794 additions and 713 deletions
--- a/Utilities/File.cpp
+++ b/Utilities/File.cpp
@ -10,6 +10,8 @@
 #include <typeinfo>
 #include <map>
 #include "util/asm.hpp"
 using namespace std::literals::string_literals;
 #ifdef _WIN32
@ -1725,7 +1727,7 @@ u64 fs::get_dir_size(const std::string& path, u64 rounding_alignment)
 		if (!entry.is_directory)
 		{
-			result += ::align(entry.size, rounding_alignment);
+			result += utils::align(entry.size, rounding_alignment);
 		}
 		else
 		{
--- a/Utilities/JIT.cpp
+++ b/Utilities/JIT.cpp
@ -6,6 +6,7 @@
 #include "util/logs.hpp"
 #include "mutex.h"
 #include "util/vm.hpp"
 #include "util/asm.hpp"
 #include <immintrin.h>
 #include <zlib.h>
@ -52,8 +53,8 @@ static u8* add_jit_memory(usz size, uint align)
 	// Simple allocation by incrementing pointer to the next free data
 	const u64 pos = Ctr.atomic_op([&](u64& ctr) -> u64
 	{
-		const u64 _pos = ::align(ctr & 0xffff'ffff, align);
+		const u64 _pos = utils::align(ctr & 0xffff'ffff, align);
-		const u64 _new = ::align(_pos + size, align);
+		const u64 _new = utils::align(_pos + size, align);
 		if (_new > 0x40000000) [[unlikely]]
 		{
@ -69,7 +70,7 @@ static u8* add_jit_memory(usz size, uint align)
 		// Check the necessity to commit more memory
 		if (_new > olda) [[unlikely]]
 		{
-			newa = ::align(_new, 0x200000);
+			newa = utils::align(_new, 0x200000);
 		}
 		ctr += _new - (ctr & 0xffff'ffff);
@ -223,7 +224,7 @@ asmjit::Runtime& asmjit::get_global_runtime()
 				return asmjit::kErrorNoCodeGenerated;
 			}
-			void* p = m_pos.fetch_add(::align(codeSize, 4096));
+			void* p = m_pos.fetch_add(utils::align(codeSize, 4096));
 			if (!p || m_pos > m_max) [[unlikely]]
 			{
 				*dst = nullptr;
@ -237,7 +238,7 @@ asmjit::Runtime& asmjit::get_global_runtime()
 				return asmjit::kErrorInvalidState;
 			}
-			utils::memory_protect(p, ::align(codeSize, 4096), utils::protection::rx);
+			utils::memory_protect(p, utils::align(codeSize, 4096), utils::protection::rx);
 			flush(p, relocSize);
 			*dst = p;
@ -351,8 +352,8 @@ struct MemoryManager1 : llvm::RTDyldMemoryManager
 			return nullptr;
 		}
-		const u64 olda = ::align(oldp, align);
+		const u64 olda = utils::align(oldp, align);
-		const u64 newp = ::align(olda + size, align);
+		const u64 newp = utils::align(olda + size, align);
 		if ((newp - 1) / c_max_size != oldp / c_max_size)
 		{
@ -363,8 +364,8 @@ struct MemoryManager1 : llvm::RTDyldMemoryManager
 		if ((oldp - 1) / c_page_size != (newp - 1) / c_page_size)
 		{
 			// Allocate pages on demand
-			const u64 pagea = ::align(oldp, c_page_size);
+			const u64 pagea = utils::align(oldp, c_page_size);
-			const u64 psize = ::align(newp - pagea, c_page_size);
+			const u64 psize = utils::align(newp - pagea, c_page_size);
 			utils::memory_commit(this->ptr + pagea, psize, prot);
 		}
--- a/rpcs3/Crypto/unedat.cpp
+++ b/rpcs3/Crypto/unedat.cpp
@ -6,6 +6,7 @@
 #include <cmath>
 #include "util/v128.hpp"
 #include "util/asm.hpp"
 LOG_CHANNEL(edat_log, "EDAT");
@ -949,7 +950,7 @@ bool EDATADecrypter::ReadHeader()
 	}*/
 	file_size = edatHeader.file_size;
-	total_blocks = ::aligned_div(edatHeader.file_size, edatHeader.block_size);
+	total_blocks = utils::aligned_div(edatHeader.file_size, edatHeader.block_size);
 	return true;
 }
@ -962,7 +963,7 @@ u64 EDATADecrypter::ReadData(u64 pos, u8* data, u64 size)
 	// now we need to offset things to account for the actual 'range' requested
 	const u64 startOffset = pos % edatHeader.block_size;
-	const u32 num_blocks = static_cast<u32>(::aligned_div(startOffset + size, edatHeader.block_size));
+	const u32 num_blocks = static_cast<u32>(utils::aligned_div(startOffset + size, edatHeader.block_size));
 	const u64 bufSize = num_blocks*edatHeader.block_size;
 	if (data_buf_size < (bufSize))
 	{
--- a/rpcs3/Emu/CMakeLists.txt
+++ b/rpcs3/Emu/CMakeLists.txt
@ -428,6 +428,7 @@ target_sources(rpcs3_emu PRIVATE
 	RSX/GL/GLTexture.cpp
 	RSX/GL/GLVertexBuffers.cpp
 	RSX/GL/GLVertexProgram.cpp
 	RSX/GL/GLTextureCache.cpp
 	RSX/GL/OpenGL.cpp
 )
@ -454,6 +455,7 @@ if(TARGET 3rdparty_vulkan)
 		RSX/VK/VKTexture.cpp
 		RSX/VK/VKVertexBuffers.cpp
 		RSX/VK/VKVertexProgram.cpp
 		RSX/VK/VKTextureCache.cpp
 	)
 endif()
--- a/rpcs3/Emu/Cell/Modules/cellDmux.cpp
+++ b/rpcs3/Emu/Cell/Modules/cellDmux.cpp
@ -7,7 +7,7 @@
 #include "cellPamf.h"
 #include "cellDmux.h"
-#include <thread>
+#include "util/asm.hpp"
 LOG_CHANNEL(cellDmux);
@ -753,9 +753,9 @@ PesHeader::PesHeader(DemuxerStream& stream)
 }
 ElementaryStream::ElementaryStream(Demuxer* dmux, u32 addr, u32 size, u32 fidMajor, u32 fidMinor, u32 sup1, u32 sup2, vm::ptr<CellDmuxCbEsMsg> cbFunc, u32 cbArg, u32 spec)
-	: put(align(addr, 128))
+	: put(utils::align(addr, 128))
 	, dmux(dmux)
-	, memAddr(align(addr, 128))
+	, memAddr(utils::align(addr, 128))
 	, memSize(size - (addr - memAddr))
 	, fidMajor(fidMajor)
 	, fidMinor(fidMinor)
@ -847,7 +847,7 @@ void ElementaryStream::push_au(u32 size, u64 dts, u64 pts, u64 userdata, bool ra
 		addr = put;
-		put = align(put + 128 + size, 128);
+		put = utils::align(put + 128 + size, 128);
 		put_count++;
 	}
--- a/rpcs3/Emu/Cell/Modules/cellSaveData.cpp
+++ b/rpcs3/Emu/Cell/Modules/cellSaveData.cpp
@ -20,6 +20,8 @@
 #include <mutex>
 #include <algorithm>
 #include "util/asm.hpp"
 LOG_CHANNEL(cellSaveData);
 template<>
@ -953,7 +955,7 @@ static NEVER_INLINE error_code savedata_op(ppu_thread& ppu, u32 operation, u32 v
 			{
 				if (!file.is_directory)
 				{
-					size_bytes += ::align(file.size, 1024);
+					size_bytes += utils::align(file.size, 1024);
 				}
 			}
@ -1334,7 +1336,7 @@ static NEVER_INLINE error_code savedata_op(ppu_thread& ppu, u32 operation, u32 v
 			{
 				statGet->fileNum++;
-				size_bytes += ::align(entry.size, 1024); // firmware rounds this value up
+				size_bytes += utils::align(entry.size, 1024); // firmware rounds this value up
 				if (statGet->fileListNum >= setBuf->fileListMax)
 					continue;
@ -1892,7 +1894,7 @@ static NEVER_INLINE error_code savedata_op(ppu_thread& ppu, u32 operation, u32 v
 		// add file list per FS order to PARAM.SFO
 		std::string final_blist;
 		final_blist = fmt::merge(blist, "/");
-		psf::assign(psf, "RPCS3_BLIST", psf::string(::align(::size32(final_blist) + 1, 4), final_blist));
+		psf::assign(psf, "RPCS3_BLIST", psf::string(utils::align(::size32(final_blist) + 1, 4), final_blist));
 		// Write all files in temporary directory
 		auto& fsfo = all_files["PARAM.SFO"];
--- a/rpcs3/Emu/Cell/Modules/cellVdec.cpp
+++ b/rpcs3/Emu/Cell/Modules/cellVdec.cpp
@ -34,6 +34,7 @@ extern "C"
 #include <cmath>
 #include "Utilities/lockless.h"
 #include <variant>
 #include "util/asm.hpp"
 std::mutex g_mutex_avcodec_open2;
@ -879,7 +880,7 @@ error_code cellVdecGetPicture(u32 handle, vm::cptr<CellVdecPicFormat> format, vm
 		sws_scale(vdec->sws, in_data, in_line, 0, h, out_data, out_line);
-		//const u32 buf_size = align(av_image_get_buffer_size(vdec->ctx->pix_fmt, vdec->ctx->width, vdec->ctx->height, 1), 128);
+		//const u32 buf_size = utils::align(av_image_get_buffer_size(vdec->ctx->pix_fmt, vdec->ctx->width, vdec->ctx->height, 1), 128);
 		//// TODO: zero padding bytes
@ -974,7 +975,7 @@ error_code cellVdecGetPicItem(u32 handle, vm::pptr<CellVdecPicItem> picItem)
 	info->startAddr = 0x00000123; // invalid value (no address for picture)
 	const int buffer_size = av_image_get_buffer_size(vdec->ctx->pix_fmt, vdec->ctx->width, vdec->ctx->height, 1);
 	ensure(buffer_size >= 0);
-	info->size = align<u32>(buffer_size, 128);
+	info->size = utils::align<u32>(buffer_size, 128);
 	info->auNum = 1;
 	info->auPts[0].lower = static_cast<u32>(pts);
 	info->auPts[0].upper = static_cast<u32>(pts >> 32);
--- a/rpcs3/Emu/Cell/Modules/sceNpTrophy.cpp
+++ b/rpcs3/Emu/Cell/Modules/sceNpTrophy.cpp
@ -20,6 +20,7 @@
 #include "Emu/Cell/lv2/sys_process.h"
 #include <cmath>
 #include "util/asm.hpp"
 LOG_CHANNEL(sceNpTrophy);
@ -1109,7 +1110,7 @@ error_code sceNpTrophyGetGameProgress(u32 context, u32 handle, vm::ptr<s32> perc
 	const u32 trp_count = ctxt->tropusr->GetTrophiesCount();
 	// Round result to nearest (TODO: Check 0 trophies)
-	*percentage = trp_count ? ::rounded_div(unlocked * 100, trp_count) : 0;
+	*percentage = trp_count ? utils::rounded_div(unlocked * 100, trp_count) : 0;
 	if (trp_count == 0 || trp_count > 128)
 	{
--- a/rpcs3/Emu/Cell/PPUModule.cpp
+++ b/rpcs3/Emu/Cell/PPUModule.cpp
@ -22,6 +22,7 @@
 #include <map>
 #include <set>
 #include <algorithm>
 #include "util/asm.hpp"
 LOG_CHANNEL(ppu_loader);
@ -263,7 +264,7 @@ static void ppu_initialize_modules(ppu_linkage_info* link)
 	}
 	// Set memory protection to read-only
-	vm::page_protect(ppu_function_manager::addr, ::align(::size32(hle_funcs) * 8, 0x1000), 0, 0, vm::page_writable);
+	vm::page_protect(ppu_function_manager::addr, utils::align(::size32(hle_funcs) * 8, 0x1000), 0, 0, vm::page_writable);
 	// Initialize function names
 	const bool is_first = g_ppu_function_names.empty();
@ -319,7 +320,7 @@ static void ppu_initialize_modules(ppu_linkage_info* link)
 			}
 			else
 			{
-				const u32 next = ::align(alloc_addr, variable.second.align);
+				const u32 next = utils::align(alloc_addr, variable.second.align);
 				const u32 end = next + variable.second.size;
 				if (!next || (end >> 12 != alloc_addr >> 12))
@ -1500,7 +1501,7 @@ void ppu_load_exec(const ppu_exec_object& elf)
 	for (const auto& arg : Emu.argv)
 	{
-		const u32 arg_size = ::align(::size32(arg) + 1, 0x10);
+		const u32 arg_size = utils::align(::size32(arg) + 1, 0x10);
 		const u32 arg_addr = vm::alloc(arg_size, vm::main);
 		std::memcpy(vm::base(arg_addr), arg.data(), arg_size);
@ -1513,7 +1514,7 @@ void ppu_load_exec(const ppu_exec_object& elf)
 	for (const auto& arg : Emu.envp)
 	{
-		const u32 arg_size = ::align(::size32(arg) + 1, 0x10);
+		const u32 arg_size = utils::align(::size32(arg) + 1, 0x10);
 		const u32 arg_addr = vm::alloc(arg_size, vm::main);
 		std::memcpy(vm::base(arg_addr), arg.data(), arg_size);
@ -1533,7 +1534,7 @@ void ppu_load_exec(const ppu_exec_object& elf)
 	case 0x70: primary_stacksize = 1024 * 1024; break; // SYS_PROCESS_PRIMARY_STACK_SIZE_1M
 	default:
 	{
-		primary_stacksize = ::align<u32>(std::clamp<u32>(sz, 0x10000, 0x100000), 4096);
+		primary_stacksize = utils::align<u32>(std::clamp<u32>(sz, 0x10000, 0x100000), 4096);
 		break;
 	}
 	}
@ -1636,7 +1637,7 @@ void ppu_load_exec(const ppu_exec_object& elf)
 		if (prog.p_type == 0x1u /* LOAD */ && prog.p_memsz && (prog.p_flags & 0x2) == 0u /* W */)
 		{
 			// Set memory protection to read-only when necessary
-			ensure(vm::page_protect(addr, ::align(size, 0x1000), 0, 0, vm::page_writable));
+			ensure(vm::page_protect(addr, utils::align(size, 0x1000), 0, 0, vm::page_writable));
 		}
 	}
 }
--- a/rpcs3/Emu/Cell/PPUThread.cpp
+++ b/rpcs3/Emu/Cell/PPUThread.cpp
@ -242,7 +242,7 @@ extern void ppu_register_range(u32 addr, u32 size)
 	// Register executable range at
 	utils::memory_commit(&ppu_ref(addr), size * 2, utils::protection::rw);
-	vm::page_protect(addr, align(size, 0x10000), 0, vm::page_executable);
+	vm::page_protect(addr, utils::align(size, 0x10000), 0, vm::page_executable);
 	const u64 fallback = g_cfg.core.ppu_decoder == ppu_decoder_type::llvm ? reinterpret_cast<uptr>(ppu_recompiler_fallback) : reinterpret_cast<uptr>(ppu_fallback);
@ -1098,7 +1098,7 @@ u32 ppu_thread::stack_push(u32 size, u32 align_v)
 		ppu_thread& context = static_cast<ppu_thread&>(*cpu);
 		const u32 old_pos = vm::cast(context.gpr[1]);
-		context.gpr[1] -= align(size + 4, 8); // room minimal possible size
+		context.gpr[1] -= utils::align(size + 4, 8); // room minimal possible size
 		context.gpr[1] &= ~(u64{align_v} - 1); // fix stack alignment
 		if (old_pos >= context.stack_addr && old_pos < context.stack_addr + context.stack_size && context.gpr[1] < context.stack_addr)
--- a/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp
+++ b/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp
@ -288,7 +288,7 @@ spu_function_t spu_recompiler::compile(spu_program&& _func)
 		words_align = 64;
 		const u32 starta = start & -64;
-		const u32 enda = ::align(end, 64);
+		const u32 enda = utils::align(end, 64);
 		const u32 sizea = (enda - starta) / 64;
 		ensure(sizea);
@ -369,7 +369,7 @@ spu_function_t spu_recompiler::compile(spu_program&& _func)
 		words_align = 32;
 		const u32 starta = start & -32;
-		const u32 enda = ::align(end, 32);
+		const u32 enda = utils::align(end, 32);
 		const u32 sizea = (enda - starta) / 32;
 		ensure(sizea);
@ -491,7 +491,7 @@ spu_function_t spu_recompiler::compile(spu_program&& _func)
 		words_align = 32;
 		const u32 starta = start & -32;
-		const u32 enda = ::align(end, 32);
+		const u32 enda = utils::align(end, 32);
 		const u32 sizea = (enda - starta) / 32;
 		ensure(sizea);
--- a/rpcs3/Emu/Cell/SPUThread.cpp
+++ b/rpcs3/Emu/Cell/SPUThread.cpp
@ -2338,7 +2338,7 @@ void spu_thread::do_dma_transfer(spu_thread* _this, const spu_mfc_cmd& args, u8*
 			}
 			u32 range_addr = eal & -128;
-			u32 range_end = ::align(eal + size, 128);
+			u32 range_end = utils::align(eal + size, 128);
 			// Handle the case of crossing 64K page borders (TODO: maybe split in 4K fragments?)
 			if (range_addr >> 16 != (range_end - 1) >> 16)
--- a/rpcs3/Emu/Cell/lv2/sys_memory.cpp
+++ b/rpcs3/Emu/Cell/lv2/sys_memory.cpp
@ -8,6 +8,7 @@
 #include "Emu/IdManager.h"
 #include "util/vm.hpp"
 #include "util/asm.hpp"
 LOG_CHANNEL(sys_memory);
@ -57,7 +58,7 @@ error_code sys_memory_allocate(cpu_thread& cpu, u32 size, u64 flags, vm::ptr<u32
 		return CELL_ENOMEM;
 	}
-	if (const auto area = vm::reserve_map(align == 0x10000 ? vm::user64k : vm::user1m, 0, ::align(size, 0x10000000), 0x401))
+	if (const auto area = vm::reserve_map(align == 0x10000 ? vm::user64k : vm::user1m, 0, utils::align(size, 0x10000000), 0x401))
 	{
 		if (u32 addr = area->alloc(size, nullptr, align))
 		{
@ -128,7 +129,7 @@ error_code sys_memory_allocate_from_container(cpu_thread& cpu, u32 size, u32 cid
 		return ct.ret;
 	}
-	if (const auto area = vm::reserve_map(align == 0x10000 ? vm::user64k : vm::user1m, 0, ::align(size, 0x10000000), 0x401))
+	if (const auto area = vm::reserve_map(align == 0x10000 ? vm::user64k : vm::user1m, 0, utils::align(size, 0x10000000), 0x401))
 	{
 		if (u32 addr = area->alloc(size))
 		{
--- a/rpcs3/Emu/Cell/lv2/sys_ppu_thread.cpp
+++ b/rpcs3/Emu/Cell/lv2/sys_ppu_thread.cpp
@ -12,6 +12,8 @@
 #include "sys_mmapper.h"
 #include "sys_memory.h"
 #include "util/asm.hpp"
 LOG_CHANNEL(sys_ppu_thread);
 // Simple structure to cleanup previous thread, because can't remove its own thread
@ -388,7 +390,7 @@ error_code _sys_ppu_thread_create(ppu_thread& ppu, vm::ptr<u64> thread_id, vm::p
 	g_fxo->get<ppu_thread_cleaner>()->clean(0);
 	// Compute actual stack size and allocate
-	const u32 stack_size = ::align<u32>(std::max<u32>(_stacksz, 4096), 4096);
+	const u32 stack_size = utils::align<u32>(std::max<u32>(_stacksz, 4096), 4096);
 	const auto dct = g_fxo->get<lv2_memory_container>();
--- a/rpcs3/Emu/Cell/lv2/sys_spu.cpp
+++ b/rpcs3/Emu/Cell/lv2/sys_spu.cpp
@ -99,7 +99,7 @@ void sys_spu_image::load(const fs::file& stream)
 	this->nsegs = 0;
 	this->segs = vm::null;
-	vm::page_protect(segs.addr(), ::align(mem_size, 4096), 0, 0, vm::page_writable);
+	vm::page_protect(segs.addr(), utils::align(mem_size, 4096), 0, 0, vm::page_writable);
 }
 void sys_spu_image::free()
--- a/rpcs3/Emu/Memory/vm.cpp
+++ b/rpcs3/Emu/Memory/vm.cpp
@ -974,13 +974,13 @@ namespace vm
 			if (state & page_1m_size)
 			{
-				i = ::align(i + 1, 0x100000 / 4096);
+				i = utils::align(i + 1, 0x100000 / 4096);
 				continue;
 			}
 			if (state & page_64k_size)
 			{
-				i = ::align(i + 1, 0x10000 / 4096);
+				i = utils::align(i + 1, 0x10000 / 4096);
 				continue;
 			}
@ -1177,7 +1177,7 @@ namespace vm
 		const u32 min_page_size = flags & 0x100 ? 0x1000 : 0x10000;
 		// Align to minimal page size
-		const u32 size = ::align(orig_size, min_page_size) + (flags & 0x10 ? 0x2000 : 0);
+		const u32 size = utils::align(orig_size, min_page_size) + (flags & 0x10 ? 0x2000 : 0);
 		// Check alignment (it's page allocation, so passing small values there is just silly)
 		if (align < min_page_size || align != (0x80000000u >> std::countl_zero(align)))
@ -1217,7 +1217,7 @@ namespace vm
 		vm::writer_lock lock(0);
 		// Search for an appropriate place (unoptimized)
-		for (u32 addr = ::align(this->addr, align); u64{addr} + size <= u64{this->addr} + this->size; addr += align)
+		for (u32 addr = utils::align(this->addr, align); u64{addr} + size <= u64{this->addr} + this->size; addr += align)
 		{
 			if (try_alloc(addr, pflags, size, std::move(shm)))
 			{
@ -1240,7 +1240,7 @@ namespace vm
 		const u32 min_page_size = flags & 0x100 ? 0x1000 : 0x10000;
 		// Align to minimal page size
-		const u32 size = ::align(orig_size, min_page_size);
+		const u32 size = utils::align(orig_size, min_page_size);
 		// return if addr or size is invalid
 		if (!size || addr < this->addr || orig_size > size || addr + u64{size} > this->addr + u64{this->size} || flags & 0x10)
@ -1410,7 +1410,7 @@ namespace vm
 	static std::shared_ptr<block_t> _find_map(u32 size, u32 align, u64 flags)
 	{
-		for (u32 addr = ::align<u32>(0x20000000, align); addr - 1 < 0xC0000000 - 1; addr += align)
+		for (u32 addr = utils::align<u32>(0x20000000, align); addr - 1 < 0xC0000000 - 1; addr += align)
 		{
 			if (_test_map(addr, size))
 			{
@ -1485,7 +1485,7 @@ namespace vm
 		vm::writer_lock lock(0);
 		// Align to minimal page size
-		const u32 size = ::align(orig_size, 0x10000);
+		const u32 size = utils::align(orig_size, 0x10000);
 		// Check alignment
 		if (align < 0x10000 || align != (0x80000000u >> std::countl_zero(align)))
--- a/rpcs3/Emu/NP/np_handler.cpp
+++ b/rpcs3/Emu/NP/np_handler.cpp
@ -32,6 +32,8 @@
 #include <net/if_dl.h>
 #endif
 #include "util/asm.hpp"
 LOG_CHANNEL(sys_net);
 LOG_CHANNEL(sceNp2);
 LOG_CHANNEL(sceNp);
@ -384,7 +386,7 @@ vm::addr_t np_handler::allocate(u32 size)
 		return vm::cast(static_cast<u64>(0));
 	// Align allocs
-	const u32 alloc_size = ::align(size, 4);
+	const u32 alloc_size = utils::align(size, 4);
 	if (alloc_size > mpool_avail)
 	{
 		sceNp.error("Not enough memory available in NP pool!");
--- a/rpcs3/Emu/RSX/Capture/rsx_replay.cpp
+++ b/rpcs3/Emu/RSX/Capture/rsx_replay.cpp
@ -7,6 +7,7 @@
 #include "Emu/RSX/RSXThread.h"
 #include <map>
 #include "util/asm.hpp"
 namespace rsx
 {
@ -23,7 +24,7 @@ namespace rsx
 		}
 		// User memory + fifo size
-		buffer_size = ::align<u32>(buffer_size, 0x100000) + 0x10000000;
+		buffer_size = utils::align<u32>(buffer_size, 0x100000) + 0x10000000;
 		// We are not allowed to drain all memory so add a little
 		g_fxo->init<lv2_memory_container>(buffer_size + 0x1000000);
--- a/rpcs3/Emu/RSX/Common/TextureUtils.cpp
+++ b/rpcs3/Emu/RSX/Common/TextureUtils.cpp
@ -4,6 +4,8 @@
 #include "../RSXThread.h"
 #include "../rsx_utils.h"
 #include "util/asm.hpp"
 namespace
 {
 	// FIXME: GSL as_span break build if template parameter is non const with current revision.
@ -346,8 +348,8 @@ namespace
 				}
 				else
 				{
-					current_subresource_layout.width_in_block = aligned_div(miplevel_width_in_texel, block_edge_in_texel);
+					current_subresource_layout.width_in_block = utils::aligned_div(miplevel_width_in_texel, block_edge_in_texel);
-					current_subresource_layout.height_in_block = aligned_div(miplevel_height_in_texel, block_edge_in_texel);
+					current_subresource_layout.height_in_block = utils::aligned_div(miplevel_height_in_texel, block_edge_in_texel);
 				}
 				if (padded_row)
@ -375,7 +377,7 @@ namespace
 				miplevel_height_in_texel = std::max(miplevel_height_in_texel / 2, 1);
 			}
-			offset_in_src = align(offset_in_src, 128);
+			offset_in_src = utils::align(offset_in_src, 128);
 		}
 		return result;
@ -922,8 +924,8 @@ namespace rsx
 		usz result = 0;
 		for (u16 i = 0; i < mipmap; ++i)
 		{
-			usz rowPitch = align(block_size_in_byte * width_in_blocks, row_pitch_alignment);
+			usz rowPitch = utils::align(block_size_in_byte * width_in_blocks, row_pitch_alignment);
-			result += align(rowPitch * height_in_blocks * depth, mipmap_alignment);
+			result += utils::align(rowPitch * height_in_blocks * depth, mipmap_alignment);
 			height_in_blocks = std::max<usz>(height_in_blocks / 2, 1);
 			width_in_blocks = std::max<usz>(width_in_blocks / 2, 1);
 		}
--- a/rpcs3/Emu/RSX/Common/ring_buffer_helper.h
+++ b/rpcs3/Emu/RSX/Common/ring_buffer_helper.h
@ -1,6 +1,7 @@
 #pragma once
 #include "util/logs.hpp"
 #include "util/asm.hpp"
 /**
 * Ring buffer memory helper :
@ -19,8 +20,8 @@ protected:
 	template<int Alignment>
 	bool can_alloc(usz size) const
 	{
-		usz alloc_size = align(size, Alignment);
+		usz alloc_size = utils::align(size, Alignment);
-		usz aligned_put_pos = align(m_put_pos, Alignment);
+		usz aligned_put_pos = utils::align(m_put_pos, Alignment);
 		if (aligned_put_pos + alloc_size < m_size)
 		{
 			// range before get
@ -83,8 +84,8 @@ public:
 	template<int Alignment>
 	usz alloc(usz size)
 	{
-		const usz alloc_size = align(size, Alignment);
+		const usz alloc_size = utils::align(size, Alignment);
-		const usz aligned_put_pos = align(m_put_pos, Alignment);
+		const usz aligned_put_pos = utils::align(m_put_pos, Alignment);
 		if (!can_alloc<Alignment>(size) && !grow(aligned_put_pos + alloc_size))
 		{
--- a/rpcs3/Emu/RSX/Common/surface_store.cpp
+++ b/rpcs3/Emu/RSX/Common/surface_store.cpp
@ -1,6 +1,8 @@
 #include "stdafx.h"
 #include "surface_store.h"
 #include "util/asm.hpp"
 namespace rsx
 {
 	namespace utility
@ -23,20 +25,20 @@ namespace rsx
 		{
 			switch (format)
 			{
-			case surface_color_format::b8: return align(width, 256);
+			case surface_color_format::b8: return utils::align(width, 256);
 			case surface_color_format::g8b8:
 			case surface_color_format::x1r5g5b5_o1r5g5b5:
 			case surface_color_format::x1r5g5b5_z1r5g5b5:
-			case surface_color_format::r5g6b5: return align(width * 2, 256);
+			case surface_color_format::r5g6b5: return utils::align(width * 2, 256);
 			case surface_color_format::a8b8g8r8:
 			case surface_color_format::x8b8g8r8_o8b8g8r8:
 			case surface_color_format::x8b8g8r8_z8b8g8r8:
 			case surface_color_format::x8r8g8b8_o8r8g8b8:
 			case surface_color_format::x8r8g8b8_z8r8g8b8:
 			case surface_color_format::x32:
-			case surface_color_format::a8r8g8b8: return align(width * 4, 256);
+			case surface_color_format::a8r8g8b8: return utils::align(width * 4, 256);
-			case surface_color_format::w16z16y16x16: return align(width * 8, 256);
+			case surface_color_format::w16z16y16x16: return utils::align(width * 8, 256);
-			case surface_color_format::w32z32y32x32: return align(width * 16, 256);
+			case surface_color_format::w32z32y32x32: return utils::align(width * 16, 256);
 			}
 			fmt::throw_exception("Unknown color surface format");
 		}
--- a/rpcs3/Emu/RSX/Common/surface_store.h
+++ b/rpcs3/Emu/RSX/Common/surface_store.h
@ -5,6 +5,8 @@
 #include "../rsx_utils.h"
 #include <list>
 #include "util/asm.hpp"
 namespace rsx
 {
 	namespace utility
@ -918,7 +920,7 @@ namespace rsx
 					{
 						// Width is calculated in the coordinate-space of the requester; normalize
 						info.src_area.x = (info.src_area.x * required_bpp) / surface_bpp;
-						info.src_area.width = align(width * required_bpp, surface_bpp) / surface_bpp;
+						info.src_area.width = utils::align(width * required_bpp, surface_bpp) / surface_bpp;
 					}
 					else
 					{
--- a/rpcs3/Emu/RSX/GL/GLCompute.h
+++ b/rpcs3/Emu/RSX/GL/GLCompute.h
@ -4,6 +4,8 @@
 #include "Emu/IdManager.h"
 #include "GLHelpers.h"
 #include "util/asm.hpp"
 namespace gl
 {
    struct compute_task
@ -224,7 +226,7 @@ namespace gl
 			m_data_length = data_length;
 			const auto num_bytes_per_invocation = optimal_group_size * kernel_size * 4;
-			const auto num_bytes_to_process = align(data_length, num_bytes_per_invocation);
+			const auto num_bytes_to_process = utils::align(data_length, num_bytes_per_invocation);
 			const auto num_invocations = num_bytes_to_process / num_bytes_per_invocation;
 			if ((num_bytes_to_process + data_offset) > data->size())
--- a/rpcs3/Emu/RSX/GL/GLGSRender.cpp
+++ b/rpcs3/Emu/RSX/GL/GLGSRender.cpp
@ -740,7 +740,7 @@ void GLGSRender::load_program_env()
 		if (update_fragment_env) m_fragment_env_buffer->reserve_storage_on_heap(128);
 		if (update_vertex_env) m_vertex_env_buffer->reserve_storage_on_heap(256);
 		if (update_fragment_texture_env) m_texture_parameters_buffer->reserve_storage_on_heap(256);
-		if (update_fragment_constants) m_fragment_constants_buffer->reserve_storage_on_heap(align(fragment_constants_size, 256));
+		if (update_fragment_constants) m_fragment_constants_buffer->reserve_storage_on_heap(utils::align(fragment_constants_size, 256));
 		if (update_transform_constants) m_transform_constants_buffer->reserve_storage_on_heap(8192);
 		if (update_raster_env) m_raster_env_ring_buffer->reserve_storage_on_heap(128);
--- a/rpcs3/Emu/RSX/GL/GLHelpers.h
+++ b/rpcs3/Emu/RSX/GL/GLHelpers.h
@ -16,6 +16,7 @@
 #include "Utilities/mutex.h"
 #include "Utilities/geometry.h"
 #include "util/logs.hpp"
 #include "util/asm.hpp"
 #define GL_FRAGMENT_TEXTURES_START 0
 #define GL_VERTEX_TEXTURES_START   (GL_FRAGMENT_TEXTURES_START + 16)
@ -808,7 +809,7 @@ namespace gl
 		virtual std::pair<void*, u32> alloc_from_heap(u32 alloc_size, u16 alignment)
 		{
 			u32 offset = m_data_loc;
-			if (m_data_loc) offset = align(offset, alignment);
+			if (m_data_loc) offset = utils::align(offset, alignment);
 			if ((offset + alloc_size) > m_size)
 			{
@ -827,7 +828,7 @@ namespace gl
 			}
 			//Align data loc to 256; allows some "guard" region so we dont trample our own data inadvertently
-			m_data_loc = align(offset + alloc_size, 256);
+			m_data_loc = utils::align(offset + alloc_size, 256);
 			return std::make_pair(static_cast<char*>(m_memory_mapping) + offset, offset);
 		}
@ -897,9 +898,9 @@ namespace gl
 			ensure(m_memory_mapping == nullptr);
 			u32 offset = m_data_loc;
-			if (m_data_loc) offset = align(offset, 256);
+			if (m_data_loc) offset = utils::align(offset, 256);
-			const u32 block_size = align(alloc_size + 16, 256);	//Overallocate just in case we need to realign base
+			const u32 block_size = utils::align(alloc_size + 16, 256);	//Overallocate just in case we need to realign base
 			if ((offset + block_size) > m_size)
 			{
@ -933,10 +934,10 @@ namespace gl
 		std::pair<void*, u32> alloc_from_heap(u32 alloc_size, u16 alignment) override
 		{
 			u32 offset = m_data_loc;
-			if (m_data_loc) offset = align(offset, alignment);
+			if (m_data_loc) offset = utils::align(offset, alignment);
 			u32 padding = (offset - m_data_loc);
-			u32 real_size = align(padding + alloc_size, alignment);	//Ensures we leave the loc pointer aligned after we exit
+			u32 real_size = utils::align(padding + alloc_size, alignment);	//Ensures we leave the loc pointer aligned after we exit
 			if (real_size > m_mapped_bytes)
 			{
@ -946,10 +947,10 @@ namespace gl
 				reserve_storage_on_heap(std::max(real_size, 4096U));
 				offset = m_data_loc;
-				if (m_data_loc) offset = align(offset, alignment);
+				if (m_data_loc) offset = utils::align(offset, alignment);
 				padding = (offset - m_data_loc);
-				real_size = align(padding + alloc_size, alignment);
+				real_size = utils::align(padding + alloc_size, alignment);
 			}
 			m_data_loc = offset + real_size;
--- a/rpcs3/Emu/RSX/GL/GLTexture.cpp
+++ b/rpcs3/Emu/RSX/GL/GLTexture.cpp
@ -6,6 +6,8 @@
 #include "../RSXThread.h"
 #include "../RSXTexture.h"
 #include "util/asm.hpp"
 namespace gl
 {
 	buffer g_typeless_transfer_buffer;
@ -614,8 +616,8 @@ namespace gl
 		{
 			//Compressed formats have a 4-byte alignment
 			//TODO: Verify that samplers are not affected by the padding
-			width = align(width, 4);
+			width = utils::align(width, 4);
-			height = align(height, 4);
+			height = utils::align(height, 4);
 		}
 		GLenum target;
@ -654,7 +656,7 @@ namespace gl
 		{
 			caps.supports_vtc_decoding = gl::get_driver_caps().vendor_NVIDIA;
-			unpack_settings.row_length(align(dst->width(), 4));
+			unpack_settings.row_length(utils::align(dst->width(), 4));
 			unpack_settings.apply();
 			glBindTexture(static_cast<GLenum>(dst->get_target()), dst->id());
@ -664,7 +666,7 @@ namespace gl
 			for (const rsx::subresource_layout& layout : input_layouts)
 			{
 				upload_texture_subresource(staging_buffer, layout, format, is_swizzled, caps);
-				const sizei image_size{ align(layout.width_in_texel, 4), align(layout.height_in_texel, 4) };
+				const sizei image_size{utils::align(layout.width_in_texel, 4), utils::align(layout.height_in_texel, 4)};
 				switch (dst->get_target())
 				{
@ -835,7 +837,7 @@ namespace gl
 	void upload_texture(texture* dst, u32 gcm_format, bool is_swizzled, const std::vector<rsx::subresource_layout>& subresources_layout)
 	{
 		// Calculate staging buffer size
-		const u32 aligned_pitch = align<u32>(dst->pitch(), 4);
+		const u32 aligned_pitch = utils::align<u32>(dst->pitch(), 4);
 		usz texture_data_sz = dst->depth() * dst->height() * aligned_pitch;
 		std::vector<std::byte> data_upload_buf(texture_data_sz);
--- a/rpcs3/Emu/RSX/GL/GLTextureCache.cpp
+++ b/rpcs3/Emu/RSX/GL/GLTextureCache.cpp
@ -0,0 +1,191 @@
 #include "stdafx.h"
 #include "Emu/RSX/RSXThread.h"
 #include "GLTexture.h"
 #include "GLTextureCache.h"
 #include "util/asm.hpp"
 namespace gl
 {
 	void cached_texture_section::finish_flush()
 	{
 		// Free resources
 		glUnmapBuffer(GL_PIXEL_PACK_BUFFER);
 		glBindBuffer(GL_PIXEL_PACK_BUFFER, GL_NONE);
 		const auto valid_range = get_confirmed_range_delta();
 		const u32 valid_offset = valid_range.first;
 		const u32 valid_length = valid_range.second;
 		void *dst = get_ptr(get_section_base() + valid_offset);
 		if (!gl::get_driver_caps().ARB_compute_shader_supported)
 		{
 			switch (type)
 			{
 			case gl::texture::type::sbyte:
 			case gl::texture::type::ubyte:
 			{
 				// byte swapping does not work on byte types, use uint_8_8_8_8 for rgba8 instead to avoid penalty
 				ensure(!pack_unpack_swap_bytes);
 				break;
 			}
 			case gl::texture::type::uint_24_8:
 			{
 				// Swap bytes on D24S8 does not swap the whole dword, just shuffles the 3 bytes for D24
 				// In this regard, D24S8 is the same structure on both PC and PS3, but the endianness of the whole block is reversed on PS3
 				ensure(pack_unpack_swap_bytes == false);
 				ensure(real_pitch == (width * 4));
 				if (rsx_pitch == real_pitch) [[likely]]
 				{
 					stream_data_to_memory_swapped_u32<true>(dst, dst, valid_length / 4, 4);
 				}
 				else
 				{
 					const u32 num_rows = utils::align(valid_length, rsx_pitch) / rsx_pitch;
 					u8* data = static_cast<u8*>(dst);
 					for (u32 row = 0; row < num_rows; ++row)
 					{
 						stream_data_to_memory_swapped_u32<true>(data, data, width, 4);
 						data += rsx_pitch;
 					}
 				}
 				break;
 			}
 			default:
 				break;
 			}
 		}
 		if (is_swizzled())
 		{
 			// This format is completely worthless to CPU processing algorithms where cache lines on die are linear.
 			// If this is happening, usually it means it was not a planned readback (e.g shared pages situation)
 			rsx_log.warning("[Performance warning] CPU readback of swizzled data");
 			// Read-modify-write to avoid corrupting already resident memory outside texture region
 			std::vector<u8> tmp_data(rsx_pitch * height);
 			std::memcpy(tmp_data.data(), dst, tmp_data.size());
 			switch (type)
 			{
 			case gl::texture::type::uint_8_8_8_8:
 			case gl::texture::type::uint_24_8:
 				rsx::convert_linear_swizzle<u32, false>(tmp_data.data(), dst, width, height, rsx_pitch);
 				break;
 			case gl::texture::type::ushort_5_6_5:
 			case gl::texture::type::ushort:
 				rsx::convert_linear_swizzle<u16, false>(tmp_data.data(), dst, width, height, rsx_pitch);
 				break;
 			default:
 				rsx_log.error("Unexpected swizzled texture format 0x%x", static_cast<u32>(format));
 			}
 		}
 		if (context == rsx::texture_upload_context::framebuffer_storage)
 		{
 			// Update memory tag
 			static_cast<gl::render_target*>(vram_texture)->sync_tag();
 		}
 	}
 	void texture_cache::copy_transfer_regions_impl(gl::command_context& cmd, gl::texture* dst_image, const std::vector<copy_region_descriptor>& sources) const
 	{
 		const auto dst_bpp = dst_image->pitch() / dst_image->width();
 		const auto dst_aspect = dst_image->aspect();
 		for (const auto &slice : sources)
 		{
 			if (!slice.src)
 				continue;
 			const bool typeless = dst_aspect != slice.src->aspect() ||
 				!formats_are_bitcast_compatible(static_cast<GLenum>(slice.src->get_internal_format()), static_cast<GLenum>(dst_image->get_internal_format()));
 			std::unique_ptr<gl::texture> tmp;
 			auto src_image = slice.src;
 			auto src_x = slice.src_x;
 			auto src_y = slice.src_y;
 			auto src_w = slice.src_w;
 			auto src_h = slice.src_h;
 			if (slice.xform == rsx::surface_transform::coordinate_transform)
 			{
 				// Dimensions were given in 'dst' space. Work out the real source coordinates
 				const auto src_bpp = slice.src->pitch() / slice.src->width();
 				src_x = (src_x * dst_bpp) / src_bpp;
 				src_w = utils::aligned_div<u16>(src_w * dst_bpp, src_bpp);
 			}
 			if (auto surface = dynamic_cast<gl::render_target*>(slice.src))
 			{
 				surface->transform_samples_to_pixels(src_x, src_w, src_y, src_h);
 			}
 			if (typeless) [[unlikely]]
 			{
 				const auto src_bpp = slice.src->pitch() / slice.src->width();
 				const u16 convert_w = u16(slice.src->width() * src_bpp) / dst_bpp;
 				tmp = std::make_unique<texture>(GL_TEXTURE_2D, convert_w, slice.src->height(), 1, 1, static_cast<GLenum>(dst_image->get_internal_format()));
 				src_image = tmp.get();
 				// Compute src region in dst format layout
 				const u16 src_w2 = u16(src_w * src_bpp) / dst_bpp;
 				const u16 src_x2 = u16(src_x * src_bpp) / dst_bpp;
 				if (src_w2 == slice.dst_w && src_h == slice.dst_h && slice.level == 0)
 				{
 					// Optimization, avoid typeless copy to tmp followed by data copy to dst
 					// Combine the two transfers into one
 					const coord3u src_region = { { src_x, src_y, 0 }, { src_w, src_h, 1 } };
 					const coord3u dst_region = { { slice.dst_x, slice.dst_y, slice.dst_z }, { slice.dst_w, slice.dst_h, 1 } };
 					gl::copy_typeless(dst_image, slice.src, dst_region, src_region);
 					continue;
 				}
 				const coord3u src_region = { { src_x, src_y, 0 }, { src_w, src_h, 1 } };
 				const coord3u dst_region = { { src_x2, src_y, 0 }, { src_w2, src_h, 1 } };
 				gl::copy_typeless(src_image, slice.src, dst_region, src_region);
 				src_x = src_x2;
 				src_w = src_w2;
 			}
 			if (src_w == slice.dst_w && src_h == slice.dst_h)
 			{
 				glCopyImageSubData(src_image->id(), GL_TEXTURE_2D, 0, src_x, src_y, 0,
 					dst_image->id(), static_cast<GLenum>(dst_image->get_target()), slice.level, slice.dst_x, slice.dst_y, slice.dst_z, src_w, src_h, 1);
 			}
 			else
 			{
 				ensure(dst_image->get_target() == gl::texture::target::texture2D);
 				auto _blitter = gl::g_hw_blitter;
 				const areai src_rect = { src_x, src_y, src_x + src_w, src_y + src_h };
 				const areai dst_rect = { slice.dst_x, slice.dst_y, slice.dst_x + slice.dst_w, slice.dst_y + slice.dst_h };
 				gl::texture* _dst;
 				if (src_image->get_internal_format() == dst_image->get_internal_format() && slice.level == 0)
 				{
 					_dst = dst_image;
 				}
 				else
 				{
 					tmp = std::make_unique<texture>(GL_TEXTURE_2D, dst_rect.x2, dst_rect.y2, 1, 1, static_cast<GLenum>(slice.src->get_internal_format()));
 					_dst = tmp.get();
 				}
 				_blitter->scale_image(cmd, src_image, _dst,
 					src_rect, dst_rect, false, {});
 				if (_dst != dst_image)
 				{
 					// Data cast comes after scaling
 					glCopyImageSubData(tmp->id(), GL_TEXTURE_2D, 0, slice.dst_x, slice.dst_y, 0,
 						dst_image->id(), static_cast<GLenum>(dst_image->get_target()), slice.level, slice.dst_x, slice.dst_y, slice.dst_z, slice.dst_w, slice.dst_h, 1);
 				}
 			}
 		}
 	}
 }
--- a/rpcs3/Emu/RSX/GL/GLTextureCache.h
+++ b/rpcs3/Emu/RSX/GL/GLTextureCache.h
@ -62,7 +62,7 @@ namespace gl
 		void init_buffer(const gl::texture* src)
 		{
 			const u32 vram_size = src->pitch() * src->height();
-			const u32 buffer_size = align(vram_size, 4096);
+			const u32 buffer_size = utils::align(vram_size, 4096);
 			if (pbo)
 			{
@ -333,86 +333,7 @@ namespace gl
 			return glMapBufferRange(GL_PIXEL_PACK_BUFFER, offset, size, GL_MAP_READ_BIT);
 		}
-		void finish_flush()
+		void finish_flush();
 		{
 			// Free resources
 			glUnmapBuffer(GL_PIXEL_PACK_BUFFER);
 			glBindBuffer(GL_PIXEL_PACK_BUFFER, GL_NONE);
 			const auto valid_range = get_confirmed_range_delta();
 			const u32 valid_offset = valid_range.first;
 			const u32 valid_length = valid_range.second;
 			void *dst = get_ptr(get_section_base() + valid_offset);
 			if (!gl::get_driver_caps().ARB_compute_shader_supported)
 			{
 				switch (type)
 				{
 				case gl::texture::type::sbyte:
 				case gl::texture::type::ubyte:
 				{
 					// byte swapping does not work on byte types, use uint_8_8_8_8 for rgba8 instead to avoid penalty
 					ensure(!pack_unpack_swap_bytes);
 					break;
 				}
 				case gl::texture::type::uint_24_8:
 				{
 					// Swap bytes on D24S8 does not swap the whole dword, just shuffles the 3 bytes for D24
 					// In this regard, D24S8 is the same structure on both PC and PS3, but the endianness of the whole block is reversed on PS3
 					ensure(pack_unpack_swap_bytes == false);
 					ensure(real_pitch == (width * 4));
 					if (rsx_pitch == real_pitch) [[likely]]
 					{
 						stream_data_to_memory_swapped_u32<true>(dst, dst, valid_length / 4, 4);
 					}
 					else
 					{
 						const u32 num_rows = align(valid_length, rsx_pitch) / rsx_pitch;
 						u8* data = static_cast<u8*>(dst);
 						for (u32 row = 0; row < num_rows; ++row)
 						{
 							stream_data_to_memory_swapped_u32<true>(data, data, width, 4);
 							data += rsx_pitch;
 						}
 					}
 					break;
 				}
 				default:
 					break;
 				}
 			}
 			if (is_swizzled())
 			{
 				// This format is completely worthless to CPU processing algorithms where cache lines on die are linear.
 				// If this is happening, usually it means it was not a planned readback (e.g shared pages situation)
 				rsx_log.warning("[Performance warning] CPU readback of swizzled data");
 				// Read-modify-write to avoid corrupting already resident memory outside texture region
 				std::vector<u8> tmp_data(rsx_pitch * height);
 				std::memcpy(tmp_data.data(), dst, tmp_data.size());
 				switch (type)
 				{
 				case gl::texture::type::uint_8_8_8_8:
 				case gl::texture::type::uint_24_8:
 					rsx::convert_linear_swizzle<u32, false>(tmp_data.data(), dst, width, height, rsx_pitch);
 					break;
 				case gl::texture::type::ushort_5_6_5:
 				case gl::texture::type::ushort:
 					rsx::convert_linear_swizzle<u16, false>(tmp_data.data(), dst, width, height, rsx_pitch);
 					break;
 				default:
 					rsx_log.error("Unexpected swizzled texture format 0x%x", static_cast<u32>(format));
 				}
 			}
 			if (context == rsx::texture_upload_context::framebuffer_storage)
 			{
 				// Update memory tag
 				static_cast<gl::render_target*>(vram_texture)->sync_tag();
 			}
 		}
 		/**
 		 * Misc
@ -637,106 +558,7 @@ namespace gl
 			}
 		}
-		void copy_transfer_regions_impl(gl::command_context& cmd, gl::texture* dst_image, const std::vector<copy_region_descriptor>& sources) const
+		void copy_transfer_regions_impl(gl::command_context& cmd, gl::texture* dst_image, const std::vector<copy_region_descriptor>& sources) const;
 		{
 			const auto dst_bpp = dst_image->pitch() / dst_image->width();
 			const auto dst_aspect = dst_image->aspect();
 			for (const auto &slice : sources)
 			{
 				if (!slice.src)
 					continue;
 				const bool typeless = dst_aspect != slice.src->aspect() ||
 					!formats_are_bitcast_compatible(static_cast<GLenum>(slice.src->get_internal_format()), static_cast<GLenum>(dst_image->get_internal_format()));
 				std::unique_ptr<gl::texture> tmp;
 				auto src_image = slice.src;
 				auto src_x = slice.src_x;
 				auto src_y = slice.src_y;
 				auto src_w = slice.src_w;
 				auto src_h = slice.src_h;
 				if (slice.xform == rsx::surface_transform::coordinate_transform)
 				{
 					// Dimensions were given in 'dst' space. Work out the real source coordinates
 					const auto src_bpp = slice.src->pitch() / slice.src->width();
 					src_x = (src_x * dst_bpp) / src_bpp;
 					src_w = ::aligned_div<u16>(src_w * dst_bpp, src_bpp);
 				}
 				if (auto surface = dynamic_cast<gl::render_target*>(slice.src))
 				{
 					surface->transform_samples_to_pixels(src_x, src_w, src_y, src_h);
 				}
 				if (typeless) [[unlikely]]
 				{
 					const auto src_bpp = slice.src->pitch() / slice.src->width();
 					const u16 convert_w = u16(slice.src->width() * src_bpp) / dst_bpp;
 					tmp = std::make_unique<texture>(GL_TEXTURE_2D, convert_w, slice.src->height(), 1, 1, static_cast<GLenum>(dst_image->get_internal_format()));
 					src_image = tmp.get();
 					// Compute src region in dst format layout
 					const u16 src_w2 = u16(src_w * src_bpp) / dst_bpp;
 					const u16 src_x2 = u16(src_x * src_bpp) / dst_bpp;
 					if (src_w2 == slice.dst_w && src_h == slice.dst_h && slice.level == 0)
 					{
 						// Optimization, avoid typeless copy to tmp followed by data copy to dst
 						// Combine the two transfers into one
 						const coord3u src_region = { { src_x, src_y, 0 }, { src_w, src_h, 1 } };
 						const coord3u dst_region = { { slice.dst_x, slice.dst_y, slice.dst_z }, { slice.dst_w, slice.dst_h, 1 } };
 						gl::copy_typeless(dst_image, slice.src, dst_region, src_region);
 						continue;
 					}
 					const coord3u src_region = { { src_x, src_y, 0 }, { src_w, src_h, 1 } };
 					const coord3u dst_region = { { src_x2, src_y, 0 }, { src_w2, src_h, 1 } };
 					gl::copy_typeless(src_image, slice.src, dst_region, src_region);
 					src_x = src_x2;
 					src_w = src_w2;
 				}
 				if (src_w == slice.dst_w && src_h == slice.dst_h)
 				{
 					glCopyImageSubData(src_image->id(), GL_TEXTURE_2D, 0, src_x, src_y, 0,
 						dst_image->id(), static_cast<GLenum>(dst_image->get_target()), slice.level, slice.dst_x, slice.dst_y, slice.dst_z, src_w, src_h, 1);
 				}
 				else
 				{
 					ensure(dst_image->get_target() == gl::texture::target::texture2D);
 					auto _blitter = gl::g_hw_blitter;
 					const areai src_rect = { src_x, src_y, src_x + src_w, src_y + src_h };
 					const areai dst_rect = { slice.dst_x, slice.dst_y, slice.dst_x + slice.dst_w, slice.dst_y + slice.dst_h };
 					gl::texture* _dst;
 					if (src_image->get_internal_format() == dst_image->get_internal_format() && slice.level == 0)
 					{
 						_dst = dst_image;
 					}
 					else
 					{
 						tmp = std::make_unique<texture>(GL_TEXTURE_2D, dst_rect.x2, dst_rect.y2, 1, 1, static_cast<GLenum>(slice.src->get_internal_format()));
 						_dst = tmp.get();
 					}
 					_blitter->scale_image(cmd, src_image, _dst,
 						src_rect, dst_rect, false, {});
 					if (_dst != dst_image)
 					{
 						// Data cast comes after scaling
 						glCopyImageSubData(tmp->id(), GL_TEXTURE_2D, 0, slice.dst_x, slice.dst_y, 0,
 							dst_image->id(), static_cast<GLenum>(dst_image->get_target()), slice.level, slice.dst_x, slice.dst_y, slice.dst_z, slice.dst_w, slice.dst_h, 1);
 					}
 				}
 			}
 		}
 		gl::texture* get_template_from_collection_impl(const std::vector<copy_region_descriptor>& sections_to_transfer) const
 		{
--- a/rpcs3/Emu/RSX/RSXThread.cpp
+++ b/rpcs3/Emu/RSX/RSXThread.cpp
@ -139,6 +139,52 @@ namespace rsx
 		fmt::throw_exception("rsx::get_address(offset=0x%x, location=0x%x): %s%s", offset, location, msg, src_loc{line, col, file, func});
 	}
 	std::pair<u32, u32> interleaved_range_info::calculate_required_range(u32 first, u32 count) const
 	{
 		if (single_vertex)
 		{
 			return { 0, 1 };
 		}
 		const u32 max_index = (first + count) - 1;
 		u32 _max_index = 0;
 		u32 _min_index = first;
 		for (const auto &attrib : locations)
 		{
 			if (attrib.frequency <= 1) [[likely]]
 			{
 				_max_index = max_index;
 			}
 			else
 			{
 				if (attrib.modulo)
 				{
 					if (max_index >= attrib.frequency)
 					{
 						// Actually uses the modulo operator
 						_min_index = 0;
 						_max_index = attrib.frequency - 1;
 					}
 					else
 					{
 						// Same as having no modulo
 						_max_index = max_index;
 					}
 				}
 				else
 				{
 					// Division operator
 					_min_index = std::min(_min_index, first / attrib.frequency);
 					_max_index = std::max<u32>(_max_index, utils::aligned_div(max_index, attrib.frequency));
 				}
 			}
 		}
 		ensure(_max_index >= _min_index);
 		return { _min_index, (_max_index - _min_index) + 1 };
 	}
 	u32 get_vertex_type_size_on_host(vertex_base_type type, u32 size)
 	{
 		switch (type)
@ -2521,7 +2567,7 @@ namespace rsx
 		}
 		// Some cases do not need full delay
-		remaining = ::aligned_div(remaining, div);
+		remaining = utils::aligned_div(remaining, div);
 		const u64 until = get_system_time() + remaining;
 		while (true)
--- a/rpcs3/Emu/RSX/RSXThread.h
+++ b/rpcs3/Emu/RSX/RSXThread.h
@ -246,51 +246,7 @@ namespace rsx
 		rsx::simple_array<interleaved_attribute_t> locations;
 		// Check if we need to upload a full unoptimized range, i.e [0-max_index]
-		std::pair<u32, u32> calculate_required_range(u32 first, u32 count) const
+		std::pair<u32, u32> calculate_required_range(u32 first, u32 count) const;
 		{
 			if (single_vertex)
 			{
 				return { 0, 1 };
 			}
 			const u32 max_index = (first + count) - 1;
 			u32 _max_index = 0;
 			u32 _min_index = first;
 			for (const auto &attrib : locations)
 			{
 				if (attrib.frequency <= 1) [[likely]]
 				{
 					_max_index = max_index;
 				}
 				else
 				{
 					if (attrib.modulo)
 					{
 						if (max_index >= attrib.frequency)
 						{
 							// Actually uses the modulo operator
 							_min_index = 0;
 							_max_index = attrib.frequency - 1;
 						}
 						else
 						{
 							// Same as having no modulo
 							_max_index = max_index;
 						}
 					}
 					else
 					{
 						// Division operator
 						_min_index = std::min(_min_index, first / attrib.frequency);
 						_max_index = std::max<u32>(_max_index, aligned_div(max_index, attrib.frequency));
 					}
 				}
 			}
 			ensure(_max_index >= _min_index);
 			return { _min_index, (_max_index - _min_index) + 1 };
 		}
 	};
 	enum attribute_buffer_placement : u8
--- a/rpcs3/Emu/RSX/VK/VKCompute.h
+++ b/rpcs3/Emu/RSX/VK/VKCompute.h
@ -5,6 +5,8 @@
 #include "Utilities/StrUtil.h"
 #include "Emu/IdManager.h"
 #include "util/asm.hpp"
 #define VK_MAX_COMPUTE_TASKS 4096   // Max number of jobs per frame
 namespace vk
@ -296,7 +298,7 @@ namespace vk
 				"%vars"
 				"\n";
-			const auto parameters_size = align(push_constants_size, 16) / 16;
+			const auto parameters_size = utils::align(push_constants_size, 16) / 16;
 			const std::pair<std::string, std::string> syntax_replace[] =
 			{
 				{ "%ws", std::to_string(optimal_group_size) },
@ -943,7 +945,7 @@ namespace vk
 			set_parameters(cmd);
 			const u32 num_bytes_per_invocation = (sizeof(_BlockType) * optimal_group_size);
-			const u32 linear_invocations = aligned_div(data_length, num_bytes_per_invocation);
+			const u32 linear_invocations = utils::aligned_div(data_length, num_bytes_per_invocation);
 			compute_task::run(cmd, linear_invocations);
 		}
 	};
@ -997,7 +999,7 @@ namespace vk
 			word_count = num_words;
 			block_length = num_words * 4;
-			const u32 linear_invocations = aligned_div(word_count, optimal_group_size);
+			const u32 linear_invocations = utils::aligned_div(word_count, optimal_group_size);
 			compute_task::run(cmd, linear_invocations);
 		}
 	};
--- a/rpcs3/Emu/RSX/VK/VKDMA.cpp
+++ b/rpcs3/Emu/RSX/VK/VKDMA.cpp
@ -3,6 +3,8 @@
 #include "VKResourceManager.h"
 #include "VKDMA.h"
 #include "util/asm.hpp"
 namespace vk
 {
 	static constexpr usz s_dma_block_length = 0x01000000;
@ -85,7 +87,7 @@ namespace vk
 	{
 		if (!inheritance_info.parent)
 		{
-			const u32 start = align(range.start, s_page_size);
+			const u32 start = utils::align(range.start, s_page_size);
 			const u32 end = ((range.end + 1) & s_page_align);
 			for (u32 page = start; page < end; page += s_page_size)
@ -259,7 +261,7 @@ namespace vk
 		}
 		dma_block* block_head = nullptr;
-		auto block_end = align(limit, s_dma_block_length);
+		auto block_end = utils::align(limit, s_dma_block_length);
 		// Reverse scan to try and find the minimum required length in case of other chaining
 		for (auto block = last_block; block != first_block; block -= s_dma_block_length)
--- a/rpcs3/Emu/RSX/VK/VKHelpers.cpp
+++ b/rpcs3/Emu/RSX/VK/VKHelpers.cpp
@ -132,7 +132,7 @@ namespace vk
 	{
 		// Create new heap. All sizes are aligned up by 64M, upto 1GiB
 		const usz size_limit = 1024 * 0x100000;
-		const usz aligned_new_size = align(m_size + size, 64 * 0x100000);
+		const usz aligned_new_size = utils::align(m_size + size, 64 * 0x100000);
 		if (aligned_new_size >= size_limit)
 		{
@ -351,8 +351,8 @@ namespace vk
 	{
 		auto create_texture = [&]()
 		{
-			u32 new_width = align(requested_width, 1024u);
+			u32 new_width = utils::align(requested_width, 1024u);
-			u32 new_height = align(requested_height, 1024u);
+			u32 new_height = utils::align(requested_height, 1024u);
 			return new vk::image(*g_current_renderer, g_current_renderer->get_memory_mapping().device_local, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT,
 				VK_IMAGE_TYPE_2D, format, new_width, new_height, 1, 1, 1, VK_SAMPLE_COUNT_1_BIT, VK_IMAGE_LAYOUT_UNDEFINED,
@ -388,7 +388,7 @@ namespace vk
 		if (!g_scratch_buffer)
 		{
 			// Choose optimal size
-			const u64 alloc_size = std::max<u64>(64 * 0x100000, align(min_required_size, 0x100000));
+			const u64 alloc_size = std::max<u64>(64 * 0x100000, utils::align(min_required_size, 0x100000));
 			g_scratch_buffer = std::make_unique<vk::buffer>(*g_current_renderer, alloc_size,
 				g_current_renderer->get_memory_mapping().device_local, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT,
--- a/rpcs3/Emu/RSX/VK/VKPresent.cpp
+++ b/rpcs3/Emu/RSX/VK/VKPresent.cpp
@ -2,6 +2,8 @@
 #include "VKGSRender.h"
 #include "Emu/Cell/Modules/cellVideoOut.h"
 #include "util/asm.hpp"
 void VKGSRender::reinitialize_swapchain()
 {
 	m_swapchain_dims.width = m_frame->client_width();
@ -651,7 +653,7 @@ void VKGSRender::flip(const rsx::display_flip_info_t& info)
 			const usz sshot_size = buffer_height * buffer_width * 4;
-			vk::buffer sshot_vkbuf(*m_device, align(sshot_size, 0x100000), m_device->get_memory_mapping().host_visible_coherent, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
+			vk::buffer sshot_vkbuf(*m_device, utils::align(sshot_size, 0x100000), m_device->get_memory_mapping().host_visible_coherent, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
 				VK_BUFFER_USAGE_TRANSFER_DST_BIT, 0);
 			VkBufferImageCopy copy_info;
--- a/rpcs3/Emu/RSX/VK/VKResolveHelper.h
+++ b/rpcs3/Emu/RSX/VK/VKResolveHelper.h
@ -131,8 +131,8 @@ namespace vk
 			multisampled = msaa_image;
 			resolve = resolve_image;
-			const u32 invocations_x = align(resolve_image->width(), cs_wave_x) / cs_wave_x;
+			const u32 invocations_x = utils::align(resolve_image->width(), cs_wave_x) / cs_wave_x;
-			const u32 invocations_y = align(resolve_image->height(), cs_wave_y) / cs_wave_y;
+			const u32 invocations_y = utils::align(resolve_image->height(), cs_wave_y) / cs_wave_y;
 			compute_task::run(cmd, invocations_x, invocations_y, 1);
 		}
--- a/rpcs3/Emu/RSX/VK/VKTexture.cpp
+++ b/rpcs3/Emu/RSX/VK/VKTexture.cpp
@ -7,6 +7,8 @@
 #include "VKRenderPass.h"
 #include "VKRenderTargets.h"
 #include "util/asm.hpp"
 namespace vk
 {
 	VkComponentMapping default_component_map()
@ -89,7 +91,7 @@ namespace vk
 			ensure(dst->size() >= allocation_end);
 			const auto data_offset = u32(region.bufferOffset);
-			const auto z32_offset = align<u32>(data_offset + packed16_length, 256);
+			const auto z32_offset = utils::align<u32>(data_offset + packed16_length, 256);
 			// 1. Copy the depth to buffer
 			VkBufferImageCopy region2;
@ -135,8 +137,8 @@ namespace vk
 			ensure(dst->size() >= allocation_end);
 			const auto data_offset = u32(region.bufferOffset);
-			const auto z_offset = align<u32>(data_offset + packed_length, 256);
+			const auto z_offset = utils::align<u32>(data_offset + packed_length, 256);
-			const auto s_offset = align<u32>(z_offset + in_depth_size, 256);
+			const auto s_offset = utils::align<u32>(z_offset + in_depth_size, 256);
 			// 1. Copy the depth and stencil blocks to separate banks
 			VkBufferImageCopy sub_regions[2];
@ -225,7 +227,7 @@ namespace vk
 			ensure(src->size() >= allocation_end);
 			const auto data_offset = u32(region.bufferOffset);
-			const auto z32_offset = align<u32>(data_offset + packed16_length, 256);
+			const auto z32_offset = utils::align<u32>(data_offset + packed16_length, 256);
 			// 1. Pre-compute barrier
 			vk::insert_buffer_memory_barrier(cmd, src->value, z32_offset, packed32_length,
@ -260,8 +262,8 @@ namespace vk
 			ensure(src->size() >= allocation_end); // "Out of memory (compute heap). Lower your resolution scale setting."
 			const auto data_offset = u32(region.bufferOffset);
-			const auto z_offset = align<u32>(data_offset + packed_length, 256);
+			const auto z_offset = utils::align<u32>(data_offset + packed_length, 256);
-			const auto s_offset = align<u32>(z_offset + in_depth_size, 256);
+			const auto s_offset = utils::align<u32>(z_offset + in_depth_size, 256);
 			// Zero out the stencil block
 			vkCmdFillBuffer(cmd, src->value, s_offset, in_stencil_size, 0);
@ -821,7 +823,7 @@ namespace vk
 			const auto src_offset = section.bufferOffset;
 			// Align output to 128-byte boundary to keep some drivers happy
-			dst_offset = align(dst_offset, 128);
+			dst_offset = utils::align(dst_offset, 128);
 			u32 data_length = 0;
 			for (unsigned i = 0, j = packet.first; i < packet.second; ++i, ++j)
@ -930,7 +932,7 @@ namespace vk
 				if (layout.level == 0)
 				{
 					// Align mip0 on a 128-byte boundary
-					scratch_offset = align(scratch_offset, 128);
+					scratch_offset = utils::align(scratch_offset, 128);
 				}
 				// Copy from upload heap to scratch mem
--- a/rpcs3/Emu/RSX/VK/VKTextureCache.cpp
+++ b/rpcs3/Emu/RSX/VK/VKTextureCache.cpp
@ -0,0 +1,360 @@
 #include "stdafx.h"
 #include "VKGSRender.h"
 #include "VKTextureCache.h"
 #include "util/asm.hpp"
 namespace vk
 {
 	void cached_texture_section::dma_transfer(vk::command_buffer& cmd, vk::image* src, const areai& src_area, const utils::address_range& valid_range, u32 pitch)
 	{
 		ensure(src->samples() == 1);
 		if (!m_device)
 		{
 			m_device = &cmd.get_command_pool().get_owner();
 		}
 		if (dma_fence)
 		{
 			// NOTE: This can be reached if previously synchronized, or a special path happens.
 			// If a hard flush occurred while this surface was flush_always the cache would have reset its protection afterwards.
 			// DMA resource would still be present but already used to flush previously.
 			vk::get_resource_manager()->dispose(dma_fence);
 		}
 		if (vk::is_renderpass_open(cmd))
 		{
 			vk::end_renderpass(cmd);
 		}
 		src->push_layout(cmd, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL);
 		const auto internal_bpp = vk::get_format_texel_width(src->format());
 		const auto transfer_width = static_cast<u32>(src_area.width());
 		const auto transfer_height = static_cast<u32>(src_area.height());
 		real_pitch = internal_bpp * transfer_width;
 		rsx_pitch = pitch;
 		const bool require_format_conversion = !!(src->aspect() & VK_IMAGE_ASPECT_STENCIL_BIT) || src->format() == VK_FORMAT_D32_SFLOAT;
 		if (require_format_conversion || pack_unpack_swap_bytes)
 		{
 			const auto section_length = valid_range.length();
 			const auto transfer_pitch = real_pitch;
 			const auto task_length = transfer_pitch * src_area.height();
 			auto working_buffer = vk::get_scratch_buffer(task_length);
 			auto final_mapping = vk::map_dma(cmd, valid_range.start, section_length);
 			VkBufferImageCopy region = {};
 			region.imageSubresource = { src->aspect(), 0, 0, 1 };
 			region.imageOffset = { src_area.x1, src_area.y1, 0 };
 			region.imageExtent = { transfer_width, transfer_height, 1 };
 			vk::copy_image_to_buffer(cmd, src, working_buffer, region, (require_format_conversion && pack_unpack_swap_bytes));
 			// NOTE: For depth/stencil formats, copying to buffer and byteswap are combined into one step above
 			if (pack_unpack_swap_bytes && !require_format_conversion)
 			{
 				const auto texel_layout = vk::get_format_element_size(src->format());
 				const auto elem_size = texel_layout.first;
 				vk::cs_shuffle_base *shuffle_kernel;
 				if (elem_size == 2)
 				{
 					shuffle_kernel = vk::get_compute_task<vk::cs_shuffle_16>();
 				}
 				else if (elem_size == 4)
 				{
 					shuffle_kernel = vk::get_compute_task<vk::cs_shuffle_32>();
 				}
 				else
 				{
 					ensure(get_context() == rsx::texture_upload_context::dma);
 					shuffle_kernel = nullptr;
 				}
 				if (shuffle_kernel)
 				{
 					vk::insert_buffer_memory_barrier(cmd, working_buffer->value, 0, task_length,
 						VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
 						VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT);
 					shuffle_kernel->run(cmd, working_buffer, task_length);
 					vk::insert_buffer_memory_barrier(cmd, working_buffer->value, 0, task_length,
 						VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT,
 						VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_TRANSFER_READ_BIT);
 				}
 			}
 			if (rsx_pitch == real_pitch) [[likely]]
 			{
 				VkBufferCopy copy = {};
 				copy.dstOffset = final_mapping.first;
 				copy.size = section_length;
 				vkCmdCopyBuffer(cmd, working_buffer->value, final_mapping.second->value, 1, &copy);
 			}
 			else
 			{
 				if (context != rsx::texture_upload_context::dma)
 				{
 					// Partial load for the bits outside the existing image
 					// NOTE: A true DMA section would have been prepped beforehand
 					// TODO: Parial range load/flush
 					vk::load_dma(valid_range.start, section_length);
 				}
 				std::vector<VkBufferCopy> copy;
 				copy.reserve(transfer_height);
 				u32 dst_offset = final_mapping.first;
 				u32 src_offset = 0;
 				for (unsigned row = 0; row < transfer_height; ++row)
 				{
 					copy.push_back({ src_offset, dst_offset, transfer_pitch });
 					src_offset += real_pitch;
 					dst_offset += rsx_pitch;
 				}
 				vkCmdCopyBuffer(cmd, working_buffer->value, final_mapping.second->value, transfer_height, copy.data());
 			}
 		}
 		else
 		{
 			VkBufferImageCopy region = {};
 			region.bufferRowLength = (rsx_pitch / internal_bpp);
 			region.imageSubresource = { src->aspect(), 0, 0, 1 };
 			region.imageOffset = { src_area.x1, src_area.y1, 0 };
 			region.imageExtent = { transfer_width, transfer_height, 1 };
 			auto mapping = vk::map_dma(cmd, valid_range.start, valid_range.length());
 			region.bufferOffset = mapping.first;
 			vkCmdCopyImageToBuffer(cmd, src->value, src->current_layout, mapping.second->value, 1, &region);
 		}
 		src->pop_layout(cmd);
 		// Create event object for this transfer and queue signal op
 		dma_fence = std::make_unique<vk::event>(*m_device);
 		dma_fence->signal(cmd, VK_PIPELINE_STAGE_TRANSFER_BIT);
 		// Set cb flag for queued dma operations
 		cmd.set_flag(vk::command_buffer::cb_has_dma_transfer);
 		if (get_context() == rsx::texture_upload_context::dma)
 		{
 			// Save readback hint in case transformation is required later
 			switch (internal_bpp)
 			{
 			case 2:
 				gcm_format = CELL_GCM_TEXTURE_R5G6B5;
 				break;
 			case 4:
 			default:
 				gcm_format = CELL_GCM_TEXTURE_A8R8G8B8;
 				break;
 			}
 		}
 		synchronized = true;
 		sync_timestamp = get_system_time();
 	}
 	void texture_cache::copy_transfer_regions_impl(vk::command_buffer& cmd, vk::image* dst, const std::vector<copy_region_descriptor>& sections_to_transfer) const
 	{
 		const auto dst_aspect = dst->aspect();
 		const auto dst_bpp = vk::get_format_texel_width(dst->format());
 		for (const auto &section : sections_to_transfer)
 		{
 			if (!section.src)
 				continue;
 			const bool typeless = section.src->aspect() != dst_aspect ||
 				!formats_are_bitcast_compatible(dst, section.src);
 			// Avoid inserting unnecessary barrier GENERAL->TRANSFER_SRC->GENERAL in active render targets
 			const auto preferred_layout = (section.src->current_layout != VK_IMAGE_LAYOUT_GENERAL) ?
 				VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL : VK_IMAGE_LAYOUT_GENERAL;
 			section.src->push_layout(cmd, preferred_layout);
 			auto src_image = section.src;
 			auto src_x = section.src_x;
 			auto src_y = section.src_y;
 			auto src_w = section.src_w;
 			auto src_h = section.src_h;
 			rsx::flags32_t transform = section.xform;
 			if (section.xform == rsx::surface_transform::coordinate_transform)
 			{
 				// Dimensions were given in 'dst' space. Work out the real source coordinates
 				const auto src_bpp = vk::get_format_texel_width(section.src->format());
 				src_x = (src_x * dst_bpp) / src_bpp;
 				src_w = utils::aligned_div<u16>(src_w * dst_bpp, src_bpp);
 				transform &= ~(rsx::surface_transform::coordinate_transform);
 			}
 			if (auto surface = dynamic_cast<vk::render_target*>(section.src))
 			{
 				surface->transform_samples_to_pixels(src_x, src_w, src_y, src_h);
 			}
 			if (typeless) [[unlikely]]
 			{
 				const auto src_bpp = vk::get_format_texel_width(section.src->format());
 				const u16 convert_w = u16(src_w * src_bpp) / dst_bpp;
 				const u16 convert_x = u16(src_x * src_bpp) / dst_bpp;
 				if (convert_w == section.dst_w && src_h == section.dst_h &&
 					transform == rsx::surface_transform::identity &&
 					section.level == 0 && section.dst_z == 0)
 				{
 					// Optimization to avoid double transfer
 					// TODO: Handle level and layer offsets
 					const areai src_rect = coordi{{ src_x, src_y }, { src_w, src_h }};
 					const areai dst_rect = coordi{{ section.dst_x, section.dst_y }, { section.dst_w, section.dst_h }};
 					vk::copy_image_typeless(cmd, section.src, dst, src_rect, dst_rect, 1);
 					section.src->pop_layout(cmd);
 					continue;
 				}
 				src_image = vk::get_typeless_helper(dst->format(), dst->format_class(), convert_x + convert_w, src_y + src_h);
 				src_image->change_layout(cmd, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL);
 				const areai src_rect = coordi{{ src_x, src_y }, { src_w, src_h }};
 				const areai dst_rect = coordi{{ convert_x, src_y }, { convert_w, src_h }};
 				vk::copy_image_typeless(cmd, section.src, src_image, src_rect, dst_rect, 1);
 				src_image->change_layout(cmd, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL);
 				src_x = convert_x;
 				src_w = convert_w;
 			}
 			ensure(src_image->current_layout == VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL || src_image->current_layout == VK_IMAGE_LAYOUT_GENERAL);
 			// Final aspect mask of the 'final' transfer source
 			const auto new_src_aspect = src_image->aspect();
 			if (src_w == section.dst_w && src_h == section.dst_h && transform == rsx::surface_transform::identity) [[likely]]
 			{
 				VkImageCopy copy_rgn;
 				copy_rgn.srcOffset = { src_x, src_y, 0 };
 				copy_rgn.dstOffset = { section.dst_x, section.dst_y, 0 };
 				copy_rgn.dstSubresource = { dst_aspect, 0, 0, 1 };
 				copy_rgn.srcSubresource = { new_src_aspect, 0, 0, 1 };
 				copy_rgn.extent = { src_w, src_h, 1 };
 				if (dst->info.imageType == VK_IMAGE_TYPE_3D)
 				{
 					copy_rgn.dstOffset.z = section.dst_z;
 				}
 				else
 				{
 					copy_rgn.dstSubresource.baseArrayLayer = section.dst_z;
 					copy_rgn.dstSubresource.mipLevel = section.level;
 				}
 				vkCmdCopyImage(cmd, src_image->value, src_image->current_layout, dst->value, dst->current_layout, 1, &copy_rgn);
 			}
 			else
 			{
 				ensure(section.dst_z == 0);
 				u16 dst_x = section.dst_x, dst_y = section.dst_y;
 				vk::image* _dst;
 				if (src_image->info.format == dst->info.format && section.level == 0) [[likely]]
 				{
 					_dst = dst;
 				}
 				else
 				{
 					// Either a bitcast is required or a scale+copy to mipmap level
 					_dst = vk::get_typeless_helper(src_image->format(), src_image->format_class(), dst->width(), dst->height() * 2);
 					_dst->change_layout(cmd, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL);
 				}
 				if (transform == rsx::surface_transform::identity)
 				{
 					vk::copy_scaled_image(cmd, src_image, _dst,
 						coordi{ { src_x, src_y }, { src_w, src_h } },
 						coordi{ { section.dst_x, section.dst_y }, { section.dst_w, section.dst_h } },
 						1, src_image->format() == _dst->format(),
 						VK_FILTER_NEAREST);
 				}
 				else if (transform == rsx::surface_transform::argb_to_bgra)
 				{
 					VkBufferImageCopy copy{};
 					copy.imageExtent = { src_w, src_h, 1 };
 					copy.imageOffset = { src_x, src_y, 0 };
 					copy.imageSubresource = { src_image->aspect(), 0, 0, 1 };
 					const auto mem_length = src_w * src_h * dst_bpp;
 					auto scratch_buf = vk::get_scratch_buffer(mem_length);
 					vkCmdCopyImageToBuffer(cmd, src_image->value, src_image->current_layout, scratch_buf->value, 1, &copy);
 					vk::insert_buffer_memory_barrier(cmd, scratch_buf->value, 0, mem_length, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
 						VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT);
 					auto shuffle_kernel = vk::get_compute_task<vk::cs_shuffle_32>();
 					shuffle_kernel->run(cmd, scratch_buf, mem_length);
 					vk::insert_buffer_memory_barrier(cmd, scratch_buf->value, 0, mem_length, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT,
 						VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_TRANSFER_READ_BIT);
 					auto tmp = vk::get_typeless_helper(src_image->format(), src_image->format_class(), section.dst_x + section.dst_w, section.dst_y + section.dst_h);
 					tmp->change_layout(cmd, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL);
 					copy.imageOffset = { 0, 0, 0 };
 					vkCmdCopyBufferToImage(cmd, scratch_buf->value, tmp->value, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 1, &copy);
 					dst_x = 0;
 					dst_y = 0;
 					if (src_w != section.dst_w || src_h != section.dst_h)
 					{
 						// Optionally scale if needed
 						if (tmp == _dst) [[unlikely]]
 						{
 							dst_y = src_h;
 						}
 						vk::copy_scaled_image(cmd, tmp, _dst,
 							areai{ 0, 0, src_w, static_cast<s32>(src_h) },
 							coordi{ { dst_x, dst_y }, { section.dst_w, section.dst_h } },
 							1, tmp->info.format == _dst->info.format,
 							VK_FILTER_NEAREST);
 					}
 					else
 					{
 						_dst = tmp;
 					}
 				}
 				else
 				{
 					fmt::throw_exception("Unreachable");
 				}
 				if (_dst != dst) [[unlikely]]
 				{
 					// Casting comes after the scaling!
 					VkImageCopy copy_rgn;
 					copy_rgn.srcOffset = { s32(dst_x), s32(dst_y), 0 };
 					copy_rgn.dstOffset = { section.dst_x, section.dst_y, 0 };
 					copy_rgn.dstSubresource = { dst_aspect, section.level, 0, 1 };
 					copy_rgn.srcSubresource = { _dst->aspect(), 0, 0, 1 };
 					copy_rgn.extent = { section.dst_w, section.dst_h, 1 };
 					_dst->change_layout(cmd, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL);
 					vkCmdCopyImage(cmd, _dst->value, _dst->current_layout, dst->value, dst->current_layout, 1, &copy_rgn);
 				}
 			}
 			section.src->pop_layout(cmd);
 		}
 	}
 }
--- a/rpcs3/Emu/RSX/VK/VKTextureCache.h
+++ b/rpcs3/Emu/RSX/VK/VKTextureCache.h
@ -167,160 +167,7 @@ namespace vk
 			return flushed;
 		}
-		void dma_transfer(vk::command_buffer& cmd, vk::image* src, const areai& src_area, const utils::address_range& valid_range, u32 pitch)
+		void dma_transfer(vk::command_buffer& cmd, vk::image* src, const areai& src_area, const utils::address_range& valid_range, u32 pitch);
 		{
 			ensure(src->samples() == 1);
 			if (!m_device)
 			{
 				m_device = &cmd.get_command_pool().get_owner();
 			}
 			if (dma_fence)
 			{
 				// NOTE: This can be reached if previously synchronized, or a special path happens.
 				// If a hard flush occurred while this surface was flush_always the cache would have reset its protection afterwards.
 				// DMA resource would still be present but already used to flush previously.
 				vk::get_resource_manager()->dispose(dma_fence);
 			}
 			if (vk::is_renderpass_open(cmd))
 			{
 				vk::end_renderpass(cmd);
 			}
 			src->push_layout(cmd, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL);
 			const auto internal_bpp = vk::get_format_texel_width(src->format());
 			const auto transfer_width = static_cast<u32>(src_area.width());
 			const auto transfer_height = static_cast<u32>(src_area.height());
 			real_pitch = internal_bpp * transfer_width;
 			rsx_pitch = pitch;
 			const bool require_format_conversion = !!(src->aspect() & VK_IMAGE_ASPECT_STENCIL_BIT) || src->format() == VK_FORMAT_D32_SFLOAT;
 			if (require_format_conversion || pack_unpack_swap_bytes)
 			{
 				const auto section_length = valid_range.length();
 				const auto transfer_pitch = real_pitch;
 				const auto task_length = transfer_pitch * src_area.height();
 				auto working_buffer = vk::get_scratch_buffer(task_length);
 				auto final_mapping = vk::map_dma(cmd, valid_range.start, section_length);
 				VkBufferImageCopy region = {};
 				region.imageSubresource = { src->aspect(), 0, 0, 1 };
 				region.imageOffset = { src_area.x1, src_area.y1, 0 };
 				region.imageExtent = { transfer_width, transfer_height, 1 };
 				vk::copy_image_to_buffer(cmd, src, working_buffer, region, (require_format_conversion && pack_unpack_swap_bytes));
 				// NOTE: For depth/stencil formats, copying to buffer and byteswap are combined into one step above
 				if (pack_unpack_swap_bytes && !require_format_conversion)
 				{
 					const auto texel_layout = vk::get_format_element_size(src->format());
 					const auto elem_size = texel_layout.first;
 					vk::cs_shuffle_base *shuffle_kernel;
 					if (elem_size == 2)
 					{
 						shuffle_kernel = vk::get_compute_task<vk::cs_shuffle_16>();
 					}
 					else if (elem_size == 4)
 					{
 						shuffle_kernel = vk::get_compute_task<vk::cs_shuffle_32>();
 					}
 					else
 					{
 						ensure(get_context() == rsx::texture_upload_context::dma);
 						shuffle_kernel = nullptr;
 					}
 					if (shuffle_kernel)
 					{
 						vk::insert_buffer_memory_barrier(cmd, working_buffer->value, 0, task_length,
 							VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
 							VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT);
 						shuffle_kernel->run(cmd, working_buffer, task_length);
 						vk::insert_buffer_memory_barrier(cmd, working_buffer->value, 0, task_length,
 							VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT,
 							VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_TRANSFER_READ_BIT);
 					}
 				}
 				if (rsx_pitch == real_pitch) [[likely]]
 				{
 					VkBufferCopy copy = {};
 					copy.dstOffset = final_mapping.first;
 					copy.size = section_length;
 					vkCmdCopyBuffer(cmd, working_buffer->value, final_mapping.second->value, 1, &copy);
 				}
 				else
 				{
 					if (context != rsx::texture_upload_context::dma)
 					{
 						// Partial load for the bits outside the existing image
 						// NOTE: A true DMA section would have been prepped beforehand
 						// TODO: Parial range load/flush
 						vk::load_dma(valid_range.start, section_length);
 					}
 					std::vector<VkBufferCopy> copy;
 					copy.reserve(transfer_height);
 					u32 dst_offset = final_mapping.first;
 					u32 src_offset = 0;
 					for (unsigned row = 0; row < transfer_height; ++row)
 					{
 						copy.push_back({ src_offset, dst_offset, transfer_pitch });
 						src_offset += real_pitch;
 						dst_offset += rsx_pitch;
 					}
 					vkCmdCopyBuffer(cmd, working_buffer->value, final_mapping.second->value, transfer_height, copy.data());
 				}
 			}
 			else
 			{
 				VkBufferImageCopy region = {};
 				region.bufferRowLength = (rsx_pitch / internal_bpp);
 				region.imageSubresource = { src->aspect(), 0, 0, 1 };
 				region.imageOffset = { src_area.x1, src_area.y1, 0 };
 				region.imageExtent = { transfer_width, transfer_height, 1 };
 				auto mapping = vk::map_dma(cmd, valid_range.start, valid_range.length());
 				region.bufferOffset = mapping.first;
 				vkCmdCopyImageToBuffer(cmd, src->value, src->current_layout, mapping.second->value, 1, &region);
 			}
 			src->pop_layout(cmd);
 			// Create event object for this transfer and queue signal op
 			dma_fence = std::make_unique<vk::event>(*m_device);
 			dma_fence->signal(cmd, VK_PIPELINE_STAGE_TRANSFER_BIT);
 			// Set cb flag for queued dma operations
 			cmd.set_flag(vk::command_buffer::cb_has_dma_transfer);
 			if (get_context() == rsx::texture_upload_context::dma)
 			{
 				// Save readback hint in case transformation is required later
 				switch (internal_bpp)
 				{
 				case 2:
 					gcm_format = CELL_GCM_TEXTURE_R5G6B5;
 					break;
 				case 4:
 				default:
 					gcm_format = CELL_GCM_TEXTURE_A8R8G8B8;
 					break;
 				}
 			}
 			synchronized = true;
 			sync_timestamp = get_system_time();
 		}
 		void copy_texture(vk::command_buffer& cmd, bool miss)
 		{
@ -610,202 +457,7 @@ namespace vk
 			return mapping;
 		}
-		void copy_transfer_regions_impl(vk::command_buffer& cmd, vk::image* dst, const std::vector<copy_region_descriptor>& sections_to_transfer) const
+		void copy_transfer_regions_impl(vk::command_buffer& cmd, vk::image* dst, const std::vector<copy_region_descriptor>& sections_to_transfer) const;
 		{
 			const auto dst_aspect = dst->aspect();
 			const auto dst_bpp = vk::get_format_texel_width(dst->format());
 			for (const auto &section : sections_to_transfer)
 			{
 				if (!section.src)
 					continue;
 				const bool typeless = section.src->aspect() != dst_aspect ||
 					!formats_are_bitcast_compatible(dst, section.src);
 				// Avoid inserting unnecessary barrier GENERAL->TRANSFER_SRC->GENERAL in active render targets
 				const auto preferred_layout = (section.src->current_layout != VK_IMAGE_LAYOUT_GENERAL) ?
 					VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL : VK_IMAGE_LAYOUT_GENERAL;
 				section.src->push_layout(cmd, preferred_layout);
 				auto src_image = section.src;
 				auto src_x = section.src_x;
 				auto src_y = section.src_y;
 				auto src_w = section.src_w;
 				auto src_h = section.src_h;
 				rsx::flags32_t transform = section.xform;
 				if (section.xform == rsx::surface_transform::coordinate_transform)
 				{
 					// Dimensions were given in 'dst' space. Work out the real source coordinates
 					const auto src_bpp = vk::get_format_texel_width(section.src->format());
 					src_x = (src_x * dst_bpp) / src_bpp;
 					src_w = ::aligned_div<u16>(src_w * dst_bpp, src_bpp);
 					transform &= ~(rsx::surface_transform::coordinate_transform);
 				}
 				if (auto surface = dynamic_cast<vk::render_target*>(section.src))
 				{
 					surface->transform_samples_to_pixels(src_x, src_w, src_y, src_h);
 				}
 				if (typeless) [[unlikely]]
 				{
 					const auto src_bpp = vk::get_format_texel_width(section.src->format());
 					const u16 convert_w = u16(src_w * src_bpp) / dst_bpp;
 					const u16 convert_x = u16(src_x * src_bpp) / dst_bpp;
 					if (convert_w == section.dst_w && src_h == section.dst_h &&
 						transform == rsx::surface_transform::identity &&
 						section.level == 0 && section.dst_z == 0)
 					{
 						// Optimization to avoid double transfer
 						// TODO: Handle level and layer offsets
 						const areai src_rect = coordi{{ src_x, src_y }, { src_w, src_h }};
 						const areai dst_rect = coordi{{ section.dst_x, section.dst_y }, { section.dst_w, section.dst_h }};
 						vk::copy_image_typeless(cmd, section.src, dst, src_rect, dst_rect, 1);
 						section.src->pop_layout(cmd);
 						continue;
 					}
 					src_image = vk::get_typeless_helper(dst->format(), dst->format_class(), convert_x + convert_w, src_y + src_h);
 					src_image->change_layout(cmd, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL);
 					const areai src_rect = coordi{{ src_x, src_y }, { src_w, src_h }};
 					const areai dst_rect = coordi{{ convert_x, src_y }, { convert_w, src_h }};
 					vk::copy_image_typeless(cmd, section.src, src_image, src_rect, dst_rect, 1);
 					src_image->change_layout(cmd, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL);
 					src_x = convert_x;
 					src_w = convert_w;
 				}
 				ensure(src_image->current_layout == VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL || src_image->current_layout == VK_IMAGE_LAYOUT_GENERAL);
 				// Final aspect mask of the 'final' transfer source
 				const auto new_src_aspect = src_image->aspect();
 				if (src_w == section.dst_w && src_h == section.dst_h && transform == rsx::surface_transform::identity) [[likely]]
 				{
 					VkImageCopy copy_rgn;
 					copy_rgn.srcOffset = { src_x, src_y, 0 };
 					copy_rgn.dstOffset = { section.dst_x, section.dst_y, 0 };
 					copy_rgn.dstSubresource = { dst_aspect, 0, 0, 1 };
 					copy_rgn.srcSubresource = { new_src_aspect, 0, 0, 1 };
 					copy_rgn.extent = { src_w, src_h, 1 };
 					if (dst->info.imageType == VK_IMAGE_TYPE_3D)
 					{
 						copy_rgn.dstOffset.z = section.dst_z;
 					}
 					else
 					{
 						copy_rgn.dstSubresource.baseArrayLayer = section.dst_z;
 						copy_rgn.dstSubresource.mipLevel = section.level;
 					}
 					vkCmdCopyImage(cmd, src_image->value, src_image->current_layout, dst->value, dst->current_layout, 1, &copy_rgn);
 				}
 				else
 				{
 					ensure(section.dst_z == 0);
 					u16 dst_x = section.dst_x, dst_y = section.dst_y;
 					vk::image* _dst;
 					if (src_image->info.format == dst->info.format && section.level == 0) [[likely]]
 					{
 						_dst = dst;
 					}
 					else
 					{
 						// Either a bitcast is required or a scale+copy to mipmap level
 						_dst = vk::get_typeless_helper(src_image->format(), src_image->format_class(), dst->width(), dst->height() * 2);
 						_dst->change_layout(cmd, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL);
 					}
 					if (transform == rsx::surface_transform::identity)
 					{
 						vk::copy_scaled_image(cmd, src_image, _dst,
 							coordi{ { src_x, src_y }, { src_w, src_h } },
 							coordi{ { section.dst_x, section.dst_y }, { section.dst_w, section.dst_h } },
 							1, src_image->format() == _dst->format(),
 							VK_FILTER_NEAREST);
 					}
 					else if (transform == rsx::surface_transform::argb_to_bgra)
 					{
 						VkBufferImageCopy copy{};
 						copy.imageExtent = { src_w, src_h, 1 };
 						copy.imageOffset = { src_x, src_y, 0 };
 						copy.imageSubresource = { src_image->aspect(), 0, 0, 1 };
 						const auto mem_length = src_w * src_h * dst_bpp;
 						auto scratch_buf = vk::get_scratch_buffer(mem_length);
 						vkCmdCopyImageToBuffer(cmd, src_image->value, src_image->current_layout, scratch_buf->value, 1, &copy);
 						vk::insert_buffer_memory_barrier(cmd, scratch_buf->value, 0, mem_length, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
 							VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT);
 						auto shuffle_kernel = vk::get_compute_task<vk::cs_shuffle_32>();
 						shuffle_kernel->run(cmd, scratch_buf, mem_length);
 						vk::insert_buffer_memory_barrier(cmd, scratch_buf->value, 0, mem_length, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT,
 							VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_TRANSFER_READ_BIT);
 						auto tmp = vk::get_typeless_helper(src_image->format(), src_image->format_class(), section.dst_x + section.dst_w, section.dst_y + section.dst_h);
 						tmp->change_layout(cmd, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL);
 						copy.imageOffset = { 0, 0, 0 };
 						vkCmdCopyBufferToImage(cmd, scratch_buf->value, tmp->value, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 1, &copy);
 						dst_x = 0;
 						dst_y = 0;
 						if (src_w != section.dst_w || src_h != section.dst_h)
 						{
 							// Optionally scale if needed
 							if (tmp == _dst) [[unlikely]]
 							{
 								dst_y = src_h;
 							}
 							vk::copy_scaled_image(cmd, tmp, _dst,
 								areai{ 0, 0, src_w, static_cast<s32>(src_h) },
 								coordi{ { dst_x, dst_y }, { section.dst_w, section.dst_h } },
 								1, tmp->info.format == _dst->info.format,
 								VK_FILTER_NEAREST);
 						}
 						else
 						{
 							_dst = tmp;
 						}
 					}
 					else
 					{
 						fmt::throw_exception("Unreachable");
 					}
 					if (_dst != dst) [[unlikely]]
 					{
 						// Casting comes after the scaling!
 						VkImageCopy copy_rgn;
 						copy_rgn.srcOffset = { s32(dst_x), s32(dst_y), 0 };
 						copy_rgn.dstOffset = { section.dst_x, section.dst_y, 0 };
 						copy_rgn.dstSubresource = { dst_aspect, section.level, 0, 1 };
 						copy_rgn.srcSubresource = { _dst->aspect(), 0, 0, 1 };
 						copy_rgn.extent = { section.dst_w, section.dst_h, 1 };
 						_dst->change_layout(cmd, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL);
 						vkCmdCopyImage(cmd, _dst->value, _dst->current_layout, dst->value, dst->current_layout, 1, &copy_rgn);
 					}
 				}
 				section.src->pop_layout(cmd);
 			}
 		}
 		vk::image* get_template_from_collection_impl(const std::vector<copy_region_descriptor>& sections_to_transfer) const
 		{
--- a/rpcs3/GLGSRender.vcxproj
+++ b/rpcs3/GLGSRender.vcxproj
@ -107,6 +107,7 @@
    <ClCompile Include="Emu\RSX\GL\OpenGL.cpp" />
    <ClCompile Include="Emu\RSX\GL\GLTexture.cpp" />
    <ClCompile Include="Emu\RSX\GL\GLVertexBuffers.cpp" />
    <ClCompile Include="Emu\RSX\GL\GLTextureCache.cpp" />
  </ItemGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
  <ImportGroup Label="ExtensionTargets">
--- a/rpcs3/GLGSRender.vcxproj.filters
+++ b/rpcs3/GLGSRender.vcxproj.filters
@ -14,6 +14,7 @@
    <ClCompile Include="Emu\RSX\GL\GLShaderInterpreter.cpp" />
    <ClCompile Include="Emu\RSX\GL\GLVertexBuffers.cpp" />
    <ClCompile Include="Emu\RSX\GL\GLPipelineCompiler.cpp" />
    <ClCompile Include="Emu\RSX\GL\GLTextureCache.cpp" />
  </ItemGroup>
  <ItemGroup>
    <ClInclude Include="Emu\RSX\GL\GLTexture.h" />
--- a/rpcs3/Loader/PSF.cpp
+++ b/rpcs3/Loader/PSF.cpp
@ -1,6 +1,8 @@
 #include "stdafx.h"
 #include "PSF.h"
 #include "util/asm.hpp"
 LOG_CHANNEL(psf_log, "PSF");
 template<>
@ -208,7 +210,7 @@ namespace psf
 		}
 		// Align next section (data) offset
-		key_offset = ::align(key_offset, 4);
+		key_offset = utils::align(key_offset, 4);
 		// Generate header
 		header_t header;
--- a/rpcs3/VKGSRender.vcxproj
+++ b/rpcs3/VKGSRender.vcxproj
@ -67,6 +67,7 @@
    <ClCompile Include="Emu\RSX\VK\VKTexture.cpp" />
    <ClCompile Include="Emu\RSX\VK\VKVertexBuffers.cpp" />
    <ClCompile Include="Emu\RSX\VK\VKVertexProgram.cpp" />
    <ClCompile Include="Emu\RSX\VK\VKTextureCache.cpp" />
    <ClCompile Include="Emu\RSX\VK\VKMemAlloc.cpp" />
  </ItemGroup>
  <ItemGroup>
--- a/rpcs3/VKGSRender.vcxproj.filters
+++ b/rpcs3/VKGSRender.vcxproj.filters
@ -18,6 +18,7 @@
    <ClCompile Include="Emu\RSX\VK\VKTexture.cpp" />
    <ClCompile Include="Emu\RSX\VK\VKVertexBuffers.cpp" />
    <ClCompile Include="Emu\RSX\VK\VKVertexProgram.cpp" />
    <ClCompile Include="Emu\RSX\VK\VKTextureCache.cpp" />
    <ClCompile Include="Emu\RSX\VK\VKMemAlloc.cpp" />
    <ClCompile Include="Emu\RSX\VK\VKCommandStream.cpp" />
    <ClCompile Include="Emu\RSX\VK\VKQueryPool.cpp" />
--- a/rpcs3/rpcs3qt/cheat_manager.cpp
+++ b/rpcs3/rpcs3qt/cheat_manager.cpp
@ -18,6 +18,7 @@
 #include "Emu/Cell/PPUFunction.h"
 #include "util/yaml.hpp"
 #include "util/asm.hpp"
 #include "util/to_endian.hpp"
 #include "Utilities/StrUtil.h"
 #include "Utilities/bin_patch.h" // get_patches_path()
@ -418,17 +419,17 @@ bool cheat_engine::set_value(const u32 offset, const T value)
 			if (exec_code_at_end && exec_code_at_start)
 			{
-				size = align<u32>(addr + size, 4) - (addr & -4);
+				size = utils::align<u32>(addr + size, 4) - (addr & -4);
 				addr &= -4;
 			}
 			else if (exec_code_at_end)
 			{
-				size -= align<u32>(size - 4096 + (addr & 4095), 4);
+				size -= utils::align<u32>(size - 4096 + (addr & 4095), 4);
-				addr = align<u32>(addr, 4096);
+				addr = utils::align<u32>(addr, 4096);
 			}
 			else if (exec_code_at_start)
 			{
-				size = align<u32>(4096 - (addr & 4095), 4);
+				size = utils::align<u32>(4096 - (addr & 4095), 4);
 				addr &= -4;
 			}
--- a/rpcs3/rpcs3qt/debugger_frame.cpp
+++ b/rpcs3/rpcs3qt/debugger_frame.cpp
@ -27,6 +27,8 @@
 #include <QVBoxLayout>
 #include <QTimer>
 #include "util/asm.hpp"
 constexpr auto qstr = QString::fromStdString;
 debugger_frame::debugger_frame(std::shared_ptr<gui_settings> settings, QWidget *parent)
@ -573,7 +575,7 @@ void debugger_frame::ShowGotoAddressDialog()
 	if (cpu)
 	{
 		// -1 turns into 0
-		u32 pc = ::align<u32>(cpu->get_pc(), 4);
+		u32 pc = utils::align<u32>(cpu->get_pc(), 4);
 		address_preview_label->setText(QString("Address: 0x%1").arg(pc, 8, 16, QChar('0')));
 		expression_input->setPlaceholderText(QString("0x%1").arg(pc, 8, 16, QChar('0')));
 	}
@ -605,7 +607,7 @@ void debugger_frame::ShowGotoAddressDialog()
 	if (diag->exec() == QDialog::Accepted)
 	{
 		// -1 turns into 0
-		u32 address = ::align<u32>(cpu ? cpu->get_pc() : 0, 4);
+		u32 address = utils::align<u32>(cpu ? cpu->get_pc() : 0, 4);
 		if (expression_input->text().isEmpty())
 		{
--- a/rpcs3/rpcs3qt/memory_viewer_panel.cpp
+++ b/rpcs3/rpcs3qt/memory_viewer_panel.cpp
@ -15,6 +15,8 @@
 #include <QWheelEvent>
 #include <shared_mutex>
 #include "util/asm.hpp"
 constexpr auto qstr = QString::fromStdString;
 memory_viewer_panel::memory_viewer_panel(QWidget* parent, u32 addr)
@ -209,7 +211,7 @@ memory_viewer_panel::memory_viewer_panel(QWidget* parent, u32 addr)
 	{
 		bool ok;
 		const QString text = m_addr_line->text();
-		m_addr = (text.startsWith("0x", Qt::CaseInsensitive) ? text.right(text.size() - 2) : text).toULong(&ok, 16); 
+		m_addr = (text.startsWith("0x", Qt::CaseInsensitive) ? text.right(text.size() - 2) : text).toULong(&ok, 16);
 		m_addr -= m_addr % (m_colcount * 4); // Align by amount of bytes in a row
 		m_addr_line->setText(QString("%1").arg(m_addr, 8, 16, QChar('0')));	// get 8 digits in input line
 		ShowMemory();
@ -293,7 +295,7 @@ void memory_viewer_panel::resizeEvent(QResizeEvent *event)
 std::string memory_viewer_panel::getHeaderAtAddr(u32 addr)
 {
 	// Check if its an SPU Local Storage beginning
-	const u32 spu_boundary = ::align<u32>(addr, SPU_LS_SIZE);
+	const u32 spu_boundary = utils::align<u32>(addr, SPU_LS_SIZE);
 	if (spu_boundary <= addr + m_colcount * 4 - 1)
 	{
--- a/rpcs3/rpcs3qt/register_editor_dialog.cpp
+++ b/rpcs3/rpcs3qt/register_editor_dialog.cpp
@ -15,6 +15,7 @@
 #include <charconv>
 #include "util/v128.hpp"
 #include "util/asm.hpp"
 constexpr auto qstr = QString::fromStdString;
 inline std::string sstr(const QString& _in) { return _in.toStdString(); }
@ -30,7 +31,7 @@ enum registers : int
 	ppu_ff31 = ppu_ff0 + 31,
 	ppu_v0,
 	ppu_v31 = ppu_v0 + 31,
-	spu_r0 = ::align(ppu_v31 + 1u, 128),
+	spu_r0 = utils::align(ppu_v31 + 1u, 128),
 	spu_r127 = spu_r0 + 127,
 	PPU_CR,
 	PPU_LR,
--- a/rpcs3/rpcs3qt/settings_dialog.cpp
+++ b/rpcs3/rpcs3qt/settings_dialog.cpp
@ -34,6 +34,7 @@
 #include <thread>
 #include "util/sysinfo.hpp"
 #include "util/asm.hpp"
 #ifdef WITH_DISCORD_RPC
 #include "_discord_utils.h"
@ -1809,7 +1810,7 @@ void settings_dialog::SnapSlider(QSlider *slider, int interval)
 		{
 			return;
 		}
-		slider->setValue(::rounded_div(value, interval) * interval);
+		slider->setValue(utils::rounded_div(value, interval) * interval);
 	});
 }
--- a/rpcs3/util/asm.hpp
+++ b/rpcs3/util/asm.hpp
@ -292,6 +292,32 @@ namespace utils
 		do _mm_pause();
 		while (__rdtsc() - start < cycles);
 	}
 	// Align to power of 2
 	template <typename T, typename = std::enable_if_t<std::is_integral<T>::value && std::is_unsigned<T>::value>>
 	constexpr T align(T value, ullong align)
 	{
 		return static_cast<T>((value + (align - 1)) & (0 - align));
 	}
 	// General purpose aligned division, the result is rounded up not truncated
 	template <typename T, typename = std::enable_if_t<std::is_integral<T>::value && std::is_unsigned<T>::value>>
 	constexpr T aligned_div(T value, ullong align)
 	{
 		return static_cast<T>((value + align - 1) / align);
 	}
 	// General purpose aligned division, the result is rounded to nearest
 	template <typename T, typename = std::enable_if_t<std::is_integral<T>::value>>
 	constexpr T rounded_div(T value, std::conditional_t<std::is_signed<T>::value, llong, ullong> align)
 	{
 		if constexpr (std::is_unsigned<T>::value)
 		{
 			return static_cast<T>((value + (align / 2)) / align);
 		}
 		return static_cast<T>((value + (value < 0 ? 0 - align : align) / 2) / align);
 	}
 } // namespace utils
 using utils::busy_wait;
--- a/rpcs3/util/sysinfo.cpp
+++ b/rpcs3/util/sysinfo.cpp
@ -15,6 +15,8 @@
 #include <errno.h>
 #endif
 #include "util/asm.hpp"
 inline std::array<u32, 4> utils::get_cpuid(u32 func, u32 subfunc)
 {
 	int regs[4];
@ -298,7 +300,7 @@ std::string utils::get_OS_version()
 static constexpr ullong round_tsc(ullong val)
 {
-	return ::rounded_div(val, 1'000'000) * 1'000'000;
+	return utils::rounded_div(val, 1'000'000) * 1'000'000;
 }
 ullong utils::get_tsc_freq()
--- a/rpcs3/util/types.hpp
+++ b/rpcs3/util/types.hpp
@ -595,31 +595,6 @@ struct f16
 	}
 };
 template <typename T, typename = std::enable_if_t<std::is_integral<T>::value && std::is_unsigned<T>::value>>
 constexpr T align(T value, ullong align)
 {
 	return static_cast<T>((value + (align - 1)) & (0 - align));
 }
 // General purpose aligned division, the result is rounded up not truncated
 template <typename T, typename = std::enable_if_t<std::is_integral<T>::value && std::is_unsigned<T>::value>>
 constexpr T aligned_div(T value, ullong align)
 {
 	return static_cast<T>((value + align - 1) / align);
 }
 // General purpose aligned division, the result is rounded to nearest
 template <typename T, typename = std::enable_if_t<std::is_integral<T>::value>>
 constexpr T rounded_div(T value, std::conditional_t<std::is_signed<T>::value, llong, ullong> align)
 {
 	if constexpr (std::is_unsigned<T>::value)
 	{
 		return static_cast<T>((value + (align / 2)) / align);
 	}
 	return static_cast<T>((value + (value < 0 ? 0 - align : align) / 2) / align);
 }
 template <typename T, typename T2>
 inline u32 offset32(T T2::*const mptr)
 {
--- a/rpcs3/util/vm_native.cpp
+++ b/rpcs3/util/vm_native.cpp
@ -1,6 +1,7 @@
 #include "stdafx.h"
 #include "util/logs.hpp"
 #include "util/vm.hpp"
 #include "util/asm.hpp"
 #ifdef _WIN32
 #include "util/dyn_lib.hpp"
 #include <Windows.h>
@ -209,7 +210,7 @@ namespace utils
 	}
 	shm::shm(u32 size, u32 flags)
-		: m_size(::align(size, 0x10000))
+		: m_size(utils::align(size, 0x10000))
 		, m_flags(flags)
 		, m_ptr(0)
 	{
@ -306,7 +307,7 @@ namespace utils
 		{
 			const u64 res64 = reinterpret_cast<u64>(::mmap(reinterpret_cast<void*>(ptr64), m_size + 0xf000, PROT_NONE, MAP_ANON | MAP_PRIVATE, -1, 0));
-			const u64 aligned = ::align(res64, 0x10000);
+			const u64 aligned = utils::align(res64, 0x10000);
 			const auto result = ::mmap(reinterpret_cast<void*>(aligned), m_size, +prot, MAP_SHARED | MAP_FIXED, m_file, 0);
 			// Now cleanup remnants