From eec11bfba93e4e1c8d0c6105685ea70aee8000c1 Mon Sep 17 00:00:00 2001 From: Nekotekina Date: Fri, 18 Dec 2020 17:43:34 +0300 Subject: [PATCH] Move align helpers to util/asm.hpp Also add some files: GLTextureCache.cpp VKTextureCache.cpp --- Utilities/File.cpp | 4 +- Utilities/JIT.cpp | 19 +- rpcs3/Crypto/unedat.cpp | 5 +- rpcs3/Emu/CMakeLists.txt | 2 + rpcs3/Emu/Cell/Modules/cellDmux.cpp | 8 +- rpcs3/Emu/Cell/Modules/cellSaveData.cpp | 8 +- rpcs3/Emu/Cell/Modules/cellVdec.cpp | 5 +- rpcs3/Emu/Cell/Modules/sceNpTrophy.cpp | 3 +- rpcs3/Emu/Cell/PPUModule.cpp | 13 +- rpcs3/Emu/Cell/PPUThread.cpp | 4 +- rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp | 6 +- rpcs3/Emu/Cell/SPUThread.cpp | 2 +- rpcs3/Emu/Cell/lv2/sys_memory.cpp | 5 +- rpcs3/Emu/Cell/lv2/sys_ppu_thread.cpp | 4 +- rpcs3/Emu/Cell/lv2/sys_spu.cpp | 2 +- rpcs3/Emu/Memory/vm.cpp | 14 +- rpcs3/Emu/NP/np_handler.cpp | 4 +- rpcs3/Emu/RSX/Capture/rsx_replay.cpp | 3 +- rpcs3/Emu/RSX/Common/TextureUtils.cpp | 12 +- rpcs3/Emu/RSX/Common/ring_buffer_helper.h | 9 +- rpcs3/Emu/RSX/Common/surface_store.cpp | 12 +- rpcs3/Emu/RSX/Common/surface_store.h | 4 +- rpcs3/Emu/RSX/GL/GLCompute.h | 4 +- rpcs3/Emu/RSX/GL/GLGSRender.cpp | 2 +- rpcs3/Emu/RSX/GL/GLHelpers.h | 17 +- rpcs3/Emu/RSX/GL/GLTexture.cpp | 12 +- rpcs3/Emu/RSX/GL/GLTextureCache.cpp | 191 ++++++++++++ rpcs3/Emu/RSX/GL/GLTextureCache.h | 184 +---------- rpcs3/Emu/RSX/RSXThread.cpp | 48 ++- rpcs3/Emu/RSX/RSXThread.h | 46 +-- rpcs3/Emu/RSX/VK/VKCompute.h | 8 +- rpcs3/Emu/RSX/VK/VKDMA.cpp | 6 +- rpcs3/Emu/RSX/VK/VKHelpers.cpp | 8 +- rpcs3/Emu/RSX/VK/VKPresent.cpp | 4 +- rpcs3/Emu/RSX/VK/VKResolveHelper.h | 4 +- rpcs3/Emu/RSX/VK/VKTexture.cpp | 18 +- rpcs3/Emu/RSX/VK/VKTextureCache.cpp | 360 ++++++++++++++++++++++ rpcs3/Emu/RSX/VK/VKTextureCache.h | 352 +-------------------- rpcs3/GLGSRender.vcxproj | 1 + rpcs3/GLGSRender.vcxproj.filters | 1 + rpcs3/Loader/PSF.cpp | 4 +- rpcs3/VKGSRender.vcxproj | 1 + rpcs3/VKGSRender.vcxproj.filters | 1 + rpcs3/rpcs3qt/cheat_manager.cpp | 9 +- rpcs3/rpcs3qt/debugger_frame.cpp | 6 +- rpcs3/rpcs3qt/memory_viewer_panel.cpp | 6 +- rpcs3/rpcs3qt/register_editor_dialog.cpp | 3 +- rpcs3/rpcs3qt/settings_dialog.cpp | 3 +- rpcs3/util/asm.hpp | 26 ++ rpcs3/util/sysinfo.cpp | 4 +- rpcs3/util/types.hpp | 25 -- rpcs3/util/vm_native.cpp | 5 +- 52 files changed, 794 insertions(+), 713 deletions(-) create mode 100644 rpcs3/Emu/RSX/GL/GLTextureCache.cpp create mode 100644 rpcs3/Emu/RSX/VK/VKTextureCache.cpp diff --git a/Utilities/File.cpp b/Utilities/File.cpp index 141251b5c0..f1f52ae816 100644 --- a/Utilities/File.cpp +++ b/Utilities/File.cpp @@ -10,6 +10,8 @@ #include #include +#include "util/asm.hpp" + using namespace std::literals::string_literals; #ifdef _WIN32 @@ -1725,7 +1727,7 @@ u64 fs::get_dir_size(const std::string& path, u64 rounding_alignment) if (!entry.is_directory) { - result += ::align(entry.size, rounding_alignment); + result += utils::align(entry.size, rounding_alignment); } else { diff --git a/Utilities/JIT.cpp b/Utilities/JIT.cpp index 83e8c130a2..14c006af12 100644 --- a/Utilities/JIT.cpp +++ b/Utilities/JIT.cpp @@ -6,6 +6,7 @@ #include "util/logs.hpp" #include "mutex.h" #include "util/vm.hpp" +#include "util/asm.hpp" #include #include @@ -52,8 +53,8 @@ static u8* add_jit_memory(usz size, uint align) // Simple allocation by incrementing pointer to the next free data const u64 pos = Ctr.atomic_op([&](u64& ctr) -> u64 { - const u64 _pos = ::align(ctr & 0xffff'ffff, align); - const u64 _new = ::align(_pos + size, align); + const u64 _pos = utils::align(ctr & 
0xffff'ffff, align); + const u64 _new = utils::align(_pos + size, align); if (_new > 0x40000000) [[unlikely]] { @@ -69,7 +70,7 @@ static u8* add_jit_memory(usz size, uint align) // Check the necessity to commit more memory if (_new > olda) [[unlikely]] { - newa = ::align(_new, 0x200000); + newa = utils::align(_new, 0x200000); } ctr += _new - (ctr & 0xffff'ffff); @@ -223,7 +224,7 @@ asmjit::Runtime& asmjit::get_global_runtime() return asmjit::kErrorNoCodeGenerated; } - void* p = m_pos.fetch_add(::align(codeSize, 4096)); + void* p = m_pos.fetch_add(utils::align(codeSize, 4096)); if (!p || m_pos > m_max) [[unlikely]] { *dst = nullptr; @@ -237,7 +238,7 @@ asmjit::Runtime& asmjit::get_global_runtime() return asmjit::kErrorInvalidState; } - utils::memory_protect(p, ::align(codeSize, 4096), utils::protection::rx); + utils::memory_protect(p, utils::align(codeSize, 4096), utils::protection::rx); flush(p, relocSize); *dst = p; @@ -351,8 +352,8 @@ struct MemoryManager1 : llvm::RTDyldMemoryManager return nullptr; } - const u64 olda = ::align(oldp, align); - const u64 newp = ::align(olda + size, align); + const u64 olda = utils::align(oldp, align); + const u64 newp = utils::align(olda + size, align); if ((newp - 1) / c_max_size != oldp / c_max_size) { @@ -363,8 +364,8 @@ struct MemoryManager1 : llvm::RTDyldMemoryManager if ((oldp - 1) / c_page_size != (newp - 1) / c_page_size) { // Allocate pages on demand - const u64 pagea = ::align(oldp, c_page_size); - const u64 psize = ::align(newp - pagea, c_page_size); + const u64 pagea = utils::align(oldp, c_page_size); + const u64 psize = utils::align(newp - pagea, c_page_size); utils::memory_commit(this->ptr + pagea, psize, prot); } diff --git a/rpcs3/Crypto/unedat.cpp b/rpcs3/Crypto/unedat.cpp index d6059de6c1..50aab0aa32 100644 --- a/rpcs3/Crypto/unedat.cpp +++ b/rpcs3/Crypto/unedat.cpp @@ -6,6 +6,7 @@ #include #include "util/v128.hpp" +#include "util/asm.hpp" LOG_CHANNEL(edat_log, "EDAT"); @@ -949,7 +950,7 @@ bool EDATADecrypter::ReadHeader() }*/ file_size = edatHeader.file_size; - total_blocks = ::aligned_div(edatHeader.file_size, edatHeader.block_size); + total_blocks = utils::aligned_div(edatHeader.file_size, edatHeader.block_size); return true; } @@ -962,7 +963,7 @@ u64 EDATADecrypter::ReadData(u64 pos, u8* data, u64 size) // now we need to offset things to account for the actual 'range' requested const u64 startOffset = pos % edatHeader.block_size; - const u32 num_blocks = static_cast(::aligned_div(startOffset + size, edatHeader.block_size)); + const u32 num_blocks = static_cast(utils::aligned_div(startOffset + size, edatHeader.block_size)); const u64 bufSize = num_blocks*edatHeader.block_size; if (data_buf_size < (bufSize)) { diff --git a/rpcs3/Emu/CMakeLists.txt b/rpcs3/Emu/CMakeLists.txt index b13c6cc7d7..df0dbe6429 100644 --- a/rpcs3/Emu/CMakeLists.txt +++ b/rpcs3/Emu/CMakeLists.txt @@ -428,6 +428,7 @@ target_sources(rpcs3_emu PRIVATE RSX/GL/GLTexture.cpp RSX/GL/GLVertexBuffers.cpp RSX/GL/GLVertexProgram.cpp + RSX/GL/GLTextureCache.cpp RSX/GL/OpenGL.cpp ) @@ -454,6 +455,7 @@ if(TARGET 3rdparty_vulkan) RSX/VK/VKTexture.cpp RSX/VK/VKVertexBuffers.cpp RSX/VK/VKVertexProgram.cpp + RSX/VK/VKTextureCache.cpp ) endif() diff --git a/rpcs3/Emu/Cell/Modules/cellDmux.cpp b/rpcs3/Emu/Cell/Modules/cellDmux.cpp index eb66739524..df3607620c 100644 --- a/rpcs3/Emu/Cell/Modules/cellDmux.cpp +++ b/rpcs3/Emu/Cell/Modules/cellDmux.cpp @@ -7,7 +7,7 @@ #include "cellPamf.h" #include "cellDmux.h" -#include +#include "util/asm.hpp" LOG_CHANNEL(cellDmux); @@ -753,9 +753,9 
@@ PesHeader::PesHeader(DemuxerStream& stream) } ElementaryStream::ElementaryStream(Demuxer* dmux, u32 addr, u32 size, u32 fidMajor, u32 fidMinor, u32 sup1, u32 sup2, vm::ptr cbFunc, u32 cbArg, u32 spec) - : put(align(addr, 128)) + : put(utils::align(addr, 128)) , dmux(dmux) - , memAddr(align(addr, 128)) + , memAddr(utils::align(addr, 128)) , memSize(size - (addr - memAddr)) , fidMajor(fidMajor) , fidMinor(fidMinor) @@ -847,7 +847,7 @@ void ElementaryStream::push_au(u32 size, u64 dts, u64 pts, u64 userdata, bool ra addr = put; - put = align(put + 128 + size, 128); + put = utils::align(put + 128 + size, 128); put_count++; } diff --git a/rpcs3/Emu/Cell/Modules/cellSaveData.cpp b/rpcs3/Emu/Cell/Modules/cellSaveData.cpp index 499d615e69..37ea8fecf5 100644 --- a/rpcs3/Emu/Cell/Modules/cellSaveData.cpp +++ b/rpcs3/Emu/Cell/Modules/cellSaveData.cpp @@ -20,6 +20,8 @@ #include #include +#include "util/asm.hpp" + LOG_CHANNEL(cellSaveData); template<> @@ -953,7 +955,7 @@ static NEVER_INLINE error_code savedata_op(ppu_thread& ppu, u32 operation, u32 v { if (!file.is_directory) { - size_bytes += ::align(file.size, 1024); + size_bytes += utils::align(file.size, 1024); } } @@ -1334,7 +1336,7 @@ static NEVER_INLINE error_code savedata_op(ppu_thread& ppu, u32 operation, u32 v { statGet->fileNum++; - size_bytes += ::align(entry.size, 1024); // firmware rounds this value up + size_bytes += utils::align(entry.size, 1024); // firmware rounds this value up if (statGet->fileListNum >= setBuf->fileListMax) continue; @@ -1892,7 +1894,7 @@ static NEVER_INLINE error_code savedata_op(ppu_thread& ppu, u32 operation, u32 v // add file list per FS order to PARAM.SFO std::string final_blist; final_blist = fmt::merge(blist, "/"); - psf::assign(psf, "RPCS3_BLIST", psf::string(::align(::size32(final_blist) + 1, 4), final_blist)); + psf::assign(psf, "RPCS3_BLIST", psf::string(utils::align(::size32(final_blist) + 1, 4), final_blist)); // Write all files in temporary directory auto& fsfo = all_files["PARAM.SFO"]; diff --git a/rpcs3/Emu/Cell/Modules/cellVdec.cpp b/rpcs3/Emu/Cell/Modules/cellVdec.cpp index 648952383c..93045fbd0b 100644 --- a/rpcs3/Emu/Cell/Modules/cellVdec.cpp +++ b/rpcs3/Emu/Cell/Modules/cellVdec.cpp @@ -34,6 +34,7 @@ extern "C" #include #include "Utilities/lockless.h" #include +#include "util/asm.hpp" std::mutex g_mutex_avcodec_open2; @@ -879,7 +880,7 @@ error_code cellVdecGetPicture(u32 handle, vm::cptr format, vm sws_scale(vdec->sws, in_data, in_line, 0, h, out_data, out_line); - //const u32 buf_size = align(av_image_get_buffer_size(vdec->ctx->pix_fmt, vdec->ctx->width, vdec->ctx->height, 1), 128); + //const u32 buf_size = utils::align(av_image_get_buffer_size(vdec->ctx->pix_fmt, vdec->ctx->width, vdec->ctx->height, 1), 128); //// TODO: zero padding bytes @@ -974,7 +975,7 @@ error_code cellVdecGetPicItem(u32 handle, vm::pptr picItem) info->startAddr = 0x00000123; // invalid value (no address for picture) const int buffer_size = av_image_get_buffer_size(vdec->ctx->pix_fmt, vdec->ctx->width, vdec->ctx->height, 1); ensure(buffer_size >= 0); - info->size = align(buffer_size, 128); + info->size = utils::align(buffer_size, 128); info->auNum = 1; info->auPts[0].lower = static_cast(pts); info->auPts[0].upper = static_cast(pts >> 32); diff --git a/rpcs3/Emu/Cell/Modules/sceNpTrophy.cpp b/rpcs3/Emu/Cell/Modules/sceNpTrophy.cpp index 1baf1b0bca..e1a4e66a32 100644 --- a/rpcs3/Emu/Cell/Modules/sceNpTrophy.cpp +++ b/rpcs3/Emu/Cell/Modules/sceNpTrophy.cpp @@ -20,6 +20,7 @@ #include "Emu/Cell/lv2/sys_process.h" #include 
+#include "util/asm.hpp" LOG_CHANNEL(sceNpTrophy); @@ -1109,7 +1110,7 @@ error_code sceNpTrophyGetGameProgress(u32 context, u32 handle, vm::ptr perc const u32 trp_count = ctxt->tropusr->GetTrophiesCount(); // Round result to nearest (TODO: Check 0 trophies) - *percentage = trp_count ? ::rounded_div(unlocked * 100, trp_count) : 0; + *percentage = trp_count ? utils::rounded_div(unlocked * 100, trp_count) : 0; if (trp_count == 0 || trp_count > 128) { diff --git a/rpcs3/Emu/Cell/PPUModule.cpp b/rpcs3/Emu/Cell/PPUModule.cpp index 53ac4c4f08..34f525bbd3 100644 --- a/rpcs3/Emu/Cell/PPUModule.cpp +++ b/rpcs3/Emu/Cell/PPUModule.cpp @@ -22,6 +22,7 @@ #include #include #include +#include "util/asm.hpp" LOG_CHANNEL(ppu_loader); @@ -263,7 +264,7 @@ static void ppu_initialize_modules(ppu_linkage_info* link) } // Set memory protection to read-only - vm::page_protect(ppu_function_manager::addr, ::align(::size32(hle_funcs) * 8, 0x1000), 0, 0, vm::page_writable); + vm::page_protect(ppu_function_manager::addr, utils::align(::size32(hle_funcs) * 8, 0x1000), 0, 0, vm::page_writable); // Initialize function names const bool is_first = g_ppu_function_names.empty(); @@ -319,7 +320,7 @@ static void ppu_initialize_modules(ppu_linkage_info* link) } else { - const u32 next = ::align(alloc_addr, variable.second.align); + const u32 next = utils::align(alloc_addr, variable.second.align); const u32 end = next + variable.second.size; if (!next || (end >> 12 != alloc_addr >> 12)) @@ -1500,7 +1501,7 @@ void ppu_load_exec(const ppu_exec_object& elf) for (const auto& arg : Emu.argv) { - const u32 arg_size = ::align(::size32(arg) + 1, 0x10); + const u32 arg_size = utils::align(::size32(arg) + 1, 0x10); const u32 arg_addr = vm::alloc(arg_size, vm::main); std::memcpy(vm::base(arg_addr), arg.data(), arg_size); @@ -1513,7 +1514,7 @@ void ppu_load_exec(const ppu_exec_object& elf) for (const auto& arg : Emu.envp) { - const u32 arg_size = ::align(::size32(arg) + 1, 0x10); + const u32 arg_size = utils::align(::size32(arg) + 1, 0x10); const u32 arg_addr = vm::alloc(arg_size, vm::main); std::memcpy(vm::base(arg_addr), arg.data(), arg_size); @@ -1533,7 +1534,7 @@ void ppu_load_exec(const ppu_exec_object& elf) case 0x70: primary_stacksize = 1024 * 1024; break; // SYS_PROCESS_PRIMARY_STACK_SIZE_1M default: { - primary_stacksize = ::align(std::clamp(sz, 0x10000, 0x100000), 4096); + primary_stacksize = utils::align(std::clamp(sz, 0x10000, 0x100000), 4096); break; } } @@ -1636,7 +1637,7 @@ void ppu_load_exec(const ppu_exec_object& elf) if (prog.p_type == 0x1u /* LOAD */ && prog.p_memsz && (prog.p_flags & 0x2) == 0u /* W */) { // Set memory protection to read-only when necessary - ensure(vm::page_protect(addr, ::align(size, 0x1000), 0, 0, vm::page_writable)); + ensure(vm::page_protect(addr, utils::align(size, 0x1000), 0, 0, vm::page_writable)); } } } diff --git a/rpcs3/Emu/Cell/PPUThread.cpp b/rpcs3/Emu/Cell/PPUThread.cpp index 2e462f8e87..9b472040ab 100644 --- a/rpcs3/Emu/Cell/PPUThread.cpp +++ b/rpcs3/Emu/Cell/PPUThread.cpp @@ -242,7 +242,7 @@ extern void ppu_register_range(u32 addr, u32 size) // Register executable range at utils::memory_commit(&ppu_ref(addr), size * 2, utils::protection::rw); - vm::page_protect(addr, align(size, 0x10000), 0, vm::page_executable); + vm::page_protect(addr, utils::align(size, 0x10000), 0, vm::page_executable); const u64 fallback = g_cfg.core.ppu_decoder == ppu_decoder_type::llvm ? 
reinterpret_cast(ppu_recompiler_fallback) : reinterpret_cast(ppu_fallback); @@ -1098,7 +1098,7 @@ u32 ppu_thread::stack_push(u32 size, u32 align_v) ppu_thread& context = static_cast(*cpu); const u32 old_pos = vm::cast(context.gpr[1]); - context.gpr[1] -= align(size + 4, 8); // room minimal possible size + context.gpr[1] -= utils::align(size + 4, 8); // room minimal possible size context.gpr[1] &= ~(u64{align_v} - 1); // fix stack alignment if (old_pos >= context.stack_addr && old_pos < context.stack_addr + context.stack_size && context.gpr[1] < context.stack_addr) diff --git a/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp b/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp index 689b6032e7..0b5e180302 100644 --- a/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp +++ b/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp @@ -288,7 +288,7 @@ spu_function_t spu_recompiler::compile(spu_program&& _func) words_align = 64; const u32 starta = start & -64; - const u32 enda = ::align(end, 64); + const u32 enda = utils::align(end, 64); const u32 sizea = (enda - starta) / 64; ensure(sizea); @@ -369,7 +369,7 @@ spu_function_t spu_recompiler::compile(spu_program&& _func) words_align = 32; const u32 starta = start & -32; - const u32 enda = ::align(end, 32); + const u32 enda = utils::align(end, 32); const u32 sizea = (enda - starta) / 32; ensure(sizea); @@ -491,7 +491,7 @@ spu_function_t spu_recompiler::compile(spu_program&& _func) words_align = 32; const u32 starta = start & -32; - const u32 enda = ::align(end, 32); + const u32 enda = utils::align(end, 32); const u32 sizea = (enda - starta) / 32; ensure(sizea); diff --git a/rpcs3/Emu/Cell/SPUThread.cpp b/rpcs3/Emu/Cell/SPUThread.cpp index bfa0a336a9..cd15901925 100644 --- a/rpcs3/Emu/Cell/SPUThread.cpp +++ b/rpcs3/Emu/Cell/SPUThread.cpp @@ -2338,7 +2338,7 @@ void spu_thread::do_dma_transfer(spu_thread* _this, const spu_mfc_cmd& args, u8* } u32 range_addr = eal & -128; - u32 range_end = ::align(eal + size, 128); + u32 range_end = utils::align(eal + size, 128); // Handle the case of crossing 64K page borders (TODO: maybe split in 4K fragments?) if (range_addr >> 16 != (range_end - 1) >> 16) diff --git a/rpcs3/Emu/Cell/lv2/sys_memory.cpp b/rpcs3/Emu/Cell/lv2/sys_memory.cpp index 3340cb8295..79d569fa06 100644 --- a/rpcs3/Emu/Cell/lv2/sys_memory.cpp +++ b/rpcs3/Emu/Cell/lv2/sys_memory.cpp @@ -8,6 +8,7 @@ #include "Emu/IdManager.h" #include "util/vm.hpp" +#include "util/asm.hpp" LOG_CHANNEL(sys_memory); @@ -57,7 +58,7 @@ error_code sys_memory_allocate(cpu_thread& cpu, u32 size, u64 flags, vm::ptralloc(size, nullptr, align)) { @@ -128,7 +129,7 @@ error_code sys_memory_allocate_from_container(cpu_thread& cpu, u32 size, u32 cid return ct.ret; } - if (const auto area = vm::reserve_map(align == 0x10000 ? vm::user64k : vm::user1m, 0, ::align(size, 0x10000000), 0x401)) + if (const auto area = vm::reserve_map(align == 0x10000 ? 
vm::user64k : vm::user1m, 0, utils::align(size, 0x10000000), 0x401)) { if (u32 addr = area->alloc(size)) { diff --git a/rpcs3/Emu/Cell/lv2/sys_ppu_thread.cpp b/rpcs3/Emu/Cell/lv2/sys_ppu_thread.cpp index 65cb1ba2e3..613f835d4d 100644 --- a/rpcs3/Emu/Cell/lv2/sys_ppu_thread.cpp +++ b/rpcs3/Emu/Cell/lv2/sys_ppu_thread.cpp @@ -12,6 +12,8 @@ #include "sys_mmapper.h" #include "sys_memory.h" +#include "util/asm.hpp" + LOG_CHANNEL(sys_ppu_thread); // Simple structure to cleanup previous thread, because can't remove its own thread @@ -388,7 +390,7 @@ error_code _sys_ppu_thread_create(ppu_thread& ppu, vm::ptr thread_id, vm::p g_fxo->get()->clean(0); // Compute actual stack size and allocate - const u32 stack_size = ::align(std::max(_stacksz, 4096), 4096); + const u32 stack_size = utils::align(std::max(_stacksz, 4096), 4096); const auto dct = g_fxo->get(); diff --git a/rpcs3/Emu/Cell/lv2/sys_spu.cpp b/rpcs3/Emu/Cell/lv2/sys_spu.cpp index da70bf7865..f715335f00 100644 --- a/rpcs3/Emu/Cell/lv2/sys_spu.cpp +++ b/rpcs3/Emu/Cell/lv2/sys_spu.cpp @@ -99,7 +99,7 @@ void sys_spu_image::load(const fs::file& stream) this->nsegs = 0; this->segs = vm::null; - vm::page_protect(segs.addr(), ::align(mem_size, 4096), 0, 0, vm::page_writable); + vm::page_protect(segs.addr(), utils::align(mem_size, 4096), 0, 0, vm::page_writable); } void sys_spu_image::free() diff --git a/rpcs3/Emu/Memory/vm.cpp b/rpcs3/Emu/Memory/vm.cpp index 06e337ad5a..576126c6fb 100644 --- a/rpcs3/Emu/Memory/vm.cpp +++ b/rpcs3/Emu/Memory/vm.cpp @@ -974,13 +974,13 @@ namespace vm if (state & page_1m_size) { - i = ::align(i + 1, 0x100000 / 4096); + i = utils::align(i + 1, 0x100000 / 4096); continue; } if (state & page_64k_size) { - i = ::align(i + 1, 0x10000 / 4096); + i = utils::align(i + 1, 0x10000 / 4096); continue; } @@ -1177,7 +1177,7 @@ namespace vm const u32 min_page_size = flags & 0x100 ? 0x1000 : 0x10000; // Align to minimal page size - const u32 size = ::align(orig_size, min_page_size) + (flags & 0x10 ? 0x2000 : 0); + const u32 size = utils::align(orig_size, min_page_size) + (flags & 0x10 ? 0x2000 : 0); // Check alignment (it's page allocation, so passing small values there is just silly) if (align < min_page_size || align != (0x80000000u >> std::countl_zero(align))) @@ -1217,7 +1217,7 @@ namespace vm vm::writer_lock lock(0); // Search for an appropriate place (unoptimized) - for (u32 addr = ::align(this->addr, align); u64{addr} + size <= u64{this->addr} + this->size; addr += align) + for (u32 addr = utils::align(this->addr, align); u64{addr} + size <= u64{this->addr} + this->size; addr += align) { if (try_alloc(addr, pflags, size, std::move(shm))) { @@ -1240,7 +1240,7 @@ namespace vm const u32 min_page_size = flags & 0x100 ? 
0x1000 : 0x10000; // Align to minimal page size - const u32 size = ::align(orig_size, min_page_size); + const u32 size = utils::align(orig_size, min_page_size); // return if addr or size is invalid if (!size || addr < this->addr || orig_size > size || addr + u64{size} > this->addr + u64{this->size} || flags & 0x10) @@ -1410,7 +1410,7 @@ namespace vm static std::shared_ptr _find_map(u32 size, u32 align, u64 flags) { - for (u32 addr = ::align(0x20000000, align); addr - 1 < 0xC0000000 - 1; addr += align) + for (u32 addr = utils::align(0x20000000, align); addr - 1 < 0xC0000000 - 1; addr += align) { if (_test_map(addr, size)) { @@ -1485,7 +1485,7 @@ namespace vm vm::writer_lock lock(0); // Align to minimal page size - const u32 size = ::align(orig_size, 0x10000); + const u32 size = utils::align(orig_size, 0x10000); // Check alignment if (align < 0x10000 || align != (0x80000000u >> std::countl_zero(align))) diff --git a/rpcs3/Emu/NP/np_handler.cpp b/rpcs3/Emu/NP/np_handler.cpp index 5c00d186bb..f624c7d520 100644 --- a/rpcs3/Emu/NP/np_handler.cpp +++ b/rpcs3/Emu/NP/np_handler.cpp @@ -32,6 +32,8 @@ #include #endif +#include "util/asm.hpp" + LOG_CHANNEL(sys_net); LOG_CHANNEL(sceNp2); LOG_CHANNEL(sceNp); @@ -384,7 +386,7 @@ vm::addr_t np_handler::allocate(u32 size) return vm::cast(static_cast(0)); // Align allocs - const u32 alloc_size = ::align(size, 4); + const u32 alloc_size = utils::align(size, 4); if (alloc_size > mpool_avail) { sceNp.error("Not enough memory available in NP pool!"); diff --git a/rpcs3/Emu/RSX/Capture/rsx_replay.cpp b/rpcs3/Emu/RSX/Capture/rsx_replay.cpp index c9f4091a36..220dd52773 100644 --- a/rpcs3/Emu/RSX/Capture/rsx_replay.cpp +++ b/rpcs3/Emu/RSX/Capture/rsx_replay.cpp @@ -7,6 +7,7 @@ #include "Emu/RSX/RSXThread.h" #include +#include "util/asm.hpp" namespace rsx { @@ -23,7 +24,7 @@ namespace rsx } // User memory + fifo size - buffer_size = ::align(buffer_size, 0x100000) + 0x10000000; + buffer_size = utils::align(buffer_size, 0x100000) + 0x10000000; // We are not allowed to drain all memory so add a little g_fxo->init(buffer_size + 0x1000000); diff --git a/rpcs3/Emu/RSX/Common/TextureUtils.cpp b/rpcs3/Emu/RSX/Common/TextureUtils.cpp index 67baa736c8..ea0cd557f1 100644 --- a/rpcs3/Emu/RSX/Common/TextureUtils.cpp +++ b/rpcs3/Emu/RSX/Common/TextureUtils.cpp @@ -4,6 +4,8 @@ #include "../RSXThread.h" #include "../rsx_utils.h" +#include "util/asm.hpp" + namespace { // FIXME: GSL as_span break build if template parameter is non const with current revision. 
@@ -346,8 +348,8 @@ namespace } else { - current_subresource_layout.width_in_block = aligned_div(miplevel_width_in_texel, block_edge_in_texel); - current_subresource_layout.height_in_block = aligned_div(miplevel_height_in_texel, block_edge_in_texel); + current_subresource_layout.width_in_block = utils::aligned_div(miplevel_width_in_texel, block_edge_in_texel); + current_subresource_layout.height_in_block = utils::aligned_div(miplevel_height_in_texel, block_edge_in_texel); } if (padded_row) @@ -375,7 +377,7 @@ namespace miplevel_height_in_texel = std::max(miplevel_height_in_texel / 2, 1); } - offset_in_src = align(offset_in_src, 128); + offset_in_src = utils::align(offset_in_src, 128); } return result; @@ -922,8 +924,8 @@ namespace rsx usz result = 0; for (u16 i = 0; i < mipmap; ++i) { - usz rowPitch = align(block_size_in_byte * width_in_blocks, row_pitch_alignment); - result += align(rowPitch * height_in_blocks * depth, mipmap_alignment); + usz rowPitch = utils::align(block_size_in_byte * width_in_blocks, row_pitch_alignment); + result += utils::align(rowPitch * height_in_blocks * depth, mipmap_alignment); height_in_blocks = std::max(height_in_blocks / 2, 1); width_in_blocks = std::max(width_in_blocks / 2, 1); } diff --git a/rpcs3/Emu/RSX/Common/ring_buffer_helper.h b/rpcs3/Emu/RSX/Common/ring_buffer_helper.h index ae59bc5685..b06b75c89f 100644 --- a/rpcs3/Emu/RSX/Common/ring_buffer_helper.h +++ b/rpcs3/Emu/RSX/Common/ring_buffer_helper.h @@ -1,6 +1,7 @@ #pragma once #include "util/logs.hpp" +#include "util/asm.hpp" /** * Ring buffer memory helper : @@ -19,8 +20,8 @@ protected: template bool can_alloc(usz size) const { - usz alloc_size = align(size, Alignment); - usz aligned_put_pos = align(m_put_pos, Alignment); + usz alloc_size = utils::align(size, Alignment); + usz aligned_put_pos = utils::align(m_put_pos, Alignment); if (aligned_put_pos + alloc_size < m_size) { // range before get @@ -83,8 +84,8 @@ public: template usz alloc(usz size) { - const usz alloc_size = align(size, Alignment); - const usz aligned_put_pos = align(m_put_pos, Alignment); + const usz alloc_size = utils::align(size, Alignment); + const usz aligned_put_pos = utils::align(m_put_pos, Alignment); if (!can_alloc(size) && !grow(aligned_put_pos + alloc_size)) { diff --git a/rpcs3/Emu/RSX/Common/surface_store.cpp b/rpcs3/Emu/RSX/Common/surface_store.cpp index 72d1008fc9..53981ac6db 100644 --- a/rpcs3/Emu/RSX/Common/surface_store.cpp +++ b/rpcs3/Emu/RSX/Common/surface_store.cpp @@ -1,6 +1,8 @@ #include "stdafx.h" #include "surface_store.h" +#include "util/asm.hpp" + namespace rsx { namespace utility @@ -23,20 +25,20 @@ namespace rsx { switch (format) { - case surface_color_format::b8: return align(width, 256); + case surface_color_format::b8: return utils::align(width, 256); case surface_color_format::g8b8: case surface_color_format::x1r5g5b5_o1r5g5b5: case surface_color_format::x1r5g5b5_z1r5g5b5: - case surface_color_format::r5g6b5: return align(width * 2, 256); + case surface_color_format::r5g6b5: return utils::align(width * 2, 256); case surface_color_format::a8b8g8r8: case surface_color_format::x8b8g8r8_o8b8g8r8: case surface_color_format::x8b8g8r8_z8b8g8r8: case surface_color_format::x8r8g8b8_o8r8g8b8: case surface_color_format::x8r8g8b8_z8r8g8b8: case surface_color_format::x32: - case surface_color_format::a8r8g8b8: return align(width * 4, 256); - case surface_color_format::w16z16y16x16: return align(width * 8, 256); - case surface_color_format::w32z32y32x32: return align(width * 16, 256); + case 
surface_color_format::a8r8g8b8: return utils::align(width * 4, 256); + case surface_color_format::w16z16y16x16: return utils::align(width * 8, 256); + case surface_color_format::w32z32y32x32: return utils::align(width * 16, 256); } fmt::throw_exception("Unknown color surface format"); } diff --git a/rpcs3/Emu/RSX/Common/surface_store.h b/rpcs3/Emu/RSX/Common/surface_store.h index 039592e40e..90fde1bd3c 100644 --- a/rpcs3/Emu/RSX/Common/surface_store.h +++ b/rpcs3/Emu/RSX/Common/surface_store.h @@ -5,6 +5,8 @@ #include "../rsx_utils.h" #include +#include "util/asm.hpp" + namespace rsx { namespace utility @@ -918,7 +920,7 @@ namespace rsx { // Width is calculated in the coordinate-space of the requester; normalize info.src_area.x = (info.src_area.x * required_bpp) / surface_bpp; - info.src_area.width = align(width * required_bpp, surface_bpp) / surface_bpp; + info.src_area.width = utils::align(width * required_bpp, surface_bpp) / surface_bpp; } else { diff --git a/rpcs3/Emu/RSX/GL/GLCompute.h b/rpcs3/Emu/RSX/GL/GLCompute.h index 2f6b7773b0..5906dcb0f2 100644 --- a/rpcs3/Emu/RSX/GL/GLCompute.h +++ b/rpcs3/Emu/RSX/GL/GLCompute.h @@ -4,6 +4,8 @@ #include "Emu/IdManager.h" #include "GLHelpers.h" +#include "util/asm.hpp" + namespace gl { struct compute_task @@ -224,7 +226,7 @@ namespace gl m_data_length = data_length; const auto num_bytes_per_invocation = optimal_group_size * kernel_size * 4; - const auto num_bytes_to_process = align(data_length, num_bytes_per_invocation); + const auto num_bytes_to_process = utils::align(data_length, num_bytes_per_invocation); const auto num_invocations = num_bytes_to_process / num_bytes_per_invocation; if ((num_bytes_to_process + data_offset) > data->size()) diff --git a/rpcs3/Emu/RSX/GL/GLGSRender.cpp b/rpcs3/Emu/RSX/GL/GLGSRender.cpp index dee0cf0617..ececc28ef6 100644 --- a/rpcs3/Emu/RSX/GL/GLGSRender.cpp +++ b/rpcs3/Emu/RSX/GL/GLGSRender.cpp @@ -740,7 +740,7 @@ void GLGSRender::load_program_env() if (update_fragment_env) m_fragment_env_buffer->reserve_storage_on_heap(128); if (update_vertex_env) m_vertex_env_buffer->reserve_storage_on_heap(256); if (update_fragment_texture_env) m_texture_parameters_buffer->reserve_storage_on_heap(256); - if (update_fragment_constants) m_fragment_constants_buffer->reserve_storage_on_heap(align(fragment_constants_size, 256)); + if (update_fragment_constants) m_fragment_constants_buffer->reserve_storage_on_heap(utils::align(fragment_constants_size, 256)); if (update_transform_constants) m_transform_constants_buffer->reserve_storage_on_heap(8192); if (update_raster_env) m_raster_env_ring_buffer->reserve_storage_on_heap(128); diff --git a/rpcs3/Emu/RSX/GL/GLHelpers.h b/rpcs3/Emu/RSX/GL/GLHelpers.h index 52a36873e3..4e0bc10609 100644 --- a/rpcs3/Emu/RSX/GL/GLHelpers.h +++ b/rpcs3/Emu/RSX/GL/GLHelpers.h @@ -16,6 +16,7 @@ #include "Utilities/mutex.h" #include "Utilities/geometry.h" #include "util/logs.hpp" +#include "util/asm.hpp" #define GL_FRAGMENT_TEXTURES_START 0 #define GL_VERTEX_TEXTURES_START (GL_FRAGMENT_TEXTURES_START + 16) @@ -808,7 +809,7 @@ namespace gl virtual std::pair alloc_from_heap(u32 alloc_size, u16 alignment) { u32 offset = m_data_loc; - if (m_data_loc) offset = align(offset, alignment); + if (m_data_loc) offset = utils::align(offset, alignment); if ((offset + alloc_size) > m_size) { @@ -827,7 +828,7 @@ namespace gl } //Align data loc to 256; allows some "guard" region so we dont trample our own data inadvertently - m_data_loc = align(offset + alloc_size, 256); + m_data_loc = utils::align(offset + alloc_size, 
256); return std::make_pair(static_cast(m_memory_mapping) + offset, offset); } @@ -897,9 +898,9 @@ namespace gl ensure(m_memory_mapping == nullptr); u32 offset = m_data_loc; - if (m_data_loc) offset = align(offset, 256); + if (m_data_loc) offset = utils::align(offset, 256); - const u32 block_size = align(alloc_size + 16, 256); //Overallocate just in case we need to realign base + const u32 block_size = utils::align(alloc_size + 16, 256); //Overallocate just in case we need to realign base if ((offset + block_size) > m_size) { @@ -933,10 +934,10 @@ namespace gl std::pair alloc_from_heap(u32 alloc_size, u16 alignment) override { u32 offset = m_data_loc; - if (m_data_loc) offset = align(offset, alignment); + if (m_data_loc) offset = utils::align(offset, alignment); u32 padding = (offset - m_data_loc); - u32 real_size = align(padding + alloc_size, alignment); //Ensures we leave the loc pointer aligned after we exit + u32 real_size = utils::align(padding + alloc_size, alignment); //Ensures we leave the loc pointer aligned after we exit if (real_size > m_mapped_bytes) { @@ -946,10 +947,10 @@ namespace gl reserve_storage_on_heap(std::max(real_size, 4096U)); offset = m_data_loc; - if (m_data_loc) offset = align(offset, alignment); + if (m_data_loc) offset = utils::align(offset, alignment); padding = (offset - m_data_loc); - real_size = align(padding + alloc_size, alignment); + real_size = utils::align(padding + alloc_size, alignment); } m_data_loc = offset + real_size; diff --git a/rpcs3/Emu/RSX/GL/GLTexture.cpp b/rpcs3/Emu/RSX/GL/GLTexture.cpp index b416a02aac..2948bf49ca 100644 --- a/rpcs3/Emu/RSX/GL/GLTexture.cpp +++ b/rpcs3/Emu/RSX/GL/GLTexture.cpp @@ -6,6 +6,8 @@ #include "../RSXThread.h" #include "../RSXTexture.h" +#include "util/asm.hpp" + namespace gl { buffer g_typeless_transfer_buffer; @@ -614,8 +616,8 @@ namespace gl { //Compressed formats have a 4-byte alignment //TODO: Verify that samplers are not affected by the padding - width = align(width, 4); - height = align(height, 4); + width = utils::align(width, 4); + height = utils::align(height, 4); } GLenum target; @@ -654,7 +656,7 @@ namespace gl { caps.supports_vtc_decoding = gl::get_driver_caps().vendor_NVIDIA; - unpack_settings.row_length(align(dst->width(), 4)); + unpack_settings.row_length(utils::align(dst->width(), 4)); unpack_settings.apply(); glBindTexture(static_cast(dst->get_target()), dst->id()); @@ -664,7 +666,7 @@ namespace gl for (const rsx::subresource_layout& layout : input_layouts) { upload_texture_subresource(staging_buffer, layout, format, is_swizzled, caps); - const sizei image_size{ align(layout.width_in_texel, 4), align(layout.height_in_texel, 4) }; + const sizei image_size{utils::align(layout.width_in_texel, 4), utils::align(layout.height_in_texel, 4)}; switch (dst->get_target()) { @@ -835,7 +837,7 @@ namespace gl void upload_texture(texture* dst, u32 gcm_format, bool is_swizzled, const std::vector& subresources_layout) { // Calculate staging buffer size - const u32 aligned_pitch = align(dst->pitch(), 4); + const u32 aligned_pitch = utils::align(dst->pitch(), 4); usz texture_data_sz = dst->depth() * dst->height() * aligned_pitch; std::vector data_upload_buf(texture_data_sz); diff --git a/rpcs3/Emu/RSX/GL/GLTextureCache.cpp b/rpcs3/Emu/RSX/GL/GLTextureCache.cpp new file mode 100644 index 0000000000..040ee067bf --- /dev/null +++ b/rpcs3/Emu/RSX/GL/GLTextureCache.cpp @@ -0,0 +1,191 @@ +#include "stdafx.h" +#include "Emu/RSX/RSXThread.h" +#include "GLTexture.h" +#include "GLTextureCache.h" + +#include "util/asm.hpp" + 
+namespace gl
+{
+	void cached_texture_section::finish_flush()
+	{
+		// Free resources
+		glUnmapBuffer(GL_PIXEL_PACK_BUFFER);
+		glBindBuffer(GL_PIXEL_PACK_BUFFER, GL_NONE);
+
+		const auto valid_range = get_confirmed_range_delta();
+		const u32 valid_offset = valid_range.first;
+		const u32 valid_length = valid_range.second;
+		void *dst = get_ptr(get_section_base() + valid_offset);
+
+		if (!gl::get_driver_caps().ARB_compute_shader_supported)
+		{
+			switch (type)
+			{
+			case gl::texture::type::sbyte:
+			case gl::texture::type::ubyte:
+			{
+				// byte swapping does not work on byte types, use uint_8_8_8_8 for rgba8 instead to avoid penalty
+				ensure(!pack_unpack_swap_bytes);
+				break;
+			}
+			case gl::texture::type::uint_24_8:
+			{
+				// Swap bytes on D24S8 does not swap the whole dword, just shuffles the 3 bytes for D24
+				// In this regard, D24S8 is the same structure on both PC and PS3, but the endianness of the whole block is reversed on PS3
+				ensure(pack_unpack_swap_bytes == false);
+				ensure(real_pitch == (width * 4));
+				if (rsx_pitch == real_pitch) [[likely]]
+				{
+					stream_data_to_memory_swapped_u32(dst, dst, valid_length / 4, 4);
+				}
+				else
+				{
+					const u32 num_rows = utils::align(valid_length, rsx_pitch) / rsx_pitch;
+					u8* data = static_cast(dst);
+					for (u32 row = 0; row < num_rows; ++row)
+					{
+						stream_data_to_memory_swapped_u32(data, data, width, 4);
+						data += rsx_pitch;
+					}
+				}
+				break;
+			}
+			default:
+				break;
+			}
+		}
+
+		if (is_swizzled())
+		{
+			// This format is completely worthless to CPU processing algorithms where cache lines on die are linear.
+			// If this is happening, usually it means it was not a planned readback (e.g. shared pages situation)
+			rsx_log.warning("[Performance warning] CPU readback of swizzled data");
+
+			// Read-modify-write to avoid corrupting already resident memory outside texture region
+			std::vector tmp_data(rsx_pitch * height);
+			std::memcpy(tmp_data.data(), dst, tmp_data.size());
+
+			switch (type)
+			{
+			case gl::texture::type::uint_8_8_8_8:
+			case gl::texture::type::uint_24_8:
+				rsx::convert_linear_swizzle(tmp_data.data(), dst, width, height, rsx_pitch);
+				break;
+			case gl::texture::type::ushort_5_6_5:
+			case gl::texture::type::ushort:
+				rsx::convert_linear_swizzle(tmp_data.data(), dst, width, height, rsx_pitch);
+				break;
+			default:
+				rsx_log.error("Unexpected swizzled texture format 0x%x", static_cast(format));
+			}
+		}
+
+		if (context == rsx::texture_upload_context::framebuffer_storage)
+		{
+			// Update memory tag
+			static_cast(vram_texture)->sync_tag();
+		}
+	}
+
+	void texture_cache::copy_transfer_regions_impl(gl::command_context& cmd, gl::texture* dst_image, const std::vector& sources) const
+	{
+		const auto dst_bpp = dst_image->pitch() / dst_image->width();
+		const auto dst_aspect = dst_image->aspect();
+
+		for (const auto &slice : sources)
+		{
+			if (!slice.src)
+				continue;
+
+			const bool typeless = dst_aspect != slice.src->aspect() ||
+				!formats_are_bitcast_compatible(static_cast(slice.src->get_internal_format()), static_cast(dst_image->get_internal_format()));
+
+			std::unique_ptr tmp;
+			auto src_image = slice.src;
+			auto src_x = slice.src_x;
+			auto src_y = slice.src_y;
+			auto src_w = slice.src_w;
+			auto src_h = slice.src_h;
+
+			if (slice.xform == rsx::surface_transform::coordinate_transform)
+			{
+				// Dimensions were given in 'dst' space. 
Work out the real source coordinates + const auto src_bpp = slice.src->pitch() / slice.src->width(); + src_x = (src_x * dst_bpp) / src_bpp; + src_w = utils::aligned_div(src_w * dst_bpp, src_bpp); + } + + if (auto surface = dynamic_cast(slice.src)) + { + surface->transform_samples_to_pixels(src_x, src_w, src_y, src_h); + } + + if (typeless) [[unlikely]] + { + const auto src_bpp = slice.src->pitch() / slice.src->width(); + const u16 convert_w = u16(slice.src->width() * src_bpp) / dst_bpp; + tmp = std::make_unique(GL_TEXTURE_2D, convert_w, slice.src->height(), 1, 1, static_cast(dst_image->get_internal_format())); + + src_image = tmp.get(); + + // Compute src region in dst format layout + const u16 src_w2 = u16(src_w * src_bpp) / dst_bpp; + const u16 src_x2 = u16(src_x * src_bpp) / dst_bpp; + + if (src_w2 == slice.dst_w && src_h == slice.dst_h && slice.level == 0) + { + // Optimization, avoid typeless copy to tmp followed by data copy to dst + // Combine the two transfers into one + const coord3u src_region = { { src_x, src_y, 0 }, { src_w, src_h, 1 } }; + const coord3u dst_region = { { slice.dst_x, slice.dst_y, slice.dst_z }, { slice.dst_w, slice.dst_h, 1 } }; + gl::copy_typeless(dst_image, slice.src, dst_region, src_region); + + continue; + } + + const coord3u src_region = { { src_x, src_y, 0 }, { src_w, src_h, 1 } }; + const coord3u dst_region = { { src_x2, src_y, 0 }, { src_w2, src_h, 1 } }; + gl::copy_typeless(src_image, slice.src, dst_region, src_region); + + src_x = src_x2; + src_w = src_w2; + } + + if (src_w == slice.dst_w && src_h == slice.dst_h) + { + glCopyImageSubData(src_image->id(), GL_TEXTURE_2D, 0, src_x, src_y, 0, + dst_image->id(), static_cast(dst_image->get_target()), slice.level, slice.dst_x, slice.dst_y, slice.dst_z, src_w, src_h, 1); + } + else + { + ensure(dst_image->get_target() == gl::texture::target::texture2D); + + auto _blitter = gl::g_hw_blitter; + const areai src_rect = { src_x, src_y, src_x + src_w, src_y + src_h }; + const areai dst_rect = { slice.dst_x, slice.dst_y, slice.dst_x + slice.dst_w, slice.dst_y + slice.dst_h }; + + gl::texture* _dst; + if (src_image->get_internal_format() == dst_image->get_internal_format() && slice.level == 0) + { + _dst = dst_image; + } + else + { + tmp = std::make_unique(GL_TEXTURE_2D, dst_rect.x2, dst_rect.y2, 1, 1, static_cast(slice.src->get_internal_format())); + _dst = tmp.get(); + } + + _blitter->scale_image(cmd, src_image, _dst, + src_rect, dst_rect, false, {}); + + if (_dst != dst_image) + { + // Data cast comes after scaling + glCopyImageSubData(tmp->id(), GL_TEXTURE_2D, 0, slice.dst_x, slice.dst_y, 0, + dst_image->id(), static_cast(dst_image->get_target()), slice.level, slice.dst_x, slice.dst_y, slice.dst_z, slice.dst_w, slice.dst_h, 1); + } + } + } + } +} diff --git a/rpcs3/Emu/RSX/GL/GLTextureCache.h b/rpcs3/Emu/RSX/GL/GLTextureCache.h index 8527b826ef..a0174b22f4 100644 --- a/rpcs3/Emu/RSX/GL/GLTextureCache.h +++ b/rpcs3/Emu/RSX/GL/GLTextureCache.h @@ -62,7 +62,7 @@ namespace gl void init_buffer(const gl::texture* src) { const u32 vram_size = src->pitch() * src->height(); - const u32 buffer_size = align(vram_size, 4096); + const u32 buffer_size = utils::align(vram_size, 4096); if (pbo) { @@ -333,86 +333,7 @@ namespace gl return glMapBufferRange(GL_PIXEL_PACK_BUFFER, offset, size, GL_MAP_READ_BIT); } - void finish_flush() - { - // Free resources - glUnmapBuffer(GL_PIXEL_PACK_BUFFER); - glBindBuffer(GL_PIXEL_PACK_BUFFER, GL_NONE); - - const auto valid_range = get_confirmed_range_delta(); - const u32 valid_offset = 
valid_range.first; - const u32 valid_length = valid_range.second; - void *dst = get_ptr(get_section_base() + valid_offset); - - if (!gl::get_driver_caps().ARB_compute_shader_supported) - { - switch (type) - { - case gl::texture::type::sbyte: - case gl::texture::type::ubyte: - { - // byte swapping does not work on byte types, use uint_8_8_8_8 for rgba8 instead to avoid penalty - ensure(!pack_unpack_swap_bytes); - break; - } - case gl::texture::type::uint_24_8: - { - // Swap bytes on D24S8 does not swap the whole dword, just shuffles the 3 bytes for D24 - // In this regard, D24S8 is the same structure on both PC and PS3, but the endianness of the whole block is reversed on PS3 - ensure(pack_unpack_swap_bytes == false); - ensure(real_pitch == (width * 4)); - if (rsx_pitch == real_pitch) [[likely]] - { - stream_data_to_memory_swapped_u32(dst, dst, valid_length / 4, 4); - } - else - { - const u32 num_rows = align(valid_length, rsx_pitch) / rsx_pitch; - u8* data = static_cast(dst); - for (u32 row = 0; row < num_rows; ++row) - { - stream_data_to_memory_swapped_u32(data, data, width, 4); - data += rsx_pitch; - } - } - break; - } - default: - break; - } - } - - if (is_swizzled()) - { - // This format is completely worthless to CPU processing algorithms where cache lines on die are linear. - // If this is happening, usually it means it was not a planned readback (e.g shared pages situation) - rsx_log.warning("[Performance warning] CPU readback of swizzled data"); - - // Read-modify-write to avoid corrupting already resident memory outside texture region - std::vector tmp_data(rsx_pitch * height); - std::memcpy(tmp_data.data(), dst, tmp_data.size()); - - switch (type) - { - case gl::texture::type::uint_8_8_8_8: - case gl::texture::type::uint_24_8: - rsx::convert_linear_swizzle(tmp_data.data(), dst, width, height, rsx_pitch); - break; - case gl::texture::type::ushort_5_6_5: - case gl::texture::type::ushort: - rsx::convert_linear_swizzle(tmp_data.data(), dst, width, height, rsx_pitch); - break; - default: - rsx_log.error("Unexpected swizzled texture format 0x%x", static_cast(format)); - } - } - - if (context == rsx::texture_upload_context::framebuffer_storage) - { - // Update memory tag - static_cast(vram_texture)->sync_tag(); - } - } + void finish_flush(); /** * Misc @@ -637,106 +558,7 @@ namespace gl } } - void copy_transfer_regions_impl(gl::command_context& cmd, gl::texture* dst_image, const std::vector& sources) const - { - const auto dst_bpp = dst_image->pitch() / dst_image->width(); - const auto dst_aspect = dst_image->aspect(); - - for (const auto &slice : sources) - { - if (!slice.src) - continue; - - const bool typeless = dst_aspect != slice.src->aspect() || - !formats_are_bitcast_compatible(static_cast(slice.src->get_internal_format()), static_cast(dst_image->get_internal_format())); - - std::unique_ptr tmp; - auto src_image = slice.src; - auto src_x = slice.src_x; - auto src_y = slice.src_y; - auto src_w = slice.src_w; - auto src_h = slice.src_h; - - if (slice.xform == rsx::surface_transform::coordinate_transform) - { - // Dimensions were given in 'dst' space. 
Work out the real source coordinates - const auto src_bpp = slice.src->pitch() / slice.src->width(); - src_x = (src_x * dst_bpp) / src_bpp; - src_w = ::aligned_div(src_w * dst_bpp, src_bpp); - } - - if (auto surface = dynamic_cast(slice.src)) - { - surface->transform_samples_to_pixels(src_x, src_w, src_y, src_h); - } - - if (typeless) [[unlikely]] - { - const auto src_bpp = slice.src->pitch() / slice.src->width(); - const u16 convert_w = u16(slice.src->width() * src_bpp) / dst_bpp; - tmp = std::make_unique(GL_TEXTURE_2D, convert_w, slice.src->height(), 1, 1, static_cast(dst_image->get_internal_format())); - - src_image = tmp.get(); - - // Compute src region in dst format layout - const u16 src_w2 = u16(src_w * src_bpp) / dst_bpp; - const u16 src_x2 = u16(src_x * src_bpp) / dst_bpp; - - if (src_w2 == slice.dst_w && src_h == slice.dst_h && slice.level == 0) - { - // Optimization, avoid typeless copy to tmp followed by data copy to dst - // Combine the two transfers into one - const coord3u src_region = { { src_x, src_y, 0 }, { src_w, src_h, 1 } }; - const coord3u dst_region = { { slice.dst_x, slice.dst_y, slice.dst_z }, { slice.dst_w, slice.dst_h, 1 } }; - gl::copy_typeless(dst_image, slice.src, dst_region, src_region); - - continue; - } - - const coord3u src_region = { { src_x, src_y, 0 }, { src_w, src_h, 1 } }; - const coord3u dst_region = { { src_x2, src_y, 0 }, { src_w2, src_h, 1 } }; - gl::copy_typeless(src_image, slice.src, dst_region, src_region); - - src_x = src_x2; - src_w = src_w2; - } - - if (src_w == slice.dst_w && src_h == slice.dst_h) - { - glCopyImageSubData(src_image->id(), GL_TEXTURE_2D, 0, src_x, src_y, 0, - dst_image->id(), static_cast(dst_image->get_target()), slice.level, slice.dst_x, slice.dst_y, slice.dst_z, src_w, src_h, 1); - } - else - { - ensure(dst_image->get_target() == gl::texture::target::texture2D); - - auto _blitter = gl::g_hw_blitter; - const areai src_rect = { src_x, src_y, src_x + src_w, src_y + src_h }; - const areai dst_rect = { slice.dst_x, slice.dst_y, slice.dst_x + slice.dst_w, slice.dst_y + slice.dst_h }; - - gl::texture* _dst; - if (src_image->get_internal_format() == dst_image->get_internal_format() && slice.level == 0) - { - _dst = dst_image; - } - else - { - tmp = std::make_unique(GL_TEXTURE_2D, dst_rect.x2, dst_rect.y2, 1, 1, static_cast(slice.src->get_internal_format())); - _dst = tmp.get(); - } - - _blitter->scale_image(cmd, src_image, _dst, - src_rect, dst_rect, false, {}); - - if (_dst != dst_image) - { - // Data cast comes after scaling - glCopyImageSubData(tmp->id(), GL_TEXTURE_2D, 0, slice.dst_x, slice.dst_y, 0, - dst_image->id(), static_cast(dst_image->get_target()), slice.level, slice.dst_x, slice.dst_y, slice.dst_z, slice.dst_w, slice.dst_h, 1); - } - } - } - } + void copy_transfer_regions_impl(gl::command_context& cmd, gl::texture* dst_image, const std::vector& sources) const; gl::texture* get_template_from_collection_impl(const std::vector& sections_to_transfer) const { diff --git a/rpcs3/Emu/RSX/RSXThread.cpp b/rpcs3/Emu/RSX/RSXThread.cpp index c27abf1f7e..d1067f406f 100644 --- a/rpcs3/Emu/RSX/RSXThread.cpp +++ b/rpcs3/Emu/RSX/RSXThread.cpp @@ -139,6 +139,52 @@ namespace rsx fmt::throw_exception("rsx::get_address(offset=0x%x, location=0x%x): %s%s", offset, location, msg, src_loc{line, col, file, func}); } + std::pair interleaved_range_info::calculate_required_range(u32 first, u32 count) const + { + if (single_vertex) + { + return { 0, 1 }; + } + + const u32 max_index = (first + count) - 1; + u32 _max_index = 0; + u32 _min_index = 
first; + + for (const auto &attrib : locations) + { + if (attrib.frequency <= 1) [[likely]] + { + _max_index = max_index; + } + else + { + if (attrib.modulo) + { + if (max_index >= attrib.frequency) + { + // Actually uses the modulo operator + _min_index = 0; + _max_index = attrib.frequency - 1; + } + else + { + // Same as having no modulo + _max_index = max_index; + } + } + else + { + // Division operator + _min_index = std::min(_min_index, first / attrib.frequency); + _max_index = std::max(_max_index, utils::aligned_div(max_index, attrib.frequency)); + } + } + } + + ensure(_max_index >= _min_index); + return { _min_index, (_max_index - _min_index) + 1 }; + } + u32 get_vertex_type_size_on_host(vertex_base_type type, u32 size) { switch (type) @@ -2521,7 +2567,7 @@ namespace rsx } // Some cases do not need full delay - remaining = ::aligned_div(remaining, div); + remaining = utils::aligned_div(remaining, div); const u64 until = get_system_time() + remaining; while (true) diff --git a/rpcs3/Emu/RSX/RSXThread.h b/rpcs3/Emu/RSX/RSXThread.h index 5b55e1e55f..9477337fba 100644 --- a/rpcs3/Emu/RSX/RSXThread.h +++ b/rpcs3/Emu/RSX/RSXThread.h @@ -246,51 +246,7 @@ namespace rsx rsx::simple_array locations; // Check if we need to upload a full unoptimized range, i.e [0-max_index] - std::pair calculate_required_range(u32 first, u32 count) const - { - if (single_vertex) - { - return { 0, 1 }; - } - - const u32 max_index = (first + count) - 1; - u32 _max_index = 0; - u32 _min_index = first; - - for (const auto &attrib : locations) - { - if (attrib.frequency <= 1) [[likely]] - { - _max_index = max_index; - } - else - { - if (attrib.modulo) - { - if (max_index >= attrib.frequency) - { - // Actually uses the modulo operator - _min_index = 0; - _max_index = attrib.frequency - 1; - } - else - { - // Same as having no modulo - _max_index = max_index; - } - } - else - { - // Division operator - _min_index = std::min(_min_index, first / attrib.frequency); - _max_index = std::max(_max_index, aligned_div(max_index, attrib.frequency)); - } - } - } - - ensure(_max_index >= _min_index); - return { _min_index, (_max_index - _min_index) + 1 }; - } + std::pair calculate_required_range(u32 first, u32 count) const; }; enum attribute_buffer_placement : u8 diff --git a/rpcs3/Emu/RSX/VK/VKCompute.h b/rpcs3/Emu/RSX/VK/VKCompute.h index f0ea98efc9..b09a91f3f0 100644 --- a/rpcs3/Emu/RSX/VK/VKCompute.h +++ b/rpcs3/Emu/RSX/VK/VKCompute.h @@ -5,6 +5,8 @@ #include "Utilities/StrUtil.h" #include "Emu/IdManager.h" +#include "util/asm.hpp" + #define VK_MAX_COMPUTE_TASKS 4096 // Max number of jobs per frame namespace vk @@ -296,7 +298,7 @@ namespace vk "%vars" "\n"; - const auto parameters_size = align(push_constants_size, 16) / 16; + const auto parameters_size = utils::align(push_constants_size, 16) / 16; const std::pair syntax_replace[] = { { "%ws", std::to_string(optimal_group_size) }, @@ -943,7 +945,7 @@ namespace vk set_parameters(cmd); const u32 num_bytes_per_invocation = (sizeof(_BlockType) * optimal_group_size); - const u32 linear_invocations = aligned_div(data_length, num_bytes_per_invocation); + const u32 linear_invocations = utils::aligned_div(data_length, num_bytes_per_invocation); compute_task::run(cmd, linear_invocations); } }; @@ -997,7 +999,7 @@ namespace vk word_count = num_words; block_length = num_words * 4; - const u32 linear_invocations = aligned_div(word_count, optimal_group_size); + const u32 linear_invocations = utils::aligned_div(word_count, optimal_group_size); compute_task::run(cmd, linear_invocations); } }; 
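
Aside: the util/asm.hpp hunk that introduces these helpers is not part of this excerpt, but every call site follows one of three patterns: utils::align(value, alignment) rounds up to a multiple, utils::aligned_div(value, divisor) is ceiling division (as in the dispatch-count computations of the VKCompute.h hunk just above), and utils::rounded_div rounds to nearest (the trophy percentage in sceNpTrophy.cpp). A minimal sketch of what the moved helpers plausibly look like, assuming power-of-two alignment for align() as with the old ::align being replaced; signatures are inferred from the call sites, not copied from the real util/asm.hpp:

    // Hypothetical reconstruction of the moved helpers; the real patch may differ.
    #include <cstdint>

    namespace utils
    {
        // Round value up to the next multiple of alignment (power of 2 assumed).
        template <typename T>
        constexpr T align(T value, std::uint64_t alignment)
        {
            return static_cast<T>((value + (alignment - 1)) & ~(alignment - 1));
        }

        // Ceiling division: aligned_div(1000, 256) == 4.
        template <typename T>
        constexpr T aligned_div(T value, std::uint64_t divisor)
        {
            return static_cast<T>((value + divisor - 1) / divisor);
        }

        // Division rounded to nearest: rounded_div(199, 2) == 100.
        template <typename T>
        constexpr T rounded_div(T value, std::uint64_t divisor)
        {
            return static_cast<T>((value + divisor / 2) / divisor);
        }
    }

For instance, the hunk above computes linear_invocations = utils::aligned_div(data_length, num_bytes_per_invocation): a 1000-byte job at 256 bytes per invocation dispatches 4 workgroups, the last one partially idle.
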
diff --git a/rpcs3/Emu/RSX/VK/VKDMA.cpp b/rpcs3/Emu/RSX/VK/VKDMA.cpp index 90d1440f22..81437cef28 100644 --- a/rpcs3/Emu/RSX/VK/VKDMA.cpp +++ b/rpcs3/Emu/RSX/VK/VKDMA.cpp @@ -3,6 +3,8 @@ #include "VKResourceManager.h" #include "VKDMA.h" +#include "util/asm.hpp" + namespace vk { static constexpr usz s_dma_block_length = 0x01000000; @@ -85,7 +87,7 @@ namespace vk { if (!inheritance_info.parent) { - const u32 start = align(range.start, s_page_size); + const u32 start = utils::align(range.start, s_page_size); const u32 end = ((range.end + 1) & s_page_align); for (u32 page = start; page < end; page += s_page_size) @@ -259,7 +261,7 @@ namespace vk } dma_block* block_head = nullptr; - auto block_end = align(limit, s_dma_block_length); + auto block_end = utils::align(limit, s_dma_block_length); // Reverse scan to try and find the minimum required length in case of other chaining for (auto block = last_block; block != first_block; block -= s_dma_block_length) diff --git a/rpcs3/Emu/RSX/VK/VKHelpers.cpp b/rpcs3/Emu/RSX/VK/VKHelpers.cpp index c7022a1019..a3aaad8907 100644 --- a/rpcs3/Emu/RSX/VK/VKHelpers.cpp +++ b/rpcs3/Emu/RSX/VK/VKHelpers.cpp @@ -132,7 +132,7 @@ namespace vk { // Create new heap. All sizes are aligned up by 64M, upto 1GiB const usz size_limit = 1024 * 0x100000; - const usz aligned_new_size = align(m_size + size, 64 * 0x100000); + const usz aligned_new_size = utils::align(m_size + size, 64 * 0x100000); if (aligned_new_size >= size_limit) { @@ -351,8 +351,8 @@ namespace vk { auto create_texture = [&]() { - u32 new_width = align(requested_width, 1024u); - u32 new_height = align(requested_height, 1024u); + u32 new_width = utils::align(requested_width, 1024u); + u32 new_height = utils::align(requested_height, 1024u); return new vk::image(*g_current_renderer, g_current_renderer->get_memory_mapping().device_local, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_IMAGE_TYPE_2D, format, new_width, new_height, 1, 1, 1, VK_SAMPLE_COUNT_1_BIT, VK_IMAGE_LAYOUT_UNDEFINED, @@ -388,7 +388,7 @@ namespace vk if (!g_scratch_buffer) { // Choose optimal size - const u64 alloc_size = std::max(64 * 0x100000, align(min_required_size, 0x100000)); + const u64 alloc_size = std::max(64 * 0x100000, utils::align(min_required_size, 0x100000)); g_scratch_buffer = std::make_unique(*g_current_renderer, alloc_size, g_current_renderer->get_memory_mapping().device_local, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, diff --git a/rpcs3/Emu/RSX/VK/VKPresent.cpp b/rpcs3/Emu/RSX/VK/VKPresent.cpp index c440f85c25..8d20697105 100644 --- a/rpcs3/Emu/RSX/VK/VKPresent.cpp +++ b/rpcs3/Emu/RSX/VK/VKPresent.cpp @@ -2,6 +2,8 @@ #include "VKGSRender.h" #include "Emu/Cell/Modules/cellVideoOut.h" +#include "util/asm.hpp" + void VKGSRender::reinitialize_swapchain() { m_swapchain_dims.width = m_frame->client_width(); @@ -651,7 +653,7 @@ void VKGSRender::flip(const rsx::display_flip_info_t& info) const usz sshot_size = buffer_height * buffer_width * 4; - vk::buffer sshot_vkbuf(*m_device, align(sshot_size, 0x100000), m_device->get_memory_mapping().host_visible_coherent, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, + vk::buffer sshot_vkbuf(*m_device, utils::align(sshot_size, 0x100000), m_device->get_memory_mapping().host_visible_coherent, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, VK_BUFFER_USAGE_TRANSFER_DST_BIT, 0); VkBufferImageCopy copy_info; diff --git a/rpcs3/Emu/RSX/VK/VKResolveHelper.h b/rpcs3/Emu/RSX/VK/VKResolveHelper.h index 9be843692b..952bb26518 100644 --- 
a/rpcs3/Emu/RSX/VK/VKResolveHelper.h +++ b/rpcs3/Emu/RSX/VK/VKResolveHelper.h @@ -131,8 +131,8 @@ namespace vk multisampled = msaa_image; resolve = resolve_image; - const u32 invocations_x = align(resolve_image->width(), cs_wave_x) / cs_wave_x; - const u32 invocations_y = align(resolve_image->height(), cs_wave_y) / cs_wave_y; + const u32 invocations_x = utils::align(resolve_image->width(), cs_wave_x) / cs_wave_x; + const u32 invocations_y = utils::align(resolve_image->height(), cs_wave_y) / cs_wave_y; compute_task::run(cmd, invocations_x, invocations_y, 1); } diff --git a/rpcs3/Emu/RSX/VK/VKTexture.cpp b/rpcs3/Emu/RSX/VK/VKTexture.cpp index 9d095b6a2c..2038d75b7d 100644 --- a/rpcs3/Emu/RSX/VK/VKTexture.cpp +++ b/rpcs3/Emu/RSX/VK/VKTexture.cpp @@ -7,6 +7,8 @@ #include "VKRenderPass.h" #include "VKRenderTargets.h" +#include "util/asm.hpp" + namespace vk { VkComponentMapping default_component_map() @@ -89,7 +91,7 @@ namespace vk ensure(dst->size() >= allocation_end); const auto data_offset = u32(region.bufferOffset); - const auto z32_offset = align(data_offset + packed16_length, 256); + const auto z32_offset = utils::align(data_offset + packed16_length, 256); // 1. Copy the depth to buffer VkBufferImageCopy region2; @@ -135,8 +137,8 @@ namespace vk ensure(dst->size() >= allocation_end); const auto data_offset = u32(region.bufferOffset); - const auto z_offset = align(data_offset + packed_length, 256); - const auto s_offset = align(z_offset + in_depth_size, 256); + const auto z_offset = utils::align(data_offset + packed_length, 256); + const auto s_offset = utils::align(z_offset + in_depth_size, 256); // 1. Copy the depth and stencil blocks to separate banks VkBufferImageCopy sub_regions[2]; @@ -225,7 +227,7 @@ namespace vk ensure(src->size() >= allocation_end); const auto data_offset = u32(region.bufferOffset); - const auto z32_offset = align(data_offset + packed16_length, 256); + const auto z32_offset = utils::align(data_offset + packed16_length, 256); // 1. Pre-compute barrier vk::insert_buffer_memory_barrier(cmd, src->value, z32_offset, packed32_length, @@ -260,8 +262,8 @@ namespace vk ensure(src->size() >= allocation_end); // "Out of memory (compute heap). Lower your resolution scale setting." 
const auto data_offset = u32(region.bufferOffset); - const auto z_offset = align(data_offset + packed_length, 256); - const auto s_offset = align(z_offset + in_depth_size, 256); + const auto z_offset = utils::align(data_offset + packed_length, 256); + const auto s_offset = utils::align(z_offset + in_depth_size, 256); // Zero out the stencil block vkCmdFillBuffer(cmd, src->value, s_offset, in_stencil_size, 0); @@ -821,7 +823,7 @@ namespace vk const auto src_offset = section.bufferOffset; // Align output to 128-byte boundary to keep some drivers happy - dst_offset = align(dst_offset, 128); + dst_offset = utils::align(dst_offset, 128); u32 data_length = 0; for (unsigned i = 0, j = packet.first; i < packet.second; ++i, ++j) @@ -930,7 +932,7 @@ namespace vk if (layout.level == 0) { // Align mip0 on a 128-byte boundary - scratch_offset = align(scratch_offset, 128); + scratch_offset = utils::align(scratch_offset, 128); } // Copy from upload heap to scratch mem diff --git a/rpcs3/Emu/RSX/VK/VKTextureCache.cpp b/rpcs3/Emu/RSX/VK/VKTextureCache.cpp new file mode 100644 index 0000000000..116e05edc2 --- /dev/null +++ b/rpcs3/Emu/RSX/VK/VKTextureCache.cpp @@ -0,0 +1,360 @@ +#include "stdafx.h" +#include "VKGSRender.h" +#include "VKTextureCache.h" + +#include "util/asm.hpp" + +namespace vk +{ + void cached_texture_section::dma_transfer(vk::command_buffer& cmd, vk::image* src, const areai& src_area, const utils::address_range& valid_range, u32 pitch) + { + ensure(src->samples() == 1); + + if (!m_device) + { + m_device = &cmd.get_command_pool().get_owner(); + } + + if (dma_fence) + { + // NOTE: This can be reached if previously synchronized, or a special path happens. + // If a hard flush occurred while this surface was flush_always the cache would have reset its protection afterwards. + // DMA resource would still be present but already used to flush previously. 
+			vk::get_resource_manager()->dispose(dma_fence);
+		}
+
+		if (vk::is_renderpass_open(cmd))
+		{
+			vk::end_renderpass(cmd);
+		}
+
+		src->push_layout(cmd, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL);
+
+		const auto internal_bpp = vk::get_format_texel_width(src->format());
+		const auto transfer_width = static_cast<u32>(src_area.width());
+		const auto transfer_height = static_cast<u32>(src_area.height());
+		real_pitch = internal_bpp * transfer_width;
+		rsx_pitch = pitch;
+
+		const bool require_format_conversion = !!(src->aspect() & VK_IMAGE_ASPECT_STENCIL_BIT) || src->format() == VK_FORMAT_D32_SFLOAT;
+		if (require_format_conversion || pack_unpack_swap_bytes)
+		{
+			const auto section_length = valid_range.length();
+			const auto transfer_pitch = real_pitch;
+			const auto task_length = transfer_pitch * src_area.height();
+
+			auto working_buffer = vk::get_scratch_buffer(task_length);
+			auto final_mapping = vk::map_dma(cmd, valid_range.start, section_length);
+
+			VkBufferImageCopy region = {};
+			region.imageSubresource = { src->aspect(), 0, 0, 1 };
+			region.imageOffset = { src_area.x1, src_area.y1, 0 };
+			region.imageExtent = { transfer_width, transfer_height, 1 };
+			vk::copy_image_to_buffer(cmd, src, working_buffer, region, (require_format_conversion && pack_unpack_swap_bytes));
+
+			// NOTE: For depth/stencil formats, copying to buffer and byteswap are combined into one step above
+			if (pack_unpack_swap_bytes && !require_format_conversion)
+			{
+				const auto texel_layout = vk::get_format_element_size(src->format());
+				const auto elem_size = texel_layout.first;
+				vk::cs_shuffle_base *shuffle_kernel;
+
+				if (elem_size == 2)
+				{
+					shuffle_kernel = vk::get_compute_task<vk::cs_shuffle_16>();
+				}
+				else if (elem_size == 4)
+				{
+					shuffle_kernel = vk::get_compute_task<vk::cs_shuffle_32>();
+				}
+				else
+				{
+					ensure(get_context() == rsx::texture_upload_context::dma);
+					shuffle_kernel = nullptr;
+				}
+
+				if (shuffle_kernel)
+				{
+					vk::insert_buffer_memory_barrier(cmd, working_buffer->value, 0, task_length,
+						VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
+						VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT);
+
+					shuffle_kernel->run(cmd, working_buffer, task_length);
+
+					vk::insert_buffer_memory_barrier(cmd, working_buffer->value, 0, task_length,
+						VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT,
+						VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_TRANSFER_READ_BIT);
+				}
+			}
+
+			if (rsx_pitch == real_pitch) [[likely]]
+			{
+				VkBufferCopy copy = {};
+				copy.dstOffset = final_mapping.first;
+				copy.size = section_length;
+				vkCmdCopyBuffer(cmd, working_buffer->value, final_mapping.second->value, 1, &copy);
+			}
+			else
+			{
+				if (context != rsx::texture_upload_context::dma)
+				{
+					// Partial load for the bits outside the existing image
+					// NOTE: A true DMA section would have been prepped beforehand
+					// TODO: Partial range load/flush
+					vk::load_dma(valid_range.start, section_length);
+				}
+
+				std::vector<VkBufferCopy> copy;
+				copy.reserve(transfer_height);
+
+				u32 dst_offset = final_mapping.first;
+				u32 src_offset = 0;
+
+				for (unsigned row = 0; row < transfer_height; ++row)
+				{
+					copy.push_back({ src_offset, dst_offset, transfer_pitch });
+					src_offset += real_pitch;
+					dst_offset += rsx_pitch;
+				}
+
+				vkCmdCopyBuffer(cmd, working_buffer->value, final_mapping.second->value, transfer_height, copy.data());
+			}
+		}
+		else
+		{
+			VkBufferImageCopy region = {};
+			region.bufferRowLength = (rsx_pitch / internal_bpp);
+			region.imageSubresource = { src->aspect(), 0, 0, 1 };
+			region.imageOffset = { src_area.x1, src_area.y1, 0 };
+			region.imageExtent = { transfer_width, transfer_height, 1 };
+
+			auto mapping = vk::map_dma(cmd, valid_range.start, valid_range.length());
+			region.bufferOffset = mapping.first;
+			vkCmdCopyImageToBuffer(cmd, src->value, src->current_layout, mapping.second->value, 1, &region);
+		}
+
+		src->pop_layout(cmd);
+
+		// Create event object for this transfer and queue signal op
+		dma_fence = std::make_unique<vk::event>(*m_device);
+		dma_fence->signal(cmd, VK_PIPELINE_STAGE_TRANSFER_BIT);
+
+		// Set cb flag for queued dma operations
+		cmd.set_flag(vk::command_buffer::cb_has_dma_transfer);
+
+		if (get_context() == rsx::texture_upload_context::dma)
+		{
+			// Save readback hint in case transformation is required later
+			switch (internal_bpp)
+			{
+			case 2:
+				gcm_format = CELL_GCM_TEXTURE_R5G6B5;
+				break;
+			case 4:
+			default:
+				gcm_format = CELL_GCM_TEXTURE_A8R8G8B8;
+				break;
+			}
+		}
+
+		synchronized = true;
+		sync_timestamp = get_system_time();
+	}
+
+	void texture_cache::copy_transfer_regions_impl(vk::command_buffer& cmd, vk::image* dst, const std::vector<copy_region_descriptor>& sections_to_transfer) const
+	{
+		const auto dst_aspect = dst->aspect();
+		const auto dst_bpp = vk::get_format_texel_width(dst->format());
+
+		for (const auto &section : sections_to_transfer)
+		{
+			if (!section.src)
+				continue;
+
+			const bool typeless = section.src->aspect() != dst_aspect ||
+				!formats_are_bitcast_compatible(dst, section.src);
+
+			// Avoid inserting unnecessary barrier GENERAL->TRANSFER_SRC->GENERAL in active render targets
+			const auto preferred_layout = (section.src->current_layout != VK_IMAGE_LAYOUT_GENERAL) ?
+				VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL : VK_IMAGE_LAYOUT_GENERAL;
+
+			section.src->push_layout(cmd, preferred_layout);
+
+			auto src_image = section.src;
+			auto src_x = section.src_x;
+			auto src_y = section.src_y;
+			auto src_w = section.src_w;
+			auto src_h = section.src_h;
+
+			rsx::flags32_t transform = section.xform;
+			if (section.xform == rsx::surface_transform::coordinate_transform)
+			{
+				// Dimensions were given in 'dst' space. Work out the real source coordinates
+				const auto src_bpp = vk::get_format_texel_width(section.src->format());
+				src_x = (src_x * dst_bpp) / src_bpp;
+				src_w = utils::aligned_div(src_w * dst_bpp, src_bpp);
+
+				transform &= ~(rsx::surface_transform::coordinate_transform);
+			}
+
+			if (auto surface = dynamic_cast<vk::render_target*>(section.src))
+			{
+				surface->transform_samples_to_pixels(src_x, src_w, src_y, src_h);
+			}
+
+			if (typeless) [[unlikely]]
+			{
+				const auto src_bpp = vk::get_format_texel_width(section.src->format());
+				const u16 convert_w = u16(src_w * src_bpp) / dst_bpp;
+				const u16 convert_x = u16(src_x * src_bpp) / dst_bpp;
+
+				if (convert_w == section.dst_w && src_h == section.dst_h &&
+					transform == rsx::surface_transform::identity &&
+					section.level == 0 && section.dst_z == 0)
+				{
+					// Optimization to avoid double transfer
+					// TODO: Handle level and layer offsets
+					const areai src_rect = coordi{{ src_x, src_y }, { src_w, src_h }};
+					const areai dst_rect = coordi{{ section.dst_x, section.dst_y }, { section.dst_w, section.dst_h }};
+					vk::copy_image_typeless(cmd, section.src, dst, src_rect, dst_rect, 1);
+
+					section.src->pop_layout(cmd);
+					continue;
+				}
+
+				src_image = vk::get_typeless_helper(dst->format(), dst->format_class(), convert_x + convert_w, src_y + src_h);
+				src_image->change_layout(cmd, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL);
+
+				const areai src_rect = coordi{{ src_x, src_y }, { src_w, src_h }};
+				const areai dst_rect = coordi{{ convert_x, src_y }, { convert_w, src_h }};
+				vk::copy_image_typeless(cmd, section.src, src_image, src_rect, dst_rect, 1);
+				src_image->change_layout(cmd, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL);
+
+				src_x = convert_x;
+				src_w = convert_w;
+			}
+
+			ensure(src_image->current_layout == VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL || src_image->current_layout == VK_IMAGE_LAYOUT_GENERAL);
+
+			// Final aspect mask of the 'final' transfer source
+			const auto new_src_aspect = src_image->aspect();
+
+			if (src_w == section.dst_w && src_h == section.dst_h && transform == rsx::surface_transform::identity) [[likely]]
+			{
+				VkImageCopy copy_rgn;
+				copy_rgn.srcOffset = { src_x, src_y, 0 };
+				copy_rgn.dstOffset = { section.dst_x, section.dst_y, 0 };
+				copy_rgn.dstSubresource = { dst_aspect, 0, 0, 1 };
+				copy_rgn.srcSubresource = { new_src_aspect, 0, 0, 1 };
+				copy_rgn.extent = { src_w, src_h, 1 };
+
+				if (dst->info.imageType == VK_IMAGE_TYPE_3D)
+				{
+					copy_rgn.dstOffset.z = section.dst_z;
+				}
+				else
+				{
+					copy_rgn.dstSubresource.baseArrayLayer = section.dst_z;
+					copy_rgn.dstSubresource.mipLevel = section.level;
+				}
+
+				vkCmdCopyImage(cmd, src_image->value, src_image->current_layout, dst->value, dst->current_layout, 1, &copy_rgn);
+			}
+			else
+			{
+				ensure(section.dst_z == 0);
+
+				u16 dst_x = section.dst_x, dst_y = section.dst_y;
+				vk::image* _dst;
+
+				if (src_image->info.format == dst->info.format && section.level == 0) [[likely]]
+				{
+					_dst = dst;
+				}
+				else
+				{
+					// Either a bitcast is required or a scale+copy to mipmap level
+					_dst = vk::get_typeless_helper(src_image->format(), src_image->format_class(), dst->width(), dst->height() * 2);
+					_dst->change_layout(cmd, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL);
+				}
+
+				if (transform == rsx::surface_transform::identity)
+				{
+					vk::copy_scaled_image(cmd, src_image, _dst,
+						coordi{ { src_x, src_y }, { src_w, src_h } },
+						coordi{ { section.dst_x, section.dst_y }, { section.dst_w, section.dst_h } },
+						1, src_image->format() == _dst->format(),
+						VK_FILTER_NEAREST);
+				}
+				else if (transform == rsx::surface_transform::argb_to_bgra)
+				{
+					VkBufferImageCopy copy{};
+					copy.imageExtent = { src_w, src_h, 1 };
+					copy.imageOffset = { src_x, src_y, 0 };
+					copy.imageSubresource = { src_image->aspect(), 0, 0, 1 };
+
+					const auto mem_length = src_w * src_h * dst_bpp;
+					auto scratch_buf = vk::get_scratch_buffer(mem_length);
+					vkCmdCopyImageToBuffer(cmd, src_image->value, src_image->current_layout, scratch_buf->value, 1, &copy);
+
+					vk::insert_buffer_memory_barrier(cmd, scratch_buf->value, 0, mem_length, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
+						VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT);
+
+					auto shuffle_kernel = vk::get_compute_task<vk::cs_shuffle_32>();
+					shuffle_kernel->run(cmd, scratch_buf, mem_length);
+
+					vk::insert_buffer_memory_barrier(cmd, scratch_buf->value, 0, mem_length, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT,
+						VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_TRANSFER_READ_BIT);
+
+					auto tmp = vk::get_typeless_helper(src_image->format(), src_image->format_class(), section.dst_x + section.dst_w, section.dst_y + section.dst_h);
+					tmp->change_layout(cmd, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL);
+
+					copy.imageOffset = { 0, 0, 0 };
+					vkCmdCopyBufferToImage(cmd, scratch_buf->value, tmp->value, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 1, &copy);
+
+					dst_x = 0;
+					dst_y = 0;
+
+					if (src_w != section.dst_w || src_h != section.dst_h)
+					{
+						// Optionally scale if needed
+						if (tmp == _dst) [[unlikely]]
+						{
+							dst_y = src_h;
+						}
+
+						vk::copy_scaled_image(cmd, tmp, _dst,
+							areai{ 0, 0, src_w, static_cast<s32>(src_h) },
+							coordi{ { dst_x, dst_y }, { section.dst_w, section.dst_h } },
+							1, tmp->info.format == _dst->info.format,
+							VK_FILTER_NEAREST);
+					}
+					else
+					{
+						_dst = tmp;
+					}
+				}
+				else
+				{
+					fmt::throw_exception("Unreachable");
+				}
+
+				if (_dst != dst) [[unlikely]]
+				{
+					// Casting comes after the scaling!
+					VkImageCopy copy_rgn;
+					copy_rgn.srcOffset = { s32(dst_x), s32(dst_y), 0 };
+					copy_rgn.dstOffset = { section.dst_x, section.dst_y, 0 };
+					copy_rgn.dstSubresource = { dst_aspect, section.level, 0, 1 };
+					copy_rgn.srcSubresource = { _dst->aspect(), 0, 0, 1 };
+					copy_rgn.extent = { section.dst_w, section.dst_h, 1 };
+
+					_dst->change_layout(cmd, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL);
+					vkCmdCopyImage(cmd, _dst->value, _dst->current_layout, dst->value, dst->current_layout, 1, &copy_rgn);
+				}
+			}
+
+			section.src->pop_layout(cmd);
+		}
+	}
+}
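copy_transfer_regions_impl above rescales source coordinates between texel widths and uses utils::aligned_div so that converted widths round up rather than truncate. A standalone sketch of that conversion (hypothetical sizes; aligned_div restates the helper added later in this patch):

    #include <cstdint>

    // Same formula as utils::aligned_div: ceiling division
    constexpr std::uint32_t aligned_div(std::uint32_t value, std::uint32_t align)
    {
        return (value + align - 1) / align;
    }

    // A width of 13 four-byte (dst) texels expressed in two-byte (src) texels:
    static_assert(aligned_div(13u * 4u, 2u) == 26);

    // Rounding up matters when the byte size does not divide evenly:
    // 13 two-byte texels = 26 bytes, which span 7 four-byte texels, not 6.
    static_assert(aligned_div(13u * 2u, 4u) == 7);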
diff --git a/rpcs3/Emu/RSX/VK/VKTextureCache.h b/rpcs3/Emu/RSX/VK/VKTextureCache.h
index 31471673f6..fdeaa08b89 100644
--- a/rpcs3/Emu/RSX/VK/VKTextureCache.h
+++ b/rpcs3/Emu/RSX/VK/VKTextureCache.h
@@ -167,160 +167,7 @@ namespace vk
 			return flushed;
 		}
 
-		void dma_transfer(vk::command_buffer& cmd, vk::image* src, const areai& src_area, const utils::address_range& valid_range, u32 pitch)
-		{
-			ensure(src->samples() == 1);
-
-			if (!m_device)
-			{
-				m_device = &cmd.get_command_pool().get_owner();
-			}
-
-			if (dma_fence)
-			{
-				// NOTE: This can be reached if previously synchronized, or a special path happens.
-				// If a hard flush occurred while this surface was flush_always the cache would have reset its protection afterwards.
-				// DMA resource would still be present but already used to flush previously.
-				vk::get_resource_manager()->dispose(dma_fence);
-			}
-
-			if (vk::is_renderpass_open(cmd))
-			{
-				vk::end_renderpass(cmd);
-			}
-
-			src->push_layout(cmd, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL);
-
-			const auto internal_bpp = vk::get_format_texel_width(src->format());
-			const auto transfer_width = static_cast<u32>(src_area.width());
-			const auto transfer_height = static_cast<u32>(src_area.height());
-			real_pitch = internal_bpp * transfer_width;
-			rsx_pitch = pitch;
-
-			const bool require_format_conversion = !!(src->aspect() & VK_IMAGE_ASPECT_STENCIL_BIT) || src->format() == VK_FORMAT_D32_SFLOAT;
-			if (require_format_conversion || pack_unpack_swap_bytes)
-			{
-				const auto section_length = valid_range.length();
-				const auto transfer_pitch = real_pitch;
-				const auto task_length = transfer_pitch * src_area.height();
-
-				auto working_buffer = vk::get_scratch_buffer(task_length);
-				auto final_mapping = vk::map_dma(cmd, valid_range.start, section_length);
-
-				VkBufferImageCopy region = {};
-				region.imageSubresource = { src->aspect(), 0, 0, 1 };
-				region.imageOffset = { src_area.x1, src_area.y1, 0 };
-				region.imageExtent = { transfer_width, transfer_height, 1 };
-				vk::copy_image_to_buffer(cmd, src, working_buffer, region, (require_format_conversion && pack_unpack_swap_bytes));
-
-				// NOTE: For depth/stencil formats, copying to buffer and byteswap are combined into one step above
-				if (pack_unpack_swap_bytes && !require_format_conversion)
-				{
-					const auto texel_layout = vk::get_format_element_size(src->format());
-					const auto elem_size = texel_layout.first;
-					vk::cs_shuffle_base *shuffle_kernel;
-
-					if (elem_size == 2)
-					{
-						shuffle_kernel = vk::get_compute_task<vk::cs_shuffle_16>();
-					}
-					else if (elem_size == 4)
-					{
-						shuffle_kernel = vk::get_compute_task<vk::cs_shuffle_32>();
-					}
-					else
-					{
-						ensure(get_context() == rsx::texture_upload_context::dma);
-						shuffle_kernel = nullptr;
-					}
-
-					if (shuffle_kernel)
-					{
-						vk::insert_buffer_memory_barrier(cmd, working_buffer->value, 0, task_length,
-							VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
-							VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT);
-
-						shuffle_kernel->run(cmd, working_buffer, task_length);
-
-						vk::insert_buffer_memory_barrier(cmd, working_buffer->value, 0, task_length,
-							VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT,
-							VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_TRANSFER_READ_BIT);
-					}
-				}
-
-				if (rsx_pitch == real_pitch) [[likely]]
-				{
-					VkBufferCopy copy = {};
-					copy.dstOffset = final_mapping.first;
-					copy.size = section_length;
-					vkCmdCopyBuffer(cmd, working_buffer->value, final_mapping.second->value, 1, &copy);
-				}
-				else
-				{
-					if (context != rsx::texture_upload_context::dma)
-					{
-						// Partial load for the bits outside the existing image
-						// NOTE: A true DMA section would have been prepped beforehand
-						// TODO: Parial range load/flush
-						vk::load_dma(valid_range.start, section_length);
-					}
-
-					std::vector<VkBufferCopy> copy;
-					copy.reserve(transfer_height);
-
-					u32 dst_offset = final_mapping.first;
-					u32 src_offset = 0;
-
-					for (unsigned row = 0; row < transfer_height; ++row)
-					{
-						copy.push_back({ src_offset, dst_offset, transfer_pitch });
-						src_offset += real_pitch;
-						dst_offset += rsx_pitch;
-					}
-
-					vkCmdCopyBuffer(cmd, working_buffer->value, final_mapping.second->value, transfer_height, copy.data());
-				}
-			}
-			else
-			{
-				VkBufferImageCopy region = {};
-				region.bufferRowLength = (rsx_pitch / internal_bpp);
-				region.imageSubresource = { src->aspect(), 0, 0, 1 };
-				region.imageOffset = { src_area.x1, src_area.y1, 0 };
-				region.imageExtent = { transfer_width, transfer_height, 1 };
-
-				auto mapping = vk::map_dma(cmd, valid_range.start, valid_range.length());
-				region.bufferOffset = mapping.first;
-				vkCmdCopyImageToBuffer(cmd, src->value, src->current_layout, mapping.second->value, 1, &region);
-			}
-
-			src->pop_layout(cmd);
-
-			// Create event object for this transfer and queue signal op
-			dma_fence = std::make_unique<vk::event>(*m_device);
-			dma_fence->signal(cmd, VK_PIPELINE_STAGE_TRANSFER_BIT);
-
-			// Set cb flag for queued dma operations
-			cmd.set_flag(vk::command_buffer::cb_has_dma_transfer);
-
-			if (get_context() == rsx::texture_upload_context::dma)
-			{
-				// Save readback hint in case transformation is required later
-				switch (internal_bpp)
-				{
-				case 2:
-					gcm_format = CELL_GCM_TEXTURE_R5G6B5;
-					break;
-				case 4:
-				default:
-					gcm_format = CELL_GCM_TEXTURE_A8R8G8B8;
-					break;
-				}
-			}
-
-			synchronized = true;
-			sync_timestamp = get_system_time();
-		}
+		void dma_transfer(vk::command_buffer& cmd, vk::image* src, const areai& src_area, const utils::address_range& valid_range, u32 pitch);
 
 		void copy_texture(vk::command_buffer& cmd, bool miss)
 		{
@@ -610,202 +457,7 @@ namespace vk
 			return mapping;
 		}
 
-		void copy_transfer_regions_impl(vk::command_buffer& cmd, vk::image* dst, const std::vector<copy_region_descriptor>& sections_to_transfer) const
-		{
-			const auto dst_aspect = dst->aspect();
-			const auto dst_bpp = vk::get_format_texel_width(dst->format());
-
-			for (const auto &section : sections_to_transfer)
-			{
-				if (!section.src)
-					continue;
-
-				const bool typeless = section.src->aspect() != dst_aspect ||
-					!formats_are_bitcast_compatible(dst, section.src);
-
-				// Avoid inserting unnecessary barrier GENERAL->TRANSFER_SRC->GENERAL in active render targets
-				const auto preferred_layout = (section.src->current_layout != VK_IMAGE_LAYOUT_GENERAL) ?
-					VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL : VK_IMAGE_LAYOUT_GENERAL;
-
-				section.src->push_layout(cmd, preferred_layout);
-
-				auto src_image = section.src;
-				auto src_x = section.src_x;
-				auto src_y = section.src_y;
-				auto src_w = section.src_w;
-				auto src_h = section.src_h;
-
-				rsx::flags32_t transform = section.xform;
-				if (section.xform == rsx::surface_transform::coordinate_transform)
-				{
-					// Dimensions were given in 'dst' space. Work out the real source coordinates
-					const auto src_bpp = vk::get_format_texel_width(section.src->format());
-					src_x = (src_x * dst_bpp) / src_bpp;
-					src_w = ::aligned_div(src_w * dst_bpp, src_bpp);
-
-					transform &= ~(rsx::surface_transform::coordinate_transform);
-				}
-
-				if (auto surface = dynamic_cast<vk::render_target*>(section.src))
-				{
-					surface->transform_samples_to_pixels(src_x, src_w, src_y, src_h);
-				}
-
-				if (typeless) [[unlikely]]
-				{
-					const auto src_bpp = vk::get_format_texel_width(section.src->format());
-					const u16 convert_w = u16(src_w * src_bpp) / dst_bpp;
-					const u16 convert_x = u16(src_x * src_bpp) / dst_bpp;
-
-					if (convert_w == section.dst_w && src_h == section.dst_h &&
-						transform == rsx::surface_transform::identity &&
-						section.level == 0 && section.dst_z == 0)
-					{
-						// Optimization to avoid double transfer
-						// TODO: Handle level and layer offsets
-						const areai src_rect = coordi{{ src_x, src_y }, { src_w, src_h }};
-						const areai dst_rect = coordi{{ section.dst_x, section.dst_y }, { section.dst_w, section.dst_h }};
-						vk::copy_image_typeless(cmd, section.src, dst, src_rect, dst_rect, 1);
-
-						section.src->pop_layout(cmd);
-						continue;
-					}
-
-					src_image = vk::get_typeless_helper(dst->format(), dst->format_class(), convert_x + convert_w, src_y + src_h);
-					src_image->change_layout(cmd, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL);
-
-					const areai src_rect = coordi{{ src_x, src_y }, { src_w, src_h }};
-					const areai dst_rect = coordi{{ convert_x, src_y }, { convert_w, src_h }};
-					vk::copy_image_typeless(cmd, section.src, src_image, src_rect, dst_rect, 1);
-					src_image->change_layout(cmd, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL);
-
-					src_x = convert_x;
-					src_w = convert_w;
-				}
-
-				ensure(src_image->current_layout == VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL || src_image->current_layout == VK_IMAGE_LAYOUT_GENERAL);
-
-				// Final aspect mask of the 'final' transfer source
-				const auto new_src_aspect = src_image->aspect();
-
-				if (src_w == section.dst_w && src_h == section.dst_h && transform == rsx::surface_transform::identity) [[likely]]
-				{
-					VkImageCopy copy_rgn;
-					copy_rgn.srcOffset = { src_x, src_y, 0 };
-					copy_rgn.dstOffset = { section.dst_x, section.dst_y, 0 };
-					copy_rgn.dstSubresource = { dst_aspect, 0, 0, 1 };
-					copy_rgn.srcSubresource = { new_src_aspect, 0, 0, 1 };
-					copy_rgn.extent = { src_w, src_h, 1 };
-
-					if (dst->info.imageType == VK_IMAGE_TYPE_3D)
-					{
-						copy_rgn.dstOffset.z = section.dst_z;
-					}
-					else
-					{
-						copy_rgn.dstSubresource.baseArrayLayer = section.dst_z;
-						copy_rgn.dstSubresource.mipLevel = section.level;
-					}
-
-					vkCmdCopyImage(cmd, src_image->value, src_image->current_layout, dst->value, dst->current_layout, 1, &copy_rgn);
-				}
-				else
-				{
-					ensure(section.dst_z == 0);
-
-					u16 dst_x = section.dst_x, dst_y = section.dst_y;
-					vk::image* _dst;
-
-					if (src_image->info.format == dst->info.format && section.level == 0) [[likely]]
-					{
-						_dst = dst;
-					}
-					else
-					{
-						// Either a bitcast is required or a scale+copy to mipmap level
-						_dst = vk::get_typeless_helper(src_image->format(), src_image->format_class(), dst->width(), dst->height() * 2);
-						_dst->change_layout(cmd, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL);
-					}
-
-					if (transform == rsx::surface_transform::identity)
-					{
-						vk::copy_scaled_image(cmd, src_image, _dst,
-							coordi{ { src_x, src_y }, { src_w, src_h } },
-							coordi{ { section.dst_x, section.dst_y }, { section.dst_w, section.dst_h } },
-							1, src_image->format() == _dst->format(),
-							VK_FILTER_NEAREST);
-					}
-					else if (transform == rsx::surface_transform::argb_to_bgra)
-					{
-						VkBufferImageCopy copy{};
-						copy.imageExtent = { src_w, src_h, 1 };
-						copy.imageOffset = { src_x, src_y, 0 };
-						copy.imageSubresource = { src_image->aspect(), 0, 0, 1 };
-
-						const auto mem_length = src_w * src_h * dst_bpp;
-						auto scratch_buf = vk::get_scratch_buffer(mem_length);
-						vkCmdCopyImageToBuffer(cmd, src_image->value, src_image->current_layout, scratch_buf->value, 1, &copy);
-
-						vk::insert_buffer_memory_barrier(cmd, scratch_buf->value, 0, mem_length, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
-							VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT);
-
-						auto shuffle_kernel = vk::get_compute_task<vk::cs_shuffle_32>();
-						shuffle_kernel->run(cmd, scratch_buf, mem_length);
-
-						vk::insert_buffer_memory_barrier(cmd, scratch_buf->value, 0, mem_length, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT,
-							VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_TRANSFER_READ_BIT);
-
-						auto tmp = vk::get_typeless_helper(src_image->format(), src_image->format_class(), section.dst_x + section.dst_w, section.dst_y + section.dst_h);
-						tmp->change_layout(cmd, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL);
-
-						copy.imageOffset = { 0, 0, 0 };
-						vkCmdCopyBufferToImage(cmd, scratch_buf->value, tmp->value, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 1, &copy);
-
-						dst_x = 0;
-						dst_y = 0;
-
-						if (src_w != section.dst_w || src_h != section.dst_h)
-						{
-							// Optionally scale if needed
-							if (tmp == _dst) [[unlikely]]
-							{
-								dst_y = src_h;
-							}
-
-							vk::copy_scaled_image(cmd, tmp, _dst,
-								areai{ 0, 0, src_w, static_cast<s32>(src_h) },
-								coordi{ { dst_x, dst_y }, { section.dst_w, section.dst_h } },
-								1, tmp->info.format == _dst->info.format,
-								VK_FILTER_NEAREST);
-						}
-						else
-						{
-							_dst = tmp;
-						}
-					}
-					else
-					{
-						fmt::throw_exception("Unreachable");
-					}
-
-					if (_dst != dst) [[unlikely]]
-					{
-						// Casting comes after the scaling!
-						VkImageCopy copy_rgn;
-						copy_rgn.srcOffset = { s32(dst_x), s32(dst_y), 0 };
-						copy_rgn.dstOffset = { section.dst_x, section.dst_y, 0 };
-						copy_rgn.dstSubresource = { dst_aspect, section.level, 0, 1 };
-						copy_rgn.srcSubresource = { _dst->aspect(), 0, 0, 1 };
-						copy_rgn.extent = { section.dst_w, section.dst_h, 1 };
-
-						_dst->change_layout(cmd, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL);
-						vkCmdCopyImage(cmd, _dst->value, _dst->current_layout, dst->value, dst->current_layout, 1, &copy_rgn);
-					}
-				}
-
-				section.src->pop_layout(cmd);
-			}
-		}
+		void copy_transfer_regions_impl(vk::command_buffer& cmd, vk::image* dst, const std::vector<copy_region_descriptor>& sections_to_transfer) const;
 
 		vk::image* get_template_from_collection_impl(const std::vector<copy_region_descriptor>& sections_to_transfer) const
 		{
diff --git a/rpcs3/GLGSRender.vcxproj b/rpcs3/GLGSRender.vcxproj
index f49cbef01e..e2ae2e13ad 100644
--- a/rpcs3/GLGSRender.vcxproj
+++ b/rpcs3/GLGSRender.vcxproj
@@ -107,6 +107,7 @@
+
diff --git a/rpcs3/GLGSRender.vcxproj.filters b/rpcs3/GLGSRender.vcxproj.filters
index 8d24eb74ba..7ba169eba9 100644
--- a/rpcs3/GLGSRender.vcxproj.filters
+++ b/rpcs3/GLGSRender.vcxproj.filters
@@ -14,6 +14,7 @@
+
diff --git a/rpcs3/Loader/PSF.cpp b/rpcs3/Loader/PSF.cpp
index 60d8cc875f..6d0952aa15 100644
--- a/rpcs3/Loader/PSF.cpp
+++ b/rpcs3/Loader/PSF.cpp
@@ -1,6 +1,8 @@
 #include "stdafx.h"
 #include "PSF.h"
 
+#include "util/asm.hpp"
+
 LOG_CHANNEL(psf_log, "PSF");
 
 template<>
@@ -208,7 +210,7 @@ namespace psf
 	}
 
 	// Align next section (data) offset
-	key_offset = ::align(key_offset, 4);
+	key_offset = utils::align(key_offset, 4);
 
 	// Generate header
 	header_t header;
diff --git a/rpcs3/VKGSRender.vcxproj b/rpcs3/VKGSRender.vcxproj
index 3f3eabafce..be1582e34e 100644
--- a/rpcs3/VKGSRender.vcxproj
+++ b/rpcs3/VKGSRender.vcxproj
@@ -67,6 +67,7 @@
+
diff --git a/rpcs3/VKGSRender.vcxproj.filters b/rpcs3/VKGSRender.vcxproj.filters
index d9ff7e0d59..8ce096bb50 100644
--- a/rpcs3/VKGSRender.vcxproj.filters
+++ b/rpcs3/VKGSRender.vcxproj.filters
@@ -18,6 +18,7 @@
+
diff --git a/rpcs3/rpcs3qt/cheat_manager.cpp b/rpcs3/rpcs3qt/cheat_manager.cpp
index b534f8b347..ea7abd3c92 100644
--- a/rpcs3/rpcs3qt/cheat_manager.cpp
+++ b/rpcs3/rpcs3qt/cheat_manager.cpp
@@ -18,6 +18,7 @@
 #include "Emu/Cell/PPUFunction.h"
 
 #include "util/yaml.hpp"
+#include "util/asm.hpp"
 #include "util/to_endian.hpp"
 #include "Utilities/StrUtil.h"
 #include "Utilities/bin_patch.h" // get_patches_path()
@@ -418,17 +419,17 @@ bool cheat_engine::set_value(const u32 offset, const T value)
 
 		if (exec_code_at_end && exec_code_at_start)
 		{
-			size = align(addr + size, 4) - (addr & -4);
+			size = utils::align(addr + size, 4) - (addr & -4);
 			addr &= -4;
 		}
 		else if (exec_code_at_end)
 		{
-			size -= align(size - 4096 + (addr & 4095), 4);
-			addr = align(addr, 4096);
+			size -= utils::align(size - 4096 + (addr & 4095), 4);
+			addr = utils::align(addr, 4096);
 		}
 		else if (exec_code_at_start)
 		{
-			size = align(4096 - (addr & 4095), 4);
+			size = utils::align(4096 - (addr & 4095), 4);
 			addr &= -4;
 		}
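The cheat_engine hunk above widens a patched range to whole words: masking with -4 rounds the start address down while utils::align rounds the end up. A standalone sketch of that arithmetic (hypothetical address and size; align_up restates the utils::align formula, and addr & ~3u is equivalent to the patch's addr & -4):

    #include <cstdint>

    constexpr std::uint32_t align_up(std::uint32_t value, std::uint32_t align)
    {
        return (value + (align - 1)) & (0 - align);
    }

    constexpr std::uint32_t addr = 0x1003, size = 6; // touches bytes 0x1003..0x1008
    constexpr std::uint32_t new_addr = addr & ~3u;                            // start rounded down
    constexpr std::uint32_t new_size = align_up(addr + size, 4) - (addr & ~3u); // end rounded up

    static_assert(new_addr == 0x1000 && new_size == 12, "range widens to 0x1000..0x100c");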
diff --git a/rpcs3/rpcs3qt/debugger_frame.cpp b/rpcs3/rpcs3qt/debugger_frame.cpp
index 011c0e49fa..7ab9da67ee 100644
--- a/rpcs3/rpcs3qt/debugger_frame.cpp
+++ b/rpcs3/rpcs3qt/debugger_frame.cpp
@@ -27,6 +27,8 @@
 #include
 #include
 
+#include "util/asm.hpp"
+
 constexpr auto qstr = QString::fromStdString;
 
 debugger_frame::debugger_frame(std::shared_ptr<gui_settings> settings, QWidget *parent)
@@ -573,7 +575,7 @@ void debugger_frame::ShowGotoAddressDialog()
 	if (cpu)
 	{
 		// -1 turns into 0
-		u32 pc = ::align(cpu->get_pc(), 4);
+		u32 pc = utils::align(cpu->get_pc(), 4);
 		address_preview_label->setText(QString("Address: 0x%1").arg(pc, 8, 16, QChar('0')));
 		expression_input->setPlaceholderText(QString("0x%1").arg(pc, 8, 16, QChar('0')));
 	}
@@ -605,7 +607,7 @@ void debugger_frame::ShowGotoAddressDialog()
 	if (diag->exec() == QDialog::Accepted)
 	{
 		// -1 turns into 0
-		u32 address = ::align(cpu ? cpu->get_pc() : 0, 4);
+		u32 address = utils::align(cpu ? cpu->get_pc() : 0, 4);
 
 		if (expression_input->text().isEmpty())
 		{
diff --git a/rpcs3/rpcs3qt/memory_viewer_panel.cpp b/rpcs3/rpcs3qt/memory_viewer_panel.cpp
index 7e91f6009c..2e79798db8 100644
--- a/rpcs3/rpcs3qt/memory_viewer_panel.cpp
+++ b/rpcs3/rpcs3qt/memory_viewer_panel.cpp
@@ -15,6 +15,8 @@
 #include
 #include
 
+#include "util/asm.hpp"
+
 constexpr auto qstr = QString::fromStdString;
 
 memory_viewer_panel::memory_viewer_panel(QWidget* parent, u32 addr)
@@ -209,7 +211,7 @@ memory_viewer_panel::memory_viewer_panel(QWidget* parent, u32 addr)
 	{
 		bool ok;
 		const QString text = m_addr_line->text();
-		m_addr = (text.startsWith("0x", Qt::CaseInsensitive) ? text.right(text.size() - 2) : text).toULong(&ok, 16);
+		m_addr = (text.startsWith("0x", Qt::CaseInsensitive) ? text.right(text.size() - 2) : text).toULong(&ok, 16);
 		m_addr -= m_addr % (m_colcount * 4); // Align by amount of bytes in a row
 		m_addr_line->setText(QString("%1").arg(m_addr, 8, 16, QChar('0'))); // get 8 digits in input line
 		ShowMemory();
@@ -293,7 +295,7 @@ void memory_viewer_panel::resizeEvent(QResizeEvent *event)
 std::string memory_viewer_panel::getHeaderAtAddr(u32 addr)
 {
 	// Check if its an SPU Local Storage beginning
-	const u32 spu_boundary = ::align(addr, SPU_LS_SIZE);
+	const u32 spu_boundary = utils::align(addr, SPU_LS_SIZE);
 
 	if (spu_boundary <= addr + m_colcount * 4 - 1)
 	{
diff --git a/rpcs3/rpcs3qt/register_editor_dialog.cpp b/rpcs3/rpcs3qt/register_editor_dialog.cpp
index ecf4268579..b16a288932 100644
--- a/rpcs3/rpcs3qt/register_editor_dialog.cpp
+++ b/rpcs3/rpcs3qt/register_editor_dialog.cpp
@@ -15,6 +15,7 @@
 #include
 
 #include "util/v128.hpp"
+#include "util/asm.hpp"
 
 constexpr auto qstr = QString::fromStdString;
 inline std::string sstr(const QString& _in) { return _in.toStdString(); }
@@ -30,7 +31,7 @@ enum registers : int
 	ppu_ff31 = ppu_ff0 + 31,
 	ppu_v0,
 	ppu_v31 = ppu_v0 + 31,
-	spu_r0 = ::align(ppu_v31 + 1u, 128),
+	spu_r0 = utils::align(ppu_v31 + 1u, 128),
 	spu_r127 = spu_r0 + 127,
 	PPU_CR,
 	PPU_LR,
diff --git a/rpcs3/rpcs3qt/settings_dialog.cpp b/rpcs3/rpcs3qt/settings_dialog.cpp
index c0761f2c1e..9ec73cf9ae 100644
--- a/rpcs3/rpcs3qt/settings_dialog.cpp
+++ b/rpcs3/rpcs3qt/settings_dialog.cpp
@@ -34,6 +34,7 @@
 #include
 
 #include "util/sysinfo.hpp"
+#include "util/asm.hpp"
 
 #ifdef WITH_DISCORD_RPC
 #include "_discord_utils.h"
@@ -1809,7 +1810,7 @@ void settings_dialog::SnapSlider(QSlider *slider, int interval)
 		{
 			return;
 		}
-		slider->setValue(::rounded_div(value, interval) * interval);
+		slider->setValue(utils::rounded_div(value, interval) * interval);
 	});
 }
diff --git a/rpcs3/util/asm.hpp b/rpcs3/util/asm.hpp
index 0d8c24bb3a..ef72fbb50e 100644
--- a/rpcs3/util/asm.hpp
+++ b/rpcs3/util/asm.hpp
@@ -292,6 +292,32 @@ namespace utils
 		do _mm_pause();
 		while (__rdtsc() - start < cycles);
 	}
+
+	// Align to power of 2
+	template <typename T, typename = std::enable_if_t<std::is_integral<T>::value && std::is_unsigned<T>::value>>
+	constexpr T align(T value, ullong align)
+	{
+		return static_cast<T>((value + (align - 1)) & (0 - align));
+	}
+
+	// General purpose aligned division, the result is rounded up not truncated
+	template <typename T, typename = std::enable_if_t<std::is_integral<T>::value && std::is_unsigned<T>::value>>
+	constexpr T aligned_div(T value, ullong align)
+	{
+		return static_cast<T>((value + align - 1) / align);
+	}
+
+	// General purpose aligned division, the result is rounded to nearest
+	template <typename T, typename = std::enable_if_t<std::is_integral<T>::value>>
+	constexpr T rounded_div(T value, std::conditional_t<std::is_signed<T>::value, llong, ullong> align)
+	{
+		if constexpr (std::is_unsigned<T>::value)
+		{
+			return static_cast<T>((value + (align / 2)) / align);
+		}
+
+		return static_cast<T>((value + (value < 0 ? 0 - align : align) / 2) / align);
+	}
 } // namespace utils
 
 using utils::busy_wait;
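A few compile-time checks of the three helpers just added to the utils namespace (a quick sanity sketch, assuming util/asm.hpp as modified above is on the include path):

    #include "util/asm.hpp"

    static_assert(utils::align(13u, 8) == 16);       // round up to a power-of-2 boundary
    static_assert(utils::align(16u, 8) == 16);       // already-aligned values are unchanged
    static_assert(utils::aligned_div(13u, 8) == 2);  // ceiling division, works for any divisor
    static_assert(utils::rounded_div(13u, 8) == 2);  // round to nearest: 13/8 = 1.625 -> 2
    static_assert(utils::rounded_div(-13, 8) == -2); // signed values round symmetrically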
diff --git a/rpcs3/util/sysinfo.cpp b/rpcs3/util/sysinfo.cpp
index 826ec1220d..d9d798766b 100755
--- a/rpcs3/util/sysinfo.cpp
+++ b/rpcs3/util/sysinfo.cpp
@@ -15,6 +15,8 @@
 #include
 #endif
 
+#include "util/asm.hpp"
+
 inline std::array<u32, 4> utils::get_cpuid(u32 func, u32 subfunc)
 {
 	int regs[4];
@@ -298,7 +300,7 @@ std::string utils::get_OS_version()
 
 static constexpr ullong round_tsc(ullong val)
 {
-	return ::rounded_div(val, 1'000'000) * 1'000'000;
+	return utils::rounded_div(val, 1'000'000) * 1'000'000;
 }
 
 ullong utils::get_tsc_freq()
diff --git a/rpcs3/util/types.hpp b/rpcs3/util/types.hpp
index 93ceb1c65b..c591badcdb 100644
--- a/rpcs3/util/types.hpp
+++ b/rpcs3/util/types.hpp
@@ -595,31 +595,6 @@ struct f16
 	}
 };
 
-template <typename T, typename = std::enable_if_t<std::is_integral<T>::value && std::is_unsigned<T>::value>>
-constexpr T align(T value, ullong align)
-{
-	return static_cast<T>((value + (align - 1)) & (0 - align));
-}
-
-// General purpose aligned division, the result is rounded up not truncated
-template <typename T, typename = std::enable_if_t<std::is_integral<T>::value && std::is_unsigned<T>::value>>
-constexpr T aligned_div(T value, ullong align)
-{
-	return static_cast<T>((value + align - 1) / align);
-}
-
-// General purpose aligned division, the result is rounded to nearest
-template <typename T, typename = std::enable_if_t<std::is_integral<T>::value>>
-constexpr T rounded_div(T value, std::conditional_t<std::is_signed<T>::value, llong, ullong> align)
-{
-	if constexpr (std::is_unsigned<T>::value)
-	{
-		return static_cast<T>((value + (align / 2)) / align);
-	}
-
-	return static_cast<T>((value + (value < 0 ? 0 - align : align) / 2) / align);
-}
-
 template <typename T, typename T2>
 inline u32 offset32(T T2::*const mptr)
 {
diff --git a/rpcs3/util/vm_native.cpp b/rpcs3/util/vm_native.cpp
index 47b132e73a..7320c3b75e 100644
--- a/rpcs3/util/vm_native.cpp
+++ b/rpcs3/util/vm_native.cpp
@@ -1,6 +1,7 @@
 #include "stdafx.h"
 #include "util/logs.hpp"
 #include "util/vm.hpp"
+#include "util/asm.hpp"
 #ifdef _WIN32
 #include "util/dyn_lib.hpp"
 #include
@@ -209,7 +210,7 @@ namespace utils
 	}
 
 	shm::shm(u32 size, u32 flags)
-		: m_size(::align(size, 0x10000))
+		: m_size(utils::align(size, 0x10000))
 		, m_flags(flags)
 		, m_ptr(0)
 	{
@@ -306,7 +307,7 @@ namespace utils
 	{
 		const u64 res64 = reinterpret_cast<u64>(::mmap(reinterpret_cast<void*>(ptr64), m_size + 0xf000, PROT_NONE, MAP_ANON | MAP_PRIVATE, -1, 0));
 
-		const u64 aligned = ::align(res64, 0x10000);
+		const u64 aligned = utils::align(res64, 0x10000);
 		const auto result = ::mmap(reinterpret_cast<void*>(aligned), m_size, +prot, MAP_SHARED | MAP_FIXED, m_file, 0);
 
 		// Now cleanup remnants