
Move align helpers to util/asm.hpp

Also add some files:
GLTextureCache.cpp
VKTextureCache.cpp
Nekotekina 2020-12-18 17:43:34 +03:00
parent d254a5736b
commit eec11bfba9
52 changed files with 794 additions and 713 deletions
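
For orientation, here is a minimal sketch of what the relocated helpers presumably look like in util/asm.hpp, inferred from their call sites in this diff; the exact signatures, constraints and implementations in the real header may differ.

namespace utils
{
	// Round value up to the next multiple of alignment (assumed to be a power of two).
	template <typename T, typename U>
	constexpr T align(T value, U alignment)
	{
		const T a = static_cast<T>(alignment);
		return static_cast<T>((value + (a - 1)) & ~(a - 1));
	}

	// Ceiling division: how many 'size'-sized blocks are needed to cover 'value'.
	template <typename T, typename U>
	constexpr T aligned_div(T value, U size)
	{
		const T s = static_cast<T>(size);
		return static_cast<T>((value + (s - 1)) / s);
	}

	// Division rounded to the nearest integer (used for the trophy percentage below).
	template <typename T, typename U>
	constexpr T rounded_div(T value, U size)
	{
		const T s = static_cast<T>(size);
		return static_cast<T>((value + (s / 2)) / s);
	}
}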

View File

@ -10,6 +10,8 @@
#include <typeinfo>
#include <map>
#include "util/asm.hpp"
using namespace std::literals::string_literals;
#ifdef _WIN32
@ -1725,7 +1727,7 @@ u64 fs::get_dir_size(const std::string& path, u64 rounding_alignment)
if (!entry.is_directory)
{
result += ::align(entry.size, rounding_alignment);
result += utils::align(entry.size, rounding_alignment);
}
else
{

View File

@ -6,6 +6,7 @@
#include "util/logs.hpp"
#include "mutex.h"
#include "util/vm.hpp"
#include "util/asm.hpp"
#include <immintrin.h>
#include <zlib.h>
@ -52,8 +53,8 @@ static u8* add_jit_memory(usz size, uint align)
// Simple allocation by incrementing pointer to the next free data
const u64 pos = Ctr.atomic_op([&](u64& ctr) -> u64
{
const u64 _pos = ::align(ctr & 0xffff'ffff, align);
const u64 _new = ::align(_pos + size, align);
const u64 _pos = utils::align(ctr & 0xffff'ffff, align);
const u64 _new = utils::align(_pos + size, align);
if (_new > 0x40000000) [[unlikely]]
{
@ -69,7 +70,7 @@ static u8* add_jit_memory(usz size, uint align)
// Check the necessity to commit more memory
if (_new > olda) [[unlikely]]
{
newa = ::align(_new, 0x200000);
newa = utils::align(_new, 0x200000);
}
ctr += _new - (ctr & 0xffff'ffff);
@ -223,7 +224,7 @@ asmjit::Runtime& asmjit::get_global_runtime()
return asmjit::kErrorNoCodeGenerated;
}
void* p = m_pos.fetch_add(::align(codeSize, 4096));
void* p = m_pos.fetch_add(utils::align(codeSize, 4096));
if (!p || m_pos > m_max) [[unlikely]]
{
*dst = nullptr;
@ -237,7 +238,7 @@ asmjit::Runtime& asmjit::get_global_runtime()
return asmjit::kErrorInvalidState;
}
utils::memory_protect(p, ::align(codeSize, 4096), utils::protection::rx);
utils::memory_protect(p, utils::align(codeSize, 4096), utils::protection::rx);
flush(p, relocSize);
*dst = p;
@ -351,8 +352,8 @@ struct MemoryManager1 : llvm::RTDyldMemoryManager
return nullptr;
}
const u64 olda = ::align(oldp, align);
const u64 newp = ::align(olda + size, align);
const u64 olda = utils::align(oldp, align);
const u64 newp = utils::align(olda + size, align);
if ((newp - 1) / c_max_size != oldp / c_max_size)
{
@ -363,8 +364,8 @@ struct MemoryManager1 : llvm::RTDyldMemoryManager
if ((oldp - 1) / c_page_size != (newp - 1) / c_page_size)
{
// Allocate pages on demand
const u64 pagea = ::align(oldp, c_page_size);
const u64 psize = ::align(newp - pagea, c_page_size);
const u64 pagea = utils::align(oldp, c_page_size);
const u64 psize = utils::align(newp - pagea, c_page_size);
utils::memory_commit(this->ptr + pagea, psize, prot);
}
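
The add_jit_memory and MemoryManager1 hunks above share one pattern: atomically bump a counter, align the resulting offset, and commit backing pages only when the aligned end crosses into uncommitted space. A hypothetical standalone illustration of the bump step (the names and the 1 GiB limit are invented for this sketch, not taken from the RPCS3 sources):

#include <atomic>
#include <cstdint>

static std::atomic<std::uint64_t> g_ctr{0};

// Reserve an aligned slice of a fixed region; returns its offset, or UINT64_MAX when full.
std::uint64_t bump_alloc(std::uint64_t size, std::uint64_t alignment)
{
	std::uint64_t old = g_ctr.load();

	while (true)
	{
		const std::uint64_t pos  = (old + (alignment - 1)) & ~(alignment - 1);        // utils::align(old, alignment)
		const std::uint64_t next = (pos + size + (alignment - 1)) & ~(alignment - 1); // utils::align(pos + size, alignment)

		if (next > 0x40000000)
		{
			return UINT64_MAX; // out of reserved space
		}

		if (g_ctr.compare_exchange_weak(old, next))
		{
			return pos; // caller may now commit pages covering [pos, next) on demand
		}
	}
}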

View File

@ -6,6 +6,7 @@
#include <cmath>
#include "util/v128.hpp"
#include "util/asm.hpp"
LOG_CHANNEL(edat_log, "EDAT");
@ -949,7 +950,7 @@ bool EDATADecrypter::ReadHeader()
}*/
file_size = edatHeader.file_size;
total_blocks = ::aligned_div(edatHeader.file_size, edatHeader.block_size);
total_blocks = utils::aligned_div(edatHeader.file_size, edatHeader.block_size);
return true;
}
@ -962,7 +963,7 @@ u64 EDATADecrypter::ReadData(u64 pos, u8* data, u64 size)
// now we need to offset things to account for the actual 'range' requested
const u64 startOffset = pos % edatHeader.block_size;
const u32 num_blocks = static_cast<u32>(::aligned_div(startOffset + size, edatHeader.block_size));
const u32 num_blocks = static_cast<u32>(utils::aligned_div(startOffset + size, edatHeader.block_size));
const u64 bufSize = num_blocks*edatHeader.block_size;
if (data_buf_size < (bufSize))
{

View File

@ -428,6 +428,7 @@ target_sources(rpcs3_emu PRIVATE
RSX/GL/GLTexture.cpp
RSX/GL/GLVertexBuffers.cpp
RSX/GL/GLVertexProgram.cpp
RSX/GL/GLTextureCache.cpp
RSX/GL/OpenGL.cpp
)
@ -454,6 +455,7 @@ if(TARGET 3rdparty_vulkan)
RSX/VK/VKTexture.cpp
RSX/VK/VKVertexBuffers.cpp
RSX/VK/VKVertexProgram.cpp
RSX/VK/VKTextureCache.cpp
)
endif()

View File

@ -7,7 +7,7 @@
#include "cellPamf.h"
#include "cellDmux.h"
#include <thread>
#include "util/asm.hpp"
LOG_CHANNEL(cellDmux);
@ -753,9 +753,9 @@ PesHeader::PesHeader(DemuxerStream& stream)
}
ElementaryStream::ElementaryStream(Demuxer* dmux, u32 addr, u32 size, u32 fidMajor, u32 fidMinor, u32 sup1, u32 sup2, vm::ptr<CellDmuxCbEsMsg> cbFunc, u32 cbArg, u32 spec)
: put(align(addr, 128))
: put(utils::align(addr, 128))
, dmux(dmux)
, memAddr(align(addr, 128))
, memAddr(utils::align(addr, 128))
, memSize(size - (addr - memAddr))
, fidMajor(fidMajor)
, fidMinor(fidMinor)
@ -847,7 +847,7 @@ void ElementaryStream::push_au(u32 size, u64 dts, u64 pts, u64 userdata, bool ra
addr = put;
put = align(put + 128 + size, 128);
put = utils::align(put + 128 + size, 128);
put_count++;
}

View File

@ -20,6 +20,8 @@
#include <mutex>
#include <algorithm>
#include "util/asm.hpp"
LOG_CHANNEL(cellSaveData);
template<>
@ -953,7 +955,7 @@ static NEVER_INLINE error_code savedata_op(ppu_thread& ppu, u32 operation, u32 v
{
if (!file.is_directory)
{
size_bytes += ::align(file.size, 1024);
size_bytes += utils::align(file.size, 1024);
}
}
@ -1334,7 +1336,7 @@ static NEVER_INLINE error_code savedata_op(ppu_thread& ppu, u32 operation, u32 v
{
statGet->fileNum++;
size_bytes += ::align(entry.size, 1024); // firmware rounds this value up
size_bytes += utils::align(entry.size, 1024); // firmware rounds this value up
if (statGet->fileListNum >= setBuf->fileListMax)
continue;
@ -1892,7 +1894,7 @@ static NEVER_INLINE error_code savedata_op(ppu_thread& ppu, u32 operation, u32 v
// add file list per FS order to PARAM.SFO
std::string final_blist;
final_blist = fmt::merge(blist, "/");
psf::assign(psf, "RPCS3_BLIST", psf::string(::align(::size32(final_blist) + 1, 4), final_blist));
psf::assign(psf, "RPCS3_BLIST", psf::string(utils::align(::size32(final_blist) + 1, 4), final_blist));
// Write all files in temporary directory
auto& fsfo = all_files["PARAM.SFO"];
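
As a quick, illustrative check of the 1 KiB rounding applied to savedata sizes in the hunks above (assuming utils::align is usable in a constant expression, which this sketch takes for granted):

static_assert(utils::align<u64>(1, 1024) == 1024);    // a 1-byte file still occupies a full block
static_assert(utils::align<u64>(2048, 1024) == 2048); // exact multiples are unchanged
static_assert(utils::align<u64>(2049, 1024) == 3072); // anything past a boundary rounds up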

View File

@ -34,6 +34,7 @@ extern "C"
#include <cmath>
#include "Utilities/lockless.h"
#include <variant>
#include "util/asm.hpp"
std::mutex g_mutex_avcodec_open2;
@ -879,7 +880,7 @@ error_code cellVdecGetPicture(u32 handle, vm::cptr<CellVdecPicFormat> format, vm
sws_scale(vdec->sws, in_data, in_line, 0, h, out_data, out_line);
//const u32 buf_size = align(av_image_get_buffer_size(vdec->ctx->pix_fmt, vdec->ctx->width, vdec->ctx->height, 1), 128);
//const u32 buf_size = utils::align(av_image_get_buffer_size(vdec->ctx->pix_fmt, vdec->ctx->width, vdec->ctx->height, 1), 128);
//// TODO: zero padding bytes
@ -974,7 +975,7 @@ error_code cellVdecGetPicItem(u32 handle, vm::pptr<CellVdecPicItem> picItem)
info->startAddr = 0x00000123; // invalid value (no address for picture)
const int buffer_size = av_image_get_buffer_size(vdec->ctx->pix_fmt, vdec->ctx->width, vdec->ctx->height, 1);
ensure(buffer_size >= 0);
info->size = align<u32>(buffer_size, 128);
info->size = utils::align<u32>(buffer_size, 128);
info->auNum = 1;
info->auPts[0].lower = static_cast<u32>(pts);
info->auPts[0].upper = static_cast<u32>(pts >> 32);

View File

@ -20,6 +20,7 @@
#include "Emu/Cell/lv2/sys_process.h"
#include <cmath>
#include "util/asm.hpp"
LOG_CHANNEL(sceNpTrophy);
@ -1109,7 +1110,7 @@ error_code sceNpTrophyGetGameProgress(u32 context, u32 handle, vm::ptr<s32> perc
const u32 trp_count = ctxt->tropusr->GetTrophiesCount();
// Round result to nearest (TODO: Check 0 trophies)
*percentage = trp_count ? ::rounded_div(unlocked * 100, trp_count) : 0;
*percentage = trp_count ? utils::rounded_div(unlocked * 100, trp_count) : 0;
if (trp_count == 0 || trp_count > 128)
{

View File

@ -22,6 +22,7 @@
#include <map>
#include <set>
#include <algorithm>
#include "util/asm.hpp"
LOG_CHANNEL(ppu_loader);
@ -263,7 +264,7 @@ static void ppu_initialize_modules(ppu_linkage_info* link)
}
// Set memory protection to read-only
vm::page_protect(ppu_function_manager::addr, ::align(::size32(hle_funcs) * 8, 0x1000), 0, 0, vm::page_writable);
vm::page_protect(ppu_function_manager::addr, utils::align(::size32(hle_funcs) * 8, 0x1000), 0, 0, vm::page_writable);
// Initialize function names
const bool is_first = g_ppu_function_names.empty();
@ -319,7 +320,7 @@ static void ppu_initialize_modules(ppu_linkage_info* link)
}
else
{
const u32 next = ::align(alloc_addr, variable.second.align);
const u32 next = utils::align(alloc_addr, variable.second.align);
const u32 end = next + variable.second.size;
if (!next || (end >> 12 != alloc_addr >> 12))
@ -1500,7 +1501,7 @@ void ppu_load_exec(const ppu_exec_object& elf)
for (const auto& arg : Emu.argv)
{
const u32 arg_size = ::align(::size32(arg) + 1, 0x10);
const u32 arg_size = utils::align(::size32(arg) + 1, 0x10);
const u32 arg_addr = vm::alloc(arg_size, vm::main);
std::memcpy(vm::base(arg_addr), arg.data(), arg_size);
@ -1513,7 +1514,7 @@ void ppu_load_exec(const ppu_exec_object& elf)
for (const auto& arg : Emu.envp)
{
const u32 arg_size = ::align(::size32(arg) + 1, 0x10);
const u32 arg_size = utils::align(::size32(arg) + 1, 0x10);
const u32 arg_addr = vm::alloc(arg_size, vm::main);
std::memcpy(vm::base(arg_addr), arg.data(), arg_size);
@ -1533,7 +1534,7 @@ void ppu_load_exec(const ppu_exec_object& elf)
case 0x70: primary_stacksize = 1024 * 1024; break; // SYS_PROCESS_PRIMARY_STACK_SIZE_1M
default:
{
primary_stacksize = ::align<u32>(std::clamp<u32>(sz, 0x10000, 0x100000), 4096);
primary_stacksize = utils::align<u32>(std::clamp<u32>(sz, 0x10000, 0x100000), 4096);
break;
}
}
@ -1636,7 +1637,7 @@ void ppu_load_exec(const ppu_exec_object& elf)
if (prog.p_type == 0x1u /* LOAD */ && prog.p_memsz && (prog.p_flags & 0x2) == 0u /* W */)
{
// Set memory protection to read-only when necessary
ensure(vm::page_protect(addr, ::align(size, 0x1000), 0, 0, vm::page_writable));
ensure(vm::page_protect(addr, utils::align(size, 0x1000), 0, 0, vm::page_writable));
}
}
}

View File

@ -242,7 +242,7 @@ extern void ppu_register_range(u32 addr, u32 size)
// Register executable range at
utils::memory_commit(&ppu_ref(addr), size * 2, utils::protection::rw);
vm::page_protect(addr, align(size, 0x10000), 0, vm::page_executable);
vm::page_protect(addr, utils::align(size, 0x10000), 0, vm::page_executable);
const u64 fallback = g_cfg.core.ppu_decoder == ppu_decoder_type::llvm ? reinterpret_cast<uptr>(ppu_recompiler_fallback) : reinterpret_cast<uptr>(ppu_fallback);
@ -1098,7 +1098,7 @@ u32 ppu_thread::stack_push(u32 size, u32 align_v)
ppu_thread& context = static_cast<ppu_thread&>(*cpu);
const u32 old_pos = vm::cast(context.gpr[1]);
context.gpr[1] -= align(size + 4, 8); // room minimal possible size
context.gpr[1] -= utils::align(size + 4, 8); // room minimal possible size
context.gpr[1] &= ~(u64{align_v} - 1); // fix stack alignment
if (old_pos >= context.stack_addr && old_pos < context.stack_addr + context.stack_size && context.gpr[1] < context.stack_addr)

View File

@ -288,7 +288,7 @@ spu_function_t spu_recompiler::compile(spu_program&& _func)
words_align = 64;
const u32 starta = start & -64;
const u32 enda = ::align(end, 64);
const u32 enda = utils::align(end, 64);
const u32 sizea = (enda - starta) / 64;
ensure(sizea);
@ -369,7 +369,7 @@ spu_function_t spu_recompiler::compile(spu_program&& _func)
words_align = 32;
const u32 starta = start & -32;
const u32 enda = ::align(end, 32);
const u32 enda = utils::align(end, 32);
const u32 sizea = (enda - starta) / 32;
ensure(sizea);
@ -491,7 +491,7 @@ spu_function_t spu_recompiler::compile(spu_program&& _func)
words_align = 32;
const u32 starta = start & -32;
const u32 enda = ::align(end, 32);
const u32 enda = utils::align(end, 32);
const u32 sizea = (enda - starta) / 32;
ensure(sizea);

View File

@ -2338,7 +2338,7 @@ void spu_thread::do_dma_transfer(spu_thread* _this, const spu_mfc_cmd& args, u8*
}
u32 range_addr = eal & -128;
u32 range_end = ::align(eal + size, 128);
u32 range_end = utils::align(eal + size, 128);
// Handle the case of crossing 64K page borders (TODO: maybe split in 4K fragments?)
if (range_addr >> 16 != (range_end - 1) >> 16)

View File

@ -8,6 +8,7 @@
#include "Emu/IdManager.h"
#include "util/vm.hpp"
#include "util/asm.hpp"
LOG_CHANNEL(sys_memory);
@ -57,7 +58,7 @@ error_code sys_memory_allocate(cpu_thread& cpu, u32 size, u64 flags, vm::ptr<u32
return CELL_ENOMEM;
}
if (const auto area = vm::reserve_map(align == 0x10000 ? vm::user64k : vm::user1m, 0, ::align(size, 0x10000000), 0x401))
if (const auto area = vm::reserve_map(align == 0x10000 ? vm::user64k : vm::user1m, 0, utils::align(size, 0x10000000), 0x401))
{
if (u32 addr = area->alloc(size, nullptr, align))
{
@ -128,7 +129,7 @@ error_code sys_memory_allocate_from_container(cpu_thread& cpu, u32 size, u32 cid
return ct.ret;
}
if (const auto area = vm::reserve_map(align == 0x10000 ? vm::user64k : vm::user1m, 0, ::align(size, 0x10000000), 0x401))
if (const auto area = vm::reserve_map(align == 0x10000 ? vm::user64k : vm::user1m, 0, utils::align(size, 0x10000000), 0x401))
{
if (u32 addr = area->alloc(size))
{

View File

@ -12,6 +12,8 @@
#include "sys_mmapper.h"
#include "sys_memory.h"
#include "util/asm.hpp"
LOG_CHANNEL(sys_ppu_thread);
// Simple structure to cleanup previous thread, because can't remove its own thread
@ -388,7 +390,7 @@ error_code _sys_ppu_thread_create(ppu_thread& ppu, vm::ptr<u64> thread_id, vm::p
g_fxo->get<ppu_thread_cleaner>()->clean(0);
// Compute actual stack size and allocate
const u32 stack_size = ::align<u32>(std::max<u32>(_stacksz, 4096), 4096);
const u32 stack_size = utils::align<u32>(std::max<u32>(_stacksz, 4096), 4096);
const auto dct = g_fxo->get<lv2_memory_container>();

View File

@ -99,7 +99,7 @@ void sys_spu_image::load(const fs::file& stream)
this->nsegs = 0;
this->segs = vm::null;
vm::page_protect(segs.addr(), ::align(mem_size, 4096), 0, 0, vm::page_writable);
vm::page_protect(segs.addr(), utils::align(mem_size, 4096), 0, 0, vm::page_writable);
}
void sys_spu_image::free()

View File

@ -974,13 +974,13 @@ namespace vm
if (state & page_1m_size)
{
i = ::align(i + 1, 0x100000 / 4096);
i = utils::align(i + 1, 0x100000 / 4096);
continue;
}
if (state & page_64k_size)
{
i = ::align(i + 1, 0x10000 / 4096);
i = utils::align(i + 1, 0x10000 / 4096);
continue;
}
@ -1177,7 +1177,7 @@ namespace vm
const u32 min_page_size = flags & 0x100 ? 0x1000 : 0x10000;
// Align to minimal page size
const u32 size = ::align(orig_size, min_page_size) + (flags & 0x10 ? 0x2000 : 0);
const u32 size = utils::align(orig_size, min_page_size) + (flags & 0x10 ? 0x2000 : 0);
// Check alignment (it's page allocation, so passing small values there is just silly)
if (align < min_page_size || align != (0x80000000u >> std::countl_zero(align)))
@ -1217,7 +1217,7 @@ namespace vm
vm::writer_lock lock(0);
// Search for an appropriate place (unoptimized)
for (u32 addr = ::align(this->addr, align); u64{addr} + size <= u64{this->addr} + this->size; addr += align)
for (u32 addr = utils::align(this->addr, align); u64{addr} + size <= u64{this->addr} + this->size; addr += align)
{
if (try_alloc(addr, pflags, size, std::move(shm)))
{
@ -1240,7 +1240,7 @@ namespace vm
const u32 min_page_size = flags & 0x100 ? 0x1000 : 0x10000;
// Align to minimal page size
const u32 size = ::align(orig_size, min_page_size);
const u32 size = utils::align(orig_size, min_page_size);
// return if addr or size is invalid
if (!size || addr < this->addr || orig_size > size || addr + u64{size} > this->addr + u64{this->size} || flags & 0x10)
@ -1410,7 +1410,7 @@ namespace vm
static std::shared_ptr<block_t> _find_map(u32 size, u32 align, u64 flags)
{
for (u32 addr = ::align<u32>(0x20000000, align); addr - 1 < 0xC0000000 - 1; addr += align)
for (u32 addr = utils::align<u32>(0x20000000, align); addr - 1 < 0xC0000000 - 1; addr += align)
{
if (_test_map(addr, size))
{
@ -1485,7 +1485,7 @@ namespace vm
vm::writer_lock lock(0);
// Align to minimal page size
const u32 size = ::align(orig_size, 0x10000);
const u32 size = utils::align(orig_size, 0x10000);
// Check alignment
if (align < 0x10000 || align != (0x80000000u >> std::countl_zero(align)))

View File

@ -32,6 +32,8 @@
#include <net/if_dl.h>
#endif
#include "util/asm.hpp"
LOG_CHANNEL(sys_net);
LOG_CHANNEL(sceNp2);
LOG_CHANNEL(sceNp);
@ -384,7 +386,7 @@ vm::addr_t np_handler::allocate(u32 size)
return vm::cast(static_cast<u64>(0));
// Align allocs
const u32 alloc_size = ::align(size, 4);
const u32 alloc_size = utils::align(size, 4);
if (alloc_size > mpool_avail)
{
sceNp.error("Not enough memory available in NP pool!");

View File

@ -7,6 +7,7 @@
#include "Emu/RSX/RSXThread.h"
#include <map>
#include "util/asm.hpp"
namespace rsx
{
@ -23,7 +24,7 @@ namespace rsx
}
// User memory + fifo size
buffer_size = ::align<u32>(buffer_size, 0x100000) + 0x10000000;
buffer_size = utils::align<u32>(buffer_size, 0x100000) + 0x10000000;
// We are not allowed to drain all memory so add a little
g_fxo->init<lv2_memory_container>(buffer_size + 0x1000000);

View File

@ -4,6 +4,8 @@
#include "../RSXThread.h"
#include "../rsx_utils.h"
#include "util/asm.hpp"
namespace
{
// FIXME: GSL as_span break build if template parameter is non const with current revision.
@ -346,8 +348,8 @@ namespace
}
else
{
current_subresource_layout.width_in_block = aligned_div(miplevel_width_in_texel, block_edge_in_texel);
current_subresource_layout.height_in_block = aligned_div(miplevel_height_in_texel, block_edge_in_texel);
current_subresource_layout.width_in_block = utils::aligned_div(miplevel_width_in_texel, block_edge_in_texel);
current_subresource_layout.height_in_block = utils::aligned_div(miplevel_height_in_texel, block_edge_in_texel);
}
if (padded_row)
@ -375,7 +377,7 @@ namespace
miplevel_height_in_texel = std::max(miplevel_height_in_texel / 2, 1);
}
offset_in_src = align(offset_in_src, 128);
offset_in_src = utils::align(offset_in_src, 128);
}
return result;
@ -922,8 +924,8 @@ namespace rsx
usz result = 0;
for (u16 i = 0; i < mipmap; ++i)
{
usz rowPitch = align(block_size_in_byte * width_in_blocks, row_pitch_alignment);
result += align(rowPitch * height_in_blocks * depth, mipmap_alignment);
usz rowPitch = utils::align(block_size_in_byte * width_in_blocks, row_pitch_alignment);
result += utils::align(rowPitch * height_in_blocks * depth, mipmap_alignment);
height_in_blocks = std::max<usz>(height_in_blocks / 2, 1);
width_in_blocks = std::max<usz>(width_in_blocks / 2, 1);
}

View File

@ -1,6 +1,7 @@
#pragma once
#include "util/logs.hpp"
#include "util/asm.hpp"
/**
* Ring buffer memory helper :
@ -19,8 +20,8 @@ protected:
template<int Alignment>
bool can_alloc(usz size) const
{
usz alloc_size = align(size, Alignment);
usz aligned_put_pos = align(m_put_pos, Alignment);
usz alloc_size = utils::align(size, Alignment);
usz aligned_put_pos = utils::align(m_put_pos, Alignment);
if (aligned_put_pos + alloc_size < m_size)
{
// range before get
@ -83,8 +84,8 @@ public:
template<int Alignment>
usz alloc(usz size)
{
const usz alloc_size = align(size, Alignment);
const usz aligned_put_pos = align(m_put_pos, Alignment);
const usz alloc_size = utils::align(size, Alignment);
const usz aligned_put_pos = utils::align(m_put_pos, Alignment);
if (!can_alloc<Alignment>(size) && !grow(aligned_put_pos + alloc_size))
{

View File

@ -1,6 +1,8 @@
#include "stdafx.h"
#include "surface_store.h"
#include "util/asm.hpp"
namespace rsx
{
namespace utility
@ -23,20 +25,20 @@ namespace rsx
{
switch (format)
{
case surface_color_format::b8: return align(width, 256);
case surface_color_format::b8: return utils::align(width, 256);
case surface_color_format::g8b8:
case surface_color_format::x1r5g5b5_o1r5g5b5:
case surface_color_format::x1r5g5b5_z1r5g5b5:
case surface_color_format::r5g6b5: return align(width * 2, 256);
case surface_color_format::r5g6b5: return utils::align(width * 2, 256);
case surface_color_format::a8b8g8r8:
case surface_color_format::x8b8g8r8_o8b8g8r8:
case surface_color_format::x8b8g8r8_z8b8g8r8:
case surface_color_format::x8r8g8b8_o8r8g8b8:
case surface_color_format::x8r8g8b8_z8r8g8b8:
case surface_color_format::x32:
case surface_color_format::a8r8g8b8: return align(width * 4, 256);
case surface_color_format::w16z16y16x16: return align(width * 8, 256);
case surface_color_format::w32z32y32x32: return align(width * 16, 256);
case surface_color_format::a8r8g8b8: return utils::align(width * 4, 256);
case surface_color_format::w16z16y16x16: return utils::align(width * 8, 256);
case surface_color_format::w32z32y32x32: return utils::align(width * 16, 256);
}
fmt::throw_exception("Unknown color surface format");
}
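
A worked example of the pitch rule above (illustrative only, and assuming utils::align is constexpr): an a8r8g8b8 surface 1366 texels wide needs 1366 * 4 = 5464 bytes per row, which rounds up to the next 256-byte boundary.

static_assert(utils::align(1366u * 4, 256) == 5632);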

View File

@ -5,6 +5,8 @@
#include "../rsx_utils.h"
#include <list>
#include "util/asm.hpp"
namespace rsx
{
namespace utility
@ -918,7 +920,7 @@ namespace rsx
{
// Width is calculated in the coordinate-space of the requester; normalize
info.src_area.x = (info.src_area.x * required_bpp) / surface_bpp;
info.src_area.width = align(width * required_bpp, surface_bpp) / surface_bpp;
info.src_area.width = utils::align(width * required_bpp, surface_bpp) / surface_bpp;
}
else
{

View File

@ -4,6 +4,8 @@
#include "Emu/IdManager.h"
#include "GLHelpers.h"
#include "util/asm.hpp"
namespace gl
{
struct compute_task
@ -224,7 +226,7 @@ namespace gl
m_data_length = data_length;
const auto num_bytes_per_invocation = optimal_group_size * kernel_size * 4;
const auto num_bytes_to_process = align(data_length, num_bytes_per_invocation);
const auto num_bytes_to_process = utils::align(data_length, num_bytes_per_invocation);
const auto num_invocations = num_bytes_to_process / num_bytes_per_invocation;
if ((num_bytes_to_process + data_offset) > data->size())

View File

@ -740,7 +740,7 @@ void GLGSRender::load_program_env()
if (update_fragment_env) m_fragment_env_buffer->reserve_storage_on_heap(128);
if (update_vertex_env) m_vertex_env_buffer->reserve_storage_on_heap(256);
if (update_fragment_texture_env) m_texture_parameters_buffer->reserve_storage_on_heap(256);
if (update_fragment_constants) m_fragment_constants_buffer->reserve_storage_on_heap(align(fragment_constants_size, 256));
if (update_fragment_constants) m_fragment_constants_buffer->reserve_storage_on_heap(utils::align(fragment_constants_size, 256));
if (update_transform_constants) m_transform_constants_buffer->reserve_storage_on_heap(8192);
if (update_raster_env) m_raster_env_ring_buffer->reserve_storage_on_heap(128);

View File

@ -16,6 +16,7 @@
#include "Utilities/mutex.h"
#include "Utilities/geometry.h"
#include "util/logs.hpp"
#include "util/asm.hpp"
#define GL_FRAGMENT_TEXTURES_START 0
#define GL_VERTEX_TEXTURES_START (GL_FRAGMENT_TEXTURES_START + 16)
@ -808,7 +809,7 @@ namespace gl
virtual std::pair<void*, u32> alloc_from_heap(u32 alloc_size, u16 alignment)
{
u32 offset = m_data_loc;
if (m_data_loc) offset = align(offset, alignment);
if (m_data_loc) offset = utils::align(offset, alignment);
if ((offset + alloc_size) > m_size)
{
@ -827,7 +828,7 @@ namespace gl
}
//Align data loc to 256; allows some "guard" region so we dont trample our own data inadvertently
m_data_loc = align(offset + alloc_size, 256);
m_data_loc = utils::align(offset + alloc_size, 256);
return std::make_pair(static_cast<char*>(m_memory_mapping) + offset, offset);
}
@ -897,9 +898,9 @@ namespace gl
ensure(m_memory_mapping == nullptr);
u32 offset = m_data_loc;
if (m_data_loc) offset = align(offset, 256);
if (m_data_loc) offset = utils::align(offset, 256);
const u32 block_size = align(alloc_size + 16, 256); //Overallocate just in case we need to realign base
const u32 block_size = utils::align(alloc_size + 16, 256); //Overallocate just in case we need to realign base
if ((offset + block_size) > m_size)
{
@ -933,10 +934,10 @@ namespace gl
std::pair<void*, u32> alloc_from_heap(u32 alloc_size, u16 alignment) override
{
u32 offset = m_data_loc;
if (m_data_loc) offset = align(offset, alignment);
if (m_data_loc) offset = utils::align(offset, alignment);
u32 padding = (offset - m_data_loc);
u32 real_size = align(padding + alloc_size, alignment); //Ensures we leave the loc pointer aligned after we exit
u32 real_size = utils::align(padding + alloc_size, alignment); //Ensures we leave the loc pointer aligned after we exit
if (real_size > m_mapped_bytes)
{
@ -946,10 +947,10 @@ namespace gl
reserve_storage_on_heap(std::max(real_size, 4096U));
offset = m_data_loc;
if (m_data_loc) offset = align(offset, alignment);
if (m_data_loc) offset = utils::align(offset, alignment);
padding = (offset - m_data_loc);
real_size = align(padding + alloc_size, alignment);
real_size = utils::align(padding + alloc_size, alignment);
}
m_data_loc = offset + real_size;

View File

@ -6,6 +6,8 @@
#include "../RSXThread.h"
#include "../RSXTexture.h"
#include "util/asm.hpp"
namespace gl
{
buffer g_typeless_transfer_buffer;
@ -614,8 +616,8 @@ namespace gl
{
//Compressed formats have a 4-byte alignment
//TODO: Verify that samplers are not affected by the padding
width = align(width, 4);
height = align(height, 4);
width = utils::align(width, 4);
height = utils::align(height, 4);
}
GLenum target;
@ -654,7 +656,7 @@ namespace gl
{
caps.supports_vtc_decoding = gl::get_driver_caps().vendor_NVIDIA;
unpack_settings.row_length(align(dst->width(), 4));
unpack_settings.row_length(utils::align(dst->width(), 4));
unpack_settings.apply();
glBindTexture(static_cast<GLenum>(dst->get_target()), dst->id());
@ -664,7 +666,7 @@ namespace gl
for (const rsx::subresource_layout& layout : input_layouts)
{
upload_texture_subresource(staging_buffer, layout, format, is_swizzled, caps);
const sizei image_size{ align(layout.width_in_texel, 4), align(layout.height_in_texel, 4) };
const sizei image_size{utils::align(layout.width_in_texel, 4), utils::align(layout.height_in_texel, 4)};
switch (dst->get_target())
{
@ -835,7 +837,7 @@ namespace gl
void upload_texture(texture* dst, u32 gcm_format, bool is_swizzled, const std::vector<rsx::subresource_layout>& subresources_layout)
{
// Calculate staging buffer size
const u32 aligned_pitch = align<u32>(dst->pitch(), 4);
const u32 aligned_pitch = utils::align<u32>(dst->pitch(), 4);
usz texture_data_sz = dst->depth() * dst->height() * aligned_pitch;
std::vector<std::byte> data_upload_buf(texture_data_sz);

View File

@ -0,0 +1,191 @@
#include "stdafx.h"
#include "Emu/RSX/RSXThread.h"
#include "GLTexture.h"
#include "GLTextureCache.h"
#include "util/asm.hpp"
namespace gl
{
void cached_texture_section::finish_flush()
{
// Free resources
glUnmapBuffer(GL_PIXEL_PACK_BUFFER);
glBindBuffer(GL_PIXEL_PACK_BUFFER, GL_NONE);
const auto valid_range = get_confirmed_range_delta();
const u32 valid_offset = valid_range.first;
const u32 valid_length = valid_range.second;
void *dst = get_ptr(get_section_base() + valid_offset);
if (!gl::get_driver_caps().ARB_compute_shader_supported)
{
switch (type)
{
case gl::texture::type::sbyte:
case gl::texture::type::ubyte:
{
// byte swapping does not work on byte types, use uint_8_8_8_8 for rgba8 instead to avoid penalty
ensure(!pack_unpack_swap_bytes);
break;
}
case gl::texture::type::uint_24_8:
{
// Swap bytes on D24S8 does not swap the whole dword, just shuffles the 3 bytes for D24
// In this regard, D24S8 is the same structure on both PC and PS3, but the endianness of the whole block is reversed on PS3
ensure(pack_unpack_swap_bytes == false);
ensure(real_pitch == (width * 4));
if (rsx_pitch == real_pitch) [[likely]]
{
stream_data_to_memory_swapped_u32<true>(dst, dst, valid_length / 4, 4);
}
else
{
const u32 num_rows = utils::align(valid_length, rsx_pitch) / rsx_pitch;
u8* data = static_cast<u8*>(dst);
for (u32 row = 0; row < num_rows; ++row)
{
stream_data_to_memory_swapped_u32<true>(data, data, width, 4);
data += rsx_pitch;
}
}
break;
}
default:
break;
}
}
if (is_swizzled())
{
// This format is completely worthless to CPU processing algorithms where cache lines on die are linear.
// If this is happening, usually it means it was not a planned readback (e.g shared pages situation)
rsx_log.warning("[Performance warning] CPU readback of swizzled data");
// Read-modify-write to avoid corrupting already resident memory outside texture region
std::vector<u8> tmp_data(rsx_pitch * height);
std::memcpy(tmp_data.data(), dst, tmp_data.size());
switch (type)
{
case gl::texture::type::uint_8_8_8_8:
case gl::texture::type::uint_24_8:
rsx::convert_linear_swizzle<u32, false>(tmp_data.data(), dst, width, height, rsx_pitch);
break;
case gl::texture::type::ushort_5_6_5:
case gl::texture::type::ushort:
rsx::convert_linear_swizzle<u16, false>(tmp_data.data(), dst, width, height, rsx_pitch);
break;
default:
rsx_log.error("Unexpected swizzled texture format 0x%x", static_cast<u32>(format));
}
}
if (context == rsx::texture_upload_context::framebuffer_storage)
{
// Update memory tag
static_cast<gl::render_target*>(vram_texture)->sync_tag();
}
}
void texture_cache::copy_transfer_regions_impl(gl::command_context& cmd, gl::texture* dst_image, const std::vector<copy_region_descriptor>& sources) const
{
const auto dst_bpp = dst_image->pitch() / dst_image->width();
const auto dst_aspect = dst_image->aspect();
for (const auto &slice : sources)
{
if (!slice.src)
continue;
const bool typeless = dst_aspect != slice.src->aspect() ||
!formats_are_bitcast_compatible(static_cast<GLenum>(slice.src->get_internal_format()), static_cast<GLenum>(dst_image->get_internal_format()));
std::unique_ptr<gl::texture> tmp;
auto src_image = slice.src;
auto src_x = slice.src_x;
auto src_y = slice.src_y;
auto src_w = slice.src_w;
auto src_h = slice.src_h;
if (slice.xform == rsx::surface_transform::coordinate_transform)
{
// Dimensions were given in 'dst' space. Work out the real source coordinates
const auto src_bpp = slice.src->pitch() / slice.src->width();
src_x = (src_x * dst_bpp) / src_bpp;
src_w = utils::aligned_div<u16>(src_w * dst_bpp, src_bpp);
}
if (auto surface = dynamic_cast<gl::render_target*>(slice.src))
{
surface->transform_samples_to_pixels(src_x, src_w, src_y, src_h);
}
if (typeless) [[unlikely]]
{
const auto src_bpp = slice.src->pitch() / slice.src->width();
const u16 convert_w = u16(slice.src->width() * src_bpp) / dst_bpp;
tmp = std::make_unique<texture>(GL_TEXTURE_2D, convert_w, slice.src->height(), 1, 1, static_cast<GLenum>(dst_image->get_internal_format()));
src_image = tmp.get();
// Compute src region in dst format layout
const u16 src_w2 = u16(src_w * src_bpp) / dst_bpp;
const u16 src_x2 = u16(src_x * src_bpp) / dst_bpp;
if (src_w2 == slice.dst_w && src_h == slice.dst_h && slice.level == 0)
{
// Optimization, avoid typeless copy to tmp followed by data copy to dst
// Combine the two transfers into one
const coord3u src_region = { { src_x, src_y, 0 }, { src_w, src_h, 1 } };
const coord3u dst_region = { { slice.dst_x, slice.dst_y, slice.dst_z }, { slice.dst_w, slice.dst_h, 1 } };
gl::copy_typeless(dst_image, slice.src, dst_region, src_region);
continue;
}
const coord3u src_region = { { src_x, src_y, 0 }, { src_w, src_h, 1 } };
const coord3u dst_region = { { src_x2, src_y, 0 }, { src_w2, src_h, 1 } };
gl::copy_typeless(src_image, slice.src, dst_region, src_region);
src_x = src_x2;
src_w = src_w2;
}
if (src_w == slice.dst_w && src_h == slice.dst_h)
{
glCopyImageSubData(src_image->id(), GL_TEXTURE_2D, 0, src_x, src_y, 0,
dst_image->id(), static_cast<GLenum>(dst_image->get_target()), slice.level, slice.dst_x, slice.dst_y, slice.dst_z, src_w, src_h, 1);
}
else
{
ensure(dst_image->get_target() == gl::texture::target::texture2D);
auto _blitter = gl::g_hw_blitter;
const areai src_rect = { src_x, src_y, src_x + src_w, src_y + src_h };
const areai dst_rect = { slice.dst_x, slice.dst_y, slice.dst_x + slice.dst_w, slice.dst_y + slice.dst_h };
gl::texture* _dst;
if (src_image->get_internal_format() == dst_image->get_internal_format() && slice.level == 0)
{
_dst = dst_image;
}
else
{
tmp = std::make_unique<texture>(GL_TEXTURE_2D, dst_rect.x2, dst_rect.y2, 1, 1, static_cast<GLenum>(slice.src->get_internal_format()));
_dst = tmp.get();
}
_blitter->scale_image(cmd, src_image, _dst,
src_rect, dst_rect, false, {});
if (_dst != dst_image)
{
// Data cast comes after scaling
glCopyImageSubData(tmp->id(), GL_TEXTURE_2D, 0, slice.dst_x, slice.dst_y, 0,
dst_image->id(), static_cast<GLenum>(dst_image->get_target()), slice.level, slice.dst_x, slice.dst_y, slice.dst_z, slice.dst_w, slice.dst_h, 1);
}
}
}
}
}

View File

@ -62,7 +62,7 @@ namespace gl
void init_buffer(const gl::texture* src)
{
const u32 vram_size = src->pitch() * src->height();
const u32 buffer_size = align(vram_size, 4096);
const u32 buffer_size = utils::align(vram_size, 4096);
if (pbo)
{
@ -333,86 +333,7 @@ namespace gl
return glMapBufferRange(GL_PIXEL_PACK_BUFFER, offset, size, GL_MAP_READ_BIT);
}
void finish_flush()
{
// Free resources
glUnmapBuffer(GL_PIXEL_PACK_BUFFER);
glBindBuffer(GL_PIXEL_PACK_BUFFER, GL_NONE);
const auto valid_range = get_confirmed_range_delta();
const u32 valid_offset = valid_range.first;
const u32 valid_length = valid_range.second;
void *dst = get_ptr(get_section_base() + valid_offset);
if (!gl::get_driver_caps().ARB_compute_shader_supported)
{
switch (type)
{
case gl::texture::type::sbyte:
case gl::texture::type::ubyte:
{
// byte swapping does not work on byte types, use uint_8_8_8_8 for rgba8 instead to avoid penalty
ensure(!pack_unpack_swap_bytes);
break;
}
case gl::texture::type::uint_24_8:
{
// Swap bytes on D24S8 does not swap the whole dword, just shuffles the 3 bytes for D24
// In this regard, D24S8 is the same structure on both PC and PS3, but the endianness of the whole block is reversed on PS3
ensure(pack_unpack_swap_bytes == false);
ensure(real_pitch == (width * 4));
if (rsx_pitch == real_pitch) [[likely]]
{
stream_data_to_memory_swapped_u32<true>(dst, dst, valid_length / 4, 4);
}
else
{
const u32 num_rows = align(valid_length, rsx_pitch) / rsx_pitch;
u8* data = static_cast<u8*>(dst);
for (u32 row = 0; row < num_rows; ++row)
{
stream_data_to_memory_swapped_u32<true>(data, data, width, 4);
data += rsx_pitch;
}
}
break;
}
default:
break;
}
}
if (is_swizzled())
{
// This format is completely worthless to CPU processing algorithms where cache lines on die are linear.
// If this is happening, usually it means it was not a planned readback (e.g shared pages situation)
rsx_log.warning("[Performance warning] CPU readback of swizzled data");
// Read-modify-write to avoid corrupting already resident memory outside texture region
std::vector<u8> tmp_data(rsx_pitch * height);
std::memcpy(tmp_data.data(), dst, tmp_data.size());
switch (type)
{
case gl::texture::type::uint_8_8_8_8:
case gl::texture::type::uint_24_8:
rsx::convert_linear_swizzle<u32, false>(tmp_data.data(), dst, width, height, rsx_pitch);
break;
case gl::texture::type::ushort_5_6_5:
case gl::texture::type::ushort:
rsx::convert_linear_swizzle<u16, false>(tmp_data.data(), dst, width, height, rsx_pitch);
break;
default:
rsx_log.error("Unexpected swizzled texture format 0x%x", static_cast<u32>(format));
}
}
if (context == rsx::texture_upload_context::framebuffer_storage)
{
// Update memory tag
static_cast<gl::render_target*>(vram_texture)->sync_tag();
}
}
void finish_flush();
/**
* Misc
@ -637,106 +558,7 @@ namespace gl
}
}
void copy_transfer_regions_impl(gl::command_context& cmd, gl::texture* dst_image, const std::vector<copy_region_descriptor>& sources) const
{
const auto dst_bpp = dst_image->pitch() / dst_image->width();
const auto dst_aspect = dst_image->aspect();
for (const auto &slice : sources)
{
if (!slice.src)
continue;
const bool typeless = dst_aspect != slice.src->aspect() ||
!formats_are_bitcast_compatible(static_cast<GLenum>(slice.src->get_internal_format()), static_cast<GLenum>(dst_image->get_internal_format()));
std::unique_ptr<gl::texture> tmp;
auto src_image = slice.src;
auto src_x = slice.src_x;
auto src_y = slice.src_y;
auto src_w = slice.src_w;
auto src_h = slice.src_h;
if (slice.xform == rsx::surface_transform::coordinate_transform)
{
// Dimensions were given in 'dst' space. Work out the real source coordinates
const auto src_bpp = slice.src->pitch() / slice.src->width();
src_x = (src_x * dst_bpp) / src_bpp;
src_w = ::aligned_div<u16>(src_w * dst_bpp, src_bpp);
}
if (auto surface = dynamic_cast<gl::render_target*>(slice.src))
{
surface->transform_samples_to_pixels(src_x, src_w, src_y, src_h);
}
if (typeless) [[unlikely]]
{
const auto src_bpp = slice.src->pitch() / slice.src->width();
const u16 convert_w = u16(slice.src->width() * src_bpp) / dst_bpp;
tmp = std::make_unique<texture>(GL_TEXTURE_2D, convert_w, slice.src->height(), 1, 1, static_cast<GLenum>(dst_image->get_internal_format()));
src_image = tmp.get();
// Compute src region in dst format layout
const u16 src_w2 = u16(src_w * src_bpp) / dst_bpp;
const u16 src_x2 = u16(src_x * src_bpp) / dst_bpp;
if (src_w2 == slice.dst_w && src_h == slice.dst_h && slice.level == 0)
{
// Optimization, avoid typeless copy to tmp followed by data copy to dst
// Combine the two transfers into one
const coord3u src_region = { { src_x, src_y, 0 }, { src_w, src_h, 1 } };
const coord3u dst_region = { { slice.dst_x, slice.dst_y, slice.dst_z }, { slice.dst_w, slice.dst_h, 1 } };
gl::copy_typeless(dst_image, slice.src, dst_region, src_region);
continue;
}
const coord3u src_region = { { src_x, src_y, 0 }, { src_w, src_h, 1 } };
const coord3u dst_region = { { src_x2, src_y, 0 }, { src_w2, src_h, 1 } };
gl::copy_typeless(src_image, slice.src, dst_region, src_region);
src_x = src_x2;
src_w = src_w2;
}
if (src_w == slice.dst_w && src_h == slice.dst_h)
{
glCopyImageSubData(src_image->id(), GL_TEXTURE_2D, 0, src_x, src_y, 0,
dst_image->id(), static_cast<GLenum>(dst_image->get_target()), slice.level, slice.dst_x, slice.dst_y, slice.dst_z, src_w, src_h, 1);
}
else
{
ensure(dst_image->get_target() == gl::texture::target::texture2D);
auto _blitter = gl::g_hw_blitter;
const areai src_rect = { src_x, src_y, src_x + src_w, src_y + src_h };
const areai dst_rect = { slice.dst_x, slice.dst_y, slice.dst_x + slice.dst_w, slice.dst_y + slice.dst_h };
gl::texture* _dst;
if (src_image->get_internal_format() == dst_image->get_internal_format() && slice.level == 0)
{
_dst = dst_image;
}
else
{
tmp = std::make_unique<texture>(GL_TEXTURE_2D, dst_rect.x2, dst_rect.y2, 1, 1, static_cast<GLenum>(slice.src->get_internal_format()));
_dst = tmp.get();
}
_blitter->scale_image(cmd, src_image, _dst,
src_rect, dst_rect, false, {});
if (_dst != dst_image)
{
// Data cast comes after scaling
glCopyImageSubData(tmp->id(), GL_TEXTURE_2D, 0, slice.dst_x, slice.dst_y, 0,
dst_image->id(), static_cast<GLenum>(dst_image->get_target()), slice.level, slice.dst_x, slice.dst_y, slice.dst_z, slice.dst_w, slice.dst_h, 1);
}
}
}
}
void copy_transfer_regions_impl(gl::command_context& cmd, gl::texture* dst_image, const std::vector<copy_region_descriptor>& sources) const;
gl::texture* get_template_from_collection_impl(const std::vector<copy_region_descriptor>& sections_to_transfer) const
{

View File

@ -139,6 +139,52 @@ namespace rsx
fmt::throw_exception("rsx::get_address(offset=0x%x, location=0x%x): %s%s", offset, location, msg, src_loc{line, col, file, func});
}
std::pair<u32, u32> interleaved_range_info::calculate_required_range(u32 first, u32 count) const
{
if (single_vertex)
{
return { 0, 1 };
}
const u32 max_index = (first + count) - 1;
u32 _max_index = 0;
u32 _min_index = first;
for (const auto &attrib : locations)
{
if (attrib.frequency <= 1) [[likely]]
{
_max_index = max_index;
}
else
{
if (attrib.modulo)
{
if (max_index >= attrib.frequency)
{
// Actually uses the modulo operator
_min_index = 0;
_max_index = attrib.frequency - 1;
}
else
{
// Same as having no modulo
_max_index = max_index;
}
}
else
{
// Division operator
_min_index = std::min(_min_index, first / attrib.frequency);
_max_index = std::max<u32>(_max_index, utils::aligned_div(max_index, attrib.frequency));
}
}
}
ensure(_max_index >= _min_index);
return { _min_index, (_max_index - _min_index) + 1 };
}
u32 get_vertex_type_size_on_host(vertex_base_type type, u32 size)
{
switch (type)
@ -2521,7 +2567,7 @@ namespace rsx
}
// Some cases do not need full delay
remaining = ::aligned_div(remaining, div);
remaining = utils::aligned_div(remaining, div);
const u64 until = get_system_time() + remaining;
while (true)

View File

@ -246,51 +246,7 @@ namespace rsx
rsx::simple_array<interleaved_attribute_t> locations;
// Check if we need to upload a full unoptimized range, i.e [0-max_index]
std::pair<u32, u32> calculate_required_range(u32 first, u32 count) const
{
if (single_vertex)
{
return { 0, 1 };
}
const u32 max_index = (first + count) - 1;
u32 _max_index = 0;
u32 _min_index = first;
for (const auto &attrib : locations)
{
if (attrib.frequency <= 1) [[likely]]
{
_max_index = max_index;
}
else
{
if (attrib.modulo)
{
if (max_index >= attrib.frequency)
{
// Actually uses the modulo operator
_min_index = 0;
_max_index = attrib.frequency - 1;
}
else
{
// Same as having no modulo
_max_index = max_index;
}
}
else
{
// Division operator
_min_index = std::min(_min_index, first / attrib.frequency);
_max_index = std::max<u32>(_max_index, aligned_div(max_index, attrib.frequency));
}
}
}
ensure(_max_index >= _min_index);
return { _min_index, (_max_index - _min_index) + 1 };
}
std::pair<u32, u32> calculate_required_range(u32 first, u32 count) const;
};
enum attribute_buffer_placement : u8

View File

@ -5,6 +5,8 @@
#include "Utilities/StrUtil.h"
#include "Emu/IdManager.h"
#include "util/asm.hpp"
#define VK_MAX_COMPUTE_TASKS 4096 // Max number of jobs per frame
namespace vk
@ -296,7 +298,7 @@ namespace vk
"%vars"
"\n";
const auto parameters_size = align(push_constants_size, 16) / 16;
const auto parameters_size = utils::align(push_constants_size, 16) / 16;
const std::pair<std::string, std::string> syntax_replace[] =
{
{ "%ws", std::to_string(optimal_group_size) },
@ -943,7 +945,7 @@ namespace vk
set_parameters(cmd);
const u32 num_bytes_per_invocation = (sizeof(_BlockType) * optimal_group_size);
const u32 linear_invocations = aligned_div(data_length, num_bytes_per_invocation);
const u32 linear_invocations = utils::aligned_div(data_length, num_bytes_per_invocation);
compute_task::run(cmd, linear_invocations);
}
};
@ -997,7 +999,7 @@ namespace vk
word_count = num_words;
block_length = num_words * 4;
const u32 linear_invocations = aligned_div(word_count, optimal_group_size);
const u32 linear_invocations = utils::aligned_div(word_count, optimal_group_size);
compute_task::run(cmd, linear_invocations);
}
};

View File

@ -3,6 +3,8 @@
#include "VKResourceManager.h"
#include "VKDMA.h"
#include "util/asm.hpp"
namespace vk
{
static constexpr usz s_dma_block_length = 0x01000000;
@ -85,7 +87,7 @@ namespace vk
{
if (!inheritance_info.parent)
{
const u32 start = align(range.start, s_page_size);
const u32 start = utils::align(range.start, s_page_size);
const u32 end = ((range.end + 1) & s_page_align);
for (u32 page = start; page < end; page += s_page_size)
@ -259,7 +261,7 @@ namespace vk
}
dma_block* block_head = nullptr;
auto block_end = align(limit, s_dma_block_length);
auto block_end = utils::align(limit, s_dma_block_length);
// Reverse scan to try and find the minimum required length in case of other chaining
for (auto block = last_block; block != first_block; block -= s_dma_block_length)

View File

@ -132,7 +132,7 @@ namespace vk
{
// Create new heap. All sizes are aligned up by 64M, upto 1GiB
const usz size_limit = 1024 * 0x100000;
const usz aligned_new_size = align(m_size + size, 64 * 0x100000);
const usz aligned_new_size = utils::align(m_size + size, 64 * 0x100000);
if (aligned_new_size >= size_limit)
{
@ -351,8 +351,8 @@ namespace vk
{
auto create_texture = [&]()
{
u32 new_width = align(requested_width, 1024u);
u32 new_height = align(requested_height, 1024u);
u32 new_width = utils::align(requested_width, 1024u);
u32 new_height = utils::align(requested_height, 1024u);
return new vk::image(*g_current_renderer, g_current_renderer->get_memory_mapping().device_local, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT,
VK_IMAGE_TYPE_2D, format, new_width, new_height, 1, 1, 1, VK_SAMPLE_COUNT_1_BIT, VK_IMAGE_LAYOUT_UNDEFINED,
@ -388,7 +388,7 @@ namespace vk
if (!g_scratch_buffer)
{
// Choose optimal size
const u64 alloc_size = std::max<u64>(64 * 0x100000, align(min_required_size, 0x100000));
const u64 alloc_size = std::max<u64>(64 * 0x100000, utils::align(min_required_size, 0x100000));
g_scratch_buffer = std::make_unique<vk::buffer>(*g_current_renderer, alloc_size,
g_current_renderer->get_memory_mapping().device_local, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT,

View File

@ -2,6 +2,8 @@
#include "VKGSRender.h"
#include "Emu/Cell/Modules/cellVideoOut.h"
#include "util/asm.hpp"
void VKGSRender::reinitialize_swapchain()
{
m_swapchain_dims.width = m_frame->client_width();
@ -651,7 +653,7 @@ void VKGSRender::flip(const rsx::display_flip_info_t& info)
const usz sshot_size = buffer_height * buffer_width * 4;
vk::buffer sshot_vkbuf(*m_device, align(sshot_size, 0x100000), m_device->get_memory_mapping().host_visible_coherent, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
vk::buffer sshot_vkbuf(*m_device, utils::align(sshot_size, 0x100000), m_device->get_memory_mapping().host_visible_coherent, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
VK_BUFFER_USAGE_TRANSFER_DST_BIT, 0);
VkBufferImageCopy copy_info;

View File

@ -131,8 +131,8 @@ namespace vk
multisampled = msaa_image;
resolve = resolve_image;
const u32 invocations_x = align(resolve_image->width(), cs_wave_x) / cs_wave_x;
const u32 invocations_y = align(resolve_image->height(), cs_wave_y) / cs_wave_y;
const u32 invocations_x = utils::align(resolve_image->width(), cs_wave_x) / cs_wave_x;
const u32 invocations_y = utils::align(resolve_image->height(), cs_wave_y) / cs_wave_y;
compute_task::run(cmd, invocations_x, invocations_y, 1);
}

View File

@ -7,6 +7,8 @@
#include "VKRenderPass.h"
#include "VKRenderTargets.h"
#include "util/asm.hpp"
namespace vk
{
VkComponentMapping default_component_map()
@ -89,7 +91,7 @@ namespace vk
ensure(dst->size() >= allocation_end);
const auto data_offset = u32(region.bufferOffset);
const auto z32_offset = align<u32>(data_offset + packed16_length, 256);
const auto z32_offset = utils::align<u32>(data_offset + packed16_length, 256);
// 1. Copy the depth to buffer
VkBufferImageCopy region2;
@ -135,8 +137,8 @@ namespace vk
ensure(dst->size() >= allocation_end);
const auto data_offset = u32(region.bufferOffset);
const auto z_offset = align<u32>(data_offset + packed_length, 256);
const auto s_offset = align<u32>(z_offset + in_depth_size, 256);
const auto z_offset = utils::align<u32>(data_offset + packed_length, 256);
const auto s_offset = utils::align<u32>(z_offset + in_depth_size, 256);
// 1. Copy the depth and stencil blocks to separate banks
VkBufferImageCopy sub_regions[2];
@ -225,7 +227,7 @@ namespace vk
ensure(src->size() >= allocation_end);
const auto data_offset = u32(region.bufferOffset);
const auto z32_offset = align<u32>(data_offset + packed16_length, 256);
const auto z32_offset = utils::align<u32>(data_offset + packed16_length, 256);
// 1. Pre-compute barrier
vk::insert_buffer_memory_barrier(cmd, src->value, z32_offset, packed32_length,
@ -260,8 +262,8 @@ namespace vk
ensure(src->size() >= allocation_end); // "Out of memory (compute heap). Lower your resolution scale setting."
const auto data_offset = u32(region.bufferOffset);
const auto z_offset = align<u32>(data_offset + packed_length, 256);
const auto s_offset = align<u32>(z_offset + in_depth_size, 256);
const auto z_offset = utils::align<u32>(data_offset + packed_length, 256);
const auto s_offset = utils::align<u32>(z_offset + in_depth_size, 256);
// Zero out the stencil block
vkCmdFillBuffer(cmd, src->value, s_offset, in_stencil_size, 0);
@ -821,7 +823,7 @@ namespace vk
const auto src_offset = section.bufferOffset;
// Align output to 128-byte boundary to keep some drivers happy
dst_offset = align(dst_offset, 128);
dst_offset = utils::align(dst_offset, 128);
u32 data_length = 0;
for (unsigned i = 0, j = packet.first; i < packet.second; ++i, ++j)
@ -930,7 +932,7 @@ namespace vk
if (layout.level == 0)
{
// Align mip0 on a 128-byte boundary
scratch_offset = align(scratch_offset, 128);
scratch_offset = utils::align(scratch_offset, 128);
}
// Copy from upload heap to scratch mem

View File

@ -0,0 +1,360 @@
#include "stdafx.h"
#include "VKGSRender.h"
#include "VKTextureCache.h"
#include "util/asm.hpp"
namespace vk
{
void cached_texture_section::dma_transfer(vk::command_buffer& cmd, vk::image* src, const areai& src_area, const utils::address_range& valid_range, u32 pitch)
{
ensure(src->samples() == 1);
if (!m_device)
{
m_device = &cmd.get_command_pool().get_owner();
}
if (dma_fence)
{
// NOTE: This can be reached if previously synchronized, or a special path happens.
// If a hard flush occurred while this surface was flush_always the cache would have reset its protection afterwards.
// DMA resource would still be present but already used to flush previously.
vk::get_resource_manager()->dispose(dma_fence);
}
if (vk::is_renderpass_open(cmd))
{
vk::end_renderpass(cmd);
}
src->push_layout(cmd, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL);
const auto internal_bpp = vk::get_format_texel_width(src->format());
const auto transfer_width = static_cast<u32>(src_area.width());
const auto transfer_height = static_cast<u32>(src_area.height());
real_pitch = internal_bpp * transfer_width;
rsx_pitch = pitch;
const bool require_format_conversion = !!(src->aspect() & VK_IMAGE_ASPECT_STENCIL_BIT) || src->format() == VK_FORMAT_D32_SFLOAT;
if (require_format_conversion || pack_unpack_swap_bytes)
{
const auto section_length = valid_range.length();
const auto transfer_pitch = real_pitch;
const auto task_length = transfer_pitch * src_area.height();
auto working_buffer = vk::get_scratch_buffer(task_length);
auto final_mapping = vk::map_dma(cmd, valid_range.start, section_length);
VkBufferImageCopy region = {};
region.imageSubresource = { src->aspect(), 0, 0, 1 };
region.imageOffset = { src_area.x1, src_area.y1, 0 };
region.imageExtent = { transfer_width, transfer_height, 1 };
vk::copy_image_to_buffer(cmd, src, working_buffer, region, (require_format_conversion && pack_unpack_swap_bytes));
// NOTE: For depth/stencil formats, copying to buffer and byteswap are combined into one step above
if (pack_unpack_swap_bytes && !require_format_conversion)
{
const auto texel_layout = vk::get_format_element_size(src->format());
const auto elem_size = texel_layout.first;
vk::cs_shuffle_base *shuffle_kernel;
if (elem_size == 2)
{
shuffle_kernel = vk::get_compute_task<vk::cs_shuffle_16>();
}
else if (elem_size == 4)
{
shuffle_kernel = vk::get_compute_task<vk::cs_shuffle_32>();
}
else
{
ensure(get_context() == rsx::texture_upload_context::dma);
shuffle_kernel = nullptr;
}
if (shuffle_kernel)
{
vk::insert_buffer_memory_barrier(cmd, working_buffer->value, 0, task_length,
VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT);
shuffle_kernel->run(cmd, working_buffer, task_length);
vk::insert_buffer_memory_barrier(cmd, working_buffer->value, 0, task_length,
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT,
VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_TRANSFER_READ_BIT);
}
}
if (rsx_pitch == real_pitch) [[likely]]
{
VkBufferCopy copy = {};
copy.dstOffset = final_mapping.first;
copy.size = section_length;
vkCmdCopyBuffer(cmd, working_buffer->value, final_mapping.second->value, 1, &copy);
}
else
{
if (context != rsx::texture_upload_context::dma)
{
// Partial load for the bits outside the existing image
// NOTE: A true DMA section would have been prepped beforehand
// TODO: Parial range load/flush
vk::load_dma(valid_range.start, section_length);
}
std::vector<VkBufferCopy> copy;
copy.reserve(transfer_height);
u32 dst_offset = final_mapping.first;
u32 src_offset = 0;
for (unsigned row = 0; row < transfer_height; ++row)
{
copy.push_back({ src_offset, dst_offset, transfer_pitch });
src_offset += real_pitch;
dst_offset += rsx_pitch;
}
vkCmdCopyBuffer(cmd, working_buffer->value, final_mapping.second->value, transfer_height, copy.data());
}
}
else
{
VkBufferImageCopy region = {};
region.bufferRowLength = (rsx_pitch / internal_bpp);
region.imageSubresource = { src->aspect(), 0, 0, 1 };
region.imageOffset = { src_area.x1, src_area.y1, 0 };
region.imageExtent = { transfer_width, transfer_height, 1 };
auto mapping = vk::map_dma(cmd, valid_range.start, valid_range.length());
region.bufferOffset = mapping.first;
vkCmdCopyImageToBuffer(cmd, src->value, src->current_layout, mapping.second->value, 1, &region);
}
src->pop_layout(cmd);
// Create event object for this transfer and queue signal op
dma_fence = std::make_unique<vk::event>(*m_device);
dma_fence->signal(cmd, VK_PIPELINE_STAGE_TRANSFER_BIT);
// Set cb flag for queued dma operations
cmd.set_flag(vk::command_buffer::cb_has_dma_transfer);
if (get_context() == rsx::texture_upload_context::dma)
{
// Save readback hint in case transformation is required later
switch (internal_bpp)
{
case 2:
gcm_format = CELL_GCM_TEXTURE_R5G6B5;
break;
case 4:
default:
gcm_format = CELL_GCM_TEXTURE_A8R8G8B8;
break;
}
}
synchronized = true;
sync_timestamp = get_system_time();
}
void texture_cache::copy_transfer_regions_impl(vk::command_buffer& cmd, vk::image* dst, const std::vector<copy_region_descriptor>& sections_to_transfer) const
{
const auto dst_aspect = dst->aspect();
const auto dst_bpp = vk::get_format_texel_width(dst->format());
for (const auto &section : sections_to_transfer)
{
if (!section.src)
continue;
const bool typeless = section.src->aspect() != dst_aspect ||
!formats_are_bitcast_compatible(dst, section.src);
// Avoid inserting unnecessary barrier GENERAL->TRANSFER_SRC->GENERAL in active render targets
const auto preferred_layout = (section.src->current_layout != VK_IMAGE_LAYOUT_GENERAL) ?
VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL : VK_IMAGE_LAYOUT_GENERAL;
section.src->push_layout(cmd, preferred_layout);
auto src_image = section.src;
auto src_x = section.src_x;
auto src_y = section.src_y;
auto src_w = section.src_w;
auto src_h = section.src_h;
rsx::flags32_t transform = section.xform;
if (section.xform == rsx::surface_transform::coordinate_transform)
{
// Dimensions were given in 'dst' space. Work out the real source coordinates
const auto src_bpp = vk::get_format_texel_width(section.src->format());
src_x = (src_x * dst_bpp) / src_bpp;
src_w = utils::aligned_div<u16>(src_w * dst_bpp, src_bpp);
transform &= ~(rsx::surface_transform::coordinate_transform);
}
if (auto surface = dynamic_cast<vk::render_target*>(section.src))
{
surface->transform_samples_to_pixels(src_x, src_w, src_y, src_h);
}
if (typeless) [[unlikely]]
{
const auto src_bpp = vk::get_format_texel_width(section.src->format());
const u16 convert_w = u16(src_w * src_bpp) / dst_bpp;
const u16 convert_x = u16(src_x * src_bpp) / dst_bpp;
if (convert_w == section.dst_w && src_h == section.dst_h &&
transform == rsx::surface_transform::identity &&
section.level == 0 && section.dst_z == 0)
{
// Optimization to avoid double transfer
// TODO: Handle level and layer offsets
const areai src_rect = coordi{{ src_x, src_y }, { src_w, src_h }};
const areai dst_rect = coordi{{ section.dst_x, section.dst_y }, { section.dst_w, section.dst_h }};
vk::copy_image_typeless(cmd, section.src, dst, src_rect, dst_rect, 1);
section.src->pop_layout(cmd);
continue;
}
src_image = vk::get_typeless_helper(dst->format(), dst->format_class(), convert_x + convert_w, src_y + src_h);
src_image->change_layout(cmd, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL);
const areai src_rect = coordi{{ src_x, src_y }, { src_w, src_h }};
const areai dst_rect = coordi{{ convert_x, src_y }, { convert_w, src_h }};
vk::copy_image_typeless(cmd, section.src, src_image, src_rect, dst_rect, 1);
src_image->change_layout(cmd, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL);
src_x = convert_x;
src_w = convert_w;
}
ensure(src_image->current_layout == VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL || src_image->current_layout == VK_IMAGE_LAYOUT_GENERAL);
// Final aspect mask of the 'final' transfer source
const auto new_src_aspect = src_image->aspect();
if (src_w == section.dst_w && src_h == section.dst_h && transform == rsx::surface_transform::identity) [[likely]]
{
VkImageCopy copy_rgn;
copy_rgn.srcOffset = { src_x, src_y, 0 };
copy_rgn.dstOffset = { section.dst_x, section.dst_y, 0 };
copy_rgn.dstSubresource = { dst_aspect, 0, 0, 1 };
copy_rgn.srcSubresource = { new_src_aspect, 0, 0, 1 };
copy_rgn.extent = { src_w, src_h, 1 };
if (dst->info.imageType == VK_IMAGE_TYPE_3D)
{
copy_rgn.dstOffset.z = section.dst_z;
}
else
{
copy_rgn.dstSubresource.baseArrayLayer = section.dst_z;
copy_rgn.dstSubresource.mipLevel = section.level;
}
vkCmdCopyImage(cmd, src_image->value, src_image->current_layout, dst->value, dst->current_layout, 1, &copy_rgn);
}
else
{
ensure(section.dst_z == 0);
u16 dst_x = section.dst_x, dst_y = section.dst_y;
vk::image* _dst;
if (src_image->info.format == dst->info.format && section.level == 0) [[likely]]
{
_dst = dst;
}
else
{
// Either a bitcast is required or a scale+copy to mipmap level
_dst = vk::get_typeless_helper(src_image->format(), src_image->format_class(), dst->width(), dst->height() * 2);
_dst->change_layout(cmd, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL);
}
if (transform == rsx::surface_transform::identity)
{
vk::copy_scaled_image(cmd, src_image, _dst,
coordi{ { src_x, src_y }, { src_w, src_h } },
coordi{ { section.dst_x, section.dst_y }, { section.dst_w, section.dst_h } },
1, src_image->format() == _dst->format(),
VK_FILTER_NEAREST);
}
else if (transform == rsx::surface_transform::argb_to_bgra)
{
VkBufferImageCopy copy{};
copy.imageExtent = { src_w, src_h, 1 };
copy.imageOffset = { src_x, src_y, 0 };
copy.imageSubresource = { src_image->aspect(), 0, 0, 1 };
const auto mem_length = src_w * src_h * dst_bpp;
auto scratch_buf = vk::get_scratch_buffer(mem_length);
vkCmdCopyImageToBuffer(cmd, src_image->value, src_image->current_layout, scratch_buf->value, 1, &copy);
vk::insert_buffer_memory_barrier(cmd, scratch_buf->value, 0, mem_length, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT);
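// cs_shuffle_32 byte-swaps each 32-bit element of the scratch buffer, which
// reorders the channels (ARGB <-> BGRA) before the data is re-uploaded below.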
auto shuffle_kernel = vk::get_compute_task<vk::cs_shuffle_32>();
shuffle_kernel->run(cmd, scratch_buf, mem_length);
vk::insert_buffer_memory_barrier(cmd, scratch_buf->value, 0, mem_length, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT,
VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_TRANSFER_READ_BIT);
auto tmp = vk::get_typeless_helper(src_image->format(), src_image->format_class(), section.dst_x + section.dst_w, section.dst_y + section.dst_h);
tmp->change_layout(cmd, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL);
copy.imageOffset = { 0, 0, 0 };
vkCmdCopyBufferToImage(cmd, scratch_buf->value, tmp->value, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 1, &copy);
dst_x = 0;
dst_y = 0;
if (src_w != section.dst_w || src_h != section.dst_h)
{
// Scale if needed
if (tmp == _dst) [[unlikely]]
{
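// tmp and _dst resolve to the same helper image here; writing the scaled
// copy at dst_y = src_h keeps it below the raw rows staged above.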
dst_y = src_h;
}
vk::copy_scaled_image(cmd, tmp, _dst,
areai{ 0, 0, src_w, static_cast<s32>(src_h) },
coordi{ { dst_x, dst_y }, { section.dst_w, section.dst_h } },
1, tmp->info.format == _dst->info.format,
VK_FILTER_NEAREST);
}
else
{
_dst = tmp;
}
}
else
{
fmt::throw_exception("Unreachable");
}
if (_dst != dst) [[unlikely]]
{
// Casting comes after the scaling!
VkImageCopy copy_rgn;
copy_rgn.srcOffset = { s32(dst_x), s32(dst_y), 0 };
copy_rgn.dstOffset = { section.dst_x, section.dst_y, 0 };
copy_rgn.dstSubresource = { dst_aspect, section.level, 0, 1 };
copy_rgn.srcSubresource = { _dst->aspect(), 0, 0, 1 };
copy_rgn.extent = { section.dst_w, section.dst_h, 1 };
_dst->change_layout(cmd, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL);
vkCmdCopyImage(cmd, _dst->value, _dst->current_layout, dst->value, dst->current_layout, 1, &copy_rgn);
}
}
section.src->pop_layout(cmd);
}
}
}

View File

@ -167,160 +167,7 @@ namespace vk
return flushed;
}
void dma_transfer(vk::command_buffer& cmd, vk::image* src, const areai& src_area, const utils::address_range& valid_range, u32 pitch)
{
ensure(src->samples() == 1);
if (!m_device)
{
m_device = &cmd.get_command_pool().get_owner();
}
if (dma_fence)
{
// NOTE: This can be reached if the section was previously synchronized, or through a special path.
// If a hard flush occurred while this surface was flush_always, the cache would have reset its protection afterwards.
// The DMA resource would still be present, but it was already used for a previous flush.
vk::get_resource_manager()->dispose(dma_fence);
}
if (vk::is_renderpass_open(cmd))
{
vk::end_renderpass(cmd);
}
src->push_layout(cmd, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL);
const auto internal_bpp = vk::get_format_texel_width(src->format());
const auto transfer_width = static_cast<u32>(src_area.width());
const auto transfer_height = static_cast<u32>(src_area.height());
real_pitch = internal_bpp * transfer_width;
rsx_pitch = pitch;
const bool require_format_conversion = !!(src->aspect() & VK_IMAGE_ASPECT_STENCIL_BIT) || src->format() == VK_FORMAT_D32_SFLOAT;
if (require_format_conversion || pack_unpack_swap_bytes)
{
const auto section_length = valid_range.length();
const auto transfer_pitch = real_pitch;
const auto task_length = transfer_pitch * src_area.height();
auto working_buffer = vk::get_scratch_buffer(task_length);
auto final_mapping = vk::map_dma(cmd, valid_range.start, section_length);
VkBufferImageCopy region = {};
region.imageSubresource = { src->aspect(), 0, 0, 1 };
region.imageOffset = { src_area.x1, src_area.y1, 0 };
region.imageExtent = { transfer_width, transfer_height, 1 };
vk::copy_image_to_buffer(cmd, src, working_buffer, region, (require_format_conversion && pack_unpack_swap_bytes));
// NOTE: For depth/stencil formats, the copy to the buffer and the byteswap are combined into one step above
if (pack_unpack_swap_bytes && !require_format_conversion)
{
const auto texel_layout = vk::get_format_element_size(src->format());
const auto elem_size = texel_layout.first;
vk::cs_shuffle_base *shuffle_kernel;
if (elem_size == 2)
{
shuffle_kernel = vk::get_compute_task<vk::cs_shuffle_16>();
}
else if (elem_size == 4)
{
shuffle_kernel = vk::get_compute_task<vk::cs_shuffle_32>();
}
else
{
ensure(get_context() == rsx::texture_upload_context::dma);
shuffle_kernel = nullptr;
}
if (shuffle_kernel)
{
vk::insert_buffer_memory_barrier(cmd, working_buffer->value, 0, task_length,
VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT);
shuffle_kernel->run(cmd, working_buffer, task_length);
vk::insert_buffer_memory_barrier(cmd, working_buffer->value, 0, task_length,
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT,
VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_TRANSFER_READ_BIT);
}
}
if (rsx_pitch == real_pitch) [[likely]]
{
VkBufferCopy copy = {};
copy.dstOffset = final_mapping.first;
copy.size = section_length;
vkCmdCopyBuffer(cmd, working_buffer->value, final_mapping.second->value, 1, &copy);
}
else
{
if (context != rsx::texture_upload_context::dma)
{
// Partial load for the bits outside the existing image
// NOTE: A true DMA section would have been prepped beforehand
// TODO: Partial range load/flush
vk::load_dma(valid_range.start, section_length);
}
std::vector<VkBufferCopy> copy;
copy.reserve(transfer_height);
u32 dst_offset = final_mapping.first;
u32 src_offset = 0;
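// The scratch buffer holds tightly packed rows (real_pitch apart) while the
// destination expects rsx_pitch between rows, so each row needs its own copy region.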
for (unsigned row = 0; row < transfer_height; ++row)
{
copy.push_back({ src_offset, dst_offset, transfer_pitch });
src_offset += real_pitch;
dst_offset += rsx_pitch;
}
vkCmdCopyBuffer(cmd, working_buffer->value, final_mapping.second->value, transfer_height, copy.data());
}
}
else
{
VkBufferImageCopy region = {};
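// bufferRowLength is expressed in texels, so the byte pitch is divided by the
// per-texel size to describe the row stride of the destination buffer.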
region.bufferRowLength = (rsx_pitch / internal_bpp);
region.imageSubresource = { src->aspect(), 0, 0, 1 };
region.imageOffset = { src_area.x1, src_area.y1, 0 };
region.imageExtent = { transfer_width, transfer_height, 1 };
auto mapping = vk::map_dma(cmd, valid_range.start, valid_range.length());
region.bufferOffset = mapping.first;
vkCmdCopyImageToBuffer(cmd, src->value, src->current_layout, mapping.second->value, 1, &region);
}
src->pop_layout(cmd);
// Create event object for this transfer and queue signal op
dma_fence = std::make_unique<vk::event>(*m_device);
dma_fence->signal(cmd, VK_PIPELINE_STAGE_TRANSFER_BIT);
// Set cb flag for queued dma operations
cmd.set_flag(vk::command_buffer::cb_has_dma_transfer);
if (get_context() == rsx::texture_upload_context::dma)
{
// Save readback hint in case transformation is required later
switch (internal_bpp)
{
case 2:
gcm_format = CELL_GCM_TEXTURE_R5G6B5;
break;
case 4:
default:
gcm_format = CELL_GCM_TEXTURE_A8R8G8B8;
break;
}
}
synchronized = true;
sync_timestamp = get_system_time();
}
void dma_transfer(vk::command_buffer& cmd, vk::image* src, const areai& src_area, const utils::address_range& valid_range, u32 pitch);
void copy_texture(vk::command_buffer& cmd, bool miss)
{
@ -610,202 +457,7 @@ namespace vk
return mapping;
}
void copy_transfer_regions_impl(vk::command_buffer& cmd, vk::image* dst, const std::vector<copy_region_descriptor>& sections_to_transfer) const
{
const auto dst_aspect = dst->aspect();
const auto dst_bpp = vk::get_format_texel_width(dst->format());
for (const auto &section : sections_to_transfer)
{
if (!section.src)
continue;
const bool typeless = section.src->aspect() != dst_aspect ||
!formats_are_bitcast_compatible(dst, section.src);
// Avoid inserting unnecessary barrier GENERAL->TRANSFER_SRC->GENERAL in active render targets
const auto preferred_layout = (section.src->current_layout != VK_IMAGE_LAYOUT_GENERAL) ?
VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL : VK_IMAGE_LAYOUT_GENERAL;
section.src->push_layout(cmd, preferred_layout);
auto src_image = section.src;
auto src_x = section.src_x;
auto src_y = section.src_y;
auto src_w = section.src_w;
auto src_h = section.src_h;
rsx::flags32_t transform = section.xform;
if (section.xform == rsx::surface_transform::coordinate_transform)
{
// Dimensions were given in 'dst' space. Work out the real source coordinates
const auto src_bpp = vk::get_format_texel_width(section.src->format());
src_x = (src_x * dst_bpp) / src_bpp;
src_w = ::aligned_div<u16>(src_w * dst_bpp, src_bpp);
transform &= ~(rsx::surface_transform::coordinate_transform);
}
if (auto surface = dynamic_cast<vk::render_target*>(section.src))
{
surface->transform_samples_to_pixels(src_x, src_w, src_y, src_h);
}
if (typeless) [[unlikely]]
{
const auto src_bpp = vk::get_format_texel_width(section.src->format());
const u16 convert_w = u16(src_w * src_bpp) / dst_bpp;
const u16 convert_x = u16(src_x * src_bpp) / dst_bpp;
if (convert_w == section.dst_w && src_h == section.dst_h &&
transform == rsx::surface_transform::identity &&
section.level == 0 && section.dst_z == 0)
{
// Optimization to avoid double transfer
// TODO: Handle level and layer offsets
const areai src_rect = coordi{{ src_x, src_y }, { src_w, src_h }};
const areai dst_rect = coordi{{ section.dst_x, section.dst_y }, { section.dst_w, section.dst_h }};
vk::copy_image_typeless(cmd, section.src, dst, src_rect, dst_rect, 1);
section.src->pop_layout(cmd);
continue;
}
src_image = vk::get_typeless_helper(dst->format(), dst->format_class(), convert_x + convert_w, src_y + src_h);
src_image->change_layout(cmd, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL);
const areai src_rect = coordi{{ src_x, src_y }, { src_w, src_h }};
const areai dst_rect = coordi{{ convert_x, src_y }, { convert_w, src_h }};
vk::copy_image_typeless(cmd, section.src, src_image, src_rect, dst_rect, 1);
src_image->change_layout(cmd, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL);
src_x = convert_x;
src_w = convert_w;
}
ensure(src_image->current_layout == VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL || src_image->current_layout == VK_IMAGE_LAYOUT_GENERAL);
// Final aspect mask of the 'final' transfer source
const auto new_src_aspect = src_image->aspect();
if (src_w == section.dst_w && src_h == section.dst_h && transform == rsx::surface_transform::identity) [[likely]]
{
VkImageCopy copy_rgn;
copy_rgn.srcOffset = { src_x, src_y, 0 };
copy_rgn.dstOffset = { section.dst_x, section.dst_y, 0 };
copy_rgn.dstSubresource = { dst_aspect, 0, 0, 1 };
copy_rgn.srcSubresource = { new_src_aspect, 0, 0, 1 };
copy_rgn.extent = { src_w, src_h, 1 };
if (dst->info.imageType == VK_IMAGE_TYPE_3D)
{
copy_rgn.dstOffset.z = section.dst_z;
}
else
{
copy_rgn.dstSubresource.baseArrayLayer = section.dst_z;
copy_rgn.dstSubresource.mipLevel = section.level;
}
vkCmdCopyImage(cmd, src_image->value, src_image->current_layout, dst->value, dst->current_layout, 1, &copy_rgn);
}
else
{
ensure(section.dst_z == 0);
u16 dst_x = section.dst_x, dst_y = section.dst_y;
vk::image* _dst;
if (src_image->info.format == dst->info.format && section.level == 0) [[likely]]
{
_dst = dst;
}
else
{
// Either a bitcast is required or a scale+copy to mipmap level
_dst = vk::get_typeless_helper(src_image->format(), src_image->format_class(), dst->width(), dst->height() * 2);
_dst->change_layout(cmd, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL);
}
if (transform == rsx::surface_transform::identity)
{
vk::copy_scaled_image(cmd, src_image, _dst,
coordi{ { src_x, src_y }, { src_w, src_h } },
coordi{ { section.dst_x, section.dst_y }, { section.dst_w, section.dst_h } },
1, src_image->format() == _dst->format(),
VK_FILTER_NEAREST);
}
else if (transform == rsx::surface_transform::argb_to_bgra)
{
VkBufferImageCopy copy{};
copy.imageExtent = { src_w, src_h, 1 };
copy.imageOffset = { src_x, src_y, 0 };
copy.imageSubresource = { src_image->aspect(), 0, 0, 1 };
const auto mem_length = src_w * src_h * dst_bpp;
auto scratch_buf = vk::get_scratch_buffer(mem_length);
vkCmdCopyImageToBuffer(cmd, src_image->value, src_image->current_layout, scratch_buf->value, 1, &copy);
vk::insert_buffer_memory_barrier(cmd, scratch_buf->value, 0, mem_length, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT);
auto shuffle_kernel = vk::get_compute_task<vk::cs_shuffle_32>();
shuffle_kernel->run(cmd, scratch_buf, mem_length);
vk::insert_buffer_memory_barrier(cmd, scratch_buf->value, 0, mem_length, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT,
VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_TRANSFER_READ_BIT);
auto tmp = vk::get_typeless_helper(src_image->format(), src_image->format_class(), section.dst_x + section.dst_w, section.dst_y + section.dst_h);
tmp->change_layout(cmd, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL);
copy.imageOffset = { 0, 0, 0 };
vkCmdCopyBufferToImage(cmd, scratch_buf->value, tmp->value, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 1, &copy);
dst_x = 0;
dst_y = 0;
if (src_w != section.dst_w || src_h != section.dst_h)
{
// Optionally scale if needed
if (tmp == _dst) [[unlikely]]
{
dst_y = src_h;
}
vk::copy_scaled_image(cmd, tmp, _dst,
areai{ 0, 0, src_w, static_cast<s32>(src_h) },
coordi{ { dst_x, dst_y }, { section.dst_w, section.dst_h } },
1, tmp->info.format == _dst->info.format,
VK_FILTER_NEAREST);
}
else
{
_dst = tmp;
}
}
else
{
fmt::throw_exception("Unreachable");
}
if (_dst != dst) [[unlikely]]
{
// Casting comes after the scaling!
VkImageCopy copy_rgn;
copy_rgn.srcOffset = { s32(dst_x), s32(dst_y), 0 };
copy_rgn.dstOffset = { section.dst_x, section.dst_y, 0 };
copy_rgn.dstSubresource = { dst_aspect, section.level, 0, 1 };
copy_rgn.srcSubresource = { _dst->aspect(), 0, 0, 1 };
copy_rgn.extent = { section.dst_w, section.dst_h, 1 };
_dst->change_layout(cmd, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL);
vkCmdCopyImage(cmd, _dst->value, _dst->current_layout, dst->value, dst->current_layout, 1, &copy_rgn);
}
}
section.src->pop_layout(cmd);
}
}
void copy_transfer_regions_impl(vk::command_buffer& cmd, vk::image* dst, const std::vector<copy_region_descriptor>& sections_to_transfer) const;
vk::image* get_template_from_collection_impl(const std::vector<copy_region_descriptor>& sections_to_transfer) const
{

View File

@ -107,6 +107,7 @@
<ClCompile Include="Emu\RSX\GL\OpenGL.cpp" />
<ClCompile Include="Emu\RSX\GL\GLTexture.cpp" />
<ClCompile Include="Emu\RSX\GL\GLVertexBuffers.cpp" />
<ClCompile Include="Emu\RSX\GL\GLTextureCache.cpp" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">

View File

@ -14,6 +14,7 @@
<ClCompile Include="Emu\RSX\GL\GLShaderInterpreter.cpp" />
<ClCompile Include="Emu\RSX\GL\GLVertexBuffers.cpp" />
<ClCompile Include="Emu\RSX\GL\GLPipelineCompiler.cpp" />
<ClCompile Include="Emu\RSX\GL\GLTextureCache.cpp" />
</ItemGroup>
<ItemGroup>
<ClInclude Include="Emu\RSX\GL\GLTexture.h" />

View File

@ -1,6 +1,8 @@
#include "stdafx.h"
#include "PSF.h"
#include "util/asm.hpp"
LOG_CHANNEL(psf_log, "PSF");
template<>
@ -208,7 +210,7 @@ namespace psf
}
// Align next section (data) offset
key_offset = ::align(key_offset, 4);
key_offset = utils::align(key_offset, 4);
// Generate header
header_t header;

View File

@ -67,6 +67,7 @@
<ClCompile Include="Emu\RSX\VK\VKTexture.cpp" />
<ClCompile Include="Emu\RSX\VK\VKVertexBuffers.cpp" />
<ClCompile Include="Emu\RSX\VK\VKVertexProgram.cpp" />
<ClCompile Include="Emu\RSX\VK\VKTextureCache.cpp" />
<ClCompile Include="Emu\RSX\VK\VKMemAlloc.cpp" />
</ItemGroup>
<ItemGroup>

View File

@ -18,6 +18,7 @@
<ClCompile Include="Emu\RSX\VK\VKTexture.cpp" />
<ClCompile Include="Emu\RSX\VK\VKVertexBuffers.cpp" />
<ClCompile Include="Emu\RSX\VK\VKVertexProgram.cpp" />
<ClCompile Include="Emu\RSX\VK\VKTextureCache.cpp" />
<ClCompile Include="Emu\RSX\VK\VKMemAlloc.cpp" />
<ClCompile Include="Emu\RSX\VK\VKCommandStream.cpp" />
<ClCompile Include="Emu\RSX\VK\VKQueryPool.cpp" />

View File

@ -18,6 +18,7 @@
#include "Emu/Cell/PPUFunction.h"
#include "util/yaml.hpp"
#include "util/asm.hpp"
#include "util/to_endian.hpp"
#include "Utilities/StrUtil.h"
#include "Utilities/bin_patch.h" // get_patches_path()
@ -418,17 +419,17 @@ bool cheat_engine::set_value(const u32 offset, const T value)
if (exec_code_at_end && exec_code_at_start)
{
size = align<u32>(addr + size, 4) - (addr & -4);
size = utils::align<u32>(addr + size, 4) - (addr & -4);
addr &= -4;
}
else if (exec_code_at_end)
{
size -= align<u32>(size - 4096 + (addr & 4095), 4);
addr = align<u32>(addr, 4096);
size -= utils::align<u32>(size - 4096 + (addr & 4095), 4);
addr = utils::align<u32>(addr, 4096);
}
else if (exec_code_at_start)
{
size = align<u32>(4096 - (addr & 4095), 4);
size = utils::align<u32>(4096 - (addr & 4095), 4);
addr &= -4;
}

View File

@ -27,6 +27,8 @@
#include <QVBoxLayout>
#include <QTimer>
#include "util/asm.hpp"
constexpr auto qstr = QString::fromStdString;
debugger_frame::debugger_frame(std::shared_ptr<gui_settings> settings, QWidget *parent)
@ -573,7 +575,7 @@ void debugger_frame::ShowGotoAddressDialog()
if (cpu)
{
// -1 turns into 0
u32 pc = ::align<u32>(cpu->get_pc(), 4);
u32 pc = utils::align<u32>(cpu->get_pc(), 4);
address_preview_label->setText(QString("Address: 0x%1").arg(pc, 8, 16, QChar('0')));
expression_input->setPlaceholderText(QString("0x%1").arg(pc, 8, 16, QChar('0')));
}
@ -605,7 +607,7 @@ void debugger_frame::ShowGotoAddressDialog()
if (diag->exec() == QDialog::Accepted)
{
// -1 turns into 0
u32 address = ::align<u32>(cpu ? cpu->get_pc() : 0, 4);
u32 address = utils::align<u32>(cpu ? cpu->get_pc() : 0, 4);
if (expression_input->text().isEmpty())
{

View File

@ -15,6 +15,8 @@
#include <QWheelEvent>
#include <shared_mutex>
#include "util/asm.hpp"
constexpr auto qstr = QString::fromStdString;
memory_viewer_panel::memory_viewer_panel(QWidget* parent, u32 addr)
@ -293,7 +295,7 @@ void memory_viewer_panel::resizeEvent(QResizeEvent *event)
std::string memory_viewer_panel::getHeaderAtAddr(u32 addr)
{
// Check if it's the beginning of an SPU Local Storage
const u32 spu_boundary = ::align<u32>(addr, SPU_LS_SIZE);
const u32 spu_boundary = utils::align<u32>(addr, SPU_LS_SIZE);
if (spu_boundary <= addr + m_colcount * 4 - 1)
{

View File

@ -15,6 +15,7 @@
#include <charconv>
#include "util/v128.hpp"
#include "util/asm.hpp"
constexpr auto qstr = QString::fromStdString;
inline std::string sstr(const QString& _in) { return _in.toStdString(); }
@ -30,7 +31,7 @@ enum registers : int
ppu_ff31 = ppu_ff0 + 31,
ppu_v0,
ppu_v31 = ppu_v0 + 31,
spu_r0 = ::align(ppu_v31 + 1u, 128),
spu_r0 = utils::align(ppu_v31 + 1u, 128),
spu_r127 = spu_r0 + 127,
PPU_CR,
PPU_LR,

View File

@ -34,6 +34,7 @@
#include <thread>
#include "util/sysinfo.hpp"
#include "util/asm.hpp"
#ifdef WITH_DISCORD_RPC
#include "_discord_utils.h"
@ -1809,7 +1810,7 @@ void settings_dialog::SnapSlider(QSlider *slider, int interval)
{
return;
}
slider->setValue(::rounded_div(value, interval) * interval);
slider->setValue(utils::rounded_div(value, interval) * interval);
});
}

View File

@ -292,6 +292,32 @@ namespace utils
do _mm_pause();
while (__rdtsc() - start < cycles);
}
// Align up to the given alignment, which must be a power of 2
template <typename T, typename = std::enable_if_t<std::is_integral<T>::value && std::is_unsigned<T>::value>>
constexpr T align(T value, ullong align)
{
return static_cast<T>((value + (align - 1)) & (0 - align));
}
// General-purpose aligned division; the result is rounded up, not truncated
template <typename T, typename = std::enable_if_t<std::is_integral<T>::value && std::is_unsigned<T>::value>>
constexpr T aligned_div(T value, ullong align)
{
return static_cast<T>((value + align - 1) / align);
}
// General-purpose division; the result is rounded to the nearest integer
template <typename T, typename = std::enable_if_t<std::is_integral<T>::value>>
constexpr T rounded_div(T value, std::conditional_t<std::is_signed<T>::value, llong, ullong> align)
{
if constexpr (std::is_unsigned<T>::value)
{
return static_cast<T>((value + (align / 2)) / align);
}
return static_cast<T>((value + (value < 0 ? 0 - align : align) / 2) / align);
}
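// Illustrative usage (hypothetical values):
//   align<u32>(13, 16)       == 16  (alignment must be a power of 2)
//   aligned_div<u32>(13, 16) == 1   (13 / 16 rounded up)
//   rounded_div(7, 4)        == 2,  rounded_div(-7, 4) == -2 (rounded to nearest)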
} // namespace utils
using utils::busy_wait;

View File

@ -15,6 +15,8 @@
#include <errno.h>
#endif
#include "util/asm.hpp"
inline std::array<u32, 4> utils::get_cpuid(u32 func, u32 subfunc)
{
int regs[4];
@ -298,7 +300,7 @@ std::string utils::get_OS_version()
static constexpr ullong round_tsc(ullong val)
{
return ::rounded_div(val, 1'000'000) * 1'000'000;
return utils::rounded_div(val, 1'000'000) * 1'000'000;
}
ullong utils::get_tsc_freq()

View File

@ -595,31 +595,6 @@ struct f16
}
};
template <typename T, typename = std::enable_if_t<std::is_integral<T>::value && std::is_unsigned<T>::value>>
constexpr T align(T value, ullong align)
{
return static_cast<T>((value + (align - 1)) & (0 - align));
}
// General purpose aligned division, the result is rounded up not truncated
template <typename T, typename = std::enable_if_t<std::is_integral<T>::value && std::is_unsigned<T>::value>>
constexpr T aligned_div(T value, ullong align)
{
return static_cast<T>((value + align - 1) / align);
}
// General purpose aligned division, the result is rounded to nearest
template <typename T, typename = std::enable_if_t<std::is_integral<T>::value>>
constexpr T rounded_div(T value, std::conditional_t<std::is_signed<T>::value, llong, ullong> align)
{
if constexpr (std::is_unsigned<T>::value)
{
return static_cast<T>((value + (align / 2)) / align);
}
return static_cast<T>((value + (value < 0 ? 0 - align : align) / 2) / align);
}
template <typename T, typename T2>
inline u32 offset32(T T2::*const mptr)
{

View File

@ -1,6 +1,7 @@
#include "stdafx.h"
#include "util/logs.hpp"
#include "util/vm.hpp"
#include "util/asm.hpp"
#ifdef _WIN32
#include "util/dyn_lib.hpp"
#include <Windows.h>
@ -209,7 +210,7 @@ namespace utils
}
shm::shm(u32 size, u32 flags)
: m_size(::align(size, 0x10000))
: m_size(utils::align(size, 0x10000))
, m_flags(flags)
, m_ptr(0)
{
@ -306,7 +307,7 @@ namespace utils
{
const u64 res64 = reinterpret_cast<u64>(::mmap(reinterpret_cast<void*>(ptr64), m_size + 0xf000, PROT_NONE, MAP_ANON | MAP_PRIVATE, -1, 0));
const u64 aligned = ::align(res64, 0x10000);
const u64 aligned = utils::align(res64, 0x10000);
const auto result = ::mmap(reinterpret_cast<void*>(aligned), m_size, +prot, MAP_SHARED | MAP_FIXED, m_file, 0);
// Now cleanup remnants