From 362a26a4043084a7e08d036f8d1335696dfbacd0 Mon Sep 17 00:00:00 2001 From: kd-11 Date: Wed, 21 Sep 2022 23:15:40 +0300 Subject: [PATCH] gl: Fix D24X8 accelerated encode/decode - PS3 D24X8 is swapped as a full word, unlike PC. - Add missing paths to handle custom swap behavior. --- rpcs3/Emu/RSX/GL/GLCompute.cpp | 46 ++++++++++++++----- rpcs3/Emu/RSX/GL/GLCompute.h | 2 + rpcs3/Emu/RSX/GL/GLTexture.cpp | 37 ++++++++++++--- .../CopyBufferToGenericImage.glsl | 16 ++----- .../GLSLSnippets/CopyD24x8ToBuffer.glsl | 8 +++- 5 files changed, 76 insertions(+), 33 deletions(-) diff --git a/rpcs3/Emu/RSX/GL/GLCompute.cpp b/rpcs3/Emu/RSX/GL/GLCompute.cpp index e290be4d88..b2c0ab97df 100644 --- a/rpcs3/Emu/RSX/GL/GLCompute.cpp +++ b/rpcs3/Emu/RSX/GL/GLCompute.cpp @@ -207,7 +207,8 @@ namespace gl compute_task::run(cmd, num_invocations); } - cs_shuffle_d32fx8_to_x8d24f::cs_shuffle_d32fx8_to_x8d24f() + template + cs_shuffle_d32fx8_to_x8d24f::cs_shuffle_d32fx8_to_x8d24f() { uniforms = "uniform uint in_ptr, out_ptr;\n"; @@ -223,15 +224,22 @@ namespace gl " value |= stencil;\n" " data[index + out_ptr] = bswap_u32(value);\n"; + if constexpr (!SwapBytes) + { + work_kernel = fmt::replace_all(work_kernel, "bswap_u32(value)", "value", 1); + } + cs_shuffle_base::build(""); } - void cs_shuffle_d32fx8_to_x8d24f::bind_resources() + template + void cs_shuffle_d32fx8_to_x8d24f::bind_resources() { m_data->bind_range(gl::buffer::target::ssbo, GL_COMPUTE_BUFFER_SLOT(0), m_data_offset, m_ssbo_length); } - void cs_shuffle_d32fx8_to_x8d24f::run(gl::command_context& cmd, const gl::buffer* data, u32 src_offset, u32 dst_offset, u32 num_texels) + template + void cs_shuffle_d32fx8_to_x8d24f::run(gl::command_context& cmd, const gl::buffer* data, u32 src_offset, u32 dst_offset, u32 num_texels) { u32 data_offset; if (src_offset > dst_offset) @@ -250,7 +258,11 @@ namespace gl cs_shuffle_base::run(cmd, data, num_texels * 4, data_offset); } - cs_shuffle_x8d24f_to_d32fx8::cs_shuffle_x8d24f_to_d32fx8() + template cs_shuffle_d32fx8_to_x8d24f; + template cs_shuffle_d32fx8_to_x8d24f; + + template + cs_shuffle_x8d24f_to_d32fx8::cs_shuffle_x8d24f_to_d32fx8() { uniforms = "uniform uint texel_count, in_ptr, out_ptr;\n"; @@ -267,15 +279,22 @@ namespace gl " data[index * 2 + out_offset] = d24f_to_f32(depth);\n" " data[index * 2 + (out_offset + 1)] = stencil;\n"; + if constexpr (!SwapBytes) + { + work_kernel = fmt::replace_all(work_kernel, "value = bswap_u32(value)", "// value = bswap_u32(value)", 1); + } + cs_shuffle_base::build(""); } - void cs_shuffle_x8d24f_to_d32fx8::bind_resources() + template + void cs_shuffle_x8d24f_to_d32fx8::bind_resources() { m_data->bind_range(gl::buffer::target::ssbo, GL_COMPUTE_BUFFER_SLOT(0), m_data_offset, m_ssbo_length); } - void cs_shuffle_x8d24f_to_d32fx8::run(gl::command_context& cmd, const gl::buffer* data, u32 src_offset, u32 dst_offset, u32 num_texels) + template + void cs_shuffle_x8d24f_to_d32fx8::run(gl::command_context& cmd, const gl::buffer* data, u32 src_offset, u32 dst_offset, u32 num_texels) { u32 data_offset; if (src_offset > dst_offset) @@ -294,6 +313,9 @@ namespace gl cs_shuffle_base::run(cmd, data, num_texels * 4, data_offset); } + template cs_shuffle_x8d24f_to_d32fx8; + template cs_shuffle_x8d24f_to_d32fx8; + cs_d24x8_to_ssbo::cs_d24x8_to_ssbo() { initialize(); @@ -332,11 +354,11 @@ namespace gl } // This method is callable in sensitive code and must restore the GL state on exit - gl::saved_sampler_state save_0(GL_COMPUTE_BUFFER_SLOT(0), m_sampler); - gl::saved_sampler_state save_1(GL_COMPUTE_BUFFER_SLOT(1), m_sampler); + gl::saved_sampler_state save_sampler0(GL_COMPUTE_BUFFER_SLOT(0), m_sampler); + gl::saved_sampler_state save_sampler1(GL_COMPUTE_BUFFER_SLOT(1), m_sampler); - gl::bind_image_view_safe(cmd, GL_COMPUTE_BUFFER_SLOT(0), depth_view); - gl::bind_image_view_safe(cmd, GL_COMPUTE_BUFFER_SLOT(1), stencil_view); + gl::bind_image_view_safe save_image1(cmd, GL_COMPUTE_BUFFER_SLOT(0), depth_view); + gl::bind_image_view_safe save_image2(cmd, GL_COMPUTE_BUFFER_SLOT(1), stencil_view); dst->bind_range(gl::buffer::target::ssbo, GL_COMPUTE_BUFFER_SLOT(2), out_offset, row_pitch * 4 * region.height); @@ -383,8 +405,8 @@ namespace gl } // This method is callable in sensitive code and must restore the GL state on exit - gl::saved_sampler_state save(GL_COMPUTE_BUFFER_SLOT(0), m_sampler); - gl::bind_image_view_safe(cmd, GL_COMPUTE_BUFFER_SLOT(0), data_view); + gl::saved_sampler_state save_sampler(GL_COMPUTE_BUFFER_SLOT(0), m_sampler); + gl::bind_image_view_safe save_image(cmd, GL_COMPUTE_BUFFER_SLOT(0), data_view); dst->bind_range(gl::buffer::target::ssbo, GL_COMPUTE_BUFFER_SLOT(1), out_offset, row_pitch * 4 * region.height); diff --git a/rpcs3/Emu/RSX/GL/GLCompute.h b/rpcs3/Emu/RSX/GL/GLCompute.h index 2e911f7ae4..380ef0f2eb 100644 --- a/rpcs3/Emu/RSX/GL/GLCompute.h +++ b/rpcs3/Emu/RSX/GL/GLCompute.h @@ -78,6 +78,7 @@ namespace gl } }; + template struct cs_shuffle_d32fx8_to_x8d24f : cs_shuffle_base { u32 m_ssbo_length = 0; @@ -89,6 +90,7 @@ namespace gl void run(gl::command_context& cmd, const gl::buffer* data, u32 src_offset, u32 dst_offset, u32 num_texels); }; + template struct cs_shuffle_x8d24f_to_d32fx8 : cs_shuffle_base { u32 m_ssbo_length = 0; diff --git a/rpcs3/Emu/RSX/GL/GLTexture.cpp b/rpcs3/Emu/RSX/GL/GLTexture.cpp index 76ab759205..0532de4850 100644 --- a/rpcs3/Emu/RSX/GL/GLTexture.cpp +++ b/rpcs3/Emu/RSX/GL/GLTexture.cpp @@ -289,10 +289,10 @@ namespace gl } if (auto as_vi = dynamic_cast(src); - gl::get_driver_caps().vendor_AMD && src->get_target() == gl::texture::target::texture2D && as_vi) { + // RGBA8 <-> D24X8 bitcasts are some very common conversions due to some PS3 coding hacks & workarounds. switch (src->get_internal_format()) { case gl::texture::internal_format::depth24_stencil8: @@ -337,8 +337,16 @@ namespace gl mem_info->memory_required = (mem_info->image_size_in_texels * 6); ensure(!initialize_scratch_mem()); - get_compute_task>()->run(cmd, dst, dst_offset, - static_cast(mem_info->image_size_in_bytes), static_cast(mem_info->image_size_in_bytes)); + if (pack_info.swap_bytes) [[ likely ]] + { + get_compute_task>()->run(cmd, dst, dst_offset, + static_cast(mem_info->image_size_in_bytes), static_cast(mem_info->image_size_in_bytes)); + } + else + { + get_compute_task>()->run(cmd, dst, dst_offset, + static_cast(mem_info->image_size_in_bytes), static_cast(mem_info->image_size_in_bytes)); + } result = reinterpret_cast(mem_info->image_size_in_bytes + dst_offset); } else if (pack_info.type == GL_FLOAT_32_UNSIGNED_INT_24_8_REV) @@ -347,8 +355,16 @@ namespace gl mem_info->memory_required = (mem_info->image_size_in_texels * 12); ensure(!initialize_scratch_mem()); - get_compute_task()->run(cmd, dst, dst_offset, - static_cast(mem_info->image_size_in_bytes), static_cast(mem_info->image_size_in_texels)); + if (pack_info.swap_bytes) + { + get_compute_task>()->run(cmd, dst, dst_offset, + static_cast(mem_info->image_size_in_bytes), static_cast(mem_info->image_size_in_texels)); + } + else + { + get_compute_task>()->run(cmd, dst, dst_offset, + static_cast(mem_info->image_size_in_bytes), static_cast(mem_info->image_size_in_texels)); + } result = reinterpret_cast(mem_info->image_size_in_bytes + dst_offset); } else @@ -501,7 +517,6 @@ namespace gl else { // Stencil format on NV. Use driver upload path - if (unpack_info.type == GL_UNSIGNED_INT_24_8) { if (auto job = get_trivial_transform_job(unpack_info)) @@ -517,7 +532,15 @@ namespace gl { mem_info->memory_required = (mem_info->image_size_in_texels * 8); initialize_scratch_mem(); - get_compute_task()->run(cmd, transfer_buf, in_offset, out_offset, static_cast(mem_info->image_size_in_texels)); + + if (unpack_info.swap_bytes) + { + get_compute_task>()->run(cmd, transfer_buf, in_offset, out_offset, static_cast(mem_info->image_size_in_texels)); + } + else + { + get_compute_task>()->run(cmd, transfer_buf, in_offset, out_offset, static_cast(mem_info->image_size_in_texels)); + } } else { diff --git a/rpcs3/Emu/RSX/Program/GLSLSnippets/CopyBufferToGenericImage.glsl b/rpcs3/Emu/RSX/Program/GLSLSnippets/CopyBufferToGenericImage.glsl index f73a54751d..64d29d37f9 100644 --- a/rpcs3/Emu/RSX/Program/GLSLSnippets/CopyBufferToGenericImage.glsl +++ b/rpcs3/Emu/RSX/Program/GLSLSnippets/CopyBufferToGenericImage.glsl @@ -14,8 +14,8 @@ R"( #define FMT_GL_BGR5_A1 0x99F0 #define FMT_GL_RGBA4 0x8056 -#define bswap_u16(bits) (bits & 0xFF) << 8 | (bits & 0xFF00) >> 8 | (bits & 0xFF0000) << 8 | (bits & 0xFF000000) >> 8 -#define bswap_u32(bits) (bits & 0xFF) << 24 | (bits & 0xFF00) << 8 | (bits & 0xFF0000) >> 8 | (bits & 0xFF000000) >> 24 +#define bswap_u16(bits) (bits & 0xFFu) << 8u | (bits & 0xFF00u) >> 8u | (bits & 0xFF0000u) << 8u | (bits & 0xFF000000u) >> 8u +#define bswap_u32(bits) (bits & 0xFFu) << 24u | (bits & 0xFF00u) << 8u | (bits & 0xFF0000u) >> 8u | (bits & 0xFF000000u) >> 24u layout(location=0) out vec4 outColor; @@ -73,18 +73,10 @@ uint readUint32(const in uint address) uvec2 readUint24_8(const in uint address) { - const uint raw_value = data[address]; - const uint stencil = bitfieldExtract(raw_value, 0, 8); - - if (swap_bytes != 0) - { - const uint depth = min(bswap_u32(raw_value), 0xffffff); - return uvec2(depth, stencil); - } - + const uint raw_value = readUint32(address); return uvec2( bitfieldExtract(raw_value, 8, 24), - stencil + bitfieldExtract(raw_value, 0, 8) ); } diff --git a/rpcs3/Emu/RSX/Program/GLSLSnippets/CopyD24x8ToBuffer.glsl b/rpcs3/Emu/RSX/Program/GLSLSnippets/CopyD24x8ToBuffer.glsl index 9ae21bfd7c..afcac4b38e 100644 --- a/rpcs3/Emu/RSX/Program/GLSLSnippets/CopyD24x8ToBuffer.glsl +++ b/rpcs3/Emu/RSX/Program/GLSLSnippets/CopyD24x8ToBuffer.glsl @@ -5,6 +5,8 @@ layout(local_size_x = %ws, local_size_y = 1, local_size_z = 1) in; #define IMAGE_LOCATION(x) (x + %loc) #define SSBO_LOCATION IMAGE_LOCATION(2) +#define bswap_u32(bits) (bits & 0xFFu) << 24u | (bits & 0xFF00u) << 8u | (bits & 0xFF0000u) >> 8u | (bits & 0xFF000000u) >> 24u + layout(%set, binding=IMAGE_LOCATION(0)) uniform sampler2D depthData; layout(%set, binding=IMAGE_LOCATION(1)) uniform usampler2D stencilData; @@ -62,13 +64,15 @@ void main() float depth = texelFetch(depthData, coord, 0).x; uint stencil = texelFetch(stencilData, coord, 0).x; uint depth_bytes = uint(depth * 0xffffff); + uint value = (depth_bytes << 8) | stencil; if (swap_bytes != 0) { - depth_bytes = (bitfieldExtract(depth_bytes, 0, 8) << 16u) | (bitfieldExtract(depth_bytes, 16, 8) << 0u) | depth_bytes & 0xFF00u; + // PS3-style byteswap (full word). PC byteswap is slightly different. + value = bswap_u32(value); } - data[input_coord_to_output_id(coord)] = (depth_bytes << 8) | stencil; + data[input_coord_to_output_id(coord)] = value; } } )"