From cbd895a29c86f4e044ca00d55dff12e3705897ee Mon Sep 17 00:00:00 2001 From: Megamouse Date: Wed, 10 Mar 2021 00:58:08 +0100 Subject: [PATCH] Move code to cpp (#9938) * GL: move GLOverlays code to cpp * GL: move GLCompute code to cpp * VK: move VKOverlays code to cpp * VK: move VKCompute code to cpp --- rpcs3/Emu/CMakeLists.txt | 4 + rpcs3/Emu/RSX/GL/GLCompute.cpp | 297 ++++++ rpcs3/Emu/RSX/GL/GLCompute.h | 300 +----- rpcs3/Emu/RSX/GL/GLExecutionState.h | 1 + rpcs3/Emu/RSX/GL/GLOverlays.cpp | 648 ++++++++++++ rpcs3/Emu/RSX/GL/GLOverlays.h | 649 +----------- rpcs3/Emu/RSX/Overlays/overlay_animation.h | 2 + rpcs3/Emu/RSX/Overlays/overlay_controls.h | 1 + rpcs3/Emu/RSX/VK/VKCompute.cpp | 428 ++++++++ rpcs3/Emu/RSX/VK/VKCompute.h | 442 +------- rpcs3/Emu/RSX/VK/VKFramebuffer.cpp | 2 +- rpcs3/Emu/RSX/VK/VKOverlays.cpp | 1049 +++++++++++++++++++ rpcs3/Emu/RSX/VK/VKOverlays.h | 1088 ++------------------ rpcs3/Emu/RSX/VK/VKPresent.cpp | 1 + rpcs3/Emu/RSX/VK/VKResolveHelper.h | 2 + rpcs3/GLGSRender.vcxproj | 2 + rpcs3/GLGSRender.vcxproj.filters | 2 + rpcs3/VKGSRender.vcxproj | 2 + rpcs3/VKGSRender.vcxproj.filters | 2 + 19 files changed, 2578 insertions(+), 2344 deletions(-) create mode 100644 rpcs3/Emu/RSX/GL/GLCompute.cpp create mode 100644 rpcs3/Emu/RSX/GL/GLOverlays.cpp create mode 100644 rpcs3/Emu/RSX/VK/VKCompute.cpp create mode 100644 rpcs3/Emu/RSX/VK/VKOverlays.cpp diff --git a/rpcs3/Emu/CMakeLists.txt b/rpcs3/Emu/CMakeLists.txt index 7f98d81763..e0b056e2ba 100644 --- a/rpcs3/Emu/CMakeLists.txt +++ b/rpcs3/Emu/CMakeLists.txt @@ -430,10 +430,12 @@ target_sources(rpcs3_emu PRIVATE RSX/Capture/rsx_capture.cpp RSX/Capture/rsx_replay.cpp RSX/GL/GLCommonDecompiler.cpp + RSX/GL/GLCompute.cpp RSX/GL/GLDraw.cpp RSX/GL/GLFragmentProgram.cpp RSX/GL/GLGSRender.cpp RSX/GL/GLHelpers.cpp + RSX/GL/GLOverlays.cpp RSX/GL/GLPipelineCompiler.cpp RSX/GL/GLPresent.cpp RSX/GL/GLRenderTargets.cpp @@ -462,6 +464,7 @@ if(TARGET 3rdparty_vulkan) RSX/VK/vkutils/shared.cpp 
RSX/VK/VKCommandStream.cpp RSX/VK/VKCommonDecompiler.cpp + RSX/VK/VKCompute.cpp RSX/VK/VKDMA.cpp RSX/VK/VKDraw.cpp RSX/VK/VKFormats.cpp @@ -470,6 +473,7 @@ if(TARGET 3rdparty_vulkan) RSX/VK/VKGSRender.cpp RSX/VK/VKHelpers.cpp RSX/VK/VKMemAlloc.cpp + RSX/VK/VKOverlays.cpp RSX/VK/VKPipelineCompiler.cpp RSX/VK/VKPresent.cpp RSX/VK/VKProgramPipeline.cpp diff --git a/rpcs3/Emu/RSX/GL/GLCompute.cpp b/rpcs3/Emu/RSX/GL/GLCompute.cpp new file mode 100644 index 0000000000..ff19922581 --- /dev/null +++ b/rpcs3/Emu/RSX/GL/GLCompute.cpp @@ -0,0 +1,297 @@ +#include "GLCompute.h" +#include "Utilities/StrUtil.h" + +namespace gl +{ + void compute_task::initialize() // Selects a vendor-specific work group size and queries the dispatch limit along X + { + // Set up optimal kernel size + const auto& caps = gl::get_driver_caps(); + if (caps.vendor_AMD || caps.vendor_MESA) + { + optimal_group_size = 64; + unroll_loops = false; + } + else if (caps.vendor_NVIDIA) + { + optimal_group_size = 32; + } + else + { + optimal_group_size = 128; + } + + glGetIntegeri_v(GL_MAX_COMPUTE_WORK_GROUP_COUNT, 0, reinterpret_cast<GLint*>(&max_invocations_x)); + } + + void compute_task::create() // Compiles and links the compute program from m_src; does nothing if already compiled + { + if (!compiled) + { + m_shader.create(::glsl::program_domain::glsl_compute_program, m_src); + m_shader.compile(); + + m_program.create(); + m_program.attach(m_shader); + m_program.link(); + + compiled = true; + } + } + + void compute_task::destroy() // Releases the program and shader; does nothing if they were never compiled + { + if (compiled) + { + m_program.remove(); + m_shader.remove(); + + compiled = false; + } + } + + void compute_task::run(u32 invocations_x, u32 invocations_y) // Binds resources, dispatches an x * y * 1 grid of work groups and restores the previously bound program + { + GLint old_program; + glGetIntegerv(GL_CURRENT_PROGRAM, &old_program); + + bind_resources(); + m_program.use(); + glDispatchCompute(invocations_x, invocations_y, 1); + + glUseProgram(old_program); + } + + void compute_task::run(u32 num_invocations) // Dispatches at least num_invocations work groups: 1D if that fits in max_invocations_x, otherwise a near-square 2D grid + { + u32 invocations_x, invocations_y; + if (num_invocations <= max_invocations_x) [[likely]] + { + invocations_x = num_invocations; + invocations_y = 1; + } + else + { + // Since all the invocations will run, the optimal distribution is 
sqrt(count) + const u32 optimal_length = static_cast<u32>(floor(std::sqrt(num_invocations))); + invocations_x = optimal_length; + + // Round up: floor(sqrt(n)) rows of that length cover n only when n is a perfect square + invocations_y = (num_invocations + invocations_x - 1) / invocations_x; + } + + run(invocations_x, invocations_y); + } + + cs_shuffle_base::cs_shuffle_base() // Default kernel pieces: transform each word with %f, then advance the index + { + work_kernel = + " value = data[index];\n" + " data[index] = %f(value);\n"; + + loop_advance = + " index++;\n"; + + suffix = + "}\n"; + } + + void cs_shuffle_base::build(const char* function_name, u32 _kernel_size) // Generates m_src from the template below, substituting the %-placeholders; _kernel_size == 0 selects optimal_kernel_size + { + // Initialize to allow detecting optimal settings + initialize(); + + kernel_size = _kernel_size? _kernel_size : optimal_kernel_size; + + m_src = + "#version 430\n" + "layout(local_size_x=%ws, local_size_y=1, local_size_z=1) in;\n" + "layout(binding=%loc, std430) buffer ssbo{ uint data[]; };\n" + "%ub" + "\n" + "#define KERNEL_SIZE %ks\n" + "\n" + "// Generic swap routines\n" + "#define bswap_u16(bits) (bits & 0xFF) << 8 | (bits & 0xFF00) >> 8 | (bits & 0xFF0000) << 8 | (bits & 0xFF000000) >> 8\n" + "#define bswap_u32(bits) (bits & 0xFF) << 24 | (bits & 0xFF00) << 8 | (bits & 0xFF0000) >> 8 | (bits & 0xFF000000) >> 24\n" + "#define bswap_u16_u32(bits) (bits & 0xFFFF) << 16 | (bits & 0xFFFF0000) >> 16\n" + "\n" + "// Depth format conversions\n" + "#define d24f_to_f32(bits) (bits << 7)\n" + "#define f32_to_d24f(bits) (bits >> 7)\n" + "\n" + "uint linear_invocation_id()\n" + "{\n" + " uint size_in_x = (gl_NumWorkGroups.x * gl_WorkGroupSize.x);\n" + " return (gl_GlobalInvocationID.y * size_in_x) + gl_GlobalInvocationID.x;\n" + "}\n" + "\n" + "%md" + "void main()\n" + "{\n" + " uint invocation_id = linear_invocation_id();\n" + " uint index = invocation_id * KERNEL_SIZE;\n" + " uint value;\n" + " %vars" + "\n"; + + const std::pair syntax_replace[] = // NOTE(review): the pair's template arguments are missing here; they must match the array type fmt::replace_all() accepts + { + { "%loc", std::to_string(GL_COMPUTE_BUFFER_SLOT(0)) }, + { "%ws", std::to_string(optimal_group_size) }, + { "%ks", std::to_string(kernel_size) }, + { "%vars", variables }, + { "%f", function_name }, + { 
"%ub", uniforms }, + { "%md", method_declarations } + }; + + m_src = fmt::replace_all(m_src, syntax_replace); + work_kernel = fmt::replace_all(work_kernel, syntax_replace); + + if (kernel_size <= 1) + { + m_src += " {\n" + work_kernel + " }\n"; + } + else if (unroll_loops) + { + work_kernel += loop_advance + "\n"; + + m_src += std::string + ( + " //Unrolled loop\n" + " {\n" + ); + + // Assemble body with manual loop unroll to try lowering GPR usage + for (u32 n = 0; n < kernel_size; ++n) + { + m_src += work_kernel; + } + + m_src += " }\n"; + } + else + { + m_src += " for (int loop = 0; loop < KERNEL_SIZE; ++loop)\n"; + m_src += " {\n"; + m_src += work_kernel; + m_src += loop_advance; + m_src += " }\n"; + } + + m_src += suffix; + } + + void cs_shuffle_base::bind_resources() // Binds m_data_length bytes of m_data, starting at m_data_offset, to compute buffer slot 0 + { + m_data->bind_range(gl::buffer::target::ssbo, GL_COMPUTE_BUFFER_SLOT(0), m_data_offset, m_data_length); + } + + void cs_shuffle_base::run(const gl::buffer* data, u32 data_length, u32 data_offset) // Runs the kernel over data_length bytes at data_offset, rounded up to whole work groups + { + m_data = data; + m_data_offset = data_offset; + m_data_length = data_length; + + const auto num_bytes_per_invocation = optimal_group_size * kernel_size * 4; + const auto num_bytes_to_process = utils::align(data_length, num_bytes_per_invocation); + const auto num_invocations = num_bytes_to_process / num_bytes_per_invocation; + + if ((num_bytes_to_process + data_offset) > data->size()) + { + // Technically robust buffer access should keep the driver from crashing in OOB situations + rsx_log.error("Inadequate buffer length submitted for a compute operation." 
+ " Required=%d bytes, Available=%d bytes", num_bytes_to_process, data->size()); + } + + compute_task::run(num_invocations); + } + + cs_shuffle_d32fx8_to_x8d24f::cs_shuffle_d32fx8_to_x8d24f() // Kernel that packs each (depth, stencil) word pair into one byte-swapped 32-bit word + { + uniforms = "uniform uint in_ptr, out_ptr;\n"; + + variables = + " uint in_offset = in_ptr >> 2;\n" + " uint out_offset = out_ptr >> 2;\n" + " uint depth, stencil;\n"; + + work_kernel = // data[] is indexed in 32-bit words, so address it with the word offsets, not the byte pointers + " depth = data[index * 2 + in_offset];\n" + " stencil = data[index * 2 + (in_offset + 1)] & 0xFFu;\n" + " value = f32_to_d24f(depth) << 8;\n" + " value |= stencil;\n" + " data[index + out_offset] = bswap_u32(value);\n"; + + cs_shuffle_base::build(""); + } + + void cs_shuffle_d32fx8_to_x8d24f::bind_resources() // Binds the combined source/destination range computed by run() + { + m_data->bind_range(gl::buffer::target::ssbo, GL_COMPUTE_BUFFER_SLOT(0), m_data_offset, m_ssbo_length); + } + + void cs_shuffle_d32fx8_to_x8d24f::run(const gl::buffer* data, u32 src_offset, u32 dst_offset, u32 num_texels) // Binds one range covering both regions (8 bytes per source texel, 4 per destination texel) and passes the offsets relative to it + { + u32 data_offset; + if (src_offset > dst_offset) + { + data_offset = dst_offset; + m_ssbo_length = (src_offset + num_texels * 8) - data_offset; + } + else + { + data_offset = src_offset; + m_ssbo_length = (dst_offset + num_texels * 4) - data_offset; + } + + m_program.uniforms["in_ptr"] = src_offset - data_offset; + m_program.uniforms["out_ptr"] = dst_offset - data_offset; + cs_shuffle_base::run(data, num_texels * 4, data_offset); + } + + cs_shuffle_x8d24f_to_d32fx8::cs_shuffle_x8d24f_to_d32fx8() // Kernel that expands each byte-swapped 32-bit word into a (depth, stencil) word pair + { + uniforms = "uniform uint texel_count, in_ptr, out_ptr;\n"; + + variables = + " uint in_offset = in_ptr >> 2;\n" + " uint out_offset = out_ptr >> 2;\n" + " uint depth, stencil;\n"; + + work_kernel = + " value = data[index + in_offset];\n" + " value = bswap_u32(value);\n" + " stencil = (value & 0xFFu);\n" + " depth = (value >> 8);\n" + " data[index * 2 + out_offset] = d24f_to_f32(depth);\n" + " data[index * 2 + (out_offset + 1)] = stencil;\n"; + + cs_shuffle_base::build(""); + } + + void cs_shuffle_x8d24f_to_d32fx8::bind_resources() // Binds the combined source/destination range computed by run() + { + 
m_data->bind_range(gl::buffer::target::ssbo, GL_COMPUTE_BUFFER_SLOT(0), m_data_offset, m_ssbo_length); + } + + void cs_shuffle_x8d24f_to_d32fx8::run(const gl::buffer* data, u32 src_offset, u32 dst_offset, u32 num_texels) // Binds one range covering both regions (4 bytes per source texel, 8 per destination texel) and passes the offsets relative to it + { + u32 data_offset; + if (src_offset > dst_offset) + { + data_offset = dst_offset; + m_ssbo_length = (src_offset + num_texels * 4) - data_offset; + } + else + { + data_offset = src_offset; + m_ssbo_length = (dst_offset + num_texels * 8) - data_offset; + } + + m_program.uniforms["in_ptr"] = src_offset - data_offset; + m_program.uniforms["out_ptr"] = dst_offset - data_offset; + cs_shuffle_base::run(data, num_texels * 4, data_offset); + } +} diff --git a/rpcs3/Emu/RSX/GL/GLCompute.h b/rpcs3/Emu/RSX/GL/GLCompute.h index 3a7c394f0f..de63ce477c 100644 --- a/rpcs3/Emu/RSX/GL/GLCompute.h +++ b/rpcs3/Emu/RSX/GL/GLCompute.h @@ -1,10 +1,8 @@ #pragma once -#include "Utilities/StrUtil.h" #include "Emu/IdManager.h" #include "GLHelpers.h" -#include "util/asm.hpp" #include namespace gl @@ -22,88 +20,14 @@ namespace gl u32 optimal_kernel_size = 1; u32 max_invocations_x = 65535; - void initialize() - { - // Set up optimal kernel size - const auto& caps = gl::get_driver_caps(); - if (caps.vendor_AMD || caps.vendor_MESA) - { - optimal_group_size = 64; - unroll_loops = false; - } - else if (caps.vendor_NVIDIA) - { - optimal_group_size = 32; - } - else - { - optimal_group_size = 128; - } + void initialize(); + void create(); + void destroy(); - glGetIntegeri_v(GL_MAX_COMPUTE_WORK_GROUP_COUNT, 0, reinterpret_cast(&max_invocations_x)); - } + virtual void bind_resources() {} - void create() - { - if (!compiled) - { - m_shader.create(::glsl::program_domain::glsl_compute_program, m_src); - m_shader.compile(); - - m_program.create(); - m_program.attach(m_shader); - m_program.link(); - - compiled = true; - } - } - - void destroy() - { - if (compiled) - { - m_program.remove(); - m_shader.remove(); - - compiled = false; - } - } - - virtual void bind_resources() - {} - - void 
run(u32 invocations_x, u32 invocations_y) - { - GLint old_program; - glGetIntegerv(GL_CURRENT_PROGRAM, &old_program); - - bind_resources(); - m_program.use(); - glDispatchCompute(invocations_x, invocations_y, 1); - - glUseProgram(old_program); - } - - void run(u32 num_invocations) - { - u32 invocations_x, invocations_y; - if (num_invocations <= max_invocations_x) [[likely]] - { - invocations_x = num_invocations; - invocations_y = 1; - } - else - { - // Since all the invocations will run, the optimal distribution is sqrt(count) - const u32 optimal_length = static_cast(floor(std::sqrt(num_invocations))); - invocations_x = optimal_length; - invocations_y = invocations_x; - - if (num_invocations % invocations_x) invocations_y++; - } - - run(invocations_x, invocations_y); - } + void run(u32 invocations_x, u32 invocations_y); + void run(u32 num_invocations); }; struct cs_shuffle_base : compute_task @@ -115,130 +39,13 @@ namespace gl std::string uniforms, variables, work_kernel, loop_advance, suffix, method_declarations; - cs_shuffle_base() - { - work_kernel = - " value = data[index];\n" - " data[index] = %f(value);\n"; + cs_shuffle_base(); - loop_advance = - " index++;\n"; + void build(const char* function_name, u32 _kernel_size = 0); - suffix = - "}\n"; - } + void bind_resources() override; - void build(const char* function_name, u32 _kernel_size = 0) - { - // Initialize to allow detecting optimal settings - initialize(); - - kernel_size = _kernel_size? 
_kernel_size : optimal_kernel_size; - - m_src = - "#version 430\n" - "layout(local_size_x=%ws, local_size_y=1, local_size_z=1) in;\n" - "layout(binding=%loc, std430) buffer ssbo{ uint data[]; };\n" - "%ub" - "\n" - "#define KERNEL_SIZE %ks\n" - "\n" - "// Generic swap routines\n" - "#define bswap_u16(bits) (bits & 0xFF) << 8 | (bits & 0xFF00) >> 8 | (bits & 0xFF0000) << 8 | (bits & 0xFF000000) >> 8\n" - "#define bswap_u32(bits) (bits & 0xFF) << 24 | (bits & 0xFF00) << 8 | (bits & 0xFF0000) >> 8 | (bits & 0xFF000000) >> 24\n" - "#define bswap_u16_u32(bits) (bits & 0xFFFF) << 16 | (bits & 0xFFFF0000) >> 16\n" - "\n" - "// Depth format conversions\n" - "#define d24f_to_f32(bits) (bits << 7)\n" - "#define f32_to_d24f(bits) (bits >> 7)\n" - "\n" - "uint linear_invocation_id()\n" - "{\n" - " uint size_in_x = (gl_NumWorkGroups.x * gl_WorkGroupSize.x);\n" - " return (gl_GlobalInvocationID.y * size_in_x) + gl_GlobalInvocationID.x;\n" - "}\n" - "\n" - "%md" - "void main()\n" - "{\n" - " uint invocation_id = linear_invocation_id();\n" - " uint index = invocation_id * KERNEL_SIZE;\n" - " uint value;\n" - " %vars" - "\n"; - - const std::pair syntax_replace[] = - { - { "%loc", std::to_string(GL_COMPUTE_BUFFER_SLOT(0)) }, - { "%ws", std::to_string(optimal_group_size) }, - { "%ks", std::to_string(kernel_size) }, - { "%vars", variables }, - { "%f", function_name }, - { "%ub", uniforms }, - { "%md", method_declarations } - }; - - m_src = fmt::replace_all(m_src, syntax_replace); - work_kernel = fmt::replace_all(work_kernel, syntax_replace); - - if (kernel_size <= 1) - { - m_src += " {\n" + work_kernel + " }\n"; - } - else if (unroll_loops) - { - work_kernel += loop_advance + "\n"; - - m_src += std::string - ( - " //Unrolled loop\n" - " {\n" - ); - - // Assemble body with manual loop unroll to try loweing GPR usage - for (u32 n = 0; n < kernel_size; ++n) - { - m_src += work_kernel; - } - - m_src += " }\n"; - } - else - { - m_src += " for (int loop = 0; loop < KERNEL_SIZE; ++loop)\n"; 
- m_src += " {\n"; - m_src += work_kernel; - m_src += loop_advance; - m_src += " }\n"; - } - - m_src += suffix; - } - - void bind_resources() override - { - m_data->bind_range(gl::buffer::target::ssbo, GL_COMPUTE_BUFFER_SLOT(0), m_data_offset, m_data_length); - } - - void run(const gl::buffer* data, u32 data_length, u32 data_offset = 0) - { - m_data = data; - m_data_offset = data_offset; - m_data_length = data_length; - - const auto num_bytes_per_invocation = optimal_group_size * kernel_size * 4; - const auto num_bytes_to_process = utils::align(data_length, num_bytes_per_invocation); - const auto num_invocations = num_bytes_to_process / num_bytes_per_invocation; - - if ((num_bytes_to_process + data_offset) > data->size()) - { - // Technically robust buffer access should keep the driver from crashing in OOB situations - rsx_log.error("Inadequate buffer length submitted for a compute operation." - "Required=%d bytes, Available=%d bytes", num_bytes_to_process, data->size()); - } - - compute_task::run(num_invocations); - } + void run(const gl::buffer* data, u32 data_length, u32 data_offset = 0); }; struct cs_shuffle_16 : cs_shuffle_base @@ -272,97 +79,22 @@ namespace gl { u32 m_ssbo_length = 0; - cs_shuffle_d32fx8_to_x8d24f() - { - uniforms = "uniform uint in_ptr, out_ptr;\n"; + cs_shuffle_d32fx8_to_x8d24f(); - variables = - " uint in_offset = in_ptr >> 2;\n" - " uint out_offset = out_ptr >> 2;\n" - " uint depth, stencil;\n"; + void bind_resources() override; - work_kernel = - " depth = data[index * 2 + in_offset];\n" - " stencil = data[index * 2 + (in_offset + 1)] & 0xFFu;\n" - " value = f32_to_d24f(depth) << 8;\n" - " value |= stencil;\n" - " data[index + out_ptr] = bswap_u32(value);\n"; - - cs_shuffle_base::build(""); - } - - void bind_resources() override - { - m_data->bind_range(gl::buffer::target::ssbo, GL_COMPUTE_BUFFER_SLOT(0), m_data_offset, m_ssbo_length); - } - - void run(const gl::buffer* data, u32 src_offset, u32 dst_offset, u32 num_texels) - { - u32 
data_offset; - if (src_offset > dst_offset) - { - data_offset = dst_offset; - m_ssbo_length = (src_offset + num_texels * 8) - data_offset; - } - else - { - data_offset = src_offset; - m_ssbo_length = (dst_offset + num_texels * 4) - data_offset; - } - - m_program.uniforms["in_ptr"] = src_offset - data_offset; - m_program.uniforms["out_ptr"] = dst_offset - data_offset; - cs_shuffle_base::run(data, num_texels * 4, data_offset); - } + void run(const gl::buffer* data, u32 src_offset, u32 dst_offset, u32 num_texels); }; struct cs_shuffle_x8d24f_to_d32fx8 : cs_shuffle_base { u32 m_ssbo_length = 0; - cs_shuffle_x8d24f_to_d32fx8() - { - uniforms = "uniform uint texel_count, in_ptr, out_ptr;\n"; + cs_shuffle_x8d24f_to_d32fx8(); - variables = - " uint in_offset = in_ptr >> 2;\n" - " uint out_offset = out_ptr >> 2;\n" - " uint depth, stencil;\n"; + void bind_resources() override; - work_kernel = - " value = data[index + in_offset];\n" - " value = bswap_u32(value);\n" - " stencil = (value & 0xFFu);\n" - " depth = (value >> 8);\n" - " data[index * 2 + out_offset] = d24f_to_f32(depth);\n" - " data[index * 2 + (out_offset + 1)] = stencil;\n"; - - cs_shuffle_base::build(""); - } - - void bind_resources() override - { - m_data->bind_range(gl::buffer::target::ssbo, GL_COMPUTE_BUFFER_SLOT(0), m_data_offset, m_ssbo_length); - } - - void run(const gl::buffer* data, u32 src_offset, u32 dst_offset, u32 num_texels) - { - u32 data_offset; - if (src_offset > dst_offset) - { - data_offset = dst_offset; - m_ssbo_length = (src_offset + num_texels * 4) - data_offset; - } - else - { - data_offset = src_offset; - m_ssbo_length = (dst_offset + num_texels * 8) - data_offset; - } - - m_program.uniforms["in_ptr"] = src_offset - data_offset; - m_program.uniforms["out_ptr"] = dst_offset - data_offset; - cs_shuffle_base::run(data, num_texels * 4, data_offset); - } + void run(const gl::buffer* data, u32 src_offset, u32 dst_offset, u32 num_texels); }; diff --git a/rpcs3/Emu/RSX/GL/GLExecutionState.h 
b/rpcs3/Emu/RSX/GL/GLExecutionState.h index 37a41f0953..63f58bd432 100644 --- a/rpcs3/Emu/RSX/GL/GLExecutionState.h +++ b/rpcs3/Emu/RSX/GL/GLExecutionState.h @@ -1,5 +1,6 @@ #pragma once +#include "util/logs.hpp" #include "util/types.hpp" #include "Utilities/geometry.h" #include "OpenGL.h" diff --git a/rpcs3/Emu/RSX/GL/GLOverlays.cpp b/rpcs3/Emu/RSX/GL/GLOverlays.cpp new file mode 100644 index 0000000000..25913b3f92 --- /dev/null +++ b/rpcs3/Emu/RSX/GL/GLOverlays.cpp @@ -0,0 +1,650 @@ +#include "GLOverlays.h" + +extern u64 get_system_time(); + +namespace gl +{ + void overlay_pass::create() // Builds the shaders, program, FBO, sampler and vertex array once; does nothing if already compiled + { + if (!compiled) + { + fs.create(::glsl::program_domain::glsl_fragment_program, fs_src); + fs.compile(); + + vs.create(::glsl::program_domain::glsl_vertex_program, vs_src); + vs.compile(); + + program_handle.create(); + program_handle.attach(vs); + program_handle.attach(fs); + program_handle.link(); + + fbo.create(); + + m_sampler.create(); + m_sampler.apply_defaults(input_filter); + + m_vertex_data_buffer.create(); + + int old_vao; + glGetIntegerv(GL_VERTEX_ARRAY_BINDING, &old_vao); + + m_vao.create(); + m_vao.bind(); + + m_vao.array_buffer = m_vertex_data_buffer; + auto ptr = buffer_pointer(&m_vao); + m_vao[0] = ptr; + + glBindVertexArray(old_vao); + + compiled = true; + } + } + + void overlay_pass::destroy() // Releases everything create() built; does nothing if nothing was compiled + { + if (compiled) + { + program_handle.remove(); + vs.remove(); + fs.remove(); + + fbo.remove(); + m_vao.remove(); + m_vertex_data_buffer.remove(); + + m_sampler.remove(); + + compiled = false; + } + } + + void overlay_pass::emit_geometry() // Draws num_drawable_elements vertices from the pass VAO and restores the previous VAO binding + { + int old_vao; + glGetIntegerv(GL_VERTEX_ARRAY_BINDING, &old_vao); + + m_vao.bind(); + glDrawArrays(primitives, 0, num_drawable_elements); + + glBindVertexArray(old_vao); + } + + void overlay_pass::run(const areau& region, GLuint target_texture, bool depth_target, bool use_blending) // Renders the pass into region, through the internal FBO when target_texture is non-zero, saving and restoring the GL state it changes + { + if (!compiled) + { + rsx_log.error("You must initialize overlay passes with create() before calling run()"); + return; + } + + GLint 
program; + GLint old_fbo; + GLint depth_func; + GLint viewport[4]; + GLboolean color_writes[4]; + GLboolean depth_write; + + GLint blend_src_rgb; + GLint blend_src_a; + GLint blend_dst_rgb; + GLint blend_dst_a; + GLint blend_eq_a; + GLint blend_eq_rgb; + + if (target_texture) + { + glGetIntegerv(GL_FRAMEBUFFER_BINDING, &old_fbo); + glBindFramebuffer(GL_FRAMEBUFFER, fbo.id()); + + if (depth_target) + { + glFramebufferTexture2D(GL_FRAMEBUFFER, GL_DEPTH_ATTACHMENT, GL_TEXTURE_2D, target_texture, 0); + glDrawBuffer(GL_NONE); + } + else + { + GLenum buffer = GL_COLOR_ATTACHMENT0; + glFramebufferTexture2D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, target_texture, 0); + glDrawBuffers(1, &buffer); + } + } + + if (!target_texture || glCheckFramebufferStatus(GL_FRAMEBUFFER) == GL_FRAMEBUFFER_COMPLETE) + { + // Push rasterizer state + glGetIntegerv(GL_VIEWPORT, viewport); + glGetBooleanv(GL_COLOR_WRITEMASK, color_writes); + glGetBooleanv(GL_DEPTH_WRITEMASK, &depth_write); + glGetIntegerv(GL_CURRENT_PROGRAM, &program); + glGetIntegerv(GL_DEPTH_FUNC, &depth_func); + + GLboolean scissor_enabled = glIsEnabled(GL_SCISSOR_TEST); + GLboolean depth_test_enabled = glIsEnabled(GL_DEPTH_TEST); + GLboolean cull_face_enabled = glIsEnabled(GL_CULL_FACE); + GLboolean blend_enabled = glIsEnabledi(GL_BLEND, 0); + GLboolean stencil_test_enabled = glIsEnabled(GL_STENCIL_TEST); + + if (use_blending) + { + glGetIntegerv(GL_BLEND_SRC_RGB, &blend_src_rgb); + glGetIntegerv(GL_BLEND_SRC_ALPHA, &blend_src_a); + glGetIntegerv(GL_BLEND_DST_RGB, &blend_dst_rgb); + glGetIntegerv(GL_BLEND_DST_ALPHA, &blend_dst_a); + glGetIntegerv(GL_BLEND_EQUATION_RGB, &blend_eq_rgb); + glGetIntegerv(GL_BLEND_EQUATION_ALPHA, &blend_eq_a); + } + + // Set initial state + glViewport(region.x1, region.y1, region.width(), region.height()); + glColorMask(GL_TRUE, GL_TRUE, GL_TRUE, GL_TRUE); + glDepthMask(depth_target ? 
GL_TRUE : GL_FALSE); + + // Disabling depth test will also disable depth writes which is not desired + glDepthFunc(GL_ALWAYS); + glEnable(GL_DEPTH_TEST); + + if (scissor_enabled) glDisable(GL_SCISSOR_TEST); + if (cull_face_enabled) glDisable(GL_CULL_FACE); + if (stencil_test_enabled) glDisable(GL_STENCIL_TEST); + + if (use_blending) + { + if (!blend_enabled) + glEnablei(GL_BLEND, 0); + + glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA); + glBlendEquation(GL_FUNC_ADD); + } + else if (blend_enabled) + { + glDisablei(GL_BLEND, 0); + } + + // Render + program_handle.use(); + on_load(); + bind_resources(); + emit_geometry(); + + // Clean up + if (target_texture) + { + if (depth_target) + glFramebufferTexture2D(GL_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, GL_TEXTURE_2D, 0, 0); + else + glFramebufferTexture2D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, 0, 0); + + glBindFramebuffer(GL_FRAMEBUFFER, old_fbo); + } + + glUseProgram(program); + + glViewport(viewport[0], viewport[1], viewport[2], viewport[3]); + glColorMask(color_writes[0], color_writes[1], color_writes[2], color_writes[3]); + glDepthMask(depth_write); + glDepthFunc(depth_func); + + if (!depth_test_enabled) glDisable(GL_DEPTH_TEST); + if (scissor_enabled) glEnable(GL_SCISSOR_TEST); + if (cull_face_enabled) glEnable(GL_CULL_FACE); + if (stencil_test_enabled) glEnable(GL_STENCIL_TEST); + + if (use_blending) + { + if (!blend_enabled) + glDisablei(GL_BLEND, 0); + + glBlendFuncSeparate(blend_src_rgb, blend_dst_rgb, blend_src_a, blend_dst_a); + glBlendEquationSeparate(blend_eq_rgb, blend_eq_a); + } + else if (blend_enabled) + { + glEnablei(GL_BLEND, 0); + } + } + else + { + rsx_log.error("Overlay pass failed because framebuffer was not complete. 
Run with debug output enabled to diagnose the problem"); + } + } + + ui_overlay_renderer::ui_overlay_renderer() // Sets the UI vertex and fragment shader sources and requests linear input filtering + { + vs_src = + "#version 420\n\n" + "layout(location=0) in vec4 in_pos;\n" + "layout(location=0) out vec2 tc0;\n" + "layout(location=1) flat out vec4 clip_rect;\n" + "uniform vec4 ui_scale;\n" + "uniform vec4 viewport;\n" + "uniform vec4 clip_bounds;\n" + "\n" + "vec2 snap_to_grid(vec2 normalized)\n" + "{\n" + " return (floor(normalized * viewport.xy) + 0.5) / viewport.xy;\n" + "}\n" + "\n" + "vec4 clip_to_ndc(const in vec4 coord)\n" + "{\n" + " vec4 ret = (coord * ui_scale.zwzw) / ui_scale.xyxy;\n" + " ret.yw = 1. - ret.yw;\n" + " return ret;\n" + "}\n" + "\n" + "vec4 ndc_to_window(const in vec4 coord)\n" + "{\n" + " return fma(coord, viewport.xyxy, viewport.zwzw);\n" + "}\n" + "\n" + "void main()\n" + "{\n" + " tc0.xy = in_pos.zw;\n" + " clip_rect = ndc_to_window(clip_to_ndc(clip_bounds)).xwzy; // Swap y1 and y2 due to flipped origin!\n" + " vec4 pos = vec4(clip_to_ndc(in_pos).xy, 0.5, 1.);\n" + " pos.xy = snap_to_grid(pos.xy);\n" + " gl_Position = (pos + pos) - 1.;\n" + "}\n"; + + fs_src = + "#version 420\n\n" + "layout(binding=31) uniform sampler2D fs0;\n" + "layout(binding=30) uniform sampler2DArray fs1;\n" + "layout(location=0) in vec2 tc0;\n" + "layout(location=1) flat in vec4 clip_rect;\n" + "layout(location=0) out vec4 ocol;\n" + "uniform vec4 color;\n" + "uniform float time;\n" + "uniform int sampler_mode;\n" + "uniform int pulse_glow;\n" + "uniform int clip_region;\n" + "uniform int blur_strength;\n" + "\n" + "vec4 blur_sample(sampler2D tex, vec2 coord, vec2 tex_offset)\n" + "{\n" + " vec2 coords[9];\n" + " coords[0] = coord - tex_offset\n;" + " coords[1] = coord + vec2(0., -tex_offset.y);\n" + " coords[2] = coord + vec2(tex_offset.x, -tex_offset.y);\n" + " coords[3] = coord + vec2(-tex_offset.x, 0.);\n" + " coords[4] = coord;\n" + " coords[5] = coord + vec2(tex_offset.x, 0.);\n" + " coords[6] = coord + vec2(-tex_offset.x, tex_offset.y);\n" + 
" coords[7] = coord + vec2(0., tex_offset.y);\n" + " coords[8] = coord + tex_offset;\n" + "\n" + " float weights[9] =\n" + " {\n" + " 1., 2., 1.,\n" + " 2., 4., 2.,\n" + " 1., 2., 1.\n" + " };\n" + "\n" + " vec4 blurred = vec4(0.);\n" + " for (int n = 0; n < 9; ++n)\n" + " {\n" + " blurred += texture(tex, coords[n]) * weights[n];\n" + " }\n" + "\n" + " return blurred / 16.f;\n" + "}\n" + "\n" + "vec4 sample_image(sampler2D tex, vec2 coord)\n" + "{\n" + " vec4 original = texture(tex, coord);\n" + " if (blur_strength == 0) return original;\n" + " \n" + " vec2 constraints = 1.f / vec2(640, 360);\n" + " vec2 res_offset = 1.f / textureSize(fs0, 0);\n" + " vec2 tex_offset = max(res_offset, constraints);\n" + "\n" + " // Sample triangle pattern and average\n" + " // TODO: Nicer looking gaussian blur with less sampling\n" + " vec4 blur0 = blur_sample(tex, coord + vec2(-res_offset.x, 0.), tex_offset);\n" + " vec4 blur1 = blur_sample(tex, coord + vec2(res_offset.x, 0.), tex_offset);\n" + " vec4 blur2 = blur_sample(tex, coord + vec2(0., res_offset.y), tex_offset);\n" + "\n" + " vec4 blurred = blur0 + blur1 + blur2;\n" + " blurred /= 3.;\n" + " return mix(original, blurred, float(blur_strength) / 100.);\n" + "}\n" + "\n" + "void main()\n" + "{\n" + " if (clip_region != 0)\n" + " {" + " if (gl_FragCoord.x < clip_rect.x || gl_FragCoord.x > clip_rect.z ||\n" + " gl_FragCoord.y < clip_rect.y || gl_FragCoord.y > clip_rect.w)\n" + " {\n" + " discard;\n" + " return;\n" + " }\n" + " }\n" + "\n" + " vec4 diff_color = color;\n" + " if (pulse_glow != 0)\n" + " diff_color.a *= (sin(time) + 1.f) * 0.5f;\n" + "\n" + " switch (sampler_mode)\n" + " {\n" + " case 1:\n" + " ocol = sample_image(fs0, tc0) * diff_color;\n" + " break;\n" + " case 2:\n" + " ocol = texture(fs1, vec3(tc0.x, fract(tc0.y), trunc(tc0.y))) * diff_color;\n" + " break;\n" + " default:\n" + " ocol = diff_color;\n" + " break;\n" + " }\n" + "}\n"; + + // Smooth filtering required for inputs + input_filter = GL_LINEAR; + } + + 
gl::texture_view* ui_overlay_renderer::load_simple_image(rsx::overlays::image_info* desc, bool temp_resource, u32 owner_uid) // Uploads desc as an RGBA8 texture and caches it with a view; temporary images are keyed by the desc pointer and tagged with owner_uid + { + auto tex = std::make_unique(GL_TEXTURE_2D, desc->w, desc->h, 1, 1, GL_RGBA8); + tex->copy_from(desc->data, gl::texture::format::rgba, gl::texture::type::uint_8_8_8_8, {}); + + GLenum remap[] = { GL_RED, GL_ALPHA, GL_BLUE, GL_GREEN }; + auto view = std::make_unique(tex.get(), remap); + + auto result = view.get(); + if (!temp_resource) + { + resources.push_back(std::move(tex)); + view_cache[view_cache.size()] = std::move(view); + } + else + { + u64 key = reinterpret_cast(desc); + temp_image_cache[key] = std::make_pair(owner_uid, std::move(tex)); + temp_view_cache[key] = std::move(view); + } + + return result; + } + + void ui_overlay_renderer::create() // Creates the pass objects and preloads every image listed by resource_config + { + overlay_pass::create(); + + rsx::overlays::resource_config configuration; + configuration.load_files(); + + for (const auto &res : configuration.texture_raw_data) + { + load_simple_image(res.get(), false, UINT32_MAX); + } + + configuration.free_resources(); + } + + void ui_overlay_renderer::destroy() // Releases all cached views and textures, then the base pass objects + { + temp_view_cache.clear(); // Views point at the textures released below, so drop them first + view_cache.clear(); // Also resets the indices load_simple_image() derives from view_cache.size() + temp_image_cache.clear(); + resources.clear(); + font_cache.clear(); + overlay_pass::destroy(); + } + + void ui_overlay_renderer::remove_temp_resources(u64 key) // Drops every temporary image, and its view, whose owner id equals key + { + std::vector keys_to_remove; + for (const auto& temp_image : temp_image_cache) + { + if (temp_image.second.first == key) + { + keys_to_remove.push_back(temp_image.first); + } + } + + for (const auto& _key : keys_to_remove) + { + temp_image_cache.erase(_key); + temp_view_cache.erase(_key); + } + } + + gl::texture_view* ui_overlay_renderer::find_font(rsx::overlays::font* font) // Returns the cached glyph texture view for font, rebuilding it when the glyph data dimensions differ + { + const auto font_size = font->get_glyph_data_dimensions(); + + u64 key = reinterpret_cast(font); + auto found = view_cache.find(key); + if (found != view_cache.end()) + { + if (const auto this_size = found->second->image()->size3D(); + font_size.width == this_size.width && + font_size.height == this_size.height && + font_size.depth == 
this_size.depth) + { + return found->second.get(); + } + } + + // Create font file + std::vector glyph_data; + font->get_glyph_data(glyph_data); + + auto tex = std::make_unique(GL_TEXTURE_2D_ARRAY, font_size.width, font_size.height, font_size.depth, 1, GL_R8); + tex->copy_from(glyph_data.data(), gl::texture::format::r, gl::texture::type::ubyte, {}); + + GLenum remap[] = { GL_RED, GL_RED, GL_RED, GL_RED }; + auto view = std::make_unique(tex.get(), remap); + + auto result = view.get(); + font_cache[key] = std::move(tex); + view_cache[key] = std::move(view); + + return result; + } + + gl::texture_view* ui_overlay_renderer::find_temp_image(rsx::overlays::image_info* desc, u32 owner_uid) // Returns the cached view for desc, loading it as a temporary image on a miss + { + auto key = reinterpret_cast(desc); + auto cached = temp_view_cache.find(key); + if (cached != temp_view_cache.end()) + { + return cached->second.get(); + } + else + { + return load_simple_image(desc, true, owner_uid); + } + } + + void ui_overlay_renderer::set_primitive_type(rsx::overlays::primitive_type type) // Records type and maps it to the GL draw mode; quad lists are drawn as triangle strips + { + m_current_primitive_type = type; + + switch (type) + { + case rsx::overlays::primitive_type::quad_list: + case rsx::overlays::primitive_type::triangle_strip: + primitives = GL_TRIANGLE_STRIP; + break; + case rsx::overlays::primitive_type::line_list: + primitives = GL_LINES; + break; + case rsx::overlays::primitive_type::line_strip: + primitives = GL_LINE_STRIP; + break; + default: + fmt::throw_exception("Unexpected primitive type %d", static_cast(type)); + } + } + + void ui_overlay_renderer::emit_geometry() // Draws quad lists as one 4-vertex triangle strip per quad; everything else goes through the base class + { + if (m_current_primitive_type == rsx::overlays::primitive_type::quad_list) + { + // Emulate quads with disjointed triangle strips + int num_quads = num_drawable_elements / 4; + std::vector firsts; + std::vector counts; + + firsts.resize(num_quads); + counts.resize(num_quads); + + for (int n = 0; n < num_quads; ++n) + { + firsts[n] = (n * 4); + counts[n] = 4; + } + + int old_vao; + glGetIntegerv(GL_VERTEX_ARRAY_BINDING, &old_vao); + + m_vao.bind(); + 
glMultiDrawArrays(GL_TRIANGLE_STRIP, firsts.data(), counts.data(), num_quads); + + glBindVertexArray(old_vao); + } + else + { + overlay_pass::emit_geometry(); + } + } + + void ui_overlay_renderer::run(const areau& viewport, GLuint target, rsx::overlays::overlay& ui) // Draws every compiled command of ui into target, binding the texture each command refers to, then updates ui + { + program_handle.uniforms["viewport"] = color4f(static_cast(viewport.width()), static_cast(viewport.height()), static_cast(viewport.x1), static_cast(viewport.y1)); + program_handle.uniforms["ui_scale"] = color4f(static_cast(ui.virtual_width), static_cast(ui.virtual_height), 1.f, 1.f); + program_handle.uniforms["time"] = static_cast(get_system_time() / 1000) * 0.005f; + + saved_sampler_state save_30(30, m_sampler); + saved_sampler_state save_31(31, m_sampler); + + for (auto &cmd : ui.get_compiled().draw_commands) + { + set_primitive_type(cmd.config.primitives); + upload_vertex_data(cmd.verts.data(), ::size32(cmd.verts)); + num_drawable_elements = ::size32(cmd.verts); + GLint texture_read = GL_TRUE; + + switch (cmd.config.texture_ref) + { + case rsx::overlays::image_resource_id::game_icon: + case rsx::overlays::image_resource_id::backbuffer: + //TODO + case rsx::overlays::image_resource_id::none: + { + texture_read = GL_FALSE; + glBindTexture(GL_TEXTURE_2D, GL_NONE); + break; + } + case rsx::overlays::image_resource_id::raw_image: + { + glBindTexture(GL_TEXTURE_2D, find_temp_image(static_cast(cmd.config.external_data_ref), ui.uid)->id()); + break; + } + case rsx::overlays::image_resource_id::font_file: + { + texture_read = (GL_TRUE + 1); + glActiveTexture(GL_TEXTURE0 + 30); + glBindTexture(GL_TEXTURE_2D_ARRAY, find_font(cmd.config.font_ref)->id()); + glActiveTexture(GL_TEXTURE0 + 31); + break; + } + default: + { + glBindTexture(GL_TEXTURE_2D, view_cache[cmd.config.texture_ref - 1]->id()); + break; + } + } + + program_handle.uniforms["color"] = cmd.config.color; + program_handle.uniforms["sampler_mode"] = texture_read; + program_handle.uniforms["pulse_glow"] = static_cast(cmd.config.pulse_glow); + 
program_handle.uniforms["blur_strength"] = static_cast(cmd.config.blur_strength); + program_handle.uniforms["clip_region"] = static_cast(cmd.config.clip_region); + program_handle.uniforms["clip_bounds"] = cmd.config.clip_rect; + overlay_pass::run(viewport, target, false, true); + } + + ui.update(); + } + + video_out_calibration_pass::video_out_calibration_pass() + { + vs_src = + "#version 420\n\n" + "layout(location=0) out vec2 tc0;\n" + "\n" + "void main()\n" + "{\n" + " vec2 positions[] = {vec2(-1., -1.), vec2(1., -1.), vec2(-1., 1.), vec2(1., 1.)};\n" + " vec2 coords[] = {vec2(0., 1.), vec2(1., 1.), vec2(0., 0.), vec2(1., 0.)};\n" + " tc0 = coords[gl_VertexID % 4];\n" + " vec2 pos = positions[gl_VertexID % 4];\n" + " gl_Position = vec4(pos, 0., 1.);\n" + "}\n"; + + fs_src = + "#version 420\n\n" + "layout(binding=31) uniform sampler2D fs0;\n" + "layout(binding=30) uniform sampler2D fs1;\n" + "layout(location=0) in vec2 tc0;\n" + "layout(location=0) out vec4 ocol;\n" + "\n" + "uniform float gamma;\n" + "uniform int limit_range;\n" + "uniform int stereo;\n" + "uniform int stereo_image_count;\n" + "\n" + "vec4 read_source()\n" + "{\n" + " if (stereo == 0) return texture(fs0, tc0);\n" + "\n" + " vec4 left, right;\n" + " if (stereo_image_count == 2)\n" + " {\n" + " left = texture(fs0, tc0);\n" + " right = texture(fs1, tc0);\n" + " }\n" + " else\n" + " {\n" + " vec2 coord_left = tc0 * vec2(1.f, 0.4898f);\n" + " vec2 coord_right = coord_left + vec2(0.f, 0.510204f);\n" + " left = texture(fs0, coord_left);\n" + " right = texture(fs0, coord_right);\n" + " }\n" + "\n" + " return vec4(left.r, right.g, right.b, 1.);\n" + "}\n" + "\n" + "void main()\n" + "{\n" + " vec4 color = read_source();\n" + " color.rgb = pow(color.rgb, vec3(gamma));\n" + " if (limit_range > 0)\n" + " ocol = ((color * 220.) + 16.) 
/ 255.;\n" + " else\n" + " ocol = color;\n" + "}\n"; + + input_filter = GL_LINEAR; + } + + void video_out_calibration_pass::run(const areau& viewport, const rsx::simple_array& source, f32 gamma, bool limited_rgb, bool _3d) + { + program_handle.uniforms["gamma"] = gamma; + program_handle.uniforms["limit_range"] = limited_rgb + 0; + program_handle.uniforms["stereo"] = _3d + 0; + program_handle.uniforms["stereo_image_count"] = (source[1] == GL_NONE? 1 : 2); + + saved_sampler_state saved(31, m_sampler); + glBindTexture(GL_TEXTURE_2D, source[0]); + + saved_sampler_state saved2(30, m_sampler); + glBindTexture(GL_TEXTURE_2D, source[1]); + + overlay_pass::run(viewport, GL_NONE, false, false); + } +} diff --git a/rpcs3/Emu/RSX/GL/GLOverlays.h b/rpcs3/Emu/RSX/GL/GLOverlays.h index b78a3b0981..574d713bfc 100644 --- a/rpcs3/Emu/RSX/GL/GLOverlays.h +++ b/rpcs3/Emu/RSX/GL/GLOverlays.h @@ -1,13 +1,12 @@ #pragma once #include "util/types.hpp" -#include "GLHelpers.h" #include "../Overlays/overlays.h" +#include "GLTexture.h" +#include "Emu/RSX/rsx_utils.h" #include #include -extern u64 get_system_time(); - namespace gl { struct overlay_pass @@ -53,61 +52,8 @@ namespace gl } }; - void create() - { - if (!compiled) - { - fs.create(::glsl::program_domain::glsl_fragment_program, fs_src); - fs.compile(); - - vs.create(::glsl::program_domain::glsl_vertex_program, vs_src); - vs.compile(); - - program_handle.create(); - program_handle.attach(vs); - program_handle.attach(fs); - program_handle.link(); - - fbo.create(); - - m_sampler.create(); - m_sampler.apply_defaults(input_filter); - - m_vertex_data_buffer.create(); - - int old_vao; - glGetIntegerv(GL_VERTEX_ARRAY_BINDING, &old_vao); - - m_vao.create(); - m_vao.bind(); - - m_vao.array_buffer = m_vertex_data_buffer; - auto ptr = buffer_pointer(&m_vao); - m_vao[0] = ptr; - - glBindVertexArray(old_vao); - - compiled = true; - } - } - - void destroy() - { - if (compiled) - { - program_handle.remove(); - vs.remove(); - fs.remove(); - - 
fbo.remove(); - m_vao.remove(); - m_vertex_data_buffer.remove(); - - m_sampler.remove(); - - compiled = false; - } - } + void create(); + void destroy(); virtual void on_load() {} virtual void on_unload() {} @@ -121,155 +67,9 @@ namespace gl m_vertex_data_buffer.data(elements_count * sizeof(T), data); } - virtual void emit_geometry() - { - int old_vao; - glGetIntegerv(GL_VERTEX_ARRAY_BINDING, &old_vao); + virtual void emit_geometry(); - m_vao.bind(); - glDrawArrays(primitives, 0, num_drawable_elements); - - glBindVertexArray(old_vao); - } - - void run(const areau& region, GLuint target_texture, bool depth_target, bool use_blending = false) - { - if (!compiled) - { - rsx_log.error("You must initialize overlay passes with create() before calling run()"); - return; - } - - GLint program; - GLint old_fbo; - GLint depth_func; - GLint viewport[4]; - GLboolean color_writes[4]; - GLboolean depth_write; - - GLint blend_src_rgb; - GLint blend_src_a; - GLint blend_dst_rgb; - GLint blend_dst_a; - GLint blend_eq_a; - GLint blend_eq_rgb; - - if (target_texture) - { - glGetIntegerv(GL_FRAMEBUFFER_BINDING, &old_fbo); - glBindFramebuffer(GL_FRAMEBUFFER, fbo.id()); - - if (depth_target) - { - glFramebufferTexture2D(GL_FRAMEBUFFER, GL_DEPTH_ATTACHMENT, GL_TEXTURE_2D, target_texture, 0); - glDrawBuffer(GL_NONE); - } - else - { - GLenum buffer = GL_COLOR_ATTACHMENT0; - glFramebufferTexture2D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, target_texture, 0); - glDrawBuffers(1, &buffer); - } - } - - if (!target_texture || glCheckFramebufferStatus(GL_FRAMEBUFFER) == GL_FRAMEBUFFER_COMPLETE) - { - // Push rasterizer state - glGetIntegerv(GL_VIEWPORT, viewport); - glGetBooleanv(GL_COLOR_WRITEMASK, color_writes); - glGetBooleanv(GL_DEPTH_WRITEMASK, &depth_write); - glGetIntegerv(GL_CURRENT_PROGRAM, &program); - glGetIntegerv(GL_DEPTH_FUNC, &depth_func); - - GLboolean scissor_enabled = glIsEnabled(GL_SCISSOR_TEST); - GLboolean depth_test_enabled = glIsEnabled(GL_DEPTH_TEST); - GLboolean 
cull_face_enabled = glIsEnabled(GL_CULL_FACE); - GLboolean blend_enabled = glIsEnabledi(GL_BLEND, 0); - GLboolean stencil_test_enabled = glIsEnabled(GL_STENCIL_TEST); - - if (use_blending) - { - glGetIntegerv(GL_BLEND_SRC_RGB, &blend_src_rgb); - glGetIntegerv(GL_BLEND_SRC_ALPHA, &blend_src_a); - glGetIntegerv(GL_BLEND_DST_RGB, &blend_dst_rgb); - glGetIntegerv(GL_BLEND_DST_ALPHA, &blend_dst_a); - glGetIntegerv(GL_BLEND_EQUATION_RGB, &blend_eq_rgb); - glGetIntegerv(GL_BLEND_EQUATION_ALPHA, &blend_eq_a); - } - - // Set initial state - glViewport(region.x1, region.y1, region.width(), region.height()); - glColorMask(GL_TRUE, GL_TRUE, GL_TRUE, GL_TRUE); - glDepthMask(depth_target ? GL_TRUE : GL_FALSE); - - // Disabling depth test will also disable depth writes which is not desired - glDepthFunc(GL_ALWAYS); - glEnable(GL_DEPTH_TEST); - - if (scissor_enabled) glDisable(GL_SCISSOR_TEST); - if (cull_face_enabled) glDisable(GL_CULL_FACE); - if (stencil_test_enabled) glDisable(GL_STENCIL_TEST); - - if (use_blending) - { - if (!blend_enabled) - glEnablei(GL_BLEND, 0); - - glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA); - glBlendEquation(GL_FUNC_ADD); - } - else if (blend_enabled) - { - glDisablei(GL_BLEND, 0); - } - - // Render - program_handle.use(); - on_load(); - bind_resources(); - emit_geometry(); - - // Clean up - if (target_texture) - { - if (depth_target) - glFramebufferTexture2D(GL_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, GL_TEXTURE_2D, 0, 0); - else - glFramebufferTexture2D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, 0, 0); - - glBindFramebuffer(GL_FRAMEBUFFER, old_fbo); - } - - glUseProgram(program); - - glViewport(viewport[0], viewport[1], viewport[2], viewport[3]); - glColorMask(color_writes[0], color_writes[1], color_writes[2], color_writes[3]); - glDepthMask(depth_write); - glDepthFunc(depth_func); - - if (!depth_test_enabled) glDisable(GL_DEPTH_TEST); - if (scissor_enabled) glEnable(GL_SCISSOR_TEST); - if (cull_face_enabled) glEnable(GL_CULL_FACE); 
- if (stencil_test_enabled) glEnable(GL_STENCIL_TEST); - - if (use_blending) - { - if (!blend_enabled) - glDisablei(GL_BLEND, 0); - - glBlendFuncSeparate(blend_src_rgb, blend_dst_rgb, blend_src_a, blend_dst_a); - glBlendEquationSeparate(blend_eq_rgb, blend_eq_a); - } - else if (blend_enabled) - { - glEnablei(GL_BLEND, 0); - } - } - else - { - rsx_log.error("Overlay pass failed because framebuffer was not complete. Run with debug output enabled to diagnose the problem"); - } - } + void run(const areau& region, GLuint target_texture, bool depth_target, bool use_blending = false); }; struct ui_overlay_renderer : public overlay_pass @@ -282,443 +82,30 @@ namespace gl std::unordered_map> view_cache; rsx::overlays::primitive_type m_current_primitive_type = rsx::overlays::primitive_type::quad_list; - ui_overlay_renderer() - { - vs_src = - "#version 420\n\n" - "layout(location=0) in vec4 in_pos;\n" - "layout(location=0) out vec2 tc0;\n" - "layout(location=1) flat out vec4 clip_rect;\n" - "uniform vec4 ui_scale;\n" - "uniform vec4 viewport;\n" - "uniform vec4 clip_bounds;\n" - "\n" - "vec2 snap_to_grid(vec2 normalized)\n" - "{\n" - " return (floor(normalized * viewport.xy) + 0.5) / viewport.xy;\n" - "}\n" - "\n" - "vec4 clip_to_ndc(const in vec4 coord)\n" - "{\n" - " vec4 ret = (coord * ui_scale.zwzw) / ui_scale.xyxy;\n" - " ret.yw = 1. 
- ret.yw;\n" - " return ret;\n" - "}\n" - "\n" - "vec4 ndc_to_window(const in vec4 coord)\n" - "{\n" - " return fma(coord, viewport.xyxy, viewport.zwzw);\n" - "}\n" - "\n" - "void main()\n" - "{\n" - " tc0.xy = in_pos.zw;\n" - " clip_rect = ndc_to_window(clip_to_ndc(clip_bounds)).xwzy; // Swap y1 and y2 due to flipped origin!\n" - " vec4 pos = vec4(clip_to_ndc(in_pos).xy, 0.5, 1.);\n" - " pos.xy = snap_to_grid(pos.xy);\n" - " gl_Position = (pos + pos) - 1.;\n" - "}\n"; + ui_overlay_renderer(); - fs_src = - "#version 420\n\n" - "layout(binding=31) uniform sampler2D fs0;\n" - "layout(binding=30) uniform sampler2DArray fs1;\n" - "layout(location=0) in vec2 tc0;\n" - "layout(location=1) flat in vec4 clip_rect;\n" - "layout(location=0) out vec4 ocol;\n" - "uniform vec4 color;\n" - "uniform float time;\n" - "uniform int sampler_mode;\n" - "uniform int pulse_glow;\n" - "uniform int clip_region;\n" - "uniform int blur_strength;\n" - "\n" - "vec4 blur_sample(sampler2D tex, vec2 coord, vec2 tex_offset)\n" - "{\n" - " vec2 coords[9];\n" - " coords[0] = coord - tex_offset\n;" - " coords[1] = coord + vec2(0., -tex_offset.y);\n" - " coords[2] = coord + vec2(tex_offset.x, -tex_offset.y);\n" - " coords[3] = coord + vec2(-tex_offset.x, 0.);\n" - " coords[4] = coord;\n" - " coords[5] = coord + vec2(tex_offset.x, 0.);\n" - " coords[6] = coord + vec2(-tex_offset.x, tex_offset.y);\n" - " coords[7] = coord + vec2(0., tex_offset.y);\n" - " coords[8] = coord + tex_offset;\n" - "\n" - " float weights[9] =\n" - " {\n" - " 1., 2., 1.,\n" - " 2., 4., 2.,\n" - " 1., 2., 1.\n" - " };\n" - "\n" - " vec4 blurred = vec4(0.);\n" - " for (int n = 0; n < 9; ++n)\n" - " {\n" - " blurred += texture(tex, coords[n]) * weights[n];\n" - " }\n" - "\n" - " return blurred / 16.f;\n" - "}\n" - "\n" - "vec4 sample_image(sampler2D tex, vec2 coord)\n" - "{\n" - " vec4 original = texture(tex, coord);\n" - " if (blur_strength == 0) return original;\n" - " \n" - " vec2 constraints = 1.f / vec2(640, 360);\n" - " vec2 
res_offset = 1.f / textureSize(fs0, 0);\n" - " vec2 tex_offset = max(res_offset, constraints);\n" - "\n" - " // Sample triangle pattern and average\n" - " // TODO: Nicer looking gaussian blur with less sampling\n" - " vec4 blur0 = blur_sample(tex, coord + vec2(-res_offset.x, 0.), tex_offset);\n" - " vec4 blur1 = blur_sample(tex, coord + vec2(res_offset.x, 0.), tex_offset);\n" - " vec4 blur2 = blur_sample(tex, coord + vec2(0., res_offset.y), tex_offset);\n" - "\n" - " vec4 blurred = blur0 + blur1 + blur2;\n" - " blurred /= 3.;\n" - " return mix(original, blurred, float(blur_strength) / 100.);\n" - "}\n" - "\n" - "void main()\n" - "{\n" - " if (clip_region != 0)\n" - " {" - " if (gl_FragCoord.x < clip_rect.x || gl_FragCoord.x > clip_rect.z ||\n" - " gl_FragCoord.y < clip_rect.y || gl_FragCoord.y > clip_rect.w)\n" - " {\n" - " discard;\n" - " return;\n" - " }\n" - " }\n" - "\n" - " vec4 diff_color = color;\n" - " if (pulse_glow != 0)\n" - " diff_color.a *= (sin(time) + 1.f) * 0.5f;\n" - "\n" - " switch (sampler_mode)\n" - " {\n" - " case 1:\n" - " ocol = sample_image(fs0, tc0) * diff_color;\n" - " break;\n" - " case 2:\n" - " ocol = texture(fs1, vec3(tc0.x, fract(tc0.y), trunc(tc0.y))) * diff_color;\n" - " break;\n" - " default:\n" - " ocol = diff_color;\n" - " break;\n" - " }\n" - "}\n"; + gl::texture_view* load_simple_image(rsx::overlays::image_info* desc, bool temp_resource, u32 owner_uid); - // Smooth filtering required for inputs - input_filter = GL_LINEAR; - } + void create(); + void destroy(); - gl::texture_view* load_simple_image(rsx::overlays::image_info* desc, bool temp_resource, u32 owner_uid) - { - auto tex = std::make_unique(GL_TEXTURE_2D, desc->w, desc->h, 1, 1, GL_RGBA8); - tex->copy_from(desc->data, gl::texture::format::rgba, gl::texture::type::uint_8_8_8_8, {}); + void remove_temp_resources(u64 key); - GLenum remap[] = { GL_RED, GL_ALPHA, GL_BLUE, GL_GREEN }; - auto view = std::make_unique(tex.get(), remap); + gl::texture_view* 
find_font(rsx::overlays::font* font); - auto result = view.get(); - if (!temp_resource) - { - resources.push_back(std::move(tex)); - view_cache[view_cache.size()] = std::move(view); - } - else - { - u64 key = reinterpret_cast(desc); - temp_image_cache[key] = std::make_pair(owner_uid, std::move(tex)); - temp_view_cache[key] = std::move(view); - } + gl::texture_view* find_temp_image(rsx::overlays::image_info* desc, u32 owner_uid); - return result; - } + void set_primitive_type(rsx::overlays::primitive_type type); - void create() - { - overlay_pass::create(); + void emit_geometry() override; - rsx::overlays::resource_config configuration; - configuration.load_files(); - - for (const auto &res : configuration.texture_raw_data) - { - load_simple_image(res.get(), false, UINT32_MAX); - } - - configuration.free_resources(); - } - - void destroy() - { - temp_image_cache.clear(); - resources.clear(); - font_cache.clear(); - overlay_pass::destroy(); - } - - void remove_temp_resources(u64 key) - { - std::vector keys_to_remove; - for (const auto& temp_image : temp_image_cache) - { - if (temp_image.second.first == key) - { - keys_to_remove.push_back(temp_image.first); - } - } - - for (const auto& _key : keys_to_remove) - { - temp_image_cache.erase(_key); - temp_view_cache.erase(_key); - } - } - - gl::texture_view* find_font(rsx::overlays::font *font) - { - const auto font_size = font->get_glyph_data_dimensions(); - - u64 key = reinterpret_cast(font); - auto found = view_cache.find(key); - if (found != view_cache.end()) - { - if (const auto this_size = found->second->image()->size3D(); - font_size.width == this_size.width && - font_size.height == this_size.height && - font_size.depth == this_size.depth) - { - return found->second.get(); - } - } - - // Create font file - std::vector glyph_data; - font->get_glyph_data(glyph_data); - - auto tex = std::make_unique(GL_TEXTURE_2D_ARRAY, font_size.width, font_size.height, font_size.depth, 1, GL_R8); - tex->copy_from(glyph_data.data(), 
gl::texture::format::r, gl::texture::type::ubyte, {}); - - GLenum remap[] = { GL_RED, GL_RED, GL_RED, GL_RED }; - auto view = std::make_unique(tex.get(), remap); - - auto result = view.get(); - font_cache[key] = std::move(tex); - view_cache[key] = std::move(view); - - return result; - } - - gl::texture_view* find_temp_image(rsx::overlays::image_info *desc, u32 owner_uid) - { - auto key = reinterpret_cast(desc); - auto cached = temp_view_cache.find(key); - if (cached != temp_view_cache.end()) - { - return cached->second.get(); - } - else - { - return load_simple_image(desc, true, owner_uid); - } - } - - void set_primitive_type(rsx::overlays::primitive_type type) - { - m_current_primitive_type = type; - - switch (type) - { - case rsx::overlays::primitive_type::quad_list: - case rsx::overlays::primitive_type::triangle_strip: - primitives = GL_TRIANGLE_STRIP; - break; - case rsx::overlays::primitive_type::line_list: - primitives = GL_LINES; - break; - case rsx::overlays::primitive_type::line_strip: - primitives = GL_LINE_STRIP; - break; - default: - fmt::throw_exception("Unexpected primitive type %d", static_cast(type)); - } - } - - void emit_geometry() override - { - if (m_current_primitive_type == rsx::overlays::primitive_type::quad_list) - { - // Emulate quads with disjointed triangle strips - int num_quads = num_drawable_elements / 4; - std::vector firsts; - std::vector counts; - - firsts.resize(num_quads); - counts.resize(num_quads); - - for (int n = 0; n < num_quads; ++n) - { - firsts[n] = (n * 4); - counts[n] = 4; - } - - int old_vao; - glGetIntegerv(GL_VERTEX_ARRAY_BINDING, &old_vao); - - m_vao.bind(); - glMultiDrawArrays(GL_TRIANGLE_STRIP, firsts.data(), counts.data(), num_quads); - - glBindVertexArray(old_vao); - } - else - { - overlay_pass::emit_geometry(); - } - } - - void run(const areau& viewport, GLuint target, rsx::overlays::overlay& ui) - { - program_handle.uniforms["viewport"] = color4f(static_cast(viewport.width()), static_cast(viewport.height()), 
static_cast(viewport.x1), static_cast(viewport.y1)); - program_handle.uniforms["ui_scale"] = color4f(static_cast(ui.virtual_width), static_cast(ui.virtual_height), 1.f, 1.f); - program_handle.uniforms["time"] = static_cast(get_system_time() / 1000) * 0.005f; - - saved_sampler_state save_30(30, m_sampler); - saved_sampler_state save_31(31, m_sampler); - - for (auto &cmd : ui.get_compiled().draw_commands) - { - set_primitive_type(cmd.config.primitives); - upload_vertex_data(cmd.verts.data(), ::size32(cmd.verts)); - num_drawable_elements = ::size32(cmd.verts); - GLint texture_read = GL_TRUE; - - switch (cmd.config.texture_ref) - { - case rsx::overlays::image_resource_id::game_icon: - case rsx::overlays::image_resource_id::backbuffer: - //TODO - case rsx::overlays::image_resource_id::none: - { - texture_read = GL_FALSE; - glBindTexture(GL_TEXTURE_2D, GL_NONE); - break; - } - case rsx::overlays::image_resource_id::raw_image: - { - glBindTexture(GL_TEXTURE_2D, find_temp_image(static_cast(cmd.config.external_data_ref), ui.uid)->id()); - break; - } - case rsx::overlays::image_resource_id::font_file: - { - texture_read = (GL_TRUE + 1); - glActiveTexture(GL_TEXTURE0 + 30); - glBindTexture(GL_TEXTURE_2D_ARRAY, find_font(cmd.config.font_ref)->id()); - glActiveTexture(GL_TEXTURE0 + 31); - break; - } - default: - { - glBindTexture(GL_TEXTURE_2D, view_cache[cmd.config.texture_ref - 1]->id()); - break; - } - } - - program_handle.uniforms["color"] = cmd.config.color; - program_handle.uniforms["sampler_mode"] = texture_read; - program_handle.uniforms["pulse_glow"] = static_cast(cmd.config.pulse_glow); - program_handle.uniforms["blur_strength"] = static_cast(cmd.config.blur_strength); - program_handle.uniforms["clip_region"] = static_cast(cmd.config.clip_region); - program_handle.uniforms["clip_bounds"] = cmd.config.clip_rect; - overlay_pass::run(viewport, target, false, true); - } - - ui.update(); - } + void run(const areau& viewport, GLuint target, rsx::overlays::overlay& ui); }; 
struct video_out_calibration_pass : public overlay_pass { - video_out_calibration_pass() - { - vs_src = - "#version 420\n\n" - "layout(location=0) out vec2 tc0;\n" - "\n" - "void main()\n" - "{\n" - " vec2 positions[] = {vec2(-1., -1.), vec2(1., -1.), vec2(-1., 1.), vec2(1., 1.)};\n" - " vec2 coords[] = {vec2(0., 1.), vec2(1., 1.), vec2(0., 0.), vec2(1., 0.)};\n" - " tc0 = coords[gl_VertexID % 4];\n" - " vec2 pos = positions[gl_VertexID % 4];\n" - " gl_Position = vec4(pos, 0., 1.);\n" - "}\n"; + video_out_calibration_pass(); - fs_src = - "#version 420\n\n" - "layout(binding=31) uniform sampler2D fs0;\n" - "layout(binding=30) uniform sampler2D fs1;\n" - "layout(location=0) in vec2 tc0;\n" - "layout(location=0) out vec4 ocol;\n" - "\n" - "uniform float gamma;\n" - "uniform int limit_range;\n" - "uniform int stereo;\n" - "uniform int stereo_image_count;\n" - "\n" - "vec4 read_source()\n" - "{\n" - " if (stereo == 0) return texture(fs0, tc0);\n" - "\n" - " vec4 left, right;\n" - " if (stereo_image_count == 2)\n" - " {\n" - " left = texture(fs0, tc0);\n" - " right = texture(fs1, tc0);\n" - " }\n" - " else\n" - " {\n" - " vec2 coord_left = tc0 * vec2(1.f, 0.4898f);\n" - " vec2 coord_right = coord_left + vec2(0.f, 0.510204f);\n" - " left = texture(fs0, coord_left);\n" - " right = texture(fs0, coord_right);\n" - " }\n" - "\n" - " return vec4(left.r, right.g, right.b, 1.);\n" - "}\n" - "\n" - "void main()\n" - "{\n" - " vec4 color = read_source();\n" - " color.rgb = pow(color.rgb, vec3(gamma));\n" - " if (limit_range > 0)\n" - " ocol = ((color * 220.) + 16.) / 255.;\n" - " else\n" - " ocol = color;\n" - "}\n"; - - input_filter = GL_LINEAR; - } - - void run(const areau& viewport, const rsx::simple_array& source, f32 gamma, bool limited_rgb, bool _3d) - { - program_handle.uniforms["gamma"] = gamma; - program_handle.uniforms["limit_range"] = limited_rgb + 0; - program_handle.uniforms["stereo"] = _3d + 0; - program_handle.uniforms["stereo_image_count"] = (source[1] == GL_NONE? 
1 : 2); - - saved_sampler_state saved(31, m_sampler); - glBindTexture(GL_TEXTURE_2D, source[0]); - - saved_sampler_state saved2(30, m_sampler); - glBindTexture(GL_TEXTURE_2D, source[1]); - - overlay_pass::run(viewport, GL_NONE, false, false); - } + void run(const areau& viewport, const rsx::simple_array& source, f32 gamma, bool limited_rgb, bool _3d); }; } diff --git a/rpcs3/Emu/RSX/Overlays/overlay_animation.h b/rpcs3/Emu/RSX/Overlays/overlay_animation.h index 2612459650..547b2ce599 100644 --- a/rpcs3/Emu/RSX/Overlays/overlay_animation.h +++ b/rpcs3/Emu/RSX/Overlays/overlay_animation.h @@ -4,6 +4,8 @@ #include "Utilities/geometry.h" #include "overlay_utils.h" +#include + namespace rsx { namespace overlays diff --git a/rpcs3/Emu/RSX/Overlays/overlay_controls.h b/rpcs3/Emu/RSX/Overlays/overlay_controls.h index 361263ce80..6006a6943d 100644 --- a/rpcs3/Emu/RSX/Overlays/overlay_controls.h +++ b/rpcs3/Emu/RSX/Overlays/overlay_controls.h @@ -19,6 +19,7 @@ #include #include #include +#include #endif #ifdef __APPLE__ diff --git a/rpcs3/Emu/RSX/VK/VKCompute.cpp b/rpcs3/Emu/RSX/VK/VKCompute.cpp new file mode 100644 index 0000000000..48b0da9d7c --- /dev/null +++ b/rpcs3/Emu/RSX/VK/VKCompute.cpp @@ -0,0 +1,428 @@ +#include "VKCompute.h" +#include "VKHelpers.h" +#include "VKRenderPass.h" +#include "vkutils/buffer_object.h" + +#define VK_MAX_COMPUTE_TASKS 4096 // Max number of jobs per frame + +namespace vk +{ + std::vector> compute_task::get_descriptor_layout() + { + std::vector> result; + result.emplace_back(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, ssbo_count); + return result; + } + + void compute_task::init_descriptors() + { + std::vector descriptor_pool_sizes; + std::vector bindings; + + const auto layout = get_descriptor_layout(); + for (const auto &e : layout) + { + descriptor_pool_sizes.push_back({e.first, u32(VK_MAX_COMPUTE_TASKS * e.second)}); + + for (unsigned n = 0; n < e.second; ++n) + { + bindings.push_back + ({ + u32(bindings.size()), + e.first, + 1, + 
VK_SHADER_STAGE_COMPUTE_BIT, + nullptr + }); + } + } + + // Reserve descriptor pools + m_descriptor_pool.create(*g_render_device, descriptor_pool_sizes.data(), ::size32(descriptor_pool_sizes), VK_MAX_COMPUTE_TASKS, 3); + + VkDescriptorSetLayoutCreateInfo infos = {}; + infos.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO; + infos.pBindings = bindings.data(); + infos.bindingCount = ::size32(bindings); + + CHECK_RESULT(vkCreateDescriptorSetLayout(*g_render_device, &infos, nullptr, &m_descriptor_layout)); + + VkPipelineLayoutCreateInfo layout_info = {}; + layout_info.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO; + layout_info.setLayoutCount = 1; + layout_info.pSetLayouts = &m_descriptor_layout; + + VkPushConstantRange push_constants{}; + if (use_push_constants) + { + push_constants.size = push_constants_size; + push_constants.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT; + + layout_info.pushConstantRangeCount = 1; + layout_info.pPushConstantRanges = &push_constants; + } + + CHECK_RESULT(vkCreatePipelineLayout(*g_render_device, &layout_info, nullptr, &m_pipeline_layout)); + } + + void compute_task::create() + { + if (!initialized) + { + init_descriptors(); + + switch (vk::get_driver_vendor()) + { + case vk::driver_vendor::unknown: + case vk::driver_vendor::INTEL: + // Intel hw has 8 threads, but LDS allocation behavior makes optimal group size between 64 and 256 + // Based on intel's own OpenCL recommended settings + unroll_loops = true; + optimal_kernel_size = 1; + optimal_group_size = 128; + break; + case vk::driver_vendor::NVIDIA: + // Warps are multiples of 32. 
Increasing kernel depth seems to hurt performance (Nier, Big Duck sample) + unroll_loops = true; + optimal_group_size = 32; + optimal_kernel_size = 1; + break; + case vk::driver_vendor::AMD: + case vk::driver_vendor::RADV: + // Wavefronts are multiples of 64 + unroll_loops = false; + optimal_kernel_size = 1; + optimal_group_size = 64; + break; + } + + const auto& gpu = vk::g_render_device->gpu(); + max_invocations_x = gpu.get_limits().maxComputeWorkGroupCount[0]; + + initialized = true; + } + } + + void compute_task::destroy() + { + if (initialized) + { + m_shader.destroy(); + m_program.reset(); + m_param_buffer.reset(); + + vkDestroyDescriptorSetLayout(*g_render_device, m_descriptor_layout, nullptr); + vkDestroyPipelineLayout(*g_render_device, m_pipeline_layout, nullptr); + m_descriptor_pool.destroy(); + + initialized = false; + } + } + + void compute_task::free_resources() + { + if (m_used_descriptors == 0) + return; + + m_descriptor_pool.reset(0); + m_used_descriptors = 0; + } + + void compute_task::load_program(VkCommandBuffer cmd) + { + if (!m_program) + { + m_shader.create(::glsl::program_domain::glsl_compute_program, m_src); + auto handle = m_shader.compile(); + + VkPipelineShaderStageCreateInfo shader_stage{}; + shader_stage.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO; + shader_stage.stage = VK_SHADER_STAGE_COMPUTE_BIT; + shader_stage.module = handle; + shader_stage.pName = "main"; + + VkComputePipelineCreateInfo info{}; + info.sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO; + info.stage = shader_stage; + info.layout = m_pipeline_layout; + info.basePipelineIndex = -1; + info.basePipelineHandle = VK_NULL_HANDLE; + + auto compiler = vk::get_pipe_compiler(); + m_program = compiler->compile(info, m_pipeline_layout, vk::pipe_compiler::COMPILE_INLINE); + declare_inputs(); + } + + ensure(m_used_descriptors < VK_MAX_COMPUTE_TASKS); + + VkDescriptorSetAllocateInfo alloc_info = {}; + alloc_info.descriptorPool = m_descriptor_pool; + 
alloc_info.descriptorSetCount = 1; + alloc_info.pSetLayouts = &m_descriptor_layout; + alloc_info.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO; + + CHECK_RESULT(vkAllocateDescriptorSets(*g_render_device, &alloc_info, &m_descriptor_set)); + m_used_descriptors++; + + bind_resources(); + + vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, m_program->pipeline); + vkCmdBindDescriptorSets(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, m_pipeline_layout, 0, 1, &m_descriptor_set, 0, nullptr); + } + + void compute_task::run(VkCommandBuffer cmd, u32 invocations_x, u32 invocations_y, u32 invocations_z) + { + // CmdDispatch is outside renderpass scope only + if (vk::is_renderpass_open(cmd)) + { + vk::end_renderpass(cmd); + } + + load_program(cmd); + vkCmdDispatch(cmd, invocations_x, invocations_y, invocations_z); + } + + void compute_task::run(VkCommandBuffer cmd, u32 num_invocations) + { + u32 invocations_x, invocations_y; + if (num_invocations > max_invocations_x) + { + // AMD hw reports an annoyingly small maximum number of invocations in the X dimension + // Split the 1D job into 2 dimensions to accomodate this + invocations_x = static_cast(floor(std::sqrt(num_invocations))); + invocations_y = invocations_x; + + if (num_invocations % invocations_x) invocations_y++; + } + else + { + invocations_x = num_invocations; + invocations_y = 1; + } + + run(cmd, invocations_x, invocations_y, 1); + } + + cs_shuffle_base::cs_shuffle_base() + { + work_kernel = + " value = data[index];\n" + " data[index] = %f(value);\n"; + + loop_advance = + " index++;\n"; + + suffix = + "}\n"; + } + + void cs_shuffle_base::build(const char* function_name, u32 _kernel_size) + { + // Initialize to allow detecting optimal settings + create(); + + kernel_size = _kernel_size? 
_kernel_size : optimal_kernel_size; + + m_src = + "#version 430\n" + "layout(local_size_x=%ws, local_size_y=1, local_size_z=1) in;\n" + "layout(std430, set=0, binding=0) buffer ssbo{ uint data[]; };\n" + "%ub" + "\n" + "#define KERNEL_SIZE %ks\n" + "\n" + "// Generic swap routines\n" + "#define bswap_u16(bits) (bits & 0xFF) << 8 | (bits & 0xFF00) >> 8 | (bits & 0xFF0000) << 8 | (bits & 0xFF000000) >> 8\n" + "#define bswap_u32(bits) (bits & 0xFF) << 24 | (bits & 0xFF00) << 8 | (bits & 0xFF0000) >> 8 | (bits & 0xFF000000) >> 24\n" + "#define bswap_u16_u32(bits) (bits & 0xFFFF) << 16 | (bits & 0xFFFF0000) >> 16\n" + "\n" + "// Depth format conversions\n" + "#define d24_to_f32(bits) floatBitsToUint(float(bits) / 16777215.f)\n" + "#define f32_to_d24(bits) uint(uintBitsToFloat(bits) * 16777215.f)\n" + "#define d24f_to_f32(bits) (bits << 7)\n" + "#define f32_to_d24f(bits) (bits >> 7)\n" + "#define d24x8_to_f32(bits) d24_to_f32(bits >> 8)\n" + "#define d24x8_to_d24x8_swapped(bits) (bits & 0xFF00) | (bits & 0xFF0000) >> 16 | (bits & 0xFF) << 16\n" + "#define f32_to_d24x8_swapped(bits) d24x8_to_d24x8_swapped(f32_to_d24(bits))\n" + "\n" + "%md" + "void main()\n" + "{\n" + " uint invocations_x = (gl_NumWorkGroups.x * gl_WorkGroupSize.x);" + " uint invocation_id = (gl_GlobalInvocationID.y * invocations_x) + gl_GlobalInvocationID.x;\n" + " uint index = invocation_id * KERNEL_SIZE;\n" + " uint value;\n" + "%vars" + "\n"; + + const auto parameters_size = utils::align(push_constants_size, 16) / 16; + const std::pair syntax_replace[] = + { + { "%ws", std::to_string(optimal_group_size) }, + { "%ks", std::to_string(kernel_size) }, + { "%vars", variables }, + { "%f", function_name }, + { "%md", method_declarations }, + { "%ub", use_push_constants? 
"layout(push_constant) uniform ubo{ uvec4 params[" + std::to_string(parameters_size) + "]; };\n" : "" }, + }; + + m_src = fmt::replace_all(m_src, syntax_replace); + work_kernel = fmt::replace_all(work_kernel, syntax_replace); + + if (kernel_size <= 1) + { + m_src += " {\n" + work_kernel + " }\n"; + } + else if (unroll_loops) + { + work_kernel += loop_advance + "\n"; + + m_src += std::string + ( + " //Unrolled loop\n" + " {\n" + ); + + // Assemble body with manual loop unroll to try loweing GPR usage + for (u32 n = 0; n < kernel_size; ++n) + { + m_src += work_kernel; + } + + m_src += " }\n"; + } + else + { + m_src += " for (int loop = 0; loop < KERNEL_SIZE; ++loop)\n"; + m_src += " {\n"; + m_src += work_kernel; + m_src += loop_advance; + m_src += " }\n"; + } + + m_src += suffix; + } + + void cs_shuffle_base::bind_resources() + { + m_program->bind_buffer({ m_data->value, m_data_offset, m_data_length }, 0, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, m_descriptor_set); + } + + void cs_shuffle_base::set_parameters(VkCommandBuffer cmd, const u32* params, u8 count) + { + ensure(use_push_constants); + vkCmdPushConstants(cmd, m_pipeline_layout, VK_SHADER_STAGE_COMPUTE_BIT, 0, count * 4, params); + } + + void cs_shuffle_base::run(VkCommandBuffer cmd, const vk::buffer* data, u32 data_length, u32 data_offset) + { + m_data = data; + m_data_offset = data_offset; + m_data_length = data_length; + + const auto num_bytes_per_invocation = optimal_group_size * kernel_size * 4; + const auto num_bytes_to_process = rsx::align2(data_length, num_bytes_per_invocation); + const auto num_invocations = num_bytes_to_process / num_bytes_per_invocation; + + if ((num_bytes_to_process + data_offset) > data->size()) + { + // Technically robust buffer access should keep the driver from crashing in OOB situations + rsx_log.error("Inadequate buffer length submitted for a compute operation." 
+ "Required=%d bytes, Available=%d bytes", num_bytes_to_process, data->size()); + } + + compute_task::run(cmd, num_invocations); + } + + cs_interleave_task::cs_interleave_task() + { + use_push_constants = true; + push_constants_size = 16; + + variables = + " uint block_length = params[0].x >> 2;\n" + " uint z_offset = params[0].y >> 2;\n" + " uint s_offset = params[0].z >> 2;\n" + " uint depth;\n" + " uint stencil;\n" + " uint stencil_shift;\n" + " uint stencil_offset;\n"; + } + + void cs_interleave_task::bind_resources() + { + m_program->bind_buffer({ m_data->value, m_data_offset, m_ssbo_length }, 0, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, m_descriptor_set); + } + + void cs_interleave_task::run(VkCommandBuffer cmd, const vk::buffer* data, u32 data_offset, u32 data_length, u32 zeta_offset, u32 stencil_offset) + { + u32 parameters[4] = { data_length, zeta_offset - data_offset, stencil_offset - data_offset, 0 }; + set_parameters(cmd, parameters, 4); + + ensure(stencil_offset > data_offset); + m_ssbo_length = stencil_offset + (data_length / 4) - data_offset; + cs_shuffle_base::run(cmd, data, data_length, data_offset); + } + + cs_scatter_d24x8::cs_scatter_d24x8() + { + work_kernel = + " if (index >= block_length)\n" + " return;\n" + "\n" + " value = data[index];\n" + " data[index + z_offset] = (value >> 8);\n" + " stencil_offset = (index / 4);\n" + " stencil_shift = (index % 4) * 8;\n" + " stencil = (value & 0xFF) << stencil_shift;\n" + " atomicOr(data[stencil_offset + s_offset], stencil);\n"; + + cs_shuffle_base::build(""); + } + + cs_aggregator::cs_aggregator() + { + ssbo_count = 2; + + create(); + + m_src = + "#version 450\n" + "layout(local_size_x = %ws, local_size_y = 1, local_size_z = 1) in;\n\n" + + "layout(set=0, binding=0, std430) readonly buffer ssbo0{ uint src[]; };\n" + "layout(set=0, binding=1, std430) writeonly buffer ssbo1{ uint result; };\n\n" + + "void main()\n" + "{\n" + " if (gl_GlobalInvocationID.x < src.length())\n" + " {\n" + " atomicAdd(result, 
src[gl_GlobalInvocationID.x]);\n" + " }\n" + "}\n"; + + const std::pair syntax_replace[] = + { + { "%ws", std::to_string(optimal_group_size) }, + }; + + m_src = fmt::replace_all(m_src, syntax_replace); + } + + void cs_aggregator::bind_resources() + { + m_program->bind_buffer({ src->value, 0, block_length }, 0, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, m_descriptor_set); + m_program->bind_buffer({ dst->value, 0, 4 }, 1, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, m_descriptor_set); + } + + void cs_aggregator::run(VkCommandBuffer cmd, const vk::buffer* dst, const vk::buffer* src, u32 num_words) + { + this->dst = dst; + this->src = src; + word_count = num_words; + block_length = num_words * 4; + + const u32 linear_invocations = utils::aligned_div(word_count, optimal_group_size); + compute_task::run(cmd, linear_invocations); + } +} diff --git a/rpcs3/Emu/RSX/VK/VKCompute.h b/rpcs3/Emu/RSX/VK/VKCompute.h index 3a9fcbeb76..8f35f3e2f2 100644 --- a/rpcs3/Emu/RSX/VK/VKCompute.h +++ b/rpcs3/Emu/RSX/VK/VKCompute.h @@ -1,18 +1,14 @@ #pragma once +#include "VKPipelineCompiler.h" #include "vkutils/descriptors.hpp" -#include "Utilities/StrUtil.h" +#include "vkutils/buffer_object.h" + #include "Emu/IdManager.h" -#include "VKPipelineCompiler.h" -#include "VKRenderPass.h" -#include "VKHelpers.h" -#include "vkutils/buffer_object.h" -#include "vkutils/device.h" - +#include "Utilities/StrUtil.h" #include "util/asm.hpp" -#include -#define VK_MAX_COMPUTE_TASKS 4096 // Max number of jobs per frame +#include namespace vk { @@ -38,207 +34,22 @@ namespace vk u32 optimal_kernel_size = 1; u32 max_invocations_x = 65535; - virtual std::vector> get_descriptor_layout() - { - std::vector> result; - result.emplace_back(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, ssbo_count); - return result; - } + virtual std::vector> get_descriptor_layout(); - void init_descriptors() - { - std::vector descriptor_pool_sizes; - std::vector bindings; + void init_descriptors(); - const auto layout = get_descriptor_layout(); - for (const auto 
&e : layout) - { - descriptor_pool_sizes.push_back({e.first, u32(VK_MAX_COMPUTE_TASKS * e.second)}); + void create(); + void destroy(); - for (unsigned n = 0; n < e.second; ++n) - { - bindings.push_back - ({ - u32(bindings.size()), - e.first, - 1, - VK_SHADER_STAGE_COMPUTE_BIT, - nullptr - }); - } - } + void free_resources(); - // Reserve descriptor pools - m_descriptor_pool.create(*g_render_device, descriptor_pool_sizes.data(), ::size32(descriptor_pool_sizes), VK_MAX_COMPUTE_TASKS, 3); + virtual void bind_resources() {} + virtual void declare_inputs() {} - VkDescriptorSetLayoutCreateInfo infos = {}; - infos.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO; - infos.pBindings = bindings.data(); - infos.bindingCount = ::size32(bindings); + void load_program(VkCommandBuffer cmd); - CHECK_RESULT(vkCreateDescriptorSetLayout(*g_render_device, &infos, nullptr, &m_descriptor_layout)); - - VkPipelineLayoutCreateInfo layout_info = {}; - layout_info.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO; - layout_info.setLayoutCount = 1; - layout_info.pSetLayouts = &m_descriptor_layout; - - VkPushConstantRange push_constants{}; - if (use_push_constants) - { - push_constants.size = push_constants_size; - push_constants.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT; - - layout_info.pushConstantRangeCount = 1; - layout_info.pPushConstantRanges = &push_constants; - } - - CHECK_RESULT(vkCreatePipelineLayout(*g_render_device, &layout_info, nullptr, &m_pipeline_layout)); - } - - void create() - { - if (!initialized) - { - init_descriptors(); - - switch (vk::get_driver_vendor()) - { - case vk::driver_vendor::unknown: - case vk::driver_vendor::INTEL: - // Intel hw has 8 threads, but LDS allocation behavior makes optimal group size between 64 and 256 - // Based on intel's own OpenCL recommended settings - unroll_loops = true; - optimal_kernel_size = 1; - optimal_group_size = 128; - break; - case vk::driver_vendor::NVIDIA: - // Warps are multiples of 32. 
Increasing kernel depth seems to hurt performance (Nier, Big Duck sample) - unroll_loops = true; - optimal_group_size = 32; - optimal_kernel_size = 1; - break; - case vk::driver_vendor::AMD: - case vk::driver_vendor::RADV: - // Wavefronts are multiples of 64 - unroll_loops = false; - optimal_kernel_size = 1; - optimal_group_size = 64; - break; - } - - const auto& gpu = vk::g_render_device->gpu(); - max_invocations_x = gpu.get_limits().maxComputeWorkGroupCount[0]; - - initialized = true; - } - } - - void destroy() - { - if (initialized) - { - m_shader.destroy(); - m_program.reset(); - m_param_buffer.reset(); - - vkDestroyDescriptorSetLayout(*g_render_device, m_descriptor_layout, nullptr); - vkDestroyPipelineLayout(*g_render_device, m_pipeline_layout, nullptr); - m_descriptor_pool.destroy(); - - initialized = false; - } - } - - void free_resources() - { - if (m_used_descriptors == 0) - return; - - m_descriptor_pool.reset(0); - m_used_descriptors = 0; - } - - virtual void bind_resources() - {} - - virtual void declare_inputs() - {} - - void load_program(VkCommandBuffer cmd) - { - if (!m_program) - { - m_shader.create(::glsl::program_domain::glsl_compute_program, m_src); - auto handle = m_shader.compile(); - - VkPipelineShaderStageCreateInfo shader_stage{}; - shader_stage.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO; - shader_stage.stage = VK_SHADER_STAGE_COMPUTE_BIT; - shader_stage.module = handle; - shader_stage.pName = "main"; - - VkComputePipelineCreateInfo info{}; - info.sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO; - info.stage = shader_stage; - info.layout = m_pipeline_layout; - info.basePipelineIndex = -1; - info.basePipelineHandle = VK_NULL_HANDLE; - - auto compiler = vk::get_pipe_compiler(); - m_program = compiler->compile(info, m_pipeline_layout, vk::pipe_compiler::COMPILE_INLINE); - declare_inputs(); - } - - ensure(m_used_descriptors < VK_MAX_COMPUTE_TASKS); - - VkDescriptorSetAllocateInfo alloc_info = {}; - 
alloc_info.descriptorPool = m_descriptor_pool; - alloc_info.descriptorSetCount = 1; - alloc_info.pSetLayouts = &m_descriptor_layout; - alloc_info.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO; - - CHECK_RESULT(vkAllocateDescriptorSets(*g_render_device, &alloc_info, &m_descriptor_set)); - m_used_descriptors++; - - bind_resources(); - - vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, m_program->pipeline); - vkCmdBindDescriptorSets(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, m_pipeline_layout, 0, 1, &m_descriptor_set, 0, nullptr); - } - - void run(VkCommandBuffer cmd, u32 invocations_x, u32 invocations_y, u32 invocations_z) - { - // CmdDispatch is outside renderpass scope only - if (vk::is_renderpass_open(cmd)) - { - vk::end_renderpass(cmd); - } - - load_program(cmd); - vkCmdDispatch(cmd, invocations_x, invocations_y, invocations_z); - } - - void run(VkCommandBuffer cmd, u32 num_invocations) - { - u32 invocations_x, invocations_y; - if (num_invocations > max_invocations_x) - { - // AMD hw reports an annoyingly small maximum number of invocations in the X dimension - // Split the 1D job into 2 dimensions to accomodate this - invocations_x = static_cast(floor(std::sqrt(num_invocations))); - invocations_y = invocations_x; - - if (num_invocations % invocations_x) invocations_y++; - } - else - { - invocations_x = num_invocations; - invocations_y = 1; - } - - run(cmd, invocations_x, invocations_y, 1); - } + void run(VkCommandBuffer cmd, u32 invocations_x, u32 invocations_y, u32 invocations_z); + void run(VkCommandBuffer cmd, u32 num_invocations); }; struct cs_shuffle_base : compute_task @@ -251,136 +62,15 @@ namespace vk std::string variables, work_kernel, loop_advance, suffix; std::string method_declarations; - cs_shuffle_base() - { - work_kernel = - " value = data[index];\n" - " data[index] = %f(value);\n"; + cs_shuffle_base(); - loop_advance = - " index++;\n"; + void build(const char* function_name, u32 _kernel_size = 0); - suffix = - "}\n"; - } + void 
bind_resources() override; - void build(const char* function_name, u32 _kernel_size = 0) - { - // Initialize to allow detecting optimal settings - create(); + void set_parameters(VkCommandBuffer cmd, const u32* params, u8 count); - kernel_size = _kernel_size? _kernel_size : optimal_kernel_size; - - m_src = - "#version 430\n" - "layout(local_size_x=%ws, local_size_y=1, local_size_z=1) in;\n" - "layout(std430, set=0, binding=0) buffer ssbo{ uint data[]; };\n" - "%ub" - "\n" - "#define KERNEL_SIZE %ks\n" - "\n" - "// Generic swap routines\n" - "#define bswap_u16(bits) (bits & 0xFF) << 8 | (bits & 0xFF00) >> 8 | (bits & 0xFF0000) << 8 | (bits & 0xFF000000) >> 8\n" - "#define bswap_u32(bits) (bits & 0xFF) << 24 | (bits & 0xFF00) << 8 | (bits & 0xFF0000) >> 8 | (bits & 0xFF000000) >> 24\n" - "#define bswap_u16_u32(bits) (bits & 0xFFFF) << 16 | (bits & 0xFFFF0000) >> 16\n" - "\n" - "// Depth format conversions\n" - "#define d24_to_f32(bits) floatBitsToUint(float(bits) / 16777215.f)\n" - "#define f32_to_d24(bits) uint(uintBitsToFloat(bits) * 16777215.f)\n" - "#define d24f_to_f32(bits) (bits << 7)\n" - "#define f32_to_d24f(bits) (bits >> 7)\n" - "#define d24x8_to_f32(bits) d24_to_f32(bits >> 8)\n" - "#define d24x8_to_d24x8_swapped(bits) (bits & 0xFF00) | (bits & 0xFF0000) >> 16 | (bits & 0xFF) << 16\n" - "#define f32_to_d24x8_swapped(bits) d24x8_to_d24x8_swapped(f32_to_d24(bits))\n" - "\n" - "%md" - "void main()\n" - "{\n" - " uint invocations_x = (gl_NumWorkGroups.x * gl_WorkGroupSize.x);" - " uint invocation_id = (gl_GlobalInvocationID.y * invocations_x) + gl_GlobalInvocationID.x;\n" - " uint index = invocation_id * KERNEL_SIZE;\n" - " uint value;\n" - "%vars" - "\n"; - - const auto parameters_size = utils::align(push_constants_size, 16) / 16; - const std::pair syntax_replace[] = - { - { "%ws", std::to_string(optimal_group_size) }, - { "%ks", std::to_string(kernel_size) }, - { "%vars", variables }, - { "%f", function_name }, - { "%md", method_declarations }, - { "%ub", 
use_push_constants? "layout(push_constant) uniform ubo{ uvec4 params[" + std::to_string(parameters_size) + "]; };\n" : "" }, - }; - - m_src = fmt::replace_all(m_src, syntax_replace); - work_kernel = fmt::replace_all(work_kernel, syntax_replace); - - if (kernel_size <= 1) - { - m_src += " {\n" + work_kernel + " }\n"; - } - else if (unroll_loops) - { - work_kernel += loop_advance + "\n"; - - m_src += std::string - ( - " //Unrolled loop\n" - " {\n" - ); - - // Assemble body with manual loop unroll to try loweing GPR usage - for (u32 n = 0; n < kernel_size; ++n) - { - m_src += work_kernel; - } - - m_src += " }\n"; - } - else - { - m_src += " for (int loop = 0; loop < KERNEL_SIZE; ++loop)\n"; - m_src += " {\n"; - m_src += work_kernel; - m_src += loop_advance; - m_src += " }\n"; - } - - m_src += suffix; - } - - void bind_resources() override - { - m_program->bind_buffer({ m_data->value, m_data_offset, m_data_length }, 0, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, m_descriptor_set); - } - - void set_parameters(VkCommandBuffer cmd, const u32* params, u8 count) - { - ensure(use_push_constants); - vkCmdPushConstants(cmd, m_pipeline_layout, VK_SHADER_STAGE_COMPUTE_BIT, 0, count * 4, params); - } - - void run(VkCommandBuffer cmd, const vk::buffer* data, u32 data_length, u32 data_offset = 0) - { - m_data = data; - m_data_offset = data_offset; - m_data_length = data_length; - - const auto num_bytes_per_invocation = optimal_group_size * kernel_size * 4; - const auto num_bytes_to_process = rsx::align2(data_length, num_bytes_per_invocation); - const auto num_invocations = num_bytes_to_process / num_bytes_per_invocation; - - if ((num_bytes_to_process + data_offset) > data->size()) - { - // Technically robust buffer access should keep the driver from crashing in OOB situations - rsx_log.error("Inadequate buffer length submitted for a compute operation." 
- "Required=%d bytes, Available=%d bytes", num_bytes_to_process, data->size()); - } - - compute_task::run(cmd, num_invocations); - } + void run(VkCommandBuffer cmd, const vk::buffer* data, u32 data_length, u32 data_offset = 0); }; struct cs_shuffle_16 : cs_shuffle_base @@ -442,35 +132,11 @@ namespace vk { u32 m_ssbo_length = 0; - cs_interleave_task() - { - use_push_constants = true; - push_constants_size = 16; + cs_interleave_task(); - variables = - " uint block_length = params[0].x >> 2;\n" - " uint z_offset = params[0].y >> 2;\n" - " uint s_offset = params[0].z >> 2;\n" - " uint depth;\n" - " uint stencil;\n" - " uint stencil_shift;\n" - " uint stencil_offset;\n"; - } + void bind_resources() override; - void bind_resources() override - { - m_program->bind_buffer({ m_data->value, m_data_offset, m_ssbo_length }, 0, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, m_descriptor_set); - } - - void run(VkCommandBuffer cmd, const vk::buffer* data, u32 data_offset, u32 data_length, u32 zeta_offset, u32 stencil_offset) - { - u32 parameters[4] = { data_length, zeta_offset - data_offset, stencil_offset - data_offset, 0 }; - set_parameters(cmd, parameters, 4); - - ensure(stencil_offset > data_offset); - m_ssbo_length = stencil_offset + (data_length / 4) - data_offset; - cs_shuffle_base::run(cmd, data, data_length, data_offset); - } + void run(VkCommandBuffer cmd, const vk::buffer* data, u32 data_offset, u32 data_length, u32 zeta_offset, u32 stencil_offset); }; template @@ -549,21 +215,7 @@ namespace vk struct cs_scatter_d24x8 : cs_interleave_task { - cs_scatter_d24x8() - { - work_kernel = - " if (index >= block_length)\n" - " return;\n" - "\n" - " value = data[index];\n" - " data[index + z_offset] = (value >> 8);\n" - " stencil_offset = (index / 4);\n" - " stencil_shift = (index % 4) * 8;\n" - " stencil = (value & 0xFF) << stencil_shift;\n" - " atomicOr(data[stencil_offset + s_offset], stencil);\n"; - - cs_shuffle_base::build(""); - } + cs_scatter_d24x8(); }; template @@ -962,51 +614,11 
@@ namespace vk u32 block_length = 0; u32 word_count = 0; - cs_aggregator() - { - ssbo_count = 2; + cs_aggregator(); - create(); + void bind_resources() override; - m_src = - "#version 450\n" - "layout(local_size_x = %ws, local_size_y = 1, local_size_z = 1) in;\n\n" - - "layout(set=0, binding=0, std430) readonly buffer ssbo0{ uint src[]; };\n" - "layout(set=0, binding=1, std430) writeonly buffer ssbo1{ uint result; };\n\n" - - "void main()\n" - "{\n" - " if (gl_GlobalInvocationID.x < src.length())\n" - " {\n" - " atomicAdd(result, src[gl_GlobalInvocationID.x]);\n" - " }\n" - "}\n"; - - const std::pair syntax_replace[] = - { - { "%ws", std::to_string(optimal_group_size) }, - }; - - m_src = fmt::replace_all(m_src, syntax_replace); - } - - void bind_resources() override - { - m_program->bind_buffer({ src->value, 0, block_length }, 0, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, m_descriptor_set); - m_program->bind_buffer({ dst->value, 0, 4 }, 1, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, m_descriptor_set); - } - - void run(VkCommandBuffer cmd, const vk::buffer* dst, const vk::buffer* src, u32 num_words) - { - this->dst = dst; - this->src = src; - word_count = num_words; - block_length = num_words * 4; - - const u32 linear_invocations = utils::aligned_div(word_count, optimal_group_size); - compute_task::run(cmd, linear_invocations); - } + void run(VkCommandBuffer cmd, const vk::buffer* dst, const vk::buffer* src, u32 num_words); }; // TODO: Replace with a proper manager diff --git a/rpcs3/Emu/RSX/VK/VKFramebuffer.cpp b/rpcs3/Emu/RSX/VK/VKFramebuffer.cpp index 7583111176..aeb7bd17b4 100644 --- a/rpcs3/Emu/RSX/VK/VKFramebuffer.cpp +++ b/rpcs3/Emu/RSX/VK/VKFramebuffer.cpp @@ -10,7 +10,7 @@ namespace vk { std::unordered_map>> g_framebuffers_cache; - vk::framebuffer_holder *get_framebuffer(VkDevice dev, u16 width, u16 height, VkRenderPass renderpass, const std::vector& image_list) + vk::framebuffer_holder* get_framebuffer(VkDevice dev, u16 width, u16 height, VkRenderPass renderpass, const 
std::vector& image_list) { u64 key = u64(width) | (u64(height) << 16); auto &queue = g_framebuffers_cache[key]; diff --git a/rpcs3/Emu/RSX/VK/VKOverlays.cpp b/rpcs3/Emu/RSX/VK/VKOverlays.cpp new file mode 100644 index 0000000000..ffb9e3e2a3 --- /dev/null +++ b/rpcs3/Emu/RSX/VK/VKOverlays.cpp @@ -0,0 +1,1049 @@ +#include "VKOverlays.h" +#include "VKRenderTargets.h" +#include "VKFramebuffer.h" +#include "VKResourceManager.h" +#include "VKRenderPass.h" +#include "VKPipelineCompiler.h" + +#include "vkutils/image.h" +#include "vkutils/image_helpers.h" +#include "vkutils/sampler.h" +#include "vkutils/scratch.h" + +#include "../Overlays/overlays.h" + +#include "util/fnv_hash.hpp" + +#define VK_OVERLAY_MAX_DRAW_CALLS 1024 + +namespace vk +{ + overlay_pass::overlay_pass() + { + // Override-able defaults + renderpass_config.set_primitive_type(VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP); + } + + overlay_pass::~overlay_pass() + { + m_vao.destroy(); + m_ubo.destroy(); + } + + u64 overlay_pass::get_pipeline_key(VkRenderPass pass) + { + if (!multi_primitive) + { + // Default fast path + return reinterpret_cast(pass); + } + else + { + struct + { + u64 pass_value; + u64 config; + } + key{ reinterpret_cast(pass), static_cast(renderpass_config.ia.topology) }; + return rpcs3::hash_struct(key); + } + } + + void overlay_pass::check_heap() + { + if (!m_vao.heap) + { + m_vao.create(VK_BUFFER_USAGE_VERTEX_BUFFER_BIT, 1 * 0x100000, "overlays VAO", 128); + m_ubo.create(VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT, 8 * 0x100000, "overlays UBO", 128); + } + } + + void overlay_pass::init_descriptors() + { + VkDescriptorPoolSize descriptor_pool_sizes[2] = + { + { VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, VK_OVERLAY_MAX_DRAW_CALLS * m_num_usable_samplers }, + { VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, VK_OVERLAY_MAX_DRAW_CALLS }, + }; + + // Reserve descriptor pools + m_descriptor_pool.create(*m_device, descriptor_pool_sizes, 2, VK_OVERLAY_MAX_DRAW_CALLS, 2); + + std::vector bindings(1 + m_num_usable_samplers); + + 
bindings[0].descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER; + bindings[0].descriptorCount = 1; + bindings[0].stageFlags = VK_SHADER_STAGE_VERTEX_BIT | VK_SHADER_STAGE_FRAGMENT_BIT; + bindings[0].binding = 0; + bindings[0].pImmutableSamplers = nullptr; + + for (u32 n = 1; n <= m_num_usable_samplers; ++n) + { + bindings[n].descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER; + bindings[n].descriptorCount = 1; + bindings[n].stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT; + bindings[n].binding = n; + bindings[n].pImmutableSamplers = nullptr; + } + + VkDescriptorSetLayoutCreateInfo infos = {}; + infos.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO; + infos.pBindings = bindings.data(); + infos.bindingCount = 1 + m_num_usable_samplers; + + CHECK_RESULT(vkCreateDescriptorSetLayout(*m_device, &infos, nullptr, &m_descriptor_layout)); + + VkPipelineLayoutCreateInfo layout_info = {}; + layout_info.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO; + layout_info.setLayoutCount = 1; + layout_info.pSetLayouts = &m_descriptor_layout; + + std::vector push_constants = get_push_constants(); + if (!push_constants.empty()) + { + layout_info.pushConstantRangeCount = u32(push_constants.size()); + layout_info.pPushConstantRanges = push_constants.data(); + } + + CHECK_RESULT(vkCreatePipelineLayout(*m_device, &layout_info, nullptr, &m_pipeline_layout)); + } + + std::vector overlay_pass::get_vertex_inputs() + { + check_heap(); + return{}; + } + + std::vector overlay_pass::get_fragment_inputs() + { + std::vector fs_inputs; + fs_inputs.push_back({ ::glsl::program_domain::glsl_fragment_program, vk::glsl::program_input_type::input_type_uniform_buffer,{},{}, 0, "static_data" }); + + for (u32 n = 1; n <= m_num_usable_samplers; ++n) + { + fs_inputs.push_back({ ::glsl::program_domain::glsl_fragment_program, vk::glsl::program_input_type::input_type_texture,{},{}, n, "fs" + std::to_string(n-1) }); + } + + return fs_inputs; + } + + vk::glsl::program* 
overlay_pass::build_pipeline(u64 storage_key, VkRenderPass render_pass) + { + if (!compiled) + { + m_vertex_shader.create(::glsl::program_domain::glsl_vertex_program, vs_src); + m_vertex_shader.compile(); + + m_fragment_shader.create(::glsl::program_domain::glsl_fragment_program, fs_src); + m_fragment_shader.compile(); + + compiled = true; + } + + VkPipelineShaderStageCreateInfo shader_stages[2] = {}; + shader_stages[0].sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO; + shader_stages[0].stage = VK_SHADER_STAGE_VERTEX_BIT; + shader_stages[0].module = m_vertex_shader.get_handle(); + shader_stages[0].pName = "main"; + + shader_stages[1].sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO; + shader_stages[1].stage = VK_SHADER_STAGE_FRAGMENT_BIT; + shader_stages[1].module = m_fragment_shader.get_handle(); + shader_stages[1].pName = "main"; + + std::vector dynamic_state_descriptors; + dynamic_state_descriptors.push_back(VK_DYNAMIC_STATE_VIEWPORT); + dynamic_state_descriptors.push_back(VK_DYNAMIC_STATE_SCISSOR); + get_dynamic_state_entries(dynamic_state_descriptors); + + VkPipelineDynamicStateCreateInfo dynamic_state_info = {}; + dynamic_state_info.sType = VK_STRUCTURE_TYPE_PIPELINE_DYNAMIC_STATE_CREATE_INFO; + dynamic_state_info.dynamicStateCount = ::size32(dynamic_state_descriptors); + dynamic_state_info.pDynamicStates = dynamic_state_descriptors.data(); + + VkVertexInputBindingDescription vb = { 0, 16, VK_VERTEX_INPUT_RATE_VERTEX }; + VkVertexInputAttributeDescription via = { 0, 0, VK_FORMAT_R32G32B32A32_SFLOAT, 0 }; + VkPipelineVertexInputStateCreateInfo vi = {}; + vi.sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO; + vi.vertexBindingDescriptionCount = 1; + vi.pVertexBindingDescriptions = &vb; + vi.vertexAttributeDescriptionCount = 1; + vi.pVertexAttributeDescriptions = &via; + + VkPipelineViewportStateCreateInfo vp = {}; + vp.sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO; + vp.scissorCount = 1; + vp.viewportCount 
= 1; + + VkGraphicsPipelineCreateInfo info = {}; + info.sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO; + info.pVertexInputState = &vi; + info.pInputAssemblyState = &renderpass_config.ia; + info.pRasterizationState = &renderpass_config.rs; + info.pColorBlendState = &renderpass_config.cs; + info.pMultisampleState = &renderpass_config.ms; + info.pViewportState = &vp; + info.pDepthStencilState = &renderpass_config.ds; + info.stageCount = 2; + info.pStages = shader_stages; + info.pDynamicState = &dynamic_state_info; + info.layout = m_pipeline_layout; + info.basePipelineIndex = -1; + info.basePipelineHandle = VK_NULL_HANDLE; + info.renderPass = render_pass; + + auto compiler = vk::get_pipe_compiler(); + auto program = compiler->compile(info, m_pipeline_layout, vk::pipe_compiler::COMPILE_INLINE, {}, get_vertex_inputs(), get_fragment_inputs()); + auto result = program.get(); + m_program_cache[storage_key] = std::move(program); + + return result; + } + + void overlay_pass::load_program(vk::command_buffer& cmd, VkRenderPass pass, const std::vector& src) + { + vk::glsl::program *program = nullptr; + const auto key = get_pipeline_key(pass); + + auto found = m_program_cache.find(key); + if (found != m_program_cache.end()) + program = found->second.get(); + else + program = build_pipeline(key, pass); + + ensure(m_used_descriptors < VK_OVERLAY_MAX_DRAW_CALLS); + + VkDescriptorSetAllocateInfo alloc_info = {}; + alloc_info.descriptorPool = m_descriptor_pool; + alloc_info.descriptorSetCount = 1; + alloc_info.pSetLayouts = &m_descriptor_layout; + alloc_info.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO; + + CHECK_RESULT(vkAllocateDescriptorSets(*m_device, &alloc_info, &m_descriptor_set)); + m_used_descriptors++; + + if (!m_sampler) + { + m_sampler = std::make_unique(*m_device, + VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE, VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE, VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE, + VK_FALSE, 0.f, 1.f, 0.f, 0.f, m_sampler_filter, m_sampler_filter, 
VK_SAMPLER_MIPMAP_MODE_NEAREST, VK_BORDER_COLOR_FLOAT_OPAQUE_BLACK); + } + + update_uniforms(cmd, program); + + program->bind_uniform({ m_ubo.heap->value, m_ubo_offset, std::max(m_ubo_length, 4u) }, 0, m_descriptor_set); + + for (uint n = 0; n < src.size(); ++n) + { + VkDescriptorImageInfo info = { m_sampler->value, src[n]->value, src[n]->image()->current_layout }; + program->bind_uniform(info, "fs" + std::to_string(n), VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, m_descriptor_set); + } + + vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS, program->pipeline); + vkCmdBindDescriptorSets(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS, m_pipeline_layout, 0, 1, &m_descriptor_set, 0, nullptr); + + VkBuffer buffers = m_vao.heap->value; + VkDeviceSize offsets = m_vao_offset; + vkCmdBindVertexBuffers(cmd, 0, 1, &buffers, &offsets); + } + + void overlay_pass::create(const vk::render_device& dev) + { + if (!initialized) + { + m_device = &dev; + init_descriptors(); + + initialized = true; + } + } + + void overlay_pass::destroy() + { + if (initialized) + { + m_vertex_shader.destroy(); + m_fragment_shader.destroy(); + m_program_cache.clear(); + m_sampler.reset(); + + vkDestroyDescriptorSetLayout(*m_device, m_descriptor_layout, nullptr); + vkDestroyPipelineLayout(*m_device, m_pipeline_layout, nullptr); + m_descriptor_pool.destroy(); + + initialized = false; + } + } + + void overlay_pass::free_resources() + { + if (m_used_descriptors == 0) + return; + + m_descriptor_pool.reset(0); + m_used_descriptors = 0; + + m_vao.reset_allocation_stats(); + m_ubo.reset_allocation_stats(); + } + + vk::framebuffer* overlay_pass::get_framebuffer(vk::image* target, VkRenderPass render_pass) + { + VkDevice dev = (*vk::get_current_renderer()); + return vk::get_framebuffer(dev, target->width(), target->height(), render_pass, { target }); + } + + void overlay_pass::emit_geometry(vk::command_buffer& cmd) + { + vkCmdDraw(cmd, num_drawable_elements, 1, first_vertex, 0); + } + + void 
overlay_pass::set_up_viewport(vk::command_buffer& cmd, u32 x, u32 y, u32 w, u32 h) + { + VkViewport vp{}; + vp.x = static_cast(x); + vp.y = static_cast(y); + vp.width = static_cast(w); + vp.height = static_cast(h); + vp.minDepth = 0.f; + vp.maxDepth = 1.f; + vkCmdSetViewport(cmd, 0, 1, &vp); + + VkRect2D vs = { { static_cast(x), static_cast(y) }, { w, h } }; + vkCmdSetScissor(cmd, 0, 1, &vs); + } + + void overlay_pass::run(vk::command_buffer& cmd, const areau& viewport, vk::framebuffer* fbo, const std::vector& src, VkRenderPass render_pass) + { + load_program(cmd, render_pass, src); + set_up_viewport(cmd, viewport.x1, viewport.y1, viewport.width(), viewport.height()); + + vk::begin_renderpass(cmd, render_pass, fbo->value, viewport); + emit_geometry(cmd); + } + + void overlay_pass::run(vk::command_buffer& cmd, const areau& viewport, vk::image* target, const std::vector& src, VkRenderPass render_pass) + { + auto fbo = static_cast(get_framebuffer(target, render_pass)); + fbo->add_ref(); + + run(cmd, viewport, fbo, src, render_pass); + fbo->release(); + } + + void overlay_pass::run(vk::command_buffer& cmd, const areau& viewport, vk::image* target, vk::image_view* src, VkRenderPass render_pass) + { + std::vector views = { src }; + run(cmd, viewport, target, views, render_pass); + } + + ui_overlay_renderer::ui_overlay_renderer() + { + vs_src = + "#version 450\n" + "#extension GL_ARB_separate_shader_objects : enable\n" + "layout(location=0) in vec4 in_pos;\n" + "layout(std140, set=0, binding=0) uniform static_data{ vec4 regs[8]; };\n" + "layout(location=0) out vec2 tc0;\n" + "layout(location=1) out vec4 color;\n" + "layout(location=2) out vec4 parameters;\n" + "layout(location=3) out vec4 clip_rect;\n" + "layout(location=4) out vec4 parameters2;\n" + "\n" + "vec2 snap_to_grid(const in vec2 normalized)\n" + "{\n" + " return (floor(normalized * regs[5].xy) + 0.5) / regs[5].xy;\n" + "}\n" + "\n" + "vec4 clip_to_ndc(const in vec4 coord)\n" + "{\n" + " return (coord * 
regs[0].zwzw) / regs[0].xyxy;\n" + "}\n" + "\n" + "vec4 ndc_to_window(const in vec4 coord)\n" + "{\n" + " return fma(coord, regs[5].xyxy, regs[5].zwzw);\n" + "}\n" + "\n" + "void main()\n" + "{\n" + " tc0.xy = in_pos.zw;\n" + " color = regs[1];\n" + " parameters = regs[2];\n" + " parameters2 = regs[4];\n" + " clip_rect = ndc_to_window(clip_to_ndc(regs[3]));\n" + " vec4 pos = vec4(clip_to_ndc(in_pos).xy, 0.5, 1.);\n" + " pos.xy = snap_to_grid(pos.xy);\n" + " gl_Position = (pos + pos) - 1.;\n" + "}\n"; + + fs_src = + "#version 420\n" + "#extension GL_ARB_separate_shader_objects : enable\n" + "layout(set=0, binding=1) uniform sampler2D fs0;\n" + "layout(set=0, binding=2) uniform sampler2DArray fs1;\n" + "layout(location=0) in vec2 tc0;\n" + "layout(location=1) in vec4 color;\n" + "layout(location=2) in vec4 parameters;\n" + "layout(location=3) in vec4 clip_rect;\n" + "layout(location=4) in vec4 parameters2;\n" + "layout(location=0) out vec4 ocol;\n" + "\n" + "vec4 blur_sample(sampler2D tex, vec2 coord, vec2 tex_offset)\n" + "{\n" + " vec2 coords[9];\n" + " coords[0] = coord - tex_offset\n;" + " coords[1] = coord + vec2(0., -tex_offset.y);\n" + " coords[2] = coord + vec2(tex_offset.x, -tex_offset.y);\n" + " coords[3] = coord + vec2(-tex_offset.x, 0.);\n" + " coords[4] = coord;\n" + " coords[5] = coord + vec2(tex_offset.x, 0.);\n" + " coords[6] = coord + vec2(-tex_offset.x, tex_offset.y);\n" + " coords[7] = coord + vec2(0., tex_offset.y);\n" + " coords[8] = coord + tex_offset;\n" + "\n" + " float weights[9] =\n" + " {\n" + " 1., 2., 1.,\n" + " 2., 4., 2.,\n" + " 1., 2., 1.\n" + " };\n" + "\n" + " vec4 blurred = vec4(0.);\n" + " for (int n = 0; n < 9; ++n)\n" + " {\n" + " blurred += texture(tex, coords[n]) * weights[n];\n" + " }\n" + "\n" + " return blurred / 16.f;\n" + "}\n" + "\n" + "vec4 sample_image(sampler2D tex, vec2 coord, float blur_strength)\n" + "{\n" + " vec4 original = texture(tex, coord);\n" + " if (blur_strength == 0) return original;\n" + " \n" + " vec2 
constraints = 1.f / vec2(640, 360);\n" + " vec2 res_offset = 1.f / textureSize(fs0, 0);\n" + " vec2 tex_offset = max(res_offset, constraints);\n" + "\n" + " // Sample triangle pattern and average\n" + " // TODO: Nicer looking gaussian blur with less sampling\n" + " vec4 blur0 = blur_sample(tex, coord + vec2(-res_offset.x, 0.), tex_offset);\n" + " vec4 blur1 = blur_sample(tex, coord + vec2(res_offset.x, 0.), tex_offset);\n" + " vec4 blur2 = blur_sample(tex, coord + vec2(0., res_offset.y), tex_offset);\n" + "\n" + " vec4 blurred = blur0 + blur1 + blur2;\n" + " blurred /= 3.;\n" + " return mix(original, blurred, blur_strength);\n" + "}\n" + "\n" + "void main()\n" + "{\n" + " if (parameters.w != 0)\n" + " {" + " if (gl_FragCoord.x < clip_rect.x || gl_FragCoord.x > clip_rect.z ||\n" + " gl_FragCoord.y < clip_rect.y || gl_FragCoord.y > clip_rect.w)\n" + " {\n" + " discard;\n" + " return;\n" + " }\n" + " }\n" + "\n" + " vec4 diff_color = color;\n" + " if (parameters.y != 0)\n" + " diff_color.a *= (sin(parameters.x) + 1.f) * 0.5f;\n" + "\n" + " if (parameters.z < 1.)\n" + " {\n" + " ocol = diff_color;\n" + " }\n" + " else if (parameters.z > 2.)\n" + " {\n" + " ocol = texture(fs1, vec3(tc0.x, fract(tc0.y), trunc(tc0.y))).rrrr * diff_color;\n" + " }\n" + " else if (parameters.z > 1.)\n" + " {\n" + " ocol = texture(fs0, tc0).rrrr * diff_color;\n" + " }\n" + " else\n" + " {\n" + " ocol = sample_image(fs0, tc0, parameters2.x).bgra * diff_color;\n" + " }\n" + "}\n"; + + // Allow mixed primitive rendering + multi_primitive = true; + + // 2 input textures + m_num_usable_samplers = 2; + + renderpass_config.set_attachment_count(1); + renderpass_config.set_color_mask(0, true, true, true, true); + renderpass_config.set_depth_mask(false); + renderpass_config.enable_blend(0, + VK_BLEND_FACTOR_SRC_ALPHA, VK_BLEND_FACTOR_SRC_ALPHA, + VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA, VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA, + VK_BLEND_OP_ADD, VK_BLEND_OP_ADD); + } + + vk::image_view* 
ui_overlay_renderer::upload_simple_texture(vk::render_device& dev, vk::command_buffer& cmd, + vk::data_heap& upload_heap, u64 key, u32 w, u32 h, u32 layers, bool font, bool temp, void* pixel_src, u32 owner_uid) + { + const VkFormat format = (font) ? VK_FORMAT_R8_UNORM : VK_FORMAT_B8G8R8A8_UNORM; + const u32 pitch = (font) ? w : w * 4; + const u32 data_size = pitch * h * layers; + const auto offset = upload_heap.alloc<512>(data_size); + const auto addr = upload_heap.map(offset, data_size); + + const VkImageSubresourceRange range = { VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, layers }; + + auto tex = std::make_unique(dev, dev.get_memory_mapping().device_local, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, + VK_IMAGE_TYPE_2D, format, std::max(w, 1u), std::max(h, 1u), 1, 1, layers, VK_SAMPLE_COUNT_1_BIT, VK_IMAGE_LAYOUT_UNDEFINED, + VK_IMAGE_TILING_OPTIMAL, VK_IMAGE_USAGE_TRANSFER_DST_BIT | VK_IMAGE_USAGE_TRANSFER_SRC_BIT | VK_IMAGE_USAGE_SAMPLED_BIT, + 0); + + if (pixel_src && data_size) + std::memcpy(addr, pixel_src, data_size); + else if (data_size) + std::memset(addr, 0, data_size); + + upload_heap.unmap(); + + VkBufferImageCopy region; + region.imageSubresource = { VK_IMAGE_ASPECT_COLOR_BIT, 0, 0, layers }; + region.bufferOffset = offset; + region.bufferRowLength = w; + region.bufferImageHeight = h; + region.imageOffset = {}; + region.imageExtent = { static_cast(w), static_cast(h), 1u }; + + change_image_layout(cmd, tex.get(), VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, range); + vkCmdCopyBufferToImage(cmd, upload_heap.heap->value, tex->value, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 1, ®ion); + change_image_layout(cmd, tex.get(), VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, range); + + auto view = std::make_unique(dev, tex.get()); + + auto result = view.get(); + + if (!temp || font) + view_cache[key] = std::move(view); + else + temp_view_cache[key] = std::move(view); + + if (font) + font_cache[key] = std::move(tex); + else if (!temp) + resources.push_back(std::move(tex)); + else + 
temp_image_cache[key] = std::make_pair(owner_uid, std::move(tex)); + + return result; + } + + void ui_overlay_renderer::init(vk::command_buffer& cmd, vk::data_heap& upload_heap) + { + rsx::overlays::resource_config configuration; + configuration.load_files(); + + auto& dev = cmd.get_command_pool().get_owner(); + u64 storage_key = 1; + + for (const auto &res : configuration.texture_raw_data) + { + upload_simple_texture(dev, cmd, upload_heap, storage_key++, res->w, res->h, 1, false, false, res->data, UINT32_MAX); + } + + configuration.free_resources(); + } + + void ui_overlay_renderer::destroy() + { + temp_image_cache.clear(); + temp_view_cache.clear(); + + resources.clear(); + font_cache.clear(); + view_cache.clear(); + + overlay_pass::destroy(); + } + + void ui_overlay_renderer::remove_temp_resources(u32 key) + { + std::vector keys_to_remove; + for (const auto& temp_image : temp_image_cache) + { + if (temp_image.second.first == key) + { + keys_to_remove.push_back(temp_image.first); + } + } + + for (const auto& _key : keys_to_remove) + { + temp_image_cache.erase(_key); + temp_view_cache.erase(_key); + } + } + + vk::image_view* ui_overlay_renderer::find_font(rsx::overlays::font* font, vk::command_buffer& cmd, vk::data_heap& upload_heap) + { + const auto image_size = font->get_glyph_data_dimensions(); + + u64 key = reinterpret_cast(font); + auto found = view_cache.find(key); + if (found != view_cache.end()) + { + if (const auto raw = found->second->image(); + image_size.width == raw->width() && + image_size.height == raw->height() && + image_size.depth == raw->layers()) + { + return found->second.get(); + } + else + { + auto gc = vk::get_resource_manager(); + gc->dispose(font_cache[key]); + gc->dispose(view_cache[key]); + } + } + + // Create font resource + std::vector bytes; + font->get_glyph_data(bytes); + + return upload_simple_texture(cmd.get_command_pool().get_owner(), cmd, upload_heap, key, image_size.width, image_size.height, image_size.depth, + true, false, 
bytes.data(), UINT32_MAX); + } + + vk::image_view* ui_overlay_renderer::find_temp_image(rsx::overlays::image_info* desc, vk::command_buffer& cmd, vk::data_heap& upload_heap, u32 owner_uid) + { + u64 key = reinterpret_cast(desc); + auto found = temp_view_cache.find(key); + if (found != temp_view_cache.end()) + return found->second.get(); + + return upload_simple_texture(cmd.get_command_pool().get_owner(), cmd, upload_heap, key, desc->w, desc->h, 1, + false, true, desc->data, owner_uid); + } + + void ui_overlay_renderer::update_uniforms(vk::command_buffer& /*cmd*/, vk::glsl::program* /*program*/) + { + m_ubo_offset = static_cast(m_ubo.alloc<256>(128)); + auto dst = static_cast(m_ubo.map(m_ubo_offset, 128)); + + // regs[0] = scaling parameters + dst[0] = m_scale_offset.r; + dst[1] = m_scale_offset.g; + dst[2] = m_scale_offset.b; + dst[3] = m_scale_offset.a; + + // regs[1] = color + dst[4] = m_color.r; + dst[5] = m_color.g; + dst[6] = m_color.b; + dst[7] = m_color.a; + + // regs[2] = fs config parameters + dst[8] = m_time; + dst[9] = m_pulse_glow? 1.f : 0.f; + dst[10] = m_skip_texture_read? 0.f : static_cast(m_texture_type); + dst[11] = m_clip_enabled ? 
1.f : 0.f; + + // regs[3] = clip rect + dst[12] = m_clip_region.x1; + dst[13] = m_clip_region.y1; + dst[14] = m_clip_region.x2; + dst[15] = m_clip_region.y2; + + // regs[4] = fs config parameters 2 + dst[16] = m_blur_strength; + + // regs[5] = viewport + dst[20] = m_viewport.width; + dst[21] = m_viewport.height; + dst[22] = m_viewport.x; + dst[23] = m_viewport.y; + + m_ubo.unmap(); + } + + void ui_overlay_renderer::set_primitive_type(rsx::overlays::primitive_type type) + { + m_current_primitive_type = type; + + switch (type) + { + case rsx::overlays::primitive_type::quad_list: + case rsx::overlays::primitive_type::triangle_strip: + renderpass_config.set_primitive_type(VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP); + break; + case rsx::overlays::primitive_type::line_list: + renderpass_config.set_primitive_type(VK_PRIMITIVE_TOPOLOGY_LINE_LIST); + break; + case rsx::overlays::primitive_type::line_strip: + renderpass_config.set_primitive_type(VK_PRIMITIVE_TOPOLOGY_LINE_STRIP); + break; + default: + fmt::throw_exception("Unexpected primitive type %d", static_cast(type)); + } + } + + void ui_overlay_renderer::emit_geometry(vk::command_buffer& cmd) + { + if (m_current_primitive_type == rsx::overlays::primitive_type::quad_list) + { + // Emulate quads with disjointed triangle strips + u32 first = 0; + u32 num_quads = num_drawable_elements / 4; + + for (u32 n = 0; n < num_quads; ++n) + { + vkCmdDraw(cmd, 4, 1, first, 0); + first += 4; + } + } + else + { + overlay_pass::emit_geometry(cmd); + } + } + + void ui_overlay_renderer::run(vk::command_buffer& cmd, const areau& viewport, vk::framebuffer* target, VkRenderPass render_pass, + vk::data_heap& upload_heap, rsx::overlays::overlay& ui) + { + m_scale_offset = color4f(ui.virtual_width, ui.virtual_height, 1.f, 1.f); + m_time = static_cast(get_system_time() / 1000) * 0.005f; + m_viewport = { { static_cast(viewport.x1), static_cast(viewport.y1) }, { static_cast(viewport.width()), static_cast(viewport.height()) } }; + + std::vector 
image_views + { + vk::null_image_view(cmd, VK_IMAGE_VIEW_TYPE_2D), + vk::null_image_view(cmd, VK_IMAGE_VIEW_TYPE_2D_ARRAY) + }; + + for (auto& command : ui.get_compiled().draw_commands) + { + num_drawable_elements = static_cast(command.verts.size()); + + upload_vertex_data(command.verts.data(), num_drawable_elements); + set_primitive_type(command.config.primitives); + + m_skip_texture_read = false; + m_color = command.config.color; + m_pulse_glow = command.config.pulse_glow; + m_blur_strength = static_cast(command.config.blur_strength) * 0.01f; + m_clip_enabled = command.config.clip_region; + m_clip_region = command.config.clip_rect; + m_texture_type = 1; + + vk::image_view* src = nullptr; + switch (command.config.texture_ref) + { + case rsx::overlays::image_resource_id::game_icon: + case rsx::overlays::image_resource_id::backbuffer: + // TODO + case rsx::overlays::image_resource_id::none: + m_skip_texture_read = true; + break; + case rsx::overlays::image_resource_id::font_file: + src = find_font(command.config.font_ref, cmd, upload_heap); + m_texture_type = src->image()->layers() == 1 ? 2 : 3; + break; + case rsx::overlays::image_resource_id::raw_image: + src = find_temp_image(static_cast(command.config.external_data_ref), cmd, upload_heap, ui.uid); + break; + default: + src = view_cache[command.config.texture_ref].get(); + break; + } + + if (src) + { + const int res_id = src->image()->layers() > 1 ? 
1 : 0; + image_views[res_id] = src; + } + + overlay_pass::run(cmd, viewport, target, image_views, render_pass); + } + + ui.update(); + } + + attachment_clear_pass::attachment_clear_pass() + { + vs_src = + "#version 450\n" + "#extension GL_ARB_separate_shader_objects : enable\n" + "layout(push_constant) uniform static_data{ vec4 regs[2]; };\n" + "layout(location=0) out vec2 tc0;\n" + "layout(location=1) out vec4 color;\n" + "layout(location=2) out vec4 mask;\n" + "\n" + "void main()\n" + "{\n" + " vec2 positions[] = {vec2(-1., -1.), vec2(1., -1.), vec2(-1., 1.), vec2(1., 1.)};\n" + " vec2 coords[] = {vec2(0., 0.), vec2(1., 0.), vec2(0., 1.), vec2(1., 1.)};\n" + " tc0 = coords[gl_VertexIndex % 4];\n" + " color = regs[0];\n" + " mask = regs[1];\n" + " gl_Position = vec4(positions[gl_VertexIndex % 4], 0., 1.);\n" + "}\n"; + + fs_src = + "#version 420\n" + "#extension GL_ARB_separate_shader_objects : enable\n" + "layout(set=0, binding=1) uniform sampler2D fs0;\n" + "layout(location=0) in vec2 tc0;\n" + "layout(location=1) in vec4 color;\n" + "layout(location=2) in vec4 mask;\n" + "layout(location=0) out vec4 out_color;\n" + "\n" + "void main()\n" + "{\n" + " vec4 original_color = texture(fs0, tc0);\n" + " out_color = mix(original_color, color, bvec4(mask));\n" + "}\n"; + + renderpass_config.set_depth_mask(false); + renderpass_config.set_color_mask(0, true, true, true, true); + renderpass_config.set_attachment_count(1); + } + + std::vector attachment_clear_pass::get_push_constants() + { + VkPushConstantRange constant; + constant.stageFlags = VK_SHADER_STAGE_VERTEX_BIT; + constant.offset = 0; + constant.size = 32; + + return { constant }; + } + + void attachment_clear_pass::update_uniforms(vk::command_buffer& cmd, vk::glsl::program* /*program*/) + { + f32 data[8]; + data[0] = clear_color.r; + data[1] = clear_color.g; + data[2] = clear_color.b; + data[3] = clear_color.a; + data[4] = colormask.r; + data[5] = colormask.g; + data[6] = colormask.b; + data[7] = colormask.a; + + 
vkCmdPushConstants(cmd, m_pipeline_layout, VK_SHADER_STAGE_VERTEX_BIT, 0, 32, data); + } + + void attachment_clear_pass::set_up_viewport(vk::command_buffer& cmd, u32 x, u32 y, u32 w, u32 h) + { + VkViewport vp{}; + vp.x = static_cast(x); + vp.y = static_cast(y); + vp.width = static_cast(w); + vp.height = static_cast(h); + vp.minDepth = 0.f; + vp.maxDepth = 1.f; + vkCmdSetViewport(cmd, 0, 1, &vp); + + vkCmdSetScissor(cmd, 0, 1, ®ion); + } + + bool attachment_clear_pass::update_config(u32 clearmask, color4f color) + { + color4f mask = { 0.f, 0.f, 0.f, 0.f }; + if (clearmask & 0x10) mask.r = 1.f; + if (clearmask & 0x20) mask.g = 1.f; + if (clearmask & 0x40) mask.b = 1.f; + if (clearmask & 0x80) mask.a = 1.f; + + if (mask != colormask || color != clear_color) + { + colormask = mask; + clear_color = color; + return true; + } + + return false; + } + + void attachment_clear_pass::run(vk::command_buffer& cmd, vk::render_target* target, VkRect2D rect, VkRenderPass render_pass) + { + region = rect; + target->read_barrier(cmd); + + // Coverage sampling disabled, but actually report correct number of samples + renderpass_config.set_multisample_state(target->samples(), 0xFFFF, false, false, false); + + overlay_pass::run(cmd, { 0, 0, target->width(), target->height() }, target, + target->get_view(0xAAE4, rsx::default_remap_vector), render_pass); + } + + stencil_clear_pass::stencil_clear_pass() + { + vs_src = + "#version 450\n" + "#extension GL_ARB_separate_shader_objects : enable\n" + "\n" + "void main()\n" + "{\n" + " vec2 positions[] = {vec2(-1., -1.), vec2(1., -1.), vec2(-1., 1.), vec2(1., 1.)};\n" + " gl_Position = vec4(positions[gl_VertexIndex % 4], 0., 1.);\n" + "}\n"; + + fs_src = + "#version 420\n" + "#extension GL_ARB_separate_shader_objects : enable\n" + "layout(location=0) out vec4 out_color;\n" + "\n" + "void main()\n" + "{\n" + " out_color = vec4(0.);\n" + "}\n"; + } + + void stencil_clear_pass::set_up_viewport(vk::command_buffer& cmd, u32 x, u32 y, u32 w, u32 h) + 
{ + VkViewport vp{}; + vp.x = static_cast(x); + vp.y = static_cast(y); + vp.width = static_cast(w); + vp.height = static_cast(h); + vp.minDepth = 0.f; + vp.maxDepth = 1.f; + vkCmdSetViewport(cmd, 0, 1, &vp); + + vkCmdSetScissor(cmd, 0, 1, ®ion); + } + + void stencil_clear_pass::run(vk::command_buffer& cmd, vk::render_target* target, VkRect2D rect, u32 stencil_clear, u32 stencil_write_mask, VkRenderPass render_pass) + { + region = rect; + + // Stencil setup. Replace all pixels in the scissor region with stencil_clear with the correct write mask. + renderpass_config.enable_stencil_test( + VK_STENCIL_OP_REPLACE, VK_STENCIL_OP_REPLACE, VK_STENCIL_OP_REPLACE, // Always replace + VK_COMPARE_OP_ALWAYS, // Always pass + 0xFF, // Full write-through + stencil_clear); // Write active bit + + renderpass_config.set_stencil_mask(stencil_write_mask); + renderpass_config.set_depth_mask(false); + + // Coverage sampling disabled, but actually report correct number of samples + renderpass_config.set_multisample_state(target->samples(), 0xFFFF, false, false, false); + + overlay_pass::run(cmd, { 0, 0, target->width(), target->height() }, target, std::vector{}, render_pass); + } + + video_out_calibration_pass::video_out_calibration_pass() + { + vs_src = + "#version 450\n\n" + "layout(location=0) out vec2 tc0;\n" + "\n" + "void main()\n" + "{\n" + " vec2 positions[] = {vec2(-1., -1.), vec2(1., -1.), vec2(-1., 1.), vec2(1., 1.)};\n" + " vec2 coords[] = {vec2(0., 0.), vec2(1., 0.), vec2(0., 1.), vec2(1., 1.)};\n" + " tc0 = coords[gl_VertexIndex % 4];\n" + " vec2 pos = positions[gl_VertexIndex % 4];\n" + " gl_Position = vec4(pos, 0., 1.);\n" + "}\n"; + + fs_src = + "#version 420\n\n" + "layout(set=0, binding=1) uniform sampler2D fs0;\n" + "layout(set=0, binding=2) uniform sampler2D fs1;\n" + "layout(location=0) in vec2 tc0;\n" + "layout(location=0) out vec4 ocol;\n" + "\n" + "layout(push_constant) uniform static_data\n" + "{\n" + " float gamma;\n" + " int limit_range;\n" + " int stereo;\n" 
+ " int stereo_image_count;\n" + "};\n" + "\n" + "vec4 read_source()\n" + "{\n" + " if (stereo == 0) return texture(fs0, tc0);\n" + "\n" + " vec4 left, right;\n" + " if (stereo_image_count == 2)\n" + " {\n" + " left = texture(fs0, tc0);\n" + " right = texture(fs1, tc0);\n" + " }\n" + " else\n" + " {\n" + " vec2 coord_left = tc0 * vec2(1.f, 0.4898f);\n" + " vec2 coord_right = coord_left + vec2(0.f, 0.510204f);\n" + " left = texture(fs0, coord_left);\n" + " right = texture(fs0, coord_right);\n" + " }\n" + "\n" + " return vec4(left.r, right.g, right.b, 1.);\n" + "}\n" + "\n" + "void main()\n" + "{\n" + " vec4 color = read_source();\n" + " color.rgb = pow(color.rgb, vec3(gamma));\n" + " if (limit_range > 0)\n" + " ocol = ((color * 220.) + 16.) / 255.;\n" + " else\n" + " ocol = color;\n" + "}\n"; + + renderpass_config.set_depth_mask(false); + renderpass_config.set_color_mask(0, true, true, true, true); + renderpass_config.set_attachment_count(1); + + m_num_usable_samplers = 2; + } + + std::vector video_out_calibration_pass::get_push_constants() + { + VkPushConstantRange constant; + constant.stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT; + constant.offset = 0; + constant.size = 16; + + return { constant }; + } + + void video_out_calibration_pass::update_uniforms(vk::command_buffer& cmd, vk::glsl::program* /*program*/) + { + vkCmdPushConstants(cmd, m_pipeline_layout, VK_SHADER_STAGE_FRAGMENT_BIT, 0, 16, config.data); + } + + void video_out_calibration_pass::run(vk::command_buffer& cmd, const areau& viewport, vk::framebuffer* target, + const rsx::simple_array& src, f32 gamma, bool limited_rgb, bool _3d, VkRenderPass render_pass) + { + config.gamma = gamma; + config.limit_range = limited_rgb? 1 : 0; + config.stereo = _3d? 
1 : 0; + config.stereo_image_count = std::min(::size32(src), 2u); + + std::vector views; + views.reserve(2); + + for (auto& img : src) + { + // Only raw uploads can possibly have mismatched layout here + img->change_layout(cmd, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL); + views.push_back(img->get_view(VK_REMAP_IDENTITY, rsx::default_remap_vector)); + } + + if (views.size() < 2) + { + views.push_back(vk::null_image_view(cmd, VK_IMAGE_VIEW_TYPE_2D)); + } + + overlay_pass::run(cmd, viewport, target, views, render_pass); + } +} diff --git a/rpcs3/Emu/RSX/VK/VKOverlays.h b/rpcs3/Emu/RSX/VK/VKOverlays.h index 10a50e3107..97d357b10c 100644 --- a/rpcs3/Emu/RSX/VK/VKOverlays.h +++ b/rpcs3/Emu/RSX/VK/VKOverlays.h @@ -1,27 +1,41 @@ #pragma once -#include "VKVertexProgram.h" -#include "VKFragmentProgram.h" -#include "VKRenderTargets.h" -#include "VKFramebuffer.h" -#include "VKResourceManager.h" -#include "VKRenderPass.h" -#include "VKPipelineCompiler.h" + +#include "../Overlays/overlay_controls.h" +#include "VKProgramPipeline.h" +#include "VKHelpers.h" #include "vkutils/data_heap.h" -#include "vkutils/image.h" -#include "vkutils/image_helpers.h" -#include "vkutils/sampler.h" +#include "vkutils/descriptors.hpp" +#include "vkutils/graphics_pipeline_state.hpp" -#include "../Overlays/overlays.h" +#include "Emu/IdManager.h" #include -#include "util/fnv_hash.hpp" -#define VK_OVERLAY_MAX_DRAW_CALLS 1024 +namespace rsx +{ + namespace overlays + { + struct overlay; + } +} namespace vk { - //TODO: Refactor text print class to inherit from this base class + struct framebuffer; + struct sampler; + struct image_view; + class image; + class viewable_image; + class command_buffer; + class render_target; + + namespace glsl + { + class program; + } + + // TODO: Refactor text print class to inherit from this base class struct overlay_pass { vk::glsl::shader m_vertex_shader; @@ -59,121 +73,21 @@ namespace vk u32 m_ubo_offset = 0; u32 m_vao_offset = 0; - overlay_pass() - { - //Override-able 
defaults - renderpass_config.set_primitive_type(VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP); - } + overlay_pass(); + ~overlay_pass(); - ~overlay_pass() - { - m_vao.destroy(); - m_ubo.destroy(); - } + u64 get_pipeline_key(VkRenderPass pass); - u64 get_pipeline_key(VkRenderPass pass) - { - if (!multi_primitive) - { - // Default fast path - return reinterpret_cast(pass); - } - else - { - struct - { - u64 pass_value; - u64 config; - } - key{ reinterpret_cast(pass), static_cast(renderpass_config.ia.topology) }; - return rpcs3::hash_struct(key); - } - } + void check_heap(); - void check_heap() - { - if (!m_vao.heap) - { - m_vao.create(VK_BUFFER_USAGE_VERTEX_BUFFER_BIT, 1 * 0x100000, "overlays VAO", 128); - m_ubo.create(VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT, 8 * 0x100000, "overlays UBO", 128); - } - } + void init_descriptors(); - void init_descriptors() - { - VkDescriptorPoolSize descriptor_pool_sizes[2] = - { - { VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, VK_OVERLAY_MAX_DRAW_CALLS * m_num_usable_samplers }, - { VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, VK_OVERLAY_MAX_DRAW_CALLS }, - }; + virtual void update_uniforms(vk::command_buffer& /*cmd*/, vk::glsl::program* /*program*/) {} - //Reserve descriptor pools - m_descriptor_pool.create(*m_device, descriptor_pool_sizes, 2, VK_OVERLAY_MAX_DRAW_CALLS, 2); + virtual std::vector get_vertex_inputs(); + virtual std::vector get_fragment_inputs(); - std::vector bindings(1 + m_num_usable_samplers); - - bindings[0].descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER; - bindings[0].descriptorCount = 1; - bindings[0].stageFlags = VK_SHADER_STAGE_VERTEX_BIT | VK_SHADER_STAGE_FRAGMENT_BIT; - bindings[0].binding = 0; - bindings[0].pImmutableSamplers = nullptr; - - for (u32 n = 1; n <= m_num_usable_samplers; ++n) - { - bindings[n].descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER; - bindings[n].descriptorCount = 1; - bindings[n].stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT; - bindings[n].binding = n; - bindings[n].pImmutableSamplers = nullptr; - } - 
- VkDescriptorSetLayoutCreateInfo infos = {}; - infos.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO; - infos.pBindings = bindings.data(); - infos.bindingCount = 1 + m_num_usable_samplers; - - CHECK_RESULT(vkCreateDescriptorSetLayout(*m_device, &infos, nullptr, &m_descriptor_layout)); - - VkPipelineLayoutCreateInfo layout_info = {}; - layout_info.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO; - layout_info.setLayoutCount = 1; - layout_info.pSetLayouts = &m_descriptor_layout; - - std::vector push_constants = get_push_constants(); - if (!push_constants.empty()) - { - layout_info.pushConstantRangeCount = u32(push_constants.size()); - layout_info.pPushConstantRanges = push_constants.data(); - } - - CHECK_RESULT(vkCreatePipelineLayout(*m_device, &layout_info, nullptr, &m_pipeline_layout)); - } - - virtual void update_uniforms(vk::command_buffer& /*cmd*/, vk::glsl::program* /*program*/) - { - } - - virtual std::vector get_vertex_inputs() - { - check_heap(); - return{}; - } - - virtual std::vector get_fragment_inputs() - { - std::vector fs_inputs; - fs_inputs.push_back({ ::glsl::program_domain::glsl_fragment_program, vk::glsl::program_input_type::input_type_uniform_buffer,{},{}, 0, "static_data" }); - - for (u32 n = 1; n <= m_num_usable_samplers; ++n) - { - fs_inputs.push_back({ ::glsl::program_domain::glsl_fragment_program, vk::glsl::program_input_type::input_type_texture,{},{}, n, "fs" + std::to_string(n-1) }); - } - - return fs_inputs; - } - - virtual void get_dynamic_state_entries(std::vector& /*state_descriptors*/) - {} + virtual void get_dynamic_state_entries(std::vector& /*state_descriptors*/) {} virtual std::vector get_push_constants() { @@ -192,215 +106,24 @@ namespace vk m_vao.unmap(); } - vk::glsl::program* build_pipeline(u64 storage_key, VkRenderPass render_pass) - { - if (!compiled) - { - m_vertex_shader.create(::glsl::program_domain::glsl_vertex_program, vs_src); - m_vertex_shader.compile(); + vk::glsl::program* build_pipeline(u64 
storage_key, VkRenderPass render_pass); - m_fragment_shader.create(::glsl::program_domain::glsl_fragment_program, fs_src); - m_fragment_shader.compile(); + void load_program(vk::command_buffer& cmd, VkRenderPass pass, const std::vector& src); - compiled = true; - } + virtual void create(const vk::render_device& dev); + virtual void destroy(); - VkPipelineShaderStageCreateInfo shader_stages[2] = {}; - shader_stages[0].sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO; - shader_stages[0].stage = VK_SHADER_STAGE_VERTEX_BIT; - shader_stages[0].module = m_vertex_shader.get_handle(); - shader_stages[0].pName = "main"; + void free_resources(); - shader_stages[1].sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO; - shader_stages[1].stage = VK_SHADER_STAGE_FRAGMENT_BIT; - shader_stages[1].module = m_fragment_shader.get_handle(); - shader_stages[1].pName = "main"; + vk::framebuffer* get_framebuffer(vk::image* target, VkRenderPass render_pass); - std::vector dynamic_state_descriptors; - dynamic_state_descriptors.push_back(VK_DYNAMIC_STATE_VIEWPORT); - dynamic_state_descriptors.push_back(VK_DYNAMIC_STATE_SCISSOR); - get_dynamic_state_entries(dynamic_state_descriptors); + virtual void emit_geometry(vk::command_buffer& cmd); - VkPipelineDynamicStateCreateInfo dynamic_state_info = {}; - dynamic_state_info.sType = VK_STRUCTURE_TYPE_PIPELINE_DYNAMIC_STATE_CREATE_INFO; - dynamic_state_info.dynamicStateCount = ::size32(dynamic_state_descriptors); - dynamic_state_info.pDynamicStates = dynamic_state_descriptors.data(); + virtual void set_up_viewport(vk::command_buffer& cmd, u32 x, u32 y, u32 w, u32 h); - VkVertexInputBindingDescription vb = { 0, 16, VK_VERTEX_INPUT_RATE_VERTEX }; - VkVertexInputAttributeDescription via = { 0, 0, VK_FORMAT_R32G32B32A32_SFLOAT, 0 }; - VkPipelineVertexInputStateCreateInfo vi = {}; - vi.sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO; - vi.vertexBindingDescriptionCount = 1; - vi.pVertexBindingDescriptions = &vb; - 
vi.vertexAttributeDescriptionCount = 1; - vi.pVertexAttributeDescriptions = &via; - - VkPipelineViewportStateCreateInfo vp = {}; - vp.sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO; - vp.scissorCount = 1; - vp.viewportCount = 1; - - VkGraphicsPipelineCreateInfo info = {}; - info.sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO; - info.pVertexInputState = &vi; - info.pInputAssemblyState = &renderpass_config.ia; - info.pRasterizationState = &renderpass_config.rs; - info.pColorBlendState = &renderpass_config.cs; - info.pMultisampleState = &renderpass_config.ms; - info.pViewportState = &vp; - info.pDepthStencilState = &renderpass_config.ds; - info.stageCount = 2; - info.pStages = shader_stages; - info.pDynamicState = &dynamic_state_info; - info.layout = m_pipeline_layout; - info.basePipelineIndex = -1; - info.basePipelineHandle = VK_NULL_HANDLE; - info.renderPass = render_pass; - - auto compiler = vk::get_pipe_compiler(); - auto program = compiler->compile(info, m_pipeline_layout, vk::pipe_compiler::COMPILE_INLINE, {}, get_vertex_inputs(), get_fragment_inputs()); - auto result = program.get(); - m_program_cache[storage_key] = std::move(program); - - return result; - } - - void load_program(vk::command_buffer& cmd, VkRenderPass pass, const std::vector& src) - { - vk::glsl::program *program = nullptr; - const auto key = get_pipeline_key(pass); - - auto found = m_program_cache.find(key); - if (found != m_program_cache.end()) - program = found->second.get(); - else - program = build_pipeline(key, pass); - - ensure(m_used_descriptors < VK_OVERLAY_MAX_DRAW_CALLS); - - VkDescriptorSetAllocateInfo alloc_info = {}; - alloc_info.descriptorPool = m_descriptor_pool; - alloc_info.descriptorSetCount = 1; - alloc_info.pSetLayouts = &m_descriptor_layout; - alloc_info.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO; - - CHECK_RESULT(vkAllocateDescriptorSets(*m_device, &alloc_info, &m_descriptor_set)); - m_used_descriptors++; - - if (!m_sampler) - { - 
m_sampler = std::make_unique(*m_device, - VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE, VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE, VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE, - VK_FALSE, 0.f, 1.f, 0.f, 0.f, m_sampler_filter, m_sampler_filter, VK_SAMPLER_MIPMAP_MODE_NEAREST, VK_BORDER_COLOR_FLOAT_OPAQUE_BLACK); - } - - update_uniforms(cmd, program); - - program->bind_uniform({ m_ubo.heap->value, m_ubo_offset, std::max(m_ubo_length, 4u) }, 0, m_descriptor_set); - - for (uint n = 0; n < src.size(); ++n) - { - VkDescriptorImageInfo info = { m_sampler->value, src[n]->value, src[n]->image()->current_layout }; - program->bind_uniform(info, "fs" + std::to_string(n), VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, m_descriptor_set); - } - - vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS, program->pipeline); - vkCmdBindDescriptorSets(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS, m_pipeline_layout, 0, 1, &m_descriptor_set, 0, nullptr); - - VkBuffer buffers = m_vao.heap->value; - VkDeviceSize offsets = m_vao_offset; - vkCmdBindVertexBuffers(cmd, 0, 1, &buffers, &offsets); - } - - virtual void create(const vk::render_device &dev) - { - if (!initialized) - { - m_device = &dev; - init_descriptors(); - - initialized = true; - } - } - - virtual void destroy() - { - if (initialized) - { - m_vertex_shader.destroy(); - m_fragment_shader.destroy(); - m_program_cache.clear(); - m_sampler.reset(); - - vkDestroyDescriptorSetLayout(*m_device, m_descriptor_layout, nullptr); - vkDestroyPipelineLayout(*m_device, m_pipeline_layout, nullptr); - m_descriptor_pool.destroy(); - - initialized = false; - } - } - - void free_resources() - { - if (m_used_descriptors == 0) - return; - - m_descriptor_pool.reset(0); - m_used_descriptors = 0; - - m_vao.reset_allocation_stats(); - m_ubo.reset_allocation_stats(); - } - - vk::framebuffer* get_framebuffer(vk::image* target, VkRenderPass render_pass) - { - VkDevice dev = (*vk::get_current_renderer()); - return vk::get_framebuffer(dev, target->width(), target->height(), 
render_pass, { target }); - } - - virtual void emit_geometry(vk::command_buffer &cmd) - { - vkCmdDraw(cmd, num_drawable_elements, 1, first_vertex, 0); - } - - virtual void set_up_viewport(vk::command_buffer &cmd, u32 x, u32 y, u32 w, u32 h) - { - VkViewport vp{}; - vp.x = static_cast(x); - vp.y = static_cast(y); - vp.width = static_cast(w); - vp.height = static_cast(h); - vp.minDepth = 0.f; - vp.maxDepth = 1.f; - vkCmdSetViewport(cmd, 0, 1, &vp); - - VkRect2D vs = { { static_cast(x), static_cast(y) }, { w, h } }; - vkCmdSetScissor(cmd, 0, 1, &vs); - } - - void run(vk::command_buffer &cmd, const areau& viewport, vk::framebuffer* fbo, const std::vector& src, VkRenderPass render_pass) - { - load_program(cmd, render_pass, src); - set_up_viewport(cmd, viewport.x1, viewport.y1, viewport.width(), viewport.height()); - - vk::begin_renderpass(cmd, render_pass, fbo->value, viewport); - emit_geometry(cmd); - } - - void run(vk::command_buffer &cmd, const areau& viewport, vk::image* target, const std::vector& src, VkRenderPass render_pass) - { - auto fbo = static_cast(get_framebuffer(target, render_pass)); - fbo->add_ref(); - - run(cmd, viewport, fbo, src, render_pass); - fbo->release(); - } - - void run(vk::command_buffer &cmd, const areau& viewport, vk::image* target, vk::image_view* src, VkRenderPass render_pass) - { - std::vector views = { src }; - run(cmd, viewport, target, views, render_pass); - } + void run(vk::command_buffer& cmd, const areau& viewport, vk::framebuffer* fbo, const std::vector& src, VkRenderPass render_pass); + void run(vk::command_buffer& cmd, const areau& viewport, vk::image* target, const std::vector& src, VkRenderPass render_pass); + void run(vk::command_buffer& cmd, const areau& viewport, vk::image* target, vk::image_view* src, VkRenderPass render_pass); }; struct ui_overlay_renderer : public overlay_pass @@ -423,440 +146,28 @@ namespace vk std::unordered_map> temp_view_cache; rsx::overlays::primitive_type m_current_primitive_type = 
rsx::overlays::primitive_type::quad_list; - ui_overlay_renderer() - { - vs_src = - "#version 450\n" - "#extension GL_ARB_separate_shader_objects : enable\n" - "layout(location=0) in vec4 in_pos;\n" - "layout(std140, set=0, binding=0) uniform static_data{ vec4 regs[8]; };\n" - "layout(location=0) out vec2 tc0;\n" - "layout(location=1) out vec4 color;\n" - "layout(location=2) out vec4 parameters;\n" - "layout(location=3) out vec4 clip_rect;\n" - "layout(location=4) out vec4 parameters2;\n" - "\n" - "vec2 snap_to_grid(const in vec2 normalized)\n" - "{\n" - " return (floor(normalized * regs[5].xy) + 0.5) / regs[5].xy;\n" - "}\n" - "\n" - "vec4 clip_to_ndc(const in vec4 coord)\n" - "{\n" - " return (coord * regs[0].zwzw) / regs[0].xyxy;\n" - "}\n" - "\n" - "vec4 ndc_to_window(const in vec4 coord)\n" - "{\n" - " return fma(coord, regs[5].xyxy, regs[5].zwzw);\n" - "}\n" - "\n" - "void main()\n" - "{\n" - " tc0.xy = in_pos.zw;\n" - " color = regs[1];\n" - " parameters = regs[2];\n" - " parameters2 = regs[4];\n" - " clip_rect = ndc_to_window(clip_to_ndc(regs[3]));\n" - " vec4 pos = vec4(clip_to_ndc(in_pos).xy, 0.5, 1.);\n" - " pos.xy = snap_to_grid(pos.xy);\n" - " gl_Position = (pos + pos) - 1.;\n" - "}\n"; + ui_overlay_renderer(); - fs_src = - "#version 420\n" - "#extension GL_ARB_separate_shader_objects : enable\n" - "layout(set=0, binding=1) uniform sampler2D fs0;\n" - "layout(set=0, binding=2) uniform sampler2DArray fs1;\n" - "layout(location=0) in vec2 tc0;\n" - "layout(location=1) in vec4 color;\n" - "layout(location=2) in vec4 parameters;\n" - "layout(location=3) in vec4 clip_rect;\n" - "layout(location=4) in vec4 parameters2;\n" - "layout(location=0) out vec4 ocol;\n" - "\n" - "vec4 blur_sample(sampler2D tex, vec2 coord, vec2 tex_offset)\n" - "{\n" - " vec2 coords[9];\n" - " coords[0] = coord - tex_offset\n;" - " coords[1] = coord + vec2(0., -tex_offset.y);\n" - " coords[2] = coord + vec2(tex_offset.x, -tex_offset.y);\n" - " coords[3] = coord + vec2(-tex_offset.x, 
0.);\n" - " coords[4] = coord;\n" - " coords[5] = coord + vec2(tex_offset.x, 0.);\n" - " coords[6] = coord + vec2(-tex_offset.x, tex_offset.y);\n" - " coords[7] = coord + vec2(0., tex_offset.y);\n" - " coords[8] = coord + tex_offset;\n" - "\n" - " float weights[9] =\n" - " {\n" - " 1., 2., 1.,\n" - " 2., 4., 2.,\n" - " 1., 2., 1.\n" - " };\n" - "\n" - " vec4 blurred = vec4(0.);\n" - " for (int n = 0; n < 9; ++n)\n" - " {\n" - " blurred += texture(tex, coords[n]) * weights[n];\n" - " }\n" - "\n" - " return blurred / 16.f;\n" - "}\n" - "\n" - "vec4 sample_image(sampler2D tex, vec2 coord, float blur_strength)\n" - "{\n" - " vec4 original = texture(tex, coord);\n" - " if (blur_strength == 0) return original;\n" - " \n" - " vec2 constraints = 1.f / vec2(640, 360);\n" - " vec2 res_offset = 1.f / textureSize(fs0, 0);\n" - " vec2 tex_offset = max(res_offset, constraints);\n" - "\n" - " // Sample triangle pattern and average\n" - " // TODO: Nicer looking gaussian blur with less sampling\n" - " vec4 blur0 = blur_sample(tex, coord + vec2(-res_offset.x, 0.), tex_offset);\n" - " vec4 blur1 = blur_sample(tex, coord + vec2(res_offset.x, 0.), tex_offset);\n" - " vec4 blur2 = blur_sample(tex, coord + vec2(0., res_offset.y), tex_offset);\n" - "\n" - " vec4 blurred = blur0 + blur1 + blur2;\n" - " blurred /= 3.;\n" - " return mix(original, blurred, blur_strength);\n" - "}\n" - "\n" - "void main()\n" - "{\n" - " if (parameters.w != 0)\n" - " {" - " if (gl_FragCoord.x < clip_rect.x || gl_FragCoord.x > clip_rect.z ||\n" - " gl_FragCoord.y < clip_rect.y || gl_FragCoord.y > clip_rect.w)\n" - " {\n" - " discard;\n" - " return;\n" - " }\n" - " }\n" - "\n" - " vec4 diff_color = color;\n" - " if (parameters.y != 0)\n" - " diff_color.a *= (sin(parameters.x) + 1.f) * 0.5f;\n" - "\n" - " if (parameters.z < 1.)\n" - " {\n" - " ocol = diff_color;\n" - " }\n" - " else if (parameters.z > 2.)\n" - " {\n" - " ocol = texture(fs1, vec3(tc0.x, fract(tc0.y), trunc(tc0.y))).rrrr * diff_color;\n" - " }\n" - 
" else if (parameters.z > 1.)\n" - " {\n" - " ocol = texture(fs0, tc0).rrrr * diff_color;\n" - " }\n" - " else\n" - " {\n" - " ocol = sample_image(fs0, tc0, parameters2.x).bgra * diff_color;\n" - " }\n" - "}\n"; + vk::image_view* upload_simple_texture(vk::render_device& dev, vk::command_buffer& cmd, + vk::data_heap& upload_heap, u64 key, u32 w, u32 h, u32 layers, bool font, bool temp, void* pixel_src, u32 owner_uid); - // Allow mixed primitive rendering - multi_primitive = true; + void init(vk::command_buffer& cmd, vk::data_heap& upload_heap); - // 2 input textures - m_num_usable_samplers = 2; + void destroy() override; - renderpass_config.set_attachment_count(1); - renderpass_config.set_color_mask(0, true, true, true, true); - renderpass_config.set_depth_mask(false); - renderpass_config.enable_blend(0, - VK_BLEND_FACTOR_SRC_ALPHA, VK_BLEND_FACTOR_SRC_ALPHA, - VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA, VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA, - VK_BLEND_OP_ADD, VK_BLEND_OP_ADD); - } + void remove_temp_resources(u32 key); - vk::image_view* upload_simple_texture(vk::render_device &dev, vk::command_buffer &cmd, - vk::data_heap& upload_heap, u64 key, u32 w, u32 h, u32 layers, bool font, bool temp, void *pixel_src, u32 owner_uid) - { - const VkFormat format = (font) ? VK_FORMAT_R8_UNORM : VK_FORMAT_B8G8R8A8_UNORM; - const u32 pitch = (font) ? 
w : w * 4; - const u32 data_size = pitch * h * layers; - const auto offset = upload_heap.alloc<512>(data_size); - const auto addr = upload_heap.map(offset, data_size); + vk::image_view* find_font(rsx::overlays::font* font, vk::command_buffer& cmd, vk::data_heap& upload_heap); + vk::image_view* find_temp_image(rsx::overlays::image_info* desc, vk::command_buffer& cmd, vk::data_heap& upload_heap, u32 owner_uid); - const VkImageSubresourceRange range = { VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, layers }; + void update_uniforms(vk::command_buffer& /*cmd*/, vk::glsl::program* /*program*/) override; - auto tex = std::make_unique(dev, dev.get_memory_mapping().device_local, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, - VK_IMAGE_TYPE_2D, format, std::max(w, 1u), std::max(h, 1u), 1, 1, layers, VK_SAMPLE_COUNT_1_BIT, VK_IMAGE_LAYOUT_UNDEFINED, - VK_IMAGE_TILING_OPTIMAL, VK_IMAGE_USAGE_TRANSFER_DST_BIT | VK_IMAGE_USAGE_TRANSFER_SRC_BIT | VK_IMAGE_USAGE_SAMPLED_BIT, - 0); + void set_primitive_type(rsx::overlays::primitive_type type); - if (pixel_src && data_size) - std::memcpy(addr, pixel_src, data_size); - else if (data_size) - std::memset(addr, 0, data_size); + void emit_geometry(vk::command_buffer& cmd) override; - upload_heap.unmap(); - - VkBufferImageCopy region; - region.imageSubresource = { VK_IMAGE_ASPECT_COLOR_BIT, 0, 0, layers }; - region.bufferOffset = offset; - region.bufferRowLength = w; - region.bufferImageHeight = h; - region.imageOffset = {}; - region.imageExtent = { static_cast(w), static_cast(h), 1u }; - - change_image_layout(cmd, tex.get(), VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, range); - vkCmdCopyBufferToImage(cmd, upload_heap.heap->value, tex->value, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 1, ®ion); - change_image_layout(cmd, tex.get(), VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, range); - - auto view = std::make_unique(dev, tex.get()); - - auto result = view.get(); - - if (!temp || font) - view_cache[key] = std::move(view); - else - temp_view_cache[key] = std::move(view); - 
- if (font) - font_cache[key] = std::move(tex); - else if (!temp) - resources.push_back(std::move(tex)); - else - temp_image_cache[key] = std::make_pair(owner_uid, std::move(tex)); - - return result; - } - - void init(vk::command_buffer &cmd, vk::data_heap &upload_heap) - { - rsx::overlays::resource_config configuration; - configuration.load_files(); - - auto& dev = cmd.get_command_pool().get_owner(); - u64 storage_key = 1; - - for (const auto &res : configuration.texture_raw_data) - { - upload_simple_texture(dev, cmd, upload_heap, storage_key++, res->w, res->h, 1, false, false, res->data, UINT32_MAX); - } - - configuration.free_resources(); - } - - void destroy() override - { - temp_image_cache.clear(); - temp_view_cache.clear(); - - resources.clear(); - font_cache.clear(); - view_cache.clear(); - - overlay_pass::destroy(); - } - - void remove_temp_resources(u32 key) - { - std::vector keys_to_remove; - for (const auto& temp_image : temp_image_cache) - { - if (temp_image.second.first == key) - { - keys_to_remove.push_back(temp_image.first); - } - } - - for (const auto& _key : keys_to_remove) - { - temp_image_cache.erase(_key); - temp_view_cache.erase(_key); - } - } - - vk::image_view* find_font(rsx::overlays::font *font, vk::command_buffer &cmd, vk::data_heap &upload_heap) - { - const auto image_size = font->get_glyph_data_dimensions(); - - u64 key = reinterpret_cast(font); - auto found = view_cache.find(key); - if (found != view_cache.end()) - { - if (const auto raw = found->second->image(); - image_size.width == raw->width() && - image_size.height == raw->height() && - image_size.depth == raw->layers()) - { - return found->second.get(); - } - else - { - auto gc = vk::get_resource_manager(); - gc->dispose(font_cache[key]); - gc->dispose(view_cache[key]); - } - } - - // Create font resource - std::vector bytes; - font->get_glyph_data(bytes); - - return upload_simple_texture(cmd.get_command_pool().get_owner(), cmd, upload_heap, key, image_size.width, 
image_size.height, image_size.depth, - true, false, bytes.data(), UINT32_MAX); - } - - vk::image_view* find_temp_image(rsx::overlays::image_info *desc, vk::command_buffer &cmd, vk::data_heap &upload_heap, u32 owner_uid) - { - u64 key = reinterpret_cast(desc); - auto found = temp_view_cache.find(key); - if (found != temp_view_cache.end()) - return found->second.get(); - - return upload_simple_texture(cmd.get_command_pool().get_owner(), cmd, upload_heap, key, desc->w, desc->h, 1, - false, true, desc->data, owner_uid); - } - - void update_uniforms(vk::command_buffer& /*cmd*/, vk::glsl::program* /*program*/) override - { - m_ubo_offset = static_cast(m_ubo.alloc<256>(128)); - auto dst = static_cast(m_ubo.map(m_ubo_offset, 128)); - - // regs[0] = scaling parameters - dst[0] = m_scale_offset.r; - dst[1] = m_scale_offset.g; - dst[2] = m_scale_offset.b; - dst[3] = m_scale_offset.a; - - // regs[1] = color - dst[4] = m_color.r; - dst[5] = m_color.g; - dst[6] = m_color.b; - dst[7] = m_color.a; - - // regs[2] = fs config parameters - dst[8] = m_time; - dst[9] = m_pulse_glow? 1.f : 0.f; - dst[10] = m_skip_texture_read? 0.f : static_cast(m_texture_type); - dst[11] = m_clip_enabled ? 
1.f : 0.f; - - // regs[3] = clip rect - dst[12] = m_clip_region.x1; - dst[13] = m_clip_region.y1; - dst[14] = m_clip_region.x2; - dst[15] = m_clip_region.y2; - - // regs[4] = fs config parameters 2 - dst[16] = m_blur_strength; - - // regs[5] = viewport - dst[20] = m_viewport.width; - dst[21] = m_viewport.height; - dst[22] = m_viewport.x; - dst[23] = m_viewport.y; - - m_ubo.unmap(); - } - - void set_primitive_type(rsx::overlays::primitive_type type) - { - m_current_primitive_type = type; - - switch (type) - { - case rsx::overlays::primitive_type::quad_list: - case rsx::overlays::primitive_type::triangle_strip: - renderpass_config.set_primitive_type(VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP); - break; - case rsx::overlays::primitive_type::line_list: - renderpass_config.set_primitive_type(VK_PRIMITIVE_TOPOLOGY_LINE_LIST); - break; - case rsx::overlays::primitive_type::line_strip: - renderpass_config.set_primitive_type(VK_PRIMITIVE_TOPOLOGY_LINE_STRIP); - break; - default: - fmt::throw_exception("Unexpected primitive type %d", static_cast(type)); - } - } - - void emit_geometry(vk::command_buffer &cmd) override - { - if (m_current_primitive_type == rsx::overlays::primitive_type::quad_list) - { - // Emulate quads with disjointed triangle strips - u32 first = 0; - u32 num_quads = num_drawable_elements / 4; - - for (u32 n = 0; n < num_quads; ++n) - { - vkCmdDraw(cmd, 4, 1, first, 0); - first += 4; - } - } - else - { - overlay_pass::emit_geometry(cmd); - } - } - - void run(vk::command_buffer &cmd, const areau& viewport, vk::framebuffer* target, VkRenderPass render_pass, - vk::data_heap &upload_heap, rsx::overlays::overlay &ui) - { - m_scale_offset = color4f(ui.virtual_width, ui.virtual_height, 1.f, 1.f); - m_time = static_cast(get_system_time() / 1000) * 0.005f; - m_viewport = { { static_cast(viewport.x1), static_cast(viewport.y1) }, { static_cast(viewport.width()), static_cast(viewport.height()) } }; - - std::vector image_views - { - vk::null_image_view(cmd, 
VK_IMAGE_VIEW_TYPE_2D), - vk::null_image_view(cmd, VK_IMAGE_VIEW_TYPE_2D_ARRAY) - }; - - for (auto &command : ui.get_compiled().draw_commands) - { - num_drawable_elements = static_cast(command.verts.size()); - - upload_vertex_data(command.verts.data(), num_drawable_elements); - set_primitive_type(command.config.primitives); - - m_skip_texture_read = false; - m_color = command.config.color; - m_pulse_glow = command.config.pulse_glow; - m_blur_strength = static_cast(command.config.blur_strength) * 0.01f; - m_clip_enabled = command.config.clip_region; - m_clip_region = command.config.clip_rect; - m_texture_type = 1; - - vk::image_view* src = nullptr; - switch (command.config.texture_ref) - { - case rsx::overlays::image_resource_id::game_icon: - case rsx::overlays::image_resource_id::backbuffer: - //TODO - case rsx::overlays::image_resource_id::none: - m_skip_texture_read = true; - break; - case rsx::overlays::image_resource_id::font_file: - src = find_font(command.config.font_ref, cmd, upload_heap); - m_texture_type = src->image()->layers() == 1 ? 2 : 3; - break; - case rsx::overlays::image_resource_id::raw_image: - src = find_temp_image(static_cast(command.config.external_data_ref), cmd, upload_heap, ui.uid); - break; - default: - src = view_cache[command.config.texture_ref].get(); - break; - } - - if (src) - { - const int res_id = src->image()->layers() > 1 ? 
1 : 0; - image_views[res_id] = src; - } - - overlay_pass::run(cmd, viewport, target, image_views, render_pass); - } - - ui.update(); - } + void run(vk::command_buffer& cmd, const areau& viewport, vk::framebuffer* target, VkRenderPass render_pass, + vk::data_heap& upload_heap, rsx::overlays::overlay& ui); }; struct attachment_clear_pass : public overlay_pass @@ -865,176 +176,28 @@ namespace vk color4f colormask = { 1.f, 1.f, 1.f, 1.f }; VkRect2D region = {}; - attachment_clear_pass() - { - vs_src = - "#version 450\n" - "#extension GL_ARB_separate_shader_objects : enable\n" - "layout(push_constant) uniform static_data{ vec4 regs[2]; };\n" - "layout(location=0) out vec2 tc0;\n" - "layout(location=1) out vec4 color;\n" - "layout(location=2) out vec4 mask;\n" - "\n" - "void main()\n" - "{\n" - " vec2 positions[] = {vec2(-1., -1.), vec2(1., -1.), vec2(-1., 1.), vec2(1., 1.)};\n" - " vec2 coords[] = {vec2(0., 0.), vec2(1., 0.), vec2(0., 1.), vec2(1., 1.)};\n" - " tc0 = coords[gl_VertexIndex % 4];\n" - " color = regs[0];\n" - " mask = regs[1];\n" - " gl_Position = vec4(positions[gl_VertexIndex % 4], 0., 1.);\n" - "}\n"; + attachment_clear_pass(); - fs_src = - "#version 420\n" - "#extension GL_ARB_separate_shader_objects : enable\n" - "layout(set=0, binding=1) uniform sampler2D fs0;\n" - "layout(location=0) in vec2 tc0;\n" - "layout(location=1) in vec4 color;\n" - "layout(location=2) in vec4 mask;\n" - "layout(location=0) out vec4 out_color;\n" - "\n" - "void main()\n" - "{\n" - " vec4 original_color = texture(fs0, tc0);\n" - " out_color = mix(original_color, color, bvec4(mask));\n" - "}\n"; + std::vector get_push_constants() override; - renderpass_config.set_depth_mask(false); - renderpass_config.set_color_mask(0, true, true, true, true); - renderpass_config.set_attachment_count(1); - } + void update_uniforms(vk::command_buffer& cmd, vk::glsl::program* /*program*/) override; - std::vector get_push_constants() override - { - VkPushConstantRange constant; - 
constant.stageFlags = VK_SHADER_STAGE_VERTEX_BIT; - constant.offset = 0; - constant.size = 32; + void set_up_viewport(vk::command_buffer& cmd, u32 x, u32 y, u32 w, u32 h) override; - return { constant }; - } + bool update_config(u32 clearmask, color4f color); - void update_uniforms(vk::command_buffer& cmd, vk::glsl::program* /*program*/) override - { - f32 data[8]; - data[0] = clear_color.r; - data[1] = clear_color.g; - data[2] = clear_color.b; - data[3] = clear_color.a; - data[4] = colormask.r; - data[5] = colormask.g; - data[6] = colormask.b; - data[7] = colormask.a; - - vkCmdPushConstants(cmd, m_pipeline_layout, VK_SHADER_STAGE_VERTEX_BIT, 0, 32, data); - } - - void set_up_viewport(vk::command_buffer &cmd, u32 x, u32 y, u32 w, u32 h) override - { - VkViewport vp{}; - vp.x = static_cast(x); - vp.y = static_cast(y); - vp.width = static_cast(w); - vp.height = static_cast(h); - vp.minDepth = 0.f; - vp.maxDepth = 1.f; - vkCmdSetViewport(cmd, 0, 1, &vp); - - vkCmdSetScissor(cmd, 0, 1, ®ion); - } - - bool update_config(u32 clearmask, color4f color) - { - color4f mask = { 0.f, 0.f, 0.f, 0.f }; - if (clearmask & 0x10) mask.r = 1.f; - if (clearmask & 0x20) mask.g = 1.f; - if (clearmask & 0x40) mask.b = 1.f; - if (clearmask & 0x80) mask.a = 1.f; - - if (mask != colormask || color != clear_color) - { - colormask = mask; - clear_color = color; - return true; - } - - return false; - } - - void run(vk::command_buffer &cmd, vk::render_target* target, VkRect2D rect, VkRenderPass render_pass) - { - region = rect; - target->read_barrier(cmd); - - // Coverage sampling disabled, but actually report correct number of samples - renderpass_config.set_multisample_state(target->samples(), 0xFFFF, false, false, false); - - overlay_pass::run(cmd, { 0, 0, target->width(), target->height() }, target, - target->get_view(0xAAE4, rsx::default_remap_vector), render_pass); - } + void run(vk::command_buffer& cmd, vk::render_target* target, VkRect2D rect, VkRenderPass render_pass); }; struct 
stencil_clear_pass : public overlay_pass { VkRect2D region = {}; - stencil_clear_pass() - { - vs_src = - "#version 450\n" - "#extension GL_ARB_separate_shader_objects : enable\n" - "\n" - "void main()\n" - "{\n" - " vec2 positions[] = {vec2(-1., -1.), vec2(1., -1.), vec2(-1., 1.), vec2(1., 1.)};\n" - " gl_Position = vec4(positions[gl_VertexIndex % 4], 0., 1.);\n" - "}\n"; + stencil_clear_pass(); - fs_src = - "#version 420\n" - "#extension GL_ARB_separate_shader_objects : enable\n" - "layout(location=0) out vec4 out_color;\n" - "\n" - "void main()\n" - "{\n" - " out_color = vec4(0.);\n" - "}\n"; - } + void set_up_viewport(vk::command_buffer& cmd, u32 x, u32 y, u32 w, u32 h) override; - void set_up_viewport(vk::command_buffer& cmd, u32 x, u32 y, u32 w, u32 h) override - { - VkViewport vp{}; - vp.x = static_cast(x); - vp.y = static_cast(y); - vp.width = static_cast(w); - vp.height = static_cast(h); - vp.minDepth = 0.f; - vp.maxDepth = 1.f; - vkCmdSetViewport(cmd, 0, 1, &vp); - - vkCmdSetScissor(cmd, 0, 1, ®ion); - } - - void run(vk::command_buffer& cmd, vk::render_target* target, VkRect2D rect, u32 stencil_clear, u32 stencil_write_mask, VkRenderPass render_pass) - { - region = rect; - - // Stencil setup. Replace all pixels in the scissor region with stencil_clear with the correct write mask. 
- renderpass_config.enable_stencil_test( - VK_STENCIL_OP_REPLACE, VK_STENCIL_OP_REPLACE, VK_STENCIL_OP_REPLACE, // Always replace - VK_COMPARE_OP_ALWAYS, // Always pass - 0xFF, // Full write-through - stencil_clear); // Write active bit - - renderpass_config.set_stencil_mask(stencil_write_mask); - renderpass_config.set_depth_mask(false); - - // Coverage sampling disabled, but actually report correct number of samples - renderpass_config.set_multisample_state(target->samples(), 0xFFFF, false, false, false); - - overlay_pass::run(cmd, { 0, 0, target->width(), target->height() }, target, std::vector{}, render_pass); - } + void run(vk::command_buffer& cmd, vk::render_target* target, VkRect2D rect, u32 stencil_clear, u32 stencil_write_mask, VkRenderPass render_pass); }; struct video_out_calibration_pass : public overlay_pass @@ -1053,115 +216,14 @@ namespace vk } config; - video_out_calibration_pass() - { - vs_src = - "#version 450\n\n" - "layout(location=0) out vec2 tc0;\n" - "\n" - "void main()\n" - "{\n" - " vec2 positions[] = {vec2(-1., -1.), vec2(1., -1.), vec2(-1., 1.), vec2(1., 1.)};\n" - " vec2 coords[] = {vec2(0., 0.), vec2(1., 0.), vec2(0., 1.), vec2(1., 1.)};\n" - " tc0 = coords[gl_VertexIndex % 4];\n" - " vec2 pos = positions[gl_VertexIndex % 4];\n" - " gl_Position = vec4(pos, 0., 1.);\n" - "}\n"; + video_out_calibration_pass(); - fs_src = - "#version 420\n\n" - "layout(set=0, binding=1) uniform sampler2D fs0;\n" - "layout(set=0, binding=2) uniform sampler2D fs1;\n" - "layout(location=0) in vec2 tc0;\n" - "layout(location=0) out vec4 ocol;\n" - "\n" - "layout(push_constant) uniform static_data\n" - "{\n" - " float gamma;\n" - " int limit_range;\n" - " int stereo;\n" - " int stereo_image_count;\n" - "};\n" - "\n" - "vec4 read_source()\n" - "{\n" - " if (stereo == 0) return texture(fs0, tc0);\n" - "\n" - " vec4 left, right;\n" - " if (stereo_image_count == 2)\n" - " {\n" - " left = texture(fs0, tc0);\n" - " right = texture(fs1, tc0);\n" - " }\n" - " else\n" - 
" {\n" - " vec2 coord_left = tc0 * vec2(1.f, 0.4898f);\n" - " vec2 coord_right = coord_left + vec2(0.f, 0.510204f);\n" - " left = texture(fs0, coord_left);\n" - " right = texture(fs0, coord_right);\n" - " }\n" - "\n" - " return vec4(left.r, right.g, right.b, 1.);\n" - "}\n" - "\n" - "void main()\n" - "{\n" - " vec4 color = read_source();\n" - " color.rgb = pow(color.rgb, vec3(gamma));\n" - " if (limit_range > 0)\n" - " ocol = ((color * 220.) + 16.) / 255.;\n" - " else\n" - " ocol = color;\n" - "}\n"; + std::vector get_push_constants() override; - renderpass_config.set_depth_mask(false); - renderpass_config.set_color_mask(0, true, true, true, true); - renderpass_config.set_attachment_count(1); + void update_uniforms(vk::command_buffer& cmd, vk::glsl::program* /*program*/) override; - m_num_usable_samplers = 2; - } - - std::vector get_push_constants() override - { - VkPushConstantRange constant; - constant.stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT; - constant.offset = 0; - constant.size = 16; - - return { constant }; - } - - void update_uniforms(vk::command_buffer& cmd, vk::glsl::program* /*program*/) override - { - vkCmdPushConstants(cmd, m_pipeline_layout, VK_SHADER_STAGE_FRAGMENT_BIT, 0, 16, config.data); - } - - void run(vk::command_buffer &cmd, const areau& viewport, vk::framebuffer* target, - const rsx::simple_array& src, - f32 gamma, bool limited_rgb, bool _3d, VkRenderPass render_pass) - { - config.gamma = gamma; - config.limit_range = limited_rgb? 1 : 0; - config.stereo = _3d? 
1 : 0; - config.stereo_image_count = std::min(::size32(src), 2u); - - std::vector views; - views.reserve(2); - - for (auto& img : src) - { - // Only raw uploads can possibly have mismatched layout here - img->change_layout(cmd, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL); - views.push_back(img->get_view(VK_REMAP_IDENTITY, rsx::default_remap_vector)); - } - - if (views.size() < 2) - { - views.push_back(vk::null_image_view(cmd, VK_IMAGE_VIEW_TYPE_2D)); - } - - overlay_pass::run(cmd, viewport, target, views, render_pass); - } + void run(vk::command_buffer& cmd, const areau& viewport, vk::framebuffer* target, + const rsx::simple_array& src, f32 gamma, bool limited_rgb, bool _3d, VkRenderPass render_pass); }; // TODO: Replace with a proper manager diff --git a/rpcs3/Emu/RSX/VK/VKPresent.cpp b/rpcs3/Emu/RSX/VK/VKPresent.cpp index 083933accf..059a89f93c 100644 --- a/rpcs3/Emu/RSX/VK/VKPresent.cpp +++ b/rpcs3/Emu/RSX/VK/VKPresent.cpp @@ -1,6 +1,7 @@ #include "stdafx.h" #include "VKGSRender.h" #include "vkutils/buffer_object.h" +#include "Emu/RSX/Overlays/overlays.h" #include "Emu/Cell/Modules/cellVideoOut.h" #include "util/asm.hpp" diff --git a/rpcs3/Emu/RSX/VK/VKResolveHelper.h b/rpcs3/Emu/RSX/VK/VKResolveHelper.h index 1e64b2e3ad..c7ffef59ac 100644 --- a/rpcs3/Emu/RSX/VK/VKResolveHelper.h +++ b/rpcs3/Emu/RSX/VK/VKResolveHelper.h @@ -3,6 +3,8 @@ #include "VKCompute.h" #include "VKOverlays.h" +#include "vkutils/image.h" + namespace vk { struct cs_resolve_base : compute_task diff --git a/rpcs3/GLGSRender.vcxproj b/rpcs3/GLGSRender.vcxproj index 8e726f0323..9987d38222 100644 --- a/rpcs3/GLGSRender.vcxproj +++ b/rpcs3/GLGSRender.vcxproj @@ -87,9 +87,11 @@ + + diff --git a/rpcs3/GLGSRender.vcxproj.filters b/rpcs3/GLGSRender.vcxproj.filters index 7ba169eba9..00b2469faa 100644 --- a/rpcs3/GLGSRender.vcxproj.filters +++ b/rpcs3/GLGSRender.vcxproj.filters @@ -15,6 +15,8 @@ + + diff --git a/rpcs3/VKGSRender.vcxproj b/rpcs3/VKGSRender.vcxproj index ef73d58ed2..5a267231e3 100644 --- 
a/rpcs3/VKGSRender.vcxproj +++ b/rpcs3/VKGSRender.vcxproj @@ -66,6 +66,7 @@ + @@ -73,6 +74,7 @@ + diff --git a/rpcs3/VKGSRender.vcxproj.filters b/rpcs3/VKGSRender.vcxproj.filters index 5bdba38c3c..8941a504e3 100644 --- a/rpcs3/VKGSRender.vcxproj.filters +++ b/rpcs3/VKGSRender.vcxproj.filters @@ -62,6 +62,8 @@ vkutils + +