
Move code to cpp (#9938)

* GL: move GLOverlays code to cpp
* GL: move GLCompute code to cpp
* VK: move VKOverlays code to cpp
* VK: move VKCompute code to cpp
Megamouse 2021-03-10 00:58:08 +01:00 committed by GitHub
parent 9cbe77904d
commit cbd895a29c
19 changed files with 2578 additions and 2344 deletions
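All four moves follow the same mechanical pattern: member functions that used to be defined inline in the headers become out-of-line definitions in new .cpp files, and the headers keep only the declarations. A minimal sketch of that split, using a hypothetical gl::sample_task rather than any class touched by this commit:

    // sample_task.h - declarations only
    #pragma once
    #include <string>

    namespace gl
    {
        struct sample_task
        {
            bool compiled = false;
            std::string m_src;

            void create();   // bodies now live in sample_task.cpp
            void destroy();
        };
    }

    // sample_task.cpp - the bodies moved out of the header
    #include "sample_task.h"

    namespace gl
    {
        void sample_task::create()
        {
            if (!compiled)
            {
                // compile m_src, link the program, etc.
                compiled = true;
            }
        }

        void sample_task::destroy()
        {
            compiled = false;
        }
    }

The usual payoff is lighter headers and faster incremental builds: the long GLSL source strings and the GL/Vulkan calls are compiled once per backend instead of in every translation unit that includes the header.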

View File

@@ -430,10 +430,12 @@ target_sources(rpcs3_emu PRIVATE
RSX/Capture/rsx_capture.cpp
RSX/Capture/rsx_replay.cpp
RSX/GL/GLCommonDecompiler.cpp
RSX/GL/GLCompute.cpp
RSX/GL/GLDraw.cpp
RSX/GL/GLFragmentProgram.cpp
RSX/GL/GLGSRender.cpp
RSX/GL/GLHelpers.cpp
RSX/GL/GLOverlays.cpp
RSX/GL/GLPipelineCompiler.cpp
RSX/GL/GLPresent.cpp
RSX/GL/GLRenderTargets.cpp
@@ -462,6 +464,7 @@ if(TARGET 3rdparty_vulkan)
RSX/VK/vkutils/shared.cpp
RSX/VK/VKCommandStream.cpp
RSX/VK/VKCommonDecompiler.cpp
RSX/VK/VKCompute.cpp
RSX/VK/VKDMA.cpp
RSX/VK/VKDraw.cpp
RSX/VK/VKFormats.cpp
@@ -470,6 +473,7 @@ if(TARGET 3rdparty_vulkan)
RSX/VK/VKGSRender.cpp
RSX/VK/VKHelpers.cpp
RSX/VK/VKMemAlloc.cpp
RSX/VK/VKOverlays.cpp
RSX/VK/VKPipelineCompiler.cpp
RSX/VK/VKPresent.cpp
RSX/VK/VKProgramPipeline.cpp

View File

@@ -0,0 +1,297 @@
#include "GLCompute.h"
#include "Utilities/StrUtil.h"
namespace gl
{
void compute_task::initialize()
{
// Set up optimal kernel size
const auto& caps = gl::get_driver_caps();
if (caps.vendor_AMD || caps.vendor_MESA)
{
optimal_group_size = 64;
unroll_loops = false;
}
else if (caps.vendor_NVIDIA)
{
optimal_group_size = 32;
}
else
{
optimal_group_size = 128;
}
glGetIntegeri_v(GL_MAX_COMPUTE_WORK_GROUP_COUNT, 0, reinterpret_cast<GLint*>(&max_invocations_x));
}
void compute_task::create()
{
if (!compiled)
{
m_shader.create(::glsl::program_domain::glsl_compute_program, m_src);
m_shader.compile();
m_program.create();
m_program.attach(m_shader);
m_program.link();
compiled = true;
}
}
void compute_task::destroy()
{
if (compiled)
{
m_program.remove();
m_shader.remove();
compiled = false;
}
}
void compute_task::run(u32 invocations_x, u32 invocations_y)
{
GLint old_program;
glGetIntegerv(GL_CURRENT_PROGRAM, &old_program);
bind_resources();
m_program.use();
glDispatchCompute(invocations_x, invocations_y, 1);
glUseProgram(old_program);
}
void compute_task::run(u32 num_invocations)
{
u32 invocations_x, invocations_y;
if (num_invocations <= max_invocations_x) [[likely]]
{
invocations_x = num_invocations;
invocations_y = 1;
}
else
{
// Since all the invocations will run, the optimal distribution is sqrt(count)
const u32 optimal_length = static_cast<u32>(floor(std::sqrt(num_invocations)));
invocations_x = optimal_length;
invocations_y = invocations_x;
if (num_invocations % invocations_x) invocations_y++;
}
run(invocations_x, invocations_y);
}
cs_shuffle_base::cs_shuffle_base()
{
work_kernel =
" value = data[index];\n"
" data[index] = %f(value);\n";
loop_advance =
" index++;\n";
suffix =
"}\n";
}
void cs_shuffle_base::build(const char* function_name, u32 _kernel_size)
{
// Initialize to allow detecting optimal settings
initialize();
kernel_size = _kernel_size? _kernel_size : optimal_kernel_size;
m_src =
"#version 430\n"
"layout(local_size_x=%ws, local_size_y=1, local_size_z=1) in;\n"
"layout(binding=%loc, std430) buffer ssbo{ uint data[]; };\n"
"%ub"
"\n"
"#define KERNEL_SIZE %ks\n"
"\n"
"// Generic swap routines\n"
"#define bswap_u16(bits) (bits & 0xFF) << 8 | (bits & 0xFF00) >> 8 | (bits & 0xFF0000) << 8 | (bits & 0xFF000000) >> 8\n"
"#define bswap_u32(bits) (bits & 0xFF) << 24 | (bits & 0xFF00) << 8 | (bits & 0xFF0000) >> 8 | (bits & 0xFF000000) >> 24\n"
"#define bswap_u16_u32(bits) (bits & 0xFFFF) << 16 | (bits & 0xFFFF0000) >> 16\n"
"\n"
"// Depth format conversions\n"
"#define d24f_to_f32(bits) (bits << 7)\n"
"#define f32_to_d24f(bits) (bits >> 7)\n"
"\n"
"uint linear_invocation_id()\n"
"{\n"
" uint size_in_x = (gl_NumWorkGroups.x * gl_WorkGroupSize.x);\n"
" return (gl_GlobalInvocationID.y * size_in_x) + gl_GlobalInvocationID.x;\n"
"}\n"
"\n"
"%md"
"void main()\n"
"{\n"
" uint invocation_id = linear_invocation_id();\n"
" uint index = invocation_id * KERNEL_SIZE;\n"
" uint value;\n"
" %vars"
"\n";
const std::pair<std::string, std::string> syntax_replace[] =
{
{ "%loc", std::to_string(GL_COMPUTE_BUFFER_SLOT(0)) },
{ "%ws", std::to_string(optimal_group_size) },
{ "%ks", std::to_string(kernel_size) },
{ "%vars", variables },
{ "%f", function_name },
{ "%ub", uniforms },
{ "%md", method_declarations }
};
m_src = fmt::replace_all(m_src, syntax_replace);
work_kernel = fmt::replace_all(work_kernel, syntax_replace);
if (kernel_size <= 1)
{
m_src += " {\n" + work_kernel + " }\n";
}
else if (unroll_loops)
{
work_kernel += loop_advance + "\n";
m_src += std::string
(
" //Unrolled loop\n"
" {\n"
);
// Assemble body with manual loop unroll to try lowering GPR usage
for (u32 n = 0; n < kernel_size; ++n)
{
m_src += work_kernel;
}
m_src += " }\n";
}
else
{
m_src += " for (int loop = 0; loop < KERNEL_SIZE; ++loop)\n";
m_src += " {\n";
m_src += work_kernel;
m_src += loop_advance;
m_src += " }\n";
}
m_src += suffix;
}
void cs_shuffle_base::bind_resources()
{
m_data->bind_range(gl::buffer::target::ssbo, GL_COMPUTE_BUFFER_SLOT(0), m_data_offset, m_data_length);
}
void cs_shuffle_base::run(const gl::buffer* data, u32 data_length, u32 data_offset)
{
m_data = data;
m_data_offset = data_offset;
m_data_length = data_length;
const auto num_bytes_per_invocation = optimal_group_size * kernel_size * 4;
const auto num_bytes_to_process = utils::align(data_length, num_bytes_per_invocation);
const auto num_invocations = num_bytes_to_process / num_bytes_per_invocation;
if ((num_bytes_to_process + data_offset) > data->size())
{
// Technically robust buffer access should keep the driver from crashing in OOB situations
rsx_log.error("Inadequate buffer length submitted for a compute operation."
"Required=%d bytes, Available=%d bytes", num_bytes_to_process, data->size());
}
compute_task::run(num_invocations);
}
cs_shuffle_d32fx8_to_x8d24f::cs_shuffle_d32fx8_to_x8d24f()
{
uniforms = "uniform uint in_ptr, out_ptr;\n";
variables =
" uint in_offset = in_ptr >> 2;\n"
" uint out_offset = out_ptr >> 2;\n"
" uint depth, stencil;\n";
work_kernel =
" depth = data[index * 2 + in_offset];\n"
" stencil = data[index * 2 + (in_offset + 1)] & 0xFFu;\n"
" value = f32_to_d24f(depth) << 8;\n"
" value |= stencil;\n"
" data[index + out_ptr] = bswap_u32(value);\n";
cs_shuffle_base::build("");
}
void cs_shuffle_d32fx8_to_x8d24f::bind_resources()
{
m_data->bind_range(gl::buffer::target::ssbo, GL_COMPUTE_BUFFER_SLOT(0), m_data_offset, m_ssbo_length);
}
void cs_shuffle_d32fx8_to_x8d24f::run(const gl::buffer* data, u32 src_offset, u32 dst_offset, u32 num_texels)
{
u32 data_offset;
if (src_offset > dst_offset)
{
data_offset = dst_offset;
m_ssbo_length = (src_offset + num_texels * 8) - data_offset;
}
else
{
data_offset = src_offset;
m_ssbo_length = (dst_offset + num_texels * 4) - data_offset;
}
m_program.uniforms["in_ptr"] = src_offset - data_offset;
m_program.uniforms["out_ptr"] = dst_offset - data_offset;
cs_shuffle_base::run(data, num_texels * 4, data_offset);
}
cs_shuffle_x8d24f_to_d32fx8::cs_shuffle_x8d24f_to_d32fx8()
{
uniforms = "uniform uint texel_count, in_ptr, out_ptr;\n";
variables =
" uint in_offset = in_ptr >> 2;\n"
" uint out_offset = out_ptr >> 2;\n"
" uint depth, stencil;\n";
work_kernel =
" value = data[index + in_offset];\n"
" value = bswap_u32(value);\n"
" stencil = (value & 0xFFu);\n"
" depth = (value >> 8);\n"
" data[index * 2 + out_offset] = d24f_to_f32(depth);\n"
" data[index * 2 + (out_offset + 1)] = stencil;\n";
cs_shuffle_base::build("");
}
void cs_shuffle_x8d24f_to_d32fx8::bind_resources()
{
m_data->bind_range(gl::buffer::target::ssbo, GL_COMPUTE_BUFFER_SLOT(0), m_data_offset, m_ssbo_length);
}
void cs_shuffle_x8d24f_to_d32fx8::run(const gl::buffer* data, u32 src_offset, u32 dst_offset, u32 num_texels)
{
u32 data_offset;
if (src_offset > dst_offset)
{
data_offset = dst_offset;
m_ssbo_length = (src_offset + num_texels * 4) - data_offset;
}
else
{
data_offset = src_offset;
m_ssbo_length = (dst_offset + num_texels * 8) - data_offset;
}
m_program.uniforms["in_ptr"] = src_offset - data_offset;
m_program.uniforms["out_ptr"] = dst_offset - data_offset;
cs_shuffle_base::run(data, num_texels * 4, data_offset);
}
}
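cs_shuffle_base::build() above assembles the compute shader by substituting the %-tokens in the template strings. A standalone sketch of that substitution step, using a simple local replace_all in place of fmt::replace_all (the token/value pairs mirror build("bswap_u32") on the NVIDIA path; only they are taken from the code above):

    #include <cstdio>
    #include <string>
    #include <utility>
    #include <vector>

    // Minimal stand-in for fmt::replace_all: substitute every occurrence of each token.
    static std::string replace_all(std::string s, const std::vector<std::pair<std::string, std::string>>& subs)
    {
        for (const auto& [token, value] : subs)
        {
            for (std::size_t pos = 0; (pos = s.find(token, pos)) != std::string::npos; pos += value.size())
            {
                s.replace(pos, token.size(), value);
            }
        }
        return s;
    }

    int main()
    {
        // The work kernel template from cs_shuffle_base
        const std::string work_kernel =
            "\tvalue = data[index];\n"
            "\tdata[index] = %f(value);\n";

        const std::vector<std::pair<std::string, std::string>> syntax_replace =
        {
            { "%ws", "32" },        // optimal_group_size
            { "%ks", "1" },         // kernel_size
            { "%f", "bswap_u32" },  // per-element transform
        };

        std::fputs(replace_all(work_kernel, syntax_replace).c_str(), stdout);
        // Prints:
        //     value = data[index];
        //     data[index] = bswap_u32(value);
    }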

View File

@@ -1,10 +1,8 @@
#pragma once
#include "Utilities/StrUtil.h"
#include "Emu/IdManager.h"
#include "GLHelpers.h"
#include "util/asm.hpp"
#include <unordered_map>
namespace gl
@@ -22,88 +20,14 @@ namespace gl
u32 optimal_kernel_size = 1;
u32 max_invocations_x = 65535;
void initialize()
{
// Set up optimal kernel size
const auto& caps = gl::get_driver_caps();
if (caps.vendor_AMD || caps.vendor_MESA)
{
optimal_group_size = 64;
unroll_loops = false;
}
else if (caps.vendor_NVIDIA)
{
optimal_group_size = 32;
}
else
{
optimal_group_size = 128;
}
void initialize();
void create();
void destroy();
glGetIntegeri_v(GL_MAX_COMPUTE_WORK_GROUP_COUNT, 0, reinterpret_cast<GLint*>(&max_invocations_x));
}
virtual void bind_resources() {}
void create()
{
if (!compiled)
{
m_shader.create(::glsl::program_domain::glsl_compute_program, m_src);
m_shader.compile();
m_program.create();
m_program.attach(m_shader);
m_program.link();
compiled = true;
}
}
void destroy()
{
if (compiled)
{
m_program.remove();
m_shader.remove();
compiled = false;
}
}
virtual void bind_resources()
{}
void run(u32 invocations_x, u32 invocations_y)
{
GLint old_program;
glGetIntegerv(GL_CURRENT_PROGRAM, &old_program);
bind_resources();
m_program.use();
glDispatchCompute(invocations_x, invocations_y, 1);
glUseProgram(old_program);
}
void run(u32 num_invocations)
{
u32 invocations_x, invocations_y;
if (num_invocations <= max_invocations_x) [[likely]]
{
invocations_x = num_invocations;
invocations_y = 1;
}
else
{
// Since all the invocations will run, the optimal distribution is sqrt(count)
const u32 optimal_length = static_cast<u32>(floor(std::sqrt(num_invocations)));
invocations_x = optimal_length;
invocations_y = invocations_x;
if (num_invocations % invocations_x) invocations_y++;
}
run(invocations_x, invocations_y);
}
void run(u32 invocations_x, u32 invocations_y);
void run(u32 num_invocations);
};
struct cs_shuffle_base : compute_task
@@ -115,130 +39,13 @@ namespace gl
std::string uniforms, variables, work_kernel, loop_advance, suffix, method_declarations;
cs_shuffle_base()
{
work_kernel =
" value = data[index];\n"
" data[index] = %f(value);\n";
cs_shuffle_base();
loop_advance =
" index++;\n";
void build(const char* function_name, u32 _kernel_size = 0);
suffix =
"}\n";
}
void bind_resources() override;
void build(const char* function_name, u32 _kernel_size = 0)
{
// Initialize to allow detecting optimal settings
initialize();
kernel_size = _kernel_size? _kernel_size : optimal_kernel_size;
m_src =
"#version 430\n"
"layout(local_size_x=%ws, local_size_y=1, local_size_z=1) in;\n"
"layout(binding=%loc, std430) buffer ssbo{ uint data[]; };\n"
"%ub"
"\n"
"#define KERNEL_SIZE %ks\n"
"\n"
"// Generic swap routines\n"
"#define bswap_u16(bits) (bits & 0xFF) << 8 | (bits & 0xFF00) >> 8 | (bits & 0xFF0000) << 8 | (bits & 0xFF000000) >> 8\n"
"#define bswap_u32(bits) (bits & 0xFF) << 24 | (bits & 0xFF00) << 8 | (bits & 0xFF0000) >> 8 | (bits & 0xFF000000) >> 24\n"
"#define bswap_u16_u32(bits) (bits & 0xFFFF) << 16 | (bits & 0xFFFF0000) >> 16\n"
"\n"
"// Depth format conversions\n"
"#define d24f_to_f32(bits) (bits << 7)\n"
"#define f32_to_d24f(bits) (bits >> 7)\n"
"\n"
"uint linear_invocation_id()\n"
"{\n"
" uint size_in_x = (gl_NumWorkGroups.x * gl_WorkGroupSize.x);\n"
" return (gl_GlobalInvocationID.y * size_in_x) + gl_GlobalInvocationID.x;\n"
"}\n"
"\n"
"%md"
"void main()\n"
"{\n"
" uint invocation_id = linear_invocation_id();\n"
" uint index = invocation_id * KERNEL_SIZE;\n"
" uint value;\n"
" %vars"
"\n";
const std::pair<std::string, std::string> syntax_replace[] =
{
{ "%loc", std::to_string(GL_COMPUTE_BUFFER_SLOT(0)) },
{ "%ws", std::to_string(optimal_group_size) },
{ "%ks", std::to_string(kernel_size) },
{ "%vars", variables },
{ "%f", function_name },
{ "%ub", uniforms },
{ "%md", method_declarations }
};
m_src = fmt::replace_all(m_src, syntax_replace);
work_kernel = fmt::replace_all(work_kernel, syntax_replace);
if (kernel_size <= 1)
{
m_src += " {\n" + work_kernel + " }\n";
}
else if (unroll_loops)
{
work_kernel += loop_advance + "\n";
m_src += std::string
(
" //Unrolled loop\n"
" {\n"
);
// Assemble body with manual loop unroll to try lowering GPR usage
for (u32 n = 0; n < kernel_size; ++n)
{
m_src += work_kernel;
}
m_src += " }\n";
}
else
{
m_src += " for (int loop = 0; loop < KERNEL_SIZE; ++loop)\n";
m_src += " {\n";
m_src += work_kernel;
m_src += loop_advance;
m_src += " }\n";
}
m_src += suffix;
}
void bind_resources() override
{
m_data->bind_range(gl::buffer::target::ssbo, GL_COMPUTE_BUFFER_SLOT(0), m_data_offset, m_data_length);
}
void run(const gl::buffer* data, u32 data_length, u32 data_offset = 0)
{
m_data = data;
m_data_offset = data_offset;
m_data_length = data_length;
const auto num_bytes_per_invocation = optimal_group_size * kernel_size * 4;
const auto num_bytes_to_process = utils::align(data_length, num_bytes_per_invocation);
const auto num_invocations = num_bytes_to_process / num_bytes_per_invocation;
if ((num_bytes_to_process + data_offset) > data->size())
{
// Technically robust buffer access should keep the driver from crashing in OOB situations
rsx_log.error("Inadequate buffer length submitted for a compute operation."
"Required=%d bytes, Available=%d bytes", num_bytes_to_process, data->size());
}
compute_task::run(num_invocations);
}
void run(const gl::buffer* data, u32 data_length, u32 data_offset = 0);
};
struct cs_shuffle_16 : cs_shuffle_base
@@ -272,97 +79,22 @@ namespace gl
{
u32 m_ssbo_length = 0;
cs_shuffle_d32fx8_to_x8d24f()
{
uniforms = "uniform uint in_ptr, out_ptr;\n";
cs_shuffle_d32fx8_to_x8d24f();
variables =
" uint in_offset = in_ptr >> 2;\n"
" uint out_offset = out_ptr >> 2;\n"
" uint depth, stencil;\n";
void bind_resources() override;
work_kernel =
" depth = data[index * 2 + in_offset];\n"
" stencil = data[index * 2 + (in_offset + 1)] & 0xFFu;\n"
" value = f32_to_d24f(depth) << 8;\n"
" value |= stencil;\n"
" data[index + out_ptr] = bswap_u32(value);\n";
cs_shuffle_base::build("");
}
void bind_resources() override
{
m_data->bind_range(gl::buffer::target::ssbo, GL_COMPUTE_BUFFER_SLOT(0), m_data_offset, m_ssbo_length);
}
void run(const gl::buffer* data, u32 src_offset, u32 dst_offset, u32 num_texels)
{
u32 data_offset;
if (src_offset > dst_offset)
{
data_offset = dst_offset;
m_ssbo_length = (src_offset + num_texels * 8) - data_offset;
}
else
{
data_offset = src_offset;
m_ssbo_length = (dst_offset + num_texels * 4) - data_offset;
}
m_program.uniforms["in_ptr"] = src_offset - data_offset;
m_program.uniforms["out_ptr"] = dst_offset - data_offset;
cs_shuffle_base::run(data, num_texels * 4, data_offset);
}
void run(const gl::buffer* data, u32 src_offset, u32 dst_offset, u32 num_texels);
};
struct cs_shuffle_x8d24f_to_d32fx8 : cs_shuffle_base
{
u32 m_ssbo_length = 0;
cs_shuffle_x8d24f_to_d32fx8()
{
uniforms = "uniform uint texel_count, in_ptr, out_ptr;\n";
cs_shuffle_x8d24f_to_d32fx8();
variables =
" uint in_offset = in_ptr >> 2;\n"
" uint out_offset = out_ptr >> 2;\n"
" uint depth, stencil;\n";
void bind_resources() override;
work_kernel =
" value = data[index + in_offset];\n"
" value = bswap_u32(value);\n"
" stencil = (value & 0xFFu);\n"
" depth = (value >> 8);\n"
" data[index * 2 + out_offset] = d24f_to_f32(depth);\n"
" data[index * 2 + (out_offset + 1)] = stencil;\n";
cs_shuffle_base::build("");
}
void bind_resources() override
{
m_data->bind_range(gl::buffer::target::ssbo, GL_COMPUTE_BUFFER_SLOT(0), m_data_offset, m_ssbo_length);
}
void run(const gl::buffer* data, u32 src_offset, u32 dst_offset, u32 num_texels)
{
u32 data_offset;
if (src_offset > dst_offset)
{
data_offset = dst_offset;
m_ssbo_length = (src_offset + num_texels * 4) - data_offset;
}
else
{
data_offset = src_offset;
m_ssbo_length = (dst_offset + num_texels * 8) - data_offset;
}
m_program.uniforms["in_ptr"] = src_offset - data_offset;
m_program.uniforms["out_ptr"] = dst_offset - data_offset;
cs_shuffle_base::run(data, num_texels * 4, data_offset);
}
void run(const gl::buffer* data, u32 src_offset, u32 dst_offset, u32 num_texels);
};
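The d32fx8_to_x8d24f work kernel shown above packs a 32-bit float depth word and a separate stencil byte into one byte-swapped x8d24f value. The same transform written as plain CPU-side C++, as a sketch (buffer offsets omitted; f32_to_d24f is the >> 7 truncation defined in the shader preamble):

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>

    static uint32_t bswap_u32(uint32_t v)
    {
        return (v & 0xFF) << 24 | (v & 0xFF00) << 8 | (v & 0xFF0000) >> 8 | (v & 0xFF000000) >> 24;
    }

    // in[2*i] holds the raw 32-bit float depth bits, in[2*i + 1] the stencil byte;
    // out[i] receives the byte-swapped (d24f << 8 | stencil) packing.
    static void shuffle_d32fx8_to_x8d24f(const uint32_t* in, uint32_t* out, std::size_t texels)
    {
        for (std::size_t i = 0; i < texels; ++i)
        {
            const uint32_t depth   = in[i * 2];
            const uint32_t stencil = in[i * 2 + 1] & 0xFFu;
            out[i] = bswap_u32(((depth >> 7) << 8) | stencil);
        }
    }

    int main()
    {
        const uint32_t in[2] = { 0x3F800000u /* bits of 1.0f */, 0xAAu /* stencil */ };
        uint32_t out[1];
        shuffle_d32fx8_to_x8d24f(in, out, 1);
        std::printf("0x%08X\n", out[0]);   // prints 0xAA00007F
    }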

View File

@@ -1,5 +1,6 @@
#pragma once
#include "util/logs.hpp"
#include "util/types.hpp"
#include "Utilities/geometry.h"
#include "OpenGL.h"

View File

@@ -0,0 +1,648 @@
#include "GLOverlays.h"
extern u64 get_system_time();
namespace gl
{
void overlay_pass::create()
{
if (!compiled)
{
fs.create(::glsl::program_domain::glsl_fragment_program, fs_src);
fs.compile();
vs.create(::glsl::program_domain::glsl_vertex_program, vs_src);
vs.compile();
program_handle.create();
program_handle.attach(vs);
program_handle.attach(fs);
program_handle.link();
fbo.create();
m_sampler.create();
m_sampler.apply_defaults(input_filter);
m_vertex_data_buffer.create();
int old_vao;
glGetIntegerv(GL_VERTEX_ARRAY_BINDING, &old_vao);
m_vao.create();
m_vao.bind();
m_vao.array_buffer = m_vertex_data_buffer;
auto ptr = buffer_pointer(&m_vao);
m_vao[0] = ptr;
glBindVertexArray(old_vao);
compiled = true;
}
}
void overlay_pass::destroy()
{
if (compiled)
{
program_handle.remove();
vs.remove();
fs.remove();
fbo.remove();
m_vao.remove();
m_vertex_data_buffer.remove();
m_sampler.remove();
compiled = false;
}
}
void overlay_pass::emit_geometry()
{
int old_vao;
glGetIntegerv(GL_VERTEX_ARRAY_BINDING, &old_vao);
m_vao.bind();
glDrawArrays(primitives, 0, num_drawable_elements);
glBindVertexArray(old_vao);
}
void overlay_pass::run(const areau& region, GLuint target_texture, bool depth_target, bool use_blending)
{
if (!compiled)
{
rsx_log.error("You must initialize overlay passes with create() before calling run()");
return;
}
GLint program;
GLint old_fbo;
GLint depth_func;
GLint viewport[4];
GLboolean color_writes[4];
GLboolean depth_write;
GLint blend_src_rgb;
GLint blend_src_a;
GLint blend_dst_rgb;
GLint blend_dst_a;
GLint blend_eq_a;
GLint blend_eq_rgb;
if (target_texture)
{
glGetIntegerv(GL_FRAMEBUFFER_BINDING, &old_fbo);
glBindFramebuffer(GL_FRAMEBUFFER, fbo.id());
if (depth_target)
{
glFramebufferTexture2D(GL_FRAMEBUFFER, GL_DEPTH_ATTACHMENT, GL_TEXTURE_2D, target_texture, 0);
glDrawBuffer(GL_NONE);
}
else
{
GLenum buffer = GL_COLOR_ATTACHMENT0;
glFramebufferTexture2D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, target_texture, 0);
glDrawBuffers(1, &buffer);
}
}
if (!target_texture || glCheckFramebufferStatus(GL_FRAMEBUFFER) == GL_FRAMEBUFFER_COMPLETE)
{
// Push rasterizer state
glGetIntegerv(GL_VIEWPORT, viewport);
glGetBooleanv(GL_COLOR_WRITEMASK, color_writes);
glGetBooleanv(GL_DEPTH_WRITEMASK, &depth_write);
glGetIntegerv(GL_CURRENT_PROGRAM, &program);
glGetIntegerv(GL_DEPTH_FUNC, &depth_func);
GLboolean scissor_enabled = glIsEnabled(GL_SCISSOR_TEST);
GLboolean depth_test_enabled = glIsEnabled(GL_DEPTH_TEST);
GLboolean cull_face_enabled = glIsEnabled(GL_CULL_FACE);
GLboolean blend_enabled = glIsEnabledi(GL_BLEND, 0);
GLboolean stencil_test_enabled = glIsEnabled(GL_STENCIL_TEST);
if (use_blending)
{
glGetIntegerv(GL_BLEND_SRC_RGB, &blend_src_rgb);
glGetIntegerv(GL_BLEND_SRC_ALPHA, &blend_src_a);
glGetIntegerv(GL_BLEND_DST_RGB, &blend_dst_rgb);
glGetIntegerv(GL_BLEND_DST_ALPHA, &blend_dst_a);
glGetIntegerv(GL_BLEND_EQUATION_RGB, &blend_eq_rgb);
glGetIntegerv(GL_BLEND_EQUATION_ALPHA, &blend_eq_a);
}
// Set initial state
glViewport(region.x1, region.y1, region.width(), region.height());
glColorMask(GL_TRUE, GL_TRUE, GL_TRUE, GL_TRUE);
glDepthMask(depth_target ? GL_TRUE : GL_FALSE);
// Disabling depth test will also disable depth writes which is not desired
glDepthFunc(GL_ALWAYS);
glEnable(GL_DEPTH_TEST);
if (scissor_enabled) glDisable(GL_SCISSOR_TEST);
if (cull_face_enabled) glDisable(GL_CULL_FACE);
if (stencil_test_enabled) glDisable(GL_STENCIL_TEST);
if (use_blending)
{
if (!blend_enabled)
glEnablei(GL_BLEND, 0);
glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA);
glBlendEquation(GL_FUNC_ADD);
}
else if (blend_enabled)
{
glDisablei(GL_BLEND, 0);
}
// Render
program_handle.use();
on_load();
bind_resources();
emit_geometry();
// Clean up
if (target_texture)
{
if (depth_target)
glFramebufferTexture2D(GL_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, GL_TEXTURE_2D, 0, 0);
else
glFramebufferTexture2D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, 0, 0);
glBindFramebuffer(GL_FRAMEBUFFER, old_fbo);
}
glUseProgram(program);
glViewport(viewport[0], viewport[1], viewport[2], viewport[3]);
glColorMask(color_writes[0], color_writes[1], color_writes[2], color_writes[3]);
glDepthMask(depth_write);
glDepthFunc(depth_func);
if (!depth_test_enabled) glDisable(GL_DEPTH_TEST);
if (scissor_enabled) glEnable(GL_SCISSOR_TEST);
if (cull_face_enabled) glEnable(GL_CULL_FACE);
if (stencil_test_enabled) glEnable(GL_STENCIL_TEST);
if (use_blending)
{
if (!blend_enabled)
glDisablei(GL_BLEND, 0);
glBlendFuncSeparate(blend_src_rgb, blend_dst_rgb, blend_src_a, blend_dst_a);
glBlendEquationSeparate(blend_eq_rgb, blend_eq_a);
}
else if (blend_enabled)
{
glEnablei(GL_BLEND, 0);
}
}
else
{
rsx_log.error("Overlay pass failed because framebuffer was not complete. Run with debug output enabled to diagnose the problem");
}
}
ui_overlay_renderer::ui_overlay_renderer()
{
vs_src =
"#version 420\n\n"
"layout(location=0) in vec4 in_pos;\n"
"layout(location=0) out vec2 tc0;\n"
"layout(location=1) flat out vec4 clip_rect;\n"
"uniform vec4 ui_scale;\n"
"uniform vec4 viewport;\n"
"uniform vec4 clip_bounds;\n"
"\n"
"vec2 snap_to_grid(vec2 normalized)\n"
"{\n"
" return (floor(normalized * viewport.xy) + 0.5) / viewport.xy;\n"
"}\n"
"\n"
"vec4 clip_to_ndc(const in vec4 coord)\n"
"{\n"
" vec4 ret = (coord * ui_scale.zwzw) / ui_scale.xyxy;\n"
" ret.yw = 1. - ret.yw;\n"
" return ret;\n"
"}\n"
"\n"
"vec4 ndc_to_window(const in vec4 coord)\n"
"{\n"
" return fma(coord, viewport.xyxy, viewport.zwzw);\n"
"}\n"
"\n"
"void main()\n"
"{\n"
" tc0.xy = in_pos.zw;\n"
" clip_rect = ndc_to_window(clip_to_ndc(clip_bounds)).xwzy; // Swap y1 and y2 due to flipped origin!\n"
" vec4 pos = vec4(clip_to_ndc(in_pos).xy, 0.5, 1.);\n"
" pos.xy = snap_to_grid(pos.xy);\n"
" gl_Position = (pos + pos) - 1.;\n"
"}\n";
fs_src =
"#version 420\n\n"
"layout(binding=31) uniform sampler2D fs0;\n"
"layout(binding=30) uniform sampler2DArray fs1;\n"
"layout(location=0) in vec2 tc0;\n"
"layout(location=1) flat in vec4 clip_rect;\n"
"layout(location=0) out vec4 ocol;\n"
"uniform vec4 color;\n"
"uniform float time;\n"
"uniform int sampler_mode;\n"
"uniform int pulse_glow;\n"
"uniform int clip_region;\n"
"uniform int blur_strength;\n"
"\n"
"vec4 blur_sample(sampler2D tex, vec2 coord, vec2 tex_offset)\n"
"{\n"
" vec2 coords[9];\n"
" coords[0] = coord - tex_offset\n;"
" coords[1] = coord + vec2(0., -tex_offset.y);\n"
" coords[2] = coord + vec2(tex_offset.x, -tex_offset.y);\n"
" coords[3] = coord + vec2(-tex_offset.x, 0.);\n"
" coords[4] = coord;\n"
" coords[5] = coord + vec2(tex_offset.x, 0.);\n"
" coords[6] = coord + vec2(-tex_offset.x, tex_offset.y);\n"
" coords[7] = coord + vec2(0., tex_offset.y);\n"
" coords[8] = coord + tex_offset;\n"
"\n"
" float weights[9] =\n"
" {\n"
" 1., 2., 1.,\n"
" 2., 4., 2.,\n"
" 1., 2., 1.\n"
" };\n"
"\n"
" vec4 blurred = vec4(0.);\n"
" for (int n = 0; n < 9; ++n)\n"
" {\n"
" blurred += texture(tex, coords[n]) * weights[n];\n"
" }\n"
"\n"
" return blurred / 16.f;\n"
"}\n"
"\n"
"vec4 sample_image(sampler2D tex, vec2 coord)\n"
"{\n"
" vec4 original = texture(tex, coord);\n"
" if (blur_strength == 0) return original;\n"
" \n"
" vec2 constraints = 1.f / vec2(640, 360);\n"
" vec2 res_offset = 1.f / textureSize(fs0, 0);\n"
" vec2 tex_offset = max(res_offset, constraints);\n"
"\n"
" // Sample triangle pattern and average\n"
" // TODO: Nicer looking gaussian blur with less sampling\n"
" vec4 blur0 = blur_sample(tex, coord + vec2(-res_offset.x, 0.), tex_offset);\n"
" vec4 blur1 = blur_sample(tex, coord + vec2(res_offset.x, 0.), tex_offset);\n"
" vec4 blur2 = blur_sample(tex, coord + vec2(0., res_offset.y), tex_offset);\n"
"\n"
" vec4 blurred = blur0 + blur1 + blur2;\n"
" blurred /= 3.;\n"
" return mix(original, blurred, float(blur_strength) / 100.);\n"
"}\n"
"\n"
"void main()\n"
"{\n"
" if (clip_region != 0)\n"
" {"
" if (gl_FragCoord.x < clip_rect.x || gl_FragCoord.x > clip_rect.z ||\n"
" gl_FragCoord.y < clip_rect.y || gl_FragCoord.y > clip_rect.w)\n"
" {\n"
" discard;\n"
" return;\n"
" }\n"
" }\n"
"\n"
" vec4 diff_color = color;\n"
" if (pulse_glow != 0)\n"
" diff_color.a *= (sin(time) + 1.f) * 0.5f;\n"
"\n"
" switch (sampler_mode)\n"
" {\n"
" case 1:\n"
" ocol = sample_image(fs0, tc0) * diff_color;\n"
" break;\n"
" case 2:\n"
" ocol = texture(fs1, vec3(tc0.x, fract(tc0.y), trunc(tc0.y))) * diff_color;\n"
" break;\n"
" default:\n"
" ocol = diff_color;\n"
" break;\n"
" }\n"
"}\n";
// Smooth filtering required for inputs
input_filter = GL_LINEAR;
}
gl::texture_view* ui_overlay_renderer::load_simple_image(rsx::overlays::image_info* desc, bool temp_resource, u32 owner_uid)
{
auto tex = std::make_unique<gl::texture>(GL_TEXTURE_2D, desc->w, desc->h, 1, 1, GL_RGBA8);
tex->copy_from(desc->data, gl::texture::format::rgba, gl::texture::type::uint_8_8_8_8, {});
GLenum remap[] = { GL_RED, GL_ALPHA, GL_BLUE, GL_GREEN };
auto view = std::make_unique<gl::texture_view>(tex.get(), remap);
auto result = view.get();
if (!temp_resource)
{
resources.push_back(std::move(tex));
view_cache[view_cache.size()] = std::move(view);
}
else
{
u64 key = reinterpret_cast<u64>(desc);
temp_image_cache[key] = std::make_pair(owner_uid, std::move(tex));
temp_view_cache[key] = std::move(view);
}
return result;
}
void ui_overlay_renderer::create()
{
overlay_pass::create();
rsx::overlays::resource_config configuration;
configuration.load_files();
for (const auto &res : configuration.texture_raw_data)
{
load_simple_image(res.get(), false, UINT32_MAX);
}
configuration.free_resources();
}
void ui_overlay_renderer::destroy()
{
temp_image_cache.clear();
resources.clear();
font_cache.clear();
overlay_pass::destroy();
}
void ui_overlay_renderer::remove_temp_resources(u64 key)
{
std::vector<u64> keys_to_remove;
for (const auto& temp_image : temp_image_cache)
{
if (temp_image.second.first == key)
{
keys_to_remove.push_back(temp_image.first);
}
}
for (const auto& _key : keys_to_remove)
{
temp_image_cache.erase(_key);
temp_view_cache.erase(_key);
}
}
gl::texture_view* ui_overlay_renderer::find_font(rsx::overlays::font* font)
{
const auto font_size = font->get_glyph_data_dimensions();
u64 key = reinterpret_cast<u64>(font);
auto found = view_cache.find(key);
if (found != view_cache.end())
{
if (const auto this_size = found->second->image()->size3D();
font_size.width == this_size.width &&
font_size.height == this_size.height &&
font_size.depth == this_size.depth)
{
return found->second.get();
}
}
// Create font file
std::vector<u8> glyph_data;
font->get_glyph_data(glyph_data);
auto tex = std::make_unique<gl::texture>(GL_TEXTURE_2D_ARRAY, font_size.width, font_size.height, font_size.depth, 1, GL_R8);
tex->copy_from(glyph_data.data(), gl::texture::format::r, gl::texture::type::ubyte, {});
GLenum remap[] = { GL_RED, GL_RED, GL_RED, GL_RED };
auto view = std::make_unique<gl::texture_view>(tex.get(), remap);
auto result = view.get();
font_cache[key] = std::move(tex);
view_cache[key] = std::move(view);
return result;
}
gl::texture_view* ui_overlay_renderer::find_temp_image(rsx::overlays::image_info* desc, u32 owner_uid)
{
auto key = reinterpret_cast<u64>(desc);
auto cached = temp_view_cache.find(key);
if (cached != temp_view_cache.end())
{
return cached->second.get();
}
else
{
return load_simple_image(desc, true, owner_uid);
}
}
void ui_overlay_renderer::set_primitive_type(rsx::overlays::primitive_type type)
{
m_current_primitive_type = type;
switch (type)
{
case rsx::overlays::primitive_type::quad_list:
case rsx::overlays::primitive_type::triangle_strip:
primitives = GL_TRIANGLE_STRIP;
break;
case rsx::overlays::primitive_type::line_list:
primitives = GL_LINES;
break;
case rsx::overlays::primitive_type::line_strip:
primitives = GL_LINE_STRIP;
break;
default:
fmt::throw_exception("Unexpected primitive type %d", static_cast<s32>(type));
}
}
void ui_overlay_renderer::emit_geometry()
{
if (m_current_primitive_type == rsx::overlays::primitive_type::quad_list)
{
// Emulate quads with disjointed triangle strips
int num_quads = num_drawable_elements / 4;
std::vector<GLint> firsts;
std::vector<GLsizei> counts;
firsts.resize(num_quads);
counts.resize(num_quads);
for (int n = 0; n < num_quads; ++n)
{
firsts[n] = (n * 4);
counts[n] = 4;
}
int old_vao;
glGetIntegerv(GL_VERTEX_ARRAY_BINDING, &old_vao);
m_vao.bind();
glMultiDrawArrays(GL_TRIANGLE_STRIP, firsts.data(), counts.data(), num_quads);
glBindVertexArray(old_vao);
}
else
{
overlay_pass::emit_geometry();
}
}
void ui_overlay_renderer::run(const areau& viewport, GLuint target, rsx::overlays::overlay& ui)
{
program_handle.uniforms["viewport"] = color4f(static_cast<f32>(viewport.width()), static_cast<f32>(viewport.height()), static_cast<f32>(viewport.x1), static_cast<f32>(viewport.y1));
program_handle.uniforms["ui_scale"] = color4f(static_cast<f32>(ui.virtual_width), static_cast<f32>(ui.virtual_height), 1.f, 1.f);
program_handle.uniforms["time"] = static_cast<f32>(get_system_time() / 1000) * 0.005f;
saved_sampler_state save_30(30, m_sampler);
saved_sampler_state save_31(31, m_sampler);
for (auto &cmd : ui.get_compiled().draw_commands)
{
set_primitive_type(cmd.config.primitives);
upload_vertex_data(cmd.verts.data(), ::size32(cmd.verts));
num_drawable_elements = ::size32(cmd.verts);
GLint texture_read = GL_TRUE;
switch (cmd.config.texture_ref)
{
case rsx::overlays::image_resource_id::game_icon:
case rsx::overlays::image_resource_id::backbuffer:
//TODO
case rsx::overlays::image_resource_id::none:
{
texture_read = GL_FALSE;
glBindTexture(GL_TEXTURE_2D, GL_NONE);
break;
}
case rsx::overlays::image_resource_id::raw_image:
{
glBindTexture(GL_TEXTURE_2D, find_temp_image(static_cast<rsx::overlays::image_info*>(cmd.config.external_data_ref), ui.uid)->id());
break;
}
case rsx::overlays::image_resource_id::font_file:
{
texture_read = (GL_TRUE + 1);
glActiveTexture(GL_TEXTURE0 + 30);
glBindTexture(GL_TEXTURE_2D_ARRAY, find_font(cmd.config.font_ref)->id());
glActiveTexture(GL_TEXTURE0 + 31);
break;
}
default:
{
glBindTexture(GL_TEXTURE_2D, view_cache[cmd.config.texture_ref - 1]->id());
break;
}
}
program_handle.uniforms["color"] = cmd.config.color;
program_handle.uniforms["sampler_mode"] = texture_read;
program_handle.uniforms["pulse_glow"] = static_cast<s32>(cmd.config.pulse_glow);
program_handle.uniforms["blur_strength"] = static_cast<s32>(cmd.config.blur_strength);
program_handle.uniforms["clip_region"] = static_cast<s32>(cmd.config.clip_region);
program_handle.uniforms["clip_bounds"] = cmd.config.clip_rect;
overlay_pass::run(viewport, target, false, true);
}
ui.update();
}
video_out_calibration_pass::video_out_calibration_pass()
{
vs_src =
"#version 420\n\n"
"layout(location=0) out vec2 tc0;\n"
"\n"
"void main()\n"
"{\n"
" vec2 positions[] = {vec2(-1., -1.), vec2(1., -1.), vec2(-1., 1.), vec2(1., 1.)};\n"
" vec2 coords[] = {vec2(0., 1.), vec2(1., 1.), vec2(0., 0.), vec2(1., 0.)};\n"
" tc0 = coords[gl_VertexID % 4];\n"
" vec2 pos = positions[gl_VertexID % 4];\n"
" gl_Position = vec4(pos, 0., 1.);\n"
"}\n";
fs_src =
"#version 420\n\n"
"layout(binding=31) uniform sampler2D fs0;\n"
"layout(binding=30) uniform sampler2D fs1;\n"
"layout(location=0) in vec2 tc0;\n"
"layout(location=0) out vec4 ocol;\n"
"\n"
"uniform float gamma;\n"
"uniform int limit_range;\n"
"uniform int stereo;\n"
"uniform int stereo_image_count;\n"
"\n"
"vec4 read_source()\n"
"{\n"
" if (stereo == 0) return texture(fs0, tc0);\n"
"\n"
" vec4 left, right;\n"
" if (stereo_image_count == 2)\n"
" {\n"
" left = texture(fs0, tc0);\n"
" right = texture(fs1, tc0);\n"
" }\n"
" else\n"
" {\n"
" vec2 coord_left = tc0 * vec2(1.f, 0.4898f);\n"
" vec2 coord_right = coord_left + vec2(0.f, 0.510204f);\n"
" left = texture(fs0, coord_left);\n"
" right = texture(fs0, coord_right);\n"
" }\n"
"\n"
" return vec4(left.r, right.g, right.b, 1.);\n"
"}\n"
"\n"
"void main()\n"
"{\n"
" vec4 color = read_source();\n"
" color.rgb = pow(color.rgb, vec3(gamma));\n"
" if (limit_range > 0)\n"
" ocol = ((color * 220.) + 16.) / 255.;\n"
" else\n"
" ocol = color;\n"
"}\n";
input_filter = GL_LINEAR;
}
void video_out_calibration_pass::run(const areau& viewport, const rsx::simple_array<GLuint>& source, f32 gamma, bool limited_rgb, bool _3d)
{
program_handle.uniforms["gamma"] = gamma;
program_handle.uniforms["limit_range"] = limited_rgb + 0;
program_handle.uniforms["stereo"] = _3d + 0;
program_handle.uniforms["stereo_image_count"] = (source[1] == GL_NONE? 1 : 2);
saved_sampler_state saved(31, m_sampler);
glBindTexture(GL_TEXTURE_2D, source[0]);
saved_sampler_state saved2(30, m_sampler);
glBindTexture(GL_TEXTURE_2D, source[1]);
overlay_pass::run(viewport, GL_NONE, false, false);
}
}
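A note on the two magic numbers in read_source(): they are consistent with the HDMI 720p frame-packing layout used for PS3 stereo output, where two 720-line eye images share a 1470-line buffer separated by a 30-line gap:

    720 / 1470        = 0.48979...  ~ 0.4898f     (vertical extent of one eye's image in texture space)
    (720 + 30) / 1470 = 0.51020...  ~ 0.510204f   (offset from the top image to the bottom one)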

View File

@@ -1,13 +1,12 @@
#pragma once
#include "util/types.hpp"
#include "GLHelpers.h"
#include "../Overlays/overlays.h"
#include "GLTexture.h"
#include "Emu/RSX/rsx_utils.h"
#include <string>
#include <unordered_map>
extern u64 get_system_time();
namespace gl
{
struct overlay_pass
@@ -53,61 +52,8 @@ namespace gl
}
};
void create()
{
if (!compiled)
{
fs.create(::glsl::program_domain::glsl_fragment_program, fs_src);
fs.compile();
vs.create(::glsl::program_domain::glsl_vertex_program, vs_src);
vs.compile();
program_handle.create();
program_handle.attach(vs);
program_handle.attach(fs);
program_handle.link();
fbo.create();
m_sampler.create();
m_sampler.apply_defaults(input_filter);
m_vertex_data_buffer.create();
int old_vao;
glGetIntegerv(GL_VERTEX_ARRAY_BINDING, &old_vao);
m_vao.create();
m_vao.bind();
m_vao.array_buffer = m_vertex_data_buffer;
auto ptr = buffer_pointer(&m_vao);
m_vao[0] = ptr;
glBindVertexArray(old_vao);
compiled = true;
}
}
void destroy()
{
if (compiled)
{
program_handle.remove();
vs.remove();
fs.remove();
fbo.remove();
m_vao.remove();
m_vertex_data_buffer.remove();
m_sampler.remove();
compiled = false;
}
}
void create();
void destroy();
virtual void on_load() {}
virtual void on_unload() {}
@@ -121,155 +67,9 @@ namespace gl
m_vertex_data_buffer.data(elements_count * sizeof(T), data);
}
virtual void emit_geometry()
{
int old_vao;
glGetIntegerv(GL_VERTEX_ARRAY_BINDING, &old_vao);
virtual void emit_geometry();
m_vao.bind();
glDrawArrays(primitives, 0, num_drawable_elements);
glBindVertexArray(old_vao);
}
void run(const areau& region, GLuint target_texture, bool depth_target, bool use_blending = false)
{
if (!compiled)
{
rsx_log.error("You must initialize overlay passes with create() before calling run()");
return;
}
GLint program;
GLint old_fbo;
GLint depth_func;
GLint viewport[4];
GLboolean color_writes[4];
GLboolean depth_write;
GLint blend_src_rgb;
GLint blend_src_a;
GLint blend_dst_rgb;
GLint blend_dst_a;
GLint blend_eq_a;
GLint blend_eq_rgb;
if (target_texture)
{
glGetIntegerv(GL_FRAMEBUFFER_BINDING, &old_fbo);
glBindFramebuffer(GL_FRAMEBUFFER, fbo.id());
if (depth_target)
{
glFramebufferTexture2D(GL_FRAMEBUFFER, GL_DEPTH_ATTACHMENT, GL_TEXTURE_2D, target_texture, 0);
glDrawBuffer(GL_NONE);
}
else
{
GLenum buffer = GL_COLOR_ATTACHMENT0;
glFramebufferTexture2D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, target_texture, 0);
glDrawBuffers(1, &buffer);
}
}
if (!target_texture || glCheckFramebufferStatus(GL_FRAMEBUFFER) == GL_FRAMEBUFFER_COMPLETE)
{
// Push rasterizer state
glGetIntegerv(GL_VIEWPORT, viewport);
glGetBooleanv(GL_COLOR_WRITEMASK, color_writes);
glGetBooleanv(GL_DEPTH_WRITEMASK, &depth_write);
glGetIntegerv(GL_CURRENT_PROGRAM, &program);
glGetIntegerv(GL_DEPTH_FUNC, &depth_func);
GLboolean scissor_enabled = glIsEnabled(GL_SCISSOR_TEST);
GLboolean depth_test_enabled = glIsEnabled(GL_DEPTH_TEST);
GLboolean cull_face_enabled = glIsEnabled(GL_CULL_FACE);
GLboolean blend_enabled = glIsEnabledi(GL_BLEND, 0);
GLboolean stencil_test_enabled = glIsEnabled(GL_STENCIL_TEST);
if (use_blending)
{
glGetIntegerv(GL_BLEND_SRC_RGB, &blend_src_rgb);
glGetIntegerv(GL_BLEND_SRC_ALPHA, &blend_src_a);
glGetIntegerv(GL_BLEND_DST_RGB, &blend_dst_rgb);
glGetIntegerv(GL_BLEND_DST_ALPHA, &blend_dst_a);
glGetIntegerv(GL_BLEND_EQUATION_RGB, &blend_eq_rgb);
glGetIntegerv(GL_BLEND_EQUATION_ALPHA, &blend_eq_a);
}
// Set initial state
glViewport(region.x1, region.y1, region.width(), region.height());
glColorMask(GL_TRUE, GL_TRUE, GL_TRUE, GL_TRUE);
glDepthMask(depth_target ? GL_TRUE : GL_FALSE);
// Disabling depth test will also disable depth writes which is not desired
glDepthFunc(GL_ALWAYS);
glEnable(GL_DEPTH_TEST);
if (scissor_enabled) glDisable(GL_SCISSOR_TEST);
if (cull_face_enabled) glDisable(GL_CULL_FACE);
if (stencil_test_enabled) glDisable(GL_STENCIL_TEST);
if (use_blending)
{
if (!blend_enabled)
glEnablei(GL_BLEND, 0);
glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA);
glBlendEquation(GL_FUNC_ADD);
}
else if (blend_enabled)
{
glDisablei(GL_BLEND, 0);
}
// Render
program_handle.use();
on_load();
bind_resources();
emit_geometry();
// Clean up
if (target_texture)
{
if (depth_target)
glFramebufferTexture2D(GL_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, GL_TEXTURE_2D, 0, 0);
else
glFramebufferTexture2D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, 0, 0);
glBindFramebuffer(GL_FRAMEBUFFER, old_fbo);
}
glUseProgram(program);
glViewport(viewport[0], viewport[1], viewport[2], viewport[3]);
glColorMask(color_writes[0], color_writes[1], color_writes[2], color_writes[3]);
glDepthMask(depth_write);
glDepthFunc(depth_func);
if (!depth_test_enabled) glDisable(GL_DEPTH_TEST);
if (scissor_enabled) glEnable(GL_SCISSOR_TEST);
if (cull_face_enabled) glEnable(GL_CULL_FACE);
if (stencil_test_enabled) glEnable(GL_STENCIL_TEST);
if (use_blending)
{
if (!blend_enabled)
glDisablei(GL_BLEND, 0);
glBlendFuncSeparate(blend_src_rgb, blend_dst_rgb, blend_src_a, blend_dst_a);
glBlendEquationSeparate(blend_eq_rgb, blend_eq_a);
}
else if (blend_enabled)
{
glEnablei(GL_BLEND, 0);
}
}
else
{
rsx_log.error("Overlay pass failed because framebuffer was not complete. Run with debug output enabled to diagnose the problem");
}
}
void run(const areau& region, GLuint target_texture, bool depth_target, bool use_blending = false);
};
struct ui_overlay_renderer : public overlay_pass
@@ -282,443 +82,30 @@ namespace gl
std::unordered_map<u64, std::unique_ptr<gl::texture_view>> view_cache;
rsx::overlays::primitive_type m_current_primitive_type = rsx::overlays::primitive_type::quad_list;
ui_overlay_renderer()
{
vs_src =
"#version 420\n\n"
"layout(location=0) in vec4 in_pos;\n"
"layout(location=0) out vec2 tc0;\n"
"layout(location=1) flat out vec4 clip_rect;\n"
"uniform vec4 ui_scale;\n"
"uniform vec4 viewport;\n"
"uniform vec4 clip_bounds;\n"
"\n"
"vec2 snap_to_grid(vec2 normalized)\n"
"{\n"
" return (floor(normalized * viewport.xy) + 0.5) / viewport.xy;\n"
"}\n"
"\n"
"vec4 clip_to_ndc(const in vec4 coord)\n"
"{\n"
" vec4 ret = (coord * ui_scale.zwzw) / ui_scale.xyxy;\n"
" ret.yw = 1. - ret.yw;\n"
" return ret;\n"
"}\n"
"\n"
"vec4 ndc_to_window(const in vec4 coord)\n"
"{\n"
" return fma(coord, viewport.xyxy, viewport.zwzw);\n"
"}\n"
"\n"
"void main()\n"
"{\n"
" tc0.xy = in_pos.zw;\n"
" clip_rect = ndc_to_window(clip_to_ndc(clip_bounds)).xwzy; // Swap y1 and y2 due to flipped origin!\n"
" vec4 pos = vec4(clip_to_ndc(in_pos).xy, 0.5, 1.);\n"
" pos.xy = snap_to_grid(pos.xy);\n"
" gl_Position = (pos + pos) - 1.;\n"
"}\n";
ui_overlay_renderer();
fs_src =
"#version 420\n\n"
"layout(binding=31) uniform sampler2D fs0;\n"
"layout(binding=30) uniform sampler2DArray fs1;\n"
"layout(location=0) in vec2 tc0;\n"
"layout(location=1) flat in vec4 clip_rect;\n"
"layout(location=0) out vec4 ocol;\n"
"uniform vec4 color;\n"
"uniform float time;\n"
"uniform int sampler_mode;\n"
"uniform int pulse_glow;\n"
"uniform int clip_region;\n"
"uniform int blur_strength;\n"
"\n"
"vec4 blur_sample(sampler2D tex, vec2 coord, vec2 tex_offset)\n"
"{\n"
" vec2 coords[9];\n"
" coords[0] = coord - tex_offset\n;"
" coords[1] = coord + vec2(0., -tex_offset.y);\n"
" coords[2] = coord + vec2(tex_offset.x, -tex_offset.y);\n"
" coords[3] = coord + vec2(-tex_offset.x, 0.);\n"
" coords[4] = coord;\n"
" coords[5] = coord + vec2(tex_offset.x, 0.);\n"
" coords[6] = coord + vec2(-tex_offset.x, tex_offset.y);\n"
" coords[7] = coord + vec2(0., tex_offset.y);\n"
" coords[8] = coord + tex_offset;\n"
"\n"
" float weights[9] =\n"
" {\n"
" 1., 2., 1.,\n"
" 2., 4., 2.,\n"
" 1., 2., 1.\n"
" };\n"
"\n"
" vec4 blurred = vec4(0.);\n"
" for (int n = 0; n < 9; ++n)\n"
" {\n"
" blurred += texture(tex, coords[n]) * weights[n];\n"
" }\n"
"\n"
" return blurred / 16.f;\n"
"}\n"
"\n"
"vec4 sample_image(sampler2D tex, vec2 coord)\n"
"{\n"
" vec4 original = texture(tex, coord);\n"
" if (blur_strength == 0) return original;\n"
" \n"
" vec2 constraints = 1.f / vec2(640, 360);\n"
" vec2 res_offset = 1.f / textureSize(fs0, 0);\n"
" vec2 tex_offset = max(res_offset, constraints);\n"
"\n"
" // Sample triangle pattern and average\n"
" // TODO: Nicer looking gaussian blur with less sampling\n"
" vec4 blur0 = blur_sample(tex, coord + vec2(-res_offset.x, 0.), tex_offset);\n"
" vec4 blur1 = blur_sample(tex, coord + vec2(res_offset.x, 0.), tex_offset);\n"
" vec4 blur2 = blur_sample(tex, coord + vec2(0., res_offset.y), tex_offset);\n"
"\n"
" vec4 blurred = blur0 + blur1 + blur2;\n"
" blurred /= 3.;\n"
" return mix(original, blurred, float(blur_strength) / 100.);\n"
"}\n"
"\n"
"void main()\n"
"{\n"
" if (clip_region != 0)\n"
" {"
" if (gl_FragCoord.x < clip_rect.x || gl_FragCoord.x > clip_rect.z ||\n"
" gl_FragCoord.y < clip_rect.y || gl_FragCoord.y > clip_rect.w)\n"
" {\n"
" discard;\n"
" return;\n"
" }\n"
" }\n"
"\n"
" vec4 diff_color = color;\n"
" if (pulse_glow != 0)\n"
" diff_color.a *= (sin(time) + 1.f) * 0.5f;\n"
"\n"
" switch (sampler_mode)\n"
" {\n"
" case 1:\n"
" ocol = sample_image(fs0, tc0) * diff_color;\n"
" break;\n"
" case 2:\n"
" ocol = texture(fs1, vec3(tc0.x, fract(tc0.y), trunc(tc0.y))) * diff_color;\n"
" break;\n"
" default:\n"
" ocol = diff_color;\n"
" break;\n"
" }\n"
"}\n";
gl::texture_view* load_simple_image(rsx::overlays::image_info* desc, bool temp_resource, u32 owner_uid);
// Smooth filtering required for inputs
input_filter = GL_LINEAR;
}
void create();
void destroy();
gl::texture_view* load_simple_image(rsx::overlays::image_info* desc, bool temp_resource, u32 owner_uid)
{
auto tex = std::make_unique<gl::texture>(GL_TEXTURE_2D, desc->w, desc->h, 1, 1, GL_RGBA8);
tex->copy_from(desc->data, gl::texture::format::rgba, gl::texture::type::uint_8_8_8_8, {});
void remove_temp_resources(u64 key);
GLenum remap[] = { GL_RED, GL_ALPHA, GL_BLUE, GL_GREEN };
auto view = std::make_unique<gl::texture_view>(tex.get(), remap);
gl::texture_view* find_font(rsx::overlays::font* font);
auto result = view.get();
if (!temp_resource)
{
resources.push_back(std::move(tex));
view_cache[view_cache.size()] = std::move(view);
}
else
{
u64 key = reinterpret_cast<u64>(desc);
temp_image_cache[key] = std::make_pair(owner_uid, std::move(tex));
temp_view_cache[key] = std::move(view);
}
gl::texture_view* find_temp_image(rsx::overlays::image_info* desc, u32 owner_uid);
return result;
}
void set_primitive_type(rsx::overlays::primitive_type type);
void create()
{
overlay_pass::create();
void emit_geometry() override;
rsx::overlays::resource_config configuration;
configuration.load_files();
for (const auto &res : configuration.texture_raw_data)
{
load_simple_image(res.get(), false, UINT32_MAX);
}
configuration.free_resources();
}
void destroy()
{
temp_image_cache.clear();
resources.clear();
font_cache.clear();
overlay_pass::destroy();
}
void remove_temp_resources(u64 key)
{
std::vector<u64> keys_to_remove;
for (const auto& temp_image : temp_image_cache)
{
if (temp_image.second.first == key)
{
keys_to_remove.push_back(temp_image.first);
}
}
for (const auto& _key : keys_to_remove)
{
temp_image_cache.erase(_key);
temp_view_cache.erase(_key);
}
}
gl::texture_view* find_font(rsx::overlays::font *font)
{
const auto font_size = font->get_glyph_data_dimensions();
u64 key = reinterpret_cast<u64>(font);
auto found = view_cache.find(key);
if (found != view_cache.end())
{
if (const auto this_size = found->second->image()->size3D();
font_size.width == this_size.width &&
font_size.height == this_size.height &&
font_size.depth == this_size.depth)
{
return found->second.get();
}
}
// Create font file
std::vector<u8> glyph_data;
font->get_glyph_data(glyph_data);
auto tex = std::make_unique<gl::texture>(GL_TEXTURE_2D_ARRAY, font_size.width, font_size.height, font_size.depth, 1, GL_R8);
tex->copy_from(glyph_data.data(), gl::texture::format::r, gl::texture::type::ubyte, {});
GLenum remap[] = { GL_RED, GL_RED, GL_RED, GL_RED };
auto view = std::make_unique<gl::texture_view>(tex.get(), remap);
auto result = view.get();
font_cache[key] = std::move(tex);
view_cache[key] = std::move(view);
return result;
}
gl::texture_view* find_temp_image(rsx::overlays::image_info *desc, u32 owner_uid)
{
auto key = reinterpret_cast<u64>(desc);
auto cached = temp_view_cache.find(key);
if (cached != temp_view_cache.end())
{
return cached->second.get();
}
else
{
return load_simple_image(desc, true, owner_uid);
}
}
void set_primitive_type(rsx::overlays::primitive_type type)
{
m_current_primitive_type = type;
switch (type)
{
case rsx::overlays::primitive_type::quad_list:
case rsx::overlays::primitive_type::triangle_strip:
primitives = GL_TRIANGLE_STRIP;
break;
case rsx::overlays::primitive_type::line_list:
primitives = GL_LINES;
break;
case rsx::overlays::primitive_type::line_strip:
primitives = GL_LINE_STRIP;
break;
default:
fmt::throw_exception("Unexpected primitive type %d", static_cast<s32>(type));
}
}
void emit_geometry() override
{
if (m_current_primitive_type == rsx::overlays::primitive_type::quad_list)
{
// Emulate quads with disjointed triangle strips
int num_quads = num_drawable_elements / 4;
std::vector<GLint> firsts;
std::vector<GLsizei> counts;
firsts.resize(num_quads);
counts.resize(num_quads);
for (int n = 0; n < num_quads; ++n)
{
firsts[n] = (n * 4);
counts[n] = 4;
}
int old_vao;
glGetIntegerv(GL_VERTEX_ARRAY_BINDING, &old_vao);
m_vao.bind();
glMultiDrawArrays(GL_TRIANGLE_STRIP, firsts.data(), counts.data(), num_quads);
glBindVertexArray(old_vao);
}
else
{
overlay_pass::emit_geometry();
}
}
void run(const areau& viewport, GLuint target, rsx::overlays::overlay& ui)
{
program_handle.uniforms["viewport"] = color4f(static_cast<f32>(viewport.width()), static_cast<f32>(viewport.height()), static_cast<f32>(viewport.x1), static_cast<f32>(viewport.y1));
program_handle.uniforms["ui_scale"] = color4f(static_cast<f32>(ui.virtual_width), static_cast<f32>(ui.virtual_height), 1.f, 1.f);
program_handle.uniforms["time"] = static_cast<f32>(get_system_time() / 1000) * 0.005f;
saved_sampler_state save_30(30, m_sampler);
saved_sampler_state save_31(31, m_sampler);
for (auto &cmd : ui.get_compiled().draw_commands)
{
set_primitive_type(cmd.config.primitives);
upload_vertex_data(cmd.verts.data(), ::size32(cmd.verts));
num_drawable_elements = ::size32(cmd.verts);
GLint texture_read = GL_TRUE;
switch (cmd.config.texture_ref)
{
case rsx::overlays::image_resource_id::game_icon:
case rsx::overlays::image_resource_id::backbuffer:
//TODO
case rsx::overlays::image_resource_id::none:
{
texture_read = GL_FALSE;
glBindTexture(GL_TEXTURE_2D, GL_NONE);
break;
}
case rsx::overlays::image_resource_id::raw_image:
{
glBindTexture(GL_TEXTURE_2D, find_temp_image(static_cast<rsx::overlays::image_info*>(cmd.config.external_data_ref), ui.uid)->id());
break;
}
case rsx::overlays::image_resource_id::font_file:
{
texture_read = (GL_TRUE + 1);
glActiveTexture(GL_TEXTURE0 + 30);
glBindTexture(GL_TEXTURE_2D_ARRAY, find_font(cmd.config.font_ref)->id());
glActiveTexture(GL_TEXTURE0 + 31);
break;
}
default:
{
glBindTexture(GL_TEXTURE_2D, view_cache[cmd.config.texture_ref - 1]->id());
break;
}
}
program_handle.uniforms["color"] = cmd.config.color;
program_handle.uniforms["sampler_mode"] = texture_read;
program_handle.uniforms["pulse_glow"] = static_cast<s32>(cmd.config.pulse_glow);
program_handle.uniforms["blur_strength"] = static_cast<s32>(cmd.config.blur_strength);
program_handle.uniforms["clip_region"] = static_cast<s32>(cmd.config.clip_region);
program_handle.uniforms["clip_bounds"] = cmd.config.clip_rect;
overlay_pass::run(viewport, target, false, true);
}
ui.update();
}
void run(const areau& viewport, GLuint target, rsx::overlays::overlay& ui);
};
struct video_out_calibration_pass : public overlay_pass
{
video_out_calibration_pass()
{
vs_src =
"#version 420\n\n"
"layout(location=0) out vec2 tc0;\n"
"\n"
"void main()\n"
"{\n"
" vec2 positions[] = {vec2(-1., -1.), vec2(1., -1.), vec2(-1., 1.), vec2(1., 1.)};\n"
" vec2 coords[] = {vec2(0., 1.), vec2(1., 1.), vec2(0., 0.), vec2(1., 0.)};\n"
" tc0 = coords[gl_VertexID % 4];\n"
" vec2 pos = positions[gl_VertexID % 4];\n"
" gl_Position = vec4(pos, 0., 1.);\n"
"}\n";
video_out_calibration_pass();
fs_src =
"#version 420\n\n"
"layout(binding=31) uniform sampler2D fs0;\n"
"layout(binding=30) uniform sampler2D fs1;\n"
"layout(location=0) in vec2 tc0;\n"
"layout(location=0) out vec4 ocol;\n"
"\n"
"uniform float gamma;\n"
"uniform int limit_range;\n"
"uniform int stereo;\n"
"uniform int stereo_image_count;\n"
"\n"
"vec4 read_source()\n"
"{\n"
" if (stereo == 0) return texture(fs0, tc0);\n"
"\n"
" vec4 left, right;\n"
" if (stereo_image_count == 2)\n"
" {\n"
" left = texture(fs0, tc0);\n"
" right = texture(fs1, tc0);\n"
" }\n"
" else\n"
" {\n"
" vec2 coord_left = tc0 * vec2(1.f, 0.4898f);\n"
" vec2 coord_right = coord_left + vec2(0.f, 0.510204f);\n"
" left = texture(fs0, coord_left);\n"
" right = texture(fs0, coord_right);\n"
" }\n"
"\n"
" return vec4(left.r, right.g, right.b, 1.);\n"
"}\n"
"\n"
"void main()\n"
"{\n"
" vec4 color = read_source();\n"
" color.rgb = pow(color.rgb, vec3(gamma));\n"
" if (limit_range > 0)\n"
" ocol = ((color * 220.) + 16.) / 255.;\n"
" else\n"
" ocol = color;\n"
"}\n";
input_filter = GL_LINEAR;
}
void run(const areau& viewport, const rsx::simple_array<GLuint>& source, f32 gamma, bool limited_rgb, bool _3d)
{
program_handle.uniforms["gamma"] = gamma;
program_handle.uniforms["limit_range"] = limited_rgb + 0;
program_handle.uniforms["stereo"] = _3d + 0;
program_handle.uniforms["stereo_image_count"] = (source[1] == GL_NONE? 1 : 2);
saved_sampler_state saved(31, m_sampler);
glBindTexture(GL_TEXTURE_2D, source[0]);
saved_sampler_state saved2(30, m_sampler);
glBindTexture(GL_TEXTURE_2D, source[1]);
overlay_pass::run(viewport, GL_NONE, false, false);
}
void run(const areau& viewport, const rsx::simple_array<GLuint>& source, f32 gamma, bool limited_rgb, bool _3d);
};
}
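For a concrete feel of the UI vertex path declared above (clip_to_ndc followed by the (pos + pos) - 1 mapping in the vertex shader), here is the same arithmetic in plain C++ for a vertex at the center of a 1280x720 virtual UI; snap_to_grid and clipping are skipped, and ui_scale.zw is (1, 1) as set in run():

    #include <cstdio>

    int main()
    {
        const float virtual_w = 1280.f, virtual_h = 720.f;   // ui_scale.xy
        const float in_x = 640.f, in_y = 360.f;              // vertex position in UI units

        // clip_to_ndc: scale into [0,1], then flip Y (UI origin is top-left)
        const float u = in_x / virtual_w;                    // 0.5
        const float v = 1.f - (in_y / virtual_h);            // 0.5

        // gl_Position = (pos + pos) - 1 maps [0,1] into clip space [-1,1]
        std::printf("%.2f %.2f\n", u * 2.f - 1.f, v * 2.f - 1.f);   // prints "0.00 0.00"
    }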

View File

@@ -4,6 +4,8 @@
#include "Utilities/geometry.h"
#include "overlay_utils.h"
#include <functional>
namespace rsx
{
namespace overlays

View File

@@ -19,6 +19,7 @@
#include <sys/types.h>
#include <pwd.h>
#include <libgen.h>
#include <limits.h>
#endif
#ifdef __APPLE__

View File

@@ -0,0 +1,428 @@
#include "VKCompute.h"
#include "VKHelpers.h"
#include "VKRenderPass.h"
#include "vkutils/buffer_object.h"
#define VK_MAX_COMPUTE_TASKS 4096 // Max number of jobs per frame
namespace vk
{
std::vector<std::pair<VkDescriptorType, u8>> compute_task::get_descriptor_layout()
{
std::vector<std::pair<VkDescriptorType, u8>> result;
result.emplace_back(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, ssbo_count);
return result;
}
void compute_task::init_descriptors()
{
std::vector<VkDescriptorPoolSize> descriptor_pool_sizes;
std::vector<VkDescriptorSetLayoutBinding> bindings;
const auto layout = get_descriptor_layout();
for (const auto &e : layout)
{
descriptor_pool_sizes.push_back({e.first, u32(VK_MAX_COMPUTE_TASKS * e.second)});
for (unsigned n = 0; n < e.second; ++n)
{
bindings.push_back
({
u32(bindings.size()),
e.first,
1,
VK_SHADER_STAGE_COMPUTE_BIT,
nullptr
});
}
}
// Reserve descriptor pools
m_descriptor_pool.create(*g_render_device, descriptor_pool_sizes.data(), ::size32(descriptor_pool_sizes), VK_MAX_COMPUTE_TASKS, 3);
VkDescriptorSetLayoutCreateInfo infos = {};
infos.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO;
infos.pBindings = bindings.data();
infos.bindingCount = ::size32(bindings);
CHECK_RESULT(vkCreateDescriptorSetLayout(*g_render_device, &infos, nullptr, &m_descriptor_layout));
VkPipelineLayoutCreateInfo layout_info = {};
layout_info.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO;
layout_info.setLayoutCount = 1;
layout_info.pSetLayouts = &m_descriptor_layout;
VkPushConstantRange push_constants{};
if (use_push_constants)
{
push_constants.size = push_constants_size;
push_constants.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT;
layout_info.pushConstantRangeCount = 1;
layout_info.pPushConstantRanges = &push_constants;
}
CHECK_RESULT(vkCreatePipelineLayout(*g_render_device, &layout_info, nullptr, &m_pipeline_layout));
}
void compute_task::create()
{
if (!initialized)
{
init_descriptors();
switch (vk::get_driver_vendor())
{
case vk::driver_vendor::unknown:
case vk::driver_vendor::INTEL:
// Intel hw has 8 threads, but LDS allocation behavior makes optimal group size between 64 and 256
// Based on Intel's own OpenCL recommended settings
unroll_loops = true;
optimal_kernel_size = 1;
optimal_group_size = 128;
break;
case vk::driver_vendor::NVIDIA:
// Warps are multiples of 32. Increasing kernel depth seems to hurt performance (Nier, Big Duck sample)
unroll_loops = true;
optimal_group_size = 32;
optimal_kernel_size = 1;
break;
case vk::driver_vendor::AMD:
case vk::driver_vendor::RADV:
// Wavefronts are multiples of 64
unroll_loops = false;
optimal_kernel_size = 1;
optimal_group_size = 64;
break;
}
const auto& gpu = vk::g_render_device->gpu();
max_invocations_x = gpu.get_limits().maxComputeWorkGroupCount[0];
initialized = true;
}
}
void compute_task::destroy()
{
if (initialized)
{
m_shader.destroy();
m_program.reset();
m_param_buffer.reset();
vkDestroyDescriptorSetLayout(*g_render_device, m_descriptor_layout, nullptr);
vkDestroyPipelineLayout(*g_render_device, m_pipeline_layout, nullptr);
m_descriptor_pool.destroy();
initialized = false;
}
}
void compute_task::free_resources()
{
if (m_used_descriptors == 0)
return;
m_descriptor_pool.reset(0);
m_used_descriptors = 0;
}
void compute_task::load_program(VkCommandBuffer cmd)
{
if (!m_program)
{
m_shader.create(::glsl::program_domain::glsl_compute_program, m_src);
auto handle = m_shader.compile();
VkPipelineShaderStageCreateInfo shader_stage{};
shader_stage.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO;
shader_stage.stage = VK_SHADER_STAGE_COMPUTE_BIT;
shader_stage.module = handle;
shader_stage.pName = "main";
VkComputePipelineCreateInfo info{};
info.sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO;
info.stage = shader_stage;
info.layout = m_pipeline_layout;
info.basePipelineIndex = -1;
info.basePipelineHandle = VK_NULL_HANDLE;
auto compiler = vk::get_pipe_compiler();
m_program = compiler->compile(info, m_pipeline_layout, vk::pipe_compiler::COMPILE_INLINE);
declare_inputs();
}
ensure(m_used_descriptors < VK_MAX_COMPUTE_TASKS);
VkDescriptorSetAllocateInfo alloc_info = {};
alloc_info.descriptorPool = m_descriptor_pool;
alloc_info.descriptorSetCount = 1;
alloc_info.pSetLayouts = &m_descriptor_layout;
alloc_info.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO;
CHECK_RESULT(vkAllocateDescriptorSets(*g_render_device, &alloc_info, &m_descriptor_set));
m_used_descriptors++;
bind_resources();
vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, m_program->pipeline);
vkCmdBindDescriptorSets(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, m_pipeline_layout, 0, 1, &m_descriptor_set, 0, nullptr);
}
void compute_task::run(VkCommandBuffer cmd, u32 invocations_x, u32 invocations_y, u32 invocations_z)
{
// CmdDispatch is outside renderpass scope only
if (vk::is_renderpass_open(cmd))
{
vk::end_renderpass(cmd);
}
load_program(cmd);
vkCmdDispatch(cmd, invocations_x, invocations_y, invocations_z);
}
void compute_task::run(VkCommandBuffer cmd, u32 num_invocations)
{
u32 invocations_x, invocations_y;
if (num_invocations > max_invocations_x)
{
// AMD hw reports an annoyingly small maximum number of invocations in the X dimension
// Split the 1D job into 2 dimensions to accommodate this
invocations_x = static_cast<u32>(floor(std::sqrt(num_invocations)));
invocations_y = invocations_x;
if (num_invocations % invocations_x) invocations_y++;
}
else
{
invocations_x = num_invocations;
invocations_y = 1;
}
run(cmd, invocations_x, invocations_y, 1);
}
cs_shuffle_base::cs_shuffle_base()
{
work_kernel =
" value = data[index];\n"
" data[index] = %f(value);\n";
loop_advance =
" index++;\n";
suffix =
"}\n";
}
void cs_shuffle_base::build(const char* function_name, u32 _kernel_size)
{
// Initialize to allow detecting optimal settings
create();
kernel_size = _kernel_size? _kernel_size : optimal_kernel_size;
m_src =
"#version 430\n"
"layout(local_size_x=%ws, local_size_y=1, local_size_z=1) in;\n"
"layout(std430, set=0, binding=0) buffer ssbo{ uint data[]; };\n"
"%ub"
"\n"
"#define KERNEL_SIZE %ks\n"
"\n"
"// Generic swap routines\n"
"#define bswap_u16(bits) (bits & 0xFF) << 8 | (bits & 0xFF00) >> 8 | (bits & 0xFF0000) << 8 | (bits & 0xFF000000) >> 8\n"
"#define bswap_u32(bits) (bits & 0xFF) << 24 | (bits & 0xFF00) << 8 | (bits & 0xFF0000) >> 8 | (bits & 0xFF000000) >> 24\n"
"#define bswap_u16_u32(bits) (bits & 0xFFFF) << 16 | (bits & 0xFFFF0000) >> 16\n"
"\n"
"// Depth format conversions\n"
"#define d24_to_f32(bits) floatBitsToUint(float(bits) / 16777215.f)\n"
"#define f32_to_d24(bits) uint(uintBitsToFloat(bits) * 16777215.f)\n"
"#define d24f_to_f32(bits) (bits << 7)\n"
"#define f32_to_d24f(bits) (bits >> 7)\n"
"#define d24x8_to_f32(bits) d24_to_f32(bits >> 8)\n"
"#define d24x8_to_d24x8_swapped(bits) (bits & 0xFF00) | (bits & 0xFF0000) >> 16 | (bits & 0xFF) << 16\n"
"#define f32_to_d24x8_swapped(bits) d24x8_to_d24x8_swapped(f32_to_d24(bits))\n"
"\n"
"%md"
"void main()\n"
"{\n"
" uint invocations_x = (gl_NumWorkGroups.x * gl_WorkGroupSize.x);\n"
" uint invocation_id = (gl_GlobalInvocationID.y * invocations_x) + gl_GlobalInvocationID.x;\n"
" uint index = invocation_id * KERNEL_SIZE;\n"
" uint value;\n"
"%vars"
"\n";
const auto parameters_size = utils::align(push_constants_size, 16) / 16;
const std::pair<std::string, std::string> syntax_replace[] =
{
{ "%ws", std::to_string(optimal_group_size) },
{ "%ks", std::to_string(kernel_size) },
{ "%vars", variables },
{ "%f", function_name },
{ "%md", method_declarations },
{ "%ub", use_push_constants? "layout(push_constant) uniform ubo{ uvec4 params[" + std::to_string(parameters_size) + "]; };\n" : "" },
};
m_src = fmt::replace_all(m_src, syntax_replace);
work_kernel = fmt::replace_all(work_kernel, syntax_replace);
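// Emit the kernel body: a single iteration, a manually unrolled loop, or a runtime loop depending on the driver heuristics set in create()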
if (kernel_size <= 1)
{
m_src += " {\n" + work_kernel + " }\n";
}
else if (unroll_loops)
{
work_kernel += loop_advance + "\n";
m_src += std::string
(
" //Unrolled loop\n"
" {\n"
);
// Assemble body with manual loop unroll to try lowering GPR usage
for (u32 n = 0; n < kernel_size; ++n)
{
m_src += work_kernel;
}
m_src += " }\n";
}
else
{
m_src += " for (int loop = 0; loop < KERNEL_SIZE; ++loop)\n";
m_src += " {\n";
m_src += work_kernel;
m_src += loop_advance;
m_src += " }\n";
}
m_src += suffix;
}
void cs_shuffle_base::bind_resources()
{
m_program->bind_buffer({ m_data->value, m_data_offset, m_data_length }, 0, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, m_descriptor_set);
}
void cs_shuffle_base::set_parameters(VkCommandBuffer cmd, const u32* params, u8 count)
{
ensure(use_push_constants);
vkCmdPushConstants(cmd, m_pipeline_layout, VK_SHADER_STAGE_COMPUTE_BIT, 0, count * 4, params);
}
void cs_shuffle_base::run(VkCommandBuffer cmd, const vk::buffer* data, u32 data_length, u32 data_offset)
{
m_data = data;
m_data_offset = data_offset;
m_data_length = data_length;
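// Each workgroup processes optimal_group_size * kernel_size 32-bit words; dispatch enough groups to cover the aligned data length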
const auto num_bytes_per_invocation = optimal_group_size * kernel_size * 4;
const auto num_bytes_to_process = rsx::align2(data_length, num_bytes_per_invocation);
const auto num_invocations = num_bytes_to_process / num_bytes_per_invocation;
if ((num_bytes_to_process + data_offset) > data->size())
{
// Technically robust buffer access should keep the driver from crashing in OOB situations
rsx_log.error("Inadequate buffer length submitted for a compute operation."
"Required=%d bytes, Available=%d bytes", num_bytes_to_process, data->size());
}
compute_task::run(cmd, num_invocations);
}
cs_interleave_task::cs_interleave_task()
{
use_push_constants = true;
push_constants_size = 16;
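// params[0] carries { data length, depth plane offset, stencil plane offset, unused } in bytes; the shader converts them to word offsets below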
variables =
" uint block_length = params[0].x >> 2;\n"
" uint z_offset = params[0].y >> 2;\n"
" uint s_offset = params[0].z >> 2;\n"
" uint depth;\n"
" uint stencil;\n"
" uint stencil_shift;\n"
" uint stencil_offset;\n";
}
void cs_interleave_task::bind_resources()
{
m_program->bind_buffer({ m_data->value, m_data_offset, m_ssbo_length }, 0, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, m_descriptor_set);
}
void cs_interleave_task::run(VkCommandBuffer cmd, const vk::buffer* data, u32 data_offset, u32 data_length, u32 zeta_offset, u32 stencil_offset)
{
u32 parameters[4] = { data_length, zeta_offset - data_offset, stencil_offset - data_offset, 0 };
set_parameters(cmd, parameters, 4);
ensure(stencil_offset > data_offset);
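// Bind enough of the buffer to span both the depth data and the stencil plane (one stencil byte per 32-bit depth word)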
m_ssbo_length = stencil_offset + (data_length / 4) - data_offset;
cs_shuffle_base::run(cmd, data, data_length, data_offset);
}
cs_scatter_d24x8::cs_scatter_d24x8()
{
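// Scatter packed depth-stencil words: the high 24 bits go to the depth plane, the low byte is packed four per word into the stencil plane with atomicOr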
work_kernel =
" if (index >= block_length)\n"
" return;\n"
"\n"
" value = data[index];\n"
" data[index + z_offset] = (value >> 8);\n"
" stencil_offset = (index / 4);\n"
" stencil_shift = (index % 4) * 8;\n"
" stencil = (value & 0xFF) << stencil_shift;\n"
" atomicOr(data[stencil_offset + s_offset], stencil);\n";
cs_shuffle_base::build("");
}
cs_aggregator::cs_aggregator()
{
ssbo_count = 2;
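// Initialize now so optimal_group_size reflects the active driver before the shader source is generated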
create();
m_src =
"#version 450\n"
"layout(local_size_x = %ws, local_size_y = 1, local_size_z = 1) in;\n\n"
"layout(set=0, binding=0, std430) readonly buffer ssbo0{ uint src[]; };\n"
"layout(set=0, binding=1, std430) writeonly buffer ssbo1{ uint result; };\n\n"
"void main()\n"
"{\n"
" if (gl_GlobalInvocationID.x < src.length())\n"
" {\n"
" atomicAdd(result, src[gl_GlobalInvocationID.x]);\n"
" }\n"
"}\n";
const std::pair<std::string, std::string> syntax_replace[] =
{
{ "%ws", std::to_string(optimal_group_size) },
};
m_src = fmt::replace_all(m_src, syntax_replace);
}
void cs_aggregator::bind_resources()
{
m_program->bind_buffer({ src->value, 0, block_length }, 0, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, m_descriptor_set);
m_program->bind_buffer({ dst->value, 0, 4 }, 1, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, m_descriptor_set);
}
void cs_aggregator::run(VkCommandBuffer cmd, const vk::buffer* dst, const vk::buffer* src, u32 num_words)
{
this->dst = dst;
this->src = src;
word_count = num_words;
block_length = num_words * 4;
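// One thread per source word; round the dispatch up to whole workgroups (the shader bounds-checks against src.length())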
const u32 linear_invocations = utils::aligned_div(word_count, optimal_group_size);
compute_task::run(cmd, linear_invocations);
}
}

View File

@ -1,18 +1,14 @@
#pragma once
#include "VKPipelineCompiler.h"
#include "vkutils/descriptors.hpp"
#include "Utilities/StrUtil.h"
#include "vkutils/buffer_object.h"
#include "Emu/IdManager.h"
#include "VKPipelineCompiler.h"
#include "VKRenderPass.h"
#include "VKHelpers.h"
#include "vkutils/buffer_object.h"
#include "vkutils/device.h"
#include "Utilities/StrUtil.h"
#include "util/asm.hpp"
#include <unordered_map>
#define VK_MAX_COMPUTE_TASKS 4096 // Max number of jobs per frame
#include <unordered_map>
namespace vk
{
@ -38,207 +34,22 @@ namespace vk
u32 optimal_kernel_size = 1;
u32 max_invocations_x = 65535;
virtual std::vector<std::pair<VkDescriptorType, u8>> get_descriptor_layout()
{
std::vector<std::pair<VkDescriptorType, u8>> result;
result.emplace_back(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, ssbo_count);
return result;
}
virtual std::vector<std::pair<VkDescriptorType, u8>> get_descriptor_layout();
void init_descriptors()
{
std::vector<VkDescriptorPoolSize> descriptor_pool_sizes;
std::vector<VkDescriptorSetLayoutBinding> bindings;
void init_descriptors();
const auto layout = get_descriptor_layout();
for (const auto &e : layout)
{
descriptor_pool_sizes.push_back({e.first, u32(VK_MAX_COMPUTE_TASKS * e.second)});
void create();
void destroy();
for (unsigned n = 0; n < e.second; ++n)
{
bindings.push_back
({
u32(bindings.size()),
e.first,
1,
VK_SHADER_STAGE_COMPUTE_BIT,
nullptr
});
}
}
void free_resources();
// Reserve descriptor pools
m_descriptor_pool.create(*g_render_device, descriptor_pool_sizes.data(), ::size32(descriptor_pool_sizes), VK_MAX_COMPUTE_TASKS, 3);
virtual void bind_resources() {}
virtual void declare_inputs() {}
VkDescriptorSetLayoutCreateInfo infos = {};
infos.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO;
infos.pBindings = bindings.data();
infos.bindingCount = ::size32(bindings);
void load_program(VkCommandBuffer cmd);
CHECK_RESULT(vkCreateDescriptorSetLayout(*g_render_device, &infos, nullptr, &m_descriptor_layout));
VkPipelineLayoutCreateInfo layout_info = {};
layout_info.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO;
layout_info.setLayoutCount = 1;
layout_info.pSetLayouts = &m_descriptor_layout;
VkPushConstantRange push_constants{};
if (use_push_constants)
{
push_constants.size = push_constants_size;
push_constants.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT;
layout_info.pushConstantRangeCount = 1;
layout_info.pPushConstantRanges = &push_constants;
}
CHECK_RESULT(vkCreatePipelineLayout(*g_render_device, &layout_info, nullptr, &m_pipeline_layout));
}
void create()
{
if (!initialized)
{
init_descriptors();
switch (vk::get_driver_vendor())
{
case vk::driver_vendor::unknown:
case vk::driver_vendor::INTEL:
// Intel hw has 8 threads, but LDS allocation behavior makes optimal group size between 64 and 256
// Based on intel's own OpenCL recommended settings
unroll_loops = true;
optimal_kernel_size = 1;
optimal_group_size = 128;
break;
case vk::driver_vendor::NVIDIA:
// Warps are multiples of 32. Increasing kernel depth seems to hurt performance (Nier, Big Duck sample)
unroll_loops = true;
optimal_group_size = 32;
optimal_kernel_size = 1;
break;
case vk::driver_vendor::AMD:
case vk::driver_vendor::RADV:
// Wavefronts are multiples of 64
unroll_loops = false;
optimal_kernel_size = 1;
optimal_group_size = 64;
break;
}
const auto& gpu = vk::g_render_device->gpu();
max_invocations_x = gpu.get_limits().maxComputeWorkGroupCount[0];
initialized = true;
}
}
void destroy()
{
if (initialized)
{
m_shader.destroy();
m_program.reset();
m_param_buffer.reset();
vkDestroyDescriptorSetLayout(*g_render_device, m_descriptor_layout, nullptr);
vkDestroyPipelineLayout(*g_render_device, m_pipeline_layout, nullptr);
m_descriptor_pool.destroy();
initialized = false;
}
}
void free_resources()
{
if (m_used_descriptors == 0)
return;
m_descriptor_pool.reset(0);
m_used_descriptors = 0;
}
virtual void bind_resources()
{}
virtual void declare_inputs()
{}
void load_program(VkCommandBuffer cmd)
{
if (!m_program)
{
m_shader.create(::glsl::program_domain::glsl_compute_program, m_src);
auto handle = m_shader.compile();
VkPipelineShaderStageCreateInfo shader_stage{};
shader_stage.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO;
shader_stage.stage = VK_SHADER_STAGE_COMPUTE_BIT;
shader_stage.module = handle;
shader_stage.pName = "main";
VkComputePipelineCreateInfo info{};
info.sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO;
info.stage = shader_stage;
info.layout = m_pipeline_layout;
info.basePipelineIndex = -1;
info.basePipelineHandle = VK_NULL_HANDLE;
auto compiler = vk::get_pipe_compiler();
m_program = compiler->compile(info, m_pipeline_layout, vk::pipe_compiler::COMPILE_INLINE);
declare_inputs();
}
ensure(m_used_descriptors < VK_MAX_COMPUTE_TASKS);
VkDescriptorSetAllocateInfo alloc_info = {};
alloc_info.descriptorPool = m_descriptor_pool;
alloc_info.descriptorSetCount = 1;
alloc_info.pSetLayouts = &m_descriptor_layout;
alloc_info.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO;
CHECK_RESULT(vkAllocateDescriptorSets(*g_render_device, &alloc_info, &m_descriptor_set));
m_used_descriptors++;
bind_resources();
vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, m_program->pipeline);
vkCmdBindDescriptorSets(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, m_pipeline_layout, 0, 1, &m_descriptor_set, 0, nullptr);
}
void run(VkCommandBuffer cmd, u32 invocations_x, u32 invocations_y, u32 invocations_z)
{
// CmdDispatch is outside renderpass scope only
if (vk::is_renderpass_open(cmd))
{
vk::end_renderpass(cmd);
}
load_program(cmd);
vkCmdDispatch(cmd, invocations_x, invocations_y, invocations_z);
}
void run(VkCommandBuffer cmd, u32 num_invocations)
{
u32 invocations_x, invocations_y;
if (num_invocations > max_invocations_x)
{
// AMD hw reports an annoyingly small maximum number of invocations in the X dimension
// Split the 1D job into 2 dimensions to accommodate this
invocations_x = static_cast<u32>(floor(std::sqrt(num_invocations)));
invocations_y = invocations_x;
if (num_invocations % invocations_x) invocations_y++;
}
else
{
invocations_x = num_invocations;
invocations_y = 1;
}
run(cmd, invocations_x, invocations_y, 1);
}
void run(VkCommandBuffer cmd, u32 invocations_x, u32 invocations_y, u32 invocations_z);
void run(VkCommandBuffer cmd, u32 num_invocations);
};
struct cs_shuffle_base : compute_task
@ -251,136 +62,15 @@ namespace vk
std::string variables, work_kernel, loop_advance, suffix;
std::string method_declarations;
cs_shuffle_base()
{
work_kernel =
" value = data[index];\n"
" data[index] = %f(value);\n";
cs_shuffle_base();
loop_advance =
" index++;\n";
void build(const char* function_name, u32 _kernel_size = 0);
suffix =
"}\n";
}
void bind_resources() override;
void build(const char* function_name, u32 _kernel_size = 0)
{
// Initialize to allow detecting optimal settings
create();
void set_parameters(VkCommandBuffer cmd, const u32* params, u8 count);
kernel_size = _kernel_size? _kernel_size : optimal_kernel_size;
m_src =
"#version 430\n"
"layout(local_size_x=%ws, local_size_y=1, local_size_z=1) in;\n"
"layout(std430, set=0, binding=0) buffer ssbo{ uint data[]; };\n"
"%ub"
"\n"
"#define KERNEL_SIZE %ks\n"
"\n"
"// Generic swap routines\n"
"#define bswap_u16(bits) (bits & 0xFF) << 8 | (bits & 0xFF00) >> 8 | (bits & 0xFF0000) << 8 | (bits & 0xFF000000) >> 8\n"
"#define bswap_u32(bits) (bits & 0xFF) << 24 | (bits & 0xFF00) << 8 | (bits & 0xFF0000) >> 8 | (bits & 0xFF000000) >> 24\n"
"#define bswap_u16_u32(bits) (bits & 0xFFFF) << 16 | (bits & 0xFFFF0000) >> 16\n"
"\n"
"// Depth format conversions\n"
"#define d24_to_f32(bits) floatBitsToUint(float(bits) / 16777215.f)\n"
"#define f32_to_d24(bits) uint(uintBitsToFloat(bits) * 16777215.f)\n"
"#define d24f_to_f32(bits) (bits << 7)\n"
"#define f32_to_d24f(bits) (bits >> 7)\n"
"#define d24x8_to_f32(bits) d24_to_f32(bits >> 8)\n"
"#define d24x8_to_d24x8_swapped(bits) (bits & 0xFF00) | (bits & 0xFF0000) >> 16 | (bits & 0xFF) << 16\n"
"#define f32_to_d24x8_swapped(bits) d24x8_to_d24x8_swapped(f32_to_d24(bits))\n"
"\n"
"%md"
"void main()\n"
"{\n"
" uint invocations_x = (gl_NumWorkGroups.x * gl_WorkGroupSize.x);"
" uint invocation_id = (gl_GlobalInvocationID.y * invocations_x) + gl_GlobalInvocationID.x;\n"
" uint index = invocation_id * KERNEL_SIZE;\n"
" uint value;\n"
"%vars"
"\n";
const auto parameters_size = utils::align(push_constants_size, 16) / 16;
const std::pair<std::string, std::string> syntax_replace[] =
{
{ "%ws", std::to_string(optimal_group_size) },
{ "%ks", std::to_string(kernel_size) },
{ "%vars", variables },
{ "%f", function_name },
{ "%md", method_declarations },
{ "%ub", use_push_constants? "layout(push_constant) uniform ubo{ uvec4 params[" + std::to_string(parameters_size) + "]; };\n" : "" },
};
m_src = fmt::replace_all(m_src, syntax_replace);
work_kernel = fmt::replace_all(work_kernel, syntax_replace);
if (kernel_size <= 1)
{
m_src += " {\n" + work_kernel + " }\n";
}
else if (unroll_loops)
{
work_kernel += loop_advance + "\n";
m_src += std::string
(
" //Unrolled loop\n"
" {\n"
);
// Assemble body with manual loop unroll to try lowering GPR usage
for (u32 n = 0; n < kernel_size; ++n)
{
m_src += work_kernel;
}
m_src += " }\n";
}
else
{
m_src += " for (int loop = 0; loop < KERNEL_SIZE; ++loop)\n";
m_src += " {\n";
m_src += work_kernel;
m_src += loop_advance;
m_src += " }\n";
}
m_src += suffix;
}
void bind_resources() override
{
m_program->bind_buffer({ m_data->value, m_data_offset, m_data_length }, 0, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, m_descriptor_set);
}
void set_parameters(VkCommandBuffer cmd, const u32* params, u8 count)
{
ensure(use_push_constants);
vkCmdPushConstants(cmd, m_pipeline_layout, VK_SHADER_STAGE_COMPUTE_BIT, 0, count * 4, params);
}
void run(VkCommandBuffer cmd, const vk::buffer* data, u32 data_length, u32 data_offset = 0)
{
m_data = data;
m_data_offset = data_offset;
m_data_length = data_length;
const auto num_bytes_per_invocation = optimal_group_size * kernel_size * 4;
const auto num_bytes_to_process = rsx::align2(data_length, num_bytes_per_invocation);
const auto num_invocations = num_bytes_to_process / num_bytes_per_invocation;
if ((num_bytes_to_process + data_offset) > data->size())
{
// Technically robust buffer access should keep the driver from crashing in OOB situations
rsx_log.error("Inadequate buffer length submitted for a compute operation."
"Required=%d bytes, Available=%d bytes", num_bytes_to_process, data->size());
}
compute_task::run(cmd, num_invocations);
}
void run(VkCommandBuffer cmd, const vk::buffer* data, u32 data_length, u32 data_offset = 0);
};
struct cs_shuffle_16 : cs_shuffle_base
@ -442,35 +132,11 @@ namespace vk
{
u32 m_ssbo_length = 0;
cs_interleave_task()
{
use_push_constants = true;
push_constants_size = 16;
cs_interleave_task();
variables =
" uint block_length = params[0].x >> 2;\n"
" uint z_offset = params[0].y >> 2;\n"
" uint s_offset = params[0].z >> 2;\n"
" uint depth;\n"
" uint stencil;\n"
" uint stencil_shift;\n"
" uint stencil_offset;\n";
}
void bind_resources() override;
void bind_resources() override
{
m_program->bind_buffer({ m_data->value, m_data_offset, m_ssbo_length }, 0, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, m_descriptor_set);
}
void run(VkCommandBuffer cmd, const vk::buffer* data, u32 data_offset, u32 data_length, u32 zeta_offset, u32 stencil_offset)
{
u32 parameters[4] = { data_length, zeta_offset - data_offset, stencil_offset - data_offset, 0 };
set_parameters(cmd, parameters, 4);
ensure(stencil_offset > data_offset);
m_ssbo_length = stencil_offset + (data_length / 4) - data_offset;
cs_shuffle_base::run(cmd, data, data_length, data_offset);
}
void run(VkCommandBuffer cmd, const vk::buffer* data, u32 data_offset, u32 data_length, u32 zeta_offset, u32 stencil_offset);
};
template<bool _SwapBytes = false>
@ -549,21 +215,7 @@ namespace vk
struct cs_scatter_d24x8 : cs_interleave_task
{
cs_scatter_d24x8()
{
work_kernel =
" if (index >= block_length)\n"
" return;\n"
"\n"
" value = data[index];\n"
" data[index + z_offset] = (value >> 8);\n"
" stencil_offset = (index / 4);\n"
" stencil_shift = (index % 4) * 8;\n"
" stencil = (value & 0xFF) << stencil_shift;\n"
" atomicOr(data[stencil_offset + s_offset], stencil);\n";
cs_shuffle_base::build("");
}
cs_scatter_d24x8();
};
template<bool _DepthFloat = false>
@ -962,51 +614,11 @@ namespace vk
u32 block_length = 0;
u32 word_count = 0;
cs_aggregator()
{
ssbo_count = 2;
cs_aggregator();
create();
void bind_resources() override;
m_src =
"#version 450\n"
"layout(local_size_x = %ws, local_size_y = 1, local_size_z = 1) in;\n\n"
"layout(set=0, binding=0, std430) readonly buffer ssbo0{ uint src[]; };\n"
"layout(set=0, binding=1, std430) writeonly buffer ssbo1{ uint result; };\n\n"
"void main()\n"
"{\n"
" if (gl_GlobalInvocationID.x < src.length())\n"
" {\n"
" atomicAdd(result, src[gl_GlobalInvocationID.x]);\n"
" }\n"
"}\n";
const std::pair<std::string, std::string> syntax_replace[] =
{
{ "%ws", std::to_string(optimal_group_size) },
};
m_src = fmt::replace_all(m_src, syntax_replace);
}
void bind_resources() override
{
m_program->bind_buffer({ src->value, 0, block_length }, 0, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, m_descriptor_set);
m_program->bind_buffer({ dst->value, 0, 4 }, 1, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, m_descriptor_set);
}
void run(VkCommandBuffer cmd, const vk::buffer* dst, const vk::buffer* src, u32 num_words)
{
this->dst = dst;
this->src = src;
word_count = num_words;
block_length = num_words * 4;
const u32 linear_invocations = utils::aligned_div(word_count, optimal_group_size);
compute_task::run(cmd, linear_invocations);
}
void run(VkCommandBuffer cmd, const vk::buffer* dst, const vk::buffer* src, u32 num_words);
};
// TODO: Replace with a proper manager

View File

@ -10,7 +10,7 @@ namespace vk
{
std::unordered_map<u64, std::vector<std::unique_ptr<vk::framebuffer_holder>>> g_framebuffers_cache;
vk::framebuffer_holder *get_framebuffer(VkDevice dev, u16 width, u16 height, VkRenderPass renderpass, const std::vector<vk::image*>& image_list)
vk::framebuffer_holder* get_framebuffer(VkDevice dev, u16 width, u16 height, VkRenderPass renderpass, const std::vector<vk::image*>& image_list)
{
u64 key = u64(width) | (u64(height) << 16);
auto &queue = g_framebuffers_cache[key];

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -1,6 +1,7 @@
#include "stdafx.h"
#include "VKGSRender.h"
#include "vkutils/buffer_object.h"
#include "Emu/RSX/Overlays/overlays.h"
#include "Emu/Cell/Modules/cellVideoOut.h"
#include "util/asm.hpp"

View File

@ -3,6 +3,8 @@
#include "VKCompute.h"
#include "VKOverlays.h"
#include "vkutils/image.h"
namespace vk
{
struct cs_resolve_base : compute_task

View File

@ -87,9 +87,11 @@
</ItemGroup>
<ItemGroup>
<ClCompile Include="Emu\RSX\GL\GLCommonDecompiler.cpp" />
<ClCompile Include="Emu\RSX\GL\GLCompute.cpp" />
<ClCompile Include="Emu\RSX\GL\GLDraw.cpp" />
<ClCompile Include="Emu\RSX\GL\GLFragmentProgram.cpp" />
<ClCompile Include="Emu\RSX\GL\GLGSRender.cpp" />
<ClCompile Include="Emu\RSX\GL\GLOverlays.cpp" />
<ClCompile Include="Emu\RSX\GL\GLPipelineCompiler.cpp" />
<ClCompile Include="Emu\RSX\GL\GLVertexProgram.cpp" />
<ClCompile Include="Emu\RSX\GL\GLHelpers.cpp" />

View File

@ -15,6 +15,8 @@
<ClCompile Include="Emu\RSX\GL\GLVertexBuffers.cpp" />
<ClCompile Include="Emu\RSX\GL\GLPipelineCompiler.cpp" />
<ClCompile Include="Emu\RSX\GL\GLTextureCache.cpp" />
<ClCompile Include="Emu\RSX\GL\GLOverlays.cpp" />
<ClCompile Include="Emu\RSX\GL\GLCompute.cpp" />
</ItemGroup>
<ItemGroup>
<ClInclude Include="Emu\RSX\GL\GLTexture.h" />

View File

@ -66,6 +66,7 @@
<ItemGroup>
<ClCompile Include="Emu\RSX\VK\VKCommandStream.cpp" />
<ClCompile Include="Emu\RSX\VK\VKCommonDecompiler.cpp" />
<ClCompile Include="Emu\RSX\VK\VKCompute.cpp" />
<ClCompile Include="Emu\RSX\VK\VKDMA.cpp" />
<ClCompile Include="Emu\RSX\VK\VKDraw.cpp" />
<ClCompile Include="Emu\RSX\VK\VKFormats.cpp" />
@ -73,6 +74,7 @@
<ClCompile Include="Emu\RSX\VK\VKFramebuffer.cpp" />
<ClCompile Include="Emu\RSX\VK\VKGSRender.cpp" />
<ClCompile Include="Emu\RSX\VK\VKHelpers.cpp" />
<ClCompile Include="Emu\RSX\VK\VKOverlays.cpp" />
<ClCompile Include="Emu\RSX\VK\VKPipelineCompiler.cpp" />
<ClCompile Include="Emu\RSX\VK\VKPresent.cpp" />
<ClCompile Include="Emu\RSX\VK\VKProgramPipeline.cpp" />

View File

@ -62,6 +62,8 @@
<ClCompile Include="Emu\RSX\VK\vkutils\image_helpers.cpp">
<Filter>vkutils</Filter>
</ClCompile>
<ClCompile Include="Emu\RSX\VK\VKOverlays.cpp" />
<ClCompile Include="Emu\RSX\VK\VKCompute.cpp" />
</ItemGroup>
<ItemGroup>
<ClInclude Include="Emu\RSX\VK\VKCommonDecompiler.h" />