mirror of
https://github.com/RPCS3/rpcs3.git
synced 2024-11-22 02:32:36 +01:00
Move code to cpp (#9938)
* GL: move GLOverlays code to cpp * GL: move GLCompute code to cpp * VK: move VKOverlays code to cpp * VK: move VKCompute code to cpp
This commit is contained in:
parent
9cbe77904d
commit
cbd895a29c
@ -430,10 +430,12 @@ target_sources(rpcs3_emu PRIVATE
|
||||
RSX/Capture/rsx_capture.cpp
|
||||
RSX/Capture/rsx_replay.cpp
|
||||
RSX/GL/GLCommonDecompiler.cpp
|
||||
RSX/GL/GLCompute.cpp
|
||||
RSX/GL/GLDraw.cpp
|
||||
RSX/GL/GLFragmentProgram.cpp
|
||||
RSX/GL/GLGSRender.cpp
|
||||
RSX/GL/GLHelpers.cpp
|
||||
RSX/GL/GLOverlays.cpp
|
||||
RSX/GL/GLPipelineCompiler.cpp
|
||||
RSX/GL/GLPresent.cpp
|
||||
RSX/GL/GLRenderTargets.cpp
|
||||
@ -462,6 +464,7 @@ if(TARGET 3rdparty_vulkan)
|
||||
RSX/VK/vkutils/shared.cpp
|
||||
RSX/VK/VKCommandStream.cpp
|
||||
RSX/VK/VKCommonDecompiler.cpp
|
||||
RSX/VK/VKCompute.cpp
|
||||
RSX/VK/VKDMA.cpp
|
||||
RSX/VK/VKDraw.cpp
|
||||
RSX/VK/VKFormats.cpp
|
||||
@ -470,6 +473,7 @@ if(TARGET 3rdparty_vulkan)
|
||||
RSX/VK/VKGSRender.cpp
|
||||
RSX/VK/VKHelpers.cpp
|
||||
RSX/VK/VKMemAlloc.cpp
|
||||
RSX/VK/VKOverlays.cpp
|
||||
RSX/VK/VKPipelineCompiler.cpp
|
||||
RSX/VK/VKPresent.cpp
|
||||
RSX/VK/VKProgramPipeline.cpp
|
||||
|
297
rpcs3/Emu/RSX/GL/GLCompute.cpp
Normal file
297
rpcs3/Emu/RSX/GL/GLCompute.cpp
Normal file
@ -0,0 +1,297 @@
|
||||
#include "GLCompute.h"
|
||||
#include "Utilities/StrUtil.h"
|
||||
|
||||
namespace gl
|
||||
{
|
||||
void compute_task::initialize()
|
||||
{
|
||||
// Set up optimal kernel size
|
||||
const auto& caps = gl::get_driver_caps();
|
||||
if (caps.vendor_AMD || caps.vendor_MESA)
|
||||
{
|
||||
optimal_group_size = 64;
|
||||
unroll_loops = false;
|
||||
}
|
||||
else if (caps.vendor_NVIDIA)
|
||||
{
|
||||
optimal_group_size = 32;
|
||||
}
|
||||
else
|
||||
{
|
||||
optimal_group_size = 128;
|
||||
}
|
||||
|
||||
glGetIntegeri_v(GL_MAX_COMPUTE_WORK_GROUP_COUNT, 0, reinterpret_cast<GLint*>(&max_invocations_x));
|
||||
}
|
||||
|
||||
void compute_task::create()
|
||||
{
|
||||
if (!compiled)
|
||||
{
|
||||
m_shader.create(::glsl::program_domain::glsl_compute_program, m_src);
|
||||
m_shader.compile();
|
||||
|
||||
m_program.create();
|
||||
m_program.attach(m_shader);
|
||||
m_program.link();
|
||||
|
||||
compiled = true;
|
||||
}
|
||||
}
|
||||
|
||||
void compute_task::destroy()
|
||||
{
|
||||
if (compiled)
|
||||
{
|
||||
m_program.remove();
|
||||
m_shader.remove();
|
||||
|
||||
compiled = false;
|
||||
}
|
||||
}
|
||||
|
||||
// Dispatches the compute program with invocations_x by invocations_y work groups
// (Z is always 1), then reinstates the program that was current before the call.
void compute_task::run(u32 invocations_x, u32 invocations_y)
{
	// Remember the caller's program so it can be put back afterwards
	GLint previous_program;
	glGetIntegerv(GL_CURRENT_PROGRAM, &previous_program);

	// Resources are bound by the subclass before the program is made current
	bind_resources();
	m_program.use();
	glDispatchCompute(invocations_x, invocations_y, 1);

	glUseProgram(previous_program);
}
|
||||
|
||||
// Dispatches at least num_invocations work groups.
// Counts that fit within max_invocations_x are issued as a single row; larger
// counts are folded into a roughly square two-dimensional grid.
void compute_task::run(u32 num_invocations)
{
	u32 invocations_x, invocations_y;
	if (num_invocations <= max_invocations_x) [[likely]]
	{
		invocations_x = num_invocations;
		invocations_y = 1;
	}
	else
	{
		// Since all the invocations will run, the optimal distribution is sqrt(count)
		invocations_x = static_cast<u32>(floor(std::sqrt(num_invocations)));

		// Round the row count up so that x * y always covers the whole request.
		// Starting from y = x and adding a single row on a remainder is not enough:
		// e.g. a count of exactly x * (x + 1) leaves no remainder and would get x * x groups.
		// invocations_x is at least 1 here because this branch needs a non-zero count.
		invocations_y = num_invocations / invocations_x;
		if (num_invocations % invocations_x) invocations_y++;
	}

	run(invocations_x, invocations_y);
}
|
||||
|
||||
// Installs the default GLSL fragments used by build(): the closing brace of
// main(), a one-word index step, and a kernel that rewrites each word through
// the function substituted for %f.
cs_shuffle_base::cs_shuffle_base()
{
	suffix =
		"}\n";

	loop_advance =
		" index++;\n";

	work_kernel =
		" value = data[index];\n"
		" data[index] = %f(value);\n";
}
|
||||
|
||||
void cs_shuffle_base::build(const char* function_name, u32 _kernel_size)
|
||||
{
|
||||
// Initialize to allow detecting optimal settings
|
||||
initialize();
|
||||
|
||||
kernel_size = _kernel_size? _kernel_size : optimal_kernel_size;
|
||||
|
||||
m_src =
|
||||
"#version 430\n"
|
||||
"layout(local_size_x=%ws, local_size_y=1, local_size_z=1) in;\n"
|
||||
"layout(binding=%loc, std430) buffer ssbo{ uint data[]; };\n"
|
||||
"%ub"
|
||||
"\n"
|
||||
"#define KERNEL_SIZE %ks\n"
|
||||
"\n"
|
||||
"// Generic swap routines\n"
|
||||
"#define bswap_u16(bits) (bits & 0xFF) << 8 | (bits & 0xFF00) >> 8 | (bits & 0xFF0000) << 8 | (bits & 0xFF000000) >> 8\n"
|
||||
"#define bswap_u32(bits) (bits & 0xFF) << 24 | (bits & 0xFF00) << 8 | (bits & 0xFF0000) >> 8 | (bits & 0xFF000000) >> 24\n"
|
||||
"#define bswap_u16_u32(bits) (bits & 0xFFFF) << 16 | (bits & 0xFFFF0000) >> 16\n"
|
||||
"\n"
|
||||
"// Depth format conversions\n"
|
||||
"#define d24f_to_f32(bits) (bits << 7)\n"
|
||||
"#define f32_to_d24f(bits) (bits >> 7)\n"
|
||||
"\n"
|
||||
"uint linear_invocation_id()\n"
|
||||
"{\n"
|
||||
" uint size_in_x = (gl_NumWorkGroups.x * gl_WorkGroupSize.x);\n"
|
||||
" return (gl_GlobalInvocationID.y * size_in_x) + gl_GlobalInvocationID.x;\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"%md"
|
||||
"void main()\n"
|
||||
"{\n"
|
||||
" uint invocation_id = linear_invocation_id();\n"
|
||||
" uint index = invocation_id * KERNEL_SIZE;\n"
|
||||
" uint value;\n"
|
||||
" %vars"
|
||||
"\n";
|
||||
|
||||
const std::pair<std::string, std::string> syntax_replace[] =
|
||||
{
|
||||
{ "%loc", std::to_string(GL_COMPUTE_BUFFER_SLOT(0)) },
|
||||
{ "%ws", std::to_string(optimal_group_size) },
|
||||
{ "%ks", std::to_string(kernel_size) },
|
||||
{ "%vars", variables },
|
||||
{ "%f", function_name },
|
||||
{ "%ub", uniforms },
|
||||
{ "%md", method_declarations }
|
||||
};
|
||||
|
||||
m_src = fmt::replace_all(m_src, syntax_replace);
|
||||
work_kernel = fmt::replace_all(work_kernel, syntax_replace);
|
||||
|
||||
if (kernel_size <= 1)
|
||||
{
|
||||
m_src += " {\n" + work_kernel + " }\n";
|
||||
}
|
||||
else if (unroll_loops)
|
||||
{
|
||||
work_kernel += loop_advance + "\n";
|
||||
|
||||
m_src += std::string
|
||||
(
|
||||
" //Unrolled loop\n"
|
||||
" {\n"
|
||||
);
|
||||
|
||||
// Assemble body with manual loop unroll to try loweing GPR usage
|
||||
for (u32 n = 0; n < kernel_size; ++n)
|
||||
{
|
||||
m_src += work_kernel;
|
||||
}
|
||||
|
||||
m_src += " }\n";
|
||||
}
|
||||
else
|
||||
{
|
||||
m_src += " for (int loop = 0; loop < KERNEL_SIZE; ++loop)\n";
|
||||
m_src += " {\n";
|
||||
m_src += work_kernel;
|
||||
m_src += loop_advance;
|
||||
m_src += " }\n";
|
||||
}
|
||||
|
||||
m_src += suffix;
|
||||
}
|
||||
|
||||
void cs_shuffle_base::bind_resources()
|
||||
{
|
||||
m_data->bind_range(gl::buffer::target::ssbo, GL_COMPUTE_BUFFER_SLOT(0), m_data_offset, m_data_length);
|
||||
}
|
||||
|
||||
// Runs the shuffle kernel over data_length bytes of data, starting at data_offset.
// The length is rounded up to a whole number of work groups; an error is logged
// (but the dispatch still happens) when the rounded range does not fit in the buffer.
void cs_shuffle_base::run(const gl::buffer* data, u32 data_length, u32 data_offset)
{
	// Stash the target range for bind_resources()
	m_data = data;
	m_data_offset = data_offset;
	m_data_length = data_length;

	// One work group covers optimal_group_size invocations * kernel_size words * 4 bytes
	const auto bytes_per_group = optimal_group_size * kernel_size * 4;
	const auto padded_length = utils::align(data_length, bytes_per_group);
	const auto group_count = padded_length / bytes_per_group;

	const bool overruns_buffer = (padded_length + data_offset) > data->size();
	if (overruns_buffer)
	{
		// Technically robust buffer access should keep the driver from crashing in OOB situations
		rsx_log.error("Inadequate buffer length submitted for a compute operation."
			"Required=%d bytes, Available=%d bytes", padded_length, data->size());
	}

	compute_task::run(group_count);
}
|
||||
|
||||
// Builds the kernel that packs two-word depth/stencil texels into one word each.
// in_ptr and out_ptr are byte offsets supplied by run(); the shader converts them
// to word offsets because data[] is indexed in 32-bit words.
cs_shuffle_d32fx8_to_x8d24f::cs_shuffle_d32fx8_to_x8d24f()
{
	uniforms = "uniform uint in_ptr, out_ptr;\n";

	variables =
		" uint in_offset = in_ptr >> 2;\n"
		" uint out_offset = out_ptr >> 2;\n"
		" uint depth, stencil;\n";

	// The destination index must use the word offset (out_offset). Adding the raw
	// byte offset out_ptr to a word index lands on the wrong word whenever the
	// destination does not sit at the start of the bound range.
	work_kernel =
		" depth = data[index * 2 + in_offset];\n"
		" stencil = data[index * 2 + (in_offset + 1)] & 0xFFu;\n"
		" value = f32_to_d24f(depth) << 8;\n"
		" value |= stencil;\n"
		" data[index + out_offset] = bswap_u32(value);\n";

	// No %f placeholder is used by this kernel, so the function name is empty
	cs_shuffle_base::build("");
}
|
||||
|
||||
void cs_shuffle_d32fx8_to_x8d24f::bind_resources()
|
||||
{
|
||||
m_data->bind_range(gl::buffer::target::ssbo, GL_COMPUTE_BUFFER_SLOT(0), m_data_offset, m_ssbo_length);
|
||||
}
|
||||
|
||||
// Converts num_texels texels read at src_offset (8 bytes each) into texels
// written at dst_offset (4 bytes each) within the same buffer.
void cs_shuffle_d32fx8_to_x8d24f::run(const gl::buffer* data, u32 src_offset, u32 dst_offset, u32 num_texels)
{
	// The bound range starts at whichever region comes first and runs to the end of the other one
	const bool dst_comes_first = src_offset > dst_offset;
	const u32 data_offset = dst_comes_first ? dst_offset : src_offset;
	const u32 range_end = dst_comes_first ? (src_offset + num_texels * 8) : (dst_offset + num_texels * 4);
	m_ssbo_length = range_end - data_offset;

	// Both pointers are passed relative to the start of the bound range
	m_program.uniforms["in_ptr"] = src_offset - data_offset;
	m_program.uniforms["out_ptr"] = dst_offset - data_offset;
	cs_shuffle_base::run(data, num_texels * 4, data_offset);
}
|
||||
|
||||
// Builds the kernel that expands one-word packed depth/stencil texels into two
// words each: the converted depth followed by the stencil byte.
cs_shuffle_x8d24f_to_d32fx8::cs_shuffle_x8d24f_to_d32fx8()
{
	// in_ptr / out_ptr are byte offsets; the shader shifts them down to word offsets
	variables =
		" uint in_offset = in_ptr >> 2;\n"
		" uint out_offset = out_ptr >> 2;\n"
		" uint depth, stencil;\n";

	uniforms = "uniform uint texel_count, in_ptr, out_ptr;\n";

	work_kernel =
		" value = data[index + in_offset];\n"
		" value = bswap_u32(value);\n"
		" stencil = (value & 0xFFu);\n"
		" depth = (value >> 8);\n"
		" data[index * 2 + out_offset] = d24f_to_f32(depth);\n"
		" data[index * 2 + (out_offset + 1)] = stencil;\n";

	// No %f placeholder is used by this kernel, so the function name is empty
	cs_shuffle_base::build("");
}
|
||||
|
||||
void cs_shuffle_x8d24f_to_d32fx8::bind_resources()
|
||||
{
|
||||
m_data->bind_range(gl::buffer::target::ssbo, GL_COMPUTE_BUFFER_SLOT(0), m_data_offset, m_ssbo_length);
|
||||
}
|
||||
|
||||
// Converts num_texels texels read at src_offset (4 bytes each) into texels
// written at dst_offset (8 bytes each) within the same buffer.
void cs_shuffle_x8d24f_to_d32fx8::run(const gl::buffer* data, u32 src_offset, u32 dst_offset, u32 num_texels)
{
	// The bound range starts at whichever region comes first and runs to the end of the other one
	const bool dst_comes_first = src_offset > dst_offset;
	const u32 data_offset = dst_comes_first ? dst_offset : src_offset;
	const u32 range_end = dst_comes_first ? (src_offset + num_texels * 4) : (dst_offset + num_texels * 8);
	m_ssbo_length = range_end - data_offset;

	// Both pointers are passed relative to the start of the bound range
	m_program.uniforms["in_ptr"] = src_offset - data_offset;
	m_program.uniforms["out_ptr"] = dst_offset - data_offset;
	cs_shuffle_base::run(data, num_texels * 4, data_offset);
}
|
||||
}
|
@ -1,10 +1,8 @@
|
||||
#pragma once
|
||||
|
||||
#include "Utilities/StrUtil.h"
|
||||
#include "Emu/IdManager.h"
|
||||
#include "GLHelpers.h"
|
||||
|
||||
#include "util/asm.hpp"
|
||||
#include <unordered_map>
|
||||
|
||||
namespace gl
|
||||
@ -22,88 +20,14 @@ namespace gl
|
||||
u32 optimal_kernel_size = 1;
|
||||
u32 max_invocations_x = 65535;
|
||||
|
||||
void initialize()
|
||||
{
|
||||
// Set up optimal kernel size
|
||||
const auto& caps = gl::get_driver_caps();
|
||||
if (caps.vendor_AMD || caps.vendor_MESA)
|
||||
{
|
||||
optimal_group_size = 64;
|
||||
unroll_loops = false;
|
||||
}
|
||||
else if (caps.vendor_NVIDIA)
|
||||
{
|
||||
optimal_group_size = 32;
|
||||
}
|
||||
else
|
||||
{
|
||||
optimal_group_size = 128;
|
||||
}
|
||||
void initialize();
|
||||
void create();
|
||||
void destroy();
|
||||
|
||||
glGetIntegeri_v(GL_MAX_COMPUTE_WORK_GROUP_COUNT, 0, reinterpret_cast<GLint*>(&max_invocations_x));
|
||||
}
|
||||
virtual void bind_resources() {}
|
||||
|
||||
void create()
|
||||
{
|
||||
if (!compiled)
|
||||
{
|
||||
m_shader.create(::glsl::program_domain::glsl_compute_program, m_src);
|
||||
m_shader.compile();
|
||||
|
||||
m_program.create();
|
||||
m_program.attach(m_shader);
|
||||
m_program.link();
|
||||
|
||||
compiled = true;
|
||||
}
|
||||
}
|
||||
|
||||
void destroy()
|
||||
{
|
||||
if (compiled)
|
||||
{
|
||||
m_program.remove();
|
||||
m_shader.remove();
|
||||
|
||||
compiled = false;
|
||||
}
|
||||
}
|
||||
|
||||
virtual void bind_resources()
|
||||
{}
|
||||
|
||||
void run(u32 invocations_x, u32 invocations_y)
|
||||
{
|
||||
GLint old_program;
|
||||
glGetIntegerv(GL_CURRENT_PROGRAM, &old_program);
|
||||
|
||||
bind_resources();
|
||||
m_program.use();
|
||||
glDispatchCompute(invocations_x, invocations_y, 1);
|
||||
|
||||
glUseProgram(old_program);
|
||||
}
|
||||
|
||||
void run(u32 num_invocations)
|
||||
{
|
||||
u32 invocations_x, invocations_y;
|
||||
if (num_invocations <= max_invocations_x) [[likely]]
|
||||
{
|
||||
invocations_x = num_invocations;
|
||||
invocations_y = 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
// Since all the invocations will run, the optimal distribution is sqrt(count)
|
||||
const u32 optimal_length = static_cast<u32>(floor(std::sqrt(num_invocations)));
|
||||
invocations_x = optimal_length;
|
||||
invocations_y = invocations_x;
|
||||
|
||||
if (num_invocations % invocations_x) invocations_y++;
|
||||
}
|
||||
|
||||
run(invocations_x, invocations_y);
|
||||
}
|
||||
void run(u32 invocations_x, u32 invocations_y);
|
||||
void run(u32 num_invocations);
|
||||
};
|
||||
|
||||
struct cs_shuffle_base : compute_task
|
||||
@ -115,130 +39,13 @@ namespace gl
|
||||
|
||||
std::string uniforms, variables, work_kernel, loop_advance, suffix, method_declarations;
|
||||
|
||||
cs_shuffle_base()
|
||||
{
|
||||
work_kernel =
|
||||
" value = data[index];\n"
|
||||
" data[index] = %f(value);\n";
|
||||
cs_shuffle_base();
|
||||
|
||||
loop_advance =
|
||||
" index++;\n";
|
||||
void build(const char* function_name, u32 _kernel_size = 0);
|
||||
|
||||
suffix =
|
||||
"}\n";
|
||||
}
|
||||
void bind_resources() override;
|
||||
|
||||
void build(const char* function_name, u32 _kernel_size = 0)
|
||||
{
|
||||
// Initialize to allow detecting optimal settings
|
||||
initialize();
|
||||
|
||||
kernel_size = _kernel_size? _kernel_size : optimal_kernel_size;
|
||||
|
||||
m_src =
|
||||
"#version 430\n"
|
||||
"layout(local_size_x=%ws, local_size_y=1, local_size_z=1) in;\n"
|
||||
"layout(binding=%loc, std430) buffer ssbo{ uint data[]; };\n"
|
||||
"%ub"
|
||||
"\n"
|
||||
"#define KERNEL_SIZE %ks\n"
|
||||
"\n"
|
||||
"// Generic swap routines\n"
|
||||
"#define bswap_u16(bits) (bits & 0xFF) << 8 | (bits & 0xFF00) >> 8 | (bits & 0xFF0000) << 8 | (bits & 0xFF000000) >> 8\n"
|
||||
"#define bswap_u32(bits) (bits & 0xFF) << 24 | (bits & 0xFF00) << 8 | (bits & 0xFF0000) >> 8 | (bits & 0xFF000000) >> 24\n"
|
||||
"#define bswap_u16_u32(bits) (bits & 0xFFFF) << 16 | (bits & 0xFFFF0000) >> 16\n"
|
||||
"\n"
|
||||
"// Depth format conversions\n"
|
||||
"#define d24f_to_f32(bits) (bits << 7)\n"
|
||||
"#define f32_to_d24f(bits) (bits >> 7)\n"
|
||||
"\n"
|
||||
"uint linear_invocation_id()\n"
|
||||
"{\n"
|
||||
" uint size_in_x = (gl_NumWorkGroups.x * gl_WorkGroupSize.x);\n"
|
||||
" return (gl_GlobalInvocationID.y * size_in_x) + gl_GlobalInvocationID.x;\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"%md"
|
||||
"void main()\n"
|
||||
"{\n"
|
||||
" uint invocation_id = linear_invocation_id();\n"
|
||||
" uint index = invocation_id * KERNEL_SIZE;\n"
|
||||
" uint value;\n"
|
||||
" %vars"
|
||||
"\n";
|
||||
|
||||
const std::pair<std::string, std::string> syntax_replace[] =
|
||||
{
|
||||
{ "%loc", std::to_string(GL_COMPUTE_BUFFER_SLOT(0)) },
|
||||
{ "%ws", std::to_string(optimal_group_size) },
|
||||
{ "%ks", std::to_string(kernel_size) },
|
||||
{ "%vars", variables },
|
||||
{ "%f", function_name },
|
||||
{ "%ub", uniforms },
|
||||
{ "%md", method_declarations }
|
||||
};
|
||||
|
||||
m_src = fmt::replace_all(m_src, syntax_replace);
|
||||
work_kernel = fmt::replace_all(work_kernel, syntax_replace);
|
||||
|
||||
if (kernel_size <= 1)
|
||||
{
|
||||
m_src += " {\n" + work_kernel + " }\n";
|
||||
}
|
||||
else if (unroll_loops)
|
||||
{
|
||||
work_kernel += loop_advance + "\n";
|
||||
|
||||
m_src += std::string
|
||||
(
|
||||
" //Unrolled loop\n"
|
||||
" {\n"
|
||||
);
|
||||
|
||||
// Assemble body with manual loop unroll to try lowering GPR usage
|
||||
for (u32 n = 0; n < kernel_size; ++n)
|
||||
{
|
||||
m_src += work_kernel;
|
||||
}
|
||||
|
||||
m_src += " }\n";
|
||||
}
|
||||
else
|
||||
{
|
||||
m_src += " for (int loop = 0; loop < KERNEL_SIZE; ++loop)\n";
|
||||
m_src += " {\n";
|
||||
m_src += work_kernel;
|
||||
m_src += loop_advance;
|
||||
m_src += " }\n";
|
||||
}
|
||||
|
||||
m_src += suffix;
|
||||
}
|
||||
|
||||
void bind_resources() override
|
||||
{
|
||||
m_data->bind_range(gl::buffer::target::ssbo, GL_COMPUTE_BUFFER_SLOT(0), m_data_offset, m_data_length);
|
||||
}
|
||||
|
||||
void run(const gl::buffer* data, u32 data_length, u32 data_offset = 0)
|
||||
{
|
||||
m_data = data;
|
||||
m_data_offset = data_offset;
|
||||
m_data_length = data_length;
|
||||
|
||||
const auto num_bytes_per_invocation = optimal_group_size * kernel_size * 4;
|
||||
const auto num_bytes_to_process = utils::align(data_length, num_bytes_per_invocation);
|
||||
const auto num_invocations = num_bytes_to_process / num_bytes_per_invocation;
|
||||
|
||||
if ((num_bytes_to_process + data_offset) > data->size())
|
||||
{
|
||||
// Technically robust buffer access should keep the driver from crashing in OOB situations
|
||||
rsx_log.error("Inadequate buffer length submitted for a compute operation."
|
||||
"Required=%d bytes, Available=%d bytes", num_bytes_to_process, data->size());
|
||||
}
|
||||
|
||||
compute_task::run(num_invocations);
|
||||
}
|
||||
void run(const gl::buffer* data, u32 data_length, u32 data_offset = 0);
|
||||
};
|
||||
|
||||
struct cs_shuffle_16 : cs_shuffle_base
|
||||
@ -272,97 +79,22 @@ namespace gl
|
||||
{
|
||||
u32 m_ssbo_length = 0;
|
||||
|
||||
cs_shuffle_d32fx8_to_x8d24f()
|
||||
{
|
||||
uniforms = "uniform uint in_ptr, out_ptr;\n";
|
||||
cs_shuffle_d32fx8_to_x8d24f();
|
||||
|
||||
variables =
|
||||
" uint in_offset = in_ptr >> 2;\n"
|
||||
" uint out_offset = out_ptr >> 2;\n"
|
||||
" uint depth, stencil;\n";
|
||||
void bind_resources() override;
|
||||
|
||||
work_kernel =
|
||||
" depth = data[index * 2 + in_offset];\n"
|
||||
" stencil = data[index * 2 + (in_offset + 1)] & 0xFFu;\n"
|
||||
" value = f32_to_d24f(depth) << 8;\n"
|
||||
" value |= stencil;\n"
|
||||
" data[index + out_ptr] = bswap_u32(value);\n";
|
||||
|
||||
cs_shuffle_base::build("");
|
||||
}
|
||||
|
||||
void bind_resources() override
|
||||
{
|
||||
m_data->bind_range(gl::buffer::target::ssbo, GL_COMPUTE_BUFFER_SLOT(0), m_data_offset, m_ssbo_length);
|
||||
}
|
||||
|
||||
void run(const gl::buffer* data, u32 src_offset, u32 dst_offset, u32 num_texels)
|
||||
{
|
||||
u32 data_offset;
|
||||
if (src_offset > dst_offset)
|
||||
{
|
||||
data_offset = dst_offset;
|
||||
m_ssbo_length = (src_offset + num_texels * 8) - data_offset;
|
||||
}
|
||||
else
|
||||
{
|
||||
data_offset = src_offset;
|
||||
m_ssbo_length = (dst_offset + num_texels * 4) - data_offset;
|
||||
}
|
||||
|
||||
m_program.uniforms["in_ptr"] = src_offset - data_offset;
|
||||
m_program.uniforms["out_ptr"] = dst_offset - data_offset;
|
||||
cs_shuffle_base::run(data, num_texels * 4, data_offset);
|
||||
}
|
||||
void run(const gl::buffer* data, u32 src_offset, u32 dst_offset, u32 num_texels);
|
||||
};
|
||||
|
||||
struct cs_shuffle_x8d24f_to_d32fx8 : cs_shuffle_base
|
||||
{
|
||||
u32 m_ssbo_length = 0;
|
||||
|
||||
cs_shuffle_x8d24f_to_d32fx8()
|
||||
{
|
||||
uniforms = "uniform uint texel_count, in_ptr, out_ptr;\n";
|
||||
cs_shuffle_x8d24f_to_d32fx8();
|
||||
|
||||
variables =
|
||||
" uint in_offset = in_ptr >> 2;\n"
|
||||
" uint out_offset = out_ptr >> 2;\n"
|
||||
" uint depth, stencil;\n";
|
||||
void bind_resources() override;
|
||||
|
||||
work_kernel =
|
||||
" value = data[index + in_offset];\n"
|
||||
" value = bswap_u32(value);\n"
|
||||
" stencil = (value & 0xFFu);\n"
|
||||
" depth = (value >> 8);\n"
|
||||
" data[index * 2 + out_offset] = d24f_to_f32(depth);\n"
|
||||
" data[index * 2 + (out_offset + 1)] = stencil;\n";
|
||||
|
||||
cs_shuffle_base::build("");
|
||||
}
|
||||
|
||||
void bind_resources() override
|
||||
{
|
||||
m_data->bind_range(gl::buffer::target::ssbo, GL_COMPUTE_BUFFER_SLOT(0), m_data_offset, m_ssbo_length);
|
||||
}
|
||||
|
||||
void run(const gl::buffer* data, u32 src_offset, u32 dst_offset, u32 num_texels)
|
||||
{
|
||||
u32 data_offset;
|
||||
if (src_offset > dst_offset)
|
||||
{
|
||||
data_offset = dst_offset;
|
||||
m_ssbo_length = (src_offset + num_texels * 4) - data_offset;
|
||||
}
|
||||
else
|
||||
{
|
||||
data_offset = src_offset;
|
||||
m_ssbo_length = (dst_offset + num_texels * 8) - data_offset;
|
||||
}
|
||||
|
||||
m_program.uniforms["in_ptr"] = src_offset - data_offset;
|
||||
m_program.uniforms["out_ptr"] = dst_offset - data_offset;
|
||||
cs_shuffle_base::run(data, num_texels * 4, data_offset);
|
||||
}
|
||||
void run(const gl::buffer* data, u32 src_offset, u32 dst_offset, u32 num_texels);
|
||||
};
|
||||
|
||||
|
||||
|
@ -1,5 +1,6 @@
|
||||
#pragma once
|
||||
|
||||
#include "util/logs.hpp"
|
||||
#include "util/types.hpp"
|
||||
#include "Utilities/geometry.h"
|
||||
#include "OpenGL.h"
|
||||
|
648
rpcs3/Emu/RSX/GL/GLOverlays.cpp
Normal file
648
rpcs3/Emu/RSX/GL/GLOverlays.cpp
Normal file
@ -0,0 +1,648 @@
|
||||
#include "GLOverlays.h"
|
||||
|
||||
extern u64 get_system_time();
|
||||
|
||||
namespace gl
|
||||
{
|
||||
void overlay_pass::create()
|
||||
{
|
||||
if (!compiled)
|
||||
{
|
||||
fs.create(::glsl::program_domain::glsl_fragment_program, fs_src);
|
||||
fs.compile();
|
||||
|
||||
vs.create(::glsl::program_domain::glsl_vertex_program, vs_src);
|
||||
vs.compile();
|
||||
|
||||
program_handle.create();
|
||||
program_handle.attach(vs);
|
||||
program_handle.attach(fs);
|
||||
program_handle.link();
|
||||
|
||||
fbo.create();
|
||||
|
||||
m_sampler.create();
|
||||
m_sampler.apply_defaults(input_filter);
|
||||
|
||||
m_vertex_data_buffer.create();
|
||||
|
||||
int old_vao;
|
||||
glGetIntegerv(GL_VERTEX_ARRAY_BINDING, &old_vao);
|
||||
|
||||
m_vao.create();
|
||||
m_vao.bind();
|
||||
|
||||
m_vao.array_buffer = m_vertex_data_buffer;
|
||||
auto ptr = buffer_pointer(&m_vao);
|
||||
m_vao[0] = ptr;
|
||||
|
||||
glBindVertexArray(old_vao);
|
||||
|
||||
compiled = true;
|
||||
}
|
||||
}
|
||||
|
||||
void overlay_pass::destroy()
|
||||
{
|
||||
if (compiled)
|
||||
{
|
||||
program_handle.remove();
|
||||
vs.remove();
|
||||
fs.remove();
|
||||
|
||||
fbo.remove();
|
||||
m_vao.remove();
|
||||
m_vertex_data_buffer.remove();
|
||||
|
||||
m_sampler.remove();
|
||||
|
||||
compiled = false;
|
||||
}
|
||||
}
|
||||
|
||||
void overlay_pass::emit_geometry()
|
||||
{
|
||||
int old_vao;
|
||||
glGetIntegerv(GL_VERTEX_ARRAY_BINDING, &old_vao);
|
||||
|
||||
m_vao.bind();
|
||||
glDrawArrays(primitives, 0, num_drawable_elements);
|
||||
|
||||
glBindVertexArray(old_vao);
|
||||
}
|
||||
|
||||
void overlay_pass::run(const areau& region, GLuint target_texture, bool depth_target, bool use_blending)
|
||||
{
|
||||
if (!compiled)
|
||||
{
|
||||
rsx_log.error("You must initialize overlay passes with create() before calling run()");
|
||||
return;
|
||||
}
|
||||
|
||||
GLint program;
|
||||
GLint old_fbo;
|
||||
GLint depth_func;
|
||||
GLint viewport[4];
|
||||
GLboolean color_writes[4];
|
||||
GLboolean depth_write;
|
||||
|
||||
GLint blend_src_rgb;
|
||||
GLint blend_src_a;
|
||||
GLint blend_dst_rgb;
|
||||
GLint blend_dst_a;
|
||||
GLint blend_eq_a;
|
||||
GLint blend_eq_rgb;
|
||||
|
||||
if (target_texture)
|
||||
{
|
||||
glGetIntegerv(GL_FRAMEBUFFER_BINDING, &old_fbo);
|
||||
glBindFramebuffer(GL_FRAMEBUFFER, fbo.id());
|
||||
|
||||
if (depth_target)
|
||||
{
|
||||
glFramebufferTexture2D(GL_FRAMEBUFFER, GL_DEPTH_ATTACHMENT, GL_TEXTURE_2D, target_texture, 0);
|
||||
glDrawBuffer(GL_NONE);
|
||||
}
|
||||
else
|
||||
{
|
||||
GLenum buffer = GL_COLOR_ATTACHMENT0;
|
||||
glFramebufferTexture2D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, target_texture, 0);
|
||||
glDrawBuffers(1, &buffer);
|
||||
}
|
||||
}
|
||||
|
||||
if (!target_texture || glCheckFramebufferStatus(GL_FRAMEBUFFER) == GL_FRAMEBUFFER_COMPLETE)
|
||||
{
|
||||
// Push rasterizer state
|
||||
glGetIntegerv(GL_VIEWPORT, viewport);
|
||||
glGetBooleanv(GL_COLOR_WRITEMASK, color_writes);
|
||||
glGetBooleanv(GL_DEPTH_WRITEMASK, &depth_write);
|
||||
glGetIntegerv(GL_CURRENT_PROGRAM, &program);
|
||||
glGetIntegerv(GL_DEPTH_FUNC, &depth_func);
|
||||
|
||||
GLboolean scissor_enabled = glIsEnabled(GL_SCISSOR_TEST);
|
||||
GLboolean depth_test_enabled = glIsEnabled(GL_DEPTH_TEST);
|
||||
GLboolean cull_face_enabled = glIsEnabled(GL_CULL_FACE);
|
||||
GLboolean blend_enabled = glIsEnabledi(GL_BLEND, 0);
|
||||
GLboolean stencil_test_enabled = glIsEnabled(GL_STENCIL_TEST);
|
||||
|
||||
if (use_blending)
|
||||
{
|
||||
glGetIntegerv(GL_BLEND_SRC_RGB, &blend_src_rgb);
|
||||
glGetIntegerv(GL_BLEND_SRC_ALPHA, &blend_src_a);
|
||||
glGetIntegerv(GL_BLEND_DST_RGB, &blend_dst_rgb);
|
||||
glGetIntegerv(GL_BLEND_DST_ALPHA, &blend_dst_a);
|
||||
glGetIntegerv(GL_BLEND_EQUATION_RGB, &blend_eq_rgb);
|
||||
glGetIntegerv(GL_BLEND_EQUATION_ALPHA, &blend_eq_a);
|
||||
}
|
||||
|
||||
// Set initial state
|
||||
glViewport(region.x1, region.y1, region.width(), region.height());
|
||||
glColorMask(GL_TRUE, GL_TRUE, GL_TRUE, GL_TRUE);
|
||||
glDepthMask(depth_target ? GL_TRUE : GL_FALSE);
|
||||
|
||||
// Disabling depth test will also disable depth writes which is not desired
|
||||
glDepthFunc(GL_ALWAYS);
|
||||
glEnable(GL_DEPTH_TEST);
|
||||
|
||||
if (scissor_enabled) glDisable(GL_SCISSOR_TEST);
|
||||
if (cull_face_enabled) glDisable(GL_CULL_FACE);
|
||||
if (stencil_test_enabled) glDisable(GL_STENCIL_TEST);
|
||||
|
||||
if (use_blending)
|
||||
{
|
||||
if (!blend_enabled)
|
||||
glEnablei(GL_BLEND, 0);
|
||||
|
||||
glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA);
|
||||
glBlendEquation(GL_FUNC_ADD);
|
||||
}
|
||||
else if (blend_enabled)
|
||||
{
|
||||
glDisablei(GL_BLEND, 0);
|
||||
}
|
||||
|
||||
// Render
|
||||
program_handle.use();
|
||||
on_load();
|
||||
bind_resources();
|
||||
emit_geometry();
|
||||
|
||||
// Clean up
|
||||
if (target_texture)
|
||||
{
|
||||
if (depth_target)
|
||||
glFramebufferTexture2D(GL_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, GL_TEXTURE_2D, 0, 0);
|
||||
else
|
||||
glFramebufferTexture2D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, 0, 0);
|
||||
|
||||
glBindFramebuffer(GL_FRAMEBUFFER, old_fbo);
|
||||
}
|
||||
|
||||
glUseProgram(program);
|
||||
|
||||
glViewport(viewport[0], viewport[1], viewport[2], viewport[3]);
|
||||
glColorMask(color_writes[0], color_writes[1], color_writes[2], color_writes[3]);
|
||||
glDepthMask(depth_write);
|
||||
glDepthFunc(depth_func);
|
||||
|
||||
if (!depth_test_enabled) glDisable(GL_DEPTH_TEST);
|
||||
if (scissor_enabled) glEnable(GL_SCISSOR_TEST);
|
||||
if (cull_face_enabled) glEnable(GL_CULL_FACE);
|
||||
if (stencil_test_enabled) glEnable(GL_STENCIL_TEST);
|
||||
|
||||
if (use_blending)
|
||||
{
|
||||
if (!blend_enabled)
|
||||
glDisablei(GL_BLEND, 0);
|
||||
|
||||
glBlendFuncSeparate(blend_src_rgb, blend_dst_rgb, blend_src_a, blend_dst_a);
|
||||
glBlendEquationSeparate(blend_eq_rgb, blend_eq_a);
|
||||
}
|
||||
else if (blend_enabled)
|
||||
{
|
||||
glEnablei(GL_BLEND, 0);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
rsx_log.error("Overlay pass failed because framebuffer was not complete. Run with debug output enabled to diagnose the problem");
|
||||
}
|
||||
}
|
||||
|
||||
// Supplies the GLSL sources for the UI overlay pass and selects the input filter.
ui_overlay_renderer::ui_overlay_renderer()
{
	// Smooth filtering required for inputs
	input_filter = GL_LINEAR;

	// Fragment stage: optional clip rectangle test, optional pulsing alpha, then
	// one of three color sources chosen by sampler_mode
	// (1: blurred 2D image, 2: array texture layer, otherwise flat color)
	fs_src =
		"#version 420\n\n"
		"layout(binding=31) uniform sampler2D fs0;\n"
		"layout(binding=30) uniform sampler2DArray fs1;\n"
		"layout(location=0) in vec2 tc0;\n"
		"layout(location=1) flat in vec4 clip_rect;\n"
		"layout(location=0) out vec4 ocol;\n"
		"uniform vec4 color;\n"
		"uniform float time;\n"
		"uniform int sampler_mode;\n"
		"uniform int pulse_glow;\n"
		"uniform int clip_region;\n"
		"uniform int blur_strength;\n"
		"\n"
		"vec4 blur_sample(sampler2D tex, vec2 coord, vec2 tex_offset)\n"
		"{\n"
		" vec2 coords[9];\n"
		" coords[0] = coord - tex_offset\n;"
		" coords[1] = coord + vec2(0., -tex_offset.y);\n"
		" coords[2] = coord + vec2(tex_offset.x, -tex_offset.y);\n"
		" coords[3] = coord + vec2(-tex_offset.x, 0.);\n"
		" coords[4] = coord;\n"
		" coords[5] = coord + vec2(tex_offset.x, 0.);\n"
		" coords[6] = coord + vec2(-tex_offset.x, tex_offset.y);\n"
		" coords[7] = coord + vec2(0., tex_offset.y);\n"
		" coords[8] = coord + tex_offset;\n"
		"\n"
		" float weights[9] =\n"
		" {\n"
		" 1., 2., 1.,\n"
		" 2., 4., 2.,\n"
		" 1., 2., 1.\n"
		" };\n"
		"\n"
		" vec4 blurred = vec4(0.);\n"
		" for (int n = 0; n < 9; ++n)\n"
		" {\n"
		" blurred += texture(tex, coords[n]) * weights[n];\n"
		" }\n"
		"\n"
		" return blurred / 16.f;\n"
		"}\n"
		"\n"
		"vec4 sample_image(sampler2D tex, vec2 coord)\n"
		"{\n"
		" vec4 original = texture(tex, coord);\n"
		" if (blur_strength == 0) return original;\n"
		" \n"
		" vec2 constraints = 1.f / vec2(640, 360);\n"
		" vec2 res_offset = 1.f / textureSize(fs0, 0);\n"
		" vec2 tex_offset = max(res_offset, constraints);\n"
		"\n"
		" // Sample triangle pattern and average\n"
		" // TODO: Nicer looking gaussian blur with less sampling\n"
		" vec4 blur0 = blur_sample(tex, coord + vec2(-res_offset.x, 0.), tex_offset);\n"
		" vec4 blur1 = blur_sample(tex, coord + vec2(res_offset.x, 0.), tex_offset);\n"
		" vec4 blur2 = blur_sample(tex, coord + vec2(0., res_offset.y), tex_offset);\n"
		"\n"
		" vec4 blurred = blur0 + blur1 + blur2;\n"
		" blurred /= 3.;\n"
		" return mix(original, blurred, float(blur_strength) / 100.);\n"
		"}\n"
		"\n"
		"void main()\n"
		"{\n"
		" if (clip_region != 0)\n"
		" {"
		" if (gl_FragCoord.x < clip_rect.x || gl_FragCoord.x > clip_rect.z ||\n"
		" gl_FragCoord.y < clip_rect.y || gl_FragCoord.y > clip_rect.w)\n"
		" {\n"
		" discard;\n"
		" return;\n"
		" }\n"
		" }\n"
		"\n"
		" vec4 diff_color = color;\n"
		" if (pulse_glow != 0)\n"
		" diff_color.a *= (sin(time) + 1.f) * 0.5f;\n"
		"\n"
		" switch (sampler_mode)\n"
		" {\n"
		" case 1:\n"
		" ocol = sample_image(fs0, tc0) * diff_color;\n"
		" break;\n"
		" case 2:\n"
		" ocol = texture(fs1, vec3(tc0.x, fract(tc0.y), trunc(tc0.y))) * diff_color;\n"
		" break;\n"
		" default:\n"
		" ocol = diff_color;\n"
		" break;\n"
		" }\n"
		"}\n";

	// Vertex stage: scales in_pos.xy by ui_scale, snaps it to the viewport pixel grid
	// and forwards in_pos.zw as the texture coordinate
	vs_src =
		"#version 420\n\n"
		"layout(location=0) in vec4 in_pos;\n"
		"layout(location=0) out vec2 tc0;\n"
		"layout(location=1) flat out vec4 clip_rect;\n"
		"uniform vec4 ui_scale;\n"
		"uniform vec4 viewport;\n"
		"uniform vec4 clip_bounds;\n"
		"\n"
		"vec2 snap_to_grid(vec2 normalized)\n"
		"{\n"
		" return (floor(normalized * viewport.xy) + 0.5) / viewport.xy;\n"
		"}\n"
		"\n"
		"vec4 clip_to_ndc(const in vec4 coord)\n"
		"{\n"
		" vec4 ret = (coord * ui_scale.zwzw) / ui_scale.xyxy;\n"
		" ret.yw = 1. - ret.yw;\n"
		" return ret;\n"
		"}\n"
		"\n"
		"vec4 ndc_to_window(const in vec4 coord)\n"
		"{\n"
		" return fma(coord, viewport.xyxy, viewport.zwzw);\n"
		"}\n"
		"\n"
		"void main()\n"
		"{\n"
		" tc0.xy = in_pos.zw;\n"
		" clip_rect = ndc_to_window(clip_to_ndc(clip_bounds)).xwzy; // Swap y1 and y2 due to flipped origin!\n"
		" vec4 pos = vec4(clip_to_ndc(in_pos).xy, 0.5, 1.);\n"
		" pos.xy = snap_to_grid(pos.xy);\n"
		" gl_Position = (pos + pos) - 1.;\n"
		"}\n";
}
|
||||
|
||||
// Creates a GL_RGBA8 2D texture from the pixel data in `desc`, wraps it in a
// swizzled texture view and stores both in the renderer's caches.
//
// desc          - image descriptor; its w/h/data fields are read here.
// temp_resource - when true the texture/view go into the temporary caches, keyed by the
//                 address of `desc` and tagged with `owner_uid`; otherwise they are
//                 appended to the permanent `resources` / `view_cache` containers.
// owner_uid     - owner tag stored alongside temporary textures.
//
// Returns a non-owning pointer to the new view; ownership stays with the caches.
gl::texture_view* ui_overlay_renderer::load_simple_image(rsx::overlays::image_info* desc, bool temp_resource, u32 owner_uid)
{
	auto image = std::make_unique<gl::texture>(GL_TEXTURE_2D, desc->w, desc->h, 1, 1, GL_RGBA8);
	image->copy_from(desc->data, gl::texture::format::rgba, gl::texture::type::uint_8_8_8_8, {});

	GLenum swizzle[] = { GL_RED, GL_ALPHA, GL_BLUE, GL_GREEN };
	auto image_view = std::make_unique<gl::texture_view>(image.get(), swizzle);

	// Take the raw pointer now; the unique_ptr is moved into a cache below
	gl::texture_view* const handle = image_view.get();

	if (temp_resource)
	{
		const u64 key = reinterpret_cast<u64>(desc);
		temp_image_cache[key] = std::make_pair(owner_uid, std::move(image));
		temp_view_cache[key] = std::move(image_view);
		return handle;
	}

	// Permanent images are indexed by the number of views cached so far
	resources.push_back(std::move(image));
	view_cache[view_cache.size()] = std::move(image_view);
	return handle;
}
|
||||
|
||||
void ui_overlay_renderer::create()
|
||||
{
|
||||
overlay_pass::create();
|
||||
|
||||
rsx::overlays::resource_config configuration;
|
||||
configuration.load_files();
|
||||
|
||||
for (const auto &res : configuration.texture_raw_data)
|
||||
{
|
||||
load_simple_image(res.get(), false, UINT32_MAX);
|
||||
}
|
||||
|
||||
configuration.free_resources();
|
||||
}
|
||||
|
||||
void ui_overlay_renderer::destroy()
|
||||
{
|
||||
temp_image_cache.clear();
|
||||
resources.clear();
|
||||
font_cache.clear();
|
||||
overlay_pass::destroy();
|
||||
}
|
||||
|
||||
// Removes every temporary image whose stored owner id equals `key`, together with
// the matching entry in the temporary view cache.
void ui_overlay_renderer::remove_temp_resources(u64 key)
{
	for (auto entry = temp_image_cache.begin(); entry != temp_image_cache.end();)
	{
		if (entry->second.first != key)
		{
			++entry;
			continue;
		}

		// Copy the map key first; erasing invalidates `entry`
		const auto image_key = entry->first;
		entry = temp_image_cache.erase(entry);
		temp_view_cache.erase(image_key);
	}
}
|
||||
|
||||
// Returns the texture view holding the glyph data for `font`, building it on demand.
//
// The cache is keyed by the address of `font`. A cached view is reused only when the
// dimensions of its backing image still match the font's current glyph data dimensions;
// otherwise a new GL_R8 2D array texture is uploaded and replaces the cached entries.
// The returned pointer is non-owning.
gl::texture_view* ui_overlay_renderer::find_font(rsx::overlays::font* font)
{
	const auto wanted = font->get_glyph_data_dimensions();
	const u64 key = reinterpret_cast<u64>(font);

	if (const auto cached = view_cache.find(key); cached != view_cache.end())
	{
		const auto current = cached->second->image()->size3D();
		const bool same_dimensions =
			wanted.width == current.width &&
			wanted.height == current.height &&
			wanted.depth == current.depth;

		if (same_dimensions)
		{
			return cached->second.get();
		}
	}

	// No usable cache entry: upload the glyph data into a fresh array texture
	std::vector<u8> glyphs;
	font->get_glyph_data(glyphs);

	auto atlas = std::make_unique<gl::texture>(GL_TEXTURE_2D_ARRAY, wanted.width, wanted.height, wanted.depth, 1, GL_R8);
	atlas->copy_from(glyphs.data(), gl::texture::format::r, gl::texture::type::ubyte, {});

	// Every output channel reads from the single red channel
	GLenum swizzle[] = { GL_RED, GL_RED, GL_RED, GL_RED };
	auto atlas_view = std::make_unique<gl::texture_view>(atlas.get(), swizzle);

	gl::texture_view* const handle = atlas_view.get();
	font_cache[key] = std::move(atlas);
	view_cache[key] = std::move(atlas_view);

	return handle;
}
|
||||
|
||||
// Looks up the temporary view cached for `desc` (keyed by its address) and returns it;
// when none exists the image is loaded as a temporary resource owned by `owner_uid`.
gl::texture_view* ui_overlay_renderer::find_temp_image(rsx::overlays::image_info* desc, u32 owner_uid)
{
	const auto key = reinterpret_cast<u64>(desc);

	if (const auto hit = temp_view_cache.find(key); hit != temp_view_cache.end())
	{
		return hit->second.get();
	}

	return load_simple_image(desc, true, owner_uid);
}
|
||||
|
||||
// Records the overlay primitive type and selects the GL primitive used for drawing.
// quad_list shares GL_TRIANGLE_STRIP with triangle_strip; emit_geometry() treats
// quad_list specially. Any other unknown type raises an exception.
void ui_overlay_renderer::set_primitive_type(rsx::overlays::primitive_type type)
{
	using prim = rsx::overlays::primitive_type;

	m_current_primitive_type = type;

	if (type == prim::quad_list || type == prim::triangle_strip)
	{
		primitives = GL_TRIANGLE_STRIP;
	}
	else if (type == prim::line_list)
	{
		primitives = GL_LINES;
	}
	else if (type == prim::line_strip)
	{
		primitives = GL_LINE_STRIP;
	}
	else
	{
		fmt::throw_exception("Unexpected primitive type %d", static_cast<s32>(type));
	}
}
|
||||
|
||||
void ui_overlay_renderer::emit_geometry()
|
||||
{
|
||||
if (m_current_primitive_type == rsx::overlays::primitive_type::quad_list)
|
||||
{
|
||||
// Emulate quads with disjointed triangle strips
|
||||
int num_quads = num_drawable_elements / 4;
|
||||
std::vector<GLint> firsts;
|
||||
std::vector<GLsizei> counts;
|
||||
|
||||
firsts.resize(num_quads);
|
||||
counts.resize(num_quads);
|
||||
|
||||
for (int n = 0; n < num_quads; ++n)
|
||||
{
|
||||
firsts[n] = (n * 4);
|
||||
counts[n] = 4;
|
||||
}
|
||||
|
||||
int old_vao;
|
||||
glGetIntegerv(GL_VERTEX_ARRAY_BINDING, &old_vao);
|
||||
|
||||
m_vao.bind();
|
||||
glMultiDrawArrays(GL_TRIANGLE_STRIP, firsts.data(), counts.data(), num_quads);
|
||||
|
||||
glBindVertexArray(old_vao);
|
||||
}
|
||||
else
|
||||
{
|
||||
overlay_pass::emit_geometry();
|
||||
}
|
||||
}
|
||||
|
||||
void ui_overlay_renderer::run(const areau& viewport, GLuint target, rsx::overlays::overlay& ui)
|
||||
{
|
||||
program_handle.uniforms["viewport"] = color4f(static_cast<f32>(viewport.width()), static_cast<f32>(viewport.height()), static_cast<f32>(viewport.x1), static_cast<f32>(viewport.y1));
|
||||
program_handle.uniforms["ui_scale"] = color4f(static_cast<f32>(ui.virtual_width), static_cast<f32>(ui.virtual_height), 1.f, 1.f);
|
||||
program_handle.uniforms["time"] = static_cast<f32>(get_system_time() / 1000) * 0.005f;
|
||||
|
||||
saved_sampler_state save_30(30, m_sampler);
|
||||
saved_sampler_state save_31(31, m_sampler);
|
||||
|
||||
for (auto &cmd : ui.get_compiled().draw_commands)
|
||||
{
|
||||
set_primitive_type(cmd.config.primitives);
|
||||
upload_vertex_data(cmd.verts.data(), ::size32(cmd.verts));
|
||||
num_drawable_elements = ::size32(cmd.verts);
|
||||
GLint texture_read = GL_TRUE;
|
||||
|
||||
switch (cmd.config.texture_ref)
|
||||
{
|
||||
case rsx::overlays::image_resource_id::game_icon:
|
||||
case rsx::overlays::image_resource_id::backbuffer:
|
||||
//TODO
|
||||
case rsx::overlays::image_resource_id::none:
|
||||
{
|
||||
texture_read = GL_FALSE;
|
||||
glBindTexture(GL_TEXTURE_2D, GL_NONE);
|
||||
break;
|
||||
}
|
||||
case rsx::overlays::image_resource_id::raw_image:
|
||||
{
|
||||
glBindTexture(GL_TEXTURE_2D, find_temp_image(static_cast<rsx::overlays::image_info*>(cmd.config.external_data_ref), ui.uid)->id());
|
||||
break;
|
||||
}
|
||||
case rsx::overlays::image_resource_id::font_file:
|
||||
{
|
||||
texture_read = (GL_TRUE + 1);
|
||||
glActiveTexture(GL_TEXTURE0 + 30);
|
||||
glBindTexture(GL_TEXTURE_2D_ARRAY, find_font(cmd.config.font_ref)->id());
|
||||
glActiveTexture(GL_TEXTURE0 + 31);
|
||||
break;
|
||||
}
|
||||
default:
|
||||
{
|
||||
glBindTexture(GL_TEXTURE_2D, view_cache[cmd.config.texture_ref - 1]->id());
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
program_handle.uniforms["color"] = cmd.config.color;
|
||||
program_handle.uniforms["sampler_mode"] = texture_read;
|
||||
program_handle.uniforms["pulse_glow"] = static_cast<s32>(cmd.config.pulse_glow);
|
||||
program_handle.uniforms["blur_strength"] = static_cast<s32>(cmd.config.blur_strength);
|
||||
program_handle.uniforms["clip_region"] = static_cast<s32>(cmd.config.clip_region);
|
||||
program_handle.uniforms["clip_bounds"] = cmd.config.clip_rect;
|
||||
overlay_pass::run(viewport, target, false, true);
|
||||
}
|
||||
|
||||
ui.update();
|
||||
}
|
||||
|
||||
// Sets up the shader sources for the video-out calibration pass.
// The vertex shader generates its positions and texture coordinates from gl_VertexID
// (no vertex inputs). The fragment shader reads the source image(s), optionally merges
// a stereo pair, applies the gamma exponent and optionally rescales the output range.
video_out_calibration_pass::video_out_calibration_pass()
{
	// Inputs are sampled with linear filtering
	input_filter = GL_LINEAR;

	vs_src =
		"#version 420\n\n"
		"layout(location=0) out vec2 tc0;\n"
		"\n"
		"void main()\n"
		"{\n"
		" vec2 positions[] = {vec2(-1., -1.), vec2(1., -1.), vec2(-1., 1.), vec2(1., 1.)};\n"
		" vec2 coords[] = {vec2(0., 1.), vec2(1., 1.), vec2(0., 0.), vec2(1., 0.)};\n"
		" tc0 = coords[gl_VertexID % 4];\n"
		" vec2 pos = positions[gl_VertexID % 4];\n"
		" gl_Position = vec4(pos, 0., 1.);\n"
		"}\n";

	fs_src =
		"#version 420\n\n"
		"layout(binding=31) uniform sampler2D fs0;\n"
		"layout(binding=30) uniform sampler2D fs1;\n"
		"layout(location=0) in vec2 tc0;\n"
		"layout(location=0) out vec4 ocol;\n"
		"\n"
		"uniform float gamma;\n"
		"uniform int limit_range;\n"
		"uniform int stereo;\n"
		"uniform int stereo_image_count;\n"
		"\n"
		"vec4 read_source()\n"
		"{\n"
		" if (stereo == 0) return texture(fs0, tc0);\n"
		"\n"
		" vec4 left, right;\n"
		" if (stereo_image_count == 2)\n"
		" {\n"
		" left = texture(fs0, tc0);\n"
		" right = texture(fs1, tc0);\n"
		" }\n"
		" else\n"
		" {\n"
		" vec2 coord_left = tc0 * vec2(1.f, 0.4898f);\n"
		" vec2 coord_right = coord_left + vec2(0.f, 0.510204f);\n"
		" left = texture(fs0, coord_left);\n"
		" right = texture(fs0, coord_right);\n"
		" }\n"
		"\n"
		" return vec4(left.r, right.g, right.b, 1.);\n"
		"}\n"
		"\n"
		"void main()\n"
		"{\n"
		" vec4 color = read_source();\n"
		" color.rgb = pow(color.rgb, vec3(gamma));\n"
		" if (limit_range > 0)\n"
		" ocol = ((color * 220.) + 16.) / 255.;\n"
		" else\n"
		" ocol = color;\n"
		"}\n";
}
|
||||
|
||||
// Draws the calibration pass over `viewport` with no target texture and no blending.
//
// source      - GL texture names; source[0] is bound for sampler unit 31 and source[1]
//               for unit 30. Both elements are read, so at least two entries are required.
// gamma       - exponent forwarded to the shader's "gamma" uniform.
// limited_rgb - forwarded as "limit_range" (0 or 1).
// _3d         - forwarded as "stereo" (0 or 1).
void video_out_calibration_pass::run(const areau& viewport, const rsx::simple_array<GLuint>& source, f32 gamma, bool limited_rgb, bool _3d)
{
	program_handle.uniforms["gamma"] = gamma;
	program_handle.uniforms["limit_range"] = (limited_rgb ? 1 : 0);
	program_handle.uniforms["stereo"] = (_3d ? 1 : 0);
	// A second image counts only when its texture name is non-zero
	program_handle.uniforms["stereo_image_count"] = (source[1] != GL_NONE ? 2 : 1);

	// NOTE(review): each bind below presumably lands on the unit named by the guard
	// constructed just before it - confirm against saved_sampler_state.
	saved_sampler_state unit31_guard(31, m_sampler);
	glBindTexture(GL_TEXTURE_2D, source[0]);

	saved_sampler_state unit30_guard(30, m_sampler);
	glBindTexture(GL_TEXTURE_2D, source[1]);

	overlay_pass::run(viewport, GL_NONE, false, false);
}
|
||||
}
|
@ -1,13 +1,12 @@
|
||||
#pragma once
|
||||
|
||||
#include "util/types.hpp"
|
||||
#include "GLHelpers.h"
|
||||
#include "../Overlays/overlays.h"
|
||||
#include "GLTexture.h"
|
||||
#include "Emu/RSX/rsx_utils.h"
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
|
||||
extern u64 get_system_time();
|
||||
|
||||
namespace gl
|
||||
{
|
||||
struct overlay_pass
|
||||
@ -53,61 +52,8 @@ namespace gl
|
||||
}
|
||||
};
|
||||
|
||||
void create()
|
||||
{
|
||||
if (!compiled)
|
||||
{
|
||||
fs.create(::glsl::program_domain::glsl_fragment_program, fs_src);
|
||||
fs.compile();
|
||||
|
||||
vs.create(::glsl::program_domain::glsl_vertex_program, vs_src);
|
||||
vs.compile();
|
||||
|
||||
program_handle.create();
|
||||
program_handle.attach(vs);
|
||||
program_handle.attach(fs);
|
||||
program_handle.link();
|
||||
|
||||
fbo.create();
|
||||
|
||||
m_sampler.create();
|
||||
m_sampler.apply_defaults(input_filter);
|
||||
|
||||
m_vertex_data_buffer.create();
|
||||
|
||||
int old_vao;
|
||||
glGetIntegerv(GL_VERTEX_ARRAY_BINDING, &old_vao);
|
||||
|
||||
m_vao.create();
|
||||
m_vao.bind();
|
||||
|
||||
m_vao.array_buffer = m_vertex_data_buffer;
|
||||
auto ptr = buffer_pointer(&m_vao);
|
||||
m_vao[0] = ptr;
|
||||
|
||||
glBindVertexArray(old_vao);
|
||||
|
||||
compiled = true;
|
||||
}
|
||||
}
|
||||
|
||||
void destroy()
|
||||
{
|
||||
if (compiled)
|
||||
{
|
||||
program_handle.remove();
|
||||
vs.remove();
|
||||
fs.remove();
|
||||
|
||||
fbo.remove();
|
||||
m_vao.remove();
|
||||
m_vertex_data_buffer.remove();
|
||||
|
||||
m_sampler.remove();
|
||||
|
||||
compiled = false;
|
||||
}
|
||||
}
|
||||
void create();
|
||||
void destroy();
|
||||
|
||||
virtual void on_load() {}
|
||||
virtual void on_unload() {}
|
||||
@ -121,155 +67,9 @@ namespace gl
|
||||
m_vertex_data_buffer.data(elements_count * sizeof(T), data);
|
||||
}
|
||||
|
||||
virtual void emit_geometry()
|
||||
{
|
||||
int old_vao;
|
||||
glGetIntegerv(GL_VERTEX_ARRAY_BINDING, &old_vao);
|
||||
virtual void emit_geometry();
|
||||
|
||||
m_vao.bind();
|
||||
glDrawArrays(primitives, 0, num_drawable_elements);
|
||||
|
||||
glBindVertexArray(old_vao);
|
||||
}
|
||||
|
||||
void run(const areau& region, GLuint target_texture, bool depth_target, bool use_blending = false)
|
||||
{
|
||||
if (!compiled)
|
||||
{
|
||||
rsx_log.error("You must initialize overlay passes with create() before calling run()");
|
||||
return;
|
||||
}
|
||||
|
||||
GLint program;
|
||||
GLint old_fbo;
|
||||
GLint depth_func;
|
||||
GLint viewport[4];
|
||||
GLboolean color_writes[4];
|
||||
GLboolean depth_write;
|
||||
|
||||
GLint blend_src_rgb;
|
||||
GLint blend_src_a;
|
||||
GLint blend_dst_rgb;
|
||||
GLint blend_dst_a;
|
||||
GLint blend_eq_a;
|
||||
GLint blend_eq_rgb;
|
||||
|
||||
if (target_texture)
|
||||
{
|
||||
glGetIntegerv(GL_FRAMEBUFFER_BINDING, &old_fbo);
|
||||
glBindFramebuffer(GL_FRAMEBUFFER, fbo.id());
|
||||
|
||||
if (depth_target)
|
||||
{
|
||||
glFramebufferTexture2D(GL_FRAMEBUFFER, GL_DEPTH_ATTACHMENT, GL_TEXTURE_2D, target_texture, 0);
|
||||
glDrawBuffer(GL_NONE);
|
||||
}
|
||||
else
|
||||
{
|
||||
GLenum buffer = GL_COLOR_ATTACHMENT0;
|
||||
glFramebufferTexture2D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, target_texture, 0);
|
||||
glDrawBuffers(1, &buffer);
|
||||
}
|
||||
}
|
||||
|
||||
if (!target_texture || glCheckFramebufferStatus(GL_FRAMEBUFFER) == GL_FRAMEBUFFER_COMPLETE)
|
||||
{
|
||||
// Push rasterizer state
|
||||
glGetIntegerv(GL_VIEWPORT, viewport);
|
||||
glGetBooleanv(GL_COLOR_WRITEMASK, color_writes);
|
||||
glGetBooleanv(GL_DEPTH_WRITEMASK, &depth_write);
|
||||
glGetIntegerv(GL_CURRENT_PROGRAM, &program);
|
||||
glGetIntegerv(GL_DEPTH_FUNC, &depth_func);
|
||||
|
||||
GLboolean scissor_enabled = glIsEnabled(GL_SCISSOR_TEST);
|
||||
GLboolean depth_test_enabled = glIsEnabled(GL_DEPTH_TEST);
|
||||
GLboolean cull_face_enabled = glIsEnabled(GL_CULL_FACE);
|
||||
GLboolean blend_enabled = glIsEnabledi(GL_BLEND, 0);
|
||||
GLboolean stencil_test_enabled = glIsEnabled(GL_STENCIL_TEST);
|
||||
|
||||
if (use_blending)
|
||||
{
|
||||
glGetIntegerv(GL_BLEND_SRC_RGB, &blend_src_rgb);
|
||||
glGetIntegerv(GL_BLEND_SRC_ALPHA, &blend_src_a);
|
||||
glGetIntegerv(GL_BLEND_DST_RGB, &blend_dst_rgb);
|
||||
glGetIntegerv(GL_BLEND_DST_ALPHA, &blend_dst_a);
|
||||
glGetIntegerv(GL_BLEND_EQUATION_RGB, &blend_eq_rgb);
|
||||
glGetIntegerv(GL_BLEND_EQUATION_ALPHA, &blend_eq_a);
|
||||
}
|
||||
|
||||
// Set initial state
|
||||
glViewport(region.x1, region.y1, region.width(), region.height());
|
||||
glColorMask(GL_TRUE, GL_TRUE, GL_TRUE, GL_TRUE);
|
||||
glDepthMask(depth_target ? GL_TRUE : GL_FALSE);
|
||||
|
||||
// Disabling depth test will also disable depth writes which is not desired
|
||||
glDepthFunc(GL_ALWAYS);
|
||||
glEnable(GL_DEPTH_TEST);
|
||||
|
||||
if (scissor_enabled) glDisable(GL_SCISSOR_TEST);
|
||||
if (cull_face_enabled) glDisable(GL_CULL_FACE);
|
||||
if (stencil_test_enabled) glDisable(GL_STENCIL_TEST);
|
||||
|
||||
if (use_blending)
|
||||
{
|
||||
if (!blend_enabled)
|
||||
glEnablei(GL_BLEND, 0);
|
||||
|
||||
glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA);
|
||||
glBlendEquation(GL_FUNC_ADD);
|
||||
}
|
||||
else if (blend_enabled)
|
||||
{
|
||||
glDisablei(GL_BLEND, 0);
|
||||
}
|
||||
|
||||
// Render
|
||||
program_handle.use();
|
||||
on_load();
|
||||
bind_resources();
|
||||
emit_geometry();
|
||||
|
||||
// Clean up
|
||||
if (target_texture)
|
||||
{
|
||||
if (depth_target)
|
||||
glFramebufferTexture2D(GL_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, GL_TEXTURE_2D, 0, 0);
|
||||
else
|
||||
glFramebufferTexture2D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, 0, 0);
|
||||
|
||||
glBindFramebuffer(GL_FRAMEBUFFER, old_fbo);
|
||||
}
|
||||
|
||||
glUseProgram(program);
|
||||
|
||||
glViewport(viewport[0], viewport[1], viewport[2], viewport[3]);
|
||||
glColorMask(color_writes[0], color_writes[1], color_writes[2], color_writes[3]);
|
||||
glDepthMask(depth_write);
|
||||
glDepthFunc(depth_func);
|
||||
|
||||
if (!depth_test_enabled) glDisable(GL_DEPTH_TEST);
|
||||
if (scissor_enabled) glEnable(GL_SCISSOR_TEST);
|
||||
if (cull_face_enabled) glEnable(GL_CULL_FACE);
|
||||
if (stencil_test_enabled) glEnable(GL_STENCIL_TEST);
|
||||
|
||||
if (use_blending)
|
||||
{
|
||||
if (!blend_enabled)
|
||||
glDisablei(GL_BLEND, 0);
|
||||
|
||||
glBlendFuncSeparate(blend_src_rgb, blend_dst_rgb, blend_src_a, blend_dst_a);
|
||||
glBlendEquationSeparate(blend_eq_rgb, blend_eq_a);
|
||||
}
|
||||
else if (blend_enabled)
|
||||
{
|
||||
glEnablei(GL_BLEND, 0);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
rsx_log.error("Overlay pass failed because framebuffer was not complete. Run with debug output enabled to diagnose the problem");
|
||||
}
|
||||
}
|
||||
void run(const areau& region, GLuint target_texture, bool depth_target, bool use_blending = false);
|
||||
};
|
||||
|
||||
struct ui_overlay_renderer : public overlay_pass
|
||||
@ -282,443 +82,30 @@ namespace gl
|
||||
std::unordered_map<u64, std::unique_ptr<gl::texture_view>> view_cache;
|
||||
rsx::overlays::primitive_type m_current_primitive_type = rsx::overlays::primitive_type::quad_list;
|
||||
|
||||
ui_overlay_renderer()
|
||||
{
|
||||
vs_src =
|
||||
"#version 420\n\n"
|
||||
"layout(location=0) in vec4 in_pos;\n"
|
||||
"layout(location=0) out vec2 tc0;\n"
|
||||
"layout(location=1) flat out vec4 clip_rect;\n"
|
||||
"uniform vec4 ui_scale;\n"
|
||||
"uniform vec4 viewport;\n"
|
||||
"uniform vec4 clip_bounds;\n"
|
||||
"\n"
|
||||
"vec2 snap_to_grid(vec2 normalized)\n"
|
||||
"{\n"
|
||||
" return (floor(normalized * viewport.xy) + 0.5) / viewport.xy;\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"vec4 clip_to_ndc(const in vec4 coord)\n"
|
||||
"{\n"
|
||||
" vec4 ret = (coord * ui_scale.zwzw) / ui_scale.xyxy;\n"
|
||||
" ret.yw = 1. - ret.yw;\n"
|
||||
" return ret;\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"vec4 ndc_to_window(const in vec4 coord)\n"
|
||||
"{\n"
|
||||
" return fma(coord, viewport.xyxy, viewport.zwzw);\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"void main()\n"
|
||||
"{\n"
|
||||
" tc0.xy = in_pos.zw;\n"
|
||||
" clip_rect = ndc_to_window(clip_to_ndc(clip_bounds)).xwzy; // Swap y1 and y2 due to flipped origin!\n"
|
||||
" vec4 pos = vec4(clip_to_ndc(in_pos).xy, 0.5, 1.);\n"
|
||||
" pos.xy = snap_to_grid(pos.xy);\n"
|
||||
" gl_Position = (pos + pos) - 1.;\n"
|
||||
"}\n";
|
||||
ui_overlay_renderer();
|
||||
|
||||
fs_src =
|
||||
"#version 420\n\n"
|
||||
"layout(binding=31) uniform sampler2D fs0;\n"
|
||||
"layout(binding=30) uniform sampler2DArray fs1;\n"
|
||||
"layout(location=0) in vec2 tc0;\n"
|
||||
"layout(location=1) flat in vec4 clip_rect;\n"
|
||||
"layout(location=0) out vec4 ocol;\n"
|
||||
"uniform vec4 color;\n"
|
||||
"uniform float time;\n"
|
||||
"uniform int sampler_mode;\n"
|
||||
"uniform int pulse_glow;\n"
|
||||
"uniform int clip_region;\n"
|
||||
"uniform int blur_strength;\n"
|
||||
"\n"
|
||||
"vec4 blur_sample(sampler2D tex, vec2 coord, vec2 tex_offset)\n"
|
||||
"{\n"
|
||||
" vec2 coords[9];\n"
|
||||
" coords[0] = coord - tex_offset\n;"
|
||||
" coords[1] = coord + vec2(0., -tex_offset.y);\n"
|
||||
" coords[2] = coord + vec2(tex_offset.x, -tex_offset.y);\n"
|
||||
" coords[3] = coord + vec2(-tex_offset.x, 0.);\n"
|
||||
" coords[4] = coord;\n"
|
||||
" coords[5] = coord + vec2(tex_offset.x, 0.);\n"
|
||||
" coords[6] = coord + vec2(-tex_offset.x, tex_offset.y);\n"
|
||||
" coords[7] = coord + vec2(0., tex_offset.y);\n"
|
||||
" coords[8] = coord + tex_offset;\n"
|
||||
"\n"
|
||||
" float weights[9] =\n"
|
||||
" {\n"
|
||||
" 1., 2., 1.,\n"
|
||||
" 2., 4., 2.,\n"
|
||||
" 1., 2., 1.\n"
|
||||
" };\n"
|
||||
"\n"
|
||||
" vec4 blurred = vec4(0.);\n"
|
||||
" for (int n = 0; n < 9; ++n)\n"
|
||||
" {\n"
|
||||
" blurred += texture(tex, coords[n]) * weights[n];\n"
|
||||
" }\n"
|
||||
"\n"
|
||||
" return blurred / 16.f;\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"vec4 sample_image(sampler2D tex, vec2 coord)\n"
|
||||
"{\n"
|
||||
" vec4 original = texture(tex, coord);\n"
|
||||
" if (blur_strength == 0) return original;\n"
|
||||
" \n"
|
||||
" vec2 constraints = 1.f / vec2(640, 360);\n"
|
||||
" vec2 res_offset = 1.f / textureSize(fs0, 0);\n"
|
||||
" vec2 tex_offset = max(res_offset, constraints);\n"
|
||||
"\n"
|
||||
" // Sample triangle pattern and average\n"
|
||||
" // TODO: Nicer looking gaussian blur with less sampling\n"
|
||||
" vec4 blur0 = blur_sample(tex, coord + vec2(-res_offset.x, 0.), tex_offset);\n"
|
||||
" vec4 blur1 = blur_sample(tex, coord + vec2(res_offset.x, 0.), tex_offset);\n"
|
||||
" vec4 blur2 = blur_sample(tex, coord + vec2(0., res_offset.y), tex_offset);\n"
|
||||
"\n"
|
||||
" vec4 blurred = blur0 + blur1 + blur2;\n"
|
||||
" blurred /= 3.;\n"
|
||||
" return mix(original, blurred, float(blur_strength) / 100.);\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"void main()\n"
|
||||
"{\n"
|
||||
" if (clip_region != 0)\n"
|
||||
" {"
|
||||
" if (gl_FragCoord.x < clip_rect.x || gl_FragCoord.x > clip_rect.z ||\n"
|
||||
" gl_FragCoord.y < clip_rect.y || gl_FragCoord.y > clip_rect.w)\n"
|
||||
" {\n"
|
||||
" discard;\n"
|
||||
" return;\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
"\n"
|
||||
" vec4 diff_color = color;\n"
|
||||
" if (pulse_glow != 0)\n"
|
||||
" diff_color.a *= (sin(time) + 1.f) * 0.5f;\n"
|
||||
"\n"
|
||||
" switch (sampler_mode)\n"
|
||||
" {\n"
|
||||
" case 1:\n"
|
||||
" ocol = sample_image(fs0, tc0) * diff_color;\n"
|
||||
" break;\n"
|
||||
" case 2:\n"
|
||||
" ocol = texture(fs1, vec3(tc0.x, fract(tc0.y), trunc(tc0.y))) * diff_color;\n"
|
||||
" break;\n"
|
||||
" default:\n"
|
||||
" ocol = diff_color;\n"
|
||||
" break;\n"
|
||||
" }\n"
|
||||
"}\n";
|
||||
gl::texture_view* load_simple_image(rsx::overlays::image_info* desc, bool temp_resource, u32 owner_uid);
|
||||
|
||||
// Smooth filtering required for inputs
|
||||
input_filter = GL_LINEAR;
|
||||
}
|
||||
void create();
|
||||
void destroy();
|
||||
|
||||
gl::texture_view* load_simple_image(rsx::overlays::image_info* desc, bool temp_resource, u32 owner_uid)
|
||||
{
|
||||
auto tex = std::make_unique<gl::texture>(GL_TEXTURE_2D, desc->w, desc->h, 1, 1, GL_RGBA8);
|
||||
tex->copy_from(desc->data, gl::texture::format::rgba, gl::texture::type::uint_8_8_8_8, {});
|
||||
void remove_temp_resources(u64 key);
|
||||
|
||||
GLenum remap[] = { GL_RED, GL_ALPHA, GL_BLUE, GL_GREEN };
|
||||
auto view = std::make_unique<gl::texture_view>(tex.get(), remap);
|
||||
gl::texture_view* find_font(rsx::overlays::font* font);
|
||||
|
||||
auto result = view.get();
|
||||
if (!temp_resource)
|
||||
{
|
||||
resources.push_back(std::move(tex));
|
||||
view_cache[view_cache.size()] = std::move(view);
|
||||
}
|
||||
else
|
||||
{
|
||||
u64 key = reinterpret_cast<u64>(desc);
|
||||
temp_image_cache[key] = std::make_pair(owner_uid, std::move(tex));
|
||||
temp_view_cache[key] = std::move(view);
|
||||
}
|
||||
gl::texture_view* find_temp_image(rsx::overlays::image_info* desc, u32 owner_uid);
|
||||
|
||||
return result;
|
||||
}
|
||||
void set_primitive_type(rsx::overlays::primitive_type type);
|
||||
|
||||
void create()
|
||||
{
|
||||
overlay_pass::create();
|
||||
void emit_geometry() override;
|
||||
|
||||
rsx::overlays::resource_config configuration;
|
||||
configuration.load_files();
|
||||
|
||||
for (const auto &res : configuration.texture_raw_data)
|
||||
{
|
||||
load_simple_image(res.get(), false, UINT32_MAX);
|
||||
}
|
||||
|
||||
configuration.free_resources();
|
||||
}
|
||||
|
||||
void destroy()
|
||||
{
|
||||
temp_image_cache.clear();
|
||||
resources.clear();
|
||||
font_cache.clear();
|
||||
overlay_pass::destroy();
|
||||
}
|
||||
|
||||
void remove_temp_resources(u64 key)
|
||||
{
|
||||
std::vector<u64> keys_to_remove;
|
||||
for (const auto& temp_image : temp_image_cache)
|
||||
{
|
||||
if (temp_image.second.first == key)
|
||||
{
|
||||
keys_to_remove.push_back(temp_image.first);
|
||||
}
|
||||
}
|
||||
|
||||
for (const auto& _key : keys_to_remove)
|
||||
{
|
||||
temp_image_cache.erase(_key);
|
||||
temp_view_cache.erase(_key);
|
||||
}
|
||||
}
|
||||
|
||||
gl::texture_view* find_font(rsx::overlays::font *font)
|
||||
{
|
||||
const auto font_size = font->get_glyph_data_dimensions();
|
||||
|
||||
u64 key = reinterpret_cast<u64>(font);
|
||||
auto found = view_cache.find(key);
|
||||
if (found != view_cache.end())
|
||||
{
|
||||
if (const auto this_size = found->second->image()->size3D();
|
||||
font_size.width == this_size.width &&
|
||||
font_size.height == this_size.height &&
|
||||
font_size.depth == this_size.depth)
|
||||
{
|
||||
return found->second.get();
|
||||
}
|
||||
}
|
||||
|
||||
// Create font file
|
||||
std::vector<u8> glyph_data;
|
||||
font->get_glyph_data(glyph_data);
|
||||
|
||||
auto tex = std::make_unique<gl::texture>(GL_TEXTURE_2D_ARRAY, font_size.width, font_size.height, font_size.depth, 1, GL_R8);
|
||||
tex->copy_from(glyph_data.data(), gl::texture::format::r, gl::texture::type::ubyte, {});
|
||||
|
||||
GLenum remap[] = { GL_RED, GL_RED, GL_RED, GL_RED };
|
||||
auto view = std::make_unique<gl::texture_view>(tex.get(), remap);
|
||||
|
||||
auto result = view.get();
|
||||
font_cache[key] = std::move(tex);
|
||||
view_cache[key] = std::move(view);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
gl::texture_view* find_temp_image(rsx::overlays::image_info *desc, u32 owner_uid)
|
||||
{
|
||||
auto key = reinterpret_cast<u64>(desc);
|
||||
auto cached = temp_view_cache.find(key);
|
||||
if (cached != temp_view_cache.end())
|
||||
{
|
||||
return cached->second.get();
|
||||
}
|
||||
else
|
||||
{
|
||||
return load_simple_image(desc, true, owner_uid);
|
||||
}
|
||||
}
|
||||
|
||||
void set_primitive_type(rsx::overlays::primitive_type type)
|
||||
{
|
||||
m_current_primitive_type = type;
|
||||
|
||||
switch (type)
|
||||
{
|
||||
case rsx::overlays::primitive_type::quad_list:
|
||||
case rsx::overlays::primitive_type::triangle_strip:
|
||||
primitives = GL_TRIANGLE_STRIP;
|
||||
break;
|
||||
case rsx::overlays::primitive_type::line_list:
|
||||
primitives = GL_LINES;
|
||||
break;
|
||||
case rsx::overlays::primitive_type::line_strip:
|
||||
primitives = GL_LINE_STRIP;
|
||||
break;
|
||||
default:
|
||||
fmt::throw_exception("Unexpected primitive type %d", static_cast<s32>(type));
|
||||
}
|
||||
}
|
||||
|
||||
void emit_geometry() override
|
||||
{
|
||||
if (m_current_primitive_type == rsx::overlays::primitive_type::quad_list)
|
||||
{
|
||||
// Emulate quads with disjointed triangle strips
|
||||
int num_quads = num_drawable_elements / 4;
|
||||
std::vector<GLint> firsts;
|
||||
std::vector<GLsizei> counts;
|
||||
|
||||
firsts.resize(num_quads);
|
||||
counts.resize(num_quads);
|
||||
|
||||
for (int n = 0; n < num_quads; ++n)
|
||||
{
|
||||
firsts[n] = (n * 4);
|
||||
counts[n] = 4;
|
||||
}
|
||||
|
||||
int old_vao;
|
||||
glGetIntegerv(GL_VERTEX_ARRAY_BINDING, &old_vao);
|
||||
|
||||
m_vao.bind();
|
||||
glMultiDrawArrays(GL_TRIANGLE_STRIP, firsts.data(), counts.data(), num_quads);
|
||||
|
||||
glBindVertexArray(old_vao);
|
||||
}
|
||||
else
|
||||
{
|
||||
overlay_pass::emit_geometry();
|
||||
}
|
||||
}
|
||||
|
||||
void run(const areau& viewport, GLuint target, rsx::overlays::overlay& ui)
|
||||
{
|
||||
program_handle.uniforms["viewport"] = color4f(static_cast<f32>(viewport.width()), static_cast<f32>(viewport.height()), static_cast<f32>(viewport.x1), static_cast<f32>(viewport.y1));
|
||||
program_handle.uniforms["ui_scale"] = color4f(static_cast<f32>(ui.virtual_width), static_cast<f32>(ui.virtual_height), 1.f, 1.f);
|
||||
program_handle.uniforms["time"] = static_cast<f32>(get_system_time() / 1000) * 0.005f;
|
||||
|
||||
saved_sampler_state save_30(30, m_sampler);
|
||||
saved_sampler_state save_31(31, m_sampler);
|
||||
|
||||
for (auto &cmd : ui.get_compiled().draw_commands)
|
||||
{
|
||||
set_primitive_type(cmd.config.primitives);
|
||||
upload_vertex_data(cmd.verts.data(), ::size32(cmd.verts));
|
||||
num_drawable_elements = ::size32(cmd.verts);
|
||||
GLint texture_read = GL_TRUE;
|
||||
|
||||
switch (cmd.config.texture_ref)
|
||||
{
|
||||
case rsx::overlays::image_resource_id::game_icon:
|
||||
case rsx::overlays::image_resource_id::backbuffer:
|
||||
//TODO
|
||||
case rsx::overlays::image_resource_id::none:
|
||||
{
|
||||
texture_read = GL_FALSE;
|
||||
glBindTexture(GL_TEXTURE_2D, GL_NONE);
|
||||
break;
|
||||
}
|
||||
case rsx::overlays::image_resource_id::raw_image:
|
||||
{
|
||||
glBindTexture(GL_TEXTURE_2D, find_temp_image(static_cast<rsx::overlays::image_info*>(cmd.config.external_data_ref), ui.uid)->id());
|
||||
break;
|
||||
}
|
||||
case rsx::overlays::image_resource_id::font_file:
|
||||
{
|
||||
texture_read = (GL_TRUE + 1);
|
||||
glActiveTexture(GL_TEXTURE0 + 30);
|
||||
glBindTexture(GL_TEXTURE_2D_ARRAY, find_font(cmd.config.font_ref)->id());
|
||||
glActiveTexture(GL_TEXTURE0 + 31);
|
||||
break;
|
||||
}
|
||||
default:
|
||||
{
|
||||
glBindTexture(GL_TEXTURE_2D, view_cache[cmd.config.texture_ref - 1]->id());
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
program_handle.uniforms["color"] = cmd.config.color;
|
||||
program_handle.uniforms["sampler_mode"] = texture_read;
|
||||
program_handle.uniforms["pulse_glow"] = static_cast<s32>(cmd.config.pulse_glow);
|
||||
program_handle.uniforms["blur_strength"] = static_cast<s32>(cmd.config.blur_strength);
|
||||
program_handle.uniforms["clip_region"] = static_cast<s32>(cmd.config.clip_region);
|
||||
program_handle.uniforms["clip_bounds"] = cmd.config.clip_rect;
|
||||
overlay_pass::run(viewport, target, false, true);
|
||||
}
|
||||
|
||||
ui.update();
|
||||
}
|
||||
void run(const areau& viewport, GLuint target, rsx::overlays::overlay& ui);
|
||||
};
|
||||
|
||||
struct video_out_calibration_pass : public overlay_pass
|
||||
{
|
||||
video_out_calibration_pass()
|
||||
{
|
||||
vs_src =
|
||||
"#version 420\n\n"
|
||||
"layout(location=0) out vec2 tc0;\n"
|
||||
"\n"
|
||||
"void main()\n"
|
||||
"{\n"
|
||||
" vec2 positions[] = {vec2(-1., -1.), vec2(1., -1.), vec2(-1., 1.), vec2(1., 1.)};\n"
|
||||
" vec2 coords[] = {vec2(0., 1.), vec2(1., 1.), vec2(0., 0.), vec2(1., 0.)};\n"
|
||||
" tc0 = coords[gl_VertexID % 4];\n"
|
||||
" vec2 pos = positions[gl_VertexID % 4];\n"
|
||||
" gl_Position = vec4(pos, 0., 1.);\n"
|
||||
"}\n";
|
||||
video_out_calibration_pass();
|
||||
|
||||
fs_src =
|
||||
"#version 420\n\n"
|
||||
"layout(binding=31) uniform sampler2D fs0;\n"
|
||||
"layout(binding=30) uniform sampler2D fs1;\n"
|
||||
"layout(location=0) in vec2 tc0;\n"
|
||||
"layout(location=0) out vec4 ocol;\n"
|
||||
"\n"
|
||||
"uniform float gamma;\n"
|
||||
"uniform int limit_range;\n"
|
||||
"uniform int stereo;\n"
|
||||
"uniform int stereo_image_count;\n"
|
||||
"\n"
|
||||
"vec4 read_source()\n"
|
||||
"{\n"
|
||||
" if (stereo == 0) return texture(fs0, tc0);\n"
|
||||
"\n"
|
||||
" vec4 left, right;\n"
|
||||
" if (stereo_image_count == 2)\n"
|
||||
" {\n"
|
||||
" left = texture(fs0, tc0);\n"
|
||||
" right = texture(fs1, tc0);\n"
|
||||
" }\n"
|
||||
" else\n"
|
||||
" {\n"
|
||||
" vec2 coord_left = tc0 * vec2(1.f, 0.4898f);\n"
|
||||
" vec2 coord_right = coord_left + vec2(0.f, 0.510204f);\n"
|
||||
" left = texture(fs0, coord_left);\n"
|
||||
" right = texture(fs0, coord_right);\n"
|
||||
" }\n"
|
||||
"\n"
|
||||
" return vec4(left.r, right.g, right.b, 1.);\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"void main()\n"
|
||||
"{\n"
|
||||
" vec4 color = read_source();\n"
|
||||
" color.rgb = pow(color.rgb, vec3(gamma));\n"
|
||||
" if (limit_range > 0)\n"
|
||||
" ocol = ((color * 220.) + 16.) / 255.;\n"
|
||||
" else\n"
|
||||
" ocol = color;\n"
|
||||
"}\n";
|
||||
|
||||
input_filter = GL_LINEAR;
|
||||
}
|
||||
|
||||
void run(const areau& viewport, const rsx::simple_array<GLuint>& source, f32 gamma, bool limited_rgb, bool _3d)
|
||||
{
|
||||
program_handle.uniforms["gamma"] = gamma;
|
||||
program_handle.uniforms["limit_range"] = limited_rgb + 0;
|
||||
program_handle.uniforms["stereo"] = _3d + 0;
|
||||
program_handle.uniforms["stereo_image_count"] = (source[1] == GL_NONE? 1 : 2);
|
||||
|
||||
saved_sampler_state saved(31, m_sampler);
|
||||
glBindTexture(GL_TEXTURE_2D, source[0]);
|
||||
|
||||
saved_sampler_state saved2(30, m_sampler);
|
||||
glBindTexture(GL_TEXTURE_2D, source[1]);
|
||||
|
||||
overlay_pass::run(viewport, GL_NONE, false, false);
|
||||
}
|
||||
void run(const areau& viewport, const rsx::simple_array<GLuint>& source, f32 gamma, bool limited_rgb, bool _3d);
|
||||
};
|
||||
}
|
||||
|
@ -4,6 +4,8 @@
|
||||
#include "Utilities/geometry.h"
|
||||
#include "overlay_utils.h"
|
||||
|
||||
#include <functional>
|
||||
|
||||
namespace rsx
|
||||
{
|
||||
namespace overlays
|
||||
|
@ -19,6 +19,7 @@
|
||||
#include <sys/types.h>
|
||||
#include <pwd.h>
|
||||
#include <libgen.h>
|
||||
#include <limits.h>
|
||||
#endif
|
||||
|
||||
#ifdef __APPLE__
|
||||
|
428
rpcs3/Emu/RSX/VK/VKCompute.cpp
Normal file
428
rpcs3/Emu/RSX/VK/VKCompute.cpp
Normal file
@ -0,0 +1,428 @@
|
||||
#include "VKCompute.h"
|
||||
#include "VKHelpers.h"
|
||||
#include "VKRenderPass.h"
|
||||
#include "vkutils/buffer_object.h"
|
||||
|
||||
#define VK_MAX_COMPUTE_TASKS 4096 // Max number of jobs per frame
|
||||
|
||||
namespace vk
|
||||
{
|
||||
std::vector<std::pair<VkDescriptorType, u8>> compute_task::get_descriptor_layout()
|
||||
{
|
||||
std::vector<std::pair<VkDescriptorType, u8>> result;
|
||||
result.emplace_back(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, ssbo_count);
|
||||
return result;
|
||||
}
|
||||
|
||||
void compute_task::init_descriptors()
|
||||
{
|
||||
std::vector<VkDescriptorPoolSize> descriptor_pool_sizes;
|
||||
std::vector<VkDescriptorSetLayoutBinding> bindings;
|
||||
|
||||
const auto layout = get_descriptor_layout();
|
||||
for (const auto &e : layout)
|
||||
{
|
||||
descriptor_pool_sizes.push_back({e.first, u32(VK_MAX_COMPUTE_TASKS * e.second)});
|
||||
|
||||
for (unsigned n = 0; n < e.second; ++n)
|
||||
{
|
||||
bindings.push_back
|
||||
({
|
||||
u32(bindings.size()),
|
||||
e.first,
|
||||
1,
|
||||
VK_SHADER_STAGE_COMPUTE_BIT,
|
||||
nullptr
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
// Reserve descriptor pools
|
||||
m_descriptor_pool.create(*g_render_device, descriptor_pool_sizes.data(), ::size32(descriptor_pool_sizes), VK_MAX_COMPUTE_TASKS, 3);
|
||||
|
||||
VkDescriptorSetLayoutCreateInfo infos = {};
|
||||
infos.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO;
|
||||
infos.pBindings = bindings.data();
|
||||
infos.bindingCount = ::size32(bindings);
|
||||
|
||||
CHECK_RESULT(vkCreateDescriptorSetLayout(*g_render_device, &infos, nullptr, &m_descriptor_layout));
|
||||
|
||||
VkPipelineLayoutCreateInfo layout_info = {};
|
||||
layout_info.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO;
|
||||
layout_info.setLayoutCount = 1;
|
||||
layout_info.pSetLayouts = &m_descriptor_layout;
|
||||
|
||||
VkPushConstantRange push_constants{};
|
||||
if (use_push_constants)
|
||||
{
|
||||
push_constants.size = push_constants_size;
|
||||
push_constants.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT;
|
||||
|
||||
layout_info.pushConstantRangeCount = 1;
|
||||
layout_info.pPushConstantRanges = &push_constants;
|
||||
}
|
||||
|
||||
CHECK_RESULT(vkCreatePipelineLayout(*g_render_device, &layout_info, nullptr, &m_pipeline_layout));
|
||||
}
|
||||
|
||||
void compute_task::create()
|
||||
{
|
||||
if (!initialized)
|
||||
{
|
||||
init_descriptors();
|
||||
|
||||
switch (vk::get_driver_vendor())
|
||||
{
|
||||
case vk::driver_vendor::unknown:
|
||||
case vk::driver_vendor::INTEL:
|
||||
// Intel hw has 8 threads, but LDS allocation behavior makes optimal group size between 64 and 256
|
||||
// Based on intel's own OpenCL recommended settings
|
||||
unroll_loops = true;
|
||||
optimal_kernel_size = 1;
|
||||
optimal_group_size = 128;
|
||||
break;
|
||||
case vk::driver_vendor::NVIDIA:
|
||||
// Warps are multiples of 32. Increasing kernel depth seems to hurt performance (Nier, Big Duck sample)
|
||||
unroll_loops = true;
|
||||
optimal_group_size = 32;
|
||||
optimal_kernel_size = 1;
|
||||
break;
|
||||
case vk::driver_vendor::AMD:
|
||||
case vk::driver_vendor::RADV:
|
||||
// Wavefronts are multiples of 64
|
||||
unroll_loops = false;
|
||||
optimal_kernel_size = 1;
|
||||
optimal_group_size = 64;
|
||||
break;
|
||||
}
|
||||
|
||||
const auto& gpu = vk::g_render_device->gpu();
|
||||
max_invocations_x = gpu.get_limits().maxComputeWorkGroupCount[0];
|
||||
|
||||
initialized = true;
|
||||
}
|
||||
}
|
||||
|
||||
void compute_task::destroy()
|
||||
{
|
||||
if (initialized)
|
||||
{
|
||||
m_shader.destroy();
|
||||
m_program.reset();
|
||||
m_param_buffer.reset();
|
||||
|
||||
vkDestroyDescriptorSetLayout(*g_render_device, m_descriptor_layout, nullptr);
|
||||
vkDestroyPipelineLayout(*g_render_device, m_pipeline_layout, nullptr);
|
||||
m_descriptor_pool.destroy();
|
||||
|
||||
initialized = false;
|
||||
}
|
||||
}
|
||||
|
||||
void compute_task::free_resources()
|
||||
{
|
||||
if (m_used_descriptors == 0)
|
||||
return;
|
||||
|
||||
m_descriptor_pool.reset(0);
|
||||
m_used_descriptors = 0;
|
||||
}
|
||||
|
||||
// Prepares `cmd` for a dispatch: lazily builds the compute pipeline from m_src on first use,
// allocates a fresh descriptor set, lets the subclass bind its resources, then binds the
// pipeline and the descriptor set on the command buffer.
void compute_task::load_program(VkCommandBuffer cmd)
{
	if (!m_program)
	{
		// First use: compile the shader source and create the pipeline
		m_shader.create(::glsl::program_domain::glsl_compute_program, m_src);
		auto shader_module = m_shader.compile();

		VkPipelineShaderStageCreateInfo stage_info{};
		stage_info.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO;
		stage_info.stage = VK_SHADER_STAGE_COMPUTE_BIT;
		stage_info.module = shader_module;
		stage_info.pName = "main";

		VkComputePipelineCreateInfo pipeline_info{};
		pipeline_info.sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO;
		pipeline_info.stage = stage_info;
		pipeline_info.layout = m_pipeline_layout;
		pipeline_info.basePipelineHandle = VK_NULL_HANDLE;
		pipeline_info.basePipelineIndex = -1;

		auto pipe_compiler = vk::get_pipe_compiler();
		m_program = pipe_compiler->compile(pipeline_info, m_pipeline_layout, vk::pipe_compiler::COMPILE_INLINE);
		declare_inputs();
	}

	// Each call consumes one descriptor set; the budget is VK_MAX_COMPUTE_TASKS until free_resources()
	ensure(m_used_descriptors < VK_MAX_COMPUTE_TASKS);

	VkDescriptorSetAllocateInfo set_alloc_info = {};
	set_alloc_info.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO;
	set_alloc_info.descriptorPool = m_descriptor_pool;
	set_alloc_info.descriptorSetCount = 1;
	set_alloc_info.pSetLayouts = &m_descriptor_layout;

	CHECK_RESULT(vkAllocateDescriptorSets(*g_render_device, &set_alloc_info, &m_descriptor_set));
	m_used_descriptors++;

	bind_resources();

	vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, m_program->pipeline);
	vkCmdBindDescriptorSets(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, m_pipeline_layout, 0, 1, &m_descriptor_set, 0, nullptr);
}
|
||||
|
||||
// Records a dispatch of (invocations_x, invocations_y, invocations_z) workgroups on `cmd`.
// Any open renderpass is closed first, then the program and descriptors are bound.
void compute_task::run(VkCommandBuffer cmd, u32 invocations_x, u32 invocations_y, u32 invocations_z)
{
	// CmdDispatch is outside renderpass scope only
	const bool inside_renderpass = vk::is_renderpass_open(cmd);
	if (inside_renderpass)
	{
		vk::end_renderpass(cmd);
	}

	load_program(cmd);
	vkCmdDispatch(cmd, invocations_x, invocations_y, invocations_z);
}
|
||||
|
||||
// Records a linear job of `num_invocations` workgroups on `cmd`.
// If the count fits within max_invocations_x it is dispatched as (n, 1, 1). Otherwise it is
// folded into a near-square 2D grid whose x * y is guaranteed to be >= num_invocations;
// the grid may contain up to (x - 1) surplus workgroups.
void compute_task::run(VkCommandBuffer cmd, u32 num_invocations)
{
	u32 invocations_x = num_invocations;
	u32 invocations_y = 1;

	if (num_invocations > max_invocations_x)
	{
		// AMD hw reports an annoyingly small maximum number of invocations in the X dimension
		// Split the 1D job into 2 dimensions to accommodate this
		// num_invocations >= 1 in this branch, so invocations_x >= 1 and the division below is safe
		invocations_x = static_cast<u32>(floor(std::sqrt(num_invocations)));

		// Row count is ceil(num_invocations / invocations_x).
		// Starting from y = x and adding at most one row is not enough in general:
		// e.g. n = 70000 -> x = 264, and 264 * 265 = 69960 would leave 40 workgroups unscheduled.
		invocations_y = num_invocations / invocations_x;
		if (num_invocations % invocations_x) invocations_y++;
	}

	run(cmd, invocations_x, invocations_y, 1);
}
|
||||
|
||||
// Installs the default GLSL fragments used by build():
// - work_kernel: load one word, transform it with the function substituted for %f, store it back
// - loop_advance: step to the next word
// - suffix: closes main()
cs_shuffle_base::cs_shuffle_base()
	: work_kernel(
		" value = data[index];\n"
		" data[index] = %f(value);\n")
	, loop_advance(
		" index++;\n")
	, suffix(
		"}\n")
{
}
|
||||
|
||||
void cs_shuffle_base::build(const char* function_name, u32 _kernel_size)
|
||||
{
|
||||
// Initialize to allow detecting optimal settings
|
||||
create();
|
||||
|
||||
kernel_size = _kernel_size? _kernel_size : optimal_kernel_size;
|
||||
|
||||
m_src =
|
||||
"#version 430\n"
|
||||
"layout(local_size_x=%ws, local_size_y=1, local_size_z=1) in;\n"
|
||||
"layout(std430, set=0, binding=0) buffer ssbo{ uint data[]; };\n"
|
||||
"%ub"
|
||||
"\n"
|
||||
"#define KERNEL_SIZE %ks\n"
|
||||
"\n"
|
||||
"// Generic swap routines\n"
|
||||
"#define bswap_u16(bits) (bits & 0xFF) << 8 | (bits & 0xFF00) >> 8 | (bits & 0xFF0000) << 8 | (bits & 0xFF000000) >> 8\n"
|
||||
"#define bswap_u32(bits) (bits & 0xFF) << 24 | (bits & 0xFF00) << 8 | (bits & 0xFF0000) >> 8 | (bits & 0xFF000000) >> 24\n"
|
||||
"#define bswap_u16_u32(bits) (bits & 0xFFFF) << 16 | (bits & 0xFFFF0000) >> 16\n"
|
||||
"\n"
|
||||
"// Depth format conversions\n"
|
||||
"#define d24_to_f32(bits) floatBitsToUint(float(bits) / 16777215.f)\n"
|
||||
"#define f32_to_d24(bits) uint(uintBitsToFloat(bits) * 16777215.f)\n"
|
||||
"#define d24f_to_f32(bits) (bits << 7)\n"
|
||||
"#define f32_to_d24f(bits) (bits >> 7)\n"
|
||||
"#define d24x8_to_f32(bits) d24_to_f32(bits >> 8)\n"
|
||||
"#define d24x8_to_d24x8_swapped(bits) (bits & 0xFF00) | (bits & 0xFF0000) >> 16 | (bits & 0xFF) << 16\n"
|
||||
"#define f32_to_d24x8_swapped(bits) d24x8_to_d24x8_swapped(f32_to_d24(bits))\n"
|
||||
"\n"
|
||||
"%md"
|
||||
"void main()\n"
|
||||
"{\n"
|
||||
" uint invocations_x = (gl_NumWorkGroups.x * gl_WorkGroupSize.x);"
|
||||
" uint invocation_id = (gl_GlobalInvocationID.y * invocations_x) + gl_GlobalInvocationID.x;\n"
|
||||
" uint index = invocation_id * KERNEL_SIZE;\n"
|
||||
" uint value;\n"
|
||||
"%vars"
|
||||
"\n";
|
||||
|
||||
const auto parameters_size = utils::align(push_constants_size, 16) / 16;
|
||||
const std::pair<std::string, std::string> syntax_replace[] =
|
||||
{
|
||||
{ "%ws", std::to_string(optimal_group_size) },
|
||||
{ "%ks", std::to_string(kernel_size) },
|
||||
{ "%vars", variables },
|
||||
{ "%f", function_name },
|
||||
{ "%md", method_declarations },
|
||||
{ "%ub", use_push_constants? "layout(push_constant) uniform ubo{ uvec4 params[" + std::to_string(parameters_size) + "]; };\n" : "" },
|
||||
};
|
||||
|
||||
m_src = fmt::replace_all(m_src, syntax_replace);
|
||||
work_kernel = fmt::replace_all(work_kernel, syntax_replace);
|
||||
|
||||
if (kernel_size <= 1)
|
||||
{
|
||||
m_src += " {\n" + work_kernel + " }\n";
|
||||
}
|
||||
else if (unroll_loops)
|
||||
{
|
||||
work_kernel += loop_advance + "\n";
|
||||
|
||||
m_src += std::string
|
||||
(
|
||||
" //Unrolled loop\n"
|
||||
" {\n"
|
||||
);
|
||||
|
||||
// Assemble body with manual loop unroll to try loweing GPR usage
|
||||
for (u32 n = 0; n < kernel_size; ++n)
|
||||
{
|
||||
m_src += work_kernel;
|
||||
}
|
||||
|
||||
m_src += " }\n";
|
||||
}
|
||||
else
|
||||
{
|
||||
m_src += " for (int loop = 0; loop < KERNEL_SIZE; ++loop)\n";
|
||||
m_src += " {\n";
|
||||
m_src += work_kernel;
|
||||
m_src += loop_advance;
|
||||
m_src += " }\n";
|
||||
}
|
||||
|
||||
m_src += suffix;
|
||||
}
|
||||
|
||||
void cs_shuffle_base::bind_resources()
|
||||
{
|
||||
m_program->bind_buffer({ m_data->value, m_data_offset, m_data_length }, 0, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, m_descriptor_set);
|
||||
}
|
||||
|
||||
// Uploads `count` 32-bit words from `params` as compute-stage push constants, starting at offset 0.
// Only valid for tasks that enabled use_push_constants.
void cs_shuffle_base::set_parameters(VkCommandBuffer cmd, const u32* params, u8 count)
{
	ensure(use_push_constants);

	// 4 bytes per parameter word
	const auto size_in_bytes = count * 4;
	vkCmdPushConstants(cmd, m_pipeline_layout, VK_SHADER_STAGE_COMPUTE_BIT, 0, size_in_bytes, params);
}
|
||||
|
||||
// Runs the shuffle kernel over `data_length` bytes of `data`, starting at `data_offset`.
// The byte count is rounded up to a whole number of workgroups (group size * kernel size * 4 bytes),
// so the dispatch may touch bytes past data_length; an error is logged if that rounded range
// does not fit inside the buffer.
void cs_shuffle_base::run(VkCommandBuffer cmd, const vk::buffer* data, u32 data_length, u32 data_offset)
{
	// Stash the range for bind_resources(), which is invoked during the dispatch setup
	m_data = data;
	m_data_offset = data_offset;
	m_data_length = data_length;

	const auto num_bytes_per_invocation = optimal_group_size * kernel_size * 4;
	const auto num_bytes_to_process = rsx::align2(data_length, num_bytes_per_invocation);
	const auto num_invocations = num_bytes_to_process / num_bytes_per_invocation;

	if ((num_bytes_to_process + data_offset) > data->size())
	{
		// Technically robust buffer access should keep the driver from crashing in OOB situations
		// Note the trailing space: the two literals are concatenated into a single message
		rsx_log.error("Inadequate buffer length submitted for a compute operation. "
			"Required=%d bytes, Available=%d bytes", num_bytes_to_process, data->size());
	}

	compute_task::run(cmd, num_invocations);
}
|
||||
|
||||
// Configures the task to take 16 bytes of push constants and declares the shader locals
// shared by interleave kernels. params[0].xyz are shifted right by 2 (bytes -> 32-bit words)
// to form block_length, z_offset and s_offset.
cs_interleave_task::cs_interleave_task()
{
	push_constants_size = 16;
	use_push_constants = true;

	variables =
		" uint block_length = params[0].x >> 2;\n"
		" uint z_offset = params[0].y >> 2;\n"
		" uint s_offset = params[0].z >> 2;\n"
		" uint depth;\n"
		" uint stencil;\n"
		" uint stencil_shift;\n"
		" uint stencil_offset;\n";
}
|
||||
|
||||
void cs_interleave_task::bind_resources()
|
||||
{
|
||||
m_program->bind_buffer({ m_data->value, m_data_offset, m_ssbo_length }, 0, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, m_descriptor_set);
|
||||
}
|
||||
|
||||
// Runs the interleave kernel on `data`.
//   data_offset    - byte offset of the block being processed
//   data_length    - byte length of the block
//   zeta_offset    - byte offset of the depth plane; pushed relative to data_offset
//   stencil_offset - byte offset of the stencil plane; must be greater than data_offset
// The pushed parameters are { data_length, zeta_offset - data_offset, stencil_offset - data_offset, 0 }.
void cs_interleave_task::run(VkCommandBuffer cmd, const vk::buffer* data, u32 data_offset, u32 data_length, u32 zeta_offset, u32 stencil_offset)
{
	// Check the precondition before the unsigned subtractions below depend on it,
	// so that nothing derived from a wrapped value is recorded into the command buffer
	ensure(stencil_offset > data_offset);

	u32 parameters[4] = { data_length, zeta_offset - data_offset, stencil_offset - data_offset, 0 };
	set_parameters(cmd, parameters, 4);

	// Bound range runs from data_offset up to stencil_offset plus data_length / 4 bytes
	m_ssbo_length = stencil_offset + (data_length / 4) - data_offset;
	cs_shuffle_base::run(cmd, data, data_length, data_offset);
}
|
||||
|
||||
// Replaces the default work kernel with one that splits each source word:
// the upper 24 bits are written to the z_offset plane, and the low 8 bits are OR-ed
// (atomically) into a byte lane of a packed word in the s_offset plane.
// Invocations whose index is at or past block_length exit early.
cs_scatter_d24x8::cs_scatter_d24x8()
{
	work_kernel =
		" if (index >= block_length)\n"
		" return;\n"
		"\n"
		" value = data[index];\n"
		" data[index + z_offset] = (value >> 8);\n"
		" stencil_offset = (index / 4);\n"
		" stencil_shift = (index % 4) * 8;\n"
		" stencil = (value & 0xFF) << stencil_shift;\n"
		" atomicOr(data[stencil_offset + s_offset], stencil);\n";

	// This kernel contains no %f placeholder, so the function name is left empty
	cs_shuffle_base::build("");
}
|
||||
|
||||
// Sets up a two-buffer job (read-only source array, single-word result) and generates
// a shader in which every in-range invocation atomically adds its source word to the result.
cs_aggregator::cs_aggregator()
{
	ssbo_count = 2;

	// Must run before the source is generated: it selects optimal_group_size
	create();

	const std::string shader_template =
		"#version 450\n"
		"layout(local_size_x = %ws, local_size_y = 1, local_size_z = 1) in;\n\n"

		"layout(set=0, binding=0, std430) readonly buffer ssbo0{ uint src[]; };\n"
		"layout(set=0, binding=1, std430) writeonly buffer ssbo1{ uint result; };\n\n"

		"void main()\n"
		"{\n"
		" if (gl_GlobalInvocationID.x < src.length())\n"
		" {\n"
		" atomicAdd(result, src[gl_GlobalInvocationID.x]);\n"
		" }\n"
		"}\n";

	const std::pair<std::string, std::string> replacements[] =
	{
		{ "%ws", std::to_string(optimal_group_size) },
	};

	m_src = fmt::replace_all(shader_template, replacements);
}
|
||||
|
||||
void cs_aggregator::bind_resources()
|
||||
{
|
||||
m_program->bind_buffer({ src->value, 0, block_length }, 0, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, m_descriptor_set);
|
||||
m_program->bind_buffer({ dst->value, 0, 4 }, 1, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, m_descriptor_set);
|
||||
}
|
||||
|
||||
// Records a job that accumulates `num_words` 32-bit words from `src` into `dst`.
// The buffers and sizes are stored on the task so bind_resources() can pick them up.
void cs_aggregator::run(VkCommandBuffer cmd, const vk::buffer* dst, const vk::buffer* src, u32 num_words)
{
	this->src = src;
	this->dst = dst;

	block_length = num_words * 4; // size of the source range in bytes
	word_count = num_words;

	// One invocation per word, rounded up to whole workgroups
	const u32 group_count = utils::aligned_div(word_count, optimal_group_size);
	compute_task::run(cmd, group_count);
}
|
||||
}
|
@ -1,18 +1,14 @@
|
||||
#pragma once
|
||||
#include "VKPipelineCompiler.h"
|
||||
#include "vkutils/descriptors.hpp"
|
||||
#include "Utilities/StrUtil.h"
|
||||
#include "vkutils/buffer_object.h"
|
||||
|
||||
#include "Emu/IdManager.h"
|
||||
|
||||
#include "VKPipelineCompiler.h"
|
||||
#include "VKRenderPass.h"
|
||||
#include "VKHelpers.h"
|
||||
#include "vkutils/buffer_object.h"
|
||||
#include "vkutils/device.h"
|
||||
|
||||
#include "Utilities/StrUtil.h"
|
||||
#include "util/asm.hpp"
|
||||
#include <unordered_map>
|
||||
|
||||
#define VK_MAX_COMPUTE_TASKS 4096 // Max number of jobs per frame
|
||||
#include <unordered_map>
|
||||
|
||||
namespace vk
|
||||
{
|
||||
@ -38,207 +34,22 @@ namespace vk
|
||||
u32 optimal_kernel_size = 1;
|
||||
u32 max_invocations_x = 65535;
|
||||
|
||||
virtual std::vector<std::pair<VkDescriptorType, u8>> get_descriptor_layout()
|
||||
{
|
||||
std::vector<std::pair<VkDescriptorType, u8>> result;
|
||||
result.emplace_back(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, ssbo_count);
|
||||
return result;
|
||||
}
|
||||
virtual std::vector<std::pair<VkDescriptorType, u8>> get_descriptor_layout();
|
||||
|
||||
void init_descriptors()
|
||||
{
|
||||
std::vector<VkDescriptorPoolSize> descriptor_pool_sizes;
|
||||
std::vector<VkDescriptorSetLayoutBinding> bindings;
|
||||
void init_descriptors();
|
||||
|
||||
const auto layout = get_descriptor_layout();
|
||||
for (const auto &e : layout)
|
||||
{
|
||||
descriptor_pool_sizes.push_back({e.first, u32(VK_MAX_COMPUTE_TASKS * e.second)});
|
||||
void create();
|
||||
void destroy();
|
||||
|
||||
for (unsigned n = 0; n < e.second; ++n)
|
||||
{
|
||||
bindings.push_back
|
||||
({
|
||||
u32(bindings.size()),
|
||||
e.first,
|
||||
1,
|
||||
VK_SHADER_STAGE_COMPUTE_BIT,
|
||||
nullptr
|
||||
});
|
||||
}
|
||||
}
|
||||
void free_resources();
|
||||
|
||||
// Reserve descriptor pools
|
||||
m_descriptor_pool.create(*g_render_device, descriptor_pool_sizes.data(), ::size32(descriptor_pool_sizes), VK_MAX_COMPUTE_TASKS, 3);
|
||||
virtual void bind_resources() {}
|
||||
virtual void declare_inputs() {}
|
||||
|
||||
VkDescriptorSetLayoutCreateInfo infos = {};
|
||||
infos.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO;
|
||||
infos.pBindings = bindings.data();
|
||||
infos.bindingCount = ::size32(bindings);
|
||||
void load_program(VkCommandBuffer cmd);
|
||||
|
||||
CHECK_RESULT(vkCreateDescriptorSetLayout(*g_render_device, &infos, nullptr, &m_descriptor_layout));
|
||||
|
||||
VkPipelineLayoutCreateInfo layout_info = {};
|
||||
layout_info.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO;
|
||||
layout_info.setLayoutCount = 1;
|
||||
layout_info.pSetLayouts = &m_descriptor_layout;
|
||||
|
||||
VkPushConstantRange push_constants{};
|
||||
if (use_push_constants)
|
||||
{
|
||||
push_constants.size = push_constants_size;
|
||||
push_constants.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT;
|
||||
|
||||
layout_info.pushConstantRangeCount = 1;
|
||||
layout_info.pPushConstantRanges = &push_constants;
|
||||
}
|
||||
|
||||
CHECK_RESULT(vkCreatePipelineLayout(*g_render_device, &layout_info, nullptr, &m_pipeline_layout));
|
||||
}
|
||||
|
||||
void create()
|
||||
{
|
||||
if (!initialized)
|
||||
{
|
||||
init_descriptors();
|
||||
|
||||
switch (vk::get_driver_vendor())
|
||||
{
|
||||
case vk::driver_vendor::unknown:
|
||||
case vk::driver_vendor::INTEL:
|
||||
// Intel hw has 8 threads, but LDS allocation behavior makes optimal group size between 64 and 256
|
||||
// Based on intel's own OpenCL recommended settings
|
||||
unroll_loops = true;
|
||||
optimal_kernel_size = 1;
|
||||
optimal_group_size = 128;
|
||||
break;
|
||||
case vk::driver_vendor::NVIDIA:
|
||||
// Warps are multiples of 32. Increasing kernel depth seems to hurt performance (Nier, Big Duck sample)
|
||||
unroll_loops = true;
|
||||
optimal_group_size = 32;
|
||||
optimal_kernel_size = 1;
|
||||
break;
|
||||
case vk::driver_vendor::AMD:
|
||||
case vk::driver_vendor::RADV:
|
||||
// Wavefronts are multiples of 64
|
||||
unroll_loops = false;
|
||||
optimal_kernel_size = 1;
|
||||
optimal_group_size = 64;
|
||||
break;
|
||||
}
|
||||
|
||||
const auto& gpu = vk::g_render_device->gpu();
|
||||
max_invocations_x = gpu.get_limits().maxComputeWorkGroupCount[0];
|
||||
|
||||
initialized = true;
|
||||
}
|
||||
}
|
||||
|
||||
void destroy()
|
||||
{
|
||||
if (initialized)
|
||||
{
|
||||
m_shader.destroy();
|
||||
m_program.reset();
|
||||
m_param_buffer.reset();
|
||||
|
||||
vkDestroyDescriptorSetLayout(*g_render_device, m_descriptor_layout, nullptr);
|
||||
vkDestroyPipelineLayout(*g_render_device, m_pipeline_layout, nullptr);
|
||||
m_descriptor_pool.destroy();
|
||||
|
||||
initialized = false;
|
||||
}
|
||||
}
|
||||
|
||||
void free_resources()
|
||||
{
|
||||
if (m_used_descriptors == 0)
|
||||
return;
|
||||
|
||||
m_descriptor_pool.reset(0);
|
||||
m_used_descriptors = 0;
|
||||
}
|
||||
|
||||
virtual void bind_resources()
|
||||
{}
|
||||
|
||||
virtual void declare_inputs()
|
||||
{}
|
||||
|
||||
void load_program(VkCommandBuffer cmd)
|
||||
{
|
||||
if (!m_program)
|
||||
{
|
||||
m_shader.create(::glsl::program_domain::glsl_compute_program, m_src);
|
||||
auto handle = m_shader.compile();
|
||||
|
||||
VkPipelineShaderStageCreateInfo shader_stage{};
|
||||
shader_stage.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO;
|
||||
shader_stage.stage = VK_SHADER_STAGE_COMPUTE_BIT;
|
||||
shader_stage.module = handle;
|
||||
shader_stage.pName = "main";
|
||||
|
||||
VkComputePipelineCreateInfo info{};
|
||||
info.sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO;
|
||||
info.stage = shader_stage;
|
||||
info.layout = m_pipeline_layout;
|
||||
info.basePipelineIndex = -1;
|
||||
info.basePipelineHandle = VK_NULL_HANDLE;
|
||||
|
||||
auto compiler = vk::get_pipe_compiler();
|
||||
m_program = compiler->compile(info, m_pipeline_layout, vk::pipe_compiler::COMPILE_INLINE);
|
||||
declare_inputs();
|
||||
}
|
||||
|
||||
ensure(m_used_descriptors < VK_MAX_COMPUTE_TASKS);
|
||||
|
||||
VkDescriptorSetAllocateInfo alloc_info = {};
|
||||
alloc_info.descriptorPool = m_descriptor_pool;
|
||||
alloc_info.descriptorSetCount = 1;
|
||||
alloc_info.pSetLayouts = &m_descriptor_layout;
|
||||
alloc_info.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO;
|
||||
|
||||
CHECK_RESULT(vkAllocateDescriptorSets(*g_render_device, &alloc_info, &m_descriptor_set));
|
||||
m_used_descriptors++;
|
||||
|
||||
bind_resources();
|
||||
|
||||
vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, m_program->pipeline);
|
||||
vkCmdBindDescriptorSets(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, m_pipeline_layout, 0, 1, &m_descriptor_set, 0, nullptr);
|
||||
}
|
||||
|
||||
void run(VkCommandBuffer cmd, u32 invocations_x, u32 invocations_y, u32 invocations_z)
|
||||
{
|
||||
// CmdDispatch is outside renderpass scope only
|
||||
if (vk::is_renderpass_open(cmd))
|
||||
{
|
||||
vk::end_renderpass(cmd);
|
||||
}
|
||||
|
||||
load_program(cmd);
|
||||
vkCmdDispatch(cmd, invocations_x, invocations_y, invocations_z);
|
||||
}
|
||||
|
||||
void run(VkCommandBuffer cmd, u32 num_invocations)
|
||||
{
|
||||
u32 invocations_x, invocations_y;
|
||||
if (num_invocations > max_invocations_x)
|
||||
{
|
||||
// AMD hw reports an annoyingly small maximum number of invocations in the X dimension
|
||||
// Split the 1D job into 2 dimensions to accomodate this
|
||||
invocations_x = static_cast<u32>(floor(std::sqrt(num_invocations)));
|
||||
invocations_y = invocations_x;
|
||||
|
||||
if (num_invocations % invocations_x) invocations_y++;
|
||||
}
|
||||
else
|
||||
{
|
||||
invocations_x = num_invocations;
|
||||
invocations_y = 1;
|
||||
}
|
||||
|
||||
run(cmd, invocations_x, invocations_y, 1);
|
||||
}
|
||||
void run(VkCommandBuffer cmd, u32 invocations_x, u32 invocations_y, u32 invocations_z);
|
||||
void run(VkCommandBuffer cmd, u32 num_invocations);
|
||||
};
|
||||
|
||||
struct cs_shuffle_base : compute_task
|
||||
@ -251,136 +62,15 @@ namespace vk
|
||||
std::string variables, work_kernel, loop_advance, suffix;
|
||||
std::string method_declarations;
|
||||
|
||||
cs_shuffle_base()
|
||||
{
|
||||
work_kernel =
|
||||
" value = data[index];\n"
|
||||
" data[index] = %f(value);\n";
|
||||
cs_shuffle_base();
|
||||
|
||||
loop_advance =
|
||||
" index++;\n";
|
||||
void build(const char* function_name, u32 _kernel_size = 0);
|
||||
|
||||
suffix =
|
||||
"}\n";
|
||||
}
|
||||
void bind_resources() override;
|
||||
|
||||
void build(const char* function_name, u32 _kernel_size = 0)
|
||||
{
|
||||
// Initialize to allow detecting optimal settings
|
||||
create();
|
||||
void set_parameters(VkCommandBuffer cmd, const u32* params, u8 count);
|
||||
|
||||
kernel_size = _kernel_size? _kernel_size : optimal_kernel_size;
|
||||
|
||||
m_src =
|
||||
"#version 430\n"
|
||||
"layout(local_size_x=%ws, local_size_y=1, local_size_z=1) in;\n"
|
||||
"layout(std430, set=0, binding=0) buffer ssbo{ uint data[]; };\n"
|
||||
"%ub"
|
||||
"\n"
|
||||
"#define KERNEL_SIZE %ks\n"
|
||||
"\n"
|
||||
"// Generic swap routines\n"
|
||||
"#define bswap_u16(bits) (bits & 0xFF) << 8 | (bits & 0xFF00) >> 8 | (bits & 0xFF0000) << 8 | (bits & 0xFF000000) >> 8\n"
|
||||
"#define bswap_u32(bits) (bits & 0xFF) << 24 | (bits & 0xFF00) << 8 | (bits & 0xFF0000) >> 8 | (bits & 0xFF000000) >> 24\n"
|
||||
"#define bswap_u16_u32(bits) (bits & 0xFFFF) << 16 | (bits & 0xFFFF0000) >> 16\n"
|
||||
"\n"
|
||||
"// Depth format conversions\n"
|
||||
"#define d24_to_f32(bits) floatBitsToUint(float(bits) / 16777215.f)\n"
|
||||
"#define f32_to_d24(bits) uint(uintBitsToFloat(bits) * 16777215.f)\n"
|
||||
"#define d24f_to_f32(bits) (bits << 7)\n"
|
||||
"#define f32_to_d24f(bits) (bits >> 7)\n"
|
||||
"#define d24x8_to_f32(bits) d24_to_f32(bits >> 8)\n"
|
||||
"#define d24x8_to_d24x8_swapped(bits) (bits & 0xFF00) | (bits & 0xFF0000) >> 16 | (bits & 0xFF) << 16\n"
|
||||
"#define f32_to_d24x8_swapped(bits) d24x8_to_d24x8_swapped(f32_to_d24(bits))\n"
|
||||
"\n"
|
||||
"%md"
|
||||
"void main()\n"
|
||||
"{\n"
|
||||
" uint invocations_x = (gl_NumWorkGroups.x * gl_WorkGroupSize.x);"
|
||||
" uint invocation_id = (gl_GlobalInvocationID.y * invocations_x) + gl_GlobalInvocationID.x;\n"
|
||||
" uint index = invocation_id * KERNEL_SIZE;\n"
|
||||
" uint value;\n"
|
||||
"%vars"
|
||||
"\n";
|
||||
|
||||
const auto parameters_size = utils::align(push_constants_size, 16) / 16;
|
||||
const std::pair<std::string, std::string> syntax_replace[] =
|
||||
{
|
||||
{ "%ws", std::to_string(optimal_group_size) },
|
||||
{ "%ks", std::to_string(kernel_size) },
|
||||
{ "%vars", variables },
|
||||
{ "%f", function_name },
|
||||
{ "%md", method_declarations },
|
||||
{ "%ub", use_push_constants? "layout(push_constant) uniform ubo{ uvec4 params[" + std::to_string(parameters_size) + "]; };\n" : "" },
|
||||
};
|
||||
|
||||
m_src = fmt::replace_all(m_src, syntax_replace);
|
||||
work_kernel = fmt::replace_all(work_kernel, syntax_replace);
|
||||
|
||||
if (kernel_size <= 1)
|
||||
{
|
||||
m_src += " {\n" + work_kernel + " }\n";
|
||||
}
|
||||
else if (unroll_loops)
|
||||
{
|
||||
work_kernel += loop_advance + "\n";
|
||||
|
||||
m_src += std::string
|
||||
(
|
||||
" //Unrolled loop\n"
|
||||
" {\n"
|
||||
);
|
||||
|
||||
// Assemble body with manual loop unroll to try loweing GPR usage
|
||||
for (u32 n = 0; n < kernel_size; ++n)
|
||||
{
|
||||
m_src += work_kernel;
|
||||
}
|
||||
|
||||
m_src += " }\n";
|
||||
}
|
||||
else
|
||||
{
|
||||
m_src += " for (int loop = 0; loop < KERNEL_SIZE; ++loop)\n";
|
||||
m_src += " {\n";
|
||||
m_src += work_kernel;
|
||||
m_src += loop_advance;
|
||||
m_src += " }\n";
|
||||
}
|
||||
|
||||
m_src += suffix;
|
||||
}
|
||||
|
||||
void bind_resources() override
|
||||
{
|
||||
m_program->bind_buffer({ m_data->value, m_data_offset, m_data_length }, 0, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, m_descriptor_set);
|
||||
}
|
||||
|
||||
void set_parameters(VkCommandBuffer cmd, const u32* params, u8 count)
|
||||
{
|
||||
ensure(use_push_constants);
|
||||
vkCmdPushConstants(cmd, m_pipeline_layout, VK_SHADER_STAGE_COMPUTE_BIT, 0, count * 4, params);
|
||||
}
|
||||
|
||||
void run(VkCommandBuffer cmd, const vk::buffer* data, u32 data_length, u32 data_offset = 0)
|
||||
{
|
||||
m_data = data;
|
||||
m_data_offset = data_offset;
|
||||
m_data_length = data_length;
|
||||
|
||||
const auto num_bytes_per_invocation = optimal_group_size * kernel_size * 4;
|
||||
const auto num_bytes_to_process = rsx::align2(data_length, num_bytes_per_invocation);
|
||||
const auto num_invocations = num_bytes_to_process / num_bytes_per_invocation;
|
||||
|
||||
if ((num_bytes_to_process + data_offset) > data->size())
|
||||
{
|
||||
// Technically robust buffer access should keep the driver from crashing in OOB situations
|
||||
rsx_log.error("Inadequate buffer length submitted for a compute operation."
|
||||
"Required=%d bytes, Available=%d bytes", num_bytes_to_process, data->size());
|
||||
}
|
||||
|
||||
compute_task::run(cmd, num_invocations);
|
||||
}
|
||||
void run(VkCommandBuffer cmd, const vk::buffer* data, u32 data_length, u32 data_offset = 0);
|
||||
};
|
||||
|
||||
struct cs_shuffle_16 : cs_shuffle_base
|
||||
@ -442,35 +132,11 @@ namespace vk
|
||||
{
|
||||
u32 m_ssbo_length = 0;
|
||||
|
||||
cs_interleave_task()
|
||||
{
|
||||
use_push_constants = true;
|
||||
push_constants_size = 16;
|
||||
cs_interleave_task();
|
||||
|
||||
variables =
|
||||
" uint block_length = params[0].x >> 2;\n"
|
||||
" uint z_offset = params[0].y >> 2;\n"
|
||||
" uint s_offset = params[0].z >> 2;\n"
|
||||
" uint depth;\n"
|
||||
" uint stencil;\n"
|
||||
" uint stencil_shift;\n"
|
||||
" uint stencil_offset;\n";
|
||||
}
|
||||
void bind_resources() override;
|
||||
|
||||
void bind_resources() override
|
||||
{
|
||||
m_program->bind_buffer({ m_data->value, m_data_offset, m_ssbo_length }, 0, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, m_descriptor_set);
|
||||
}
|
||||
|
||||
void run(VkCommandBuffer cmd, const vk::buffer* data, u32 data_offset, u32 data_length, u32 zeta_offset, u32 stencil_offset)
|
||||
{
|
||||
u32 parameters[4] = { data_length, zeta_offset - data_offset, stencil_offset - data_offset, 0 };
|
||||
set_parameters(cmd, parameters, 4);
|
||||
|
||||
ensure(stencil_offset > data_offset);
|
||||
m_ssbo_length = stencil_offset + (data_length / 4) - data_offset;
|
||||
cs_shuffle_base::run(cmd, data, data_length, data_offset);
|
||||
}
|
||||
void run(VkCommandBuffer cmd, const vk::buffer* data, u32 data_offset, u32 data_length, u32 zeta_offset, u32 stencil_offset);
|
||||
};
|
||||
|
||||
template<bool _SwapBytes = false>
|
||||
@ -549,21 +215,7 @@ namespace vk
|
||||
|
||||
struct cs_scatter_d24x8 : cs_interleave_task
|
||||
{
|
||||
cs_scatter_d24x8()
|
||||
{
|
||||
work_kernel =
|
||||
" if (index >= block_length)\n"
|
||||
" return;\n"
|
||||
"\n"
|
||||
" value = data[index];\n"
|
||||
" data[index + z_offset] = (value >> 8);\n"
|
||||
" stencil_offset = (index / 4);\n"
|
||||
" stencil_shift = (index % 4) * 8;\n"
|
||||
" stencil = (value & 0xFF) << stencil_shift;\n"
|
||||
" atomicOr(data[stencil_offset + s_offset], stencil);\n";
|
||||
|
||||
cs_shuffle_base::build("");
|
||||
}
|
||||
cs_scatter_d24x8();
|
||||
};
|
||||
|
||||
template<bool _DepthFloat = false>
|
||||
@ -962,51 +614,11 @@ namespace vk
|
||||
u32 block_length = 0;
|
||||
u32 word_count = 0;
|
||||
|
||||
cs_aggregator()
|
||||
{
|
||||
ssbo_count = 2;
|
||||
cs_aggregator();
|
||||
|
||||
create();
|
||||
void bind_resources() override;
|
||||
|
||||
m_src =
|
||||
"#version 450\n"
|
||||
"layout(local_size_x = %ws, local_size_y = 1, local_size_z = 1) in;\n\n"
|
||||
|
||||
"layout(set=0, binding=0, std430) readonly buffer ssbo0{ uint src[]; };\n"
|
||||
"layout(set=0, binding=1, std430) writeonly buffer ssbo1{ uint result; };\n\n"
|
||||
|
||||
"void main()\n"
|
||||
"{\n"
|
||||
" if (gl_GlobalInvocationID.x < src.length())\n"
|
||||
" {\n"
|
||||
" atomicAdd(result, src[gl_GlobalInvocationID.x]);\n"
|
||||
" }\n"
|
||||
"}\n";
|
||||
|
||||
const std::pair<std::string, std::string> syntax_replace[] =
|
||||
{
|
||||
{ "%ws", std::to_string(optimal_group_size) },
|
||||
};
|
||||
|
||||
m_src = fmt::replace_all(m_src, syntax_replace);
|
||||
}
|
||||
|
||||
void bind_resources() override
|
||||
{
|
||||
m_program->bind_buffer({ src->value, 0, block_length }, 0, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, m_descriptor_set);
|
||||
m_program->bind_buffer({ dst->value, 0, 4 }, 1, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, m_descriptor_set);
|
||||
}
|
||||
|
||||
void run(VkCommandBuffer cmd, const vk::buffer* dst, const vk::buffer* src, u32 num_words)
|
||||
{
|
||||
this->dst = dst;
|
||||
this->src = src;
|
||||
word_count = num_words;
|
||||
block_length = num_words * 4;
|
||||
|
||||
const u32 linear_invocations = utils::aligned_div(word_count, optimal_group_size);
|
||||
compute_task::run(cmd, linear_invocations);
|
||||
}
|
||||
void run(VkCommandBuffer cmd, const vk::buffer* dst, const vk::buffer* src, u32 num_words);
|
||||
};
|
||||
|
||||
// TODO: Replace with a proper manager
|
||||
|
@ -10,7 +10,7 @@ namespace vk
|
||||
{
|
||||
std::unordered_map<u64, std::vector<std::unique_ptr<vk::framebuffer_holder>>> g_framebuffers_cache;
|
||||
|
||||
vk::framebuffer_holder *get_framebuffer(VkDevice dev, u16 width, u16 height, VkRenderPass renderpass, const std::vector<vk::image*>& image_list)
|
||||
vk::framebuffer_holder* get_framebuffer(VkDevice dev, u16 width, u16 height, VkRenderPass renderpass, const std::vector<vk::image*>& image_list)
|
||||
{
|
||||
u64 key = u64(width) | (u64(height) << 16);
|
||||
auto &queue = g_framebuffers_cache[key];
|
||||
|
1049
rpcs3/Emu/RSX/VK/VKOverlays.cpp
Normal file
1049
rpcs3/Emu/RSX/VK/VKOverlays.cpp
Normal file
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -1,6 +1,7 @@
|
||||
#include "stdafx.h"
|
||||
#include "VKGSRender.h"
|
||||
#include "vkutils/buffer_object.h"
|
||||
#include "Emu/RSX/Overlays/overlays.h"
|
||||
#include "Emu/Cell/Modules/cellVideoOut.h"
|
||||
|
||||
#include "util/asm.hpp"
|
||||
|
@ -3,6 +3,8 @@
|
||||
#include "VKCompute.h"
|
||||
#include "VKOverlays.h"
|
||||
|
||||
#include "vkutils/image.h"
|
||||
|
||||
namespace vk
|
||||
{
|
||||
struct cs_resolve_base : compute_task
|
||||
|
@ -87,9 +87,11 @@
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<ClCompile Include="Emu\RSX\GL\GLCommonDecompiler.cpp" />
|
||||
<ClCompile Include="Emu\RSX\GL\GLCompute.cpp" />
|
||||
<ClCompile Include="Emu\RSX\GL\GLDraw.cpp" />
|
||||
<ClCompile Include="Emu\RSX\GL\GLFragmentProgram.cpp" />
|
||||
<ClCompile Include="Emu\RSX\GL\GLGSRender.cpp" />
|
||||
<ClCompile Include="Emu\RSX\GL\GLOverlays.cpp" />
|
||||
<ClCompile Include="Emu\RSX\GL\GLPipelineCompiler.cpp" />
|
||||
<ClCompile Include="Emu\RSX\GL\GLVertexProgram.cpp" />
|
||||
<ClCompile Include="Emu\RSX\GL\GLHelpers.cpp" />
|
||||
|
@ -15,6 +15,8 @@
|
||||
<ClCompile Include="Emu\RSX\GL\GLVertexBuffers.cpp" />
|
||||
<ClCompile Include="Emu\RSX\GL\GLPipelineCompiler.cpp" />
|
||||
<ClCompile Include="Emu\RSX\GL\GLTextureCache.cpp" />
|
||||
<ClCompile Include="Emu\RSX\GL\GLOverlays.cpp" />
|
||||
<ClCompile Include="Emu\RSX\GL\GLCompute.cpp" />
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<ClInclude Include="Emu\RSX\GL\GLTexture.h" />
|
||||
|
@ -66,6 +66,7 @@
|
||||
<ItemGroup>
|
||||
<ClCompile Include="Emu\RSX\VK\VKCommandStream.cpp" />
|
||||
<ClCompile Include="Emu\RSX\VK\VKCommonDecompiler.cpp" />
|
||||
<ClCompile Include="Emu\RSX\VK\VKCompute.cpp" />
|
||||
<ClCompile Include="Emu\RSX\VK\VKDMA.cpp" />
|
||||
<ClCompile Include="Emu\RSX\VK\VKDraw.cpp" />
|
||||
<ClCompile Include="Emu\RSX\VK\VKFormats.cpp" />
|
||||
@ -73,6 +74,7 @@
|
||||
<ClCompile Include="Emu\RSX\VK\VKFramebuffer.cpp" />
|
||||
<ClCompile Include="Emu\RSX\VK\VKGSRender.cpp" />
|
||||
<ClCompile Include="Emu\RSX\VK\VKHelpers.cpp" />
|
||||
<ClCompile Include="Emu\RSX\VK\VKOverlays.cpp" />
|
||||
<ClCompile Include="Emu\RSX\VK\VKPipelineCompiler.cpp" />
|
||||
<ClCompile Include="Emu\RSX\VK\VKPresent.cpp" />
|
||||
<ClCompile Include="Emu\RSX\VK\VKProgramPipeline.cpp" />
|
||||
|
@ -62,6 +62,8 @@
|
||||
<ClCompile Include="Emu\RSX\VK\vkutils\image_helpers.cpp">
|
||||
<Filter>vkutils</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="Emu\RSX\VK\VKOverlays.cpp" />
|
||||
<ClCompile Include="Emu\RSX\VK\VKCompute.cpp" />
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<ClInclude Include="Emu\RSX\VK\VKCommonDecompiler.h" />
|
||||
|
Loading…
Reference in New Issue
Block a user