1
0
mirror of https://github.com/RPCS3/rpcs3.git synced 2024-11-22 18:53:28 +01:00

glsl: Refactoring, cleanup and optimizations

- Avoid generating unused code
- Reduce GPR usage in emitted code
This commit is contained in:
kd-11 2019-06-15 16:15:44 +03:00 committed by kd-11
parent c963c51a60
commit 6be7c58fa4
6 changed files with 145 additions and 122 deletions

View File

@ -229,23 +229,23 @@ namespace glsl
" bool modulo;\n"
"};\n\n"
"uint get_bits(uvec4 v, bool swap)\n"
"uint get_bits(uint x, uint y, uint z, uint w, bool swap)\n"
"{\n"
" if (swap) return (v.w | v.z << 8 | v.y << 16 | v.x << 24);\n"
" return (v.x | v.y << 8 | v.z << 16 | v.w << 24);\n"
" if (swap) return (w | z << 8 | y << 16 | x << 24);\n"
" return (x | y << 8 | z << 16 | w << 24);\n"
"}\n\n"
"uint get_bits(uvec2 v, bool swap)\n"
"uint get_bits(uint x, uint y, bool swap)\n"
"{\n"
" if (swap) return (v.y | v.x << 8);\n"
" return (v.x | v.y << 8);\n"
" if (swap) return (y | x << 8);\n"
" return (x | y << 8);\n"
"}\n\n"
"int preserve_sign_s16(uint bits)\n"
"{\n"
" //convert raw 16 bit value into signed 32-bit integer counterpart\n"
" uint sign = bits & 0x8000;\n"
" if (sign != 0) return int(bits | 0xFFFF0000);\n"
" if (sign != 0) bits |= 0xFFFF0000;\n"
" return int(bits);\n"
"}\n\n"
@ -282,7 +282,7 @@ namespace glsl
{
OS <<
"#define mov(v, i, s) v[i] = s\n"
"#define ref(v, i) v[i]\n";
"#define ref(v, i) v[i]\n\n";
}
OS <<
@ -290,70 +290,67 @@ namespace glsl
"{\n"
" vec4 result = vec4(0., 0., 0., 1.);\n"
" vec4 scale = vec4(1.);\n"
" uvec4 tmp;\n"
" uint bits;\n"
" bool reverse_order = false;\n"
"\n"
" int first_byte = int((vertex_id * desc.stride) + desc.starting_offset);\n"
" for (int n = 0; n < 4; n++)\n"
" const int elem_size_table[] = { 2, 4, 2, 1, 2, 4, 1 };\n"
" const int elem_size = elem_size_table[desc.type];\n"
" uvec4 tmp;\n"
"\n"
" int n;\n"
" int i = int((vertex_id * desc.stride) + desc.starting_offset);\n"
"\n"
" for (n = 0; n < desc.attribute_size; n++)\n"
" {\n"
" if (n == desc.attribute_size) break;\n"
" tmp.x = texelFetch(input_stream, i++).x;\n"
" if (elem_size == 2)\n"
" {\n"
" tmp.y = texelFetch(input_stream, i++).x;\n"
" tmp.x = get_bits(tmp.x, tmp.y, desc.swap_bytes);\n"
" }\n"
" else if (elem_size == 4)\n"
" {\n"
" tmp.y = texelFetch(input_stream, i++).x;\n"
" tmp.z = texelFetch(input_stream, i++).x;\n"
" tmp.w = texelFetch(input_stream, i++).x;\n"
" tmp.x = get_bits(tmp.x, tmp.y, tmp.z, tmp.w, desc.swap_bytes);\n"
" }\n"
"\n"
" switch (desc.type)\n"
" {\n"
" case 0:\n"
" //signed normalized 16-bit\n"
" tmp.x = texelFetch(input_stream, first_byte++).x;\n"
" tmp.y = texelFetch(input_stream, first_byte++).x;\n"
" mov(result, n, get_s16(tmp.xy, desc.swap_bytes));\n"
" mov(scale, n, 32767.);\n"
" case 4:\n"
" //signed word\n"
" mov(result, n, preserve_sign_s16(tmp.x));\n"
" break;\n"
" case 1:\n"
" //float\n"
" tmp.x = texelFetch(input_stream, first_byte++).x;\n"
" tmp.y = texelFetch(input_stream, first_byte++).x;\n"
" tmp.z = texelFetch(input_stream, first_byte++).x;\n"
" tmp.w = texelFetch(input_stream, first_byte++).x;\n"
" mov(result, n, uintBitsToFloat(get_bits(tmp, desc.swap_bytes)));\n"
" mov(result, n, uintBitsToFloat(tmp.x));\n"
" break;\n"
" case 2:\n"
" //half\n"
" tmp.x = texelFetch(input_stream, first_byte++).x;\n"
" tmp.y = texelFetch(input_stream, first_byte++).x;\n"
" mov(result, n, unpackHalf2x16(uint(get_bits(tmp.xy, desc.swap_bytes))).x);\n"
" mov(result, n, unpackHalf2x16(tmp.x).x);\n"
" break;\n"
" case 3:\n"
" //unsigned byte\n"
" mov(result, n, texelFetch(input_stream, first_byte++).x);\n"
" mov(scale, n, 255.);\n"
" case 6:\n"
" //ub256\n"
" mov(result, n, tmp.x);\n"
" reverse_order = desc.swap_bytes;\n"
" break;\n"
" case 4:\n"
" //signed word\n"
" tmp.x = texelFetch(input_stream, first_byte++).x;\n"
" tmp.y = texelFetch(input_stream, first_byte++).x;\n"
" mov(result, n, get_s16(tmp.xy, desc.swap_bytes));\n"
" break;\n"
" case 5:\n"
" //cmp\n"
" tmp.x = texelFetch(input_stream, first_byte++).x;\n"
" tmp.y = texelFetch(input_stream, first_byte++).x;\n"
" tmp.z = texelFetch(input_stream, first_byte++).x;\n"
" tmp.w = texelFetch(input_stream, first_byte++).x;\n"
" bits = get_bits(tmp, desc.swap_bytes);\n"
" result.x = preserve_sign_s16((bits & 0x7FF) << 5);\n"
" result.y = preserve_sign_s16(((bits >> 11) & 0x7FF) << 5);\n"
" result.z = preserve_sign_s16(((bits >> 22) & 0x3FF) << 6);\n"
" result.x = preserve_sign_s16((tmp.x & 0x7FF) << 5);\n"
" result.y = preserve_sign_s16(((tmp.x >> 11) & 0x7FF) << 5);\n"
" result.z = preserve_sign_s16(((tmp.x >> 22) & 0x3FF) << 6);\n"
" result.w = 1.;\n"
" scale = vec4(32767., 32767., 32767., 1.);\n"
" break;\n"
" case 6:\n"
" //ub256\n"
" mov(result, n, float(texelFetch(input_stream, first_byte++).x));\n"
" reverse_order = desc.swap_bytes;\n"
" break;\n"
" }\n"
" }\n\n"
" }\n"
"\n"
" result /= scale;\n"
" return (reverse_order)? result.wzyx: result;\n"
"}\n\n"
@ -410,17 +407,14 @@ namespace glsl
" {\n"
" vertex_id = 0;\n"
" }\n"
" else if (desc.frequency > 1)\n"
" else if (desc.modulo)\n"
" {\n"
" //if a vertex modifier is active; vertex_base must be 0 and is ignored\n"
" if (desc.modulo)\n"
" {\n"
" vertex_id = (" << vertex_id_name << " + int(vertex_index_offset)) % int(desc.frequency);\n"
" }\n"
" else\n"
" {\n"
" vertex_id = vertex_id / int(desc.frequency); \n"
" }\n"
" vertex_id = (" << vertex_id_name << " + int(vertex_index_offset)) % int(desc.frequency);\n"
" }\n"
" else\n"
" {\n"
" vertex_id /= int(desc.frequency); \n"
" }\n"
"\n"
" if (desc.is_volatile)\n"
@ -430,7 +424,7 @@ namespace glsl
"}\n\n";
}
static void insert_rop(std::ostream& OS, bool _32_bit_exports, bool native_half_support)
static void insert_rop(std::ostream& OS, bool _32_bit_exports, bool native_half_support, bool emulate_coverage_tests)
{
const std::string reg0 = _32_bit_exports ? "r0" : "h0";
const std::string reg1 = _32_bit_exports ? "r2" : "h4";
@ -442,18 +436,33 @@ namespace glsl
" if ((rop_control & 0xFF) != 0)\n"
" {\n"
" bool alpha_test = (rop_control & 0x1) > 0;\n"
" uint alpha_func = ((rop_control >> 16) & 0x7);\n"
" bool srgb_convert = (rop_control & 0x2) > 0;\n\n"
" bool a2c_enabled = (rop_control & 0x10) > 0;\n"
" uint alpha_func = ((rop_control >> 16) & 0x7);\n";
if (!_32_bit_exports)
{
OS << " bool srgb_convert = (rop_control & 0x2) > 0;\n\n";
}
if (emulate_coverage_tests)
{
OS << " bool a2c_enabled = (rop_control & 0x10) > 0;\n";
}
OS <<
" if (alpha_test && !comparison_passes(" << reg0 << ".a, alpha_ref, alpha_func))\n"
" {\n"
" discard;\n"
" }\n"
" else if (a2c_enabled && !coverage_test_passes(" << reg0 << ", rop_control >> 5))\n"
" {\n"
" discard;\n"
" }\n";
if (emulate_coverage_tests)
{
OS <<
" else if (a2c_enabled && !coverage_test_passes(" << reg0 << ", rop_control >> 5))\n"
" {\n"
" discard;\n"
" }\n";
}
if (!_32_bit_exports)
{
// Tested using NPUB90375; some shaders (32-bit output only?) do not obey srgb flags
@ -535,41 +544,42 @@ namespace glsl
program_common::insert_compare_op(OS, props.low_precision_tests);
if (props.require_texture_ops && props.emulate_shadow_compare)
if (props.require_shadow_ops && props.emulate_shadow_compare)
{
program_common::insert_compare_op_vector(OS);
}
// NOTES:
// Lowers alpha accuracy down to 2 bits, to mimic A2C banding
// Alpha lower than the real threshold (e.g. 0.25 for 4 samples) gets a randomized chance to make it to the lowest transparency state
// Helps to avoid A2C tested foliage disappearing in the distance
OS <<
"bool coverage_test_passes(/*inout*/in vec4 _sample, uint control)\n"
"{\n"
" if ((control & 0x1) == 0) return false;\n"
"\n"
" float samples = ((control & 0x2) != 0)? 4.f : 2.f;\n"
" float hash = _saturate(_rand(gl_FragCoord) + 0.5f) * 0.9f;\n"
" float epsilon = hash / samples;\n"
" float alpha = trunc((_sample.a + epsilon) * samples) / samples;\n"
" //_sample.a = min(_sample.a, alpha);\n" // Cannot blend A2C samples naively as they are order independent! Causes background bleeding
" return (alpha > 0.f);\n"
"}\n\n"
if (props.emulate_coverage_tests)
{
// NOTES:
// Lowers alpha accuracy down to 2 bits, to mimic A2C banding
// Alpha lower than the real threshold (e.g. 0.25 for 4 samples) gets a randomized chance to make it to the lowest transparency state
// Helps to avoid A2C tested foliage disappearing in the distance
OS <<
"bool coverage_test_passes(/*inout*/in vec4 _sample, uint control)\n"
"{\n"
" if ((control & 0x1) == 0) return false;\n"
"\n"
" float samples = ((control & 0x2) != 0)? 4.f : 2.f;\n"
" float hash = _saturate(_rand(gl_FragCoord) + 0.5f) * 0.9f;\n"
" float epsilon = hash / samples;\n"
" float alpha = trunc((_sample.a + epsilon) * samples) / samples;\n"
" //_sample.a = min(_sample.a, alpha);\n" // Cannot blend A2C samples naively as they are order independent! Causes background bleeding
" return (alpha > 0.f);\n"
"}\n\n";
}
"vec4 linear_to_srgb(vec4 cl)\n"
"{\n"
" vec4 low = cl * 12.92;\n"
" vec4 high = 1.055 * pow(cl, vec4(1. / 2.4)) - 0.055;\n"
" bvec4 select = lessThan(cl, vec4(0.0031308));\n"
" return clamp(mix(high, low, select), 0., 1.);\n"
"}\n\n"
"float srgb_to_linear(float cs)\n"
"{\n"
" if (cs <= 0.04045) return cs / 12.92;\n"
" return pow((cs + 0.055) / 1.055, 2.4);\n"
"}\n\n";
if (!props.fp32_outputs)
{
OS <<
"vec4 linear_to_srgb(vec4 cl)\n"
"{\n"
" vec4 low = cl * 12.92;\n"
" vec4 high = 1.055 * pow(cl, vec4(1. / 2.4)) - 0.055;\n"
" bvec4 select = lessThan(cl, vec4(0.0031308));\n"
" return clamp(mix(high, low, select), 0., 1.);\n"
"}\n\n";
}
if (props.require_depth_conversion)
{
@ -617,7 +627,7 @@ namespace glsl
if (props.require_texture_ops)
{
if (props.emulate_shadow_compare)
if (props.require_shadow_ops && props.emulate_shadow_compare)
{
OS <<
"vec4 shadowCompare(sampler2D tex, vec3 p, uint func)\n"
@ -648,6 +658,12 @@ namespace glsl
" return mix(direct, indexed, choice);\n"
"}\n\n"
#endif
"vec4 srgb_to_linear(vec4 cs)\n"
"{\n"
" vec4 a = cs / 12.92;\n"
" vec4 b = pow((cs + 0.055) / 1.055, vec4(2.4));\n"
" return _select(a, b, greaterThan(cs, vec4(0.04045)));\n"
"}\n\n"
//TODO: Move all the texture read control operations here
"vec4 process_texel(vec4 rgba, uint control_bits)\n"
@ -656,23 +672,25 @@ namespace glsl
" uint remap_bits = (control_bits >> 16) & 0xFFFF;\n"
" if (remap_bits != 0x8D5) rgba = remap_vector(rgba, remap_bits);\n\n"
#endif
" if ((control_bits & 0xFF) == 0) return rgba;\n\n"
" if ((control_bits & 0x10) > 0)\n"
" if (control_bits == 0)\n"
" {\n"
" //Alphakill\n"
" if (rgba.a < 0.0000000001)\n"
" return rgba;\n"
" }\n"
"\n"
" if ((control_bits & 0x10) != 0)\n"
" {\n"
" // Alphakill\n"
" if (rgba.a < 0.000001)\n"
" {\n"
" discard;\n"
" return rgba;\n"
" }\n"
" }\n\n"
" }\n"
"\n"
" //TODO: Verify gamma control bit ordering, looks to be 0x7 for rgb, 0xF for rgba\n"
" uint srgb_in = (control_bits & 0xF);\n"
" if ((srgb_in & 0x1) > 0) rgba.r = srgb_to_linear(rgba.r);\n"
" if ((srgb_in & 0x2) > 0) rgba.g = srgb_to_linear(rgba.g);\n"
" if ((srgb_in & 0x4) > 0) rgba.b = srgb_to_linear(rgba.b);\n"
" if ((srgb_in & 0x8) > 0) rgba.a = srgb_to_linear(rgba.a);\n"
" return rgba;\n"
" uvec4 mask = uvec4(control_bits & 0xF) & uvec4(0x1, 0x2, 0x4, 0x8);\n"
" vec4 convert = srgb_to_linear(rgba);\n"
" return _select(rgba, convert, notEqual(mask, uvec4(0)));\n"
"}\n\n"
"#define TEX_NAME(index) tex##index\n"

View File

@ -1,4 +1,4 @@
#pragma once
#pragma once
namespace glsl
{
@ -22,9 +22,12 @@ namespace glsl
bool require_lit_emulation;
// Only relevant for fragment programs
bool fp32_outputs;
bool require_wpos;
bool require_depth_conversion;
bool require_texture_ops;
bool require_shadow_ops;
bool emulate_coverage_tests;
bool emulate_shadow_compare;
bool low_precision_tests;
};

View File

@ -199,9 +199,12 @@ void GLFragmentDecompilerThread::insertGlobalFunctions(std::stringstream &OS)
glsl::shader_properties properties2;
properties2.domain = glsl::glsl_fragment_program;
properties2.require_lit_emulation = properties.has_lit_op;
properties2.fp32_outputs = !!(m_prog.ctrl & CELL_GCM_SHADER_CONTROL_32_BITS_EXPORTS);
properties2.require_depth_conversion = m_prog.redirected_textures != 0;
properties2.require_wpos = properties.has_wpos_input;
properties2.require_texture_ops = properties.has_tex_op;
properties2.require_shadow_ops = m_prog.shadow_textures != 0;
properties2.emulate_coverage_tests = g_cfg.video.antialiasing_level == msaa_level::none;
properties2.emulate_shadow_compare = device_props.emulate_depth_compare;
properties2.low_precision_tests = ::gl::get_driver_caps().vendor_NVIDIA;
@ -350,7 +353,11 @@ void GLFragmentDecompilerThread::insertMainEnd(std::stringstream & OS)
OS << "\n" << " fs_main(" + parameters + ");\n\n";
glsl::insert_rop(OS, !!(m_ctrl & CELL_GCM_SHADER_CONTROL_32_BITS_EXPORTS), device_props.has_native_half_support);
glsl::insert_rop(
OS,
!!(m_ctrl & CELL_GCM_SHADER_CONTROL_32_BITS_EXPORTS),
device_props.has_native_half_support,
g_cfg.video.antialiasing_level == msaa_level::none);
if (m_ctrl & CELL_GCM_SHADER_CONTROL_DEPTH_EXPORT)
{

View File

@ -157,15 +157,9 @@ void GLVertexDecompilerThread::insertMainStart(std::stringstream & OS)
{
const auto& dev_caps = gl::get_driver_caps();
glsl::shader_properties properties2;
glsl::shader_properties properties2{};
properties2.domain = glsl::glsl_vertex_program;
properties2.require_lit_emulation = properties.has_lit_op;
// Unused
properties2.require_depth_conversion = false;
properties2.require_wpos = false;
properties2.require_texture_ops = false;
properties2.emulate_shadow_compare = false;
properties2.low_precision_tests = false;
insert_glsl_legacy_function(OS, properties2);
glsl::insert_vertex_input_fetch(OS, glsl::glsl_rules_opengl4, dev_caps.vendor_INTEL == false);

View File

@ -229,9 +229,12 @@ void VKFragmentDecompilerThread::insertGlobalFunctions(std::stringstream &OS)
glsl::shader_properties properties2;
properties2.domain = glsl::glsl_fragment_program;
properties2.require_lit_emulation = properties.has_lit_op;
properties2.fp32_outputs = !!(m_prog.ctrl & CELL_GCM_SHADER_CONTROL_32_BITS_EXPORTS);
properties2.require_depth_conversion = m_prog.redirected_textures != 0;
properties2.require_wpos = properties.has_wpos_input;
properties2.require_texture_ops = properties.has_tex_op;
properties2.require_shadow_ops = m_prog.shadow_textures != 0;
properties2.emulate_coverage_tests = g_cfg.video.antialiasing_level == msaa_level::none;
properties2.emulate_shadow_compare = device_props.emulate_depth_compare;
properties2.low_precision_tests = vk::get_driver_vendor() == vk::driver_vendor::NVIDIA;
@ -383,7 +386,11 @@ void VKFragmentDecompilerThread::insertMainEnd(std::stringstream & OS)
OS << "\n" << " fs_main(" + parameters + ");\n\n";
glsl::insert_rop(OS, !!(m_ctrl & CELL_GCM_SHADER_CONTROL_32_BITS_EXPORTS), device_props.has_native_half_support);
glsl::insert_rop(
OS,
!!(m_ctrl & CELL_GCM_SHADER_CONTROL_32_BITS_EXPORTS),
device_props.has_native_half_support,
g_cfg.video.antialiasing_level == msaa_level::none);
if (m_ctrl & CELL_GCM_SHADER_CONTROL_DEPTH_EXPORT)
{

View File

@ -45,7 +45,7 @@ void VKVertexDecompilerThread::insertHeader(std::stringstream &OS)
OS << "layout(std140, set = 0, binding = 1) uniform VertexLayoutBuffer\n";
OS << "{\n";
OS << " uint vertex_base_index;\n";
OS << " uint vertex_index_offset;\n";
OS << " uint vertex_index_offset;\n";
OS << " uvec4 input_attributes_blob[16 / 2];\n";
OS << "};\n\n";
@ -193,15 +193,9 @@ void VKVertexDecompilerThread::insertOutputs(std::stringstream & OS, const std::
void VKVertexDecompilerThread::insertMainStart(std::stringstream & OS)
{
glsl::shader_properties properties2;
glsl::shader_properties properties2{};
properties2.domain = glsl::glsl_vertex_program;
properties2.require_lit_emulation = properties.has_lit_op;
// Unused
properties2.require_depth_conversion = false;
properties2.require_wpos = false;
properties2.require_texture_ops = false;
properties2.emulate_shadow_compare = false;
properties2.low_precision_tests = false;
glsl::insert_glsl_legacy_function(OS, properties2);
glsl::insert_vertex_input_fetch(OS, glsl::glsl_rules_spirv);