glsl: Refactoring, cleanup and optimizations

- Avoid generating unused code
- Reduce GPR usage in emitted code
2024-11-22 18:53:28 +01:00 · 2019-06-15 16:15:44 +03:00 · 2019-06-15 16:15:44 +03:00 · 6be7c58fa4
commit 6be7c58fa4
parent c963c51a60
6 changed files with 145 additions and 122 deletions
--- a/rpcs3/Emu/RSX/Common/GLSLCommon.h
+++ b/rpcs3/Emu/RSX/Common/GLSLCommon.h
@ -229,23 +229,23 @@ namespace glsl
 		"	bool modulo;\n"
 		"};\n\n"

-		"uint get_bits(uvec4 v, bool swap)\n"
+		"uint get_bits(uint x, uint y, uint z, uint w, bool swap)\n"
 		"{\n"
-		"	if (swap) return (v.w | v.z << 8 | v.y << 16 | v.x << 24);\n"
-		"	return (v.x | v.y << 8 | v.z << 16 | v.w << 24);\n"
+		"	if (swap) return (w | z << 8 | y << 16 | x << 24);\n"
+		"	return (x | y << 8 | z << 16 | w << 24);\n"
 		"}\n\n"

-		"uint get_bits(uvec2 v, bool swap)\n"
+		"uint get_bits(uint x, uint y, bool swap)\n"
 		"{\n"
-		"	if (swap) return (v.y | v.x << 8);\n"
-		"	return (v.x | v.y << 8);\n"
+		"	if (swap) return (y | x << 8);\n"
+		"	return (x | y << 8);\n"
 		"}\n\n"

 		"int preserve_sign_s16(uint bits)\n"
 		"{\n"
 		"	//convert raw 16 bit value into signed 32-bit integer counterpart\n"
 		"	uint sign = bits & 0x8000;\n"
-		"	if (sign != 0) return int(bits | 0xFFFF0000);\n"
+		"	if (sign != 0) bits |= 0xFFFF0000;\n"
 		"	return int(bits);\n"
 		"}\n\n"

@ -282,7 +282,7 @@ namespace glsl
 		{
 			OS <<
 			"#define mov(v, i, s) v[i] = s\n"
-			"#define ref(v, i) v[i]\n";
+			"#define ref(v, i) v[i]\n\n";
 		}

 		OS <<
@ -290,70 +290,67 @@ namespace glsl
 		"{\n"
 		"	vec4 result = vec4(0., 0., 0., 1.);\n"
 		"	vec4 scale = vec4(1.);\n"
-		"	uvec4 tmp;\n"
-		"	uint bits;\n"
 		"	bool reverse_order = false;\n"
 		"\n"
-		"	int first_byte = int((vertex_id * desc.stride) + desc.starting_offset);\n"
-		"	for (int n = 0; n < 4; n++)\n"
+		"	const int elem_size_table[] = { 2, 4, 2, 1, 2, 4, 1 };\n"
+		"	const int elem_size = elem_size_table[desc.type];\n"
+		"	uvec4 tmp;\n"
+		"\n"
+		"	int n;\n"
+		"	int i = int((vertex_id * desc.stride) + desc.starting_offset);\n"
+		"\n"
+		"	for (n = 0; n < desc.attribute_size; n++)\n"
 		"	{\n"
-		"		if (n == desc.attribute_size) break;\n"
+		"		tmp.x = texelFetch(input_stream, i++).x;\n"
+		"		if (elem_size == 2)\n"
+		"		{\n"
+		"			tmp.y = texelFetch(input_stream, i++).x;\n"
+		"			tmp.x = get_bits(tmp.x, tmp.y, desc.swap_bytes);\n"
+		"		}\n"
+		"		else if (elem_size == 4)\n"
+		"		{\n"
+		"			tmp.y = texelFetch(input_stream, i++).x;\n"
+		"			tmp.z = texelFetch(input_stream, i++).x;\n"
+		"			tmp.w = texelFetch(input_stream, i++).x;\n"
+		"			tmp.x = get_bits(tmp.x, tmp.y, tmp.z, tmp.w, desc.swap_bytes);\n"
+		"		}\n"
 		"\n"
 		"		switch (desc.type)\n"
 		"		{\n"
 		"		case 0:\n"
 		"			//signed normalized 16-bit\n"
-		"			tmp.x = texelFetch(input_stream, first_byte++).x;\n"
-		"			tmp.y = texelFetch(input_stream, first_byte++).x;\n"
-		"			mov(result, n, get_s16(tmp.xy, desc.swap_bytes));\n"
 		"			mov(scale, n, 32767.);\n"
+		"		case 4:\n"
+		"			//signed word\n"
+		"			mov(result, n, preserve_sign_s16(tmp.x));\n"
 		"			break;\n"
 		"		case 1:\n"
 		"			//float\n"
-		"			tmp.x = texelFetch(input_stream, first_byte++).x;\n"
-		"			tmp.y = texelFetch(input_stream, first_byte++).x;\n"
-		"			tmp.z = texelFetch(input_stream, first_byte++).x;\n"
-		"			tmp.w = texelFetch(input_stream, first_byte++).x;\n"
-		"			mov(result, n, uintBitsToFloat(get_bits(tmp, desc.swap_bytes)));\n"
+		"			mov(result, n, uintBitsToFloat(tmp.x));\n"
 		"			break;\n"
 		"		case 2:\n"
 		"			//half\n"
-		"			tmp.x = texelFetch(input_stream, first_byte++).x;\n"
-		"			tmp.y = texelFetch(input_stream, first_byte++).x;\n"
-		"			mov(result, n, unpackHalf2x16(uint(get_bits(tmp.xy, desc.swap_bytes))).x);\n"
+		"			mov(result, n, unpackHalf2x16(tmp.x).x);\n"
 		"			break;\n"
 		"		case 3:\n"
 		"			//unsigned byte\n"
-		"			mov(result, n, texelFetch(input_stream, first_byte++).x);\n"
 		"			mov(scale, n, 255.);\n"
+		"		case 6:\n"
+		"			//ub256\n"
+		"			mov(result, n, tmp.x);\n"
 		"			reverse_order = desc.swap_bytes;\n"
 		"			break;\n"
-		"		case 4:\n"
-		"			//signed word\n"
-		"			tmp.x = texelFetch(input_stream, first_byte++).x;\n"
-		"			tmp.y = texelFetch(input_stream, first_byte++).x;\n"
-		"			mov(result, n, get_s16(tmp.xy, desc.swap_bytes));\n"
-		"			break;\n"
 		"		case 5:\n"
 		"			//cmp\n"
-		"			tmp.x = texelFetch(input_stream, first_byte++).x;\n"
-		"			tmp.y = texelFetch(input_stream, first_byte++).x;\n"
-		"			tmp.z = texelFetch(input_stream, first_byte++).x;\n"
-		"			tmp.w = texelFetch(input_stream, first_byte++).x;\n"
-		"			bits = get_bits(tmp, desc.swap_bytes);\n"
-		"			result.x = preserve_sign_s16((bits & 0x7FF) << 5);\n"
-		"			result.y = preserve_sign_s16(((bits >> 11) & 0x7FF) << 5);\n"
-		"			result.z = preserve_sign_s16(((bits >> 22) & 0x3FF) << 6);\n"
+		"			result.x = preserve_sign_s16((tmp.x & 0x7FF) << 5);\n"
+		"			result.y = preserve_sign_s16(((tmp.x >> 11) & 0x7FF) << 5);\n"
+		"			result.z = preserve_sign_s16(((tmp.x >> 22) & 0x3FF) << 6);\n"
 		"			result.w = 1.;\n"
 		"			scale = vec4(32767., 32767., 32767., 1.);\n"
 		"			break;\n"
-		"		case 6:\n"
-		"			//ub256\n"
-		"			mov(result, n, float(texelFetch(input_stream, first_byte++).x));\n"
-		"			reverse_order = desc.swap_bytes;\n"
-		"			break;\n"
 		"		}\n"
-		"	}\n\n"
+		"	}\n"
+		"\n"
 		"	result /= scale;\n"
 		"	return (reverse_order)? result.wzyx: result;\n"
 		"}\n\n"
@ -410,17 +407,14 @@ namespace glsl
 		"	{\n"
 		"		vertex_id = 0;\n"
 		"	}\n"
-		"	else if (desc.frequency > 1)\n"
+		"	else if (desc.modulo)\n"
 		"	{\n"
 		"		//if a vertex modifier is active; vertex_base must be 0 and is ignored\n"
-		"		if (desc.modulo)\n"
-		"		{\n"
-		"			vertex_id = (" << vertex_id_name << " + int(vertex_index_offset)) % int(desc.frequency);\n"
-		"		}\n"
-		"		else\n"
-		"		{\n"
-		"			vertex_id = vertex_id / int(desc.frequency); \n"
-		"		}\n"
+		"		vertex_id = (" << vertex_id_name << " + int(vertex_index_offset)) % int(desc.frequency);\n"
+		"	}\n"
+		"	else\n"
+		"	{\n"
+		"		vertex_id /= int(desc.frequency); \n"
 		"	}\n"
 		"\n"
 		"	if (desc.is_volatile)\n"
@ -430,7 +424,7 @@ namespace glsl
 		"}\n\n";
 	}

-	static void insert_rop(std::ostream& OS, bool _32_bit_exports, bool native_half_support)
+	static void insert_rop(std::ostream& OS, bool _32_bit_exports, bool native_half_support, bool emulate_coverage_tests)
 	{
 		const std::string reg0 = _32_bit_exports ? "r0" : "h0";
 		const std::string reg1 = _32_bit_exports ? "r2" : "h4";
@ -442,18 +436,33 @@ namespace glsl
 		"	if ((rop_control & 0xFF) != 0)\n"
 		"	{\n"
 		"		bool alpha_test = (rop_control & 0x1) > 0;\n"
-		"		uint alpha_func = ((rop_control >> 16) & 0x7);\n"
-		"		bool srgb_convert = (rop_control & 0x2) > 0;\n\n"
-		"		bool a2c_enabled = (rop_control & 0x10) > 0;\n"
+		"		uint alpha_func = ((rop_control >> 16) & 0x7);\n";
+
+		if (!_32_bit_exports)
+		{
+			OS << "		bool srgb_convert = (rop_control & 0x2) > 0;\n\n";
+		}
+
+		if (emulate_coverage_tests)
+		{
+			OS << "		bool a2c_enabled = (rop_control & 0x10) > 0;\n";
+		}
+
+		OS <<
 		"		if (alpha_test && !comparison_passes(" << reg0 << ".a, alpha_ref, alpha_func))\n"
 		"		{\n"
 		"			discard;\n"
-		"		}\n"
-		"		else if (a2c_enabled && !coverage_test_passes(" << reg0 << ", rop_control >> 5))\n"
-		"		{\n"
-		"			discard;\n"
 		"		}\n";

+		if (emulate_coverage_tests)
+		{
+			OS <<
+			"		else if (a2c_enabled && !coverage_test_passes(" << reg0 << ", rop_control >> 5))\n"
+			"		{\n"
+			"			discard;\n"
+			"		}\n";
+		}
+
 		if (!_32_bit_exports)
 		{
 			// Tested using NPUB90375; some shaders (32-bit output only?) do not obey srgb flags
@ -535,41 +544,42 @@ namespace glsl

 		program_common::insert_compare_op(OS, props.low_precision_tests);

-		if (props.require_texture_ops && props.emulate_shadow_compare)
+		if (props.require_shadow_ops && props.emulate_shadow_compare)
 		{
 			program_common::insert_compare_op_vector(OS);
 		}

-		// NOTES:
-		// Lowers alpha accuracy down to 2 bits, to mimic A2C banding
-		// Alpha lower than the real threshold (e.g 0.25 for 4 samples) gets a randomized chance to make it to the lowest transparency state
-		// Helps to avoid A2C tested foliage disappearing in the distance
-		OS <<
-		"bool coverage_test_passes(/*inout*/in vec4 _sample, uint control)\n"
-		"{\n"
-		"	if ((control & 0x1) == 0) return false;\n"
-		"\n"
-		"	float samples = ((control & 0x2) != 0)? 4.f : 2.f;\n"
-		"	float hash    = _saturate(_rand(gl_FragCoord) + 0.5f) * 0.9f;\n"
-		"	float epsilon = hash / samples;\n"
-		"	float alpha   = trunc((_sample.a + epsilon) * samples) / samples;\n"
-		"	//_sample.a     = min(_sample.a, alpha);\n" // Cannot blend A2C samples naively as they are order independent! Causes background bleeding
-		"	return (alpha > 0.f);\n"
-		"}\n\n"
+		if (props.emulate_coverage_tests)
+		{
+			// NOTES:
+			// Lowers alpha accuracy down to 2 bits, to mimic A2C banding
+			// Alpha lower than the real threshold (e.g 0.25 for 4 samples) gets a randomized chance to make it to the lowest transparency state
+			// Helps to avoid A2C tested foliage disappearing in the distance
+			OS <<
+			"bool coverage_test_passes(/*inout*/in vec4 _sample, uint control)\n"
+			"{\n"
+			"	if ((control & 0x1) == 0) return false;\n"
+			"\n"
+			"	float samples = ((control & 0x2) != 0)? 4.f : 2.f;\n"
+			"	float hash    = _saturate(_rand(gl_FragCoord) + 0.5f) * 0.9f;\n"
+			"	float epsilon = hash / samples;\n"
+			"	float alpha   = trunc((_sample.a + epsilon) * samples) / samples;\n"
+			"	//_sample.a     = min(_sample.a, alpha);\n" // Cannot blend A2C samples naively as they are order independent! Causes background bleeding
+			"	return (alpha > 0.f);\n"
+			"}\n\n";
+		}

-		"vec4 linear_to_srgb(vec4 cl)\n"
-		"{\n"
-		"	vec4 low = cl * 12.92;\n"
-		"	vec4 high = 1.055 * pow(cl, vec4(1. / 2.4)) - 0.055;\n"
-		"	bvec4 select = lessThan(cl, vec4(0.0031308));\n"
-		"	return clamp(mix(high, low, select), 0., 1.);\n"
-		"}\n\n"
-
-		"float srgb_to_linear(float cs)\n"
-		"{\n"
-		"	if (cs <= 0.04045) return cs / 12.92;\n"
-		"	return pow((cs + 0.055) / 1.055, 2.4);\n"
-		"}\n\n";
+		if (!props.fp32_outputs)
+		{
+			OS <<
+			"vec4 linear_to_srgb(vec4 cl)\n"
+			"{\n"
+			"	vec4 low = cl * 12.92;\n"
+			"	vec4 high = 1.055 * pow(cl, vec4(1. / 2.4)) - 0.055;\n"
+			"	bvec4 select = lessThan(cl, vec4(0.0031308));\n"
+			"	return clamp(mix(high, low, select), 0., 1.);\n"
+			"}\n\n";
+		}

 		if (props.require_depth_conversion)
 		{
@ -617,7 +627,7 @@ namespace glsl

 		if (props.require_texture_ops)
 		{
-			if (props.emulate_shadow_compare)
+			if (props.require_shadow_ops && props.emulate_shadow_compare)
 			{
 				OS <<
 				"vec4 shadowCompare(sampler2D tex, vec3 p, uint func)\n"
@ -648,6 +658,12 @@ namespace glsl
 			"	return mix(direct, indexed, choice);\n"
 			"}\n\n"
 #endif
+			"vec4 srgb_to_linear(vec4 cs)\n"
+			"{\n"
+			"	vec4 a = cs / 12.92;\n"
+			"	vec4 b = pow((cs + 0.055) / 1.055, vec4(2.4));\n"
+			"	return _select(a, b, greaterThan(cs, vec4(0.04045)));\n"
+			"}\n\n"

 			//TODO: Move all the texture read control operations here
 			"vec4 process_texel(vec4 rgba, uint control_bits)\n"
@ -656,23 +672,25 @@ namespace glsl
 			"	uint remap_bits = (control_bits >> 16) & 0xFFFF;\n"
 			"	if (remap_bits != 0x8D5) rgba = remap_vector(rgba, remap_bits);\n\n"
 #endif
-			"	if ((control_bits & 0xFF) == 0) return rgba;\n\n"
-			"	if ((control_bits & 0x10) > 0)\n"
+			"	if (control_bits == 0)\n"
 			"	{\n"
-			"		//Alphakill\n"
-			"		if (rgba.a < 0.0000000001)\n"
+			"		return rgba;\n"
+			"	}\n"
+			"\n"
+			"	if ((control_bits & 0x10) != 0)\n"
+			"	{\n"
+			"		// Alphakill\n"
+			"		if (rgba.a < 0.000001)\n"
 			"		{\n"
 			"			discard;\n"
 			"			return rgba;\n"
 			"		}\n"
-			"	}\n\n"
+			"	}\n"
+			"\n"
 			"	//TODO: Verify gamma control bit ordering, looks to be 0x7 for rgb, 0xF for rgba\n"
-			"	uint srgb_in = (control_bits & 0xF);\n"
-			"	if ((srgb_in & 0x1) > 0) rgba.r = srgb_to_linear(rgba.r);\n"
-			"	if ((srgb_in & 0x2) > 0) rgba.g = srgb_to_linear(rgba.g);\n"
-			"	if ((srgb_in & 0x4) > 0) rgba.b = srgb_to_linear(rgba.b);\n"
-			"	if ((srgb_in & 0x8) > 0) rgba.a = srgb_to_linear(rgba.a);\n"
-			"	return rgba;\n"
+			"	uvec4 mask = uvec4(control_bits & 0xF) & uvec4(0x1, 0x2, 0x4, 0x8);\n"
+			"	vec4 convert = srgb_to_linear(rgba);\n"
+			"	return _select(rgba, convert, notEqual(mask, uvec4(0)));\n"
 			"}\n\n"

 			"#define TEX_NAME(index) tex##index\n"
--- a/rpcs3/Emu/RSX/Common/GLSLTypes.h
+++ b/rpcs3/Emu/RSX/Common/GLSLTypes.h
@ -1,4 +1,4 @@
-#pragma once
+#pragma once

 namespace glsl
 {
@ -22,9 +22,12 @@ namespace glsl
 		bool require_lit_emulation;

 		// Only relevant for fragment programs
+		bool fp32_outputs;
 		bool require_wpos;
 		bool require_depth_conversion;
 		bool require_texture_ops;
+		bool require_shadow_ops;
+		bool emulate_coverage_tests;
 		bool emulate_shadow_compare;
 		bool low_precision_tests;
 	};
--- a/rpcs3/Emu/RSX/GL/GLFragmentProgram.cpp
+++ b/rpcs3/Emu/RSX/GL/GLFragmentProgram.cpp
@ -199,9 +199,12 @@ void GLFragmentDecompilerThread::insertGlobalFunctions(std::stringstream &OS)
 	glsl::shader_properties properties2;
 	properties2.domain = glsl::glsl_fragment_program;
 	properties2.require_lit_emulation = properties.has_lit_op;
+	properties2.fp32_outputs = !!(m_prog.ctrl & CELL_GCM_SHADER_CONTROL_32_BITS_EXPORTS);
 	properties2.require_depth_conversion = m_prog.redirected_textures != 0;
 	properties2.require_wpos = properties.has_wpos_input;
 	properties2.require_texture_ops = properties.has_tex_op;
+	properties2.require_shadow_ops = m_prog.shadow_textures != 0;
+	properties2.emulate_coverage_tests = g_cfg.video.antialiasing_level == msaa_level::none;
 	properties2.emulate_shadow_compare = device_props.emulate_depth_compare;
 	properties2.low_precision_tests = ::gl::get_driver_caps().vendor_NVIDIA;

@ -350,7 +353,11 @@ void GLFragmentDecompilerThread::insertMainEnd(std::stringstream & OS)

 	OS << "\n" << "	fs_main(" + parameters + ");\n\n";

-	glsl::insert_rop(OS, !!(m_ctrl & CELL_GCM_SHADER_CONTROL_32_BITS_EXPORTS), device_props.has_native_half_support);
+	glsl::insert_rop(
+		OS,
+		!!(m_ctrl & CELL_GCM_SHADER_CONTROL_32_BITS_EXPORTS),
+		device_props.has_native_half_support,
+		g_cfg.video.antialiasing_level == msaa_level::none);

 	if (m_ctrl & CELL_GCM_SHADER_CONTROL_DEPTH_EXPORT)
 	{
--- a/rpcs3/Emu/RSX/GL/GLVertexProgram.cpp
+++ b/rpcs3/Emu/RSX/GL/GLVertexProgram.cpp
@ -157,15 +157,9 @@ void GLVertexDecompilerThread::insertMainStart(std::stringstream & OS)
 {
 	const auto& dev_caps = gl::get_driver_caps();

-	glsl::shader_properties properties2;
+	glsl::shader_properties properties2{};
 	properties2.domain = glsl::glsl_vertex_program;
 	properties2.require_lit_emulation = properties.has_lit_op;
-	// Unused
-	properties2.require_depth_conversion = false;
-	properties2.require_wpos = false;
-	properties2.require_texture_ops = false;
-	properties2.emulate_shadow_compare = false;
-	properties2.low_precision_tests = false;

 	insert_glsl_legacy_function(OS, properties2);
 	glsl::insert_vertex_input_fetch(OS, glsl::glsl_rules_opengl4, dev_caps.vendor_INTEL == false);
--- a/rpcs3/Emu/RSX/VK/VKFragmentProgram.cpp
+++ b/rpcs3/Emu/RSX/VK/VKFragmentProgram.cpp
@ -229,9 +229,12 @@ void VKFragmentDecompilerThread::insertGlobalFunctions(std::stringstream &OS)
 	glsl::shader_properties properties2;
 	properties2.domain = glsl::glsl_fragment_program;
 	properties2.require_lit_emulation = properties.has_lit_op;
+	properties2.fp32_outputs = !!(m_prog.ctrl & CELL_GCM_SHADER_CONTROL_32_BITS_EXPORTS);
 	properties2.require_depth_conversion = m_prog.redirected_textures != 0;
 	properties2.require_wpos = properties.has_wpos_input;
 	properties2.require_texture_ops = properties.has_tex_op;
+	properties2.require_shadow_ops = m_prog.shadow_textures != 0;
+	properties2.emulate_coverage_tests = g_cfg.video.antialiasing_level == msaa_level::none;
 	properties2.emulate_shadow_compare = device_props.emulate_depth_compare;
 	properties2.low_precision_tests = vk::get_driver_vendor() == vk::driver_vendor::NVIDIA;

@ -383,7 +386,11 @@ void VKFragmentDecompilerThread::insertMainEnd(std::stringstream & OS)

 	OS << "\n" << "	fs_main(" + parameters + ");\n\n";

-	glsl::insert_rop(OS, !!(m_ctrl & CELL_GCM_SHADER_CONTROL_32_BITS_EXPORTS), device_props.has_native_half_support);
+	glsl::insert_rop(
+		OS,
+		!!(m_ctrl & CELL_GCM_SHADER_CONTROL_32_BITS_EXPORTS),
+		device_props.has_native_half_support,
+		g_cfg.video.antialiasing_level == msaa_level::none);

 	if (m_ctrl & CELL_GCM_SHADER_CONTROL_DEPTH_EXPORT)
 	{
--- a/rpcs3/Emu/RSX/VK/VKVertexProgram.cpp
+++ b/rpcs3/Emu/RSX/VK/VKVertexProgram.cpp
@ -45,7 +45,7 @@ void VKVertexDecompilerThread::insertHeader(std::stringstream &OS)
 	OS << "layout(std140, set = 0, binding = 1) uniform VertexLayoutBuffer\n";
 	OS << "{\n";
 	OS << "	uint  vertex_base_index;\n";
-	OS << " uint  vertex_index_offset;\n";
+	OS << "	uint  vertex_index_offset;\n";
 	OS << "	uvec4 input_attributes_blob[16 / 2];\n";
 	OS << "};\n\n";

@ -193,15 +193,9 @@ void VKVertexDecompilerThread::insertOutputs(std::stringstream & OS, const std::

 void VKVertexDecompilerThread::insertMainStart(std::stringstream & OS)
 {
-	glsl::shader_properties properties2;
+	glsl::shader_properties properties2{};
 	properties2.domain = glsl::glsl_vertex_program;
 	properties2.require_lit_emulation = properties.has_lit_op;
-	// Unused
-	properties2.require_depth_conversion = false;
-	properties2.require_wpos = false;
-	properties2.require_texture_ops = false;
-	properties2.emulate_shadow_compare = false;
-	properties2.low_precision_tests = false;

 	glsl::insert_glsl_legacy_function(OS, properties2);
 	glsl::insert_vertex_input_fetch(OS, glsl::glsl_rules_spirv);