From 9baef8c705e0efea1fc7ef015fcebe55a647e9ca Mon Sep 17 00:00:00 2001 From: kd-11 Date: Sat, 26 Sep 2020 20:42:31 +0300 Subject: [PATCH] rsx: Emit simpler fragment program code - Optimize clamp16 - Use bfe instead of shift-and --- .../RSX/Common/FragmentProgramDecompiler.cpp | 45 +++++------ rpcs3/Emu/RSX/Common/GLSLCommon.h | 81 ++++++++++--------- 2 files changed, 63 insertions(+), 63 deletions(-) diff --git a/rpcs3/Emu/RSX/Common/FragmentProgramDecompiler.cpp b/rpcs3/Emu/RSX/Common/FragmentProgramDecompiler.cpp index 5ca7a4fc87..7ec7e635d9 100644 --- a/rpcs3/Emu/RSX/Common/FragmentProgramDecompiler.cpp +++ b/rpcs3/Emu/RSX/Common/FragmentProgramDecompiler.cpp @@ -191,7 +191,7 @@ std::string FragmentProgramDecompiler::AddReg(u32 index, bool fp16) const std::string type_name = (fp16 && device_props.has_native_half_support)? getHalfTypeName(4) : getFloatTypeName(4); const std::string reg_name = std::string(fp16 ? "h" : "r") + std::to_string(index); - return m_parr.AddParam(PF_PARAM_NONE, type_name, reg_name, type_name + "(0., 0., 0., 0.)"); + return m_parr.AddParam(PF_PARAM_NONE, type_name, reg_name, type_name + "(0.)"); } bool FragmentProgramDecompiler::HasReg(u32 index, bool fp16) @@ -255,12 +255,12 @@ std::string FragmentProgramDecompiler::AddTex() std::string FragmentProgramDecompiler::AddType3() { - return m_parr.AddParam(PF_PARAM_NONE, getFloatTypeName(4), "src3", getFloatTypeName(4) + "(1., 1., 1., 1.)"); + return m_parr.AddParam(PF_PARAM_NONE, getFloatTypeName(4), "src3", getFloatTypeName(4) + "(1.)"); } std::string FragmentProgramDecompiler::AddX2d() { - return m_parr.AddParam(PF_PARAM_NONE, getFloatTypeName(4), "x2d", getFloatTypeName(4) + "(0., 0., 0., 0.)"); + return m_parr.AddParam(PF_PARAM_NONE, getFloatTypeName(4), "x2d", getFloatTypeName(4) + "(0.)"); } std::string FragmentProgramDecompiler::ClampValue(const std::string& code, u32 precision) @@ -366,6 +366,7 @@ std::string FragmentProgramDecompiler::Format(const std::string& code, bool igno std::string FragmentProgramDecompiler::GetRawCond() { static constexpr std::string_view f = "xyzw"; + const auto zero = getFloatTypeName(4) + "(0.)"; std::string swizzle, cond; swizzle.reserve(5); @@ -381,17 +382,17 @@ std::string FragmentProgramDecompiler::GetRawCond() } if (src0.exec_if_gr && src0.exec_if_eq) - cond = compareFunction(COMPARE::FUNCTION_SGE, AddCond() + swizzle, getFloatTypeName(4) + "(0., 0., 0., 0.)"); + cond = compareFunction(COMPARE::FUNCTION_SGE, AddCond() + swizzle, zero); else if (src0.exec_if_lt && src0.exec_if_eq) - cond = compareFunction(COMPARE::FUNCTION_SLE, AddCond() + swizzle, getFloatTypeName(4) + "(0., 0., 0., 0.)"); + cond = compareFunction(COMPARE::FUNCTION_SLE, AddCond() + swizzle, zero); else if (src0.exec_if_gr && src0.exec_if_lt) - cond = compareFunction(COMPARE::FUNCTION_SNE, AddCond() + swizzle, getFloatTypeName(4) + "(0., 0., 0., 0.)"); + cond = compareFunction(COMPARE::FUNCTION_SNE, AddCond() + swizzle, zero); else if (src0.exec_if_gr) - cond = compareFunction(COMPARE::FUNCTION_SGT, AddCond() + swizzle, getFloatTypeName(4) + "(0., 0., 0., 0.)"); + cond = compareFunction(COMPARE::FUNCTION_SGT, AddCond() + swizzle, zero); else if (src0.exec_if_lt) - cond = compareFunction(COMPARE::FUNCTION_SLT, AddCond() + swizzle, getFloatTypeName(4) + "(0., 0., 0., 0.)"); + cond = compareFunction(COMPARE::FUNCTION_SLT, AddCond() + swizzle, zero); else //if(src0.exec_if_eq) - cond = compareFunction(COMPARE::FUNCTION_SEQ, AddCond() + swizzle, getFloatTypeName(4) + "(0., 0., 0., 0.)"); + cond = compareFunction(COMPARE::FUNCTION_SEQ, AddCond() + swizzle, zero); return cond; } @@ -698,7 +699,7 @@ std::string FragmentProgramDecompiler::BuildCode() const bool fp16_out = !(m_ctrl & CELL_GCM_SHADER_CONTROL_32_BITS_EXPORTS); const std::string float4_type = (fp16_out && device_props.has_native_half_support)? getHalfTypeName(4) : getFloatTypeName(4); - const std::string init_value = float4_type + "(0., 0., 0., 0.)"; + const std::string init_value = float4_type + "(0.)"; std::array output_register_names; std::array ouput_register_indices = { 0, 2, 3, 4 }; bool shader_is_valid = false; @@ -769,7 +770,7 @@ std::string FragmentProgramDecompiler::BuildCode() "{\n" " // Treat NaNs as 0\n" " bvec4 nans = isnan(x);\n" - " x = _select(x, $float4(0., 0., 0., 0.), nans);\n" + " x = _select(x, $float4(0.), nans);\n" " return clamp(x, _min, _max);\n" "}\n\n"; @@ -780,7 +781,7 @@ std::string FragmentProgramDecompiler::BuildCode() "{\n" " // Treat NaNs as 0\n" " bvec4 nans = isnan(x);\n" - " x = _select(x, $half4(0., 0., 0., 0.), nans);\n" + " x = _select(x, $half4(0.), nans);\n" " return clamp(x, $half_t(_min), $half_t(_max));\n" "}\n\n"; } @@ -791,32 +792,26 @@ std::string FragmentProgramDecompiler::BuildCode() if (!device_props.has_native_half_support) { // Accurate float to half clamping (preserves IEEE-754 NaN) - std::string clamp_func = - "$float4 clamp16($float4 x)\n" - "{\n"; - + std::string clamp_func; if (glsl) { clamp_func += - " uvec4 bits = floatBitsToUint(x);\n" - " uvec4 extend = uvec4(0x7f800000);\n" - " bvec4 test = equal(bits & extend, extend);\n" - " vec4 clamped = clamp(x, -65504., +65504.);\n" - " return _select(clamped, x, test);\n"; + "vec2 clamp16(vec2 val){ return unpackHalf2x16(packHalf2x16(val)); }\n" + "vec4 clamp16(vec4 val){ return vec4(clamp16(val.xy), clamp16(val.zw)); }\n\n"; } else { clamp_func += + "$float4 clamp16($float4 x)\n" + "{\n" " if (!isnan(x.x) && !isinf(x.x)) x.x = clamp(x.x, -65504., +65504.);\n" " if (!isnan(x.x) && !isinf(x.x)) x.x = clamp(x.x, -65504., +65504.);\n" " if (!isnan(x.x) && !isinf(x.x)) x.x = clamp(x.x, -65504., +65504.);\n" " if (!isnan(x.x) && !isinf(x.x)) x.x = clamp(x.x, -65504., +65504.);\n" - " return x;\n"; + " return x;\n" + "}\n\n"; } - clamp_func += - "}\n\n"; - OS << Format(clamp_func); } else diff --git a/rpcs3/Emu/RSX/Common/GLSLCommon.h b/rpcs3/Emu/RSX/Common/GLSLCommon.h index cfa715fe29..7a9bd4a618 100644 --- a/rpcs3/Emu/RSX/Common/GLSLCommon.h +++ b/rpcs3/Emu/RSX/Common/GLSLCommon.h @@ -275,13 +275,13 @@ namespace glsl "uint gen_bits(const in uint x, const in uint y, const in uint z, const in uint w, const in bool swap)\n" "{\n" " return (swap) ?\n" - " bitfieldInsert(bitfieldInsert(bitfieldInsert(w, z, 8, 8), y, 16, 8), x, 24, 8) :\n" - " bitfieldInsert(bitfieldInsert(bitfieldInsert(x, y, 8, 8), z, 16, 8), w, 24, 8);\n" + " _set_bits(_set_bits(_set_bits(w, z, 8, 8), y, 16, 8), x, 24, 8) :\n" + " _set_bits(_set_bits(_set_bits(x, y, 8, 8), z, 16, 8), w, 24, 8);\n" "}\n\n" "uint gen_bits(const in uint x, const in uint y, const in bool swap)\n" "{\n" - " return (swap)? bitfieldInsert(y, x, 8, 8) : bitfieldInsert(x, y, 8, 8);\n" + " return (swap)? _set_bits(y, x, 8, 8) : _set_bits(x, y, 8, 8);\n" "}\n\n" "vec4 sext(const in ivec4 bits)\n" @@ -337,8 +337,8 @@ namespace glsl " }\n" " else if (desc.type == VTX_FMT_FLOAT16)\n" " {\n" - " tmp.x = bitfieldInsert(result.x, result.y, 16, 16);\n" - " tmp.y = bitfieldInsert(result.z, result.w, 16, 16);\n" + " tmp.x = _set_bits(result.x, result.y, 16, 16);\n" + " tmp.y = _set_bits(result.z, result.w, 16, 16);\n" " ret.xy = unpackHalf2x16(tmp.x);\n" " ret.zw = unpackHalf2x16(tmp.y);\n" " }\n" @@ -348,9 +348,9 @@ namespace glsl " }\n" " else //if (desc.type == VTX_FMT_COMP32)\n" " {\n" - " result = uvec4(bitfieldExtract(result.x, 0, 11),\n" - " bitfieldExtract(result.x, 11, 11),\n" - " bitfieldExtract(result.x, 22, 10),\n" + " result = uvec4(_get_bits(result.x, 0, 11),\n" + " _get_bits(result.x, 11, 11),\n" + " _get_bits(result.x, 22, 10),\n" " uint(scale.x));\n" " ret = sext(ivec4(result) << ivec4(5, 5, 6, 0));\n" " }\n\n" @@ -395,14 +395,14 @@ namespace glsl OS << " attribute_desc result;\n" - " result.stride = bitfieldExtract(attrib.x, 0, 8);\n" - " result.frequency = bitfieldExtract(attrib.x, 8, 16);\n" - " result.type = bitfieldExtract(attrib.x, 24, 3);\n" - " result.attribute_size = bitfieldExtract(attrib.x, 27, 3);\n" - " result.starting_offset = bitfieldExtract(attrib.y, 0, 29);\n" - " result.swap_bytes = bitfieldExtract(attrib.y, 29, 1) != 0;\n" - " result.is_volatile = bitfieldExtract(attrib.y, 30, 1) != 0;\n" - " result.modulo = bitfieldExtract(attrib.y, 31, 1) != 0;\n" + " result.stride = _get_bits(attrib.x, 0, 8);\n" + " result.frequency = _get_bits(attrib.x, 8, 16);\n" + " result.type = _get_bits(attrib.x, 24, 3);\n" + " result.attribute_size = _get_bits(attrib.x, 27, 3);\n" + " result.starting_offset = _get_bits(attrib.y, 0, 29);\n" + " result.swap_bytes = _test_bit(attrib.y, 29);\n" + " result.is_volatile = _test_bit(attrib.y, 30);\n" + " result.modulo = _test_bit(attrib.y, 31);\n" " return result;\n" "}\n\n" @@ -434,14 +434,16 @@ namespace glsl static void insert_rop_init(std::ostream& OS) { OS << - " if ((rop_control & (1u << 9)) != 0)\n" + " if (_test_bit(rop_control, 9))\n" " {\n" " // Convert x,y to linear address\n" - " uvec2 stipple_coord = uvec2(gl_FragCoord.xy) % uvec2(32u, 32u);\n" - " uint address = stipple_coord.y * 32u + stipple_coord.x;\n" - " uint mask = (1u << (address & 31u));\n\n" + " const ivec2 stipple_coord = ivec2(gl_FragCoord.xy) % ivec2(32, 32);\n" + " const int address = stipple_coord.y * 32 + stipple_coord.x;\n" + " const int bit_offset = (address & 31);\n" + " const int word_index = _get_bits(address, 7, 3);\n" + " const int sub_index = _get_bits(address, 5, 2);\n\n" - " if ((stipple_pattern[address >> 7u][(address >> 5u) & 3u] & mask) == 0u)\n" + " if (_test_bit(stipple_pattern[word_index][sub_index], bit_offset))\n" " {\n" " _kill();\n" " }\n" @@ -463,26 +465,26 @@ namespace glsl " {\n" " discard;\n" " }\n" - " else if ((rop_control & 0xFFu) != 0)\n"; + " else if (_get_bits(rop_control, 0, 8) != 0)\n"; } else { - OS << " if ((rop_control & 0xFFu) != 0)\n"; + OS << " if (_get_bits(rop_control, 0, 8) != 0)\n"; } OS << " {\n" - " bool alpha_test = (rop_control & 0x1u) > 0;\n" - " uint alpha_func = ((rop_control >> 16) & 0x7u);\n"; + " const bool alpha_test = _test_bit(rop_control, 0);\n" + " const uint alpha_func = _get_bits(rop_control, 16, 3);\n"; if (!props.fp32_outputs) { - OS << " bool srgb_convert = (rop_control & 0x2u) > 0;\n\n"; + OS << " const bool srgb_convert = _test_bit(rop_control, 1);\n\n"; } if (props.emulate_coverage_tests) { - OS << " bool a2c_enabled = (rop_control & 0x10u) > 0;\n"; + OS << " const bool a2c_enabled = _test_bit(rop_control, 4);\n"; } OS << @@ -540,6 +542,9 @@ namespace glsl { OS << "#define _select mix\n"; OS << "#define _saturate(x) clamp(x, 0., 1.)\n"; + OS << "#define _get_bits(x, off, count) bitfieldExtract(x, off, count)\n"; + OS << "#define _set_bits(x, y, off, count) bitfieldInsert(x, y, off, count)\n"; + OS << "#define _test_bit(x, y) (_get_bits(x, y, 1) != 0)\n"; OS << "#define _rand(seed) fract(sin(dot(seed.xy, vec2(12.9898f, 78.233f))) * 43758.5453f)\n\n"; if (props.domain == glsl::program_domain::glsl_fragment_program) @@ -641,7 +646,7 @@ namespace glsl OS << "bool coverage_test_passes(const in vec4 _sample, const in uint control)\n" "{\n" - " if ((control & 0x1u) == 0) return false;\n" + " if (!_test_bit(control, 0)) return false;\n" "\n" " float random = _rand(gl_FragCoord);\n" " return (_sample.a > random);\n" @@ -671,18 +676,18 @@ namespace glsl " if (depth_float == 0)\n" " value = uint(depth_value * 16777215.);\n" " else\n" - " value = (floatBitsToUint(depth_value) >> 7) & 0xffffff;\n" + " value = _get_bits(floatBitsToUint(depth_value), 7, 24);\n" "\n" - " uint b = (value & 0xff);\n" - " uint g = (value >> 8) & 0xff;\n" - " uint r = (value >> 16) & 0xff;\n" + " uint b = _get_bits(value, 0, 8);\n" + " uint g = _get_bits(value, 8, 8);\n" + " uint r = _get_bits(value, 16, 8);\n" " return vec4(float(g)/255., float(b)/255., 1., float(r)/255.);\n" "}\n\n" "vec4 remap_vector(const in vec4 color, const in uint remap)\n" "{\n" " vec4 result;\n" - " if ((remap & 0xFF) == 0xE4)\n" + " if (_get_bits(remap, 0, 8) == 0xE4)\n" " {\n" " result = color;\n" " }\n" @@ -699,7 +704,7 @@ namespace glsl " result.b = color[remap_channel.b];\n" " }\n\n" - " if ((remap >> 8) == 0xAA)\n" + " if (_get_bits(remap, 8, 8) == 0xAA)\n" " return result;\n\n" " uvec4 remap_select = uvec4(remap) >> uvec4(10, 12, 14, 8);\n" @@ -755,7 +760,7 @@ namespace glsl " return rgba;\n" " }\n" "\n" - " if ((control_bits & 0x10u) != 0)\n" + " if (_test_bit(control_bits, 4))\n" " {\n" " // Alphakill\n" " if (rgba.a < 0.000001)\n" @@ -765,7 +770,7 @@ namespace glsl " }\n" " }\n" "\n" - " if ((control_bits & 0x20u) != 0)\n" + " if (_test_bit(control_bits, 5))\n" " {\n" " // Renormalize to 8-bit (PS3) accuracy\n" " rgba = floor(rgba * 255.);\n" @@ -882,9 +887,9 @@ namespace glsl case FUNCTION::FUNCTION_DPH: return "$Ty(dot(vec4($0.xyz, 1.0), $1))"; case FUNCTION::FUNCTION_SFL: - return "$Ty(0., 0., 0., 0.)"; + return "$Ty(0.)"; case FUNCTION::FUNCTION_STR: - return "$Ty(1., 1., 1., 1.)"; + return "$Ty(1.)"; case FUNCTION::FUNCTION_FRACT: return "fract($0)"; case FUNCTION::FUNCTION_REFL: